// Collect data from input avro file and create dataset
// NOTE(review): assumes `spark` (SparkSession) and `inputFile` (path string) are in scope — defined outside this chunk.
Dataset<Row> inputRecordsCollection = spark.read().format("avro").load(inputFile);
// Flatten data of nested schema file to remove nested fields
// Register the raw dataset as a SQL view so the generated SELECT below can query it.
inputRecordsCollection.createOrReplaceTempView("inputFileTable");
//function call to flatten data: builds the comma-separated projection list
// (nested struct fields become aliased top-level columns, e.g. a[0].b AS a_b).
String fileSQL = flattenSchema(inputRecordsCollection.schema(), null);
Dataset<Row> inputFlattRecords = spark.sql("SELECT " + fileSQL + " FROM inputFileTable");
// Print a small sample for manual inspection of the flattened output.
inputFlattRecords.show(10);
/**
 * Recursively builds a comma-separated SQL projection list that flattens a
 * (possibly nested) schema into top-level aliased columns.
 *
 * <p>A nested field {@code a.b} is emitted as {@code a[0].b as a_b}: arrays of
 * structs are flattened through their first element only, and the {@code [0].}
 * path separators are replaced with underscores to form the alias. Fields whose
 * names start with {@code "@"} (attribute/metadata fields) are skipped.
 *
 * @param schema the schema (or nested struct element type) to flatten
 * @param prefix the dotted/indexed path to this schema's parent, or {@code null}
 *               at the top level
 * @return a comma-separated SELECT expression fragment; empty if every field
 *         was skipped
 */
public static String flattenSchema(StructType schema, String prefix) {
    final StringBuilder selectSQLQuery = new StringBuilder();
    for (StructField field : schema.fields()) {
        final String fieldName = field.name();
        // Skip attribute/metadata fields (e.g. XML-style "@attr" columns).
        if (fieldName.startsWith("@")) {
            continue;
        }
        String colName = prefix == null ? fieldName : (prefix + "[0]." + fieldName);
        String colNameTarget = colName.replace("[0].", "_");
        DataType dtype = field.dataType();
        // Unwrap arrays: only the first element of an array of structs is flattened.
        if (dtype instanceof ArrayType) {
            dtype = ((ArrayType) dtype).elementType();
        }
        // Build this field's fragment first, then append it only if non-empty.
        final String fragment;
        if (dtype instanceof StructType) {
            fragment = flattenSchema((StructType) dtype, colName);
        } else {
            fragment = colName + " as " + colNameTarget;
        }
        // BUG FIX: a nested struct may flatten to nothing (all children skipped);
        // unconditionally appending it plus a comma produced invalid SQL such as
        // "SELECT a,,b" or a dangling trailing comma.
        if (!fragment.isEmpty()) {
            if (selectSQLQuery.length() > 0) {
                selectSQLQuery.append(",");
            }
            selectSQLQuery.append(fragment);
        }
    }
    return selectSQLQuery.toString();
}