// Collect data from input avro file and create dataset
// NOTE(review): assumes `spark` (SparkSession) and `inputFile` (path string) are in scope — defined outside this chunk.
Dataset<Row> inputRecordsCollection = spark.read().format("avro").load(inputFile);
// Flatten data of nested schema file to remove nested fields
// Register the raw dataset as a SQL view so the generated SELECT below can query it.
inputRecordsCollection.createOrReplaceTempView("inputFileTable");
//function call to flatten data: builds the comma-separated projection list
// (nested struct fields become aliased top-level columns, e.g. a[0].b AS a_b).
String fileSQL = flattenSchema(inputRecordsCollection.schema(), null);
Dataset<Row> inputFlattRecords = spark.sql("SELECT " + fileSQL + " FROM inputFileTable");
// Print a small sample for manual inspection of the flattened output.
inputFlattRecords.show(10);
/**
 * Recursively builds a comma-separated SQL projection list that flattens a
 * (possibly nested) schema into top-level aliased columns.
 *
 * <p>A nested field {@code a.b} is emitted as {@code a[0].b as a_b}: arrays of
 * structs are flattened through their first element only, and the {@code [0].}
 * path separators are replaced with underscores to form the alias. Fields whose
 * names start with {@code "@"} (attribute/metadata fields) are skipped.
 *
 * @param schema the schema (or nested struct element type) to flatten
 * @param prefix the dotted/indexed path to this schema's parent, or {@code null}
 *               at the top level
 * @return a comma-separated SELECT expression fragment; empty if every field
 *         was skipped
 */
public static String flattenSchema(StructType schema, String prefix) {
    final StringBuilder selectSQLQuery = new StringBuilder();
    for (StructField field : schema.fields()) {
        final String fieldName = field.name();
        // Skip attribute/metadata fields (e.g. XML-style "@attr" columns).
        if (fieldName.startsWith("@")) {
            continue;
        }
        String colName = prefix == null ? fieldName : (prefix + "[0]." + fieldName);
        String colNameTarget = colName.replace("[0].", "_");
        DataType dtype = field.dataType();
        // Unwrap arrays: only the first element of an array of structs is flattened.
        if (dtype instanceof ArrayType) {
            dtype = ((ArrayType) dtype).elementType();
        }
        // Build this field's fragment first, then append it only if non-empty.
        final String fragment;
        if (dtype instanceof StructType) {
            fragment = flattenSchema((StructType) dtype, colName);
        } else {
            fragment = colName + " as " + colNameTarget;
        }
        // BUG FIX: a nested struct may flatten to nothing (all children skipped);
        // unconditionally appending it plus a comma produced invalid SQL such as
        // "SELECT a,,b" or a dangling trailing comma.
        if (!fragment.isEmpty()) {
            if (selectSQLQuery.length() > 0) {
                selectSQLQuery.append(",");
            }
            selectSQLQuery.append(fragment);
        }
    }
    return selectSQLQuery.toString();
}