Flattening nested JSON data with Spark and Scala

Flattening nested JSON data with Spark and Scala

Nested JSON can be flattened with a single function that automatically flattens struct columns as well as array columns.


Consider a JSON file, address.json, whose schema (as printed by printSchema) looks like this:


Solution:

// Read the multi-line JSON document into a DataFrame.
val df = spark.read
  .option("multiline", "true")
  .json("file:///D:/address.json")

// Print the loaded rows first, then the (still nested) schema.
df.show()
df.printSchema()



/**
 * Flattens the struct and array columns of `nested_df`, at any nesting depth.
 *
 * Struct fields are promoted to top-level columns named `<parent>_<child>`, in the
 * position the struct occupied. Array columns are exploded into one row per element
 * and keep their name. The process repeats until no struct or array column is left,
 * so structs inside arrays (and arrays inside structs) are flattened as well.
 *
 * Notes:
 *  - Arrays are expanded with `explode`, so a row whose array is null or empty
 *    produces no output row.
 *  - Several array columns are exploded one after another, which yields the cross
 *    product of their elements for each input row.
 *  - If a generated `<parent>_<child>` name collides with an existing column, Spark
 *    will report an ambiguous reference on the next pass.
 *
 * @param nested_df DataFrame that may contain struct and/or array columns
 * @return a DataFrame without struct or array columns
 */
def flatten_df(nested_df: DataFrame): DataFrame = {
  import org.apache.spark.sql.types.{ArrayType, StructField, StructType}

  // Back-tick quote a name so that dots or spaces inside it are read literally
  // instead of being parsed as nested-field access.
  def quoted(name: String): String = "`" + name.replace("`", "``") + "`"

  @scala.annotation.tailrec
  def flattenColumns(df: DataFrame): DataFrame = {
    val fields = df.schema.fields
    val hasStruct = fields.exists(_.dataType.isInstanceOf[StructType])
    val firstArray = fields.collectFirst { case StructField(name, _: ArrayType, _, _) => name }

    if (hasStruct) {
      // Expand every struct by exactly one level. Deeper levels are picked up by the
      // next pass, when the generated `<parent>_<child>` names really exist.
      val expanded = fields.toSeq.flatMap { field =>
        field.dataType match {
          case struct: StructType =>
            struct.fieldNames.toSeq.map { child =>
              col(s"${quoted(field.name)}.${quoted(child)}").alias(s"${field.name}_$child")
            }
          case _ =>
            Seq(col(quoted(field.name)))
        }
      }
      flattenColumns(df.select(expanded: _*))
    } else {
      firstArray match {
        case Some(arrayName) =>
          // Spark allows only one generator per select clause, so explode a single
          // array column per pass and leave the others for later passes.
          val exploded = fields.toSeq.map { field =>
            if (field.name == arrayName) explode(col(quoted(field.name))).alias(field.name)
            else col(quoted(field.name))
          }
          flattenColumns(df.select(exploded: _*))
        case None =>
          // Nothing nested is left: the frame is flat.
          df
      }
    }
  }

  flattenColumns(nested_df)
}


// Flatten the nested DataFrame, then print its rows followed by its schema.
val flat_df = flatten_df(df)
flat_df.show()
flat_df.printSchema()

Comments