Flatten nested JSON data with Spark (Scala)
This post walks through a function that flattens nested JSON automatically, handling both struct and array columns.
Consider JSON data in a file address.json whose printSchema output contains nested struct and array fields.
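For example, a nested schema for address data might look like the following (the field names here are only illustrative, not the exact contents of the original file):
root
 |-- name: string (nullable = true)
 |-- address: struct (nullable = true)
 |    |-- city: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- phones: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- type: string (nullable = true)
 |    |    |-- number: string (nullable = true)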
Solution:
//imports used below (DataFrame API, column functions and schema types)
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, explode}
import org.apache.spark.sql.types.{ArrayType, StructType}

//load the data as a DataFrame ("multiline" is required for multi-line JSON records)
val df = spark.read.format("json")
  .option("multiline", "true")
  .load("file:///D:/address.json")
df.show()
df.printSchema()
//flatten a DataFrame that contains nested struct and array columns
def flatten_df(nested_df: DataFrame): DataFrame = {
  //one pass: explode a single array column, or expand all struct columns
  def flattenOnce(df: DataFrame): DataFrame =
    df.schema.fields.find(_.dataType.isInstanceOf[ArrayType]) match {
      //explode one array per pass - Spark allows only one generator per select
      case Some(arrayField) =>
        df.withColumn(arrayField.name, explode(col(arrayField.name)))
      //no arrays left: pull every struct field up as a parent_child column
      case None =>
        val flatColumns = df.schema.fields.flatMap { field =>
          field.dataType match {
            case st: StructType =>
              st.fieldNames.toSeq.map(c => col(field.name + "." + c).alias(field.name + "_" + c))
            case _ => Seq(col(field.name))
          }
        }
        df.select(flatColumns: _*)
    }
  //keep flattening until no array or struct columns remain in the schema
  var flat = nested_df
  while (flat.schema.fields.exists(f => f.dataType.isInstanceOf[ArrayType] || f.dataType.isInstanceOf[StructType])) {
    flat = flattenOnce(flat)
  }
  flat
}
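Note: each pass explodes only one array column because Spark allows a single generator such as explode per select clause, and the outer loop repeats until the schema contains no struct or array columns, so arrays of structs (and structs nested inside structs) also end up fully flattened.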
//invoke the flatten_df function
val flat_df = flatten_df(df)
flat_df.show()
flat_df.printSchema()
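Once flattened, the result behaves like any ordinary flat DataFrame. As an optional follow-up (the output path below is just an example), it can be written out, for instance as Parquet:
//optional: persist the flattened result (example path)
flat_df.write.mode("overwrite").parquet("file:///D:/address_flat")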