How to sum the department counts across two countries in the JSON table below using PySpark
Input:
Output:
Solution:
# Aggregate per-department counts across two countries from a nested JSON file.
# Requires an active SparkSession bound to `spark` (e.g. a PySpark shell/notebook).
from pyspark.sql import functions as F  # F.sum aggregates columns; the Python builtin sum() cannot

# Read the multiline (pretty-printed) JSON file; "file:///" is the correct
# local-filesystem URI scheme (the original "file:////" had a stray slash).
df = spark.read.format("json").option("multiline", "True").load("file:///home/cloudera/countrydept.json")
df.show()

# Flatten the nested country.dept.count fields into plain columns,
# defaulting missing departments to 0 so the later sums never hit NULL.
# NOTE(review): country2 appears to have no dept3 in the source data — confirm.
dfe = df.selectExpr("coalesce(country1.dept1.count, 0) as dept1",
                    "coalesce(country1.dept2.count, 0) as dept2",
                    "coalesce(country1.dept3.count, 0) as dept3",
                    "coalesce(country1.dept4.count, 0) as dept4",
                    "coalesce(country2.dept1.count, 0) as dept1_2",
                    "coalesce(country2.dept2.count, 0) as dept2_2",
                    "coalesce(country2.dept4.count, 0) as dept4_2")
dfe.show()

# Column-wise totals. Must use pyspark.sql.functions.sum (F.sum): calling the
# bare builtin sum("dept1") raises TypeError, which was the bug in the original.
dfg = dfe.agg(F.sum("dept1").alias("sum_dept1"),
              F.sum("dept2").alias("sum_dept2"),
              F.sum("dept3").alias("sum_dept3"),
              F.sum("dept4").alias("sum_dept4"),
              F.sum("dept1_2").alias("sum_dept1_2"),
              F.sum("dept2_2").alias("sum_dept2_2"),
              F.sum("dept4_2").alias("sum_dept4_2"))
dfg.show()

# Combine the two countries' totals per department.
# dept3 only exists for country1, so it passes through unchanged.
dfs = dfg.selectExpr("sum_dept1 + sum_dept1_2 as sum_dept1",
                     "sum_dept2 + sum_dept2_2 as sum_dept2",
                     "sum_dept3 as sum_dept3",
                     "sum_dept4 + sum_dept4_2 as sum_dept4")
dfs.show()
Comments
Post a Comment