How to sum department counts across two country columns in a JSON table using PySpark


Input: a multiline JSON file (countrydept.json) in which each record contains country1 and country2 structs; each country holds dept structs (dept1 to dept4) with a count field, and country2 has no dept3.
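The input table from the original post is not reproduced here. As a stand-in, the snippet below writes a small hypothetical countrydept.json matching the structure the solution expects; the counts and file location are illustrative assumptions only.

# Hypothetical sample input for illustration only; the real data is not shown in this post.
sample = """{
  "country1": {"dept1": {"count": 10}, "dept2": {"count": 20},
               "dept3": {"count": 30}, "dept4": {"count": 40}},
  "country2": {"dept1": {"count": 5}, "dept2": {"count": 15},
               "dept4": {"count": 25}}
}"""

with open("/home/cloudera/countrydept.json", "w") as f:
    f.write(sample)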



Output: a single row with the combined count for each department, summed across both countries (sum_dept1 to sum_dept4).

Solution: read the nested JSON, flatten each country's department counts into flat columns (using coalesce so a missing department counts as 0), sum each column, then add the two countries' totals per department.

# Assumes the pyspark shell (SparkSession available as spark); sum below is the Spark aggregate, not Python's builtin.
from pyspark.sql.functions import sum

df = spark.read.format("json").option("multiline", "true").load("file:///home/cloudera/countrydept.json")

df.show()
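If the exact nesting is unclear, inspecting the inferred schema before writing the coalesce expressions confirms the field paths (an optional check, not part of the original steps):

# Optional: verify the nested country/dept/count structure Spark inferred.
df.printSchema()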

# Flatten the nested struct fields into flat columns;
# coalesce substitutes 0 when a department is missing for a country.
dfe = df.selectExpr("coalesce(country1.dept1.count, 0) as dept1",
                    "coalesce(country1.dept2.count, 0) as dept2",
                    "coalesce(country1.dept3.count, 0) as dept3",
                    "coalesce(country1.dept4.count, 0) as dept4",
                    "coalesce(country2.dept1.count, 0) as dept1_2",
                    "coalesce(country2.dept2.count, 0) as dept2_2",
                    "coalesce(country2.dept4.count, 0) as dept4_2")

dfe.show()

# Aggregate each flattened column into a per-country total per department.
dfg = dfe.agg(sum("dept1").alias("sum_dept1"),
              sum("dept2").alias("sum_dept2"),
              sum("dept3").alias("sum_dept3"),
              sum("dept4").alias("sum_dept4"),
              sum("dept1_2").alias("sum_dept1_2"),
              sum("dept2_2").alias("sum_dept2_2"),
              sum("dept4_2").alias("sum_dept4_2"))

dfg.show()

# Add the two countries' totals to get the combined sum per department.
# country2 has no dept3, so sum_dept3 is carried over as-is.
dfs = dfg.selectExpr("sum_dept1 + sum_dept1_2 as sum_dept1",
                     "sum_dept2 + sum_dept2_2 as sum_dept2",
                     "sum_dept3 as sum_dept3",
                     "sum_dept4 + sum_dept4_2 as sum_dept4")

dfs.show()
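For reference, the same totals can be computed in one pass by adding the per-country coalesce expressions before aggregating; this is an alternative sketch under the same assumed schema, not the approach used above.

# Alternative sketch: combine per-country counts per row, then sum once.
dfs_alt = df.selectExpr(
    "coalesce(country1.dept1.count, 0) + coalesce(country2.dept1.count, 0) as dept1",
    "coalesce(country1.dept2.count, 0) + coalesce(country2.dept2.count, 0) as dept2",
    "coalesce(country1.dept3.count, 0) as dept3",
    "coalesce(country1.dept4.count, 0) + coalesce(country2.dept4.count, 0) as dept4"
).agg(sum("dept1").alias("sum_dept1"),
      sum("dept2").alias("sum_dept2"),
      sum("dept3").alias("sum_dept3"),
      sum("dept4").alias("sum_dept4"))

dfs_alt.show()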


