Find the total working hours of each employee in a day using Spark and Scala.
Input file: epid.txt
1,"2020-12-30T09:00:00","IN"
1,"2020-12-30T13:00:00","OUT"
1,"2020-12-30T14:00:00","IN"
1,"2020-12-30T17:00:00","OUT"
1,"2020-12-30T17:30:00","IN"
1,"2020-12-30T20:00:00","OUT"
2,"2020-12-30T10:30:00","IN"
2,"2020-12-30T15:00:00","OUT"
2,"2020-12-30T16:30:00","IN"
2,"2020-12-30T21:30:00","IN"
Solution:
import org.apache.spark.sql.types.{StructType, IntegerType, TimestampType, StringType}
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions.Window

// Schema for epid.txt: employee id, event timestamp, event type ("IN"/"OUT").
val schemax = new StructType()
  .add("eid", IntegerType, true)
  .add("ts", TimestampType, true)
  .add("event", StringType, true)

val df = spark.read.format("csv").schema(schemax).load("file:///D:/epid.txt")
df.show()
df.printSchema()

// All lag computations are per employee, in timestamp order.
val win = Window.partitionBy("eid").orderBy("ts")
val unixtime = unix_timestamp(col("ts"))

// A completed work interval is an OUT whose immediately preceding event is an IN.
// The prev_event guard protects against unpaired events — e.g. eid 2 has two
// consecutive INs (16:30 and 21:30) in the sample data, which must not produce
// a bogus interval.
// NOTE: the original code filtered on "event" AFTER selecting it away, which
// fails analysis with "cannot resolve 'event'"; filtering is now done while the
// column is still present.
val dx = df
  .withColumn("time_diff", unixtime - lag(unixtime, 1).over(win))
  .withColumn("prev_event", lag(col("event"), 1).over(win))
  .filter(col("event") === "OUT" && col("prev_event") === "IN")
  .withColumn("time_diff_minutes", col("time_diff") / 60)
  .select(col("eid"), col("time_diff_minutes"))
dx.show()

// Total worked minutes per employee (output column name kept as "time_diff"
// for compatibility with the original script's result schema).
val result = dx.groupBy("eid")
  .agg(sum("time_diff_minutes").alias("time_diff"))
result.show()
Comments
Post a Comment