from pyspark.sql import SparkSession from pyspark.sql import * from pyspark.sql.functions import * from pyspark.sql.functions import lag from pyspark.sql import functions as F sc = SparkSession\ .builder\ .master("local[*]")\ .appName('example_spark')\ .getOrCreate()
from pyspark.sql import SparkSession from pyspark.sql import * from pyspark.sql.functions import * from pyspark.sql.functions import lag from pyspark.sql import functions as F sc = SparkSession\ .builder\ .master("local[*]")\ .appName('example_spark')\ .getOrCreate()
17:20 distinct is not a narrow transformation
Window function will helps to solve this pyspark code.
Interviewee has 9 years of experience as a data engineer
Sir, Can you please share the pyspark code of that problem?
thank u sir.
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import lag
from pyspark.sql import functions as F
sc = SparkSession\
.builder\
.master("local[*]")\
.appName('example_spark')\
.getOrCreate()
# creating a dataframe
data = [
(2000,'2024-01-01'),
(3000,'2024-01-02'),
(45000,'2024-01-22'),
(40000,'2024-02-02'),
(13000,'2024-03-03')
]
headers = ("revenue","date")
df = sc.createDataFrame(data, headers)
df.show()
df = df.withColumn('month',date_format(df.date, 'yyyy-MM'))
df = df.groupBy('month').agg(sum('revenue').alias('revenue')).orderBy('month')
my_window = (Window.orderBy('month')
.rowsBetween(Window.unboundedPreceding, 0))
df_new = df.withColumn('cum_sum', F.sum('revenue').over(my_window))
df_new.show()
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark.sql.functions import lag
from pyspark.sql import functions as F
sc = SparkSession\
.builder\
.master("local[*]")\
.appName('example_spark')\
.getOrCreate()
# creating a dataframe
data = [
(2000,'2024-01-01'),
(3000,'2024-01-02'),
(45000,'2024-01-22'),
(40000,'2024-02-02'),
(13000,'2024-03-03')
]
headers = ("revenue","date")
df = sc.createDataFrame(data, headers)
df.show()
df = df.withColumn('month',date_format(df.date, 'yyyy-MM'))
df = df.groupBy('month').agg(sum('revenue').alias('revenue')).orderBy('month')
my_window = (Window.orderBy('month')
.rowsBetween(Window.unboundedPreceding, 0))
df_new = df.withColumn('cum_sum', F.sum('revenue').over(my_window))
df_new.show()