0% found this document useful (0 votes)
133 views

Data Engineering 101 - Day 24 - SQL Vs PySpark

Books data engineering SQL

Uploaded by

sagarvshinde
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
133 views

Data Engineering 101 - Day 24 - SQL Vs PySpark

Books data engineering SQL

Uploaded by

sagarvshinde
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 82

Data Engineering 101 -

SQL vs PySpark
80 +
comparisons

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SELECT COLUMNS
SQL
SELECT column1, column2

1
FROM table;

PYSPARK
df.select("column1", "column2")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FILTER ROWS
SQL
SELECT * FROM table

2
WHERE condition;

PYSPARK
df.filter("condition")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

AGGREGATE FUNCTIONS
SQL
SELECT AVG(column)

3
FROM table;

PYSPARK
df.select(F.avg("column"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

GROUP BY
SQL
SELECT column, COUNT(*)

4
FROM table
GROUP BY column;

PYSPARK
df.groupBy("column").count()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ORDER BY
SQL
SELECT *

5
FROM table
ORDER BY column ASC;

PYSPARK
df.orderBy("column",
ascending=True)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

JOIN
SQL
SELECT * FROM table1

6
JOIN table2
ON table1.id = table2.id;

PYSPARK
df1.join(df2, df1.id == df2.id)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

UNION
SQL
SELECT * FROM table1

7
UNION
SELECT * FROM table2;

PYSPARK
df1.union(df2)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LIMIT
SQL
SELECT *

8
FROM table
LIMIT 100;

PYSPARK
df.limit(100)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DISTINCT VALUES
SQL
SELECT DISTINCT column

9
FROM table;

PYSPARK
df.select("column").distinct()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ADDING A NEW COLUMN


SQL
SELECT *, (column1 + column2)

10
AS new_column
FROM table;

PYSPARK
df.withColumn("new_column",
F.col("column1") +
F.col("column2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

COLUMN ALIAS
SQL
SELECT column AS alias_name

11
FROM table;

PYSPARK
df.select(F.col("column").alias("
alias_name"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FILTERING ON MULTIPLE
CONDITIONS
SQL
SELECT * FROM table

12
WHERE
condition1 AND condition2;

PYSPARK
df.filter((F.col("condition1")) &
(F.col("condition2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SUBQUERY
SQL
SELECT * FROM

13
(SELECT * FROM table
WHERE condition) AS subquery;

PYSPARK
df.filter("condition").alias("subq
uery")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

BETWEEN
SQL
SELECT * FROM table

14
WHERE column
BETWEEN val1 AND val2;

PYSPARK
df.filter(F.col("column") \
.between("val1", "val2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LIKE
SQL
SELECT * FROM table

15
WHERE column LIKE pattern;

PYSPARK
df.filter(F.col("column") \
.like("pattern"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CASE WHEN
SQL
SELECT CASE

16
WHEN condition THEN result1
ELSE result2 END
FROM table;

PYSPARK
df.select(F.when(F.col("conditio
n"), "result1") \
.otherwise("result2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CAST DATA TYPE


SQL
SELECT

17
CAST(column AS datatype)
FROM table;

PYSPARK
df.select(F.col("column") \
.cast("datatype"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

COUNT DISTINCT
SQL
SELECT

18
COUNT(DISTINCT column)
FROM table;

PYSPARK
df.select(F.countDistinct("colu
mn"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SUBSTRING
SQL
SELECT SUBSTRING(column,

19
start, length)
FROM table;

PYSPARK
df.select(F.substring("column",
start, length))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CONCATENATE COLUMNS
SQL
SELECT

20
CONCAT(column1, column2) AS
new_column
FROM table;

PYSPARK
df.withColumn("new_column",
F.concat(F.col("column1"),
F.col("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

AVERAGE OVER PARTITION


SQL
SELECT AVG(column)

21
OVER (PARTITION BY column2)
FROM table;

PYSPARK
df.withColumn("avg", F.avg("column") \
.over(Window.partitionBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SUM OVER PARTITION


SQL
SELECT SUM(column)

22
OVER (PARTITION BY column2)
FROM table;

PYSPARK
df.withColumn("sum", F.sum("column") \
.over(Window.partitionBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LEAD FUNCTION
SQL
SELECT LEAD(column, 1)

23
OVER (ORDER BY column2)
FROM table;

PYSPARK
df.withColumn("lead",
F.lead("column", 1) \
.over(Window.orderBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LAG FUNCTION
SQL
SELECT LAG(column, 1)

24
OVER (ORDER BY column2)
FROM table;

PYSPARK
df.withColumn("lag", F.lag("column", 1) \
.over(Window.orderBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ROW COUNT
SQL
SELECT COUNT(*)

25
FROM table;

PYSPARK
df.count()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DROP COLUMN
SQL
ALTER TABLE table

26
DROP COLUMN column;

PYSPARK
df.drop("column")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

RENAME COLUMN
SQL
ALTER TABLE table RENAME

27
COLUMN column1 TO column2;

PYSPARK
df.withColumnRenamed("colu
mn1", "column2")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CHANGE COLUMN TYPE


SQL
ALTER TABLE table

28
ALTER COLUMN column TYPE
new_type;

PYSPARK
df.withColumn("column",
df["column"] \
.cast("new_type"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CREATING A TABLE FROM


SELECT
SQL
CREATE TABLE new_table

29
AS SELECT * FROM table;

PYSPARK
(df.write.format("parquet") \
.saveAsTable("new_table"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

INSERTING SELECTED DATA


INTO TABLE
SQL
INSERT INTO table2

30
SELECT * FROM table1;

PYSPARK
(df1.write.insertInto("table2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CREATING A TABLE WITH


SPECIFIC COLUMNS
SQL
CREATE TABLE new_table

31
AS
SELECT column1, column2
FROM table;

PYSPARK
(df.select("column1", "column2") \
.write.format("parquet") \
.saveAsTable("new_table"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

AGGREGATE WITH ALIAS


SQL
SELECT column,

32
COUNT(*) AS count
FROM table
GROUP BY column;

PYSPARK
df.groupBy("column") \
.agg(F.count("*") \
.alias("count"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

NESTED SUBQUERY
SQL
SELECT * FROM

33
(SELECT *
FROM table
WHERE condition) sub
WHERE sub.condition2;

PYSPARK
df.filter("condition") \
.alias("sub") \
.filter("sub.condition2")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

MULTIPLE JOINS
SQL
SELECT * FROM table1

34
JOIN table2
ON table1.id = table2.id
JOIN table3
ON table1.id = table3.id;

PYSPARK
df1.join(df2, "id").join(df3, "id")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CROSS JOIN
SQL
SELECT *

35
FROM table1
CROSS JOIN table2;

PYSPARK
df1.crossJoin(df2)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

GROUP BY HAVING COUNT


GREATER THAN
SQL
SELECT column,

36
COUNT(*)
FROM table
GROUP BY column
HAVING COUNT(*) > 1;

PYSPARK
df.groupBy("column") \
.count() \
.filter(F.col("count") > 1)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ALIAS FOR TABLE IN JOIN


SQL
SELECT t1.*

37
FROM table1 t1
JOIN table2 t2
ON t1.id = t2.id;

PYSPARK
df1.alias("t1") \
.join(df2.alias("t2"), F.col("t1.id")
== F.col("t2.id"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SELECTING FROM MULTIPLE


TABLES
SQL
SELECT t1.column, t2.column

38
FROM table1 t1, table2 t2
WHERE t1.id = t2.id;

PYSPARK
df1.join(df2, df1.id == df2.id) \
.select(df1.column, df2.column)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CASE WHEN WITH MULTIPLE


CONDITIONS
SQL
SELECT CASE WHEN

39
condition THEN 'value1'
WHEN condition2 THEN 'value2' ELSE
'value3'
END
FROM table;

PYSPARK
df.select(F.when(F.col("condition"),
"value1").when(F.col("condition2"),
"value2").otherwise("value3"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

EXTRACTING DATE PARTS


SQL
SELECT EXTRACT(YEAR FROM

40
date_column)
FROM table;

PYSPARK
df.select(F.year(F.col("date_colu
mn")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

INEQUALITY FILTERING
SQL
SELECT *

41
FROM table
WHERE column != 'value';

PYSPARK
df.filter(df.column != 'value')

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

IN LIST
SQL
SELECT *

42
FROM table
WHERE column IN ('value1',
'value2');

PYSPARK
df.filter(df.column.isin('value1',
'value2'))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

NOT IN LIST
SQL
SELECT *

43
FROM table
WHERE column NOT IN ('value1',
'value2');

PYSPARK
df.filter(~df.column.isin('value1',
'value2'))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

NULL VALUES
SQL
SELECT * FROM

44
table
WHERE column IS NULL;

PYSPARK
df.filter(df.column.isNull())

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

NOT NULL VALUES


SQL
SELECT *

45
FROM table
WHERE column IS NOT NULL;

PYSPARK
df.filter(df.column.isNotNull())

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

STRING UPPER CASE


SQL
SELECT UPPER(column)

46
FROM table;

PYSPARK
df.select(F.upper(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

STRING LOWER CASE


SQL
SELECT LOWER(column)

47
FROM table;

PYSPARK
df.select(F.lower(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

STRING LENGTH
SQL
SELECT LENGTH(column)

48
FROM table;

PYSPARK
df.select(F.length(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

TRIM STRING
SQL
SELECT TRIM(column)

49
FROM table;

PYSPARK
df.select(F.trim(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LEFT TRIM STRING


SQL
SELECT LTRIM(column)

50
FROM table;

PYSPARK
df.select(F.ltrim(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

RIGHT TRIM STRING


SQL
SELECT RTRIM(column)

51
FROM table;

PYSPARK
df.select(F.rtrim(df.column))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

STRING REPLACE
SQL
SELECT REPLACE(column, 'find',

52
'replace')
FROM table;

PYSPARK
df.select(F.regexp_replace(df.c
olumn, 'find', 'replace'))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SUBSTRING INDEX
SQL
SELECT

53
SUBSTRING_INDEX(column,
'delim', count)
FROM table;

PYSPARK
df.select(F.expr("split(column,
'delim')[count-1]"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DATE DIFFERENCE
SQL
SELECT DATEDIFF('date1', 'date2')

54
FROM table;

PYSPARK
df.select(F.datediff(F.col('date1'
), F.col('date2')))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ADD MONTHS TO DATE


SQL
SELECT

55
ADD_MONTHS(date_column,
num_months)
FROM table;

PYSPARK
df.select(F.add_months
(df.date_column, num_months))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FIRST VALUE IN GROUP


SQL
SELECT FIRST_VALUE(column)

56
OVER (PARTITION BY column2)
FROM table;

PYSPARK
df.withColumn("first_val",
F.first("column") \
.over(Window.partitionBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LAST VALUE IN GROUP


SQL
SELECT LAST_VALUE(column)

57
OVER (PARTITION BY column2)
FROM table;

PYSPARK
df.withColumn("last_val",
F.last("column") \
.over(Window.partitionBy("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ROW NUMBER OVER


PARTITION
SQL
SELECT ROW_NUMBER()

58
OVER (PARTITION BY column
ORDER BY column)
FROM table;

PYSPARK
df.withColumn("row_num",
F.row_number() \
.over(Window.partitionBy("column") \
.orderBy("column")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

RANK OVER PARTITION


SQL
SELECT RANK()

59
OVER (PARTITION BY column
ORDER BY column)
FROM table;

PYSPARK
df.withColumn("rank",
F.rank().over(Window.partitionBy
("column").orderBy("column")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DENSE RANK OVER


PARTITION
SQL
SELECT DENSE_RANK()

60
OVER (PARTITION BY column
ORDER BY column)
FROM table;

PYSPARK
df.withColumn("dense_rank",
F.dense_rank().over(Window.partitio
nBy("column").orderBy("column")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

COUNT ROWS
SQL
SELECT COUNT(*)

61
FROM table;

PYSPARK
df.count()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

MATHEMATICAL
OPERATIONS
SQL
SELECT column1 + column2

62
FROM table;

PYSPARK
df.select(F.col("column1") +
F.col("column2"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

STRING CONCATENATION
SQL
SELECT column1 | column2

63
AS new_column
FROM table;

PYSPARK
df.withColumn("new_column",
F.concat_ws("|",
F.col("column1"),
F.col("column2")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FIND MINIMUM VALUE


SQL
SELECT MIN(column)

64
FROM table;

PYSPARK
df.select(F.min("column"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FIND MAXIMUM VALUE


SQL
SELECT MAX(column)

65
FROM table;

PYSPARK
df.select(F.max("column"))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

REMOVING DUPLICATES
SQL
SELECT DISTINCT *

66
FROM table;

PYSPARK
df.distinct()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

LEFT JOIN
SQL
SELECT * FROM table1

67
LEFT JOIN table2
ON table1.id = table2.id;

PYSPARK
df1.join(df2, df1.id == df2.id,
"left")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

RIGHT JOIN
SQL
SELECT * FROM table1

68
RIGHT JOIN table2
ON table1.id = table2.id;

PYSPARK
df1.join(df2, df1.id == df2.id,
"right")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

FULL OUTER JOIN


SQL
SELECT * FROM table1

69
FULL OUTER
JOIN table2
ON table1.id = table2.id;

PYSPARK
df1.join(df2, df1.id == df2.id,
"outer")

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

GROUP BY WITH HAVING


SQL
SELECT column, COUNT(*)

70
FROM table
GROUP BY column
HAVING COUNT(*) > 10;

PYSPARK
df.groupBy("column") \
.count() \
.filter(F.col("count") > 10)

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

ROUND DECIMAL VALUES


SQL
SELECT ROUND(column, 2)

71
FROM table;

PYSPARK
df.select(F.round("column", 2))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

GET CURRENT DATE


SQL
SELECT CURRENT_DATE();

72
PYSPARK
df.select(F.current_date())

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DATE ADDITION
SQL
SELECT

73
DATE_ADD(date_column, 10)
FROM table;

PYSPARK
df.select(F.date_add(F.col("dat
e_column"), 10))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

DATE SUBTRACTION
SQL
SELECT

74
DATE_SUB(date_column, 10)
FROM table;

PYSPARK
df.select(F.date_sub(F.col("dat
e_column"), 10))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

EXTRACT YEAR FROM DATE


SQL
SELECT YEAR(date_column)

75
FROM table;

PYSPARK
df.select(F.year(F.col("date_col
umn")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

EXTRACT MONTH FROM


DATE
SQL
SELECT MONTH(date_column)

76
FROM table;

PYSPARK
df.select(F.month(F.col("date_
column")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

EXTRACT DAY FROM DATE


SQL
SELECT DAY(date_column)

77
FROM table;

PYSPARK
df.select(F.dayofmonth(F.col("d
ate_column")))

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

SORTING DESCENDING
SQL
SELECT *

78
FROM table
ORDER BY column DESC;

PYSPARK
df.orderBy(F.col("column").desc())

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

GROUP BY MULTIPLE
COLUMNS
SQL
SELECT col1, col2, COUNT(*)

79
FROM table
GROUP BY col1, col2;

PYSPARK
df.groupBy("col1", "col2") \
.count()

Shwetank Singh
GritSetGrow - GSGLearn.com
DATA ENGINEERING - SQL VS PYSPARK

CONDITIONAL COLUMN
UPDATE
SQL
UPDATE table

80
SET column1 = CASE
WHEN condition
THEN 'value1' ELSE 'value2' END;

PYSPARK
df.withColumn("column1",
F.when(F.col("condition"),
"value1").otherwise("value2"))

Shwetank Singh
GritSetGrow - GSGLearn.com

You might also like