emp = [(1, "Smith",-1,"2018","10", "M",3000), \
(2, "Rose", 1, "2010","20", "Μ",4000), \
(3, "Williams", 1, "2010","10", "Μ",1000), \
(4, "Jones", 2, "2005","10","F", 2000), \
(5, "Brown", 2, "2010","40","",-1),\
(6, "Brown", 2, "2010", "50","",-1)\
]
empColumns = ["emp_id", "name","superior_emp_id", "year_joined", \
"emp_dept_id", "gender", "salary"]
empDF = spark.createDataFrame(data=emp, schema = empColumns)
empDF.printSchema()
display (empDF)
dept = [("Finance",10), \
("Marketing", 20), \
("Sales", 30), \
("IT",40) \
deptColumns = ["dept_name","dept_id"]
deptDF = spark.createDataFrame (data=dept, schema = deptColumns)
deptDF.printSchema()
display(deptDF)
empDF.createOrReplaceTempView("EMP")
deptDF.createOrReplaceTempView("DEPT")
joinDF spark.sql("select from EMP e, DEPT d where e.emp_dept_id == d.dept_id") \
.show(truncate=False)
syntax: select from emp e inner join dept d where e.id == d.id sql server = select
from emp e inner join dept d on e.id == d.id
joinDF2 = spark.sql("select * from EMP e left JOIN DEPT d ON e.emp_dept_id ==
d.dept_id") \
.show(truncate=False)
syntax for joins
innerjoin = empDF.join(deptDF, empDF.emp_dept_id == deptDF.deptDF.dept_id,"inner")
display(innerjoin)
leftjoin = empDF.join(deptDF, empDF.emp_dept_id == deptDF.deptDF.dept_id,"left")
display(leftjoin)
rightjoin = empDF.join(deptDF, empDF.emp_dept_id == deptDF.deptDF.dept_id,"right")
display(rightjoin)
fullouterjoin = empDF.join(deptDF, empDF.emp_dept_id ==
deptDF.deptDF.dept_id,"outer")
display(fullouterjoin)
leftsemi = empDF.join(deptDF, empDF.emp_dept_id ==
deptDF.deptDF.dept_id,"leftsemi")
display(leftsemi)