Data Cleaning Python code:
import sys
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
glueContext = GlueContext(SparkContext.getOrCreate())
# Data Catalog: database and table name
db_name = "Redshift"
tbl_name = "Sales_report"
# S3 location for output
output_dir = "s3://glue-sample-target/output-dir/sales_report"
# Read data into a DynamicFrame using the Data Catalog metadata
sales_dyf = glueContext.create_dynamic_frame.from_catalog(database = db_name, table_name = tbl_name)
# The `provider id` field is a choice type between long and string
# Cast the choice to long; values that cannot be cast become null
sales_res = sales_dyf.resolveChoice(specs = [('provider id','cast:long')])
# Remove erroneous records
sales_df = sales_res.toDF()
sales_df = sales_df.where("`provider id` is NOT NULL")
# Apply a UDF that strips the leading '$' from the payment columns
chop_f = udf(lambda x: x[1:], StringType())
sales_df = sales_df.withColumn("ACC", chop_f(sales_df["average covered charges"])).withColumn("ATP",
chop_f(sales_df["average total payments"])).withColumn("AMP", chop_f(sales_df["average sales payments"]))
# Turn it back to a dynamic frame
sales_tmp = DynamicFrame.fromDF(sales_df, glueContext, "nested")
# Rename, cast, and nest with apply_mapping
sales_nest = sales_tmp.apply_mapping([('drg definition', 'string', 'drg', 'string'),
('provider id', 'long', 'provider.id', 'long'),
('name', 'string', 'provider.name', 'string'),
('city', 'string', 'provider.city', 'string'),
('state', 'string', 'provider.state', 'string'),
('zip code', 'long', 'provider.zip', 'long'),
('sales referral region description', 'string','rr', 'string'),
('ACC', 'string', 'charges.covered', 'double'),
('ATP', 'string', 'charges.total_pay', 'double'),
('AMP', 'string', 'charges.sales_pay', 'double')])
# Write it out in Parquet
glueContext.write_dynamic_frame.from_options(frame = sales_nest, connection_type = "s3", connection_options =
{"path": output_dir}, format = "parquet")
Joining and relationalizing data:
# Copyright 2016-2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
# SPDX-License-Identifier: MIT-0
import sys
from awsglue.transforms import Join
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
glueContext = GlueContext(SparkContext.getOrCreate())
# catalog: database and table names
db_name = "Redshift"
tbl_persons = "persons_json"
tbl_membership = "memberships_json"
tbl_organization = "organizations_json"
# output s3 and temp directories
output_history_dir = "s3://glue-sample-target/output-dir/sales_history"
output_lg_partitioned_dir = "s3://glue-sample-target/output-dir/legislator_part"
redshift_temp_dir = "s3://glue-sample-target/temp-dir/"
# Create dynamic frames from the source tables
persons = glueContext.create_dynamic_frame.from_catalog(database=db_name, table_name=tbl_persons)
memberships = glueContext.create_dynamic_frame.from_catalog(database=db_name, table_name=tbl_membership)
orgs = glueContext.create_dynamic_frame.from_catalog(database=db_name, table_name=tbl_organization)
# Keep the fields we need and rename some.
orgs = orgs.drop_fields(['other_names', 'identifiers']).rename_field('id', 'org_id').rename_field('name', 'org_name')
# Join the frames to create history
l_history = Join.apply(orgs, Join.apply(persons, memberships, 'id', 'person_id'), 'org_id',
'organization_id').drop_fields(['person_id', 'org_id'])
# ---- Write out the history ----
# Write out the dynamic frame as Parquet to the "sales_history" directory
print("Writing to /sales_history ...")
glueContext.write_dynamic_frame.from_options(frame = l_history, connection_type = "s3", connection_options =
{"path": output_history_dir}, format = "parquet")
# Convert to a DataFrame and write to the "legislator_part" directory, partitioned by org_name (Senate vs. House).
print("Writing to /legislator_part, partitioned by Senate and House ...")
l_history.toDF().write.parquet(output_lg_partitioned_dir, partitionBy=['org_name'])
# ---- Write out to relational databases ----
# Convert the data to flat tables
print("Converting to flat tables ...")
dfc = l_history.relationalize("hist_root", redshift_temp_dir)
# Cycle through and write to Redshift.
for df_name in dfc.keys():
    m_df = dfc.select(df_name)
    print("Writing to Redshift table: ", df_name, " ...")
    glueContext.write_dynamic_frame.from_jdbc_conf(frame = m_df, catalog_connection = "redshift3",
        connection_options = {"dbtable": df_name, "database": "testdb"}, redshift_tmp_dir = redshift_temp_dir)
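relationalize returns a DynamicFrameCollection: one root table plus one table for each nested array, which is why the loop above iterates over dfc.keys(). A minimal sketch (same session assumed) to inspect the collection before writing to Redshift:
# List each flattened table and its row count
for df_name in dfc.keys():
    print(df_name, dfc.select(df_name).count())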
Data visualization using Python scripts:
1) Histogram:
import pandas as pd
import matplotlib.pyplot as plt
data = [['E001', 'M', 34, 123, 'Normal', 350],
['E002', 'F', 40, 114, 'Overweight', 450],
['E003', 'F', 37, 135, 'Obesity', 169],
['E004', 'M', 30, 139, 'Underweight', 189],
['E005', 'F', 44, 117, 'Underweight', 183],
['E006', 'M', 36, 121, 'Normal', 80],
['E007', 'M', 32, 133, 'Obesity', 166],
['E008', 'F', 26, 140, 'Normal', 120],
['E009', 'M', 32, 133, 'Normal', 75],
['E010', 'M', 36, 133, 'Underweight', 40] ]
df = pd.DataFrame(data, columns = ['EMPID', 'Gender',
'Age', 'Sales',
'BMI', 'Income'] )
df.hist()
plt.show()
Output: one histogram per numeric column (Age, Sales, Income).
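df.hist() draws one histogram per numeric column using matplotlib defaults. A hedged variant, restricting the plot to specific columns and choosing the bin count explicitly (standard pandas keyword arguments):
# Plot only Age and Income, with 5 bins each
df.hist(column=['Age', 'Income'], bins=5)
plt.show()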
2) Column Chart:
import pandas as pd
import matplotlib.pyplot as plt
data = [['E001', 'M', 34, 123, 'Normal', 350],
['E002', 'F', 40, 114, 'Overweight', 450],
['E003', 'F', 37, 135, 'Obesity', 169],
['E004', 'M', 30, 139, 'Underweight', 189],
['E005', 'F', 44, 117, 'Underweight', 183],
['E006', 'M', 36, 121, 'Normal', 80],
['E007', 'M', 32, 133, 'Obesity', 166],
['E008', 'F', 26, 140, 'Normal', 120],
['E009', 'M', 32, 133, 'Normal', 75],
['E010', 'M', 36, 133, 'Underweight', 40] ]
df = pd.DataFrame(data, columns = ['EMPID', 'Gender',
'Age', 'Sales',
'BMI', 'Income'] )
# Plot Sales against Age as a column chart
plt.bar(df['Age'], df['Sales'])
plt.xlabel("Age")
plt.ylabel("Sales")
plt.show()
Output: a column chart of Sales by Age.
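Column charts over raw rows can be hard to read; aggregating first often helps. A minimal sketch (reusing the same df) that plots the mean Sales for each BMI category:
# Group by BMI category and plot the mean Sales per group
df.groupby('BMI')['Sales'].mean().plot.bar()
plt.ylabel("Mean Sales")
plt.show()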
3) Scatter Plot:
import matplotlib.pyplot as plt
x_axis = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
y_axis = [5, 16, 34, 56, 32, 56, 32, 12, 76, 89]
plt.title("Prices over 10 years")
plt.scatter(x_axis, y_axis, color='darkblue', marker='x', label="item 1")
plt.xlabel("Time (years)")
plt.ylabel("Price (dollars)")
plt.grid(True)
plt.legend()
plt.show()
Output: a scatter plot of item prices over ten years.
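Each "Output" figure above comes from plt.show(), which only displays the chart interactively. To capture a chart as an image file for a report, matplotlib's savefig can be called before show(); the filename below is illustrative:
# Save the current figure to disk (filename is an example)
plt.savefig("scatter_prices.png", dpi=150, bbox_inches="tight")
plt.show()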