R with SQL (2)
R with SQL (2)
install.packages('sqldf')
library('sqldf')
library(nycflights13)
str(flights)
# $ year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
# $ dep_time : int 517 533 542 544 554 554 555 557 557 558 ...
# $ arr_time : int 830 850 923 1004 812 740 913 709 838 753 ...
# $ flight : int 1545 1714 1141 725 461 1696 507 5708 79 301 ...
# $ air_time : num 227 227 160 183 116 150 158 53 140 138 ...
# $ distance : num 1400 1416 1089 1576 762 ...
ej1 = sqldf("
SELECT
dep_time
,dep_delay
,arr_time
,carrier
,tailnum
FROM
flights
")
head(ej1)
# In R we can use SQL with the sqldf function. It works exactly the same as in
#a database
# The data.frame (in this case flights) represents the table we are querying
# We can also compute new variables in the select statement using the syntax:
# old_variables as new_variable
ej2 = sqldf("
SELECT
carrier
FROM
flights
")
ej2[1:5, ]
# gain carrier
#1 9 UA
# 2 16 UA
# 3 31 AA
# 4 -17 B6
# 5 -19 DL
### GROUP BY
ej3 = sqldf("
SELECT
avg(arr_delay) as mean_arr_delay,
avg(dep_delay) as mean_dep_delay,
carrier
FROM
flights
GROUP BY
carrier
")
ej3[1:16, ]
#1 7.3796692 16.725769 9E
#2 0.3642909 8.586016 AA
#3 -9.9308886 5.804775 AS
#4 9.4579733 13.022522 B6
#5 1.6443409 9.264505 DL
#6 15.7964311 19.955390 EV
#7 21.9207048 20.215543 F9
#8 20.1159055 18.726075 FL
#9 -6.9152047 4.900585 HA
# 10 10.7747334 10.552041 MQ
# 11 11.9310345 12.586207 OO
# 12 3.5580111 12.106073 UA
# 13 2.1295951 3.782418 US
# 14 1.7644644 12.869421 VX
# 15 9.6491199 17.711744 WN
# 16 15.5569853 18.996330 YV
# Other aggregations
ej4 = sqldf("
SELECT
avg(arr_delay) as mean_arr_delay,
min(dep_delay) as min_dep_delay,
max(dep_delay) as max_dep_delay,
carrier
FROM
flights
GROUP BY
carrier
")
# We can compute the minimun, mean, and maximum values of a numeric value
ej4
# c1 c2
#1 a
#2 b
#3 c
#4 d
# c1 c2
#2 b
#4 c
#5 d
#6 e
# This means to match the observations of the column we would join the tables by.
inner = sqldf("
SELECT
A.c1, B.c2
FROM
A INNER JOIN B
ON A.c1 = B.c1
")
inner
# c1 c2
# 2 b
# 4 c
# the left outer join, sometimes just called left join will return the
# first all the values of the column used from the A table
left = sqldf("
SELECT
A.c1, B.c2
FROM
A LEFT OUTER JOIN B
ON A.c1 = B.c1
")
left
# c1 c2
# 1 <NA>
# 2 b
# 3 <NA>
# 4 c