Data Visualization using ggplot2 in R
Data Visualization
• R has several systems for making graphs
• ggplot2 - most elegant and most versatile
• ggplot2 implements the grammar of graphics, a coherent system for describing and building graphs
• Prerequisites: - install.packages("tidyverse") - library(tidyverse)
• To state explicitly which package a function (or dataset) comes from, use
– package::function()
– e.g. ggplot2::ggplot()
Creating a graph
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.2
## -- Attaching packages --------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.0 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## Warning: package 'tidyr' was built under R version 4.0.2
## Warning: package 'dplyr' was built under R version 4.0.2
## -- Conflicts ------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
ggplot(iris)
1
str(cars)
## 'data.frame': 50 obs. of 2 variables:
## $ speed: num 4 4 7 7 8 9 10 10 10 11 ...
## $ dist : num 2 10 4 22 16 10 18 26 34 17 ...
ggplot(data = cars) + geom_point(mapping = aes(x=speed, y= dist))
2
125
100
75
dist
50
25
0
5 10 15 20 25
speed
str(mpg)
## tibble [234 x 11] (S3: tbl_df/tbl/data.frame)
## $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
## $ model : chr [1:234] "a4" "a4" "a4" "a4" ...
## $ displ : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr [1:234] "f" "f" "f" "f" ...
## $ cty : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr [1:234] "p" "p" "p" "p" ...
## $ class : chr [1:234] "compact" "compact" "compact" "compact" ...
ggplot(data = mpg) + geom_point(mapping = aes(x=displ, y= hwy))
3
40
30
hwy
20
2 3 4 5 6 7
displ
Plot model & year
ggplot(data = mpg) + geom_point(mapping = aes(x=displ, y= hwy))
4
40
30
hwy
20
2 3 4 5 6 7
displ
#A scatter plot is best for two continuous variables (x, y); it is not useful for displaying categorical variables
Aesthetic mappings
• An aesthetic is a visual property of the objects in your plot. Aesthetics include things like the size, the
shape, or the color of your points.
#color
str(mpg)
## tibble [234 x 11] (S3: tbl_df/tbl/data.frame)
## $ manufacturer: chr [1:234] "audi" "audi" "audi" "audi" ...
## $ model : chr [1:234] "a4" "a4" "a4" "a4" ...
## $ displ : num [1:234] 1.8 1.8 2 2 2.8 2.8 3.1 1.8 1.8 2 ...
## $ year : int [1:234] 1999 1999 2008 2008 1999 1999 2008 1999 1999 2008 ...
## $ cyl : int [1:234] 4 4 4 4 6 6 6 4 4 4 ...
## $ trans : chr [1:234] "auto(l5)" "manual(m5)" "manual(m6)" "auto(av)" ...
## $ drv : chr [1:234] "f" "f" "f" "f" ...
## $ cty : int [1:234] 18 21 20 21 16 18 18 18 16 20 ...
## $ hwy : int [1:234] 29 29 31 30 26 26 27 26 25 28 ...
## $ fl : chr [1:234] "p" "p" "p" "p" ...
## $ class : chr [1:234] "compact" "compact" "compact" "compact" ...
ggplot(data = mpg) + geom_point(mapping = aes(x=displ, y= hwy, color=class))
5
40
class
2seater
compact
30
midsize
hwy
minivan
pickup
subcompact
suv
20
2 3 4 5 6 7
displ
#Give each manufacturer a different color
ggplot(data = mpg) + geom_point(mapping = aes(x=displ, y= hwy, color=manufacturer))
6
manufacturer
audi
40 chevrolet
dodge
ford
honda
hyundai
30
jeep
hwy
land rover
lincoln
mercury
nissan
20
pontiac
subaru
toyota
volkswagen
2 3 4 5 6 7
displ
##Problems
#size
#Using size for a discrete variable is not advised.
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, size = manufacturer))
## Warning: Using size for a discrete variable is not advised.
7
manufacturer
audi
40 chevrolet
dodge
ford
honda
hyundai
30
jeep
hwy
land rover
lincoln
mercury
nissan
20
pontiac
subaru
toyota
volkswagen
2 3 4 5 6 7
displ
ggplot(data = cars) +
geom_point(mapping = aes(x=speed, y= dist, size = speed))
8
125
100
speed
75 5
10
dist
15
50 20
25
25
0
5 10 15 20 25
speed
#use shape
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, shape = class))
## Warning: The shape palette can deal with a maximum of 6 discrete values because
## more than 6 becomes difficult to discriminate; you have 7. Consider
## specifying shapes manually if you must have them.
## Warning: Removed 62 rows containing missing values (geom_point).
9
40
class
2seater
compact
30
midsize
hwy
minivan
pickup
subcompact
suv
20
2 3 4 5 6 7
displ
#Use numbers for the shape to specify up to 25 built-in R shapes (0 to 24)
shapes <- data.frame(
shape = c(0:19, 22, 21, 24, 23, 20),
x = 0:24 %/% 5,
y = -(0:24 %% 5)
)
ggplot(shapes, aes(x, y)) +
geom_point(aes(shape = shape), size = 5, fill = "red") +
geom_text(aes(label = shape), hjust = 0, nudge_x = 0.15) +
scale_shape_identity() +
expand_limits(x = 4.1) +
theme_void()
10
0 5 10 15 22
1 6 11 16 21
2 7 12 17 24
3 8 13 18 23
4 9 14 19 20
#use alpha
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy, alpha = class))
## Warning: Using alpha for a discrete variable is not advised.
11
40
class
2seater
compact
30
midsize
hwy
minivan
pickup
subcompact
suv
20
2 3 4 5 6 7
displ
12
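Set the aesthetic properties of the geom manually: pass them as arguments outside aes(), e.g. geom_point(mapping = aes(x = displ, y = hwy), color = "blue")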
40
30
hwy
20
2 3 4 5 6 7
displ
##Facets
#One way to add additional variables is with aesthetics. Another way, particularly useful for
categorical variables, is to split your plot into facets: subplots that each display one subset of the data.
#To facet your plot by a single variable, use facet_wrap(). The first argument of facet_wrap() should be a
formula, which you create with ~ followed by a variable name (here "formula" is the name of a data structure
in R, not a synonym for "equation"). The variable that you pass to facet_wrap() should be discrete.
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_wrap(~ class, nrow = 2)
13
2seater compact midsize minivan
40
30
20
hwy
2 3 4 5 6 7
pickup subcompact suv
40
30
20
2 3 4 5 6 7 2 3 4 5 6 7 2 3 4 5 6 7
displ
#To facet your plot on the combination of two variables, add facet_grid() to your plot call. The formula
should contain two variable names separated by a ~: the variable on the left defines the rows and the variable on the right defines the columns.
ggplot(data = mpg) +
geom_point(mapping = aes(x = displ, y = hwy)) +
facet_grid(drv ~ cyl)
14
4 5 6 8
40
30
4
20
40
hwy
30
f
20
40
30
r
20
2 3 4 5 6 7 2 3 4 5 6 7 2 3 4 5 6 7 2 3 4 5 6 7
displ
#Statistical Transformation: geom_bar() counts the rows in each category of x and plots that count on the y axis
str(diamonds)
## tibble [53,940 x 10] (S3: tbl_df/tbl/data.frame)
## $ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut))
15
20000
15000
count
10000
5000
Fair Good Very Good Premium Ideal
cut
#For iris dataset
glimpse(iris)
## Rows: 150
## Columns: 5
## $ Sepal.Length <dbl> 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9, 5.4,...
## $ Sepal.Width <dbl> 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1, 3.7,...
## $ Petal.Length <dbl> 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5, 1.5,...
## $ Petal.Width <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1, 0.2,...
## $ Species <fct> setosa, setosa, setosa, setosa, setosa, setosa, setosa...
ggplot(data = iris) +
geom_bar(mapping = aes(x = Species))
16
50
40
30
count
20
10
setosa versicolor virginica
Species
#On the flights dataset (from the nycflights13 package)
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.0.2
glimpse(flights)
## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 55...
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 60...
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2,...
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 8...
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 8...
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7,...
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6"...
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301...
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N...
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LG...
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IA...
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149...
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 73...
17
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6...
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59...
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-0...
ggplot(data = flights) +
geom_bar(mapping = aes(x = carrier))
60000
40000
count
20000
9E AA AS B6 DL EV F9 FL HA MQ OO UA US VX WN YV
carrier
ggplot(data = flights) +
geom_bar(mapping = aes(x = origin))
18
125000
100000
75000
count
50000
25000
EWR JFK LGA
origin
ggplot(data = flights) +
geom_bar(mapping = aes(x = flight))
19
1000
750
count
500
250
0 2000 4000 6000 8000
flight
(fl100<-sample_n(flights,100))
## # A tibble: 100 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 10 12 1758 1359 239 1910 1511
## 2 2013 10 28 1152 1200 -8 1343 1334
## 3 2013 4 6 1508 1515 -7 1731 1740
## 4 2013 6 18 604 610 -6 1000 924
## 5 2013 6 10 1522 1500 22 1633 1646
## 6 2013 9 15 NA 1159 NA NA 1309
## 7 2013 3 10 2051 2045 6 2349 2357
## 8 2013 8 4 1815 1627 108 1958 1813
## 9 2013 7 26 646 647 -1 806 809
## 10 2013 9 24 1258 1305 -7 1424 1438
## # ... with 90 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
ggplot(data = fl100) +
geom_bar(mapping = aes(x = carrier))
20
20
15
count
10
9E AA B6 DL EV F9 MQ UA US VX WN YV
carrier
ggplot(data = fl100) +
geom_bar(mapping = aes(x = origin))
21
40
30
count
20
10
EWR JFK LGA
origin
select(fl100,flight,origin,dest,air_time,carrier)
## # A tibble: 100 x 5
## flight origin dest air_time carrier
## <int> <chr> <chr> <dbl> <chr>
## 1 1703 EWR BOS 35 UA
## 2 667 LGA ORD 122 UA
## 3 4309 JFK IND 106 MQ
## 4 303 JFK SFO 369 UA
## 5 1498 LGA ORD 105 UA
## 6 3386 JFK DCA NA MQ
## 7 677 JFK LAX 318 B6
## 8 5940 EWR RDU 68 EV
## 9 905 JFK ORD 111 B6
## 10 5378 LGA PIT 54 EV
## # ... with 90 more rows
#Additional: use alpha with a numeric variable (year)
glimpse(mpg)
## Rows: 234
## Columns: 11
## $ manufacturer <chr> "audi", "audi", "audi", "audi", "audi", "audi", "audi"...
## $ model <chr> "a4", "a4", "a4", "a4", "a4", "a4", "a4", "a4 quattro"...
22
## $ displ <dbl> 1.8, 1.8, 2.0, 2.0, 2.8, 2.8, 3.1, 1.8, 1.8, 2.0, 2.0,...
## $ year <int> 1999, 1999, 2008, 2008, 1999, 1999, 2008, 1999, 1999, ...
## $ cyl <int> 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 6, 6, 6, 6, 6, 8, ...
## $ trans <chr> "auto(l5)", "manual(m5)", "manual(m6)", "auto(av)", "a...
## $ drv <chr> "f", "f", "f", "f", "f", "f", "f", "4", "4", "4", "4",...
## $ cty <int> 18, 21, 20, 21, 16, 18, 18, 18, 16, 20, 19, 15, 17, 17...
## $ hwy <int> 29, 29, 31, 30, 26, 26, 27, 26, 25, 28, 27, 25, 25, 25...
## $ fl <chr> "p", "p", "p", "p", "p", "p", "p", "p", "p", "p", "p",...
## $ class <chr> "compact", "compact", "compact", "compact", "compact",...
ggplot(data = mpg) + geom_point(mapping = aes(x=displ, y= hwy, alpha=year))
40
year
2000
30
2002
hwy
2004
2006
2008
20
2 3 4 5 6 7
displ
23