# 1. Variables and data types
# Character values: typeof() and class() both report "character".
model1 <- "hello"
typeof(model1) # "character"
class(model1)  # "character"
model1

hello_string <- "hello"
hello_string

# A bare number is stored as a double; the L suffix makes it an integer.
x <- 100
typeof(x) # "double"
y <- 100L
typeof(y) # "integer"

# Logical value.
a <- TRUE
typeof(a) # "logical"

# A double has class "numeric".
num <- 3.7
typeof(num) # "double"
class(num)  # "numeric"
# 2. Arithmetic and Logical Operators
# Arithmetic: division, multiplication and modulo (remainder).
100 / 2
100 * 2
100 %% 2

# Logical operators evaluate to TRUE or FALSE.
10 > 20 & 10 < 20 # AND
10 > 20 | 10 < 20 # OR
!(10 == 3)        # NOT

# Comparing two variables.
x <- 100
y <- 200
x == y
y > x
# Read the auction data set.
# NOTE(review): the file path is blank in these notes, so no data file is
# actually named; fill in the location of the auction CSV before running.
auction.data <- read.csv("")
auction.data
head(auction.data) # first 6 rows (the default for head())

auction <- read.csv("")
View(auction) # spreadsheet-style viewer; base R spells it View(), not view()

# Rows where BOTH conditions hold (element-wise AND).
x <- auction[auction$bidder == "chiuk" & auction$bid == 100, ]
# Rows where EITHER condition holds (element-wise OR); this overwrites x.
x <- auction[auction$bidder == "chiuk" | auction$bid == 100, ]
# Store values in variables and derive a new value from them.
selling <- 500
cost <- 250
profit <- selling - cost
profit
# 3. Print Formatting
x <- 10
print(x)

# paste() joins its arguments with a single space unless sep says otherwise.
print(paste("hello", "world"))            # hello world
print(paste("hello", "world", sep = "-")) # hello-world

# paste0() joins without any separator.
paste0("hello", "world") # helloworld
paste0("Welcome", "to", "R")

# The colon operator builds an integer sequence.
x <- 10:30
x
class(x) # "integer"
# 4. Vectors
# - a sequence of data elements that all have the same basic type
# A numeric vector built with c().
v1 <- c(1, 2, 3, 4, 5)
print(v1)
class(v1)  # "numeric"
typeof(v1) # "double"

# x is reassigned to vectors of several types in turn.
x <- c(TRUE, FALSE)                 # logical
x <- 9:29                           # integer
x <- c(1 + 0i, 2 + 4i)              # complex
x <- vector("numeric", length = 10) # numeric vector of length 10

# A vector of numbers.
numbers <- c(1, 2, 3, 4, 5, 6)
numbers
print(numbers)

# A vector of letters.
ltrs <- c("a", "b", "c", "d")
print(ltrs)

# Concatenating both: the result is a character vector.
mixed_vec <- c(numbers, ltrs)
print(mixed_vec)
class(mixed_vec) # "character"

# Integer vector.
num <- 1:10
# Numeric vector: 1 to 10 followed by 10.5.
num <- c(1:10, 10.5)
num

# Character vector taken from the built-in `letters` constant.
ltrs <- letters[1:10]
ltrs

# Factor vector.
fac <- as.factor(ltrs)
class(fac)
# 5. List
# - a generic vector that can contain objects of different types
# A list can hold vectors of different types and lengths.
# The variable is called sample_list so that it does not mask base R's list().
sample_list <- list(
  x = c(10, 20, 30),
  y = c("a", "b", "c"),
  z = c(TRUE, FALSE)
)
sample_list

# An unnamed list ...
test <- list("music tracks", 100, 5)
test
is.list(test) # TRUE

# ... whose elements are given names afterwards.
names(test) <- c("product", "count", "rating")
test
# $product
# [1] "music tracks"
test[[1]]

# Names can also be supplied when the list is created.
product.category <- list(product = "music tracks", count = 100, ratings = 5)
product.category
str(product.category) # more compact display

# A list may contain another list as one of its elements.
similar.prod <- list(product = "films", count = 50, ratings = 4)
product.category <- list(
  product = "music tracks",
  count = 100,
  ratings = 5,
  similar = similar.prod
)
product.category

# Two ways to extract a single element by name.
product.category[["product"]]
product.category$product
# 6. Matrix
# matrix() fills column by column unless byrow = TRUE.
matrix(1:6, nrow = 2)
# 1 3 5
# 2 4 6
matrix(1:6, ncol = 2)
# 1 4
# 2 5
# 3 6
matrix(1:6, nrow = 2, byrow = TRUE)
# 1 2 3
# 4 5 6
# Fewer values than cells: the values are recycled.
matrix(1:3, nrow = 2, ncol = 3)
# 1 3 2
# 2 1 3

# cbind() binds vectors as columns, rbind() binds them as rows.
cbind(1:3, 1:3)
# 1 1
# 2 2
# 3 3
rbind(1:3, 1:3)
# 1 2 3
# 1 2 3

n <- matrix(1:6, byrow = TRUE, nrow = 2)
n
# 1 2 3
# 4 5 6
rbind(n, 7:9) # the result is only printed; n itself is unchanged
# 1 2 3
# 4 5 6
# 7 8 9
n
# 1 2 3
# 4 5 6
cbind(n, c(10, 11))

# Row and column names.
n <- matrix(1:6, byrow = TRUE, nrow = 2)
rownames(n) <- c("row1", "row2")
n
colnames(n) <- c("col1", "col2", "col3")
n

# Binding an integer matrix to a character matrix gives a character matrix.
x <- matrix(1:8, ncol = 2)
l <- matrix(LETTERS[1:6], nrow = 4, ncol = 3) # six letters recycled into 12 cells
l
cbind(x, l)
# 7. Data frame
# - used to store data in the form of a table
name <- c("John", "Peter", "Patrick", "Julie", "Bob")
age <- c(28, 30, 31, 38, 35)
children <- c(FALSE, TRUE, TRUE, FALSE, TRUE)

# Columns take the names of the vectors ...
df <- data.frame(name, age, children)
# ... or explicit names. From here on the columns are Name, Age and Children,
# so they must be referred to with that capitalisation.
df <- data.frame(Name = name, Age = age, Children = children)
df

df[3, 2]     # 31 (row 3, column 2)
df[3, "Age"] # 31 (same cell, column selected by name)
df[3, ]      # the whole third row
df[["Age"]]  # the Age column as a vector
df[c(3, 5), c("Age", "Children")]
df[2]        # second column, still a data frame
# Modify data
height <- c(163, 177, 163, 162, 157)
df$height <- height
df

weight <- c(75, 65, 54, 34, 78)
# Keep the result so that df really gains the weight column.
df <- cbind(df, weight)
df

# A new row must have the same column names as df for rbind() to accept it.
tom <- data.frame(
  Name = "Tom",
  Age = 36,
  Children = FALSE,
  height = 183,
  weight = 89
)
rbind(df, tom)

# Sorting (ascending)
# The column is called Age (capital A); df$age does not exist.
sort(df$Age)
ranks <- order(df$Age)
df[ranks, ]

# Sorting (descending)
# order() takes `decreasing`, not `descending`.
df[order(df$Age, decreasing = TRUE), ]
# Vectors
# Create vectors
vec1 <- c(10, 20, 30)
vec2 <- c("a", "b", "c")
vec3 <- c(TRUE, FALSE, TRUE)
vec1; vec2; vec3

# Mixed types are coerced to one common type.
vec4 <- c(10, "a", TRUE) # character
vec5 <- c(FALSE, 2)      # numeric
vec6 <- c("A", 1)        # character

# Three ways to write a run of consecutive integers.
vec7 <- c(1:20)
vec8 <- seq(1, 20)
vec9 <- 1:25

# Vector with the odd values from 1 to 20
odd_value <- seq(1, 20, 2)
odd_value
# Vector with the even values
even_value <- seq(2, 20, 2)
even_value
# Vector with the first 10 odd values above 20
vec10 <- seq(from = 21, by = 2, length.out = 10)
vec10

# Naming vector elements with names()
temperature <- c(72, 71, 68, 73, 70, 75, 71)
temperature
names(temperature) <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
temperature

# Another way: keep the names in their own vector
days <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun")
temperature1 <- c(72, 71, 68, 73, 70, 75, 71)
names(temperature1) <- days
temperature1

# Give a name to each price value
price <- seq(100, 220, 20)
names(price) <- paste0("p", 1:7)
price
# Printed output (kept as a comment so the script still parses):
#  p1  p2  p3  p4  p5  p6  p7
# 100 120 140 160 180 200 220
# Basic operations on vectors
v1 <- c(2, 1, 4)
sum(v1)
sd(v1)
var(v1)
prod(v1)
max(v1)
min(v1)

# Vector slicing and indexing
price1 <- seq(550, 670, 20)
names(price1) <- paste0("p", 1:7)
price1

# By index position
price1[3]
price1[3:4]
price1[c(1, 4)]

# By name
price1["p3"]
price1[c("p3", "p4")]

# By logical mask; a shorter mask is recycled along the vector
price1[c(TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE)]
price1[c(TRUE, FALSE)]

# By exclusion: negative positions drop elements
price1[-2]
price1[c(-2, -5)]

# By comparison
price1[price1 > 600]
# Handling NA values in vectors
order_detail <- c(10, 20, 30, NA, 50, 60)
names(order_detail) <- c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
order_detail + 5 # NA stays NA

# Adding two vectors: the shorter one (length 2) is recycled along the longer
new_order <- c(5, 10)
update_order <- order_detail + new_order

# Subsets of a vector
firsttwo <- order_detail[1:2]
firsttwo
l <- length(order_detail)
v1 <- order_detail[(l - 1):1] # all but the last element, reversed
v3 <- order_detail[l:1]       # the whole vector, reversed

# Omit the NA value from a filtered result
na.omit(order_detail[order_detail < 30])

# Find the order details that are multiples of 3.
# A multiple of 3 has remainder 0; a remainder can never equal 3.
multiple_of_3 <- (order_detail %% 3) == 0
multiple_of_3
order_detail[(order_detail %% 3) == 0]
na.omit(order_detail[(order_detail %% 3) == 0])

# Ignore NA values when summing
sum(order_detail, na.rm = TRUE)
# Matrix
mat1 <- matrix(0, 3, 3) # 3 rows, 3 columns, all zero
mat1
mat2 <- matrix(1:9, 3, 3) # filled column by column
mat2
mat3 <- matrix(1:9, nrow = 3, byrow = TRUE) # filled row by row
mat3

# Create a matrix from vectors
stock1 <- c(450, 451, 452, 445, 468)
stock2 <- c(230, 231, 232, 236, 228)
stocks <- c(stock1, stock2)
stocks
stock.matrix <- matrix(stocks, byrow = TRUE, nrow = 2)
stock.matrix

# Naming a matrix
days <- c("Mon", "Tue", "Wed", "Thu", "Fri")
st.names <- c("Stock1", "Stock2")
colnames(stock.matrix) <- days
rownames(stock.matrix) <- st.names
stock.matrix

# Functions associated with a matrix
nrow(mat3) # number of rows
ncol(mat3) # number of columns
dim(mat3)  # number of rows and columns
rownames(stock.matrix)
colnames(stock.matrix)
colSums(stock.matrix)
rowSums(stock.matrix)
rowMeans(stock.matrix)

# Add rows and columns to a matrix
stock3 <- c(150, 151, 149, 120, 114)
total_stock <- rbind(stock.matrix, stock3) # new row, named after the variable
total_stock
avg <- rowMeans(total_stock)
total_stock <- cbind(total_stock, avg)     # new column holding each row's mean
total_stock
# Matrix selection and indexing
scores <- c(
  20, 30, NA, 70,
  22, 28, 36, 80,
  24, 26, 32, 75,
  26, 24, NA, 50
)
student <- matrix(scores, nrow = 4, ncol = 4, byrow = TRUE)
dimnames(student) <- list(
  c("John", "Mathew", "Sam", "Alice"),
  c("Phy", "Chem", "Bio", "Maths")
)
student

# Extraction of columns (and of cells by row name)
student[, 1:2]
student[, c(1, 3)]
student["John", 2:3]
student[c("John", "Sam"), 4]

# Average score of John, ignoring the missing mark
mean(student["John", ], na.rm = TRUE)

# Average and total score of every student (MARGIN = 1 works row by row)
apply(student, 1, mean, na.rm = TRUE)
apply(student, 1, sum, na.rm = TRUE)

passing_score <- c(25, 25, 25, 70)
passing_score

# How many subjects Alice has passed
pass <- student["Alice", ] > passing_score
pass
sum(pass, na.rm = TRUE)
# Data frames
# Built-in data sets
data()              # list the available data sets
data(AirPassengers) # load one by name
head(AirPassengers)
head(iris)

# Spreadsheet-style viewer. Base R spells it View() (capital V); a lowercase
# view() is not part of base R.
View(iris)
View(state.x77)
View(USPersonalExpenditure)
tail(USPersonalExpenditure)

summary(iris) # per-column summary
str(iris)     # compact structure overview
# Filtering a data frame
# NOTE(review): these examples expect df to have `rain` and `temp` columns; no
# such data frame is built in the visible notes -- confirm where it comes from.
subset(df, subset = rain == TRUE)
subset(df, subset = temp > 25)

# Sorting a data frame.
# order() is given the column as a vector (df[["temp"]]) rather than as a
# one-column data frame (df["temp"]).
sorted.temp <- order(df[["temp"]])
df[sorted.temp, ]
sorted.temp
desc.temp <- order(-df[["temp"]]) # negate the values for descending order
df[desc.temp, ]

# Another method to sort: extract the column with $
sort.temp <- order(df$temp)
df[sort.temp, ]

# Merging data frames
# NOTE(review): `authors` and `books` are not defined in the visible notes.
(m1 <- merge(authors, books, by.x = "surname", by.y = "name"))
# Manipulating a data frame: build a small sales report.
# Product has only two values, which are recycled across the ten rows.
salesreport <- data.frame(
  Id = 101:110,
  Product = c("A", "B"),
  Unitprice = as.integer(runif(10, 100, 200)),
  Qty = as.integer(runif(10, 20, 30))
)
salesreport
# Transpose the data frame: rows become columns and columns become rows.
transpose.salesreport <- t(salesreport)
View(transpose.salesreport) # base R viewer is View(), not view()
head(transpose.salesreport)
# Sorting the data frame by unit price (ascending, then descending)
salesreport[order(salesreport$Unitprice), ]
salesreport[order(salesreport$Unitprice, decreasing = TRUE), ]

# Subsetting the data frame
subset.ProductA <- subset(salesreport, Product == "A")
subset.ProductA
# Rows for product A priced above 150, keeping only columns 1 and 4.
subset.salesreport <- subset(salesreport, Product == "A" & Unitprice > 150, c(1, 4))

# Merging data frames
setA <- subset(salesreport, Product == "A")
setB <- subset(salesreport, Product == "B")
cbind(setA, setB) # side by side
rbind(setA, setB) # stacked

# Aggregate function: total quantity per product.
# The column name is written out in full instead of relying on the partial
# matching that `$Prod` depended on.
aggregate(salesreport$Qty, list(salesreport$Product), sum, na.rm = TRUE)
# List
# A list holding a vector, a matrix and a nested list.
list2 <- list(
  vec = seq_len(10),
  mat = matrix(1:9, 3, 3),
  lis = list(a = 10, b = 20)
)
list2
is.recursive(list2)

# Indexing of a list
list2[2]       # single bracket: a list containing the second element
list2$mat      # the matrix itself
list2["vec"]   # a list containing the vec element
list2[[3]][2]  # third element, then its second entry
list2$lis[2]   # the same entry, reached by name
# Vector -> list -> vector round trip
price <- c(10, 20, 30)
pricelist <- as.list(price)
newPrice <- unlist(pricelist)

# Vector -> matrix: assigning dim() turns the vector into a 2 x 2 matrix
price1 <- c(10, 20, 30, 40)
dim(price1) <- c(2, 2)
# String helpers.
# NOTE(review): c1 is not defined anywhere in the visible notes -- presumably a
# character vector created elsewhere; confirm before running.
toupper(c1) #uppercase
substr(c1,1,2) #extract set of characters
# sub() replaces the first match of "Rob" with "Cena" in each element of name.
newname <- sub("Rob","Cena",name) #replace
# 8. Flow Control
# if-else
x <- 30L
if (is.integer(x)) {
  print("x is an integer")
} else {
  print("x is not an integer")
}

# while loop: prints v five times (count runs from 2 up to 6)
v <- c("Hello World")
count <- 2
while (count < 7) {
  print(v)
  count <- count + 1
}

# for loop over the elements themselves
fruit <- c("Apple", "Orange", "Banana")
for (i in fruit) {
  print(i)
}
# for loop over index positions. seq_along() is safe even for an empty vector,
# whereas 1:length(fruit) would yield c(1, 0) and run the body twice.
for (i in seq_along(fruit)) {
  print(fruit[i])
}

# Print squares and square roots of 1 to 25
for (i in 1:25) {
  sq <- i * i
  sqroot <- sqrt(i)
  message("i=", i, "sq=", sq, "sqroot=", sqroot)
}
# 9. Functions
# Built-in functions
# grepl() reports whether the pattern occurs; grep() reports where.
text <- "R is fun to learn"
grepl("fun", text)
v <- c("a", "b", "c", "d")
grep("b", v)
# User-defined functions
# Print the squares of 1, 2, ..., a, one per line.
#
# a: how many squares to print; 0 prints nothing.
# Called for its printing side effect.
squares <- function(a) {
  # seq_len(a) is empty when a is 0, whereas 1:a would iterate over c(1, 0).
  for (i in seq_len(a)) {
    b <- i^2
    print(b)
  }
}
squares(4)
# Simple function: takes no arguments and prints a fixed greeting.
hello_world <- function() {
  greeting <- "Hello world in R"
  print(greeting)
}
hello_world()
# Print a greeting for `name`; paste() puts one space between the two parts.
hello_name <- function(name) {
  greeting <- paste("Hello", name)
  print(greeting)
}
hello_name("John")
# Print the sum of num1 and num2. Because + is vectorised, either argument
# may be a vector.
add_num <- function(num1, num2) {
  total <- num1 + num2
  print(total)
}
add_num(30, 40)
# Add a number to every element of a vector
add_num(c(10, 20, 30), 5)
# Function with a default argument value: `name` falls back to 'Rick'
# when the caller supplies nothing.
hello_name <- function(name = "Rick") {
  greeting <- paste("Hello", name)
  print(greeting)
}
hello_name()
hello_name("Sam")
# Return a value from a function.
#
# name:  first part of the full name (default 'Sachin')
# title: second part of the full name (default 'Tendulkar')
# Returns the two parts joined by a single space.
full_name <- function(name = "Sachin", title = "Tendulkar") {
  # paste() already separates its arguments with one space; the extra empty
  # string that used to sit between them produced a double space.
  paste(name, title)
}
full_name()
full_name2 <- full_name("Don", "Bradman")
full_name2
# Named arguments may be given in any order.
full_name(title = "john", name = "Smith")
#10 generate 100 random values from a normal distribution with mean 0 and
# standard deviation 1
normalDist <- rnorm(100, 0, 1)
mean(normalDist)
hist(normalDist)
# Histogram with a larger number of interval breaks
hist(normalDist, breaks = 50)

# Function with an optional argument: power defaults to 2.
expoValue <- function(x, power = 2) {
  hist(x^power)
}
expoValue(normalDist)
# or
expoValue(normalDist, power = 4)

# Pass any further arguments on to hist() through `...`.
expoValue <- function(x, power = 2, ...) {
  hist(x^power, ...)
}
expoValue(normalDist, power = 2, breaks = 50)

# Passing logical arguments.
#   x:    numeric vector
#   exp:  exponent applied to x (default 2)
#   hist: TRUE to draw a histogram of x^exp, FALSE (default) to return x^exp
#   ...:  forwarded to the histogram call when hist is TRUE
expoValue <- function(x, exp = 2, hist = FALSE, ...) {
  if (isTRUE(hist)) {
    # graphics:: makes clear that the plotting function is meant here, not
    # the logical `hist` argument of the same name.
    graphics::hist(x^exp, ...)
  } else {
    x^exp
  }
}
expoValue(normalDist)
expoValue(normalDist, hist = TRUE)
# Example function 1: report PASS when marks exceed 50, otherwise the
# result stays "Not Defined".
status <- function(marks) {
  result <- if (marks > 50) "PASS" else "Not Defined"
  message("Your result is ", result)
}
status(60)
status(30)
# Example 2: report the age group and voting status. Both stay
# "Not Defined" unless age is at least 18.
status <- function(age) {
  is_adult <- age >= 18
  ageGrp <- if (is_adult) "Adult" else "Not Defined"
  vote <- if (is_adult) "Yes" else "Not Defined"
  message("Your age group is ", ageGrp)
  message("Voting status is ", vote)
}
status(20)
# Example: names longer than five characters are shown in upper case,
# shorter names are shown unchanged.
status <- function(name) {
  shown <- if (nchar(name) > 5) toupper(name) else name
  message("User given name is ", shown)
}
status("Johnny")
# Example: bonus calculation.
#   salary: base salary
#   exp:    years of experience
# Returns 10% of salary when exp is above 5, otherwise 5% of salary.
get_bonus <- function(salary, exp) {
  bonus_per <- if (exp > 5) 10 else 5
  salary * (bonus_per / 100)
}
get_bonus(25000, 5)
# Function example: starting at n, report each square and step n up by one
# until a square of at least 100 has been reported. Returns the n whose
# square ended the loop.
sqr <- function(n) {
  while (TRUE) {
    square <- n * n
    message("The square is ", square)
    if (square >= 100) {
      break
    }
    n <- n + 1
  }
  n
}
sqr(6)
# Switch case
# Look up the HRA amount for a city code. The code is upper-cased first, so
# the lookup ignores case; any code that is not listed gets 5000.
HRA <- function(city) {
  city_code <- toupper(city)
  switch(city_code,
    BLR = 7500,
    MUM = 1000,
    CHN = 7500,
    5000
  )
}
HRA("VOK")
# Map a salary band to its range text. There is no default branch, so a band
# that is not listed gives NULL.
salary_range <- function(band) {
  band_range <- switch(band,
    L1 = "10000 - 15000",
    L2 = "15000-25000",
    L3 = "25000-40000"
  )
  # Returned through a variable so that a NULL result is still printed.
  band_range
}
salary_range("L1")
salary_range("B1")
# Repeat
# repeat has no condition of its own; the loop ends only at the break.
# The message is shown once for every value of time from 15 to 20.
time <- 15
repeat {
  message("Hello,welcome to R")
  if (time >= 20) {
    break
  }
  time <- time + 1
}
# Built-in functions
# Built-in helpers for sequences and vectors
seq(1, 10, by = 2) # 1 3 5 7 9

v <- c(11, 4, 5, 7, 3, 10, 2)
sort(v)                    # ascending
sort(v, decreasing = TRUE) # descending

v2 <- c(1, 2, 3, 4, 5)
rev(v2)       # reversed copy
append(v, v2) # v followed by v2
# Factor
dresssize <- c("M", "L", "S", "S", "L", "M", "L", "M")
# Ordered factor: `levels` fixes the order S < M < L.
dresssize_factor <- factor(dresssize, ordered = TRUE, levels = c("S", "M", "L"))
dresssize_factor
dresssize_factor[1] < dresssize_factor[2] # M < L

# The same factor with descriptive labels in place of the raw codes.
# NOTE(review): the notes referred to an undefined `Type` vector here; dresssize
# is used instead because it holds the same S/M/L codes -- confirm the intent.
Type.factor <- factor(
  dresssize,
  ordered = TRUE,
  levels = c("S", "M", "L"),
  labels = c("Small", "Medium", "Large")
)
# Date
Sys.Date() # today's date; without the parentheses R prints the function itself
as.Date("1990-11-03")
as.Date("Nov-03-90", format = "%b-%d-%y")
# %b = abbreviated month name, %d = day of the month,
# %y = year without century (2 digits)
# 8. Data manipulation
# - the dplyr package is used to transform and summarize tabular data with rows and columns
# Install and attach dplyr and the nycflights13 example data.
install.packages("dplyr")
library(dplyr)
install.packages("nycflights13")
library("nycflights13")
View(flights) # base R viewer is View(), not view()
head(flights)
# filter(): keep the rows that satisfy every condition
f1 <- filter(flights, month == 7)
View(f1)
f2 <- filter(flights, month == 7, day == 3)
f2
View(f2)
View(filter(flights, month == 9, day == 2, origin == "LGA"))
# The same selection written with base R indexing
head(flights[flights$month == 9 & flights$day == 2 & flights$origin == "LGA", ])

# slice(): pick rows by position
slice(flights, 1:5)
slice(flights, 5:10)

# mutate(): add a new column
over_delay <- mutate(flights, overall_delay = arr_delay - dep_delay)
View(over_delay)

# transmute(): keep only the new column
over_delay <- transmute(flights, overall_delay = arr_delay - dep_delay)

# summarise(): descriptive statistics
summarise(flights, avg_air_time = mean(air_time, na.rm = TRUE))
# group_by()
head(mtcars)
by_gear <- mtcars %>% group_by(gear)
by_gear
a <- summarise(by_gear, gear1 = sum(gear), gear2 = mean(gear))
a
# The mtcars column is `cyl` (letter l), not `cy1` (digit one).
summarise(group_by(mtcars, cyl), mean(gear, na.rm = TRUE))
b <- by_gear %>% summarise(gear1 = sum(gear), gear2 = mean(gear))
b
View(by_gear)

# Example 2: group by number of cylinders.
# Arguments to summarise() must be separated by commas.
by_cy1 <- mtcars %>% group_by(cyl)
by_cy1 %>% summarise(
  gear = mean(gear),
  hp = mean(hp)
)
head(by_cy1)
# sample_n() / sample_frac(): random rows
sample_n(flights, 15)     # 15 random rows
sample_frac(flights, 0.4) # 40% of the rows

# arrange(): sort the data set by year, then by departure time
View(arrange(flights, year, dep_time))
head(arrange(flights, year, dep_time))
# Usage of the pipe operator %>%
df <- mtcars
df
View(mtcars)

# Nested calls: read from the inside out (filter, then sample, then sort).
result <- arrange(sample_n(filter(df, mpg > 20), size = 5), desc(mpg))
View(result)

# Multiple assignment: the same steps with named intermediates.
# filter() needs a comparison; `mpg = 20` is a named argument, not a test.
a <- filter(df, mpg > 20)
b <- sample_n(a, size = 5)
result <- arrange(b, desc(mpg))
result

# Using the pipe operator
result <- df %>%
  filter(mpg > 20) %>%
  sample_n(size = 10) %>%
  arrange(desc(mpg))
result

# Selecting columns.
# Not the same as filter(): select() picks columns, filter() picks rows.
df
# The mtcars column is `cyl` (letter l), not `cy1` (digit one).
df_mpg_hp_cy1 <- df %>% select(mpg, hp, cyl)
head(df_mpg_hp_cy1)
# - the tidyr package helps to create tidy data.
install.packages("tidyr")
library("tidyr")

# Wide format: one row per ID, one column per face.
# Every argument is followed by a comma except the last one.
n <- 10
wide <- data.frame(
  ID = c(1:n),
  Face.1 = c(411, 723, 325, 456, 579, 612, 709, 513, 527, 379),
  Face.2 = c(541, 568, 523, 547, 985, 632, 410, 568, 954, 156),
  Face.3 = c(547, 352, 985, 785, 458, 126, 844, 695, 758, 945)
)
View(wide)

# gather(): wide to long format
long <- wide %>% gather(Face, ResponseTime, Face.1:Face.3)
View(long)

# separate(): split a single column into multiple columns
long_separate <- long %>% separate(Face, c("Target", "Number"))
View(long_separate)

# unite(): combine multiple columns into a single column
long_unite <- long_separate %>% unite(Face, Target, Number, sep = ".")
View(long_unite)

# spread(): take two columns (key and value) and spread them into multiple
# columns; it makes "long" data wider
back_to_wide <- long_unite %>% spread(Face, ResponseTime)
View(back_to_wide)
# 9. Data visualization
# Attach the built-in data sets. installed.packages() only lists installed
# packages and takes a library path, so it was not the right call here.
library(datasets)
plot(ChickWeight)

# Base graphics
library(MASS)
plot(UScereal$sugars, UScereal$calories)
title("plot(UScereal$sugars,UScereal$calories)")

x <- UScereal$sugars
y <- UScereal$calories

# Grid graphics: the same scatter plot assembled from viewports and primitives
library(grid)
pushViewport(plotViewport())
pushViewport(dataViewport(x, y))
grid.rect()
grid.xaxis()
grid.yaxis()
grid.points(x, y)
grid.text("UScereal$calories", x = unit(-3, "lines"), rot = 90)
grid.text("UScereal$sugars", y = unit(-3, "lines"), rot = 0)
popViewport(2) # leave both viewports pushed above
# - ggplot2 is a data visualization package for creating graphs in R by decomposing
#   complex graphs into logical subunits
# - ggplot2 uses geoms (geometric objects) to form the basis of different types of graphs:
#   geom_bar for bar plots
#   geom_line for line graphs
#   geom_point for scatter plots
#   geom_boxplot for box plots
#   geom_quantile for continuous x
#   geom_violin for richer display of distribution
#   geom_jitter for small data
# Example 1: bar charts of the mpg data set
library(ggplot2)
head(mpg, n = 10)
str(mpg)
install.packages("tidyverse")
library(tidyverse)

# One bar per vehicle class (the geom is geom_bar, not geo_bar).
ggplot(mpg) +
  geom_bar(aes(x = class))

# Change the bar colour. A fixed colour is a quoted string set outside aes();
# inside aes() a bare `blue` is looked up as a variable.
ggplot(mpg) +
  geom_bar(aes(x = class), fill = "blue")

# Stacked bar chart: fill mapped to the drive type
ggplot(mpg) +
  geom_bar(aes(x = class, fill = drv))

# Dodged bars: fill mapped to the number of cylinders (column `cyl`),
# converted with factor() so each count gets its own colour.
ggplot(mpg) +
  geom_bar(
    aes(x = class, fill = factor(cyl)),
    position = position_dodge(preserve = "single")
  )
# Line graph
library(tidyverse)
# Filter the data we need: the rows of Orange where Tree is 1
Tree_1 <- filter(Orange, Tree == 1)
# Graph the data. R names are case-sensitive, so the plot must use the same
# spelling (Tree_1) as the assignment above.
ggplot(Tree_1) +
  geom_line(aes(x = age, y = circumference))
# Pie chart
# Create data for the graph
x <- c(33, 45, 70, 110)
labels <- c("Soap", "Detergent", "Oil", "Shampoo")

# Plot the chart
pie(x, labels)
pie(x, labels, main = "City pie chart", col = rainbow(length(x)))

# Label the slices with their percentage share instead of their names
piepercent <- round(100 * x / sum(x), 1)
pie(x, labels = piepercent, main = "City pie chart", col = rainbow(length(x)))
# The legend reuses `labels` so that it names the same products, in the same
# order, as the slices it describes.
legend("topright", labels, cex = 0.8, fill = rainbow(length(x)))
# 3D pie chart (drawn with pie3D() from the plotrix package)
install.packages("plotrix")
library(plotrix)

x <- c(33, 45, 70, 110)
lbl <- c("Soap", "Detergent", "Oil", "Shampoo")

# Plot the chart
pie3D(
  x,
  labels = lbl,
  explode = 0.1,
  main = "Pie Chart of Countries"
)
# Histogram
v <- c(9, 13, 21, 8, 36, 22, 12, 41, 31, 33, 19)
hist(v, xlab = "weight", col = "green", border = "red")
# Axis limits are given as vectors built with c().
hist(v,
  xlab = "weight", col = "green", border = "red",
  xlim = c(0, 40), ylim = c(0, 5), breaks = 5
)

# The built-in data set is called airquality (one word).
data("airquality")
View(airquality)
# Scatter plots
# Column names are case-sensitive: the airquality columns are Ozone and Wind.
plot(airquality$Ozone, airquality$Wind)
plot(airquality$Ozone, airquality$Wind, col = "red")
plot(airquality$Ozone, airquality$Wind, type = "h", col = "blue")
plot(airquality) # scatter-plot matrix of every pair of columns

# Assign labels to the plot
plot(airquality$Ozone,
  xlab = "Ozone Concentration", ylab = "No of Instances",
  main = "Ozone levels in NY city", col = "green"
)
# Histogram
hist(airquality$Solar.R)
hist(airquality$Solar.R, main = "Solar Radiation vales in air", xlab = "Solar rad.")
Temperature <- airquality$Temp
hist(Temperature)

# Histogram with labels: hist() returns the bin midpoints and counts, which
# text() uses to write each count above its bar.
h <- hist(Temperature, ylim = c(0, 40))
text(h$mids, h$counts, labels = h$counts, adj = c(0.5, -0.5))

# Histogram with non-uniform width.
# Arguments are separated by commas and the argument is `breaks`, not `break`.
hist(Temperature,
  main = "Maximum daily temperature at LA Airport",
  xlab = "Temperature in degreed Fahrenheit",
  xlim = c(50, 100),
  col = "chocolate",
  border = "brown",
  breaks = c(55, 60, 70, 75, 80, 100)
)
# Box plot of a single column
boxplot(airquality$Solar.R)
# Multiple box plots: one per column for the first four columns.
# The function is boxplot() and column positions start at 1.
boxplot(airquality[, 1:4], main = "Multiple Box plots")
# Scatter plots
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg)) +
  geom_point()

# Scatter plots by factors
View(mtcars)
# NOTE(review): col = 'red' sits inside aes(), where it is treated as a data
# value rather than as a fixed colour; confirm whether a factor column was
# intended here.
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, col = "red")) +
  geom_point()

# Scatter plots by size: point size mapped to qsec
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, size = qsec)) +
  geom_point()
ggplot(data = mtcars, mapping = aes(x = wt, y = mpg, col = "red", size = qsec)) +
  geom_point()

# Visualization using the mpg data set (package::object uses a double colon)
ggplot2::mpg
View(ggplot2::mpg)

# Bar plots
ggplot(data = ggplot2::mpg, aes(class)) +
  geom_bar()

# Stacked bar chart
ggplot(data = ggplot2::mpg, aes(class)) +
  geom_bar(aes(fill = drv))

# Using dodge
ggplot(data = ggplot2::mpg, aes(class)) +
  geom_bar(aes(fill = drv), position = "dodge")

ggplot(data = ggplot2::mpg) +
  geom_point(mapping = aes(x = displ, y = hwy))
ggplot(data = ggplot2::mpg) +
  geom_point(mapping = aes(x = displ, y = hwy, color = class))
# 10. Time Series Analysis