# Answer (1)
v1 =c(1:20)
v1
v2 =seq(20,1,-1)
v2
v3=c(1:20, seq(19,1,-1))
v3
temp=c(4,6,3)
temp
# Answer (2)
#2(e)
ve=rep(c(4,6,3),10)
ve
#2(f)
temp=c(4,6,3)
rep(temp, times = 11, length.out = 31) # length.out takes priority over times, so the result is 31 elements (10 full repeats plus the first value)
#2(g)
vg=c(rep(4,10),rep(6,20),rep(3,30))
vg
#or we can do this in 2nd way
temp=c(4,6,3)
rep(temp,c(10,20,30))
# Answer (3)
x =seq(3,6,by = 0.1)
v2 =exp(x) * cos(x)
v2
# Answer (4)
heights=c(180,165,160,193)          # heights in cm
weights=c(180,165,160,193)          # weights in kg (values as given in the original answer)
bmi=weights/((heights/100)^2)       # BMI = weight (kg) / height (m)^2, so convert cm to m first
print(bmi)
bmi_25 =subset(weights, bmi > 25)   # weights of the individuals whose BMI exceeds 25
print(bmi_25)
# Answer (5)
temp=c(23,27,19)
f=(9/5*temp)+32
print(f)
# Answer (6)
set.seed(75)
aMat=matrix (sample(10, size=60, replace=T), nrow=6)
aMat
#Answer 6)a
apply(aMat, 1, function(x){sum(x>4)})
#Answer 6)b
which( apply(aMat,1,function(x){sum(x==7)==2}) )
#Answer 6)c
aMatColSums =colSums(aMat)
aMatColSums
which( outer(aMatColSums,aMatColSums,"+")>75, arr.ind=T )
#or
#if want to exclude repeats
aMatColSums <- colSums(aMat)
logicalMat <- outer(aMatColSums,aMatColSums,"+")>75
logicalMat[lower.tri(logicalMat,diag=T)] <- F
which(logicalMat, arr.ind=T)
#Answer (7)
Number = c(1,2,3,4)
Diet = c("Poor","Poor","Good","Good")
Sex = c("M","F","M","F")
Weight= c(156,180,167,190)
fat.content= c(34,43,40,50)
Morph = c("Winged","Winged","Wingless","Intermediate")
sdf1= data.frame(Number,Diet, Sex, Weight,fat.content,Morph)
sdf1
is.factor(sdf1$Diet) # FALSE: Diet was stored as a character column, not a factor
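# A minimal hedged sketch (not part of the original answer) of how Diet could be made a
# factor: since R 4.0.0, data.frame() defaults to stringsAsFactors = FALSE, so character
# columns have to be converted explicitly.
sdf1$Diet <- factor(sdf1$Diet)   # convert the existing character column
is.factor(sdf1$Diet)             # now TRUE
levels(sdf1$Diet)                # "Good" "Poor"
# alternatively, data.frame(..., stringsAsFactors = TRUE) converts all character columns at creation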
#Answer (8)
#class(): the class of an object, i.e. what kind of object it is (used for method dispatch)
#mode(): the broad storage mode of an object (numeric, character, logical, list, function, ...)
#typeof(): the internal (low-level) type of an object (double, integer, character, closure, ...)
#is(): tests whether an object belongs to a given class
#rm(): removes objects from the workspace
v=cbind(c(1,2,3),c(4,5,6))
v
class(v) #class will be matrix and array
mode(v) # mode will be numeric
typeof(v) # typeof will be double
is.array(v) # will return TRUE
is.matrix(v) #will return TRUE
rm(v)
v #object not found
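# An additional hedged sketch (assumed example, not part of the original answer) showing
# where mode() and typeof() report different things:
i <- 1L
mode(i)          # "numeric"  (broad category)
typeof(i)        # "integer"  (internal storage type)
mode(mean)       # "function"
typeof(mean)     # "closure"
is(i, "integer") # TRUE
rm(i)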
#Answer (9)
fy = c(2010, 2011, 2012, 2010, 2011, 2012, 2010, 2011, 2012)
fy
company =c("Apple", "Apple", "Apple", "Google", "Google", "Google", "Microsoft", "Microsoft",
"Microsoft")
company
revenue = c(65225, 108249, 156508, 29321, 37905, 50175, 62484, 69943, 73723)
revenue
profit =c( 14013, 25922, 41733, 8505, 9737, 10737, 18760, 23150, 16978)
profit
companiesData=data.frame(fy,company,revenue,profit)
companiesData
write.csv(companiesData, file="ray.csv")
head(companiesData)
dim(companiesData)
nrow(companiesData)
ncol(companiesData)
str(companiesData)
min(companiesData$profit)
max(companiesData$profit)
subset(companiesData, fy >=2011, select=c(fy,profit))
#Answer (10)
gender <- factor(c(rep("female", 91), rep("male", 92)))
table(gender)
gender <- factor(gender, levels = c("male", "female")) # mapping of male to female and female to male
table(gender)
gender <- factor(gender, levels = c("Male", "female")) #mapping of Male to male and female to female
Male value will be zero
table(gender)
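# A hedged sketch (not part of the original answer) of how the levels could be renamed
# without losing any observations: rebuild the factor and supply labels for the new names.
gender <- factor(c(rep("female", 91), rep("male", 92)))
gender <- factor(gender, levels = c("male", "female"), labels = c("Male", "Female"))
table(gender)   # Male 92, Female 91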
rm(gender)
gender # Error: object 'gender' not found, because rm(gender) removed it
#Answer (11)
for (i in 1:7){
print(i**3)
}
#Answer (12)
# Same as Question 8
#Answer (13)
mx = matrix(c(1,1,3,5,2,6,-2,-1,-3), nrow = 3, byrow = TRUE)
fun = function(mx) {
  # keep even entries unchanged, double the odd entries
  ifelse(mx %% 2 == 0, mx, 2*mx)
}
res <- fun(mx)
res
#Answer (14)
num=c(19)
#num=as.numeric(readline(prompt = "Enter The number: "))
is.prime <- function(num) {
  if (num < 2) {
    print("Number is not Prime")   # 0, 1 and negative numbers are not prime
  } else if (num == 2) {
    print("Number is Prime")
  } else if (any(num %% 2:(num-1) == 0)) {
    print("Number is not Prime")
  } else {
    print("Number is Prime")
  }
}
is.prime(num)
#Answer (15)
data = airquality
print(class(data))
result = data[order(data[,1], data[,2]),]
print("Order the entire data frame by the first and second column:")
print(result)
#Answer (16)
data(mtcars)
head(mtcars)
#16)a
print("Original dataframe:")
print(mtcars)
print("Structure of the said data frame:")
str(mtcars)
#16)b
print("Original dataframe:")
print(mtcars)
print("Statistical summary and nature of the data of the said dataframe:")
print(summary(mtcars))
head(mtcars)
#16)c
print(" to extract specific column from a data frame using column name:")
data.frame(mtcars$mpg)
data.frame(mtcars$mpg,mtcars$gear)
#16)d
print("Original dataframe:")
print(mtcars)
print("Extract first two rows:")
result = mtcars[1:2,]
print(result)
#16)e
print("Original dataframe:")
print(mtcars)
print("Extract 3rd and 5th rows with 1st and 3rd columns :")
result = mtcars[c(3,5),c(1,3)]
print(result)
#16)f
print("Original dataframe:")
print(mtcars)
print("New data frame after adding the 'country' column:")
mtcars$country = c("USA","USA","USA","USA","USA","USA","USA","USA","USA","USA",
                   "India","USA","India","USA","India","USA","India","USA","USA","USA",
                   "India","USA","India","USA","India","USA","India","USA","India","USA",
                   "India","USA")
print(mtcars)
#16)g
Alto = data.frame(
mpg = c(21.0, 12.0),
cyl = c(10.5, 9),
disp = c(1, 3),
hp = c(11, 3),
drat = c(12, 3),
wt = c(14, 3),
qsec = c(10.1, 3.0),
vs = c(11.0, 3),
am = c(12, 3),
gear = c(17, 3),
carb = c(12, 3),
country = c('USA', 'USA')
)
mtcars = rbind(mtcars, Alto)
print("After adding new row(s) to an existing data frame:")
print(mtcars)
#16)h
print("Original dataframe:")
print(mtcars)
print("After removing col(s) to an existing data frame:")
mtcars = subset(mtcars, select = -c(cyl, qsec))
print(mtcars)
#16)i
print("Original dataframe:")
print(mtcars)
print("After removing row(s) to an existing data frame:")
print(mtcars[-c(33,34), ])
#16)j
print("to sort a given data frame by multiple column(s):")
print(mtcars[with(mtcars, order(mpg, carb)), ] )
#16)k
print("Original dataframe:")
print(mtcars)
print("Change column-name 'mpg' to 'MPG' of the said dataframe:")
colnames(mtcars)[which(names(mtcars) == "mpg")] = "MPG"
print(mtcars)
#Answer 17)
library(tidyverse)
# merging two csv files
df1 <- read.csv("religion.csv",
header = TRUE,
sep = ",")
df1
df2 <- read.csv("religion.csv",
header = TRUE,
sep = ",")
df2
subset1<-df1[1:5,2:4]
subset1
subset2<-df2[6:10,1:3]
subset2
#b) rename the headers
subset1
names(subset1)<-c("caste","10k","20k")
subset1
subset2
names(subset2)<-c("ID","caste","10k")
subset2
#c) Merge multiple datasets
md<-merge(subset1,subset2,all = TRUE)
md
#d) Delete the last column of resultant dataset
md$ID<-NULL
head(md)
#e) Add a new column as the second column of resultant dataset
ID<-1:10
ID
md<-cbind(md[, 1, drop = FALSE], ID, md[, -1])   # place ID as the second column rather than appending it at the end
View(md)
#f) Save the final data as a csv file using write.table
write.table(md, file = "myd.csv", sep = ",", row.names = FALSE)
#Answer 18)
R <- c(2.27, 1.98, 1.69, 1.88, 1.64, 2.14)
H <- c(8.28, 8.04, 9.06, 8.70, 7.58, 8.34)
volumes<-c(1/3*pi*R^2*H)
volumes
#Answer 19)
mean(volumes)
median(volumes)
sd(volumes)
vol<- subset(volumes,H<8.5)
vol
mean(vol)
#Answer 20)
rainforest<-read.csv("rainforest.csv",header = TRUE)
rainforest
Acmena <- subset(rainforest, species == "Acmena smithii")
Acmena
order1 <- order(Acmena$dbh)   # row indices that would sort the Acmena records by dbh
order1
Statistics Assignment in R
Q1. Consider the data from Brendon, Jason, Melissa, Paula, and McGuirk. For each answer,
indicate how you know, when appropriate, by reporting the values of the statistic you are
using or other information you used.
install.packages("psych")
library(psych)
install.packages("PerformanceAnalytics")
library(PerformanceAnalytics)
install.packages("DescTools")
library(DescTools)
install.packages("ggplot2")
library(ggplot2)
install.packages("rcompanion")
library(rcompanion)
Input = ("
Instructor Grade Weight Calories Sodium Score
'Brendon Small' 6 43 2069 1287 77
'Brendon Small' 6 41 1990 1164 76
'Brendon Small' 6 40 1975 1177 76
'Brendon Small' 6 44 2116 1262 84
'Brendon Small' 6 45 2161 1271 86
'Brendon Small' 6 44 2091 1222 87
'Brendon Small' 6 48 2236 1377 90
'Brendon Small' 6 47 2198 1288 78
'Brendon Small' 6 46 2190 1284 89
'Jason Penopolis' 7 45 2134 1262 76
'Jason Penopolis' 7 45 2128 1281 80
'Jason Penopolis' 7 46 2190 1305 84
'Jason Penopolis' 7 43 2070 1199 68
'Jason Penopolis' 7 48 2266 1368 85
'Jason Penopolis' 7 47 2216 1340 76
'Jason Penopolis' 7 47 2203 1273 69
'Jason Penopolis' 7 43 2040 1277 86
'Jason Penopolis' 7 48 2248 1329 81
'Melissa Robins' 8 48 2265 1361 67
'Melissa Robins' 8 46 2184 1268 68
'Melissa Robins' 8 53 2441 1380 66
'Melissa Robins' 8 48 2234 1386 65
'Melissa Robins' 8 52 2403 1408 70
'Melissa Robins' 8 53 2438 1380 83
'Melissa Robins' 8 52 2360 1378 74
'Melissa Robins' 8 51 2344 1413 65
'Melissa Robins' 8 51 2351 1400 68
'Paula Small' 9 52 2390 1412 78
'Paula Small' 9 54 2470 1422 62
'Paula Small' 9 49 2280 1382 61
'Paula Small' 9 50 2308 1410 72
'Paula Small' 9 55 2505 1410 80
'Paula Small' 9 52 2409 1382 60
'Paula Small' 9 53 2431 1422 70
'Paula Small' 9 56 2523 1388 79
'Paula Small' 9 50 2315 1404 71
'Coach McGuirk' 10 52 2406 1420 68
'Coach McGuirk' 10 58 2699 1405 65
'Coach McGuirk' 10 57 2571 1400 64
'Coach McGuirk' 10 52 2394 1420 69
'Coach McGuirk' 10 55 2518 1379 70
'Coach McGuirk' 10 52 2379 1393 61
'Coach McGuirk' 10 59 2636 1417 70
'Coach McGuirk' 10 54 2465 1414 59
'Coach McGuirk' 10 54 2479 1383 61
")
Data = read.table(textConnection(Input),header=TRUE)
Data
head(Data)
output:-
> head(Data)
Instructor Grade Weight Calories Sodium Score
1 Brendon Small 6 43 2069 1287 77
2 Brendon Small 6 41 1990 1164 76
3 Brendon Small 6 40 1975 1177 76
4 Brendon Small 6 44 2116 1262 84
5 Brendon Small 6 45 2161 1271 86
6 Brendon Small 6 44 2091 1222 87
Data$Instructor = factor(Data$Instructor,
levels=unique(Data$Instructor))
pairs(data=Data,
~ Grade + Weight + Calories + Sodium + Score)
Output: scatterplot matrix of Grade, Weight, Calories, Sodium and Score (plot not shown)
Data.num = Data[c("Grade", "Weight", "Calories", "Sodium",
"Score")]
corr.test(Data.num,
use = "pairwise",
method = "pearson",
adjust = "none")
output:-
> Data.num = Data[c("Grade", "Weight", "Calories", "Sodium", "Score")]
> corr.test(Data.num,
+ use = "pairwise",
+ method = "pearson",
+ adjust = "none")
Call:corr.test(x = Data.num, use = "pairwise", method = "pearson",
adjust = "none")
Correlation matrix
Grade Weight Calories Sodium Score
Grade 1.00 0.85 0.85 0.79 -0.70
Weight 0.85 1.00 0.99 0.87 -0.48
Calories 0.85 0.99 1.00 0.85 -0.48
Sodium 0.79 0.87 0.85 1.00 -0.45
Score -0.70 -0.48 -0.48 -0.45 1.00
Sample Size
[1] 45
Probability values (Entries above the diagonal are adjusted for multiple
tests.)
Grade Weight Calories Sodium Score
Grade 0 0 0 0 0
Weight 0 0 0 0 0
Calories 0 0 0 0 0
Sodium 0 0 0 0 0
Score 0 0 0 0 0
To see confidence intervals of the correlations, print with the
short=FALSE option
library(PerformanceAnalytics)
chart.Correlation(Data.num,
method="pearson",
histogram=TRUE,
pch=16)
output: correlation chart (histograms, pairwise scatterplots and correlation coefficients; plot not shown)
plot(Sodium ~ Calories,
data=Data,
pch=16,
xlab = "Calories",
ylab = "Sodium")
output: scatterplot of Sodium vs. Calories (plot not shown)
cor.test( ~ Sodium + Calories,
data=Data,
method = "pearson")
output:-
> cor.test( ~ Sodium + Calories,
+ data=Data,
+ method = "pearson")
Pearson's product-moment correlation
data: Sodium and Calories
t = 10.534, df = 43, p-value = 1.737e-13
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.7397691 0.9145785
sample estimates:
cor
0.8489548
model = lm(Sodium ~ Calories,
data = Data)
x = residuals(model)
library(rcompanion)
plotNormalHistogram(x)
output: histogram of the model residuals with a normal curve (plot not shown)
cor.test( ~ Sodium + Calories,
data=Data,
method = "kendall")
output:-
cor.test( ~ Sodium + Calories,
data=Data,
method = "kendall")
Kendall's rank correlation tau
z = 6.2631, p-value = 3.774e-10
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
0.6490902
model = lm(Sodium ~ Calories,
data = Data)
summary(model)
output:-
> model = lm(Sodium ~ Calories,
+ data = Data)
> summary(model)
Call:
lm(formula = Sodium ~ Calories, data = Data)
Residuals:
Min 1Q Median 3Q Max
-83.263 -26.263 -0.486 29.973 64.714
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 519.07547 78.78211 6.589 5.09e-08 ***
Calories 0.35909 0.03409 10.534 1.74e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 38.89 on 43 degrees of freedom
Multiple R-squared: 0.7207, Adjusted R-squared: 0.7142
F-statistic: 111 on 1 and 43 DF, p-value: 1.737e-13
plot(Sodium ~ Calories,
data=Data,
pch=16,
xlab = "Calories",
ylab = "Sodium")
abline(model,
col = "blue",
lwd = 2)
output: scatterplot of Sodium vs. Calories with the fitted regression line (plot not shown)
x = residuals(model)
library(rcompanion)
plotNormalHistogram(x)
output: histogram of the model residuals with a normal curve (plot not shown)
plot(fitted(model),
residuals(model))
output: residuals vs. fitted values plot (plot not shown)
a. Which two variables are the most strongly correlated?
Ans: Weight & Calories (r = 0.99)
b. Which two variables are the least strongly correlated?
Ans: Sodium & Score (r = -0.45, the smallest correlation in absolute value)
c. Are there any pairs of variables that are statistically uncorrelated? Which?
Ans: No; every pairwise correlation is statistically significant (all p-values in the matrix are essentially 0).
d. Name a pair of variables that is positively correlated.
Ans: Weight & Calories (r = 0.99)
e. Name a pair of variables that is negatively correlated.
Ans: Grade & Score (r = -0.70)
f. Is Sodium significantly correlated with Calories?
Ans: Yes, Sodium is significantly correlated with Calories (r = 0.85, p-value = 1.737e-13 from cor.test above).
g. By linear regression, is there a significant linear relationship of Sodium vs. Calories?
Ans: Yes; the slope for Calories is significant (p-value = 1.74e-13 in the model summary below).
h. Does the quadratic polynomial model fit the Sodium vs. Calories data better than the
linear model? Consider the p-value, the r-squared value, the range of values for each
of Sodium and Calories, and your practical conclusions
model = lm(Sodium ~ Calories,
data = Data)
summary(model)
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 519.07547 78.78211 6.589 5.09e-08 ***
Calories 0.35909 0.03409 10.534 1.74e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 38.89 on 43 degrees of freedom
Multiple R-squared: 0.7207, Adjusted R-squared: 0.7142
F-statistic: 111 on 1 and 43 DF, p-value: 1.737e-13
Equation: Sodium = 0.35909 * Calories + 519.07547
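The summary above only refits the linear model. To actually answer part (h), a quadratic
term can be added and the two fits compared; a minimal sketch, assuming the same Data
object as above (model.quad is a name introduced here for illustration):
model.quad = lm(Sodium ~ Calories + I(Calories^2),
                data = Data)
summary(model.quad)      # p-value of the I(Calories^2) term and the new R-squared
anova(model, model.quad) # F-test: does the quadratic term significantly improve the fit?
AIC(model, model.quad)   # lower AIC indicates the preferred model
If the quadratic term is not significant and the R-squared barely changes, the simpler
linear model would remain the practical choice over the observed range of Calories.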
Q2. As part of a professional skills program, a 4-H club tests its members for typing
proficiency (Words.per.minute), Proofreading skill, proficiency with using a Spreadsheet,
and acumen in Statistics.
data <- read.csv(text = "
Instructor,Grade,Words.per.minute,Proofreading,Spreadsheet,Statistics
'Dr. Katz',6,35,53,75,61
'Dr. Katz',6,50,77,24,51
'Dr. Katz',6,55,71,62,55
'Dr. Katz',6,60,78,27,91
'Dr. Katz',6,65,84,44,95
'Dr. Katz',6,60,79,38,50
'Dr. Katz',6,70,96,12,94
'Dr. Katz',6,55,61,55,76
'Dr. Katz',6,45,73,59,75
'Dr. Katz',6,55,75,55,80
'Dr. Katz',6,60,85,35,84
'Dr. Katz',6,45,61,49,80
'Laura',7,55,59,79,57
'Laura',7,60,60,60,60
'Laura',7,75,90,19,64
'Laura',7,65,87,32,65
'Laura',7,60,70,33,94
'Laura',7,70,84,27,54
'Laura',7,75,87,24,59
'Laura',7,70,97,38,74
'Laura',7,65,86,30,52
'Laura',7,72,91,36,66
'Laura',7,73,88,20,57
'Laura',7,65,86,19,71
'Ben Katz',8,55,84,20,76
'Ben Katz',8,55,63,44,94
'Ben Katz',8,70,95,31,88
'Ben Katz',8,55,63,69,93
'Ben Katz',8,65,65,47,70
'Ben Katz',8,60,61,63,92
'Ben Katz',8,70,80,35,60
'Ben Katz',8,60,88,38,58
'Ben Katz',8,60,71,65,99
'Ben Katz',8,62,78,46,54
'Ben Katz',8,63,89,17,60
'Ben Katz',8,65,75,33,77
")
data
head(data)
data$Instructor = factor(data$Instructor,
levels=unique(data$Instructor))
pairs(data=data,
~ Grade+Words.per.minute+Proofreading+Spreadsheet+Statistics)
Output: scatterplot matrix of Grade, Words.per.minute, Proofreading, Spreadsheet and Statistics (plot not shown)
Data.num = data[c("Grade", "Words.per.minute", "Proofreading", "Spreadsheet",
"Statistics")]
Data.num[] <- lapply(Data.num, as.numeric)
corr.test(Data.num,
use = "pairwise",
method = "pearson")
Output:-
Call:corr.test(x = Data.num, use = "pairwise", method = "pearson")
Correlation matrix
Grade Words.per.minute Proofreading Spreadsheet Statistics
Grade 1.00 0.24 0.05 -0.05 0.07
Words.per.minute 0.24 1.00 0.52 -0.45 -0.24
Proofreading 0.05 0.52 1.00 -0.79 -0.13
Spreadsheet -0.05 -0.45 -0.79 1.00 0.12
Statistics 0.07 -0.24 -0.13 0.12 1.00
Sample Size
[1] 36
chart.Correlation(Data.num,
method="pearson",
histogram=TRUE,
pch=16)
output: correlation chart (histograms, pairwise scatterplots and correlation coefficients; plot not shown)
a. Which two variables are the most strongly correlated?
Ans: Proofreading & Spreadsheet (r = -0.79, the largest correlation in absolute value)
b. Name a pair of variables that are statistically uncorrelated.
Ans: Spreadsheet & Statistics (r = 0.12)
c. Name a pair of variables that is positively correlated.
Ans: Words.per.minute & Proofreading (r = 0.52)
d. Name a pair of variables that is negatively correlated.
Ans: Proofreading & Spreadsheet (r = -0.79)
e. Consider the correlation between Spreadsheet and Proofreading.
(i). What is the value of the correlation coefficient r for this correlation?
cor.test( ~ Spreadsheet + Proofreading,
data=Data.num,
method = "pearson")
output:-
Pearson's product-moment correlation
data: Spreadsheet and Proofreading
t = -7.5962, df = 34, p-value = 7.97e-09
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.8898707 -0.6284997
sample estimates:
cor
-0.793243
Ans- r=(-0.793)
(ii). What is the value of tau?
cor.test( ~ Spreadsheet + Proofreading,
data=Data.num,
method = "kendall")
output:-
Kendall's rank correlation tau
data: Spreadsheet and Proofreading
z = -4.888, p-value = 1.019e-06
alternative hypothesis: true tau is not equal to 0
sample estimates:
tau
-0.5792888
Ans- tau=(-0.579)
(iii). What is the value of rho?
cor.test( ~ Spreadsheet + Proofreading,
data=Data.num,
method = "spearman")
output:-
Spearman's rank correlation rho
data: Spreadsheet and Proofreading
S = 13634, p-value = 1.057e-07
alternative hypothesis: true rho is not equal to 0
sample estimates:
rho
-0.7546574
Ans- rho=(-0.755)
f. Conduct a linear regression of Proofreading vs. Words.per.minute.
model = lm(Proofreading ~ Words.per.minute,
data = Data.num)
summary(model)
output:-
Call:
lm(formula = Proofreading ~ Words.per.minute, data = Data.num)
Residuals:
Min 1Q Median 3Q Max
-20.932 -8.960 1.568 7.913 17.561
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 64.4820 4.0977 15.736 < 2e-16 ***
Words.per.minute 2.4928 0.7103 3.509 0.00129 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 10.45 on 34 degrees of freedom
Multiple R-squared: 0.2659, Adjusted R-squared: 0.2443
F-statistic: 12.32 on 1 and 34 DF, p-value: 0.001287
(i). What is the p-value for this model?
Ans- 0.001287
(ii). What is the r-squared value?
Ans- 0.2659
(iii). Do the residuals suggest that the linear regression model is an appropriate model?
Ans: The diagnostic plots below can be used to judge this; the residual quartiles in the
summary above are roughly symmetric about zero, which is consistent with the linear model
being a reasonable fit.
plot(Proofreading ~ Words.per.minute,
data=Data.num,
pch=16,
xlab = "Words.per.minute",
ylab = "Proofreading")
abline(model,
col = "blue",
lwd = 2)
x = residuals(model)
library(rcompanion)
plotNormalHistogram(x)
plot(fitted(model),
residuals(model))
(iv). What can you conclude about the results of the linear regression? Consider the
p-value, the r-squared value, the range of values for each of Proofreading and
Words.per.minute, and your practical conclusions.
(The model and its summary are the same as in part (f) above: p-value = 0.001287,
Multiple R-squared = 0.2659.)
Ans: The fitted line is Proofreading = 2.4928 * Words.per.minute + 64.4820. The relationship
is statistically significant (p = 0.001287), but the model explains only about 27% of the
variance in Proofreading (R-squared = 0.2659), so typing speed is at best a weak-to-moderate
practical predictor of proofreading skill.