IRIS Commands Practice

Download as txt, pdf, or txt
Download as txt, pdf, or txt
You are on page 1 of 10

https://r4ds.had.co.nz/transform.html --- link for R commands (data transformation with dplyr)

install.packages("dplyr");
install.packages("plyr")
install.packages("readr")
install.packages("FSelector");
library(dplyr);
library(readr)
library(FSelector)
library(plyr)
library(dplyr);
library(FSelector);

> setwd("C:\\Users\\aqiba\\OneDrive\\Desktop\\Data Analytics\\IRIS dataset");


> mydata <- read_csv("iris_new.csv");

-- Column specification --------------------------------------------------------


cols(
sepal.length = col_double(),
sepal.width = col_double(),
petal.length = col_double(),
petal.width = col_double(),
variety = col_character()
)

> mydata
> filter(mydata, variety =="Setosa" & sepal.length > 4);
> summary(mydata)
sepal.length sepal.width petal.length petal.width
Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
Median :5.800 Median :3.000 Median :4.350 Median :1.300
Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
variety
Length:150
Class :character
Mode :character

> transform(mydata, variety =="Setose");


> show(mydata)
> View(mydata)
> select(mydata, variety, sepal.length);

> mutate(mydata, total = sepal.width + petal.width);


> select(mydata, total);
> View(mydata)
> transform(mydata, sepal.width = -sepal.width);
> View(mydata)
> v_change_sepal <- transform(mydata, sepal.width = -sepal.width);
> v_change_sepal
rename(iris_date_new, c("variety" = "species")); ------ to rename any column of the data set

sum(is.na(iris_null_data)); --- counts the null (NA) values in the dataset


[1] 5
OR

any(is.na(data)) ----- checks whether there are any null (NA) values in the entire data frame

> for (i in which(sapply(iris_null_data, is.numeric))) {


+ iris_null_data[is.na(iris_null_data[, i]), i] <- mean(iris_null_data[, i], na.rm = TRUE)
+ } ----- for replacing null values (each numeric column's NAs are set to that column's mean)
> newdata <- na.omit(iris_null_data)

------------------------links ------------------------------
https://subscription.packtpub.com/book/big_data_and_business_intelligence/9781785286544/1/ch01lvl1sec11/data-preprocessing-techniques ---- preprocessing steps
http://dataanalyticsedge.com/2018/05/02/data-cleaning-using-r/ ------ for functions that check for missing values
----------------------------------- regression ----------------------------------------------------------------
https://www.datacamp.com/community/tutorials/linear-regression-R

?USJudgeRatings
head(USJudgeRatings)
USJUDGE_DATE <- USJudgeRatings
X <- as.matrix([-10]);
X <- as.matrix(USJUDGE_DATE[-10]);
X
X <- as.matrix(USJUDGE_DATE[-7]);
X
X <- as.matrix(USJUDGE_DATE[1]);
X
X <- as.matrix(USJUDGE_DATE[-8]);
X
V_REG <- lm(RTEN ~ CONT + INTG, data = USJUDGE_DATE);
V_REG
plot(V_REG);
plot(V_REG);
abline(V_REG);
V_REG <- lm(RTEN ~ CONT data = USJUDGE_DATE);
V_REG <- lm(RTEN ~ CONT, data = USJUDGE_DATE);
abline(V_REG);
v_plot <- abline(V_REG);
v_plot
abline(V_REG);
plot(V_REG, pch = 16, col = "blue");

--------------------------------- CORRELATION --------------------------------------------------------------

head(iris_new)

> x <- iris_new[1:1];


> y <- iris_new[2:2];
> y
# A tibble: 150 x 1
sepal.width
<dbl>
1 3.5
2 3
3 3.2
4 3.1
5 3.6
6 3.9
7 3.4
8 3.4
9 2.9
10 3.1
# ... with 140 more rows
> x
# A tibble: 150 x 1
sepal.length
<dbl>
1 5.1
2 4.9
3 4.7
4 4.6
5 5
6 5.4
7 4.6
8 5
9 4.4
10 4.9
# ... with 140 more rows
> v_get_correlation <- cor(x,y);
> v_get_correlation
sepal.width
sepal.length -0.1175698
> v_get_correlation <- cor(x,y, method = "spearman");
> v_get_correlation
sepal.width
sepal.length -0.1667777
> install.packages("ggpubr");
Error in install.packages : Updating loaded packages
> install.packages("ggpubr")
WARNING: Rtools is required to build R packages but is not currently installed.
Please download and install the appropriate version of Rtools before proceeding:

https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/aqiba/OneDrive/Documents/R/win-library/4.0’
(as ‘lib’ is unspecified)

>
>
>

> library(ggpubr);

Attaching package: ‘ggpubr’

The following object is masked from ‘package:plyr’:

mutate

> plot(v_get_correlation);
> ggqqplot(iris_new$sepal.length,iris_new$petal.width);
Error in data[, x] : incorrect number of dimensions
> ggqqplot(iris_new$sepal.length, ylab = "sepal_length");
> ggqqplot(iris_new$petal.width, ylab = "petal_width");
------------------------------------------------------------------

install.packages("rpart.plot");
WARNING: Rtools is required to build R packages but is not currently installed.
Please download and install the appropriate version of Rtools before proceeding:

https://cran.rstudio.com/bin/windows/Rtools/
Installing package into ‘C:/Users/aqiba/OneDrive/Documents/R/win-library/4.0’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/rpart.plot_3.0.9.zip'
Content type 'application/zip' length 1034182 bytes (1009 KB)
downloaded 1009 KB

package ‘rpart.plot’ successfully unpacked and MD5 sums checked

The downloaded binary packages are in


C:\Users\aqiba\AppData\Local\Temp\RtmpcVOHBJ\downloaded_packages
> install.packages("data.tree");
> install.packages("party");
> library(dplyr);
> library(readr);
> library(plyr);
-------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
-------------------------------------------------------------------------------
> setwd("C:\\Users\\aqiba\\OneDrive\\Desktop\\Data Analytics\\IRIS dataset");
> iris_data <- read.csv("iris_new.csv");
> iris_data;
> library(rpart);
> library(rpart.plot);
> create_tree <- rpart(variety ~ sepal.length, data = iris_data);
> create_tree
n= 150

node), split, n, loss, yval, (yprob)


* denotes terminal node

1) root 150 100 Setosa (0.33333333 0.33333333 0.33333333)


2) sepal.length< 5.45 52 7 Setosa (0.86538462 0.11538462 0.01923077) *
3) sepal.length>=5.45 98 49 Virginica (0.05102041 0.44897959 0.50000000)
6) sepal.length< 6.15 43 15 Versicolor (0.11627907 0.65116279 0.23255814) *
7) sepal.length>=6.15 55 16 Virginica (0.00000000 0.29090909 0.70909091) *
> rpart.plot(create_tree, extra = 7);
Warning message:
extra=7 but the response has 3 levels (only the 2nd level is displayed)
> rpart.plot(create_tree, extra = 3);
> View(iris_data)
> create_tree <- rpart(variety ~ sepal.length + sepal.width, data = iris_data);
> create_tree
n= 150

node), split, n, loss, yval, (yprob)


* denotes terminal node

1) root 150 100 Setosa (0.33333333 0.33333333 0.33333333)


2) sepal.length< 5.45 52 7 Setosa (0.86538462 0.11538462 0.01923077)
4) sepal.width>=2.8 45 1 Setosa (0.97777778 0.02222222 0.00000000) *
5) sepal.width< 2.8 7 2 Versicolor (0.14285714 0.71428571 0.14285714) *
3) sepal.length>=5.45 98 49 Virginica (0.05102041 0.44897959 0.50000000)
6) sepal.length< 6.15 43 15 Versicolor (0.11627907 0.65116279 0.23255814)
12) sepal.width>=3.1 7 2 Setosa (0.71428571 0.28571429 0.00000000) *
13) sepal.width< 3.1 36 10 Versicolor (0.00000000 0.72222222 0.27777778) *
7) sepal.length>=6.15 55 16 Virginica (0.00000000 0.29090909 0.70909091) *
> rpart.plot(create_tree, extra = 7);
Warning message:
extra=7 but the response has 3 levels (only the 2nd level is displayed)

> rpart.plot(create_tree, extra = 3);

----------------------- other method for tree ---------------------------------------------

create_tree <- ctree(sepal.width ~ sepal.length, data = iris_data);


> create_tree

Model formula:
sepal.width ~ sepal.length

Fitted party:
[1] root: 3.057 (n = 150, err = 28.3)

Number of inner nodes: 0


Number of terminal nodes: 1
> plot(create_tree);
> create_tree <- ctree(petal.length ~ sepal.length, data = iris_data);
> create_tree

Model formula:
petal.length ~ sepal.length

Fitted party:
[1] root
| [2] sepal.length <= 5.5
| | [3] sepal.length <= 5.4: 1.769 (n = 52, err = 34.1)
| | [4] sepal.length > 5.4: 3.229 (n = 7, err = 10.2)
| [5] sepal.length > 5.5
| | [6] sepal.length <= 6.2
| | | [7] sepal.length <= 5.8: 3.924 (n = 21, err = 25.8)
| | | [8] sepal.length > 5.8: 4.711 (n = 19, err = 3.3)
| | [9] sepal.length > 6.2
| | | [10] sepal.length <= 7: 5.169 (n = 39, err = 8.9)
| | | [11] sepal.length > 7: 6.300 (n = 12, err = 1.4)

Number of inner nodes: 5


Number of terminal nodes: 6
> plot(create_tree);
> create_tree <- rpart(petal.length ~ sepal.length, data = iris_data);
> plot(create_tree);
> rpart.plot(create_tree, extra = 3);

The 'extra' argument:


0 No extra information
1 Number of observations in the node
2 Class models: Classification rate (ncorrect/nobservations)
Poisson and exp models: number of events
3 Class models: Misclassification rate
4 Class models: Probability per class
5 Class models: Like 4 but don't display the fitted class
6 Class models: Probability of second class only
7 Class models: Like 6 but don't display the fitted class
8 Class models: Probability of the fitted class
9 Class models: Probability relative to all observations
10 Class models: like 9 but display the probability of the second class only

Add 100 to also display the percentage of observations in the node

Error: extra=3 is legal only for "class" models (you have an "anova" model)
> rpart.plot(create_tree);
> rpart.plot(create_tree, extra = 2);

The 'extra' argument:


0 No extra information
1 Number of observations in the node
2 Class models: Classification rate (ncorrect/nobservations)
Poisson and exp models: number of events
3 Class models: Misclassification rate
4 Class models: Probability per class
5 Class models: Like 4 but don't display the fitted class
6 Class models: Probability of second class only
7 Class models: Like 6 but don't display the fitted class
8 Class models: Probability of the fitted class
9 Class models: Probability relative to all observations
10 Class models: like 9 but display the probability of the second class only

Add 100 to also display the percentage of observations in the node

Error: extra=2 is legal only for "class", "poisson" and "exp" models (you have an
"anova" model)
> create_tree
n= 150

node), split, n, deviance, yval


* denotes terminal node

1) root 150 464.325400 3.758000


2) sepal.length< 5.55 59 57.404070 1.942373
4) sepal.length< 5.45 52 34.090770 1.769231 *
5) sepal.length>=5.45 7 10.174290 3.228571 *
3) sepal.length>=5.55 91 86.327470 4.935165
6) sepal.length< 6.25 40 35.249750 4.297500
12) sepal.length< 5.85 21 25.778100 3.923810 *
13) sepal.length>=5.85 19 3.297895 4.710526 *
7) sepal.length>=6.25 51 22.056470 5.435294
14) sepal.length< 7.05 39 8.923077 5.169231 *
15) sepal.length>=7.05 12 1.400000 6.300000 *
> rpart.plot(create_tree, extra = 1);
---------------------------------------------------------------------------- neural network -----------------

v_neural <- read.csv("neural_net.csv");


> v_neural
TCK CSS placed
1 20 90 1
2 10 20 0
3 30 40 0
4 20 50 0
5 80 50 1
6 30 80 1

> nn=neuralnet(placed~TCK+CSS,data=v_neural, hidden=3,act.fct = "logistic",


+ linear.output = FALSE);
> plot(nn)

-------------------------------------------------------------------- clustering ---------------------------------------------

>iris_data
> Iris_cluster = iris_data;
> Iris_cluster
> Iris_cluster$variety = NULL;
> Iris_cluster
> create_cluster <- kmeans(Iris_cluster,3);
> create_cluster
K-means clustering with 3 clusters of sizes 38, 50, 62

Cluster means:
sepal.length sepal.width petal.length petal.width
1 6.850000 3.073684 5.742105 2.071053
2 5.006000 3.428000 1.462000 0.246000
3 5.901613 2.748387 4.393548 1.433871

Clustering vector:
[1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[39] 2 2 2 2 2 2 2 2 2 2 2 2 3 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[77] 3 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 3 1 1 1 1 3 1 1 1 1 1 1 3
[115] 3 1 1 1 1 3 1 3 1 3 1 1 3 3 1 1 1 1 1 3 1 1 1 1 3 1 1 1 3 1 1 1 3 1 1 3

Within cluster sum of squares by cluster:


[1] 23.87947 15.15100 39.82097
(between_SS / total_SS = 88.4 %)

Available components:

[1] "cluster" "centers" "totss" "withinss" "tot.withinss"


[6] "betweenss" "size" "iter" "ifault"
> create_cluster <- kmeans(Iris_cluster,2);
> create_cluster
K-means clustering with 2 clusters of sizes 97, 53

Cluster means:
sepal.length sepal.width petal.length petal.width
1 6.301031 2.886598 4.958763 1.695876
2 5.005660 3.369811 1.560377 0.290566

Clustering vector:
[1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[39] 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[77] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
[115] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

Within cluster sum of squares by cluster:


[1] 123.79588 28.55208
(between_SS / total_SS = 77.6 %)
Available components:

[1] "cluster" "centers" "totss" "withinss" "tot.withinss"


[6] "betweenss" "size" "iter" "ifault"
> create_cluster <- kmeans(Iris_cluster,3);
> create_cluster
K-means clustering with 3 clusters of sizes 38, 62, 50

Cluster means:
sepal.length sepal.width petal.length petal.width
1 6.850000 3.073684 5.742105 2.071053
2 5.901613 2.748387 4.393548 1.433871
3 5.006000 3.428000 1.462000 0.246000

Clustering vector:
[1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
[39] 3 3 3 3 3 3 3 3 3 3 3 3 2 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
[77] 2 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 2 1 1 1 1 2 1 1 1 1 1 1 2
[115] 2 1 1 1 1 2 1 2 1 2 1 1 2 2 1 1 1 1 1 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1 1 2

Within cluster sum of squares by cluster:


[1] 23.87947 39.82097 15.15100
(between_SS / total_SS = 88.4 %)

Available components:

[1] "cluster" "centers" "totss" "withinss" "tot.withinss"


[6] "betweenss" "size" "iter" "ifault"
> create_cluster$size
[1] 38 62 50
>
>
>
>
> table(iris_data$variety, create_cluster);
Error in table(iris_data$variety, create_cluster) :
all arguments must have the same length
> table(iris_data$variety, create_cluster$cluster);

1 2 3
Setosa 0 0 50
Versicolor 2 48 0
Virginica 36 14 0
> plot(iris_data[c("petal.length", "petal.width")], col = create_cluster$cluster);
> plot(iris_data[c("sepal.length", "sepal.width")], col = create_cluster$cluster);
>

--------------------------- to check whether there are any null (NA) values in the entire data set ----------------------------------
any(is.na(v_iris_data));

--------------------------- to remove all rows that contain a null or NA value ----------------------------
v_get_new_iris_data <- na.omit(v_iris_data);
> v_get_new_iris_data;
----------------------------------- principal component analysis ---------------------------------
----https://www.datacamp.com/community/tutorials/pca-analysis-r
------https://aaronschlegel.me/principal-component-analysis-r-example.html

v_get_pca <- prcomp(v_get_new_iris_data[,1:3]);


> v_get_pca
>install.packages("ggfortify");
pca.plot <- autoplot(v_get_pca, data = v_get_new_iris_data, colour = 'Group');
Error in autoplot(v_get_pca, data = v_get_new_iris_data, colour = "Group") :
could not find function "autoplot"
> library(ggfortify);
> pca.plot <- autoplot(v_get_pca, data = v_get_new_iris_data, colour = 'Group');
> pca.plot
Error: Unknown colour name: Group
> pca.plot <- autoplot(v_get_pca, data = v_get_new_iris_data, colour = 'Red');
> pca.plot

----------------------------------------- create histogram ------------------------------
> hist(v_get_new_iris_data$petal.length);

----------------------------- str command, similar to summary ---------------------------
str(v_get_new_iris_data);
---------------------------------- correlation -------------------------------------------

x <- v_get_new_iris_data[1:1];
> y <- v_get_new_iris_data[3:3];
> v_find_correlation <- cor(x,y);
> v_find_correlation;
petal.length
sepal.length 0.8679478
> v_find_correlation <- cor(x,y, method = "spearman");
> v_find_correlation;
petal.length
sepal.length 0.8800297
> v_find_correlation <- cor(x,y, method = "kendal");
> v_find_correlation <- cor(x,y, method = "kendal");
> v_find_correlation;
petal.length
sepal.length 0.7157654

> library(ggplot2)
> plot(v_find_correlation);

?mtcar
No documentation for ‘mtcar’ in specified packages and libraries:
you could try ‘??mtcar’
> mtcars
> v_find_correlation <- cor(mtcars);
> v_find_correlation
> library(corrplot)
> plot(mtcars, method="circle");
> plot(mtcars, method="pie");

> install.packages("PerformanceAnalytics");
> library("PerformanceAnalytics");
> my_data <- mtcars[, c(1,3,4,5,6,7)]
> chart.Correlation(my_data, histogram=TRUE, pch=19);

------------------------------------ regression line ---------------------------------
abline(lm(mpg~wt), col="red") # regression line (y~x)
lines(lowess(wt,mpg), col="blue") # lowess line (x,y)

> install.packages("ggcorrplot");
> v_linear_regression <- lm(iris$Petal.Length ~ iris$Sepal.Length +
iris$Sepal.Width, data = iris);
> v_linear_regression;

Call:
lm(formula = iris$Petal.Length ~ iris$Sepal.Length + iris$Sepal.Width,
data = iris)

Coefficients:
(Intercept) iris$Sepal.Length iris$Sepal.Width
-2.525 1.776 -1.339

> library(ggplot2);
> plot(v_linear_regression);

You might also like