
Commit 7f898c1

z-test, t-test, confidence level

36 files changed: +10138 −0 lines

.gitignore

Lines changed: 2 additions & 0 deletions

packages
ascdata

HLR.r

Lines changed: 129 additions & 0 deletions

#Hierarchical Linear Regression Example
#created by John M. Quick
#http://www.johnmquick.com
#December 13, 2009

> #read data into variable
> datavar <- read.csv("dataset_enrollmentForecast.csv")

> #attach data variable
> attach(datavar)

> #display all data
> datavar
   YEAR  ROLL UNEM HGRAD  INC
1     1  5501  8.1  9552 1923
2     2  5945  7.0  9680 1961
3     3  6629  7.3  9731 1979
4     4  7556  7.5 11666 2030
5     5  8716  7.0 14675 2112
6     6  9369  6.4 15265 2192
7     7  9920  6.5 15484 2235
8     8 10167  6.4 15723 2351
9     9 11084  6.3 16501 2411
10   10 12504  7.7 16890 2475
11   11 13746  8.2 17203 2524
12   12 13656  7.5 17707 2674
13   13 13850  7.4 18108 2833
14   14 14145  8.2 18266 2863
15   15 14888 10.1 19308 2839
16   16 14991  9.2 18224 2898
17   17 14836  7.7 18997 3123
18   18 14478  5.7 19505 3195
19   19 14539  6.5 19800 3239
20   20 14395  7.5 19546 3129
21   21 14599  7.3 19117 3100
22   22 14969  9.2 18774 3008
23   23 15107 10.1 17813 2983
24   24 14831  7.5 17304 3069
25   25 15081  8.8 16756 3151
26   26 15127  9.1 16749 3127
27   27 15856  8.8 16925 3179
28   28 15938  7.8 17231 3207
29   29 16081  7.0 16816 3345

> #create three linear models using lm(FORMULA, DATAVAR)
> #one predictor model
> onePredictorModel <- lm(ROLL ~ UNEM, datavar)
> #two predictor model
> twoPredictorModel <- lm(ROLL ~ UNEM + HGRAD, datavar)
> #three predictor model
> threePredictorModel <- lm(ROLL ~ UNEM + HGRAD + INC, datavar)

> #get summary data for each model using summary(OBJECT)
> summary(onePredictorModel)

Call:
lm(formula = ROLL ~ UNEM, data = datavar)

Residuals:
    Min      1Q  Median      3Q     Max
-7640.0 -1046.5   602.8  1934.3  4187.2

Coefficients:
            Estimate Std. Error t value Pr(>|t|)
(Intercept)   3957.0     4000.1   0.989   0.3313
UNEM          1133.8      513.1   2.210   0.0358 *
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3049 on 27 degrees of freedom
Multiple R-squared: 0.1531, Adjusted R-squared: 0.1218
F-statistic: 4.883 on 1 and 27 DF, p-value: 0.03579

> summary(twoPredictorModel)

Call:
lm(formula = ROLL ~ UNEM + HGRAD, data = datavar)

Residuals:
    Min      1Q  Median      3Q     Max
-2102.2  -861.6  -349.4   374.5  3603.5

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -8.256e+03  2.052e+03  -4.023  0.00044 ***
UNEM         6.983e+02  2.244e+02   3.111  0.00449 **
HGRAD        9.423e-01  8.613e-02  10.941 3.16e-11 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 1313 on 26 degrees of freedom
Multiple R-squared: 0.8489, Adjusted R-squared: 0.8373
F-statistic: 73.03 on 2 and 26 DF, p-value: 2.144e-11

> summary(threePredictorModel)

Call:
lm(formula = ROLL ~ UNEM + HGRAD + INC, data = datavar)

Residuals:
      Min        1Q    Median        3Q       Max
-1148.840  -489.712    -1.876   387.400  1425.753

Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept) -9.153e+03  1.053e+03  -8.691 5.02e-09 ***
UNEM         4.501e+02  1.182e+02   3.809 0.000807 ***
HGRAD        4.065e-01  7.602e-02   5.347 1.52e-05 ***
INC          4.275e+00  4.947e-01   8.642 5.59e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 670.4 on 25 degrees of freedom
Multiple R-squared: 0.9621, Adjusted R-squared: 0.9576
F-statistic: 211.5 on 3 and 25 DF, p-value: < 2.2e-16

> #compare successive models using anova(MODEL1, MODEL2, MODELi)
> anova(onePredictorModel, twoPredictorModel, threePredictorModel)
Analysis of Variance Table

Model 1: ROLL ~ UNEM
Model 2: ROLL ~ UNEM + HGRAD
Model 3: ROLL ~ UNEM + HGRAD + INC
  Res.Df       RSS Df Sum of Sq      F    Pr(>F)
1     27 251084710
2     26  44805568  1 206279143 458.92 < 2.2e-16 ***
3     25  11237313  1  33568255  74.68 5.594e-09 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Notes.md

Lines changed: 211 additions & 0 deletions

## R list and vector. data.frame is really a list
lisp: every scalar is a vector of length one.
http://stackoverflow.com/questions/2050790/how-to-correctly-use-lists-in-r

## factor: categorical data, as opposed to a continuous random variable
http://www.ats.ucla.edu/stat/R/modules/factor_variables.htm
gender <- factor(c('male', 'female'))
smoke <- factor(c('smoke', 'non-smoke'))

## cut a numeric vector into a factor using cut with breaks
incomes <- c(60, 49, 40, 61, 64, 60, 59, 54, 62, 69, 70, 42, 56, 67)
incomes <- factor(cut(incomes, breaks = 35 + 10*(0:7)))

statef <- factor(c('ca', 'il', 'ny', 'wa', 'tx'))
statef <- factor(rep(statef, 6))

table(statef, incomes)  # note: both arguments must have the same length

a <- c(1,2,3,4,5,2,3,4,5,6,7)
f <- cut(a, 3)  # cut evenly into 3 chunks
f <- cut(a, 3, dig.lab = 4, ordered = TRUE)

## Frequency tables from factors
A two-way table is a two-level breakdown of the data: first break down by gender, then by smoke.
tb <- table(breakdown_by_factor_1, breakdown_by_factor_2)
All breakdown arguments must have the same length.

sexsmoke <- matrix(c(70,120,65,140), ncol = 2, byrow = TRUE)
rownames(sexsmoke) <- c("male", "female")
colnames(sexsmoke) <- c("smoke", "nosmoke")
sexsmoke <- as.table(sexsmoke)

## data.frame, defined as a table(column1, column2, ...); a data frame is really a list.
A data frame is a list of columns with the same number of rows and unique row names.

f <- data.frame(col1, col2, ...)
f <- data.frame(colname = factor1, colname = factor2, ...)

To sort a data.frame, sort one column to get an index vector, then index with it:
d <- order(LifeCycleSavings$dpi)
LifeCycleSavings[d, ]

## existing data frame
LifeCycleSavings {datasets}

## summary is used to check the data distribution. We should watch for data clustering within a certain area.
1. summary() returns the five-number summary plus the mean for a numeric vector,
   and returns the frequencies for a categorical vector.
2. var() returns the sample variance,
   sd() the sample standard deviation, and
   cor() the sample correlation coefficient between two vectors:
   cor(Mileage, Weight)

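For instance, with the built-in mtcars data standing in for the Mileage/Weight example (an assumption; those columns come from an external dataset):
summary(mtcars$mpg)          # five-number summary plus the mean
var(mtcars$mpg)              # sample variance
sd(mtcars$mpg)               # sample standard deviation
cor(mtcars$mpg, mtcars$wt)   # sample correlation, about -0.87
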
## Statistics explained: (sample, population, test, expected value, confidence level)
Take N measurements and ask how their average, variance, and the ratio between them are distributed.

Central Limit Theorem (CLT): the mean of random samples tends to follow a normal distribution.
Test and hypothesis: take n random samples of the population; since the sample mean is approximately normally distributed, we can estimate the population from the samples' sigma, SE, etc.

If the observed value is normally distributed, then under the null hypothesis the z-statistic has a standard normal distribution.
So the further the z-statistic is from zero, the less likely such an extreme value is under the standard normal distribution, and the stronger the evidence that the null hypothesis is false.
The P-value of the test is the chance of getting a test statistic as extreme as, or more extreme than, the one we observed (the z-value), if the null hypothesis is true.

n samples, mean, variance, sigma, SE, confidence level:
68% of the population lies within mean +- 1 sd (confidence level 68%);
95% of the population lies within mean +- 2 sd (confidence level 95%).
SE = sigma/sqrt(n).

Ex: U.S. life expectancy has mean 68 with sd (sigma) 5, so 95% of people live from 58 to 78.

z = (observed - expected) / standard_error
z-test = (xbar - expected) / (sd/sqrt(n))
t-test: it is unrealistic to assume the population's sd (sigma) is known, so use the sample's sd instead.

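A worked sketch of the z-test formula, reusing the life-expectancy numbers above (the sample values are assumed for illustration):
xbar <- 68.5; mu0 <- 68; sigma <- 5; n <- 100   # a sample of 100 lifetimes
se <- sigma / sqrt(n)                           # SE = 0.5
z <- (xbar - mu0) / se                          # z = 1
2 * pnorm(-abs(z))                              # two-sided p-value ~0.32: no evidence against mu = 68
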
Ex: 42 out of 100 say yes. Does this support the null hypothesis that half say yes (p = .5)?
prop.test(42, 100, p = .5)

Ex: 420 out of 1000 say yes. Does this support the null hypothesis that half say yes (p = .5)?
prop.test(420, 1000, p = .5)

For binary values (true/false), do multiple tests; each test is a sample, then calculate the mean, sigma, and SE among the tests.

Ex: Honda claims mpg = 25. We take a sample of n = 10 and get average xbar = 22 with s = 1.5.
Null hypothesis H_0: mpg = 25; alternative hypothesis: mpg < 25.
xbar = 22; s = 1.5; n = 10
t = (xbar - 25)/(s/sqrt(n))
t
[1] -6.324555
Use pt to get the probability of a t value this extreme or more:
pt(t, df = n - 1)
[1] 6.846828e-05
This is a small p-value (0.000068), so reject H0 in favor of the alternative.

Ex: I survey a random sample of 1000 adults. They have an average income of $32000 a year, with an SD of $11000 a year. What is the 95% confidence interval for the average annual income of all adults?

Estimated average = $32000
Estimated SE = $11000/sqrt(1000) = $350
95% CI is $32000 +/- 2*$350 = $31300 to $32700

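The same arithmetic in R (using the rounded zstar of 2 for a 95% interval, as above):
xbar <- 32000; s <- 11000; n <- 1000
se <- s / sqrt(n)        # ~348, rounded to $350 above
xbar + c(-2, 2) * se     # ~$31300 to ~$32700
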
Hypothesis tests, P-values, and statistical significance
z-test = (xbar - expected) / (sigma/sqrt(n))
From the z value, find the probability area for that value in the standard normal distribution table.

Ex: null hypothesis: the average of the values in a black box is 50, so the expected value is 50.
We draw 100 tickets from the box: these have mean 45 and SD 10.
The expected sample average is 50.
The standard error of the sample average is estimated as 10/sqrt(100) = 1.
We observed a sample average of 45.
If the average really is 50, then 45 is 5 standard errors below it.
It's unlikely we'd get 5 SEs below the expected value just by chance.
Therefore, we have evidence the box average is not 50.

Ex: toss a coin 100 times and get heads 35 times.
P(35 or fewer heads) = 0.18%
P(35 or fewer tails) = 0.18%
prop.test(35, 100, p = 0.5)
p-value = 0.003732
35 heads is very unlikely to happen by chance (p ~ 0.4%), so reject the null. The coin is biased.

Statistical significance:
A P-value of less than 5% is statistically significant: the difference is unlikely to be explained by chance alone, so reject the null hypothesis.
A P-value of more than 5% is not statistically significant: the difference can be explained by chance. We don't have enough evidence to reject the null hypothesis.

## Confidence level: how close the sample is to the population. z = diff(sample, population) is close to 0 with some confidence: Prob(-1 < (xbar - mu)/SE < 1) = 0.68.
Claim: 42 likes out of 100 surveyed, with a 9% margin of error. This is a 95% CI.
1. The sample mean approximates the true mean, given sample size n, population sd sigma, and the chosen CI.
2. zstar = qnorm(1 - alpha/2), where alpha = 1 - CI.
3. SE = sigma/sqrt(n)  # sigma = sqrt(variance)
4. The interval is xbar + c(-zstar*SE, zstar*SE); see the sketch after this list.

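A minimal sketch of these steps, applied to the 42-out-of-100 claim above (using the proportion SE, since only a proportion is observed):
phat <- 42/100; n <- 100
alpha <- 1 - 0.95
zstar <- qnorm(1 - alpha/2)          # ~1.96
se <- sqrt(phat * (1 - phat) / n)    # SE of a proportion, ~0.049
phat + c(-1, 1) * zstar * se         # margin ~0.097, i.e. the ~9% error above
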
## Chisq test: goodness-of-fit and independence tests for categorical data.

Categorical data: are the variables independent? Is the effect the same with or without a seat belt, i.e., do the rows come from the same distribution?
yes.belt <- c(12813, 647, 359, 42)
no.belt <- c(65963, 4000, 2642, 303)
chisq.test(data.frame(yes.belt, no.belt))

## ANOVA
F-dist: the distribution of the ratio of two variances (the ratio of two chi-square variables). Used in ANOVA.
Variance analysis: hypothesis-test two samples with a t test to get a p-value and decide whether to reject the null hypothesis.
## Hypothesis test with t test and p-value: from the sample's mean/variance and their ratio, compute the probability of getting a more extreme value.

## z-test: the difference between the sample and the true population statistics.
## The z-test assumes the true sigma of the distribution is known, which is not realistic; the t-test uses the sigma estimated from the sample.
## One-sample t test (against a claimed true mean): sample weights, analyze variances.

8 tosses with 6 heads: total outcomes = 2^8 = 256, ways to get 6 heads = C(8,6) = 28, so p = 28/256; not small enough to reject H0.

w <- c(175, 176, 173, 175, 174, 173, 173, 176, 173, 179)
hist(w); boxplot(w); qqnorm(w)  # check the distribution first
t.test(w)  # a small p-value is considered strong evidence against the null hypothesis

t-test of one sample against a claimed mean: calculate xbar and s (the sample sd) from the sample values.
t = (xbar - claimed_mean) / (s/sqrt(n))  # n values in one sample; s is the sample standard deviation; SE = s/sqrt(n)
p-value = pt(t, df = n - 1)
If an estimate and its SE are known from a regression or ANOVA, conduct a one-sided test to see if the estimate is high:
t = (estimate - claimed_or_new_guess) / SE
pt(t, df = residual_df, lower.tail = FALSE)  # residual degrees of freedom from the fit
If the p-value is not small, the new guess has the same effect.

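A sketch of this recipe on the weight sample w from above, against a claimed mean of 174 (the 174 is illustrative, not from the notes):
w <- c(175, 176, 173, 175, 174, 173, 173, 176, 173, 179)
xbar <- mean(w); s <- sd(w); n <- length(w)
t <- (xbar - 174) / (s / sqrt(n))
pt(t, df = n - 1, lower.tail = FALSE)           # one-sided p-value, ~0.14: not small
t.test(w, mu = 174, alternative = "greater")    # the same test, built in
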
Or a proportion test to check bias: test a sample proportion against a claimed value (or one proportion vs. another):
prop.test(42, 100, conf.level = 0.90)
prop.test(42, 100, p = .5)
prop.test(420, 1000, p = .5)

## Two-sample t test (test the null hypothesis of equal means between two groups)
t.test(weight[feed == 'meat'], mu = mean(weight))  # one group against the overall mean
t.test(weight[feed == 'casein'], weight[feed == 'sunflower'])  # two groups against each other

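These lines match the built-in chickwts dataset (an assumption; there its meat group is spelled 'meatmeal'), so a runnable version is:
data(chickwts)
with(chickwts, t.test(weight[feed == "meatmeal"], mu = mean(weight)))
with(chickwts, t.test(weight[feed == "casein"], weight[feed == "sunflower"]))
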
## N-sample t test: one-way ANOVA checks the variance among n groups; H0: equal means among all groups.
oneway.test(values ~ ind, data = stack(data.frame(x, y, z)), var.equal = T)

## chi-square test: goodness-of-fit test: fit a model.
Is the die fair, i.e., does each face have probability 1/6? chisq.test(c(22,21,22,27,22,36), p = c(1,1,1,1,1,1)/6)
Letter distribution: chisq.test(c(100,110,80,55,14), p = c(29, 21, 17, 17, 16)/100)
Test for independence (no correlation between variables): chisq.test(data.frame(yes.belt, no.belt))

## Data mining pipeline: [ discretizers, smoothers, principal component analysis, regression, classifiers ]
1. Model the data, fit a function, predict and estimate.
2. Estimation and variance analysis: calculate means, test hypotheses, etc.
3. Find clusters, frequent items, etc.
4. Principal Component Analysis.
5. Classifiers, machine learning: train an SVM with a GA to classify instances.

## PCA: prcomp(wine, scale = TRUE)
1. PCA: normalize feature values with scale = TRUE; check the feature value distribution with summary() / table().
2. biplot the dominant eigenvalues (principal components) and find the features that align with x (PC1) and y (PC2); see the sketch below.
1. Classifier algorithm: align test data to training data that is already properly classified.
2. Classification by SVM: support vectors divide the component space.

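A minimal runnable sketch of this PCA recipe, with the built-in USArrests data standing in for the wine data (which is not part of this commit):
data(USArrests)
pca <- prcomp(USArrests, scale = TRUE)   # normalize feature values
summary(pca)                             # variance explained by each principal component
biplot(pca)                              # see which features align with PC1 (x) and PC2 (y)
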
## Machine learning: train an SVM, using a GA to search for the best-fit tuning parameters, to generate an SVM classifier.

1. SVM: the input is a vector [feature-val-1, feature-val-2, ...], plus tuning parameters and a training set.
   Desired output classification: [ input vector ] => should produce this output class.
1. A GA is domain-agnostic: the input is a set of strings [attr1, attr2, ...], and the output is graded by how fit it is.
2. Use the GA to search for the best SVM tuning parameters.
3. GA input candidate: a string representing one setting of all the tuning params. GA parameters are chosen empirically, to avoid getting stuck and to speed up convergence.
4. GA fitness function: the mean-square error on the training set for each tuning-parameter set; see the sketch after this list.

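The GA search itself is not implemented in these notes; as a stand-in that illustrates the same idea (score candidate tuning parameters by error, keep the best), e1071's grid search can be used. This sketch assumes the e1071 package is installed and uses iris in place of a real training set:
library(e1071)
tuned <- tune.svm(Species ~ ., data = iris,
                  gamma = 10^(-2:1), cost = 10^(0:2))  # candidates scored by cross-validated error
tuned$best.parameters
model <- svm(Species ~ ., data = iris,
             gamma = tuned$best.parameters$gamma,
             cost = tuned$best.parameters$cost)
table(predict(model, iris), iris$Species)              # training-set confusion table
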
## N-gram distribution
sentence = ['to', 'be', 'or', 'not', 'to', 'be']
Slide a window over the sentence (pseudocode):
for i in 0..len(sentence)-n:
    2-grams: sentence[i..i+2]  # ['to be', 'be or', 'or not', 'not to', 'to be']
    3-grams: sentence[i..i+3]  # ['to be or', 'be or not', 'or not to', 'not to be']

* Convert a sequence of characters into a _set_ of n-grams, represent them in a vector space, and approximate similarity by the cosine distance between vectors (see the R sketch below).
* For 3-grams of letters there is a 26^3-dimensional vector space; for an English vocabulary of 3000 words, dim = 3000^3.
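
A minimal R sketch of word n-grams and cosine similarity (the function names are mine; base R only):
ngrams <- function(words, n) {
  sapply(1:(length(words) - n + 1),
         function(i) paste(words[i:(i + n - 1)], collapse = " "))
}
sentence <- c("to", "be", "or", "not", "to", "be")
ngrams(sentence, 2)   # "to be" "be or" "or not" "not to" "to be"
ngrams(sentence, 3)   # "to be or" "be or not" "or not to" "not to be"

# represent two n-gram sets as count vectors over a shared vocabulary,
# then compare them by cosine similarity
counts <- function(g, vocab) as.numeric(table(factor(g, levels = vocab)))
cosine <- function(a, b) sum(a * b) / (sqrt(sum(a^2)) * sqrt(sum(b^2)))
g1 <- ngrams(sentence, 2)
g2 <- ngrams(c("not", "to", "be"), 2)
vocab <- union(g1, g2)
cosine(counts(g1, vocab), counts(g2, vocab))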

README.md

Lines changed: 9 additions & 0 deletions

# Statistics and data mining with R

This repo contains notes and examples from my learning of statistics, data mining, and machine learning algorithms with R.

Refer to Notes.md for my R learning notes.

## Incanter

We use the Incanter platform along with Clojure for our statistical computing and visualization.

anova.r

Lines changed: 33 additions & 0 deletions

#!/usr/bin/Rscript

# an example of ANOVA among 3 samples:
# 3 groups, each with 7 observations
#
y1 = c(18.2, 20.1, 17.6, 16.8, 18.8, 19.7, 19.1)
y2 = c(17.4, 18.7, 19.1, 16.4, 15.9, 18.4, 17.7)
y3 = c(15.2, 18.8, 17.7, 16.5, 15.9, 17.1, 16.7)

# another way:
# scores = data.frame(x, y, z)
# boxplot(scores)
# scores = stack(scores)
# oneway.test(values ~ ind, data = scores, var.equal = T)

# combine into one big vector
y = c(y1, y2, y3)
cat(y)
n = rep(7, 3)        # repeat 7 three times: 7 7 7
group = rep(1:3, n)  # 1, 2, 3, each repeated 7 times
tmp = tapply(y, group, stem)  # stem-and-leaf display per group; result ~ indicator, can use stack
stem(y)

tmpfn = function(x) {
  c(sum = sum(x), mean = mean(x), var = var(x), n = length(x))
}

tapply(y, group, tmpfn)  # apply the summary function to each group
tmpfn(y)

# create a data frame with 2 cols, y and group
data = data.frame(y = y, group = factor(group))
fit = lm(y ~ group, data)
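
# The file stops after fitting; a natural next step (an assumption, not in
# the commit) is to inspect the fit:
anova(fit)     # the one-way ANOVA table: F test for the group effect
summary(fit)   # coefficient estimates with t tests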
