# LECTURE 1
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
# Exercises Section 1: Data pre-processing
# --------------------------------------------------------
library(glmnet)
library(survival)
library(ISLR)
##################################################################
# Exercise 1: effect of scaling
##################################################################
dat = iris[1:100,]
dat$Species = droplevels(dat$Species) # drop unused factor levels
x = dat[,1:4]
y = dat$Species
# we can also apply scaling to the x data directly:
dats = dat
dats[,1:4] = apply(dats[,1:4],2,scale) # standardise column-wise (MARGIN=2 in apply means by column)
xs = apply(x,2,scale)
# scale() is a generic function whose default method centres and scales the columns of a numeric matrix.
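# Quick sanity check of what scale() computes (illustrative, not in the original
# script; z1 and z2 are throwaway names):
z1 = scale(x[,1])                         # scale() on the first column
z2 = (x[,1] - mean(x[,1])) / sd(x[,1])    # manual centring and scaling
all.equal(as.numeric(z1), z2)             # TRUE: same standardisation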
# (1)
pca.unscaled = prcomp(x) # perform PCA on unscaled datasets
pca.scaled = prcomp(x, scale.=TRUE) # perform PCA on scaled data
pca.scaled.2 = prcomp(xs) # should be the same as pca.scaled
par(mfrow=c(2,2))
plot(pca.unscaled) # scree plot (variance of each PC; similar in look to a histogram)
biplot(pca.unscaled) # biplot
plot(pca.scaled) # scree plot
biplot(pca.scaled) # biplot
# now analyse this plot :)
# Interpretation:
# The variables are, in order: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width.
# The first two plots are for the unscaled data: the first component accounts for
# nearly 91% of the variance (see the cumulative proportion).
# The last two plots are for the scaled data: with the effect of scale removed,
# the second component plays a much more important role.
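# To read those proportions off directly (a sketch, not in the original script):
summary(pca.unscaled) # 'Cumulative Proportion' row: PC1 ~ 0.91 for unscaled data
summary(pca.scaled)   # variance is spread more evenly after scaling
pca.unscaled$sdev^2 / sum(pca.unscaled$sdev^2) # proportion of variance per PC, by hand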
# Recombine the data (rows of pca.scaled$x are the observation points)
# and use PCA for data reduction:
new_x=pca.scaled$x[,1:2]
# Keeping only the first two columns separates the data in the biplot (left vs right),
# e.g. for clustering or classification.
# With only PC1 and PC2 kept, the information is retained more evenly across two dimensions.
# It depends on the goal: for classification one could analyse PC1 alone and get a
# stronger signal, at the cost of losing detail from the other components.
# In the last biplot, note observations 82 and 74, crossed by two arrows in the same quadrant.
# plot the data on its first 2 dimensions in each space:
par(mfrow=c(1,3))
plot(x[,1:2], pch=20, col=y, main="Data in original space")
# col=y is equivalent to col=c(1,2)[y]
biplot(pca.unscaled, main="Data in PCA space")
abline(v=0, col='orange')
# see the binary separation in orange along PC1?
# re-use this into biplot in original space:
pca.cols = c("blue","orange")[1+as.numeric(pca.unscaled$x[,1]>0)]
plot(x[,1:2], pch=20, col=pca.cols,
main="Data in original space\n colour-coded using PC1-split")
# (2)
logreg.unscaled = glm(Species~., data=dat) # make this work
# ERROR: the family must be specified (binomial for logistic regression):
logreg.unscaled = glm(Species~., data=dat, family='binomial')
logreg.scaled = glm(Species~., data=dats, family='binomial')
# discuss...
# (... are the fits different?)
cbind(coef(logreg.unscaled), coef(logreg.scaled))
# First column: unscaled coefficients; second column: scaled coefficients.
# Petal.Length and Petal.Width change substantially.
# Their correlation is very high, so the two variables are nearly interchangeable.
# The two Sepal coefficients decrease by roughly 40%.
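# To verify the claimed collinearity (illustrative check):
cor(dat$Petal.Length, dat$Petal.Width) # correlation between the two petal measurements
cor(x)                                 # full correlation matrix of the 4 covariates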
# (... does this align with the PCA analysis?)
# (yes, we see a change in the role of each variable in the information space)
# (... and why?)
# (both are linear techniques)
# (3)
x.m = model.matrix(Species~.+0, data=dat)
lasso.cv = cv.glmnet(x.m, y, family="binomial")
lasso.unscaled=glmnet(x.m,y,family="binomial",lambda=lasso.cv$lambda.min)
lasso.pred = predict(lasso.unscaled, newx=x.m, type="class")
# now the same lasso pipeline on the scaled data:
xs.m = model.matrix(Species~.+0, data=dats)
lasso.cv = cv.glmnet(xs.m, y, family="binomial")
lasso.scaled=glmnet(xs.m,y,family="binomial",lambda=lasso.cv$lambda.min)
lasso.s.pred = predict(lasso.scaled, newx=xs.m, type="class")
#
cbind(coef(lasso.unscaled), coef(lasso.scaled))
# The intercepts are very different: with all covariates standardised, the
# intercept is essentially no longer needed.
# Sepal.Length is dropped (coefficient set to zero) in both models.
# Lasso and ridge regression both regularise the linear model.
# The lasso can shrink coefficients exactly to zero, so only a subset of the
# covariates enters the final model; ridge regression shrinks coefficients but
# rarely zeroes them, so it does not perform variable selection.
# Drawback: the lasso does not handle correlated covariates well and may simply
# drop some of them; Sepal.Length being removed need not mean it carries little information.
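# For contrast, a quick ridge fit (alpha=0) keeps all covariates
# (a sketch, not part of the original exercise; ridge.cv/ridge.fit are new names):
ridge.cv = cv.glmnet(xs.m, y, family="binomial", alpha=0)
ridge.fit = glmnet(xs.m, y, family="binomial", alpha=0, lambda=ridge.cv$lambda.min)
cbind(coef(lasso.scaled), coef(ridge.fit)) # ridge shrinks but does not zero out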
table(lasso.pred, lasso.s.pred) # meh
##################################################################
# Exercise 2: data imputation
##################################################################
data(cancer, package="survival")
summary(lung)
# There are many missing values:
# na.omit() would keep only 167 of the 228 observations (167/228 = 0.73),
# i.e. about 27% of the data would be lost along with the missing values.
# To avoid this we fill in the blanks, i.e. impute the missing values
# ("data imputation"), e.g. replacing them with the mean or median of the
# observed values of that variable.
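# To check the loss from complete-case analysis (illustrative):
nrow(lung); nrow(na.omit(lung)) # 228 rows vs 167 complete cases
mean(complete.cases(lung))      # ~0.73 of the rows are complete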
# Notes (from the board):
# Data imputation:
# - Missing at random:
#   - missingness is not related to the data generation process
#   - 'missing completely at random' or 'missing at random'
#   - can be ignored; OK to impute
# - Missing not at random:
#   - there is a relationship between the value of the variable and its missingness
#   - examples: saturated sensor, threshold detection
#   - must not be ignored
#   - must be modelled
# Some tests:
# - follow-on study (e.g. data from a survey)
# - chi-squared test of the distribution (looking for a uniform distribution)
# - field expertise / expert knowledge
# Some techniques:
# - mean/median imputation; multiple imputation (a randomised process)
# - interpolation
# - deletion (na.omit())
# - forecasting (using the other variables in the dataset as predictors;
#   see the sketch below)
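# A minimal sketch of the 'forecasting' approach: fit a model for meal.cal on the
# complete rows and predict the missing entries. (Illustrative only; the predictors
# and the names ifc/fit.fc/lung.fc are arbitrary choices, not the course solution.)
ifc = is.na(lung$meal.cal)
fit.fc = lm(meal.cal ~ age + sex, data=lung) # lm() drops rows with missing meal.cal
lung.fc = lung
lung.fc$meal.cal[ifc] = predict(fit.fc, newdata=lung[ifc,])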
boxplot(lung$meal.cal~lung$sex, col=c("cyan","pink"))
# can you think of other ways of analysing this?
# (1) lung cancer data: compare meal.cal values between male and female cohorts,
# and discuss w.r.t. gender-specific data imputation
# NB: "missing at random" vs "missing not at random"??
nas = is.na(lung$meal.cal) # track missing values (TRUE if missing, FALSE otherwise)
table(nas, lung$sex)
# 24 missing values for sex==1 and 23 for sex==2.
imales = which(lung$sex==1)
m.all = mean(lung$meal.cal, na.rm=TRUE)
m.males = mean(lung$meal.cal[imales], na.rm=TRUE)
m.females = mean(lung$meal.cal[-imales], na.rm=TRUE)
t.test(lung$meal.cal[imales], lung$meal.cal[-imales])
# significant difference, hence must use different imputation
# values for each gender
# The t-test compares the meal.cal data between the two sexes.
# p = 0.01989 indicates a significant difference: sex has a significant effect
# on meal.cal, so we must use a different imputation value for each sex.
# Cox proportional hazards model:
# (2) Run Cox PHMs on the original, overall-imputed and gender-specific-imputed
# datasets, using the cohort sample mean for data imputation. Compare and discuss
# the model fitting output.
dat1 = dat2 = lung
# impute overall mean in dat1:
dat1$meal.cal[nas] = m.all
# (nas flags the missing entries; they all receive the overall mean in dat1)
# impute gender-specific mean in dat2:
dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==1))] = m.males
dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==2))] = m.females
# dat2: missing meal.cal values receive the mean of the corresponding sex.
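# Equivalent one-liner using ave() (alternative sketch, same result; dat3 is a new name):
dat3 = lung
dat3$meal.cal = ave(lung$meal.cal, lung$sex,
                    FUN=function(v) replace(v, is.na(v), mean(v, na.rm=TRUE)))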
cox0 = coxph(Surv(time,status)~.,data=lung) # original
cox1 = coxph(Surv(time,status)~.,data=dat1) # overall imputed
cox2 = coxph(Surv(time,status)~.,data=dat2) # gender-specific imputed
summary(cox0)
summary(cox1)
summary(cox2)
round(cbind(coef(cox0),coef(cox1),coef(cox2)),3)
# - dat1 and dat2 yield increased sample size (from 167 to 209, both imputed datasets having 209 observations)
# - overall coefficient effects comparable between the 2 sets
# - marginal differences in covariate effect and significance between lung and {dat1;dat2}
# - no substantial difference between dat1 and dat2 outputs
##################################################################
# Exercise 3: data imputation
##################################################################
library(ISLR)
dat = Hitters
# (1) (Deletion)
# Method 1: deletion (na.omit() removes all rows containing missing values)
sdat = na.omit(dat) # delete incomplete rows first
sx = model.matrix(Salary~.+0,data=sdat)
sy = sdat$Salary
cv.l = cv.glmnet(sx,sy)
slo = glmnet(sx,sy,lambda=cv.l$lambda.min) # fit a lasso model
# model.matrix() creates a design (model) matrix, e.g. by expanding factors into
# sets of dummy variables (depending on the contrasts) and expanding interactions.
# Here a factor like League (A or N) becomes dummy variables LeagueA and LeagueN
# with values 0 or 1.
# The '.' stands for all remaining variables; '+0' removes the intercept.
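# Toy illustration of the factor expansion (illustrative data, not from Hitters):
toy = data.frame(y=c(1,2,3), League=factor(c("A","N","A")))
model.matrix(y ~ . + 0, data=toy) # columns LeagueA and LeagueN, coded 0/1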
# cv.glmnet() runs k-fold cross-validation for glmnet, produces a plot, and
# returns values for lambda (and for gamma if relax=TRUE).
# glmnet() fits a GLM with lasso or elastic-net regularisation, via penalised
# maximum likelihood; the regularisation path is computed on a grid of values of
# the regularisation parameter lambda. It can handle various data shapes,
# including very large sparse matrices, and fits linear, logistic, multinomial,
# Poisson and Cox regression models.
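# To inspect the CV curve and the selected penalty (a sketch, not in the original script):
plot(cv.l)                          # CV error as a function of log(lambda)
c(cv.l$lambda.min, cv.l$lambda.1se) # minimiser and 1-SE-rule value
coef(slo)                           # coefficients retained at lambda.min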
# (2) Simple imputation (of Y) using overall mean
# Method 2: simple imputation of the missing values by the overall mean.
ina = which(is.na(dat$Salary))
dat$Salary[ina] = mean(dat$Salary[-ina])
x = model.matrix(Salary~.+0,data=dat)
y = dat$Salary
cv.l = cv.glmnet(x,y)
lo = glmnet(x,y,lambda=cv.l$lambda.min)
# glmnet() requires the model.matrix form of x; the missing y values are imputed
# with the mean before running cv.glmnet() and glmnet().
# (3)
# Compare and discuss on model fitting outputs.
# Can you think of an alternative approach to imputing the dependent variable?
slop = predict(slo,newx=sx)
lop = predict(lo,newx=x)
sqrt(mean((slop-sy)^2))
sqrt(mean((lop-y)^2))
plot(slop,lop[-ina])
abline(a=0,b=1) # draw the line y = x (abline parameterises y = a + b*x)
abline(lm(lop[-ina]~slop), col='navy')
# What could we do instead of imputing the Y?
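# One alternative (a sketch, not the official solution): rather than mean-imputing
# the response, predict the missing salaries from the covariates (regression
# imputation), since imputing Y by its mean pulls the fit towards the mean and
# deflates the variance of Y. The names dat.alt/ina.alt/fit.y are illustrative.
dat.alt = Hitters
ina.alt = which(is.na(dat.alt$Salary))
fit.y = lm(Salary ~ ., data=dat.alt) # fitted on complete cases only
dat.alt$Salary[ina.alt] = predict(fit.y, newdata=dat.alt[ina.alt,])
# (or simply leave those rows out, as in (1))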
##################################################################
# Exercise 4: resampling
##################################################################
# You don't need help for this one!
##################################################################
# Exercise 5: resampling (CV vs bootstrapping)
##################################################################
# Implement this simple analysis and discuss - think about
# (sub)sample sizes!
x = trees$Girth # sorted in increasing order...
y = trees$Height
plot(x, y, pch=20)
summary(lm(y~x))
N = nrow(trees)
# (1) 10-fold CV on original dataset
set.seed(4060)
K = 10
cc = numeric(K)
folds = cut(1:N, K, labels=FALSE)
for(k in 1:K){
i = which(folds==k)
# train:
lmo = lm(y[-i]~x[-i])
cc[k] = summary(lmo)$coef[2,2]
# (NB: no testing here, so not the conventional use of CV)
}
mean(cc)
# coef[2,2] is the estimated std. error of the slope for x
# (2) 10-fold CV on randomized dataset
set.seed(1)
mix = sample(1:nrow(trees), replace=FALSE)
xr = trees$Girth[mix]
yr = trees$Height[mix]
set.seed(4060)
K = 10
ccr = numeric(K)
folds = cut(1:N, K, labels=FALSE)
for(k in 1:K){
i = which(folds==k)
lmo = lm(yr[-i]~xr[-i])
ccr[k] = summary(lmo)$coef[2,2]
}
mean(ccr)
sd(ccr)
sd(cc)
boxplot(cc,ccr)
t.test(cc,ccr)
var.test(cc,ccr)
# compare the randomised and non-randomised fold allocations
# (3) Bootstrapping (additional note)
set.seed(4060)
K = 100
cb = numeric(K)
for(i in 1:K){
# bootstrapping
ib = sample(1:N,N,replace=TRUE)
lmb = lm(y[ib]~x[ib])
cb[i] = summary(lmb)$coef[2,2]
}
mean(cb)
dev.new()
par(font=2, font.axis=2, font.lab=2)
boxplot(cbind(cc,ccr,cb), names=c("CV","CVr","Bootstrap"))
abline(h=1.0544) # full-data slope estimate, from summary(lm(y~x))
t.test(cc,cb)
# Explain why these are different?
round(cc, 3)