# LECTURE 1
# --------------------------------------------------------
# ST4061 / ST6041
# 2021-2022
# Eric Wolsztynski
# ...
# Exercises Section 1: Data pre-processing
# --------------------------------------------------------
library(glmnet)
library(survival)
library(ISLR)
##################################################################
# Exercise 1: effect of scaling
##################################################################
dat = iris[1:100,]
dat$Species = droplevels(dat$Species) # drop unused factor levels
x = dat[,1:4]
y = dat$Species
# we can also apply scaling to the x data directly:
dats = dat
dats[,1:4] = apply(dats[,1:4],2,scale) # standardise column-wise (MARGIN=2 in apply means by column)
xs = apply(x,2,scale)
# scale() is a generic function whose default method centres and scales the columns of a numeric matrix.
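# Quick sanity check of what scale() computes (illustrative, not in the original
# script; z1 and z2 are throwaway names):
z1 = scale(x[,1])                         # scale() on the first column
z2 = (x[,1] - mean(x[,1])) / sd(x[,1])    # manual centring and scaling
all.equal(as.numeric(z1), z2)             # TRUE: same standardisation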
# (1)
pca.unscaled = prcomp(x) # perform PCA on unscaled datasets
pca.scaled = prcomp(x, scale.=TRUE) # perform PCA on scaled data
pca.scaled.2 = prcomp(xs) # should be the same as pca.scaled
par(mfrow=c(2,2))
plot(pca.unscaled) # scree plot (variance of each PC; similar in look to a histogram)
biplot(pca.unscaled) # biplot
plot(pca.scaled) # scree plot
biplot(pca.scaled) # biplot
# now analyse this plot :)
# Interpretation:
# The variables are, in order: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width.
# The first two plots are for the unscaled data: the first component accounts for
# nearly 91% of the variance (see the cumulative proportion).
# The last two plots are for the scaled data: with the effect of scale removed,
# the second component plays a much more important role.
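# To read those proportions off directly (a sketch, not in the original script):
summary(pca.unscaled) # 'Cumulative Proportion' row: PC1 ~ 0.91 for unscaled data
summary(pca.scaled)   # variance is spread more evenly after scaling
pca.unscaled$sdev^2 / sum(pca.unscaled$sdev^2) # proportion of variance per PC, by hand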
# Recombine the data (rows of pca.scaled$x are the observation points)
# and use PCA for data reduction:
new_x=pca.scaled$x[,1:2]
# Keeping only the first two columns separates the data in the biplot (left vs right),
# e.g. for clustering or classification.
# With only PC1 and PC2 kept, the information is retained more evenly across two dimensions.
# It depends on the goal: for classification one could analyse PC1 alone and get a
# stronger signal, at the cost of losing detail from the other components.
# In the last biplot, note observations 82 and 74, crossed by two arrows in the same quadrant.
# plot the data on its first 2 dimensions in each space:
par(mfrow=c(1,3))
plot(x[,1:2], pch=20, col=y, main="Data in original space")
# col=y is equivalent to col=c(1,2)[y]
biplot(pca.unscaled, main="Data in PCA space")
abline(v=0, col='orange')
# see the binary separation in orange along PC1?
# re-use this into biplot in original space:
pca.cols = c("blue","orange")[1+as.numeric(pca.unscaled$x[,1]>0)]
plot(x[,1:2], pch=20, col=pca.cols,
main="Data in original space\n colour-coded using PC1-split")
# (2)
logreg.unscaled = glm(Species~., data=dat) # make this work
# ERROR: the family must be specified (binomial for logistic regression):
logreg.unscaled = glm(Species~., data=dat, family='binomial')
logreg.scaled = glm(Species~., data=dats, family='binomial')
# discuss...
# (... are the fits different?)
cbind(coef(logreg.unscaled), coef(logreg.scaled))
# First column: unscaled coefficients; second column: scaled coefficients.
# Petal.Length and Petal.Width change substantially.
# Their correlation is very high, so the two variables are nearly interchangeable.
# The two Sepal coefficients decrease by roughly 40%.
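# To verify the claimed collinearity (illustrative check):
cor(dat$Petal.Length, dat$Petal.Width) # correlation between the two petal measurements
cor(x)                                 # full correlation matrix of the 4 covariates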
# (... does this align with the PCA analysis?)
# (yes, we see a change in the role of each variable in the information space)
# (... and why?)
# (both are linear techniques)
# (3)
x.m = model.matrix(Species~.+0, data=dat)
lasso.cv = cv.glmnet(x.m, y, family="binomial")
lasso.unscaled=glmnet(x.m,y,family="binomial",lambda=lasso.cv$lambda.min)
lasso.pred = predict(lasso.unscaled, newx=x.m, type="class")
# now the same lasso pipeline on the scaled data:
xs.m = model.matrix(Species~.+0, data=dats)
lasso.cv = cv.glmnet(xs.m, y, family="binomial")
lasso.scaled=glmnet(xs.m,y,family="binomial",lambda=lasso.cv$lambda.min)
lasso.s.pred = predict(lasso.scaled, newx=xs.m, type="class")
#
cbind(coef(lasso.unscaled), coef(lasso.scaled))
# The intercepts are very different: with all covariates standardised, the
# intercept is essentially no longer needed.
# Sepal.Length is dropped (coefficient set to zero) in both models.
# Lasso and ridge regression both regularise the linear model.
# The lasso can shrink coefficients exactly to zero, so only a subset of the
# covariates enters the final model; ridge regression shrinks coefficients but
# rarely zeroes them, so it does not perform variable selection.
# Drawback: the lasso does not handle correlated covariates well and may simply
# drop some of them; Sepal.Length being removed need not mean it carries little information.
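# For contrast, a quick ridge fit (alpha=0) keeps all covariates
# (a sketch, not part of the original exercise; ridge.cv/ridge.fit are new names):
ridge.cv = cv.glmnet(xs.m, y, family="binomial", alpha=0)
ridge.fit = glmnet(xs.m, y, family="binomial", alpha=0, lambda=ridge.cv$lambda.min)
cbind(coef(lasso.scaled), coef(ridge.fit)) # ridge shrinks but does not zero out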
table(lasso.pred, lasso.s.pred) # meh
##################################################################
# Exercise 2: data imputation
##################################################################
data(cancer, package="survival")
summary(lung)
# There are many missing values:
# na.omit() would keep only 167 of the 228 observations (167/228 = 0.73),
# i.e. about 27% of the data would be lost along with the missing values.
# To avoid this we fill in the blanks, i.e. impute the missing values
# ("data imputation"), e.g. replacing them with the mean or median of the
# observed values of that variable.
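# To check the loss from complete-case analysis (illustrative):
nrow(lung); nrow(na.omit(lung)) # 228 rows vs 167 complete cases
mean(complete.cases(lung))      # ~0.73 of the rows are complete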
# Notes (from the board):
# Data imputation:
# - Missing at random:
#   - missingness is not related to the data generation process
#   - 'missing completely at random' or 'missing at random'
#   - can be ignored; OK to impute
# - Missing not at random:
#   - there is a relationship between the value of the variable and its missingness
#   - examples: saturated sensor, threshold detection
#   - must not be ignored
#   - must be modelled
# Some tests:
# - follow-on study (e.g. data from a survey)
# - chi-squared test of the distribution (looking for a uniform distribution)
# - field expertise / expert knowledge
# Some techniques:
# - mean/median imputation; multiple imputation (a randomised process)
# - interpolation
# - deletion (na.omit())
# - forecasting (using the other variables in the dataset as predictors;
#   see the sketch below)
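# A minimal sketch of the 'forecasting' approach: fit a model for meal.cal on the
# complete rows and predict the missing entries. (Illustrative only; the predictors
# and the names ifc/fit.fc/lung.fc are arbitrary choices, not the course solution.)
ifc = is.na(lung$meal.cal)
fit.fc = lm(meal.cal ~ age + sex, data=lung) # lm() drops rows with missing meal.cal
lung.fc = lung
lung.fc$meal.cal[ifc] = predict(fit.fc, newdata=lung[ifc,])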
boxplot(lung$meal.cal~lung$sex, col=c("cyan","pink"))
# can you think of other ways of analysing this?
# (1) lung cancer data: compare meal.cal values between male and female cohorts,
# and discuss w.r.t. gender-specific data imputation
# NB: "missing at random" vs "missing not at random"??
nas = is.na(lung$meal.cal) # track missing values (TRUE if missing, FALSE otherwise)
table(nas, lung$sex)
# 24 missing values for sex==1 and 23 for sex==2.
imales = which(lung$sex==1)
m.all = mean(lung$meal.cal, na.rm=TRUE)
m.males = mean(lung$meal.cal[imales], na.rm=TRUE)
m.females = mean(lung$meal.cal[-imales], na.rm=TRUE)
t.test(lung$meal.cal[imales], lung$meal.cal[-imales])
# significant difference, hence must use different imputation
# values for each gender
# The t-test compares the meal.cal data between the two sexes.
# p = 0.01989 indicates a significant difference: sex has a significant effect
# on meal.cal, so we must use a different imputation value for each sex.
# Cox proportional hazards model:
# (2) Run Cox PHMs on the original, overall-imputed and gender-specific-imputed
# datasets, using the cohort sample mean for data imputation. Compare and discuss
# the model fitting output.
dat1 = dat2 = lung
# impute overall mean in dat1:
dat1$meal.cal[nas] = m.all
# (nas flags the missing entries; they all receive the overall mean in dat1)
# impute gender-specific mean in dat2:
dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==1))] = m.males
dat2$meal.cal[(is.na(lung$meal.cal) & (lung$sex==2))] = m.females
# dat2: missing meal.cal values receive the mean of the corresponding sex.
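# Equivalent one-liner using ave() (alternative sketch, same result; dat3 is a new name):
dat3 = lung
dat3$meal.cal = ave(lung$meal.cal, lung$sex,
                    FUN=function(v) replace(v, is.na(v), mean(v, na.rm=TRUE)))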
cox0 = coxph(Surv(time,status)~.,data=lung) # original
cox1 = coxph(Surv(time,status)~.,data=dat1) # overall imputed
cox2 = coxph(Surv(time,status)~.,data=dat2) # gender-specific imputed
summary(cox0)
summary(cox1)
summary(cox2)
round(cbind(coef(cox0),coef(cox1),coef(cox2)),3)
# - dat1 and dat2 yield increased sample size (from 167 to 209, both imputed datasets having 209 observations)
# - overall coefficient effects comparable between the 2 sets
# - marginal differences in covariate effect and significance between lung and {dat1;dat2}
# - no substantial difference between dat1 and dat2 outputs
##################################################################
# Exercise 3: data imputation
##################################################################
library(ISLR)
dat = Hitters
# (1) (Deletion)
# Method 1: deletion (na.omit() removes all rows containing missing values)
sdat = na.omit(dat) # delete incomplete rows first
sx = model.matrix(Salary~.+0,data=sdat)
sy = sdat$Salary
cv.l = cv.glmnet(sx,sy)
slo = glmnet(sx,sy,lambda=cv.l$lambda.min) # fit a lasso model
# model.matrix() creates a design (model) matrix, e.g. by expanding factors into
# sets of dummy variables (depending on the contrasts) and expanding interactions.
# Here a factor like League (A or N) becomes dummy variables LeagueA and LeagueN
# with values 0 or 1.
# The '.' stands for all remaining variables; '+0' removes the intercept.
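# Toy illustration of the factor expansion (illustrative data, not from Hitters):
toy = data.frame(y=c(1,2,3), League=factor(c("A","N","A")))
model.matrix(y ~ . + 0, data=toy) # columns LeagueA and LeagueN, coded 0/1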
# cv.glmnet() runs k-fold cross-validation for glmnet, produces a plot, and
# returns values for lambda (and for gamma if relax=TRUE).
# glmnet() fits a GLM with lasso or elastic-net regularisation, via penalised
# maximum likelihood; the regularisation path is computed on a grid of values of
# the regularisation parameter lambda. It can handle various data shapes,
# including very large sparse matrices, and fits linear, logistic, multinomial,
# Poisson and Cox regression models.
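# To inspect the CV curve and the selected penalty (a sketch, not in the original script):
plot(cv.l)                          # CV error as a function of log(lambda)
c(cv.l$lambda.min, cv.l$lambda.1se) # minimiser and 1-SE-rule value
coef(slo)                           # coefficients retained at lambda.min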
# (2) Simple imputation (of Y) using overall mean
# Method 2: simple imputation of the missing values by the overall mean.
ina = which(is.na(dat$Salary))
dat$Salary[ina] = mean(dat$Salary[-ina])
x = model.matrix(Salary~.+0,data=dat)
y = dat$Salary
cv.l = cv.glmnet(x,y)
lo = glmnet(x,y,lambda=cv.l$lambda.min)
# glmnet() requires the model.matrix form of x; the missing y values are imputed
# with the mean before running cv.glmnet() and glmnet().
# (3)
# Compare and discuss on model fitting outputs.
# Can you think of an alternative approach to imputing the dependent variable?
slop = predict(slo,newx=sx)
lop = predict(lo,newx=x)
sqrt(mean((slop-sy)^2))
sqrt(mean((lop-y)^2))
plot(slop,lop[-ina])
abline(a=0,b=1) # draw the line y = x (abline parameterises y = a + b*x)
abline(lm(lop[-ina]~slop), col='navy')
# What could we do instead of imputing the Y?
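# One alternative (a sketch, not the official solution): rather than mean-imputing
# the response, predict the missing salaries from the covariates (regression
# imputation), since imputing Y by its mean pulls the fit towards the mean and
# deflates the variance of Y. The names dat.alt/ina.alt/fit.y are illustrative.
dat.alt = Hitters
ina.alt = which(is.na(dat.alt$Salary))
fit.y = lm(Salary ~ ., data=dat.alt) # fitted on complete cases only
dat.alt$Salary[ina.alt] = predict(fit.y, newdata=dat.alt[ina.alt,])
# (or simply leave those rows out, as in (1))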
##################################################################
# Exercise 4: resampling
##################################################################
# You don't need help for this one!
##################################################################
# Exercise 5: resampling (CV vs bootstrapping)
##################################################################
# Implement this simple analysis and discuss - think about
# (sub)sample sizes!
x = trees$Girth # sorted in increasing order...
y = trees$Height
plot(x, y, pch=20)
summary(lm(y~x))
N = nrow(trees)
# (1) 10-fold CV on original dataset
set.seed(4060)
K = 10
cc = numeric(K)
folds = cut(1:N, K, labels=FALSE)
for(k in 1:K){
i = which(folds==k)
# train:
lmo = lm(y[-i]~x[-i])
cc[k] = summary(lmo)$coef[2,2]
# (NB: no testing here, so not the conventional use of CV)
}
mean(cc)
# coef[2,2] is the estimated std. error of the slope for x
# (2) 10-fold CV on randomized dataset
set.seed(1)
mix = sample(1:nrow(trees), replace=FALSE)
xr = trees$Girth[mix]
yr = trees$Height[mix]
set.seed(4060)
K = 10
ccr = numeric(K)
folds = cut(1:N, K, labels=FALSE)
for(k in 1:K){
i = which(folds==k)
lmo = lm(yr[-i]~xr[-i])
ccr[k] = summary(lmo)$coef[2,2]
}
mean(ccr)
sd(ccr)
sd(cc)
boxplot(cc,ccr)
t.test(cc,ccr)
var.test(cc,ccr)
# compare the randomised and non-randomised fold allocations
# (3) Bootstrapping (additional note)
set.seed(4060)
K = 100
cb = numeric(K)
for(i in 1:K){
# bootstrapping
ib = sample(1:N,N,replace=TRUE)
lmb = lm(y[ib]~x[ib])
cb[i] = summary(lmb)$coef[2,2]
}
mean(cb)
dev.new()
par(font=2, font.axis=2, font.lab=2)
boxplot(cbind(cc,ccr,cb), names=c("CV","CVr","Bootstrap"))
abline(h=1.0544) # full-data slope estimate, from summary(lm(y~x))
t.test(cc,cb)
# Explain why these are different?
round(cc, 3)