R包minfi处理DNA甲基化芯片数据-CFANZ编程社区

minfi：Analyze Illumina Infinium DNA methylation arrays

1. 数据读入

if (!require("BiocManager", quietly = TRUE))
  install.packages("BiocManager")

BiocManager::install("minfi",force = TRUE)
BiocManager::install("minfiData")

library(minfi)

library(minfiData)

### 1. 数据读入
# RGsetEx
# ## RGsetEx: RGChannelSet, 622,399 features
# class(RGsetEx) #"RGChannelSet"


# MsetEx <- preprocessRaw(RGsetEx)  
# class(MsetEx)  # "MethylSet"
# ## MsetEx: MethylSet, 485,512 features

# GMsetEx <- mapToGenome(MsetEx)
# class(GMsetEx) #"GenomicMethylSet"
# ## GMsetEx: GenomicMethylSet, 485,512 features

## ----baseDir------------------------------------------------------------------
baseDir <- system.file("extdata", package = "minfiData")
list.files(baseDir)
list.files(file.path(baseDir, "5723646052"))
## 读Illumina methylation sample sheet(csv结尾的文件),含有样本信息以及文件位置
targets <- read.metharray.sheet(baseDir) # targets 为data.frame

# Usage
# read.metharray.sheet(base, pattern = "csv$", ignore.case = TRUE,
#                      recursive = TRUE, verbose = TRUE)

 
# sub(baseDir, "", targets$Basename) # 替换
## 通过样本信息（targets为data.frame）读取甲基化芯片数据idat
RGset <- read.metharray.exp(targets = targets)
class(RGset) # "RGChannelSet"
# 只读取某一文件夹下面的idat文件
RGset2 <- read.metharray.exp(file.path(baseDir, "5723646052"))
# 读取baseDir下所有子文件夹下面的idat文件
RGset3 <- read.metharray.exp(baseDir, recursive = TRUE)

2. 质控

### 2. 质控
## 甲基化β值的豆状密度图。
par(mar=c(5,6,4,2))  # c(bottom, left, top, right)
densityBeanPlot(RGset, sampNames=names, sampGroups=group)
## β值密度曲线
densityPlot(RGset,sampGroups=group)

# 检测失败的位点
detP<- detectionP(RGset, type = "m+u")
failed <- detP>0.01
colMeans(failed) # Fraction of failed positions per sample
sum(rowMeans(failed)>0.5) # How many positions failed in >50% of samples

3.数据类型转化

### 3.数据类型转化
# 产生没有标准的MethylSet类型
Mset <- preprocessRaw(RGset)
# quantile normalization
gmSet <- preprocessQuantile(Mset)

# GMset <- mapToGenome(Mset)
# getMeth(GMset)
# getUnmeth(GMset)

## 对于RGset，Mset，GMset， gmSet对象 都有这些函数
pd <- pData(gmSet) # 表型数据
getBeta(gmSet)
group <- pd$Sample_Group # 样品分组
annotation(gmSet) # 注释信息
getAnnotatio(gmSet)
ids <- rownames(gmSet)  # 探针id名
names <- colnames(gmSet)  # 样品名
beta <- getBeta(gmSet) #β值
#assays(RGset)[[1]]  # assays(RGset)$Green
#assays(RGset)[2] # assays(RGset)$Red

4. 读入TCGA数据和GEO数据

### 4. 读入TCGA数据和GEO数据
data_file <- "/Users/zhengxueming/test/test_READ.methylation450.tsv"
gmSet_tcga <- readTCGA(data_file)
class(gmSet_tcga) #"GenomicRatioSet"
?readGEORawFile
?read.metharray
?read.metharray.exp 
?read.metharray.sheet

5.甲基化差异化分析

### 5.甲基化差异化分析
## 检测差异甲基化位点
# 要用normalization后的gmSet数据
pd <- pData(gmSet)
beta <- getBeta(gmSet)
group <- pd$Sample_Group
dmp <- dmpFinder(beta, pheno = group, type = "categorical")
class(dump) # "data.frame"
# Usage
# dmpFinder(dat, pheno, type = c("categorical", "continuous"),
#           qCutoff = 1, shrinkVar = FALSE)
# Continuous phenotypes are tested with linear regression, 
# while an F-test is used for categorical phenotypes.

head(dmp)
## 选择统计学差异的位点
# sum(dmp$qval < 0.05, na.rm=TRUE)
dmp_sig <- dmp[which(dmp$qval < 0.05),]
head(dmp_sig)

##检测差异甲基化区域
designMatrix <- model.matrix(~ gmSet$status)
#运行时间较长，并行处理
require(doParallel)
registerDoParallel(cores = 2) 
bumps <- bumphunter(gmSet, design = designMatrix, B = 10,
                    type = "Beta", cutoff = 0.25)

class(bumps) #"list"
names(bumps) 
dim(bumps$table)
head(bumps$coef)
length(bumps$coef)
head(bumps$pvaluesMarginal)
bumps$pvaluesMarginal
head(bumps$fitted)

bumps_table <- bumps$table
dim(bumps_table)
head(bumps_table)
summary(bumps_table$fwer)
bumps_table_sig <- bumps_table[which(bumps_table$p.value < 0.01),]
head(bumps_table_sig)
dim(bumps_table_sig)

bumps_table_sig[which(bumps_table_sig$chr=="chr7"),]

# dat <- dummyData()
# # Enable parallelization
# require(doParallel)
# registerDoParallel(cores = 2)
# # Find bumps
# dmrs <- bumphunter(dat$mat, design=dat$design, chr=dat$chr, pos=dat$pos,
#                     cluster=dat$cluster, coef=2, cutoff= 0.28, nullMethod="bootstrap",
#                     smooth=TRUE, B=250, verbose=TRUE,
#                     smoothFunction=loessByCluster)
# head(dmrs$table[,1:4], n =3)

参考：

https://www.bioconductor.org/packages/release/bioc/manuals/minfi/man/minfi.pdf