先了解背景知识:http://www.bio-info-trainee.com/3263.html 并且下载数据。
再探索并且完全理解GDSC数据背景: http://www.bio-info-trainee.com/3266.html
发表该数据集的文章是:Gene expression profiling and correlation with outcome in clinical trials of the proteasome inhibitor bortezomib. Blood 2007 Apr 15;109(8):3177-88. PMID: 17185464 用到了两个芯片平台, Affymetrix 133A/B microarray .
从GEO数据库里面下载有 4 个数据集,用代码提取使用了Affymetrix 133A平台的264个样本即可 :
names(bortezomib_mas5) "GSE9782-GPL96_series_matrix-1.txt.gz" "GSE9782-GPL96_series_matrix-2.txt.gz" "GSE9782-GPL97_series_matrix-1.txt.gz" "GSE9782-GPL97_series_matrix-2.txt.gz"
代码是:
load("../Data/bortezomibData/bortGeo.RData") # loads the geo data "bortezomib_mas5"
exprDataU133a <- cbind(exprs(bortezomib_mas5[[1]]), exprs(bortezomib_mas5[[2]]))
dim(exprDataU133a);exprDataU133a[1:4,1:4]
bortIndex <- c(which(pData(phenoData(bortezomib_mas5[[1]]))[,"characteristics_ch1.1"] == "treatment = PS341"), 255 + which(pData(phenoData(bortezomib_mas5[[2]]))[,"characteristics_ch1.1"] == "treatment = PS341"))
dexIndex <- c(which(pData(phenoData(bortezomib_mas5[[1]]))[,"characteristics_ch1.1"] == "treatment = Dex"), 255 + which(pData(phenoData(bortezomib_mas5[[2]]))[,"characteristics_ch1.1"] == "treatment = Dex"))
studyIndex <- c(as.character(pData(bortezomib_mas5[[1]])[, "characteristics_ch1"]), as.character(pData(bortezomib_mas5[[2]])[, "characteristics_ch1"]))
# bortezomib_mas5 <- getGEO("GSE9782") # uncomment this line to download the data directly from GEO.
# exprDataU133a <- exprs(bortezomib_mas5[[1]])
# bortIndex <- which(pData(phenoData(bortezomib_mas5[[1]]))[,"characteristics_ch1.1"] == "treatment = PS341")
# dexIndex <- which(pData(phenoData(bortezomib_mas5[[1]]))[,"characteristics_ch1.1"] == "treatment = Dex")
表达矩阵截取如下:
> dim(exprDataU133a);exprDataU133a[1:4,1:4] [1] 22283 264 GSM246523 GSM246524 GSM246525 GSM246526 1007_s_at 235.5230 498.2220 309.2070 307.5690 1053_at 41.4470 69.0219 69.3994 36.9310 117_at 84.8689 56.8352 49.4388 54.6669 121_at 530.4490 457.9310 536.1780 325.3630
Cell types | Multiple Myeloma |
---|---|
Study types | Gene expression by array |
Plateform/chip | Affymetrix Human Genome U133 A/B |
Normalization | MAS5 |
Number of samples | 264 |
其中有78个是 DEX(dexamethasone) 处理的,还有188人接受的是bortezomib (PS-341)处理,有其它文章已经重新分析了这个数据好几次了。
需要把探针ID转换为基因ID
代码如下:
library("hgu133a.db") # version 2.8.0
x <- hgu133aSYMBOL
mapped_probes <- mappedkeys(x)
names(mapped_probes) <- as.character(x[mapped_probes])
affy2sym <- as.character(x[mapped_probes])
sym2affy <- affy2sym
names(sym2affy) <- names(affy2sym)
rownames(exprDataU133a) <- sym2affy[rownames(exprDataU133a)]
> head(sym2affy)
1053_at 117_at 121_at 1255_g_at 1316_at 1320_at
"RFC2" "HSPA6" "PAX8" "GUCA1A" "THRA" "PTPN21"
> dim(exprDataU133a);exprDataU133a[1:4,1:4]
[1] 22283 264
GSM246523 GSM246524 GSM246525 GSM246526
<NA> 235.5230 498.2220 309.2070 307.5690
RFC2 41.4470 69.0219 69.3994 36.9310
HSPA6 84.8689 56.8352 49.4388 54.6669
PAX8 530.4490 457.9310 536.1780 325.3630
这时候可以看到有些探针是没办法对应到基因的。
载入GDSC数据库
人工整理好的 sensitivity_data_for_drug_104.csv 文件包含着药物在细胞系的信息。
load(file="../Data/GdscProcData/gdsc_brainarray_syms.RData")
sensBortezomib <- read.csv("../Data/bortezomibData/sensitivity_data_for_drug_104.csv",
as.is=TRUE)
bortic50s <- sensBortezomib$"IC.50"
names(bortic50s) <- sensBortezomib$"Cell.Line.Name"
tissue <- sensBortezomib$"Tissue"
names(tissue) <- sensBortezomib$"Cell.Line.Name"
pData <- read.delim("../Data/GdscPdata/E-MTAB-783.sdrf.txt", as.is=TRUE)
pDataUnique <- pData[pData$Source.Name %in% names(which(table(pData$Source.Name) ==
1)), ]
rownames(pDataUnique) <- pDataUnique$Source.Name
commonCellLines <- rownames(pDataUnique)[rownames(pDataUnique) %in% names(bortic50s)]
pDataUniqueOrd <- pDataUnique[commonCellLines, ]
bortic50sOrd <- bortic50s[commonCellLines]
trainDataOrd <- gdsc_brainarray_syms[, pDataUniqueOrd$"Array.Data.File"]
可以看到的是,虽然GDSC数据库里面关于Bortezomib药物的记录有352个细胞系,去冗余后剩下 280 个。
> head(sensBortezomib);dim(sensBortezomib) Drug.Name Cell.Line.Name Tissue Cancer.Type COSMIC.ID IC.Results.ID IC.50 IC.50.Low IC.50.High Beta 1 Bortezomib BC-3 B cell lymphoma blood 910918 1131350 -5.0189 -6.0704 -2.35070 1.60370 2 Bortezomib BC-1 B cell lymphoma blood 910919 1012645 -6.0692 -6.2088 -5.72720 1.72680 3 Bortezomib A101D malignant_melanoma skin 910921 969219 -8.3978 -8.7201 -8.04290 1.08380 4 Bortezomib SCC-3 B cell lymphoma blood 910930 1158865 -6.0446 -6.6354 -5.34020 2.20430 5 Bortezomib AM-38 glioma CNS 910933 869044 -2.4295 -3.3882 -0.68546 0.96089 6 Bortezomib A4-Fuk malignant_melanoma skin 910934 1131250 -7.3812 -7.7508 -6.65810 2.16070 Timestamp 1 02-MAR-12 2 21-JUL-11 3 08-SEP-10 4 14-OCT-11 5 27-AUG-10 6 24-FEB-12 [1] 352 11 > trainDataOrd[1:4,1:3];dim(trainDataOrd) 5500024034290101707049.A01.CEL 5500024032848101507998.F02.CEL 5500024032848101507998.E03.CEL NAT2 4.682166 4.472247 4.553256 ADA 10.476154 6.531855 8.314500 CDH2 6.668168 6.678154 7.211503 AKT3 7.158402 7.006838 7.592698 [1] 12092 280
预测IC50就一句话
predictedSensitivity133a <- calcPhenotype(exprDataU133a[, bortIndex], trainDataOrd, bortic50sOrd,
selection=1)
其实是被包装到了4个R脚本里面。
scriptsDir <- "../scripts"
source(file.path(scriptsDir, "compute_phenotype_function.R"))
source(file.path(scriptsDir, "summarizeGenesByMean.R"))
source(file.path(scriptsDir, "homogenize_data.R"))
source(file.path(scriptsDir, "do_variable_selection.R"))
然后就可以检验一下预测结果, 在 85个病人和 84个病人组别里面比较预测的IC50值。
resp133a PGx_Responder = IE PGx_Responder = NR PGx_Responder = R 19 84 85
代码是:
resp133a <- c(as.character(pData(bortezomib_mas5[[1]])[, "characteristics_ch1.8"]),
as.character(pData(bortezomib_mas5[[2]])[, "characteristics_ch1.8"]))[bortIndex]
t.test(predictedSensitivity133a[resp133a == "PGx_Responder = NR"],
predictedSensitivity133a[resp133a == "PGx_Responder = R"], alternative="greater")
lTwoa <- list("Responder"=predictedSensitivity133a[resp133a == "PGx_Responder = R"],
"Non-responder"=predictedSensitivity133a[resp133a == "PGx_Responder = NR"])
boxplot(lTwoa, outline=FALSE, border="grey", ylab="Predicted Sensitivity (log(IC50)",
main="(a)")
stripchart(lTwoa, vertical=TRUE, pch=20, method="jitter", add=TRUE)
同样的代码也可以应用到其它数据;
setwd("cisplatinAnalysis/") library("ridge") library("sva") library("car") library("preprocessCore") library("ROCR") source("../scripts/compute_phenotype_function.R") source("../scripts/summarizeGenesByMean.R") source("../scripts/homogenize_data.R") source("../scripts/do_variable_selection.R") # 首先载入两个表达矩阵 load(file="../Data/GdscProcData/gdsc_brainarray_syms.RData") load(file="../Data/cisplatinData/cisplatinBreast.RData") # 然后载入药物处理信息 sensCis <- read.csv("../Data/cisplatinData/sensitivity_data_for_drug_1005.csv", as.is=TRUE) cisic50s <- sensCis$"IC.50" names(cisic50s) <- sensCis$"Cell.Line.Name" pData <- read.delim("../Data/GdscPdata/E-MTAB-783.sdrf.txt", as.is=TRUE) pDataUnique <- pData[pData$Source.Name %in% names(which(table(pData$Source.Name) == 1)), ] rownames(pDataUnique) <- pDataUnique$Source.Name commonCellLines <- rownames(pDataUnique)[rownames(pDataUnique) %in% names(cisic50s)] pDataUniqueOrd <- pDataUnique[commonCellLines, ] cisic50sOrd <- cisic50s[commonCellLines] trainDataOrd <- gdsc_brainarray_syms[, pDataUniqueOrd$"Array.Data.File"] dim(trainDataOrd) dim(cisVivoNorm_syms) # 最后根据已知的497个细胞系的药物处理信息,来推断想预测的24个 样本的 药物反应情况。 newPtype <- calcPhenotype(cisVivoNorm_syms, trainDataOrd, cisic50sOrd, selection=1) ## 绘图可视化预测结果 l <- list(cCR=newPtype[respOrd=="cCR"], cPR=newPtype[respOrd=="cPR"], SD=newPtype[respOrd=="SD"], PD=newPtype[respOrd=="PD"]) boxplot(l, outline=FALSE, border="grey", ylab="Predicted Sensitivity (log(IC50)", main="(a)") stripchart(l, vertical=TRUE, pch=20, method="jitter", add=TRUE) ## 统计检验分类的显著性 respVar <- c(rep(0, length(l[["cCR"]])), rep(1, length(l[["cPR"]])), rep(2, length(l[["SD"]])), rep(3, length(l[["PD"]]))) predResp <- c(l[["cCR"]], l[["cPR"]], l[["SD"]], l[["PD"]]) summary(lm(predResp~respVar)) sapply(l, median) loocvOut <- predictionAccuracyByCv(cisVivoNorm_syms, trainDataOrd, cisic50sOrd) cor.test(loocvOut$cvPtype, loocvOut$realPtype) sessionInfo()