把GDSC数据应用到GSE9782数据集

先了解背景知识:http://www.bio-info-trainee.com/3263.html 并且下载数据。

再探索并且完全理解GDSC数据背景: http://www.bio-info-trainee.com/3266.html

了解GSE9782数据集

发表该数据集的文章是:Gene expression profiling and correlation with outcome in clinical trials of the proteasome inhibitor bortezomib. Blood 2007 Apr 15;109(8):3177-88. PMID: 17185464 用到了两个芯片平台, Affymetrix 133A/B microarray .

从GEO数据库里面下载有 4 个数据集,用代码提取使用了Affymetrix 133A平台的264个样本即可 :

names(bortezomib_mas5)
"GSE9782-GPL96_series_matrix-1.txt.gz" 
"GSE9782-GPL96_series_matrix-2.txt.gz" 
"GSE9782-GPL97_series_matrix-1.txt.gz"
"GSE9782-GPL97_series_matrix-2.txt.gz"

代码是:

load("../Data/bortezomibData/bortGeo.RData") # loads the geo data "bortezomib_mas5"exprDataU133a <- cbind(exprs(bortezomib_mas5[[1]]), exprs(bortezomib_mas5[[2]]))
dim(exprDataU133a);exprDataU133a[1:4,1:4]
bortIndex <- c(which(pData(phenoData(bortezomib_mas5[[1]]))[,"characteristics_ch1.1"] == "treatment = PS341"), 255 + which(pData(phenoData(bortezomib_mas5[[2]]))[,"characteristics_ch1.1"] == "treatment = PS341"))
dexIndex <- c(which(pData(phenoData(bortezomib_mas5[[1]]))[,"characteristics_ch1.1"] == "treatment = Dex"), 255 + which(pData(phenoData(bortezomib_mas5[[2]]))[,"characteristics_ch1.1"] == "treatment = Dex"))
studyIndex <- c(as.character(pData(bortezomib_mas5[[1]])[, "characteristics_ch1"]), as.character(pData(bortezomib_mas5[[2]])[, "characteristics_ch1"]))
​
# bortezomib_mas5 <- getGEO("GSE9782") # uncomment this line to download the data directly from GEO.
# exprDataU133a <- exprs(bortezomib_mas5[[1]])
# bortIndex <- which(pData(phenoData(bortezomib_mas5[[1]]))[,"characteristics_ch1.1"] == "treatment = PS341")
# dexIndex <- which(pData(phenoData(bortezomib_mas5[[1]]))[,"characteristics_ch1.1"] == "treatment = Dex")

表达矩阵截取如下:

> dim(exprDataU133a);exprDataU133a[1:4,1:4]
[1] 22283   264
          GSM246523 GSM246524 GSM246525 GSM246526
1007_s_at  235.5230  498.2220  309.2070  307.5690
1053_at     41.4470   69.0219   69.3994   36.9310
117_at      84.8689   56.8352   49.4388   54.6669
121_at     530.4490  457.9310  536.1780  325.3630
Cell types Multiple Myeloma
Study types Gene expression by array
Plateform/chip Affymetrix Human Genome U133 A/B
Normalization MAS5
Number of samples 264

其中有78个是 DEX(dexamethasone) 处理的,还有188人接受的是bortezomib (PS-341)处理,有其它文章已经重新分析了这个数据好几次了。

需要把探针ID转换为基因ID

代码如下:

library("hgu133a.db") # version 2.8.0
x <- hgu133aSYMBOL
mapped_probes <- mappedkeys(x)
names(mapped_probes) <- as.character(x[mapped_probes])
affy2sym <- as.character(x[mapped_probes])
sym2affy <- affy2sym
names(sym2affy) <- names(affy2sym)
rownames(exprDataU133a) <- sym2affy[rownames(exprDataU133a)]
​
> head(sym2affy)
  1053_at    117_at    121_at 1255_g_at   1316_at   1320_at 
   "RFC2"   "HSPA6"    "PAX8"  "GUCA1A"    "THRA"  "PTPN21"> dim(exprDataU133a);exprDataU133a[1:4,1:4]
[1] 22283   264
      GSM246523 GSM246524 GSM246525 GSM246526
<NA>   235.5230  498.2220  309.2070  307.5690
RFC2    41.4470   69.0219   69.3994   36.9310
HSPA6   84.8689   56.8352   49.4388   54.6669
PAX8   530.4490  457.9310  536.1780  325.3630

这时候可以看到有些探针是没办法对应到基因的。

载入GDSC数据库

人工整理好的 sensitivity_data_for_drug_104.csv 文件包含着药物在细胞系的信息。

load(file="../Data/GdscProcData/gdsc_brainarray_syms.RData") 
sensBortezomib <- read.csv("../Data/bortezomibData/sensitivity_data_for_drug_104.csv", 
as.is=TRUE)
bortic50s <- sensBortezomib$"IC.50"
names(bortic50s) <- sensBortezomib$"Cell.Line.Name"
tissue <- sensBortezomib$"Tissue"
names(tissue) <- sensBortezomib$"Cell.Line.Name"
 
pData <- read.delim("../Data/GdscPdata/E-MTAB-783.sdrf.txt", as.is=TRUE)
pDataUnique <- pData[pData$Source.Name %in% names(which(table(pData$Source.Name) ==
1)), ]
rownames(pDataUnique) <- pDataUnique$Source.Name
commonCellLines <- rownames(pDataUnique)[rownames(pDataUnique) %in% names(bortic50s)]
pDataUniqueOrd <- pDataUnique[commonCellLines, ]
bortic50sOrd <- bortic50s[commonCellLines]
trainDataOrd <- gdsc_brainarray_syms[, pDataUniqueOrd$"Array.Data.File"]

可以看到的是,虽然GDSC数据库里面关于Bortezomib药物的记录有352个细胞系,去冗余后剩下 280 个。

> head(sensBortezomib);dim(sensBortezomib) 
   Drug.Name Cell.Line.Name             Tissue Cancer.Type COSMIC.ID IC.Results.ID   IC.50 IC.50.Low IC.50.High    Beta
1 Bortezomib           BC-3    B cell lymphoma       blood    910918       1131350 -5.0189   -6.0704   -2.35070 1.60370
2 Bortezomib           BC-1    B cell lymphoma       blood    910919       1012645 -6.0692   -6.2088   -5.72720 1.72680
3 Bortezomib          A101D malignant_melanoma        skin    910921        969219 -8.3978   -8.7201   -8.04290 1.08380
4 Bortezomib          SCC-3    B cell lymphoma       blood    910930       1158865 -6.0446   -6.6354   -5.34020 2.20430
5 Bortezomib          AM-38             glioma         CNS    910933        869044 -2.4295   -3.3882   -0.68546 0.96089
6 Bortezomib         A4-Fuk malignant_melanoma        skin    910934       1131250 -7.3812   -7.7508   -6.65810 2.16070
  Timestamp
1 02-MAR-12
2 21-JUL-11
3 08-SEP-10
4 14-OCT-11
5 27-AUG-10
6 24-FEB-12
[1] 352  11
> trainDataOrd[1:4,1:3];dim(trainDataOrd)
     5500024034290101707049.A01.CEL 5500024032848101507998.F02.CEL 5500024032848101507998.E03.CEL
NAT2                       4.682166                       4.472247                       4.553256
ADA                       10.476154                       6.531855                       8.314500
CDH2                       6.668168                       6.678154                       7.211503
AKT3                       7.158402                       7.006838                       7.592698
[1] 12092   280

预测IC50就一句话

predictedSensitivity133a <- calcPhenotype(exprDataU133a[, bortIndex], trainDataOrd, bortic50sOrd,
selection=1)

其实是被包装到了4个R脚本里面。

scriptsDir <- "../scripts"
source(file.path(scriptsDir, "compute_phenotype_function.R"))
source(file.path(scriptsDir, "summarizeGenesByMean.R"))
source(file.path(scriptsDir, "homogenize_data.R"))
source(file.path(scriptsDir, "do_variable_selection.R"))

然后就可以检验一下预测结果, 在 85个病人和 84个病人组别里面比较预测的IC50值。

resp133a
PGx_Responder = IE PGx_Responder = NR  PGx_Responder = R 
                19                 84                 85

代码是:

resp133a <- c(as.character(pData(bortezomib_mas5[[1]])[, "characteristics_ch1.8"]),
as.character(pData(bortezomib_mas5[[2]])[, "characteristics_ch1.8"]))[bortIndex]
​
t.test(predictedSensitivity133a[resp133a == "PGx_Responder = NR"],
predictedSensitivity133a[resp133a == "PGx_Responder = R"], alternative="greater")
​
lTwoa <- list("Responder"=predictedSensitivity133a[resp133a == "PGx_Responder = R"],
"Non-responder"=predictedSensitivity133a[resp133a == "PGx_Responder = NR"])
boxplot(lTwoa, outline=FALSE, border="grey", ylab="Predicted Sensitivity (log(IC50)",
main="(a)")
stripchart(lTwoa, vertical=TRUE, pch=20, method="jitter", add=TRUE)
​

同样的代码也可以应用到其它数据;

setwd("cisplatinAnalysis/") 
library("ridge")
library("sva")
library("car")
library("preprocessCore")
library("ROCR") 
source("../scripts/compute_phenotype_function.R")
source("../scripts/summarizeGenesByMean.R")
source("../scripts/homogenize_data.R")
source("../scripts/do_variable_selection.R")
​
# 首先载入两个表达矩阵
​
load(file="../Data/GdscProcData/gdsc_brainarray_syms.RData")
load(file="../Data/cisplatinData/cisplatinBreast.RData")
​
# 然后载入药物处理信息
sensCis <- read.csv("../Data/cisplatinData/sensitivity_data_for_drug_1005.csv", as.is=TRUE)
cisic50s <- sensCis$"IC.50"
names(cisic50s) <- sensCis$"Cell.Line.Name"
pData <- read.delim("../Data/GdscPdata/E-MTAB-783.sdrf.txt", as.is=TRUE)
pDataUnique <- pData[pData$Source.Name %in% names(which(table(pData$Source.Name) == 1)), ]
rownames(pDataUnique) <- pDataUnique$Source.Name
commonCellLines <- rownames(pDataUnique)[rownames(pDataUnique) %in% names(cisic50s)]
pDataUniqueOrd <- pDataUnique[commonCellLines, ]
cisic50sOrd <- cisic50s[commonCellLines]
trainDataOrd <- gdsc_brainarray_syms[, pDataUniqueOrd$"Array.Data.File"]
​
dim(trainDataOrd)
dim(cisVivoNorm_syms)
# 最后根据已知的497个细胞系的药物处理信息,来推断想预测的24个 样本的 药物反应情况。
newPtype <- calcPhenotype(cisVivoNorm_syms, trainDataOrd, cisic50sOrd, selection=1)
​
## 绘图可视化预测结果
l <- list(cCR=newPtype[respOrd=="cCR"], cPR=newPtype[respOrd=="cPR"],  SD=newPtype[respOrd=="SD"], PD=newPtype[respOrd=="PD"])
boxplot(l, outline=FALSE, border="grey", ylab="Predicted Sensitivity (log(IC50)",
main="(a)")
stripchart(l, vertical=TRUE, pch=20, method="jitter", add=TRUE)
## 统计检验分类的显著性
respVar <- c(rep(0, length(l[["cCR"]])), rep(1, length(l[["cPR"]])), rep(2, length(l[["SD"]])), rep(3, length(l[["PD"]])))
predResp <- c(l[["cCR"]], l[["cPR"]], l[["SD"]], l[["PD"]])
summary(lm(predResp~respVar))
sapply(l, median)
 
loocvOut <- predictionAccuracyByCv(cisVivoNorm_syms, trainDataOrd, cisic50sOrd)
cor.test(loocvOut$cvPtype, loocvOut$realPtype)
 
sessionInfo()
​
​
​

Comments are closed.