你还缺乳腺癌表达量数据集吗

生存分析你还是在TCGA吗?

最近有粉丝求助说他研究乳腺癌做了单细胞转录组数据,定位到了一个稀有细胞亚群,想看看他感兴趣的亚群细胞特异性基因的临床意义,问我有没有除了TCGA数据库之外的其它数据库资源推荐。恰好我做这方面就顺手检索了一下,发现了 curatedBreastData 包,值得推荐!

安装和加载相信已经无需我多说了:

# Install only when the package is missing: it is close to 300 MB, so an
# unconditional install would re-download it on every run.
if (!requireNamespace("curatedBreastData", quietly = TRUE)) {
  # BiocManager itself comes from CRAN and may not be installed yet.
  if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
  }
  BiocManager::install("curatedBreastData")
}
library("curatedBreastData")

包内置的表达矩阵数据集

# Load the list of expression sets bundled with the package.
data("curatedBreastDataExprSetList")
# The list holds 34 datasets: print how many there are, then their names.
length(curatedBreastDataExprSetList)
names(curatedBreastDataExprSetList)

可以看到大部分的芯片都来自affy公司,主要是GPL96(对应hgu133a)和GPL570(对应hgu133plus2):

> names(curatedBreastDataExprSetList)
 [1] "study_1379_GPL1223_all" "study_2034_GPL96_all" 
 [3] "study_4913_GPL3558_all" "study_6577_GPL3883_all" 
 [5] "study_9893_GPL5049_all" "study_12071_GPL5186_all" 
 [7] "study_12093_GPL96_all" "study_16391_GPL570_all" 
 [9] "study_16446_GPL570_all" "study_17705_GPL96_JBI_Tissue_BC_Tamoxifen" 
[11] "study_17705_GPL96_MDACC_Tissue_BC_Tamoxifen" "study_18728_GPL570_all" 
[13] "study_19615_GPL570_all" "study_19697_GPL570_all" 
[15] "study_20181_GPL96_all" "study_20194_GPL96_all" 
[17] "study_21974_GPL6480_all" "study_21997_GPL1390_all" 
[19] "study_21997_GPL5325_all" "study_21997_GPL7504_all" 
[21] "study_22226_GPL1708_all" "study_22226_GPL4133_all" 
[23] "study_22358_GPL5325_all" "study_23428_GPL5325_all" 
[25] "study_25055_GPL96_MDACC_M" "study_25055_GPL96_MDACC_PERU" 
[27] "study_25065_GPL96_LBJ" "study_25065_GPL96_MDACC" 
[29] "study_25065_GPL96_MDACC_MDA" "study_25065_GPL96_PERU" 
[31] "study_25065_GPL96_Spain" "study_25065_GPL96_USO" 
[33] "study_32646_GPL570_all" "study_33658_GPL570_all"

每个数据集都是一个独立的ExpressionSet对象:

> eset=curatedBreastDataExprSetList[[3]]
> head(pData(eset)[c(1:3), c(1:10)])
 datasetName dbUniquePatientID study_ID.x patient_ID GEO_GSMID platform_ID
110388 study_4913_GPL3558_all 597 4913 110388 110388 3558
110392 study_4913_GPL3558_all 598 4913 110392 110392 3558
110394 study_4913_GPL3558_all 599 4913 110394 110394 3558
 GEO_platform_ID AE_platform_ID coordinating_GSE_series_GSMID original_study_ID
110388 GPL3558 <NA> NA wsb 10167
110392 GPL3558 <NA> NA wsb 1281
110394 GPL3558 <NA> NA wsb 1319

这个ExpressionSet对象凡是搞GEO数据挖掘的应该是都没有问题的。

可以看到,每个数据集的独立对象里面其实是有该样品的表型信息,但是没有临床属性,临床属性存储在另外一个对象(clinicalData)里面。34个数据集合起来是2719个样品,总共有139个临床属性。

image-20201130160814662

探索这个临床属性的代码如下:

# Load the clinical annotation object shipped with the package.
data("clinicalData")
# Show the first two clinical variable definitions.
clinicalData$clinicalVarDef[1:2, ]
# Check out the treatment information: columns 112 through the last one,
# shown for the first three patients only.
head(clinicalData$clinicalTable)[
  seq_len(3),
  seq(112, ncol(clinicalData$clinicalTable))
]
# Count the patients recorded as having had chemotherapy
# (chemotherapyClass == 1); NA entries are not counted.
numChemoPatients <- sum(
  clinicalData$clinicalTable$chemotherapyClass == 1,
  na.rm = TRUE
)
# around 1500 had chemotherapy
numChemoPatients
# Count the patients who specifically had a taxane chemotherapy (taxane == 1).
numChemoTaxane <- sum(clinicalData$clinicalTable$taxane == 1, na.rm = TRUE)
numChemoTaxane

# Count the patients whose therapy is documented as adjuvant
# (neoadjuvant_or_adjuvant == "adj"); NA entries are not counted.
numAdjPatients <- sum(
  clinicalData$clinicalTable$neoadjuvant_or_adjuvant == "adj",
  na.rm = TRUE
)
# over 1000 had (documented) adjuvant therapy
numAdjPatients

# Number of patients with a non-NA binary OS value.
sum(!is.na(clinicalData$clinicalTable$OS))
# Number of patients with OS in the more granular "months" form.
# This variable includes studies that had a ceiling for tracking OS.
sum(
  !is.na(clinicalData$clinicalTable$OS_months_or_MIN_months_of_OS)
)
# Number of patients whose OS was definitively followed up until death
# (details on how studies collect OS data can be surprising!).
sum(!is.na(clinicalData$clinicalTable$OS_up_until_death))

这个包还提供了一个数据处理函数:The wrapper function processExpressionSetList() completes all these post-processing steps on a list of S4 ExpressionSet objects like the curatedBreastDataExprSetList.rda list provided in this package.

不过,我觉得没有必要使用它的函数了。

Comments are closed.