看到vip交流群有人询问根据基因名称,批量下载碱基序列的方法,其实就是一个R包《rentrez: An R package for the NCBI eUtils API》
有意思的是,我根据rentrez这样的关键词进行谷歌搜索的时候,无意中发现了:https://journal.r-project.org/archive/2017/RJ-2017-058/index.html
简单探索了一下,发现这个期刊可能是蛮久都没有更新了,最新的一期是:
- https://journal.r-project.org/archive/2020-2/ ,Volume 12/2, December 2020
再怎么强调生物信息学数据分析学习过程的计算机基础知识的打磨都不为过,我把它粗略的分成基于R语言的统计可视化,以及基于Linux的NGS数据处理:
把R的知识点路线图搞定,如下:
- 了解常量和变量概念
- 加减乘除等运算(计算器)
- 多种数据类型(数值,字符,逻辑,因子)
- 多种数据结构(向量,矩阵,数组,数据框,列表)
- 文件读取和写出
- 简单统计可视化
- 无限量函数学习
当然了,rentrez 确实是很方便,不过vip交流群的HT也分享了他自己的代码,如下所示:
library(Biostrings) # Provides DNAString, DNAStringSet, etc
library(BSgenome) # Provides getSeq()
library(GenomicRanges) # Provides GRanges, etc
library(GenomicFeatures)
library(rtracklayer) # Provides import() and export()
library(BSgenome.Hsapiens.UCSC.hg38)
Package Biostrings
offers classes for storing DNA strings, DNAString, amino acid sequences, AAString, or anything else in a BString. These are very like character strings, but a variety of biologically meaningful functions can be applied to them.
myseq <- DNAString("ACCATTGATTAT") #load seq into R
as.character(myseq)
reverseComplement(myseq) #reverse seq
translate(myseq) #Protein seq
Often we want to work with a list of sequences, such as chromosomes.
myset <- DNAStringSet( list(chrI=myseq, chrII=DNAString("ACGTACGT")) )
# We may then wish to refer to regions of these sequences, often with an associated strand. This is done with the GRanges type. GRanges builds on IRanges, “integer ranges”. An IRanges has a starts and ends. A GRanges additionally has sequence names and strand information.
# forward seq
range1 <- GRanges("chrI", IRanges(start=3,end=5), strand="+")
range1
getSeq(myset, range1)
# reverse seq
range2 <- GRanges("chrI", IRanges(start=3,end=5), strand="-")
getSeq(myset, range2)
# Accessing GRanges data
seqnames(range1)
# strand
strand(range1)
# transfer to as.data.frame
as.data.frame(range1)
# GRanges are sometimes like vectors:
c(range1, range2)
# GRanges can have metadata columns, so they are also like data frames:
mcols(range1)$wobble <- 10
range1
mcols(range1)
# A handy way to create a GRanges
as("chrI:3-5:+", "GRanges")
extract seq from interested region
## seqs <- readDNAStringSet("r-more-files/Escherichia_coli_k_12.GCA_000800765.1.29.dna.genome.fa")
library(BSgenome.Mmusculus.UCSC.mm10)
myseq<- getSeq(Mmusculus,"chr2",start=10000,end=10020)
setwd("~/Desktop/Wenhui")
sites<- read.table("sites.txt",sep="\t",header=T,stringsAsFactors = F)
# transfer gene and location info to GRange format
ranges<- GRanges(sites$Chromosome, IRanges(start=sites$Start_Position-300,end=sites$End_Position+300), strand=sites$Strand)
# extract sequence
seqs<- getSeq(Mmusculus,sites$Chromosome,start=sites$Start_Position-300,end=sites$End_Position+300,strand=sites$Strand)
# GRanges can have metadata columns, so they are also like data frames:
mcols(ranges)$geneid<- sites$Hugo_Symbol
mcols(ranges)$Variant_Classification<- sites$Variant_Classification
mcols(ranges)$rsid<- sites$dbSNP_RS
mcols(ranges)$seq<- seqs
# as.data.frame()
seq.data<- as.data.frame(ranges)
# createLink
library(org.Mm.eg.db)
eg2symbol=toTable(org.Mm.egSYMBOL)
eg2name=toTable(org.Mm.egGENENAME)
eg2alias=toTable(org.Mm.egALIAS2EG)
eg2alis_list=lapply(split(eg2alias,eg2alias$gene_id),function(x){paste0(x[,2],collapse = ";")})
symbols=seq.data$geneid
geneIds=eg2symbol[match(symbols,eg2symbol$symbol),'gene_id']
geneAlias=sapply(geneIds,function(x){ifelse(is.null(eg2alis_list[[x]]),"no_alias",eg2alis_list[[x]])})
geneNames=eg2name[match(geneIds,eg2name$gene_id),'gene_name']
createLink <- function(base,val) {
sprintf('<a href="%s" class="btn btn-link" target="_blank" >%s</a>',base,val)
## target="_blank"
}
if(F){
gene_info=data.frame(
symbols=symbols,
geneIds=createLink(paste0("http://www.ncbi.nlm.nih.gov/gene/",geneIds),geneIds),
geneCard=createLink(paste0("https://www.genecards.org/cgi-bin/carddisp.pl?gene=",symbols,"&keywords=",symbols),symbols),
geneNames=geneNames,
genegCHR=seq.data$seqnames,
genegCHRLOCSTART=seq.data$start,
geneCHRLOCEND=seq.data$end,
geneStrand=seq.data$strand,
geneAlias=geneAlias,
stringsAsFactors = F
)
gene_info_ok=na.omit(gene_info)
#library("xtable")
#print(xtable(gene_info), type="html",include.rownames=F, file='all_gene.anno',sanitize.text.function = force)
file='gene_seq_ok.html'
y <- DT::datatable(gene_info_ok,escape = F,rownames=F)
DT::saveWidget(y,file)
}