加载包
rm(list = ls())
options(stringsAsFactors = F)
gc()
library(TCGAbiolinks)
library(scRNAseq)
library(data.table)
library(limma)
library(dplyr)
library(DT)
library(survival)
library(survminer)
具体如何获取矩阵和生存数据,在我去年TCGA更新之后写的很清楚
TCGAbiolinks整理表达数据和临床数据这篇推文地址在下面
https://blog.csdn.net/sayhello1025/article/details/125218337
下载TCGA 胃癌的数据
###表达数据下载
query <- GDCquery(
project = "TCGA-STAD",
data.category = "Transcriptome Profiling",
data.type = "Gene Expression Quantification",
workflow.type = "STAR - Counts"
)
GDCdownload(query = query)
expData<- GDCprepare(query = query,
save = TRUE,
save.filename = 'STAD_exp.rda'
)
##利用scRNAseq这个包直接获取
tpm_data <- assay(expData,i = "tpm_unstrand")
row_file <- data.table::fread('./GDCdata/TCGA-STAD/Transcriptome_Profiling/Gene_Expression_Quantification/002cf74b-098b-4de6-9a76-0d889c894ab6/49a0c23d-e0e7-4b67-b9f0-527c14105345.rna_seq.augmented_star_gene_counts.tsv',data.table = F)
row_file <- row_file[-c(1:4),]
rownames(row_file) <- row_file[,1]
###
same <- intersect(row.names(tpm_data),row.names(row_file))
STAD_tpmExp <- cbind(row_file[same,],tpm_data[same,])
STAD_tpmExp <- STAD_tpmExp[,-c(1,3:9)]
rt=as.matrix(STAD_tpmExp)
rownames(rt)=rt[,1]
exp=rt[,2:ncol(rt)]
dimnames=list(rownames(exp),colnames(exp))
data=matrix(as.numeric(as.matrix(exp)),nrow=nrow(exp),dimnames=dimnames)
data=avereps(data)
STAD_exp=data[rowMeans(data)>0,]
Out=rbind(id=colnames(STAD_exp), STAD_exp)
write.table(Out, file="./STAD_exp.txt", sep="\t", quote=F, col.names=F)
下在临床数据
query <- GDCquery(
project = "TCGA-STAD",
data.category = "Clinical",
data.type = "Clinical Supplement",
data.format = "BCR XML"
)
GDCdownload(query)
cli <- GDCprepare_clinic(query,'follow_up')
cli <- cli %>%
select(bcr_followup_barcode,vital_status,
days_to_death,days_to_last_followup) %>%
distinct(bcr_followup_barcode, .keep_all = TRUE)
table(cli$vital_status)
dead_patient <- cli %>%
dplyr::filter(vital_status == 'Dead') %>%
dplyr::select(-days_to_last_followup) %>%
rename(c(bcr_followup_barcode = 'Barcode',
vital_status = 'fustat',
days_to_death='futime'
)) %>%
mutate(fustat=ifelse(fustat=='Dead',1,0))%>%
mutate(futime=futime/365)
alive_patient <- cli %>%
dplyr::filter(vital_status == 'Alive') %>%
dplyr::select(-days_to_death) %>%
rename(c(bcr_followup_barcode = 'Barcode',
vital_status = 'fustat',
days_to_last_followup='futime'
)) %>%
mutate(fustat=ifelse(fustat=='Dead',1,0))%>%
mutate(futime=futime/365)
survival_data <- rbind(dead_patient,alive_patient)
write.csv(survival_data,file="./STAD_surviv.csv")
整合后跑生存分析
rt =read.table('./STAD_exp.txt',sep = '\t',header = T,check.names = F,row.names = 1)
gene = c('TNF','IL6','MMP9','MYC','HIF1A','PTGS2','MAPK3','IFNG','IL10','PPARG',
'CCND1','EGF','ICAM1','FOS','IL1A','IL2','CRP','MPO','CDKN1A','SERPINE1')
rt1 = rt[gene,]
exp <- data.frame(t(rt1))
rownames(exp) <- gsub('\\.','\\-',rownames(exp))
cli <- read.csv('./STAD_surviv.csv',header = T,row.names = 1)
exp$ID1 <- substr(rownames(exp), 1, 12)
cli$ID2 <- substr(rownames(cli), 1, 12)
map <- match(exp$ID,cli$ID)
data3 <- cli[map,]##注意看这个
data4 <- cbind(exp,data3)
data5 <- na.omit(data4)
desired_columns <- c("fustat","futime",'ID1','ID2')
all_columns <- c(desired_columns, setdiff(names(data5), desired_columns))
data6 <- data5[all_columns]
生存分析
for (i in (colnames(rt)[5:24])) {
var=i
data=rt[,c("futime","fustat",var)]
group=ifelse(data[,3]>median(data[,3]),"High","Low")
diff=survdiff(Surv(futime, fustat) ~group,data = data)
pValue=1-pchisq(diff$chisq,df=1)
if(pValue<0.001){
pValue="p<0.001"
}else{
pValue=paste0("p=",sprintf("%.03f",pValue))
}
fit <- survfit(Surv(futime, fustat) ~ group, data = data)
surPlot=ggsurvplot(fit,
data=data,
conf.int=FALSE,
pval=pValue,
pval.size=5,
legend.labs=c("High", "Low"),
legend.title=var,
xlab="Time(years)",
break.time.by = 1,
risk.table.title="",
palette=c("#ED545C","#0075C5"),
risk.table=T,ggtheme = theme_bw(),
risk.table.height=.25)
filename= paste0('./01.survival/',var,'.pdf')
pdf(file=filename,onefile = FALSE,width = 6,height =5)
print(surPlot)
dev.off()
}
效果如下