|
|
@@ -0,0 +1,173 @@ |
|
|
|
#!/usr/bin/env Rscript |
|
|
|
###Copyright 2019 Ying Yu from Fudan-PGx group |
|
|
|
# example: |
|
|
|
# Rscript RNAseq_5_pwGSEA.R -o /home/yuying/rnaseqreport_test -i example_geneexp_log2fpkm_floor0p01_c13r58395_2019-04-30.txt -g group13_2.txt |
|
|
|
|
|
|
|
suppressPackageStartupMessages(library("optparse")) |
|
|
|
suppressPackageStartupMessages(library("fgsea")) |
|
|
|
|
|
|
|
# specify our desired options in a list |
|
|
|
# by default OptionParser will add an help option equivalent to |
|
|
|
# make_option(c("-h", "--help"), action="store_true", default=FALSE, |
|
|
|
# help="Show this help message and exit") |
|
|
|
|
|
|
|
# input input list , rds, from * to * |
|
|
|
|
|
|
|
option_list <- list( |
|
|
|
make_option(c("-o", "--out_dir"), type="character",default="./", |
|
|
|
help="The output directory [default ./]"), |
|
|
|
make_option(c("-i", "--input"),type="character", default=NULL, |
|
|
|
help="The input expression files. Required!"), |
|
|
|
make_option(c("-e", "--type_gene_id"),type="character", default="EnsemblGID", |
|
|
|
help="The type of gene symbol. Could be either of EnsemblGID/EntrezID/GeneSymbol [default: EnsemblGID]"), |
|
|
|
make_option(c("-g", "--sample_group"),type="character", default=NULL, |
|
|
|
help="File for sample group infomation.The input file containing sample name and group infomation. note colname must be like: sample group1 group2... Required! "), |
|
|
|
make_option(c("-q", "--padjvalueCutoff"), type="double",default=0.2,metavar="number", |
|
|
|
help="Cutoff value of adjusted p value. [default: 0.2]"), |
|
|
|
make_option(c("-p", "--project_code"), type="character",default="rnaseq", |
|
|
|
help="Project code, which is used as prefix of output file. [default: rnaseq]") |
|
|
|
) |
|
|
|
|
|
|
|
# get command line options, if help option encountered print help and exit, |
|
|
|
# otherwise if options not found on command line then set defaults, |
|
|
|
opt <- parse_args(OptionParser(option_list=option_list)) |
|
|
|
|
|
|
|
if (is.null(opt$input)){ |
|
|
|
print_help(opt_parser) |
|
|
|
stop("At least one argument must be supplied (input file).", call.=FALSE) |
|
|
|
} |
|
|
|
|
|
|
|
if (is.null(opt$sample_group)){ |
|
|
|
stop("At least one argument must be supplied (input group infomation for DEG analysis).", call.=FALSE) |
|
|
|
} |
|
|
|
|
|
|
|
##import file |
|
|
|
out_dir<-paste(gsub("/$","",opt$out_dir),"/",sep="") |
|
|
|
logexpr<-read.table(opt$input,header=T,stringsAsFactors=F,row.names=1) |
|
|
|
|
|
|
|
#check exp file is log scale |
|
|
|
if(max(logexpr[,1])-min(logexpr[,1])>100){ |
|
|
|
stop("DEG anlaysis should be conducted based on expression profile on log scale. Please run log2 first", call.=FALSE) |
|
|
|
} |
|
|
|
|
|
|
|
##import sample group file and check |
|
|
|
sample_group<-read.table(opt$sample_group,sep="\t",header=T) |
|
|
|
|
|
|
|
if(length(grep("group",colnames(sample_group)))==0){ |
|
|
|
stop("No group is identified in sample_group file. Make sure the head of sample_group file is like sample, group1, group2.") |
|
|
|
} |
|
|
|
#c2: curated gene sets (rdata file) |
|
|
|
load("./human_c2_v5p2.rdata") |
|
|
|
#c5: GO gene sets (rdata file) |
|
|
|
load("./human_c5_v5p2.rdata") |
|
|
|
|
|
|
|
########################## |
|
|
|
#########ID convert####### |
|
|
|
########################## |
|
|
|
|
|
|
|
message("Begin ID conversion.") |
|
|
|
|
|
|
|
if(length(grep("ID_convert_table.rds",dir()))>0){ |
|
|
|
idconvert<-readRDS("./ID_convert_table.rds") |
|
|
|
}else{ |
|
|
|
stop("Cannot find ID_convert_table.rds in the working folder. Exit!", call.=FALSE) |
|
|
|
} |
|
|
|
|
|
|
|
if(opt$type_gene_id=="EnsemblGID"){ |
|
|
|
gene_entrez<-idconvert$EntrezID[match(rownames(logexpr),idconvert$EnsemblID)] |
|
|
|
if(length(which(is.na(gene_entrez)))==nrow(logexpr)){ |
|
|
|
stop("Cannot convert Ensembl gene ID to Entrez gene ID. Exit!", call.=FALSE) |
|
|
|
}else{ |
|
|
|
logexpr1<-logexpr[!gene_entrez=="",] |
|
|
|
gene_entrez1<-gene_entrez[!gene_entrez==""] |
|
|
|
logexpr.entrez<-apply(logexpr1,2,function(x){unlist(tapply(x,as.factor(gene_entrez1),mean))}) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if(opt$type_gene_id=="GeneSymbol"){ |
|
|
|
gene_entrez<-idconvert$EntrezID[match(rownames(logexpr),idconvert$GeneSymbol)] |
|
|
|
if(length(which(is.na(gene_entrez)))==nrow(logexpr)){ |
|
|
|
stop("Cannot convert Ensembl gene ID to Entrez gene ID. Exit!", call.=FALSE) |
|
|
|
}else{ |
|
|
|
logexpr1<-logexpr[!gene_entrez=="",] |
|
|
|
gene_entrez1<-gene_entrez[!gene_entrez==""] |
|
|
|
logexpr.entrez<-apply(logexpr1,2,function(x){unlist(tapply(x,as.factor(gene_entrez1),mean))}) |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if(opt$type_gene_id=="EntrezID"){ |
|
|
|
logexpr.entrez<-logexpr |
|
|
|
} |
|
|
|
message("Finish ID conversion.") |
|
|
|
|
|
|
|
###################### |
|
|
|
######## GSEA ######## |
|
|
|
###################### |
|
|
|
|
|
|
|
groupn<-grep("group",colnames(sample_group)) |
|
|
|
|
|
|
|
c5sigall<-c() |
|
|
|
c2sigall<-c() |
|
|
|
for ( i in groupn){ |
|
|
|
compgroup<-combn(unique(sample_group[,i]), 2) |
|
|
|
for ( j in 1:ncol(compgroup)){ |
|
|
|
nam<-paste(compgroup[1,j],"vs",compgroup[2,j],sep="") |
|
|
|
versus<-paste(compgroup[1,j],"vs",compgroup[2,j],sep=" ") |
|
|
|
groupA<-logexpr.entrez[,as.character(sample_group$sample[sample_group[,i] %in% compgroup[1,j]])] |
|
|
|
groupB<-logexpr.entrez[,as.character(sample_group$sample[sample_group[,i] %in% compgroup[2,j]])] |
|
|
|
|
|
|
|
logfc<-rowMeans(groupA)-rowMeans(groupB) |
|
|
|
logfc<-logfc[order(-logfc)] |
|
|
|
|
|
|
|
#GSEA in GO term |
|
|
|
fgseaRes.c5 <- fgsea(Hs.c5, logfc, minSize=15, maxSize = 500, nperm=1000) |
|
|
|
c5sig<-fgseaRes.c5[fgseaRes.c5$padj<opt$padjvalueCutoff,] |
|
|
|
c5sig<-c5sig[order(c5sig$pval),] |
|
|
|
c5sig<-data.frame(c5sig) |
|
|
|
c5sig$leadingEdge<-sapply(c5sig$leadingEdge,function(x){paste0(unlist(x),collapse=", ")}) |
|
|
|
if(nrow(c5sig)==0){ |
|
|
|
message(paste("No significant GO term is identified in group ",nam,".",sep="")) |
|
|
|
}else{ |
|
|
|
message(paste(nrow(c5sig)," significant GO term(s) is(are) identified in group ",nam,".",sep="")) |
|
|
|
c5sigall<-rbind(c5sigall,cbind(versus,c5sig)) |
|
|
|
} |
|
|
|
#GSEA in curated gene sets |
|
|
|
|
|
|
|
fgseaRes.c2 <- fgsea(Hs.c2, logfc, minSize=15, maxSize = 500, nperm=1000) |
|
|
|
c2sig<-fgseaRes.c2[fgseaRes.c2$padj<opt$padjvalueCutoff,] |
|
|
|
c2sig<-c2sig[order(c2sig$pval),] |
|
|
|
c2sig<-data.frame(c2sig) |
|
|
|
c2sig$leadingEdge<-sapply(c2sig$leadingEdge,function(x){paste0(unlist(x),collapse=", ")}) |
|
|
|
|
|
|
|
if(nrow(c2sig)==0){ |
|
|
|
message(paste("No significant curated gene sets is identified in group ",nam,".",sep="")) |
|
|
|
}else{ |
|
|
|
message(paste(nrow(c2sig)," significant curated gene sets are identified in group ",nam,".",sep="")) |
|
|
|
c2sigall<-rbind(c2sigall,cbind(versus,c2sig)) |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
if(nrow(c5sigall)==0){ |
|
|
|
message("No significant GO term is identified.") |
|
|
|
}else{ |
|
|
|
c5sigall$pval<-signif(c5sigall$pval,4) |
|
|
|
c5sigall$padj<-signif(c5sigall$padj,4) |
|
|
|
c5sigall$ES<-signif(c5sigall$ES,4) |
|
|
|
c5sigall$NES<-signif(c5sigall$NES,4) |
|
|
|
rownames(c5sigall)<-c(1:nrow(c5sigall)) |
|
|
|
write.csv(c5sigall,paste(out_dir,opt$project_code,"_gsea_go.csv",sep="")) |
|
|
|
} |
|
|
|
|
|
|
|
if(nrow(c2sigall)==0){ |
|
|
|
message("No significant GO term is identified.") |
|
|
|
}else{ |
|
|
|
c2sigall$pval<-signif(c2sigall$pval,4) |
|
|
|
c2sigall$padj<-signif(c2sigall$padj,4) |
|
|
|
c2sigall$ES<-signif(c2sigall$ES,4) |
|
|
|
c2sigall$NES<-signif(c2sigall$NES,4) |
|
|
|
rownames(c2sigall)<-c(1:nrow(c2sigall)) |
|
|
|
write.csv(c2sigall,paste(out_dir,opt$project_code,"_gsea_curatedgenesets.csv",sep="")) |
|
|
|
} |
|
|
|
|
|
|
|
|