6 年前 · 90c446a726
--- a/RNAseq_5_pwGSEA.R
+++ b/RNAseq_5_pwGSEA.R
@@ -0,0 +1,173 @@
 #!/usr/bin/env Rscript
 ###Copyright 2019 Ying Yu from Fudan-PGx group 
 # example:
 # Rscript RNAseq_5_pwGSEA.R -o /home/yuying/rnaseqreport_test -i example_geneexp_log2fpkm_floor0p01_c13r58395_2019-04-30.txt  -g group13_2.txt 

 suppressPackageStartupMessages(library("optparse"))
 suppressPackageStartupMessages(library("fgsea"))

 # specify our desired options in a list
 # by default OptionParser will add an help option equivalent to 
 # make_option(c("-h", "--help"), action="store_true", default=FALSE, 
 #               help="Show this help message and exit")

 # input input list , rds, from * to *

 option_list <- list( 
    make_option(c("-o", "--out_dir"), type="character",default="./",
        help="The output directory [default ./]"),
    make_option(c("-i", "--input"),type="character", default=NULL,
        help="The input expression files. Required!"),
 	make_option(c("-e", "--type_gene_id"),type="character", default="EnsemblGID",
        help="The type of gene symbol. Could be either of EnsemblGID/EntrezID/GeneSymbol [default: EnsemblGID]"),
 	make_option(c("-g", "--sample_group"),type="character",  default=NULL,
        help="File for sample group infomation.The input file containing sample name and group infomation. note colname must be like: sample	group1	group2... Required! "),
 	make_option(c("-q", "--padjvalueCutoff"), type="double",default=0.2,metavar="number",
 		help="Cutoff value of adjusted p value. [default: 0.2]"),
   	make_option(c("-p", "--project_code"), type="character",default="rnaseq",
        help="Project code, which is used as prefix of output file. [default: rnaseq]")
 		)

 # get command line options, if help option encountered print help and exit,
 # otherwise if options not found on command line then set defaults, 
 opt <- parse_args(OptionParser(option_list=option_list))

 if (is.null(opt$input)){
  print_help(opt_parser)
  stop("At least one argument must be supplied (input file).", call.=FALSE)
 }

 if (is.null(opt$sample_group)){
  stop("At least one argument must be supplied (input group infomation for DEG analysis).", call.=FALSE)
 }

 ##import file
 out_dir<-paste(gsub("/$","",opt$out_dir),"/",sep="")
 logexpr<-read.table(opt$input,header=T,stringsAsFactors=F,row.names=1)

 #check exp file is log scale
 if(max(logexpr[,1])-min(logexpr[,1])>100){
  stop("DEG anlaysis should be conducted based on expression profile on log scale. Please run log2 first", call.=FALSE)
 }

 ##import sample group file and check 
 sample_group<-read.table(opt$sample_group,sep="\t",header=T)

 if(length(grep("group",colnames(sample_group)))==0){
 stop("No group is identified in sample_group file. Make sure the head of sample_group file is like sample, group1, group2.")
 }
 #c2: curated gene sets (rdata file)
 load("./human_c2_v5p2.rdata")
 #c5: GO gene sets (rdata file)
 load("./human_c5_v5p2.rdata")

 ##########################
 #########ID convert#######
 ##########################

 message("Begin ID conversion.")

 if(length(grep("ID_convert_table.rds",dir()))>0){
 idconvert<-readRDS("./ID_convert_table.rds")
 }else{
 stop("Cannot find ID_convert_table.rds in the working folder. Exit!", call.=FALSE)
 }

 if(opt$type_gene_id=="EnsemblGID"){
 gene_entrez<-idconvert$EntrezID[match(rownames(logexpr),idconvert$EnsemblID)]
 if(length(which(is.na(gene_entrez)))==nrow(logexpr)){
 stop("Cannot convert Ensembl gene ID to Entrez gene ID. Exit!", call.=FALSE)
 }else{
 logexpr1<-logexpr[!gene_entrez=="",]
 gene_entrez1<-gene_entrez[!gene_entrez==""]
 logexpr.entrez<-apply(logexpr1,2,function(x){unlist(tapply(x,as.factor(gene_entrez1),mean))})
 }
 }

 if(opt$type_gene_id=="GeneSymbol"){
 gene_entrez<-idconvert$EntrezID[match(rownames(logexpr),idconvert$GeneSymbol)]
 if(length(which(is.na(gene_entrez)))==nrow(logexpr)){
 stop("Cannot convert Ensembl gene ID to Entrez gene ID. Exit!", call.=FALSE)
 }else{
 logexpr1<-logexpr[!gene_entrez=="",]
 gene_entrez1<-gene_entrez[!gene_entrez==""]
 logexpr.entrez<-apply(logexpr1,2,function(x){unlist(tapply(x,as.factor(gene_entrez1),mean))})
 }
 }

 if(opt$type_gene_id=="EntrezID"){
 logexpr.entrez<-logexpr
 }
 message("Finish ID conversion.")

 ######################
 ######## GSEA ########
 ######################

 groupn<-grep("group",colnames(sample_group))

 c5sigall<-c()
 c2sigall<-c()
 for ( i in groupn){
 compgroup<-combn(unique(sample_group[,i]), 2)
 for ( j in 1:ncol(compgroup)){
 nam<-paste(compgroup[1,j],"vs",compgroup[2,j],sep="")
 versus<-paste(compgroup[1,j],"vs",compgroup[2,j],sep=" ")
 groupA<-logexpr.entrez[,as.character(sample_group$sample[sample_group[,i] %in% compgroup[1,j]])]
 groupB<-logexpr.entrez[,as.character(sample_group$sample[sample_group[,i] %in% compgroup[2,j]])]

 logfc<-rowMeans(groupA)-rowMeans(groupB)
 logfc<-logfc[order(-logfc)]

 #GSEA in GO term
 fgseaRes.c5 <- fgsea(Hs.c5, logfc, minSize=15, maxSize = 500, nperm=1000)
 c5sig<-fgseaRes.c5[fgseaRes.c5$padj<opt$padjvalueCutoff,]
 c5sig<-c5sig[order(c5sig$pval),]
 c5sig<-data.frame(c5sig)
 c5sig$leadingEdge<-sapply(c5sig$leadingEdge,function(x){paste0(unlist(x),collapse=", ")})
 if(nrow(c5sig)==0){
 message(paste("No significant GO term is identified in group ",nam,".",sep=""))
 }else{
 message(paste(nrow(c5sig)," significant GO term(s) is(are) identified in group ",nam,".",sep=""))
 c5sigall<-rbind(c5sigall,cbind(versus,c5sig))
 }
 #GSEA in curated gene sets

 fgseaRes.c2 <- fgsea(Hs.c2, logfc, minSize=15, maxSize = 500, nperm=1000)
 c2sig<-fgseaRes.c2[fgseaRes.c2$padj<opt$padjvalueCutoff,]
 c2sig<-c2sig[order(c2sig$pval),]
 c2sig<-data.frame(c2sig)
 c2sig$leadingEdge<-sapply(c2sig$leadingEdge,function(x){paste0(unlist(x),collapse=", ")})

 if(nrow(c2sig)==0){
 message(paste("No significant curated gene sets is identified in group ",nam,".",sep=""))
 }else{
 message(paste(nrow(c2sig)," significant curated gene sets are identified in group ",nam,".",sep=""))
 c2sigall<-rbind(c2sigall,cbind(versus,c2sig))
 }
 }
 }

 if(nrow(c5sigall)==0){
 message("No significant GO term is identified.")
 }else{
 c5sigall$pval<-signif(c5sigall$pval,4)
 c5sigall$padj<-signif(c5sigall$padj,4)
 c5sigall$ES<-signif(c5sigall$ES,4)
 c5sigall$NES<-signif(c5sigall$NES,4)
 rownames(c5sigall)<-c(1:nrow(c5sigall))
 write.csv(c5sigall,paste(out_dir,opt$project_code,"_gsea_go.csv",sep=""))
 }

 if(nrow(c2sigall)==0){
 message("No significant GO term is identified.")
 }else{
 c2sigall$pval<-signif(c2sigall$pval,4)
 c2sigall$padj<-signif(c2sigall$padj,4)
 c2sigall$ES<-signif(c2sigall$ES,4)
 c2sigall$NES<-signif(c2sigall$NES,4)
 rownames(c2sigall)<-c(1:nrow(c2sigall))
 write.csv(c2sigall,paste(out_dir,opt$project_code,"_gsea_curatedgenesets.csv",sep=""))
 }

 
--- a/RNAseq_6_enrichfunc.R
+++ b/RNAseq_6_enrichfunc.R
@@ -1,7 +1,7 @@
 #!/usr/bin/env Rscript
 ###Copyright 2019 Ying Yu from Fudan-PGx group 
 # example:
 # Rscript RNAseq_2_pca.R -o /home/yuying/rnaseqreport_test -i ballgown_geneexp_log2fpkm_floor0p01_c3r58395_2019-04-29.txt  -g group1.txt -p organoid 
 # Rscript RNAseq_6_enrichfunc.R -o /home/yuying/rnaseqreport_test -i ballgown_geneexp_log2fpkm_floor0p01_c3r58395_2019-04-29.txt  -g group1.txt -p organoid 

 suppressPackageStartupMessages(library("optparse"))
 suppressPackageStartupMessages(library("clusterProfiler"))
@@ -108,8 +108,6 @@ colnames(ekg1)[1]<-c("versus")
 ekeggall<-rbind(ekeggall,ekg1)
 }



 }else{
 for (i in 1:length(groupn)){
 g1<-gene$EntrezID[gene[,2]==groupn[i]]
@@ -132,12 +130,30 @@ colnames(ekg1)[1]<-c("versus")
 ekeggall<-rbind(ekeggall,ekg1)
 }
 }

 }

 #write output
 if(nrow(egoall)==0){
 message("No significant GO term is identified.")
 }else{
 message(paste(nrow(egoall),"significant GO term(s) is(are) identified."))
 rownames(egoall)<-c(1:nrow(egoall))
 egoall$pvalue<-signif(egoall$pvalue,4)
 egoall$p.adjust<-signif(egoall$p.adjust,4)
 egoall$qvalue<-signif(egoall$qvalue,4)
 write.csv(egoall,paste(out_dir,opt$project_code,"_GOenrich.csv",sep=""))
 }

 if(nrow(ekeggall)==0){
 message("No significant KEGG pathway is identified.")
 }else{
 message(paste(nrow(ekeggall),"significant KEGG pathway(s) is(are) identified."))
 rownames(ekeggall)<-c(1:nrow(ekeggall))
 ekeggall$pvalue<-signif(ekeggall$pvalue,4)
 ekeggall$p.adjust<-signif(ekeggall$p.adjust,4)
 ekeggall$qvalue<-signif(ekeggall$qvalue,4)
 write.csv(ekeggall,paste(out_dir,opt$project_code,"_KEGGenrich.csv",sep=""))
 }
 ########