#!/usr/bin/env Rscript
###Copyright 2019 Ying Yu from Fudan-PGx group
# example:
# Rscript RNAseq_5_pwGSEA.R -o /home/yuying/rnaseqreport_test -i example_geneexp_log2fpkm_floor0p01_c13r58395_2019-04-30.txt -g group13_2.txt
#
# Reads a log-scale expression table and a sample-group table, collapses the
# expression rows to Entrez gene IDs, and runs fgsea on the mean log fold
# change of every pair of groups.

suppressPackageStartupMessages(library("optparse"))
suppressPackageStartupMessages(library("fgsea"))
suppressPackageStartupMessages(library("data.table"))

# specify our desired options in a list
# by default OptionParser will add an help option equivalent to
# make_option(c("-h", "--help"), action="store_true", default=FALSE,
# help="Show this help message and exit")
# input input list , rds, from * to *
option_list <- list(
  make_option(c("-o", "--out_dir"), type = "character", default = "./",
              help = "The output directory [default ./]"),
  make_option(c("-i", "--input"), type = "character", default = NULL,
              help = "The input expression files. Required!"),
  make_option(c("-e", "--type_gene_id"), type = "character", default = "EnsemblGID",
              help = "The type of gene symbol. Could be either of EnsemblGID/EntrezID/GeneSymbol [default: EnsemblGID]"),
  make_option(c("-g", "--sample_group"), type = "character", default = NULL,
              help = "File in tab-delimited format for sample group infomation. The input file containing sample name and group infomation. note colname must be like: sample group1 group2... Required! "),
  make_option(c("-q", "--padjvalueCutoff"), type = "double", default = 0.2, metavar = "number",
              help = "Cutoff value of adjusted p value. [default: 0.2]"),
  make_option(c("-p", "--project_code"), type = "character", default = "rnaseq",
              help = "Project code, which is used as prefix of output file. [default: rnaseq]"),
  make_option(c("-d", "--ref_rdata_dir"), type = "character", default = "./",
              help = "The directory of reference files: human_c2_v5p2.rdata, human_c5_v5p2.rdata and ID_convert_table.rds. \n[default: ./]")
)

# get command line options, if help option encountered print help and exit,
# otherwise if options not found on command line then set defaults,
# The parser object is kept in a variable because print_help() below needs it.
opt_parser <- OptionParser(option_list = option_list)
opt <- parse_args(opt_parser)

if (is.null(opt$input)) {
  print_help(opt_parser)
  stop("At least one argument must be supplied (input file).", call. = FALSE)
}

if (is.null(opt$sample_group)) {
  stop("At least one argument must be supplied (input group infomation for DEG analysis).", call. = FALSE)
}

# Return the full path of `file_name` inside `dir` (which must end in "/"),
# or stop with an error naming the missing file.
require_ref_file <- function(dir, file_name) {
  path <- paste0(dir, file_name)
  if (!file.exists(path)) {
    stop("Cannot find ", file_name, " in the ref_rdata_dir. Exit!", call. = FALSE)
  }
  path
}

# Average the rows of `expr` that share an Entrez ID.
#   expr         expression table, one row per input gene, one column per sample
#   entrez_ids   Entrez ID for each row of `expr`; NA or "" means "no mapping"
#   source_label name of the input ID type, used only in the error message
# Returns the result of apply(): rows are Entrez IDs, columns are samples.
collapse_to_entrez <- function(expr, entrez_ids, source_label) {
  # NA-safe mask: indexing with NA would create all-NA rows.
  keep <- !is.na(entrez_ids) & entrez_ids != ""
  if (!any(keep)) {
    stop("Cannot convert ", source_label, " to Entrez gene ID. Exit!", call. = FALSE)
  }
  # factor() drops unused levels, so tapply() yields no NA rows for IDs that
  # are absent from the data.
  grouping <- factor(entrez_ids[keep])
  apply(expr[keep, , drop = FALSE], 2, function(x) tapply(x, grouping, mean))
}

##import file
out_dir <- paste0(gsub("/$", "", opt$out_dir), "/")

# fread() has no row.names argument, so the gene IDs are read as the first
# column and then promoted to row names (duplicated IDs raise an error here).
logexpr <- fread(opt$input, header = TRUE, stringsAsFactors = FALSE,
                 check.names = FALSE, data.table = FALSE)
rownames(logexpr) <- logexpr[[1]]
logexpr <- logexpr[, -1, drop = FALSE]

#check exp file is log scale
# Heuristic: only the value range of the first sample column is inspected.
first_sample_range <- range(logexpr[, 1], na.rm = TRUE)
if (first_sample_range[2] - first_sample_range[1] > 100) {
  stop("DEG anlaysis should be conducted based on expression profile on log scale. Please run log2 first", call. = FALSE)
}

##import sample group file and check
sample_group <- read.table(opt$sample_group, sep = "\t", header = TRUE)

if (length(grep("group", colnames(sample_group))) == 0) {
  stop("No group is identified in sample_group file. Make sure the head of sample_group file is like sample, group1, group2.")
}

#refdir
refdir <- paste0(gsub("/$", "", opt$ref_rdata_dir), "/")

#c2: curated gene sets (rdata file)
#c5: GO gene sets (rdata file)
# load() is called at top level so the stored objects land in the global
# environment (presumably Hs.c2 / Hs.c5, the latter is used below - confirm).
load(require_ref_file(refdir, "human_c2_v5p2.rdata"))
load(require_ref_file(refdir, "human_c5_v5p2.rdata"))

##########################
#########ID convert#######
##########################
message("Begin ID conversion.")

idconvert <- readRDS(require_ref_file(refdir, "ID_convert_table.rds"))

if (opt$type_gene_id == "EnsemblGID") {
  gene_entrez <- idconvert$EntrezID[match(rownames(logexpr), idconvert$EnsemblID)]
  logexpr.entrez <- collapse_to_entrez(logexpr, gene_entrez, "Ensembl gene ID")
} else if (opt$type_gene_id == "GeneSymbol") {
  gene_entrez <- idconvert$EntrezID[match(rownames(logexpr), idconvert$GeneSymbol)]
  logexpr.entrez <- collapse_to_entrez(logexpr, gene_entrez, "gene symbol")
} else if (opt$type_gene_id == "EntrezID") {
  # Already keyed by Entrez ID: use the table unchanged.
  logexpr.entrez <- logexpr
} else {
  stop("Unsupported type_gene_id '", opt$type_gene_id,
       "'. Use one of EnsemblGID/EntrezID/GeneSymbol.", call. = FALSE)
}

message("Finish ID conversion.")

######################
######## GSEA ########
######################
# Every column whose name contains "group" defines one grouping; within it,
# each pair of distinct group labels is compared.
groupn <- grep("group", colnames(sample_group))
c5sigall <- c()
c2sigall <- c()
for ( i in groupn){
  compgroup <- combn(unique(sample_group[,i]), 2)
  for ( j in 1:ncol(compgroup)){
    nam <- paste(compgroup[1,j],"vs",compgroup[2,j],sep="")
    versus <- paste(compgroup[1,j],"vs",compgroup[2,j],sep=" ")
    groupA <- logexpr.entrez[,as.character(sample_group$sample[sample_group[,i] %in% compgroup[1,j]])]
    groupB <- logexpr.entrez[,as.character(sample_group$sample[sample_group[,i] %in% compgroup[2,j]])]
    # Difference of group means on the log scale, sorted in decreasing order.
    logfc <- rowMeans(groupA)-rowMeans(groupB)
    logfc <- logfc[order(-logfc)]
    #GSEA in GO term
    fgseaRes.c5 <- fgsea(Hs.c5, logfc, minSize=15, maxSize = 500, nperm=1000)
    c5sig<-fgseaRes.c5[fgseaRes.c5$padj