RNA-seq下游数据分析-ballgown到报告。 以Rscript为主,对接PGx RNA-seq choppy现有pipeline,到生成RNA-seq分析报告所需的rds和csv文件。
r
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

192 lines
7.1KB

  1. #!/usr/bin/env Rscript
  2. ###Copyright 2019 Ying Yu from Fudan-PGx group
  3. # example:
  4. # Rscript RNAseq_5_pwGSEA.R -o /home/yuying/rnaseqreport_test -i example_geneexp_log2fpkm_floor0p01_c13r58395_2019-04-30.txt -g group13_2.txt
  5. suppressPackageStartupMessages(library("optparse"))
  6. suppressPackageStartupMessages(library("fgsea"))
  7. suppressPackageStartupMessages(library("data.table"))
  8. # specify our desired options in a list
  9. # by default OptionParser will add an help option equivalent to
  10. # make_option(c("-h", "--help"), action="store_true", default=FALSE,
  11. # help="Show this help message and exit")
  12. # input input list , rds, from * to *
  13. option_list <- list(
  14. make_option(c("-o", "--out_dir"), type="character",default="./",
  15. help="The output directory [default ./]"),
  16. make_option(c("-i", "--input"),type="character", default=NULL,
  17. help="The input expression files. Required!"),
  18. make_option(c("-e", "--type_gene_id"),type="character", default="EnsemblGID",
  19. help="The type of gene symbol. Could be either of EnsemblGID/EntrezID/GeneSymbol [default: EnsemblGID]"),
  20. make_option(c("-g", "--sample_group"),type="character", default=NULL,
  21. help="File in tab-delimited format for sample group infomation. The input file containing sample name and group infomation. note colname must be like: sample group1 group2... Required! "),
  22. make_option(c("-q", "--padjvalueCutoff"), type="double",default=0.2,metavar="number",
  23. help="Cutoff value of adjusted p value. [default: 0.2]"),
  24. make_option(c("-p", "--project_code"), type="character",default="rnaseq",
  25. help="Project code, which is used as prefix of output file. [default: rnaseq]"),
  26. make_option(c("-d", "--ref_rdata_dir"), type="character",default="./",
  27. help="The directory of reference files: human_c2_v5p2.rdata, human_c5_v5p2.rdata and ID_convert_table.rds. [default: ./]")
  28. )
  29. # get command line options, if help option encountered print help and exit,
  30. # otherwise if options not found on command line then set defaults,
  31. opt <- parse_args(OptionParser(option_list=option_list))
  32. if (is.null(opt$input)){
  33. print_help(opt_parser)
  34. stop("At least one argument must be supplied (input file).", call.=FALSE)
  35. }
  36. if (is.null(opt$sample_group)){
  37. stop("At least one argument must be supplied (input group infomation for DEG analysis).", call.=FALSE)
  38. }
  39. ##import file
  40. out_dir<-paste(gsub("/$","",opt$out_dir),"/",sep="")
  41. logexpr<-fread(opt$input,header=T,stringsAsFactors=F,row.names=1,check.names=F,data.table=F)
  42. #check exp file is log scale
  43. if(max(logexpr[,1])-min(logexpr[,1])>100){
  44. stop("DEG anlaysis should be conducted based on expression profile on log scale. Please run log2 first", call.=FALSE)
  45. }
  46. ##import sample group file and check
  47. sample_group<-read.table(opt$sample_group,sep="\t",header=T)
  48. if(length(grep("group",colnames(sample_group)))==0){
  49. stop("No group is identified in sample_group file. Make sure the head of sample_group file is like sample, group1, group2.")
  50. }
  51. #refdir
  52. refdir<-paste(gsub("/$","",opt$ref_rdata_dir),"/",sep="")
  53. #c2: curated gene sets (rdata file)
  54. #c5: GO gene sets (rdata file)
  55. if(length(grep("human_c2_v5p2.rdata",dir(refdir)))>0){
  56. load(paste(refdir,"human_c2_v5p2.rdata",sep=""))
  57. }else{
  58. stop("Cannot find human_c2_v5p2.rds in the ref_rdata_dir. Exit!", call.=FALSE)
  59. }
  60. if(length(grep("human_c5_v5p2.rdata",dir(refdir)))>0){
  61. load(paste(refdir,"human_c5_v5p2.rdata",sep=""))
  62. }else{
  63. stop("Cannot find human_c5_v5p2.rds in the ref_rdata_dir. Exit!", call.=FALSE)
  64. }
  65. ##########################
  66. #########ID convert#######
  67. ##########################
  68. message("Begin ID conversion.")
  69. if(length(grep("ID_convert_table.rds",dir(refdir)))>0){
  70. idconvert<-readRDS(paste(refdir,"ID_convert_table.rds",sep=""))
  71. }else{
  72. stop("Cannot find ID_convert_table.rds in the working folder. Exit!", call.=FALSE)
  73. }
  74. if(opt$type_gene_id=="EnsemblGID"){
  75. gene_entrez<-idconvert$EntrezID[match(rownames(logexpr),idconvert$EnsemblID)]
  76. if(length(which(is.na(gene_entrez)))==nrow(logexpr)){
  77. stop("Cannot convert Ensembl gene ID to Entrez gene ID. Exit!", call.=FALSE)
  78. }else{
  79. logexpr1<-logexpr[!gene_entrez=="",]
  80. gene_entrez1<-gene_entrez[!gene_entrez==""]
  81. logexpr.entrez<-apply(logexpr1,2,function(x){unlist(tapply(x,as.factor(gene_entrez1),mean))})
  82. }
  83. }
  84. if(opt$type_gene_id=="GeneSymbol"){
  85. gene_entrez<-idconvert$EntrezID[match(rownames(logexpr),idconvert$GeneSymbol)]
  86. if(length(which(is.na(gene_entrez)))==nrow(logexpr)){
  87. stop("Cannot convert Ensembl gene ID to Entrez gene ID. Exit!", call.=FALSE)
  88. }else{
  89. logexpr1<-logexpr[!gene_entrez=="",]
  90. gene_entrez1<-gene_entrez[!gene_entrez==""]
  91. logexpr.entrez<-apply(logexpr1,2,function(x){unlist(tapply(x,as.factor(gene_entrez1),mean))})
  92. }
  93. }
  94. if(opt$type_gene_id=="EntrezID"){
  95. logexpr.entrez<-logexpr
  96. }
  97. message("Finish ID conversion.")
  98. ######################
  99. ######## GSEA ########
  100. ######################
  101. groupn<-grep("group",colnames(sample_group))
  102. c5sigall<-c()
  103. c2sigall<-c()
  104. for ( i in groupn){
  105. compgroup<-combn(unique(sample_group[,i]), 2)
  106. for ( j in 1:ncol(compgroup)){
  107. nam<-paste(compgroup[1,j],"vs",compgroup[2,j],sep="")
  108. versus<-paste(compgroup[1,j],"vs",compgroup[2,j],sep=" ")
  109. groupA<-logexpr.entrez[,as.character(sample_group$sample[sample_group[,i] %in% compgroup[1,j]])]
  110. groupB<-logexpr.entrez[,as.character(sample_group$sample[sample_group[,i] %in% compgroup[2,j]])]
  111. logfc<-rowMeans(groupA)-rowMeans(groupB)
  112. logfc<-logfc[order(-logfc)]
  113. #GSEA in GO term
  114. fgseaRes.c5 <- fgsea(Hs.c5, logfc, minSize=15, maxSize = 500, nperm=1000)
  115. c5sig<-fgseaRes.c5[fgseaRes.c5$padj<opt$padjvalueCutoff,]
  116. if(nrow(c5sig)==0){
  117. message(paste("No significant GO term is identified in group ",nam,".",sep=""))
  118. }else{
  119. message(paste(nrow(c5sig)," significant GO term(s) is(are) identified in group ",nam,".",sep=""))
  120. c5sig<-c5sig[order(c5sig$pval),]
  121. c5sig<-data.frame(c5sig)
  122. c5sig$leadingEdge<-sapply(c5sig$leadingEdge,function(x){paste0(unlist(x),collapse=", ")})
  123. c5sigall<-rbind(c5sigall,cbind(versus,c5sig))
  124. }
  125. #GSEA in curated gene sets
  126. fgseaRes.c2 <- fgsea(Hs.c2, logfc, minSize=15, maxSize = 500, nperm=1000)
  127. c2sig<-fgseaRes.c2[fgseaRes.c2$padj<opt$padjvalueCutoff,]
  128. if(nrow(c2sig)==0){
  129. message(paste("No significant curated gene sets is identified in group ",nam,".",sep=""))
  130. }else{
  131. message(paste(nrow(c2sig)," significant curated gene sets are identified in group ",nam,".",sep=""))
  132. c2sig<-c2sig[order(c2sig$pval),]
  133. c2sig<-data.frame(c2sig)
  134. c2sig$leadingEdge<-sapply(c2sig$leadingEdge,function(x){paste0(unlist(x),collapse=", ")})
  135. c2sigall<-rbind(c2sigall,cbind(versus,c2sig))
  136. }
  137. }
  138. }
  139. if(length(c5sigall)==0){
  140. message("No significant GO term is identified.")
  141. }else{
  142. c5sigall$pval<-signif(c5sigall$pval,4)
  143. c5sigall$padj<-signif(c5sigall$padj,4)
  144. c5sigall$ES<-signif(c5sigall$ES,4)
  145. c5sigall$NES<-signif(c5sigall$NES,4)
  146. rownames(c5sigall)<-c(1:nrow(c5sigall))
  147. write.csv(c5sigall,paste(out_dir,opt$project_code,"_gsea_go.csv",sep=""))
  148. }
  149. if(length(c2sigall)==0){
  150. message("No significant GO term is identified.")
  151. }else{
  152. c2sigall$pval<-signif(c2sigall$pval,4)
  153. c2sigall$padj<-signif(c2sigall$padj,4)
  154. c2sigall$ES<-signif(c2sigall$ES,4)
  155. c2sigall$NES<-signif(c2sigall$NES,4)
  156. rownames(c2sigall)<-c(1:nrow(c2sigall))
  157. write.csv(c2sigall,paste(out_dir,opt$project_code,"_gsea_curatedgenesets.csv",sep=""))
  158. }