6 年前 · 1f3f7580e9
--- a/RNAseq_1_ballgown.R
+++ b/RNAseq_1_ballgown.R
@@ -0,0 +1,73 @@
 #!/usr/bin/env Rscript
 # example:
 # Rscript RNAseq_1_ballgown.R -o /home/yuying/rnaseqreport_test -i ./ballgown/ -l FALSE -p test 
 # Rscript RNAseq_1_ballgown.R -o /home/yuying/rnaseqreport_test -i ./ballgown/ 


 suppressPackageStartupMessages(library("optparse"))
 suppressPackageStartupMessages(library("ballgown"))

 # specify our desired options in a list
 # by default OptionParser will add an help option equivalent to 
 # make_option(c("-h", "--help"), action="store_true", default=FALSE, 
 #               help="Show this help message and exit")

 option_list <- list( 
    make_option(c("-o", "--out_dir"), type="character",default="./",
        help="The output directory [default ./]"),
    make_option(c("-i", "--input"),type="character", default=NULL,
        help="The directory input of expression files. It is output from ballgown software."),
    make_option(c("-f", "--floor_value"),metavar="number",default=0.01,
        help="A number to add to each value before log2 transformation to avoid infinite value.[default: 0.01]"),
    make_option(c("-l", "--log2_norm"), metavar="TRUE", default=TRUE,
        help="Perform log2 transformation on FPKM value. [default: TRUE]"),
 	make_option(c("-p", "--project_code"), type="character",default="rnaseq",
        help="Project code, which is used as prefix of output file. [default: rnaseq]")
 		)

 # get command line options, if help option encountered print help and exit,
 # otherwise if options not found on command line then set defaults, 
 opt <- parse_args(OptionParser(option_list=option_list))

 if (is.null(opt$input)){
  print_help(opt_parser)
  stop("At least one argument must be supplied (input file).", call.=FALSE)
 }

 #generate FPKM expression profile from ballgown outputs
 geballgown_expr <- ballgown(dataDir =  opt$input ,samplePattern = ".*",meas = "all")
 expr <- gexpr(geballgown_expr)     
 message("finish ballgown\n")

 #remove _1P and FPKM from colnames, _1P is from alicloud app, FPKM is added due to default output of stringtie/ballgown. 
 nam<-colnames(expr)
 nam<-gsub("_1P$","",nam)
 nam<-gsub("^FPKM.","",nam)
 colnames(expr) <- nam

 out_dir<-paste(gsub("/$","",opt$out_dir),"/",sep="")

 if(opt$log2_norm==TRUE){
 message("start log2 transformation\n")

 logexpr<-apply(expr,2,function(x){log2(x+as.numeric(opt$floor_value))})
 logexpr_out<-cbind(rownames(logexpr),round(logexpr,3))
 colnames(logexpr_out)[1]<-"Gene"

 message("output log2 expression file\n")

 write.table(logexpr_out,file = paste(out_dir,opt$project_code,"_geneexp_log2fpkm_floor0p01_c",ncol(logexpr),"r",nrow(logexpr),"_",Sys.Date(),".txt",sep=""),sep="\t",row.names=F,quote=F)
 }else{ 

 #output expression file with fpkm 
 expr<-cbind(rownames(expr),round(expr,3))
 colnames(expr)[1]<-"Gene"

 message("output fpkm expression file\n")

 write.table(expr,file = paste(out_dir,opt$project_code,"_geneexp_fpkm_c",ncol(expr),"r",nrow(expr),"_",Sys.Date(),".txt",sep=""),sep="\t",row.names=F,quote=F)
 }




--- a/RNAseq_2_pca.R
+++ b/RNAseq_2_pca.R
@@ -0,0 +1,75 @@
 #!/usr/bin/env Rscript
 ###Copyright 2019 Ying Yu from Fudan-PGx group 
 # example:
 # Rscript RNAseq_2_pca.R -o /home/yuying/rnaseqreport_test -i ballgown_geneexp_log2fpkm_floor0p01_c3r58395_2019-04-29.txt  -g group1.txt -p organoid 

 suppressPackageStartupMessages(library("optparse"))

 # specify our desired options in a list
 # by default OptionParser will add an help option equivalent to 
 # make_option(c("-h", "--help"), action="store_true", default=FALSE, 
 #               help="Show this help message and exit")

 option_list <- list( 
    make_option(c("-o", "--out_dir"), type="character",default="./",
        help="The output directory [default ./]"),
    make_option(c("-i", "--input"),type="character", default=NULL,
        help="The input expression files. required!"),
    make_option(c("-g", "--sample_group"),type="character",  default=NULL,
        help="File for sample group infomation. The input file containing sample name and group infomation. note colname must be like: sample	group1	group2... "),
   	make_option(c("-p", "--project_code"), type="character",default="rnaseq",
        help="Project code, which is used as prefix of output file. [default: rnaseq]")
 		)

 # get command line options, if help option encountered print help and exit,
 # otherwise if options not found on command line then set defaults, 
 opt <- parse_args(OptionParser(option_list=option_list))

 if (is.null(opt$input)){
  print_help(opt_parser)
  stop("At least one argument must be supplied (input file).", call.=FALSE)
 }

 ##import exp file
 out_dir<-paste(gsub("/$","",opt$out_dir),"/",sep="")
 logexpr<-read.table(opt$input,header=T,stringsAsFactors=F,row.names=1)

 #check exp file is log scale
 if(max(logexpr[,1])-min(logexpr[,1])>100){
  stop("PCA anlaysis shoulc be conducted based on expression profile on log scale.", call.=FALSE)
 }
 #####################
 ##########PCA #######
 ##calculate pca
 pc.cr<-prcomp(t(logexpr),retx = TRUE)
 pca<-pc.cr$x
 pca<-data.frame(pca)
 pca$sample<-rownames(pca)
 pcanew<-pca
 message("PCA finished.")
 ####finished PCA

 ####add group infomaiton if imort

 if (is.null(opt$sample_group)){
  message("Warning: no group sample file. PCA will not be able to colored by group.")
 }else{
 sample_group<-read.table(opt$sample_group,sep="\t",header=T)


 if(length(grep("group",colnames(sample_group)))==0){
 message("No group is identified in sample_group file. Make sure the head of sample_group file is like sample, group1, group2.")
 }else{
 groupn<-grep("group",colnames(sample_group))
 for ( i in groupn){
 pcanew<- cbind(pcanew,sample_group[match(pca$sample,sample_group$sample),i])
 }
 colnames(pcanew)[c((ncol(pca)+1):ncol(pcanew))]<-colnames(sample_group)[groupn]
 }
 }

 #write output
 write.csv(pcanew,paste(out_dir,opt$project_code,"_pca.csv",sep=""))
 saveRDS(pcanew,paste(out_dir,opt$project_code,"_pca.rds",sep=""))
 ########

--- a/RNAseq_3_cor.R
+++ b/RNAseq_3_cor.R
@@ -0,0 +1,50 @@
 #!/usr/bin/env Rscript
 ###Copyright 2019 Ying Yu from Fudan-PGx group 
 # example:
 # Rscript RNAseq_3_cor.R -o /home/yuying/rnaseqreport_test -i ballgown_geneexp_log2fpkm_floor0p01_c3r58395_2019-04-29.txt  -g group1.txt -p organoid 

 suppressPackageStartupMessages(library("optparse"))

 # specify our desired options in a list
 # by default OptionParser will add an help option equivalent to 
 # make_option(c("-h", "--help"), action="store_true", default=FALSE, 
 #               help="Show this help message and exit")

 option_list <- list( 
    make_option(c("-o", "--out_dir"), type="character",default="./",
        help="The output directory [default ./]"),
    make_option(c("-i", "--input"),type="character", default=NULL,
        help="The input expression files. required!"),
    make_option(c("-g", "--sample_group"),type="character",  default=NULL,
        help="File for sample group infomation.The input file containing sample name and group infomation. note colname must be like: sample	group1	group2... "),
   	make_option(c("-p", "--project_code"), type="character",default="rnaseq",
        help="Project code, which is used as prefix of output file. [default: rnaseq]")
 		)

 # get command line options, if help option encountered print help and exit,
 # otherwise if options not found on command line then set defaults, 
 opt <- parse_args(OptionParser(option_list=option_list))

 if (is.null(opt$input)){
  print_help(opt_parser)
  stop("At least one argument must be supplied (input file).", call.=FALSE)
 }

 ##import exp file
 out_dir<-paste(gsub("/$","",opt$out_dir),"/",sep="")
 logexpr<-read.table(opt$input,header=T,stringsAsFactors=F,row.names=1)

 #check exp file is log scale
 if(max(logexpr[,1])-min(logexpr[,1])>100){
  stop("Correlation anlaysis should be conducted based on expression profile on log scale.", call.=FALSE)
 }
 #####################
 ##########correlation#######
 correlation<-cor(logexpr)
 ####finished cor

 #write output
 write.csv(correlation,paste(out_dir,opt$project_code,"_cor.csv",sep=""))
 saveRDS(correlation,paste(out_dir,opt$project_code,"_cor.rds",sep=""))
 ########

--- a/RNAseq_4_pwDEG.R
+++ b/RNAseq_4_pwDEG.R
@@ -0,0 +1,165 @@
 #!/usr/bin/env Rscript
 ###Copyright 2019 Ying Yu from Fudan-PGx group 
 # example:
 # Rscript RNAseq_4_pwDEG.R  -i ballgown_geneexp_log2fpkm_floor0p01_c3r58395_2019-04-29.txt  -g group1.txt -p organoid 

 # choppy report script like : @scatter-plot(dataFile='/mnt/c/Users/YY/Documents/working/choppy_report/data/zhanggroup_P1-6vsP7-13_choppy_scatterplot_degs.rds', dataType='rds', xAxis='log2FC', xTitle="log2FC",yAxis='log10p',yTitle="-log10 (p)")

 suppressPackageStartupMessages(library("optparse"))

 # specify our desired options in a list
 # by default OptionParser will add an help option equivalent to 
 # make_option(c("-h", "--help"), action="store_true", default=FALSE, 
 #               help="Show this help message and exit")

 option_list <- list( 
    make_option(c("-o", "--out_dir"), type="character",default="./",
        help="The output directory [default ./]"),
    make_option(c("-i", "--input"),type="character", default=NULL,
        help="The input expression files. Required!"),
    make_option(c("-g", "--sample_group"),type="character",  default=NULL,
        help="File for sample group infomation.The input file containing sample name and group infomation. note colname must be like: sample	group1	group2... Required! "),
   	make_option(c("-p", "--project_code"), type="character",default="rnaseq",
        help="Project code, which is used as prefix of output file. [default: rnaseq]"),
 	make_option(c("-a", "--output_all_genes"), metavar="FALSE", default=FALSE,
        help="Output rds files for choppy contains all genes. By default, only DEGs are listed in the output rds and csv for report. NOTE choppy report may not be availble to display correctly if too many points exit. [default: FALSE]"),
 	make_option(c("-b", "--low_expr_filter"), metavar="FALSE",default=FALSE,
        help="Conduct low expression filtering before DEG analysis. [default: FALSE]"),
 	make_option(c("-f", "--low_expr_filter_cutoff"), type="double",default=0,metavar="number",help="Genes across all samples with expreesion lower than this value will be filtered out [default: 0]")
 		)

 # get command line options, if help option encountered print help and exit,
 # otherwise if options not found on command line then set defaults, 
 opt <- parse_args(OptionParser(option_list=option_list))

 if (is.null(opt$input)){
  print_help(opt_parser)
  stop("At least one argument must be supplied (input expression file).", call.=FALSE)
 }
 if (is.null(opt$sample_group)){
  stop("At least one argument must be supplied (input group infomation for DEG analysis).", call.=FALSE)
 }

 ##import files
 out_dir<-paste(gsub("/$","",opt$out_dir),"/",sep="")
 logexpr<-read.table(opt$input,header=T,stringsAsFactors=F,row.names=1)

 #check exp file is log scale
 if(max(logexpr[,1])-min(logexpr[,1])>100){
  stop("DEG anlaysis should be conducted based on expression profile on log scale. Please run log2 first", call.=FALSE)
 }

 ##import sample group file and check 
 sample_group<-read.table(opt$sample_group,sep="\t",header=T)

 if(length(grep("group",colnames(sample_group)))==0){
 stop("No group is identified in sample_group file. Make sure the head of sample_group file is like sample, group1, group2.")
 }

 ##### low_expr_filter
 if(opt$low_expr_filter==TRUE){
 message("Conduct low expression filtering before DEG analysis. \n ")
 message(paste("Genes across all samples with expreesion lower than", opt$low_expr_filter_cutoff, "will be filtered out. ",sep=" "))
 logexpr<-logexpr[which(apply(logexpr,1,max)>=as.numeric(as.character(opt$low_expr_filter_cutoff))), ]
 }

 ###################################
 ##########DEGs pairwise DEGs 
 #################################
 pvalue <- function(x,ncolA,ncolB) {
    obj<-try(t.test(x[1:ncolA],x[(ncolA+1):(ncolA+ncolB)],var.equal = TRUE), silent=TRUE)
   if (is(obj, "try-error")) return(1) else return(obj$p.value)
   }

 message("Conducting DEG analysis. \n ")

 groupn<-grep("group",colnames(sample_group))
 deg<-0
 for ( i in groupn){
 compgroup<-combn(unique(sample_group[,i]), 2)
 for ( j in 1:ncol(compgroup)){


 nam<-paste(compgroup[1,j],"vs",compgroup[2,j],sep="")
 groupA<-logexpr[,sample_group$sample[sample_group[,i] %in% compgroup[1,j]]]
 groupB<-logexpr[,sample_group$sample[sample_group[,i] %in% compgroup[2,j]]]
 group2<-cbind(groupA,groupB)
 ncolA<-ncol(groupA)
 ncolB<-ncol(groupB)
 ttest<-apply(group2,1,function(x){pvalue(x,ncolA,ncolB)})
 logfc<-rowMeans(groupA)-rowMeans(groupB)

 p <- data.frame(
 gene=rownames(logexpr),
 versus=rep(paste(compgroup[1,j],"vs",compgroup[2,j],sep=" "),nrow(logexpr)),
 ttest=ttest,
 log10p=(-log10(ttest)),
 logfc=logfc,
 sigene= 0)

 p$sigene[intersect(which(p$ttest<0.05),which(abs(p$logfc)>=1))]<-1
 pdiff<-p[p$sigene==1,]

 message(paste("Identify ", nrow(pdiff), " DEGs in ", nam, " comparison.",sep=""))

 p$sigene<-ifelse(p$sigene==1,"DEG","nonDEG")
 deg<-rbind(deg,pdiff)

 #modify output
 pout<-p[,c("gene","versus","logfc","log10p","sigene")]
 pout$sigene<-as.factor(pout$sigene)
 rownames(pout)<-c(1:nrow(pout))

 if (opt$output_all_genes==FALSE){
 #print only DEGs
 pout<-pout[pout$sigene=="DEG",]
 }
 message(paste("Writing results of DEG analysis of ", nam, " comparison.",sep=""))
 write.csv(pout,paste(out_dir,opt$project_code,"_",nam,"_degs.csv",sep=""),row.names=F)
 saveRDS(pout,paste(out_dir,opt$project_code,"_",nam,"_degs.rds",sep=""))
 }

 #clear
 groupA<-c()
 groupB<-c()
 group2<-c()
 ttest<-c()
 logfc<-c()
 nam<-c()
 }


 deg<-deg[!is.na(deg[,1]),]
 if(nrow(deg)==0){
 message("No DEGs are identified!")
 }else{

 degtable<-deg[,c("gene","versus","ttest","logfc")]
 colnames(degtable)<-gsub("logfc","log2FC",colnames(degtable))
 colnames(degtable)<-gsub("ttest","pvalue",colnames(degtable))
 degtable$pvalue<-signif(degtable$pvalue,4)
 degtable$log2FC<-signif(degtable$log2FC,4)
 rownames(degtable)<-c(1:nrow(degtable))

 #write output
 write.csv(degtable,paste(out_dir,opt$project_code,"_degs_acrossgroups.csv",sep=""),row.names=F)
 ### statistics number of up and down regulated genes and outpu
 degtable$updown<-ifelse(degtable$log2FC>0,"up","down")
 up<-degtable[degtable$updown=="up",]
 down<-degtable[degtable$updown=="down",]

 degstat<-rbind(
 cbind(tapply(up$updown,as.factor(as.character(up$versus)),length),"upregulated"),
 cbind(tapply(down$updown,as.factor(as.character(down$versus)),length),"downregulated")
 )

 degstat<-cbind(degstat,rownames(degstat))
 degstat<-data.frame(degstat)
 rownames(degstat)<-c(1:nrow(degstat))
 colnames(degstat)<-c("number","type","versus")
 degstat$number<-as.numeric(as.character(degstat$number))
 ########
 write.csv(degstat,paste(out_dir,opt$project_code,"_degs_stats.csv",sep=""),row.names=F)
 saveRDS(degstat,paste(out_dir,opt$project_code,"_degs_stats.rds",sep=""))

 }
--- a/RNAseq_6_enrichfunc.R
+++ b/RNAseq_6_enrichfunc.R
@@ -0,0 +1,143 @@
 #!/usr/bin/env Rscript
 ###Copyright 2019 Ying Yu from Fudan-PGx group 
 # example:
 # Rscript RNAseq_2_pca.R -o /home/yuying/rnaseqreport_test -i ballgown_geneexp_log2fpkm_floor0p01_c3r58395_2019-04-29.txt  -g group1.txt -p organoid 

 suppressPackageStartupMessages(library("optparse"))
 suppressPackageStartupMessages(library("clusterProfiler"))

 # specify our desired options in a list
 # by default OptionParser will add an help option equivalent to 
 # make_option(c("-h", "--help"), action="store_true", default=FALSE, 
 #               help="Show this help message and exit")

 # input input list , rds, from * to *

 option_list <- list( 
    make_option(c("-o", "--out_dir"), type="character",default="./",
        help="The output directory [default ./]"),
    make_option(c("-i", "--input"),type="character", default=NULL,
        help="The input DEG list in csv format. The first column: gene; second column: group. Required! "),
 	make_option(c("-e", "--type_gene_id"),type="character", default="EnsemblGID",
        help="The type of gene symbol. Could be either of EnsemblGID/EntrezID/GeneSymbol [default: EnsemblGID]"),
 	make_option(c("-f", "--pvalueCutoff"), type="double",default=0.05,metavar="number",
 		help="Cutoff value of p value. [default: 0.05]"),
 	make_option(c("-m", "--pAdjustMethod"), type="character",default="BH",
 		help="Method of adjust p value. One of \"holm\", \"hochberg\", \"hommel\", \"bonferroni\", \"BH\", \"BY\", \"fdr\", \"none\". [default: BH]"),
 	make_option(c("-q", "--qvalueCutoff"), type="double",default=0.2,metavar="number",
 		help="Cutoff value of q value. [default: 0.2]"),
   	make_option(c("-p", "--project_code"), type="character",default="rnaseq",
        help="Project code, which is used as prefix of output file. [default: rnaseq]")
 		)

 # get command line options, if help option encountered print help and exit,
 # otherwise if options not found on command line then set defaults, 
 opt <- parse_args(OptionParser(option_list=option_list))

 if (is.null(opt$input)){
  print_help(opt_parser)
  stop("At least one argument must be supplied (input file).", call.=FALSE)
 }

 ##import file
 out_dir<-paste(gsub("/$","",opt$out_dir),"/",sep="")
 gene<-read.csv(opt$input,header=T,stringsAsFactors=F)


 ##########################
 #########ID convert#######
 ##########################
 if(length(grep("ID_convert_table.rds",dir()))>0){

 idconvert<-readRDS("ID_convert_table.rds")
 }else{
 stop("Cannot find ID_convert_table.rds in the working folder. Exit!", call.=FALSE)
 }

 if(opt$type_gene_id=="EnsemblGID"){
 gene$EntrezID<-idconvert$EntrezID[match(gene[,1],idconvert$EnsemblID)]
 if(length(which(is.na(gene$EntrezID)))==nrow(gene)){
 stop("Cannot convert Ensembl gene ID to Entrez gene ID. Exit!", call.=FALSE)
 }
 }

 if(opt$type_gene_id=="GeneSymbol"){
 gene$EntrezID<-idconvert$EntrezID[match(gene[,1],idconvert$GeneSymbol)]
 if(length(which(is.na(gene$EntrezID)))==nrow(gene)){
 stop("Cannot convert GeneSymbol to Entrez gene ID. Exit!", call.=FALSE)
 }
 }

 if(opt$type_gene_id=="EntrezID"){
 gene$EntrezID<-gene[,1]
 }

 ##########################
 #########Enrich GO#######
 ##########################

 groupn<-unique(gene[,2])
 if(length(groupn)==0){
 message("Warning: no group infomation. Function enrichment will be conducted as one group.")
 }else{
 message(paste("A number of ", length(groupn)," group(s) is detected. Function enrichment will be conducted in ",length(groupn), " group(s).",sep=""))
 }

 ####
 egoall<-c()
 ekeggall<-c()
 if(length(groupn)==0){
 g1<-gene$EntrezID
 g1<-g1[!g1==""]
 #conduct enrichment

 ego<-data.frame(enrichGO(g1, 'org.Hs.eg.db', ont = 'ALL', pvalueCutoff = opt$pvalueCutoff, pAdjustMethod = opt$pAdjustMethod, qvalueCutoff = opt$qvalueCutoff))
 
 ekg<- data.frame( enrichKEGG(g1, organism = "hsa", keyType = "kegg", pvalueCutoff = opt$pvalueCutoff, pAdjustMethod = opt$pAdjustMethod, qvalueCutoff = opt$qvalueCutoff))
  
 #modify output
 if(!nrow(ego)==0){
 ego1<-cbind(groupn[i],ego)
 colnames(ego1)[1]<-c("versus")
 egoall<-rbind(egoall,ego1)
 }

 if(!nrow(ekg)==0){
 ekg1<-cbind(groupn[i],ekg)
 colnames(ekg1)[1]<-c("versus")
 ekeggall<-rbind(ekeggall,ekg1)
 }



 }else{
 for (i in 1:length(groupn)){
 g1<-gene$EntrezID[gene[,2]==groupn[i]]
 g1<-g1[!g1==""]
 #conduct enrichment

 ego<-data.frame(enrichGO(g1, 'org.Hs.eg.db', ont = 'ALL', pvalueCutoff = opt$pvalueCutoff, pAdjustMethod = opt$pAdjustMethod, qvalueCutoff = opt$qvalueCutoff))

 ekg<- data.frame( enrichKEGG(g1, organism = "hsa", keyType = "kegg", pvalueCutoff = opt$pvalueCutoff, pAdjustMethod = opt$pAdjustMethod, qvalueCutoff = opt$qvalueCutoff))

  if(!nrow(ego)==0){
 ego1<-cbind(groupn[i],ego)
 colnames(ego1)[1]<-c("versus")
 egoall<-rbind(egoall,ego1)
 }

 if(!nrow(ekg)==0){
 ekg1<-cbind(groupn[i],ekg)
 colnames(ekg1)[1]<-c("versus")
 ekeggall<-rbind(ekeggall,ekg1)
 }
 }

 }

 #write output
 write.csv(egoall,paste(out_dir,opt$project_code,"_GOenrich.csv",sep=""))
 write.csv(ekeggall,paste(out_dir,opt$project_code,"_KEGGenrich.csv",sep=""))
 ########