RNA-seq下游数据分析-ballgown到报告。 以Rscript为主,对接PGx RNA-seq choppy现有pipeline,到生成RNA-seq分析报告所需的rds和csv文件。
r
No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

166 líneas
6.6KB

  1. #!/usr/bin/env Rscript
  2. ###Copyright 2019 Ying Yu from Fudan-PGx group
  3. # example:
  4. # Rscript RNAseq_4_pwDEG.R -i ballgown_geneexp_log2fpkm_floor0p01_c3r58395_2019-04-29.txt -g group1.txt -p organoid
  5. # choppy report script like : @scatter-plot(dataFile='/mnt/c/Users/YY/Documents/working/choppy_report/data/zhanggroup_P1-6vsP7-13_choppy_scatterplot_degs.rds', dataType='rds', xAxis='log2FC', xTitle="log2FC",yAxis='log10p',yTitle="-log10 (p)")
  6. suppressPackageStartupMessages(library("optparse"))
  7. # specify our desired options in a list
  8. # by default OptionParser will add an help option equivalent to
  9. # make_option(c("-h", "--help"), action="store_true", default=FALSE,
  10. # help="Show this help message and exit")
  11. option_list <- list(
  12. make_option(c("-o", "--out_dir"), type="character",default="./",
  13. help="The output directory [default ./]"),
  14. make_option(c("-i", "--input"),type="character", default=NULL,
  15. help="The input expression files. Required!"),
  16. make_option(c("-g", "--sample_group"),type="character", default=NULL,
  17. help="File in tab-delimited format for sample group infomation. The input file containing sample name and group infomation. note colname must be like: sample group1 group2... Required! "),
  18. make_option(c("-p", "--project_code"), type="character",default="rnaseq",
  19. help="Project code, which is used as prefix of output file. [default: rnaseq]"),
  20. make_option(c("-a", "--output_all_genes"), metavar="FALSE", default=FALSE,
  21. help="Output rds files for choppy contains all genes. By default, only DEGs are listed in the output rds and csv for report. NOTE choppy report may not be availble to display correctly if too many points exit. [default: FALSE]"),
  22. make_option(c("-b", "--low_expr_filter"), metavar="FALSE",default=FALSE,
  23. help="Conduct low expression filtering before DEG analysis. [default: FALSE]"),
  24. make_option(c("-f", "--low_expr_filter_cutoff"), type="double",default=0,metavar="number",help="Genes across all samples with expreesion lower than this value will be filtered out [default: 0]")
  25. )
  26. # get command line options, if help option encountered print help and exit,
  27. # otherwise if options not found on command line then set defaults,
  28. opt <- parse_args(OptionParser(option_list=option_list))
  29. if (is.null(opt$input)){
  30. print_help(opt_parser)
  31. stop("At least one argument must be supplied (input expression file).", call.=FALSE)
  32. }
  33. if (is.null(opt$sample_group)){
  34. stop("At least one argument must be supplied (input group infomation for DEG analysis).", call.=FALSE)
  35. }
  36. ##import files
  37. out_dir<-paste(gsub("/$","",opt$out_dir),"/",sep="")
  38. logexpr<-read.table(opt$input,header=T,stringsAsFactors=F,row.names=1,check.names=F)
  39. #check exp file is log scale
  40. if(max(logexpr[,1])-min(logexpr[,1])>100){
  41. stop("DEG anlaysis should be conducted based on expression profile on log scale. Please run log2 first", call.=FALSE)
  42. }
  43. ##import sample group file and check
  44. sample_group<-read.table(opt$sample_group,sep="\t",header=T)
  45. if(length(grep("group",colnames(sample_group)))==0){
  46. stop("No group is identified in sample_group file. Make sure the head of sample_group file is like sample, group1, group2.")
  47. }
  48. ##### low_expr_filter
  49. if(opt$low_expr_filter==TRUE){
  50. message("Conduct low expression filtering before DEG analysis. \n ")
  51. message(paste("Genes across all samples with expreesion lower than", opt$low_expr_filter_cutoff, "will be filtered out. ",sep=" "))
  52. logexpr<-logexpr[which(apply(logexpr,1,max)>=as.numeric(as.character(opt$low_expr_filter_cutoff))), ]
  53. }
  54. ###################################
  55. ##########DEGs pairwise DEGs
  56. #################################
  57. pvalue <- function(x,ncolA,ncolB) {
  58. obj<-try(t.test(x[1:ncolA],x[(ncolA+1):(ncolA+ncolB)],var.equal = TRUE), silent=TRUE)
  59. if (is(obj, "try-error")) return(1) else return(obj$p.value)
  60. }
  61. message("Conducting DEG analysis. \n ")
  62. groupn<-grep("group",colnames(sample_group))
  63. deg<-0
  64. for ( i in groupn){
  65. compgroup<-combn(unique(sample_group[,i]), 2)
  66. for ( j in 1:ncol(compgroup)){
  67. nam<-paste(compgroup[1,j],"vs",compgroup[2,j],sep="")
  68. groupA<-logexpr[,sample_group$sample[sample_group[,i] %in% compgroup[1,j]]]
  69. groupB<-logexpr[,sample_group$sample[sample_group[,i] %in% compgroup[2,j]]]
  70. group2<-cbind(groupA,groupB)
  71. ncolA<-ncol(groupA)
  72. ncolB<-ncol(groupB)
  73. ttest<-apply(group2,1,function(x){pvalue(x,ncolA,ncolB)})
  74. logfc<-rowMeans(groupA)-rowMeans(groupB)
  75. p <- data.frame(
  76. gene=rownames(logexpr),
  77. versus=rep(paste(compgroup[1,j],"vs",compgroup[2,j],sep=" "),nrow(logexpr)),
  78. ttest=ttest,
  79. log10p=(-log10(ttest)),
  80. logfc=logfc,
  81. sigene= 0)
  82. p$sigene[intersect(which(p$ttest<0.05),which(abs(p$logfc)>=1))]<-1
  83. pdiff<-p[p$sigene==1,]
  84. message(paste("Identify ", nrow(pdiff), " DEGs in ", nam, " comparison.",sep=""))
  85. p$sigene<-ifelse(p$sigene==1,"DEG","nonDEG")
  86. deg<-rbind(deg,pdiff)
  87. #modify output
  88. pout<-p[,c("gene","versus","logfc","log10p","sigene")]
  89. pout$sigene<-as.factor(pout$sigene)
  90. rownames(pout)<-c(1:nrow(pout))
  91. if (opt$output_all_genes==FALSE){
  92. #print only DEGs
  93. pout<-pout[pout$sigene=="DEG",]
  94. }
  95. message(paste("Writing results of DEG analysis of ", nam, " comparison.",sep=""))
  96. write.csv(pout,paste(out_dir,opt$project_code,"_",nam,"_degs.csv",sep=""),row.names=F)
  97. saveRDS(pout,paste(out_dir,opt$project_code,"_",nam,"_degs.rds",sep=""))
  98. }
  99. #clear
  100. groupA<-c()
  101. groupB<-c()
  102. group2<-c()
  103. ttest<-c()
  104. logfc<-c()
  105. nam<-c()
  106. }
  107. deg<-deg[!is.na(deg[,1]),]
  108. if(nrow(deg)==0){
  109. message("No DEGs are identified!")
  110. }else{
  111. degtable<-deg[,c("gene","versus","ttest","logfc")]
  112. colnames(degtable)<-gsub("logfc","log2FC",colnames(degtable))
  113. colnames(degtable)<-gsub("ttest","pvalue",colnames(degtable))
  114. degtable$pvalue<-signif(degtable$pvalue,4)
  115. degtable$log2FC<-signif(degtable$log2FC,4)
  116. rownames(degtable)<-c(1:nrow(degtable))
  117. #write output
  118. write.csv(degtable,paste(out_dir,opt$project_code,"_degs_acrossgroups.csv",sep=""),row.names=F)
  119. ### statistics number of up and down regulated genes and outpu
  120. degtable$updown<-ifelse(degtable$log2FC>0,"up","down")
  121. up<-degtable[degtable$updown=="up",]
  122. down<-degtable[degtable$updown=="down",]
  123. degstat<-rbind(
  124. cbind(tapply(up$updown,as.factor(as.character(up$versus)),length),"upregulated"),
  125. cbind(tapply(down$updown,as.factor(as.character(down$versus)),length),"downregulated")
  126. )
  127. degstat<-cbind(degstat,rownames(degstat))
  128. degstat<-data.frame(degstat)
  129. rownames(degstat)<-c(1:nrow(degstat))
  130. colnames(degstat)<-c("number","type","versus")
  131. degstat$number<-as.numeric(as.character(degstat$number))
  132. ########
  133. write.csv(degstat,paste(out_dir,opt$project_code,"_degs_stats.csv",sep=""),row.names=F)
  134. saveRDS(degstat,paste(out_dir,opt$project_code,"_degs_stats.rds",sep=""))
  135. message("RNAseq_4_pwDEG.R finished!")
  136. }