用于miRNA-seq二代测序数据分析
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

пре 6 година
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118
  1. task ReadStats {
  2. String sample_id
  3. File in_log_trimAdatper
  4. File in_log_readFilter
  5. File in_log_align_miRNA
  6. File in_log_align_preMiRNA
  7. File in_log_align_piRNA
  8. File in_log_align_tRNA
  9. File in_log_align_RNA
  10. File in_log_align_hg38
  11. File in_sam_align_RNA
  12. String cluster_config
  13. String disk_size
  14. command <<<
  15. set -o pipefail
  16. set -e
  17. n_Total_Sequence=$(cat ${in_log_trimAdatper} | grep 'total reads' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
  18. n_AdapterNotFound=$(cat ${in_log_trimAdatper} | grep 'reads failed due to too long' | cut -d ':' -f 2 | sed 's/ //g')
  19. n_Total_forCount=$[$n_Total_Sequence-$n_AdapterNotFound]
  20. n_Pass_trimAdatper=$(cat ${in_log_trimAdatper} | grep 'reads passed filter' | tail -n 1 | cut -d ':' -f 2 | sed 's/ //g')
  21. n_Adapter_dimer=$[$n_Total_Sequence-$n_AdapterNotFound-$n_Pass_trimAdatper]
  22. n_Too_short=$(cat ${in_log_readFilter} | grep 'too short' | tail -n 1 | cut -d ':' -f 2 | sed 's/ //g')
  23. n_Low_quality_singleBase=$(cat ${in_log_readFilter} | grep 'low quality' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
  24. n_Low_quality_tooManyN=$(cat ${in_log_readFilter} | grep 'too many N' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
  25. n_Low_quality=$[$n_Low_quality_singleBase+$n_Low_quality_tooManyN]
  26. n_ForAlign=$(cat ${in_log_readFilter} | grep 'reads passed filter' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
  27. n_miRNA_mature=$(cat ${in_log_align_miRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
  28. n_miRNA_hairpin=$(cat ${in_log_align_preMiRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
  29. n_piRNA=$(cat ${in_log_align_piRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
  30. n_tRNA=$(cat ${in_log_align_tRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
  31. n_RNA=$(cat ${in_log_align_RNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
  32. n_otGenomic=$(cat ${in_log_align_hg38} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
  33. n_notGenomic=$(cat ${in_log_align_hg38} | grep 'reads that failed to align' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g' )
  34. groupedReadCount=${sample_id}.trimAdapt.filter.align2RNA.grouped.readCount
  35. cat ${in_sam_align_RNA} | grep -v '@' | awk '($2!=4)' | cut -f 3 | sed 's/.*;//g' | awk '{a[$1]++}END{for(i in a){printf "%s\t%d\n",i,a[i]}}' > $groupedReadCount
  36. if [ $(cat $groupedReadCount | grep '^mRNA' | wc -l ) -gt 0 ]
  37. then
  38. n_mRNA=$(cat $groupedReadCount | grep '^mRNA' | cut -f 2 )
  39. else
  40. n_mRNA=0
  41. fi
  42. if [ $(cat $groupedReadCount | grep '^long_non-coding_RNA' | wc -l ) -gt 0 ]
  43. then
  44. n_lncRNA=$(cat $groupedReadCount | grep '^long_non-coding_RNA' | cut -f 2 )
  45. else
  46. n_lncRNA=0
  47. fi
  48. if [ $(cat $groupedReadCount | grep '^ribosomal_RNA' | wc -l ) -gt 0 ]
  49. then
  50. n_rRNA=$(cat $groupedReadCount | grep '^ribosomal_RNA' | cut -f 2 )
  51. else
  52. n_rRNA=0
  53. fi
  54. if [ $(cat $groupedReadCount | grep '^Y_RNA' | wc -l ) -gt 0 ]
  55. then
  56. n_YRNA=$(cat $groupedReadCount | grep '^Y_RNA' | cut -f 2 )
  57. else
  58. n_YRNA=0
  59. fi
  60. if [ $(cat $groupedReadCount | grep -E '^misc_RNA|small|guide_RNA|vault_RNA' | wc -l ) -gt 0 ]
  61. then
  62. n_otsmall=$(cat $groupedReadCount | grep -E '^misc_RNA|small|guide_RNA|vault_RNA' | cut -f 2 | awk '{sum+=$1}END{print sum}')
  63. else
  64. n_otsmall=0
  65. fi
  66. n_otTranscript=$[$n_RNA-$n_mRNA-$n_lncRNA-$n_rRNA-$n_YRNA-$n_otsmall]
  67. file_output=${sample_id}.readStats
  68. echo -e "Stage\tReadCount" > $file_output
  69. echo -e "adapter not found\t$n_AdapterNotFound" >> $file_output
  70. echo -e "adapter dimer\t$n_Adapter_dimer" >> $file_output
  71. echo -e "too short\t$n_Too_short" >> $file_output
  72. echo -e "low sequencing quality\t$n_Low_quality" >> $file_output
  73. echo -e "mature miRNA\t$n_miRNA_mature" >> $file_output
  74. echo -e "hairpin miRNA\t$n_miRNA_hairpin" >> $file_output
  75. echo -e "piRNA\t$n_piRNA" >> $file_output
  76. echo -e "tRNA\t$n_tRNA" >> $file_output
  77. echo -e "mRNA\t$n_mRNA" >> $file_output
  78. echo -e "lncRNA\t$n_lncRNA" >> $file_output
  79. echo -e "rRNA\t$n_rRNA" >> $file_output
  80. echo -e "YRNA\t$n_YRNA" >> $file_output
  81. echo -e "other small RNA\t$n_otsmall" >> $file_output
  82. echo -e "other from transcriptome\t$n_otTranscript" >> $file_output
  83. echo -e "other from human genome\t$n_otGenomic" >> $file_output
  84. echo -e "not from human genome\t$n_notGenomic" >> $file_output
  85. >>>
  86. runtime {
  87. cluster: cluster_config
  88. systemDisk: "cloud_ssd 40"
  89. dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  90. }
  91. output {
  92. File out="${sample_id}.readStats"
  93. File out_groupedCount="${sample_id}.trimAdapt.filter.align2RNA.grouped.readCount"
  94. }
  95. }