将fastq转换成bam文件，并用qualimap对BAM文件进行质控
(Convert FASTQ files to BAM and run qualimap quality control on the BAM files.)

  1. task bcftools {
  2. File ref_dir
  3. String fasta
  4. File vcf
  5. String basename = basename(vcf,".vcf")
  6. String docker
  7. String cluster_config
  8. String disk_size
  9. command <<<
  10. set -o pipefail
  11. set -e
  12. nt=$(nproc)
  13. # bcftools norm -m -both ${vcf} | bcftools norm -f ${ref_dir}/${fasta} -Ov -o ${basename}.norm.vcf
  14. # Split multiallelic sites
  15. bcftools norm -m -both ${vcf} -o ${basename}.norm.vcf
  16. >>>
  17. runtime {
  18. docker: docker
  19. cluster: cluster_config
  20. systemDisk: "cloud_ssd 40"
  21. dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  22. }
  23. output {
  24. File norm_vcf = "${basename}.norm.vcf"
  25. }
  26. }
  27. task Sentieon_BQSR{
  28. File ref_dir
  29. File dbsnp_dir
  30. File dbmills_dir
  31. String sample_id
  32. String ref_fasta
  33. String dbsnp
  34. String db_mills
  35. File deduped_bam
  36. File deduped_bam_index
  37. # excute env
  38. String docker
  39. String cluster_config
  40. String disk_size
  41. String SENTIEON_LICENSE
  42. command<<<
  43. set -o pipefail
  44. set -exo
  45. export SENTIEON_LICENSE=${SENTIEON_LICENSE}
  46. nt=$(nproc)
  47. sentieon driver -t $nt \
  48. -r ${ref_dir}/${ref_fasta} -i ${deduped_bam} \
  49. --algo QualCal \
  50. -k ${dbsnp_dir}/${dbsnp} -k ${dbmills_dir}/${db_mills} \
  51. ${sample_id}_recal_data.table
  52. sentieon driver -t $nt \
  53. -r ${ref_dir}/${ref_fasta} -i ${deduped_bam} \
  54. -q ${sample_id}_recal_data.table \
  55. --algo QualCal \
  56. -k ${dbsnp_dir}/${dbsnp} -k ${dbmills_dir}/${db_mills} \
  57. ${sample_id}_recal_data.table.post \
  58. --algo ReadWriter ${sample_id}.sorted.deduped.recaled.bam
  59. sentieon driver -t $nt --algo QualCal \
  60. --plot --before ${sample_id}_recal_data.table --after ${sample_id}_recal_data.table.post ${sample_id}_recal_data.csv
  61. sentieon plot bqsr -o ${sample_id}_bqsrreport.pdf ${sample_id}_recal_data.csv
  62. >>>
  63. runtime{
  64. docker:docker
  65. cluster:cluster_config
  66. systemDisk:"cloud_ssd 250"
  67. dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
  68. }
  69. output{
  70. File recal_table = "${sample_id}_recal_data.table"
  71. File recal_post = "${sample_id}_recal_data.table.post"
  72. File recaled_bam = "${sample_id}.sorted.deduped.recaled.bam"
  73. File recaled_bam_index = "${sample_id}.sorted.deduped.recaled.bam.bai"
  74. File recal_csv = "${sample_id}_recal_data.csv"
  75. File bqsrreport_pdf = "${sample_id}_bqsrreport.pdf"
  76. }
  77. }
  78. task fastp {
  79. # I/O options
  80. File in1
  81. File in2
  82. String sample_id
  83. Boolean? phred64 = false
  84. Boolean? fix_mgi_id = false
  85. String? adapter_sequence
  86. String? adapter_sequence_r2
  87. Int? reads_to_process # specify how many reads/pairs to be processed. Default 0 means process all reads.
  88. # reporting options
  89. String json = sample_id+"fastp.json"
  90. String html = sample_id+"fastp.html"
  91. String report_title = "\'fastp report\'"
  92. # excute env
  93. String docker
  94. String cluster_config
  95. String disk_size
  96. String out1_name = sample_id+'_clean_1.fastq'
  97. String out2_name = sample_id+'_clean_2.fastq'
  98. command <<<
  99. # basic command
  100. /opt/conda/bin/fastp \
  101. --in1 ${in1} \
  102. --in2 ${in2} \
  103. --out1 ${out1_name} \
  104. --out2 ${out2_name} \
  105. --json ${json} \
  106. --html ${html} \
  107. --report_title ${report_title} \
  108. # options
  109. ${ true="--phred64 " false="" phred64 } \
  110. ${ "--reads_to_process " + reads_to_process } \
  111. ${ true="--fix_mgi_id " false="" fix_mgi_id } \
  112. ${ "--adapter_sequence " + adapter_sequence } \
  113. ${ "--adapter_sequence_r2 " + adapter_sequence_r2 }
  114. >>>
  115. runtime {
  116. docker:docker
  117. cluster:cluster_config
  118. systemDisk:"cloud_ssd 40"
  119. dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
  120. }
  121. output {
  122. File out1 = out1_name
  123. File out2 = out2_name
  124. File json_report = json
  125. File html_report = html
  126. }
  127. }
  128. task SentieonFastqToBam {
  129. # 工具输入文件和参数
  130. File fastq1
  131. File fastq2
  132. String sample_id
  133. String Seq_platform
  134. String ref_fasta
  135. File ref_fasta_dir
  136. String SENTIEON_LICENSE
  137. String docker
  138. String cluster_config
  139. String disk_size
  140. ## Extra driver parameters
  141. String qc_driver_args = ""
  142. String lc_driver_args = "--traverse_param=200000/10000"
  143. String dedup_driver_args = "--traverse_param=200000/10000"
  144. ## Extra algo parameters
  145. String bwa_args = "-Y -M"
  146. String bwa_chunk_size = "100000000"
  147. String lc_args = ""
  148. String bam_option = "--bam_compression 1"
  149. String out_bam = sample_id + ".dedup.bam"
  150. String out_bai = sample_id + ".dedup.bam.bai"
  151. # 工具运行命令
  152. command <<<
  153. set -exo pipefail
  154. export SENTIEON_LICENSE=${SENTIEON_LICENSE}
  155. nt=$(nproc)
  156. sentieon bwa mem -R "@RG\tID:${sample_id}\tSM:${sample_id}\tPL:${Seq_platform}" ${bwa_args} -K ${bwa_chunk_size} -t $nt ${ref_fasta_dir}/${ref_fasta} ${fastq1} ${fastq2} \
  157. | sentieon util sort ${bam_option} -i - -r ${ref_fasta_dir}/${ref_fasta} -t $nt -o ${sample_id}.sorted.bam --sam2bam
  158. ls ./
  159. sentieon driver -r ${ref_fasta_dir}/${ref_fasta} -t $nt -i ${sample_id}.sorted.bam ${qc_driver_args} \
  160. --algo MeanQualityByCycle ${sample_id}.mq_metrics.txt \
  161. --algo QualDistribution ${sample_id}.qd_metrics.txt \
  162. --algo GCBias --summary ${sample_id}.gc_summary_metrics.txt ${sample_id}.gc_metrics.txt \
  163. --algo AlignmentStat ${sample_id}.aln_metrics.txt \
  164. --algo InsertSizeMetricAlgo ${sample_id}.is_metrics.txt
  165. ls ./
  166. sentieon driver -r ${ref_fasta_dir}/${ref_fasta} -t $nt -i ${sample_id}.sorted.bam ${lc_driver_args} \
  167. --algo LocusCollector \
  168. ${lc_args} \
  169. ${sample_id}.score.txt.gz
  170. ls ./
  171. sentieon driver -r ${ref_fasta_dir}/${ref_fasta} -t $nt -i ${sample_id}.sorted.bam ${dedup_driver_args} \
  172. --algo Dedup \
  173. --score_info ${sample_id}.score.txt.gz \
  174. --metrics ${sample_id}.dedup_metrics.txt \
  175. ${bam_option} ${out_bam}
  176. ls ./
  177. >>>
  178. runtime {
  179. docker:docker
  180. cluster:cluster_config
  181. systemDisk:"cloud_ssd 40"
  182. dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
  183. }
  184. # 工具运行输出结果
  185. output {
  186. File deduped_bam = out_bam
  187. File deduped_bam_bai = out_bai
  188. Array[File] qc_metrics = glob("*_metrics.txt")
  189. }
  190. }
  191. task manta_calling{
  192. File tumor_bam
  193. File tumor_bam_bai
  194. File normal_bam
  195. File normal_bam_bai
  196. String ref_fasta
  197. File ref_dir
  198. String sample_id
  199. String docker
  200. String cluster_config
  201. String disk_size
  202. String out_dir = "${sample_id}_result"
  203. command <<<
  204. set -exo pipefail
  205. nt=$(nproc)
  206. /home/biosoft/manta-1.6.0.centos6_x86_64/bin/configManta.py \
  207. --normalBam ${normal_bam} \
  208. --tumorBam ${tumor_bam} \
  209. --referenceFasta ${ref_dir}/${ref_fasta} \
  210. --runDir ${out_dir}
  211. ls ${out_dir}
  212. python2.7 ${out_dir}/runWorkflow.py -m local -j $nt
  213. ls ${out_dir}
  214. tar cvf ${out_dir}.tar ${out_dir}
  215. >>>
  216. runtime{
  217. docker:docker
  218. cluster:cluster_config
  219. systemDisk:"cloud_ssd 40"
  220. dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
  221. }
  222. output{
  223. File out_file = "${out_dir}.tar"
  224. File manta_indel_vcf = "${out_dir}/results/variants/candidateSmallIndels.vcf.gz"
  225. File manta_indel_vcf_index = "${out_dir}/results/variants/candidateSmallIndels.vcf.gz.tbi"
  226. }
  227. }
  228. task qualimap{
  229. String sample_id
  230. File bam_file
  231. File bam_bai
  232. File annot_gff
  233. String docker
  234. String cluster_config
  235. String disk_size
  236. String out_dir = sample_id+'_BamQC'
  237. command <<<
  238. set -o pipefail
  239. set -exo
  240. nt=$(nproc)
  241. /opt/qualimap/qualimap bamqc -bam ${bam_file} -gff ${annot_gff} -outformat PDF:HTML -nt $nt -outdir ${out_dir} --java-mem-size=32G
  242. tar -zcvf ${out_dir}.tar ${out_dir}
  243. >>>
  244. runtime{
  245. docker:docker
  246. cluster:cluster_config
  247. systemDisk:"cloud_ssd 40"
  248. dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
  249. }
  250. output{
  251. File out_file = "${out_dir}.tar"
  252. }
  253. }
  254. task strelka_calling{
  255. File tumor_bam
  256. File tumor_bam_bai
  257. File normal_bam
  258. File normal_bam_bai
  259. String ref_fasta
  260. File ref_dir
  261. String sample_id
  262. File manta_indel_vcf
  263. File manta_indel_vcf_index
  264. String docker
  265. String cluster_config
  266. String disk_size
  267. String out_dir = "${sample_id}_result"
  268. command <<<
  269. set -exo pipefail
  270. nt=$(nproc)
  271. /home/biosoft/strelka-2.9.10.centos6_x86_64/bin/configureStrelkaSomaticWorkflow.py \
  272. --normalBam ${normal_bam} \
  273. --tumorBam ${tumor_bam} \
  274. --referenceFasta ${ref_dir}/${ref_fasta} \
  275. --indelCandidates ${manta_indel_vcf} \
  276. --runDir ${out_dir}
  277. ls ${out_dir}
  278. python2.7 ${out_dir}/runWorkflow.py -m local -j $nt
  279. ls ${out_dir}
  280. tar cvf ${out_dir}.tar ${out_dir}
  281. >>>
  282. runtime{
  283. docker:docker
  284. cluster:cluster_config
  285. systemDisk:"cloud_ssd 40"
  286. dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
  287. }
  288. output{
  289. File out_file = "${out_dir}.tar"
  290. }
  291. }
  292. task sentieon_TNscope{
  293. String sample_id
  294. File tumor_bam
  295. File tumor_bam_bai
  296. File? normal_bam
  297. File? normal_bam_bai
  298. String tumor_name
  299. String normal_name
  300. File tumor_recall_data
  301. File normal_recall_data
  302. File ref_dir
  303. String ref_fasta
  304. File dbsnp_dir
  305. String dbsnp
  306. # excute env
  307. String docker
  308. String cluster_config
  309. String disk_size
  310. String SENTIEON_LICENSE
  311. command <<<
  312. set -o pipefail
  313. set -exo
  314. export SENTIEON_LICENSE=${SENTIEON_LICENSE}
  315. nt=$(nproc)
  316. sentieon driver -t $nt -r ${ref_dir}/${ref_fasta} \
  317. -i ${tumor_bam} -q ${tumor_recall_data} \
  318. -i ${normal_bam} -q ${normal_recall_data} \
  319. --algo TNscope --tumor_sample ${tumor_name} --normal_sample ${normal_name} \
  320. --disable_detector sv --trim_soft_clip \
  321. --dbsnp ${dbsnp_dir}/${dbsnp} ${sample_id}.TNscope.vcf || { echo "TNscope failed"; exit 1; }
  322. ls ./
  323. >>>
  324. runtime{
  325. docker:docker
  326. cluster:cluster_config
  327. systemDisk:"cloud_ssd 40"
  328. dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
  329. }
  330. output{
  331. File vcf = "${sample_id}.TNscope.vcf"
  332. File vcf_index = "${sample_id}.TNscope.vcf.idx"
  333. }
  334. }
  335. task sentieon_TNseq{
  336. String sample_id
  337. File tumor_bam
  338. File tumor_bam_bai
  339. File? normal_bam
  340. File? normal_bam_bai
  341. String tumor_name
  342. String normal_name
  343. File ref_dir
  344. String ref_fasta
  345. File germline_resource
  346. File germline_resource_tbi
  347. # excute env
  348. String docker
  349. String cluster_config
  350. String disk_size
  351. String SENTIEON_LICENSE
  352. command <<<
  353. set -o pipefail
  354. set -exo
  355. export SENTIEON_LICENSE=${SENTIEON_LICENSE}
  356. nt=$(nproc)
  357. if [${normal_bam}];then
  358. INPUT="-i ${tumor_bam} -i ${normal_bam}"
  359. SAMPLE="--tumor_sample ${tumor_name} --normal_sample ${normal_name}"
  360. else
  361. INPUT="-i ${tumor_bam}"
  362. SAMPLE="--tumor_sample ${tumor_name}"
  363. fi
  364. sentieon driver -t $nt -r ${ref_dir}/${ref_fasta} \
  365. $INPUT \
  366. --algo TNhaplotyper2 $SAMPLE \
  367. --germline_vcf ${germline_resource} \
  368. ${sample_id}.TNseq.raw.vcf \
  369. --algo OrientationBias --tumor_sample ${tumor_name} \
  370. ${sample_id}.orientation \
  371. --algo ContaminationModel $SAMPLE \
  372. --vcf ${germline_resource} \
  373. --tumor_segments ${sample_id}.contamination.segments \
  374. ${sample_id}.contamination
  375. sentieon driver -t $nt \
  376. -r ${ref_dir}/${ref_fasta} \
  377. --algo TNfilter $SAMPLE \
  378. -v ${sample_id}.TNseq.raw.vcf \
  379. --contamination ${sample_id}.contamination \
  380. --tumor_segments ${sample_id}.contamination.segments \
  381. --orientation_priors ${sample_id}.orientation \
  382. ${sample_id}.bwa_TNseq.vcf
  383. >>>
  384. runtime{
  385. docker:docker
  386. cluster:cluster_config
  387. systemDisk:"cloud_ssd 40"
  388. dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
  389. }
  390. output{
  391. File raw_vcf = "${sample_id}.TNseq.raw.vcf"
  392. File raw_vcf_index = "${sample_id}.TNseq.raw.vcf.idx"
  393. File vcf = "${sample_id}.bwa_TNseq.vcf"
  394. File vcf_index = "${sample_id}.bwa_TNseq.vcf.idx"
  395. File contamination = "${sample_id}.contamination"
  396. File contamination_segments = "${sample_id}.contamination.segments"
  397. File orientation = "${sample_id}.orientation"
  398. }
  399. }