Browse Source

add realigner and bqsr

master
YaqingLiu 4 years ago
commit
f1dcb168c2
13 changed files with 525 additions and 0 deletions
  1. BIN
      .DS_Store
  2. +34
    -0
      README.md
  3. +17
    -0
      defaults
  4. +20
    -0
      inputs
  5. BIN
      pictures/.DS_Store
  6. +39
    -0
      tasks/Dedup.wdl
  7. +49
    -0
      tasks/Metrics.wdl
  8. +45
    -0
      tasks/deduped_Metrics.wdl
  9. +33
    -0
      tasks/fastqc.wdl
  10. +40
    -0
      tasks/fastqscreen.wdl
  11. +36
    -0
      tasks/mapping.wdl
  12. +33
    -0
      tasks/qualimap.wdl
  13. +179
    -0
      workflow.wdl

BIN
.DS_Store View File


+ 34
- 0
README.md View File

@@ -0,0 +1,34 @@
# README.md

> Author: Yaqing Liu
>
> Email: [yaqing.liu@outlook.com](mailto:yaqing.liu@outlook.com)
>
> Last Updated: 23/04/2021

#### Requirements

- choppy
- Ali-Cloud
- Linux

#### Introduction
This app is used to:
* align a pair of FASTQ files to the reference genome and produce a sorted, deduplicated BAM file.
* run quality control on the data at both the FASTQ and BAM levels, using FastQC, FastQ Screen and Qualimap.

**Please carefully check the reference genome, bed file, etc.**
#### Usage
```
open-choppy-env

choppy install YaqingLiu/cbcga-wes-qc-latest

choppy samples YaqingLiu/cbcga-wes-qc-latest --no-default
# sample_id,fastq_1,fastq_2

choppy batch YaqingLiu/cbcga-wes-qc-latest samples.csv -p Project -l Label

# Query the status of all tasks in the project
choppy query -L Label | grep "status"
```

+ 17
- 0
defaults View File

@@ -0,0 +1,17 @@
{
"fastqscreen_docker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqscreen:0.12.0",
"fastqc_docker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqc:0.11.8",
"qualimap_docker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/qualimap:2.0.0",
"sentieon_docker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/sentieon-genomics:v2019.11.28",
"CPU2_GB8_cluster": "OnDemand bcs.ps.g.large img-ubuntu-vpc",
"CPU4_GB16_cluster": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc",
"CPU8_GB32_cluster": "OnDemand bcs.ps.g.2xlarge img-ubuntu-vpc",
"disk_size": "300",
"SENTIEON_INSTALL_DIR": "/opt/sentieon-genomics",
"SENTIEON_LICENSE": "192.168.0.55:8990",
"covered_bed": "oss://pgx-reference-data/bed/cbcga/S07604514_Covered.bed",
"fastq_screen_conf": "oss://pgx-reference-data/fastq_screen_reference/fastq_screen.conf",
"screen_ref_dir": "oss://pgx-reference-data/fastq_screen_reference/",
"fasta": "GRCh38.d1.vd1.fa",
"ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/"
}

+ 20
- 0
inputs View File

@@ -0,0 +1,20 @@
{
"{{ project_name }}.sample_id": "{{ sample_id }}",
"{{ project_name }}.fastq_1": "{{ fastq_1 }}",
"{{ project_name }}.fastq_2": "{{ fastq_2 }}",
"{{ project_name }}.fastqscreen_docker": "{{ fastqscreen_docker }}",
"{{ project_name }}.fastqc_docker": "{{ fastqc_docker }}",
"{{ project_name }}.qualimap_docker": "{{ qualimap_docker }}",
"{{ project_name }}.sentieon_docker": "{{ sentieon_docker }}",
"{{ project_name }}.SENTIEON_INSTALL_DIR": "{{ SENTIEON_INSTALL_DIR }}",
"{{ project_name }}.SENTIEON_LICENSE": "{{ SENTIEON_LICENSE }}",
"{{ project_name }}.CPU2_GB8_cluster": "{{ CPU2_GB8_cluster }}",
"{{ project_name }}.CPU4_GB16_cluster": "{{ CPU4_GB16_cluster }}",
"{{ project_name }}.CPU8_GB32_cluster": "{{ CPU8_GB32_cluster }}",
"{{ project_name }}.disk_size": "{{ disk_size }}",
"{{ project_name }}.covered_bed": "{{ covered_bed }}",
"{{ project_name }}.fastq_screen_conf": "{{ fastq_screen_conf }}",
"{{ project_name }}.screen_ref_dir": "{{ screen_ref_dir }}",
"{{ project_name }}.fasta": "{{ fasta }}",
"{{ project_name }}.ref_dir": "{{ ref_dir }}"
}

BIN
pictures/.DS_Store View File


+ 39
- 0
tasks/Dedup.wdl View File

@@ -0,0 +1,39 @@
# Dedup: run the Sentieon LocusCollector and Dedup algorithms on a sorted BAM
# and derive two duplication summaries from the resulting metrics file.
#
# Inputs:
#   sorted_bam, sorted_bam_index  BAM to process and its index
#   sample                        prefix used for every file this task writes
#   SENTIEON_INSTALL_DIR          install prefix; <prefix>/bin/sentieon is run
#   SENTIEON_LICENSE              exported into the environment for the driver
#   docker, cluster_config, disk_size  runtime settings
#
# Outputs:
#   score              score info written by LocusCollector
#   dedup_metrics      Dedup metrics with a "#DuplicationMetrics" first line
#   duplication        "<sample><TAB><field 9 of metrics line 3, times 100>"
#   deduped_bam        BAM written by the Dedup algorithm (run with --rmdup)
#   deduped_bam_index  index next to the BAM; it is not named on any command
#                      line here, so presumably the driver writes it (confirm)
task Dedup {
  File sorted_bam
  File sorted_bam_index
  String sample
  String SENTIEON_INSTALL_DIR
  String SENTIEON_LICENSE
  String docker
  String cluster_config
  String disk_size

  command <<<
    set -o pipefail
    set -e
    export SENTIEON_LICENSE=${SENTIEON_LICENSE}
    nt=$(nproc)

    # First pass: LocusCollector writes the score file used by the second pass.
    ${SENTIEON_INSTALL_DIR}/bin/sentieon driver -t $nt -i ${sorted_bam} \
      --algo LocusCollector --fun score_info ${sample}_score.txt

    # Second pass: Dedup consumes the score file and writes metrics plus the BAM.
    ${SENTIEON_INSTALL_DIR}/bin/sentieon driver -t $nt -i ${sorted_bam} \
      --algo Dedup --rmdup --score_info ${sample}_score.txt \
      --metrics ${sample}_dedup_metrics.txt ${sample}.sorted.deduped.bam

    # Third line of the metrics file, ninth tab-separated field, scaled by 100.
    awk -F'\t' 'NR == 3 {print "'"${sample}"'""\t"$9*100}' ${sample}_dedup_metrics.txt > ${sample}_picard_duplication.txt

    # Prepend a header line so the copy can be recognized as picard output.
    sed '1i\#DuplicationMetrics' ${sample}_dedup_metrics.txt > ${sample}_marked_dup_metrics.txt
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File score = "${sample}_score.txt"
    File dedup_metrics = "${sample}_marked_dup_metrics.txt"
    File duplication = "${sample}_picard_duplication.txt"
    File deduped_bam = "${sample}.sorted.deduped.bam"
    File deduped_bam_index = "${sample}.sorted.deduped.bam.bai"
  }
}

+ 49
- 0
tasks/Metrics.wdl View File

@@ -0,0 +1,49 @@
# Metrics: collect Sentieon QC metrics for a sorted (pre-dedup) BAM and plot
# them into a PDF report.
#
# Inputs:
#   sorted_bam, sorted_bam_index  BAM to analyse and its index
#   ref_dir, fasta                reference directory and FASTA file name in it
#   sample                        prefix used for every file this task writes
#   SENTIEON_INSTALL_DIR          install prefix; <prefix>/bin/sentieon is run
#   SENTIEON_LICENSE              exported into the environment for the driver
#   docker, cluster_config, disk_size  runtime settings
#
# Outputs: one text file per metric algorithm, the five CoverageMetrics
# summary files, and one PDF per plotted metric.
#
# NOTE(review): the *_metrics.pdf outputs are not named on any command line
# below (only <sample>_metrics_report.pdf is). Presumably `sentieon plot
# metrics` also writes a PDF next to each input .txt -- confirm.
task Metrics {
  File sorted_bam
  File sorted_bam_index
  File ref_dir
  String fasta
  String sample
  String SENTIEON_INSTALL_DIR
  String SENTIEON_LICENSE
  String docker
  String cluster_config
  String disk_size

  command <<<
    set -o pipefail
    set -e
    export SENTIEON_LICENSE=${SENTIEON_LICENSE}
    nt=$(nproc)

    # A single driver pass computes every metric.
    ${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${sorted_bam} \
      --algo MeanQualityByCycle ${sample}_mq_metrics.txt \
      --algo QualDistribution ${sample}_qd_metrics.txt \
      --algo GCBias --summary ${sample}_gc_summary.txt ${sample}_gc_metrics.txt \
      --algo AlignmentStat ${sample}_aln_metrics.txt \
      --algo InsertSizeMetricAlgo ${sample}_is_metrics.txt \
      --algo CoverageMetrics --omit_base_output ${sample}_coverage_metrics

    # Plot the GC, quality-distribution, mean-quality and insert-size metrics.
    ${SENTIEON_INSTALL_DIR}/bin/sentieon plot metrics -o ${sample}_metrics_report.pdf \
      gc=${sample}_gc_metrics.txt \
      qd=${sample}_qd_metrics.txt \
      mq=${sample}_mq_metrics.txt \
      isize=${sample}_is_metrics.txt
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File qd_metrics = "${sample}_qd_metrics.txt"
    File qd_metrics_pdf = "${sample}_qd_metrics.pdf"
    File mq_metrics = "${sample}_mq_metrics.txt"
    File mq_metrics_pdf = "${sample}_mq_metrics.pdf"
    File is_metrics = "${sample}_is_metrics.txt"
    File is_metrics_pdf = "${sample}_is_metrics.pdf"
    File gc_summary = "${sample}_gc_summary.txt"
    File gc_metrics = "${sample}_gc_metrics.txt"
    File gc_metrics_pdf = "${sample}_gc_metrics.pdf"
    File aln_metrics = "${sample}_aln_metrics.txt"
    File coverage_metrics_sample_summary = "${sample}_coverage_metrics.sample_summary"
    File coverage_metrics_sample_statistics = "${sample}_coverage_metrics.sample_statistics"
    File coverage_metrics_sample_interval_statistics = "${sample}_coverage_metrics.sample_interval_statistics"
    File coverage_metrics_sample_cumulative_coverage_proportions = "${sample}_coverage_metrics.sample_cumulative_coverage_proportions"
    File coverage_metrics_sample_cumulative_coverage_counts = "${sample}_coverage_metrics.sample_cumulative_coverage_counts"
  }
}

+ 45
- 0
tasks/deduped_Metrics.wdl View File

@@ -0,0 +1,45 @@
# deduped_Metrics: collect Sentieon QC metrics for the deduplicated BAM.
#
# Inputs:
#   deduped_bam, deduped_bam_index  BAM to analyse and its index
#   ref_dir, fasta                  reference directory and FASTA file name in it
#   sample                          prefix used for every file this task writes
#   SENTIEON_INSTALL_DIR            install prefix; <prefix>/bin/sentieon is run
#   SENTIEON_LICENSE                exported into the environment for the driver
#   docker, cluster_config, disk_size  runtime settings
#
# Outputs: the five CoverageMetrics summary files plus one text file for each
# of MeanQualityByCycle, QualDistribution, GCBias (summary and metrics),
# AlignmentStat, InsertSizeMetricAlgo, QualityYield and WgsMetricsAlgo.
task deduped_Metrics {
  File ref_dir
  String SENTIEON_INSTALL_DIR
  String SENTIEON_LICENSE
  String sample
  String fasta
  File deduped_bam
  File deduped_bam_index
  String docker
  String cluster_config
  String disk_size

  command <<<
    set -o pipefail
    set -e
    export SENTIEON_LICENSE=${SENTIEON_LICENSE}
    nt=$(nproc)

    # A single driver pass computes every metric on the deduplicated BAM.
    ${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${deduped_bam} \
      --algo CoverageMetrics --omit_base_output ${sample}_deduped_coverage_metrics \
      --algo MeanQualityByCycle ${sample}_deduped_mq_metrics.txt \
      --algo QualDistribution ${sample}_deduped_qd_metrics.txt \
      --algo GCBias --summary ${sample}_deduped_gc_summary.txt ${sample}_deduped_gc_metrics.txt \
      --algo AlignmentStat ${sample}_deduped_aln_metrics.txt \
      --algo InsertSizeMetricAlgo ${sample}_deduped_is_metrics.txt \
      --algo QualityYield ${sample}_deduped_QualityYield.txt \
      --algo WgsMetricsAlgo ${sample}_deduped_WgsMetricsAlgo.txt
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File deduped_coverage_metrics_sample_summary = "${sample}_deduped_coverage_metrics.sample_summary"
    File deduped_coverage_metrics_sample_statistics = "${sample}_deduped_coverage_metrics.sample_statistics"
    File deduped_coverage_metrics_sample_interval_statistics = "${sample}_deduped_coverage_metrics.sample_interval_statistics"
    File deduped_coverage_metrics_sample_cumulative_coverage_proportions = "${sample}_deduped_coverage_metrics.sample_cumulative_coverage_proportions"
    File deduped_coverage_metrics_sample_cumulative_coverage_counts = "${sample}_deduped_coverage_metrics.sample_cumulative_coverage_counts"
    File deduped_mean_quality = "${sample}_deduped_mq_metrics.txt"
    File deduped_qd_metrics = "${sample}_deduped_qd_metrics.txt"
    File deduped_gc_summary = "${sample}_deduped_gc_summary.txt"
    File deduped_gc_metrics = "${sample}_deduped_gc_metrics.txt"
    # Correctly spelled name for the alignment metrics; prefer this one.
    File deduped_aln_metrics = "${sample}_deduped_aln_metrics.txt"
    # Misspelled original name, kept so existing references keep resolving.
    File dedeuped_aln_metrics = "${sample}_deduped_aln_metrics.txt"
    File deduped_is_metrics = "${sample}_deduped_is_metrics.txt"
    File deduped_QualityYield = "${sample}_deduped_QualityYield.txt"
    File deduped_wgsmetrics = "${sample}_deduped_WgsMetricsAlgo.txt"
  }
}

+ 33
- 0
tasks/fastqc.wdl View File

@@ -0,0 +1,33 @@
# fastqc: run FastQC on both mates of a paired-end sample.
#
# Inputs:
#   sample         prefix used to name the symlinks and therefore the reports
#   read1, read2   gzipped FASTQ files for mate 1 and mate 2
#   docker, cluster_config, disk_size  runtime settings
#
# Outputs: the HTML report and the zip archive for each mate.
task fastqc {
  String sample
  File read1
  File read2
  String docker
  String cluster_config
  String disk_size

  command <<<
    set -o pipefail
    set -e
    nt=$(nproc)

    # Link the inputs under predictable names so the report names are fixed.
    ln -s "${read1}" "${sample}_R1.fastq.gz"
    ln -s "${read2}" "${sample}_R2.fastq.gz"

    # FastQC's thread option sets how many files are processed at once, so both
    # mates are passed to one invocation to let them run in parallel.
    fastqc -t "$nt" -o ./ "${sample}_R1.fastq.gz" "${sample}_R2.fastq.gz"
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File read1_html = "${sample}_R1_fastqc.html"
    File read1_zip = "${sample}_R1_fastqc.zip"
    File read2_html = "${sample}_R2_fastqc.html"
    File read2_zip = "${sample}_R2_fastqc.zip"
  }
}

+ 40
- 0
tasks/fastqscreen.wdl View File

@@ -0,0 +1,40 @@
# fastq_screen: run FastQ Screen (bowtie2 aligner) on each mate of a
# paired-end sample.
#
# Inputs:
#   sample             prefix used to name the symlinks and the reports
#   read1, read2       gzipped FASTQ files for mate 1 and mate 2
#   screen_ref_dir     reference directory, copied under /cromwell_root/tmp/
#   fastq_screen_conf  FastQ Screen configuration file
#   docker, cluster_config, disk_size  runtime settings
#
# Outputs: the PNG, TXT and HTML screen reports for each mate.
task fastq_screen {
  String sample
  File read1
  File read2
  File fastq_screen_conf
  File screen_ref_dir

  String docker
  String cluster_config
  String disk_size

  command <<<
    set -o pipefail
    set -e
    nt=$(nproc)

    # Stage the screening references under /cromwell_root/tmp/.
    # NOTE(review): presumably the conf file points at this location -- confirm.
    mkdir -p /cromwell_root/tmp
    cp -r ${screen_ref_dir} /cromwell_root/tmp/

    # Link the inputs under predictable names so the report names are fixed.
    ln -s ${read1} ${sample}_R1.fastq.gz
    ln -s ${read2} ${sample}_R2.fastq.gz

    # Screen each mate in turn with identical settings.
    for mate in R1 R2; do
      fastq_screen --aligner bowtie2 --conf ${fastq_screen_conf} --top 100000 --threads $nt ${sample}_$mate.fastq.gz
    done
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File png1 = "${sample}_R1_screen.png"
    File txt1 = "${sample}_R1_screen.txt"
    File html1 = "${sample}_R1_screen.html"
    File png2 = "${sample}_R2_screen.png"
    File txt2 = "${sample}_R2_screen.txt"
    File html2 = "${sample}_R2_screen.html"
  }
}

+ 36
- 0
tasks/mapping.wdl View File

@@ -0,0 +1,36 @@
# mapping: align a pair of FASTQ files with the bwa binary shipped in the
# Sentieon install and sort the stream into a BAM.
#
# Inputs:
#   fastq_1, fastq_2      FASTQ files for mate 1 and mate 2
#   ref_dir, fasta        reference directory and FASTA file name in it
#   sample, group, pl     read-group SM, ID and PL values; sample also
#                         prefixes the output file names
#   SENTIEON_INSTALL_DIR  install prefix; bwa and sentieon are run from <prefix>/bin
#   SENTIEON_LICENSE      exported into the environment for the Sentieon tools
#   docker, cluster_config, disk_size  runtime settings
#
# Outputs:
#   sorted_bam        BAM written by `sentieon util sort`
#   sorted_bam_index  index next to the BAM; it is not named on the command
#                     line, so presumably the sort step writes it (confirm)
task mapping {
  File fastq_1
  File fastq_2
  File ref_dir
  String fasta
  String sample
  String group
  String pl
  String SENTIEON_INSTALL_DIR
  String SENTIEON_LICENSE
  String docker
  String cluster_config
  String disk_size

  command <<<
    set -o pipefail
    set -e
    export SENTIEON_LICENSE=${SENTIEON_LICENSE}
    nt=$(nproc)

    # Stream SAM records from bwa mem straight into the sorter (read from "-").
    ${SENTIEON_INSTALL_DIR}/bin/bwa mem -M -R "@RG\tID:${group}\tSM:${sample}\tPL:${pl}" -t $nt -K 10000000 ${ref_dir}/${fasta} ${fastq_1} ${fastq_2} \
      | ${SENTIEON_INSTALL_DIR}/bin/sentieon util sort -o ${sample}.sorted.bam -t $nt --sam2bam -i -
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File sorted_bam = "${sample}.sorted.bam"
    File sorted_bam_index = "${sample}.sorted.bam.bai"
  }
}

+ 33
- 0
tasks/qualimap.wdl View File

@@ -0,0 +1,33 @@
# qualimap: run Qualimap bamqc on a BAM restricted to the regions of a BED
# file, then extract the duplication figure and archive the report directory.
#
# Inputs:
#   sample       name of the report directory and prefix of the output files
#   bam, bai     BAM to analyse and its index
#   covered_bed  BED file with the target regions
#   docker, cluster_config, disk_size  runtime settings
#
# Outputs:
#   tar          gzip-compressed tar archive of the report directory
#                (NOTE(review): compressed with -z but named .tar)
#   duplication  "<sample><TAB><value>" taken from the "duplication" line of
#                genome_results.txt
task qualimap {
  String sample
  File bam
  File bai
  File covered_bed
  String docker
  String cluster_config
  String disk_size

  command <<<
    set -o pipefail
    set -e
    nt=$(nproc)

    # Rewrite the BED as six tab-separated columns, dropping a carriage return
    # from column 3 and filling the last three columns with "", 0 and ".".
    awk 'BEGIN{OFS="\t"}{sub("\r","",$3);print $1,$2,$3,"",0,"."}' ${covered_bed} > new.bed

    /opt/qualimap/qualimap bamqc \
      -bam ${bam} \
      -gff new.bed \
      -outformat PDF:HTML \
      -nt $nt \
      -outdir ${sample} \
      --java-mem-size=32G

    # Keep the text between "= " and "%" on the duplication line.
    grep duplication ${sample}/genome_results.txt | awk -F "= |%" '{print "'"${sample}"'""\t"$2}' > ${sample}_qualimap_duplication.txt

    tar -zcvf ${sample}_qualimap.tar ${sample}
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File tar = "${sample}_qualimap.tar"
    File duplication = "${sample}_qualimap_duplication.txt"
  }
}

+ 179
- 0
workflow.wdl View File

@@ -0,0 +1,179 @@
import "./tasks/fastqc.wdl" as fastqc
import "./tasks/fastqscreen.wdl" as fastqscreen
import "./tasks/qualimap.wdl" as qualimap
import "./tasks/mapping.wdl" as mapping
import "./tasks/Metrics.wdl" as Metrics
import "./tasks/Dedup.wdl" as Dedup
import "./tasks/deduped_Metrics.wdl" as deduped_Metrics
import "./tasks/Realigner.wdl" as Realigner
import "./tasks/BQSR.wdl" as BQSR


# Per-sample pipeline: FASTQ-level QC (FastQC, FastQ Screen), alignment,
# pre- and post-dedup Sentieon metrics, deduplication, Qualimap on the
# deduplicated BAM, then Realigner and BQSR.
#
# The workflow name is a template placeholder filled in when the app is rendered.
workflow {{ project_name }} {
  # Sample identity and raw reads.
  String sample_id
  File fastq_1
  File fastq_2

  # Reference data.
  File screen_ref_dir
  File fastq_screen_conf
  File ref_dir
  String fasta
  File covered_bed

  # Known-sites resources passed to Realigner and BQSR. The calls below already
  # referenced these names, but the workflow never declared them, so it could
  # not validate.
  # NOTE(review): the types follow the ref_dir/fasta pattern (directory as File,
  # file name as String) -- confirm against tasks/Realigner.wdl and
  # tasks/BQSR.wdl. The `inputs` and `defaults` templates need matching entries.
  File dbmills_dir
  String db_mills
  File dbsnp_dir
  String dbsnp

  # Tool locations, images and cluster flavours.
  String SENTIEON_INSTALL_DIR
  String SENTIEON_LICENSE
  String sentieon_docker
  String fastqc_docker
  String fastqscreen_docker
  String qualimap_docker
  String CPU8_GB32_cluster
  String CPU4_GB16_cluster
  String CPU2_GB8_cluster
  String disk_size

  call fastqc.fastqc as fastqc {
    input:
      sample=sample_id,
      read1=fastq_1,
      read2=fastq_2,
      docker=fastqc_docker,
      disk_size=disk_size,
      cluster_config=CPU8_GB32_cluster
  }

  call fastqscreen.fastq_screen as fastqscreen {
    input:
      sample=sample_id,
      read1=fastq_1,
      read2=fastq_2,
      screen_ref_dir=screen_ref_dir,
      fastq_screen_conf=fastq_screen_conf,
      docker=fastqscreen_docker,
      disk_size=disk_size,
      cluster_config=CPU2_GB8_cluster
  }

  call mapping.mapping as mapping {
    input:
      group=sample_id,
      sample=sample_id,
      fastq_1=fastq_1,
      fastq_2=fastq_2,
      SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
      SENTIEON_LICENSE=SENTIEON_LICENSE,
      # Was "ILLUMINAL", which is not a platform value defined for the SAM
      # read-group PL field.
      pl="ILLUMINA",
      fasta=fasta,
      ref_dir=ref_dir,
      docker=sentieon_docker,
      disk_size=disk_size,
      cluster_config=CPU8_GB32_cluster
  }

  call Metrics.Metrics as Metrics {
    input:
      SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
      SENTIEON_LICENSE=SENTIEON_LICENSE,
      fasta=fasta,
      ref_dir=ref_dir,
      sorted_bam=mapping.sorted_bam,
      sorted_bam_index=mapping.sorted_bam_index,
      sample=sample_id,
      docker=sentieon_docker,
      disk_size=disk_size,
      cluster_config=CPU2_GB8_cluster
  }

  call Dedup.Dedup as Dedup {
    input:
      SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
      SENTIEON_LICENSE=SENTIEON_LICENSE,
      sorted_bam=mapping.sorted_bam,
      sorted_bam_index=mapping.sorted_bam_index,
      sample=sample_id,
      docker=sentieon_docker,
      disk_size=disk_size,
      cluster_config=CPU8_GB32_cluster
  }

  call deduped_Metrics.deduped_Metrics as deduped_Metrics {
    input:
      SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
      SENTIEON_LICENSE=SENTIEON_LICENSE,
      fasta=fasta,
      ref_dir=ref_dir,
      deduped_bam=Dedup.deduped_bam,
      deduped_bam_index=Dedup.deduped_bam_index,
      sample=sample_id,
      docker=sentieon_docker,
      disk_size=disk_size,
      cluster_config=CPU2_GB8_cluster
  }

  call qualimap.qualimap as qualimap {
    input:
      sample=sample_id,
      bam=Dedup.deduped_bam,
      bai=Dedup.deduped_bam_index,
      covered_bed=covered_bed,
      docker=qualimap_docker,
      disk_size=disk_size,
      cluster_config=CPU8_GB32_cluster
  }

  # NOTE(review): the '_tumor' suffix on the sample name is kept from the
  # original calls; confirm it is intended in this single-sample workflow.
  call Realigner.Realigner as Realigner {
    input:
      SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
      SENTIEON_LICENSE=SENTIEON_LICENSE,
      fasta=fasta,
      ref_dir=ref_dir,
      deduped_bam=Dedup.deduped_bam,
      deduped_bam_index=Dedup.deduped_bam_index,
      db_mills=db_mills,
      dbmills_dir=dbmills_dir,
      sample=sample_id + '_tumor',
      docker=sentieon_docker,
      disk_size=disk_size,
      # Was the undeclared name `cluster_config`; uses the same flavour as the
      # other heavy Sentieon steps (mapping, Dedup).
      cluster_config=CPU8_GB32_cluster
  }

  call BQSR.BQSR as BQSR {
    input:
      SENTIEON_INSTALL_DIR=SENTIEON_INSTALL_DIR,
      SENTIEON_LICENSE=SENTIEON_LICENSE,
      fasta=fasta,
      ref_dir=ref_dir,
      realigned_bam=Realigner.realigner_bam,
      realigned_bam_index=Realigner.realigner_bam_index,
      db_mills=db_mills,
      dbmills_dir=dbmills_dir,
      dbsnp=dbsnp,
      dbsnp_dir=dbsnp_dir,
      sample=sample_id + '_tumor',
      docker=sentieon_docker,
      disk_size=disk_size,
      # Was the undeclared name `cluster_config`; see the Realigner call.
      cluster_config=CPU8_GB32_cluster
  }

  output {
    File fastqc_read1_html = fastqc.read1_html
    File fastqc_read1_zip = fastqc.read1_zip
    File fastqc_read2_html = fastqc.read2_html
    File fastqc_read2_zip = fastqc.read2_zip
    File fastqscreen_png1 = fastqscreen.png1
    File fastqscreen_txt1 = fastqscreen.txt1
    File fastqscreen_html1 = fastqscreen.html1
    File fastqscreen_png2 = fastqscreen.png2
    File fastqscreen_txt2 = fastqscreen.txt2
    File fastqscreen_html2 = fastqscreen.html2
    File qualimap_tar = qualimap.tar
    File qualimap_duplication = qualimap.duplication
    File Dedup_dedup_metrics = Dedup.dedup_metrics
    File Dedup_duplication = Dedup.duplication
    File deduped_bam = Dedup.deduped_bam
    File deduped_bam_index = Dedup.deduped_bam_index
    File Metrics_aln_metrics = Metrics.aln_metrics
    File Metrics_gc_metrics = Metrics.gc_metrics
    File Metrics_gc_summary = Metrics.gc_summary
    File Metrics_is_metrics = Metrics.is_metrics
  }
}

Loading…
Cancel
Save