2 лет назад · 63debca463
--- a/tasks
+++ b/tasks
@@ -0,0 +1,508 @@
 task bcftools {
  # Splits multiallelic records of `vcf` into one record per allele with
  # `bcftools norm -m -both` and writes <basename>.norm.vcf.
  #
  # Inputs:
  #   ref_dir, fasta  - reference directory and FASTA file name; only referenced by the
  #                     disabled left-normalisation command kept as a comment below
  #   vcf             - input VCF
  #   basename        - output prefix; defaults to the VCF file name without ".vcf"
  #   docker, cluster_config, disk_size - execution environment settings
  # Output:
  #   norm_vcf        - the split VCF

  File ref_dir
  String fasta
  File vcf
  String basename = basename(vcf,".vcf")
  String docker
  String cluster_config
  String disk_size

  command <<<
    set -o pipefail
    set -e

    # Disabled variant that additionally normalises against the reference:
    # bcftools norm -m -both ${vcf} | bcftools norm -f ${ref_dir}/${fasta} -Ov -o ${basename}.norm.vcf
    # Split multiallelic sites
    bcftools norm -m -both ${vcf} -o ${basename}.norm.vcf
  >>>

  runtime {
    docker: docker
    cluster: cluster_config
    systemDisk: "cloud_ssd 40"
    dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
  }

  output {
    File norm_vcf = "${basename}.norm.vcf"
  }
 }
 task Sentieon_BQSR{
    # Base quality score recalibration with Sentieon QualCal.
    #
    # Steps run by the command, in order:
    #   1. QualCal on the deduplicated BAM with the two known-sites files (-k)
    #      -> <sample_id>_recal_data.table
    #   2. QualCal again with the first table applied (-q)
    #      -> <sample_id>_recal_data.table.post, while ReadWriter writes
    #      <sample_id>.sorted.deduped.recaled.bam
    #   3. QualCal --plot combines both tables into <sample_id>_recal_data.csv
    #   4. `sentieon plot bqsr` renders <sample_id>_bqsrreport.pdf from the CSV
    #
    # Inputs:
    #   ref_dir / ref_fasta      - reference directory and FASTA file name inside it
    #   dbsnp_dir / dbsnp        - directory and file name of the first known-sites file
    #   dbmills_dir / db_mills   - directory and file name of the second known-sites file
    #   sample_id                - prefix for every output file
    #   deduped_bam              - input BAM
    #   deduped_bam_index        - not referenced by the command; declared so the index is
    #                              localized with the BAM (presumably next to it - verify)
    #   SENTIEON_LICENSE         - exported as the SENTIEON_LICENSE environment variable
    File ref_dir
    File dbsnp_dir
    File dbmills_dir

    String sample_id
    String ref_fasta
    String dbsnp
    String db_mills

    File deduped_bam
    File deduped_bam_index

    # execution environment
    String docker
    String cluster_config
    String disk_size

    String SENTIEON_LICENSE

    command<<<
        # `-o` must be followed by an option name; a bare `set -exo` only prints
        # the shell's option table instead of enabling anything.
        set -exo pipefail
        export SENTIEON_LICENSE=${SENTIEON_LICENSE}

        nt=$(nproc)


        sentieon driver -t $nt \
        -r ${ref_dir}/${ref_fasta} -i ${deduped_bam} \
        --algo QualCal \
        -k ${dbsnp_dir}/${dbsnp} -k ${dbmills_dir}/${db_mills} \
        ${sample_id}_recal_data.table

        sentieon driver -t $nt \
        -r ${ref_dir}/${ref_fasta} -i ${deduped_bam} \
        -q ${sample_id}_recal_data.table \
        --algo QualCal \
        -k ${dbsnp_dir}/${dbsnp} -k ${dbmills_dir}/${db_mills} \
        ${sample_id}_recal_data.table.post \
        --algo ReadWriter ${sample_id}.sorted.deduped.recaled.bam

        sentieon driver -t $nt --algo QualCal \
        --plot --before ${sample_id}_recal_data.table --after ${sample_id}_recal_data.table.post ${sample_id}_recal_data.csv

        sentieon plot bqsr -o ${sample_id}_bqsrreport.pdf ${sample_id}_recal_data.csv
    >>>

    runtime{
        docker:docker
        cluster:cluster_config
        systemDisk:"cloud_ssd 250"
        dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"


    }

    output{
        File recal_table = "${sample_id}_recal_data.table"
        File recal_post = "${sample_id}_recal_data.table.post"
        File recaled_bam = "${sample_id}.sorted.deduped.recaled.bam"
        # The command never names the .bai explicitly; it is expected to be written
        # next to the BAM by ReadWriter - TODO confirm.
        File recaled_bam_index = "${sample_id}.sorted.deduped.recaled.bam.bai"
        File recal_csv = "${sample_id}_recal_data.csv"
        File bqsrreport_pdf = "${sample_id}_bqsrreport.pdf"

    }
 }


 task fastp {
    # Runs fastp on a pair of FASTQ files and collects the cleaned reads plus
    # the JSON and HTML reports.
    #
    # Inputs:
    #   in1, in2             - paired input FASTQ files
    #   sample_id            - prefix for all output file names
    #   phred64              - when true, adds --phred64
    #   fix_mgi_id           - when true, adds --fix_mgi_id
    #   adapter_sequence     - optional value for --adapter_sequence
    #   adapter_sequence_r2  - optional value for --adapter_sequence_r2
    #   reads_to_process     - optional value for --reads_to_process
    # Outputs:
    #   out1, out2           - <sample_id>_clean_1.fastq / <sample_id>_clean_2.fastq
    #   json_report, html_report - <sample_id>fastp.json / <sample_id>fastp.html

    # I/O options
    File in1
    File in2
    String sample_id

    Boolean? phred64 = false 
    Boolean? fix_mgi_id = false

    String? adapter_sequence
    String? adapter_sequence_r2

    Int? reads_to_process # specify how many reads/pairs to be processed. Default 0 means process all reads.

    # reporting options
    # NOTE: there is no separator between sample_id and "fastp"; the names are kept
    # as they are because downstream consumers may rely on them.
    String json = sample_id+"fastp.json"
    String html = sample_id+"fastp.html"
    String report_title = "\'fastp report\'"

    # execution environment
    String docker
    String cluster_config
    String disk_size

    String out1_name = sample_id+'_clean_1.fastq'
    String out2_name = sample_id+'_clean_2.fastq'

    command <<<
        set -o pipefail
        set -e

        # The basic arguments and the optional flags must form ONE backslash-continued
        # command: a blank line or a comment between them ends the fastp invocation and
        # turns the optional flags into a separate (failing) shell command.
        # Unset optional values render as empty strings, leaving only the continuation.
        /opt/conda/bin/fastp \
        --in1 ${in1} \
        --in2 ${in2} \
        --out1 ${out1_name} \
        --out2 ${out2_name} \
        --json ${json} \
        --html ${html} \
        --report_title ${report_title} \
        ${ true="--phred64 " false="" phred64 } \
        ${ "--reads_to_process " + reads_to_process } \
        ${ true="--fix_mgi_id " false="" fix_mgi_id } \
        ${ "--adapter_sequence " + adapter_sequence } \
        ${ "--adapter_sequence_r2 " + adapter_sequence_r2 }

    >>>

    runtime {
        docker:docker
        cluster:cluster_config
        systemDisk:"cloud_ssd 40"
        dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File out1 = out1_name
        File out2 = out2_name
        File json_report = json
        File html_report = html
    }

 }

 task SentieonFastqToBam {
    # Aligns a FASTQ pair with `sentieon bwa mem`, sorts the alignments, collects
    # QC metrics, and writes a duplicate-processed BAM via LocusCollector + Dedup.
    #
    # Outputs:
    #   deduped_bam      - <sample_id>.dedup.bam
    #   deduped_bam_bai  - <sample_id>.dedup.bam.bai
    #   qc_metrics       - every file in the working directory matching *_metrics.txt

    # Tool input files and parameters
    String sample_id
    String Seq_platform
    File fastq1
    File fastq2
    File ref_fasta_dir
    String ref_fasta
    String SENTIEON_LICENSE

    # Execution environment
    String docker
    String cluster_config
    String disk_size

    ## Extra driver parameters
    String qc_driver_args = ""
    String lc_driver_args = "--traverse_param=200000/10000"
    String dedup_driver_args = "--traverse_param=200000/10000"

    ## Extra algo parameters
    String bwa_args = "-Y -M"
    String bwa_chunk_size = "100000000"
    String lc_args = ""
    String bam_option = "--bam_compression 1"

    # Names of the final BAM and its index
    String out_bam = "${sample_id}.dedup.bam"
    String out_bai = "${sample_id}.dedup.bam.bai"

    # Tool command
    command <<<
        set -e -x -o pipefail
        export SENTIEON_LICENSE=${SENTIEON_LICENSE}
        nt=$(nproc)

        # Alignment piped straight into the sorter
        sentieon bwa mem \
            -R "@RG\tID:${sample_id}\tSM:${sample_id}\tPL:${Seq_platform}" \
            ${bwa_args} -K ${bwa_chunk_size} -t $nt \
            ${ref_fasta_dir}/${ref_fasta} ${fastq1} ${fastq2} \
          | sentieon util sort ${bam_option} -i - \
            -r ${ref_fasta_dir}/${ref_fasta} -t $nt \
            -o ${sample_id}.sorted.bam --sam2bam

        ls ./

        # QC metrics on the sorted BAM
        sentieon driver -r ${ref_fasta_dir}/${ref_fasta} -t $nt \
            -i ${sample_id}.sorted.bam ${qc_driver_args} \
            --algo MeanQualityByCycle ${sample_id}.mq_metrics.txt \
            --algo QualDistribution ${sample_id}.qd_metrics.txt \
            --algo GCBias --summary ${sample_id}.gc_summary_metrics.txt ${sample_id}.gc_metrics.txt \
            --algo AlignmentStat ${sample_id}.aln_metrics.txt \
            --algo InsertSizeMetricAlgo ${sample_id}.is_metrics.txt

        ls ./

        # Score file consumed by the Dedup step below
        sentieon driver -r ${ref_fasta_dir}/${ref_fasta} -t $nt \
            -i ${sample_id}.sorted.bam ${lc_driver_args} \
            --algo LocusCollector \
            ${lc_args} \
            ${sample_id}.score.txt.gz

        ls ./

        # Dedup writes the final BAM and its metrics file
        sentieon driver -r ${ref_fasta_dir}/${ref_fasta} -t $nt \
            -i ${sample_id}.sorted.bam ${dedup_driver_args} \
            --algo Dedup \
            --score_info ${sample_id}.score.txt.gz \
            --metrics ${sample_id}.dedup_metrics.txt \
            ${bam_option} ${out_bam}

        ls ./
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    # Tool outputs
    output {
        File deduped_bam = out_bam
        File deduped_bam_bai = out_bai
        Array[File] qc_metrics = glob("*_metrics.txt")
    }

 }
 task manta_calling{
    # Configures and runs the Manta workflow on a tumor/normal BAM pair.
    #
    # The run directory is <sample_id>_result. It is archived as
    # <sample_id>_result.tar, and the candidate small-indel VCF with its .tbi
    # index is exposed separately as task outputs.
    # The BAM index inputs are not named in the command; they are declared so
    # they get localized together with the BAMs.

    String sample_id

    File tumor_bam
    File tumor_bam_bai
    File normal_bam
    File normal_bam_bai

    File ref_dir
    String ref_fasta

    # Execution environment
    String docker
    String cluster_config
    String disk_size

    String out_dir = sample_id + "_result"

    command <<<
        set -e -x -o pipefail
        nt=$(nproc)

        # Step 1: generate runWorkflow.py inside the run directory
        /home/biosoft/manta-1.6.0.centos6_x86_64/bin/configManta.py \
            --normalBam ${normal_bam} \
            --tumorBam ${tumor_bam} \
            --referenceFasta ${ref_dir}/${ref_fasta} \
            --runDir ${out_dir}

        ls ${out_dir}

        # Step 2: execute the generated workflow locally with one job per core
        python2.7 ${out_dir}/runWorkflow.py -m local -j $nt

        ls ${out_dir}

        # Step 3: archive the whole run directory
        tar cvf ${out_dir}.tar ${out_dir}
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File out_file = out_dir + ".tar"
        File manta_indel_vcf = out_dir + "/results/variants/candidateSmallIndels.vcf.gz"
        File manta_indel_vcf_index = out_dir + "/results/variants/candidateSmallIndels.vcf.gz.tbi"
    }
 }

 task qualimap{
    # Runs `qualimap bamqc` on a BAM restricted to the regions in `annot_gff` and
    # archives the report directory <sample_id>_BamQC.
    #
    # Inputs:
    #   sample_id   - prefix of the report directory
    #   bam_file    - BAM to analyse
    #   bam_bai     - not referenced by the command; declared so it is localized with the BAM
    #   annot_gff   - feature file passed to -gff
    # Output:
    #   out_file    - archive of the report directory
    String sample_id
    File bam_file
    File bam_bai
    File annot_gff

    String docker
    String cluster_config
    String disk_size

    String out_dir = sample_id+'_BamQC'

    command <<<
        # `-o` must be followed by an option name; a bare `set -exo` only prints
        # the shell's option table instead of enabling anything.
        set -exo pipefail
        nt=$(nproc)
        /opt/qualimap/qualimap bamqc -bam ${bam_file} -gff ${annot_gff} -outformat PDF:HTML -nt $nt -outdir ${out_dir} --java-mem-size=32G
        # NOTE: -z makes this a gzip-compressed archive although the name ends in
        # ".tar"; the name is kept unchanged so existing consumers keep working.
        tar -zcvf ${out_dir}.tar ${out_dir}
    >>>

    runtime{
        docker:docker
        cluster:cluster_config
        systemDisk:"cloud_ssd 40"
        dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output{
        File out_file = "${out_dir}.tar"
    }
 }

 task strelka_calling{
    # Configures and runs the Strelka somatic workflow on a tumor/normal BAM
    # pair, passing the given indel-candidate VCF to --indelCandidates.
    #
    # The run directory is <sample_id>_result and is returned as a tar archive.
    # The BAM index inputs and manta_indel_vcf_index are not named in the
    # command; they are declared so they get localized with their data files.

    String sample_id

    File tumor_bam
    File tumor_bam_bai
    File normal_bam
    File normal_bam_bai

    File ref_dir
    String ref_fasta

    File manta_indel_vcf
    File manta_indel_vcf_index

    # Execution environment
    String docker
    String cluster_config
    String disk_size

    String out_dir = sample_id + "_result"

    command <<<
        set -e -x -o pipefail
        nt=$(nproc)

        # Step 1: generate runWorkflow.py inside the run directory
        /home/biosoft/strelka-2.9.10.centos6_x86_64/bin/configureStrelkaSomaticWorkflow.py \
            --normalBam ${normal_bam} \
            --tumorBam ${tumor_bam} \
            --referenceFasta ${ref_dir}/${ref_fasta} \
            --indelCandidates ${manta_indel_vcf} \
            --runDir ${out_dir}

        ls ${out_dir}

        # Step 2: execute the generated workflow locally with one job per core
        python2.7 ${out_dir}/runWorkflow.py -m local -j $nt

        ls ${out_dir}

        # Step 3: archive the whole run directory
        tar cvf ${out_dir}.tar ${out_dir}
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File out_file = out_dir + ".tar"
    }
 }
 task sentieon_TNscope{
    # Somatic variant calling with Sentieon TNscope (SV detector disabled,
    # soft clips trimmed), writing <sample_id>.TNscope.vcf.
    #
    # Inputs:
    #   tumor_bam / tumor_recall_data    - tumor BAM and its recalibration table (-q)
    #   normal_bam / normal_recall_data  - optional matched normal BAM and its table;
    #                                      when normal_bam is not supplied the task
    #                                      runs with the tumor sample only
    #   tumor_bam_bai, normal_bam_bai    - not referenced by the command; declared so the
    #                                      indexes are localized with the BAMs
    #   tumor_name, normal_name          - sample names for --tumor_sample / --normal_sample
    #                                      (normal_name is only used when normal_bam is set)
    #   ref_dir / ref_fasta              - reference directory and FASTA file name
    #   dbsnp_dir / dbsnp                - directory and file name passed to --dbsnp
    #   SENTIEON_LICENSE                 - exported as the SENTIEON_LICENSE environment variable
    String sample_id
    File tumor_bam
    File tumor_bam_bai
    File? normal_bam
    File? normal_bam_bai
    String tumor_name
    String normal_name
    File tumor_recall_data
    # Optional so that a tumor-only run does not need a placeholder table;
    # callers that already pass a File are unaffected.
    File? normal_recall_data

    File ref_dir
    String ref_fasta
    File dbsnp_dir
    String dbsnp

    # execution environment
    String docker
    String cluster_config
    String disk_size
    String SENTIEON_LICENSE


    command <<<
        # `-o` must be followed by an option name; a bare `set -exo` only prints
        # the shell's option table instead of enabling anything.
        set -exo pipefail
        export SENTIEON_LICENSE=${SENTIEON_LICENSE}
        nt=$(nproc)

        # Unset optional inputs render as empty strings, so the normal-sample
        # arguments are added only when a normal BAM was actually provided.
        INPUT="-i ${tumor_bam} -q ${tumor_recall_data}"
        SAMPLE="--tumor_sample ${tumor_name}"
        if [ -n "${normal_bam}" ]; then
            INPUT="$INPUT -i ${normal_bam}"
            if [ -n "${normal_recall_data}" ]; then
                INPUT="$INPUT -q ${normal_recall_data}"
            fi
            SAMPLE="$SAMPLE --normal_sample ${normal_name}"
        fi

        sentieon driver -t $nt -r ${ref_dir}/${ref_fasta} \
        $INPUT \
        --algo TNscope $SAMPLE \
        --disable_detector sv --trim_soft_clip \
        --dbsnp ${dbsnp_dir}/${dbsnp} ${sample_id}.TNscope.vcf || { echo "TNscope failed"; exit 1; }

        ls ./

    >>>

    runtime{
        docker:docker
        cluster:cluster_config
        systemDisk:"cloud_ssd 40"
        dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output{
        File vcf = "${sample_id}.TNscope.vcf"
        # The command never names the .idx explicitly; it is expected to be written
        # next to the VCF by the driver - TODO confirm.
        File vcf_index = "${sample_id}.TNscope.vcf.idx"

    }
 }
 task sentieon_TNseq{
    # Somatic variant calling with Sentieon TNhaplotyper2 followed by TNfilter.
    #
    # The first driver call runs three algorithms in one pass:
    #   TNhaplotyper2      -> <sample_id>.TNseq.raw.vcf
    #   OrientationBias    -> <sample_id>.orientation
    #   ContaminationModel -> <sample_id>.contamination and
    #                         <sample_id>.contamination.segments
    # The second driver call runs TNfilter on the raw VCF with those three
    # side files and writes <sample_id>.bwa_TNseq.vcf.
    #
    # Inputs:
    #   tumor_bam                 - tumor BAM
    #   normal_bam                - optional matched normal BAM; when it is not supplied
    #                               the task runs with the tumor sample only
    #   tumor_bam_bai, normal_bam_bai, germline_resource_tbi
    #                             - not referenced by the command; declared so the
    #                               indexes are localized with their data files
    #   tumor_name, normal_name   - sample names for --tumor_sample / --normal_sample
    #                               (normal_name is only used when normal_bam is set)
    #   ref_dir / ref_fasta       - reference directory and FASTA file name
    #   germline_resource         - VCF passed to --germline_vcf and to ContaminationModel --vcf
    #   SENTIEON_LICENSE          - exported as the SENTIEON_LICENSE environment variable
    String sample_id
    File tumor_bam
    File tumor_bam_bai
    File? normal_bam
    File? normal_bam_bai
    String tumor_name
    String normal_name

    File ref_dir
    String ref_fasta
    File germline_resource
    File germline_resource_tbi

    # execution environment
    String docker
    String cluster_config
    String disk_size
    String SENTIEON_LICENSE


    command <<<
        # `-o` must be followed by an option name; a bare `set -exo` only prints
        # the shell's option table instead of enabling anything.
        set -exo pipefail
        export SENTIEON_LICENSE=${SENTIEON_LICENSE}
        nt=$(nproc)


        # `[` is a command and needs spaces around its arguments. Without them the
        # shell looks up a command named after the bracketed path, the test always
        # fails, and the normal sample is silently dropped. An unset optional
        # normal_bam renders as an empty string, so -n detects tumor-only runs.
        if [ -n "${normal_bam}" ]; then
            INPUT="-i ${tumor_bam} -i ${normal_bam}"
            SAMPLE="--tumor_sample ${tumor_name} --normal_sample ${normal_name}"
        else
            INPUT="-i ${tumor_bam}"
            SAMPLE="--tumor_sample ${tumor_name}"
        fi

        sentieon driver -t $nt -r ${ref_dir}/${ref_fasta} \
        $INPUT \
        --algo TNhaplotyper2 $SAMPLE \
        --germline_vcf ${germline_resource} \
        ${sample_id}.TNseq.raw.vcf \
        --algo OrientationBias --tumor_sample ${tumor_name} \
        ${sample_id}.orientation \
        --algo ContaminationModel $SAMPLE \
        --vcf ${germline_resource} \
        --tumor_segments ${sample_id}.contamination.segments \
        ${sample_id}.contamination

        sentieon driver -t $nt \
        -r ${ref_dir}/${ref_fasta} \
        --algo TNfilter $SAMPLE \
        -v ${sample_id}.TNseq.raw.vcf \
        --contamination ${sample_id}.contamination \
        --tumor_segments ${sample_id}.contamination.segments \
        --orientation_priors ${sample_id}.orientation \
        ${sample_id}.bwa_TNseq.vcf

    >>>

    runtime{
        docker:docker
        cluster:cluster_config
        systemDisk:"cloud_ssd 40"
        dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"

    }

    output{
        File raw_vcf = "${sample_id}.TNseq.raw.vcf"
        File raw_vcf_index = "${sample_id}.TNseq.raw.vcf.idx"
        File vcf = "${sample_id}.bwa_TNseq.vcf"
        File vcf_index = "${sample_id}.bwa_TNseq.vcf.idx"
        File contamination = "${sample_id}.contamination"
        File contamination_segments = "${sample_id}.contamination.segments"
        File orientation = "${sample_id}.orientation"


    }
 }