增加Unaligned片段成分分析

преди 6 години · e4edf57a69
--- a/tasks/Align.wdl
+++ b/tasks/Align.wdl
@@ -1,13 +1,61 @@
 task Align {
 task AlignToSenseOnly {
    
    String sample_ID
    String sample_id
    File in_fastq

    String refname
    
    File dir_index
    String prefix_index 

    Int max_mismatch_allowed
    
    String prefix_index
    String docker
    String cluster_config
    String disk_size

    command <<<
        set -o pipefail
        set -e
        nt=$(nproc)

    Int sum_unmatch_quality_limit
        bowtie --threads $nt \
            ${dir_index}/${prefix_index} --norc \
            -v ${max_mismatch_allowed} \
            -q ${in_fastq} \
            --un ${sample_id}.${refname}Unaligned.fastq \
            -S ${sample_id}.align2${refname}.sam \
        2> ${sample_id}.align2${refname}.log

        gzip ${sample_id}.${refname}Unaligned.fastq
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40" 
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File out_sam="${sample_id}.align2${refname}.sam"
        File out_fastq="${sample_id}.${refname}Unaligned.fastq.gz"
        File out_log="${sample_id}.align2${refname}.log"
    }
 }

 task AlignToBothStrand {
    
    String sample_id
    File in_fastq

    String refname
    
    File dir_index
    String prefix_index 

    Int max_mismatch_allowed
    
    String docker
    String cluster_config
    String disk_size
@@ -17,26 +65,27 @@ task Align {
        set -e
        nt=$(nproc)


        bowtie --threads $nt \
            ${dir_index}/${prefix_index} \
            -e ${sum_unmatch_quality_limit} \
            -v ${max_mismatch_allowed} \
            -q ${in_fastq} \
            --un ${sample_ID}.matureUnaligned.fastq \
            -S ${sample_ID}.align2mature.sam \
        2> ${sample_ID}.align2mature.log
            --un ${sample_id}.${refname}Unaligned.fastq \
            -S ${sample_id}.align2${refname}.sam \
        2> ${sample_id}.align2${refname}.log

        gzip ${sample_id}.${refname}Unaligned.fastq
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        systemDisk: "cloud_ssd 40" 
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File out_sam="${sample_ID}.align2mature.sam"
        File out_fastq="${sample_ID}.matureUnaligned.fastq"
        File out_log="${sample_ID}.align2mature.log"
        File out_sam="${sample_id}.align2${refname}.sam"
        File out_fastq="${sample_id}.${refname}Unaligned.fastq.gz"
        File out_log="${sample_id}.align2${refname}.log"
    }
 }
 }
--- a/tasks/Fastqc.wdl
+++ b/tasks/Fastqc.wdl
@@ -0,0 +1,24 @@
 task Fastqc {
 	File in_fastq
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		set -o pipefail
 		set -e
 		nt=$(nproc)
 		fastqc -t $nt -o ./ ${in_fastq}
 	>>>

 	runtime {
 		docker:docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}
 	output {
 		File fastqc_html = sub(basename(in_fastq), "\\.(fastq|fq)\\.gz$", "_fastqc.html")
 		File fastqc_zip = sub(basename(in_fastq), "\\.(fastq|fq)\\.gz$", "_fastqc.zip")
 	}
 }
--- a/tasks/Quantification.wdl
+++ b/tasks/Quantification.wdl
@@ -1,5 +1,5 @@
 task Quantification {
    String sample_ID
    String sample_id
    File in_sam

    String cluster_config
@@ -10,8 +10,8 @@ task Quantification {
        set -o pipefail
        set -e

    echo -e "ID.miRNA\tReadCount" > ${sample_ID}.matureMiR.readCount
    cat ${in_sam} | grep -v '^@'  | awk '($2==0)' | awk '{a[$3]++}END{for(i in a){printf "%s\t%d\n",i,a[i]}}' | sort >> ${sample_ID}.matureMiR.readCount
    echo -e "ID.miRNA\tReadCount" > ${sample_id}.matureMiR.readCount
    cat ${in_sam} | grep -v '^@'  | awk '($2==0)' | awk '{a[$3]++}END{for(i in a){printf "%s\t%d\n",i,a[i]}}' | sort >> ${sample_id}.matureMiR.readCount

    >>>

@@ -22,6 +22,6 @@ task Quantification {
    }

    output {
        File out_readCount="${sample_ID}.matureMiR.readCount"
        File out_readCount="${sample_id}.matureMiR.readCount"
    }
 }
--- a/tasks/ReadFilter.wdl
+++ b/tasks/ReadFilter.wdl
@@ -1,6 +1,6 @@
 task ReadFilter {
    
    String sample_ID
    String sample_id
    File in_fastq
    Int qualified_quality_phred
    Int unqualified_percent_limit
@@ -23,8 +23,8 @@ task ReadFilter {
            --n_base_limit ${n_base_limit} \
            --length_required ${length_required} \
            -i ${in_fastq} \
            -o ${sample_ID}.trimAdapt.filter.fastq.gz \
            2> ${sample_ID}.filter.log
            -o ${sample_id}.trimAdapt.filter.fastq.gz \
            2> ${sample_id}.filter.log
        >>>

    runtime {
@@ -35,7 +35,7 @@ task ReadFilter {
    }

    output {
        File out_fastq="${sample_ID}.trimAdapt.filter.fastq.gz"
        File out_log="${sample_ID}.filter.log"
        File out_fastq="${sample_id}.trimAdapt.filter.fastq.gz"
        File out_log="${sample_id}.filter.log"
    }
 }
--- a/tasks/ReadStats.wdl
+++ b/tasks/ReadStats.wdl
@@ -1,10 +1,16 @@
 task ReadStats {
    
    String sample_ID
    String sample_id
    File in_log_trimAdatper
    File in_log_readFilter
    File in_log_align_mature
   
    File in_log_align_miRNA
    File in_log_align_preMiRNA
    File in_log_align_piRNA
    File in_log_align_tRNA
    File in_log_align_RNA 
    File in_log_align_hg38
    File in_sam_align_RNA

    String cluster_config
    String disk_size

@@ -12,29 +18,78 @@ task ReadStats {
        set -o pipefail
        set -e

        Total_input=$(cat ${in_log_trimAdatper} | grep 'total reads' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
        n_Total_Sequence=$(cat ${in_log_trimAdatper} | grep 'total reads' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')

   

        Pass_trimAdatper=$(cat ${in_log_trimAdatper} | grep 'reads passed filter' | tail -n 1  | cut -d ':' -f 2 | sed 's/ //g')
        Adapter_dimer=$(bc<<<$Total_input-$Pass_trimAdatper)
        echo "Pass1"
        n_AdapterNotFound=$(cat ${in_log_trimAdatper} | grep 'reads failed due to too long' | cut -d ':' -f 2 | sed 's/ //g')
        n_Total_forCount=$(bc<<<$n_Total_Sequence-$n_AdapterNotFound)

        Too_short=$(cat ${in_log_readFilter} | grep 'too short' | tail -n 1 | cut -d ':' -f 2 | sed 's/ //g')
        n_Pass_trimAdatper=$(cat ${in_log_trimAdatper} | grep 'reads passed filter' | tail -n 1 | cut -d ':' -f 2 | sed 's/ //g')
        n_Adapter_dimer=$(bc<<<$n_Total_Sequence-$n_AdapterNotFound-$n_Pass_trimAdatper)
        
        n_Too_short=$(cat ${in_log_readFilter} | grep 'too short' | tail -n 1 | cut -d ':' -f 2 | sed 's/ //g')
        n_Low_quality_singleBase=$(cat ${in_log_readFilter} | grep 'low quality' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
        n_Low_quality_tooManyN=$(cat ${in_log_readFilter} | grep 'too many N' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
        n_Low_quality=$(bc<<<$n_Low_quality_singleBase+$n_Low_quality_tooManyN)
        n_ForAlign=$(cat ${in_log_readFilter} | grep 'reads passed filter' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')

        Low_quality_singleBase=$(cat ${in_log_readFilter} | grep 'low quality' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
        Low_quality_tooManyN=$(cat ${in_log_readFilter} | grep 'too many N' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
        Low_quality=$(bc<<<$Low_quality_singleBase+$Low_quality_tooManyN)
        echo "Pass2"
        n_miRNA_mature=$(cat ${in_log_align_miRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
        n_miRNA_hairpin=$(cat ${in_log_align_preMiRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
        n_piRNA=$(cat ${in_log_align_piRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
        n_tRNA=$(cat ${in_log_align_tRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')

        ForAlign=$(cat ${in_log_readFilter} | grep 'reads passed filter' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
        n_RNA=$(cat ${in_log_align_RNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
        n_otGenomic=$(cat ${in_log_align_hg38} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')

        Align_miRNA_mature=$(cat ${in_log_align_mature} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
        n_notGenomic=$(cat ${in_log_align_hg38} | grep 'reads that failed to align' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g' )

        echo -e "Stage\tReadCount" > ${sample_ID}.readStats
        echo -e "Total Input\t$Total_input" >> ${sample_ID}.readStats
        echo -e "Adapter Dimer\t$Adapter_dimer" >> ${sample_ID}.readStats
        echo -e "Too Short\t$Too_short" >> ${sample_ID}.readStats
        echo -e "Low Quality\t$Low_quality" >> ${sample_ID}.readStats
        echo -e "For Align\t$ForAlign" >> ${sample_ID}.readStats
        echo -e "Mature miRNA\t$Align_miRNA_mature" >> ${sample_ID}.readStats
    
        echo "Pass3"
        
       # mkdir -p /cromwell_root/tmp
        echo "Pass3.1"

        groupedReadCount=/cromwell_root/tmp/${sample_id}.trimAdapt.filter.align2RNA.grouped.readCount
        echo "Pass3.2"

       # cat ${in_sam_align_RNA} | head -n 4 
        echo "Pass3.3"

       # cat ${in_sam_align_RNA} | grep -v '@' | awk '($2!=4)' | cut -f 3 | sed 's/.*;//g' | awk '{a[$1]++}END{for(i in a){printf "%s\t%d\n",i,a[i]}}' > $groupedReadCount

        echo "Pass4"

       # n_mRNA=$(cat $groupedReadCount | grep '^mRNA' | cut -f 2 )
       # n_lncRNA=$(cat $groupedReadCount | grep '^long_non-coding_RNA' | cut -f 2 )
       # n_rRNA=$(cat $groupedReadCount | grep '^ribosomal_RNA' | cut -f 2 )
       # n_YRNA=$(cat $groupedReadCount | grep '^Y_RNA' | cut -f 2 )
       # n_otsmall=$(cat $groupedReadCount | grep -E '^misc_RNA|small|guide_RNA|vault_RNA' | cut -f 2 | awk '{sum+=$1}END{print sum}')
       # n_otTranscript=$(bc<<<$n_RNA-$n_mRNA-$n_lncRNA-$n_rRNA-$n_YRNA-$n_otsmall)

        echo "Pass5"
        
        file_output=${sample_id}.readStats
        echo -e "Stage\tReadCount" > $file_output
        echo -e "adapter not found\t$n_AdapterNotFound" >> $file_output
        echo -e "adapter dimer\t$n_Adapter_dimer" >> $file_output
        echo -e "too short\t$n_Too_short" >> $file_output
        echo -e "low sequencing quality\t$n_Low_quality" >> $file_output
        echo -e "mature miRNA\t$n_miRNA_mature" >> $file_output
        echo -e "hairpin miRNA\t$n_miRNA_hairpin" >> $file_output
        echo -e "piRNA\t$n_piRNA" >> $file_output
        echo -e "tRNA\t$n_tRNA" >> $file_output
       # echo -e "mRNA\t$n_mRNA" >> $file_output
       # echo -e "lncRNA\t$n_lncRNA" >> $file_output
       # echo -e "rRNA\t$n_rRNA" >> $file_output
       # echo -e "YRNA\t$n_YRNA" >> $file_output
       # echo -e "other small RNA\t$n_otsmall" >> $file_output
       # echo -e "other from transcriptome\t$n_otTranscript" >> $file_output
        echo -e "other from human genome\t$n_otGenomic" >> $file_output
        echo -e "not from human genome\t$n_notGenomic" >> $file_output

        echo "Pass6"
    >>>

    runtime {
@@ -44,6 +99,6 @@ task ReadStats {
    }

    output {
        File out="${sample_ID}.readStats"
        File out="${sample_id}.readStats"
    }
 }