Переглянути джерело

同步至HPC-v20190527版

增加Unaligned片段成分分析
master
chenziyin 6 роки тому
джерело
коміт
e4edf57a69
5 змінених файлів з 172 додано та 44 видалено
  1. +63
    -14
      tasks/Align.wdl
  2. +24
    -0
      tasks/Fastqc.wdl
  3. +4
    -4
      tasks/Quantification.wdl
  4. +5
    -5
      tasks/ReadFilter.wdl
  5. +76
    -21
      tasks/ReadStats.wdl

+ 63
- 14
tasks/Align.wdl Переглянути файл

task Align {
task AlignToSenseOnly {
String sample_ID
String sample_id
File in_fastq File in_fastq

String refname
File dir_index File dir_index
String prefix_index

Int max_mismatch_allowed
String prefix_index
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
nt=$(nproc)


Int sum_unmatch_quality_limit
bowtie --threads $nt \
${dir_index}/${prefix_index} --norc \
-v ${max_mismatch_allowed} \
-q ${in_fastq} \
--un ${sample_id}.${refname}Unaligned.fastq \
-S ${sample_id}.align2${refname}.sam \
2> ${sample_id}.align2${refname}.log


gzip ${sample_id}.${refname}Unaligned.fastq
>>>

runtime {
docker: docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}

output {
File out_sam="${sample_id}.align2${refname}.sam"
File out_fastq="${sample_id}.${refname}Unaligned.fastq.gz"
File out_log="${sample_id}.align2${refname}.log"
}
}

task AlignToBothStrand {
String sample_id
File in_fastq

String refname
File dir_index
String prefix_index

Int max_mismatch_allowed
String docker String docker
String cluster_config String cluster_config
String disk_size String disk_size
set -e set -e
nt=$(nproc) nt=$(nproc)



bowtie --threads $nt \ bowtie --threads $nt \
${dir_index}/${prefix_index} \ ${dir_index}/${prefix_index} \
-e ${sum_unmatch_quality_limit} \
-v ${max_mismatch_allowed} \
-q ${in_fastq} \ -q ${in_fastq} \
--un ${sample_ID}.matureUnaligned.fastq \
-S ${sample_ID}.align2mature.sam \
2> ${sample_ID}.align2mature.log
--un ${sample_id}.${refname}Unaligned.fastq \
-S ${sample_id}.align2${refname}.sam \
2> ${sample_id}.align2${refname}.log

gzip ${sample_id}.${refname}Unaligned.fastq
>>> >>>


runtime { runtime {
docker: docker docker: docker
cluster: cluster_config cluster: cluster_config
systemDisk: "cloud_ssd 40"
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
} }


output { output {
File out_sam="${sample_ID}.align2mature.sam"
File out_fastq="${sample_ID}.matureUnaligned.fastq"
File out_log="${sample_ID}.align2mature.log"
File out_sam="${sample_id}.align2${refname}.sam"
File out_fastq="${sample_id}.${refname}Unaligned.fastq.gz"
File out_log="${sample_id}.align2${refname}.log"
} }
}
}

+ 24
- 0
tasks/Fastqc.wdl Переглянути файл

task Fastqc {
File in_fastq
String docker
String cluster_config
String disk_size

command <<<
set -o pipefail
set -e
nt=$(nproc)
fastqc -t $nt -o ./ ${in_fastq}
>>>

runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File fastqc_html = sub(basename(in_fastq), "\\.(fastq|fq)\\.gz$", "_fastqc.html")
File fastqc_zip = sub(basename(in_fastq), "\\.(fastq|fq)\\.gz$", "_fastqc.zip")
}
}

+ 4
- 4
tasks/Quantification.wdl Переглянути файл

task Quantification { task Quantification {
String sample_ID
String sample_id
File in_sam File in_sam


String cluster_config String cluster_config
set -o pipefail set -o pipefail
set -e set -e


echo -e "ID.miRNA\tReadCount" > ${sample_ID}.matureMiR.readCount
cat ${in_sam} | grep -v '^@' | awk '($2==0)' | awk '{a[$3]++}END{for(i in a){printf "%s\t%d\n",i,a[i]}}' | sort >> ${sample_ID}.matureMiR.readCount
echo -e "ID.miRNA\tReadCount" > ${sample_id}.matureMiR.readCount
cat ${in_sam} | grep -v '^@' | awk '($2==0)' | awk '{a[$3]++}END{for(i in a){printf "%s\t%d\n",i,a[i]}}' | sort >> ${sample_id}.matureMiR.readCount


>>> >>>


} }


output { output {
File out_readCount="${sample_ID}.matureMiR.readCount"
File out_readCount="${sample_id}.matureMiR.readCount"
} }
} }

+ 5
- 5
tasks/ReadFilter.wdl Переглянути файл

task ReadFilter { task ReadFilter {
String sample_ID
String sample_id
File in_fastq File in_fastq
Int qualified_quality_phred Int qualified_quality_phred
Int unqualified_percent_limit Int unqualified_percent_limit
--n_base_limit ${n_base_limit} \ --n_base_limit ${n_base_limit} \
--length_required ${length_required} \ --length_required ${length_required} \
-i ${in_fastq} \ -i ${in_fastq} \
-o ${sample_ID}.trimAdapt.filter.fastq.gz \
2> ${sample_ID}.filter.log
-o ${sample_id}.trimAdapt.filter.fastq.gz \
2> ${sample_id}.filter.log
>>> >>>


runtime { runtime {
} }


output { output {
File out_fastq="${sample_ID}.trimAdapt.filter.fastq.gz"
File out_log="${sample_ID}.filter.log"
File out_fastq="${sample_id}.trimAdapt.filter.fastq.gz"
File out_log="${sample_id}.filter.log"
} }
} }

+ 76
- 21
tasks/ReadStats.wdl Переглянути файл

task ReadStats { task ReadStats {
String sample_ID
String sample_id
File in_log_trimAdatper File in_log_trimAdatper
File in_log_readFilter File in_log_readFilter
File in_log_align_mature
File in_log_align_miRNA
File in_log_align_preMiRNA
File in_log_align_piRNA
File in_log_align_tRNA
File in_log_align_RNA
File in_log_align_hg38
File in_sam_align_RNA

String cluster_config String cluster_config
String disk_size String disk_size


set -o pipefail set -o pipefail
set -e set -e


Total_input=$(cat ${in_log_trimAdatper} | grep 'total reads' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
n_Total_Sequence=$(cat ${in_log_trimAdatper} | grep 'total reads' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')



Pass_trimAdatper=$(cat ${in_log_trimAdatper} | grep 'reads passed filter' | tail -n 1 | cut -d ':' -f 2 | sed 's/ //g')
Adapter_dimer=$(bc<<<$Total_input-$Pass_trimAdatper)
echo "Pass1"
n_AdapterNotFound=$(cat ${in_log_trimAdatper} | grep 'reads failed due to too long' | cut -d ':' -f 2 | sed 's/ //g')
n_Total_forCount=$(bc<<<$n_Total_Sequence-$n_AdapterNotFound)


Too_short=$(cat ${in_log_readFilter} | grep 'too short' | tail -n 1 | cut -d ':' -f 2 | sed 's/ //g')
n_Pass_trimAdatper=$(cat ${in_log_trimAdatper} | grep 'reads passed filter' | tail -n 1 | cut -d ':' -f 2 | sed 's/ //g')
n_Adapter_dimer=$(bc<<<$n_Total_Sequence-$n_AdapterNotFound-$n_Pass_trimAdatper)
n_Too_short=$(cat ${in_log_readFilter} | grep 'too short' | tail -n 1 | cut -d ':' -f 2 | sed 's/ //g')
n_Low_quality_singleBase=$(cat ${in_log_readFilter} | grep 'low quality' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
n_Low_quality_tooManyN=$(cat ${in_log_readFilter} | grep 'too many N' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
n_Low_quality=$(bc<<<$n_Low_quality_singleBase+$n_Low_quality_tooManyN)
n_ForAlign=$(cat ${in_log_readFilter} | grep 'reads passed filter' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')


Low_quality_singleBase=$(cat ${in_log_readFilter} | grep 'low quality' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
Low_quality_tooManyN=$(cat ${in_log_readFilter} | grep 'too many N' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
Low_quality=$(bc<<<$Low_quality_singleBase+$Low_quality_tooManyN)
echo "Pass2"
n_miRNA_mature=$(cat ${in_log_align_miRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
n_miRNA_hairpin=$(cat ${in_log_align_preMiRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
n_piRNA=$(cat ${in_log_align_piRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
n_tRNA=$(cat ${in_log_align_tRNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')


ForAlign=$(cat ${in_log_readFilter} | grep 'reads passed filter' | head -n 1 | cut -d ':' -f 2 | sed 's/ //g')
n_RNA=$(cat ${in_log_align_RNA} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
n_otGenomic=$(cat ${in_log_align_hg38} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')


Align_miRNA_mature=$(cat ${in_log_align_mature} | grep 'at least one reported alignment' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g')
n_notGenomic=$(cat ${in_log_align_hg38} | grep 'reads that failed to align' | head -n 1 | cut -d ':' -f 2 | cut -d '(' -f 1 | sed 's/ //g' )


echo -e "Stage\tReadCount" > ${sample_ID}.readStats
echo -e "Total Input\t$Total_input" >> ${sample_ID}.readStats
echo -e "Adapter Dimer\t$Adapter_dimer" >> ${sample_ID}.readStats
echo -e "Too Short\t$Too_short" >> ${sample_ID}.readStats
echo -e "Low Quality\t$Low_quality" >> ${sample_ID}.readStats
echo -e "For Align\t$ForAlign" >> ${sample_ID}.readStats
echo -e "Mature miRNA\t$Align_miRNA_mature" >> ${sample_ID}.readStats
echo "Pass3"
# mkdir -p /cromwell_root/tmp
echo "Pass3.1"

groupedReadCount=/cromwell_root/tmp/${sample_id}.trimAdapt.filter.align2RNA.grouped.readCount
echo "Pass3.2"

# cat ${in_sam_align_RNA} | head -n 4
echo "Pass3.3"

# cat ${in_sam_align_RNA} | grep -v '@' | awk '($2!=4)' | cut -f 3 | sed 's/.*;//g' | awk '{a[$1]++}END{for(i in a){printf "%s\t%d\n",i,a[i]}}' > $groupedReadCount

echo "Pass4"

# n_mRNA=$(cat $groupedReadCount | grep '^mRNA' | cut -f 2 )
# n_lncRNA=$(cat $groupedReadCount | grep '^long_non-coding_RNA' | cut -f 2 )
# n_rRNA=$(cat $groupedReadCount | grep '^ribosomal_RNA' | cut -f 2 )
# n_YRNA=$(cat $groupedReadCount | grep '^Y_RNA' | cut -f 2 )
# n_otsmall=$(cat $groupedReadCount | grep -E '^misc_RNA|small|guide_RNA|vault_RNA' | cut -f 2 | awk '{sum+=$1}END{print sum}')
# n_otTranscript=$(bc<<<$n_RNA-$n_mRNA-$n_lncRNA-$n_rRNA-$n_YRNA-$n_otsmall)

echo "Pass5"
file_output=${sample_id}.readStats
echo -e "Stage\tReadCount" > $file_output
echo -e "adapter not found\t$n_AdapterNotFound" >> $file_output
echo -e "adapter dimer\t$n_Adapter_dimer" >> $file_output
echo -e "too short\t$n_Too_short" >> $file_output
echo -e "low sequencing quality\t$n_Low_quality" >> $file_output
echo -e "mature miRNA\t$n_miRNA_mature" >> $file_output
echo -e "hairpin miRNA\t$n_miRNA_hairpin" >> $file_output
echo -e "piRNA\t$n_piRNA" >> $file_output
echo -e "tRNA\t$n_tRNA" >> $file_output
# echo -e "mRNA\t$n_mRNA" >> $file_output
# echo -e "lncRNA\t$n_lncRNA" >> $file_output
# echo -e "rRNA\t$n_rRNA" >> $file_output
# echo -e "YRNA\t$n_YRNA" >> $file_output
# echo -e "other small RNA\t$n_otsmall" >> $file_output
# echo -e "other from transcriptome\t$n_otTranscript" >> $file_output
echo -e "other from human genome\t$n_otGenomic" >> $file_output
echo -e "not from human genome\t$n_notGenomic" >> $file_output

echo "Pass6"
>>> >>>


runtime { runtime {
} }


output { output {
File out="${sample_ID}.readStats"
File out="${sample_id}.readStats"
} }
} }

Завантаження…
Відмінити
Зберегти