# 原始数据和比对数据的质量控制 | |||||
> Author:Ren Luyao | |||||
> | |||||
> Git: http://choppy.3steps.cn/renluyao/RNAseq_QC.git | |||||
> | |||||
> Email: 18110700050@fudan.edu.cn | |||||
> | |||||
> Date: 2020/02/09 | |||||
# APP概述 | |||||
本APP包含了原始数据质量控制软件FastQC和FastqScreen和比对质量控制软件Qualimap,以及对多样本数据结果整合的multiqc。 | |||||
# APP输入 | |||||
本APP只有一个输入即inputSamplesFile,包含了需要计算样本的fastq read1,bam和bam的index。 | |||||
这个文件是一个txt,tab分隔,第一列是read1的阿里云地址,第二列是bam,第三列是bam index。每一行是一个样本。可查看模版inputSamplesFileExamples.tsv,**注意:#read1 #bam #bai这一行要删掉**。 | |||||
```bash | |||||
#read1 #bam #bai | |||||
``` | |||||
将准备好的inputSamplesFile文件上传至阿里云。 | |||||
choppy samples文件中就填inputSamplesFile在阿里云上的地址。 | |||||
```bash | |||||
# 1. 启动choppy | |||||
source activate choppy | |||||
# 2. 安装APP | |||||
choppy install renluyao/RNAseq_QC | |||||
# 3. 获得choppy samples的csv文件 | |||||
choppy samples RNAseq_QC-latest --output RNAseq_qc_samples | |||||
# 4. 编辑samples文件 | |||||
# samples_id,inputSamplesFile | |||||
# samples_id是choppy对workflow的编号,写阿拉伯数字就行 | |||||
# 即 | |||||
# 1,inputSamplesFile的阿里云地址 | |||||
# 5. 提交任务 | |||||
choppy batch RNAseq_QC-latest --project-name <project_name> | |||||
# 6. 查询任务 | |||||
choppy query -s <workflow_id> | |||||
choppy query -s <workflow_id> -m | |||||
``` | |||||
# APP输出结果 | |||||
所有的结果都会整合进multiqc。从阿里云上下载multiqc模块的输出:
1. **multiqc_report.html**
2. **glob_一大串数字的文件夹**
下载上述文件,将**glob_一大串数字的文件夹**名称改成**multiqc**,双击multiqc_report.html在浏览器中打开就能查看结果了。
如果需要各模块详细的结果,可下载对应结果。 |
# This is an example configuration file for FastQ Screen | |||||
############################ | |||||
## Bowtie, Bowtie 2 or BWA # | |||||
############################ | |||||
## If the Bowtie, Bowtie 2 or BWA binary is not in your PATH, you can set | |||||
## this value to tell the program where to find your chosen aligner. Uncomment | |||||
## the relevant line below and set the appropriate location. Please note, | |||||
## this path should INCLUDE the executable filename. | |||||
#BOWTIE /usr/local/bin/bowtie/bowtie | |||||
#BOWTIE2 /usr/local/bowtie2/bowtie2 | |||||
#BWA /usr/local/bwa/bwa | |||||
############################################ | |||||
## Bismark (for bisulfite sequencing only) # | |||||
############################################ | |||||
## If the Bismark binary is not in your PATH then you can set this value to | |||||
## tell the program where to find it. Uncomment the line below and set the | |||||
## appropriate location. Please note, this path should INCLUDE the executable | |||||
## filename. | |||||
#BISMARK /usr/local/bin/bismark/bismark | |||||
############ | |||||
## Threads # | |||||
############ | |||||
## Genome aligners can be made to run across multiple CPU cores to speed up | |||||
## searches. Set this value to the number of cores you want for mapping reads. | |||||
THREADS 32 | |||||
############## | |||||
## DATABASES # | |||||
############## | |||||
## This section enables you to configure multiple genomes databases (aligner index | |||||
## files) to search against in your screen. For each genome you need to provide a | |||||
## database name (which can't contain spaces) and the location of the aligner index | |||||
## files. | |||||
## | |||||
## The path to the index files SHOULD INCLUDE THE BASENAME of the index, e.g: | |||||
## /data/public/Genomes/Human_Bowtie/GRCh37/Homo_sapiens.GRCh37 | |||||
## Thus, the index files (Homo_sapiens.GRCh37.1.bt2, Homo_sapiens.GRCh37.2.bt2, etc.) | |||||
## are found in a folder named 'GRCh37'. | |||||
## | |||||
## If, for example, the Bowtie, Bowtie2 and BWA indices of a given genome reside in | |||||
## the SAME FOLDER, a SINGLE path may be provided to ALL of the indices. The index
## used will be the one compatible with the chosen aligner (as specified using the | |||||
## --aligner flag). | |||||
## | |||||
## The entries shown below are only suggested examples, you can add as many DATABASE | |||||
## sections as required, and you can comment out or remove as many of the existing | |||||
## entries as desired. We suggest including genomes and sequences that may be sources | |||||
## of contamination either because they were run on your sequencer previously, or may
## have contaminated your sample during the library preparation step. | |||||
## | |||||
## Human - sequences available from | |||||
## ftp://ftp.ensembl.org/pub/current/fasta/homo_sapiens/dna/ | |||||
#DATABASE Human /data/public/Genomes/Human_Bowtie/GRCh37/Homo_sapiens.GRCh37 | |||||
## | |||||
## Mouse - sequence available from | |||||
## ftp://ftp.ensembl.org/pub/current/fasta/mus_musculus/dna/ | |||||
#DATABASE Mouse /data/public/Genomes/Mouse/NCBIM37/Mus_musculus.NCBIM37 | |||||
## | |||||
## Ecoli- sequence available from EMBL accession U00096.2 | |||||
#DATABASE Ecoli /data/public/Genomes/Ecoli/Ecoli | |||||
## | |||||
## PhiX - sequence available from Refseq accession NC_001422.1 | |||||
#DATABASE PhiX /data/public/Genomes/PhiX/phi_plus_SNPs | |||||
## | |||||
## Adapters - sequence derived from the FastQC contaminants file found at: www.bioinformatics.babraham.ac.uk/projects/fastqc
#DATABASE Adapters /data/public/Genomes/Contaminants/Contaminants | |||||
## | |||||
## Vector - Sequence taken from the UniVec database | |||||
## http://www.ncbi.nlm.nih.gov/VecScreen/UniVec.html | |||||
#DATABASE Vectors /data/public/Genomes/Vectors/Vectors | |||||
DATABASE Human /cromwell_root/tmp/fastq_screen_reference/genome | |||||
DATABASE Mouse /cromwell_root/tmp/fastq_screen_reference/mouse | |||||
DATABASE ERCC /cromwell_root/tmp/fastq_screen_reference/ERCC | |||||
DATABASE EColi /cromwell_root/tmp/fastq_screen_reference/ecoli | |||||
DATABASE Adapter /cromwell_root/tmp/fastq_screen_reference/adapters | |||||
DATABASE Vector /cromwell_root/tmp/fastq_screen_reference/vector | |||||
DATABASE rRNA /cromwell_root/tmp/fastq_screen_reference/rRNARef | |||||
DATABASE Virus /cromwell_root/tmp/fastq_screen_reference/viral | |||||
DATABASE Yeast /cromwell_root/tmp/fastq_screen_reference/GCF_000146045.2_R64_genomic_modify | |||||
DATABASE Mitoch /cromwell_root/tmp/fastq_screen_reference/Human_mitoch | |||||
DATABASE Phix /cromwell_root/tmp/fastq_screen_reference/phix |
oss://chinese-quartet/quartet-test-data/WEGENE_T7/Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105_R1.fastq.gz oss://chinese-quartet/quartet-test-data/WEGENE_T7/Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105_R2.fastq.gz oss://pgx-result/renluyao/quality_control/20191223_wegeneT7_sentieon/0214dda2-9408-4bc2-9e7e-322e18d488ad/call-Dedup/Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105.sorted.deduped.bam oss://pgx-result/renluyao/quality_control/20191223_wegeneT7_sentieon/0214dda2-9408-4bc2-9e7e-322e18d488ad/call-Dedup/Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105.sorted.deduped.bam.bai |
{ | |||||
"{{ project_name }}.qualimap.docker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/qualimap:2.0.0", | |||||
"{{ project_name }}.qualimap.cluster_config": "OnDemand bcs.a2.7xlarge img-ubuntu-vpc", | |||||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | |||||
"{{ project_name }}.fastqc.disk_size": "150", | |||||
"{{ project_name }}.fastqscreen.cluster_config": "OnDemand bcs.b2.3xlarge img-ubuntu-vpc", | |||||
"{{ project_name }}.fastqc.cluster_config": "OnDemand bcs.b2.3xlarge img-ubuntu-vpc", | |||||
"{{ project_name }}.fastqc.docker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqc:v0.11.5", | |||||
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}", | |||||
"{{ project_name }}.fastqscreen.docker": "registry.cn-shanghai.aliyuncs.com/pgx-docker-registry/fastqscreen:0.12.0", | |||||
"{{ project_name }}.screen_ref_dir": "oss://pgx-reference-data/fastq_screen_reference/", | |||||
"{{ project_name }}.fastq_screen_conf": "oss://pgx-reference-data/fastq_screen_reference/fastq_screen.conf", | |||||
"{{ project_name }}.multiqc.cluster_config": "OnDemand bcs.b2.3xlarge img-ubuntu-vpc", | |||||
"{{ project_name }}.qualimap.disk_size": "500", | |||||
"{{ project_name }}.multiqc.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/multiqc:v1.8", | |||||
"{{ project_name }}.fastqscreen.disk_size": "100", | |||||
"{{ project_name }}.multiqc.disk_size": "100", | |||||
"{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/" | |||||
} |
# Runs FastQC on one FASTQ file and exposes the HTML report and the zip
# archive that FastQC writes into the working directory.
task fastqc {
    # Execution settings, consumed only by the runtime section below.
    String docker
    String cluster_config
    String disk_size

    # FASTQ file to analyse.
    File read1

    command <<<
        set -e
        set -o pipefail
        # Use every core visible inside the container.
        threads=$(nproc)
        fastqc -t $threads -o ./ ${read1}
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        # disk_size is interpolated as-is; the unit is whatever the backend expects.
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        # Output names are the input file name with a trailing ".fastq.gz" or
        # ".fq.gz" replaced by the "_fastqc" suffix.
        File read1_html = sub(basename(read1), "\\.(fastq|fq)\\.gz$", "_fastqc.html")
        File read1_zip = sub(basename(read1), "\\.(fastq|fq)\\.gz$", "_fastqc.zip")
    }
}
# Runs FastQ Screen (bowtie2 aligner, --top 100000) on read1 and, when it is
# supplied, on read2, using the databases listed in fastq_screen_conf.
#
# read2 is optional so that callers with only a read1 column in their sample
# sheet can use this task; when read2 is absent the *2 outputs are undefined.
task fastq_screen {
    File read1
    File? read2
    # Directory holding the aligner indices; it is copied under
    # /cromwell_root/tmp/ before screening.
    # NOTE(review): the paths inside fastq_screen_conf presumably point into
    # that copied directory - confirm against the conf file in use.
    File screen_ref_dir
    File fastq_screen_conf
    # Report prefixes: the input file name without a trailing ".fastq.gz" or
    # ".fq.gz" (same pattern the fastqc task uses for its outputs).
    String read1name = sub(basename(read1), "\\.(fastq|fq)\\.gz$", "")
    # Placeholder prefix when read2 is absent; no file with that prefix is
    # produced, so the optional read2 outputs stay undefined.
    String read2name = if defined(read2) then sub(basename(select_first([read2])), "\\.(fastq|fq)\\.gz$", "") else "read2_not_provided"
    String docker
    String cluster_config
    String disk_size

    command <<<
        set -o pipefail
        set -e
        nt=$(nproc)
        mkdir -p /cromwell_root/tmp
        cp -r ${screen_ref_dir} /cromwell_root/tmp/
        fastq_screen --aligner bowtie2 --conf ${fastq_screen_conf} --top 100000 --threads $nt ${read1}
        # An unset optional input renders as an empty string, which skips the second run.
        read2_path="${read2}"
        if [ -n "$read2_path" ]; then
            fastq_screen --aligner bowtie2 --conf ${fastq_screen_conf} --top 100000 --threads $nt "$read2_path"
        fi
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File png1 = "${read1name}_screen.png"
        File txt1 = "${read1name}_screen.txt"
        File html1 = "${read1name}_screen.html"
        # Optional: only present when read2 was supplied.
        File? png2 = "${read2name}_screen.png"
        File? txt2 = "${read2name}_screen.txt"
        File? html2 = "${read2name}_screen.html"
    }
}
# Collects the per-sample QC results into one directory tree under
# /cromwell_root/tmp and runs MultiQC over it.
task multiqc {
    # Execution settings, consumed only by the runtime section below.
    String docker
    String cluster_config
    String disk_size

    # Files copied into the fastqc/ staging directory.
    Array[File] read1_zip
    # Files copied into the fastqscreen/ staging directory.
    Array[File] txt1
    # Archives unpacked with `tar -zxvf` into the bamqc/ staging directory.
    Array[File] zip

    command <<<
        set -e
        set -o pipefail

        # One staging directory per upstream tool.
        mkdir -p /cromwell_root/tmp/fastqc
        mkdir -p /cromwell_root/tmp/fastqscreen
        mkdir -p /cromwell_root/tmp/bamqc

        cp ${sep=" " read1_zip} /cromwell_root/tmp/fastqc
        cp ${sep=" " txt1} /cromwell_root/tmp/fastqscreen

        # Each archive is extracted with tar (gzip), regardless of its file name.
        for archive in ${sep=" " zip}; do
            tar -zxvf $archive -C /cromwell_root/tmp/bamqc
        done

        multiqc /cromwell_root/tmp/
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File multiqc_html = "multiqc_report.html"
        # Every file found directly inside multiqc_data/.
        Array[File] multiqc_txt = glob("multiqc_data/*")
    }
}
# Runs `qualimap bamqc` on one BAM file and packs the result directory into a
# single archive.
task qualimap {
    File bam
    # Not referenced in the command.
    # NOTE(review): presumably declared so the index is localized next to the
    # BAM - confirm.
    File bai
    # Name of the Qualimap output directory and prefix of the archive.
    String bamname = basename(bam,".bam")
    # Value passed to --java-mem-size; defaults to the previously hard-coded
    # 32G and can be overridden to match the machine type in cluster_config.
    String java_mem_size = "32G"
    String docker
    String cluster_config
    String disk_size

    command <<<
        set -o pipefail
        set -e
        nt=$(nproc)
        /opt/qualimap/qualimap bamqc -bam ${bam} -outformat PDF:HTML -nt $nt -outdir ${bamname} --java-mem-size=${java_mem_size}
        # The archive is a gzip-compressed tarball despite its ".zip" name.
        tar -zcvf ${bamname}_qualimap.zip ${bamname}
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File zip = "${bamname}_qualimap.zip"
    }
}
import "./tasks/fastqc.wdl" as fastqc | |||||
import "./tasks/fastqscreen.wdl" as fastqscreen | |||||
import "./tasks/qualimap.wdl" as qualimap | |||||
import "./tasks/multiqc.wdl" as multiqc | |||||
# Per-sample QC workflow: for every row of the sample sheet it runs FastQC and
# FastQ Screen on the read1 file and Qualimap on the BAM, then aggregates all
# results with MultiQC.
workflow {{ project_name }} {
    # Shared FastQ Screen inputs handed to every fastqscreen call.
    File screen_ref_dir
    File fastq_screen_conf

    # Declared as workflow inputs but not referenced by any call below.
    File ref_dir
    String fasta

    # Tab-separated sample sheet, one sample per row:
    #   column 0 -> read1, column 1 -> bam, column 2 -> bai
    File inputSamplesFile
    Array[Array[File]] inputSamples = read_tsv(inputSamplesFile)

    # docker, cluster_config and disk_size of each task are not set in the
    # calls; they are expected to come from the workflow inputs JSON.
    scatter (row in inputSamples) {
        call fastqc.fastqc as fastqc {
            input:
                read1 = row[0]
        }

        # NOTE(review): the task's read2 input is not supplied here.
        call fastqscreen.fastq_screen as fastqscreen {
            input:
                read1 = row[0],
                screen_ref_dir = screen_ref_dir,
                fastq_screen_conf = fastq_screen_conf
        }

        call qualimap.qualimap as qualimap {
            input:
                bam = row[1],
                bai = row[2]
        }
    }

    # Outputs of scattered calls are gathered into arrays, one entry per sample.
    call multiqc.multiqc as multiqc {
        input:
            read1_zip = fastqc.read1_zip,
            txt1 = fastqscreen.txt1,
            zip = qualimap.zip
    }
}