@@ -96,11 +96,99 @@ hap.py <truth_vcf> <query_vcf> -f <bed_file> --threads <threads> -o <output_file | |||
vbt mendelian -ref <fasta_file> -mother <family_merged_vcf> -father <family_merged_vcf> -child <family_merged_vcf> -pedigree <ped_file> -outDir <output_directory> -out-prefix <output_directory_prefix> --output-violation-regions -thread-count <threads> | |||
``` | |||
## App输入文件 | |||
```bash | |||
choppy samples WGS_germline_datapotal-latest --output samples | |||
``` | |||
#### Samples文件的输入包括
**inputSamplesFile** | |||
```bash | |||
#read1 #read2 #sample_name | |||
``` | |||
read1 是阿里云上fastq read1的地址 | |||
read2 是阿里云上fastq read2的地址 | |||
sample_name是指样本的命名 | |||
所有上传的文件应有规范的命名 | |||
Quartet_DNA_SequenceTech_SequenceMachine_SequenceSite_Sample_Replicate_Date.R1/R2.fastq.gz | |||
SequenceTech是指测序平台,如ILM、BGI等 | |||
SequenceMachine是指测序仪器,如XTen、Nova、Hiseq(Illumina)、SEQ500、SEQ1000(BGI)等 | |||
SequenceSite是指测序单位的英文缩写 | |||
Sample是指LCL5、LCL6、LCL7、LCL8 | |||
Replicate是指技术重复,从1开始依次增加 | |||
Date是指数据获得日期,格式为20200710 | |||
后缀一定是R1/R2.fastq.gz,不可以随意更改,R1/R2不可以写成r1/r2,fastq.gz不可以写成fq.gz | |||
各个缩写规范请见 https://fudan-pgx.yuque.com/docs/share/5baa851b-da97-47b9-b6c4-78f2b60595ab?# 《数据命名规范》 | |||
**project** | |||
这个项目的名称,可以写自己可以识别的字符串,只能写英文和数字,不可以写中文 | |||
#### Quartet样本的组合问题 | |||
##### 1. 没有测LCL5和LCL6,或者没有同时测LCL5和LCL6 | |||
只给出原始数据质控、比对数据质控、与标准集的比较 | |||
##### 2. 包含LCL5和LCL6同卵双胞胎的数据,但是父母的数据不全 | |||
只给出原始数据质控、比对数据质控、与标准集的比较、同卵双胞胎一致性 | |||
##### 3. 四个quartet样本都测了 | |||
给出所有结果原始数据质控、比对数据质控、与标准集的比较、同卵双胞胎一致性,符合孟德尔遗传比例 | |||
**注意**:本app假设每个批次测的技术重复都一样,如batch 1测了LCL5、LCL6、LCL7和LCL8,batch 2 和batch 3也都测了这四个样本。本app不解决特别复杂的问题,例如batch1测了LCL5,LCL6,batch2测了LCL7和LCL8,本app只能给出原始数据质控、比对数据质控、与标准集的比较,不会把多个批次的数据合并计算孟德尔符合率和姐妹一致性。 | |||
## App输出文件 | |||
本计算会产生大量的中间结果,这里说明最后整合好的结果文件,上述的三种情况得到的结果文件如下。
### 1. 没有测LCL5和LCL6,或者没有同时测LCL5和LCL6
#### 1.1 原始数据质量控制
#### 1.2 比对后数据质量控制
##### 输出目录 | |||
extract_multiqc | |||
##### 输出结果文件 | |||
qualimap.final.result.txt | |||
#### 1.3 突变检出数据质量控制
##### 与标准集进行比较
###### 输出目录
extract_multiqc
###### 输出结果文件
benchmark.final.result.txt
### 2. 包含LCL5和LCL6同卵双胞胎的数据,但是父母的数据不全
#### 2.1 原始数据质量控制 | |||
##### 输出目录 | |||
@@ -112,7 +200,7 @@ fastqc.final.result.txt | |||
fastqscreen.final.result.txt | |||
#### 2.2 比对后数据质量控制
##### 输出目录 | |||
@@ -122,19 +210,65 @@ extract_multiqc | |||
qualimap.final.result.txt | |||
#### 2.3 突变检出数据质量控制
##### 2.3.1 与标准集进行比较
###### 输出目录
extract_multiqc
###### 输出结果文件
benchmark.final.result.txt
##### 2.3.2 姐妹一致性
##### 输出目录 | |||
D5_D6 | |||
##### 输出结果文件 | |||
${project}.sister.txt | |||
### 3. 四个quartet样本都测了 | |||
#### 3.1 原始数据质量控制 | |||
##### 输出目录 | |||
extract_multiqc | |||
##### 输出结果文件 | |||
fastqc.final.result.txt | |||
fastqscreen.final.result.txt | |||
#### 3.2 比对后数据质量控制 | |||
##### 输出目录 | |||
extract_multiqc | |||
##### 输出结果文件 | |||
qualimap.final.result.txt | |||
#### 3.3 突变检出数据质量控制 | |||
##### 3.3.1 与标准集进行比较 | |||
###### 输出目录 | |||
extract_multiqc | |||
###### 输出结果文件 | |||
benchmark.final.result.txt | |||
##### 3.3.2 通过Quartet家系设计 | |||
###### 输出目录 | |||
@@ -221,7 +355,14 @@ FastQC和FastqScreen是两个常用的原始数据质量控制软件 | |||
| Precision | 查准率 | | |||
| Recall | 查全率 | | |||
##### 3.2 姐妹一致性 ${project}.sister.txt (样本没有测全,但是同时测了LCL5和LCL6)
| 列名 | 说明 | | |||
| --------------------- | ------------------------------------------------------------ | | |||
| Family | 家庭名字,我们目前的设计是4个Quartet样本,每个三个技术重复,family_1是指rep1的4个样本组成的家庭单位,以此类推。 | | |||
| Reproducibility_D5_D6 | Quartet-D5和Quartet-D6的一致性 | | |||
##### 3.3 Quartet家系关系评估 ${project}.mendelian.txt
| 列名 | 说明 | | |||
| ----------------------------- | ------------------------------------------------------------ | |
@@ -0,0 +1,60 @@ | |||
from __future__ import division | |||
import pandas as pd | |||
import sys, argparse, os | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to calculate reproducibility between Quartet_D5 and Quartet_D6s") | |||
parser.add_argument('-sister', '--sister', type=str, help='sister.txt', required=True) | |||
parser.add_argument('-project', '--project', type=str, help='project name', required=True) | |||
args = parser.parse_args() | |||
sister_file = args.sister | |||
project_name = args.project | |||
# output file | |||
output_name = project_name + '.sister.reproducibility.txt' | |||
output_file = open(output_name,'w') | |||
# input files | |||
sister_dat = pd.read_table(sister_file) | |||
sister_same = 0 | |||
sister_diff = 0 | |||
for row in sister_dat.itertuples(): | |||
# sister | |||
if row[5] == row[6]: | |||
if row[5] == './.': | |||
mendelian = 'noInfo' | |||
sister_count = "no" | |||
elif row[5] == '0/0': | |||
mendelian = 'Ref' | |||
sister_count = "no" | |||
else: | |||
mendelian = '1' | |||
sister_count = "yes_same" | |||
else: | |||
mendelian = '0' | |||
if (row[5] == './.' or row[5] == '0/0') and (row[6] == './.' or row[6] == '0/0'): | |||
sister_count = "no" | |||
else: | |||
sister_count = "yes_diff" | |||
if sister_count == 'yes_same': | |||
sister_same += 1 | |||
elif sister_count == 'yes_diff': | |||
sister_diff += 1 | |||
else: | |||
pass | |||
sister = sister_same/(sister_same + sister_diff) | |||
outcolumn = 'Project\tReproducibility_D5_D6\n' | |||
outResult = project_name + '\t' + str(sister) + '\n' | |||
output_file.write(outcolumn) | |||
output_file.write(outResult) | |||
@@ -0,0 +1,50 @@ | |||
import argparse
import os
import sys
from operator import itemgetter

import pandas as pd


def classify_batch(samples, reps):
    """Decide which analyses a batch supports from its sample composition.

    Looks at the samples belonging to the first replicate (the app assumes
    every replicate sequenced the same set of samples) and returns the pair
    ``(sister_tag, quartet_tag)`` as the strings ``'true'``/``'false'``:

    * sister_tag  -> LCL5 and LCL6 are both present (but not the full quartet)
    * quartet_tag -> all four of LCL5/LCL6/LCL7/LCL8 are present

    Parameters
    ----------
    samples : sequence of str
        Sample name (LCL5..LCL8) of each VCF, parallel to ``reps``.
    reps : sequence of str
        Replicate identifier of each VCF.
    """
    samples = list(samples)
    reps = list(reps)
    sister_tag = 'false'
    quartet_tag = 'false'

    # First replicate id in input order. The original used set(), whose
    # iteration order is arbitrary; dict.fromkeys keeps it deterministic.
    first_rep = next(iter(dict.fromkeys(reps)), None)
    # List comprehension instead of itemgetter(*idx)(...): itemgetter with a
    # single index returns a bare string, making len() count characters.
    batch_samples = [s for s, r in zip(samples, reps) if r == first_rep]

    num = len(batch_samples)
    if num == 2:
        if set(batch_samples) == {'LCL5', 'LCL6'}:
            sister_tag = 'true'
    elif num == 3:
        if 'LCL5' in batch_samples and 'LCL6' in batch_samples:
            sister_tag = 'true'
    elif num == 4:
        if set(batch_samples) == {'LCL5', 'LCL6', 'LCL7', 'LCL8'}:
            quartet_tag = 'true'
    return sister_tag, quartet_tag


def main():
    """Read sample/replicate lists and write the sister_tag/quartet_tag files."""
    parser = argparse.ArgumentParser(description="This script is to get how many samples")
    parser.add_argument('-sample', '--sample', type=str, help='quartet_sample', required=True)
    parser.add_argument('-rep', '--rep', type=str, help='quartet_rep', required=True)
    args = parser.parse_args()

    quartet_sample = list(pd.read_table(args.sample, header=None)[0])
    # BUG FIX: the original called pd.read_table(rep.header=None), which is a
    # syntax error; the intent was read_table(rep, header=None).
    quartet_rep = list(pd.read_table(args.rep, header=None)[0])

    sister_tag, quartet_tag = classify_batch(quartet_sample, quartet_rep)

    # Downstream WDL read_boolean() consumes these single-word files.
    with open('sister_tag', 'w') as sister_outfile:
        sister_outfile.write(sister_tag)
    with open('quartet_tag', 'w') as quartet_outfile:
        quartet_outfile.write(quartet_tag)


if __name__ == '__main__':
    main()
@@ -0,0 +1,49 @@ | |||
# Compute LCL5/LCL6 (Quartet-D5/D6) genotype reproducibility per replicate.
task D5_D6 {
	Array[File] splited_vcf
	String project
	String docker
	String cluster_config
	String disk_size

	command <<<
		mkdir -p /cromwell_root/tmp/vcf
		cp ${sep=" " splited_vcf} /cromwell_root/tmp/vcf
		# Pair every LCL5 VCF with the LCL6 VCF of the same replicate.
		# NOTE(review): fields 7/8 of the underscore-split path assume the
		# Quartet file-naming convention (..._Sample_Replicate_...) — confirm
		# against the upstream split_gvcf_files naming.
		for i in /cromwell_root/tmp/vcf/*vcf
		do
			for j in /cromwell_root/tmp/vcf/*vcf
			do
				sample_i=$(echo $i | cut -f7 -d_)
				sample_j=$(echo $j | cut -f7 -d_)
				rep_i=$(echo $i | cut -f8 -d_)
				rep_j=$(echo $j | cut -f8 -d_)
				# BUG FIX: the original condition mixed bracket styles
				# (`[[ ... ] && [ ... ] && [ ... ]]`), which is a bash syntax
				# error; use three balanced single-bracket tests instead.
				if [ "$sample_i" == "LCL5" ] && [ "$sample_j" == "LCL6" ] && [ "$rep_i" == "$rep_j" ]; then
					# Keep CHROM/POS/REF/ALT plus the GT of LCL5; GT only for LCL6.
					cat $i | grep -v '##' | cut -f1,2,4,5,10 | cut -d ':' -f1 > LCL5.txt
					cat $j | grep -v '##' | cut -f10 | cut -d ':' -f1 > LCL6.txt
					paste LCL5.txt LCL6.txt > sister.txt
					python /opt/D5_D6.py -sister sister.txt -project ${project}.$rep_i
				fi
			done
		done
		# Collect the result line (line 2) of each per-replicate file into one summary.
		for i in *.reproducibility.txt
		do
			cat $i | sed -n '2,2p' >> sister.summary
		done
		sed '1i\Family\tReproducibility_D5_D6' sister.summary > ${project}.sister.txt
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		Array[File] sister_file = glob("*.reproducibility.txt")
		File sister_summary = "${project}.sister.txt"
	}
}
@@ -1,42 +0,0 @@ | |||
# Somatic SNV/indel calling with Sentieon TNscope on a co-realigned
# tumor/normal BAM pair.
task TNscope {
	File ref_dir
	File dbsnp_dir
	String SENTIEON_INSTALL_DIR
	String tumor_name
	String normal_name
	String docker
	String cluster_config
	String fasta
	File corealigner_bam
	File corealigner_bam_index
	String dbsnp
	String disk_size

	command <<<
		set -o pipefail
		set -e
		export SENTIEON_LICENSE=192.168.0.55:8990
		nt=$(nproc)
		# BUG FIX: the original named the output "${sample}.TNscope.TN.vcf",
		# but `sample` is not a declared input, so the task could never be
		# instantiated; the output is now keyed on the declared tumor_name.
		${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${corealigner_bam} --algo TNscope --tumor_sample ${tumor_name} --normal_sample ${normal_name} --dbsnp ${dbsnp_dir}/${dbsnp} ${tumor_name}.TNscope.TN.vcf
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File TNscope_vcf = "${tumor_name}.TNscope.TN.vcf"
		File TNscope_vcf_index = "${tumor_name}.TNscope.TN.vcf.idx"
	}
}
@@ -1,43 +0,0 @@ | |||
# Somatic SNV/indel calling with Sentieon TNhaplotyper (TNseq) on a
# co-realigned tumor/normal BAM pair.
task TNseq {
	File ref_dir
	File dbsnp_dir
	String SENTIEON_INSTALL_DIR
	String tumor_name
	String normal_name
	String docker
	String cluster_config
	String fasta
	File corealigner_bam
	File corealigner_bam_index
	String dbsnp
	String disk_size

	command <<<
		set -o pipefail
		set -e
		export SENTIEON_LICENSE=192.168.0.55:8990
		nt=$(nproc)
		# BUG FIX: the original named the output "${sample}.TNseq.TN.vcf",
		# but `sample` is not a declared input, so the task could never be
		# instantiated; the output is now keyed on the declared tumor_name.
		${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${corealigner_bam} --algo TNhaplotyper --tumor_sample ${tumor_name} --normal_sample ${normal_name} --dbsnp ${dbsnp_dir}/${dbsnp} ${tumor_name}.TNseq.TN.vcf
	>>>

	runtime {
		docker: docker
		cluster: cluster_config
		systemDisk: "cloud_ssd 40"
		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
	}

	output {
		File TNseq_vcf = "${tumor_name}.TNseq.TN.vcf"
		File TNseq_vcf_index = "${tumor_name}.TNseq.TN.vcf.idx"
	}
}
@@ -1,31 +0,0 @@ | |||
# NOTE(review): this task looks like an unfinished stub — verify before use.
task joint_variant_calling {
File merged_bed
String sample
String docker
String disk_size
String cluster_config
command <<<
# NOTE(review): REFERENCE, s1/s2/s3_VARIANT_GVCF and VARIANT_VCF are
# placeholder tokens (as in the Sentieon manual), not declared task inputs;
# the shell would pass them through literally and the call would fail.
sentieon driver -r REFERENCE --algo GVCFtyper -v s1_VARIANT_GVCF \ -v s2_VARIANT_GVCF -v s3_VARIANT_GVCF VARIANT_VCF
# NOTE(review): nothing in this command produces the two bed files declared
# below, so output collection would also fail — confirm intended behavior.
>>>
runtime {
docker:docker
cluster:cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
File consensus_bed = "${sample}.27consensus.bed"
File filtered_bed = "${sample}.filtered.bed"
}
}
@@ -0,0 +1,56 @@ | |||
# Build one merged four-sample "family" VCF per replicate by pasting the
# genotype columns of the per-sample LCL5/LCL6/LCL7/LCL8 VCFs together.
task merge_family {
Array[File] splited_vcf
String project
String docker
String cluster_config
String disk_size
command <<<
mkdir -p /cromwell_root/tmp/vcf
cp ${sep=" " splited_vcf} /cromwell_root/tmp/vcf
# Quadruple loop over the same file list: for each replicate it finds the
# (LCL5, LCL6, LCL7, LCL8) combination whose replicate ids all match.
# O(n^4) in the number of VCFs, but n is small (4 samples x replicates).
for a in /cromwell_root/tmp/vcf/*vcf
do
for b in /cromwell_root/tmp/vcf/*vcf
do
for c in /cromwell_root/tmp/vcf/*vcf
do
for d in /cromwell_root/tmp/vcf/*vcf
do
# NOTE(review): fields 7/8 of the underscore-split path assume the Quartet
# file-naming convention (..._Sample_Replicate_...) — confirm upstream naming.
sample_a=$(echo $a | cut -f7 -d_)
sample_b=$(echo $b | cut -f7 -d_)
sample_c=$(echo $c | cut -f7 -d_)
sample_d=$(echo $d | cut -f7 -d_)
rep_a=$(echo $a | cut -f8 -d_)
rep_b=$(echo $b | cut -f8 -d_)
rep_c=$(echo $c | cut -f8 -d_)
rep_d=$(echo $d | cut -f8 -d_)
if [ $sample_a == "LCL5" ] && [ $sample_b == "LCL6" ] && [ $sample_c == "LCL7" ] && [ $sample_d == "LCL8" ] && [ $rep_a == $rep_b ] && [ $rep_c == $rep_d ] && [ $rep_b == $rep_c ];then
# LCL5 contributes all columns; the other three contribute only their
# genotype column (field 10).
cat $a | grep -v '#' > LCL5.body
cat $b | grep -v '#' | cut -f 10 > LCL6.body
cat $c | grep -v '#' | cut -f 10 > LCL7.body
cat $d | grep -v '#' | cut -f 10 > LCL8.body
# Rebuild the header: LCL5's '##' meta lines plus a fixed #CHROM line that
# names the four merged sample columns.
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5\tLCL6\tLCL7\tLCL8" > header_name
cat $a | grep '##' | cat - header_name > header
paste LCL5.body LCL6.body LCL7.body LCL8.body > family.body
cat header family.body > ${project}.$rep_a.family.vcf
fi
done
done
done
done
>>>
runtime {
docker:docker
cluster: cluster_config
systemDisk: "cloud_ssd 40"
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
}
output {
# One merged VCF per replicate, collected by glob.
Array[File] family_vcf = glob("*.family.vcf")
}
}
@@ -11,14 +11,14 @@ task merge_mendelian { | |||
cat ${D5_trio_vcf} | grep -v '##' > ${family_name}.D5.txt | |||
cat ${D6_trio_vcf} | grep -v '##' > ${family_name}.D6.txt | |||
cat ${family_vcf} | grep -v '##' | awk ' | |||
BEGIN { OFS = "\t" } | |||
NF > 2 && FNR > 1 { | |||
for ( i=9; i<=NF; i++ ) { | |||
split($i,a,":") ;$i = a[1]; | |||
} | |||
} | |||
{ print } | |||
' > ${family_name}.consensus.txt | |||
BEGIN { OFS = "\t" } | |||
NF > 2 && FNR > 1 { | |||
for ( i=9; i<=NF; i++ ) { | |||
split($i,a,":") ;$i = a[1]; | |||
} | |||
} | |||
{ print } | |||
' > ${family_name}.consensus.txt | |||
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.txt -LCL6 ${family_name}.D6.txt -genotype ${family_name}.consensus.txt -family ${family_name} | |||
>>> | |||
@@ -29,7 +29,7 @@ task merge_mendelian { | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File project_mendelian = "${family_name}.mendelian.txt" | |||
File project_mendelian_summary = "${family_name}.mendelian.summary.txt" | |||
File project_mendelian = "${family_name}.txt" | |||
File project_mendelian_summary = "${family_name}.summary.txt" | |||
} | |||
} |
@@ -31,10 +31,10 @@ task multiqc { | |||
multiqc /cromwell_root/tmp/ | |||
cp multiqc_data/multiqc_general_stats.txt > multiqc_general_stats.txt | |||
cp multiqc_data/multiqc_fastqc.txt > multiqc_fastqc.txt | |||
cp multiqc_data/multiqc_fastq_screen.txt > multiqc_fastq_screen.txt | |||
cp multiqc_data/multiqc_happy_data.json > multiqc_happy_data.json | |||
cat multiqc_data/multiqc_general_stats.txt > multiqc_general_stats.txt | |||
cat multiqc_data/multiqc_fastqc.txt > multiqc_fastqc.txt | |||
cat multiqc_data/multiqc_fastq_screen.txt > multiqc_fastq_screen.txt | |||
cat multiqc_data/multiqc_happy_data.json > multiqc_happy_data.json | |||
>>> | |||
@@ -1,4 +1,4 @@ | |||
task split_gvcf_files { | |||
task split_gvcf_files { | |||
File gvcf | |||
String project | |||
String docker | |||
@@ -18,7 +18,6 @@ task split_gvcf_files { | |||
cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' > body.filtered | |||
cat header body.filtered > ${project}.filtered.g.vcf | |||
for i in $(seq 10 $ncol); do cat ${project}.filtered.g.vcf | cut -f1-9,$i > $i.splited.vcf; done | |||
ls *splited.vcf | sort -n | paste - name > rename | |||
@@ -26,62 +25,13 @@ task split_gvcf_files { | |||
cat rename | while read a b | |||
do | |||
mv $a $b.splited.vcf | |||
if [[ $b.vcf =~ "LCL5_1" ]];then | |||
cp $b.splited.vcf ${project}.LCL5_1.vcf | |||
elif [[ $b.vcf =~ "LCL5_2" ]]; then | |||
cp $b.splited.vcf ${project}.LCL5_2.vcf | |||
elif [[ $b.vcf =~ "LCL5_3" ]]; then | |||
cp $b.splited.vcf ${project}.LCL5_3.vcf | |||
elif [[ $b.vcf =~ "LCL6_1" ]]; then | |||
cp $b.splited.vcf ${project}.LCL6_1.vcf | |||
elif [[ $b.vcf =~ "LCL6_2" ]]; then | |||
cp $b.splited.vcf ${project}.LCL6_2.vcf | |||
elif [[ $b.vcf =~ "LCL6_3" ]]; then | |||
cp $b.splited.vcf ${project}.LCL6_3.vcf | |||
elif [[ $b.vcf =~ "LCL7_1" ]]; then | |||
cp $b.splited.vcf ${project}.LCL7_1.vcf | |||
elif [[ $b.vcf =~ "LCL7_2" ]]; then | |||
cp $b.splited.vcf ${project}.LCL7_2.vcf | |||
elif [[ $b.vcf =~ "LCL7_3" ]]; then | |||
cp $b.splited.vcf ${project}.LCL7_3.vcf | |||
elif [[ $b.vcf =~ "LCL8_1" ]]; then | |||
cp $b.splited.vcf ${project}.LCL8_1.vcf | |||
elif [[ $b.vcf =~ "LCL8_2" ]]; then | |||
cp $b.splited.vcf ${project}.LCL8_2.vcf | |||
elif [[ $b.vcf =~ "LCL8_3" ]]; then | |||
cp $b.splited.vcf ${project}.LCL8_3.vcf | |||
fi | |||
sample=$(echo $b | cut -f6 -d_) | |||
rep=$(echo $b | cut -f7 -d_) | |||
echo $sample >> quartet_sample | |||
echo $rep >> quartet_rep | |||
done | |||
cat ${project}.LCL5_1.vcf | grep -v '#' > LCL5_1.body | |||
cat ${project}.LCL5_2.vcf | grep -v '#' > LCL5_2.body | |||
cat ${project}.LCL5_3.vcf | grep -v '#' > LCL5_3.body | |||
cat ${project}.LCL6_1.vcf | grep -v '#' | cut -f 10 > LCL6_1.body | |||
cat ${project}.LCL6_2.vcf | grep -v '#' | cut -f 10 > LCL6_2.body | |||
cat ${project}.LCL6_3.vcf | grep -v '#' | cut -f 10 > LCL6_3.body | |||
cat ${project}.LCL7_1.vcf | grep -v '#' | cut -f 10 > LCL7_1.body | |||
cat ${project}.LCL7_2.vcf | grep -v '#' | cut -f 10 > LCL7_2.body | |||
cat ${project}.LCL7_3.vcf | grep -v '#' | cut -f 10 > LCL7_3.body | |||
cat ${project}.LCL8_1.vcf | grep -v '#' | cut -f 10 > LCL8_1.body | |||
cat ${project}.LCL8_2.vcf | grep -v '#' | cut -f 10 > LCL8_2.body | |||
cat ${project}.LCL8_3.vcf | grep -v '#' | cut -f 10 > LCL8_3.body | |||
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5\tLCL6\tLCL7\tLCL8" > header_name | |||
cat ${project}.LCL5_1.vcf | grep '##' | cat - header_name > header | |||
paste LCL5_1.body LCL6_1.body LCL7_1.body LCL8_1.body > family_1.body | |||
paste LCL5_2.body LCL6_2.body LCL7_2.body LCL8_2.body > family_2.body | |||
paste LCL5_3.body LCL6_3.body LCL7_3.body LCL8_3.body > family_3.body | |||
cat header family_1.body > ${project}.1.family.vcf | |||
cat header family_2.body > ${project}.2.family.vcf | |||
cat header family_3.body > ${project}.3.family.vcf | |||
python /opt/how_many_samples.py -sample quartet_sample -rep quartet_rep | |||
>>> | |||
@@ -94,18 +44,7 @@ task split_gvcf_files { | |||
} | |||
output { | |||
Array[File] splited_vcf = glob("*.splited.vcf") | |||
Array[File] family_vcf = glob("*.family.vcf") | |||
File LCL5_1 = "${project}.LCL5_1.vcf" | |||
File LCL5_2 = "${project}.LCL5_2.vcf" | |||
File LCL5_3 = "${project}.LCL5_3.vcf" | |||
File LCL6_1 = "${project}.LCL6_1.vcf" | |||
File LCL6_2 = "${project}.LCL6_2.vcf" | |||
File LCL6_3 = "${project}.LCL6_3.vcf" | |||
File LCL7_1 = "${project}.LCL7_1.vcf" | |||
File LCL7_2 = "${project}.LCL7_2.vcf" | |||
File LCL7_3 = "${project}.LCL7_3.vcf" | |||
File LCL8_1 = "${project}.LCL8_1.vcf" | |||
File LCL8_2 = "${project}.LCL8_2.vcf" | |||
File LCL8_3 = "${project}.LCL8_3.vcf" | |||
File sister_tag = "sister_tag" | |||
File quartet_tag = "quartet_tag" | |||
} | |||
} |
@@ -16,6 +16,9 @@ import "./tasks/fastqc.wdl" as fastqc | |||
import "./tasks/fastqscreen.wdl" as fastqscreen | |||
import "./tasks/qualimap.wdl" as qualimap | |||
import "./tasks/extract_multiqc.wdl" as extract_multiqc | |||
import "./tasks/D5_D6.wdl" as D5_D6 | |||
import "./tasks/merge_family.wdl" as merge_family | |||
workflow {{ project_name }} { | |||
@@ -191,12 +194,13 @@ workflow {{ project_name }} { | |||
call split_gvcf_files.split_gvcf_files as split_gvcf_files { | |||
input: | |||
gvcf=GVCFtyper.gvcf, | |||
docker=BENCHMARKdocker, | |||
docker=DIYdocker, | |||
project=project, | |||
cluster_config=SMALLcluster_config, | |||
disk_size=disk_size | |||
} | |||
Array[File] single_gvcf = split_gvcf_files.splited_vcf | |||
scatter (idx in range(length(single_gvcf))) { | |||
@@ -236,39 +240,62 @@ workflow {{ project_name }} { | |||
disk_size=disk_size | |||
} | |||
Array[File] family_vcfs = split_gvcf_files.family_vcf | |||
Boolean sister_tag = read_boolean(split_gvcf_files.sister_tag) | |||
Boolean quartet_tag = read_boolean(split_gvcf_files.quartet_tag) | |||
scatter (idx in range(length(family_vcfs))) { | |||
call mendelian.mendelian as mendelian { | |||
if (sister_tag) { | |||
call D5_D6.D5_D6 as D5_D6 { | |||
input: | |||
family_vcf=family_vcfs[idx], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=MENDELIANdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size | |||
splited_vcf=split_gvcf_files.splited_vcf, | |||
project=project, | |||
docker=DIYdocker, | |||
cluster_config=SMALLcluster_config, | |||
disk_size=disk_size, | |||
} | |||
} | |||
call merge_mendelian.merge_mendelian as merge_mendelian { | |||
if (quartet_tag) { | |||
call merge_family.merge_family as merge_family { | |||
input: | |||
D5_trio_vcf=mendelian.D5_trio_vcf, | |||
D6_trio_vcf=mendelian.D6_trio_vcf, | |||
family_vcf=family_vcfs[idx], | |||
splited_vcf=split_gvcf_files.splited_vcf, | |||
project=project, | |||
docker=DIYdocker, | |||
cluster_config=SMALLcluster_config, | |||
disk_size=disk_size | |||
disk_size=disk_size, | |||
} | |||
} | |||
call quartet_mendelian.quartet_mendelian as quartet_mendelian { | |||
input: | |||
project_mendelian_summary=merge_mendelian.project_mendelian_summary, | |||
project=project, | |||
docker=DIYdocker, | |||
cluster_config=SMALLcluster_config, | |||
disk_size=disk_size | |||
} | |||
Array[File] family_vcfs = merge_family.family_vcf | |||
scatter (idx in range(length(family_vcfs))) { | |||
call mendelian.mendelian as mendelian { | |||
input: | |||
family_vcf=family_vcfs[idx], | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=MENDELIANdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size | |||
} | |||
call merge_mendelian.merge_mendelian as merge_mendelian { | |||
input: | |||
D5_trio_vcf=mendelian.D5_trio_vcf, | |||
D6_trio_vcf=mendelian.D6_trio_vcf, | |||
family_vcf=family_vcfs[idx], | |||
docker=DIYdocker, | |||
cluster_config=SMALLcluster_config, | |||
disk_size=disk_size | |||
} | |||
} | |||
call quartet_mendelian.quartet_mendelian as quartet_mendelian { | |||
input: | |||
project_mendelian_summary=merge_mendelian.project_mendelian_summary, | |||
project=project, | |||
docker=DIYdocker, | |||
cluster_config=SMALLcluster_config, | |||
disk_size=disk_size | |||
} | |||
} | |||
} | |||