from __future__ import division | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
# Genotype calls that carry no variant evidence for a sample.
_UNINFORMATIVE_GENOTYPES = ('./.', '0/0')

# Positions picked from the doubly merged table: the two merge keys, columns
# 22-27 (renamed REF, ALT, LCL5, LCL6, LCL7, LCL8) and columns 7 / 17 (renamed
# TRIO5 / TRIO6).
# NOTE(review): these offsets presuppose 12-column LCL5/LCL6 inputs followed by
# a 6-column (besides the keys) genotype table -- confirm against upstream.
_SELECTED_COLUMNS = [0, 1, 22, 23, 24, 25, 26, 27, 7, 17]
_COLUMN_NAMES = ['CHROM', 'POS', 'REF', 'ALT', 'LCL5', 'LCL6', 'LCL7', 'LCL8', 'TRIO5', 'TRIO6']
_MERGE_KEYS = ['#CHROM', 'POS']


def _build_parser():
    """Return the command-line parser (options unchanged from the original script)."""
    parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information")
    parser.add_argument('-LCL5', '--LCL5', type=str, help='LCL5 family info', required=True)
    parser.add_argument('-LCL6', '--LCL6', type=str, help='LCL6 family info', required=True)
    parser.add_argument('-genotype', '--genotype', type=str, help='Genotype information of a set of four family members', required=True)
    parser.add_argument('-family', '--family', type=str, help='family name', required=True)
    return parser


def _sister_status(lcl5, lcl6):
    """Compare the two sisters' genotypes.

    Returns a ``(tag, sister_count)`` pair: ``tag`` is the first component of
    the mendelian string and ``sister_count`` says whether and how the site
    counts towards sister consistency.
    """
    if lcl5 == lcl6:
        if lcl5 == './.':
            return 'noInfo', 'no'
        if lcl5 == '0/0':
            return 'Ref', 'no'
        return '1', 'yes_same'
    # Genotypes differ; the site is only counted when at least one sister
    # carries an informative call.
    if lcl5 in _UNINFORMATIVE_GENOTYPES and lcl6 in _UNINFORMATIVE_GENOTYPES:
        return '0', 'no'
    return '0', 'yes_diff'


def _trio_status(child, lcl7, lcl8, trio_info):
    """Return the mendelian tag of one child/LCL7/LCL8 trio.

    ``trio_info`` is the trio annotation string; when the three genotypes are
    not uniformly missing or reference, the text after its first '=' is used,
    or 'unVBT' when the annotation is absent (null).
    """
    if child == lcl7 == lcl8 == './.':
        return 'noInfo'
    if child == lcl7 == lcl8 == '0/0':
        return 'Ref'
    if pd.isnull(trio_info):
        return 'unVBT'
    return trio_info.split('=')[1]


def _format_row(row):
    """Build the tab-separated output line for one merged row."""
    sister_tag, sister_count = _sister_status(row.LCL5, row.LCL6)
    mendelian = ':'.join([
        sister_tag,
        _trio_status(row.LCL5, row.LCL7, row.LCL8, row.TRIO5),
        _trio_status(row.LCL6, row.LCL7, row.LCL8, row.TRIO6),
    ])
    # A site is left out of the family count only when all four members are
    # missing or homozygous reference.
    family_genotypes = (row.LCL5, row.LCL6, row.LCL7, row.LCL8)
    if all(gt in _UNINFORMATIVE_GENOTYPES for gt in family_genotypes):
        mendelian_count = "no"
    else:
        mendelian_count = "yes"
    fields = [
        row.CHROM, str(row.POS), row.REF, row.ALT,
        row.LCL5, row.LCL6, row.LCL7, row.LCL8,
        str(row.TRIO5), str(row.TRIO6),
        mendelian, mendelian_count, sister_count,
    ]
    return '\t'.join(fields) + '\n'


def main(argv=None):
    """Merge both trio tables with the genotype table and write ``<family>.txt``.

    ``argv`` is the argument list to parse; ``None`` means ``sys.argv[1:]``.
    Each output line holds CHROM, POS, REF, ALT, the four genotypes, the two
    trio annotations, the combined mendelian tag, and the family / sister
    counting flags.
    """
    args = _build_parser().parse_args(argv)

    # input files
    lcl5_dat = pd.read_table(args.LCL5)
    lcl6_dat = pd.read_table(args.LCL6)
    genotype_dat = pd.read_table(args.genotype)

    merged_df = pd.merge(lcl5_dat, lcl6_dat, how='outer', left_on=_MERGE_KEYS, right_on=_MERGE_KEYS)
    merged_genotype_df = pd.merge(merged_df, genotype_dat, how='outer', left_on=_MERGE_KEYS, right_on=_MERGE_KEYS)
    # Copy before renaming so the column assignment does not act on a view of
    # the merged frame.
    merged_genotype_df_sub = merged_genotype_df.iloc[:, _SELECTED_COLUMNS].copy()
    merged_genotype_df_sub.columns = _COLUMN_NAMES

    # output file; the context manager guarantees it is flushed and closed
    with open(args.family + '.txt', 'w') as family_file:
        for row in merged_genotype_df_sub.itertuples():
            family_file.write(_format_row(row))


if __name__ == '__main__':
    main()
{ | { | ||||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | ||||
"{{ project_name }}.two_family_merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4", | |||||
"{{ project_name }}.family_name": "{{ family_name }}", | |||||
"{{ project_name }}.sister.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||||
"{{ project_name }}.disk_size": "150", | "{{ project_name }}.disk_size": "150", | ||||
"{{ project_name }}.merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4", | |||||
"{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}", | "{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}", | ||||
"{{ project_name }}.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||||
"{{ project_name }}.cluster_config": "OnDemand bcs.b4.xlarge img-ubuntu-vpc", | "{{ project_name }}.cluster_config": "OnDemand bcs.b4.xlarge img-ubuntu-vpc", | ||||
"{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/" | "{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/" | ||||
} | |||||
} |
# Runs the variant quality/location extraction script on an annotated VCF and
# splits the resulting table into one file per chromosome (chr1-chr22, chrX),
# each prefixed with the table's '#CHROM' header line.
task VCFinfo {
    File repeat_annotated_vcf
    String sample
    String docker
    String cluster_config
    String disk_size

    command <<<
        python /opt/variants_quality_location_intergration.py -vcf ${repeat_annotated_vcf} -prefix ${sample}
        grep '#CHROM' ${sample}_variant_quality_location.txt > header
        for i in chr1 chr2 chr3 chr4 chr5 chr6 chr7 chr8 chr9 chr10 chr11 chr12 chr13 chr14 chr15 chr16 chr17 chr18 chr19 chr20 chr21 chr22 chrX
        do
            # Match the chromosome on the first field only; a whole-line
            # `grep -w` would also pick up rows that merely mention the name
            # in another column.
            # NOTE(review): assumes the chromosome is the first
            # whitespace-delimited field, as the '#CHROM' header suggests.
            awk -v chrom="$i" '$1 == chrom' ${sample}_variant_quality_location.txt | cat header - > ${sample}.$i.vcfInfo.txt
        done
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        # Full table written by the extraction script.
        File extracted_info = "${sample}_variant_quality_location.txt"
        # One header-prefixed table per chromosome.
        Array[File] chromo_vcfInfo = glob("*.vcfInfo.txt")
    }
}
# Relabels the MOTHER / FATHER / CHILD samples of a trio VCF using
# `rtg vcfannotate --relabel`.
task VCFrename {
    # Trio VCF and its index (the index is only localized, not referenced).
    File trio_vcf_gz
    File trio_vcf_idx

    # New sample labels; parents get the child tag appended.
    String mother_name
    String father_name
    String child_name
    String child
    String family_name

    String docker
    String cluster_config
    String disk_size

    command <<<
        # Relabel table: one "old-name new-name" pair per line.
        printf 'MOTHER %s\nFATHER %s\nCHILD %s\n' "${mother_name}.${child}" "${father_name}.${child}" "${child_name}" > rename.txt
        rtg vcfannotate -i ${trio_vcf_gz} -o ${family_name}.${child}.rename.vcf.gz --relabel=rename.txt
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File rename_trio_vcf_gz = "${family_name}.${child}.rename.vcf.gz"
        File rename_trio_vcf_idx = "${family_name}.${child}.rename.vcf.gz.tbi"
    }
}
# Annotates a merged VCF with the regions of a BED file (`rtg vcfannotate
# --bed-info`) and leaves the result as an uncompressed VCF.
task bed_annotation {
    # Merged VCF and its index (the index is only localized).
    File merged_vcf_gz
    File merged_vcf_idx
    # BED file passed to --bed-info.
    File repeat_bed
    String sample

    String docker
    String cluster_config
    String disk_size

    command <<<
        rtg vcfannotate -i ${merged_vcf_gz} -o ${sample}.mendelian.merged.repeatAnno.vcf.gz --bed-info=${repeat_bed}
        # Downstream consumers read the plain-text VCF.
        gzip -d ${sample}.mendelian.merged.repeatAnno.vcf.gz
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File repeat_annotated_vcf = "${sample}.mendelian.merged.repeatAnno.vcf"
    }
}
# Runs FinalResult2VCF.py on the extracted VCF information and the mendelian
# annotation table, producing the benchmarking-call VCF and the all-sample
# information VCF.
task FinalResult {
    File extracted_info
    File annotated_txt
    # Output prefix: the annotation file name without ".mendelian.txt".
    String prefix = basename(annotated_txt, ".mendelian.txt")
    String sample

    String docker
    String cluster_config
    String disk_size

    command <<<
        python /opt/FinalResult2VCF.py \
            -vcfInfo ${extracted_info} \
            -mendelianInfo ${annotated_txt} \
            -prefix ${prefix} \
            -sample ${sample}
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File benchmarking_calls = "${prefix}_benchmarking_calls.vcf"
        File all_info = "${prefix}_all_sample_information.vcf"
    }
}
# Runs VBT mendelian analysis on one trio: LCL8 is passed as the mother,
# LCL7 as the father, plus one child.
task mendelian {
    # Per-sample VCFs.
    File child_vcf
    File LCL7_vcf
    File LCL8_vcf

    # Sample names (only child_name is used by the command).
    String child_name
    String LCL7_name
    String LCL8_name

    # Reference directory and the FASTA file name inside it.
    File ref_dir
    String fasta

    String docker
    String cluster_config
    String disk_size

    command <<<
        export LD_LIBRARY_PATH=/opt/htslib-1.9
        # Use every core the instance offers.
        nt=$(nproc)
        mkdir VBT
        /opt/VBT-TrioAnalysis/vbt mendelian \
            -ref ${ref_dir}/${fasta} \
            -mother ${LCL8_vcf} \
            -father ${LCL7_vcf} \
            -child ${child_vcf} \
            -outDir VBT \
            -out-prefix ${child_name}.family \
            --output-violation-regions \
            -thread-count $nt
        # Expose the trio VCF under a stable name in the working directory.
        cp VBT/${child_name}.family_trio.vcf ${child_name}.family.vcf
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        Array[File] vbt_mendelian = glob("VBT/*")
        File trio_vcf = "${child_name}.family.vcf"
    }
}
task merge { | task merge { | ||||
Array[File] family_vcf_gz | |||||
Array[File] family_vcf_idx | |||||
String sample | |||||
Array[File] family_mendelian_info | |||||
String family_name | |||||
String docker | String docker | ||||
String cluster_config | String cluster_config | ||||
String disk_size | String disk_size | ||||
command <<< | command <<< | ||||
rtg vcfmerge --force-merge-all -o ${sample}.merged.vcf.gz ${sep=" " family_vcf_gz} | |||||
cat ${sep=" " family_mendelian_info} | sort -k1,1 -k2,2n > ${family_name}.mendelian.txt | |||||
zcat ${sample}.merged.vcf.gz | grep -v '#' | cut -f1-2 | sed s'/\t/_/g' | sort | uniq -c | sed 's/\s\+/\t/g' | awk '{ if ($1 != 1) { print } }' | cut -f3 > ${sample}.vcf_dup.txt | |||||
cat ${family_name}.mendelian.txt | cut -f13 | sort | uniq -c > ${family_name}.sister.reproducibility.txt | |||||
cat ${family_name}.mendelian.txt | cut -f11 | sort | uniq -c | grep '1:1:1' > ${family_name}.mendelian.txt | |||||
cat ${family_name}.mendelian.txt | cut -f11 | sort | uniq -c | grep 'Ref:1:1' >> ${family_name}.mendelian.txt | |||||
cat ${family_name}.mendelian.txt | cut -f12 | sort | uniq -c | grep 'yes' >> ${family_name}.mendelian.txt | |||||
>>> | >>> | ||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | ||||
} | } | ||||
output { | output { | ||||
File merged_vcf_gz = "${sample}.merged.vcf.gz" | |||||
File merged_vcf_idx = "${sample}.merged.vcf.gz.tbi" | |||||
File vcf_dup = "${sample}.vcf_dup.txt" | |||||
File family_all_info = "${family_name}.mendelian.txt" | |||||
File sister_consistency = "${family_name}.sister.reproducibility.txt" | |||||
File family_mendelian = "${family_name}.mendelian.txt" | |||||
} | } | ||||
} | } |
# Merges the LCL5 and LCL6 trio VCFs in both orders and appends column 8 of
# the LCL5-first merge to every non-'##' line of the LCL6-first merge.
task mergeSister {
    # Trio VCFs with their indexes (indexes are only localized).
    File LCL5_trio_vcf_gz
    File LCL5_trio_vcf_idx
    File LCL6_trio_vcf_gz
    File LCL6_trio_vcf_idx
    String family_name

    String docker
    String cluster_config
    String disk_size

    command <<<
        # Two merges that differ only in input order.
        rtg vcfmerge -o LCL5.LCL6.merged.vcf.gz ${LCL5_trio_vcf_gz} ${LCL6_trio_vcf_gz}
        rtg vcfmerge -o LCL6.LCL5.merged.vcf.gz ${LCL6_trio_vcf_gz} ${LCL5_trio_vcf_gz}

        # Meta lines come from the LCL5-first merge.
        zcat LCL5.LCL6.merged.vcf.gz | grep '##' > header

        # Column 8 of the LCL5-first merge becomes an extra trailing column.
        zcat LCL5.LCL6.merged.vcf.gz | grep -v '##' | cut -f8 > LCL5.mendelian
        zcat LCL6.LCL5.merged.vcf.gz | grep -v '##' | paste - LCL5.mendelian > body

        cat header body > ${family_name}.trio.info.vcf
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File family_mendelian_info = "${family_name}.trio.info.vcf"
    }
}
# Merges a set of bgzipped VCFs into one file with
# `rtg vcfmerge --force-merge-all`.
task mergeVCFInfo {
    # VCFs to merge and their indexes (indexes are only localized).
    Array[File] vcf_gz
    Array[File] vcf_idx
    String sample

    String docker
    String cluster_config
    String disk_size

    command <<<
        rtg vcfmerge \
            --force-merge-all \
            -o ${sample}.merged.info.vcf.gz \
            ${sep=" " vcf_gz}
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File merged_vcf = "${sample}.merged.info.vcf.gz"
        File merged_vcf_idx = "${sample}.merged.info.vcf.gz.tbi"
    }
}
# Runs merge_mendelian_vcfinfo.py on the VCF information and the mendelian
# information of one sample.
task merge_info {
    File vcfInfo
    File mendelianInfo
    String sample

    String docker
    String cluster_config
    String disk_size

    command <<<
        python /opt/merge_mendelian_vcfinfo.py \
            -vcfInfo ${vcfInfo} \
            -mendelianInfo ${mendelianInfo} \
            -sample ${sample}
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File all_info = "${sample}_mendelian_vcfInfo.vcf"
    }
}
# Runs oneClass.py twice -- once on the SNV train/test pair and once on the
# INDEL pair -- and collects the predicted true/false tables, BED files and
# padding BED of each run.
task oneClass {
    File snv_train_vcf
    File snv_test_vcf
    File indel_train_vcf
    File indel_test_vcf
    # Sample name: the SNV training file name without ".normed.snv.train.txt".
    String sampleName = basename(snv_train_vcf, ".normed.snv.train.txt")

    String docker
    String cluster_config
    String disk_size

    command <<<
        # SNVs
        python /opt/oneClass.py -train ${snv_train_vcf} -test ${snv_test_vcf} -name ${sampleName}_snv
        # INDELs
        python /opt/oneClass.py -train ${indel_train_vcf} -test ${indel_test_vcf} -name ${sampleName}_indel
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        # SNV results
        File snv_true_txt = "${sampleName}_snv_predicted_true.txt"
        File snv_false_txt = "${sampleName}_snv_predicted_false.txt"
        File snv_true_bed = "${sampleName}_snv_predicted_true.bed"
        File snv_false_bed = "${sampleName}_snv_predicted_false.bed"
        File snv_padding = "${sampleName}_snv_padding.bed"

        # INDEL results
        File indel_true_txt = "${sampleName}_indel_predicted_true.txt"
        File indel_false_txt = "${sampleName}_indel_predicted_false.txt"
        File indel_true_bed = "${sampleName}_indel_predicted_true.bed"
        File indel_false_bed = "${sampleName}_indel_predicted_false.bed"
        File indel_padding = "${sampleName}_indel_padding.bed"
    }
}
# Runs reformVCF.py on a family mendelian VCF and, for each of LCL5-LCL8,
# derives a text table from the per-sample VCF with '##' lines and any line
# containing a '0/0' or './.' genotype removed.
task reformVCF {
    File family_mendelian_info
    # A plain name prefix for the -name argument and the output file names,
    # hence String rather than File (a File would be treated as a path to
    # localize).
    String family_name

    String docker
    String cluster_config
    String disk_size

    command <<<
        python /opt/reformVCF.py -vcf ${family_mendelian_info} -name ${family_name}
        for s in LCL5 LCL6 LCL7 LCL8
        do
            grep -v '##' ${family_name}."$s".vcf | grep -v '0/0' | grep -v '\./\.' > ${family_name}."$s".txt
        done
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        # Files expected from reformVCF.py.
        File LCL5_family_info = "${family_name}.LCL5.vcf"
        File LCL6_family_info = "${family_name}.LCL6.vcf"
        File LCL7_family_info = "${family_name}.LCL7.vcf"
        File LCL8_family_info = "${family_name}.LCL8.vcf"
        File family_info = "${family_name}.vcf"

        # Filtered per-sample tables produced by the grep pipeline.
        File LCL5_family_info_txt = "${family_name}.LCL5.txt"
        File LCL6_family_info_txt = "${family_name}.LCL6.txt"
        File LCL7_family_info_txt = "${family_name}.LCL7.txt"
        File LCL8_family_info_txt = "${family_name}.LCL8.txt"
    }
}
String LCL7_name | String LCL7_name | ||||
String LCL8_name | String LCL8_name | ||||
String fasta | String fasta | ||||
String family_name | |||||
String family_chromo_name | |||||
String docker | String docker | ||||
String cluster_config | String cluster_config | ||||
String disk_size | String disk_size | ||||
cat ${LCL7_vcf} | grep -v '##' | cut -f10 > F7 | cat ${LCL7_vcf} | grep -v '##' | cut -f10 > F7 | ||||
cat ${LCL8_vcf} | grep -v '##' | cut -f10 > M8 | cat ${LCL8_vcf} | grep -v '##' | cut -f10 > M8 | ||||
cat ${LCL5_vcf} | grep -v '##' | paste - D6 F7 M8 > body | cat ${LCL5_vcf} | grep -v '##' | paste - D6 F7 M8 > body | ||||
cat ${LCL5_vcf} | grep '##' | cat - body > ${family_name}.vcf | |||||
cat ${LCL5_vcf} | grep '##' | cat - body > ${family_chromo_name}.vcf | |||||
# prepare ped file, D5 | # prepare ped file, D5 | ||||
echo "${family_name} ${LCL8_name} 0 0 2 -9 | |||||
${family_name} ${LCL7_name} 0 0 1 -9 | |||||
${family_name} ${LCL5_name} ${LCL7_name} ${LCL8_name} 2 -9" > ${family_name}.D5.ped | |||||
echo "${family_chromo_name} ${LCL8_name} 0 0 2 -9 | |||||
${family_chromo_name} ${LCL7_name} 0 0 1 -9 | |||||
${family_chromo_name} ${LCL5_name} ${LCL7_name} ${LCL8_name} 2 -9" > ${family_chromo_name}.D5.ped | |||||
mkdir VBT_D5 | mkdir VBT_D5 | ||||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_name}.vcf -father ${family_name}.vcf -child ${family_name}.vcf -pedigree ${family_name}.D5.ped -outDir VBT_D5 -out-prefix ${family_name}.D5 --output-violation-regions -thread-count $nt | |||||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_chromo_name}.vcf -father ${family_chromo_name}.vcf -child ${family_chromo_name}.vcf -pedigree ${family_chromo_name}.D5.ped -outDir VBT_D5 -out-prefix ${family_chromo_name}.D5 --output-violation-regions -thread-count $nt | |||||
cat VBT_D5/${family_chromo_name}.D5_trio.vcf > ${family_chromo_name}.D5.vcf | |||||
# prepare ped file, D6 | # prepare ped file, D6 | ||||
echo "${family_name} ${LCL8_name} 0 0 2 -9 | |||||
${family_name} ${LCL7_name} 0 0 1 -9 | |||||
${family_name} ${LCL6_name} ${LCL7_name} ${LCL8_name} 2 -9" > ${family_name}.D6.ped | |||||
echo "${family_chromo_name} ${LCL8_name} 0 0 2 -9 | |||||
${family_chromo_name} ${LCL7_name} 0 0 1 -9 | |||||
${family_chromo_name} ${LCL6_name} ${LCL7_name} ${LCL8_name} 2 -9" > ${family_chromo_name}.D6.ped | |||||
mkdir VBT_D6 | mkdir VBT_D6 | ||||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_name}.vcf -father ${family_name}.vcf -child ${family_name}.vcf -pedigree ${family_name}.D6.ped -outDir VBT_D6 -out-prefix ${family_name}.D6 --output-violation-regions -thread-count $nt | |||||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_chromo_name}.vcf -father ${family_chromo_name}.vcf -child ${family_chromo_name}.vcf -pedigree ${family_chromo_name}.D6.ped -outDir VBT_D6 -out-prefix ${family_chromo_name}.D6 --output-violation-regions -thread-count $nt | |||||
cat VBT_D6/${family_chromo_name}.D6_trio.vcf > ${family_chromo_name}.D6.vcf | |||||
>>> | >>> | ||||
runtime { | runtime { | ||||
output { | output { | ||||
Array[File] D5_mendelian = glob("VBT_D5/*") | Array[File] D5_mendelian = glob("VBT_D5/*") | ||||
Array[File] D6_mendelian = glob("VBT_D6/*") | Array[File] D6_mendelian = glob("VBT_D6/*") | ||||
File family_vcf = "${family_name}.vcf" | |||||
File D5_trio_vcf = "${family_chromo_name}.D5.vcf" | |||||
File D6_trio_vcf = "${family_chromo_name}.D6.vcf" | |||||
File family_vcf = "${family_chromo_name}.vcf" | |||||
} | } | ||||
} | } |
task two_family_merge { | task two_family_merge { | ||||
File LCL5_trio_vcf | File LCL5_trio_vcf | ||||
File LCL6_trio_vcf | File LCL6_trio_vcf | ||||
String family_name | |||||
File genotype_file | |||||
String family_chromo_name | |||||
String docker | String docker | ||||
String cluster_config | String cluster_config | ||||
String disk_size | String disk_size | ||||
command <<< | command <<< | ||||
cat ${LCL5_trio_vcf} | grep -v '##' > ${family_name}.LCL5.txt | |||||
cat ${LCL6_trio_vcf} | grep -v '##' > ${family_name}.LCL6.txt | |||||
python /opt/merge_two_family.py -LCL5 ${family_name}.LCL5.txt -LCL6 ${family_name}.LCL6.txt -family ${family_name} | |||||
cat ${LCL5_trio_vcf} | grep -v '##' > ${family_chromo_name}.LCL5.txt | |||||
cat ${LCL6_trio_vcf} | grep -v '##' > ${family_chromo_name}.LCL6.txt | |||||
cat ${genotype_file} | grep -v '##' | awk ' | |||||
BEGIN { OFS = "\t" } | |||||
NF > 2 && FNR > 1 { | |||||
for ( i=9; i<=NF; i++ ) { | |||||
split($i,a,":") ;$i = a[1]; | |||||
} | |||||
} | |||||
{ print } | |||||
' | cut -f1,2,4,5,10- > ${family_chromo_name}.genotype.txt | |||||
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_chromo_name}.LCL5.txt -LCL6 ${family_chromo_name}.LCL6.txt -genotype ${family_chromo_name}.genotype.txt -family ${family_chromo_name} | |||||
>>> | >>> | ||||
runtime { | runtime { | ||||
} | } | ||||
output { | output { | ||||
File family_mendelian_info = "${family_name}.txt" | |||||
File family_mendelian_info = "${family_chromo_name}.txt" | |||||
} | } | ||||
} | } |
# Restricts a VCF to chr1-chr22 and chrX, normalizes it with `bcftools norm`
# against the reference FASTA, and also writes the normalized VCF without its
# '##' meta lines.
task variantsNorm {
    File vcf
    # Reference directory and the FASTA file name inside it.
    File ref_dir
    String fasta
    String sampleName

    String docker
    String cluster_config
    String disk_size

    command <<<
        # Header lines are those that START with '#'. Anchoring matters: an
        # unanchored match would treat any record containing '#' as header.
        grep '^#' ${vcf} > header
        grep -v '^#' ${vcf} > body
        grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' body > body.filtered
        cat header body.filtered > ${sampleName}.filtered.vcf
        /opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf
        # Drop only the '##' meta lines; the '#CHROM' column header stays.
        grep -v '^##' ${sampleName}.normed.vcf > ${sampleName}.normed.txt
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File normed_vcf = "${sampleName}.normed.vcf"
        File normed_txt = "${sampleName}.normed.txt"
    }
}
# Collects the family mendelian tables into ./temp, runs the voting script
# against them and the VCF (without '##' lines), and tags the four per-sample
# voted VCFs with the chromosome name.
task votes {
    Array[File] family_mendelian_info
    File vcf
    String chromo

    String docker
    String cluster_config
    String disk_size

    command <<<
        # The voting script reads every table from a single folder.
        mkdir temp
        for info in ${sep=" " family_mendelian_info}
        do
            cp $info temp
        done

        grep -v '##' ${vcf} > vcf_info.txt
        python /opt/voted_by_vcfinfo_mendelianinfo.py -folder ./temp -vcf vcf_info.txt

        # Copy each sample's voted VCF to a chromosome-specific name.
        for s in LCL5 LCL6 LCL7 LCL8
        do
            cp "$s"_voted.vcf "$s".${chromo}.voted.vcf
        done
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File LCL5_voted_vcf = "LCL5.${chromo}.voted.vcf"
        File LCL6_voted_vcf = "LCL6.${chromo}.voted.vcf"
        File LCL7_voted_vcf = "LCL7.${chromo}.voted.vcf"
        File LCL8_voted_vcf = "LCL8.${chromo}.voted.vcf"
        File all_sample_info = "all_sample_information.txt"
    }
}
# Compresses a VCF with `rtg bgzip` and indexes the result with `rtg index`.
task zipIndex {
    File vcf
    # Output base name: the input file name without ".vcf".
    String vcf_name = basename(vcf, ".vcf")

    String docker
    String cluster_config
    String disk_size

    command <<<
        # -c streams to stdout so the compressed file lands in the working
        # directory instead of beside the localized input.
        rtg bgzip -c ${vcf} > ${vcf_name}.vcf.gz
        rtg index -f vcf ${vcf_name}.vcf.gz
    >>>

    runtime {
        docker: docker
        cluster: cluster_config
        systemDisk: "cloud_ssd 40"
        dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
    }

    output {
        File vcf_gz = "${vcf_name}.vcf.gz"
        File vcf_idx = "${vcf_name}.vcf.gz.tbi"
    }
}
import "./tasks/sister.wdl" as sister | import "./tasks/sister.wdl" as sister | ||||
import "./tasks/two_family_merge.wdl" as two_family_merge | |||||
import "./tasks/merge.wdl" as merge | |||||
workflow {{ project_name }} { | workflow {{ project_name }} { | ||||
File inputSamplesFile | File inputSamplesFile | ||||
Array[Array[File]] inputSamples = read_tsv(inputSamplesFile) | Array[Array[File]] inputSamples = read_tsv(inputSamplesFile) | ||||
File ref_dir | File ref_dir | ||||
String docker | |||||
String family_name | |||||
String fasta | String fasta | ||||
String cluster_config | String cluster_config | ||||
String disk_size | String disk_size | ||||
LCL7_name=quartet[6], | LCL7_name=quartet[6], | ||||
LCL8_name=quartet[7], | LCL8_name=quartet[7], | ||||
fasta=fasta, | fasta=fasta, | ||||
family_name=quartet[8], | |||||
docker=docker, | |||||
family_chromo_name=quartet[8], | |||||
cluster_config=cluster_config, | cluster_config=cluster_config, | ||||
disk_size=disk_size | disk_size=disk_size | ||||
} | } | ||||
call two_family_merge.two_family_merge as two_family_merge { | |||||
input: | |||||
LCL5_trio_vcf=sister.D5_trio_vcf, | |||||
LCL6_trio_vcf=sister.D6_trio_vcf, | |||||
genotype_file=sister.family_vcf, | |||||
family_chromo_name=quartet[8], | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
} | |||||
call merge.merge as merge { | |||||
input: | |||||
family_mendelian_info=two_family_merge.family_mendelian_info, | |||||
family_name=family_name, | |||||
cluster_config=cluster_config, | |||||
disk_size=disk_size | |||||
} | } | ||||
} | } |