from __future__ import division | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to calculate reproducibility between Quartet_D5 and Quartet_D6s") | |||||
parser.add_argument('-sister', '--sister', type=str, help='sister.txt', required=True) | |||||
parser.add_argument('-project', '--project', type=str, help='project name', required=True) | |||||
args = parser.parse_args() | |||||
sister_file = args.sister | |||||
project_name = args.project | |||||
# output file | |||||
output_name = project_name + '.sister.reproducibility.txt' | |||||
output_file = open(output_name,'w') | |||||
# input files | |||||
sister_dat = pd.read_table(sister_file) | |||||
sister_same = 0 | |||||
sister_diff = 0 | |||||
for row in sister_dat.itertuples(): | |||||
# sister | |||||
if row[5] == row[6]: | |||||
if row[5] == './.': | |||||
mendelian = 'noInfo' | |||||
sister_count = "no" | |||||
elif row[5] == '0/0': | |||||
mendelian = 'Ref' | |||||
sister_count = "no" | |||||
else: | |||||
mendelian = '1' | |||||
sister_count = "yes_same" | |||||
else: | |||||
mendelian = '0' | |||||
if (row[5] == './.' or row[5] == '0/0') and (row[6] == './.' or row[6] == '0/0'): | |||||
sister_count = "no" | |||||
else: | |||||
sister_count = "yes_diff" | |||||
if sister_count == 'yes_same': | |||||
sister_same += 1 | |||||
elif sister_count == 'yes_diff': | |||||
sister_diff += 1 | |||||
else: | |||||
pass | |||||
sister = sister_same/(sister_same + sister_diff) | |||||
outcolumn = 'Project\tReproducibility_D5_D6\n' | |||||
outResult = project_name + '\t' + str(sister) + '\n' | |||||
output_file.write(outcolumn) | |||||
output_file.write(outResult) | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
mut = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/MIE/vcf/mutation_type',header=None) | |||||
outIndel = open(sys.argv[1],'w') | |||||
for row in mut.itertuples(): | |||||
if ',' in row._4: | |||||
alt_seq = row._4.split(',') | |||||
alt_len = [len(i) for i in alt_seq] | |||||
alt = max(alt_len) | |||||
else: | |||||
alt = len(row._4) | |||||
ref = row._3 | |||||
pos = row._2 | |||||
if len(ref) == 1 and alt == 1: | |||||
pass | |||||
elif len(ref) > alt: | |||||
StartPos = int(pos) - 1 | |||||
EndPos = int(pos) + (len(ref) - 1) | |||||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||||
outIndel.write(outline_indel) | |||||
elif alt > len(ref): | |||||
StartPos = int(pos) - 1 | |||||
EndPos = int(pos) + (alt - 1) | |||||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||||
outIndel.write(outline_indel) | |||||
elif len(ref) == alt: | |||||
StartPos = int(pos) - 1 | |||||
EndPos = int(pos) + (alt - 1) | |||||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||||
outIndel.write(outline_indel) | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
mut = mut = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/vcf/mutation_type',header=None) | |||||
vote = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/benchmark.vote.mendelian.txt',header=None) | |||||
merged_df = pd.merge(vote, mut, how='inner', left_on=[0,1], right_on = [0,1]) | |||||
outFile = open(sys.argv[1],'w') | |||||
outIndel = open(sys.argv[2],'w') | |||||
for row in merged_df.itertuples(): | |||||
#d5 | |||||
if ',' in row._7: | |||||
d5 = row._7.split(',') | |||||
d5_len = [len(i) for i in d5] | |||||
d5_alt = max(d5_len) | |||||
else: | |||||
d5_alt = len(row._7) | |||||
#d6 | |||||
if ',' in row._15: | |||||
d6 = row._15.split(',') | |||||
d6_len = [len(i) for i in d6] | |||||
d6_alt = max(d6_len) | |||||
else: | |||||
d6_alt = len(row._15) | |||||
#f7 | |||||
if ',' in row._23: | |||||
f7 = row._23.split(',') | |||||
f7_len = [len(i) for i in f7] | |||||
f7_alt = max(f7_len) | |||||
else: | |||||
f7_alt = len(row._23) | |||||
#m8 | |||||
if ',' in row._31: | |||||
m8 = row._31.split(',') | |||||
m8_len = [len(i) for i in m8] | |||||
m8_alt = max(m8_len) | |||||
else: | |||||
m8_alt = len(row._31) | |||||
all_length = [d5_alt,d6_alt,f7_alt,m8_alt] | |||||
alt = max(all_length) | |||||
ref = row._35 | |||||
pos = int(row._2) | |||||
if len(ref) == 1 and alt == 1: | |||||
StartPos = int(pos) -1 | |||||
EndPos = int(pos) | |||||
cate = 'SNV' | |||||
elif len(ref) > alt: | |||||
StartPos = int(pos) - 1 | |||||
EndPos = int(pos) + (len(ref) - 1) | |||||
cate = 'INDEL' | |||||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||||
outIndel.write(outline_indel) | |||||
elif alt > len(ref): | |||||
StartPos = int(pos) - 1 | |||||
EndPos = int(pos) + (alt - 1) | |||||
cate = 'INDEL' | |||||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||||
outIndel.write(outline_indel) | |||||
elif len(ref) == alt: | |||||
StartPos = int(pos) - 1 | |||||
EndPos = int(pos) + (alt - 1) | |||||
cate = 'INDEL' | |||||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||||
outIndel.write(outline_indel) | |||||
outline = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\t' + str(row._2) + '\t' + cate + '\n' | |||||
outFile.write(outline) | |||||
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$7"\t.\t.\t.\tGT\t"$6}' | grep -v '0/0' > LCL5.body | |||||
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$15"\t.\t.\t.\tGT\t"$14}' | grep -v '0/0' > LCL6.body | |||||
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$23"\t.\t.\t.\tGT\t"$22}' | grep -v '0/0'> LCL7.body | |||||
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$31"\t.\t.\t.\tGT\t"$30}'| grep -v '0/0' > LCL8.body | |||||
cat header5 LCL5.body > LCL5.beforediffbed.vcf | |||||
cat header6 LCL6.body > LCL6.beforediffbed.vcf | |||||
cat header7 LCL7.body > LCL7.beforediffbed.vcf | |||||
cat header8 LCL8.body > LCL8.beforediffbed.vcf | |||||
rtg bgzip *beforediffbed.vcf | |||||
rtg index *beforediffbed.vcf.gz | |||||
rtg vcffilter -i LCL5.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL5.afterfilterdiffbed.vcf.gz | |||||
rtg vcffilter -i LCL6.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL6.afterfilterdiffbed.vcf.gz | |||||
rtg vcffilter -i LCL7.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL7.afterfilterdiffbed.vcf.gz | |||||
rtg vcffilter -i LCL8.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL8.afterfilterdiffbed.vcf.gz | |||||
/mnt/pgx_src_data_pool_4/home/renluyao/softwares/annovar/table_annovar.pl LCL5.beforediffbed.vcf.gz /mnt/pgx_src_data_pool_4/home/renluyao/softwares/annovar/humandb \ | |||||
-buildver hg38 \ | |||||
-out LCL5 \ | |||||
-remove \ | |||||
-protocol 1000g2015aug_all,1000g2015aug_afr,1000g2015aug_amr,1000g2015aug_eas,1000g2015aug_eur,1000g2015aug_sas,clinvar_20190305,gnomad211_genome \ | |||||
-operation f,f,f,f,f,f,f,f \ | |||||
-nastring . \ | |||||
-vcfinput \ | |||||
--thread 8 | |||||
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL5.afterfilterdiffbed.vcf.gz -o LCL5_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/ | |||||
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL6.afterfilterdiffbed.vcf.gz -o LCL6_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/ | |||||
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL7.afterfilterdiffbed.vcf.gz -o LCL7_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/ | |||||
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL8.afterfilterdiffbed.vcf.gz -o LCL8_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/ | |||||
zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l | |||||
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l | |||||
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l | |||||
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l | |||||
zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l | |||||
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l | |||||
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l | |||||
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l | |||||
zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l | |||||
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l | |||||
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l | |||||
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l | |||||
bedtools subtract -a LCL5.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL5.27.homo_ref.consensus.filtereddiffbed.bed | |||||
bedtools subtract -a LCL6.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL6.27.homo_ref.consensus.filtereddiffbed.bed | |||||
bedtools subtract -a LCL7.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL7.27.homo_ref.consensus.filtereddiffbed.bed | |||||
bedtools subtract -a LCL8.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL8.27.homo_ref.consensus.filtereddiffbed.bed | |||||
python vcf2bed.py LCL5.body LCL5.variants.bed | |||||
python vcf2bed.py LCL6.body LCL6.variants.bed | |||||
python vcf2bed.py LCL7.body LCL7.variants.bed | |||||
python vcf2bed.py LCL8.body LCL8.variants.bed | |||||
cat /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/LCL5.variants.bed | cut -f1,11,12 | cat - LCL5.27.homo_ref.consensus.filtereddiffbed.bed | sort -k1,1 -k2,2n > LCL5.high.confidence.bed | |||||
##contig=<ID=chr1,length=248956422> | |||||
##contig=<ID=chr2,length=242193529> | |||||
##contig=<ID=chr3,length=198295559> | |||||
##contig=<ID=chr4,length=190214555> | |||||
##contig=<ID=chr5,length=181538259> | |||||
##contig=<ID=chr6,length=170805979> | |||||
##contig=<ID=chr7,length=159345973> | |||||
##contig=<ID=chr8,length=145138636> | |||||
##contig=<ID=chr9,length=138394717> | |||||
##contig=<ID=chr10,length=133797422> | |||||
##contig=<ID=chr11,length=135086622> | |||||
##contig=<ID=chr12,length=133275309> | |||||
##contig=<ID=chr13,length=114364328> | |||||
##contig=<ID=chr14,length=107043718> | |||||
##contig=<ID=chr15,length=101991189> | |||||
##contig=<ID=chr16,length=90338345> | |||||
##contig=<ID=chr17,length=83257441> | |||||
##contig=<ID=chr18,length=80373285> | |||||
##contig=<ID=chr19,length=58617616> | |||||
##contig=<ID=chr20,length=64444167> | |||||
##contig=<ID=chr21,length=46709983> | |||||
##contig=<ID=chr22,length=50818468> | |||||
##contig=<ID=chrX,length=156040895> |
import json | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
parser = argparse.ArgumentParser(description="This script is to get information from multiqc") | |||||
parser.add_argument('-fastqc_qualimap', '--fastqc_qualimap', type=str, help='multiqc_general_stats.txt', required=True) | |||||
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True) | |||||
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True) | |||||
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True) | |||||
args = parser.parse_args() | |||||
# Rename input: | |||||
fastqc_qualimap_file = args.fastqc_qualimap | |||||
fastqc_file = args.fastqc | |||||
fastqscreen_file = args.fastqscreen | |||||
hap_file = args.happy | |||||
# fastqc and qualimap | |||||
dat = pd.read_table(fastqc_qualimap_file) | |||||
fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')] | |||||
fastqc.insert(loc=0, column='Sample', value=dat['Sample']) | |||||
fastqc_stat = fastqc.dropna() | |||||
# qulimap | |||||
qualimap = dat.loc[:, dat.columns.str.startswith('QualiMap')] | |||||
qualimap.insert(loc=0, column='Sample', value=dat['Sample']) | |||||
qualimap_stat = qualimap.dropna() | |||||
# fastqc | |||||
dat = pd.read_table(fastqc_file) | |||||
fastqc_module = dat.loc[:, "per_base_sequence_quality":"kmer_content"] | |||||
fastqc_module.insert(loc=0, column='Sample', value=dat['Sample']) | |||||
fastqc_all = pd.merge(fastqc_stat,fastqc_module, how='outer', left_on=['Sample'], right_on = ['Sample']) | |||||
# fastqscreen | |||||
dat = pd.read_table(fastqscreen_file) | |||||
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')] | |||||
dat['Sample'] = [i.replace('_screen','') for i in dat['Sample']] | |||||
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample']) | |||||
# benchmark | |||||
with open(hap_file) as hap_json: | |||||
happy = json.load(hap_json) | |||||
dat =pd.DataFrame.from_records(happy) | |||||
dat = dat.loc[:, dat.columns.str.endswith('ALL')] | |||||
dat_transposed = dat.T | |||||
benchmark = dat_transposed.loc[:,['sample_id','METRIC.Precision','METRIC.Recall']] | |||||
benchmark.columns = ['Sample','Precision','Recall'] | |||||
#output | |||||
fastqc_all.to_csv('fastqc.final.result.txt',sep="\t",index=0) | |||||
fastqscreen.to_csv('fastqscreen.final.result.txt',sep="\t",index=0) | |||||
qualimap_stat.to_csv('qualimap.final.result.txt',sep="\t",index=0) | |||||
benchmark.to_csv('benchmark.final.result.txt',sep="\t",index=0) | |||||
import sys,getopt | |||||
import os | |||||
import re | |||||
import fileinput | |||||
import pandas as pd | |||||
def usage(): | |||||
print( | |||||
""" | |||||
Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file | |||||
This script will extract SNVs and Indels information from the vcf files and output a tab-delimited files. | |||||
Input: | |||||
-i the selected vcf file | |||||
Output: | |||||
-o tab-delimited parsed file | |||||
""") | |||||
# select supported small variants | |||||
def process(oneLine): | |||||
line = oneLine.rstrip() | |||||
strings = line.strip().split('\t') | |||||
infoParsed = parse_INFO(strings[7]) | |||||
formatKeys = strings[8].split(':') | |||||
formatValues = strings[9].split(':') | |||||
for i in range(0,len(formatKeys) -1) : | |||||
if formatKeys[i] == 'AD': | |||||
ra = formatValues[i].split(',') | |||||
infoParsed['RefDP'] = ra[0] | |||||
infoParsed['AltDP'] = ra[1] | |||||
if (int(ra[1]) + int(ra[0])) != 0: | |||||
infoParsed['af'] = float(int(ra[1])/(int(ra[1]) + int(ra[0]))) | |||||
else: | |||||
pass | |||||
else: | |||||
infoParsed[formatKeys[i]] = formatValues[i] | |||||
infoParsed['chromo'] = strings[0] | |||||
infoParsed['pos'] = strings[1] | |||||
infoParsed['id'] = strings[2] | |||||
infoParsed['ref'] = strings[3] | |||||
infoParsed['alt'] = strings[4] | |||||
infoParsed['qual'] = strings[5] | |||||
return infoParsed | |||||
def parse_INFO(info): | |||||
strings = info.strip().split(';') | |||||
keys = [] | |||||
values = [] | |||||
for i in strings: | |||||
kv = i.split('=') | |||||
if kv[0] == 'DB': | |||||
keys.append('DB') | |||||
values.append('1') | |||||
elif kv[0] == 'AF': | |||||
pass | |||||
else: | |||||
keys.append(kv[0]) | |||||
values.append(kv[1]) | |||||
infoDict = dict(zip(keys, values)) | |||||
return infoDict | |||||
opts,args = getopt.getopt(sys.argv[1:],"hi:o:") | |||||
for op,value in opts: | |||||
if op == "-i": | |||||
inputFile=value | |||||
elif op == "-o": | |||||
outputFile=value | |||||
elif op == "-h": | |||||
usage() | |||||
sys.exit() | |||||
if len(sys.argv[1:]) < 3: | |||||
usage() | |||||
sys.exit() | |||||
allDict = [] | |||||
for line in fileinput.input(inputFile): | |||||
m = re.match('^\#',line) | |||||
if m is not None: | |||||
pass | |||||
else: | |||||
oneDict = process(line) | |||||
allDict.append(oneDict) | |||||
allTable = pd.DataFrame(allDict) | |||||
allTable.to_csv(outputFile,sep='\t',index=False) | |||||
import sys,getopt | |||||
from itertools import islice | |||||
over_50_outfile = open("indel_lenth_over_50.txt",'w') | |||||
less_50_outfile = open("benchmark.men.vote.diffbed.lengthlessthan50.txt","w") | |||||
def process(line): | |||||
strings = line.strip().split('\t') | |||||
#d5 | |||||
if ',' in strings[6]: | |||||
d5 = strings[6].split(',') | |||||
d5_len = [len(i) for i in d5] | |||||
d5_alt = max(d5_len) | |||||
else: | |||||
d5_alt = len(strings[6]) | |||||
#d6 | |||||
if ',' in strings[14]: | |||||
d6 = strings[14].split(',') | |||||
d6_len = [len(i) for i in d6] | |||||
d6_alt = max(d6_len) | |||||
else: | |||||
d6_alt = len(strings[14]) | |||||
#f7 | |||||
if ',' in strings[22]: | |||||
f7 = strings[22].split(',') | |||||
f7_len = [len(i) for i in f7] | |||||
f7_alt = max(f7_len) | |||||
else: | |||||
f7_alt = len(strings[22]) | |||||
#m8 | |||||
if ',' in strings[30]: | |||||
m8 = strings[30].split(',') | |||||
m8_len = [len(i) for i in m8] | |||||
m8_alt = max(m8_len) | |||||
else: | |||||
m8_alt = len(strings[30]) | |||||
#ref | |||||
ref_len = len(strings[34]) | |||||
if (d5_alt > 50) or (d6_alt > 50) or (f7_alt > 50) or (m8_alt > 50) or (ref_len > 50): | |||||
over_50_outfile.write(line) | |||||
else: | |||||
less_50_outfile.write(line) | |||||
input_file = open(sys.argv[1]) | |||||
for line in islice(input_file, 1, None): | |||||
process(line) |
from itertools import islice | |||||
import fileinput | |||||
import sys, argparse, os | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to exclude indel over 50bp") | |||||
parser.add_argument('-i', '--mergedGVCF', type=str, help='merged gVCF txt with only chr, pos, ref, alt and genotypes', required=True) | |||||
parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file', required=True) | |||||
args = parser.parse_args() | |||||
input_dat = args.mergedGVCF | |||||
prefix = args.prefix | |||||
# output file | |||||
output_name = prefix + '.indel.lessthan50bp.txt' | |||||
outfile = open(output_name,'w') | |||||
def process(line): | |||||
strings = line.strip().split('\t') | |||||
#d5 | |||||
if ',' in strings[3]: | |||||
alt = strings[3].split(',') | |||||
alt_len = [len(i) for i in alt] | |||||
alt_max = max(alt_len) | |||||
else: | |||||
alt_max = len(strings[3]) | |||||
#ref | |||||
ref_len = len(strings[2]) | |||||
if (alt_max > 50) or (ref_len > 50): | |||||
pass | |||||
else: | |||||
outfile.write(line) | |||||
for line in fileinput.input(input_dat): | |||||
process(line) | |||||
from __future__ import division | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
import fileinput | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to get filtered and benchmark vcf info") | |||||
parser.add_argument('-filtered', '--filtered', type=str, help='filtered position', required=True) | |||||
parser.add_argument('-benchmark', '--benchmark', type=str, help='benchmark position', required=True) | |||||
parser.add_argument('-vcf', '--vcf', type=str, help='one specific vcf', required=True) | |||||
parser.add_argument('-filename', '--filename', type=str, help='output file name', required=True) | |||||
args = parser.parse_args() | |||||
filtered = args.filtered | |||||
benchmark = args.benchmark | |||||
vcf = args.vcf | |||||
filename = args.filename | |||||
# output file | |||||
filtered_filename = filename + '.filtered.txt' | |||||
benchmark_filename = filename + '.benchmark.txt' | |||||
# input files | |||||
filtered_dat = pd.read_table(filtered,header=None) | |||||
benchmark_dat = pd.read_table(benchmark,header=None) | |||||
vcf_dat = pd.read_table(vcf) | |||||
filtered_merged_df = pd.merge(filtered_dat, vcf_dat, how='inner',left_on=[0,1], right_on = ['#CHROM','POS']) | |||||
benchmark_merged_df = pd.merge(benchmark_dat,vcf_dat, how='inner',left_on=[0,1], right_on = ['#CHROM','POS']) | |||||
filtered_merged_df.to_csv(filtered_filename,sep='\t',index=False) | |||||
benchmark_merged_df.to_csv(benchmark_filename,sep='\t',index=False) |
from __future__ import division | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
import pandas as pd | |||||
from operator import itemgetter | |||||
from collections import Counter | |||||
from itertools import islice | |||||
from numpy import * | |||||
import statistics | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to count voting number") | |||||
parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True) | |||||
parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True) | |||||
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True) | |||||
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True) | |||||
args = parser.parse_args() | |||||
multi_sample_vcf = args.multi_sample_vcf | |||||
dup_list = args.dup_list | |||||
prefix = args.prefix | |||||
sample_name = args.sample_name | |||||
vcf_header = '''##fileformat=VCFv4.2 | |||||
##fileDate=20200331 | |||||
##source=high_confidence_calls_intergration(choppy app) | |||||
##reference=GRCh38.d1.vd1 | |||||
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region"> | |||||
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes"> | |||||
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes"> | |||||
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes"> | |||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples"> | |||||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples"> | |||||
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency, sum alternative depth / sum depth"> | |||||
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Average genotype quality"> | |||||
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth"> | |||||
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Average mapping quality"> | |||||
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias"> | |||||
##FORMAT=<ID=QUALI,Number=1,Type=Float,Description="Average variant quality"> | |||||
##contig=<ID=chr1,length=248956422> | |||||
##contig=<ID=chr2,length=242193529> | |||||
##contig=<ID=chr3,length=198295559> | |||||
##contig=<ID=chr4,length=190214555> | |||||
##contig=<ID=chr5,length=181538259> | |||||
##contig=<ID=chr6,length=170805979> | |||||
##contig=<ID=chr7,length=159345973> | |||||
##contig=<ID=chr8,length=145138636> | |||||
##contig=<ID=chr9,length=138394717> | |||||
##contig=<ID=chr10,length=133797422> | |||||
##contig=<ID=chr11,length=135086622> | |||||
##contig=<ID=chr12,length=133275309> | |||||
##contig=<ID=chr13,length=114364328> | |||||
##contig=<ID=chr14,length=107043718> | |||||
##contig=<ID=chr15,length=101991189> | |||||
##contig=<ID=chr16,length=90338345> | |||||
##contig=<ID=chr17,length=83257441> | |||||
##contig=<ID=chr18,length=80373285> | |||||
##contig=<ID=chr19,length=58617616> | |||||
##contig=<ID=chr20,length=64444167> | |||||
##contig=<ID=chr21,length=46709983> | |||||
##contig=<ID=chr22,length=50818468> | |||||
##contig=<ID=chrX,length=156040895> | |||||
''' | |||||
vcf_header_all_sample = '''##fileformat=VCFv4.2 | |||||
##fileDate=20200331 | |||||
##reference=GRCh38.d1.vd1 | |||||
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region"> | |||||
##INFO=<ID=DUP,Number=1,Type=Flag,Description="Duplicated variant records"> | |||||
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes"> | |||||
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes"> | |||||
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes"> | |||||
##INFO=<ID=ALL_ALT,Number=1,Type=Float,Description="Sum of alternative reads of all samples"> | |||||
##INFO=<ID=ALL_DP,Number=1,Type=Float,Description="Sum of depth of all samples"> | |||||
##INFO=<ID=ALL_AF,Number=1,Type=Float,Description="Allele frequency of net alternatice reads and net depth"> | |||||
##INFO=<ID=GQ_MEAN,Number=1,Type=Float,Description="Mean of genotype quality of all samples"> | |||||
##INFO=<ID=QD_MEAN,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth"> | |||||
##INFO=<ID=MQ_MEAN,Number=1,Type=Float,Description="Mean of mapping quality of all samples"> | |||||
##INFO=<ID=FS_MEAN,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias"> | |||||
##INFO=<ID=QUAL_MEAN,Number=1,Type=Float,Description="Average variant quality"> | |||||
##INFO=<ID=PCR,Number=1,Type=String,Description="Consensus of PCR votes"> | |||||
##INFO=<ID=PCR_FREE,Number=1,Type=String,Description="Consensus of PCR-free votes"> | |||||
##INFO=<ID=CONSENSUS,Number=1,Type=String,Description="Consensus calls"> | |||||
##INFO=<ID=CONSENSUS_SEQ,Number=1,Type=String,Description="Consensus sequence"> | |||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||||
##FORMAT=<ID=DP,Number=1,Type=String,Description="Depth"> | |||||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth"> | |||||
##FORMAT=<ID=AF,Number=1,Type=String,Description="Allele frequency"> | |||||
##FORMAT=<ID=GQ,Number=1,Type=String,Description="Genotype quality"> | |||||
##FORMAT=<ID=MQ,Number=1,Type=String,Description="Mapping quality"> | |||||
##FORMAT=<ID=TWINS,Number=1,Type=String,Description="1 is twins shared, 0 is twins discordant "> | |||||
##FORMAT=<ID=TRIO5,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL5 mendelian consistent, 0 is mendelian vioaltion"> | |||||
##FORMAT=<ID=TRIO6,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL6 mendelian consistent, 0 is mendelian vioaltion"> | |||||
##contig=<ID=chr1,length=248956422> | |||||
##contig=<ID=chr2,length=242193529> | |||||
##contig=<ID=chr3,length=198295559> | |||||
##contig=<ID=chr4,length=190214555> | |||||
##contig=<ID=chr5,length=181538259> | |||||
##contig=<ID=chr6,length=170805979> | |||||
##contig=<ID=chr7,length=159345973> | |||||
##contig=<ID=chr8,length=145138636> | |||||
##contig=<ID=chr9,length=138394717> | |||||
##contig=<ID=chr10,length=133797422> | |||||
##contig=<ID=chr11,length=135086622> | |||||
##contig=<ID=chr12,length=133275309> | |||||
##contig=<ID=chr13,length=114364328> | |||||
##contig=<ID=chr14,length=107043718> | |||||
##contig=<ID=chr15,length=101991189> | |||||
##contig=<ID=chr16,length=90338345> | |||||
##contig=<ID=chr17,length=83257441> | |||||
##contig=<ID=chr18,length=80373285> | |||||
##contig=<ID=chr19,length=58617616> | |||||
##contig=<ID=chr20,length=64444167> | |||||
##contig=<ID=chr21,length=46709983> | |||||
##contig=<ID=chr22,length=50818468> | |||||
##contig=<ID=chrX,length=156040895> | |||||
''' | |||||
# read in duplication list | |||||
dup = pd.read_table(dup_list,header=None) | |||||
var_dup = dup[0].tolist() | |||||
# output file | |||||
benchmark_file_name = prefix + '_voted.vcf' | |||||
benchmark_outfile = open(benchmark_file_name,'w') | |||||
all_sample_file_name = prefix + '_all_sample_information.vcf' | |||||
all_sample_outfile = open(all_sample_file_name,'w') | |||||
# write VCF | |||||
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '_benchmark_calls\n' | |||||
benchmark_outfile.write(vcf_header) | |||||
benchmark_outfile.write(outputcolumn) | |||||
outputcolumn_all_sample = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+ \ | |||||
'Quartet_DNA_BGI_SEQ2000_BGI_1_20180518\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530\t' + \ | |||||
'Quartet_DNA_BGI_T7_WGE_1_20191105\tQuartet_DNA_BGI_T7_WGE_2_20191105\tQuartet_DNA_BGI_T7_WGE_3_20191105\t' + \ | |||||
'Quartet_DNA_ILM_Nova_ARD_1_20181108\tQuartet_DNA_ILM_Nova_ARD_2_20181108\tQuartet_DNA_ILM_Nova_ARD_3_20181108\t' + \ | |||||
'Quartet_DNA_ILM_Nova_ARD_4_20190111\tQuartet_DNA_ILM_Nova_ARD_5_20190111\tQuartet_DNA_ILM_Nova_ARD_6_20190111\t' + \ | |||||
'Quartet_DNA_ILM_Nova_BRG_1_20180930\tQuartet_DNA_ILM_Nova_BRG_2_20180930\tQuartet_DNA_ILM_Nova_BRG_3_20180930\t' + \ | |||||
'Quartet_DNA_ILM_Nova_WUX_1_20190917\tQuartet_DNA_ILM_Nova_WUX_2_20190917\tQuartet_DNA_ILM_Nova_WUX_3_20190917\t' + \ | |||||
'Quartet_DNA_ILM_XTen_ARD_1_20170403\tQuartet_DNA_ILM_XTen_ARD_2_20170403\tQuartet_DNA_ILM_XTen_ARD_3_20170403\t' + \ | |||||
'Quartet_DNA_ILM_XTen_NVG_1_20170329\tQuartet_DNA_ILM_XTen_NVG_2_20170329\tQuartet_DNA_ILM_XTen_NVG_3_20170329\t' + \ | |||||
'Quartet_DNA_ILM_XTen_WUX_1_20170216\tQuartet_DNA_ILM_XTen_WUX_2_20170216\tQuartet_DNA_ILM_XTen_WUX_3_20170216\n' | |||||
all_sample_outfile.write(vcf_header_all_sample) | |||||
all_sample_outfile.write(outputcolumn_all_sample) | |||||
#function | |||||
def replace_nan(strings_list): | |||||
updated_list = [] | |||||
for i in strings_list: | |||||
if i == '.': | |||||
updated_list.append('.:.:.:.:.:.:.:.:.:.:.:.') | |||||
else: | |||||
updated_list.append(i) | |||||
return updated_list | |||||
def remove_dot(strings_list): | |||||
updated_list = [] | |||||
for i in strings_list: | |||||
if i == '.': | |||||
pass | |||||
else: | |||||
updated_list.append(i) | |||||
return updated_list | |||||
def detected_number(strings): | |||||
gt = [x.split(':')[0] for x in strings] | |||||
percentage = 27 - gt.count('.') | |||||
return(str(percentage)) | |||||
def vote_number(strings,consensus_call): | |||||
gt = [x.split(':')[0] for x in strings] | |||||
gt = [x.replace('.','0/0') for x in gt] | |||||
gt = list(map(gt_uniform,[i for i in gt])) | |||||
vote_num = gt.count(consensus_call) | |||||
return(str(vote_num)) | |||||
def family_vote(strings,consensus_call): | |||||
gt = [x.split(':')[0] for x in strings] | |||||
gt = [x.replace('.','0/0') for x in gt] | |||||
gt = list(map(gt_uniform,[i for i in gt])) | |||||
mendelian = [':'.join(x.split(':')[1:4]) for x in strings] | |||||
indices = [i for i, x in enumerate(gt) if x == consensus_call] | |||||
matched_mendelian = itemgetter(*indices)(mendelian) | |||||
mendelian_num = matched_mendelian.count('1:1:1') | |||||
return(str(mendelian_num)) | |||||
def gt_uniform(strings): | |||||
uniformed_gt = '' | |||||
allele1 = strings.split('/')[0] | |||||
allele2 = strings.split('/')[1] | |||||
if int(allele1) > int(allele2): | |||||
uniformed_gt = allele2 + '/' + allele1 | |||||
else: | |||||
uniformed_gt = allele1 + '/' + allele2 | |||||
return uniformed_gt | |||||
def decide_by_rep(strings): | |||||
consensus_rep = '' | |||||
mendelian = [':'.join(x.split(':')[1:4]) for x in strings] | |||||
gt = [x.split(':')[0] for x in strings] | |||||
gt = [x.replace('.','0/0') for x in gt] | |||||
# modified gt turn 2/1 to 1/2 | |||||
gt = list(map(gt_uniform,[i for i in gt])) | |||||
# mendelian consistent? | |||||
mendelian_dict = Counter(mendelian) | |||||
highest_mendelian = mendelian_dict.most_common(1) | |||||
candidate_mendelian = highest_mendelian[0][0] | |||||
freq_mendelian = highest_mendelian[0][1] | |||||
if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2): | |||||
gt_num_dict = Counter(gt) | |||||
highest_gt = gt_num_dict.most_common(1) | |||||
candidate_gt = highest_gt[0][0] | |||||
freq_gt = highest_gt[0][1] | |||||
if (candidate_gt != '0/0') and (freq_gt >= 2): | |||||
consensus_rep = candidate_gt | |||||
elif (candidate_gt == '0/0') and (freq_gt >= 2): | |||||
consensus_rep = '0/0' | |||||
else: | |||||
consensus_rep = 'inconGT' | |||||
elif (candidate_mendelian == '') and (freq_mendelian >= 2): | |||||
consensus_rep = 'noInfo' | |||||
else: | |||||
consensus_rep = 'inconMen' | |||||
return consensus_rep | |||||
def main(): | |||||
for line in fileinput.input(multi_sample_vcf): | |||||
headline = re.match('^\#',line) | |||||
if headline is not None: | |||||
pass | |||||
else: | |||||
line = line.strip() | |||||
strings = line.split('\t') | |||||
variant_id = '_'.join([strings[0],strings[1]]) | |||||
# check if the variants location is duplicated | |||||
if variant_id in var_dup: | |||||
strings[7] = strings[7] + ';DUP' | |||||
outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(outLine) | |||||
else: | |||||
# pre-define | |||||
pcr_consensus = '.' | |||||
pcr_free_consensus = '.' | |||||
consensus_call = '.' | |||||
consensus_alt_seq = '.' | |||||
# pcr | |||||
strings[9:] = replace_nan(strings[9:]) | |||||
pcr = itemgetter(*[9,10,11,27,28,29,30,31,32,33,34,35])(strings) | |||||
SEQ2000 = decide_by_rep(pcr[0:3]) | |||||
XTen_ARD = decide_by_rep(pcr[3:6]) | |||||
XTen_NVG = decide_by_rep(pcr[6:9]) | |||||
XTen_WUX = decide_by_rep(pcr[9:12]) | |||||
sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX] | |||||
sequence_dict = Counter(sequence_site) | |||||
highest_sequence = sequence_dict.most_common(1) | |||||
candidate_sequence = highest_sequence[0][0] | |||||
freq_sequence = highest_sequence[0][1] | |||||
if freq_sequence > 2: | |||||
pcr_consensus = candidate_sequence | |||||
else: | |||||
pcr_consensus = 'inconSequenceSite' | |||||
# pcr-free | |||||
pcr_free = itemgetter(*[12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])(strings) | |||||
T7_WGE = decide_by_rep(pcr_free[0:3]) | |||||
Nova_ARD_1 = decide_by_rep(pcr_free[3:6]) | |||||
Nova_ARD_2 = decide_by_rep(pcr_free[6:9]) | |||||
Nova_BRG = decide_by_rep(pcr_free[9:12]) | |||||
Nova_WUX = decide_by_rep(pcr_free[12:15]) | |||||
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX] | |||||
highest_sequence = sequence_dict.most_common(1) | |||||
candidate_sequence = highest_sequence[0][0] | |||||
freq_sequence = highest_sequence[0][1] | |||||
if freq_sequence > 3: | |||||
pcr_free_consensus = candidate_sequence | |||||
else: | |||||
pcr_free_consensus = 'inconSequenceSite' | |||||
# pcr and pcr-free | |||||
tag = ['inconGT','noInfo','inconMen','inconSequenceSite'] | |||||
if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'): | |||||
consensus_call = pcr_consensus | |||||
VOTED = vote_number(strings[9:],consensus_call) | |||||
strings[7] = strings[7] + ';VOTED=' + VOTED | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
FAM = family_vote(strings[9:],consensus_call) | |||||
strings[7] = strings[7] + ';FAM=' + FAM | |||||
# Delete multiple alternative genotype to necessary expression | |||||
alt = strings[4] | |||||
alt_gt = alt.split(',') | |||||
if len(alt_gt) > 1: | |||||
allele1 = consensus_call.split('/')[0] | |||||
allele2 = consensus_call.split('/')[1] | |||||
if allele1 == '0': | |||||
allele2_seq = alt_gt[int(allele2) - 1] | |||||
consensus_alt_seq = allele2_seq | |||||
consensus_call = '0/1' | |||||
else: | |||||
allele1_seq = alt_gt[int(allele1) - 1] | |||||
allele2_seq = alt_gt[int(allele2) - 1] | |||||
if int(allele1) > int(allele2): | |||||
consensus_alt_seq = allele2_seq + ',' + allele1_seq | |||||
consensus_call = '1/2' | |||||
elif int(allele1) < int(allele2): | |||||
consensus_alt_seq = allele1_seq + ',' + allele2_seq | |||||
consensus_call = '1/2' | |||||
else: | |||||
consensus_alt_seq = allele1_seq | |||||
consensus_call = '1/1' | |||||
else: | |||||
consensus_alt_seq = alt | |||||
# GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL | |||||
# GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT | |||||
# DP | |||||
DP = [x.split(':')[4] for x in strings[9:]] | |||||
DP = remove_dot(DP) | |||||
DP = [int(x) for x in DP] | |||||
ALL_DP = sum(DP) | |||||
# AF | |||||
ALT = [x.split(':')[5] for x in strings[9:]] | |||||
ALT = remove_dot(ALT) | |||||
ALT = [int(x) for x in ALT] | |||||
ALL_ALT = sum(ALT) | |||||
ALL_AF = round(ALL_ALT/ALL_DP,2) | |||||
# GQ | |||||
GQ = [x.split(':')[7] for x in strings[9:]] | |||||
GQ = remove_dot(GQ) | |||||
GQ = [int(x) for x in GQ] | |||||
GQ_MEAN = round(mean(GQ),2) | |||||
# QD | |||||
QD = [x.split(':')[8] for x in strings[9:]] | |||||
QD = remove_dot(QD) | |||||
QD = [float(x) for x in QD] | |||||
QD_MEAN = round(mean(QD),2) | |||||
# MQ | |||||
MQ = [x.split(':')[9] for x in strings[9:]] | |||||
MQ = remove_dot(MQ) | |||||
MQ = [float(x) for x in MQ] | |||||
MQ_MEAN = round(mean(MQ),2) | |||||
# FS | |||||
FS = [x.split(':')[10] for x in strings[9:]] | |||||
FS = remove_dot(FS) | |||||
FS = [float(x) for x in FS] | |||||
FS_MEAN = round(mean(FS),2) | |||||
# QUAL | |||||
QUAL = [x.split(':')[11] for x in strings[9:]] | |||||
QUAL = remove_dot(QUAL) | |||||
QUAL = [float(x) for x in QUAL] | |||||
QUAL_MEAN = round(mean(QUAL),2) | |||||
# benchmark output | |||||
output_format = consensus_call + ':' + str(ALL_DP) + ':' + str(ALL_ALT) + ':' + str(ALL_AF) + ':' + str(GQ_MEAN) + ':' + str(QD_MEAN) + ':' + str(MQ_MEAN) + ':' + str(FS_MEAN) + ':' + str(QUAL_MEAN) | |||||
outLine = strings[0] + '\t' + strings[1] + '\t' + strings[2] + '\t' + strings[3] + '\t' + consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' + strings[7] + '\t' + 'GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + output_format + '\n' | |||||
benchmark_outfile.write(outLine) | |||||
# all sample output | |||||
strings[7] = strings[7] + ';ALL_ALT=' + str(ALL_ALT) + ';ALL_DP=' + str(ALL_DP) + ';ALL_AF=' + str(ALL_AF) \ | |||||
+ ';GQ_MEAN=' + str(GQ_MEAN) + ';QD_MEAN=' + str(QD_MEAN) + ';MQ_MEAN=' + str(MQ_MEAN) + ';FS_MEAN=' + str(FS_MEAN) \ | |||||
+ ';QUAL_MEAN=' + str(QUAL_MEAN) + ';PCR=' + consensus_call + ';PCR_FREE=' + consensus_call + ';CONSENSUS=' + consensus_call \ | |||||
+ ';CONSENSUS_SEQ=' + consensus_alt_seq | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
elif (pcr_consensus in tag) and (pcr_free_consensus in tag): | |||||
consensus_call = 'filtered' | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')): | |||||
consensus_call = 'pcr-free-speicifc' | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
elif ((pcr_consensus != '0/0') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == '0/0')): | |||||
consensus_call = 'pcr-speicifc' | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call + ';PCR=' + pcr_consensus + ';PCR_FREE=' + pcr_free_consensus | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'): | |||||
consensus_call = 'confirm for parents' | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
else: | |||||
consensus_call = 'filtered' | |||||
DETECTED = detected_number(strings[9:]) | |||||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||||
all_sample_outLine = '\t'.join(strings) + '\n' | |||||
all_sample_outfile.write(all_sample_outLine) | |||||
if __name__ == '__main__': | |||||
main() | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
mut = pd.read_table(sys.argv[1]) | |||||
outFile = open(sys.argv[2],'w') | |||||
for row in mut.itertuples(): | |||||
#d5 | |||||
if ',' in row.V4: | |||||
alt = row.V4.split(',') | |||||
alt_len = [len(i) for i in alt] | |||||
alt_max = max(alt_len) | |||||
else: | |||||
alt_max = len(row.V4) | |||||
#d6 | |||||
alt = alt_max | |||||
ref = row.V3 | |||||
pos = int(row.V2) | |||||
if len(ref) == 1 and alt == 1: | |||||
StartPos = int(pos) -1 | |||||
EndPos = int(pos) | |||||
cate = 'SNV' | |||||
elif len(ref) > alt: | |||||
StartPos = int(pos) - 1 | |||||
EndPos = int(pos) + (len(ref) - 1) | |||||
cate = 'INDEL' | |||||
elif alt > len(ref): | |||||
StartPos = int(pos) - 1 | |||||
EndPos = int(pos) + (alt - 1) | |||||
cate = 'INDEL' | |||||
elif len(ref) == alt: | |||||
StartPos = int(pos) - 1 | |||||
EndPos = int(pos) + (alt - 1) | |||||
cate = 'INDEL' | |||||
outline = row.V1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\t' + str(row.V2) + '\t' + cate + '\n' | |||||
outFile.write(outline) | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
from operator import itemgetter | |||||
parser = argparse.ArgumentParser(description="This script is to get how many samples") | |||||
parser.add_argument('-sample', '--sample', type=str, help='quartet_sample', required=True) | |||||
parser.add_argument('-rep', '--rep', type=str, help='quartet_rep', required=True) | |||||
args = parser.parse_args() | |||||
# Rename input: | |||||
sample = args.sample | |||||
rep = args.rep | |||||
quartet_sample = pd.read_table(sample,header=None) | |||||
quartet_sample = list(quartet_sample[0]) | |||||
quartet_rep = pd.read_table(rep,header=None) | |||||
quartet_rep = quartet_rep[0] | |||||
#tags | |||||
sister_tag = 'false' | |||||
quartet_tag = 'false' | |||||
quartet_rep_unique = list(set(quartet_rep)) | |||||
single_rep = [i for i in range(len(quartet_rep)) if quartet_rep[i] == quartet_rep_unique[0]] | |||||
single_batch_sample = itemgetter(*single_rep)(quartet_sample) | |||||
num = len(single_batch_sample) | |||||
if num == 1: | |||||
sister_tag = 'false' | |||||
quartet_tag = 'false' | |||||
elif num == 2: | |||||
if set(single_batch_sample) == set(['LCL5','LCL6']): | |||||
sister_tag = 'true' | |||||
quartet_tag = 'false' | |||||
elif num == 3: | |||||
if ('LCL5' in single_batch_sample) and ('LCL6' in single_batch_sample): | |||||
sister_tag = 'true' | |||||
quartet_tag = 'false' | |||||
elif num == 4: | |||||
if set(single_batch_sample) == set(['LCL5','LCL6','LCL7','LCL8']): | |||||
sister_tag = 'false' | |||||
quartet_tag = 'true' | |||||
sister_outfile = open('sister_tag','w') | |||||
quartet_outfile = open('quartet_tag','w') | |||||
sister_outfile.write(sister_tag) | |||||
quartet_outfile.write(quartet_tag) |
from __future__ import division | |||||
import sys, argparse, os | |||||
import pandas as pd | |||||
from collections import Counter | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls") | |||||
parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True) | |||||
args = parser.parse_args() | |||||
vcf = args.vcf | |||||
lcl5_outfile = open('LCL5_all_variants.txt','w') | |||||
filtered_outfile = open('LCL5_filtered_variants.txt','w') | |||||
vcf_dat = pd.read_table(vcf) | |||||
for row in vcf_dat.itertuples(): | |||||
lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216] | |||||
lcl5_vcf_gt = [x.split(':')[0] for x in lcl5_list] | |||||
lcl5_gt=[item.replace('./.', '0/0') for item in lcl5_vcf_gt] | |||||
gt_dict = Counter(lcl5_gt) | |||||
highest_gt = gt_dict.most_common(1) | |||||
candidate_gt = highest_gt[0][0] | |||||
freq_gt = highest_gt[0][1] | |||||
output = row._1 + '\t' + str(row.POS) + '\t' + '\t'.join(lcl5_gt) + '\n' | |||||
if (candidate_gt == '0/0') and (freq_gt == 27): | |||||
filtered_outfile.write(output) | |||||
else: | |||||
lcl5_outfile.write(output) | |||||
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$7"\t.\t.\t.\tGT\t"$6}' | grep -v '2_y' > LCL5.body | |||||
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$15"\t.\t.\t.\tGT\t"$14}' | grep -v '2_y' > LCL6.body | |||||
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$23"\t.\t.\t.\tGT\t"$22}' | grep -v '2_y' > LCL7.body | |||||
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$31"\t.\t.\t.\tGT\t"$30}' | grep -v '2_y' > LCL8.body | |||||
for i in *txt; do cat $i | awk '{ if ((length($3) == 1) && (length($4) == 1)) { print } }' | grep -v '#' | cut -f3,4 | sort |uniq -c | sed 's/\s\+/\t/g' | cut -f2 > $i.mut; done |
from __future__ import division | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to get final high confidence calls and information of all replicates") | |||||
parser.add_argument('-vcfInfo', '--vcfInfo', type=str, help='The txt file of variants information, this file is named as prefix__variant_quality_location.txt', required=True) | |||||
parser.add_argument('-mendelianInfo', '--mendelianInfo', type=str, help='The merged mendelian information of all samples', required=True) | |||||
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True) | |||||
args = parser.parse_args() | |||||
vcfInfo = args.vcfInfo | |||||
mendelianInfo = args.mendelianInfo | |||||
sample_name = args.sample_name | |||||
#GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL | |||||
vcf_header = '''##fileformat=VCFv4.2 | |||||
##fileDate=20200331 | |||||
##source=high_confidence_calls_intergration(choppy app) | |||||
##reference=GRCh38.d1.vd1 | |||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||||
##FORMAT=<ID=TWINS,Number=1,Type=Flag,Description="1 for sister consistent, 0 for sister different"> | |||||
##FORMAT=<ID=TRIO5,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL5 mendelian consistent, 0 for family violation"> | |||||
##FORMAT=<ID=TRIO6,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL6 mendelian consistent, 0 for family violation"> | |||||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth"> | |||||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth"> | |||||
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency"> | |||||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality"> | |||||
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth"> | |||||
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Mapping quality"> | |||||
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias"> | |||||
##FORMAT=<ID=QUAL,Number=1,Type=Float,Description="variant quality"> | |||||
##contig=<ID=chr1,length=248956422> | |||||
##contig=<ID=chr2,length=242193529> | |||||
##contig=<ID=chr3,length=198295559> | |||||
##contig=<ID=chr4,length=190214555> | |||||
##contig=<ID=chr5,length=181538259> | |||||
##contig=<ID=chr6,length=170805979> | |||||
##contig=<ID=chr7,length=159345973> | |||||
##contig=<ID=chr8,length=145138636> | |||||
##contig=<ID=chr9,length=138394717> | |||||
##contig=<ID=chr10,length=133797422> | |||||
##contig=<ID=chr11,length=135086622> | |||||
##contig=<ID=chr12,length=133275309> | |||||
##contig=<ID=chr13,length=114364328> | |||||
##contig=<ID=chr14,length=107043718> | |||||
##contig=<ID=chr15,length=101991189> | |||||
##contig=<ID=chr16,length=90338345> | |||||
##contig=<ID=chr17,length=83257441> | |||||
##contig=<ID=chr18,length=80373285> | |||||
##contig=<ID=chr19,length=58617616> | |||||
##contig=<ID=chr20,length=64444167> | |||||
##contig=<ID=chr21,length=46709983> | |||||
##contig=<ID=chr22,length=50818468> | |||||
##contig=<ID=chrX,length=156040895> | |||||
''' | |||||
# output file | |||||
file_name = sample_name + '_mendelian_vcfInfo.vcf' | |||||
outfile = open(file_name,'w') | |||||
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '\n' | |||||
outfile.write(vcf_header) | |||||
outfile.write(outputcolumn) | |||||
# input files | |||||
vcf_info = pd.read_table(vcfInfo) | |||||
mendelian_info = pd.read_table(mendelianInfo) | |||||
merged_df = pd.merge(vcf_info, mendelian_info, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS']) | |||||
merged_df = merged_df.fillna('.') | |||||
# | |||||
def parse_INFO(info): | |||||
strings = info.strip().split(';') | |||||
keys = [] | |||||
values = [] | |||||
for i in strings: | |||||
kv = i.split('=') | |||||
if kv[0] == 'DB': | |||||
keys.append('DB') | |||||
values.append('1') | |||||
else: | |||||
keys.append(kv[0]) | |||||
values.append(kv[1]) | |||||
infoDict = dict(zip(keys, values)) | |||||
return infoDict | |||||
# | |||||
for row in merged_df.itertuples(): | |||||
if row[18] != '.': | |||||
# format | |||||
# GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL | |||||
FORMAT_x = row[10].split(':') | |||||
ALT = int(FORMAT_x[1].split(',')[1]) | |||||
if int(FORMAT_x[2]) != 0: | |||||
AF = round(ALT/int(FORMAT_x[2]),2) | |||||
else: | |||||
AF = '.' | |||||
INFO_x = parse_INFO(row.INFO_x) | |||||
if FORMAT_x[2] == '0': | |||||
INFO_x['QD'] = '.' | |||||
else: | |||||
pass | |||||
FORMAT = row[18] + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x) | |||||
# outline | |||||
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_y + '\t' + row.ALT_y + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + FORMAT + '\n' | |||||
else: | |||||
rawGT = row[10].split(':') | |||||
FORMAT_x = row[10].split(':') | |||||
ALT = int(FORMAT_x[1].split(',')[1]) | |||||
if int(FORMAT_x[2]) != 0: | |||||
AF = round(ALT/int(FORMAT_x[2]),2) | |||||
else: | |||||
AF = '.' | |||||
INFO_x = parse_INFO(row.INFO_x) | |||||
if FORMAT_x[2] == '0': | |||||
INFO_x['QD'] = '.' | |||||
else: | |||||
pass | |||||
FORMAT = '.:.:.:.' + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x) + ':' + rawGT[0] | |||||
# outline | |||||
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_x + '\t' + row.ALT_x + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT' + '\t' + FORMAT + '\n' | |||||
outfile.write(outline) |
from __future__ import division | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information") | |||||
parser.add_argument('-LCL5', '--LCL5', type=str, help='LCL5 family info', required=True) | |||||
parser.add_argument('-LCL6', '--LCL6', type=str, help='LCL6 family info', required=True) | |||||
parser.add_argument('-family', '--family', type=str, help='family name', required=True) | |||||
args = parser.parse_args() | |||||
lcl5 = args.LCL5 | |||||
lcl6 = args.LCL6 | |||||
family = args.family | |||||
# output file | |||||
family_name = family + '.txt' | |||||
family_file = open(family_name,'w') | |||||
# input files | |||||
lcl5_dat = pd.read_table(lcl5) | |||||
lcl6_dat = pd.read_table(lcl6) | |||||
merged_df = pd.merge(lcl5_dat, lcl6_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS']) | |||||
def alt_seq(alt, genotype): | |||||
if genotype == './.': | |||||
seq = './.' | |||||
elif genotype == '0/0': | |||||
seq = '0/0' | |||||
else: | |||||
alt = alt.split(',') | |||||
genotype = genotype.split('/') | |||||
if genotype[0] == '0': | |||||
allele2 = alt[int(genotype[1]) - 1] | |||||
seq = '0/' + allele2 | |||||
else: | |||||
allele1 = alt[int(genotype[0]) - 1] | |||||
allele2 = alt[int(genotype[1]) - 1] | |||||
seq = allele1 + '/' + allele2 | |||||
return seq | |||||
for row in merged_df.itertuples(): | |||||
# correction of multiallele | |||||
if pd.isnull(row.INFO_x) == True or pd.isnull(row.INFO_y) == True: | |||||
mendelian = '.' | |||||
else: | |||||
lcl5_seq = alt_seq(row.ALT_x, row.CHILD_x) | |||||
lcl6_seq = alt_seq(row.ALT_y, row.CHILD_y) | |||||
if lcl5_seq == lcl6_seq: | |||||
mendelian = '1' | |||||
else: | |||||
mendelian = '0' | |||||
if pd.isnull(row.INFO_x) == True: | |||||
mendelian = mendelian + ':.' | |||||
else: | |||||
mendelian = mendelian + ':' + row.INFO_x.split('=')[1] | |||||
if pd.isnull(row.INFO_y) == True: | |||||
mendelian = mendelian + ':.' | |||||
else: | |||||
mendelian = mendelian + ':' + row.INFO_y.split('=')[1] | |||||
outline = row._1 + '\t' + str(row.POS) + '\t' + mendelian + '\n' | |||||
family_file.write(outline) |
from __future__ import division | |||||
import pandas as pd | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information") | |||||
parser.add_argument('-LCL5', '--LCL5', type=str, help='LCL5 family info', required=True) | |||||
parser.add_argument('-LCL6', '--LCL6', type=str, help='LCL6 family info', required=True) | |||||
parser.add_argument('-genotype', '--genotype', type=str, help='Genotype information of a set of four family members', required=True) | |||||
parser.add_argument('-family', '--family', type=str, help='family name', required=True) | |||||
args = parser.parse_args() | |||||
lcl5 = args.LCL5 | |||||
lcl6 = args.LCL6 | |||||
genotype = args.genotype | |||||
family = args.family | |||||
# output file | |||||
family_name = family + '.txt' | |||||
family_file = open(family_name,'w') | |||||
summary_name = family + '.summary.txt' | |||||
summary_file = open(summary_name,'w') | |||||
# input files | |||||
lcl5_dat = pd.read_table(lcl5) | |||||
lcl6_dat = pd.read_table(lcl6) | |||||
genotype_dat = pd.read_table(genotype) | |||||
merged_df = pd.merge(lcl5_dat, lcl6_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS']) | |||||
merged_genotype_df = pd.merge(merged_df, genotype_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS']) | |||||
merged_genotype_df_sub = merged_genotype_df.iloc[:,[0,1,23,24,29,30,31,32,7,17]] | |||||
merged_genotype_df_sub.columns = ['CHROM', 'POS', 'REF', 'ALT','LCL5','LCL6','LCL7','LCL8', 'TRIO5', 'TRIO6'] | |||||
sister_same = 0 | |||||
sister_diff = 0 | |||||
family_all = 0 | |||||
family_mendelian = 0 | |||||
for row in merged_genotype_df_sub.itertuples(): | |||||
# sister | |||||
if row.LCL5 == row.LCL6: | |||||
if row.LCL5 == './.': | |||||
mendelian = 'noInfo' | |||||
sister_count = "no" | |||||
elif row.LCL5 == '0/0': | |||||
mendelian = 'Ref' | |||||
sister_count = "no" | |||||
else: | |||||
mendelian = '1' | |||||
sister_count = "yes_same" | |||||
else: | |||||
mendelian = '0' | |||||
if (row.LCL5 == './.' or row.LCL5 == '0/0') and (row.LCL6 == './.' or row.LCL6 == '0/0'): | |||||
sister_count = "no" | |||||
else: | |||||
sister_count = "yes_diff" | |||||
if sister_count == 'yes_same': | |||||
sister_same += 1 | |||||
elif sister_count == 'yes_diff': | |||||
sister_diff += 1 | |||||
else: | |||||
pass | |||||
# family trio5 | |||||
if row.LCL5 == row. LCL7 == row.LCL8 == './.': | |||||
mendelian = mendelian + ':noInfo' | |||||
elif row.LCL5 == row. LCL7 == row.LCL8 == '0/0': | |||||
mendelian = mendelian + ':Ref' | |||||
elif pd.isnull(row.TRIO5) == True: | |||||
mendelian = mendelian + ':unVBT' | |||||
else: | |||||
mendelian = mendelian + ':' + row.TRIO5.split('=')[1] | |||||
# family trio6 | |||||
if row.LCL6 == row.LCL7 == row.LCL8 == './.': | |||||
mendelian = mendelian + ':noInfo' | |||||
elif row.LCL6 == row. LCL7 == row.LCL8 == '0/0': | |||||
mendelian = mendelian + ':Ref' | |||||
elif pd.isnull(row.TRIO6) == True: | |||||
mendelian = mendelian + ':unVBT' | |||||
else: | |||||
mendelian = mendelian + ':' + row.TRIO6.split('=')[1] | |||||
# not count into family | |||||
if (row.LCL5 == './.' or row.LCL5 == '0/0') and (row.LCL6 == './.' or row.LCL6 == '0/0') and (row.LCL7 == './.' or row.LCL7 == '0/0') and (row.LCL8 == './.' or row.LCL8 == '0/0'): | |||||
mendelian_count = "no" | |||||
else: | |||||
mendelian_count = "yes" | |||||
outline = row.CHROM + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + row.LCL5 + '\t' + row.LCL6 + '\t' + row.LCL7 + '\t' + row.LCL8 + '\t' + str(row.TRIO5) + '\t' + str(row.TRIO6) + '\t' + str(mendelian) + '\t' + str(mendelian_count) + '\t' + str(sister_count) + '\n' | |||||
family_file.write(outline) | |||||
if mendelian_count == 'yes': | |||||
family_all += 1 | |||||
else: | |||||
pass | |||||
if mendelian == '1:1:1': | |||||
family_mendelian += 1 | |||||
elif mendelian == 'Ref:1:1': | |||||
family_mendelian += 1 | |||||
else: | |||||
pass | |||||
sister = sister_same/(sister_same + sister_diff) | |||||
quartet = family_mendelian/family_all | |||||
outcolumn = 'Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet\n' | |||||
outResult = family + '\t' + str(sister) + '\t' + str(quartet) + '\n' | |||||
summary_file.write(outcolumn) | |||||
summary_file.write(outResult) | |||||
# import modules | |||||
import numpy as np | |||||
import pandas as pd | |||||
from sklearn import svm | |||||
from sklearn import preprocessing | |||||
import sys, argparse, os | |||||
from vcf2bed import position_to_bed,padding_region | |||||
parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome") | |||||
parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True) | |||||
parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True) | |||||
parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True) | |||||
args = parser.parse_args() | |||||
# Rename input: | |||||
train_input = args.trainDataset | |||||
test_input = args.testDataset | |||||
sample_name = args.sampleName | |||||
# default columns, which will be included in the included in the calssifier | |||||
chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY'] | |||||
feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af'] | |||||
feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af'] | |||||
# import datasets sepearate the records with or without BaseQRankSum annotation, etc. | |||||
def load_dat(dat_file_name): | |||||
dat = pd.read_table(dat_file_name) | |||||
dat['DB'] = dat['DB'].fillna(0) | |||||
dat = dat[dat['DP'] != 0] | |||||
dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP']) | |||||
homo_rows = dat[dat['BaseQRankSum'].isnull()] | |||||
heter_rows = dat[dat['BaseQRankSum'].notnull()] | |||||
return homo_rows,heter_rows | |||||
train_homo,train_heter = load_dat(train_input) | |||||
test_homo,test_heter = load_dat(test_input) | |||||
clf = svm.OneClassSVM(nu=0.05,kernel='rbf', gamma='auto_deprecated',cache_size=500) | |||||
def prepare_dat(train_dat,test_dat,feature_cols,chromo): | |||||
chr_train = train_dat[train_dat['chromo'] == chromo] | |||||
chr_test = test_dat[test_dat['chromo'] == chromo] | |||||
train_dat = chr_train.loc[:,feature_cols] | |||||
test_dat = chr_test.loc[:,feature_cols] | |||||
train_dat_scaled = preprocessing.scale(train_dat) | |||||
test_dat_scaled = preprocessing.scale(test_dat) | |||||
return chr_test,train_dat_scaled,test_dat_scaled | |||||
def oneclass(X_train,X_test,chr_test): | |||||
clf.fit(X_train) | |||||
y_pred_test = clf.predict(X_test) | |||||
test_true_dat = chr_test[y_pred_test == 1] | |||||
test_false_dat = chr_test[y_pred_test == -1] | |||||
return test_true_dat,test_false_dat | |||||
predicted_true = pd.DataFrame(columns=train_homo.columns) | |||||
predicted_false = pd.DataFrame(columns=train_homo.columns) | |||||
for chromo in chromosome: | |||||
# homo datasets | |||||
chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo) | |||||
test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo) | |||||
predicted_true = predicted_true.append(test_true_homo) | |||||
predicted_false = predicted_false.append(test_false_homo) | |||||
# heter datasets | |||||
chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo) | |||||
test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter) | |||||
predicted_true = predicted_true.append(test_true_heter) | |||||
predicted_false = predicted_false.append(test_false_heter) | |||||
predicted_true_filename = sample_name + '_predicted_true.txt' | |||||
predicted_false_filename = sample_name + '_predicted_false.txt' | |||||
predicted_true.to_csv(predicted_true_filename,sep='\t',index=False) | |||||
predicted_false.to_csv(predicted_false_filename,sep='\t',index=False) | |||||
# output the bed file and padding bed region 50bp | |||||
predicted_true_bed_filename = sample_name + '_predicted_true.bed' | |||||
predicted_false_bed_filename = sample_name + '_predicted_false.bed' | |||||
padding_filename = sample_name + '_padding.bed' | |||||
predicted_true_bed = open(predicted_true_bed_filename,'w') | |||||
predicted_false_bed = open(predicted_false_bed_filename,'w') | |||||
padding = open(padding_filename,'w') | |||||
# | |||||
for index,row in predicted_false.iterrows(): | |||||
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt']) | |||||
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n' | |||||
predicted_false_bed.write(outline_pos) | |||||
chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50) | |||||
outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n' | |||||
outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n' | |||||
padding.write(outline_pad_1) | |||||
padding.write(outline_pad_2) | |||||
for index,row in predicted_true.iterrows(): | |||||
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt']) | |||||
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n' | |||||
predicted_true_bed.write(outline_pos) | |||||
# import modules | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style") | |||||
parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation', required=True) | |||||
parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file', required=True) | |||||
args = parser.parse_args() | |||||
# Rename input: | |||||
inputFile = args.familyVCF | |||||
family_name = args.familyName | |||||
# output filename | |||||
LCL5_name = family_name + '.LCL5.vcf' | |||||
LCL5file = open(LCL5_name,'w') | |||||
LCL6_name = family_name + '.LCL6.vcf' | |||||
LCL6file = open(LCL6_name,'w') | |||||
LCL7_name = family_name + '.LCL7.vcf' | |||||
LCL7file = open(LCL7_name,'w') | |||||
LCL8_name = family_name + '.LCL8.vcf' | |||||
LCL8file = open(LCL8_name,'w') | |||||
family_filename = family_name + '.vcf' | |||||
familyfile = open(family_filename,'w') | |||||
# default columns, which will be included in the included in the calssifier | |||||
vcfheader = '''##fileformat=VCFv4.2 | |||||
##FILTER=<ID=PASS,Description="the same genotype between twin sister and mendelian consistent in 578 and 678"> | |||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||||
##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent"> | |||||
##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||||
##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||||
##contig=<ID=chr1,length=248956422> | |||||
##contig=<ID=chr2,length=242193529> | |||||
##contig=<ID=chr3,length=198295559> | |||||
##contig=<ID=chr4,length=190214555> | |||||
##contig=<ID=chr5,length=181538259> | |||||
##contig=<ID=chr6,length=170805979> | |||||
##contig=<ID=chr7,length=159345973> | |||||
##contig=<ID=chr8,length=145138636> | |||||
##contig=<ID=chr9,length=138394717> | |||||
##contig=<ID=chr10,length=133797422> | |||||
##contig=<ID=chr11,length=135086622> | |||||
##contig=<ID=chr12,length=133275309> | |||||
##contig=<ID=chr13,length=114364328> | |||||
##contig=<ID=chr14,length=107043718> | |||||
##contig=<ID=chr15,length=101991189> | |||||
##contig=<ID=chr16,length=90338345> | |||||
##contig=<ID=chr17,length=83257441> | |||||
##contig=<ID=chr18,length=80373285> | |||||
##contig=<ID=chr19,length=58617616> | |||||
##contig=<ID=chr20,length=64444167> | |||||
##contig=<ID=chr21,length=46709983> | |||||
##contig=<ID=chr22,length=50818468> | |||||
##contig=<ID=chrX,length=156040895> | |||||
''' | |||||
# write VCF | |||||
LCL5colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL5'+'\n' | |||||
LCL5file.write(vcfheader) | |||||
LCL5file.write(LCL5colname) | |||||
LCL6colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL6'+'\n' | |||||
LCL6file.write(vcfheader) | |||||
LCL6file.write(LCL6colname) | |||||
LCL7colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL7'+'\n' | |||||
LCL7file.write(vcfheader) | |||||
LCL7file.write(LCL7colname) | |||||
LCL8colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL8'+'\n' | |||||
LCL8file.write(vcfheader) | |||||
LCL8file.write(LCL8colname) | |||||
familycolname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+'LCL5\t'+'LCL6\t'+'LCL7\t'+'LCL8'+'\n' | |||||
familyfile.write(vcfheader) | |||||
familyfile.write(familycolname) | |||||
# reform VCF | |||||
def process(oneLine): | |||||
line = oneLine.rstrip() | |||||
strings = line.strip().split('\t') | |||||
# replace . | |||||
# LCL6 uniq | |||||
if strings[11] == '.': | |||||
strings[11] = '0/0' | |||||
strings[9] = strings[12] | |||||
strings[10] = strings[13] | |||||
else: | |||||
pass | |||||
# LCL5 uniq | |||||
if strings[14] == '.': | |||||
strings[14] = '0/0' | |||||
strings[12] = strings[9] | |||||
strings[13] = strings[10] | |||||
else: | |||||
pass | |||||
# sister | |||||
if strings[11] == strings[14]: | |||||
add_format = ":1" | |||||
else: | |||||
add_format = ":0" | |||||
# trioLCL5 | |||||
if strings[15] == 'MD=1': | |||||
add_format = add_format + ":1" | |||||
else: | |||||
add_format = add_format + ":0" | |||||
# trioLCL6 | |||||
if strings[7] == 'MD=1': | |||||
add_format = add_format + ":1" | |||||
else: | |||||
add_format = add_format + ":0" | |||||
# filter | |||||
if (strings[11] == strings[14]) and (strings[15] == 'MD=1') and (strings[7] == 'MD=1'): | |||||
strings[6] = 'PASS' | |||||
else: | |||||
strings[6] = '.' | |||||
# output LCL5 | |||||
LCL5outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format + '\n' | |||||
LCL5file.write(LCL5outLine) | |||||
# output LCL6 | |||||
LCL6outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[11] + add_format + '\n' | |||||
LCL6file.write(LCL6outLine) | |||||
# output LCL7 | |||||
LCL7outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[10] + add_format + '\n' | |||||
LCL7file.write(LCL7outLine) | |||||
# output LCL8 | |||||
LCL8outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[9] + add_format + '\n' | |||||
LCL8file.write(LCL8outLine) | |||||
# output family | |||||
familyoutLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+ '.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format +'\t' + strings[11] + add_format + '\t' + strings[10] + add_format +'\t' + strings[9] + add_format + '\n' | |||||
familyfile.write(familyoutLine) | |||||
for line in fileinput.input(inputFile): | |||||
m = re.match('^\#',line) | |||||
if m is not None: | |||||
pass | |||||
else: | |||||
process(line) | |||||
from __future__ import division | |||||
from glob import glob | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
import pandas as pd | |||||
from operator import itemgetter | |||||
from collections import Counter | |||||
from itertools import islice | |||||
from numpy import * | |||||
import statistics | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls") | |||||
parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file', required=True) | |||||
parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True) | |||||
args = parser.parse_args() | |||||
prefix = args.prefix | |||||
vcf = args.vcf | |||||
# input files | |||||
vcf_dat = pd.read_table(vcf) | |||||
# all info | |||||
all_file_name = prefix + "_all_summary.txt" | |||||
all_sample_outfile = open(all_file_name,'w') | |||||
all_info_col = 'CHROM\tPOS\tREF\tALT\tLCL5_consensus_calls\tLCL5_detect_number\tLCL5_same_diff\tLCL6_consensus_calls\tLCL6_detect_number\tLCl6_same_diff\tLCL7_consensus_calls\tLCL7_detect_number\tLCL7_same_diff\tLCL8_consensus_calls\tLCL8_detect_number\tLCL8_same_diff\n' | |||||
all_sample_outfile.write(all_info_col) | |||||
# filtered info | |||||
vcf_header = '''##fileformat=VCFv4.2 | |||||
##fileDate=20200501 | |||||
##source=high_confidence_calls_intergration(choppy app) | |||||
##reference=GRCh38.d1.vd1 | |||||
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes"> | |||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||||
##contig=<ID=chr1,length=248956422> | |||||
##contig=<ID=chr2,length=242193529> | |||||
##contig=<ID=chr3,length=198295559> | |||||
##contig=<ID=chr4,length=190214555> | |||||
##contig=<ID=chr5,length=181538259> | |||||
##contig=<ID=chr6,length=170805979> | |||||
##contig=<ID=chr7,length=159345973> | |||||
##contig=<ID=chr8,length=145138636> | |||||
##contig=<ID=chr9,length=138394717> | |||||
##contig=<ID=chr10,length=133797422> | |||||
##contig=<ID=chr11,length=135086622> | |||||
##contig=<ID=chr12,length=133275309> | |||||
##contig=<ID=chr13,length=114364328> | |||||
##contig=<ID=chr14,length=107043718> | |||||
##contig=<ID=chr15,length=101991189> | |||||
##contig=<ID=chr16,length=90338345> | |||||
##contig=<ID=chr17,length=83257441> | |||||
##contig=<ID=chr18,length=80373285> | |||||
##contig=<ID=chr19,length=58617616> | |||||
##contig=<ID=chr20,length=64444167> | |||||
##contig=<ID=chr21,length=46709983> | |||||
##contig=<ID=chr22,length=50818468> | |||||
##contig=<ID=chrX,length=156040895> | |||||
''' | |||||
consensus_file_name = prefix + "_consensus.vcf" | |||||
consensus_outfile = open(consensus_file_name,'w') | |||||
consensus_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5_consensus_call\tLCL6_consensus_call\tLCL7_consensus_call\tLCL8_consensus_call\n' | |||||
consensus_outfile.write(vcf_header) | |||||
consensus_outfile.write(consensus_col) | |||||
# function | |||||
def decide_by_rep(vcf_list): | |||||
consensus_rep = '' | |||||
gt = [x.split(':')[0] for x in vcf_list] | |||||
gt_num_dict = Counter(gt) | |||||
highest_gt = gt_num_dict.most_common(1) | |||||
candidate_gt = highest_gt[0][0] | |||||
freq_gt = highest_gt[0][1] | |||||
if freq_gt >= 2: | |||||
consensus_rep = candidate_gt | |||||
else: | |||||
consensus_rep = 'inconGT' | |||||
return consensus_rep | |||||
def consensus_call(vcf_info_list): | |||||
consensus_call = '.' | |||||
detect_number = '.' | |||||
same_diff = '.' | |||||
# pcr | |||||
SEQ2000 = decide_by_rep(vcf_info_list[0:3]) | |||||
XTen_ARD = decide_by_rep(vcf_info_list[18:21]) | |||||
XTen_NVG = decide_by_rep(vcf_info_list[21:24]) | |||||
XTen_WUX = decide_by_rep(vcf_info_list[24:27]) | |||||
Nova_WUX = decide_by_rep(vcf_info_list[15:18]) | |||||
pcr_sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX,Nova_WUX] | |||||
pcr_sequence_dict = Counter(pcr_sequence_site) | |||||
pcr_highest_sequence = pcr_sequence_dict.most_common(1) | |||||
pcr_candidate_sequence = pcr_highest_sequence[0][0] | |||||
pcr_freq_sequence = pcr_highest_sequence[0][1] | |||||
if pcr_freq_sequence > 3: | |||||
pcr_consensus = pcr_candidate_sequence | |||||
else: | |||||
pcr_consensus = 'inconSequenceSite' | |||||
# pcr-free | |||||
T7_WGE = decide_by_rep(vcf_info_list[3:6]) | |||||
Nova_ARD_1 = decide_by_rep(vcf_info_list[6:9]) | |||||
Nova_ARD_2 = decide_by_rep(vcf_info_list[9:12]) | |||||
Nova_BRG = decide_by_rep(vcf_info_list[12:15]) | |||||
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG] | |||||
sequence_dict = Counter(sequence_site) | |||||
highest_sequence = sequence_dict.most_common(1) | |||||
candidate_sequence = highest_sequence[0][0] | |||||
freq_sequence = highest_sequence[0][1] | |||||
if freq_sequence > 2: | |||||
pcr_free_consensus = candidate_sequence | |||||
else: | |||||
pcr_free_consensus = 'inconSequenceSite' | |||||
gt = [x.split(':')[0] for x in vcf_info_list] | |||||
gt = [x.replace('./.','.') for x in gt] | |||||
detected_num = 27 - gt.count('.') | |||||
gt_remain = [e for e in gt if e not in {'.'}] | |||||
gt_set = set(gt_remain) | |||||
if len(gt_set) == 1: | |||||
same_diff = 'same' | |||||
else: | |||||
same_diff = 'diff' | |||||
tag = ['inconGT','inconSequenceSite'] | |||||
if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag): | |||||
consensus_call = pcr_consensus | |||||
elif (pcr_consensus in tag) and (pcr_free_consensus in tag): | |||||
consensus_call = 'notAgree' | |||||
else: | |||||
consensus_call = 'notConsensus' | |||||
return consensus_call, detected_num, same_diff | |||||
elif (pcr_consensus in tag) and (pcr_free_consensus in tag): | |||||
consensus_call = 'filtered' | |||||
elif ((pcr_consensus == './.') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != './.')): | |||||
consensus_call = 'pcr-free-speicifc' | |||||
elif ((pcr_consensus != './.') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == './.')): | |||||
consensus_call = 'pcr-speicifc' | |||||
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'): | |||||
consensus_call = '0/0' | |||||
else: | |||||
consensus_call = 'filtered' | |||||
for row in vcf_dat.itertuples(): | |||||
# length | |||||
#alt | |||||
if ',' in row.ALT: | |||||
alt = row.ALT.split(',') | |||||
alt_len = [len(i) for i in alt] | |||||
alt_max = max(alt_len) | |||||
else: | |||||
alt_max = len(row.ALT) | |||||
#ref | |||||
ref_len = len(row.REF) | |||||
if (alt_max > 50) or (ref_len > 50): | |||||
pass | |||||
else: | |||||
# consensus | |||||
lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216] | |||||
lcl6_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216] | |||||
lcl7_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216] | |||||
lcl8_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216] | |||||
# LCL5 | |||||
LCL5_consensus_call, LCL5_detected_num, LCL5_same_diff = consensus_call(lcl5_list) | |||||
# LCL6 | |||||
LCL6_consensus_call, LCL6_detected_num, LCL6_same_diff = consensus_call(lcl6_list) | |||||
# LCL7 | |||||
LCL7_consensus_call, LCL7_detected_num, LCL7_same_diff = consensus_call(lcl7_list) | |||||
# LCL8 | |||||
LCL8_consensus_call, LCL8_detected_num, LCL8_same_diff = consensus_call(lcl8_list) | |||||
# all data | |||||
all_output = row._1 + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + LCL5_consensus_call + '\t' + str(LCL5_detected_num) + '\t' + LCL5_same_diff + '\t' +\ | |||||
LCL6_consensus_call + '\t' + str(LCL6_detected_num) + '\t' + LCL6_same_diff + '\t' +\ | |||||
LCL7_consensus_call + '\t' + str(LCL7_detected_num) + '\t' + LCL7_same_diff + '\t' +\ | |||||
LCL8_consensus_call + '\t' + str(LCL8_detected_num) + '\t' + LCL8_same_diff + '\n' | |||||
all_sample_outfile.write(all_output) | |||||
#consensus vcf | |||||
one_position = [LCL5_consensus_call,LCL6_consensus_call,LCL7_consensus_call,LCL8_consensus_call] | |||||
if ('notConsensus' in one_position) or (((len(set(one_position)) == 1) and ('./.' in set(one_position))) or ((len(set(one_position)) == 1) and ('0/0' in set(one_position))) or ((len(set(one_position)) == 2) and ('0/0' in set(one_position) and ('./.' in set(one_position))))): | |||||
pass | |||||
else: | |||||
consensus_output = row._1 + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + row.ALT + '\t' + '.' + '\t' + '.' + '\t' +'.' + '\t' + 'GT' + '\t' + LCL5_consensus_call + '\t' + LCL6_consensus_call + '\t' + LCL7_consensus_call + '\t' + LCL8_consensus_call +'\n' | |||||
consensus_outfile.write(consensus_output) | |||||
import re | |||||
def position_to_bed(chromo,pos,ref,alt): | |||||
# snv | |||||
# Start cooridinate BED = start coordinate VCF - 1 | |||||
# End cooridinate BED = start coordinate VCF | |||||
if len(ref) == 1 and len(alt) == 1: | |||||
StartPos = int(pos) -1 | |||||
EndPos = int(pos) | |||||
# deletions | |||||
# Start cooridinate BED = start coordinate VCF - 1 | |||||
# End cooridinate BED = start coordinate VCF + (reference length - alternate length) | |||||
elif len(ref) > 1 and len(alt) == 1: | |||||
StartPos = int(pos) - 1 | |||||
EndPos = int(pos) + (len(ref) - 1) | |||||
#insertions | |||||
# For insertions: | |||||
# Start cooridinate BED = start coordinate VCF - 1 | |||||
# End cooridinate BED = start coordinate VCF + (alternate length - reference length) | |||||
else: | |||||
StartPos = int(pos) - 1 | |||||
EndPos = int(pos) + (len(alt) - 1) | |||||
return chromo,StartPos,EndPos | |||||
def padding_region(chromo,pos1,pos2,padding): | |||||
StartPos1 = pos1 - padding | |||||
EndPos1 = pos1 | |||||
StartPos2 = pos2 | |||||
EndPos2 = pos2 + padding | |||||
return chromo,StartPos1,EndPos1,StartPos2,EndPos2 |
from __future__ import division | |||||
from glob import glob | |||||
import sys, argparse, os | |||||
import fileinput | |||||
import re | |||||
import pandas as pd | |||||
from operator import itemgetter | |||||
from collections import Counter | |||||
from itertools import islice | |||||
from numpy import * | |||||
import statistics | |||||
# input arguments | |||||
parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls") | |||||
parser.add_argument('-folder', '--folder', type=str, help='directory that holds all the mendelian info', required=True) | |||||
parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True) | |||||
args = parser.parse_args() | |||||
folder = args.folder | |||||
vcf = args.vcf | |||||
# input files | |||||
folder = folder + '/*.txt' | |||||
filenames = glob(folder) | |||||
dataframes = [] | |||||
for filename in filenames: | |||||
dataframes.append(pd.read_table(filename,header=None)) | |||||
dfs = [df.set_index([0, 1]) for df in dataframes] | |||||
merged_mendelian = pd.concat(dfs, axis=1).reset_index() | |||||
family_name = [i.split('/')[-1].replace('.txt','') for i in filenames] | |||||
columns = ['CHROM','POS'] + family_name | |||||
merged_mendelian.columns = columns | |||||
vcf_dat = pd.read_table(vcf) | |||||
merged_df = pd.merge(merged_mendelian, vcf_dat, how='outer', left_on=['CHROM','POS'], right_on = ['#CHROM','POS']) | |||||
merged_df = merged_df.fillna('nan') | |||||
vcf_header = '''##fileformat=VCFv4.2 | |||||
##fileDate=20200501 | |||||
##source=high_confidence_calls_intergration(choppy app) | |||||
##reference=GRCh38.d1.vd1 | |||||
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes"> | |||||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples"> | |||||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples"> | |||||
##contig=<ID=chr1,length=248956422> | |||||
##contig=<ID=chr2,length=242193529> | |||||
##contig=<ID=chr3,length=198295559> | |||||
##contig=<ID=chr4,length=190214555> | |||||
##contig=<ID=chr5,length=181538259> | |||||
##contig=<ID=chr6,length=170805979> | |||||
##contig=<ID=chr7,length=159345973> | |||||
##contig=<ID=chr8,length=145138636> | |||||
##contig=<ID=chr9,length=138394717> | |||||
##contig=<ID=chr10,length=133797422> | |||||
##contig=<ID=chr11,length=135086622> | |||||
##contig=<ID=chr12,length=133275309> | |||||
##contig=<ID=chr13,length=114364328> | |||||
##contig=<ID=chr14,length=107043718> | |||||
##contig=<ID=chr15,length=101991189> | |||||
##contig=<ID=chr16,length=90338345> | |||||
##contig=<ID=chr17,length=83257441> | |||||
##contig=<ID=chr18,length=80373285> | |||||
##contig=<ID=chr19,length=58617616> | |||||
##contig=<ID=chr20,length=64444167> | |||||
##contig=<ID=chr21,length=46709983> | |||||
##contig=<ID=chr22,length=50818468> | |||||
##contig=<ID=chrX,length=156040895> | |||||
''' | |||||
# output files | |||||
benchmark_LCL5 = open('LCL5_voted.vcf','w') | |||||
benchmark_LCL6 = open('LCL6_voted.vcf','w') | |||||
benchmark_LCL7 = open('LCL7_voted.vcf','w') | |||||
benchmark_LCL8 = open('LCL8_voted.vcf','w') | |||||
all_sample_outfile = open('all_sample_information.txt','w') | |||||
# write VCF | |||||
LCL5_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5_benchmark_calls\n' | |||||
LCL6_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL6_benchmark_calls\n' | |||||
LCL7_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL7_benchmark_calls\n' | |||||
LCL8_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL8_benchmark_calls\n' | |||||
benchmark_LCL5.write(vcf_header) | |||||
benchmark_LCL5.write(LCL5_col) | |||||
benchmark_LCL6.write(vcf_header) | |||||
benchmark_LCL6.write(LCL6_col) | |||||
benchmark_LCL7.write(vcf_header) | |||||
benchmark_LCL7.write(LCL7_col) | |||||
benchmark_LCL8.write(vcf_header) | |||||
benchmark_LCL8.write(LCL8_col) | |||||
# all info | |||||
all_info_col = 'CHROM\tPOS\tLCL5_pcr_consensus\tLCL5_pcr_free_consensus\tLCL5_mendelian_num\tLCL5_consensus_call\tLCL5_consensus_alt_seq\tLCL5_alt\tLCL5_dp\tLCL5_detected_num\tLCL6_pcr_consensus\tLCL6_pcr_free_consensus\tLCL6_mendelian_num\tLCL6_consensus_call\tLCL6_consensus_alt_seq\tLCL6_alt\tLCL6_dp\tLCL6_detected_num\tLCL7_pcr_consensus\tLCL7_pcr_free_consensus\tLCL7_mendelian_num\t LCL7_consensus_call\tLCL7_consensus_alt_seq\tLCL7_alt\tLCL7_dp\tLCL7_detected_num\tLCL8_pcr_consensus\tLCL8_pcr_free_consensus\tLCL8_mendelian_num\tLCL8_consensus_call\tLCL8_consensus_alt_seq\tLCL8_alt\tLCL8_dp\tLCL8_detected_num\n' | |||||
all_sample_outfile.write(all_info_col) | |||||
# function | |||||
def decide_by_rep(vcf_list,mendelian_list): | |||||
consensus_rep = '' | |||||
gt = [x.split(':')[0] for x in vcf_list] | |||||
# mendelian consistent? | |||||
mendelian_dict = Counter(mendelian_list) | |||||
highest_mendelian = mendelian_dict.most_common(1) | |||||
candidate_mendelian = highest_mendelian[0][0] | |||||
freq_mendelian = highest_mendelian[0][1] | |||||
if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2): | |||||
con_loc = [i for i in range(len(mendelian_list)) if mendelian_list[i] == '1:1:1'] | |||||
gt_con = itemgetter(*con_loc)(gt) | |||||
gt_num_dict = Counter(gt_con) | |||||
highest_gt = gt_num_dict.most_common(1) | |||||
candidate_gt = highest_gt[0][0] | |||||
freq_gt = highest_gt[0][1] | |||||
if (candidate_gt != './.') and (freq_gt >= 2): | |||||
consensus_rep = candidate_gt | |||||
elif (candidate_gt == './.') and (freq_gt >= 2): | |||||
consensus_rep = 'noGTInfo' | |||||
else: | |||||
consensus_rep = 'inconGT' | |||||
elif (candidate_mendelian == 'nan') and (freq_mendelian >= 2): | |||||
consensus_rep = 'noMenInfo' | |||||
else: | |||||
consensus_rep = 'inconMen' | |||||
return consensus_rep | |||||
def consensus_call(vcf_info_list,mendelian_list,alt_seq): | |||||
pcr_consensus = '.' | |||||
pcr_free_consensus = '.' | |||||
mendelian_num = '.' | |||||
consensus_call = '.' | |||||
consensus_alt_seq = '.' | |||||
# pcr | |||||
SEQ2000 = decide_by_rep(vcf_info_list[0:3],mendelian_list[0:3]) | |||||
XTen_ARD = decide_by_rep(vcf_info_list[18:21],mendelian_list[18:21]) | |||||
XTen_NVG = decide_by_rep(vcf_info_list[21:24],mendelian_list[21:24]) | |||||
XTen_WUX = decide_by_rep(vcf_info_list[24:27],mendelian_list[24:27]) | |||||
pcr_sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX] | |||||
pcr_sequence_dict = Counter(pcr_sequence_site) | |||||
pcr_highest_sequence = pcr_sequence_dict.most_common(1) | |||||
pcr_candidate_sequence = pcr_highest_sequence[0][0] | |||||
pcr_freq_sequence = pcr_highest_sequence[0][1] | |||||
if pcr_freq_sequence > 2: | |||||
pcr_consensus = pcr_candidate_sequence | |||||
else: | |||||
pcr_consensus = 'inconSequenceSite' | |||||
# pcr-free | |||||
T7_WGE = decide_by_rep(vcf_info_list[3:6],mendelian_list[3:6]) | |||||
Nova_ARD_1 = decide_by_rep(vcf_info_list[6:9],mendelian_list[6:9]) | |||||
Nova_ARD_2 = decide_by_rep(vcf_info_list[9:12],mendelian_list[9:12]) | |||||
Nova_BRG = decide_by_rep(vcf_info_list[12:15],mendelian_list[12:15]) | |||||
Nova_WUX = decide_by_rep(vcf_info_list[15:18],mendelian_list[15:18]) | |||||
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX] | |||||
sequence_dict = Counter(sequence_site) | |||||
highest_sequence = sequence_dict.most_common(1) | |||||
candidate_sequence = highest_sequence[0][0] | |||||
freq_sequence = highest_sequence[0][1] | |||||
if freq_sequence > 3: | |||||
pcr_free_consensus = candidate_sequence | |||||
else: | |||||
pcr_free_consensus = 'inconSequenceSite' | |||||
# net alt, dp | |||||
# alt | |||||
AD = [x.split(':')[1] for x in vcf_info_list] | |||||
ALT = [x.split(',')[1] for x in AD] | |||||
ALT = [int(x) for x in ALT] | |||||
ALL_ALT = sum(ALT) | |||||
# dp | |||||
DP = [x.split(':')[2] for x in vcf_info_list] | |||||
DP = [int(x) for x in DP] | |||||
ALL_DP = sum(DP) | |||||
# detected number | |||||
gt = [x.split(':')[0] for x in vcf_info_list] | |||||
gt = [x.replace('0/0','.') for x in gt] | |||||
gt = [x.replace('./.','.') for x in gt] | |||||
detected_num = 27 - gt.count('.') | |||||
# decide consensus calls | |||||
tag = ['inconGT','noMenInfo','inconMen','inconSequenceSite','noGTInfo'] | |||||
if (pcr_consensus != '0/0') and (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag): | |||||
consensus_call = pcr_consensus | |||||
gt = [x.split(':')[0] for x in vcf_info_list] | |||||
indices = [i for i, x in enumerate(gt) if x == consensus_call] | |||||
matched_mendelian = itemgetter(*indices)(mendelian_list) | |||||
mendelian_num = matched_mendelian.count('1:1:1') | |||||
# Delete multiple alternative genotype to necessary expression | |||||
alt_gt = alt_seq.split(',') | |||||
if len(alt_gt) > 1: | |||||
allele1 = consensus_call.split('/')[0] | |||||
allele2 = consensus_call.split('/')[1] | |||||
if allele1 == '0': | |||||
allele2_seq = alt_gt[int(allele2) - 1] | |||||
consensus_alt_seq = allele2_seq | |||||
consensus_call = '0/1' | |||||
else: | |||||
allele1_seq = alt_gt[int(allele1) - 1] | |||||
allele2_seq = alt_gt[int(allele2) - 1] | |||||
if int(allele1) > int(allele2): | |||||
consensus_alt_seq = allele2_seq + ',' + allele1_seq | |||||
consensus_call = '1/2' | |||||
elif int(allele1) < int(allele2): | |||||
consensus_alt_seq = allele1_seq + ',' + allele2_seq | |||||
consensus_call = '1/2' | |||||
else: | |||||
consensus_alt_seq = allele1_seq | |||||
consensus_call = '1/1' | |||||
else: | |||||
consensus_alt_seq = alt_seq | |||||
elif (pcr_consensus in tag) and (pcr_free_consensus in tag): | |||||
consensus_call = 'filtered' | |||||
elif ((pcr_consensus == './.') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != './.')): | |||||
consensus_call = 'pcr-free-speicifc' | |||||
elif ((pcr_consensus != './.') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == './.')): | |||||
consensus_call = 'pcr-speicifc' | |||||
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'): | |||||
consensus_call = '0/0' | |||||
else: | |||||
consensus_call = 'filtered' | |||||
return pcr_consensus, pcr_free_consensus, mendelian_num, consensus_call, consensus_alt_seq, ALL_ALT, ALL_DP, detected_num | |||||
for row in merged_df.itertuples(): | |||||
mendelian_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_1_20191105,row.Quartet_DNA_BGI_T7_WGE_2_20191105,row.Quartet_DNA_BGI_T7_WGE_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_3_20170216] | |||||
lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216] | |||||
lcl6_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216] | |||||
lcl7_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216] | |||||
lcl8_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530, \ | |||||
row.Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108, \ | |||||
row.Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111, \ | |||||
row.Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930, \ | |||||
row.Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917, \ | |||||
row.Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403, \ | |||||
row.Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329, \ | |||||
row.Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216] | |||||
# LCL5 | |||||
LCL5_pcr_consensus, LCL5_pcr_free_consensus, LCL5_mendelian_num, LCL5_consensus_call, LCL5_consensus_alt_seq, LCL5_alt, LCL5_dp, LCL5_detected_num = consensus_call(lcl5_list,mendelian_list,row.ALT) | |||||
if LCL5_mendelian_num != '.': | |||||
LCL5_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL5_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL5_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL5_consensus_call + ':' + str(LCL5_alt) + ':' + str(LCL5_dp) + '\n' | |||||
benchmark_LCL5.write(LCL5_output) | |||||
# LCL6 | |||||
LCL6_pcr_consensus, LCL6_pcr_free_consensus, LCL6_mendelian_num, LCL6_consensus_call, LCL6_consensus_alt_seq, LCL6_alt, LCL6_dp, LCL6_detected_num = consensus_call(lcl6_list,mendelian_list,row.ALT) | |||||
if LCL6_mendelian_num != '.': | |||||
LCL6_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL6_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL6_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL6_consensus_call + ':' + str(LCL6_alt) + ':' + str(LCL6_dp) + '\n' | |||||
benchmark_LCL6.write(LCL6_output) | |||||
# LCL7 | |||||
LCL7_pcr_consensus, LCL7_pcr_free_consensus, LCL7_mendelian_num, LCL7_consensus_call, LCL7_consensus_alt_seq, LCL7_alt, LCL7_dp, LCL7_detected_num = consensus_call(lcl7_list,mendelian_list,row.ALT) | |||||
if LCL7_mendelian_num != '.': | |||||
LCL7_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL7_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL7_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL7_consensus_call + ':' + str(LCL7_alt) + ':' + str(LCL7_dp) + '\n' | |||||
benchmark_LCL7.write(LCL7_output) | |||||
# LCL8 | |||||
LCL8_pcr_consensus, LCL8_pcr_free_consensus, LCL8_mendelian_num, LCL8_consensus_call, LCL8_consensus_alt_seq, LCL8_alt, LCL8_dp, LCL8_detected_num = consensus_call(lcl8_list,mendelian_list,row.ALT) | |||||
if LCL8_mendelian_num != '.': | |||||
LCL8_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL8_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL8_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL8_consensus_call + ':' + str(LCL8_alt) + ':' + str(LCL8_dp) + '\n' | |||||
benchmark_LCL8.write(LCL8_output) | |||||
# all data | |||||
all_output = row.CHROM + '\t' + str(row.POS) + '\t' + LCL5_pcr_consensus + '\t' + LCL5_pcr_free_consensus + '\t' + str(LCL5_mendelian_num) + '\t' + LCL5_consensus_call + '\t' + LCL5_consensus_alt_seq + '\t' + str(LCL5_alt) + '\t' + str(LCL5_dp) + '\t' + str(LCL5_detected_num) + '\t' +\ | |||||
LCL6_pcr_consensus + '\t' + LCL6_pcr_free_consensus + '\t' + str(LCL6_mendelian_num) + '\t' + LCL6_consensus_call + '\t' + LCL6_consensus_alt_seq + '\t' + str(LCL6_alt) + '\t' + str(LCL6_dp) + '\t' + str(LCL6_detected_num) + '\t' +\ | |||||
LCL7_pcr_consensus + '\t' + LCL7_pcr_free_consensus + '\t' + str(LCL7_mendelian_num) + '\t' + LCL7_consensus_call + '\t' + LCL7_consensus_alt_seq + '\t' + str(LCL7_alt) + '\t' + str(LCL7_dp) + '\t' + str(LCL7_detected_num) + '\t' +\ | |||||
LCL8_pcr_consensus + '\t' + LCL8_pcr_free_consensus + '\t' + str(LCL8_mendelian_num) + '\t' + LCL8_consensus_call + '\t' + LCL8_consensus_alt_seq + '\t' + str(LCL8_alt) + '\t' + str(LCL8_dp) + '\t' + str(LCL8_detected_num) + '\n' | |||||
all_sample_outfile.write(all_output) | |||||
{ | |||||
"{{ project_name }}.benchmarking_dir": "oss://pgx-result/renluyao/manuscript_v3.0/reference_datasets_v202103/", | |||||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | |||||
"{{ project_name }}.BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest", | |||||
"{{ project_name }}.LCL8": "{{ LCL8 }}", | |||||
"{{ project_name }}.disk_size": "500", | |||||
"{{ project_name }}.LCL5": "{{ LCL5 }}", | |||||
"{{ project_name }}.project": "{{ project }}", | |||||
"{{ project_name }}.SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc", | |||||
"{{ project_name }}.bed": "oss://pgx-result/renluyao/manuscript_v3.0/reference_datasets_v202105/benchmark_regions.filtered.bed", | |||||
"{{ project_name }}.BIGcluster_config": "OnDemand bcs.ps.g.2xlarge img-ubuntu-vpc", | |||||
"{{ project_name }}.LCL6": "{{ LCL6 }}", | |||||
"{{ project_name }}.sdf": "oss://pgx-reference-data/GRCh38.d1.vd1/GRCh38.d1.vd1.sdf/", | |||||
"{{ project_name }}.contig": "oss://pgx-result/renluyao/NCTR/contig.txt", | |||||
"{{ project_name }}.MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||||
"{{ project_name }}.LCL7": "{{ LCL7 }}", | |||||
"{{ project_name }}.DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4", | |||||
"{{ project_name }}.ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/" | |||||
} |
LCL5_10X_1 LCL5 | |||||
LCL6_10X LCL6 | |||||
LCL7_10X LCL7 | |||||
LCL8_10X LCL8 |
gvcf_list,project,sample_id,gvcf_idx_list | |||||
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-0/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-1/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-10/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-11/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-2/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-3/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-4/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-5/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-6/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-7/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-8/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-9/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518_hc.g.vcf,Quartet_DNA_BGI_SEQ2000_BGI_20180530,1,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-0/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-1/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-10/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-11/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-2/Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-3/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-4/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-5/Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-6/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-7/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-8/Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-9/Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518_hc.g.vcf.idx | |||||
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-72/Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-73/Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-74/Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-75/Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-76/Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-77/Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-78/Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-79/Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-80/Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-81/Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-82/Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-83/Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105_hc.g.vcf,Quartet_DNA_BGI_T7_WGE_20191105,2,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-72/Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-73/Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-74/Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-75/Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-76/Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-77/Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-78/Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-79/Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-80/Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-81/Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-82/Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-83/Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105_hc.g.vcf.idx | |||||
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-48/Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-49/Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-50/Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-54/Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-55/Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-56/Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-60/Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-61/Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-62/Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-66/Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-67/Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-68/Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108_hc.g.vcf,Quartet_DNA_ILM_Nova_ARD_20181108,3,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-48/Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-49/Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-50/Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-54/Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-55/Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-56/Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-60/Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-61/Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-62/Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-66/Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-67/Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-68/Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108_hc.g.vcf.idx | |||||
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-51/Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-52/Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-53/Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-57/Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-58/Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-59/Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-63/Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-64/Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-65/Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-69/Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-70/Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-71/Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111_hc.g.vcf,Quartet_DNA_ILM_Nova_ARD_20190111,4,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-51/Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-52/Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-53/Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-57/Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-58/Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-59/Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-63/Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-64/Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-65/Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-69/Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-70/Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-71/Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111_hc.g.vcf.idx | |||||
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-96/Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-97/Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-98/Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-99/Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-100/Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-101/Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-102/Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-103/Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-104/Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-105/Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-106/Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-107/Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930_hc.g.vcf,Quartet_DNA_ILM_Nova_BRG_20180930,5,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-96/Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-97/Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-98/Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-99/Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-100/Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-101/Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-102/Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-103/Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-104/Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-105/Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-106/Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-107/Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930_hc.g.vcf.idx | |||||
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-84/Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-91/Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-95/Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-85/Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-89/Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-94/Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-86/Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-90/Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-93/Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-87/Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-88/Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-92/Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917_hc.g.vcf,Quartet_DNA_ILM_Nova_WUX_20190917,6,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-84/Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-91/Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-95/Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-85/Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-89/Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-94/Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-86/Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-90/Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-93/Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-87/Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-88/Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-92/Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917_hc.g.vcf.idx | |||||
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-12/Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-13/Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-14/Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-15/Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-16/Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-17/Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-18/Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-19/Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-20/Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-21/Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-22/Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-23/Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403_hc.g.vcf,Quartet_DNA_ILM_XTen_ARD_20170403,7,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-12/Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-13/Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-14/Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-15/Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-16/Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-17/Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-18/Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-19/Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-20/Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-21/Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-22/Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-23/Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403_hc.g.vcf.idx | |||||
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-24/Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-25/Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-26/Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-27/Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-28/Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-29/Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-30/Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-31/Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-32/Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-33/Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-34/Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-35/Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329_hc.g.vcf,Quartet_DNA_ILM_XTen_NVG_20170329,8,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-24/Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-25/Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-26/Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-27/Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-28/Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-29/Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-30/Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-31/Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-32/Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-33/Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-34/Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-35/Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329_hc.g.vcf.idx | |||||
oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-36/Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-37/Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-38/Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-39/Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-40/Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-41/Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-42/Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-43/Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-44/Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-45/Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-46/Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216_hc.g.vcf;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-47/Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216_hc.g.vcf,Quartet_DNA_ILM_XTen_WUX_20170216,9,oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-36/Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-37/Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-38/Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-39/Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-40/Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-41/Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-42/Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-43/Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-44/Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-45/Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-46/Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216_hc.g.vcf.idx;oss://pgx-result/renluyao/manuscript/20200417_gVCF/call-Haplotyper/shard-47/Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216_hc.g.vcf.idx | |||||
task BQSR { | |||||
File ref_dir | |||||
File dbsnp_dir | |||||
File dbmills_dir | |||||
String sample | |||||
String SENTIEON_INSTALL_DIR | |||||
String fasta | |||||
String dbsnp | |||||
String db_mills | |||||
File realigned_bam | |||||
File realigned_bam_index | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
export SENTIEON_LICENSE=192.168.0.55:8990 | |||||
nt=$(nproc) | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${realigned_bam} --algo QualCal -k ${dbsnp_dir}/${dbsnp} -k ${dbmills_dir}/${db_mills} ${sample}_recal_data.table | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${realigned_bam} -q ${sample}_recal_data.table --algo QualCal -k ${dbsnp_dir}/${dbsnp} -k ${dbmills_dir}/${db_mills} ${sample}_recal_data.table.post --algo ReadWriter ${sample}.sorted.deduped.realigned.recaled.bam | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -t $nt --algo QualCal --plot --before ${sample}_recal_data.table --after ${sample}_recal_data.table.post ${sample}_recal_data.csv | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon plot QualCal -o ${sample}_bqsrreport.pdf ${sample}_recal_data.csv | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File recal_table = "${sample}_recal_data.table" | |||||
File recal_post = "${sample}_recal_data.table.post" | |||||
File recaled_bam = "${sample}.sorted.deduped.realigned.recaled.bam" | |||||
File recaled_bam_index = "${sample}.sorted.deduped.realigned.recaled.bam.bai" | |||||
File recal_csv = "${sample}_recal_data.csv" | |||||
File bqsrreport_pdf = "${sample}_bqsrreport.pdf" | |||||
} | |||||
} |
task D5_D6 { | |||||
Array[File] splited_vcf | |||||
String project | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
mkdir -p /cromwell_root/tmp/vcf | |||||
cp ${sep=" " splited_vcf} /cromwell_root/tmp/vcf | |||||
for i in /cromwell_root/tmp/vcf/*vcf | |||||
do | |||||
for j in /cromwell_root/tmp/vcf/*vcf | |||||
do | |||||
sample_i=$(echo $i | cut -f7 -d_) | |||||
sample_j=$(echo $j | cut -f7 -d_) | |||||
rep_i=$(echo $i | cut -f8 -d_) | |||||
rep_j=$(echo $j | cut -f8 -d_) | |||||
if [[ $sample_i == "LCL5" ] && [ $sample_j == "LCL6" ] && [ $rep_i == $rep_j ]];then | |||||
cat $i | grep -v '##' | cut -f1,2,4,5,10 | cut -d ':' -f1 > LCL5.txt | |||||
cat $j | grep -v '##' | cut -f10 | cut -d ':' -f1 > LCL6.txt | |||||
paste LCL5.txt LCL6.txt > sister.txt | |||||
python /opt/D5_D6.py -sister sister.txt -project ${project}.$rep_i | |||||
fi | |||||
done | |||||
done | |||||
for i in *.reproducibility.txt | |||||
do | |||||
cat $i | sed -n '2,2p' >> sister.summary | |||||
done | |||||
sed '1i\Family\tReproducibility_D5_D6' sister.summary > ${project}.sister.txt | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
Array[File] sister_file = glob("*.reproducibility.txt") | |||||
File sister_summary = "${project}.sister.txt" | |||||
} | |||||
} |
task Dedup { | |||||
String SENTIEON_INSTALL_DIR | |||||
String sample | |||||
File sorted_bam | |||||
File sorted_bam_index | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
export SENTIEON_LICENSE=192.168.0.55:8990 | |||||
nt=$(nproc) | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -t $nt -i ${sorted_bam} --algo LocusCollector --fun score_info ${sample}_score.txt | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -t $nt -i ${sorted_bam} --algo Dedup --rmdup --score_info ${sample}_score.txt --metrics ${sample}_dedup_metrics.txt ${sample}.sorted.deduped.bam | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File score = "${sample}_score.txt" | |||||
File dedup_metrics = "${sample}_dedup_metrics.txt" | |||||
File Dedup_bam = "${sample}.sorted.deduped.bam" | |||||
File Dedup_bam_index = "${sample}.sorted.deduped.bam.bai" | |||||
} | |||||
} | |||||
task GVCFtyper { | |||||
File ref_dir | |||||
String SENTIEON_INSTALL_DIR | |||||
String fasta | |||||
Array[File] vcf | |||||
Array[File] vcf_idx | |||||
String project | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
export SENTIEON_LICENSE=192.168.0.55:8990 | |||||
nt=$(nproc) | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} --algo GVCFtyper ${project}.joint.g.vcf ${sep=" " vcf} | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File merged_gvcf = "${project}.joint.g.vcf" | |||||
File merged_gvcf_idx = "${project}.joint.g.vcf.idx" | |||||
} | |||||
} | |||||
task Haplotyper_gVCF { | |||||
File ref_dir | |||||
String SENTIEON_INSTALL_DIR | |||||
String fasta | |||||
File recaled_bam | |||||
File recaled_bam_index | |||||
String sample | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
export SENTIEON_LICENSE=192.168.0.55:8990 | |||||
nt=$(nproc) | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${recaled_bam} --algo Haplotyper --emit_mode gvcf ${sample}_hc.g.vcf | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File vcf = "${sample}_hc.g.vcf" | |||||
File vcf_idx = "${sample}_hc.g.vcf.idx" | |||||
} | |||||
} | |||||
task Metrics { | |||||
File ref_dir | |||||
String SENTIEON_INSTALL_DIR | |||||
String sample | |||||
String docker | |||||
String cluster_config | |||||
String fasta | |||||
File sorted_bam | |||||
File sorted_bam_index | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
export SENTIEON_LICENSE=192.168.0.55:8990 | |||||
nt=$(nproc) | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${sorted_bam} --algo MeanQualityByCycle ${sample}_mq_metrics.txt --algo QualDistribution ${sample}_qd_metrics.txt --algo GCBias --summary ${sample}_gc_summary.txt ${sample}_gc_metrics.txt --algo AlignmentStat ${sample}_aln_metrics.txt --algo InsertSizeMetricAlgo ${sample}_is_metrics.txt --algo CoverageMetrics --omit_base_output ${sample}_coverage_metrics | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon plot metrics -o ${sample}_metrics_report.pdf gc=${sample}_gc_metrics.txt qd=${sample}_qd_metrics.txt mq=${sample}_mq_metrics.txt isize=${sample}_is_metrics.txt | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File qd_metrics = "${sample}_qd_metrics.txt" | |||||
File qd_metrics_pdf = "${sample}_qd_metrics.pdf" | |||||
File mq_metrics = "${sample}_mq_metrics.txt" | |||||
File mq_metrics_pdf = "${sample}_mq_metrics.pdf" | |||||
File is_metrics = "${sample}_is_metrics.txt" | |||||
File is_metrics_pdf = "${sample}_is_metrics.pdf" | |||||
File gc_summary = "${sample}_gc_summary.txt" | |||||
File gc_metrics = "${sample}_gc_metrics.txt" | |||||
File gc_metrics_pdf = "${sample}_gc_metrics.pdf" | |||||
File aln_metrics = "${sample}_aln_metrics.txt" | |||||
File coverage_metrics_sample_summary = "${sample}_coverage_metrics.sample_summary" | |||||
File coverage_metrics_sample_statistics = "${sample}_coverage_metrics.sample_statistics" | |||||
File coverage_metrics_sample_interval_statistics = "${sample}_coverage_metrics.sample_interval_statistics" | |||||
File coverage_metrics_sample_cumulative_coverage_proportions = "${sample}_coverage_metrics.sample_cumulative_coverage_proportions" | |||||
File coverage_metrics_sample_cumulative_coverage_counts = "${sample}_coverage_metrics.sample_cumulative_coverage_counts" | |||||
} | |||||
} | |||||
task Realigner { | |||||
File ref_dir | |||||
File dbmills_dir | |||||
String SENTIEON_INSTALL_DIR | |||||
String sample | |||||
String fasta | |||||
File Dedup_bam | |||||
File Dedup_bam_index | |||||
String db_mills | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
export SENTIEON_LICENSE=192.168.0.55:8990 | |||||
nt=$(nproc) | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${Dedup_bam} --algo Realigner -k ${dbmills_dir}/${db_mills} ${sample}.sorted.deduped.realigned.bam | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File realigner_bam = "${sample}.sorted.deduped.realigned.bam" | |||||
File realigner_bam_index = "${sample}.sorted.deduped.realigned.bam.bai" | |||||
} | |||||
} | |||||
task benchmark { | |||||
File vcf | |||||
File benchmarking_dir | |||||
File contig | |||||
File sdf | |||||
String project | |||||
String sample = basename(vcf,".raw.vcf.gz") | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
nt=$(nproc) | |||||
mkdir -p /cromwell_root/tmp | |||||
cp -r ${benchmarking_dir} /cromwell_root/tmp/ | |||||
if [[ ${sample} =~ "gz" ]];then | |||||
gunzip ${vcf} -c > unzip.vcf | |||||
else | |||||
cp ${vcf} unzip.vcf | |||||
fi | |||||
cat unzip.vcf | grep '#CHROM' > name | |||||
cat unzip.vcf | grep -v '#' > body | |||||
cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' > body.filtered | |||||
if grep -q PASS "body.filtered"; then | |||||
cat body.filtered | grep -v '0/0' | grep 'PASS' > body.filtered.gt | |||||
cat unzip.vcf | grep '##' | grep -v 'contig' | cat - ${contig} name body.filtered.gt > filtered.vcf | |||||
else | |||||
cat body.filtered | grep -v '0/0' > body.filtered.gt | |||||
cat unzip.vcf | grep '##' | grep -v 'contig' | cat - ${contig} name body.filtered.gt > filtered.vcf | |||||
fi | |||||
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5" > LCL5_name | |||||
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL6" > LCL6_name | |||||
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL7" > LCL7_name | |||||
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL8" > LCL8_name | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip filtered.vcf -c > ${project}.${sample}.rtg.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${project}.${sample}.rtg.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${project}.${sample}.rtg.vcf.gz -o ${sample}.rtg.SNV.vcf.gz --snps-only | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${project}.${sample}.rtg.vcf.gz -o ${sample}.rtg.INDEL.vcf.gz --non-snps-only | |||||
if [[ ${sample} =~ "LCL5" ]];then | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL5.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL5.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed | |||||
cat filtered.vcf | grep '##' > header | |||||
cat filtered.vcf | grep -v '#' > body | |||||
cat header LCL5_name body > LCL5.vcf | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL5.vcf -c > ${project}.${sample}.reformed.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${project}.${sample}.reformed.vcf.gz | |||||
elif [[ ${sample} =~ "LCL6" ]]; then | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL6.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL6.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed | |||||
cat filtered.vcf | grep '##' > header | |||||
cat filtered.vcf | grep -v '#' > body | |||||
cat header LCL6_name body > LCL6.vcf | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL6.vcf -c > ${project}.${sample}.reformed.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${project}.${sample}.reformed.vcf.gz | |||||
elif [[ ${sample} =~ "LCL7" ]]; then | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL7.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL7.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed | |||||
cat filtered.vcf | grep '##' > header | |||||
cat filtered.vcf | grep -v '#' > body | |||||
cat header LCL7_name body > LCL7.vcf | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL7.vcf -c > ${project}.${sample}.reformed.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${project}.${sample}.reformed.vcf.gz | |||||
elif [[ ${sample} =~ "LCL8" ]]; then | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL8.high.confidence.calls.SNV.vcf.gz -c ${sample}.rtg.SNV.vcf.gz -o ${sample}_SNV -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfeval -b /cromwell_root/tmp/reference_datasets_v202103/LCL8.high.confidence.calls.INDEL.vcf.gz -c ${sample}.rtg.INDEL.vcf.gz -o ${sample}_INDEL -t ${sdf} --evaluation-regions=/cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed | |||||
cat filtered.vcf | grep '##' > header | |||||
cat filtered.vcf | grep -v '#' > body | |||||
cat header LCL8_name body > LCL8.vcf | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL8.vcf -c > ${project}.${sample}.reformed.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${project}.${sample}.reformed.vcf.gz | |||||
else | |||||
echo "only for quartet samples" | |||||
fi | |||||
cat ${sample}_SNV/summary.txt > ${project}.${sample}_SNV_precision_recall.txt | |||||
cat ${sample}_INDEL/summary.txt > ${project}.${sample}_INDEL_precision_recall.txt | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster:cluster_config | |||||
systemDisk:"cloud_ssd 40" | |||||
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File rtg_vcf = "${project}.${sample}.reformed.vcf.gz" | |||||
File rtg_vcf_index = "${project}.${sample}.reformed.vcf.gz.tbi" | |||||
File SNV_result = "${project}.${sample}_SNV_precision_recall.txt" | |||||
File Indel_result = "${project}.${sample}_INDEL_precision_recall.txt" | |||||
} | |||||
} |
task corealigner { | |||||
File ref_dir | |||||
File dbsnp_dir | |||||
File dbmills_dir | |||||
String sample | |||||
String SENTIEON_INSTALL_DIR | |||||
String docker | |||||
String cluster_config | |||||
String fasta | |||||
String dbsnp | |||||
String db_mills | |||||
File tumor_recaled_bam | |||||
File tumor_recaled_bam_index | |||||
File normal_recaled_bam | |||||
File normal_recaled_bam_index | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
export SENTIEON_LICENSE=192.168.0.55:8990 | |||||
nt=$(nproc) | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${tumor_recaled_bam} -i ${normal_recaled_bam} --algo Realigner -k ${db_mills} -k ${dbsnp} ${sample}_corealigned.bam | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File corealigner_bam = "${sample}_corealigned.bam" | |||||
File corealigner_bam_index = "${sample}_corealigned.bam.bai" | |||||
} | |||||
} | |||||
task deduped_Metrics { | |||||
File ref_dir | |||||
String SENTIEON_INSTALL_DIR | |||||
String sample | |||||
String fasta | |||||
File Dedup_bam | |||||
File Dedup_bam_index | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
export SENTIEON_LICENSE=192.168.0.55:8990 | |||||
nt=$(nproc) | |||||
${SENTIEON_INSTALL_DIR}/bin/sentieon driver -r ${ref_dir}/${fasta} -t $nt -i ${Dedup_bam} --algo CoverageMetrics --omit_base_output ${sample}_deduped_coverage_metrics --algo MeanQualityByCycle ${sample}_deduped_mq_metrics.txt --algo QualDistribution ${sample}_deduped_qd_metrics.txt --algo GCBias --summary ${sample}_deduped_gc_summary.txt ${sample}_deduped_gc_metrics.txt --algo AlignmentStat ${sample}_deduped_aln_metrics.txt --algo InsertSizeMetricAlgo ${sample}_deduped_is_metrics.txt | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File deduped_coverage_metrics_sample_summary = "${sample}_deduped_coverage_metrics.sample_summary" | |||||
File deduped_coverage_metrics_sample_statistics = "${sample}_deduped_coverage_metrics.sample_statistics" | |||||
File deduped_coverage_metrics_sample_interval_statistics = "${sample}_deduped_coverage_metrics.sample_interval_statistics" | |||||
File deduped_coverage_metrics_sample_cumulative_coverage_proportions = "${sample}_deduped_coverage_metrics.sample_cumulative_coverage_proportions" | |||||
File deduped_coverage_metrics_sample_cumulative_coverage_counts = "${sample}_deduped_coverage_metrics.sample_cumulative_coverage_counts" | |||||
} | |||||
} |
task extract_multiqc { | |||||
File fastqc_qualimap | |||||
File fastqc | |||||
File fastqscreen | |||||
File hap | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
python /opt/extract_multiqc.py -fastqc_qualimap ${fastqc_qualimap} -fastqc ${fastqc} -fastqscreen ${fastqscreen} -hap ${hap} | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster:cluster_config | |||||
systemDisk:"cloud_ssd 40" | |||||
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File fastqc_result = "fastqc.final.result.txt" | |||||
File fastqscreen_result = "fastqscreen.final.result.txt" | |||||
File qualimap_result = "qualimap.final.result.txt" | |||||
File hap_result = "benchmark.final.result.txt" | |||||
} | |||||
} |
task fastqc { | |||||
File read1 | |||||
File read2 | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
nt=$(nproc) | |||||
fastqc -t $nt -o ./ ${read1} | |||||
fastqc -t $nt -o ./ ${read2} | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File read1_html = sub(basename(read1), "\\.(fastq|fq)\\.gz$", "_fastqc.html") | |||||
File read1_zip = sub(basename(read1), "\\.(fastq|fq)\\.gz$", "_fastqc.zip") | |||||
File read2_html = sub(basename(read2), "\\.(fastq|fq)\\.gz$", "_fastqc.html") | |||||
File read2_zip = sub(basename(read2), "\\.(fastq|fq)\\.gz$", "_fastqc.zip") | |||||
} | |||||
} |
task fastq_screen { | |||||
File read1 | |||||
File read2 | |||||
File screen_ref_dir | |||||
File fastq_screen_conf | |||||
String read1name = basename(read1,".fastq.gz") | |||||
String read2name = basename(read2,".fastq.gz") | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
nt=$(nproc) | |||||
mkdir -p /cromwell_root/tmp | |||||
cp -r ${screen_ref_dir} /cromwell_root/tmp/ | |||||
fastq_screen --aligner bowtie2 --conf ${fastq_screen_conf} --top 100000 --threads $nt ${read1} | |||||
fastq_screen --aligner bowtie2 --conf ${fastq_screen_conf} --top 100000 --threads $nt ${read2} | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File png1 = "${read1name}_screen.png" | |||||
File txt1 = "${read1name}_screen.txt" | |||||
File html1 = "${read1name}_screen.html" | |||||
File png2 = "${read2name}_screen.png" | |||||
File txt2 = "${read2name}_screen.txt" | |||||
File html2 = "${read2name}_screen.html" | |||||
} | |||||
} |
task mapping { | |||||
File ref_dir | |||||
String fasta | |||||
File fastq_1 | |||||
File fastq_2 | |||||
String SENTIEON_INSTALL_DIR | |||||
String group | |||||
String sample | |||||
String pl | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
export SENTIEON_LICENSE=192.168.0.55:8990 | |||||
nt=$(nproc) | |||||
${SENTIEON_INSTALL_DIR}/bin/bwa mem -M -R "@RG\tID:${group}\tSM:${sample}\tPL:${pl}" -t $nt -K 10000000 ${ref_dir}/${fasta} ${fastq_1} ${fastq_2} | ${SENTIEON_INSTALL_DIR}/bin/sentieon util sort -o ${sample}.sorted.bam -t $nt --sam2bam -i - | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File sorted_bam = "${sample}.sorted.bam" | |||||
File sorted_bam_index = "${sample}.sorted.bam.bai" | |||||
} | |||||
} |
task mendelian { | |||||
File family_vcf | |||||
File family_vcf_inside | |||||
File family_vcf_outside | |||||
File ref_dir | |||||
String family_name = basename(family_vcf,".family.vcf") | |||||
String fasta | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
export LD_LIBRARY_PATH=/opt/htslib-1.9 | |||||
nt=$(nproc) | |||||
echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${family_name}.D5.ped | |||||
mkdir VBT_D5 | |||||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D5.ped -outDir VBT_D5 -out-prefix ${family_name}.D5 --output-violation-regions -thread-count $nt | |||||
cat VBT_D5/${family_name}.D5_trio.vcf > ${family_name}.D5.vcf | |||||
mkdir VBT_D5_inside | |||||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf_inside} -father ${family_vcf_inside} -child ${family_vcf_inside} -pedigree ${family_name}.D5.ped -outDir VBT_D5_inside -out-prefix ${family_name}.D5.inside --output-violation-regions -thread-count $nt | |||||
cat VBT_D5_inside/${family_name}.D5.inside_trio.vcf > ${family_name}.D5.inside.vcf | |||||
mkdir VBT_D5_outside | |||||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf_outside} -father ${family_vcf_outside} -child ${family_vcf_outside} -pedigree ${family_name}.D5.ped -outDir VBT_D5_outside -out-prefix ${family_name}.D5.outside --output-violation-regions -thread-count $nt | |||||
cat VBT_D5_inside/${family_name}.D5.outside_trio.vcf > ${family_name}.D5.outside.vcf | |||||
echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${family_name}.D6.ped | |||||
mkdir VBT_D6 | |||||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D6.ped -outDir VBT_D6 -out-prefix ${family_name}.D6 --output-violation-regions -thread-count $nt | |||||
cat VBT_D6/${family_name}.D6_trio.vcf > ${family_name}.D6.vcf | |||||
mkdir VBT_D6_inside | |||||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf_inside} -father ${family_vcf_inside} -child ${family_vcf_inside} -pedigree ${family_name}.D6.ped -outDir VBT_D6_inside -out-prefix ${family_name}.D6.inside --output-violation-regions -thread-count $nt | |||||
cat VBT_D6_inside/${family_name}.D6.inside_trio.vcf > ${family_name}.D6.inside.vcf | |||||
mkdir VBT_D6_outside | |||||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf_outside} -father ${family_vcf_outside} -child ${family_vcf_outside} -pedigree ${family_name}.D6.ped -outDir VBT_D6_outside -out-prefix ${family_name}.D6.outside --output-violation-regions -thread-count $nt | |||||
cat VBT_D6_inside/${family_name}.D6.outside_trio.vcf > ${family_name}.D6.outside.vcf | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File D5_ped = "${family_name}.D5.ped" | |||||
File D6_ped = "${family_name}.D6.ped" | |||||
Array[File] D5_mendelian = glob("VBT_D5/*") | |||||
Array[File] D6_mendelian = glob("VBT_D6/*") | |||||
File D5_trio_vcf = "${family_name}.D5.vcf" | |||||
File D6_trio_vcf = "${family_name}.D6.vcf" | |||||
Array[File] D5_mendelian_inside = glob("VBT_D5_inside/*") | |||||
Array[File] D6_mendelian_inside = glob("VBT_D6_inside/*") | |||||
File D5_trio_vcf_inside = "${family_name}.D5.inside.vcf" | |||||
File D6_trio_vcf_inside = "${family_name}.D6.inside.vcf" | |||||
Array[File] D5_mendelian_outside = glob("VBT_D5_outside/*") | |||||
Array[File] D6_mendelian_outside = glob("VBT_D6_outside/*") | |||||
File D5_trio_vcf_outside = "${family_name}.D5.outside.vcf" | |||||
File D6_trio_vcf_outside = "${family_name}.D6.outside.vcf" | |||||
} | |||||
} | |||||
task mergeNum { | |||||
Array[File] vcfnumber | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
for i in ${sep=" " vcfnumber} | |||||
do | |||||
cat $i | cut -d':' -f2 | tr '\n' '\t' | sed s'/\t$/\n/g' >> vcfstats | |||||
done | |||||
sed '1i\File\tFailed Filters\tPassed Filters\tSNPs\tMNPs\tInsertions\tDeletions\tIndels\tSame as reference\tSNP Transitions/Transversions\tTotal Het/Hom ratio\tSNP Het/Hom ratio\tMNP Het/Hom ratio\tInsertion Het/Hom ratio\tDeletion Het/Hom ratio\tIndel Het/Hom ratio\tInsertion/Deletion ratio\tIndel/SNP+MNP ratio' vcfstats > vcfstats.txt | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster:cluster_config | |||||
systemDisk:"cloud_ssd 40" | |||||
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File vcfstat="vcfstats.txt" | |||||
} | |||||
} |
task mergeSentieon { | |||||
Array[File] aln_metrics_header | |||||
Array[File] aln_metrics_data | |||||
Array[File] dedup_metrics_header | |||||
Array[File] dedup_metrics_data | |||||
Array[File] is_metrics_header | |||||
Array[File] is_metrics_data | |||||
Array[File] deduped_coverage_header | |||||
Array[File] deduped_coverage_data | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
cat ${sep=" " aln_metrics_header} | sed -n '1,1p' | cat - ${sep=" " aln_metrics_data} > aln_metrics.txt | |||||
cat ${sep=" " dedup_metrics_header} | sed -n '1,1p' | cat - ${sep=" " dedup_metrics_data} > dedup_metrics.txt | |||||
cat ${sep=" " is_metrics_header} | sed -n '1,1p' | cat - ${sep=" " is_metrics_data} > is_metrics.txt | |||||
cat ${sep=" " deduped_coverage_header} | sed -n '1,1p' | cat - ${sep=" " deduped_coverage_data} > deduped_coverage.txt | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster:cluster_config | |||||
systemDisk:"cloud_ssd 40" | |||||
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File aln_metrics_merge = "aln_metrics.txt" | |||||
File dedup_metrics_merge = "dedup_metrics.txt" | |||||
File is_metrics_merge = "is_metrics.txt" | |||||
File deduped_coverage_merge = "deduped_coverage.txt" | |||||
} | |||||
} |
task merge_family { | |||||
File LCL5_vcf_gz | |||||
File LCL5_vcf_idx | |||||
File LCL6_vcf_gz | |||||
File LCL6_vcf_idx | |||||
File LCL7_vcf_gz | |||||
File LCL7_vcf_idx | |||||
File LCL8_vcf_gz | |||||
File LCL8_vcf_idx | |||||
File bed | |||||
String project | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfmerge --force-merge-all -o ${project}.family.vcf.gz ${LCL5_vcf_gz} ${LCL6_vcf_gz} ${LCL7_vcf_gz} ${LCL8_vcf_gz} | |||||
gunzip ${project}.family.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL5_vcf_gz} --include-bed=${bed} -o LCL5.inside.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL6_vcf_gz} --include-bed=${bed} -o LCL6.inside.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL7_vcf_gz} --include-bed=${bed} -o LCL7.inside.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL8_vcf_gz} --include-bed=${bed} -o LCL8.inside.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfmerge --force-merge-all -o ${project}.family.inside.vcf.gz LCL5.inside.vcf.gz LCL6.inside.vcf.gz LCL7.inside.vcf.gz LCL8.inside.vcf.gz | |||||
gunzip ${project}.family.inside.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL5_vcf_gz} --exclude-bed=${bed} -o LCL5.outside.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL6_vcf_gz} --exclude-bed=${bed} -o LCL6.outside.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL7_vcf_gz} --exclude-bed=${bed} -o LCL7.outside.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcffilter -i ${LCL8_vcf_gz} --exclude-bed=${bed} -o LCL8.outside.vcf.gz | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfmerge --force-merge-all -o ${project}.family.outside.vcf.gz LCL5.outside.vcf.gz LCL6.outside.vcf.gz LCL7.outside.vcf.gz LCL8.outside.vcf.gz | |||||
gunzip ${project}.family.outside.vcf.gz | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File merged_vcf = "${project}.family.vcf" | |||||
File outside_vcf = "${project}.family.outside.vcf" | |||||
File inside_vcf = "${project}.family.inside.vcf" | |||||
} | |||||
} | |||||
task merge_mendelian { | |||||
File D5_trio_vcf | |||||
File D6_trio_vcf | |||||
File family_vcf | |||||
File D5_trio_vcf_inside | |||||
File D6_trio_vcf_inside | |||||
File family_vcf_inside | |||||
File D5_trio_vcf_outside | |||||
File D6_trio_vcf_outside | |||||
File family_vcf_outside | |||||
String family_name = basename(family_vcf,".family.vcf") | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
cat ${D5_trio_vcf} | grep -v '##' > ${family_name}.D5.txt | |||||
cat ${D6_trio_vcf} | grep -v '##' > ${family_name}.D6.txt | |||||
cat ${family_vcf} | grep -v '##' | awk ' | |||||
BEGIN { OFS = "\t" } | |||||
NF > 2 && FNR > 1 { | |||||
for ( i=9; i<=NF; i++ ) { | |||||
split($i,a,":") ;$i = a[1]; | |||||
} | |||||
} | |||||
{ print } | |||||
' > ${family_name}.consensus.txt | |||||
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.txt -LCL6 ${family_name}.D6.txt -genotype ${family_name}.consensus.txt -family ${family_name} | |||||
cat ${D5_trio_vcf_inside} | grep -v '##' > ${family_name}.D5.inside.txt | |||||
cat ${D6_trio_vcf_inside} | grep -v '##' > ${family_name}.D6.inside.txt | |||||
cat ${family_vcf_inside} | grep -v '##' | awk ' | |||||
BEGIN { OFS = "\t" } | |||||
NF > 2 && FNR > 1 { | |||||
for ( i=9; i<=NF; i++ ) { | |||||
split($i,a,":") ;$i = a[1]; | |||||
} | |||||
} | |||||
{ print } | |||||
' > ${family_name}.consensus.inside.txt | |||||
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.inside.txt -LCL6 ${family_name}.D6.inside.txt -genotype ${family_name}.consensus.inside.txt -family ${family_name}.inside | |||||
cat ${D5_trio_vcf_outside} | grep -v '##' > ${family_name}.D5.outside.txt | |||||
cat ${D6_trio_vcf_outside} | grep -v '##' > ${family_name}.D6.outside.txt | |||||
cat ${family_vcf_outside} | grep -v '##' | awk ' | |||||
BEGIN { OFS = "\t" } | |||||
NF > 2 && FNR > 1 { | |||||
for ( i=9; i<=NF; i++ ) { | |||||
split($i,a,":") ;$i = a[1]; | |||||
} | |||||
} | |||||
{ print } | |||||
' > ${family_name}.consensus.outside.txt | |||||
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.outside.txt -LCL6 ${family_name}.D6.outside.txt -genotype ${family_name}.consensus.outside.txt -family ${family_name}.outside | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File project_mendelian = "${family_name}.txt" | |||||
File project_mendelian_summary = "${family_name}.summary.txt" | |||||
File project_mendelian_inside = "${family_name}.inside.txt" | |||||
File project_mendelian_summary_inside = "${family_name}.inside.summary.txt" | |||||
File project_mendelian_outside = "${family_name}.outside.txt" | |||||
File project_mendelian_summary_outside = "${family_name}.outside.summary.txt" | |||||
} | |||||
} |
task multiqc { | |||||
Array[File] read1_zip | |||||
Array[File] read2_zip | |||||
Array[File] txt1 | |||||
Array[File] txt2 | |||||
Array[File] zip | |||||
Array[File] summary | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
mkdir -p /cromwell_root/tmp/fastqc | |||||
mkdir -p /cromwell_root/tmp/fastqscreen | |||||
mkdir -p /cromwell_root/tmp/bamqc | |||||
mkdir -p /cromwell_root/tmp/benchmark | |||||
cp ${sep=" " summary} /cromwell_root/tmp/benchmark | |||||
cp ${sep=" " read1_zip} ${sep=" " read2_zip} /cromwell_root/tmp/fastqc | |||||
cp ${sep=" " txt1} ${sep=" " txt2} /cromwell_root/tmp/fastqscreen | |||||
for i in ${sep=" " zip} | |||||
do | |||||
tar -zxvf $i -C /cromwell_root/tmp/bamqc | |||||
done | |||||
multiqc /cromwell_root/tmp/ | |||||
cat multiqc_data/multiqc_general_stats.txt > multiqc_general_stats.txt | |||||
cat multiqc_data/multiqc_fastqc.txt > multiqc_fastqc.txt | |||||
cat multiqc_data/multiqc_fastq_screen.txt > multiqc_fastq_screen.txt | |||||
cat multiqc_data/multiqc_happy_data.json > multiqc_happy_data.json | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster:cluster_config | |||||
systemDisk:"cloud_ssd 40" | |||||
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File multiqc_html = "multiqc_report.html" | |||||
Array[File] multiqc_txt = glob("multiqc_data/*") | |||||
File fastqc_qualimap = "multiqc_general_stats.txt" | |||||
File fastqc = "multiqc_fastqc.txt" | |||||
File fastqscreen = "multiqc_fastq_screen.txt" | |||||
File hap = "multiqc_happy_data.json" | |||||
} | |||||
} |
task qualimap { | |||||
File bam | |||||
File bai | |||||
String bamname = basename(bam,".bam") | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
nt=$(nproc) | |||||
/opt/qualimap/qualimap bamqc -bam ${bam} -outformat PDF:HTML -nt $nt -outdir ${bamname} --java-mem-size=60G | |||||
tar -zcvf ${bamname}_qualimap.zip ${bamname} | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster:cluster_config | |||||
systemDisk:"cloud_ssd 40" | |||||
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File zip = "${bamname}_qualimap.zip" | |||||
} | |||||
} |
task quartet_mendelian { | |||||
Array[File] project_mendelian_summary | |||||
String project | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
for i in ${sep=" " project_mendelian_summary} | |||||
do | |||||
cat $i | sed -n '2,3p' >> mendelian.summary | |||||
done | |||||
sed '1i\Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet' mendelian.summary > ${project}.mendelian.txt | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster:cluster_config | |||||
systemDisk:"cloud_ssd 40" | |||||
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File mendelian_summary="${project}.mendelian.txt" | |||||
} | |||||
} |
task sentieon { | |||||
File aln_metrics | |||||
File dedup_metrics | |||||
File is_metrics | |||||
File deduped_coverage | |||||
String sample_name | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
cat ${aln_metrics} | sed -n '2,2p' > aln_metrics.header | |||||
cat ${aln_metrics} | sed -n '5,5p' > ${sample_name}.aln_metrics | |||||
cat ${dedup_metrics} | sed -n '2,2p' > dedup_metrics.header | |||||
cat ${dedup_metrics} | sed -n '3,3p' > ${sample_name}.dedup_metrics | |||||
cat ${is_metrics} | sed -n '2,2p' > is_metrics.header | |||||
cat ${is_metrics} | sed -n '3,3p' > ${sample_name}.is_metrics | |||||
cat ${deduped_coverage} | sed -n '1,1p' > deduped_coverage.header | |||||
cat ${deduped_coverage} | sed -n '2,2p' > ${sample_name}.deduped_coverage | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster:cluster_config | |||||
systemDisk:"cloud_ssd 40" | |||||
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File aln_metrics_header = "aln_metrics.header" | |||||
File aln_metrics_data = "${sample_name}.aln_metrics" | |||||
File dedup_metrics_header = "dedup_metrics.header" | |||||
File dedup_metrics_data = "${sample_name}.dedup_metrics" | |||||
File is_metrics_header = "is_metrics.header" | |||||
File is_metrics_data = "${sample_name}.is_metrics" | |||||
File deduped_coverage_header = "deduped_coverage.header" | |||||
File deduped_coverage_data = "${sample_name}.deduped_coverage" | |||||
} | |||||
} |
task split_gvcf_files { | |||||
File merged_gvcf | |||||
String project | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
cat ${merged_gvcf} | grep '#CHROM' | sed s'/\t/\n/g' > name | |||||
ncol=`cat name | wc -l` | |||||
sed -i '1,9d' name | |||||
cat ${merged_gvcf} | grep '#' > header | |||||
cat ${merged_gvcf} | grep -v '#' > body | |||||
cat body | grep -w '^chr1\|^chr2\|^chr3\|^chr4\|^chr5\|^chr6\|^chr7\|^chr8\|^chr9\|^chr10\|^chr11\|^chr12\|^chr13\|^chr14\|^chr15\|^chr16\|^chr17\|^chr18\|^chr19\|^chr20\|^chr21\|^chr22\|^chrX' > body.filtered | |||||
cat header body.filtered > ${project}.filtered.g.vcf | |||||
for i in $(seq 10 $ncol); do cat ${project}.filtered.g.vcf | cut -f1-9,$i > $i.splited.vcf; done | |||||
ls *splited.vcf | sort -n | paste - name > rename | |||||
cat rename | while read a b | |||||
do | |||||
mv $a $b.splited.vcf | |||||
sample=$(echo $b | cut -f6 -d_) | |||||
rep=$(echo $b | cut -f7 -d_) | |||||
echo $sample >> quartet_sample | |||||
echo $rep >> quartet_rep | |||||
done | |||||
python /opt/how_many_samples.py -sample quartet_sample -rep quartet_rep | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster: cluster_config | |||||
systemDisk: "cloud_ssd 40" | |||||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
Array[File] splited_vcf = glob("*.splited.vcf") | |||||
File sister_tag = "sister_tag" | |||||
File quartet_tag = "quartet_tag" | |||||
} | |||||
} |
task vcfstat { | |||||
File rtg_vcf | |||||
File rtg_vcf_index | |||||
String sample_name | |||||
String docker | |||||
String cluster_config | |||||
String disk_size | |||||
command <<< | |||||
set -o pipefail | |||||
set -e | |||||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfstats ${rtg_vcf} > ${sample_name}.stats.txt | |||||
>>> | |||||
runtime { | |||||
docker:docker | |||||
cluster:cluster_config | |||||
systemDisk:"cloud_ssd 40" | |||||
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/" | |||||
} | |||||
output { | |||||
File vcfnumber="${sample_name}.stats.txt" | |||||
} | |||||
} |
import "./tasks/benchmark.wdl" as benchmark | |||||
import "./tasks/mendelian.wdl" as mendelian | |||||
import "./tasks/merge_mendelian.wdl" as merge_mendelian | |||||
import "./tasks/merge_family.wdl" as merge_family | |||||
workflow {{ project_name }} { | |||||
File LCL5 | |||||
File LCL6 | |||||
File LCL7 | |||||
File LCL8 | |||||
String BENCHMARKdocker | |||||
String MENDELIANdocker | |||||
String DIYdocker | |||||
String fasta | |||||
File ref_dir | |||||
File sdf | |||||
File contig | |||||
File bed | |||||
File benchmarking_dir | |||||
String project | |||||
String disk_size | |||||
String BIGcluster_config | |||||
String SMALLcluster_config | |||||
call benchmark.benchmark as LCL5benchmark { | |||||
input: | |||||
vcf=LCL5, | |||||
benchmarking_dir=benchmarking_dir, | |||||
sdf=sdf, | |||||
contig=contig, | |||||
project=project, | |||||
docker=BENCHMARKdocker, | |||||
cluster_config=BIGcluster_config, | |||||
disk_size=disk_size, | |||||
} | |||||
call benchmark.benchmark as LCL6benchmark { | |||||
input: | |||||
vcf=LCL6, | |||||
benchmarking_dir=benchmarking_dir, | |||||
sdf=sdf, | |||||
contig=contig, | |||||
project=project, | |||||
docker=BENCHMARKdocker, | |||||
cluster_config=BIGcluster_config, | |||||
disk_size=disk_size, | |||||
} | |||||
call benchmark.benchmark as LCL7benchmark { | |||||
input: | |||||
vcf=LCL7, | |||||
benchmarking_dir=benchmarking_dir, | |||||
sdf=sdf, | |||||
contig=contig, | |||||
project=project, | |||||
docker=BENCHMARKdocker, | |||||
cluster_config=BIGcluster_config, | |||||
disk_size=disk_size, | |||||
} | |||||
call benchmark.benchmark as LCL8benchmark { | |||||
input: | |||||
vcf=LCL8, | |||||
benchmarking_dir=benchmarking_dir, | |||||
sdf=sdf, | |||||
contig=contig, | |||||
project=project, | |||||
docker=BENCHMARKdocker, | |||||
cluster_config=BIGcluster_config, | |||||
disk_size=disk_size, | |||||
} | |||||
call merge_family.merge_family as merge_family { | |||||
input: | |||||
LCL5_vcf_gz=LCL5benchmark.rtg_vcf, | |||||
LCL5_vcf_idx=LCL5benchmark.rtg_vcf_index, | |||||
LCL6_vcf_gz=LCL6benchmark.rtg_vcf, | |||||
LCL6_vcf_idx=LCL6benchmark.rtg_vcf_index, | |||||
LCL7_vcf_gz=LCL7benchmark.rtg_vcf, | |||||
LCL7_vcf_idx=LCL7benchmark.rtg_vcf_index, | |||||
LCL8_vcf_gz=LCL8benchmark.rtg_vcf, | |||||
LCL8_vcf_idx=LCL8benchmark.rtg_vcf, | |||||
bed=bed, | |||||
project=project, | |||||
docker=BENCHMARKdocker, | |||||
cluster_config=BIGcluster_config, | |||||
disk_size=disk_size, | |||||
} | |||||
call mendelian.mendelian as mendelian { | |||||
input: | |||||
family_vcf=merge_family.merged_vcf, | |||||
family_vcf_inside=merge_family.inside_vcf, | |||||
family_vcf_outside=merge_family.outside_vcf, | |||||
ref_dir=ref_dir, | |||||
fasta=fasta, | |||||
docker=MENDELIANdocker, | |||||
cluster_config=BIGcluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
call merge_mendelian.merge_mendelian as merge_mendelian { | |||||
input: | |||||
D5_trio_vcf=mendelian.D5_trio_vcf, | |||||
D6_trio_vcf=mendelian.D6_trio_vcf, | |||||
family_vcf=merge_family.merged_vcf, | |||||
D5_trio_vcf_inside=mendelian.D5_trio_vcf_inside, | |||||
D6_trio_vcf_inside=mendelian.D6_trio_vcf_inside, | |||||
family_vcf_inside=merge_family.inside_vcf, | |||||
D5_trio_vcf_outside=mendelian.D5_trio_vcf_outside, | |||||
D6_trio_vcf_outside=mendelian.D6_trio_vcf_outside, | |||||
family_vcf_outside=merge_family.outside_vcf, | |||||
docker=DIYdocker, | |||||
cluster_config=SMALLcluster_config, | |||||
disk_size=disk_size | |||||
} | |||||
} | |||||