@@ -0,0 +1,60 @@ | |||
from __future__ import division | |||
import pandas as pd | |||
import sys, argparse, os | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to calculate reproducibility between Quartet_D5 and Quartet_D6s") | |||
parser.add_argument('-sister', '--sister', type=str, help='sister.txt', required=True) | |||
parser.add_argument('-project', '--project', type=str, help='project name', required=True) | |||
args = parser.parse_args() | |||
sister_file = args.sister | |||
project_name = args.project | |||
# output file | |||
output_name = project_name + '.sister.reproducibility.txt' | |||
output_file = open(output_name,'w') | |||
# input files | |||
sister_dat = pd.read_table(sister_file) | |||
sister_same = 0 | |||
sister_diff = 0 | |||
for row in sister_dat.itertuples(): | |||
# sister | |||
if row[5] == row[6]: | |||
if row[5] == './.': | |||
mendelian = 'noInfo' | |||
sister_count = "no" | |||
elif row[5] == '0/0': | |||
mendelian = 'Ref' | |||
sister_count = "no" | |||
else: | |||
mendelian = '1' | |||
sister_count = "yes_same" | |||
else: | |||
mendelian = '0' | |||
if (row[5] == './.' or row[5] == '0/0') and (row[6] == './.' or row[6] == '0/0'): | |||
sister_count = "no" | |||
else: | |||
sister_count = "yes_diff" | |||
if sister_count == 'yes_same': | |||
sister_same += 1 | |||
elif sister_count == 'yes_diff': | |||
sister_diff += 1 | |||
else: | |||
pass | |||
sister = sister_same/(sister_same + sister_diff) | |||
outcolumn = 'Project\tReproducibility_D5_D6\n' | |||
outResult = project_name + '\t' + str(sister) + '\n' | |||
output_file.write(outcolumn) | |||
output_file.write(outResult) | |||
@@ -0,0 +1,37 @@ | |||
import pandas as pd | |||
import sys, argparse, os | |||
mut = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/MIE/vcf/mutation_type',header=None) | |||
outIndel = open(sys.argv[1],'w') | |||
for row in mut.itertuples(): | |||
if ',' in row._4: | |||
alt_seq = row._4.split(',') | |||
alt_len = [len(i) for i in alt_seq] | |||
alt = max(alt_len) | |||
else: | |||
alt = len(row._4) | |||
ref = row._3 | |||
pos = row._2 | |||
if len(ref) == 1 and alt == 1: | |||
pass | |||
elif len(ref) > alt: | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (len(ref) - 1) | |||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||
outIndel.write(outline_indel) | |||
elif alt > len(ref): | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (alt - 1) | |||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||
outIndel.write(outline_indel) | |||
elif len(ref) == alt: | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (alt - 1) | |||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||
outIndel.write(outline_indel) | |||
@@ -0,0 +1,72 @@ | |||
import pandas as pd | |||
import sys, argparse, os | |||
mut = mut = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/vcf/mutation_type',header=None) | |||
vote = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/benchmark.vote.mendelian.txt',header=None) | |||
merged_df = pd.merge(vote, mut, how='inner', left_on=[0,1], right_on = [0,1]) | |||
outFile = open(sys.argv[1],'w') | |||
outIndel = open(sys.argv[2],'w') | |||
for row in merged_df.itertuples(): | |||
#d5 | |||
if ',' in row._7: | |||
d5 = row._7.split(',') | |||
d5_len = [len(i) for i in d5] | |||
d5_alt = max(d5_len) | |||
else: | |||
d5_alt = len(row._7) | |||
#d6 | |||
if ',' in row._15: | |||
d6 = row._15.split(',') | |||
d6_len = [len(i) for i in d6] | |||
d6_alt = max(d6_len) | |||
else: | |||
d6_alt = len(row._15) | |||
#f7 | |||
if ',' in row._23: | |||
f7 = row._23.split(',') | |||
f7_len = [len(i) for i in f7] | |||
f7_alt = max(f7_len) | |||
else: | |||
f7_alt = len(row._23) | |||
#m8 | |||
if ',' in row._31: | |||
m8 = row._31.split(',') | |||
m8_len = [len(i) for i in m8] | |||
m8_alt = max(m8_len) | |||
else: | |||
m8_alt = len(row._31) | |||
all_length = [d5_alt,d6_alt,f7_alt,m8_alt] | |||
alt = max(all_length) | |||
ref = row._35 | |||
pos = int(row._2) | |||
if len(ref) == 1 and alt == 1: | |||
StartPos = int(pos) -1 | |||
EndPos = int(pos) | |||
cate = 'SNV' | |||
elif len(ref) > alt: | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (len(ref) - 1) | |||
cate = 'INDEL' | |||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||
outIndel.write(outline_indel) | |||
elif alt > len(ref): | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (alt - 1) | |||
cate = 'INDEL' | |||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||
outIndel.write(outline_indel) | |||
elif len(ref) == alt: | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (alt - 1) | |||
cate = 'INDEL' | |||
outline_indel = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n' | |||
outIndel.write(outline_indel) | |||
outline = row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\t' + str(row._2) + '\t' + cate + '\n' | |||
outFile.write(outline) | |||
@@ -0,0 +1,67 @@ | |||
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$7"\t.\t.\t.\tGT\t"$6}' | grep -v '0/0' > LCL5.body | |||
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$15"\t.\t.\t.\tGT\t"$14}' | grep -v '0/0' > LCL6.body | |||
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$23"\t.\t.\t.\tGT\t"$22}' | grep -v '0/0'> LCL7.body | |||
cat benchmark.men.vote.diffbed.lengthlessthan50.txt | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$31"\t.\t.\t.\tGT\t"$30}'| grep -v '0/0' > LCL8.body | |||
cat header5 LCL5.body > LCL5.beforediffbed.vcf | |||
cat header6 LCL6.body > LCL6.beforediffbed.vcf | |||
cat header7 LCL7.body > LCL7.beforediffbed.vcf | |||
cat header8 LCL8.body > LCL8.beforediffbed.vcf | |||
rtg bgzip *beforediffbed.vcf | |||
rtg index *beforediffbed.vcf.gz | |||
rtg vcffilter -i LCL5.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL5.afterfilterdiffbed.vcf.gz | |||
rtg vcffilter -i LCL6.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL6.afterfilterdiffbed.vcf.gz | |||
rtg vcffilter -i LCL7.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL7.afterfilterdiffbed.vcf.gz | |||
rtg vcffilter -i LCL8.beforediffbed.vcf.gz --exclude-bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed -o LCL8.afterfilterdiffbed.vcf.gz | |||
/mnt/pgx_src_data_pool_4/home/renluyao/softwares/annovar/table_annovar.pl LCL5.beforediffbed.vcf.gz /mnt/pgx_src_data_pool_4/home/renluyao/softwares/annovar/humandb \ | |||
-buildver hg38 \ | |||
-out LCL5 \ | |||
-remove \ | |||
-protocol 1000g2015aug_all,1000g2015aug_afr,1000g2015aug_amr,1000g2015aug_eas,1000g2015aug_eur,1000g2015aug_sas,clinvar_20190305,gnomad211_genome \ | |||
-operation f,f,f,f,f,f,f,f \ | |||
-nastring . \ | |||
-vcfinput \ | |||
--thread 8 | |||
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL5.afterfilterdiffbed.vcf.gz -o LCL5_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/ | |||
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL6.afterfilterdiffbed.vcf.gz -o LCL6_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/ | |||
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL7.afterfilterdiffbed.vcf.gz -o LCL7_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/ | |||
rtg vcfeval -b /mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz -c LCL8.afterfilterdiffbed.vcf.gz -o LCL8_NIST -t /mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/ | |||
zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l | |||
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l | |||
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l | |||
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l | |||
zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l | |||
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l | |||
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l | |||
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l | |||
zcat LCL5.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l | |||
zcat LCL6.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l | |||
zcat LCL7.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l | |||
zcat LCL8.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l | |||
bedtools subtract -a LCL5.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL5.27.homo_ref.consensus.filtereddiffbed.bed | |||
bedtools subtract -a LCL6.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL6.27.homo_ref.consensus.filtereddiffbed.bed | |||
bedtools subtract -a LCL7.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL7.27.homo_ref.consensus.filtereddiffbed.bed | |||
bedtools subtract -a LCL8.27.homo_ref.consensus.bed -b /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed > LCL8.27.homo_ref.consensus.filtereddiffbed.bed | |||
python vcf2bed.py LCL5.body LCL5.variants.bed | |||
python vcf2bed.py LCL6.body LCL6.variants.bed | |||
python vcf2bed.py LCL7.body LCL7.variants.bed | |||
python vcf2bed.py LCL8.body LCL8.variants.bed | |||
cat /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/LCL5.variants.bed | cut -f1,11,12 | cat - LCL5.27.homo_ref.consensus.filtereddiffbed.bed | sort -k1,1 -k2,2n > LCL5.high.confidence.bed | |||
@@ -0,0 +1,23 @@ | |||
##contig=<ID=chr1,length=248956422> | |||
##contig=<ID=chr2,length=242193529> | |||
##contig=<ID=chr3,length=198295559> | |||
##contig=<ID=chr4,length=190214555> | |||
##contig=<ID=chr5,length=181538259> | |||
##contig=<ID=chr6,length=170805979> | |||
##contig=<ID=chr7,length=159345973> | |||
##contig=<ID=chr8,length=145138636> | |||
##contig=<ID=chr9,length=138394717> | |||
##contig=<ID=chr10,length=133797422> | |||
##contig=<ID=chr11,length=135086622> | |||
##contig=<ID=chr12,length=133275309> | |||
##contig=<ID=chr13,length=114364328> | |||
##contig=<ID=chr14,length=107043718> | |||
##contig=<ID=chr15,length=101991189> | |||
##contig=<ID=chr16,length=90338345> | |||
##contig=<ID=chr17,length=83257441> | |||
##contig=<ID=chr18,length=80373285> | |||
##contig=<ID=chr19,length=58617616> | |||
##contig=<ID=chr20,length=64444167> | |||
##contig=<ID=chr21,length=46709983> | |||
##contig=<ID=chr22,length=50818468> | |||
##contig=<ID=chrX,length=156040895> |
@@ -0,0 +1,65 @@ | |||
import json | |||
import pandas as pd | |||
import sys, argparse, os | |||
parser = argparse.ArgumentParser(description="This script is to get information from multiqc") | |||
parser.add_argument('-fastqc_qualimap', '--fastqc_qualimap', type=str, help='multiqc_general_stats.txt', required=True) | |||
parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt', required=True) | |||
parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt', required=True) | |||
parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json', required=True) | |||
args = parser.parse_args() | |||
# Rename input: | |||
fastqc_qualimap_file = args.fastqc_qualimap | |||
fastqc_file = args.fastqc | |||
fastqscreen_file = args.fastqscreen | |||
hap_file = args.happy | |||
# fastqc and qualimap | |||
dat = pd.read_table(fastqc_qualimap_file) | |||
fastqc = dat.loc[:, dat.columns.str.startswith('FastQC')] | |||
fastqc.insert(loc=0, column='Sample', value=dat['Sample']) | |||
fastqc_stat = fastqc.dropna() | |||
# qulimap | |||
qualimap = dat.loc[:, dat.columns.str.startswith('QualiMap')] | |||
qualimap.insert(loc=0, column='Sample', value=dat['Sample']) | |||
qualimap_stat = qualimap.dropna() | |||
# fastqc | |||
dat = pd.read_table(fastqc_file) | |||
fastqc_module = dat.loc[:, "per_base_sequence_quality":"kmer_content"] | |||
fastqc_module.insert(loc=0, column='Sample', value=dat['Sample']) | |||
fastqc_all = pd.merge(fastqc_stat,fastqc_module, how='outer', left_on=['Sample'], right_on = ['Sample']) | |||
# fastqscreen | |||
dat = pd.read_table(fastqscreen_file) | |||
fastqscreen = dat.loc[:, dat.columns.str.endswith('percentage')] | |||
dat['Sample'] = [i.replace('_screen','') for i in dat['Sample']] | |||
fastqscreen.insert(loc=0, column='Sample', value=dat['Sample']) | |||
# benchmark | |||
with open(hap_file) as hap_json: | |||
happy = json.load(hap_json) | |||
dat =pd.DataFrame.from_records(happy) | |||
dat = dat.loc[:, dat.columns.str.endswith('ALL')] | |||
dat_transposed = dat.T | |||
benchmark = dat_transposed.loc[:,['sample_id','METRIC.Precision','METRIC.Recall']] | |||
benchmark.columns = ['Sample','Precision','Recall'] | |||
#output | |||
fastqc_all.to_csv('fastqc.final.result.txt',sep="\t",index=0) | |||
fastqscreen.to_csv('fastqscreen.final.result.txt',sep="\t",index=0) | |||
qualimap_stat.to_csv('qualimap.final.result.txt',sep="\t",index=0) | |||
benchmark.to_csv('benchmark.final.result.txt',sep="\t",index=0) | |||
@@ -0,0 +1,92 @@ | |||
import sys,getopt | |||
import os | |||
import re | |||
import fileinput | |||
import pandas as pd | |||
def usage(): | |||
print( | |||
""" | |||
Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file | |||
This script will extract SNVs and Indels information from the vcf files and output a tab-delimited files. | |||
Input: | |||
-i the selected vcf file | |||
Output: | |||
-o tab-delimited parsed file | |||
""") | |||
# select supported small variants | |||
def process(oneLine): | |||
line = oneLine.rstrip() | |||
strings = line.strip().split('\t') | |||
infoParsed = parse_INFO(strings[7]) | |||
formatKeys = strings[8].split(':') | |||
formatValues = strings[9].split(':') | |||
for i in range(0,len(formatKeys) -1) : | |||
if formatKeys[i] == 'AD': | |||
ra = formatValues[i].split(',') | |||
infoParsed['RefDP'] = ra[0] | |||
infoParsed['AltDP'] = ra[1] | |||
if (int(ra[1]) + int(ra[0])) != 0: | |||
infoParsed['af'] = float(int(ra[1])/(int(ra[1]) + int(ra[0]))) | |||
else: | |||
pass | |||
else: | |||
infoParsed[formatKeys[i]] = formatValues[i] | |||
infoParsed['chromo'] = strings[0] | |||
infoParsed['pos'] = strings[1] | |||
infoParsed['id'] = strings[2] | |||
infoParsed['ref'] = strings[3] | |||
infoParsed['alt'] = strings[4] | |||
infoParsed['qual'] = strings[5] | |||
return infoParsed | |||
def parse_INFO(info): | |||
strings = info.strip().split(';') | |||
keys = [] | |||
values = [] | |||
for i in strings: | |||
kv = i.split('=') | |||
if kv[0] == 'DB': | |||
keys.append('DB') | |||
values.append('1') | |||
elif kv[0] == 'AF': | |||
pass | |||
else: | |||
keys.append(kv[0]) | |||
values.append(kv[1]) | |||
infoDict = dict(zip(keys, values)) | |||
return infoDict | |||
opts,args = getopt.getopt(sys.argv[1:],"hi:o:") | |||
for op,value in opts: | |||
if op == "-i": | |||
inputFile=value | |||
elif op == "-o": | |||
outputFile=value | |||
elif op == "-h": | |||
usage() | |||
sys.exit() | |||
if len(sys.argv[1:]) < 3: | |||
usage() | |||
sys.exit() | |||
allDict = [] | |||
for line in fileinput.input(inputFile): | |||
m = re.match('^\#',line) | |||
if m is not None: | |||
pass | |||
else: | |||
oneDict = process(line) | |||
allDict.append(oneDict) | |||
allTable = pd.DataFrame(allDict) | |||
allTable.to_csv(outputFile,sep='\t',index=False) | |||
@@ -0,0 +1,47 @@ | |||
import sys,getopt | |||
from itertools import islice | |||
over_50_outfile = open("indel_lenth_over_50.txt",'w') | |||
less_50_outfile = open("benchmark.men.vote.diffbed.lengthlessthan50.txt","w") | |||
def process(line): | |||
strings = line.strip().split('\t') | |||
#d5 | |||
if ',' in strings[6]: | |||
d5 = strings[6].split(',') | |||
d5_len = [len(i) for i in d5] | |||
d5_alt = max(d5_len) | |||
else: | |||
d5_alt = len(strings[6]) | |||
#d6 | |||
if ',' in strings[14]: | |||
d6 = strings[14].split(',') | |||
d6_len = [len(i) for i in d6] | |||
d6_alt = max(d6_len) | |||
else: | |||
d6_alt = len(strings[14]) | |||
#f7 | |||
if ',' in strings[22]: | |||
f7 = strings[22].split(',') | |||
f7_len = [len(i) for i in f7] | |||
f7_alt = max(f7_len) | |||
else: | |||
f7_alt = len(strings[22]) | |||
#m8 | |||
if ',' in strings[30]: | |||
m8 = strings[30].split(',') | |||
m8_len = [len(i) for i in m8] | |||
m8_alt = max(m8_len) | |||
else: | |||
m8_alt = len(strings[30]) | |||
#ref | |||
ref_len = len(strings[34]) | |||
if (d5_alt > 50) or (d6_alt > 50) or (f7_alt > 50) or (m8_alt > 50) or (ref_len > 50): | |||
over_50_outfile.write(line) | |||
else: | |||
less_50_outfile.write(line) | |||
input_file = open(sys.argv[1]) | |||
for line in islice(input_file, 1, None): | |||
process(line) |
@@ -0,0 +1,43 @@ | |||
from itertools import islice | |||
import fileinput | |||
import sys, argparse, os | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to exclude indel over 50bp") | |||
parser.add_argument('-i', '--mergedGVCF', type=str, help='merged gVCF txt with only chr, pos, ref, alt and genotypes', required=True) | |||
parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file', required=True) | |||
args = parser.parse_args() | |||
input_dat = args.mergedGVCF | |||
prefix = args.prefix | |||
# output file | |||
output_name = prefix + '.indel.lessthan50bp.txt' | |||
outfile = open(output_name,'w') | |||
def process(line): | |||
strings = line.strip().split('\t') | |||
#d5 | |||
if ',' in strings[3]: | |||
alt = strings[3].split(',') | |||
alt_len = [len(i) for i in alt] | |||
alt_max = max(alt_len) | |||
else: | |||
alt_max = len(strings[3]) | |||
#ref | |||
ref_len = len(strings[2]) | |||
if (alt_max > 50) or (ref_len > 50): | |||
pass | |||
else: | |||
outfile.write(line) | |||
for line in fileinput.input(input_dat): | |||
process(line) | |||
@@ -0,0 +1,36 @@ | |||
from __future__ import division | |||
import pandas as pd | |||
import sys, argparse, os | |||
import fileinput | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to get filtered and benchmark vcf info") | |||
parser.add_argument('-filtered', '--filtered', type=str, help='filtered position', required=True) | |||
parser.add_argument('-benchmark', '--benchmark', type=str, help='benchmark position', required=True) | |||
parser.add_argument('-vcf', '--vcf', type=str, help='one specific vcf', required=True) | |||
parser.add_argument('-filename', '--filename', type=str, help='output file name', required=True) | |||
args = parser.parse_args() | |||
filtered = args.filtered | |||
benchmark = args.benchmark | |||
vcf = args.vcf | |||
filename = args.filename | |||
# output file | |||
filtered_filename = filename + '.filtered.txt' | |||
benchmark_filename = filename + '.benchmark.txt' | |||
# input files | |||
filtered_dat = pd.read_table(filtered,header=None) | |||
benchmark_dat = pd.read_table(benchmark,header=None) | |||
vcf_dat = pd.read_table(vcf) | |||
filtered_merged_df = pd.merge(filtered_dat, vcf_dat, how='inner',left_on=[0,1], right_on = ['#CHROM','POS']) | |||
benchmark_merged_df = pd.merge(benchmark_dat,vcf_dat, how='inner',left_on=[0,1], right_on = ['#CHROM','POS']) | |||
filtered_merged_df.to_csv(filtered_filename,sep='\t',index=False) | |||
benchmark_merged_df.to_csv(benchmark_filename,sep='\t',index=False) |
@@ -0,0 +1,50 @@ | |||
import pandas as pd | |||
import sys, argparse, os | |||
parser = argparse.ArgumentParser(description="This script is to get information from hap") | |||
parser.add_argument('-hap', '--happy', type=str, help='hap.py table', required=True) | |||
parser.add_argument('-name', '--name', type=str, help='sample name', required=True) | |||
args = parser.parse_args() | |||
hap_file = args.happy | |||
name = args.name | |||
dat = pd.read_table(hap_file) | |||
dat['QUERY.TP'] = dat['QUERY.TOTAL'].astype(int) - dat['QUERY.UNK'].astype(int) - dat['QUERY.FP'].astype(int) | |||
dat['QUERY'] = dat['QUERY.TOTAL'].astype(int) - dat['QUERY.UNK'].astype(int) | |||
indel = dat[['INDEL' in s for s in dat['Type']]] | |||
snv = dat[['SNP' in s for s in dat['Type']]] | |||
indel.reset_index(drop=True, inplace=True) | |||
snv.reset_index(drop=True, inplace=True) | |||
benchmark = pd.concat([snv, indel], axis=1) | |||
benchmark = benchmark[[ 'QUERY.TOTAL', 'QUERY','QUERY.TP','QUERY.FP','TRUTH.FN','METRIC.Precision', 'METRIC.Recall','METRIC.F1_Score']] | |||
benchmark.columns = ['SNV number','INDEL number','SNV query','INDEL query','SNV TP','INDEL TP','SNV FP','INDEL FP','SNV FN','INDEL FN','SNV precision','INDEL precision','SNV recall','INDEL recall','SNV F1','INDEL F1'] | |||
benchmark = benchmark[['SNV number','INDEL number','SNV query','INDEL query','SNV TP','INDEL TP','SNV FP','INDEL FP','SNV FN','INDEL FN','SNV precision','INDEL precision','SNV recall','INDEL recall','SNV F1','INDEL F1']] | |||
benchmark['SNV precision'] = benchmark['SNV precision'].astype(float) | |||
benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float) | |||
benchmark['SNV recall'] = benchmark['SNV recall'].astype(float) | |||
benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float) | |||
benchmark['SNV F1'] = benchmark['SNV F1'].astype(float) | |||
benchmark['INDEL F1'] = benchmark['INDEL F1'].astype(float) | |||
benchmark = benchmark.round(2) | |||
name_array = name.split("_") | |||
LCL5_1 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL5_1" + "_" + name_array[5] | |||
LCL5_2 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL5_2" + "_" + name_array[5] | |||
LCL5_3 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL5_3" + "_" + name_array[5] | |||
LCL6_1 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL6_1" + "_" + name_array[5] | |||
LCL6_2 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL6_2" + "_" + name_array[5] | |||
LCL6_3 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL6_3" + "_" + name_array[5] | |||
LCL7_1 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL7_1" + "_" + name_array[5] | |||
LCL7_2 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL7_2" + "_" + name_array[5] | |||
LCL7_3 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL7_3" + "_" + name_array[5] | |||
LCL8_1 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL8_1" + "_" + name_array[5] | |||
LCL8_2 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL8_2" + "_" + name_array[5] | |||
LCL8_3 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL8_3" + "_" + name_array[5] | |||
benchmark.insert(loc=0, column='Sample', value=[LCL5_1,LCL5_2,LCL5_3,LCL6_1,LCL6_2,LCL6_3,LCL7_1,LCL7_2,LCL7_3,LCL8_1,LCL8_2,LCL8_3]) | |||
benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0) |
@@ -0,0 +1,416 @@ | |||
from __future__ import division | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
import pandas as pd | |||
from operator import itemgetter | |||
from collections import Counter | |||
from itertools import islice | |||
from numpy import * | |||
import statistics | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to count voting number") | |||
parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True) | |||
parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list', required=True) | |||
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True) | |||
parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True) | |||
args = parser.parse_args() | |||
multi_sample_vcf = args.multi_sample_vcf | |||
dup_list = args.dup_list | |||
prefix = args.prefix | |||
sample_name = args.sample_name | |||
vcf_header = '''##fileformat=VCFv4.2 | |||
##fileDate=20200331 | |||
##source=high_confidence_calls_intergration(choppy app) | |||
##reference=GRCh38.d1.vd1 | |||
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region"> | |||
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes"> | |||
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes"> | |||
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes"> | |||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples"> | |||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples"> | |||
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency, sum alternative depth / sum depth"> | |||
##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Average genotype quality"> | |||
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth"> | |||
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Average mapping quality"> | |||
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias"> | |||
##FORMAT=<ID=QUALI,Number=1,Type=Float,Description="Average variant quality"> | |||
##contig=<ID=chr1,length=248956422> | |||
##contig=<ID=chr2,length=242193529> | |||
##contig=<ID=chr3,length=198295559> | |||
##contig=<ID=chr4,length=190214555> | |||
##contig=<ID=chr5,length=181538259> | |||
##contig=<ID=chr6,length=170805979> | |||
##contig=<ID=chr7,length=159345973> | |||
##contig=<ID=chr8,length=145138636> | |||
##contig=<ID=chr9,length=138394717> | |||
##contig=<ID=chr10,length=133797422> | |||
##contig=<ID=chr11,length=135086622> | |||
##contig=<ID=chr12,length=133275309> | |||
##contig=<ID=chr13,length=114364328> | |||
##contig=<ID=chr14,length=107043718> | |||
##contig=<ID=chr15,length=101991189> | |||
##contig=<ID=chr16,length=90338345> | |||
##contig=<ID=chr17,length=83257441> | |||
##contig=<ID=chr18,length=80373285> | |||
##contig=<ID=chr19,length=58617616> | |||
##contig=<ID=chr20,length=64444167> | |||
##contig=<ID=chr21,length=46709983> | |||
##contig=<ID=chr22,length=50818468> | |||
##contig=<ID=chrX,length=156040895> | |||
''' | |||
vcf_header_all_sample = '''##fileformat=VCFv4.2 | |||
##fileDate=20200331 | |||
##reference=GRCh38.d1.vd1 | |||
##INFO=<ID=location,Number=1,Type=String,Description="Repeat region"> | |||
##INFO=<ID=DUP,Number=1,Type=Flag,Description="Duplicated variant records"> | |||
##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes"> | |||
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes"> | |||
##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes"> | |||
##INFO=<ID=ALL_ALT,Number=1,Type=Float,Description="Sum of alternative reads of all samples"> | |||
##INFO=<ID=ALL_DP,Number=1,Type=Float,Description="Sum of depth of all samples"> | |||
##INFO=<ID=ALL_AF,Number=1,Type=Float,Description="Allele frequency of net alternatice reads and net depth"> | |||
##INFO=<ID=GQ_MEAN,Number=1,Type=Float,Description="Mean of genotype quality of all samples"> | |||
##INFO=<ID=QD_MEAN,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth"> | |||
##INFO=<ID=MQ_MEAN,Number=1,Type=Float,Description="Mean of mapping quality of all samples"> | |||
##INFO=<ID=FS_MEAN,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias"> | |||
##INFO=<ID=QUAL_MEAN,Number=1,Type=Float,Description="Average variant quality"> | |||
##INFO=<ID=PCR,Number=1,Type=String,Description="Consensus of PCR votes"> | |||
##INFO=<ID=PCR_FREE,Number=1,Type=String,Description="Consensus of PCR-free votes"> | |||
##INFO=<ID=CONSENSUS,Number=1,Type=String,Description="Consensus calls"> | |||
##INFO=<ID=CONSENSUS_SEQ,Number=1,Type=String,Description="Consensus sequence"> | |||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||
##FORMAT=<ID=DP,Number=1,Type=String,Description="Depth"> | |||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth"> | |||
##FORMAT=<ID=AF,Number=1,Type=String,Description="Allele frequency"> | |||
##FORMAT=<ID=GQ,Number=1,Type=String,Description="Genotype quality"> | |||
##FORMAT=<ID=MQ,Number=1,Type=String,Description="Mapping quality"> | |||
##FORMAT=<ID=TWINS,Number=1,Type=String,Description="1 is twins shared, 0 is twins discordant "> | |||
##FORMAT=<ID=TRIO5,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL5 mendelian consistent, 0 is mendelian vioaltion"> | |||
##FORMAT=<ID=TRIO6,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL6 mendelian consistent, 0 is mendelian vioaltion"> | |||
##contig=<ID=chr1,length=248956422> | |||
##contig=<ID=chr2,length=242193529> | |||
##contig=<ID=chr3,length=198295559> | |||
##contig=<ID=chr4,length=190214555> | |||
##contig=<ID=chr5,length=181538259> | |||
##contig=<ID=chr6,length=170805979> | |||
##contig=<ID=chr7,length=159345973> | |||
##contig=<ID=chr8,length=145138636> | |||
##contig=<ID=chr9,length=138394717> | |||
##contig=<ID=chr10,length=133797422> | |||
##contig=<ID=chr11,length=135086622> | |||
##contig=<ID=chr12,length=133275309> | |||
##contig=<ID=chr13,length=114364328> | |||
##contig=<ID=chr14,length=107043718> | |||
##contig=<ID=chr15,length=101991189> | |||
##contig=<ID=chr16,length=90338345> | |||
##contig=<ID=chr17,length=83257441> | |||
##contig=<ID=chr18,length=80373285> | |||
##contig=<ID=chr19,length=58617616> | |||
##contig=<ID=chr20,length=64444167> | |||
##contig=<ID=chr21,length=46709983> | |||
##contig=<ID=chr22,length=50818468> | |||
##contig=<ID=chrX,length=156040895> | |||
''' | |||
# read in duplication list | |||
dup = pd.read_table(dup_list,header=None) | |||
var_dup = dup[0].tolist() | |||
# output file | |||
benchmark_file_name = prefix + '_voted.vcf' | |||
benchmark_outfile = open(benchmark_file_name,'w') | |||
all_sample_file_name = prefix + '_all_sample_information.vcf' | |||
all_sample_outfile = open(all_sample_file_name,'w') | |||
# write VCF | |||
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '_benchmark_calls\n' | |||
benchmark_outfile.write(vcf_header) | |||
benchmark_outfile.write(outputcolumn) | |||
outputcolumn_all_sample = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+ \ | |||
'Quartet_DNA_BGI_SEQ2000_BGI_1_20180518\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530\t' + \ | |||
'Quartet_DNA_BGI_T7_WGE_1_20191105\tQuartet_DNA_BGI_T7_WGE_2_20191105\tQuartet_DNA_BGI_T7_WGE_3_20191105\t' + \ | |||
'Quartet_DNA_ILM_Nova_ARD_1_20181108\tQuartet_DNA_ILM_Nova_ARD_2_20181108\tQuartet_DNA_ILM_Nova_ARD_3_20181108\t' + \ | |||
'Quartet_DNA_ILM_Nova_ARD_4_20190111\tQuartet_DNA_ILM_Nova_ARD_5_20190111\tQuartet_DNA_ILM_Nova_ARD_6_20190111\t' + \ | |||
'Quartet_DNA_ILM_Nova_BRG_1_20180930\tQuartet_DNA_ILM_Nova_BRG_2_20180930\tQuartet_DNA_ILM_Nova_BRG_3_20180930\t' + \ | |||
'Quartet_DNA_ILM_Nova_WUX_1_20190917\tQuartet_DNA_ILM_Nova_WUX_2_20190917\tQuartet_DNA_ILM_Nova_WUX_3_20190917\t' + \ | |||
'Quartet_DNA_ILM_XTen_ARD_1_20170403\tQuartet_DNA_ILM_XTen_ARD_2_20170403\tQuartet_DNA_ILM_XTen_ARD_3_20170403\t' + \ | |||
'Quartet_DNA_ILM_XTen_NVG_1_20170329\tQuartet_DNA_ILM_XTen_NVG_2_20170329\tQuartet_DNA_ILM_XTen_NVG_3_20170329\t' + \ | |||
'Quartet_DNA_ILM_XTen_WUX_1_20170216\tQuartet_DNA_ILM_XTen_WUX_2_20170216\tQuartet_DNA_ILM_XTen_WUX_3_20170216\n' | |||
all_sample_outfile.write(vcf_header_all_sample) | |||
all_sample_outfile.write(outputcolumn_all_sample) | |||
#function | |||
def replace_nan(strings_list): | |||
updated_list = [] | |||
for i in strings_list: | |||
if i == '.': | |||
updated_list.append('.:.:.:.:.:.:.:.:.:.:.:.') | |||
else: | |||
updated_list.append(i) | |||
return updated_list | |||
def remove_dot(strings_list): | |||
updated_list = [] | |||
for i in strings_list: | |||
if i == '.': | |||
pass | |||
else: | |||
updated_list.append(i) | |||
return updated_list | |||
def detected_number(strings): | |||
gt = [x.split(':')[0] for x in strings] | |||
percentage = 27 - gt.count('.') | |||
return(str(percentage)) | |||
def vote_number(strings,consensus_call): | |||
gt = [x.split(':')[0] for x in strings] | |||
gt = [x.replace('.','0/0') for x in gt] | |||
gt = list(map(gt_uniform,[i for i in gt])) | |||
vote_num = gt.count(consensus_call) | |||
return(str(vote_num)) | |||
def family_vote(strings,consensus_call): | |||
gt = [x.split(':')[0] for x in strings] | |||
gt = [x.replace('.','0/0') for x in gt] | |||
gt = list(map(gt_uniform,[i for i in gt])) | |||
mendelian = [':'.join(x.split(':')[1:4]) for x in strings] | |||
indices = [i for i, x in enumerate(gt) if x == consensus_call] | |||
matched_mendelian = itemgetter(*indices)(mendelian) | |||
mendelian_num = matched_mendelian.count('1:1:1') | |||
return(str(mendelian_num)) | |||
def gt_uniform(strings): | |||
uniformed_gt = '' | |||
allele1 = strings.split('/')[0] | |||
allele2 = strings.split('/')[1] | |||
if int(allele1) > int(allele2): | |||
uniformed_gt = allele2 + '/' + allele1 | |||
else: | |||
uniformed_gt = allele1 + '/' + allele2 | |||
return uniformed_gt | |||
def decide_by_rep(strings): | |||
consensus_rep = '' | |||
mendelian = [':'.join(x.split(':')[1:4]) for x in strings] | |||
gt = [x.split(':')[0] for x in strings] | |||
gt = [x.replace('.','0/0') for x in gt] | |||
# modified gt turn 2/1 to 1/2 | |||
gt = list(map(gt_uniform,[i for i in gt])) | |||
# mendelian consistent? | |||
mendelian_dict = Counter(mendelian) | |||
highest_mendelian = mendelian_dict.most_common(1) | |||
candidate_mendelian = highest_mendelian[0][0] | |||
freq_mendelian = highest_mendelian[0][1] | |||
if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2): | |||
gt_num_dict = Counter(gt) | |||
highest_gt = gt_num_dict.most_common(1) | |||
candidate_gt = highest_gt[0][0] | |||
freq_gt = highest_gt[0][1] | |||
if (candidate_gt != '0/0') and (freq_gt >= 2): | |||
consensus_rep = candidate_gt | |||
elif (candidate_gt == '0/0') and (freq_gt >= 2): | |||
consensus_rep = '0/0' | |||
else: | |||
consensus_rep = 'inconGT' | |||
elif (candidate_mendelian == '') and (freq_mendelian >= 2): | |||
consensus_rep = 'noInfo' | |||
else: | |||
consensus_rep = 'inconMen' | |||
return consensus_rep | |||
def main(): | |||
for line in fileinput.input(multi_sample_vcf): | |||
headline = re.match('^\#',line) | |||
if headline is not None: | |||
pass | |||
else: | |||
line = line.strip() | |||
strings = line.split('\t') | |||
variant_id = '_'.join([strings[0],strings[1]]) | |||
# check if the variants location is duplicated | |||
if variant_id in var_dup: | |||
strings[7] = strings[7] + ';DUP' | |||
outLine = '\t'.join(strings) + '\n' | |||
all_sample_outfile.write(outLine) | |||
else: | |||
# pre-define | |||
pcr_consensus = '.' | |||
pcr_free_consensus = '.' | |||
consensus_call = '.' | |||
consensus_alt_seq = '.' | |||
# pcr | |||
strings[9:] = replace_nan(strings[9:]) | |||
pcr = itemgetter(*[9,10,11,27,28,29,30,31,32,33,34,35])(strings) | |||
SEQ2000 = decide_by_rep(pcr[0:3]) | |||
XTen_ARD = decide_by_rep(pcr[3:6]) | |||
XTen_NVG = decide_by_rep(pcr[6:9]) | |||
XTen_WUX = decide_by_rep(pcr[9:12]) | |||
sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX] | |||
sequence_dict = Counter(sequence_site) | |||
highest_sequence = sequence_dict.most_common(1) | |||
candidate_sequence = highest_sequence[0][0] | |||
freq_sequence = highest_sequence[0][1] | |||
if freq_sequence > 2: | |||
pcr_consensus = candidate_sequence | |||
else: | |||
pcr_consensus = 'inconSequenceSite' | |||
# pcr-free | |||
pcr_free = itemgetter(*[12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])(strings) | |||
T7_WGE = decide_by_rep(pcr_free[0:3]) | |||
Nova_ARD_1 = decide_by_rep(pcr_free[3:6]) | |||
Nova_ARD_2 = decide_by_rep(pcr_free[6:9]) | |||
Nova_BRG = decide_by_rep(pcr_free[9:12]) | |||
Nova_WUX = decide_by_rep(pcr_free[12:15]) | |||
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX] | |||
highest_sequence = sequence_dict.most_common(1) | |||
candidate_sequence = highest_sequence[0][0] | |||
freq_sequence = highest_sequence[0][1] | |||
if freq_sequence > 3: | |||
pcr_free_consensus = candidate_sequence | |||
else: | |||
pcr_free_consensus = 'inconSequenceSite' | |||
# pcr and pcr-free | |||
tag = ['inconGT','noInfo','inconMen','inconSequenceSite'] | |||
if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag) and (pcr_consensus != '0/0'): | |||
consensus_call = pcr_consensus | |||
VOTED = vote_number(strings[9:],consensus_call) | |||
strings[7] = strings[7] + ';VOTED=' + VOTED | |||
DETECTED = detected_number(strings[9:]) | |||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||
FAM = family_vote(strings[9:],consensus_call) | |||
strings[7] = strings[7] + ';FAM=' + FAM | |||
# Delete multiple alternative genotype to necessary expression | |||
alt = strings[4] | |||
alt_gt = alt.split(',') | |||
if len(alt_gt) > 1: | |||
allele1 = consensus_call.split('/')[0] | |||
allele2 = consensus_call.split('/')[1] | |||
if allele1 == '0': | |||
allele2_seq = alt_gt[int(allele2) - 1] | |||
consensus_alt_seq = allele2_seq | |||
consensus_call = '0/1' | |||
else: | |||
allele1_seq = alt_gt[int(allele1) - 1] | |||
allele2_seq = alt_gt[int(allele2) - 1] | |||
if int(allele1) > int(allele2): | |||
consensus_alt_seq = allele2_seq + ',' + allele1_seq | |||
consensus_call = '1/2' | |||
elif int(allele1) < int(allele2): | |||
consensus_alt_seq = allele1_seq + ',' + allele2_seq | |||
consensus_call = '1/2' | |||
else: | |||
consensus_alt_seq = allele1_seq | |||
consensus_call = '1/1' | |||
else: | |||
consensus_alt_seq = alt | |||
# GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL | |||
# GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT | |||
# DP | |||
DP = [x.split(':')[4] for x in strings[9:]] | |||
DP = remove_dot(DP) | |||
DP = [int(x) for x in DP] | |||
ALL_DP = sum(DP) | |||
# AF | |||
ALT = [x.split(':')[5] for x in strings[9:]] | |||
ALT = remove_dot(ALT) | |||
ALT = [int(x) for x in ALT] | |||
ALL_ALT = sum(ALT) | |||
ALL_AF = round(ALL_ALT/ALL_DP,2) | |||
# GQ | |||
GQ = [x.split(':')[7] for x in strings[9:]] | |||
GQ = remove_dot(GQ) | |||
GQ = [int(x) for x in GQ] | |||
GQ_MEAN = round(mean(GQ),2) | |||
# QD | |||
QD = [x.split(':')[8] for x in strings[9:]] | |||
QD = remove_dot(QD) | |||
QD = [float(x) for x in QD] | |||
QD_MEAN = round(mean(QD),2) | |||
# MQ | |||
MQ = [x.split(':')[9] for x in strings[9:]] | |||
MQ = remove_dot(MQ) | |||
MQ = [float(x) for x in MQ] | |||
MQ_MEAN = round(mean(MQ),2) | |||
# FS | |||
FS = [x.split(':')[10] for x in strings[9:]] | |||
FS = remove_dot(FS) | |||
FS = [float(x) for x in FS] | |||
FS_MEAN = round(mean(FS),2) | |||
# QUAL | |||
QUAL = [x.split(':')[11] for x in strings[9:]] | |||
QUAL = remove_dot(QUAL) | |||
QUAL = [float(x) for x in QUAL] | |||
QUAL_MEAN = round(mean(QUAL),2) | |||
# benchmark output | |||
output_format = consensus_call + ':' + str(ALL_DP) + ':' + str(ALL_ALT) + ':' + str(ALL_AF) + ':' + str(GQ_MEAN) + ':' + str(QD_MEAN) + ':' + str(MQ_MEAN) + ':' + str(FS_MEAN) + ':' + str(QUAL_MEAN) | |||
outLine = strings[0] + '\t' + strings[1] + '\t' + strings[2] + '\t' + strings[3] + '\t' + consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' + strings[7] + '\t' + 'GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + output_format + '\n' | |||
benchmark_outfile.write(outLine) | |||
# all sample output | |||
strings[7] = strings[7] + ';ALL_ALT=' + str(ALL_ALT) + ';ALL_DP=' + str(ALL_DP) + ';ALL_AF=' + str(ALL_AF) \ | |||
+ ';GQ_MEAN=' + str(GQ_MEAN) + ';QD_MEAN=' + str(QD_MEAN) + ';MQ_MEAN=' + str(MQ_MEAN) + ';FS_MEAN=' + str(FS_MEAN) \ | |||
+ ';QUAL_MEAN=' + str(QUAL_MEAN) + ';PCR=' + consensus_call + ';PCR_FREE=' + consensus_call + ';CONSENSUS=' + consensus_call \ | |||
+ ';CONSENSUS_SEQ=' + consensus_alt_seq | |||
all_sample_outLine = '\t'.join(strings) + '\n' | |||
all_sample_outfile.write(all_sample_outLine) | |||
elif (pcr_consensus in tag) and (pcr_free_consensus in tag): | |||
consensus_call = 'filtered' | |||
DETECTED = detected_number(strings[9:]) | |||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||
all_sample_outLine = '\t'.join(strings) + '\n' | |||
all_sample_outfile.write(all_sample_outLine) | |||
elif ((pcr_consensus == '0/0') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')): | |||
consensus_call = 'pcr-free-speicifc' | |||
DETECTED = detected_number(strings[9:]) | |||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||
all_sample_outLine = '\t'.join(strings) + '\n' | |||
all_sample_outfile.write(all_sample_outLine) | |||
elif ((pcr_consensus != '0/0') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == '0/0')): | |||
consensus_call = 'pcr-speicifc' | |||
DETECTED = detected_number(strings[9:]) | |||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call + ';PCR=' + pcr_consensus + ';PCR_FREE=' + pcr_free_consensus | |||
all_sample_outLine = '\t'.join(strings) + '\n' | |||
all_sample_outfile.write(all_sample_outLine) | |||
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'): | |||
consensus_call = 'confirm for parents' | |||
DETECTED = detected_number(strings[9:]) | |||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||
all_sample_outLine = '\t'.join(strings) + '\n' | |||
all_sample_outfile.write(all_sample_outLine) | |||
else: | |||
consensus_call = 'filtered' | |||
DETECTED = detected_number(strings[9:]) | |||
strings[7] = strings[7] + ';DETECTED=' + DETECTED | |||
strings[7] = strings[7] + ';CONSENSUS=' + consensus_call | |||
all_sample_outLine = '\t'.join(strings) + '\n' | |||
all_sample_outfile.write(all_sample_outLine) | |||
if __name__ == '__main__': | |||
main() | |||
@@ -0,0 +1,42 @@ | |||
import pandas as pd | |||
import sys, argparse, os | |||
mut = pd.read_table(sys.argv[1]) | |||
outFile = open(sys.argv[2],'w') | |||
for row in mut.itertuples(): | |||
#d5 | |||
if ',' in row.V4: | |||
alt = row.V4.split(',') | |||
alt_len = [len(i) for i in alt] | |||
alt_max = max(alt_len) | |||
else: | |||
alt_max = len(row.V4) | |||
#d6 | |||
alt = alt_max | |||
ref = row.V3 | |||
pos = int(row.V2) | |||
if len(ref) == 1 and alt == 1: | |||
StartPos = int(pos) -1 | |||
EndPos = int(pos) | |||
cate = 'SNV' | |||
elif len(ref) > alt: | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (len(ref) - 1) | |||
cate = 'INDEL' | |||
elif alt > len(ref): | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (alt - 1) | |||
cate = 'INDEL' | |||
elif len(ref) == alt: | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (alt - 1) | |||
cate = 'INDEL' | |||
outline = row.V1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\t' + str(row.V2) + '\t' + cate + '\n' | |||
outFile.write(outline) | |||
@@ -0,0 +1,50 @@ | |||
import pandas as pd | |||
import sys, argparse, os | |||
from operator import itemgetter | |||
parser = argparse.ArgumentParser(description="This script is to get how many samples") | |||
parser.add_argument('-sample', '--sample', type=str, help='quartet_sample', required=True) | |||
parser.add_argument('-rep', '--rep', type=str, help='quartet_rep', required=True) | |||
args = parser.parse_args() | |||
# Rename input: | |||
sample = args.sample | |||
rep = args.rep | |||
quartet_sample = pd.read_table(sample,header=None) | |||
quartet_sample = list(quartet_sample[0]) | |||
quartet_rep = pd.read_table(rep,header=None) | |||
quartet_rep = quartet_rep[0] | |||
#tags | |||
sister_tag = 'false' | |||
quartet_tag = 'false' | |||
quartet_rep_unique = list(set(quartet_rep)) | |||
single_rep = [i for i in range(len(quartet_rep)) if quartet_rep[i] == quartet_rep_unique[0]] | |||
single_batch_sample = itemgetter(*single_rep)(quartet_sample) | |||
num = len(single_batch_sample) | |||
if num == 1: | |||
sister_tag = 'false' | |||
quartet_tag = 'false' | |||
elif num == 2: | |||
if set(single_batch_sample) == set(['LCL5','LCL6']): | |||
sister_tag = 'true' | |||
quartet_tag = 'false' | |||
elif num == 3: | |||
if ('LCL5' in single_batch_sample) and ('LCL6' in single_batch_sample): | |||
sister_tag = 'true' | |||
quartet_tag = 'false' | |||
elif num == 4: | |||
if set(single_batch_sample) == set(['LCL5','LCL6','LCL7','LCL8']): | |||
sister_tag = 'false' | |||
quartet_tag = 'true' | |||
sister_outfile = open('sister_tag','w') | |||
quartet_outfile = open('quartet_tag','w') | |||
sister_outfile.write(sister_tag) | |||
quartet_outfile.write(quartet_tag) |
@@ -0,0 +1,42 @@ | |||
from __future__ import division | |||
import sys, argparse, os | |||
import pandas as pd | |||
from collections import Counter | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls") | |||
parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True) | |||
args = parser.parse_args() | |||
vcf = args.vcf | |||
lcl5_outfile = open('LCL5_all_variants.txt','w') | |||
filtered_outfile = open('LCL5_filtered_variants.txt','w') | |||
vcf_dat = pd.read_table(vcf) | |||
for row in vcf_dat.itertuples(): | |||
lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \ | |||
row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \ | |||
row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \ | |||
row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \ | |||
row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \ | |||
row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \ | |||
row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216] | |||
lcl5_vcf_gt = [x.split(':')[0] for x in lcl5_list] | |||
lcl5_gt=[item.replace('./.', '0/0') for item in lcl5_vcf_gt] | |||
gt_dict = Counter(lcl5_gt) | |||
highest_gt = gt_dict.most_common(1) | |||
candidate_gt = highest_gt[0][0] | |||
freq_gt = highest_gt[0][1] | |||
output = row._1 + '\t' + str(row.POS) + '\t' + '\t'.join(lcl5_gt) + '\n' | |||
if (candidate_gt == '0/0') and (freq_gt == 27): | |||
filtered_outfile.write(output) | |||
else: | |||
lcl5_outfile.write(output) | |||
@@ -0,0 +1,6 @@ | |||
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$7"\t.\t.\t.\tGT\t"$6}' | grep -v '2_y' > LCL5.body | |||
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$15"\t.\t.\t.\tGT\t"$14}' | grep -v '2_y' > LCL6.body | |||
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$23"\t.\t.\t.\tGT\t"$22}' | grep -v '2_y' > LCL7.body | |||
cat benchmark.men.vote.diffbed.filtered | awk '{print $1"\t"$2"\t"".""\t"$35"\t"$31"\t.\t.\t.\tGT\t"$30}' | grep -v '2_y' > LCL8.body | |||
for i in *txt; do cat $i | awk '{ if ((length($3) == 1) && (length($4) == 1)) { print } }' | grep -v '#' | cut -f3,4 | sort |uniq -c | sed 's/\s\+/\t/g' | cut -f2 > $i.mut; done |
@@ -0,0 +1,129 @@ | |||
from __future__ import division | |||
import pandas as pd | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to get final high confidence calls and information of all replicates") | |||
parser.add_argument('-vcfInfo', '--vcfInfo', type=str, help='The txt file of variants information, this file is named as prefix__variant_quality_location.txt', required=True) | |||
parser.add_argument('-mendelianInfo', '--mendelianInfo', type=str, help='The merged mendelian information of all samples', required=True) | |||
parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True) | |||
args = parser.parse_args() | |||
vcfInfo = args.vcfInfo | |||
mendelianInfo = args.mendelianInfo | |||
sample_name = args.sample_name | |||
#GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL | |||
vcf_header = '''##fileformat=VCFv4.2 | |||
##fileDate=20200331 | |||
##source=high_confidence_calls_intergration(choppy app) | |||
##reference=GRCh38.d1.vd1 | |||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||
##FORMAT=<ID=TWINS,Number=1,Type=Flag,Description="1 for sister consistent, 0 for sister different"> | |||
##FORMAT=<ID=TRIO5,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL5 mendelian consistent, 0 for family violation"> | |||
##FORMAT=<ID=TRIO6,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL6 mendelian consistent, 0 for family violation"> | |||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth"> | |||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth"> | |||
##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency"> | |||
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality"> | |||
##FORMAT=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth"> | |||
##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Mapping quality"> | |||
##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias"> | |||
##FORMAT=<ID=QUAL,Number=1,Type=Float,Description="variant quality"> | |||
##contig=<ID=chr1,length=248956422> | |||
##contig=<ID=chr2,length=242193529> | |||
##contig=<ID=chr3,length=198295559> | |||
##contig=<ID=chr4,length=190214555> | |||
##contig=<ID=chr5,length=181538259> | |||
##contig=<ID=chr6,length=170805979> | |||
##contig=<ID=chr7,length=159345973> | |||
##contig=<ID=chr8,length=145138636> | |||
##contig=<ID=chr9,length=138394717> | |||
##contig=<ID=chr10,length=133797422> | |||
##contig=<ID=chr11,length=135086622> | |||
##contig=<ID=chr12,length=133275309> | |||
##contig=<ID=chr13,length=114364328> | |||
##contig=<ID=chr14,length=107043718> | |||
##contig=<ID=chr15,length=101991189> | |||
##contig=<ID=chr16,length=90338345> | |||
##contig=<ID=chr17,length=83257441> | |||
##contig=<ID=chr18,length=80373285> | |||
##contig=<ID=chr19,length=58617616> | |||
##contig=<ID=chr20,length=64444167> | |||
##contig=<ID=chr21,length=46709983> | |||
##contig=<ID=chr22,length=50818468> | |||
##contig=<ID=chrX,length=156040895> | |||
''' | |||
# output file | |||
file_name = sample_name + '_mendelian_vcfInfo.vcf' | |||
outfile = open(file_name,'w') | |||
outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '\n' | |||
outfile.write(vcf_header) | |||
outfile.write(outputcolumn) | |||
# input files | |||
vcf_info = pd.read_table(vcfInfo) | |||
mendelian_info = pd.read_table(mendelianInfo) | |||
merged_df = pd.merge(vcf_info, mendelian_info, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS']) | |||
merged_df = merged_df.fillna('.') | |||
# | |||
def parse_INFO(info): | |||
strings = info.strip().split(';') | |||
keys = [] | |||
values = [] | |||
for i in strings: | |||
kv = i.split('=') | |||
if kv[0] == 'DB': | |||
keys.append('DB') | |||
values.append('1') | |||
else: | |||
keys.append(kv[0]) | |||
values.append(kv[1]) | |||
infoDict = dict(zip(keys, values)) | |||
return infoDict | |||
# | |||
for row in merged_df.itertuples(): | |||
if row[18] != '.': | |||
# format | |||
# GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL | |||
FORMAT_x = row[10].split(':') | |||
ALT = int(FORMAT_x[1].split(',')[1]) | |||
if int(FORMAT_x[2]) != 0: | |||
AF = round(ALT/int(FORMAT_x[2]),2) | |||
else: | |||
AF = '.' | |||
INFO_x = parse_INFO(row.INFO_x) | |||
if FORMAT_x[2] == '0': | |||
INFO_x['QD'] = '.' | |||
else: | |||
pass | |||
FORMAT = row[18] + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x) | |||
# outline | |||
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_y + '\t' + row.ALT_y + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + FORMAT + '\n' | |||
else: | |||
rawGT = row[10].split(':') | |||
FORMAT_x = row[10].split(':') | |||
ALT = int(FORMAT_x[1].split(',')[1]) | |||
if int(FORMAT_x[2]) != 0: | |||
AF = round(ALT/int(FORMAT_x[2]),2) | |||
else: | |||
AF = '.' | |||
INFO_x = parse_INFO(row.INFO_x) | |||
if FORMAT_x[2] == '0': | |||
INFO_x['QD'] = '.' | |||
else: | |||
pass | |||
FORMAT = '.:.:.:.' + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x) + ':' + rawGT[0] | |||
# outline | |||
outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_x + '\t' + row.ALT_x + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT' + '\t' + FORMAT + '\n' | |||
outfile.write(outline) |
@@ -0,0 +1,71 @@ | |||
from __future__ import division | |||
import pandas as pd | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information") | |||
parser.add_argument('-LCL5', '--LCL5', type=str, help='LCL5 family info', required=True) | |||
parser.add_argument('-LCL6', '--LCL6', type=str, help='LCL6 family info', required=True) | |||
parser.add_argument('-family', '--family', type=str, help='family name', required=True) | |||
args = parser.parse_args() | |||
lcl5 = args.LCL5 | |||
lcl6 = args.LCL6 | |||
family = args.family | |||
# output file | |||
family_name = family + '.txt' | |||
family_file = open(family_name,'w') | |||
# input files | |||
lcl5_dat = pd.read_table(lcl5) | |||
lcl6_dat = pd.read_table(lcl6) | |||
merged_df = pd.merge(lcl5_dat, lcl6_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS']) | |||
def alt_seq(alt, genotype): | |||
if genotype == './.': | |||
seq = './.' | |||
elif genotype == '0/0': | |||
seq = '0/0' | |||
else: | |||
alt = alt.split(',') | |||
genotype = genotype.split('/') | |||
if genotype[0] == '0': | |||
allele2 = alt[int(genotype[1]) - 1] | |||
seq = '0/' + allele2 | |||
else: | |||
allele1 = alt[int(genotype[0]) - 1] | |||
allele2 = alt[int(genotype[1]) - 1] | |||
seq = allele1 + '/' + allele2 | |||
return seq | |||
for row in merged_df.itertuples(): | |||
# correction of multiallele | |||
if pd.isnull(row.INFO_x) == True or pd.isnull(row.INFO_y) == True: | |||
mendelian = '.' | |||
else: | |||
lcl5_seq = alt_seq(row.ALT_x, row.CHILD_x) | |||
lcl6_seq = alt_seq(row.ALT_y, row.CHILD_y) | |||
if lcl5_seq == lcl6_seq: | |||
mendelian = '1' | |||
else: | |||
mendelian = '0' | |||
if pd.isnull(row.INFO_x) == True: | |||
mendelian = mendelian + ':.' | |||
else: | |||
mendelian = mendelian + ':' + row.INFO_x.split('=')[1] | |||
if pd.isnull(row.INFO_y) == True: | |||
mendelian = mendelian + ':.' | |||
else: | |||
mendelian = mendelian + ':' + row.INFO_y.split('=')[1] | |||
outline = row._1 + '\t' + str(row.POS) + '\t' + mendelian + '\n' | |||
family_file.write(outline) |
@@ -0,0 +1,115 @@ | |||
from __future__ import division | |||
import pandas as pd | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information") | |||
parser.add_argument('-LCL5', '--LCL5', type=str, help='LCL5 family info', required=True) | |||
parser.add_argument('-LCL6', '--LCL6', type=str, help='LCL6 family info', required=True) | |||
parser.add_argument('-genotype', '--genotype', type=str, help='Genotype information of a set of four family members', required=True) | |||
parser.add_argument('-family', '--family', type=str, help='family name', required=True) | |||
args = parser.parse_args() | |||
lcl5 = args.LCL5 | |||
lcl6 = args.LCL6 | |||
genotype = args.genotype | |||
family = args.family | |||
# output file | |||
family_name = family + '.txt' | |||
family_file = open(family_name,'w') | |||
summary_name = family + '.summary.txt' | |||
summary_file = open(summary_name,'w') | |||
# input files | |||
lcl5_dat = pd.read_table(lcl5) | |||
lcl6_dat = pd.read_table(lcl6) | |||
genotype_dat = pd.read_table(genotype) | |||
merged_df = pd.merge(lcl5_dat, lcl6_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS']) | |||
merged_genotype_df = pd.merge(merged_df, genotype_dat, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS']) | |||
merged_genotype_df_sub = merged_genotype_df.iloc[:,[0,1,23,24,29,30,31,32,7,17]] | |||
merged_genotype_df_sub.columns = ['CHROM', 'POS', 'REF', 'ALT','LCL5','LCL6','LCL7','LCL8', 'TRIO5', 'TRIO6'] | |||
sister_same = 0 | |||
sister_diff = 0 | |||
family_all = 0 | |||
family_mendelian = 0 | |||
for row in merged_genotype_df_sub.itertuples(): | |||
# sister | |||
if row.LCL5 == row.LCL6: | |||
if row.LCL5 == './.': | |||
mendelian = 'noInfo' | |||
sister_count = "no" | |||
elif row.LCL5 == '0/0': | |||
mendelian = 'Ref' | |||
sister_count = "no" | |||
else: | |||
mendelian = '1' | |||
sister_count = "yes_same" | |||
else: | |||
mendelian = '0' | |||
if (row.LCL5 == './.' or row.LCL5 == '0/0') and (row.LCL6 == './.' or row.LCL6 == '0/0'): | |||
sister_count = "no" | |||
else: | |||
sister_count = "yes_diff" | |||
if sister_count == 'yes_same': | |||
sister_same += 1 | |||
elif sister_count == 'yes_diff': | |||
sister_diff += 1 | |||
else: | |||
pass | |||
# family trio5 | |||
if row.LCL5 == row. LCL7 == row.LCL8 == './.': | |||
mendelian = mendelian + ':noInfo' | |||
elif row.LCL5 == row. LCL7 == row.LCL8 == '0/0': | |||
mendelian = mendelian + ':Ref' | |||
elif pd.isnull(row.TRIO5) == True: | |||
mendelian = mendelian + ':unVBT' | |||
else: | |||
mendelian = mendelian + ':' + row.TRIO5.split('=')[1] | |||
# family trio6 | |||
if row.LCL6 == row.LCL7 == row.LCL8 == './.': | |||
mendelian = mendelian + ':noInfo' | |||
elif row.LCL6 == row. LCL7 == row.LCL8 == '0/0': | |||
mendelian = mendelian + ':Ref' | |||
elif pd.isnull(row.TRIO6) == True: | |||
mendelian = mendelian + ':unVBT' | |||
else: | |||
mendelian = mendelian + ':' + row.TRIO6.split('=')[1] | |||
# not count into family | |||
if (row.LCL5 == './.' or row.LCL5 == '0/0') and (row.LCL6 == './.' or row.LCL6 == '0/0') and (row.LCL7 == './.' or row.LCL7 == '0/0') and (row.LCL8 == './.' or row.LCL8 == '0/0'): | |||
mendelian_count = "no" | |||
else: | |||
mendelian_count = "yes" | |||
outline = row.CHROM + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + row.LCL5 + '\t' + row.LCL6 + '\t' + row.LCL7 + '\t' + row.LCL8 + '\t' + str(row.TRIO5) + '\t' + str(row.TRIO6) + '\t' + str(mendelian) + '\t' + str(mendelian_count) + '\t' + str(sister_count) + '\n' | |||
family_file.write(outline) | |||
if mendelian_count == 'yes': | |||
family_all += 1 | |||
else: | |||
pass | |||
if mendelian == '1:1:1': | |||
family_mendelian += 1 | |||
elif mendelian == 'Ref:1:1': | |||
family_mendelian += 1 | |||
else: | |||
pass | |||
sister = sister_same/(sister_same + sister_diff) | |||
quartet = family_mendelian/family_all | |||
outcolumn = 'Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet\n' | |||
outResult = family + '\t' + str(sister) + '\t' + str(quartet) + '\n' | |||
summary_file.write(outcolumn) | |||
summary_file.write(outResult) | |||
@@ -0,0 +1,109 @@ | |||
# import modules | |||
import numpy as np | |||
import pandas as pd | |||
from sklearn import svm | |||
from sklearn import preprocessing | |||
import sys, argparse, os | |||
from vcf2bed import position_to_bed,padding_region | |||
parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome") | |||
parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True) | |||
parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True) | |||
parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True) | |||
args = parser.parse_args() | |||
# Rename input: | |||
train_input = args.trainDataset | |||
test_input = args.testDataset | |||
sample_name = args.sampleName | |||
# default columns, which will be included in the included in the calssifier | |||
chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY'] | |||
feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af'] | |||
feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af'] | |||
# import datasets sepearate the records with or without BaseQRankSum annotation, etc. | |||
def load_dat(dat_file_name): | |||
dat = pd.read_table(dat_file_name) | |||
dat['DB'] = dat['DB'].fillna(0) | |||
dat = dat[dat['DP'] != 0] | |||
dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP']) | |||
homo_rows = dat[dat['BaseQRankSum'].isnull()] | |||
heter_rows = dat[dat['BaseQRankSum'].notnull()] | |||
return homo_rows,heter_rows | |||
train_homo,train_heter = load_dat(train_input) | |||
test_homo,test_heter = load_dat(test_input) | |||
clf = svm.OneClassSVM(nu=0.05,kernel='rbf', gamma='auto_deprecated',cache_size=500) | |||
def prepare_dat(train_dat,test_dat,feature_cols,chromo): | |||
chr_train = train_dat[train_dat['chromo'] == chromo] | |||
chr_test = test_dat[test_dat['chromo'] == chromo] | |||
train_dat = chr_train.loc[:,feature_cols] | |||
test_dat = chr_test.loc[:,feature_cols] | |||
train_dat_scaled = preprocessing.scale(train_dat) | |||
test_dat_scaled = preprocessing.scale(test_dat) | |||
return chr_test,train_dat_scaled,test_dat_scaled | |||
def oneclass(X_train,X_test,chr_test): | |||
clf.fit(X_train) | |||
y_pred_test = clf.predict(X_test) | |||
test_true_dat = chr_test[y_pred_test == 1] | |||
test_false_dat = chr_test[y_pred_test == -1] | |||
return test_true_dat,test_false_dat | |||
predicted_true = pd.DataFrame(columns=train_homo.columns) | |||
predicted_false = pd.DataFrame(columns=train_homo.columns) | |||
for chromo in chromosome: | |||
# homo datasets | |||
chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo) | |||
test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo) | |||
predicted_true = predicted_true.append(test_true_homo) | |||
predicted_false = predicted_false.append(test_false_homo) | |||
# heter datasets | |||
chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo) | |||
test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter) | |||
predicted_true = predicted_true.append(test_true_heter) | |||
predicted_false = predicted_false.append(test_false_heter) | |||
predicted_true_filename = sample_name + '_predicted_true.txt' | |||
predicted_false_filename = sample_name + '_predicted_false.txt' | |||
predicted_true.to_csv(predicted_true_filename,sep='\t',index=False) | |||
predicted_false.to_csv(predicted_false_filename,sep='\t',index=False) | |||
# output the bed file and padding bed region 50bp | |||
predicted_true_bed_filename = sample_name + '_predicted_true.bed' | |||
predicted_false_bed_filename = sample_name + '_predicted_false.bed' | |||
padding_filename = sample_name + '_padding.bed' | |||
predicted_true_bed = open(predicted_true_bed_filename,'w') | |||
predicted_false_bed = open(predicted_false_bed_filename,'w') | |||
padding = open(padding_filename,'w') | |||
# | |||
for index,row in predicted_false.iterrows(): | |||
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt']) | |||
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n' | |||
predicted_false_bed.write(outline_pos) | |||
chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50) | |||
outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n' | |||
outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n' | |||
padding.write(outline_pad_1) | |||
padding.write(outline_pad_2) | |||
for index,row in predicted_true.iterrows(): | |||
chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt']) | |||
outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n' | |||
predicted_true_bed.write(outline_pos) | |||
@@ -0,0 +1,25 @@ | |||
Type Filter TRUTH.TOTAL TRUTH.TP TRUTH.FN QUERY.TOTAL QUERY.FP QUERY.UNK FP.gt METRIC.Recall METRIC.Precision METRIC.Frac_NA METRIC.F1_Score TRUTH.TOTAL.TiTv_ratio QUERY.TOTAL.TiTv_ratio TRUTH.TOTAL.het_hom_ratio QUERY.TOTAL.het_hom_ratio | |||
INDEL ALL 611469 32000 579469 52892 10380 10361 5049 0.052333 0.755943 0.19589 0.097889 1.60450055781 0.506141804262 | |||
SNP ALL 3607071 270183 3336888 342022 52610 19197 32200 0.074904 0.837032 0.056128 0.137503 2.06510079715 2.31948838383 1.35160768181 0.585277660595 | |||
INDEL ALL 611469 32000 579469 52892 10380 10361 5049 0.052333 0.755943 0.19589 0.097889 1.60450055781 0.506141804262 | |||
SNP ALL 3607071 270183 3336888 342022 52610 19197 32200 0.074904 0.837032 0.056128 0.137503 2.06510079715 2.31948838383 1.35160768181 0.585277660595 | |||
INDEL ALL 611469 32000 579469 52892 10380 10361 5049 0.052333 0.755943 0.19589 0.097889 1.60450055781 0.506141804262 | |||
SNP ALL 3607071 270183 3336888 342022 52610 19197 32200 0.074904 0.837032 0.056128 0.137503 2.06510079715 2.31948838383 1.35160768181 0.585277660595 | |||
INDEL ALL 611469 27751 583718 45833 8555 9382 4372 0.045384 0.765301 0.2047 0.085687 1.60450055781 0.544685459037 | |||
SNP ALL 3607071 229603 3377468 285728 38719 17379 25945 0.063654 0.855714 0.060824 0.118493 2.06510079715 2.2934525799 1.35160768181 0.611069128478 | |||
INDEL ALL 611469 27751 583718 45833 8555 9382 4372 0.045384 0.765301 0.2047 0.085687 1.60450055781 0.544685459037 | |||
SNP ALL 3607071 229603 3377468 285728 38719 17379 25945 0.063654 0.855714 0.060824 0.118493 2.06510079715 2.2934525799 1.35160768181 0.611069128478 | |||
INDEL ALL 611469 27751 583718 45833 8555 9382 4372 0.045384 0.765301 0.2047 0.085687 1.60450055781 0.544685459037 | |||
SNP ALL 3607071 229603 3377468 285728 38719 17379 25945 0.063654 0.855714 0.060824 0.118493 2.06510079715 2.2934525799 1.35160768181 0.611069128478 | |||
INDEL ALL 602890 30755 572135 49871 9073 9927 4713 0.051013 0.772857 0.199054 0.095708 1.45794812615 0.537973508294 | |||
SNP ALL 3586918 261507 3325411 323937 43923 18478 30285 0.072906 0.856207 0.057042 0.13437 2.06425234942 2.30355959336 1.29577693834 0.580073379456 | |||
INDEL ALL 602890 30755 572135 49871 9073 9927 4713 0.051013 0.772857 0.199054 0.095708 1.45794812615 0.537973508294 | |||
SNP ALL 3586918 261507 3325411 323937 43923 18478 30285 0.072906 0.856207 0.057042 0.13437 2.06425234942 2.30355959336 1.29577693834 0.580073379456 | |||
INDEL ALL 602890 30755 572135 49871 9073 9927 4713 0.051013 0.772857 0.199054 0.095708 1.45794812615 0.537973508294 | |||
SNP ALL 3586918 261507 3325411 323937 43923 18478 30285 0.072906 0.856207 0.057042 0.13437 2.06425234942 2.30355959336 1.29577693834 0.580073379456 | |||
INDEL ALL 608169 27666 580503 45885 9139 8953 4193 0.045491 0.752545 0.195118 0.085795 1.50870570513 0.547063202057 | |||
SNP ALL 3604200 234173 3370027 307086 54292 18602 26830 0.064972 0.811802 0.060576 0.120315 2.06546454872 2.33046326017 1.34261182662 0.630876119451 | |||
INDEL ALL 608169 27666 580503 45885 9139 8953 4193 0.045491 0.752545 0.195118 0.085795 1.50870570513 0.547063202057 | |||
SNP ALL 3604200 234173 3370027 307086 54292 18602 26830 0.064972 0.811802 0.060576 0.120315 2.06546454872 2.33046326017 1.34261182662 0.630876119451 | |||
INDEL ALL 608169 27666 580503 45885 9139 8953 4193 0.045491 0.752545 0.195118 0.085795 1.50870570513 0.547063202057 | |||
SNP ALL 3604200 234173 3370027 307086 54292 18602 26830 0.064972 0.811802 0.060576 0.120315 2.06546454872 2.33046326017 1.34261182662 0.630876119451 |
@@ -0,0 +1,144 @@ | |||
# import modules | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style") | |||
parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation', required=True) | |||
parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file', required=True) | |||
args = parser.parse_args() | |||
# Rename input: | |||
inputFile = args.familyVCF | |||
family_name = args.familyName | |||
# output filename | |||
LCL5_name = family_name + '.LCL5.vcf' | |||
LCL5file = open(LCL5_name,'w') | |||
LCL6_name = family_name + '.LCL6.vcf' | |||
LCL6file = open(LCL6_name,'w') | |||
LCL7_name = family_name + '.LCL7.vcf' | |||
LCL7file = open(LCL7_name,'w') | |||
LCL8_name = family_name + '.LCL8.vcf' | |||
LCL8file = open(LCL8_name,'w') | |||
family_filename = family_name + '.vcf' | |||
familyfile = open(family_filename,'w') | |||
# default columns, which will be included in the included in the calssifier | |||
vcfheader = '''##fileformat=VCFv4.2 | |||
##FILTER=<ID=PASS,Description="the same genotype between twin sister and mendelian consistent in 578 and 678"> | |||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||
##FORMAT=<ID=TWINS,Number=0,Type=Flag,Description="0 for sister consistent, 1 for sister inconsistent"> | |||
##FORMAT=<ID=TRIO5,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||
##FORMAT=<ID=TRIO6,Number=0,Type=Flag,Description="0 for trio consistent, 1 for trio inconsistent"> | |||
##contig=<ID=chr1,length=248956422> | |||
##contig=<ID=chr2,length=242193529> | |||
##contig=<ID=chr3,length=198295559> | |||
##contig=<ID=chr4,length=190214555> | |||
##contig=<ID=chr5,length=181538259> | |||
##contig=<ID=chr6,length=170805979> | |||
##contig=<ID=chr7,length=159345973> | |||
##contig=<ID=chr8,length=145138636> | |||
##contig=<ID=chr9,length=138394717> | |||
##contig=<ID=chr10,length=133797422> | |||
##contig=<ID=chr11,length=135086622> | |||
##contig=<ID=chr12,length=133275309> | |||
##contig=<ID=chr13,length=114364328> | |||
##contig=<ID=chr14,length=107043718> | |||
##contig=<ID=chr15,length=101991189> | |||
##contig=<ID=chr16,length=90338345> | |||
##contig=<ID=chr17,length=83257441> | |||
##contig=<ID=chr18,length=80373285> | |||
##contig=<ID=chr19,length=58617616> | |||
##contig=<ID=chr20,length=64444167> | |||
##contig=<ID=chr21,length=46709983> | |||
##contig=<ID=chr22,length=50818468> | |||
##contig=<ID=chrX,length=156040895> | |||
''' | |||
# write VCF | |||
LCL5colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL5'+'\n' | |||
LCL5file.write(vcfheader) | |||
LCL5file.write(LCL5colname) | |||
LCL6colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL6'+'\n' | |||
LCL6file.write(vcfheader) | |||
LCL6file.write(LCL6colname) | |||
LCL7colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL7'+'\n' | |||
LCL7file.write(vcfheader) | |||
LCL7file.write(LCL7colname) | |||
LCL8colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL8'+'\n' | |||
LCL8file.write(vcfheader) | |||
LCL8file.write(LCL8colname) | |||
familycolname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+'LCL5\t'+'LCL6\t'+'LCL7\t'+'LCL8'+'\n' | |||
familyfile.write(vcfheader) | |||
familyfile.write(familycolname) | |||
# reform VCF | |||
def process(oneLine): | |||
line = oneLine.rstrip() | |||
strings = line.strip().split('\t') | |||
# replace . | |||
# LCL6 uniq | |||
if strings[11] == '.': | |||
strings[11] = '0/0' | |||
strings[9] = strings[12] | |||
strings[10] = strings[13] | |||
else: | |||
pass | |||
# LCL5 uniq | |||
if strings[14] == '.': | |||
strings[14] = '0/0' | |||
strings[12] = strings[9] | |||
strings[13] = strings[10] | |||
else: | |||
pass | |||
# sister | |||
if strings[11] == strings[14]: | |||
add_format = ":1" | |||
else: | |||
add_format = ":0" | |||
# trioLCL5 | |||
if strings[15] == 'MD=1': | |||
add_format = add_format + ":1" | |||
else: | |||
add_format = add_format + ":0" | |||
# trioLCL6 | |||
if strings[7] == 'MD=1': | |||
add_format = add_format + ":1" | |||
else: | |||
add_format = add_format + ":0" | |||
# filter | |||
if (strings[11] == strings[14]) and (strings[15] == 'MD=1') and (strings[7] == 'MD=1'): | |||
strings[6] = 'PASS' | |||
else: | |||
strings[6] = '.' | |||
# output LCL5 | |||
LCL5outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format + '\n' | |||
LCL5file.write(LCL5outLine) | |||
# output LCL6 | |||
LCL6outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[11] + add_format + '\n' | |||
LCL6file.write(LCL6outLine) | |||
# output LCL7 | |||
LCL7outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[10] + add_format + '\n' | |||
LCL7file.write(LCL7outLine) | |||
# output LCL8 | |||
LCL8outLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+'.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[9] + add_format + '\n' | |||
LCL8file.write(LCL8outLine) | |||
# output family | |||
familyoutLine = strings[0]+'\t'+strings[1]+'\t'+strings[2]+'\t'+strings[3]+'\t'+strings[4]+'\t'+ '.'+'\t'+strings[6]+'\t'+ '.' +'\t'+ 'GT:TWINS:TRIO5:TRIO6' + '\t' + strings[14] + add_format +'\t' + strings[11] + add_format + '\t' + strings[10] + add_format +'\t' + strings[9] + add_format + '\n' | |||
familyfile.write(familyoutLine) | |||
for line in fileinput.input(inputFile): | |||
m = re.match('^\#',line) | |||
if m is not None: | |||
pass | |||
else: | |||
process(line) | |||
@@ -0,0 +1,227 @@ | |||
from __future__ import division | |||
from glob import glob | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
import pandas as pd | |||
from operator import itemgetter | |||
from collections import Counter | |||
from itertools import islice | |||
from numpy import * | |||
import statistics | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls") | |||
parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file', required=True) | |||
parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True) | |||
args = parser.parse_args() | |||
prefix = args.prefix | |||
vcf = args.vcf | |||
# input files | |||
vcf_dat = pd.read_table(vcf) | |||
# all info | |||
all_file_name = prefix + "_all_summary.txt" | |||
all_sample_outfile = open(all_file_name,'w') | |||
all_info_col = 'CHROM\tPOS\tREF\tALT\tLCL5_consensus_calls\tLCL5_detect_number\tLCL5_same_diff\tLCL6_consensus_calls\tLCL6_detect_number\tLCl6_same_diff\tLCL7_consensus_calls\tLCL7_detect_number\tLCL7_same_diff\tLCL8_consensus_calls\tLCL8_detect_number\tLCL8_same_diff\n' | |||
all_sample_outfile.write(all_info_col) | |||
# filtered info | |||
vcf_header = '''##fileformat=VCFv4.2 | |||
##fileDate=20200501 | |||
##source=high_confidence_calls_intergration(choppy app) | |||
##reference=GRCh38.d1.vd1 | |||
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes"> | |||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||
##contig=<ID=chr1,length=248956422> | |||
##contig=<ID=chr2,length=242193529> | |||
##contig=<ID=chr3,length=198295559> | |||
##contig=<ID=chr4,length=190214555> | |||
##contig=<ID=chr5,length=181538259> | |||
##contig=<ID=chr6,length=170805979> | |||
##contig=<ID=chr7,length=159345973> | |||
##contig=<ID=chr8,length=145138636> | |||
##contig=<ID=chr9,length=138394717> | |||
##contig=<ID=chr10,length=133797422> | |||
##contig=<ID=chr11,length=135086622> | |||
##contig=<ID=chr12,length=133275309> | |||
##contig=<ID=chr13,length=114364328> | |||
##contig=<ID=chr14,length=107043718> | |||
##contig=<ID=chr15,length=101991189> | |||
##contig=<ID=chr16,length=90338345> | |||
##contig=<ID=chr17,length=83257441> | |||
##contig=<ID=chr18,length=80373285> | |||
##contig=<ID=chr19,length=58617616> | |||
##contig=<ID=chr20,length=64444167> | |||
##contig=<ID=chr21,length=46709983> | |||
##contig=<ID=chr22,length=50818468> | |||
##contig=<ID=chrX,length=156040895> | |||
''' | |||
consensus_file_name = prefix + "_consensus.vcf" | |||
consensus_outfile = open(consensus_file_name,'w') | |||
consensus_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5_consensus_call\tLCL6_consensus_call\tLCL7_consensus_call\tLCL8_consensus_call\n' | |||
consensus_outfile.write(vcf_header) | |||
consensus_outfile.write(consensus_col) | |||
# function | |||
def decide_by_rep(vcf_list): | |||
consensus_rep = '' | |||
gt = [x.split(':')[0] for x in vcf_list] | |||
gt_num_dict = Counter(gt) | |||
highest_gt = gt_num_dict.most_common(1) | |||
candidate_gt = highest_gt[0][0] | |||
freq_gt = highest_gt[0][1] | |||
if freq_gt >= 2: | |||
consensus_rep = candidate_gt | |||
else: | |||
consensus_rep = 'inconGT' | |||
return consensus_rep | |||
def consensus_call(vcf_info_list): | |||
consensus_call = '.' | |||
detect_number = '.' | |||
same_diff = '.' | |||
# pcr | |||
SEQ2000 = decide_by_rep(vcf_info_list[0:3]) | |||
XTen_ARD = decide_by_rep(vcf_info_list[18:21]) | |||
XTen_NVG = decide_by_rep(vcf_info_list[21:24]) | |||
XTen_WUX = decide_by_rep(vcf_info_list[24:27]) | |||
Nova_WUX = decide_by_rep(vcf_info_list[15:18]) | |||
pcr_sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX,Nova_WUX] | |||
pcr_sequence_dict = Counter(pcr_sequence_site) | |||
pcr_highest_sequence = pcr_sequence_dict.most_common(1) | |||
pcr_candidate_sequence = pcr_highest_sequence[0][0] | |||
pcr_freq_sequence = pcr_highest_sequence[0][1] | |||
if pcr_freq_sequence > 3: | |||
pcr_consensus = pcr_candidate_sequence | |||
else: | |||
pcr_consensus = 'inconSequenceSite' | |||
# pcr-free | |||
T7_WGE = decide_by_rep(vcf_info_list[3:6]) | |||
Nova_ARD_1 = decide_by_rep(vcf_info_list[6:9]) | |||
Nova_ARD_2 = decide_by_rep(vcf_info_list[9:12]) | |||
Nova_BRG = decide_by_rep(vcf_info_list[12:15]) | |||
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG] | |||
sequence_dict = Counter(sequence_site) | |||
highest_sequence = sequence_dict.most_common(1) | |||
candidate_sequence = highest_sequence[0][0] | |||
freq_sequence = highest_sequence[0][1] | |||
if freq_sequence > 2: | |||
pcr_free_consensus = candidate_sequence | |||
else: | |||
pcr_free_consensus = 'inconSequenceSite' | |||
gt = [x.split(':')[0] for x in vcf_info_list] | |||
gt = [x.replace('./.','.') for x in gt] | |||
detected_num = 27 - gt.count('.') | |||
gt_remain = [e for e in gt if e not in {'.'}] | |||
gt_set = set(gt_remain) | |||
if len(gt_set) == 1: | |||
same_diff = 'same' | |||
else: | |||
same_diff = 'diff' | |||
tag = ['inconGT','inconSequenceSite'] | |||
if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag): | |||
consensus_call = pcr_consensus | |||
elif (pcr_consensus in tag) and (pcr_free_consensus in tag): | |||
consensus_call = 'notAgree' | |||
else: | |||
consensus_call = 'notConsensus' | |||
return consensus_call, detected_num, same_diff | |||
elif (pcr_consensus in tag) and (pcr_free_consensus in tag): | |||
consensus_call = 'filtered' | |||
elif ((pcr_consensus == './.') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != './.')): | |||
consensus_call = 'pcr-free-speicifc' | |||
elif ((pcr_consensus != './.') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == './.')): | |||
consensus_call = 'pcr-speicifc' | |||
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'): | |||
consensus_call = '0/0' | |||
else: | |||
consensus_call = 'filtered' | |||
for row in vcf_dat.itertuples(): | |||
# length | |||
#alt | |||
if ',' in row.ALT: | |||
alt = row.ALT.split(',') | |||
alt_len = [len(i) for i in alt] | |||
alt_max = max(alt_len) | |||
else: | |||
alt_max = len(row.ALT) | |||
#ref | |||
ref_len = len(row.REF) | |||
if (alt_max > 50) or (ref_len > 50): | |||
pass | |||
else: | |||
# consensus | |||
lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \ | |||
row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \ | |||
row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \ | |||
row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \ | |||
row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \ | |||
row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \ | |||
row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216] | |||
lcl6_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530, \ | |||
row.Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111, \ | |||
row.Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930, \ | |||
row.Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917, \ | |||
row.Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403, \ | |||
row.Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329, \ | |||
row.Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216] | |||
lcl7_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530, \ | |||
row.Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111, \ | |||
row.Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930, \ | |||
row.Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917, \ | |||
row.Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403, \ | |||
row.Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329, \ | |||
row.Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216] | |||
lcl8_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530, \ | |||
row.Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111, \ | |||
row.Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930, \ | |||
row.Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917, \ | |||
row.Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403, \ | |||
row.Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329, \ | |||
row.Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216] | |||
# LCL5 | |||
LCL5_consensus_call, LCL5_detected_num, LCL5_same_diff = consensus_call(lcl5_list) | |||
# LCL6 | |||
LCL6_consensus_call, LCL6_detected_num, LCL6_same_diff = consensus_call(lcl6_list) | |||
# LCL7 | |||
LCL7_consensus_call, LCL7_detected_num, LCL7_same_diff = consensus_call(lcl7_list) | |||
# LCL8 | |||
LCL8_consensus_call, LCL8_detected_num, LCL8_same_diff = consensus_call(lcl8_list) | |||
# all data | |||
all_output = row._1 + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + LCL5_consensus_call + '\t' + str(LCL5_detected_num) + '\t' + LCL5_same_diff + '\t' +\ | |||
LCL6_consensus_call + '\t' + str(LCL6_detected_num) + '\t' + LCL6_same_diff + '\t' +\ | |||
LCL7_consensus_call + '\t' + str(LCL7_detected_num) + '\t' + LCL7_same_diff + '\t' +\ | |||
LCL8_consensus_call + '\t' + str(LCL8_detected_num) + '\t' + LCL8_same_diff + '\n' | |||
all_sample_outfile.write(all_output) | |||
#consensus vcf | |||
one_position = [LCL5_consensus_call,LCL6_consensus_call,LCL7_consensus_call,LCL8_consensus_call] | |||
if ('notConsensus' in one_position) or (((len(set(one_position)) == 1) and ('./.' in set(one_position))) or ((len(set(one_position)) == 1) and ('0/0' in set(one_position))) or ((len(set(one_position)) == 2) and ('0/0' in set(one_position) and ('./.' in set(one_position))))): | |||
pass | |||
else: | |||
consensus_output = row._1 + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + row.ALT + '\t' + '.' + '\t' + '.' + '\t' +'.' + '\t' + 'GT' + '\t' + LCL5_consensus_call + '\t' + LCL6_consensus_call + '\t' + LCL7_consensus_call + '\t' + LCL8_consensus_call +'\n' | |||
consensus_outfile.write(consensus_output) | |||
@@ -0,0 +1,13 @@ | |||
Sample SNV number INDEL number SNV query INDEL query SNV TP INDEL TP SNV FP INDEL FP SNV FN INDEL FN SNV precision INDEL precision SNV recall INDEL recall SNV F1 INDEL F1 | |||
Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170531 342022 52892 322825 42531 270215 32151 52610 10380 3336888 579469 0.84 0.76 0.07 0.05 0.14 0.1 | |||
Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170531 342022 52892 322825 42531 270215 32151 52610 10380 3336888 579469 0.84 0.76 0.07 0.05 0.14 0.1 | |||
Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170531 342022 52892 322825 42531 270215 32151 52610 10380 3336888 579469 0.84 0.76 0.07 0.05 0.14 0.1 | |||
Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170531 285728 45833 268349 36451 229630 27896 38719 8555 3377468 583718 0.86 0.77 0.06 0.05 0.12 0.09 | |||
Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170531 285728 45833 268349 36451 229630 27896 38719 8555 3377468 583718 0.86 0.77 0.06 0.05 0.12 0.09 | |||
Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170531 285728 45833 268349 36451 229630 27896 38719 8555 3377468 583718 0.86 0.77 0.06 0.05 0.12 0.09 | |||
Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170531 323937 49871 305459 39944 261536 30871 43923 9073 3325411 572135 0.86 0.77 0.07 0.05 0.13 0.1 | |||
Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170531 323937 49871 305459 39944 261536 30871 43923 9073 3325411 572135 0.86 0.77 0.07 0.05 0.13 0.1 | |||
Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170531 323937 49871 305459 39944 261536 30871 43923 9073 3325411 572135 0.86 0.77 0.07 0.05 0.13 0.1 | |||
Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170531 307086 45885 288484 36932 234192 27793 54292 9139 3370027 580503 0.81 0.75 0.06 0.05 0.12 0.09 | |||
Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170531 307086 45885 288484 36932 234192 27793 54292 9139 3370027 580503 0.81 0.75 0.06 0.05 0.12 0.09 | |||
Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170531 307086 45885 288484 36932 234192 27793 54292 9139 3370027 580503 0.81 0.75 0.06 0.05 0.12 0.09 |
@@ -0,0 +1,36 @@ | |||
import re | |||
def position_to_bed(chromo,pos,ref,alt): | |||
# snv | |||
# Start cooridinate BED = start coordinate VCF - 1 | |||
# End cooridinate BED = start coordinate VCF | |||
if len(ref) == 1 and len(alt) == 1: | |||
StartPos = int(pos) -1 | |||
EndPos = int(pos) | |||
# deletions | |||
# Start cooridinate BED = start coordinate VCF - 1 | |||
# End cooridinate BED = start coordinate VCF + (reference length - alternate length) | |||
elif len(ref) > 1 and len(alt) == 1: | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (len(ref) - 1) | |||
#insertions | |||
# For insertions: | |||
# Start cooridinate BED = start coordinate VCF - 1 | |||
# End cooridinate BED = start coordinate VCF + (alternate length - reference length) | |||
else: | |||
StartPos = int(pos) - 1 | |||
EndPos = int(pos) + (len(alt) - 1) | |||
return chromo,StartPos,EndPos | |||
def padding_region(chromo,pos1,pos2,padding): | |||
StartPos1 = pos1 - padding | |||
EndPos1 = pos1 | |||
StartPos2 = pos2 | |||
EndPos2 = pos2 + padding | |||
return chromo,StartPos1,EndPos1,StartPos2,EndPos2 |
@@ -0,0 +1,295 @@ | |||
from __future__ import division | |||
from glob import glob | |||
import sys, argparse, os | |||
import fileinput | |||
import re | |||
import pandas as pd | |||
from operator import itemgetter | |||
from collections import Counter | |||
from itertools import islice | |||
from numpy import * | |||
import statistics | |||
# input arguments | |||
parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls") | |||
parser.add_argument('-folder', '--folder', type=str, help='directory that holds all the mendelian info', required=True) | |||
parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf', required=True) | |||
args = parser.parse_args() | |||
folder = args.folder | |||
vcf = args.vcf | |||
# input files | |||
folder = folder + '/*.txt' | |||
filenames = glob(folder) | |||
dataframes = [] | |||
for filename in filenames: | |||
dataframes.append(pd.read_table(filename,header=None)) | |||
dfs = [df.set_index([0, 1]) for df in dataframes] | |||
merged_mendelian = pd.concat(dfs, axis=1).reset_index() | |||
family_name = [i.split('/')[-1].replace('.txt','') for i in filenames] | |||
columns = ['CHROM','POS'] + family_name | |||
merged_mendelian.columns = columns | |||
vcf_dat = pd.read_table(vcf) | |||
merged_df = pd.merge(merged_mendelian, vcf_dat, how='outer', left_on=['CHROM','POS'], right_on = ['#CHROM','POS']) | |||
merged_df = merged_df.fillna('nan') | |||
vcf_header = '''##fileformat=VCFv4.2 | |||
##fileDate=20200501 | |||
##source=high_confidence_calls_intergration(choppy app) | |||
##reference=GRCh38.d1.vd1 | |||
##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes"> | |||
##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype"> | |||
##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples"> | |||
##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples"> | |||
##contig=<ID=chr1,length=248956422> | |||
##contig=<ID=chr2,length=242193529> | |||
##contig=<ID=chr3,length=198295559> | |||
##contig=<ID=chr4,length=190214555> | |||
##contig=<ID=chr5,length=181538259> | |||
##contig=<ID=chr6,length=170805979> | |||
##contig=<ID=chr7,length=159345973> | |||
##contig=<ID=chr8,length=145138636> | |||
##contig=<ID=chr9,length=138394717> | |||
##contig=<ID=chr10,length=133797422> | |||
##contig=<ID=chr11,length=135086622> | |||
##contig=<ID=chr12,length=133275309> | |||
##contig=<ID=chr13,length=114364328> | |||
##contig=<ID=chr14,length=107043718> | |||
##contig=<ID=chr15,length=101991189> | |||
##contig=<ID=chr16,length=90338345> | |||
##contig=<ID=chr17,length=83257441> | |||
##contig=<ID=chr18,length=80373285> | |||
##contig=<ID=chr19,length=58617616> | |||
##contig=<ID=chr20,length=64444167> | |||
##contig=<ID=chr21,length=46709983> | |||
##contig=<ID=chr22,length=50818468> | |||
##contig=<ID=chrX,length=156040895> | |||
''' | |||
# output files | |||
benchmark_LCL5 = open('LCL5_voted.vcf','w') | |||
benchmark_LCL6 = open('LCL6_voted.vcf','w') | |||
benchmark_LCL7 = open('LCL7_voted.vcf','w') | |||
benchmark_LCL8 = open('LCL8_voted.vcf','w') | |||
all_sample_outfile = open('all_sample_information.txt','w') | |||
# write VCF | |||
LCL5_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5_benchmark_calls\n' | |||
LCL6_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL6_benchmark_calls\n' | |||
LCL7_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL7_benchmark_calls\n' | |||
LCL8_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL8_benchmark_calls\n' | |||
benchmark_LCL5.write(vcf_header) | |||
benchmark_LCL5.write(LCL5_col) | |||
benchmark_LCL6.write(vcf_header) | |||
benchmark_LCL6.write(LCL6_col) | |||
benchmark_LCL7.write(vcf_header) | |||
benchmark_LCL7.write(LCL7_col) | |||
benchmark_LCL8.write(vcf_header) | |||
benchmark_LCL8.write(LCL8_col) | |||
# all info | |||
all_info_col = 'CHROM\tPOS\tLCL5_pcr_consensus\tLCL5_pcr_free_consensus\tLCL5_mendelian_num\tLCL5_consensus_call\tLCL5_consensus_alt_seq\tLCL5_alt\tLCL5_dp\tLCL5_detected_num\tLCL6_pcr_consensus\tLCL6_pcr_free_consensus\tLCL6_mendelian_num\tLCL6_consensus_call\tLCL6_consensus_alt_seq\tLCL6_alt\tLCL6_dp\tLCL6_detected_num\tLCL7_pcr_consensus\tLCL7_pcr_free_consensus\tLCL7_mendelian_num\t LCL7_consensus_call\tLCL7_consensus_alt_seq\tLCL7_alt\tLCL7_dp\tLCL7_detected_num\tLCL8_pcr_consensus\tLCL8_pcr_free_consensus\tLCL8_mendelian_num\tLCL8_consensus_call\tLCL8_consensus_alt_seq\tLCL8_alt\tLCL8_dp\tLCL8_detected_num\n' | |||
all_sample_outfile.write(all_info_col) | |||
# function | |||
def decide_by_rep(vcf_list,mendelian_list): | |||
consensus_rep = '' | |||
gt = [x.split(':')[0] for x in vcf_list] | |||
# mendelian consistent? | |||
mendelian_dict = Counter(mendelian_list) | |||
highest_mendelian = mendelian_dict.most_common(1) | |||
candidate_mendelian = highest_mendelian[0][0] | |||
freq_mendelian = highest_mendelian[0][1] | |||
if (candidate_mendelian == '1:1:1') and (freq_mendelian >= 2): | |||
con_loc = [i for i in range(len(mendelian_list)) if mendelian_list[i] == '1:1:1'] | |||
gt_con = itemgetter(*con_loc)(gt) | |||
gt_num_dict = Counter(gt_con) | |||
highest_gt = gt_num_dict.most_common(1) | |||
candidate_gt = highest_gt[0][0] | |||
freq_gt = highest_gt[0][1] | |||
if (candidate_gt != './.') and (freq_gt >= 2): | |||
consensus_rep = candidate_gt | |||
elif (candidate_gt == './.') and (freq_gt >= 2): | |||
consensus_rep = 'noGTInfo' | |||
else: | |||
consensus_rep = 'inconGT' | |||
elif (candidate_mendelian == 'nan') and (freq_mendelian >= 2): | |||
consensus_rep = 'noMenInfo' | |||
else: | |||
consensus_rep = 'inconMen' | |||
return consensus_rep | |||
def consensus_call(vcf_info_list,mendelian_list,alt_seq): | |||
pcr_consensus = '.' | |||
pcr_free_consensus = '.' | |||
mendelian_num = '.' | |||
consensus_call = '.' | |||
consensus_alt_seq = '.' | |||
# pcr | |||
SEQ2000 = decide_by_rep(vcf_info_list[0:3],mendelian_list[0:3]) | |||
XTen_ARD = decide_by_rep(vcf_info_list[18:21],mendelian_list[18:21]) | |||
XTen_NVG = decide_by_rep(vcf_info_list[21:24],mendelian_list[21:24]) | |||
XTen_WUX = decide_by_rep(vcf_info_list[24:27],mendelian_list[24:27]) | |||
pcr_sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX] | |||
pcr_sequence_dict = Counter(pcr_sequence_site) | |||
pcr_highest_sequence = pcr_sequence_dict.most_common(1) | |||
pcr_candidate_sequence = pcr_highest_sequence[0][0] | |||
pcr_freq_sequence = pcr_highest_sequence[0][1] | |||
if pcr_freq_sequence > 2: | |||
pcr_consensus = pcr_candidate_sequence | |||
else: | |||
pcr_consensus = 'inconSequenceSite' | |||
# pcr-free | |||
T7_WGE = decide_by_rep(vcf_info_list[3:6],mendelian_list[3:6]) | |||
Nova_ARD_1 = decide_by_rep(vcf_info_list[6:9],mendelian_list[6:9]) | |||
Nova_ARD_2 = decide_by_rep(vcf_info_list[9:12],mendelian_list[9:12]) | |||
Nova_BRG = decide_by_rep(vcf_info_list[12:15],mendelian_list[12:15]) | |||
Nova_WUX = decide_by_rep(vcf_info_list[15:18],mendelian_list[15:18]) | |||
sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX] | |||
sequence_dict = Counter(sequence_site) | |||
highest_sequence = sequence_dict.most_common(1) | |||
candidate_sequence = highest_sequence[0][0] | |||
freq_sequence = highest_sequence[0][1] | |||
if freq_sequence > 3: | |||
pcr_free_consensus = candidate_sequence | |||
else: | |||
pcr_free_consensus = 'inconSequenceSite' | |||
# net alt, dp | |||
# alt | |||
AD = [x.split(':')[1] for x in vcf_info_list] | |||
ALT = [x.split(',')[1] for x in AD] | |||
ALT = [int(x) for x in ALT] | |||
ALL_ALT = sum(ALT) | |||
# dp | |||
DP = [x.split(':')[2] for x in vcf_info_list] | |||
DP = [int(x) for x in DP] | |||
ALL_DP = sum(DP) | |||
# detected number | |||
gt = [x.split(':')[0] for x in vcf_info_list] | |||
gt = [x.replace('0/0','.') for x in gt] | |||
gt = [x.replace('./.','.') for x in gt] | |||
detected_num = 27 - gt.count('.') | |||
# decide consensus calls | |||
tag = ['inconGT','noMenInfo','inconMen','inconSequenceSite','noGTInfo'] | |||
if (pcr_consensus != '0/0') and (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag): | |||
consensus_call = pcr_consensus | |||
gt = [x.split(':')[0] for x in vcf_info_list] | |||
indices = [i for i, x in enumerate(gt) if x == consensus_call] | |||
matched_mendelian = itemgetter(*indices)(mendelian_list) | |||
mendelian_num = matched_mendelian.count('1:1:1') | |||
# Delete multiple alternative genotype to necessary expression | |||
alt_gt = alt_seq.split(',') | |||
if len(alt_gt) > 1: | |||
allele1 = consensus_call.split('/')[0] | |||
allele2 = consensus_call.split('/')[1] | |||
if allele1 == '0': | |||
allele2_seq = alt_gt[int(allele2) - 1] | |||
consensus_alt_seq = allele2_seq | |||
consensus_call = '0/1' | |||
else: | |||
allele1_seq = alt_gt[int(allele1) - 1] | |||
allele2_seq = alt_gt[int(allele2) - 1] | |||
if int(allele1) > int(allele2): | |||
consensus_alt_seq = allele2_seq + ',' + allele1_seq | |||
consensus_call = '1/2' | |||
elif int(allele1) < int(allele2): | |||
consensus_alt_seq = allele1_seq + ',' + allele2_seq | |||
consensus_call = '1/2' | |||
else: | |||
consensus_alt_seq = allele1_seq | |||
consensus_call = '1/1' | |||
else: | |||
consensus_alt_seq = alt_seq | |||
elif (pcr_consensus in tag) and (pcr_free_consensus in tag): | |||
consensus_call = 'filtered' | |||
elif ((pcr_consensus == './.') or (pcr_consensus in tag)) and ((pcr_free_consensus not in tag) and (pcr_free_consensus != './.')): | |||
consensus_call = 'pcr-free-speicifc' | |||
elif ((pcr_consensus != './.') or (pcr_consensus not in tag)) and ((pcr_free_consensus in tag) and (pcr_free_consensus == './.')): | |||
consensus_call = 'pcr-speicifc' | |||
elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'): | |||
consensus_call = '0/0' | |||
else: | |||
consensus_call = 'filtered' | |||
return pcr_consensus, pcr_free_consensus, mendelian_num, consensus_call, consensus_alt_seq, ALL_ALT, ALL_DP, detected_num | |||
for row in merged_df.itertuples(): | |||
mendelian_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_3_20180530, \ | |||
row.Quartet_DNA_BGI_T7_WGE_1_20191105,row.Quartet_DNA_BGI_T7_WGE_2_20191105,row.Quartet_DNA_BGI_T7_WGE_3_20191105, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_3_20181108, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_6_20190111, \ | |||
row.Quartet_DNA_ILM_Nova_BRG_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_3_20180930, \ | |||
row.Quartet_DNA_ILM_Nova_WUX_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_3_20190917, \ | |||
row.Quartet_DNA_ILM_XTen_ARD_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_3_20170403, \ | |||
row.Quartet_DNA_ILM_XTen_NVG_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_3_20170329, \ | |||
row.Quartet_DNA_ILM_XTen_WUX_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_3_20170216] | |||
lcl5_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530, \ | |||
row.Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111, \ | |||
row.Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930, \ | |||
row.Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917, \ | |||
row.Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403, \ | |||
row.Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329, \ | |||
row.Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216] | |||
lcl6_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL6_3_20180530, \ | |||
row.Quartet_DNA_BGI_T7_WGE_LCL6_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL6_3_20191105, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL6_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL6_3_20181108, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL6_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL6_6_20190111, \ | |||
row.Quartet_DNA_ILM_Nova_BRG_LCL6_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL6_3_20180930, \ | |||
row.Quartet_DNA_ILM_Nova_WUX_LCL6_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL6_3_20190917, \ | |||
row.Quartet_DNA_ILM_XTen_ARD_LCL6_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL6_3_20170403, \ | |||
row.Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170329, \ | |||
row.Quartet_DNA_ILM_XTen_WUX_LCL6_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL6_3_20170216] | |||
lcl7_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL7_3_20180530, \ | |||
row.Quartet_DNA_BGI_T7_WGE_LCL7_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL7_3_20191105, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL7_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL7_3_20181108, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL7_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL7_6_20190111, \ | |||
row.Quartet_DNA_ILM_Nova_BRG_LCL7_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL7_3_20180930, \ | |||
row.Quartet_DNA_ILM_Nova_WUX_LCL7_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL7_3_20190917, \ | |||
row.Quartet_DNA_ILM_XTen_ARD_LCL7_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL7_3_20170403, \ | |||
row.Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170329, \ | |||
row.Quartet_DNA_ILM_XTen_WUX_LCL7_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL7_3_20170216] | |||
lcl8_list = [row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_1_20180518,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_2_20180530,row.Quartet_DNA_BGI_SEQ2000_BGI_LCL8_3_20180530, \ | |||
row.Quartet_DNA_BGI_T7_WGE_LCL8_1_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_2_20191105,row.Quartet_DNA_BGI_T7_WGE_LCL8_3_20191105, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL8_1_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_2_20181108,row.Quartet_DNA_ILM_Nova_ARD_LCL8_3_20181108, \ | |||
row.Quartet_DNA_ILM_Nova_ARD_LCL8_4_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_5_20190111,row.Quartet_DNA_ILM_Nova_ARD_LCL8_6_20190111, \ | |||
row.Quartet_DNA_ILM_Nova_BRG_LCL8_1_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_2_20180930,row.Quartet_DNA_ILM_Nova_BRG_LCL8_3_20180930, \ | |||
row.Quartet_DNA_ILM_Nova_WUX_LCL8_1_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_2_20190917,row.Quartet_DNA_ILM_Nova_WUX_LCL8_3_20190917, \ | |||
row.Quartet_DNA_ILM_XTen_ARD_LCL8_1_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_2_20170403,row.Quartet_DNA_ILM_XTen_ARD_LCL8_3_20170403, \ | |||
row.Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170329,row.Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170329, \ | |||
row.Quartet_DNA_ILM_XTen_WUX_LCL8_1_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_2_20170216,row.Quartet_DNA_ILM_XTen_WUX_LCL8_3_20170216] | |||
# LCL5 | |||
LCL5_pcr_consensus, LCL5_pcr_free_consensus, LCL5_mendelian_num, LCL5_consensus_call, LCL5_consensus_alt_seq, LCL5_alt, LCL5_dp, LCL5_detected_num = consensus_call(lcl5_list,mendelian_list,row.ALT) | |||
if LCL5_mendelian_num != '.': | |||
LCL5_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL5_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL5_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL5_consensus_call + ':' + str(LCL5_alt) + ':' + str(LCL5_dp) + '\n' | |||
benchmark_LCL5.write(LCL5_output) | |||
# LCL6 | |||
LCL6_pcr_consensus, LCL6_pcr_free_consensus, LCL6_mendelian_num, LCL6_consensus_call, LCL6_consensus_alt_seq, LCL6_alt, LCL6_dp, LCL6_detected_num = consensus_call(lcl6_list,mendelian_list,row.ALT) | |||
if LCL6_mendelian_num != '.': | |||
LCL6_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL6_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL6_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL6_consensus_call + ':' + str(LCL6_alt) + ':' + str(LCL6_dp) + '\n' | |||
benchmark_LCL6.write(LCL6_output) | |||
# LCL7 | |||
LCL7_pcr_consensus, LCL7_pcr_free_consensus, LCL7_mendelian_num, LCL7_consensus_call, LCL7_consensus_alt_seq, LCL7_alt, LCL7_dp, LCL7_detected_num = consensus_call(lcl7_list,mendelian_list,row.ALT) | |||
if LCL7_mendelian_num != '.': | |||
LCL7_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL7_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL7_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL7_consensus_call + ':' + str(LCL7_alt) + ':' + str(LCL7_dp) + '\n' | |||
benchmark_LCL7.write(LCL7_output) | |||
# LCL8 | |||
LCL8_pcr_consensus, LCL8_pcr_free_consensus, LCL8_mendelian_num, LCL8_consensus_call, LCL8_consensus_alt_seq, LCL8_alt, LCL8_dp, LCL8_detected_num = consensus_call(lcl8_list,mendelian_list,row.ALT) | |||
if LCL8_mendelian_num != '.': | |||
LCL8_output = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + LCL8_consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(LCL8_mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + LCL8_consensus_call + ':' + str(LCL8_alt) + ':' + str(LCL8_dp) + '\n' | |||
benchmark_LCL8.write(LCL8_output) | |||
# all data | |||
all_output = row.CHROM + '\t' + str(row.POS) + '\t' + LCL5_pcr_consensus + '\t' + LCL5_pcr_free_consensus + '\t' + str(LCL5_mendelian_num) + '\t' + LCL5_consensus_call + '\t' + LCL5_consensus_alt_seq + '\t' + str(LCL5_alt) + '\t' + str(LCL5_dp) + '\t' + str(LCL5_detected_num) + '\t' +\ | |||
LCL6_pcr_consensus + '\t' + LCL6_pcr_free_consensus + '\t' + str(LCL6_mendelian_num) + '\t' + LCL6_consensus_call + '\t' + LCL6_consensus_alt_seq + '\t' + str(LCL6_alt) + '\t' + str(LCL6_dp) + '\t' + str(LCL6_detected_num) + '\t' +\ | |||
LCL7_pcr_consensus + '\t' + LCL7_pcr_free_consensus + '\t' + str(LCL7_mendelian_num) + '\t' + LCL7_consensus_call + '\t' + LCL7_consensus_alt_seq + '\t' + str(LCL7_alt) + '\t' + str(LCL7_dp) + '\t' + str(LCL7_detected_num) + '\t' +\ | |||
LCL8_pcr_consensus + '\t' + LCL8_pcr_free_consensus + '\t' + str(LCL8_mendelian_num) + '\t' + LCL8_consensus_call + '\t' + LCL8_consensus_alt_seq + '\t' + str(LCL8_alt) + '\t' + str(LCL8_dp) + '\t' + str(LCL8_detected_num) + '\n' | |||
all_sample_outfile.write(all_output) | |||
@@ -0,0 +1,24 @@ | |||
{ | |||
"{{ project_name }}.benchmarking_dir": "oss://pgx-result/renluyao/manuscript_v3.0/reference_datasets_v202103/", | |||
"{{ project_name }}.fasta": "GRCh38.d1.vd1.fa", | |||
"{{ project_name }}.BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest", | |||
"{{ project_name }}.LCL6_1": "{{ LCL6_1 }}", | |||
"{{ project_name }}.LCL5_3": "{{ LCL5_3 }}", | |||
"{{ project_name }}.LCL8_2": "{{ LCL8_2 }}", | |||
"{{ project_name }}.disk_size": "500", | |||
"{{ project_name }}.LCL8_1": "{{ LCL8_1 }}", | |||
"{{ project_name }}.LCL6_3": "{{ LCL6_3 }}", | |||
"{{ project_name }}.project": "{{ project }}", | |||
"{{ project_name }}.LCL7_3": "{{ LCL7_3 }}", | |||
"{{ project_name }}.LCL5_1": "{{ LCL5_1 }}", | |||
"{{ project_name }}.SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc", | |||
"{{ project_name }}.LCL6_2": "{{ LCL6_2 }}", | |||
"{{ project_name }}.BIGcluster_config": "OnDemand bcs.ps.g.2xlarge img-ubuntu-vpc", | |||
"{{ project_name }}.LCL7_2": "{{ LCL7_2 }}", | |||
"{{ project_name }}.LCL5_2": "{{ LCL5_2 }}", | |||
"{{ project_name }}.LCL7_1": "{{ LCL7_1 }}", | |||
"{{ project_name }}.MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1", | |||
"{{ project_name }}.LCL8_3": "{{ LCL8_3 }}", | |||
"{{ project_name }}.DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4", | |||
"{{ project_name }}.ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/" | |||
} |
@@ -0,0 +1,81 @@ | |||
task benchmark { | |||
File vcf | |||
File benchmarking_dir | |||
File ref_dir | |||
String sample = basename(vcf,".vcf") | |||
String fasta | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
set -o pipefail | |||
set -e | |||
nt=$(nproc) | |||
mkdir -p /cromwell_root/tmp | |||
cp -r ${ref_dir} /cromwell_root/tmp/ | |||
cp -r ${benchmarking_dir} /cromwell_root/tmp/ | |||
export HGREF=/cromwell_root/tmp/reference_data/GRCh38.d1.vd1.fa | |||
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5" > LCL5_name | |||
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL6" > LCL6_name | |||
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL7" > LCL7_name | |||
echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL8" > LCL8_name | |||
if [[ ${sample} =~ "LCL5" ]];then | |||
/opt/hap.py/bin/hap.py /cromwell_root/tmp/reference_datasets_v202103/LCL5.high.confidence.calls.vcf ${vcf} -f /cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta} | |||
cat ${vcf} | grep '##' > header | |||
cat ${vcf} | grep -v '#' > body | |||
cat header LCL5_name body > LCL5.vcf | |||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL5.vcf -c > ${sample}.reformed.vcf.gz | |||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.reformed.vcf.gz | |||
elif [[ ${sample} =~ "LCL6" ]]; then | |||
/opt/hap.py/bin/hap.py /cromwell_root/tmp/reference_datasets_v202103/LCL6.high.confidence.calls.vcf ${vcf} -f /cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta} | |||
cat ${vcf} | grep '##' > header | |||
cat ${vcf} | grep -v '#' > body | |||
cat header LCL6_name body > LCL6.vcf | |||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL6.vcf -c > ${sample}.reformed.vcf.gz | |||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.reformed.vcf.gz | |||
elif [[ ${sample} =~ "LCL7" ]]; then | |||
/opt/hap.py/bin/hap.py /cromwell_root/tmp/reference_datasets_v202103/LCL7.high.confidence.calls.vcf ${vcf} -f /cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta} | |||
cat ${vcf} | grep '##' > header | |||
cat ${vcf} | grep -v '#' > body | |||
cat header LCL7_name body > LCL7.vcf | |||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL7.vcf -c > ${sample}.reformed.vcf.gz | |||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.reformed.vcf.gz | |||
elif [[ ${sample} =~ "LCL8" ]]; then | |||
/opt/hap.py/bin/hap.py /cromwell_root/tmp/reference_datasets_v202103/LCL8.high.confidence.calls.vcf ${vcf} -f /cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta} | |||
cat ${vcf} | grep '##' > header | |||
cat ${vcf} | grep -v '#' > body | |||
cat header LCL8_name body > LCL8.vcf | |||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL8.vcf -c > ${sample}.reformed.vcf.gz | |||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.reformed.vcf.gz | |||
else | |||
echo "only for quartet samples" | |||
fi | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster:cluster_config | |||
systemDisk:"cloud_ssd 40" | |||
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File rtg_vcf = "${sample}.reformed.vcf.gz" | |||
File rtg_vcf_index = "${sample}.reformed.vcf.gz.tbi" | |||
File gzip_vcf = "${sample}.vcf.gz" | |||
File gzip_vcf_index = "${sample}.vcf.gz.tbi" | |||
File roc_all_csv = "${sample}.roc.all.csv.gz" | |||
File roc_indel = "${sample}.roc.Locations.INDEL.csv.gz" | |||
File roc_indel_pass = "${sample}.roc.Locations.INDEL.PASS.csv.gz" | |||
File roc_snp = "${sample}.roc.Locations.SNP.csv.gz" | |||
File roc_snp_pass = "${sample}.roc.Locations.SNP.PASS.csv.gz" | |||
File summary = "${sample}.summary.csv" | |||
File extended = "${sample}.extended.csv" | |||
File metrics = "${sample}.metrics.json.gz" | |||
} | |||
} |
@@ -0,0 +1,46 @@ | |||
task mendelian { | |||
File family_vcf | |||
File ref_dir | |||
String family_name = basename(family_vcf,".family.vcf") | |||
String fasta | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
export LD_LIBRARY_PATH=/opt/htslib-1.9 | |||
nt=$(nproc) | |||
echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${family_name}.D5.ped | |||
mkdir VBT_D5 | |||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D5.ped -outDir VBT_D5 -out-prefix ${family_name}.D5 --output-violation-regions -thread-count $nt | |||
cat VBT_D5/${family_name}.D5_trio.vcf > ${family_name}.D5.vcf | |||
echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${family_name}.D6.ped | |||
mkdir VBT_D6 | |||
/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D6.ped -outDir VBT_D6 -out-prefix ${family_name}.D6 --output-violation-regions -thread-count $nt | |||
cat VBT_D6/${family_name}.D6_trio.vcf > ${family_name}.D6.vcf | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File D5_ped = "${family_name}.D5.ped" | |||
File D6_ped = "${family_name}.D6.ped" | |||
Array[File] D5_mendelian = glob("VBT_D5/*") | |||
Array[File] D6_mendelian = glob("VBT_D6/*") | |||
File D5_trio_vcf = "${family_name}.D5.vcf" | |||
File D6_trio_vcf = "${family_name}.D6.vcf" | |||
} | |||
} | |||
@@ -0,0 +1,35 @@ | |||
task merge_family { | |||
File LCL5_vcf_gz | |||
File LCL5_vcf_idx | |||
File LCL6_vcf_gz | |||
File LCL6_vcf_idx | |||
File LCL7_vcf_gz | |||
File LCL7_vcf_idx | |||
File LCL8_vcf_gz | |||
File LCL8_vcf_idx | |||
String project | |||
String rep | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfmerge --force-merge-all -o ${project}_${rep}.family.vcf.gz ${LCL5_vcf_gz} ${LCL6_vcf_gz} ${LCL7_vcf_gz} ${LCL8_vcf_gz} | |||
gunzip ${project}_${rep}.family.vcf.gz | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File merged_vcf = "${project}_${rep}.family.vcf" | |||
} | |||
} | |||
@@ -0,0 +1,35 @@ | |||
task merge_mendelian { | |||
File D5_trio_vcf | |||
File D6_trio_vcf | |||
File family_vcf | |||
String family_name = basename(family_vcf,".family.vcf") | |||
String docker | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
cat ${D5_trio_vcf} | grep -v '##' > ${family_name}.D5.txt | |||
cat ${D6_trio_vcf} | grep -v '##' > ${family_name}.D6.txt | |||
cat ${family_vcf} | grep -v '##' | awk ' | |||
BEGIN { OFS = "\t" } | |||
NF > 2 && FNR > 1 { | |||
for ( i=9; i<=NF; i++ ) { | |||
split($i,a,":") ;$i = a[1]; | |||
} | |||
} | |||
{ print } | |||
' > ${family_name}.consensus.txt | |||
python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.txt -LCL6 ${family_name}.D6.txt -genotype ${family_name}.consensus.txt -family ${family_name} | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster: cluster_config | |||
systemDisk: "cloud_ssd 40" | |||
dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File project_mendelian = "${family_name}.txt" | |||
File project_mendelian_summary = "${family_name}.summary.txt" | |||
} | |||
} |
@@ -0,0 +1,62 @@ | |||
task quartet_mendelian { | |||
File summary_1 | |||
File summary_2 | |||
File summary_3 | |||
File LCL5_hap_1 | |||
File LCL5_hap_2 | |||
File LCL5_hap_3 | |||
File LCL6_hap_1 | |||
File LCL6_hap_2 | |||
File LCL6_hap_3 | |||
File LCL7_hap_1 | |||
File LCL7_hap_2 | |||
File LCL7_hap_3 | |||
File LCL8_hap_1 | |||
File LCL8_hap_2 | |||
File LCL8_hap_3 | |||
String docker | |||
String project | |||
String cluster_config | |||
String disk_size | |||
command <<< | |||
cat ${summary_1} ${summary_2} ${summary_3} | grep -v 'Family'> mendelian.summary | |||
sed '1iFamily\tTotal_Variants\tMendelian_Concordant_Variants\tMendelian_Concordance_Rate' mendelian.summary > mendelian.txt | |||
cat mendelian.txt | grep 'INDEL' | cut -f4 | grep -v 'Mendelian_Concordance_Rate' | awk '{for(i=1;i<=NF;i++) {sum[i] += $i; sumsq[i] += ($i)^2}} | |||
END {for (i=1;i<=NF;i++) { | |||
printf "%f %f \n", sum[i]/NR, sqrt((sumsq[i]-sum[i]^2/NR)/NR)} | |||
}' >> quartet_indel_aver-std.txt | |||
cat mendelian.txt | grep 'SNV' | cut -f4 | grep -v 'Mendelian_Concordance_Rate' | awk '{for(i=1;i<=NF;i++) {sum[i] += $i; sumsq[i] += ($i)^2}} | |||
END {for (i=1;i<=NF;i++) { | |||
printf "%f %f \n", sum[i]/NR, sqrt((sumsq[i]-sum[i]^2/NR)/NR)} | |||
}' >> quartet_snv_aver-std.txt | |||
cat ${LCL5_hap_1} ${LCL5_hap_2} ${LCL5_hap_3} ${LCL6_hap_1} ${LCL6_hap_2} ${LCL6_hap_3} ${LCL7_hap_1} ${LCL7_hap_2} ${LCL7_hap_3} ${LCL8_hap_1} ${LCL8_hap_2} ${LCL8_hap_3} | grep ALL | sed s'/,/\t/g' > hap.summary | |||
sed '1i\Type\tFilter\tTRUTH.TOTAL\tTRUTH.TP\tTRUTH.FN\tQUERY.TOTAL\tQUERY.FP\tQUERY.UNK\tFP.gt\tMETRIC.Recall\tMETRIC.Precision\tMETRIC.Frac_NA\tMETRIC.F1_Score\tTRUTH.TOTAL.TiTv_ratio\tQUERY.TOTAL.TiTv_ratio\tTRUTH.TOTAL.het_hom_ratio\tQUERY.TOTAL.het_hom_ratio' hap.summary > precision_recall | |||
python /opt/hap_summary.py -hap precision_recall -name ${project} | |||
cat variants.calling.qc.txt | cut -f12- | grep -v 'SNV' | awk '{for(i=1;i<=NF;i++) {sum[i] += $i; sumsq[i] += ($i)^2}} | |||
END {for (i=1;i<=NF;i++) { | |||
printf "%f %f \n", sum[i]/NR, sqrt((sumsq[i]-sum[i]^2/NR)/NR)} | |||
}' >> reference_datasets_aver-std.txt | |||
>>> | |||
runtime { | |||
docker:docker | |||
cluster:cluster_config | |||
systemDisk:"cloud_ssd 40" | |||
dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/" | |||
} | |||
output { | |||
File mendelian_summary = "mendelian.txt" | |||
File snv_aver_std = "quartet_snv_aver-std.txt" | |||
File indel_aver_std = "quartet_indel_aver-std.txt" | |||
File pr = "precision_recall" | |||
File hap_summary = "variants.calling.qc.txt" | |||
File precision_recall_aver_std = "reference_datasets_aver-std.txt" | |||
} | |||
} |
@@ -0,0 +1,310 @@ | |||
import "./tasks/benchmark.wdl" as benchmark | |||
import "./tasks/mendelian.wdl" as mendelian | |||
import "./tasks/merge_mendelian.wdl" as merge_mendelian | |||
import "./tasks/merge_family.wdl" as merge_family | |||
import "./tasks/quartet_mendelian.wdl" as quartet_mendelian | |||
workflow {{ project_name }} { | |||
File LCL5_1 | |||
File LCL6_1 | |||
File LCL7_1 | |||
File LCL8_1 | |||
File LCL5_2 | |||
File LCL6_2 | |||
File LCL7_2 | |||
File LCL8_2 | |||
File LCL5_3 | |||
File LCL6_3 | |||
File LCL7_3 | |||
File LCL8_3 | |||
String BENCHMARKdocker | |||
String MENDELIANdocker | |||
String DIYdocker | |||
String fasta | |||
File ref_dir | |||
File benchmarking_dir | |||
String project | |||
String disk_size | |||
String BIGcluster_config | |||
String SMALLcluster_config | |||
call benchmark.benchmark as LCL5_1_benchmark { | |||
input: | |||
vcf=LCL5_1, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call benchmark.benchmark as LCL5_2_benchmark { | |||
input: | |||
vcf=LCL5_2, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call benchmark.benchmark as LCL5_3_benchmark { | |||
input: | |||
vcf=LCL5_3, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call benchmark.benchmark as LCL6_1_benchmark { | |||
input: | |||
vcf=LCL6_1, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call benchmark.benchmark as LCL6_2_benchmark { | |||
input: | |||
vcf=LCL6_2, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call benchmark.benchmark as LCL6_3_benchmark { | |||
input: | |||
vcf=LCL6_3, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call benchmark.benchmark as LCL7_1_benchmark { | |||
input: | |||
vcf=LCL7_1, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call benchmark.benchmark as LCL7_2_benchmark { | |||
input: | |||
vcf=LCL7_2, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call benchmark.benchmark as LCL7_3_benchmark { | |||
input: | |||
vcf=LCL7_3, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call benchmark.benchmark as LCL8_1_benchmark { | |||
input: | |||
vcf=LCL8_1, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call benchmark.benchmark as LCL8_2_benchmark { | |||
input: | |||
vcf=LCL8_2, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call benchmark.benchmark as LCL8_3_benchmark { | |||
input: | |||
vcf=LCL8_3, | |||
benchmarking_dir=benchmarking_dir, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call merge_family.merge_family as merge_family_1 { | |||
input: | |||
LCL5_vcf_gz=LCL5_1_benchmark.rtg_vcf, | |||
LCL5_vcf_idx=LCL5_1_benchmark.rtg_vcf_index, | |||
LCL6_vcf_gz=LCL6_1_benchmark.rtg_vcf, | |||
LCL6_vcf_idx=LCL6_1_benchmark.rtg_vcf_index, | |||
LCL7_vcf_gz=LCL7_1_benchmark.rtg_vcf, | |||
LCL7_vcf_idx=LCL7_1_benchmark.rtg_vcf_index, | |||
LCL8_vcf_gz=LCL8_1_benchmark.rtg_vcf, | |||
LCL8_vcf_idx=LCL8_1_benchmark.rtg_vcf, | |||
project=project, | |||
rep="1", | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call merge_family.merge_family as merge_family_2 { | |||
input: | |||
LCL5_vcf_gz=LCL5_2_benchmark.rtg_vcf, | |||
LCL5_vcf_idx=LCL5_2_benchmark.rtg_vcf_index, | |||
LCL6_vcf_gz=LCL6_2_benchmark.rtg_vcf, | |||
LCL6_vcf_idx=LCL6_2_benchmark.rtg_vcf_index, | |||
LCL7_vcf_gz=LCL7_2_benchmark.rtg_vcf, | |||
LCL7_vcf_idx=LCL7_2_benchmark.rtg_vcf_index, | |||
LCL8_vcf_gz=LCL8_2_benchmark.rtg_vcf, | |||
LCL8_vcf_idx=LCL8_2_benchmark.rtg_vcf, | |||
project=project, | |||
rep="2", | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call merge_family.merge_family as merge_family_3 { | |||
input: | |||
LCL5_vcf_gz=LCL5_3_benchmark.rtg_vcf, | |||
LCL5_vcf_idx=LCL5_3_benchmark.rtg_vcf_index, | |||
LCL6_vcf_gz=LCL6_3_benchmark.rtg_vcf, | |||
LCL6_vcf_idx=LCL6_3_benchmark.rtg_vcf_index, | |||
LCL7_vcf_gz=LCL7_3_benchmark.rtg_vcf, | |||
LCL7_vcf_idx=LCL7_3_benchmark.rtg_vcf_index, | |||
LCL8_vcf_gz=LCL8_3_benchmark.rtg_vcf, | |||
LCL8_vcf_idx=LCL8_3_benchmark.rtg_vcf, | |||
project=project, | |||
rep="3", | |||
docker=BENCHMARKdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size, | |||
} | |||
call mendelian.mendelian as mendelian_1 { | |||
input: | |||
family_vcf=merge_family_1.merged_vcf, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=MENDELIANdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size | |||
} | |||
call mendelian.mendelian as mendelian_2 { | |||
input: | |||
family_vcf=merge_family_2.merged_vcf, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=MENDELIANdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size | |||
} | |||
call mendelian.mendelian as mendelian_3 { | |||
input: | |||
family_vcf=merge_family_3.merged_vcf, | |||
ref_dir=ref_dir, | |||
fasta=fasta, | |||
docker=MENDELIANdocker, | |||
cluster_config=BIGcluster_config, | |||
disk_size=disk_size | |||
} | |||
call merge_mendelian.merge_mendelian as merge_mendelian_1 { | |||
input: | |||
D5_trio_vcf=mendelian_1.D5_trio_vcf, | |||
D6_trio_vcf=mendelian_1.D6_trio_vcf, | |||
family_vcf=merge_family_1.merged_vcf, | |||
docker=DIYdocker, | |||
cluster_config=SMALLcluster_config, | |||
disk_size=disk_size | |||
} | |||
call merge_mendelian.merge_mendelian as merge_mendelian_2 { | |||
input: | |||
D5_trio_vcf=mendelian_2.D5_trio_vcf, | |||
D6_trio_vcf=mendelian_2.D6_trio_vcf, | |||
family_vcf=merge_family_2.merged_vcf, | |||
docker=DIYdocker, | |||
cluster_config=SMALLcluster_config, | |||
disk_size=disk_size | |||
} | |||
call merge_mendelian.merge_mendelian as merge_mendelian_3 { | |||
input: | |||
D5_trio_vcf=mendelian_3.D5_trio_vcf, | |||
D6_trio_vcf=mendelian_3.D6_trio_vcf, | |||
family_vcf=merge_family_3.merged_vcf, | |||
docker=DIYdocker, | |||
cluster_config=SMALLcluster_config, | |||
disk_size=disk_size | |||
} | |||
call quartet_mendelian.quartet_mendelian as quartet_mendelian{ | |||
input: | |||
summary_1=merge_mendelian_1.project_mendelian_summary, | |||
summary_2=merge_mendelian_2.project_mendelian_summary, | |||
summary_3=merge_mendelian_3.project_mendelian_summary, | |||
LCL5_hap_1=LCL5_1_benchmark.summary, | |||
LCL5_hap_2=LCL5_2_benchmark.summary, | |||
LCL5_hap_3=LCL5_3_benchmark.summary, | |||
LCL6_hap_1=LCL6_1_benchmark.summary, | |||
LCL6_hap_2=LCL6_2_benchmark.summary, | |||
LCL6_hap_3=LCL6_3_benchmark.summary, | |||
LCL7_hap_1=LCL7_1_benchmark.summary, | |||
LCL7_hap_2=LCL7_2_benchmark.summary, | |||
LCL7_hap_3=LCL7_3_benchmark.summary, | |||
LCL8_hap_1=LCL8_1_benchmark.summary, | |||
LCL8_hap_2=LCL8_2_benchmark.summary, | |||
LCL8_hap_3=LCL8_3_benchmark.summary, | |||
docker=DIYdocker, | |||
project=project, | |||
cluster_config=SMALLcluster_config, | |||
disk_size=disk_size | |||
} | |||
} | |||