4 роки тому · ee499eea5f
--- a/codescripts/.DS_Store
+++ b/codescripts/.DS_Store
--- a/codescripts/D5_D6.py
+++ b/codescripts/D5_D6.py
@@ -0,0 +1,60 @@
 from __future__ import division
 import pandas as pd
 import sys, argparse, os


 # input arguments
 parser = argparse.ArgumentParser(description="this script is to calculate reproducibility between Quartet_D5 and Quartet_D6s")

 parser.add_argument('-sister', '--sister', type=str, help='sister.txt',  required=True)
 parser.add_argument('-project', '--project', type=str, help='project name',  required=True)


 args = parser.parse_args()
 sister_file = args.sister
 project_name = args.project

 # output file
 output_name = project_name + '.sister.reproducibility.txt'

 # input files
 sister_dat = pd.read_table(sister_file)

 # Genotype strings that are never counted as a variant call on their own:
 # no-call ('./.') and homozygous reference ('0/0').
 NON_VARIANT = ('./.', '0/0')

 sister_same = 0
 sister_diff = 0

 for row in sister_dat.itertuples():
 	# itertuples() yields the index at position 0, so row[5] and row[6] are
 	# the 5th and 6th data columns of the table.
 	# NOTE(review): presumably the D5 and D6 genotype columns -- confirm against sister.txt.
 	gt_a = row[5]
 	gt_b = row[6]
 	if gt_a == gt_b:
 		# Identical calls only count when they are an actual variant genotype.
 		if gt_a not in NON_VARIANT:
 			sister_same += 1
 	elif not (gt_a in NON_VARIANT and gt_b in NON_VARIANT):
 		# Differing calls count unless both sides are no-call / hom-ref.
 		sister_diff += 1

 # Guard against a table with no comparable sites: report NaN instead of
 # crashing with ZeroDivisionError.
 compared = sister_same + sister_diff
 if compared:
 	sister = sister_same/compared
 else:
 	sister = float('nan')

 # Open the output only once the result is known, and let `with` close it so
 # the content is flushed and no empty file is left behind on an early failure.
 with open(output_name,'w') as output_file:
 	output_file.write('Project\tReproducibility_D5_D6\n')
 	output_file.write(project_name + '\t' + str(sister)  + '\n')



--- a/codescripts/Indel_bed.py
+++ b/codescripts/Indel_bed.py
@@ -0,0 +1,37 @@
 import pandas as pd
 import sys, argparse, os
 # NOTE(review): the input path is hard-coded to one user's directory; consider a CLI argument.
 mut = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/MIE/vcf/mutation_type',header=None)
 # `with` guarantees the BED output is flushed and closed, even if a row fails.
 with open(sys.argv[1],'w') as outIndel:
 	for row in mut.itertuples():
 		# The table has no header, so itertuples() names the columns _1.._4.
 		# _2 is used as the position, _3 as REF and _4 as ALT
 		# (comma-separated when there are several alternate alleles).
 		ref_len = len(row._3)
 		alt = max(len(allele) for allele in row._4.split(','))
 		if ref_len == 1 and alt == 1:
 			# single-base REF and ALT: not an indel, nothing to write
 			continue
 		# Every remaining case spans from pos-1 to pos + (longest allele - 1).
 		StartPos = int(row._2) - 1
 		EndPos = int(row._2) + (max(ref_len, alt) - 1)
 		outIndel.write(row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n')







--- a/codescripts/bed_region.py
+++ b/codescripts/bed_region.py
@@ -0,0 +1,72 @@
 import pandas as pd
 import sys, argparse, os
 def _max_allele_len(alleles):
 	"""Return the length of the longest allele in a comma-separated allele string."""
 	return max(len(allele) for allele in alleles.split(','))


 # NOTE(review): both input paths are hard-coded to one user's directory; consider CLI arguments.
 mut = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/vcf/mutation_type',header=None)
 vote = pd.read_table('/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/benchmark.vote.mendelian.txt',header=None)
 # Join the two header-less tables on their first two columns.
 merged_df = pd.merge(vote, mut,  how='inner', left_on=[0,1], right_on = [0,1])
 # `with` closes (and therefore flushes) both outputs even when a row fails.
 with open(sys.argv[1],'w') as outFile, open(sys.argv[2],'w') as outIndel:
 	for row in merged_df.itertuples():
 		# Positional fields _7, _15, _23 and _31 hold the ALT strings labelled
 		# d5, d6, f7 and m8 in the original code; take the longest allele overall.
 		alt = max(
 			_max_allele_len(row._7),
 			_max_allele_len(row._15),
 			_max_allele_len(row._23),
 			_max_allele_len(row._31),
 		)
 		ref = row._35
 		pos = int(row._2)
 		# For every category the interval is [pos-1, pos + longest allele - 1];
 		# for a single-base REF/ALT that reduces to [pos-1, pos].
 		StartPos = pos - 1
 		EndPos = pos + (max(len(ref), alt) - 1)
 		if len(ref) == 1 and alt == 1:
 			cate = 'SNV'
 		else:
 			cate = 'INDEL'
 			# Indels are additionally listed in their own BED file.
 			outIndel.write(row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\n')
 		outFile.write(row._1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\t' + str(row._2) + '\t' + cate + '\n')








--- a/codescripts/cluster.sh
+++ b/codescripts/cluster.sh
@@ -0,0 +1,67 @@
 # Inputs shared by the steps below.
 votes=benchmark.men.vote.diffbed.lengthlessthan50.txt
 diff_bed=/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/MIE/diff.merged.bed
 annovar_dir=/mnt/pgx_src_data_pool_4/home/renluyao/softwares/annovar
 giab_vcf=/mnt/pgx_src_data_pool_4/home/renluyao/Quartet/GIAB/NA12878_HG001/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz
 ref_sdf=/mnt/pgx_src_data_pool_4/home/renluyao/annotation/hg38/GRCh38.d1.vd1.sdf/
 samples="5 6 7 8"

 # Build one VCF body per sample. The ALT column is 7, 15, 23, 31 for samples
 # 5, 6, 7, 8 and the genotype column is the one just before it; column 35 is REF.
 # Lines containing '0/0' are dropped.
 for n in $samples; do
 	alt=$(( 7 + 8 * (n - 5) ))
 	gt=$(( alt - 1 ))
 	cat "$votes" | awk -v alt="$alt" -v gt="$gt" '{print $1"\t"$2"\t"".""\t"$35"\t"$alt"\t.\t.\t.\tGT\t"$gt}' | grep -v '0/0' > LCL$n.body
 done

 # Prepend the per-sample header, then compress and index every VCF.
 for n in $samples; do
 	cat header$n LCL$n.body > LCL$n.beforediffbed.vcf
 done
 rtg bgzip *beforediffbed.vcf
 rtg index *beforediffbed.vcf.gz

 # Remove calls that fall inside the merged "diff" BED regions.
 for n in $samples; do
 	rtg vcffilter -i LCL$n.beforediffbed.vcf.gz --exclude-bed=$diff_bed -o LCL$n.afterfilterdiffbed.vcf.gz
 done

 # NOTE(review): only LCL5 is annotated, and from the pre-filter VCF -- confirm this is intended.
 $annovar_dir/table_annovar.pl LCL5.beforediffbed.vcf.gz $annovar_dir/humandb \
 -buildver hg38 \
 -out LCL5 \
 -remove \
 -protocol 1000g2015aug_all,1000g2015aug_afr,1000g2015aug_amr,1000g2015aug_eas,1000g2015aug_eur,1000g2015aug_sas,clinvar_20190305,gnomad211_genome \
 -operation f,f,f,f,f,f,f,f \
 -nastring . \
 -vcfinput \
 --thread 8

 # Compare each filtered call set against the GIAB HG001 high-confidence VCF.
 for n in $samples; do
 	rtg vcfeval -b $giab_vcf -c LCL$n.afterfilterdiffbed.vcf.gz -o LCL${n}_NIST -t $ref_sdf
 done


 # Count records with a single-base REF and a single-base ALT.
 for n in $samples; do
 	zcat LCL$n.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) == 1)) { print } }' | wc -l
 done


 # Count records with a single-base REF and an ALT field of 2 to 10 characters.
 for n in $samples; do
 	zcat LCL$n.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) < 11) && (length($5) > 1)) { print } }' | wc -l
 done


 # Count records with a single-base REF and an ALT field longer than 10 characters.
 for n in $samples; do
 	zcat LCL$n.afterfilterdiffbed.vcf.gz | grep -v '#' | awk '{ if ((length($4) == 1) && (length($5) > 10)) { print } }' | wc -l
 done

 # Subtract the diff regions from each sample's homozygous-reference consensus BED.
 for n in $samples; do
 	bedtools subtract -a LCL$n.27.homo_ref.consensus.bed -b $diff_bed > LCL$n.27.homo_ref.consensus.filtereddiffbed.bed
 done


 # Convert each VCF body to BED.
 for n in $samples; do
 	python vcf2bed.py LCL$n.body LCL$n.variants.bed
 done



 # Merge variant intervals with the filtered hom-ref intervals into one sorted BED (LCL5 only).
 cat /mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/LCL5.variants.bed | cut -f1,11,12 | cat - LCL5.27.homo_ref.consensus.filtereddiffbed.bed | sort -k1,1 -k2,2n > LCL5.high.confidence.bed




--- a/codescripts/contig.txt
+++ b/codescripts/contig.txt
@@ -0,0 +1,23 @@
 ##contig=<ID=chr1,length=248956422>
 ##contig=<ID=chr2,length=242193529>
 ##contig=<ID=chr3,length=198295559>
 ##contig=<ID=chr4,length=190214555>
 ##contig=<ID=chr5,length=181538259>
 ##contig=<ID=chr6,length=170805979>
 ##contig=<ID=chr7,length=159345973>
 ##contig=<ID=chr8,length=145138636>
 ##contig=<ID=chr9,length=138394717>
 ##contig=<ID=chr10,length=133797422>
 ##contig=<ID=chr11,length=135086622>
 ##contig=<ID=chr12,length=133275309>
 ##contig=<ID=chr13,length=114364328>
 ##contig=<ID=chr14,length=107043718>
 ##contig=<ID=chr15,length=101991189>
 ##contig=<ID=chr16,length=90338345>
 ##contig=<ID=chr17,length=83257441>
 ##contig=<ID=chr18,length=80373285>
 ##contig=<ID=chr19,length=58617616>
 ##contig=<ID=chr20,length=64444167>
 ##contig=<ID=chr21,length=46709983>
 ##contig=<ID=chr22,length=50818468>
 ##contig=<ID=chrX,length=156040895>
--- a/codescripts/extract_multiqc.py
+++ b/codescripts/extract_multiqc.py
@@ -0,0 +1,65 @@
 import json
 import pandas as pd
 import sys, argparse, os

 parser = argparse.ArgumentParser(description="This script is to get information from multiqc")

 parser.add_argument('-fastqc_qualimap', '--fastqc_qualimap', type=str, help='multiqc_general_stats.txt',  required=True)
 parser.add_argument('-fastqc', '--fastqc', type=str, help='multiqc_fastqc.txt',  required=True)
 parser.add_argument('-fastqscreen', '--fastqscreen', type=str, help='multiqc_fastq_screen.txt',  required=True)
 parser.add_argument('-hap', '--happy', type=str, help='multiqc_happy_data.json',  required=True)

 args = parser.parse_args()

 # Input paths taken from the command line.
 fastqc_qualimap_file = args.fastqc_qualimap
 fastqc_file = args.fastqc
 fastqscreen_file = args.fastqscreen
 hap_file = args.happy


 def _columns_with_sample(table, column_indexer):
 	"""Select columns of *table* and put its 'Sample' column in front of them."""
 	subset = table.loc[:, column_indexer]
 	subset.insert(loc=0, column='Sample', value=table['Sample'])
 	return subset


 # General stats table: split into its FastQC and QualiMap column groups and
 # keep only the rows that are complete within each group.
 general_stats = pd.read_table(fastqc_qualimap_file)
 fastqc_stat = _columns_with_sample(general_stats, general_stats.columns.str.startswith('FastQC')).dropna()
 qualimap_stat = _columns_with_sample(general_stats, general_stats.columns.str.startswith('QualiMap')).dropna()

 # FastQC module table: take the column range per_base_sequence_quality..kmer_content
 # and outer-join it to the FastQC general stats on 'Sample'.
 fastqc_table = pd.read_table(fastqc_file)
 fastqc_module = _columns_with_sample(fastqc_table, slice("per_base_sequence_quality", "kmer_content"))
 fastqc_all = pd.merge(fastqc_stat, fastqc_module, how='outer', left_on=['Sample'], right_on=['Sample'])

 # FastQ Screen: keep the '*percentage' columns and strip '_screen' from sample names.
 screen_table = pd.read_table(fastqscreen_file)
 screen_table['Sample'] = [sample.replace('_screen','') for sample in screen_table['Sample']]
 fastqscreen = _columns_with_sample(screen_table, screen_table.columns.str.endswith('percentage'))

 # hap.py JSON: keep the columns whose name ends in 'ALL', one row per such column.
 with open(hap_file) as hap_json:
 	happy = json.load(hap_json)
 happy_table = pd.DataFrame.from_records(happy)
 happy_all = happy_table.loc[:, happy_table.columns.str.endswith('ALL')].T
 benchmark = happy_all.loc[:, ['sample_id','METRIC.Precision','METRIC.Recall']]
 benchmark.columns = ['Sample','Precision','Recall']

 # Write the four result tables as tab-separated files without the index.
 for table, path in (
 	(fastqc_all, 'fastqc.final.result.txt'),
 	(fastqscreen, 'fastqscreen.final.result.txt'),
 	(qualimap_stat, 'qualimap.final.result.txt'),
 	(benchmark, 'benchmark.final.result.txt'),
 ):
 	table.to_csv(path, sep="\t", index=False)






--- a/codescripts/extract_vcf_information.py
+++ b/codescripts/extract_vcf_information.py
@@ -0,0 +1,92 @@
 import sys,getopt
 import os
 import re
 import fileinput
 import pandas as pd

 def usage():
 	"""Print the command-line help text of this script to standard output."""
 	help_text = """
 Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file

 This script will extract SNVs and Indels information from the vcf files and output a tab-delimited files.

 Input:
 -i the selected vcf file

 Output:
 -o tab-delimited parsed file
 		"""
 	print(help_text)

 # select supported small variants
 def process(oneLine):
 	"""Parse one tab-separated VCF data line into a flat dict.

 	The dict starts from the parsed INFO column (see parse_INFO) and is then
 	extended with the sample's FORMAT fields and the fixed columns:

 	* every FORMAT key except ``AD`` is stored under its own name;
 	* ``AD`` is split into ``RefDP`` / ``AltDP`` (first two values) and, when
 	  both are integers with a non-zero sum, ``af`` = AltDP / (RefDP + AltDP);
 	* ``chromo``, ``pos``, ``id``, ``ref``, ``alt`` and ``qual`` come from
 	  columns 1-6.

 	Only the first sample column (column 10) is read. A missing or
 	non-numeric ``AD`` (for example ".") and a sample field with fewer values
 	than FORMAT keys are tolerated instead of raising.
 	"""
 	strings = oneLine.strip().split('\t')
 	infoParsed = parse_INFO(strings[7])
 	formatKeys = strings[8].split(':')
 	formatValues = strings[9].split(':')
 	# NOTE(review): the original loop bound (len(formatKeys) - 1) never visits
 	# the last FORMAT key; that is kept here because it changes the output
 	# columns, but it looks like an off-by-one -- confirm.
 	# zip() also stops early when the sample carries fewer values than keys.
 	for key, value in zip(formatKeys[:-1], formatValues):
 		if key != 'AD':
 			infoParsed[key] = value
 			continue
 		ra = value.split(',')
 		if len(ra) < 2:
 			# AD is missing (e.g. "."): no depths to record
 			continue
 		infoParsed['RefDP'] = ra[0]
 		infoParsed['AltDP'] = ra[1]
 		try:
 			ref_depth = int(ra[0])
 			alt_depth = int(ra[1])
 		except ValueError:
 			# non-numeric depth: keep the raw strings, skip the frequency
 			continue
 		total_depth = ref_depth + alt_depth
 		if total_depth != 0:
 			# float() forces true division on Python 2 as well
 			infoParsed['af'] = alt_depth / float(total_depth)
 	infoParsed['chromo'] = strings[0]
 	infoParsed['pos'] = strings[1]
 	infoParsed['id'] = strings[2]
 	infoParsed['ref'] = strings[3]
 	infoParsed['alt'] = strings[4]
 	infoParsed['qual'] = strings[5]
 	return infoParsed


 def parse_INFO(info):
 	"""Parse a VCF INFO column into a dict of strings.

 	``info`` is the raw ``key=value;key=value;flag`` text. Rules:

 	* ``AF`` is dropped;
 	* ``DB`` and any other key without a value (a flag) map to ``'1'``;
 	* a value keeps everything after the first ``=``;
 	* empty entries (trailing ``;``) and the missing marker ``.`` are skipped.

 	When a key occurs more than once the last value wins.
 	"""
 	infoDict = {}
 	for entry in info.strip().split(';'):
 		# partition() splits on the first '=' only, so values that themselves
 		# contain '=' are kept whole and flags simply get an empty separator.
 		key, separator, value = entry.partition('=')
 		if key in ('', '.', 'AF'):
 			continue
 		if key == 'DB' or not separator:
 			infoDict[key] = '1'
 		else:
 			infoDict[key] = value
 	return infoDict
 	

 # Parse -i <input vcf>, -o <output table> and -h (help).
 inputFile = None
 outputFile = None
 opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
 for op,value in opts:
 	if op == "-i":
 		inputFile=value
 	elif op == "-o":
 		outputFile=value
 	elif op == "-h":
 		usage()
 		sys.exit()

 # Both options are required; checking the parsed values as well avoids a
 # NameError later when one of them was not given.
 if len(sys.argv[1:]) < 3 or inputFile is None or outputFile is None:
 	usage()
 	sys.exit()

 # Parse every non-header line (header and meta lines start with '#').
 allDict = []
 vcf_lines = fileinput.input(inputFile)
 try:
 	for line in vcf_lines:
 		if not line.startswith('#'):
 			allDict.append(process(line))
 finally:
 	# release the input handle even if a line fails to parse
 	vcf_lines.close()

 allTable = pd.DataFrame(allDict)

 allTable.to_csv(outputFile,sep='\t',index=False)

--- a/codescripts/filter_indel_over_50_cluster.py
+++ b/codescripts/filter_indel_over_50_cluster.py
@@ -0,0 +1,47 @@
 import sys,getopt
 from itertools import islice

 # Output files, opened in the current working directory when the script starts
 # and written to by process():
 #   over_50_outfile -- lines where REF or any of the checked ALT alleles is longer than 50 characters
 #   less_50_outfile -- all other lines
 over_50_outfile = open("indel_lenth_over_50.txt",'w')
 less_50_outfile = open("benchmark.men.vote.diffbed.lengthlessthan50.txt","w")

 def _longest_allele(alleles):
 	"""Return the length of the longest item in a comma-separated allele string."""
 	return max(len(allele) for allele in alleles.split(','))


 def process(line):
 	"""Route one tab-separated line to the over-50 or the less-than-50 output.

 	Fields 7, 15, 23 and 31 (1-based) hold the ALT strings labelled d5, d6, f7
 	and m8 in the original code; field 35 is the REF allele. The line is
 	written unchanged to ``over_50_outfile`` when any of those alleles is
 	longer than 50 characters, otherwise to ``less_50_outfile``.
 	"""
 	fields = line.strip().split('\t')
 	# longest ALT allele of each of the four samples, then the REF length
 	lengths = [_longest_allele(fields[column]) for column in (6, 14, 22, 30)]
 	lengths.append(len(fields[34]))
 	target = over_50_outfile if max(lengths) > 50 else less_50_outfile
 	target.write(line)


 # Skip the first (header) line and classify every remaining record.
 # The handles are closed explicitly so buffered output is flushed and the
 # input is released even when a malformed line raises.
 try:
 	with open(sys.argv[1]) as input_file:
 		for line in islice(input_file, 1, None):
 			process(line)
 finally:
 	over_50_outfile.close()
 	less_50_outfile.close()
--- a/codescripts/filter_indel_over_50_mendelian.py
+++ b/codescripts/filter_indel_over_50_mendelian.py
@@ -0,0 +1,43 @@
 from itertools import islice
 import fileinput
 import sys, argparse, os


 # input arguments
 # Both options are required: the merged table to filter and the prefix that
 # names the output file.
 parser = argparse.ArgumentParser(description="this script is to exclude indel over 50bp")

 parser.add_argument('-i', '--mergedGVCF', type=str, help='merged gVCF txt with only chr, pos, ref, alt and genotypes',  required=True)
 parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file',  required=True)


 args = parser.parse_args()
 input_dat = args.mergedGVCF
 prefix = args.prefix


 # output file
 # Opened here, at start-up, and written to by process(); it receives the
 # lines that pass the 50-character length filter.
 output_name = prefix + '.indel.lessthan50bp.txt'
 outfile = open(output_name,'w')


 def process(line):
 	"""Write *line* to ``outfile`` unless REF or an ALT allele exceeds 50 characters.

 	The line is tab-separated; field 4 (1-based) is the ALT string, possibly
 	a comma-separated list of alleles, and field 3 is the REF allele. Lines
 	that fail the filter are silently dropped.
 	"""
 	fields = line.strip().split('\t')
 	# longest alternate allele (a string without commas is its own only allele)
 	longest_alt = max(len(allele) for allele in fields[3].split(','))
 	ref_length = len(fields[2])
 	if longest_alt <= 50 and ref_length <= 50:
 		outfile.write(line)


 # Filter every line of the input table; close both handles afterwards so the
 # output is flushed and nothing is left open when a line fails.
 input_lines = fileinput.input(input_dat)
 try:
 	for line in input_lines:
 		process(line)
 finally:
 	input_lines.close()
 	outfile.close()


--- a/codescripts/get_filtered_benchmark_vcfinfo.py
+++ b/codescripts/get_filtered_benchmark_vcfinfo.py
@@ -0,0 +1,36 @@
 from __future__ import division
 import pandas as pd
 import sys, argparse, os
 import fileinput


 # Command-line interface: four required string options, each with a short and
 # a long spelling of the same name.
 parser = argparse.ArgumentParser(description="this script is to get filtered and benchmark vcf info")
 for option, help_text in (
 	('filtered', 'filtered position'),
 	('benchmark', 'benchmark position'),
 	('vcf', 'one specific vcf'),
 	('filename', 'output file name'),
 ):
 	parser.add_argument('-' + option, '--' + option, type=str, help=help_text, required=True)

 args = parser.parse_args()
 filtered, benchmark, vcf, filename = args.filtered, args.benchmark, args.vcf, args.filename

 # Output suffixes, in the same order as the two position lists.
 output_suffixes = ('.filtered.txt', '.benchmark.txt')

 # Read the two header-less position tables first, then the VCF table whose
 # header row supplies the '#CHROM' and 'POS' column names.
 position_tables = [pd.read_table(path, header=None) for path in (filtered, benchmark)]
 vcf_dat = pd.read_table(vcf)

 # Keep only the VCF rows whose (#CHROM, POS) appears in each position list.
 merged_tables = [
 	pd.merge(positions, vcf_dat, how='inner', left_on=[0,1], right_on=['#CHROM','POS'])
 	for positions in position_tables
 ]

 for merged, suffix in zip(merged_tables, output_suffixes):
 	merged.to_csv(filename + suffix, sep='\t', index=False)
--- a/codescripts/hap_summary.py
+++ b/codescripts/hap_summary.py
@@ -0,0 +1,50 @@
 import pandas as pd
 import sys, argparse, os

 parser = argparse.ArgumentParser(description="This script is to get information from hap")
 parser.add_argument('-hap', '--happy', type=str, help='hap.py table',  required=True)
 parser.add_argument('-name', '--name', type=str, help='sample name',  required=True)

 args = parser.parse_args()

 hap_file = args.happy
 name = args.name

 # Derive the evaluated query count and the true-positive count per row.
 dat = pd.read_table(hap_file)
 query_total = dat['QUERY.TOTAL'].astype(int)
 query_unknown = dat['QUERY.UNK'].astype(int)
 dat['QUERY.TP'] = query_total - query_unknown - dat['QUERY.FP'].astype(int)
 dat['QUERY'] = query_total - query_unknown

 # Split by variant type and renumber so the two halves line up row by row.
 indel = dat[['INDEL' in kind for kind in dat['Type']]].reset_index(drop=True)
 snv = dat[['SNP' in kind for kind in dat['Type']]].reset_index(drop=True)

 # Side-by-side concat gives every metric name twice (SNV first, INDEL second);
 # selecting by name therefore returns the pair, which is then relabelled.
 metric_columns = ['QUERY.TOTAL', 'QUERY','QUERY.TP','QUERY.FP','TRUTH.FN','METRIC.Precision', 'METRIC.Recall','METRIC.F1_Score']
 metric_labels = ['number', 'query', 'TP', 'FP', 'FN', 'precision', 'recall', 'F1']
 output_columns = [variant + ' ' + label for label in metric_labels for variant in ('SNV', 'INDEL')]

 benchmark = pd.concat([snv, indel], axis=1)[metric_columns]
 benchmark.columns = output_columns
 benchmark = benchmark[output_columns]
 for label in ('precision', 'recall', 'F1'):
 	for variant in ('SNV', 'INDEL'):
 		column = variant + ' ' + label
 		benchmark[column] = benchmark[column].astype(float)
 benchmark = benchmark.round(2)

 # Sample ids: the first five '_'-separated parts of the name, then
 # LCL<5-8>_<1-3>, then the sixth part.
 name_array = name.split("_")
 name_tail = name_array[5]
 name_head = "_".join(name_array[0:5])
 sample_ids = [
 	name_head + "_" + "LCL" + str(lcl) + "_" + str(replicate) + "_" + name_tail
 	for lcl in (5, 6, 7, 8)
 	for replicate in (1, 2, 3)
 ]

 benchmark.insert(loc=0, column='Sample', value=sample_ids)
 benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0)
--- a/codescripts/high_confidence_call_vote.py
+++ b/codescripts/high_confidence_call_vote.py
@@ -0,0 +1,416 @@
 from __future__ import division 
 import sys, argparse, os
 import fileinput
 import re
 import pandas as pd
 from operator import itemgetter
 from collections import Counter
 from itertools import islice
 from numpy import *
 import statistics

 # Command-line interface: four required string options.
 parser = argparse.ArgumentParser(description="this script is to count voting number")

 for short_flag, long_flag, help_text in (
 	('-vcf', '--multi_sample_vcf', 'The VCF file you want to count the voting number'),
 	('-dup', '--dup_list', 'Duplication list'),
 	('-sample', '--sample_name', 'which sample of quartet'),
 	('-prefix', '--prefix', 'Prefix of output file name'),
 ):
 	parser.add_argument(short_flag, long_flag, type=str, help=help_text, required=True)

 # Expose the parsed options under the module-level names used by the rest of the script.
 args = parser.parse_args()
 multi_sample_vcf, dup_list = args.multi_sample_vcf, args.dup_list
 prefix, sample_name = args.prefix, args.sample_name

 # Meta-information lines written at the top of the voted (benchmark) VCF.
 # The last FORMAT key is declared as QUAL so that it matches the per-record
 # FORMAT column 'GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' emitted by main().
 vcf_header = '''##fileformat=VCFv4.2
 ##fileDate=20200331
 ##source=high_confidence_calls_intergration(choppy app)
 ##reference=GRCh38.d1.vd1
 ##INFO=<ID=location,Number=1,Type=String,Description="Repeat region">
 ##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes">
 ##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes">
 ##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples">
 ##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples">
 ##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency, sum alternative depth / sum depth">
 ##FORMAT=<ID=GQ,Number=1,Type=Float,Description="Average genotype quality">
 ##FORMAT=<ID=QD,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth">
 ##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Average mapping quality">
 ##FORMAT=<ID=FS,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias">
 ##FORMAT=<ID=QUAL,Number=1,Type=Float,Description="Average variant quality">
 ##contig=<ID=chr1,length=248956422>
 ##contig=<ID=chr2,length=242193529>
 ##contig=<ID=chr3,length=198295559>
 ##contig=<ID=chr4,length=190214555>
 ##contig=<ID=chr5,length=181538259>
 ##contig=<ID=chr6,length=170805979>
 ##contig=<ID=chr7,length=159345973>
 ##contig=<ID=chr8,length=145138636>
 ##contig=<ID=chr9,length=138394717>
 ##contig=<ID=chr10,length=133797422>
 ##contig=<ID=chr11,length=135086622>
 ##contig=<ID=chr12,length=133275309>
 ##contig=<ID=chr13,length=114364328>
 ##contig=<ID=chr14,length=107043718>
 ##contig=<ID=chr15,length=101991189>
 ##contig=<ID=chr16,length=90338345>
 ##contig=<ID=chr17,length=83257441>
 ##contig=<ID=chr18,length=80373285>
 ##contig=<ID=chr19,length=58617616>
 ##contig=<ID=chr20,length=64444167>
 ##contig=<ID=chr21,length=46709983>
 ##contig=<ID=chr22,length=50818468>
 ##contig=<ID=chrX,length=156040895>
 '''

 # Meta-information lines written at the top of the all-sample VCF.
 # DUP is appended to INFO as a bare flag (';DUP'), so it is declared with
 # Number=0, the only Number value the VCF specification allows for Type=Flag.
 vcf_header_all_sample = '''##fileformat=VCFv4.2
 ##fileDate=20200331
 ##reference=GRCh38.d1.vd1
 ##INFO=<ID=location,Number=1,Type=String,Description="Repeat region">
 ##INFO=<ID=DUP,Number=0,Type=Flag,Description="Duplicated variant records">
 ##INFO=<ID=DETECTED,Number=1,Type=Integer,Description="Number of detected votes">
 ##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number of consnesus votes">
 ##INFO=<ID=FAM,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
 ##INFO=<ID=ALL_ALT,Number=1,Type=Float,Description="Sum of alternative reads of all samples">
 ##INFO=<ID=ALL_DP,Number=1,Type=Float,Description="Sum of depth of all samples">
 ##INFO=<ID=ALL_AF,Number=1,Type=Float,Description="Allele frequency of net alternatice reads and net depth">
 ##INFO=<ID=GQ_MEAN,Number=1,Type=Float,Description="Mean of genotype quality of all samples">
 ##INFO=<ID=QD_MEAN,Number=1,Type=Float,Description="Average Variant Confidence/Quality by Depth">
 ##INFO=<ID=MQ_MEAN,Number=1,Type=Float,Description="Mean of mapping quality of all samples">
 ##INFO=<ID=FS_MEAN,Number=1,Type=Float,Description="Average Phred-scaled p-value using Fisher's exact test to detect strand bias">
 ##INFO=<ID=QUAL_MEAN,Number=1,Type=Float,Description="Average variant quality">
 ##INFO=<ID=PCR,Number=1,Type=String,Description="Consensus of PCR votes">
 ##INFO=<ID=PCR_FREE,Number=1,Type=String,Description="Consensus of PCR-free votes">
 ##INFO=<ID=CONSENSUS,Number=1,Type=String,Description="Consensus calls">
 ##INFO=<ID=CONSENSUS_SEQ,Number=1,Type=String,Description="Consensus sequence">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##FORMAT=<ID=DP,Number=1,Type=String,Description="Depth">
 ##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth">
 ##FORMAT=<ID=AF,Number=1,Type=String,Description="Allele frequency">
 ##FORMAT=<ID=GQ,Number=1,Type=String,Description="Genotype quality">
 ##FORMAT=<ID=MQ,Number=1,Type=String,Description="Mapping quality">
 ##FORMAT=<ID=TWINS,Number=1,Type=String,Description="1 is twins shared, 0 is twins discordant ">
 ##FORMAT=<ID=TRIO5,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL5 mendelian consistent, 0 is mendelian vioaltion">
 ##FORMAT=<ID=TRIO6,Number=1,Type=String,Description="1 is LCL7, LCL8 and LCL6 mendelian consistent, 0 is mendelian vioaltion">
 ##contig=<ID=chr1,length=248956422>
 ##contig=<ID=chr2,length=242193529>
 ##contig=<ID=chr3,length=198295559>
 ##contig=<ID=chr4,length=190214555>
 ##contig=<ID=chr5,length=181538259>
 ##contig=<ID=chr6,length=170805979>
 ##contig=<ID=chr7,length=159345973>
 ##contig=<ID=chr8,length=145138636>
 ##contig=<ID=chr9,length=138394717>
 ##contig=<ID=chr10,length=133797422>
 ##contig=<ID=chr11,length=135086622>
 ##contig=<ID=chr12,length=133275309>
 ##contig=<ID=chr13,length=114364328>
 ##contig=<ID=chr14,length=107043718>
 ##contig=<ID=chr15,length=101991189>
 ##contig=<ID=chr16,length=90338345>
 ##contig=<ID=chr17,length=83257441>
 ##contig=<ID=chr18,length=80373285>
 ##contig=<ID=chr19,length=58617616>
 ##contig=<ID=chr20,length=64444167>
 ##contig=<ID=chr21,length=46709983>
 ##contig=<ID=chr22,length=50818468>
 ##contig=<ID=chrX,length=156040895>
 '''
 # Duplicated variant IDs: first column of the headerless duplication list.
 dup = pd.read_table(dup_list,header=None)
 var_dup = dup[0].tolist()

 # Output files: the voted benchmark VCF and the per-sample information VCF.
 benchmark_file_name = prefix + '_voted.vcf'
 benchmark_outfile = open(benchmark_file_name,'w')

 all_sample_file_name = prefix + '_all_sample_information.vcf'
 all_sample_outfile = open(all_sample_file_name,'w')

 # Fixed VCF columns shared by both column-header lines.
 fixed_vcf_columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']

 # Benchmark VCF: meta lines, then a single-sample column line.
 outputcolumn = '\t'.join(fixed_vcf_columns) + '\t' + sample_name + '_benchmark_calls\n'
 benchmark_outfile.write(vcf_header)
 benchmark_outfile.write(outputcolumn)

 # All-sample VCF: meta lines, then one column per replicate (27 in total).
 replicate_columns = [
 	'Quartet_DNA_BGI_SEQ2000_BGI_1_20180518', 'Quartet_DNA_BGI_SEQ2000_BGI_2_20180530', 'Quartet_DNA_BGI_SEQ2000_BGI_3_20180530',
 	'Quartet_DNA_BGI_T7_WGE_1_20191105', 'Quartet_DNA_BGI_T7_WGE_2_20191105', 'Quartet_DNA_BGI_T7_WGE_3_20191105',
 	'Quartet_DNA_ILM_Nova_ARD_1_20181108', 'Quartet_DNA_ILM_Nova_ARD_2_20181108', 'Quartet_DNA_ILM_Nova_ARD_3_20181108',
 	'Quartet_DNA_ILM_Nova_ARD_4_20190111', 'Quartet_DNA_ILM_Nova_ARD_5_20190111', 'Quartet_DNA_ILM_Nova_ARD_6_20190111',
 	'Quartet_DNA_ILM_Nova_BRG_1_20180930', 'Quartet_DNA_ILM_Nova_BRG_2_20180930', 'Quartet_DNA_ILM_Nova_BRG_3_20180930',
 	'Quartet_DNA_ILM_Nova_WUX_1_20190917', 'Quartet_DNA_ILM_Nova_WUX_2_20190917', 'Quartet_DNA_ILM_Nova_WUX_3_20190917',
 	'Quartet_DNA_ILM_XTen_ARD_1_20170403', 'Quartet_DNA_ILM_XTen_ARD_2_20170403', 'Quartet_DNA_ILM_XTen_ARD_3_20170403',
 	'Quartet_DNA_ILM_XTen_NVG_1_20170329', 'Quartet_DNA_ILM_XTen_NVG_2_20170329', 'Quartet_DNA_ILM_XTen_NVG_3_20170329',
 	'Quartet_DNA_ILM_XTen_WUX_1_20170216', 'Quartet_DNA_ILM_XTen_WUX_2_20170216', 'Quartet_DNA_ILM_XTen_WUX_3_20170216',
 ]
 outputcolumn_all_sample = '\t'.join(fixed_vcf_columns + replicate_columns) + '\n'
 all_sample_outfile.write(vcf_header_all_sample)
 all_sample_outfile.write(outputcolumn_all_sample)



 #function
 def replace_nan(strings_list):
 	"""Return a copy of `strings_list` with every bare '.' entry expanded.

 	An entry equal to '.' is replaced by a colon-separated run of '.'
 	placeholders; every other entry is kept unchanged and in order.
 	"""
 	placeholder = '.:.:.:.:.:.:.:.:.:.:.:.'
 	return [placeholder if entry == '.' else entry for entry in strings_list]

 def remove_dot(strings_list):
 	"""Return the entries of `strings_list` that are not exactly '.', in order."""
 	return [entry for entry in strings_list if entry != '.']

 def detected_number(strings):
 	"""Return, as a string, 27 minus the number of samples whose GT field is '.'.

 	The GT field is the text before the first ':' of each sample entry.
 	"""
 	genotypes = [sample.split(':', 1)[0] for sample in strings]
 	missing = genotypes.count('.')
 	return str(27 - missing)

 def vote_number(strings,consensus_call):
 	"""Count the samples whose normalised genotype equals `consensus_call`.

 	For each sample the GT field (text before the first ':') has every '.'
 	replaced by '0/0' and is then normalised with gt_uniform(). The count
 	is returned as a string.
 	"""
 	votes = 0
 	for sample in strings:
 		raw_gt = sample.split(':')[0].replace('.','0/0')
 		if gt_uniform(raw_gt) == consensus_call:
 			votes += 1
 	return str(votes)

 def family_vote(strings,consensus_call):
 	"""Count consensus-supporting samples that are fully Mendelian consistent.

 	A sample is counted when its normalised genotype (GT with '.' replaced
 	by '0/0', passed through gt_uniform()) equals `consensus_call` and its
 	sub-fields 1-3 joined by ':' are exactly '1:1:1'.

 	Parameters:
 		strings: per-sample FORMAT strings (colon-separated, GT first).
 		consensus_call: normalised genotype to match, e.g. '0/1'.

 	Returns:
 		The count as a string; '0' when no sample matches.
 	"""
 	mendelian_num = 0
 	for sample in strings:
 		fields = sample.split(':')
 		gt = gt_uniform(fields[0].replace('.','0/0'))
 		# Compare per sample instead of going through itemgetter(*indices):
 		# itemgetter raises TypeError for zero indices and returns a bare
 		# string for a single index, which turned .count() into a substring count.
 		if gt == consensus_call and ':'.join(fields[1:4]) == '1:1:1':
 			mendelian_num += 1
 	return(str(mendelian_num))

 def gt_uniform(strings):
 	"""Return the '/'-separated genotype with its first two alleles in ascending order.

 	Only the first two '/'-separated tokens are used; they are compared as
 	integers and swapped when the first is larger (e.g. '2/1' -> '1/2').
 	"""
 	tokens = strings.split('/')
 	first, second = tokens[0], tokens[1]
 	if int(first) > int(second):
 		first, second = second, first
 	return first + '/' + second

 def decide_by_rep(strings):
 	"""Derive one call from the replicate entries of a single sequencing site.

 	Each entry is a colon-separated string: field 0 is the genotype and
 	fields 1-3, joined by ':', form the Mendelian tag. The result is:
 	  - the most common normalised genotype, when the most common Mendelian
 	    tag is '1:1:1' (at least twice) and that genotype also occurs at
 	    least twice;
 	  - 'inconGT' when the Mendelian tag qualifies but the genotype does not;
 	  - 'noInfo' when the most common Mendelian tag is the empty string
 	    (at least twice);
 	  - 'inconMen' otherwise.
 	"""
 	mendelian_tags = [':'.join(entry.split(':')[1:4]) for entry in strings]
 	# '.' becomes '0/0' and allele order is normalised (2/1 -> 1/2).
 	genotypes = [gt_uniform(entry.split(':')[0].replace('.','0/0')) for entry in strings]

 	top_tag, top_tag_votes = Counter(mendelian_tags).most_common(1)[0]
 	if top_tag_votes < 2:
 		return 'inconMen'
 	if top_tag == '':
 		return 'noInfo'
 	if top_tag != '1:1:1':
 		return 'inconMen'

 	top_gt, top_gt_votes = Counter(genotypes).most_common(1)[0]
 	if top_gt_votes >= 2:
 		return top_gt
 	return 'inconGT'


 def _site_consensus(site_calls, min_votes):
 	"""Return the call shared by at least `min_votes` sites, else 'inconSequenceSite'."""
 	candidate_sequence, freq_sequence = Counter(site_calls).most_common(1)[0]
 	if freq_sequence >= min_votes:
 		return candidate_sequence
 	return 'inconSequenceSite'


 def _format_values(samples, index, cast):
 	"""Collect FORMAT sub-field `index` of every sample, drop '.', convert with `cast`."""
 	return [cast(value) for value in remove_dot([x.split(':')[index] for x in samples])]


 def main():
 	"""Vote a consensus call for every record of the multi-sample VCF.

 	Header lines (starting with '#') are skipped. Records whose
 	'<CHROM>_<POS>' id is in the duplication list get ';DUP' appended to
 	INFO and are written to the all-sample file only.

 	For all other records a PCR consensus is taken over four sequencing
 	sites (at least 3 must agree) and a PCR-free consensus over five sites
 	(at least 4 must agree), each site being summarised by decide_by_rep().
 	When both agree on a non-reference genotype, the record is written to
 	the benchmark file with summed/averaged quality values and to the
 	all-sample file with the same values in INFO. Otherwise the record is
 	written to the all-sample file only, labelled through CONSENSUS=.

 	Fixes relative to the previous implementation:
 	  - the PCR-free vote is counted over the PCR-free sites (the PCR
 	    counter was reused before);
 	  - the 'pcr-speicifc' branch is reachable (its condition required the
 	    PCR-free call to be both a tag and '0/0').
 	"""
 	tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
 	# A set gives constant-time membership tests for the per-record lookup.
 	duplicated_ids = set(var_dup)
 	for line in fileinput.input(multi_sample_vcf):
 		if line.startswith('#'):
 			continue
 		line = line.strip()
 		strings = line.split('\t')
 		variant_id = '_'.join([strings[0],strings[1]])
 		# check if the variants location is duplicated
 		if variant_id in duplicated_ids:
 			strings[7] = strings[7] + ';DUP'
 			all_sample_outfile.write('\t'.join(strings) + '\n')
 			continue

 		strings[9:] = replace_nan(strings[9:])
 		samples = strings[9:]

 		# pcr: columns 9-11 and 27-35, three replicates per sequencing site
 		pcr = itemgetter(*[9,10,11,27,28,29,30,31,32,33,34,35])(strings)
 		SEQ2000 = decide_by_rep(pcr[0:3])
 		XTen_ARD = decide_by_rep(pcr[3:6])
 		XTen_NVG = decide_by_rep(pcr[6:9])
 		XTen_WUX = decide_by_rep(pcr[9:12])
 		pcr_consensus = _site_consensus([SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX], 3)

 		# pcr-free: columns 12-26, three replicates per sequencing site
 		pcr_free = itemgetter(*[12,13,14,15,16,17,18,19,20,21,22,23,24,25,26])(strings)
 		T7_WGE = decide_by_rep(pcr_free[0:3])
 		Nova_ARD_1 = decide_by_rep(pcr_free[3:6])
 		Nova_ARD_2 = decide_by_rep(pcr_free[6:9])
 		Nova_BRG = decide_by_rep(pcr_free[9:12])
 		Nova_WUX = decide_by_rep(pcr_free[12:15])
 		pcr_free_consensus = _site_consensus([T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX], 4)

 		# A "variant" call is a real, non-reference genotype (not a tag, not 0/0).
 		pcr_is_variant = (pcr_consensus not in tag) and (pcr_consensus != '0/0')
 		pcr_free_is_variant = (pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')

 		if (pcr_consensus == pcr_free_consensus) and pcr_is_variant:
 			consensus_call = pcr_consensus
 			strings[7] = strings[7] + ';VOTED=' + vote_number(samples,consensus_call)
 			strings[7] = strings[7] + ';DETECTED=' + detected_number(samples)
 			strings[7] = strings[7] + ';FAM=' + family_vote(samples,consensus_call)
 			# Reduce a multi-allelic ALT to the alleles the consensus uses
 			# and renumber the genotype accordingly.
 			alt = strings[4]
 			alt_gt = alt.split(',')
 			if len(alt_gt) > 1:
 				allele1 = consensus_call.split('/')[0]
 				allele2 = consensus_call.split('/')[1]
 				if allele1 == '0':
 					consensus_alt_seq = alt_gt[int(allele2) - 1]
 					consensus_call = '0/1'
 				else:
 					allele1_seq = alt_gt[int(allele1) - 1]
 					allele2_seq = alt_gt[int(allele2) - 1]
 					if int(allele1) > int(allele2):
 						consensus_alt_seq = allele2_seq + ',' + allele1_seq
 						consensus_call = '1/2'
 					elif int(allele1) < int(allele2):
 						consensus_alt_seq = allele1_seq + ',' + allele2_seq
 						consensus_call = '1/2'
 					else:
 						consensus_alt_seq = allele1_seq
 						consensus_call = '1/1'
 			else:
 				consensus_alt_seq = alt
 			# Sample layout: GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL[:rawGT]
 			# NOTE(review): sum/round/mean are resolved at module level, where
 			# `from numpy import *` may shadow the builtins; the calls are kept as they were.
 			ALL_DP = sum(_format_values(samples, 4, int))
 			ALL_ALT = sum(_format_values(samples, 5, int))
 			ALL_AF = round(ALL_ALT/ALL_DP,2)
 			GQ_MEAN = round(mean(_format_values(samples, 7, int)),2)
 			QD_MEAN = round(mean(_format_values(samples, 8, float)),2)
 			MQ_MEAN = round(mean(_format_values(samples, 9, float)),2)
 			FS_MEAN = round(mean(_format_values(samples, 10, float)),2)
 			QUAL_MEAN = round(mean(_format_values(samples, 11, float)),2)
 			# benchmark output
 			output_format = consensus_call + ':' + str(ALL_DP) + ':' + str(ALL_ALT) + ':' + str(ALL_AF) + ':' + str(GQ_MEAN) + ':' + str(QD_MEAN) + ':' + str(MQ_MEAN) + ':' + str(FS_MEAN) + ':' + str(QUAL_MEAN)
 			outLine = strings[0] + '\t' + strings[1] + '\t' + strings[2] + '\t' + strings[3] + '\t' + consensus_alt_seq + '\t' + '.' + '\t' + '.' + '\t' + strings[7] + '\t' + 'GT:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + output_format + '\n'
 			benchmark_outfile.write(outLine)
 			# all sample output
 			strings[7] = strings[7] + ';ALL_ALT=' + str(ALL_ALT) + ';ALL_DP=' + str(ALL_DP) + ';ALL_AF=' + str(ALL_AF) \
 			+ ';GQ_MEAN=' + str(GQ_MEAN) + ';QD_MEAN=' + str(QD_MEAN) + ';MQ_MEAN=' + str(MQ_MEAN) + ';FS_MEAN=' + str(FS_MEAN) \
 			+ ';QUAL_MEAN=' + str(QUAL_MEAN) + ';PCR=' + consensus_call + ';PCR_FREE=' + consensus_call + ';CONSENSUS=' + consensus_call \
 			+ ';CONSENSUS_SEQ=' + consensus_alt_seq
 			all_sample_outfile.write('\t'.join(strings) + '\n')
 			continue

 		# No shared variant call: label the record and write it to the all-sample file only.
 		extra_info = ''
 		if (pcr_consensus in tag) and (pcr_free_consensus in tag):
 			consensus_call = 'filtered'
 		elif (not pcr_is_variant) and pcr_free_is_variant:
 			consensus_call = 'pcr-free-speicifc'
 		elif pcr_is_variant and (not pcr_free_is_variant):
 			consensus_call = 'pcr-speicifc'
 			extra_info = ';PCR=' + pcr_consensus + ';PCR_FREE=' + pcr_free_consensus
 		elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
 			consensus_call = 'confirm for parents'
 		else:
 			consensus_call = 'filtered'
 		strings[7] = strings[7] + ';DETECTED=' + detected_number(samples)
 		strings[7] = strings[7] + ';CONSENSUS=' + consensus_call + extra_info
 		all_sample_outfile.write('\t'.join(strings) + '\n')

 if __name__ == '__main__':
 	main()












--- a/codescripts/high_voted_mendelian_bed.py
+++ b/codescripts/high_voted_mendelian_bed.py
@@ -0,0 +1,42 @@
 import pandas as pd
 import sys, argparse, os
 # Convert a variant table (columns V1=chrom, V2=pos, V3=ref, V4=alt) into
 # BED-like intervals: chrom, 0-based start, end, original position, category.
 mut = pd.read_table(sys.argv[1])
 # The context manager closes (and flushes) the output even if a row fails.
 with open(sys.argv[2],'w') as outFile:
 	for row in mut.itertuples():
 		# Length of the longest ALT allele (ALT may be a comma-separated list).
 		alt_max = max(len(allele) for allele in row.V4.split(','))
 		ref_len = len(row.V3)
 		pos = int(row.V2)
 		# The interval starts one base before POS and spans the longer of REF and ALT.
 		StartPos = pos - 1
 		EndPos = pos + (max(ref_len, alt_max) - 1)
 		# Only single-base REF with single-base ALT is an SNV; everything else is labelled INDEL.
 		if ref_len == 1 and alt_max == 1:
 			cate = 'SNV'
 		else:
 			cate = 'INDEL'
 		outline = row.V1 + '\t' + str(StartPos) + '\t' + str(EndPos) + '\t' + str(row.V2) + '\t' + cate + '\n'
 		outFile.write(outline)








--- a/codescripts/how_many_samples.py
+++ b/codescripts/how_many_samples.py
@@ -0,0 +1,50 @@
 import pandas as pd
 import sys, argparse, os
 from operator import itemgetter

 parser = argparse.ArgumentParser(description="This script is to get how many samples")

 parser.add_argument('-sample', '--sample', type=str, help='quartet_sample',  required=True)
 parser.add_argument('-rep', '--rep', type=str, help='quartet_rep',  required=True)
 args = parser.parse_args()

 # Rename input:
 sample = args.sample
 rep = args.rep

 # First column of each headerless input table: sample names and their replicate ids.
 quartet_sample = pd.read_table(sample,header=None)
 quartet_sample = list(quartet_sample[0])
 quartet_rep = pd.read_table(rep,header=None)
 quartet_rep = quartet_rep[0]

 #tags
 sister_tag = 'false'
 quartet_tag = 'false'

 quartet_rep_unique = list(set(quartet_rep))

 # Samples belonging to the first replicate id of the de-duplicated list.
 # NOTE(review): only this one replicate is inspected and set ordering decides
 # which one it is - confirm this is intended.
 # Always a list, so len() counts samples even when there is exactly one
 # (itemgetter with a single index returned the bare sample name instead).
 single_batch_sample = [name for name, batch in zip(quartet_sample, quartet_rep) if batch == quartet_rep_unique[0]]

 num = len(single_batch_sample)
 members = set(single_batch_sample)
 if num == 2:
 	if members == set(['LCL5','LCL6']):
 		sister_tag = 'true'
 elif num == 3:
 	if ('LCL5' in members) and ('LCL6' in members):
 		sister_tag = 'true'
 elif num == 4:
 	if members == set(['LCL5','LCL6','LCL7','LCL8']):
 		quartet_tag = 'true'

 # Each tag goes to its own file; the context managers make sure both are closed.
 with open('sister_tag','w') as sister_outfile:
 	sister_outfile.write(sister_tag)
 with open('quartet_tag','w') as quartet_outfile:
 	quartet_outfile.write(quartet_tag)
--- a/codescripts/lcl5_all_called_variants.py
+++ b/codescripts/lcl5_all_called_variants.py
@@ -0,0 +1,42 @@
 from __future__ import division
 import sys, argparse, os
 import pandas as pd
 from collections import Counter

 # input arguments
 parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls")
 parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf',  required=True)


 args = parser.parse_args()
 vcf = args.vcf

 # LCL5 replicate columns read from every row of the merged table.
 lcl5_columns = (
 	'Quartet_DNA_BGI_SEQ2000_BGI_LCL5_1_20180518', 'Quartet_DNA_BGI_SEQ2000_BGI_LCL5_2_20180530', 'Quartet_DNA_BGI_SEQ2000_BGI_LCL5_3_20180530',
 	'Quartet_DNA_BGI_T7_WGE_LCL5_1_20191105', 'Quartet_DNA_BGI_T7_WGE_LCL5_2_20191105', 'Quartet_DNA_BGI_T7_WGE_LCL5_3_20191105',
 	'Quartet_DNA_ILM_Nova_ARD_LCL5_1_20181108', 'Quartet_DNA_ILM_Nova_ARD_LCL5_2_20181108', 'Quartet_DNA_ILM_Nova_ARD_LCL5_3_20181108',
 	'Quartet_DNA_ILM_Nova_ARD_LCL5_4_20190111', 'Quartet_DNA_ILM_Nova_ARD_LCL5_5_20190111', 'Quartet_DNA_ILM_Nova_ARD_LCL5_6_20190111',
 	'Quartet_DNA_ILM_Nova_BRG_LCL5_1_20180930', 'Quartet_DNA_ILM_Nova_BRG_LCL5_2_20180930', 'Quartet_DNA_ILM_Nova_BRG_LCL5_3_20180930',
 	'Quartet_DNA_ILM_Nova_WUX_LCL5_1_20190917', 'Quartet_DNA_ILM_Nova_WUX_LCL5_2_20190917', 'Quartet_DNA_ILM_Nova_WUX_LCL5_3_20190917',
 	'Quartet_DNA_ILM_XTen_ARD_LCL5_1_20170403', 'Quartet_DNA_ILM_XTen_ARD_LCL5_2_20170403', 'Quartet_DNA_ILM_XTen_ARD_LCL5_3_20170403',
 	'Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170329', 'Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170329', 'Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170329',
 	'Quartet_DNA_ILM_XTen_WUX_LCL5_1_20170216', 'Quartet_DNA_ILM_XTen_WUX_LCL5_2_20170216', 'Quartet_DNA_ILM_XTen_WUX_LCL5_3_20170216',
 )

 # Both outputs are opened in one context manager so they are closed
 # (and flushed) even if a row raises.
 with open('LCL5_all_variants.txt','w') as lcl5_outfile, open('LCL5_filtered_variants.txt','w') as filtered_outfile:
 	vcf_dat = pd.read_table(vcf)

 	for row in vcf_dat.itertuples():
 		lcl5_list = [getattr(row, column) for column in lcl5_columns]
 		# Genotype is the text before the first ':'; './.' is counted as '0/0'.
 		lcl5_vcf_gt = [x.split(':')[0] for x in lcl5_list]
 		lcl5_gt=[item.replace('./.', '0/0') for item in lcl5_vcf_gt]
 		gt_dict = Counter(lcl5_gt)
 		highest_gt = gt_dict.most_common(1)
 		candidate_gt = highest_gt[0][0]
 		freq_gt = highest_gt[0][1]
 		# row._1 is the first table column as renamed by itertuples().
 		output = row._1 + '\t' + str(row.POS) + '\t' + '\t'.join(lcl5_gt) + '\n'
 		# Rows where every replicate is '0/0' go to the filtered file.
 		if (candidate_gt == '0/0') and (freq_gt == len(lcl5_columns)):
 			filtered_outfile.write(output)
 		else:
 			lcl5_outfile.write(output)


--- a/codescripts/linux_command.sh
+++ b/codescripts/linux_command.sh
@@ -0,0 +1,6 @@
 # Build one VCF-like body per sample from the filtered benchmark table:
 # columns 1, 2, '.', 35, the per-sample column pair (as below), then '.', '.', '.', 'GT'.
 # Lines containing '2_y' are dropped.
 awk '{print $1"\t"$2"\t"".""\t"$35"\t"$7"\t.\t.\t.\tGT\t"$6}' < benchmark.men.vote.diffbed.filtered | grep -v '2_y' > LCL5.body
 awk '{print $1"\t"$2"\t"".""\t"$35"\t"$15"\t.\t.\t.\tGT\t"$14}' < benchmark.men.vote.diffbed.filtered | grep -v '2_y' > LCL6.body
 awk '{print $1"\t"$2"\t"".""\t"$35"\t"$23"\t.\t.\t.\tGT\t"$22}' < benchmark.men.vote.diffbed.filtered | grep -v '2_y' > LCL7.body
 awk '{print $1"\t"$2"\t"".""\t"$35"\t"$31"\t.\t.\t.\tGT\t"$30}' < benchmark.men.vote.diffbed.filtered | grep -v '2_y' > LCL8.body

 # For every *txt file: keep rows whose 3rd and 4th fields are one character long,
 # drop lines containing '#', and write the occurrence count of each (field3, field4) pair.
 # "$i" is quoted so file names containing whitespace or glob characters are handled.
 for i in *txt; do awk '{ if ((length($3) == 1) && (length($4) == 1)) { print } }' < "$i" | grep -v '#' | cut -f3,4 | sort |uniq -c | sed 's/\s\+/\t/g' | cut -f2 > "$i.mut"; done
--- a/codescripts/merge_mendelian_vcfinfo.py
+++ b/codescripts/merge_mendelian_vcfinfo.py
@@ -0,0 +1,129 @@
 from __future__ import division
 import pandas as pd
 import sys, argparse, os
 import fileinput
 import re

 # Command-line interface: three required string options.
 parser = argparse.ArgumentParser(description="this script is to get final high confidence calls and information of all replicates")

 for short_flag, long_flag, help_text in (
 	('-vcfInfo', '--vcfInfo', 'The txt file of variants information, this file is named as prefix__variant_quality_location.txt'),
 	('-mendelianInfo', '--mendelianInfo', 'The merged mendelian information of all samples'),
 	('-sample', '--sample_name', 'which sample of quartet'),
 ):
 	parser.add_argument(short_flag, long_flag, type=str, help=help_text, required=True)


 # Expose the parsed options under the module-level names used below.
 args = parser.parse_args()
 vcfInfo, mendelianInfo, sample_name = args.vcfInfo, args.mendelianInfo, args.sample_name


 # Per-sample FORMAT layout written by this script: GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL

 # Meta-information lines written at the top of the output VCF.
 # TWINS/TRIO5/TRIO6 carry the values '1', '0' or '.', so they are declared
 # as String: Flag is not a valid Type for FORMAT fields in the VCF specification.
 vcf_header = '''##fileformat=VCFv4.2
 ##fileDate=20200331
 ##source=high_confidence_calls_intergration(choppy app)
 ##reference=GRCh38.d1.vd1
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##FORMAT=<ID=TWINS,Number=1,Type=String,Description="1 for sister consistent, 0 for sister different">
 ##FORMAT=<ID=TRIO5,Number=1,Type=String,Description="1 for LCL7, LCL8 and LCL5 mendelian consistent, 0 for family violation">
 ##FORMAT=<ID=TRIO6,Number=1,Type=String,Description="1 for LCL7, LCL8 and LCL6 mendelian consistent, 0 for family violation">
 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">
 ##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth">
 ##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency">
 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality">
 ##FORMAT=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
 ##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Mapping quality">
 ##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
 ##FORMAT=<ID=QUAL,Number=1,Type=Float,Description="variant quality">
 ##contig=<ID=chr1,length=248956422>
 ##contig=<ID=chr2,length=242193529>
 ##contig=<ID=chr3,length=198295559>
 ##contig=<ID=chr4,length=190214555>
 ##contig=<ID=chr5,length=181538259>
 ##contig=<ID=chr6,length=170805979>
 ##contig=<ID=chr7,length=159345973>
 ##contig=<ID=chr8,length=145138636>
 ##contig=<ID=chr9,length=138394717>
 ##contig=<ID=chr10,length=133797422>
 ##contig=<ID=chr11,length=135086622>
 ##contig=<ID=chr12,length=133275309>
 ##contig=<ID=chr13,length=114364328>
 ##contig=<ID=chr14,length=107043718>
 ##contig=<ID=chr15,length=101991189>
 ##contig=<ID=chr16,length=90338345>
 ##contig=<ID=chr17,length=83257441>
 ##contig=<ID=chr18,length=80373285>
 ##contig=<ID=chr19,length=58617616>
 ##contig=<ID=chr20,length=64444167>
 ##contig=<ID=chr21,length=46709983>
 ##contig=<ID=chr22,length=50818468>
 ##contig=<ID=chrX,length=156040895>
 '''

 # Output VCF: meta lines followed by the single-sample column line.
 file_name = sample_name + '_mendelian_vcfInfo.vcf'
 outfile = open(file_name,'w')

 outputcolumn = '\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', sample_name]) + '\n'
 outfile.writelines([vcf_header, outputcolumn])

 # Input tables.
 vcf_info = pd.read_table(vcfInfo)
 mendelian_info = pd.read_table(mendelianInfo)

 # Outer join on chromosome and position; cells missing on either side become '.'.
 merge_keys = ['#CHROM','POS']
 merged_df = vcf_info.merge(mendelian_info, how='outer', left_on=merge_keys, right_on=merge_keys).fillna('.')

 #
 def parse_INFO(info):
 	"""Parse a VCF INFO string ('K1=V1;K2=V2;FLAG') into a dict of strings.

 	Parameters:
 		info: semicolon-separated INFO text; surrounding whitespace is ignored.

 	Returns:
 		dict mapping each key to its value. Keys without '=' (flags) and the
 		key 'DB' map to '1'. Values keep everything after the first '=', so a
 		value that itself contains '=' is not truncated. Empty tokens (e.g.
 		from a trailing ';' or an empty string) are skipped. When a key
 		repeats, the last occurrence wins.
 	"""
 	infoDict = {}
 	for entry in info.strip().split(';'):
 		if not entry:
 			continue
 		key, separator, value = entry.partition('=')
 		if key == 'DB' or not separator:
 			# Flags carry no value; represent presence as '1'.
 			infoDict[key] = '1'
 		else:
 			infoDict[key] = value
 	return infoDict
 #
 # Write one VCF record per merged row.
 for row in merged_df.itertuples():
 	# Tuple element 10 is split on ':' into [0] raw GT, [1] comma-separated
 	# read counts, [2] depth and [3] genotype quality, as used below.
 	FORMAT_x = row[10].split(':')
 	ALT = int(FORMAT_x[1].split(',')[1])
 	depth = FORMAT_x[2]
 	# Allele frequency is undefined ('.') at zero depth.
 	AF = round(ALT/int(depth),2) if int(depth) != 0 else '.'
 	INFO_x = parse_INFO(row.INFO_x)
 	if depth == '0':
 		INFO_x['QD'] = '.'
 	# DP:ALT:AF:GQ:QD:MQ:FS:QUAL, shared by both record layouts.
 	quality_fields = [depth, str(ALT), str(AF), FORMAT_x[3], INFO_x['QD'], INFO_x['MQ'], INFO_x['FS'], str(row.QUAL_x)]
 	if row[18] != '.':
 		# Tuple element 18 supplies the leading GT:TWINS:TRIO5:TRIO6 part;
 		# REF/ALT come from the right-hand (_y) table.
 		FORMAT = ':'.join([row[18]] + quality_fields)
 		outline = '\t'.join([row._1, str(row.POS), row.ID_x, row.REF_y, row.ALT_y, '.', '.', '.', 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL', FORMAT]) + '\n'
 	else:
 		# Element 18 is missing: placeholders lead the record and the raw
 		# genotype is appended as rawGT; REF/ALT come from the left-hand (_x) table.
 		FORMAT = ':'.join(['.:.:.:.'] + quality_fields + [FORMAT_x[0]])
 		outline = '\t'.join([row._1, str(row.POS), row.ID_x, row.REF_x, row.ALT_x, '.', '.', '.', 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT', FORMAT]) + '\n'
 	outfile.write(outline)
--- a/codescripts/merge_two_family.py
+++ b/codescripts/merge_two_family.py
@@ -0,0 +1,71 @@
 from __future__ import division
 import pandas as pd
 import sys, argparse, os
 import fileinput
 import re

 # Command-line interface: three required string options.
 parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information")

 for short_flag, long_flag, help_text in (
 	('-LCL5', '--LCL5', 'LCL5 family info'),
 	('-LCL6', '--LCL6', 'LCL6 family info'),
 	('-family', '--family', 'family name'),
 ):
 	parser.add_argument(short_flag, long_flag, type=str, help=help_text, required=True)


 args = parser.parse_args()
 lcl5, lcl6, family = args.LCL5, args.LCL6, args.family


 # Output file: "<family>.txt".
 family_name = family + '.txt'

 family_file = open(family_name,'w')

 # Input tables, outer-joined on chromosome and position.
 lcl5_dat = pd.read_table(lcl5)
 lcl6_dat = pd.read_table(lcl6)

 merge_keys = ['#CHROM','POS']
 merged_df = lcl5_dat.merge(lcl6_dat, how='outer', left_on=merge_keys, right_on=merge_keys)

 def alt_seq(alt, genotype):
 	"""Translate a numeric genotype into allele sequences taken from `alt`.

 	'./.' and '0/0' are returned unchanged. Otherwise `alt` is split on ','
 	and each '/'-separated allele number N is replaced by entry N-1 of that
 	list; a leading '0' allele is kept as the literal '0'.
 	"""
 	if genotype in ('./.', '0/0'):
 		return genotype
 	alt_alleles = alt.split(',')
 	numbers = genotype.split('/')
 	if numbers[0] == '0':
 		return '0/' + alt_alleles[int(numbers[1]) - 1]
 	first_seq = alt_alleles[int(numbers[0]) - 1]
 	second_seq = alt_alleles[int(numbers[1]) - 1]
 	return first_seq + '/' + second_seq

 # One output line per merged position: "<sister>:<INFO_x value>:<INFO_y value>".
 for row in merged_df.itertuples():
 	info_5, info_6 = row.INFO_x, row.INFO_y
 	missing_5 = pd.isnull(info_5) == True
 	missing_6 = pd.isnull(info_6) == True

 	# The two children are compared by allele sequence (multi-allele correction),
 	# and only when both sides have an INFO value.
 	if missing_5 or missing_6:
 		mendelian = '.'
 	elif alt_seq(row.ALT_x, row.CHILD_x) == alt_seq(row.ALT_y, row.CHILD_y):
 		mendelian = '1'
 	else:
 		mendelian = '0'

 	# Append the text after the first '=' of each INFO value, or '.' when missing.
 	trio_5 = '.' if missing_5 else info_5.split('=')[1]
 	trio_6 = '.' if missing_6 else info_6.split('=')[1]
 	mendelian = mendelian + ':' + trio_5 + ':' + trio_6

 	outline = row._1 + '\t' + str(row.POS) + '\t' + mendelian + '\n'
 	family_file.write(outline)
--- a/codescripts/merge_two_family_with_genotype.py
+++ b/codescripts/merge_two_family_with_genotype.py
@@ -0,0 +1,115 @@
 from __future__ import division
 import pandas as pd
 import sys, argparse, os
 import fileinput
 import re

 # input arguments
 # Command line: the two per-trio tables (LCL5 and LCL6), a genotype table for
 # the four family members, and a family name used as the output prefix.
 parser = argparse.ArgumentParser(description="this script is to extract mendelian concordance information")

 parser.add_argument('-LCL5', '--LCL5', type=str, help='LCL5 family info',  required=True)
 parser.add_argument('-LCL6', '--LCL6', type=str, help='LCL6 family info',  required=True)
 parser.add_argument('-genotype', '--genotype', type=str, help='Genotype information of a set of four family members',  required=True)
 parser.add_argument('-family', '--family', type=str, help='family name',  required=True)


 args = parser.parse_args()
 lcl5 = args.LCL5
 lcl6 = args.LCL6
 genotype = args.genotype
 family = args.family


 # output file
 # Per-site details go to "<family>.txt", the two summary ratios to
 # "<family>.summary.txt"; both are created in the working directory.
 family_name = family + '.txt'

 family_file = open(family_name,'w')

 summary_name = family + '.summary.txt'

 summary_file = open(summary_name,'w')

 # input files
 # All three inputs are tab-separated tables with a header row.
 lcl5_dat = pd.read_table(lcl5)
 lcl6_dat = pd.read_table(lcl6)
 genotype_dat = pd.read_table(genotype)
 # Two successive outer joins on chromosome and position: sites missing from
 # any table are kept and padded with NaN.
 merged_df = pd.merge(lcl5_dat, lcl6_dat,  how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])
 merged_genotype_df = pd.merge(merged_df, genotype_dat,  how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])

 # Columns are picked by position and then renamed, so this depends on the
 # exact column count and order of the three input tables.
 # NOTE(review): assumes columns 7 and 17 are the INFO fields of the LCL5 and
 # LCL6 tables and 23-32 come from the genotype table - confirm against inputs.
 merged_genotype_df_sub = merged_genotype_df.iloc[:,[0,1,23,24,29,30,31,32,7,17]]
 merged_genotype_df_sub.columns = ['CHROM', 'POS', 'REF', 'ALT','LCL5','LCL6','LCL7','LCL8', 'TRIO5', 'TRIO6']

 # Counters filled by the per-site loop below.
 sister_same = 0
 sister_diff = 0
 family_all = 0
 family_mendelian = 0

 # genotypes that carry no variant evidence (no call / homozygous reference)
 non_variant = ('./.', '0/0')

 def trio_tag(child, parent_a, parent_b, trio_info):
 	"""Return the tag of one trio: 'noInfo', 'Ref', 'unVBT' or the INFO value."""
 	if child == parent_a == parent_b == './.':
 		return 'noInfo'
 	if child == parent_a == parent_b == '0/0':
 		return 'Ref'
 	if pd.isnull(trio_info):
 		# the site is missing from this trio's table
 		return 'unVBT'
 	# INFO is expected to look like "KEY=value"; only the value is kept
 	return trio_info.split('=')[1]

 def safe_ratio(numerator, denominator):
 	"""Return numerator/denominator, or 'NA' when nothing was counted."""
 	if denominator == 0:
 		return 'NA'
 	return float(numerator) / denominator

 for row in merged_genotype_df_sub.itertuples():
 	# sister: compare the genotypes of LCL5 and LCL6
 	if row.LCL5 == row.LCL6:
 		if row.LCL5 == './.':
 			sister_tag = 'noInfo'
 			sister_count = "no"
 		elif row.LCL5 == '0/0':
 			sister_tag = 'Ref'
 			sister_count = "no"
 		else:
 			sister_tag = '1'
 			sister_count = "yes_same"
 	else:
 		sister_tag = '0'
 		# a no-call against a homozygous reference is not counted as a difference
 		if row.LCL5 in non_variant and row.LCL6 in non_variant:
 			sister_count = "no"
 		else:
 			sister_count = "yes_diff"
 	if sister_count == 'yes_same':
 		sister_same += 1
 	elif sister_count == 'yes_diff':
 		sister_diff += 1
 	# family trio5 and trio6 share both parents (LCL7, LCL8)
 	mendelian = ':'.join([sister_tag,
 		trio_tag(row.LCL5, row.LCL7, row.LCL8, row.TRIO5),
 		trio_tag(row.LCL6, row.LCL7, row.LCL8, row.TRIO6)])
 	# not count into family: no member shows a variant at this site
 	if row.LCL5 in non_variant and row.LCL6 in non_variant and row.LCL7 in non_variant and row.LCL8 in non_variant:
 		mendelian_count = "no"
 	else:
 		mendelian_count = "yes"
 	outline = row.CHROM + '\t' + str(row.POS) + '\t' + row.REF + '\t' + row.ALT + '\t' + row.LCL5 + '\t' + row.LCL6 + '\t' + row.LCL7 + '\t' + row.LCL8 + '\t' + str(row.TRIO5) + '\t' + str(row.TRIO6) + '\t' + str(mendelian) + '\t' + str(mendelian_count) + '\t' + str(sister_count) + '\n'
 	family_file.write(outline)
 	if mendelian_count == 'yes':
 		family_all += 1
 	if mendelian in ('1:1:1', 'Ref:1:1'):
 		family_mendelian += 1

 family_file.close()

 # An input without any counted site used to abort here with
 # ZeroDivisionError and leave the summary file empty; report 'NA' instead.
 sister = safe_ratio(sister_same, sister_same + sister_diff)
 quartet = safe_ratio(family_mendelian, family_all)
 outcolumn =  'Family\tReproducibility_D5_D6\tMendelian_Concordance_Quartet\n'
 outResult = family + '\t' + str(sister) + '\t' + str(quartet) + '\n'
 summary_file.write(outcolumn)
 summary_file.write(outResult)
 summary_file.close()



--- a/codescripts/oneClass.py
+++ b/codescripts/oneClass.py
@@ -0,0 +1,109 @@
 # import modules
 import numpy as np
 import pandas as pd
 from sklearn import svm
 from sklearn import preprocessing
 import sys, argparse, os
 from vcf2bed import position_to_bed,padding_region



 # Command line: a training table, a testing table and a sample name that is
 # used as the prefix of every output file.
 parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome")

 parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets',  required=True)
 parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets',  required=True)
 parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name',  required=True)

 args = parser.parse_args()

 # Rename input:
 train_input = args.trainDataset
 test_input = args.testDataset
 sample_name = args.sampleName

 # Chromosomes that are classified one by one, and the feature columns given
 # to the classifier. The "homo" feature set leaves out the three *RankSum
 # columns that the "heter" set contains.
 chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY']
 feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af']
 feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af']


 # import datasets sepearate the records with or without BaseQRankSum annotation, etc.
 def load_dat(dat_file_name):
    """Load a feature table and split it by BaseQRankSum availability.

    Parameters:
        dat_file_name: path (or file-like object) of a tab-separated table
            with at least the columns DB, DP, AltDP, RefDP and BaseQRankSum.

    Returns:
        (homo_rows, heter_rows): the records without and with a BaseQRankSum
        value. Records with DP == 0 are dropped, missing DB is set to 0 and
        an 'af' column (AltDP / (AltDP + RefDP)) is added.
    """
    dat = pd.read_table(dat_file_name)
    dat['DB'] = dat['DB'].fillna(0)
    # Take an explicit copy: adding a column to a boolean-filtered slice is
    # a chained assignment (SettingWithCopyWarning).
    dat = dat[dat['DP'] != 0].copy()
    dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP'])
    no_rank_sum = dat['BaseQRankSum'].isnull()
    homo_rows = dat[no_rank_sum]
    heter_rows = dat[~no_rank_sum]
    return homo_rows,heter_rows


 # Split both datasets into records without / with BaseQRankSum annotation.
 train_homo,train_heter = load_dat(train_input)
 test_homo,test_heter = load_dat(test_input)
 # 'auto_deprecated' was only a transitional default that newer scikit-learn
 # releases reject; 'auto' (1 / n_features) is the kernel coefficient it
 # stood for, so the model is unchanged.
 clf = svm.OneClassSVM(nu=0.05,kernel='rbf', gamma='auto',cache_size=500)

 def prepare_dat(train_dat,test_dat,feature_cols,chromo):
 	"""Select one chromosome and standardize its feature columns.

 	Returns the unscaled test rows of `chromo` together with the scaled
 	training features and the scaled test features (in that order).
 	"""
 	chr_test = test_dat[test_dat['chromo'] == chromo]
 	train_features = train_dat[train_dat['chromo'] == chromo].loc[:,feature_cols]
 	test_features = chr_test.loc[:,feature_cols]
 	# NOTE(review): training and test features are each scaled with their
 	# own statistics, not with a scaler fitted on the training data.
 	scaled_train = preprocessing.scale(train_features)
 	scaled_test = preprocessing.scale(test_features)
 	return chr_test,scaled_train,scaled_test

 def oneclass(X_train,X_test,chr_test):
 	"""Fit the module-level classifier and split the test rows by prediction.

 	`clf` is fitted on X_train and used to predict X_test; the rows of
 	chr_test predicted as 1 are returned first, those predicted as -1 second.
 	"""
 	clf.fit(X_train)
 	labels = clf.predict(X_test)
 	return chr_test[labels == 1],chr_test[labels == -1]

 # Collect the per-chromosome results and concatenate once at the end:
 # DataFrame.append no longer exists in pandas 2.x and copied the whole
 # accumulated frame on every call. The empty seed frames keep the column
 # layout even when nothing is predicted.
 true_parts = [pd.DataFrame(columns=train_homo.columns)]
 false_parts = [pd.DataFrame(columns=train_homo.columns)]

 for chromo in chromosome:
 	# homo datasets
 	chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo)
 	test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo)
 	true_parts.append(test_true_homo)
 	false_parts.append(test_false_homo)
 	# heter datasets
 	chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo)
 	test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter)
 	true_parts.append(test_true_heter)
 	false_parts.append(test_false_heter)

 predicted_true = pd.concat(true_parts)
 predicted_false = pd.concat(false_parts)

 # Tab-separated tables of the records predicted as true / false.
 predicted_true_filename = sample_name + '_predicted_true.txt'
 predicted_false_filename = sample_name + '_predicted_false.txt'

 predicted_true.to_csv(predicted_true_filename,sep='\t',index=False)
 predicted_false.to_csv(predicted_false_filename,sep='\t',index=False)

 # output the bed file and padding bed region 50bp

 predicted_true_bed_filename = sample_name + '_predicted_true.bed'
 predicted_false_bed_filename = sample_name + '_predicted_false.bed'
 padding_filename = sample_name + '_padding.bed'

 # The with-statement closes (and flushes) all three files even when a row
 # fails to convert; the original handles were never closed.
 with open(predicted_true_bed_filename,'w') as predicted_true_bed, \
 		open(predicted_false_bed_filename,'w') as predicted_false_bed, \
 		open(padding_filename,'w') as padding:
 	# rejected records: their own interval plus the 50bp flanks on both sides
 	for index,row in predicted_false.iterrows():
 		chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
 		outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
 		predicted_false_bed.write(outline_pos)
 		chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50)
 		outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n'
 		outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n'
 		padding.write(outline_pad_1)
 		padding.write(outline_pad_2)

 	# accepted records: interval only
 	for index,row in predicted_true.iterrows():
 		chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
 		outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
 		predicted_true_bed.write(outline_pos)


--- a/codescripts/precision_recall
+++ b/codescripts/precision_recall
@@ -0,0 +1,25 @@
 Type	Filter	TRUTH.TOTAL	TRUTH.TP	TRUTH.FN	QUERY.TOTAL	QUERY.FP	QUERY.UNK	FP.gt	METRIC.Recall	METRIC.Precision	METRIC.Frac_NA	METRIC.F1_Score	TRUTH.TOTAL.TiTv_ratio	QUERY.TOTAL.TiTv_ratio	TRUTH.TOTAL.het_hom_ratio	QUERY.TOTAL.het_hom_ratio
 INDEL	ALL	611469	32000	579469	52892	10380	10361	5049	0.052333	0.755943	0.19589	0.097889			1.60450055781	0.506141804262
 SNP	ALL	3607071	270183	3336888	342022	52610	19197	32200	0.074904	0.837032	0.056128	0.137503	2.06510079715	2.31948838383	1.35160768181	0.585277660595
 INDEL	ALL	611469	32000	579469	52892	10380	10361	5049	0.052333	0.755943	0.19589	0.097889			1.60450055781	0.506141804262
 SNP	ALL	3607071	270183	3336888	342022	52610	19197	32200	0.074904	0.837032	0.056128	0.137503	2.06510079715	2.31948838383	1.35160768181	0.585277660595
 INDEL	ALL	611469	32000	579469	52892	10380	10361	5049	0.052333	0.755943	0.19589	0.097889			1.60450055781	0.506141804262
 SNP	ALL	3607071	270183	3336888	342022	52610	19197	32200	0.074904	0.837032	0.056128	0.137503	2.06510079715	2.31948838383	1.35160768181	0.585277660595
 INDEL	ALL	611469	27751	583718	45833	8555	9382	4372	0.045384	0.765301	0.2047	0.085687			1.60450055781	0.544685459037
 SNP	ALL	3607071	229603	3377468	285728	38719	17379	25945	0.063654	0.855714	0.060824	0.118493	2.06510079715	2.2934525799	1.35160768181	0.611069128478
 INDEL	ALL	611469	27751	583718	45833	8555	9382	4372	0.045384	0.765301	0.2047	0.085687			1.60450055781	0.544685459037
 SNP	ALL	3607071	229603	3377468	285728	38719	17379	25945	0.063654	0.855714	0.060824	0.118493	2.06510079715	2.2934525799	1.35160768181	0.611069128478
 INDEL	ALL	611469	27751	583718	45833	8555	9382	4372	0.045384	0.765301	0.2047	0.085687			1.60450055781	0.544685459037
 SNP	ALL	3607071	229603	3377468	285728	38719	17379	25945	0.063654	0.855714	0.060824	0.118493	2.06510079715	2.2934525799	1.35160768181	0.611069128478
 INDEL	ALL	602890	30755	572135	49871	9073	9927	4713	0.051013	0.772857	0.199054	0.095708			1.45794812615	0.537973508294
 SNP	ALL	3586918	261507	3325411	323937	43923	18478	30285	0.072906	0.856207	0.057042	0.13437	2.06425234942	2.30355959336	1.29577693834	0.580073379456
 INDEL	ALL	602890	30755	572135	49871	9073	9927	4713	0.051013	0.772857	0.199054	0.095708			1.45794812615	0.537973508294
 SNP	ALL	3586918	261507	3325411	323937	43923	18478	30285	0.072906	0.856207	0.057042	0.13437	2.06425234942	2.30355959336	1.29577693834	0.580073379456
 INDEL	ALL	602890	30755	572135	49871	9073	9927	4713	0.051013	0.772857	0.199054	0.095708			1.45794812615	0.537973508294
 SNP	ALL	3586918	261507	3325411	323937	43923	18478	30285	0.072906	0.856207	0.057042	0.13437	2.06425234942	2.30355959336	1.29577693834	0.580073379456
 INDEL	ALL	608169	27666	580503	45885	9139	8953	4193	0.045491	0.752545	0.195118	0.085795			1.50870570513	0.547063202057
 SNP	ALL	3604200	234173	3370027	307086	54292	18602	26830	0.064972	0.811802	0.060576	0.120315	2.06546454872	2.33046326017	1.34261182662	0.630876119451
 INDEL	ALL	608169	27666	580503	45885	9139	8953	4193	0.045491	0.752545	0.195118	0.085795			1.50870570513	0.547063202057
 SNP	ALL	3604200	234173	3370027	307086	54292	18602	26830	0.064972	0.811802	0.060576	0.120315	2.06546454872	2.33046326017	1.34261182662	0.630876119451
 INDEL	ALL	608169	27666	580503	45885	9139	8953	4193	0.045491	0.752545	0.195118	0.085795			1.50870570513	0.547063202057
 SNP	ALL	3604200	234173	3370027	307086	54292	18602	26830	0.064972	0.811802	0.060576	0.120315	2.06546454872	2.33046326017	1.34261182662	0.630876119451
--- a/codescripts/reformVCF.py
+++ b/codescripts/reformVCF.py
@@ -0,0 +1,144 @@
 # import modules
 import sys, argparse, os
 import fileinput
 import re

 # Command line: the merged family VCF and the family name that prefixes
 # every output file.
 parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style")

 parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation',  required=True)
 parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file',  required=True)

 args = parser.parse_args()

 # Rename input:
 inputFile = args.familyVCF
 family_name = args.familyName

 # output filename
 # One VCF per family member plus one combined "<family>.vcf"; all five are
 # opened for writing here and stay open while the input is processed.
 LCL5_name = family_name + '.LCL5.vcf'
 LCL5file = open(LCL5_name,'w')
 LCL6_name = family_name + '.LCL6.vcf'
 LCL6file = open(LCL6_name,'w')
 LCL7_name = family_name + '.LCL7.vcf'
 LCL7file = open(LCL7_name,'w')
 LCL8_name = family_name + '.LCL8.vcf'
 LCL8file = open(LCL8_name,'w')
 family_filename = family_name + '.vcf'
 familyfile = open(family_filename,'w')

 # Meta-information lines written at the top of every output VCF.
 # TWINS/TRIO5/TRIO6 carry a 0/1 value per sample, so they are declared as
 # single integers (a FORMAT field cannot be a Flag), and the descriptions
 # follow what process() writes: 1 means consistent, 0 means inconsistent.
 vcfheader = '''##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="the same genotype between twin sister and mendelian consistent in 578 and 678">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##FORMAT=<ID=TWINS,Number=1,Type=Integer,Description="1 for sister consistent, 0 for sister inconsistent">
 ##FORMAT=<ID=TRIO5,Number=1,Type=Integer,Description="1 for trio consistent, 0 for trio inconsistent">
 ##FORMAT=<ID=TRIO6,Number=1,Type=Integer,Description="1 for trio consistent, 0 for trio inconsistent">
 ##contig=<ID=chr1,length=248956422>
 ##contig=<ID=chr2,length=242193529>
 ##contig=<ID=chr3,length=198295559>
 ##contig=<ID=chr4,length=190214555>
 ##contig=<ID=chr5,length=181538259>
 ##contig=<ID=chr6,length=170805979>
 ##contig=<ID=chr7,length=159345973>
 ##contig=<ID=chr8,length=145138636>
 ##contig=<ID=chr9,length=138394717>
 ##contig=<ID=chr10,length=133797422>
 ##contig=<ID=chr11,length=135086622>
 ##contig=<ID=chr12,length=133275309>
 ##contig=<ID=chr13,length=114364328>
 ##contig=<ID=chr14,length=107043718>
 ##contig=<ID=chr15,length=101991189>
 ##contig=<ID=chr16,length=90338345>
 ##contig=<ID=chr17,length=83257441>
 ##contig=<ID=chr18,length=80373285>
 ##contig=<ID=chr19,length=58617616>
 ##contig=<ID=chr20,length=64444167>
 ##contig=<ID=chr21,length=46709983>
 ##contig=<ID=chr22,length=50818468>
 ##contig=<ID=chrX,length=156040895>
 '''
 # write VCF
 # Every output starts with the shared meta lines followed by its own
 # column-header line: the nine fixed VCF columns plus the sample column(s).
 fixed_columns = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
 for sample_file, sample in ((LCL5file, 'LCL5'), (LCL6file, 'LCL6'), (LCL7file, 'LCL7'), (LCL8file, 'LCL8')):
 	sample_file.write(vcfheader)
 	# single-sample files name their column "<family>_<sample>"
 	sample_file.write(fixed_columns + family_name + '_' + sample + '\n')

 # the combined file carries all four members, in the order LCL5..LCL8
 familyfile.write(vcfheader)
 familyfile.write(fixed_columns + 'LCL5\tLCL6\tLCL7\tLCL8' + '\n')

 # reform VCF
 def process(oneLine):
 	"""Rewrite one record of the merged family VCF into the five outputs.

 	The tab-separated record is read by position: fields 9-11 and 12-14 are
 	two sets of sample columns, field 7 and field 15 hold the 'MD=...' tags.
 	A '.' in field 11 or 14 is replaced by '0/0' and the other two sample
 	columns of that set are copied from the opposite set. Each sample value
 	is written with ':TWINS:TRIO5:TRIO6' 0/1 tags appended, and FILTER is
 	'PASS' only when all three tags are 1.
 	"""
 	fields = oneLine.strip().split('\t')
 	# LCL6 uniq: fill the first sample set from the second one
 	if fields[11] == '.':
 		fields[9:12] = [fields[12], fields[13], '0/0']
 	# LCL5 uniq: fill the second sample set from the first one
 	if fields[14] == '.':
 		fields[12:15] = [fields[9], fields[10], '0/0']
 	# the three checks, in the order of the appended tags
 	sisters_agree = fields[11] == fields[14]
 	trio5_ok = fields[15] == 'MD=1'
 	trio6_ok = fields[7] == 'MD=1'
 	add_format = ''.join(':1' if ok else ':0' for ok in (sisters_agree, trio5_ok, trio6_ok))
 	# filter
 	fields[6] = 'PASS' if (sisters_agree and trio5_ok and trio6_ok) else '.'
 	# columns shared by all outputs; QUAL and INFO are blanked with '.'
 	site = '\t'.join([fields[0], fields[1], fields[2], fields[3], fields[4], '.', fields[6], '.', 'GT:TWINS:TRIO5:TRIO6'])
 	outputs = [(LCL5file, fields[14]), (LCL6file, fields[11]), (LCL7file, fields[10]), (LCL8file, fields[9])]
 	for sample_file, sample_value in outputs:
 		sample_file.write(site + '\t' + sample_value + add_format + '\n')
 	# output family: the same four sample columns side by side
 	familyfile.write(site + '\t' + '\t'.join(sample_value + add_format for _, sample_value in outputs) + '\n')


 for line in fileinput.input(inputFile):
 	# Skip header/meta lines, and blank lines that process() cannot index.
 	# A plain prefix test replaces the regex, whose '\#' was an invalid
 	# escape sequence in a non-raw string.
 	if line.startswith('#') or not line.strip():
 		continue
 	process(line)

 # flush and release all outputs explicitly instead of relying on interpreter exit
 for out_handle in (LCL5file, LCL6file, LCL7file, LCL8file, familyfile):
 	out_handle.close()
 		

--- a/codescripts/replicates_consensus.py
+++ b/codescripts/replicates_consensus.py
@@ -0,0 +1,227 @@
 from __future__ import division
 from glob import glob
 import sys, argparse, os
 import fileinput
 import re
 import pandas as pd
 from operator import itemgetter
 from collections import Counter
 from itertools import islice
 from numpy import *
 import statistics

 # input arguments
 # Command line: an output prefix and the merged multi-sample table.
 parser = argparse.ArgumentParser(description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls")

 parser.add_argument('-prefix', '--prefix', type=str, help='prefix of output file',  required=True)
 parser.add_argument('-vcf', '--vcf', type=str, help='merged multiple sample vcf',  required=True)


 args = parser.parse_args()
 prefix = args.prefix
 vcf = args.vcf

 # input files
 # Read as a tab-separated table whose first line is the column header.
 vcf_dat = pd.read_table(vcf)

 # all info
 # "<prefix>_all_summary.txt": one line per site with, for each of the four
 # samples, the consensus call, the number of detections and a same/diff flag.
 # NOTE(review): the tenth column name is spelled 'LCl6_same_diff' (lower-case
 # l); left untouched because downstream readers may rely on it.
 all_file_name = prefix + "_all_summary.txt"
 all_sample_outfile = open(all_file_name,'w')
 all_info_col = 'CHROM\tPOS\tREF\tALT\tLCL5_consensus_calls\tLCL5_detect_number\tLCL5_same_diff\tLCL6_consensus_calls\tLCL6_detect_number\tLCl6_same_diff\tLCL7_consensus_calls\tLCL7_detect_number\tLCL7_same_diff\tLCL8_consensus_calls\tLCL8_detect_number\tLCL8_same_diff\n'
 all_sample_outfile.write(all_info_col)

 # filtered info
 # Meta-information lines of the consensus VCF.
 # NOTE(review): an INFO field VOTED is declared here, but the records
 # written by the loop below leave the INFO column as '.'.
 vcf_header = '''##fileformat=VCFv4.2
 ##fileDate=20200501
 ##source=high_confidence_calls_intergration(choppy app)
 ##reference=GRCh38.d1.vd1
 ##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##contig=<ID=chr1,length=248956422>
 ##contig=<ID=chr2,length=242193529>
 ##contig=<ID=chr3,length=198295559>
 ##contig=<ID=chr4,length=190214555>
 ##contig=<ID=chr5,length=181538259>
 ##contig=<ID=chr6,length=170805979>
 ##contig=<ID=chr7,length=159345973>
 ##contig=<ID=chr8,length=145138636>
 ##contig=<ID=chr9,length=138394717>
 ##contig=<ID=chr10,length=133797422>
 ##contig=<ID=chr11,length=135086622>
 ##contig=<ID=chr12,length=133275309>
 ##contig=<ID=chr13,length=114364328>
 ##contig=<ID=chr14,length=107043718>
 ##contig=<ID=chr15,length=101991189>
 ##contig=<ID=chr16,length=90338345>
 ##contig=<ID=chr17,length=83257441>
 ##contig=<ID=chr18,length=80373285>
 ##contig=<ID=chr19,length=58617616>
 ##contig=<ID=chr20,length=64444167>
 ##contig=<ID=chr21,length=46709983>
 ##contig=<ID=chr22,length=50818468>
 ##contig=<ID=chrX,length=156040895>
 '''
 # "<prefix>_consensus.vcf": header first, records are added by the loop below.
 consensus_file_name = prefix + "_consensus.vcf"
 consensus_outfile = open(consensus_file_name,'w')
 consensus_col = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5_consensus_call\tLCL6_consensus_call\tLCL7_consensus_call\tLCL8_consensus_call\n'
 consensus_outfile.write(vcf_header)
 consensus_outfile.write(consensus_col)

 # function
 def decide_by_rep(vcf_list):
 	"""Return the genotype shared by at least two replicates, else 'inconGT'.

 	Each entry is a ':'-separated sample field; only the part before the
 	first ':' (the genotype) is compared.
 	"""
 	genotype_votes = Counter(entry.split(':')[0] for entry in vcf_list)
 	top_genotype, top_votes = genotype_votes.most_common(1)[0]
 	return top_genotype if top_votes >= 2 else 'inconGT'

 def consensus_call(vcf_info_list):
 	"""Derive one sample's consensus genotype from its replicate calls.

 	Parameters:
 		vcf_info_list: the sample fields of all replicates, three consecutive
 			entries per batch. Batches starting at 0, 15, 18, 21 and 24 form
 			the PCR group; batches starting at 3, 6, 9 and 12 form the
 			PCR-free group.

 	Returns:
 		(consensus_call, detected_num, same_diff) where
 		- consensus_call is the genotype both groups agree on, 'notAgree'
 		  when neither group reaches an internal consensus, and
 		  'notConsensus' otherwise;
 		- detected_num is the number of replicates whose genotype is not
 		  './.' or '.';
 		- same_diff is 'same' when all detected genotypes are identical,
 		  'diff' otherwise (also when nothing was detected).

 	A second, unreachable elif chain used to follow the return statement; it
 	was not attached to any if and made the whole file a SyntaxError.
 	"""
 	def group_consensus(batch_starts, min_agree):
 		# one call per batch of three replicates, then a vote across batches
 		batch_calls = [decide_by_rep(vcf_info_list[start:start + 3]) for start in batch_starts]
 		top_call, top_count = Counter(batch_calls).most_common(1)[0]
 		return top_call if top_count >= min_agree else 'inconSequenceSite'

 	# pcr: SEQ2000, XTen_ARD, XTen_NVG, XTen_WUX, Nova_WUX - at least 4 of 5
 	pcr_consensus = group_consensus((0, 18, 21, 24, 15), 4)
 	# pcr-free: T7_WGE, Nova_ARD_1, Nova_ARD_2, Nova_BRG - at least 3 of 4
 	pcr_free_consensus = group_consensus((3, 6, 9, 12), 3)

 	gt = [x.split(':')[0].replace('./.','.') for x in vcf_info_list]
 	# counted against the actual number of replicates (27 in this data set)
 	detected_num = len(gt) - gt.count('.')
 	detected_genotypes = set(e for e in gt if e != '.')
 	same_diff = 'same' if len(detected_genotypes) == 1 else 'diff'

 	tag = ['inconGT','inconSequenceSite']
 	if (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag):
 		call = pcr_consensus
 	elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
 		call = 'notAgree'
 	else:
 		call = 'notConsensus'
 	return call, detected_num, same_diff


 # Sequencing runs as (platform and site, replicate numbers, run date). The
 # order is significant: consensus_call() addresses the batches by position.
 replicate_runs = (
 	('BGI_SEQ2000_BGI', (1,), '20180518'),
 	('BGI_SEQ2000_BGI', (2, 3), '20180530'),
 	('BGI_T7_WGE', (1, 2, 3), '20191105'),
 	('ILM_Nova_ARD', (1, 2, 3), '20181108'),
 	('ILM_Nova_ARD', (4, 5, 6), '20190111'),
 	('ILM_Nova_BRG', (1, 2, 3), '20180930'),
 	('ILM_Nova_WUX', (1, 2, 3), '20190917'),
 	('ILM_XTen_ARD', (1, 2, 3), '20170403'),
 	('ILM_XTen_NVG', (1, 2, 3), '20170329'),
 	('ILM_XTen_WUX', (1, 2, 3), '20170216'),
 )

 def replicate_calls(row, sample):
 	"""Return the 27 replicate fields of `sample` from one table row, in run order."""
 	return [getattr(row, 'Quartet_DNA_%s_%s_%d_%s' % (run, sample, replicate, date))
 			for run, replicates, date in replicate_runs
 			for replicate in replicates]

 for row in vcf_dat.itertuples():
 	# length: sites whose REF or longest ALT allele exceeds 50bp are skipped
 	alt_max = max(len(allele) for allele in row.ALT.split(','))
 	if (alt_max > 50) or (len(row.REF) > 50):
 		continue
 	# consensus per sample, in the column order of both output files
 	calls = []
 	summary_fields = []
 	for sample in ('LCL5', 'LCL6', 'LCL7', 'LCL8'):
 		sample_call, detected_num, same_diff = consensus_call(replicate_calls(row, sample))
 		calls.append(sample_call)
 		summary_fields.extend([sample_call, str(detected_num), same_diff])
 	# all data (itertuples() exposes the '#CHROM' column as _1)
 	all_output = '\t'.join([row._1, str(row.POS), row.REF, row.ALT] + summary_fields) + '\n'
 	all_sample_outfile.write(all_output)
 	# consensus vcf: drop sites with an unresolved sample and sites where
 	# every sample is './.' or '0/0'
 	# NOTE(review): 'notAgree' calls are not filtered here and end up in the
 	# GT columns - confirm this is intended.
 	if ('notConsensus' in calls) or set(calls).issubset(['0/0', './.']):
 		continue
 	consensus_output = '\t'.join([row._1, str(row.POS), '.', row.REF, row.ALT, '.', '.', '.', 'GT'] + calls) + '\n'
 	consensus_outfile.write(consensus_output)

 # flush and release both outputs explicitly instead of relying on interpreter exit
 all_sample_outfile.close()
 consensus_outfile.close()








--- a/codescripts/variants.calling.qc.txt
+++ b/codescripts/variants.calling.qc.txt
@@ -0,0 +1,13 @@
 Sample	SNV number	INDEL number	SNV query	INDEL query	SNV TP	INDEL TP	SNV FP	INDEL FP	SNV FN	INDEL FN	SNV precision	INDEL precision	SNV recall	INDEL recall	SNV F1	INDEL F1
 Quartet_DNA_ILM_XTen_NVG_LCL5_1_20170531	342022	52892	322825	42531	270215	32151	52610	10380	3336888	579469	0.84	0.76	0.07	0.05	0.14	0.1
 Quartet_DNA_ILM_XTen_NVG_LCL5_2_20170531	342022	52892	322825	42531	270215	32151	52610	10380	3336888	579469	0.84	0.76	0.07	0.05	0.14	0.1
 Quartet_DNA_ILM_XTen_NVG_LCL5_3_20170531	342022	52892	322825	42531	270215	32151	52610	10380	3336888	579469	0.84	0.76	0.07	0.05	0.14	0.1
 Quartet_DNA_ILM_XTen_NVG_LCL6_1_20170531	285728	45833	268349	36451	229630	27896	38719	8555	3377468	583718	0.86	0.77	0.06	0.05	0.12	0.09
 Quartet_DNA_ILM_XTen_NVG_LCL6_2_20170531	285728	45833	268349	36451	229630	27896	38719	8555	3377468	583718	0.86	0.77	0.06	0.05	0.12	0.09
 Quartet_DNA_ILM_XTen_NVG_LCL6_3_20170531	285728	45833	268349	36451	229630	27896	38719	8555	3377468	583718	0.86	0.77	0.06	0.05	0.12	0.09
 Quartet_DNA_ILM_XTen_NVG_LCL7_1_20170531	323937	49871	305459	39944	261536	30871	43923	9073	3325411	572135	0.86	0.77	0.07	0.05	0.13	0.1
 Quartet_DNA_ILM_XTen_NVG_LCL7_2_20170531	323937	49871	305459	39944	261536	30871	43923	9073	3325411	572135	0.86	0.77	0.07	0.05	0.13	0.1
 Quartet_DNA_ILM_XTen_NVG_LCL7_3_20170531	323937	49871	305459	39944	261536	30871	43923	9073	3325411	572135	0.86	0.77	0.07	0.05	0.13	0.1
 Quartet_DNA_ILM_XTen_NVG_LCL8_1_20170531	307086	45885	288484	36932	234192	27793	54292	9139	3370027	580503	0.81	0.75	0.06	0.05	0.12	0.09
 Quartet_DNA_ILM_XTen_NVG_LCL8_2_20170531	307086	45885	288484	36932	234192	27793	54292	9139	3370027	580503	0.81	0.75	0.06	0.05	0.12	0.09
 Quartet_DNA_ILM_XTen_NVG_LCL8_3_20170531	307086	45885	288484	36932	234192	27793	54292	9139	3370027	580503	0.81	0.75	0.06	0.05	0.12	0.09
--- a/codescripts/vcf2bed.py
+++ b/codescripts/vcf2bed.py
@@ -0,0 +1,36 @@
 import re

 def position_to_bed(chromo,pos,ref,alt):
     """Turn a VCF position into a BED-style (chromo, start, end) triple.
 
     The BED start is always the VCF position minus one. The BED end is:
       * the VCF position for a one-base ref with a one-base alt (SNV);
       * the VCF position + (len(ref) - 1) when ref is longer than one base
         and alt is a single base (deletion);
       * the VCF position + (len(alt) - 1) in every other case (insertions
         and any remaining allele combination).
 
     pos may be an int or a numeric string; it is converted with int().
     """
     ref_len = len(ref)
     alt_len = len(alt)
     if alt_len == 1 and ref_len > 1:
         # deletion: the interval spans the whole reference allele
         extra_bases = ref_len - 1
     else:
         # SNV (extra_bases == 0), insertion and everything else
         extra_bases = alt_len - 1
     vcf_pos = int(pos)
     return chromo, vcf_pos - 1, vcf_pos + extra_bases

 def padding_region(chromo,pos1,pos2,padding):
     """Return the two flanking windows around the interval pos1..pos2.
 
     The result is (chromo, pos1 - padding, pos1, pos2, pos2 + padding):
     a window of `padding` bases ending at pos1 and one starting at pos2.
     """
     left_window = (pos1 - padding, pos1)
     right_window = (pos2, pos2 + padding)
     return (chromo,) + left_window + right_window
--- a/codescripts/voted_by_vcfinfo_mendelianinfo.py
+++ b/codescripts/voted_by_vcfinfo_mendelianinfo.py
@@ -0,0 +1,295 @@
 from __future__ import division
 from glob import glob
 import sys, argparse, os
 import fileinput
 import re
 import pandas as pd
 from operator import itemgetter
 from collections import Counter
 from itertools import islice
 from numpy import *
 import statistics

 # Command-line interface: both options are required and read as strings.
 parser = argparse.ArgumentParser(
 	description="this script is to merge mendelian and vcfinfo, and extract high_confidence_calls")
 for option, option_help in (
 		('folder', 'directory that holds all the mendelian info'),
 		('vcf', 'merged multiple sample vcf')):
 	parser.add_argument('-' + option, '--' + option, type=str, help=option_help, required=True)
 
 args = parser.parse_args()
 folder, vcf = args.folder, args.vcf

 # Input tables: every '*.txt' file in the folder is read without a header and
 # indexed by its first two columns so the files can be placed side by side.
 folder = folder + '/*.txt'
 filenames = glob(folder)
 indexed_tables = [pd.read_table(path, header=None).set_index([0, 1]) for path in filenames]
 merged_mendelian = pd.concat(indexed_tables, axis=1).reset_index()
 
 # One column per input file, named after the file with '.txt' removed.
 family_name = [path.split('/')[-1].replace('.txt','') for path in filenames]
 merged_mendelian.columns = ['CHROM','POS'] + family_name
 
 # Outer-join with the multi-sample VCF table; missing cells become the
 # string 'nan' so later string handling sees a uniform type.
 vcf_dat = pd.read_table(vcf)
 merged_df = pd.merge(
 	merged_mendelian, vcf_dat,
 	how='outer', left_on=['CHROM','POS'], right_on=['#CHROM','POS']
 ).fillna('nan')

 # Meta-information ('##') lines written verbatim at the top of each
 # LCL*_voted.vcf output. The '#CHROM' column line is not part of this
 # string; it is written separately for each sample.
 vcf_header = '''##fileformat=VCFv4.2
 ##fileDate=20200501
 ##source=high_confidence_calls_intergration(choppy app)
 ##reference=GRCh38.d1.vd1
 ##INFO=<ID=VOTED,Number=1,Type=Integer,Description="Number mendelian consisitent votes">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Sum depth of all samples">
 ##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Sum alternative depth of all samples">
 ##contig=<ID=chr1,length=248956422>
 ##contig=<ID=chr2,length=242193529>
 ##contig=<ID=chr3,length=198295559>
 ##contig=<ID=chr4,length=190214555>
 ##contig=<ID=chr5,length=181538259>
 ##contig=<ID=chr6,length=170805979>
 ##contig=<ID=chr7,length=159345973>
 ##contig=<ID=chr8,length=145138636>
 ##contig=<ID=chr9,length=138394717>
 ##contig=<ID=chr10,length=133797422>
 ##contig=<ID=chr11,length=135086622>
 ##contig=<ID=chr12,length=133275309>
 ##contig=<ID=chr13,length=114364328>
 ##contig=<ID=chr14,length=107043718>
 ##contig=<ID=chr15,length=101991189>
 ##contig=<ID=chr16,length=90338345>
 ##contig=<ID=chr17,length=83257441>
 ##contig=<ID=chr18,length=80373285>
 ##contig=<ID=chr19,length=58617616>
 ##contig=<ID=chr20,length=64444167>
 ##contig=<ID=chr21,length=46709983>
 ##contig=<ID=chr22,length=50818468>
 ##contig=<ID=chrX,length=156040895>
 '''
 # output files: one voted VCF per Quartet sample, then the combined table
 benchmark_LCL5, benchmark_LCL6, benchmark_LCL7, benchmark_LCL8 = [
 	open(sample_id + '_voted.vcf','w') for sample_id in ('LCL5','LCL6','LCL7','LCL8')]
 
 all_sample_outfile = open('all_sample_information.txt','w')
 
 # write VCF: the column line is identical except for the sample column name
 vcf_fixed_columns = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'
 LCL5_col, LCL6_col, LCL7_col, LCL8_col = [
 	vcf_fixed_columns + sample_id + '_benchmark_calls\n' for sample_id in ('LCL5','LCL6','LCL7','LCL8')]
 
 for voted_vcf, column_line in (
 		(benchmark_LCL5, LCL5_col),
 		(benchmark_LCL6, LCL6_col),
 		(benchmark_LCL7, LCL7_col),
 		(benchmark_LCL8, LCL8_col)):
 	voted_vcf.write(vcf_header)
 	voted_vcf.write(column_line)

 # all info
 # Header of all_sample_information.txt: CHROM, POS, then the same eight
 # fields for each of LCL5..LCL8. It is generated from one field list so every
 # sample gets identically spelled column names; the hand-written header had a
 # stray space in front of 'LCL7_consensus_call'.
 all_info_fields = ['pcr_consensus','pcr_free_consensus','mendelian_num','consensus_call',
 					'consensus_alt_seq','alt','dp','detected_num']
 all_info_col = '\t'.join(
 	['CHROM','POS'] +
 	[sample_id + '_' + field for sample_id in ('LCL5','LCL6','LCL7','LCL8') for field in all_info_fields]
 ) + '\n'
 all_sample_outfile.write(all_info_col)

 # function
 def decide_by_rep(vcf_list,mendelian_list):
 	"""Vote on one genotype across the replicates of a single batch.
 
 	vcf_list holds one colon-separated sample field per replicate (the
 	genotype is the part before the first ':'); mendelian_list holds the
 	mendelian status string of the same replicates, in the same order.
 
 	Returns the shared genotype when '1:1:1' is the most common status with
 	at least two votes and at least two of those replicates agree on a
 	genotype other than './.'. Otherwise returns one of the tags:
 	  'noGTInfo'  - the agreed genotype is './.'
 	  'inconGT'   - the mendelian-consistent replicates disagree
 	  'noMenInfo' - 'nan' is the most common status (at least two votes)
 	  'inconMen'  - any other status pattern
 	"""
 	genotypes = [field.split(':')[0] for field in vcf_list]
 	# mendelian consistent?
 	top_status, status_votes = Counter(mendelian_list).most_common(1)[0]
 	if status_votes < 2:
 		return 'inconMen'
 	if top_status == 'nan':
 		return 'noMenInfo'
 	if top_status != '1:1:1':
 		return 'inconMen'
 	# only replicates that are themselves mendelian consistent may vote
 	voting_gt = [genotypes[idx] for idx, status in enumerate(mendelian_list) if status == '1:1:1']
 	top_gt, gt_votes = Counter(voting_gt).most_common(1)[0]
 	if gt_votes < 2:
 		return 'inconGT'
 	return 'noGTInfo' if top_gt == './.' else top_gt

 def consensus_call(vcf_info_list,mendelian_list,alt_seq):
 	"""Combine the per-batch votes of one sample into a final call for a site.
 
 	vcf_info_list  -- 27 colon-separated sample fields. Field 0 is read as
 	                  the genotype, field 1 as comma-separated allele depths
 	                  (the second value is summed as alternative depth) and
 	                  field 2 as the depth. Entries come in threes per batch,
 	                  in this order: SEQ2000, T7_WGE, Nova_ARD_1, Nova_ARD_2,
 	                  Nova_BRG, Nova_WUX, XTen_ARD, XTen_NVG, XTen_WUX.
 	mendelian_list -- the 27 mendelian status strings, in the same order.
 	alt_seq        -- ALT column of the site (comma-separated if multi-allelic).
 
 	Returns the tuple (pcr_consensus, pcr_free_consensus, mendelian_num,
 	consensus_call, consensus_alt_seq, ALL_ALT, ALL_DP, detected_num).
 	mendelian_num and consensus_alt_seq stay '.' unless both library groups
 	agree on a non-reference genotype.
 	"""
 	mendelian_num = '.'
 	final_call = '.'
 	consensus_alt_seq = '.'
 	# pcr: SEQ2000 and the three XTen batches; more than 2 of 4 must agree
 	SEQ2000 = decide_by_rep(vcf_info_list[0:3],mendelian_list[0:3])
 	XTen_ARD = decide_by_rep(vcf_info_list[18:21],mendelian_list[18:21])
 	XTen_NVG = decide_by_rep(vcf_info_list[21:24],mendelian_list[21:24])
 	XTen_WUX = decide_by_rep(vcf_info_list[24:27],mendelian_list[24:27])
 	pcr_sequence_site = [SEQ2000,XTen_ARD,XTen_NVG,XTen_WUX]
 	pcr_candidate_sequence, pcr_freq_sequence = Counter(pcr_sequence_site).most_common(1)[0]
 	if pcr_freq_sequence > 2:
 		pcr_consensus = pcr_candidate_sequence
 	else:
 		pcr_consensus = 'inconSequenceSite'
 	# pcr-free: T7 and the four Nova batches; more than 3 of 5 must agree
 	T7_WGE = decide_by_rep(vcf_info_list[3:6],mendelian_list[3:6])
 	Nova_ARD_1 = decide_by_rep(vcf_info_list[6:9],mendelian_list[6:9])
 	Nova_ARD_2 = decide_by_rep(vcf_info_list[9:12],mendelian_list[9:12])
 	Nova_BRG = decide_by_rep(vcf_info_list[12:15],mendelian_list[12:15])
 	Nova_WUX = decide_by_rep(vcf_info_list[15:18],mendelian_list[15:18])
 	sequence_site = [T7_WGE,Nova_ARD_1,Nova_ARD_2,Nova_BRG,Nova_WUX]
 	candidate_sequence, freq_sequence = Counter(sequence_site).most_common(1)[0]
 	if freq_sequence > 3:
 		pcr_free_consensus = candidate_sequence
 	else:
 		pcr_free_consensus = 'inconSequenceSite'
 	# net alt, dp
 	# alt
 	AD = [x.split(':')[1] for x in vcf_info_list]
 	ALT = [x.split(',')[1] for x in AD]
 	ALT = [int(x) for x in ALT]
 	ALL_ALT = sum(ALT)
 	# dp
 	DP = [x.split(':')[2] for x in vcf_info_list]
 	DP = [int(x) for x in DP]
 	ALL_DP = sum(DP)
 	# detected number: entries whose genotype is neither '0/0' nor './.'
 	gt = [x.split(':')[0] for x in vcf_info_list]
 	gt = [x.replace('0/0','.') for x in gt]
 	gt = [x.replace('./.','.') for x in gt]
 	detected_num = 27 - gt.count('.')
 	# decide consensus calls
 	tag = ['inconGT','noMenInfo','inconMen','inconSequenceSite','noGTInfo']
 	# a group is informative when it voted a real genotype (not a tag, not './.')
 	pcr_informative = (pcr_consensus not in tag) and (pcr_consensus != './.')
 	pcr_free_informative = (pcr_free_consensus not in tag) and (pcr_free_consensus != './.')
 	if (pcr_consensus != '0/0') and (pcr_consensus == pcr_free_consensus) and (pcr_consensus not in tag):
 		final_call = pcr_consensus
 		gt = [x.split(':')[0] for x in vcf_info_list]
 		indices = [i for i, x in enumerate(gt) if x == final_call]
 		# Count the supporting entries that are mendelian consistent. A list is
 		# used instead of itemgetter(*indices), which returns a bare string for
 		# a single index and raises when there is none.
 		mendelian_num = [mendelian_list[i] for i in indices].count('1:1:1')
 		# Delete multiple alternative genotype to necessary expression
 		alt_gt = alt_seq.split(',')
 		if len(alt_gt) > 1:
 			allele1 = final_call.split('/')[0]
 			allele2 = final_call.split('/')[1]
 			if allele1 == '0':
 				# het with reference: keep only the called alternative
 				consensus_alt_seq = alt_gt[int(allele2) - 1]
 				final_call = '0/1'
 			else:
 				allele1_seq = alt_gt[int(allele1) - 1]
 				allele2_seq = alt_gt[int(allele2) - 1]
 				if int(allele1) > int(allele2):
 					consensus_alt_seq = allele2_seq + ',' + allele1_seq
 					final_call = '1/2'
 				elif int(allele1) < int(allele2):
 					consensus_alt_seq = allele1_seq + ',' + allele2_seq
 					final_call = '1/2'
 				else:
 					consensus_alt_seq = allele1_seq
 					final_call = '1/1'
 		else:
 			consensus_alt_seq = alt_seq
 	elif (pcr_consensus in tag) and (pcr_free_consensus in tag):
 		final_call = 'filtered'
 	elif (not pcr_informative) and pcr_free_informative:
 		final_call = 'pcr-free-speicifc'
 	elif pcr_informative and (not pcr_free_informative):
 		# Mirror of the branch above. The earlier condition required
 		# pcr_free_consensus to be in tag AND equal to './.', which can never
 		# hold, so this label was unreachable.
 		final_call = 'pcr-speicifc'
 	elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
 		final_call = '0/0'
 	else:
 		final_call = 'filtered'
 	return pcr_consensus, pcr_free_consensus, mendelian_num, final_call, consensus_alt_seq, ALL_ALT, ALL_DP, detected_num


 # Column-name parts of the nine batches x three replicates, in the order
 # consensus_call() slices them: (batch prefix, replicate number, date).
 # Mendelian columns are named '<prefix>_<rep>_<date>' and per-sample VCF
 # columns '<prefix>_<sample>_<rep>_<date>'.
 batch_replicates = [
 	('Quartet_DNA_BGI_SEQ2000_BGI','1','20180518'),('Quartet_DNA_BGI_SEQ2000_BGI','2','20180530'),('Quartet_DNA_BGI_SEQ2000_BGI','3','20180530'),
 	('Quartet_DNA_BGI_T7_WGE','1','20191105'),('Quartet_DNA_BGI_T7_WGE','2','20191105'),('Quartet_DNA_BGI_T7_WGE','3','20191105'),
 	('Quartet_DNA_ILM_Nova_ARD','1','20181108'),('Quartet_DNA_ILM_Nova_ARD','2','20181108'),('Quartet_DNA_ILM_Nova_ARD','3','20181108'),
 	('Quartet_DNA_ILM_Nova_ARD','4','20190111'),('Quartet_DNA_ILM_Nova_ARD','5','20190111'),('Quartet_DNA_ILM_Nova_ARD','6','20190111'),
 	('Quartet_DNA_ILM_Nova_BRG','1','20180930'),('Quartet_DNA_ILM_Nova_BRG','2','20180930'),('Quartet_DNA_ILM_Nova_BRG','3','20180930'),
 	('Quartet_DNA_ILM_Nova_WUX','1','20190917'),('Quartet_DNA_ILM_Nova_WUX','2','20190917'),('Quartet_DNA_ILM_Nova_WUX','3','20190917'),
 	('Quartet_DNA_ILM_XTen_ARD','1','20170403'),('Quartet_DNA_ILM_XTen_ARD','2','20170403'),('Quartet_DNA_ILM_XTen_ARD','3','20170403'),
 	('Quartet_DNA_ILM_XTen_NVG','1','20170329'),('Quartet_DNA_ILM_XTen_NVG','2','20170329'),('Quartet_DNA_ILM_XTen_NVG','3','20170329'),
 	('Quartet_DNA_ILM_XTen_WUX','1','20170216'),('Quartet_DNA_ILM_XTen_WUX','2','20170216'),('Quartet_DNA_ILM_XTen_WUX','3','20170216'),
 ]
 quartet_samples = ('LCL5','LCL6','LCL7','LCL8')
 mendelian_columns = ['_'.join((prefix, rep, date)) for prefix, rep, date in batch_replicates]
 sample_columns = {}
 for sample_id in quartet_samples:
 	sample_columns[sample_id] = ['_'.join((prefix, sample_id, rep, date)) for prefix, rep, date in batch_replicates]
 benchmark_files = {'LCL5': benchmark_LCL5, 'LCL6': benchmark_LCL6, 'LCL7': benchmark_LCL7, 'LCL8': benchmark_LCL8}
 
 try:
 	for row in merged_df.itertuples():
 		mendelian_list = [getattr(row, column) for column in mendelian_columns]
 		# all data: CHROM, POS, then eight fields per sample
 		all_fields = [row.CHROM, str(row.POS)]
 		for sample_id in quartet_samples:
 			sample_list = [getattr(row, column) for column in sample_columns[sample_id]]
 			pcr_vote, pcr_free_vote, mendelian_num, voted_call, voted_alt_seq, alt_depth, depth, detected_num = consensus_call(sample_list,mendelian_list,row.ALT)
 			# mendelian_num is only set when both library groups agreed on a
 			# non-reference genotype; only those sites enter the voted VCF
 			if mendelian_num != '.':
 				vcf_record = row.CHROM + '\t' + str(row.POS) + '\t' + '.' + '\t' + row.REF + '\t' + voted_alt_seq + '\t' + '.' + '\t' + '.' + '\t' +'VOTED=' + str(mendelian_num) + '\t' + 'GT:ALT:DP' + '\t' + voted_call + ':' + str(alt_depth) + ':' + str(depth) +  '\n'
 				benchmark_files[sample_id].write(vcf_record)
 			all_fields.extend([pcr_vote, pcr_free_vote, str(mendelian_num), voted_call, voted_alt_seq, str(alt_depth), str(depth), str(detected_num)])
 		all_sample_outfile.write('\t'.join(all_fields) + '\n')
 finally:
 	# close (and thereby flush) every output, even if a row fails to parse
 	for out_handle in (benchmark_LCL5, benchmark_LCL6, benchmark_LCL7, benchmark_LCL8, all_sample_outfile):
 		out_handle.close()

--- a/inputs
+++ b/inputs
@@ -0,0 +1,24 @@
 {
  "{{ project_name }}.benchmarking_dir": "oss://pgx-result/renluyao/manuscript_v3.0/reference_datasets_v202103/",
  "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
  "{{ project_name }}.BENCHMARKdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-hap:latest",
  "{{ project_name }}.LCL6_1": "{{ LCL6_1 }}",
  "{{ project_name }}.LCL5_3": "{{ LCL5_3 }}",
  "{{ project_name }}.LCL8_2": "{{ LCL8_2 }}",
  "{{ project_name }}.disk_size": "500",
  "{{ project_name }}.LCL8_1": "{{ LCL8_1 }}",
  "{{ project_name }}.LCL6_3": "{{ LCL6_3 }}",
  "{{ project_name }}.project": "{{ project }}",
  "{{ project_name }}.LCL7_3": "{{ LCL7_3 }}",
  "{{ project_name }}.LCL5_1": "{{ LCL5_1 }}",
  "{{ project_name }}.SMALLcluster_config": "OnDemand bcs.ps.g.xlarge img-ubuntu-vpc",
  "{{ project_name }}.LCL6_2": "{{ LCL6_2 }}",
  "{{ project_name }}.BIGcluster_config": "OnDemand bcs.ps.g.2xlarge img-ubuntu-vpc",
  "{{ project_name }}.LCL7_2": "{{ LCL7_2 }}",
  "{{ project_name }}.LCL5_2": "{{ LCL5_2 }}",
  "{{ project_name }}.LCL7_1": "{{ LCL7_1 }}",
  "{{ project_name }}.MENDELIANdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
  "{{ project_name }}.LCL8_3": "{{ LCL8_3 }}",
  "{{ project_name }}.DIYdocker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call_manuscript:v1.4",
  "{{ project_name }}.ref_dir": "oss://pgx-reference-data/GRCh38.d1.vd1/"
 }
--- a/tasks/.DS_Store
+++ b/tasks/.DS_Store
--- a/tasks/benchmark.wdl
+++ b/tasks/benchmark.wdl
@@ -0,0 +1,81 @@
 # Benchmark one Quartet sample VCF with hap.py against the high-confidence
 # call set of the matching sample, then rebuild the VCF with a column line
 # whose sample column is named LCL5/LCL6/LCL7/LCL8, bgzip it and index it
 # with rtg. The sample is chosen by matching LCL5..LCL8 in the VCF basename;
 # any other name only prints a message, so the declared outputs will be missing.
 task benchmark {
 	File vcf
 	File benchmarking_dir
 	File ref_dir
 	String sample = basename(vcf,".vcf")
 	String fasta
 	String docker
 	String cluster_config
 	String disk_size
 
 
 	command <<<
 		set -o pipefail
 		set -e
 		nt=$(nproc)
 		mkdir -p /cromwell_root/tmp
 		cp -r ${ref_dir} /cromwell_root/tmp/
 		cp -r ${benchmarking_dir} /cromwell_root/tmp/
 
 		# NOTE(review): hap.py is also given the reference explicitly with -r below; confirm this path exists
 		export HGREF=/cromwell_root/tmp/reference_data/GRCh38.d1.vd1.fa
 
 		echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL5" > LCL5_name
 		echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL6" > LCL6_name
 		echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL7" > LCL7_name
 		echo -e "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tLCL8" > LCL8_name
 
 		# The grep patterns are anchored to the start of the line so that only
 		# real header lines are selected; an unanchored pattern would also
 		# catch any record that merely contains a hash character.
 		if [[ ${sample} =~ "LCL5" ]];then
 			/opt/hap.py/bin/hap.py /cromwell_root/tmp/reference_datasets_v202103/LCL5.high.confidence.calls.vcf ${vcf} -f /cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
 			cat ${vcf} | grep '^##' > header
 			cat ${vcf} | grep -v '^#' > body
 			cat header LCL5_name body > LCL5.vcf
 			/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL5.vcf -c > ${sample}.reformed.vcf.gz
 			/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.reformed.vcf.gz
 		elif [[ ${sample} =~ "LCL6" ]]; then
 			/opt/hap.py/bin/hap.py /cromwell_root/tmp/reference_datasets_v202103/LCL6.high.confidence.calls.vcf ${vcf} -f /cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
 			cat ${vcf} | grep '^##' > header
 			cat ${vcf} | grep -v '^#' > body
 			cat header LCL6_name body > LCL6.vcf
 			/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL6.vcf -c > ${sample}.reformed.vcf.gz
 			/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.reformed.vcf.gz
 		elif [[ ${sample} =~ "LCL7" ]]; then
 			/opt/hap.py/bin/hap.py /cromwell_root/tmp/reference_datasets_v202103/LCL7.high.confidence.calls.vcf ${vcf} -f /cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
 			cat ${vcf} | grep '^##' > header
 			cat ${vcf} | grep -v '^#' > body
 			cat header LCL7_name body > LCL7.vcf
 			/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL7.vcf -c > ${sample}.reformed.vcf.gz
 			/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.reformed.vcf.gz
 		elif [[ ${sample} =~ "LCL8" ]]; then
 			/opt/hap.py/bin/hap.py /cromwell_root/tmp/reference_datasets_v202103/LCL8.high.confidence.calls.vcf ${vcf} -f /cromwell_root/tmp/reference_datasets_v202103/Quartet.high.confidence.region.v202103.bed --threads $nt -o ${sample} -r ${ref_dir}/${fasta}
 			cat ${vcf} | grep '^##' > header
 			cat ${vcf} | grep -v '^#' > body
 			cat header LCL8_name body > LCL8.vcf
 			/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg bgzip LCL8.vcf -c > ${sample}.reformed.vcf.gz
 			/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg index -f vcf ${sample}.reformed.vcf.gz
 		else
 			echo "only for quartet samples"
 		fi
 	>>>
 
 	runtime {
 		docker:docker
 		cluster:cluster_config
 		systemDisk:"cloud_ssd 40"
 		dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
 	}
 
 	# rtg_vcf* are the re-headered VCF and its index; the remaining files
 	# carry the prefix passed to hap.py with -o.
 	output {
 		File rtg_vcf = "${sample}.reformed.vcf.gz"
 		File rtg_vcf_index = "${sample}.reformed.vcf.gz.tbi"
 		File gzip_vcf = "${sample}.vcf.gz"
 		File gzip_vcf_index = "${sample}.vcf.gz.tbi"
 		File roc_all_csv = "${sample}.roc.all.csv.gz"
 		File roc_indel = "${sample}.roc.Locations.INDEL.csv.gz"
 		File roc_indel_pass = "${sample}.roc.Locations.INDEL.PASS.csv.gz"
 		File roc_snp = "${sample}.roc.Locations.SNP.csv.gz"
 		File roc_snp_pass = "${sample}.roc.Locations.SNP.PASS.csv.gz"
 		File summary = "${sample}.summary.csv"
 		File extended = "${sample}.extended.csv"
 		File metrics = "${sample}.metrics.json.gz"
 	}
 }
--- a/tasks/mendelian.wdl
+++ b/tasks/mendelian.wdl
@@ -0,0 +1,46 @@
 # Run the VBT mendelian analysis twice on one merged family VCF: once with
 # LCL5 as the child (outputs under VBT_D5) and once with LCL6 as the child
 # (outputs under VBT_D6). In both pedigree files LCL7 and LCL8 are the
 # founder rows and the child row names LCL7 and LCL8 as its parents.
 # The same family VCF is passed as the mother, father and child input.
 task mendelian {
 	File family_vcf
 	File ref_dir
 	# family name = VCF basename without the ".family.vcf" suffix
 	String family_name = basename(family_vcf,".family.vcf")
 	String fasta
 	String docker
 	String cluster_config
 	String disk_size
 	
 	command <<<
 		export LD_LIBRARY_PATH=/opt/htslib-1.9
 		nt=$(nproc)
 
 		echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL5\tLCL7\tLCL8\t2\t-9" > ${family_name}.D5.ped
 
 		mkdir VBT_D5
 		/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D5.ped -outDir VBT_D5 -out-prefix ${family_name}.D5 --output-violation-regions -thread-count $nt
 
 		cat VBT_D5/${family_name}.D5_trio.vcf > ${family_name}.D5.vcf
 
 		echo -e "${family_name}\tLCL8\t0\t0\t2\t-9\n${family_name}\tLCL7\t0\t0\t1\t-9\n${family_name}\tLCL6\tLCL7\tLCL8\t2\t-9" > ${family_name}.D6.ped
 
 		mkdir VBT_D6
 		/opt/VBT-TrioAnalysis/vbt mendelian -ref ${ref_dir}/${fasta} -mother ${family_vcf} -father ${family_vcf} -child ${family_vcf} -pedigree ${family_name}.D6.ped -outDir VBT_D6 -out-prefix ${family_name}.D6 --output-violation-regions -thread-count $nt
 
 		cat VBT_D6/${family_name}.D6_trio.vcf > ${family_name}.D6.vcf
 	>>>
 
 	runtime {
 		docker:docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}
 	# D5_/D6_mendelian collect everything VBT wrote to its output directory;
 	# D5_/D6_trio_vcf are the copies of the *_trio.vcf files made above.
 	output {
 		File D5_ped = "${family_name}.D5.ped"
 		File D6_ped = "${family_name}.D6.ped"
 		Array[File] D5_mendelian = glob("VBT_D5/*")
 		Array[File] D6_mendelian = glob("VBT_D6/*")
 		File D5_trio_vcf = "${family_name}.D5.vcf"
 		File D6_trio_vcf = "${family_name}.D6.vcf"
 	}
 }



--- a/tasks/merge_family.wdl
+++ b/tasks/merge_family.wdl
@@ -0,0 +1,35 @@
 # Merge the four per-sample compressed VCFs of one replicate (LCL5-LCL8)
 # into a single family VCF with rtg vcfmerge, then decompress the result.
 # The *_vcf_idx inputs are not referenced in the command; presumably they
 # are declared so the index files are localized next to the VCFs - TODO confirm.
 task merge_family {
 	File LCL5_vcf_gz
 	File LCL5_vcf_idx
 	File LCL6_vcf_gz
 	File LCL6_vcf_idx
 	File LCL7_vcf_gz
 	File LCL7_vcf_idx
 	File LCL8_vcf_gz
 	File LCL8_vcf_idx
 	# project and rep together form the output name: <project>_<rep>.family.vcf
 	String project
 	String rep
 	String docker
 	String cluster_config
 	String disk_size
 	
 	command <<<
 
 		/opt/rtg-tools/dist/rtg-tools-3.10.1-4d58ead/rtg vcfmerge --force-merge-all -o ${project}_${rep}.family.vcf.gz ${LCL5_vcf_gz} ${LCL6_vcf_gz} ${LCL7_vcf_gz} ${LCL8_vcf_gz}
 
 		gunzip ${project}_${rep}.family.vcf.gz
 
 	>>>
 
 	runtime {
 		docker:docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}
 	# the uncompressed family VCF left behind by gunzip
 	output {
 		File merged_vcf = "${project}_${rep}.family.vcf"
 	}
 }


--- a/tasks/merge_mendelian.wdl
+++ b/tasks/merge_mendelian.wdl
@@ -0,0 +1,35 @@
 # Strip the '##' meta lines from the two trio VCFs and from the family VCF
 # (the family VCF is additionally reduced to the first ':'-separated value
 # of columns 9 onward), then hand the three tables to
 # /opt/merge_two_family_with_genotype.py.
 task merge_mendelian {
 	File D5_trio_vcf
 	File D6_trio_vcf
 	File family_vcf
 	# family name = VCF basename without the ".family.vcf" suffix
 	String family_name = basename(family_vcf,".family.vcf")
 	String docker
 	String cluster_config
 	String disk_size
 	
 	command <<<
 		# The pattern is anchored to the start of the line: an unanchored one
 		# would also discard any record that merely contains two hash signs.
 		cat ${D5_trio_vcf} | grep -v '^##' > ${family_name}.D5.txt
 		cat ${D6_trio_vcf} | grep -v '^##' > ${family_name}.D6.txt
 		cat ${family_vcf} | grep -v '^##' | awk '
 		    BEGIN { OFS = "\t" }
 		    NF > 2 && FNR > 1 { 
 		        for ( i=9; i<=NF; i++ ) { 
 		            split($i,a,":") ;$i = a[1];
 		        } 
 		    } 
 		    { print }
 		' > ${family_name}.consensus.txt
 		python /opt/merge_two_family_with_genotype.py -LCL5 ${family_name}.D5.txt -LCL6 ${family_name}.D6.txt -genotype ${family_name}.consensus.txt -family ${family_name}
 	>>>
 
 	runtime {
 		docker:docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}
 	# presumably both files are written by merge_two_family_with_genotype.py - TODO confirm
 	output {
 		File project_mendelian = "${family_name}.txt"
 		File project_mendelian_summary = "${family_name}.summary.txt"
 	}
 }
--- a/tasks/quartet_mendelian.wdl
+++ b/tasks/quartet_mendelian.wdl
@@ -0,0 +1,62 @@
 # Summarise one project across its three replicates:
 #  1. concatenate the three mendelian summaries (lines containing 'Family'
 #     are dropped and a single header line is prepended) into mendelian.txt;
 #  2. for the INDEL rows and the SNV rows, compute the mean and the
 #     population standard deviation (sqrt((sumsq - sum^2/N)/N)) of column 4;
 #  3. keep the lines containing ALL from the twelve hap.py summaries, turn
 #     commas into tabs, prepend a header and run /opt/hap_summary.py on it;
 #  4. compute the same mean/std over columns 12 onward of
 #     variants.calling.qc.txt, skipping lines that contain 'SNV'.
 # The three *_aver-std.txt files are opened in append mode (>>).
 task quartet_mendelian {
 	File summary_1
 	File summary_2
 	File summary_3
 	File LCL5_hap_1
 	File LCL5_hap_2
 	File LCL5_hap_3
 	File LCL6_hap_1
 	File LCL6_hap_2
 	File LCL6_hap_3
 	File LCL7_hap_1
 	File LCL7_hap_2
 	File LCL7_hap_3	
 	File LCL8_hap_1
 	File LCL8_hap_2
 	File LCL8_hap_3
 	String docker
 	String project
 	String cluster_config
 	String disk_size
 
 	command <<<
 		cat ${summary_1} ${summary_2} ${summary_3} | grep -v 'Family'> mendelian.summary
 		sed '1iFamily\tTotal_Variants\tMendelian_Concordant_Variants\tMendelian_Concordance_Rate' mendelian.summary > mendelian.txt
 
 		cat mendelian.txt | grep 'INDEL' | cut -f4 | grep -v 'Mendelian_Concordance_Rate' | awk '{for(i=1;i<=NF;i++) {sum[i] += $i; sumsq[i] += ($i)^2}} 
          END {for (i=1;i<=NF;i++) {
          printf "%f %f \n", sum[i]/NR, sqrt((sumsq[i]-sum[i]^2/NR)/NR)}
         }' >> quartet_indel_aver-std.txt
 
 		cat mendelian.txt | grep 'SNV' | cut -f4 | grep -v 'Mendelian_Concordance_Rate' | awk '{for(i=1;i<=NF;i++) {sum[i] += $i; sumsq[i] += ($i)^2}} 
          END {for (i=1;i<=NF;i++) {
          printf "%f %f \n", sum[i]/NR, sqrt((sumsq[i]-sum[i]^2/NR)/NR)}
         }' >> quartet_snv_aver-std.txt
 
 		cat ${LCL5_hap_1} ${LCL5_hap_2} ${LCL5_hap_3} ${LCL6_hap_1} ${LCL6_hap_2} ${LCL6_hap_3} ${LCL7_hap_1} ${LCL7_hap_2} ${LCL7_hap_3} ${LCL8_hap_1} ${LCL8_hap_2} ${LCL8_hap_3} | grep ALL | sed s'/,/\t/g' > hap.summary
 		sed '1i\Type\tFilter\tTRUTH.TOTAL\tTRUTH.TP\tTRUTH.FN\tQUERY.TOTAL\tQUERY.FP\tQUERY.UNK\tFP.gt\tMETRIC.Recall\tMETRIC.Precision\tMETRIC.Frac_NA\tMETRIC.F1_Score\tTRUTH.TOTAL.TiTv_ratio\tQUERY.TOTAL.TiTv_ratio\tTRUTH.TOTAL.het_hom_ratio\tQUERY.TOTAL.het_hom_ratio' hap.summary > precision_recall
 
 		python /opt/hap_summary.py -hap precision_recall -name ${project}
 
 		cat variants.calling.qc.txt | cut -f12- | grep -v 'SNV' | awk '{for(i=1;i<=NF;i++) {sum[i] += $i; sumsq[i] += ($i)^2}} 
          END {for (i=1;i<=NF;i++) {
          printf "%f %f \n", sum[i]/NR, sqrt((sumsq[i]-sum[i]^2/NR)/NR)}
         }' >> reference_datasets_aver-std.txt
 
 	>>>
 
 	runtime {
 		docker:docker
    	cluster:cluster_config
    	systemDisk:"cloud_ssd 40"
    	dataDisk:"cloud_ssd " + disk_size + " /cromwell_root/"
 	}
 	# variants.calling.qc.txt is not created by the shell lines above;
 	# presumably hap_summary.py writes it - TODO confirm.
 	output {
 		File mendelian_summary = "mendelian.txt"
 		File snv_aver_std = "quartet_snv_aver-std.txt"
 		File indel_aver_std = "quartet_indel_aver-std.txt"
 		File pr = "precision_recall"
 		File hap_summary = "variants.calling.qc.txt"
 		File precision_recall_aver_std = "reference_datasets_aver-std.txt"
 	}
 }
--- a/workflow.wdl
+++ b/workflow.wdl
@@ -0,0 +1,310 @@
 import "./tasks/benchmark.wdl" as benchmark
 import "./tasks/mendelian.wdl" as mendelian
 import "./tasks/merge_mendelian.wdl" as merge_mendelian
 import "./tasks/merge_family.wdl" as merge_family
 import "./tasks/quartet_mendelian.wdl" as quartet_mendelian


 workflow {{ project_name }} {

 	# Pipeline overview (stages are wired together by their outputs):
 	#   1. benchmark        - run once per input file (4 samples x 3 replicates = 12 calls)
 	#   2. merge_family     - run once per replicate on the four benchmark VCFs + indexes
 	#   3. mendelian        - run once per replicate on the merged family VCF
 	#   4. merge_mendelian  - run once per replicate on the D5/D6 trio VCFs + family VCF
 	#   5. quartet_mendelian - single call collecting all per-replicate summaries

 	# Input files are named <sample>_<replicate>; each one is passed as `vcf`
 	# to its own benchmark call below.
 	File LCL5_1
 	File LCL6_1
 	File LCL7_1
 	File LCL8_1

 	File LCL5_2
 	File LCL6_2
 	File LCL7_2
 	File LCL8_2

 	File LCL5_3
 	File LCL6_3
 	File LCL7_3
 	File LCL8_3

 	# Docker images: one for benchmark/merge_family, one for mendelian, and one
 	# for the merge_mendelian/quartet_mendelian summary steps.
 	String BENCHMARKdocker
 	String MENDELIANdocker
 	String DIYdocker


 	# Reference inputs forwarded to the benchmark and mendelian tasks.
 	String fasta
 	File ref_dir

 	# Forwarded to the benchmark task only.
 	File benchmarking_dir

 	String project

 	# Runtime settings forwarded to every task. BIGcluster_config is used by
 	# benchmark, merge_family and mendelian; SMALLcluster_config by the
 	# merge_mendelian and quartet_mendelian steps.
 	String disk_size
 	String BIGcluster_config
 	String SMALLcluster_config

 	# ---- Stage 1: benchmark each sample/replicate independently ----

 	call benchmark.benchmark as LCL5_1_benchmark {
 		input:
 		vcf=LCL5_1,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call benchmark.benchmark as LCL5_2_benchmark {
 		input:
 		vcf=LCL5_2,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call benchmark.benchmark as LCL5_3_benchmark {
 		input:
 		vcf=LCL5_3,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call benchmark.benchmark as LCL6_1_benchmark {
 		input:
 		vcf=LCL6_1,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call benchmark.benchmark as LCL6_2_benchmark {
 		input:
 		vcf=LCL6_2,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call benchmark.benchmark as LCL6_3_benchmark {
 		input:
 		vcf=LCL6_3,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call benchmark.benchmark as LCL7_1_benchmark {
 		input:
 		vcf=LCL7_1,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call benchmark.benchmark as LCL7_2_benchmark {
 		input:
 		vcf=LCL7_2,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call benchmark.benchmark as LCL7_3_benchmark {
 		input:
 		vcf=LCL7_3,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}


 	call benchmark.benchmark as LCL8_1_benchmark {
 		input:
 		vcf=LCL8_1,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call benchmark.benchmark as LCL8_2_benchmark {
 		input:
 		vcf=LCL8_2,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call benchmark.benchmark as LCL8_3_benchmark {
 		input:
 		vcf=LCL8_3,
 		benchmarking_dir=benchmarking_dir,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	# ---- Stage 2: merge the four samples of each replicate ----
 	# Every *_vcf_gz input is paired with the matching rtg_vcf_index output of
 	# the same benchmark call.

 	call merge_family.merge_family as merge_family_1 {
 		input:
 		LCL5_vcf_gz=LCL5_1_benchmark.rtg_vcf,
 		LCL5_vcf_idx=LCL5_1_benchmark.rtg_vcf_index,
 		LCL6_vcf_gz=LCL6_1_benchmark.rtg_vcf,
 		LCL6_vcf_idx=LCL6_1_benchmark.rtg_vcf_index,
 		LCL7_vcf_gz=LCL7_1_benchmark.rtg_vcf,
 		LCL7_vcf_idx=LCL7_1_benchmark.rtg_vcf_index,
 		LCL8_vcf_gz=LCL8_1_benchmark.rtg_vcf,
 		# Must be the index output, not the VCF itself (same as LCL5-LCL7).
 		LCL8_vcf_idx=LCL8_1_benchmark.rtg_vcf_index,
 		project=project,
 		rep="1",
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call merge_family.merge_family as merge_family_2 {
 		input:
 		LCL5_vcf_gz=LCL5_2_benchmark.rtg_vcf,
 		LCL5_vcf_idx=LCL5_2_benchmark.rtg_vcf_index,
 		LCL6_vcf_gz=LCL6_2_benchmark.rtg_vcf,
 		LCL6_vcf_idx=LCL6_2_benchmark.rtg_vcf_index,
 		LCL7_vcf_gz=LCL7_2_benchmark.rtg_vcf,
 		LCL7_vcf_idx=LCL7_2_benchmark.rtg_vcf_index,
 		LCL8_vcf_gz=LCL8_2_benchmark.rtg_vcf,
 		# Must be the index output, not the VCF itself (same as LCL5-LCL7).
 		LCL8_vcf_idx=LCL8_2_benchmark.rtg_vcf_index,
 		project=project,
 		rep="2",
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	call merge_family.merge_family as merge_family_3 {
 		input:
 		LCL5_vcf_gz=LCL5_3_benchmark.rtg_vcf,
 		LCL5_vcf_idx=LCL5_3_benchmark.rtg_vcf_index,
 		LCL6_vcf_gz=LCL6_3_benchmark.rtg_vcf,
 		LCL6_vcf_idx=LCL6_3_benchmark.rtg_vcf_index,
 		LCL7_vcf_gz=LCL7_3_benchmark.rtg_vcf,
 		LCL7_vcf_idx=LCL7_3_benchmark.rtg_vcf_index,
 		LCL8_vcf_gz=LCL8_3_benchmark.rtg_vcf,
 		# Must be the index output, not the VCF itself (same as LCL5-LCL7).
 		LCL8_vcf_idx=LCL8_3_benchmark.rtg_vcf_index,
 		project=project,
 		rep="3",
 		docker=BENCHMARKdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size,
 	}

 	# ---- Stage 3: mendelian analysis on each merged family VCF ----

 	call mendelian.mendelian as mendelian_1 {
 		input:
 		family_vcf=merge_family_1.merged_vcf,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=MENDELIANdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size
 	}

 	call mendelian.mendelian as mendelian_2 {
 		input:
 		family_vcf=merge_family_2.merged_vcf,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=MENDELIANdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size
 	}

 	call mendelian.mendelian as mendelian_3 {
 		input:
 		family_vcf=merge_family_3.merged_vcf,
 		ref_dir=ref_dir,
 		fasta=fasta,
 		docker=MENDELIANdocker,
 		cluster_config=BIGcluster_config,
 		disk_size=disk_size
 	}

 	# ---- Stage 4: combine the D5 and D6 trio results per replicate ----

 	call merge_mendelian.merge_mendelian as merge_mendelian_1 {
 		input:
 		D5_trio_vcf=mendelian_1.D5_trio_vcf,
 		D6_trio_vcf=mendelian_1.D6_trio_vcf,
 		family_vcf=merge_family_1.merged_vcf,
 		docker=DIYdocker,
 		cluster_config=SMALLcluster_config,
 		disk_size=disk_size
 	}

 	call merge_mendelian.merge_mendelian as merge_mendelian_2 {
 		input:
 		D5_trio_vcf=mendelian_2.D5_trio_vcf,
 		D6_trio_vcf=mendelian_2.D6_trio_vcf,
 		family_vcf=merge_family_2.merged_vcf,
 		docker=DIYdocker,
 		cluster_config=SMALLcluster_config,
 		disk_size=disk_size
 	}

 	call merge_mendelian.merge_mendelian as merge_mendelian_3 {
 		input:
 		D5_trio_vcf=mendelian_3.D5_trio_vcf,
 		D6_trio_vcf=mendelian_3.D6_trio_vcf,
 		family_vcf=merge_family_3.merged_vcf,
 		docker=DIYdocker,
 		cluster_config=SMALLcluster_config,
 		disk_size=disk_size
 	}

 	# ---- Stage 5: project-level summary ----
 	# Collects the three per-replicate mendelian summaries and the twelve
 	# per-sample benchmark summaries into a single task call.

 	call quartet_mendelian.quartet_mendelian as quartet_mendelian{
 		input:
 		summary_1=merge_mendelian_1.project_mendelian_summary,
 		summary_2=merge_mendelian_2.project_mendelian_summary,
 		summary_3=merge_mendelian_3.project_mendelian_summary,
 		LCL5_hap_1=LCL5_1_benchmark.summary,
 		LCL5_hap_2=LCL5_2_benchmark.summary,
 		LCL5_hap_3=LCL5_3_benchmark.summary,
 		LCL6_hap_1=LCL6_1_benchmark.summary,
 		LCL6_hap_2=LCL6_2_benchmark.summary,
 		LCL6_hap_3=LCL6_3_benchmark.summary,
 		LCL7_hap_1=LCL7_1_benchmark.summary,
 		LCL7_hap_2=LCL7_2_benchmark.summary,
 		LCL7_hap_3=LCL7_3_benchmark.summary,
 		LCL8_hap_1=LCL8_1_benchmark.summary,
 		LCL8_hap_2=LCL8_2_benchmark.summary,
 		LCL8_hap_3=LCL8_3_benchmark.summary,
 		docker=DIYdocker,
 		project=project,
 		cluster_config=SMALLcluster_config,
 		disk_size=disk_size
 	}

 }