5 年前 · e5602a0858
--- a/README.md
+++ b/README.md
@@ -0,0 +1,95 @@
 # 高置信突变位点的整合

 > Author： Ren Luyao
 >
 > E-mail：18110700050@fudan.edu.cn
 >
 > Git：
 >
 > Last Updates: 18/03/2020

 ## 安装指南

 ```bash
 # 激活choppy环境
 source activate choppy
 # 安装app
 choppy install renluyao/high_confidence_calls_manuscript
 ```

 ## App概述

 中华家系1号全基因组高置信small variants（SNVs和Indels）的整合流程。



 ## 流程与参数

 1. variantsNorm

 保留chr1-22，X上的突变

 用bcftools norm进行突变格式的统一

 2. mendelian

 LCL5、LCL7和LCL8为三口之家，进行trio-analysis，分析符合孟德尔和不符合孟德尔遗传规律的突变位点

 LCL6、LCL7和LCL8为三口之家，进行trio-analysis，分析符合孟德尔和不符合孟德尔遗传规律的突变位点

 得到LCL5和LCL6两个家系合并的vcf文件

 3. zipIndex

 对LCL5和LCL6两个家系合并的文件进行压缩并建立索引

 4. VCFrename

 将VBT的输出结果，VCF文件中的MOTHER FATHER CHILD改成对应的样本名

 5. mergeSister

 将LCL5和LCL6修改过名字后的家系VCF文件合并

 6. reformVCF

 根据两个三口之家和姐妹的孟德尔遗传的信息，将之前合并的VCF分解成4个人的vcf，并且包含家系遗传的信息

 7. familyzipIndex

 将上一步输出的4个文件进行压缩并建立索引

 8. merge

 将所有注释后的LCL5 vcf文件合并

 将所有注释后的LCL6 vcf文件合并

 将所有注释后的LCL7 vcf文件合并

 将所有注释后的LCL8 vcf文件合并

 9. vote

 投票选择高置信的突变位点


 ## App输入变量与输入文件



 ## App输出文件



 ## 结果展示与解读



 ## CHANGELOG



 ## FAQ

--- a/codescripts/.DS_Store
+++ b/codescripts/.DS_Store
--- a/codescripts/bed_for_bamReadcount.py
+++ b/codescripts/bed_for_bamReadcount.py
@@ -0,0 +1,68 @@
 import sys,getopt
 import os
 import re
 import fileinput

 def usage():
     """Print the command-line help text of this script to stdout."""
     print(
         """
 Usage: python bed_for_bamReadcount.py -i input_vcf_file -o prefix

 This script converts the indels of a vcf file into a position list for
 bam-readcount. All other records are copied to a separate file.

 Please notice that bam-readcount only takes in 1-based coordinates.

 Input:
 -i a vcf file

 Output:
 -o prefix of the output files:
    <prefix>.bed      indel positions for bam-readcount
    <prefix>_MNP.txt  records that are neither a simple deletion nor a
                      simple insertion
         """)

 # Route one VCF line to the indel position file or to the leftover file.
 def process(oneLine):
     """Write one VCF line either as a bam-readcount position or unchanged.

     Header lines (starting with '#') are ignored. A deletion (REF longer
     than one base, single-base ALT) is written to ``outINDEL`` at POS + 1.
     An insertion (single-base REF, a single ALT longer than one base) is
     written to ``outINDEL`` at POS. Every other record, including
     multi-allelic insertions, is copied unchanged to ``outMNP``.

     Uses the module-level file handles ``outINDEL`` and ``outMNP``.

     Args:
         oneLine: one raw line of the input VCF, including its newline.
     """
     # A plain prefix test replaces the former regex, whose backslash-hash
     # sequence is an invalid string escape that newer Pythons warn about.
     if oneLine.startswith('#'):
         return
     strings = oneLine.strip().split('\t')
     if len(strings) < 5:
         # Blank or truncated lines carry no REF/ALT to inspect.
         return
     chrom, pos, ref, alt = strings[0], strings[1], strings[3], strings[4]
     if len(ref) > 1 and len(alt) == 1:
         # Deletion: report the base right after the anchor base.
         shifted = str(int(pos) + 1)
         outINDEL.write('\t'.join([chrom, shifted, shifted, ref, alt]) + '\n')
     elif len(ref) == 1 and len(alt) > 1 and (',' not in alt):
         # Insertion: report the anchor base itself.
         outINDEL.write('\t'.join([chrom, pos, pos, ref, alt]) + '\n')
     else:
         outMNP.write(oneLine)

 # Parse the command line; both -i and -o are required.
 opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
 inputFile = None
 prefix = None
 for op,value in opts:
     if op == "-i":
         inputFile = value
     elif op == "-o":
         prefix = value
     elif op == "-h":
         usage()
         sys.exit()

 # Test the parsed values instead of counting arguments: this avoids a
 # NameError when one option is missing and accepts attached forms (-ifile).
 if inputFile is None or prefix is None:
     usage()
     sys.exit()

 INDELname = prefix + '.bed'
 MNPname = prefix + '_MNP.txt'

 # The with-statement closes both outputs even if a record fails to parse.
 # process() reads outINDEL and outMNP as module-level names.
 with open(INDELname,'w') as outINDEL, open(MNPname,'w') as outMNP:
     for line in fileinput.input(inputFile):
         process(line)

--- a/codescripts/extract_vcf_information.py
+++ b/codescripts/extract_vcf_information.py
@@ -0,0 +1,96 @@
 import sys,getopt
 import os
 import re
 import fileinput
 import pandas as pd

 def usage():
     """Print the command-line help text of this script to stdout."""
     print(
         """
 Usage: python extract_vcf_information.py -i input_merged_vcf_file -o parsed_file

 This script extracts SNV and Indel information from a vcf file and writes it to a tab-delimited file.

 Input:
 -i the selected vcf file

 Output:
 -o tab-delimited parsed file
         """)

 # Flatten one VCF data line into a dict of INFO, FORMAT and site fields.
 def process(oneLine):
     """Parse one VCF data line into a flat dictionary.

     The INFO column is parsed with ``parse_INFO``. The FORMAT keys are then
     paired with the values of the first sample column (column index 9).
     ``AD`` is split into ``RefDP`` and ``AltDP`` (its first two depths) and,
     when those sum to a non-zero value, the fraction ``af`` is added.
     Finally the site columns are stored as ``chromo``, ``pos``, ``id``,
     ``ref``, ``alt`` and ``qual``.

     Args:
         oneLine: one raw, non-header line of the VCF.

     Returns:
         dict of field name to value; all values are strings except ``af``,
         which is a float.
     """
     strings = oneLine.strip().split('\t')
     infoParsed = parse_INFO(strings[7])
     formatKeys = strings[8].split(':')
     formatValues = strings[9].split(':')
     # NOTE(review): the last FORMAT key is never read (presumably to drop
     # PL) - confirm this is intended.
     # zip() also stops at the shorter list, so a sample column whose trailing
     # fields were dropped no longer raises IndexError.
     for key, value in zip(formatKeys[:-1], formatValues):
         if key == 'AD':
             ra = value.split(',')
             if len(ra) < 2 or not (ra[0].isdigit() and ra[1].isdigit()):
                 # A missing AD such as '.' cannot be split into ref/alt depth.
                 continue
             infoParsed['RefDP'] = ra[0]
             infoParsed['AltDP'] = ra[1]
             depth = int(ra[1]) + int(ra[0])
             if depth != 0:
                 infoParsed['af'] = int(ra[1]) / depth
         else:
             infoParsed[key] = value
     infoParsed['chromo'] = strings[0]
     infoParsed['pos'] = strings[1]
     infoParsed['id'] = strings[2]
     infoParsed['ref'] = strings[3]
     infoParsed['alt'] = strings[4]
     infoParsed['qual'] = strings[5]
     return infoParsed


 def parse_INFO(info):
     """Parse a VCF INFO column into a dictionary of strings.

     ``key=value`` entries are stored as given. ``DB`` and any other entry
     without ``=`` (a flag) are stored with the value ``'1'``. The keys
     ``AF``, ``POSITIVE_TRAIN_SITE`` and ``NEGATIVE_TRAIN_SITE`` are dropped.
     An empty entry or the missing-value marker ``.`` is ignored.

     Args:
         info: raw text of the INFO column.

     Returns:
         dict mapping INFO keys to their string values.
     """
     skipped = ('AF', 'POSITIVE_TRAIN_SITE', 'NEGATIVE_TRAIN_SITE')
     infoDict = {}
     for entry in info.strip().split(';'):
         # Split on the first '=' only, so values containing '=' stay intact.
         key, sep, value = entry.partition('=')
         if key in skipped:
             continue
         if key == 'DB':
             infoDict[key] = '1'
         elif sep:
             infoDict[key] = value
         elif key not in ('', '.'):
             # Flags carry no value; record their presence as '1'.
             infoDict[key] = '1'
     return infoDict
 	

 # Parse the command line; both -i and -o are required.
 opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
 inputFile = None
 outputFile = None
 for op,value in opts:
     if op == "-i":
         inputFile = value
     elif op == "-o":
         outputFile = value
     elif op == "-h":
         usage()
         sys.exit()

 # Test the parsed values instead of counting arguments, which avoids a
 # NameError when one of the two options is missing.
 if inputFile is None or outputFile is None:
     usage()
     sys.exit()

 # One dict per data line; header lines and blank lines are skipped.
 allDict = []
 for line in fileinput.input(inputFile):
     if line.startswith('#') or not line.strip():
         continue
     allDict.append(process(line))

 allTable = pd.DataFrame(allDict)

 allTable.to_csv(outputFile,sep='\t',index=False)

--- a/codescripts/high_confidence_call_vote.py
+++ b/codescripts/high_confidence_call_vote.py
@@ -0,0 +1,255 @@
 # import modules
 import sys, argparse, os
 import fileinput
 import re
 import pandas as pd
 from operator import itemgetter
 from collections import Counter
 from itertools import islice  

 # input arguments
 parser = argparse.ArgumentParser(description="this script is to count voting number")

 parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number',  required=True)
 parser.add_argument('-dup', '--dup_list', type=str, help='Duplication list',  required=True)
 parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet',  required=True)
 parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name',  required=True)

 args = parser.parse_args()
 multi_sample_vcf = args.multi_sample_vcf
 dup_list = args.dup_list
 prefix = args.prefix
 sample_name = args.sample_name

 # Meta-information lines written at the top of the annotated VCF.
 vcf_header = '''##fileformat=VCFv4.2
 ##fileDate=20191224
 ##source=high_confidence_calls_intergration(choppy app)
 ##reference=GRCh38.d1.vd1
 ##INFO=<ID=DPCT,Number=1,Type=Float,Description="Percentage of detected votes">
 ##INFO=<ID=VPCT,Number=1,Type=Float,Description="Percentage of consensus votes">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##contig=<ID=chr1,length=248956422>
 ##contig=<ID=chr2,length=242193529>
 ##contig=<ID=chr3,length=198295559>
 ##contig=<ID=chr4,length=190214555>
 ##contig=<ID=chr5,length=181538259>
 ##contig=<ID=chr6,length=170805979>
 ##contig=<ID=chr7,length=159345973>
 ##contig=<ID=chr8,length=145138636>
 ##contig=<ID=chr9,length=138394717>
 ##contig=<ID=chr10,length=133797422>
 ##contig=<ID=chr11,length=135086622>
 ##contig=<ID=chr12,length=133275309>
 ##contig=<ID=chr13,length=114364328>
 ##contig=<ID=chr14,length=107043718>
 ##contig=<ID=chr15,length=101991189>
 ##contig=<ID=chr16,length=90338345>
 ##contig=<ID=chr17,length=83257441>
 ##contig=<ID=chr18,length=80373285>
 ##contig=<ID=chr19,length=58617616>
 ##contig=<ID=chr20,length=64444167>
 ##contig=<ID=chr21,length=46709983>
 ##contig=<ID=chr22,length=50818468>
 ##contig=<ID=chrX,length=156040895>
 '''

 # read in duplication list
 try:
     dup = pd.read_table(dup_list,header=None)
     var_dup = dup[0].tolist()
 except pd.errors.EmptyDataError:
     # An empty list file means that no position is duplicated.
     dup = pd.DataFrame()
     var_dup = []

 # output file
 file_name = prefix + '_annotated.vcf'
 outfile = open(file_name,'w')

 # Sample columns of the output header, in input column order (indices 9-41).
 # NOTE(review): the names are hard-coded with the LCL5 suffix whatever
 # sample_name is - confirm this is intended for the other quartet samples.
 replicate_columns = [
     'Quartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5',
     'Quartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5',
     'Quartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5',
     'Quartet_DNA_BGI_SEQ2000_WGE_1_20190402_LCL5',
     'Quartet_DNA_BGI_SEQ2000_WGE_2_20190402_LCL5',
     # This name used to carry a trailing space inside the header line.
     'Quartet_DNA_BGI_SEQ500_BGI_1_20180328_LCL5',
     'Quartet_DNA_BGI_SEQ500_BGI_2_20180328_LCL5',
     'Quartet_DNA_BGI_SEQ500_BGI_3_20180328_LCL5',
     'Quartet_DNA_ILM_Nova_ARD_1_20181108_LCL5',
     'Quartet_DNA_ILM_Nova_ARD_2_20181108_LCL5',
     'Quartet_DNA_ILM_Nova_ARD_3_20181108_LCL5',
     'Quartet_DNA_ILM_Nova_ARD_4_20190111_LCL5',
     'Quartet_DNA_ILM_Nova_ARD_5_20190111_LCL5',
     'Quartet_DNA_ILM_Nova_ARD_6_20190111_LCL5',
     'Quartet_DNA_ILM_Nova_BRG_1_20171024_LCL5',
     'Quartet_DNA_ILM_Nova_BRG_1_20180930_LCL5',
     'Quartet_DNA_ILM_Nova_BRG_2_20180930_LCL5',
     'Quartet_DNA_ILM_Nova_BRG_3_20180930_LCL5',
     'Quartet_DNA_ILM_Nova_GAC_1_20171025_LCL5',
     'Quartet_DNA_ILM_Nova_NVG_1_20171024_LCL5',
     'Quartet_DNA_ILM_Nova_WUX_1_20171024_LCL5',
     'Quartet_DNA_ILM_XTen_ARD_1_20170403_LCL5',
     'Quartet_DNA_ILM_XTen_ARD_2_20170403_LCL5',
     'Quartet_DNA_ILM_XTen_ARD_3_20170403_LCL5',
     'Quartet_DNA_ILM_XTen_NVG_1_20170329_LCL5',
     'Quartet_DNA_ILM_XTen_NVG_2_20170329_LCL5',
     'Quartet_DNA_ILM_XTen_NVG_3_20170329_LCL5',
     'Quartet_DNA_ILM_XTen_WUX_1_20170216_LCL5',
     'Quartet_DNA_ILM_XTen_WUX_2_20170216_LCL5',
     'Quartet_DNA_ILM_XTen_WUX_3_20170216_LCL5',
     'Quartet_DNA_ILM_XTen_WUX_4_20180703_LCL5',
     'Quartet_DNA_ILM_XTen_WUX_5_20180703_LCL5',
     'Quartet_DNA_ILM_XTen_WUX_6_20180703_LCL5',
 ]
 fixed_columns = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']
 vote_columns = [
     sample_name + '_pcr',
     sample_name + '_pcr-free',
     sample_name + '_consensus',
     sample_name + '_consensus_alt_seq',
 ]

 # write VCF
 outputcolumn = '\t'.join(fixed_columns + replicate_columns + vote_columns) + '\n'
 outfile.write(vcf_header)
 outfile.write(outputcolumn)

 #function

 def detected_percentage(strings):
     """Return, as a string, the detected fraction out of 33 sample columns.

     A sample counts as not detected when its first ':'-separated field is
     '.' after every '0/0' in the column has been replaced by '.'. The
     result is ``(33 - not_detected) / 33`` rounded to four decimals.
     """
     undetected = 0
     for sample in strings:
         first_field = sample.replace('0/0', '.').split(':')[0]
         if first_field == '.':
             undetected += 1
     fraction = (33 - undetected) / 33
     return str(round(fraction, 4))

 def vote_percentage(strings,consensus_call):
     """Return, as a string, the share of 33 samples matching the consensus.

     Every '.' in a sample column is replaced by '0/0', the first
     ':'-separated field is taken as the genotype and normalised with
     ``gt_uniform``. The number of genotypes equal to ``consensus_call`` is
     divided by 33 and rounded to four decimals.
     """
     normalised = []
     for sample in strings:
         genotype = sample.replace('.', '0/0').split(':')[0]
         normalised.append(gt_uniform(genotype))
     agreeing = sum(1 for genotype in normalised if genotype == consensus_call)
     return str(round(agreeing / 33, 4))

 def family_vote(strings,consensus_call):
     """Placeholder with no implementation; always returns None."""
     return None

 def gt_uniform(strings):
     """Return the first two '/'-separated alleles in ascending numeric order.

     For example '2/1' becomes '1/2'. Only the first two alleles of the
     input are used.
     """
     alleles = strings.split('/')
     low = alleles[0]
     high = alleles[1]
     if int(low) > int(high):
         low, high = high, low
     return low + '/' + high

 def decide_by_rep(strings):
     """Derive one call from the replicate columns of a sequencing site.

     The last five characters of each column are read as its mendelian tag.
     The genotype is the first ':'-separated field after replacing every
     '.' by '0/0', normalised with ``gt_uniform``.

     Returns:
         - the most frequent genotype, when the most frequent tag is '1:1:1'
           at least twice and that genotype occurs at least twice;
         - 'inconGT' when the tag condition holds but no genotype occurs
           twice;
         - 'noInfo' when the most frequent tag is '.' at least twice;
         - 'inconMen' in every other case.
     """
     tags = [column[-5:] for column in strings]
     # gt_uniform turns e.g. 2/1 into 1/2 so equal genotypes compare equal.
     genotypes = [gt_uniform(column.replace('.', '0/0').split(':')[0]) for column in strings]
     top_tag, tag_votes = Counter(tags).most_common(1)[0]
     if tag_votes >= 2 and top_tag == '1:1:1':
         top_gt, gt_votes = Counter(genotypes).most_common(1)[0]
         if gt_votes >= 2:
             return top_gt
         return 'inconGT'
     if tag_votes >= 2 and top_tag == '.':
         return 'noInfo'
     return 'inconMen'


 def _site_consensus(site_calls, threshold):
     """Return the most frequent site call if it occurs more than *threshold* times.

     Otherwise return 'inconSequenceSite'.
     """
     candidate, freq = Counter(site_calls).most_common(1)[0]
     if freq > threshold:
         return candidate
     return 'inconSequenceSite'


 def main():
     """Vote on every record of the multi-sample VCF and write the result.

     Reads ``multi_sample_vcf`` line by line, skipping header lines. A record
     whose CHROM_POS id is in ``var_dup`` is written with FILTER 'dupVar'.
     Otherwise a PCR and a PCR-free consensus are computed from the
     per-site calls of ``decide_by_rep``. The record is then classified as
     'reproducible', 'pcr-free-speicifc', 'pcr-speicifc',
     'confirm for parents' or 'filtered'.

     Each output line is the input record, with FILTER and INFO replaced,
     plus four columns: PCR consensus, PCR-free consensus, consensus call
     and consensus ALT sequence. Lines are written to the module-level
     ``outfile``, which is closed at the end.

     The column indices used below follow the sample order of the output
     header written by this script.
     """
     # A set makes the duplicate check O(1) per record.
     duplicated = set(var_dup)
     tag = ['inconGT','noInfo','inconMen','inconSequenceSite']
     for line in fileinput.input(multi_sample_vcf):
         if line.startswith('#'):
             continue
         strings = line.strip().split('\t')
         variant_id = '_'.join([strings[0],strings[1]])
         # check if the variants location is duplicated
         if variant_id in duplicated:
             strings[6] = 'dupVar'
             outfile.write('\t'.join(strings + ['.', '.', 'dupVar', '.']) + '\n')
             continue
         consensus_alt_seq = '.'
         # pcr (pcr[3], column 12, is selected but not used in any group)
         pcr = itemgetter(9,10,11,12,14,15,16,23,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41)(strings)
         SEQ2000 = decide_by_rep(pcr[0:3])
         SEQ500 = decide_by_rep(pcr[4:7])
         Nova = decide_by_rep(pcr[7:11])
         XTen_ARD = decide_by_rep(pcr[11:14])
         XTen_NVG = decide_by_rep(pcr[14:17])
         XTen_WUX_1 = decide_by_rep(pcr[17:20])
         XTen_WUX_2 = decide_by_rep(pcr[20:23])
         pcr_sites = [SEQ2000,SEQ500,Nova,XTen_ARD,XTen_NVG,XTen_WUX_1,XTen_WUX_2]
         # More than 4 of the 7 PCR sites must agree.
         pcr_consensus = _site_consensus(pcr_sites, 4)
         # pcr-free (pcr_free[0], column 13, is selected but not used)
         pcr_free = itemgetter(13,17,18,19,20,21,22,24,25,26)(strings)
         Nova_ARD_1 = decide_by_rep(pcr_free[1:4])
         Nova_ARD_2 = decide_by_rep(pcr_free[4:7])
         Nova_BRG = decide_by_rep(pcr_free[7:10])
         # NOTE(review): SEQ2000 here is the PCR result computed above - confirm
         # it is meant to take part in the PCR-free vote.
         pcr_free_sites = [SEQ2000,Nova_ARD_1,Nova_ARD_2,Nova_BRG]
         # Count the PCR-free sites themselves; the earlier code reused the
         # counter of the PCR sites here. More than 2 of the 4 must agree.
         pcr_free_consensus = _site_consensus(pcr_free_sites, 2)
         # pcr and pcr-free
         pcr_is_variant = (pcr_consensus not in tag) and (pcr_consensus != '0/0')
         pcr_free_is_variant = (pcr_free_consensus not in tag) and (pcr_free_consensus != '0/0')
         if (pcr_consensus == pcr_free_consensus) and pcr_is_variant:
             consensus_call = pcr_consensus
             # VPCT uses the call as voted, before it is renumbered below.
             VPCT = vote_percentage(strings[9:],consensus_call)
             DPCT = detected_percentage(strings[9:])
             strings[7] = 'VPCT=' + VPCT + ';DPCT=' + DPCT
             strings[6] = 'reproducible'
             # Reduce a multi-allelic ALT to the alleles of the consensus call
             alt = strings[4]
             alt_gt = alt.split(',')
             if len(alt_gt) > 1:
                 allele1 = consensus_call.split('/')[0]
                 allele2 = consensus_call.split('/')[1]
                 if allele1 == '0':
                     consensus_alt_seq = alt_gt[int(allele2) - 1]
                     consensus_call = '0/1'
                 else:
                     allele1_seq = alt_gt[int(allele1) - 1]
                     allele2_seq = alt_gt[int(allele2) - 1]
                     if int(allele1) > int(allele2):
                         consensus_alt_seq = allele2_seq + ',' + allele1_seq
                         consensus_call = '1/2'
                     elif int(allele1) < int(allele2):
                         consensus_alt_seq = allele1_seq + ',' + allele2_seq
                         consensus_call = '1/2'
                     else:
                         consensus_alt_seq = allele1_seq
                         consensus_call = '1/1'
             else:
                 consensus_alt_seq = alt
         else:
             if (pcr_consensus in tag) and (pcr_free_consensus in tag):
                 consensus_call = 'filtered'
             elif (not pcr_is_variant) and pcr_free_is_variant:
                 consensus_call = 'pcr-free-speicifc'
             elif pcr_is_variant and (not pcr_free_is_variant):
                 # Mirror of the branch above. The former condition required
                 # the PCR-free call to be both a tag and '0/0', which can
                 # never hold, so this label was never assigned.
                 consensus_call = 'pcr-speicifc'
             elif (pcr_consensus == '0/0') and (pcr_free_consensus == '0/0'):
                 consensus_call = 'confirm for parents'
             else:
                 consensus_call = 'filtered'
             strings[6] = consensus_call
             strings[7] = 'DPCT=' + detected_percentage(strings[9:])
         # output
         outLine = '\t'.join(strings) + '\t' + pcr_consensus +'\t' + pcr_free_consensus + '\t' + consensus_call + '\t' + consensus_alt_seq + '\n'
         outfile.write(outLine)
     # Close explicitly so the buffered records are flushed to disk.
     outfile.close()


 # Run the voting only when the file is executed as a script.
 if __name__ == '__main__':
 	main()












--- a/codescripts/oneClass.py
+++ b/codescripts/oneClass.py
@@ -0,0 +1,109 @@
 # import modules
 import numpy as np
 import pandas as pd
 from sklearn import svm
 from sklearn import preprocessing
 import sys, argparse, os
 from vcf2bed import position_to_bed,padding_region



 # Command-line interface: two parsed-VCF tables and a sample name.
 parser = argparse.ArgumentParser(
     description="this script is to preform one calss svm on each chromosome"
 )
 parser.add_argument(
     '-train', '--trainDataset',
     type=str,
     required=True,
     help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets',
 )
 parser.add_argument(
     '-test', '--testDataset',
     type=str,
     required=True,
     help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets',
 )
 parser.add_argument(
     '-name', '--sampleName',
     type=str,
     required=True,
     help='sample name for output file name',
 )
 args = parser.parse_args()

 # Shorter names for the parsed arguments.
 train_input = args.trainDataset
 test_input = args.testDataset
 sample_name = args.sampleName

 # Chromosomes handled one at a time, and the feature columns given to the
 # classifier for the two record groups returned by load_dat.
 chromosome = ['chr' + str(number) for number in range(1, 23)] + ['chrX', 'chrY']
 feature_heter_cols = [
     'AltDP', 'BaseQRankSum', 'DB', 'DP', 'FS', 'GQ', 'MQ',
     'MQRankSum', 'QD', 'ReadPosRankSum', 'RefDP', 'SOR', 'af',
 ]
 feature_homo_cols = [
     'AltDP', 'DB', 'DP', 'FS', 'GQ', 'MQ', 'QD', 'RefDP', 'SOR', 'af',
 ]


 # import datasets sepearate the records with or without BaseQRankSum annotation, etc.
 def load_dat(dat_file_name):
     """Load a parsed-variant table and split it on BaseQRankSum.

     Missing ``DB`` values are set to 0, rows with ``DP`` equal to 0 are
     dropped, and ``af`` is recomputed as AltDP / (AltDP + RefDP).

     Args:
         dat_file_name: path of a tab-delimited table with at least the
             columns DB, DP, AltDP, RefDP and BaseQRankSum.

     Returns:
         Tuple ``(homo_rows, heter_rows)``: the rows without and with a
         BaseQRankSum value.
     """
     dat = pd.read_table(dat_file_name)
     dat['DB'] = dat['DB'].fillna(0)
     # Copy the filtered rows: assigning a column to a slice of the original
     # frame triggers pandas' SettingWithCopyWarning.
     dat = dat[dat['DP'] != 0].copy()
     dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP'])
     has_rank_sum = dat['BaseQRankSum'].notnull()
     homo_rows = dat[~has_rank_sum]
     heter_rows = dat[has_rank_sum]
     return homo_rows,heter_rows


 train_homo,train_heter = load_dat(train_input)
 test_homo,test_heter = load_dat(test_input)
 # 'auto_deprecated' was a transitional placeholder that current scikit-learn
 # releases reject; 'auto' (1 / n_features) is the setting it stood for.
 clf = svm.OneClassSVM(nu=0.05,kernel='rbf', gamma='auto',cache_size=500)

 def prepare_dat(train_dat,test_dat,feature_cols,chromo):
     """Select one chromosome and standardise its feature columns.

     Args:
         train_dat: training table with a ``chromo`` column.
         test_dat: test table with a ``chromo`` column.
         feature_cols: names of the columns passed to the classifier.
         chromo: chromosome name to select.

     Returns:
         Tuple of the unscaled test rows of that chromosome, the scaled
         training features and the scaled test features. Training and test
         features are scaled independently of each other.
     """
     chr_test = test_dat[test_dat['chromo'] == chromo]
     train_features = train_dat.loc[train_dat['chromo'] == chromo, feature_cols]
     test_features = chr_test.loc[:, feature_cols]
     scaled_train = preprocessing.scale(train_features)
     scaled_test = preprocessing.scale(test_features)
     return chr_test, scaled_train, scaled_test

 def oneclass(X_train,X_test,chr_test):
     """Fit the module-level classifier and split the test rows by its labels.

     ``clf`` is fitted on ``X_train`` and used to predict ``X_test``.

     Returns:
         Tuple of the rows of ``chr_test`` predicted as 1 and the rows
         predicted as -1.
     """
     clf.fit(X_train)
     labels = clf.predict(X_test)
     accepted_mask = labels == 1
     rejected_mask = labels == -1
     return chr_test[accepted_mask], chr_test[rejected_mask]

 def _stack_predictions(parts):
     """Concatenate per-chromosome result frames, training columns first."""
     if not parts:
         return pd.DataFrame(columns=train_homo.columns)
     stacked = pd.concat(parts)
     ordered = list(train_homo.columns)
     ordered += [col for col in stacked.columns if col not in ordered]
     return stacked.reindex(columns=ordered)


 # Collect the per-chromosome results in lists and concatenate once:
 # DataFrame.append was removed in pandas 2.0.
 true_parts = []
 false_parts = []

 for chromo in chromosome:
     # Records without BaseQRankSum first, then records with it, each with
     # its own feature set.
     for train_rows, test_rows, feature_cols in (
             (train_homo, test_homo, feature_homo_cols),
             (train_heter, test_heter, feature_heter_cols)):
         # A chromosome without training or test rows (e.g. chrY) can be
         # neither scaled nor fitted, so it is skipped.
         if not (train_rows['chromo'] == chromo).any():
             continue
         if not (test_rows['chromo'] == chromo).any():
             continue
         chr_test, X_train, X_test = prepare_dat(train_rows, test_rows, feature_cols, chromo)
         test_true, test_false = oneclass(X_train, X_test, chr_test)
         true_parts.append(test_true)
         false_parts.append(test_false)

 predicted_true = _stack_predictions(true_parts)
 predicted_false = _stack_predictions(false_parts)

 predicted_true_filename = sample_name + '_predicted_true.txt'
 predicted_false_filename = sample_name + '_predicted_false.txt'

 predicted_true.to_csv(predicted_true_filename,sep='\t',index=False)
 predicted_false.to_csv(predicted_false_filename,sep='\t',index=False)

 # output the bed file and padding bed region 50bp

 predicted_true_bed_filename = sample_name + '_predicted_true.bed'
 predicted_false_bed_filename = sample_name + '_predicted_false.bed'
 padding_filename = sample_name + '_padding.bed'

 # The with-statements close the BED files even if a row fails to convert.
 with open(predicted_false_bed_filename,'w') as predicted_false_bed, open(padding_filename,'w') as padding:
     for index,row in predicted_false.iterrows():
         chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
         predicted_false_bed.write(chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n')
         # Two flanking intervals of 50 bp, one on each side of the variant.
         chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50)
         padding.write(chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n')
         padding.write(chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n')

 with open(predicted_true_bed_filename,'w') as predicted_true_bed:
     for index,row in predicted_true.iterrows():
         chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
         predicted_true_bed.write(chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n')


--- a/codescripts/reformVCF.py
+++ b/codescripts/reformVCF.py
@@ -0,0 +1,144 @@
 # import modules
 import sys, argparse, os
 import fileinput
 import re

 parser = argparse.ArgumentParser(description="This script is to split samples in VCF files and rewrite to the right style")

 parser.add_argument('-vcf', '--familyVCF', type=str, help='VCF with sister and mendelian infomation',  required=True)
 parser.add_argument('-name', '--familyName', type=str, help='Family name of the VCF file',  required=True)

 args = parser.parse_args()

 # Rename input:
 inputFile = args.familyVCF
 family_name = args.familyName

 # output filename
 LCL5_name = family_name + '.LCL5.vcf'
 LCL5file = open(LCL5_name,'w')
 LCL6_name = family_name + '.LCL6.vcf'
 LCL6file = open(LCL6_name,'w')
 LCL7_name = family_name + '.LCL7.vcf'
 LCL7file = open(LCL7_name,'w')
 LCL8_name = family_name + '.LCL8.vcf'
 LCL8file = open(LCL8_name,'w')
 family_filename = family_name + '.vcf'
 familyfile = open(family_filename,'w')

 # Meta-information lines shared by all five output files.
 # process() writes 1 for a consistent comparison and 0 for an inconsistent
 # one, so the descriptions state that. FORMAT fields cannot be of type
 # Flag, so the three indicators are declared as single integers.
 vcfheader = '''##fileformat=VCFv4.2
 ##FILTER=<ID=PASS,Description="the same genotype between twin sister and mendelian consistent in 578 and 678">
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
 ##FORMAT=<ID=TWINS,Number=1,Type=Integer,Description="1 for sister consistent, 0 for sister inconsistent">
 ##FORMAT=<ID=TRIO5,Number=1,Type=Integer,Description="1 for trio consistent, 0 for trio inconsistent">
 ##FORMAT=<ID=TRIO6,Number=1,Type=Integer,Description="1 for trio consistent, 0 for trio inconsistent">
 ##contig=<ID=chr1,length=248956422>
 ##contig=<ID=chr2,length=242193529>
 ##contig=<ID=chr3,length=198295559>
 ##contig=<ID=chr4,length=190214555>
 ##contig=<ID=chr5,length=181538259>
 ##contig=<ID=chr6,length=170805979>
 ##contig=<ID=chr7,length=159345973>
 ##contig=<ID=chr8,length=145138636>
 ##contig=<ID=chr9,length=138394717>
 ##contig=<ID=chr10,length=133797422>
 ##contig=<ID=chr11,length=135086622>
 ##contig=<ID=chr12,length=133275309>
 ##contig=<ID=chr13,length=114364328>
 ##contig=<ID=chr14,length=107043718>
 ##contig=<ID=chr15,length=101991189>
 ##contig=<ID=chr16,length=90338345>
 ##contig=<ID=chr17,length=83257441>
 ##contig=<ID=chr18,length=80373285>
 ##contig=<ID=chr19,length=58617616>
 ##contig=<ID=chr20,length=64444167>
 ##contig=<ID=chr21,length=46709983>
 ##contig=<ID=chr22,length=50818468>
 ##contig=<ID=chrX,length=156040895>
 '''
 # write VCF
 LCL5colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL5'+'\n'
 LCL5file.write(vcfheader)
 LCL5file.write(LCL5colname)

 LCL6colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL6'+'\n'
 LCL6file.write(vcfheader)
 LCL6file.write(LCL6colname)

 LCL7colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL7'+'\n'
 LCL7file.write(vcfheader)
 LCL7file.write(LCL7colname)

 LCL8colname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+family_name+'_LCL8'+'\n'
 LCL8file.write(vcfheader)
 LCL8file.write(LCL8colname)

 familycolname = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t'+'LCL5\t'+'LCL6\t'+'LCL7\t'+'LCL8'+'\n'
 familyfile.write(vcfheader)
 familyfile.write(familycolname)

 # reform VCF
 def process(oneLine):
     """Split one merged family record into per-sample records and write them.

     A '.' in column 11 or column 14 is replaced by '0/0', and columns 9/10
     and 12/13 are copied from each other as coded below. Three 0/1
     indicators are then appended to every genotype: columns 11 and 14
     equal, column 15 equal to 'MD=1', column 7 equal to 'MD=1'. FILTER
     becomes 'PASS' when all three hold and '.' otherwise.

     The LCL5 record takes its genotype from column 14, LCL6 from column 11,
     LCL7 from column 10 and LCL8 from column 9. The family file receives
     all four in that order. Uses the module-level output file handles.
     """
     fields = oneLine.strip().split('\t')
     # Column 11 missing: set it to reference and take 9/10 from 12/13.
     if fields[11] == '.':
         fields[11] = '0/0'
         fields[9] = fields[12]
         fields[10] = fields[13]
     # Column 14 missing: set it to reference and take 12/13 from 9/10.
     if fields[14] == '.':
         fields[14] = '0/0'
         fields[12] = fields[9]
         fields[13] = fields[10]
     sisters_agree = fields[11] == fields[14]
     trio5_consistent = fields[15] == 'MD=1'
     trio6_consistent = fields[7] == 'MD=1'
     checks = (sisters_agree, trio5_consistent, trio6_consistent)
     add_format = ''.join(':1' if passed else ':0' for passed in checks)
     fields[6] = 'PASS' if all(checks) else '.'
     # The first nine columns are identical in all five output files.
     site = '\t'.join(fields[0:5] + ['.', fields[6], '.', 'GT:TWINS:TRIO5:TRIO6']) + '\t'
     lcl5_sample = fields[14] + add_format
     lcl6_sample = fields[11] + add_format
     lcl7_sample = fields[10] + add_format
     lcl8_sample = fields[9] + add_format
     LCL5file.write(site + lcl5_sample + '\n')
     LCL6file.write(site + lcl6_sample + '\n')
     LCL7file.write(site + lcl7_sample + '\n')
     LCL8file.write(site + lcl8_sample + '\n')
     familyfile.write(site + '\t'.join([lcl5_sample, lcl6_sample, lcl7_sample, lcl8_sample]) + '\n')


 for line in fileinput.input(inputFile):
     # The outputs get their own header, so the input's header is dropped.
     if not line.startswith('#'):
         process(line)

 # Close the outputs explicitly so buffered records are flushed to disk.
 for handle in (LCL5file, LCL6file, LCL7file, LCL8file, familyfile):
     handle.close()
 		

--- a/codescripts/select_small_variants_supported_by_all_callsets.py
+++ b/codescripts/select_small_variants_supported_by_all_callsets.py
@@ -0,0 +1,62 @@
 import sys,getopt
 import os
 import re
 import fileinput

 def usage():
     """Print the command-line help text of this script to stdout."""
     print(
         """
 Usage: python select_small_variants_supported_by_all_callsets.py -i input_merged_vcf_file -o prefix

 This script selects SNPs and Indels supported by all callsets.

 Input:
 -i a merged vcf file

 Output:
 -o prefix of the output files:
    <prefix>.vcf          the selected SNPs and Indels
    <prefix>_outlier.vcf  all remaining records
         """)

 # select supported small variants
 def process(oneLine):
     """Copy one VCF line to the selected file or to the outlier file.

     Header lines go to both outputs. A data line goes to ``outVCF`` when
     the genotype (the text before the first ':') is identical in every
     sample column and is not '.'. Otherwise it goes to ``OUTname``.
     Uses the module-level file handles ``outVCF`` and ``OUTname``.
     """
     if oneLine.startswith('#'):
         outVCF.write(oneLine)
         OUTname.write(oneLine)
         return
     columns = oneLine.strip().split('\t')
     gt = [sample.split(':', 1)[0] for sample in columns[9:]]
     unanimous = all(call == gt[0] for call in gt) and (gt[0] != '.')
     destination = outVCF if unanimous else OUTname
     destination.write(oneLine)


 # Parse the command line; both -i and -o are required.
 opts,args = getopt.getopt(sys.argv[1:],"hi:o:")
 inputFile = None
 prefix = None
 for op,value in opts:
     if op == "-i":
         inputFile = value
     elif op == "-o":
         prefix = value
     elif op == "-h":
         usage()
         sys.exit()

 # Test the parsed values instead of counting arguments, which avoids a
 # NameError when one of the two options is missing.
 if inputFile is None or prefix is None:
     usage()
     sys.exit()

 VCFname = prefix + '.vcf'
 outlier_name = prefix + '_outlier.vcf'

 # process() writes through the module-level names outVCF and OUTname.
 # The outlier path has its own name, so OUTname only ever refers to the
 # open file. The with-statement closes both files even on error.
 with open(VCFname,'w') as outVCF, open(outlier_name,'w') as OUTname:
     for line in fileinput.input(inputFile):
         process(line)

--- a/codescripts/vcf2bed.py
+++ b/codescripts/vcf2bed.py
@@ -0,0 +1,36 @@
 import re

 def position_to_bed(chromo,pos,ref,alt):
     """Convert a VCF position into a BED-style (chromosome, start, end).

     The start is always ``pos - 1``. The interval length is the length of
     ``ref`` for a deletion (``ref`` longer than one character and ``alt``
     exactly one character) and the length of ``alt`` in every other case,
     which gives a one-base interval for a single-base substitution.

     Args:
         chromo: chromosome name, returned unchanged.
         pos: VCF position; anything accepted by ``int``.
         ref: reference allele string.
         alt: alternate allele string.

     Returns:
         Tuple ``(chromo, start, end)`` with integer coordinates.
     """
     is_deletion = len(ref) > 1 and len(alt) == 1
     span = len(ref) if is_deletion else len(alt)
     start = int(pos) - 1
     return chromo, start, start + span

 def padding_region(chromo,pos1,pos2,padding):
     """Return the two intervals flanking ``[pos1, pos2]`` by ``padding`` bases.

     Args:
         chromo: chromosome name, returned unchanged.
         pos1: start of the central interval.
         pos2: end of the central interval.
         padding: number of bases to add on each side.

     Returns:
         Tuple ``(chromo, start1, end1, start2, end2)``: the left flank is
         ``[start1, end1]`` and the right flank is ``[start2, end2]``.
     """
     # Clamp at 0: a negative start is not a valid coordinate for variants
     # closer than ``padding`` bases to the chromosome start.
     StartPos1 = max(0, pos1 - padding)
     EndPos1 = pos1
     StartPos2 = pos2
     EndPos2 = pos2 + padding
     return chromo,StartPos1,EndPos1,StartPos2,EndPos2
--- a/family.tsv
+++ b/family.tsv
@@ -0,0 +1 @@
 #LCL5vcf	LCL6vcf	LCL7vcf	LCL8vcf	LCL5name	LCL6name	LCL7name	LCL8name	familyname
--- a/inputs
+++ b/inputs
@@ -0,0 +1,29 @@
 {
  "{{ project_name }}.LCL7merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.fasta": "GRCh38.d1.vd1.fa",
  "{{ project_name }}.LCL6familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL5VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL6mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
  "{{ project_name }}.mergeSister.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL5mendelian.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/vbt:v1.1",
  "{{ project_name }}.disk_size": "150",
  "{{ project_name }}.inputSamplesFile": "{{ inputSamplesFile }}",
  "{{ project_name }}.LCL6merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL6variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
  "{{ project_name }}.LCL6zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL7familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL5familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL6VCFrename.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL5merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.reformVCF.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/high_confidence_call:v1.1",
  "{{ project_name }}.LCL5zipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.cluster_config": "OnDemand bcs.a2.xlarge img-ubuntu-vpc",
  "{{ project_name }}.LCL8familyzipIndex.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL7variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
  "{{ project_name }}.LCL8merge.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/rtg-tools:latest",
  "{{ project_name }}.LCL5variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
  "{{ project_name }}.LCL8variantsNorm.docker": "registry-vpc.cn-shanghai.aliyuncs.com/pgx-docker-registry/bcftools:v1.9",
  "{{ project_name }}.ref_dir": "oss://chinese-quartet/quartet-storage-data/reference_data/"
 }


--- a/pictures/.DS_Store
+++ b/pictures/.DS_Store
--- a/pictures/workflow.png
+++ b/pictures/workflow.png
--- a/tasks/.DS_Store
+++ b/tasks/.DS_Store
--- a/tasks/VCFinfo.wdl
+++ b/tasks/VCFinfo.wdl
@@ -0,0 +1,23 @@
 # VCFinfo: stub task that declares an `extracted_info` output for a merged VCF.
 # NOTE(review): the command section is empty and the output path is the empty
 # string, so this task looks unfinished; `merged_info` and `sample` are declared
 # but never referenced. workflow.wdl does not import this task.
 task VCFinfo {
 	File merged_info
 	String sample
 	String docker
 	String cluster_config
 	String disk_size
 	
 	command <<<

 		
 
 	>>>

 	runtime {
 		docker:docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		# Data disk spec is assembled from the caller-supplied disk_size.
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}
 	output {
 		# NOTE(review): empty path; fill in once the command writes a file.
 		File extracted_info = ""
 	}
 }
--- a/tasks/VCFrename.wdl
+++ b/tasks/VCFrename.wdl
@@ -0,0 +1,31 @@
 # VCFrename: replace the MOTHER / FATHER / CHILD sample labels of a trio VCF
 # with real sample names via `rtg vcfannotate --relabel`.
 #
 # The parent labels get the `child` tag appended (<parent>.<child>), the child
 # label is used as given. `trio_vcf_idx` is not referenced by the command; it is
 # declared so the index is staged next to the VCF.
 task VCFrename {
 	File trio_vcf_gz
 	File trio_vcf_idx
 	String family_name
 	String child
 	String child_name
 	String mother_name
 	String father_name
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		printf 'MOTHER %s\nFATHER %s\nCHILD %s\n' "${mother_name}.${child}" "${father_name}.${child}" "${child_name}" > rename.txt

 		rtg vcfannotate -i ${trio_vcf_gz} -o ${family_name}.${child}.rename.vcf.gz --relabel=rename.txt
 	>>>

 	runtime {
 		docker: docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File rename_trio_vcf_gz = "${family_name}.${child}.rename.vcf.gz"
 		# The command does not create the index explicitly; presumably rtg
 		# writes it alongside the gzipped output - confirm.
 		File rename_trio_vcf_idx = "${family_name}.${child}.rename.vcf.gz.tbi"
 	}
 }
--- a/tasks/indelNorm.wdl
+++ b/tasks/indelNorm.wdl
@@ -0,0 +1,30 @@
 # indelNorm: keep only chr1-22 and chrX records of a VCF, then normalise the
 # variant representation with `bcftools norm` against the reference FASTA.
 #
 # Inputs:
 #   vcf         uncompressed VCF to process
 #   ref_dir     directory holding the reference; `fasta` is the file name in it
 #   sampleName  prefix for the intermediate and output files
 # Output:
 #   normed_vcf  <sampleName>.normed.vcf
 task indelNorm {
 	File vcf
 	File ref_dir
 	String fasta
 	String sampleName
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		# Header lines are the ones that START with '#'. An unanchored match
 		# would also catch records that merely contain '#' in a field.
 		grep '^#' ${vcf} > header

 		# Keep records whose CHROM column is exactly chr1..chr22 or chrX.
 		grep -v '^#' ${vcf} | grep -E '^chr([1-9]|1[0-9]|2[0-2]|X)[[:blank:]]' > body.filtered

 		cat header body.filtered > ${sampleName}.filtered.vcf

 		/opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf
 	>>>

 	runtime {
 		docker:docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}
 	output {
 		File normed_vcf = "${sampleName}.normed.vcf"
 	}
 }
--- a/tasks/mendelian.wdl
+++ b/tasks/mendelian.wdl
@@ -0,0 +1,34 @@
 # mendelian: run `vbt mendelian` on one child plus the two parents.
 #
 # LCL8 is passed as the mother and LCL7 as the father. All files VBT writes are
 # exposed through `vbt_mendelian`; the trio VCF is additionally copied to
 # <child_name>.family.vcf. `LCL7_name` and `LCL8_name` are accepted but not
 # referenced by the command.
 task mendelian {
 	File child_vcf
 	File LCL7_vcf
 	File LCL8_vcf
 	String child_name
 	String LCL7_name
 	String LCL8_name
 	File ref_dir
 	String fasta
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		export LD_LIBRARY_PATH=/opt/htslib-1.9
 		mkdir VBT

 		/opt/VBT-TrioAnalysis/vbt mendelian \
 			-ref ${ref_dir}/${fasta} \
 			-mother ${LCL8_vcf} \
 			-father ${LCL7_vcf} \
 			-child ${child_vcf} \
 			-outDir VBT \
 			-out-prefix ${child_name}.family \
 			--output-violation-regions \
 			-thread-count $(nproc)

 		cat VBT/${child_name}.family_trio.vcf > ${child_name}.family.vcf
 	>>>

 	runtime {
 		docker: docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File trio_vcf = "${child_name}.family.vcf"
 		Array[File] vbt_mendelian = glob("VBT/*")
 	}
 }
--- a/tasks/merge.wdl
+++ b/tasks/merge.wdl
@@ -0,0 +1,27 @@
 # merge: combine all per-family VCFs of one sample with `rtg vcfmerge` and list
 # the positions that occur more than once in the merged file.
 #
 # Inputs:
 #   family_vcf_gz   bgzipped VCFs to merge
 #   family_vcf_idx  their indexes; not referenced by the command, declared so
 #                   they are staged next to the VCFs
 #   sample          prefix for the output files
 # Outputs:
 #   merged_vcf  <sample>.merged.vcf (uncompressed)
 #   vcf_dup     <sample>.vcf_dup.txt, one CHROM_POS key per duplicated position
 task merge {
 	Array[File] family_vcf_gz
 	Array[File] family_vcf_idx
 	String sample
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		rtg vcfmerge --force-merge-all --no-gzip -o ${sample}.merged.vcf ${sep=" " family_vcf_gz}

 		# Skip header lines only (anchored at line start) so records that
 		# contain '#' in a field are still checked; uniq -d prints each
 		# CHROM_POS key that appears more than once.
 		grep -v '^#' ${sample}.merged.vcf | cut -f1-2 | tr '\t' '_' | sort | uniq -d > ${sample}.vcf_dup.txt
 	>>>

 	runtime {
 		docker:docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}
 	output {
 		File merged_vcf = "${sample}.merged.vcf"
 		File vcf_dup = "${sample}.vcf_dup.txt"
 	}
 }
--- a/tasks/mergeSister.wdl
+++ b/tasks/mergeSister.wdl
@@ -0,0 +1,34 @@
 # mergeSister: merge the renamed LCL5 and LCL6 trio VCFs into one table that
 # carries the INFO column of both trios.
 #
 # The two VCFs are merged twice, once in each order. Column 8 of the LCL5-first
 # merge is appended as an extra trailing column to every non-'##' line of the
 # LCL6-first merge (the #CHROM line included), and the '##' header of the
 # LCL5-first merge is put on top.
 # NOTE(review): the paste assumes both merges emit the same records in the same
 # order - confirm.
 #
 # The *_idx inputs are not referenced by the command; they are declared so the
 # indexes are staged next to the VCFs.
 # Output:
 #   family_mendelian_info  <family_name>.trio.info.vcf
 task mergeSister {
 	File LCL5_trio_vcf_gz
 	File LCL5_trio_vcf_idx
 	File LCL6_trio_vcf_gz
 	File LCL6_trio_vcf_idx
 	String family_name
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		# Stop at the first failing command. Without this a failed merge is
 		# masked: the redirections below still create their files and the
 		# final cat exits 0.
 		set -e

 		rtg vcfmerge -o LCL5.LCL6.merged.vcf.gz ${LCL5_trio_vcf_gz} ${LCL6_trio_vcf_gz}

 		rtg vcfmerge -o LCL6.LCL5.merged.vcf.gz ${LCL6_trio_vcf_gz} ${LCL5_trio_vcf_gz}

 		# Anchor on line start so a record that merely contains '##' in a
 		# field is not mistaken for a meta-information line.
 		zcat LCL5.LCL6.merged.vcf.gz | grep '^##' > header
 		zcat LCL5.LCL6.merged.vcf.gz | grep -v '^##' | cut -f8 > LCL5.mendelian
 		zcat LCL6.LCL5.merged.vcf.gz | grep -v '^##' | paste - LCL5.mendelian > body

 		cat header body > ${family_name}.trio.info.vcf
 	>>>

 	runtime {
 		docker:docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File family_mendelian_info = "${family_name}.trio.info.vcf"
 	}

 }
--- a/tasks/mergeVCFInfo.wdl
+++ b/tasks/mergeVCFInfo.wdl
@@ -0,0 +1,24 @@
 # mergeVCFInfo: merge a set of bgzipped VCFs into one uncompressed VCF named
 # <sample>.merged.info.vcf using `rtg vcfmerge --force-merge-all`.
 #
 # `vcf_idx` is not referenced by the command; it is declared so the indexes are
 # staged next to the VCFs.
 task mergeVCFInfo {
 	String sample
 	Array[File] vcf_gz
 	Array[File] vcf_idx
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		rtg vcfmerge \
 			--force-merge-all \
 			--no-gzip \
 			-o ${sample}.merged.info.vcf \
 			${sep=" " vcf_gz}
 	>>>

 	runtime {
 		docker: docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File merged_info = "${sample}.merged.info.vcf"
 	}
 }
--- a/tasks/oneClass.wdl
+++ b/tasks/oneClass.wdl
@@ -0,0 +1,39 @@
 # oneClass: run /opt/oneClass.py twice, once on the SNV train/test pair and
 # once on the indel train/test pair.
 #
 # `sampleName` defaults to the SNV training file name with the suffix
 # ".normed.snv.train.txt" removed. Each run is named <sampleName>_snv or
 # <sampleName>_indel, and five files per run are collected as outputs.
 task oneClass {
 	File snv_train_vcf
 	File snv_test_vcf
 	File indel_train_vcf
 	File indel_test_vcf
 	String sampleName = basename(snv_train_vcf,".normed.snv.train.txt")
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		python /opt/oneClass.py \
 			-train ${snv_train_vcf} \
 			-test ${snv_test_vcf} \
 			-name ${sampleName}_snv

 		python /opt/oneClass.py \
 			-train ${indel_train_vcf} \
 			-test ${indel_test_vcf} \
 			-name ${sampleName}_indel
 	>>>

 	runtime {
 		docker: docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		# SNV run
 		File snv_true_txt = "${sampleName}_snv_predicted_true.txt"
 		File snv_false_txt = "${sampleName}_snv_predicted_false.txt"
 		File snv_true_bed = "${sampleName}_snv_predicted_true.bed"
 		File snv_false_bed = "${sampleName}_snv_predicted_false.bed"
 		File snv_padding = "${sampleName}_snv_padding.bed"

 		# Indel run
 		File indel_true_txt = "${sampleName}_indel_predicted_true.txt"
 		File indel_false_txt = "${sampleName}_indel_predicted_false.txt"
 		File indel_true_bed = "${sampleName}_indel_predicted_true.bed"
 		File indel_false_bed = "${sampleName}_indel_predicted_false.bed"
 		File indel_padding = "${sampleName}_indel_padding.bed"
 	}
 }

--- a/tasks/reformVCF.wdl
+++ b/tasks/reformVCF.wdl
@@ -0,0 +1,30 @@
 # reformVCF: run /opt/reformVCF.py on the merged two-trio table and collect one
 # VCF per quartet member plus a family-level VCF.
 #
 # Inputs:
 #   family_mendelian_info  merged trio table passed to the script as -vcf
 #   family_name            output file prefix passed to the script as -name
 # Outputs:
 #   <family_name>.LCL5.vcf, .LCL6.vcf, .LCL7.vcf, .LCL8.vcf and <family_name>.vcf
 task reformVCF {
 	File family_mendelian_info
 	# A plain name prefix, not a path: declared as String (as in the other
 	# tasks of this pipeline) so the engine does not try to stage it as a file
 	# or expand it to a localized path in the command and output names.
 	String family_name
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		python /opt/reformVCF.py -vcf ${family_mendelian_info} -name ${family_name}
 	>>>

 	runtime {
 		docker:docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File LCL5_family_info = "${family_name}.LCL5.vcf"
 		File LCL6_family_info = "${family_name}.LCL6.vcf"
 		File LCL7_family_info = "${family_name}.LCL7.vcf"
 		File LCL8_family_info = "${family_name}.LCL8.vcf"
 		File family_info = "${family_name}.vcf"
 	}
 }

--- a/tasks/sister.wdl
+++ b/tasks/sister.wdl
@@ -0,0 +1,35 @@
 # sister: compare the LCL5 and LCL6 call sets with `vbt varcomp`
 # (LCL5 as -called, LCL6 as -base, no filtering) and rename the result files.
 #
 #   TPBase.vcf -> <family_name>.sister.consistent.vcf
 #   FP.vcf     -> <family_name>.LCL5.uniq.vcf
 #   FN.vcf     -> <family_name>.LCL6.uniq.vcf
 #   log.txt    -> <family_name>.sister.vbt.log.txt
 task sister {
 	String family_name
 	File LCL5_vcf
 	File LCL6_vcf
 	File ref_dir
 	String fasta
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		export LD_LIBRARY_PATH=/opt/htslib-1.9
 		mkdir sister

 		/opt/VBT-TrioAnalysis/vbt varcomp \
 			-called ${LCL5_vcf} \
 			-base ${LCL6_vcf} \
 			-ref ${ref_dir}/${fasta} \
 			-outDir sister \
 			-filter none

 		mv sister/TPBase.vcf ${family_name}.sister.consistent.vcf
 		mv sister/FP.vcf ${family_name}.LCL5.uniq.vcf
 		mv sister/FN.vcf ${family_name}.LCL6.uniq.vcf
 		mv sister/log.txt ${family_name}.sister.vbt.log.txt
 	>>>

 	runtime {
 		docker: docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File sister_consistent_vcf = "${family_name}.sister.consistent.vcf"
 		File LCL5_uniq = "${family_name}.LCL5.uniq.vcf"
 		File LCL6_uniq = "${family_name}.LCL6.uniq.vcf"
 		File log = "${family_name}.sister.vbt.log.txt"
 	}
 }
--- a/tasks/variantsNorm.wdl
+++ b/tasks/variantsNorm.wdl
@@ -0,0 +1,30 @@
 # variantsNorm: keep only chr1-22 and chrX records of a VCF, then normalise the
 # variant representation with `bcftools norm` against the reference FASTA.
 #
 # Inputs:
 #   vcf         uncompressed VCF to process
 #   ref_dir     directory holding the reference; `fasta` is the file name in it
 #   sampleName  prefix for the intermediate and output files
 # Output:
 #   normed_vcf  <sampleName>.normed.vcf
 task variantsNorm {
 	File vcf
 	File ref_dir
 	String fasta
 	String sampleName
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		# Header lines are the ones that START with '#'. An unanchored match
 		# would also catch records that merely contain '#' in a field.
 		grep '^#' ${vcf} > header

 		# Keep records whose CHROM column is exactly chr1..chr22 or chrX.
 		grep -v '^#' ${vcf} | grep -E '^chr([1-9]|1[0-9]|2[0-2]|X)[[:blank:]]' > body.filtered

 		cat header body.filtered > ${sampleName}.filtered.vcf

 		/opt/hall-lab/bcftools-1.9/bin/bcftools norm -f ${ref_dir}/${fasta} ${sampleName}.filtered.vcf > ${sampleName}.normed.vcf
 	>>>

 	runtime {
 		docker:docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}
 	output {
 		File normed_vcf = "${sampleName}.normed.vcf"
 	}
 }
--- a/tasks/votes.wdl
+++ b/tasks/votes.wdl
@@ -0,0 +1,25 @@
 # votes: run /opt/high_confidence_call_vote.py on a merged VCF and its
 # duplicate-position list, then derive a reduced call set from the annotated
 # result.
 #
 # The reduced file keeps columns 1-9 and 45 of <prefix>_annotated.vcf and drops
 # every line containing one of the labels listed below.
 # NOTE(review): the label spellings ('speicifc') and the output file name
 # ('bechmarking') are kept verbatim; presumably the labels must match the text
 # written by the vote script - confirm before correcting them.
 task votes {
 	File merged_vcf
 	File vcf_dup
 	String sample
 	String prefix
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		python /opt/high_confidence_call_vote.py -vcf ${merged_vcf} -dup ${vcf_dup} -sample ${sample} -prefix ${prefix}

 		cut -f1-9,45 ${prefix}_annotated.vcf \
 			| grep -v \
 				-e 'filtered' \
 				-e 'confirm for parents' \
 				-e 'pcr-free-speicifc' \
 				-e 'pcr-speicifc' \
 				-e 'dupVar' \
 			> ${prefix}_bechmarking_calls.vcf
 	>>>

 	runtime {
 		docker: docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File annotated_vcf = "${prefix}_annotated.vcf"
 		File benchmark_call = "${prefix}_bechmarking_calls.vcf"
 	}
 }
--- a/tasks/zipIndex.wdl
+++ b/tasks/zipIndex.wdl
@@ -0,0 +1,25 @@
 # zipIndex: bgzip a VCF to <family_name>.<sample>.vcf.gz with `rtg bgzip` and
 # build its index with `rtg index -f vcf`.
 #
 # Outputs:
 #   vcf_gz   the compressed VCF
 #   vcf_idx  the matching .tbi index
 task zipIndex {
 	File vcf
 	String family_name
 	String sample
 	String docker
 	String cluster_config
 	String disk_size

 	command <<<
 		rtg bgzip ${vcf} -c > ${family_name}.${sample}.vcf.gz

 		rtg index -f vcf ${family_name}.${sample}.vcf.gz
 	>>>

 	runtime {
 		docker: docker
 		cluster: cluster_config
 		systemDisk: "cloud_ssd 40"
 		dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}

 	output {
 		File vcf_gz = "${family_name}.${sample}.vcf.gz"
 		File vcf_idx = "${family_name}.${sample}.vcf.gz.tbi"
 	}
 }
--- a/workflow.wdl
+++ b/workflow.wdl
@@ -0,0 +1,207 @@
 import "./tasks/variantsNorm.wdl" as variantsNorm
 import "./tasks/mendelian.wdl" as mendelian
 import "./tasks/zipIndex.wdl" as zipIndex
 import "./tasks/VCFrename.wdl" as VCFrename
 import "./tasks/mergeSister.wdl" as mergeSister
 import "./tasks/reformVCF.wdl" as reformVCF
 import "./tasks/merge.wdl" as merge

 # Quartet high-confidence small-variant integration workflow.
 #
 # Each row of `inputSamplesFile` is processed independently inside the scatter:
 # normalise the four VCFs, run the two trio analyses, compress and relabel the
 # trio VCFs, merge them, and split the result into one VCF per quartet member.
 # After the scatter, the per-row VCFs of each member are merged into one file.
 #
 # Columns of a row as used below:
 #   quartet[0..3]  VCF files of LCL5, LCL6, LCL7, LCL8
 #   quartet[4..7]  names used for LCL5, LCL6, LCL7, LCL8
 #   quartet[8]     family name (output file prefix)
 #
 # None of the calls sets the task input `docker`, so it has to be supplied
 # per call alias through the workflow inputs.
 workflow {{ project_name }} {
 	File inputSamplesFile
 	# One inner array per row of the TSV.
 	Array[Array[File]] inputSamples = read_tsv(inputSamplesFile)
 	File ref_dir
 	String fasta
 	String cluster_config
 	String disk_size

 	scatter (quartet in inputSamples){
 		# Step 1: filter and normalise each of the four input VCFs.
 		call variantsNorm.variantsNorm as LCL5variantsNorm{
 			input:
 			vcf=quartet[0],
 			ref_dir=ref_dir,
 			fasta=fasta,
 			sampleName=quartet[4],
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		call variantsNorm.variantsNorm as LCL6variantsNorm{
 			input:
 			vcf=quartet[1],
 			ref_dir=ref_dir,
 			fasta=fasta,
 			sampleName=quartet[5],
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		call variantsNorm.variantsNorm as LCL7variantsNorm{
 			input:
 			vcf=quartet[2],
 			ref_dir=ref_dir,
 			fasta=fasta,
 			sampleName=quartet[6],
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		call variantsNorm.variantsNorm as LCL8variantsNorm{
 			input:
 			vcf=quartet[3],
 			ref_dir=ref_dir,
 			fasta=fasta,
 			sampleName=quartet[7],
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		# Step 2: trio analysis, once with LCL5 and once with LCL6 as the child;
 		# both runs use the same LCL7 and LCL8 VCFs.
 		call mendelian.mendelian as LCL5mendelian {
 			input:
 			child_vcf=LCL5variantsNorm.normed_vcf,
 			LCL7_vcf=LCL7variantsNorm.normed_vcf,
 			LCL8_vcf=LCL8variantsNorm.normed_vcf,
 			LCL7_name=quartet[6],
 			LCL8_name=quartet[7],
 			child_name=quartet[4],
 			ref_dir=ref_dir,
 			fasta=fasta,
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		call mendelian.mendelian as LCL6mendelian {
 			input:
 			child_vcf=LCL6variantsNorm.normed_vcf,
 			LCL7_vcf=LCL7variantsNorm.normed_vcf,
 			LCL8_vcf=LCL8variantsNorm.normed_vcf,
 			LCL7_name=quartet[6],
 			LCL8_name=quartet[7],
 			child_name=quartet[5],
 			ref_dir=ref_dir,
 			fasta=fasta,
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		# Step 3: compress and index the two trio VCFs.
 		call zipIndex.zipIndex as LCL5zipIndex {
 			input:
 			vcf=LCL5mendelian.trio_vcf,
 			sample="LCL5",
 			family_name=quartet[8],
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		call zipIndex.zipIndex as LCL6zipIndex {
 			input:
 			vcf=LCL6mendelian.trio_vcf,
 			sample="LCL6",
 			family_name=quartet[8],
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}		
 		# Step 4: relabel the sample columns of each trio VCF
 		# (mother = LCL8 name, father = LCL7 name).
 		call VCFrename.VCFrename as LCL5VCFrename {
 			input:
 			trio_vcf_gz=LCL5zipIndex.vcf_gz,
 			trio_vcf_idx=LCL5zipIndex.vcf_idx,
 			mother_name=quartet[7],
 			father_name=quartet[6],
 			child_name=quartet[4],
 			family_name=quartet[8],
 			child="LCL5",
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		call VCFrename.VCFrename as LCL6VCFrename {
 			input:
 			trio_vcf_gz=LCL6zipIndex.vcf_gz,
 			trio_vcf_idx=LCL6zipIndex.vcf_idx,
 			mother_name=quartet[7],
 			father_name=quartet[6],
 			child_name=quartet[5],
 			family_name=quartet[8],
 			child="LCL6",
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		# Step 5: merge the two relabelled trio VCFs into one table.
 		call mergeSister.mergeSister as mergeSister {
 			input:
 			LCL5_trio_vcf_gz=LCL5VCFrename.rename_trio_vcf_gz,
 			LCL5_trio_vcf_idx=LCL5VCFrename.rename_trio_vcf_idx,
 			LCL6_trio_vcf_gz=LCL6VCFrename.rename_trio_vcf_gz,
 			LCL6_trio_vcf_idx=LCL6VCFrename.rename_trio_vcf_idx,
 			family_name=quartet[8],
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		# Step 6: split the merged table into one VCF per quartet member.
 		call reformVCF.reformVCF as reformVCF {
 			input:
 			family_mendelian_info=mergeSister.family_mendelian_info,
 			family_name=quartet[8],
 			cluster_config=cluster_config,
 			disk_size=disk_size			
 		}
 		# Step 7: compress and index the four per-member VCFs.
 		call zipIndex.zipIndex as LCL5familyzipIndex {
 			input:
 			vcf=reformVCF.LCL5_family_info,
 			sample='LCL5',
 			family_name=quartet[8],
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		call zipIndex.zipIndex as LCL6familyzipIndex {
 			input:
 			vcf=reformVCF.LCL6_family_info,
 			sample='LCL6',
 			family_name=quartet[8],
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		call zipIndex.zipIndex as LCL7familyzipIndex {
 			input:
 			vcf=reformVCF.LCL7_family_info,
 			sample='LCL7',
 			family_name=quartet[8],
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}
 		call zipIndex.zipIndex as LCL8familyzipIndex {
 			input:
 			vcf=reformVCF.LCL8_family_info,
 			sample='LCL8',
 			family_name=quartet[8],
 			cluster_config=cluster_config,
 			disk_size=disk_size
 		}		
 	}
 	# Step 8: outside the scatter the zipIndex outputs are arrays (one element
 	# per input row); merge them into a single VCF per quartet member.
 	call merge.merge as LCL5merge {
 		input:
 		family_vcf_gz=LCL5familyzipIndex.vcf_gz,
 		family_vcf_idx=LCL5familyzipIndex.vcf_idx,
 		sample="LCL5",
 		cluster_config=cluster_config,
 		disk_size=disk_size
 	}
 	
 	call merge.merge as LCL6merge {
 		input:
 		family_vcf_gz=LCL6familyzipIndex.vcf_gz,
 		family_vcf_idx=LCL6familyzipIndex.vcf_idx,
 		sample="LCL6",
 		cluster_config=cluster_config,
 		disk_size=disk_size
 	}
 	
 	call merge.merge as LCL7merge {
 		input:
 		family_vcf_gz=LCL7familyzipIndex.vcf_gz,
 		family_vcf_idx=LCL7familyzipIndex.vcf_idx,
 		sample="LCL7",
 		cluster_config=cluster_config,
 		disk_size=disk_size
 	}
 	
 	call merge.merge as LCL8merge {
 		input:
 		family_vcf_gz=LCL8familyzipIndex.vcf_gz,
 		family_vcf_idx=LCL8familyzipIndex.vcf_idx,
 		sample="LCL8",
 		cluster_config=cluster_config,
 		disk_size=disk_size
 	}
 	
 }
			@@ -0,0 +1 @@
			#LCL5vcf LCL6vcf LCL7vcf LCL8vcf LCL5name LCL6name LCL7name LCL8name familyname