"""Write a per-sample VCF combining variant-quality and Mendelian information.

The script reads two tab-separated tables (``--vcfInfo`` and
``--mendelianInfo``), outer-merges them on ``#CHROM`` and ``POS`` and writes
``<sample_name>_mendelian_vcfInfo.vcf`` into the current directory, one
record per merged row.
"""
from __future__ import division

import argparse
import fileinput
import os
import re
import sys

import pandas as pd

# GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL
# NOTE(review): every ##FORMAT= and ##contig= line below has no <...>
# definition; the text is kept exactly as found -- confirm against the
# intended header before relying on strict VCF parsers.
vcf_header = '''##fileformat=VCFv4.2
##fileDate=20200331
##source=high_confidence_calls_intergration(choppy app)
##reference=GRCh38.d1.vd1
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
##FORMAT=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
##contig=
'''

# FORMAT keys written for rows that have a Mendelian call ...
CONSENSUS_FORMAT_KEYS = 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL'
# ... and for rows without one, which additionally carry the raw genotype.
RAW_FORMAT_KEYS = CONSENSUS_FORMAT_KEYS + ':rawGT'


def parse_INFO(info):
    """Parse a ``;``-separated INFO string into a dict of strings.

    ``key=value`` entries map ``key`` to ``value`` (split on the first ``=``
    only, so a value containing ``=`` is kept whole).  ``DB`` always maps to
    ``'1'``.  Any other entry without ``=`` is treated as a flag and also
    maps to ``'1'`` instead of raising ``IndexError``.  Empty segments (for
    example from a trailing ``;``) are skipped.  When a key repeats, the last
    occurrence wins.
    """
    info_dict = {}
    for entry in info.strip().split(';'):
        if not entry:
            continue
        key, separator, value = entry.partition('=')
        if key == 'DB' or not separator:
            info_dict[key] = '1'
        else:
            info_dict[key] = value
    return info_dict


def format_sample_metrics(sample_field, info_field, qual):
    """Return the ``DP:ALT:AF:GQ:QD:MQ:FS:QUAL`` part of a sample column.

    ``sample_field`` is a ``:``-separated sample value whose second item is a
    comma-separated pair (its second number is used as the ALT count), whose
    third item is the depth and whose fourth item is copied through as GQ
    (presumably a ``GT:AD:DP:GQ`` layout -- TODO confirm against the input).
    ``info_field`` is the INFO string that supplies QD, MQ and FS; a key
    that is absent is written as ``.``.  ``qual`` is appended via ``str()``.

    Raises ``IndexError``/``ValueError`` when ``sample_field`` does not have
    the shape described above.
    """
    fields = sample_field.split(':')
    alt_count = int(fields[1].split(',')[1])
    depth = fields[2]
    if int(depth) != 0:
        allele_fraction = round(alt_count / int(depth), 2)
    else:
        # No reads: the fraction is undefined, so emit the missing value.
        allele_fraction = '.'
    info = parse_INFO(info_field)
    # QD is reported as missing whenever the depth field is exactly '0'.
    qd = '.' if depth == '0' else info.get('QD', '.')
    return ':'.join([
        depth,
        str(alt_count),
        str(allele_fraction),
        fields[3],
        qd,
        info.get('MQ', '.'),
        info.get('FS', '.'),
        str(qual),
    ])


def format_record(row):
    """Build one tab-separated output line (with trailing newline) for ``row``.

    ``row`` is a tuple produced by ``itertuples()`` on the merged table.
    Position 10 is read as the sample value of the variant-quality table and
    position 18 as the Mendelian value (presumably the last column of each
    input -- positions depend on the input column order; TODO confirm).
    ``row._1`` is the ``#CHROM`` column, renamed by pandas because the
    original name is not a valid identifier.

    When the Mendelian value is not ``.`` it prefixes the sample metrics and
    REF/ALT come from the ``_y`` columns; otherwise ``.:.:.:.`` is used as
    the prefix, REF/ALT come from the ``_x`` columns and the raw genotype
    (first item of position 10) is appended under ``rawGT``.
    """
    mendelian_value = row[18]
    metrics = format_sample_metrics(row[10], row.INFO_x, row.QUAL_x)
    if mendelian_value != '.':
        ref, alt = row.REF_y, row.ALT_y
        format_keys = CONSENSUS_FORMAT_KEYS
        sample_value = mendelian_value + ':' + metrics
    else:
        raw_gt = row[10].split(':')[0]
        ref, alt = row.REF_x, row.ALT_x
        format_keys = RAW_FORMAT_KEYS
        sample_value = '.:.:.:.' + ':' + metrics + ':' + raw_gt
    columns = [
        row._1, str(row.POS), row.ID_x, ref, alt,
        '.', '.', '.', format_keys, sample_value,
    ]
    return '\t'.join(columns) + '\n'


def main():
    """Parse the command line, merge both tables and write the VCF."""
    # input arguments
    parser = argparse.ArgumentParser(description="this script is to get final high confidence calls and information of all replicates")
    parser.add_argument('-vcfInfo', '--vcfInfo', type=str, help='The txt file of variants information, this file is named as prefix__variant_quality_location.txt', required=True)
    parser.add_argument('-mendelianInfo', '--mendelianInfo', type=str, help='The merged mendelian information of all samples', required=True)
    parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
    args = parser.parse_args()
    sample_name = args.sample_name

    # output file; the context manager guarantees it is flushed and closed
    # even when a malformed row raises part-way through.
    file_name = sample_name + '_mendelian_vcfInfo.vcf'
    with open(file_name, 'w') as outfile:
        outfile.write(vcf_header)
        outfile.write('#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '\n')

        # input files
        vcf_info = pd.read_table(args.vcfInfo)
        mendelian_info = pd.read_table(args.mendelianInfo)
        merged_df = pd.merge(vcf_info, mendelian_info, how='outer', left_on=['#CHROM', 'POS'], right_on=['#CHROM', 'POS'])
        # Cells missing on either side of the outer merge become '.'.
        merged_df = merged_df.fillna('.')

        for row in merged_df.itertuples():
            outfile.write(format_record(row))


if __name__ == '__main__':
    main()