Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

merge_mendelian_vcfinfo.py 5.0KB

pirms 2 gadiem
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129
  1. from __future__ import division
  2. import pandas as pd
  3. import sys, argparse, os
  4. import fileinput
  5. import re
  6. # input arguments
  7. parser = argparse.ArgumentParser(description="this script is to get final high confidence calls and information of all replicates")
  8. parser.add_argument('-vcfInfo', '--vcfInfo', type=str, help='The txt file of variants information, this file is named as prefix__variant_quality_location.txt', required=True)
  9. parser.add_argument('-mendelianInfo', '--mendelianInfo', type=str, help='The merged mendelian information of all samples', required=True)
  10. parser.add_argument('-sample', '--sample_name', type=str, help='which sample of quartet', required=True)
  11. args = parser.parse_args()
  12. vcfInfo = args.vcfInfo
  13. mendelianInfo = args.mendelianInfo
  14. sample_name = args.sample_name
  15. #GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL
  16. vcf_header = '''##fileformat=VCFv4.2
  17. ##fileDate=20200331
  18. ##source=high_confidence_calls_intergration(choppy app)
  19. ##reference=GRCh38.d1.vd1
  20. ##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">
  21. ##FORMAT=<ID=TWINS,Number=1,Type=Flag,Description="1 for sister consistent, 0 for sister different">
  22. ##FORMAT=<ID=TRIO5,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL5 mendelian consistent, 0 for family violation">
  23. ##FORMAT=<ID=TRIO6,Number=1,Type=Flag,Description="1 for LCL7, LCL8 and LCL6 mendelian consistent, 0 for family violation">
  24. ##FORMAT=<ID=DP,Number=1,Type=Integer,Description="Depth">
  25. ##FORMAT=<ID=ALT,Number=1,Type=Integer,Description="Alternative Depth">
  26. ##FORMAT=<ID=AF,Number=1,Type=Float,Description="Allele frequency">
  27. ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype quality">
  28. ##FORMAT=<ID=QD,Number=1,Type=Float,Description="Variant Confidence/Quality by Depth">
  29. ##FORMAT=<ID=MQ,Number=1,Type=Float,Description="Mapping quality">
  30. ##FORMAT=<ID=FS,Number=1,Type=Float,Description="Phred-scaled p-value using Fisher's exact test to detect strand bias">
  31. ##FORMAT=<ID=QUAL,Number=1,Type=Float,Description="variant quality">
  32. ##contig=<ID=chr1,length=248956422>
  33. ##contig=<ID=chr2,length=242193529>
  34. ##contig=<ID=chr3,length=198295559>
  35. ##contig=<ID=chr4,length=190214555>
  36. ##contig=<ID=chr5,length=181538259>
  37. ##contig=<ID=chr6,length=170805979>
  38. ##contig=<ID=chr7,length=159345973>
  39. ##contig=<ID=chr8,length=145138636>
  40. ##contig=<ID=chr9,length=138394717>
  41. ##contig=<ID=chr10,length=133797422>
  42. ##contig=<ID=chr11,length=135086622>
  43. ##contig=<ID=chr12,length=133275309>
  44. ##contig=<ID=chr13,length=114364328>
  45. ##contig=<ID=chr14,length=107043718>
  46. ##contig=<ID=chr15,length=101991189>
  47. ##contig=<ID=chr16,length=90338345>
  48. ##contig=<ID=chr17,length=83257441>
  49. ##contig=<ID=chr18,length=80373285>
  50. ##contig=<ID=chr19,length=58617616>
  51. ##contig=<ID=chr20,length=64444167>
  52. ##contig=<ID=chr21,length=46709983>
  53. ##contig=<ID=chr22,length=50818468>
  54. ##contig=<ID=chrX,length=156040895>
  55. '''
  56. # output file
  57. file_name = sample_name + '_mendelian_vcfInfo.vcf'
  58. outfile = open(file_name,'w')
  59. outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t' + sample_name + '\n'
  60. outfile.write(vcf_header)
  61. outfile.write(outputcolumn)
  62. # input files
  63. vcf_info = pd.read_table(vcfInfo)
  64. mendelian_info = pd.read_table(mendelianInfo)
  65. merged_df = pd.merge(vcf_info, mendelian_info, how='outer', left_on=['#CHROM','POS'], right_on = ['#CHROM','POS'])
  66. merged_df = merged_df.fillna('.')
  67. #
  68. def parse_INFO(info):
  69. strings = info.strip().split(';')
  70. keys = []
  71. values = []
  72. for i in strings:
  73. kv = i.split('=')
  74. if kv[0] == 'DB':
  75. keys.append('DB')
  76. values.append('1')
  77. else:
  78. keys.append(kv[0])
  79. values.append(kv[1])
  80. infoDict = dict(zip(keys, values))
  81. return infoDict
  82. #
  83. for row in merged_df.itertuples():
  84. if row[18] != '.':
  85. # format
  86. # GT:TWINS:TRIO5:TRIO6:DP:AF:GQ:QD:MQ:FS:QUAL
  87. FORMAT_x = row[10].split(':')
  88. ALT = int(FORMAT_x[1].split(',')[1])
  89. if int(FORMAT_x[2]) != 0:
  90. AF = round(ALT/int(FORMAT_x[2]),2)
  91. else:
  92. AF = '.'
  93. INFO_x = parse_INFO(row.INFO_x)
  94. if FORMAT_x[2] == '0':
  95. INFO_x['QD'] = '.'
  96. else:
  97. pass
  98. FORMAT = row[18] + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x)
  99. # outline
  100. outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_y + '\t' + row.ALT_y + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL' + '\t' + FORMAT + '\n'
  101. else:
  102. rawGT = row[10].split(':')
  103. FORMAT_x = row[10].split(':')
  104. ALT = int(FORMAT_x[1].split(',')[1])
  105. if int(FORMAT_x[2]) != 0:
  106. AF = round(ALT/int(FORMAT_x[2]),2)
  107. else:
  108. AF = '.'
  109. INFO_x = parse_INFO(row.INFO_x)
  110. if FORMAT_x[2] == '0':
  111. INFO_x['QD'] = '.'
  112. else:
  113. pass
  114. FORMAT = '.:.:.:.' + ':' + FORMAT_x[2] + ':' + str(ALT) + ':' + str(AF) + ':' + FORMAT_x[3] + ':' + INFO_x['QD'] + ':' + INFO_x['MQ'] + ':' + INFO_x['FS'] + ':' + str(row.QUAL_x) + ':' + rawGT[0]
  115. # outline
  116. outline = row._1 + '\t' + str(row.POS) + '\t' + row.ID_x + '\t' + row.REF_x + '\t' + row.ALT_x + '\t' + '.' + '\t' + '.' + '\t' + '.' + '\t' + 'GT:TWINS:TRIO5:TRIO6:DP:ALT:AF:GQ:QD:MQ:FS:QUAL:rawGT' + '\t' + FORMAT + '\n'
  117. outfile.write(outline)