You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

79 line
3.2KB

  1. import sys, argparse, os
  2. import fileinput
  3. import re
  4. import statistics
  5. # input arguments
  6. parser = argparse.ArgumentParser(description="this script is to intergeate vcf information, variants quality and location")
  7. parser.add_argument('-vcf', '--multi_sample_vcf', type=str, help='The VCF file you want to count the voting number', required=True)
  8. parser.add_argument('-prefix', '--prefix', type=str, help='Prefix of output file name', required=True)
  9. args = parser.parse_args()
  10. multi_sample_vcf = args.multi_sample_vcf
  11. prefix = args.prefix
  12. def get_location(info):
  13. repeat = ''
  14. if 'ANN' in info:
  15. strings = info.strip().split(';')
  16. for element in strings:
  17. m = re.match('ANN',element)
  18. if m is not None:
  19. repeat = element.split('=')[1]
  20. else:
  21. repeat = '.'
  22. return repeat
  23. def extract_info_normal(strings):
  24. AF = []
  25. GQ = []
  26. MQ = []
  27. DP = []
  28. ALT = []
  29. for element in strings:
  30. if element == '.':
  31. pass
  32. else:
  33. ad = element.split(':')[1]
  34. ref = ad.split(',')[0]
  35. alt = ad.split(',')[1]
  36. af = float(int(alt)/(int(ref) + int(alt)))
  37. gq = int(element.split(':')[3])
  38. mq = float(element.split(':')[5])
  39. dp = int(element.split(':')[2])
  40. AF.append(af)
  41. GQ.append(gq)
  42. MQ.append(mq)
  43. DP.append(dp)
  44. ALT.append(int(alt))
  45. AF_m = statistics.mean(AF)
  46. GQ_m = statistics.mean(GQ)
  47. MQ_m = statistics.mean(MQ)
  48. DP_a = sum(DP)
  49. ALT_a = sum(ALT)
  50. return AF_m,GQ_m,MQ_m,DP_a,ALT_a
  51. file_name = prefix + '_variant_quality_location.txt'
  52. outfile = open(file_name,'w')
  53. outputcolumn = '#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tQuartet_DNA_BGI_SEQ2000_BGI_1_20180518_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_2_20180530_LCL5\tQuartet_DNA_BGI_SEQ2000_BGI_3_20180530_LCL5\tQuartet_DNA_BGI_T7_WGE_1_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_2_20191105_LCL5\tQuartet_DNA_BGI_T7_WGE_3_20191105_LCL5\tQuartet_DNA_ILM_Nova_ARD_1_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_2_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_3_20181108_LCL5\tQuartet_DNA_ILM_Nova_ARD_4_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_5_20190111_LCL5\tQuartet_DNA_ILM_Nova_ARD_6_20190111_LCL5\tQuartet_DNA_ILM_Nova_BRG_1_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_2_20180930_LCL5\tQuartet_DNA_ILM_Nova_BRG_3_20180930_LCL5\tQuartet_DNA_ILM_Nova_WUX_1_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_2_20190917_LCL5\tQuartet_DNA_ILM_Nova_WUX_3_20190917_LCL5\tQuartet_DNA_ILM_XTen_ARD_1_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_2_20170403_LCL5\tQuartet_DNA_ILM_XTen_ARD_3_20170403_LCL5\tQuartet_DNA_ILM_XTen_NVG_1_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_2_20170329_LCL5\tQuartet_DNA_ILM_XTen_NVG_3_20170329_LCL5\tQuartet_DNA_ILM_XTen_WUX_1_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_2_20170216_LCL5\tQuartet_DNA_ILM_XTen_WUX_3_20170216_LCL5' +'\t'+ 'location' + '\t' + 'AF' + '\t' + 'GQ' + '\t' + 'MQ' + '\t' + 'DP' + '\t' + 'ALT' +'\n'
  54. outfile.write(outputcolumn)
  55. for line in fileinput.input(multi_sample_vcf):
  56. m = re.match('^\#',line)
  57. if m is not None:
  58. pass
  59. else:
  60. line = line.strip()
  61. strings = line.split('\t')
  62. repeat = get_location(strings[7])
  63. AF,GQ,MQ,DP,ALT = extract_info_normal(strings[9:])
  64. outLine = '\t'.join(strings) + '\t' + repeat +'\t' + str(AF) + '\t' + str(GQ) + '\t' + str(MQ) + '\t' + str(DP) + '\t' + str(ALT) + '\n'
  65. outfile.write(outLine)