您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

95 行
2.2KB

  1. from __future__ import division
  2. import pandas as pd
  3. import sys, argparse, os
  4. # input arguments
  5. parser = argparse.ArgumentParser(description="this script is to calculate reproducibility between Quartet_D5 and Quartet_D6s")
  6. parser.add_argument('-sister', '--sister', type=str, help='sister.txt', required=True)
  7. parser.add_argument('-project', '--project', type=str, help='project name', required=True)
  8. args = parser.parse_args()
  9. sister_file = args.sister
  10. project_name = args.project
  11. # output file
  12. output_name = project_name + '.sister.reproducibility.txt'
  13. output_file = open(output_name,'w')
  14. # input files
  15. sister_dat = pd.read_table(sister_file)
  16. indel_sister_same = 0
  17. indel_sister_diff = 0
  18. snv_sister_same = 0
  19. snv_sister_diff = 0
  20. for row in sister_dat.itertuples():
  21. # snv indel
  22. if ',' in row.ALT:
  23. alt = row.ALT.split(',')
  24. alt_len = [len(i) for i in alt]
  25. alt_max = max(alt_len)
  26. else:
  27. alt_max = len(row.ALT)
  28. alt = alt_max
  29. ref = row.REF
  30. if len(ref) == 1 and alt == 1:
  31. cate = 'SNV'
  32. elif len(ref) > alt:
  33. cate = 'INDEL'
  34. elif alt > len(ref):
  35. cate = 'INDEL'
  36. elif len(ref) == alt:
  37. cate = 'INDEL'
  38. # sister
  39. if row[5] == row[6]:
  40. if row[5] == './.':
  41. mendelian = 'noInfo'
  42. sister_count = "no"
  43. elif row[5] == '0/0':
  44. mendelian = 'Ref'
  45. sister_count = "no"
  46. else:
  47. mendelian = '1'
  48. sister_count = "yes_same"
  49. else:
  50. mendelian = '0'
  51. if (row[5] == './.' or row[5] == '0/0') and (row[6] == './.' or row[6] == '0/0'):
  52. sister_count = "no"
  53. else:
  54. sister_count = "yes_diff"
  55. if sister_count == 'yes_same':
  56. sister_same += 1
  57. elif sister_count == 'yes_diff':
  58. sister_diff += 1
  59. else:
  60. pass
  61. if cate == 'SNV':
  62. if sister_count == 'yes_same':
  63. snv_sister_same += 1
  64. elif sister_count == 'yes_diff':
  65. snv_sister_diff += 1
  66. else:
  67. pass
  68. elif cate == 'INDEL':
  69. if sister_count == 'yes_same':
  70. indel_sister_same += 1
  71. elif sister_count == 'yes_diff':
  72. indel_sister_diff += 1
  73. else:
  74. pass
  75. indel_sister = indel_sister_same/(indel_sister_same + indel_sister_diff)
  76. snv_sister = snv_sister_same/(snv_sister_same + snv_sister_diff)
  77. outcolumn = 'Project\tReproducibility_D5_D6\n'
  78. outResult = project_name + '\t' + str(sister) + '\n'
  79. output_file.write(outcolumn)
  80. output_file.write(outResult)