您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

76 行
2.7KB

  1. import itertools
  2. import pandas as pd
  3. import sys,argparse,os
  4. parser = argparse.ArgumentParser(description="this script is to preform combination on vcf files, get any two of them for next jaccard index calculation")
  5. parser.add_argument('-vcf', '--vcffile', type=str, help='input vcf file', required=True)
  6. args = parser.parse_args()
  7. # Rename input:
  8. vcf_input = args.vcffile
  9. file = pd.read_csv(vcf_input,header=None,sep='\t')
  10. com = list(itertools.combinations(file[0],2))
  11. outfile = open('rtg_pairs.txt','w')
  12. for pair in com:
  13. # prepare a
  14. one = pair[0].strip().split('/')
  15. file_a = one[len(one)-1]
  16. a = one[len(one)-1].replace('.normed.vcf.gz','')
  17. strings_a = a.strip().split('_')
  18. sequenceTechA = strings_a[0] + strings_a[1]
  19. sequenceSiteA = strings_a[2]
  20. sampleA = strings_a[3]
  21. repA = strings_a[4]
  22. # analysisSiteA = strings_a[8]
  23. # pipelineA = strings_a[9] + strings_a[10]
  24. # prepare b
  25. two = pair[1].strip().split('/')
  26. file_b = two[len(two)-1]
  27. b = two[len(two)-1].replace('.normed.vcf.gz','')
  28. strings_b = b.strip().split('_')
  29. sequenceTechB = strings_b[0] + strings_b[1]
  30. sequenceSiteB = strings_b[2]
  31. sampleB = strings_b[3]
  32. repB = strings_b[4]
  33. # analysisSiteB = strings_b[8]
  34. # pipelineB = strings_b[9] + strings_b[10]
  35. folder = a + '-' + b
  36. ## add annotation
  37. # sequencing technology
  38. if (sequenceTechA != sequenceTechB) and (sequenceSiteA == sequenceSiteB) and (sampleA == sampleB):
  39. outline = file_a + '\t' + file_b + '\t' + folder + '\t' + 'sequenceTech' + '\n'
  40. outfile.write(outline)
  41. # sequencing site
  42. elif (sequenceTechA == sequenceTechB) and (sequenceSiteA != sequenceSiteB) and (sampleA == sampleB):
  43. outline = file_a + '\t' + file_b + '\t' + folder + '\t' + 'sequenceSite' + '\n'
  44. outfile.write(outline)
  45. # sample
  46. elif (sequenceTechA == sequenceTechB) and (sequenceSiteA == sequenceSiteB) and (sampleA != sampleB):
  47. outline = file_a + '\t' + file_b + '\t' + folder + '\t' + 'Sample' + '\n'
  48. outfile.write(outline)
  49. # replicate
  50. elif (sequenceTechA == sequenceTechB) and (sequenceSiteA == sequenceSiteB) and (sampleA == sampleB) and (repA != repB):
  51. outline = file_a + '\t' + file_b + '\t' + folder + '\t' + 'replicate' + '\n'
  52. outfile.write(outline)
  53. # analysis site
  54. # elif (sequenceTechA == sequenceTechB) and (sequenceSiteA == sequenceSiteB) and (sampleA == sampleB):
  55. # outline = pair[0] + '\t' + pair[1] + '\t' + folder + '\t' + 'analysisSite' + '\n'
  56. # outfile.write(outline)
  57. # pipeline
  58. # elif (sequenceTechA == sequenceTechB) and (sequenceSiteA == sequenceSiteB) and (sampleA == sampleB) and (analysisSiteA == analysisSiteB) and (pipelineA != pipelineB):
  59. # outline = pair[0] + '\t' + pair[1] + '\t' + folder + '\t' + 'pipeline' + '\n'
  60. # outfile.write(outline)
  61. else:
  62. pass
  63. outfile.close()