You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

74 line
2.6KB

  1. import itertools
  2. import pandas as pd
  3. import sys,argparse,os
  4. parser = argparse.ArgumentParser(description="this script is to preform combination on vcf files, get any two of them for next jaccard index calculation")
  5. parser.add_argument('-vcf', '--vcffile', type=str, help='input vcf file', required=True)
  6. args = parser.parse_args()
  7. # Rename input:
  8. vcf_input = args.vcffile
  9. file = pd.read_csv(vcf_input,header=None,sep='\t')
  10. com = list(itertools.combinations(file[0],2))
  11. outfile = open('rtg_pairs.txt','w')
  12. for pair in com:
  13. # prepare a
  14. one = pair[0].strip().split('/')
  15. a = one[len(one)-1].replace('.normed.vcf.gz','')
  16. strings_a = a.strip().split('_')
  17. sequenceTechA = strings_a[0] + strings_a[1]
  18. sequenceSiteA = strings_a[2]
  19. sampleA = strings_a[3]
  20. repA = strings_a[4]
  21. # analysisSiteA = strings_a[8]
  22. # pipelineA = strings_a[9] + strings_a[10]
  23. # prepare b
  24. two = pair[1].strip().split('/')
  25. b = two[len(two)-1].replace('.normed.vcf.gz','')
  26. strings_b = b.strip().split('_')
  27. sequenceTechB = strings_b[0] + strings_b[1]
  28. sequenceSiteB = strings_b[2]
  29. sampleB = strings_b[3]
  30. repB = strings_b[4]
  31. # analysisSiteB = strings_b[8]
  32. # pipelineB = strings_b[9] + strings_b[10]
  33. folder = a + '-' + b
  34. ## add annotation
  35. # sequencing technology
  36. if (sequenceTechA != sequenceTechB) and (sequenceSiteA == sequenceSiteB) and (sampleA == sampleB):
  37. outline = pair[0] + '\t' + pair[1] + '\t' + folder + '\t' + 'sequenceTech' + '\n'
  38. outfile.write(outline)
  39. # sequencing site
  40. elif (sequenceTechA == sequenceTechB) and (sequenceSiteA != sequenceSiteB) and (sampleA == sampleB):
  41. outline = pair[0] + '\t' + pair[1] + '\t' + folder + '\t' + 'sequenceSite' + '\n'
  42. outfile.write(outline)
  43. # sample
  44. elif (sequenceTechA == sequenceTechB) and (sequenceSiteA == sequenceSiteB) and (sampleA != sampleB):
  45. outline = pair[0] + '\t' + pair[1] + '\t' + folder + '\t' + 'Sample' + '\n'
  46. outfile.write(outline)
  47. # replicate
  48. elif (sequenceTechA == sequenceTechB) and (sequenceSiteA == sequenceSiteB) and (sampleA == sampleB) and (repA != repB):
  49. outline = pair[0] + '\t' + pair[1] + '\t' + folder + '\t' + 'replicate' + '\n'
  50. outfile.write(outline)
  51. # analysis site
  52. # elif (sequenceTechA == sequenceTechB) and (sequenceSiteA == sequenceSiteB) and (sampleA == sampleB):
  53. # outline = pair[0] + '\t' + pair[1] + '\t' + folder + '\t' + 'analysisSite' + '\n'
  54. # outfile.write(outline)
  55. # pipeline
  56. # elif (sequenceTechA == sequenceTechB) and (sequenceSiteA == sequenceSiteB) and (sampleA == sampleB) and (analysisSiteA == analysisSiteB) and (pipelineA != pipelineB):
  57. # outline = pair[0] + '\t' + pair[1] + '\t' + folder + '\t' + 'pipeline' + '\n'
  58. # outfile.write(outline)
  59. else:
  60. pass
  61. outfile.close()