|
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849 |
"""Pair up replicate VCF files for a later Jaccard-index calculation.

The input is a headerless, tab-separated table.  Column 0 holds a file
location and column 1 holds a sample name made of at least five
underscore-separated fields: two sequencing-technology fields, the
sequencing site, the sample and the replicate.

Every pair of rows that agrees on technology, site and sample but differs
in replicate is written to ``rtg_pairs.txt`` as one tab-separated line:
location of the first file, location of the second file, and the two names
joined by ``-``.
"""

import argparse
import itertools
import os  # unused here; kept because the original script imported it
import sys

import pandas as pd

# Output file name, written to the current working directory.
OUTPUT_PATH = 'rtg_pairs.txt'

# tech (2 fields) + site + sample + replicate
MIN_NAME_FIELDS = 5


def parse_sample_name(name):
    """Split a sample name into its grouping key and replicate field.

    Args:
        name: Sample name with at least five ``_``-separated fields.
            Fields beyond the fifth are ignored.

    Returns:
        A ``(group, replicate)`` tuple.  ``group`` is the tuple of the first
        four fields (two technology fields, site, sample) and ``replicate``
        is the fifth field.

    Raises:
        ValueError: If ``name`` is not a string or has fewer than five
            fields.
    """
    if not isinstance(name, str):
        raise ValueError(
            'sample name must be a string, got {!r}'.format(name))
    fields = name.split('_')
    if len(fields) < MIN_NAME_FIELDS:
        raise ValueError(
            'sample name {!r} has {} underscore-separated field(s); '
            'at least {} are required'.format(
                name, len(fields), MIN_NAME_FIELDS))
    # Keep the two technology fields separate instead of concatenating
    # them, so that e.g. "A_BC" and "AB_C" are not mistaken for the same
    # technology.
    return tuple(fields[:4]), fields[4]


def find_replicate_pairs(locations, names):
    """Return every pair of rows that are replicates of one another.

    Two rows are replicates when their names share technology, site and
    sample but carry different replicate fields.

    Args:
        locations: Iterable of file locations, one per row.
        names: Iterable of sample names, parallel to ``locations``.

    Returns:
        A list of ``(location_a, location_b, label)`` tuples in
        ``itertools.combinations`` order, where ``label`` is
        ``name_a + '-' + name_b``.

    Raises:
        ValueError: If a name taking part in any pair is malformed
            (see ``parse_sample_name``).
    """
    rows = list(zip(locations, names))
    pairs = []
    for (loc_a, name_a), (loc_b, name_b) in itertools.combinations(rows, 2):
        group_a, rep_a = parse_sample_name(name_a)
        group_b, rep_b = parse_sample_name(name_b)
        if group_a == group_b and rep_a != rep_b:
            pairs.append((loc_a, loc_b, name_a + '-' + name_b))
    return pairs


def main(argv=None):
    """Parse the command line, find replicate pairs and write them out.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        0 on success.
    """
    parser = argparse.ArgumentParser(description="this script is to preform combination on vcf files, get any two of three replicates for next jaccard index calculation")
    parser.add_argument('-vcf', '--vcffile', type=str, help='input vcf file', required=True)
    args = parser.parse_args(argv)

    table = pd.read_csv(args.vcffile, header=None, sep='\t')

    # Compute all pairs before opening the output so that a malformed row
    # cannot leave a half-written file behind.
    pairs = find_replicate_pairs(table[0], table[1])

    # The context manager closes the file even if a write fails.
    with open(OUTPUT_PATH, 'w') as outfile:
        for loc_a, loc_b, label in pairs:
            outfile.write('{}\t{}\t{}\n'.format(loc_a, loc_b, label))
    return 0


if __name__ == '__main__':
    sys.exit(main())
-
|