"""Enumerate pairs of VCF files for a later Jaccard-index calculation.

The input is a tab-separated file with no header whose first column holds
one VCF path per row.  Every unordered pair of rows is compared on fields
parsed from the file names, and each pair that differs in exactly one of
the compared attributes is written to the output file as

    <file_a> TAB <file_b> TAB <name_a>-<name_b> TAB <label>

where ``<name_x>`` is the file name with ``.normed.vcf.gz`` removed and
``<label>`` is one of ``sequenceTech``, ``sequenceSite``, ``Sample`` or
``replicate``.  Pairs matching none of the rules are omitted.
"""
import argparse
import collections
import itertools
import os
import sys

import pandas as pd

VCF_SUFFIX = '.normed.vcf.gz'
DEFAULT_OUTPUT = 'rtg_pairs.txt'

# Fields 0..4 of the '_'-separated file name are read, so at least five
# fields must be present.
MIN_NAME_FIELDS = 5

# One parsed input row.  ``tech`` is fields 0 and 1 joined without a
# separator; ``site``, ``sample`` and ``rep`` are fields 2, 3 and 4.
_VcfName = collections.namedtuple(
    '_VcfName', ['file_name', 'stem', 'tech', 'site', 'sample', 'rep']
)


def _parse_vcf_name(path):
    """Split one VCF path into the attributes used for pair classification.

    Args:
        path: A path string; only the part after the last '/' is used.

    Returns:
        A ``_VcfName`` record.

    Raises:
        ValueError: If the file name has fewer than ``MIN_NAME_FIELDS``
            '_'-separated fields once the VCF suffix has been removed.
    """
    file_name = path.strip().split('/')[-1]
    stem = file_name.replace(VCF_SUFFIX, '')
    fields = stem.strip().split('_')
    if len(fields) < MIN_NAME_FIELDS:
        raise ValueError(
            'cannot parse VCF file name %r: expected at least %d '
            "'_'-separated fields, found %d"
            % (file_name, MIN_NAME_FIELDS, len(fields))
        )
    return _VcfName(
        file_name=file_name,
        stem=stem,
        tech=fields[0] + fields[1],
        site=fields[2],
        sample=fields[3],
        rep=fields[4],
    )


def _classify_pair(first, second):
    """Return the annotation label for a pair, or None if no rule matches.

    A pair is labelled by the single attribute in which it differs while
    the other compared attributes agree.  The replicate field is only
    consulted when technology, site and sample are all equal.
    """
    same_tech = first.tech == second.tech
    same_site = first.site == second.site
    same_sample = first.sample == second.sample

    if not same_tech and same_site and same_sample:
        return 'sequenceTech'
    if same_tech and not same_site and same_sample:
        return 'sequenceSite'
    if same_tech and same_site and not same_sample:
        return 'Sample'
    if same_tech and same_site and same_sample and first.rep != second.rep:
        return 'replicate'
    # NOTE: comparisons on analysis site (field 8) and pipeline (fields 9
    # and 10) existed only as commented-out code and remain disabled.
    return None


def main(argv=None):
    """Read the VCF list, classify every pair and write the pairs file.

    Args:
        argv: Optional list of command-line arguments; ``None`` makes
            argparse read ``sys.argv[1:]``.

    Raises:
        ValueError: If two or more rows are given and any file name cannot
            be parsed.  Nothing is written to the output in that case.
    """
    parser = argparse.ArgumentParser(description="this script is to preform combination on vcf files, get any two of them for next jaccard index calculation")
    parser.add_argument('-vcf', '--vcffile', type=str, help='input vcf file', required=True)
    parser.add_argument('-o', '--output', type=str, default=DEFAULT_OUTPUT,
                        help='output pairs file (default: %(default)s)')
    args = parser.parse_args(argv)

    table = pd.read_csv(args.vcffile, header=None, sep='\t')
    paths = list(table[0])

    # Parse each name once instead of once per pair.  With fewer than two
    # rows there are no pairs, so nothing needs to be parsed or validated.
    records = [_parse_vcf_name(p) for p in paths] if len(paths) > 1 else []

    lines = []
    for first, second in itertools.combinations(records, 2):
        label = _classify_pair(first, second)
        if label is None:
            continue
        folder = first.stem + '-' + second.stem
        lines.append(
            first.file_name + '\t' + second.file_name + '\t'
            + folder + '\t' + label + '\n'
        )

    # The output is opened only after every name has parsed, so a bad
    # input row cannot leave a truncated pairs file behind.
    with open(args.output, 'w') as outfile:
        outfile.writelines(lines)


if __name__ == '__main__':
    main()