|
"""Write BED-style intervals for merged benchmark calls.

Usage: script.py <all_variants_out> <indel_out>

Two hard-coded tables are inner-joined on their first two columns.  For every
joined row a 0-based start / 1-based end interval is written to the first
output file together with the original position and a category (``SNV`` or
``INDEL``); rows categorised as ``INDEL`` are additionally written (interval
only) to the second output file.
"""
import argparse
import os
import sys

import pandas as pd

MUTATION_TYPE_PATH = '/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/vcf/mutation_type'
VOTE_PATH = '/mnt/pgx_src_data_pool_4/home/renluyao/manuscript/benchmark_calls/all_info/benchmark.vote.mendelian.txt'


def longest_allele_length(alt_field):
    """Return the length of the longest comma-separated entry in *alt_field*.

    A field without a comma is treated as a single entry, so the result is
    simply ``len(alt_field)`` in that case.
    """
    return max(len(allele) for allele in alt_field.split(','))


def bed_interval(pos, ref, alt_fields):
    """Compute ``(start, end, category)`` for one variant.

    Args:
        pos: 1-based position of the variant (int).
        ref: reference allele string.
        alt_fields: iterable of ALT strings (each possibly comma-separated).

    Returns:
        ``(pos - 1, end, category)`` where ``end`` extends ``pos`` by the
        longer of the reference and the longest ALT entry, and ``category`` is
        ``'SNV'`` only when both lengths are exactly 1, otherwise ``'INDEL'``.
    """
    alt_len = max(longest_allele_length(field) for field in alt_fields)
    ref_len = len(ref)
    category = 'SNV' if ref_len == 1 and alt_len == 1 else 'INDEL'
    # For an SNV max(...) is 1, so end == pos; otherwise the interval spans
    # the longer of the two alleles.
    end = pos + max(ref_len, alt_len) - 1
    return pos - 1, end, category


def write_bed_records(merged_df, out_file, out_indel):
    """Write one line per row of *merged_df* to the given file-like objects.

    ``itertuples`` exposes columns whose names are not valid identifiers under
    positional names (``_1`` is the first column, ``_2`` the second, ...).
    The positions used here are: 1 = chromosome, 2 = position, 35 = reference
    allele, and 7 / 15 / 23 / 31 = the ALT fields labelled d5 / d6 / f7 / m8
    in the original script.
    """
    for row in merged_df.itertuples():
        alt_fields = (row._7, row._15, row._23, row._31)
        start, end, category = bed_interval(int(row._2), row._35, alt_fields)
        interval = row._1 + '\t' + str(start) + '\t' + str(end)
        if category == 'INDEL':
            out_indel.write(interval + '\n')
        out_file.write(interval + '\t' + str(row._2) + '\t' + category + '\n')


def main(argv=None):
    """Load the two input tables, join them, and write both output files.

    Args:
        argv: argument vector; defaults to ``sys.argv``.  ``argv[1]`` is the
            path for the all-variant output and ``argv[2]`` the path for the
            INDEL-only output.
    """
    argv = sys.argv if argv is None else argv
    mut = pd.read_table(MUTATION_TYPE_PATH, header=None)
    vote = pd.read_table(VOTE_PATH, header=None)
    merged_df = pd.merge(vote, mut, how='inner', left_on=[0, 1], right_on=[0, 1])
    # Context managers guarantee both outputs are flushed and closed, even if
    # a row fails part-way through.
    with open(argv[1], 'w') as out_file, open(argv[2], 'w') as out_indel:
        write_bed_records(merged_df, out_file, out_indel)


if __name__ == '__main__':
    main()
-
-
-
-
-
-
-
|