import pandas as pd import sys, argparse, os parser = argparse.ArgumentParser(description="This script is to get information from hap") parser.add_argument('-hap', '--happy', type=str, help='hap.py table', required=True) parser.add_argument('-name', '--name', type=str, help='sample name', required=True) args = parser.parse_args() hap_file = args.happy name = args.name dat = pd.read_table(hap_file) dat['QUERY.TP'] = dat['QUERY.TOTAL'].astype(int) - dat['QUERY.UNK'].astype(int) - dat['QUERY.FP'].astype(int) dat['QUERY'] = dat['QUERY.TOTAL'].astype(int) - dat['QUERY.UNK'].astype(int) indel = dat[['INDEL' in s for s in dat['Type']]] snv = dat[['SNP' in s for s in dat['Type']]] indel.reset_index(drop=True, inplace=True) snv.reset_index(drop=True, inplace=True) benchmark = pd.concat([snv, indel], axis=1) benchmark = benchmark[[ 'QUERY.TOTAL', 'QUERY','QUERY.TP','QUERY.FP','TRUTH.FN','METRIC.Precision', 'METRIC.Recall','METRIC.F1_Score']] benchmark.columns = ['SNV number','INDEL number','SNV query','INDEL query','SNV TP','INDEL TP','SNV FP','INDEL FP','SNV FN','INDEL FN','SNV precision','INDEL precision','SNV recall','INDEL recall','SNV F1','INDEL F1'] benchmark = benchmark[['SNV number','INDEL number','SNV query','INDEL query','SNV TP','INDEL TP','SNV FP','INDEL FP','SNV FN','INDEL FN','SNV precision','INDEL precision','SNV recall','INDEL recall','SNV F1','INDEL F1']] benchmark['SNV precision'] = benchmark['SNV precision'].astype(float) benchmark['INDEL precision'] = benchmark['INDEL precision'].astype(float) benchmark['SNV recall'] = benchmark['SNV recall'].astype(float) benchmark['INDEL recall'] = benchmark['INDEL recall'].astype(float) benchmark['SNV F1'] = benchmark['SNV F1'].astype(float) benchmark['INDEL F1'] = benchmark['INDEL F1'].astype(float) benchmark = benchmark.round(2) name_array = name.split("_") LCL5_1 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL5_1" + "_" + name_array[5] LCL5_2 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL5_2" + "_" + name_array[5] LCL5_3 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL5_3" + "_" + name_array[5] LCL6_1 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL6_1" + "_" + name_array[5] LCL6_2 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL6_2" + "_" + name_array[5] LCL6_3 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL6_3" + "_" + name_array[5] LCL7_1 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL7_1" + "_" + name_array[5] LCL7_2 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL7_2" + "_" + name_array[5] LCL7_3 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL7_3" + "_" + name_array[5] LCL8_1 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL8_1" + "_" + name_array[5] LCL8_2 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL8_2" + "_" + name_array[5] LCL8_3 = name_array[0] + "_" + name_array[1] + "_" + name_array[2] + "_" + name_array[3] + "_" + name_array[4] + "_" + "LCL8_3" + "_" + name_array[5] benchmark.insert(loc=0, column='Sample', value=[LCL5_1,LCL5_2,LCL5_3,LCL6_1,LCL6_2,LCL6_3,LCL7_1,LCL7_2,LCL7_3,LCL8_1,LCL8_2,LCL8_3]) benchmark.to_csv('variants.calling.qc.txt',sep="\t",index=0)