# import modules
import argparse
import os
import sys

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn import svm

from vcf2bed import position_to_bed, padding_region

# Command-line interface: two input tables and a sample name used as the
# prefix of every output file.
parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome")
parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True)
parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True)
parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True)
args = parser.parse_args()

# Rename input:
train_input = args.trainDataset
test_input = args.testDataset
sample_name = args.sampleName

# Chromosomes processed one at a time: chr1..chr22 followed by chrX and chrY.
chromosome = ['chr' + str(number) for number in range(1, 23)] + ['chrX', 'chrY']

# Default columns which will be included in the classifier.
# Records that carry a BaseQRankSum annotation use the "heter" feature set;
# records without it use the "homo" set, which omits the three *RankSum
# columns.
feature_heter_cols = ['AltDP', 'BaseQRankSum', 'DB', 'DP', 'FS', 'GQ', 'MQ',
                      'MQRankSum', 'QD', 'ReadPosRankSum', 'RefDP', 'SOR', 'af']
feature_homo_cols = ['AltDP', 'DB', 'DP', 'FS', 'GQ', 'MQ', 'QD', 'RefDP',
                     'SOR', 'af']

# Import the datasets and separate the records with and without the
# BaseQRankSum annotation.
def load_dat(dat_file_name):
    """Load a tab-separated variant table and split it on BaseQRankSum.

    Missing 'DB' values are replaced by 0, rows whose 'DP' is 0 are dropped,
    and an 'af' column is added as AltDP / (AltDP + RefDP).

    Args:
        dat_file_name: path of the tab-separated table. It must contain at
            least the 'DB', 'DP', 'AltDP', 'RefDP' and 'BaseQRankSum' columns.

    Returns:
        A tuple (homo_rows, heter_rows): the rows whose 'BaseQRankSum' is
        missing, and the rows where it is present.
    """
    dat = pd.read_table(dat_file_name)
    dat['DB'] = dat['DB'].fillna(0)
    # Copy the filtered rows so that adding 'af' writes to an independent
    # frame instead of a slice of the original (SettingWithCopyWarning).
    dat = dat[dat['DP'] != 0].copy()
    # NOTE(review): rows with AltDP + RefDP == 0 get a NaN 'af' here, which
    # the SVM will reject later -- confirm the inputs never contain such rows.
    dat['af'] = dat['AltDP'] / (dat['AltDP'] + dat['RefDP'])
    missing_rank_sum = dat['BaseQRankSum'].isnull()
    homo_rows = dat[missing_rank_sum]
    heter_rows = dat[~missing_rank_sum]
    return homo_rows, heter_rows


train_homo, train_heter = load_dat(train_input)
test_homo, test_heter = load_dat(test_input)

# gamma='auto' (1 / n_features) is what the former 'auto_deprecated' sentinel
# resolved to; the sentinel itself is rejected by scikit-learn >= 0.22.
clf = svm.OneClassSVM(nu=0.05, kernel='rbf', gamma='auto', cache_size=500)


def prepare_dat(train_dat, test_dat, feature_cols, chromo):
    """Select one chromosome's records and standardise their feature columns.

    Args:
        train_dat: training records, with a 'chromo' column.
        test_dat: testing records, with a 'chromo' column.
        feature_cols: names of the columns passed to the classifier.
        chromo: chromosome name to select, e.g. 'chr1'.

    Returns:
        A tuple (chr_test, train_dat_scaled, test_dat_scaled): the unscaled
        test records of this chromosome with all their columns, followed by
        the scaled training and testing feature matrices.
    """
    chr_train = train_dat[train_dat['chromo'] == chromo]
    chr_test = test_dat[test_dat['chromo'] == chromo]
    # NOTE(review): the two sets are standardised independently, so the test
    # features are not expressed on the training scale. Left unchanged
    # because switching to a scaler fitted on the training set would alter
    # the predictions -- confirm whether this is intended.
    train_dat_scaled = preprocessing.scale(chr_train.loc[:, feature_cols])
    test_dat_scaled = preprocessing.scale(chr_test.loc[:, feature_cols])
    return chr_test, train_dat_scaled, test_dat_scaled


def oneclass(X_train, X_test, chr_test):
    """Fit the module-level one-class SVM and split the test records.

    Args:
        X_train: scaled training feature matrix.
        X_test: scaled testing feature matrix, row-aligned with chr_test.
        chr_test: the test records that X_test was derived from.

    Returns:
        A tuple (test_true_dat, test_false_dat): the test records predicted
        as +1 and the test records predicted as -1.
    """
    clf.fit(X_train)
    y_pred_test = clf.predict(X_test)
    test_true_dat = chr_test[y_pred_test == 1]
    test_false_dat = chr_test[y_pred_test == -1]
    return test_true_dat, test_false_dat


def _classify_chromosome(train_dat, test_dat, feature_cols, chromo):
    """Classify one chromosome, or return None when it cannot be classified.

    Scaling and fitting both fail on an empty matrix, so a chromosome with no
    test records is skipped silently, and one with test records but no
    training records is skipped with a warning on stderr.
    """
    test_count = int((test_dat['chromo'] == chromo).sum())
    if test_count == 0:
        return None
    if not (train_dat['chromo'] == chromo).any():
        sys.stderr.write(
            'warning: no training records for {0}; {1} test records left '
            'unclassified\n'.format(chromo, test_count))
        return None
    chr_test, X_train, X_test = prepare_dat(train_dat, test_dat, feature_cols, chromo)
    return oneclass(X_train, X_test, chr_test)


def _combine(parts, columns):
    """Concatenate the non-empty frames, or return an empty frame with `columns`."""
    non_empty = [part for part in parts if not part.empty]
    if not non_empty:
        return pd.DataFrame(columns=columns)
    return pd.concat(non_empty)


# Collect the per-chromosome results and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole
# accumulated frame on every call.
true_parts = []
false_parts = []
for chromo in chromosome:
    # Same order as before: homo datasets first, then heter datasets.
    for group_train, group_test, group_cols in (
            (train_homo, test_homo, feature_homo_cols),
            (train_heter, test_heter, feature_heter_cols)):
        outcome = _classify_chromosome(group_train, group_test, group_cols, chromo)
        if outcome is None:
            continue
        true_parts.append(outcome[0])
        false_parts.append(outcome[1])

predicted_true = _combine(true_parts, train_homo.columns)
predicted_false = _combine(false_parts, train_homo.columns)

predicted_true_filename = sample_name + '_predicted_true.txt'
predicted_false_filename = sample_name + '_predicted_false.txt'
# Write both prediction tables as tab-separated text without the index.
predicted_true.to_csv(predicted_true_filename, sep='\t', index=False)
predicted_false.to_csv(predicted_false_filename, sep='\t', index=False)

# output the bed file and padding bed region 50bp
predicted_true_bed_filename = sample_name + '_predicted_true.bed'
predicted_false_bed_filename = sample_name + '_predicted_false.bed'
padding_filename = sample_name + '_padding.bed'

# All three files are opened together, as before, but inside a `with` block
# so they are flushed and closed even if a conversion below raises.
with open(predicted_true_bed_filename, 'w') as predicted_true_bed, \
        open(predicted_false_bed_filename, 'w') as predicted_false_bed, \
        open(padding_filename, 'w') as padding:
    # Records predicted as -1: one BED interval each, plus two padding
    # intervals written to the padding file.
    for _, row in predicted_false.iterrows():
        chromo, pos1, pos2 = position_to_bed(row['chromo'], row['pos'], row['ref'], row['alt'])
        predicted_false_bed.write('\t'.join([chromo, str(pos1), str(pos2)]) + '\n')
        # padding_region returns four coordinates, used as two intervals
        # (presumably the 50 bp flanks on either side -- confirm in vcf2bed).
        chromo, pad_pos1, pad_pos2, pad_pos3, pad_pos4 = padding_region(chromo, pos1, pos2, 50)
        padding.write('\t'.join([chromo, str(pad_pos1), str(pad_pos2)]) + '\n')
        padding.write('\t'.join([chromo, str(pad_pos3), str(pad_pos4)]) + '\n')

    # Records predicted as +1: BED interval only, no padding.
    for _, row in predicted_true.iterrows():
        chromo, pos1, pos2 = position_to_bed(row['chromo'], row['pos'], row['ref'], row['alt'])
        predicted_true_bed.write('\t'.join([chromo, str(pos1), str(pos2)]) + '\n')