You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

110 lines
4.7KB

  1. # import modules
  2. import numpy as np
  3. import pandas as pd
  4. from sklearn import svm
  5. from sklearn import preprocessing
  6. import sys, argparse, os
  7. from vcf2bed import position_to_bed,padding_region
  8. parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome")
  9. parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True)
  10. parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True)
  11. parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True)
  12. args = parser.parse_args()
  13. # Rename input:
  14. train_input = args.trainDataset
  15. test_input = args.testDataset
  16. sample_name = args.sampleName
  17. # default columns, which will be included in the included in the calssifier
  18. chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY']
  19. feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af']
  20. feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af']
  21. # import datasets sepearate the records with or without BaseQRankSum annotation, etc.
  22. def load_dat(dat_file_name):
  23. dat = pd.read_table(dat_file_name)
  24. dat['DB'] = dat['DB'].fillna(0)
  25. dat = dat[dat['DP'] != 0]
  26. dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP'])
  27. homo_rows = dat[dat['BaseQRankSum'].isnull()]
  28. heter_rows = dat[dat['BaseQRankSum'].notnull()]
  29. return homo_rows,heter_rows
  30. train_homo,train_heter = load_dat(train_input)
  31. test_homo,test_heter = load_dat(test_input)
  32. clf = svm.OneClassSVM(nu=0.05,kernel='rbf', gamma='auto_deprecated',cache_size=500)
  33. def prepare_dat(train_dat,test_dat,feature_cols,chromo):
  34. chr_train = train_dat[train_dat['chromo'] == chromo]
  35. chr_test = test_dat[test_dat['chromo'] == chromo]
  36. train_dat = chr_train.loc[:,feature_cols]
  37. test_dat = chr_test.loc[:,feature_cols]
  38. train_dat_scaled = preprocessing.scale(train_dat)
  39. test_dat_scaled = preprocessing.scale(test_dat)
  40. return chr_test,train_dat_scaled,test_dat_scaled
  41. def oneclass(X_train,X_test,chr_test):
  42. clf.fit(X_train)
  43. y_pred_test = clf.predict(X_test)
  44. test_true_dat = chr_test[y_pred_test == 1]
  45. test_false_dat = chr_test[y_pred_test == -1]
  46. return test_true_dat,test_false_dat
  47. predicted_true = pd.DataFrame(columns=train_homo.columns)
  48. predicted_false = pd.DataFrame(columns=train_homo.columns)
  49. for chromo in chromosome:
  50. # homo datasets
  51. chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo)
  52. test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo)
  53. predicted_true = predicted_true.append(test_true_homo)
  54. predicted_false = predicted_false.append(test_false_homo)
  55. # heter datasets
  56. chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo)
  57. test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter)
  58. predicted_true = predicted_true.append(test_true_heter)
  59. predicted_false = predicted_false.append(test_false_heter)
  60. predicted_true_filename = sample_name + '_predicted_true.txt'
  61. predicted_false_filename = sample_name + '_predicted_false.txt'
  62. predicted_true.to_csv(predicted_true_filename,sep='\t',index=False)
  63. predicted_false.to_csv(predicted_false_filename,sep='\t',index=False)
  64. # output the bed file and padding bed region 50bp
  65. predicted_true_bed_filename = sample_name + '_predicted_true.bed'
  66. predicted_false_bed_filename = sample_name + '_predicted_false.bed'
  67. padding_filename = sample_name + '_padding.bed'
  68. predicted_true_bed = open(predicted_true_bed_filename,'w')
  69. predicted_false_bed = open(predicted_false_bed_filename,'w')
  70. padding = open(padding_filename,'w')
  71. #
  72. for index,row in predicted_false.iterrows():
  73. chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
  74. outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
  75. predicted_false_bed.write(outline_pos)
  76. chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50)
  77. outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n'
  78. outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n'
  79. padding.write(outline_pad_1)
  80. padding.write(outline_pad_2)
  81. for index,row in predicted_true.iterrows():
  82. chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
  83. outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
  84. predicted_true_bed.write(outline_pos)