Du kan inte välja fler än 25 ämnen Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

115 lines
5.1KB

  1. # import modules
  2. import numpy as np
  3. import pandas as pd
  4. from sklearn import svm
  5. from sklearn import preprocessing
  6. import sys, argparse, os
  7. from vcf2bed import position_to_bed,padding_region
  8. parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome")
  9. parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True)
  10. parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True)
  11. parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True)
  12. parser.add_argument('-kernel', '--SVMkernel', type=str, help='kernel you choose to perform one class svm', required=True)
  13. parser.add_argument('-nu', '--SVMnu', type=float, help='An upper bound on the fraction of training errors and a lower bound of the fraction of support vectors. Should be in the interval (0, 1]', required=True)
  14. args = parser.parse_args()
  15. # Rename input:
  16. train_input = args.trainDataset
  17. test_input = args.testDataset
  18. sample_name = args.sampleName
  19. kernel = args.SVMkernel
  20. nu = args.SVMnu
  21. # default columns, which will be included in the included in the calssifier
  22. chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX']
  23. feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af']
  24. feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af']
  25. # import datasets sepearate the records with or without BaseQRankSum annotation, etc.
  26. def load_dat(dat_file_name):
  27. dat = pd.read_table(dat_file_name)
  28. dat['DB'] = dat['DB'].fillna(0)
  29. dat = dat[dat['DP'] != 0]
  30. dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP'])
  31. homo_rows = dat[dat['BaseQRankSum'].isnull()]
  32. heter_rows = dat[dat['BaseQRankSum'].notnull()]
  33. return homo_rows,heter_rows
  34. train_homo,train_heter = load_dat(train_input)
  35. test_homo,test_heter = load_dat(test_input)
  36. clf = svm.OneClassSVM(nu=nu,kernel=kernel, gamma='auto_deprecated',cache_size=500)
  37. def prepare_dat(train_dat,test_dat,feature_cols,chromo):
  38. chr_train = train_dat[train_dat['chromo'] == chromo]
  39. chr_test = test_dat[test_dat['chromo'] == chromo]
  40. train_dat = chr_train.loc[:,feature_cols]
  41. test_dat = chr_test.loc[:,feature_cols]
  42. train_dat_scaled = preprocessing.scale(train_dat)
  43. test_dat_scaled = preprocessing.scale(test_dat)
  44. return chr_test,train_dat_scaled,test_dat_scaled
  45. def oneclass(X_train,X_test,chr_test):
  46. clf.fit(X_train)
  47. y_pred_test = clf.predict(X_test)
  48. test_true_dat = chr_test[y_pred_test == 1]
  49. test_false_dat = chr_test[y_pred_test == -1]
  50. return test_true_dat,test_false_dat
  51. predicted_true = pd.DataFrame(columns=train_homo.columns)
  52. predicted_false = pd.DataFrame(columns=train_homo.columns)
  53. for chromo in chromosome:
  54. # homo datasets
  55. chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo)
  56. test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo)
  57. predicted_true = predicted_true.append(test_true_homo)
  58. predicted_false = predicted_false.append(test_false_homo)
  59. # heter datasets
  60. chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo)
  61. test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter)
  62. predicted_true = predicted_true.append(test_true_heter)
  63. predicted_false = predicted_false.append(test_false_heter)
  64. predicted_true_filename = sample_name + '_predicted_true.txt'
  65. predicted_false_filename = sample_name + '_predicted_false.txt'
  66. predicted_true.to_csv(predicted_true_filename,sep='\t',index=False)
  67. predicted_false.to_csv(predicted_false_filename,sep='\t',index=False)
  68. # output the bed file and padding bed region 50bp
  69. predicted_true_bed_filename = sample_name + '_predicted_true.bed'
  70. predicted_false_bed_filename = sample_name + '_predicted_false.bed'
  71. padding_filename = sample_name + '_padding.bed'
  72. predicted_true_bed = open(predicted_true_bed_filename,'w')
  73. predicted_false_bed = open(predicted_false_bed_filename,'w')
  74. padding = open(padding_filename,'w')
  75. #
  76. for index,row in predicted_false.iterrows():
  77. chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
  78. outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
  79. predicted_false_bed.write(outline_pos)
  80. chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50)
  81. outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n'
  82. outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n'
  83. padding.write(outline_pad_1)
  84. padding.write(outline_pad_2)
  85. for index,row in predicted_true.iterrows():
  86. chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
  87. outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
  88. predicted_true_bed.write(outline_pos)