Nevar pievienot vairāk kā 25 tēmas Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.

pirms 4 gadiem
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. # import modules
  2. import numpy as np
  3. import pandas as pd
  4. from sklearn import svm
  5. from sklearn import preprocessing
  6. import sys, argparse, os
  7. from vcf2bed import position_to_bed,padding_region
  8. parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome")
  9. parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True)
  10. parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True)
  11. parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True)
  12. args = parser.parse_args()
  13. # Rename input:
  14. train_input = args.trainDataset
  15. test_input = args.testDataset
  16. sample_name = args.sampleName
  17. # default columns, which will be included in the included in the calssifier
  18. chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY']
  19. feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af']
  20. feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af']
  21. # import datasets sepearate the records with or without BaseQRankSum annotation, etc.
  22. def load_dat(dat_file_name):
  23. dat = pd.read_table(dat_file_name)
  24. dat['DB'] = dat['DB'].fillna(0)
  25. dat = dat[dat['DP'] != 0]
  26. dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP'])
  27. homo_rows = dat[dat['BaseQRankSum'].isnull()]
  28. heter_rows = dat[dat['BaseQRankSum'].notnull()]
  29. return homo_rows,heter_rows
  30. train_homo,train_heter = load_dat(train_input)
  31. test_homo,test_heter = load_dat(test_input)
  32. clf = svm.OneClassSVM(nu=0.05,kernel='rbf', gamma='auto_deprecated',cache_size=500)
  33. def prepare_dat(train_dat,test_dat,feature_cols,chromo):
  34. chr_train = train_dat[train_dat['chromo'] == chromo]
  35. chr_test = test_dat[test_dat['chromo'] == chromo]
  36. train_dat = chr_train.loc[:,feature_cols]
  37. test_dat = chr_test.loc[:,feature_cols]
  38. train_dat_scaled = preprocessing.scale(train_dat)
  39. test_dat_scaled = preprocessing.scale(test_dat)
  40. return chr_test,train_dat_scaled,test_dat_scaled
  41. def oneclass(X_train,X_test,chr_test):
  42. clf.fit(X_train)
  43. y_pred_test = clf.predict(X_test)
  44. test_true_dat = chr_test[y_pred_test == 1]
  45. test_false_dat = chr_test[y_pred_test == -1]
  46. return test_true_dat,test_false_dat
  47. predicted_true = pd.DataFrame(columns=train_homo.columns)
  48. predicted_false = pd.DataFrame(columns=train_homo.columns)
  49. for chromo in chromosome:
  50. # homo datasets
  51. chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo)
  52. test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo)
  53. predicted_true = predicted_true.append(test_true_homo)
  54. predicted_false = predicted_false.append(test_false_homo)
  55. # heter datasets
  56. chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo)
  57. test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter)
  58. predicted_true = predicted_true.append(test_true_heter)
  59. predicted_false = predicted_false.append(test_false_heter)
  60. predicted_true_filename = sample_name + '_predicted_true.txt'
  61. predicted_false_filename = sample_name + '_predicted_false.txt'
  62. predicted_true.to_csv(predicted_true_filename,sep='\t',index=False)
  63. predicted_false.to_csv(predicted_false_filename,sep='\t',index=False)
  64. # output the bed file and padding bed region 50bp
  65. predicted_true_bed_filename = sample_name + '_predicted_true.bed'
  66. predicted_false_bed_filename = sample_name + '_predicted_false.bed'
  67. padding_filename = sample_name + '_padding.bed'
  68. predicted_true_bed = open(predicted_true_bed_filename,'w')
  69. predicted_false_bed = open(predicted_false_bed_filename,'w')
  70. padding = open(padding_filename,'w')
  71. #
  72. for index,row in predicted_false.iterrows():
  73. chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
  74. outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
  75. predicted_false_bed.write(outline_pos)
  76. chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50)
  77. outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n'
  78. outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n'
  79. padding.write(outline_pad_1)
  80. padding.write(outline_pad_2)
  81. for index,row in predicted_true.iterrows():
  82. chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
  83. outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
  84. predicted_true_bed.write(outline_pos)