您最多选择25个主题 主题必须以字母或数字开头,可以包含连字符 (-),并且长度不得超过35个字符

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  1. # import modules
  2. import numpy as np
  3. import pandas as pd
  4. from sklearn import svm
  5. from sklearn import preprocessing
  6. import sys, argparse, os
  7. from vcf2bed import position_to_bed,padding_region
  8. parser = argparse.ArgumentParser(description="this script is to preform one calss svm on each chromosome")
  9. parser.add_argument('-train', '--trainDataset', type=str, help='training dataset generated from extracting vcf information part, with mutaitons supported by callsets', required=True)
  10. parser.add_argument('-test', '--testDataset', type=str, help='testing dataset generated from extracting vcf information part, with mutaitons not called by all callsets', required=True)
  11. parser.add_argument('-name', '--sampleName', type=str, help='sample name for output file name', required=True)
  12. args = parser.parse_args()
  13. # Rename input:
  14. train_input = args.trainDataset
  15. test_input = args.testDataset
  16. sample_name = args.sampleName
  17. # default columns, which will be included in the included in the calssifier
  18. chromosome = ['chr1','chr2','chr3','chr4','chr5','chr6','chr7','chr8','chr9','chr10','chr11','chr12','chr13','chr14','chr15' ,'chr16','chr17','chr18','chr19','chr20','chr21','chr22','chrX','chrY']
  19. feature_heter_cols = ['AltDP','BaseQRankSum','DB','DP','FS','GQ','MQ','MQRankSum','QD','ReadPosRankSum','RefDP','SOR','af']
  20. feature_homo_cols = ['AltDP','DB','DP','FS','GQ','MQ','QD','RefDP','SOR','af']
  21. # import datasets sepearate the records with or without BaseQRankSum annotation, etc.
  22. def load_dat(dat_file_name):
  23. dat = pd.read_table(dat_file_name)
  24. dat['DB'] = dat['DB'].fillna(0)
  25. dat = dat[dat['DP'] != 0]
  26. dat['af'] = dat['AltDP']/(dat['AltDP'] + dat['RefDP'])
  27. homo_rows = dat[dat['BaseQRankSum'].isnull()]
  28. heter_rows = dat[dat['BaseQRankSum'].notnull()]
  29. return homo_rows,heter_rows
  30. train_homo,train_heter = load_dat(train_input)
  31. test_homo,test_heter = load_dat(test_input)
  32. clf = svm.OneClassSVM(nu=0.05,kernel='rbf', gamma='auto_deprecated',cache_size=500)
  33. def prepare_dat(train_dat,test_dat,feature_cols,chromo):
  34. chr_train = train_dat[train_dat['chromo'] == chromo]
  35. chr_test = test_dat[test_dat['chromo'] == chromo]
  36. train_dat = chr_train.loc[:,feature_cols]
  37. test_dat = chr_test.loc[:,feature_cols]
  38. train_dat_scaled = preprocessing.scale(train_dat)
  39. test_dat_scaled = preprocessing.scale(test_dat)
  40. return chr_test,train_dat_scaled,test_dat_scaled
  41. def oneclass(X_train,X_test,chr_test):
  42. clf.fit(X_train)
  43. y_pred_test = clf.predict(X_test)
  44. test_true_dat = chr_test[y_pred_test == 1]
  45. test_false_dat = chr_test[y_pred_test == -1]
  46. return test_true_dat,test_false_dat
  47. predicted_true = pd.DataFrame(columns=train_homo.columns)
  48. predicted_false = pd.DataFrame(columns=train_homo.columns)
  49. for chromo in chromosome:
  50. # homo datasets
  51. chr_test_homo,X_train_homo,X_test_homo = prepare_dat(train_homo,test_homo,feature_homo_cols,chromo)
  52. test_true_homo,test_false_homo = oneclass(X_train_homo,X_test_homo,chr_test_homo)
  53. predicted_true = predicted_true.append(test_true_homo)
  54. predicted_false = predicted_false.append(test_false_homo)
  55. # heter datasets
  56. chr_test_heter,X_train_heter,X_test_heter = prepare_dat(train_heter,test_heter,feature_heter_cols,chromo)
  57. test_true_heter,test_false_heter = oneclass(X_train_heter,X_test_heter,chr_test_heter)
  58. predicted_true = predicted_true.append(test_true_heter)
  59. predicted_false = predicted_false.append(test_false_heter)
  60. predicted_true_filename = sample_name + '_predicted_true.txt'
  61. predicted_false_filename = sample_name + '_predicted_false.txt'
  62. predicted_true.to_csv(predicted_true_filename,sep='\t',index=False)
  63. predicted_false.to_csv(predicted_false_filename,sep='\t',index=False)
  64. # output the bed file and padding bed region 50bp
  65. predicted_true_bed_filename = sample_name + '_predicted_true.bed'
  66. predicted_false_bed_filename = sample_name + '_predicted_false.bed'
  67. padding_filename = sample_name + '_padding.bed'
  68. predicted_true_bed = open(predicted_true_bed_filename,'w')
  69. predicted_false_bed = open(predicted_false_bed_filename,'w')
  70. padding = open(padding_filename,'w')
  71. #
  72. for index,row in predicted_false.iterrows():
  73. chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
  74. outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
  75. predicted_false_bed.write(outline_pos)
  76. chromo,pad_pos1,pad_pos2,pad_pos3,pad_pos4 = padding_region(chromo,pos1,pos2,50)
  77. outline_pad_1 = chromo + '\t' + str(pad_pos1) + '\t' + str(pad_pos2) + '\n'
  78. outline_pad_2 = chromo + '\t' + str(pad_pos3) + '\t' + str(pad_pos4) + '\n'
  79. padding.write(outline_pad_1)
  80. padding.write(outline_pad_2)
  81. for index,row in predicted_true.iterrows():
  82. chromo,pos1,pos2 = position_to_bed(row['chromo'],row['pos'],row['ref'],row['alt'])
  83. outline_pos = chromo + '\t' + str(pos1) + '\t' + str(pos2) + '\n'
  84. predicted_true_bed.write(outline_pos)