You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

527 line
21KB

  1. #########################################################################
  2. ############################ HOW TO USE ##########################
  3. #########################################################################
  4. # 'x' is a matrix of segmented output from ASCAT, with at least the
  5. # following columns (column names are not important):
  6. # 1: sample id
  7. # 2: chromosome (numeric)
  8. # 3: segment start
  9. # 4: segment end
  10. # 5: number of probes
  11. # 6: total copy number
  12. # 7: nA
  13. # 8: nB
  14. # 9: ploidy
  15. # 10: contamination, aberrant cell fraction
  16. #chrominfo <- GetChrominfo() # hg19
  17. #
  18. #save(chrominfo,file = "f:/LX/results/ascatNgs_res/chrominfo.RData")
  19. #load("f:/LX/results/ascatNgs_res/chrominfo.RData")
  20. # chrominfo is a 5 column matrix with information about the chromosomes:
  21. # 1: chromosome number (23 = X, 24 = Y)
  22. # 2: chromosome length
  23. # 3: centromere location (mean of start and end)
  24. # 4: centromere start
  25. # 5: centromere end
  26. #ntai <- calc.ai(x, chrominfo)
  27. #lst <- calc.lst(x, chrominfo)
  28. #hrd <- calc.hrd(x) # Throughout this text, "hrd" refers to "HRD-LOH"
  29. #########################################################################
  30. ############################ FUNCTIONS ##########################
  31. #########################################################################
  32. #########################################################################
  33. # Number of telomeric AI
  34. # updated and improved to also account for ploidy - 2013-03-11
  35. #########################################################################
  36. calc.ai <- function(
  37. seg, # Matrix of segmented output from ASCAT
  38. chrominfo = FALSE, # Matrix with information about the chromosomes
  39. min.size = 0, # Minimum size of segments
  40. min.probes = 500, # Minimum number of probes in segments
  41. cont = 0, # Contamination threshold. 0 ignores contamination
  42. check.names = TRUE, # Rename any duplicated samples
  43. ploidyByChromosome = TRUE,
  44. return.loc = FALSE, # Return location of NtAI's
  45. shrink = TRUE # Joins segments of identical allelic copy number
  46. ) {
  47. if(ploidyByChromosome){cat("Determining chromosome-specific ploidy by major copy number fraction\n")}
  48. if(!ploidyByChromosome){cat("Determining sample ploidy by major copy number fraction over-all\n")}
  49. seg[,1] <- as.character(seg[,1])
  50. if(check.names){
  51. seg <- check.names.fn(seg)
  52. }
  53. # check and potentially fix if nB is always the smaller column
  54. if(!all(seg[,8] <= seg[,7]) ){
  55. # In case ASCAT people change the algorithm
  56. cat("Warning!! nB not always <= nA!! -- Correcting for internal use (only!)\n")
  57. tmp <- seg
  58. seg[tmp[,8] > tmp[,7],7] <- tmp[tmp[,8] > tmp[,7],8]
  59. seg[tmp[,8] > tmp[,7],8] <- tmp[tmp[,8] > tmp[,7],7]
  60. }
  61. # remove segments smaller min.size and min.probes,
  62. # and with too much contamination
  63. seg <- seg[seg[,5] >= min.probes,]
  64. seg <- seg[seg[,4]- seg[,3] >= min.size,]
  65. seg <- seg[seg[,10] >= cont,]
  66. samples <- as.character(unique(seg[,1]))
  67. if(shrink){
  68. new.seg <- seg[1,]
  69. for(j in samples){
  70. sample.seg <- seg[seg[,1] %in% j,]
  71. new.sample.seg <- seg[1,]
  72. for(i in unique(sample.seg[,2])){
  73. sample.chrom.seg <- sample.seg[sample.seg[,2] %in% i,,drop=F]
  74. sample.chrom.seg <- shrink.seg.ai(sample.chrom.seg)
  75. new.sample.seg <- rbind(new.sample.seg, sample.chrom.seg)
  76. }
  77. new.seg <- rbind(new.seg, new.sample.seg[-1,])
  78. }
  79. seg <- new.seg[-1,]
  80. }
  81. AI <- rep(NA, nrow(seg)) # Add a column to call AI
  82. seg <- cbind(seg, AI)
  83. samples <- as.character(unique(seg[,1]))
  84. ascat.ploidy <- setNames(seg[!duplicated(seg[,1]),9], seg[!duplicated(seg[,1]),1])
  85. for(j in samples){
  86. sample.seg <- seg[seg[,1] %in% j,]
  87. if(!ploidyByChromosome){
  88. ploidy <- vector()
  89. for(k in unique(sample.seg[,6])){
  90. tmp <- sample.seg[sample.seg[,6] %in% k,]
  91. ploidy <- c(ploidy, setNames(sum(tmp[,4]-tmp[,3]), k))
  92. }
  93. ploidy <- as.numeric(names(ploidy[order(ploidy,decreasing=T)]))[1]
  94. # Update "ploidy" column, so the new calculated value can be returned
  95. sample.seg[,9] <- ploidy
  96. # Add a columnm to define AI,
  97. # with codes for telomeric/interstitial/whole chromosome:
  98. # 0 = no AI
  99. # 1 = telomeric
  100. # 2 = interstitial
  101. # 3 = whole chromosome
  102. if(ploidy %in% c(1,seq(2, 200,by=2))){
  103. sample.seg[,'AI'] <- c(0,2)[match(sample.seg[,7] == sample.seg[,8], c('TRUE', 'FALSE'))]
  104. }
  105. if(!ploidy %in% c(1,seq(2, 200,by=2))){
  106. sample.seg[,'AI'] <- c(0,2)[match(sample.seg[,7] + sample.seg[,8] == ploidy & sample.seg[,7] != ploidy, c('TRUE', 'FALSE'))]
  107. }
  108. }
  109. new.sample.seg<- sample.seg[1,]
  110. for(i in unique(sample.seg[,2])){
  111. sample.chrom.seg <- sample.seg[sample.seg[,2] %in% i,,drop=F]
  112. if(nrow(sample.chrom.seg) == 0){ next}
  113. if(ploidyByChromosome){
  114. ploidy <- vector()
  115. for(k in unique(sample.seg[,6])){
  116. tmp <- sample.chrom.seg[sample.chrom.seg[,6] %in% k,]
  117. ploidy <- c(ploidy, setNames(sum(tmp[,4]-tmp[,3]), k))
  118. ploidy <- ploidy[!names(ploidy) %in% 0] #Remove any ploidy calls of zero
  119. }
  120. ploidy <- as.numeric(names(ploidy[order(ploidy,decreasing=T)]))[1]
  121. sample.chrom.seg[,9] <- ploidy # update "ploidy" column, so the new calculated value can be returned
  122. if(ploidy %in% c(1,seq(2, 200,by=2))){
  123. sample.chrom.seg[,'AI'] <- c(0,2)[match(sample.chrom.seg[,7] == sample.chrom.seg[,8], c('TRUE', 'FALSE'))]
  124. }
  125. if(!ploidy %in% c(1,seq(2, 200,by=2))){
  126. sample.chrom.seg[,'AI'] <- c(0,2)[match(sample.chrom.seg[,7] + sample.chrom.seg[,8] == ploidy & sample.chrom.seg[,8] != 0, c('TRUE', 'FALSE'))]
  127. }
  128. sample.seg[sample.seg[,2] %in% i,9] <-ploidy
  129. sample.seg[sample.seg[,2] %in% i,'AI'] <-sample.chrom.seg[,'AI']
  130. }
  131. # By logical, we assume that chrominfo == FALSE, hence we here proceed without checking for the centromere (useful for non-human samples)
  132. if(class(chrominfo) == 'logical'){
  133. if(sample.chrom.seg[1,'AI'] == 2 & nrow(sample.chrom.seg) != 1){
  134. sample.seg[sample.seg[,2]==i,'AI'][1] <- 1
  135. }
  136. if(sample.chrom.seg[nrow(sample.chrom.seg),'AI'] == 2 & nrow(sample.chrom.seg) != 1){
  137. sample.seg[sample.seg[,2]==i,'AI'][nrow(sample.seg[sample.seg[,2]==i,])] <- 1
  138. }
  139. }
  140. if(class(chrominfo) != 'logical'){ # Here we consider the centromere
  141. if(sample.chrom.seg[1,'AI'] == 2 & nrow(sample.chrom.seg) != 1 & sample.chrom.seg[1,4] < (chrominfo[i,3])){
  142. sample.seg[sample.seg[,2]==i,'AI'][1] <- 1
  143. }
  144. if(sample.chrom.seg[nrow(sample.chrom.seg),'AI'] == 2 & nrow(sample.chrom.seg) != 1 & sample.chrom.seg[nrow(sample.chrom.seg),3] > (chrominfo[i,3])){
  145. sample.seg[sample.seg[,2]==i,'AI'][nrow(sample.seg[sample.seg[,2]==i,])] <- 1
  146. }
  147. }
  148. if(nrow(sample.seg[sample.seg[,2]==i,]) == 1 & sample.seg[sample.seg[,2]==i,'AI'][1] != 0){
  149. sample.seg[sample.seg[,2]==i,'AI'][1] <- 3
  150. }
  151. }
  152. seg[seg[,1] %in% j,] <- sample.seg
  153. }
  154. samples <- as.character(unique(seg[,1]))
  155. no.events <- matrix(0, nrow=length(samples), ncol=12)
  156. rownames(no.events) <- samples
  157. colnames(no.events) <- c("Telomeric AI", "Mean size", "Interstitial AI", "Mean Size", "Whole chr AI", "Telomeric LOH", "Mean size", "Interstitial LOH", "Mean Size", "Whole chr LOH", "Ploidy", "Aberrant cell fraction")
  158. for(j in samples){
  159. sample.seg <- seg[seg[,1] %in% j,]
  160. no.events[j,1] <- nrow(sample.seg[sample.seg[,'AI'] == 1,])
  161. no.events[j,2] <- mean(sample.seg[sample.seg[,'AI'] == 1,4] - sample.seg[sample.seg[,'AI'] == 1,3])
  162. no.events[j,3] <- nrow(sample.seg[sample.seg[,'AI'] == 2,])
  163. no.events[j,4] <- mean(sample.seg[sample.seg[,'AI'] == 2,4] - sample.seg[sample.seg[,'AI'] == 2,3])
  164. no.events[j,5] <- nrow(sample.seg[sample.seg[,'AI'] == 3,])
  165. no.events[j,11] <- ascat.ploidy[j]
  166. no.events[j,12] <- unique(sample.seg[,10]) # aberrant cell fraction
  167. # Here we restrict ourselves to real LOH
  168. sample.seg <- sample.seg[sample.seg[,8] == 0,]
  169. no.events[j,6] <- nrow(sample.seg[sample.seg[,'AI'] == 1,])
  170. no.events[j,7] <- mean(sample.seg[sample.seg[,'AI'] == 1,4] - sample.seg[sample.seg[,'AI'] == 1,3])
  171. no.events[j,8] <- nrow(sample.seg[sample.seg[,'AI'] == 2,])
  172. no.events[j,9] <- mean(sample.seg[sample.seg[,'AI'] == 2,4] - sample.seg[sample.seg[,'AI'] == 2,3])
  173. no.events[j,10] <- nrow(sample.seg[sample.seg[,'AI'] == 3,])
  174. }
  175. if(return.loc){
  176. return(seg)
  177. } else {
  178. return(no.events)
  179. }
  180. }
  181. #########################################################################
  182. # This function calculates Popova's LST measure
  183. # (Popova 2012, Cancer research)
  184. # http://www.bio-protocol.org/e814
  185. # Popova's cutoffs: 15 LSTs in near-diploid, 20 in near-tetraploid
  186. #########################################################################
  187. calc.lst <- function(
  188. seg, # Matrix of segmented output from ASCAT
  189. chrominfo, # Matrix with information about the chromosomes
  190. nA = 7, # The column index of 'seg' where copy number of A
  191. # allele is found
  192. check.names = T, # Rename any duplicated samples
  193. min.probes = 50, # As described by Popova (50 for SNP6)
  194. return.loc = FALSE, # Return location of LST sites
  195. chr.arm = 'no' # Option to use chromosome arms defined during
  196. # segmentation. The option must give a column
  197. # that holds the chromosome arm information
  198. # (or 'no' to not use this)
  199. ){
  200. seg[,1] <- as.character(seg[,1])
  201. if(check.names){
  202. seg <- check.names.fn(seg)
  203. }
  204. if(! all(seg[,8] <= seg[,7]) ){
  205. # In case ASCAT people change the algorithm ### They did!!
  206. cat("Warning!! nB not always <= nA!! -- Correcting for internal use (only!)\n")
  207. tmp <- seg
  208. seg[tmp[,8] > tmp[,7],7] <- tmp[tmp[,8] > tmp[,7],8]
  209. seg[tmp[,8] > tmp[,7],8] <- tmp[tmp[,8] > tmp[,7],7]
  210. }
  211. seg <- seg[seg[,5] >= min.probes,]
  212. nB <- nA+1
  213. samples <- unique(seg[,1])
  214. output <- setNames(rep(0,length(samples)), samples)
  215. if(return.loc) {
  216. out.seg <- matrix(0,0,10)
  217. colnames(out.seg) <- c(colnames(seg)[1:8],'LST breakpoint', 'chr. arm')
  218. }
  219. for(j in samples){
  220. sample.seg <- seg[seg[,1] %in% j,]
  221. sample.lst <- c()
  222. for(i in unique(sample.seg[,2])){
  223. sample.chrom.seg <- sample.seg[sample.seg[,2] %in% i,,drop=F]
  224. if(chr.arm !='no'){
  225. p.max <- if(any(sample.chrom.seg[,chr.arm] == 'p')){max(sample.chrom.seg[sample.chrom.seg[,chr.arm] == 'p',4])}
  226. q.min <- min(sample.chrom.seg[sample.chrom.seg[,chr.arm] == 'q',3])
  227. }
  228. if(nrow(sample.chrom.seg) < 2) {next}
  229. sample.chrom.seg.new <- sample.chrom.seg
  230. if(chr.arm == 'no'){
  231. # split into chromosome arms
  232. p.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,3] <= chrominfo[i,4],]
  233. q.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,4] >= chrominfo[i,5],]
  234. q.arm<- shrink.seg.ai(q.arm)
  235. p.arm<- shrink.seg.ai(p.arm)
  236. p.arm[nrow(p.arm),4] <- chrominfo[i,4]
  237. q.arm[1,3] <- chrominfo[i,5]
  238. }
  239. if(chr.arm != 'no'){
  240. q.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,chr.arm] == 'q',,drop=F]
  241. q.arm<- shrink.seg.ai(q.arm)
  242. q.arm[1,3] <- q.min
  243. if(any(sample.chrom.seg.new[,chr.arm] == 'p')){
  244. # split into chromosome arms
  245. p.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,chr.arm] == 'p',,drop=F]
  246. p.arm<- shrink.seg.ai(p.arm)
  247. p.arm[nrow(p.arm),4] <- p.max
  248. }
  249. }
  250. n.3mb <- which((p.arm[,4] - p.arm[,3]) < 3e6)
  251. while(length(n.3mb) > 0){
  252. p.arm <- p.arm[-(n.3mb[1]),]
  253. p.arm <- shrink.seg.ai(p.arm)
  254. n.3mb <- which((p.arm[,4] - p.arm[,3]) < 3e6)
  255. }
  256. if(nrow(p.arm) >= 2){
  257. p.arm <- cbind(p.arm[,1:8], c(0,1)[match((p.arm[,4]-p.arm[,3]) >= 10e6, c('FALSE','TRUE'))])
  258. for(k in 2:nrow(p.arm)){
  259. if(p.arm[k,9] == 1 & p.arm[(k-1),9]==1 & (p.arm[k,3]-p.arm[(k-1),4]) < 3e6){
  260. sample.lst <- c(sample.lst, 1)
  261. if(return.loc){
  262. ## Number indicates if start (1) or end (2) defines the breakpoint
  263. a<- cbind(p.arm[(k-1),1:8], 2,'p-arm')
  264. b <- cbind(p.arm[k,1:8], 1,'p-arm')
  265. colnames(a)[9:10]<- colnames(b)[9:10]<- c('LST breakpoint', 'chr. arm')
  266. out.seg <- rbind(out.seg, a,b)
  267. }
  268. }
  269. }
  270. }
  271. n.3mb <- which((q.arm[,4] - q.arm[,3]) < 3e6)
  272. while(length(n.3mb) > 0){
  273. q.arm <- q.arm[-(n.3mb[1]),]
  274. q.arm <- shrink.seg.ai(q.arm)
  275. n.3mb <- which((q.arm[,4] - q.arm[,3]) < 3e6)
  276. }
  277. if(nrow(q.arm) >= 2){
  278. q.arm <- cbind(q.arm[,1:8], c(0,1)[match((q.arm[,4]-q.arm[,3]) >= 10e6, c('FALSE','TRUE'))])
  279. for(k in 2:nrow(q.arm)){
  280. if(q.arm[k,9] == 1 & q.arm[(k-1),9]==1 & (q.arm[k,3]-q.arm[(k-1),4]) < 3e6){
  281. sample.lst <- c(sample.lst, 1)
  282. if(return.loc){
  283. ## Number indicates if start (1) or end (2) defines the breakpoint
  284. a<- cbind(q.arm[(k-1),1:8], 2,'q-arm')
  285. b <- cbind(q.arm[k,1:8], 1,'q-arm')
  286. colnames(a)[9:10]<- colnames(b)[9:10]<- c('LST breakpoint', 'chr. arm')
  287. out.seg <- rbind(out.seg, a,b)
  288. }
  289. }
  290. }
  291. }
  292. }
  293. output[j] <- sum(sample.lst)
  294. }
  295. if(return.loc){
  296. return(out.seg)
  297. } else {
  298. return(output)
  299. }
  300. }
  301. #########################################################################
  302. # This function is an implementation of the cisplatin predictor developed
  303. # by Myriad with Gordon Mills PMID: 23047548.
  304. # Abkevichs cutoffs: > 10, found in the supplementary info
  305. #########################################################################
  306. calc.hrd <- function(
  307. seg, # Matrix of segmented output from ASCAT
  308. nA = 7, # The column index of 'seg' where copy number of A
  309. # allele is found
  310. check.names = TRUE, # Rename any duplicated samples
  311. return.loc = FALSE # Return location of HRD sites
  312. ){
  313. seg[,1] <- as.character(seg[,1])
  314. if(check.names){
  315. seg <- check.names.fn(seg)
  316. }
  317. if(! all(seg[,8] <= seg[,7]) ){
  318. # In case ASCAT people change the algorithm ### They did!!
  319. cat("Warning!! nB not always <= nA!! -- Correcting for internal use (only!)\n")
  320. tmp <- seg
  321. seg[tmp[,8] > tmp[,7],7] <- tmp[tmp[,8] > tmp[,7],8]
  322. seg[tmp[,8] > tmp[,7],8] <- tmp[tmp[,8] > tmp[,7],7]
  323. }
  324. nB <- nA+1
  325. output <- rep(0, length(unique(seg[,1])))
  326. names(output) <- unique(seg[,1])
  327. if(return.loc) {
  328. out.seg <- matrix(0,0,9)
  329. colnames(out.seg) <- c(colnames(seg)[1:8],'HRD breakpoint')
  330. }
  331. for(i in unique(seg[,1])){
  332. segSamp <- seg[seg[,1] %in% i,]
  333. chrDel <-vector()
  334. for(j in unique(segSamp[,2])){
  335. if(all(segSamp[segSamp[,2] == j,nB] == 0)) {
  336. chrDel <- c(chrDel, j)
  337. }
  338. }
  339. segLOH <- segSamp[segSamp[,nB] == 0 & segSamp[,nA] != 0,,drop=F]
  340. segLOH <- segLOH[segLOH[,4]-segLOH[,3] > 15e6,,drop=F]
  341. segLOH <- segLOH[!segLOH[,2] %in% chrDel,,drop=F]
  342. output[i] <- nrow(segLOH)
  343. if(return.loc){
  344. if(nrow(segLOH) < 1){next}
  345. segLOH <- cbind(segLOH[,1:8], 1)
  346. colnames(segLOH)[9] <- 'HRD breakpoint'
  347. out.seg <- rbind(out.seg, segLOH)
  348. }
  349. }
  350. if(return.loc){
  351. return(out.seg)
  352. } else {
  353. return(output)
  354. }
  355. }
  356. #########################################################################
  357. # Function to check for duplicate sample names in an ASCAT segmented
  358. # object, and return new names
  359. #########################################################################
  360. check.names.fn <- function(
  361. seg,
  362. max.size = 3.2e9,
  363. remove.dup = TRUE
  364. ){
  365. tmp <- setNames(paste(seg[,1],seg[,9],seg[,11],sep='_'), seg[,1])
  366. sample.names <- names(tmp[!duplicated(tmp)]) # this is based on the fact that ASCAT ploidy is always highly variable, as it is a mean of all the segments
  367. if(any(duplicated(sample.names))){
  368. cat("Warning!! Duplicate names found! Renaming duplicates, adding a '.(number)' to each beyond no. 1\n")
  369. dup.samp <- unique(sample.names[duplicated(sample.names)])
  370. for(i in 1:length(dup.samp)){
  371. dup.samp.seg <- seg[seg[,1] %in% dup.samp[i],]
  372. tmp<- unique(paste(dup.samp.seg[,1],dup.samp.seg[,9],dup.samp.seg[,11],sep='_'))
  373. for(k in 2:length(tmp)){
  374. dup.samp.seg[paste(dup.samp.seg[,1],dup.samp.seg[,9],dup.samp.seg[,11],sep='_') %in% tmp[k],1] <- paste(dup.samp.seg[paste(dup.samp.seg[,1],dup.samp.seg[,9],dup.samp.seg[,11],sep='_') %in% tmp[k],1],'.',k,sep='')
  375. }
  376. seg[seg[,1] %in% dup.samp[i],] <- dup.samp.seg
  377. }
  378. }
  379. # Now check for "silent" duplicates
  380. sample.names <- unique(seg[,1])
  381. for(i in sample.names){
  382. tmp <- seg[seg[,1] %in% i,]
  383. if(sum(tmp[,4] - tmp[,3]) > max.size){
  384. a <- which(tmp[,2] %in% 1)
  385. b <- which(tmp[,2] %in% 22)
  386. a <- a[!(a == (1:b[1])[1:length(a)])][1]
  387. tmp[a:nrow(tmp),1] <- paste('Duplicate', tmp[1,1], sep='-')
  388. seg[seg[,1] %in% i,] <- tmp
  389. }
  390. }
  391. if(remove.dup){
  392. seg <- seg[!grepl('Duplicate', seg[,1]),]
  393. }
  394. return(seg)
  395. }
  396. #########################################################################
  397. # AI-specific function to shrink a DNAcopy style matrix.
  398. # Works on a chromosome level.
  399. # Used to condense rows with identical CN values, which may have been
  400. # separated due to other values not currently of interest (e.g. filtered
  401. # out for number of probes)
  402. #########################################################################
  403. shrink.seg.ai <- function(chr.seg) {
  404. new.chr <- matrix(0,0,ncol(chr.seg))
  405. colnames(new.chr) <- colnames(chr.seg)
  406. new.chr <- chr.seg
  407. seg.class <- c(1)
  408. for(j in 2:nrow(new.chr)){
  409. ifelse(new.chr[(j-1),7] == new.chr[j,7] & new.chr[(j-1),8] == new.chr[j,8], seg.class <- c(seg.class, seg.class[j-1]), seg.class <- c(seg.class, seg.class[j-1]+1))
  410. }
  411. for(j in unique(seg.class)){
  412. new.chr[seg.class %in% j,4] <- max(new.chr[seg.class %in% j,4])
  413. new.chr[seg.class %in% j,5] <- sum(new.chr[seg.class %in% j,5])
  414. }
  415. new.chr<- new.chr[!duplicated(seg.class),]
  416. return(new.chr)
  417. }
  418. #########################################################################
  419. # Functions used to get/make the chrominfo table, which is needed to
  420. # calculate NtAI and LST scores.
  421. #########################################################################
  422. GetGzFromUrl <- function(url, ...) {
  423. # http://stackoverflow.com/questions/9548630/read-gzipped-csv-directly-from-a-url-in-r
  424. con <- gzcon(url(url))
  425. txt <- readLines(con)
  426. dat <- read.delim(textConnection(txt), ...)
  427. return(dat)
  428. }
  429. GetChrominfo <- function() {
  430. # Get chromInfo table from UCSC
  431. chrom <- GetGzFromUrl("http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/chromInfo.txt.gz", header = FALSE)
  432. chrom <- subset(chrom, grepl("^chr[0-9XY]{1,2}$", chrom[,1]))
  433. # Get gap table from UCSC
  434. gaps <- GetGzFromUrl("http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gap.txt.gz", header = FALSE)
  435. centro <- subset(gaps, gaps[,8] == "centromere")
  436. # Merge the relevant info from the two tables
  437. chrominfo <- merge(chrom[,1:2], centro[,2:4], by.x = 1, by.y = 1) # merge size and centromere location
  438. chrominfo$centromere <- rowMeans(chrominfo[,3:4]) # convert centromere start and end into one location (the mean)
  439. chrominfo <- chrominfo[,c(1,2,5,3,4)] # keep chromosome, size and centromere location
  440. colnames(chrominfo) <- c("chr", "size", "centromere", "centstart", "centend")
  441. chrominfo[,1] <- as.character(chrominfo[,1])
  442. chrominfo$chr <- sub("chr", "", chrominfo$chr)
  443. chrominfo$chr <- sub("X", "23", chrominfo$chr)
  444. chrominfo$chr <- sub("Y", "24", chrominfo$chr)
  445. chrominfo[,1] <- as.numeric(chrominfo[,1])
  446. chrominfo <- chrominfo[order(chrominfo$chr), ] # order by chromosome number
  447. rownames(chrominfo) <- as.character(chrominfo[,1])
  448. chrominfo <- as.matrix(chrominfo)
  449. return(chrominfo)
  450. }
  451. #################################################
  452. #################################################
  453. ######## Functions used to count ########
  454. ######## total probe number ########
  455. ######## within a segment ########
  456. #################################################
  457. #################################################
  458. count_total_numbers <- function(segment_data,##This part from ascat.output$segments (ASCAT output object)
  459. SNP_pos_info ##Tihs part from ascat.bc$SNPpos (ASCAT object)
  460. ){
  461. ###produce segment start and end positions
  462. ###different samples might have different start and end positions
  463. seg_start <- paste(segment_data[,1],segment_data[,2],segment_data[,3],sep = "_")
  464. seg_end <- paste(segment_data[,1],segment_data[,2],segment_data[,4],sep = "_")
  465. ##make matrix containing
  466. ##each columns a sample with all SNP sites and position
  467. samples <- unique(segment_data[,1])
  468. all_SNP_pos <- paste(SNP_pos_info[,1],SNP_pos_info[,2],sep = "_")
  469. sample_SNP_pos <- sapply(samples,function(x){
  470. paste(x,all_SNP_pos,sep = "_")
  471. })
  472. rownames(sample_SNP_pos) <- rownames(SNP_pos_info)
  473. ##start and end positions for each sample
  474. ##True means that this SNP sites was start or end
  475. start_SNP_pos <- apply(sample_SNP_pos,2,function(x,y,z){
  476. setNames(x%in%y,z)
  477. },y=seg_start,z=rownames(SNP_pos_info))
  478. end_SNP_pos <- apply(sample_SNP_pos,2,function(x,y,z){
  479. setNames(x%in%y,z)
  480. },y=seg_end,z=rownames(SNP_pos_info))
  481. ##count probe numbers within a segment
  482. total_probe_number <- apply(end_SNP_pos,2,function(x){
  483. end_pos_in_a_sample <- which(x)
  484. c(end_pos_in_a_sample[1],diff(end_pos_in_a_sample,lag = 1))
  485. })
  486. unlist(total_probe_number)
  487. }