6 vuotta sitten · 69db19243b
--- a/HRD_functions.R
+++ b/HRD_functions.R
@@ -0,0 +1,526 @@
 #########################################################################
 ############################     HOW TO USE    ##########################
 #########################################################################

 # 'x' is a matrix of segmented output from ASCAT, with at least the
 #   following columns (column names are not important):
 # 1: sample id
 # 2: chromosome (numeric)
 # 3: segment start
 # 4: segment end
 # 5: number of probes
 # 6: total copy number
 # 7: nA
 # 8: nB
 # 9: ploidy
 # 10: contamination, aberrant cell fraction

 #chrominfo <- GetChrominfo() # hg19
 #
 #save(chrominfo,file = "f:/LX/results/ascatNgs_res/chrominfo.RData")
 #load("f:/LX/results/ascatNgs_res/chrominfo.RData")

 # chrominfo is a 5 column matrix with information about the chromosomes:
 # 1: chromosome number (23 = X, 24 = Y)
 # 2: chromosome length
 # 3: centromere location (mean of start and end)
 # 4: centromere start
 # 5: centromere end

 #ntai <- calc.ai(x, chrominfo)
 #lst <- calc.lst(x, chrominfo)
 #hrd <- calc.hrd(x)      # Throughout this text, "hrd" refers to "HRD-LOH"



 #########################################################################
 ############################     FUNCTIONS     ##########################
 #########################################################################


 #########################################################################
 # Number of telomeric AI
 # updated and improved to also account for ploidy - 2013-03-11
 #########################################################################

 calc.ai <- function(
  seg,                # Matrix of segmented output from ASCAT
  chrominfo = FALSE,  # Matrix with information about the chromosomes
  min.size = 0,       # Minimum size of segments
  min.probes = 500,   # Minimum number of probes in segments
  cont = 0,           # Contamination threshold. 0 ignores contamination
  check.names = TRUE, # Rename any duplicated samples
  ploidyByChromosome = TRUE,
  return.loc = FALSE, # Return location of NtAI's
  shrink = TRUE       # Joins segments of identical allelic copy number
 ) {
  if(ploidyByChromosome){cat("Determining chromosome-specific ploidy by major copy number fraction\n")}
  if(!ploidyByChromosome){cat("Determining sample ploidy by major copy number fraction over-all\n")}
  seg[,1] <- as.character(seg[,1])
  if(check.names){
    seg <- check.names.fn(seg)
  }
  # check and potentially fix if nB is always the smaller column
  if(!all(seg[,8] <= seg[,7]) ){
    # In case ASCAT people change the algorithm
    cat("Warning!! nB  not always <= nA!!  -- Correcting for internal use (only!)\n") 
    tmp <- seg
    seg[tmp[,8] > tmp[,7],7]  <- tmp[tmp[,8] > tmp[,7],8]
    seg[tmp[,8] > tmp[,7],8]  <- tmp[tmp[,8] > tmp[,7],7]
  }
  # remove segments smaller min.size and min.probes,
  #   and with too much contamination
  seg <- seg[seg[,5] >= min.probes,]
  seg <- seg[seg[,4]- seg[,3] >= min.size,]
  seg <- seg[seg[,10] >= cont,]
  samples <- as.character(unique(seg[,1]))
  if(shrink){
    new.seg <- seg[1,]
    for(j in samples){
      sample.seg <- seg[seg[,1] %in% j,]
      new.sample.seg <- seg[1,]
      for(i in unique(sample.seg[,2])){
        sample.chrom.seg <- sample.seg[sample.seg[,2] %in% i,,drop=F]
        sample.chrom.seg <- shrink.seg.ai(sample.chrom.seg)
        new.sample.seg <- rbind(new.sample.seg, sample.chrom.seg)
      }
      new.seg <- rbind(new.seg, new.sample.seg[-1,])    
    }
    seg <- new.seg[-1,]
  }      
  AI <- rep(NA, nrow(seg)) # Add a column to call AI
  seg <- cbind(seg, AI)
  samples <- as.character(unique(seg[,1]))
  ascat.ploidy <- setNames(seg[!duplicated(seg[,1]),9], seg[!duplicated(seg[,1]),1])
  for(j in samples){
    sample.seg <- seg[seg[,1] %in% j,]
    if(!ploidyByChromosome){
      ploidy <- vector()
      for(k in unique(sample.seg[,6])){
        tmp <- sample.seg[sample.seg[,6] %in% k,]
        ploidy <- c(ploidy, setNames(sum(tmp[,4]-tmp[,3]), k))
      }
      ploidy <- as.numeric(names(ploidy[order(ploidy,decreasing=T)]))[1]    
      # Update "ploidy" column, so the new calculated value can be returned
      sample.seg[,9] <- ploidy 
      # Add a columnm to define AI,
      # with codes for telomeric/interstitial/whole chromosome:
      # 0 = no AI
      # 1 = telomeric
      # 2 = interstitial
      # 3 = whole chromosome
      if(ploidy %in% c(1,seq(2, 200,by=2))){
        sample.seg[,'AI'] <- c(0,2)[match(sample.seg[,7] == sample.seg[,8], c('TRUE', 'FALSE'))]
      }    
      if(!ploidy %in%  c(1,seq(2, 200,by=2))){
        sample.seg[,'AI'] <- c(0,2)[match(sample.seg[,7] + sample.seg[,8] == ploidy & sample.seg[,7] != ploidy, c('TRUE', 'FALSE'))]
      }
    }
    new.sample.seg<- sample.seg[1,]
    for(i in unique(sample.seg[,2])){
      sample.chrom.seg <- sample.seg[sample.seg[,2] %in% i,,drop=F]
      if(nrow(sample.chrom.seg) == 0){ next}
      if(ploidyByChromosome){
        ploidy <- vector()
        for(k in unique(sample.seg[,6])){
          tmp <- sample.chrom.seg[sample.chrom.seg[,6] %in% k,]
          ploidy <- c(ploidy, setNames(sum(tmp[,4]-tmp[,3]), k))
          ploidy <- ploidy[!names(ploidy) %in% 0] #Remove any ploidy calls of zero
        }
        ploidy <- as.numeric(names(ploidy[order(ploidy,decreasing=T)]))[1]    
        sample.chrom.seg[,9] <- ploidy # update "ploidy" column, so the new calculated value can be returned
        if(ploidy %in% c(1,seq(2, 200,by=2))){
          sample.chrom.seg[,'AI'] <- c(0,2)[match(sample.chrom.seg[,7] == sample.chrom.seg[,8], c('TRUE', 'FALSE'))]
        }    
        if(!ploidy %in%  c(1,seq(2, 200,by=2))){
          sample.chrom.seg[,'AI'] <- c(0,2)[match(sample.chrom.seg[,7] + sample.chrom.seg[,8] == ploidy & sample.chrom.seg[,8] != 0, c('TRUE', 'FALSE'))]
        }
        sample.seg[sample.seg[,2] %in% i,9] <-ploidy
        sample.seg[sample.seg[,2] %in% i,'AI'] <-sample.chrom.seg[,'AI']
      }
      # By logical, we assume that chrominfo == FALSE, hence we here proceed without checking for the centromere (useful for non-human samples)
      if(class(chrominfo) == 'logical'){      
        if(sample.chrom.seg[1,'AI'] == 2 & nrow(sample.chrom.seg) != 1){
          sample.seg[sample.seg[,2]==i,'AI'][1] <- 1
        }
        if(sample.chrom.seg[nrow(sample.chrom.seg),'AI'] == 2 & nrow(sample.chrom.seg) != 1){
          sample.seg[sample.seg[,2]==i,'AI'][nrow(sample.seg[sample.seg[,2]==i,])] <- 1
        }
      }
      if(class(chrominfo) != 'logical'){    # Here we consider the centromere    
        if(sample.chrom.seg[1,'AI'] == 2 & nrow(sample.chrom.seg) != 1 & sample.chrom.seg[1,4] < (chrominfo[i,3])){
          sample.seg[sample.seg[,2]==i,'AI'][1] <- 1
        }
        if(sample.chrom.seg[nrow(sample.chrom.seg),'AI'] == 2 & nrow(sample.chrom.seg) != 1 & sample.chrom.seg[nrow(sample.chrom.seg),3] > (chrominfo[i,3])){
          sample.seg[sample.seg[,2]==i,'AI'][nrow(sample.seg[sample.seg[,2]==i,])] <- 1
        }
      }
      if(nrow(sample.seg[sample.seg[,2]==i,]) == 1 & sample.seg[sample.seg[,2]==i,'AI'][1] != 0){
        sample.seg[sample.seg[,2]==i,'AI'][1] <- 3
      }
    }
    seg[seg[,1] %in% j,] <- sample.seg
  }  
  samples <- as.character(unique(seg[,1]))
  no.events <- matrix(0, nrow=length(samples), ncol=12)
  rownames(no.events) <- samples
  colnames(no.events) <- c("Telomeric AI", "Mean size", "Interstitial AI", "Mean Size", "Whole chr AI", "Telomeric LOH",  "Mean size", "Interstitial LOH", "Mean Size", "Whole chr LOH", "Ploidy", "Aberrant cell fraction")
  for(j in samples){
    sample.seg <- seg[seg[,1] %in% j,]
    no.events[j,1] <- nrow(sample.seg[sample.seg[,'AI'] == 1,])
    no.events[j,2] <- mean(sample.seg[sample.seg[,'AI'] == 1,4] - sample.seg[sample.seg[,'AI'] == 1,3])
    no.events[j,3] <- nrow(sample.seg[sample.seg[,'AI'] == 2,])
    no.events[j,4] <- mean(sample.seg[sample.seg[,'AI'] == 2,4] - sample.seg[sample.seg[,'AI'] == 2,3])
    no.events[j,5] <- nrow(sample.seg[sample.seg[,'AI'] == 3,])
    no.events[j,11] <- ascat.ploidy[j]
    no.events[j,12] <- unique(sample.seg[,10]) # aberrant cell fraction
    # Here we restrict ourselves to real LOH
    sample.seg <- sample.seg[sample.seg[,8] == 0,]
    no.events[j,6] <- nrow(sample.seg[sample.seg[,'AI'] == 1,])
    no.events[j,7] <- mean(sample.seg[sample.seg[,'AI'] == 1,4] - sample.seg[sample.seg[,'AI'] == 1,3])
    no.events[j,8] <- nrow(sample.seg[sample.seg[,'AI'] == 2,])
    no.events[j,9] <- mean(sample.seg[sample.seg[,'AI'] == 2,4] - sample.seg[sample.seg[,'AI'] == 2,3])
    no.events[j,10] <- nrow(sample.seg[sample.seg[,'AI'] == 3,])
  }
  if(return.loc){
    return(seg)
  } else {
    return(no.events)
  }
 }




 #########################################################################
 # This function calculates Popova's LST measure
 # (Popova 2012, Cancer research)
 # http://www.bio-protocol.org/e814
 # Popova's cutoffs: 15 LSTs in near-diploid, 20 in near-tetraploid
 #########################################################################

 calc.lst <- function(
  seg,                # Matrix of segmented output from ASCAT
  chrominfo,          # Matrix with information about the chromosomes
  nA = 7,             # The column index of 'seg' where copy number of A
  #   allele is found
  check.names = T,    # Rename any duplicated samples
  min.probes = 50,    # As described by Popova (50 for SNP6)  
  return.loc = FALSE, # Return location of LST sites
  chr.arm = 'no'      # Option to use chromosome arms defined during
  #   segmentation. The option must give a column
  #   that holds the chromosome arm information
  #   (or 'no' to not use this)
 ){
  seg[,1] <- as.character(seg[,1])
  if(check.names){
    seg <- check.names.fn(seg)
  }
  if(! all(seg[,8] <= seg[,7]) ){
    # In case ASCAT people change the algorithm  ### They did!!
    cat("Warning!! nB  not always <= nA!!  -- Correcting for internal use (only!)\n") 
    tmp <- seg
    seg[tmp[,8] > tmp[,7],7]  <- tmp[tmp[,8] > tmp[,7],8]
    seg[tmp[,8] > tmp[,7],8]  <- tmp[tmp[,8] > tmp[,7],7]
  }
  seg <- seg[seg[,5] >= min.probes,]
  nB <- nA+1
  samples <- unique(seg[,1])
  output <- setNames(rep(0,length(samples)), samples)
  if(return.loc) {
    out.seg <- matrix(0,0,10)
    colnames(out.seg) <- c(colnames(seg)[1:8],'LST breakpoint', 'chr. arm')
  }
  for(j in samples){
    sample.seg <- seg[seg[,1] %in% j,]
    sample.lst <- c()
    for(i in unique(sample.seg[,2])){
      sample.chrom.seg <- sample.seg[sample.seg[,2] %in% i,,drop=F]
      if(chr.arm !='no'){
        p.max <- if(any(sample.chrom.seg[,chr.arm] == 'p')){max(sample.chrom.seg[sample.chrom.seg[,chr.arm] == 'p',4])}
        q.min <- min(sample.chrom.seg[sample.chrom.seg[,chr.arm] == 'q',3])
      }
      if(nrow(sample.chrom.seg) < 2) {next}
      sample.chrom.seg.new <- sample.chrom.seg
      if(chr.arm == 'no'){
        # split into chromosome arms
        p.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,3] <= chrominfo[i,4],] 
        q.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,4] >= chrominfo[i,5],]
        q.arm<- shrink.seg.ai(q.arm)
        p.arm<- shrink.seg.ai(p.arm)
        p.arm[nrow(p.arm),4] <- chrominfo[i,4]
        q.arm[1,3] <- chrominfo[i,5]
      }
      if(chr.arm != 'no'){
        q.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,chr.arm] == 'q',,drop=F]
        q.arm<- shrink.seg.ai(q.arm)
        q.arm[1,3] <- q.min
        if(any(sample.chrom.seg.new[,chr.arm] == 'p')){
          # split into chromosome arms
          p.arm <- sample.chrom.seg.new[sample.chrom.seg.new[,chr.arm] == 'p',,drop=F] 
          p.arm<- shrink.seg.ai(p.arm)
          p.arm[nrow(p.arm),4] <- p.max
        }
      }  
      n.3mb <- which((p.arm[,4] - p.arm[,3]) < 3e6)
      while(length(n.3mb) > 0){
        p.arm <- p.arm[-(n.3mb[1]),]
        p.arm <- shrink.seg.ai(p.arm)
        n.3mb <- which((p.arm[,4] - p.arm[,3]) < 3e6)
      }
      if(nrow(p.arm) >= 2){
        p.arm <- cbind(p.arm[,1:8], c(0,1)[match((p.arm[,4]-p.arm[,3]) >= 10e6, c('FALSE','TRUE'))])
        for(k in 2:nrow(p.arm)){
          if(p.arm[k,9] == 1 & p.arm[(k-1),9]==1 & (p.arm[k,3]-p.arm[(k-1),4]) < 3e6){
            sample.lst <- c(sample.lst, 1)
            if(return.loc){
              ## Number indicates if start (1) or end (2) defines the breakpoint
              a<- cbind(p.arm[(k-1),1:8], 2,'p-arm') 
              b <- cbind(p.arm[k,1:8], 1,'p-arm')
              colnames(a)[9:10]<- colnames(b)[9:10]<- c('LST breakpoint', 'chr. arm')
              out.seg <- rbind(out.seg, a,b)
            }
          }
        }
      }
      n.3mb <- which((q.arm[,4] - q.arm[,3]) < 3e6)
      while(length(n.3mb) > 0){
        q.arm <- q.arm[-(n.3mb[1]),]
        q.arm <- shrink.seg.ai(q.arm)
        n.3mb <- which((q.arm[,4] - q.arm[,3]) < 3e6)
      }
      if(nrow(q.arm) >= 2){
        q.arm <- cbind(q.arm[,1:8], c(0,1)[match((q.arm[,4]-q.arm[,3]) >= 10e6, c('FALSE','TRUE'))])
        for(k in 2:nrow(q.arm)){
          if(q.arm[k,9] == 1 & q.arm[(k-1),9]==1 & (q.arm[k,3]-q.arm[(k-1),4]) < 3e6){
            sample.lst <- c(sample.lst, 1)
            if(return.loc){
              ## Number indicates if start (1) or end (2) defines the breakpoint
              a<- cbind(q.arm[(k-1),1:8], 2,'q-arm') 
              b <- cbind(q.arm[k,1:8], 1,'q-arm')
              colnames(a)[9:10]<- colnames(b)[9:10]<- c('LST breakpoint', 'chr. arm')
              out.seg <- rbind(out.seg, a,b)
            }
          }
        }
      }
    }
    output[j] <- sum(sample.lst)
  }
  if(return.loc){
    return(out.seg)
  } else {
    return(output)
  }
 }



 #########################################################################
 # This function is an implementation of the cisplatin predictor developed
 # by Myriad with Gordon Mills PMID: 23047548.
 # Abkevichs cutoffs: > 10, found in the supplementary info
 #########################################################################

 calc.hrd <- function(
  seg,                # Matrix of segmented output from ASCAT
  nA = 7,             # The column index of 'seg' where copy number of A
  #   allele is found
  check.names = TRUE, # Rename any duplicated samples
  return.loc = FALSE  # Return location of HRD sites
 ){
  seg[,1] <- as.character(seg[,1])
  if(check.names){
    seg <- check.names.fn(seg)
  }
  if(! all(seg[,8] <= seg[,7]) ){
    # In case ASCAT people change the algorithm  ### They did!!
    cat("Warning!! nB  not always <= nA!!  -- Correcting for internal use (only!)\n") 
    tmp <- seg
    seg[tmp[,8] > tmp[,7],7]  <- tmp[tmp[,8] > tmp[,7],8]
    seg[tmp[,8] > tmp[,7],8]  <- tmp[tmp[,8] > tmp[,7],7]
  }
  nB <- nA+1
  output <- rep(0, length(unique(seg[,1])))
  names(output) <- unique(seg[,1])
  if(return.loc) {
    out.seg <- matrix(0,0,9)
    colnames(out.seg) <- c(colnames(seg)[1:8],'HRD breakpoint')
  }
  for(i in unique(seg[,1])){
    segSamp <- seg[seg[,1] %in% i,]
    chrDel <-vector() 
    for(j in unique(segSamp[,2])){   
      if(all(segSamp[segSamp[,2] == j,nB] == 0)) {
        chrDel <- c(chrDel, j)
      }
    }
    segLOH <- segSamp[segSamp[,nB] == 0 & segSamp[,nA] != 0,,drop=F]
    segLOH <- segLOH[segLOH[,4]-segLOH[,3] > 15e6,,drop=F]
    segLOH <- segLOH[!segLOH[,2] %in% chrDel,,drop=F]
    output[i] <- nrow(segLOH)
    if(return.loc){
      if(nrow(segLOH) < 1){next}
      segLOH <- cbind(segLOH[,1:8], 1)
      colnames(segLOH)[9] <- 'HRD breakpoint'
      out.seg <- rbind(out.seg, segLOH)
    }
  }
  if(return.loc){
    return(out.seg)
  } else {
    return(output)
  }
 }  



 #########################################################################
 # Function to check for duplicate sample names in an ASCAT segmented
 # object, and return new names
 #########################################################################

 check.names.fn <- function(
  seg,
  max.size = 3.2e9,
  remove.dup = TRUE
 ){
  tmp <- setNames(paste(seg[,1],seg[,9],seg[,11],sep='_'), seg[,1])
  sample.names <-  names(tmp[!duplicated(tmp)]) # this is based on the fact that ASCAT ploidy is always highly variable, as it is a mean of all the segments
  if(any(duplicated(sample.names))){
    cat("Warning!! Duplicate names found! Renaming duplicates, adding a '.(number)' to each beyond no. 1\n")
    dup.samp <- unique(sample.names[duplicated(sample.names)])
    for(i in 1:length(dup.samp)){
      dup.samp.seg <- seg[seg[,1] %in% dup.samp[i],]    
      tmp<- unique(paste(dup.samp.seg[,1],dup.samp.seg[,9],dup.samp.seg[,11],sep='_'))
      for(k in 2:length(tmp)){
        dup.samp.seg[paste(dup.samp.seg[,1],dup.samp.seg[,9],dup.samp.seg[,11],sep='_') %in% tmp[k],1] <- paste(dup.samp.seg[paste(dup.samp.seg[,1],dup.samp.seg[,9],dup.samp.seg[,11],sep='_') %in% tmp[k],1],'.',k,sep='')
      }
      seg[seg[,1] %in% dup.samp[i],] <- dup.samp.seg
    }
  }
  # Now check for "silent" duplicates
  sample.names <- unique(seg[,1])
  for(i in sample.names){
    tmp <- seg[seg[,1] %in% i,]
    if(sum(tmp[,4] - tmp[,3]) > max.size){
      a <- which(tmp[,2] %in% 1)
      b <- which(tmp[,2] %in% 22)
      a <- a[!(a == (1:b[1])[1:length(a)])][1]
      tmp[a:nrow(tmp),1] <- paste('Duplicate', tmp[1,1], sep='-')
      seg[seg[,1] %in% i,] <- tmp
    }
  }
  if(remove.dup){
    seg <- seg[!grepl('Duplicate', seg[,1]),]
  }
  return(seg)
 }



 #########################################################################
 # AI-specific function to shrink a DNAcopy style matrix.
 # Works on a chromosome level.
 # Used to condense rows with identical CN values, which may have been
 # separated due to other values not currently of interest (e.g. filtered
 # out for number of probes)
 #########################################################################

 shrink.seg.ai <- function(chr.seg)  {  
  new.chr <- matrix(0,0,ncol(chr.seg))
  colnames(new.chr) <- colnames(chr.seg)
  new.chr <- chr.seg
  seg.class <- c(1)
  for(j in 2:nrow(new.chr)){
    ifelse(new.chr[(j-1),7] == new.chr[j,7] & new.chr[(j-1),8] == new.chr[j,8], seg.class <- c(seg.class, seg.class[j-1]), seg.class <- c(seg.class, seg.class[j-1]+1))
  }
  for(j in unique(seg.class)){
    new.chr[seg.class %in% j,4] <- max(new.chr[seg.class %in% j,4])
    new.chr[seg.class %in% j,5] <- sum(new.chr[seg.class %in% j,5])
  }
  new.chr<- new.chr[!duplicated(seg.class),]
  return(new.chr)
 }

 #########################################################################
 # Functions used to get/make the chrominfo table, which is needed to
 # calculate NtAI and LST scores.
 #########################################################################

 GetGzFromUrl <- function(url, ...) {
  # http://stackoverflow.com/questions/9548630/read-gzipped-csv-directly-from-a-url-in-r
  con <- gzcon(url(url))
  txt <- readLines(con)
  dat <- read.delim(textConnection(txt), ...)  
  return(dat)
 }

 GetChrominfo <- function() {
  # Get chromInfo table from UCSC
  chrom <- GetGzFromUrl("http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/chromInfo.txt.gz", header = FALSE)
  chrom <- subset(chrom, grepl("^chr[0-9XY]{1,2}$", chrom[,1]))
  # Get gap table from UCSC
  gaps <- GetGzFromUrl("http://hgdownload.cse.ucsc.edu/goldenPath/hg19/database/gap.txt.gz", header = FALSE)
  centro <- subset(gaps, gaps[,8] == "centromere")
  # Merge the relevant info from the two tables
  chrominfo <- merge(chrom[,1:2], centro[,2:4], by.x = 1, by.y = 1) # merge size and centromere location
  chrominfo$centromere <- rowMeans(chrominfo[,3:4]) # convert centromere start and end into one location (the mean)
  chrominfo <- chrominfo[,c(1,2,5,3,4)] # keep chromosome, size and centromere location
  colnames(chrominfo) <- c("chr", "size", "centromere", "centstart", "centend")
  chrominfo[,1] <- as.character(chrominfo[,1])
  chrominfo$chr <- sub("chr", "", chrominfo$chr)
  chrominfo$chr <- sub("X", "23", chrominfo$chr)
  chrominfo$chr <- sub("Y", "24", chrominfo$chr)
  chrominfo[,1] <- as.numeric(chrominfo[,1])
  chrominfo <- chrominfo[order(chrominfo$chr), ]  # order by chromosome number
  rownames(chrominfo) <- as.character(chrominfo[,1])
  chrominfo <- as.matrix(chrominfo)
  return(chrominfo)  
 }


 #################################################
 #################################################
 ########     Functions used to count     ########
 ########        total probe number       ########
 ########         within a segment        ########
 #################################################
 #################################################

 count_total_numbers <- function(segment_data,##This part from ascat.output$segments (ASCAT output object)
                                SNP_pos_info ##Tihs part from ascat.bc$SNPpos (ASCAT object)
 ){
  ###produce segment start and end positions
  ###different samples might have different start and end positions
  seg_start <- paste(segment_data[,1],segment_data[,2],segment_data[,3],sep = "_")
  seg_end <- paste(segment_data[,1],segment_data[,2],segment_data[,4],sep = "_")
  
  ##make matrix containing 
  ##each columns a sample with all SNP sites and position
  samples <- unique(segment_data[,1])
  all_SNP_pos <- paste(SNP_pos_info[,1],SNP_pos_info[,2],sep = "_")
  sample_SNP_pos <- sapply(samples,function(x){
    paste(x,all_SNP_pos,sep = "_")
  })
  rownames(sample_SNP_pos) <- rownames(SNP_pos_info)
  
  ##start and end positions for each sample
  ##True means that this SNP sites was start or end
  start_SNP_pos <- apply(sample_SNP_pos,2,function(x,y,z){
    setNames(x%in%y,z)
  },y=seg_start,z=rownames(SNP_pos_info))
  end_SNP_pos <- apply(sample_SNP_pos,2,function(x,y,z){
    setNames(x%in%y,z)
  },y=seg_end,z=rownames(SNP_pos_info))
  
  ##count probe numbers within a segment
  total_probe_number <- apply(end_SNP_pos,2,function(x){
    end_pos_in_a_sample <- which(x)
    c(end_pos_in_a_sample[1],diff(end_pos_in_a_sample,lag = 1))
  })
  unlist(total_probe_number)
 }



--- a/calculate_HRD.R
+++ b/calculate_HRD.R
@@ -0,0 +1,97 @@
 args <- commandArgs(TRUE)

 if(length(args)<1){
  args <- c("--help")
 }

 ##help section
 if("--help"%in%args){
  cat("Arguments:
      --out_dir=out_directory             The output directory
      --Tumor_LogR_file=tumor_LogR        The output tumor LogR file from ascatNgs
      --Tumor_BAF_file=tumor_BAF          The output tumor BAF file from ascatNgs
      --Germline_LogR_file=Normal_LogR    The output normal LogR file from ascatNgs
      --Germline_BAF_file=Normal_BAF      The output normal BAF file from ascatNgs
      --HRD_file_prefix=prefix            The name prefix of HRD score results

      These parameters must be set in orders!
      ")
  q(save = "no")
 }

 message(paste0("\n\nYou have the Rights to copy, edit, send and distribute this files,",
               " but not as Commercial activities.\nEnjoy!\n"))

 args[[1]] <- gsub("--out_dir=","",args[[1]])
 args[[2]] <- gsub("--Tumor_LogR_file=","",args[[2]])
 args[[3]] <- gsub("--Tumor_BAF_file=","",args[[3]])
 args[[4]] <- gsub("--Germline_LogR_file=","",args[[4]])
 args[[5]] <- gsub("--Germline_BAF_file=","",args[[5]])
 args[[6]] <- gsub("--HRD_file_prefix=","",args[[6]])





 #################################################
 #################################################
 ########     Deal with ASCAT results     ########
 ########        Output of ascatNgs       ########
 #################################################
 #################################################
 library(ASCAT)
 source("f:/choppy_app/HRD_score/HRD_functions.R")

 #setwd(args[[1]])
 rep_1_ascat.bc <- ascat.loadData(Tumor_LogR_file = args[[2]],
                               Tumor_BAF_file = args[[3]],
                               Germline_LogR_file = args[[4]],
                               Germline_BAF_file = args[[5]],
                               gender = "XX",
                               sexchromosomes = c("X","Y"))
 #dir.create("../res/")
 setwd(args[[1]])
 ##no GC content correction here
 #rep_1_ascat.bc <- ascat.GCcorrect(rep_1_ascat.bc,GCcontentfile = "../SnpGcCorrections.tsv")
 ascat.plotRawData(rep_1_ascat.bc)
 rep_1_ascat.bc = ascat.aspcf(rep_1_ascat.bc)
 ascat.plotSegmentedData(rep_1_ascat.bc)
 rep_1_ascat.output = ascat.runAscat(rep_1_ascat.bc)


 #################################################
 #################################################
 ########    Calculate Three HRD score    ########
 #################################################
 #################################################

 rep_1_x <- cbind(rep_1_ascat.output$segments[,1:4],
                 number_of_propes=count_total_numbers(segment_data = rep_1_ascat.output$segments,
                                                      SNP_pos_info = rep_1_ascat.bc$SNPpos)[,1],
                 total_copy_number=apply(rep_1_ascat.output$segments[,5:6],1,sum),
                 rep_1_ascat.output$segments[,5:6],
                 ploidy=rep_1_ascat.output$ploidy,contamination=rep_1_ascat.output$aberrantcellfraction,
                 stringsAsFactors = FALSE)


 ##load chromsome information
 load("f:/choppy_app/HRD_score/chrominfo.RData")			 
 				
 ##modify names of chromosomes to match the chromosomes in the chrominfo object
 rep_1_x[rep_1_x[,2]=="X",2] <- 23
 rep_1_x[rep_1_x[,2]=="Y",2] <- 24

 rep_1_ntai <- calc.ai(rep_1_x, chrominfo,check.names = FALSE,return.loc = FALSE)
 rep_1_lst <- calc.lst(rep_1_x,chrominfo,check.names = FALSE,return.loc = FALSE)
 rep_1_hrd <- calc.hrd(rep_1_x,check.names = FALSE,return.loc = FALSE)

 HRD_score <- rep_1_ntai[1,1]+rep_1_lst+rep_1_hrd
 names(HRD_score) <- "HRD_score"

 write.table(HRD_score,file = paste0(args[[1]],"/",args[[6]],".txt"),col.names = FALSE,quote = FALSE)

 q(save="no")




--- a/chrominfo.RData
+++ b/chrominfo.RData
--- a/tasks/hrd_score.wdl
+++ b/tasks/hrd_score.wdl
@@ -0,0 +1,24 @@
 task HRD_score {
 	String input_dir
 	String Tumor_LogR_file
 	String Tumor_BAF_file
 	String Germline_LogR_file
 	String Germline_BAF_file
 	String output_file_prefix
 	String docker
 	String cluster_config
 	String disk_size
 	
 	command {
 		Rscript /opt/hrd_score/calculate_HRD.R ${input_dir} ${Tumor_LogR_file} ${Tumor_BAF_file} ${Germline_LogR_file} ${Germline_BAF_file} ${output_file_prefix}
 	}
 	runtime {
 		docker:docker
    	cluster: cluster_config
    	systemDisk: "cloud_ssd 40"
    	dataDisk: "cloud_ssd " + disk_size + " /cromwell_root/"
 	}
 	output {
 		File hrd_score = "${output_file_prefix}.txt"
 	}
 }
--- a/workflow.wdl
+++ b/workflow.wdl
@@ -0,0 +1,16 @@
 import "./tasks/hrd_score.wdl" as HRD_score_so_Amazing

 workflow {{project_name}} {
 	call HRD_score_so_Amazing.hrd_score.wdl as HRD_score {
 		input:
 		input_dir=input_dir,
 		Tumor_LogR_file=Tumor_LogR_file,
 		Tumor_BAF_file=Tumor_BAF_file,
 		Germline_LogR_file=Germline_LogR_file,
 		Germline_BAF_file=Germline_BAF_file,
 		output_file_prefix=output_file_prefix,
 		docker=docker,
 		cluster_config=cluster_config,
 		disk_size=disk_size
 	}
 }