##########################################################################################################################
# R code for normalization of RNAseq data with regards to sequencing depth and transcript length.                        #
# The code is supplied without any warranty or guaranteed support.                                                       #
##########################################################################################################################

RNAseq.normalization = function(counts.raw, transcript.lengths = NULL, 
                                quantile.M = 0.3, quantile.A = 0.05, 
                                do.weighting = TRUE) {
  
  ## If the transcript.lengths vector is named, reorder in the same order 
  ## as the genes in the count.matrix. Otherwise, assume that they are 
  ## in the same order.
  if (!is.null(transcript.lengths) & !is.null(names(transcript.lengths)) 
      & !is.null(rownames(counts.raw))) {
    transcript.lengths <- transcript.lengths[match(rownames(counts.raw), 
                                                   names(transcript.lengths))]
  } 
  
  ## Calculate normalization factors using the TMM approach
  total.count <- colSums(counts.raw)
  tmp <- sweep(counts.raw, 2, total.count, "/")
  M <- log2(sweep(tmp, 1, tmp[, 1], "/"))
  A <- 1/2*log2(sweep(tmp, 1, tmp[, 1], "*"))
  nf <- c(1)
  for (i in 2:ncol(counts.raw)) {
    isfin <- is.finite(M[, i]) & is.finite(A[, i]) & A[, i] > -1e10
    Mcurr <- M[isfin, i]
    Acurr <- A[isfin, i]
    low.limit.M <- floor(length(which(isfin))*quantile.M) + 1
    high.limit.M <- length(which(isfin)) + 1 - low.limit.M
    keep.genes.M <- order(Mcurr)[low.limit.M:high.limit.M]
    low.limit.A <- floor(length(which(isfin))*quantile.A) + 1
    high.limit.A <- length(which(isfin)) + 1 - low.limit.A
    keep.genes.A <- order(Acurr)[low.limit.A:high.limit.A]
    keep.genes <- intersect(keep.genes.M, keep.genes.A)
    Mtemp <- Mcurr[keep.genes]
    if (do.weighting == TRUE) {
      w <- (total.count[i] - counts.raw[, i])/(total.count[i]*counts.raw[, i]) +
        (total.count[1] - counts.raw[, 1])/(total.count[1]*counts.raw[, 1])
      w <- w[isfin][keep.genes]
      nf <- c(nf, 2^(sum(1/w*Mtemp, na.rm = TRUE)/sum(1/w, na.rm = TRUE)))
    } else {
      nf <- c(nf, 2^mean(Mtemp, na.rm = TRUE))
    }
  }
  
  nf = nf * colSums(counts.raw)
  
  ## Add 0.5 to the counts
  counts.norm = counts.raw + 0.5
  ## Divide by normalization factors
  counts.norm = sweep(counts.norm, 2, nf, "/")
  if (!is.null(transcript.lengths)) {
    ## Divide by gene lengths in kb
    counts.norm = 1000 * sweep(counts.norm, 1, transcript.lengths, "/")
  }
  ## Multiply by 1e6 and log-transform 
  counts.norm = log2(counts.norm * 1e+06)
  return(counts.norm)
}