########################################################################################################################## # R code for normalization of RNAseq data with regards to sequencing depth and transcript length. # # The code is supplied without any warranty or guaranteed support. # ########################################################################################################################## RNAseq.normalization = function(counts.raw, transcript.lengths = NULL, quantile.M = 0.3, quantile.A = 0.05, do.weighting = TRUE) { ## If the transcript.lengths vector is named, reorder in the same order ## as the genes in the count.matrix. Otherwise, assume that they are ## in the same order. if (!is.null(transcript.lengths) & !is.null(names(transcript.lengths)) & !is.null(rownames(counts.raw))) { transcript.lengths <- transcript.lengths[match(rownames(counts.raw), names(transcript.lengths))] } ## Calculate normalization factors using the TMM approach total.count <- colSums(counts.raw) tmp <- sweep(counts.raw, 2, total.count, "/") M <- log2(sweep(tmp, 1, tmp[, 1], "/")) A <- 1/2*log2(sweep(tmp, 1, tmp[, 1], "*")) nf <- c(1) for (i in 2:ncol(counts.raw)) { isfin <- is.finite(M[, i]) & is.finite(A[, i]) & A[, i] > -1e10 Mcurr <- M[isfin, i] Acurr <- A[isfin, i] low.limit.M <- floor(length(which(isfin))*quantile.M) + 1 high.limit.M <- length(which(isfin)) + 1 - low.limit.M keep.genes.M <- order(Mcurr)[low.limit.M:high.limit.M] low.limit.A <- floor(length(which(isfin))*quantile.A) + 1 high.limit.A <- length(which(isfin)) + 1 - low.limit.A keep.genes.A <- order(Acurr)[low.limit.A:high.limit.A] keep.genes <- intersect(keep.genes.M, keep.genes.A) Mtemp <- Mcurr[keep.genes] if (do.weighting == TRUE) { w <- (total.count[i] - counts.raw[, i])/(total.count[i]*counts.raw[, i]) + (total.count[1] - counts.raw[, 1])/(total.count[1]*counts.raw[, 1]) w <- w[isfin][keep.genes] nf <- c(nf, 2^(sum(1/w*Mtemp, na.rm = TRUE)/sum(1/w, na.rm = TRUE))) } else { nf <- c(nf, 2^mean(Mtemp, na.rm = TRUE)) } } nf = nf * colSums(counts.raw) ## Add 0.5 to the counts counts.norm = counts.raw + 0.5 ## Divide by normalization factors counts.norm = sweep(counts.norm, 2, nf, "/") if (!is.null(transcript.lengths)) { ## Divide by gene lengths in kb counts.norm = 1000 * sweep(counts.norm, 1, transcript.lengths, "/") } ## Multiply by 1e6 and log-transform counts.norm = log2(counts.norm * 1e+06) return(counts.norm) }