Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added the functionality of scale.factor in NormalizeData being set to "median" of counts #9389

Open
wants to merge 9 commits into
base: develop
Choose a base branch
from
18 changes: 18 additions & 0 deletions R/preprocessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -3120,6 +3120,15 @@ RelativeCounts <- function(data, scale.factor = 1, verbose = TRUE) {
if (verbose) {
cat("Performing relative-counts-normalization\n", file = stderr())
}

#setting scale.factor to be the median of counts across all columns if scale.factor is the string "median"
if (is.character(scale.factor) && scale.factor == "median") {
if(verbose){
cat("Calculating median scale factor\n", file = stderr())
}
scale.factor <- median(Matrix::colSums(data))
}

norm.data <- data
norm.data@x <- norm.data@x / rep.int(Matrix::colSums(norm.data), diff(norm.data@p)) * scale.factor
return(norm.data)
Expand Down Expand Up @@ -4336,6 +4345,15 @@ LogNormalize.V3Matrix <- function(
if (verbose) {
cat("Performing log-normalization\n", file = stderr())
}

#setting scale.factor to be the median of counts across all columns if scale.factor is the string "median"
if (is.character(scale.factor) && scale.factor == "median") {
if(verbose){
cat("Calculating median scale factor\n", file = stderr())
}
scale.factor <- median(Matrix::colSums(data))
}

norm.data <- LogNorm(data, scale_factor = scale.factor, display_progress = verbose)
colnames(x = norm.data) <- colnames(x = data)
rownames(x = norm.data) <- rownames(x = data)
Expand Down
50 changes: 50 additions & 0 deletions R/preprocessing5.R
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,20 @@ LogNormalize.default <- function(
if (isTRUE(x = verbose)) {
pb <- txtProgressBar(file = stderr(), style = 3)
}

#setting scale.factor to be the median of counts across all columns if scale.factor is the string "median"
if (is.character(scale.factor) && scale.factor == "median") {
if(verbose){
cat("Calculating median scale factor\n", file = stderr())
}
sums <- if (margin == 1L) {
rowSums(data) # Sum of each row (gene) if margin is 1L
} else {
colSums(data) # Sum of each column (cell) if margin is 2L
}
scale.factor = median(sums)
}

for (i in seq_len(length.out = ncells)) {
x <- if (margin == 1L) {
data[i, ]
Expand Down Expand Up @@ -288,6 +302,15 @@ LogNormalize.IterableMatrix <- function(
verbose = TRUE,
...
) {

#setting scale.factor to be the median of counts across all columns if scale.factor is the string "median"
if (is.character(scale.factor) && scale.factor == "median") {
if(verbose){
cat("Calculating median scale factor\n", file = stderr())
}
scale.factor <- median(colSums(data))
}

data <- BPCells::t(BPCells::t(data) / colSums(data))
# Log normalization
data <- log1p(data * scale.factor)
Expand Down Expand Up @@ -860,6 +883,33 @@ DISP <- function(
p <- p + 1L
}
np <- length(x = p) - 1L

#adding a progress bar for median calculation if verbose is TRUE
if (is.character(scale.factor) && scale.factor == "median" && isTRUE(x = verbose)) {
cat("Calculating median scale factor\n", file = stderr())
pb_median <- txtProgressBar(style = 3L, file = stderr())
}

#setting scale.factor to be the median of counts across all columns if scale.factor is the string "median"
if (is.character(scale.factor) && scale.factor == "median") {
col_sums <- numeric(np)
for (i in seq_len(length.out = np)) {
idx <- seq.int(from = p[i], to = p[i + 1] - 1L)
xidx <- slot(object = data, name = entryname)[idx]
col_sums[i] <- sum(xidx)

if (isTRUE(x = verbose)) {
setTxtProgressBar(pb_median, value = i / np)
}
}

if (isTRUE(x = verbose)) {
close(pb_median)
}

scale.factor <- median(col_sums)
}

if (isTRUE(x = verbose)) {
pb <- txtProgressBar(style = 3L, file = stderr())
}
Expand Down
53 changes: 53 additions & 0 deletions tests/testthat/test_preprocessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,21 @@ test_that("Relative count normalization returns expected values", {
expect_equal(rc.counts[2, 1], 14285.71, tolerance = 1e-6)
})

# Dense copy of pbmc.test used by the LogNormalize.V3Matrix and RelativeCounts
# median scale-factor tests
denseMatrix <- as.matrix(pbmc.test)
test_that("LogNormalize.V3Matrix computes median scale factor correctly", {
  # Passing the literal "median" must give the same result as passing the
  # median of the per-column totals as a number.
  median_total <- median(colSums(denseMatrix))
  run_log_norm <- function(sf) {
    as.matrix(
      LogNormalize.V3Matrix(
        data = denseMatrix,
        scale.factor = sf,
        margin = 2L,
        verbose = FALSE
      )
    )
  }
  expect_equal(
    run_log_norm(median_total),
    run_log_norm("median"),
    tolerance = 1e-6
  )
})

test_that("RelativeCounts computes median scale factor correctly", {
  # Reference: relative counts scaled by the numerically supplied median of
  # the per-column totals of the shared dense matrix.
  median_total <- median(colSums(denseMatrix))
  reference <- as.matrix(
    RelativeCounts(
      data = denseMatrix,
      scale.factor = median_total,
      verbose = FALSE
    )
  )
  # Candidate: same call, but letting RelativeCounts resolve "median" itself.
  candidate <- as.matrix(
    RelativeCounts(
      data = denseMatrix,
      scale.factor = "median",
      verbose = FALSE
    )
  )
  expect_equal(reference, candidate, tolerance = 1e-6)
})

# Tests for v5 NormalizeData
# --------------------------------------------------------------------------------
context("v5 NormalizeData")
Expand Down Expand Up @@ -175,6 +190,44 @@ test_that("LogNormalize normalizes properly for BPCells", {
)
})

test_that("LogNormalize.IterableMatrix computes median scale factor correctly", {
  skip_on_cran()
  library(Matrix)
  skip_if_not_installed("BPCells")
  library(BPCells)
  # Convert the RNA counts to an IterableMatrix (transpose, coerce, transpose
  # back), then compare "median" against the explicitly computed median total.
  counts_iter <- t(as(t(object[['RNA']]$counts), "IterableMatrix"))
  median_total <- median(colSums(counts_iter))
  normalize_iter <- function(sf) {
    as.matrix(
      LogNormalize.IterableMatrix(
        data = counts_iter,
        scale.factor = sf,
        margin = 2L,
        verbose = FALSE
      )
    )
  }
  expect_equal(
    normalize_iter(median_total),
    normalize_iter("median"),
    tolerance = 1e-6
  )
})

# Dense matrix for checking LogNormalize.default with scale.factor = "median"
denseMatrix <- as.matrix(pbmc.test)
test_that("LogNormalize.default computes median scale factor correctly for both margin values", {
  # Expected scale factors, indexed by margin: row totals for margin 1L,
  # column totals for margin 2L.
  median_by_margin <- list(
    median(rowSums(denseMatrix)),
    median(colSums(denseMatrix))
  )
  normalize_dense <- function(sf, margin) {
    as.matrix(
      LogNormalize.default(
        data = denseMatrix,
        scale.factor = sf,
        margin = margin,
        verbose = FALSE
      )
    )
  }
  # For each margin, the numeric median and the "median" keyword must agree.
  for (m in c(1L, 2L)) {
    expect_equal(
      normalize_dense(median_by_margin[[m]], m),
      normalize_dense("median", m),
      tolerance = 1e-6
    )
  }
})

# Sparse version of denseMatrix, used to check that .SparseNormalize resolves
# scale.factor = "median" correctly
theSparseMatrix <- as.sparse(denseMatrix)
# The description previously duplicated the LogNormalize.default test's name,
# although this test exercises .SparseNormalize and has no margin argument.
test_that(".SparseNormalize computes median scale factor correctly", {
  # Reference scale factor: median of the per-column totals.
  expectedMedian <- median(colSums(theSparseMatrix))

  resultFromExpectedMedian <- .SparseNormalize(
    data = theSparseMatrix,
    scale.factor = expectedMedian,
    verbose = FALSE
  )
  resultsFromScaleFactorSetToMedian <- .SparseNormalize(
    data = theSparseMatrix,
    scale.factor = "median",
    verbose = FALSE
  )

  # The keyword and the explicit numeric median must normalize identically.
  expect_equal(
    resultFromExpectedMedian,
    resultsFromScaleFactorSetToMedian,
    tolerance = 1e-6
  )
})


# Tests for ScaleData
# --------------------------------------------------------------------------------
context("ScaleData")
Expand Down