Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ReadDragen to support Illumina Dragen output #6070

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
182 changes: 182 additions & 0 deletions R/preprocessing.R
Original file line number Diff line number Diff line change
Expand Up @@ -3479,3 +3479,185 @@ RegressOutMatrix <- function(
dimnames(x = data.resid) <- dimnames(x = data.expr)
return(data.resid)
}


#' Load in data from DRAGEN
#'
#' Enables easy loading of sparse data matrices provided by DRAGEN genomics.
#'
#' @param data.dir Directory containing the *.scRNA.matrix.mtx, *.scRNA.genes.tsv (or features.tsv), *.scRNA.barcodes.tsv, and *.scRNA.barcodeSummary.tsv
#' files provided by DRAGEN. A vector or named vector can be given in order to load
#' several data directories. If a named vector is given, the cell barcode names
#' will be prefixed with the name.
#' @param gene.column Specify which column of genes.tsv or features.tsv to use for gene names; default is 2
#' @param cell.column Specify which column of barcodes.tsv to use for cell names; default is 1
#' @param unique.features Make feature names unique (default TRUE)
#' @param strip.suffix Remove trailing "-1" if present in all cell barcodes.
#'
#' @return If features.csv indicates the data has multiple data types, a list
#' containing a sparse matrix of the data from each type will be returned.
#' Otherwise a sparse matrix containing the expression data will be returned.
#'
#'
#' @export
#' @concept preprocessing
#'
#' @examples
#' \dontrun{
#' # For output from CellRanger < 3.0
#' data_dir <- 'path/to/data/directory'
#' list.files(data_dir) # Should show *.scRNA.matrix.mtx, *.scRNA.genes.tsv, *.scRNA.barcodes.tsv, and *.scRNA.barcodeSummary.tsv
#' expression_matrix <- ReadDRAGEN(data.dir = data_dir)
#' seurat_object = CreateSeuratObject(counts = expression_matrix)
#'
#' # For output from CellRanger >= 3.0 with multiple data types
#' data_dir <- 'path/to/data/directory'
#' list.files(data_dir) # Should show barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz
#' data <- ReadDRAGEN(data.dir = data_dir)
#' seurat_object = CreateSeuratObject(counts = data$`Gene Expression`)
#' seurat_object[['Protein']] = CreateAssayObject(counts = data$`Antibody Capture`)
#' }
#'
ReadDRAGEN <- function(
data.dir,
gene.column = 2,
cell.column = 1,
unique.features = TRUE,
strip.suffix = FALSE
) {
full.data <- list()
for (i in seq_along(along.with = data.dir)) {
run <- data.dir[i]
if (!dir.exists(paths = run)) {
stop("Directory provided does not exist")
}
barcode.loc <- file.path(run, list.files(path=run, pattern='*barcodes\\.tsv', recursive=FALSE)[1])
gene.loc <- file.path(run, list.files(path=run, pattern='*genes\\.tsv', recursive=FALSE)[1])
features.loc <- file.path(run, list.files(path=run, pattern='*features\\.tsv', recursive=FALSE)[1])
matrix.loc <- file.path(run, list.files(path=run, pattern='*matrix\\.mtx', recursive=FALSE)[1])
barcodeSummary.loc <- file.path(run, list.files(path=run, pattern='*barcodeSummary\\.tsv', recursive=FALSE)[1])

# Flag to indicate if this data is from CellRanger >= 3.0
pre_ver_3 <- file.exists(gene.loc)
if (!file.exists(barcode.loc)) {
stop("Barcode file missing. Expecting ", basename(path = barcode.loc))
}
if (!pre_ver_3 && !file.exists(features.loc) ) {
stop("Gene name or features file missing. Expecting ", basename(path = features.loc))
}
if (!file.exists(matrix.loc)) {
stop("Expression matrix file missing. Expecting ", basename(path = matrix.loc))
}
if (!file.exists(barcodeSummary.loc)) {
warning(paste0("Barcode summary file missing, DRAGEN barcode filtering will be skipped!. Expecting ", basename(path = barcodeSummary.loc)))
}
data <- Matrix::readMM(file = matrix.loc)
cell.barcodes <- read.table(file = barcode.loc, header = FALSE, sep = '\t', row.names = NULL)
if (ncol(x = cell.barcodes) > 1) {
cell.names <- cell.barcodes[, cell.column]
} else {
cell.names <- readLines(con = barcode.loc)
}
if (all(grepl(pattern = "\\-1$", x = cell.names)) & strip.suffix) {
cell.names <- as.vector(x = as.character(x = sapply(
X = cell.names,
FUN = ExtractField,
field = 1,
delim = "-"
)))
}
if (is.null(x = names(x = data.dir))) {
if (length(x = data.dir) < 2) {
##TRUE
colnames(x = data) <- cell.names
} else {
colnames(x = data) <- paste0(i, "_", cell.names)
}
} else {
colnames(x = data) <- paste0(names(x = data.dir)[i], "_", cell.names)
}
feature.names <- utils::read.delim(
file = ifelse(test = pre_ver_3, yes = gene.loc, no = features.loc),
header = FALSE,
stringsAsFactors = FALSE
)
if (any(is.na(x = feature.names[, gene.column]))) {
warning(
'Some features names are NA. Replacing NA names with ID from the opposite column requested',
call. = FALSE,
immediate. = TRUE
)
na.features <- which(x = is.na(x = feature.names[, gene.column]))
replacement.column <- ifelse(test = gene.column == 2, yes = 1, no = 2)
feature.names[na.features, gene.column] <- feature.names[na.features, replacement.column]
}
if (unique.features) {
fcols = ncol(x = feature.names)
if (fcols < gene.column) {
stop(paste0("gene.column was set to ", gene.column,
" but feature.tsv.gz (or genes.tsv) only has ", fcols, " columns.",
" Try setting the gene.column argument to a value <= to ", fcols, "."))
}
rownames(x = data) <- make.unique(names = feature.names[, gene.column])
}

# DRAGEN filter using barcodeSummary file
if(file.exists(barcodeSummary.loc)){
print(barcodeSummary.loc)
barcodeSummary <- utils::read.delim(
file=barcodeSummary.loc,
header = TRUE,
stringsAsFactors = FALSE
)
tryCatch({
data <- data[, barcodeSummary[barcodeSummary$Filter=="PASS","Barcode"]]
print(dim(data))
},
interrupt = function(x){warning("Warning, could not filter DRAGEN dataset!")},
error = function(x){warning("Warning, could not filter DRAGEN dataset!")},
warning = function(x){warning("Warning, could not filter DRAGEN dataset!")}
)
}


# In cell ranger 3.0, a third column specifying the type of data was added
# and we will return each type of data as a separate matrix
if (ncol(x = feature.names) > 2) {
data_types <- factor(x = feature.names$V3)
lvls <- levels(x = data_types)
if (length(x = lvls) > 1 && length(x = full.data) == 0) {
message("10X data contains more than one type and is being returned as a list containing matrices of each type.")
}
expr_name <- "Gene Expression"
if (expr_name %in% lvls) { # Return Gene Expression first
lvls <- c(expr_name, lvls[-which(x = lvls == expr_name)])
}
data <- lapply(
X = lvls,
FUN = function(l) {
return(data[data_types == l, , drop = FALSE])
}
)
names(x = data) <- lvls
} else{
data <- list(data)
}
full.data[[length(x = full.data) + 1]] <- data
}
# Combine all the data from different directories into one big matrix, note this
# assumes that all data directories essentially have the same features files
list_of_data <- list()
for (j in 1:length(x = full.data[[1]])) {
list_of_data[[j]] <- do.call(cbind, lapply(X = full.data, FUN = `[[`, j))
# Fix for Issue #913
list_of_data[[j]] <- as(object = list_of_data[[j]], Class = "dgCMatrix")
}
names(x = list_of_data) <- names(x = full.data[[1]])
# If multiple features, will return a list, otherwise
# a matrix.
if (length(x = list_of_data) == 1) {
return(list_of_data[[1]])
} else {
return(list_of_data)
}
}