satijalab · parvsachdeva · Jun 13, 2022
diff --git a/R/preprocessing.R b/R/preprocessing.R
@@ -3479,3 +3479,185 @@ RegressOutMatrix <- function(
   dimnames(x = data.resid) <- dimnames(x = data.expr)
   return(data.resid)
 }
+
+
+#' Load in data from DRAGEN
+#'
+#' Enables easy loading of sparse data matrices provided by DRAGEN genomics.
+#'
+#' @param data.dir Directory containing the *.scRNA.matrix.mtx, *.scRNA.genes.tsv (or features.tsv), *.scRNA.barcodes.tsv, and *.scRNA.barcodeSummary.tsv
+#' files provided by DRAGEN. A vector or named vector can be given in order to load
+#' several data directories. If a named vector is given, the cell barcode names
+#' will be prefixed with the name.
+#' @param gene.column Specify which column of genes.tsv or features.tsv to use for gene names; default is 2
+#' @param cell.column Specify which column of barcodes.tsv to use for cell names; default is 1
+#' @param unique.features Make feature names unique (default TRUE)
+#' @param strip.suffix Remove trailing "-1" if present in all cell barcodes.
+#'
+#' @return If features.csv indicates the data has multiple data types, a list
+#'   containing a sparse matrix of the data from each type will be returned.
+#'   Otherwise a sparse matrix containing the expression data will be returned.
+#'
+#'
+#' @export
+#' @concept preprocessing
+#'
+#' @examples
+#' \dontrun{
+#' # For output from CellRanger < 3.0
+#' data_dir <- 'path/to/data/directory'
+#' list.files(data_dir) # Should show *.scRNA.matrix.mtx, *.scRNA.genes.tsv, *.scRNA.barcodes.tsv, and *.scRNA.barcodeSummary.tsv
+#' expression_matrix <- ReadDRAGEN(data.dir = data_dir)
+#' seurat_object = CreateSeuratObject(counts = expression_matrix)
+#'
+#' # For output from CellRanger >= 3.0 with multiple data types
+#' data_dir <- 'path/to/data/directory'
+#' list.files(data_dir) # Should show barcodes.tsv.gz, features.tsv.gz, and matrix.mtx.gz
+#' data <- ReadDRAGEN(data.dir = data_dir)
+#' seurat_object = CreateSeuratObject(counts = data$`Gene Expression`)
+#' seurat_object[['Protein']] = CreateAssayObject(counts = data$`Antibody Capture`)
+#' }
+#'
+ReadDRAGEN <- function(
+  data.dir,
+  gene.column = 2,
+  cell.column = 1,
+  unique.features = TRUE,
+  strip.suffix = FALSE
+) {
+  full.data <- list()
+  for (i in seq_along(along.with = data.dir)) {
+    run <- data.dir[i]
+    if (!dir.exists(paths = run)) {
+      stop("Directory provided does not exist")
+    }
+    barcode.loc <- file.path(run, list.files(path=run, pattern='*barcodes\\.tsv', recursive=FALSE)[1])
+    gene.loc <- file.path(run, list.files(path=run, pattern='*genes\\.tsv', recursive=FALSE)[1])
+    features.loc <- file.path(run, list.files(path=run, pattern='*features\\.tsv', recursive=FALSE)[1])
+    matrix.loc <- file.path(run, list.files(path=run, pattern='*matrix\\.mtx', recursive=FALSE)[1])
+    barcodeSummary.loc <- file.path(run, list.files(path=run, pattern='*barcodeSummary\\.tsv', recursive=FALSE)[1])
+
+    # Flag to indicate if this data is from CellRanger >= 3.0
+    pre_ver_3 <- file.exists(gene.loc)
+    if (!file.exists(barcode.loc)) {
+      stop("Barcode file missing. Expecting ", basename(path = barcode.loc))
+    }
+    if (!pre_ver_3 && !file.exists(features.loc) ) {
+      stop("Gene name or features file missing. Expecting ", basename(path = features.loc))
+    }
+    if (!file.exists(matrix.loc)) {
+      stop("Expression matrix file missing. Expecting ", basename(path = matrix.loc))
+    }
+    if (!file.exists(barcodeSummary.loc)) {
+      warning(paste0("Barcode summary file missing, DRAGEN barcode filtering will be skipped!. Expecting ", basename(path = barcodeSummary.loc)))
+    }    
+    data <- Matrix::readMM(file = matrix.loc)
+    cell.barcodes <- read.table(file = barcode.loc, header = FALSE, sep = '\t', row.names = NULL)
+    if (ncol(x = cell.barcodes) > 1) {
+      cell.names <- cell.barcodes[, cell.column]
+    } else {
+      cell.names <- readLines(con = barcode.loc)
+    }
+    if (all(grepl(pattern = "\\-1$", x = cell.names)) & strip.suffix) {
+      cell.names <- as.vector(x = as.character(x = sapply(
+        X = cell.names,
+        FUN = ExtractField,
+        field = 1,
+        delim = "-"
+      )))
+    }
+    if (is.null(x = names(x = data.dir))) {
+      if (length(x = data.dir) < 2) {
+        ##TRUE
+        colnames(x = data) <- cell.names
+      } else {
+        colnames(x = data) <- paste0(i, "_", cell.names)
+      }
+    } else {
+      colnames(x = data) <- paste0(names(x = data.dir)[i], "_", cell.names)
+    }
+    feature.names <- utils::read.delim(
+      file = ifelse(test = pre_ver_3, yes = gene.loc, no = features.loc),
+      header = FALSE,
+      stringsAsFactors = FALSE
+    )
+    if (any(is.na(x = feature.names[, gene.column]))) {
+      warning(
+        'Some features names are NA. Replacing NA names with ID from the opposite column requested',
+        call. = FALSE,
+        immediate. = TRUE
+      )
+      na.features <- which(x = is.na(x = feature.names[, gene.column]))
+      replacement.column <- ifelse(test = gene.column == 2, yes = 1, no = 2)
+      feature.names[na.features, gene.column] <- feature.names[na.features, replacement.column]
+    }
+    if (unique.features) {
+      fcols = ncol(x = feature.names)
+      if (fcols < gene.column) {
+        stop(paste0("gene.column was set to ", gene.column,
+                    " but feature.tsv.gz (or genes.tsv) only has ", fcols, " columns.",
+                    " Try setting the gene.column argument to a value <= to ", fcols, "."))
+      }
+      rownames(x = data) <- make.unique(names = feature.names[, gene.column])
+    }
+
+    # DRAGEN filter using barcodeSummary file
+    if(file.exists(barcodeSummary.loc)){
+        print(barcodeSummary.loc)
+        barcodeSummary <- utils::read.delim(
+        file=barcodeSummary.loc,
+        header = TRUE,
+        stringsAsFactors = FALSE
+        )
+        tryCatch({
+            data <- data[, barcodeSummary[barcodeSummary$Filter=="PASS","Barcode"]]
+            print(dim(data))
+            }, 
+        interrupt = function(x){warning("Warning, could not filter DRAGEN dataset!")},
+        error = function(x){warning("Warning, could not filter DRAGEN dataset!")},
+        warning = function(x){warning("Warning, could not filter DRAGEN dataset!")}
+        )
+    }
+
+
+    # In cell ranger 3.0, a third column specifying the type of data was added
+    # and we will return each type of data as a separate matrix
+    if (ncol(x = feature.names) > 2) {
+      data_types <- factor(x = feature.names$V3)
+      lvls <- levels(x = data_types)
+      if (length(x = lvls) > 1 && length(x = full.data) == 0) {
+        message("10X data contains more than one type and is being returned as a list containing matrices of each type.")
+      }
+      expr_name <- "Gene Expression"
+      if (expr_name %in% lvls) { # Return Gene Expression first
+        lvls <- c(expr_name, lvls[-which(x = lvls == expr_name)])
+      }
+      data <- lapply(
+        X = lvls,
+        FUN = function(l) {
+          return(data[data_types == l, , drop = FALSE])
+        }
+      )
+      names(x = data) <- lvls
+    } else{
+      data <- list(data)
+    }
+    full.data[[length(x = full.data) + 1]] <- data
+  }
+  # Combine all the data from different directories into one big matrix, note this
+  # assumes that all data directories essentially have the same features files
+  list_of_data <- list()
+  for (j in 1:length(x = full.data[[1]])) {
+    list_of_data[[j]] <- do.call(cbind, lapply(X = full.data, FUN = `[[`, j))
+    # Fix for Issue #913
+    list_of_data[[j]] <- as(object = list_of_data[[j]], Class = "dgCMatrix")
+  }
+  names(x = list_of_data) <- names(x = full.data[[1]])
+  # If multiple features, will return a list, otherwise
+  # a matrix.
+  if (length(x = list_of_data) == 1) {
+    return(list_of_data[[1]])
+  } else {
+    return(list_of_data)
+  }
+}