major update aka skipping excel

SchmidtPaul · Apr 13, 2021 · f6fcb20 · f6fcb20
1 parent 6a8ad47
commit f6fcb20
Show file tree

Hide file tree

Showing 69 changed files with 2,349 additions and 468 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: CitaviR
 Type: Package
 Title: A set of tools for dealing with Citavi data
-Version: 0.3.0
+Version: 0.4.0
 Authors@R: 
     c(person(given = "Paul",
              family = "Schmidt",
@@ -33,7 +33,9 @@ Imports:
     purrr,
     stringdist,
     RcppAlgos,
-    textcat
+    textcat,
+    DBI,
+    RSQLite
 Suggests: 
     testthat (>= 3.0.0),
     knitr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -3,15 +3,25 @@
 export("%>%")
 export("%not_in%")
 export(detect_language)
-export(example_xlsx)
+export(example_file)
 export(find_obvious_dups)
 export(find_potential_dups)
 export(handle_obvious_dups)
+export(read_Citavi_ctv6)
 export(read_Citavi_xlsx)
+export(update_Citavi_ctv6)
 export(write_Citavi_xlsx)
 import(crayon)
 import(dplyr)
+importFrom(DBI,dbClearResult)
+importFrom(DBI,dbConnect)
+importFrom(DBI,dbDisconnect)
+importFrom(DBI,dbListTables)
+importFrom(DBI,dbReadTable)
+importFrom(DBI,dbSendQuery)
+importFrom(RSQLite,SQLite)
 importFrom(RcppAlgos,comboGeneral)
+importFrom(dplyr,as_tibble)
 importFrom(janitor,make_clean_names)
 importFrom(magrittr,"%>%")
 importFrom(openxlsx,write.xlsx)
@@ -29,6 +39,7 @@ importFrom(stringr,str_split)
 importFrom(textcat,textcat)
 importFrom(tidyr,fill)
 importFrom(tidyr,pivot_longer)
+importFrom(tidyr,replace_na)
 importFrom(tidyr,separate_rows)
 importFrom(tidyr,unite)
 importFrom(utils,tail)
diff --git a/R/data.R b/R/data.R
diff --git a/R/detect_language.R b/R/detect_language.R
@@ -12,16 +12,17 @@
 #'
 #' @details
 #' `r lifecycle::badge("experimental")` \cr
-#' The underyling core function determining the language is \code{textcat::textcat()}.
+#' The underlying core function determining the language is \code{textcat::textcat()}.
 #'
 #' @examples
+#' \dontrun{
 #' CitDat <- CitaviR::diabetesprevalence %>%
 #'   dplyr::slice(1952:1955, 4390:4393)
 #'
 #' CitDat %>%
 #'   detect_language() %>%
 #'   dplyr::select(Abstract, det_lang, det_lang_wanted)
-#'
+#' }
 #' @return A tibble containing at least one additional column: \code{det_lang}.
 #' @importFrom textcat textcat
 #' @importFrom tidyr unite

diff --git a/R/example_xlsx.R → R/example_file.R b/R/example_xlsx.R → R/example_file.R
@@ -1,4 +1,4 @@
-#' @title Get path to example xlsx exported from Citavi
+#' @title Get path to example file
 #'
 #' @description
 #' `r lifecycle::badge("stable")` \cr
@@ -8,12 +8,13 @@
 #' @param file Name of file. If `NULL`, all example files will be listed.
 #' @export
 #' @examples
-#' example_xlsx()
-#' example_xlsx("3dupsin5refs.xlsx")
+#' example_file()
+#' example_file("3dupsin5refs.xlsx")
+#' example_file("3dupsin5refs/3dupsin5refs.ctv6")
 
-example_xlsx <- function(file = NULL) {
+example_file <- function(file = NULL) {
   if (is.null(file)) {
-    dir(system.file("extdata", package = "CitaviR"))
+    dir(system.file("extdata", package = "CitaviR"), recursive = TRUE)
   } else {
     system.file("extdata", file, package = "CitaviR", mustWork = TRUE)
   }

diff --git a/R/find_obvious_dups.R b/R/find_obvious_dups.R
@@ -4,7 +4,8 @@
 #' The following columns \bold{must be present}: \code{ID}, \code{Title}, \code{Year}.
 #' @param dupInfoAfterID If TRUE (default), the newly created columns
 #' \code{clean_title}, \code{clean_title_id}, \code{has_obv_dup} and \code{obv_dup_id}
-#' are moved right next to the \code{ID} column.
+#' are moved right next to the \code{ID} column. Additionally, the \code{ID} column is
+#' moved to the first position.
 #' @param preferDupsWithPDF If TRUE (default), obvious duplicates are sorted by their info
 #' in columns \code{has_attachment} and/or \code{Locations} (given they are present in the dataset).
 #' After sorting, duplicates with the most occurrences of \code{".pdf"} in \code{Locations} and a
@@ -16,8 +17,8 @@
 #' was set to "English" so that column names are "Short Title" etc.
 #'
 #' @examples
-#' path <- example_xlsx("3dupsin5refs.xlsx")
-#' read_Citavi_xlsx(path) %>%
+#' example_path <- example_file("3dupsin5refs/3dupsin5refs.ctv6")
+#' read_Citavi_ctv6(example_path) %>%
 #'    find_obvious_dups() %>%
 #'    dplyr::select(clean_title:obv_dup_id)
 #'
@@ -32,9 +33,17 @@
 
 find_obvious_dups <- function(CitDat, dupInfoAfterID = TRUE, preferDupsWithPDF = TRUE) {
 
-  required_cols <- c("ID", "Title", "Year")
+  if ("StaticIDs" %in% names(CitDat)) {
+    CitDat <- CitDat %>% rename("RefID" = "StaticIDs")
+    OriginalRefIDLabel <- "StaticIDs"
+  } else {
+    CitDat <- CitDat %>% rename("RefID" = "ID")
+    OriginalRefIDLabel <- "ID"
+  }
+
+  required_cols <- c("RefID", "Title", "Year")
 
-  # ID, Title & Year present? -----------------------------------------------
+  # RefID, Title & Year present? --------------------------------------------
   for (col_name_i in required_cols) {
     if (col_name_i %not_in% names(CitDat)) {
       stop(paste(col_name_i, "column is missing!"))
@@ -96,16 +105,21 @@ find_obvious_dups <- function(CitDat, dupInfoAfterID = TRUE, preferDupsWithPDF =
   # dupInfoAfterID ----------------------------------------------------------
   if (dupInfoAfterID) {
     CitDat <- CitDat %>%
+      relocate("RefID", dplyr::everything()) %>%
       relocate(c(
         "clean_title",
         "clean_title_id",
         "has_obv_dup",
         "obv_dup_id"
       ),
-      .after = "ID")
+      .after = "RefID")
   }
 
 
+  # Original RefID label ----------------------------------------------------
+  names(CitDat)[names(CitDat) == "RefID"] <- OriginalRefIDLabel
+
+
   # return tibble -----------------------------------------------------------
   CitDat
 

diff --git a/R/find_potential_dups.R b/R/find_potential_dups.R
@@ -15,8 +15,8 @@
 #' was set to "English" so that column names are "Short Title" etc.
 #'
 #' @examples
-#' path <- example_xlsx("3dupsin5refs.xlsx")
-#' CitDat <- read_Citavi_xlsx(path) %>%
+#' example_path <- example_file("3dupsin5refs/3dupsin5refs.ctv6")
+#' CitDat <- read_Citavi_ctv6(example_path) %>%
 #'    find_obvious_dups() %>%
 #'    find_potential_dups()
 #'

diff --git a/R/handle_obvious_dups.R b/R/handle_obvious_dups.R
@@ -19,18 +19,18 @@
 #'
 #'
 #' @examples
-#' path <- example_xlsx("3dupsin5refs.xlsx")
-#' CitDat <- read_Citavi_xlsx(path) %>%
+#' example_path <- example_file("3dupsin5refs/3dupsin5refs.ctv6")
+#' CitDat <- read_Citavi_ctv6(example_path) %>%
 #'    find_obvious_dups()
 #'
 #' # before
 #' CitDat %>%
-#'    dplyr::select("clean_title", "clean_title_id", "obv_dup_id", "DOI name", "PubMed ID")
+#'    dplyr::select("clean_title", "clean_title_id", "obv_dup_id", "DOI", "PubMedID")
 #'
 #' # after
 #' CitDat %>%
-#'    handle_obvious_dups(fieldsToHandle = c("DOI name", "PubMed ID")) %>%
-#'    dplyr::select("clean_title", "clean_title_id", "obv_dup_id", "DOI name", "PubMed ID")
+#'    handle_obvious_dups(fieldsToHandle = c("DOI", "PubMedID")) %>%
+#'    dplyr::select("clean_title", "clean_title_id", "obv_dup_id", "DOI", "PubMedID")
 #'
 #' @return A tibble where information from obvious duplicates was brought together for \code{dup_01}, respectively.
 #' @importFrom purrr map

diff --git a/R/read_Citavi_ctv6.R b/R/read_Citavi_ctv6.R
@@ -0,0 +1,60 @@
+#' @title Read table from Citavi database (via SQL)
+#'
+#' @param path Path to the local Citavi project file (.ctv6).
+#' @param CitDBTableName Name of the table to be read from the connected Citavi database (via \code{DBI::dbReadTable()}).
+#' Set to "Reference" by default. Shows all table names when set to NULL (via \code{DBI::dbListTables()}).
+#'
+#' @details
+#' `r lifecycle::badge("experimental")` \cr
+#' The underlying core functions are \code{DBI::dbConnect()}, \code{RSQLite::SQLite()}, \code{DBI::dbReadTable()} and \code{DBI::dbListTables()}.
+#'
+#' @examples
+#' # example Citavi project
+#' example_path <- example_file("3dupsin5refs/3dupsin5refs.ctv6")
+#'
+#' # import reference (=default) table
+#' CitDat <- read_Citavi_ctv6(example_path)
+#' CitDat %>% dplyr::select(Title, Year, Abstract, DOI)
+#'
+#' # show table names
+#' read_Citavi_ctv6(example_path, CitDBTableName = NULL)
+#'
+#' @return A tibble
+#' @importFrom RSQLite SQLite
+#' @importFrom DBI dbConnect
+#' @importFrom DBI dbDisconnect
+#' @importFrom DBI dbListTables
+#' @importFrom DBI dbReadTable
+#' @importFrom dplyr as_tibble
+#' @export
+
+read_Citavi_ctv6 <- function(path = NULL, CitDBTableName = "Reference") {
+
+    # A path is required: NULL (the default) or any other non-character value
+    # is rejected before a database connection is attempted.
+    stopifnot(is.character(path))
+
+    # connect -------------------------------------------------------------------
+    # Open the Citavi project file as an SQLite database. The disconnect is
+    # registered via on.exit() right away so that the connection is released
+    # even if listing/reading below throws (e.g. unknown table name). Before,
+    # the disconnect was only reached on success and an error leaked it.
+    Citcon <- DBI::dbConnect(RSQLite::SQLite(), path)
+    on.exit(DBI::dbDisconnect(Citcon), add = TRUE)
+
+    if (is.null(CitDBTableName)) {
+
+      # ListTables --------------------------------------------------------------
+      # No table requested: return the names of all tables in the database
+      # as a character vector.
+      DBI::dbListTables(conn = Citcon)
+
+    } else {
+
+      # ReadTable ---------------------------------------------------------------
+      CitDBTable <- DBI::dbReadTable(conn = Citcon, name = CitDBTableName)
+
+      # format & return tibble --------------------------------------------------
+      dplyr::as_tibble(CitDBTable)
+    }
+  }
diff --git a/R/read_Citavi_xlsx.R b/R/read_Citavi_xlsx.R
@@ -26,8 +26,8 @@
 #' was set to "English" so that column names are "Short title" etc.
 #'
 #' @examples
-#' path <- example_xlsx("3dupsin5refs.xlsx") # use this package's example xlsx file
-#' read_Citavi_xlsx(path)
+#' example_path <- example_file("3dupsin5refs.xlsx") # use this package's example xlsx file
+#' read_Citavi_xlsx(example_path)
 #'
 #' \dontrun{
 #' CitDat <- read_Citavi_xlsx("data/yourCitaviExport.xlsx")
@@ -41,7 +41,7 @@
 read_Citavi_xlsx <- function(path = NULL, keepMarksCols = TRUE, useYearDerived = TRUE, setSuggestedColOrder = TRUE, setSuggestedColTypes = TRUE, ...) {
 
   if (is.null(path)) {
-    stop("You did not provide a path to the Excel file.\n  If you want to use an example file provided in this package instead, try\n  read_Citavi_xlsx(example_xlsx('3dupsin5refs.xlsx'))")
+    stop("You did not provide a path to the Excel file.\n  If you want to use an example file provided in this package instead, try\n  read_Citavi_xlsx(example_file('3dupsin5refs.xlsx'))")
   }