v0.3.7 (#42)
V 0.3.7
=======
- New features:
	- New functions:
		- Function *as.POSIXct_fast* is now available. It converts to POSIXct much faster when the same date value appears multiple times in the column (see the sketch after this list).
	- New features in existing functions:
		- Date identification is faster: the format search now runs on unique values only.
		- Date transformation is faster: *as.POSIXct_fast* is used when it pays off.
		- Functions *findAndTransformDates*, *findAndTransformNumerics* and *unFactor* now accept a *cols* argument to limit the search.

- Bug fixes:
	- Ensure the over-allocation option is activated on every data.table, to avoid issues with *set*. The package should be more robust.
	- Fixed a bug in the bijection search (internal function *fastIsBijection*) that occurred in some rare cases. The fix is correct but slower.

- Code quality:
	- Improved code quality using lintr.
	- Removed some unused code.
	- Updated to meet the new covr standard.
	- Improved the logs of the setColAsXXX functions.
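The sketch below illustrates the idea behind *as.POSIXct_fast* and the new *cols* argument. It is an illustration under assumptions, not the package's implementation: only the *cols* argument of *findAndTransformDates* appears in this diff; the *format* argument of *as.POSIXct_fast* and the column names are placeholders.

```r
# Idea behind as.POSIXct_fast: parse each distinct string once, then map the
# parsed values back onto the full column. The speed-up comes from columns in
# which the same date value appears many times.
fast_posixct_sketch <- function(x, format = "%Y-%m-%d") {
  uniq <- unique(x)                            # distinct values only
  parsed <- as.POSIXct(uniq, format = format)  # expensive call, run once per distinct value
  parsed[match(x, uniq)]                       # broadcast results back to the full length
}

dates <- rep(c("2018-08-21", "2017-01-02"), times = 50000)
head(fast_posixct_sketch(dates))

# Hypothetical calls to the new API (column names and the format argument are assumptions):
# library(dataPreparation)
# messy_adult <- findAndTransformDates(messy_adult, cols = c("date1", "date2"))
# messy_adult[, date1 := as.POSIXct_fast(date1, format = "%Y-%m-%d")]
```

Restricting *cols* to the columns that can plausibly hold dates avoids running the format search on everything else.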
ELToulemonde authored Aug 21, 2018
1 parent 49ea9e5 commit 7558f80
Showing 43 changed files with 2,552 additions and 153 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
@@ -1,6 +1,6 @@
Package: dataPreparation
Title: Automated Data Preparation
Version: 0.3.6
Version: 0.3.7
Authors@R: person("Emmanuel-Lin", "Toulemonde", email = "[email protected]", role = c("aut", "cre"))
Description: Do most of the painful data preparation for a data science project with a minimum amount of code; Take advantages of data.table efficiency and use some algorithmic trick in order to perform data preparation in a time and RAM efficient way.
Depends:
@@ -12,7 +12,7 @@ Depends:
License: GPL-3 | file LICENSE
LazyData: true
Encoding: UTF-8
RoxygenNote: 6.0.1
RoxygenNote: 6.1.0
Suggests: knitr,
rmarkdown,
kableExtra,
1 change: 1 addition & 0 deletions NAMESPACE
@@ -1,6 +1,7 @@
# Generated by roxygen2: do not edit by hand

export(aggregateByKey)
export(as.POSIXct_fast)
export(build_bins)
export(build_encoding)
export(build_scales)
20 changes: 20 additions & 0 deletions NEWS
@@ -1,3 +1,23 @@
V 0.3.7
=======
- New features:
	- New functions:
		- Function *as.POSIXct_fast* is now available. It converts to POSIXct much faster when the same date value appears multiple times in the column.
	- New features in existing functions:
		- Date identification is faster: the format search now runs on unique values only.
		- Date transformation is faster: *as.POSIXct_fast* is used when it pays off.
		- Functions *findAndTransformDates*, *findAndTransformNumerics* and *unFactor* now accept a *cols* argument to limit the search.

- Bug fixes:
	- Ensure the over-allocation option is activated on every data.table, to avoid issues with *set*. The package should be more robust.
	- Fixed a bug in the bijection search (internal function *fastIsBijection*) that occurred in some rare cases. The fix is correct but slower.

- Code quality:
	- Improved code quality using lintr.
	- Removed some unused code.
	- Updated to meet the new covr standard.
	- Improved the logs of the setColAsXXX functions.

V 0.3.6
=======
- Bug fixes:
20 changes: 20 additions & 0 deletions NEWS.md
@@ -1,3 +1,23 @@
V 0.3.7
=======
- New features:
	- New functions:
		- Function *as.POSIXct_fast* is now available. It converts to POSIXct much faster when the same date value appears multiple times in the column.
	- New features in existing functions:
		- Date identification is faster: the format search now runs on unique values only.
		- Date transformation is faster: *as.POSIXct_fast* is used when it pays off.
		- Functions *findAndTransformDates*, *findAndTransformNumerics* and *unFactor* now accept a *cols* argument to limit the search.

- Bug fixes:
	- Ensure the over-allocation option is activated on every data.table, to avoid issues with *set*. The package should be more robust.
	- Fixed a bug in the bijection search (internal function *fastIsBijection*) that occurred in some rare cases. The fix is correct but slower.

- Code quality:
	- Improved code quality using lintr.
	- Removed some unused code.
	- Updated to meet the new covr standard.
	- Improved the logs of the setColAsXXX functions.

V 0.3.6
=======
- Bug fixes:
19 changes: 10 additions & 9 deletions R/aggregate.R
@@ -85,15 +85,15 @@ aggregateByKey <- function(dataSet, key, verbose = TRUE, thresh = 53, ...){
}
if (verbose){
printl(function_name, ": ", ncol(result), " columns have been constructed. It took ",
round((proc.time() - start_time)[[3]], 2), " seconds. ")
round( (proc.time() - start_time)[[3]], 2), " seconds. ")
}

return(result)
}
else{ # If there is as many unique key as lines, there is nothing to do
return(dataSet)
}
}
}


###########################################################################
@@ -113,6 +113,7 @@ aggregateByKey <- function(dataSet, key, verbose = TRUE, thresh = 53, ...){
# @param thresh number of max distinct values for frequencies count
# @
# @export # Before exporting this function should be improved!
#' @import data.table
aggregateAcolumn <- function(dataSet, col, key, unique_keys, name_separator = ".",
functions, thresh = 53, ...){
## Environment
@@ -124,24 +125,24 @@ aggregateAcolumn <- function(dataSet, col, key, unique_keys, name_separator = ".
stop(paste0(function_name, ": dataSet should have 2 columns. (This is a private function)."))
}
## Initialization
maxNbValuePerKey <- max(unique(dataSet)[, .N, by = key]$N)
max_unique_val_per_key <- max(unique(dataSet)[, .N, by = key]$N)

## Computation
if (maxNbValuePerKey > 1){
if (max_unique_val_per_key > 1){
result_tmp <- copy(unique_keys) # copy because it's a data.table, otherwise it append it
## Aggregation of numerics
if (is.numeric(dataSet[[col]])){
# To-do: if there is a constant nbr of value for each line consider make
# them columns
# To-do: if there is a small amount of values: factorize
code = "result_tmp = dataSet[, .("
code <- "result_tmp = dataSet[, .("
for (fct in functions){
code = paste0(code, paste(fct, col, sep = name_separator), "=", fct, "(get(col)), ")
code <- paste0(code, paste(fct, col, sep = name_separator), "=", fct, "(get(col)), ")
}
if (length(functions) > 0){
code = substr(code, start = 1, stop = nchar(code) - 2)
code <- substr(code, start = 1, stop = nchar(code) - 2)
}
code = paste0(code, "), by = key]")
code <- paste0(code, "), by = key]")
try(eval(parse(text = code)))
if ("sd" %in% functions){
# Bug fixing, sd is giving NA if you only have one value while standard deviation is supposed to be 0
@@ -172,7 +173,7 @@ aggregateAcolumn <- function(dataSet, col, key, unique_keys, name_separator = ".
}

}
if (maxNbValuePerKey == 1){
if (max_unique_val_per_key == 1){
# Only one different value by key: we put the value one time by key.
result_tmp <- unique(dataSet)
}
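For readers following the string-building above: for a numeric column, the generated and eval-parsed call is an ordinary data.table grouped aggregation. A minimal illustration with made-up data (the functions, column name and key are examples, not taken from the package):

```r
library(data.table)

# Equivalent of the generated code for functions = c("mean", "max"), col = "age",
# key = "id" and name_separator = "." -- one aggregated column per function.
dataSet <- data.table(id = c(1, 1, 2, 2, 2), age = c(10, 20, 30, 40, 50))
col <- "age"
result_tmp <- dataSet[, .(mean.age = mean(get(col)), max.age = max(get(col))), by = id]
result_tmp
#    id mean.age max.age
# 1:  1       15      20
# 2:  2       40      50
```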
52 changes: 37 additions & 15 deletions R/datesManipulations.R
@@ -1,10 +1,12 @@
###################################################################################
########################### findAndTransformDates #################################
###################################################################################
###################################################################################################
########################### findAndTransformDates #################################################
###################################################################################################
#' Identify date columns
#'
#' Find and transform dates that are hidden in a character column. \cr
#' It uses a bunch of default formats, and you can also add your own formats.
#' @param cols List of column(s) name(s) of dataSet to look into. To check all columns, set it
#' to "auto". (characters, default to "auto")
#' @param dataSet Matrix, data.frame or data.table
#' @param formats List of additional Date formats to check (see \code{\link{strptime}})
#' @param n_test Number of non-null rows on which to test (numeric, default to 30)
@@ -28,6 +30,8 @@
#' \item \code{SOLVE} function will try to solve ambiguity by going through more lines, so will be slower.
#' If it is able to solve it, it will transform the column, if not it will print the various acceptable formats.
#' }
#' If there are some columns that have no chance to be a match, think of removing them from \code{cols}
#' to save some computation time.
#' @return dataSet set (as a data.table) with identified dates transformed by \strong{reference}.
#' @examples
#' # Load example set
@@ -50,7 +54,7 @@
#' }
#' # "##NOT RUN:" mean that this example hasn't been run on CRAN since its long. But you can run it!
#' @export
findAndTransformDates <- function(dataSet, formats = NULL, n_test = 30, ambiguities = "IGNORE", verbose = TRUE){
findAndTransformDates <- function(dataSet, cols = "auto", formats = NULL, n_test = 30, ambiguities = "IGNORE", verbose = TRUE){
## Working environment
function_name <- "findAndTransformDates"

@@ -60,12 +64,14 @@ findAndTransformDates <- function(dataSet, formats = NULL, n_test = 30, ambiguit
if (!is.character(ambiguities) || ! ambiguities %in% c("IGNORE", "WARN", "SOLVE")){
stop(paste0(function_name, ": ambiguities should be either IGNORE, WARN or SOLVE."))
}
cols <- real_cols(dataSet, cols, function_name)
## initialization
start_time <- proc.time()

## Computation
# First we find dates
formats_found <- identifyDates(dataSet, formats = formats, n_test = n_test, ambiguities = ambiguities, verbose = verbose)
formats_found <- identifyDates(dataSet, cols = cols, formats = formats, n_test = n_test,
ambiguities = ambiguities, verbose = verbose)
if (verbose){
printl(function_name, ": It took me ", round( (proc.time() - start_time)[[3]], 2), "s to identify formats")
}
@@ -92,6 +98,8 @@ findAndTransformDates <- function(dataSet, formats = NULL, n_test = 30, ambiguit
#'
#' Function to identify date columns and give their format. It uses a bunch of default formats, but you can also add your own formats.
#' @param dataSet Matrix, data.frame or data.table
#' @param cols List of column(s) name(s) of dataSet to look into. To check all columns, set it
#' to "auto". (characters, default to "auto")
#' @param formats List of additional Date formats to check (see \code{\link{strptime}})
#' @param n_test Number of non-null rows on which to test (numeric, default to 30)
#' @param ambiguities How ambiguities should be treated (see details in ambiguities section)
@@ -126,23 +134,23 @@ findAndTransformDates <- function(dataSet, formats = NULL, n_test = 30, ambiguit
#' # using the findAndTransformDates
#' identifyDates(messy_adult, n_test = 5)
#' @export
identifyDates <- function(dataSet, formats = NULL, n_test = 30, ambiguities = "IGNORE", verbose = TRUE){
identifyDates <- function(dataSet, cols = "auto", formats = NULL, n_test = 30, ambiguities = "IGNORE", verbose = TRUE){
## Working environment
function_name <- "identifyDates"

## Sanity check
dataSet <- checkAndReturnDataTable(dataSet)
n_test <- control_nb_rows(dataSet = dataSet, nb_rows = n_test, function_name = function_name, variable_name = "n_test")
is.verbose(verbose)

cols <- real_cols(dataSet, cols, function_name)
## Initialization
formats_found <- list()
format <- NULL # Initialize format to NULL to avoid considering it by mistake as the function format
## Computation
if (verbose){
pb <- initPB(function_name, names(dataSet))
}
for ( col in names(dataSet) ){
for ( col in cols){
# We search dates only in characters
if (is.character(dataSet[[col]]) || is.numeric(dataSet[[col]]) || (is.factor(dataSet[[col]]) & is.character(levels(dataSet[[col]])))){
# Look for the good format
@@ -207,7 +215,7 @@ identifyDatesFormats <- function(dataSet, formats, n_test = 30, ambiguities="IGN
data_sample <- findNFirstNonNull(levels(dataSet), n_test)
}
else{
data_sample <- findNFirstNonNull(dataSet, n_test)
data_sample <- unique(findNFirstNonNull(dataSet, n_test)) # Take unique to avoid useless computations
}

## Computation
Expand All @@ -231,18 +239,27 @@ identifyDatesFormats <- function(dataSet, formats, n_test = 30, ambiguities="IGN
defaultDateFormats <- getPossibleDatesFormats(date_sep_tmp, date_hours = date_hours)
formats_tmp <- unique(c(defaultDateFormats, formats))

# # Check if we should change local time (ie: should it be english?) => Future, because it as to be changed is setColAsDate too
# change_lc_time <- any(grepl("Aug|May|Apr", data_sample))

# if (change_lc_time){
# store_loctime <- Sys.getlocale("LC_TIME")
# Sys.setlocale("LC_TIME", "C")
# }
# Check formats
for (format in formats_tmp){
converted <- as.POSIXct(data_sample, format = format)
un_converted <- format(converted, format = format)
if (control_date_conversion(un_converted, data_sample)){
temp_format <- c(temp_format, format)
if (ambiguities == "IGNORE"){ # If don't care about possible ambiguities: return found format
return(format)
}
else{ # Else keep searching
temp_format <- c(temp_format, format)
break
}
}
}
# if (change_lc_time){ # Reset it
# Sys.setlocale("LC_TIME", store_loctime)
# }
}
if (is.numeric(data_sample)){
temp_format <- identifyTimeStampsFormats(dataSet = data_sample)
@@ -394,7 +411,8 @@ getPossibleDatesFormats <- function(date_sep = c("," , "/", "-", "_", ":"), dat
}
}

# Complete the list with the same formats but with a time format at the and separed by a ' ' or a "T" and optionaly with a "Z" at the end
# Complete the list with the same formats but with a time format at the end, separated by
# a ' ' or a "T" and optionally with a "Z" at the end
formats <- c(datesFormats, hours_format)
if (date_hours){
for (datesFormat in datesFormats){
@@ -404,6 +422,9 @@ getPossibleDatesFormats <- function(date_sep = c("," , "/", "-", "_", ":"), dat
}
}

# Add optional time zone at the end.
# formats <- c(formats, paste(formats, "%z"))

## Wrapp-up
return(formats)
}
Expand All @@ -413,6 +434,8 @@ getPossibleDatesFormats <- function(date_sep = c("," , "/", "-", "_", ":"), dat
############################### Format for parse_date_time ############################
#######################################################################################
# Code is commented out and the result hard-coded so that it's way faster. You should keep it that way
#' @importFrom lubridate parse_date_time
#' @importFrom stringr str_replace_all
formatForparse_date_time<- function(){
# Get complete liste of format
# listOfFastFormat <- getPossibleDatesFormats()
@@ -444,7 +467,6 @@ formatForparse_date_time<- function(){
"ydm H", "dmy HMS", "dmy HM", "dmy H", "mdy HMS", "mdy HM", "mdy H"
)


## Wrapp-up
return(result)
}
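The format detection in identifyDatesFormats above works by round-tripping a sample through each candidate format: parse it, re-render it, and accept the format if the original strings come back. Below is a standalone sketch of that check, with a plain identical() standing in for the package-internal control_date_conversion and a hand-picked candidate list (both are assumptions):

```r
# A candidate format is kept when parsing and re-formatting a sample of unique
# values reproduces the original strings.
guess_date_format <- function(values,
                              candidates = c("%Y-%m-%d", "%d/%m/%Y", "%Y-%m-%d %H:%M:%S")) {
  sample_values <- unique(values)  # mirrors the 0.3.7 change: search formats on unique values only
  for (fmt in candidates) {
    parsed <- as.POSIXct(sample_values, format = fmt)
    if (anyNA(parsed)) next        # this format failed on at least one value
    if (identical(format(parsed, format = fmt), sample_values)) {
      return(fmt)                  # round trip succeeded
    }
  }
  NULL                             # no candidate matched
}

guess_date_format(c("21/08/2018", "21/08/2018", "02/01/2017"))  # "%d/%m/%Y"
```

Like the ambiguities = "IGNORE" branch above, this returns the first acceptable format even when several candidates would round-trip.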
20 changes: 10 additions & 10 deletions R/description.R
@@ -43,16 +43,16 @@ description <- function(dataSet, level = 1, path_to_write = NULL, verbose = TRUE
# Level 1: Univariate description
if (level >= 1){
cat("#####################################\n## Level 1: univariate description ##\n#####################################\n")
card <- dataSet[, lapply(.SD, uniqueN)]
card <- dataSet[, lapply(.SD, uniqueN)]
for (col in colnames(dataSet)){
# Unique or distinct values
# Unique or distinct values
if (card[[col]] == 1){
printl(col, " only has 1 value.")
}
if (card[[col]] == nrow(dataSet)){
printl(col, " has all unique values.")
}
# Numerical and date cols
printl(col, " only has 1 value.")
}
if (card[[col]] == nrow(dataSet)){
printl(col, " has all unique values.")
}
# Numerical and date cols
if (is.numeric(dataSet[[col]]) || is.date(dataSet[[col]])){
if (is.numeric(dataSet[[col]])){
printl("Summary for numeric variable ", col)
@@ -62,7 +62,7 @@ description <- function(dataSet, level = 1, path_to_write = NULL, verbose = TRUE
}
print(summary(dataSet[[col]]))
}
# Factor cols
# Factor cols
if (is.factor(dataSet[[col]]) || is.logical(dataSet[[col]])){
if (is.factor(dataSet[[col]])){
printl("Table of occurence for factor variable ", col)
@@ -72,7 +72,7 @@ description <- function(dataSet, level = 1, path_to_write = NULL, verbose = TRUE
}
print(table(dataSet[[col]]))
}
# character
# character
if (is.character(dataSet[[col]])){
printl("character variable ", col, " takes ", uniqueN(dataSet[[col]]), " different values.")
}
5 changes: 2 additions & 3 deletions R/discretization.R
@@ -136,7 +136,7 @@ fastDiscretization <- function(dataSet, bins = NULL, verbose = TRUE){
}
res <- which(splits[-length(splits)] <= x & x < splits[-1])
if (length(res) == 0){
res <- length(splits) -1
res <- length(splits) - 1
}
return(res)
}
@@ -231,5 +231,4 @@ build_splits_names <- function(splits){
split_names[length(split_names)] <- gsub("+Inf\\]$", "+Inf[", split_names[length(split_names)])

return(split_names)
}

}
6 changes: 4 additions & 2 deletions R/factorManipulation.R
@@ -3,6 +3,8 @@
#' To unfactor all columns that have more than a given number of distinct values. This
#' function will be useful after using reading functions that put every string as a factor.
#' @param dataSet Matrix, data.frame or data.table
#' @param cols List of column(s) name(s) of dataSet to look into. To check all columns, set it
#' to "auto". (characters, default to "auto")
#' @param n_unfactor Number of max element in a factor (numeric, default to 53)
#' @param verbose Should the algorithm talk? (logical, default to TRUE)
#' @details
@@ -26,7 +28,7 @@
#'
#' @import data.table
#' @export
unFactor <- function(dataSet, n_unfactor = 53, verbose = TRUE){
unFactor <- function(dataSet, cols = "auto", n_unfactor = 53, verbose = TRUE){
## Working environment
function_name <- "unFactor"

@@ -45,7 +47,7 @@ unFactor <- function(dataSet, n_unfactor = 53, verbose = TRUE){
checkAndReturnDataTable(dataSet = dataSet)

## Initialization
cols <- real_cols(dataSet, cols = "auto", function_name = function_name, types = "factor")
cols <- real_cols(dataSet, cols = cols, function_name = function_name, types = "factor")
if (verbose){
pb <- initPB(function_name, cols)
printl(function_name, ": I will identify variable that are factor but shouldn't be.")