Commit

update
mengluchu committed Jun 27, 2020
1 parent ff787dd commit a159327
Showing 28 changed files with 6,274 additions and 28 deletions.
Binary file modified .DS_Store
Binary file not shown.
Binary file modified Python/.DS_Store
Binary file not shown.
Binary file modified Python/calc_predictors/.DS_Store
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file modified Python/deep_learning/.DS_Store
Binary file not shown.
826 changes: 826 additions & 0 deletions Python/deep_learning/air_pollution_road_structure.ipynb

Large diffs are not rendered by default.

Binary file added Python/deep_learning/archive/.DS_Store
Binary file not shown.
File renamed without changes.
File renamed without changes.
Binary file modified R_scripts/.DS_Store
Binary file not shown.
Binary file modified R_scripts/Introduction/.DS_Store
Binary file not shown.
166 changes: 166 additions & 0 deletions R_scripts/convolution_filter/.Rhistory
@@ -0,0 +1,166 @@
length(yvar)
knitr::opts_chunk$set(fig.width=12, fig.height=8, fig.path=paste0('hyperparametertunning', "/"),
echo=T, warning=FALSE, message=FALSE, dev = "pdf")
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE , repos='http://cran.muenster.r-project.org')
sapply(pkg, require, character.only = TRUE)
}
packages <- c( "dplyr", "devtools", "ranger", "gbm","xgboost", "data.table", "vcd","Matrix","mlr3","tmap", "sf","spgwr", "ggplot2", "ggtheme","tidyr","tmap", "gstat", "caret")
ipak(packages)
#install_github("mengluchu/APMtools")
library(APMtools)
data("global_annual")
y_var = "value_mean"
prestring = "road|nightlight|population|temp|wind|trop|indu|elev"
varstring = paste(prestring,y_var,sep="|")
merged = global_annual %>% merge_roads( c(3, 4, 5), keep = F) %>%na.omit() %>%
ungroup()%>%dplyr::select(matches(varstring))
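# Minimal sanity-check sketch: list which columns of global_annual the varstring regex keeps;
# dplyr::select(matches(varstring)) above uses the same pattern.
grep(varstring, names(global_annual), value = TRUE)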
#install.packages("caret")
#library(caret)
xgboostgrid = expand.grid(nrounds = seq(300, 2000, by = 50), max_depth = 3:5, eta = seq(0.05, 0.2, by = 0.05),gamma = 1, lambda = seq(0.05, 0.4, by = 0.05),
colsample_bytree = 1,
min_child_weight = 1,
subsample = 0.7)
#gamma: Minimum loss reduction required to make a further partition on a leaf node of the tree. The larger gamma is, the more conservative the algorithm will be.
trainControl <- trainControl(method = "cv", number = 5, savePredictions = "final", allowParallel = T) # 5-fold CV
# train the model
model <- train(value_mean~., data=merged, method="xgbTree", trControl=trainControl, tuneGrid =xgboostgrid)
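# Minimal sketch for inspecting the tuned model; bestTune, results and plot() are standard for a caret "train" object.
model$bestTune # best hyperparameter combination found by the grid search
head(model$results[order(model$results$RMSE), ]) # cross-validated error per grid point
plot(model) # tuning profile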
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE, repos='http://cran.muenster.r-project.org')
sapply(pkg, require, character.only = TRUE)
}
packages <- c( "raster
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE, repos='http://cran.muenster.r-project.org')
sapply(pkg, require, character.only = TRUE)
}
packages <- c( "raster", "dplyr", "devtools", "rgdal","Matrix","xgboost", "data.table" , "randomForest", "glmnet" ,"rasterVis" ,"RcolorBrew" )
ipak(packages)
xgb2= raster("~/Downloads/xgbph_slowlr.tif")
library(rasterVis)
myTheme = rasterTheme(region = rev(brewer.pal(7, "Spectral")))
#quantile(global_annual$value_mean, c(0.01, 0.999, 0.99999))
#quite clear Lasso is not robust against extreme values. Question: do we need to remove very high values, e.g. larger than 200, before aggregation? The maximum of the mean values is 98; it is 62 at the 99th percentile and 84 at the 99.9th.
levelplot(xgb1, at =seq(10,100, by =3) , par.settings = myTheme)
#quantile(global_annual$value_mean, c(0.01, 0.999, 0.99999))
#quite clear Lasso is not robust against extreme values. Question: do we need to remove very high values, e.g. larger than 200, before aggregation? The maximum of the mean values is 98; it is 62 at the 99th percentile and 84 at the 99.9th.
levelplot(xgb2, at =seq(10,100, by =3) , par.settings = myTheme)
#quantile(global_annual$value_mean, c(0.01, 0.999, 0.99999))
#quite clear Lasso is not robust against extreme values. Question: do we need to remove very high values, e.g. larger than 200, before aggregation? The maximum of the mean values is 98; it is 62 at the 99th percentile and 84 at the 99.9th.
levelplot(xgb2, par.settings = myTheme)
myTheme = rasterTheme(region = rev(brewer.pal(7, "Spectral")))
#quantile(global_annual$value_mean, c(0.01, 0.999, 0.99999))
#quite clear Lasso is not robust against extreme values. Question: do we need to remove very high values, e.g. larger than 200, before aggregation? The maximum of the mean values is 98; it is 62 at the 99th percentile and 84 at the 99.9th.
levelplot(xgb2, par.settings = myTheme)
myTheme = rasterTheme(region = rev(brewer.pal(7, "Spectral")))
library(RColorBrewer)
myTheme = rasterTheme(region = rev(brewer.pal(7, "Spectral")))
levelplot(la1, par.settings = myTheme) #
#quantile(global_annual$value_mean, c(0.01, 0.999, 0.99999))
#quite clear Lasso is not robust against extreme values. Question: do we need to remove very high values, e.g. larger than 200, before aggregation? The maximum of the mean values is 98; it is 62 at the 99th percentile and 84 at the 99.9th.
levelplot(xgb2, par.settings = myTheme)
###########################################################################
# visualize
###############################################################
rf1= raster("~/Downloads/non_backups/rfmph.tif")
rf1 = projectRaster(rf1, crs= CRS("+init=epsg:4326"))
xgb2 = projectRaster(xgb2, crs= CRS("+init=epsg:4326"))
getwd()
locations_sf = st_as_sf(global_annual, coords = c("long","lat"))
osm_valuemean = tm_shape(rf1)+
tm_raster(names(rf1),palette = "-Spectral", n = 9,alpha = 0.9)+
tm_shape(xgb2)+
tm_raster(names(xgb2),palette = "-Spectral", n = 9,alpha = 0.9)+
tm_shape(locations_sf)+
tm_dots( "value_mean", col = "value_mean", size = 0.05,title = "NO2 value",
popup.vars = c("value_mean" ))+
tm_view(basemaps = c('OpenStreetMap'))
tmap_save(osm_valuemean, "/Users/menglu/Documents/GitHub/Global mapping/rfxgb_phe.html")
xgb3= raster("~/Downloads/lr003lam002.tif")
xgb3 = projectRaster(xgb3, crs= CRS("+init=epsg:4326"))
locations_sf = st_as_sf(global_annual, coords = c("long","lat"))
osm_valuemean = tm_shape(rf1)+
tm_raster(names(rf1),palette = "-Spectral", n = 9,alpha = 0.9)+
tm_shape(xgb2)+
tm_raster(names(xgb2),palette = "-Spectral", n = 9,alpha = 0.9)+
tm_shape(xgb3)+
tm_raster(names(xgb3),palette = "-Spectral", n = 9,alpha = 0.9)+
tm_shape(locations_sf)+
tm_dots( "value_mean", col = "value_mean", size = 0.05,title = "NO2 value",
popup.vars = c("value_mean" ))+
tm_view(basemaps = c('OpenStreetMap'))
tmap_save(osm_valuemean, "/Users/menglu/Documents/GitHub/Global mapping/rfxgb_phe.html")
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE , repos='http://cran.muenster.r-project.org')
sapply(pkg, require, character.only = TRUE)
}
packages <- c( "devtools", "dplyr","data.table" , "ggplot2" , "RColorBrewer", "raster", "rasterVis", "rgdal","Matrix","xgboost", "glmnet", "ranger", "randomForest"
,"tidyverse" ,"stargazer")
ipak(packages)
install_github("mengluchu/APMtools")
library(APMtools)
data(global_annual)
install_github("mengluchu/APMtools")
knitr::opts_chunk$set(fig.width=12, fig.height=8, fig.path=paste0('global_crossvali',"/"),
echo=F, warning=FALSE, message=FALSE, dev = "png", include = T)
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE , repos='http://cran.muenster.r-project.org')
sapply(pkg, require, character.only = TRUE)
}
packages <- c( "devtools", "dplyr","data.table" , "ggplot2" , "RColorBrewer", "raster", "rasterVis", "rgdal","Matrix","xgboost", "glmnet", "ranger", "randomForest"
,"tidyverse" ,"stargazer")
ipak(packages)
install_github("mengluchu/APMtools")
library(APMtools)
data(global_annual)
ipak <- function(pkg){
new.pkg <- pkg[!(pkg %in% installed.packages()[, "Package"])]
if (length(new.pkg))
install.packages(new.pkg, dependencies = TRUE , repos='http://cran.muenster.r-project.org')
sapply(pkg, require, character.only = TRUE)
}
packages <- c( "devtools", "dplyr","data.table" , "ggplot2" , "RColorBrewer", "raster", "rasterVis", "rgdal","Matrix","xgboost", "glmnet", "ranger", "randomForest"
,"tidyverse" ,"stargazer")
ipak(packages)
install_github("mengluchu/APMtools")
library(APMtools)
data(global_annual)
y_var = "value_mean"
prestring = "road|nightlight|population|temp|wind|trop|indu|elev"
varstring = paste(prestring,y_var,sep="|")
merged = global_annual %>% merge_roads( c(3, 4, 5), keep = F) %>%
ungroup()%>%dplyr::select(matches(varstring))
merged_xgb= global_annual %>% merge_roads( c(3, 4, 5), keep = F) %>%
ungroup()%>%dplyr::select(matches(varstring))
#V2= c("P_LM_NO_OMI_day","P_LM_with_OMI_day","P_LM_night","P_Lasso_day","P_lasso_night", "P_rf_day","P_rf_night","P_ctree_day")
#for ( i in 1:10)
crossvali = function(n,df, y_var) {
smp_size <- floor(0.8 * nrow(df))
set.seed(n)
training<- sample(seq_len(nrow(df)), size = smp_size)
test = seq_len(nrow(df))[-training]
#P_rf = rf_LUR(df, numtrees = 1000, mtry = 34, vis1 = F,y_varname= y_var, training=training, test=test, grepstring =varstring)
#P_rf_la = rf_Lasso_LUR(df, numtrees = 1000, mtry = 34, vis1 = F,y_varname= y_var, training=training, test=test, grepstring =varstring)
P_xgb= xgboost_LUR(df, max_depth =5, gamma=1, eta =0.003, nthread = 32, xgblambda = 0.002, nrounds =1700, y_varname= y_var,training=training, test=test, grepstring =varstring)
#P_Lasso = Lasso(df,alpha =1 , vis1 = F,y_varname = y_var,training=training, test=test,grepstring =prestring )
#V = cbind(P_xgb, P_rf, P_rf_la, P_Lasso)
}
V2 = lapply(1:2, df = merged_xgb, y_var = y_var,crossvali)
y_var
y_var
V2 = lapply(1:2, df = merged_xgb, _var = y_var,crossvali)
V2 = lapply(1:2, df = merged_xgb, y_var = y_var,crossvali)
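# Minimal sketch for summarising the runs, assuming each element of V2 is a numeric vector of
# validation metrics returned by xgboost_LUR (as the data.frame(V2) step in Glo_crossvali.Rmd suggests).
V3 = do.call(cbind, V2)
rowMeans(V3)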
xgboost_LUR
install_github("mengluchu/APMtools")
install_github("mengluchu/APMtools")
library(APMtools)
Binary file modified R_scripts/modeling_process/.DS_Store
Binary file not shown.
19 changes: 12 additions & 7 deletions R_scripts/modeling_process/Glo_crossvali.Rmd
@@ -41,7 +41,12 @@ prepare the dataset for modeling
merged = global_annual %>% merge_roads( c(3, 4, 5), keep = F) %>%na.omit() %>%
ungroup()%>%dplyr::select(matches(varstring))
```


```{r}
merged_xgb= global_annual %>% merge_roads( c(3, 4, 5), keep = F) %>%
ungroup()%>%dplyr::select(matches(varstring))
```

Variable importance: 20 bootstrap repetitions
```{r, eval=F}
Impor_val = function(n,df, method , y_var ) {
@@ -89,15 +94,15 @@ set.seed(n)
training<- sample(seq_len(nrow(df)), size = smp_size)
test = seq_len(nrow(df))[-training]
P_rf = rf_LUR(df, numtrees = 1000, mtry = 34, vis1 = F,y_varname= y_var, training=training, test=test, grepstring =varstring)
P_rf_la = rf_Lasso_LUR(df, numtrees = 1000, mtry = 34, vis1 = F,y_varname= y_var, training=training, test=test, grepstring =varstring)
#P_rf = rf_LUR(df, numtrees = 1000, mtry = 34, vis1 = F,y_varname= y_var, training=training, test=test, grepstring =varstring)
#P_rf_la = rf_Lasso_LUR(df, numtrees = 1000, mtry = 34, vis1 = F,y_varname= y_var, training=training, test=test, grepstring =varstring)
P_xgb= xgboost_LUR(df, max_depth =4, gamma=1, eta =0.05, nthread = 4, nrounds =1650, y_varname= y_var,training=training, test=test, grepstring =varstring)
P_Lasso = Lasso(df,alpha =1 , vis1 = F,y_varname = y_var,training=training, test=test,grepstring =prestring )
P_xgb= xgboost_LUR(df, max_depth =5, gamma=1, eta =0.003, nthread = 32, xgblambda = 0.002, nrounds =1700, y_varname= y_var,training=training, test=test, grepstring =varstring)
#P_Lasso = Lasso(df,alpha =1 , vis1 = F,y_varname = y_var,training=training, test=test,grepstring =prestring )
V = cbind(P_xgb, P_rf, P_rf_la, P_Lasso)
#V = cbind(P_xgb, P_rf, P_rf_la, P_Lasso)
}
V2 = lapply(1:20, df = merged, y_var = y_var,crossvali)
V2 = lapply(1:2, df = merged_xgb, y_var = y_var,crossvali)
V3 = data.frame(V2)
#save(V3, file = paste0("V3.rdata"))
```
55 changes: 34 additions & 21 deletions R_scripts/modeling_process/Glo_map.Rmd
@@ -10,12 +10,7 @@ knitr::opts_chunk$set(fig.width=12, fig.height=8, fig.path=paste0('output',"/"),

Let's see the global model making predictions for a specific area.

file names for the output maps
```{r}
xgbname = "xgbdc.tif"
rfname = "rfdc.tif"
laname = "Ladc.tif"
```


packages to install
```{r}
@@ -47,18 +42,28 @@ sr = stack("dc.grd")
```



Mapping. Implemented in predicLA_RF_XGBtiles are three methods: random forest using the randomForest package, Lasso using the glmnet package, and xgboost using the xgboost package. APMtools::ranger_stack predicts using a ranger model, as raster::predict still does not seem to support ranger (it explicitly supports only randomForest and gbm and merely gives others a try); there are other work-arounds. Here the APMtools::ranger_stack function turns the RasterStack into a matrix for the calculation and puts the predictions back into a raster.
Mapping using xgboost, which handles missing values in the predictors in both the modeling and the prediction process
```{r}
# nrounds = 1650 and eta = 0.003 in practice; the current settings are for a fast demonstration
xgb_stack(sr=sr, df_var = inde_var, y_var ="value_mean",xgbname = "densexgb.tif",nrounds = 150, eta = 0.05)
levelplot(raster("densexgb.tif"))
```
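
The NA handling comes from xgboost itself: at every split it learns a default direction for missing values, so no imputation is needed before modeling or prediction. A minimal stand-alone sketch, assuming inde_var holds the value_mean response plus numeric predictors (as the calls in this script suggest):

```{r, eval = F}
library(xgboost)
X = as.matrix(inde_var[, setdiff(names(inde_var), "value_mean")])
dtrain = xgb.DMatrix(data = X, label = inde_var$value_mean, missing = NA)
fit = xgb.train(params = list(objective = "reg:squarederror", max_depth = 4, eta = 0.05),
                data = dtrain, nrounds = 150)
summary(predict(fit, X)) # rows with NA predictors still receive predictions
```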

Comparison between the prediction patterns of different methods. We use APMtools::predicLA_RF_XGBtiles, which implements three methods: random forest using the randomForest package, Lasso using the glmnet package, and xgboost using the xgboost package.

file names for the output maps
```{r}
xgbname = "xgbdc.tif"
rfname = "rfdc.tif"
laname = "Ladc.tif"
```

```{r}
set.seed(1)
xgb_rounds = 120 # 1650 in practice, 120 for fast demonstration
rf_ntree = 1000 # 1000 in practice, 500 for fast demonstration
xgb_rounds = 150 # 1650 in practice, 150 for fast demonstration
rf_ntree = 1000 #
rf_mtry = 34 # optimized from caret
predicLA_RF_XGBtiles(df = inde_var, rasstack = sr, yname = "value_mean", varstring = varstring, xgbname=xgbname, rfname = rfname, laname = laname, ntree = rf_ntree, mtry = rf_mtry, max_depth = 4, eta = 0.05, nthread = 4, gamma = 1, nrounds = xgb_rounds)
ranger_pre = ranger_stack(inde_var, "value_mean", sr, 34, 1000)
```


@@ -80,34 +85,42 @@ stack(rfs, xgbs )%>%levelplot(at = seq(20, 45, by =3), par.settings = myTheme, n
#levelplot(stack(rfs, xgbs))
#dev.off()
# show on openStreetMap
```

Visualize on OpenStreetMap. Importantly, the rasters need to be reprojected to WGS84 (EPSG:4326) first.

```{r}
locations_sf = st_as_sf(global_annual, coords = c("long","lat"))
rfs = projectRaster(rfs, crs= CRS("+init=epsg:4326"))
xgbs = projectRaster(xgbs, crs= CRS("+init=epsg:4326"))
las = projectRaster(las, crs= CRS("+init=epsg:4326"))
ranger_pre = projectRaster(ranger_pre, crs= CRS("+init=epsg:4326"))
```
Visualize using Openstreetmap
```{r}
osm_valuemean = tm_shape(rfs)+
tm_raster(names(rfs),palette = "-Spectral", n = 9,alpha = 0.9)+
tm_shape(xgbs)+tm_raster(names(xgbs),"-Spectral", n = 9,alpha = 0.9)+
tm_shape(las)+tm_raster(names(las),"-Spectral", n = 9,alpha = 0.9) +
tm_shape(locations_sf) +
tm_dots( "value_mean", col = "value_mean", size = 0.05,title = "NO2 value",
popup.vars = c("value_mean" ))+
tm_shape(ranger_pre)+
tm_raster(names(ranger_pre),"-Spectral", n = 9,alpha = 0.9)+
popup.vars = c("value_mean" ))+
tm_view(basemaps = c('OpenStreetMap'))
tmap_save(osm_valuemean, "predictions_pre.html")
```

APMtools::ranger_stack predicts using a ranger model, as raster::predict still does not seem to support ranger (it explicitly supports only randomForest and gbm and merely gives others a try); there are other work-arounds. Here the APMtools::ranger_stack function turns the RasterStack into a matrix for the calculation and puts the predictions back into a raster.
```{r, eval = F}
ranger_pre = ranger_stack(inde_var, "value_mean", sr, 34, 1000)
levelplot(ranger_pre)
```
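
One of those other work-arounds, for reference, is to hand raster::predict a custom fun that unwraps ranger's prediction object. A minimal sketch, assuming a ranger fit on inde_var with the same predictor names as the layers of sr (the rgr object below is only illustrative):

```{r, eval = F}
library(ranger)
rgr = ranger(value_mean ~ ., data = na.omit(inde_var), num.trees = 1000, mtry = 34)
# raster::predict calls fun(model, block-of-cell-values, ...); ranger's predict() returns a list,
# so unwrap $predictions to hand back a plain numeric vector.
ranger_map = raster::predict(sr, rgr, fun = function(model, ...) predict(model, ...)$predictions)
levelplot(ranger_map)
```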

Also compare the ranger and randomForest predictions.

```{r, eval = F}
ranger_pre = projectRaster(ranger_pre, crs= CRS("+init=epsg:4326"))
levelplot(stack(rfs, ranger_pre))
levelplot(rfs- ranger_pre)
values(rfs - ranger_pre) %>% summary() # differences of about 0.1 are probably not worth worrying about
459 changes: 459 additions & 0 deletions R_scripts/modeling_process/Glo_map.html

Large diffs are not rendered by default.

Binary file added R_scripts/modeling_process/Ladc.tif
Binary file not shown.
Binary file added R_scripts/modeling_process/densexgb.tif
Binary file not shown.
Binary file modified R_scripts/modeling_process/output/unnamed-chunk-4-1.png