This report is automatically generated with the R
package knitr
(version 1.40
)
.
library(mgcv) #gam() predictionModels <- function(dataframe, .knownYcol=NULL, .knownXcol=NULL, knownX=NULL, preferExtrapModel=NULL){ # dataframe - a AqFishSamplePairing() object if(is.null(knownX)) stop("Please include a knownX value.") # .knownYcol <- quo(Result) - to test function line-wise KnownYcol <- enquo(.knownYcol) # .knownXcol <- quo(TLAvgLength) - to test function line-wise KnownXcol <- enquo(.knownXcol) # Unique number of samples n <- nrow(dataframe %>% distinct(!!KnownXcol)) # Standard Error of Regression (SER) used to determine best model # SER is std.dev (or measure of dispersion) around regression line # SER is std. deviation of predictions with dof correction, which helps select a simple model vs a more complicated model -> dof = unique(n) minus # of coefficients estimated # SER is also called Residual Standard Error & can be called using summary() with summary(Model)$sigma and dof using summary(model)$df[2] # RMSE is std. deviation of predictions without dof correction # Info about how SER was calculated # https://www.youtube.com/watch?v=SM96OglU7Gg # GAM: Generalized additive models with integrated smoothness estimation iterations <- 1:(min(4, n)) # limit smoothness coefficient to k=4 or no higher than n, the number of unique samples # cap the smoothness of gam model at 4 or Unique number of samples, whichever is smaller df_gam <- NULL for(i in iterations){ capture.output( # use capture.output to stop printing cat warning i_gam <- try( dataframe %>% nest(data=colnames(dataframe)) %>% # data=colnames(dataframe) needed so nest doesn't through error if dataframe is not grouped mutate(knownXcol = as_label(KnownXcol), knownYcol = as_label(KnownYcol), knownX = knownX, ModelName = paste0("gam_k=", i)) %>% mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("gam(", as_label(KnownYcol), " ~ s(", as_label(KnownXcol), ", k=i), data=.)"))) )) %>% mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~predict(.x, tibble(!!KnownXcol := knownX)))) %>% mutate(RMSE = purrr::map_dbl(.x=Model, .f = ~sqrt(mean((predict(.x) - dataframe %>% pull(!!KnownYcol))^2)) )) %>% mutate(SER = purrr::map_dbl(.x=Model, .f = ~sqrt( sum((predict(.x) - dataframe %>% pull(!!KnownYcol))^2) / (n - length(.x$coefficients)) ) )) ) ) if(inherits(i_gam, "try-error")) next df_gam <- rbind(df_gam, i_gam) } # NLS: Nonlinear (weighted) least-squares estimates of a nonlinear model capture.output( # use capture.output to stop printing cat warning df_nls <- try( dataframe %>% # https://stackoverflow.com/a/18305916/7903538 nest(data=colnames(dataframe)) %>% # data=colnames(dataframe) needed so nest doesn't through error if dataframe is not grouped mutate(knownXcol = as_label(KnownXcol), knownYcol = as_label(KnownYcol), knownX = knownX, ModelName = "nls") %>% mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("nls(", as_label(KnownYcol), " ~ ", as_label(KnownXcol),"^b, start=c(b=1), algorithm='plinear', data=.)"))) )) %>% mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~predict(.x, tibble(!!KnownXcol := knownX)))) %>% mutate(RMSE = purrr::map_dbl(.x=Model, .f = ~sqrt(mean(summary(.x)$residuals^2)) )) %>% mutate(SER = purrr::map_dbl(.x=Model, .f = ~sqrt( sum(summary(.x)$residuals^2) / (n - 2) ) )) # nls() doesn't return $coefficients but it is 2 (m & b); y = m*x^b ) ) # LM: Fit a standard linear model y = ax + b df_lm <- dataframe %>% nest(data=colnames(dataframe)) %>% # data=colnames(dataframe) needed so nest doesn't through error if dataframe is not grouped mutate(knownXcol = as_label(KnownXcol), knownYcol = as_label(KnownYcol), knownX = knownX, ModelName = "lm") %>% mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("lm(", as_label(KnownYcol), " ~", as_label(KnownXcol),", data=.)"))) )) %>% mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~predict(.x, tibble(!!KnownXcol := knownX)))) %>% mutate(RMSE = purrr::map_dbl(.x=Model, .f = ~sqrt(mean(summary(.x)$residuals^2)) )) %>% mutate(SER = purrr::map_dbl(.x=Model, .f = ~sqrt( sum(summary(.x)$residuals^2) / (n - length(.x$coefficients)) ) )) # LM: Fit a Power function y = a * x^b as done by excel # reference: https://stackoverflow.com/a/18308359 df_pwr <- dataframe %>% # https://stackoverflow.com/questions/18305852/power-regression-in-r-similar-to-excel nest(data=colnames(dataframe)) %>% # data=colnames(dataframe) needed so nest doesn't through error if dataframe is not grouped mutate(knownXcol = as_label(KnownXcol), knownYcol = as_label(KnownYcol), knownX = knownX, ModelName = "pwr") %>% mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("lm(log(", as_label(KnownYcol), ") ~ log(", as_label(KnownXcol),"), data=.)"))) )) %>% mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~exp(predict(.x, tibble(!!KnownXcol := knownX)))) ) %>% mutate(RMSE = purrr::map_dbl(.x=Model, .f = ~sqrt(mean((exp(predict(.x)) - dataframe %>% pull(!!KnownYcol))^2)) )) %>% mutate(SER = purrr::map_dbl(.x=Model, .f = ~sqrt( sum((exp(predict(.x)) - dataframe %>% pull(!!KnownYcol))^2) / (n - length(.x$coefficients)) ) )) # LM: Fit a Logarithmic function y = a + b*ln(X) as done by excel # reference: https://www.statology.org/logarithmic-regression-in-r/ df_log <- dataframe %>% nest(data=colnames(dataframe)) %>% # data=colnames(dataframe) needed so nest doesn't through error if dataframe is not grouped mutate(knownXcol = as_label(KnownXcol), knownYcol = as_label(KnownYcol), knownX = knownX, ModelName = "log") %>% mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("lm(", as_label(KnownYcol), " ~ log(", as_label(KnownXcol),"), data=.)"))) )) %>% mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~predict(.x, tibble(!!KnownXcol := knownX)))) %>% mutate(RMSE = purrr::map_dbl(.x=Model, .f = ~sqrt(mean(summary(.x)$residuals^2)) )) %>% mutate(SER = purrr::map_dbl(.x=Model, .f = ~sqrt( sum(summary(.x)$residuals^2) / (n - length(.x$coefficients)) ) )) # LM: Fit a Exponential function y = a * b^(X) as done by excel # reference: https://www.statology.org/exponential-regression-in-r/ df_exp <- dataframe %>% nest(data=colnames(dataframe)) %>% # data=colnames(dataframe) needed so nest doesn't through error if dataframe is not grouped mutate(knownXcol = as_label(KnownXcol), knownYcol = as_label(KnownYcol), knownX = knownX, ModelName = "exp") %>% mutate(Model = purrr::map(.x=data, .f = ~eval(parse(text=paste0("lm(log(", as_label(KnownYcol), ") ~", as_label(KnownXcol),", data=.)"))) )) %>% mutate(predictedY = purrr::map_dbl(.x=Model, .f = ~exp(predict(.x, tibble(!!KnownXcol := knownX)))) ) %>% mutate(RMSE = purrr::map_dbl(.x=Model, .f = ~sqrt(mean((exp(predict(.x)) - dataframe %>% pull(!!KnownYcol))^2)) )) %>% mutate(SER = purrr::map_dbl(.x=Model, .f = ~sqrt( sum((exp(predict(.x)) - dataframe %>% pull(!!KnownYcol))^2) / (n - length(.x$coefficients)) ) )) dataXcol <- dataframe %>% select(!!KnownXcol) if(is.null(df_gam) & inherits(df_nls, "try-error")) { predictionModels <- rbind( df_lm, df_pwr, df_log, df_exp) # exclude df_gam & df_nls } else if(is.null(df_gam)){ predictionModels <- rbind( df_nls, df_lm, df_pwr, df_log, df_exp) # exclude df_gam } else if(inherits(df_nls, "try-error")){ predictionModels <- rbind(df_gam, df_lm, df_pwr, df_log, df_exp) # exclude df_nls } else { predictionModels <- rbind(df_gam, df_nls, df_lm, df_pwr, df_log, df_exp) } predictionModels <- predictionModels %>% mutate(n = nrow(dataframe), PredictionType = case_when(knownX >= min(dataXcol) & knownX <= max(dataXcol) ~ "Interpolated", T ~ "Extrapolated"), ExtrapLength = case_when(PredictionType == "Extrapolated" ~ min(abs(min(dataXcol)-knownX), abs(max(dataXcol)-knownX)), T ~ 0) ) %>% arrange( if(all(PredictionType == "Extrapolated") & !is.null(preferExtrapModel)){ # if there is a preferred model when prediction is extrapolated, put that model first match(ModelName, c(preferExtrapModel, ModelName[ModelName != preferExtrapModel])) }else { # Otherwise, sort by SER # make sure SER is real number. When n=2, SER can be NA or Inf. If SER is NA or Inf, use RMSE instead if(all(is.finite(SER))) SER else RMSE } ) return(predictionModels) } # EXAMPLE CODE TO USE MODEL FOR REGRESSION # predictionModel_final <- predictionModels %>% # filter(SER == min(SER)) # # model_predictOrig <- tibble(fish_Stdz350=seq(0.05,max(prediction2000_2019$fish_Stdz350), by=0.005)) # model_predictOrig <- model_predictOrig %>% # mutate(aq_Samp=predict(predictionModel_final$Model[[1]], model_predictOrig %>% select(fish_Stdz350)))
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## Running under: Windows 10 x64 (build 22621) ## ## Matrix products: default ## ## locale: ## [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 ## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C ## [5] LC_TIME=English_United States.utf8 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] mgcv_1.8-41 nlme_3.1-160 lubridate_1.8.0 plotly_4.10.0 ## [5] readxl_1.4.1 actuar_3.3-0 NADA_1.6-1.1 forcats_0.5.2 ## [9] stringr_1.4.1 dplyr_1.0.9 purrr_0.3.4 readr_2.1.2 ## [13] tidyr_1.2.0 tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.2 ## [17] fitdistrplus_1.1-8 survival_3.4-0 MASS_7.3-58.1 ## ## loaded via a namespace (and not attached): ## [1] lattice_0.20-45 assertthat_0.2.1 digest_0.6.29 utf8_1.2.2 ## [5] R6_2.5.1 cellranger_1.1.0 backports_1.4.1 reprex_2.0.2 ## [9] evaluate_0.16 highr_0.9 httr_1.4.4 pillar_1.8.1 ## [13] rlang_1.0.5 lazyeval_0.2.2 googlesheets4_1.0.1 rstudioapi_0.14 ## [17] data.table_1.14.2 Matrix_1.5-1 splines_4.2.2 googledrive_2.0.0 ## [21] htmlwidgets_1.5.4 munsell_0.5.0 broom_1.0.1 compiler_4.2.2 ## [25] modelr_0.1.9 xfun_0.32 pkgconfig_2.0.3 htmltools_0.5.3 ## [29] tidyselect_1.1.2 fansi_1.0.3 viridisLite_0.4.1 crayon_1.5.1 ## [33] tzdb_0.3.0 dbplyr_2.2.1 withr_2.5.0 grid_4.2.2 ## [37] jsonlite_1.8.0 gtable_0.3.1 lifecycle_1.0.1 DBI_1.1.3 ## [41] magrittr_2.0.3 scales_1.2.1 cli_3.3.0 stringi_1.7.8 ## [45] fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2 generics_0.1.3 ## [49] vctrs_0.4.1 expint_0.1-7 tools_4.2.2 glue_1.6.2 ## [53] hms_1.1.2 fastmap_1.1.0 colorspace_2.0-3 gargle_1.2.0 ## [57] rvest_1.0.3 knitr_1.40 haven_2.5.1
Sys.time()
## [1] "2023-12-26 09:47:26 PST"