This report is automatically generated with the R
package knitr
(version 1.40
)
.
source("R Functions/functions_QA data.R") ### LOAD DATA ### - need to set WD to aqueous folder for Riginal file #FIND AND REPLACE (Ctrl+F) 'WORKSHEEET' WITH MORE APPROPRIATE NAME (e.g., 'CALFED_data') #CHANGE 'FILENAME' & 'SHEETNAME' WITH ACTUAL NAMES SNIPFISH <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/SNIP_Riginal.xlsx', sheet='SNIP_MasterDatabase_Final_HgRec', guess_max = 30000) nrow(SNIPFISH) #number of rows should match the Excel file (minus the header row)
## [1] 39729
### LIST COLUMNS TO BE USED, ADD USER DEFINED COLUMNS, & RENAME COLUMNS TO CEDEN STANDARDS ### #Use 1.READ ME.xlsx, 'ColumnsForR' to list & identify columns that match corresponding CEDEN Standard columns keep_cols <- c('SourceID','SourceRow','StationName','StationCode','TargetLatitude','TargetLongitude','SampleDate','CollectionTime','QACode', 'NumberFishPerComp','Analyte','Unit','Result','MDL','RL','ResultQualCode','Datum','Program','ParentProject','Project', 'CompositeCommonName','CompositeFinalID','CompositeTissueName','MethodName','Weight','Length', 'SampleID', 'CompositeCompositeID','WaterBodyType','SampleComments','SubmissionCode', 'BatchVerification', 'ComplianceCode' ) temp_cols <- c('SampleYear') #Include columns that do not match CEDEN standards but may be useful (e.g., Unit columns for MDL & RL) #temp_cols are removed before the data is merged with other datasets SNIPFISH_new <- SNIPFISH %>% select( c(keep_cols,temp_cols) ) %>% #DO NOT CHANGE - selects columns specified above rename( #Rename SNIPFISH columns to CEDEN format here: CEDEN 'COLUMNNAME' = SNIPFISH 'COLUMNNAME' #DELTE COLUMN NAMES THAT DO NOT HAVE AN EQUIVALENT COLUMN IN THE SNIPFISH 'SampleTime' = 'CollectionTime', 'CoordSystem' = 'Datum', 'ProgramName' = 'Program', 'ParentProjectName' = 'ParentProject', 'ProjectName' = 'Project', 'CommonName' = 'CompositeCommonName', 'TaxonomicName' = 'CompositeFinalID', 'TissueName' = 'CompositeTissueName', 'Method' = 'MethodName', 'WeightAvg g' = 'Weight', #need to make sure column weight units are all in grams 'TLAvgLength mm' = 'Length', #need to make sure column Lengthunits are all in mm 'CompositeRowID' = 'CompositeCompositeID', 'WBT' = 'WaterBodyType', 'ResultComments' = 'SampleComments', 'LabSubmissionCode' = 'SubmissionCode' ) %>% mutate( #Create Missing column or modify existing column here: CEDEN COLUMNNAME = 'SPECIFIED VALUE' or FUNCTION #DELTE COLUMN NAMES THAT DO NOT NEED TO BE CHANGED CitationCode = 'SNIPFISH', ProjectCode = NA_character_, CompositeID = NA_character_, `TLMin mm` = NA_real_, `TLMax mm` = NA_real_ ) nrow(SNIPFISH_new)
## [1] 39729
#str(SNIPFISH_new) #just to check data class of different columns - e.g., is Date column in POSIX format? #View(SNIPFISH_new) ### FORMAT COLUMN PARAMETERS ### # Standardize TissueName Groups - "Fillet" & "Whole Body" # unique(SNIPFISH_new$TissueName)
## [1] NA ## [2] "fillet" ## [3] "Whole Body" ## [4] "Fillet" ## [5] "Organism, whole" ## [6] "Liver" ## [7] "Gills" ## [8] "Brain" ## [9] "Kidney" ## [10] "whole body" ## [11] "Fillet skin-off" ## [12] "FILLET" ## [13] "WHOLE BODY" ## [14] "Whole body" ## [15] "Unknown" ## [16] "soft tissue (e.g. clams) with gonads intact" ## [17] "Soft Tissue" ## [18] "whole without Head, Tail and Guts"
SNIPFISH_new <- SNIPFISH_new %>% mutate(TissueName = recode(TissueName, "fillet" = "Fillet", "FILLEt" = "Fillet", "Fillet skin-off" = "Fillet", "Organism, whole" = "Whole Body", "WHOLE BODY" = "Whole Body", "whole body" = "Whole Body", "Whole body" = "Whole Body" ) ) %>% filter(TissueName %in% c('Fillet','Whole Body')) unique(SNIPFISH_new$TissueName)
## [1] "Fillet" "Whole Body"
# Standardize WBT (WaterBodyType) Groups - "River/Stream", "Drain/Canal", "Wetland", "Spring", "Slough", # "Pond", "Lake/Reservoir", "Delta", "Forebay/Afterbay", "Not Recorded" # unique(SNIPFISH_new$WBT)
## [1] "Reservoir/Lake" "River/Creek" "river/Creek" "Hatchery" "Delta"
SNIPFISH_new <- SNIPFISH_new %>% filter(WBT %are not% c("Hatchery")) %>% mutate(WBT = recode(WBT, "river/Creek" = "River/Stream", "River/Creek" = "River/Stream", "Reservoir/Lake" = "Lake/Reservoir" #Jennie confirmed some Lake/Reservoirs are in scope )) unique(SNIPFISH_new$WBT)
## [1] "Lake/Reservoir" "River/Stream" "Delta"
# Filter to include or exclude specified sites with pond or lake in name that are not part of rivers or in delta includedSites <- c('"old F&G pond",238', "Cachement Basin", "Cachment basin", "Central Pond,222", "Greens Lake,226", "PG&E pond,223", "seasonal flooded pond,245", "Shag Slough") # list of sites to be included that have lake/pond in name but are part of a river or delta includedSites %in% SNIPFISH_new$StationName #Test to see if these sites are present
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
#^ all false so don't need to include any code to make sure these sites are not excluded based on WBT SNIPFISH_new <- SNIPFISH_new %>% filter(WBT %in% c("River/Stream", "Delta")) nrow(SNIPFISH_new)
## [1] 4334
# Standardize ResultQualCode Groups - "ND", "DNQ", NA# unique(SNIPFISH_new$ResultQualCode) #Identifies OLDNAMES
## [1] "=" "ND" "DNQ" "<"
SNIPFISH_new <- SNIPFISH_new %>% mutate(ResultQualCode = recode(ResultQualCode, "<" = "DNQ" ) ) unique(SNIPFISH_new$ResultQualCode) #New naming structure for ResultQualCode Groupings
## [1] "=" "ND" "DNQ"
# Format Result Column to Numeric# # Check column for text - based on text user needs to decide what to do if(!is.numeric(SNIPFISH_new$Result)){ old <-SNIPFISH_new$Result new <-SNIPFISH_new$Result new[grepl('<|[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "~explain here~.\n")) #SNIPFISH_new <- SNIPFISH_new %>% # mutate( #Due stuff to prep column to be converted to Numeric # Result = as.numeric(new) # ) } else { cat("'Result' column converted to numeric format\n") SNIPFISH_new$Result <- as.numeric(SNIPFISH_new$Result)}
## 'Result' column converted to numeric format
# Format MDL Column to Numeric# # Check column for text - based on text user needs to decide what to do if(!is.numeric(SNIPFISH_new$MDL)){ old <-SNIPFISH_new$MDL new <-SNIPFISH_new$MDL new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "~explain here~.\n")) #SNIPFISH_new <- SNIPFISH_new %>% # mutate( #Do stuff to prep column to be converted to Numeric # MDL = as.numeric(new) # ) } else { cat("'MDL' column converted to numeric format\n") SNIPFISH_new$MDL <- as.numeric(SNIPFISH_new$MDL)}
## 'MDL' column converted to numeric format
if(any(grepl('[a-df-zA-DF-Z]', SNIPFISH_new$RL))){ old <-SNIPFISH_new$RL new <-SNIPFISH_new$RL new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "~explain here~.\n")) #SNIPFISH_new <- SNIPFISH_new %>% # mutate( #Do stuff to prep column to be converted to Numeric # RL = as.numeric(new) # ) } else { cat("'RL' column converted to numeric format\n") SNIPFISH_new$RL <- as.numeric(SNIPFISH_new$RL)}
## 'RL' column converted to numeric format
#str(SNIPFISH_new) # Check if Result, MDL, & RL Columns all equal <NA> or 0 - these rows have no useful information nrow(SNIPFISH_new) #Number rows before
## [1] 4334
#CODE BELOW REQUIRES USER TROUBLESHOOTING DEPENDING ON AVAILABLE COLUMNS AND SPREADSHEET SPECIFIC CONDITIONS# SNIPFISH_new <- SNIPFISH_new %>% #Set 0 & negative values as blank mutate(Result = ifelse(Result <= 0, NA_real_, Result), MDL = ifelse(MDL <= 0, NA_real_, MDL), RL = ifelse(RL <= 0, NA_real_, RL)) na_results <- SNIPFISH_new %>% #Record rows where Result, MDL, & RL all equal <NA> filter( is.na(Result) & is.na(MDL) & is.na(RL) ) #View(na_results) SNIPFISH_new <- anti_join(SNIPFISH_new, na_results, by='SourceRow') #returns rows from SNIPFISH_new not matching values in no_result nrow(SNIPFISH_new) #Number rows after
## [1] 4333
# Standardize Analyte Groups - "Mercury, Total" (we consider Total Mercury and Methylmercury to be approx equal) unique(SNIPFISH_new$Analyte)
## [1] "Mercury, Total" ## [2] "Mercury, Methyl" ## [3] "Mercury" ## [4] "Mercury, biota, tissue, recoverable, dry weight, milli" ## [5] "Mercury, biota, tissue, recoverable, dry weight, micrograms per gram" ## [6] "Mercury, biota, tissue, recoverable, wet weight, milligrams per kilogram" ## [7] "MERCURY, TOTAL IN FISH (PPM,WET WEIGHT BASIS); TISSUE, WET WEIGHT; MG/KG"
SNIPFISH_new <- SNIPFISH_new %>% mutate( Analyte = recode(Analyte, "Mercury" = "Mercury, Total", "Mercury, biota, tissue, recoverable, wet weight, milligrams per kilogram" = "Mercury, Total", "MERCURY, TOTAL IN FISH (PPM,WET WEIGHT BASIS); TISSUE, WET WEIGHT; MG/KG" = "Mercury, Total", "Mercury, biota, tissue, recoverable, dry weight, micrograms per gram" = "Mercury, Total dw", "Mercury; ng/g dw" = "Mercury, Total dw", "Mercury, biota, tissue, recoverable, dry weight, milli" = "Mercury, Total dw") ) unique(SNIPFISH_new$Analyte) #New naming structure for Analyte Groupings; keep "dw" in analyte so it can be added to Unit column later
## [1] "Mercury, Total" "Mercury, Methyl" "Mercury, Total dw"
unique(paste(SNIPFISH_new$Analyte, SNIPFISH_new$Unit, sep='; '))
## [1] "Mercury, Total; µg/g ww" ## [2] "Mercury, Methyl; ppb dry wt" ## [3] "Mercury, Methyl; ppb wet wt" ## [4] "Mercury, Total; ppm wet wt" ## [5] "Mercury, Total; ppb wet wt" ## [6] "Mercury, Total dw; mg/kg" ## [7] "Mercury, Total; ppm wet wt (adj for whole body analysis 1.62 x orig hg cnc)" ## [8] "Mercury, Total; ppm dry wt" ## [9] "Mercury, Total dw; µg/g dw" ## [10] "Mercury, Total; mg/kg" ## [11] "Mercury, Total; µg/g dw" ## [12] "Mercury, Methyl; ng/g dw" ## [13] "Mercury, Total; ng/g dw" ## [14] "Mercury, Methyl; ng/g ww" ## [15] "Mercury, Total; ng/g ww" ## [16] "Mercury, Methyl; ppm dry wt" ## [17] "Mercury, Methyl; ppm wet wt"
# Format Units Column - "mg/Kg ww" unique(SNIPFISH_new$Unit) #Identifies OLDNAMES
## [1] "µg/g ww" ## [2] "ppb dry wt" ## [3] "ppb wet wt" ## [4] "ppm wet wt" ## [5] "mg/kg" ## [6] "ppm wet wt (adj for whole body analysis 1.62 x orig hg cnc)" ## [7] "ppm dry wt" ## [8] "µg/g dw" ## [9] "ng/g dw" ## [10] "ng/g ww"
# If more than 1 unit colmn exists (e.g., for RL and MDL columns) see WQP script for example on merging into 1 column SNIPFISH_new <- SNIPFISH_new %>% standardizeUnits(pp = "mass") unique(SNIPFISH_new$Unit)
## [1] "mg/Kg ww" ## [2] "mg/Kg dry wt" ## [3] "mg/Kg wet wt" ## [4] "mg/Kg" ## [5] "mg/Kg wet wt (adj for whole body analysis 1.62 x orig hg cnc)" ## [6] "mg/Kg dw"
# simplify "dry wt" & "wet wt" in Unit name to "dw" and "ww" SNIPFISH_new <- SNIPFISH_new %>% mutate(Unit = case_when(grepl('wet', Unit) ~ sub('[^mg/Kg].*$', ' ww', Unit), #exclude mg/Kg "[^mg/Kg]", include all characters at end ".*$" grepl('dry', Unit) ~ sub('[^mg/Kg].*$', ' dw', Unit), TRUE ~ Unit ) ) unique(SNIPFISH_new$Unit)
## [1] "mg/Kg ww" "mg/Kg dw" "mg/Kg"
unique(paste(SNIPFISH_new$Analyte, SNIPFISH_new$Unit, sep='; '))
## [1] "Mercury, Total; mg/Kg ww" "Mercury, Methyl; mg/Kg dw" ## [3] "Mercury, Methyl; mg/Kg ww" "Mercury, Total dw; mg/Kg" ## [5] "Mercury, Total; mg/Kg dw" "Mercury, Total dw; mg/Kg dw" ## [7] "Mercury, Total; mg/Kg"
#if the analyte is "Mercury, Total dw" append dw onto unit if it's not already there SNIPFISH_new <- SNIPFISH_new %>% mutate( Unit = if_else(Analyte == 'Mercury, Total dw' & Unit == 'mg/Kg', paste(Unit, 'dw'), Unit), #assume non-specified mg/kg is ww Unit = if_else(grepl('dw|ww', Unit), Unit, paste(Unit, 'ww')), Analyte = "Mercury, Total" ) unique(SNIPFISH_new$Unit) #New naming structure for Units
## [1] "mg/Kg ww" "mg/Kg dw"
unique(paste(SNIPFISH_new$Analyte, SNIPFISH_new$Unit, sep='; '))
## [1] "Mercury, Total; mg/Kg ww" "Mercury, Total; mg/Kg dw"
# Format Date and Time Column # # THE EXAMPLE CODE BELOW ASSUMES DATE AND TIME ARE IN SAME COLUMNS - IF TIME IS IN SEPERATE COLUMN LOOK AT AQ LINKAGE DATA TEMPLATE SNIPFISH_new <- SNIPFISH_new %>% mutate( SampleDate = ifelse(is.na(SampleDate), paste0('1/1/',SampleYear), SampleDate) ) SNIPFISH_new <- SNIPFISH_new %>% mutate( SampleDate = recode(SampleDate, "Sept-Oct 2002" ="9/15/2002", '2000-2001' = '1/1/2001' )) SNIPFISH_new <- SNIPFISH_new %>% mutate(SampleDate = lubridate::parse_date_time(SampleDate, orders = c("mdy")) #had to break this into a seperate mutate for it to work properly ) length(SNIPFISH_new$SourceRow[is.na(SNIPFISH_new$SampleDate)]) #this needs to be 0
## [1] 0
##STOP## length(SNIPFISH_new$SourceRow[is.na(SNIPFISH_new$SampleTime)])
## [1] 0
SNIPFISH_new <- SNIPFISH_new %>% mutate(SampleTime = if_else(SampleTime == '9999', NA_character_, SampleTime) , SampleTime = lubridate::parse_date_time(SampleTime, orders = c("HM", "HMS", "mdyHMS")) ) SNIPFISH_new<- SNIPFISH_new %>% mutate( #If SampleDate & CollectioTime are not in Character format by defualt, turn it into a character class so it exports better SampleDate = ifelse(sapply(SampleDate, is.character), SampleDate, as.character(as.Date(SampleDate))), SampleTime = ifelse(sapply(SampleTime, is.character), SampleTime, format(lubridate::ymd_hms(SampleTime), "%H:%M:%S")), #COMBINE DATE AND TIME INTO SampleDateTime COLUMN SampleDateTime = if_else(!is.na(SampleTime), paste(SampleDate, SampleTime), paste(SampleDate, '00:00:00')), #FORMAT SampleDateTime COLUMN TO DATE FORMAT SampleDateTime = lubridate::ymd_hms(SampleDateTime) ) ### REMOVE TEMPORARY COLUMNS ### SNIPFISH_new <- SNIPFISH_new %>% select(-one_of(temp_cols)) #Remove temp columns since they are no longer needed #View(SNIPFISH_new) ## SAVE FORMATTED DATA AS EXCEL FILE ## writexl::write_xlsx(SNIPFISH_new, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/SNIPFISH_ceden_format.xlsx') # In excel, to convert SampleDate column to Date format # 1 - Select the date column. # 2 - Go to the Data-tab and choose "Text to Columns". # 3 - On the first screen, leave radio button on "delimited" and click Next. # 4 - Unselect any delimiter boxes (everything blank) and click Next. # 5 - Under column data format choose Date, select YMD # 6 - Click Finish.
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## Running under: Windows 10 x64 (build 22621) ## ## Matrix products: default ## ## locale: ## [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 ## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C ## [5] LC_TIME=English_United States.utf8 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] lubridate_1.8.0 plotly_4.10.0 readxl_1.4.1 actuar_3.3-0 ## [5] NADA_1.6-1.1 forcats_0.5.2 stringr_1.4.1 dplyr_1.0.9 ## [9] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0 tibble_3.1.8 ## [13] ggplot2_3.3.6 tidyverse_1.3.2 fitdistrplus_1.1-8 survival_3.4-0 ## [17] MASS_7.3-58.1 ## ## loaded via a namespace (and not attached): ## [1] lattice_0.20-45 assertthat_0.2.1 digest_0.6.29 utf8_1.2.2 ## [5] R6_2.5.1 cellranger_1.1.0 backports_1.4.1 reprex_2.0.2 ## [9] evaluate_0.16 highr_0.9 httr_1.4.4 pillar_1.8.1 ## [13] rlang_1.0.5 lazyeval_0.2.2 googlesheets4_1.0.1 rstudioapi_0.14 ## [17] data.table_1.14.2 Matrix_1.5-1 splines_4.2.2 googledrive_2.0.0 ## [21] htmlwidgets_1.5.4 munsell_0.5.0 broom_1.0.1 compiler_4.2.2 ## [25] modelr_0.1.9 xfun_0.32 pkgconfig_2.0.3 htmltools_0.5.3 ## [29] tidyselect_1.1.2 fansi_1.0.3 viridisLite_0.4.1 crayon_1.5.1 ## [33] tzdb_0.3.0 dbplyr_2.2.1 withr_2.5.0 grid_4.2.2 ## [37] jsonlite_1.8.0 gtable_0.3.1 lifecycle_1.0.1 DBI_1.1.3 ## [41] magrittr_2.0.3 scales_1.2.1 writexl_1.4.0 cli_3.3.0 ## [45] stringi_1.7.8 fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2 ## [49] generics_0.1.3 vctrs_0.4.1 expint_0.1-7 tools_4.2.2 ## [53] glue_1.6.2 hms_1.1.2 fastmap_1.1.0 colorspace_2.0-3 ## [57] gargle_1.2.0 rvest_1.0.3 knitr_1.40 haven_2.5.1
Sys.time()
## [1] "2024-01-05 11:03:15 PST"