This report is automatically generated with the R
package knitr
(version 1.40
)
.
source("R Functions/functions_QA data.R") ### LOAD DATA ### DeltaSED <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/DeltaSED.xlsx', sheet='Surface Sediment TotHg Data', guess_max = 30000) nrow(DeltaSED) #number of rows should match the Excel file (minus the header row)
## [1] 424
### LIST COLUMNS TO BE USED, ADD USER DEFINED COLUMNS, & RENAME COLUMNS TO CEDEN STANDARDS ### #Use 1.READ ME.xlsx, 'ColumnsForR' to list & identify columns that match corresponding CEDEN Standard columns keep_cols <- c('SourceID','SourceRow','Station Name','Station #','Date','MMHg (ng/g)','THg (mg/kg)', 'Latitude','Longitude') temp_cols <- c('Citation','MMHg Qual (less than MDL)','THg Qual (Less than MDL)','Delta Subarea','Grouped Location') DeltaSED_new <- DeltaSED %>% select( c(keep_cols, temp_cols) ) %>% #DO NOT CHANGE - selects columns specified above mutate( #Add user defined columns CoordSystem = NA_character_, CitationCode = Citation, Project = Citation, SampleTime = NA_character_, LabBatch = NA_character_, LabSampleID = NA_character_, MatrixName = 'Sediment', WBT = 'Not Recorded', MethodName = NA_character_, RL = NA_real_, SampleID = NA_character_, SampleComments = paste( if_else(is.na(`Delta Subarea`), '', if_else(is.na(`Grouped Location`),paste0('In "',`Delta Subarea`,'" Delta Subarea'), paste0('In "',`Delta Subarea`,'" Delta Subarea at'))), if_else(is.na(`Grouped Location`), '', if_else(is.na(`Delta Subarea`),paste('At',`Grouped Location`,'(Not in Legal Delta)'),`Grouped Location`)) ), QACode = NA_character_, BatchVerification = NA_character_, ComplianceCode = NA_character_, CollectionComments = NA_character_, ResultsComments = NA_character_, BatchComments = NA_character_, SampleTypeCode = NA_character_ ) %>% rename( #Rename columns to CEDEN standards #CEDEN 'COLUMNNAME' = DeltaSED 'COLUMNNAME' #If a column doesnt match the CEDEN Standard, delete that column below, talk to Robin and discuss if we can add it using other information #If a column below doesn't exist; move COLUMNNAME to Mutate above using format; StationName = NA_character_ 'StationName' = 'Station Name', 'StationCode' = 'Station #', 'SampleDate' = 'Date', 'TargetLatitude' = 'Latitude', 'TargetLongitude' = 'Longitude' ) nrow(DeltaSED_new)
## [1] 424
#str(DeltaSED_new) #just to check data class of different columns - e.g., is Date column in POSIX format? ### FORMAT WIDE TO LONG ### DeltaSED_new <- DeltaSED_new %>% gather(c('MMHg (ng/g)','THg (mg/kg)'), key="Analyte", value="Result") %>% mutate( # Use units form Analyte to create a Unit Column Unit = if_else(Analyte == 'MMHg (ng/g)', 'ng/g', 'mg/Kg'), # Use Analyte to create MDL column from 'MMHg Qual (less than MDL)' & 'THg Qual (Less than MDL)' column values - based on info provided MDL = case_when(Analyte == 'MMHg (ng/g)' ~ 0.019, Analyte == 'THg (mg/kg)' ~ 0.04), # Use Analyte and 'MMHg Qual (less than MDL)' & 'THg Qual (Less than MDL)' columns to ResultQualCode column ResultQualCode = case_when(Analyte == 'MMHg (ng/g)' & grepl('<', `MMHg Qual (less than MDL)`) ~ 'ND', Analyte == 'THg (mg/kg)' & grepl('<', `THg Qual (Less than MDL)`) ~ 'ND', TRUE ~ '='), # Remove values in Result when ND Result = if_else(ResultQualCode == 'ND', NA_real_, Result), # Rename Analyte groupings to CEDEN standards Analyte = recode(Analyte, 'MMHg (ng/g)' = 'Methylmercury, Total', 'THg (mg/kg)' = 'Mercury, Total') ) nrow(DeltaSED_new)
## [1] 848
#PASTE LETTER OF COLUMN TO RESULT ID SO THE RESULT ID IS NOT DUPLICATED seqPasteLetters <- function(chr_vector){ col_letters <- LETTERS[c(13,15)] # letters corresponding to column of excel data for (i in 1:length(chr_vector)){ chr_vector[i] <- paste0(chr_vector[i], col_letters[i]) } return(chr_vector) } DeltaSED_new <- DeltaSED_new %>% mutate(tempSourceRow = SourceRow) %>% group_by(tempSourceRow) %>% mutate(SourceRow = seqPasteLetters(SourceRow)) %>% ungroup() %>% dplyr::select(-tempSourceRow) nrow(DeltaSED_new)
## [1] 848
### FORMAT COLUMN PARAMETERS ### # Standardize MatrixName Groups - "Water", "Sediment", "Soil" # unique(DeltaSED_new$MatrixName) #Identifies OLDNAMES
## [1] "Sediment"
# [1] "Sediment" - no changes necessary # Standardize WBT unique(DeltaSED_new$WBT)
## [1] "Not Recorded"
# [1] NA - no WBT yet - may need GIS to figure what WBT is. # Standardize Analyte Groups - "Mercury, Total", "Mercury, Dissolved", "Mercury, Suspended", & same for Methylmercury # unique(DeltaSED_new$Analyte) #IF ANALYTE IS IN 2 COLUMNS e.g., (Hg & MeHg); (Total, Dissolved, Suspended) SEE WQP SCRIPT FOR EXAMPLE
## [1] "Methylmercury, Total" "Mercury, Total"
# [1] "Methylmercury, Total" "Mercury, Total" - formatted when converting wide to long data format # Standardize ResultQualCode Groups - "ND", "DNQ", NA# unique(DeltaSED_new$ResultQualCode) #Identifies OLDNAMES
## [1] "=" "ND"
# [1] NA "ND" - formatted when converting wide to long data format # Format Result Column to Numeric# class(DeltaSED_new$Result)
## [1] "numeric"
# [1] "numeric" - no changes necessary # Format MDL Column to Numeric# class(DeltaSED_new$MDL)
## [1] "numeric"
# [1] "numeric" - no changes necessary # Check if Result & ResultQualCode Columns both equal <NA> - these rows have no useful information nrow(DeltaSED_new) #Number rows before
## [1] 848
#CODE BELOW REQUIRES USER TROUBLESHOOTING DEPENDING ON AVAILABLE COLUMNS AND SPREADSHEET SPECIFIC CONDITIONS# DeltaSED_new <- DeltaSED_new %>% #Set 0 & negative values as blank mutate(Result = ifelse(Result <= 0, NA_real_, Result), MDL = ifelse(MDL <= 0, NA_real_, MDL), RL = ifelse(RL <= 0, NA_real_, RL)) na_results <- DeltaSED_new %>% #Record rows where Result, MDL, & RL all equal <NA> filter( is.na(Result) & is.na(MDL) & is.na(RL) ) #View(na_results) DeltaSED_new <- anti_join(DeltaSED_new, na_results, by='SourceRow') #returns rows from DeltaSED_new not matching values in no_result nrow(DeltaSED_new) #Number rows after
## [1] 848
# Format Units Column - "ng/L", "mg/Kg" unique(DeltaSED_new$Unit) #Identifies OLDNAMES
## [1] "ng/g" "mg/Kg"
DeltaSED_new <- DeltaSED_new %>% standardizeUnits unique(DeltaSED_new$Unit)
## [1] "mg/Kg"
# Format Date and Time Column # # NEED TO TALK ABOUT HOW WE WANT TO DO THIS - To graph in R we need Date and Time in same column # THE EXAMPLE CODE BELOW ASSUMES DATE AND TIME ARE IN SEPERATE COLUMNS DeltaSED_new <- DeltaSED_new %>% #rowise() %>% # rowise is very slow - so used sapply to make this a rowise operation mutate( #If SampleDate & CollectioTIme are not in Character format by defualt, turn it into a character class so it exports better SampleDate = ifelse(sapply(SampleDate, is.character), SampleDate, as.character(as.Date(SampleDate))), SampleTime = ifelse(sapply(SampleTime, is.character), SampleTime, format(lubridate::ymd_hms(SampleTime), "%H:%M:%S")), #COMBINE DATE AND TIME INTO SampleDateTime COLUMN SampleDateTime = ifelse(!is.na(SampleDate), ifelse(!is.na(SampleTime), paste(SampleDate, SampleTime), paste(SampleDate, '00:00:00')), NA), #FORMAT SampleDateTime COLUMN TO DATE FORMAT SampleDateTime = lubridate::ymd_hms(SampleDateTime) ) ### REMOVE TEMPORARY COLUMNS ### DeltaSED_new <- DeltaSED_new %>% select(-one_of(temp_cols)) #Remove temp columns since they are no longer needed #View(DeltaSED_new) ## SAVE FORMATTED DATA AS EXCEL FILE ## writexl::write_xlsx(DeltaSED_new, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/DeltaSED_ceden_format.xlsx') # In excel, to convert SampleDate column to Date format # 1 - Select the date column. # 2 - Go to the Data-tab and choose "Text to Columns". # 3 - On the first screen, leave radio button on "delimited" and click Next. # 4 - Unselect any delimiter boxes (everything blank) and click Next. # 5 - Under column data format choose Date, select YMD # 6 - Click Finish.
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## Running under: Windows 10 x64 (build 22621) ## ## Matrix products: default ## ## locale: ## [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=C ## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C ## [5] LC_TIME=English_United States.utf8 ## system code page: 65001 ## ## attached base packages: ## [1] grid stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] mgcv_1.8-41 nlme_3.1-160 lubridate_1.8.0 plotly_4.10.0 ## [5] readxl_1.4.1 actuar_3.3-0 NADA_1.6-1.1 forcats_0.5.2 ## [9] stringr_1.4.1 dplyr_1.0.9 purrr_0.3.4 readr_2.1.2 ## [13] tidyr_1.2.0 tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.2 ## [17] fitdistrplus_1.1-8 survival_3.4-0 MASS_7.3-58.1 ## ## loaded via a namespace (and not attached): ## [1] httr_1.4.4 jsonlite_1.8.0 viridisLite_0.4.1 splines_4.2.2 ## [5] modelr_0.1.9 assertthat_0.2.1 highr_0.9 googlesheets4_1.0.1 ## [9] cellranger_1.1.0 yaml_2.3.5 pillar_1.8.1 backports_1.4.1 ## [13] lattice_0.20-45 glue_1.6.2 digest_0.6.29 rvest_1.0.3 ## [17] colorspace_2.0-3 htmltools_0.5.3 Matrix_1.5-1 pkgconfig_2.0.3 ## [21] broom_1.0.1 haven_2.5.1 webshot_0.5.3 scales_1.2.1 ## [25] tzdb_0.3.0 googledrive_2.0.0 generics_0.1.3 ellipsis_0.3.2 ## [29] withr_2.5.0 lazyeval_0.2.2 cli_3.3.0 magrittr_2.0.3 ## [33] crayon_1.5.1 evaluate_0.16 fs_1.5.2 fansi_1.0.3 ## [37] xml2_1.3.3 tools_4.2.2 data.table_1.14.2 hms_1.1.2 ## [41] expint_0.1-7 gargle_1.2.0 lifecycle_1.0.1 munsell_0.5.0 ## [45] reprex_2.0.2 writexl_1.4.0 compiler_4.2.2 rlang_1.0.5 ## [49] rstudioapi_0.14 htmlwidgets_1.5.4 crosstalk_1.2.0 rmarkdown_2.16 ## [53] gtable_0.3.1 DBI_1.1.3 R6_2.5.1 knitr_1.40 ## [57] fastmap_1.1.0 utf8_1.2.2 stringi_1.7.8 vctrs_0.4.1 ## [61] dbplyr_2.2.1 tidyselect_1.1.2 xfun_0.32
Sys.time()
## [1] "2023-12-29 14:23:37 PST"