This report is automatically generated with the R package knitr (version 1.40) .

source("R Functions/functions_QA data.R")

### LOAD DATA ###
CALFED2003_1a <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/CALFED2003-1a.xls', sheet='Load Calcs_workingWide', guess_max = 30000)
nrow(CALFED2003_1a) #number of rows should match the Excel file (minus the header row)
## [1] 145
#str(CALFED2003_1a)


### FORMAT DATA FROM WIDE TO LONG ###
#View(CALFED2003_1a)
#remove flow and calculated load columns
CALFED2003_1a <- CALFED2003_1a %>%
  select(-Flow, -`THg Loading`:-`DMMHg Loading`)
#slice off first row that has units
units <- CALFED2003_1a %>% #saving units to new variable
  slice(1) %>%
  select(THg:SO4) %>% #select columns that have units
  gather(key="Analyte", value="Unit") #format from wide to long
CALFED2003_1a <- CALFED2003_1a %>%
  slice(-1) #remove 1st row with units from main data
#CALFED wide to long
CALFED2003_1a <- CALFED2003_1a %>%
  gather(THg:SO4,
         key="Analyte", value="Result")
#PASTE LETTER OF COLUMN TO RESULT ID SO THE RESULT ID IS NOT DUPLICATED
seqPasteLetters <- function(chr_vector){
  col_letters <- LETTERS[c(6:15,17)] # letters corresponding to column of excel data - excludes Flow column
  for (i in 1:length(chr_vector)){
    chr_vector[i] <- paste0(chr_vector[i], col_letters[i])
  }
  return(chr_vector)
}
CALFED2003_1a <- CALFED2003_1a %>%
  mutate(tempSourceRow = SourceRow) %>%
  group_by(tempSourceRow) %>%
  mutate(SourceRow = seqPasteLetters(SourceRow)) %>%
  ungroup() %>%
  dplyr::select(-tempSourceRow)
#Add Units back in
CALFED2003_1a_new <- left_join(CALFED2003_1a, units, by="Analyte")

#View(CALFED2003_1a_new)



### FORMAT COLUMN PARAMETERS ###
  # Format Result Column to Numeric#
  # Check column for text - based on text user needs to decide what to do
if(any(grepl('<|[a-df-zA-DF-Z]', CALFED2003_1a_new$Result))){
  old <- CALFED2003_1a_new$Result
  new <- CALFED2003_1a_new$Result
  new[grepl('<|[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005"
  #Print what text was found and what is being done
  cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)),
             ".\nACTIONS TAKEN:\n",
             "Values with 'U' or '<' were recorded in MDL column, 'ND' in ResultQualCode column, and 'NA' in Result column.\n",
             "Result with ND (only @ Pospect Slough for DMMHg on 7/19/00), 'ND' recorded in ResultQualCode column 'NA' in Result column, and 0.022 in MDL from data_MMHg_QA.xlsx\n",
             "Results with 'broken' and 'reservoir dry' were replaced with 'NA' in Result column.\n"))
  CALFED2003_1a_new <- CALFED2003_1a_new %>%
    mutate(
      MDL = case_when( grepl('U|<', Result) ~ gsub('U|<', '', Result)),
      ResultQualCode = case_when( grepl('U|<|nd', Result) ~ 'ND',
                                  TRUE ~ '='),
      MDL = case_when(grepl('^nd$', Result) ~ "0.022", #this ND only occurs once -  Pospect Slough for DMMHg on 7/19/00 and MDL was found to be 0.022 from data_MMHg_QA.xlsx)
                      TRUE ~ MDL),
      Result =  case_when( grepl('<|[a-df-zA-DF-Z]', Result) ~ NA_character_,
                           TRUE ~ Result),
      Result = as.numeric(Result)
      )
} else {
  cat("'Result' column converted to numeric format\n")
  CALFED2003_1a_new$Result <- as.numeric(CALFED2003_1a_new$Result)
  }
## 'Result' column should be numeric but some cells contain broken, 0.022U, 0.014U, 0.013U, 0.007U, nd, leak in State Water Project lift Canal, reservoir dry, <1.0, <1, and <2.0.
## ACTIONS TAKEN:
## Values with 'U' or '<' were recorded in MDL column, 'ND' in ResultQualCode column, and 'NA' in Result column.
## Result with ND (only @ Pospect Slough for DMMHg on 7/19/00), 'ND' recorded in ResultQualCode column 'NA' in Result column, and 0.022 in MDL from data_MMHg_QA.xlsx
## Results with 'broken' and 'reservoir dry' were replaced with 'NA' in Result column.
  # Format MDL Column to Numeric #
CALFED2003_1a_new$MDL <- as.numeric(CALFED2003_1a_new$MDL)


  # Check if Result, MDL, & RL Columns all equal <NA> or 0 - these rows have no useful information
nrow(CALFED2003_1a_new) #Number rows before
## [1] 1584
#CODE BELOW REQUIRES USER TROUBLESHOOTING DEPENDING ON AVAILABLE COLUMNS AND SPREADSHEET SPECIFIC CONDITIONS#
CALFED2003_1a_new <- CALFED2003_1a_new %>%
  #Set 0 & negative values as blank
  mutate(Result = ifelse(Result <= 0, NA_real_, Result),
         MDL = ifelse(MDL <= 0, NA_real_, MDL))
na_results <- CALFED2003_1a_new %>% #Record rows where Result, MDL, & RL all equal <NA> or 0
  filter( is.na(Result) & is.na(MDL) )
#View(na_results)
CALFED2003_1a_new <- anti_join(CALFED2003_1a_new, na_results, by='SourceRow') #returns rows from CALFED2003_1a_new not matching values in no_result
nrow(CALFED2003_1a_new) #Number rows after
## [1] 1223
  # Format Analyte Column
unique(CALFED2003_1a_new$Analyte)
##  [1] "THg"        "DHg"        "TMMHg"      "DMMHg"      "field EC"   "field Temp"
##  [7] "TSS"        "VSS"        "Chlorophyl" "Phaeoph"    "SO4"
#Change Mercury but deciding on Ancillary consituent naming later
CALFED2003_1a_new <- CALFED2003_1a_new %>%
  mutate(Analyte = recode(Analyte,
                      "THg" = "Mercury, Total",
                      "DHg" = "Mercury, Dissolved",
                      "TMMHg" = "Methylmercury, Total",
                      "DMMHg" = "Methylmercury, Dissolved"
                      )
  )

  # Format Date and Time Column #
# NEED TO TALK ABOUT HOW WE WANT TO DO THIS - To graph in R we need Date and Time in same column
# No TIme column so use 00:00:00 as default
CALFED2003_1a_new <- CALFED2003_1a_new %>%
  #rowise() %>%    # rowise is very slow - so used sapply to make this a rowise operation
  mutate(
    #If SampleDate & CollectioTIme are not in Character format by defualt, turn it into a character class so it exports better
    SampleDate = ifelse(sapply(SampleDate, is.character), SampleDate, as.character(as.Date(SampleDate))),
    #COMBINE DATE AND TIME INTO SampleDateTime COLUMN
    SampleDateTime = paste(SampleDate, '00:00:00'),
    #FORMAT SampleDateTime COLUMN TO DATE FORMAT
    SampleDateTime = lubridate::ymd_hms(SampleDateTime)
  )


### ADD USER DEFINED & MISSING COLUMNS NAMED TO CEDEN STANDARDS ###
CALFED2003_1a_new <- CALFED2003_1a_new %>%
  mutate(              #Add user defined columns
    CitationCode = 'CALFED_2003_1a',  #Check with Jennie
    CoordSystem = NA_character_,
    Project = 'CALFED 2003 Task1a',
    StationCode = NA_character_,
    SampleTime = NA_character_,
    LabBatch = NA_character_,
    LabSampleID = NA_character_,
    MatrixName = 'Aqueous',
    WBT = case_when(grepl('River|Creek', StationName) ~ 'River/Stream',
                    grepl('Slough', StationName) ~ 'Slough',
                    grepl('Canal', StationName) ~ 'Drain/Canal',
                    grepl('State Water Project', StationName) ~ 'Drain/Canal',
                    grepl('X2', StationName) ~ 'Delta'
                    ),
    MethodName = NA_character_,
    RL= NA_real_,
    SampleID = NA_character_,
    QACode = NA_character_,
    BatchVerification = NA_character_,
    ComplianceCode = NA_character_,
    SampleComments =NA_character_,
    CollectionComments = NA_character_,
    ResultsComments = NA_character_,
    BatchComments = NA_character_,
    SampleTypeCode = NA_character_
  )
#View(CALFED2003_1a_new)


### SPLIT ANCILLARY DATA AND MERCURY DATA INTO SEPERATE DATAFRAMES & STANDARDIZE UNITS FOR EACH ###
CALFED2003_1a_mercury <- CALFED2003_1a_new %>%
  filter(grepl('mercury', Analyte, ignore.case=T))

unique(CALFED2003_1a_mercury$Unit)
## [1] "ng/l"
CALFED2003_1a_mercury <- CALFED2003_1a_mercury %>%
  # Format Units Column - "ng/L", "mg/Kg"
  standardizeUnits
unique(CALFED2003_1a_mercury$Unit)
## [1] "ng/L"
CALFED2003_1a_ancillary <- CALFED2003_1a_new %>%
  filter(!grepl('mercury', Analyte, ignore.case=T))

nrow(CALFED2003_1a_mercury) + nrow(CALFED2003_1a_ancillary) == nrow(CALFED2003_1a_new)  #needs to be true
## [1] TRUE
## SAVE FORMATTED DATA AS EXCEL FILE ##
#Save Mercury data
writexl::write_xlsx(CALFED2003_1a_mercury, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Aqueous/CALFED2003-1aMerc_ceden_format.xlsx')

#Save Ancillary data
writexl::write_xlsx(CALFED2003_1a_ancillary, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Ancillary/CALFED2003-1aAncillary_ceden_format.xlsx')
# In excel, to convert SampleDate column to Date format
# 1 - Select the date column.
# 2 - Go to the Data-tab and choose "Text to Columns".
# 3 - On the first screen, leave radio button on "delimited" and click Next.
# 4 - Unselect any delimiter boxes (everything blank) and click Next.
# 5 - Under column data format choose Date, select YMD
# 6 - Click Finish.

The R session information (including the OS info, R version and all packages used):

    sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8  LC_CTYPE=C                            
## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## system code page: 65001
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] mgcv_1.8-41        nlme_3.1-160       lubridate_1.8.0    plotly_4.10.0     
##  [5] readxl_1.4.1       actuar_3.3-0       NADA_1.6-1.1       forcats_0.5.2     
##  [9] stringr_1.4.1      dplyr_1.0.9        purrr_0.3.4        readr_2.1.2       
## [13] tidyr_1.2.0        tibble_3.1.8       ggplot2_3.3.6      tidyverse_1.3.2   
## [17] fitdistrplus_1.1-8 survival_3.4-0     MASS_7.3-58.1     
## 
## loaded via a namespace (and not attached):
##  [1] httr_1.4.4          jsonlite_1.8.0      viridisLite_0.4.1   splines_4.2.2      
##  [5] modelr_0.1.9        assertthat_0.2.1    highr_0.9           googlesheets4_1.0.1
##  [9] cellranger_1.1.0    yaml_2.3.5          pillar_1.8.1        backports_1.4.1    
## [13] lattice_0.20-45     glue_1.6.2          digest_0.6.29       rvest_1.0.3        
## [17] colorspace_2.0-3    htmltools_0.5.3     Matrix_1.5-1        pkgconfig_2.0.3    
## [21] broom_1.0.1         haven_2.5.1         webshot_0.5.3       scales_1.2.1       
## [25] tzdb_0.3.0          googledrive_2.0.0   generics_0.1.3      ellipsis_0.3.2     
## [29] withr_2.5.0         lazyeval_0.2.2      cli_3.3.0           magrittr_2.0.3     
## [33] crayon_1.5.1        evaluate_0.16       fs_1.5.2            fansi_1.0.3        
## [37] xml2_1.3.3          tools_4.2.2         data.table_1.14.2   hms_1.1.2          
## [41] expint_0.1-7        gargle_1.2.0        lifecycle_1.0.1     munsell_0.5.0      
## [45] reprex_2.0.2        writexl_1.4.0       compiler_4.2.2      rlang_1.0.5        
## [49] rstudioapi_0.14     htmlwidgets_1.5.4   crosstalk_1.2.0     rmarkdown_2.16     
## [53] gtable_0.3.1        DBI_1.1.3           R6_2.5.1            knitr_1.40         
## [57] fastmap_1.1.0       utf8_1.2.2          stringi_1.7.8       vctrs_0.4.1        
## [61] dbplyr_2.2.1        tidyselect_1.1.2    xfun_0.32
    Sys.time()
## [1] "2023-12-29 13:49:58 PST"