This report is automatically generated with the R package knitr (version 1.40) .

source("R Functions/functions_QA data.R")


### LOAD DATA ###
AppK <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/AppK_Fish MeHg.xlsx', sheet='Data', guess_max = 30000)
There were 50 or more warnings (use warnings() to see the first 50)
cat('"50 or more warnings" result from Date column having years and not a full day, month, year date. The "Format Date & Time" section addresses this issue\n')
## "50 or more warnings" result from Date column having years and not a full day, month, year date. The "Format Date & Time" section addresses this issue
nrow(AppK) #number of rows should match the Excel file (minus the header row)
## [1] 2479
### LOAD REFERENCE SHEETS ###
#Citations
AppK_citations <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/AppK_Fish MeHg.xlsx', sheet='Project Citations', guess_max = 30000) %>%
  select(ProjID, Citation) %>%
  rename(ProjectCode = ProjID,
         CitationCode = Citation) %>%
  tidyr::separate_rows(., ProjectCode, sep = ",")  #seperates a row that has more than one ProjectCode listed for each Citation into multiple rows

#Lat, Long, Coord system
AppK_coord <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/AppK_Fish MeHg.xlsx', sheet='Sites', guess_max = 30000) %>%
  select(SiteName, Latitude, Longitude, Datum) %>%
  filter(!is.na(Latitude)) %>%
  distinct(SiteName, .keep_all=T) %>%
  rename(StationName = SiteName,
         TargetLatitude = Latitude,
         TargetLongitude = Longitude,
         CoordSystem = Datum)


### LIST COLUMNS TO BE USED, ADD USER DEFINED COLUMNS, & RENAME COLUMNS TO CEDEN STANDARDS ###
#Use 1.READ ME.xlsx, 'ColumnsForR' to list & identify columns that match corresponding CEDEN Standard columns
keep_cols <- c('SourceID','SourceRow', 'ProjID', 'SiteName', 'SampleDate_FIXED', 'Common', 'Number', 'Tissue', 'Hg_ppmWetWt', 'Weight(g)', 'Length(mm)', 'SampleID')

#temp_cols are removed before the data is merged with other datasets
#Include columns that do not match CEDEN standards but may be useful (e.g., Unit columns for MDL & RL)
temp_cols <- c('GroupedSites', 'DeltaSubRegion-NumTarget', 'DeltaSubRegion-Linkage-Subregion_1998-2001', 'Genus', 'Species', 'LabQualHGppmWetWt',
               'SampleID-Alternate', 'Matrix1', 'Individual&Composite', 'LengthType', 'LabQualHGppmWetWt')

AppK_new <- AppK %>%
  select( c(keep_cols,temp_cols) ) %>% #DO NOT CHANGE - selects columns specified above
  rename(
    #OldName = NewName
    ProjectCode = ProjID,
    StationName = SiteName,
    SampleDate = SampleDate_FIXED,
    CommonName = Common,
    NumberFishPerComp = Number,
    TissueName = Tissue,
    Result = Hg_ppmWetWt,
    `WeightAvg g` = `Weight(g)`,
    `TLAvgLength mm` = `Length(mm)`
  ) %>%
  mutate(
    #NewName = 'SPECIFIED VALUE' or FUNCTION
    ProgramName = NA_character_,
    ParentProjectName = NA_character_,
    ProjectName = NA_character_,
    CompositeID = NA_character_,
    StationCode = paste(paste0('GroupedSites: ',GroupedSites),
                        paste0('DeltaSubRegion-NumTarget: ',`DeltaSubRegion-NumTarget`),
                        paste0('DeltaSubRegion-Linkage: ', `DeltaSubRegion-Linkage-Subregion_1998-2001`),
                        sep=' ~ '),
    SampleTime = NA_character_,
    TaxonomicName = paste(Genus, Species),
    Method = NA_character_,
    Analyte = 'Mercury, Total',
    Unit = 'mg/Kg ww',
    ResultQualCode = case_when(grepl('<DL',LabQualHGppmWetWt) ~ 'ND',
                            grepl('<RL',LabQualHGppmWetWt) ~ 'ND',
                            grepl('BRL',LabQualHGppmWetWt) ~ 'ND', # BRL=Below Reporting Limit; a RL value is given but all we know is that the result is <RL
                            TRUE ~ NA_character_),
    Result = case_when(grepl('<DL',LabQualHGppmWetWt) ~ NA_real_, #remove Result value because when Result = MDL/RL is causes confusion
                       grepl('<RL',LabQualHGppmWetWt) ~ NA_real_, #remove Result value because when Result = MDL/RL is causes confusion
                       grepl('BRL',LabQualHGppmWetWt) ~ NA_real_, #remove Result value because when Result = MDL/RL is causes confusion
                       TRUE ~ Result),
    MDL = case_when(grepl('DL',LabQualHGppmWetWt) ~ .0386,
                    TRUE ~ NA_real_),
    RL = case_when(grepl('RL',LabQualHGppmWetWt) ~ .00314,
                   grepl('BRL',LabQualHGppmWetWt) ~ .0282,
                   TRUE ~ NA_real_),
    `TLMin mm` = NA_character_,
    `TLMax mm` = NA_character_,
    CompositeRowID = NA_character_,
    SampleID = paste(paste0('SampleID: ', SampleID),
                     paste0('SampleID-Alt: ',`SampleID-Alternate`),
                     sep=' ~ '),
    WBT = 'Not Recorded',
    QACode = NA_character_,
    BatchVerification = NA_character_,
    ComplianceCode = NA_character_,
    ResultComments = paste(ifelse(grepl('x',`Individual&Composite`,ignore.case=T), 'Included in both individual & composite samples', ''),
                           paste0('LengthType: ',ifelse(grepl('placeholder',LengthType),'*placeholder',ifelse(grepl('TL',LengthType),'total length',''))), #shorten '*placeholder...' notation & clarify 'TL' notation 
                           paste0('LabQualHGppmWetWt: ',ifelse(grepl('average of',LabQualHGppmWetWt), LabQualHGppmWetWt, '')),  #just note the average of duplicates
                           sep=' ~ '),
    LabSubmissionCode = NA_character_
  ) %>%
  filter(Matrix1 %are not% c('IV', 'MO')) %>% #Exclude IV=Invertebrate & MO=Mollusk
  left_join(., AppK_citations, by='ProjectCode') %>% # Add CitationCode column
  left_join(., AppK_coord, by='StationName') # Add TargetLat, TargetLong, and CoordSystem columns
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(keep_cols)` instead of `keep_cols` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(temp_cols)` instead of `temp_cols` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
nrow(AppK_new)
## [1] 1816
#str(AppK_new) #just to check data class of different columns - e.g., is Date column in POSIX format?
#View(AppK_new)


### FORMAT COLUMN PARAMETERS ###

  # Standardize TissueName Groups - "Fillet" or "Whole Body" #
unique(AppK_new$TissueName)
## [1] "F" "W"
AppK_new <- AppK_new %>%
  mutate(TissueName = recode(TissueName,
                             "F" = "Fillet",
                             "W" = "Whole Body"
                             )
         ) %>%
  filter(TissueName %in% c('Fillet','Whole Body'))
unique(AppK_new$TissueName)
## [1] "Fillet"     "Whole Body"
  # Standardize Analyte Groups - "Mercury, Total" or "Methylmercury, Total" #
unique(AppK_new$Analyte)
## [1] "Mercury, Total"
# [1] "Mercury, Total" - no changes needed


  # Standardize ResultQualCode Groups - "ND", "DNQ", NA#
unique(AppK_new$ResultQualCode) #Identifies OLDNAMES
## [1] NA   "ND"
# [1] NA   "ND" - no changes needed


  # Format Result Column to Numeric#
  # Check column for text - based on text user needs to decide what to do
if(!is.numeric(AppK_new$Result)){
  old <-AppK_new$Result
  new <-AppK_new$Result
  new[grepl('<|[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005"
  #Print what text was found and what is being done
  cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)),
             ".\nACTIONS TAKEN:\n",
             "~explain here~.\n"))
  #AppK_new <- AppK_new %>%
  #  mutate( #Do stuff to prep column to be converted to Numeric
  #    Result = as.numeric(new)
  #    )
} else {
  cat("'Result' column is in numeric format\n")}
## 'Result' column is in numeric format
  # Format MDL Column to Numeric#
  # Check column for text - based on text user needs to decide what to do
if(!is.numeric(AppK_new$MDL)){
  old <-AppK_new$MDL
  new <- AppK_new$MDL
  new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005"
  #Print what text was found and what is being done
  cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)),
             ".\nACTIONS TAKEN:\n",
             "~explain here~.\n"))
  #AppK_new <- AppK_new %>%
  #  mutate( #Do stuff to prep column to be converted to Numeric
  #    MDL = as.numeric(new)
  #  )
} else {
  cat("'MDL' column is in numeric format\n")}
## 'MDL' column is in numeric format
  # Format RL Column to Numeric#  
  # Check column for text - based on text user needs to decide what to do
if(!is.numeric(AppK_new$RL)){
  old <-AppK_new$RL
  new <-AppK_new$RL
  new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005"
  #Print what text was found and what is being done
  cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)),
             ".\nACTIONS TAKEN:\n",
             "~explain here~.\n"))
  #AppK_new <- AppK_new %>%
  #  mutate( #Due stuff to prep column to be converted to Numeric
  #    RL = as.numeric(new)
  #  )
} else {
  cat("'RL' column is in numeric format\n")}
## 'RL' column is in numeric format
  # Check if Result, MDL, & RL Columns all equal <NA> or 0 - these rows have no useful information
nrow(AppK_new) #Number rows before
## [1] 1816
#CODE BELOW REQUIRES USER TROUBLESHOOTING DEPENDING ON AVAILABLE COLUMNS AND SPREADSHEET SPECIFIC CONDITIONS#
AppK_new <- AppK_new %>%
  #Set 0 & negative values as blank
  mutate(Result = ifelse(Result <= 0, NA_real_, Result),
         MDL = ifelse(MDL <= 0, NA_real_, MDL),
         RL = ifelse(RL <= 0, NA_real_, RL))
na_results <- AppK_new %>% #Record rows where Result, MDL, & RL all equal <NA>
  filter( is.na(Result) & is.na(MDL) & is.na(RL) )
nrow(na_results)
## [1] 0
AppK_new <- anti_join(AppK_new, na_results, by='SourceRow') #returns rows from AppK_new not matching values in no_result
nrow(AppK_new) #Number rows after
## [1] 1816
  # Format Units Column - "mg/Kg ww" or "mg/Kg dw"
unique(AppK_new$Unit) #Identifies OLDNAMES
## [1] "mg/Kg ww"
# If more than 1 unit colmn exists (e.g., for RL and MDL columns) see WQP script for example on merging into 1 column
AppK_new <- AppK_new %>%
  standardizeUnits(pp = "mass")
unique(AppK_new$Unit) #New naming structure for Unit Groupings
## [1] "mg/Kg ww"
# Format Date & Time
AppK_new <- AppK_new %>%
  #rowise() %>%    # rowise is very slow - so used sapply to make this a rowise operation
  mutate(
    #If SampleDate & CollectioTIme are not in Character format by defualt, turn it into a character class so it exports better
    SampleDate = ifelse(sapply(SampleDate, is.character), SampleDate, as.character(as.Date(SampleDate))),
    SampleTime = ifelse(sapply(SampleTime, is.character), SampleTime, format(lubridate::ymd_hms(SampleTime), "%H:%M:%S")),
    #COMBINE DATE AND TIME INTO SampleDateTime COLUMN
    SampleDateTime = ifelse(!is.na(SampleTime), paste(SampleDate, SampleTime), paste(SampleDate, '00:00:00')),
    #FORMAT SampleDateTime COLUMN TO DATE FORMAT
    SampleDateTime = lubridate::ymd_hms(SampleDateTime)
  )


### REMOVE TEMPORARY COLUMNS ###
AppK_new <- AppK_new %>%
  select(-one_of(temp_cols)) #Remove temp columns since they are no longer needed
#View(AppK_new)

## SAVE FORMATTED DATA AS EXCEL FILE ##
writexl::write_xlsx(AppK_new, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/AppK_Fish MeHg_ceden_format.xlsx')
# In excel, to convert SampleDate column to Date format
# 1 - Select the date column.
# 2 - Go to the Data-tab and choose "Text to Columns".
# 3 - On the first screen, leave radio button on "delimited" and click Next.
# 4 - Unselect any delimiter boxes (everything blank) and click Next.
# 5 - Under column data format choose Date, select YMD
# 6 - Click Finish.

The R session information (including the OS info, R version and all packages used):

    sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8  LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.8.0    plotly_4.10.0      readxl_1.4.1       actuar_3.3-0      
##  [5] NADA_1.6-1.1       forcats_0.5.2      stringr_1.4.1      dplyr_1.0.9       
##  [9] purrr_0.3.4        readr_2.1.2        tidyr_1.2.0        tibble_3.1.8      
## [13] ggplot2_3.3.6      tidyverse_1.3.2    fitdistrplus_1.1-8 survival_3.4-0    
## [17] MASS_7.3-58.1     
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.20-45     assertthat_0.2.1    digest_0.6.29       utf8_1.2.2         
##  [5] R6_2.5.1            cellranger_1.1.0    backports_1.4.1     reprex_2.0.2       
##  [9] evaluate_0.16       highr_0.9           httr_1.4.4          pillar_1.8.1       
## [13] rlang_1.0.5         lazyeval_0.2.2      googlesheets4_1.0.1 rstudioapi_0.14    
## [17] data.table_1.14.2   Matrix_1.5-1        splines_4.2.2       googledrive_2.0.0  
## [21] htmlwidgets_1.5.4   munsell_0.5.0       broom_1.0.1         compiler_4.2.2     
## [25] modelr_0.1.9        xfun_0.32           pkgconfig_2.0.3     htmltools_0.5.3    
## [29] tidyselect_1.1.2    fansi_1.0.3         viridisLite_0.4.1   crayon_1.5.1       
## [33] tzdb_0.3.0          dbplyr_2.2.1        withr_2.5.0         grid_4.2.2         
## [37] jsonlite_1.8.0      gtable_0.3.1        lifecycle_1.0.1     DBI_1.1.3          
## [41] magrittr_2.0.3      scales_1.2.1        writexl_1.4.0       cli_3.3.0          
## [45] stringi_1.7.8       fs_1.5.2            xml2_1.3.3          ellipsis_0.3.2     
## [49] generics_0.1.3      vctrs_0.4.1         expint_0.1-7        tools_4.2.2        
## [53] glue_1.6.2          hms_1.1.2           fastmap_1.1.0       colorspace_2.0-3   
## [57] gargle_1.2.0        rvest_1.0.3         knitr_1.40          haven_2.5.1
    Sys.time()
## [1] "2024-01-05 09:11:47 PST"