This report is automatically generated with the R package knitr (version 1.40) .

source("R Functions/functions_QA data.R")


### LOAD DATA ###
DRMP_TIResults <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/DRMP_2017_2019_Fish.xlsx', sheet='TIResults', guess_max = 30000)
nrow(DRMP_TIResults) #number of rows should match the Excel file (minus the header row)
## [1] 743
DRMP_FishComposite <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/DRMP_2017_2019_Fish.xlsx', sheet='FishComposite', guess_max = 30000) %>%
  select(-CompositeComments, -CompositeReplicate, -CompositeType, -OrganismGroup) #%>%  #remove columns that are the same in both sheets (except CompositeID & PrepPreservationName)
  # distinct(CompositeID, .keep_all=T) # rows are duplicated for some CompositeIDs
nrow(DRMP_FishComposite) #number of rows should match the Excel file (minus the header row)
## [1] 317
DRMP_2017_19 <- inner_join(DRMP_TIResults, DRMP_FishComposite, by='CompositeID') %>%
  mutate(PrepPreservationName = paste(PrepPreservationName.x, PrepPreservationName.y, sep=', ' ))
nrow(DRMP_2017_19)  # should be same as nrow(DRMP_TIResults)
## [1] 743
### LIST COLUMNS TO BE USED, ADD USER DEFINED COLUMNS, & RENAME COLUMNS TO CEDEN STANDARDS ###
#Use 1.READ ME.xlsx, 'ColumnsForR' to list & identify columns that match corresponding CEDEN Standard columns
keep_cols <- c('SourceID', 'SourceRow', 'ProjectCode', 'CompositeID', 'StationCode', 'SampleDate', 'OrganismName', 'TotalCount', 'TissueName', 'MethodName', 'UnitName', 'Result', 'ResQualCode', 'MDL', 'RL',
               'TissueWeight', 'CompositeRowID', 'LabSampleID', 'QACode', 'ComplianceCode', 'QACodeLab')

temp_cols <- c('AnalyteName', 'FractionName', 'TotalLength', 'PrepPreservationName', 'CompositeComments', 'PartsComments', 'ProcessedOrganismsExpandedFishComments', 'SampleTypeCode', 'SampleComments', 'TissueCollectionComments', 'TissueResultComments') #Include columns that do not match CEDEN standards but may be useful (e.g., Unit columns for MDL & RL)
#temp_cols are removed before the data is merged with other datasets

DRMP_2017_19_new <- DRMP_2017_19 %>%
  select( c(keep_cols,temp_cols) ) %>% #DO NOT CHANGE - selects columns specified above
  rename(
    #Rename DRMP_2017_19 columns to CEDEN format here: CEDEN 'COLUMNNAME' = DRMP_2017_19 'COLUMNNAME'
#CitationCode = , added with mutate below
#CoordSystem = , added with left_join(DRMP_locations)
#ProgramName = , added with Mutate below
#ParentProjectName = , added with Mutate below
#ProjectName = ,  added with Mutate below
#StationName = , added with left_join(DRMP_locations)
#TargetLatitude = , added with left_join(DRMP_locations)
#TargetLongitude = , added with left_join(DRMP_locations)
#SampleTime = , added with Mutate below
#CommonName = , added with left_join(Taxo_Comm_Name_LookUp) below
    TaxonomicName = OrganismName,
    NumberFishPerComp = TotalCount,
    Method = MethodName,
#Analyte = , added with Mutate below
    Unit = UnitName,
    ResultQualCode = ResQualCode,
    `WeightAvg g` = TissueWeight,
#TLMin mm = , added with Mutate below
#TLMax mm = , added with Mutate below
#TLAvgLength mm = , added with Mutate below
    SampleID = LabSampleID,
#WBT = , added with left_join(DRMP_locations)
#BatchVerification = , added with Mutate below
#ResultComments = , added with Mutate below
    LabSubmissionCode = QACodeLab
  ) %>%
  mutate(
    CitationCode = 'DRMP',
    ProgramName = NA_character_,
    ParentProjectName = NA_character_,
    ProjectName = NA_character_,
    SampleTime = NA_character_,
    Analyte = paste(AnalyteName, FractionName, sep=', '),
    `TLMin mm` = TotalLength,
    `TLMax mm` = TotalLength,
    `TLAvgLength mm` = TotalLength,
    BatchVerification = NA_character_,
    ResultComments = paste0(ifelse(is.na(PrepPreservationName),'',paste0('Prep-',PrepPreservationName)), ifelse(is.na(CompositeComments),'',paste0(' ~ Composite-',CompositeComments)), ifelse(is.na(PartsComments),'',paste0(' ~ Parts-',PartsComments)),
                           ifelse(is.na(ProcessedOrganismsExpandedFishComments),'',paste0(' ~ Process-',ProcessedOrganismsExpandedFishComments)), ifelse(is.na(SampleComments),'',paste0(' ~ Sampling-',SampleComments)),
                           ifelse(is.na(TissueCollectionComments),'',paste0(' ~ TissueCollect-',TissueCollectionComments)), ifelse(is.na(TissueResultComments),'',paste0(' ~ TissueResult-',TissueResultComments))
                           )
  )

nrow(DRMP_2017_19_new)
## [1] 743
#str(DRMP_2017_19_new) #just to check data class of different columns - e.g., is Date column in POSIX format?
#View(DRMP_2017_19_new)


### LOAD EXTERNAL DATA & ADD COLUMNS USING JOIN ###
#Add columns from "DRMP_locations.xlsx"
#before joining check StationCode for blanks
unique(DRMP_2017_19_new$StationCode) #no blanks go ahead and remove LABQA
## [1] "510ADVLIM" "510ST1317" "541SJC501" "544ADVLM6" "544LILPSL" "544MDRBH4" "510ST1666"
## [8] "LABQA"
DRMP_2017_19_new <- DRMP_2017_19_new %>%
  filter(StationCode != 'LABQA')
nrow(DRMP_2017_19_new)
## [1] 683
DRMP_locations <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/DRMP_locations.xlsx', sheet='Locations', guess_max = 30000) %>%
  select(-LocationCode)
DRMP_2017_19_new <- left_join(DRMP_2017_19_new, DRMP_locations, by='StationCode')            #Adds columns StationName, WBT, LocationCode, TargetLatitude, TargetLongitude, CoordSystem based on StationCode
nrow(DRMP_2017_19_new)
## [1] 683
#Add CommonName column from 1_Taxonomic_Common Name LookUp from WQP.xlsx
#before joining check TaxonmicName for blanks
unique(DRMP_2017_19_new$TaxonomicName) # no blanks good to go
## [1] "Micropterus salmoides"   "Micropterus punctulatus" "Micropterus dolomieu"
Taxo_Comm_Name_LookUp <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/0_Taxonomic_Common Name LookUp from WQP.xlsx', sheet='SpecieNames', guess_max = 30000) %>%
  select(CommonName, SubjectTaxonomicName)
DRMP_2017_19_new <- left_join(DRMP_2017_19_new, Taxo_Comm_Name_LookUp, by=c('TaxonomicName'='SubjectTaxonomicName'))  # Adds CommonNameColumn
unique(DRMP_2017_19_new$CommonName)
## [1] "Largemouth Bass" "Spotted Bass"    "Smallmouth Bass"
nrow(DRMP_2017_19_new)
## [1] 683
### FORMAT COLUMN PARAMETERS ###

  # Standardize WBT (WaterBodyType) Groups - "River/Stream", "Estuary", Drain/Canal", "Wetland", "Spring", "Slough", 
  #                                          "Pond",  "Lake/Reservoir", "Delta", "Forebay/Afterbay", "Not Recorded" #
unique(DRMP_2017_19_new$WBT) #Identifies OLDNAMES
## [1] "River/Stream" "X2"
# [1] "River/Stream" "X2" no changes needed


  # Standardize TissueName Groups - "Fillet" or "Whole Body" #
unique(DRMP_2017_19_new$TissueName)
## [1] "FIL"    "fillet"
DRMP_2017_19_new <- DRMP_2017_19_new %>%
  mutate(TissueName = recode(TissueName,
                             "FIL" = "Fillet",
                             "fillet" = "Fillet"
                             )
         )
unique(DRMP_2017_19_new$TissueName)
## [1] "Fillet"
  # Standardize Analyte Groups - "Mercury, Total" (we consider Total Mercury and Methylmercury to be approx equal) #
unique(paste(DRMP_2017_19_new$Analyte, DRMP_2017_19_new$Unit, sep=' ~ '))
## [1] "Moisture, Total ~ %"      "Mercury, Total ~ ug/g ww"
#Only keep Mercury, Total, which will filter out Moisture, Total
DRMP_2017_19_new <- DRMP_2017_19_new %>%
  filter(grepl('Mercury', Analyte))
unique(paste(DRMP_2017_19_new$Analyte, DRMP_2017_19_new$Unit, sep=' ~ '))
## [1] "Mercury, Total ~ ug/g ww"
nrow(DRMP_2017_19_new)
## [1] 364
  # Standardize ResultQualCode Groups - "ND", "DNQ", NA#
unique(DRMP_2017_19_new$ResultQualCode) #Identifies OLDNAMES
## [1] "="
# [1] "=" no changes needed


# Format Result Column to Numeric#  
# Check column for text - based on text user needs to decide what to do
if(!is.numeric(DRMP_2017_19_new$Result)){
  if(all(is.na(DRMP_2017_19_new$Result))){
    DRMP_2017_19_new <- DRMP_2017_19_new %>%
      mutate( #Column is all blanks and will be converted to Numeric
        RL = as.numeric(new)
      )
    cat("'Result' column is all blanks and was converted to numeric format\n")
  }else{
    old <-DRMP_2017_19_new$Result
    new <-DRMP_2017_19_new$Result
    new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005"
    #Print what text was found and what is being done
    cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)),
               ".\nACTIONS TAKEN:\n",
               "~explain here~.\n"))
    #DRMP_2017_19_new <- DRMP_2017_19_new %>%
    #  mutate( #Do stuff to prep column to be converted to Numeric
    #    Result = as.numeric(new)
    #    )
  }
}else{
  cat("'Result' column is in numeric format\n")}
## 'Result' column is in numeric format
  # Format MDL Column to Numeric#
  # Check column for text - based on text user needs to decide what to do
if(!is.numeric(DRMP_2017_19_new$MDL)){
  if(all(is.na(DRMP_2017_19_new$MDL))){
    DRMP_2017_19_new <- DRMP_2017_19_new %>%
      mutate( #Column is all blanks and will be converted to Numeric
        RL = as.numeric(new)
      )
    cat("'MDL' column is all blanks and was converted to numeric format\n")
  }else{
    old <-DRMP_2017_19_new$MDL
    new <-DRMP_2017_19_new$MDL
    new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005"
    #Print what text was found and what is being done
    cat(paste0("'MDL' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)),
               ".\nACTIONS TAKEN:\n",
               "~explain here~.\n"))
    #DRMP_2017_19_new <- DRMP_2017_19_new %>%
    #  mutate( #Do stuff to prep column to be converted to Numeric
    #    MDL = as.numeric(new)
    #    )
  }
}else{
  cat("'MDL' column is in numeric format\n")}
## 'MDL' column is in numeric format
  # Format RL Column to Numeric#  
  # Check column for text - based on text user needs to decide what to do
if(!is.numeric(DRMP_2017_19_new$RL)){
  if(all(is.na(DRMP_2017_19_new$RL))){
    DRMP_2017_19_new <- DRMP_2017_19_new %>%
      mutate( #Column is all blanks and will be converted to Numeric
        RL = as.numeric(new)
      )
    cat("'RL' column is all blanks and was converted to numeric format\n")
  }else{
    old <-DRMP_2017_19_new$RL
    new <-DRMP_2017_19_new$RL
    new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005"
    #Print what text was found and what is being done
    cat(paste0("'RL' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)),
               ".\nACTIONS TAKEN:\n",
               "~explain here~.\n"))
    #DRMP_2017_19_new <- DRMP_2017_19_new %>%
    #  mutate( #Do stuff to prep column to be converted to Numeric
    #    RL = as.numeric(new)
    #    )
  }
}else{
  cat("'RL' column is in numeric format\n")}
## 'RL' column is in numeric format
  # Check if Result, MDL, & RL Columns all equal <NA> or 0 - these rows have no useful information
nrow(DRMP_2017_19_new) #Number rows before
## [1] 364
#CODE BELOW REQUIRES USER TROUBLESHOOTING DEPENDING ON AVAILABLE COLUMNS AND SPREADSHEET SPECIFIC CONDITIONS#
DRMP_2017_19_new <- DRMP_2017_19_new %>%
  #Set 0 & negative values as blank
  mutate(Result = ifelse(Result <= 0, NA_real_, Result),
         MDL = ifelse(MDL <= 0, NA_real_, MDL),
         RL = ifelse(RL <= 0, NA_real_, RL))
na_results <- DRMP_2017_19_new %>% #Record rows where Result, MDL, & RL all equal <NA>
  filter( is.na(Result) & is.na(MDL) & is.na(RL) )
#View(na_results)
DRMP_2017_19_new <- anti_join(DRMP_2017_19_new, na_results, by='SourceRow') #returns rows from DRMP_2017_19_new not matching values in no_result
nrow(DRMP_2017_19_new) #Number rows after
## [1] 364
# SampleID shows that some data are matrix spikes '-MS' or '-MSD' or duplicates '-dup', either remove dups or average with sample
DRMP_2017_19_new$SampleID[grep('-[A-z]*',DRMP_2017_19_new$SampleID)]      #show which SampleIDs have alphabetic characters after a '-'
##  [1] "B3382-MSD" "B3382-MS"  "B3384-dup" "B3326-dup" "B3341-MSD" "B3341-MS"  "B3357-MSD"
##  [8] "B3357-MS"  "B3311-dup" "B3314-dup" "B3301-MS"  "B3301-MSD" "B3370-MS"  "B3370-MSD"
## [15] "B3371-dup" "B5163-MS"  "B5205-MSD" "B5181-dup" "B5137-MSD" "B5117-dup" "B5163-MSD"
## [22] "B5160-dup" "B5202-dup" "B5205-MS"  "B5195-MS"  "B5127-dup" "B5162-MS"  "B5141-MS" 
## [29] "B5137-MS"  "B5195-MSD" "B5162-MSD" "B5141-MSD" "B5101-dup" "B6627-MSD" "B6627-MS" 
## [36] "B6628-dup" "B6607-MSD" "B6607-MS"  "B6608-dup" "B7226-dup" "B6647-MSD" "B6647-MS" 
## [43] "B6648-dup" "B6556-dup" "B6578-MSD" "B6578-MS"  "B6583-MSD" "B6583-MS"  "B6584-dup"
## [50] "B6596-MSD" "B6596-MS"
unique(sub('[^-][^A-z]*', '', DRMP_2017_19_new$SampleID, perl=T)) #show unique alphabetic tags
## [1] ""    "MSD" "MS"  "dup"
DRMP_2017_19_new <- DRMP_2017_19_new %>%
  filter(!grepl('ms', SampleID, ignore.case=T))           #remove data marked as matrix spike 'ms', 'MS', 'msd', or 'MSD'

unique(sub('[^-][^A-z]*', '', DRMP_2017_19_new$SampleID, perl=T)) #only dups are left
## [1] ""    "dup"
nrow(DRMP_2017_19_new)
## [1] 330
#Add Lab Rep note to SampleID
DRMP_2017_19_new <- DRMP_2017_19_new %>%
  mutate(tempSampleID = sub('-[A-z]*$', '', DRMP_2017_19_new$SampleID, perl=T)) %>%                                       # remove '-dup' and use temp column for grouping same IDs
  arrange(tempSampleID, AnalyteName) %>%                                                                          # arrange so dup sample is always second in list
  group_by(tempSampleID, AnalyteName) %>%
  mutate(n = n(),
        #SampleID = ifelse(n>1, paste(tempSampleID, 'Lab Rep', sep='-'), tempSampleID)) %>%                       # adds 'Lab Rep' to both duplicates in SampleID Column
         SampleID = ifelse(n>1, paste(tempSampleID, paste0('Lab Rep', row_number()), sep='-'), tempSampleID)) %>% # use this code to add replicate numb e.g., Lab Rep1, Lab Rep2
  ungroup %>%
  select(-tempSampleID, -n)                                                                                       # remove temp col used for grouping & 'n' col used to identify duplicates
nrow(DRMP_2017_19_new)
## [1] 330
#OPT #1 - AVERAGE SAMPLE & DUPLICATE
temp1 <- DRMP_2017_19_new %>%
  mutate(SampleID = ifelse(grepl('Lab Rep[0-9]?', SampleID), sub('Lab Rep[0-9]?', 'Lab Rep Avg', SampleID), SampleID)) %>%
  group_by(SampleID, AnalyteName) %>%
  mutate(Result = mean(Result)) %>%
  distinct(SampleID, AnalyteName, .keep_all=TRUE)
nrow(temp1)
## [1] 313
#OPT #2 - REMOVE DUPLICATES
temp2 <- DRMP_2017_19_new %>%
  mutate(tempSampleID = sub('Rep[0-9]?', 'Rep', SampleID)) %>%  # Remove digit so SampleIDs are the same for the same Analyte
  distinct(tempSampleID, AnalyteName, .keep_all=TRUE) %>%
  select(-tempSampleID)
nrow(temp2)
## [1] 313
  # Format Units Column - "mg/Kg ww" or "mg/Kg dw"
unique(DRMP_2017_19_new$Unit) #Identifies OLDNAMES
## [1] "ug/g ww"
# If more than 1 unit colmn exists (e.g., for RL and MDL columns) see WQP script for example on merging into 1 column
DRMP_2017_19_new <- DRMP_2017_19_new %>%
  standardizeUnits(pp = "mass")
unique(DRMP_2017_19_new$Unit)
## [1] "mg/Kg ww"
  # Format Date and Time Column #
# THE EXAMPLE CODE BELOW ASSUMES DATE AND TIME ARE IN SAME COLUMNS - IF TIME IS IN SEPERATE COLUMN LOOK AT AQ LINKAGE DATA TEMPLATE
DRMP_2017_19_new <- DRMP_2017_19_new %>%
  #rowise() %>%    # rowise is very slow - so used sapply to make this a rowise operation
  mutate(
    #If SampleDate & CollectioTIme are not in Character format by defualt, turn it into a character class so it exports better
    SampleDate = ifelse(sapply(SampleDate, is.character), SampleDate, as.character(as.Date(SampleDate))),
    SampleTime = ifelse(sapply(SampleTime, is.character), SampleTime, format(lubridate::ymd_hms(SampleTime), "%H:%M:%S")),
    #COMBINE DATE AND TIME INTO SampleDateTime COLUMN
    SampleDateTime = ifelse(!is.na(SampleTime), paste(SampleDate, SampleTime), paste(SampleDate, '00:00:00')),
    #FORMAT SampleDateTime COLUMN TO DATE FORMAT
    SampleDateTime = lubridate::ymd_hms(SampleDateTime)
  )


### REMOVE TEMPORARY COLUMNS ###
DRMP_2017_19_new <- DRMP_2017_19_new %>%
  select(-one_of(temp_cols)) #Remove temp columns since they are no longer needed
#View(DRMP_2017_19_new)

## SAVE FORMATTED DATA AS EXCEL FILE ##
writexl::write_xlsx(DRMP_2017_19_new, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/DRMP_2017_19_ceden_format.xlsx')
# In excel, to convert SampleDate column to Date format
# 1 - Select the date column.
# 2 - Go to the Data-tab and choose "Text to Columns".
# 3 - On the first screen, leave radio button on "delimited" and click Next.
# 4 - Unselect any delimiter boxes (everything blank) and click Next.
# 5 - Under column data format choose Date, select YMD
# 6 - Click Finish.

The R session information (including the OS info, R version and all packages used):

    sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8  LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.8.0    plotly_4.10.0      readxl_1.4.1       actuar_3.3-0      
##  [5] NADA_1.6-1.1       forcats_0.5.2      stringr_1.4.1      dplyr_1.0.9       
##  [9] purrr_0.3.4        readr_2.1.2        tidyr_1.2.0        tibble_3.1.8      
## [13] ggplot2_3.3.6      tidyverse_1.3.2    fitdistrplus_1.1-8 survival_3.4-0    
## [17] MASS_7.3-58.1     
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.20-45     assertthat_0.2.1    digest_0.6.29       utf8_1.2.2         
##  [5] R6_2.5.1            cellranger_1.1.0    backports_1.4.1     reprex_2.0.2       
##  [9] evaluate_0.16       highr_0.9           httr_1.4.4          pillar_1.8.1       
## [13] rlang_1.0.5         lazyeval_0.2.2      googlesheets4_1.0.1 rstudioapi_0.14    
## [17] data.table_1.14.2   Matrix_1.5-1        splines_4.2.2       googledrive_2.0.0  
## [21] htmlwidgets_1.5.4   munsell_0.5.0       broom_1.0.1         compiler_4.2.2     
## [25] modelr_0.1.9        xfun_0.32           pkgconfig_2.0.3     htmltools_0.5.3    
## [29] tidyselect_1.1.2    fansi_1.0.3         viridisLite_0.4.1   crayon_1.5.1       
## [33] tzdb_0.3.0          dbplyr_2.2.1        withr_2.5.0         grid_4.2.2         
## [37] jsonlite_1.8.0      gtable_0.3.1        lifecycle_1.0.1     DBI_1.1.3          
## [41] magrittr_2.0.3      scales_1.2.1        writexl_1.4.0       cli_3.3.0          
## [45] stringi_1.7.8       fs_1.5.2            xml2_1.3.3          ellipsis_0.3.2     
## [49] generics_0.1.3      vctrs_0.4.1         expint_0.1-7        tools_4.2.2        
## [53] glue_1.6.2          hms_1.1.2           fastmap_1.1.0       colorspace_2.0-3   
## [57] gargle_1.2.0        rvest_1.0.3         knitr_1.40          haven_2.5.1
    Sys.time()
## [1] "2024-01-05 10:17:19 PST"