This report is automatically generated with the R package knitr (version 1.40).

# Loads shared helper functions for the QA workflow; presumably this file also
# attaches the packages used below (dplyr, lubridate, readxl, writexl) --
# TODO confirm against "functions_QA data.R"
source("R Functions/functions_QA data.R")


# General Method - Use Common Name, Length, Date, Subarea, Analyte, Result, and Unit to identify data repeated between data sources
# Remove False Repeats from "repeats_All" by hand
# For True repeats sort repeats based on data source preference - see "DataSourceHierarchy.xlsx"
# Remove data remaining in "repeats_All" from Master data list - remove data from "repeats_All" that we want to keep in Master



# Load QA'd Data Master & Add SourceID Hierarchy ------------------------------------
# The Hierarchy column (joined on SourceID) ranks data sources and is used later
# to decide which copy of a repeated record is kept.
dataSourceHierarchy <- readxl::read_excel(
  'Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6_DataSourceHierarchy.xlsx',
  sheet = 'Data Selection Hierarchy', guess_max = 30000
)

FishMaster_DELTA <- readxl::read_excel(
  'Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/5a_FishMaster_QA.xlsx',
  sheet = 'Sheet1', guess_max = 30000
) %>%
  left_join(dataSourceHierarchy, by = 'SourceID')   # adds Hierarchy column based on SourceID

nrow(FishMaster_DELTA)
## [1] 7719
## Export to Excel to view ----
writexl::write_xlsx(
  FishMaster_DELTA,
  path = 'Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6b.0_FishMaster_withRepeats.xlsx'
)


# Identify Repeats pt. 1 ---------------------------------------------------------
# Build a composite key from the fields that define a duplicate record and keep
# only keys occurring more than once. Sorting by MDL puts the row carrying an MDL
# value on top within each key, so distinct() later captures it and the rest of
# the repeated rows get removed.
fish_repeats <- FishMaster_DELTA %>%
  mutate(Repeats = paste(Subarea, CommonName, TLAvgLength, Analyte, SampleDate, Result, Unit)) %>%
  add_count(Repeats) %>%          # n = number of rows sharing this key
  filter(n > 1) %>%
  arrange(Repeats, MDL)
nrow(fish_repeats)
## [1] 4180
## Standardizing weights ---------------------------------------------------
# To capture known weights across repeated rows.
# Standardize weights across repeats / append to rows with known 0's and NA's.
# First, get one row per repeat key where WeightAvg > 0 (a known weight)
weight_not_na <- fish_repeats %>%
  filter(WeightAvg > 0) %>%
  mutate(WeightAvg2 = WeightAvg) %>%
  distinct(Repeats, .keep_all = TRUE) %>%   # TRUE, not T (T can be reassigned)
  select(Repeats, WeightAvg2)

# Next, add WeightAvg from weight_not_na to repeats where WeightAvg is 0 or NA
repeats_with_weightavg <- fish_repeats %>%
  left_join(., weight_not_na, by = 'Repeats') %>%    # add WeightAvg2 column
  mutate(WeightAvg = case_when(WeightAvg == 0 | is.na(WeightAvg) ~ WeightAvg2,  # fill in the known weight
                               TRUE ~ WeightAvg)) %>%   # otherwise keep the original value
  select(-WeightAvg2)                           # remove unneeded column


## Export to Excel to View ----
# export to Excel and examine the repeats (includes "N/A" Result values, will be addressed later)
writexl::write_xlsx(repeats_with_weightavg, path=paste0('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6b.1_Repeats_',today(),'.xlsx'))


## Check rounded latitude/longitude ---- 
# See if that helps with identifying repeats
# Decided not to do this because we do not know how accurate the lat/long values are
# and it may be excluding true repeats from fish_repeats
# Below is the test code, ultimately not utilized except as a QA check

# roundedlatlong <- fish_clean %>% 
#   mutate(TargetLatitude=round(TargetLatitude,0))%>% 
#   mutate(TargetLongitude=round(TargetLongitude,0))
# 
# test_repeats <- roundedlatlong %>%
#   mutate(Repeats = paste(Subarea, CommonName, TLAvgLength, Analyte, SampleDate, Result, TargetLatitude, TargetLongitude)) %>%
#   group_by(Repeats) %>%
#   mutate(n = n()) %>%
#   filter(n > 1)   %>%
#   arrange(Repeats, MDL) %>%   
#   ungroup
# nrow(test_repeats)



# Remove Repeated Data pt. 1 ----------------------------------------------------
# Load manually corrected repeated data, remove single instance we want to keep in Master, then remove remaining repeated data from Master
first_repeats <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6c.2_Repeats_2023-04-12_ManuallyRemovedFalseRepeats.xlsx', sheet='Sheet1', guess_max = 30000) %>%  # load manually corrected repeated data
  arrange(Repeats, Hierarchy)   # preferred data source sorts to the top of each repeat group

repeats_tokeep <- first_repeats %>%
  # Keep no n == 6 instances: those were already manually resolved in Excel, so
  # the remainder should all be removed. n == 4 is excluded the same way
  # (reason not stated in the original -- presumably also handled manually; confirm).
  filter(!n %in% c(4, 6)) %>%
  distinct(Repeats, n, .keep_all = TRUE)   # capture "top" (preferred-source) single instance we want to keep in Master

repeats_toberemoved <- anti_join(first_repeats, repeats_tokeep, by = c('SourceID', 'SourceRow')) %>%
  # remove instance we want to keep from list of repeated data
  select(-n, -Repeats)

# Sanity check: keep + remove must account for every loaded row
nrow(first_repeats) == nrow(repeats_tokeep) + nrow(repeats_toberemoved) # this needs to be TRUE
## [1] TRUE
## Add WeightAvg from repeats_tokeep ----
# Because repeats_tokeep kept CEDEN over R5MF, but CEDEN did not have correct WeightAvg value.
repeats_WeightAvg <- repeats_tokeep %>%
  select(SourceID, SourceRow, WeightAvg2 = WeightAvg)   # join keys + renamed weight column

first_fish_final <- FishMaster_DELTA %>%
  left_join(., repeats_WeightAvg, by = c('SourceID', 'SourceRow')) %>%   # adds WeightAvg2 column
  mutate(WeightAvg = case_when(!is.na(WeightAvg2) ~ WeightAvg2,  # add WeightAvg2 value to repeats that were kept
                               TRUE ~ WeightAvg)) %>%            # TRUE, not T (T can be reassigned)
  select(-WeightAvg2)


## Remove repeats_toberemoved from master table ----
first_FishMaster_noRepeats <- anti_join(first_fish_final, repeats_toberemoved, by = c('SourceID', 'SourceRow')) %>%   # remove repeated data from Master
  select(-Hierarchy)  # remove extraneous column

# Sanity check: kept + removed must account for every Master row
nrow(first_fish_final) == nrow(first_FishMaster_noRepeats) + nrow(repeats_toberemoved)  # this needs to be TRUE
## [1] TRUE
## Export file to excel: ----
# writexl::write_xlsx(first_FishMaster_noRepeats, path=paste0('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6a_first_FishMaster_noRepeats_',today(),'.xlsx'))



# Catching Missed Repeats ---------------------------------------------
## New Year's Day (Incorrect Dates) ----
# Standardize Dates across repeats/append known incorrect values (just like was done
# with weights) - we know there would be no sampling done on this holiday and must have been
# incorrectly written as January 1st
# First, modifying the table to catch rounding error repeats, to maintain consistency
# and be efficient for this second round of repeat removal
rounded_repeats <- first_FishMaster_noRepeats %>%
  mutate(
    RoundedResult = round(Result, digits = 3),
    RoundedLength = round(as.numeric(TLAvgLength), digits = 1),
    Year          = year(SampleDate),
    Repeats       = paste(Subarea, CommonName, RoundedLength, Analyte, Year, RoundedResult, Unit)
  ) %>%
  left_join(dataSourceHierarchy, by = 'SourceID')   # re-attach the Hierarchy column (dropped earlier)

# Get Repeats where Date is not 01/01 (manually checked: all in 1998-2003, no other
# occurrences of January samples after 2003; all January occurrences are 01/01)
truedates <- rounded_repeats %>%
  filter(month(SampleDateTime) != 1) %>%   # plain 1, not month(1): month() of a bare number is a no-op
  mutate(SampleDateTime2 = SampleDateTime) %>%
  distinct(Repeats, .keep_all = TRUE) %>%               # 9/8/21 ROBIN's NOTE: Instead of distinct() may need "group_by(Repeats)" & "filter(n() > 1)", "filter(n() == 1)", or another filter() depending on intention of truedates
  select(Repeats, SampleDateTime2)

# Add SampleDateTime from truedates to dataset where SampleDateTime is 01/01
corrected_dates <- rounded_repeats %>%
  left_join(., truedates, by = 'Repeats') %>%    # adds SampleDateTime2 column
  mutate(SampleDateTime =
           case_when(month(SampleDateTime) == 1 & !is.na(SampleDateTime2) ~ SampleDateTime2,  # use the known-good date where SampleDateTime is 01/01 and a match exists
                     TRUE ~ SampleDateTime)) %>%   # TRUE, not T (T can be reassigned)
  mutate(SampleDate = ymd(SampleDateTime)) %>%
  select(-SampleDateTime2, -Repeats)              # remove unneeded columns
# Results show that only 2 sampling events remain with incorrect, 01/01 data
# At which point nothing can be done for these (no true date data)


## Identifying Repeats pt. 2 -------------------------------
# Fixing missed repeats from rounding errors, incorrect dates and common names
# rounded Result and Length to capture true repeats
# caught repeats using Year, gets some repeats with incorrect dates to manually remove later
# ensured sourceID's are different
fish_repeats2 <- corrected_dates %>%
  mutate(Repeats1 = paste(Subarea, CommonName, RoundedLength, Analyte, Year, RoundedResult, Unit)) %>%
  add_count(Repeats1, name = 'number_repeats') %>%   # rows sharing the rounded key
  filter(number_repeats > 1) %>%
  mutate(Repeats2 = paste(SourceID, Repeats1)) %>%
  add_count(Repeats2, name = 'number_sources') %>%   # rows sharing the key within a single source
  filter(number_sources == 1) %>%                    # keep only cross-source repeats
  arrange(Repeats1, MDL)
nrow(fish_repeats2)
## [1] 1927
## Export to Excel ----
# Export to Excel and examine the repeats (includes "N/A" Result values)
writexl::write_xlsx(fish_repeats2, path=paste0('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6d.1_Repeats_rounding_',today(),'.xlsx'))



# Remove Repeated Data pt. 2 ----------------------------------------------------
# Same process: Load manually corrected repeated data, remove single instance we want to
# keep in Master, then remove remaining repeated data from Master
removed_repeats_rounding <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6d.3_Repeats_rounding_manuallyremoved_2023-04-12.xlsx', sheet='Sheet1', guess_max = 30000) %>%  # load manually corrected repeated data
  arrange(Repeats1, Hierarchy)   # preferred data source sorts to the top of each repeat group

repeats_tokeep_rounding <- removed_repeats_rounding %>%
  filter(number_repeats != 3) %>%                       # Do not keep any instances of n=3, already manually deleted the instances to keep, so don't keep any instances here so remainder are all removed
  distinct(Repeats1, number_repeats, .keep_all = TRUE)   # TRUE, not T; capture "top" single instance we want to keep in Master

repeats_toberemoved_rounding <- anti_join(removed_repeats_rounding, repeats_tokeep_rounding, by = c('SourceID', 'SourceRow')) %>%   # remove instance we want to keep from list of repeated data
  select(-number_repeats, -number_sources, -Repeats1, -Repeats2) # remove columns we no longer need

# Sanity check: keep + remove must account for every loaded row
nrow(removed_repeats_rounding) == nrow(repeats_tokeep_rounding) + nrow(repeats_toberemoved_rounding) # this needs to be TRUE
## [1] TRUE
## Remove repeats_toberemoved from master table ----
FishMaster_norepeats <- anti_join(corrected_dates, repeats_toberemoved_rounding, by = c('SourceID', 'SourceRow')) %>%   # remove repeated data from Master
  select(-Hierarchy, -RoundedResult, -RoundedLength, -Year)  # remove extraneous columns

nrow(corrected_dates) == nrow(FishMaster_norepeats) + nrow(repeats_toberemoved_rounding)  # this needs to be TRUE
## [1] TRUE
# Final data clean
# Append true SampleDateTime and SampleDate from repeats_tokeep_rounding because
# there were some inaccurate 01/01 date values missed from "Identify Repeats pt. 1"
final_fix_dates <- FishMaster_norepeats %>%
  mutate(Year = year(SampleDate)) %>%
  mutate(Repeats = paste(Subarea, CommonName, TLAvgLength, Analyte, Year, Result, Unit)) %>%
  group_by(Repeats) %>%
  arrange(Repeats, MDL) %>%
  ungroup()
nrow(final_fix_dates)
## [1] 4526
# Known-good dates from the original Master: one non-January row per repeat key
original_with_truedates <- FishMaster_DELTA %>%
  mutate(Year = year(SampleDate)) %>%
  mutate(Repeats = paste(Subarea, CommonName, TLAvgLength, Analyte, Year, Result, Unit)) %>%
  filter(month(SampleDateTime) != 1) %>%   # plain 1, not month(1): month() of a bare number is a no-op
  mutate(SampleDateTime2 = SampleDateTime) %>%
  distinct(Repeats, .keep_all = TRUE) %>%   # TRUE, not T (T can be reassigned)
  select(Repeats, SampleDateTime2)


fish_final <- final_fix_dates %>%
  left_join(., original_with_truedates, by='Repeats') %>%    # adds SampleDateTime2 column
  mutate(SampleDateTime =
           case_when((
             month(SampleDateTime) == month(1)) & !is.na(SampleDateTime2) ~ SampleDateTime2, # add SampleDateTime2 value where SampleDateTime is 01/01
             T ~ SampleDateTime))  %>%
  mutate(SampleDate = ymd(SampleDateTime)) %>%
  select(- SampleDateTime2, -Repeats)
nrow(fish_final)
## [1] 4526
# Comparing two versions shows the code above is really more of a check, as it doesn't
# change any of the 01/01 dates
nrow(fish_final %>% filter((month(SampleDateTime) == month(1))))
## [1] 570
nrow(FishMaster_norepeats %>% filter((month(SampleDateTime) == month(1))))
## [1] 570
View(FishMaster_norepeats %>% filter((month(SampleDateTime) == month(1))))


# export final file to excel:
writexl::write_xlsx(FishMaster_norepeats, path=paste0('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/6a_FishMaster_noRepeats.xlsx'))

The R session information (including the OS info, R version and all packages used):

    sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 22621)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8  LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] lubridate_1.8.0    plotly_4.10.0      readxl_1.4.1       actuar_3.3-0      
##  [5] NADA_1.6-1.1       forcats_0.5.2      stringr_1.4.1      dplyr_1.0.9       
##  [9] purrr_0.3.4        readr_2.1.2        tidyr_1.2.0        tibble_3.1.8      
## [13] ggplot2_3.3.6      tidyverse_1.3.2    fitdistrplus_1.1-8 survival_3.4-0    
## [17] MASS_7.3-58.1     
## 
## loaded via a namespace (and not attached):
##  [1] lattice_0.20-45     assertthat_0.2.1    digest_0.6.29       utf8_1.2.2         
##  [5] R6_2.5.1            cellranger_1.1.0    backports_1.4.1     reprex_2.0.2       
##  [9] evaluate_0.16       highr_0.9           httr_1.4.4          pillar_1.8.1       
## [13] rlang_1.0.5         lazyeval_0.2.2      googlesheets4_1.0.1 rstudioapi_0.14    
## [17] data.table_1.14.2   Matrix_1.5-1        splines_4.2.2       googledrive_2.0.0  
## [21] htmlwidgets_1.5.4   munsell_0.5.0       broom_1.0.1         compiler_4.2.2     
## [25] modelr_0.1.9        xfun_0.32           pkgconfig_2.0.3     htmltools_0.5.3    
## [29] tidyselect_1.1.2    fansi_1.0.3         viridisLite_0.4.1   crayon_1.5.1       
## [33] tzdb_0.3.0          dbplyr_2.2.1        withr_2.5.0         grid_4.2.2         
## [37] jsonlite_1.8.0      gtable_0.3.1        lifecycle_1.0.1     DBI_1.1.3          
## [41] magrittr_2.0.3      scales_1.2.1        writexl_1.4.0       cli_3.3.0          
## [45] stringi_1.7.8       fs_1.5.2            xml2_1.3.3          ellipsis_0.3.2     
## [49] generics_0.1.3      vctrs_0.4.1         expint_0.1-7        tools_4.2.2        
## [53] glue_1.6.2          hms_1.1.2           fastmap_1.1.0       colorspace_2.0-3   
## [57] gargle_1.2.0        rvest_1.0.3         knitr_1.40          haven_2.5.1
    Sys.time()
## [1] "2024-01-05 08:55:50 PST"