This report is automatically generated with the R
package knitr
(version 1.40
)
.
source("R Functions/functions_QA data.R") ### LOAD DATA ### DRMP_TIResults <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/DRMP_2017_2019_Fish.xlsx', sheet='TIResults', guess_max = 30000) nrow(DRMP_TIResults) #number of rows should match the Excel file (minus the header row)
## [1] 743
DRMP_FishComposite <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/DRMP_2017_2019_Fish.xlsx', sheet='FishComposite', guess_max = 30000) %>% select(-CompositeComments, -CompositeReplicate, -CompositeType, -OrganismGroup) #%>% #remove columns that are the same in both sheets (except CompositeID & PrepPreservationName) # distinct(CompositeID, .keep_all=T) # rows are duplicated for some CompositeIDs nrow(DRMP_FishComposite) #number of rows should match the Excel file (minus the header row)
## [1] 317
DRMP_2017_19 <- inner_join(DRMP_TIResults, DRMP_FishComposite, by='CompositeID') %>% mutate(PrepPreservationName = paste(PrepPreservationName.x, PrepPreservationName.y, sep=', ' )) nrow(DRMP_2017_19) # should be same as nrow(DRMP_TIResults)
## [1] 743
### LIST COLUMNS TO BE USED, ADD USER DEFINED COLUMNS, & RENAME COLUMNS TO CEDEN STANDARDS ### #Use 1.READ ME.xlsx, 'ColumnsForR' to list & identify columns that match corresponding CEDEN Standard columns keep_cols <- c('SourceID', 'SourceRow', 'ProjectCode', 'CompositeID', 'StationCode', 'SampleDate', 'OrganismName', 'TotalCount', 'TissueName', 'MethodName', 'UnitName', 'Result', 'ResQualCode', 'MDL', 'RL', 'TissueWeight', 'CompositeRowID', 'LabSampleID', 'QACode', 'ComplianceCode', 'QACodeLab') temp_cols <- c('AnalyteName', 'FractionName', 'TotalLength', 'PrepPreservationName', 'CompositeComments', 'PartsComments', 'ProcessedOrganismsExpandedFishComments', 'SampleTypeCode', 'SampleComments', 'TissueCollectionComments', 'TissueResultComments') #Include columns that do not match CEDEN standards but may be useful (e.g., Unit columns for MDL & RL) #temp_cols are removed before the data is merged with other datasets DRMP_2017_19_new <- DRMP_2017_19 %>% select( c(keep_cols,temp_cols) ) %>% #DO NOT CHANGE - selects columns specified above rename( #Rename DRMP_2017_19 columns to CEDEN format here: CEDEN 'COLUMNNAME' = DRMP_2017_19 'COLUMNNAME' #CitationCode = , added with mutate below #CoordSystem = , added with left_join(DRMP_locations) #ProgramName = , added with Mutate below #ParentProjectName = , added with Mutate below #ProjectName = , added with Mutate below #StationName = , added with left_join(DRMP_locations) #TargetLatitude = , added with left_join(DRMP_locations) #TargetLongitude = , added with left_join(DRMP_locations) #SampleTime = , added with Mutate below #CommonName = , added with left_join(Taxo_Comm_Name_LookUp) below TaxonomicName = OrganismName, NumberFishPerComp = TotalCount, Method = MethodName, #Analyte = , added with Mutate below Unit = UnitName, ResultQualCode = ResQualCode, `WeightAvg g` = TissueWeight, #TLMin mm = , added with Mutate below #TLMax mm = , added with Mutate below #TLAvgLength mm = , added with Mutate below SampleID = LabSampleID, #WBT = , added with left_join(DRMP_locations) #BatchVerification = , added with Mutate below #ResultComments = , added with Mutate below LabSubmissionCode = QACodeLab ) %>% mutate( CitationCode = 'DRMP', ProgramName = NA_character_, ParentProjectName = NA_character_, ProjectName = NA_character_, SampleTime = NA_character_, Analyte = paste(AnalyteName, FractionName, sep=', '), `TLMin mm` = TotalLength, `TLMax mm` = TotalLength, `TLAvgLength mm` = TotalLength, BatchVerification = NA_character_, ResultComments = paste0(ifelse(is.na(PrepPreservationName),'',paste0('Prep-',PrepPreservationName)), ifelse(is.na(CompositeComments),'',paste0(' ~ Composite-',CompositeComments)), ifelse(is.na(PartsComments),'',paste0(' ~ Parts-',PartsComments)), ifelse(is.na(ProcessedOrganismsExpandedFishComments),'',paste0(' ~ Process-',ProcessedOrganismsExpandedFishComments)), ifelse(is.na(SampleComments),'',paste0(' ~ Sampling-',SampleComments)), ifelse(is.na(TissueCollectionComments),'',paste0(' ~ TissueCollect-',TissueCollectionComments)), ifelse(is.na(TissueResultComments),'',paste0(' ~ TissueResult-',TissueResultComments)) ) ) nrow(DRMP_2017_19_new)
## [1] 743
#str(DRMP_2017_19_new) #just to check data class of different columns - e.g., is Date column in POSIX format? #View(DRMP_2017_19_new) ### LOAD EXTERNAL DATA & ADD COLUMNS USING JOIN ### #Add columns from "DRMP_locations.xlsx" #before joining check StationCode for blanks unique(DRMP_2017_19_new$StationCode) #no blanks go ahead and remove LABQA
## [1] "510ADVLIM" "510ST1317" "541SJC501" "544ADVLM6" "544LILPSL" "544MDRBH4" "510ST1666" ## [8] "LABQA"
DRMP_2017_19_new <- DRMP_2017_19_new %>% filter(StationCode != 'LABQA') nrow(DRMP_2017_19_new)
## [1] 683
DRMP_locations <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/DRMP_locations.xlsx', sheet='Locations', guess_max = 30000) %>% select(-LocationCode) DRMP_2017_19_new <- left_join(DRMP_2017_19_new, DRMP_locations, by='StationCode') #Adds columns StationName, WBT, LocationCode, TargetLatitude, TargetLongitude, CoordSystem based on StationCode nrow(DRMP_2017_19_new)
## [1] 683
#Add CommonName column from 1_Taxonomic_Common Name LookUp from WQP.xlsx #before joining check TaxonmicName for blanks unique(DRMP_2017_19_new$TaxonomicName) # no blanks good to go
## [1] "Micropterus salmoides" "Micropterus punctulatus" "Micropterus dolomieu"
Taxo_Comm_Name_LookUp <- readxl::read_excel('Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/0_Taxonomic_Common Name LookUp from WQP.xlsx', sheet='SpecieNames', guess_max = 30000) %>% select(CommonName, SubjectTaxonomicName) DRMP_2017_19_new <- left_join(DRMP_2017_19_new, Taxo_Comm_Name_LookUp, by=c('TaxonomicName'='SubjectTaxonomicName')) # Adds CommonNameColumn unique(DRMP_2017_19_new$CommonName)
## [1] "Largemouth Bass" "Spotted Bass" "Smallmouth Bass"
nrow(DRMP_2017_19_new)
## [1] 683
### FORMAT COLUMN PARAMETERS ### # Standardize WBT (WaterBodyType) Groups - "River/Stream", "Estuary", Drain/Canal", "Wetland", "Spring", "Slough", # "Pond", "Lake/Reservoir", "Delta", "Forebay/Afterbay", "Not Recorded" # unique(DRMP_2017_19_new$WBT) #Identifies OLDNAMES
## [1] "River/Stream" "X2"
# [1] "River/Stream" "X2" no changes needed # Standardize TissueName Groups - "Fillet" or "Whole Body" # unique(DRMP_2017_19_new$TissueName)
## [1] "FIL" "fillet"
DRMP_2017_19_new <- DRMP_2017_19_new %>% mutate(TissueName = recode(TissueName, "FIL" = "Fillet", "fillet" = "Fillet" ) ) unique(DRMP_2017_19_new$TissueName)
## [1] "Fillet"
# Standardize Analyte Groups - "Mercury, Total" (we consider Total Mercury and Methylmercury to be approx equal) # unique(paste(DRMP_2017_19_new$Analyte, DRMP_2017_19_new$Unit, sep=' ~ '))
## [1] "Moisture, Total ~ %" "Mercury, Total ~ ug/g ww"
#Only keep Mercury, Total, which will filter out Moisture, Total DRMP_2017_19_new <- DRMP_2017_19_new %>% filter(grepl('Mercury', Analyte)) unique(paste(DRMP_2017_19_new$Analyte, DRMP_2017_19_new$Unit, sep=' ~ '))
## [1] "Mercury, Total ~ ug/g ww"
nrow(DRMP_2017_19_new)
## [1] 364
# Standardize ResultQualCode Groups - "ND", "DNQ", NA# unique(DRMP_2017_19_new$ResultQualCode) #Identifies OLDNAMES
## [1] "="
# [1] "=" no changes needed # Format Result Column to Numeric# # Check column for text - based on text user needs to decide what to do if(!is.numeric(DRMP_2017_19_new$Result)){ if(all(is.na(DRMP_2017_19_new$Result))){ DRMP_2017_19_new <- DRMP_2017_19_new %>% mutate( #Column is all blanks and will be converted to Numeric RL = as.numeric(new) ) cat("'Result' column is all blanks and was converted to numeric format\n") }else{ old <-DRMP_2017_19_new$Result new <-DRMP_2017_19_new$Result new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'Result' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "~explain here~.\n")) #DRMP_2017_19_new <- DRMP_2017_19_new %>% # mutate( #Do stuff to prep column to be converted to Numeric # Result = as.numeric(new) # ) } }else{ cat("'Result' column is in numeric format\n")}
## 'Result' column is in numeric format
# Format MDL Column to Numeric# # Check column for text - based on text user needs to decide what to do if(!is.numeric(DRMP_2017_19_new$MDL)){ if(all(is.na(DRMP_2017_19_new$MDL))){ DRMP_2017_19_new <- DRMP_2017_19_new %>% mutate( #Column is all blanks and will be converted to Numeric RL = as.numeric(new) ) cat("'MDL' column is all blanks and was converted to numeric format\n") }else{ old <-DRMP_2017_19_new$MDL new <-DRMP_2017_19_new$MDL new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'MDL' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "~explain here~.\n")) #DRMP_2017_19_new <- DRMP_2017_19_new %>% # mutate( #Do stuff to prep column to be converted to Numeric # MDL = as.numeric(new) # ) } }else{ cat("'MDL' column is in numeric format\n")}
## 'MDL' column is in numeric format
# Format RL Column to Numeric# # Check column for text - based on text user needs to decide what to do if(!is.numeric(DRMP_2017_19_new$RL)){ if(all(is.na(DRMP_2017_19_new$RL))){ DRMP_2017_19_new <- DRMP_2017_19_new %>% mutate( #Column is all blanks and will be converted to Numeric RL = as.numeric(new) ) cat("'RL' column is all blanks and was converted to numeric format\n") }else{ old <-DRMP_2017_19_new$RL new <-DRMP_2017_19_new$RL new[grepl('[a-df-zA-DF-Z]', new)] <- NA #skip 'e' for exponential notation e.g., "8e-005" #Print what text was found and what is being done cat(paste0("'RL' column should be numeric but some cells contain ", grammaticList(setdiff(old, new)), ".\nACTIONS TAKEN:\n", "~explain here~.\n")) #DRMP_2017_19_new <- DRMP_2017_19_new %>% # mutate( #Do stuff to prep column to be converted to Numeric # RL = as.numeric(new) # ) } }else{ cat("'RL' column is in numeric format\n")}
## 'RL' column is in numeric format
# Check if Result, MDL, & RL Columns all equal <NA> or 0 - these rows have no useful information nrow(DRMP_2017_19_new) #Number rows before
## [1] 364
#CODE BELOW REQUIRES USER TROUBLESHOOTING DEPENDING ON AVAILABLE COLUMNS AND SPREADSHEET SPECIFIC CONDITIONS# DRMP_2017_19_new <- DRMP_2017_19_new %>% #Set 0 & negative values as blank mutate(Result = ifelse(Result <= 0, NA_real_, Result), MDL = ifelse(MDL <= 0, NA_real_, MDL), RL = ifelse(RL <= 0, NA_real_, RL)) na_results <- DRMP_2017_19_new %>% #Record rows where Result, MDL, & RL all equal <NA> filter( is.na(Result) & is.na(MDL) & is.na(RL) ) #View(na_results) DRMP_2017_19_new <- anti_join(DRMP_2017_19_new, na_results, by='SourceRow') #returns rows from DRMP_2017_19_new not matching values in no_result nrow(DRMP_2017_19_new) #Number rows after
## [1] 364
# SampleID shows that some data are matrix spikes '-MS' or '-MSD' or duplicates '-dup', either remove dups or average with sample DRMP_2017_19_new$SampleID[grep('-[A-z]*',DRMP_2017_19_new$SampleID)] #show which SampleIDs have alphabetic characters after a '-'
## [1] "B3382-MSD" "B3382-MS" "B3384-dup" "B3326-dup" "B3341-MSD" "B3341-MS" "B3357-MSD" ## [8] "B3357-MS" "B3311-dup" "B3314-dup" "B3301-MS" "B3301-MSD" "B3370-MS" "B3370-MSD" ## [15] "B3371-dup" "B5163-MS" "B5205-MSD" "B5181-dup" "B5137-MSD" "B5117-dup" "B5163-MSD" ## [22] "B5160-dup" "B5202-dup" "B5205-MS" "B5195-MS" "B5127-dup" "B5162-MS" "B5141-MS" ## [29] "B5137-MS" "B5195-MSD" "B5162-MSD" "B5141-MSD" "B5101-dup" "B6627-MSD" "B6627-MS" ## [36] "B6628-dup" "B6607-MSD" "B6607-MS" "B6608-dup" "B7226-dup" "B6647-MSD" "B6647-MS" ## [43] "B6648-dup" "B6556-dup" "B6578-MSD" "B6578-MS" "B6583-MSD" "B6583-MS" "B6584-dup" ## [50] "B6596-MSD" "B6596-MS"
unique(sub('[^-][^A-z]*', '', DRMP_2017_19_new$SampleID, perl=T)) #show unique alphabetic tags
## [1] "" "MSD" "MS" "dup"
DRMP_2017_19_new <- DRMP_2017_19_new %>% filter(!grepl('ms', SampleID, ignore.case=T)) #remove data marked as matrix spike 'ms', 'MS', 'msd', or 'MSD' unique(sub('[^-][^A-z]*', '', DRMP_2017_19_new$SampleID, perl=T)) #only dups are left
## [1] "" "dup"
nrow(DRMP_2017_19_new)
## [1] 330
#Add Lab Rep note to SampleID DRMP_2017_19_new <- DRMP_2017_19_new %>% mutate(tempSampleID = sub('-[A-z]*$', '', DRMP_2017_19_new$SampleID, perl=T)) %>% # remove '-dup' and use temp column for grouping same IDs arrange(tempSampleID, AnalyteName) %>% # arrange so dup sample is always second in list group_by(tempSampleID, AnalyteName) %>% mutate(n = n(), #SampleID = ifelse(n>1, paste(tempSampleID, 'Lab Rep', sep='-'), tempSampleID)) %>% # adds 'Lab Rep' to both duplicates in SampleID Column SampleID = ifelse(n>1, paste(tempSampleID, paste0('Lab Rep', row_number()), sep='-'), tempSampleID)) %>% # use this code to add replicate numb e.g., Lab Rep1, Lab Rep2 ungroup %>% select(-tempSampleID, -n) # remove temp col used for grouping & 'n' col used to identify duplicates nrow(DRMP_2017_19_new)
## [1] 330
#OPT #1 - AVERAGE SAMPLE & DUPLICATE temp1 <- DRMP_2017_19_new %>% mutate(SampleID = ifelse(grepl('Lab Rep[0-9]?', SampleID), sub('Lab Rep[0-9]?', 'Lab Rep Avg', SampleID), SampleID)) %>% group_by(SampleID, AnalyteName) %>% mutate(Result = mean(Result)) %>% distinct(SampleID, AnalyteName, .keep_all=TRUE) nrow(temp1)
## [1] 313
#OPT #2 - REMOVE DUPLICATES temp2 <- DRMP_2017_19_new %>% mutate(tempSampleID = sub('Rep[0-9]?', 'Rep', SampleID)) %>% # Remove digit so SampleIDs are the same for the same Analyte distinct(tempSampleID, AnalyteName, .keep_all=TRUE) %>% select(-tempSampleID) nrow(temp2)
## [1] 313
# Format Units Column - "mg/Kg ww" or "mg/Kg dw" unique(DRMP_2017_19_new$Unit) #Identifies OLDNAMES
## [1] "ug/g ww"
# If more than 1 unit colmn exists (e.g., for RL and MDL columns) see WQP script for example on merging into 1 column DRMP_2017_19_new <- DRMP_2017_19_new %>% standardizeUnits(pp = "mass") unique(DRMP_2017_19_new$Unit)
## [1] "mg/Kg ww"
# Format Date and Time Column # # THE EXAMPLE CODE BELOW ASSUMES DATE AND TIME ARE IN SAME COLUMNS - IF TIME IS IN SEPERATE COLUMN LOOK AT AQ LINKAGE DATA TEMPLATE DRMP_2017_19_new <- DRMP_2017_19_new %>% #rowise() %>% # rowise is very slow - so used sapply to make this a rowise operation mutate( #If SampleDate & CollectioTIme are not in Character format by defualt, turn it into a character class so it exports better SampleDate = ifelse(sapply(SampleDate, is.character), SampleDate, as.character(as.Date(SampleDate))), SampleTime = ifelse(sapply(SampleTime, is.character), SampleTime, format(lubridate::ymd_hms(SampleTime), "%H:%M:%S")), #COMBINE DATE AND TIME INTO SampleDateTime COLUMN SampleDateTime = ifelse(!is.na(SampleTime), paste(SampleDate, SampleTime), paste(SampleDate, '00:00:00')), #FORMAT SampleDateTime COLUMN TO DATE FORMAT SampleDateTime = lubridate::ymd_hms(SampleDateTime) ) ### REMOVE TEMPORARY COLUMNS ### DRMP_2017_19_new <- DRMP_2017_19_new %>% select(-one_of(temp_cols)) #Remove temp columns since they are no longer needed #View(DRMP_2017_19_new) ## SAVE FORMATTED DATA AS EXCEL FILE ## writexl::write_xlsx(DRMP_2017_19_new, path='Reeval_Impl_Goals_Linkage_Analysis/Data/Fish/DRMP_2017_19_ceden_format.xlsx') # In excel, to convert SampleDate column to Date format # 1 - Select the date column. # 2 - Go to the Data-tab and choose "Text to Columns". # 3 - On the first screen, leave radio button on "delimited" and click Next. # 4 - Unselect any delimiter boxes (everything blank) and click Next. # 5 - Under column data format choose Date, select YMD # 6 - Click Finish.
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## Running under: Windows 10 x64 (build 22621) ## ## Matrix products: default ## ## locale: ## [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 ## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C ## [5] LC_TIME=English_United States.utf8 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] lubridate_1.8.0 plotly_4.10.0 readxl_1.4.1 actuar_3.3-0 ## [5] NADA_1.6-1.1 forcats_0.5.2 stringr_1.4.1 dplyr_1.0.9 ## [9] purrr_0.3.4 readr_2.1.2 tidyr_1.2.0 tibble_3.1.8 ## [13] ggplot2_3.3.6 tidyverse_1.3.2 fitdistrplus_1.1-8 survival_3.4-0 ## [17] MASS_7.3-58.1 ## ## loaded via a namespace (and not attached): ## [1] lattice_0.20-45 assertthat_0.2.1 digest_0.6.29 utf8_1.2.2 ## [5] R6_2.5.1 cellranger_1.1.0 backports_1.4.1 reprex_2.0.2 ## [9] evaluate_0.16 highr_0.9 httr_1.4.4 pillar_1.8.1 ## [13] rlang_1.0.5 lazyeval_0.2.2 googlesheets4_1.0.1 rstudioapi_0.14 ## [17] data.table_1.14.2 Matrix_1.5-1 splines_4.2.2 googledrive_2.0.0 ## [21] htmlwidgets_1.5.4 munsell_0.5.0 broom_1.0.1 compiler_4.2.2 ## [25] modelr_0.1.9 xfun_0.32 pkgconfig_2.0.3 htmltools_0.5.3 ## [29] tidyselect_1.1.2 fansi_1.0.3 viridisLite_0.4.1 crayon_1.5.1 ## [33] tzdb_0.3.0 dbplyr_2.2.1 withr_2.5.0 grid_4.2.2 ## [37] jsonlite_1.8.0 gtable_0.3.1 lifecycle_1.0.1 DBI_1.1.3 ## [41] magrittr_2.0.3 scales_1.2.1 writexl_1.4.0 cli_3.3.0 ## [45] stringi_1.7.8 fs_1.5.2 xml2_1.3.3 ellipsis_0.3.2 ## [49] generics_0.1.3 vctrs_0.4.1 expint_0.1-7 tools_4.2.2 ## [53] glue_1.6.2 hms_1.1.2 fastmap_1.1.0 colorspace_2.0-3 ## [57] gargle_1.2.0 rvest_1.0.3 knitr_1.40 haven_2.5.1
Sys.time()
## [1] "2024-01-05 10:17:19 PST"