This report is automatically generated with the R
package knitr
(version 1.40
)
.
options(scipen = 999) # turns off scientific notation # library(scales) library(readxl) # loads excel files as tibbles (part of tidyverse) library(plotly) # interactive graphs library(lubridate) # makes working with dates easier library(tidyverse) # functions to work with tibbles # CUSTOM FUNCTIONS # sigfig <- function(vec, n=3){ ### function to print values to N significant digits without dropping trailing zeros # input: vec vector of numeric # n integer is the required sigfig # output: outvec vector of numeric rounded to N sigfig # returns a character output - used in "hovertext" for plotly graphs formatC(signif(vec,digits=n), digits=n, format="fg", flag="#") } # Function to round values in a dataframe signif_df <- function(x, digits) { # round numeric columns # x = data frame # digits = number of digits to round numeric_columns <- sapply(x, mode) == 'numeric' x[numeric_columns] <- signif(x[numeric_columns], digits) x } `%are not%` <- function (x, table) is.na(match(x, table, nomatch=NA_integer_)) #https://stackoverflow.com/questions/5831794/opposite-of-in # Collapses vector to be like a grammatical list (i.e., seperated by commas, with individual quotes, and last element "and") # # Needs to be wrapped in a cat() to print properly; e.g., cat(grammaticList(c('a','b', 'c'))) grammaticList <- function(vec) sub(",\\s+([^,]+)$", ", and \\1", ifelse(length(vec) == 2, paste(vec, collapse=" and "), paste(vec, collapse=", "))) #https://plotly-r.com/working-with-symbols.html cust_shape <- c('cross-open', 'diamond-open', 'circle-open', 'x-open', 'triangle-up-open', 'hash-open', 'star-square-open', 'star-open', 'diamond-tall-open', 'star-triangle-down-open', 'star-diamond-open', 'hourglass-open', 'asterisk-open', 'square-open') # FUNCTION TO LOAD DATA # loadData <- function(filename){ data <- readxl::read_xlsx(filename, sheet=1, guess_max = 100000) %>% mutate(tempSourceID = substr(SourceID, 1, 5), Shape = as.character(cust_shape[as.factor(tempSourceID)])) %>% #Sets shape initally so shape doesn't change for SourceID on each graph; SourceID needs to be a factor; Shape needs to be character for Plotly graphing select(-tempSourceID) cat("Data set has", nrow(data),"rows. Added Shape column based on SourceID for use in result_time_Plotly().") return(data) } # FUNCTION TO STANDARDIZE UNITS standardizeUnits <- function(dataframe, pp='volume'){ # pp = "parts per" million (ppm) or billion (ppb); if not by volume then assumes by mass # If Column is All NA; convert form Logical NA to Real NA if(all(is.na(dataframe$Result))){ dataframe$Result <- NA_real_ } if(all(is.na(dataframe$MDL))){ dataframe$MDL <- NA_real_ } if(all(is.na(dataframe$RL))){ dataframe$RL <- NA_real_ } newUnits <- dataframe %>% mutate( #ug/L * 1000 = ng/L "\U00B5" is unicode symbol for mu (the micro symbol) Result = ifelse(grepl('(\U00B5|u)g/l',Unit,ignore.case=T, perl=T), Result*1000, Result), MDL = ifelse(grepl('(\U00B5|u)g/l',Unit,ignore.case=T, perl=T), MDL*1000, MDL), RL = ifelse(grepl('(\U00B5|u)g/l',Unit,ignore.case=T, perl=T), RL*1000, RL), Unit = gsub('(\U00B5|u)g/l', 'ng/L', Unit, ignore.case=T, perl=T), #mg/L * 1,000,000 = ng/L & ppm * 1,000,000 = ng/L #if ppm only appies to measurements by volume; replace 'mg/l' with 'mg/l|ppm' Result = ifelse(grepl('mg/l',Unit,ignore.case=T, perl=F), Result*1000000, Result), MDL = ifelse(grepl('mg/l',Unit,ignore.case=T, perl=F), MDL*1000000, MDL), RL = ifelse(grepl('mg/l',Unit,ignore.case=T, perl=F), RL*1000000, RL), Unit = gsub('mg/l', 'ng/L', Unit, ignore.case=T, perl=F), #ug/kg * 0.001 = mg/kg; ng/g * 0.001= mg/kg Result = ifelse(grepl('(\U00B5|u)g/kg|ng/g',Unit,ignore.case=T, perl=T), Result*0.001, Result), MDL = ifelse(grepl('(\U00B5|u)g/kg|ng/g',Unit,ignore.case=T, perl=T), MDL*0.001, MDL), RL = ifelse(grepl('(\U00B5|u)g/kg|ng/g',Unit,ignore.case=T, perl=T), RL*0.001, RL), Unit = gsub('(\U00B5|u)g/kg|ng/g','mg/Kg', Unit, ignore.case=T, perl=T), #ug/g = mg/kg; only need to change Unit Unit = gsub('(\U00B5|u)g/g','mg/Kg', Unit, ignore.case=T, perl=T), # ppm * 1,000,000 = ng/L | ppm = mg/kg # ppb * 1,000 = ng/L | ppb * 0.001 = mg/kg Result = case_when(pp == "volume" & grepl('ppm',Unit,ignore.case=T, perl=F) ~ Result*1000000, # ppm * 1,000,000 = ng/L pp == "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ Result*1000, # ppb * 1,000 = ng/L pp != "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ Result*0.001, # ppb * 0.001 = mg/kg T ~ Result), # ppm = mg/kg MDL = case_when(pp == "volume" & grepl('ppm',Unit,ignore.case=T, perl=F) ~ MDL*1000000, # ppm * 1,000,000 = ng/L pp == "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ MDL*1000, # ppb * 1,000 = ng/L pp != "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ MDL*0.001, # ppb * 0.001 = mg/kg T ~ MDL), # ppm = mg/kg RL = case_when(pp == "volume" & grepl('ppm',Unit,ignore.case=T, perl=F) ~ RL*1000000, # ppm * 1,000,000 = ng/L pp == "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ RL*1000, # ppb * 1,000 = ng/L pp != "volume" & grepl('ppb',Unit,ignore.case=T, perl=F) ~ RL*0.001, # ppb * 0.001 = mg/kg T ~ RL), # ppm = mg/kg Unit = case_when(pp == "volume" ~ gsub('ppm|ppb','ng/L', Unit, ignore.case=T, perl=T), pp != "volume" ~ gsub('ppm|ppb','mg/Kg', Unit, ignore.case=T, perl=T), T ~ Unit), #standardize ng/L and mg/Kg letter case Unit = gsub('ng/l', 'ng/L', Unit, ignore.case=T, perl=F), Unit = gsub('mg/kg', 'mg/Kg', Unit, ignore.case=T, perl=F) ) return(newUnits) } # FUNCTION TO CONVERT CHARACTER NUMBERS TO NUMERIC FORMAT # chara_to_NumDate <- function(dataframe){ #Look for columns that are character format & if no letters (except e or E for scientific notation) convert format to numeric charaTOnumdate <- function(df){ #1) Look for columns that are character format & if no letters but contains multiple -'s: convert format to date if(is.character(df) & !any(grepl('[a-zA-Z]', df)) & any(grepl('-.*-', df))){ymd(df)} #2) Look for columns that are character format & if no letters but contains a ':' - convert format to time else if(is.character(df) & !any(grepl('[a-zA-Z]', df)) & any(grepl(':', df))){hms(df, quiet=T)} #3) Look for columns that are character format & if no letters (except e or E for scientific notation): convert format to numeric else if(is.character(df) & !any(grepl('[a-df-zA-DF-Z]', df))) {as.numeric(df)} #converts character columns that are numeric to numeric; excludes in case of sci notation else{df} } #Apply charaTOnumdate function dataframe <- as_tibble(lapply(dataframe, charaTOnumdate)) return(dataframe) } # CHECK FOR BLANKS IN A COLUMN # anyBlank <- function(dataframe, column, showBlanks = T){ #https://dplyr.tidyverse.org/articles/programming.html column <- enquo(column) columnBlanks <- dataframe %>% filter (is.na(!! column) | !! column == ''| !! column == 'Not Reported') if(nrow(columnBlanks) == 0){ cat("The column", as_label(column), "does not contain any blanks.") }else{ sources <- grammaticList(unique(dataframe$SourceID)) cat('The following', nrow(columnBlanks),"rows have a blank value in the", as_label(column), "column: ", sources ) if(showBlanks) View(columnBlanks) return(columnBlanks) } } # LOOK AT FILTERABLE FACTORS IN SPECIFIED COLUMNS # unique_factors <- function(dataframe, ...){ #https://dplyr.tidyverse.org/articles/programming.html selector_var <- enquos(...) factors <- dataframe %>% select(!!! selector_var) lapply(factors, function(x) unique(sort(x, na.last=F))) #TO PUT THIS INTO A TIBBLE: tbl <- tibble(Columns = names(aq_factors), Factors = lapply(aq_factors, function(x) unique(sort(x)))); tbl$Columns[[1]]; tbl$Factors[[1]] } ## GRAPH RESULT, MDL, & RL VALUES VS TIME # graphSubset <- function(dataframe, groupByCol, column){ #https://dplyr.tidyverse.org/articles/programming.html groupBycolumn <- enquo(groupByCol) column <- enquo(column) graph_subset <- dataframe %>% select(!! groupBycolumn, Value = !! column, .data$SampleDate, .data$SourceID, .data$SourceRow, .data$Subarea, .data$StationName, #Selects and renames 'column' to Value .data$Analyte, .data$Unit, .data$ResultQualCode, .data$Shape, Matrix = any_of(c("MatrixName", "TissueName"))) %>% filter(!is.na(Value)) %>% mutate(ValueType = as_label(column)) #Put variable name of column (e.g., "Result", "MDL", or "RL") in column called "ValueType" return(graph_subset) } result_time_Plotly <- function(dataframe, groupByCol, showMean=F, interactive=T, logscale=T, showLegend=F){ ## Required Columns # Result - numeric # SampleDate - dates # SourceID - can be NA # Subarea - can be NA # Shape - can be NA # groupByCol <- quo(Subarea) - to test function line-wise groupBycolumn <- enquo(groupByCol) df_name <- deparse(substitute(dataframe)) results <- graphSubset(dataframe, groupByCol=as_label(groupBycolumn), Result) %>% mutate(ValueType = ifelse(ResultQualCode == "ND", "ND", ValueType), # Distinguish ND Results from "=" & DNQ Results n = 1) # add dummy n column so can rbind with result_monthlyAvg monthlyAvg <- results %>% mutate(ValueType = "Monthly Avg", SampleDate = ymd(paste0(year(SampleDate),'-', month(SampleDate), "-15"))) %>% group_by(!!groupBycolumn, SampleDate) %>% mutate(Value = mean(Value), # Geometric mean: exp(mean(log(Value))) n=n()) %>% distinct(!!groupBycolumn, SampleDate, .keep_all=T) %>% # keep all columns so can rbind with results, mdl, rl dataframes ungroup mdl <- graphSubset(dataframe, groupBy=as_label(groupBycolumn), MDL) %>% mutate(n=1) rl <- graphSubset(dataframe, groupBy=as_label(groupBycolumn), RL) %>% mutate(n=1) graphdata <- rbind(results,mdl, rl) graphdata$ValueType <- factor(graphdata$ValueType, levels = c("Result", "MDL", "RL", "ND")) #makes factor order consistent cust_color <- c("gray24", "deeppink2", "turquoise3", "gray54") used_colors <- setNames(cust_color, levels(graphdata$ValueType)) #<or> "graphdata$Color <- cust_color[graphdata$ValueType]" #ValueType needs to be a factor; "used_colors <- setNames(unique(graphdata$Color), unique(graphdata$ValueType))" used_shapes <- setNames(graphdata$Shape, graphdata$SourceID) #<or if Shape column not used> used_shapes <- setNames(cust_shape[1:length(unique(graphdata$SourceID))], unique(graphdata$SourceID)) used_shapes <- used_shapes[unique(names(used_shapes))] p <- ggplot() + geom_point(data=graphdata, aes(x=SampleDate, y=Value, color=ValueType, shape=SourceID, text=purrr::map(paste0('SourceRow: <b>',SourceRow, '</b><br>', 'StationName: <b>',StationName,'</b><br>', '<b>', Subarea, '</b>' ), htmltools::HTML)), # sprintf("ValueType: %s<br>SouceID: %s<br>SourceRow: %s<br>StationName: %s", ValueType, SourceID, SourceRow, StationName)), show.legend=showLegend) + scale_color_manual(values = used_colors) + scale_shape_manual(values = used_shapes) + theme_light() + facet_grid(rows=vars(!!groupBycolumn)) if(showMean == T){p <- p + geom_line(data=monthlyAvg, aes(x=SampleDate, y=Value), color="darkorange", size=.3) + geom_point(data=monthlyAvg, aes(x=SampleDate, y=Value, text=purrr::map(paste0('<b>', month.abb[month(SampleDate)], ' ', year(SampleDate), '</b><br>Monthly Avg = <b>', signif(Value,4), '</b><br>', 'n: <b>',n,'</b>'), htmltools::HTML)), # sprintf("ValueType: %s<br>n: %s", ValueType, n)), color="darkorange", size=.7)} if(logscale == T){p <- p + scale_y_log10()} if(interactive == T){ if(showLegend == T) {ggplotly(p)}else{hide_legend(ggplotly(p))} }else{p} } # STANDARDIZE RESULT QUAL CODE # use rules outlined in '1_ResQualCode Rules.xlsx' # Secondary Functions to Support Primary Function betweenDNQ <- function(x, y, z) y!=z & x > y & x < z #betweenDNQ(0.2, 0.2, 0.3) => F #Result should only be blank with DNQ or ND removeData <- function(dataframe, filter_fun){ #https://dplyr.tidyverse.org/articles/programming.html filter_fun <- enquo(filter_fun) filter_print <- rlang::quo_text(filter_fun, width=150) #width is the character length that a next line '\n' will be placed; default is 90 deletedSamples <- dataframe %>% filter(!! filter_fun) if(nrow(deletedSamples) > 0){ dataframe1 <- dataframe %>% anti_join(., deletedSamples, by=names(.)) dfText <- capture.output(anti_join(dataframe, dataframe1, by=names(dataframe))) #captures the dataframe output as text dfText <- dfText[!seq_len(length(dfText)) %in% c(1,3)] #removes tibble information (1st & 3rd row) from printing dfPrint <- paste(dfText, "\n", sep="") #pastes a line seperator after each row of dataframe so it prints nicely cat('\nDataset tested for the following logic:\n ', filter_print, '\nwhich is TRUE for',nrow(deletedSamples), 'instances.\n', dfPrint) View(deletedSamples) readline(prompt="The above data is considered to be erroneous and will be deleted. Click in 'Console' and Press [enter] to continue.") return(dataframe1) }else{ cat('\nDataset tested for the following logic:\n ', filter_print, '\nNo instances found to be TRUE. No data removed.\n\n') return(dataframe) } } # Primary Function standQualCode <- function(dataframe, QualCodeColumn){ # QualCodeColumn <- quo(ResultQualCode) - to test function line-wise qualColumn <- enquo(QualCodeColumn) #remove instances where (ResultQualCode is blank, 'ND', or 'Not Reported') & (MDL & RL are blank) -> not trustworthy Result dataframe <- removeData(dataframe, !is.na(Result) & is.na(MDL) & is.na(RL)& !! qualColumn == 'ND') #remove instances where (Result is blank) & (MDL & RL is not blank) & (ResultQualCode is not ND or DNQ) -> blank Result does not make sense for ResultQualCode =, E, or AVG dataframe <- removeData(dataframe, is.na(Result) & (!is.na(MDL) & !is.na(RL)) & (!! qualColumn != 'ND' & !! qualColumn != 'DNQ')) #remove instances where Result = RL & MDL < RL & ResultQualCode = ND - when ResultQualCode is ND, Result should be less than or equal to MDL not RL dataframe <- removeData(dataframe, Result == RL & MDL < RL & !! qualColumn == 'ND') #standardize ResultQualCode standQualCode <- dataframe %>% mutate(origResult = Result, #back up the original Result column for reference origQualCode = !! qualColumn, #back up the original QualCode column for reference Result = case_when(Result == 0 ~ NA_real_, (Result == MDL | Result == RL) & origQualCode =='ND' ~ NA_real_, Result < MDL & is.na(RL) ~ NA_real_, Result < RL & is.na(MDL) ~ NA_real_, TRUE ~ Result ), !! qualColumn := case_when(pmap_lgl(list(Result, MDL, RL), betweenDNQ) ~ 'DNQ', #need pmap_lgl to do between() as a rowwise function !is.na(Result) & is.na(MDL) & is.na(RL) & !is.na(origQualCode) ~ '=', Result > MDL & Result > RL ~ '=', Result > MDL & is.na(RL) ~ '=', Result > RL & is.na(MDL) ~ '=', Result == MDL & RL > MDL & origQualCode != 'ND' ~ 'DNQ', (Result == RL | Result == MDL) & (origQualCode != 'ND' & origQualCode != 'DNQ') ~ '=', (Result < MDL & is.na(RL)) | (Result < RL & is.na(MDL)) ~ 'ND', is.na(Result) & ((!is.na(MDL) & is.na(RL)) | (is.na(MDL) & !is.na(RL))) ~ 'ND', MDL > RL & (Result < MDL | is.na(Result)) ~ 'ND', TRUE ~ origQualCode) ) #See any new QualCodes are NA, 'Not Reported' or '' cat('\n', as_label(qualColumn), 'is now standardized and original QualCodes saved in column named origQualCode. ') temp <- anyBlank(standQualCode, !! qualColumn, showBlanks=F) # returns a NULL if no blanks if(!is.null(temp)){ temp <- temp %>% select(TMDL, Subarea, SourceID, SourceRow, StationName, SampleDateTime, Analyte, Result, MDL, RL, Unit, origQualCode, !! qualColumn) dfText <- capture.output(temp) #captures the dataframe output as text dfText <- dfText[!seq_len(length(dfText)) %in% c(1,3)] #removes tibble information (1st & 3rd row) from printing dfPrint <- paste(dfText, "\n", sep="") #pastes a line seperator after each row of dataframe so it prints nicely makingNewQualCodeEqual <- quo((is.na(!! qualColumn) | !! qualColumn == 'Not Reported' | !! qualColumn == '') & !is.na(Result) & is.na(MDL) & is.na(RL)) #logic to make a blank QualCode '=' standQualCode <- standQualCode %>% mutate(!! qualColumn := if_else(!! makingNewQualCodeEqual, '=', !! qualColumn)) %>% #if Result is not blank and no new QualCode -> make new QualCode '=' anti_join(. ,temp, by=names(temp)) #remove any samples where the new QualCode is still blank temp2 <- temp %>% mutate(!! qualColumn := if_else(!! makingNewQualCodeEqual, '=', !! qualColumn)) %>% #do same to temp, so User can see results anti_join(. ,temp, by=names(temp)) dfText2 <- capture.output(temp2) #captures the dataframe output as text dfText2 <- dfText2[!seq_len(length(dfText2)) %in% c(1:3)] #removes tibble information (1st & 3rd row) from printing dfPrint2 <- paste(dfText2, "\n", sep="") #pastes a line seperator after each row of dataframe so it prints nicely cat('\n', dfPrint, '\nOf the samples above, the following samples contain a Result but no MDL, RL, or QualCode and will be given a new QualCode of "=":\n', dfPrint2) if(nrow(temp) == nrow(temp2)){ cat('All samples now contain a standardized QualCode.') }else{ cat('\nThe remaining',nrow(temp)-nrow(temp2),'samples still have a blank QualCode and will be deleted. See "deletedSamples" tab.') col_names <- names(temp)[names(temp) %are not% as_label(qualColumn)] #don't include standradized QualCodes when looking for differences between temp and temp2 deletedSamples <- temp %>% anti_join(., temp2, by=col_names) View(deletedSamples) } }else{ cat('\n\nQualCode standardization completed based on rules established in "1_ResQualCode Rules.xlsx".') } return(standQualCode) } # # TESTING RESULTQUALCODE STANDARDIZATION FUNCTION # # Result<-c( 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, NA_real_, 0.2, NA_real_, 0.2, NA_real_, NA_real_, NA_real_, 0.1, 0.1, 0.1, 0.3, 0.3, 0.3, 0.2, 0.05, NA_real_, NA_real_, NA_real_, 0.4, NA_real_, NA_real_, NA_real_, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2) # MDL <- c( 0.3, NA_real_, NA_real_, NA_real_, 0.1, NA_real_, 0.3, 0.3, NA_real_, NA_real_, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, NA_real_, NA_real_, NA_real_, 0.2, 0.2, 0.2) # RL <- c( 0.4, NA_real_, NA_real_, NA_real_, NA_real_, 0.1, NA_real_, NA_real_, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.2, 0.2, 0.2, NA_real_, NA_real_, NA_real_, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2) # Code <- c('Not Reported',NA_character_, 'ND', 'notND', 'any', 'any', 'any', 'any', 'any', 'any','any', 'ND', 'DNQ','notNDNQ','ND','DNQ','notNDNQ','ND','DNQ','notNDNQ','any','any', 'ND', 'DNQ','notNDNQ','any', 'ND', 'DNQ','notNDNQ', 'ND', 'DNQ','notNDNQ', 'ND', 'DNQ','notNDNQ','ND','DNQ','notNDNQ') # dfTestStandFun <- tibble(Result, MDL, RL, ResultQualCode=Code, TMDL='', SourceID='', Subarea='', SourceRow='', StationName='', SampleDateTime='', Analyte='', Unit='') # fixedQualCode <- standQualCode(dfTestStandFun, ResultQualCode) # View(dfTestStandFun) # View(fixedQualCode) # View(anti_join(dfTestStandFun, fixedQualCode, by=c("Result"="origResult", "MDL", "RL", "ResultQualCode"="origQualCode"))) # Select only complete tidal cycles for analysis # Function to create unique identifiers for each tide cycle, returns a data frame with an additional column "TideID" unique_tidal_cycle <- function(clean_flow){ # clean_flow is a data frame of clean flow data id_flow <- clean_flow %>% mutate(TideID = paste0(SiteType, '1')) # setup column w/ initial values to be changed by for loop below n_rows <- nrow(id_flow) j = 1 # numeric value to ID tide # for loop for(n in 2:n_rows){ if(sign(id_flow$Flow[n-1]) != sign(id_flow$Flow[n])){ j = j + 1 } id_flow$TideID[n] <- paste0(id_flow$SiteType[n], j) } return(id_flow) # return data frame } # Function to identify gaps longer than 16 minutes between time interval measurements, returns a data frame with additional column "Gap" (TRUE if a gap larger than set time in minutes, NA if not a gap larger than set time in minutes) identify_tidal_gaps <- function(flow, time_gaps){ # flow is a data frame with unique cycle ID column # time_gaps is a time (in minutes); any gap between rows that is greater than this time is identified as a gap with TRUE flow_gaps <- flow %>% mutate(Gap = NA) # create an empty column for identified gaps n_rows <- nrow(flow_gaps) flow_gaps$Gap[1] = FALSE # assuming the first and last tidal cycles are incomplete flow_gaps$Gap[n_rows] = FALSE k = NA # set default of k to NA # for loop for(n in 2:(n_rows-1)){ if((difftime(flow_gaps$SampleDateTime[n+1], flow_gaps$SampleDateTime[n], units = "mins")) > time_gaps){ k = TRUE } else if ((difftime(flow_gaps$SampleDateTime[n], flow_gaps$SampleDateTime[n-1], units = "mins")) > time_gaps){ k = TRUE } else { k = FALSE } flow_gaps$Gap[n] = k } return(flow_gaps) # return data frame } # Randomization Test - tests for significant difference between mean and median. Provides p-value of test randomizationTest <- function(dataframe, .dataCol, .groupingCol, N=1000){ # Currently, this code can test for significance between more than 2 grouping variables, but it can not test for pairwise differences. set.seed(124) # .dataCol <- quo(Result) - to test function line-wise dataCol <- enquo(.dataCol) # .groupingCol <- quo(SampleTypeCode) - to test function line-wise groupingCol <- enquo(.groupingCol) observed_stats <- dataframe %>% group_by(!!groupingCol) %>% summarise(obsMean = mean(!!dataCol), obsMedian = median(!!dataCol)) obs_mean_diff <- diff(observed_stats %>% pull(obsMean)) obs_median_diff <- diff(observed_stats %>% pull(obsMedian)) sample_size <- nrow(dataframe) mean_diff_dist <- NULL median_diff_dist <- NULL for(i in 1:N){ shuffle_data <- dataframe %>% slice_sample(n=sample_size, replace=F) %>% pull(!!dataCol) shuffle_stats <- dataframe %>% mutate(dataShuffle = shuffle_data) %>% group_by(!!groupingCol) %>% summarise(shuffleMean = mean(dataShuffle), shuffleMedian = median(dataShuffle)) mean_diff_dist <- c(mean_diff_dist, diff(shuffle_stats %>% pull(shuffleMean))) median_diff_dist <- c(median_diff_dist, diff(shuffle_stats %>% pull(shuffleMedian))) } # More than or equal to abs value equates to a two tailed test mean_count <- ifelse(abs(mean_diff_dist) >= abs(obs_mean_diff), 1, 0) median_count <- ifelse(abs(median_diff_dist) >= abs(obs_median_diff), 1, 0) # get plot characteristics for ggplot mean_plot <- hist(mean_diff_dist, breaks=20, plot=F) median_plot <- hist(median_diff_dist, breaks=20, plot=F) # The percentage of 1s is the p-value for the test. This percentage will be small if the observed mean difference is unlikely to have happened by chance. results <- list(TestedGroups = dataframe %>% distinct(!!groupingCol) %>% pull, obs_mean_diff = obs_mean_diff, p_value_diff_means = sum(mean_count)/N, mean_diff_dist = mean_diff_dist, mean_diff_dist_plot = tibble(Mean_diff = mean_diff_dist) %>% ggplot + geom_histogram(aes(Mean_diff), bins=20) + ylab("Frequency") + xlab(paste("Randomized", paste(dataframe %>% distinct(!!groupingCol) %>% pull, collapse = " & "), "Mean Difference")) + geom_vline(xintercept = obs_mean_diff, color="red", size=1) + annotate(geom='text', x=obs_mean_diff - diff(mean_plot$breaks[1:2]/3), y=max(mean_plot$counts)/2, label='Observed Difference', color='red', angle=90)+ theme_light(), obs_median_diff = obs_median_diff, p_value_diff_medians = sum(median_count)/N, median_diff_dist = median_diff_dist, median_diff_dist_plot = tibble(median_diff = median_diff_dist) %>% ggplot + geom_histogram(aes(median_diff), bins=20) + ylab("Frequency") + xlab(paste("Randomized", paste(dataframe %>% distinct(!!groupingCol) %>% pull, collapse = " & "), "Median Difference")) + geom_vline(xintercept = obs_median_diff, color="red", size=1) + annotate(geom='text', x=obs_median_diff - diff(median_plot$breaks[1:2]/3), y=max(median_plot$counts)/2, label='Observed Difference', color='red', angle=90)+ theme_light() ) print("The P-value will be small if the observed difference is unlikely to have happened by chance") return(results) }
The R session information (including the OS info, R version and all packages used):
sessionInfo()
## R version 4.2.2 (2022-10-31 ucrt) ## Platform: x86_64-w64-mingw32/x64 (64-bit) ## Running under: Windows 10 x64 (build 22621) ## ## Matrix products: default ## ## locale: ## [1] LC_COLLATE=English_United States.utf8 LC_CTYPE=English_United States.utf8 ## [3] LC_MONETARY=English_United States.utf8 LC_NUMERIC=C ## [5] LC_TIME=English_United States.utf8 ## ## attached base packages: ## [1] stats graphics grDevices utils datasets methods base ## ## other attached packages: ## [1] mgcv_1.8-41 nlme_3.1-160 lubridate_1.8.0 plotly_4.10.0 ## [5] readxl_1.4.1 actuar_3.3-0 NADA_1.6-1.1 forcats_0.5.2 ## [9] stringr_1.4.1 dplyr_1.0.9 purrr_0.3.4 readr_2.1.2 ## [13] tidyr_1.2.0 tibble_3.1.8 ggplot2_3.3.6 tidyverse_1.3.2 ## [17] fitdistrplus_1.1-8 survival_3.4-0 MASS_7.3-58.1 ## ## loaded via a namespace (and not attached): ## [1] lattice_0.20-45 assertthat_0.2.1 digest_0.6.29 utf8_1.2.2 ## [5] R6_2.5.1 cellranger_1.1.0 backports_1.4.1 reprex_2.0.2 ## [9] evaluate_0.16 highr_0.9 httr_1.4.4 pillar_1.8.1 ## [13] rlang_1.0.5 lazyeval_0.2.2 googlesheets4_1.0.1 rstudioapi_0.14 ## [17] data.table_1.14.2 Matrix_1.5-1 splines_4.2.2 webshot_0.5.3 ## [21] googledrive_2.0.0 htmlwidgets_1.5.4 munsell_0.5.0 broom_1.0.1 ## [25] compiler_4.2.2 modelr_0.1.9 xfun_0.32 pkgconfig_2.0.3 ## [29] htmltools_0.5.3 tidyselect_1.1.2 viridisLite_0.4.1 fansi_1.0.3 ## [33] crayon_1.5.1 tzdb_0.3.0 dbplyr_2.2.1 withr_2.5.0 ## [37] grid_4.2.2 jsonlite_1.8.0 gtable_0.3.1 lifecycle_1.0.1 ## [41] DBI_1.1.3 magrittr_2.0.3 scales_1.2.1 writexl_1.4.0 ## [45] cli_3.3.0 stringi_1.7.8 fs_1.5.2 xml2_1.3.3 ## [49] ellipsis_0.3.2 generics_0.1.3 vctrs_0.4.1 expint_0.1-7 ## [53] tools_4.2.2 glue_1.6.2 crosstalk_1.2.0 hms_1.1.2 ## [57] yaml_2.3.5 fastmap_1.1.0 colorspace_2.0-3 gargle_1.2.0 ## [61] rvest_1.0.3 knitr_1.40 haven_2.5.1
Sys.time()
## [1] "2024-01-04 15:10:04 PST"