# Load libraries for data processing, modelling, and visualisation
library(tidyverse)
library(openxlsx2)
library(MASS)
library(corrplot)
library(DT)
library(climwin)
library(jtools)
library(DHARMa)
library(lavaan)
library(DiagrammeR)
library(lmtest)
library(glmmTMB)
library(scales)
library(performance)
# Load daily extreme weather event (EWE) data
extreme_weather <- wb_to_df("Breeding_colony_ewes/Tenth_ewes.xlsx") %>%
dplyr::mutate(across(4:20, ~ ifelse(is.na(.), 0, .)))
# Create binary version: 1 = event occurred, 0 = no event, NA = missing
extreme_weather_binary <- extreme_weather %>%
dplyr::mutate(across(4:20, ~ ifelse(!is.na(.) & . != 0, 1, ifelse(is.na(.), NA, 0))))
# Note:
# Missing values in extreme weather data are replaced with zero. This is critical because the slidingwin method ("method1" and "method2") internally calculates means when NA values are present, which is not suitable when assessing extreme values. We are specifically interested in whether an extreme event occurred, not in average conditions.
# A small number of missing values are present in the dataset, and replacing them with zero ensures consistency without introducing bias in this context.
# Load breeding data
breeding_data <- wb_to_df("Breeding_data/AUFS.xlsx", sheet = "Tenth_Island") %>%
dplyr::filter(!is.na(pup)) # Remove seasons without productivity data
# Record sample size
sample_size <- nrow(breeding_data)
# Assess normality of response variable
# If p > 0.05, the data does not significantly deviate from normality.
shapiro.test(breeding_data$pup)
##
## Shapiro-Wilk normality test
##
## data: breeding_data$pup
## W = 0.95332, p-value = 0.1433
# Histogram with density curve
hist(breeding_data$pup,
main = "Histogram of Pup Count",
xlab = "Pup Count",
col = "#a6d6fa",
border = "white",
prob = TRUE
)
# Overlay kernel density estimate
lines(density(breeding_data$pup, na.rm = TRUE), col = "#0D92F4", lwd = 2)
# Q-Q plot
ggplot(breeding_data, aes(sample = pup)) +
stat_qq() +
stat_qq_line(colour = "red") +
labs(title = "Q-Q Plot of Pup count",
x = "Theoretical Quantiles",
y = "Sample Quantiles") +
theme_classic()
# Calculate mean and variance
mean(breeding_data$pup)
## [1] 265.8
var(breeding_data$pup)
## [1] 10682.52
The Shapiro–Wilk test did not indicate a significant deviation from normality (W = 0.95332, p = 0.1433); therefore, we fail to reject the null hypothesis that the data are normally distributed.
Furthermore, the mean (265.8) and variance (10682.52) differ substantially, indicating overdispersion and suggesting that a Poisson distribution—where the mean and variance are expected to be equal—is not appropriate for this dataset. Therefore, we will use a Negative Binomial distribution to model the data.
# Run the sliding window analysis using actual (non-binary) values
output1 <- slidingwin(xvar = list(warm_day = extreme_weather$warm_day,
warm_night = extreme_weather$warm_night,
heatwave = extreme_weather$heatwave,
cool_day = extreme_weather$cool_day,
cool_night = extreme_weather$cool_night,
coldwave = extreme_weather$coldwave,
wet_day = extreme_weather$wet_day,
heavy_rain_day = extreme_weather$heavy_rain_day,
very_heavy_rain_day = extreme_weather$very_heavy_rain_day,
ewdp = extreme_weather$ewdp,
vwdp = extreme_weather$vwdp,
extreme_wind_day = extreme_weather$extreme_wind_day,
extreme_wave_energy_day = extreme_weather$extreme_wave_energy_day,
extreme_wbt_day = extreme_weather$extreme_wbt_day,
extreme_wbgt_day = extreme_weather$extreme_wbgt_day,
extreme_at_day = extreme_weather$extreme_at_day,
extreme_wind_chill_day = extreme_weather$extreme_wind_chill_day),
cdate = extreme_weather$date, # Daily climate record dates
bdate = breeding_data$date, # Biological event dates (pup count dates)
baseline = glm.nb(pup ~ 1,
link = "log",
data = breeding_data), # Baseline model: Negative Binomial
cohort = breeding_data$season, # Group by season
refday = c(28, 01), # Last day of monitoring across the seasons
cinterval = "day", # Daily resolution
range = c(94, 0), # Test all possible windows within the range
type = "absolute", # Absolute to each biological event date
stat = "sum", # Sum of EWE values over the window
func = "lin" # Test linear relationships
)
# Run the sliding window analysis using binary event indicators
output2 <- slidingwin(xvar = list(warm_day_bi = extreme_weather_binary$warm_day,
warm_night_bi = extreme_weather_binary$warm_night,
heatwave_bi = extreme_weather_binary$heatwave,
cool_day_bi = extreme_weather_binary$cool_day,
cool_night_bi = extreme_weather_binary$cool_night,
codlwave_bi = extreme_weather_binary$coldwave,
wet_day_bi = extreme_weather_binary$wet_day,
heavy_rain_day_bi = extreme_weather_binary$heavy_rain_day,
very_heavy_rain_day_bi = extreme_weather_binary$very_heavy_rain_day,
ewdp_bi = extreme_weather_binary$ewdp,
vwdp_bi = extreme_weather_binary$vwdp,
extreme_wind_day_bi = extreme_weather_binary$extreme_wind_day,
extreme_wave_energy_day_bi = extreme_weather_binary$extreme_wave_energy_day,
extreme_wbt_day_bi = extreme_weather_binary$extreme_wbt_day,
extreme_wbgt_day_bi = extreme_weather_binary$extreme_wbgt_day,
extreme_at_day_bi = extreme_weather_binary$extreme_at_day,
extreme_wind_chill_day_bi = extreme_weather_binary$extreme_wind_chill_day),
cdate = extreme_weather_binary$date, # Daily climate record dates
bdate = breeding_data$date, # Biological event dates (pup count dates)
baseline = glm.nb(pup ~ 1,
link = "log",
data = breeding_data), # Baseline model: Negative Binomial
cohort = breeding_data$season, # Group by season
refday = c(28, 01), # Last day of monitoring across the seasons
cinterval = "day", # Daily resolution
range = c(94, 0), # Test all possible windows within the range
type = "absolute", # Absolute to each biological event date
stat = "sum", # Sum of EWE values over the window
func = "lin" # Test linear relationships
)
# Calculate the circular mean of directional data (in degrees)
# This function returns the mean direction of angular values (e.g. wave or wind direction), accounting for circularity (i.e. wrap-around at 360°).
circ_mean <- function(x) {
# Convert degrees to radians
radians <- x * pi / 180
# Compute mean sine and cosine
mean_sin <- mean(sin(radians), na.rm = TRUE)
mean_cos <- mean(cos(radians), na.rm = TRUE)
# Calculate circular mean in radians and convert back to degrees
mean_angle <- atan2(mean_sin, mean_cos) * 180 / pi
# Ensure result is within 0–360 degrees
if (mean_angle < 0) mean_angle + 360 else mean_angle
}
# Run the sliding window analysis with daily mean wave direction data
output3 <- slidingwin(xvar = list(wave_direction = extreme_weather$wave_direction),
cdate = extreme_weather$date, # Daily climate record dates
bdate = breeding_data$date, # Biological event dates (pup count dates)
baseline = glm.nb(pup ~ 1,
link = "log",
data = breeding_data), # Baseline model: Negative Binomial
cohort = breeding_data$season, # Group by season
refday = c(28, 01), # Last day of monitoring across the seasons
cinterval = "day", # Daily resolution
range = c(94, 0), # Test all possible windows within the range
type = "absolute", # Absolute to each biological event date
stat = "circ_mean", # Circular mean of daily wave direction values over the window
func = c("lin", "quad") # Test linear and quadratic relationships
)
# Examine the tested combinations
datatable(output3$combos %>%
dplyr::mutate(WindowDuration = WindowOpen - WindowClose + 1),
options = list(pageLength = 10, orderClasses = TRUE)
)
# Combine output from actual and binary sliding window analyses
output <- merge_results(output1, output2)
# View merged model combinations with calculated window duration
datatable(output$combos %>%
dplyr::mutate(WindowDuration = WindowOpen - WindowClose + 1),
options = list(pageLength = 10, orderClasses = TRUE))
Before running the randomisation process, we need to identify the best-performing model for each extreme weather variable. This ensures that we are testing the most likely biologically relevant window against random expectation.
What we are doing here: For each weather variable (e.g., heavy rain, wet days), we extract the model with:
The lowest AIC value, and
A window duration longer than 14 days, to focus on ecologically meaningful timeframes.
These best models represent the strongest climate–breeding success relationships, and will be used for the randomisation test to assess whether the relationship is likely to have occurred by chance.
# Summary of the best model
summary(output[[21]]$BestModel)
##
## Call:
## glm.nb(formula = yvar ~ climate, data = modeldat, init.theta = 8.413941891,
## link = "log")
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.72705 0.07751 73.885 < 2e-16 ***
## climate -0.08742 0.02757 -3.171 0.00152 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(8.4139) family taken to be 1)
##
## Null deviance: 44.504 on 34 degrees of freedom
## Residual deviance: 35.655 on 33 degrees of freedom
## AIC: 418.72
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 8.41
## Std. Err.: 2.04
##
## 2 x log-likelihood: -412.719
# Calculate the median window from models within 95% confidence interval of the best model
medwin(output[[21]]$Dataset)
## $`Median Window Open`
## [1] 64
##
## $`Median Window Close`
## [1] 28
# Randomisation test to assess if the detected signal is likely by chance
cool_day_randwin <- randwin(repeats = 10,
window = "sliding",
xvar = list(cool_day_bi = extreme_weather_binary$cool_day),
cdate = extreme_weather_binary$date,
bdate = breeding_data$date,
baseline = glm.nb(pup ~ 1,
link = "log",
data = breeding_data),
cohort = breeding_data$season,
cinterval = "day",
refday = c(28, 01),
range = c(94, 0),
type = "absolute",
stat = c("sum"),
func = c("lin")
)
# Calculate the p-value using Climwin Metric C
climwin::pvalue(dataset = output[[21]]$Dataset,
datasetrand = cool_day_randwin[[1]],
metric = "C",
sample.size = sample_size
)
## [1] 0.7664887
# Plot sliding window and randomisation result
climwin::plotall(dataset = output[[21]]$Dataset,
datasetrand = cool_day_randwin[[1]],
bestmodel = output[[21]]$BestModel,
bestmodeldata = output[[21]]$BestModelData,
arrow = TRUE
)
# Summary of the best model
summary(output[[27]]$BestModel)
##
## Call:
## glm.nb(formula = yvar ~ climate, data = modeldat, init.theta = 7.454601792,
## link = "log")
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.63861 0.07013 80.404 <2e-16 ***
## climate -0.31699 0.15741 -2.014 0.044 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(7.4546) family taken to be 1)
##
## Null deviance: 39.585 on 34 degrees of freedom
## Residual deviance: 35.769 on 33 degrees of freedom
## AIC: 423.02
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 7.45
## Std. Err.: 1.80
##
## 2 x log-likelihood: -417.016
# Calculate the median window from models within 95% confidence interval of the best model
medwin(output[[27]]$Dataset)
## $`Median Window Open`
## [1] 68
##
## $`Median Window Close`
## [1] 28
# Randomisation test to assess if the detected signal is likely by chance
ewdp_randwin <- randwin(repeats = 10,
window = "sliding",
xvar = list(ewdp_bi = extreme_weather_binary$ewdp),
cdate = extreme_weather_binary$date,
bdate = breeding_data$date,
baseline = glm.nb(pup ~ 1,
link = "log",
data = breeding_data),
cohort = breeding_data$season,
cinterval = "day",
refday = c(28, 01),
range = c(94, 0),
type = "absolute",
stat = c("sum"),
func = c("lin")
)
# Calculate the p-value using Climwin Metric C
climwin::pvalue(dataset = output[[27]]$Dataset,
datasetrand = ewdp_randwin[[1]],
metric = "C",
sample.size = sample_size
)
## [1] 0.9347596
# Plot sliding window and randomisation result
climwin::plotall(dataset = output[[27]]$Dataset,
datasetrand = ewdp_randwin[[1]],
bestmodel = output[[27]]$BestModel,
bestmodeldata = output[[27]]$BestModelData,
arrow = TRUE
)
# Summary of the best model
summary(output[[13]]$BestModel)
##
## Call:
## glm.nb(formula = yvar ~ climate, data = modeldat, init.theta = 12.03499633,
## link = "log")
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.867368 0.077169 76.033 < 2e-16 ***
## climate -0.005666 0.001059 -5.352 8.72e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(12.035) family taken to be 1)
##
## Null deviance: 62.729 on 34 degrees of freedom
## Residual deviance: 35.479 on 33 degrees of freedom
## AIC: 406.34
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 12.03
## Std. Err.: 2.98
##
## 2 x log-likelihood: -400.34
# Calculate the median window from models within 95% confidence interval of the best model
medwin(output[[13]]$Dataset)
## $`Median Window Open`
## [1] 64
##
## $`Median Window Close`
## [1] 24
# Randomisation test to assess if the detected signal is likely by chance
extreme_wave_energy_randwin <- randwin(repeats = 10,
window = "sliding",
xvar = list(extreme_wave_energy_day = extreme_weather$extreme_wave_energy_day),
cdate = extreme_weather$date,
bdate = breeding_data$date,
baseline = glm.nb(pup ~ 1,
link = "log",
data = breeding_data),
cohort = breeding_data$season,
cinterval = "day",
refday = c(28, 01),
range = c(94, 0),
type = "absolute",
stat = c("sum"),
func = c("lin")
)
# Calculate the p-value using Climwin Metric C
climwin::pvalue(dataset = output[[13]]$Dataset,
datasetrand = extreme_wave_energy_randwin[[1]],
metric = "C",
sample.size = sample_size
)
## [1] 0.001761849
# Plot sliding window and randomisation result
climwin::plotall(dataset = output[[13]]$Dataset,
datasetrand = extreme_wave_energy_randwin[[1]],
bestmodel = output[[13]]$BestModel,
bestmodeldata = output[[13]]$BestModelData,
arrow = TRUE
)
# k-fold cross-validation allows to improve the accuracy of the R^2 estimate as R^2 estimates using slidingwin can be biased at low sample size and/or effect size
extreme_wave_energy_k_fold <- slidingwin(k = 10,
xvar = list(extreme_wave_energy_day = extreme_weather$extreme_wave_energy_day),
cdate = extreme_weather$date,
bdate = breeding_data$date,
baseline = glm.nb(pup ~ 1,
link = "log",
data = breeding_data),
cohort = breeding_data$season,
cinterval = "day",
refday = c(28, 01),
range = c(94, 0),
type = "absolute",
stat = c("sum"),
func = c("lin")
)
# Summary of the best model from k-fold cross-validation
summary(extreme_wave_energy_k_fold[[1]]$BestModel)
##
## Call:
## glm.nb(formula = yvar ~ climate, data = modeldat, init.theta = 11.7882937,
## link = "log")
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.823656 0.072118 80.752 < 2e-16 ***
## climate -0.005392 0.001028 -5.244 1.57e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(11.7883) family taken to be 1)
##
## Null deviance: 61.504 on 34 degrees of freedom
## Residual deviance: 35.481 on 33 degrees of freedom
## AIC: 407.04
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 11.79
## Std. Err.: 2.92
##
## 2 x log-likelihood: -401.041
# Calculate R² for the best model from k-fold cross-validation
r2(extreme_wave_energy_k_fold[[1]]$BestModel)
## # R2 for Generalized Linear Regression
## Nagelkerke's R2: 0.634
# Summary of the best model
summary(output3[[2]]$BestModel)
##
## Call:
## glm.nb(formula = yvar ~ climate + I(climate^2), data = modeldat,
## init.theta = 10.29065436, link = "log")
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.362e+02 3.706e+01 -3.674 0.000239 ***
## climate 8.433e-01 2.232e-01 3.778 0.000158 ***
## I(climate^2) -1.252e-03 3.356e-04 -3.730 0.000191 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(10.2907) family taken to be 1)
##
## Null deviance: 54.016 on 34 degrees of freedom
## Residual deviance: 35.519 on 32 degrees of freedom
## AIC: 413.69
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 10.29
## Std. Err.: 2.52
##
## 2 x log-likelihood: -405.687
# Calculate the median window from models within 95% confidence interval of the best model
medwin(output3[[2]]$Dataset)
## $`Median Window Open`
## [1] 62
##
## $`Median Window Close`
## [1] 29
# Randomisation test to assess if the detected signal is likely by chance
wave_direction_randwin <- randwin(repeats = 10,
window = "sliding",
xvar = list(wave_direction = extreme_weather$wave_direction),
cdate = extreme_weather$date,
bdate = breeding_data$date,
baseline = glm.nb(pup ~ 1,
link = "log",
data = breeding_data),
cohort = breeding_data$season,
cinterval = "day",
range = c(94, 0),
refday = c(28, 01),
type = "absolute",
stat = "circ_mean",
func = c("quad")
)
# Calculate the p-value using Climwin Metric C
climwin::pvalue(dataset = output3[[2]]$Dataset,
datasetrand = wave_direction_randwin[[1]],
metric = "C",
sample.size = sample_size
)
## [1] 0.7399056
# Plot sliding window and randomisation result
climwin::plotall(dataset = output3[[2]]$Dataset,
datasetrand = wave_direction_randwin[[1]],
bestmodel = output3[[2]]$BestModel,
bestmodeldata = output3[[2]]$BestModelData,
arrow = TRUE
)
Check for colinearity between the climate signals.
# Add identified significant climate signals to the original breeding data
breeding_data <- breeding_data %>%
dplyr::mutate(extreme_wave_energy_signal = output[[13]]$BestModelData$climate)
# Final model
final_model <- glm.nb(pup ~ 1 + extreme_wave_energy_signal,
link = "log",
data = breeding_data)
# Summary of the final model
summary(final_model)
##
## Call:
## glm.nb(formula = pup ~ 1 + extreme_wave_energy_signal, data = breeding_data,
## link = "log", init.theta = 12.03499633)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 5.867368 0.077169 76.033 < 2e-16 ***
## extreme_wave_energy_signal -0.005666 0.001059 -5.352 8.72e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(12.035) family taken to be 1)
##
## Null deviance: 62.729 on 34 degrees of freedom
## Residual deviance: 35.479 on 33 degrees of freedom
## AIC: 406.34
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 12.03
## Std. Err.: 2.98
##
## 2 x log-likelihood: -400.34
# Creates scaled residuals by simulating from the fitted model
simulationOutput <- simulateResiduals(fittedModel = final_model, plot = TRUE)
# Test for over/underdispersion
testDispersion(simulationOutput, plot = TRUE)
##
## DHARMa nonparametric dispersion test via sd of residuals fitted vs.
## simulated
##
## data: simulationOutput
## dispersion = 0.97319, p-value = 1
## alternative hypothesis: two.sided
# Plot the fitted effect
effect_plot(final_model, pred = extreme_wave_energy_signal, interval = TRUE, plot.points = TRUE,
main.title = "Relationship between Pup count and Extreme wave energy",
x.label = "No. of extreme wave energy days",
y.label = "Pup count",
colors = c("#7B8FA1"),
line.colors = c("#0D92F4"),
line.thickness = 1,
point.size = 2.5,
point.alpha = 0.5,
rug = TRUE) +
drop_gridlines() +
theme_classic()
# Save the final model data with climate signals
write_xlsx(breeding_data, "Output_data/AUFS/AUFS_Tenth_signal.xlsx")
# Save 95% confidence set for each climate signal
saveRDS(output[[13]]$Dataset, "Output_data/AUFS/Tenth_extreme_wave_energy_dataset.rds")
# Checking trends in cumulative intensity, frequency, and duration for each climate signal (using row data)
# Only interested during identified median climate window (95% CI from the best model) from reference date
# Fixed reference date: 14 April each year
ref_day <- "01-28"
# Create yearly open/close window lookup
window_dates <- data.frame(
season = 1979:2024,
window_open = as.Date(paste0(1979:2024, "-", ref_day)) - 64,
window_close = as.Date(paste0(1979:2024, "-", ref_day)) - 24
)
# Add month-day format for easy matching
window_dates <- window_dates %>%
mutate(
window_open_md = format(window_open, "%m-%d"),
window_close_md = format(window_close, "%m-%d")
)
# Summarise extreme events per season within the fixed median window
summarised_extreme_waves <- extreme_weather %>%
filter(season >= 1979 & season <= 2024) %>%
# Join to add window dates per season
left_join(window_dates, by = "season") %>%
# Filter rows within the per-season open/close window
filter(
format(date, "%m-%d") >= window_open_md | format(date, "%m-%d") <= window_close_md
) %>%
group_by(season) %>%
arrange(season, date) %>%
summarise(
# Extract daily index values for the current season
values = list(extreme_wave_energy_day),
# Identify run lengths of contiguous non-zero values
rle_obj = list(rle(values[[1]] > 0)),
# Compute event-level summaries
extreme_event_summaries = list({
vals <- values[[1]]
runs <- rle_obj[[1]]
if (any(runs$values)) {
starts <- cumsum(c(1, head(runs$lengths, -1)))[runs$values]
lengths <- runs$lengths[runs$values]
event_sums <- mapply(function(start, len) sum(vals[start:(start + len - 1)], na.rm = TRUE),
starts, lengths)
tibble::tibble(
event_intensity = event_sums,
event_duration = lengths
)
} else {
tibble::tibble(event_intensity = numeric(0), event_duration = numeric(0))
}
}),
# Derived summaries
extreme_wave_energy_frequency = nrow(extreme_event_summaries[[1]]),
extreme_wave_energy_cum_intensity = sum(extreme_event_summaries[[1]]$event_intensity, na.rm = TRUE),
extreme_wave_energy_tot_duration = sum(extreme_event_summaries[[1]]$event_duration, na.rm = TRUE),
.groups = "drop"
) %>%
dplyr::select(season, extreme_wave_energy_frequency, extreme_wave_energy_cum_intensity, extreme_wave_energy_tot_duration)
############################## Write function to run Durbin-Watson test ##############################
# Run Durbin-Watson test for autocorrelation
run_dw_test <- function(data, vars) {
results <- lapply(vars, function(var) {
formula_obj <- as.formula(paste(var, "~ season"))
model <- lm(formula_obj, data = data)
dw <- dwtest(model)
data.frame(
variable = var,
DW_statistic = round(dw$statistic[[1]], 3),
p_value = round(dw$p.value, 4),
autocorrelation = ifelse(dw$p.value < 0.05,
ifelse(dw$statistic < 2, "positive", "negative"),
"none"),
row.names = NULL
)
})
do.call(rbind, results)
}
# Check for autocorrelation in breeding success
run_dw_test(breeding_data, c("pup"))
## variable DW_statistic p_value autocorrelation
## 1 pup 2.182 0.641 none
# Check for autocorrelation in extreme weather variables
run_dw_test(summarised_extreme_waves, c("extreme_wave_energy_cum_intensity", "extreme_wave_energy_frequency", "extreme_wave_energy_tot_duration"))
## variable DW_statistic p_value autocorrelation
## 1 extreme_wave_energy_cum_intensity 2.072 0.5355 none
## 2 extreme_wave_energy_frequency 1.967 0.3932 none
## 3 extreme_wave_energy_tot_duration 2.136 0.6214 none
# Function to fit glmmTMB without autocorrelation
fit_glmmTMB_no_ar1 <- function(data, variables_families) {
for (entry in variables_families) {
var <- entry$var
fam <- entry$family
model <- tryCatch({
glmmTMB::glmmTMB(as.formula(paste(var, "~ season")), data = data, family = fam)
}, error = function(e) {
message("Could not fit model for ", var, ": ", e$message)
return(NULL)
})
if (is.null(model)) next
# Summary
cat("\n============================\n")
cat("Model summary for:", var, "\n")
print(summary(model))
# Extract p-value safely
coefs <- summary(model)$coefficients$cond
season_row <- grep("^season$", rownames(coefs))
p <- coefs[season_row, "Pr(>|z|)"]
# Base plot
p_plot <- ggplot(data, aes(x = season, y = .data[[var]])) +
geom_point(shape = 19, size = 2.5, color = "#7B8FA1") +
geom_line(linewidth = 0.5, color = "#7B8FA1", na.rm = TRUE)
# Add model trend and annotation only if p < 0.05
if (!is.na(p) && p < 0.05) {
# Predictions
newdata <- data.frame(season = seq(min(data$season), max(data$season)))
preds <- predict(model, newdata = newdata, se.fit = TRUE, type = "response")
# Annotation
p_label <- if (p < 0.001) "p < 0.001" else paste0("p = ", round(p, 3))
n <- nrow(data)
ann_text <- paste0("n = ", n, "\n", p_label)
p_plot <- p_plot +
geom_ribbon(data = data.frame(
season = newdata$season,
ymin = preds$fit - 1.96 * preds$se.fit,
ymax = preds$fit + 1.96 * preds$se.fit
), aes(x = season, ymin = ymin, ymax = ymax),
fill = "#B6D0E2", alpha = 0.4, inherit.aes = FALSE) +
geom_line(data = data.frame(
season = newdata$season,
fit = preds$fit
), aes(x = season, y = fit),
color = "#0D92F4", linewidth = 0.75, inherit.aes = FALSE) +
annotate("text",
x = Inf, y = Inf,
hjust = 1.1, vjust = 1.2,
size = 3, color = "#1a1a1a",
label = ann_text)
}
# Final styling and print
p_plot <- p_plot +
scale_x_continuous(breaks = scales::pretty_breaks(n = 5),
expand = ggplot2::expansion(mult = c(0.02, 0.02))) +
scale_y_continuous(breaks = scales::pretty_breaks(n = 5),
expand = ggplot2::expansion(mult = c(0.02, 0.02))) +
labs(
title = paste("Trend in", gsub("_", " ", var)),
x = "Season", y = var
) +
theme_classic(base_family = "Helvetica") +
theme(
plot.title = element_text(size = 10, margin = margin(b = 1), colour = "#0d0d0d"),
axis.title = element_text(size = 9, colour = "#0d0d0d"),
axis.text = element_text(size = 8),
axis.ticks.length = unit(1, "pt"),
axis.ticks = element_line(linewidth = 0.5),
axis.line = element_blank(),
panel.border = element_rect(color = "#1a1a1a", fill = NA, size = 0.25)
)
print(p_plot)
}
}
# Fit glmmTMB and plot
fit_glmmTMB_no_ar1(breeding_data,
variables_families = list(list(var = "pup",
family = nbinom2())
)
)
##
## ============================
## Model summary for: pup
## Family: nbinom2 ( log )
## Formula: pup ~ season
## Data: data
##
## AIC BIC logLik -2*log(L) df.resid
## 422.8 427.5 -208.4 416.8 32
##
##
## Dispersion parameter for nbinom2 family (): 7.5
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 30.690309 12.507730 2.454 0.0141 *
## season -0.012515 0.006232 -2.008 0.0446 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Fit glmmTMB and plot
fit_glmmTMB_no_ar1(summarised_extreme_waves %>% dplyr::mutate(
extreme_wave_energy_cum_intensity = ifelse(extreme_wave_energy_cum_intensity == 0, 1e-6, extreme_wave_energy_cum_intensity)),
variables_families = list(
list(var = "extreme_wave_energy_frequency", family = poisson()),
list(var = "extreme_wave_energy_cum_intensity", family = Gamma(link = "log")),
list(var = "extreme_wave_energy_tot_duration", family = nbinom2())
)
)
##
## ============================
## Model summary for: extreme_wave_energy_frequency
## Family: poisson ( log )
## Formula: extreme_wave_energy_frequency ~ season
## Data: data
##
## AIC BIC logLik -2*log(L) df.resid
## 162.2 165.8 -79.1 158.2 44
##
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.940803 14.445845 -0.204 0.839
## season 0.001900 0.007216 0.263 0.792
##
## ============================
## Model summary for: extreme_wave_energy_cum_intensity
## Family: Gamma ( log )
## Formula: extreme_wave_energy_cum_intensity ~ season
## Data: data
##
## AIC BIC logLik -2*log(L) df.resid
## 519.7 525.2 -256.9 513.7 43
##
##
## Dispersion estimate for Gamma family (sigma^2): 1.59
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.484355 26.922197 0.315 0.753
## season -0.001905 0.013451 -0.142 0.887
##
## ============================
## Model summary for: extreme_wave_energy_tot_duration
## Family: nbinom2 ( log )
## Formula: extreme_wave_energy_tot_duration ~ season
## Data: data
##
## AIC BIC logLik -2*log(L) df.resid
## 217.2 222.6 -105.6 211.2 43
##
##
## Dispersion parameter for nbinom2 family (): 9.39
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.4542814 12.7875569 0.270 0.787
## season -0.0009891 0.0063892 -0.155 0.877
# Standardise each numeric column to have mean zero and standard deviation of one
breeding_data_standardised <- breeding_data %>%
mutate(across(where(is.numeric), ~ ( . - mean(.) ) / sd(.)))
# Extract individual regressions from SEM
m1 <- lm(extreme_wave_energy_signal ~ season, data = breeding_data_standardised)
m2 <- lm(pup ~ extreme_wave_energy_signal + season, data = breeding_data_standardised)
# Summarise individual regressions
summary(m1)
##
## Call:
## lm(formula = extreme_wave_energy_signal ~ season, data = breeding_data_standardised)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.3654 -0.6482 -0.1333 0.6642 2.1770
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.708e-15 1.571e-01 0.000 1.0000
## season 4.014e-01 1.594e-01 2.518 0.0168 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9297 on 33 degrees of freedom
## Multiple R-squared: 0.1611, Adjusted R-squared: 0.1357
## F-statistic: 6.339 on 1 and 33 DF, p-value: 0.01684
summary(m2)
##
## Call:
## lm(formula = pup ~ extreme_wave_energy_signal + season, data = breeding_data_standardised)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83431 -0.45184 0.06228 0.34327 1.54933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.101e-16 1.330e-01 0.000 1.000000
## extreme_wave_energy_signal -6.124e-01 1.474e-01 -4.156 0.000226 ***
## season -7.430e-02 1.474e-01 -0.504 0.617560
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.787 on 32 degrees of freedom
## Multiple R-squared: 0.4171, Adjusted R-squared: 0.3806
## F-statistic: 11.45 on 2 and 32 DF, p-value: 0.0001777
# Check model assumptions for individual regressions
par(mfrow = c(2, 2))
plot(m1)
plot(m2)
# Define the SEM model
sem_model <- '
# Regress breeding success on climate signals and time
pup ~ extreme_wave_energy_signal + season
# Regress climate signals on time
extreme_wave_energy_signal ~ season
# Define intercept for breeding success
pup ~ 1
'
# Fit the SEM model with non-parametric bootstrapping with 1000 iterations
set.seed(666)
fit <- lavaan::sem(sem_model, data = breeding_data_standardised, se = "bootstrap", bootstrap = 1000)
# Summarise the model fit
summary(fit, fit.measures = TRUE, rsquare=TRUE)
## lavaan 0.6-20 ended normally after 1 iteration
##
## Estimator ML
## Optimization method NLMINB
## Number of model parameters 7
##
## Number of observations 35
##
## Model Test User Model:
##
## Test statistic 0.000
## Degrees of freedom 0
##
## Model Test Baseline Model:
##
## Test statistic 25.040
## Degrees of freedom 3
## P-value 0.000
##
## User Model versus Baseline Model:
##
## Comparative Fit Index (CFI) 1.000
## Tucker-Lewis Index (TLI) 1.000
##
## Loglikelihood and Information Criteria:
##
## Loglikelihood user model (H0) -85.791
## Loglikelihood unrestricted model (H1) -85.791
##
## Akaike (AIC) 185.583
## Bayesian (BIC) 196.470
## Sample-size adjusted Bayesian (SABIC) 174.613
##
## Root Mean Square Error of Approximation:
##
## RMSEA 0.000
## 90 Percent confidence interval - lower 0.000
## 90 Percent confidence interval - upper 0.000
## P-value H_0: RMSEA <= 0.050 NA
## P-value H_0: RMSEA >= 0.080 NA
##
## Standardized Root Mean Square Residual:
##
## SRMR 0.000
##
## Parameter Estimates:
##
## Standard errors Bootstrap
## Number of requested bootstrap draws 1000
## Number of successful bootstrap draws 1000
##
## Regressions:
## Estimate Std.Err z-value P(>|z|)
## pup ~
## extrm_wv_nrgy_ -0.612 0.125 -4.916 0.000
## season -0.074 0.114 -0.649 0.516
## extreme_wave_energy_signal ~
## season 0.401 0.156 2.568 0.010
##
## Intercepts:
## Estimate Std.Err z-value P(>|z|)
## .pup -0.000 0.130 -0.000 1.000
## .extrm_wv_nrgy_ 0.000 0.158 0.000 1.000
##
## Variances:
## Estimate Std.Err z-value P(>|z|)
## .pup 0.566 0.128 4.432 0.000
## .extrm_wv_nrgy_ 0.815 0.159 5.113 0.000
##
## R-Square:
## Estimate
## pup 0.417
## extrm_wv_nrgy_ 0.161
# Extract parameter estimates and standard errors using parameterEstimates()
params <- lavaan::parameterEstimates(fit)
relationships <- params[params$op == "~", ]
# Create a data frame with the desired results
results <- data.frame(
SEM_int = params$est[params$lhs == "pup" & params$op == "~1"],
SEM_beta_wave_energy = params$est[params$lhs == "pup" & params$rhs == "extreme_wave_energy_signal"],
SEM_beta_season = params$est[params$lhs == "pup" & params$rhs == "season"],
SEM_SE_wave_energy = params$se[params$lhs == "pup" & params$rhs == "extreme_wave_energy_signal"],
SEM_SE_season = params$se[params$lhs == "pup" & params$rhs == "season"],
Yr_beta_wave_energy = params$est[params$lhs == "extreme_wave_energy_signal" & params$rhs == "season"],
Yr_SE_wave_energy = params$se[params$lhs == "extreme_wave_energy_signal" & params$rhs == "season"]
) %>%
mutate(
EWE_pathway = SEM_beta_wave_energy * Yr_beta_wave_energy,
Total_effect_season = SEM_beta_season + EWE_pathway,
change_due_to_EWE_pathway = (EWE_pathway / Total_effect_season) * 100
)
# Print results
print(results)
## SEM_int SEM_beta_wave_energy SEM_beta_season SEM_SE_wave_energy
## 1 -6.219873e-16 -0.6123916 -0.0743034 0.1245718
## SEM_SE_season Yr_beta_wave_energy Yr_SE_wave_energy EWE_pathway
## 1 0.1144614 0.4014271 0.1563107 -0.2458306
## Total_effect_season change_due_to_EWE_pathway
## 1 -0.320134 76.7899
# Add arrow colors, line styles, and significance stars based on significance and direction
relationships <- relationships %>%
mutate(
color = case_when(
as.numeric(est) > 0 & as.numeric(pvalue) < 0.05 ~ "#0079FF", # Positive and significant = blue
as.numeric(est) < 0 & as.numeric(pvalue) < 0.05 ~ "#FF2929", # Negative and significant = red
TRUE ~ "#B7B7B7" # Non-significant = grey
),
style = ifelse(as.numeric(pvalue) < 0.05, "solid", "dashed"), # Significant = solid, Non-significant = dashed
unit = case_when(
lhs == "pup" & rhs == "season" ~ "no./season",
lhs == "pup" & rhs == "extreme_wave_energy_signal" ~ "no./kW m⁻¹",
lhs == "extreme_wave_energy_signal" ~ "kW m⁻¹/season",
TRUE ~ "unit" # Default unit for other relationships
),
stars = case_when( # Add stars based on significance level
pvalue < 0.001 ~ "***",
pvalue < 0.01 ~ "**",
pvalue < 0.05 ~ "*",
TRUE ~ ""
)
)
# Extract R² values
rsq <- lavaan::inspect(fit, "rsquare")
node_labels <- c(
paste0("Pup count\nr² = ", round(rsq["pup"], 2)),
paste0("Extreme wave energy\nr² = ", round(rsq["extreme_wave_energy_signal"], 2)),
"Season"
)
# Construct edges for the plot, adding significance stars in the label
edges <- apply(relationships, 1, function(row) {
paste0(
"\"", row["rhs"], "\" -> \"", row["lhs"], "\" ", # Add double quotes to handle special characters in node names
"[color=\"", row["color"], "\", style=", row["style"],
", label=\"", round(as.numeric(row["est"]), 2), row["stars"], "\n(", row["unit"], ")\", fontsize=10]"
)
})
# Generate DiagrammeR graph with rectangular nodes and custom labels
graph_code <- paste0(
"digraph SEM {",
"\nnode [shape=rectangle, style=filled, fillcolor=white];",
"\n", paste(edges, collapse = "\n"),
"\n", paste0(
"\"pup\" [label=\"Pup count\\nr² = ", round(rsq["pup"], 2), "\", fontsize=12];",
"\"extreme_wave_energy_signal\" [label=\"Extreme wave energy\\nr² = ", round(rsq["extreme_wave_energy_signal"], 2), "\", fontsize=12];",
"\"season\" [label=\"Season\", fontsize=12];"
),
"\n}"
)
# Render the DiagrammeR graph
DiagrammeR::grViz(graph_code)