# Load libraries for data processing, modelling, and visualisation
library(tidyverse)
library(openxlsx2)
library(MASS)
library(corrplot)
library(DT)
library(climwin)
library(jtools)
library(DHARMa)
library(lavaan)
library(DiagrammeR)
library(lmtest)
library(glmmTMB)
library(scales)

1 Load and Process Data

# Load daily extreme weather event (EWE) data.
# Columns 4-20 are selected by position and their missing values are set to 0
# (presumably these are the 17 EWE variables analysed below - confirm against the workbook).
extreme_weather <- wb_to_df("Breeding_colony_ewes/Judgement_ewes.xlsx") %>%
                   dplyr::mutate(across(4:20, ~ ifelse(is.na(.), 0, .)))  
                   

# Create binary version: 1 = event occurred (non-zero value), 0 = no event.
# NOTE(review): this is derived from `extreme_weather`, whose columns 4-20 have
# already had NA replaced by 0 above, so the NA branch below never fires and the
# binary columns contain only 0 and 1.
extreme_weather_binary <- extreme_weather %>%
                          dplyr::mutate(across(4:20, ~ ifelse(!is.na(.) & . != 0, 1, ifelse(is.na(.), NA, 0))))


# Note:
# Missing values in the extreme weather data are replaced with zero. This is critical because slidingwin's built-in options for missing climate data (cmissing = "method1" or "method2") fill gaps with means of other observations, which is not suitable when assessing extreme values. We are specifically interested in whether an extreme event occurred, not in average conditions.
# Only a small number of missing values are present in the dataset, and replacing them with zero (i.e. treating those days as having no extreme event) ensures consistency without introducing bias in this context.


# Load pup count data for the Judgement Rocks colony (sheet "Judgement_Rocks" of the AUFS workbook)
# NOTE(review): this comment previously said "Short-tailed Shearwater", which does not match the AUFS file or the `pup` column - confirm the species.
breeding_data <- wb_to_df("Breeding_data/AUFS.xlsx", sheet = "Judgement_Rocks") %>%
                 dplyr::filter(!is.na(pup))  # Remove seasons without a pup count


# Record sample size (number of rows with a pup count); passed to climwin::pvalue() later on
sample_size <- nrow(breeding_data)


# Assess normality of the response variable (Shapiro-Wilk test)
# Null hypothesis: the data are normally distributed, so p > 0.05 means no
# significant deviation from normality was detected.
shapiro.test(breeding_data$pup)
## 
##  Shapiro-Wilk normality test
## 
## data:  breeding_data$pup
## W = 0.93074, p-value = 0.224
# Histogram of pup counts drawn on the density scale, so that the kernel
# density curve added below shares the same y-axis
hist(breeding_data$pup,
     main = "Histogram of Pup Count",
     xlab = "Pup Count",
     col = "#a6d6fa",
     border = "white",
     probability = TRUE  # Spelled out in full; `prob` relied on partial argument matching
)

# Overlay kernel density estimate
lines(density(breeding_data$pup, na.rm = TRUE), col = "#0D92F4", lwd = 2)

# Normal Q-Q plot: sample quantiles of the pup counts against theoretical
# normal quantiles, with a red reference line
ggplot(data = breeding_data, mapping = aes(sample = pup)) +
  stat_qq() +
  stat_qq_line(colour = "red") +
  ggtitle("Q-Q Plot of Pup count") +
  xlab("Theoretical Quantiles") +
  ylab("Sample Quantiles") +
  theme_classic()

# Calculate mean and variance of the pup counts
# (a Poisson model assumes the two are equal; a much larger variance indicates overdispersion)
mean(breeding_data$pup)
## [1] 2107.941
var(breeding_data$pup)
## [1] 328240.2

The Shapiro–Wilk test did not indicate a significant deviation from normality (W = 0.93074, p = 0.224); therefore, we fail to reject the null hypothesis that the data are normally distributed. Visual assessments, including the histogram and Q–Q plot, also support the assumption that the data approximate a normal distribution.

Furthermore, the variance (328240.2) is far larger than the mean (2107.941), indicating overdispersion and suggesting that a Poisson distribution—where the mean and variance are expected to be equal—is not appropriate for this count dataset. Therefore, we will use a Negative Binomial distribution to model the data.

2 Sliding window analysis

2.1 Actual above threshold values

# Run the sliding window analysis using actual (non-binary) values
# Each of the 17 EWE variables in `xvar` is tested against the intercept-only
# baseline model; results are stored in list order (warm_day = 1, ..., extreme_wind_chill_day = 17).
output1 <- slidingwin(xvar = list(warm_day                 = extreme_weather$warm_day,
                                  warm_night               = extreme_weather$warm_night,
                                  heatwave                 = extreme_weather$heatwave,
                                  cool_day                 = extreme_weather$cool_day,
                                  cool_night               = extreme_weather$cool_night,
                                  coldwave                 = extreme_weather$coldwave,
                                  wet_day                  = extreme_weather$wet_day,
                                  heavy_rain_day           = extreme_weather$heavy_rain_day,
                                  very_heavy_rain_day      = extreme_weather$very_heavy_rain_day,
                                  ewdp                     = extreme_weather$ewdp,
                                  vwdp                     = extreme_weather$vwdp,
                                  extreme_wind_day         = extreme_weather$extreme_wind_day,
                                  extreme_wave_energy_day  = extreme_weather$extreme_wave_energy_day,
                                  extreme_wbt_day          = extreme_weather$extreme_wbt_day,
                                  extreme_wbgt_day         = extreme_weather$extreme_wbgt_day,
                                  extreme_at_day           = extreme_weather$extreme_at_day,
                                  extreme_wind_chill_day   = extreme_weather$extreme_wind_chill_day),
                      cdate     = extreme_weather$date,                       # Daily climate record dates
                      bdate     = breeding_data$date,                         # Biological event dates (pup count dates)
                      baseline  = glm.nb(pup ~ 1,
                                         link = "log",
                                         data = breeding_data),               # Baseline model: intercept-only Negative Binomial (log link)
                      cohort    = breeding_data$season,                       # Group records by breeding season
                      refday    = c(29, 01),                                  # Reference day as c(day, month) = 29 January (last day of monitoring across the seasons)
                      cinterval = "day",                                      # Daily resolution
                      range     = c(95, 0),                                   # Test all windows lying between 95 and 0 days before the reference day
                      type      = "absolute",                                 # Windows are counted back from the fixed refday, not from each record's own bdate
                      stat      = "sum",                                      # Sum of EWE values over the window
                      func      = "lin"                                       # Test linear relationships
                      )

2.2 Binary above threshold values

# Run the sliding window analysis using binary event indicators
# Same settings as the actual-value analysis, but every xvar is a 0/1 indicator,
# so stat = "sum" counts the number of event days inside each window.
# Labels carry a `_bi` suffix so they stay distinguishable once merged with output1.
output2 <- slidingwin(xvar = list(warm_day_bi                 = extreme_weather_binary$warm_day,
                                  warm_night_bi               = extreme_weather_binary$warm_night,
                                  heatwave_bi                 = extreme_weather_binary$heatwave,
                                  cool_day_bi                 = extreme_weather_binary$cool_day,
                                  cool_night_bi               = extreme_weather_binary$cool_night,
                                  coldwave_bi                 = extreme_weather_binary$coldwave,   # Label fixed (was misspelled "codlwave_bi")
                                  wet_day_bi                  = extreme_weather_binary$wet_day,
                                  heavy_rain_day_bi           = extreme_weather_binary$heavy_rain_day,
                                  very_heavy_rain_day_bi      = extreme_weather_binary$very_heavy_rain_day,
                                  ewdp_bi                     = extreme_weather_binary$ewdp,
                                  vwdp_bi                     = extreme_weather_binary$vwdp,
                                  extreme_wind_day_bi         = extreme_weather_binary$extreme_wind_day,
                                  extreme_wave_energy_day_bi  = extreme_weather_binary$extreme_wave_energy_day,
                                  extreme_wbt_day_bi          = extreme_weather_binary$extreme_wbt_day,
                                  extreme_wbgt_day_bi         = extreme_weather_binary$extreme_wbgt_day,
                                  extreme_at_day_bi           = extreme_weather_binary$extreme_at_day,
                                  extreme_wind_chill_day_bi   = extreme_weather_binary$extreme_wind_chill_day),
                      cdate     = extreme_weather_binary$date,                # Daily climate record dates
                      bdate     = breeding_data$date,                         # Biological event dates (pup count dates)
                      baseline  = glm.nb(pup ~ 1,
                                         link = "log",
                                         data = breeding_data),               # Baseline model: intercept-only Negative Binomial (log link)
                      cohort    = breeding_data$season,                       # Group records by breeding season
                      refday    = c(29, 01),                                  # Reference day as c(day, month) = 29 January (last day of monitoring across the seasons)
                      cinterval = "day",                                      # Daily resolution
                      range     = c(95, 0),                                   # Test all windows lying between 95 and 0 days before the reference day
                      type      = "absolute",                                 # Windows are counted back from the fixed refday, not from each record's own bdate
                      stat      = "sum",                                      # Number of event days in the window (sum of 0/1 indicators)
                      func      = "lin"                                       # Test linear relationships
                      )

2.3 Wave direction

# Calculate the circular mean of directional data (in degrees)
# This function returns the mean direction of angular values (e.g. wave or wind direction), accounting for circularity (i.e. wrap-around at 360°).
#
# Arguments:
#   x  Numeric vector of directions in degrees; NA values are ignored.
# Returns:
#   The mean direction in degrees (0–360), or NA_real_ when `x` is empty or
#   contains no non-missing values.
circ_mean <- function(x) {
  # Convert degrees to radians
  radians <- x * pi / 180

  # Average the unit-vector components rather than the raw angles
  mean_sin <- mean(sin(radians), na.rm = TRUE)
  mean_cos <- mean(cos(radians), na.rm = TRUE)

  # Direction of the mean vector, converted back to degrees
  mean_angle <- atan2(mean_sin, mean_cos) * 180 / pi

  # Empty or all-NA input yields NaN here; return NA instead of letting the
  # comparison below stop with "missing value where TRUE/FALSE needed"
  if (is.na(mean_angle)) {
    return(NA_real_)
  }

  # atan2() gives (-180, 180]; shift negative angles into the 0–360 range
  if (mean_angle < 0) mean_angle + 360 else mean_angle
}



# Run the sliding window analysis with daily mean wave direction data
# Both a linear and a quadratic model are fitted, so the result holds two model sets (1 = "lin", 2 = "quad").
# NOTE(review): stat = "circ_mean" is presumably resolved by name to the circ_mean() helper defined in this script - confirm.
# NOTE(review): the aggregated direction is in degrees (0-360) and enters the model as an ordinary numeric covariate - confirm this is acceptable near the 0/360 wrap-around.
output3 <- slidingwin(xvar      = list(wave_direction = extreme_weather$wave_direction),
                      cdate     = extreme_weather$date,                       # Daily climate record dates
                      bdate     = breeding_data$date,                         # Biological event dates (pup count dates)
                      baseline  = glm.nb(pup ~ 1,
                                         link = "log",
                                         data = breeding_data),               # Baseline model: intercept-only Negative Binomial (log link)
                      cohort    = breeding_data$season,                       # Group records by breeding season
                      refday    = c(29, 01),                                  # Reference day as c(day, month) = 29 January (last day of monitoring across the seasons)
                      cinterval = "day",                                      # Daily resolution
                      range     = c(95, 0),                                   # Test all windows lying between 95 and 0 days before the reference day
                      type      = "absolute",                                 # Windows are counted back from the fixed refday, not from each record's own bdate
                      stat      = "circ_mean",                                # Circular mean of daily wave direction values over the window
                      func      = c("lin", "quad")                            # Test linear and quadratic relationships
                      )
                

# Show the tested wave-direction combinations, adding each window's length in days
output3$combos %>%
  dplyr::mutate(WindowDuration = WindowOpen - WindowClose + 1) %>%
  datatable(options = list(pageLength = 10, orderClasses = TRUE))

2.4 Merge the results

# Pool the actual-value (output1) and binary (output2) sliding window results
output <- merge_results(output1, output2)


# Tabulate the merged model combinations together with each window's duration in days
output$combos %>%
  dplyr::mutate(WindowDuration = WindowOpen - WindowClose + 1) %>%
  datatable(options = list(pageLength = 10, orderClasses = TRUE))

2.5 Check best model for each variable

Before running the randomisation process, we need to identify the best-performing model for each extreme weather variable. This ensures that we are testing the most likely biologically relevant window against random expectation.

What we are doing here: For each weather variable (e.g., heavy rain, wet days), we extract the model with:

The lowest AIC value, and

A window duration longer than 14 days, to focus on ecologically meaningful timeframes.

These best models represent the strongest climate–breeding success relationships, and will be used for the randomisation test to assess whether the relationship is likely to have occurred by chance.

2.5.1 Cool night

# Summary of the best model
# output[[5]] is the 5th xvar of output1, i.e. cool_night (actual values) -
# assuming merge_results() keeps output1's 17 model sets first, followed by output2's.
summary(output[[5]]$BestModel)
## 
## Call:
## glm.nb(formula = yvar ~ climate, data = modeldat, init.theta = 22.15222258, 
##     link = "log")
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.53751    0.06314 119.382  < 2e-16 ***
## climate     -0.07484    0.02597  -2.882  0.00395 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(22.1522) family taken to be 1)
## 
##     Null deviance: 26.200  on 16  degrees of freedom
## Residual deviance: 17.138  on 15  degrees of freedom
## AIC: 261.06
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  22.15 
##           Std. Err.:  7.63 
## 
##  2 x log-likelihood:  -255.058
# Calculate the median window open/close across the models in the 95% confidence set of the best model (medwin's default cumulative model-weight cut-off)
medwin(output[[5]]$Dataset)
## $`Median Window Open`
## [1] 63
## 
## $`Median Window Close`
## [1] 22
# Randomisation test to assess if the detected signal is likely by chance
# The settings mirror the slidingwin() call for this variable (actual-value cool_night, stat = "sum", func = "lin")
# so that the randomised and observed results are directly comparable.
# NOTE(review): only 10 randomisations are run - confirm this is sufficient for the metric "C" p-value below.
cool_night_randwin <- randwin(repeats   = 10,
                              window    = "sliding",
                              xvar      = list(cool_night = extreme_weather$cool_night),
                              cdate     = extreme_weather$date,
                              bdate     = breeding_data$date,
                              baseline  = glm.nb(pup ~ 1,
                                                 link = "log",
                                                 data = breeding_data),
                              cohort    = breeding_data$season,
                              cinterval = "day",
                              refday    = c(29, 01),
                              range     = c(95, 0),
                              type      = "absolute",
                              stat      = c("sum"),
                              func      = c("lin")
                              )
# Calculate the p-value using Climwin Metric C
# (observed results vs. the randomised results; sample.size is the number of rows in breeding_data)
climwin::pvalue(dataset     = output[[5]]$Dataset,
                datasetrand = cool_night_randwin[[1]],
                metric      = "C",
                sample.size = sample_size
                )
## [1] 0.493866
# Plot sliding window and randomisation result
climwin::plotall(dataset        = output[[5]]$Dataset,
                 datasetrand    = cool_night_randwin[[1]],
                 bestmodel      = output[[5]]$BestModel,
                 bestmodeldata  = output[[5]]$BestModelData,
                 arrow          = TRUE
                 )

2.5.2 Extreme AT

# Summary of the best model
# output[[33]] is the 16th xvar of output2 (17 + 16), i.e. extreme_at_day_bi (binary) -
# assuming merge_results() keeps output1's 17 model sets first, followed by output2's.
summary(output[[33]]$BestModel)
## 
## Call:
## glm.nb(formula = yvar ~ climate, data = modeldat, init.theta = 23.35392602, 
##     link = "log")
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.97705    0.11088  71.941  < 2e-16 ***
## climate     -0.06901    0.02024  -3.409 0.000651 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(23.3539) family taken to be 1)
## 
##     Null deviance: 27.605  on 16  degrees of freedom
## Residual deviance: 17.112  on 15  degrees of freedom
## AIC: 260.14
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  23.35 
##           Std. Err.:  8.04 
## 
##  2 x log-likelihood:  -254.137
# Calculate the median window open/close across the models in the 95% confidence set of the best model (medwin's default cumulative model-weight cut-off)
medwin(output[[33]]$Dataset)
## $`Median Window Open`
## [1] 75
## 
## $`Median Window Close`
## [1] 33
# Randomisation test to assess if the detected signal is likely by chance
# The settings mirror the slidingwin() call for this variable (binary extreme_at_day, stat = "sum", func = "lin")
# so that the randomised and observed results are directly comparable.
# NOTE(review): only 10 randomisations are run - confirm this is sufficient for the metric "C" p-value below.
extreme_at_randwin <- randwin(repeats   = 10,
                              window    = "sliding",
                              xvar      = list(extreme_at_day_bi = extreme_weather_binary$extreme_at_day),
                              cdate     = extreme_weather_binary$date,
                              bdate     = breeding_data$date,
                              baseline  = glm.nb(pup ~ 1,
                                                 link = "log",
                                                 data = breeding_data),
                              cohort    = breeding_data$season,
                              cinterval = "day",
                              refday    = c(29, 01),
                              range     = c(95, 0),
                              type      = "absolute",
                              stat      = c("sum"),
                              func      = c("lin")
                              )
# Calculate the p-value using Climwin Metric C
# (observed results vs. the randomised results; sample.size is the number of rows in breeding_data)
climwin::pvalue(dataset     = output[[33]]$Dataset,
                datasetrand = extreme_at_randwin[[1]],
                metric      = "C",
                sample.size = sample_size
                )
## [1] 0.4772722
# Plot sliding window and randomisation result
climwin::plotall(dataset        = output[[33]]$Dataset,
                 datasetrand    = extreme_at_randwin[[1]],
                 bestmodel      = output[[33]]$BestModel,
                 bestmodeldata  = output[[33]]$BestModelData,
                 arrow          = TRUE
                 )

2.5.3 Extreme WBT

# Summary of the best model
# output[[14]] is the 14th xvar of output1, i.e. extreme_wbt_day (actual values) -
# assuming merge_results() keeps output1's 17 model sets first, followed by output2's.
summary(output[[14]]$BestModel)
## 
## Call:
## glm.nb(formula = yvar ~ climate, data = modeldat, init.theta = 21.28996911, 
##     link = "log")
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.776903   0.070973 109.576  < 2e-16 ***
## climate     -0.003933   0.001387  -2.837  0.00455 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(21.29) family taken to be 1)
## 
##     Null deviance: 25.191  on 16  degrees of freedom
## Residual deviance: 17.135  on 15  degrees of freedom
## AIC: 261.73
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  21.29 
##           Std. Err.:  7.32 
## 
##  2 x log-likelihood:  -255.728
# Calculate the median window open/close across the models in the 95% confidence set of the best model (medwin's default cumulative model-weight cut-off)
medwin(output[[14]]$Dataset)
## $`Median Window Open`
## [1] 69
## 
## $`Median Window Close`
## [1] 27
# Randomisation test to assess if the detected signal is likely by chance
# The settings mirror the slidingwin() call for this variable (actual-value extreme_wbt_day, stat = "sum", func = "lin")
# so that the randomised and observed results are directly comparable.
# NOTE(review): only 10 randomisations are run - confirm this is sufficient for the metric "C" p-value below.
extreme_wbt_randwin <- randwin(repeats   = 10,
                               window    = "sliding",
                               xvar      = list(extreme_wbt_day = extreme_weather$extreme_wbt_day),
                               cdate     = extreme_weather$date,
                               bdate     = breeding_data$date,
                               baseline  = glm.nb(pup ~ 1,
                                                  link = "log",
                                                  data = breeding_data),
                               cohort    = breeding_data$season,
                               cinterval = "day",
                               refday    = c(29, 01),
                               range     = c(95, 0),
                               type      = "absolute",
                               stat      = c("sum"),
                               func      = c("lin")
                               )
# Calculate the p-value using Climwin Metric C
# (observed results vs. the randomised results; sample.size is the number of rows in breeding_data)
climwin::pvalue(dataset     = output[[14]]$Dataset,
                datasetrand = extreme_wbt_randwin[[1]],
                metric      = "C",
                sample.size = sample_size
                )
## [1] 0.6368382
# Plot sliding window and randomisation result
climwin::plotall(dataset        = output[[14]]$Dataset,
                 datasetrand    = extreme_wbt_randwin[[1]],
                 bestmodel      = output[[14]]$BestModel,
                 bestmodeldata  = output[[14]]$BestModelData,
                 arrow          = TRUE
                 )

2.5.4 Very heavy rain

# Summary of the best model
# output[[9]] is the 9th xvar of output1, i.e. very_heavy_rain_day (actual values) -
# assuming merge_results() keeps output1's 17 model sets first, followed by output2's.
summary(output[[9]]$BestModel)
## 
## Call:
## glm.nb(formula = yvar ~ climate, data = modeldat, init.theta = 17.51543973, 
##     link = "log")
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.712330   0.067002 115.107   <2e-16 ***
## climate     -0.004526   0.002318  -1.953   0.0508 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(17.5154) family taken to be 1)
## 
##     Null deviance: 20.764  on 16  degrees of freedom
## Residual deviance: 17.160  on 15  degrees of freedom
## AIC: 265.07
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  17.52 
##           Std. Err.:  6.00 
## 
##  2 x log-likelihood:  -259.067

2.5.5 Warm night

# Summary of the best model
# output[[19]] is the 2nd xvar of output2 (17 + 2), i.e. warm_night_bi (binary) -
# assuming merge_results() keeps output1's 17 model sets first, followed by output2's.
summary(output[[19]]$BestModel)
## 
## Call:
## glm.nb(formula = yvar ~ climate, data = modeldat, init.theta = 21.59453663, 
##     link = "log")
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  7.80290    0.07510 103.900  < 2e-16 ***
## climate     -0.06676    0.02232  -2.991  0.00278 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(21.5945) family taken to be 1)
## 
##     Null deviance: 25.547  on 16  degrees of freedom
## Residual deviance: 17.127  on 15  degrees of freedom
## AIC: 261.48
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  21.59 
##           Std. Err.:  7.43 
## 
##  2 x log-likelihood:  -255.479
# Calculate the median window open/close across the models in the 95% confidence set of the best model (medwin's default cumulative model-weight cut-off)
medwin(output[[19]]$Dataset)
## $`Median Window Open`
## [1] 70
## 
## $`Median Window Close`
## [1] 28
# Randomisation test to assess if the detected signal is likely by chance
# The settings mirror the slidingwin() call for this variable (binary warm_night, stat = "sum", func = "lin")
# so that the randomised and observed results are directly comparable.
# NOTE(review): only 10 randomisations are run - confirm this is sufficient for the metric "C" p-value below.
warm_night_randwin <- randwin(repeats   = 10,
                              window    = "sliding",
                              xvar      = list(warm_night_bi = extreme_weather_binary$warm_night),
                              cdate     = extreme_weather_binary$date,
                              bdate     = breeding_data$date,
                              baseline  = glm.nb(pup ~ 1,
                                                 link = "log",
                                                 data = breeding_data),
                              cohort    = breeding_data$season,
                              cinterval = "day",
                              refday    = c(29, 01),
                              range     = c(95, 0),
                              type      = "absolute",
                              stat      = c("sum"),
                              func      = c("lin")
                              )
# Calculate the p-value using Climwin Metric C
# (observed results vs. the randomised results; sample.size is the number of rows in breeding_data)
climwin::pvalue(dataset     = output[[19]]$Dataset,
                datasetrand = warm_night_randwin[[1]],
                metric      = "C",
                sample.size = sample_size
                )
## [1] 0.6718193
# Plot sliding window and randomisation result
climwin::plotall(dataset        = output[[19]]$Dataset,
                 datasetrand    = warm_night_randwin[[1]],
                 bestmodel      = output[[19]]$BestModel,
                 bestmodeldata  = output[[19]]$BestModelData,
                 arrow          = TRUE
                 )

2.5.6 Wave direction

# Summary of the best model
# output3[[2]] is the second model set of the wave direction analysis, i.e. the quadratic ("quad") fit
summary(output3[[2]]$BestModel) 
## 
## Call:
## glm.nb(formula = yvar ~ climate + I(climate^2), data = modeldat, 
##     init.theta = 38.38938385, link = "log")
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.465e+01  1.582e+01  -2.823 0.004755 ** 
## climate       4.515e-01  1.313e-01   3.438 0.000587 ***
## I(climate^2) -9.667e-04  2.720e-04  -3.555 0.000378 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(38.3894) family taken to be 1)
## 
##     Null deviance: 45.041  on 16  degrees of freedom
## Residual deviance: 17.098  on 14  degrees of freedom
## AIC: 253.75
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  38.4 
##           Std. Err.:  13.4 
## 
##  2 x log-likelihood:  -245.754
# Calculate the median window open/close across the models in the 95% confidence set of the best model (medwin's default cumulative model-weight cut-off)
medwin(output3[[2]]$Dataset)
## $`Median Window Open`
## [1] 74
## 
## $`Median Window Close`
## [1] 33
# Randomisation test to assess if the detected signal is likely by chance
# The settings mirror the quadratic wave direction slidingwin() fit (stat = "circ_mean", func = "quad")
# so that the randomised and observed results are directly comparable.
# NOTE(review): only 10 randomisations are run - confirm this is sufficient for the metric "C" p-value below.
wave_direction_randwin <- randwin(repeats   = 10,
                                  window    = "sliding",
                                  xvar      = list(wave_direction = extreme_weather$wave_direction),
                                  cdate     = extreme_weather$date,
                                  bdate     = breeding_data$date,
                                  baseline  = glm.nb(pup ~ 1,
                                                     link = "log",
                                                     data = breeding_data),
                                  cohort    = breeding_data$season,
                                  cinterval = "day",
                                  range     = c(95, 0),
                                  refday    = c(29, 01),
                                  type      = "absolute",
                                  stat      = "circ_mean",
                                  func      = c("quad")
                                  )
# Calculate the p-value using Climwin Metric C
# (observed results vs. the randomised results; sample.size is the number of rows in breeding_data)
climwin::pvalue(dataset     = output3[[2]]$Dataset,
                datasetrand = wave_direction_randwin[[1]],
                metric      = "C",
                sample.size = sample_size
                )
## [1] 0.6109865
# Plot sliding window and randomisation result
climwin::plotall(dataset        = output3[[2]]$Dataset,
                 datasetrand    = wave_direction_randwin[[1]],
                 bestmodel      = output3[[2]]$BestModel,
                 bestmodeldata  = output3[[2]]$BestModelData,
                 arrow          = TRUE
                 )