# Load libraries for data processing, modelling, and visualisation
library(tidyverse)
library(openxlsx2)
library(MASS)
library(corrplot)
library(DT)
library(climwin)
library(jtools)
library(DHARMa)
library(lavaan)
library(DiagrammeR)
library(lmtest)
library(glmmTMB)
library(scales)

1 Load and Process Data

# Load daily extreme weather event (EWE) data.
# In columns 4 to 8, missing values are replaced with 0 for the analysis
# (assumes these positions hold the five EWE variables used further down --
# TODO confirm against the workbook layout).
extreme_weather <- wb_to_df("Breeding_colony_ewes/Maatsuyker_ewes.xlsx") %>%
  dplyr::mutate(across(4:8, ~ ifelse(is.na(.), 0, .)))


# Create binary version: 1 = event occurred (non-zero value), 0 = no event.
# The NAs were already replaced with 0 in the step above, so no missing value
# can reach this recode; the previous explicit "NA = missing" branch was
# unreachable and has been dropped. (ifelse() would still propagate an NA
# unchanged if one were ever present.)
extreme_weather_binary <- extreme_weather %>%
  dplyr::mutate(across(4:8, ~ ifelse(. != 0, 1, 0)))


# Note:
# Missing values in the extreme weather data are replaced with zero. This is critical because the missing-data options of slidingwin (cmissing = "method1" or "method2") fill each gap with a mean of other climate records, which is not suitable when assessing extreme values. We are specifically interested in whether an extreme event occurred, not in average conditions.
# Only a small number of values are missing in the dataset, and replacing them with zero (i.e. treating those days as having no event) ensures consistency without introducing bias in this context.


# Read the Short-tailed Shearwater breeding records (Maatsuyker Island sheet)
# and keep only the rows that have a breeding success (bs) value
breeding_data <- dplyr::filter(
  wb_to_df("Breeding_data/STSH.xlsx", sheet = "Maatsuyker_Island"),
  !is.na(bs)
)


# Number of retained rows; used later as the sample size for climwin::pvalue()
sample_size <- nrow(breeding_data)


# Assess normality of the response variable (breeding success, bs).
# Shapiro-Wilk test: if p > 0.05, the data do not deviate significantly
# from normality.
shapiro.test(breeding_data$bs)
## 
##  Shapiro-Wilk normality test
## 
## data:  breeding_data$bs
## W = 0.96007, p-value = 0.7725
# Histogram drawn on the density scale so that the kernel density curve can be
# overlaid on the same axes. `freq = FALSE` is the documented way to request
# this; the earlier `prob = TRUE` only worked through partial matching of
# hist()'s `probability` argument.
hist(breeding_data$bs,
     main = "Histogram of Breeding Success",
     xlab = "Breeding Success",
     col = "#a6d6fa",
     border = "white",
     freq = FALSE
)

# Overlay kernel density estimate
lines(density(breeding_data$bs, na.rm = TRUE), col = "#0D92F4", lwd = 2)

# Q-Q plot of the sample against theoretical normal quantiles, with a
# reference line in red
ggplot(breeding_data, aes(sample = bs)) +
  stat_qq() +
  stat_qq_line(colour = "red") +
  labs(title = "Q-Q Plot of Breeding Success",
       x = "Theoretical Quantiles",
       y = "Sample Quantiles") +
  theme_classic()

The Shapiro–Wilk test did not indicate a significant deviation from normality (W = 0.96007, p = 0.7725); therefore, we failed to reject the null hypothesis that the data are normally distributed.

2 Sliding window analysis

2.1 Actual above threshold values

# Sliding window search on the actual (non-binary) EWE values.
# Each entry of `xvar` is one climate variable to test against the
# intercept-only baseline model of breeding success.
output1 <- slidingwin(
  xvar = list(
    wet_day             = extreme_weather$wet_day,
    heavy_rain_day      = extreme_weather$heavy_rain_day,
    very_heavy_rain_day = extreme_weather$very_heavy_rain_day,
    ewdp                = extreme_weather$ewdp,
    vwdp                = extreme_weather$vwdp
  ),
  cdate     = extreme_weather$date,               # dates of the climate records
  bdate     = breeding_data$date,                 # dates of the biological records
  baseline  = lm(bs ~ 1, data = breeding_data),   # null model without climate
  cohort    = breeding_data$season,               # group records by season
  cinterval = "day",                              # daily resolution
  range     = c(105, 0),                          # windows from 1 December to 16 March
  refday    = c(16, 3),                           # reference date: 16 March
  type      = "absolute",                         # windows fixed relative to refday
  stat      = "sum",                              # sum the values inside each window
  func      = "lin"                               # linear climate effect
)

2.2 Binary above threshold values

# Sliding window search on the binary event indicators (1 = event, 0 = none).
# Same settings as the actual-value run; only the climate inputs differ, and
# the `_bi` suffix in the list names marks the binary series in the results.
output2 <- slidingwin(
  xvar = list(
    wet_day_bi             = extreme_weather_binary$wet_day,
    heavy_rain_day_bi      = extreme_weather_binary$heavy_rain_day,
    very_heavy_rain_day_bi = extreme_weather_binary$very_heavy_rain_day,
    ewdp_bi                = extreme_weather_binary$ewdp,
    vwdp_bi                = extreme_weather_binary$vwdp
  ),
  cdate     = extreme_weather_binary$date,        # dates of the climate records
  bdate     = breeding_data$date,                 # dates of the biological records
  baseline  = lm(bs ~ 1, data = breeding_data),   # null model without climate
  cohort    = breeding_data$season,               # group records by season
  cinterval = "day",                              # daily resolution
  range     = c(105, 0),                          # windows from 1 December to 16 March
  refday    = c(16, 3),                           # reference date: 16 March
  type      = "absolute",                         # windows fixed relative to refday
  stat      = "sum",                              # with 0/1 input: count of event days
  func      = "lin"                               # linear climate effect
)

2.3 Merge the results

# Merge the actual-value and binary sliding window results into one object
output <- merge_results(output1, output2)


# Interactive table of all model combinations, with the window length in days
# added (open and close days are both counted, hence the + 1)
output$combos %>%
  dplyr::mutate(WindowDuration = WindowOpen - WindowClose + 1) %>%
  datatable(options = list(pageLength = 10, orderClasses = TRUE))

2.4 Check best model for each variable

Before running the randomisation process, we need to identify the best-performing model for each extreme weather variable. This ensures that we are testing the most likely biologically relevant window against random expectation.

What we are doing here: For each weather variable (e.g., heavy rain, wet days), we extract the model with:

The lowest AICc value (climwin ranks candidate windows by ΔAICc relative to the baseline model), and

A window duration longer than 14 days, to focus on ecologically meaningful timeframes.

These best models represent the strongest climate–breeding success relationships, and will be used for the randomisation test to assess whether the relationship is likely to have occurred by chance.

2.4.1 EWDP

# Best EWDP model: element 9 of the merged results
# (presumably the binary EWDP series, matching the randwin() input below --
# verify the index against output$combos)
summary(output[[9]][["BestModel"]])
## 
## Call:
## lm(formula = yvar ~ climate, data = modeldat)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.105048 -0.041722 -0.003205  0.030234  0.115604 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.73726    0.02324  31.725 1.51e-10 ***
## climate     -0.08499    0.03147  -2.701   0.0244 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06709 on 9 degrees of freedom
## Multiple R-squared:  0.4477, Adjusted R-squared:  0.3863 
## F-statistic: 7.295 on 1 and 9 DF,  p-value: 0.02436
# Median window open/close across the models in the 95% confidence set
# (medwin() default cut-off on cumulative model weight)
medwin(output[[9]][["Dataset"]])
## $`Median Window Open`
## [1] 67
## 
## $`Median Window Close`
## [1] 17
# Randomisation test: repeat the same sliding window search on randomised
# data to assess whether the detected signal is likely to arise by chance.
# NOTE(review): randwin() is stochastic and no set.seed() call is visible in
# this script, so the p-value below may differ between runs.
ewdp_randwin <- randwin(
  repeats   = 10,
  window    = "sliding",
  xvar      = list(ewdp_bi = extreme_weather_binary$ewdp),
  cdate     = extreme_weather_binary$date,
  bdate     = breeding_data$date,
  baseline  = lm(bs ~ 1, data = breeding_data),
  cohort    = breeding_data$season,
  cinterval = "day",
  range     = c(105, 0),
  refday    = c(16, 3),
  type      = "absolute",
  stat      = "sum",
  func      = "lin"
)
# p-value from the climwin "C" metric, comparing the observed search with the
# randomised one
climwin::pvalue(
  dataset     = output[[9]][["Dataset"]],
  datasetrand = ewdp_randwin[[1]],
  metric      = "C",
  sample.size = sample_size
)
## [1] 0.2729416
# Combined plots of the sliding window result and the randomisation result
climwin::plotall(
  dataset       = output[[9]][["Dataset"]],
  datasetrand   = ewdp_randwin[[1]],
  bestmodel     = output[[9]][["BestModel"]],
  bestmodeldata = output[[9]][["BestModelData"]],
  arrow         = TRUE
)

2.4.2 Heavy rain

# Best heavy rain model: element 7 of the merged results
# (presumably the binary heavy rain series, matching the randwin() input
# below -- verify the index against output$combos)
summary(output[[7]][["BestModel"]])
## 
## Call:
## lm(formula = yvar ~ climate, data = modeldat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.07214 -0.03199 -0.01832  0.03843  0.07874 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.85361    0.03859  22.122 3.73e-09 ***
## climate     -0.05062    0.01209  -4.186  0.00236 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05259 on 9 degrees of freedom
## Multiple R-squared:  0.6606, Adjusted R-squared:  0.6229 
## F-statistic: 17.52 on 1 and 9 DF,  p-value: 0.002356
# Median window open/close across the models in the 95% confidence set
# (medwin() default cut-off on cumulative model weight)
medwin(output[[7]][["Dataset"]])
## $`Median Window Open`
## [1] 75
## 
## $`Median Window Close`
## [1] 30
# Randomisation test: repeat the same sliding window search on randomised
# data to assess whether the detected signal is likely to arise by chance.
# NOTE(review): randwin() is stochastic and no set.seed() call is visible in
# this script, so the p-value below may differ between runs.
heavy_rain_randwin <- randwin(
  repeats   = 10,
  window    = "sliding",
  xvar      = list(heavy_rain_day_bi = extreme_weather_binary$heavy_rain_day),
  cdate     = extreme_weather_binary$date,
  bdate     = breeding_data$date,
  baseline  = lm(bs ~ 1, data = breeding_data),
  cohort    = breeding_data$season,
  cinterval = "day",
  range     = c(105, 0),
  refday    = c(16, 3),
  type      = "absolute",
  stat      = "sum",
  func      = "lin"
)
# p-value from the climwin "C" metric, comparing the observed search with the
# randomised one
climwin::pvalue(
  dataset     = output[[7]][["Dataset"]],
  datasetrand = heavy_rain_randwin[[1]],
  metric      = "C",
  sample.size = sample_size
)
## [1] 0.5103463
# Combined plots of the sliding window result and the randomisation result
climwin::plotall(
  dataset       = output[[7]][["Dataset"]],
  datasetrand   = heavy_rain_randwin[[1]],
  bestmodel     = output[[7]][["BestModel"]],
  bestmodeldata = output[[7]][["BestModelData"]],
  arrow         = TRUE
)

2.4.3 Wet day

# Best wet day model: element 1 of the merged results
# (presumably the actual-value wet day series, matching the randwin() input
# below -- verify the index against output$combos)
summary(output[[1]][["BestModel"]])
## 
## Call:
## lm(formula = yvar ~ climate, data = modeldat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.04349 -0.02620 -0.01523  0.01816  0.07661 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.0020391  0.0567158  17.668  2.7e-08 ***
## climate     -0.0024117  0.0004497  -5.363 0.000455 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04407 on 9 degrees of freedom
## Multiple R-squared:  0.7616, Adjusted R-squared:  0.7352 
## F-statistic: 28.76 on 1 and 9 DF,  p-value: 0.0004548
# Median window open/close across the models in the 95% confidence set
# (medwin() default cut-off on cumulative model weight)
medwin(output[[1]][["Dataset"]])
## $`Median Window Open`
## [1] 82
## 
## $`Median Window Close`
## [1] 30
# Randomisation test: repeat the same sliding window search on randomised
# data to assess whether the detected signal is likely to arise by chance.
# NOTE(review): randwin() is stochastic and no set.seed() call is visible in
# this script, so the p-value below may differ between runs.
wet_day_randwin <- randwin(
  repeats   = 10,
  window    = "sliding",
  xvar      = list(wet_day = extreme_weather$wet_day),
  cdate     = extreme_weather$date,
  bdate     = breeding_data$date,
  baseline  = lm(bs ~ 1, data = breeding_data),
  cohort    = breeding_data$season,
  cinterval = "day",
  range     = c(105, 0),
  refday    = c(16, 3),
  type      = "absolute",
  stat      = "sum",
  func      = "lin"
)
# p-value from the climwin "C" metric, comparing the observed search with the
# randomised one
climwin::pvalue(
  dataset     = output[[1]][["Dataset"]],
  datasetrand = wet_day_randwin[[1]],
  metric      = "C",
  sample.size = sample_size
)
## [1] 0.1645192
# Combined plots of the sliding window result and the randomisation result
climwin::plotall(
  dataset       = output[[1]][["Dataset"]],
  datasetrand   = wet_day_randwin[[1]],
  bestmodel     = output[[1]][["BestModel"]],
  bestmodeldata = output[[1]][["BestModelData"]],
  arrow         = TRUE
)