library(dplyr)
library(ggplot2)
library(maps)
library(lubridate)
library(tidyr)
# Read the supplementary play data from disk again, rather than reusing any
# object already in the session, so the analysis starts from the complete file.
supplementary_fresh <- utils::read.csv(file = "data/supplementary_data.csv")
# Home teams whose games are excluded from the outdoor-only analysis.
# "LAR" is listed next to "LAC" because the two clubs share one home stadium;
# excluding only one of them treated the same venue as both indoor and outdoor.
# NOTE(review): "LAR" follows the abbreviation used in `team_to_state`; confirm
# it matches the spelling found in `home_team_abbr` (some feeds use "LA").
dome_teams <- c(
  "NO", "DET", "MIN", "DAL", "HOU", "ARI", "ATL", "IND", "LV", "LAC", "LAR"
)
# Lookup table from team abbreviation to the state assigned to that team.
# Written as abbreviation = state pairs so each team sits beside its state;
# the pairs are then unpacked into the two character columns used for joining.
team_to_state <- local({
  state_by_team <- c(
    ARI = "Arizona",
    ATL = "Georgia",
    BAL = "Maryland",
    BUF = "New York",
    CAR = "North Carolina",
    CHI = "Illinois",
    CIN = "Ohio",
    CLE = "Ohio",
    DAL = "Texas",
    DEN = "Colorado",
    DET = "Michigan",
    GB = "Wisconsin",
    HOU = "Texas",
    IND = "Indiana",
    JAX = "Florida",
    KC = "Missouri",
    LV = "Nevada",
    LAC = "California",
    LAR = "California",
    MIA = "Florida",
    MIN = "Minnesota",
    NE = "Massachusetts",
    NO = "Louisiana",
    NYG = "New Jersey",
    NYJ = "New Jersey",
    PHI = "Pennsylvania",
    PIT = "Pennsylvania",
    SF = "California",
    SEA = "Washington",
    TB = "Florida",
    TEN = "Tennessee",
    WAS = "Maryland"
  )
  data.frame(
    team_abbr = names(state_by_team),
    # unname() keeps the default integer row names; a named vector would
    # otherwise supply the abbreviations as row names.
    state = unname(state_by_team),
    stringsAsFactors = FALSE
  )
})
# Build the play-level table used by the later summaries:
#   1. keep plays whose home team is not listed in `dome_teams`;
#   2. keep pass results coded "C", "I" or "IN" that have a known,
#      non-negative pass length and a known week;
#   3. label weeks 1-4 "Early", weeks 15-18 "Late", and every other week "Mid";
#   4. attach the home team's state and drop plays with no state match.
pass_data <- supplementary_fresh |>
  filter(!(home_team_abbr %in% dome_teams)) |>
  filter(pass_result %in% c("C", "I", "IN")) |>
  filter(!is.na(pass_length), pass_length >= 0) |>
  filter(!is.na(week)) |>
  mutate(
    # The two week ranges do not overlap, so the order of the arms is free.
    season_period = case_when(
      week %in% 15:18 ~ "Late",
      week %in% 1:4 ~ "Early",
      TRUE ~ "Mid"
    )
  ) |>
  left_join(team_to_state, by = c("home_team_abbr" = "team_abbr")) |>
  filter(!is.na(state))
# Per-state, per-period summary for the two periods being compared ("Early"
# and "Late"): mean pass length and number of plays, plus `region`, a
# lower-cased and whitespace-trimmed copy of the state name.
state_early_late <- pass_data |>
  filter(season_period == "Early" | season_period == "Late") |>
  group_by(state, season_period) |>
  summarise(
    avg_depth = mean(pass_length, na.rm = TRUE),
    n_plays = n(),
    .groups = "drop"
  ) |>
  mutate(region = tolower(trimws(state)))
# Reshape to one row per state with the Early and Late figures side by side,
# then take the late-minus-early difference in average depth (positive means
# deeper passes late in the season). A state lacking either period yields a
# missing difference and is removed.
state_comparison <- state_early_late |>
  tidyr::pivot_wider(
    names_from = "season_period",
    values_from = c("avg_depth", "n_plays")
  ) |>
  mutate(depth_change = avg_depth_Late - avg_depth_Early) |>
  tidyr::drop_na(depth_change)
# Show the per-state comparison: both average depths, their difference, and
# the play counts behind each average.
state_comparison |>
  select(
    state,
    avg_depth_Early, avg_depth_Late, depth_change,
    n_plays_Early, n_plays_Late
  ) |>
  print()