# =========================
# Lab 5 – Assignment Script
# =========================

# ---- Libraries ----
library(patchwork)
library(tidyr)
library(grid)
library(gridExtra)

library(terra)       # rasters
## terra 1.8.60
## 
## Attaching package: 'terra'
## The following object is masked from 'package:grid':
## 
##     depth
## The following object is masked from 'package:tidyr':
## 
##     extract
## The following object is masked from 'package:patchwork':
## 
##     area
library(sf)          # vector (amenities)
## Linking to GEOS 3.13.0, GDAL 3.10.1, PROJ 9.5.1; sf_use_s2() is TRUE
library(dplyr)       # data wrangling
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:terra':
## 
##     intersect, union
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)     # plotting
library(pROC)        # AUC/ROC
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(caret)       # confusion matrix + split helpers
## Loading required package: lattice
# For clustering
library(parallelDist)
library(fastcluster)
## 
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
## 
##     hclust
set.seed(42)

# -------------------------------
# PART 1: Salt Lake City – model
# -------------------------------

# (Lab 5 uses these rasters/files)  :contentReference[oaicite:4]{index=4}
NLCD_2001 <- rast("NLCD_2001_SL.tif")
NLCD_2016 <- rast("NLCD_2016_SL.tif")

Park_dist <- rast("Parks_dist_SL.tif")     # distance to parks/protected areas (km)
Rd_dns1km <- rast("Rd_dns1km_SL.tif")      # road density (1 km neighborhood)
WaterDist <- rast("WaterDist_SL.tif")      # distance to water (km)
DEM       <- rast("DEM_SL.tif")            # elevation

# Make sure all rasters align
r <- c(NLCD_2001, NLCD_2016, Park_dist, Rd_dns1km, WaterDist, DEM)
r <- terra::rast(r)  # ensures SpatRaster

# Define "urban" (NLCD developed classes 21–24)
is_urban <- function(x) x %in% c(21, 22, 23, 24)

urban2001 <- app(NLCD_2001, is_urban)
urban2016 <- app(NLCD_2016, is_urban)

# --- make sure response has a name BEFORE sampling ---
newUrban <- (urban2016 == 1) & (urban2001 == 0)
newUrban <- as.int(newUrban)
names(newUrban) <- "newUrban"   # <--- critical

# predictors
urban2001_dist <- distance(urban2001)
xstack <- c(Park_dist, Rd_dns1km, WaterDist, DEM, urban2001_dist)
names(xstack) <- c("Park_dist", "Rd_dns1km", "WaterDist", "DEM", "Urban2001Dist")

# sample
samp <- spatSample(c(newUrban, xstack),
                   size = 8000,
                   method = "random",
                   na.rm = TRUE,
                   as.points = TRUE)

df <- as.data.frame(samp, xy = TRUE, na.rm = TRUE)

# confirm it exists
print(names(df))
## [1] "newUrban"      "Park_dist"     "Rd_dns1km"     "WaterDist"    
## [5] "DEM"           "Urban2001Dist" "xy"            "na.rm"
# now this will work
df$newUrban <- as.integer(df$newUrban)


# Train/test split
idx <- createDataPartition(df$newUrban, p = 0.7, list = FALSE)
train <- df[idx, ]
test  <- df[-idx, ]

# Fit logistic regression (interpretable “suitability” model)
fit <- glm(
  newUrban ~ Park_dist + Rd_dns1km + WaterDist + DEM + Urban2001Dist,
  data = train,
  family = binomial()
)

summary(fit)
## 
## Call:
## glm(formula = newUrban ~ Park_dist + Rd_dns1km + WaterDist + 
##     DEM + Urban2001Dist, family = binomial(), data = train)
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.198e+00  7.485e-01   1.600   0.1095    
## Park_dist      1.721e-04  1.574e-05  10.933  < 2e-16 ***
## Rd_dns1km      1.037e+00  5.686e-01   1.824   0.0681 .  
## WaterDist      3.963e-04  6.919e-05   5.728 1.01e-08 ***
## DEM           -4.270e-03  5.430e-04  -7.865 3.69e-15 ***
## Urban2001Dist         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2110.1  on 5599  degrees of freedom
## Residual deviance: 1723.9  on 5595  degrees of freedom
## AIC: 1733.9
## 
## Number of Fisher Scoring iterations: 8
# Goodness-of-fit evidence (holdout AUC + confusion matrix)
test$pred_p <- predict(fit, newdata = test, type = "response")

roc_obj <- roc(response = test$newUrban, predictor = test$pred_p)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc_val <- auc(roc_obj)
print(auc_val)
## Area under the curve: 0.8208
plot(roc_obj, main = paste0("ROC (AUC = ", round(auc_val, 3), ")"))

# Confusion matrix at 0.5 threshold (you can also optimize threshold from ROC)
test$pred_class <- ifelse(test$pred_p >= 0.5, 1, 0)
cm <- confusionMatrix(
  factor(test$pred_class, levels = c(0, 1)),
  factor(test$newUrban,   levels = c(0, 1))
)
print(cm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2293  107
##          1    0    0
##                                           
##                Accuracy : 0.9554          
##                  95% CI : (0.9464, 0.9633)
##     No Information Rate : 0.9554          
##     P-Value [Acc > NIR] : 0.5257          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.0000          
##          Pos Pred Value : 0.9554          
##          Neg Pred Value :    NaN          
##              Prevalence : 0.9554          
##          Detection Rate : 0.9554          
##    Detection Prevalence : 1.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 0               
## 
# Map predicted suitability (probability of conversion)
suitability <- predict(xstack, fit, type = "response")
plot(suitability, main = "Suitability for New Urban Development (P[newUrban])")

# ----------------------------------------
# PART 2: Amenity landscape – clustering
# ----------------------------------------

# Lab 5 reads AmenDataAll.shp (10km US grid). :contentReference[oaicite:5]{index=5}
amenData <- st_read("AmenDataAll.shp", quiet = TRUE)

# Thematic inquiry example:
# "Consumer + tourism amenities" (choose a small set of variables you can interpret)
# NOTE: adjust names if your attribute table differs.
candidate_vars <- c(
  "ShopTOT",      # total shopping
  "HotelTOT",     # total hotels
  "ZooAmusSAL"    # zoo/amusement sales (example field shown in preview)
)

# Keep only variables that exist (prevents script breaking if names differ)
vars <- candidate_vars[candidate_vars %in% names(amenData)]
if (length(vars) < 2) {
  stop("Not enough selected variables found in AmenDataAll.shp. Inspect names(amenData) and update candidate_vars.")
}

amen_mat <- amenData |>
  dplyr::select(all_of(vars)) |>
  st_drop_geometry() |>
  mutate(across(everything(), ~ ifelse(is.na(.x), 0, .x))) |>
  scale() |>
  as.matrix()

# Distance matrix + Ward hierarchical clustering
d <- parDist(amen_mat, method = "euclidean")
hc <- hclust(d, method = "ward.D2")

# Choose k (set to what you can justify from dendrogram; start with 6)
k <- 6
cluster_id <- cutree(hc, k = k)

amenData$cluster <- factor(cluster_id)

# Evidence for characterizations: cluster-wise means (in ORIGINAL units)
cluster_summary <- amenData |>
  st_drop_geometry() |>
  mutate(cluster = factor(cluster_id)) |>
  group_by(cluster) |>
  summarise(across(all_of(vars), ~ mean(.x, na.rm = TRUE)), .groups = "drop")

print(cluster_summary)
## # A tibble: 6 × 4
##   cluster ShopTOT HotelTOT ZooAmusSAL
##   <fct>     <dbl>    <dbl>      <dbl>
## 1 1        0         0.405       15.0
## 2 2        0.0859   21.9       2139. 
## 3 3        1         4.98       153. 
## 4 4        0.4      89.5      71480. 
## 5 5        2.44     10.8        634. 
## 6 6        0.667    23      1346324.
# Visualization 1: map of clusters
ggplot(amenData) +
  geom_sf(aes(fill = cluster), color = NA) +
  ggtitle(paste0("Amenity Landscape Clusters (k = ", k, ")")) +
  theme_minimal()

# Visualization 2: cluster profiles (standardized means)
profile <- data.frame(cluster = factor(cluster_id), amen_mat) |>
  as_tibble() |>
  group_by(cluster) |>
  summarise(across(all_of(vars), mean), .groups = "drop") |>
  tidyr::pivot_longer(-cluster, names_to = "variable", values_to = "z_mean")

ggplot(profile, aes(x = variable, y = z_mean, group = cluster)) +
  geom_line() +
  geom_point() +
  theme_minimal() +
  ggtitle("Cluster Profiles (Mean Z-scores by Variable)") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

summary(fit)
## 
## Call:
## glm(formula = newUrban ~ Park_dist + Rd_dns1km + WaterDist + 
##     DEM + Urban2001Dist, family = binomial(), data = train)
## 
## Coefficients: (1 not defined because of singularities)
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    1.198e+00  7.485e-01   1.600   0.1095    
## Park_dist      1.721e-04  1.574e-05  10.933  < 2e-16 ***
## Rd_dns1km      1.037e+00  5.686e-01   1.824   0.0681 .  
## WaterDist      3.963e-04  6.919e-05   5.728 1.01e-08 ***
## DEM           -4.270e-03  5.430e-04  -7.865 3.69e-15 ***
## Urban2001Dist         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2110.1  on 5599  degrees of freedom
## Residual deviance: 1723.9  on 5595  degrees of freedom
## AIC: 1733.9
## 
## Number of Fisher Scoring iterations: 8
auc_val
## Area under the curve: 0.8208
# ----------------------------
# 2x2 GRID OF THE FOUR FIGURES
# ----------------------------

plot_with_caption <- function(plot_obj, caption_text) {
  plot_obj /
    wrap_elements(
      grid::textGrob(
        caption_text,
        x = 0,
        hjust = 0,
        gp = grid::gpar(fontsize = 9)
      )
    ) +
    plot_layout(heights = c(10, 1))
}
# 1) ROC curve
p1_base <- ggroc(roc_obj) +
  ggtitle("Model performance (ROC)") +
  theme_minimal()

# 2) Suitability map
suit_df <- as.data.frame(suitability, xy = TRUE, na.rm = TRUE)
names(suit_df) <- c("x", "y", "p")

p2_base <- ggplot(suit_df, aes(x = x, y = y, fill = p)) +
  geom_raster() +
  coord_equal() +
  scale_fill_viridis_c() +
  ggtitle("Predicted suitability for new urban development") +
  theme_minimal()

# 3) Amenity cluster map
p3_base <- ggplot(amenData) +
  geom_sf(aes(fill = cluster), color = NA) +
  ggtitle("Amenity landscape clusters") +
  theme_minimal()

# 4) Cluster profiles
p4_base <- ggplot(profile, aes(x = variable, y = z_mean, group = cluster)) +
  geom_line() +
  geom_point() +
  ggtitle("Cluster profiles (standardized means)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

p1 <- plot_with_caption(
  p1_base,
  "The ROC curve shows model performance. Because the curve stays above the diagonal, the model predicts new urban development better than random chance."
)

p2 <- plot_with_caption(
  p2_base,
  "Higher values indicate areas with greater predicted suitability for urban development based on roads, elevation, and distance variables."
)

p3 <- plot_with_caption(
  p3_base,
  "Each color represents a distinct amenity cluster. Similar amenity landscapes group together across space."
)

p4 <- plot_with_caption(
  p4_base,
  "The profile plot shows which amenities are most characteristic of each cluster based on standardized mean values."
)

(p1 | p2) /
(p3 | p4)

##Question 1 – Salt Lake City urban suitability model (1 paragraph)

I modeled new urban development as areas that changed from non-urban to urban between the two NLCD years and used distance to parks, road density, distance to water, and elevation as predictors. The model performed well, with an AUC of 0.82, which shows it can distinguish between areas that did and did not develop much better than random. Development in Salt Lake City is most strongly related to lower elevation, proximity to water, and distance from parks, while road density also shows a weaker positive effect. The elevation coefficient is negative and highly significant, meaning urban growth is more likely in lower, flatter areas. One variable that could improve the model is slope, because steep terrain limits construction even when elevation is low and would better capture physical constraints on development.

##Question 2 – Amenity landscape clustering analysis (1 paragraph)

I analyzed the amenity database using a clustering approach based on selected amenity variables that represent a common theme, and grouped locations with similar amenity characteristics. The cluster profile plot provides evidence for the interpretation of each group by showing how the average values of amenities differ between clusters, while the map shows where these groups occur spatially. High-amenity clusters tend to be concentrated around major urban areas and tourism centers, while low-amenity clusters dominate more rural and remote regions. Intermediate clusters appear around regional service centers and suburban areas. This spatial pattern makes sense because amenities tend to follow population, economic activity, and travel demand rather than being evenly distributed across space.