## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 5.5, dpi = 150,
                      warning = FALSE, message = FALSE)

## ----load-data----------------------------------------------------------------
library(palimpsestr)
data(villa_romana)
cat("Finds:", nrow(villa_romana), "\n")
cat("Stratigraphic units:", length(unique(villa_romana$context)), "\n")
cat("Material classes:", length(unique(villa_romana$class)), "\n")
cat("Date range:", min(villa_romana$date_min), "to", max(villa_romana$date_max), "\n")

## ----data-summary-------------------------------------------------------------
cat("Material classes:\n")
sort(table(villa_romana$class), decreasing = TRUE)

## ----data-structure-----------------------------------------------------------
str(villa_romana)

## ----compare-k----------------------------------------------------------------
ck <- compare_k(
  villa_romana,
  k_values = 2:7,
  tafonomy = "taf_score",
  context  = "context",
  seed     = 42
)
print(ck)

## ----gg-compare-k, fig.cap="Model selection diagnostics: BIC, PDI, and mean entropy across candidate phase counts. The dashed line marks the optimal K."----
if (requireNamespace("ggplot2", quietly = TRUE)) {
  gg_compare_k(ck)
}

## ----fit-model----------------------------------------------------------------
fit <- fit_sef(
  villa_romana,
  k        = 4,
  tafonomy = "taf_score",
  context  = "context",
  n_init   = 10,
  seed     = 42
)
print(fit)

## ----model-summary------------------------------------------------------------
summary(fit)

## ----model-stats--------------------------------------------------------------
cat("PDI:", pdi(fit), "\n")
sef_summary(fit)

## ----gg-convergence, fig.cap="EM convergence trace showing log-likelihood stabilisation across iterations."----
if (requireNamespace("ggplot2", quietly = TRUE)) {
  gg_convergence(fit)
}

## ----gg-phasefield, fig.cap="Dominant phase assignment. Point size reflects assignment confidence (inverse entropy). Larger points indicate higher confidence."----
if (requireNamespace("ggplot2", quietly = TRUE)) {
  gg_phasefield(fit)
}

## ----gg-entropy, fig.cap="Spatial distribution of Shannon entropy. High-entropy zones (bright) mark areas of uncertain phase assignment, typically at phase boundaries or in heavily mixed deposits."----
if (requireNamespace("ggplot2", quietly = TRUE)) {
  gg_entropy(fit)
}

## ----gg-energy, fig.cap="Excavation Stratigraphic Energy field. Elevated ESE values (bright) highlight zones of depositional disruption where neighbouring finds belong to different phases."----
if (requireNamespace("ggplot2", quietly = TRUE)) {
  gg_energy(fit)
}

## ----gg-intrusions, fig.cap="Candidate intrusions overlaid on the phase-field map. Circled finds exhibit high entropy, high stratigraphic energy, and low entanglement with their neighbours. Top 10 suspects are labelled."----
if (requireNamespace("ggplot2", quietly = TRUE)) {
  gg_intrusions(fit, top_n = 10)
}

## ----gg-phase-profile, fig.cap="Vertical phase profile: depth vs horizontal position. Phase 1 (deepest) corresponds to the earliest occupation; Phase 4 (shallowest) to the most recent."----
if (requireNamespace("ggplot2", quietly = TRUE)) {
  gg_phase_profile(fit)
}

## ----intrusions---------------------------------------------------------------
di <- detect_intrusions(fit)
suspects <- di[di$intrusion_prob > 0.5, ]
cat("Suspected intrusions:", nrow(suspects), "out of", nrow(villa_romana), "finds",
    sprintf("(%.1f%%)\n", 100 * nrow(suspects) / nrow(villa_romana)))

## ----intrusion-table----------------------------------------------------------
if (nrow(suspects) > 0) {
  # Merge with source data and phase assignments
  phase_tab <- as_phase_table(fit)
  susp_data <- merge(suspects, villa_romana[, c("id", "context", "class")], by = "id")
  susp_data <- merge(susp_data, phase_tab[, c("id", "dominant_phase")], by = "id")
  susp_data <- susp_data[order(susp_data$intrusion_prob, decreasing = TRUE), ]
  susp_show <- susp_data[, c("id", "context", "class", "dominant_phase", "intrusion_prob")]
  names(susp_show) <- c("Find", "US", "Class", "Phase", "Intrusion Prob.")
  knitr::kable(head(susp_show, 15), row.names = FALSE, digits = 3,
               caption = "Top suspected intrusions ranked by intrusion probability.")
}

## ----us-purity----------------------------------------------------------------
us_tab <- us_summary_table(fit)
knitr::kable(us_tab, row.names = FALSE, digits = 3,
             caption = "Per-US diagnostics: dominant phase, purity, mean entropy, mean ESE, and number of intrusions.")

## ----purity-stats-------------------------------------------------------------
n_pure <- sum(us_tab$purity >= 0.9)
cat(n_pure, "out of", nrow(us_tab), "units have purity >= 90%\n")

## ----transition-matrix--------------------------------------------------------
tm <- phase_transition_matrix(fit)
print(tm)

## ----validation---------------------------------------------------------------
ari <- adjusted_rand_index(fit, villa_romana$true_phase)
cat("Adjusted Rand Index:", round(ari, 3), "\n")

## ----confusion----------------------------------------------------------------
confusion_matrix(fit, villa_romana$true_phase)

## ----gg-confusion, fig.cap="Confusion matrix heatmap. Strong diagonal = correct phase recovery. Off-diagonal cells indicate misattributed finds, often corresponding to bioturbated or redeposited material."----
if (requireNamespace("ggplot2", quietly = TRUE)) {
  gg_confusion(fit, villa_romana$true_phase)
}

## ----sensitivity--------------------------------------------------------------
configs <- list(
  equal    = c(ws = 1, wz = 1, wt = 1, wc = 1),
  spatial  = c(ws = 2, wz = 1, wt = 0.5, wc = 0.5),
  temporal = c(ws = 0.5, wz = 0.5, wt = 2, wc = 1)
)

sens <- do.call(rbind, lapply(names(configs), function(nm) {
  w <- configs[[nm]]
  f <- fit_sef(villa_romana, k = 4, weights = w, seed = 42,
               tafonomy = "taf_score", context = "context")
  data.frame(
    config = nm,
    pdi = pdi(f),
    ari = adjusted_rand_index(f, villa_romana$true_phase),
    mean_entropy = mean(f$entropy, na.rm = TRUE),
    stringsAsFactors = FALSE
  )
}))
knitr::kable(sens, digits = 4,
             caption = "Sensitivity of PDI, ARI, and mean entropy to weight configuration.")

## ----bootstrap----------------------------------------------------------------
bs <- bootstrap_sef(fit, n_boot = 50, true_labels = villa_romana$true_phase, verbose = FALSE)
knitr::kable(bs, digits = 4,
             caption = "Bootstrap confidence intervals (50 replicates) for key model statistics.")

## ----gg-bootstrap, fig.cap="Bootstrap confidence intervals for PDI, mean entropy, mean energy, log-likelihood, and ARI."----
if (requireNamespace("ggplot2", quietly = TRUE)) {
  gg_bootstrap(bs)
}

## ----harris-------------------------------------------------------------------
H <- harris_from_contexts(villa_romana, z_col = "z", context_col = "context")
cat("Harris penalty matrix:", nrow(H), "x", ncol(H), "\n")

## ----harris-fit---------------------------------------------------------------
fit_h <- fit_sef(villa_romana, k = 4, harris = H,
                 tafonomy = "taf_score", context = "context",
                 n_init = 5, seed = 42)
cat("PDI without Harris:", round(pdi(fit), 4), "\n")
cat("PDI with Harris:   ", round(pdi(fit_h), 4), "\n")
cat("ARI without Harris:", round(adjusted_rand_index(fit, villa_romana$true_phase), 4), "\n")
cat("ARI with Harris:   ", round(adjusted_rand_index(fit_h, villa_romana$true_phase), 4), "\n")

## ----harris-validate----------------------------------------------------------
validate_phases_harris(fit)

## ----report-en----------------------------------------------------------------
report_sef(fit, lang = "en")

## ----report-it----------------------------------------------------------------
report_sef(fit, lang = "it")

## ----export, eval=FALSE-------------------------------------------------------
# export_results(fit, dir = "results/", format = "csv", prefix = "villa_romana")

## ----gis-export, eval=FALSE---------------------------------------------------
# if (requireNamespace("sf", quietly = TRUE)) {
#   sf_pts   <- as_sf_phase(fit, crs = 32632L, dims = "XYZ")
#   sf_links <- as_sf_links(fit, quantile_threshold = 0.90, crs = 32632L)
#   sf::st_write(sf_pts,   "villa_romana.gpkg", layer = "phases")
#   sf::st_write(sf_links, "villa_romana.gpkg", layer = "sei_links")
# }

## ----interactive, eval=FALSE--------------------------------------------------
# as_plotly(gg_phasefield(fit))
# as_plotly(gg_intrusions(fit))

## ----eval=FALSE---------------------------------------------------------------
# # install.packages("remotes")
# remotes::install_github("enzococca/palimpsestr")