## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE ) ## ----fl-load------------------------------------------------------------------ # library(treeSS) # data(fl_deaths) # # str(fl_deaths) # #> 'data.frame': N obs. of 5 variables: # #> $ county_fips: chr ... # #> $ icd10_code : chr "C56" "C61" "I10" ... # #> $ icd10_desc : chr ... # #> $ deaths : int ... # #> $ population : int ... # # c(rows = nrow(fl_deaths), # counties = length(unique(fl_deaths$county_fips)), # icd10_codes = length(unique(fl_deaths$icd10_code)), # total_deaths = sum(fl_deaths$deaths)) ## ----fl-tree------------------------------------------------------------------ # icd_codes <- sort(unique(fl_deaths$icd10_code)) # # parsed <- data.frame( # code = icd_codes, # three = ifelse(grepl("\\.", icd_codes), # sub("\\..*", "", icd_codes), # icd_codes), # is_three_leaf = !grepl("\\.", icd_codes) & nchar(icd_codes) == 3, # stringsAsFactors = FALSE # ) # # # Map 3-character codes to ICD-10 chapters # icd_chapter_of <- function(three) { # L <- substr(three, 1, 1) # num <- suppressWarnings(as.integer(substr(three, 2, 3))) # switch(L, # A = "I_AB", B = "I_AB", # C = "II_C_D", # D = if (!is.na(num) && num <= 48) "II_C_D" else "III_D", # E = "IV_E", F = "V_F", G = "VI_G", # H = if (!is.na(num) && num <= 59) "VII_H" else "VIII_H", # I = "IX_I", J = "X_J", K = "XI_K", L = "XII_L", # M = "XIII_M", N = "XIV_N", O = "XV_O", P = "XVI_P", # Q = "XVII_Q", R = "XVIII_R", # S = "XIX_S_T", T = "XIX_S_T", # V = "XX_V_Y", W = "XX_V_Y", X = "XX_V_Y", Y = "XX_V_Y", # "Other" # ) # } # parsed$chapter <- sapply(parsed$three, icd_chapter_of) # # fl_tree <- unique(rbind( # data.frame(node_id = "Root", parent_id = NA_character_, # stringsAsFactors = FALSE), # data.frame(node_id = unique(parsed$chapter), parent_id = "Root", # stringsAsFactors = FALSE), # unique(data.frame(node_id = parsed$three, parent_id = parsed$chapter, # stringsAsFactors = FALSE)), # data.frame( # node_id = parsed$code[!parsed$is_three_leaf], # parent_id = parsed$three[!parsed$is_three_leaf], # stringsAsFactors = FALSE # ) # )) # # # Sanity check: every code in fl_deaths is a leaf of fl_tree # parents <- unique(fl_tree$parent_id[!is.na(fl_tree$parent_id)]) # leaves <- setdiff(fl_tree$node_id, parents) # stopifnot(length(setdiff(icd_codes, leaves)) == 0) # # c(total_nodes = nrow(fl_tree), # chapters = length(unique(parsed$chapter)), # leaves = length(leaves)) ## ----fl-polygons-------------------------------------------------------------- # library(sf) # library(tigris) # # fl_map <- counties(state = "FL", cb = TRUE, year = 2016) # fl_map <- st_transform(fl_map, 4326) # # centroids <- st_centroid(st_geometry(fl_map)) # coords <- st_coordinates(centroids) # fl_map$x <- coords[, "X"] # fl_map$y <- coords[, "Y"] ## ----fl-vectors--------------------------------------------------------------- # # Per-county population: same value across all leaves, so take any one # fl_pop <- aggregate(population ~ county_fips, data = fl_deaths, # FUN = max) # # # Per-county centroids from the polygon layer # xy <- data.frame(county_fips = fl_map$GEOID, # x = fl_map$x, y = fl_map$y, # stringsAsFactors = FALSE) # # dat <- merge(fl_deaths[, c("county_fips", "icd10_code", "deaths")], # fl_pop, by = "county_fips") # dat <- merge(dat, xy, by = "county_fips") # # # Integer region_id is convenient downstream; here we use the rank # # of county_fips in alphabetical order # dat$region_id <- as.integer(factor( # dat$county_fips, levels = sort(unique(dat$county_fips)) # )) # # # Keep only rows with at least one death (sparse-by-default scan) # dat <- dat[dat$deaths > 0, ] # # c(long_rows = nrow(dat), # counties = length(unique(dat$region_id))) ## ----fl-scan------------------------------------------------------------------ # result_fl <- treespatial_scan( # cases = dat$deaths, # population = dat$population, # region_id = dat$region_id, # x = dat$x, # y = dat$y, # node_id = dat$icd10_code, # tree = fl_tree, # max_pop_pct = 0.05, # nsim = 999, seed = 2016, # n_cores = 4L # ) # print(result_fl) ## ----fl-filter---------------------------------------------------------------- # filter_clusters(result_fl)[1:5, # c("node_id", "n_regions", # "cases", "expected", "llr", # "pvalue")] ## ----fl-seq------------------------------------------------------------------- # seq_fl <- sequential_scan( # cases = dat$deaths, # population = dat$population, # region_id = dat$region_id, # x = dat$x, # y = dat$y, # node_id = dat$icd10_code, # tree = fl_tree, # max_iter = 5L, alpha = 0.05, # nsim = 999, seed = 2016, # max_pop_pct = 0.05, n_cores = 4L # ) # print(seq_fl) ## ----fl-plot------------------------------------------------------------------ # library(ggplot2) # library(dplyr) # # region_info <- unique(dat[, c("region_id", "county_fips")]) # # cr_seq <- merge( # get_cluster_regions(seq_fl, overlap = TRUE), # region_info, by = "region_id" # ) # # # Cross-join the polygon layer with every iteration panel so all # # counties appear in every panel (those outside the analysis # # dataset show as NA-coloured rather than as an extra empty panel) # panels <- unique(cr_seq$panel) # cr_seq_keys <- unique(cr_seq[, c("county_fips", "cluster", # "node_id", "llr", "pvalue", # "panel")]) # fl_seq_plot <- merge( # do.call(rbind, # lapply(panels, function(p) cbind(fl_map, panel = p))), # cr_seq_keys, # by.x = c("GEOID", "panel"), # by.y = c("county_fips", "panel"), # all.x = TRUE # ) # # ggplot(fl_seq_plot) + # geom_sf(aes(fill = factor(cluster)), color = "gray40", # linewidth = 0.12, alpha = 0.75) + # scale_fill_manual( # values = c("#C44E52", "#4C72B0", "#55A868", # "#8172B2", "#CCB974"), # na.value = "gray95", na.translate = FALSE # ) + # facet_wrap(~ panel, nrow = 1) + # theme_minimal() + # theme(legend.position = "none")