## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", eval = FALSE ) ## ----load-data---------------------------------------------------------------- # library(ukbflow) # # data <- ops_toy(n = 500000, seed = 2026) # #> ✔ ops_toy: 500000 participants | 75 columns | scenario = "cohort" | seed = 2026 ## ----decode-names------------------------------------------------------------- # data <- decode_names(data) # #> Field dictionary not cached - calling `extract_ls()` to populate it. # #> Using dataset: "XXXXXXXXXXXXXXXXXXXXXXX" # #> Fetching approved fields... (cached after first call) # #> 29951 fields available. Assign to a variable or use pattern= to search. # #> ✔ Renamed 68 columns. # #> ! 5 column names longer than 60 characters - consider renaming manually: # #> • interpolated_year_when_non_cancer_illness_first_diagnosed_i0_a0 # #> • interpolated_year_when_non_cancer_illness_first_diagnosed_i0_a1 # #> • interpolated_year_when_non_cancer_illness_first_diagnosed_i0_a2 # #> • interpolated_year_when_non_cancer_illness_first_diagnosed_i0_a3 # #> • interpolated_year_when_non_cancer_illness_first_diagnosed_i0_a4 ## ----derive-missing----------------------------------------------------------- # data <- derive_missing(data) # #> ✔ derive_missing: replaced 279650 values across 3 columns (action = "na"). ## ----derive-covariate--------------------------------------------------------- # data <- derive_covariate( # data, # as_factor = c( # "p31", # sex # "p20116_i0", # smoking_status_i0 # "p1558_i0", # alcohol_intake_frequency_i0 # "p54_i0" # uk_biobank_assessment_centre_i0 # ) # ) # #> ── Factor ─────────────────────────────────────────────────────────────────── # #> sex [2 levels] # #> Female: n=270191 (54%) | Male: n=229809 (46%) | : n=0 (0%) # #> smoking_status_i0 [3 levels] # #> Current: n=69867 (14%) | Never: n=260458 (52.1%) | Previous: n=154871 (31%) | : n=14804 (3%) # #> ! alcohol_intake_frequency_i0: 6 levels > max_levels (5), consider collapsing categories. # #> Daily or almost daily: n=40113 (8%) | Never: n=45207 (9%) | ... | : n=14973 (3%) # #> ! uk_biobank_assessment_centre_i0: 10 levels > max_levels (5), consider collapsing categories. # #> Birmingham: n=49709 (9.9%) | Bristol: n=49832 (10%) | ... | Sheffield: n=50293 (10.1%) ## ----derive-cut--------------------------------------------------------------- # data <- derive_cut( # data, # col = "p21001_i0", # body_mass_index_bmi_i0 # n = 4, # breaks = c(18.5, 25, 30), # labels = c("Underweight", "Normal", "Overweight", "Obese"), # name = "bmi_cat" # ) # #> ── Source: body_mass_index_bmi_i0 ─────────────────────────────────────────── # #> mean=26.21, median=26.19, sd=5.48, Q1=22.48, Q3=29.9, NA=0% (n=0) # #> ── New column: bmi_cat ────────────────────────────────────────────────────── # #> bmi_cat [4 levels] # #> Underweight: n=40238 (8%) | Normal: n=166710 (33.3%) # #> Overweight: n=170794 (34.2%) | Obese: n=122258 (24.5%) | : n=0 (0%) # # data <- derive_cut( # data, # col = "p22189", # townsend_deprivation_index_at_recruitment # n = 4, # labels = c("Q1 (least deprived)", "Q2", "Q3", "Q4 (most deprived)"), # name = "tdi_cat" # ) # #> ── Source: townsend_deprivation_index_at_recruitment ──────────────────────── # #> mean=-1.25, median=-1.3, sd=3.1, Q1=-3.46, Q3=0.86, NA=0% (n=0) # #> ── New column: tdi_cat ────────────────────────────────────────────────────── # #> tdi_cat [4 levels] # #> Q1 (least deprived): n=125290 (25.1%) | Q2: n=124831 (25%) # #> Q3: n=125102 (25%) | Q4 (most deprived): n=124777 (25%) | : n=0 (0%) ## ----derive-selfreport-------------------------------------------------------- # data <- derive_selfreport( # data, # name = "lung_cancer", # regex = "lung cancer", # field = "cancer" # ) # #> ✔ derive_selfreport (lung_cancer): 6700 cases, 6700 with dates. ## ----derive-icd10------------------------------------------------------------- # data <- derive_icd10( # data, # name = "lung", # icd10 = "^C3[34]", # match = "regex", # source = "cancer_registry", # behaviour = 3L # ) # #> ✔ derive_cancer_registry (lung): 2449 cases, 2449 with date. # #> ✔ derive_icd10 (lung): 2449 cases across 1 source, 2449 with date. ## ----derive-case-------------------------------------------------------------- # data <- derive_case( # data, # name = "lung", # selfreport_col = "lung_cancer_selfreport", # selfreport_date_col = "lung_cancer_selfreport_date" # ) # #> ✔ derive_case (lung): 9110 cases, 9110 with date. # #> ℹ Both sources (lung_icd10 & lung_cancer_selfreport): 39 ## ----derive-timing------------------------------------------------------------ # data <- derive_timing(data, name = "lung", baseline_col = "p53_i0") # date_of_attending_assessment_centre_i0 # #> ✔ derive_timing (lung_timing): # #> ℹ 0 (no disease): 490890 # #> ℹ 1 (prevalent): 5056 # #> ℹ 2 (incident): 4054 # #> ℹ NA (no date): 0 ## ----derive-followup---------------------------------------------------------- # data <- derive_followup( # data, # name = "lung", # event_col = "lung_date", # baseline_col = "p53_i0", # date_of_attending_assessment_centre_i0 # censor_date = as.Date("2022-10-31"), # death_col = "p40000_i0", # date_of_death_i0 # lost_col = FALSE # lost-to-follow-up not available in this dataset # ) # #> ✔ derive_followup (lung): # #> ℹ lung_followup_end: 500000 / 500000 non-missing # #> ℹ lung_followup_years: mean=13.71, median=14.09, range=[0, 16.83] ## ----exposure----------------------------------------------------------------- # data[, smoking_ever := factor( # ifelse(p20116_i0 == "Never", "Never", "Ever"), # levels = c("Never", "Ever") # Never = reference # )] ## ----snapshot-raw------------------------------------------------------------- # ops_snapshot(data, label = "raw") # #> ── snapshot: raw ──────────────────────────────────────────────────────────── # #> rows 500,000 # #> cols 89 # #> NA cols 59 # #> size 293 MB ## ----excl-prevalent----------------------------------------------------------- # data <- data[lung_timing != 1 | is.na(lung_timing)] # ops_snapshot(data, label = "after excluding prevalent cases") # #> ── snapshot: after excluding prevalent cases ───────────────────────────────── # #> rows 494,944 (-5,056) # #> cols 89 (= 0) # #> NA cols 58 (-1) # #> size 290.1 MB (-2.95 MB) ## ----excl-na------------------------------------------------------------------ # data <- data[!is.na(smoking_ever) & # !is.na(p31) & # sex # !is.na(p21022) & # age_at_recruitment # !is.na(bmi_cat) & # bmi category # !is.na(p1558_i0) & # alcohol_intake_frequency_i0 # !is.na(tdi_cat) & # townsend deprivation category # !is.na(p54_i0) & # assessment_centre # !is.na(p22009_a1) & # PC1 # !is.na(p22009_a2) & # PC2 # !is.na(p22009_a3) & # PC3 # !is.na(p22009_a4) & # PC4 # !is.na(p22009_a5) & # PC5 # !is.na(p22009_a6) & # PC6 # !is.na(p22009_a7) & # PC7 # !is.na(p22009_a8) & # PC8 # !is.na(p22009_a9) & # PC9 # !is.na(p22009_a10)] # PC10 # ops_snapshot(data, label = "after excluding missing covariates") # #> ── snapshot: after excluding missing covariates ────────────────────────────── # #> rows 465,937 (-29,007) # #> cols 89 (= 0) # #> NA cols 55 (-3) # #> size 273.3 MB (-16.75 MB) ## ----snapshot-history--------------------------------------------------------- # ops_snapshot() # #> ── ops_snapshot history ───────────────────────────────────────────────────── # #> idx label timestamp nrow ncol n_na_cols size_mb # #> 1: 1 raw 15:35:24 500000 89 59 293.03 # #> 2: 2 after excluding prevalent cases 15:35:51 494944 89 58 290.08 # #> 3: 3 after excluding missing covariates 15:36:18 465937 89 55 273.33 ## ----cohort-overview---------------------------------------------------------- # # Exposure distribution # data[, .N, by = smoking_ever] # # #> smoking_ever N # #> # #> 1: Ever 215893 # #> 2: Never 250044 # # # Outcome: incident cases and timing # data[, .N, by = lung_timing] # #> lung_timing N # #> # #> 1: 0 462110 # #> 2: 2 3827 # # # Follow-up time in years # data[, .(mean = round(mean(lung_followup_years), 2), # median = round(median(lung_followup_years), 2), # min = round(min(lung_followup_years), 2), # max = round(max(lung_followup_years), 2))] # #> mean median min max # #> # #> 1: 13.71 14.09 0 16.83 ## ----assoc-coxph-------------------------------------------------------------- # res <- assoc_coxph( # data = data, # outcome_col = "lung_status", # time_col = "lung_followup_years", # exposure_col = "smoking_ever", # covariates = c("p21022", # age_at_recruitment # "p31", # sex # "bmi_cat", # "tdi_cat", # "p1558_i0", # alcohol_intake_frequency_i0 # "p54_i0", # uk_biobank_assessment_centre_i0 # paste0("p22009_a", 1:10)) # genetic PCs 1-10 # ) # #> ℹ outcome_col lung_status: logical detected, converting TRUE/FALSE -> 1/0 # #> ── assoc_coxph ────────────────────────────────────────────────────────────── # #> ℹ 1 exposure x 3 models = 3 Cox regressions # #> ℹ Input cohort: 465937 participants (n/n_events/person_years reflect each model's actual analysis set) # #> ── smoking_ever ───────────────────────────────────────────────────────────── # #> ✔ Unadjusted | smoking_everEver: HR 0.99 (0.93-1.06), p = 0.834 # #> ✔ Age and sex adjusted | smoking_everEver: HR 0.99 (0.93-1.06), p = 0.838 # #> ✔ Fully adjusted | smoking_everEver: HR 0.99 (0.93-1.06), p = 0.829 # #> ✔ Done: 3 result rows across 1 exposure and 3 models. ## ----res-print---------------------------------------------------------------- # print(res) # #> exposure term model n n_events person_years HR CI_lower # #> # #> 1: smoking_ever smoking_everEver Unadjusted 465937 3827 6388195 0.9932076 0.9320517 # #> 2: smoking_ever smoking_everEver Age and sex adjusted 465937 3827 6388195 0.9933735 0.9322072 # #> 3: smoking_ever smoking_everEver Fully adjusted 465937 3827 6388195 0.9930029 0.9318576 # #> CI_upper p_value HR_label # #> # #> 1: 1.058376 0.8335146 0.99 (0.93-1.06) # #> 2: 1.058553 0.8375372 0.99 (0.93-1.06) # #> 3: 1.058160 0.8285626 0.99 (0.93-1.06) # # class(res) # #> [1] "data.table" "data.frame" ## ----res-df------------------------------------------------------------------- # res_df <- as.data.frame(res) ## ----forest-plot-------------------------------------------------------------- # p <- plot_forest( # data = res_df, # est = res_df$HR, # lower = res_df$CI_lower, # upper = res_df$CI_upper, # ci_column = 2L, # forest plot rendered in column 2 (HR_label) # p_cols = "p_value", # ref_line = 1, # xlim = c(0, 2.0), # ticks_at = c(0, 0.5, 1.0, 1.5, 2.0) # ) # # plot(p) ## ----forest-plot-pub---------------------------------------------------------- # p2 <- plot_forest( # data = res_df[, c("model", "n_events", "n", "p_value")], # est = res_df$HR, # lower = res_df$CI_lower, # upper = res_df$CI_upper, # ci_column = 4L, # p_cols = "p_value", # ref_line = 1, # xlim = c(0, 2.0), # ticks_at = c(0.5, 0.75, 1.0, 1.25, 1.5), # header = c("Model", "Cases", "N", "", "HR (95% CI)", "P value") # ) # # plot(p2) ## ----forest-plot-header------------------------------------------------------- # res_pub <- rbind( # data.frame(model = "Ever vs. Never", HR_label = "", p_value = NA, # HR = NA, CI_lower = NA, CI_upper = NA, stringsAsFactors = FALSE), # res_df[, c("model", "HR_label", "p_value", "HR", "CI_lower", "CI_upper")] # ) # # p3 <- plot_forest( # data = res_pub[, c("model", "p_value")], # est = res_pub$HR, # lower = res_pub$CI_lower, # upper = res_pub$CI_upper, # ci_column = 2L, # p_cols = "p_value", # ref_line = 1, # xlim = c(0, 2.0), # ticks_at = c(0, 0.5, 1.0, 1.5, 2.0), # indent = c(0L, 1L, 1L, 1L), # bold_label = c(TRUE, FALSE, FALSE, FALSE), # header = c("Model", "", "HR (95% CI)", "P value") # ) # # plot(p3) ## ----tableone----------------------------------------------------------------- # t1 <- plot_tableone( # data = as.data.frame(data), # vars = c("p21022", "p31", "bmi_cat", "tdi_cat", "p1558_i0"), # strata = "smoking_ever", # label = list( # p21022 ~ "Age at recruitment (years)", # p31 ~ "Sex", # bmi_cat ~ "BMI category", # tdi_cat ~ "Townsend deprivation index", # p1558_i0 ~ "Alcohol intake frequency" # ), # add_p = TRUE, # save = FALSE # ) # # print(t1) ## ----session-info------------------------------------------------------------- # sessionInfo() # #> R version 4.5.1 (2025-06-13 ucrt) # #> Platform: x86_64-w64-mingw32/x64 # #> Running under: Windows 11 x64 (build 26200) # #> # #> Matrix products: default # #> LAPACK version 3.12.1 # #> # #> locale: # #> [1] LC_COLLATE=Chinese (Simplified)_China.utf8 LC_CTYPE=Chinese (Simplified)_China.utf8 # #> [3] LC_MONETARY=Chinese (Simplified)_China.utf8 LC_NUMERIC=C # #> [5] LC_TIME=Chinese (Simplified)_China.utf8 # #> # #> time zone: Asia/Shanghai # #> tzcode source: internal # #> # #> attached base packages: # #> [1] stats graphics grDevices utils datasets methods base # #> # #> other attached packages: # #> [1] ukbflow_0.3.0 testthat_3.2.3 # #> # #> loaded via a namespace (and not attached): # #> [1] gt_1.0.0 sass_0.4.10 tidyr_1.3.1 generics_0.1.4 gtsummary_2.4.0 # #> [6] xml2_1.3.8 lattice_0.22-7 digest_0.6.37 magrittr_2.0.3 evaluate_1.0.5 # #> [11] grid_4.5.1 cards_0.7.0 pkgload_1.4.0 fastmap_1.2.0 rprojroot_2.1.1 # #> [16] jsonlite_2.0.0 Matrix_1.7-3 processx_3.8.6 pkgbuild_1.4.8 sessioninfo_1.2.3 # #> [21] backports_1.5.0 cardx_0.3.0 brio_1.1.5 survival_3.8-3 ps_1.9.1 # #> [26] gridExtra_2.3 purrr_1.0.4 cli_3.6.4 forestploter_1.1.3 rlang_1.1.6 # #> [31] litedown_0.7 commonmark_2.0.0 ellipsis_0.3.2 splines_4.5.1 remotes_2.5.0 # #> [36] withr_3.0.2 cachem_1.1.0 devtools_2.4.5.9000 tools_4.5.1 memoise_2.0.1 # #> [41] dplyr_1.1.4.9000 broom_1.0.10 curl_7.0.0 vctrs_0.6.5 R6_2.6.1 # #> [46] lifecycle_1.0.4 fs_1.6.6 usethis_3.2.1 pkgconfig_2.0.3 desc_1.4.3 # #> [51] pillar_1.11.1 gtable_0.3.6 data.table_1.17.0 glue_1.8.0 xfun_0.52 # #> [56] tibble_3.2.1 tidyselect_1.2.1 rstudioapi_0.17.1 knitr_1.50 htmltools_0.5.8.1 # #> [61] rmarkdown_2.29 compiler_4.5.1 markdown_2.0