# Example code for the ukbflow `ops_*` helpers, organised as knitr chunks.
# Each `## ----label----` line opens a chunk. Only the setup chunk below is
# live code; the example chunks are kept fully commented out (consistent with
# `eval = FALSE`), with the expected console output shown on `#>` lines.

## ----setup, include = FALSE---------------------------------------------------
# Chunk defaults: merge code and output into one block, prefix output lines
# with "#>", and do not evaluate the example chunks.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE
)

## ----ops-setup----------------------------------------------------------------
# library(ukbflow)
#
# ops_setup()
# #> ── ukbflow environment check ──────────────────────────────────────────────
# #> ℹ ukbflow 0.1.0 | R 4.4.1 | 2026-03-09
# #> ── 1. dx-toolkit ──────────────────────────────────────────────────────────
# #>   ✔ dx: /usr/local/bin/dx (dx-toolkit v0.375.0)
# #> ── 2. RAP authentication ───────────────────────────────────────────────────
# #>   ✔ user: evan.zhou
# #>   ✔ project: project-GXk9...
# #> ── 3. R packages ───────────────────────────────────────────────────────────
# #>   ✔ cli          3.6.3  [core]
# #>   ✔ data.table   1.15.4 [core]
# #>   ✔ survival     3.7.0  [assoc_coxph]
# #>   ✔ forestploter 1.1.1  [plot_forest]
# #>   ...
# #> ───────────────────────────────────────────────────────────────────────────
# #>   ✔ 15 passed
# #>   ! 2 optional / warning

## ----ops-setup-prog-----------------------------------------------------------
# result <- ops_setup(verbose = FALSE)
# result$summary
# #> $pass
# #> [1] 15
# #> $warn
# #> [1] 2
# #> $fail
# #> [1] 0
#
# # Gate the rest of your script on a clean environment
# stopifnot(result$summary$fail == 0)

## ----ops-setup-partial--------------------------------------------------------
# # Check R package dependencies only (skip dx and RAP auth)
# ops_setup(check_dx = FALSE, check_auth = FALSE)

## ----ops-toy-cohort-----------------------------------------------------------
# dt <- ops_toy()
# #> ✔ ops_toy: 1000 participants | 75 columns | scenario = "cohort" | seed = 42
#
# dim(dt)
# #> [1] 1000   75
#
# names(dt)
# #> [1] "eid"       "p31"       "p34"       "p53_i0"
# #> [5] "p21022"    "p21001_i0" "p20116_i0" "p1558_i0"
# #> ...
## ----ops-toy-pipeline---------------------------------------------------------
# dt <- ops_toy()
# dt <- derive_missing(dt)
# dt <- derive_covariate(dt,
#   as_numeric = "p21001_i0",
#   as_factor = c("p31", "p20116_i0")
# )

## ----ops-toy-forest-----------------------------------------------------------
# dt_forest <- ops_toy(scenario = "forest")
# #> ✔ ops_toy: 24 rows | 11 columns | scenario = "forest" | seed = 42
#
# plot_forest(
#   data = dt_forest[model == "Fully adjusted"],
#   est = dt_forest[model == "Fully adjusted", HR],
#   lower = dt_forest[model == "Fully adjusted", CI_lower],
#   upper = dt_forest[model == "Fully adjusted", CI_upper]
# )

## ----ops-toy-seed-------------------------------------------------------------
# dt1 <- ops_toy(seed = 1)
# dt2 <- ops_toy(seed = 1)
# identical(dt1, dt2) # TRUE
#
# dt_random <- ops_toy(seed = NULL) # different every call

## ----ops-na-basic-------------------------------------------------------------
# # NOTE(review): the ops_toy() example earlier in this file reports 75
# # columns, while the output below reports 65 — confirm which is current.
# # NOTE(review): the header shows threshold = 0% but the summary line says
# # "≥ 10% missing" — confirm the two are meant to differ.
# dt <- ops_toy()
# ops_na(dt)
# #> ── ops_na ──────────────────────────────────────────────────────────────────
# #> ℹ 1000 rows | 65 columns | threshold = 0%
# #> ✖ messy_allna   1000 / 1000 (100.00%)
# #> ✖ p41280_a4     1000 / 1000 (100.00%)
# #> ✖ p20002_i0_a4   976 / 1000 ( 97.60%)
# #> ✖ p131742        916 / 1000 ( 91.60%)
# #> ...
# #> ────────────────────────────────────────────────────────────────────────────
# #> ✖ 41 columns ≥ 10% missing
# #> ✔ 24 columns complete (0% missing)

## ----ops-na-threshold---------------------------------------------------------
# # Only list columns with > 50% missing in the console output
# ops_na(dt, threshold = 50)
#
# # Suppress all per-column lines — summary only
# # NOTE(review): the ops_na(dt) output above lists columns at 100% missing;
# # if threshold is a strict ">" cut-off, threshold = 99 would presumably
# # still print those — confirm.
# ops_na(dt, threshold = 99)

## ----ops-na-prog--------------------------------------------------------------
# result <- ops_na(dt, verbose = FALSE)
# result
# #>         column n_na pct_na
# #>
# #> 1: messy_allna 1000  100.0
# #> 2:   p41280_a4 1000  100.0
# #> ...
#
# # Identify columns to drop before modelling
# cols_to_drop <- result[pct_na > 90, column]
# dt[, (cols_to_drop) := NULL]

## ----ops-snapshot-record------------------------------------------------------
# dt <- ops_toy()
# ops_snapshot(dt, label = "raw")
# #> ── snapshot: raw ───────────────────────────────────────────────────────────
# #>   rows     1,000
# #>   cols     65
# #>   NA cols  41
# #>   size     0.61 MB
# #> ────────────────────────────────────────────────────────────────────────────
#
# dt <- derive_missing(dt)
# ops_snapshot(dt, label = "after_derive_missing")
# #> ── snapshot: after_derive_missing ──────────────────────────────────────────
# #>   rows     1,000 (= 0)
# #>   cols     65 (= 0)
# #>   NA cols  43 (+2)
# #>   size     0.61 MB (= 0)
# #> ────────────────────────────────────────────────────────────────────────────
#
# dt <- dt[p31 == "Female"]
# ops_snapshot(dt, label = "female_only")
# #> ── snapshot: female_only ───────────────────────────────────────────────────
# #>   rows     570 (-430)
# #>   cols     65 (= 0)
# #>   NA cols  43 (= 0)
# #>   size     0.36 MB (-0.25 MB)
# #> ────────────────────────────────────────────────────────────────────────────

## ----ops-snapshot-history-----------------------------------------------------
# ops_snapshot()
# #> ── ops_snapshot history ────────────────────────────────────────────────────
# #>    idx                label timestamp nrow ncol n_na_cols size_mb
# #> 1:   1                  raw  14:30:01 1000   65        41    0.61
# #> 2:   2 after_derive_missing  14:30:05 1000   65        43    0.61
# #> 3:   3          female_only  14:30:08  570   65        43    0.36
# #> ────────────────────────────────────────────────────────────────────────────

## ----ops-snapshot-silent------------------------------------------------------
# ops_snapshot(dt, label = "pre_assoc", verbose = FALSE)

## ----ops-snapshot-reset-------------------------------------------------------
# ops_snapshot(reset = TRUE)
# #> ✔ Snapshot history cleared.
# Snapshot column helpers, withdrawal handling, and an end-to-end workflow.
# Every example chunk below is commented out; `#>` lines show the expected
# console output for the call above them.

## ----ops-snapshot-cols--------------------------------------------------------
# raw_cols <- ops_snapshot_cols("raw")
# # raw_cols is a character vector of droppable column names

## ----ops-snapshot-cols-keep---------------------------------------------------
# raw_cols <- ops_snapshot_cols("raw", keep = "p53_i0")

## ----ops-snapshot-diff--------------------------------------------------------
# result <- ops_snapshot_diff("raw", "after_derive_missing")
# result$added   # columns added in this step
# result$removed # columns dropped in this step

## ----ops-snapshot-remove------------------------------------------------------
# # After deriving, drop the original raw columns
# dt <- ops_snapshot_remove(dt, from = "raw")
# #> ✔ ops_snapshot_remove: dropped 60 raw columns, 15 remaining.

## ----ops-set-safe-cols--------------------------------------------------------
# ops_set_safe_cols(c("date_baseline", "age_at_recruitment"))
#
# # Clear registered safe cols
# ops_set_safe_cols(reset = TRUE)

## ----ops-withdraw-------------------------------------------------------------
# # The file name matches the "Withdrawal file:" line in the output below.
# dt <- ops_withdraw(dt, file = "w854944_20260310.csv")
# #> ── snapshot: before_withdraw ───────────────────────────────────────────────
# #>   rows     502,492
# #>   ...
# #> ── snapshot: after_withdraw ────────────────────────────────────────────────
# #>   rows     502,489 (-3)
# #>   ...
# #> ℹ Withdrawal file: w854944_20260310.csv (312 IDs)
# #> ✖ Excluded: 3 participants found in data
# #> ✔ Remaining: 502,489 participants

## ----ops-workflow-------------------------------------------------------------
# library(ukbflow)
#
# # 1. Verify environment before starting
# ops_setup()
#
# # 2. Generate test data (or extract real data from RAP)
# dt <- ops_toy()
#
# # 3. Inspect data quality before processing
# ops_na(dt)
#
# # 4. Run pipeline with checkpoints
# ops_snapshot(dt, label = "raw")
#
# dt <- derive_missing(dt)
# ops_snapshot(dt, label = "after_derive_missing")
#
# dt <- derive_covariate(dt,
#   as_numeric = "p21001_i0",
#   as_factor = c("p31", "p20116_i0")
# )
# ops_snapshot(dt, label = "after_derive_covariate")
#
# # 5. Review full pipeline history
# ops_snapshot()