## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width = 7, fig.height = 4, fig.align = "center" ) ## ----quick-start, message = FALSE, warning = FALSE---------------------------- library(rsdv) set.seed(42) meta <- metadata(adult_income) |> set_column_type("age", "numerical") |> set_column_type("education_num", "numerical") |> set_column_type("hours_per_week", "numerical") |> set_column_type("occupation", "categorical") |> set_column_type("income", "categorical") syn <- gaussian_copula_synthesizer(meta) |> fit(adult_income) synth <- sample(syn, n = 500) head(synth[, c("age", "education_num", "occupation", "income")]) ## ----metadata, warning = FALSE------------------------------------------------ meta <- metadata(adult_income) |> set_column_type("age", "numerical") |> set_column_type("education_num", "numerical") |> set_column_type("hours_per_week", "numerical") |> set_column_type("occupation", "categorical") |> set_column_type("marital_status", "categorical") |> set_column_type("income", "categorical") print(meta) ## ----fit-sample, message = FALSE, warning = FALSE----------------------------- set.seed(42) syn <- gaussian_copula_synthesizer(meta) syn <- fit(syn, adult_income) synth <- sample(syn, n = 500) ## ----distributions, message = FALSE, warning = FALSE-------------------------- syn_dist <- gaussian_copula_synthesizer( meta, numerical_distributions = list(capital_gain = "gamma"), default_distribution = "norm" ) |> fit(adult_income) ## ----conditional, warning = FALSE--------------------------------------------- high_earners <- sample_conditions( syn, data.frame(income = ">50K", .n = 50, stringsAsFactors = FALSE) ) table(high_earners$income) ## ----quality, warning = FALSE------------------------------------------------- qr <- quality_report(adult_income, synth, meta) print(qr) ## ----plot-column-similarity, fig.height = 5, warning = FALSE------------------ col_scores <- rbind( data.frame( column = qr$ks_scores$column, score = qr$ks_scores$score, type = "KS similarity\n(numerical)", stringsAsFactors = FALSE ), data.frame( column = qr$tvd_scores$column, score = qr$tvd_scores$score, type = "TVD similarity\n(categorical)", stringsAsFactors = FALSE ) ) ggplot2::ggplot( col_scores, ggplot2::aes(x = reorder(column, score), y = score, fill = type) ) + ggplot2::geom_col(width = 0.65, alpha = 0.9) + ggplot2::geom_text( ggplot2::aes(label = sprintf("%.2f", score)), hjust = -0.15, size = 3.2, colour = "grey30" ) + ggplot2::coord_flip() + ggplot2::scale_y_continuous( limits = c(0, 1.15), labels = scales::percent_format(accuracy = 1) ) + ggplot2::scale_fill_manual( values = c( "KS similarity\n(numerical)" = "#2171b5", "TVD similarity\n(categorical)" = "#238b45" ) ) + ggplot2::labs( title = "Column Similarity: Real vs. Synthetic", subtitle = sprintf("Overall quality score: %.3f", qr$overall_score), x = NULL, y = "Similarity score", fill = NULL ) + ggplot2::theme_minimal(base_size = 11) + ggplot2::theme( legend.position = "bottom", panel.grid.major.y = ggplot2::element_blank(), plot.title = ggplot2::element_text(face = "bold"), plot.subtitle = ggplot2::element_text(colour = "grey40") ) ## ----plot-correlation, fig.height = 4, warning = FALSE------------------------ num_cols <- c("age", "education_num", "hours_per_week") cor_real <- round(cor(adult_income[, num_cols], use = "complete.obs"), 3) cor_syn <- round(cor(synth[, num_cols], use = "complete.obs"), 3) mat_to_long <- function(mat, source) { nms <- colnames(mat) data.frame( var1 = rep(nms, each = length(nms)), var2 = rep(nms, times = length(nms)), value = as.vector(mat), source = source, stringsAsFactors = FALSE ) } cor_long <- rbind( mat_to_long(cor_real, "Real data"), mat_to_long(cor_syn, "Synthetic data") ) cor_long$var1 <- factor(cor_long$var1, levels = rev(num_cols)) cor_long$var2 <- factor(cor_long$var2, levels = num_cols) ggplot2::ggplot(cor_long, ggplot2::aes(var2, var1, fill = value)) + ggplot2::geom_tile(colour = "white", linewidth = 0.8) + ggplot2::geom_text( ggplot2::aes(label = sprintf("%.2f", value)), size = 3.5, colour = "grey20" ) + ggplot2::facet_wrap(~source) + ggplot2::scale_fill_gradient2( low = "#d73027", mid = "white", high = "#1a9850", midpoint = 0, limits = c(-1, 1), name = "Pearson r" ) + ggplot2::labs( title = "Correlation Matrix: Real vs. Synthetic", x = NULL, y = NULL ) + ggplot2::theme_minimal(base_size = 11) + ggplot2::theme( axis.text.x = ggplot2::element_text(angle = 30, hjust = 1), strip.text = ggplot2::element_text(face = "bold"), legend.position = "right", plot.title = ggplot2::element_text(face = "bold"), panel.grid = ggplot2::element_blank() ) ## ----plot-marginals-age, fig.height = 3.5, warning = FALSE-------------------- age_data <- rbind( data.frame(value = adult_income$age, source = "Real"), data.frame(value = synth$age, source = "Synthetic") ) ggplot2::ggplot(age_data, ggplot2::aes(x = value, fill = source, colour = source)) + ggplot2::geom_density(alpha = 0.35, linewidth = 0.7) + ggplot2::scale_fill_manual(values = c("Real" = "#2171b5", "Synthetic" = "#ef6548")) + ggplot2::scale_colour_manual(values = c("Real" = "#2171b5", "Synthetic" = "#ef6548")) + ggplot2::labs( title = "Age Distribution: Real vs. Synthetic", x = "Age (years)", y = "Density", fill = NULL, colour = NULL ) + ggplot2::theme_minimal(base_size = 11) + ggplot2::theme( legend.position = "bottom", plot.title = ggplot2::element_text(face = "bold") ) ## ----plot-marginals-income, fig.height = 3, warning = FALSE------------------- income_real <- as.data.frame(table(adult_income$income) / nrow(adult_income)) income_synth <- as.data.frame(table(synth$income) / nrow(synth)) names(income_real) <- c("category", "proportion") names(income_synth) <- c("category", "proportion") income_real$source <- "Real" income_synth$source <- "Synthetic" income_data <- rbind(income_real, income_synth) ggplot2::ggplot( income_data, ggplot2::aes(x = category, y = proportion, fill = source) ) + ggplot2::geom_col(position = "dodge", width = 0.55, alpha = 0.9) + ggplot2::scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + ggplot2::scale_fill_manual(values = c("Real" = "#2171b5", "Synthetic" = "#ef6548")) + ggplot2::labs( title = "Income Category: Real vs. Synthetic", x = NULL, y = "Proportion", fill = NULL ) + ggplot2::theme_minimal(base_size = 11) + ggplot2::theme( legend.position = "bottom", plot.title = ggplot2::element_text(face = "bold") ) ## ----diagnostic, warning = FALSE---------------------------------------------- dr <- diagnostic_report(adult_income, synth, meta) print(dr) ## ----privacy, warning = FALSE------------------------------------------------- pr <- privacy_report(adult_income, synth) print(pr) ## ----plot-privacy, fig.height = 3, warning = FALSE---------------------------- score_val <- pr$nndr_score zones <- data.frame( xmin = c(0, 0.25, 0.50, 0.75), xmax = c(0.25, 0.50, 0.75, 1.00), ymid = 0.5, label = c("High risk", "Moderate", "Good", "Excellent"), fill = c("#d73027", "#fee090", "#a6d96a", "#1a9850"), stringsAsFactors = FALSE ) ggplot2::ggplot() + ggplot2::geom_rect( data = zones, ggplot2::aes(xmin = xmin, xmax = xmax, ymin = 0, ymax = 1, fill = fill), alpha = 0.35 ) + ggplot2::geom_text( data = zones, ggplot2::aes(x = (xmin + xmax) / 2, y = 0.15, label = label), size = 3, colour = "grey30" ) + ggplot2::geom_segment( data = data.frame(x = score_val), ggplot2::aes(x = x, xend = x, y = 0, yend = 1), colour = "black", linewidth = 1.6 ) + ggplot2::geom_label( data = data.frame(x = score_val), ggplot2::aes(x = x, y = 0.72, label = sprintf("NNDR = %.3f", x)), hjust = -0.08, size = 3.8, fontface = "bold", fill = "white", label.size = 0 ) + ggplot2::scale_fill_identity() + ggplot2::scale_x_continuous( limits = c(0, 1), labels = scales::percent_format(accuracy = 1), expand = c(0, 0) ) + ggplot2::labs( title = "Privacy Score: Nearest-Neighbour Distance Ratio (NNDR)", subtitle = "Higher score = lower re-identification risk", x = "NNDR score", y = NULL ) + ggplot2::theme_minimal(base_size = 11) + ggplot2::theme( axis.text.y = ggplot2::element_blank(), axis.ticks.y = ggplot2::element_blank(), panel.grid = ggplot2::element_blank(), plot.title = ggplot2::element_text(face = "bold"), plot.subtitle = ggplot2::element_text(colour = "grey40") ) ## ----attr-disclosure, warning = FALSE----------------------------------------- adr <- attribute_disclosure_risk( real = adult_income, synthetic = synth, sensitive_col = "income", known_cols = "age" ) cat("Attribute disclosure risk (income given age):", round(adr, 3), "\n") ## ----constraints, warning = FALSE--------------------------------------------- meta_constrained <- meta |> add_constraint( inequality_constraint("education_num", "hours_per_week", type = "lt") ) syn_c <- gaussian_copula_synthesizer(meta_constrained) |> fit(adult_income) synth_c <- sample(syn_c, n = 500) # Verify: education_num < hours_per_week in all rows — should return TRUE all(synth_c$education_num < synth_c$hours_per_week)