## ----include = FALSE----------------------------------------------------------
# Global chunk options for the rendered vignette.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----setup--------------------------------------------------------------------
library(tidylearn)
library(dplyr)
library(ggplot2)

## ----eval=FALSE---------------------------------------------------------------
# # Run AutoML on iris dataset
# result <- tl_auto_ml(iris, Species ~ .,
#                      task = "classification",
#                      time_budget = 60)
#
# # View best model
# print(result$best_model)

## ----eval=FALSE---------------------------------------------------------------
# # View all models tried
# names(result$models)

## ----eval=FALSE---------------------------------------------------------------
# # View leaderboard
# result$leaderboard

## ----eval=FALSE---------------------------------------------------------------
# # Run AutoML on regression problem
# result_reg <- tl_auto_ml(mtcars, mpg ~ .,
#                          task = "regression",
#                          time_budget = 60)
#
# # Best model
# print(result_reg$best_model)

## ----eval=FALSE---------------------------------------------------------------
# # Quick sanity check -- 2 fast models, no CV, done in ~1s
# quick <- tl_auto_ml(iris, Species ~ .,
#                     time_budget = 10,
#                     use_reduction = FALSE,
#                     use_clustering = FALSE)
# quick$leaderboard
# #> baseline_tree, baseline_logistic
#
# # Development iteration -- baselines + forest, some CV
# medium <- tl_auto_ml(iris, Species ~ .,
#                      time_budget = 60,
#                      cv_folds = 3)
# medium$leaderboard
# #> 5--7 models depending on data size
#
# # Thorough search -- all phases, full CV
# thorough <- tl_auto_ml(iris, Species ~ .,
#                        time_budget = 300,
#                        cv_folds = 5)
# thorough$leaderboard
# #> 9--11 models with cross-validated scores

## ----eval=FALSE---------------------------------------------------------------
# # Factor/character response -> classification
# result_class <- tl_auto_ml(iris, Species ~ ., task = "auto")
#
# # Numeric response -> regression
# result_reg <- tl_auto_ml(mtcars, mpg ~ ., task = "auto")

## ----eval=FALSE---------------------------------------------------------------
# # Disable dimensionality reduction
# no_reduction <- tl_auto_ml(iris, Species ~ .,
#                            use_reduction = FALSE,
#                            time_budget = 60)
#
# # Disable cluster features
# no_clustering <- tl_auto_ml(iris, Species ~ .,
#                             use_clustering = FALSE,
#                             time_budget = 60)
#
# # Baseline models only
# baseline_only <- tl_auto_ml(iris, Species ~ .,
#                             use_reduction = FALSE,
#                             use_clustering = FALSE,
#                             time_budget = 30)

## ----eval=FALSE---------------------------------------------------------------
# # Adjust cross-validation folds
# result_cv <- tl_auto_ml(iris, Species ~ .,
#                         cv_folds = 10,
#                         time_budget = 120)
#
# # Fewer folds for faster evaluation
# result_fast <- tl_auto_ml(iris, Species ~ .,
#                           cv_folds = 3,
#                           time_budget = 60)

## ----eval=FALSE---------------------------------------------------------------
# result <- tl_auto_ml(iris, Species ~ ., time_budget = 60)
#
# # Best performing model
# best_model <- result$best_model
#
# # All models trained
# all_models <- result$models
#
# # Specific model
# baseline_logistic <- result$models$baseline_logistic
# pca_forest <- result$models$pca_forest

## ----eval=FALSE---------------------------------------------------------------
# # View performance comparison
# leaderboard <- result$leaderboard
#
# # Sort by score (higher is better for accuracy, lower for RMSE)
# leaderboard <- leaderboard %>%
#   arrange(desc(score))
#
# print(leaderboard)

## ----eval=FALSE---------------------------------------------------------------
# # Use best model for predictions
# predictions <- predict(result$best_model, new_data = new_data)
#
# # Or use a specific model
# predictions_pca <- predict(result$models$pca_forest, new_data = new_data)

## ----eval=FALSE---------------------------------------------------------------
# # Split data for evaluation
# split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)
#
# # Run AutoML on training data
# automl_iris <- tl_auto_ml(split$train, Species ~ .,
#                           time_budget = 90,
#                           cv_folds = 5)
#
# # Evaluate on test set
# test_preds <- predict(automl_iris$best_model, new_data = split$test)
# test_accuracy <- mean(test_preds$.pred == split$test$Species)
#
# cat("AutoML Test Accuracy:", round(test_accuracy * 100, 1), "%\n")

## ----eval=FALSE---------------------------------------------------------------
# # Compare models
# for (model_name in names(automl_iris$models)) {
#   model <- automl_iris$models[[model_name]]
#   preds <- predict(model, new_data = split$test)
#   acc <- mean(preds$.pred == split$test$Species)
#   cat(model_name, ":", round(acc * 100, 1), "%\n")
# }

## ----eval=FALSE---------------------------------------------------------------
# # Split mtcars data
# split_mtcars <- tl_split(mtcars, prop = 0.7, seed = 42)
#
# # Run AutoML
# automl_mpg <- tl_auto_ml(split_mtcars$train, mpg ~ .,
#                          task = "regression",
#                          time_budget = 90)
#
# # Evaluate
# test_preds_mpg <- predict(automl_mpg$best_model, new_data = split_mtcars$test)
# rmse <- sqrt(mean((test_preds_mpg$.pred - split_mtcars$test$mpg)^2))
#
# cat("AutoML Test RMSE:", round(rmse, 2), "\n")

## ----eval=FALSE---------------------------------------------------------------
# # Preprocess data first
# processed <- tl_prepare_data(
#   split$train,
#   Species ~ .,
#   scale_method = "standardize",
#   remove_correlated = TRUE
# )
#
# # Run AutoML on preprocessed data
# automl_processed <- tl_auto_ml(processed$data, Species ~ .,
#                                time_budget = 60)
#
# # Note: Need to apply same preprocessing to test data
# test_processed <- tl_prepare_data(
#   split$test,
#   Species ~ .,
#   scale_method = "standardize"
# )
#
# test_preds_proc <- predict(
#   automl_processed$best_model,
#   new_data = test_processed$data
# )

## ----eval=FALSE---------------------------------------------------------------
# # Manual approach: choose one model
# manual_model <- tl_model(split$train, Species ~ ., method = "forest")
# manual_preds <- predict(manual_model, new_data = split$test)
# manual_acc <- mean(manual_preds$.pred == split$test$Species)
#
# # AutoML approach
# automl_model <- tl_auto_ml(split$train, Species ~ ., time_budget = 60)
# automl_preds <- predict(automl_model$best_model, new_data = split$test)
# automl_acc <- mean(automl_preds$.pred == split$test$Species)
#
# cat("Manual Selection:", round(manual_acc * 100, 1), "%\n")
# cat("AutoML:", round(automl_acc * 100, 1), "%\n")

## ----eval=FALSE---------------------------------------------------------------
# # First pass: quick exploration
# quick_automl <- tl_auto_ml(split$train, Species ~ .,
#                            time_budget = 30,
#                            use_reduction = TRUE,
#                            use_clustering = FALSE)
#
# # Analyze what worked — best model name is in the leaderboard
# best_name <- quick_automl$leaderboard$model[1]
# best_method <- quick_automl$best_model$spec$method
# cat("Best model:", best_name, "(method:", best_method, ")\n")
#
# # Second pass: if a PCA variant won, invest more in reduction
# if (grepl("^pca_", best_name)) {
#   refined_automl <- tl_auto_ml(split$train, Species ~ .,
#                                time_budget = 60,
#                                use_reduction = TRUE,
#                                use_clustering = TRUE)
# }

## ----eval=FALSE---------------------------------------------------------------
# # Get top 3 models
# top_models <- automl_iris$leaderboard %>%
#   arrange(desc(score)) %>%
#   head(3)
#
# # Make predictions with each
# ensemble_preds <- list()
# for (i in seq_len(nrow(top_models))) {
#   model_name <- top_models$model[i]
#   model <- automl_iris$models[[model_name]]
#   ensemble_preds[[i]] <- predict(model, new_data = split$test)$.pred
# }
#
# # Majority vote for classification
# final_pred <- apply(do.call(cbind, ensemble_preds), 1, function(x) {
#   names(which.max(table(x)))
# })
#
# ensemble_acc <- mean(final_pred == split$test$Species)
# cat("Ensemble Accuracy:", round(ensemble_acc * 100, 1), "%\n")

## ----eval=FALSE---------------------------------------------------------------
# # AutoML automatically uses accuracy for classification
# result_class <- tl_auto_ml(iris, Species ~ .,
#                            metric = "accuracy",
#                            time_budget = 60)

## ----eval=FALSE---------------------------------------------------------------
# # AutoML automatically uses RMSE for regression
# result_reg <- tl_auto_ml(mtcars, mpg ~ .,
#                          metric = "rmse",
#                          time_budget = 60)

## ----eval=FALSE---------------------------------------------------------------
# # 1. Reduce CV folds (biggest impact)
# fast_result <- tl_auto_ml(data, formula,
#                           cv_folds = 2,
#                           time_budget = 30)
#
# # 2. Disable slow phases
# baseline_result <- tl_auto_ml(data, formula,
#                               use_reduction = FALSE,
#                               use_clustering = FALSE,
#                               time_budget = 30)
#
# # 3. Use a budget under 30s to skip forest/SVM/XGBoost entirely
# quick_result <- tl_auto_ml(data, formula, time_budget = 10)

## ----eval=FALSE---------------------------------------------------------------
# result <- tl_auto_ml(data, formula,
#                      metric = "accuracy",  # or "rmse" for regression
#                      time_budget = 60)

## ----eval=FALSE---------------------------------------------------------------
# # Increase time budget to unlock all phases
# thorough_result <- tl_auto_ml(data, formula, time_budget = 300)
#
# # Ensure feature engineering is enabled
# full_result <- tl_auto_ml(data, formula,
#                           use_reduction = TRUE,
#                           use_clustering = TRUE,
#                           time_budget = 300)

## ----eval=FALSE---------------------------------------------------------------
# # Complete AutoML workflow
# workflow_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)
#
# automl_result <- tl_auto_ml(
#   data = workflow_split$train,
#   formula = Species ~ .,
#   task = "auto",
#   use_reduction = TRUE,
#   use_clustering = TRUE,
#   time_budget = 120,
#   cv_folds = 5
# )
#
# # Evaluate best model
# final_preds <- predict(automl_result$best_model, new_data = workflow_split$test)
# final_accuracy <- mean(final_preds$.pred == workflow_split$test$Species)
#
# cat("Final AutoML Accuracy:", round(final_accuracy * 100, 1), "%\n")
# cat("Best approach:", automl_result$best_model$spec$method, "\n")