## ----include = FALSE----------------------------------------------------------
# Global chunk options for the rendered vignette.
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

## ----setup--------------------------------------------------------------------
library(tidylearn)
library(dplyr)
library(ggplot2)

## ----eval=FALSE---------------------------------------------------------------
# # Run AutoML on iris dataset
# result <- tl_auto_ml(iris, Species ~ .,
#                      task = "classification",
#                      time_budget = 60)
#
# # View best model
# print(result$best_model)

## ----eval=FALSE---------------------------------------------------------------
# # View all models tried
# names(result$models)

## ----eval=FALSE---------------------------------------------------------------
# # View leaderboard
# result$leaderboard

## ----eval=FALSE---------------------------------------------------------------
# # Run AutoML on regression problem
# result_reg <- tl_auto_ml(mtcars, mpg ~ .,
#                          task = "regression",
#                          time_budget = 60)
#
# # Best model
# print(result_reg$best_model)

## ----eval=FALSE---------------------------------------------------------------
# # Quick sanity check -- 2 fast models, no CV, done in ~1s
# quick <- tl_auto_ml(iris, Species ~ .,
#                     time_budget = 10,
#                     use_reduction = FALSE,
#                     use_clustering = FALSE)
# quick$leaderboard
# #> baseline_tree, baseline_logistic
#
# # Development iteration -- baselines + forest, some CV
# medium <- tl_auto_ml(iris, Species ~ .,
#                      time_budget = 60,
#                      cv_folds = 3)
# medium$leaderboard
# #> 5--7 models depending on data size
#
# # Thorough search -- all phases, full CV
# thorough <- tl_auto_ml(iris, Species ~ .,
#                        time_budget = 300,
#                        cv_folds = 5)
# thorough$leaderboard
# #> 9--11 models with cross-validated scores

## ----eval=FALSE---------------------------------------------------------------
# # Factor/character response -> classification
# result_class <- tl_auto_ml(iris, Species ~ ., task = "auto")
#
# # Numeric response -> regression
# result_reg <- tl_auto_ml(mtcars, mpg ~ ., task = "auto")

## ----eval=FALSE---------------------------------------------------------------
# # Disable dimensionality reduction
# no_reduction <- tl_auto_ml(iris, Species ~ .,
#                            use_reduction = FALSE,
#                            time_budget = 60)
#
# # Disable cluster features
# no_clustering <- tl_auto_ml(iris, Species ~ .,
#                             use_clustering = FALSE,
#                             time_budget = 60)
#
# # Baseline models only
# baseline_only <- tl_auto_ml(iris, Species ~ .,
#                             use_reduction = FALSE,
#                             use_clustering = FALSE,
#                             time_budget = 30)

## ----eval=FALSE---------------------------------------------------------------
# # Adjust cross-validation folds
# result_cv <- tl_auto_ml(iris, Species ~ .,
#                         cv_folds = 10,
#                         time_budget = 120)
#
# # Fewer folds for faster evaluation
# result_fast <- tl_auto_ml(iris, Species ~ .,
#                           cv_folds = 3,
#                           time_budget = 60)

## ----eval=FALSE---------------------------------------------------------------
# result <- tl_auto_ml(iris, Species ~ ., time_budget = 60)
#
# # Best performing model
# best_model <- result$best_model
#
# # All models trained
# all_models <- result$models
#
# # Specific model
# baseline_logistic <- result$models$baseline_logistic
# pca_forest <- result$models$pca_forest

## ----eval=FALSE---------------------------------------------------------------
# # View performance comparison
# leaderboard <- result$leaderboard
#
# # Sort by score (higher is better for accuracy, lower for RMSE)
# leaderboard <- leaderboard %>%
#   arrange(desc(score))
#
# print(leaderboard)

## ----eval=FALSE---------------------------------------------------------------
# # Use best model for predictions
# predictions <- predict(result$best_model, new_data = new_data)
#
# # Or use a specific model
# predictions_pca <- predict(result$models$pca_forest, new_data = new_data)

## ----eval=FALSE---------------------------------------------------------------
# # Split data for evaluation
# split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)
#
# # Run AutoML on training data
# automl_iris <- tl_auto_ml(split$train, Species ~ .,
#                           time_budget = 90,
#                           cv_folds = 5)
#
# # Evaluate on test set
# test_preds <- predict(automl_iris$best_model, new_data = split$test)
# test_accuracy <- mean(test_preds$.pred == split$test$Species)
#
# cat("AutoML Test Accuracy:", round(test_accuracy * 100, 1), "%\n")

## ----eval=FALSE---------------------------------------------------------------
# # Compare models
# for (model_name in names(automl_iris$models)) {
#   model <- automl_iris$models[[model_name]]
#   preds <- predict(model, new_data = split$test)
#   acc <- mean(preds$.pred == split$test$Species)
#   cat(model_name, ":", round(acc * 100, 1), "%\n")
# }

## ----eval=FALSE---------------------------------------------------------------
# # Split mtcars data
# split_mtcars <- tl_split(mtcars, prop = 0.7, seed = 42)
#
# # Run AutoML
# automl_mpg <- tl_auto_ml(split_mtcars$train, mpg ~ .,
#                          task = "regression",
#                          time_budget = 90)
#
# # Evaluate
# test_preds_mpg <- predict(automl_mpg$best_model, new_data = split_mtcars$test)
# rmse <- sqrt(mean((test_preds_mpg$.pred - split_mtcars$test$mpg)^2))
#
# cat("AutoML Test RMSE:", round(rmse, 2), "\n")

## ----eval=FALSE---------------------------------------------------------------
# # Preprocess data first
# processed <- tl_prepare_data(
#   split$train,
#   Species ~ .,
#   scale_method = "standardize",
#   remove_correlated = TRUE
# )
#
# # Run AutoML on preprocessed data
# automl_processed <- tl_auto_ml(processed$data, Species ~ .,
#                                time_budget = 60)
#
# # Note: Need to apply same preprocessing to test data
# test_processed <- tl_prepare_data(
#   split$test,
#   Species ~ .,
#   scale_method = "standardize"
# )
#
# test_preds_proc <- predict(
#   automl_processed$best_model,
#   new_data = test_processed$data
# )

## ----eval=FALSE---------------------------------------------------------------
# # Manual approach: choose one model
# manual_model <- tl_model(split$train, Species ~ ., method = "forest")
# manual_preds <- predict(manual_model, new_data = split$test)
# manual_acc <- mean(manual_preds$.pred == split$test$Species)
#
# # AutoML approach
# automl_model <- tl_auto_ml(split$train, Species ~ ., time_budget = 60)
# automl_preds <- predict(automl_model$best_model, new_data = split$test)
# automl_acc <- mean(automl_preds$.pred == split$test$Species)
#
# cat("Manual Selection:", round(manual_acc * 100, 1), "%\n")
# cat("AutoML:", round(automl_acc * 100, 1), "%\n")

## ----eval=FALSE---------------------------------------------------------------
# # First pass: quick exploration
# quick_automl <- tl_auto_ml(split$train, Species ~ .,
#                            time_budget = 30,
#                            use_reduction = TRUE,
#                            use_clustering = FALSE)
#
# # Analyze what worked — best model name is in the leaderboard
# best_name <- quick_automl$leaderboard$model[1]
# best_method <- quick_automl$best_model$spec$method
# cat("Best model:", best_name, "(method:", best_method, ")\n")
#
# # Second pass: if a PCA variant won, invest more in reduction
# if (grepl("^pca_", best_name)) {
#   refined_automl <- tl_auto_ml(split$train, Species ~ .,
#                                time_budget = 60,
#                                use_reduction = TRUE,
#                                use_clustering = TRUE)
# }

## ----eval=FALSE---------------------------------------------------------------
# # Get top 3 models
# top_models <- automl_iris$leaderboard %>%
#   arrange(desc(score)) %>%
#   head(3)
#
# # Make predictions with each
# ensemble_preds <- list()
# for (i in seq_len(nrow(top_models))) {
#   model_name <- top_models$model[i]
#   model <- automl_iris$models[[model_name]]
#   ensemble_preds[[i]] <- predict(model, new_data = split$test)$.pred
# }
#
# # Majority vote for classification
# final_pred <- apply(do.call(cbind, ensemble_preds), 1, function(x) {
#   names(which.max(table(x)))
# })
#
# ensemble_acc <- mean(final_pred == split$test$Species)
# cat("Ensemble Accuracy:", round(ensemble_acc * 100, 1), "%\n")

## ----eval=FALSE---------------------------------------------------------------
# # AutoML automatically uses accuracy for classification
# result_class <- tl_auto_ml(iris, Species ~ .,
#                            metric = "accuracy",
#                            time_budget = 60)

## ----eval=FALSE---------------------------------------------------------------
# # AutoML automatically uses RMSE for regression
# result_reg <- tl_auto_ml(mtcars, mpg ~ .,
#                          metric = "rmse",
#                          time_budget = 60)

## ----eval=FALSE---------------------------------------------------------------
# # 1. Reduce CV folds (biggest impact)
# fast_result <- tl_auto_ml(data, formula,
#                           cv_folds = 2,
#                           time_budget = 30)
#
# # 2. Disable slow phases
# baseline_result <- tl_auto_ml(data, formula,
#                               use_reduction = FALSE,
#                               use_clustering = FALSE,
#                               time_budget = 30)
#
# # 3. Use a budget under 30s to skip forest/SVM/XGBoost entirely
# quick_result <- tl_auto_ml(data, formula, time_budget = 10)

## ----eval=FALSE---------------------------------------------------------------
# result <- tl_auto_ml(data, formula,
#                      metric = "accuracy",  # or "rmse" for regression
#                      time_budget = 60)

## ----eval=FALSE---------------------------------------------------------------
# # Increase time budget to unlock all phases
# thorough_result <- tl_auto_ml(data, formula, time_budget = 300)
#
# # Ensure feature engineering is enabled
# full_result <- tl_auto_ml(data, formula,
#                           use_reduction = TRUE,
#                           use_clustering = TRUE,
#                           time_budget = 300)

## ----eval=FALSE---------------------------------------------------------------
# # Complete AutoML workflow
# workflow_split <- tl_split(iris, prop = 0.7, stratify = "Species", seed = 123)
#
# automl_result <- tl_auto_ml(
#   data = workflow_split$train,
#   formula = Species ~ .,
#   task = "auto",
#   use_reduction = TRUE,
#   use_clustering = TRUE,
#   time_budget = 120,
#   cv_folds = 5
# )
#
# # Evaluate best model
# final_preds <- predict(automl_result$best_model, new_data = workflow_split$test)
# final_accuracy <- mean(final_preds$.pred == workflow_split$test$Species)
#
# cat("Final AutoML Accuracy:", round(final_accuracy * 100, 1), "%\n")
# cat("Best approach:", automl_result$best_model$spec$method, "\n")