% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ma_projection.R
\name{ma_projection}
\alias{ma_projection}
\title{Model-Assisted Projection Estimator}
\usage{
ma_projection(
  formula,
  cluster_ids,
  weight,
  strata = NULL,
  domain,
  summary_function = "mean",
  working_model,
  data_model,
  data_proj,
  model_metric,
  cv_folds = 3,
  tuning_grid = 10,
  parallel_over = "resamples",
  seed = 1,
  return_yhat = FALSE,
  ...
)
}
\arguments{
\item{formula}{A model formula. All variables used must exist in both \code{data_model} and \code{data_proj}.}

\item{cluster_ids}{Column name (character) or formula specifying cluster identifiers from highest to lowest level. Use \code{~0} or \code{~1} if there are no clusters.}

\item{weight}{Column name in \code{data_proj} representing the survey weights.}

\item{strata}{Column name for stratification; use \code{NULL} if no strata are used.}

\item{domain}{Character vector specifying domain variable names in both datasets.}

\item{summary_function}{Name of the summary function used to compute domain-level estimates: one of \code{"mean"} (default), \code{"total"}, or \code{"variance"}.}

\item{working_model}{A parsnip model object specifying the working model (see the Details section).}

\item{data_model}{Data frame (small sample) containing both target and auxiliary variables.}

\item{data_proj}{Data frame (large sample) containing only auxiliary variables.}

\item{model_metric}{A \code{yardstick::metric_set()} function, or \code{NULL} to use default metrics.}

\item{cv_folds}{Number of folds for k-fold cross-validation.}

\item{tuning_grid}{Either a data frame with tuning parameters or a positive integer specifying the number of grid search candidates.}

\item{parallel_over}{Specifies parallelization mode: \code{"resamples"}, \code{"everything"}, or \code{NULL}.
If \code{"resamples"}, then tuning will be performed in parallel over resamples alone. Within each resample, the preprocessor (i.e., recipe or formula) is processed once, and is then reused across all models that need to be fit.
If \code{"everything"}, then tuning will be performed in parallel at two levels. An outer parallel loop will iterate over resamples. Additionally, an inner parallel loop will iterate over all unique combinations of preprocessor and model tuning parameters for that specific resample. This will result in the preprocessor being re-processed multiple times, but can be faster if that processing is extremely fast.}

\item{seed}{Integer seed for reproducibility.}

\item{return_yhat}{Logical; if \code{TRUE}, returns predicted \code{y} values for \code{data_model}.}

\item{...}{Additional arguments passed to \code{\link[survey]{svydesign}}.}
}
\value{
A list containing:
\itemize{
\item \code{model} – The fitted working model object.
\item \code{prediction} – A vector of predictions from the working model.
\item \code{df_result} – A data frame with:
\itemize{
\item \code{domain} – Domain identifier.
\item \code{ypr} – Projection estimator results for each domain.
\item \code{var_ypr} – Estimated variance of the projection estimator.
\item \code{rse_ypr} – Relative standard error (in \%).
}
}
}
\description{
The function addresses the problem of combining information from two or more independent surveys, a common challenge in survey sampling. It focuses on cases where: \cr
\itemize{
\item \strong{Survey 1:} A large sample collects only auxiliary information.
\item \strong{Survey 2:} A much smaller sample collects both the variables of interest and the auxiliary variables.
}
The function implements a model-assisted projection estimation method based on a working model. Supported working models include several machine learning models, which are listed in the Details section.
}
\details{
The following working models are supported via the \pkg{parsnip} interface:
\itemize{
\item \code{linear_reg()} – Linear regression
\item \code{logistic_reg()} – Logistic regression
\item \code{linear_reg(engine = "stan")} – Bayesian linear regression
\item \code{logistic_reg(engine = "stan")} – Bayesian logistic regression
\item \code{poisson_reg()} – Poisson regression
\item \code{decision_tree()} – Decision tree
\item \code{nearest_neighbor()} – k-Nearest Neighbors (k-NN)
\item \code{naive_bayes()} – Naive Bayes classifier
\item \code{mlp()} – Multi-layer perceptron (neural network)
\item \code{svm_linear()} – Support vector machine with linear kernel
\item \code{svm_poly()} – Support vector machine with polynomial kernel
\item \code{svm_rbf()} – Support vector machine with radial basis function (RBF) kernel
\item \code{bag_tree()} – Bagged decision tree
\item \code{bart()} – Bayesian Additive Regression Trees (BART)
\item \code{rand_forest(engine = "ranger")} – Random forest (via ranger)
\item \code{rand_forest(engine = "aorsf")} – Accelerated oblique random forest (AORF; Jaeger et al. 2022, 2024)
\item \code{boost_tree(engine = "lightgbm")} – Gradient boosting (LightGBM)
\item \code{boost_tree(engine = "xgboost")} – Gradient boosting (XGBoost)
}
For a complete list of supported models and engines, see \href{https://www.tmwr.org/pre-proc-table}{Tidy Modeling With R}.
}
\examples{
\dontrun{
library(sae.projection)
library(dplyr)
library(bonsai)

df_svy22_income <- df_svy22 \%>\% filter(!is.na(income))
df_svy23_income <- df_svy23 \%>\% filter(!is.na(income))

# Linear regression
lm_proj <- ma_projection(
  income ~ age + sex + edu + disability,
  cluster_ids = "PSU", weight = "WEIGHT", strata = "STRATA",
  domain = c("PROV", "REGENCY"),
  working_model = linear_reg(),
  data_model = df_svy22_income,
  data_proj = df_svy23_income,
  nest = TRUE
)

df_svy22_neet <- df_svy22 \%>\%
     filter(between(age, 15, 24))
df_svy23_neet <- df_svy23 \%>\%
     filter(between(age, 15, 24))

# LightGBM regression with hyperparameter tuning
show_engines("boost_tree")
lgbm_model <- boost_tree(
  mtry = tune(), trees = tune(), min_n = tune(),
  tree_depth = tune(), learn_rate = tune(),
  engine = "lightgbm"
)

lgbm_proj <- ma_projection(
  formula = neet ~ sex + edu + disability,
  cluster_ids = "PSU",
  weight = "WEIGHT",
  strata = "STRATA",
  domain = c("PROV", "REGENCY"),
  working_model = lgbm_model,
  data_model = df_svy22_neet,
  data_proj = df_svy23_neet,
  cv_folds = 3,
  tuning_grid = 5,
  nest = TRUE
)
}
}
\references{
\enumerate{
\item Kim, J. K., & Rao, J. N. K. (2012). Combining data from two independent surveys: a model-assisted approach. Biometrika, 99(1), 85-100.
}
}
