#' Optimal Binning for Categorical Variables using Divergence Measures
#'
#' Performs supervised discretization of categorical variables using a
#' divergence-based hierarchical merging algorithm. This implementation
#' supports multiple information-theoretic and metric divergence measures
#' as described by Zeng (2013), enabling flexible optimization of binning
#' structures for credit scoring and binary classification tasks.
#'
#' @param feature A character vector or factor representing the categorical
#'   predictor variable to be binned. Missing values are automatically
#'   converted to the category \code{"NA"}.
#' @param target An integer vector of binary outcomes (0/1) corresponding
#'   to each observation in \code{feature}. Missing values are not permitted.
#' @param min_bins Integer. Minimum number of bins to produce. Must be >= 2.
#'   If the final number of bins after merging falls below this threshold,
#'   the algorithm will attempt to split bins. Defaults to 3.
#' @param max_bins Integer. Maximum number of bins to produce. Must be >=
#'   \code{min_bins}. The algorithm performs hierarchical merging until
#'   this constraint is satisfied. Defaults to 5.
#' @param bin_cutoff Numeric. Frequency threshold for rare category handling.
#'   Categories with relative frequency below this value are candidates for
#'   pre-binning. Must be in (0, 1). Defaults to 0.05.
#' @param max_n_prebins Integer. Maximum number of initial bins before the
#'   main merging phase. When the number of unique categories exceeds this
#'   limit, rare categories are pre-merged into a single
#'   \code{"PREBIN_OTHER"} bin (see Details). Must be >= 2. Defaults to 20.
#' @param bin_separator Character string used to concatenate category names
#'   when multiple categories are merged into a single bin. Defaults to "\%;\%".
#' @param convergence_threshold Numeric. Convergence tolerance for the
#'   iterative merging process. Merging stops when the change in minimum
#'   divergence between iterations falls below this threshold. Must be > 0.
#'   Defaults to 1e-6.
#' @param max_iterations Integer. Maximum number of merge operations allowed.
#'   Prevents infinite loops in edge cases. Must be > 0. Defaults to 1000.
#' @param bin_method Character string specifying the Weight of Evidence
#'   calculation method. Must be one of:
#'   \describe{
#'     \item{\code{"woe"}}{Traditional WoE: \eqn{\ln\left(\frac{p_i/P}{n_i/N}\right)}}
#'     \item{\code{"woe1"}}{Smoothed WoE (Zeng): \eqn{\ln\left(\frac{g_i + 0.5}{b_i + 0.5}\right)}}
#'   }
#'   The smoothed variant provides numerical stability for sparse bins.
#'   Defaults to \code{"woe1"}.
#' @param divergence_method Character string specifying the divergence measure
#'   used for determining bin similarity. Must be one of:
#'   \describe{
#'     \item{\code{"he"}}{Hellinger Distance: \eqn{\sum(\sqrt{p_i} - \sqrt{n_i})^2}}
#'     \item{\code{"kl"}}{Symmetrized Kullback-Leibler Divergence}
#'     \item{\code{"klj"}}{Jeffreys J-Divergence: \eqn{(p-n)\ln(p/n)}}
#'     \item{\code{"tr"}}{Triangular Discrimination: \eqn{(p-n)^2/(p+n)}}
#'     \item{\code{"sc"}}{Symmetric Chi-Square: \eqn{(p-n)^2(p+n)/(pn)}}
#'     \item{\code{"js"}}{Jensen-Shannon Divergence}
#'     \item{\code{"l1"}}{L1 Metric (Manhattan Distance): \eqn{|p-n|}}
#'     \item{\code{"l2"}}{L2 Metric (Euclidean Distance): \eqn{\sqrt{\sum(p-n)^2}}}
#'     \item{\code{"ln"}}{L-infinity Metric (Chebyshev Distance): \eqn{\max|p-n|}}
#'   }
#'   Defaults to \code{"l2"}.
#'
#' @return A list containing the binning results with the following components:
#'   \describe{
#'     \item{\code{id}}{Integer vector of bin identifiers (1-indexed)}
#'     \item{\code{bin}}{Character vector of bin labels (merged category names)}
#'     \item{\code{woe}}{Numeric vector of Weight of Evidence values per bin}
#'     \item{\code{divergence}}{Numeric vector of divergence contribution per bin}
#'     \item{\code{count}}{Integer vector of total observations per bin}
#'     \item{\code{count_pos}}{Integer vector of positive cases (target=1) per bin}
#'     \item{\code{count_neg}}{Integer vector of negative cases (target=0) per bin}
#'     \item{\code{converged}}{Logical indicating algorithm convergence}
#'     \item{\code{iterations}}{Integer count of merge operations performed}
#'     \item{\code{total_divergence}}{Numeric total divergence of the binning solution}
#'     \item{\code{bin_method}}{Character string of WoE method used}
#'     \item{\code{divergence_method}}{Character string of divergence measure used}
#'   }
#'
#' @details
#' The algorithm implements a hierarchical agglomerative approach where bins
#' are iteratively merged based on minimum pairwise divergence until the
#' \code{max_bins} constraint is satisfied or convergence is achieved.
#'
#' \strong{Algorithm Workflow:}
#' \enumerate{
#'   \item Input validation and frequency computation
#'   \item Pre-binning of rare categories (if unique categories > \code{max_n_prebins})
#'   \item Initialization of pairwise divergence matrix
#'   \item Iterative merging of most similar bin pairs
#'   \item Splitting of heterogeneous bins (if bins < \code{min_bins})
#'   \item Final metric computation and WoE-based sorting
#' }
#'
#' \strong{Divergence Measure Selection:}
#' The choice of divergence measure affects the binning structure:
#' \itemize{
#'   \item Information-theoretic measures (\code{"kl"}, \code{"js"}, \code{"klj"}):
#'     Emphasize distributional differences; sensitive to rare events
#'   \item Metric measures (\code{"l1"}, \code{"l2"}, \code{"ln"}):
#'     Provide geometric interpretation; robust to outliers
#'   \item Chi-square family (\code{"sc"}, \code{"tr"}):
#'     Balance between information content and robustness
#'   \item Hellinger distance (\code{"he"}):
#'     Bounded measure; suitable for probability distributions
#' }
#'
#' \strong{Pre-binning Strategy:}
#' When the number of unique categories exceeds \code{max_n_prebins}, categories
#' with fewer than 5 observations are aggregated into a special "PREBIN_OTHER"
#' bin to control computational complexity.
#'
#' @references
#' Zeng, G. (2013). Metric Divergence Measures and Information Value in
#' Credit Scoring. \emph{Journal of Mathematics}, 2013, Article ID 848271.
#' \doi{10.1155/2013/848271}
#'
#' Kullback, S., & Leibler, R. A. (1951). On Information and Sufficiency.
#' \emph{The Annals of Mathematical Statistics}, 22(1), 79-86.
#'
#' Lin, J. (1991). Divergence Measures Based on the Shannon Entropy.
#' \emph{IEEE Transactions on Information Theory}, 37(1), 145-151.
#'
#' @seealso
#' \code{\link{ob_categorical_cm}} for ChiMerge-based categorical binning
#'
#' @examples
#' \donttest{
#' # Example 1: Basic usage with synthetic credit data
#' set.seed(42)
#' n <- 1000
#'
#' # Simulate occupation categories with varying default rates
#' occupations <- c(
#'   "Engineer", "Doctor", "Teacher", "Sales",
#'   "Manager", "Clerk", "Other"
#' )
#' default_probs <- c(0.05, 0.03, 0.08, 0.15, 0.07, 0.12, 0.20)
#'
#' feature <- sample(occupations, n,
#'   replace = TRUE,
#'   prob = c(0.15, 0.10, 0.20, 0.18, 0.12, 0.15, 0.10)
#' )
#' target <- sapply(feature, function(x) {
#'   rbinom(1, 1, default_probs[which(occupations == x)])
#' })
#'
#' # Apply optimal binning with L2 divergence
#' result <- ob_categorical_dmiv(feature, target,
#'   min_bins = 2,
#'   max_bins = 4,
#'   divergence_method = "l2"
#' )
#'
#' # Examine binning results
#' print(data.frame(
#'   bin = result$bin,
#'   woe = round(result$woe, 3),
#'   count = result$count,
#'   event_rate = round(result$count_pos / result$count, 3)
#' ))
#'
#' # Example 2: Comparing divergence methods
#' result_js <- ob_categorical_dmiv(feature, target,
#'   divergence_method = "js",
#'   max_bins = 4
#' )
#' result_kl <- ob_categorical_dmiv(feature, target,
#'   divergence_method = "kl",
#'   max_bins = 4
#' )
#'
#' cat("Jensen-Shannon bins:", length(result_js$bin), "\n")
#' cat("Kullback-Leibler bins:", length(result_kl$bin), "\n")
#'
#' # Example 3: High cardinality feature with pre-binning
#' set.seed(123)
#' postal_codes <- paste0("ZIP_", sprintf("%03d", 1:50))
#' feature_high_card <- sample(postal_codes, 2000, replace = TRUE)
#' target_high_card <- rbinom(2000, 1, 0.1)
#'
#' result_prebin <- ob_categorical_dmiv(
#'   feature_high_card,
#'   target_high_card,
#'   max_n_prebins = 15,
#'   max_bins = 5
#' )
#'
#' cat("Final bins after pre-binning:", length(result_prebin$bin), "\n")
#' cat("Algorithm converged:", result_prebin$converged, "\n")
#' }
#'
#' @export
ob_categorical_dmiv <- function(feature, target,
                                min_bins = 3,
                                max_bins = 5,
                                bin_cutoff = 0.05,
                                max_n_prebins = 20,
                                bin_separator = "%;%",
                                convergence_threshold = 1e-6,
                                max_iterations = 1000,
                                bin_method = "woe1",
                                divergence_method = "l2") {
  # Normalise the predictor to a character vector; missing values become the
  # explicit category "NA" so they are passed on like any other level.
  if (!is.character(feature)) {
    feature <- as.character(feature)
  }
  feature[is.na(feature)] <- "NA"

  # Every observation needs exactly one outcome.
  if (length(feature) != length(target)) {
    stop(
      "`feature` and `target` must have the same length (got ",
      length(feature), " and ", length(target), ").",
      call. = FALSE
    )
  }

  # as.integer() on a factor returns the internal level codes (1, 2, ...),
  # not the labels, so a factor labelled "0"/"1" would be sent as 1/2.
  # Convert through the labels instead.
  if (is.factor(target)) {
    target <- as.character(target)
  }
  if (anyNA(target)) {
    stop("`target` must not contain missing values.", call. = FALSE)
  }

  # Validate on the numeric scale before truncating to integer: as.integer()
  # alone would silently turn 0.7 into 0 and non-numeric strings into NA.
  # The coercion warning is suppressed because the resulting NA is reported
  # as an explicit error on the next line.
  target_num <- suppressWarnings(as.numeric(target))
  if (anyNA(target_num) || !all(target_num %in% c(0, 1))) {
    stop("`target` must contain only the values 0 and 1.", call. = FALSE)
  }
  target <- as.integer(target_num)

  # Invoke C++ implementation. .Call() matches arguments by position; the
  # names are kept only for readability. Count-like arguments are coerced to
  # integer before being handed over.
  .Call("_OptimalBinningWoE_optimal_binning_categorical_dmiv",
    target = target,
    feature = feature,
    min_bins = as.integer(min_bins),
    max_bins = as.integer(max_bins),
    bin_cutoff = bin_cutoff,
    max_n_prebins = as.integer(max_n_prebins),
    bin_separator = bin_separator,
    convergence_threshold = convergence_threshold,
    max_iterations = as.integer(max_iterations),
    bin_method = bin_method,
    divergence_method = divergence_method,
    PACKAGE = "OptimalBinningWoE"
  )
}
