% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/dice.R
\name{dice}
\alias{dice}
\title{Diverse Clustering Ensemble}
\usage{
dice(
  data,
  nk,
  reps = 10,
  algorithms = NULL,
  k.method = NULL,
  nmf.method = c("brunet", "lee"),
  hc.method = "average",
  distance = "euclidean",
  cons.funs = c("kmodes", "majority", "CSPA", "LCE", "LCA"),
  sim.mat = c("cts", "srs", "asrs"),
  prep.data = c("none", "full", "sampled"),
  min.var = 1,
  seed = 1,
  trim = FALSE,
  reweigh = FALSE,
  n = 5,
  evaluate = TRUE,
  plot = FALSE,
  ref.cl = NULL,
  progress = TRUE
)
}
\arguments{
\item{data}{data matrix with rows as samples and columns as variables}

\item{nk}{number of clusters (k) requested; can specify a single integer or a
range of integers to compute multiple k}

\item{reps}{number of subsamples}

\item{algorithms}{vector of clustering algorithms for performing consensus
clustering. Must be any number of the following: "nmf", "hc", "diana",
"km", "pam", "ap", "sc", "gmm", "som", "cmeans", "hdbscan". A
custom clustering algorithm can be used.}

\item{k.method}{determines the method to choose k when no reference class is
given. When \code{ref.cl} is not \code{NULL}, k is the number of distinct classes of
\code{ref.cl}. Otherwise the input from \code{k.method} chooses k. The default is to
use the PAC to choose the best k(s). Specifying an integer as a
user-desired k will override the best k chosen by PAC. Finally, specifying
"all" will produce consensus results for all k. The "all" method is
implicitly performed when there is only one k used.}

\item{nmf.method}{specify NMF-based algorithms to run. By default the
"brunet" and "lee" algorithms are called. See \code{\link[NMF:nmf]{NMF::nmf()}} for details.}

\item{hc.method}{agglomeration method for hierarchical clustering. The
"average" method is used by default. See \code{\link[stats:hclust]{stats::hclust()}} for details.}

\item{distance}{a vector of distance functions. Defaults to "euclidean".
Other options are given in \code{\link[stats:dist]{stats::dist()}}. A custom distance function can
be used.}

\item{cons.funs}{consensus functions to use. Current options are "kmodes"
(k-modes), "majority" (majority voting), "CSPA" (Cluster-based Similarity
Partitioning Algorithm), "LCE" (linkage clustering ensemble), "LCA" (latent
class analysis).}

\item{sim.mat}{similarity matrix; choices are "cts", "srs", "asrs".}

\item{prep.data}{Prepare the data on the "full" dataset, the "sampled"
dataset, or "none" (default).}

\item{min.var}{minimum variability measure threshold used to filter the
feature space for only highly variable features. Only features with a
minimum variability measure across all samples greater than \code{min.var} will
be used. If \code{type = "conventional"}, the standard deviation is the measure
used, and if \code{type = "robust"}, the MAD is the measure used.}

\item{seed}{random seed for knn imputation reproducibility}

\item{trim}{logical; if \code{TRUE}, algorithms that score low on internal indices
will be trimmed out}

\item{reweigh}{logical; if \code{TRUE}, after trimming out poor performing
algorithms, each algorithm is reweighed depending on its internal indices.}

\item{n}{an integer specifying the top \code{n} algorithms to keep after trimming
off the poor performing ones using Rank Aggregation. If the total number of
algorithms is less than \code{n}, no trimming is done.}

\item{evaluate}{logical; if \code{TRUE} (default), validity indices are returned.
Internal validity indices are always computed. If \code{ref.cl} is not \code{NULL},
then external validity indices will also be computed.}

\item{plot}{logical; if \code{TRUE}, \code{graph_all} is called and a summary
evaluation heatmap of ranked algorithms vs. internal validity indices is
plotted as well.}

\item{ref.cl}{reference class}

\item{progress}{logical; should a progress bar be displayed?}
}
\value{
A list with the following elements:
\item{E}{raw clustering ensemble object}
\item{Eknn}{clustering ensemble object with knn imputation used on \code{E}}
\item{Ecomp}{flattened ensemble object with remaining missing entries imputed
by majority voting}
\item{clusters}{final clustering assignment from the diverse clustering
ensemble method}
\item{indices}{if \code{evaluate = TRUE}, shows cluster evaluation indices;
otherwise \code{NULL}}
}
\description{
Runs consensus clustering across subsamples, algorithms, and number of
clusters (k).
}
\details{
There are three ways to handle the input data before clustering via argument
\code{prep.data}. The default is to use the raw data as-is ("none"). Or, we can
enact \code{\link[=prepare_data]{prepare_data()}} on the full dataset ("full"), or the bootstrap sampled
datasets ("sampled").
}
\examples{
library(dplyr)
data(hgsc)
dat <- hgsc[1:100, 1:50]
ref.cl <- strsplit(rownames(dat), "_") \%>\%
  purrr::map_chr(2) \%>\%
  factor() \%>\%
  as.integer()
dice.obj <- dice(dat, nk = 4, reps = 5, algorithms = "hc", cons.funs =
"kmodes", ref.cl = ref.cl, progress = FALSE)
str(dice.obj, max.level = 2)
}
\author{
Aline Talhouk, Derek Chiu
}
