% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/ml_feature_count_vectorizer.R
\name{ft_count_vectorizer}
\alias{ft_count_vectorizer}
\alias{ml_vocabulary}
\title{Feature Transformation -- CountVectorizer (Estimator)}
\usage{
ft_count_vectorizer(
  x,
  input_col = NULL,
  output_col = NULL,
  binary = FALSE,
  min_df = 1,
  min_tf = 1,
  vocab_size = 2^18,
  uid = random_string("count_vectorizer_"),
  ...
)

ml_vocabulary(model)
}
\arguments{
\item{x}{A \code{spark_connection}, \code{ml_pipeline}, or a \code{tbl_spark}.}

\item{input_col}{The name of the input column.}

\item{output_col}{The name of the output column.}

\item{binary}{Binary toggle to control the output vector values.
If \code{TRUE}, all nonzero counts (after \code{min_tf} filter applied)
are set to 1. This is useful for discrete probabilistic models that
 model binary events rather than integer counts. Default: \code{FALSE}}

\item{min_df}{Specifies the minimum number of different documents a
term must appear in to be included in the vocabulary. If this is an
integer greater than or equal to 1, this specifies the number of
documents the term must appear in; if this is a double in [0,1), then
this specifies the fraction of documents. Default: 1.}

\item{min_tf}{Filter to ignore rare words in a document. For each
document, terms with frequency/count less than the given threshold
are ignored. If this is an integer greater than or equal to 1, then
this specifies a count (of times the term must appear in the document);
if this is a double in [0,1), then this specifies a fraction (out of
the document's token count). Default: 1.}

\item{vocab_size}{Build a vocabulary that only considers the top
\code{vocab_size} terms ordered by term frequency across the corpus.
Default: \code{2^18}.}

\item{uid}{A character string used to uniquely identify the feature transformer.}

\item{...}{Optional arguments; currently unused.}

\item{model}{A \code{ml_count_vectorizer_model}.}
}
\value{
The object returned depends on the class of \code{x}. If it is a
\code{spark_connection}, the function returns a \code{ml_estimator} or a
\code{ml_estimator} object. If it is a \code{ml_pipeline}, it will return
a pipeline with the transformer or estimator appended to it. If a
\code{tbl_spark}, it will return a \code{tbl_spark} with the transformation
 applied to it.

\code{ml_vocabulary()} returns a vector of vocabulary built.
}
\description{
Extracts a vocabulary from document collections.
}
\details{
In the case where \code{x} is a \code{tbl_spark}, the estimator
fits against \code{x} to obtain a transformer, returning a \code{tbl_spark}.
}
\seealso{
Other feature transformers: 
\code{\link{ft_binarizer}()},
\code{\link{ft_bucketizer}()},
\code{\link{ft_chisq_selector}()},
\code{\link{ft_dct}()},
\code{\link{ft_elementwise_product}()},
\code{\link{ft_feature_hasher}()},
\code{\link{ft_hashing_tf}()},
\code{\link{ft_idf}()},
\code{\link{ft_imputer}()},
\code{\link{ft_index_to_string}()},
\code{\link{ft_interaction}()},
\code{\link{ft_lsh}},
\code{\link{ft_max_abs_scaler}()},
\code{\link{ft_min_max_scaler}()},
\code{\link{ft_ngram}()},
\code{\link{ft_normalizer}()},
\code{\link{ft_one_hot_encoder}()},
\code{\link{ft_one_hot_encoder_estimator}()},
\code{\link{ft_pca}()},
\code{\link{ft_polynomial_expansion}()},
\code{\link{ft_quantile_discretizer}()},
\code{\link{ft_r_formula}()},
\code{\link{ft_regex_tokenizer}()},
\code{\link{ft_robust_scaler}()},
\code{\link{ft_sql_transformer}()},
\code{\link{ft_standard_scaler}()},
\code{\link{ft_stop_words_remover}()},
\code{\link{ft_string_indexer}()},
\code{\link{ft_tokenizer}()},
\code{\link{ft_vector_assembler}()},
\code{\link{ft_vector_indexer}()},
\code{\link{ft_vector_slicer}()},
\code{\link{ft_word2vec}()}
}
\concept{feature transformers}
