% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/vectorizers.R
\name{vectorizers}
\alias{hash_vectorizer}
\alias{vectorizers}
\alias{vocab_vectorizer}
\title{Vocabulary and hash vectorizers}
\usage{
vocab_vectorizer(vocabulary, grow_dtm = TRUE, skip_grams_window = 0L)

hash_vectorizer(hash_size = 2^18, ngram = c(1L, 1L), signed_hash = FALSE,
  grow_dtm = TRUE, skip_grams_window = 0L)
}
\arguments{
\item{vocabulary}{\code{text2vec_vocabulary} object, see \link{create_vocabulary}.}

\item{grow_dtm}{\code{logical} Whether to grow the document-term matrix
during corpus construction.}

\item{skip_grams_window}{\code{integer} window for term-co-occurrence matrix
construction. A value of \code{0L} does not construct the TCM.}

\item{hash_size}{\code{integer} The number of hash-buckets for the feature
hashing trick. The number must be greater than 0, and preferably it will be
a power of 2.}

\item{ngram}{\code{integer} vector. The lower and upper boundary of the range
of n-values for different n-grams to be extracted. All values of \code{n}
such that \code{ngram_min <= n <= ngram_max} will be used.}

\item{signed_hash}{\code{logical}, indicating whether to use a signed
hash-function to reduce collisions when hashing.}
}
\value{
A vectorizer \code{function}
}
\description{
These functions create a text vectorizer function
which is used in constructing a corpus.
}
\examples{
# Load the movie_review data set and restrict the example to the first N reviews.
data("movie_review")
N <- 100

# Hash vectorizer: hash_size of 2^18 and an ngram range of 1 to 2.
vectorizer <- hash_vectorizer(2 ^ 18, c(1L, 2L))
it <- itoken(movie_review$review[1:N], preprocess_function = tolower,
             tokenizer = word_tokenizer, chunks_number = 10)
corpus <- create_corpus(it, vectorizer)
hash_dtm <- get_dtm(corpus)

# Vocabulary vectorizer: build a vocabulary with an ngram range of 1 to 1 first.
it <- itoken(movie_review$review[1:N], preprocess_function = tolower,
             tokenizer = word_tokenizer, chunks_number = 10)
v <- create_vocabulary(it, c(1L, 1L))

vectorizer <- vocab_vectorizer(v)

# The token iterator is created again before the corpus pass
# (presumably because the previous one was consumed by create_vocabulary;
# confirm against the itoken documentation).
it <- itoken(movie_review$review[1:N], preprocess_function = tolower,
             tokenizer = word_tokenizer, chunks_number = 10)

corpus <- create_corpus(it, vectorizer)
vocab_dtm <- get_dtm(corpus)
}
\seealso{
\link{create_corpus} \link{create_dtm} \link{create_tcm} \link{create_vocabulary}
}

