% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/lingmatch.R
\name{lma_weight}
\alias{lma_weight}
\title{Document-Term Matrix Weighting}
\usage{
lma_weight(dtm, weight = "count", normalize = TRUE, wc.complete = TRUE,
  log.base = 10, alpha = 1, doc.only = FALSE, percent = FALSE)
}
\arguments{
\item{dtm}{A matrix with words as column names.}

\item{weight}{A string that at least partially matches the name of one (or a combination; see note)
  of the available weighting methods:

  \strong{Term weights} (applied uniquely to each cell)
  \itemize{
    \item \strong{\code{binary}} \cr
    \code{(dtm > 0) * 1} \cr
    Convert frequencies to 1s and 0s; remove differences in frequencies.

    \item \strong{\code{log}} \cr
    \code{log(dtm + 1, log.base)} \cr
    Log of frequencies.

    \item \strong{\code{sqrt}} \cr
    \code{sqrt(dtm)} \cr
    Square root of frequencies.

    \item \strong{\code{count}} \cr
    \code{dtm} \cr
    Unaltered; sometimes called term frequencies (tf).

    \item \strong{\code{amplify}} \cr
    \code{dtm ^ alpha} \cr
    Amplify differences in frequencies.
  }

  \strong{Document weights} (applied by column)
  \itemize{
    \item \strong{\code{dflog}} \cr
    \code{log(colSums(dtm > 0), log.base)} \cr
    Log of binary term sum.

    \item \strong{\code{entropy}} \cr
    \code{1 - rowSums(x *} \code{log(x + 1, log.base) /} \code{log(ncol(x),} \code{log.base), na.rm = TRUE)} \cr
    Where \code{x = t(dtm) / colSums(dtm > 0)}; entropy of term-conditional term distribution.

    \item \strong{\code{ppois}} \cr
    \code{1 - ppois(alpha,} \code{colSums(dtm) / nrow(dtm))} \cr
    Poisson-predicted term distribution.

    \item \strong{\code{dpois}} \cr
    \code{1 - dpois(alpha, colSums(dtm) / nrow(dtm))} \cr
    Poisson-predicted term density.

    \item \strong{\code{dfmlog}} \cr
    \code{log(diag(dtm[max.col(t(dtm)),]), log.base)} \cr
    Log of maximum term frequency.

    \item \strong{\code{dfmax}} \cr
    \code{diag(dtm[max.col(t(dtm)),])} \cr
    Maximum term frequency.

    \item \strong{\code{df}} \cr
    \code{colSums(dtm > 0)} \cr
    Sum of binary term occurrence across documents.

    \item \strong{\code{idf}} \cr
    \code{log(nrow(dtm) / colSums(dtm > 0), log.base)} \cr
    Inverse document frequency.

    \item \strong{\code{ridf}} \cr
    \code{idf - log(dpois, log.base)} \cr
    Residual inverse document frequency.

    \item \strong{\code{normal}} \cr
    \code{sqrt(1 / colSums(dtm ^ 2))} \cr
    Normalized document frequency.
  }

Alternatively, \code{'pmi'} or \code{'ppmi'} will apply a pointwise mutual information weighting
scheme (with \code{'ppmi'} setting negative values to 0).}

\item{normalize}{Logical: if \code{FALSE}, the dtm is not divided by document word-count before
being weighted.}

\item{wc.complete}{If the dtm was made with \code{\link{lma_dtm}} (has a \code{'WC'}
attribute), word counts for
frequencies can be based on the raw count (default; \code{wc.complete = TRUE}). If
\code{wc.complete = FALSE}, or the dtm does not have a \code{'WC'} attribute,
\code{rowSums(dtm)} is used as word count.}

\item{log.base}{The base of logs, applied to any weight using \code{\link[base]{log}}.
Default is 10.}

\item{alpha}{A scaling factor applied to document frequency as part of pointwise mutual
information weighting, or the power used by \code{amplify} (\code{dtm ^ alpha}, which defaults to 1.1),
or the specified quantile of the Poisson distribution (\code{dpois(alpha,}
\code{colSums(x,} \code{na.rm = TRUE) /} \code{nrow(x))}).}

\item{doc.only}{Logical: if \code{TRUE}, only document weights are returned (a single value for
each term).}

\item{percent}{Logical; if \code{TRUE}, frequencies are multiplied by 100.}
}
\value{
A weighted version of \code{dtm}.
}
\description{
Weight a document-term matrix.
}
\note{
Term weights work to adjust differences in counts within documents, with differences meaning
increasingly more from \code{binary} to \code{log} to \code{sqrt} to \code{count} to \code{amplify}.

Document weights work to treat words differently based on their between-document or overall frequency.
When term frequencies are constant, \code{dpois}, \code{idf}, \code{ridf}, and \code{normal} give
less common words increasingly more weight, and \code{dfmax}, \code{dfmlog}, \code{ppois}, \code{df},
\code{dflog}, and \code{entropy} give less common words increasingly less weight.

\code{weight} can either be a character vector of length two, corresponding to term weight and
document weight (e.g., \code{c('count', 'idf')}), or it can be a string with term and
document weights separated by any of \code{:\\*_/; ,-} (e.g., \code{'count-idf'}).
\code{'tf'} is also acceptable for \code{'count'}, and \code{'tfidf'} will be parsed as
\code{c('count', 'idf')}, though this is a special case.

For \code{weight}, term or document weights can be entered individually; term weights alone will
not apply any document weight, and document weights alone will apply a \code{'count'} term weight
(unless \code{doc.only = TRUE}, in which case a term-named vector of document weights is returned
instead of a weighted dtm).
}
\examples{
# visualize term and document weights

## term weights
# apply each term weight to the raw counts 1 through 20, without word-count normalization
term_weights <- c('binary', 'log', 'sqrt', 'count', 'amplify')
Weighted <- sapply(
  term_weights,
  function(w) lma_weight(1:20, weight = w, normalize = FALSE)
)
if(require(splot)){
  splot(Weighted ~ 1:20, labx = 'Raw Count', lines = 'co')
}

## document weights
doc_weights <- c(
  'df', 'dflog', 'dfmax', 'dfmlog', 'idf', 'ridf',
  'normal', 'dpois', 'ppois', 'entropy'
)

# Build a 20 x 20 test matrix whose column j is non-zero in rows 1 through j, then
# return only the document weights (doc.only = TRUE) that weight `w` assigns to it.
#   w:     name of the document weight to apply
#   value: a number placed in every non-zero cell, or a string; 'inverted' fills
#          column j with 21 - j, and any other string fills column j with j
weight_range <- function(w, value = 1){
  size <- 20
  m <- diag(size)
  filled <- upper.tri(m, diag = TRUE)
  if(is.numeric(value)){
    m[filled] <- value
  }else{
    per_column <- seq_len(size)
    if(value == 'inverted') per_column <- (size + 1) - per_column
    # cells are filled in column-major order, and column j has j non-zero cells
    m[filled] <- rep(per_column, times = seq_len(size))
  }
  lma_weight(m, weight = w, normalize = FALSE, doc.only = TRUE)
}

# Plot each document weight against document frequency for three fill patterns of the
# test matrix built by weight_range; skipped when the splot package cannot be loaded.
if(require(splot)){
  # one label per entry of doc_weights, in the same order:
  # 4 df-based, 2 idf-based, normal, 2 Poisson-based, and entropy
  category = rep(c('df', 'idf', 'normal', 'poisson', 'entropy'), c(4, 2, 1, 2, 1))
  # arguments shared by all three plots, passed through the options argument of splot
  op = list(
    laby = 'Relative (Scaled) Weight', labx = 'Document Frequency',
    leg = 'outside', colorby = list(quote(category), grade = TRUE),
    lines = 'connected', mv.scale = TRUE, note = FALSE
  )
  # default value = 1: every non-zero cell of the test matrix is 1
  splot(
    sapply(doc_weights, weight_range) ~ 1:20,
    options = op, title = 'Same Term, Varying Document Frequencies',
    sud = 'All term frequencies are 1.'
  )
  # value = 'sequence' is any string other than 'inverted', so column j is filled with j
  splot(
    sapply(doc_weights, weight_range, value = 'sequence') ~ 1:20,
    options = op, title = 'Term as Document Frequencies',
    sud = 'Non-zero terms are the number of non-zero terms.'
  )
  # value = 'inverted': column j is filled with 21 - j
  splot(
    sapply(doc_weights, weight_range, value = 'inverted') ~ 1:20,
    options = op, title = 'Term Opposite of Document Frequencies',
    sud = 'Non-zero terms are the number of zero terms + 1.'
  )
}

}
