% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/predict.R
\name{predict.drf}
\alias{predict.drf}
\title{Predict from Distributional Random Forests object}
\usage{
\method{predict}{drf}(
  object,
  newdata = NULL,
  functional = NULL,
  transformation = NULL,
  custom.functional = NULL,
  num.threads = NULL,
  estimate.uncertainty = FALSE,
  ...
)
}
\arguments{
\item{object}{Trained DRF forest object.}

\item{newdata}{Points at which predictions should be made. If NULL, returns out-of-bag
predictions on the training set (i.e., for every training point \eqn{X_i}, provides predictions using only trees which did not use
this point for tree construction). Can be either a data frame, matrix or a vector. Each row represents a data point of interest and 
the number and ordering of columns is assumed the be the same as in the training set.}

\item{functional}{Optional. String indicating the statistical functional that we want to compute from the weights. One option between:
\describe{
 \item{"mean"}{ - Conditional mean, the returned value is a matrix \code{mean} of dimension \code{n} x \code{f},
 where \code{n} denotes the number of observations in \code{newdata} and \code{f} the dimension of the \code{transformation}.}
 \item{"sd"}{ - Conditional standard deviation for each component of the (transformed) response, the returned value is
 a matrix of dimension \code{n} x \code{f}, where \code{n} denotes the number of observations in \code{newdata} and \code{f} the dimension of the \code{transformation}.}
 \item{"quantile"}{ - Conditional quantiles. It requires additional parameter \code{quantiles} containing the list of quantile levels we want to compute. 
 The returned value is an array of dimension \code{n} x \code{f}  x \code{q}, where \code{n} denotes the number of observations in \code{newdata}, 
 \code{f} the dimension of the \code{transformation} and \code{q} the number of desired quantiles.}
 \item{"cor"}{ - Conditional correlation matrix, the returned value is an array of dimension \code{n} x \code{f} x \code{f},
 where \code{n} denotes the number of observations in \code{newdata} and \code{f} the dimension of the \code{transformation}.}
 \item{"cov"}{ - Conditional covariance matrix, the returned value is an array of dimension \code{n} x \code{f}  x \code{f},
 where \code{n} denotes the number of observations in \code{newdata}, \code{f} the dimension of the \code{transformation}.}
 \item{"custom"}{ - A custom function provided by the user, the returned value is a matrix of dimension \code{n} x \code{f},
 where \code{n} denotes the number of observations in \code{newdata} and \code{f} the dimension of the output of the function \code{custom.functional} provided by the user.}
}}

\item{transformation}{An optional transformation function that is applied to the responses before computing the target functional. It helps to extend the functionality to a much wider range of targets. 
The responses are not transformed by default, i.e. the identity function \eqn{f(y) = y} is used.}

\item{custom.functional}{A user-defined function when \code{functional} is set to "custom". This should be a function \code{f(y,w)} which for a single test point
takes the \code{n} x \code{f} matrix \code{y} and the corresponding \code{n}-dimensional vector of weights \code{w} and returns the quantity of interest given as a list of values.
\code{n} denotes the number of training observations and \code{f} the dimension of the \code{transformation}.}

\item{num.threads}{Number of threads used for computing. If set to NULL, the software automatically selects an appropriate amount.}

\item{estimate.uncertainty}{Whether to additionally return B weight vectors calculated on B CI groups for each sample. See Details and return value docu.}

\item{...}{additional parameters.}
}
\value{
If \code{functional} is NULL, returns a list containing 
\item{y}{the matrix of training responses}
\item{weights}{the matrix of weights, whose number of rows corresponds 
the number of rows of \code{newdata} and the number of columns corresponds to the 
number of training  data points.}

If \code{estimate.uncertainty=TRUE}, additionally 
\item{weights.uncertainty}{a list of length \code{nrow(newdata)} where each item is a \code{B x w} 
sparse matrix, where \code{B} is the number of CI groups and \code{w=nrow(Y)}.
This matrix essentially contains \code{B} separate weight vectors, one in each row.}

If \code{functional} is specified, the desired quantity is returned, in the format described above.
}
\description{
Obtain predictions from a DRF forest object. For any point \eqn{x} in the predictor space, it returns the estimate of the conditional distribution 
\eqn{P(Y | X=x)} represented as a weighted distribution \eqn{\sum_i w_i y_i} of the training observations \eqn{{y_i}}. 
Additionally, this function also provides support to directly obtain estimates of certain target quantities \eqn{\tau(P(Y | X))}, 
such as e.g. conditional quantiles, variances or correlations.
}
\details{
To estimate the uncertainty, a set of \code{B=(num.trees)/(ci.group.size)} 
weights is produced for each sample when \code{estimate.uncertainty=TRUE}. 
These \code{B} weights arise from \code{B} subforests (CI groups)
inside the estimation routine and may be seen as bootstrap 
approximation to the estimation uncertainty of the DRF estimator. As such, they 
can be used to build confidence intervals for functionals. For instance, for 
univariate functionals, one may calculate one functional per weight to obtain
\code{B} estimates, with which the variance can be calculated. Then the usual
normal approximation can be used to construct confidence intervals for said functional.
Uncertainty weights are not available OOB.
}
\examples{
\donttest{
library(drf)

n = 500
p = 10
d = 3

# Generate training data
X = matrix(rnorm(n * p), nrow=n)
Y = matrix(rnorm(n * d), nrow=n)
Y[, 1] = Y[, 1] + X[, 1]
Y[, 2] = Y[, 2] * X[, 2]
Y[, 3] = Y[, 3] * X[, 1] + X[, 2]

# Fit DRF object
drf.forest = drf(X, Y, num.trees=100, num.threads=1)

# Generate test data
X_test = matrix(rnorm(10 * p), nrow=10)

out = predict(drf.forest, newdata=X_test)
# Compute E[Y_1 | X] for all data in X_test directly from
# the weights representing the estimated distribution
out$weights \%*\% out$y[,1]

out = predict(drf.forest, newdata=X_test,
              functional='mean')
# Compute E[Y_1 | X] for all data in X_test using built-in functionality
out[,1]

out = predict(drf.forest, newdata=X_test,
              functional='quantile',
              quantiles=c(0.25, 0.75),
              transformation=function(y){y[1] * y[2] * y[3]})
# Compute 25\% and 75\% quantiles of Y_1*Y_2*Y_3, conditionally on X = X_test[1, ]
out[1,,]

out = predict(drf.forest, newdata=X_test,
              functional='cov',
              transformation=function(y){matrix(1:6, nrow=2) \%*\% y})
# Compute 2x2 covariance matrix for (1*Y_1 + 3*Y_2 + 5*Y_3, 2*Y_1 + 4*Y_2 + 6*Y_3),
# conditionally on X = X_test[1, ]
out[1,,]

out = predict(drf.forest, newdata=X_test,
              functional='custom',
              custom.functional=function(y, w){c(sum(y[, 1] * w), sum(y[, 2] * w))})
# Compute E[Y_1, Y_2 | X] for all data in X_test by providing custom functional that
# computes it from the weights
out

## UNCERTAINTY WEIGHTS ######################################################

# Simulate Data that experiences both a mean as well as sd shift
set.seed(10)
n<-500

# Simulate from X
x1 <- runif(n,-1,1)
x2 <- runif(n,-1,1)
x3 <- x1+ runif(n,-1,1)
X0 <- matrix(runif(7*n,-1,1), nrow=n, ncol=7)
X <- cbind(x1,x2, x3, X0)
colnames(X)<-NULL

# Simulate dependent variable Y
Y <- as.matrix(rnorm(n,mean = 0.8*(x1 > 0), sd = 1 + 1*(x2 > 0)))

# Fit DRF with 50 CI groups, each 10 trees large. This results in 50 uncertainty weights 
DRF <- drf(X=X, Y=Y,num.trees=500, min.node.size = 5, ci.group.size=500/50, num.threads=1)

# Obtain Test point
x<-matrix(c(0.2, 0.4, runif(8,-1,1)), nrow=1, ncol=10)

# (we only use 2 threads in this example due to cran limitations)
DRFpred<-predict(DRF, newdata=x, estimate.uncertainty=TRUE, num.threads=1)

## Sample from P_{Y| X=x}
Yxs<-Y[sample(1:n, size=n, replace = TRUE, DRFpred$weights[1,])]

# Calculate quantile prediction as weighted quantiles from Y
qx <- quantile(Yxs, probs = c(0.05,0.95))

# Calculate conditional mean prediction
mux <- mean(Yxs)

# True quantiles
q1<-qnorm(0.05, mean=0.8 * (x[1] > 0), sd=(1+(x[2] > 0)))
q2<-qnorm(0.95, mean=0.8 * (x[1] > 0), sd=(1+(x[2] > 0)))
mu<-0.8 * (x[1] > 0)

# Calculate uncertainty
alpha<-0.05
B<-nrow(DRFpred$weights.uncertainty[[1]])
qxb<-matrix(NaN, nrow=B, ncol=2)
muxb<-matrix(NaN, nrow=B, ncol=1)
for (b in 1:B){
  Yxsb<-Y[sample(1:n, size=n, replace = TRUE, DRFpred$weights.uncertainty[[1]][b,])]
  qxb[b,] <- quantile(Yxsb, probs = c(0.05,0.95))
  muxb[b] <- mean(Yxsb)
}

CI.lower.q1 <- qx[1] - qnorm(1-alpha/2)*sqrt(var(qxb[,1]))
CI.upper.q1 <- qx[1] + qnorm(1-alpha/2)*sqrt(var(qxb[,1]))

CI.lower.q2 <- qx[2] - qnorm(1-alpha/2)*sqrt(var(qxb[,2]))
CI.upper.q2 <- qx[2] + qnorm(1-alpha/2)*sqrt(var(qxb[,2]))

CI.lower.mu <- mux - qnorm(1-alpha/2)*sqrt(var(muxb))
CI.upper.mu <- mux + qnorm(1-alpha/2)*sqrt(var(muxb))

hist(Yxs, prob=TRUE)
z<-seq(-6,7,by=0.01)
d<-dnorm(z, mean=0.8 * (x[1] > 0), sd=(1+(x[2] > 0)))
lines(z,d, col="darkred"  )
abline(v=q1,col="darkred" )
abline(v=q2, col="darkred" )
abline(v=qx[1], col="darkblue")
abline(v=qx[2], col="darkblue")
abline(v=mu, col="darkred")
abline(v=mux, col="darkblue")
abline(v=CI.lower.q1, col="darkblue", lty=2)
abline(v=CI.upper.q1, col="darkblue", lty=2)
abline(v=CI.lower.q2, col="darkblue", lty=2)
abline(v=CI.upper.q2, col="darkblue", lty=2)
abline(v=CI.lower.mu, col="darkblue", lty=2)
abline(v=CI.upper.mu, col="darkblue", lty=2)

}

}
