% Generated by roxygen2 (4.1.0): do not edit by hand
% Please edit documentation in R/pop.predict_function_V2_3.10.15.R
\name{pop.predict}
\alias{pop.predict}
\title{A genome-wide procedure for predicting genetic variance and correlated response in bi-parental breeding populations}
\usage{
pop.predict(G.in = NULL, y.in = NULL, map.in = NULL,
  crossing.table = NULL, parents = NULL, tail.p = 0.1, nInd = 200,
  map.plot = F, min.maf = 0.01, mkr.cutoff = 0.5, entry.cutoff = 0.5,
  remove.dups = T, nSim = 25, frac.train = 0.6, nCV.iter = 100,
  nFold = NULL, nFold.reps = 1, nIter = 12000, burnIn = 3000,
  models = c("rrBLUP", "BayesA", "BayesB", "BayesC", "BL", "BRR"),
  return.raw = F)
}
\arguments{
\item{G.in}{\code{Matrix} of genotypic data. First row contains marker names and the first column contains entry (taxa) names. Genotypes should be coded using the (1, 0, -1, NA) format familiar to users of \code{\link{rrBLUP}} (\cite{Endelman, 2011}). TIP - Set header=\code{FALSE} within \code{\link{read.table}} or \code{\link{read.csv}} when importing a tab-delimited file containing data for \code{G.in}.}

\item{y.in}{\code{Matrix} of phenotypic data. First column contains entry (taxa) names found in \code{G.in}, regardless of whether the entry has a phenotype for any or all traits. Additional columns contain phenotypic data; column names should reflect the trait name(s). TIP - Set header=\code{TRUE} within \code{\link{read.table}} or \code{\link{read.csv}} when importing a tab-delimited file containing data for \code{y.in}.}

\item{map.in}{\code{Matrix} of genetic map data, three columns total. Column 1 contains marker names, column 2 contains chromosome number, and column 3 contains cM positions. TIP - Set header=\code{TRUE} within \code{read.table} or \code{read.csv} when importing a tab-delimited file containing data for \code{map.in}.}

\item{crossing.table}{Optional \code{matrix} specifying which crosses are to be simulated, two columns total. Column 1 contains the first parent of the cross (Par1) and column 2 contains the second parent of the cross (Par2).}

\item{parents}{Optional \code{character vector}. If \code{parents="TP"} then only the entries (taxa) within the training population (i.e. those phenotyped for the trait) are considered as parents; all pairwise crosses will be simulated for these. User could otherwise provide a character vector of entry names; all pairwise crosses will be simulated for these.}

\item{tail.p}{Optional \code{numeric} indicating the percentile of the simulated progeny to be included into the calculation of \eqn{\mu}\emph{_sp} and correlated response. Default is \code{0.10}.}

\item{nInd}{Optional \code{integer} indicating the number of progeny simulated per cross, per iteration, using \code{\link[qtl]{sim.cross}} in R/qtl (\url{http://www.rqtl.org}; \emph{Broman et al., 2003}). Default is \code{200}.}

\item{map.plot}{Optional \code{logical}. If \code{TRUE} then a plot of the genetic map will be generated by \code{\link[qtl]{plot.map}}. Default is \code{FALSE}.}

\item{min.maf}{Optional \code{numeric} indicating a minimum minor allele frequency (MAF) when filtering \code{G.in}. Markers with an MAF < \code{min.maf} will be removed. Default is \code{0.01} to remove monomorphic markers. Set to \code{0} for no filtering.}

\item{mkr.cutoff}{Optional \code{numeric} indicating the maximum missing data per marker when filtering \code{G.in}. Markers missing > \code{mkr.cutoff} data will be removed. Default is \code{0.50}. Set to \code{1} for no filtering.}

\item{entry.cutoff}{Optional \code{numeric} indicating the maximum missing genotypic data per entry allowed when filtering \code{G.in}. Entries missing > \code{entry.cutoff} marker data will be removed. Default is \code{0.50}. Set to \code{1} for no filtering.}

\item{remove.dups}{Optional \code{logical}. If \code{TRUE} then duplicate entries in the genotype matrix, if present, will be removed. This step may be necessary for missing marker imputation via the EM algorithm by \code{\link[rrBLUP]{A.mat}} in \code{\link{rrBLUP}} (\cite{Endelman, 2011}; \cite{Poland et al., 2012}). Default is \code{TRUE}.}

\item{nSim}{Optional \code{integer} indicating the number of iterations a population should be simulated for each pairwise cross. Returned values are reported as means of parameters estimated in each of \code{nSim} simulations. Default is \code{25}.}

\item{frac.train}{Optional \code{numeric} indicating the fraction of the TP that is used to estimate marker effects (i.e. the training set) under cross-validation (CV) method 1 (see \code{Details} in \code{\link{x.val}}). The remaining \eqn{(1-frac.train)} of the TP will then comprise the prediction set. Default is \code{0.60}.}

\item{nCV.iter}{Optional \code{integer} indicating the number of times to iterate \emph{CV method 1} (see \code{Details} in \code{\link{x.val}}). Default is \code{100}.}

\item{nFold}{Optional \code{integer}. If a number is provided, denoting the number of "folds", then CV will be conducted using \emph{CV method 2} (see \code{Details} in \code{\link{x.val}}). Default is \code{NULL}, resulting in the default use of the \emph{CV method 1}.}

\item{nFold.reps}{Optional \code{integer} indicating the number of times \emph{CV method 2} is repeated. The CV accuracy returned is the average \emph{r} of each rep. Default is \code{1}.}

\item{nIter,burnIn}{Optional \code{integer} arguments used by \code{\link[BGLR]{BGLR}} (\cite{de los Campos and Rodriguez, 2014}) when fitting Bayesian models to estimate marker effects. The defaults are \code{12000} and \code{3000}, respectively. When conducting CV these values are fixed at \code{1500} and \code{500}, respectively, for computational efficiency.}

\item{models}{Optional \code{Character vector} of the regression models to be used in CV and to estimate marker effects. Options include \code{rrBLUP, BayesA, BayesB, BayesC, BL, BRR}, one or more may be included at a time. By default all models are tested.}

\item{return.raw}{Optional \code{logical}. If \code{TRUE} then \code{pop.predict} will return the results of each simulation in addition to the summarized dataframe. Default is \code{FALSE}.}
}
\value{
A \code{list} containing: \itemize{
           \item \code{predictions} A \code{list} of dataframes containing predictions of (\eqn{\mu}), (\emph{V_G}), and (\eqn{\mu}\emph{_sp}). When multiple traits are provided the correlated responses and correlation between all pairwise traits is also included.
           \item \code{preds.per.sim} If \code{return.raw} is \code{TRUE} then a \code{dataframe} containing the results of each simulation is returned. This is useful for calculating dispersion statistics for traits not provided in the standard \code{predictions} dataframe.
           \item \code{CVs} A \code{dataframe} of CV results for each trait/model combination specified.
           \item \code{models.chosen} A \code{matrix} listing the statistical model chosen for each trait.
           \item \code{markers.removed} A \code{vector} of markers removed during filtering for MAF and missing data.
           \item \code{entries.removed} A \code{vector} of entries removed during filtering for missing data and duplicate entries.
         }
}
\description{
\code{pop.predict} uses phenotypic and genotypic data from a set of individuals known as a training population (TP) and a set of candidate parents, which may or may not be included in the TP, to predict the mean (\eqn{\mu}), genetic variance (\emph{V_G}), and superior progeny values (\eqn{\mu}\emph{_sp}) of the half-diallel, or a defined set of pairwise bi-parental crosses between parents. When multiple traits are provided \code{pop.predict} will also predict the correlated responses and correlation between all pairwise traits. See \cite{Mohammadi, Tiede, and Smith (2015)} for further details.

             NOTE - \code{pop.predict} writes and reads files to disk so it is highly recommended to set your working directory.
}
\details{
\code{pop.predict} can be used to predict the mean (\eqn{\mu}), genetic variance (\emph{V_G}), superior progeny values (\eqn{\mu}\emph{_sp}), and predicted correlated response and correlations between all pairwise traits. The methodology and procedure to do so have been described in \cite{Bernardo (2014)} and \cite{Mohammadi, Tiede, and Smith (2015)}. Users familiar with genome-wide prediction, association mapping, and/or linkage mapping will be familiar with the
         required inputs of \code{pop.predict}. \code{G.in} includes all of the entries (taxa) in the TP as well as additional entries to be considered as parent candidates. Entries included in \code{G.in} that do have a phenotype for any or all traits in \code{y.in} are considered TP entries for those respective traits. \code{G.in} is filtered according to \code{min.maf}, \code{mkr.cutoff}, \code{entry.cutoff}, and \code{remove.dups};
         remaining missing marker data is imputed using the EM algorithm (\cite{Poland et al., 2012}) when possible, and the marker mean otherwise, both implemented in \code{\link{rrBLUP}}. For each trait, the TP (i.e. entries with phenotype) is used to: \enumerate{
         \item Perform CV (see \code{frac.train} and \code{nCV.iter} for details about the CV method) to select a regression model
         \item Estimate marker effects using the model resulting in the highest CV accuracy
         }
         Models include ridge regression BLUP implemented in \code{\link{rrBLUP}} (\cite{Endelman, 2011}) and BayesA, BayesB, BayesC\eqn{\pi}, Bayesian lasso (BL), and Bayesian ridge regression (BRR) implemented in \code{\link{BGLR}} (\cite{de los Campos and Rodriguez, 2014}).
         Information from the \code{map.in} is then used to simulate chromosomal recombination expected in a recombinant inbred line (i.e. \emph{F-infinity}) (\cite{Broman et al., 2003}) population (size=\code{nInd}). A function then converts the recombined chromosomal segments of the generic RIL population to the chromosomal segments of the population's respective parents and GEBVs of the simulated progeny are calculated.
         The simulation and conversion process is repeated \emph{s} times, where \emph{s} = \code{nSim}, to calculate dispersion statistics for \eqn{\mu} and \emph{V_G}; the remainder of the values in the \code{predictions} output are means of the \emph{s} simulations.  During each iteration the correlation (\emph{r}) and correlated response of each pairwise combination of traits is also calculated and their mean across the \emph{s} simulations is returned.
         The correlated response of trait.B when predicting trait.A is the mean of trait.B for the (\eqn{\mu}\emph{_sp}) of trait.A, and vice-versa; a correlated response for the bottom \code{tail.p} and upper \eqn{1-tail.p} is returned for each trait.

         A dataset \code{\link{think_barley.rda}} is provided as an example of the proper formatting of input files and also for users to become familiar with \code{pop.predict}.
}
\examples{
\dontrun{
## View formatting
## Use View() in RStudio or R GUI with X11 forwarding
## Use head() in R GUI without X11 forwarding
View(G.in_ex)
View(y.in_ex)
View(map.in_ex)
View(cross.tab_ex)

## setwd() - pop.predict writes and reads files to disk
##   so it is recommended to set your working directory

## nSim and nFold are set to low values in the
## examples for sake of computing time

## Ex. 1 - Predict a defined set of crosses
## This example uses CV method 1 (see Details of x.val() function)
ex1.out <- pop.predict(G.in = G.in_ex, y.in = y.in_ex,
   map.in = map.in_ex, crossing.table = cross.tab_ex,
   nSim=5, nCV.iter=10)
ex1.out$predictions  ## Predicted parameters
ex1.out$CVs          ## CV results

## Ex. 2 - Predict all pairwise crosses between a list of parents
## This example uses CV method 2 (see Details of x.val() function)
par.list <- sample(y.in_ex[,1], size = 10, replace = F)
ex2.out <- pop.predict(G.in = G.in_ex, y.in = y.in_ex,
   map.in = map.in_ex, parents = par.list,
   nSim=5, nFold=5, nFold.reps=2)

## Ex. 3 - Use only rrBLUP and Bayesian lasso (BL) models
ex3.out <- pop.predict(G.in = G.in_ex, y.in = y.in_ex,
   map.in = map.in_ex, crossing.table = cross.tab_ex,
   models = c("rrBLUP", "BL"), nSim=5, nCV.iter=10)
}
}
\references{
Bernardo, R. 2014. Genomewide Selection of Parental Inbreds: Classes of Loci and Virtual Biparental Populations. Crop Sci. 54:2586-2595.

     Broman, K. W., H. Wu, S. Sen and G.A. Churchill. 2003. R/qtl: QTL mapping in experimental crosses. Bioinformatics 19:889-890.

     Endelman, J. B. 2011. Ridge regression and other kernels for genomic selection with R package rrBLUP. Plant Genome 4:250-255. doi: 10.3835/plantgenome2011.08.0024

     Gustavo de los Campos and Paulino Perez Rodriguez, (2014). BGLR: Bayesian Generalized Linear Regression. R package version 1.0.3. http://CRAN.R-project.org/package=BGLR

     Mohammadi M., T. Tiede, and K.P. Smith. 2015. PopVar: A genome-wide procedure for predicting genetic variance and correlated response in bi-parental breeding populations. Crop Sci. \emph{Accepted}.

     Munoz-Amatriain, M., M. J. Moscou, P. R. Bhat, J. T. Svensson, J. Bartos, P. Suchankova, H. Simkova, T. R. Endo, R. D. Fenton, S. Lonardi, A. M. Castillo, S. Chao, L. Cistue, A. Cuesta-Marcos, K. L. Forrest, M. J. Hayden, P. M. Hayes, R. D. Horsley, K. Makoto, D. Moody, K. Sato, M. P. Valles, B. B. H. Wulff, G. J. Muehlbauer, J. Dolezel, and T. J. Close. 2011. An improved consensus linkage map of barley based on flow-sorted chromosomes and single nucleotide polymorphism markers. Plant Genome 4:238-249.

     Poland, J., J. Endelman, J. Dawson, J. Rutkoski, S. Wu, Y. Manes, S. Dreisigacker, J. Crossa, H. Sanches-Villeda, M. Sorrells, and J.-L. Jannink. 2012. Genomic Selection in Wheat Breeding using Genotyping-by-Sequencing. Plant Genome 5:103-113.
}

