Abstract
This document aims at simulating data with GMA-SMA models, fitting them with various softwares, and checking their estimates.Dependencies:
suppressPackageStartupMessages(library(plantmix))
suppressPackageStartupMessages(library(lme4))
In this vignette, varietal mixtures will be assembled from a panel of 200 genotypes. Only 1000 mixtures will be observed, among all 1.99^{4} possible ones, i.e., approximately 5%. This incomplete design will be optimized so that it is balanced: all genotype will be observed in the same number of mixtures. Phenotypes will then be simulated organized into a field trial in a randomized complete block design, according to all six GMA-SMA models that can be conceived: 1, 2, 2’, 2’‘, 3 and 3’. See the article by Forst et al (2019), especially for the first three models, as well as the introductory vignette.
set.seed(12345)
nbGenos <- 200
levGenos <- sprintf(
fmt = paste0("g%0", floor(log10(nbGenos)) + 1, "i"),
1:nbGenos
)
nbSnps <- 1000
levSnps <- sprintf(
fmt = paste0("s%0", floor(log10(nbSnps)) + 1, "i"),
1:nbSnps
)
nb_pops <- 10
weak_div_pops <- diag(nb_pops)
weak_div_pops[upper.tri(weak_div_pops)] <- 0.9
weak_div_pops[lower.tri(weak_div_pops)] <- weak_div_pops[upper.tri(weak_div_pops)]
tmp <- rep(nbGenos / nb_pops, nb_pops - 1)
tmp <- c(tmp, nbGenos - sum(tmp))
snpGenos <- simulGenosDoseStruct(
nb_genos = tmp,
nb_snps = nbSnps,
div_pops = weak_div_pops,
geno_IDs = levGenos,
snp_IDs = levSnps
)
dim(snpGenos)
## [1] 200 1000
snpGenos[1:3, 1:4]
## s0001 s0002 s0003 s0004
## g001 1 1 1 1
## g002 1 0 1 1
## g003 0 1 2 0
For simplicity, the first estimator of VanRaden (2008) is used, that assumes linkage equilibrium and Hardy-Weinberg equilibrium.
GRM <- estimGRM(snpGenos)
GRM <- as.matrix(Matrix::nearPD(GRM)$mat)
image(Matrix(GRM), main = "GRM")
hist(diag(GRM), main = "GRM")
hist(GRM[upper.tri(GRM)], main = "GRM")
nbMixes <- 1000
design <- getDesignBinaryVarMix(levGenos, nbMixes = nbMixes)
tmp <- getMixturesPerGeno(getMixtureList(design$combs))
table(sapply(tmp, length)) # each genotype is in the same nb of mixtures -> balanced design
##
## 10
## 200
monoStands <- paste(levGenos, levGenos, sep = "_")
pairs <- design$combs
mixStands <- paste(pairs[, 1], pairs[, 2], sep = "_")
IDs <- c(monoStands, mixStands)
nbBlocks <- 3
blocks <- LETTERS[1:nbBlocks]
dat <- do.call(rbind, lapply(blocks, function(block) {
data.frame(
ID = IDs,
block = block,
stringsAsFactors = TRUE
)
}))
listContr <- list(block = "contr.sum")
X <- model.matrix(~ 1 + block, data = dat, contrasts = listContr)
allZ <- list()
allZ$GMA <- mkZGMA(dat, col = "ID", sep = "_")
system.time(
allZ <- append(
allZ,
mkAllZSMA(dat, col = "ID", sep = "_", verbose = 1)
)
)
## [1] "model 2'"
## [1] "model 2"
## [1] "model 2''"
## [1] "model 3"
## [1] "model 3'"
## user system elapsed
## 27.706 1.384 29.094
sapply(allZ, dim)
## GMA SMA_mod2p SMA_mod2 SMA_mod2pp_ij SMA_mod2pp_ii SMA_mod3 SMA_mod3p_ij
## [1,] 3600 3600 3600 3600 3600 3600 3600
## [2,] 200 1000 1200 1000 200 1200 1000
## SMA_mod3p_ii
## [1,] 3600
## [2,] 200
Several packages will be tested below, most notably lme4
and TMB. However, lme4 does not natively take
a GRM as input. Hence the GRM will be replaced by the identity matrix to
ensure a fair comparison between packages.
truth <- list(
"intercept" = 100,
"var_GMA" = 10,
"var_SMA" = 2,
"var_SMA_ij" = 1.5,
"var_SMA_ii" = 0.8,
"var_error" = 1
)
set.seed(1234)
truth[["blockEffs"]] <- sample(x = c(-1, 1), size = nbBlocks - 1, replace = TRUE) *
rnorm(n = nbBlocks - 1, mean = 3, sd = 2)
GRM <- diag(nbGenos)
truth[["GMAs"]] <- MASS::mvrnorm(
n = 1, mu = rep(0, nbGenos),
Sigma = truth$var_GMA * GRM
)
truth[["SMAs"]] <- rnorm(n = length(IDs), mean = 0, sd = sqrt(truth$var_SMA))
truth[["SMAs_ij"]] <- rnorm(n = length(mixStands), mean = 0, sd = sqrt(truth$var_SMA_ij))
truth[["SMAs_ii"]] <- rnorm(n = length(monoStands), mean = 0, sd = sqrt(truth$var_SMA_ii))
truth[["errors"]] <- rnorm(n = nrow(dat), mean = 0, sd = sqrt(truth$var_error))
y1 <- X %*% c(truth$intercept, truth$blockEffs) +
allZ$GMA %*% truth$GMAs +
truth$errors
dat$pheno1 <- y1[, 1]
y2 <- X %*% c(truth$intercept, truth$blockEffs) +
allZ$GMA %*% truth$GMAs +
allZ$SMA_mod2 %*% truth$SMAs +
truth$errors
dat$pheno2 <- y2[, 1]
y3 <- X %*% c(truth$intercept, truth$blockEffs) +
allZ$GMA %*% truth$GMAs +
allZ$SMA_mod3 %*% truth$SMAs +
truth$errors
dat$pheno3 <- y3[, 1]
y2p <- X %*% c(truth$intercept, truth$blockEffs) +
allZ$GMA %*% truth$GMAs +
allZ$SMA_mod2p %*% truth$SMAs_ij +
truth$errors
dat$pheno2p <- y2p[, 1]
y2pp <- X %*% c(truth$intercept, truth$blockEffs) +
allZ$GMA %*% truth$GMAs +
allZ$SMA_mod2pp_ij %*% truth$SMAs_ij +
allZ$SMA_mod2pp_ii %*% truth$SMAs_ii +
truth$errors
dat$pheno2pp <- y2pp[, 1]
y3p <- X %*% c(truth$intercept, truth$blockEffs) +
allZ$GMA %*% truth$GMAs +
allZ$SMA_mod3p_ij %*% truth$SMAs_ij +
allZ$SMA_mod3p_ii %*% truth$SMAs_ii +
truth$errors
dat$pheno3p <- y3p[, 1]
Function fitGMASMA allows to fit the models with various
packages. In this vignette, only lme4 and TMB
will be used and, as you can see below, both return the same estimates.
(To save time, MM4LMM is commented.)
pkgs <- c("lme4", "TMB") # , "MM4LMM")
if ("MM4LMM" %in% pkgs) {
suppressPackageStartupMessages(library(MM4LMM))
}
runAllPkgs <- function(pkgs, form, dat, listZ, listVCov, listContr, ...) {
## mcMap(function(pkg) {
Map(function(pkg) {
print(paste0("fit with ", pkg, "..."))
fitGMASMA(form, dat, listZ, pkg, listVCov, listContr)
st <- system.time(
try(fit <- fitGMASMA(form, dat, listZ, pkg, listVCov, listContr, REML = TRUE, ...))
)
print(st)
fit
}, pkgs) # , mc.cores = nbCores)
}
form <- pheno1 ~ 1 + block
listZ <- list("GMA" = allZ$GMA)
listVCov <- list("GMA" = GRM)
listContr <- list(block = "contr.sum")
fits1 <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## user system elapsed
## 0.556 0.000 0.556
## [1] "fit with TMB..."
## user system elapsed
## 3.271 0.000 3.271
print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits1$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_error")]),
estim = tmp[c("GMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10 10.5474529 5.474529
## var_error 1 0.9599666 -4.003335
print("TMB:")
## [1] "TMB:"
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_error")]),
estim = do.call(c, fits1$TMB$report[c("var_GMA", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10 10.5474516 5.474516
## var_error 1 0.9599667 -4.003335
if ("MM4LMM" %in% names(fits1)) {
print(fits1$MM4LMM$Sigma2)
}
form <- pheno2 ~ 1 + block
listZ <- list(
"GMA" = allZ$GMA,
"SMA" = allZ$SMA_mod2
)
listVCov <- list(
"GMA" = GRM,
"SMA" = diag(ncol(listZ$SMA))
)
listContr <- list(block = "contr.sum")
fits2 <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## user system elapsed
## 1.440 0.000 1.439
## [1] "fit with TMB..."
## user system elapsed
## 10.752 0.108 10.862
print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits2$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
estim = tmp[c("GMA", "SMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10 10.4978544 4.9785441
## var_SMA 2 2.0197144 0.9857177
## var_error 1 0.9453828 -5.4617169
print("TMB:")
## [1] "TMB:"
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
estim = do.call(c, fits2$TMB$report[c("var_GMA", "var_SMA1", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10 10.4979307 4.9793071
## var_SMA 2 2.0197059 0.9852969
## var_error 1 0.9453837 -5.4616269
if ("MM4LMM" %in% names(fits2)) {
print(fits2$MM4LMM$Sigma2)
}
form <- pheno3 ~ 1 + block
listZ <- list(
"GMA" = allZ$GMA,
"SMA" = allZ$SMA_mod3
)
listVCov <- list(
"GMA" = GRM,
"SMA" = diag(ncol(listZ$SMA))
)
listContr <- list(block = "contr.sum")
fits3 <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## user system elapsed
## 2.454 0.008 2.463
## [1] "fit with TMB..."
## user system elapsed
## 11.432 0.128 11.561
print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits3$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
estim = tmp[c("GMA", "SMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10 10.2611009 2.611009
## var_SMA 2 2.1054392 5.271960
## var_error 1 0.9469001 -5.309988
print("TMB:")
## [1] "TMB:"
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
estim = do.call(c, fits3$TMB$report[c("var_GMA", "var_SMA1", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10 10.2611022 2.611022
## var_SMA 2 2.1054387 5.271936
## var_error 1 0.9469002 -5.309984
if ("MM4LMM" %in% names(fits3)) {
print(fits3$MM4LMM$Sigma2)
}
form <- pheno2p ~ 1 + block
listZ <- list(
"GMA" = allZ$GMA,
"SMA" = allZ$SMA_mod2p
)
listVCov <- list(
"GMA" = GRM,
"SMA" = diag(ncol(listZ$SMA))
)
listContr <- list(block = "contr.sum")
fits2p <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## user system elapsed
## 2.061 0.000 2.061
## [1] "fit with TMB..."
## user system elapsed
## 9.650 0.072 9.723
print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits2p$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
estim = tmp[c("GMA", "SMA", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10 10.3596090 3.596090
## var_SMA 2 1.6026602 -19.866989
## var_error 1 0.9456708 -5.432918
print("TMB:")
## [1] "TMB:"
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_SMA", "var_error")]),
estim = do.call(c, fits2p$TMB$report[c("var_GMA", "var_SMA1", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10 10.3596074 3.596074
## var_SMA 2 1.6026609 -19.866954
## var_error 1 0.9456708 -5.432919
if ("MM4LMM" %in% names(fits2p)) {
print(fits2p$MM4LMM$Sigma2)
}
form <- pheno2pp ~ 1 + block
listZ <- list(
"GMA" = allZ$GMA,
"SMA_ij" = allZ$SMA_mod2pp_ij,
"SMA_ii" = allZ$SMA_mod2pp_ii
)
listVCov <- list(
"GMA" = GRM,
"SMA_ij" = diag(ncol(listZ$SMA_ij)),
"SMA_ii" = diag(ncol(listZ$SMA_ii))
)
listContr <- list(block = "contr.sum")
fits2pp <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## user system elapsed
## 2.375 0.000 2.375
## [1] "fit with TMB..."
## user system elapsed
## 10.384 0.092 10.476
print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits2pp$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
estim = tmp[c("GMA", "SMA_ij", "SMA_ii", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10.0 10.0082780 0.08278019
## var_SMA_ij 1.5 1.6172865 7.81910037
## var_SMA_ii 0.8 0.8345874 4.32342341
## var_error 1.0 0.9453837 -5.46163448
print("TMB:")
## [1] "TMB:"
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
estim = do.call(c, fits2pp$TMB$report[c("var_GMA", "var_SMA1", "var_SMA2", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10.0 10.0082809 0.08280897
## var_SMA_ij 1.5 1.6172866 7.81910639
## var_SMA_ii 0.8 0.8345874 4.32341949
## var_error 1.0 0.9453836 -5.46163514
if ("MM4LMM" %in% names(fits2pp)) {
print(fits2pp$MM4LMM$Sigma2)
}
form <- pheno3p ~ 1 + block
listZ <- list(
"GMA" = allZ$GMA,
"SMA_ij" = allZ$SMA_mod3p_ij,
"SMA_ii" = allZ$SMA_mod3p_ii
)
listVCov <- list(
"GMA" = GRM,
"SMA_ij" = diag(ncol(listZ$SMA_ij)),
"SMA_ii" = diag(ncol(listZ$SMA_ii))
)
listContr <- list(block = "contr.sum")
fits3p <- runAllPkgs(pkgs, form, dat, listZ, listVCov, listContr)
## [1] "fit with lme4..."
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## Warning in split.default(x, g): data length is not a multiple of split variable
## Warning in split.default(seq_along(x), f, drop = drop, ...): data length is not
## a multiple of split variable
## user system elapsed
## 3.593 0.000 3.594
## [1] "fit with TMB..."
## user system elapsed
## 11.519 0.116 11.635
print("lme4:")
## [1] "lme4:"
tmp <- as.data.frame(VarCorr(fits3p$lme4))
tmp <- setNames(tmp$vcov, tmp$grp)
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
estim = tmp[c("GMA", "SMA_ij", "SMA_ii", "Residual")]
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10.0 10.0390341 0.3903412
## var_SMA_ij 1.5 1.6726159 11.5077264
## var_SMA_ii 0.8 0.9481598 18.5199796
## var_error 1.0 0.9440193 -5.5980736
print("TMB:")
## [1] "TMB:"
checks <- data.frame(
truth = do.call(c, truth[c("var_GMA", "var_SMA_ij", "var_SMA_ii", "var_error")]),
estim = do.call(c, fits3p$TMB$report[c("var_GMA", "var_SMA1", "var_SMA2", "var_err")])
)
checks$nBE <- normBiasError(checks$estim, checks$truth)
print(checks)
## truth estim nBE
## var_GMA 10.0 10.0390775 0.3907745
## var_SMA_ij 1.5 1.6726225 11.5081637
## var_SMA_ii 0.8 0.9481763 18.5220407
## var_error 1.0 0.9440195 -5.5980522
if ("MM4LMM" %in% names(fits3p)) {
print(fits3p$MM4LMM$Sigma2)
}
t1 <- proc.time()
t1 - t0
## user system elapsed
## 185.289 2.547 190.107
print(sessionInfo(), locale = FALSE)
## R version 4.5.1 (2025-06-13)
## Platform: x86_64-pc-linux-gnu
## Running under: Debian GNU/Linux 12 (bookworm)
##
## Matrix products: default
## BLAS: /usr/local/R/4.5.1/lib/R/lib/libRblas.so
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.11.0 LAPACK version 3.11.0
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] plantmix_1.0.2 lme4_1.1-37 Matrix_1.7-3 ggplot2_3.5.2 knitr_1.50
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 jsonlite_2.0.0 dplyr_1.1.4 compiler_4.5.1
## [5] Rcpp_1.1.0 tidyselect_1.2.1 dichromat_2.0-0.1 jquerylib_0.1.4
## [9] splines_4.5.1 scales_1.4.0 boot_1.3-31 yaml_2.3.10
## [13] fastmap_1.2.0 lattice_0.22-7 R6_2.6.1 generics_0.1.4
## [17] igraph_2.1.4 rbibutils_2.3 MASS_7.3-65 tibble_3.3.0
## [21] nloptr_2.2.1 minqa_1.2.8 TMB_1.9.17 bslib_0.9.0
## [25] pillar_1.11.0 RColorBrewer_1.1-3 rlang_1.1.7 cachem_1.1.0
## [29] xfun_0.52 sass_0.4.10 cli_3.6.5 withr_3.0.2
## [33] magrittr_2.0.3 Rdpack_2.6.4 digest_0.6.37 grid_4.5.1
## [37] lifecycle_1.0.5 nlme_3.1-168 reformulas_0.4.1 vctrs_0.6.5
## [41] evaluate_1.0.4 glue_1.8.0 farver_2.1.2 rmarkdown_2.29
## [45] tools_4.5.1 pkgconfig_2.0.3 htmltools_0.5.8.1
Need a high-speed mirror for your open-source project?
Contact our mirror admin team at info@clientvps.com.
This archive is provided as a free public service to the community.
Proudly supported by infrastructure from VPSPulse , RxServers , BuyNumber , UnitVPS , OffshoreName and secure payment technology by ArionPay.