% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/best_practice.R
\name{bp}
\alias{bp}
\alias{bp_extract_signatures}
\alias{bp_extract_signatures_iter}
\alias{bp_cluster_iter_list}
\alias{bp_get_clustered_sigs}
\alias{bp_get_sig_obj}
\alias{bp_get_stats}
\alias{bp_get_rank_score}
\alias{bp_show_survey2}
\alias{bp_show_survey}
\alias{bp_attribute_activity}
\title{A Best Practice for Signature Extraction and Exposure (Activity) Attribution}
\usage{
bp_extract_signatures(
  nmf_matrix,
  range = 2:5,
  n_bootstrap = 20L,
  n_nmf_run = 50,
  RTOL = 0.001,
  min_contribution = 0,
  cores = min(4L, future::availableCores()),
  cores_solution = min(cores, length(range)),
  seed = 123456L,
  handle_hyper_mutation = TRUE,
  report_integer_exposure = FALSE,
  only_core_stats = nrow(nmf_matrix) > 100,
  cache_dir = file.path(tempdir(), "sigminer_bp"),
  keep_cache = FALSE,
  pynmf = FALSE,
  use_conda = TRUE,
  py_path = "/Users/wsx/anaconda3/bin/python"
)

bp_extract_signatures_iter(
  nmf_matrix,
  range = 2:5,
  sim_threshold = 0.95,
  max_iter = 10L,
  n_bootstrap = 20L,
  n_nmf_run = 50,
  RTOL = 0.001,
  min_contribution = 0,
  cores = min(4L, future::availableCores()),
  cores_solution = min(cores, length(range)),
  seed = 123456L,
  handle_hyper_mutation = TRUE,
  report_integer_exposure = FALSE,
  only_core_stats = nrow(nmf_matrix) > 100,
  cache_dir = file.path(tempdir(), "sigminer_bp"),
  keep_cache = FALSE,
  pynmf = FALSE,
  use_conda = FALSE,
  py_path = "/Users/wsx/anaconda3/bin/python"
)

bp_cluster_iter_list(x, k = NULL, include_final_iteration = TRUE)

bp_get_clustered_sigs(SigClusters, cluster_label)

bp_get_sig_obj(obj, signum = NULL)

bp_get_stats(obj)

bp_get_rank_score(obj)

bp_show_survey2(
  obj,
  x = "signature_number",
  left_y = "silhouette",
  right_y = "L2_error",
  left_name = left_y,
  right_name = right_y,
  left_color = "black",
  right_color = "red",
  left_shape = 16,
  right_shape = 18,
  shape_size = 4,
  highlight = NULL
)

bp_show_survey(
  obj,
  add_score = FALSE,
  scales = c("free_y", "free"),
  fixed_ratio = TRUE
)

bp_attribute_activity(
  input,
  sample_class = NULL,
  nmf_matrix = NULL,
  method = c("bt", "stepwise"),
  bt_use_prop = FALSE,
  return_class = c("matrix", "data.table"),
  use_parallel = FALSE,
  cache_dir = file.path(tempdir(), "sigminer_attribute_activity"),
  keep_cache = FALSE
)
}
\arguments{
\item{nmf_matrix}{a \code{matrix} used for NMF decomposition with rows indicate samples and columns indicate components.}

\item{range}{a \code{numeric} vector containing the ranks of factorization to try. Note that duplicates are removed
and values are sorted in increasing order. The results are notably returned in this order.}

\item{n_bootstrap}{number of bootstrapped (resampling) catalogs used.
When it is \code{0}, the original (input) mutation catalog is used for NMF decomposition,
this is not recommended, just for testing, user should not set it to \code{0}.}

\item{n_nmf_run}{number of NMF runs for each bootstrapped or original catalog.
At default, in total n_bootstrap x n_nmf_run (i.e. 1000) NMF runs are used
for the task.}

\item{RTOL}{a threshold proposed by Nature Cancer paper to control how to
filter solutions of NMF. Default is \verb{0.1\%} (from reference #2),
only NMF solutions with KLD (KL deviance) <= \verb{100.1\%} minimal KLD are kept.}

\item{min_contribution}{a component contribution threshold to filer out small
contributed components.}

\item{cores}{number of cpu cores to run NMF.}

\item{cores_solution}{cores for processing solutions, default is equal to argument \code{cores}.}

\item{seed}{a random seed to make reproducible result.}

\item{handle_hyper_mutation}{default is \code{TRUE}, handle hyper-mutant samples.}

\item{report_integer_exposure}{if \code{TRUE}, report integer signature
exposure by bootstrapping technique.}

\item{only_core_stats}{if \code{TRUE}, only calculate the core stats for signatures and samples.}

\item{cache_dir}{a directory for keep temp result files.}

\item{keep_cache}{if \code{TRUE}, keep cache results.}

\item{pynmf}{if \code{TRUE}, use Python NMF driver \href{http://nimfa.biolab.si/index.html}{Nimfa}.
The seed currently is not used by this implementation, so the only way to reproduce
your result is setting \code{keep_cache = TRUE}.}

\item{use_conda}{if \code{TRUE}, create an independent conda environment to run NMF.}

\item{py_path}{path to Python executable file, e.g. '/Users/wsx/anaconda3/bin/python'. In my
test, it is more stable than \code{use_conda=TRUE}. You can install the Nimfa package by yourself
or set \code{use_conda} to \code{TRUE} to install required Python environment, and then set this option.}

\item{sim_threshold}{a similarity threshold for selecting samples to auto-rerun
the extraction procedure (i.e. \code{bp_extract_signatures()}), default is \code{0.95}.}

\item{max_iter}{the maximum iteration size, default is 10, i.e., at most run
the extraction procedure 10 times.}

\item{x}{result from \code{\link[=bp_extract_signatures_iter]{bp_extract_signatures_iter()}} or a list of
\code{Signature} objects.}

\item{k}{an integer sequence specifying the cluster number to get silhouette.}

\item{include_final_iteration}{if \code{FALSE}, exclude final iteration result
from clustering for input from \code{\link[=bp_extract_signatures_iter]{bp_extract_signatures_iter()}}, not applied
if input is a list of \code{Signature} objects.}

\item{SigClusters}{result from \code{\link[=bp_cluster_iter_list]{bp_cluster_iter_list()}}.}

\item{cluster_label}{cluster labels for a specified cluster number, obtain it
from \code{SigClusters$sil_df}.}

\item{obj}{a \code{ExtractionResult} object from \code{\link[=bp_extract_signatures]{bp_extract_signatures()}}.}

\item{signum}{a integer vector to extract the corresponding \code{Signature} object(s).
If it is \code{NULL} (default), all will be returned.}

\item{left_y}{column name for left y axis.}

\item{right_y}{column name for right y axis.}

\item{left_name}{label name for left y axis.}

\item{right_name}{label name for right y axis.}

\item{left_color}{color for left axis.}

\item{right_color}{color for right axis.}

\item{left_shape}{shape setting.}

\item{right_shape}{shape setting.}

\item{shape_size}{shape setting.}

\item{highlight}{a \code{integer} to highlight a \code{x}.}

\item{add_score}{if \code{FALSE}, don't show score and label optimal points by
rank score.}

\item{scales}{one of "free_y" (default) and "free" to control the scales
of plot facet.}

\item{fixed_ratio}{if \code{TRUE} (default), make the x/y axis ratio fixed.}

\item{input}{result from \code{\link[=bp_extract_signatures]{bp_extract_signatures()}} or a Signature object.}

\item{sample_class}{a named string vector whose names are sample names
and values are class labels (i.e. cancer subtype). If it is \code{NULL} (the default),
treat all samples as one group.}

\item{method}{one of 'bt' (use bootstrap exposure median, from reference #2,
\strong{the most recommended way in my personal view}) or stepwise'
(stepwise reduce and update signatures then do signature fitting
with last signature sets, from reference #2, the result tends to assign
the contribution of removed signatures to the remaining signatures,
\strong{maybe I misunderstand the paper method? PAY ATTENTION}).}

\item{bt_use_prop}{this parameter is only used for \code{bt} method to reset
low contributing signature activity (relative activity \verb{<0.01}). If \code{TRUE},
use empirical P value calculation way (i.e. proportion, used by reference \verb{#2}),
otherwise a \code{t.test} is applied.}

\item{return_class}{string, 'matrix' or 'data.table'.}

\item{use_parallel}{if \code{TRUE}, use parallel computation based on \strong{furrr} package.
It can also be an integer for specifying cores.}
}
\value{
It depends on the called function.
}
\description{
These functions are combined to provide a best practice for optimally
identifying mutational signatures and attributing their activities (exposures)
in tumor samples. They are listed in order to use.
\itemize{
\item \code{bp_extract_signatures()} for extracting signatures.
\item \code{bp_show_survey()} for showing measures change under different
signature numbers to help user select optimal signature number.
At default, an aggregated score (named score) is generated to
suggest the best solution.
\item \code{bp_show_survey2()} for showing simplified signature number survey like
\code{\link[=show_sig_number_survey]{show_sig_number_survey()}}.
\item \code{bp_get_sig_obj()} for get a (list of) \code{Signature} object which is common
used in \strong{sigminer} for analysis and visualization.
\item \code{bp_attribute_activity()} for optimizing signature activities (exposures).
NOTE: the activities from extraction step may be better!
You can also use \link{sig_extract} to get optimal NMF result from multiple NMF runs.
Besides, you can use \link{sig_fit} to quantify exposures based on signatures extracted
from \code{bp_extract_signatures()}.
\item \code{bp_extract_signatures_iter()} for extracting signature in a iteration way.
\item \code{bp_cluster_iter_list()} for clustering (\code{hclust} with average linkage)
iterated signatures to help collapse
multiple signatures into one. The result cluster can be visualized by
\code{plot()} or \code{factoextra::fviz_dend()}.
\item \code{bp_get_clustered_sigs()} for getting clustered (grouped) mean signatures from signature clusters.
\item Extra: \code{bp_get_stats}() for obtaining stats for signatures and samples of a solution.
These stats are aggregated (averaged) as the stats for a solution
(specific signature number).
\item Extra: \code{bp_get_rank_score()} for obtaining rank score for all signature numbers.
}
}
\details{
The signature extraction approach is adopted from reference #1, #2, and
the whole best practice is adopted from the pipeline used by reference #3.
I implement the whole procedure with R code based on the method description
of papers. The code is well organized, tested and documented so user will
find it pretty simple and useful. Besides, the structure of the results is
very clear to see and also visualize like other approaches provided by \strong{sigminer}.
}
\section{Measure Explanation in Survey Plot}{

The survey plot provides a pretty good way to facilitate the signature number
selection. A \code{score} measure is calculated as the weighted mean of selected
measures and visualized as the first sub-plot. The optimal number is highlighted
with red color dot and the best values for each measures are also
highlighted with orange color dots. The detail of 6 measures shown in plot are
explained as below.
\itemize{
\item \code{score} - an aggregated score based on rank scores from selected measures below.
The higher, the better. When two signature numbers have the same score,
the larger signature number is preferred (this is a rare situation, you
have to double check other measures).
\item \code{silhouette} - the average silhouette width for signatures, also named as ASW in reference #2.
The signature number with silhouette decreases sharply is preferred.
\item \code{distance} - the average sample reconstructed cosine distance, the lower value is better.
\item \code{error} - the average sample reconstructed error calculated with L2 formula
(i.e. L2 error). This lower value is better. This measure represents a
similar concept like \code{distance} above, they are all used to quantify how well
sample mutation profiles can be reconstructed from signatures, but \code{distance}
cares the whole mutation profile similarity while \code{error} here cares value difference.
\item \verb{pos cor} - the average positive signature exposure correlation coefficient.
The lower value is better. This measure is constructed based on my understanding
about signatures: mutational signatures are typically treated as independent
recurrent patterns, so their activities are less correlated.
\item \code{similarity} - the average similarity within in a signature cluster.
Like \code{silhouette}, the point decreases sharply is preferred.
In the practice, results from multiple NMF runs are clustered
with "clustering with match" algorithm proposed by reference #2. This value
indicates if the signature profiles extracted from different NMF runs are similar.
}
}

\examples{
data("simulated_catalogs")
\donttest{
# Here I reduce the values for n_bootstrap and n_nmf_run
# for reducing the run time.
# In practice, you should keep default or increase the values
# for better estimation.
#
# The input data here is simulated from 10 mutational signatures
e1 <- bp_extract_signatures(
  t(simulated_catalogs$set1),
  range = 8:12,
  n_bootstrap = 5,
  n_nmf_run = 10
)


# See the survey for different signature numbers
# The suggested solution is marked as red dot
# with highest integrated score.
p1 <- bp_show_survey(e1)
p1
# You can also exclude plotting and highlighting the score
p2 <- bp_show_survey(e1, add_score = FALSE)
p2

# You can also plot a simplified version
p3 <- bp_show_survey2(e1, highlight = 10)
p3

# Obtain the suggested solution from extraction result
obj_suggested <- bp_get_sig_obj(e1, e1$suggested)
obj_suggested
# If you think the suggested signature number is not right
# Just pick up the solution you want
obj_s8 <- bp_get_sig_obj(e1, 8)

# Track the reconstructed profile similarity
rec_sim <- get_sig_rec_similarity(obj_s8, t(simulated_catalogs$set1))
rec_sim

# After extraction, you can assign the signatures
# to reference COSMIC signatures
# More see ?get_sig_similarity
sim <- get_sig_similarity(obj_suggested)
# Visualize the match result
if (require(pheatmap)) {
  pheatmap::pheatmap(sim$similarity)
}

# You already got the activities of signatures
# in obj_suggested, however, you can still
# try to optimize the result.
# NOTE: the optimization step may not truly optimize the result!
expo <- bp_attribute_activity(e1, return_class = "data.table")
expo$abs_activity

# Iterative extraction:
# This procedure will rerun extraction step
# for those samples with reconstructed catalog similarity
# lower than a threshold (default is 0.95)
e2 <- bp_extract_signatures_iter(
  t(simulated_catalogs$set1),
  range = 9:11,
  n_bootstrap = 5,
  n_nmf_run = 5,
  sim_threshold = 0.99
)
e2
# When the procedure run multiple rounds
# you can cluster the signatures from different rounds by
# the following command
# bp_cluster_iter_list(e2)

## Extra utilities
rank_score <- bp_get_rank_score(e1)
rank_score
stats <- bp_get_stats(e2$iter1)
# Get the mean reconstructed similarity
1 - stats$stats_sample$cosine_distance_mean
}
}
\references{
Alexandrov, Ludmil B., et al. "Deciphering signatures of mutational processes operative in human cancer." Cell reports 3.1 (2013): 246-259.

Degasperi, Andrea, et al. "A practical framework and online tool for mutational signature analyses show intertissue variation and driver dependencies." Nature cancer 1.2 (2020): 249-263.

Alexandrov, Ludmil B., et al. “The repertoire of mutational signatures in human cancer.” Nature 578.7793 (2020): 94-101.
}
\seealso{
See \link{sig_estimate}, \link{sig_extract}, \link{sig_auto_extract},
\link{sigprofiler_extract} for other approaches.
}
\author{
Shixiang Wang \href{mailto:w_shixiang@163.com}{w_shixiang@163.com}
}
