## ----include = FALSE----------------------------------------------------------
# Chunk options for rendering: collapse source and output into one block and
# prefix printed output with "#>".
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## -----------------------------------------------------------------------------
# Load the package
library(metasnf)

# Start by making a data list containing all our dataframes to more easily
# identify subjects without missing data
full_data_list <- generate_data_list(
  list(subc_v, "subcortical_volume", "neuroimaging", "continuous"),
  list(income, "household_income", "demographics", "continuous"),
  list(pubertal, "pubertal_status", "demographics", "continuous"),
  list(anxiety, "anxiety", "behaviour", "ordinal"),
  list(depress, "depressed", "behaviour", "ordinal"),
  uid = "unique_id"
)

# Partition into a data and target list (optional).
# Elements 1-3 are the clustering inputs; elements 4-5 (anxiety, depressed)
# are held out as targets.
data_list <- full_data_list[1:3]
target_list <- full_data_list[4:5]

# Specifying 5 different sets of settings for SNF
# (seeded so the generated settings are reproducible)
set.seed(42)
settings_matrix <- generate_settings_matrix(
  data_list,
  nrow = 5,
  max_k = 40
)

# This matrix has clustering solutions for each of the 5 SNF runs!
solutions_matrix <- batch_snf(data_list, settings_matrix)

# Extend the solutions with the held-out target features, passing
# "fisher_exact" as the categorical test
extended_solutions <- extend_solutions(
  solutions_matrix,
  target_list,
  cat_test = "fisher_exact"
)

## ----eval = FALSE-------------------------------------------------------------
# clust_esm_manhattan <- esm_manhattan_plot(
#   extended_solutions,
#   threshold = 0.05,
#   bonferroni_line = TRUE
# )
#
# ggplot2::ggsave(
#   "clust_esm_manhattan.png",
#   clust_esm_manhattan,
#   width = 5,
#   height = 5,
#   dpi = 100
# )

## -----------------------------------------------------------------------------
# Clustering algorithm assigned to each row of the settings matrix
settings_matrix$"clust_alg"

## -----------------------------------------------------------------------------
clust_algs_list <- generate_clust_algs_list()

# The default list:
clust_algs_list

# A prettier format:
summarize_clust_algs_list(clust_algs_list)

# Adding algorithms provided by the package
clust_algs_list <- generate_clust_algs_list(
  "two_cluster_spectral" = spectral_two,
  "five_cluster_spectral" = spectral_five
)

# Note that this one has the default algorithms as well as the newly added ones
summarize_clust_algs_list(clust_algs_list)

# This list has only the newly added ones, thanks to the disable_base parameter
clust_algs_list <- generate_clust_algs_list(
  "two_cluster_spectral" = spectral_two,
  "five_cluster_spectral" = spectral_five,
  disable_base = TRUE
)

summarize_clust_algs_list(clust_algs_list)

## -----------------------------------------------------------------------------
# This list adds three package-provided spectral variants. disable_base is not
# set here, so the default algorithms are kept alongside the new ones.
clust_algs_list <- generate_clust_algs_list(
  "two_cluster_spectral" = spectral_two,
  "three_cluster_spectral" = spectral_three,
  "five_cluster_spectral" = spectral_five
)

# Specifying 10 different sets of settings for SNF, drawing the clustering
# algorithm for each row from the custom list
set.seed(42)
settings_matrix <- generate_settings_matrix(
  data_list,
  nrow = 10,
  max_k = 40,
  clustering_algorithms = clust_algs_list
)

settings_matrix$"clust_alg"

## ----eval = FALSE-------------------------------------------------------------
# solutions_matrix <- batch_snf(
#   data_list,
#   settings_matrix,
#   clust_algs_list = clust_algs_list
# )

## ----eval = FALSE-------------------------------------------------------------
# # Default clustering algorithm #1
# spectral_eigen <- function(similarity_matrix) {
#   estimated_n <- estimate_nclust_given_graph(
#     W = similarity_matrix,
#     NUMC = 2:10
#   )
#   nclust_estimate <- estimated_n$`Eigen-gap best`
#   solution <- SNFtool::spectralClustering(
#     similarity_matrix,
#     nclust_estimate
#   )
#   nclust <- length(unique(solution))
#   solution_data <- list("solution" = solution, "nclust" = nclust)
#   if (nclust_estimate != nclust) {
#     warning(
#       "Spectral clustering provided a solution of size ", nclust,
#       " when the number requested based on the eigen-gap heuristic",
#       " was ", nclust_estimate, "."
#     )
#   }
#   return(solution_data)
# }
#
# # Default clustering algorithm #2
# spectral_rot <- function(similarity_matrix) {
#   estimated_n <- estimate_nclust_given_graph(
#     W = similarity_matrix,
#     NUMC = 2:10
#   )
#   nclust_estimate <- estimated_n$`Rotation cost best`
#   solution <- SNFtool::spectralClustering(
#     similarity_matrix,
#     nclust_estimate
#   )
#   nclust <- length(unique(solution))
#   solution_data <- list("solution" = solution, "nclust" = nclust)
#   if (nclust_estimate != length(unique(solution))) {
#     warning(
#       "Spectral clustering provided a solution of size ", nclust,
#       " when the number requested based on the rotation cost heuristic",
#       " was ", nclust_estimate, "."
#     )
#   }
#   return(solution_data)
# }

## -----------------------------------------------------------------------------
# Ask batch_snf to also return the similarity matrices so they can be
# clustered manually
batch_snf_results <- batch_snf(
  data_list,
  settings_matrix,
  clust_algs_list = clust_algs_list,
  return_similarity_matrices = TRUE
)

names(batch_snf_results)

solutions_matrix <- batch_snf_results$"solutions_matrix"

# Similarity matrices are in the list below:
similarity_matrices <- batch_snf_results$"similarity_matrices"

length(similarity_matrices)
dim(similarity_matrices[[1]])

# Your manual clustering goes here...

## ----eval = FALSE-------------------------------------------------------------
# library(dbscan)
# ## Example 1: use dbscan on the iris data set
# data(iris)
# iris <- as.matrix(iris[, 1:4])
# iris_dist <- dist(iris)
#
# ## Find suitable DBSCAN parameters:
# ## 1. We use minPts = dim + 1 = 5 for iris. A larger value can also be used.
# ## 2. We inspect the k-NN distance plot for k = minPts - 1 = 4
# kNNdistplot(iris, minPts = 5)
#
# ## Noise seems to start around a 4-NN distance of .7
# abline(h=.7, col = "red", lty = 2)
#
# results <- dbscan(iris_dist, eps = 0.7, minPts = 5)
#
# # The 1 is added to ensure that those with no cluster (cluster 0) are still
# # plotted.
# pairs(iris, col = results$cluster + 1)

## ----fig.width = 5, fig.height = 4.5------------------------------------------
library(dbscan)
library(ggplot2)

# Build a data list from the expression and methylation data frames
data_list <- generate_data_list(
  list(
    data = expression_df,
    name = "genes_1_and_2_exp",
    domain = "gene_expression",
    type = "continuous"
  ),
  list(
    data = methylation_df,
    name = "genes_1_and_2_meth",
    domain = "gene_methylation",
    type = "continuous"
  ),
  uid = "patient_id"
)

set.seed(42)
settings_matrix <- generate_settings_matrix(
  data_list,
  nrow = 5
)

batch_snf_results <- batch_snf(
  data_list,
  settings_matrix,
  return_similarity_matrices = TRUE
)

similarity_matrices <- batch_snf_results$"similarity_matrices"
solutions_matrix <- batch_snf_results$"solutions_matrix"

# Work with the similarity matrix from the first settings row.
# NOTE(review): representative_sms is not used in the code visible here.
representative_sm <- similarity_matrices[[1]]
representative_sms <- similarity_matrices[c(1, 2)]

# Turn similarity into distance: subtracting from the maximum gives the most
# similar pairs the smallest distances
distance_matrix1 <- as.dist(
  max(representative_sm) - representative_sm
)

# DBSCAN parameters, named once so the diagnostic plot, the reference line and
# the clustering call cannot drift apart
dbscan_min_pts <- 10
dbscan_eps <- 0.4872

kNNdistplot(
  distance_matrix1,
  minPts = dbscan_min_pts
)

## Maybe there?
abline(h = dbscan_eps, col = "red", lty = 2)

dbscan_results <- dbscan(
  distance_matrix1,
  eps = dbscan_eps,
  minPts = dbscan_min_pts
)$"cluster"
spectral_results <- get_clusters(solutions_matrix[1, ])

# One row per observation: DBSCAN label against the label from the first SNF
# solution
dbscan_vs_spectral <- data.frame(
  dbscan = dbscan_results,
  spectral = spectral_results
)

# Jitter so that observations sharing the same pair of labels do not overplot
ggplot(dbscan_vs_spectral, aes(x = dbscan, y = spectral)) +
  geom_jitter(height = 0.1, width = 0.1, alpha = 0.5) +
  theme_bw()

## ----eval = FALSE-------------------------------------------------------------
# for (i in seq(0.485, 0.488, by = 0.0001)) {
#   results <- dbscan(distance_matrix1, eps = i, minPts = 10)
#   if (length(unique(results$"cluster")) == 3) {
#     print(i)
#   }
# }