## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
    collapse = TRUE,
    comment = "#>"
)

## -----------------------------------------------------------------------------
library(metasnf)

class(cort_t)

dim(cort_t)

str(cort_t[1:5, 1:5])

cort_t[1:5, 1:5]

## -----------------------------------------------------------------------------
dim(income)

str(income[1:5, ])

income[1:5, ]

## -----------------------------------------------------------------------------
data <- list(
    anxiety,
    depress,
    cort_t,
    cort_sa,
    subc_v,
    income,
    pubertal
)

# The number of rows in each dataframe:
lapply(data, dim)

# Whether or not each dataframe has missing values:
lapply(data,
    function(x) {
        any(is.na(x))
    }
)

## -----------------------------------------------------------------------------
complete_uids <- get_complete_uids(data, uid = "unique_id")

print(length(complete_uids))

# Reducing dataframes to only common subjects with no missing data
anxiety <- anxiety[anxiety$"unique_id" %in% complete_uids, ]
depress <- depress[depress$"unique_id" %in% complete_uids, ]
cort_t <- cort_t[cort_t$"unique_id" %in% complete_uids, ]
cort_sa <- cort_sa[cort_sa$"unique_id" %in% complete_uids, ]
subc_v <- subc_v[subc_v$"unique_id" %in% complete_uids, ]
income <- income[income$"unique_id" %in% complete_uids, ]
pubertal <- pubertal[pubertal$"unique_id" %in% complete_uids, ]

## -----------------------------------------------------------------------------
# Note that you do not need to explicitly name every single named element
# (data = ..., name = ..., etc.)
data_list <- generate_data_list(
    list(
        data = cort_t,
        name = "cortical_thickness",
        domain = "neuroimaging",
        type = "continuous"
    ),
    list(
        data = cort_sa,
        name = "cortical_surface_area",
        domain = "neuroimaging",
        type = "continuous"
    ),
    list(
        data = subc_v,
        name = "subcortical_volume",
        domain = "neuroimaging",
        type = "continuous"
    ),
    list(
        data = income,
        name = "household_income",
        domain = "demographics",
        type = "continuous"
    ),
    list(
        data = pubertal,
        name = "pubertal_status",
        domain = "demographics",
        type = "continuous"
    ),
    uid = "unique_id"
)

## -----------------------------------------------------------------------------
summarize_dl(data_list)

## -----------------------------------------------------------------------------
target_list <- generate_data_list(
    list(anxiety, "anxiety", "behaviour", "ordinal"),
    list(depress, "depressed", "behaviour", "ordinal"),
    uid = "unique_id"
)

summarize_dl(target_list)

## -----------------------------------------------------------------------------
set.seed(42)
settings_matrix <- generate_settings_matrix(
    data_list,
    nrow = 20,
    min_k = 20,
    max_k = 50
)

settings_matrix[1:5, ]

## -----------------------------------------------------------------------------
solutions_matrix <- batch_snf(data_list, settings_matrix)

colnames(solutions_matrix)[1:30]

## -----------------------------------------------------------------------------
cluster_solutions <- get_cluster_solutions(solutions_matrix)

head(cluster_solutions)

## -----------------------------------------------------------------------------
solutions_matrix_aris <- calc_aris(solutions_matrix)

## -----------------------------------------------------------------------------
meta_cluster_order <- get_matrix_order(solutions_matrix_aris)

# Just a vector of numbers
meta_cluster_order

## ----eval = FALSE-------------------------------------------------------------
#  ari_hm <- adjusted_rand_index_heatmap(
#      solutions_matrix_aris,
#      order = meta_cluster_order
#  )
#  
#  save_heatmap(
#      heatmap = ari_hm,
#      path = "./adjusted_rand_index_heatmap.png",
#      width = 330,
#      height = 300,
#      res = 100
#  )

## ----eval = FALSE-------------------------------------------------------------
#  shiny_annotator(ari_hm)

## ----eval = FALSE-------------------------------------------------------------
#  split_vec <- c(2, 5, 12, 17)
#  
#  ari_mc_hm <- adjusted_rand_index_heatmap(
#      solutions_matrix_aris,
#      order = meta_cluster_order,
#      split_vector = split_vec
#  )
#  
#  save_heatmap(
#      heatmap = ari_mc_hm,
#      path = "./ari_mc_hm.png",
#      width = 330,
#      height = 300,
#      res = 100
#  )

## -----------------------------------------------------------------------------
# Only looking at our out-of-model p-values
extended_solutions_matrix <- extend_solutions(
    solutions_matrix,
    target_list = target_list
)

# What columns have been added?
old_cols <- colnames(extended_solutions_matrix) %in% colnames(solutions_matrix)
new_cols <- !old_cols

head(extended_solutions_matrix[, new_cols])

# Re-running to calculate the p-value for every single input and out-of-model
# feature:
extended_solutions_matrix <- extend_solutions(
    solutions_matrix,
    data_list = data_list,
    target_list = target_list
)

## ----eval = FALSE-------------------------------------------------------------
#  # Also would work, but without any summary p-values
#  extended_solutions_matrix <- extend_solutions(
#      solutions_matrix,
#      data_list = c(data_list, target_list)
#  )
#  
#  # Also would work, but now every feature would be part of the summaries
#  extended_solutions_matrix <- extend_solutions(
#      solutions_matrix,
#      target_list = c(data_list, target_list)
#  )

## ----eval = FALSE-------------------------------------------------------------
#  annotated_ari_hm <- adjusted_rand_index_heatmap(
#      solutions_matrix_aris,
#      order = meta_cluster_order,
#      split_vector = split_vec,
#      data = extended_solutions_matrix,
#      top_hm = list(
#          "Depression p-value" = "cbcl_depress_r_pval",
#          "Anxiety p-value" = "cbcl_anxiety_r_pval",
#          "Overall outcomes p-value" = "mean_pval"
#      ),
#      bottom_bar = list(
#          "Number of Clusters" = "nclust"
#      ),
#      annotation_colours = list(
#          "Depression p-value" = colour_scale(
#              extended_solutions_matrix$"cbcl_depress_r_pval",
#              min_colour = "purple",
#              max_colour = "black"
#          ),
#          "Anxiety p-value" = colour_scale(
#              extended_solutions_matrix$"cbcl_anxiety_r_pval",
#              min_colour = "green",
#              max_colour = "black"
#          ),
#          "Overall outcomes p-value" = colour_scale(
#              extended_solutions_matrix$"mean_pval",
#              min_colour = "lightblue",
#              max_colour = "black"
#          )
#      )
#  )
#  
#  save_heatmap(
#      heatmap = annotated_ari_hm,
#      path = "./annotated_ari_hm.png",
#      width = 700,
#      height = 500,
#      res = 100
#  )

## ----eval = FALSE-------------------------------------------------------------
#  extended_solutions_matrix2 <- extended_solutions_matrix |>
#      dplyr::mutate(
#          key_subjects_cluster_together = dplyr::case_when(
#              subject_NDAR_INVLF3TNDUZ == subject_NDAR_INVLDQH8ATK ~ TRUE,
#              TRUE ~ FALSE
#          )
#      )
#  
#  annotated_ari_hm2 <- adjusted_rand_index_heatmap(
#      solutions_matrix_aris,
#      order = meta_cluster_order,
#      split_vector = split_vec,
#      data = extended_solutions_matrix2,
#      top_hm = list(
#          "Depression p-value" = "cbcl_depress_r_pval",
#          "Anxiety p-value" = "cbcl_anxiety_r_pval",
#          "Key Subjects Clustered Together" = "key_subjects_cluster_together"
#      ),
#      bottom_bar = list(
#          "Number of Clusters" = "nclust"
#      ),
#      annotation_colours = list(
#          "Depression p-value" = colour_scale(
#              extended_solutions_matrix$"cbcl_depress_r_pval",
#              min_colour = "purple",
#              max_colour = "black"
#          ),
#          "Anxiety p-value" = colour_scale(
#              extended_solutions_matrix$"cbcl_anxiety_r_pval",
#              min_colour = "purple",
#              max_colour = "black"
#          ),
#          "Key Subjects Clustered Together" = c(
#              "TRUE" = "blue",
#              "FALSE" = "black"
#          )
#      )
#  )
#  
#  save_heatmap(
#      heatmap = annotated_ari_hm2,
#      path = "./annotated_ari_hm2.png",
#      width = 700,
#      height = 500,
#      res = 100
#  )

## ----eval = FALSE-------------------------------------------------------------
#  rep_solutions <- get_representative_solutions(
#      solutions_matrix_aris,
#      split_vector = split_vec,
#      order = meta_cluster_order,
#      extended_solutions_matrix
#  )
#  
#  mc_manhattan <- mc_manhattan_plot(
#      rep_solutions,
#      data_list = data_list,
#      target_list = target_list,
#      hide_x_labels = TRUE,
#      point_size = 2,
#      text_size = 12,
#      threshold = 0.05,
#      neg_log_pval_thresh = 5
#  )
#  
#  ggplot2::ggsave(
#      "mc_manhattan.png",
#      mc_manhattan,
#      height = 4,
#      width = 8,
#      dpi = 100
#  )

## ----eval = FALSE-------------------------------------------------------------
#  rep_solutions_no_cort <- dplyr::select(
#      rep_solutions,
#      -dplyr::contains("mrisdp")
#  )
#  
#  mc_manhattan2 <- mc_manhattan_plot(
#      rep_solutions_no_cort,
#      data_list = data_list,
#      target_list = target_list,
#      point_size = 4,
#      threshold = 0.01,
#      text_size = 12,
#      domain_colours = c(
#          "neuroimaging" = "cadetblue",
#          "demographics" = "purple",
#          "behaviour" = "firebrick"
#      )
#  )
#  mc_manhattan2
#  
#  ggplot2::ggsave(
#      "mc_manhattan2.png",
#      mc_manhattan2,
#      height = 8,
#      width = 12,
#      dpi = 100
#  )

## ----eval = FALSE-------------------------------------------------------------
#  sm_hm <- settings_matrix_heatmap(
#      settings_matrix,
#      order = meta_cluster_order
#  )
#  
#  save_heatmap(
#      heatmap = sm_hm,
#      path = "./settings_matrix_heatmap_ordered.png",
#      width = 400,
#      height = 500,
#      res = 75
#  )

## ----eval = FALSE-------------------------------------------------------------
#  annotated_ari_hm3 <- adjusted_rand_index_heatmap(
#      solutions_matrix_aris,
#      order = meta_cluster_order,
#      split_vector = c(11, 14),
#      data = extended_solutions_matrix,
#      top_hm = list(
#          "Depression p-value" = "cbcl_depress_r_pval",
#          "Anxiety p-value" = "cbcl_anxiety_r_pval",
#          "Key Subjects Clustered Together" = "key_subjects_cluster_together"
#      ),
#      left_hm = list(
#          "Clustering Algorithm" = "clust_alg" # from the original settings
#      ),
#      bottom_bar = list(
#          "Number of Clusters" = "nclust" # also from the original settings
#      ),
#      annotation_colours = list(
#          "Depression p-value" = colour_scale(
#              extended_solutions_matrix$"cbcl_depress_r_pval",
#              min_colour = "purple",
#              max_colour = "black"
#          ),
#          "Anxiety p-value" = colour_scale(
#              extended_solutions_matrix$"cbcl_anxiety_r_pval",
#              min_colour = "purple",
#              max_colour = "black"
#          ),
#          "Key Subjects Clustered Together" = c(
#              "TRUE" = "blue",
#              "FALSE" = "black"
#          )
#      )
#  )

## -----------------------------------------------------------------------------
batch_snf_results <- batch_snf(
    data_list,
    settings_matrix,
    return_similarity_matrices = TRUE
)

solutions_matrix <- batch_snf_results$"solutions_matrix"
similarity_matrices <- batch_snf_results$"similarity_matrices"

## ----eval = FALSE-------------------------------------------------------------
#  silhouette_scores <- calculate_silhouettes(
#      solutions_matrix,
#      similarity_matrices
#  )
#  
#  dunn_indices <- calculate_dunn_indices(
#      solutions_matrix,
#      similarity_matrices
#  )
#  
#  db_indices <- calculate_db_indices(
#      solutions_matrix,
#      similarity_matrices
#  )

## -----------------------------------------------------------------------------
extended_solutions_matrix <- extend_solutions(solutions_matrix, target_list)

colnames(extended_solutions_matrix)[1:25]

# Looking at the newly added columns
head(no_subs(extended_solutions_matrix))

## -----------------------------------------------------------------------------
target_pvals <- get_pvals(extended_solutions_matrix)

head(target_pvals)

## ----eval = FALSE-------------------------------------------------------------
#  pval_hm <- pval_heatmap(target_pvals, order = meta_cluster_order)
#  
#  save_heatmap(
#      heatmap = pval_hm,
#      path = "./pval_heatmap_ordered.png",
#      width = 400,
#      height = 500,
#      res = 100
#  )

## -----------------------------------------------------------------------------
# All the subjects present in all dataframes with no NAs
all_subjects <- data_list[[1]]$"data"$"subjectkey"

# Remove the "subject_" prefix to allow merges with the original data
all_subjects <- gsub("subject_", "", all_subjects)

# Dataframe assigning 80% of subjects to train and 20% to test
assigned_splits <- train_test_assign(train_frac = 0.8, subjects = all_subjects)

# Pulling the training and testing subjects specifically
train_subs <- assigned_splits$"train"
test_subs <- assigned_splits$"test"

# Partition a training set
train_cort_t <- cort_t[cort_t$"unique_id" %in% train_subs, ]
train_cort_sa <- cort_sa[cort_sa$"unique_id" %in% train_subs, ]
train_subc_v <- subc_v[subc_v$"unique_id" %in% train_subs, ]
train_income <- income[income$"unique_id" %in% train_subs, ]
train_pubertal <- pubertal[pubertal$"unique_id" %in% train_subs, ]
train_anxiety <- anxiety[anxiety$"unique_id" %in% train_subs, ]
train_depress <- depress[depress$"unique_id" %in% train_subs, ]

# Partition a test set
test_cort_t <- cort_t[cort_t$"unique_id" %in% test_subs, ]
test_cort_sa <- cort_sa[cort_sa$"unique_id" %in% test_subs, ]
test_subc_v <- subc_v[subc_v$"unique_id" %in% test_subs, ]
test_income <- income[income$"unique_id" %in% test_subs, ]
test_pubertal <- pubertal[pubertal$"unique_id" %in% test_subs, ]
test_anxiety <- anxiety[anxiety$"unique_id" %in% test_subs, ]
test_depress <- depress[depress$"unique_id" %in% test_subs, ]

# A data list with just training subjects
train_data_list <- generate_data_list(
    list(train_cort_t, "cortical_thickness", "neuroimaging", "continuous"),
    list(train_cort_sa, "cortical_sa", "neuroimaging", "continuous"),
    list(train_subc_v, "subcortical_volume", "neuroimaging", "continuous"),
    list(train_income, "household_income", "demographics", "continuous"),
    list(train_pubertal, "pubertal_status", "demographics", "continuous"),
    uid = "unique_id"
)

# A data list with training and testing subjects
full_data_list <- generate_data_list(
    list(cort_t, "cortical_thickness", "neuroimaging", "continuous"),
    list(cort_sa, "cortical_surface_area", "neuroimaging", "continuous"),
    list(subc_v, "subcortical_volume", "neuroimaging", "continuous"),
    list(income, "household_income", "demographics", "continuous"),
    list(pubertal, "pubertal_status", "demographics", "continuous"),
    uid = "unique_id"
)

# Construct the target lists
train_target_list <- generate_data_list(
    list(train_anxiety, "anxiety", "behaviour", "ordinal"),
    list(train_depress, "depressed", "behaviour", "ordinal"),
    uid = "unique_id"
)

# Find a clustering solution in your training data
set.seed(42)
settings_matrix <- generate_settings_matrix(
    train_data_list,
    nrow = 5,
    min_k = 10,
    max_k = 30
)

train_solutions_matrix <- batch_snf(
    train_data_list,
    settings_matrix
)

extended_solutions_matrix <- extend_solutions(
    train_solutions_matrix,
    train_target_list
)

extended_solutions_matrix |> colnames()

# The fifth row had the lowest minimum p-value across our outcomes
lowest_min_pval <- min(extended_solutions_matrix$"min_pval")
which(extended_solutions_matrix$"min_pval" == lowest_min_pval)

# Keep track of your top solution
top_row <- extended_solutions_matrix[4, ]

# Use the solutions matrix from the training subjects and the data list from
# the training and testing subjects to propagate labels to the test subjects
propagated_labels <- lp_solutions_matrix(top_row, full_data_list)

head(propagated_labels)
tail(propagated_labels)

## -----------------------------------------------------------------------------
propagated_labels_all <- lp_solutions_matrix(
    extended_solutions_matrix,
    full_data_list
)

head(propagated_labels_all)
tail(propagated_labels_all)