# Rattle Scripts: From dataset ds build a kmeans cluster.
#
# Copyright (C) 2023-2024, Togaware Pty Ltd.
#
# License: GNU General Public License, Version 3 (the "License")
# https://www.gnu.org/licenses/gpl-3.0.en.html
#
# Time-stamp: <Wednesday 2025-05-14 17:05:39 +1000 Graham Williams>
#
# Licensed under the GNU General Public License, Version 3 (the "License");
#
# This program is free software: you can redistribute it and/or modify it under
# the terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
# FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
# details.
#
# You should have received a copy of the GNU General Public License along with
# this program.  If not, see <https://www.gnu.org/licenses/>.
#
# Author: Graham Williams

# Cluster using KMeans
#
# <TIMESTAMP>
#
# References:
#
# @williams:2017:essentials.
# https://survivor.togaware.com/datascience/cluster-analysis.html
# https://survivor.togaware.com/datascience/ for further details.

# Reset the random number seed to obtain the same results each
# time.
#
# Note that RattleV5 did not reset the seed so that we can demonstrate
# that each time we get a different random start and then a different
# model. For RattleV6 we will reset the seed and students can explore
# the effect of changing the seed by resetting the seed in
# SETTINGS. (gjw 20241012)
##
## Note that, as in dataset_template.R, we need to account for the
## SETTING that says to use a different random seed each time.
## (gjw 20250409)

if (! <RANDOM_PARTITION>) {
  # Fix the seed only when the random setting above is switched off,
  # so that repeated runs start from the same random state.
  set.seed(<RANDOM_SEED>)
}

# Load required packages from the local library into the R session.

# The 'reshape' package provides the 'rescaler' function.

library(reshape)

# Labels identifying this model type; used below in the plot title.

mtype <- "kmeans"
mdesc <- "Cluster"

# Rescaling is usually recommended for cluster analysis. This flag
# records whether it was requested in the settings.

rescale <- <CLUSTER_RESCALE>

# Take rows `tr` and columns `numc` of the dataset and drop any
# observation that has a missing value.

tds <- na.omit(ds[tr, numc])

# When requested, apply reshape::rescaler() with type "range" to each
# column in turn. Otherwise the complete cases are used as they are.

if (rescale) {
  tds <- sapply(tds, reshape::rescaler, "range")
}

# Build the kmeans model using the number of clusters and the number
# of random starts chosen in the settings.

# Fit the model on the prepared data. `centers` and `nstart` are
# filled in by the template (presumably the number of clusters and the
# number of random starts -- confirm against the settings that
# populate them).
model_kmeans <- kmeans(tds,
                       centers = <CLUSTER_NUM>,
                       nstart  = <CLUSTER_RUN>)

# Report on the cluster characteristics. Each item is printed to the
# console in turn.

# Cluster sizes, printed as a single space separated string:

print(paste(model_kmeans$size, collapse=' '))

# Column means of the data that was clustered (after any rescaling),
# for comparison with the cluster centers:

print(colMeans(tds))

# Cluster centers, one row per cluster:

print(model_kmeans$centers)

# Within cluster sum of squares, one value per cluster:

print(model_kmeans$withinss)

# Blank line to separate the report from any following output.
cat("\n")

# Plot the first two principal components, which serve as discriminant
# coordinates. We convert tds to a matrix if it's not already.

# Coerce the clustered data to a matrix before plotting, then draw the
# plot into an SVG device which is closed once the plot is complete.

tds_matrix <- as.matrix(tds)

svg("<TEMPDIR>/model_cluster_discriminant.svg")
cluster::clusplot(
  x      = tds_matrix,
  clus   = model_kmeans$cluster,
  lines  = 0,
  shade  = TRUE,
  color  = TRUE,
  labels = 2,
  main   = paste(mdesc, "Discriminant Coordinates Plot -", mtype)
)
dev.off()

# Extract the corresponding cluster assignments and model specific
# names.

cluster_assignments <- model_kmeans$cluster
pair_file  <- "model_cluster_pairs_kmeans.svg"

# Ordering that groups the clustered observations by cluster number.

o <- order(model_kmeans$cluster)

# Build a table of identifier(s) against assigned cluster, but only
# when an identifier is defined and the selected rows number fewer
# than 300, to keep the printed output manageable.
#
# ds[tr, ] always has rows, whether ds is a data.frame or a tibble,
# so nrow() here never returns NULL.

if (! is.null(identifier) && nrow(ds[tr, ]) < 300) {

  # The data that was clustered had rows with missing values removed
  # by na.omit(), so the cluster vector only covers the complete
  # cases. Keep the same rows here so that each identifier lines up
  # with its own cluster assignment.

  used <- complete.cases(ds[tr, numc, drop = FALSE])

  ids <- data.frame(ds[tr, ][used, ][o, identifier, drop = FALSE],
                    model_kmeans$cluster[o])
  names(ids) <- c(identifier, "cluster")

} else {

  ids <- NULL

}

if (! is.null(ids)) print(ids)
