#!/usr/bin/env Rscript # Written by Tomi Leppanen for Clustering Methods course on UEF, spring 2017 # Run like this (on Linux): # chmod +x random_swap.R # ./random_swap.R -d s1.txt -K 15 -g s1-cb.txt -i 100 \ # -c clusters.log -p partition.log -s run.log # -d (or --dataset) defines the dataset to use # -K (or --clusters) defines number of clusters # -t (or --ground_truth) defines ground truth file to use (optional) # -i (or --iterations) defines how many iterations K-means is run (default 5000) # -c (or --clusters) is filename to save clusters to (optional) # -p (or --partition) is filename to save partition to (optional) # # This can also be used inside R interpreter with source command. # In that case call random_swap_clustering(X, K, n_iteratons, ground_truth), # where X is data set as a matrix of row vectors, # K is number of clusters, # n_iterations is number of RS iterations (optional, default 5000) and # ground_truth is ground truth clusters similarly to X for CI calculations # (optional, CI is not calculated, if this is omitted). # This code calculates nMSE. # If used as a script, the code requires Xmisc package to parse command line # arguments, which will be installed if it is not found unless there is no # permission to install packages in which case message "unable to install # packages" is shown. In that case user can install it manually to user's own # directory like this: # # Run R # Command: install.packages('Xmisc') # Answer yes to using personal library and yes to creating personal library, if # asked. Also choose a suitable mirror. # Exit R, e.g. with command: q() # No need to save the workspace image, answer no optimal_partition <- function(X, C) { if (is.null(dim(X))) # Workaround if X has only one vector X <- array(X, c(1, length(X))) distances <- array(0, c(dim(X)[1], dim(C)[1])) for (i in 1:dim(C)[1]) { distances[, i] <- colSums((t(X)-C[i, ])^2) } return(apply(distances, 1, which.min)) } optimal_centroids <- function(X, P, k) { C <- array(0, c(k, dim(X)[2])) for (i in 1:k) { n <- sum(P == i) if (n > 1) C[i, ] <- colSums(X[P == i, ])/n else if (n == 1) C[i, ] <- X[P == i, ] } return(C) } kmeans <- function(X, C, P_initial, max_iterations=1000000) { C_prev <- array(0, dim(C)) iterations_left <- max_iterations while (C != C_prev && iterations_left > 0) { C_prev <- C P <- optimal_partition(X, C) C <- optimal_centroids(X, P, dim(C)[1]) iterations_left <- iterations_left - 1 } return(list("C"=C, "P"=P)) } random_centroids <- function(X, k) { return(X[sample(1:dim(X)[1], k), ]) } random_swap <- function(X, C) { C_new <- array(rep(C), dim(C)) j <- sample(1:dim(C)[1], 1) new <- X[sample(1:dim(X)[1], 1), ] # Pick a new vector if `new` is already in C while (any(apply(C, 1, "==", new))) new <- X[sample(1:dim(X)[1], 1), ] C_new[j, ] <- new return(list("C"=C_new, "j"=j)) } local_repartition <- function(X, C, P, j) { P_new <- rep(P) if (!any(P_new == j)) return(P_new) P_new[P_new == j] <- optimal_partition(X[P_new == j, ], C) for (i in 1:dim(C)[1]) { if (i == j || !any(P_new == i)) next; P_test <- optimal_partition(X[P_new == i, ], rbind(C[j, ], C[i, ])) P_new[P_new == i][P_test == 0] = j } return(P_new) } mse <- function(X, C, P) { return(mean(unlist((X-C[P, ])^2))) } centroid_index <- function(A, B) { A_nearest <- optimal_partition(A, B) B_nearest <- optimal_partition(B, A) A_orphan <- hist(B_nearest, dim(A)[1], plot=FALSE)$counts == 0 B_orphan <- hist(A_nearest, dim(B)[1], plot=FALSE)$counts == 0 return(max(c(sum(A_orphan), sum(B_orphan)))) } random_swap_clustering <- function(X, k, T=5000, ground_truth=NA) { C <- random_centroids(X, k) P <- optimal_partition(X, C) mse_best <- mse(X, C, P) for (i in 1:T) { t <- random_swap(X, C); C_new <- t$C; j <- t$j P_new <- local_repartition(X, C_new, P, j) t <- kmeans(X, C_new, P_new, 2); C_new <- t$C; P_new <- t$P mse_new <- mse(X, C_new, P_new) if (mse_new < mse_best) { C <- C_new; P <- P_new; mse_best <- mse_new if (length(ground_truth) > 1 || !is.na(ground_truth)) { ci <- centroid_index(C, ground_truth) message(sprintf("it: %d, mse: %f, ci: %d", i, mse_best, ci)) } else { message(sprintf("it: %d, mse: %f", i, mse_best)) } } } return(list("C"=C, "P"=P)) } main <- function() { if (!require(Xmisc, warn.conflicts=FALSE, quietly=TRUE)) { install.packages('Xmisc') require(Xmisc) } parser <- ArgumentParser$new() parser$add_description('Random Swap implemented in R') parser$add_argument('-d', '--dataset', type='character', required=TRUE, help='Dataset text file') parser$add_argument('-K', '--clusters', type='integer', required=TRUE, help="Number of clusters") parser$add_argument('-i', '--iterations', type='integer', default=5000) parser$add_argument('-t', '--ground_truth', default=NA, type='character', help="File to read ground truth from, used for calculating centroid index") parser$add_argument('-c', '--centroids', default=NA, type='character', help="File to save centroids to") parser$add_argument('-p', '--partition', default=NA, type='character', help="File to save partition to") parser$add_argument('-s', '--csv', default=NA, type='character', help="File to save statistics in CSV format") args <- parser$get_args() data <- as.matrix(read.table(args$dataset)) if (!is.na(args$ground_truth)) gt <- as.matrix(read.table(args$ground_truth)) else gt = NA temp <- random_swap_clustering(data, args$clusters, args$iterations, gt) C <- temp$C; P <- temp$P message("Finished clustering") mse_final <- mse(data, C, P) if (!is.na(args$ground_truth)) { ci <- centroid_index(C, gt) message(sprintf("Final results: mse=%f, ci=%d", mse_final, ci)) } else { ci <- "N/A" message(sprintf("Final results: mse=%f", mse_final)) } if (!is.na(args$centroids)) write(C, args$centroids, ncolumns=ncol(C)) if (!is.na(args$partition)) write(P, args$partition, ncolumns=1) if (!is.na(args$csv)) { if (!file.exists(args$csv)) { output = file(args$csv, 'w') write('K,MSE,CI', output) } else output = file(args$csv, 'a') write(paste(c(args$clusters, mse_final, ci), collapse=','), output) } } if (!interactive()) main()