K-means clustering and clustergram with R

prepared by Volkan OBAN

K-Means Clustering with R

library(datasets)

data(attitude)

attitude

   rating complaints privileges learning raises critical advance
1      43         51         30       39     61       92      45
2      63         64         51       54     63       73      47
3      71         70         68       69     76       86      48
4      61         63         45       47     54       84      35
5      81         78         56       66     71       83      47
6      43         55         49       44     54       49      34
7      58         67         42       56     66       68      35
8      71         75         50       55     70       66      41
9      72         82         72       67     71       83      31
10     67         61         45       47     62       80      41
11     64         53         53       58     58       67      34
12     67         60         47       39     59       74      41
13     69         62         57       42     55       63      25
14     68         83         83       45     59       77      35
15     77         77         54       72     79       77      46
16     81         90         50       72     60       54      36
17     74         85         64       69     79       79      63
18     65         60         65       75     55       80      60
19     65         70         46       57     75       85      46
20     50         58         68       54     64       78      52
21     50         40         33       34     43       64      33
22     64         61         52       62     66       80      41
23     53         66         52       50     63       80      37
24     40         37         42       58     50       57      49
25     63         54         42       48     66       75      33
26     66         77         66       63     88       76      72
27     78         75         58       74     80       78      49
28     48         57         44       45     51       83      38
29     85         85         71       71     77       74      55
30     82         82         39       59     64       78      39

# Keep only the two variables that are clustered below: columns 3 and 4 of
# `attitude`, i.e. `privileges` and `learning`.
dat <- attitude[, c(3, 4)]

# Fix the RNG state so the random starting centers picked by kmeans() are
# reproducible from run to run.
set.seed(7)

# k-means with 6 centers; nstart = 100 tries 100 random starts and keeps the
# best solution.
cl <- kmeans(dat, 6, nstart = 100)

# Print the fitted clustering.
cl

> cl = kmeans(dat, 3, nstart=100)
> # Examine the result of the clustering algorithm
> cl
K-means clustering with 3 clusters of sizes 11, 2, 17

Cluster means:
  privileges learning
1   61.45455 69.09091
2   75.50000 49.50000
3   45.11765 48.94118

Clustering vector:
 [1] 3 3 1 3 1 3 3 3 1 3 3 3 3 2 1 1 1 1 3 2 3 1 3 3 3 1 1 3 1 3

Within cluster sum of squares by cluster:
[1]  783.6364  153.0000 1732.7059
 (between_SS / total_SS =  68.0 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"
[5] "tot.withinss" "betweenss"    "size"         "iter"
[9] "ifault"

# Scatter plot of the clustered observations; the cluster id (shifted by 1)
# selects the point color.
plot(dat, col = (cl$cluster + 1), main = "K-Means result with 3 clusters", pch = 20, cex = 2)

# Elbow method: total within-cluster sum of squares for 1 to 15 clusters.
mydata <- dat
wss <- numeric(15)
# With a single cluster the within-group sum of squares is (n - 1) times the
# summed column variances.
wss[1] <- (nrow(mydata) - 1) * sum(apply(mydata, 2, var))
for (i in 2:15) {
  wss[i] <- sum(kmeans(mydata, centers = i)$withinss)
}
plot(1:15, wss, type = "b", xlab = "Number of Clusters",
     ylab = "Within groups sum of squares",
     main = "Assessing the Optimal Number of Clusters with the Elbow Method",
     pch = 20, cex = 2)

Ref: https://rpubs.com/FelipeRego/K-Means-Clustering

Felipe Rego

# Ref: https://rpubs.com/FelipeRego/K-Means-Clustering

# Clustering back-end used by clustergram().
#
# This is the type of function that clustergram() takes for the clustering;
# using a similar structure allows other clustering algorithms to be plugged in.
#
# Arguments:
#   Data - numeric matrix, one row per observation.
#   k    - number of clusters.
#   ...  - forwarded to kmeans().
#
# Returns a list with two elements:
#   cluster - a vector of length n (the number of subjects/items) indicating
#             to which cluster each item belongs.
#   centers - one number per cluster: the cluster center projected onto the
#             first component (loading) of the PCA of Data.
clustergram.kmeans <- function(Data, k, ...) {
  cl <- kmeans(Data, k, ...)

  cluster <- cl$cluster
  # Weighted mean of each center's coordinates: 1 number per center.
  centers <- cl$centers %*% princomp(Data)$loadings[, 1]

  list(
    cluster = cluster,
    centers = centers
  )
}

# Default drawing routine used by clustergram().
#
# Arguments:
#   X, Y              - matrices with one row per observation and one column
#                       per value of k; x and y coordinates of each line.
#   k.range           - the values of k, used for the axis ticks and guides.
#   x.range, y.range  - plot limits.
#   COL               - one line color per observation.
#   add.center.points - if TRUE, mark the cluster centers with red points.
#   centers.points    - list of data frames with columns `y` and `x`; entries
#                       may be NULL.
clustergram.plot.matlines <- function(X, Y, k.range,
                                      x.range, y.range, COL,
                                      add.center.points, centers.points) {
  # Empty canvas: a single white point sets up the coordinate system.
  plot(0, 0, col = "white", xlim = x.range, ylim = y.range,
       axes = FALSE,
       xlab = "Number of clusters (k)", ylab = "PCA weighted Mean of the clusters",
       main = "Clustergram of the PCA-weighted Mean of the clusters k-mean clusters vs number of clusters (k)")
  axis(side = 1, at = k.range)
  axis(side = 2)
  abline(v = k.range, col = "grey")

  # One line per observation (matlines draws column-wise, hence the transpose).
  matlines(t(X), t(Y), pch = 19, col = COL, lty = 1, lwd = 1.5)

  if (add.center.points) {
    # The list is indexed by k, so slots for values of k that were never
    # fitted are NULL; drop them before stacking the data frames.
    filled <- Filter(Negate(is.null), centers.points)
    if (length(filled) > 0) {
      xx <- do.call(rbind, filled)
      points(xx$y ~ xx$x, pch = 19, col = "red", cex = 1.3)
    }
  }
}

# Draw a clustergram: for every k in k.range, cluster the data and plot each
# observation at the (PCA-weighted) center of its cluster, joined across k.
#
# Arguments:
#   Data                - should be a scaled matrix, where each column belongs
#                         to a different dimension of the observations.
#   k.range             - vector with the numbers of clusters to plot.
#   clustering.function - called as clustering.function(Data, k); must return
#                         a list with `cluster` and `centers` like
#                         clustergram.kmeans().
#   clustergram.plot    - function that does the drawing; called with the
#                         arguments of clustergram.plot.matlines().
#   line.width          - the amount to lift each line in the plot so that
#                         lines do not superimpose each other.
#   add.center.points   - whether to plot points at the cluster means.
clustergram <- function(Data, k.range = 2:10,
                        clustering.function = clustergram.kmeans,
                        clustergram.plot = clustergram.plot.matlines,
                        line.width = .004, add.center.points = TRUE) {
  n <- dim(Data)[1]

  # First principal component of the data; it orders the line colors and
  # fixes the vertical plot range.
  PCA.1 <- Data %*% princomp(Data)$loadings[, 1]

  # colorspace is optional: use it when installed, otherwise fall back to
  # base colors and tell the user.
  if (requireNamespace("colorspace", quietly = TRUE)) {
    COL <- colorspace::heat_hcl(n)[order(PCA.1)] # line colors
  } else {
    COL <- rainbow(n)[order(PCA.1)] # line colors
    warning('Please consider installing the package "colorspace" for prittier colors')
  }

  line.width <- rep(line.width, n)

  # One column per value of k, one row per observation.
  X <- matrix(rep(k.range, each = n), nrow = n)
  Y <- matrix(NA_real_, nrow = n, ncol = length(k.range))

  centers.points <- list()

  for (j in seq_along(k.range)) {
    k <- k.range[j]
    k.clusters <- clustering.function(Data, k)

    clusters.vec <- k.clusters$cluster
    the.centers <- k.clusters$centers

    # Running sum of line.width within each cluster, in the original
    # observation order: members of one cluster get stacked small offsets so
    # their lines do not overlap.
    noise <- ave(line.width, clusters.vec, FUN = cumsum)
    Y[, j] <- the.centers[clusters.vec] + noise

    centers.points[[k]] <- data.frame(y = the.centers, x = rep(k, k))
  }

  x.range <- range(k.range)
  y.range <- range(PCA.1)

  clustergram.plot(X, Y, k.range,
                   x.range, y.range, COL,
                   add.center.points, centers.points)
}

set.seed(250)
data("attitude")
# Standardize every column except the fifth (`raises`).
Data <- scale(attitude[, -5])

# Single clustergram of the scaled attitude data for k = 2..8.
clustergram(Data, k.range = 2:8, line.width = 0.004)

# Draw six clustergrams in a 3 x 2 grid, with smaller titles so they fit.
# kmeans() starts from randomly chosen centers, so the panels can differ.
par(cex.lab = 1.2, cex.main = .7)
par(mfrow = c(3, 2))
for (i in 1:6) {
  clustergram(Data, k.range = 2:8, line.width = .004, add.center.points = TRUE)
}

K-means clustering and clustergram with R

Data & Analytics