Hierarchical Clustering

1. hclust()

# Load data
data(USArrests)
# Compute distances and hierarchical clustering
dd <- dist(scale(USArrests), method = "euclidean")
hc <- hclust(dd)  # complete linkage by default; hclust() is deterministic
# In hc$merge, a negative entry is a singleton observation (its absolute
# value is the row ID in USArrests); a positive entry refers to the cluster
# formed at that earlier merge step.
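
# The tree can be cut into a fixed number of groups with cutree(); a
# minimal sketch (k = 4 here is an illustrative choice, not part of the
# original analysis):
head(hc$merge)              # first merges: all negative, i.e. pairs of states
groups <- cutree(hc, k = 4) # assign each state to one of 4 clusters
table(groups)               # cluster sizes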



2. plot.hclust()

plot(hc)

# Put the labels at the same height: hang = -1
plot(hc, hang = -1, cex = 0.6)
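
# Clusters can be highlighted directly on the hclust plot with
# stats::rect.hclust(); a short sketch (k = 4 is an illustrative choice):
plot(hc, hang = -1, cex = 0.6)
rect.hclust(hc, k = 4, border = "red")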



3. plot.dendrogram()

# Convert hclust into a dendrogram and plot
hcd <- as.dendrogram(hc)

# Default plot
plot(hcd, type = "rectangle", ylab = "Height")

# Triangle plot
plot(hcd, type = "triangle", ylab = "Height")

# Zoom in on part of the dendrogram (first 20 leaves, heights 1 to 8)
plot(hcd, xlim = c(1, 20), ylim = c(1,8))

# Define nodePar
nodePar <- list(lab.cex = 0.6, pch = c(NA, 19), 
                cex = 0.7, col = "blue")
# Customized plot; remove labels
plot(hcd, ylab = "Height", nodePar = nodePar, leaflab = "none")

# Horizontal plot
# widen the right margin so the horizontal labels fit
par(mar = c(1, 1, 1, 6))
plot(hcd, xlab = "Height",
     nodePar = nodePar, horiz = TRUE)



4. Phylogenetic tree

# install.packages("ape")
library("ape")
plot(as.phylo(hc), type = "fan")

# remove the margins so the fan fills the plotting device
par(mar = c(0,0,0,0))
plot(as.phylo(hc), type = "fan")
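
# plot.phylo() also supports other layouts ("phylogram", "cladogram",
# "unrooted"), and tips can be colored by cluster membership; a sketch
# using an illustrative 4-group cutree():
clus4 <- cutree(hc, k = 4)
plot(as.phylo(hc), type = "fan",
     tip.color = c("red", "blue", "darkgreen", "purple")[clus4])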



5. Heatmap

library("gplots")
# Check the spread of the raw data: quantiles for each state (note the very
# different scales across variables)
apply(USArrests, 1, function(x) quantile(x))
##      Alabama  Alaska Arizona Arkansas California Colorado Connecticut Delaware
## 0%      13.2  10.000   8.100    8.800       9.00     7.90        3.30    5.900
## 25%     19.2  35.875  25.275   16.825      32.70    31.00        9.15   13.325
## 50%     39.6  46.250  55.500   34.750      65.80    58.35       44.05   43.900
## 75%    102.5 101.750 133.500   85.000     137.25   109.50       85.25  113.500
## 100%   236.0 263.000 294.000  190.000     276.00   204.00      110.00  238.000
##      Florida Georgia Hawaii Idaho Illinois Indiana   Iowa Kansas Kentucky
## 0%    15.400   17.40  5.300   2.6     10.4    7.20  2.200   6.00     9.70
## 25%   27.775   23.70 16.475  11.3     20.6   17.55  9.025  15.00    14.65
## 50%   55.950   42.90 33.100  34.1     53.5   43.00 33.650  42.00    34.15
## 75%  143.750   97.75 55.250  70.5    124.5   77.00 56.250  78.25    66.25
## 100% 335.000  211.00 83.000 120.0    249.0  113.00 57.000 115.00   109.00
##      Louisiana  Maine Maryland Massachusetts Michigan Minnesota Mississippi
## 0%       15.40  2.100   11.300         4.400    12.10      2.70       16.10
## 25%      20.50  6.375   23.675        13.325    29.35     11.85       16.85
## 50%      44.10 29.400   47.400        50.650    54.55     40.45       30.55
## 75%     111.75 59.000  125.250       101.000   119.25     67.50       97.75
## 100%    249.00 83.000  300.000       149.000   255.00     72.00      259.00
##      Missouri Montana Nebraska Nevada New Hampshire New Jersey New Mexico
## 0%        9.0     6.0     4.30  12.20          2.10       7.40     11.400
## 25%      23.4    13.8    13.45  37.55          7.65      15.95     26.925
## 50%      49.1    34.7    39.25  63.50         32.75      53.90     51.050
## 75%      97.0    67.0    72.00 123.75         56.25     106.50    123.750
## 100%    178.0   109.0   102.00 252.00         57.00     159.00    285.000
##      New York North Carolina North Dakota    Ohio Oklahoma Oregon Pennsylvania
## 0%      11.10         13.000        0.800   7.300     6.60   4.90         6.30
## 25%     22.35         15.325        5.675  17.875    16.65  23.20        12.75
## 50%     56.05         30.550       25.650  48.200    44.00  48.15        43.45
## 75%    128.00        118.000       44.250  86.250    88.75  90.00        80.50
## 100%   254.00        337.000       45.000 120.000   151.00 159.00       106.00
##      Rhode Island South Carolina South Dakota Tennessee  Texas    Utah Vermont
## 0%          3.400         14.400         3.80    13.200  12.70   3.200    2.20
## 25%         7.075         20.475        10.55    23.475  22.30  17.975    8.95
## 50%        47.650         35.250        28.90    42.950  52.75  51.450   21.60
## 75%       108.750        105.750        55.25    91.250 110.25  90.000   36.00
## 100%      174.000        279.000        86.00   188.000 201.00 120.000   48.00
##      Virginia Washington West Virginia Wisconsin Wyoming
## 0%       8.50       4.00          5.70      2.60    6.80
## 25%     17.65      20.65          8.40      8.75   13.40
## 50%     41.85      49.60         24.15     31.90   37.80
## 75%     86.25      91.00         49.50     56.25   85.25
## 100%   156.00     145.00         81.00     66.00  161.00
# Use the raw data for the heatmap (no standardization here)
df <- USArrests

aa <- heatmap.2(as.matrix(df[hc$order, ]), Rowv = FALSE, Colv = FALSE,
                dendrogram = "none",
                scale = "none", trace = "none", margins = c(10, 8),
                col = colorRampPalette(c("navy", "white", "red"))(n = 10))
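
# Because Assault spans a much larger range than the other variables, it
# dominates the colors above. A variant that standardizes each column
# first (a sketch, not part of the original figure):
heatmap.2(scale(USArrests)[hc$order, ], Rowv = FALSE, Colv = FALSE,
          dendrogram = "none", scale = "none", trace = "none",
          margins = c(10, 8),
          col = colorRampPalette(c("navy", "white", "red"))(n = 10))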



K-means Clustering

1. Distance matrix

# Load data
data(USArrests)
# standardize the data by column (mean 0, sd 1)
df <- scale(USArrests)

library(factoextra)
distance <- get_dist(df)
fviz_dist(distance, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
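
# get_dist() uses Euclidean distance by default; it also supports
# correlation-based measures, e.g. method = "pearson" (a sketch, not part
# of the original analysis):
distance_cor <- get_dist(df, method = "pearson")
fviz_dist(distance_cor)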



2. Computing k-means clustering in R

# We can compute k-means in R with the kmeans() function. Here we group the
# data into two clusters (centers = 2). kmeans() also has an nstart option
# that tries multiple random initial configurations and reports the best
# one; for example, nstart = 25 runs the algorithm from 25 initial
# configurations. This approach is often recommended.
k2 <- kmeans(df, centers = 2, nstart = 25)
str(k2)
## List of 9
##  $ cluster     : Named int [1:50] 1 1 1 2 1 1 2 2 1 1 ...
##   ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
##  $ centers     : num [1:2, 1:4] 1.005 -0.67 1.014 -0.676 0.198 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "1" "2"
##   .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
##  $ totss       : num 196
##  $ withinss    : num [1:2] 46.7 56.1
##  $ tot.withinss: num 103
##  $ betweenss   : num 93.1
##  $ size        : int [1:2] 20 30
##  $ iter        : int 1
##  $ ifault      : int 0
##  - attr(*, "class")= chr "kmeans"
# The output of kmeans is a list with several components. The most important are:
#  cluster: a vector of integers (from 1:k) indicating the cluster to which
#           each point is allocated.
#  centers: a matrix of cluster centers.
#  totss: the total sum of squares.
#  withinss: vector of within-cluster sums of squares, one per cluster.
#  tot.withinss: total within-cluster sum of squares, i.e. sum(withinss).
#  betweenss: the between-cluster sum of squares, i.e. totss - tot.withinss.
#  size: the number of points in each cluster.
# (See the short example below for accessing these components.)
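
# A minimal sketch of accessing individual components:
k2$size          # number of points per cluster
k2$centers       # cluster centers (on the scaled variables)
head(k2$cluster) # cluster assignment for the first few states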

# We can also view our results using fviz_cluster(), which provides a nice
# illustration of the clusters. If there are more than two dimensions
# (variables), fviz_cluster() performs principal component analysis (PCA)
# and plots the data points according to the first two principal
# components, which explain the majority of the variance.
fviz_cluster(k2, data = df)
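
# Roughly what fviz_cluster() does under the hood; a sketch using base R:
pca <- prcomp(df)
plot(pca$x[, 1:2], col = k2$cluster, pch = 19,
     xlab = "PC1", ylab = "PC2")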

# We can also run the same process for 3, 4, and 5 clusters; the results
# are compared in the figure below.

k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)

# plots to compare
p1 <- fviz_cluster(k2, geom = "point", data = df) + ggtitle("k = 2")
p2 <- fviz_cluster(k3, geom = "point",  data = df) + ggtitle("k = 3")
p3 <- fviz_cluster(k4, geom = "point",  data = df) + ggtitle("k = 4")
p4 <- fviz_cluster(k5, geom = "point",  data = df) + ggtitle("k = 5")

library(gridExtra)
grid.arrange(p1, p2, p3, p4, nrow = 2)



3. Determining Optimal Clusters

# The total within-cluster sum of squares (wss) measures the compactness of
# the clustering, and we want it to be as small as possible. We can use the
# following procedure to find the optimal number of clusters:
#
# 1. Compute the clustering algorithm (e.g., k-means) for different values
#    of k, for instance varying k from 1 to 10.
# 2. For each k, calculate the total within-cluster sum of squares (wss).
# 3. Plot the curve of wss against the number of clusters k.
# 4. The location of a bend (knee) in the plot is generally considered an
#    indicator of the appropriate number of clusters.
set.seed(123)

# function to compute the total within-cluster sum of squares
wss <- function(k) {
  kmeans(df, k, nstart = 10)$tot.withinss
}

# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15

# extract wss for k = 1 to 15 clusters
wss_values <- sapply(k.values, wss)

plot(k.values, wss_values,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-cluster sum of squares")

# Fortunately, this elbow-method computation has been wrapped up in a
# single function, fviz_nbclust():
set.seed(123)

fviz_nbclust(df, kmeans, method = "wss")
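
# fviz_nbclust() also supports other criteria, e.g. the average silhouette
# method (a sketch, not part of the original analysis):
fviz_nbclust(df, kmeans, method = "silhouette")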