# Load data
data(USArrests)

# Euclidean distances on standardized columns, then hierarchical
# clustering (hclust's default complete linkage).
dist_mat <- dist(scale(USArrests), method = "euclidean")
set.seed(100)  # hclust itself is deterministic; seed kept for script reproducibility
hc <- hclust(dist_mat)
## abs() of the ids in hc$merge gives the row number in USArrests

# Basic tree
plot(hc)
# Hang every label at the same height
plot(hc, hang = -1, cex = 0.6)

# Work with the dendrogram representation of the same tree
hcd <- as.dendrogram(hc)
# Default (rectangle) plot
plot(hcd, type = "rectangle", ylab = "Height")
# Triangle plot
plot(hcd, type = "triangle", ylab = "Height")
# Zoom into the first 20 leaves
plot(hcd, xlim = c(1, 20), ylim = c(1, 8))
# Node appearance: no symbol on internal nodes (pch NA), filled circle
# (pch 19) on leaves; small blue labels.
node_style <- list(lab.cex = 0.6, pch = c(NA, 19), cex = 0.7, col = "blue")

# Customized vertical plot, leaf labels removed
plot(hcd, ylab = "Height", nodePar = node_style, leaflab = "none")

# Horizontal plot; widen the right margin so the labels fit
par(mar = c(1, 1, 1, 6))
plot(hcd, xlab = "Height", nodePar = node_style, horiz = TRUE)
# install.packages("ape")
library("ape")

# Fan layout of the same tree, before and after removing the margins
phylo_tree <- as.phylo(hc)
plot(phylo_tree, type = "fan")
par(mar = c(0, 0, 0, 0))
plot(phylo_tree, type = "fan")
# gplots provides heatmap.2() (enhanced heat map with a color key)
library('gplots')
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Quick per-state summary: the five quantiles of each row.
# Note: apply() coerces the data.frame to a numeric matrix, so the four
# crime/population columns are mixed together within each state.
apply(USArrests, 1, quantile)
## Alabama Alaska Arizona Arkansas California Colorado Connecticut Delaware
## 0% 13.2 10.000 8.100 8.800 9.00 7.90 3.30 5.900
## 25% 19.2 35.875 25.275 16.825 32.70 31.00 9.15 13.325
## 50% 39.6 46.250 55.500 34.750 65.80 58.35 44.05 43.900
## 75% 102.5 101.750 133.500 85.000 137.25 109.50 85.25 113.500
## 100% 236.0 263.000 294.000 190.000 276.00 204.00 110.00 238.000
## Florida Georgia Hawaii Idaho Illinois Indiana Iowa Kansas Kentucky
## 0% 15.400 17.40 5.300 2.6 10.4 7.20 2.200 6.00 9.70
## 25% 27.775 23.70 16.475 11.3 20.6 17.55 9.025 15.00 14.65
## 50% 55.950 42.90 33.100 34.1 53.5 43.00 33.650 42.00 34.15
## 75% 143.750 97.75 55.250 70.5 124.5 77.00 56.250 78.25 66.25
## 100% 335.000 211.00 83.000 120.0 249.0 113.00 57.000 115.00 109.00
## Louisiana Maine Maryland Massachusetts Michigan Minnesota Mississippi
## 0% 15.40 2.100 11.300 4.400 12.10 2.70 16.10
## 25% 20.50 6.375 23.675 13.325 29.35 11.85 16.85
## 50% 44.10 29.400 47.400 50.650 54.55 40.45 30.55
## 75% 111.75 59.000 125.250 101.000 119.25 67.50 97.75
## 100% 249.00 83.000 300.000 149.000 255.00 72.00 259.00
## Missouri Montana Nebraska Nevada New Hampshire New Jersey New Mexico
## 0% 9.0 6.0 4.30 12.20 2.10 7.40 11.400
## 25% 23.4 13.8 13.45 37.55 7.65 15.95 26.925
## 50% 49.1 34.7 39.25 63.50 32.75 53.90 51.050
## 75% 97.0 67.0 72.00 123.75 56.25 106.50 123.750
## 100% 178.0 109.0 102.00 252.00 57.00 159.00 285.000
## New York North Carolina North Dakota Ohio Oklahoma Oregon Pennsylvania
## 0% 11.10 13.000 0.800 7.300 6.60 4.90 6.30
## 25% 22.35 15.325 5.675 17.875 16.65 23.20 12.75
## 50% 56.05 30.550 25.650 48.200 44.00 48.15 43.45
## 75% 128.00 118.000 44.250 86.250 88.75 90.00 80.50
## 100% 254.00 337.000 45.000 120.000 151.00 159.00 106.00
## Rhode Island South Carolina South Dakota Tennessee Texas Utah Vermont
## 0% 3.400 14.400 3.80 13.200 12.70 3.200 2.20
## 25% 7.075 20.475 10.55 23.475 22.30 17.975 8.95
## 50% 47.650 35.250 28.90 42.950 52.75 51.450 21.60
## 75% 108.750 105.750 55.25 91.250 110.25 90.000 36.00
## 100% 174.000 279.000 86.00 188.000 201.00 120.000 48.00
## Virginia Washington West Virginia Wisconsin Wyoming
## 0% 8.50 4.00 5.70 2.60 6.80
## 25% 17.65 20.65 8.40 8.75 13.40
## 50% 41.85 49.60 24.15 31.90 37.80
## 75% 86.25 91.00 49.50 56.25 85.25
## 100% 156.00 145.00 81.00 66.00 161.00
# Heat map of the raw (unscaled) data, with rows reordered to follow the
# hierarchical-clustering order computed above.
df <- USArrests
aa <- heatmap.2(as.matrix(df[hc$order, ]),
                Rowv = FALSE, Colv = FALSE,  # keep the precomputed row order
                dendrogram = "none",         # draw no dendrogram on the heatmap
                scale = "none", trace = "none",
                margins = c(10, 8),
                col = colorRampPalette(c('navy', 'white', 'red'))(n = 10))
# Reload the data and standardize each column (mean 0, sd 1)
data(USArrests)
df <- scale(USArrests)

library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa

# Visualize the pairwise distance matrix between states
d <- get_dist(df)
fviz_dist(d, gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07"))
# kmeans() groups the observations into `centers` clusters. nstart = 25
# tries 25 random initial configurations and keeps the best solution,
# which is the commonly recommended practice.
k2 <- kmeans(x = df, centers = 2, nstart = 25)
# Inspect the structure of the fitted object
str(k2)
## List of 9
## $ cluster : Named int [1:50] 1 1 1 2 1 1 2 2 1 1 ...
## ..- attr(*, "names")= chr [1:50] "Alabama" "Alaska" "Arizona" "Arkansas" ...
## $ centers : num [1:2, 1:4] 1.005 -0.67 1.014 -0.676 0.198 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "1" "2"
## .. ..$ : chr [1:4] "Murder" "Assault" "UrbanPop" "Rape"
## $ totss : num 196
## $ withinss : num [1:2] 46.7 56.1
## $ tot.withinss: num 103
## $ betweenss : num 93.1
## $ size : int [1:2] 20 30
## $ iter : int 1
## $ ifault : int 0
## - attr(*, "class")= chr "kmeans"
#The output of kmeans is a list with several bits of information. The most important being:
# cluster: A vector of integers (from 1:k) indicating the cluster to which each point is allocated.
# centers: A matrix of cluster centers.
# totss: The total sum of squares.
# withinss: Vector of within-cluster sum of squares, one component per cluster.
# tot.withinss: Total within-cluster sum of squares, i.e. sum(withinss).
# betweenss: The between-cluster sum of squares, i.e. $totss-tot.withinss$.
# size: The number of points in each cluster.
# We can also view our results by using fviz_cluster.
# This provides a nice illustration of the clusters. With more than two
# variables, fviz_cluster performs PCA and plots the observations on the
# first two principal components (those explaining most of the variance).
fviz_cluster(k2, data = df)
# Fit k-means for three more values of k so the solutions can be compared
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k5 <- kmeans(df, centers = 5, nstart = 25)

# One point-only cluster plot per solution, titled by its k
cluster_plot <- function(fit, k) {
  fviz_cluster(fit, geom = "point", data = df) + ggtitle(paste("k =", k))
}
plots <- Map(cluster_plot, list(k2, k3, k4, k5), 2:5)

# Arrange the four plots in a 2 x 2 grid
library(gridExtra)
grid.arrange(grobs = plots, nrow = 2)
# The total within-cluster sum of squares (wss) measures the compactness
# of the clustering; smaller is better. The "elbow method":
#
# 1. Run the clustering algorithm (e.g., k-means) for a range of k.
# 2. For each k, record the total within-cluster sum of squares (wss).
# 3. Plot the wss curve against the number of clusters k.
# 4. The location of a bend (knee) in the plot is generally taken as an
#    indicator of the appropriate number of clusters.
set.seed(123)

# Total within-cluster sum of squares for a k-means fit with k clusters.
# `data` defaults to the globally prepared scaled matrix `df`, so existing
# calls of the form wss(k) are unchanged; pass `data` explicitly to reuse
# the helper on another dataset.
wss <- function(k, data = df) {
  kmeans(data, k, nstart = 10)$tot.withinss
}
# Compute and plot wss for k = 1 to k = 15
k.values <- 1:15
# Extract wss for 1-15 clusters; vapply (rather than sapply) guarantees
# the result is a plain numeric vector.
wss_values <- vapply(k.values, wss, numeric(1))
plot(k.values, wss_values,
     type = "b", pch = 19, frame = FALSE,
     xlab = "Number of clusters K",
     ylab = "Total within-clusters sum of squares")
# Fortunately, this whole "Elbow method" procedure has been wrapped up in
# a single function, fviz_nbclust(), which computes and plots wss over a
# range of k values.
set.seed(123)
fviz_nbclust(df, kmeans, method = "wss")