############################################################
# Identifying Institutional Peers through Cluster Analysis
# TAIR 2020 Conference
# Sample Code
# March 2, 2020
############################################################

#### Install packages used in code
# Only install what is missing so that re-running the script does not
# reinstall every package each time.
# NOTE(review): DMwR may no longer be installable from CRAN; DMwR2 is believed
# to provide knnImputation() as well -- confirm before switching.
required_packages <- c("DMwR", "factoextra", "ggplot2", "gridExtra")
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(factoextra) # get_dist(), fviz_dist(), fviz_cluster(), fviz_nbclust()
library(ggplot2)    # ggtitle(), annotate()
library(gridExtra)  # grid.arrange()

##### Import data
peer_data <- read.csv("peer data.csv")

###############################
##### Step 1: Prepare Data ####
###############################

### 1A: Remove or estimate missing data
# The two approaches are alternatives: running na.omit() first leaves nothing
# for the imputation to fill in. Set impute_missing to TRUE to estimate missing
# values with knnImputation() from the DMwR package instead of dropping rows.
impute_missing <- FALSE

if (impute_missing) {
  library(DMwR)
  peer_data <- knnImputation(peer_data)
} else {
  peer_data <- na.omit(peer_data) # removes all rows with missing values.
}

### 1B: Standardize data
# scale() fails on non-numeric columns (e.g. institution names), so keep only
# the numeric columns before standardizing.
# NOTE(review): numeric identifier columns (such as an institution ID) would
# also be kept here -- drop them first if your data has any.
numeric_columns <- vapply(peer_data, is.numeric, logical(1))
stopifnot(any(numeric_columns))
scaled <- scale(peer_data[, numeric_columns, drop = FALSE])

####################################
#### Step 2: Calculate distance ####
####################################

# Computes the distance matrix on the scaled peer data.
# The conference presentation used "sample25", a scaled sample of 25
# institutions/rows; pass your own scaled subset here if you want a smaller,
# more readable distance plot.
# If you want other distance measures, change the method parameter to
# "manhattan", "pearson", "spearman", or "kendall".
# Default method is "euclidean".
# Example: distanceMH <- get_dist(scaled, method = "manhattan")
distance <- get_dist(scaled)

## plot distance matrix
fviz_dist(
  distance,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)

## use as.matrix() if you want to see distance values
distance_matrix <- as.matrix(distance)

##################################
#### Step 3: Cluster analysis ####
##################################

# kmeans() picks random starting centers (nstart = 25 tries 25 of them), so
# the seed must be set before fitting for the clusters to be reproducible.
set.seed(123)

k2 <- kmeans(scaled, centers = 2, nstart = 25) # performs clustering on matrix
fviz_cluster(k2, geom = "point", data = scaled) # plots clusters

## add more clusters
k3 <- kmeans(scaled, centers = 3, nstart = 25)
k4 <- kmeans(scaled, centers = 4, nstart = 25)
k5 <- kmeans(scaled, centers = 5, nstart = 25)

## plots to compare clusters
# Builds one titled cluster plot. The "UH" annotation was used in the
# conference presentation; you may remove it or use your own institution's
# coordinates for labeling.
plot_labeled_clusters <- function(fit, data, title) {
  fviz_cluster(fit, geom = "point", data = data) +
    ggtitle(title) +
    annotate("text", x = -1.25, y = -3.15, label = "UH", size = 3)
}

p1 <- plot_labeled_clusters(k2, scaled, "k = 2")
p2 <- plot_labeled_clusters(k3, scaled, "k = 3")
p3 <- plot_labeled_clusters(k4, scaled, "k = 4")
p4 <- plot_labeled_clusters(k5, scaled, "k = 5")

grid.arrange(p1, p2, p3, p4, nrow = 2) # arranges clusters onto one page

#### choosing optimal clusters
# Re-seed the random number generator so the random start points used below
# give reproducible results regardless of what ran before.
set.seed(123)

## elbow method
fviz_nbclust(scaled, kmeans, method = "wss")

## average silhouette
fviz_nbclust(scaled, kmeans, method = "silhouette")

#### extract cluster results
print(k2)

###########################################################################################
# For more information visit: https://jxmartinez.com/talk/identifying-institutional-peers/
###########################################################################################