On the book website, www.StatLearning.com, there is a gene expression data set (Ch10Ex11.csv) that consists of 40 tissue samples with measurements on 1,000 genes. The first 20 samples are from healthy patients, while the second 20 are from a diseased group.
#library(ISLR)
#library(tidyverse)
# Load the expression data. The file is read without a header row, so R
# assigns the default column names V1, V2, ...
gene <- read.csv(file = "Ch10Ex11.csv", header = FALSE)
# Show the dimensions as (rows, columns).
dim(gene)
## [1] 1000 40
# Per-column summary statistics (min, quartiles, median, mean, max) for
# every column of `gene`.
summary(gene)
## V1 V2 V3 V4
## Min. :-3.056328 Min. :-3.240490 Min. :-3.527188 Min. :-3.06553
## 1st Qu.:-0.684539 1st Qu.:-0.703363 1st Qu.:-0.724498 1st Qu.:-0.70183
## Median : 0.032338 Median :-0.006061 Median : 0.000255 Median : 0.01735
## Mean : 0.006397 Mean :-0.020648 Mean :-0.012895 Mean :-0.00940
## 3rd Qu.: 0.676673 3rd Qu.: 0.660186 3rd Qu.: 0.627725 3rd Qu.: 0.66906
## Max. : 3.519299 Max. : 3.084000 Max. : 3.458551 Max. : 3.22213
## V5 V6 V7 V8
## Min. :-2.99742 Min. :-2.85389 Min. :-2.74516 Min. :-3.62753
## 1st Qu.:-0.73211 1st Qu.:-0.68827 1st Qu.:-0.64695 1st Qu.:-0.66294
## Median :-0.04396 Median :-0.01104 Median : 0.03087 Median :-0.05012
## Mean :-0.04554 Mean :-0.04223 Mean : 0.03408 Mean :-0.01432
## 3rd Qu.: 0.66127 3rd Qu.: 0.60505 3rd Qu.: 0.69415 3rd Qu.: 0.63980
## Max. : 2.89747 Max. : 3.06956 Max. : 3.89075 Max. : 3.35391
## V9 V10 V11 V12
## Min. :-2.85224 Min. :-2.57151 Min. :-3.50807 Min. :-2.46314
## 1st Qu.:-0.66516 1st Qu.:-0.69711 1st Qu.:-0.66221 1st Qu.:-0.76172
## Median : 0.01229 Median :-0.02719 Median :-0.05554 Median :-0.06017
## Mean : 0.02089 Mean : 0.01119 Mean :-0.03980 Mean :-0.04690
## 3rd Qu.: 0.70092 3rd Qu.: 0.69082 3rd Qu.: 0.63184 3rd Qu.: 0.62667
## Max. : 2.76971 Max. : 3.54802 Max. : 3.24751 Max. : 3.47430
## V13 V14 V15 V16
## Min. :-3.31227 Min. :-3.668497 Min. :-3.143676 Min. :-2.927187
## 1st Qu.:-0.64411 1st Qu.:-0.701149 1st Qu.:-0.666224 1st Qu.:-0.692198
## Median : 0.01708 Median : 0.001063 Median :-0.002157 Median :-0.006049
## Mean :-0.01055 Mean :-0.008516 Mean : 0.005718 Mean : 0.001087
## 3rd Qu.: 0.66782 3rd Qu.: 0.650463 3rd Qu.: 0.663221 3rd Qu.: 0.703088
## Max. : 2.83636 Max. : 3.507131 Max. : 3.554124 Max. : 4.088422
## V17 V18 V19 V20
## Min. :-2.735982 Min. :-2.89979 Min. :-2.986620 Min. :-3.64333
## 1st Qu.:-0.802367 1st Qu.:-0.74117 1st Qu.:-0.689626 1st Qu.:-0.63972
## Median : 0.000495 Median :-0.04947 Median :-0.007034 Median :-0.01565
## Mean : 0.003757 Mean :-0.03978 Mean :-0.040910 Mean :-0.01158
## 3rd Qu.: 0.701429 3rd Qu.: 0.62694 3rd Qu.: 0.669064 3rd Qu.: 0.62061
## Max. : 3.183715 Max. : 3.01382 Max. : 3.517389 Max. : 3.43399
## V21 V22 V23 V24
## Min. :-2.9568 Min. :-3.54902 Min. :-3.10594 Min. :-2.9732
## 1st Qu.:-0.5812 1st Qu.:-0.63412 1st Qu.:-0.52197 1st Qu.:-0.5747
## Median : 0.1177 Median : 0.09129 Median : 0.09238 Median : 0.2228
## Mean : 0.2203 Mean : 0.18782 Mean : 0.19267 Mean : 0.2434
## 3rd Qu.: 0.9504 3rd Qu.: 0.92581 3rd Qu.: 0.88226 3rd Qu.: 0.9720
## Max. : 4.7516 Max. : 4.93363 Max. : 4.57264 Max. : 4.2766
## V25 V26 V27 V28
## Min. :-3.2182 Min. :-2.9139 Min. :-3.4260 Min. :-2.9413
## 1st Qu.:-0.6559 1st Qu.:-0.5064 1st Qu.:-0.5618 1st Qu.:-0.5467
## Median : 0.1708 Median : 0.1931 Median : 0.1609 Median : 0.1319
## Mean : 0.2121 Mean : 0.2674 Mean : 0.2572 Mean : 0.2316
## 3rd Qu.: 1.0251 3rd Qu.: 0.9757 3rd Qu.: 0.9855 3rd Qu.: 0.9293
## Max. : 4.9629 Max. : 5.0694 Max. : 4.5857 Max. : 5.1242
## V29 V30 V31 V32
## Min. :-3.1168 Min. :-3.0350 Min. :-3.4798 Min. :-2.8842
## 1st Qu.:-0.5876 1st Qu.:-0.6034 1st Qu.:-0.6183 1st Qu.:-0.5383
## Median : 0.1863 Median : 0.1731 Median : 0.1176 Median : 0.1881
## Mean : 0.2289 Mean : 0.2074 Mean : 0.1894 Mean : 0.2582
## 3rd Qu.: 1.0060 3rd Qu.: 0.9279 3rd Qu.: 0.8815 3rd Qu.: 0.9578
## Max. : 4.4779 Max. : 4.0673 Max. : 4.7855 Max. : 5.6171
## V33 V34 V35 V36
## Min. :-2.8979 Min. :-3.3490 Min. :-3.8558 Min. :-2.9215
## 1st Qu.:-0.6286 1st Qu.:-0.6064 1st Qu.:-0.6043 1st Qu.:-0.5791
## Median : 0.1394 Median : 0.1514 Median : 0.1079 Median : 0.1065
## Mean : 0.2210 Mean : 0.2101 Mean : 0.1733 Mean : 0.2284
## 3rd Qu.: 0.9434 3rd Qu.: 0.9075 3rd Qu.: 0.8598 3rd Qu.: 0.8921
## Max. : 5.3758 Max. : 4.5026 Max. : 4.2191 Max. : 4.4853
## V37 V38 V39 V40
## Min. :-2.8670 Min. :-3.2340 Min. :-2.7112 Min. :-3.5816
## 1st Qu.:-0.5063 1st Qu.:-0.5888 1st Qu.:-0.5610 1st Qu.:-0.5031
## Median : 0.2213 Median : 0.1619 Median : 0.1211 Median : 0.2064
## Mean : 0.2823 Mean : 0.2257 Mean : 0.2215 Mean : 0.2817
## 3rd Qu.: 1.0003 3rd Qu.: 0.9847 3rd Qu.: 0.9576 3rd Qu.: 0.9878
## Max. : 4.7904 Max. : 4.6188 Max. : 4.8149 Max. : 5.2751
# Correlation-based dissimilarity between the columns of `gene`:
# cor() on a data frame correlates its columns, and 1 - r converts that
# similarity into a distance-like quantity.
gene.corr.dist <- as.dist(1 - cor(gene))
# Fit hierarchical clustering. No `method` is supplied, so hclust() falls
# back to its default ("complete" linkage). Then draw the dendrogram.
h.clust <- hclust(gene.corr.dist)
plot(h.clust)
COMMENTS: Based on the dendrogram, complete linkage appears to separate the samples into two groups.
# Cluster the correlation-based dissimilarities with average linkage,
# then draw the dendrogram.
h.clust <- hclust(gene.corr.dist, method = "average")
plot(h.clust)
COMMENTS: Average linkage does not separate the samples into two groups.
# Cluster the correlation-based dissimilarities with single linkage,
# then draw the dendrogram.
h.clust <- hclust(gene.corr.dist, method = "single")
plot(h.clust)
COMMENTS: Single linkage appears to separate the samples into two groups, but the resulting dendrogram looks irregular and is harder to interpret.
# Cluster the correlation-based dissimilarities with centroid linkage,
# then draw the dendrogram.
# NOTE(review): the hclust() documentation warns that centroid linkage can
# produce inversions in the dendrogram and is intended for squared
# Euclidean distances -- interpret this plot with care.
h.clust <- hclust(gene.corr.dist, method = "centroid")
plot(h.clust)
# Structural overview of the components stored in the fitted object.
summary(h.clust)
## Length Class Mode
## merge 78 -none- numeric
## height 39 -none- numeric
## order 40 -none- numeric
## labels 40 -none- character
## method 1 -none- character
## call 3 -none- call
## dist.method 0 -none- NULL
# Print the fitted object: shows the call, the cluster method, and the
# number of objects clustered.
h.clust
##
## Call:
## hclust(d = gene.corr.dist, method = "centroid")
##
## Cluster method : centroid
## Number of objects: 40