# 2 Introduction to PCA

• Dataset: iris

• Algorithms:

• PCA
# devtools::install_github("vqv/ggbiplot")
library(ggbiplot)

iris.pca <- prcomp(iris[, 1:4], center = TRUE, scale = TRUE)
print(iris.pca)
#> Standard deviations (1, .., p=4):
#> [1] 1.708 0.956 0.383 0.144
#>
#> Rotation (n x k) = (4 x 4):
#>                 PC1     PC2    PC3    PC4
#> Sepal.Length  0.521 -0.3774  0.720  0.261
#> Sepal.Width  -0.269 -0.9233 -0.244 -0.124
#> Petal.Length  0.580 -0.0245 -0.142 -0.801
#> Petal.Width   0.565 -0.0669 -0.634  0.524
summary(iris.pca)
#> Importance of components:
#>                         PC1   PC2    PC3     PC4
#> Standard deviation     1.71 0.956 0.3831 0.14393
#> Proportion of Variance 0.73 0.229 0.0367 0.00518
#> Cumulative Proportion  0.73 0.958 0.9948 1.00000
g <- ggbiplot(iris.pca,
obs.scale = 1,
var.scale = 1,
groups = irisSpecies, ellipse = TRUE, circle = TRUE) + scale_color_discrete(name = "") + theme(legend.direction = "horizontal", legend.position = "top") print(g) The PC1 axis explains 0.730 of the variance, while the PC2 axis explains 0.229 of the variance. ## 2.1 Underlying principal components # Run PCA here with prcomp () iris.pca <- prcomp(iris[, 1:4], center = TRUE, scale = TRUE) print(iris.pca) #> Standard deviations (1, .., p=4): #> [1] 1.708 0.956 0.383 0.144 #> #> Rotation (n x k) = (4 x 4): #> PC1 PC2 PC3 PC4 #> Sepal.Length 0.521 -0.3774 0.720 0.261 #> Sepal.Width -0.269 -0.9233 -0.244 -0.124 #> Petal.Length 0.580 -0.0245 -0.142 -0.801 #> Petal.Width 0.565 -0.0669 -0.634 0.524 # Now, compute the new dataset aligned to the PCs by # using the predict() function . df.new <- predict(iris.pca, iris[, 1:4]) head(df.new) #> PC1 PC2 PC3 PC4 #> [1,] -2.26 -0.478 0.1273 0.02409 #> [2,] -2.07 0.672 0.2338 0.10266 #> [3,] -2.36 0.341 -0.0441 0.02828 #> [4,] -2.29 0.595 -0.0910 -0.06574 #> [5,] -2.38 -0.645 -0.0157 -0.03580 #> [6,] -2.07 -1.484 -0.0269 0.00659 # Show the PCA modelâ€™s sdev values are the square root # of the projected variances, which are along the diagonal # of the covariance matrix of the projected data. iris.pcasdev^2
#> [1] 2.9185 0.9140 0.1468 0.0207
# # Compute covariance matrix for new dataset.
# Recall that the standard deviation is the square root of the variance.
round(cov(df.new), 5)
#>      PC1   PC2   PC3    PC4
#> PC1 2.92 0.000 0.000 0.0000
#> PC2 0.00 0.914 0.000 0.0000
#> PC3 0.00 0.000 0.147 0.0000
#> PC4 0.00 0.000 0.000 0.0207

## 2.2 Compute eigenvectors and eigenvalues

# Scale and center the data.
df.scaled <- scale(iris[, 1:4], center = TRUE, scale = TRUE)

# Compute the covariance matrix.
cov.df.scaled <- cov(df.scaled)

# Compute the eigenvectors and eigen values.
# Each eigenvector (column) is a principal component.
# Each eigenvalue is the variance explained by the
# associated eigenvector.
eigenInformation <- eigen(cov.df.scaled)

print(eigenInformation)
#> eigen() decomposition
#> $values #> [1] 2.9185 0.9140 0.1468 0.0207 #> #>$vectors
#>        [,1]    [,2]   [,3]   [,4]
#> [1,]  0.521 -0.3774  0.720  0.261
#> [2,] -0.269 -0.9233 -0.244 -0.124
#> [3,]  0.580 -0.0245 -0.142 -0.801
#> [4,]  0.565 -0.0669 -0.634  0.524
# Now, compute the new dataset aligned to the PCs by
# multiplying the eigenvector and data matrices.

# Create transposes in preparation for matrix multiplication
eigenvectors.t <- t(eigenInformation\$vectors)     # 4x4
df.scaled.t <- t(df.scaled)    # 4x150

# Perform matrix multiplication.
df.new <- eigenvectors.t %*% df.scaled.t   # 4x150

# Create new data frame. First take transpose and
df.new.t <- t(df.new)    # 150x4
colnames(df.new.t) <- c("PC1", "PC2", "PC3", "PC4")

#>        PC1    PC2     PC3      PC4
#> [1,] -2.26 -0.478  0.1273  0.02409
#> [2,] -2.07  0.672  0.2338  0.10266
#> [3,] -2.36  0.341 -0.0441  0.02828
#> [4,] -2.29  0.595 -0.0910 -0.06574
#> [5,] -2.38 -0.645 -0.0157 -0.03580
#> [6,] -2.07 -1.484 -0.0269  0.00659
# Compute covariance matrix for new dataset
round(cov(df.new.t), 5)
#>      PC1   PC2   PC3    PC4
#> PC1 2.92 0.000 0.000 0.0000
#> PC2 0.00 0.914 0.000 0.0000
#> PC3 0.00 0.000 0.147 0.0000
#> PC4 0.00 0.000 0.000 0.0207