Principal component analysis with sklearn
import seaborn as sns
from sklearn import datasets
import pandas as pd
iris = datasets.load_iris()
target_names = iris.target_names
X = pd.DataFrame(iris.data)
X.columns = ['Sepal Length', 'Sepal Width', 'Petal Length', 'Petal Width']
# standardize each feature to zero mean and unit variance
for name in X.columns:
    X[name] = (X[name] - X[name].mean()) / X[name].std()
sns.heatmap(X.corr(), vmin=-1, vmax=1, annot=True, fmt='.2f')
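As a quick sanity check (a small sketch, not part of the original walkthrough), each standardized column should now have zero mean and unit standard deviation:
# sanity check: standardized columns have mean ~0 and std ~1
import numpy as np
print(np.allclose(X.mean(), 0.0), np.allclose(X.std(), 1.0))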
import numpy as np
# calculate the covariance matrix
Q = X.cov().to_numpy()
# find the eigenvalues and eigenvectors of the covariance matrix
D, V = np.linalg.eigh(Q)
# print the results
np.set_printoptions(precision=2)
print('principal components:')
# np.linalg.eigh returns the eigenvalues in ascending order
for i in range(len(D)):
    print('Feature %d : %2.3f' % (i + 1, D[i]))
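Each eigenvalue is the variance captured along its eigenvector, so dividing by the total gives the fraction of variance each component explains (a short sketch using the D computed above):
# fraction of the total variance explained by each component
explained = D / D.sum()
for i, ratio in enumerate(explained):
    print('Feature %d explains %.1f%% of the variance' % (i + 1, 100 * ratio))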
# perform the linear transformation
X_new = X.dot(V)
# give the columns of X_new descriptive names
X_new.columns = ["Feature %d" % i for i in range(1, 5)]
# the correlations between different features disappear!
sns.heatmap(X_new.corr(), vmin=-1, vmax=1, annot=True, fmt='.2f')
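The heatmap shows this visually; to confirm numerically (a minimal check, not in the original), the off-diagonal entries of the correlation matrix of X_new are zero up to floating-point error:
# largest off-diagonal correlation between the transformed features
corr = X_new.corr().to_numpy()
print(np.abs(corr - np.eye(4)).max())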
# truncation: dimensionality reduction
# eigh gives ascending eigenvalues, so the last two columns of V
# are the eigenvectors with the largest eigenvalues
V_trun = V[:, 2:4]
X_reduced = X.dot(V_trun)
X_reduced.columns = ['Feature 3', 'Feature 4']
X_reduced['Species'] = iris.target
# replace the integer labels with the species names
for i, t in enumerate(target_names):
    X_reduced['Species'] = X_reduced['Species'].replace(i, t)
sns.scatterplot(data=X_reduced, x='Feature 3', y='Feature 4', hue='Species')
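Since the truncation keeps the eigenvectors with the two largest eigenvalues, the fraction of total variance retained can be read directly from D (a small sketch based on the eigenvalues computed above):
# variance retained by the two components kept after truncation
print('retained variance: %.1f%%' % (100 * D[2:].sum() / D.sum()))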
from sklearn.decomposition import PCA
X3 = X.to_numpy()
pca = PCA(n_components=3, random_state=0)
X3_reduced = pca.fit(X3).transform(X3)
X3_reduced = pd.DataFrame(X3_reduced)
X3_reduced['species'] = iris.target
# plot the results in a 3D scatter plot
from mpl_toolkits.mplot3d import Axes3D  # only needed to register the 3D projection on older matplotlib
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(6, 6))
# Axes3D(fig) no longer attaches the axes to the figure in recent matplotlib,
# so create the 3D axes through add_subplot instead
ax = fig.add_subplot(projection='3d')
for i in range(3):
    idx = X3_reduced['species'] == i
    ax.scatter(X3_reduced[0][idx], X3_reduced[1][idx], X3_reduced[2][idx], label=target_names[i])
ax.legend()
ax.view_init(20, 75)
ax.set_xlabel('Feature 2')
ax.set_ylabel('Feature 3')
ax.set_zlabel('Feature 4')
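As a consistency check (a sketch, not part of the original walkthrough), sklearn's explained_variance_ratio_ should agree with the eigenvalue ratios from the manual decomposition once the ordering is aligned, since sklearn sorts components by decreasing variance while eigh lists them in ascending order:
# compare sklearn's variance ratios with the manual eigenvalue ratios
print(pca.explained_variance_ratio_)   # first three components, descending
print((D / D.sum())[::-1][:3])         # manual ratios, reordered to match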