mpolinowski / python-scikitlearn-cheatsheet

SciKit Learn Machine Learning Cheat Sheet

Home Page: https://mpolinowski.github.io/docs/Development/Python/2023-05-20-python-sklearn-cheat-sheet/2023-05-20

Repository from Github: https://github.com/mpolinowski/python-scikitlearn-cheatsheet

scikit-learn - Machine Learning in Python

  • Simple and efficient tools for predictive data analysis
  • Accessible to everybody, and reusable in various contexts
  • Built on NumPy, SciPy, and matplotlib
  • Open source, commercially usable - BSD license

Regressions ++ Classifications ++ Clustering ++ Dimensionality Reduction ++ Model Selection ++ Pre-processing

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from sklearn import svm
from sklearn.cluster import KMeans
from sklearn.datasets import load_iris, load_wine, fetch_20newsgroups, fetch_openml
from sklearn.impute import MissingIndicator, SimpleImputer
from sklearn.ensemble import (
    RandomForestClassifier,
    RandomForestRegressor,
    GradientBoostingRegressor,
    AdaBoostRegressor,
    GradientBoostingClassifier,
    AdaBoostClassifier
)
from sklearn.feature_extraction.text import (
    CountVectorizer,
    TfidfTransformer,
    TfidfVectorizer
)
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    Ridge,
    ElasticNet
)
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    accuracy_score
)
from sklearn.model_selection import (
    train_test_split,
    GridSearchCV,
    cross_val_score,
    cross_validate
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import (
    MinMaxScaler,
    StandardScaler,
    OrdinalEncoder,
    LabelEncoder,
    OneHotEncoder,
    PolynomialFeatures
)
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

Working with Missing Values

# Toy 5x3 feature matrix with missing values for the imputation demos.
# NOTE: uses np.nan — the np.NaN alias was removed in NumPy 2.0.
X_missing = pd.DataFrame(
    np.array(
        [5, 2, 3, np.nan, np.nan, 4, -3, 2, 1, 8, np.nan, 4, 10, np.nan, 5]
    ).reshape(5, 3),
    columns=['f1', 'f2', 'f3']
)

X_missing
f1 f2 f3
0 5.0 2.0 3.0
1 NaN NaN 4.0
2 -3.0 2.0 1.0
3 8.0 NaN 4.0
4 10.0 NaN 5.0
X_missing.isnull().sum()

# f1    1
# f2    3
# f3    0
# dtype: int64

Missing Indicator

# Boolean mask marking where values were missing. By default only columns
# that contain at least one NaN are reported (f1 and f2 here — hence two
# output columns 'a1'/'a2').
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
indicator = MissingIndicator(missing_values=np.nan)
indicator = indicator.fit_transform(X_missing)
indicator = pd.DataFrame(indicator, columns=['a1', 'a2'])
indicator
a1 a2
0 False False
1 True True
2 False False
3 False True
4 False True

Simple Imputer

# Replace each NaN with the mean of its column.
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
X_filled_mean = pd.DataFrame(imputer_mean.fit_transform(X_missing))
X_filled_mean.columns = ['f1','f2','f3']
X_filled_mean
f1 f2 f3
0 5.0 2.0 3.0
1 5.0 2.0 4.0
2 -3.0 2.0 1.0
3 8.0 2.0 4.0
4 10.0 2.0 5.0
# Replace each NaN with the column median — more robust to outliers than
# the mean. np.nan replaces np.NaN, which was removed in NumPy 2.0.
imputer_median = SimpleImputer(missing_values=np.nan, strategy='median')
X_filled_median = pd.DataFrame(imputer_median.fit_transform(X_missing))
X_filled_median.columns = ['f1','f2','f3']
X_filled_median
f1 f2 f3
0 5.0 2.0 3.0
1 6.5 2.0 4.0
2 -3.0 2.0 1.0
3 8.0 2.0 4.0
4 10.0 2.0 5.0
# Replace each NaN with the most frequent value of its column.
# Renamed the imputer from `imputer_median` to match the strategy it uses;
# np.nan replaces np.NaN, which was removed in NumPy 2.0.
imputer_most_frequent = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_filled_median = pd.DataFrame(imputer_most_frequent.fit_transform(X_missing))
X_filled_median.columns = ['f1','f2','f3']
X_filled_median
f1 f2 f3
0 5.0 2.0 3.0
1 -3.0 2.0 4.0
2 -3.0 2.0 1.0
3 8.0 2.0 4.0
4 10.0 2.0 5.0

Drop Missing Data

X_missing_dropped = X_missing.dropna(axis=1)
X_missing_dropped
f3
0 3.0
1 4.0
2 1.0
3 4.0
4 5.0
X_missing_dropped = X_missing.dropna(axis=0).reset_index()
X_missing_dropped
f1 f2 f3
0 5.0 2.0 3.0
1 -3.0 2.0 1.0

Categorical Data Preprocessing

# Small categorical demo frame: f1 = gender, f2 = blood type, f3 = level.
X_cat_df = pd.DataFrame(
    data=[
        ['M', 'O-', 'medium'],
        ['M', 'O-', 'high'],
        ['F', 'O+', 'high'],
        ['F', 'AB', 'low'],
        ['F', 'B+', 'medium'],
    ],
    columns=['f1', 'f2', 'f3'],
)

X_cat_df
f1 f2 f3
0 M O- medium
1 M O- high
2 F O+ high
3 F AB low
4 F B+ medium

Ordinal Encoder

# OrdinalEncoder assigns integer codes in sorted category order:
# 'high' -> 0, 'low' -> 1, 'medium' -> 2 (alphabetical, NOT by severity).
encoder_ord = OrdinalEncoder(dtype='int')

# reshape(-1, 1): the encoder expects a 2-D column array, not a 1-D Series
X_cat_df.f3 = encoder_ord.fit_transform(X_cat_df.f3.values.reshape(-1, 1))
X_cat_df
f1 f2 f3
0 M O- 2
1 M O- 0
2 F O+ 0
3 F AB 1
4 F B+ 2

Label Encoder

# LabelEncoder is intended for 1-D target labels (for feature columns,
# OrdinalEncoder is the usual choice). Codes follow sorted order:
# 'AB' -> 0, 'B+' -> 1, 'O+' -> 2, 'O-' -> 3.
encoder_lab = LabelEncoder()
X_cat_df['f2'] = encoder_lab.fit_transform(X_cat_df['f2'])
X_cat_df
f1 f2 f3
0 M 3 2
1 M 3 0
2 F 2 0
3 F 0 1
4 F 1 2

OneHot Encoder

# One-hot encode the gender column f1 into one indicator column per category.
encoder_oh = OneHotEncoder(dtype='int')

onehot_df = pd.DataFrame(
    encoder_oh.fit_transform(X_cat_df[['f1']])
    .toarray(),
    # derive the labels from the fitted encoder instead of hard-coding
    # ['F', 'M'] — stays correct if the categories ever change
    columns=encoder_oh.categories_[0]
)

# carry the already-encoded columns over into the result frame
onehot_df['f2'] = X_cat_df.f2
onehot_df['f3'] = X_cat_df.f3
onehot_df
F M f2 f3
0 0 1 3 2
1 0 1 3 0
2 1 0 2 0
3 1 0 0 1
4 1 0 1 2

Loading SK Datasets

Toy Datasets

load_iris(*[, return_X_y, as_frame]) classification Load and return the iris dataset.
load_diabetes(*[, return_X_y, as_frame, scaled]) regression Load and return the diabetes dataset.
load_digits(*[, n_class, return_X_y, as_frame]) classification Load and return the digits dataset.
load_linnerud(*[, return_X_y, as_frame]) multi-output regression Load and return the physical exercise Linnerud dataset.
load_wine(*[, return_X_y, as_frame]) classification Load and return the wine dataset.
load_breast_cancer(*[, return_X_y, as_frame]) classification Load and return the breast cancer wisconsin dataset.
iris_ds = load_iris()
iris_data = iris_ds.data
col_names = iris_ds.feature_names
target_names = iris_ds.target_names

print(
    'Iris Dataset',
    '\n * Data array: ',
    iris_data.shape,
    '\n * Column names: ',
    col_names,
    '\n * Target names: ',
    target_names
)

# Iris Dataset 
#  * Data array:  (150, 4) 
#  * Column names:  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'] 
#  * Target names:  ['setosa' 'versicolor' 'virginica']
iris_df = pd.DataFrame(data=iris_data, columns=col_names)

iris_df.head()
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2

Real World Datasets

fetch_olivetti_faces(*[, data_home, ...]) classification Load the Olivetti faces data-set from AT&T.
fetch_20newsgroups(*[, data_home, subset, ...]) classification Load the filenames and data from the 20 newsgroups dataset.
fetch_20newsgroups_vectorized(*[, subset, ...]) classification Load and vectorize the 20 newsgroups dataset.
fetch_lfw_people(*[, data_home, funneled, ...]) classification Load the Labeled Faces in the Wild (LFW) people dataset.
fetch_lfw_pairs(*[, subset, data_home, ...]) classification Load the Labeled Faces in the Wild (LFW) pairs dataset.
fetch_covtype(*[, data_home, ...]) classification Load the covertype dataset.
fetch_rcv1(*[, data_home, subset, ...]) classification Load the RCV1 multilabel dataset.
fetch_kddcup99(*[, subset, data_home, ...]) classification Load the kddcup99 dataset.
fetch_california_housing(*[, data_home, ...]) regression Load the California housing dataset.
newsgroups_train = fetch_20newsgroups(subset='train')
train_data = newsgroups_train.data
col_names = newsgroups_train.filenames.shape
target_names = newsgroups_train.target.shape

print(
    'Newsgroup - Train Subset',
    '\n * Data array: ',
    len(train_data),
    '\n * Column names: ',
    col_names,
    '\n * Target names: ',
    target_names
)

# Newsgroup - Train Subset 
#  * Data array:  11314 
#  * Column names:  (11314,) 
#  * Target names:  (11314,)
print('Target Names: ', newsgroups_train.target_names)

# Target Names:  ['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']

OpenML Datasets

mice_ds = fetch_openml(name='miceprotein', version=4, parser="auto")
print(
    'Mice Protein Dataset',
    '\n * Data Shape: ',
    mice_ds.data.shape,
    '\n * Target Shape: ',
    mice_ds.target.shape,
    '\n * Target Names: ',
    np.unique(mice_ds.target)
)

# Mice Protein Dataset 
#  * Data Shape:  (1080, 77) 
#  * Target Shape:  (1080,) 
#  * Target Names:  ['c-CS-m' 'c-CS-s' 'c-SC-m' 'c-SC-s' 't-CS-m' 't-CS-s' 't-SC-m' 't-SC-s']
print(mice_ds.DESCR)

Supervised Learning - Regression Models

Simple Linear Regression

iris_df.plot(
    figsize=(12,5),
    kind='scatter',
    x='sepal length (cm)',
    y='sepal width (cm)',
    title='Iris Dataset :: Sepal Width&Height'
)

print(iris_df.corr())

The Sepal Width has very little correlation to all other metrics but itself. While the other three correlate nicely:

sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
sepal length (cm) 1.000000 -0.117570 0.871754 0.817941
sepal width (cm) -0.117570 1.000000 -0.428440 -0.366126
petal length (cm) 0.871754 -0.428440 1.000000 0.962865
petal width (cm) 0.817941 -0.366126 0.962865 1.000000

scikit-learn - Machine Learning in Python

scikit-learn - Machine Learning in Python

Data Pre-processing

iris_df['petal length (cm)'][:1]
# 0    1.4
# Name: petal length (cm), dtype: float64
iris_df['petal length (cm)'].values.reshape(-1,1)[:1]
# array([[1.4]])
# scikit-learn expects a 2-D input => reshape the 1-D Series into a column vector
X = iris_df['petal length (cm)'].values.reshape(-1,1)
y = iris_df['petal width (cm)'].values.reshape(-1,1)
# train/test split
# NOTE(review): no random_state is set, so the split (and all numbers
# derived from it below) changes on every run
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)
print(X_train.shape, X_test.shape)
# (120, 1) (30, 1) 80:20 split

Model Training

# Fit a simple linear regression: petal width ~ petal length
regressor = LinearRegression()
regressor.fit(X_train,y_train)

# fitted line parameters: y = slope * x + intercept
intercept = regressor.intercept_
slope = regressor.coef_

print(' Intercept: ', intercept, '\n Slope: ', slope)
#  Intercept:  [-0.35135666] 
#  Slope:  [[0.41310505]]

Predictions

# predict() expects a 2-D array of samples, hence the wrapping list
y_pred = regressor.predict([X_test[0]])
print(' Prediction: ', y_pred, '\n True Value: ', y_test[0])
#  Prediction:  [[0.22699041]] 
#  True Value:  [0.2]
def predict(value):
    # Manual y = m*x + b using the fitted coefficients; [0][0] unwraps
    # the broadcast 2-D result to a plain scalar.
    return (slope*value + intercept)[0][0]
print('Prediction: ', predict(X_test[0]))
# Prediction:  0.22699041280334376
iris_df['petal width (cm) prediction'] = iris_df['petal length (cm)'].apply(predict)
print(' Prediction: ', iris_df['petal width (cm) prediction'][0], '\n True Value: ', iris_df['petal width (cm)'][0])
#  Prediction:  0.22699041280334376 
#  True Value:  0.2
iris_df.head(10)
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) petal width (cm) prediction
0 5.1 3.5 1.4 0.2 0.226990
1 4.9 3.0 1.4 0.2 0.226990
2 4.7 3.2 1.3 0.2 0.185680
3 4.6 3.1 1.5 0.2 0.268301
4 5.0 3.6 1.4 0.2 0.226990
5 5.4 3.9 1.7 0.4 0.350922
6 4.6 3.4 1.4 0.3 0.226990
7 5.0 3.4 1.5 0.2 0.268301
8 4.4 2.9 1.4 0.2 0.226990
9 4.9 3.1 1.5 0.1 0.268301
iris_df.plot(
    figsize=(12,5),
    kind='scatter',
    x='petal width (cm)',
    y='petal width (cm) prediction',
    # no value in colorizing..just looks pretty
    c='petal width (cm) prediction',
    colormap='summer',
    title='Iris Dataset - Sepal Width True vs Prediction'
)

scikit-learn - Machine Learning in Python

Model Evaluation

mae = mean_absolute_error(
    iris_df['petal width (cm)'],
    iris_df['petal width (cm) prediction']
)

mse = mean_squared_error(
    iris_df['petal width (cm)'],
    iris_df['petal width (cm) prediction']
)

rmse = np.sqrt(mse)

print(' MAE: ', mae, '\n MSE: ', mse, '\n RMSE: ', rmse)

#  MAE:  0.1569441318761155 
#  MSE:  0.04209214667485277 
#  RMSE:  0.2051637070118708

ElasticNet Regression

Dataset

!wget https://raw.githubusercontent.com/Satish-Vennapu/DataScience/main/AMES_Final_DF.csv -P datasets
ames_df = pd.read_csv('datasets/AMES_Final_DF.csv')
ames_df.head(5).transpose()
0 1 2 3 4
Lot Frontage 141.0 80.0 81.0 93.0 74.0
Lot Area 31770.0 11622.0 14267.0 11160.0 13830.0
Overall Qual 6.0 5.0 6.0 7.0 5.0
Overall Cond 5.0 6.0 6.0 5.0 5.0
Year Built 1960.0 1961.0 1958.0 1968.0 1997.0
...
Sale Condition_AdjLand 0.0 0.0 0.0 0.0 0.0
Sale Condition_Alloca 0.0 0.0 0.0 0.0 0.0
Sale Condition_Family 0.0 0.0 0.0 0.0 0.0
Sale Condition_Normal 1.0 1.0 1.0 1.0 1.0
Sale Condition_Partial 0.0 0.0 0.0 0.0 0.0
274 rows × 5 columns
# the target value is:
ames_df['SalePrice']
0 215000
1 105000
2 172000
3 244000
4 189900
...
2920 142500
2921 131000
2922 132000
2923 170000
2924 188000
Name: SalePrice, Length: 2925, dtype: int64

Preprocessing

# remove target column from training dataset
X_ames = ames_df.drop('SalePrice', axis=1)
y_ames = ames_df['SalePrice']

print(X_ames.shape, y_ames.shape)
# (2925, 273) (2925,)
# train/test split
X_ames_train, X_ames_test, y_ames_train, y_ames_test = train_test_split(
    X_ames,
    y_ames,
    test_size=0.1,
    random_state=101
)

print(X_ames_train.shape, X_ames_test.shape)
# (2632, 273) (293, 273)
# normalize feature set
scaler = StandardScaler()
X_ames_train_scaled = scaler.fit_transform(X_ames_train)

X_ames_test_scaled = scaler.transform(X_ames_test)

Grid Search for Hyperparameters

base_ames_elastic_net_model = ElasticNet(max_iter=int(1e4))
param_grid = {
    'alpha': [50, 75, 100, 125, 150],
    'l1_ratio':[0.2, 0.4, 0.6, 0.8, 1.0]
}
grid_ames_model = GridSearchCV(
    estimator=base_ames_elastic_net_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5, verbose=1
)

grid_ames_model.fit(X_ames_train_scaled, y_ames_train)

print(
    'Results:\nBest Estimator: ',
    grid_ames_model.best_estimator_,
    '\nBest Hyperparameter: ',
    grid_ames_model.best_params_
)

Results:

  • Best Estimator: ElasticNet(alpha=125, l1_ratio=1.0, max_iter=10000)
  • Best Hyperparameter: {'alpha': 125, 'l1_ratio': 1.0}

Model Evaluation

y_ames_pred = grid_ames_model.predict(X_ames_test_scaled)

print(
    'MAE: ',
    mean_absolute_error(y_ames_test, y_ames_pred),
    'MSE: ',
    mean_squared_error(y_ames_test, y_ames_pred),
    'RMSE: ',
    np.sqrt(mean_squared_error(y_ames_test, y_ames_pred))
)

# MAE:  14185.506207185055 MSE:  422714457.5190704 RMSE:  20560.020854052418
# average SalePrize
np.mean(ames_df['SalePrice'])
# 180815.53743589742

# Express the test-set MAE as a percentage of the mean sale price.
rel_error_avg = mean_absolute_error(y_ames_test, y_ames_pred) * 100 / np.mean(ames_df['SalePrice'])
# fixed typo in the user-facing message: "Pridictions" -> "Predictions"
print('Predictions are on average off by: ', rel_error_avg.round(2), '%')
# Predictions are on average off by:  7.85 %
plt.figure(figsize=(10,4))

plt.scatter(y_ames_test,y_ames_pred, c='mediumspringgreen', s=3)
plt.axline((0, 0), slope=1, color='dodgerblue', linestyle=(':'))

plt.title('Prediction Accuracy :: MAE:'+ str(mean_absolute_error(y_ames_test, y_ames_pred).round(2)) + 'US$')
plt.xlabel('True Sales Price')
plt.ylabel('Predicted Sales Price')
plt.savefig('assets/Scikit_Learn_11.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Multiple Linear Regression

Above I used the petal width and length to create a linear regression model. But as explored earlier we can also use the sepal length (only the sepal width does not show a linear correlation):

print(iris_df.corr())
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
sepal length (cm) 1.000000 -0.117570 0.871754 0.817941
sepal width (cm) -0.117570 1.000000 -0.428440 -0.366126
petal length (cm) 0.871754 -0.428440 1.000000 0.962865
petal width (cm) 0.817941 -0.366126 0.962865 1.000000
X_multi = iris_df[['petal length (cm)', 'sepal length (cm)']]
y = iris_df['petal width (cm)']
regressor_multi = LinearRegression()
regressor_multi.fit(X_multi, y)

intercept_multi = regressor_multi.intercept_
slope_multi = regressor_multi.coef_

print(' Intercept: ', intercept_multi, '\n Slope: ', slope_multi)

#  Intercept:  -0.00899597269816943 
#  Slope:  [ 0.44937611 -0.08221782]
def predict_multi(petal_length, sepal_length):
    """Manually apply the fitted two-feature line: b + m1*x1 + m2*x2."""
    coef_petal, coef_sepal = slope_multi
    return intercept_multi + coef_petal * petal_length + coef_sepal * sepal_length
y_pred = predict_multi(
    iris_df['petal length (cm)'][0],
    iris_df['sepal length (cm)'][0]
)

print(' Prediction: ', y_pred, '\n True value: ', iris_df['petal width (cm)'][0])
#  Prediction:  0.20081970121763193 
#  True value:  0.2
iris_df['petal width (cm) prediction (multi)'] = (
    (
        slope_multi[0] * iris_df['petal length (cm)']
    ) + (
        slope_multi[1] * iris_df['sepal length (cm)']
    ) + (
        intercept_multi
    ) 
)
iris_df.head(10)
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) petal width (cm) prediction petal width (cm) prediction (multi)
0 5.1 3.5 1.4 0.2 0.226990 0.200820
1 4.9 3.0 1.4 0.2 0.226990 0.217263
2 4.7 3.2 1.3 0.2 0.185680 0.188769
3 4.6 3.1 1.5 0.2 0.268301 0.286866
4 5.0 3.6 1.4 0.2 0.226990 0.209041
5 5.4 3.9 1.7 0.4 0.350922 0.310967
6 4.6 3.4 1.4 0.3 0.226990 0.241929
7 5.0 3.4 1.5 0.2 0.268301 0.253979
8 4.4 2.9 1.4 0.2 0.226990 0.258372
9 4.9 3.1 1.5 0.1 0.268301 0.262201
iris_df.plot(
    figsize=(12,5),
    kind='scatter',
    x='petal width (cm)',
    y='petal width (cm) prediction (multi)',
    c='petal width (cm) prediction',
    colormap='summer',
    title='Iris Dataset - Sepal Width True vs Prediction (multi)'
)

scikit-learn - Machine Learning in Python

mae_multi = mean_absolute_error(
    iris_df['petal width (cm)'],
    iris_df['petal width (cm) prediction (multi)']
)

mse_multi = mean_squared_error(
    iris_df['petal width (cm)'],
    iris_df['petal width (cm) prediction (multi)']
)

rmse_multi = np.sqrt(mse_multi)

print(' MAE_Multi: ', mae_multi,' MAE: ', mae, '\n MSE_Multi: ', mse_multi, ' MSE: ', mse, '\n RMSE_Multi: ', rmse_multi, ' RMSE: ', rmse)

The accuracy of the model was improved by adding an additional, correlating value:

Multi Regression Single Regression
Mean Absolute Error 0.15562108079300102 0.1569441318761155
Mean Squared Error 0.04096208526408982 0.04209214667485277
Root Mean Squared Error 0.20239092189149646 0.2051637070118708

Supervised Learning - Logistic Regression Model

Binary Logistic Regression

Dataset

np.random.seed(666)  # fixed seed so the demo numbers are reproducible

# generate 10 feature values in [0, 10) — randint(10) draws from 0-9
x_data_logistic_binary = np.random.randint(10, size=(10)).reshape(-1, 1)
# generate a random binary class label (0 or 1) for each sample
y_data_logistic_binary = np.random.randint(2, size=10)

Model Fitting

logistic_binary_model = LogisticRegression(
    solver='liblinear',
    C=10.0,
    random_state=0
)

logistic_binary_model.fit(x_data_logistic_binary, y_data_logistic_binary)

intercept_logistic_binary = logistic_binary_model.intercept_
slope_logistic_binary = logistic_binary_model.coef_

print(' Intercept: ', intercept_logistic_binary, '\n Slope: ', slope_logistic_binary)

#  Intercept:  [-0.4832956] 
#  Slope:  [[0.11180522]]

Model Predictions

# Class-membership probabilities and hard class predictions
prob_pred_logistic_binary = logistic_binary_model.predict_proba(x_data_logistic_binary)
y_pred_logistic_binary = logistic_binary_model.predict(x_data_logistic_binary)


# BUG FIX: was printing the undefined name `prob_pred` (NameError)
print('Prediction Probabilities: ', prob_pred_logistic_binary[:1])

# count how many samples were assigned to each class
unique, counts = np.unique(y_pred_logistic_binary, return_counts=True)
print('Classes: ', unique, '| Number of Class Instances: ', counts)

# probabilities e.g. below -> 58% certainty that the first element is class 0

# Prediction Probabilities:  [[0.58097284 0.41902716]]
# Classes:  [0 1] | Number of Class Instances:  [5 5]

Model Evaluation

conf_mtx = confusion_matrix(y_data_logistic_binary, y_pred_logistic_binary)
conf_mtx

# scikit-learn's layout is C[i, j] = samples of true class i predicted as
# class j, so for binary labels [0, 1] the matrix reads:
# [2, 3] [TN, FP]
# [3, 2] [FN, TP]

scikit-learn - Machine Learning in Python

report = classification_report(y_data_logistic_binary, y_pred_logistic_binary)
print(report)
precision recall f1-score support
0 0.40 0.40 0.40 5
1 0.40 0.40 0.40 5
accuracy 0.40 10
macro avg 0.40 0.40 0.40 10
weighted avg 0.40 0.40 0.40 10

Logistic Regression Pipelines

Dataset Preprocessing

iris_ds = load_iris()

# train/test split
X_train_iris, X_test_iris, y_train_iris, y_test_iris = train_test_split(
    iris_ds.data,
    iris_ds.target,
    test_size=0.2,
    random_state=42
)
print(X_train_iris.shape, X_test_iris.shape)
# (120, 4) (30, 4)

Pipeline

# Chain feature scaling and classification into a single estimator:
# MinMax-scale to [0, 1], then fit a logistic-regression classifier.
pipe_iris = Pipeline(
    steps=[
        ('minmax', MinMaxScaler()),
        ('log_reg', LogisticRegression()),
    ]
)

pipe_iris.fit(X_train_iris, y_train_iris)
iris_score = pipe_iris.score(X_test_iris, y_test_iris)
print('Prediction Accuracy: ', iris_score.round(4)*100, '%')
# Prediction Accuracy:  96.67 %

Cross Validation

Train | Test Split
!wget https://raw.githubusercontent.com/reisanar/datasets/master/Advertising.csv -P datasets
adv_df = pd.read_csv('datasets/Advertising.csv')
adv_df.head(5)
TV Radio Newspaper Sales
0 230.1 37.8 69.2 22.1
1 44.5 39.3 45.1 10.4
2 17.2 45.9 69.3 9.3
3 151.5 41.3 58.5 18.5
4 180.8 10.8 58.4 12.9
# Split ds into features and targets
X_adv = adv_df.drop('Sales', axis=1)
y_adv = adv_df['Sales']
# 70:30 train/test split
X_adv_train, X_adv_test, y_adv_train, y_adv_test = train_test_split(
    X_adv, y_adv, test_size=0.3, random_state=666
)

print(X_adv_train.shape, y_adv_train.shape)
# (140, 3) (140,)
# normalize features
scaler_adv = StandardScaler()
scaler_adv.fit(X_adv_train)

X_adv_train = scaler_adv.transform(X_adv_train)
X_adv_test = scaler_adv.transform(X_adv_test)
Model Fitting
model_adv1 = Ridge(
    alpha=100.0
)

model_adv1.fit(X_adv_train, y_adv_train)
Model Evaluation
y_adv_pred = model_adv1.predict(X_adv_test)

mean_squared_error(y_adv_test, y_adv_pred)
# 6.528575771818745
Adjusting Hyper Parameter
model_adv2 = Ridge(
    alpha=1.0
)

model_adv2.fit(X_adv_train, y_adv_train)
y_adv_pred2 = model_adv2.predict(X_adv_test)
mean_squared_error(y_adv_test, y_adv_pred2)
# 2.3319016551123535

Train | Validation | Test Split

# 70:30 train/temp split
X_adv_train, X_adv_temp, y_adv_train, y_adv_temp = train_test_split(
    X_adv, y_adv, test_size=0.3, random_state=666
)

# 50:50 test/val split
X_adv_test, X_adv_val, y_adv_test, y_adv_val = train_test_split(
    X_adv_temp, y_adv_temp, test_size=0.5, random_state=666
)

print(X_adv_train.shape, X_adv_test.shape, X_adv_val.shape)
# (140, 3) (30, 3) (30, 3)
# normalize features
scaler_adv = StandardScaler()
scaler_adv.fit(X_adv_train)

X_adv_train = scaler_adv.transform(X_adv_train)
X_adv_test = scaler_adv.transform(X_adv_test)
X_adv_val = scaler_adv.transform(X_adv_val)
Model Fitting and Evaluation
model_adv3 = Ridge(
    alpha=100.0
)

model_adv3.fit(X_adv_train, y_adv_train)
# do evaluation with the validation set
y_adv_pred3 = model_adv3.predict(X_adv_val)
mean_squared_error(y_adv_val, y_adv_pred3)
# 7.136230975501291
Adjusting Hyper Parameter
model_adv4 = Ridge(
    alpha=1.0
)

model_adv4.fit(X_adv_train, y_adv_train)

y_adv_pred4 = model_adv4.predict(X_adv_val)
mean_squared_error(y_adv_val, y_adv_pred4)
# 2.6393803874124435
# only once you are certain that you have the best performance
# do a final evaluation with the test set
y_adv4_final_pred = model_adv4.predict(X_adv_test)
mean_squared_error(y_adv_test, y_adv4_final_pred)
# 2.024422922812264

k-fold Cross Validation

Do a train/test split and segment the training set by k-folds (e.g. 5-10) and use each of those segments once to validate a training step. The resulting error is the average of all k errors.

Train-Test Split
# 70:30 train/temp split
X_adv_train, X_adv_test, y_adv_train, y_adv_test = train_test_split(
    X_adv, y_adv, test_size=0.3, random_state=666
)
# normalize features
scaler_adv = StandardScaler()
scaler_adv.fit(X_adv_train)

X_adv_train = scaler_adv.transform(X_adv_train)
X_adv_test = scaler_adv.transform(X_adv_test)
Model Scoring
model_adv5 = Ridge(
    alpha=100.0
)
# do a 5-fold cross-eval
scores = cross_val_score(
    estimator=model_adv5,
    X=X_adv_train,
    y=y_adv_train,
    scoring='neg_mean_squared_error',
    cv=5
)

# take the mean of all five neg. error values
abs(scores.mean())
# 8.688107513529168
Adjusting Hyper Parameter
model_adv6 = Ridge(
    alpha=1.0
)
# do a 5-fold cross-eval
scores = cross_val_score(
    estimator=model_adv6,
    X=X_adv_train,
    y=y_adv_train,
    scoring='neg_mean_squared_error',
    cv=5
)

# take the mean of all five neg. error values
abs(scores.mean())
# 3.3419582340688576
Model Fitting and Final Evaluation
model_adv6.fit(X_adv_train, y_adv_train)

y_adv6_final_pred = model_adv6.predict(X_adv_test)
mean_squared_error(y_adv_test, y_adv6_final_pred)
# 2.3319016551123535

Cross Validate

Dataset (re-import)
adv_df = pd.read_csv('datasets/Advertising.csv')
X_adv = adv_df.drop('Sales', axis=1)
y_adv = adv_df['Sales']
# 70:30 train/test split
X_adv_train, X_adv_test, y_adv_train, y_adv_test = train_test_split(
    X_adv, y_adv, test_size=0.3, random_state=666
)
# normalize features
scaler_adv = StandardScaler()
scaler_adv.fit(X_adv_train)

X_adv_train = scaler_adv.transform(X_adv_train)
X_adv_test = scaler_adv.transform(X_adv_test)
Model Scoring
model_adv7 = Ridge(
    alpha=100.0
)
scores = cross_validate(
    model_adv7,
    X_adv_train,
    y_adv_train,
    scoring=[
        'neg_mean_squared_error',
        'neg_mean_absolute_error'
    ],
    cv=10
)
scores_df = pd.DataFrame(scores)
scores_df
fit_time score_time test_neg_mean_squared_error test_neg_mean_absolute_error
0 0.016399 0.000749 -12.539147 -2.851864
1 0.000684 0.000452 -2.806466 -1.423516
2 0.000937 0.000782 -11.142227 -2.740332
3 0.001060 0.000633 -7.237347 -2.196963
4 0.001045 0.000738 -11.313985 -2.690813
5 0.000650 0.000510 -3.169169 -1.526568
6 0.000698 0.000429 -6.578249 -1.727616
7 0.000600 0.000423 -5.740245 -1.640964
8 0.000565 0.000463 -10.268075 -2.415688
9 0.000562 0.000487 -10.641669 -1.974407
abs(scores_df.mean())
fit_time 0.002320
score_time 0.000566
test_neg_mean_squared_error 8.143658
test_neg_mean_absolute_error 2.118873
dtype: float64
Adjusting Hyper Parameter
model_adv8 = Ridge(
    alpha=1.0
)
scores = cross_validate(
    model_adv8,
    X_adv_train,
    y_adv_train,
    scoring=[
        'neg_mean_squared_error',
        'neg_mean_absolute_error'
    ],
    cv=10
)

abs(pd.DataFrame(scores).mean())
fit_time 0.001141
score_time 0.000777
test_neg_mean_squared_error 3.272673
test_neg_mean_absolute_error 1.345709
dtype: float64
Model Fitting and Final Evaluation
model_adv8.fit(X_adv_train, y_adv_train)

y_adv8_final_pred = model_adv8.predict(X_adv_test)
mean_squared_error(y_adv_test, y_adv8_final_pred)
# 2.3319016551123535

Grid Search

Loop through a set of hyperparameters to find an optimum.

Hyperparameter Search
base_elastic_net_model = ElasticNet()
param_grid = {
    'alpha': [0.1, 1, 5, 10, 50, 100],
    'l1_ratio':[0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
}
grid_model = GridSearchCV(
    estimator=base_elastic_net_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5, verbose=2
)

grid_model.fit(X_adv_train, y_adv_train)

print(
    'Results:\nBest Estimator: ',
    grid_model.best_estimator_,
    '\nBest Hyperparameter: ',
    grid_model.best_params_
)

Results:

  • Best Estimator: ElasticNet(alpha=0.1, l1_ratio=1.0)
  • Best Hyperparameter: {'alpha': 0.1, 'l1_ratio': 1.0}
gridcv_results = pd.DataFrame(grid_model.cv_results_)
mean_fit_time std_fit_time mean_score_time std_score_time param_alpha param_l1_ratio params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
0 0.001156 0.000160 0.000449 0.000038 0.1 0.1 {'alpha': 0.1, 'l1_ratio': 0.1} -1.924119 -3.384152 -3.588444 -3.703040 -5.091974 -3.538346 1.007264 6
1 0.001144 0.000181 0.000407 0.000091 0.1 0.3 {'alpha': 0.1, 'l1_ratio': 0.3} -1.867117 -3.304382 -3.561106 -3.623188 -5.061781 -3.483515 1.016000 5
2 0.000623 0.000026 0.000272 0.000052 0.1 0.5 {'alpha': 0.1, 'l1_ratio': 0.5} -1.812633 -3.220727 -3.539711 -3.547572 -5.043259 -3.432780 1.028406 4
3 0.000932 0.000165 0.000321 0.000060 0.1 0.7 {'alpha': 0.1, 'l1_ratio': 0.7} -1.750153 -3.144120 -3.525226 -3.477228 -5.034008 -3.386147 1.046722 3
4 0.000725 0.000106 0.000259 0.000024 0.1 0.9 {'alpha': 0.1, 'l1_ratio': 0.9} -1.693440 -3.075686 -3.518777 -3.413393 -5.029683 -3.346196 1.065195 2
5 0.000654 0.000053 0.000274 0.000026 0.1 1.0 {'alpha': 0.1, 'l1_ratio': 1.0} -1.667506 -3.044928 -3.518866 -3.384363 -5.031297 -3.329392 1.075006 1
6 0.000595 0.000016 0.000244 0.000002 1 0.1 {'alpha': 1, 'l1_ratio': 0.1} -8.575470 -11.021534 -8.212152 -6.808719 -10.792072 -9.081990 1.604192 12
7 0.000591 0.000018 0.000244 0.000002 1 0.3 {'alpha': 1, 'l1_ratio': 0.3} -8.131855 -10.448423 -7.774620 -6.179358 -10.071728 -8.521197 1.569173 11
8 0.000628 0.000049 0.000266 0.000023 1 0.5 {'alpha': 1, 'l1_ratio': 0.5} -7.519809 -9.562473 -7.261824 -5.453399 -9.213320 -7.802165 1.481785 10
9 0.000594 0.000015 0.000243 0.000002 1 0.7 {'alpha': 1, 'l1_ratio': 0.7} -6.614835 -8.351711 -6.702104 -4.698977 -8.230616 -6.919649 1.329741 9
10 0.000714 0.000108 0.000268 0.000033 1 0.9 {'alpha': 1, 'l1_ratio': 0.9} -5.537250 -6.887828 -6.148400 -4.106124 -7.101573 -5.956235 1.078430 8
11 0.000649 0.000067 0.000263 0.000028 1 1.0 {'alpha': 1, 'l1_ratio': 1.0} -4.932027 -6.058207 -5.892529 -3.798441 -6.472871 -5.430815 0.959804 7
12 0.000645 0.000042 0.000264 0.000040 5 0.1 {'alpha': 5, 'l1_ratio': 0.1} -21.863798 -25.767488 -18.768865 -12.608680 -23.207907 -20.443347 4.520904 13
13 0.000617 0.000030 0.000281 0.000038 5 0.3 {'alpha': 5, 'l1_ratio': 0.3} -23.626694 -27.439028 -20.266203 -12.788078 -24.609195 -21.745840 5.031493 14
14 0.000599 0.000011 0.000249 0.000013 5 0.5 {'alpha': 5, 'l1_ratio': 0.5} -26.202964 -29.867138 -22.527913 -13.423857 -26.835934 -23.771561 5.675911 15
15 0.000588 0.000013 0.000276 0.000035 5 0.7 {'alpha': 5, 'l1_ratio': 0.7} -27.768946 -33.428462 -23.506474 -14.599984 -29.112276 -25.683228 6.382379 17
16 0.000580 0.000003 0.000271 0.000001 5 0.9 {'alpha': 5, 'l1_ratio': 0.9} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
17 0.000591 0.000011 0.000259 0.000021 5 1.0 {'alpha': 5, 'l1_ratio': 1.0} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
18 0.000632 0.000028 0.000250 0.000012 10 0.1 {'alpha': 10, 'l1_ratio': 0.1} -26.179546 -30.396420 -22.386698 -14.596498 -27.292337 -24.170300 5.429322 16
19 0.000593 0.000020 0.000239 0.000001 10 0.3 {'alpha': 10, 'l1_ratio': 0.3} -28.704426 -33.379967 -24.561645 -15.634153 -29.883725 -26.432783 6.090062 18
20 0.000595 0.000036 0.000245 0.000013 10 0.5 {'alpha': 10, 'l1_ratio': 0.5} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
21 0.000610 0.000053 0.000258 0.000015 10 0.7 {'alpha': 10, 'l1_ratio': 0.7} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
22 0.000597 0.000022 0.000248 0.000015 10 0.9 {'alpha': 10, 'l1_ratio': 0.9} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
23 0.000623 0.000057 0.000305 0.000076 10 1.0 {'alpha': 10, 'l1_ratio': 1.0} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
24 0.000602 0.000016 0.000252 0.000013 50 0.1 {'alpha': 50, 'l1_ratio': 0.1} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
25 0.000577 0.000009 0.000238 0.000001 50 0.3 {'alpha': 50, 'l1_ratio': 0.3} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
26 0.000607 0.000046 0.000245 0.000010 50 0.5 {'alpha': 50, 'l1_ratio': 0.5} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
27 0.000569 0.000004 0.000259 0.000012 50 0.7 {'alpha': 50, 'l1_ratio': 0.7} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
28 0.000582 0.000022 0.000244 0.000011 50 0.9 {'alpha': 50, 'l1_ratio': 0.9} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
29 0.000603 0.000041 0.000251 0.000015 50 1.0 {'alpha': 50, 'l1_ratio': 1.0} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
30 0.000670 0.000106 0.000251 0.000013 100 0.1 {'alpha': 100, 'l1_ratio': 0.1} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
31 0.000764 0.000179 0.000343 0.000054 100 0.3 {'alpha': 100, 'l1_ratio': 0.3} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
32 0.000623 0.000077 0.000244 0.000007 100 0.5 {'alpha': 100, 'l1_ratio': 0.5} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
33 0.000817 0.000156 0.000329 0.000076 100 0.7 {'alpha': 100, 'l1_ratio': 0.7} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
34 0.000590 0.000017 0.000242 0.000004 100 0.9 {'alpha': 100, 'l1_ratio': 0.9} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
35 0.000595 0.000027 0.000242 0.000007 100 1.0 {'alpha': 100, 'l1_ratio': 1.0} -29.868949 -34.423737 -25.623955 -16.750237 -31.056181 -27.544612 6.087093 19
gridcv_results[
    [
        'param_alpha',
        'param_l1_ratio'
    ]
].plot(title='Grid Search Hyperparameter :: Parameter', figsize=(12,8))

scikit-learn - Machine Learning in Python

gridcv_results[
    [
        'mean_fit_time',
        'std_fit_time',
        'mean_score_time'
    ]
].plot(title='Grid Search Hyperparameter :: Timing', figsize=(12,8))

scikit-learn - Machine Learning in Python

# plot the per-split cross-validation test scores of the grid search
# (title fixed: this chart shows scores, not parameters — the original
# title was copy-pasted from the parameter plot above)
gridcv_results[
    [
        'split0_test_score',
        'split1_test_score',
        'split2_test_score',
        'split3_test_score',
        'split4_test_score',
        'mean_test_score',
        'std_test_score',
        'rank_test_score'
    ]
].plot(title='Grid Search Hyperparameter :: Scores', figsize=(12,8))

scikit-learn - Machine Learning in Python

Model Evaluation
y_grid_pred = grid_model.predict(X_adv_test)

mean_squared_error(y_adv_test, y_grid_pred)
# 2.380865536033581

Supervised Learning - KNN Algorithm

Dataset

wine = load_wine()
print(wine.data.shape)
print(wine.feature_names)
print(wine.data[:1])

# (178, 13)
# ['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']
# [[1.423e+01 1.710e+00 2.430e+00 1.560e+01 1.270e+02 2.800e+00 3.060e+00
#   2.800e-01 2.290e+00 5.640e+00 1.040e+00 3.920e+00 1.065e+03]]
wine_df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
wine_df.head(2).T
0 1
alcohol 14.23 13.20
malic_acid 1.71 1.78
ash 2.43 2.14
alcalinity_of_ash 15.60 11.20
magnesium 127.00 100.00
total_phenols 2.80 2.65
flavanoids 3.06 2.76
nonflavanoid_phenols 0.28 0.26
proanthocyanins 2.29 1.28
color_intensity 5.64 4.38
hue 1.04 1.05
od280/od315_of_diluted_wines 3.92 3.40
proline 1065.00 1050.00

Data Pre-processing

# normalization: scale every feature into the [0, 1] range
scaler = MinMaxScaler()
# fit_transform() already fits the scaler — the separate fit() call
# the original made right before it was redundant
wine_norm = scaler.fit_transform(wine.data)
# train/test split (no random_state, so the split differs between runs)
X_train_wine, X_test_wine, y_train_wine, y_test_wine = train_test_split(
    wine_norm,
    wine.target,
    test_size=0.3
)

print(X_train_wine.shape, X_test_wine.shape)
# (124, 13) (54, 13)

Model Fitting

# compare KNN accuracy for increasing neighbourhood sizes (k = 3, 5, 7)
# model for k=3
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_wine, y_train_wine)

y_pred_wine_knn3 = knn.predict(X_test_wine)
print('Accuracy Score: ', (accuracy_score(y_test_wine, y_pred_wine_knn3)*100).round(2), '%')
# Accuracy Score:  98.15 %
# model for k=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_wine, y_train_wine)

y_pred_wine_knn5 = knn.predict(X_test_wine)
print('Accuracy Score: ', (accuracy_score(y_test_wine, y_pred_wine_knn5)*100).round(2), '%')
# Accuracy Score:  98.15 %
# model for k=7
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_wine, y_train_wine)

y_pred_wine_knn7 = knn.predict(X_test_wine)
print('Accuracy Score: ', (accuracy_score(y_test_wine, y_pred_wine_knn7)*100).round(2), '%')
# Accuracy Score:  96.3 %
# model for k=9 (fixed: the original re-ran k=7 here by copy-paste mistake,
# so the k=9 configuration was never actually evaluated)
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(X_train_wine, y_train_wine)

y_pred_wine_knn9 = knn.predict(X_test_wine)
print('Accuracy Score: ', (accuracy_score(y_test_wine, y_pred_wine_knn9)*100).round(2), '%')
# (the 96.3 % shown previously belonged to the duplicated k=7 run)

Supervised Learning - Decision Tree Classifier

  • Does not require normalization
  • Is not sensitive to missing values

Dataset

!wget https://gist.githubusercontent.com/Dviejopomata/ea5869ba4dcff84f8c294dc7402cd4a9/raw/4671f90b8b04ba4db9d67acafaa4c0827cd233c2/bill_authentication.csv -P datasets
bill_auth_df = pd.read_csv('datasets/bill_authentication.csv')
bill_auth_df.head(3)
Variance Skewness Curtosis Entropy Class
0 3.6216 8.6661 -2.8073 -0.44699 0
1 4.5459 8.1674 -2.4586 -1.46210 0
2 3.8660 -2.6383 1.9242 0.10645 0

Preprocessing

# remove target feature from training set
X_bill = bill_auth_df.drop('Class', axis=1)
y_bill = bill_auth_df['Class']
X_train_bill, X_test_bill, y_train_bill, y_test_bill = train_test_split(X_bill, y_bill, test_size=0.2)

Model Fitting

tree_classifier = DecisionTreeClassifier()

tree_classifier.fit(X_train_bill, y_train_bill)

Evaluation

y_pred_bill = tree_classifier.predict(X_test_bill)
conf_mtx_bill = confusion_matrix(y_test_bill, y_pred_bill)
conf_mtx_bill

# array([[150,   2],
#        [  4, 119]])
conf_mtx_bill_plot = ConfusionMatrixDisplay(
    confusion_matrix=conf_mtx_bill,
    display_labels=[False,True]
)

conf_mtx_bill_plot.plot()
plt.show()

scikit-learn - Machine Learning in Python

report_bill = classification_report(
    y_test_bill, y_pred_bill
)
print(report_bill)
precision recall f1-score support
0 0.97 0.99 0.98 152
1 0.98 0.97 0.98 123
accuracy 0.98 275
macro avg 0.98 0.98 0.98 275
weighted avg 0.98 0.98 0.98 275

Supervised Learning - Random Forest Classifier

  • Does not require normalization
  • Is not sensitive to missing values
  • Low risk of overfitting
  • Efficient with large datasets
  • High accuracy

Dataset

!wget https://raw.githubusercontent.com/xjcjiacheng/data-analysis/master/heart%20disease%20UCI/heart.csv -P datasets
heart_df = pd.read_csv('datasets/heart.csv')
heart_df.head(5)
age sex cp trestbps chol fbs restecg thalach exang oldpeak slope ca thal target
0 63 1 3 145 233 1 0 150 0 2.3 0 0 1 1
1 37 1 2 130 250 0 1 187 0 3.5 0 0 2 1
2 41 0 1 130 204 0 0 172 0 1.4 2 0 2 1
3 56 1 1 120 236 0 1 178 0 0.8 2 0 2 1
4 57 0 0 120 354 0 1 163 1 0.6 2 0 2 1

Preprocessing

# remove target feature from training set
X_heart = heart_df.drop('target', axis=1)
y_heart = heart_df['target']
X_train_heart, X_test_heart, y_train_heart, y_test_heart = train_test_split(
    X_heart,
    y_heart,
    test_size=0.2,
    random_state=0
)

Model Fitting

forest_classifier = RandomForestClassifier(n_estimators=10, criterion='entropy')

forest_classifier.fit(X_train_heart, y_train_heart)

Evaluation

y_pred_heart = forest_classifier.predict(X_test_heart)
conf_mtx_heart = confusion_matrix(y_test_heart, y_pred_heart)
conf_mtx_heart

# array([[24,  3],
#        [ 5, 29]])
conf_mtx_heart_plot = ConfusionMatrixDisplay(
    confusion_matrix=conf_mtx_heart,
    display_labels=[False,True]
)

conf_mtx_heart_plot.plot()
plt.show()

scikit-learn - Machine Learning in Python

report_heart = classification_report(
    y_test_heart, y_pred_heart
)
print(report_heart)
precision recall f1-score support
0 0.83 0.89 0.86 27
1 0.91 0.85 0.88 34
accuracy 0.87 61
macro avg 0.87 0.87 0.87 61
weighted avg 0.87 0.87 0.87 61

Random Forest Hyperparameter Tuning

Testing Hyperparameters

rdnfor_classifier = RandomForestClassifier(
    n_estimators=2,
    min_samples_split=2,
    min_samples_leaf=1,
    criterion='entropy'
)
rdnfor_classifier.fit(X_train_heart, y_train_heart)
rdnfor_pred = rdnfor_classifier.predict(X_test_heart)
print('Accuracy Score: ', accuracy_score(y_test_heart, rdnfor_pred).round(4)*100, '%')

# Accuracy Score:  73.77 %

Grid-Search Cross-Validation

Try a set of values for selected Hyperparameter to find the optimal configuration.

# candidate hyperparameter values for the exhaustive grid search
param_grid = {
    'n_estimators': [5, 25, 50, 75,100, 125],
    # NOTE(review): current scikit-learn rejects min_samples_split=1
    # (an integer value must be >= 2) — confirm against the installed version
    'min_samples_split': [1,2,3],
    'min_samples_leaf': [1,2,3],
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features' : ['sqrt', 'log2']
}

# cross-validated search (default 5-fold) over every parameter combination
grid_search = GridSearchCV(
    estimator = rdnfor_classifier,
    param_grid = param_grid
)

grid_search.fit(X_train_heart, y_train_heart)
print('Best Parameter: ', grid_search.best_params_)
# Best Parameter:  {
# 'criterion': 'entropy',
# 'max_features': 'sqrt',
# 'min_samples_leaf': 2,
# 'min_samples_split': 1,
# 'n_estimators': 25
# }
# retrain with the best parameters reported by the grid search above
# NOTE(review): min_samples_split=1 is rejected by current scikit-learn
# (integer values must be >= 2) — confirm against the installed version
rdnfor_classifier_optimized = RandomForestClassifier(
    n_estimators=25,
    min_samples_split=1,
    min_samples_leaf=2,
    criterion='entropy',
    max_features='sqrt'
)

rdnfor_classifier_optimized.fit(X_train_heart, y_train_heart)
rdnfor_pred_optimized = rdnfor_classifier_optimized.predict(X_test_heart)
print('Accuracy Score: ', accuracy_score(y_test_heart, rdnfor_pred_optimized).round(4)*100, '%')

# Accuracy Score:  85.25 %

Random Forest Classifier 1 - Penguins

!wget https://github.com/remijul/dataset/raw/master/penguins_size.csv -P datasets
peng_df = pd.read_csv('datasets/penguins_size.csv')
peng_df = peng_df.dropna()
peng_df.head(5)
species island culmen_length_mm culmen_depth_mm flipper_length_mm body_mass_g sex
0 Adelie Torgersen 39.1 18.7 181.0 3750.0 MALE
1 Adelie Torgersen 39.5 17.4 186.0 3800.0 FEMALE
2 Adelie Torgersen 40.3 18.0 195.0 3250.0 FEMALE
4 Adelie Torgersen 36.7 19.3 193.0 3450.0 FEMALE
5 Adelie Torgersen 39.3 20.6 190.0 3650.0 MALE
# drop labels and encode string values
X_peng = pd.get_dummies(peng_df.drop('species', axis=1),drop_first=True)
y_peng = peng_df['species']
# train/test split
X_peng_train, X_peng_test, y_peng_train, y_peng_test = train_test_split(
    X_peng,
    y_peng,
    test_size=0.3,
    random_state=42
)
# creating the model
rfc_peng = RandomForestClassifier(
    n_estimators=10,
    max_features='sqrt',
    random_state=42
)
# model training and running predictions
rfc_peng.fit(X_peng_train, y_peng_train)
peng_pred = rfc_peng.predict(X_peng_test)
print('Accuracy Score: ',accuracy_score(y_peng_test, peng_pred, normalize=True).round(4)*100, '%')
# Accuracy Score:  98.02 %

Feature Importance

# feature importance for classification
peng_index = ['importance']
peng_data_columns = pd.Series(X_peng.columns)
peng_importance_array = rfc_peng.feature_importances_
peng_importance_df = pd.DataFrame(peng_importance_array, peng_data_columns, peng_index)
peng_importance_df
importance
culmen_length_mm 0.288928
culmen_depth_mm 0.111021
flipper_length_mm 0.357994
body_mass_g 0.025477
island_Dream 0.178498
island_Torgersen 0.031042
sex_FEMALE 0.004716
sex_MALE 0.002324
peng_importance_df.sort_values(
    by='importance',
    ascending=False
).plot(
    kind='barh',
    title='Feature Importance for Species Classification',
    figsize=(12,4)
)

scikit-learn - Machine Learning in Python

Model Evaluation

report_peng = classification_report(y_peng_test, peng_pred)
print(report_peng)
precision recall f1-score support
Adelie 0.98 0.98 0.98 49
Chinstrap 0.94 0.94 0.94 18
Gentoo 1.00 1.00 1.00 34
accuracy 0.98 101
macro avg 0.97 0.97 0.97 101
weighted avg 0.98 0.98 0.98 101
conf_mtx_peng = confusion_matrix(y_peng_test, peng_pred)

conf_mtx_peng_plot = ConfusionMatrixDisplay(
    confusion_matrix=conf_mtx_peng
)

conf_mtx_peng_plot.plot(cmap='plasma')

scikit-learn - Machine Learning in Python

Random Forest Classifier - Banknote Authentication

!wget https://github.com/jbrownlee/Datasets/raw/master/banknote_authentication.csv -P datasets
# read the file under the name wget actually saved it as (the original read
# 'data-banknote-authentication.csv', which does not match the downloaded
# 'banknote_authentication.csv' and would raise FileNotFoundError)
money_df = pd.read_csv('datasets/banknote_authentication.csv')
money_df.head(5)
Variance_Wavelet Skewness_Wavelet Curtosis_Wavelet Image_Entropy Class
0 3.62160 8.6661 -2.8073 -0.44699 0
1 4.54590 8.1674 -2.4586 -1.46210 0
2 3.86600 -2.6383 1.9242 0.10645 0
3 3.45660 9.5228 -4.0112 -3.59440 0
4 0.32924 -4.4552 4.5718 -0.98880 0
sns.pairplot(money_df, hue='Class', palette='winter')

scikit-learn - Machine Learning in Python

# drop label for training
X_money = money_df.drop('Class', axis=1)
y_money = money_df['Class']
print(X_money.shape, y_money.shape)
X_money_train, X_money_test, y_money_train, y_money_test = train_test_split(
    X_money,
    y_money,
    test_size=0.15,
    random_state=42
)

Grid Search for Hyperparameters

rfc_money_base = RandomForestClassifier(oob_score=True)
param_grid = {
    'n_estimators': [64, 96, 128, 160, 192],
    'max_features': [2,3,4],
    'bootstrap': [True, False]
}
grid_money = GridSearchCV(rfc_money_base, param_grid) 
grid_money.fit(X_money_train, y_money_train)
grid_money.best_params_
# {'bootstrap': True, 'max_features': 2, 'n_estimators': 96}

Model Training and Evaluation

rfc_money = RandomForestClassifier(
    bootstrap=True,
    max_features=2,
    n_estimators=96,
    oob_score=True
)
rfc_money.fit(X_money_train, y_money_train)
print('Out-of-Bag Score: ', rfc_money.oob_score_.round(4)*100, '%')
# Out-of-Bag Score:  99.14 %
money_pred = rfc_money.predict(X_money_test)
money_report = classification_report(y_money_test, money_pred)
print(money_report)
precision recall f1-score support
0 0.99 1.00 1.00 111
1 1.00 0.99 0.99 95
accuracy 1.00 206
macro avg 1.00 0.99 1.00 206
weighted avg 1.00 1.00 1.00 206
conf_mtx_money = confusion_matrix(y_money_test, money_pred)

conf_mtx_money_plot = ConfusionMatrixDisplay(
    confusion_matrix=conf_mtx_money
)

conf_mtx_money_plot.plot(cmap='plasma')

scikit-learn - Machine Learning in Python

Optimizations

# verify number of estimators found by grid search
# (re-fit the forest for every n and track test error / miss counts)
errors = []
missclassifications = []

for n in range(1,200):
    rfc = RandomForestClassifier(n_estimators=n, max_features=2)
    rfc.fit(X_money_train, y_money_train)
    preds = rfc.predict(X_money_test)
    
    # error rate = 1 - accuracy on the held-out test split
    err = 1 - accuracy_score(y_money_test, preds)
    errors.append(err)
    
    # absolute number of wrongly classified test samples
    n_missed = np.sum(preds != y_money_test)
    missclassifications.append(n_missed)
plt.figure(figsize=(12,4))
plt.title('Errors as a Function of n_estimators')
plt.xlabel('Estimators')
plt.ylabel('Error Score')
plt.plot(range(1,200), errors)
# there is no notable improvement above ~10 estimators

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12,4))
plt.title('Misclassifications as a Function of n_estimators')
plt.xlabel('Estimators')
plt.ylabel('Misclassifications')
plt.plot(range(1,200), missclassifications)
# and the same for misclassifications

scikit-learn - Machine Learning in Python

Random Forest Regressor

Comparing different regression models to a random forest regression model.

# dataset
!wget https://github.com/vineetsingh028/Rock_Density_Prediction/raw/master/rock_density_xray.csv -P datasets
rock_df = pd.read_csv('datasets/rock_density_xray.csv')
rock_df.columns = ['Signal', 'Density']
rock_df.head(5)
Signal Density
0 72.945124 2.456548
1 14.229877 2.601719
2 36.597334 1.967004
3 9.578899 2.300439
4 21.765897 2.452374
plt.figure(figsize=(12,5))
plt.title('X-Ray Bounce Signal Strength vs Rock Density')
sns.scatterplot(data=rock_df, x='Signal', y='Density')
# the signal vs density plot follows a sine wave - spoiler alert: simpler algorithm
# will fail trying to fit this dataset...

scikit-learn - Machine Learning in Python

# train-test split
X_rock = rock_df['Signal'].values.reshape(-1,1)
y_rock = rock_df['Density']

X_rock_train, X_rock_test, y_rock_train, y_rock_test = train_test_split(
    X_rock,
    y_rock,
    test_size=0.1,
    random_state=42
)
# normalization
scaler = StandardScaler()
X_rock_train_scaled = scaler.fit_transform(X_rock_train)
X_rock_test_scaled = scaler.transform(X_rock_test)

vs Linear Regression

lr_rock = LinearRegression()
lr_rock.fit(X_rock_train_scaled, y_rock_train)
lr_rock_preds = lr_rock.predict(X_rock_test_scaled)

mae = mean_absolute_error(y_rock_test, lr_rock_preds)
rmse = np.sqrt(mean_squared_error(y_rock_test, lr_rock_preds))
mean_abs = y_rock_test.mean()
avg_error = mae * 100 / mean_abs

print('MAE: ', mae.round(2), 'RMSE: ', rmse.round(2), 'Relative Avg. Error: ', avg_error.round(2), '%')
# MAE:  0.24 RMSE:  0.3 Relative Avg. Error:  10.93 %
# visualize predictions
plt.figure(figsize=(12,5))
plt.plot(X_rock_test, lr_rock_preds, c='mediumspringgreen')
sns.scatterplot(data=rock_df, x='Signal', y='Density', c='dodgerblue')
plt.title('Linear Regression Predictions')
plt.show()
# the returned error appears small because the linear regression returns an average
# but it cannot fit a linear line to the contours of the underlying sine wave function

scikit-learn - Machine Learning in Python

vs Polynomial Regression

# helper function
def run_model(model, X_train, y_train, X_test, y_test, df):
    """Fit *model*, report MAE / RMSE / relative average error on the test
    split, and plot its prediction curve over the 0-100 signal range on top
    of the raw Signal-vs-Density scatter in *df*."""
    # FIT MODEL
    model.fit(X_train, y_train)

    # EVALUATE on the held-out split
    predictions = model.predict(X_test)
    abs_err = mean_absolute_error(y_test, predictions)
    root_err = np.sqrt(mean_squared_error(y_test, predictions))
    rel_err = abs_err * 100 / y_test.mean()
    print('MAE: ', abs_err.round(2), 'RMSE: ', root_err.round(2), 'Relative Avg. Error: ', rel_err.round(2), '%')

    # PLOT RESULTS: prediction curve sampled across the full signal range
    grid = np.arange(0, 100)
    curve = model.predict(grid.reshape(-1, 1))

    plt.figure(figsize=(12, 5))
    sns.scatterplot(data=df, x='Signal', y='Density', c='dodgerblue')
    plt.plot(grid, curve, c='mediumspringgreen')
    plt.title('Regression Predictions')
    plt.show()
# test helper on previous linear regression
run_model(
    model=lr_rock,
    X_train=X_rock_train,
    y_train=y_rock_train,
    X_test=X_rock_test,
    y_test=y_rock_test,
    df=rock_df
)

MAE: 0.24 RMSE: 0.3 Relative Avg. Error: 10.93 %

scikit-learn - Machine Learning in Python

# build polynomial model
pipe_poly = make_pipeline(
    PolynomialFeatures(degree=6),
    LinearRegression()
)
# run model
run_model(
    model=pipe_poly,
    X_train=X_rock_train,
    y_train=y_rock_train,
    X_test=X_rock_test,
    y_test=y_rock_test,
    df=rock_df
)
# with a HARD LIMIT of 0-100 for the xray signal a 6th degree polynomial is a good fit

MAE: 0.13 RMSE: 0.14 Relative Avg. Error: 5.7 %

scikit-learn - Machine Learning in Python

vs KNeighbors Regression

# build polynomial model
k_values=[1,5,10,25]

for k in k_values:
    model = KNeighborsRegressor(n_neighbors=k)
    print(model)
    
    # run model
    run_model(
        model,
        X_train=X_rock_train,
        y_train=y_rock_train,
        X_test=X_rock_test,
        y_test=y_rock_test,
        df=rock_df
    )

KNeighborsRegressor(n_neighbors=1)

MAE: 0.12 RMSE: 0.17 Relative Avg. Error: 5.47 %

scikit-learn - Machine Learning in Python

KNeighborsRegressor()

MAE: 0.13 RMSE: 0.15 Relative Avg. Error: 5.9 %

scikit-learn - Machine Learning in Python

KNeighborsRegressor(n_neighbors=10)

MAE: 0.12 RMSE: 0.14 Relative Avg. Error: 5.44 %

scikit-learn - Machine Learning in Python

KNeighborsRegressor(n_neighbors=25)

MAE: 0.14 RMSE: 0.16 Relative Avg. Error: 6.18 %

scikit-learn - Machine Learning in Python

vs Decision Tree Regression

tree_model = DecisionTreeRegressor()

# run model
run_model(
    model=tree_model,
    X_train=X_rock_train,
    y_train=y_rock_train,
    X_test=X_rock_test,
    y_test=y_rock_test,
    df=rock_df
)

MAE: 0.12 RMSE: 0.17 Relative Avg. Error: 5.47 %

scikit-learn - Machine Learning in Python

vs Support Vector Regression

svr_rock = svm.SVR()

param_grid = {
    'C': [0.01,0.1,1,5,10,100, 1000],
    'gamma': ['auto', 'scale']
}

rock_grid = GridSearchCV(svr_rock, param_grid)
# run model
run_model(
    model=rock_grid,
    X_train=X_rock_train,
    y_train=y_rock_train,
    X_test=X_rock_test,
    y_test=y_rock_test,
    df=rock_df
)

MAE: 0.13 RMSE: 0.14 Relative Avg. Error: 5.75 %

scikit-learn - Machine Learning in Python

vs Gradient Boosting Regression

gbr_rock = GradientBoostingRegressor()

# run model
run_model(
    model=gbr_rock,
    X_train=X_rock_train,
    y_train=y_rock_train,
    X_test=X_rock_test,
    y_test=y_rock_test,
    df=rock_df
)

MAE: 0.13 RMSE: 0.15 Relative Avg. Error: 5.76 %

scikit-learn - Machine Learning in Python

vs Ada Boosting Regression

abr_rock = AdaBoostRegressor()

# run model
run_model(
    model=abr_rock,
    X_train=X_rock_train,
    y_train=y_rock_train,
    X_test=X_rock_test,
    y_test=y_rock_test,
    df=rock_df
)

MAE: 0.13 RMSE: 0.14 Relative Avg. Error: 5.67 %

scikit-learn - Machine Learning in Python

Finally, Random Forest Regression

rfr_rock = RandomForestRegressor(n_estimators=10)

# run model
run_model(
    model=rfr_rock,
    X_train=X_rock_train,
    y_train=y_rock_train,
    X_test=X_rock_test,
    y_test=y_rock_test,
    df=rock_df
)

MAE: 0.11 RMSE: 0.14 Relative Avg. Error: 5.1 %

scikit-learn - Machine Learning in Python

Supervised Learning - SVC Model

Support Vector Machines (SVMs) are a set of supervised learning methods used for classification, regression and outliers detection.

  • Effective in high dimensional spaces.
  • Still effective in cases where number of dimensions is greater than the number of samples.

Dataset

Measurements of geometrical properties of kernels belonging to three different varieties of wheat:

  • A: Area,
  • P: Perimeter,
  • C = 4piA/P^2: Compactness,
  • LK: Length of kernel,
  • WK: Width of kernel,
  • A_Coef: Asymmetry coefficient
  • LKG: Length of kernel groove.
!wget https://raw.githubusercontent.com/prasertcbs/basic-dataset/master/Seed_Data.csv -P datasets
wheat_df = pd.read_csv('datasets/Seed_Data.csv')
wheat_df.head(5)
A P C LK WK A_Coef LKG target
0 15.26 14.84 0.8710 5.763 3.312 2.221 5.220 0
1 14.88 14.57 0.8811 5.554 3.333 1.018 4.956 0
2 14.29 14.09 0.9050 5.291 3.337 2.699 4.825 0
3 13.84 13.94 0.8955 5.324 3.379 2.259 4.805 0
4 16.14 14.99 0.9034 5.658 3.562 1.355 5.175 0
wheat_df.info()

# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 210 entries, 0 to 209
# Data columns (total 8 columns):
#  #   Column  Non-Null Count  Dtype  
# ---  ------  --------------  -----  
#  0   A       210 non-null    float64
#  1   P       210 non-null    float64
#  2   C       210 non-null    float64
#  3   LK      210 non-null    float64
#  4   WK      210 non-null    float64
#  5   A_Coef  210 non-null    float64
#  6   LKG     210 non-null    float64
#  7   target  210 non-null    int64  
# dtypes: float64(7), int64(1)
# memory usage: 13.2 KB

Preprocessing

# remove target feature from training set
X_wheat = wheat_df.drop('target', axis=1)
y_wheat = wheat_df['target']

print(X_wheat.shape, y_wheat.shape)
# (210, 7) (210,)
# train/test split
X_train_wheat, X_test_wheat, y_train_wheat, y_test_wheat = train_test_split(
    X_wheat,
    y_wheat,
    test_size=0.2,
    random_state=42
)
# normalization — fit the scaler on the training split only and reuse the
# fitted statistics for the test split (the original re-fit on the test
# set, which leaks test-set statistics and skews the evaluation)
sc_wheat = StandardScaler()
X_train_wheat=sc_wheat.fit_transform(X_train_wheat)
X_test_wheat=sc_wheat.transform(X_test_wheat)

Model Training

# SVM classifier fitting
clf_wheat = svm.SVC()
clf_wheat.fit(X_train_wheat, y_train_wheat)

Model Evaluation

# Predictions
y_wheat_pred = clf_wheat.predict(X_test_wheat)
print(
    'Accuracy Score: ',
    accuracy_score(y_test_wheat, y_wheat_pred, normalize=True).round(4)*100, '%'
)
# Accuracy Score:  90.48 %
report_wheat = classification_report(
    y_test_wheat, y_wheat_pred
)
print(report_wheat)
precision recall f1-score support
0 0.82 0.82 0.82 11
1 1.00 0.93 0.96 14
2 0.89 0.94 0.91 17
accuracy 0.90 42
macro avg 0.90 0.90 0.90 42
weighted avg 0.91 0.90 0.91 42
conf_mtx_wheat = confusion_matrix(y_test_wheat, y_wheat_pred)
conf_mtx_wheat

# array([[ 9,  0,  2],
#        [ 1, 13,  0],
#        [ 1,  0, 16]])
conf_mtx_wheat_plot = ConfusionMatrixDisplay(
    confusion_matrix=conf_mtx_wheat
)

conf_mtx_wheat_plot.plot()
plt.show()

scikit-learn - Machine Learning in Python

Margin Plots for Support Vector Classifier

# get dataset
!wget https://github.com/alpeshraj/mouse_viral_study/raw/main/mouse_viral_study.csv -P datasets
mice_df = pd.read_csv('datasets/mouse_viral_study.csv')
mice_df.head(5)
Med_1_mL Med_2_mL Virus Present
0 6.508231 8.582531 0
1 4.126116 3.073459 1
2 6.427870 6.369758 0
3 3.672953 4.905215 1
4 1.580321 2.440562 1
sns.scatterplot(data=mice_df, x='Med_1_mL',y='Med_2_mL',hue='Virus Present', palette='winter')

scikit-learn - Machine Learning in Python

# visualizing a hyperplane to separate the two features
sns.scatterplot(data=mice_df, x='Med_1_mL',y='Med_2_mL',hue='Virus Present', palette='winter')

x = np.linspace(0,10,100)
m = -1
b = 11
y = m*x + b

plt.plot(x,y,c='fuchsia')

scikit-learn - Machine Learning in Python

SVC with a Linear Kernel

# using a support vector classifier to maximize the margin between both classes

y_vir = mice_df['Virus Present']
X_vir = mice_df.drop('Virus Present',axis=1)

# kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}
# the smaller the C value the more feature vectors will be inside the margin
model_vir = svm.SVC(kernel='linear', C=1000)

model_vir.fit(X_vir, y_vir)
# import helper function
from helper.svm_margin_plot import plot_svm_boundary
plot_svm_boundary(model_vir, X_vir, y_vir)

scikit-learn - Machine Learning in Python

# the smaller the C value the more feature vectors will be inside the margin
model_vir_low_reg = svm.SVC(kernel='linear', C=0.005)
model_vir_low_reg.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_low_reg, X_vir, y_vir)

scikit-learn - Machine Learning in Python

SVC with a Radial Basis Function Kernel

model_vir_rbf = svm.SVC(kernel='rbf', C=1)
model_vir_rbf.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_rbf, X_vir, y_vir)

scikit-learn - Machine Learning in Python

# # gamma : {'scale', 'auto'} or float, default='scale'
# - if ``gamma='scale'`` (default) is passed then it uses 1 / (n_features * X.var()) as value of gamma,
# - if 'auto', uses 1 / n_features
# - if float, must be non-negative.
model_vir_rbf_auto_gamma = svm.SVC(kernel='rbf', C=1, gamma='auto')
model_vir_rbf_auto_gamma.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_rbf_auto_gamma, X_vir, y_vir)

scikit-learn - Machine Learning in Python

SVC with a Sigmoid Kernel

model_vir_sigmoid = svm.SVC(kernel='sigmoid', gamma='scale')
model_vir_sigmoid.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_sigmoid, X_vir, y_vir)

scikit-learn - Machine Learning in Python

SVC with a Polynomial Kernel

model_vir_poly = svm.SVC(kernel='poly', C=1, degree=2)
model_vir_poly.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_poly, X_vir, y_vir)

scikit-learn - Machine Learning in Python

Grid Search for Support Vector Classifier

svm_base_model = svm.SVC()

param_grid = {
    'C':[0.01, 0.1, 1],
    'kernel': ['linear', 'rbf']
}
grid = GridSearchCV(svm_base_model, param_grid) 
grid.fit(X_vir, y_vir)
grid.best_params_
# {'C': 0.01, 'kernel': 'linear'}

Support Vector Regression

# dataset
!wget https://github.com/fsdhakan/ML/raw/main/cement_slump.csv -P datasets
cement_df = pd.read_csv('datasets/cement_slump.csv')
cement_df.head(5)
Cement Slag Fly ash Water SP Coarse Aggr. Fine Aggr. SLUMP(cm) FLOW(cm) Compressive Strength (28-day)(Mpa)
0 273.0 82.0 105.0 210.0 9.0 904.0 680.0 23.0 62.0 34.99
1 163.0 149.0 191.0 180.0 12.0 843.0 746.0 0.0 20.0 41.14
2 162.0 148.0 191.0 179.0 16.0 840.0 743.0 1.0 20.0 41.81
3 162.0 148.0 190.0 179.0 19.0 838.0 741.0 3.0 21.5 42.08
4 154.0 112.0 144.0 220.0 10.0 923.0 658.0 20.0 64.0 26.82
plt.figure(figsize=(8,8))
sns.heatmap(cement_df.corr(), annot=True, cmap='viridis')

scikit-learn - Machine Learning in Python

# drop labels
X_cement = cement_df.drop('Compressive Strength (28-day)(Mpa)', axis=1)
y_cement = cement_df['Compressive Strength (28-day)(Mpa)']
# train/test split (indentation fixed — the original lines were indented
# at statement level and would have raised an IndentationError)
X_train_cement, X_test_cement, y_train_cement, y_test_cement = train_test_split(
    X_cement,
    y_cement,
    test_size=0.3,
    random_state=42
)
# normalize: fit on the training split only, then transform the test split
scaler = StandardScaler()
X_train_cement_scaled = scaler.fit_transform(X_train_cement)
X_test_cement_scaled = scaler.transform(X_test_cement)

Base Model Run

# baseline SVR with default hyperparameters
base_model_cement = svm.SVR()
base_model_cement.fit(X_train_cement_scaled, y_train_cement)

base_model_predictions = base_model_cement.predict(X_test_cement_scaled)
mae = mean_absolute_error(y_test_cement, base_model_predictions)
# RMSE is the square root of the MSE — the original printed the raw MSE
# (36.95) under the RMSE label; the other evaluations in this sheet
# (e.g. the rock-density models) already apply np.sqrt
rmse = np.sqrt(mean_squared_error(y_test_cement, base_model_predictions))
mean_abs = y_test_cement.mean()
avg_error = mae * 100 / mean_abs

print('MAE: ', mae.round(2), 'RMSE: ', rmse.round(2), 'Relative Avg. Error: ', avg_error.round(2), '%')
MAE RMSE Relative Avg. Error
4.68 36.95 12.75 %

Grid Search for better Hyperparameter

# hyperparameter candidates for the SVR grid search
param_grid = {
    'C': [0.001,0.01,0.1,0.5,1],
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'],
    'degree': [2,3,4],
    'epsilon': [0,0.01,0.1,0.5,1,2]
}
cement_grid = GridSearchCV(base_model_cement, param_grid)
cement_grid.fit(X_train_cement_scaled, y_train_cement)
cement_grid.best_params_
# {'C': 1, 'degree': 2, 'epsilon': 2, 'gamma': 'scale', 'kernel': 'linear'}
cement_grid_predictions = cement_grid.predict(X_test_cement_scaled)
mae_grid = mean_absolute_error(y_test_cement, cement_grid_predictions)
# take the square root so the printed value actually is the RMSE
# (the original reported the raw MSE under the RMSE label)
rmse_grid = np.sqrt(mean_squared_error(y_test_cement, cement_grid_predictions))
mean_abs = y_test_cement.mean()
avg_error_grid = mae_grid * 100 / mean_abs

print('MAE: ', mae_grid.round(2), 'RMSE: ', rmse_grid.round(2), 'Relative Avg. Error: ', avg_error_grid.round(2), '%')
MAE RMSE Relative Avg. Error
1.85 5.2 5.05 %

Example Task - Wine Fraud

Data Exploration

# dataset
!wget https://github.com/CAPGAGA/Fraud-in-Wine/raw/main/wine_fraud.csv -P datasets
wine_df = pd.read_csv('datasets/wine_fraud.csv')
wine_df.head(5)
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality type
0 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 Legit red
1 7.8 0.88 0.00 2.6 0.098 25.0 67.0 0.9968 3.20 0.68 9.8 Legit red
2 7.8 0.76 0.04 2.3 0.092 15.0 54.0 0.9970 3.26 0.65 9.8 Legit red
3 11.2 0.28 0.56 1.9 0.075 17.0 60.0 0.9980 3.16 0.58 9.8 Legit red
4 7.4 0.70 0.00 1.9 0.076 11.0 34.0 0.9978 3.51 0.56 9.4 Legit red
wine_df.value_counts('quality')
quality
Legit 6251
Fraud 246
dtype: int64
wine_df['quality'].value_counts().plot(
    kind='bar',
    figsize=(10,5), 
    title='Wine - Quality distribution')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10, 5))
plt.title('Wine - Quality distribution by Type')

sns.countplot(
    data=wine_df,
    x='quality',
    hue='type',
    palette='winter'
)

plt.savefig('assets/Scikit_Learn_22.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

wine_df_white = wine_df[wine_df['type'] == 'white']
wine_df_red = wine_df[wine_df['type'] == 'red']
# fraud percentage by wine type
# NOTE(review): [0]/[1] index the value_counts() result positionally
# (the index labels are the strings 'Legit'/'Fraud'), i.e. [0] is the most
# frequent class — assumed here to be 'Legit'; this silently breaks if
# 'Fraud' ever becomes the majority class. Confirm before reusing.
legit_white_wines = wine_df_white.value_counts('quality')[0]
fraud_white_wines = wine_df_white.value_counts('quality')[1]
white_fraud_percentage = fraud_white_wines * 100 / (legit_white_wines + fraud_white_wines)

legit_red_wines = wine_df_red.value_counts('quality')[0]
fraud_red_wines = wine_df_red.value_counts('quality')[1]
red_fraud_percentage = fraud_red_wines * 100 / (legit_red_wines + fraud_red_wines)

print(
    'Fraud Percentage: \nWhite Wines: ',
    white_fraud_percentage.round(2),
    '% \nRed Wines: ',
    red_fraud_percentage.round(2),
    '%'
)
Fraud Percentage:
White Wines: 3.74 %
Red Wines: 3.94 %
# make features numeric
# map the categorical labels onto 0/1 codes so they can take part in the
# correlation computations below ('Legit'/'red' -> 0, 'Fraud'/'white' -> 1)
feature_map = {
    'Legit': 0,
    'Fraud': 1,
    'red': 0,
    'white': 1
}

wine_df['quality_enc'] = wine_df['quality'].map(feature_map)
wine_df['type_enc'] = wine_df['type'].map(feature_map)
# show original and encoded columns side by side for a sanity check
wine_df[['quality', 'quality_enc', 'type', 'type_enc']]
quality quality_enc type type_enc
0 Legit 0 red 0
1 Legit 0 red 0
2 Legit 0 red 0
3 Legit 0 red 0
4 Legit 0 red 0
...
6492 Legit 0 white 1
6493 Legit 0 white 1
6494 Legit 0 white 1
6495 Legit 0 white 1
6496 Legit 0 white 1
6497 rows Ă— 4 columns
# find correlations
wine_df.corr(numeric_only=True)
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality_enc type_enc
fixed acidity 1.000000 0.219008 0.324436 -0.111981 0.298195 -0.282735 -0.329054 0.458910 -0.252700 0.299568 -0.095452 0.021794 -0.486740
volatile acidity 0.219008 1.000000 -0.377981 -0.196011 0.377124 -0.352557 -0.414476 0.271296 0.261454 0.225984 -0.037640 0.151228 -0.653036
citric acid 0.324436 -0.377981 1.000000 0.142451 0.038998 0.133126 0.195242 0.096154 -0.329808 0.056197 -0.010493 -0.061789 0.187397
residual sugar -0.111981 -0.196011 0.142451 1.000000 -0.128940 0.402871 0.495482 0.552517 -0.267320 -0.185927 -0.359415 -0.048756 0.348821
chlorides 0.298195 0.377124 0.038998 -0.128940 1.000000 -0.195045 -0.279630 0.362615 0.044708 0.395593 -0.256916 0.034499 -0.512678
free sulfur dioxide -0.282735 -0.352557 0.133126 0.402871 -0.195045 1.000000 0.720934 0.025717 -0.145854 -0.188457 -0.179838 -0.085204 0.471644
total sulfur dioxide -0.329054 -0.414476 0.195242 0.495482 -0.279630 0.720934 1.000000 0.032395 -0.238413 -0.275727 -0.265740 -0.035252 0.700357
density 0.458910 0.271296 0.096154 0.552517 0.362615 0.025717 0.032395 1.000000 0.011686 0.259478 -0.686745 0.016351 -0.390645
pH -0.252700 0.261454 -0.329808 -0.267320 0.044708 -0.145854 -0.238413 0.011686 1.000000 0.192123 0.121248 0.020107 -0.329129
sulphates 0.299568 0.225984 0.056197 -0.185927 0.395593 -0.188457 -0.275727 0.259478 0.192123 1.000000 -0.003029 -0.034046 -0.487218
alcohol -0.095452 -0.037640 -0.010493 -0.359415 -0.256916 -0.179838 -0.265740 -0.686745 0.121248 -0.003029 1.000000 -0.051141 0.032970
quality_enc 0.021794 0.151228 -0.061789 -0.048756 0.034499 -0.085204 -0.035252 0.016351 0.020107 -0.034046 -0.051141 1.000000 -0.004598
type_enc -0.486740 -0.653036 0.187397 0.348821 -0.512678 0.471644 0.700357 -0.390645 -0.329129 -0.487218 0.032970 -0.004598 1.000000
plt.figure(figsize=(12,8))
sns.heatmap(wine_df.corr(numeric_only=True), annot=True, cmap='viridis')

scikit-learn - Machine Learning in Python

# how does the quality correlate to measurements
wine_df.corr(numeric_only=True)['quality_enc']
Quality Correlation
fixed acidity 0.021794
volatile acidity 0.151228
citric acid -0.061789
residual sugar -0.048756
chlorides 0.034499
free sulfur dioxide -0.085204
total sulfur dioxide -0.035252
density 0.016351
pH 0.020107
sulphates -0.034046
alcohol -0.051141
quality_enc 1.000000
type_enc -0.004598
Name: quality_enc, dtype: float64
wine_df.corr(numeric_only=True)['quality_enc'][:-2].sort_values().plot(
    figsize=(12,5),
    kind='bar',
    title='Correlation of Measurements to Quality'
)

scikit-learn - Machine Learning in Python

Regression Model

# separate target + remove string values
X_wine = wine_df.drop(['quality_enc', 'quality', 'type'], axis=1)
y_wine = wine_df['quality']

print(X_wine.shape, y_wine.shape)
# train-test split
X_wine_train, X_wine_test, y_wine_train, y_wine_test = train_test_split(
    X_wine,
    y_wine,
    test_size=0.1,
    random_state=42
)
# normalization
# the scaler is fit on the training split only and re-used for the test
# split so no test statistics leak into training
scaler = StandardScaler()
X_wine_train_scaled = scaler.fit_transform(X_wine_train)
X_wine_test_scaled = scaler.transform(X_wine_test)
# create the SVC model using class_weight to balance out the
# dataset that is heavily leaning towards non-frauds
svc_wine_base = svm.SVC(
    kernel='rbf',
    class_weight='balanced'
)
# grid search over the regularization strength C and the kernel coefficient
param_grid = {
    'C': [0.5, 1, 1.5, 2, 2.5],
    'gamma' : ['scale', 'auto']
}

wine_grid = GridSearchCV(svc_wine_base, param_grid)
wine_grid.fit(X_wine_train_scaled, y_wine_train)
print('Best Params: ', wine_grid.best_params_)
# Best Params:  {'C': 2.5, 'gamma': 'auto'}
# the best C sits at the upper edge of the searched range -- a wider
# search is performed further down
y_wine_pred = wine_grid.predict(X_wine_test_scaled)
print(
    'Accuracy Score: ',
    accuracy_score(y_wine_test, y_wine_pred, normalize=True).round(4)*100, '%'
)
# Accuracy Score:  84.77 %
report_wine = classification_report(
    y_wine_test, y_wine_pred
)
print(report_wine)
precision recall f1-score support
Fraud 0.16 0.68 0.26 25
Legit 0.99 0.85 0.92 625
accuracy 0.85 650
macro avg 0.57 0.77 0.59 650
weighted avg 0.95 0.85 0.89 650
conf_mtx_wine = confusion_matrix(y_wine_test, y_wine_pred)
conf_mtx_wine

# array([[ 17,   8],
#        [ 91, 534]])
conf_mtx_wine_plot = ConfusionMatrixDisplay(
    confusion_matrix=conf_mtx_wine
)

conf_mtx_wine_plot.plot(cmap='plasma')

scikit-learn - Machine Learning in Python

# expand grid search to much larger regularization values
# (the first sweep peaked at its upper edge, C=2.5)
param_grid = {
    # fixed: 1050 was listed twice and 1150 was missing from the progression
    'C': [1000, 1050, 1100, 1150, 1200],
    'gamma' : ['scale', 'auto']
}

wine_grid = GridSearchCV(svc_wine_base, param_grid)
wine_grid.fit(X_wine_train_scaled, y_wine_train)
print('Best Params: ', wine_grid.best_params_)
# Best Params:  {'C': 1100, 'gamma': 'scale'}
y_wine_pred = wine_grid.predict(X_wine_test_scaled)
print('Accuracy Score: ',accuracy_score(y_wine_test, y_wine_pred, normalize=True).round(4)*100, '%')
# Accuracy Score:  94.31 %
report_wine = classification_report(y_wine_test, y_wine_pred)
print(report_wine)
conf_mtx_wine = confusion_matrix(y_wine_test, y_wine_pred)

conf_mtx_wine_plot = ConfusionMatrixDisplay(
    confusion_matrix=conf_mtx_wine
)

conf_mtx_wine_plot.plot(cmap='plasma')
precision recall f1-score support
Fraud 0.29 0.32 0.30 25
Legit 0.97 0.97 0.97 625
accuracy 0.85 650
macro avg 0.63 0.64 0.64 650
weighted avg 0.95 0.94 0.94 650

scikit-learn - Machine Learning in Python

Supervised Learning - Boosting Methods

# dataset - label mushrooms as poisonous or edible
!wget https://github.com/semnan-university-ai/Mushroom/raw/main/Mushroom.csv -P datasets

Dataset Exploration

# the wget above stores the file as `datasets/Mushroom.csv` (capital M);
# read the same path so this also works on case-sensitive file systems
shroom_df = pd.read_csv('datasets/Mushroom.csv')
shroom_df.head(5).transpose()

Mushroom Data Set

  1. cap-shape: bell = b, conical = c, convex = x, flat = f, knobbed = k, sunken = s
  2. cap-surface: fibrous = f, grooves = g, scaly = y, smooth = s
  3. cap-color: brown = n, buff = b, cinnamon = c, gray = g, green = r, pink = p, purple = u, red = e, white = w, yellow = y
  4. bruises?: bruises = t, no = f
  5. odor: almond = a, anise = l, creosote = c, fishy = y, foul = f, musty = m, none = n, pungent = p, spicy = s
  6. gill-attachment: attached = a, descending = d, free = f, notched = n
  7. gill-spacing: close = c, crowded = w, distant = d
  8. gill-size: broad = b, narrow = n
  9. gill-color: black = k, brown = n, buff = b, chocolate = h, gray = g, green = r, orange = o, pink = p, purple = u, red = e, white = w, yellow = y
  10. stalk-shape: enlarging = e, tapering = t
  11. stalk-root: bulbous = b, club = c, cup = u, equal = e, rhizomorphs = z, rooted = r, missing = ?
  12. stalk-surface-above-ring: fibrous = f, scaly = y, silky = k, smooth = s
  13. stalk-surface-below-ring: fibrous = f, scaly = y, silky = k, smooth = s
  14. stalk-color-above-ring: brown = n, buff = b, cinnamon = c, gray = g, orange = o, pink = p, red = e, white = w, yellow = y
  15. stalk-color-below-ring: brown = n, buff = b, cinnamon = c, gray = g, orange = o, pink = p, red = e, white = w, yellow = y
  16. veil-type: partial = p, universal = u
  17. veil-color: brown = n, orange = o, white = w, yellow = y
  18. ring-number: none = n, one = o, two = t
  19. ring-type: cobwebby = c, evanescent = e, flaring = f, large = l, none = n, pendant = p, sheathing = s, zone = z
  20. spore-print-color: black = k, brown = n, buff = b, chocolate = h, green = r, orange = o, purple = u, white = w, yellow = y
  21. population: abundant = a, clustered = c, numerous = n, scattered = s, several = v, solitary = `y
  22. habitat: grasses = g, leaves = l, meadows = m, paths = p, urban = u, waste = w, woods = d
0 1 2 3 4
class p e e p e
cap-shape x x b x x
cap-surface s s s y s
cap-color n y w w g
bruises t t t t f
odor p a l p n
gill-attachment f f f f f
gill-spacing c c c c w
gill-size n b b n b
gill-color k k n n k
stalk-shape e e e e t
stalk-root e c c e e
stalk-surface-above-ring s s s s s
stalk-surface-below-ring s s s s s
stalk-color-above-ring w w w w w
stalk-color-below-ring w w w w w
veil-type p p p p p
veil-color w w w w w
ring-number o o o o o
ring-type p p p p e
spore-print-color k n n k n
population s n n s a
habitat u g m u g
shroom_df.isnull().sum()
class 0
cap-shape 0
cap-surface 0
cap-color 0
bruises 0
odor 0
gill-attachment 0
gill-spacing 0
gill-size 0
gill-color 0
stalk-shape 0
stalk-root 0
stalk-surface-above-ring 0
stalk-surface-below-ring 0
stalk-color-above-ring 0
stalk-color-below-ring 0
veil-type 0
veil-color 0
ring-number 0
ring-type 0
spore-print-color 0
population 0
habitat 0
dtype: int64
feature_df = shroom_df.describe().transpose().reset_index(
    names=['feature']
).sort_values(
    'unique', ascending=False
)
feature count unique top freq
9 gill-color 8124 12 b 1728
3 cap-color 8124 10 n 2284
20 spore-print-color 8124 9 w 2388
5 odor 8124 9 n 3528
15 stalk-color-below-ring 8124 9 w 4384
14 stalk-color-above-ring 8124 9 w 4464
22 habitat 8124 7 d 3148
1 cap-shape 8124 6 x 3656
21 population 8124 6 v 4040
19 ring-type 8124 5 p 3968
11 stalk-root 8124 5 b 3776
12 stalk-surface-above-ring 8124 4 s 5176
13 stalk-surface-below-ring 8124 4 s 4936
17 veil-color 8124 4 w 7924
2 cap-surface 8124 4 y 3244
18 ring-number 8124 3 o 7488
10 stalk-shape 8124 2 t 4608
8 gill-size 8124 2 b 5612
7 gill-spacing 8124 2 c 6812
6 gill-attachment 8124 2 f 7914
4 bruises 8124 2 f 4748
0 class 8124 2 e 4208
16 veil-type 8124 1 p 8124
plt.figure(figsize=(12,8))
plt.title('Mushroom Features :: Number of unique Features')
sns.barplot(data=feature_df, y='feature', x='unique', orient='h', palette='summer_r')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10,4))
# fixed typo in the title: 'Editable' -> 'Edible'
plt.title('Mushroom Count :: Edible vs Poisonous')
sns.countplot(data=shroom_df, x='class', palette='seismic_r')

scikit-learn - Machine Learning in Python

Adaptive Boosting

# remove label class
X_shroom = shroom_df.drop('class', axis=1)
# make all values numeric
# one-hot encode every categorical column; drop_first removes the
# redundant (perfectly collinear) dummy per feature
X_shroom = pd.get_dummies(X_shroom, drop_first=True)

y_shroom = shroom_df['class']
# train/test split
X_shroom_train, X_shroom_test, y_shroom_train, y_shroom_test = train_test_split(
    X_shroom,
    y_shroom,
    test_size=0.15,
    random_state=42
)

Feature Exploration

# don't try fit a perfect model but only return
# the most important feature for classification
abc_shroom = AdaBoostClassifier(estimator=None, n_estimators=1)
abc_shroom.fit(X_shroom_train,y_shroom_train)
shroom_preds = abc_shroom.predict(X_shroom_test)

print('Accuracy Score: ',accuracy_score(y_shroom_test, shroom_preds, normalize=True).round(4)*100, '%')
# Accuracy Score:  88.35 %
report_shroom = classification_report(y_shroom_test, shroom_preds)
print(report_shroom)
precision recall f1-score support
e 0.97 0.80 0.88 637
p 0.82 0.97 0.89 582
accuracy 0.88 1219
macro avg 0.89 0.89 0.88 1219
weighted avg 0.90 0.88 0.88 1219
conf_mtx_shroom = confusion_matrix(y_shroom_test, shroom_preds)

conf_mtx_shroom_plot = ConfusionMatrixDisplay(
    confusion_matrix=conf_mtx_shroom
)

conf_mtx_shroom_plot.plot(cmap='winter_r')

scikit-learn - Machine Learning in Python

# the model was fit on a single feature and still resulted in a pretty good performance.
# Let's find out what feature was chosen for the classification.

shroom_index = ['importance']
shroom_data_columns = pd.Series(X_shroom.columns)
shroom_importance_array = abc_shroom.feature_importances_
shroom_importance_df = pd.DataFrame(shroom_importance_array, shroom_data_columns, shroom_index)
shroom_importance_df.value_counts()
importance count
0.0 94
1.0 1
dtype: int64
# plot a slice of the dataframe to find the feature
shroom_importance_df_sorted = shroom_importance_df.sort_values(
    by='importance',
    ascending=True
)

shroom_importance_df_sorted[-5:].plot(
    kind='barh',
    title='Feature Importance for Mushroom Classification',
    figsize=(8,4)
)

The most important feature (as determined by the model) is the odor - in this case an odor of none is the strongest single indicator for separating poisonous from edible mushrooms:

odor: almond = a, anise = l, creosote = c, fishy = y, foul = f, musty = m, none = n, pungent = p, spicy = s

scikit-learn - Machine Learning in Python

# the majority of poisonous mushrooms do have an odor
# making the lack of it a good indicator for an edible variety
plt.figure(figsize=(12,4))
plt.title('Mushroom Odor vs Class')
sns.countplot(data=shroom_df, x='odor', hue='class', palette='summer')

scikit-learn - Machine Learning in Python

Optimizing Hyperparameters

# find out how many of the 95 features you have
# to add to your model to get a better fit

# sweep n_estimators from 1 to 95 and record the test-set error for each fit
error_rates = []

for n_est in range(1, 96):
    clf = AdaBoostClassifier(n_estimators=n_est)
    clf.fit(X_shroom_train, y_shroom_train)
    sweep_preds = clf.predict(X_shroom_test)
    error_rates.append(1 - accuracy_score(y_shroom_test, sweep_preds))

x_range = range(1, 96)
plt.figure(figsize=(10, 4))
plt.title('Adaboost Error Rate vs n_estimators')
plt.xlabel('n_estimators')
plt.ylabel('Error Rate')
plt.xticks(np.arange(min(x_range), max(x_range) + 1, 3.0))
plt.plot(x_range, error_rates)

scikit-learn - Machine Learning in Python

# already after 16 estimators there is no
# visible improvement for the error rate
# estimator=None keeps the library's default base learner
abc_shroom2 = AdaBoostClassifier(estimator=None, n_estimators=16)
abc_shroom2.fit(X_shroom_train,y_shroom_train)

shroom_preds2 = abc_shroom2.predict(X_shroom_test)

print('Accuracy Score: ',accuracy_score(y_shroom_test, shroom_preds2, normalize=True).round(4)*100, '%')
# Accuracy Score:  99.92 %

report_shroom2 = classification_report(y_shroom_test, shroom_preds2)
print(report_shroom2)
precision recall f1-score support
e 1.00 1.00 1.00 637
p 1.00 1.00 1.00 582
accuracy 1.00 1219
macro avg 1.00 1.00 1.00 1219
weighted avg 1.00 1.00 1.00 1219
conf_mtx_shroom2 = confusion_matrix(y_shroom_test, shroom_preds2)

conf_mtx_shroom_plot2 = ConfusionMatrixDisplay(
    confusion_matrix=conf_mtx_shroom2
)

conf_mtx_shroom_plot2.plot(cmap='winter_r')

scikit-learn - Machine Learning in Python

shroom_index = ['importance']
shroom_data_columns = pd.Series(X_shroom.columns)
shroom_importance_array = abc_shroom2.feature_importances_
shroom_importance_df = pd.DataFrame(shroom_importance_array, shroom_data_columns, shroom_index)
shroom_importance_df.value_counts()

# there are 12 features now that are deemed important
importance count
0.0000 83
0.0625 9
0.1250 2
0.1875 1
dtype: int64
shroom_importance_df_sorted = shroom_importance_df.sort_values(
    by='importance',
    ascending=True
).tail(13)
importance
gill-size_n 0.1875
population_v 0.1250
odor_n 0.1250
odor_c 0.0625
stalk-shape_t 0.0625
spore-print-color_w 0.0625
population_c 0.0625
ring-type_p 0.0625
spore-print-color_r 0.0625
stalk-surface-above-ring_k 0.0625
gill-spacing_w 0.0625
odor_f 0.0625
stalk-color-below-ring_w 0.0000
plt.figure(figsize=(10,6))
plt.title('Features important to classify poisonous Mushrooms')

sns.barplot(
    data=shroom_importance_df_sorted.tail(13),
    y=shroom_importance_df_sorted.tail(13).index,
    x='importance',
    orient='h',
    palette='summer'
)

scikit-learn - Machine Learning in Python

Gradient Boosting

Gridsearch for best Hyperparameter

gb_shroom = GradientBoostingClassifier()
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.05,0.1,0.2],
    'max_depth': [2,3,4,5]
}
shroom_grid = GridSearchCV(gb_shroom, param_grid)
shroom_grid.fit(X_shroom_train, y_shroom_train)
shroom_grid.best_params_
# {'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 150}
shroom_grid_preds = shroom_grid.predict(X_shroom_test)

print('Accuracy Score: ',accuracy_score(y_shroom_test, shroom_grid_preds, normalize=True).round(4)*100, '%')
# Accuracy Score:  100.0 %

report_shroom_grid_preds = classification_report(y_shroom_test, shroom_grid_preds)
print(report_shroom_grid_preds)
precision recall f1-score support
e 1.00 1.00 1.00 637
p 1.00 1.00 1.00 582
accuracy 1.00 1219
macro avg 1.00 1.00 1.00 1219
weighted avg 1.00 1.00 1.00 1219
conf_mtx_shroom_grid = confusion_matrix(y_shroom_test, shroom_grid_preds)

conf_mtx_shroom_grid_plot = ConfusionMatrixDisplay(
    confusion_matrix=conf_mtx_shroom_grid
)

conf_mtx_shroom_grid_plot.plot(cmap='winter_r')

scikit-learn - Machine Learning in Python

Feature Importance

shroom_feature_importance = shroom_grid.best_estimator_.feature_importances_
feature_importance_df = pd.DataFrame(
    index = X_shroom.columns,
    data = shroom_feature_importance,
    columns = ['importance']
)

# kick all features that have zero importance and sort by importance
feature_importance_df = feature_importance_df[
    feature_importance_df['importance'] > 3e-03
].sort_values(
    by='importance',
    ascending=False
)
plt.figure(figsize=(10,6))
plt.title('Features important to classify poisonous Mushrooms')

sns.barplot(
    data=feature_importance_df,
    y=feature_importance_df.index,
    x='importance',
    orient='h',
    palette='summer'
)

scikit-learn - Machine Learning in Python

Supervised Learning - Naive Bayes NLP

Feature Extraction

text = [
    'This is a dataset for binary sentiment classification',
    'containing substantially more data than previous benchmark datasets',
    'We provide a set of 25,000 highly polar movie reviews for training',
    'And 25,000 for testing',
    'There is additional unlabeled data for use as well',
    'Raw text and already processed bag of words formats are provided'
]

CountVectorizer & TfidfTransformer

cv = CountVectorizer(stop_words='english')
cv_sparse_matrix = cv.fit_transform(text)
# <6x30 sparse matrix of type '<class 'numpy.int64'>'
# 	with 33 stored elements in Compressed Sparse Row format>
print(cv_sparse_matrix.todense())
# [[0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
#  [0 0 0 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0]
#  [1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0 0]
#  [1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0]
#  [0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0]
#  [0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 1 0 0 0 0 0 1 0 0 0 1]]
print(cv.vocabulary_)
# {'dataset': 9, 'binary': 5, 'sentiment': 21, 'classification': 6, 'containing': 7, 'substantially': 23, 'data': 8, 'previous': 15, 'benchmark': 4, 'datasets': 10, 'provide': 17, 'set': 22, '25': 1, '000': 0, 'highly': 12, 'polar': 14, 'movie': 13, 'reviews': 20, 'training': 26, 'testing': 24, 'additional': 2, 'unlabeled': 27, 'use': 28, 'raw': 19, 'text': 25, 'processed': 16, 'bag': 3, 'words': 29, 'formats': 11, 'provided': 18}
tfidf_trans = TfidfTransformer()
tfidf_trans_results = tfidf_trans.fit_transform(cv_sparse_matrix)
print(tfidf_trans_results.todense())
# [[0.         0.         0.         0.         0.         0.5
#   0.5        0.         0.         0.5        0.         0.
#   0.         0.         0.         0.         0.         0.
#   0.         0.         0.         0.5        0.         0.
#   0.         0.         0.         0.         0.         0.        ]
#  [0.         0.         0.         0.         0.4198708  0.
#   0.         0.4198708  0.34430007 0.         0.4198708  0.
#   0.         0.         0.         0.4198708  0.         0.
#   0.         0.         0.         0.         0.         0.4198708
#   0.         0.         0.         0.         0.         0.        ]
#  [0.28386526 0.28386526 0.         0.         0.         0.
#   0.         0.         0.         0.         0.         0.
#   0.3461711  0.3461711  0.3461711  0.         0.         0.3461711
#   0.         0.         0.3461711  0.         0.3461711  0.
#   0.         0.         0.3461711  0.         0.         0.        ]
#  [0.5355058  0.5355058  0.         0.         0.         0.
#   0.         0.         0.         0.         0.         0.
#   0.         0.         0.         0.         0.         0.
#   0.         0.         0.         0.         0.         0.
#   0.65304446 0.         0.         0.         0.         0.        ]
#  [0.         0.         0.52182349 0.         0.         0.
#   0.         0.         0.42790272 0.         0.         0.
#   0.         0.         0.         0.         0.         0.
#   0.         0.         0.         0.         0.         0.
#   0.         0.         0.         0.52182349 0.52182349 0.        ]
#  [0.         0.         0.         0.37796447 0.         0.
#   0.         0.         0.         0.         0.         0.37796447
#   0.         0.         0.         0.         0.37796447 0.
#   0.37796447 0.37796447 0.         0.         0.         0.
#   0.         0.37796447 0.         0.         0.         0.37796447]]

TfidfVectorizer

tfidf_vec = TfidfVectorizer(
    lowercase=True,
    analyzer='word',
    stop_words='english'
)

tfidf_vec_results = tfidf_vec.fit_transform(text)
# <6x30 sparse matrix of type '<class 'numpy.float64'>'
# 	with 33 stored elements in Compressed Sparse Row format>

# `==` on two sparse matrices returns an element-wise sparse boolean
# matrix, not a single bool -- compare by counting differing entries
print((tfidf_trans_results != tfidf_vec_results).nnz == 0)
# True

Dataset Exploration

!wget https://raw.githubusercontent.com/kunal-lalwani/Twitter-US-Airlines-Sentiment-Analysis/master/Tweets.csv -P datasets
tweet_df = pd.read_csv('datasets/Tweets.csv')
tweet_df.head(3).transpose()
0 1 2
tweet_id 570306133677760513 570301130888122368 570301083672813571
airline_sentiment neutral positive neutral
airline_sentiment_confidence 1.0 0.3486 0.6837
negativereason NaN NaN NaN
negativereason_confidence NaN 0.0 NaN
airline Virgin America Virgin America Virgin America
airline_sentiment_gold NaN NaN NaN
name cairdin jnardino yvonnalynn
negativereason_gold NaN NaN NaN
retweet_count 0 0 0
text @VirginAmerica What @dhepburn said. @VirginAmerica plus you've added commercials t... @VirginAmerica I didn't today... Must mean I n...
tweet_coord NaN NaN NaN
tweet_created 2015-02-24 11:35:52 -0800 2015-02-24 11:15:59 -0800 2015-02-24 11:15:48 -0800
tweet_location NaN NaN Lets Play
user_timezone Eastern Time (US & Canada) Pacific Time (US & Canada) Central Time (US & Canada)
plt.figure(figsize=(12,5))
plt.title('Tweet Sentiment Classification by Airline')
sns.countplot(
    data=tweet_df,
    x='airline',
    hue='airline_sentiment',
    palette='cool'
)

plt.savefig('assets/Scikit_Learn_56.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12,6))
plt.title('Tweet Sentiment Classification with negative Reason')
sns.countplot(
    data=tweet_df,
    x='airline',
    hue='negativereason',
    palette='cool'
)

plt.savefig('assets/Scikit_Learn_57.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Data Preprocessing

tweet_data = tweet_df[['airline_sentiment', 'text']]
X_tweet = tweet_data['text']
y_tweet = tweet_data['airline_sentiment']
# train/ test split
X_tweet_train, X_tweet_test, y_tweet_train, y_tweet_test = train_test_split(
    X_tweet,
    y_tweet,
    test_size=0.2,
    random_state=42
)

TFIDF Vectorizer

tfidf_tweet_vec = TfidfVectorizer(
    lowercase=True,
    analyzer='word',
    stop_words='english'
)

X_tweet_tfidf_train = tfidf_tweet_vec.fit_transform(X_tweet_train)
# <11712x12987 sparse matrix of type '<class 'numpy.float64'>'
# 	with 106745 stored elements in Compressed Sparse Row format>
X_tweet_tfidf_test = tfidf_tweet_vec.transform(X_tweet_test)

Model Comparison

# report helper function
def report(model):
    """Print a classification report and plot the confusion matrix for
    *model*, evaluated on the held-out tweet test split."""
    y_hat = model.predict(X_tweet_tfidf_test)

    print(classification_report(y_tweet_test, y_hat))

    matrix = confusion_matrix(y_tweet_test, y_hat)
    display = ConfusionMatrixDisplay(confusion_matrix=matrix)
    display.plot(cmap='plasma')
logreg_tweet = LogisticRegression(max_iter=1000)
logreg_tweet.fit(X_tweet_tfidf_train, y_tweet_train)
report(logreg_tweet)
precision recall f1-score support
negative 0.82 0.93 0.88 1889
neutral 0.66 0.48 0.56 580
positive 0.79 0.63 0.70 459
accuracy 0.80 2928
macro avg 0.76 0.68 0.71 2928
weighted avg 0.79 0.80 0.78 2928

scikit-learn - Machine Learning in Python

rbf_svc_tweet = svm.SVC()
rbf_svc_tweet.fit(X_tweet_tfidf_train, y_tweet_train)
report(rbf_svc_tweet)
precision recall f1-score support
negative 0.81 0.95 0.87 1889
neutral 0.68 0.42 0.52 580
positive 0.80 0.61 0.69 459
accuracy 0.79 2928
macro avg 0.76 0.66 0.69 2928
weighted avg 0.78 0.79 0.77 2928

scikit-learn - Machine Learning in Python

linear_svc_tweet = svm.LinearSVC()
linear_svc_tweet.fit(X_tweet_tfidf_train, y_tweet_train)
report(linear_svc_tweet)
precision recall f1-score support
negative 0.85 0.91 0.88 1889
neutral 0.64 0.54 0.58 580
positive 0.76 0.67 0.71 459
accuracy 0.80 2928
macro avg 0.75 0.71 0.72 2928
weighted avg 0.79 0.80 0.79 2928

scikit-learn - Machine Learning in Python

nb_tweets = MultinomialNB()
nb_tweets.fit(X_tweet_tfidf_train, y_tweet_train)
report(nb_tweets)
# The Naive Bayes classifies almost all tweets as negative
# which means it does well with searching neg tweets
# but ends up classifying a lot neutral and pos tweets as neg
precision recall f1-score support
negative 0.69 0.99 0.81 1889
neutral 0.75 0.15 0.25 580
positive 0.94 0.18 0.31 459
accuracy 0.70 2928
macro avg 0.79 0.44 0.46 2928
weighted avg 0.74 0.70 0.62 2928

scikit-learn - Machine Learning in Python

Model Deployment

# building a pipeline to ingest new tweets with the best performing model
pipe = Pipeline(
    [
        ('tfidf', TfidfVectorizer()),
        ('svc', svm.SVC())
    ]
)
# before deployment retrain on entire dataset
pipe.fit(X_tweet, y_tweet)
# test prediction
print(pipe.predict([
    'good flight',
    'terrible service',
    'too late',
    'ok flight',
    'Thank you'
]))
# ['positive' 'negative' 'negative' 'neutral' 'positive']

Text Classification

IMDB Dataset of 50K Movie Reviews https://ai.stanford.edu/~amaas/data/sentiment/

Data Exploration

imdb_df = pd.read_csv('datasets/moviereviews.csv')
imdb_df.head()
label review
0 neg how do films like mouse hunt get into theatres...
1 neg some talented actresses are blessed with a dem...
2 pos this has been an extraordinary year for austra...
3 pos according to hollywood movies made in last few...
4 neg my first press screening of 1998 and already i...
imdb_df.info()

# <class 'pandas.core.frame.DataFrame'>
# RangeIndex: 2000 entries, 0 to 1999
# Data columns (total 2 columns):
#  #   Column  Non-Null Count  Dtype 
# ---  ------  --------------  ----- 
#  0   label   2000 non-null   object
#  1   review  1965 non-null   object
# dtypes: object(2)
# memory usage: 31.4+ KB
# find missing
imdb_df.isnull().sum()
# label      0
# review    35
# dtype: int64
# drop missing
imdb_df = imdb_df.dropna(axis=0)
imdb_df.isnull().sum()
# label     0
# review    0
# dtype: int64
# make sure there are no empty-string reviews
# (imdb_df['review'] == '  ').sum()
imdb_df['review'].str.isspace().sum()
# 27
# remove whitespace-only and empty string reviews
imdb_df = imdb_df[~imdb_df['review'].str.isspace()]
imdb_df = imdb_df[imdb_df['review'] != '']
imdb_df['review'].str.isspace().sum()
# 0
# is the dataset balanced
imdb_df['label'].value_counts()
# neg    969
# pos    969
# Name: label, dtype: int64

Top 30 Features by Label

# find top 30 words in negative reviews
imdb_neg_df = imdb_df[imdb_df['label'] == 'neg']

count_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
bag_of_words = count_vectorizer.fit_transform(imdb_neg_df['review'])
# total occurrences of each vocabulary term across all negative reviews
sum_words = bag_of_words.sum(axis=0)
words_freq = [
    (word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()
]

# sort by frequency, highest first, and keep the top 30
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
x, y = zip(*words_freq[:30])

plt.figure(figsize=(12,5))
plt.bar(x,y)
plt.xticks(rotation=90)
plt.title('Top30 Words used in Negative Reviews')

plt.savefig('assets/Scikit_Learn_62.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# find top 30 words in positive reviews
imdb_pos_df = imdb_df[imdb_df['label'] != 'neg']

count_vectorizer = CountVectorizer(analyzer='word', stop_words='english')
bag_of_words = count_vectorizer.fit_transform(imdb_pos_df['review'])
# total occurrences of each vocabulary term across all positive reviews
sum_words = bag_of_words.sum(axis=0)
words_freq = [
    (word, sum_words[0, idx]) for word, idx in count_vectorizer.vocabulary_.items()
]

# sort by frequency, highest first, and keep the top 30
words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
x, y = zip(*words_freq[:30])

plt.figure(figsize=(12,5))
plt.bar(x,y)
plt.xticks(rotation=90)
plt.title('Top30 Words used in Positive Reviews')

plt.savefig('assets/Scikit_Learn_63.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Data Preprocessing

X_rev = imdb_df['review']
y_rev = imdb_df['label']
# train/ test split
X_rev_train, X_rev_test, y_rev_train, y_rev_test = train_test_split(
    X_rev,
    y_rev,
    test_size=0.2,
    random_state=42
)
tfidf_rev_vec = TfidfVectorizer(
    lowercase=True,
    analyzer='word',
    stop_words='english'
)

X_rev_tfidf_train = tfidf_rev_vec.fit_transform(X_rev_train)
X_rev_tfidf_test = tfidf_rev_vec.transform(X_rev_test)

Model Training

nb_rev = MultinomialNB()
nb_rev.fit(X_rev_tfidf_train, y_rev_train)
preds = nb_rev.predict(X_rev_tfidf_test)
print(classification_report(y_rev_test, preds))
precision recall f1-score support
neg 0.79 0.88 0.83 188
pos 0.87 0.78 0.82 200
accuracy 0.82 388
macro avg 0.83 0.83 0.82 388
weighted avg 0.83 0.82 0.82 388
conf_mtx = confusion_matrix(y_rev_test, preds)
conf_mtx_plot = ConfusionMatrixDisplay(
     confusion_matrix=conf_mtx
)
conf_mtx_plot.plot(cmap='plasma')

scikit-learn - Machine Learning in Python

Unsupervised Learning - KMeans Clustering

Dataset Exploration

!wget https://github.com/selva86/datasets/raw/master/bank-full.csv -P datasets
bank_df = pd.read_csv('datasets/bank-full.csv', sep=';')
bank_df.head(5).transpose()
0 1 2 3 4
age 56 57 37 40 56
job housemaid services services admin. services
marital married married married married married
education basic.4y high.school high.school basic.6y high.school
default no unknown no no no
housing no no yes no no
loan no no no no yes
contact telephone telephone telephone telephone telephone
month may may may may may
day_of_week mon mon mon mon mon
duration 261 149 226 151 307
campaign 1 1 1 1 1
pdays 999 999 999 999 999
previous 0 0 0 0 0
poutcome nonexistent nonexistent nonexistent nonexistent nonexistent
emp.var.rate 1.1 1.1 1.1 1.1 1.1
cons.price.idx 93.994 93.994 93.994 93.994 93.994
cons.conf.idx -36.4 -36.4 -36.4 -36.4 -36.4
euribor3m 4.857 4.857 4.857 4.857 4.857
nr.employed 5191.0 5191.0 5191.0 5191.0 5191.0
y no no no no no
bank_df.describe()
age duration campaign pdays previous emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
count 41188.00000 41188.000000 41188.000000 41188.000000 41188.000000 41188.000000 41188.000000 41188.000000 41188.000000 41188.000000
mean 40.02406 258.285010 2.567593 962.475454 0.172963 0.081886 93.575664 -40.502600 3.621291 5167.035911
std 10.42125 259.279249 2.770014 186.910907 0.494901 1.570960 0.578840 4.628198 1.734447 72.251528
min 17.00000 0.000000 1.000000 0.000000 0.000000 -3.400000 92.201000 -50.800000 0.634000 4963.600000
25% 32.00000 102.000000 1.000000 999.000000 0.000000 -1.800000 93.075000 -42.700000 1.344000 5099.100000
50% 38.00000 180.000000 2.000000 999.000000 0.000000 1.100000 93.749000 -41.800000 4.857000 5191.000000
75% 47.00000 319.000000 3.000000 999.000000 0.000000 1.400000 93.994000 -36.400000 4.961000 5228.100000
max 98.00000 4918.000000 56.000000 999.000000 7.000000 1.400000 94.767000 -26.900000 5.045000 5228.100000
plt.figure(figsize=(12, 5))
plt.title('Age Distribution by Marital Status')

sns.histplot(
    data=bank_df,
    x='age',
    bins=50,
    hue='marital',
    palette='winter',
    kde=True
)

plt.savefig('assets/Scikit_Learn_65.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12, 5))
plt.title('Age Distribution by Loan Status')

sns.histplot(
    data=bank_df,
    x='age',
    bins=50,
    hue='loan',
    palette='winter',
    kde=True
)

plt.savefig('assets/Scikit_Learn_66.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# filter out rows where `pdays` == 999 (placeholder for 'never contacted')
# before plotting -- note this filters rows, not columns
plt.figure(figsize=(12, 5))
plt.title('Distribution of Days Since Last Contacted by Loan Status')

sns.histplot(
    data=bank_df[bank_df['pdays'] != 999],
    x='pdays',
    hue='loan',
    palette='winter',
    kde=True
)

plt.savefig('assets/Scikit_Learn_67.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# Create call duration in minutes column
# vectorized division instead of an element-wise .apply(lambda x: x/60) --
# same values, one native pass over the column
bank_df['duration_minutes'] = (bank_df['duration'] / 60).round(1)

plt.figure(figsize=(12, 5))
plt.title('Distribution Contact Duration by Contact Type')
plt.xlim(0,20)
sns.histplot(
    data=bank_df,
    x='duration_minutes',
    hue='contact',
    palette='winter',
    kde=True
)

plt.savefig('assets/Scikit_Learn_68.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# job counts ordered by overall frequency, split by default status
plt.figure(figsize=(16, 5))
plt.title('Customer Jobs Countplot by Loan Defaults')

job_order = bank_df['job'].value_counts().index
sns.countplot(
    bank_df,
    x='job',
    hue='default',
    order=job_order,
    palette='winter',
)

plt.savefig('assets/Scikit_Learn_69.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# education-level counts ordered by overall frequency, split by default status
plt.figure(figsize=(16, 5))
plt.title('Customer Education Countplot by Loan Defaults')

education_order = bank_df['education'].value_counts().index
sns.countplot(
    bank_df,
    x='education',
    hue='default',
    order=education_order,
    palette='winter',
)

plt.savefig('assets/Scikit_Learn_70.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# pairwise feature relationships, coloured by marital status
sns.pairplot(bank_df, hue='marital', palette='winter')

plt.savefig('assets/Scikit_Learn_71.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Dataset Preprocessing

# encode categorical features
# (one-hot encode every object/categorical column into dummy indicator columns)
X_bank = pd.get_dummies(bank_df)
# normalize data
# z-score standardization so KMeans distances are not dominated by large-range features
bank_scaler = StandardScaler()

X_bank_scaled = bank_scaler.fit_transform(X_bank)

Model Training

# cluster the scaled bank data into 2 groups
bank_model = KMeans(
    n_clusters=2,
    n_init='auto',
    random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
bank_cluster_labels = bank_model.fit_predict(X_bank_scaled)
# add predicted label to source dataframe
X_bank['Cluster'] = bank_cluster_labels
X_bank['Cluster'].value_counts()
# 0    26871
# 1    14317
# Name: Cluster, dtype: int64
# How do the feature correlate with the predicted labels
# ('Cluster' was appended last, so iloc[:-1] drops its self-correlation of 1.0)
label_corr = X_bank.corr()['Cluster']
print(label_corr.iloc[:-1].sort_values())
plt.figure(figsize=(10,14))
label_corr.iloc[:-1].sort_values().plot(kind='barh')
plt.title('Feature Importance')

plt.savefig('assets/Scikit_Learn_72.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Choosing a K Value

# visualize the sum distance of your datapoints to the
# predicted cluster centers as a function of number of clusters
# (elbow method: inertia_ is the within-cluster sum of squared distances)
sum_squared_distance = []

for k in range(2,20):
    model = KMeans(n_clusters=k, n_init='auto')
    model.fit(X_bank_scaled)
    
    sum_squared_distance.append(model.inertia_)
plt.figure(figsize=(10,5))
plt.title('SSD as a Function of Number of Cluster')
plt.plot(range(2,20), sum_squared_distance, 'o--')

plt.savefig('assets/Scikit_Learn_73.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# bar chart of successive SSD differences - large drops mark candidate elbows
plt.figure(figsize=(10,5))
plt.title('Difference in SSD as a Function of Number of Clusters')
ssd_deltas = pd.Series(sum_squared_distance).diff()
ssd_deltas.plot(kind='bar')

plt.savefig('assets/Scikit_Learn_74.webp', bbox_inches='tight')

There are two 'elbows' - one between k=5-6 (behold the 0-index in Pandas!) and the second one between k=14-15. Both of them are potentially good values for the number of clusters k.

scikit-learn - Machine Learning in Python

Re-fitting the Model

# re-fit with k=6 (the first elbow found above)
bank_model = KMeans(
    n_clusters=6,
    n_init='auto',
    random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
bank_cluster_labels = bank_model.fit_predict(X_bank_scaled)
# add predicted label to source dataframe
X_bank['Cluster'] = bank_cluster_labels
X_bank['Cluster'].value_counts()
# 5    10713
# 0    10663
# 1     8164
# 3     5566
# 4     3322
# 2     2760
# Name: Cluster, dtype: int64

Example 1 : Color Quantization

img_array = mpimg.imread('assets/gz.jpg')
img_array.shape
# (325, 640, 3)
plt.imshow(img_array)
plt.title('Original Image')
plt.savefig('assets/Scikit_Learn_75.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# flatten the image from 3 to 2 dimensions:
# collapse the (height, width) pixel grid into one flat list of RGB triples
(height, width, colour) = img_array.shape
img_array2d = img_array.reshape(height * width, colour)
img_array2d.shape
# (208000, 3)

# reduce colour space to 6 clusters of similar pixel colours
colour_model = KMeans(n_clusters=6, n_init='auto')
colour_labels = colour_model.fit_predict(img_array2d)

# integer RGB value of each of the 6 cluster centers
rgb_colours = colour_model.cluster_centers_.round(0).astype(int)
rgb_colours
# array([[186, 111,  58],
#        [ 31,  11,  16],
#        [135,  72,  46],
#        [236, 157,  73],
#        [ 81,  40,  34],
#        [252, 199, 125]])

# replace every pixel with its cluster-center colour
# and restore the original 3d image shape
quantized_image = rgb_colours[colour_labels].reshape(height, width, colour)
plt.imshow(quantized_image)
plt.title('Quantized Image')
plt.savefig('assets/Scikit_Learn_76.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Example 2 : Country Clustering

Dataset Exploration

!wget https://github.com/priyansh21112002/CIA-Country-Description/raw/main/CIA_Country_Facts.csv -P datasets
country_df = pd.read_csv('datasets/CIA_Country_Facts.csv')
country_df.head(5).transpose()
0 1 2 3 4
Country Afghanistan Albania Algeria American Samoa Andorra
Region ASIA (EX. NEAR EAST) EASTERN EUROPE NORTHERN AFRICA OCEANIA WESTERN EUROPE
Population 31056997 3581655 32930091 57794 71201
Area (sq. mi.) 647500 28748 2381740 199 468
Pop. Density (per sq. mi.) 48.0 124.6 13.8 290.4 152.1
Coastline (coast/area ratio) 0.0 1.26 0.04 58.29 0.0
Net migration 23.06 -4.93 -0.39 -20.71 6.6
Infant mortality (per 1000 births) 163.07 21.52 31.0 9.27 4.05
GDP ($ per capita) 700.0 4500.0 6000.0 8000.0 19000.0
Literacy (%) 36.0 86.5 70.0 97.0 100.0
Phones (per 1000) 3.2 71.2 78.1 259.5 497.2
Arable (%) 12.13 21.09 3.22 10.0 2.22
Crops (%) 0.22 4.42 0.25 15.0 0.0
Other (%) 87.65 74.49 96.53 75.0 97.78
Climate 1.0 3.0 1.0 2.0 3.0
Birthrate 46.6 15.11 17.14 22.46 8.71
Deathrate 20.34 5.22 4.61 3.27 6.25
Agriculture 0.38 0.232 0.101 NaN NaN
Industry 0.24 0.188 0.6 NaN NaN
Service 0.38 0.579 0.298 NaN NaN
fig, axes = plt.subplots(figsize=(10,5), nrows=1, ncols=2)
plt.suptitle('Country Population Histogram')

# left panel: full population range; right panel: zoomed to < 100 million
for ax, rng, colour, xlabel in (
    (axes[0], None, 'dodgerblue', 'Population'),
    (axes[1], [0, 1e8], 'fuchsia', 'Population (<100Mio)'),
):
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Frequency')
    ax.hist(
        x=country_df['Population'],
        range=rng,
        density=True,
        histtype='bar',
        orientation='vertical',
        color=colour
    )

plt.savefig('assets/Scikit_Learn_77.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# mean GDP per region, horizontal bars with standard-deviation error bars
plt.figure(figsize=(12, 5))
plt.title('GDP ($ per capita) by Region')

sns.barplot(
    country_df,
    x='GDP ($ per capita)',
    y='Region',
    orient='h',
    estimator=np.mean,
    errorbar='sd',
    palette='cool'
)

plt.savefig('assets/Scikit_Learn_78.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10, 6))

# phone density vs GDP, coloured by region
ax = sns.scatterplot(
    data=country_df,
    x='GDP ($ per capita)',
    y='Phones (per 1000)',
    hue='Region',
    palette='cool',
)
ax.set_title('GDP ($ per capita) vs. Phones (per 1000)')

plt.savefig('assets/Scikit_Learn_79.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10, 6))

# literacy vs GDP, coloured by region
ax = sns.scatterplot(
    data=country_df,
    x='GDP ($ per capita)',
    y='Literacy (%)',
    hue='Region',
    palette='cool',
)
ax.set_title('GDP ($ per capita) vs. Literacy (%)')

plt.savefig('assets/Scikit_Learn_80.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(20, 12), dpi=200)
plt.title('Correlation Heatmap CIA Country Dataset')

# annotated correlation matrix over the numeric columns only
corr_matrix = country_df.corr(numeric_only=True)
sns.heatmap(corr_matrix, annot=True, cmap='seismic', linewidth=0.5)

plt.savefig('assets/Scikit_Learn_81.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(20, 12), dpi=200)

# clustermap reorders rows so that correlated features sit next to each other
sns.clustermap(
    country_df.corr(numeric_only=True),
    annot=False,
    cmap='seismic',
    linewidth=0.5,
    col_cluster=False
)

plt.savefig('assets/Scikit_Learn_82.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Dataset Preprocessing

# find columns with missing values
country_df.isnull().sum()
Country 0
Region 0
Population 0
Area (sq. mi.) 0
Pop. Density (per sq. mi.) 0
Coastline (coast/area ratio) 0
Net migration 3
Infant mortality (per 1000 births) 3
GDP ($ per capita) 1
Literacy (%) 18
Phones (per 1000) 4
Arable (%) 2
Crops (%) 2
Other (%) 2
Climate 22
Birthrate 3
Deathrate 4
Agriculture 15
Industry 16
Service 15
dtype: int64
# what countries don't have an agriculture value
country_df[pd.isnull(country_df['Agriculture'])]['Country']
# all countries without agriculture data will not have a
# whole lot of agriculture output. The same is true for 'Industry'
# and 'Service' These values can be set to zero:
3 American Samoa
4 Andorra
78 Gibraltar
80 Greenland
83 Guam
134 Mayotte
140 Montserrat
144 Nauru
153 N. Mariana Islands
171 Saint Helena
174 St Pierre & Miquelon
177 San Marino
208 Turks & Caicos Is
221 Wallis and Futuna
223 Western Sahara
Name: Country, dtype: object
# set missing values to zero for Agriculture, Industry and Service
# define what default values you want to fill
values = {
    "Agriculture": 0,
    "Industry": 0,
    "Service": 0,
}
# and replace missing with values
# (fillna(value=dict) only touches NaNs in the listed columns; all others are untouched)
country_df = country_df.fillna(value=values)
# another datapoint that is often missing is climate
# the climate can be estimated by countries in the same Region
country_df[pd.isnull(country_df['Climate'])][['Country', 'Region', 'Climate']]
Country Region Climate
5 Angola SUB-SAHARAN AFRICA NaN
36 Canada NORTHERN AMERICA NaN
50 Croatia EASTERN EUROPE NaN
66 Faroe Islands WESTERN EUROPE NaN
78 Gibraltar WESTERN EUROPE NaN
101 Italy WESTERN EUROPE NaN
115 Lebanon NEAR EAST NaN
118 Libya NORTHERN AFRICA NaN
120 Lithuania BALTICS NaN
121 Luxembourg WESTERN EUROPE NaN
129 Malta WESTERN EUROPE NaN
137 Moldova C.W. OF IND. STATES NaN
138 Monaco WESTERN EUROPE NaN
141 Morocco NORTHERN AFRICA NaN
145 Nepal ASIA (EX. NEAR EAST) NaN
169 Russia C.W. OF IND. STATES NaN
171 Saint Helena SUB-SAHARAN AFRICA NaN
174 St Pierre & Miquelon NORTHERN AMERICA NaN
177 San Marino WESTERN EUROPE NaN
181 Serbia EASTERN EUROPE NaN
186 Slovenia EASTERN EUROPE NaN
200 Tanzania SUB-SAHARAN AFRICA NaN
country_df[pd.isnull(country_df['Climate'])]['Region'].value_counts()
WESTERN EUROPE 7
SUB-SAHARAN AFRICA 3
EASTERN EUROPE 3
NORTHERN AMERICA 2
NORTHERN AFRICA 2
C.W. OF IND. STATES 2
NEAR EAST 1
BALTICS 1
ASIA (EX. NEAR EAST) 1
Name: Region, dtype: int64
# the Region value has annoying whitespaces that need to be stripped
country_df['Region'] = country_df['Region'].apply(lambda x: x.strip())
# climate zones in western europe
country_df[country_df['Region'] == 'WESTERN EUROPE']['Climate'].value_counts()
# climate zones in SUB-SAHARAN AFRICA
country_df[country_df['Region'] == 'SUB-SAHARAN AFRICA']['Climate'].value_counts()
# climate zones in EASTERN EUROPE
country_df[country_df['Region'] == 'EASTERN EUROPE']['Climate'].value_counts()
# climate zones in NORTHERN AMERICA
country_df[country_df['Region'] == 'NORTHERN AMERICA']['Climate'].value_counts()
# climate zones in NORTHERN AFRICA
country_df[country_df['Region'] == 'NORTHERN AFRICA']['Climate'].value_counts()
# climate zones in C.W. OF IND. STATES
country_df[country_df['Region'] == 'C.W. OF IND. STATES']['Climate'].value_counts()
# climate zones in NEAR EAST
country_df[country_df['Region'] == 'NEAR EAST']['Climate'].value_counts()
# climate zones in BALTICS
country_df[country_df['Region'] == 'BALTICS']['Climate'].value_counts()
# climate zones in ASIA (EX. NEAR EAST)
country_df[country_df['Region'] == 'ASIA (EX. NEAR EAST)']['Climate'].value_counts()
# we can either use the top value to fill missing climate data points
# or use a mean value:
# (groupby().transform('mean') broadcasts each region's mean Climate back
# onto that region's rows, so every NaN is filled with its own region's mean)
country_df['Climate'] = country_df['Climate'].fillna(country_df.groupby('Region')['Climate'].transform('mean'))
# there are more missing values, e.g. literacy:
country_df[pd.isnull(country_df['Literacy (%)'])][['Country', 'Region', 'Literacy (%)']]
Country Region Literacy (%)
25 Bosnia & Herzegovina EASTERN EUROPE NaN
66 Faroe Islands WESTERN EUROPE NaN
74 Gaza Strip NEAR EAST NaN
78 Gibraltar WESTERN EUROPE NaN
80 Greenland NORTHERN AMERICA NaN
85 Guernsey WESTERN EUROPE NaN
99 Isle of Man WESTERN EUROPE NaN
104 Jersey WESTERN EUROPE NaN
108 Kiribati OCEANIA NaN
123 Macedonia EASTERN EUROPE NaN
134 Mayotte SUB-SAHARAN AFRICA NaN
144 Nauru OCEANIA NaN
185 Slovakia EASTERN EUROPE NaN
187 Solomon Islands OCEANIA NaN
209 Tuvalu OCEANIA NaN
220 Virgin Islands LATIN AMER. & CARIB NaN
222 West Bank NEAR EAST NaN
223 Western Sahara NORTHERN AFRICA NaN
# here we can also fill with mean values:
country_df['Literacy (%)'] = country_df['Literacy (%)'].fillna(country_df.groupby('Region')['Literacy (%)'].transform('mean'))
# the remaining rows with missing values can be dropped for now
country_df = country_df.dropna(axis=0)
country_df.isnull().sum()
Country 0
Region 0
Population 0
Area (sq. mi.) 0
Pop. Density (per sq. mi.) 0
Coastline (coast/area ratio) 0
Net migration 0
Infant mortality (per 1000 births) 0
GDP ($ per capita) 0
Literacy (%) 0
Phones (per 1000) 0
Arable (%) 0
Crops (%) 0
Other (%) 0
Climate 0
Birthrate 0
Deathrate 0
Agriculture 0
Industry 0
Service 0
dtype: int64
# drop the country column as it is a unique
# classifier that will not help with clustering
country_df_dropped = country_df.drop(['Country'], axis=1)
# the region column is useful but needs to be encoded
# (one-hot dummy columns, one per region)
country_df_dropped = pd.get_dummies(country_df_dropped)
country_df_dropped.head(5).transpose()
0 1 2 3 4
Population 31056997.00 3581655.000 3.293009e+07 57794.00 71201.00
Area (sq. mi.) 647500.00 28748.000 2.381740e+06 199.00 468.00
Pop. Density (per sq. mi.) 48.00 124.600 1.380000e+01 290.40 152.10
Coastline (coast/area ratio) 0.00 1.260 4.000000e-02 58.29 0.00
Net migration 23.06 -4.930 -3.900000e-01 -20.71 6.60
Infant mortality (per 1000 births) 163.07 21.520 3.100000e+01 9.27 4.05
GDP ($ per capita) 700.00 4500.000 6.000000e+03 8000.00 19000.00
Literacy (%) 36.00 86.500 7.000000e+01 97.00 100.00
Phones (per 1000) 3.20 71.200 7.810000e+01 259.50 497.20
Arable (%) 12.13 21.090 3.220000e+00 10.00 2.22
Crops (%) 0.22 4.420 2.500000e-01 15.00 0.00
Other (%) 87.65 74.490 9.653000e+01 75.00 97.78
Climate 1.00 3.000 1.000000e+00 2.00 3.00
Birthrate 46.60 15.110 1.714000e+01 22.46 8.71
Deathrate 20.34 5.220 4.610000e+00 3.27 6.25
Agriculture 0.38 0.232 1.010000e-01 0.00 0.00
Industry 0.24 0.188 6.000000e-01 0.00 0.00
Service 0.38 0.579 2.980000e-01 0.00 0.00
Region_ASIA (EX. NEAR EAST) 1.00 0.000 0.000000e+00 0.00 0.00
Region_BALTICS 0.00 0.000 0.000000e+00 0.00 0.00
Region_C.W. OF IND. STATES 0.00 0.000 0.000000e+00 0.00 0.00
Region_EASTERN EUROPE 0.00 1.000 0.000000e+00 0.00 0.00
Region_LATIN AMER. & CARIB 0.00 0.000 0.000000e+00 0.00 0.00
Region_NEAR EAST 0.00 0.000 0.000000e+00 0.00 0.00
Region_NORTHERN AFRICA 0.00 0.000 1.000000e+00 0.00 0.00
Region_NORTHERN AMERICA 0.00 0.000 0.000000e+00 0.00 0.00
Region_OCEANIA 0.00 0.000 0.000000e+00 1.00 0.00
Region_SUB-SAHARAN AFRICA 0.00 0.000 0.000000e+00 0.00 0.00
Region_WESTERN EUROPE 0.00 0.000 0.000000e+00 0.00 1.00
# to be able to compare all datapoints they need to be normalized
# (z-score standardization: zero mean, unit variance per feature)
country_scaler = StandardScaler()
country_df_scaled = country_scaler.fit_transform(country_df_dropped)

Model Training

# finding a good k-value for number of cluster
# (elbow method: inertia_ is the within-cluster sum of squared distances)
ssd_country = []

for k in range(2,30):
    model = KMeans(n_clusters=k, n_init='auto')
    model.fit(country_df_scaled)
    
    ssd_country.append(model.inertia_)
plt.figure(figsize=(10,5))
plt.title('SSD as a Function of Number of Cluster')
plt.plot(range(2,30), ssd_country, 'o--')

plt.savefig('assets/Scikit_Learn_83.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10,5))
plt.title('Difference in SSD as a Function of Number of Clusters')
pd.Series(ssd_country).diff().plot(kind='bar')

plt.savefig('assets/Scikit_Learn_84.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# cluster countries with k=14 (second elbow candidate from the SSD plot)
country_model = KMeans(
    n_clusters=14,
    n_init='auto',
    random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
country_cluster_labels = country_model.fit_predict(country_df_scaled)

Model Evaluation

# add predicted label to source dataframe
country_df['Cluster14'] = country_cluster_labels
country_df['Cluster14'].value_counts()
plt.figure(figsize=(10, 7))
sns.set(style='darkgrid')

# hue/style by categorical column
# (colour by cluster label, marker shape by region)
sns.scatterplot(
    x='GDP ($ per capita)',
    y='Literacy (%)',
    data=country_df,
    s=40,
    alpha=0.6,
    hue='Cluster14',
    palette='cool',
    style='Region'
).set_title('Country Clusters with k=14')

plt.savefig('assets/Scikit_Learn_85.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# repeat but only with 3 clusters (the first elbow candidate)
country_model2 = KMeans(
    n_clusters=3,
    n_init='auto',
    random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
country_cluster_labels2 = country_model2.fit_predict(country_df_scaled)

# add predicted label to source dataframe
country_df['Cluster3'] = country_cluster_labels2

plt.figure(figsize=(10, 7))
sns.set(style='darkgrid')

# hue/style by categorical column
# (colour by cluster label, marker shape by region)
sns.scatterplot(
    x='GDP ($ per capita)',
    y='Literacy (%)',
    data=country_df,
    s=40,
    alpha=0.6,
    hue='Cluster3',
    palette='cool',
    style='Region'
).set_title('Country Clusters with k=3')

plt.savefig('assets/Scikit_Learn_86.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# How do the feature correlate with the predicted labels
country_label_corr = country_df.corr()['Cluster3']
print(country_label_corr.iloc[:-1].sort_values())

Feature Correlation

Literacy (%) -0.413704
Crops (%) -0.152936
Coastline (coast/area ratio) -0.132610
Service -0.070495
Area (sq. mi.) -0.062183
Phones (per 1000) -0.037538
Population -0.024969
Industry 0.008487
Arable (%) 0.034891
Climate 0.049659
Other (%) 0.050444
Pop. Density (per sq. mi.) 0.101062
GDP ($ per capita) 0.122206
Agriculture 0.250750
Net migration 0.316226
Birthrate 0.369940
Infant mortality (per 1000 births) 0.412365
Deathrate 0.575814
Name: Cluster3, dtype: float64
plt.figure(figsize=(10,6))
country_label_corr.iloc[:-1].sort_values().plot(kind='barh')
plt.title('Feature Importance')

plt.savefig('assets/Scikit_Learn_87.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Plotly Choropleth Map

# map country names to ISO-3 codes so plotly can locate them on the world map
iso_codes = pd.read_csv('datasets/country-iso-codes.csv')
iso_map = iso_codes.set_index('Country')['ISO Code'].to_dict()
# NOTE(review): names missing from the mapping become NaN - verify all countries match
country_df['ISO Code'] = country_df['Country'].map(iso_map)
country_df[['Country','ISO Code']].head(5)
Country ISO Code
0 Afghanistan AFG
1 Albania ALB
2 Algeria DZA
3 American Samoa ASM
4 Andorra AND
# world map coloured by the 3-cluster assignment
fig = px.choropleth(
    country_df,
    locations='ISO Code',
    hover_name='Country',
    color='Cluster3',
    color_continuous_scale=px.colors.sequential.Plasma
)

fig.show()

scikit-learn - Machine Learning in Python

# world map coloured by the 14-cluster assignment
fig = px.choropleth(
    country_df,
    locations='ISO Code',
    hover_name='Country',
    color='Cluster14',
    color_continuous_scale=px.colors.sequential.Plasma
)

fig.show()

scikit-learn - Machine Learning in Python

Unsupervised Learning - Agglomerative Clustering

Dataset Preprocessing

autompg_data: The Auto-MPG dataset for regression Revised from CMU StatLib library, data concerns city-cycle fuel consumption

autoMPG_df = pd.read_csv('datasets/auto-mpg.csv')
autoMPG_df.head(5)
mpg cylinders displacement horsepower weight acceleration model_year origin name
0 18.0 8 307.0 130.0 3504 12.0 70 usa chevrolet chevelle malibu
1 15.0 8 350.0 165.0 3693 11.5 70 usa buick skylark 320
2 18.0 8 318.0 150.0 3436 11.0 70 usa plymouth satellite
3 16.0 8 304.0 150.0 3433 12.0 70 usa amc rebel sst
4 17.0 8 302.0 140.0 3449 10.5 70 usa ford torino
autoMPG_df['origin'].value_counts()
# there are only 3 countries of origin - can be turned into a dummy variable
autoMPG_dummy_df = pd.get_dummies(autoMPG_df.drop('name', axis=1))
autoMPG_dummy_df.head(5)
mpg cylinders displacement horsepower weight acceleration model_year origin_europe origin_japan origin_usa
0 18.0 8 307.0 130.0 3504 12.0 70 False False True
1 15.0 8 350.0 165.0 3693 11.5 70 False False True
2 18.0 8 318.0 150.0 3436 11.0 70 False False True
3 16.0 8 304.0 150.0 3433 12.0 70 False False True
4 17.0 8 302.0 140.0 3449 10.5 70 False False True
# normalize dataset
# (MinMaxScaler squeezes every feature into the [0, 1] range)
scaler = MinMaxScaler()
autoMPG_scaled = pd.DataFrame(
    scaler.fit_transform(autoMPG_dummy_df), columns=autoMPG_dummy_df.columns
)
autoMPG_scaled.describe()
mpg cylinders displacement horsepower weight acceleration model_year origin_europe origin_japan origin_usa
count 392.000000 392.000000 392.000000 392.000000 392.000000 392.000000 392.000000 392.000000 392.000000 392.000000
mean 0.384200 0.494388 0.326646 0.317768 0.386897 0.448888 0.498299 0.173469 0.201531 0.625000
std 0.207580 0.341157 0.270398 0.209191 0.240829 0.164218 0.306978 0.379136 0.401656 0.484742
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.212766 0.200000 0.095607 0.157609 0.173589 0.343750 0.250000 0.000000 0.000000 0.000000
50% 0.365691 0.200000 0.214470 0.258152 0.337539 0.446429 0.500000 0.000000 0.000000 1.000000
75% 0.531915 1.000000 0.536822 0.434783 0.567550 0.537202 0.750000 0.000000 0.000000 1.000000
max 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
# heatmap of the full scaled dataset (rows = cars, columns = features)
plt.figure(figsize=(12,10))
sns.heatmap(autoMPG_scaled, cmap='viridis', annot=False)

plt.savefig('assets/Scikit_Learn_90.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# clustermap of the feature correlations; the row dendrogram hints
# at how many clusters the data naturally forms
sns.clustermap(
    autoMPG_scaled.corr(numeric_only=True),
    annot=True,
    cmap='seismic',
    linewidth=0.5,
    col_cluster=False
)

plt.savefig('assets/Scikit_Learn_91.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Assigning Cluster Labels

Known Number of Clusters

# there are ~ 4 clusters visible - let's try to agglomerate them
autoMPG_model = AgglomerativeClustering(n_clusters=4)
cluster_labels = autoMPG_model.fit_predict(autoMPG_scaled)
# attach each car's cluster label to the (unscaled) source dataframe
autoMPG_df['label'] = cluster_labels
autoMPG_df.head(5)
mpg cylinders displacement horsepower weight acceleration model_year origin name label
0 18.0 8 307.0 130.0 3504 12.0 70 usa chevrolet chevelle malibu 2
1 15.0 8 350.0 165.0 3693 11.5 70 usa buick skylark 320 2
2 18.0 8 318.0 150.0 3436 11.0 70 usa plymouth satellite 2
3 16.0 8 304.0 150.0 3433 12.0 70 usa amc rebel sst 2
4 17.0 8 302.0 140.0 3449 10.5 70 usa ford torino 2
plt.figure(figsize=(12,5))
# horsepower vs mpg, coloured by cluster label, marker shape by origin
ax = sns.scatterplot(
    data=autoMPG_df,
    x='mpg',
    y='horsepower',
    hue='label',
    style='origin',
    palette='cool_r',
)
ax.set_title('Horsepower as a function of Miles-per-gallon')

plt.savefig('assets/Scikit_Learn_92.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12,5))
# mpg vs model year, coloured by cluster label, marker shape by origin
ax = sns.scatterplot(
    data=autoMPG_df,
    x='model_year',
    y='mpg',
    hue='label',
    style='origin',
    palette='cool_r',
)
ax.set_title('Model Year as a function of Miles-per-gallon')
plt.legend(bbox_to_anchor=(1.01,1.01))

plt.savefig('assets/Scikit_Learn_93.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# one scatter subplot per country of origin, coloured by cluster label.
# Removed the stale axes[i].set_title(...) calls ('second chart with no data',
# 'Europe' on the Japan panel) - they were copy-paste leftovers immediately
# overwritten by the chained .set_title() on each scatterplot.
figure, axes = plt.subplots(1, 3, sharex=True,figsize=(15, 5))
figure.suptitle('Country of Origin')

sns.scatterplot(
    x='horsepower',
    y='mpg',
    data=autoMPG_df[autoMPG_df['origin'] == 'europe'],
    hue='label',
    palette='cool_r',
    style='model_year',
    ax=axes[0]
).set_title('Europe')

sns.scatterplot(
    x='horsepower',
    y='mpg',
    data=autoMPG_df[autoMPG_df['origin'] == 'japan'],
    hue='label',
    palette='cool_r',
    style='model_year',
    ax=axes[1]
).set_title('Japan')

sns.scatterplot(
    x='horsepower',
    y='mpg',
    data=autoMPG_df[autoMPG_df['origin'] == 'usa'],
    hue='label',
    palette='cool_r',
    style='model_year',
    ax=axes[2]
).set_title('USA')
plt.legend(bbox_to_anchor=(1.01,1.01))

plt.savefig('assets/Scikit_Learn_94.webp', bbox_inches='tight')
# nice... perfect separation by country!

scikit-learn - Machine Learning in Python

Unknown Number of Clusters

The Clustermap created above allowed us to estimate the amount of clusters needed to accurately label the dataset based on the Dendrogram displayed on the left side. If we do not know how many clusters are present in our dataset, we can define a maximum distance threshold a cluster can have before being merged with surrounding clusters. Setting this threshold to zero results in a number of clusters == number of datapoints.

# distance_threshold=0 merges nothing: every datapoint becomes its own cluster
autoMPG_model_auto = AgglomerativeClustering(
    n_clusters=None,
    metric='euclidean',
    distance_threshold=0
)
cluster_labels_auto = autoMPG_model_auto.fit_predict(autoMPG_scaled)
len(np.unique(cluster_labels_auto))
# threshold of zero leads to 392 clusters == number of rows in our dataset
# find out a good distance threshold
# NOTE(review): hierarchy.linkage normally expects raw observations or a
# condensed distance matrix; feeding it the model's children_ merge array
# re-clusters the merge pairs themselves - confirm this is intended
linkage_matrix = hierarchy.linkage(autoMPG_model_auto.children_)
linkage_matrix
# [`cluster[i]`, `cluster[j]`, `distance between`, `number of members`]
# to display this matrix we can use the above mentioned dendrogram
plt.figure(figsize=(20,10))
# NOTE(review): title says 8 classes but p=9 truncates to 9 leaves - verify
plt.title('Hierarchy Dendrogram for 8 Classes')
dendro = hierarchy.dendrogram(linkage_matrix, truncate_mode='lastp', p=9)

plt.savefig('assets/Scikit_Learn_95.webp', bbox_inches='tight')
# The higher the y-value the larger the distance between the connected clusters
# The higher the y-value the larger the distance between the connected clusters

scikit-learn - Machine Learning in Python

# since the miles-per-gallons are a good indicator for the label
# what is the max distance between two points here:
car_max_mpg = autoMPG_scaled.iloc[autoMPG_scaled['mpg'].idxmax()]
car_min_mpg = autoMPG_scaled.iloc[autoMPG_scaled['mpg'].idxmin()]

# Euclidean distance between the two most extreme cars in scaled feature space
np.linalg.norm(car_max_mpg - car_min_mpg)
# 3.1128158766165406
# if the max distance is ~3 the threshold should be < 3
autoMPG_model_auto = AgglomerativeClustering(
    n_clusters=None,
    metric='euclidean',
    distance_threshold=2
)
cluster_labels_auto = autoMPG_model_auto.fit_predict(autoMPG_scaled)
len(np.unique(cluster_labels_auto))
# threshold of two leads to 11 clusters
autoMPG_model_auto = AgglomerativeClustering(
    n_clusters=None,
    metric='euclidean',
    distance_threshold=3
)
cluster_labels_auto = autoMPG_model_auto.fit_predict(autoMPG_scaled)
len(np.unique(cluster_labels_auto))
# threshold of three leads to 9 clusters
autoMPG_df['label_auto'] = cluster_labels_auto
# one scatter subplot per country of origin, coloured by the threshold-based
# cluster labels. Removed the stale axes[i].set_title(...) calls ('second chart
# with no data', 'Europe' on the Japan panel) - copy-paste leftovers that were
# immediately overwritten by the chained .set_title() on each scatterplot.
figure, axes = plt.subplots(1, 3, sharex=True,figsize=(15, 6))
figure.suptitle('Country of Origin')

sns.scatterplot(
    x='horsepower',
    y='mpg',
    data=autoMPG_df[autoMPG_df['origin'] == 'europe'],
    hue='label_auto',
    palette='cool_r',
    style='model_year',
    ax=axes[0]
).set_title('Europe')

sns.scatterplot(
    x='horsepower',
    y='mpg',
    data=autoMPG_df[autoMPG_df['origin'] == 'japan'],
    hue='label_auto',
    palette='cool_r',
    style='model_year',
    ax=axes[1]
).set_title('Japan')

sns.scatterplot(
    x='horsepower',
    y='mpg',
    data=autoMPG_df[autoMPG_df['origin'] == 'usa'],
    hue='label_auto',
    palette='cool_r',
    style='model_year',
    ax=axes[2]
).set_title('USA')
plt.legend(bbox_to_anchor=(1.01,1.01))

plt.savefig('assets/Scikit_Learn_96.webp', bbox_inches='tight')
# the division by countries is still there. but we are now getting
# sub-classes within each country - which might be important depending on your set goal

scikit-learn - Machine Learning in Python

Unsupervised Learning - Density-based Spatial Clustering (DBSCAN)

DBSCAN vs KMeans

blobs_df = pd.read_csv('datasets/blobs.csv')
blobs_df.tail(2)
X1 X2
1498 5.454552 6.461246
1499 -7.769230 7.014384
plt.figure(figsize=(12,5))
plt.title('Blobs Dataset')
sns.scatterplot(data=blobs_df, x='X1', y='X2')

plt.savefig('assets/Scikit_Learn_97.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

moons_df = pd.read_csv('datasets/moons.csv')
moons_df.tail(2)
X1 X2
1498 1.803858 -0.154705
1499 0.203305 0.079049
plt.figure(figsize=(12,5))
plt.title('Moons Dataset')
sns.scatterplot(data=moons_df, x='X1', y='X2')

plt.savefig('assets/Scikit_Learn_98.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

circles_df = pd.read_csv('datasets/circles.csv')
circles_df.tail(2)
X1 X2
1498 0.027432 -0.264891
1499 -0.216732 0.183006
plt.figure(figsize=(12,5))
plt.title('Circles Dataset')
sns.scatterplot(data=circles_df, x='X1', y='X2')

plt.savefig('assets/Scikit_Learn_99.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

def display_categories(model, data, axis, palette='cool'):
    """Fit a clustering model and scatter-plot the data coloured by cluster.

    Parameters:
        model: scikit-learn clusterer exposing ``fit_predict`` (e.g. KMeans, DBSCAN).
        data: DataFrame with 'X1' and 'X2' columns used as the plot coordinates.
        axis: matplotlib Axes to draw the scatterplot on.
        palette: seaborn palette name for the cluster hues (default 'cool',
            matching the previous hard-coded value).

    Returns:
        The per-sample cluster labels assigned by the model (DBSCAN marks
        outliers with -1); returned so callers can inspect the assignment.
    """
    # fit_predict both trains the model and yields one label per sample
    labels = model.fit_predict(data)
    sns.scatterplot(data=data, x='X1', y='X2', hue=labels, palette=palette, ax=axis)
    return labels
km_model_blobs = KMeans(n_clusters=3, init='random', n_init='auto')
db_model_blobs = DBSCAN(eps=0.5, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('3 Blobs Dataset')

axes[0].set_title('KMeans Clustering')
display_categories(km_model_blobs, blobs_df, axes[0])

axes[1].set_title('DBSCAN Clustering')
display_categories(db_model_blobs, blobs_df, axes[1])

plt.savefig('assets/Scikit_Learn_100.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

km_model_moons = KMeans(n_clusters=2, init='random', n_init='auto')
db_model_moons = DBSCAN(eps=0.2, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Moons Dataset')

axes[0].set_title('KMeans Clustering')
display_categories(km_model_moons, moons_df, axes[0])

axes[1].set_title('DBSCAN Clustering')
display_categories(db_model_moons, moons_df, axes[1])

plt.savefig('assets/Scikit_Learn_101.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

km_model_circles = KMeans(n_clusters=2, init='random', n_init='auto')
db_model_circles = DBSCAN(eps=0.2, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Circles Dataset')

axes[0].set_title('KMeans Clustering')
display_categories(km_model_circles, circles_df, axes[0])

axes[1].set_title('DBSCAN Clustering')
display_categories(db_model_circles, circles_df, axes[1])

plt.savefig('assets/Scikit_Learn_102.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

DBSCAN Hyperparameter Tuning

two_blobs_df = pd.read_csv('datasets/two-blobs.csv')
two_blobs_otl_df = pd.read_csv('datasets/two-blobs-outliers.csv')
# default hyperparameter
db_model_base = DBSCAN(eps=0.5, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Default Hyperparameter')

axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_base, two_blobs_df, axes[0])

axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_base, two_blobs_otl_df, axes[1])

plt.savefig('assets/Scikit_Learn_103.webp', bbox_inches='tight')
# points around cluster 1 are assigned to be outliers

scikit-learn - Machine Learning in Python

# reducing epsilon reduces the max distance (epsilon)
# points are allowed to have and still be assigned to a cluster
db_model_dec = DBSCAN(eps=0.001, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Reduced Epsilon')

axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_dec, two_blobs_df, axes[0])

axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_dec, two_blobs_otl_df, axes[1])

plt.savefig('assets/Scikit_Learn_104.webp', bbox_inches='tight')
# distance is too small - every point becomes its own cluster and is assigned as an outlier

scikit-learn - Machine Learning in Python

# increasing epsilon increases the max distance (epsilon)
# points are allowed to have and still be assigned to a cluster
db_model_inc = DBSCAN(eps=10, min_samples=5)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Increased Epsilon')

axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_inc, two_blobs_df, axes[0])

axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_inc, two_blobs_otl_df, axes[1])

plt.savefig('assets/Scikit_Learn_105.webp', bbox_inches='tight')
# distance is too big - every point becomes part of the same cluster

scikit-learn - Machine Learning in Python

Elbow Plot

epsilon_value_range = np.linspace(0.0001, 1, 100)

n_outliers = []
perc_outlier = []
n_clusters = []

for epsilon in epsilon_value_range:
    dbscan_model = DBSCAN(eps=epsilon)
    dbscan_model.fit(two_blobs_otl_df)
    
    # total number of outliers
    n_outliers.append(np.sum(dbscan_model.labels_ == -1))
    # percentage of outliers
    perc_outlier.append(
        100 * np.sum(dbscan_model.labels_ == -1) / len(dbscan_model.labels_)
    )
    # number of clusters
    n_clusters.append(len(np.unique(dbscan_model.labels_)))
plt.figure(figsize=(12,5))
plt.title('Elbow Plot - DBSCAN Hyperparameter')
plt.xlabel('Epsilon (Max Distance between Points)')
plt.ylabel('Number of Outliers')
plt.ylim(0,10)
# we expect 3 outliers
plt.hlines(y=3, xmin=0, xmax=0.7, color='fuchsia')
# 3 outliers are reached somewhere around eps=0.7
plt.vlines(x=0.7, ymin=0, ymax=3, color='fuchsia')
sns.lineplot(x=epsilon_value_range, y=n_outliers)

plt.savefig('assets/Scikit_Learn_107.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12,5))
plt.title('Number of Clusters by Epsilon Range')
plt.xlabel('Epsilon (Max Distance between Points)')
plt.ylabel('Number of Clusters')
# we expect 2 clusters + outliers
plt.hlines(y=3, xmin=0, xmax=1, color='fuchsia')
plt.ylim(0,50)
plt.xlim(0,1)
sns.lineplot(x=epsilon_value_range, y=n_clusters)

plt.savefig('assets/Scikit_Learn_108.webp', bbox_inches='tight')
# we already reach 3 cluster with an epsilon of 0.2
# but as seen above we need an epsilon of 0.7 to reduce
# the number of outliers to 3

scikit-learn - Machine Learning in Python

# find the optimum
# rule of thumb for min_samples = 2*n_dim
n_dim = two_blobs_otl_df.shape[1]
db_model_opt = DBSCAN(eps=0.7, min_samples=2*n_dim)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Optimal Epsilon')

axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_opt, two_blobs_df, axes[0])

axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_opt, two_blobs_otl_df, axes[1])

plt.savefig('assets/Scikit_Learn_106.webp', bbox_inches='tight')
# the 3 outliers are labeled as such and every other point is assigned to one of the two clusters

scikit-learn - Machine Learning in Python

# find number of outliers
print('Number of Outliers', np.sum(db_model_opt.labels_ == -1))
# Number of Outliers 3
# get outlier percentage
print('Percentage of Outliers', (100 * np.sum(db_model_opt.labels_ == -1) / len(db_model_opt.labels_)).round(2),'%')
# Percentage of Outliers 0.3 %

Realworld Dataset

Wholesale customers The data set refers to clients of a wholesale distributor. It includes the annual spending in monetary units (m.u.) on diverse product categories

Additional Information

  1. FRESH: annual spending (m.u.) on fresh products (Continuous)
  2. MILK: annual spending (m.u.) on milk products (Continuous)
  3. GROCERY: annual spending (m.u.) on grocery products (Continuous)
  4. FROZEN: annual spending (m.u.) on frozen products (Continuous)
  5. DETERGENTS_PAPER: annual spending (m.u.) on detergents and paper products (Continuous)
  6. DELICATESSEN: annual spending (m.u.) on delicatessen products (Continuous)
  7. CHANNEL: customers Channel - Horeca (Hotel/Restaurant/Cafe) or Retail channel (Nominal)
  8. REGION: customers Region - Lisbon, Oporto or Other (Nominal)

Dataset Exploration

wholesale_df = pd.read_csv('datasets/wholesome-customers-data.csv')
wholesale_df.head(5)
Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
0 2 3 12669 9656 7561 214 2674 1338
1 2 3 7057 9810 9568 1762 3293 1776
2 2 3 6353 8808 7684 2405 3516 7844
3 1 3 13265 1196 4221 6404 507 1788
4 2 3 22615 5410 7198 3915 1777 5185
wholesale_df.info()
plt.figure(figsize=(12,5))
plt.title('Whole Sale: Milk Products vs Groceries')
sns.scatterplot(
    data=wholesale_df,
    x='Milk', y='Grocery',
    hue='Channel', style='Region',
    palette='winter'
)

plt.savefig('assets/Scikit_Learn_109.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(10, 5))
plt.title('Whole Sale: Milk Products by Distribution Channel')

sns.histplot(
    data=wholesale_df,
    x='Milk',
    bins=50,
    hue='Channel',
    palette='winter',
    kde=True
)

plt.savefig('assets/Scikit_Learn_110.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

sns.clustermap(
    wholesale_df.corr(),
    linewidth=0.5,
    cmap='winter',
    annot=True,
    col_cluster=False
)

plt.savefig('assets/Scikit_Learn_111.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

sns.pairplot(
    data=wholesale_df,
    hue='Region',
    palette='winter'
)

plt.savefig('assets/Scikit_Learn_112.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Data Preprocessing

# normalize feature set
scaler = StandardScaler()
wholesale_scaled = pd.DataFrame(
    scaler.fit_transform(wholesale_df), columns=wholesale_df.columns
)
wholesale_scaled.describe()
Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
count 4.400000e+02 4.400000e+02 4.400000e+02 440.000000 4.400000e+02 4.400000e+02 4.400000e+02 4.400000e+02
mean 1.614870e-17 3.552714e-16 -3.431598e-17 0.000000 -4.037175e-17 3.633457e-17 2.422305e-17 -8.074349e-18
std 1.001138e+00 1.001138e+00 1.001138e+00 1.001138 1.001138e+00 1.001138e+00 1.001138e+00 1.001138e+00
min -6.902971e-01 -1.995342e+00 -9.496831e-01 -0.778795 -8.373344e-01 -6.283430e-01 -6.044165e-01 -5.402644e-01
25% -6.902971e-01 -7.023369e-01 -7.023339e-01 -0.578306 -6.108364e-01 -4.804306e-01 -5.511349e-01 -3.964005e-01
50% -6.902971e-01 5.906683e-01 -2.767602e-01 -0.294258 -3.366684e-01 -3.188045e-01 -4.336004e-01 -1.985766e-01
75% 1.448652e+00 5.906683e-01 3.905226e-01 0.189092 2.849105e-01 9.946441e-02 2.184822e-01 1.048598e-01
max 1.448652e+00 5.906683e-01 7.927738e+00 9.183650 8.936528e+00 1.191900e+01 7.967672e+00 1.647845e+01

Model Hyperparameter Tuning

epsilon_value_range = np.linspace(0.001, 3, 100)
n_dim = wholesale_scaled.shape[1]

n_outliers = []
perc_outlier = []
n_clusters = []

for epsilon in epsilon_value_range:
    dbscan_model = DBSCAN(eps=epsilon, min_samples=2*n_dim)
    dbscan_model.fit(wholesale_scaled)
    
    # total number of outliers
    n_outliers.append(np.sum(dbscan_model.labels_ == -1))
    # percentage of outliers
    perc_outlier.append(
        100 * np.sum(dbscan_model.labels_ == -1) / len(dbscan_model.labels_)
    )
    # number of clusters
    n_clusters.append(len(np.unique(dbscan_model.labels_)))
plt.figure(figsize=(12,5))
plt.title('Elbow Plot - DBSCAN Hyperparameter')
plt.xlabel('Epsilon (Max Distance between Points)')
plt.ylabel('Number of Outliers')
plt.hlines(y=25, xmin=0, xmax=2, color='fuchsia')
plt.vlines(x=2, ymin=0, ymax=25, color='fuchsia')
sns.lineplot(x=epsilon_value_range, y=n_outliers)

plt.savefig('assets/Scikit_Learn_113.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

plt.figure(figsize=(12,5))
plt.title('Number of Clusters by Epsilon Range')
plt.xlabel('Epsilon (Max Distance between Points)')
plt.ylabel('Number of Clusters')
plt.hlines(y=3, xmin=0, xmax=2, color='fuchsia')
plt.vlines(x=2, ymin=0, ymax=3, color='fuchsia')
sns.lineplot(x=epsilon_value_range, y=n_clusters)

plt.savefig('assets/Scikit_Learn_114.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

def wholesale_categories(model, data, x, y, axis, palette='cool'):
    """Fit a clustering model and scatter-plot two chosen feature columns by cluster.

    Parameters:
        model: scikit-learn clusterer exposing ``fit_predict`` (e.g. DBSCAN).
        data: DataFrame containing the columns named by ``x`` and ``y``.
        x: column name plotted on the x-axis.
        y: column name plotted on the y-axis.
        axis: matplotlib Axes to draw the scatterplot on.
        palette: seaborn palette name for the cluster hues (default 'cool',
            matching the previous hard-coded value).

    Returns:
        The per-sample cluster labels assigned by the model (DBSCAN marks
        outliers with -1); returned so callers can inspect the assignment.
    """
    # fit_predict both trains the model and yields one label per sample
    labels = model.fit_predict(data)
    sns.scatterplot(data=data, x=x, y=y, hue=labels, palette=palette, ax=axis)
    return labels
db_model_opt = DBSCAN(eps=2.0, min_samples=2*n_dim)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('Whole Sale Dataset - DBSCAN Cluster (Normalized)')

axes[0].set_title('DBSCAN Clustering Milk Products vs Groceries')
wholesale_categories(
    model=db_model_opt,
    data=wholesale_scaled,
    x='Milk', y='Grocery',
    axis=axes[0]
)

axes[1].set_title('DBSCAN Clustering Milk Products vs Delicassen')
wholesale_categories(
    model=db_model_opt,
    data=wholesale_scaled,
    x='Milk', y='Delicassen',
    axis=axes[1]
)

plt.savefig('assets/Scikit_Learn_115a.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# add labels to original dataframe
wholesale_df['Label'] = db_model_opt.fit_predict(wholesale_scaled)
wholesale_df['Label'].head(5)
# remove outliers
wholesale_df_wo_otl = wholesale_df[wholesale_df['Label'] != -1]
db_model_opt = DBSCAN(eps=3.0, min_samples=2*n_dim)

figure, axes = plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('Whole Sale Dataset - DBSCAN Cluster (w/o Outliers)')

axes[0].set_title('DBSCAN Clustering Milk Products vs Groceries')
sns.scatterplot(
    data=wholesale_df_wo_otl,
    x='Milk', y='Grocery',
    hue='Label',
    palette='cool',
    ax=axes[0]
)

axes[1].set_title('DBSCAN Clustering Milk Products vs Delicassen')
sns.scatterplot(
    data=wholesale_df_wo_otl,
    x='Milk', y='Delicassen',
    hue='Label',
    palette='cool',
    ax=axes[1]
)

plt.savefig('assets/Scikit_Learn_115b.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# see if the mean values of each cluster differ from each other
grouped_df = wholesale_df.groupby('Label').mean()
Label Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
-1 1.52 2.480000 27729.920000 22966.960000 26609.600000 11289.640000 11173.560000 6707.160000
0 2.00 2.620155 8227.666667 8615.852713 13859.674419 1447.759690 5969.581395 1498.457364
1 1.00 2.513986 12326.972028 3023.559441 3655.328671 3086.181818 763.783217 1083.786713
scaler = MinMaxScaler()
grouped_scaler = pd.DataFrame(
    scaler.fit_transform(grouped_df), columns=grouped_df.columns, index=['Outlier', 'Cluster 1', 'Cluster 2']
)
grouped_scaler.head()
Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
Outlier 0.52 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000
Cluster 1 1.00 1.000000 0.000000 0.280408 0.444551 0.000000 0.500087 0.073741
Cluster 2 0.00 0.242489 0.210196 0.000000 0.000000 0.166475 0.000000 0.000000
plt.figure(figsize=(12, 3))
plt.title('Scaled Cluster / Outliers Comparison (Normalized)')

sns.heatmap(
    grouped_scaler,
    linewidth=0.5,
    cmap='coolwarm',
    annot=True
)

plt.savefig('assets/Scikit_Learn_116.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

grouped_df = grouped_df.drop(['Labels'], axis=1)
# remove outlier
wholesale_clusters = grouped_df.drop(-1, axis=0)
wholesale_clusters.head()
Label Channel Region Fresh Milk Grocery Frozen Detergents_Paper Delicassen
0 2.0 2.620155 8227.666667 8615.852713 13859.674419 1447.759690 5969.581395 1498.457364
1 1.0 2.513986 12326.972028 3023.559441 3655.328671 3086.181818 763.783217 1083.786713
plt.figure(figsize=(12, 3))
plt.title('Mean Spending Values for Cluster 1 and 2')

sns.heatmap(
    wholesale_clusters,
    linewidth=0.5,
    cmap='coolwarm',
    annot=True
)

plt.savefig('assets/Scikit_Learn_117.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Dimension Reduction - Principal Component Analysis (PCA)

Dataset Preprocessing

Breast cancer wisconsin (diagnostic) dataset.

  • Attribute Information:
    • radius (mean of distances from center to points on the perimeter)
    • texture (standard deviation of gray-scale values)
    • perimeter
    • area
    • smoothness (local variation in radius lengths)
    • compactness (perimeter^2 / area - 1.0)
    • concavity (severity of concave portions of the contour)
    • concave points (number of concave portions of the contour)
    • symmetry
    • fractal dimension ("coastline approximation" - 1)

The mean, standard error, and "worst" or largest (mean of the three worst/largest values) of these features were computed for each image, resulting in 30 features. For instance, field 0 is Mean Radius, field 10 is Radius SE, field 20 is Worst Radius.

  • class:
    • WDBC-Malignant
    • WDBC-Benign
tumor_df = pd.read_csv('datasets/cancer-tumor-data-features.csv')
tumor_df.head(5).transpose()
0 1 2 3 4
mean radius 17.990000 20.570000 19.690000 11.420000 20.290000
mean texture 10.380000 17.770000 21.250000 20.380000 14.340000
mean perimeter 122.800000 132.900000 130.000000 77.580000 135.100000
mean area 1001.000000 1326.000000 1203.000000 386.100000 1297.000000
mean smoothness 0.118400 0.084740 0.109600 0.142500 0.100300
mean compactness 0.277600 0.078640 0.159900 0.283900 0.132800
mean concavity 0.300100 0.086900 0.197400 0.241400 0.198000
mean concave points 0.147100 0.070170 0.127900 0.105200 0.104300
mean symmetry 0.241900 0.181200 0.206900 0.259700 0.180900
mean fractal dimension 0.078710 0.056670 0.059990 0.097440 0.058830
radius error 1.095000 0.543500 0.745600 0.495600 0.757200
texture error 0.905300 0.733900 0.786900 1.156000 0.781300
perimeter error 8.589000 3.398000 4.585000 3.445000 5.438000
area error 153.400000 74.080000 94.030000 27.230000 94.440000
smoothness error 0.006399 0.005225 0.006150 0.009110 0.011490
compactness error 0.049040 0.013080 0.040060 0.074580 0.024610
concavity error 0.053730 0.018600 0.038320 0.056610 0.056880
concave points error 0.015870 0.013400 0.020580 0.018670 0.018850
symmetry error 0.030030 0.013890 0.022500 0.059630 0.017560
fractal dimension error 0.006193 0.003532 0.004571 0.009208 0.005115
worst radius 25.380000 24.990000 23.570000 14.910000 22.540000
worst texture 17.330000 23.410000 25.530000 26.500000 16.670000
worst perimeter 184.600000 158.800000 152.500000 98.870000 152.200000
worst area 2019.000000 1956.000000 1709.000000 567.700000 1575.000000
worst smoothness 0.162200 0.123800 0.144400 0.209800 0.137400
worst compactness 0.665600 0.186600 0.424500 0.866300 0.205000
worst concavity 0.711900 0.241600 0.450400 0.686900 0.400000
worst concave points 0.265400 0.186000 0.243000 0.257500 0.162500
worst symmetry 0.460100 0.275000 0.361300 0.663800 0.236400
worst fractal dimension 0.118900 0.089020 0.087580 0.173000 0.076780
# normalizing data
scaler = StandardScaler()
tumor_scaled_arr = scaler.fit_transform(tumor_df)
tumor_scaled_df = pd.DataFrame(
    tumor_scaled_arr, columns=tumor_df.columns
)
tumor_scaled_df.head(5).transpose()
0 1 2 3 4
mean radius 1.097064 1.829821 1.579888 -0.768909 1.750297
mean texture -2.073335 -0.353632 0.456187 0.253732 -1.151816
mean perimeter 1.269934 1.685955 1.566503 -0.592687 1.776573
mean area 0.984375 1.908708 1.558884 -0.764464 1.826229
mean smoothness 1.568466 -0.826962 0.942210 3.283553 0.280372
mean compactness 3.283515 -0.487072 1.052926 3.402909 0.539340
mean concavity 2.652874 -0.023846 1.363478 1.915897 1.371011
mean concave points 2.532475 0.548144 2.037231 1.451707 1.428493
mean symmetry 2.217515 0.001392 0.939685 2.867383 -0.009560
mean fractal dimension 2.255747 -0.868652 -0.398008 4.910919 -0.562450
radius error 2.489734 0.499255 1.228676 0.326373 1.270543
texture error -0.565265 -0.876244 -0.780083 -0.110409 -0.790244
perimeter error 2.833031 0.263327 0.850928 0.286593 1.273189
area error 2.487578 0.742402 1.181336 -0.288378 1.190357
smoothness error -0.214002 -0.605351 -0.297005 0.689702 1.483067
compactness error 1.316862 -0.692926 0.814974 2.744280 -0.048520
concavity error 0.724026 -0.440780 0.213076 0.819518 0.828471
concave points error 0.660820 0.260162 1.424827 1.115007 1.144205
symmetry error 1.148757 -0.805450 0.237036 4.732680 -0.361092
fractal dimension error 0.907083 -0.099444 0.293559 2.047511 0.499328
worst radius 1.886690 1.805927 1.511870 -0.281464 1.298575
worst texture -1.359293 -0.369203 -0.023974 0.133984 -1.466770
worst perimeter 2.303601 1.535126 1.347475 -0.249939 1.338539
worst area 2.001237 1.890489 1.456285 -0.550021 1.220724
worst smoothness 1.307686 -0.375612 0.527407 3.394275 0.220556
worst compactness 2.616665 -0.430444 1.082932 3.893397 -0.313395
worst concavity 2.109526 -0.146749 0.854974 1.989588 0.613179
worst concave points 2.296076 1.087084 1.955000 2.175786 0.729259
worst symmetry 2.750622 -0.243890 1.152255 6.046041 -0.868353
worst fractal dimension 1.937015 0.281190 0.201391 4.935010 -0.397100

Model Fitting

pca_model = PCA(n_components=2)
pca_results = pca_model.fit_transform(tumor_scaled_df)
print(pca_model.explained_variance_ratio_)
print(np.sum(pca_model.explained_variance_ratio_))
# the two principal components are able to describe
# 63% of the variance in the dataset
# [0.44272026 0.18971182]
# 0.6324320765155945
# adding components to original dataframe
tumor_df[['PC1','PC2']] = pca_results
tumor_df[['PC1','PC2']].head(5).transpose()
0 1 2 3 4
PC1 9.192837 2.387802 5.733896 7.122953 3.935302
PC2 1.948583 -3.768172 -1.075174 10.275589 -1.948072
plt.figure(figsize=(12,5))
plt.title('Principal Component Analysis - Cancer Tumor Dataset')
sns.scatterplot(
    data=tumor_df,
    x='PC1', y='PC2'
)

plt.savefig('assets/Scikit_Learn_118.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# get label data from dataset to confirm that we still have
# separable clusters after reducing the dimensions to 2
from sklearn.datasets import load_breast_cancer
tumor_dataset = load_breast_cancer()
tumor_dataset.keys()
# dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])
tumor_dataset['target']
plt.figure(figsize=(12,5))
plt.title('PCA Cancer Tumor Dataset - Coloured by Labels')
sns.scatterplot(
    data=tumor_df,
    x='PC1', y='PC2',
    hue=tumor_dataset['target'],
    palette='winter'
)

plt.savefig('assets/Scikit_Learn_119.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

# as shown above we get around 63% of the variance explained by using 2 principal components
# since the dataset has 30 features 30 principal components will explain 100% of the variance

explained_variance = []

for n in range(1,31):
    pca = PCA(n_components=n)
    pca.fit(tumor_scaled_df)
    
    explained_variance.append(np.sum(pca.explained_variance_ratio_))
plt.figure(figsize=(10, 5))
plt.title('Explained Variance by Number of Principal Components')
plt.xlabel('Principal Components')
sns.set(style='darkgrid')
sns.barplot(
    data=pd.DataFrame(explained_variance, columns=['Explained Variance']),
    x=np.arange(1,31),
    y='Explained Variance'
)

plt.savefig('assets/Scikit_Learn_120.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Dataset 2

What handwritten numbers are the hardest to tell apart for a ML Model?

digits_df = pd.read_csv('datasets/digits.csv')
digits_df.head(5).transpose()
0 1 2 3 4
pixel_0_0 0.0 0.0 0.0 0.0 0.0
pixel_0_1 0.0 0.0 0.0 0.0 0.0
pixel_0_2 5.0 0.0 0.0 7.0 0.0
pixel_0_3 13.0 12.0 4.0 15.0 1.0
pixel_0_4 9.0 13.0 15.0 13.0 11.0
...
pixel_7_4 10.0 16.0 11.0 13.0 16.0
pixel_7_5 0.0 10.0 16.0 9.0 4.0
pixel_7_6 0.0 0.0 9.0 0.0 0.0
pixel_7_7 0.0 0.0 0.0 0.0 0.0
number_label 0.0 1.0 2.0 3.0 4.0
# drop label column
X_digits = digits_df.drop('number_label', axis=1)
digits_labels = digits_df['number_label']
# select a single images
img_idx = 333
Single_Digit = np.array(X_digits.iloc[img_idx])
Single_Digit.shape
# the images inside the dataset are flattened
# (64,)
# need to be turned back into their 8x8 pixel format
Single_Digit = Single_Digit.reshape((8, 8))
Single_Digit.shape
# (8, 8)
# Display the Image
plt.figure(figsize=(4,4))
plt.imshow(Single_Digit, interpolation='nearest', cmap='plasma')
plt.title('Digit Label: %d' % digits_labels[img_idx])
plt.show()

scikit-learn - Machine Learning in Python

plt.figure(figsize=(8,6))
plt.title('Digit Label: %d' % digits_labels[0])

sns.heatmap(
    Single_Digit,
    linewidth=0.5,
    cmap='plasma_r',
    annot=True
)

plt.savefig('assets/Scikit_Learn_122.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python

Dataset 2 Preprocessing

# normalize data
scaler = StandardScaler()
digits_scaled = pd.DataFrame(
    scaler.fit_transform(X_digits), columns=X_digits.columns
)
digits_scaled.head(5).transpose()
0 1 2 3 4
pixel_0_0 0.000000 0.000000 0.000000 0.000000 0.000000
pixel_0_1 -0.335016 -0.335016 -0.335016 -0.335016 -0.335016
pixel_0_2 -0.043081 -1.094937 -1.094937 0.377661 -1.094937
pixel_0_3 0.274072 0.038648 -1.844742 0.744919 -2.551014
pixel_0_4 -0.664478 0.268751 0.735366 0.268751 -0.197863
...
pixel_7_3 0.208293 -0.249010 -2.078218 0.208293 -2.306869
pixel_7_4 -0.366771 0.849632 -0.164037 0.241430 0.849632
pixel_7_5 -1.146647 0.548561 1.565686 0.379040 -0.468564
pixel_7_6 -0.505670 -0.505670 1.695137 -0.505670 -0.505670
pixel_7_7 -0.196008 -0.196008 -0.196008 -0.196008 -0.196008

Model Fitting

pca_model2 = PCA(n_components=2)
pca_results2 = pca_model2.fit_transform(digits_scaled)
print(np.sum(pca_model2.explained_variance_ratio_))
# reducing the number of dimensions from 64 -> 2 leads to 22% explained variance
X_digits[['PC1','PC2']] = pca_results2
X_digits[['PC1','PC2']].head(5).transpose()
0 1 2 3 4
PC1 1.914264 0.588997 1.302144 -3.020847 4.528854
PC2 -0.954564 0.924622 -0.317291 -0.868696 -1.093369
plt.figure(figsize=(12,5))
plt.title('PCA Digits Dataset - Coloured by Labels')
sns.scatterplot(
    data=X_digits,
    x='PC1', y='PC2',
    hue=digits_labels,
    palette='tab20'
)
plt.legend(bbox_to_anchor=(1.01,1.01))

plt.savefig('assets/Scikit_Learn_123.webp', bbox_inches='tight')
# numbers 4 and 7 are very distinct. There is some overlap between 6 and 0 and between 2 and 3
# but you can still get some separation. All the numbers in the middle are 'problematic' and 
# probably need a larger amount of training data.

scikit-learn - Machine Learning in Python

# how many components would we have to add to reach 80% explained variance
explained_variance = []

for n in range(1,65):
    pca = PCA(n_components=n)
    pca.fit(digits_scaled)
    
    explained_variance.append(np.sum(pca.explained_variance_ratio_))
plt.figure(figsize=(16, 5))
plt.title('Explained Variance by Number of Principal Components')
plt.xlabel('Principal Components')
sns.set(style='darkgrid')
sns.barplot(
    data=pd.DataFrame(explained_variance, columns=['Explained Variance']),
    x=np.arange(1,65),
    y='Explained Variance'
)

plt.savefig('assets/Scikit_Learn_124.webp', bbox_inches='tight')
# we need more than 20 principal components out of 64 to reach 80% explained variance:

scikit-learn - Machine Learning in Python

# rerun the training for 3 components for ~30% explained variance
pca_model3 = PCA(n_components=3)
pca_results3 = pca_model3.fit_transform(digits_scaled)
print(np.sum(pca_model3.explained_variance_ratio_))
# reducing the number of dimensions from 64 -> 3 leads to 30% explained variance
X_digits[['PC1','PC2','PC3']] = pca_results3
X_digits[['PC1','PC2','PC3']].head(5).transpose()
0 1 2 3 4
PC1 1.914213 0.588981 1.302030 -3.020765 4.528946
PC2 -0.954510 0.924646 -0.317199 -0.868788 -1.093498
PC3 -3.945982 3.924713 3.023435 -0.801779 0.973213
%matplotlib notebook
fig = plt.figure(figsize=(8,8))
ax = plt.axes(projection='3d')
ax.scatter3D(
    xs=X_digits['PC1'],
    ys=X_digits['PC2'],
    zs=X_digits['PC3'],
    c=digits_labels,
    cmap='tab20'
)
ax.set_title('PCA Digits Dataset - Coloured by Labels')
ax.set(
    xticklabels=[],
    yticklabels=[],
    zticklabels=[],
    xlabel='PC1',
    ylabel='PC2',
    zlabel='PC3',
)

# plt.savefig('assets/Scikit_Learn_125.webp', bbox_inches='tight')

scikit-learn - Machine Learning in Python