# average SalePrice
np.mean(ames_df['SalePrice'])
# 180815.53743589742

rel_error_avg=mean_absolute_error(y_ames_test, y_ames_pred) *100/np.mean(ames_df['SalePrice'])
print('Predictions are on average off by: ', rel_error_avg.round(2), '%')
# Predictions are on average off by: 7.85 %
Above, I used the petal width and length to create a linear regression model. But, as explored earlier, we can also use the sepal length (only the sepal width does not show a linear correlation):
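The variables `logistic_binary_model` and `x_data_logistic_binary` used below are created outside this excerpt; a minimal sketch of how they might be set up, assuming an `iris_df` with the usual column names and a binary target (all names here are assumptions):

from sklearn.linear_model import LogisticRegression

# hypothetical feature matrix and binary label (setosa vs. not-setosa)
x_data_logistic_binary=iris_df[['sepal_length', 'petal_length', 'petal_width']]
y_data_logistic_binary=(iris_df['species'] =='setosa').astype(int)

# fit the binary logistic regression classifier
logistic_binary_model=LogisticRegression()
logistic_binary_model.fit(x_data_logistic_binary, y_data_logistic_binary)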
prob_pred_logistic_binary=logistic_binary_model.predict_proba(x_data_logistic_binary)
y_pred_logistic_binary=logistic_binary_model.predict(x_data_logistic_binary)
print('Prediction Probabilities: ', prob_pred_logistic_binary[:1])
unique, counts=np.unique(y_pred_logistic_binary, return_counts=True)
print('Classes: ', unique, '| Number of Class Instances: ', counts)
# probabilities e.g. below -> 58% certainty that the first element is class 0
# Prediction Probabilities: [[0.58097284 0.41902716]]
# Classes: [0 1] | Number of Class Instances: [5 5]
# only once you are certain that you have the best performance
# do a final evaluation with the test set
y_adv4_final_pred=model_adv4.predict(X_adv_test)
mean_squared_error(y_adv_test, y_adv4_final_pred)
# 2.024422922812264
k-fold Cross Validation
Do a train/test split, segment the training set into k folds (e.g. 5-10), and use each fold once to validate a model trained on the remaining folds. The resulting error is the average of all k errors.
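`cross_val_score` below performs the whole procedure in one call; purely to illustrate the mechanics, a manual version could look like this (it assumes `X_adv_train`/`y_adv_train` are pandas objects and reuses the `model_adv5` estimator from the next cell):

import numpy as np
from sklearn.base import clone
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# manual 5-fold loop: train on 4 folds, validate on the held-out fold
kfold=KFold(n_splits=5, shuffle=True, random_state=42)
fold_errors= []

for train_idx, val_idx in kfold.split(X_adv_train):
    fold_model=clone(model_adv5)
    fold_model.fit(X_adv_train.iloc[train_idx], y_adv_train.iloc[train_idx])
    preds=fold_model.predict(X_adv_train.iloc[val_idx])
    fold_errors.append(mean_squared_error(y_adv_train.iloc[val_idx], preds))

# the reported error is the average over all k folds
print(np.mean(fold_errors))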
# do a 5-fold cross-eval
scores=cross_val_score(
estimator=model_adv5,
X=X_adv_train,
y=y_adv_train,
scoring='neg_mean_squared_error',
cv=5
)
# take the mean of all five neg. error values
abs(scores.mean())
# 8.688107513529168
Adjusting Hyperparameters
model_adv6=Ridge(
alpha=1.0
)
# do a 5-fold cross-eval
scores=cross_val_score(
estimator=model_adv6,
X=X_adv_train,
y=y_adv_train,
scoring='neg_mean_squared_error',
cv=5
)
# take the mean of all five neg. error values
abs(scores.mean())
# 3.3419582340688576
# verify number of estimators found by grid search
errors= []
missclassifications= []

for n in range(1,200):
    rfc=RandomForestClassifier(n_estimators=n, max_features=2)
    rfc.fit(X_money_train, y_money_train)
    preds=rfc.predict(X_money_test)
    err=1-accuracy_score(y_money_test, preds)
    errors.append(err)
    n_missed=np.sum(preds!=y_money_test)
    missclassifications.append(n_missed)
plt.figure(figsize=(12,4))
plt.title('Errors as a Function of n_estimators')
plt.xlabel('Estimators')
plt.ylabel('Error Score')
plt.plot(range(1,200), errors)
# there is no notable improvement above ~10 estimators
plt.figure(figsize=(12,4))
plt.title('Misclassifications as a Function of n_estimators')
plt.xlabel('Estimators')
plt.ylabel('Misclassifications')
plt.plot(range(1,200), missclassifications)
# and the same for misclassifications
Random Forest Regressor
Comparing different regression models to a random forest regression model.
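The `run_model` helper used further down is not defined in this excerpt; a minimal sketch of what such a helper might do (fit, print the test RMSE, and plot the predictions on top of the raw data), assuming the `rock_df` columns 'Signal' and 'Density':

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error

def run_model(model, X_train, y_train, X_test, y_test, df):
    # fit the model and report the error on the test set
    model.fit(X_train, y_train)
    preds=model.predict(X_test)
    rmse=np.sqrt(mean_squared_error(y_test, preds))
    print('RMSE: ', rmse.round(2))
    # overlay the predictions on the raw signal vs density data
    plt.figure(figsize=(12,5))
    sns.scatterplot(data=df, x='Signal', y='Density', c='dodgerblue')
    plt.plot(X_test, preds, c='mediumspringgreen')
    plt.show()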
plt.figure(figsize=(12,5))
plt.title('X-Ray Bounce Signal Strength vs Rock Density')
sns.scatterplot(data=rock_df, x='Signal', y='Density')
# the signal vs density plot follows a sine wave - spoiler alert: simpler algorithms
# will fail trying to fit this dataset...
# visualize predictions
plt.figure(figsize=(12,5))
plt.plot(X_rock_test, lr_rock_preds, c='mediumspringgreen')
sns.scatterplot(data=rock_df, x='Signal', y='Density', c='dodgerblue')
plt.title('Linear Regression Predictions')
plt.show()
# the returned error appears small because the linear regression returns an average
# but a straight line cannot follow the contours of the underlying sine wave function
# test helper on previous linear regression
run_model(
model=lr_rock,
X_train=X_rock_train,
y_train=y_rock_train,
X_test=X_rock_test,
y_test=y_rock_test,
df=rock_df
)
# run model
run_model(
model=pipe_poly,
X_train=X_rock_train,
y_train=y_rock_train,
X_test=X_rock_test,
y_test=y_rock_test,
df=rock_df
)
# with a HARD LIMIT of 0-100 for the x-ray signal a 6th degree polynomial is a good fit
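`pipe_poly` above is not defined in this excerpt; it is presumably a polynomial-regression pipeline along these lines (degree and step names are assumptions):

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# hypothetical pipeline: expand the signal into polynomial features, then fit a linear model
pipe_poly=Pipeline([
    ('poly_features', PolynomialFeatures(degree=6)),
    ('linear_regression', LinearRegression())
])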
# visualizing a hyperplane to separate the two features
sns.scatterplot(data=mice_df, x='Med_1_mL',y='Med_2_mL',hue='Virus Present', palette='winter')
x=np.linspace(0,10,100)
m=-1
b=11
y=m*x+b
plt.plot(x,y,c='fuchsia')
SVC with a Linear Kernel
# using a support vector classifier to maximize the margin between both classes
y_vir=mice_df['Virus Present']
X_vir=mice_df.drop('Virus Present',axis=1)
# kernel : {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}
# the smaller the C value the more feature vectors will be inside the margin
model_vir=svm.SVC(kernel='linear', C=1000)
model_vir.fit(X_vir, y_vir)
# the smaller the C value the more feature vectors will be inside the margin
model_vir_low_reg=svm.SVC(kernel='linear', C=0.005)
model_vir_low_reg.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_low_reg, X_vir, y_vir)
# gamma : {'scale', 'auto'} or float, default='scale'
# - if gamma='scale' (default) is passed then it uses 1 / (n_features * X.var()) as value of gamma
# - if 'auto', uses 1 / n_features
# - if float, must be non-negative
model_vir_rbf_auto_gamma=svm.SVC(kernel='rbf', C=1, gamma='auto')
model_vir_rbf_auto_gamma.fit(X_vir, y_vir)
plot_svm_boundary(model_vir_rbf_auto_gamma, X_vir, y_vir)
# create the SVC model using class_weight to balance out the
# dataset that leans heavily towards non-frauds
svc_wine_base=svm.SVC(
kernel='rbf',
class_weight='balanced'
)
plt.figure(figsize=(12,8))
plt.title('Mushroom Features :: Number of unique Features')
sns.barplot(data=feature_df, y='feature', x='unique', orient='h', palette='summer_r')
plt.figure(figsize=(10,4))
plt.title('Mushroom Count :: Edible vs Poisonous')
sns.countplot(data=shroom_df, x='class', palette='seismic_r')
Adaptive Boosting
# remove label class
X_shroom=shroom_df.drop('class', axis=1)
# make all values numeric
X_shroom=pd.get_dummies(X_shroom, drop_first=True)
y_shroom=shroom_df['class']
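The train/test variables used in the following cells (`X_shroom_train`, `y_shroom_train`, ...) are not created in this excerpt; presumably a standard split along these lines (test size and random state are assumptions):

from sklearn.model_selection import train_test_split

# hypothetical split of the encoded mushroom data
X_shroom_train, X_shroom_test, y_shroom_train, y_shroom_test=train_test_split(
    X_shroom, y_shroom, test_size=0.15, random_state=42
)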
# don't try to fit a perfect model but only return
# the most important feature for classification
abc_shroom=AdaBoostClassifier(estimator=None, n_estimators=1)
abc_shroom.fit(X_shroom_train,y_shroom_train)
# the model was fit on a single feature and still resulted in a pretty good performance.
# Let's find out which feature was chosen for the classification.
shroom_index= ['importance']
shroom_data_columns=pd.Series(X_shroom.columns)
shroom_importance_array=abc_shroom.feature_importances_
shroom_importance_df=pd.DataFrame(shroom_importance_array, shroom_data_columns, shroom_index)
shroom_importance_df.value_counts()
# importance    count
# 0.0              94
# 1.0               1
# dtype: int64
# plot a slice of the dataframe to find the feature
shroom_importance_df_sorted=shroom_importance_df.sort_values(
by='importance',
ascending=True
)
shroom_importance_df_sorted[-5:].plot(
kind='barh',
title='Feature Importance for Mushroom Classification',
figsize=(8,4)
)
The most important feature (as determined by the model) is the odor - in this case an odor of 'none' is the best single indicator for separating edible from poisonous mushrooms:
odor: almond = a, anise = l, creosote = c, fishy = y, foul = f, musty = m, none = n, pungent = p, spicy = s
# the majority of poisonous mushrooms do have an odor
# making the lack of one a good indicator for an edible variety
plt.figure(figsize=(12,4))
plt.title('Mushroom Odor vs Class')
sns.countplot(data=shroom_df, x='odor', hue='class', palette='summer')
Optimizing Hyperparameters
# find out how many of the 95 features you have
# to add to your model to get a better fit
error_rates= []

for estimators in range(1,96):
    model=AdaBoostClassifier(n_estimators=estimators)
    model.fit(X_shroom_train,y_shroom_train)
    preds=model.predict(X_shroom_test)
    err=1-accuracy_score(y_shroom_test, preds)
    error_rates.append(err)
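Neither `abc_shroom2` nor `shroom_grid` (used in the next cells) is defined in this excerpt; they presumably come from a refit with a chosen number of estimators and a grid search, roughly like this (the parameter grid and values are assumptions):

from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

# hypothetical refit with the n_estimators value suggested by the error-rate curve
abc_shroom2=AdaBoostClassifier(n_estimators=20)
abc_shroom2.fit(X_shroom_train, y_shroom_train)

# hypothetical grid search over the number of estimators
shroom_grid=GridSearchCV(
    estimator=AdaBoostClassifier(),
    param_grid={'n_estimators': [10, 15, 20, 25, 30]},
    scoring='accuracy',
    cv=5
)
shroom_grid.fit(X_shroom_train, y_shroom_train)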
shroom_index= ['importance']
shroom_data_columns=pd.Series(X_shroom.columns)
shroom_importance_array=abc_shroom2.feature_importances_
shroom_importance_df=pd.DataFrame(shroom_importance_array, shroom_data_columns, shroom_index)
shroom_importance_df.value_counts()
# there are 12 features now that are deemed important
shroom_feature_importance=shroom_grid.best_estimator_.feature_importances_
feature_importance_df=pd.DataFrame(
index=X_shroom.columns,
data=shroom_feature_importance,
columns= ['importance']
)
# drop all features with (near) zero importance and sort the rest by importance
feature_importance_df=feature_importance_df[
feature_importance_df['importance'] >3e-03
].sort_values(
by='importance',
ascending=False
)
plt.figure(figsize=(10,6))
plt.title('Features important to classify poisonous Mushrooms')
sns.barplot(
data=feature_importance_df,
y=feature_importance_df.index,
x='importance',
orient='h',
palette='summer'
)
Supervised Learning - Naive Bayes NLP
Feature Extraction
text= [
'This is a dataset for binary sentiment classification',
'containing substantially more data than previous benchmark datasets',
'We provide a set of 25,000 highly polar movie reviews for training',
'And 25,000 for testing',
'There is additional unlabeled data for use as well',
'Raw text and already processed bag of words formats are provided'
]
CountVectorizer & TfidfTransformer
cv=CountVectorizer(stop_words='english')
cv_sparse_matrix=cv.fit_transform(text)
# <6x30 sparse matrix of type '<class 'numpy.int64'>'
# with 33 stored elements in Compressed Sparse Row format>
tfidf_vec=TfidfVectorizer(
lowercase=True,
analyzer='word',
stop_words='english'
)
tfidf_vec_results=tfidf_vec.fit_transform(text)
# <6x30 sparse matrix of type '<class 'numpy.float64'>'
# with 33 stored elements in Compressed Sparse Row format>
tfidf_tweet_vec=TfidfVectorizer(
lowercase=True,
analyzer='word',
stop_words='english'
)
X_tweet_tfidf_train=tfidf_tweet_vec.fit_transform(X_tweet_train)
# <11712x12987 sparse matrix of type '<class 'numpy.float64'>'
# with 106745 stored elements in Compressed Sparse Row format>
X_tweet_tfidf_test=tfidf_tweet_vec.transform(X_tweet_test)
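The classifier `nb_tweets` and the `report` helper used in the next cell are not shown here; a minimal sketch, assuming a Multinomial Naive Bayes model and the corresponding label splits `y_tweet_train`/`y_tweet_test`:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# hypothetical Naive Bayes fit on the TF-IDF features
nb_tweets=MultinomialNB()
nb_tweets.fit(X_tweet_tfidf_train, y_tweet_train)

def report(model):
    # predict on the held-out test set and print the metrics table
    preds=model.predict(X_tweet_tfidf_test)
    print(classification_report(y_tweet_test, preds))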
report(nb_tweets)
# the Naive Bayes model classifies almost all tweets as negative
# which means it does well at finding negative tweets
# but ends up classifying a lot of neutral and positive tweets as negative
| | precision | recall | f1-score | support |
| --- | --- | --- | --- | --- |
| negative | 0.69 | 0.99 | 0.81 | 1889 |
| neutral | 0.75 | 0.15 | 0.25 | 580 |
| positive | 0.94 | 0.18 | 0.31 | 459 |
| accuracy | | | 0.70 | 2928 |
| macro avg | 0.79 | 0.44 | 0.46 | 2928 |
| weighted avg | 0.74 | 0.70 | 0.62 | 2928 |
Model Deployment
# building a pipeline to ingest new tweets with the best performing model
pipe=Pipeline(
[
('tfidf', TfidfVectorizer()),
('svc', svm.SVC())
]
)
# before deployment retrain on the entire dataset
pipe.fit(X_tweet, y_tweet)
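With the pipeline refit on the full dataset, raw tweets can be classified directly; the example strings below are made up:

# the pipeline handles vectorization internally, so plain strings go in
pipe.predict([
    'the flight was delayed for five hours',
    'great service and a friendly crew'
])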
# is the dataset balanced?
imdb_df['label'].value_counts()
# neg    969
# pos    969
# Name: label, dtype: int64
Top 30 Features by Label
# find top 30 words in negative reviews
imdb_neg_df=imdb_df[imdb_df['label'] =='neg']
count_vectorizer=CountVectorizer(analyzer='word', stop_words='english')
bag_of_words=count_vectorizer.fit_transform(imdb_neg_df['review'])
sum_words=bag_of_words.sum(axis=0)
# pair each word with its total count and sort by frequency
words_freq=sorted([(w, sum_words[0, i]) for w, i in count_vectorizer.vocabulary_.items()], key=lambda t: t[1], reverse=True)
x, y=zip(*words_freq[:30])
plt.figure(figsize=(12,5))
plt.bar(x,y)
plt.xticks(rotation=90)
plt.title('Top 30 Words used in Negative Reviews')
plt.savefig('assets/Scikit_Learn_62.webp', bbox_inches='tight')
# find top 30 words in positive reviews
imdb_pos_df=imdb_df[imdb_df['label'] !='neg']
count_vectorizer=CountVectorizer(analyzer='word', stop_words='english')
bag_of_words=count_vectorizer.fit_transform(imdb_pos_df['review'])
sum_words=bag_of_words.sum(axis=0)
# pair each word with its total count and sort by frequency
words_freq=sorted([(w, sum_words[0, i]) for w, i in count_vectorizer.vocabulary_.items()], key=lambda t: t[1], reverse=True)
x, y=zip(*words_freq[:30])
plt.figure(figsize=(12,5))
plt.bar(x,y)
plt.xticks(rotation=90)
plt.title('Top 30 Words used in Positive Reviews')
plt.savefig('assets/Scikit_Learn_63.webp', bbox_inches='tight')
plt.figure(figsize=(12, 5))
plt.title('Age Distribution by Marital Status')
sns.histplot(
data=bank_df,
x='age',
bins=50,
hue='marital',
palette='winter',
kde=True
)
plt.savefig('assets/Scikit_Learn_65.webp', bbox_inches='tight')
plt.figure(figsize=(12, 5))
plt.title('Age Distribution by Loan Status')
sns.histplot(
data=bank_df,
x='age',
bins=50,
hue='loan',
palette='winter',
kde=True
)
plt.savefig('assets/Scikit_Learn_66.webp', bbox_inches='tight')
# exclude rows where `pdays` = 999 (placeholder for never contacted)
plt.figure(figsize=(12, 5))
plt.title('Distribution of Days Since Last Contacted by Loan Status')
sns.histplot(
data=bank_df[bank_df['pdays'] !=999],
x='pdays',
hue='loan',
palette='winter',
kde=True
)
plt.savefig('assets/Scikit_Learn_67.webp', bbox_inches='tight')
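Before clustering, the feature matrix `X_bank` and its scaled version `X_bank_scaled` need to be prepared; that step is outside this excerpt, but presumably something along these lines (the encoding and scaler choice are assumptions):

import pandas as pd
from sklearn.preprocessing import StandardScaler

# hypothetical preprocessing: one-hot encode the categorical columns, then scale everything
X_bank=pd.get_dummies(bank_df, drop_first=True)
bank_scaler=StandardScaler()
X_bank_scaled=bank_scaler.fit_transform(X_bank)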
bank_model=KMeans(
n_clusters=2,
n_init='auto',
random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
bank_cluster_labels=bank_model.fit_predict(X_bank_scaled)
# add predicted label to source dataframe
X_bank['Cluster'] =bank_cluster_labels
# visualize the sum of distances of the datapoints to their
# predicted cluster centers as a function of the number of clusters
sum_squared_distance= []

for k in range(2,20):
    model=KMeans(n_clusters=k, n_init='auto')
    model.fit(X_bank_scaled)
    sum_squared_distance.append(model.inertia_)
plt.figure(figsize=(10,5))
plt.title('SSD as a Function of Number of Clusters')
plt.plot(range(2,20), sum_squared_distance, 'o--')
plt.savefig('assets/Scikit_Learn_73.webp', bbox_inches='tight')
plt.figure(figsize=(10,5))
plt.title('Difference in SSD as a Function of Number of Clusters')
pd.Series(sum_squared_distance).diff().plot(kind='bar')
plt.savefig('assets/Scikit_Learn_74.webp', bbox_inches='tight')
There are two 'elbows' - one between k=5-6 (mind the 0-index in Pandas!) and a second one between k=14-15. Both are potentially good values for the number of clusters k.
Re-fitting the Model
bank_model=KMeans(
n_clusters=6,
n_init='auto',
random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
bank_cluster_labels=bank_model.fit_predict(X_bank_scaled)
# flatten the image from 3 to 2 dimensions
(height, width, colour) =img_array.shape
img_array2d=img_array.reshape(height*width,colour)
img_array2d.shape
# (208000, 3)
# reduce colour space to 6 clusters
colour_model=KMeans(n_clusters=6, n_init='auto')
colour_labels=colour_model.fit_predict(img_array2d)
# get the rgb value for each of the 6 cluster centers
rgb_colours=colour_model.cluster_centers_.round(0).astype(int)
rgb_colours
# array([[186, 111,  58],
#        [ 31,  11,  16],
#        [135,  72,  46],
#        [236, 157,  73],
#        [ 81,  40,  34],
#        [252, 199, 125]])

# assign these rgb values to each pixel within the cluster
# and reshape to the original 3d array
quantized_image=np.reshape(rgb_colours[colour_labels],(height,width,colour))
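To inspect the result, the quantized array can be displayed next to the original image, for example:

import matplotlib.pyplot as plt

# side-by-side comparison of the original and the 6-colour version
figure, axes=plt.subplots(1, 2, figsize=(12, 6))
axes[0].set_title('Original Image')
axes[0].imshow(img_array)
axes[1].set_title('Quantized Image (6 Colours)')
axes[1].imshow(quantized_image)
plt.show()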
# find columns with missing values
country_df.isnull().sum()
# Country                                0
# Region                                 0
# Population                             0
# Area (sq. mi.)                         0
# Pop. Density (per sq. mi.)             0
# Coastline (coast/area ratio)           0
# Net migration                          3
# Infant mortality (per 1000 births)     3
# GDP ($ per capita)                     1
# Literacy (%)                          18
# Phones (per 1000)                      4
# Arable (%)                             2
# Crops (%)                              2
# Other (%)                              2
# Climate                               22
# Birthrate                              3
# Deathrate                              4
# Agriculture                           15
# Industry                              16
# Service                               15
# dtype: int64
# which countries don't have an agriculture value?
country_df[pd.isnull(country_df['Agriculture'])]['Country']
# all countries without agriculture data will not have a
# whole lot of agriculture output. The same is true for 'Industry'
# and 'Service'. These values can be set to zero:
# 3               American Samoa
# 4                      Andorra
# 78                   Gibraltar
# 80                   Greenland
# 83                        Guam
# 134                    Mayotte
# 140                 Montserrat
# 144                      Nauru
# 153         N. Mariana Islands
# 171               Saint Helena
# 174       St Pierre & Miquelon
# 177                 San Marino
# 208          Turks & Caicos Is
# 221          Wallis and Futuna
# 223             Western Sahara
# Name: Country, dtype: object
# set missing values to zero for Agriculture, Industry and Service
# define what default values you want to fill
values= {
"Agriculture": 0,
"Industry": 0,
"Service": 0,
}
# and replace missing with these values
country_df=country_df.fillna(value=values)
# another datapoint that is often missing is climate
# the climate can be estimated from countries in the same Region
country_df[pd.isnull(country_df['Climate'])][['Country', 'Region', 'Climate']]
# the Region value has annoying whitespaces that need to be stripped
country_df['Region'] =country_df['Region'].apply(lambda x: x.strip())
# climate zones in WESTERN EUROPE
country_df[country_df['Region'] =='WESTERN EUROPE']['Climate'].value_counts()

# climate zones in SUB-SAHARAN AFRICA
country_df[country_df['Region'] =='SUB-SAHARAN AFRICA']['Climate'].value_counts()

# climate zones in EASTERN EUROPE
country_df[country_df['Region'] =='EASTERN EUROPE']['Climate'].value_counts()

# climate zones in NORTHERN AMERICA
country_df[country_df['Region'] =='NORTHERN AMERICA']['Climate'].value_counts()

# climate zones in NORTHERN AFRICA
country_df[country_df['Region'] =='NORTHERN AFRICA']['Climate'].value_counts()

# climate zones in C.W. OF IND. STATES
country_df[country_df['Region'] =='C.W. OF IND. STATES']['Climate'].value_counts()

# climate zones in NEAR EAST
country_df[country_df['Region'] =='NEAR EAST']['Climate'].value_counts()

# climate zones in BALTICS
country_df[country_df['Region'] =='BALTICS']['Climate'].value_counts()

# climate zones in ASIA (EX. NEAR EAST)
country_df[country_df['Region'] =='ASIA (EX. NEAR EAST)']['Climate'].value_counts()
# we can either use the top value to fill missing climate data points
# or use a mean value:
country_df['Climate'] =country_df['Climate'].fillna(country_df.groupby('Region')['Climate'].transform('mean'))
# there are more missing values, e.g. literacy:
country_df[pd.isnull(country_df['Literacy (%)'])][['Country', 'Region', 'Literacy (%)']]
| | Country | Region | Literacy (%) |
| --- | --- | --- | --- |
| 25 | Bosnia & Herzegovina | EASTERN EUROPE | NaN |
| 66 | Faroe Islands | WESTERN EUROPE | NaN |
| 74 | Gaza Strip | NEAR EAST | NaN |
| 78 | Gibraltar | WESTERN EUROPE | NaN |
| 80 | Greenland | NORTHERN AMERICA | NaN |
| 85 | Guernsey | WESTERN EUROPE | NaN |
| 99 | Isle of Man | WESTERN EUROPE | NaN |
| 104 | Jersey | WESTERN EUROPE | NaN |
| 108 | Kiribati | OCEANIA | NaN |
| 123 | Macedonia | EASTERN EUROPE | NaN |
| 134 | Mayotte | SUB-SAHARAN AFRICA | NaN |
| 144 | Nauru | OCEANIA | NaN |
| 185 | Slovakia | EASTERN EUROPE | NaN |
| 187 | Solomon Islands | OCEANIA | NaN |
| 209 | Tuvalu | OCEANIA | NaN |
| 220 | Virgin Islands | LATIN AMER. & CARIB | NaN |
| 222 | West Bank | NEAR EAST | NaN |
| 223 | Western Sahara | NORTHERN AFRICA | NaN |
# here we can also fill with mean values:
country_df['Literacy (%)'] =country_df['Literacy (%)'].fillna(country_df.groupby('Region')['Literacy (%)'].transform('mean'))

# the remaining rows with missing values can be dropped for now
country_df=country_df.dropna(axis=0)
country_df.isnull().sum()
# all feature columns now report 0 missing values
# dtype: int64
# drop the country column as it is a unique
# identifier that will not help with clustering
country_df_dropped=country_df.drop(['Country'], axis=1)
# the region column is useful but needs to be encoded
country_df_dropped=pd.get_dummies(country_df_dropped)
country_df_dropped.head(5).transpose()
| | 0 | 1 | 2 | 3 | 4 |
| --- | --- | --- | --- | --- | --- |
| Population | 31056997.00 | 3581655.000 | 3.293009e+07 | 57794.00 | 71201.00 |
| Area (sq. mi.) | 647500.00 | 28748.000 | 2.381740e+06 | 199.00 | 468.00 |
| Pop. Density (per sq. mi.) | 48.00 | 124.600 | 1.380000e+01 | 290.40 | 152.10 |
| Coastline (coast/area ratio) | 0.00 | 1.260 | 4.000000e-02 | 58.29 | 0.00 |
| Net migration | 23.06 | -4.930 | -3.900000e-01 | -20.71 | 6.60 |
| Infant mortality (per 1000 births) | 163.07 | 21.520 | 3.100000e+01 | 9.27 | 4.05 |
| GDP ($ per capita) | 700.00 | 4500.000 | 6.000000e+03 | 8000.00 | 19000.00 |
| Literacy (%) | 36.00 | 86.500 | 7.000000e+01 | 97.00 | 100.00 |
| Phones (per 1000) | 3.20 | 71.200 | 7.810000e+01 | 259.50 | 497.20 |
| Arable (%) | 12.13 | 21.090 | 3.220000e+00 | 10.00 | 2.22 |
| Crops (%) | 0.22 | 4.420 | 2.500000e-01 | 15.00 | 0.00 |
| Other (%) | 87.65 | 74.490 | 9.653000e+01 | 75.00 | 97.78 |
| Climate | 1.00 | 3.000 | 1.000000e+00 | 2.00 | 3.00 |
| Birthrate | 46.60 | 15.110 | 1.714000e+01 | 22.46 | 8.71 |
| Deathrate | 20.34 | 5.220 | 4.610000e+00 | 3.27 | 6.25 |
| Agriculture | 0.38 | 0.232 | 1.010000e-01 | 0.00 | 0.00 |
| Industry | 0.24 | 0.188 | 6.000000e-01 | 0.00 | 0.00 |
| Service | 0.38 | 0.579 | 2.980000e-01 | 0.00 | 0.00 |
| Region_ASIA (EX. NEAR EAST) | 1.00 | 0.000 | 0.000000e+00 | 0.00 | 0.00 |
| Region_BALTICS | 0.00 | 0.000 | 0.000000e+00 | 0.00 | 0.00 |
| Region_C.W. OF IND. STATES | 0.00 | 0.000 | 0.000000e+00 | 0.00 | 0.00 |
| Region_EASTERN EUROPE | 0.00 | 1.000 | 0.000000e+00 | 0.00 | 0.00 |
| Region_LATIN AMER. & CARIB | 0.00 | 0.000 | 0.000000e+00 | 0.00 | 0.00 |
| Region_NEAR EAST | 0.00 | 0.000 | 0.000000e+00 | 0.00 | 0.00 |
| Region_NORTHERN AFRICA | 0.00 | 0.000 | 1.000000e+00 | 0.00 | 0.00 |
| Region_NORTHERN AMERICA | 0.00 | 0.000 | 0.000000e+00 | 0.00 | 0.00 |
| Region_OCEANIA | 0.00 | 0.000 | 0.000000e+00 | 1.00 | 0.00 |
| Region_SUB-SAHARAN AFRICA | 0.00 | 0.000 | 0.000000e+00 | 0.00 | 0.00 |
| Region_WESTERN EUROPE | 0.00 | 0.000 | 0.000000e+00 | 0.00 | 1.00 |
# to be able to compare all datapoints they need to be normalized
country_scaler=StandardScaler()
country_df_scaled=country_scaler.fit_transform(country_df_dropped)
Model Training
# finding a good k-value for the number of clusters
ssd_country= []

for k in range(2,30):
    model=KMeans(n_clusters=k, n_init='auto')
    model.fit(country_df_scaled)
    ssd_country.append(model.inertia_)
plt.figure(figsize=(10,5))
plt.title('SSD as a Function of Number of Clusters')
plt.plot(range(2,30), ssd_country, 'o--')
plt.savefig('assets/Scikit_Learn_83.webp', bbox_inches='tight')
plt.figure(figsize=(10,5))
plt.title('Difference in SSD as a Function of Number of Clusters')
pd.Series(ssd_country).diff().plot(kind='bar')
plt.savefig('assets/Scikit_Learn_84.webp', bbox_inches='tight')
country_model=KMeans(
n_clusters=14,
n_init='auto',
random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
country_cluster_labels=country_model.fit_predict(country_df_scaled)
Model Evaluation
# add predicted label to source dataframe
country_df['Cluster14'] =country_cluster_labels
country_df['Cluster14'].value_counts()
plt.figure(figsize=(10, 7))
sns.set(style='darkgrid')
# hue/style by categorical column
sns.scatterplot(
x='GDP ($ per capita)',
y='Literacy (%)',
data=country_df,
s=40,
alpha=0.6,
hue='Cluster14',
palette='cool',
style='Region'
).set_title('Country Clusters with k=14')
plt.savefig('assets/Scikit_Learn_85.webp', bbox_inches='tight')
# repeat but with only 3 clusters
country_model2=KMeans(
n_clusters=3,
n_init='auto',
random_state=42
)
# fit to find cluster centers and predict what center every feature belongs to
country_cluster_labels2=country_model2.fit_predict(country_df_scaled)

# add predicted label to source dataframe
country_df['Cluster3'] =country_cluster_labels2

plt.figure(figsize=(10, 7))
sns.set(style='darkgrid')
# hue/style by categorical column
sns.scatterplot(
x='GDP ($ per capita)',
y='Literacy (%)',
data=country_df,
s=40,
alpha=0.6,
hue='Cluster3',
palette='cool',
style='Region'
).set_title('Country Clusters with k=3')
plt.savefig('assets/Scikit_Learn_86.webp', bbox_inches='tight')
# how do the features correlate with the predicted labels
# (numeric_only skips the non-numeric Country and Region columns)
country_label_corr=country_df.corr(numeric_only=True)['Cluster3']
print(country_label_corr.iloc[:-1].sort_values())
# there are ~ 4 clusters visible - let's try to agglomerate them
autoMPG_model=AgglomerativeClustering(n_clusters=4)
cluster_labels=autoMPG_model.fit_predict(autoMPG_scaled)
autoMPG_df['label'] =cluster_labels
autoMPG_df.head(5)
| | mpg | cylinders | displacement | horsepower | weight | acceleration | model_year | origin | name | label |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| 0 | 18.0 | 8 | 307.0 | 130.0 | 3504 | 12.0 | 70 | usa | chevrolet chevelle malibu | 2 |
| 1 | 15.0 | 8 | 350.0 | 165.0 | 3693 | 11.5 | 70 | usa | buick skylark 320 | 2 |
| 2 | 18.0 | 8 | 318.0 | 150.0 | 3436 | 11.0 | 70 | usa | plymouth satellite | 2 |
| 3 | 16.0 | 8 | 304.0 | 150.0 | 3433 | 12.0 | 70 | usa | amc rebel sst | 2 |
| 4 | 17.0 | 8 | 302.0 | 140.0 | 3449 | 10.5 | 70 | usa | ford torino | 2 |
plt.figure(figsize=(12,5))
sns.scatterplot(
x='mpg',
y='horsepower',
data=autoMPG_df,
hue='label',
palette='cool_r',
style='origin'
).set_title('Horsepower as a function of Miles-per-gallon')
plt.savefig('assets/Scikit_Learn_92.webp', bbox_inches='tight')
plt.figure(figsize=(12,5))
sns.scatterplot(
x='model_year',
y='mpg',
data=autoMPG_df,
hue='label',
palette='cool_r',
style='origin'
).set_title('Miles-per-gallon as a Function of Model Year')
plt.legend(bbox_to_anchor=(1.01,1.01))
plt.savefig('assets/Scikit_Learn_93.webp', bbox_inches='tight')
figure, axes=plt.subplots(1, 3, sharex=True,figsize=(15, 5))
figure.suptitle('Country of Origin')
sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] =='europe'],
hue='label',
palette='cool_r',
style='model_year',
ax=axes[0]
).set_title('Europe')
sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] =='japan'],
hue='label',
palette='cool_r',
style='model_year',
ax=axes[1]
).set_title('Japan')
sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] =='usa'],
hue='label',
palette='cool_r',
style='model_year',
ax=axes[2]
).set_title('USA')
plt.legend(bbox_to_anchor=(1.01,1.01))
plt.savefig('assets/Scikit_Learn_94.webp', bbox_inches='tight')
# nice... perfect separation by country!
Unknown Number of Clusters
The clustermap created above allowed us to estimate the number of clusters needed to accurately label the dataset, based on the dendrogram displayed on its left side. If we do not know how many clusters are present in our dataset, we can instead define a maximum distance threshold a cluster can have before being merged with surrounding clusters. Setting this threshold to zero results in a number of clusters equal to the number of datapoints.
autoMPG_model_auto=AgglomerativeClustering(
n_clusters=None,
metric='euclidean',
distance_threshold=0
)
cluster_labels_auto=autoMPG_model_auto.fit_predict(autoMPG_scaled)
len(np.unique(cluster_labels_auto))
# threshold of zero leads to 392 clusters == number of rows in our dataset
# find out a good distance threshold
linkage_matrix=hierarchy.linkage(autoMPG_model_auto.children_)
linkage_matrix
# [`cluster[i]`, `cluster[j]`, `distance between`, `number of members`]
# to display this matrix we can use the above mentioned dendrogram
plt.figure(figsize=(20,10))
plt.title('Hierarchy Dendrogram for 8 Classes')
dendro=hierarchy.dendrogram(linkage_matrix, truncate_mode='lastp', p=9)
plt.savefig('assets/Scikit_Learn_95.webp', bbox_inches='tight')
# The higher the y-value the larger the distance between the connected clusters
# since the miles-per-gallon value is a good indicator for the label
# what is the max distance between two points here:
car_max_mpg=autoMPG_scaled.iloc[autoMPG_scaled['mpg'].idxmax()]
car_min_mpg=autoMPG_scaled.iloc[autoMPG_scaled['mpg'].idxmin()]
np.linalg.norm(car_max_mpg-car_min_mpg)
# 3.1128158766165406
# if the max distance is ~3 the threshold should be < 3
autoMPG_model_auto=AgglomerativeClustering(
n_clusters=None,
metric='euclidean',
distance_threshold=2
)
cluster_labels_auto=autoMPG_model_auto.fit_predict(autoMPG_scaled)
len(np.unique(cluster_labels_auto))
# threshold of two leads to 11 clusters
autoMPG_model_auto=AgglomerativeClustering(
n_clusters=None,
metric='euclidean',
distance_threshold=3
)
cluster_labels_auto=autoMPG_model_auto.fit_predict(autoMPG_scaled)
len(np.unique(cluster_labels_auto))
# threshold of three leads to 9 clusters
autoMPG_df['label_auto'] =cluster_labels_auto
figure, axes=plt.subplots(1, 3, sharex=True,figsize=(15, 6))
figure.suptitle('Country of Origin')
sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] =='europe'],
hue='label_auto',
palette='cool_r',
style='model_year',
ax=axes[0]
).set_title('Europe')
sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] =='japan'],
hue='label_auto',
palette='cool_r',
style='model_year',
ax=axes[1]
).set_title('Japan')
sns.scatterplot(
x='horsepower',
y='mpg',
data=autoMPG_df[autoMPG_df['origin'] =='usa'],
hue='label_auto',
palette='cool_r',
style='model_year',
ax=axes[2]
).set_title('USA')
plt.legend(bbox_to_anchor=(1.01,1.01))
plt.savefig('assets/Scikit_Learn_96.webp', bbox_inches='tight')
# the division by countries is still there, but we are now getting
# sub-classes within each country - which might be important depending on your goal
# default hyperparameters
db_model_base=DBSCAN(eps=0.5, min_samples=5)
figure, axes=plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Default Hyperparameter')
axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_base, two_blobs_df, axes[0])
axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_base, two_blobs_otl_df, axes[1])
plt.savefig('assets/Scikit_Learn_103.webp', bbox_inches='tight')
# points around cluster 1 are assigned to be outliers
# reducing epsilon reduces the max distance (epsilon)
# points are allowed to have and still be assigned to a cluster
db_model_dec=DBSCAN(eps=0.001, min_samples=5)
figure, axes=plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Reduced Epsilon')
axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_dec, two_blobs_df, axes[0])
axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_dec, two_blobs_otl_df, axes[1])
plt.savefig('assets/Scikit_Learn_104.webp', bbox_inches='tight')
# distance is too small - every point becomes its own cluster and is assigned as an outlier
# increasing epsilon increases the max distance (epsilon)
# points are allowed to have and still be assigned to a cluster
db_model_inc=DBSCAN(eps=10, min_samples=5)
figure, axes=plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Increased Epsilon')
axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_inc, two_blobs_df, axes[0])
axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_inc, two_blobs_otl_df, axes[1])
plt.savefig('assets/Scikit_Learn_105.webp', bbox_inches='tight')
# distance is too big - every point becomes part of the same cluster
Elbow Plot
epsilon_value_range=np.linspace(0.0001, 1, 100)
n_outliers= []
perc_outlier= []
n_clusters= []
for epsilon in epsilon_value_range:
    dbscan_model=DBSCAN(eps=epsilon)
    dbscan_model.fit(two_blobs_otl_df)
    # total number of outliers
    n_outliers.append(np.sum(dbscan_model.labels_==-1))
    # percentage of outliers
    perc_outlier.append(
        100*np.sum(dbscan_model.labels_==-1) /len(dbscan_model.labels_)
    )
    # number of clusters
    n_clusters.append(len(np.unique(dbscan_model.labels_)))
plt.figure(figsize=(12,5))
plt.title('Elbow Plot - DBSCAN Hyperparameter')
plt.xlabel('Epsilon (Max Distance between Points)')
plt.ylabel('Number of Outliers')
plt.ylim(0,10)
# we expect 3 outliers
plt.hlines(y=3, xmin=0, xmax=0.7, color='fuchsia')
# 3 outliers are reached somewhere around eps=0.7
plt.vlines(x=0.7, ymin=0, ymax=3, color='fuchsia')
sns.lineplot(x=epsilon_value_range, y=n_outliers)
plt.savefig('assets/Scikit_Learn_107.webp', bbox_inches='tight')
plt.figure(figsize=(12,5))
plt.title('Number of Clusters by Epsilon Range')
plt.xlabel('Epsilon (Max Distance between Points)')
plt.ylabel('Number of Clusters')
# we expect 2 clusters + outliers
plt.hlines(y=3, xmin=0, xmax=1, color='fuchsia')
plt.ylim(0,50)
plt.xlim(0,1)
sns.lineplot(x=epsilon_value_range, y=n_clusters)
plt.savefig('assets/Scikit_Learn_108.webp', bbox_inches='tight')
# we already reach 3 clusters with an epsilon of 0.2
# but as seen above we need an epsilon of 0.7 to reduce
# the number of outliers to 3
# find the optimum
# rule of thumb for min_samples = 2*n_dim
n_dim=two_blobs_otl_df.shape[1]
db_model_opt=DBSCAN(eps=0.7, min_samples=2*n_dim)
figure, axes=plt.subplots(1, 2, sharex=True,figsize=(12, 6))
figure.suptitle('2 Blobs Dataset - Optimal Epsilon')
axes[0].set_title('DBSCAN Clustering w/o Outliers')
display_categories(db_model_opt, two_blobs_df, axes[0])
axes[1].set_title('DBSCAN Clustering with Outliers')
display_categories(db_model_opt, two_blobs_otl_df, axes[1])
plt.savefig('assets/Scikit_Learn_106.webp', bbox_inches='tight')
# the 3 outliers are labelled as such and every other point is assigned to one of the two clusters
# find the number of outliers
print('Number of Outliers', np.sum(db_model_opt.labels_==-1))
# Number of Outliers 3

# get the outlier percentage
print('Percentage of Outliers', (100*np.sum(db_model_opt.labels_==-1) /len(db_model_opt.labels_)).round(2),'%')
# Percentage of Outliers 0.3 %
Real-world Dataset
Wholesale customers
The dataset refers to clients of a wholesale distributor. It includes the annual spending in monetary units (m.u.) on diverse product categories.
Additional Information
FRESH: annual spending (m.u.) on fresh products (Continuous)
MILK: annual spending (m.u.) on milk products (Continuous)
GROCERY: annual spending (m.u.) on grocery products (Continuous)
FROZEN: annual spending (m.u.) on frozen products (Continuous)
DETERGENTS_PAPER: annual spending (m.u.) on detergents and paper products (Continuous)
DELICATESSEN: annual spending (m.u.) on delicatessen products (Continuous)
CHANNEL: customer channel - Horeca (Hotel/Restaurant/Cafe) or Retail channel (Nominal)
REGION: customer region - Lisbon, Oporto or Other (Nominal)
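The `wholesale_scaled` array used below is prepared outside this excerpt; presumably something along these lines (the file name and scaler choice are assumptions):

import pandas as pd
from sklearn.preprocessing import StandardScaler

# hypothetical loading and scaling of the wholesale customers dataset
wholesale_df=pd.read_csv('datasets/wholesale-customers.csv')
wholesale_scaler=StandardScaler()
wholesale_scaled=wholesale_scaler.fit_transform(wholesale_df)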
epsilon_value_range=np.linspace(0.001, 3, 100)
n_dim=wholesale_scaled.shape[1]
n_outliers= []
perc_outlier= []
n_clusters= []
for epsilon in epsilon_value_range:
    dbscan_model=DBSCAN(eps=epsilon, min_samples=2*n_dim)
    dbscan_model.fit(wholesale_scaled)
    # total number of outliers
    n_outliers.append(np.sum(dbscan_model.labels_==-1))
    # percentage of outliers
    perc_outlier.append(
        100*np.sum(dbscan_model.labels_==-1) /len(dbscan_model.labels_)
    )
    # number of clusters
    n_clusters.append(len(np.unique(dbscan_model.labels_)))
plt.figure(figsize=(12, 3))
plt.title('Mean Spending Values for Cluster 1 and 2')
sns.heatmap(
wholesale_clusters,
linewidth=0.5,
cmap='coolwarm',
annot=True
)
plt.savefig('assets/Scikit_Learn_117.webp', bbox_inches='tight')
Dimension Reduction - Principal Component Analysis (PCA)
Dataset Preprocessing
Breast cancer wisconsin (diagnostic) dataset.
Attribute Information:
radius (mean of distances from center to points on the perimeter)
texture (standard deviation of gray-scale values)
perimeter
area
smoothness (local variation in radius lengths)
compactness (perimeter^2 / area - 1.0)
concavity (severity of concave portions of the contour)
concave points (number of concave portions of the contour)
symmetry
fractal dimension ("coastline approximation" - 1)
The mean, standard error, and "worst" or largest (mean of the three worst/largest values) of these features were computed for each image, resulting in 30 features. For instance, field 0 is Mean Radius, field 10 is Radius SE, field 20 is Worst Radius.
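`pca_model`, `pca_results` and `tumor_scaled_df` used below are created outside this excerpt; a minimal sketch of the usual preprocessing, assuming the dataset is loaded via `load_breast_cancer` (variable names are assumptions):

import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# load the dataset and standardize the 30 features
tumor_dataset=load_breast_cancer()
tumor_df=pd.DataFrame(tumor_dataset['data'], columns=tumor_dataset['feature_names'])
tumor_scaler=StandardScaler()
tumor_scaled_df=tumor_scaler.fit_transform(tumor_df)

# project onto the first two principal components
pca_model=PCA(n_components=2)
pca_results=pca_model.fit_transform(tumor_scaled_df)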
print(pca_model.explained_variance_ratio_)
print(np.sum(pca_model.explained_variance_ratio_))
# the two principal components are able to describe
# 63% of the variance in the dataset
# [0.44272026 0.18971182]
# 0.6324320765155945

# adding the components to the original dataframe
tumor_df[['PC1','PC2']] =pca_results
tumor_df[['PC1','PC2']].head(5).transpose()
# get label data from the dataset to confirm that we still have
# separable clusters after reducing the dimensions to 2
from sklearn.datasets import load_breast_cancer
plt.figure(figsize=(12,5))
plt.title('PCA Cancer Tumor Dataset - Coloured by Labels')
sns.scatterplot(
data=tumor_df,
x='PC1', y='PC2',
hue=tumor_dataset['target'],
palette='winter'
)
plt.savefig('assets/Scikit_Learn_119.webp', bbox_inches='tight')
# as shown above we get around 63% of the variance explained by using 2 principal components
# since the dataset has 30 features, 30 principal components will explain 100% of the variance
explained_variance= []

for n in range(1,31):
    pca=PCA(n_components=n)
    pca.fit(tumor_scaled_df)
    explained_variance.append(np.sum(pca.explained_variance_ratio_))
plt.figure(figsize=(10, 5))
plt.title('Explained Variance by Number of Principal Components')
plt.xlabel('Principal Components')
sns.set(style='darkgrid')
sns.barplot(
data=pd.DataFrame(explained_variance, columns=['Explained Variance']),
x=np.arange(1,31),
y='Explained Variance'
)
plt.savefig('assets/Scikit_Learn_120.webp', bbox_inches='tight')
# drop the label column
X_digits=digits_df.drop('number_label', axis=1)
digits_labels=digits_df['number_label']
# select a single image
img_idx=333
Single_Digit=np.array(X_digits.iloc[img_idx])
Single_Digit.shape
# the images inside the dataset are flattened
# (64,)

# they need to be turned back into their 8x8 pixel format
Single_Digit=Single_Digit.reshape((8, 8))
Single_Digit.shape
# (8, 8)
pca_model2=PCA(n_components=2)
pca_results2=pca_model2.fit_transform(digits_scaled)
print(np.sum(pca_model2.explained_variance_ratio_))
# reducing the number of dimensions from 64 -> 2 leads to 22% explained variance
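Two pieces used around this cell are not shown in the excerpt: the scaled matrix `digits_scaled` and the 'PC1'/'PC2' columns that the scatterplot below expects. They presumably come from steps like these (the scaler choice is an assumption):

from sklearn.preprocessing import StandardScaler

# scaling step that would precede the PCA fit above
digits_scaled=StandardScaler().fit_transform(X_digits)

# attach the two principal components so they can be used as plot axes
X_digits[['PC1', 'PC2']]=pca_results2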
plt.figure(figsize=(12,5))
plt.title('PCA Digits Dataset - Coloured by Labels')
sns.scatterplot(
data=X_digits,
x='PC1', y='PC2',
hue=digits_labels,
palette='tab20'
)
plt.legend(bbox_to_anchor=(1.01,1.01))
plt.savefig('assets/Scikit_Learn_123.webp', bbox_inches='tight')
# numbers 4 and 7 are very distinct. There is some overlap between 6 and 0 and between 2 and 3
# but you can still get some separation. All the numbers in the middle are 'problematic' and
# probably need a larger amount of training data.
# how many components would we have to add to reach 80% explained variance?
explained_variance= []

for n in range(1,65):
    pca=PCA(n_components=n)
    pca.fit(digits_scaled)
    explained_variance.append(np.sum(pca.explained_variance_ratio_))
plt.figure(figsize=(16, 5))
plt.title('Explained Variance by Number of Principal Components')
plt.xlabel('Principal Components')
sns.set(style='darkgrid')
sns.barplot(
data=pd.DataFrame(explained_variance, columns=['Explained Variance']),
x=np.arange(1,65),
y='Explained Variance'
)
plt.savefig('assets/Scikit_Learn_124.webp', bbox_inches='tight')
# we need more than 20 principal components out of 64 to reach 80% explained variance:
# rerun the training with 3 components for ~30% explained variance
pca_model3=PCA(n_components=3)
pca_results3=pca_model3.fit_transform(digits_scaled)
print(np.sum(pca_model3.explained_variance_ratio_))
# reducing the number of dimensions from 64 -> 3 leads to 30% explained variance