import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from scipy.cluster.hierarchy import dendrogram, linkage
loc = "CUSTOMERS_CLEAN.csv"
retail = pd.read_csv(loc)
One objective of this notebook is to explore the feasibility of incorporating ColumnTransformer and Pipeline for more efficient programming during the dimensionality reduction and clustering processes (a minimal sketch follows the notes below).
Current Approach:
Limitations of ColumnTransformer and Pipeline:
Trade-offs:
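For reference, here is a minimal sketch of what such a pipeline could look like, assuming the five derived numeric features built later in this notebook. It is illustrative only and is not the approach actually used below.

from sklearn.pipeline import Pipeline
# Illustrative sketch: chain scaling, PCA, and K-Means into one estimator.
# Parameter values mirror choices made later in this notebook.
segmentation_pipeline = Pipeline(steps=[
    ("scale", StandardScaler()),        # standardize the derived features
    ("reduce", PCA(n_components=2)),    # project to two components
    ("cluster", KMeans(n_clusters=5, n_init=10, random_state=42)),
])
# Hypothetical usage once the feature table exists:
# labels = segmentation_pipeline.fit_predict(plotting_data)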
retail.head()
| | Quantity | City | Continent | Postal_Code | State_Province | Order_Date | Delivery_Date | Total Revenue | Unit Cost | Discount | OrderTypeLabel | CustomerCountryLabel | Customer_BirthDate | Customer_Group | Customer_Type | Order ID | Profit | Days to Delivery | Loyalty Num | Customer ID |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 3 | Leinster | Oceania | 6437 | Western Australia | 01JAN2012 | 07JAN2012 | $28.50 | $9.10 | . | Internet Sale | Australia | 08MAY1978 | Internet/Catalog Customers | Internet/Catalog Customers | 1230000033 | $1.20 | 6 | 99 | 8818 |
1 | 2 | Berowra | Oceania | 2081 | New South Wales | 01JAN2012 | 04JAN2012 | $113.40 | $56.90 | . | Internet Sale | Australia | 13DEC1978 | Orion Club Gold members | Orion Club Gold members high activity | 1230000204 | ($0.40) | 3 | 99 | 47793 |
2 | 2 | Berowra | Oceania | 2081 | New South Wales | 01JAN2012 | 04JAN2012 | $41.00 | $18.50 | . | Internet Sale | Australia | 13DEC1978 | Orion Club Gold members | Orion Club Gold members high activity | 1230000204 | $4.00 | 3 | 99 | 47793 |
3 | 1 | Northbridge | Oceania | 2063 | New South Wales | 01JAN2012 | 03JAN2012 | $35.20 | $29.60 | . | Internet Sale | Australia | 22JUN1997 | Orion Club Gold members | Orion Club Gold members high activity | 1230000268 | $5.60 | 2 | 0 | 71727 |
4 | 1 | Montréal | North America | NaN | Quebec | 01JAN2012 | 04JAN2012 | $24.70 | $23.60 | . | Internet Sale | Canada | 28JAN1978 | Orion Club Gold members | Orion Club Gold members medium activity | 1230000487 | $1.10 | 3 | 99 | 74503 |
The initial data preprocessing will focus on cleaning the features that will be utilized to create five new derived features. Subsequent analysis and clustering will be performed on these five features.
def initial_preprocessing(retail):
retail = retail.copy()
print("Missing values:\n", retail.isnull().sum())
print("Checking for duplicate values:\n", retail.duplicated().sum())
print("Data Shape:\n", retail.shape)
# dropping duplicate rows
retail = retail.drop_duplicates()
print("Checking for duplicate values after dropping:\n", retail.duplicated().sum())
return retail
retail = initial_preprocessing(retail)
Missing values:
Quantity                   0
City                     135
Continent                  0
Postal_Code             3716
State_Province        117192
Order_Date                 0
Delivery_Date              0
Total Revenue              0
Unit Cost                  0
Discount                   0
OrderTypeLabel             0
CustomerCountryLabel       0
Customer_BirthDate         0
Customer_Group             0
Customer_Type              0
Order ID                   0
Profit                     0
Days to Delivery           0
Loyalty Num                0
Customer ID                0
dtype: int64
Checking for duplicate values: 21
Data Shape: (951669, 20)
Checking for duplicate values after dropping: 0
def initial_preprocessing2(retail, columns):
    retail = retail.copy()
    today = pd.to_datetime("today")
    retail["Delivery_Date"] = pd.to_datetime(retail["Delivery_Date"], format="%d%b%Y")
    retail["Customer_BirthDate"] = pd.to_datetime(retail["Customer_BirthDate"], format="%d%b%Y")
    for i in columns:
        # Strip currency symbols and thousands separators before casting to float
        retail[i] = retail[i].replace(r"[\$,]", "", regex=True).astype(float)
    aggregated = retail.groupby("Customer ID").agg(
        frequency = ("Order ID", "count"),  # Number of order lines per customer
        recency = ("Delivery_Date", lambda x: (today - x.max()).days),  # Days since most recent delivery
        CLV = ("Total Revenue", "sum"),  # Total revenue as a proxy for customer lifetime value
        Avg_unit_cost = ("Unit Cost", "mean"),  # Average unit cost
        age = ("Customer_BirthDate", lambda x: round((today - x.max()).days / 365, 1))  # Approximate age in years
    ).reset_index()
    return aggregated
columns = ["Total Revenue", "Unit Cost"]
project_data = initial_preprocessing2(retail, columns)
project_data = initial_preprocessing(project_data)
Missing values:
Customer ID      0
frequency        0
recency          0
CLV              0
Avg_unit_cost    0
age              0
dtype: int64
Checking for duplicate values: 0
Data Shape: (68300, 6)
Checking for duplicate values after dropping: 0
The next step in our analysis will focus on performing exploratory data analysis (EDA) on the newly created features and DataFrame.
plotting_data = project_data.drop("Customer ID", axis = 1)
plotting_data.describe()
| | frequency | recency | CLV | Avg_unit_cost | age |
|---|---|---|---|---|---|
count | 68300.000000 | 68300.000000 | 68300.000000 | 68300.000000 | 68300.000000 |
mean | 13.933660 | 3270.399971 | 1950.168370 | 78.894765 | 51.592201 |
std | 11.329121 | 414.289931 | 1719.939245 | 38.004880 | 17.486454 |
min | 1.000000 | 2884.000000 | 0.630000 | 0.500000 | 27.000000 |
25% | 6.000000 | 2948.000000 | 696.000000 | 57.579792 | 36.200000 |
50% | 11.000000 | 3092.000000 | 1497.450000 | 73.449286 | 51.400000 |
75% | 19.000000 | 3442.000000 | 2709.845000 | 92.515260 | 66.600000 |
max | 121.000000 | 4728.000000 | 18860.960000 | 1463.500000 | 82.000000 |
fig, axes = plt.subplots(1, len(plotting_data.columns), figsize=(18, 6))
for i, column in enumerate(plotting_data.columns):
sns.histplot(plotting_data[column], ax=axes[i], kde = True, bins = 20, color = "blue")
    axes[i].set_title(f'Histogram of {column}')  # Title for each subplot
# Adjust the layout to prevent overlap
plt.tight_layout()
plt.show()
fig, axes = plt.subplots(1, len(plotting_data.columns), figsize=(18, 6))
for i, column in enumerate(plotting_data.columns):
sns.boxplot(data = plotting_data[column], ax=axes[i])
axes[i].set_title(f'Boxplot of {column}') # Title for each subplot
# Adjust the layout to prevent overlap
plt.tight_layout()
plt.show()
sns.scatterplot(data = plotting_data, x = "frequency", y = "CLV") #investigating for linear relationship
plt.show()
sns.scatterplot(data = plotting_data, x = "Avg_unit_cost", y = "CLV") #investigating for linear relationship
plt.show()
The second data preprocessing phase focused on ensuring the dataset's quality and reliability for effective customer segmentation. The key metrics of frequency, recency, customer lifetime value (CLV), average unit cost, and age were prioritized to align with business objectives.
To ensure robust segmentation, outlier detection using Isolation Forest will be conducted to better understand and manage the influence of extreme values.
# Scale the features to zero mean and unit variance
scaler = StandardScaler()
retail_scaled = scaler.fit_transform(plotting_data)
retail_scaled = pd.DataFrame(retail_scaled, columns = plotting_data.columns)
# Dimensionality reduction with PCA and t-SNE to project the data to 2D
# PCA
pca = PCA(n_components = 2)
pca_result = pca.fit_transform(retail_scaled)
X_pca = pd.DataFrame(pca_result, columns=[f'PC{i+1}' for i in range(pca_result.shape[1])])
# t-SNE
TSNE_model = TSNE(n_components=2, perplexity=10, random_state=42)  # fixed seed so the embedding is reproducible
TSNE_transformed_data = TSNE_model.fit_transform(retail_scaled)
X_tsne = pd.DataFrame(TSNE_transformed_data, columns=[f'tsne{i+1}' for i in range(TSNE_transformed_data.shape[1])])
def plot_pca(X_pca, y):
    X_pca = X_pca.copy()  # avoid mutating the caller's DataFrame
    X_pca["anomaly"] = y
    custom_palette = {1: "blue", -1: "red"}
    sns.scatterplot(data = X_pca, x = "PC1", y = "PC2", hue = "anomaly", hue_order = [1, -1], palette = custom_palette)
    plt.legend(labels=['Normal', 'Anomaly'], loc='upper right')
    plt.show()
def iso_forest(retail):
retail = retail.copy()
    model = IsolationForest(n_estimators = 100, contamination = 0.025, random_state = 42)  # expect ~2.5% of rows flagged as outliers
model.fit(retail)
y_pred = model.predict(retail)
print(pd.Series(y_pred).value_counts(normalize=True) * 100)
return y_pred
iso_retail_pred = iso_forest(plotting_data)
 1    97.499268
-1     2.500732
Name: proportion, dtype: float64
plot_pca(X_pca, iso_retail_pred)
The Isolation Forest model, combined with PCA visualization, provides a robust approach to identifying and validating outliers in the dataset. The results highlight potential areas of concern or interest, such as extreme values in customer behavior or product preferences. These outliers will be carefully examined and, where necessary, managed to ensure they do not unduly influence the customer segmentation process.
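Should the flagged points prove disruptive downstream, one simple way to exclude them is sketched below (illustrative only; the analysis that follows keeps all rows).

# Sketch: keep only the rows Isolation Forest labelled as inliers (+1)
inlier_mask = iso_retail_pred == 1
plotting_data_inliers = plotting_data[inlier_mask].reset_index(drop=True)
print(f"Rows kept: {len(plotting_data_inliers)} of {len(plotting_data)}")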
Select the optimal number of clusters, k, with the Elbow and Silhouette Score methods.
k_range = range(2, 11)
def k_means(retail, k_range):
wss = []
silhouette_scores = []
retail = retail.copy()
for k in k_range:
kmeans = KMeans(n_clusters = k, init = 'k-means++', random_state = 42, n_init = 10)
kmeans.fit(retail)
score = silhouette_score(retail, kmeans.labels_)
wss.append(kmeans.inertia_)
silhouette_scores.append(score)
return wss, silhouette_scores
wss, silhouette_scores = k_means(retail_scaled, k_range)  # renamed to avoid shadowing sklearn's silhouette_score
# with scaled dataset
plt.figure(figsize=(8, 6))
plt.plot(k_range, wss)
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.show()
plt.figure(figsize=(8,6))
plt.plot(k_range, silhouette_scores)
plt.title('Silhouette Score for Different Values of k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.show()
for k, score in zip(k_range, silhouette_scores):
print(f"Silhouette Score for k={k}: {score}")
Silhouette Score for k=2: 0.2608448975514255
Silhouette Score for k=3: 0.2525993965818325
Silhouette Score for k=4: 0.25319115183306273
Silhouette Score for k=5: 0.26694482763779953
Silhouette Score for k=6: 0.2525431591824519
Silhouette Score for k=7: 0.2351264830470789
Silhouette Score for k=8: 0.23495134858551955
Silhouette Score for k=9: 0.24030792886693111
Silhouette Score for k=10: 0.22471383902722936
The Elbow Method was applied to identify the optimal number of clusters, k, for customer segmentation by analyzing the within-cluster sum of squares (WSS). This method helps pinpoint the k value beyond which adding more clusters yields diminishing returns in clustering performance.
To confirm the optimal k and assess clustering quality, the Silhouette Score was used as a secondary evaluation metric.
The Silhouette Score measures clustering quality by evaluating how similar each point is to its own cluster compared with the nearest neighboring cluster.
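Concretely, for each point i the score is s(i) = (b(i) - a(i)) / max(a(i), b(i)), where a(i) is the mean distance from i to the other points in its own cluster and b(i) is the mean distance from i to the points in the nearest other cluster; scores range from -1 to 1, with values near 1 indicating well-separated clusters.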
With k = 5 determined as the optimal number of clusters (it attains the highest Silhouette Score above, 0.267):
# Performing K-Means with k = 5
def k_mean(retail_scaled, k):
    retail_scaled = retail_scaled.copy()
    kmeans = KMeans(n_clusters = k, init = "k-means++", random_state = 42, n_init = 10)  # use the k argument rather than a hard-coded value
    y_cluster = kmeans.fit_predict(retail_scaled)
    print("\nCluster label counts:\n", pd.Series(y_cluster).value_counts())
    return y_cluster
k = 5
y_cluster = k_mean(retail_scaled, k)
# Viewing the cluster number associated with each customer_ID
cluster = project_data[["Customer ID"]].copy()
cluster["class"] = y_cluster
cluster.head()
| | Customer ID | class |
|---|---|---|
0 | 1 | 3 |
1 | 3 | 4 |
2 | 4 | 0 |
3 | 5 | 4 |
4 | 6 | 1 |
After determining the optimal number of clusters (k = 5) based on the Elbow Method and Silhouette Score, we performed K-Means clustering on the dataset to segment the customers.
At run time, the K-Means algorithm assigned each customer to one of the five clusters; the table above shows the cluster label associated with each Customer ID.
The next step involves creating boxplots to display the clusters with regard to the five key metrics: frequency, recency, CLV, average unit cost, and customer age.
These visualizations will help us understand the distribution and variability of these metrics within each cluster, highlighting key differences and trends. The boxplots will also help identify any outliers that might influence cluster interpretations.
Using boxplots, we will compare and contrast clusters based on the metrics to gain actionable insights for targeted marketing strategies and customer retention efforts.
# Create boxplots to display the clusters with regard to frequency, recency, CLV, average unit cost, and customer age.
boxplotting = plotting_data.copy()
boxplotting["class"] = y_cluster
fig, axes = plt.subplots(1, len(boxplotting.columns) - 1, figsize=(18, 6))
for i, column in enumerate(boxplotting.columns.drop("class")):
    sns.boxplot(data = boxplotting, x = "class", y = column, ax=axes[i])
    axes[i].set_title(f'Boxplot of {column}')  # Title for the matching subplot
# Adjust the layout to prevent overlap
plt.tight_layout()
plt.show()
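As a complementary numeric view (a brief sketch, not part of the original output), the same comparison can be read off the per-cluster means:

# Sketch: mean of each metric per cluster, to read alongside the boxplots
cluster_profile = boxplotting.groupby("class").mean().round(2)
print(cluster_profile)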
To understand the differences in customer behavior across the five clusters, we created boxplots for each of the five metrics: frequency, recency, CLV, average unit cost, and customer age.
These visualizations provide insights into the distribution and variability of each metric within each cluster, helping us better interpret customer segmentation.
Outliers and Variability:
The significant number of outliers, especially in CLV and Average Unit Cost, indicates a diverse customer base with varying purchasing power and product preferences. Marketing strategies should focus on identifying and catering to high-value customers while considering the needs of price-sensitive segments.
Cluster Differences:
Frequency as a Stable Feature:
The stability in frequency suggests it might be less effective for differentiation. However, it remains crucial for measuring overall engagement and identifying loyal customers who make frequent purchases.
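For instance (a hypothetical cut-off, not used elsewhere in this notebook), frequent buyers could be flagged as the top quartile by order frequency:

# Hypothetical rule: 'loyal' customers are the top quartile by order frequency
freq_threshold = project_data["frequency"].quantile(0.75)
loyal_customers = project_data[project_data["frequency"] >= freq_threshold]
print(f"{len(loyal_customers)} customers at or above {freq_threshold:.0f} orders")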
# 2D visualisation of the clusters in different colours, using the output from PCA and t-SNE
X_tsne_plot = X_tsne.copy()
X_tsne_plot["class"] = y_cluster
sns.scatterplot(data = X_tsne_plot, x = "tsne1", y = "tsne2", hue = "class", palette="Set1")
plt.legend(loc='upper right')
plt.title("T-SNE Plot")
plt.show()
X_pca_plot = X_pca.copy()
X_pca_plot["class"] = y_cluster
sns.scatterplot(data = X_pca_plot, x = "PC1", y = "PC2", hue = "class", palette="Set1")
plt.legend(loc='upper right')
plt.title("PCA Plot")
plt.show()
To visualize the customer clusters in a 2D space, we performed dimension reduction using both PCA (Principal Component Analysis) and t-SNE (t-distributed Stochastic Neighbor Embedding). These methods help uncover patterns in high-dimensional data by projecting it into two dimensions while retaining as much meaningful structure as possible.
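As a quick check on how much structure the PCA projection retains (not printed in the original run), the explained variance of the two components can be inspected:

# Share of total variance captured by the two principal components
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total retained:", round(pca.explained_variance_ratio_.sum(), 3))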
Clear Cluster Separations:
The distinct separations observed in both PCA and t-SNE visualizations validate the robustness of the clustering process. Each cluster represents a unique segment of customers with specific behavioral patterns, making them actionable for tailored marketing strategies.
Cluster 4 Overlap:
The overlap of Cluster 4 with other clusters suggests that this group may include customers whose purchasing behavior, engagement, or demographics share characteristics with multiple segments. This transitional nature may require a nuanced approach.
Validation of Clustering with Dimensional Reduction:
The alignment between PCA and t-SNE results strengthens the confidence in the identified clusters and supports their use for actionable customer segmentation.
PCA-Based 2D Visualization:
The PCA plot provides a straightforward representation of the cluster structure, highlighting the general separations and overlaps between groups.
t-SNE-Based 2D Visualization:
The t-SNE plot emphasizes local relationships between data points, revealing nuanced structures and validating the robustness of the clustering process.
# Perform hierarchical clustering and create a dendrogram.
retail_sample = retail_scaled.sample(30000, random_state = 42)  # sample to keep the linkage computation tractable
agglo_cluster = AgglomerativeClustering(n_clusters = 5, metric='euclidean', linkage='ward')
y_agglo_cluster = agglo_cluster.fit_predict(retail_sample)
linked = linkage(retail_sample, method='ward')
plt.figure(figsize=(10, 7))
dendrogram(linked)
plt.title('Dendrogram for Hierarchical Clustering')
plt.show()
The dendrogram revealed a clear cutoff point at 5 clusters, confirmed by the distinct separation of branches at this level. This aligns with the results of the Elbow and Silhouette analyses from K-Means.
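One way to corroborate this agreement (a sketch, not part of the original analysis) is to cut the linkage tree into five flat clusters and compare the result with the K-Means labels on the same sample using the Adjusted Rand Index:

from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import adjusted_rand_score
# Cut the dendrogram into 5 flat clusters
hier_labels = fcluster(linked, t=5, criterion="maxclust")
# Align K-Means labels with the sampled rows (retail_sample keeps the original index)
kmeans_sample_labels = pd.Series(y_cluster, index=retail_scaled.index).loc[retail_sample.index]
print("Adjusted Rand Index:", adjusted_rand_score(hier_labels, kmeans_sample_labels))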