import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, linkage


loc = "CUSTOMERS_CLEAN.csv"
retail = pd.read_csv(loc)


retail.head()


def initial_preprocessing(retail):

    retail = retail.copy()
    print("Missing values:\n", retail.isnull().sum())
    print("Checking for duplicate values:\n", retail.duplicated().sum())
    print("Data Shape:\n", retail.shape)
    # dropping duplicate rows
    retail = retail.drop_duplicates()
    print("Checking for duplicate values after dropping:\n", retail.duplicated().sum())

    return retail


retial = initial_preprocessing(retail)

Missing values:
 Quantity                     0
City                       135
Continent                    0
Postal_Code               3716
State_Province          117192
Order_Date                   0
Delivery_Date                0
Total Revenue                0
Unit Cost                    0
Discount                     0
OrderTypeLabel               0
CustomerCountryLabel         0
Customer_BirthDate           0
Customer_Group               0
Customer_Type                0
Order ID                     0
Profit                       0
Days to Delivery             0
Loyalty Num                  0
Customer ID                  0
dtype: int64
Checking for duplicate values:
 21
Data Shape:
 (951669, 20)
Checking for duplicate values after dropping:
 0


def initial_preprocessing2(retail, columns):

    retail = retail.copy()

    today = pd.to_datetime("today")
    retail["Delivery_Date"] = pd.to_datetime(retail["Delivery_Date"], format="%d%b%Y")
    retail["Customer_BirthDate"] = pd.to_datetime(retail["Customer_BirthDate"], format="%d%b%Y")
    
    for i in columns:
        retail[i] = retail[i].replace("[\$,]", "", regex=True).astype(float)

    aggregated = retail.groupby("Customer ID").agg(
        frequency = ("Order ID", "count"), # 
        recency = ("Delivery_Date", lambda x: (today - x.max()).days),  # Days since most recent order
        CLV = ("Total Revenue", "sum"),  # Total Revenue as CLV
        Avg_unit_cost = ("Unit Cost", "mean"), # average unit cost
        age = ("Customer_BirthDate", lambda x: round((today - x.max()).days/365, 1))  # cal age
    ).reset_index()

    return aggregated


columns = ["Total Revenue", "Unit Cost"]
project_data = initial_preprocessing2(retail, columns)


project_data = initial_preprocessing(project_data)

Missing values:
 Customer ID      0
frequency        0
recency          0
CLV              0
Avg_unit_cost    0
age              0
dtype: int64
Checking for duplicate values:
 0
Data Shape:
 (68300, 6)
Checking for duplicate values after dropping:
 0


plotting_data = project_data.drop("Customer ID", axis = 1)
plotting_data.describe()


fig, axes = plt.subplots(1, len(plotting_data.columns), figsize=(18, 6))

for i, column in enumerate(plotting_data.columns):
    sns.histplot(plotting_data[column], ax=axes[i], kde = True, bins = 20, color = "blue")
    axes[i].set_title(f'histogram of {column}')  # Title for each subplot

# Adjust the layout to prevent overlap
plt.tight_layout()
plt.show()


fig, axes = plt.subplots(1, len(plotting_data.columns), figsize=(18, 6))

for i, column in enumerate(plotting_data.columns):
    sns.boxplot(data = plotting_data[column], ax=axes[i])
    axes[i].set_title(f'Boxplot of {column}')  # Title for each subplot

# Adjust the layout to prevent overlap
plt.tight_layout()
plt.show()


sns.scatterplot(data = plotting_data, x = "frequency", y = "CLV") #investigating for linear relationship 
plt.show()


sns.scatterplot(data = plotting_data, x = "Avg_unit_cost", y = "CLV") #investigating for linear relationship 
plt.show()


# Sclar 
scaler = StandardScaler()
retail_scaled = scaler.fit_transform(plotting_data)
retail_scaled = pd.DataFrame(retail_scaled, columns = plotting_data.columns)


# 11. dimension reduction with PCA and t-SNE to reduce the data to 2D
# PCA
pca = PCA(n_components = 2)
pca_result = pca.fit_transform(retail_scaled)
X_pca = pd.DataFrame(pca_result, columns=[f'PC{i+1}' for i in range(pca_result.shape[1])])


# t-SNE
TSNE_model = TSNE(n_components=2, perplexity=10)
TSNE_transformed_data = TSNE_model.fit_transform(retail_scaled)
X_tsne = pd.DataFrame(TSNE_transformed_data, columns=[f'tsne{i+1}' for i in range(TSNE_transformed_data.shape[1])])


def plot_pca(X_pca, y):
   
    X_pca["anomaly"] = y
    
    custom_palette = {1: "blue", -1: "red"}
    sns.scatterplot(data= X_pca, x = "PC1", y = "PC2", hue = "anomaly", palette = custom_palette)
    plt.legend(labels=['Normal ', 'Anomaly'], loc='upper right') 
    plt.show()


def iso_forest(retail):

    retail = retail.copy()

    model = IsolationForest(n_estimators = 100, contamination =  0.025, random_state = 42)
    model.fit(retail)

    y_pred = model.predict(retail)
    print(pd.Series(y_pred).value_counts(normalize=True) * 100)

    return y_pred


iso_retail_pred = iso_forest(plotting_data)

 1    97.499268
-1     2.500732
Name: proportion, dtype: float64


plot_pca(X_pca, iso_retail_pred)


k_range = range(2, 11)
def k_means(retail, k_range):

    wss = []
    silhouette_scores = []
    retail = retail.copy()
    
    for k in k_range:
        
        kmeans = KMeans(n_clusters = k, init = 'k-means++', random_state = 42, n_init = 10)
        kmeans.fit(retail)
        score = silhouette_score(retail, kmeans.labels_)
        
        wss.append(kmeans.inertia_)
        silhouette_scores.append(score)

    return wss, silhouette_scores


wss, silhouette_score = k_means(retail_scaled, k_range)


# with scaled dataset
plt.figure(figsize=(8, 6))
plt.plot(k_range, wss)
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Within-Cluster Sum of Squares (WCSS)')
plt.show()


plt.figure(figsize=(8,6))
plt.plot(k_range, silhouette_score)
plt.title('Silhouette Score for Different Values of k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Silhouette Score')
plt.show()


for k, score in zip(k_range, silhouette_score):
    print(f"Silhouette Score for k={k}: {score}")

Silhouette Score for k=2: 0.2608448975514255
Silhouette Score for k=3: 0.2525993965818325
Silhouette Score for k=4: 0.25319115183306273
Silhouette Score for k=5: 0.26694482763779953
Silhouette Score for k=6: 0.2525431591824519
Silhouette Score for k=7: 0.2351264830470789
Silhouette Score for k=8: 0.23495134858551955
Silhouette Score for k=9: 0.24030792886693111
Silhouette Score for k=10: 0.22471383902722936


# performing K means with k = 5
def k_mean(retail_scaled, k):

    retail_scaled = retail_scaled.copy()
    kmeans = KMeans(n_clusters = 5, init = "k-means++", random_state = 42, n_init = 10)

    y_cluster = kmeans.fit_predict(retail_scaled)
    print("\nCluster Labels for each data point:\n", pd.Series(y_cluster).value_counts())

    return y_cluster


k = 5
y_cluster = k_mean(retail_scaled, k)


# Viewing the cluster number associated with each customer_ID
cluster  = project_data[["Customer ID"]].copy()
cluster["class"] = y_cluster
cluster.head()


# Create boxplots to display the clusters with regard to frequency, recency, CLV, average unit cost, and customer age.
boxplotting = plotting_data.copy()
boxplotting["class"] = y_cluster

fig, axes = plt.subplots(1, len(boxplotting.columns) - 1, figsize=(18, 6))

for i, column in enumerate(boxplotting.columns):
    if column != "class":
        sns.boxplot(data = boxplotting, x = "class", y = column, ax=axes[i])
        axes[i-1].set_title(f'Boxplot of {column}')  # Title for each subplot

# Adjust the layout to prevent overlap
plt.tight_layout()
plt.show()


# 2D visualisation to display the clusters with different colours. Use the output from the PCA and t-SNE.
X_tsne_plot = X_tsne.copy()
X_tsne_plot["class"] = y_cluster

sns.scatterplot(data = X_tsne_plot, x = "tsne1", y = "tsne2", hue = "class", palette="Set1")
plt.legend(loc='upper right') 
plt.title("T-SNE Plot")
plt.show()


X_pca_plot = X_pca.copy()
X_pca_plot["class"] = y_cluster

sns.scatterplot(data = X_pca_plot, x = "PC1", y = "PC2", hue = "class", palette="Set1")
plt.legend(loc='upper right') 
plt.title("PCA Plot")
plt.show()


# 7. Perform hierarchical clustering and create a dendogram.

retail_sample = retail_scaled.sample(30000)

agglo_cluster = AgglomerativeClustering(n_clusters = 5, metric='euclidean', linkage='ward')
y_agglo_cluster = agglo_cluster.fit_predict(retail_sample)

linked = linkage(retail_sample, method='ward')
plt.figure(figsize=(10, 7))
dendrogram(linked)
plt.title('Dendrogram for Hierarchical Clustering')
plt.show()

	Quantity	City	Continent	Postal_Code	State_Province	Order_Date	Delivery_Date	Total Revenue	Unit Cost	Discount	OrderTypeLabel	CustomerCountryLabel	Customer_BirthDate	Customer_Group	Customer_Type	Order ID	Profit	Days to Delivery	Loyalty Num	Customer ID
0	3	Leinster	Oceania	6437	Western Australia	01JAN2012	07JAN2012	$28.50	$9.10	.	Internet Sale	Australia	08MAY1978	Internet/Catalog Customers	Internet/Catalog Customers	1230000033	$1.20	6	99	8818
1	2	Berowra	Oceania	2081	New South Wales	01JAN2012	04JAN2012	$113.40	$56.90	.	Internet Sale	Australia	13DEC1978	Orion Club Gold members	Orion Club Gold members high activity	1230000204	($0.40)	3	99	47793
2	2	Berowra	Oceania	2081	New South Wales	01JAN2012	04JAN2012	$41.00	$18.50	.	Internet Sale	Australia	13DEC1978	Orion Club Gold members	Orion Club Gold members high activity	1230000204	$4.00	3	99	47793
3	1	Northbridge	Oceania	2063	New South Wales	01JAN2012	03JAN2012	$35.20	$29.60	.	Internet Sale	Australia	22JUN1997	Orion Club Gold members	Orion Club Gold members high activity	1230000268	$5.60	2	0	71727
4	1	Montréal	North America	NaN	Quebec	01JAN2012	04JAN2012	$24.70	$23.60	.	Internet Sale	Canada	28JAN1978	Orion Club Gold members	Orion Club Gold members medium activity	1230000487	$1.10	3	99	74503

	frequency	recency	CLV	Avg_unit_cost	age
count	68300.000000	68300.000000	68300.000000	68300.000000	68300.000000
mean	13.933660	3270.399971	1950.168370	78.894765	51.592201
std	11.329121	414.289931	1719.939245	38.004880	17.486454
min	1.000000	2884.000000	0.630000	0.500000	27.000000
25%	6.000000	2948.000000	696.000000	57.579792	36.200000
50%	11.000000	3092.000000	1497.450000	73.449286	51.400000
75%	19.000000	3442.000000	2709.845000	92.515260	66.600000
max	121.000000	4728.000000	18860.960000	1463.500000	82.000000

	Customer ID	class
0	1	3
1	3	4
2	4	0
3	5	4
4	6	1

Efficiency in Programming: ColumnTransformer and Pipeline¶

Objective¶

Findings¶

Conclusion¶

Observations¶

Data Preprocessing Findings and Exploratory Data Analysis¶

Data Preprocessing Findings¶

1. Missing Values¶

2. Distribution Analysis¶

3. Outliers¶

4. Feature Relationships¶

Next Steps¶

Outlier Detection: Observations and Model Performance¶

Isolation Forest Model¶

Model Parameters¶

Model Observations¶

PCA-Based Visualization¶

Visualization Insights¶

Conclusion¶

Next Steps¶

Determining the Optimal Number of Clusters ( ( k ) )¶

1. Elbow Method for Determining ( k )¶

Overview¶

Findings¶

Next Steps¶

2. Optimal ( k ) Based on the Silhouette Score¶

Overview¶

Findings¶

3. Conclusion¶

Perform K-Means Clustering¶

Performing K-Means Clustering and Viewing Cluster Labels¶

1. K-Means Clustering¶

Cluster Assignment¶

3. Next Step: Create Boxplots¶

Objective¶

Boxplot Analysis of Clusters Based on Key Metrics¶

Objective¶

Findings from Boxplots¶

1. Outlier Analysis¶

2. Cluster-Specific Observations¶

Interpretation in Business Context¶

Dimension Reduction with PCA and t-SNE¶

Objective¶

Findings from PCA and t-SNE¶

1. PCA Visualization¶

2. t-SNE Visualization¶

Interpretation in Business Context¶

Visual Representations¶