import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
# Location of the engine sensor dataset (CSV hosted on GitHub).
url = 'https://raw.githubusercontent.com/fourthrevlxd/cam_dsb/main/engine.csv'
# Load the CSV into the DataFrame that every later step works from.
engine = pd.read_csv(url)
def structure(engine):
    """Print a quick structural overview of a DataFrame.

    Prints, in order: the first rows, the last rows, the shape, the
    per-column count of missing values and the number of fully duplicated
    rows. Section headers are wrapped in ANSI bold escape codes.

    Args:
        engine: The pandas DataFrame to summarise.

    Returns:
        None. The summary is written to standard output only.
    """
    print("\033[1mHead of DataFrame:\033[0m\n", engine.head())
    print("\033[1mTail of DataFrame:\033[0m\n", engine.tail())
    print("\033[1mData Shape:\033[0m\n", engine.shape)
    print("\033[1mChecking for missing values:\033[0m\n", engine.isnull().sum())
    print("\033[1mChecking for duplicate values:\033[0m\n", engine.duplicated().sum())
# Print the structural overview (head, tail, shape, missing values, duplicates) of the loaded data.
structure(engine)
Head of DataFrame: Engine rpm Lub oil pressure Fuel pressure Coolant pressure \ 0 682 2.391656 4.617196 2.848982 1 605 5.466877 6.424361 5.727520 2 658 3.434232 3.680896 1.678708 3 749 2.094656 7.120927 1.639670 4 676 3.538228 5.956472 3.225336 lub oil temp Coolant temp 0 76.272417 69.884609 1 73.222679 74.907314 2 88.089916 78.704806 3 77.661625 82.386700 4 75.226352 67.153220 Tail of DataFrame: Engine rpm Lub oil pressure Fuel pressure Coolant pressure \ 19530 681 3.598792 7.300853 1.083391 19531 580 2.243040 5.242489 1.471350 19532 616 3.310048 3.787693 6.001031 19533 1163 2.731726 3.703595 2.951684 19534 695 2.515089 6.355462 2.688567 lub oil temp Coolant temp 19530 76.206955 69.848780 19531 76.884907 87.098119 19532 75.889810 72.220009 19533 76.784626 81.400088 19534 77.555918 71.156081 Data Shape: (19535, 6) Checking for missing values: Engine rpm 0 Lub oil pressure 0 Fuel pressure 0 Coolant pressure 0 lub oil temp 0 Coolant temp 0 dtype: int64 Checking for duplicate values: 0
# Per-column medians, printed under a bold header.
print("\033[1mDescriptive statistics of data:\033[0m")
print(engine.median())
# Summary table including the 95th percentile, which is reused below as an outlier threshold.
engine_stats = engine.describe(percentiles=[0.25, 0.5, 0.75, 0.95])
# Bare expression: displays the table when run as a notebook cell.
engine_stats
Descriptive statistics of data:
Engine rpm 746.000000
Lub oil pressure 3.162035
Fuel pressure 6.201720
Coolant pressure 2.166883
lub oil temp 76.817350
Coolant temp 78.346662
dtype: float64
Engine rpm | Lub oil pressure | Fuel pressure | Coolant pressure | lub oil temp | Coolant temp | |
---|---|---|---|---|---|---|
count | 19535.000000 | 19535.000000 | 19535.000000 | 19535.000000 | 19535.000000 | 19535.000000 |
mean | 791.239263 | 3.303775 | 6.655615 | 2.335369 | 77.643420 | 78.427433 |
std | 267.611193 | 1.021643 | 2.761021 | 1.036382 | 3.110984 | 6.206749 |
min | 61.000000 | 0.003384 | 0.003187 | 0.002483 | 71.321974 | 61.673325 |
25% | 593.000000 | 2.518815 | 4.916886 | 1.600466 | 75.725990 | 73.895421 |
50% | 746.000000 | 3.162035 | 6.201720 | 2.166883 | 76.817350 | 78.346662 |
75% | 934.000000 | 4.055272 | 7.744973 | 2.848840 | 78.071691 | 82.915411 |
95% | 1324.000000 | 5.058040 | 12.208475 | 4.438415 | 84.940778 | 88.612891 |
max | 2239.000000 | 7.265566 | 21.138326 | 7.478505 | 89.580796 | 195.527912 |
# Row of 95th-percentile values, one per feature, taken from the describe() table.
percentiles_95 = engine_stats.loc["95%"]
# Rows whose value lies strictly above the 95th percentile of the respective feature.
engine_rpm_outliers = engine[engine["Engine rpm"] > percentiles_95["Engine rpm"]]
oil_pressure_outliers = engine[engine["Lub oil pressure"] > percentiles_95["Lub oil pressure"]]
# Display results
print("Engine rpm Outliers:\n", engine_rpm_outliers)
print("\nLubrication Oil Pressure Outliers:\n", oil_pressure_outliers)
Engine rpm Outliers: Engine rpm Lub oil pressure Fuel pressure Coolant pressure \ 23 1411 3.518329 4.158887 2.044416 35 1374 2.543575 8.641020 1.541131 73 1438 3.128226 3.346491 2.327817 76 1399 4.381382 8.970809 1.466342 99 1347 2.414643 4.876242 1.457513 ... ... ... ... ... 19472 1463 2.796106 9.633089 2.289238 19484 1383 1.980378 6.727795 1.680486 19495 1424 4.182182 4.712657 1.839777 19522 1329 3.588706 6.446181 2.610933 19529 1581 2.818947 4.519570 0.838869 lub oil temp Coolant temp 23 78.023885 86.243027 35 82.268575 66.810303 73 84.176073 75.921392 76 76.262662 66.499672 99 75.263159 81.567636 ... ... ... 19472 77.515148 75.554246 19484 75.905705 74.586248 19495 77.221795 75.895910 19522 76.126108 76.579361 19529 76.067947 75.448412 [974 rows x 6 columns] Lubrication Oil Pressure Outliers: Engine rpm Lub oil pressure Fuel pressure Coolant pressure \ 1 605 5.466877 6.424361 5.727520 7 576 5.495972 13.114658 1.251058 12 550 5.336469 9.645622 1.935837 17 1312 5.963439 7.563578 5.931953 52 625 5.618394 6.188778 1.133348 ... ... ... ... ... 19299 659 5.507376 10.212879 1.343362 19360 911 5.064814 7.939206 2.869245 19372 525 5.165195 5.148526 1.782594 19451 907 5.651968 8.047249 4.843964 19519 825 5.076495 4.545234 4.488123 lub oil temp Coolant temp 1 73.222679 74.907314 7 78.091390 71.934674 12 75.266365 74.793934 17 77.738876 88.661972 52 74.161536 76.832068 ... ... ... 19299 77.194506 88.626493 19360 77.320031 77.901975 19372 78.072750 81.230394 19451 76.607078 84.999955 19519 75.575941 80.174923 [977 rows x 6 columns]
Engine rpm | Lubrication oil pressure | Fuel pressure | Coolant pressure | Lubrication oil temperature | Coolant temperature | |
---|---|---|---|---|---|---|
Mean | 791.239 | 3.304 | 6.656 | 2.335 | 77.643 | 78.427 |
Median | 746.000 | 3.162 | 6.202 | 2.167 | 76.817 | 78.347 |
95th percentile | 1324.000 | 5.058 | 12.208 | 4.438 | 84.941 | 88.613 |
Engine rpm:
Lubrication Oil Pressure:
# One boxplot per feature, drawn side by side in a single row.
fig, axes = plt.subplots(1, len(engine.columns), figsize=(18, 6))
# plt.subplots returns a bare Axes (not an array) when there is only one
# column; np.atleast_1d makes the pairing below work in that case too.
for ax, column in zip(np.atleast_1d(axes), engine.columns):
    sns.boxplot(data=engine[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')  # Title for each subplot
# Adjust the layout to prevent overlap
plt.tight_layout()
plt.show()
# One histogram (30 bins, with a KDE overlay) per feature, side by side.
fig, axes = plt.subplots(1, len(engine.columns), figsize=(18, 6))
# plt.subplots returns a bare Axes (not an array) when there is only one
# column; np.atleast_1d makes the pairing below work in that case too.
for ax, column in zip(np.atleast_1d(axes), engine.columns):
    sns.histplot(engine[column], ax=ax, kde=True, bins=30, color="blue")
    ax.set_title(f'histogram of {column}')  # Title for each subplot
# Adjust the layout to prevent overlap
plt.tight_layout()
plt.show()
def iqr(engine, multiplier=1.5, min_flags=2):
    """Flag per-feature IQR outliers and mark rows with several flags as anomalies.

    For every column of ``engine`` a ``<column>_flagged`` indicator (0/1) is
    added, set to 1 where the value lies outside
    ``[Q1 - multiplier * IQR, Q3 + multiplier * IQR]``. A row's ``anomaly``
    label is 1 when at least ``min_flags`` of its features are flagged,
    otherwise 0. The percentage split of the ``anomaly`` column is printed.

    Args:
        engine: DataFrame whose columns are all numeric features.
        multiplier: Whisker length in units of the IQR (default 1.5).
        min_flags: Minimum number of flagged features for a row to count
            as an anomaly (default 2).

    Returns:
        A copy of ``engine`` with the ``*_flagged`` columns and the
        ``anomaly`` column appended. The input frame is not modified.
    """
    flagged = engine.copy()
    flag_columns = []
    # Snapshot the feature names first so the columns added inside the loop
    # are never treated as features themselves.
    for column in list(flagged.columns):
        values = flagged[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        spread = q3 - q1
        lower_limit = q1 - (multiplier * spread)
        upper_limit = q3 + (multiplier * spread)
        flag_name = column + "_flagged"
        flagged[flag_name] = ((values > upper_limit) | (values < lower_limit)).astype(int)
        # Track the flag columns explicitly rather than searching for the
        # substring "flagged", which could also match an input column.
        flag_columns.append(flag_name)
    flagged["anomaly"] = (flagged[flag_columns].sum(axis=1) >= min_flags).astype(int)
    print(flagged["anomaly"].value_counts(normalize=True) * 100)
    return flagged
# Run the IQR flagging on the raw data; the result carries one *_flagged column per feature.
iqr_engine = iqr(engine)
features = list(engine.columns)
# Build a co-occurrence count matrix (not a correlation matrix) of the outlier flags:
# the flags are 0/1, so entry (i, j) of flags.T @ flags counts the rows in which
# features i and j are both flagged, and the diagonal holds each feature's own flag count.
outlier_flags = iqr_engine[[f'{feature}_flagged' for feature in features]]
co_occurrence = outlier_flags.T.dot(outlier_flags)
print("Pairwise Co-occurrence of Outlier Flags:")
# Bare expression: displays the matrix when run as a notebook cell.
co_occurrence
anomaly 0 97.839775 1 2.160225 Name: proportion, dtype: float64 Pairwise Co-occurrence of Outlier Flags:
Engine rpm_flagged | Lub oil pressure_flagged | Fuel pressure_flagged | Coolant pressure_flagged | lub oil temp_flagged | Coolant temp_flagged | |
---|---|---|---|---|---|---|
Engine rpm_flagged | 464 | 3 | 25 | 16 | 72 | 0 |
Lub oil pressure_flagged | 3 | 66 | 4 | 3 | 8 | 0 |
Fuel pressure_flagged | 25 | 4 | 1135 | 45 | 152 | 0 |
Coolant pressure_flagged | 16 | 3 | 45 | 785 | 116 | 0 |
lub oil temp_flagged | 72 | 8 | 152 | 116 | 2617 | 0 |
Coolant temp_flagged | 0 | 0 | 0 | 0 | 0 | 2 |
# Standardise every feature with StandardScaler before the model-based detectors.
scaler = StandardScaler()
engine_scaled = scaler.fit_transform(engine)
# fit_transform returns a plain array; wrap it back into a DataFrame with the original column names.
engine_scaled = pd.DataFrame(engine_scaled, columns = engine.columns)
def plot_pca(engine):
    """Scatter-plot the first two principal components, coloured by anomaly label.

    Two label conventions are accepted in the ``anomaly`` column:

    * 1 = normal, -1 = anomaly (as produced by the scikit-learn detectors);
    * 0 = normal, 1 = anomaly (as produced by the IQR flagging).

    The convention is detected from the data: if any label equals 0 the
    0/1 convention is assumed, otherwise the 1/-1 convention.

    Args:
        engine: DataFrame holding the feature columns plus an ``anomaly``
            column. All other columns are fed to the PCA.

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    X = engine.drop(columns="anomaly")
    # Use a plain array so the labels are matched to the PCA rows by
    # position; assigning a Series would align on index and could produce
    # NaN for frames that do not have a default RangeIndex.
    labels = engine["anomaly"].to_numpy()

    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(X)
    X_pca = pd.DataFrame(
        pca_result,
        columns=[f'PC{i + 1}' for i in range(pca_result.shape[1])],
    )

    # A 0 can only appear under the 0/1 convention, where 1 marks anomalies.
    anomaly_value = 1 if (labels == 0).any() else -1
    # Plot on readable string labels so the legend text always matches the
    # colour, instead of relabelling legend entries by position.
    X_pca["anomaly"] = np.where(labels == anomaly_value, "Anomaly", "Normal")

    ax = sns.scatterplot(
        data=X_pca,
        x="PC1",
        y="PC2",
        hue="anomaly",
        hue_order=["Normal", "Anomaly"],
        palette={"Normal": "blue", "Anomaly": "red"},
    )
    sns.move_legend(ax, "lower right")
    plt.show()
def one_class_svm(engine, engine_scaled):
    """Label rows with a One-Class SVM fitted on the scaled features.

    The model (RBF kernel, ``gamma=0.03``, ``nu=0.025``) is fitted on
    ``engine_scaled`` and then used to predict on the same data. The
    percentage split of the predicted labels is printed.

    Args:
        engine: The unscaled DataFrame. Not used by the model; kept so the
            signature stays compatible with existing callers.
        engine_scaled: DataFrame of scaled features. It is left unmodified.
            A pre-existing ``anomaly`` column is ignored rather than being
            fed to the model as a feature.

    Returns:
        A new DataFrame with the scaled features plus an ``anomaly`` column
        holding the model's predictions (1 = inlier, -1 = outlier).
    """
    # Work on a separate frame: writing the labels into the caller's
    # engine_scaled would leak them into every later use of that frame.
    features = engine_scaled.drop(columns="anomaly", errors="ignore")
    model = OneClassSVM(kernel="rbf", gamma=0.03, nu=0.025)
    model.fit(features)
    y_pred = model.predict(features)

    labelled = features.copy()
    labelled["anomaly"] = y_pred
    print(labelled["anomaly"].value_counts(normalize=True) * 100)
    return labelled
# Label the data with the One-Class SVM; the returned frame holds the scaled features plus the "anomaly" column.
svm_engine = one_class_svm(engine, engine_scaled)
anomaly 1 97.486563 -1 2.513437 Name: proportion, dtype: float64
# Visualise the One-Class SVM labels on the first two principal components.
plot_pca(svm_engine)
Initial One-Class SVM Model:
With the initial parameters (gamma = 0.5 and nu = 0.05), the model flagged more samples than expected. This indicates that the model may be too sensitive and is flagging more samples than necessary.
Tuned One-Class SVM Model:
With the tuned parameters (gamma = 0.03 and nu = 0.0216), the tuned model detected a more appropriate number of outliers, aligning more closely with the expected 1-5% anomaly rate. This is a more suitable range for practical anomaly detection, ensuring the model is not too aggressive but still effective at identifying issues.
PCA Visualization:
Business Relevance:
def iso_forest(engine, engine_scaled):
    """Label rows with an Isolation Forest and attach the labels to the scaled data.

    The forest (``n_estimators=100``, ``contamination=0.025``,
    ``random_state=42``) is fitted on the unscaled ``engine`` values and
    then used to predict on the same data. The percentage split of the
    predicted labels is printed.

    Args:
        engine: The unscaled feature DataFrame the model is fitted on. It is
            left unmodified; a pre-existing ``anomaly`` column is ignored.
        engine_scaled: DataFrame of scaled features with the same number of
            rows as ``engine``. It is left unmodified.

    Returns:
        A new DataFrame with the scaled features plus an ``anomaly`` column
        holding the model's predictions (1 = inlier, -1 = outlier).
    """
    raw_features = engine.drop(columns="anomaly", errors="ignore")
    model = IsolationForest(n_estimators=100, contamination=0.025, random_state=42)
    model.fit(raw_features)
    y_pred = model.predict(raw_features)

    # Attach the labels to a fresh frame: writing them into the caller's
    # engine_scaled would overwrite labels that other results stored in
    # that same shared frame.
    labelled = engine_scaled.drop(columns="anomaly", errors="ignore").copy()
    labelled["anomaly"] = y_pred
    print(labelled["anomaly"].value_counts(normalize=True) * 100)
    return labelled
# Label the data with the Isolation Forest; the returned frame holds the scaled features plus the "anomaly" column.
iso_engine = iso_forest(engine, engine_scaled)
anomaly 1 97.496801 -1 2.503199 Name: proportion, dtype: float64
# Visualise the Isolation Forest labels on the first two principal components.
plot_pca(iso_engine)
Initial Isolation Forest Model:
With the initial parameters (n_estimators = 100 and contamination = 0.05), the model flagged more samples than expected. This suggests that the model was too sensitive and flagged too many normal samples as outliers.
Tuned Isolation Forest Model:
With the tuned parameters (n_estimators = 100 and contamination = 0.025), the model detected a more appropriate number of anomalies, aligning closer to the desired 1-5% range (2.5%).
PCA Visualization:
# PCA plot for the IQR-based labels.
# plot_pca expects the scikit-learn label convention (1 = normal, -1 = anomaly),
# while iqr() labels rows 0 = normal / 1 = anomaly. Convert the labels here so
# plot_pca does not have to be edited temporarily just to draw this plot.
engine_scaleds = engine_scaled.copy()
engine_scaleds["anomaly"] = iqr_engine["anomaly"].map({0: 1, 1: -1})
plot_pca(engine_scaleds)