import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import OneClassSVM
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA


# Location of the raw engine sensor CSV; it is fetched over HTTP every time
# this statement runs, so a network connection is required.
url = 'https://raw.githubusercontent.com/fourthrevlxd/cam_dsb/main/engine.csv'
# Module-level DataFrame that every later step reads from.
engine = pd.read_csv(url)


def structure(engine):
    """Print a quick structural overview of a DataFrame.

    Writes five sections to standard output, in this order: the first rows,
    the last rows, the shape, the per-column count of missing values and the
    number of fully duplicated rows. Every section title is wrapped in ANSI
    bold escape codes.

    Args:
        engine: The DataFrame to describe. It is only read, never modified.

    Returns:
        None.
    """
    # Each entry pairs a title with a zero-argument callable so that a
    # section is computed right before it is printed, not up front.
    sections = (
        ("\033[1mHead of DataFrame:\033[0m\n", lambda: engine.head()),
        ("\033[1mTail of DataFrame:\033[0m\n", lambda: engine.tail()),
        ("\033[1mData Shape:\033[0m\n", lambda: engine.shape),
        ("\033[1mChecking for missing values:\033[0m\n", lambda: engine.isnull().sum()),
        ("\033[1mChecking for duplicate values:\033[0m\n", lambda: engine.duplicated().sum()),
    )
    for heading, compute in sections:
        print(heading, compute())


# Print the structural overview (head, tail, shape, missing and duplicate
# counts) of the loaded dataset.
structure(engine)

Head of DataFrame:
    Engine rpm  Lub oil pressure  Fuel pressure  Coolant pressure  \
0         682          2.391656       4.617196          2.848982   
1         605          5.466877       6.424361          5.727520   
2         658          3.434232       3.680896          1.678708   
3         749          2.094656       7.120927          1.639670   
4         676          3.538228       5.956472          3.225336   

   lub oil temp  Coolant temp  
0     76.272417     69.884609  
1     73.222679     74.907314  
2     88.089916     78.704806  
3     77.661625     82.386700  
4     75.226352     67.153220  
Tail of DataFrame:
        Engine rpm  Lub oil pressure  Fuel pressure  Coolant pressure  \
19530         681          3.598792       7.300853          1.083391   
19531         580          2.243040       5.242489          1.471350   
19532         616          3.310048       3.787693          6.001031   
19533        1163          2.731726       3.703595          2.951684   
19534         695          2.515089       6.355462          2.688567   

       lub oil temp  Coolant temp  
19530     76.206955     69.848780  
19531     76.884907     87.098119  
19532     75.889810     72.220009  
19533     76.784626     81.400088  
19534     77.555918     71.156081  
Data Shape:
 (19535, 6)
Checking for missing values:
 Engine rpm          0
Lub oil pressure    0
Fuel pressure       0
Coolant pressure    0
lub oil temp        0
Coolant temp        0
dtype: int64
Checking for duplicate values:
 0


print("\033[1mDescriptive statistics of data:\033[0m")
# Column-wise medians, printed on their own ahead of the summary table.
print(engine.median())
# Summary table with the 95th percentile requested in addition to the
# quartiles; its "95%" row is reused later as an outlier threshold.
engine_stats = engine.describe(percentiles=[0.25, 0.5, 0.75, 0.95])
# Bare expression: rendered as the cell result in a notebook, and has no
# visible effect when this file is run as a plain script.
engine_stats

Descriptive statistics of data:
Engine rpm          746.000000
Lub oil pressure      3.162035
Fuel pressure         6.201720
Coolant pressure      2.166883
lub oil temp         76.817350
Coolant temp         78.346662
dtype: float64


# The "95%" row of the summary table supplies one threshold per column.
percentiles_95 = engine_stats.loc["95%"]

# Keep only the rows whose reading lies strictly above its column threshold.
rpm_threshold = percentiles_95["Engine rpm"]
engine_rpm_outliers = engine.loc[engine["Engine rpm"].gt(rpm_threshold)]

oil_pressure_threshold = percentiles_95["Lub oil pressure"]
oil_pressure_outliers = engine.loc[
    engine["Lub oil pressure"].gt(oil_pressure_threshold)
]

# Display results
print("Engine rpm Outliers:\n", engine_rpm_outliers)
print("\nLubrication Oil Pressure Outliers:\n", oil_pressure_outliers)

Engine rpm Outliers:
        Engine rpm  Lub oil pressure  Fuel pressure  Coolant pressure  \
23           1411          3.518329       4.158887          2.044416   
35           1374          2.543575       8.641020          1.541131   
73           1438          3.128226       3.346491          2.327817   
76           1399          4.381382       8.970809          1.466342   
99           1347          2.414643       4.876242          1.457513   
...           ...               ...            ...               ...   
19472        1463          2.796106       9.633089          2.289238   
19484        1383          1.980378       6.727795          1.680486   
19495        1424          4.182182       4.712657          1.839777   
19522        1329          3.588706       6.446181          2.610933   
19529        1581          2.818947       4.519570          0.838869   

       lub oil temp  Coolant temp  
23        78.023885     86.243027  
35        82.268575     66.810303  
73        84.176073     75.921392  
76        76.262662     66.499672  
99        75.263159     81.567636  
...             ...           ...  
19472     77.515148     75.554246  
19484     75.905705     74.586248  
19495     77.221795     75.895910  
19522     76.126108     76.579361  
19529     76.067947     75.448412  

[974 rows x 6 columns]

Lubrication Oil Pressure Outliers:
        Engine rpm  Lub oil pressure  Fuel pressure  Coolant pressure  \
1             605          5.466877       6.424361          5.727520   
7             576          5.495972      13.114658          1.251058   
12            550          5.336469       9.645622          1.935837   
17           1312          5.963439       7.563578          5.931953   
52            625          5.618394       6.188778          1.133348   
...           ...               ...            ...               ...   
19299         659          5.507376      10.212879          1.343362   
19360         911          5.064814       7.939206          2.869245   
19372         525          5.165195       5.148526          1.782594   
19451         907          5.651968       8.047249          4.843964   
19519         825          5.076495       4.545234          4.488123   

       lub oil temp  Coolant temp  
1         73.222679     74.907314  
7         78.091390     71.934674  
12        75.266365     74.793934  
17        77.738876     88.661972  
52        74.161536     76.832068  
...             ...           ...  
19299     77.194506     88.626493  
19360     77.320031     77.901975  
19372     78.072750     81.230394  
19451     76.607078     84.999955  
19519     75.575941     80.174923  

[977 rows x 6 columns]


# One boxplot per column, drawn side by side in a single row of subplots.
fig, axes = plt.subplots(nrows=1, ncols=len(engine.columns), figsize=(18, 6))

for ax, column in zip(axes, engine.columns):
    sns.boxplot(data=engine[column], ax=ax)
    ax.set_title(f'Boxplot of {column}')  # Title for each subplot

# Adjust the layout to prevent overlap
plt.tight_layout()
plt.show()


# One 30-bin histogram with a KDE overlay per column, all in a single row.
hist_style = {"kde": True, "bins": 30, "color": "blue"}
fig, axes = plt.subplots(nrows=1, ncols=len(engine.columns), figsize=(18, 6))

for ax, column in zip(axes, engine.columns):
    sns.histplot(engine[column], ax=ax, **hist_style)
    ax.set_title(f'histogram of {column}')  # Title for each subplot

# Adjust the layout to prevent overlap
plt.tight_layout()
plt.show()


def iqr(engine):
    """Flag outliers column by column with the 1.5 * IQR rule.

    For every column present on entry, a companion ``<column>_flagged``
    column is appended holding 1 where the value lies strictly below
    ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR`` and 0 otherwise.
    A final ``anomaly`` column is 1 for rows flagged in more than one
    column and 0 otherwise. The percentage split of ``anomaly`` is printed.

    Args:
        engine: DataFrame of numeric readings. It is copied, so the
            caller's frame is left untouched.

    Returns:
        The copy, carrying the extra ``*_flagged`` and ``anomaly`` columns.
    """
    flagged = engine.copy()

    # Snapshot the names first: the loop appends columns to the same frame.
    for name in list(flagged.columns):
        values = flagged[name]
        first_quartile = values.quantile(0.25)
        third_quartile = values.quantile(0.75)

        spread = third_quartile - first_quartile
        low_fence = first_quartile - (1.5 * spread)
        high_fence = third_quartile + (1.5 * spread)

        outside = values.gt(high_fence) | values.lt(low_fence)
        flagged[name + "_flagged"] = outside.astype(int)

    # Any column whose name contains "flagged" counts towards the row total.
    flag_names = [label for label in flagged.columns if "flagged" in label]
    flags_per_row = flagged[flag_names].sum(axis=1)
    flagged["anomaly"] = flags_per_row.gt(1).astype(int)
    print(flagged["anomaly"].value_counts(normalize=True) * 100)

    return flagged


iqr_engine = iqr(engine)
features = engine.columns.tolist()

# Count, for every pair of features, the rows flagged as outliers in both.
# The flags are 0/1, so the diagonal is each feature's own outlier count.
flag_columns = [f'{feature}_flagged' for feature in features]
outlier_flags = iqr_engine[flag_columns]
co_occurrence = outlier_flags.T @ outlier_flags

print("Pairwise Co-occurrence of Outlier Flags:")
co_occurrence

anomaly
0    97.839775
1     2.160225
Name: proportion, dtype: float64
Pairwise Co-occurrence of Outlier Flags:


# Standardise every feature with StandardScaler and wrap the result back
# into a DataFrame. The original index is carried over (not just the column
# names) so that later index-aligned assignments between `engine`-derived
# frames and `engine_scaled` line up row for row even if `engine` does not
# have a default 0..n-1 index.
scaler = StandardScaler()
engine_scaled = pd.DataFrame(
    scaler.fit_transform(engine),
    columns=engine.columns,
    index=engine.index,
)


def plot_pca(engine, anomaly_value=None):
    """Scatter-plot the first two principal components, coloured by label.

    Every column except ``anomaly`` is reduced to two principal components;
    the points are drawn in blue ("Normal") or red ("Anomaly").

    Args:
        engine: DataFrame holding the feature columns plus an ``anomaly``
            label column.
        anomaly_value: The value in ``anomaly`` that marks an anomalous
            row. When left as ``None`` it is inferred: if the labels
            contain 0 they are treated as 0/1 flags (1 = anomaly),
            otherwise as 1/-1 labels (-1 = anomaly).

    Returns:
        None. The figure is shown with ``plt.show()``.
    """
    X = engine.drop(columns="anomaly")
    # Positional values: the PCA frame below has a fresh 0..n-1 index, so an
    # index-aligned assignment could produce NaN labels.
    y = engine["anomaly"].to_numpy()

    if anomaly_value is None:
        anomaly_value = 1 if (y == 0).any() else -1

    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(X)

    X_pca = pd.DataFrame(
        pca_result,
        columns=[f'PC{i+1}' for i in range(pca_result.shape[1])],
    )
    # Plot named classes so the legend text is tied to the data and cannot
    # drift from the colours, whichever numeric convention came in.
    X_pca["anomaly"] = np.where(y == anomaly_value, "Anomaly", "Normal")

    custom_palette = {"Normal": "blue", "Anomaly": "red"}
    ax = sns.scatterplot(
        data=X_pca,
        x="PC1",
        y="PC2",
        hue="anomaly",
        hue_order=["Normal", "Anomaly"],
        palette=custom_palette,
    )
    # Reposition the legend seaborn built, keeping its handles and labels.
    sns.move_legend(ax, "lower right")
    plt.show()


def one_class_svm(engine, engine_scaled):
    """Label rows with a One-Class SVM fitted on the scaled features.

    Args:
        engine: Unused; kept so existing two-argument calls keep working.
        engine_scaled: DataFrame of scaled features. It is NOT modified;
            an ``anomaly`` column already present on it is ignored as a
            model input.

    Returns:
        A copy of ``engine_scaled`` with an ``anomaly`` column holding the
        model's ``predict`` output. The percentage split of that column is
        printed.
    """
    # Work on a copy so the caller's frame (shared with the other detectors)
    # is neither mutated nor aliased by the returned result.
    result = engine_scaled.copy()
    # A leftover label column from an earlier run must not be used as a feature.
    feature_frame = result.drop(columns="anomaly", errors="ignore")

    model = OneClassSVM(kernel="rbf", gamma=0.03, nu=0.025)
    model.fit(feature_frame)

    result["anomaly"] = model.predict(feature_frame)
    print(result["anomaly"].value_counts(normalize=True) * 100)

    return result


# Run the One-Class SVM detector; the returned frame carries its labels in
# an "anomaly" column.
svm_engine = one_class_svm(engine, engine_scaled)

anomaly
 1    97.486563
-1     2.513437
Name: proportion, dtype: float64


# Plot the One-Class SVM labels on the first two principal components.
plot_pca(svm_engine)


def iso_forest(engine, engine_scaled):
    """Label rows with an Isolation Forest fitted on the unscaled features.

    Args:
        engine: DataFrame of raw features the forest is fitted on. It is
            not modified; an ``anomaly`` column, if present, is ignored as
            a model input.
        engine_scaled: DataFrame of scaled features with the same rows in
            the same order. It is NOT modified.

    Returns:
        A copy of ``engine_scaled`` with an ``anomaly`` column holding the
        model's ``predict`` output. The percentage split of that column is
        printed.
    """
    # Copy so that results returned earlier from the same `engine_scaled`
    # object are not overwritten by this detector's labels.
    result = engine_scaled.copy()
    feature_frame = engine.drop(columns="anomaly", errors="ignore")

    model = IsolationForest(n_estimators=100, contamination=0.025, random_state=42)
    model.fit(feature_frame)

    # Predictions are assigned by position, matching the shared row order.
    result["anomaly"] = model.predict(feature_frame)
    print(result["anomaly"].value_counts(normalize=True) * 100)

    return result


# Run the Isolation Forest detector; the returned frame carries its labels
# in an "anomaly" column.
iso_engine = iso_forest(engine, engine_scaled)

anomaly
 1    97.496801
-1     2.503199
Name: proportion, dtype: float64


# Plot the Isolation Forest labels on the first two principal components.
plot_pca(iso_engine)


# PCA plot for the IQR results.
# iqr() marks rows as 0 (normal) / 1 (anomaly). Convert them to the same
# 1 (normal) / -1 (anomaly) labels that the One-Class SVM and Isolation
# Forest results carry, so all three plots share one label convention and
# the plotting function never has to be edited by hand for this cell.
engine_scaleds = engine_scaled.copy()
# Assigned by position: both frames are derived from `engine` row for row.
engine_scaleds["anomaly"] = np.where(iqr_engine["anomaly"].to_numpy() == 1, -1, 1)
plot_pca(engine_scaleds)

	Engine rpm	Lub oil pressure	Fuel pressure	Coolant pressure	lub oil temp	Coolant temp
count	19535.000000	19535.000000	19535.000000	19535.000000	19535.000000	19535.000000
mean	791.239263	3.303775	6.655615	2.335369	77.643420	78.427433
std	267.611193	1.021643	2.761021	1.036382	3.110984	6.206749
min	61.000000	0.003384	0.003187	0.002483	71.321974	61.673325
25%	593.000000	2.518815	4.916886	1.600466	75.725990	73.895421
50%	746.000000	3.162035	6.201720	2.166883	76.817350	78.346662
75%	934.000000	4.055272	7.744973	2.848840	78.071691	82.915411
95%	1324.000000	5.058040	12.208475	4.438415	84.940778	88.612891
max	2239.000000	7.265566	21.138326	7.478505	89.580796	195.527912

	Engine rpm	Lubrication oil pressure	Fuel pressure	Coolant pressure	Lubrication oil temperature	Coolant temperature
Mean	791.239	3.304	6.656	2.335	77.643	78.427
Median	746.000	3.162	6.202	2.167	76.817	78.347
95th percentile	1324.000	5.058	12.208	4.438	84.941	88.613

	Engine rpm_flagged	Lub oil pressure_flagged	Fuel pressure_flagged	Coolant pressure_flagged	lub oil temp_flagged	Coolant temp_flagged
Engine rpm_flagged	464	3	25	16	72	0
Lub oil pressure_flagged	3	66	4	3	8	0
Fuel pressure_flagged	25	4	1135	45	152	0
Coolant pressure_flagged	16	3	45	785	116	0
lub oil temp_flagged	72	8	152	116	2617	0
Coolant temp_flagged	0	0	0	0	0	2

Table of Contents¶

1. Required Libraries and Dataset¶

2. DataFrame and EDA¶

Results¶

Results¶

Observations¶

Values beyond the 95th percentile: Insights and Recommendations¶

3. Distribution and Extreme Values Visualization¶

Insights¶

Observations from Histograms:¶

Observations from Boxplots:¶

4. Interquartile Range (IQR) Method¶

Observations and Insights¶

5. One-class SVM¶

Insights and Observations¶

Model Performance¶

6. Isolation Forest¶

Insights and Observations¶

Model Performance¶