In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import numpy as np

Data Overview

I begin by loading the datasets:

  1. train.csv: Contains the training data with vehicle attributes and their corresponding prices.

  2. test.csv: Contains the test data for which we need to predict prices.

  3. sample_submission.csv: A sample submission file for the predicted test data prices.

I display the first few rows of the training dataset to get a quick look at its structure.

In [3]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")
In [5]:
train.head()
Out[5]:
id brand model model_year milage fuel_type engine transmission ext_col int_col accident clean_title price
0 0 MINI Cooper S Base 2007 213000 Gasoline 172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel A/T Yellow Gray None reported Yes 4200
1 1 Lincoln LS V8 2002 143250 Gasoline 252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel A/T Silver Beige At least 1 accident or damage reported Yes 4999
2 2 Chevrolet Silverado 2500 LT 2002 136731 E85 Flex Fuel 320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab... A/T Blue Gray None reported Yes 13900
3 3 Genesis G90 5.0 Ultimate 2017 19500 Gasoline 420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel Transmission w/Dual Shift Mode Black Black None reported Yes 45000
4 4 Mercedes-Benz Metris Base 2021 7388 Gasoline 208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel 7-Speed A/T Black Beige None reported Yes 97500
Dataset Description
  1. id: Unique identifier for the vehicle.
  2. brand: Vehicle brand.
  3. model: Specific model of the vehicle.
  4. model_year: Year of the vehicle model.
  5. milage: Total mileage driven by the vehicle.
  6. fuel_type: Type of fuel used by the vehicle.
  7. engine: Engine description.
  8. transmission: Type of transmission.
  9. ext_col: External color of the vehicle.
  10. int_col: Internal color of the vehicle.
  11. accident: Accident history.
  12. clean_title: Indicates whether the vehicle has a clean title.
  13. price: Target variable, price of the vehicle (in the training data).

Initial Data Exploration and Summary

To understand the data better, we check for missing values and dataset statistics.

In [13]:
print(train.describe())
print(train.info())
                  id     model_year         milage         price
count  188533.000000  188533.000000  188533.000000  1.885330e+05
mean    94266.000000    2015.829998   65705.295174  4.387802e+04
std     54424.933488       5.660967   49798.158076  7.881952e+04
min         0.000000    1974.000000     100.000000  2.000000e+03
25%     47133.000000    2013.000000   24115.000000  1.700000e+04
50%     94266.000000    2017.000000   57785.000000  3.082500e+04
75%    141399.000000    2020.000000   95400.000000  4.990000e+04
max    188532.000000    2024.000000  405000.000000  2.954083e+06
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   id            188533 non-null  int64 
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64 
 4   milage        188533 non-null  int64 
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64 
dtypes: int64(4), object(9)
memory usage: 18.7+ MB
None
In [15]:
train.isnull().sum()
Out[15]:
id                  0
brand               0
model               0
model_year          0
milage              0
fuel_type        5083
engine              0
transmission        0
ext_col             0
int_col             0
accident         2452
clean_title     21419
price               0
dtype: int64

Initial Observations

  1. 4 numerical features and 9 object (categorical) features.
  2. Missing values: fuel_type has 5,083, accident has 2,452, and clean_title has 21,419.
  3. The oldest car is from 1974 and the newest is from 2024.
  4. The minimum mileage is 100 miles, while the highest-mileage car has 405,000 miles.

Data Cleaning

There are some missing values in critical columns like fuel_type, accident, and clean_title. We employ several strategies to clean the data:

  1. Fuel Type Cleaning:
    1. Vehicles from brands like Tesla and Rivian, which are electric, may have missing values for fuel type. These are filled as "Electric".
    2. Missing fuel_type is filled based on patterns in the engine column (e.g., gasoline, hybrid, or diesel).
In [451]:
def fuel_type_cleaning(row):
    # Only fill rows where fuel_type is missing; branch order matters
    if pd.isnull(row["fuel_type"]):
        if (row["brand"] in ("Tesla", "Rivian") or row["engine"] == "Electric"
                or "Electric Fuel System" in row["engine"] or row["model"] == "EV6 Wind"):
            row["fuel_type"] = "Electric"
        elif "Engine Gasoline Fuel" in row["engine"]:
            row["fuel_type"] = "Gasoline"
        elif ("Engine Gas/Electric Hybrid" in row["engine"]
              or "Engine Gasoline/Mild Electric Hybrid" in row["engine"]):
            row["fuel_type"] = "Hybrid"
        elif "Battery" in row["engine"] or "Electric" in row["engine"]:
            row["fuel_type"] = "Electric"
        else:
            # Any remaining missing values default to Diesel
            row["fuel_type"] = "Diesel"

    return row
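Row-wise `apply` can be slow on ~190k rows. As a sketch (on hypothetical sample data, with the several electric-engine checks condensed into one substring test), the same elif chain can be vectorized with `np.select`, which evaluates conditions in order so the first match wins, just like the branches above:

```python
import numpy as np
import pandas as pd

# Hypothetical sample mirroring the columns fuel_type_cleaning reads
df = pd.DataFrame({
    "brand":  ["Tesla", "Ford", "BMW"],
    "model":  ["Model 3", "F-150", "330i"],
    "engine": ["Electric",
               "400.0HP 5.0L 8 Cylinder Engine Gasoline Fuel",
               "255.0HP 2.0L 4 Cylinder Engine Gasoline Fuel"],
    "fuel_type": [np.nan, np.nan, "Gasoline"],
})

missing = df["fuel_type"].isna()
is_electric = (df["brand"].isin(["Tesla", "Rivian"])
               | df["engine"].str.contains("Electric", na=False)
               | df["model"].eq("EV6 Wind"))
is_gas = df["engine"].str.contains("Engine Gasoline Fuel", na=False)
is_hybrid = df["engine"].str.contains("Hybrid", na=False)

# Conditions are checked in order; rows matching no condition keep their value
df["fuel_type"] = np.select(
    [missing & is_electric, missing & is_gas, missing & is_hybrid, missing],
    ["Electric", "Gasoline", "Hybrid", "Diesel"],
    default=df["fuel_type"],
)
```

On the sample above, the two missing rows become "Electric" and "Gasoline" while the already-filled row is untouched.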
    
Completing the Cleaning Process for Missing Accident and Clean Title Data
  1. We fill missing accident values with "None reported" and missing clean_title values with "Yes".
  2. Lowercasing all strings:
    1. All string columns are converted to lowercase for consistency.
  3. Transmission feature standardization:
    1. Transmission labels such as manual (m/t) and automatic (a/t) are standardized for uniformity.
In [453]:
def complete_cleaning(data):

    data['clean_title'] = data['clean_title'].fillna('Yes')
    data['accident'] = data['accident'].fillna('None reported')
    # Convert all string columns to lowercase
    data = data.apply(lambda col: col.str.lower() if col.dtype == 'object' else col)
    # Standardize transmission labels; word boundaries keep 'at'/'mt' from matching inside longer words
    data['transmission'] = data['transmission'].replace({r'\bmanual\b': 'm/t', r'\bautomatic\b': 'a/t', r'\bat\b': 'a/t',
                                                         r'\bmt\b': 'm/t', 'a/t cvt': 'cvt transmission', 'cvt-f': 'cvt transmission',
                                                         r'6[ -]speed': '6-speed'}, regex=True)

    return data
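To illustrate the regex standardization on a few hypothetical raw values (already lowercased, and using only the word-boundary patterns from the mapping above):

```python
import pandas as pd

# Hypothetical transmission strings after lowercasing
s = pd.Series(["automatic", "manual", "6 speed a/t"])
mapped = s.replace({r'\bmanual\b': 'm/t',
                    r'\bautomatic\b': 'a/t',
                    r'6[ -]speed': '6-speed'}, regex=True)
print(mapped.tolist())  # ['a/t', 'm/t', '6-speed a/t']
```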

Feature Engineering

We engineer new features to improve the predictive power of the model:

  1. Horsepower, Liter, and Cylinders:
    1. We extract the horsepower, engine liter capacity, and cylinder count from the engine column using regular expressions.
  2. Car Age: We calculate the car's age by subtracting the model_year from the current year (2025).
  3. Mileage per Year: We calculate the average mileage per year as a derived feature.
In [455]:
def feature_engineering(data):

    year = 2025

    # Engine strings were lowercased in complete_cleaning, so match 'hp', 'l', and 'cylinder'
    data['horsepower'] = data['engine'].str.extract(r'(\d+\.?\d*)hp', expand=False).fillna(0).astype(float)
    data['liter'] = data['engine'].str.extract(r'(\d+\.?\d*)l', expand=False).fillna(0).astype(float)
    data['cylinders'] = data['engine'].str.extract(r'(\d+) cylinder', expand=False).fillna(0).astype(int)
    data['car_age'] = year - data['model_year']
    data['milage_per_year'] = data['milage'] / data['car_age']

    return data
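As a quick sanity check of the extraction patterns on a hypothetical engine string (already lowercased, as complete_cleaning leaves it):

```python
import pandas as pd

# Hypothetical lowercased engine description
s = pd.Series(["172.0hp 1.6l 4 cylinder engine gasoline fuel"])
hp = s.str.extract(r'(\d+\.?\d*)hp', expand=False).astype(float)
liters = s.str.extract(r'(\d+\.?\d*)l', expand=False).astype(float)
cyl = s.str.extract(r'(\d+) cylinder', expand=False).astype(int)
print(hp[0], liters[0], cyl[0])  # 172.0 1.6 4
```

Note that the liter pattern picks the first digits followed by an `l`; here "1.6l" matches because "172.0" is followed by "hp", not "l".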
In [564]:
train_t = train.apply(fuel_type_cleaning, axis=1)
train_t = complete_cleaning(train_t)
train_t = feature_engineering(train_t)
In [565]:
# Test-set cleaning and feature engineering
test_t = test.apply(fuel_type_cleaning, axis=1)
test_t = complete_cleaning(test_t)
test_t = feature_engineering(test_t)
In [568]:
train_t.drop(columns=['engine', 'id'], inplace=True)
id = test_t['id']  # keep the test ids for the submission file
test_t.drop(columns=['engine', 'id'], inplace=True)

Visualizing Features

  1. Car Price Distribution: We visualize the distribution of car prices to understand the spread of values.
In [23]:
plt.figure(figsize=(10, 6))
sns.histplot(round(train['price'], 2))
plt.title('Distribution of Car Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
  1. Mileage vs. Price: We plot the relationship between mileage and price to see how the two are related.
In [27]:
plt.figure(figsize=(10, 6))
sns.lineplot(x='milage', y='price', data=train)
plt.title('Mileage vs. Price')
plt.xlabel('Mileage')
plt.ylabel('Price')
plt.show()
Observations

From the graph, we can see that price falls as mileage rises, which aligns with common expectations: higher mileage often indicates more wear and tear, which lowers a car's resale value.

Feature Encoding

Categorical features like brand, model, and fuel_type need to be encoded into numerical values. I use frequency encoding, which replaces each category with its frequency in the training data.

In [570]:
# Split features and target
x = train_t.drop(columns=['price'])
y = train_t['price']

# Function to apply frequency encoding
def frequency_encoding(X, test_t):

    cat_cols = X.select_dtypes(include=['object']).columns

    for col in cat_cols:
        freq_encoding = X[col].value_counts().to_dict()   # Get frequency counts
        X[col + '_encoded'] = X[col].map(freq_encoding)   # Map the frequencies to a new column
        # Categories unseen in training map to NaN; fill only the encoded column with 0
        test_t[col + '_encoded'] = test_t[col].map(freq_encoding).fillna(0)

    return X.drop(columns=cat_cols), test_t.drop(columns=cat_cols)
In [572]:
X, test_t = frequency_encoding(x, test_t)
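One property of this scheme worth noting: a category that appears only in the test set has no training frequency, so `map` yields NaN and the fill step encodes it as 0. A tiny sketch with made-up values:

```python
import pandas as pd

# Hypothetical categorical column in train vs. test
train_col = pd.Series(["ford", "ford", "bmw"])
test_col = pd.Series(["ford", "audi"])        # "audi" never appears in training

freq = train_col.value_counts().to_dict()     # {"ford": 2, "bmw": 1}
encoded = test_col.map(freq).fillna(0).astype(int)
print(encoded.tolist())                       # [2, 0] -- unseen "audi" becomes 0
```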

Model Training

  1. I use the LightGBM regressor for price prediction. LightGBM is efficient and handles large datasets well, making it a good choice for this task.
  2. After training the model on the training data, I predict vehicle prices for the test set.
  3. Finally, I prepare the submission file by combining the predicted prices with the id column from the test set.
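Before the final fit, it can help to estimate generalization error on a hold-out split. The sketch below uses a synthetic stand-in for y and a mean-prediction baseline purely to illustrate the split and the RMSE computation; in practice you would fit the LightGBM model on the training indices and predict on the validation indices:

```python
import numpy as np

rng = np.random.default_rng(42)
# Synthetic stand-in for the real target y
y_demo = rng.normal(loc=43878, scale=5000, size=1000)

# 80/20 hold-out split
idx = rng.permutation(len(y_demo))
cut = int(0.8 * len(y_demo))
train_idx, valid_idx = idx[:cut], idx[cut:]

# Baseline: predict the training-set mean (replace with lgb_model.predict(...))
pred = np.full(len(valid_idx), y_demo[train_idx].mean())
rmse = np.sqrt(np.mean((y_demo[valid_idx] - pred) ** 2))
print(round(rmse, 1))
```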
In [582]:
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X, y)
lgb_pred2 = lgb_model.predict(test_t)

output = pd.DataFrame({'id': id, 'price': lgb_pred2})
output.to_csv('submission.csv', index=False)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1433
[LightGBM] [Info] Number of data points in the train set: 188533, number of used features: 14
[LightGBM] [Info] Start training from score 43878.016178