import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
import numpy as np
I begin by loading the datasets:

- train.csv: the training data with vehicle attributes and their corresponding prices.
- test.csv: the test data for which we need to predict prices.
- sample_submission.csv: a sample submission file showing the expected format for the predicted test prices.

I display the first few rows of both the training and test datasets to get a quick look at the structure.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")
test.head()
train.head()
| | id | brand | model | model_year | milage | fuel_type | engine | transmission | ext_col | int_col | accident | clean_title | price |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | MINI | Cooper S Base | 2007 | 213000 | Gasoline | 172.0HP 1.6L 4 Cylinder Engine Gasoline Fuel | A/T | Yellow | Gray | None reported | Yes | 4200 |
| 1 | 1 | Lincoln | LS V8 | 2002 | 143250 | Gasoline | 252.0HP 3.9L 8 Cylinder Engine Gasoline Fuel | A/T | Silver | Beige | At least 1 accident or damage reported | Yes | 4999 |
| 2 | 2 | Chevrolet | Silverado 2500 LT | 2002 | 136731 | E85 Flex Fuel | 320.0HP 5.3L 8 Cylinder Engine Flex Fuel Capab... | A/T | Blue | Gray | None reported | Yes | 13900 |
| 3 | 3 | Genesis | G90 5.0 Ultimate | 2017 | 19500 | Gasoline | 420.0HP 5.0L 8 Cylinder Engine Gasoline Fuel | Transmission w/Dual Shift Mode | Black | Black | None reported | Yes | 45000 |
| 4 | 4 | Mercedes-Benz | Metris Base | 2021 | 7388 | Gasoline | 208.0HP 2.0L 4 Cylinder Engine Gasoline Fuel | 7-Speed A/T | Black | Beige | None reported | Yes | 97500 |
To understand the data better, we check for missing values and dataset statistics.
print(train.describe())
train.info()

                  id     model_year         milage         price
count  188533.000000  188533.000000  188533.000000  1.885330e+05
mean    94266.000000    2015.829998   65705.295174  4.387802e+04
std     54424.933488       5.660967   49798.158076  7.881952e+04
min         0.000000    1974.000000     100.000000  2.000000e+03
25%     47133.000000    2013.000000   24115.000000  1.700000e+04
50%     94266.000000    2017.000000   57785.000000  3.082500e+04
75%    141399.000000    2020.000000   95400.000000  4.990000e+04
max    188532.000000    2024.000000  405000.000000  2.954083e+06

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 188533 entries, 0 to 188532
Data columns (total 13 columns):
 #   Column        Non-Null Count   Dtype
---  ------        --------------   -----
 0   id            188533 non-null  int64
 1   brand         188533 non-null  object
 2   model         188533 non-null  object
 3   model_year    188533 non-null  int64
 4   milage        188533 non-null  int64
 5   fuel_type     183450 non-null  object
 6   engine        188533 non-null  object
 7   transmission  188533 non-null  object
 8   ext_col       188533 non-null  object
 9   int_col       188533 non-null  object
 10  accident      186081 non-null  object
 11  clean_title   167114 non-null  object
 12  price         188533 non-null  int64
dtypes: int64(4), object(9)
memory usage: 18.7+ MB
train.isnull().sum()
id                 0
brand              0
model              0
model_year         0
milage             0
fuel_type       5083
engine             0
transmission       0
ext_col            0
int_col            0
accident        2452
clean_title    21419
price              0
dtype: int64
There are missing values in three columns: fuel_type, accident, and clean_title. We employ several strategies to clean the data: fuel_type is inferred from the engine description and brand where possible, and the other two are filled with their most plausible defaults.
def fuel_type_cleaning(row):
    # Only fill rows where fuel_type is missing; all other rows pass through unchanged.
    if pd.isnull(row["fuel_type"]):
        if (row["brand"] in ("Tesla", "Rivian") or row["engine"] == "Electric"
                or "Electric Fuel System" in row["engine"] or row["model"] == "EV6 Wind"):
            row["fuel_type"] = "Electric"
        elif "Engine Gasoline Fuel" in row["engine"]:
            row["fuel_type"] = "Gasoline"
        elif ("Engine Gas/Electric Hybrid" in row["engine"]
                or "Engine Gasoline/Mild Electric Hybrid" in row["engine"]):
            row["fuel_type"] = "Hybrid"
        elif "Battery" in row["engine"] or "Electric" in row["engine"]:
            row["fuel_type"] = "Electric"
        else:
            # Anything left unmatched is assumed to be diesel.
            row["fuel_type"] = "Diesel"
    return row
def complete_cleaning(data):
    # Treat a missing title status as clean and a missing accident record as none reported.
    data['clean_title'] = data['clean_title'].fillna('Yes')
    data['accident'] = data['accident'].fillna('None reported')
    # Convert all string columns to lowercase
    data = data.apply(lambda col: col.str.lower() if col.dtype == 'object' else col)
    # Normalize the many spellings of the same transmission type
    # (word boundaries keep 'at'/'mt' from matching inside longer words).
    data['transmission'] = data['transmission'].replace(
        {r'\bmanual\b': 'm/t', r'\bautomatic\b': 'a/t', r'\bat\b': 'a/t',
         r'\bmt\b': 'm/t', 'a/t cvt': 'cvt transmission', 'cvt-f': 'cvt transmission',
         r'6[ -]speed': '6-speed'}, regex=True)
    return data
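To see what the transmission normalization does, here is a small illustrative run on made-up sample strings (the inputs below are hypothetical, not taken from the dataset):

# Illustrative check of the normalization rules on toy inputs.
sample = pd.Series(['manual', 'automatic', '6 speed a/t', 'cvt-f'])
print(sample.replace({r'\bmanual\b': 'm/t', r'\bautomatic\b': 'a/t',
                      r'6[ -]speed': '6-speed', 'cvt-f': 'cvt transmission'}, regex=True))
# -> m/t, a/t, 6-speed a/t, cvt transmission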
We engineer new features to improve the predictive power of the model:
def feature_engineering(data):
    year = 2025
    # Parse horsepower, displacement, and cylinder count out of the
    # (already lowercased) engine description; 0 marks "not found".
    data['horsepower'] = data['engine'].str.extract(r'(\d+\.?\d*)hp', expand=False).fillna(0).astype(float)
    data['liter'] = data['engine'].str.extract(r'(\d+\.?\d*)l', expand=False).fillna(0).astype(float)
    data['cylinders'] = data['engine'].str.extract(r'(\d+) cylinder', expand=False).fillna(0).astype(int)
    data['car_age'] = year - data['model_year']
    data['milage_per_year'] = data['milage'] / data['car_age']
    return data
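As a worked example, take the engine string from the first training row. After lowercasing, the extraction pulls out 172.0 horsepower, a 1.6 liter displacement, and 4 cylinders:

# Worked example on the first training row's (lowercased) engine string.
s = pd.Series(['172.0hp 1.6l 4 cylinder engine gasoline fuel'])
print(s.str.extract(r'(\d+\.?\d*)hp', expand=False).astype(float)[0])  # 172.0
print(s.str.extract(r'(\d+\.?\d*)l', expand=False).astype(float)[0])   # 1.6
print(s.str.extract(r'(\d+) cylinder', expand=False).astype(int)[0])   # 4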
# Train cleaning and feature engineering
train_t = train.apply(fuel_type_cleaning, axis=1)
train_t = complete_cleaning(train_t)
train_t = feature_engineering(train_t)

# Test cleaning and feature engineering
test_t = test.apply(fuel_type_cleaning, axis=1)
test_t = complete_cleaning(test_t)
test_t = feature_engineering(test_t)

train_t.drop(columns=['engine', 'id'], inplace=True)
test_id = test_t['id']  # keep the ids for the submission file
test_t.drop(columns=['engine', 'id'], inplace=True)
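Before moving on, it is worth verifying that the imputation removed every gap we targeted. This quick check is an addition on top of the original pipeline:

# Confirm that fuel_type, accident, and clean_title no longer contain nulls.
print(train_t[['fuel_type', 'accident', 'clean_title']].isnull().sum())
print(test_t[['fuel_type', 'accident', 'clean_title']].isnull().sum())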
plt.figure(figsize=(10, 6))
sns.histplot(train['price'])
plt.title('Distribution of Car Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()
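The summary statistics showed a heavy right skew (mean of about 43,878 vs. a median of 30,825, with a maximum near 2.95M), so most of the histogram's mass sits against the left edge. A log-scale variant, sketched here as an optional extra, spreads the tail out:

# Optional variant: plot log1p(price) to make the right-skewed tail readable.
plt.figure(figsize=(10, 6))
sns.histplot(np.log1p(train['price']))
plt.title('Distribution of log(1 + Price)')
plt.xlabel('log(1 + Price)')
plt.ylabel('Frequency')
plt.show()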
plt.figure(figsize=(10, 6))
sns.lineplot(x='milage', y='price', data=train)
plt.title('Mileage vs. Price')
plt.xlabel('Mileage')
plt.ylabel('Price')
plt.show()
From the graph, we can see that as mileage goes up, price comes down, which aligns with common expectations: higher mileage usually indicates more wear and tear, which lowers a car's resale value.
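To put a number on that trend, one option (an extra check, not part of the original analysis) is Spearman's rank correlation, which measures monotonic association and is robust to the skew in both variables:

# A negative coefficient confirms the downward mileage-price relationship.
print(train['milage'].corr(train['price'], method='spearman'))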
Categorical features like brand, model, and fuel_type need to be encoded as numbers. I use frequency encoding, which replaces each category with how often it appears in the training data.
# Model training
X = train_t.drop(columns=['price'])
y = train_t['price']
# Function to apply frequency encoding
def frequency_encoding(X, test_t):
    cat_cols = X.select_dtypes(include=['object']).columns
    for col in cat_cols:
        freq_encoding = X[col].value_counts().to_dict()  # Frequency of each category in train
        X[col + '_encoded'] = X[col].map(freq_encoding)  # Map the frequencies to a new train column
        test_t[col + '_encoded'] = test_t[col].map(freq_encoding)  # Map them onto the test column as well
    # Categories unseen in train map to NaN; treat them as frequency 0.
    test_t = test_t.fillna(0)
    return X.drop(columns=cat_cols), test_t.drop(columns=cat_cols)
X, test_t = frequency_encoding(X, test_t)
lgb_model = lgb.LGBMRegressor(random_state=42)
lgb_model.fit(X, y)
lgb_pred = lgb_model.predict(test_t)
output = pd.DataFrame({'id': test_id, 'price': lgb_pred})
output.to_csv('submission.csv', index=False)
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006544 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1433
[LightGBM] [Info] Number of data points in the train set: 188533, number of used features: 14
[LightGBM] [Info] Start training from score 43878.016178
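The model above is fit on all of the training data and submitted directly. As a sketch of how one might estimate out-of-sample error before submitting (this validation step is an addition, not part of the original pipeline):

# Hypothetical validation step: 5-fold cross-validated RMSE on the training set.
from sklearn.model_selection import cross_val_score

scores = cross_val_score(lgb.LGBMRegressor(random_state=42), X, y,
                         scoring='neg_root_mean_squared_error', cv=5)
print(f"CV RMSE: {-scores.mean():.0f} (+/- {scores.std():.0f})")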