import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
In this project, I conducted an in-depth analysis of a loan application dataset to build a model that predicts the likelihood of a loan default. This dataset provides details about applicants, including their age, income, employment history, and loan-related features like loan amount, interest rate, and loan intent. The objective was to develop a model that could help financial institutions assess credit risk more accurately, minimizing potential losses due to defaults.
The dataset includes information on various applicant and loan characteristics:

- person_age, person_income, person_home_ownership, and person_emp_length (years of employment) describe each applicant's profile.
- loan_intent, loan_grade, loan_amnt (loan amount), and loan_int_rate (interest rate) offer insights into each loan's purpose and terms.
- cb_person_cred_hist_length (credit history length in years) and cb_person_default_on_file, a binary indicator of past default history, summarize each applicant's credit record.
- The target is loan_status, where 0 indicates no default and 1 indicates a default.

The workflow covers exploratory analysis of categorical features such as loan_intent and loan_grade, followed by building models to predict loan_status, focusing on metrics such as accuracy, precision, recall, and AUC (area under the curve). This project showcases my skills in data processing, feature engineering, machine learning, and financial risk analysis. By building an effective predictive model, I aim to demonstrate the potential for data science to improve risk assessment and lending decisions in finance.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
train.head(6)
| | id | person_age | person_income | person_home_ownership | person_emp_length | loan_intent | loan_grade | loan_amnt | loan_int_rate | loan_percent_income | cb_person_default_on_file | cb_person_cred_hist_length | loan_status |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 37 | 35000 | RENT | 0.0 | EDUCATION | B | 6000 | 11.49 | 0.17 | N | 14 | 0 |
1 | 1 | 22 | 56000 | OWN | 6.0 | MEDICAL | C | 4000 | 13.35 | 0.07 | N | 2 | 0 |
2 | 2 | 29 | 28800 | OWN | 8.0 | PERSONAL | A | 6000 | 8.90 | 0.21 | N | 10 | 0 |
3 | 3 | 30 | 70000 | RENT | 14.0 | VENTURE | B | 12000 | 11.11 | 0.17 | N | 5 | 0 |
4 | 4 | 22 | 60000 | RENT | 2.0 | MEDICAL | A | 6000 | 6.92 | 0.10 | N | 3 | 0 |
5 | 5 | 27 | 45000 | RENT | 2.0 | VENTURE | A | 9000 | 8.94 | 0.20 | N | 5 | 0 |
train.describe()
| | id | person_age | person_income | person_emp_length | loan_amnt | loan_int_rate | loan_percent_income | cb_person_cred_hist_length | loan_status |
|---|---|---|---|---|---|---|---|---|---|
count | 58645.000000 | 58645.000000 | 5.864500e+04 | 58645.000000 | 58645.000000 | 58645.000000 | 58645.000000 | 58645.000000 | 58645.000000 |
mean | 29322.000000 | 27.550857 | 6.404617e+04 | 4.701015 | 9217.556518 | 10.677874 | 0.159238 | 5.813556 | 0.142382 |
std | 16929.497605 | 6.033216 | 3.793111e+04 | 3.959784 | 5563.807384 | 3.034697 | 0.091692 | 4.029196 | 0.349445 |
min | 0.000000 | 20.000000 | 4.200000e+03 | 0.000000 | 500.000000 | 5.420000 | 0.000000 | 2.000000 | 0.000000 |
25% | 14661.000000 | 23.000000 | 4.200000e+04 | 2.000000 | 5000.000000 | 7.880000 | 0.090000 | 3.000000 | 0.000000 |
50% | 29322.000000 | 26.000000 | 5.800000e+04 | 4.000000 | 8000.000000 | 10.750000 | 0.140000 | 4.000000 | 0.000000 |
75% | 43983.000000 | 30.000000 | 7.560000e+04 | 7.000000 | 12000.000000 | 12.990000 | 0.210000 | 8.000000 | 0.000000 |
max | 58644.000000 | 123.000000 | 1.900000e+06 | 123.000000 | 35000.000000 | 23.220000 | 0.830000 | 30.000000 | 1.000000 |
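The summary statistics already flag a data quality issue: person_age and person_emp_length both max out at 123, which is not plausible. A quick, minimal sketch to inspect those rows (this motivates the person_age < 100 filter applied later before modelling):

# inspect implausible ages / employment lengths flagged by describe()
outliers = train[(train['person_age'] > 100) | (train['person_emp_length'] > 100)]
print(outliers[['person_age', 'person_emp_length', 'loan_status']])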
train.isnull().sum()
id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64
status_counts = train['loan_status'].value_counts()
status_percentage = (status_counts / len(train)) * 100
round(status_percentage, 0)
loan_status
0    86.0
1    14.0
Name: count, dtype: float64
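This roughly 86/14 split is a clear class imbalance, which the models later address through class_weights (CatBoost) and scale_pos_weight (LightGBM). As a quick sketch, the implied majority-to-minority ratio:

# ratio of non-defaults to defaults; this ratio reappears later as scale_pos_weight
imbalance_ratio = status_counts.max() / status_counts.min()
print(f"majority:minority = {imbalance_ratio:.2f}:1")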
columns = train.select_dtypes(include = "object").columns
for col in columns:
print(f"Value counts for column: {col}")
print(train[col].value_counts(), "\n")
Value counts for column: person_home_ownership
person_home_ownership
RENT        30594
MORTGAGE    24824
OWN          3138
OTHER          89
Name: count, dtype: int64

Value counts for column: loan_intent
loan_intent
EDUCATION            12271
MEDICAL              10934
PERSONAL             10016
VENTURE              10011
DEBTCONSOLIDATION     9133
HOMEIMPROVEMENT       6280
Name: count, dtype: int64

Value counts for column: loan_grade
loan_grade
A    20984
B    20400
C    11036
D     5034
E     1009
F      149
G       33
Name: count, dtype: int64

Value counts for column: cb_person_default_on_file
cb_person_default_on_file
N    49943
Y     8702
Name: count, dtype: int64
Two encoding strategies are used for the categorical features:

- **Ordinal encoding** for person_home_ownership, loan_grade, and cb_person_default_on_file, as these features have an inherent order.
- **One-hot encoding** for loan_intent, as this feature has no natural order.
def encoding(data, columns):
    data = data.copy()
    # person_home_ownership is mapped by hand so the integer codes are chosen
    # explicitly rather than assigned alphabetically
    mapping = {'RENT': 3, 'MORTGAGE': 2, 'OWN': 1, 'OTHER': 4}
    data['person_home_ownership'] = data['person_home_ownership'].map(mapping)
    for col in columns:
        if col == 'loan_intent':
            # no natural order -> one-hot encode
            data = pd.get_dummies(data, columns=['loan_intent'])
        elif col != 'person_home_ownership':
            # ordered categories (loan_grade A-G, default flag N/Y) -> ordinal encode
            ordinal_encoder = OrdinalEncoder()
            data[col] = ordinal_encoder.fit_transform(data[[col]])
    return data
clean_train = encoding(train, columns)
clean_test = encoding(test, columns)
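One caveat: encoding() fits a fresh OrdinalEncoder on whichever frame it receives, so train and test only end up with matching codes because both happen to contain every category. A more defensive pattern (a sketch, not what the submissions below use) fits the encoders on train and reuses them on test:

# hypothetical safer variant: fit on train, reuse the fitted encoders on test
def fit_ordinal_encoders(train_df, cols):
    encoders = {}
    for col in cols:
        enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
        enc.fit(train_df[[col]])
        encoders[col] = enc
    return encoders

def apply_ordinal_encoders(df, encoders):
    df = df.copy()
    for col, enc in encoders.items():
        df[col] = enc.transform(df[[col]])
    return df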
To get a feel for the data, I examine how loan status varies across age groups and loan intents, and how income relates to past default history.
train_analysis = train.copy()
bins = [15, 24, 34, 44, 54, 64, 100]
labels = ["16-24", "25-34", "35-44", "45-54", "55-64", "65+"]
train_analysis["age_bin"] = pd.cut(train_analysis["person_age"], bins = bins, labels = labels)
plot_data = train_analysis.groupby(['age_bin','loan_status']).size().reset_index(name='count')
plt.figure(figsize=(12, 6))
sns.barplot(data = plot_data, x='age_bin', y='count', hue='loan_status')
plt.title('Loan Status Counts by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Count of Loan Status')
plt.legend(title='Loan Status')
plt.show()
plot_data = train_analysis.groupby(['loan_intent','loan_status']).size().reset_index(name='count')
plt.figure(figsize=(12, 6))
sns.barplot(data = plot_data, x='loan_intent', y='count', hue='loan_status')
plt.title('Loan Status Counts by Loan Intent')
plt.xlabel('Loan Intent')
plt.ylabel('Count of Loan Status')
plt.legend(title='Loan Status')
plt.show()
plt.figure(figsize = (12,6))
sns.lineplot(data = train_analysis, x = 'age_bin', y = 'person_income', hue = 'cb_person_default_on_file')
plt.title('Mean Income by Age Group and Past Default History')
plt.xlabel('Age Group')
plt.ylabel('Income')
plt.show()
Given limited domain knowledge, only two new features will be created from existing features to try to improve model performance.
# feature engineering
def feature_eng(data):
    # ratio of the loan amount to the applicant's income
    data['debt_to_income_ratio'] = data['loan_amnt'] / data['person_income']
    # bucket interest rates into low (<=8%), medium (8-12%) and high (>12%)
    data['interest_rate_category'] = pd.cut(data['loan_int_rate'], bins=[0, 8, 12, 24], labels=[1, 2, 3])
    return data
clean_train = feature_eng(clean_train)
clean_test = feature_eng(clean_test)
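Worth noting: the raw data already contains loan_percent_income, which appears to be the same loan-to-income ratio rounded to two decimals (e.g. 6000/35000 ≈ 0.17 in the first row of train.head()), so debt_to_income_ratio mostly adds precision rather than new information. A quick check of that assumption:

# check assumption: loan_percent_income == loan_amnt / person_income rounded to 2 dp
recomputed = (train['loan_amnt'] / train['person_income']).round(2)
print((recomputed - train['loan_percent_income']).abs().max())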
# Convert the encoded columns to pandas 'category' dtype so the gradient
# boosting models treat them as categories rather than continuous numbers
cat_cols = ['loan_grade', 'person_home_ownership', 'cb_person_default_on_file']
for df in (clean_train, clean_test):
    for col in cat_cols:
        df[col] = df[col].apply(lambda x: int(x) if float(x).is_integer() else x).astype('category')
# drop rows with implausible ages (the raw data contains person_age up to 123)
X = clean_train[clean_train['person_age'] < 100].drop(columns = ['id', 'loan_status'])
y = clean_train[clean_train['person_age'] < 100]['loan_status']
ids = clean_test['id']
clean_test_data = clean_test.drop(columns = ['id'])
count = y.value_counts()
# weight each class by (roughly) the other class's share of the data,
# giving ~[1.4, 8.6]: the minority default class is upweighted about 6x
class_weights = [round((count.min() / count.sum()) * 100, 0) / 10,
                 round((count.max() / count.sum()) * 100, 0) / 10]
cat_features = ['interest_rate_category', 'person_home_ownership', 'loan_grade', 'cb_person_default_on_file']
best_params = {'iterations': 529, 'depth': 3, 'learning_rate': 0.23354815782201432, 'l2_leaf_reg': 4.401132273867116,
'bagging_temperature': 0.6457881123448875, 'random_strength': 8.71241029080795, 'border_count': 243}
model = CatBoostClassifier(**best_params, cat_features = cat_features, random_state = 42, eval_metric = 'AUC', class_weights = class_weights)
model.fit(X,y)
pred = model.predict_proba(clean_test_data)[:,1]
output = pd.DataFrame({'id': ids, 'loan_status': pred})
output.to_csv('submission2.csv', index=False)
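The very specific values in best_params suggest they came from an automated hyperparameter search, although the search itself is not shown here. A minimal sketch of how such values could be obtained with Optuna (the search space and validation split below are assumptions):

import optuna

# hold out a validation split for the search (assumed setup, not shown above)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 200, 1000),
        'depth': trial.suggest_int('depth', 3, 8),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1.0, 10.0),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'random_strength': trial.suggest_float('random_strength', 0.0, 10.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
    }
    trial_model = CatBoostClassifier(**params, cat_features=cat_features,
                                     class_weights=class_weights,
                                     random_state=42, verbose=0)
    trial_model.fit(X_tr, y_tr)
    # maximize validation AUC
    return roc_auc_score(y_val, trial_model.predict_proba(X_val)[:, 1])

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print(study.best_params)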
X = clean_train[clean_train['person_age'] < 100].drop(columns = ['id', 'loan_status'])
y = clean_train[clean_train['person_age'] < 100]['loan_status']
ids = clean_test['id']
clean_test_data = clean_test.drop(columns = ['id'])
count = y.value_counts()
# standard LightGBM imbalance handling: weight the positive (default) class
# by the majority/minority ratio (~6.07 here)
scale_pos_weight = count.max() / count.min()
model = lgb.LGBMClassifier(random_state = 42, objective = 'binary', metric = 'auc', scale_pos_weight = scale_pos_weight)
model.fit(X,y)
pred = model.predict_proba(clean_test_data)[:,1]
output = pd.DataFrame({'id': ids, 'loan_status': pred})
output.to_csv('submission.csv', index=False)
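Before drawing a conclusion, the two models can also be compared locally with a held-out split, using the roc_auc_score and train_test_split imports from the top of the notebook (a sketch; the actual comparison below is based on unseen test data):

# sketch: compare the two models on a 20% holdout
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

cat_model = CatBoostClassifier(**best_params, cat_features=cat_features,
                               class_weights=class_weights, random_state=42, verbose=0)
cat_model.fit(X_tr, y_tr)
print('CatBoost AUC:', roc_auc_score(y_val, cat_model.predict_proba(X_val)[:, 1]))

lgb_model = lgb.LGBMClassifier(random_state=42, objective='binary',
                               scale_pos_weight=scale_pos_weight)
lgb_model.fit(X_tr, y_tr)
print('LightGBM AUC:', roc_auc_score(y_val, lgb_model.predict_proba(X_val)[:, 1]))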
On unseen data, CatBoost gives a better result than LightGBM.