import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')


train.head(6)


train.describe()


train.isnull().sum()

id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64


status_counts = train['loan_status'].value_counts()
status_percentage = (status_counts / len(train)) * 100
round(status_percentage, 0)

loan_status
0    86.0
1    14.0
Name: count, dtype: float64


columns = train.select_dtypes(include = "object").columns

for col in columns:

    print(f"Value counts for column: {col}")
    print(train[col].value_counts(), "\n")

Value counts for column: person_home_ownership
person_home_ownership
RENT        30594
MORTGAGE    24824
OWN          3138
OTHER          89
Name: count, dtype: int64 

Value counts for column: loan_intent
loan_intent
EDUCATION            12271
MEDICAL              10934
PERSONAL             10016
VENTURE              10011
DEBTCONSOLIDATION     9133
HOMEIMPROVEMENT       6280
Name: count, dtype: int64 

Value counts for column: loan_grade
loan_grade
A    20984
B    20400
C    11036
D     5034
E     1009
F      149
G       33
Name: count, dtype: int64 

Value counts for column: cb_person_default_on_file
cb_person_default_on_file
N    49943
Y     8702
Name: count, dtype: int64


def encoding(data, columns):

    data = data.copy()
    #using a combination of freq and ordinal encoder

    # Mapping dictionary
    mapping = {'RENT': 3, 'MORTGAGE': 2, 'OWN': 1, 'OTHER': 4 }
    data['person_home_ownership'] = data['person_home_ownership'].map(mapping)
    
    for col in columns:

        if col == 'loan_intent':

            data = pd.get_dummies(data, columns = ['loan_intent'])
        else:
            # Initialize the OrdinalEncoder
            ordinal_encoder = OrdinalEncoder()
            
            data[col] = ordinal_encoder.fit_transform(data[[col]])

    return data


clean_train = encoding(train, columns)
clean_test = encoding(test, columns)


train_analysis = train.copy()

bins = [15, 24, 34, 44, 54, 64, 100]
labels = ["16-24", "25-34", "35-44", "45-54", "55-64", "65+"]
train_analysis["age_bin"] = pd.cut(train_analysis["person_age"], bins = bins, labels = labels)

plot_data = train_analysis.groupby(['age_bin','loan_status']).size().reset_index(name='count')

plt.figure(figsize=(12, 6))
sns.barplot(data = plot_data, x='age_bin', y='count', hue='loan_status')

plt.title('Loan Status Counts by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Count of Loan Status')
plt.legend(title='Loan Status')
plt.show()


plot_data = train_analysis.groupby(['loan_intent','loan_status']).size().reset_index(name='count')

plt.figure(figsize=(12, 6))
sns.barplot(data = plot_data, x='loan_intent', y='count', hue='loan_status')

plt.title('Loan Status Counts by Loan Intent')
plt.xlabel('Loan Intent')
plt.ylabel('Count of Loan Status')
plt.legend(title='Loan Status')
plt.show()


plt.figure(figsize = (12,6))

sns.lineplot(train_analysis, x = 'age_bin', y = 'person_income', hue = 'cb_person_default_on_file' )
plt.show()


#feature engineering
def feature_eng(data):

    data['debt_to_income_ratio'] = data['loan_amnt'] / data['person_income']
    data['interest_rate_category'] = pd.cut(data['loan_int_rate'], bins=[0, 8, 12, 24], labels=[1, 2, 3])

    return data


clean_train = feature_eng(clean_train)
clean_test = feature_eng(clean_test)


# Convert specified columns to categorical type for train
clean_train['loan_grade'] = clean_train['loan_grade'].apply(lambda x: int(x) if x.is_integer() else x).astype('category')
clean_train['person_home_ownership'] = clean_train['person_home_ownership'].apply(lambda x: int(x) if x.is_integer() else x).astype('category')
clean_train['cb_person_default_on_file'] = clean_train['cb_person_default_on_file'].apply(lambda x: int(x) if x.is_integer() else x).astype('category')


# Convert specified columns to categorical type for test
clean_test['loan_grade'] = clean_test['loan_grade'].apply(lambda x: int(x) if x.is_integer() else x).astype('category')
clean_test['person_home_ownership'] = clean_test['person_home_ownership'].apply(lambda x: int(x) if x.is_integer() else x).astype('category')
clean_test['cb_person_default_on_file'] = clean_test['cb_person_default_on_file'].apply(lambda x: int(x) if x.is_integer() else x).astype('category')


X =  clean_train[clean_train['person_age'] < 100].drop(columns = ['id', 'loan_status'])
y = clean_train[clean_train['person_age'] < 100]['loan_status']

ids = clean_test['id']
clean_test_data = clean_test.drop(columns = ['id'])

count = y.value_counts()
class_weights = [(round((count.min()/count.sum())*100, 0))/ 10 , (round((count.max()/count.sum())*100, 0))/10]
cat_features = ['interest_rate_category', 'person_home_ownership', 'loan_grade', 'cb_person_default_on_file']
best_params = {'iterations': 529, 'depth': 3, 'learning_rate': 0.23354815782201432, 'l2_leaf_reg': 4.401132273867116,
               'bagging_temperature': 0.6457881123448875, 'random_strength': 8.71241029080795, 'border_count': 243}

model = CatBoostClassifier(**best_params, cat_features = cat_features, random_state = 42, eval_metric = 'AUC', class_weights = class_weights)
model.fit(X,y)

pred = model.predict_proba(clean_test_data)[:,1]
output = pd.DataFrame({'id': ids, 'loan_status': pred})
output.to_csv('submission2.csv', index=False)


X =  clean_train[clean_train['person_age'] < 100].drop(columns = ['id', 'loan_status'])
y = clean_train[clean_train['person_age'] < 100]['loan_status']

ids = clean_test['id']
clean_test_data = clean_test.drop(columns = ['id'])

count = y.value_counts()
scale_pos_weight = count.max() / count.min()

model = lgb.LGBMClassifier(random_state = 42, objective = 'binary', metric = 'auc', scale_pos_weight = scale_pos_weight)
model.fit(X,y)

pred = pred = model.predict_proba(clean_test_data)[:,1]
output = pd.DataFrame({'id': ids, 'loan_status': pred})
output.to_csv('submission.csv', index=False)

	id	person_age	person_income	person_emp_length	loan_amnt	loan_int_rate	loan_percent_income	cb_person_cred_hist_length	loan_status
count	58645.000000	58645.000000	5.864500e+04	58645.000000	58645.000000	58645.000000	58645.000000	58645.000000	58645.000000
mean	29322.000000	27.550857	6.404617e+04	4.701015	9217.556518	10.677874	0.159238	5.813556	0.142382
std	16929.497605	6.033216	3.793111e+04	3.959784	5563.807384	3.034697	0.091692	4.029196	0.349445
min	0.000000	20.000000	4.200000e+03	0.000000	500.000000	5.420000	0.000000	2.000000	0.000000
25%	14661.000000	23.000000	4.200000e+04	2.000000	5000.000000	7.880000	0.090000	3.000000	0.000000
50%	29322.000000	26.000000	5.800000e+04	4.000000	8000.000000	10.750000	0.140000	4.000000	0.000000
75%	43983.000000	30.000000	7.560000e+04	7.000000	12000.000000	12.990000	0.210000	8.000000	0.000000
max	58644.000000	123.000000	1.900000e+06	123.000000	35000.000000	23.220000	0.830000	30.000000	1.000000

Table of Content¶

1. Introduction¶

Project Title: Credit Risk Analysis for Personal Loan Applications¶

Introduction¶

Dataset Overview¶

Project Goals¶

Technologies and Tools¶

Data Preparation and Analysis¶

Key Observation so Far on train dataset¶

Encoding¶

Analysis¶

1. Does age have any impact on loan approval?¶

Observations¶

2. Look at the overall loan intent and see if there is any correlation to loan_status¶

Observations¶

3. Explore if there is any relationship between a person's income, age group and any previous default on file, i.e. does a higher income mean less default and Vice versa?¶

Observations¶

3. Feature Engineering¶

Model Training¶

Results¶

Conclusion¶

	id	person_age	person_income	person_home_ownership	person_emp_length	loan_intent	loan_grade	loan_amnt	loan_int_rate	loan_percent_income	cb_person_default_on_file	cb_person_cred_hist_length
0	0	37	35000	RENT	0.0	EDUCATION	B	6000	11.49	0.17	N	14
1	1	22	56000	OWN	6.0	MEDICAL	C	4000	13.35	0.07	N	2
2	2	29	28800	OWN	8.0	PERSONAL	A	6000	8.90	0.21	N	10
3	3	30	70000	RENT	14.0	VENTURE	B	12000	11.11	0.17	N	5
4	4	22	60000	RENT	2.0	MEDICAL	A	6000	6.92	0.10	N	3
5	5	27	45000	RENT	2.0	VENTURE	A	9000	8.94	0.20	N	5