Anastasia Giachanou, Tina Shahedi
Machine Learning with Python - Utrecht Summer School
In this practical, we are going to work on a classification problem and apply different classification algorithms to data about loans.
The problem we will work on today is automating loan eligibility decisions based on customer details. In this practical we will try Logistic Regression, SVM and KNN, three very popular classification models.
As our first step, we are going to load the necessary libraries.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.metrics import roc_curve, roc_auc_score
First we are going to load the loan_train.csv file that contains our data. It's a dataset from Kaggle provided by the Dream Housing Finance company, which specializes in home loans across urban, semiurban and rural areas.
1. Load the data into a variable called loan_df. Print some basic information about the dataset to understand its content: the first rows, the summary statistics and the general information.
loan_df = pd.read_csv('loan_train.csv')
print("Head of the dataset:")
print(loan_df.head())
print("\nSummary statistics of the dataset:")
print(loan_df.describe())
print("\nInformation about the dataset:")
print(loan_df.info())
Head of the dataset:
    Loan_ID Gender Married Dependents     Education Self_Employed  \
0  LP001002   Male      No          0      Graduate            No
1  LP001003   Male     Yes          1      Graduate            No
2  LP001005   Male     Yes          0      Graduate           Yes
3  LP001006   Male     Yes          0  Not Graduate            No
4  LP001008   Male      No          0      Graduate            No

   ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
0             5849                0.0         NaN             360.0
1             4583             1508.0       128.0             360.0
2             3000                0.0        66.0             360.0
3             2583             2358.0       120.0             360.0
4             6000                0.0       141.0             360.0

   Credit_History Property_Area Loan_Status
0             1.0         Urban           Y
1             1.0         Rural           N
2             1.0         Urban           Y
3             1.0         Urban           Y
4             1.0         Urban           Y

Summary statistics of the dataset:
       ApplicantIncome  CoapplicantIncome  LoanAmount  Loan_Amount_Term  \
count       614.000000         614.000000  592.000000         600.00000
mean       5403.459283        1621.245798  146.412162         342.00000
std        6109.041673        2926.248369   85.587325          65.12041
min         150.000000           0.000000    9.000000          12.00000
25%        2877.500000           0.000000  100.000000         360.00000
50%        3812.500000        1188.500000  128.000000         360.00000
75%        5795.000000        2297.250000  168.000000         360.00000
max       81000.000000       41667.000000  700.000000         480.00000

       Credit_History
count      564.000000
mean         0.842199
std          0.364878
min          0.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          1.000000

Information about the dataset:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Loan_ID            614 non-null    object
 1   Gender             601 non-null    object
 2   Married            611 non-null    object
 3   Dependents         599 non-null    object
 4   Education          614 non-null    object
 5   Self_Employed      582 non-null    object
 6   ApplicantIncome    614 non-null    int64
 7   CoapplicantIncome  614 non-null    float64
 8   LoanAmount         592 non-null    float64
 9   Loan_Amount_Term   600 non-null    float64
 10  Credit_History     564 non-null    float64
 11  Property_Area      614 non-null    object
 12  Loan_Status        614 non-null    object
dtypes: float64(4), int64(1), object(8)
memory usage: 62.5+ KB
None
We can see all the different columns of the dataset. There are 13 columns and 614 rows. A detailed description of the dataset is presented below:
Column | Description
---|---
Loan_ID | Unique Loan ID
Gender | Male/ Female
Married | Applicant married (Y/N)
Dependents | Number of dependents
Education | Applicant Education (Graduate/ Under Graduate)
Self_Employed | Self employed (Y/N)
ApplicantIncome | Applicant income
CoapplicantIncome | Coapplicant income
LoanAmount | Loan amount in thousands
Loan_Amount_Term | Term of loan in months
Credit_History | Credit history meets guidelines
Property_Area | Urban/ Semi Urban/ Rural
Loan_Status | Loan approved (Y/N)
As we can see, not all of them are relevant (e.g., the id of the loan). We can also see that the natural classification target is Loan_Status. Before we proceed with the classification task, it is important to do some pre-processing. One of the first steps is to drop the NA values and any columns that are not relevant. So, let's have a look at how many Nulls we have per field:
2. Check the null counts per variable.
loan_df.isnull().sum()
Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64
From the information above, we see that there are NAs in seven variables.
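As an aside: dropping these rows is the simplest option, although it costs us a fair share of the data (134 of the 614 rows, as we will see below). Purely as a hedged sketch that we do not run in this practical, the missing values could instead be imputed, for example with the mode for categorical columns and the median for numeric ones:

# Alternative sketch (not used here): impute instead of dropping rows.
for col in ['Gender', 'Married', 'Dependents', 'Self_Employed', 'Credit_History']:
    loan_df[col] = loan_df[col].fillna(loan_df[col].mode()[0])   # most frequent value
for col in ['LoanAmount', 'Loan_Amount_Term']:
    loan_df[col] = loan_df[col].fillna(loan_df[col].median())    # column median

In this practical we follow the exercise and drop the incomplete rows instead.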
3. Remove the rows with NA values and drop the Loan_ID column.
loan_df = loan_df.dropna()
loan_df = loan_df.drop('Loan_ID',axis=1)
# Displaying the first few rows of the new dataframe
print(loan_df.head())
print(loan_df.info())
  Gender Married Dependents     Education Self_Employed  ApplicantIncome  \
1   Male     Yes          1      Graduate            No             4583
2   Male     Yes          0      Graduate           Yes             3000
3   Male     Yes          0  Not Graduate            No             2583
4   Male      No          0      Graduate            No             6000
5   Male     Yes          2      Graduate           Yes             5417

   CoapplicantIncome  LoanAmount  Loan_Amount_Term  Credit_History  \
1             1508.0       128.0             360.0             1.0
2                0.0        66.0             360.0             1.0
3             2358.0       120.0             360.0             1.0
4                0.0       141.0             360.0             1.0
5             4196.0       267.0             360.0             1.0

  Property_Area Loan_Status
1         Rural           N
2         Urban           Y
3         Urban           Y
4         Urban           Y
5         Urban           Y

<class 'pandas.core.frame.DataFrame'>
Index: 480 entries, 1 to 613
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   Gender             480 non-null    object
 1   Married            480 non-null    object
 2   Dependents         480 non-null    object
 3   Education          480 non-null    object
 4   Self_Employed      480 non-null    object
 5   ApplicantIncome    480 non-null    int64
 6   CoapplicantIncome  480 non-null    float64
 7   LoanAmount         480 non-null    float64
 8   Loan_Amount_Term   480 non-null    float64
 9   Credit_History     480 non-null    float64
 10  Property_Area      480 non-null    object
 11  Loan_Status        480 non-null    object
dtypes: float64(4), int64(1), object(7)
memory usage: 48.8+ KB
None
There are 480 rows left in our dataset. Now that the rows with NULLs have been dropped, the dataset should contain no missing values. Let's check to confirm:
loan_df.isnull().sum()
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64
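Before modelling, it is also useful to check how balanced the target variable is. A quick check, not part of the numbered exercises:

# How many approved (Y) vs. rejected (N) loans remain after cleaning?
print(loan_df['Loan_Status'].value_counts())
print(loan_df['Loan_Status'].value_counts(normalize=True).round(2))

Keeping the class balance in mind will help us interpret the accuracy and f1 scores later on.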
4. One of the important steps is to encode the categorical features. Try to encode the features ['Education', 'Property_Area', 'Loan_Status', 'Gender', 'Married', 'Dependents', 'Self_Employed'] with the help of the LabelEncoder class.
# Encoding categorical variables: each column gets its own integer codes
label_encoder = LabelEncoder()
categorical_cols = ['Education', 'Property_Area', 'Loan_Status', 'Gender',
                    'Married', 'Dependents', 'Self_Employed']
loan_df[categorical_cols] = loan_df[categorical_cols].apply(LabelEncoder().fit_transform)
You can also encode categorical variables as shown below:
loan_df['Education'] = label_encoder.fit_transform(loan_df['Education'])
loan_df['Property_Area'] = label_encoder.fit_transform(loan_df['Property_Area'])
loan_df['Loan_Status'] = label_encoder.fit_transform(loan_df['Loan_Status'])
loan_df['Gender'] = label_encoder.fit_transform(loan_df['Gender'])
loan_df['Married'] = label_encoder.fit_transform(loan_df['Married'])
loan_df['Dependents'] = label_encoder.fit_transform(loan_df['Dependents'])
loan_df['Self_Employed'] = label_encoder.fit_transform(loan_df['Self_Employed'])
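A side note: in scikit-learn, LabelEncoder is primarily documented for encoding target labels; for input features the library also provides OrdinalEncoder, which encodes several columns in one call. An equivalent hedged sketch (keeping LabelEncoder for the Loan_Status target):

from sklearn.preprocessing import OrdinalEncoder

# Equivalent sketch: encode all feature columns at once, the target separately.
feature_cols = ['Education', 'Property_Area', 'Gender', 'Married', 'Dependents', 'Self_Employed']
loan_df[feature_cols] = OrdinalEncoder().fit_transform(loan_df[feature_cols])
loan_df['Loan_Status'] = LabelEncoder().fit_transform(loan_df['Loan_Status'])

Either approach produces the same integer codes for this dataset.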
Let's check the top rows of our training dataset to confirm whether it now contains the numerical values as expected.
loan_df.head()
 | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status
---|---|---|---|---|---|---|---|---|---|---|---|---
1 | 1 | 1 | 1 | 0 | 0 | 4583 | 1508.0 | 128.0 | 360.0 | 1.0 | 0 | 0
2 | 1 | 1 | 0 | 0 | 1 | 3000 | 0.0 | 66.0 | 360.0 | 1.0 | 2 | 1
3 | 1 | 1 | 0 | 1 | 0 | 2583 | 2358.0 | 120.0 | 360.0 | 1.0 | 2 | 1
4 | 1 | 0 | 0 | 0 | 0 | 6000 | 0.0 | 141.0 | 360.0 | 1.0 | 2 | 1
5 | 1 | 1 | 2 | 0 | 1 | 5417 | 4196.0 | 267.0 | 360.0 | 1.0 | 2 | 1
We can see that now the categorical variables have been encoded into a format that can be used by the machine learning model. Another step is to scale the numerical features.
5. Scale the numerical features (['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Credit_History']) with the help of the MinMaxScaler class. This will scale the data to a fixed range, between 0 and 1. Print the first 5 rows to see if anything changed.
# Feature scaling: map each numerical column to the [0, 1] range
scaler = MinMaxScaler()
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Credit_History']
loan_df[numerical_cols] = scaler.fit_transform(loan_df[numerical_cols])
loan_df.head()
 | Gender | Married | Dependents | Education | Self_Employed | ApplicantIncome | CoapplicantIncome | LoanAmount | Loan_Amount_Term | Credit_History | Property_Area | Loan_Status
---|---|---|---|---|---|---|---|---|---|---|---|---
1 | 1 | 1 | 1 | 0 | 0 | 0.054830 | 0.044567 | 0.201354 | 360.0 | 1.0 | 0 | 0
2 | 1 | 1 | 0 | 0 | 1 | 0.035250 | 0.000000 | 0.096447 | 360.0 | 1.0 | 2 | 1
3 | 1 | 1 | 0 | 1 | 0 | 0.030093 | 0.069687 | 0.187817 | 360.0 | 1.0 | 2 | 1
4 | 1 | 0 | 0 | 0 | 0 | 0.072356 | 0.000000 | 0.223350 | 360.0 | 1.0 | 2 | 1
5 | 1 | 1 | 2 | 0 | 1 | 0.065145 | 0.124006 | 0.436548 | 360.0 | 1.0 | 2 | 1
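A caveat worth flagging, though not required for this practical: we fitted the MinMaxScaler on the full dataset, while in the next step we will split the data into training and test sets. Strictly speaking, the scaler should learn its min/max only from the training rows, otherwise a little information about the test set leaks into preprocessing. A leakage-free hedged sketch (the variable names are illustrative):

# Leakage-free variant (sketch): split first, then fit the scaler on the
# training rows only and reuse the same min/max on the test rows.
numerical_cols = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Credit_History']
train_part, test_part = train_test_split(loan_df, test_size=0.2, random_state=42)
train_part, test_part = train_part.copy(), test_part.copy()
scaler = MinMaxScaler().fit(train_part[numerical_cols])      # min/max from training data only
train_part[numerical_cols] = scaler.transform(train_part[numerical_cols])
test_part[numerical_cols] = scaler.transform(test_part[numerical_cols])

In this practical we keep the simpler scale-then-split order, which is common in teaching examples.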
Now, we are going to separate the dataset into input features and the target variable, and then split it into training and testing sets.
6. Split the dataset into input features (X) and target variable (y)
X = loan_df.drop('Loan_Status', axis=1)
y = loan_df['Loan_Status']
7. Split the dataset into training and testing sets. Use the train_test_split function for this, with 20% of the data as test data.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
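Since the two classes are imbalanced (more approved than rejected loans), an optional variant is to pass stratify=y, so that the class proportions are preserved in both splits:

# Optional variant: stratified split keeps the Y/N proportions of
# Loan_Status the same in the training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

Note that the results reported below were produced with the plain (non-stratified) split.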
For this problem, we will explore three different classification algorithms, starting with Logistic Regression.
8. Make a new variable called logistic_reg to which you will assign an instance of the LogisticRegression class, which is the one that implements this model.
logistic_reg = LogisticRegression(max_iter=1000)
9. Fit the model to the training data with the fit() function.
logistic_reg.fit(X_train, y_train)
LogisticRegression(max_iter=1000)
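As a quick illustrative aside (not part of the exercise), once the model is fitted we can inspect the learned coefficients to see which features push a prediction towards approval; the coef_df name here is just for illustration:

# Illustrative: positive coefficients push predictions towards class 1 (approved).
coef_df = pd.DataFrame({'feature': X_train.columns,
                        'coefficient': logistic_reg.coef_[0]})
print(coef_df.sort_values('coefficient', ascending=False))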
Once the model is trained, we can make the predictions.
10. Use the predict function to predict the values of the test set using the logistic regression model you trained.
y_pred_logistic = logistic_reg.predict(X_test)
The next step is to evaluate the performance of the logistic regression model.
11. Print the classification report and the confusion matrix by calling the appropriate functions (classification_report() and confusion_matrix()). Print also the accuracy and the f1-score.
Hint: If we are interested in one particular metric and we do not need the whole classification report, there are individual functions to call (e.g., accuracy_score will print the accuracy).
# accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
# print("\nLogistic Regression - Accuracy:", accuracy_logistic)
print("Classification Report:")
print(classification_report(y_test, y_pred_logistic))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logistic))
print("\n")
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.39      0.56        28
           1       0.80      1.00      0.89        68

    accuracy                           0.82        96
   macro avg       0.90      0.70      0.73        96
weighted avg       0.86      0.82      0.79        96

Confusion Matrix:
[[11 17]
 [ 0 68]]
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
f1_logistic = f1_score(y_test, y_pred_logistic)
print("\nAccuracy for Logistic Regression:", accuracy_logistic)
print("F1-score for Logistic Regression:", f1_logistic)
Accuracy for Logistic Regression: 0.8229166666666666
F1-score for Logistic Regression: 0.888888888888889
We notice that the accuracy is 82%, which means that 82 out of 100 cases were classified correctly. We also notice that the f1-score for the positive class (loan approved) is 0.89.
The rows of the confusion matrix represent the actual classes (true labels), while the columns represent the predicted classes.
The top-left cell contains the number of true negatives (TN), the top-right cell the number of false positives (FP), the bottom-left cell the number of false negatives (FN), and the bottom-right cell the number of true positives (TP).
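To make this mapping concrete, the four counts can be unpacked directly; a small illustration:

# Unpack the 2x2 confusion matrix into its four cells (order: tn, fp, fn, tp).
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_logistic).ravel()
print(f"TN={tn}, FP={fp}, FN={fn}, TP={tp}")   # here: TN=11, FP=17, FN=0, TP=68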
Let's see how other models perform.
The next model that we will try is the SVM model.
12. Create an instance of the Support Vector Classifier (SVC) class using the SVC() constructor and then fit the model to the training data.
Hint: When you create the model, you can use kernel='rbf' as a parameter.
classifier = SVC(kernel='rbf')
classifier.fit(X_train, y_train)
SVC()
13. Generate the predictions on the test data and then print the accuracy and the f1-score
pred_svm = classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, pred_svm)
print("Accuracy for SVM:", accuracy_svm)
f1_svm = f1_score(y_test, pred_svm)
print("F1-score for SVM:", f1_svm)
Accuracy for SVM: 0.7083333333333334
F1-score for SVM: 0.8292682926829268
We can see that SVM performs worse than Logistic Regression.
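One likely reason is that we used the default hyperparameters. As a hedged sketch (the grid values and the param_grid_svm name are illustrative), the rbf SVM's C and gamma could be tuned with GridSearchCV, the same tool we introduce for KNN below:

# Sketch: tune the rbf SVM's regularisation (C) and kernel width (gamma).
param_grid_svm = {'C': [0.1, 1, 10, 100], 'gamma': ['scale', 0.01, 0.1, 1]}
grid_search_svm = GridSearchCV(SVC(kernel='rbf'), param_grid_svm, cv=5, scoring='f1')
grid_search_svm.fit(X_train, y_train)
print(grid_search_svm.best_params_)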
The last model that we will try is KNN. One of the parameters of KNN is the number of neighbours; for this model, we will optimise the number of neighbours n_neighbors. To optimise any of the parameters, we first have to create a dictionary that will hold the candidate values.
14. Create a param_grid_knn dictionary and store the values of 3, 5, 7, 9 and 11 as the number of neighbors.
param_grid_knn = {'n_neighbors': [3, 5, 7, 9, 11]}
15. Call the GridSearchCV constructor to create an object grid_search_knn. As the first argument use the constructor of the KNN, that is KNeighborsClassifier(). The second argument is the param_grid_knn dictionary. To find the best parameter we have to apply cross-validation: use cv=5 and optimise for scoring='f1'.
grid_search_knn = GridSearchCV(KNeighborsClassifier(), param_grid_knn, cv=5, scoring='f1')
16. Fit the model (grid_search_knn) to the training data. After fitting, the grid search refits the best configuration on the whole training set, so grid_search_knn can be used directly as the model with the best parameter.
grid_search_knn.fit(X_train, y_train)
GridSearchCV(cv=5, estimator=KNeighborsClassifier(), param_grid={'n_neighbors': [3, 5, 7, 9, 11]}, scoring='f1')
17. Print the best parameters for the KNN using grid_search_knn.best_params_
print("\nBest hyperparameters for KNN:", grid_search_knn.best_params_)
Best hyperparameters for KNN: {'n_neighbors': 7}
We see that the best performance is obtained when the number of neighbors is set to 7.
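If you are curious how the other candidate values fared, the cross-validation scores are stored in cv_results_; an illustrative peek:

# Mean cross-validated f1 for each candidate number of neighbours.
for k, score in zip(param_grid_knn['n_neighbors'],
                    grid_search_knn.cv_results_['mean_test_score']):
    print(f"n_neighbors={k}: mean f1 = {score:.3f}")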
18. Generate the predictions of the grid_search_knn.
y_pred_best_knn = grid_search_knn.predict(X_test)
19. Calculate the metrics of accuracy and f1-score
accuracy_best_knn = accuracy_score(y_test, y_pred_best_knn)
f1_best_knn = f1_score(y_test, y_pred_best_knn)
print("Accuracy of KNN with best hyperparameters for KNN:", accuracy_best_knn)
print("F1-score of KNN with best hyperparameters for KNN:", f1_best_knn)
Accuracy of KNN with best hyperparameters for KNN: 0.7604166666666666
F1-score of KNN with best hyperparameters for KNN: 0.8535031847133758
To further evaluate the performance of our K-Nearest Neighbors (KNN) classifier, we will also output the Receiver Operating Characteristic (ROC) curve and the ROC Area Under the Curve (AUC).
20. Compute the ROC curve and the ROC area under the curve. Let's do that for the KNN using the roc_curve and roc_auc_score functions.
# Compute ROC curve and ROC area under the curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_best_knn)
roc_auc = roc_auc_score(y_test, y_pred_best_knn)
21. Plot the ROC curve
# Plot ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, color='blue', lw=2, label='ROC curve (AUC = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='red', linestyle='--', lw=2, label='Random Guess')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.show()
Observations
Based on the ROC curve plot, the following observations can be made:
ROC Curve Position: The blue ROC curve is above the red line of random guessing, so the classifier performs better than random.
AUC Value: The AUC of 0.60 means that the classifier's ability to differentiate between the classes is modest.
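Note that we passed the hard 0/1 predictions to roc_curve, which yields a curve with a single bend. A hedged alternative sketch (the y_proba_knn name is illustrative) uses the predicted probability of class 1 instead, which produces one point per threshold and usually a smoother curve; the AUC value will change accordingly:

# Sketch: ROC from class-1 probabilities instead of hard labels.
y_proba_knn = grid_search_knn.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_proba_knn)
roc_auc = roc_auc_score(y_test, y_proba_knn)
print("AUC from probabilities:", roc_auc)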
22. Compare the performance of K-Nearest Neighbors (KNN), Support Vector Machine (SVM), and Logistic Regression classifiers on the dataset. Use GridSearchCV for hyperparameter tuning for the KNN classifier and evaluate all models using accuracy and F1-score metrics.
# KNN with GridSearchCV
print("Best KNN Parameters:", grid_search_knn.best_params_)
print("Accuracy for KNN:", accuracy_best_knn)
print("F1-score for KNN:", f1_best_knn)
# SVM Classifier
accuracy_svm = accuracy_score(y_test, pred_svm)
f1_svm = f1_score(y_test, pred_svm)
print("\nAccuracy for SVM:", accuracy_svm)
print("F1-score for SVM:", f1_svm)
# Logistic Regression
accuracy_logistic = accuracy_score(y_test, y_pred_logistic)
f1_logistic = f1_score(y_test, y_pred_logistic)
print("\nAccuracy for Logistic Regression:", accuracy_logistic)
print("F1-score for Logistic Regression:", f1_logistic)
# Print Classification Reports and Confusion Matrices
print("\nKNN Classification Report:")
print(classification_report(y_test, y_pred_best_knn))
print("KNN Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_best_knn))
print("\nSVM Classification Report:")
print(classification_report(y_test, pred_svm))
print("SVM Confusion Matrix:")
print(confusion_matrix(y_test, pred_svm))
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred_logistic))
print("Logistic Regression Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_logistic))
Best KNN Parameters: {'n_neighbors': 7}
Accuracy for KNN: 0.7604166666666666
F1-score for KNN: 0.8535031847133758

Accuracy for SVM: 0.7083333333333334
F1-score for SVM: 0.8292682926829268

Accuracy for Logistic Regression: 0.8229166666666666
F1-score for Logistic Regression: 0.888888888888889

KNN Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.21      0.34        28
           1       0.75      0.99      0.85        68

    accuracy                           0.76        96
   macro avg       0.80      0.60      0.60        96
weighted avg       0.78      0.76      0.70        96

KNN Confusion Matrix:
[[ 6 22]
 [ 1 67]]

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        28
           1       0.71      1.00      0.83        68

    accuracy                           0.71        96
   macro avg       0.35      0.50      0.41        96
weighted avg       0.50      0.71      0.59        96

SVM Confusion Matrix:
[[ 0 28]
 [ 0 68]]

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.39      0.56        28
           1       0.80      1.00      0.89        68

    accuracy                           0.82        96
   macro avg       0.90      0.70      0.73        96
weighted avg       0.86      0.82      0.79        96

Logistic Regression Confusion Matrix:
[[11 17]
 [ 0 68]]
/usr/local/lib/python3.10/dist-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
Here are the key observations for KNN, SVM, and Logistic Regression performance:
Metric | KNN | SVM | Logistic Regression
---|---|---|---
Accuracy | 76.04% | 70.83% | 82.29%
F1-score | 85.35% | 82.92% | 88.89%
As we can see, Logistic Regression has the highest performance. The F1-score for KNN is quite high (85.35%), indicating that it balances precision and recall well for the positive class; however, the accuracy of KNN is clearly lower (76.04%).
Looking at the confusion matrix counts, KNN correctly classifies 67 instances of Class 1 but misclassifies 22 instances of Class 0 as Class 1. Logistic Regression correctly classifies all 68 instances of Class 1 but misclassifies 17 instances of Class 0 as Class 1. SVM simply predicts Class 1 for every instance, which is why its precision and recall for Class 0 are 0.00 and why sklearn raised the UndefinedMetricWarning shown above.
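Finally, since the test set is small (96 rows), scores from a single split can be noisy. As a last hedged check, cross_val_score (imported at the top of this practical but not used so far) can compare the three models on the training data; the models list name is illustrative:

# Sketch: 5-fold cross-validated f1 on the training data for each model.
models = [('Logistic Regression', LogisticRegression(max_iter=1000)),
          ('SVM (rbf)', SVC(kernel='rbf')),
          ('KNN (k=7)', KNeighborsClassifier(n_neighbors=7))]
for name, model in models:
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1')
    print(f"{name}: mean f1 = {scores.mean():.3f} (+/- {scores.std():.3f})")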