# bank_personal_loan_modelling.py

# -*- coding: utf-8 -*-
"""Bank_Personal_Loan_Modelling.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/gist/kunalvasudevan/018ef013e01421a8081a14da22f22fc5/copy-of-aiml2.ipynb

Marketing Campaign for Banking Products

Data Description:
The file Bank.xls contains data on 5000 customers. The data include customer
demographic information (age, income, etc.), the customer's relationship with the bank
(mortgage, securities account, etc.), and the customer response to the last personal
loan campaign (Personal Loan).
Among these 5000 customers, only 480 (= 9.6%) accepted the personal loan that was
offered to them in the earlier campaign.

Context:
The bank has a growing customer base. The bank wants to increase borrowers (asset
customers) base to bring in more loan business and earn more through the interest on
loans. So, the bank wants to convert the liability based customers to personal loan
customers. (while retaining them as depositors). A campaign that the bank ran last year
for liability customers showed a healthy conversion rate of over 9% success. The
department wants you to build a model that will help them identify the potential
customers who have a higher probability of purchasing the loan. This will increase the
success ratio while at the same time reduce the cost of the campaign.

Attribute Information:
● ID: Customer ID

● Age: Customer's age in completed years

● Experience: #years of professional experience

● Income: Annual income of the customer 

● ZIP Code: Home Address ZIP code.

● Family: Family size of the customer

● CCAvg: Avg. spending on credit cards per month ($000)

● Education: Education Level. 1: Undergrad; 2: Graduate; 
   3: Advanced/Professional

● Mortgage: Value of house mortgage if any. ($000)

● Personal Loan: Did this customer accept the personal loan offered in the last
campaign?

● Securities Account: Does the customer have a securities account with the bank?

● CD Account: Does the customer have a certificate of deposit (CD) account with
the bank?

● Online: Does the customer use internet banking facilities?

● Credit card: Does the customer use a credit card issued by the bank?

Objective:
The classification goal is to predict the likelihood of a liability customer buying personal
loans.

# **Task: 1**

**Import the datasets and libraries, check datatype, statistical summary, shape, null
values etc**
"""

# Commented out IPython magic to ensure Python compatibility.
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.decomposition import PCA
from sklearn import model_selection
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
# %matplotlib inline
import seaborn as sns

# Load the 'Data' sheet of the campaign workbook into a DataFrame.
Dataset = pd.read_excel('Bank loan.xlsx', sheet_name='Data')

# Display the top 5 rows.
Dataset.head()

# Display the bottom 5 rows.
Dataset.tail()

# Display the dtype and the non-null row count of each column.
Dataset.info()

# Summary stats (count, mean, std, min, max, etc.), transposed to one row per column.
Dataset.describe().T

# `shape` is an attribute holding a tuple, not a method; calling it raises TypeError.
Dataset.shape

"""Observation

No Missing Values
"""

# Per-column tally of missing vs. present cells.
# pd.Series.value_counts is used because the top-level pd.value_counts is deprecated.
Dataset.isna().apply(pd.Series.value_counts)

# Number of null values present in each column.
Dataset.isnull().sum()

"""# **Task: 2**

**Check if you need to clean the data for any of the variables**

Observation

    Column 'Experience' has negative values

Replacing the negative values with the median value of the column.

**Checking whether there are any negative values in it.**
"""

# True when at least one row has a negative 'Experience' value.
(Dataset['Experience'] < 0).any()

"""**Changing the negative Value into median value**"""

# Assign the cleaned column back onto the frame. Calling replace(inplace=True) on the
# result of `.loc[...]` can operate on a temporary object and leave Dataset unchanged.
# Masking on `< 0` covers every negative value rather than only -1, -2 and -3.
exp_med = Dataset['Experience'].median()
Dataset['Experience'] = Dataset['Experience'].mask(Dataset['Experience'] < 0, exp_med)

"""**Checking in the table**"""

Dataset.describe()

# Pairwise correlation between the target and four numeric columns.
Dataset[['Personal Loan', 'Age', 'Income', 'CCAvg', 'Mortgage']].corr()

# Correlation between years of experience and age.
experience = Dataset['Experience']
age = Dataset['Age']
correlation = experience.corr(age)
correlation

# Keep the figure and axes in their own names; `Dataset.ax = ...` attached them to the
# DataFrame as an ad-hoc attribute instead.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(Dataset.corr(), cmap='plasma', annot=True, ax=ax)

# `info` has to be called; the bare attribute only evaluates to the bound method.
Dataset.info()

# Remove the ID and ZIP Code columns for the rest of the analysis.
Dataset = Dataset.drop(['ID', 'ZIP Code'], axis=1)
Dataset.head()

"""# **Task: 3**

**EDA: Study the data distribution in each attribute and target variable, share your
findings.**

● Number of unique in each column
"""

# Count of distinct values in every column.
Dataset.nunique()

"""● Number of people with zero mortgage"""

Dataset['Mortgage'].eq(0).sum()

"""● Number of people with zero credit card spending per month"""

Dataset['CCAvg'].eq(0).sum()

"""● Value counts of all categorical columns."""

Dataset['Family'].value_counts()

# Horizontal bar per family size.
sns.countplot(data=Dataset, y='Family')

"""● Univariate and Bivariate analysis

**Univariate Analysis of the continuous variables before Skewness**
"""

# One histogram per continuous variable, placed in the first five cells of a 5x3 grid.
plt.figure(figsize=(40.5, 40.5))

# (column, bar colour, x-axis label) for each panel, in plotting order.
hist_specs = [
    ('Age', 'lightblue', 'Age'),
    ('Experience', 'darkblue', 'Experience'),
    ('Income', 'purple', 'Income'),
    ('CCAvg', 'green', 'Credit Card Average'),
    ('Mortgage', 'yellow', 'Mortgage'),
]
for position, (column, fill, label) in enumerate(hist_specs, start=1):
    plt.subplot(5, 3, position)
    plt.hist(Dataset[column], color=fill, edgecolor='black')
    plt.xlabel(label)

plt.show()

"""Observation
 
Checking for Skewness of data
"""

# Checking for Skewness of data

import statsmodels.api as sm
import scipy.stats as stats

# Skewness of each continuous column, collected into a one-column table indexed by name.
skew_columns = ['Mortgage', 'Income', 'CCAvg', 'Experience', 'Age']
Skewness = pd.DataFrame(
    {'Skewness': [stats.skew(Dataset[name]) for name in skew_columns]},
    index=skew_columns,
)
Skewness

"""Observation

Income, Credit card Average & Mortgage are highly skewed

**Univariate Analysis of the continuous variables for the specified variable which has high skewness**
"""

# Histograms of the three variables reported above as highly skewed
# (Income, CCAvg, Mortgage). The previous version re-plotted Age/Experience/Income
# under axis labels that did not match the plotted data.
plt.figure(figsize=(20, 20))

plt.subplot(5, 3, 1)
plt.hist(Dataset.Income, color='purple', edgecolor='black')
plt.xlabel('Income')
plt.ylabel('Count')

plt.subplot(5, 3, 2)
plt.hist(Dataset.CCAvg, color='green', edgecolor='black')
plt.xlabel('Credit Card Average')
plt.ylabel('Count')

plt.subplot(5, 3, 3)
plt.hist(Dataset.Mortgage, color='yellow', edgecolor='black')
plt.xlabel('Mortgage')
plt.ylabel('Count')

plt.show()

"""Multivariate Analysis: Pair plot:

A pair plot gives a clear view of all variables and their relationships with all other variables.
"""

# Grid of pairwise plots over the columns of Dataset.
sns.pairplot(Dataset)
plt.show()

"""# **Apply necessary transformations for the feature variables**

**To check how many user are buying personal loan**
"""

sns.set(color_codes=True, style="ticks")
sns.countplot(data=Dataset, y='Personal Loan')

# Class frequencies of the target, shown as a one-column frame.
loan_counts = Dataset["Personal Loan"].value_counts()
loan_counts.to_frame()

"""From the above Picture, We can say that the data is imbalanced"""

# Family size against the target: as a box plot, then as grouped counts.
sns.boxplot(data=Dataset, x='Personal Loan', y='Family')

sns.countplot(data=Dataset, x='Personal Loan', hue='Family')

"""Observation

The graphs show that persons who have a personal loan have a higher credit card average.
 
It is clearly visible that as the number of family members increases, the necessity of a loan also increases.
 
It is clear that as the income increases (approx. 100K) the mortgage value also increases gradually, along with the necessity of a personal loan.
"""

# Income against three other features, coloured by loan acceptance.
# x and y are passed by keyword: seaborn 0.12+ accepts only `data` positionally.
plt.figure(figsize=(12, 12))

plt.subplot(3, 1, 1)
sns.scatterplot(x='CCAvg', y='Income', hue='Personal Loan', data=Dataset, palette=['orange', 'blue'])

plt.subplot(3, 1, 2)
sns.scatterplot(x='Family', y='Income', hue='Personal Loan', data=Dataset, palette=['pink', 'green'])

plt.subplot(3, 1, 3)
sns.scatterplot(x='Mortgage', y='Income', hue='Personal Loan', data=Dataset, palette=['yellow', 'red'])

# Account ownership counts, split by loan acceptance.
plt.figure(figsize=(15, 15))

plt.subplot(2, 2, 1)
# The column is named 'Securities Account' (see the attribute list in the file header);
# 'Account security' is not a column of the dataset.
sns.countplot(x='Securities Account', data=Dataset, hue='Personal Loan')

plt.subplot(2, 2, 2)
sns.countplot(x='CD Account', data=Dataset, hue='Personal Loan')

"""Observation

The majority of customers who do not have a loan hold a securities account, whereas only a small proportion of customers who have a loan hold one; the majority of them do not have a securities account.
Customers who do not have a CD account do not have a loan either, but almost all customers who have a CD account have a loan as well.
"""

# Overlay the income distribution of customers with Personal Loan == 0 (red)
# and Personal Loan == 1 (green).
for loan_flag, shade in ((0, 'r'), (1, 'g')):
    income_subset = Dataset.loc[Dataset["Personal Loan"] == loan_flag, 'Income']
    sns.distplot(income_subset, color=shade)

"""Observation

The graph shows that those who have a personal loan also have a higher income.

Observation

'Age' has a very strong association with 'Experience', but neither has any effect on the loan attribute.
It seems that customers with education level 1 have a higher income, which is nearly equal to that of the customers who have taken the personal loan.
Customers with education levels 2 and 3 seem to take a personal loan as they have a high mortgage.
"""

# Correlation of Income, CCAvg and Mortgage with the target, drawn as a bar chart.
loan_corr = Dataset[['Personal Loan', 'Income', 'CCAvg', 'Mortgage']].corr()
# Skip the first entry, which is the target's correlation with itself.
loan_corr['Personal Loan'].iloc[1:].plot.bar()

"""Observation

The above diagram gives a clear view of the correlation between the independent variables and the dependent variable; we see that 'Income' and 'Credit Card Average' have some correlation with 'Personal Loan'.

# **Splitting into train and test data**
"""

# Separate the feature matrix from the target and hold out 25% of the rows for testing.
X = Dataset.drop('Personal Loan', axis=1).values
y = Dataset['Personal Loan'].values
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Fit the scaler on the training rows only, then apply the same transform to the test
# rows. Previously X_test was left unscaled, so it did not match what the models saw.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Same rule for PCA: fit on train, project both splits onto the 10 fitted components so
# X_train and X_test end up with the same number of columns.
pca = PCA(n_components=10)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

X_train.shape

"""# **Logistic Regression**"""

# 'ID' has already been removed from Dataset earlier in the script, so dropping it again
# raised KeyError. errors='ignore' makes the drop safe whether or not it is still present.
loan_features = Dataset.drop(columns=['ID', 'Experience'], errors='ignore')
train_set, test_set = train_test_split(loan_features, test_size=0.3, random_state=100)

# Pop the target out of each split so only the features remain in train_set/test_set.
train_labels = train_set.pop('Personal Loan')
test_labels = test_set.pop('Personal Loan')

train_set_indep = loan_features.drop(labels="Personal Loan", axis=1)
train_set_dep = Dataset["Personal Loan"]
X = np.array(train_set_indep)
Y = np.array(train_set_dep)
# Positional split at row 3500. The test slices used to start at 3501, which silently
# left row 3500 out of both parts.
X_Train = X[:3500, :]
X_Test = X[3500:, :]
Y_Train = Y[:3500]
Y_Test = Y[3500:]

# NOTE(review): the model is fitted on X_train/Y_train from the earlier
# train_test_split section, not on the X_Train/Y_Train slices built above.
logmodel = LogisticRegression()
logmodel.fit(X_train, Y_train)

"""**Train**"""

# Class predictions and class probabilities on the training split.
predict = logmodel.predict(X_train)
predictProb = logmodel.predict_proba(X_train)

from sklearn import metrics
print(metrics.accuracy_score(Y_train, predict))

# Classification Report
print(classification_report(Y_train, predict))

# Confusion Matrix
cm = confusion_matrix(Y_train, predict)

# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm = pd.DataFrame(cm, index=class_label, columns=class_label)
sns.heatmap(df_cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

"""**Test**"""

# Class predictions and class probabilities on the test split.
predict = logmodel.predict(X_test)
predictProb = logmodel.predict_proba(X_test)

print(metrics.accuracy_score(Y_test, predict))

# Classification Report
print(classification_report(Y_test, predict))

# Confusion Matrix
cm = confusion_matrix(Y_test, predict)

# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm = pd.DataFrame(cm, index=class_label, columns=class_label)
sns.heatmap(df_cm, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

"""# **Linear Regression**"""

# Reload the raw sheet into its own variable. Rebinding `Dataset` to a NumPy array here
# broke the later plots that index Dataset by column name.
raw_data = pd.read_excel('Bank loan.xlsx', sheet_name='Data')

# Features are every column except the target. Previously y was an exact copy of x,
# so the regression only learned the identity mapping.
# NOTE(review): 'Personal Loan' is used as the target because it is the stated
# objective of the file - confirm this is the intended regression target.
x = raw_data.drop(columns='Personal Loan').values
y = raw_data['Personal Loan'].values
x, y

# A scatter plot needs x and y of equal length, so plot a single feature against the
# target. Income is used because the EDA above reports it as correlated with the target.
plt.scatter(raw_data['Income'], y)

model = LinearRegression()
model.fit(x, y)
y_pred = model.predict(x)
y_pred

# Model: fitted once, on the training split only.
linearregression = LinearRegression()
model = linearregression.fit(X_train, Y_train)

# Score on the data the model was fitted on.
prediction = model.predict(X_train)
model.score(X_train, Y_train)

# Score on the held-out split with the same fitted model. The model used to be re-fitted
# on X_test before being scored on X_test, which is not a held-out evaluation.
prediction = model.predict(X_test)
model.score(X_test, Y_test)

# Confusion Matrix
# `predicted` was not defined at this point in the script (it is first assigned in the
# KNN section). LinearRegression outputs continuous values, so threshold the test-set
# `prediction` at 0.5 to obtain 0/1 class labels.
predicted = (prediction >= 0.5).astype(int)
cm1 = confusion_matrix(Y_test, predicted)

# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm1 = pd.DataFrame(cm1, index=class_label, columns=class_label)
sns.heatmap(df_cm1, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

"""# **KNN Algorithm**"""

# Candidate neighbour counts: the odd values among 1..19.
myList = list(range(1, 20))
neighbors = [candidate for candidate in myList if candidate % 2 != 0]

# Accuracy on the test split for each candidate k, in the same order as `neighbors`.
ac_scores = []
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, Y_train)
    Y_Pred = knn.predict(X_test)
    scores = accuracy_score(Y_test, Y_Pred)
    ac_scores.append(scores)

# Misclassification error is the complement of accuracy.
MSE = [1 - accuracy for accuracy in ac_scores]

# Pick the first k that reaches the lowest error.
lowest_error = min(MSE)
optimal_k = neighbors[MSE.index(lowest_error)]
print("The optimal number of neighbors is %d" % optimal_k)

# Build the classifier with the k selected by the search above. The train evaluation
# used to reuse whatever classifier the search loop left behind (its last k), and the
# test evaluation hard-coded k=13, so neither was tied to `optimal_k`.
knn = KNeighborsClassifier(n_neighbors=optimal_k, weights='uniform', metric='euclidean')
knn.fit(X_train, Y_train)

# Train-split accuracy.
predicted = knn.predict(X_train)
from sklearn.metrics import accuracy_score
acc = accuracy_score(Y_train, predicted)
print(acc)

# Classification Report
print(classification_report(Y_train, predicted))

# Test-split accuracy with the same fitted classifier.
predicted = knn.predict(X_test)
acc = accuracy_score(Y_test, predicted)
print(acc)

# Classification Report
print(classification_report(Y_test, predicted))

# Misclassification error for each candidate k.
plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
# Finish this figure before drawing the heatmap; otherwise the heatmap is drawn onto
# the same axes as the error curve when the file runs as a script.
plt.show()

# Confusion Matrix
cm1 = confusion_matrix(Y_test, predicted)

# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm1 = pd.DataFrame(cm1, index=class_label, columns=class_label)
sns.heatmap(df_cm1, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

"""# **Naive Bayes**"""

# Model: fitted once on the training split and reused for both evaluations.
naive_model = GaussianNB()
naive_model.fit(X_train, Y_train)

# Train-split predictions and accuracy.
prediction = naive_model.predict(X_train)
naive_model.score(X_train, Y_train)

# Classification Report
print(classification_report(Y_train, prediction))

# Test-split predictions and accuracy.
prediction = naive_model.predict(X_test)
naive_model.score(X_test, Y_test)

# Classification Report
print(classification_report(Y_test, prediction))

# Confusion Matrix
# `prediction` now holds the test-split predictions, so it must be compared with Y_test;
# comparing it with Y_train mixes arrays from two different splits.
cm2 = confusion_matrix(Y_test, prediction)
# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm2 = pd.DataFrame(cm2, index=class_label, columns=class_label)
sns.heatmap(df_cm2, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

"""# **Decision Tree**"""

# Model
decisiontree = DecisionTreeClassifier()
model = decisiontree.fit(X_train, Y_train)

# Single-observation prediction. This used to run before the tree was fitted, so `model`
# still referred to the previous section's estimator, and the hard-coded 4-value row did
# not match the number of feature columns. Use one real training row instead.
observation = X_train[:1]
model.predict(observation)
model.predict_proba(observation)

# Train-split predictions and accuracy.
prediction = model.predict(X_train)
model.score(X_train, Y_train)

# Classification Report
print(classification_report(Y_train, prediction))

# Confusion Matrix
cm2 = confusion_matrix(Y_train, prediction)
# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm2 = pd.DataFrame(cm2, index=class_label, columns=class_label)
sns.heatmap(df_cm2, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Model
decisiontree = DecisionTreeClassifier()
model = decisiontree.fit(X_train, Y_train)

# Test-split predictions and accuracy.
prediction = model.predict(X_test)
model.score(X_test, Y_test)

# Classification Report
print(classification_report(Y_test, prediction))

# Confusion Matrix
cm2 = confusion_matrix(Y_test, prediction)
# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm2 = pd.DataFrame(cm2, index=class_label, columns=class_label)
sns.heatmap(df_cm2, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

"""# **Random Forest**"""

# Model: trained on the training split. It used to be fitted on the test split and then
# scored on the training split.
randomforest = RandomForestClassifier()
model = randomforest.fit(X_train, Y_train)

# Train-split predictions and accuracy.
prediction = model.predict(X_train)
model.score(X_train, Y_train)

# Classification Report
print(classification_report(Y_train, prediction))

# Confusion Matrix
cm3 = confusion_matrix(Y_train, prediction)
# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm3 = pd.DataFrame(cm3, index=class_label, columns=class_label)
sns.heatmap(df_cm3, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Model: trained on the training split. Fitting on X_test and then scoring on X_test,
# as before, evaluates the model on the very rows it was fitted on.
randomforest = RandomForestClassifier()
model = randomforest.fit(X_train, Y_train)

# Test-split predictions and accuracy.
prediction = model.predict(X_test)
model.score(X_test, Y_test)

# Classification Report
print(classification_report(Y_test, prediction))

# Confusion Matrix
cm3 = confusion_matrix(Y_test, prediction)
# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm3 = pd.DataFrame(cm3, index=class_label, columns=class_label)
sns.heatmap(df_cm3, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

"""# **SVM**"""

# Model
svm = SVC()
model = svm.fit(X_train, Y_train)

# Train-split predictions and accuracy.
prediction = model.predict(X_train)
model.score(X_train, Y_train)

# Classification Report
print(classification_report(Y_train, prediction))

# Confusion Matrix
cm4 = confusion_matrix(Y_train, prediction)
# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm4 = pd.DataFrame(cm4, index=class_label, columns=class_label)
sns.heatmap(df_cm4, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Model: trained on the training split. Fitting on X_test and then scoring on X_test,
# as before, evaluates the model on the very rows it was fitted on.
svm = SVC()
model = svm.fit(X_train, Y_train)

# Test-split predictions and accuracy.
prediction = model.predict(X_test)
model.score(X_test, Y_test)

# Classification Report
print(classification_report(Y_test, prediction))

# Confusion Matrix
cm4 = confusion_matrix(Y_test, prediction)
# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm4 = pd.DataFrame(cm4, index=class_label, columns=class_label)
sns.heatmap(df_cm4, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

"""# **AdaBoost**"""

# Model
adaboost = AdaBoostClassifier()
model = adaboost.fit(X_train, Y_train)

# Train-split predictions and accuracy.
prediction = model.predict(X_train)
model.score(X_train, Y_train)

# Classification Report
print(classification_report(Y_train, prediction))

# Confusion Matrix
cm5 = confusion_matrix(Y_train, prediction)
# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm5 = pd.DataFrame(cm5, index=class_label, columns=class_label)
sns.heatmap(df_cm5, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

# Model: trained on the training split. Fitting on X_test and then scoring on X_test,
# as before, evaluates the model on the very rows it was fitted on.
adaboost = AdaBoostClassifier()
model = adaboost.fit(X_train, Y_train)

# Test-split predictions and accuracy.
prediction = model.predict(X_test)
model.score(X_test, Y_test)

# Classification Report
print(classification_report(Y_test, prediction))

# Confusion Matrix
cm6 = confusion_matrix(Y_test, prediction)
# confusion_matrix orders rows/columns by sorted label value, so index 0 is class 0
# (Negative) and index 1 is class 1 (Positive). The labels were previously reversed.
class_label = ["Negative", "Positive"]
df_cm6 = pd.DataFrame(cm6, index=class_label, columns=class_label)
sns.heatmap(df_cm6, annot=True, fmt="d")
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()

"""**violin plot:**

Violin plots are like box plots, but they also show the probability density (PDF) of the data alongside the box plot. They look like a violin.
"""

# Distribution of Age for each family size.
# The stray size='8' argument was removed: it is not a violinplot parameter.
# NOTE(review): this needs `Dataset` to be a DataFrame with 'Family' and 'Age' columns at
# this point - confirm nothing earlier in the script has rebound it to an array.
sns.violinplot(x="Family", y="Age", data=Dataset)
plt.show()

"""**CDF (cumulative distribution function), PDF (probability density function):**"""

# Ten-bin density histogram of Age, rescaled so the bin values sum to 1.
count, bin_edges = np.histogram(Dataset['Age'], bins=10, density=True)
plt.xlabel('Age')
pdf = count / sum(count)
print("pdf=", pdf)
print("bin_edges= ", bin_edges)

# Running total of the per-bin values.
cdf = np.cumsum(pdf)
print("cdf=", cdf)

# Plot both curves against the upper edge of each bin.
upper_edges = bin_edges[1:]
plt.plot(upper_edges, pdf)
plt.plot(upper_edges, cdf)