
Bank Customer Retirement Prediction Using Support Vector Machines

Support Vector Machine

A Support Vector Machine (SVM) is a powerful and versatile Machine Learning model, capable of performing linear or nonlinear classification, regression, and even outlier detection. It is one of the most popular models in Machine Learning, and anyone interested in the field should have it in their toolbox. SVMs are particularly well suited to classifying complex small- or medium-sized datasets. In this article, we will take up a classification problem and apply the Support Vector Machine algorithm, explaining the results in simple English to make them easy to understand.

PROBLEM STATEMENT

Suppose you work as a data scientist at a major bank in NYC, and you have been tasked with developing a model that can predict whether a customer is able to retire based on two features: age and net savings (retirement savings in the U.S.). Here Retire is the dependent variable, and Age and Savings are the independent variables. A machine learning algorithm such as the Support Vector Machine (SVM) can be of great help in solving this problem.

You may also apply other classification algorithms and compare their accuracy with the SVM's.
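For instance, a quick baseline could be fitted with logistic regression and compared against the SVM's accuracy. The sketch below is illustrative only; it assumes the X_train/X_test split that we create later in this article.

# Illustrative baseline (assumes the X_train/X_test split created below)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_reg = LogisticRegression()                          # default settings
log_reg.fit(X_train, y_train)                           # fit on the training split
print(accuracy_score(y_test, log_reg.predict(X_test)))  # compare with the SVM's accuracy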

Importing Packages

In [1]:
# import libraries 
import pandas as pd # Import Pandas for data manipulation using dataframes
import numpy as np # Import Numpy for data statistical analysis 
import matplotlib.pyplot as plt # Import matplotlib for data visualisation
import seaborn as sns # Statistical data visualization
import os
import warnings
warnings.filterwarnings("ignore")

Importing the data

In [2]:
os.chdir("D:\\Python\\5")
bank_df = pd.read_csv('Bank_Customer_retirement.csv')
bank_df.head()
Out[2]:
   Customer ID        Age      Savings  Retire
0            0  39.180417  322349.8740       0
1            1  56.101686  768671.5740       1
2            2  57.023043  821505.4718       1
3            3  43.711358  494187.4850       0
4            4  54.728823  691435.7723       1

In this data, a value of 1 in the Retire column means the customer has accumulated enough money to spend the rest of their life comfortably, and 0 means they are not yet ready to retire.

Savings is the amount of money each customer has accumulated, and Age is the customer's age.

Customer ID is a unique identifier for each customer; since it is different for every row, it carries no predictive information. Hence we will drop it from the data.

In [3]:
# Dropping Customer ID
bank_df.drop("Customer ID",axis=1,inplace=True)
bank_df.head(2)
Out[3]:
         Age     Savings  Retire
0  39.180417  322349.874       0
1  56.101686  768671.574       1

VISUALIZING THE DATA

Here we show histograms and scatter plots of Age and Savings, colored by the value of Retire.

Observation: There is a clear demarcation between the Retire values (0 and 1) with respect to Age and Savings. We can conclude that both independent variables (Age and Savings) are likely to predict the dependent variable (Retire) to a great extent.

In [4]:
sns.pairplot(bank_df, hue = 'Retire', vars = ['Age', 'Savings'] )
Out[4]:
<seaborn.axisgrid.PairGrid at 0x1ad5add8>

Now we check the frequency distribution of the Retire column.

Observation: The counts of 1s and 0s are very similar, so we do not have an imbalanced-data problem.

In [26]:
sns.countplot(bank_df['Retire'], label = "Retirement") 
Out[26]:
<AxesSubplot:xlabel='Retire', ylabel='count'>
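As a quick sanity check, the exact class counts behind the plot can also be printed directly; a one-line sketch using the same bank_df:

# Exact class counts behind the count plot
print(bank_df['Retire'].value_counts())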

Checking for missing values

In [6]:
# Number of missing values, by variable
bank_df.isnull().sum()
Out[6]:
Age        0
Savings    0
Retire     0
dtype: int64

Observation: There are no missing values in the data.
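Had any values been missing, a common remedy is imputation before training. A purely illustrative sketch (not needed for this dataset) using scikit-learn's SimpleImputer:

# Illustration only: fill missing Age/Savings values with the column mean
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
bank_df[['Age', 'Savings']] = imputer.fit_transform(bank_df[['Age', 'Savings']])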

Data split

Segregating the independent variables as X and the dependent variable as y

In [7]:
# Let's drop the target label column to keep only the independent variables
X = bank_df.drop(['Retire'],axis=1)
In [8]:
# Let's save the target label column as y
y = bank_df['Retire']

Now we will split the data into a training set (80% of the data) and a test set (the remaining 20%), which will be kept aside for evaluating the model later.

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=1005)
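As a side note, passing stratify=y would keep the 0/1 proportions identical in the training and test sets; a minimal variant of the call above:

# Optional variant: preserve the class ratio in both splits
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=1005, stratify=y)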

MODEL TRAINING using scikit-learn

We will use the Support Vector Machine algorithm from the scikit-learn package, with all of its default parameters. Later, we will tune the hyperparameters to further increase the accuracy.

In [10]:
from sklearn.svm import SVC
svc_model = SVC()
svc_model.fit(X_train, y_train)
Out[10]:
SVC()
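To see exactly which defaults were used (in recent scikit-learn versions, C=1.0, kernel='rbf', and gamma='scale'), we can inspect the fitted model; a quick sketch:

# Inspect the hyperparameters the model was fitted with
print(svc_model.get_params())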

EVALUATING THE MODEL

Once the model is trained, we will use it to generate predictions on the test data.

In [11]:
from sklearn.metrics import classification_report, confusion_matrix
y_predict = svc_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)

Visualizing the confusion matrix

In [12]:
sns.heatmap(cm, annot=True)
Out[12]:
<AxesSubplot:>
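The default heatmap prints the counts in scientific notation and omits axis labels; a slightly more readable sketch:

# More readable confusion matrix: integer annotations and axis labels
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.show()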
In [13]:
print(classification_report(y_test, y_predict))
              precision    recall  f1-score   support

           0       0.94      0.85      0.90        55
           1       0.84      0.93      0.88        45

    accuracy                           0.89       100
   macro avg       0.89      0.89      0.89       100
weighted avg       0.90      0.89      0.89       100

Overall accuracy is 89%, and precision for 0 and 1 is 94% and 84%, respectively.

Improving the model using feature scaling

In [14]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
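Note that the scaler is fitted on the training data only and merely applied to the test data, so no test-set statistics leak into training. A Pipeline bundles scaling and the classifier so this happens automatically; a minimal sketch, assuming the original unscaled X_train:

# Equivalent workflow with a pipeline (expects unscaled features);
# the scaler is fitted only on the data passed to fit()
from sklearn.pipeline import make_pipeline
svc_pipeline = make_pipeline(StandardScaler(), SVC())
svc_pipeline.fit(X_train, y_train)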

Model training

In [15]:
from sklearn.svm import SVC
svc_model = SVC()
svc_model.fit(X_train, y_train)
Out[15]:
SVC()

Evaluating the model with scaled data

In [16]:
from sklearn.metrics import classification_report, confusion_matrix
y_predict = svc_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
print(classification_report(y_test, y_predict))
              precision    recall  f1-score   support

           0       0.95      0.95      0.95        55
           1       0.93      0.93      0.93        45

    accuracy                           0.94       100
   macro avg       0.94      0.94      0.94       100
weighted avg       0.94      0.94      0.94       100

Observation: The accuracy has improved to 94% from the previous model's 89%.
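The improvement is expected: the default RBF kernel is distance-based, and the raw Savings values (in the hundreds of thousands) dwarf Age (in the tens), so without scaling the kernel effectively ignores Age. A quick way to see the scale gap:

# Compare the raw feature scales that motivate standardization
print(bank_df[['Age', 'Savings']].describe().loc[['mean', 'std']])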

IMPROVING THE MODEL - Hyperparameter tuning

We will now try different values of the C parameter and gamma, along with different kernels, to fine-tune the model and achieve higher accuracy.

In [27]:
param_grid = {'C': [0.001, 0.1, 1], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']} 
In [28]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(SVC(),param_grid,refit=True)
grid.fit(X_train,y_train)
Out[28]:
GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.001, 0.1, 1], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})
In [29]:
grid_predictions = grid.predict(X_test)
cm = confusion_matrix(y_test, grid_predictions)
print(classification_report(y_test,grid_predictions))
              precision    recall  f1-score   support

           0       0.96      0.95      0.95        55
           1       0.93      0.96      0.95        45

    accuracy                           0.95       100
   macro avg       0.95      0.95      0.95       100
weighted avg       0.95      0.95      0.95       100

Observation: The accuracy has further improved to 95% from the previous model's 94%. This will be our final result.

We can check which combination of hyperparameters produced the best result:

In [21]:
grid.best_params_
Out[21]:
{'C': 1, 'gamma': 1, 'kernel': 'linear'}
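The grid search also records the mean cross-validated accuracy achieved by this best combination; a quick check:

# Mean cross-validated accuracy of the best parameter combination
print(grid.best_score_)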


Analytics Educator is the best institute for Data Science courses, based in Kolkata. We specialize in teaching data science even to students from non-technical backgrounds with zero programming or statistical knowledge, and we help them get jobs in this field. You may check out all our instructor-led courses at this link: https://analyticseducator.com/Courses-Offers.html
