
BREAST CANCER CLASSIFICATION USING SUPPORT VECTOR MACHINES

Support Vector Machine

SVM is one of the most popular algorithms for high-dimensional data. Its goal is to find a decision boundary that separates the data of different classes. We will discuss in detail how that works, implement the algorithm with scikit-learn, and apply it to real-life problems, including our main project of cancer prediction. We will also show how different tactics can further improve the accuracy.
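As a minimal sketch of the idea (toy 2-D data for illustration only, not the cancer dataset used below), a linear SVC fit on two separable point clouds learns a maximum-margin boundary defined by a handful of support vectors:

# Illustrative only: fit a linear SVM on two small, linearly separable point clouds
from sklearn.svm import SVC
import numpy as np

X_toy = np.array([[1, 2], [2, 3], [2, 1], [6, 5], [7, 7], [8, 6]])
y_toy = np.array([0, 0, 0, 1, 1, 1])

clf = SVC(kernel='linear').fit(X_toy, y_toy)
print(clf.support_vectors_)           # the points that define the margin
print(clf.predict([[3, 2], [7, 6]]))  # expected: [0 1] (one point near each cloud)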

PROBLEM STATEMENT

Here we have a dataset of different patients, showing the characteristics of cells that were suspected to be cancerous. After thorough diagnosis, each case was determined to be either Malignant (fatal) or Benign (not so harmful). We will use a machine learning algorithm, the Support Vector Machine classification technique, to classify the cells as Malignant or Benign using Python. Then we will compare our predictions with the original labels to check the model's accuracy.

  • Predicting if the cancer diagnosis is benign or malignant based on several observations/features
  • 30 features are used, examples:

    - radius (mean of distances from center to points on the perimeter)
    - texture (standard deviation of gray-scale values)
    - perimeter
    - area
    - smoothness (local variation in radius lengths)
    - compactness (perimeter^2 / area - 1.0)
    - concavity (severity of concave portions of the contour)
    - concave points (number of concave portions of the contour)
    - symmetry 
    - fractal dimension ("coastline approximation" - 1)
  • Datasets are linearly separable using all 30 input features

  • Number of Instances: 569
  • Class Distribution: 212 Malignant, 357 Benign
  • Target class:
    - Malignant
    - Benign

Importing Libraries

In [45]:
# import libraries 
import pandas as pd # Import Pandas for data manipulation using dataframes
import numpy as np # Import Numpy for data statistical analysis 
import matplotlib.pyplot as plt # Import matplotlib for data visualisation
import seaborn as sns # Statistical data visualization
# %matplotlib inline
import warnings
warnings.filterwarnings("ignore")

Load the data - built-in dataset from the Scikit-Learn package

In [46]:
# Import the cancer data from the sklearn library
from sklearn.datasets import load_breast_cancer
data = load_breast_cancer()
# Read the DataFrame, first using the feature data
df = pd.DataFrame(data.data, columns=data.feature_names)
# Add a target column, and fill it with the target data
df['target'] = data.target
# Show the first five rows
df.head()
Out[46]:
mean radius mean texture mean perimeter mean area mean smoothness mean compactness mean concavity mean concave points mean symmetry mean fractal dimension ... worst texture worst perimeter worst area worst smoothness worst compactness worst concavity worst concave points worst symmetry worst fractal dimension target
0 17.99 10.38 122.80 1001.0 0.11840 0.27760 0.3001 0.14710 0.2419 0.07871 ... 17.33 184.60 2019.0 0.1622 0.6656 0.7119 0.2654 0.4601 0.11890 0
1 20.57 17.77 132.90 1326.0 0.08474 0.07864 0.0869 0.07017 0.1812 0.05667 ... 23.41 158.80 1956.0 0.1238 0.1866 0.2416 0.1860 0.2750 0.08902 0
2 19.69 21.25 130.00 1203.0 0.10960 0.15990 0.1974 0.12790 0.2069 0.05999 ... 25.53 152.50 1709.0 0.1444 0.4245 0.4504 0.2430 0.3613 0.08758 0
3 11.42 20.38 77.58 386.1 0.14250 0.28390 0.2414 0.10520 0.2597 0.09744 ... 26.50 98.87 567.7 0.2098 0.8663 0.6869 0.2575 0.6638 0.17300 0
4 20.29 14.34 135.10 1297.0 0.10030 0.13280 0.1980 0.10430 0.1809 0.05883 ... 16.67 152.20 1575.0 0.1374 0.2050 0.4000 0.1625 0.2364 0.07678 0

5 rows × 31 columns

If the target column is 0 it means the tumour is malignant (fatal), and if it is 1 it means it is benign (not so harmful)
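This mapping can be confirmed directly from the dataset object loaded above (a quick optional check):

# target_names[i] gives the class name encoded as integer i
print(data.target_names)              # ['malignant' 'benign']  -> 0 = malignant, 1 = benign
print(df['target'].value_counts())    # 357 benign (1) vs 212 malignant (0)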

The rest of the columns show the different characteristics of the cells found in the patient's body

VISUALIZING THE DATA

Here we show histograms and pairwise scatter plots for a few of the mean features, coloured by the value of the target.

Observation: There is a clear demarcation between the target values (0 and 1) with respect to the other variables. We can conclude that the independent variables (all variables except target) are likely to predict the dependent variable (target) to a great extent.

In [47]:
sns.pairplot(df, hue = 'target', vars = ['mean radius', 'mean texture', 'mean area', 'mean perimeter', 'mean smoothness'] )
Out[47]:
<seaborn.axisgrid.PairGrid at 0x2186c4a8>

Now we check the frequency distribution of the target column.

Observation: The counts of 1s and 0s are of a similar order (357 vs 212), so we do not have a serious class-imbalance problem.

In [48]:
sns.countplot(x='target', data=df, label="Target")
Out[48]:
<AxesSubplot:xlabel='target', ylabel='count'>
In [49]:
sns.scatterplot(x = 'mean area', y = 'mean smoothness', hue = 'target', data = df)
Out[49]:
<AxesSubplot:xlabel='mean area', ylabel='mean smoothness'>
In [50]:
sns.lmplot(x='mean area', y='mean smoothness', hue='target', data=df, fit_reg=False)
Out[50]:
<seaborn.axisgrid.FacetGrid at 0x1f2c2320>
In [51]:
# Let's check the correlation between the variables 
# Strong correlation between mean radius and mean perimeter, and between mean area and mean perimeter
plt.figure(figsize=(20,10)) 
sns.heatmap(df.corr(), annot=True) 
Out[51]:
<AxesSubplot:>
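The comment above singles out a few strongly correlated pairs; they can also be listed programmatically (a small sketch, not part of the original notebook):

# Rank feature pairs by absolute correlation, keeping only the upper triangle of the matrix
corr = df.drop(columns='target').corr().abs()
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
print(upper.stack().sort_values(ascending=False).head(5))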

Checking for missing values

In [52]:
# number of missing values by variables
df.isnull().sum()
Out[52]:
mean radius                0
mean texture               0
mean perimeter             0
mean area                  0
mean smoothness            0
mean compactness           0
mean concavity             0
mean concave points        0
mean symmetry              0
mean fractal dimension     0
radius error               0
texture error              0
perimeter error            0
area error                 0
smoothness error           0
compactness error          0
concavity error            0
concave points error       0
symmetry error             0
fractal dimension error    0
worst radius               0
worst texture              0
worst perimeter            0
worst area                 0
worst smoothness           0
worst compactness          0
worst concavity            0
worst concave points       0
worst symmetry             0
worst fractal dimension    0
target                     0
dtype: int64

Observation: There are no missing values in the data

Data split

Segregating the independent variables as X and the dependent variable as y

In [53]:
# Let's drop the target label column
X = df.drop(['target'],axis=1)
In [54]:
y = df['target']

Now we will split the data into a training set (80% of the data) and a test set (the remaining 20%), which will be kept aside for later evaluation.

In [55]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=5)
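A quick sanity check of the split sizes (an optional addition); passing stratify=y to train_test_split would additionally preserve the 357/212 class ratio in both sets:

# 80/20 split of 569 rows -> 455 training rows and 114 test rows, 30 features each
print(X_train.shape, X_test.shape)    # (455, 30) (114, 30)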

MODEL TRAINING using Scikit Learn

We will use the Support Vector Machine algorithm from the Scikit-Learn package, with all of the package's default parameters. Later, we will tune the hyperparameters to further increase the accuracy.
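For reference, in recent scikit-learn versions SVC() defaults to an RBF kernel with C=1.0 and gamma='scale', so the call below is equivalent with the defaults spelled out (exact defaults can vary between versions):

# Spelled-out equivalent of SVC() with its default hyperparameters (recent scikit-learn)
svc_model = SVC(kernel='rbf', C=1.0, gamma='scale')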

In [56]:
from sklearn.svm import SVC 
from sklearn.metrics import classification_report, confusion_matrix

svc_model = SVC()
svc_model.fit(X_train, y_train)
Out[56]:
SVC()

EVALUATING THE MODEL

Once the model is trained, we use it to predict the labels of the test data.

In [57]:
y_predict = svc_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
sns.heatmap(cm, annot=True)
Out[57]:
<AxesSubplot:>
In [58]:
print(classification_report(y_test, y_predict))
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        48
           1       0.90      1.00      0.95        66

    accuracy                           0.94       114
   macro avg       0.95      0.93      0.94       114
weighted avg       0.94      0.94      0.94       114

Overall accuracy is 94%, and the precision for classes 0 and 1 is 100% and 90% respectively.

Improving the model using feature scaling

In [59]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
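Why scaling helps here: the RBF kernel is distance-based, so features measured on large scales (e.g. mean area, in the thousands) would otherwise dominate features on small scales (e.g. smoothness, around 0.1). A quick check that the transform behaved as expected (a sketch, not in the original notebook):

# After StandardScaler, each training feature has (approximately) zero mean and unit variance
print(np.round(X_train.mean(axis=0)[:3], 3))   # ~ [0. 0. 0.]
print(np.round(X_train.std(axis=0)[:3], 3))    # ~ [1. 1. 1.]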

Model training

In [60]:
from sklearn.svm import SVC
svc_model = SVC()
svc_model.fit(X_train, y_train)
Out[60]:
SVC()

Evaluating the model with scaled data

In [61]:
y_predict = svc_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
sns.heatmap(cm, annot=True)
Out[61]:
<AxesSubplot:>
In [62]:
from sklearn.metrics import classification_report, confusion_matrix
y_predict = svc_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
print(classification_report(y_test, y_predict))
              precision    recall  f1-score   support

           0       0.98      0.94      0.96        48
           1       0.96      0.98      0.97        66

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

Observation: The accuracy has improved to 96% from the previous model's 94%

IMPROVING THE MODEL - Hyperparameter tuning

We will now try different values of the C parameter and gamma, along with different kernels, to fine-tune the model and achieve a higher accuracy. Roughly speaking, C controls how strongly misclassified training points are penalised (a smaller C means stronger regularisation and a wider margin), gamma controls how far the influence of a single training example reaches with the RBF kernel, and the kernel determines the shape of the decision boundary.

In [63]:
param_grid = {'C': [0.001, 0.1, 1], 'gamma': [1, 0.1, 0.01, 0.001], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']} 
In [64]:
from sklearn.model_selection import GridSearchCV
grid = GridSearchCV(SVC(),param_grid,refit=True)
grid.fit(X_train,y_train)
Out[64]:
GridSearchCV(estimator=SVC(),
             param_grid={'C': [0.001, 0.1, 1], 'gamma': [1, 0.1, 0.01, 0.001],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})
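For scale, this grid contains 3 × 4 × 4 = 48 parameter combinations, each evaluated with GridSearchCV's default 5-fold cross-validation in recent scikit-learn versions. The mean cross-validated accuracy of the winning combination can also be inspected (a small addition, not in the original notebook):

# Mean cross-validated accuracy of the best parameter combination found by the grid search
print(round(grid.best_score_, 4))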
In [65]:
grid.best_params_
Out[65]:
{'C': 0.1, 'gamma': 1, 'kernel': 'linear'}
In [66]:
grid.best_estimator_
Out[66]:
SVC(C=0.1, gamma=1, kernel='linear')
In [67]:
grid_predictions = grid.predict(X_test)
cm = confusion_matrix(y_test, grid_predictions)
sns.heatmap(cm, annot=True)
Out[67]:
<AxesSubplot:>
In [68]:
print(classification_report(y_test,grid_predictions))
              precision    recall  f1-score   support

           0       0.98      0.96      0.97        48
           1       0.97      0.98      0.98        66

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Observation: The accuracy has further improved to 97% from the previous model's 96%. This will be our final result.

This is how we can check for the best combination of hyperparameters to get the best result.

In [69]:
grid.best_params_
Out[69]:
{'C': 0.1, 'gamma': 1, 'kernel': 'linear'}


Analytics Educator, based in Kolkata, is the best institute for Data Science courses. We specialize in training students from non-technical backgrounds with zero programming or statistical knowledge, and we help them learn data science and get a job in this field. You may check out all our instructor-led courses from this link: https://analyticseducator.com/Courses-Offers.html
