import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")  # silence library warnings for cleaner notebook output
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix, fbeta_score, roc_auc_score
from sklearn.metrics import classification_report as cr
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import os
os.chdir("C:\\Users\\ASUS\\Desktop")  # adjust to your local data directory
# note: parse_dates=True only parses the index, so ListDate stays a string (it is dropped below anyway)
df = pd.read_csv('Property.csv', parse_dates=True)
pd.set_option('display.max_columns', None)
df.head()
 | Junk | InteriorsStyle | PriceIndex8 | ListDate | Material | PriceIndex9 | Agency | AreaIncomeType | EnvRating | PriceIndex7 | ExpeditedListing | PriceIndex4 | PriceIndex1 | PriceIndex6 | PRIMEUNIT | Channel | Zip | InsurancePremiumIndex | PlotType | Architecture | PriceIndex3 | Region | PriceIndex5 | SubModel | Facade | State | NormalisedPopulation | BuildYear | RegionType | PropertyAge | PriceIndex2
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 6LLJ | 14674.0 | 9/7/2010 | UT7W | 8270 | CAT3 | B | missing | 13143.0 | 0 | 14224.0 | 9217.0 | 10387.0 | missing | Direct | 21075 | 623 | WHBI | I3Z9 | 13108.0 | A | 9022.0 | 6E9G | GREEN | MD | 42077 | 2008 | A | 2 | 10692.0 |
1 | 1 | XZ9F | 4172.0 | 1/7/2009 | PRN0 | 3890 | CAT2 | B | missing | 3461.0 | 0 | 4404.0 | 2958.0 | 3400.0 | missing | Direct | 73129 | 1689 | WHBI | AVYP | 3695.0 | A | 2742.0 | FIVQ | SILVER | OK | 77258 | 2002 | A | 7 | 3615.0 |
2 | 0 | QWEV | 8210.0 | 6/23/2010 | A9RA | 5900 | CAT1 | B | missing | 6922.0 | 0 | 8952.0 | 4568.0 | 4862.0 | missing | Other | 29697 | 2351 | W62B | 1M5X | 7730.0 | A | 3768.0 | 4K0H | SILVER | SC | 94514 | 2004 | A | 6 | 5805.0 |
3 | 0 | MK2K | 7309.0 | 3/19/2009 | RRJQ | 7460 | OTHER | A | missing | 6300.0 | 0 | 7460.0 | 5361.0 | 6305.0 | missing | Agent | 28273 | 1933 | WLQ6 | DKOO | 6290.0 | A | 5370.0 | ZLXY | WHITE | NC | 82302 | 2002 | C | 7 | 6444.0 |
4 | 0 | GK4G | 9182.0 | 3/3/2010 | HE5A | 6400 | OTHER | B | missing | 8330.0 | 0 | 9846.0 | 5428.0 | 5718.0 | missing | Other | 29697 | 482 | WHBI | KML6 | 8583.0 | A | 4803.0 | FIVQ | SILVER | SC | 49176 | 2007 | B | 3 | 6286.0 |
n = df.nunique(axis=0)
n
Junk                         2
InteriorsStyle            1036
PriceIndex8              12686
ListDate                   516
Material                   134
PriceIndex9               2019
Agency                       5
AreaIncomeType               4
EnvRating                    3
PriceIndex7              12078
ExpeditedListing             2
PriceIndex4              13006
PriceIndex1              10053
PriceIndex6              10863
PRIMEUNIT                    3
Channel                      3
Zip                        152
InsurancePremiumIndex      277
PlotType                    13
Architecture                33
PriceIndex3              12309
Region                       3
PriceIndex5              10011
SubModel                   831
Facade                      17
State                       37
NormalisedPopulation     36675
BuildYear                   10
RegionType                   5
PropertyAge                 10
PriceIndex2              11010
dtype: int64
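# The drop below removes identifiers, dates, and other very high-cardinality
# columns that would explode a one-hot encoding. A hedged sketch of recovering
# part of that list from the counts above (the 100-level threshold is our
# assumption; Zip, State, Facade and BuildYear are domain judgments):
high_card = n[(df.dtypes == 'object') & (n > 100)].index.tolist()
print(high_card)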
df.drop( ["InteriorsStyle","ListDate","Material","Zip","Architecture","SubModel","Facade","State","BuildYear",] ,axis=1,inplace=True)
df.head()
 | Junk | PriceIndex8 | PriceIndex9 | Agency | AreaIncomeType | EnvRating | PriceIndex7 | ExpeditedListing | PriceIndex4 | PriceIndex1 | PriceIndex6 | PRIMEUNIT | Channel | InsurancePremiumIndex | PlotType | PriceIndex3 | Region | PriceIndex5 | NormalisedPopulation | RegionType | PropertyAge | PriceIndex2
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 14674.0 | 8270 | CAT3 | B | missing | 13143.0 | 0 | 14224.0 | 9217.0 | 10387.0 | missing | Direct | 623 | WHBI | 13108.0 | A | 9022.0 | 42077 | A | 2 | 10692.0 |
1 | 1 | 4172.0 | 3890 | CAT2 | B | missing | 3461.0 | 0 | 4404.0 | 2958.0 | 3400.0 | missing | Direct | 1689 | WHBI | 3695.0 | A | 2742.0 | 77258 | A | 7 | 3615.0 |
2 | 0 | 8210.0 | 5900 | CAT1 | B | missing | 6922.0 | 0 | 8952.0 | 4568.0 | 4862.0 | missing | Other | 2351 | W62B | 7730.0 | A | 3768.0 | 94514 | A | 6 | 5805.0 |
3 | 0 | 7309.0 | 7460 | OTHER | A | missing | 6300.0 | 0 | 7460.0 | 5361.0 | 6305.0 | missing | Agent | 1933 | WLQ6 | 6290.0 | A | 5370.0 | 82302 | C | 7 | 6444.0 |
4 | 0 | 9182.0 | 6400 | OTHER | B | missing | 8330.0 | 0 | 9846.0 | 5428.0 | 5718.0 | missing | Other | 482 | WHBI | 8583.0 | A | 4803.0 | 49176 | B | 3 | 6286.0 |
df.isnull().mean()*100
Junk                     0.0
PriceIndex8              0.0
PriceIndex9              0.0
Agency                   0.0
AreaIncomeType           0.0
EnvRating                0.0
PriceIndex7              0.0
ExpeditedListing         0.0
PriceIndex4              0.0
PriceIndex1              0.0
PriceIndex6              0.0
PRIMEUNIT                0.0
Channel                  0.0
InsurancePremiumIndex    0.0
PlotType                 0.0
PriceIndex3              0.0
Region                   0.0
PriceIndex5              0.0
NormalisedPopulation     0.0
RegionType               0.0
PropertyAge              0.0
PriceIndex2              0.0
dtype: float64
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62035 entries, 0 to 62034
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Junk                   62035 non-null  int64
 1   PriceIndex8            62035 non-null  object
 2   PriceIndex9            62035 non-null  int64
 3   Agency                 62035 non-null  object
 4   AreaIncomeType         62035 non-null  object
 5   EnvRating              62035 non-null  object
 6   PriceIndex7            62035 non-null  object
 7   ExpeditedListing       62035 non-null  int64
 8   PriceIndex4            62035 non-null  object
 9   PriceIndex1            62035 non-null  object
 10  PriceIndex6            62035 non-null  object
 11  PRIMEUNIT              62035 non-null  object
 12  Channel                62035 non-null  object
 13  InsurancePremiumIndex  62035 non-null  int64
 14  PlotType               62035 non-null  object
 15  PriceIndex3            62035 non-null  object
 16  Region                 62035 non-null  object
 17  PriceIndex5            62035 non-null  object
 18  NormalisedPopulation   62035 non-null  int64
 19  RegionType             62035 non-null  object
 20  PropertyAge            62035 non-null  int64
 21  PriceIndex2            62035 non-null  object
dtypes: int64(6), object(16)
memory usage: 10.4+ MB
cols = ["PriceIndex8" , "PriceIndex9" , "PriceIndex7" , "PriceIndex1" , "PriceIndex6" , "PriceIndex3",
"InsurancePremiumIndex" , "PriceIndex5" , "NormalisedPopulation" , "PropertyAge" , "PriceIndex2", "PriceIndex4" ]
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=1)
df.isnull().mean()*100
Junk                     0.000000
PriceIndex8              0.438462
PriceIndex9              0.000000
Agency                   0.000000
AreaIncomeType           0.000000
EnvRating                0.000000
PriceIndex7              0.438462
ExpeditedListing         0.000000
PriceIndex4              0.022568
PriceIndex1              0.022568
PriceIndex6              0.438462
PRIMEUNIT                0.000000
Channel                  0.000000
InsurancePremiumIndex    0.000000
PlotType                 0.000000
PriceIndex3              0.022568
Region                   0.000000
PriceIndex5              0.438462
NormalisedPopulation     0.000000
RegionType               0.000000
PropertyAge              0.000000
PriceIndex2              0.022568
dtype: float64
# the affected rows are under 0.5% of the data, so we simply drop them
df = df.dropna()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 61763 entries, 0 to 62034
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Junk                   61763 non-null  int64
 1   PriceIndex8            61763 non-null  float64
 2   PriceIndex9            61763 non-null  float64
 3   Agency                 61763 non-null  object
 4   AreaIncomeType         61763 non-null  object
 5   EnvRating              61763 non-null  object
 6   PriceIndex7            61763 non-null  float64
 7   ExpeditedListing       61763 non-null  int64
 8   PriceIndex4            61763 non-null  float64
 9   PriceIndex1            61763 non-null  float64
 10  PriceIndex6            61763 non-null  float64
 11  PRIMEUNIT              61763 non-null  object
 12  Channel                61763 non-null  object
 13  InsurancePremiumIndex  61763 non-null  float64
 14  PlotType               61763 non-null  object
 15  PriceIndex3            61763 non-null  float64
 16  Region                 61763 non-null  object
 17  PriceIndex5            61763 non-null  float64
 18  NormalisedPopulation   61763 non-null  float64
 19  RegionType             61763 non-null  object
 20  PropertyAge            61763 non-null  float64
 21  PriceIndex2            61763 non-null  float64
dtypes: float64(12), int64(2), object(8)
memory usage: 10.8+ MB
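# Before committing to the dropna above, it can be worth checking what the
# non-numeric entries actually were. A minimal sketch, assuming the raw CSV
# is still on disk (our addition, not part of the original notebook):
raw = pd.read_csv('Property.csv', usecols=cols)
for c in cols:
    bad = raw.loc[pd.to_numeric(raw[c], errors='coerce').isna() & raw[c].notna(), c]
    if not bad.empty:
        print(c, bad.unique()[:5])  # sample of the offending values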
# collect the categorical (object-dtype) columns
char = [column for column in df.columns if df[column].dtype == 'object']
df[char].head(2)
 | Agency | AreaIncomeType | EnvRating | PRIMEUNIT | Channel | PlotType | Region | RegionType
---|---|---|---|---|---|---|---|---
0 | CAT3 | B | missing | missing | Direct | WHBI | A | A |
1 | CAT2 | B | missing | missing | Direct | WHBI | A | A |
# We inspect the frequency of every level of each categorical variable
df[char].apply(lambda x: x.value_counts()).T.stack()
Agency          CAT1       21486.0
                CAT2       19734.0
                CAT3       10395.0
                OTHER      10147.0
                missing        1.0
AreaIncomeType  A          30431.0
                B          28016.0
                C            642.0
                missing     2674.0
EnvRating       GREEN       2864.0
                RED           68.0
                missing    58831.0
PRIMEUNIT       NO          2875.0
                YES           57.0
                missing    58831.0
Channel         Agent      12223.0
                Direct     34681.0
                Other      14859.0
PlotType        32T2        6821.0
                7RBQ        1647.0
                8FRL         720.0
                DIJX        6098.0
                DJOP        1894.0
                DSSM        2694.0
                I00H        1461.0
                LIGY         652.0
                T559           1.0
                V5R0        1225.0
                W62B        7553.0
                WHBI       26038.0
                WLQ6        4959.0
Region          A          59601.0
                B           2156.0
                missing        6.0
RegionType      A          51615.0
                B           6848.0
                C           3127.0
                OTHER        172.0
                missing        1.0
dtype: float64
# EnvRating and PRIMEUNIT are ~95% "missing", so we drop both variables
df = df.drop(["EnvRating","PRIMEUNIT"],axis=1)
# In AreaIncomeType, replace "missing" with the mode ("A") and fold the rare "C" level into "A"
df.loc[df["AreaIncomeType"] == "missing","AreaIncomeType"] = "A"
df.loc[df["AreaIncomeType"] == "C","AreaIncomeType"] = "A"
# Drop the rows where Agency or Region is "missing", since their counts are insignificant
df = df[df["Agency"] != "missing"]
df = df[df["Region"] != "missing"]
# One-hot encode the remaining categorical variables
df = pd.get_dummies(df, columns=['Agency',
'AreaIncomeType',
'Channel',
'PlotType',
'Region',
'RegionType'])
df.head()
 | Junk | PriceIndex8 | PriceIndex9 | PriceIndex7 | ExpeditedListing | PriceIndex4 | PriceIndex1 | PriceIndex6 | InsurancePremiumIndex | PriceIndex3 | PriceIndex5 | NormalisedPopulation | PropertyAge | PriceIndex2 | Agency_CAT1 | Agency_CAT2 | Agency_CAT3 | Agency_OTHER | AreaIncomeType_A | AreaIncomeType_B | Channel_Agent | Channel_Direct | Channel_Other | PlotType_32T2 | PlotType_7RBQ | PlotType_8FRL | PlotType_DIJX | PlotType_DJOP | PlotType_DSSM | PlotType_I00H | PlotType_LIGY | PlotType_V5R0 | PlotType_W62B | PlotType_WHBI | PlotType_WLQ6 | Region_A | Region_B | RegionType_A | RegionType_B | RegionType_C | RegionType_OTHER
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 14674.0 | 8270.0 | 13143.0 | 0 | 14224.0 | 9217.0 | 10387.0 | 623.0 | 13108.0 | 9022.0 | 42077.0 | 2.0 | 10692.0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
1 | 1 | 4172.0 | 3890.0 | 3461.0 | 0 | 4404.0 | 2958.0 | 3400.0 | 1689.0 | 3695.0 | 2742.0 | 77258.0 | 7.0 | 3615.0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
2 | 0 | 8210.0 | 5900.0 | 6922.0 | 0 | 8952.0 | 4568.0 | 4862.0 | 2351.0 | 7730.0 | 3768.0 | 94514.0 | 6.0 | 5805.0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
3 | 0 | 7309.0 | 7460.0 | 6300.0 | 0 | 7460.0 | 5361.0 | 6305.0 | 1933.0 | 6290.0 | 5370.0 | 82302.0 | 7.0 | 6444.0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
4 | 0 | 9182.0 | 6400.0 | 8330.0 | 0 | 9846.0 | 5428.0 | 5718.0 | 482.0 | 8583.0 | 4803.0 | 49176.0 | 3.0 | 6286.0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
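# Note: keeping every dummy level makes each categorical's columns sum to 1
# (the dummy-variable trap), which can destabilise logistic-regression
# coefficients. A hedged variant of the call above, where df_pre stands for
# the frame as it was before encoding (our illustration, not the original run):
df_alt = pd.get_dummies(df_pre,
                        columns=['Agency', 'AreaIncomeType', 'Channel',
                                 'PlotType', 'Region', 'RegionType'],
                        drop_first=True)  # drop one reference level per variable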
# Remove the label column from the training data
X = df.drop(['Junk'],axis=1)
# Assign the label column to y
y = df['Junk']
# Split into a 70:30 train:test ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Standardize the features (fit the scaler on the training split only, to avoid leakage)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
model = LogisticRegression()
model.fit(X_train, y_train)
LogisticRegression()
predictions = model.predict(X_test)
print("Accuracy {0:.2f}%".format(100*accuracy_score(predictions, y_test)))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
Accuracy 87.91%
[[16282     2]
 [ 2238     5]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     16284
           1       0.71      0.00      0.00      2243

    accuracy                           0.88     18527
   macro avg       0.80      0.50      0.47     18527
weighted avg       0.86      0.88      0.82     18527
df.Junk.value_counts(normalize=True)*100
0    87.737224
1    12.262776
Name: Junk, dtype: float64
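# The near-zero recall on class 1 is this 88:12 imbalance at work: the model
# maximises accuracy by predicting 0 almost everywhere. Before reaching for
# oversampling, one hedged baseline worth trying (our sketch, not part of the
# original notebook) is to reweight the loss instead of resampling:
weighted = LogisticRegression(class_weight='balanced', max_iter=1000)
weighted.fit(X_train, y_train)  # minority-class errors are weighted ~7x heavier
print(classification_report(y_test, weighted.predict(X_test)))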
from imblearn.over_sampling import SVMSMOTE
sm = SVMSMOTE(
    sampling_strategy='auto',  # resample only the minority class
    random_state=0,            # for reproducibility
    k_neighbors=5,             # neighbours used to create the synthetic examples
    m_neighbors=10,            # neighbours used to decide if a minority sample is in "danger"
    n_jobs=-1)
# Create the synthetic examples. Note: resampling the full dataset before the
# train/test split lets synthetic points leak into the test set; a leakage-free
# pipeline sketch follows the next model run below.
X_res, y_res = sm.fit_resample(X, y)
# class balance before and after resampling
print(y.value_counts(normalize=True)*100)
print(y_res.value_counts(normalize=True)*100)
0    87.737224
1    12.262776
Name: Junk, dtype: float64
1    50.0
0    50.0
Name: Junk, dtype: float64
def AnalyticsEducator(X, y):
    """Split 70:30, scale, fit a logistic regression, and print the metrics."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Standardize the features (scaler fitted on the training split only)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print("Accuracy {0:.2f}%".format(100*accuracy_score(y_test, predictions)))
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
AnalyticsEducator(X_res, y_res)
Accuracy 84.15%
[[14827  1518]
 [ 3634 12531]]
              precision    recall  f1-score   support

           0       0.80      0.91      0.85     16345
           1       0.89      0.78      0.83     16165

    accuracy                           0.84     32510
   macro avg       0.85      0.84      0.84     32510
weighted avg       0.85      0.84      0.84     32510
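# Because the synthetic examples above were created before the split, test
# points can be interpolations of training points, so these scores are
# optimistic. A hedged sketch of the leakage-free setup (our addition):
# imblearn's Pipeline applies the sampler inside each training fold only.
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('smote', SVMSMOTE(random_state=0)),
    ('clf', LogisticRegression(max_iter=1000)),
])
print(cross_val_score(pipe, X, y, cv=5, scoring='f1').mean())  # minority-class F1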
from imblearn.over_sampling import BorderlineSMOTE
sm_b1 = BorderlineSMOTE(
    sampling_strategy='auto',  # resample only the minority class
    random_state=0,            # for reproducibility
    k_neighbors=5,             # neighbours used to create the new examples
    m_neighbors=10,            # neighbours used to find the DANGER group
    kind='borderline-1',
    n_jobs=-1
)
X_res_b1, y_res_b1 = sm_b1.fit_resample(X, y)
AnalyticsEducator(X_res_b1, y_res_b1)
Accuracy 91.10%
[[16324    21]
 [ 2872 13293]]
              precision    recall  f1-score   support

           0       0.85      1.00      0.92     16345
           1       1.00      0.82      0.90     16165

    accuracy                           0.91     32510
   macro avg       0.92      0.91      0.91     32510
weighted avg       0.92      0.91      0.91     32510
sm_b2 = BorderlineSMOTE(
    sampling_strategy='auto',  # resample only the minority class
    random_state=0,            # for reproducibility
    k_neighbors=5,             # neighbours used to create the new examples
    m_neighbors=10,            # neighbours used to find the DANGER group
    kind='borderline-2',
    n_jobs=4
)
X_res_b2, y_res_b2 = sm_b2.fit_resample(X, y)
AnalyticsEducator(X_res_b2, y_res_b2)
Accuracy 91.67%
[[16336     9]
 [ 2699 13466]]
              precision    recall  f1-score   support

           0       0.86      1.00      0.92     16345
           1       1.00      0.83      0.91     16165

    accuracy                           0.92     32510
   macro avg       0.93      0.92      0.92     32510
weighted avg       0.93      0.92      0.92     32510
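# To put the three samplers on an equal, leakage-free footing, the same
# pipeline idea can be looped over each one (a sketch reusing the Pipeline
# and cross_val_score imports above; these are not results from this notebook):
samplers = {
    'svm-smote':    SVMSMOTE(random_state=0),
    'borderline-1': BorderlineSMOTE(kind='borderline-1', random_state=0),
    'borderline-2': BorderlineSMOTE(kind='borderline-2', random_state=0),
}
for name, sampler in samplers.items():
    pipe = Pipeline([('scale', StandardScaler()),
                     ('smote', sampler),
                     ('clf', LogisticRegression(max_iter=1000))])
    f1 = cross_val_score(pipe, X, y, cv=5, scoring='f1').mean()
    print(f"{name}: mean minority-class F1 = {f1:.3f}")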