import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")  # silence library warnings for cleaner notebook output
from sklearn.preprocessing import LabelBinarizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix, fbeta_score, roc_auc_score
from sklearn.metrics import classification_report as cr
import matplotlib.pyplot as plt
plt.style.use("ggplot")
import os
os.chdir("C:\\Users\\ASUS\\Desktop")  # adjust to your local data directory
# note: parse_dates=True only parses the index, so ListDate stays a string (it is dropped below anyway)
df = pd.read_csv('Property.csv', parse_dates=True)
pd.set_option('display.max_columns', None)
df.head()
 | Junk | InteriorsStyle | PriceIndex8 | ListDate | Material | PriceIndex9 | Agency | AreaIncomeType | EnvRating | PriceIndex7 | ExpeditedListing | PriceIndex4 | PriceIndex1 | PriceIndex6 | PRIMEUNIT | Channel | Zip | InsurancePremiumIndex | PlotType | Architecture | PriceIndex3 | Region | PriceIndex5 | SubModel | Facade | State | NormalisedPopulation | BuildYear | RegionType | PropertyAge | PriceIndex2
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 6LLJ | 14674.0 | 9/7/2010 | UT7W | 8270 | CAT3 | B | missing | 13143.0 | 0 | 14224.0 | 9217.0 | 10387.0 | missing | Direct | 21075 | 623 | WHBI | I3Z9 | 13108.0 | A | 9022.0 | 6E9G | GREEN | MD | 42077 | 2008 | A | 2 | 10692.0 |
1 | 1 | XZ9F | 4172.0 | 1/7/2009 | PRN0 | 3890 | CAT2 | B | missing | 3461.0 | 0 | 4404.0 | 2958.0 | 3400.0 | missing | Direct | 73129 | 1689 | WHBI | AVYP | 3695.0 | A | 2742.0 | FIVQ | SILVER | OK | 77258 | 2002 | A | 7 | 3615.0 |
2 | 0 | QWEV | 8210.0 | 6/23/2010 | A9RA | 5900 | CAT1 | B | missing | 6922.0 | 0 | 8952.0 | 4568.0 | 4862.0 | missing | Other | 29697 | 2351 | W62B | 1M5X | 7730.0 | A | 3768.0 | 4K0H | SILVER | SC | 94514 | 2004 | A | 6 | 5805.0 |
3 | 0 | MK2K | 7309.0 | 3/19/2009 | RRJQ | 7460 | OTHER | A | missing | 6300.0 | 0 | 7460.0 | 5361.0 | 6305.0 | missing | Agent | 28273 | 1933 | WLQ6 | DKOO | 6290.0 | A | 5370.0 | ZLXY | WHITE | NC | 82302 | 2002 | C | 7 | 6444.0 |
4 | 0 | GK4G | 9182.0 | 3/3/2010 | HE5A | 6400 | OTHER | B | missing | 8330.0 | 0 | 9846.0 | 5428.0 | 5718.0 | missing | Other | 29697 | 482 | WHBI | KML6 | 8583.0 | A | 4803.0 | FIVQ | SILVER | SC | 49176 | 2007 | B | 3 | 6286.0 |
n = df.nunique(axis=0)
n
Junk                         2
InteriorsStyle            1036
PriceIndex8              12686
ListDate                   516
Material                   134
PriceIndex9               2019
Agency                       5
AreaIncomeType               4
EnvRating                    3
PriceIndex7              12078
ExpeditedListing             2
PriceIndex4              13006
PriceIndex1              10053
PriceIndex6              10863
PRIMEUNIT                    3
Channel                      3
Zip                        152
InsurancePremiumIndex      277
PlotType                    13
Architecture                33
PriceIndex3              12309
Region                       3
PriceIndex5              10011
SubModel                   831
Facade                      17
State                       37
NormalisedPopulation     36675
BuildYear                   10
RegionType                   5
PropertyAge                 10
PriceIndex2              11010
dtype: int64
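# The drop below removes identifiers, dates, and other very high-cardinality
# columns that would explode a one-hot encoding. A hedged sketch of recovering
# part of that list from the counts above (the 100-level threshold is our
# assumption; Zip, State, Facade and BuildYear are domain judgments):
high_card = n[(df.dtypes == 'object') & (n > 100)].index.tolist()
print(high_card)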
df.drop( ["InteriorsStyle","ListDate","Material","Zip","Architecture","SubModel","Facade","State","BuildYear",] ,axis=1,inplace=True)
df.head()
 | Junk | PriceIndex8 | PriceIndex9 | Agency | AreaIncomeType | EnvRating | PriceIndex7 | ExpeditedListing | PriceIndex4 | PriceIndex1 | PriceIndex6 | PRIMEUNIT | Channel | InsurancePremiumIndex | PlotType | PriceIndex3 | Region | PriceIndex5 | NormalisedPopulation | RegionType | PropertyAge | PriceIndex2
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 14674.0 | 8270 | CAT3 | B | missing | 13143.0 | 0 | 14224.0 | 9217.0 | 10387.0 | missing | Direct | 623 | WHBI | 13108.0 | A | 9022.0 | 42077 | A | 2 | 10692.0 |
1 | 1 | 4172.0 | 3890 | CAT2 | B | missing | 3461.0 | 0 | 4404.0 | 2958.0 | 3400.0 | missing | Direct | 1689 | WHBI | 3695.0 | A | 2742.0 | 77258 | A | 7 | 3615.0 |
2 | 0 | 8210.0 | 5900 | CAT1 | B | missing | 6922.0 | 0 | 8952.0 | 4568.0 | 4862.0 | missing | Other | 2351 | W62B | 7730.0 | A | 3768.0 | 94514 | A | 6 | 5805.0 |
3 | 0 | 7309.0 | 7460 | OTHER | A | missing | 6300.0 | 0 | 7460.0 | 5361.0 | 6305.0 | missing | Agent | 1933 | WLQ6 | 6290.0 | A | 5370.0 | 82302 | C | 7 | 6444.0 |
4 | 0 | 9182.0 | 6400 | OTHER | B | missing | 8330.0 | 0 | 9846.0 | 5428.0 | 5718.0 | missing | Other | 482 | WHBI | 8583.0 | A | 4803.0 | 49176 | B | 3 | 6286.0 |
df.isnull().mean()*100
Junk                     0.0
PriceIndex8              0.0
PriceIndex9              0.0
Agency                   0.0
AreaIncomeType           0.0
EnvRating                0.0
PriceIndex7              0.0
ExpeditedListing         0.0
PriceIndex4              0.0
PriceIndex1              0.0
PriceIndex6              0.0
PRIMEUNIT                0.0
Channel                  0.0
InsurancePremiumIndex    0.0
PlotType                 0.0
PriceIndex3              0.0
Region                   0.0
PriceIndex5              0.0
NormalisedPopulation     0.0
RegionType               0.0
PropertyAge              0.0
PriceIndex2              0.0
dtype: float64
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62035 entries, 0 to 62034
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Junk                   62035 non-null  int64
 1   PriceIndex8            62035 non-null  object
 2   PriceIndex9            62035 non-null  int64
 3   Agency                 62035 non-null  object
 4   AreaIncomeType         62035 non-null  object
 5   EnvRating              62035 non-null  object
 6   PriceIndex7            62035 non-null  object
 7   ExpeditedListing       62035 non-null  int64
 8   PriceIndex4            62035 non-null  object
 9   PriceIndex1            62035 non-null  object
 10  PriceIndex6            62035 non-null  object
 11  PRIMEUNIT              62035 non-null  object
 12  Channel                62035 non-null  object
 13  InsurancePremiumIndex  62035 non-null  int64
 14  PlotType               62035 non-null  object
 15  PriceIndex3            62035 non-null  object
 16  Region                 62035 non-null  object
 17  PriceIndex5            62035 non-null  object
 18  NormalisedPopulation   62035 non-null  int64
 19  RegionType             62035 non-null  object
 20  PropertyAge            62035 non-null  int64
 21  PriceIndex2            62035 non-null  object
dtypes: int64(6), object(16)
memory usage: 10.4+ MB
cols = ["PriceIndex8" , "PriceIndex9" , "PriceIndex7" , "PriceIndex1" , "PriceIndex6" , "PriceIndex3",
"InsurancePremiumIndex" , "PriceIndex5" , "NormalisedPopulation" , "PropertyAge" , "PriceIndex2", "PriceIndex4" ]
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=1)
df.isnull().mean()*100
Junk                     0.000000
PriceIndex8              0.438462
PriceIndex9              0.000000
Agency                   0.000000
AreaIncomeType           0.000000
EnvRating                0.000000
PriceIndex7              0.438462
ExpeditedListing         0.000000
PriceIndex4              0.022568
PriceIndex1              0.022568
PriceIndex6              0.438462
PRIMEUNIT                0.000000
Channel                  0.000000
InsurancePremiumIndex    0.000000
PlotType                 0.000000
PriceIndex3              0.022568
Region                   0.000000
PriceIndex5              0.438462
NormalisedPopulation     0.000000
RegionType               0.000000
PropertyAge              0.000000
PriceIndex2              0.022568
dtype: float64
# the affected rows are under 0.5% of the data, so we simply drop them
df = df.dropna()
df.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 61763 entries, 0 to 62034
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Junk                   61763 non-null  int64
 1   PriceIndex8            61763 non-null  float64
 2   PriceIndex9            61763 non-null  float64
 3   Agency                 61763 non-null  object
 4   AreaIncomeType         61763 non-null  object
 5   EnvRating              61763 non-null  object
 6   PriceIndex7            61763 non-null  float64
 7   ExpeditedListing       61763 non-null  int64
 8   PriceIndex4            61763 non-null  float64
 9   PriceIndex1            61763 non-null  float64
 10  PriceIndex6            61763 non-null  float64
 11  PRIMEUNIT              61763 non-null  object
 12  Channel                61763 non-null  object
 13  InsurancePremiumIndex  61763 non-null  float64
 14  PlotType               61763 non-null  object
 15  PriceIndex3            61763 non-null  float64
 16  Region                 61763 non-null  object
 17  PriceIndex5            61763 non-null  float64
 18  NormalisedPopulation   61763 non-null  float64
 19  RegionType             61763 non-null  object
 20  PropertyAge            61763 non-null  float64
 21  PriceIndex2            61763 non-null  float64
dtypes: float64(12), int64(2), object(8)
memory usage: 10.8+ MB
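# Before committing to the dropna above, it can be worth checking what the
# non-numeric entries actually were. A minimal sketch, assuming the raw CSV
# is still on disk (our addition, not part of the original notebook):
raw = pd.read_csv('Property.csv', usecols=cols)
for c in cols:
    bad = raw.loc[pd.to_numeric(raw[c], errors='coerce').isna() & raw[c].notna(), c]
    if not bad.empty:
        print(c, bad.unique()[:5])  # sample of the offending values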
# collect the categorical (object-dtype) columns
char = [column for column in df.columns if df[column].dtype == 'object']
df[char].head(2)
 | Agency | AreaIncomeType | EnvRating | PRIMEUNIT | Channel | PlotType | Region | RegionType
---|---|---|---|---|---|---|---|---
0 | CAT3 | B | missing | missing | Direct | WHBI | A | A |
1 | CAT2 | B | missing | missing | Direct | WHBI | A | A |
# We inspect the frequency of every level of each categorical variable
df[char].apply(lambda x: x.value_counts()).T.stack()
Agency          CAT1       21486.0
                CAT2       19734.0
                CAT3       10395.0
                OTHER      10147.0
                missing        1.0
AreaIncomeType  A          30431.0
                B          28016.0
                C            642.0
                missing     2674.0
EnvRating       GREEN       2864.0
                RED           68.0
                missing    58831.0
PRIMEUNIT       NO          2875.0
                YES           57.0
                missing    58831.0
Channel         Agent      12223.0
                Direct     34681.0
                Other      14859.0
PlotType        32T2        6821.0
                7RBQ        1647.0
                8FRL         720.0
                DIJX        6098.0
                DJOP        1894.0
                DSSM        2694.0
                I00H        1461.0
                LIGY         652.0
                T559           1.0
                V5R0        1225.0
                W62B        7553.0
                WHBI       26038.0
                WLQ6        4959.0
Region          A          59601.0
                B           2156.0
                missing        6.0
RegionType      A          51615.0
                B           6848.0
                C           3127.0
                OTHER        172.0
                missing        1.0
dtype: float64
# EnvRating and PRIMEUNIT are ~95% "missing", so we drop both variables
df = df.drop(["EnvRating","PRIMEUNIT"],axis=1)
# In AreaIncomeType, replace "missing" with the mode ("A") and fold the rare "C" level into "A"
df.loc[df["AreaIncomeType"] == "missing","AreaIncomeType"] = "A"
df.loc[df["AreaIncomeType"] == "C","AreaIncomeType"] = "A"
# Drop the rows where Agency or Region is "missing", since their counts are insignificant
df = df[df["Agency"] != "missing"]
df = df[df["Region"] != "missing"]
# One-hot encode the remaining categorical variables
df = pd.get_dummies(df, columns=['Agency',
'AreaIncomeType',
'Channel',
'PlotType',
'Region',
'RegionType'])
df.head()
 | Junk | PriceIndex8 | PriceIndex9 | PriceIndex7 | ExpeditedListing | PriceIndex4 | PriceIndex1 | PriceIndex6 | InsurancePremiumIndex | PriceIndex3 | PriceIndex5 | NormalisedPopulation | PropertyAge | PriceIndex2 | Agency_CAT1 | Agency_CAT2 | Agency_CAT3 | Agency_OTHER | AreaIncomeType_A | AreaIncomeType_B | Channel_Agent | Channel_Direct | Channel_Other | PlotType_32T2 | PlotType_7RBQ | PlotType_8FRL | PlotType_DIJX | PlotType_DJOP | PlotType_DSSM | PlotType_I00H | PlotType_LIGY | PlotType_V5R0 | PlotType_W62B | PlotType_WHBI | PlotType_WLQ6 | Region_A | Region_B | RegionType_A | RegionType_B | RegionType_C | RegionType_OTHER
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0 | 14674.0 | 8270.0 | 13143.0 | 0 | 14224.0 | 9217.0 | 10387.0 | 623.0 | 13108.0 | 9022.0 | 42077.0 | 2.0 | 10692.0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
1 | 1 | 4172.0 | 3890.0 | 3461.0 | 0 | 4404.0 | 2958.0 | 3400.0 | 1689.0 | 3695.0 | 2742.0 | 77258.0 | 7.0 | 3615.0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
2 | 0 | 8210.0 | 5900.0 | 6922.0 | 0 | 8952.0 | 4568.0 | 4862.0 | 2351.0 | 7730.0 | 3768.0 | 94514.0 | 6.0 | 5805.0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 |
3 | 0 | 7309.0 | 7460.0 | 6300.0 | 0 | 7460.0 | 5361.0 | 6305.0 | 1933.0 | 6290.0 | 5370.0 | 82302.0 | 7.0 | 6444.0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 |
4 | 0 | 9182.0 | 6400.0 | 8330.0 | 0 | 9846.0 | 5428.0 | 5718.0 | 482.0 | 8583.0 | 4803.0 | 49176.0 | 3.0 | 6286.0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 |
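# Note: keeping every dummy level makes each categorical's columns sum to 1
# (the dummy-variable trap), which can destabilise logistic-regression
# coefficients. A hedged variant of the call above, where df_pre stands for
# the frame as it was before encoding (our illustration, not the original run):
df_alt = pd.get_dummies(df_pre,
                        columns=['Agency', 'AreaIncomeType', 'Channel',
                                 'PlotType', 'Region', 'RegionType'],
                        drop_first=True)  # drop one reference level per variable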
# Remove the label column from the training data
X = df.drop(['Junk'],axis=1)
# Assign the label column to y
y = df['Junk']
# Split into a 70:30 train:test ratio
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
# Standardize the features (fit the scaler on the training split only, to avoid leakage)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
model = LogisticRegression()
model.fit(X_train, y_train)
LogisticRegression()
predictions = model.predict(X_test)
print("Accuracy {0:.2f}%".format(100*accuracy_score(predictions, y_test)))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
Accuracy 87.91%
[[16282     2]
 [ 2238     5]]
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     16284
           1       0.71      0.00      0.00      2243

    accuracy                           0.88     18527
   macro avg       0.80      0.50      0.47     18527
weighted avg       0.86      0.88      0.82     18527
df.Junk.value_counts(normalize=True)*100
0    87.737224
1    12.262776
Name: Junk, dtype: float64
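# The near-zero recall on class 1 is this 88:12 imbalance at work: the model
# maximises accuracy by predicting 0 almost everywhere. Before reaching for
# oversampling, one hedged baseline worth trying (our sketch, not part of the
# original notebook) is to reweight the loss instead of resampling:
weighted = LogisticRegression(class_weight='balanced', max_iter=1000)
weighted.fit(X_train, y_train)  # minority-class errors are weighted ~7x heavier
print(classification_report(y_test, weighted.predict(X_test)))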
from imblearn.over_sampling import SVMSMOTE
sm = SVMSMOTE(
    sampling_strategy='auto',  # resample only the minority class
    random_state=0,            # for reproducibility
    k_neighbors=5,             # neighbours used to create the synthetic examples
    m_neighbors=10,            # neighbours used to decide if a minority sample is in "danger"
    n_jobs=-1)
# Create the synthetic examples. Note: resampling the full dataset before the
# train/test split lets synthetic points leak into the test set; a leakage-free
# pipeline sketch follows the next model run below.
X_res, y_res = sm.fit_resample(X, y)
# class balance before and after resampling
print(y.value_counts(normalize=True)*100)
print(y_res.value_counts(normalize=True)*100)
0    87.737224
1    12.262776
Name: Junk, dtype: float64
1    50.0
0    50.0
Name: Junk, dtype: float64
def AnalyticsEducator(X, y):
    """Split 70:30, scale, fit a logistic regression, and print the metrics."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Standardize the features (scaler fitted on the training split only)
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print("Accuracy {0:.2f}%".format(100*accuracy_score(y_test, predictions)))
    print(confusion_matrix(y_test, predictions))
    print(classification_report(y_test, predictions))
AnalyticsEducator(X_res, y_res)
Accuracy 84.15%
[[14827  1518]
 [ 3634 12531]]
              precision    recall  f1-score   support

           0       0.80      0.91      0.85     16345
           1       0.89      0.78      0.83     16165

    accuracy                           0.84     32510
   macro avg       0.85      0.84      0.84     32510
weighted avg       0.85      0.84      0.84     32510
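# Because the synthetic examples above were created before the split, test
# points can be interpolations of training points, so these scores are
# optimistic. A hedged sketch of the leakage-free setup (our addition):
# imblearn's Pipeline applies the sampler inside each training fold only.
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('smote', SVMSMOTE(random_state=0)),
    ('clf', LogisticRegression(max_iter=1000)),
])
print(cross_val_score(pipe, X, y, cv=5, scoring='f1').mean())  # minority-class F1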
from imblearn.over_sampling import BorderlineSMOTE
sm_b1 = BorderlineSMOTE(
    sampling_strategy='auto',  # resample only the minority class
    random_state=0,            # for reproducibility
    k_neighbors=5,             # neighbours used to create the new examples
    m_neighbors=10,            # neighbours used to find the DANGER group
    kind='borderline-1',
    n_jobs=-1
)
X_res_b1, y_res_b1 = sm_b1.fit_resample(X, y)
AnalyticsEducator(X_res_b1, y_res_b1)
Accuracy 91.10%
[[16324    21]
 [ 2872 13293]]
              precision    recall  f1-score   support

           0       0.85      1.00      0.92     16345
           1       1.00      0.82      0.90     16165

    accuracy                           0.91     32510
   macro avg       0.92      0.91      0.91     32510
weighted avg       0.92      0.91      0.91     32510
sm_b2 = BorderlineSMOTE(
    sampling_strategy='auto',  # resample only the minority class
    random_state=0,            # for reproducibility
    k_neighbors=5,             # neighbours used to create the new examples
    m_neighbors=10,            # neighbours used to find the DANGER group
    kind='borderline-2',
    n_jobs=4
)
X_res_b2, y_res_b2 = sm_b2.fit_resample(X, y)
AnalyticsEducator(X_res_b2, y_res_b2)
Accuracy 91.67%
[[16336     9]
 [ 2699 13466]]
              precision    recall  f1-score   support

           0       0.86      1.00      0.92     16345
           1       1.00      0.83      0.91     16165

    accuracy                           0.92     32510
   macro avg       0.93      0.92      0.92     32510
weighted avg       0.93      0.92      0.92     32510
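# To put the three samplers on an equal, leakage-free footing, the same
# pipeline idea can be looped over each one (a sketch reusing the Pipeline
# and cross_val_score imports above; these are not results from this notebook):
samplers = {
    'svm-smote':    SVMSMOTE(random_state=0),
    'borderline-1': BorderlineSMOTE(kind='borderline-1', random_state=0),
    'borderline-2': BorderlineSMOTE(kind='borderline-2', random_state=0),
}
for name, sampler in samplers.items():
    pipe = Pipeline([('scale', StandardScaler()),
                     ('smote', sampler),
                     ('clf', LogisticRegression(max_iter=1000))])
    f1 = cross_val_score(pipe, X, y, cv=5, scoring='f1').mean()
    print(f"{name}: mean minority-class F1 = {f1:.3f}")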