import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import xgboost
import math
#from __future__ import division
from scipy.stats import pearsonr
from sklearn.linear_model import LinearRegression
#from sklearn import cross_validation, tree, linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score
import os


# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)


# Directory that holds kc_house_data.csv. Defaults to the original author's
# Windows path; set the KC_HOUSE_DATA_DIR environment variable to run this
# script on another machine without editing the code.
os.chdir(os.environ.get("KC_HOUSE_DATA_DIR", "C:\\Users\\ASUS"))
df = pd.read_csv('kc_house_data.csv')
df.head()


# Check the summary of the data
df.describe()


# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long           21613 non-null  float64
 19  sqft_living15  21613 non-null  int64  
 20  sqft_lot15     21613 non-null  int64  
dtypes: float64(5), int64(15), object(1)
memory usage: 3.5+ MB


# Remove the columns that will not be used as predictors (modifies df in place).
df.drop(
    columns=["id", "date", "zipcode", "lat", "long", "yr_built"],
    inplace=True,
)
df.head()


# Frequency of each distinct yr_renovated value.
df["yr_renovated"].value_counts()

0       20699
2014       91
2013       37
2003       36
2000       35
        ...  
1934        1
1959        1
1951        1
1948        1
1944        1
Name: yr_renovated, Length: 70, dtype: int64


# Collapse yr_renovated into a 0/1 flag: 0 stays 0, any other value becomes 1.
df["yr_renovated"] = np.where(df["yr_renovated"] != 0, 1, 0)


# Frequency of each distinct waterfront value.
df["waterfront"].value_counts()

0    21450
1      163
Name: waterfront, dtype: int64


# Five most frequent bathroom counts.
df["bathrooms"].value_counts().head()

2.50    5380
1.00    3852
1.75    3048
2.25    2047
2.00    1930
Name: bathrooms, dtype: int64


# Round bathroom counts to whole numbers. NumPy/pandas rounding sends exact
# halves to the nearest even value (e.g. 2.5 -> 2.0, 1.5 -> 2.0).
df["bathrooms"] = df["bathrooms"].round()


# Number of missing values in each column.
df.isna().sum()

price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_renovated     0
sqft_living15    0
sqft_lot15       0
dtype: int64


import seaborn as sns

# Histogram of price (50 bins) with a KDE curve overlaid; figure size (11.7, 8.27).
sns.set(rc={"figure.figsize": (11.7, 8.27)})
sns.histplot(df["price"], bins=50, kde=True)

<AxesSubplot:xlabel='price', ylabel='Count'>


# Annotated heatmap of the pairwise correlations between the remaining
# columns; figure size (13.7, 10.27).
sns.set(rc={"figure.figsize": (13.7, 10.27)})
sns.heatmap(data=df.corr(), annot=True)

<AxesSubplot:>


# Separate the target from the predictors.

y = df["price"]  # dependent variable
X = df.drop(columns="price")  # independent variables


# Hold out 30% of the rows as a test set; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)


# Fit a linear regression on the training rows, then predict the held-out rows.
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)
y_pred = lin_reg.predict(X_test)


# Mean absolute percentage error of the current y_pred on the test set, scaled to percent.
from sklearn.metrics import mean_absolute_percentage_error
100 * mean_absolute_percentage_error(y_test, y_pred)

31.876634943970743


from sklearn.tree import DecisionTreeRegressor

# Decision tree regressor with a fixed seed, trained on the same training split.
dt = DecisionTreeRegressor(random_state=0)
dt.fit(X_train, y_train)
# Overwrite y_pred with the tree's predictions for the test rows.
y_pred = dt.predict(X_test)


# Mean absolute percentage error of the tree's predictions, scaled to percent.
from sklearn.metrics import mean_absolute_percentage_error
100 * mean_absolute_percentage_error(y_test, y_pred)

33.772099593918995


from sklearn.ensemble import RandomForestRegressor

# Random forest of 500 trees with a fixed seed, trained on the same training split.
rf = RandomForestRegressor(n_estimators=500, random_state=42)
rf.fit(X_train, y_train)
# Overwrite y_pred with the forest's predictions for the test rows.
y_pred = rf.predict(X_test)


# Mean absolute percentage error of the forest's predictions, scaled to percent.
from sklearn.metrics import mean_absolute_percentage_error
100 * mean_absolute_percentage_error(y_test, y_pred)

25.585464158290428

	id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
0	7129300520	20141013T000000	221900.0	3	1.00	1180	5650	1.0	3	7	1180	0	1955	0	98178	47.5112	-122.257	1340	5650
1	6414100192	20141209T000000	538000.0	3	2.25	2570	7242	2.0	3	7	2170	400	1951	1991	98125	47.7210	-122.319	1690	7639
2	5631500400	20150225T000000	180000.0	2	1.00	770	10000	1.0	3	6	770	0	1933	0	98028	47.7379	-122.233	2720	8062
3	2487200875	20141209T000000	604000.0	4	3.00	1960	5000	1.0	5	7	1050	910	1965	0	98136	47.5208	-122.393	1360	5000
4	1954400510	20150218T000000	510000.0	3	2.00	1680	8080	1.0	3	8	1680	0	1987	0	98074	47.6168	-122.045	1800	7503

	id	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	waterfront	view	condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat	long	sqft_living15	sqft_lot15
count	2.161300e+04	2.161300e+04	21613.000000	21613.000000	21613.000000	2.161300e+04	21613.000000	21613.000000	21613.000000	21613.000000	21613.000000	21613.000000	21613.000000	21613.000000	21613.000000	21613.000000	21613.000000	21613.000000	21613.000000	21613.000000
mean	4.580302e+09	5.400881e+05	3.370842	2.114757	2079.899736	1.510697e+04	1.494309	0.007542	0.234303	3.409430	7.656873	1788.390691	291.509045	1971.005136	84.402258	98077.939805	47.560053	-122.213896	1986.552492	12768.455652
std	2.876566e+09	3.671272e+05	0.930062	0.770163	918.440897	4.142051e+04	0.539989	0.086517	0.766318	0.650743	1.175459	828.090978	442.575043	29.373411	401.679240	53.505026	0.138564	0.140828	685.391304	27304.179631
min	1.000102e+06	7.500000e+04	0.000000	0.000000	290.000000	5.200000e+02	1.000000	0.000000	0.000000	1.000000	1.000000	290.000000	0.000000	1900.000000	0.000000	98001.000000	47.155900	-122.519000	399.000000	651.000000
25%	2.123049e+09	3.219500e+05	3.000000	1.750000	1427.000000	5.040000e+03	1.000000	0.000000	0.000000	3.000000	7.000000	1190.000000	0.000000	1951.000000	0.000000	98033.000000	47.471000	-122.328000	1490.000000	5100.000000
50%	3.904930e+09	4.500000e+05	3.000000	2.250000	1910.000000	7.618000e+03	1.500000	0.000000	0.000000	3.000000	7.000000	1560.000000	0.000000	1975.000000	0.000000	98065.000000	47.571800	-122.230000	1840.000000	7620.000000
75%	7.308900e+09	6.450000e+05	4.000000	2.500000	2550.000000	1.068800e+04	2.000000	0.000000	0.000000	4.000000	8.000000	2210.000000	560.000000	1997.000000	0.000000	98118.000000	47.678000	-122.125000	2360.000000	10083.000000
max	9.900000e+09	7.700000e+06	33.000000	8.000000	13540.000000	1.651359e+06	3.500000	1.000000	4.000000	5.000000	13.000000	9410.000000	4820.000000	2015.000000	2015.000000	98199.000000	47.777600	-121.315000	6210.000000	871200.000000

	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors	condition	grade	sqft_above	sqft_basement	yr_renovated	sqft_living15	sqft_lot15
0	221900.0	3	1.00	1180	5650	1.0	3	7	1180	0	0	1340	5650
1	538000.0	3	2.25	2570	7242	2.0	3	7	2170	400	1991	1690	7639
2	180000.0	2	1.00	770	10000	1.0	3	6	770	0	0	2720	8062
3	604000.0	4	3.00	1960	5000	1.0	5	7	1050	910	0	1360	5000
4	510000.0	3	2.00	1680	8080	1.0	3	8	1680	0	0	1800	7503

Objective:¶

Machine Learning¶

The Case Study¶

Import the packages¶

Import the data into python¶

Here price is the dependent variable, which we need to predict, based on other independent variables like number of bedrooms, bathrooms, floors, living area sqft etc¶

Exploratory Data Analysis¶

Observation: some of the variables are not going to help us in predicting the price, hence we will drop them. The columns we are going to drop are id, date, zipcode, lat, yr_built, and long.¶

Univariate Analysis¶

Observation: we can see that majority of the data points are 0, means no renovation has been done. Hence, we will convert it into a dummy variable of 0 means no renovation, and 1 means renovation done.¶

Observation: we can see that bathrooms are in decimal values, which is not possible. We will round off these values.¶

Missing values¶

Now we will check if there are any missing values in the data¶

Observation: there are no missing values¶

Univariate Analysis of the dependent variable¶

Observation: Starting the visualizations with a histogram (with a density curve) of the price, we can see its distribution; note that there are a few outliers above 2 million.¶

Correlation matrix¶

Observation: the correlations are under control. None of the independent variables are highly correlated among themselves.¶

Data split¶

Segregating the independent variables as X and dependent variable as y¶

Now we will split the data into training (70% of the data) and test (the remaining 30%), which will be kept aside for later use.¶

Modeling our Data¶

Linear Regression with scikit learn¶

Calculate the accuracy of the model¶

MAPE of 31.87% means the predicted values are 31.87% deviated from the actual values. We can assume this model has an accuracy of 100%- 31.87% = 68.13% of accuracy. This is rather poor accuracy.¶

Now we will be using Decision Tree to predict the prices¶

Decision Tree Algorithm¶

Calculate the accuracy of the model¶

MAPE of 33.7% shows that Decision Tree has given us a worse result than Linear Regression¶

Now we will be using Random Forest algorithm¶

Calculate the accuracy of the model¶

MAPE of 25.5% shows that Random Forest has given us the best result among all these models.¶

We provide a 100% money back guarantee on learning. It means that each and every student of analytics educator will be able to understand every line of codes and algorithm, else we will refund the money back.¶

You may check out all our instructor led courses from this link. https://www.analyticseducator.com/Courses-Offers.html ¶

If you want to read more such Machine Learning case studies then you may go through the following links -¶

https://www.analyticseducator.com/Blog/Logistic-Regression_Titanic-Survival-checkpoint.html ¶

https://www.analyticseducator.com/Blog/Cancer-Classification-Support-Vector-Machines.html ¶

https://www.analyticseducator.com/Blog/K-Nearest%20Neighbours%20for%20T-shirt%20size%20classification.html ¶

Objective:¶

Machine Learning¶

The Case Study¶

Import the packages¶

Import the data into python¶

Here price is the dependent variable, which we need to predict, based on other independent variables like number of bedrooms, bathrooms, floors, living area sqft etc¶

Exploratory Data Analysis¶

Observation: some of the variables are not going to help us in predicting the price, hence we will drop them. The columns we are going to drop are id, date, zipcode, lat, yr_built, and long.¶

Univariate Analysis¶

Observation: we can see that majority of the data points are 0, means no renovation has been done. Hence, we will convert it into a dummy variable of 0 means no renovation, and 1 means renovation done.¶

Observation: we can see that bathrooms are in decimal values, which is not possible. We will round off these values.¶

Missing values¶

Now we will check if there are any missing values in the data¶

Observation: there are no missing values¶

Univariate Analysis of the dependent variable¶

Observation: Starting the visualizations with a histogram (with a density curve) of the price, we can see its distribution; note that there are a few outliers above 2 million.¶

Correlation matrix¶

Observation: the correlations are under control. None of the independent variables are highly correlated among themselves.¶

Data split¶

Segregating the independent variables as X and dependent variable as y¶

Now we will split the data into training (70% of the data) and test (the remaining 30%), which will be kept aside for later use.¶

Modeling our Data¶

Linear Regression with scikit learn¶

Calculate the accuracy of the model¶

MAPE of 31.87% means the predicted values are 31.87% deviated from the actual values. We can assume this model has an accuracy of 100%- 31.87% = 68.13% of accuracy. This is rather poor accuracy.¶

Now we will be using Decision Tree to predict the prices¶

Decision Tree Algorithm¶

Calculate the accuracy of the model¶

MAPE of 33.7% shows that Decision Tree has given us a worse result than Linear Regression¶

Now we will be using Random Forest algorithm¶

Calculate the accuracy of the model¶

MAPE of 25.5% shows that Random Forest has given us the best result among all these models.¶

We provide a 100% money back guarantee on learning. It means that each and every student of analytics educator will be able to understand every line of codes and algorithm, else we will refund the money back.¶

You may check out all our instructor led courses from this link. https://www.analyticseducator.com/Courses-Offers.html¶

If you want to read more such Machine Learning case studies then you may go through the following links -¶

https://www.analyticseducator.com/Blog/Logistic-Regression_Titanic-Survival-checkpoint.html¶

https://www.analyticseducator.com/Blog/Cancer-Classification-Support-Vector-Machines.html¶

https://www.analyticseducator.com/Blog/K-Nearest%20Neighbours%20for%20T-shirt%20size%20classification.html¶

You may check out all our instructor led courses from this link. https://www.analyticseducator.com/Courses-Offers.html ¶

https://www.analyticseducator.com/Blog/Logistic-Regression_Titanic-Survival-checkpoint.html ¶

https://www.analyticseducator.com/Blog/Cancer-Classification-Support-Vector-Machines.html ¶

https://www.analyticseducator.com/Blog/K-Nearest%20Neighbours%20for%20T-shirt%20size%20classification.html ¶