
Retention Study - Figuring Out Which Employees May Quit¶

Companies today face tremendous pressure to retain their employees. It is a company's people who take it to the top, so keeping good employees has become a top priority and pays off in the long run.

Many companies have now started using machine learning algorithms to predict which employees are likely to quit. If they can make that prediction before an employee actually resigns, they can take preventive measures to retain the employee.

Today we are going to use a case study to show a step-by-step approach to predicting which employees are likely to resign.

We will use Python to conduct this study and apply two machine learning algorithms: Logistic Regression and an Artificial Neural Network (ANN), an approach popularly known as deep learning.

At the end we will compare the results from both algorithms to determine which one performs better.

Importing the Packages and Loading the Main HR Data¶

In [5]:
# import the packages
import pandas as pd
import os
os.chdir("C:\\Users\\ASUS\\Desktop\\case study")

#import the data
main_df = pd.read_csv("data.csv")
main_df.head()
Out[5]:
employee_id number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years department salary satisfaction_level last_evaluation
0 1003 2 157 3 0 1 0 sales low 0.38 0.53
1 1005 5 262 6 0 1 0 sales medium 0.80 0.86
2 1486 7 272 4 0 1 0 sales medium 0.11 0.88
3 1038 5 223 5 0 1 0 sales low 0.72 0.87
4 1057 2 159 3 0 1 0 sales low 0.37 0.52

Here each row belongs to a different employee and contains that employee's respective data.¶

In this data, a 1 in the left column means the employee has left the company, and a 0 means they are still working. This is our dependent variable - the one we are going to predict.¶

The rest of the columns are all independent variables.¶

Dummy Variables¶

We can see that department and salary are categorical variables, so we are going to create dummy variables for them.

In [6]:
main_df = pd.get_dummies(data=main_df, columns=['department', 'salary'], drop_first=True)
main_df.head()
Out[6]:
employee_id number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years satisfaction_level last_evaluation department_RandD department_accounting department_hr department_management department_marketing department_product_mng department_sales department_support department_technical salary_low salary_medium
0 1003 2 157 3 0 1 0 0.38 0.53 0 0 0 0 0 0 1 0 0 1 0
1 1005 5 262 6 0 1 0 0.80 0.86 0 0 0 0 0 0 1 0 0 0 1
2 1486 7 272 4 0 1 0 0.11 0.88 0 0 0 0 0 0 1 0 0 0 1
3 1038 5 223 5 0 1 0 0.72 0.87 0 0 0 0 0 0 1 0 0 1 0
4 1057 2 159 3 0 1 0 0.37 0.52 0 0 0 0 0 0 1 0 0 1 0

Now we will check if there are missing values in our data¶

The output below shows the total number of missing values per variable.

We can see that only satisfaction_level and last_evaluation have missing values.¶

In [7]:
main_df.isnull().sum()
Out[7]:
employee_id                0
number_project             0
average_montly_hours       0
time_spend_company         0
Work_accident              0
left                       0
promotion_last_5years      0
satisfaction_level        27
last_evaluation           27
department_RandD           0
department_accounting      0
department_hr              0
department_management      0
department_marketing       0
department_product_mng     0
department_sales           0
department_support         0
department_technical       0
salary_low                 0
salary_medium              0
dtype: int64

We would also like to check the percentage of missing values per variable¶

satisfaction_level and last_evaluation each have less than 0.5% of their values missing, which is negligible. We can either replace the missing values with the respective column means or delete those rows; either way it will not impact the model much, since there are so few missing values.

In [9]:
round(main_df.isnull().mean()*100,2)
Out[9]:
employee_id               0.00
number_project            0.00
average_montly_hours      0.00
time_spend_company        0.00
Work_accident             0.00
left                      0.00
promotion_last_5years     0.00
satisfaction_level        0.18
last_evaluation           0.18
department_RandD          0.00
department_accounting     0.00
department_hr             0.00
department_management     0.00
department_marketing      0.00
department_product_mng    0.00
department_sales          0.00
department_support        0.00
department_technical      0.00
salary_low                0.00
salary_medium             0.00
dtype: float64

Now we will delete all the rows with missing values¶

In [12]:
main_df = main_df.dropna()
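As mentioned above, the missing values could instead be replaced with the column means rather than dropped. A minimal sketch of that alternative (not run in this notebook; it assumes the same main_df and column names as above):

In [ ]:
# Alternative to dropna(): fill the few missing values with the column means
main_df['satisfaction_level'] = main_df['satisfaction_level'].fillna(
    main_df['satisfaction_level'].mean())
main_df['last_evaluation'] = main_df['last_evaluation'].fillna(
    main_df['last_evaluation'].mean())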

Now we will drop the employee id¶

If there is any column in which all the values are the same, or in which every value is different without quantifying anything, we should delete that column. Here employee_id doesn't quantify anything; like a name, it merely identifies each employee. Hence we will delete it.

In [14]:
# Removing employee ID
main_df.drop(columns='employee_id',inplace=True)
main_df.head(2)
Out[14]:
number_project average_montly_hours time_spend_company Work_accident left promotion_last_5years satisfaction_level last_evaluation department_RandD department_accounting department_hr department_management department_marketing department_product_mng department_sales department_support department_technical salary_low salary_medium
0 2 157 3 0 1 0 0.38 0.53 0 0 0 0 0 0 1 0 0 1 0
1 5 262 6 0 1 0 0.80 0.86 0 0 0 0 0 0 1 0 0 0 1

We will check the number of 1s (resigned) and 0s (still working) in the left variable¶

The data looks reasonably balanced, since the ratio of 0s to 1s is roughly 3:1, which is not extreme.

In [15]:
main_df['left'].value_counts()
Out[15]:
0    11407
1     3549
Name: left, dtype: int64
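If you prefer to see the split as proportions rather than raw counts, value_counts can also normalise (a quick optional check, not part of the original run):

In [ ]:
# Optional: class proportions instead of raw counts
main_df['left'].value_counts(normalize=True)
# roughly 0.76 for 0 and 0.24 for 1, given the counts above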

Data split¶

Segregating the independent variables as X and the dependent variable as y¶

In [18]:
# We remove the label values from our training data
X = main_df.drop(['left'],axis=1)

# We assign the label values to y
y = main_df['left']

Now we will split the data into a training set (70% of the data) and a test set (the remaining 30%), which will be kept aside for later evaluation.¶

In [19]:
# Split it to a 70:30 Ratio Train:Test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
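Note that train_test_split shuffles the rows randomly, so the exact figures below can vary slightly between runs. If you want a reproducible split, you can pass a fixed random_state (an optional variation; the results in this post were produced without it):

In [ ]:
# Optional: fix the random seed so the same split is produced every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)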

Now we will perform feature scaling¶

This is done so that all the independent variables are on a comparable scale.

In [20]:
# Standardize the features (zero mean, unit variance)
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()
# Fit the scaler on the training data only, then apply the same
# transformation to the test data to avoid information leakage
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

Now let's Train a Logistic Regression Model¶

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

model = LogisticRegression()
model.fit(X_train, y_train)
Out[22]:
LogisticRegression()

EVALUATING THE MODEL¶

Once the model is trained, we will use it to predict on the test data.¶

In [23]:
predictions = model.predict(X_test)

print("Accuracy {0:.2f}%".format(100*accuracy_score(predictions, y_test)))
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
Accuracy 79.01%
[[3180  265]
 [ 677  365]]
              precision    recall  f1-score   support

           0       0.82      0.92      0.87      3445
           1       0.58      0.35      0.44      1042

    accuracy                           0.79      4487
   macro avg       0.70      0.64      0.65      4487
weighted avg       0.77      0.79      0.77      4487

Overall accuracy is 79%, and precision for 0 and 1 is 82% and 58% respectively. This means that of all the employees the model predicted would stay (0), 82% actually stayed, while of all the employees it predicted would resign (1), only 58% actually resigned. Its recall for class 1 is just 35%, so it also misses most of the employees who actually left.¶
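As a sanity check, these figures can be recomputed by hand from the confusion matrix printed above (a small illustrative sketch, assuming predictions and y_test from the previous cell are still in memory):

In [ ]:
# Recomputing class-1 precision and recall from the confusion matrix
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, predictions)   # rows = actual, columns = predicted
tn, fp, fn, tp = cm.ravel()

precision_1 = tp / (tp + fp)   # of all predicted resignations, the share that truly resigned
recall_1 = tp / (tp + fn)      # of all actual resignations, the share the model caught
print(round(precision_1, 2), round(recall_1, 2))   # about 0.58 and 0.35 for this run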


Now let's Train an Artificial Neural Network Model¶

In [25]:
# Importing the Keras libraries and packages
# (on current TensorFlow versions Keras lives under tensorflow.keras;
#  the old tensorflow.contrib.keras path has been removed)
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras import backend

We will first initialize the ANN and then add some hidden layers into it.¶

In [32]:
# Initialising the ANN
classifier = Sequential()

# Adding the input layer and first hidden layer
# Rule of thumb for units: (no. of independent vars + no. of dependent vars) / 2,
# here (18 + 1) / 2 rounded up to 10
# Weights are initialised with the "uniform" initializer
# ReLU (rectified linear unit) activation for the hidden layers,
# sigmoid for the output layer
classifier.add(Dense(units=10,
                     kernel_initializer='uniform', activation='relu',
                     input_dim=18))

# Adding the second hidden layer (same as above, without input_dim)
classifier.add(Dense(units=10,
                     kernel_initializer='uniform', activation='relu'))

# Adding the output layer
# The output is binary, hence units=1
# Sigmoid gives us a probability for the binary dependent variable
# (for a multi-class dependent variable, softmax would be used instead)
classifier.add(Dense(units=1,
                     kernel_initializer='uniform', activation='sigmoid'))

Now we will compile the model and train it, keeping aside 10% of the training data as a validation holdout sample¶

In [33]:
# Compiling the ANN
# The weights are optimised with the "adam" optimizer,
# a variant of stochastic gradient descent
# For a binary dependent variable the loss is 'binary_crossentropy'
# Accuracy is the metric to be monitored
classifier.compile(optimizer='adam',
                   loss='binary_crossentropy', metrics=['accuracy'])

# Fitting the ANN to the Training set
# batch_size is the number of observations after which the weights are updated
# A common rule of thumb is batch_size=10; here we train for 20 epochs
# validation_split=0.1 keeps 10% of the training data aside for validation
classifier.fit(X_train, y_train, batch_size=10,
               epochs=20, validation_split=0.1)
Train on 9422 samples, validate on 1047 samples
WARNING:tensorflow:From C:\Users\ASUS\anaconda3\envs\py36\lib\site-packages\tensorflow\python\ops\math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.
Epoch 1/20
9422/9422 [==============================] - 1s 124us/sample - loss: 0.4024 - acc: 0.8278 - val_loss: 0.2636 - val_acc: 0.8854
Epoch 2/20
9422/9422 [==============================] - 1s 94us/sample - loss: 0.2042 - acc: 0.9305 - val_loss: 0.1977 - val_acc: 0.9398
Epoch 3/20
9422/9422 [==============================] - 1s 94us/sample - loss: 0.1590 - acc: 0.9538 - val_loss: 0.1801 - val_acc: 0.9446
Epoch 4/20
9422/9422 [==============================] - 1s 99us/sample - loss: 0.1456 - acc: 0.9574 - val_loss: 0.1697 - val_acc: 0.9484
Epoch 5/20
9422/9422 [==============================] - 1s 100us/sample - loss: 0.1396 - acc: 0.9594 - val_loss: 0.1690 - val_acc: 0.9494
Epoch 6/20
9422/9422 [==============================] - 1s 99us/sample - loss: 0.1344 - acc: 0.9605 - val_loss: 0.1615 - val_acc: 0.9503
Epoch 7/20
9422/9422 [==============================] - 1s 102us/sample - loss: 0.1310 - acc: 0.9612 - val_loss: 0.1629 - val_acc: 0.9475
Epoch 8/20
9422/9422 [==============================] - 1s 98us/sample - loss: 0.1287 - acc: 0.9617 - val_loss: 0.1590 - val_acc: 0.9465
Epoch 9/20
9422/9422 [==============================] - 1s 99us/sample - loss: 0.1264 - acc: 0.9637 - val_loss: 0.1541 - val_acc: 0.9561
Epoch 10/20
9422/9422 [==============================] - 1s 103us/sample - loss: 0.1245 - acc: 0.9643 - val_loss: 0.1512 - val_acc: 0.9532
Epoch 11/20
9422/9422 [==============================] - 1s 101us/sample - loss: 0.1227 - acc: 0.9636 - val_loss: 0.1512 - val_acc: 0.9532
Epoch 12/20
9422/9422 [==============================] - 1s 100us/sample - loss: 0.1216 - acc: 0.9661 - val_loss: 0.1477 - val_acc: 0.9542
Epoch 13/20
9422/9422 [==============================] - 1s 99us/sample - loss: 0.1205 - acc: 0.9655 - val_loss: 0.1459 - val_acc: 0.9561
Epoch 14/20
9422/9422 [==============================] - 1s 102us/sample - loss: 0.1194 - acc: 0.9657 - val_loss: 0.1453 - val_acc: 0.9561
Epoch 15/20
9422/9422 [==============================] - 1s 98us/sample - loss: 0.1179 - acc: 0.9664 - val_loss: 0.1415 - val_acc: 0.9551
Epoch 16/20
9422/9422 [==============================] - 1s 98us/sample - loss: 0.1172 - acc: 0.9662 - val_loss: 0.1449 - val_acc: 0.9522
Epoch 17/20
9422/9422 [==============================] - 1s 102us/sample - loss: 0.1159 - acc: 0.9665 - val_loss: 0.1404 - val_acc: 0.9542
Epoch 18/20
9422/9422 [==============================] - 1s 99us/sample - loss: 0.1154 - acc: 0.9672 - val_loss: 0.1408 - val_acc: 0.9561
Epoch 19/20
9422/9422 [==============================] - 1s 97us/sample - loss: 0.1150 - acc: 0.9667 - val_loss: 0.1382 - val_acc: 0.9570
Epoch 20/20
9422/9422 [==============================] - 1s 102us/sample - loss: 0.1141 - acc: 0.9669 - val_loss: 0.1440 - val_acc: 0.9503
Out[33]:
<tensorflow.python.keras.callbacks.History at 0x22878c50>

EVALUATING THE MODEL¶

Once the model is trained, we will use it to predict on the test data.¶

In [38]:
import numpy as np
# Making predictions and evaluating the model
# Predicting the Test set results: classifier.predict returns probabilities,
# so we convert them to 0/1 labels using a 0.5 threshold
y_pred = classifier.predict(X_test)
y_pred = np.where(y_pred > 0.5, 1, 0)

Now we will compute the confusion matrix, classification report, and accuracy for the ANN predictions.¶

In [39]:
# Making the Confusion Matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))
[[3381   64]
 [  96  946]]
              precision    recall  f1-score   support

           0       0.97      0.98      0.98      3445
           1       0.94      0.91      0.92      1042

    accuracy                           0.96      4487
   macro avg       0.95      0.94      0.95      4487
weighted avg       0.96      0.96      0.96      4487

0.9643414308000892

Overall accuracy is 96%, and precision for 0 and 1 is 97% and 94% respectively.¶

Precision for class 1 means that of all the employees the ANN predicted would resign, 94% actually resigned; its recall of 91% means it correctly identified 91% of the employees who actually left.¶

We can see that the ANN provides a huge improvement over what we got with Logistic Regression.¶
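To make the comparison explicit, the two test-set accuracies can be printed side by side (a small optional sketch, assuming the logistic regression predictions and the ANN y_pred from the cells above are still in memory):

In [ ]:
# Side-by-side comparison of both models on the same test set
from sklearn.metrics import accuracy_score

print("Logistic Regression accuracy: {0:.2f}%".format(100 * accuracy_score(y_test, predictions)))
print("ANN accuracy: {0:.2f}%".format(100 * accuracy_score(y_test, y_pred)))
# roughly 79% vs 96% for the runs shown above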

If you wish to know more about our courses, please visit the following link:¶

https://www.analyticseducator.com/Courses-Offers.html

We also provide training on data science and offer one-on-one help with assignments. You may write to us at analyticseducator@gmail.com, call us at +91 9163223228, or WhatsApp us at 9804919166 for more information.¶
