import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
import pandas as pd
import scipy as sp
from sklearn import mixture
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
%matplotlib inline
# Pre Process- Data
# Read TSV with \t
# Integer codes for every categorical label in the raw frame.
# One table is applied to all columns at once, so a label gets the same code
# wherever it occurs; 'Medical' is 6 because 'Human Resources' already uses 2
# in another column (original author's note).
mymap = {
    'Yes': 1, 'No': 0,
    'Travel_Rarely': 1, 'Travel_Frequently': 2, 'Non-Travel': 3,
    'Research & Development': 1, 'Human Resources': 2, 'Sales': 3,
    'Life Sciences': 1, 'Medical': 6, 'Technical Degree': 3, 'Marketing': 4, 'Other': 5,
    'Female': 1, 'Male': 2,
    'Research Scientist': 1, 'Laboratory Technician': 2, 'Healthcare Representative': 3,
    'Manufacturing Director': 4, 'Manager': 5, 'Sales Representative': 6,
    'Research Director': 7, 'Sales Executive': 8,
    'Single': 1, 'Married': 2, 'Divorced': 3,
}
# Cell-wise recode: labels found in the table become their code, every other
# value (e.g. numbers) passes through unchanged.
dfh = df.applymap(lambda cell: mymap.get(cell, cell))
# In mymap == Yes =1 and No =0 - replacements made in both Attrition and OverTime
#dfh.to_csv('dfh_05DEC.csv') # Ok for down csv
#InterimDF Dropped-Attr,EmployeeCount,EmployeeNumber ,Over18 and StandardHours
# Feature frame: remove the five columns at positions 1, 8, 9, 21 and 26.
# The positions are resolved against `df`'s column index and the resulting
# labels are dropped from the recoded frame `dfh`; inplace=False returns a
# new frame and leaves `dfh` untouched.
df1 = dfh.drop(df.columns[[1,8,9,21,26]],axis=1,inplace=False)
print df1.shape
# Remaining column labels, printed for a visual check of what was kept.
names = df1.columns.values
print "________________________________________"
print names
print "________________________________________"
# Target frame: a one-column DataFrame holding only the recoded "Attrition".
df2 = pd.DataFrame(dfh["Attrition"]) # Interim DF only - Attr
names1 = df2.columns.values
print names1
print df2.shape
# Class balance of the target.
print df2["Attrition"].value_counts() # Here - 0 == Live Employee , 1 == Exited Employee / Attrited Employee
# Convert DF to Numpy Array
# 1st Numpy Array == X , only features
# 2nd Numpy Array == y , only target Labels
import numpy as np
# Convert the two frames to NumPy arrays: X = first 30 columns of df1
# (features), y = the single column of df2 (target labels).
# NOTE(review): the slice is positional; confirm df1 really has the intended
# 30 feature columns in this range (see the open AGE question below).
X = df1.iloc[:,0:30].values # All Features of - df1 besides Attr and AGE #TBD --- Need to ADD AGE ???
y = df2.iloc[:,0].values # Choosing only 1 - Target Feature from - dfh
print X.shape
print y.shape
print "_________________________________________________________"
# This file uses Python 2 print statements, so the parenthesised pairs below
# are printed as tuples (label, value) rather than as two separate items.
print('Target Variable "Attrition":', (y))
print "_________________________________________________________"
print('Class labels for Target Variable "Attrition":', np.unique(y))
print "_________________________________________________________"
# The mean of a 0/1 column is the share of 1s; the value printed is a
# fraction in [0, 1] even though the label says "Percentage".
print('Percentage of Class Label ==1 = {:.4f}'.format(df2["Attrition"].mean()))
print('Percentage of Class Label ==0 = {:.4f}'.format(1-df2["Attrition"].mean()))
print "_________________________________________________________"
# Baseline reminder: the 83.88% figure is hard-coded in the messages, not
# computed from the data at run time.
print "Model that Predicts 83.88% Accuracy is Non Predictor OR NO_Model- as it will always predict Dominant Class"
print "This dataset Dominant Class = ZERO or LIVE EMPLOYEE - we need more than 83.88% Accuracy Score."
Source :--
Some classification problems can exhibit a large imbalance in the distribution of the target classes: for instance
there could be several times more negative samples than positive samples. In such cases it is recommended to
use stratified sampling as implemented in StratifiedKFold and StratifiedShuffleSplit to ensure that relative
class frequencies are approximately preserved in each train and validation fold.
from sklearn.model_selection import StratifiedShuffleSplit
print "__________________________________"
%time sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=123) # test_size=0.3 thus TRAIN_size =0.7 OR 70%
sss.get_n_splits(X, y)
print "__________________________________"
print "__________________________________"
for train_index, test_index in sss.split(X, y):
# print("TRAIN:", train_index, "TEST:", test_index) # Printing INDEX Values not ACTUAL Feature Values
X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]
print X_train.shape
print y_train.shape
print X_test.shape
print y_test.shape
np.savetxt("X_train.csv", X_train, delimiter=",") # Numpy arrays saved as CSV's
np.savetxt("y_train.csv", y_train, delimiter=",")
np.savetxt("X_test.csv", X_test, delimiter=",")
np.savetxt("y_test.csv", y_test, delimiter=",")
# Source --
# Gaussian Naive bayes -GaussianNB as 1st Classifier without any Feature Scaling
# Gaussian Naive Bayes on the raw (unscaled) training features.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# Fit and predict on the same training data: the count below is therefore a
# training (resubstitution) error, not a held-out estimate.
# NOTE(review): the fit call was garbled in this copy ("y_pred =, y_train)");
# restored as a fit on X_train to match the predict(X_train) that follows.
y_pred = gnb.fit(X_train, y_train).predict(X_train)
print("Number of mislabeled points out of a total %d points : %d" % (X_train.shape[0],(y_train != y_pred).sum()))
# Cells below with Scaler and STD Scaler data - those are not Best options as regards ACCURACY scores
# but we can not classify with Non Scaled Data as the Categorical Features we have are all having Diff Scales
# Thus with this data set Naive Bayes is not a Good Choice of Classifier .
# Source -
# Source -
# 1st RUN - Multinomial Naive Bayes - with X_train_scaled and y_train
# 2nd RUN - Multinomial Naive Bayes - with X_train_sc and y_train
# Instantiated - Multinomial Naive Bayes - but didnt Fit or Predict as MNB cant be used with Negative Values
# in the X_train it throws an error with scaled data
# When we use X_train_scaled or X_train_sc
# We seem to be getting a Sparse Matrix or negative Value Error at location ---
/home/dhankar/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _count(self, X, Y)
688 """Count and smooth feature occurrences."""
689 if np.any((X.data if issparse(X) else X) < 0):
--> 690 raise ValueError("Input X must be non-negative")
691 self.feature_count_ += safe_sparse_dot(Y.T, X)
692 self.class_count_ += Y.sum(axis=0)
ValueError: Input X must be non-negative
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
# train the model using X_train_scaled (timing it with an IPython "magic command")
%time, y_train)
# Source -
# Source -
# All Code below this - OK
# How is STANDARD SCALER diff from this SCALER ?? - On the STANDARD SCALER official documentation page - Scaler
# is mentioned as - "Equivalent function without the object oriented API." But data set pre-processed with
# STANDARD SCALER provides a higher Accuracy Score upon model evaluation .
# Other Scalers given - required / not required ??
# MIN MAX Scaler Not Required - Not implemented for this data set.
##### simple scale --- mean ==0 variance ==1
# Catch 22 - Docs for Scaler state dont Scale the Target Feature - BUT - MLP - Neural Net requires this ??
## 1st RUN - Scale X_train ,X_test , y_train and y_test.
# 2nd RUN - STANDARD SCALER X_train ,X_test , y_train and y_test.
# Standardize the training data with the functional `preprocessing.scale`
# API and print diagnostics (shape, type, per-column mean and std).
from sklearn import preprocessing
from sklearn.preprocessing import scale
X_train_scaled = preprocessing.scale(X_train)
print X_train_scaled.shape
print type(X_train_scaled)
print "_________________________________"
# The target vector is scaled too, which turns the class labels into
# continuous values. No model fit visible in this file uses y_train_scaled;
# it is only inspected below.
y_train_scaled = preprocessing.scale(y_train)
print y_train_scaled.shape
print type(y_train_scaled)
print "_________________________________"
#print X_train_scaled # Ok Not required
print "_________________________________"
# Means on the order of e-16 / e-17 are floating-point round-off, i.e. zero
# to machine precision.
print X_train_scaled.mean(axis=0) # Means Exponential e-16 or e-17, Why not ZERO's ? format the Floating Points
print "_________________________________"
print X_train_scaled.std(axis=0)
print "_________________X-train-scaled___________________________________________________________________"
# NOTE: the lines labelled "Variance" print .std() (standard deviation).
# Mean and std are shown for features 0-1, mean only for features 2-8.
print('Feature == 0 -- Mean after Rescaling = {:.8f}'.format(X_train_scaled[:,0].mean()))
print('Feature == 0 -- Variance after Rescaling = {:.8f}'.format(X_train_scaled[:,0].std()))
print('Feature == 1 -- Mean after Rescaling = {:.8f}'.format(X_train_scaled[:,1].mean()))
print('Feature == 1 -- Variance after Rescaling = {:.8f}'.format(X_train_scaled[:,1].std()))
print('Feature == 2 -- Mean after Rescaling = {:.8f}'.format(X_train_scaled[:,2].mean()))
print('Feature == 3 -- Mean after Rescaling = {:.8f}'.format(X_train_scaled[:,3].mean()))
print('Feature == 4 -- Mean after Rescaling = {:.8f}'.format(X_train_scaled[:,4].mean()))
print('Feature == 5 -- Mean after Rescaling = {:.8f}'.format(X_train_scaled[:,5].mean()))
print('Feature == 6 -- Mean after Rescaling = {:.8f}'.format(X_train_scaled[:,6].mean()))
print('Feature == 7 -- Mean after Rescaling = {:.8f}'.format(X_train_scaled[:,7].mean()))
print('Feature == 8 -- Mean after Rescaling = {:.8f}'.format(X_train_scaled[:,8].mean()))
print "_________________y-train-scaled______________________________________________________________"
# Same two statistics for the scaled target (labelled "Feature == 0").
print('Feature == 0 -- Mean after Rescaling = {:.8f}'.format(y_train_scaled.mean()))
print('Feature == 0 -- Variance after Rescaling = {:.8f}'.format(y_train_scaled.std()))
print "_____________________________________________________________________________________________"
# Standardize the test data with `preprocessing.scale` and print diagnostics.
# NOTE(review): scale() is called on X_test alone, so the test set is
# standardized with its OWN mean/std rather than the training statistics;
# train and test therefore do not share one transformation.
X_test_scaled = preprocessing.scale(X_test)
print X_test_scaled.shape
#print type(X_test_scaled)
print "_________________________________"
# Scaled copy of the test labels; only inspected below.
y_test_scaled = preprocessing.scale(y_test)
print y_test_scaled.shape
#print type(y_test_scaled)
print "_________________________________"
#print X_test_scaled # Ok Not required
print "_________________________________"
print X_test_scaled.mean(axis=0) #
print "_________________________________"
print X_test_scaled.std(axis=0)
print "________________X-Test-Scaled_______________________________________________________________"
# NOTE: the lines labelled "Variance" print .std() (standard deviation).
# Mean and std are shown for features 0-1, mean only for features 2-5.
print('Feature == 0 -- Mean after Rescaling = {:.8f}'.format(X_test_scaled[:,0].mean()))
print('Feature == 0 -- Variance after Rescaling = {:.8f}'.format(X_test_scaled[:,0].std()))
print('Feature == 1 -- Mean after Rescaling = {:.8f}'.format(X_test_scaled[:,1].mean()))
print('Feature == 1 -- Variance after Rescaling = {:.8f}'.format(X_test_scaled[:,1].std()))
print('Feature == 2 -- Mean after Rescaling = {:.8f}'.format(X_test_scaled[:,2].mean()))
print('Feature == 3 -- Mean after Rescaling = {:.8f}'.format(X_test_scaled[:,3].mean()))
print('Feature == 4 -- Mean after Rescaling = {:.8f}'.format(X_test_scaled[:,4].mean()))
print('Feature == 5 -- Mean after Rescaling = {:.8f}'.format(X_test_scaled[:,5].mean()))
print "________________y-Test-Scaled_______________________________________________________________"
# Same two statistics for the scaled test labels (labelled "Feature == 0").
print('Feature == 0 -- Mean after Rescaling = {:.8f}'.format(y_test_scaled.mean()))
print('Feature == 0 -- Variance after Rescaling = {:.8f}'.format(y_test_scaled.std()))
# Need to check - why -0.0000 values for Mean on Rescaling and do these impact the Predictions ?
# Standardizing and Rescaling -
# Standardize with the object-oriented StandardScaler API: statistics are
# learned on the training data and the SAME transformation is then applied
# to the test data.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc1 = StandardScaler()
#sc = preprocessing.StandardScaler().fit(X_train) # single line option chained code
# The scaler must be fitted before transform(); the fit calls were missing
# from this copy of the notebook.
sc.fit(X_train)
X_train_sc = sc.transform(X_train)
# Test data reuses the training mean/std. Re-fitting on X_test would give the
# test set a different transformation, which is why these values differ from
# the preprocessing.scale(X_test) results.
X_test_sc = sc.transform(X_test)
# StandardScaler expects 2-D input, so the 1-D label vectors are reshaped to
# a single column for fit/transform and flattened back afterwards.
sc1.fit(y_train.astype(float).reshape(-1, 1))
y_train_sc1 = sc1.transform(y_train.astype(float).reshape(-1, 1)).ravel()
y_test_sc1 = sc1.transform(y_test.astype(float).reshape(-1, 1)).ravel()
# The means and STD values for X_test arent same as above with SCALER ?
# Diagnostics for the StandardScaler-transformed training data: shape, type,
# per-column mean and std, then a few per-feature values.
print X_train_sc.shape
print type(X_train_sc)
print "_________________________________"
#print X_test_scaled # Ok Not required
print "_________________________________"
print X_train_sc.mean(axis=0) #
print "_________________________________"
print X_train_sc.std(axis=0)
print "________________X-Train-sc_______________________________________________________________"
# NOTE: the lines labelled "Variance" print .std() (standard deviation).
# Features 0-1 use 4 decimals, features 2-5 (mean only) use 8 decimals.
print('Feature == 0 -- Mean after Rescaling = {:.4f}'.format(X_train_sc[:,0].mean()))
print('Feature == 0 -- Variance after Rescaling = {:.4f}'.format(X_train_sc[:,0].std()))
print('Feature == 1 -- Mean after Rescaling = {:.4f}'.format(X_train_sc[:,1].mean()))
print('Feature == 1 -- Variance after Rescaling = {:.4f}'.format(X_train_sc[:,1].std()))
print('Feature == 2 -- Mean after Rescaling = {:.8f}'.format(X_train_sc[:,2].mean()))
print('Feature == 3 -- Mean after Rescaling = {:.8f}'.format(X_train_sc[:,3].mean()))
print('Feature == 4 -- Mean after Rescaling = {:.8f}'.format(X_train_sc[:,4].mean()))
print('Feature == 5 -- Mean after Rescaling = {:.8f}'.format(X_train_sc[:,5].mean()))
print "________________y-Train-sc_______________________________________________________________"
# Same two statistics for the scaled training labels (labelled "Feature == 0").
print('Feature == 0 -- Mean after Rescaling = {:.4f}'.format(y_train_sc1.mean()))
print('Feature == 0 -- Variance after Rescaling = {:.4f}'.format(y_train_sc1.std()))
# Diagnostics for the StandardScaler-transformed test data: shape, type,
# per-column mean and std, then a few per-feature values.
print X_test_sc.shape
print type(X_test_sc)
print "_________________________________"
#print X_test_scaled # Ok Not required
print "_________________________________"
print X_test_sc.mean(axis=0) #
print "_________________________________"
print X_test_sc.std(axis=0)
print "_____________X-Test-sc__________________________________________________________________"
# NOTE: the lines labelled "Variance" print .std() (standard deviation).
# Features 0-1 use 4 decimals, features 2-5 (mean only) use 8 decimals.
print('Feature == 0 -- Mean after Rescaling = {:.4f}'.format(X_test_sc[:,0].mean()))
print('Feature == 0 -- Variance after Rescaling = {:.4f}'.format(X_test_sc[:,0].std()))
print('Feature == 1 -- Mean after Rescaling = {:.4f}'.format(X_test_sc[:,1].mean()))
print('Feature == 1 -- Variance after Rescaling = {:.4f}'.format(X_test_sc[:,1].std()))
print('Feature == 2 -- Mean after Rescaling = {:.8f}'.format(X_test_sc[:,2].mean()))
print('Feature == 3 -- Mean after Rescaling = {:.8f}'.format(X_test_sc[:,3].mean()))
print('Feature == 4 -- Mean after Rescaling = {:.8f}'.format(X_test_sc[:,4].mean()))
print('Feature == 5 -- Mean after Rescaling = {:.8f}'.format(X_test_sc[:,5].mean()))
print "____________y-Test-sc___________________________________________________________________"
# Same two statistics for the scaled test labels (labelled "Feature == 0").
print('Feature == 0 -- Mean after Rescaling = {:.4f}'.format(y_test_sc1.mean()))
print('Feature == 0 -- Variance after Rescaling = {:.4f}'.format(y_test_sc1.std()))
The disadvantages of Multi-layer Perceptron (MLP) include:
1/ MLP with hidden layers have a non-convex loss function where there exists more than one local minimum.
#Therefore different random weight initializations can lead to different validation accuracy.
2/ MLP requires tuning a number of hyperparameters such as the number of hidden neurons, layers,
#and iterations.
3/ MLP is sensitive to feature scaling.
Scaling Data - Train and Test sets both for MLP - Multi-layer Perceptron is sensitive to feature scaling,
so it is highly recommended to scale your data. For example, scale each attribute on the input
vector X to [0, 1] or [-1, +1], or standardize it to have mean 0 and variance 1. Note that you must apply
the same scaling to the test set for meaningful results. You can use StandardScaler for standardization.
# Neural Network - Multi-layer Perceptron (MLP)
# Multi-layer Perceptron classifier, scored with 5-fold cross-validation.
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
#clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
#clf = MLPClassifier(solver='adam', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
# (a commented-out fit call ending in "y_test_sc1)" followed here; its first
# argument was lost in this copy of the notebook)
# NOTE(review): the fit call below was garbled ("MLPClassifier(random_state=2),y_train)").
# It is restored on the StandardScaler features because MLP is sensitive to
# feature scaling; confirm the intended training matrix.
MLPC = MLPClassifier(random_state=2).fit(X_train_sc, y_train)
# NOTE(review): the cross-validation below is run on the UNSCALED test
# partition (X_test, y_test), not on the data the estimator above was fitted
# on - verify that this is the intended evaluation set.
scores = cross_val_score(MLPC,X_test,y_test,cv=5,scoring='accuracy')
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
# Source --
# Gaussian Naive bayes -GaussianNB as 1st Classifier without any Feature Scaling
# Gaussian Naive Bayes on the raw (unscaled) training features.
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
# Fit and predict on the same training data: the count below is a training
# (resubstitution) error, not a held-out estimate.
# NOTE(review): the fit call was garbled in this copy ("y_pred =, y_train)");
# restored as a fit on X_train to match the predict(X_train) that follows.
# Swap in X_train_scaled / X_train_sc in BOTH places to reproduce the other
# runs listed in the results below.
y_pred = gnb.fit(X_train, y_train).predict(X_train)
print("Number of mislabeled points out of a total %d points : %d" % (X_train.shape[0],(y_train != y_pred).sum()))
RAW DATA -- Number of mislabeled points out of a total 2058 points : 399
STD SCALER --- Number of mislabeled points out of a total 2058 points : 412
SCALER --- Number of mislabeled points out of a total 2058 points : 412
# 1st RUN - Naive Bayes - with X_train_scaled and y_train
# 2nd RUN - Naive Bayes - with X_train_sc and y_train
# Instantiate - Multinomial Naive Bayes
# When we use X_train_scaled or X_train_sc
# We seem to be getting a Sparse Matrix or negative Value Error at location ---
/home/dhankar/anaconda2/lib/python2.7/site-packages/sklearn/naive_bayes.pyc in _count(self, X, Y)
688 """Count and smooth feature occurrences."""
689 if np.any((X.data if issparse(X) else X) < 0):
--> 690 raise ValueError("Input X must be non-negative")
691 self.feature_count_ += safe_sparse_dot(Y.T, X)
692 self.class_count_ += Y.sum(axis=0)
ValueError: Input X must be non-negative
# Basis these Questions --
# We use only GaussianNB and not MultinomialNB with Scaler and STD Scaler Data.
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
# train the model using X_train_scaled (timing it with an IPython "magic command")
%time, y_train) # Just a print of the Fit this is Not used as we cant use Raw Unscaled Data
# 1st RUN - KNN - with X_train_scaled and y_train
# 2nd RUN - KNN - with X_train_sc and y_train
# k-Nearest-Neighbours classifier with a single neighbour.
from sklearn.neighbors import KNeighborsClassifier
# Instantiate kNN model with 1 Neighbour
knn = KNeighborsClassifier(n_neighbors=1)
# Fit kNN model with Train data (occurs in-place)
# NOTE(review): the two fit calls were merged into the comment and garbled in
# this copy. They are restored to mirror the predict step, which keeps the
# preprocessing.scale variant commented out and uses the StandardScaler data.
#knn.fit(X_train_scaled, y_train)
knn.fit(X_train_sc, y_train)
In [97]:
# Predict class labels for the test partition with the fitted kNN model.
# 1st RUN - predict kNN Class with with X_test_scaled
# 2nd RUN - predict kNN Class with with X_test_sc
#y_pred_class_kNN = knn.predict(X_test_scaled)
# Active variant: the StandardScaler-transformed test features.
y_pred_class_kNN = knn.predict(X_test_sc)
# calculate accuracy MODEL EVAL
# calculate accuracy of class predictions
The AUC and ROC ==
Example of Receiver Operating Characteristic (ROC) metric to evaluate classifier output quality
using cross-validation.
ROC curves typically feature true positive rate on the Y axis, and false positive rate on the X axis.
This means that the top left corner of the plot is the “ideal” point - a false positive rate of zero,
and a true positive rate of one. This is not very realistic, but it does mean that a larger area under
the curve (AUC) is usually better.
The “steepness” of ROC curves is also important, since it is ideal to maximize the true positive rate
while minimizing the false positive rate.
0.950113378685 = 95.01%
_____Model Evaluation with AUC Area Under the Curve __________________
0.890597639893 = 89.05%
precision recall f1-score support
0 0.96 0.98 0.97 740
1 0.88 0.80 0.84 142
avg / total 0.95 0.95 0.95 882
##1st RUN_kNN_1 Neighbour - with StandardScaler data .######################_________________________________________
0.945578231293 = 94.55%
_____Model Evaluation with AUC Area Under the Curve __________________
0.878800355589 = 87.88%
precision recall f1-score support
0 0.96 0.98 0.97 737
1 0.88 0.78 0.82 145
avg / total 0.94 0.95 0.94 882
##1st RUN_kNN_1 Neighbour - with SCALER data .######################_________________________________________
Accuracy Score :-
0.943310657596 = 94.33%
_____Model Evaluation with AUC Area Under the Curve __________________
0.871903803865 = 87.19%
precision recall f1-score support
0 0.95 0.98 0.97 737
1 0.87 0.77 0.82 145
avg / total 0.94 0.94 0.94 882
##1st RUN_kNN_1 Neighbour - .######################
kNN of Non PCA Data Set with Seed 123 -- Feature AGE Included
Accuracy Score :-
0.922902494331 = 92.29%
_____Model Evaluation with AUC Area Under the Curve __________________
0.865231834558 = 86.52%
precision recall f1-score support
0 0.96 0.95 0.95 737
1 0.76 0.78 0.77 145
avg / total 0.92 0.92 0.92 882
# PCA data set --- Exactly same with multiple runs - seed or no seed --
Accuracy Score :-
precision recall f1-score support
0 0.84 1.00 0.91 737
1 0.00 0.00 0.00 145 ##### Notice All ZERO's === NO 1's being Predicted ??
avg / total 0.70 0.83 0.76 882
##2nd RUN ...########################
Non PCA Data Set with Seed 123 -- Feature AGE Not Included
#1 _____Model Evaluation Accuracy Score
0.878684807256 == 87.86% , Accuracy Score - which is OK not Good as a Non Model is supposed to have -
As calculated above earlier = 1-dfh["Attrition"].mean() == 83.88%
#2_____Model Evaluation with AUC Area Under the Curve __________________
0.672582229916 = 67.25%
precision recall f1-score support
0 0.89 0.98 0.93 737
1 0.78 0.37 0.50 145 ##### Notice All Non ZERO's ===
avg / total 0.87 0.88 0.86 882
##3rd RUN ...######################
Non PCA Data Set with Seed 123 -- Feature AGE Included
#1 _____Model Evaluation Accuracy Score
0.863945578231 = 86.39%
#2_____Model Evaluation with AUC Area Under the Curve __________________
0.613905394657 = 61.39%
precision recall f1-score support
0 0.87 0.99 0.92 737
1 0.78 0.24 0.37 145
avg / total 0.85 0.86 0.83 882
from sklearn import metrics
print type(y_pred_class_kNN)
print len(y_pred_class_kNN)
print "__________________________________________"
print len(y_test)
print('Logistic Reg Model predicted classes: {}'.format(y_pred_class_kNN))
print('Actual data - Real classes: {}'.format(y_test))
print "__________________________________________"
print metrics.accuracy_score(y_test, y_pred_class_kNN)
print "_____Model Evaluation with AUC Area Under the Curve __________________"
print metrics.roc_auc_score(y_test, y_pred_class_kNN)
print(metrics.classification_report(y_test, y_pred_class_kNN))
# Logistic Regression
# import and instantiate a Logistic Regression model
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
#Logistic Regression
# 1st RUN - RAW DATA --- Train model using X_train - the MODEL FIT step
# 2nd RUN - RAW DATA --- Train model using X_train_scaled - the MODEL FIT step
%time, y_train)
print "____________________________"
# 1st RUN - RAW DATA --- Train model using X_train - the MODEL FIT step
CPU times: user 24 ms, sys: 0 ns, total: 24 ms
Wall time: 326 ms
[[-0.22926078 0.02842471 -0.12015534 0.53251037 0.27349992 -0.00501477
-0.07026319 -0.43308996 0.14688605 -0.05153573 -0.29965872 -0.32116135
-0.21602175 -0.44955459 -0.43029879 -0.14698227 0.08871293 0.38247998
0.73392199 -0.2554244 0.18246334 -0.23867643 0. -0.17085906
-0.29944854 -0.10888722 -0.15189001 0.5286157 -0.47053308 0.51628589
1st Run === Seed=== 123
CPU times: user 68 ms, sys: 4 ms, total: 72 ms
Wall time: 524 ms
[ 0.00057899]
[[ -2.41465872e-02 -4.92080733e-03 -2.34642609e-04 1.68309915e-01
2.70271877e-02 -9.01246489e-03 -2.26917398e-02 -3.27923361e-01
4.00104704e-02 -1.85470147e-03 -2.56502435e-01 -5.68589051e-02
8.84412682e-02 -3.24237113e-01 -3.16467654e-01 -1.25498378e-04
8.15611102e-06 1.60560664e-01 4.71414431e-01 -3.66879867e-02
5.26985015e-02 -1.40384273e-01 4.63188771e-02 -2.98077801e-01
-2.60246042e-02 -1.15840781e-01 -1.15749399e-01 7.66046992e-02
-1.17914023e-01 1.45985087e-01 -1.31963175e-01]]
# make class predictions for X_test # the MODEL PRED
# make class predictions for X_test_scaled # the MODEL PRED == y_pred_class_scaled
y_pred_class_scaled = logreg.predict(X_test_scaled)
print type(y_pred_class_scaled)
# TBD --- Check
print y_pred_class.shape
print y_pred_class
print "__________________________________"
print y_test
# calculate predicted probabilities for X_test(well calibrated)
y_pred_prob = logreg.predict_proba(X_test)[:, 1]
#y_pred_prob #Ok Dont
# calculate accuracy MODEL EVAL
# calculate accuracy of class predictions
from sklearn import metrics
# Evaluate the logistic-regression predictions against the true test labels:
# lengths, raw vectors, accuracy, AUC and the per-class report.
print len(y_pred_class_scaled)
print "__________________________________________"
print len(y_test)
print('Logistic Reg Model predicted classes: {}'.format(y_pred_class_scaled))
print('Actual data - Real classes: {}'.format(y_test))
print "__________________________________________"
print metrics.accuracy_score(y_test, y_pred_class_scaled)
print "_____Model Evaluation with AUC Area Under the Curve __________________"
# NOTE(review): the AUC is computed from the hard class predictions rather
# than from predicted probabilities (y_pred_prob) - confirm this is intended.
print metrics.roc_auc_score(y_test, y_pred_class_scaled)
print(metrics.classification_report(y_test, y_pred_class_scaled))
# Scaler Data --
0.891156462585 = 89.11%
_____Model Evaluation with AUC Area Under the Curve __________________
precision recall f1-score support
0 0.90 0.98 0.94 740
1 0.80 0.43 0.56 142
avg / total 0.88 0.89 0.88 882
# Scaler Data --
0.877551020408 = 87.75%
_____Model Evaluation with AUC Area Under the Curve __________________
0.669133954054 = 66.91%
precision recall f1-score support
0 0.89 0.98 0.93 737
1 0.78 0.36 0.49 145
avg / total 0.87 0.88 0.86 882
# PCA data set --- Exactly same with multiple runs - seed or no seed --
precision recall f1-score support
0 0.84 1.00 0.91 737
1 0.00 0.00 0.00 145 ##### Notice All ZERO's === NO 1's being Predicted ??
avg / total 0.70 0.83 0.76 882
##2nd RUN ...##########################################################################
Non PCA Data Set with Seed 123 -- Feature AGE Not Included
#1 _____Model Evaluation Accuracy Score
0.878684807256 == 87.86% , Accuracy Score - which is OK not Good as a Non Model is supposed to have -
As calculated above earlier = 1-dfh["Attrition"].mean() == 83.88%
#2_____Model Evaluation with AUC Area Under the Curve __________________
0.672582229916 = 67.25%
precision recall f1-score support
0 0.89 0.98 0.93 737
1 0.78 0.37 0.50 145 ##### Notice All Non ZERO's ===
avg / total 0.87 0.88 0.86 882
##3rd RUN ...##############################################################################
Non PCA Data Set with Seed 123 -- Feature AGE Included
#1 _____Model Evaluation Accuracy Score
0.863945578231 = 86.39%
#2_____Model Evaluation with AUC Area Under the Curve __________________
0.613905394657 = 61.39%
precision recall f1-score support
0 0.87 0.99 0.92 737
1 0.78 0.24 0.37 145
avg / total 0.85 0.86 0.83 882
# print the confusion matrix
0 | |
1 | |
2nd RUN ---
array([[722== TN, 15 == FP],
[ 92== FN, 53 == TP]])
3rd RUN ---
[[727 10]
[110 35]]
# Confusion matrix of true labels (first argument) vs. logistic-regression
# predictions; the pasted runs above annotate the layout as
# [[TN, FP], [FN, TP]].
print metrics.confusion_matrix(y_test, y_pred_class_scaled)
# Total 882
#print the True positives # Check Term -- "True positives"
X_test[y_test == y_pred_class]
T_Positives = X_test[y_test == y_pred_class]
print type(T_Positives)
print T_Positives.shape
# (775, 30) == 722 + 53 == Diagonal 1
print T_Positives
In [42]:
# print the True Negatives # Check Term -- "T_Negatives"
X_test[y_test != y_pred_class]
T_Negatives = X_test[y_test != y_pred_class]
print type(T_Negatives)
print T_Negatives.shape
print T_Negatives
# 146 = 145 + 1 --- from the Diagonal of Confusion Matrix seen above
<type 'numpy.ndarray'>
(146, 12)
[[ 31 667 1 ..., 0 0 0]
[ 29 992 1 ..., 2 1 5]
[ 26 342 2 ..., 2 1 2]
[ 34 988 23 ..., 2 0 2]
[ 26 1330 21 ..., 1 0 0]
[ 25 383 9 ..., 2 2 2]]
# print the false positives
X_test[y_test < y_pred_class]
False_Positives = X_test[y_test < y_pred_class]
print type(False_Positives)
print False_Positives.shape
print False_Positives
In [38]:
# print the false negatives
False_Negatives = X_test[y_test > y_pred_class]
print type(False_Negatives)
print False_Negatives.shape
print False_Negatives
# Correlation from DF Data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
# Pairwise Pearson correlation between all feature columns of df1.
corr_df1 = df1.corr(method='pearson')
#print("--------------- CORRELATIONS ---------------")
#print(corr_df1.head(len(df1))) # Not required as we are plotting the Correlation
# We can look at Column 1 of the Print out below - see what all Features have a
# greater than 0.1 Corr value - Negative or Positive both considered .
print("--------------- CREATE A HEATMAP ---------------")
# The matrix is symmetric, so hide the upper triangle (diagonal included) and
# show only the cells below the diagonal. The mask is built as an explicit
# boolean array (True = hidden) instead of a float array holding 0.0 / 1.0.
mask = np.zeros(corr_df1.shape, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Draw the heatmap with a diverging colormap pinned to the full [-1, 1]
# correlation range so colours are comparable across runs.
seaborn.heatmap(corr_df1, cmap='RdYlGn_r', vmax=1.0, vmin=-1.0 , mask = mask, linewidths=2.5)
# Show the plot we reorient the labels for each column and row to make them easier to read.
# Watch this space for more
#Sandbox Code - Not required anymore
# We are now using - from sklearn.model_selection import StratifiedShuffleSplit
#Add version check for recent scikit-learn 0.18 checks
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version
import random
random.seed(123) # if NO seed - we get non-reproducible results
#Split data - 70% training, 30% test set:
if Version(sklearn_version) < '0.18':
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
print X_train.shape
print X_test.shape
print y_train.shape # Pred Variable Only - Attrition-- Train Set
print y_test.shape # Pred Variable Only - Attrition-- Test Set
##### Sandbox from --
# OK
X_train1 = np.array([[1,2,3],[4,5,6],[12,13,14]])
print X_train1.shape
from sklearn import preprocessing
from sklearn.preprocessing import scale
X_trn_scaled = preprocessing.scale(X_train1)
print type(X_trn_scaled)
print "_________________________________"
print X_trn_scaled
print "_________________________________"
print X_trn_scaled.mean(axis=0)
print "_________________________________"
print X_trn_scaled.std(axis=0)
