Friday, 25 November 2016

Big Data HR Analytics - IBM WATSON ANALYTICS Sample Dataset Employee Churn Prediction

import findspark
findspark.init()
import pyspark
sc = pyspark.SparkContext(appName="HR")
 
print sc # Not required 


#<pyspark.context.SparkContext object at 0x7f7d38b41790>
# if we shut down the Notebook Kernel the Pyspark Context also shuts down = Not the ideal way to do the same though!!
<pyspark.context.SparkContext object at 0x7f7d38b41790>
In [2]:
## Dhankar >>
# Converted CSV to TSV 
# Now importing data from TSV 
# raw_d == the SPARK RDD Object
# Print out top 3 Rows 

raw_d = sc.textFile("hr.tsv")

# 
# In the above line of Code - actual Loading of CSV in RDD is Not yet Done 
# Its Done LAZILY - "as and when ABOSULUTELY required" as below - 
#

raw_d.take(3) 
Out[2]:
[u'Age\tAttrition\tBusinessTravel\tDailyRate\tDepartment\tDistanceFromHome\tEducation\tEducationField\tEmployeeCount\tEmployeeNumber\tEnvironmentSatisfaction\tGender\tHourlyRate\tJobInvolvement\tJobLevel\tJobRole\tJobSatisfaction\tMaritalStatus\tMonthlyIncome\tMonthlyRate\tNumCompaniesWorked\tOver18\tOverTime\tPercentSalaryHike\tPerformanceRating\tRelationshipSatisfaction\tStandardHours\tStockOptionLevel\tTotalWorkingYears\tTrainingTimesLastYear\tWorkLifeBalance\tYearsAtCompany\tYearsInCurrentRole\tYearsSinceLastPromotion\tYearsWithCurrManager',
 u'41\tYes\tTravel_Rarely\t1102\tSales\t1\t2\tLife Sciences\t1\t1\t2\tFemale\t94\t3\t2\tSales Executive\t4\tSingle\t5993\t19479\t8\tY\tYes\t11\t3\t1\t80\t0\t8\t0\t1\t6\t4\t0\t5',
 u'49\tNo\tTravel_Frequently\t279\tResearch & Development\t8\t1\tLife Sciences\t1\t2\t3\tMale\t61\t2\t2\tResearch Scientist\t2\tMarried\t5130\t24907\t1\tY\tNo\t23\t4\t4\t80\t1\t10\t3\t3\t10\t7\t1\t7']
In [4]:
from time import time
#
t0 = time()
count1 = raw_d.count()
t1 = time() - t0
print("There are {} ________".format(count1))
print("Count completed in {} seconds".format(round(t1, 3)))

# Values of t1 reduce from 0.254 to 0.16 after three iterations
There are 2941 ________
Count completed in 0.17 seconds
In [5]:
from pprint import pprint

csv_data = raw_d.map(lambda line: line.split("\t"))
t0 = time()
head_rows = csv_data.take(5)
tt = time() - t0

pprint(head_rows[0]) # Prints the Column Headers from the RDD Object
print "_________________________________________________"
print("Parse completed in {} seconds".format(round(tt, 3)))
[u'Age',
 u'Attrition',
 u'BusinessTravel',
 u'DailyRate',
 u'Department',
 u'DistanceFromHome',
 u'Education',
 u'EducationField',
 u'EmployeeCount',
 u'EmployeeNumber',
 u'EnvironmentSatisfaction',
 u'Gender',
 u'HourlyRate',
 u'JobInvolvement',
 u'JobLevel',
 u'JobRole',
 u'JobSatisfaction',
 u'MaritalStatus',
 u'MonthlyIncome',
 u'MonthlyRate',
 u'NumCompaniesWorked',
 u'Over18',
 u'OverTime',
 u'PercentSalaryHike',
 u'PerformanceRating',
 u'RelationshipSatisfaction',
 u'StandardHours',
 u'StockOptionLevel',
 u'TotalWorkingYears',
 u'TrainingTimesLastYear',
 u'WorkLifeBalance',
 u'YearsAtCompany',
 u'YearsInCurrentRole',
 u'YearsSinceLastPromotion',
 u'YearsWithCurrManager']
_________________________________________________
Parse completed in 0.085 seconds
In [6]:
import pandas as pd
import numpy as np

#reading the csv file and storing in pandas dataframe:
h_df=pd.read_csv("HR1.csv",sep=",")
#
h_df.head(5)
Out[6]:
AgeAttritionBusinessTravelDailyRateDepartmentDistanceFromHomeEducationEducationFieldEmployeeCountEmployeeNumber...RelationshipSatisfactionStandardHoursStockOptionLevelTotalWorkingYearsTrainingTimesLastYearWorkLifeBalanceYearsAtCompanyYearsInCurrentRoleYearsSinceLastPromotionYearsWithCurrManager
041YesTravel_Rarely1102Sales12Life Sciences11...18008016405
149NoTravel_Frequently279Research & Development81Life Sciences12...4801103310717
237YesTravel_Rarely1373Research & Development22Other13...28007330000
333NoTravel_Frequently1392Research & Development34Life Sciences14...38008338730
427NoTravel_Rarely591Research & Development21Medical15...48016332222
5 rows × 35 columns
In [7]:
from __future__ import division
import pandas as pd
import numpy as np

# Isolate the target- Feature / Variable / Column 
churn_result = h_df['Attrition'] # Choose the Column - Attrition from the DF 
h_nda = np.where(churn_result == 'Yes',1,0) # Convert 1 Column DF to numpy.ndarray == Numpy Array 

#print h_nda
In [23]:
# 'yes'/'no' converted to 1 and 0
df1=pd.DataFrame(np.where(h_df['Attrition'] == 'Yes', 1, 0))
df1.columns=['Attrition']
#
print df1.head(5)
#
df2 = pd.DataFrame(h_df.BusinessTravel.map( lambda x: 0 if x == 'Non-Travel' else 1 if x == 'Travel_Rarely' else 2 ))
df2.columns=['BusinessTravel']  
#
print df2.head(5) # Not required
#
df3 = pd.DataFrame(h_df.Department.map( lambda x: 0 if x == 'Human Resources' else 1 if x == 'Sales' else 2 ))
df3.columns=['Department']  
#
print df3.head(5) # Not required
#
df4 = pd.DataFrame(h_df.DistanceFromHome.map( lambda x: 1 if (x <= 10) else 2 if (x <= 20) else 3 ))
df4.columns=['DistanceFromHome']
#
print df4.head(5) # Not required
#
df5 = pd.DataFrame(h_df.EducationField.map( lambda x: 1 if (x == 'Life Sciences') else 2 if (x == 'Medical') else 3 if (x == 'Marketing') else 4 if (x == 'Human Resources') else 0 ))
df5.columns=['EducationField']
#
print df5.head(5) # Not required
#
df6= pd.DataFrame(h_df.Gender.map( lambda x: 1 if (x == 'Male') else 2 ))
df6.columns=['Gender']
#
print df6.head(5) # Not required
#
df7= pd.DataFrame(h_df.JobRole.map( lambda x: 9 if (x == 'Healthcare Representative') else 8 if (x == 'Human Resources') else 7 if (x == 'Laboratory Technician') else 6 if (x == 'Manufacturing Director') else 5 if (x == 'Manager') else 4 if (x == 'Research Scientist') else 3 if (x == 'Research Director') else 2 if (x == 'Sales Executive') else 1 if (x == 'Sales Representative')else 0))#
df7.columns=['JobRole']
#
print df7.head(5) # Not required
#
# MaritalStatus
df8= pd.DataFrame(h_df.MaritalStatus.map( lambda x: 0 if (x == 'Single') else 1 if (x == 'Married') else 2 ))
df8.columns=['MaritalStatus']
#
print df8.head(5) # Not required
#
# OverTime
df9= pd.DataFrame(h_df.OverTime.map( lambda x: 0 if (x == 'Yes') else 1 ))
df9.columns=['OverTime']
#
print df9.head(5) # Not required
#
# TotalWorkingYears
df10= pd.DataFrame(h_df.TotalWorkingYears.map( lambda x: 0 if (x <= 5) else 1 if (x <= 10) else 2  if (x <= 20) else 3 if (x <= 30) else 4))
df10.columns=['TotalWorkingYears']
#
print df10.head(5) # Not required
#
# Dropping Columns ==> col_drop =  ['Attrition','BusinessTravel','Department','DistanceFromHome', 'EducationField','Gender','JobRole']
# "Over18" == All YES , "StandardHours" == All 80 , Column Dropped ....

col_drop =  ['Attrition','BusinessTravel','Department','DistanceFromHome','EducationField','Gender','JobRole','MaritalStatus','Over18','StandardHours','OverTime','TotalWorkingYears']

df_1= h_df.drop(col_drop,axis=1)

print df_1.head(5)
# Concat the DF's 
#
temp_df_ls = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10 ,df_1]
h1_df = pd.concat(temp_df_ls,axis=1, join='outer')

print h1_df.head(15)
# to CSV 
# We now have 33 columns all with Numeric Values - No String Values , 2 Columns have been Dropped....
h1_df.to_csv('dfh.csv',sep=',')
   Attrition
0          1
1          0
2          1
3          0
4          0
   BusinessTravel
0               1
1               2
2               1
3               2
4               1
   Department
0           1
1           2
2           2
3           2
4           2
   DistanceFromHome
0                 1
1                 1
2                 1
3                 1
4                 1
   EducationField
0               1
1               1
2               0
3               1
4               2
   Gender
0       2
1       1
2       1
3       2
4       1
   JobRole
0        2
1        4
2        7
3        4
4        7
   MaritalStatus
0              0
1              1
2              0
3              1
4              1
   OverTime
0         0
1         1
2         0
3         0
4         1
   TotalWorkingYears
0                  1
1                  1
2                  1
3                  1
4                  1
   Age  DailyRate  Education  EmployeeCount  EmployeeNumber  \
0   41       1102          2              1               1   
1   49        279          1              1               2   
2   37       1373          2              1               3   
3   33       1392          4              1               4   
4   27        591          1              1               5   

   EnvironmentSatisfaction  HourlyRate  JobInvolvement  JobLevel  \
0                        2          94               3         2   
1                        3          61               2         2   
2                        4          92               2         1   
3                        4          56               3         1   
4                        1          40               3         1   

   JobSatisfaction          ...           PercentSalaryHike  \
0                4          ...                          11   
1                2          ...                          23   
2                3          ...                          15   
3                3          ...                          11   
4                2          ...                          12   

   PerformanceRating  RelationshipSatisfaction  StockOptionLevel  \
0                  3                         1                 0   
1                  4                         4                 1   
2                  3                         2                 0   
3                  3                         3                 0   
4                  3                         4                 1   

   TrainingTimesLastYear  WorkLifeBalance  YearsAtCompany  YearsInCurrentRole  \
0                      0                1               6                   4   
1                      3                3              10                   7   
2                      3                3               0                   0   
3                      3                3               8                   7   
4                      3                3               2                   2   

   YearsSinceLastPromotion  YearsWithCurrManager  
0                        0                     5  
1                        1                     7  
2                        0                     0  
3                        3                     0  
4                        2                     2  

[5 rows x 23 columns]
    Attrition  BusinessTravel  Department  DistanceFromHome  EducationField  \
0           1               1           1                 1               1   
1           0               2           2                 1               1   
2           1               1           2                 1               0   
3           0               2           2                 1               1   
4           0               1           2                 1               2   
5           0               2           2                 1               1   
6           0               1           2                 1               2   
7           0               1           2                 3               1   
8           0               2           2                 3               1   
9           0               1           2                 3               2   
10          0               1           2                 2               2   
11          0               1           2                 2               1   
12          0               1           2                 3               1   
13          0               1           2                 2               2   
14          1               1           2                 3               1   

    Gender  JobRole  MaritalStatus  OverTime  TotalWorkingYears  \
0        2        2              0         0                  1   
1        1        4              1         1                  1   
2        1        7              0         0                  1   
3        2        4              1         0                  1   
4        1        7              1         1                  1   
5        1        7              0         1                  1   
6        2        7              1         0                  2   
7        1        7              2         1                  0   
8        1        6              0         1                  1   
9        1        9              1         1                  2   
10       1        7              1         1                  1   
11       2        7              0         0                  1   
12       1        4              2         1                  0   
13       1        7              2         1                  0   
14       1        7              0         0                  1   

            ...           PercentSalaryHike  PerformanceRating  \
0           ...                          11                  3   
1           ...                          23                  4   
2           ...                          15                  3   
3           ...                          11                  3   
4           ...                          12                  3   
5           ...                          13                  3   
6           ...                          20                  4   
7           ...                          22                  4   
8           ...                          21                  4   
9           ...                          13                  3   
10          ...                          13                  3   
11          ...                          12                  3   
12          ...                          17                  3   
13          ...                          11                  3   
14          ...                          14                  3   

    RelationshipSatisfaction  StockOptionLevel  TrainingTimesLastYear  \
0                          1                 0                      0   
1                          4                 1                      3   
2                          2                 0                      3   
3                          3                 0                      3   
4                          4                 1                      3   
5                          3                 0                      2   
6                          1                 3                      3   
7                          2                 1                      2   
8                          2                 0                      2   
9                          2                 2                      3   
10                         3                 1                      5   
11                         4                 0                      3   
12                         4                 1                      1   
13                         3                 1                      2   
14                         2                 0                      4   

    WorkLifeBalance  YearsAtCompany  YearsInCurrentRole  \
0                 1               6                   4   
1                 3              10                   7   
2                 3               0                   0   
3                 3               8                   7   
4                 3               2                   2   
5                 2               7                   7   
6                 2               1                   0   
7                 3               1                   0   
8                 3               9                   7   
9                 2               7                   7   
10                3               5                   4   
11                3               9                   5   
12                2               5                   2   
13                3               2                   2   
14                3               4                   2   

    YearsSinceLastPromotion  YearsWithCurrManager  
0                         0                     5  
1                         1                     7  
2                         0                     0  
3                         3                     0  
4                         2                     2  
5                         3                     6  
6                         0                     0  
7                         0                     0  
8                         1                     8  
9                         7                     7  
10                        0                     3  
11                        0                     8  
12                        4                     3  
13                        1                     2  
14                        0                     3  

[15 rows x 33 columns]
Out[23]:
'\nIn [4]: frames = [df1, df2, df3]\n\nIn [5]: result = pd.concat(frames)\n\n'
In [ ]:
# Cross Validation 

from sklearn.cross_validation import KFold

def run_cv(X,y,clf_class,**kwargs):
    # Construct a kfolds object
    kf = KFold(len(y),n_folds=5,shuffle=True)
    y_pred = y.copy()

    # Iterate through folds
    for train_index, test_index in kf:
        X_train, X_test = X[train_index], X[test_index]
        y_train = y[train_index]
        # Initialize a classifier with key word arguments
        clf = clf_class(**kwargs)
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred
In [33]:
from sklearn.datasets import load_digits
#digitss = pd.DataFrame(load_digits()) --- Error -- Arrays not of same Length ? 

digitss = load_digits()

print type(digitss)
print "__________________"

XXX, yyy = digits.data, digits.target

print XXX

print "__________________"

print yyy
<class 'sklearn.datasets.base.Bunch'>
__________________
[[  0.   0.   5. ...,   0.   0.   0.]
 [  0.   0.   0. ...,  10.   0.   0.]
 [  0.   0.   0. ...,  16.   9.   0.]
 ..., 
 [  0.   0.   1. ...,   6.   0.   0.]
 [  0.   0.   2. ...,  12.   0.   0.]
 [  0.   0.  10. ...,  12.   1.   0.]]
__________________
[0 1 2 ..., 8 9 8]
In [27]:
# Dont delete Cells below here ............

%matplotlib inline
# 
import pandas as pd
from pandas.tseries.resample import TimeGrouper
import matplotlib 
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')

t0 = time()

df_c=h1_df.groupby('Age')['EmployeeCount'].sum()

#df_c.head(10) # Not required
#
plt.figure(); df_c.plot(color='r',x_compat=True).set_ylabel('Head_Count');
plt.tight_layout()
plt.title('All Employees__X Axis-AGE , Y Axis-COUNT ')
plt.savefig("All_Emp_Age_Count.pdf", dpi=950)

t1 = time() - t0

print "_________________________________________________"
print("Task completed in {} seconds".format(round(t1, 4)))
print "_________________________________________________"
_________________________________________________
Task completed in 0.3878 seconds
_________________________________________________
In [29]:
t0 = time()

df_d=h_df.groupby('JobRole')['EmployeeCount'].sum()
#
plt.figure(); df_d.plot(color='r',x_compat=True).set_ylabel('Head_Count');  

plt.tight_layout()
plt.title('All Employees__X Axis-JOB_ROLE , Y Axis-COUNT ')
plt.savefig("All_Emp_Age_Count.pdf", dpi=950)

t1 = time() - t0

print "_________________________________________________"
print("Task completed in {} seconds".format(round(t1, 4)))
print "_________________________________________________"

# Color Choices - color='r' , color='g' , color='b'
_________________________________________________
Task completed in 0.276 seconds
_________________________________________________

No comments:

Post a Comment