import findspark
findspark.init()
import pyspark
# Create the Spark context for this session under the application name "HR".
sc = pyspark.SparkContext(appName="HR")

# Optional sanity check: show the context object. The parenthesised
# single-argument form behaves the same on Python 2 and Python 3.
print(sc)  # Not required
# Example output: <pyspark.context.SparkContext object at 0x7f7d38b41790>
# Author's note: shutting down the notebook kernel also shuts down the Spark
# context, which is not the ideal way to stop it.
In [2]:
## Dhankar >>
# The source CSV was converted to TSV beforehand; the data is read from that TSV here.
# raw_d is the Spark RDD object.
raw_d = sc.textFile("hr.tsv")
#
# Author's note: the line above does not actually load the file into the RDD yet.
# Loading is done lazily, only when absolutely required, as by the take() call below.
#
# Fetch the first 3 rows.
raw_d.take(3) 
Out[2]:
In [4]:
from time import time
#
# Time how long counting the rows of the RDD takes.
t0 = time()
count1 = raw_d.count()
t1 = time() - t0  # elapsed seconds for the count
print("There are {} ________".format(count1))
print("Count completed in {} seconds".format(round(t1, 3)))
# Author's observation: t1 dropped from 0.254 to 0.16 after three iterations.
In [5]:
from pprint import pprint
# Split every line of the RDD on tab characters into a list of fields.
csv_data = raw_d.map(lambda line: line.split("\t"))
# Time how long it takes to fetch the first five parsed rows.
t0 = time()
head_rows = csv_data.take(5)
tt = time() - t0
pprint(head_rows[0])  # first row fetched -- the column headers of the file
# Parenthesised single-argument print: identical output on Python 2, valid on Python 3.
print("_________________________________________________")
print("Parse completed in {} seconds".format(round(tt, 3)))
In [6]:
import pandas as pd
import numpy as np
# Read the CSV file into a pandas DataFrame.
h_df=pd.read_csv("HR1.csv",sep=",")
# NOTE(review): the Spark RDD is loaded from "hr.tsv" while pandas reads "HR1.csv" -- presumably the same data; confirm.
# Show the first 5 rows.
h_df.head(5)
Out[6]:
In [7]:
from __future__ import division
import pandas as pd
import numpy as np
# Isolate the target feature / variable / column.
churn_result = h_df['Attrition'] # the 'Attrition' column selected from the DataFrame
h_nda = np.where(churn_result == 'Yes',1,0) # NumPy array: 1 where Attrition == 'Yes', otherwise 0
#print h_nda
In [23]:
# Encode the string-valued and bucketed columns of h_df as integers, one
# single-column DataFrame per feature, then join them with the remaining
# columns of h_df and write the result to 'dfh.csv'.
#
# Every print below uses the parenthesised single-argument form, which gives
# the same output on Python 2 and is valid syntax on Python 3.

# Attrition: 'Yes' -> 1, anything else -> 0
df1 = pd.DataFrame(np.where(h_df['Attrition'] == 'Yes', 1, 0))
df1.columns = ['Attrition']
print(df1.head(5))

# BusinessTravel: 'Non-Travel' -> 0, 'Travel_Rarely' -> 1, anything else -> 2
df2 = pd.DataFrame(h_df.BusinessTravel.map(
    lambda x: 0 if x == 'Non-Travel' else 1 if x == 'Travel_Rarely' else 2))
df2.columns = ['BusinessTravel']
print(df2.head(5))  # Not required

# Department: 'Human Resources' -> 0, 'Sales' -> 1, anything else -> 2
df3 = pd.DataFrame(h_df.Department.map(
    lambda x: 0 if x == 'Human Resources' else 1 if x == 'Sales' else 2))
df3.columns = ['Department']
print(df3.head(5))  # Not required

# DistanceFromHome buckets: <= 10 -> 1, <= 20 -> 2, otherwise 3
df4 = pd.DataFrame(h_df.DistanceFromHome.map(
    lambda x: 1 if (x <= 10) else 2 if (x <= 20) else 3))
df4.columns = ['DistanceFromHome']
print(df4.head(5))  # Not required

# EducationField: 'Life Sciences' -> 1, 'Medical' -> 2, 'Marketing' -> 3,
# 'Human Resources' -> 4, anything else -> 0
df5 = pd.DataFrame(h_df.EducationField.map(
    lambda x: 1 if (x == 'Life Sciences')
    else 2 if (x == 'Medical')
    else 3 if (x == 'Marketing')
    else 4 if (x == 'Human Resources')
    else 0))
df5.columns = ['EducationField']
print(df5.head(5))  # Not required

# Gender: 'Male' -> 1, anything else -> 2
df6 = pd.DataFrame(h_df.Gender.map(lambda x: 1 if (x == 'Male') else 2))
df6.columns = ['Gender']
print(df6.head(5))  # Not required

# JobRole: each listed role gets a code from 9 down to 1; anything else -> 0
df7 = pd.DataFrame(h_df.JobRole.map(
    lambda x: 9 if (x == 'Healthcare Representative')
    else 8 if (x == 'Human Resources')
    else 7 if (x == 'Laboratory Technician')
    else 6 if (x == 'Manufacturing Director')
    else 5 if (x == 'Manager')
    else 4 if (x == 'Research Scientist')
    else 3 if (x == 'Research Director')
    else 2 if (x == 'Sales Executive')
    else 1 if (x == 'Sales Representative')
    else 0))
df7.columns = ['JobRole']
print(df7.head(5))  # Not required

# MaritalStatus: 'Single' -> 0, 'Married' -> 1, anything else -> 2
df8 = pd.DataFrame(h_df.MaritalStatus.map(
    lambda x: 0 if (x == 'Single') else 1 if (x == 'Married') else 2))
df8.columns = ['MaritalStatus']
print(df8.head(5))  # Not required

# OverTime: 'Yes' -> 0, anything else -> 1
df9 = pd.DataFrame(h_df.OverTime.map(lambda x: 0 if (x == 'Yes') else 1))
df9.columns = ['OverTime']
print(df9.head(5))  # Not required

# TotalWorkingYears buckets: <= 5 -> 0, <= 10 -> 1, <= 20 -> 2, <= 30 -> 3,
# otherwise 4
df10 = pd.DataFrame(h_df.TotalWorkingYears.map(
    lambda x: 0 if (x <= 5)
    else 1 if (x <= 10)
    else 2 if (x <= 20)
    else 3 if (x <= 30)
    else 4))
df10.columns = ['TotalWorkingYears']
print(df10.head(5))  # Not required

# Drop from h_df the columns that were re-encoded above, plus 'Over18' and
# 'StandardHours' (author's note: "Over18" is all YES and "StandardHours" is
# all 80).
col_drop = [
    'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
    'EducationField', 'Gender', 'JobRole', 'MaritalStatus', 'Over18',
    'StandardHours', 'OverTime', 'TotalWorkingYears',
]
df_1 = h_df.drop(col_drop, axis=1)
print(df_1.head(5))

# Concatenate the encoded columns and the remaining columns side by side.
temp_df_ls = [df1, df2, df3, df4, df5, df6, df7, df8, df9, df10, df_1]
h1_df = pd.concat(temp_df_ls, axis=1, join='outer')
print(h1_df.head(15))

# Write the combined frame to CSV.
# Author's note: the result has 33 columns, all with numeric values and no
# string values; 2 columns have been dropped.
h1_df.to_csv('dfh.csv', sep=',')
Out[23]:
In [ ]:
# Cross Validation 
from sklearn.cross_validation import KFold
def run_cv(X,y,clf_class,**kwargs):
    """Return cross-validated predictions for ``y`` using 5 shuffled folds.

    X         -- feature array; indexed with each fold's index arrays
    y         -- target array; a copy of it is filled in and returned
    clf_class -- classifier class, instantiated anew for every fold
    **kwargs  -- keyword arguments forwarded to ``clf_class``

    For every fold, a classifier is fitted on the training indices and its
    predictions overwrite the copy of ``y`` at the fold's test indices.
    """
    predictions = y.copy()
    # NOTE(review): this is the legacy KFold(n, n_folds=...) call signature,
    # matching the KFold name imported at module level.
    folds = KFold(len(y),n_folds=5,shuffle=True)
    for fit_idx, holdout_idx in folds:
        fit_rows, holdout_rows = X[fit_idx], X[holdout_idx]
        fit_targets = y[fit_idx]
        # Build a fresh classifier from the keyword arguments for this fold.
        model = clf_class(**kwargs)
        model.fit(fit_rows,fit_targets)
        predictions[holdout_idx] = model.predict(holdout_rows)
    return predictions
In [33]:
from sklearn.datasets import load_digits
# Load the digits dataset and print its type, its data and its targets.
# Author's note: pd.DataFrame(load_digits()) raised an error ("arrays not of
# same length"?), so the object returned by load_digits() is used directly.
digitss = load_digits()
print(type(digitss))
print("__________________")
# Fixed: this line previously read `digits.data, digits.target`, but the only
# name bound above is `digitss`, so it raised NameError.
XXX, yyy = digitss.data, digitss.target
print(XXX)
print("__________________")
print(yyy)
In [27]:
# Don't delete the cells below here ............
%matplotlib inline
# 
import pandas as pd
from pandas.tseries.resample import TimeGrouper
import matplotlib 
import matplotlib.pyplot as plt
matplotlib.style.use('ggplot')
# Plot the summed EmployeeCount per Age, save it as a PDF, and time the task.
t0 = time()
df_c = h1_df.groupby('Age')['EmployeeCount'].sum()
#df_c.head(10) # Not required
#
plt.figure()
df_c.plot(color='r', x_compat=True).set_ylabel('Head_Count')
# The title is set before tight_layout() so that the layout pass takes it
# into account; previously it was added after the layout had been computed.
plt.title('All Employees__X Axis-AGE , Y Axis-COUNT ')
plt.tight_layout()
plt.savefig("All_Emp_Age_Count.pdf", dpi=950)
t1 = time() - t0
# Parenthesised single-argument print: identical output on Python 2, valid on Python 3.
print("_________________________________________________")
print("Task completed in {} seconds".format(round(t1, 4)))
print("_________________________________________________")
In [29]:
# Plot the summed EmployeeCount per JobRole, save it as a PDF, and time the task.
# h_df (the original frame) is used here, so JobRole still holds the role names.
t0 = time()
df_d = h_df.groupby('JobRole')['EmployeeCount'].sum()
#
plt.figure()
df_d.plot(color='r', x_compat=True).set_ylabel('Head_Count')
# The title is set before tight_layout() so that the layout pass takes it
# into account; previously it was added after the layout had been computed.
plt.title('All Employees__X Axis-JOB_ROLE , Y Axis-COUNT ')
plt.tight_layout()
# Fixed: this plot used to be saved as "All_Emp_Age_Count.pdf", overwriting the
# age plot written by the previous cell; it now gets its own file name.
plt.savefig("All_Emp_JobRole_Count.pdf", dpi=950)
t1 = time() - t0
# Parenthesised single-argument print: identical output on Python 2, valid on Python 3.
print("_________________________________________________")
print("Task completed in {} seconds".format(round(t1, 4)))
print("_________________________________________________")
# Color choices: color='r', color='g', color='b'
 
No comments:
Post a Comment