Authors: Alexander Fred-Ojala, Ikhlaq Sidhu -- April 2019
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.options.display.max_columns = 1000
%matplotlib inline
# read in all candidate data
df = pd.read_csv('data.csv')
# number of data entries
original_shape = df.shape
original_shape
# explore columns
list(df.columns)
# first entries
df.drop(['Full name','Employer'],axis=1).head(1)
If more than 70% of the data is missing from one column, then we remove that column from the data set.
# Number of missing values
df.isnull().sum()
# remove all columns where more than 70% of the values are missing
for col in df.columns:
    null_rate = df[col].isnull().sum() / df.shape[0]
    if null_rate > .7:
        df = df.drop(labels=col, axis=1)
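The same filter can also be written as a single vectorized expression (a sketch, equivalent to the loop above; since the columns are already dropped, re-running it changes nothing):
# keep only columns with at most 70% missing values
df = df.loc[:, df.isnull().mean() <= .7]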
# Number of dropped columns
original_shape[1] - df.shape[1]
df.columns
# Status
df['Status'].value_counts() # does this just indicate whether the candidate is active or not?
df['Linked job step (not editable)'].value_counts() # offer accepted and successful placement
df['All linked jobs (name & step)'].head()
df['Last note'].value_counts()[:10]
Linked job step as predicted output of success
# check data types
df.dtypes
work_exp = df['Total Years of Work Experience']
work_exp.dropna().astype(int).hist();
df_new = df[['Linked job step (not editable)', 'Total Years of Work Experience']].copy()
df_new['Linked job step (not editable)'] = df_new['Linked job step (not editable)'].map({'Offer Accepted - Successful Placement': 1, 'Offer Accepted': 0})
# Rename columns
df_new.columns = ['success','work_exp']
# drop missing values
df_new = df_new.dropna()
df_new.shape
We can see a significant positive correlation between work experience and successful placement.
import seaborn as sns
df_new.corr()
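A quick visual check of that relationship (a minimal sketch, not part of the original analysis):
# average success rate for each value of work experience
df_new.groupby('work_exp')['success'].mean().plot(kind='bar');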
df_new['success'].value_counts()
# Baseline: accuracy of always predicting class 0 (offer accepted, but no successful placement)
baseline = df_new['success'].value_counts()[0] / df_new.shape[0]
baseline
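As a cross-check, scikit-learn's DummyClassifier computes the majority-class baseline directly (a sketch; it should match the hand-computed value above provided class 0 is the majority class, and x_base is just a throwaway helper name):
from sklearn.dummy import DummyClassifier
# majority-class baseline accuracy, computed by sklearn
x_base = np.array(df_new['work_exp']).reshape(-1, 1)
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(x_base, df_new['success'])
dummy.score(x_base, df_new['success'])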
# Single model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
x = np.array(df_new['work_exp']).reshape(-1, 1)
y = np.array(df_new['success'])
On average we get roughly 90% accuracy.
model = RandomForestClassifier(n_estimators=50)
score = np.array([])
# Train 40 different models
n_runs = 40
for i in range(n_runs):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)
    model.fit(x_train, y_train)
    score = np.append(score, model.score(x_test, y_test))
print(np.sum(score) / n_runs)
# Standard deviation
np.std(score) # small standard deviation
score
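The manual resampling loop above could also be replaced by scikit-learn's built-in cross-validation (a sketch, not part of the original analysis):
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy for the same model
cv_scores = cross_val_score(RandomForestClassifier(n_estimators=50), x, y, cv=5)
cv_scores.mean(), cv_scores.std()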
Our model achieves a relative decrease in error of about 45% compared to the baseline (great for such a simple model, and well beyond the run-to-run variation measured above!)
# Relative decrease in error compared to the baseline
1 - (1 - score.mean()) / (1 - baseline)
This is great because false negatives (candidates who would succeed but are predicted not to) are minimized.
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(y_test, model.predict(x_test)),
             index=['Actual negative', 'Actual positive'],
             columns=['Predicted negative', 'Predicted positive'])
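For per-class precision and recall on the same held-out split (a minimal sketch):
from sklearn.metrics import classification_report
print(classification_report(y_test, model.predict(x_test)))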
Multi-input classifier
df.columns
# Number of missing values
df['Candidate Type'].isnull().sum()
# Number of categories and samples / category
df['Candidate Type'].value_counts()
# Number of missing values
df['Job Assignment'].isnull().sum()
# Number of categories and samples / category
df['Job Assignment'].value_counts()
df3 = df[['Linked job step (not editable)',
          'Total Years of Work Experience', 'Candidate Type', 'Job Assignment']].copy()
df3.columns = ['success','work_exp','type','assignment']
df3['success'].unique()
# Convert target variable to binary outputs
df3['success'] = df3['success'].replace({'Offer Accepted - Successful Placement':1, 'Offer Accepted':0})
# Drop missing values
df3 = df3.dropna()
df3['work_exp'] = df3['work_exp'].astype(int)
df3.dtypes
df3.type.value_counts()
# remove walk-in applicant category (too few samples)
df3 = df3[df3['type']!='Walk-in applicant'].copy()
df3.shape
# get type and assignment as new columns
for col in ['type', 'assignment']:
    new_entries = pd.get_dummies(df3[col])
    df3[new_entries.columns] = new_entries
    df3 = df3.drop(col, axis=1)
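Note that pandas can do the encode-and-drop in a single call; the loop above is equivalent to the following (shown for reference, as a replacement for the loop rather than a step to run after it):
# df3 = pd.get_dummies(df3, columns=['type', 'assignment'])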
df3
sns.heatmap(df3.corr());
# Single model
x = np.array(df3.loc[:, df3.columns != 'success'])
y = np.array(df3['success'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
We have to beat the ~95% accuracy of the majority-class baseline.
# baseline
df3['success'].value_counts()
1-df3['success'].value_counts()[1]/df3.shape[0]
model = RandomForestClassifier(n_estimators=10)
model.fit(x_train, y_train)
model.score(x_test, y_test)
confusion_matrix(y_test, model.predict(x_test))
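To see which inputs the forest relies on, one could inspect the fitted feature importances (a sketch; the column order matches the dummy-encoded frame used to build x):
# feature importances, paired with the input column names
feature_names = df3.columns[df3.columns != 'success']
pd.Series(model.feature_importances_, index=feature_names).sort_values(ascending=False)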