Authors: Alexander Fred-Ojala, Ikhlaq Sidhu -- April 2019
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
pd.options.display.max_columns = 1000
%matplotlib inline
# read in all candidate data
df = pd.read_csv('data.csv')
# number of data entries
original_shape = df.shape
original_shape
# explore columns
list(df.columns)
# first entries
df.drop(['Full name','Employer'],axis=1).head(1)
If more than 70% of the data is missing from one column, then we remove that column from the data set.
# Number of missing values
df.isnull().sum()
# remove all columns where more than 70% of the values are missing
for col in df.columns:
    null_rate = df[col].isnull().sum() / df.shape[0]
    if null_rate > .7:
        df = df.drop(labels=col, axis=1)
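The same filter can also be written as a single vectorized expression (a sketch, equivalent to the loop above; since the columns are already dropped, re-running it changes nothing):
# keep only columns with at most 70% missing values
df = df.loc[:, df.isnull().mean() <= .7]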
# Number of dropped columns
original_shape[1] - df.shape[1]
df.columns
# Status
df['Status'].value_counts() # does this just indicate whether the candidate is active or not?
df['Linked job step (not editable)'].value_counts() # offer accepted and successful placement
df['All linked jobs (name & step)'].head()
df['Last note'].value_counts()[:10]
Linked job step as predicted output of success
# check data types
df.dtypes
work_exp = df['Total Years of Work Experience']
work_exp.dropna().astype(int).hist();
df_new = df[['Linked job step (not editable)', 'Total Years of Work Experience']].copy()
df_new['Linked job step (not editable)'] = df_new['Linked job step (not editable)'].map({'Offer Accepted - Successful Placement': 1, 'Offer Accepted': 0})
# Rename columns
df_new.columns = ['success','work_exp']
# drop missing values
df_new = df_new.dropna()
df_new.shape
We can see a significant positive correlation between work experience and successful placement.
import seaborn as sns
df_new.corr()
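A quick visual check of that relationship (a minimal sketch, not part of the original analysis):
# average success rate for each value of work experience
df_new.groupby('work_exp')['success'].mean().plot(kind='bar');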
df_new['success'].value_counts()
# Baseline: accuracy of always predicting class 0 (offer accepted, but no successful placement)
baseline = df_new['success'].value_counts()[0] / df_new.shape[0]
baseline
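As a cross-check, scikit-learn's DummyClassifier computes the majority-class baseline directly (a sketch; it should match the hand-computed value above provided class 0 is the majority class, and x_base is just a throwaway helper name):
from sklearn.dummy import DummyClassifier
# majority-class baseline accuracy, computed by sklearn
x_base = np.array(df_new['work_exp']).reshape(-1, 1)
dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(x_base, df_new['success'])
dummy.score(x_base, df_new['success'])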
# Single model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
x = np.array(df_new['work_exp']).reshape(-1, 1)
y = np.array(df_new['success'])
On average we get roughly 90% accuracy.
model = RandomForestClassifier(n_estimators=50)
score = np.array([])
# Train 40 different models
n_runs = 40
for i in range(n_runs):
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2)
    model.fit(x_train, y_train)
    score = np.append(score, model.score(x_test, y_test))
print(np.sum(score) / n_runs)
# Standard deviation
np.std(score) # small standard deviation
score
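The manual resampling loop above could also be replaced by scikit-learn's built-in cross-validation (a sketch, not part of the original analysis):
from sklearn.model_selection import cross_val_score
# 5-fold cross-validated accuracy for the same model
cv_scores = cross_val_score(RandomForestClassifier(n_estimators=50), x, y, cv=5)
cv_scores.mean(), cv_scores.std()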
Our model achieves a relative decrease in error of about 45% compared to the baseline (great for such a simple model, and well beyond the run-to-run variation measured above!)
# Relative decrease in error compared to the baseline
1 - (1 - score.mean()) / (1 - baseline)
This is great because false negatives (candidates who would succeed but are predicted not to) are minimized.
from sklearn.metrics import confusion_matrix
pd.DataFrame(confusion_matrix(y_test, model.predict(x_test)),
             index=['Actual negative', 'Actual positive'],
             columns=['Predicted negative', 'Predicted positive'])
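For per-class precision and recall on the same held-out split (a minimal sketch):
from sklearn.metrics import classification_report
print(classification_report(y_test, model.predict(x_test)))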
Multi-input classifier
df.columns
# Number of missing values
df['Candidate Type'].isnull().sum()
# Number of categories and samples / category
df['Candidate Type'].value_counts()
# Number of missing values
df['Job Assignment'].isnull().sum()
# Number of categories and samples / category
df['Job Assignment'].value_counts()
df3 = df[['Linked job step (not editable)',
          'Total Years of Work Experience', 'Candidate Type', 'Job Assignment']].copy()
df3.columns = ['success','work_exp','type','assignment']
df3['success'].unique()
# Convert target variable to binary outputs
df3['success'] = df3['success'].replace({'Offer Accepted - Successful Placement':1, 'Offer Accepted':0})
# Drop missing values
df3 = df3.dropna()
df3['work_exp'] = df3['work_exp'].astype(int)
df3.dtypes
df3.type.value_counts()
# remove walk-in applicant category (too few samples)
df3 = df3[df3['type']!='Walk-in applicant'].copy()
df3.shape
# get type and assignment as new columns
for col in ['type', 'assignment']:
    new_entries = pd.get_dummies(df3[col])
    df3[new_entries.columns] = new_entries
    df3 = df3.drop(col, axis=1)
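Note that pandas can do the encode-and-drop in a single call; the loop above is equivalent to the following (shown for reference, as a replacement for the loop rather than a step to run after it):
# df3 = pd.get_dummies(df3, columns=['type', 'assignment'])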
df3
sns.heatmap(df3.corr());
# Single model
x = np.array(df3.loc[:, df3.columns != 'success'])
y = np.array(df3['success'])
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)
We have to beat the ~95% accuracy of the majority-class baseline.
# baseline
df3['success'].value_counts()
1-df3['success'].value_counts()[1]/df3.shape[0]
model = RandomForestClassifier(n_estimators=10)
model.fit(x_train, y_train)
model.score(x_test, y_test)
confusion_matrix(y_test, model.predict(x_test))
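To see which inputs the forest relies on, one could inspect the fitted feature importances (a sketch; the column order matches the dummy-encoded frame used to build x):
# feature importances, paired with the input column names
feature_names = df3.columns[df3.columns != 'success']
pd.Series(model.feature_importances_, index=feature_names).sort_values(ascending=False)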