ML Exploration: Titanic Dataset

In summer 2019 I blogged about how I was taking a couple months to work on Machine Learning.

Since then I’ve mostly focused on software for the Mac and server-side development. My hands-on ML knowledge was getting a bit rusty… Plus, things have evolved a bit: new technologies, new approaches, new concepts… Perfect timing, as the new edition of the book Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow was recently released.

I’ll be re-reading it and redoing all the exercises. Below you’ll find the first major exercise, which I completed yesterday: the Titanic dataset. Today I started on a spam-filtering model. It’s an excellent book.

Marc

— 

import pandas as pd
import matplotlib.pyplot as plt

#—– INITIAL SETUP
#Load the Kaggle training set and split it into features and label.
titanic_train_data = pd.read_csv('titanicData/train.csv')

#Features: every column except the label
X_train = titanic_train_data.drop(labels='Survived', axis=1).copy()
#Label: kept as a one-column DataFrame (double brackets)
y_train = titanic_train_data[['Survived']].copy()

#—– DATA EXPLORATION
#The bare expressions below are notebook cells: their value is displayed
#when run interactively and discarded when run as a plain script.

X_train.head()

X_train.count()
#We have a total of 891 entries. Some columns have missing values:
# -Cabin is only known for 204 entries.
# -Age is only known for 714 entries.
# -Embarked is only known for 889 entries.

X_train.describe()
#Key insights:
# – People are quite young with median at 28 and mean at 29
# – Most people were in 2nd or 3rd class.
# – Most people did not travel with siblings or spouses (SibSp); same for parents or children (Parch).
# – Fare changes significantly and could be an indication of quality of the room.
X_train['Pclass'].unique()
#3 class types.

X_train['SibSp'].unique()
#From 0 to 8.

X_train['Embarked'].unique()
#S, C, Q or nan.

X_train['Sex'].value_counts()
#More male than female, 577 male vs 314 female.

pclass_counts = X_train['Pclass'].value_counts()
pclass_counts
#A lot more third than first, funnily enough more 1st than second.

#Plotting split between classes
#Labels are taken from the index of the counts themselves so that every wedge
#is guaranteed to carry its own class label (unique() is ordered by first
#appearance, value_counts() by frequency; the two need not agree).
plt.pie(x=pclass_counts, labels=pclass_counts.index, autopct='%1.0f%%')
plt.legend()
plt.show()

#Plotting where people came in the titanic
#Same idea: use the ports from the counts' index instead of a hard-coded list.
embarked_counts = X_train['Embarked'].value_counts()
plt.bar(x=embarked_counts.index, height=embarked_counts)
plt.show()

#—– DATA PREPARATION

#Feature engineering, combine Siblings and Spouses together with Children and Parents
#X_train[‘Siblings’] = X_train[‘SibSp’] + X_train[‘Parch’]

#Remove data we won’t be using
#X_train = X_train.drop(columns=[‘PassengerId’, ‘Name’, ‘Ticket’, ‘Cabin’, ‘SibSp’, ‘Parch’])

#Test that it worked correctly
#X_train.head()
#X_train[X_train[‘Siblings’]>1]
from sklearn.base import BaseEstimator, TransformerMixin

class PrepareData(BaseEstimator, TransformerMixin):
    """Feature engineering, all custom changes are done in this class.

    Adds a 'Siblings' column (SibSp + Parch) and drops the columns that are
    not passed on to the rest of the pipeline.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; return self unchanged."""
        return self

    def transform(self, X):
        """Return a new DataFrame with 'Siblings' added and unused columns dropped.

        X must contain the columns 'SibSp', 'Parch', 'PassengerId', 'Name',
        'Ticket' and 'Cabin'. The DataFrame passed in is not modified.
        """
        print(f'About to {len(list(X))} items -> {list(X)}')
        #Work on a copy: assigning the new column directly on X would add
        #'Siblings' to the caller's DataFrame as a hidden side effect.
        X = X.copy()
        X['Siblings'] = X['SibSp'] + X['Parch']
        print(f'Having {len(list(X))} items -> {list(X)}')
        X = X.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin', 'SibSp', 'Parch'])
        print(f'Returning {len(list(X))} items -> {list(X)}')
        return X

#—– PIPELINE
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#Numeric columns: fill missing values with the median, then standardise
num_pipeline = Pipeline(
    [
        ('imputer', SimpleImputer(strategy='median')),
        ('std_scaler', StandardScaler()),
    ]
)

#Get the headers
X_train_num_cols = ['Age', 'Siblings', 'Fare', 'Pclass']
X_train_cat_cols = ['Sex', 'Embarked']

#Get numerical values and non numerical values
ext_pipeline = ColumnTransformer(
    [
        ('num', num_pipeline, X_train_num_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), X_train_cat_cols),
    ]
)

#Custom feature engineering first ('Siblings' must exist before the
#column transformer selects it), then the per-column preprocessing
full_pipeline = Pipeline(
    [
        ('custPrep', PrepareData()),
        ('ext_pipe', ext_pipeline),
    ]
)

X_train_prepared = full_pipeline.fit_transform(X_train)

#—– MODEL TRAINING AND PREDICTION USING KNEIGHBORS (TO START WITH ONE)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

#Baseline: 3-nearest-neighbours, scored with 5-fold cross-validation
neigh_clf = KNeighborsClassifier(n_neighbors=3, n_jobs=-1)
score = cross_val_score(neigh_clf, X_train_prepared, y=y_train.values.ravel(), cv=5)
score.mean() #80% is not bad considering 60% died and 40% survived

#Death rate
#y_train is a one-column DataFrame, so select the 'Survived' column explicitly
#and take the fraction of rows labelled 0 (did not survive).
(y_train['Survived'] == 0).mean()
#—– IMPROVE MODEL THROUGH GRID-SEARCH
from sklearn.model_selection import GridSearchCV

#Hyper-parameter combinations to try for the k-nearest-neighbours classifier
param_grid = [
    {
        'n_neighbors': [3, 15, 30, 40, 50],
        'leaf_size': [15, 20, 30, 35, 45],
        'weights': ['uniform', 'distance'],
    }
]

neigh_clf = KNeighborsClassifier()
grid_search = GridSearchCV(neigh_clf, param_grid, cv=3, return_train_score=True)
grid_search.fit(X_train_prepared, y_train.values.ravel())
grid_search.best_params_
grid_search.best_score_
#{'leaf_size': 15, 'n_neighbors': 30, 'weights': 'uniform'}
#—– PREPARING FOR SUBMISSION WITH IMPROVED MODEL
#Take the best estimator found by the grid search and fit it on the full training set
neigh_clf = grid_search.best_estimator_
neigh_clf.fit(X_train_prepared, y_train.values.ravel())

X_test = pd.read_csv('titanicData/test.csv')
#y_test_withId = pd.read_csv('titanicData/gender_submission.csv')
#y_test = y_test_withId.drop(columns=['PassengerId'])

#transform only (no fit): reuse the imputation/scaling/encoding learned on the training set
X_test_prepared = full_pipeline.transform(X_test)

from sklearn.metrics import accuracy_score
y_test_pred = neigh_clf.predict(X_test_prepared)
#accuracy_score(y_test, y_test_pred) Can't use as y_test data is fake. Need to submit to kaggle to get the right data

#—– USE SVM (TO TRY ANOTHER MODEL)

from sklearn import svm

#First attempt: polynomial kernel
svm_clf = svm.SVC(kernel='poly')
svm_clf.fit(X_train_prepared, y_train.values.ravel())
y_test_pred = svm_clf.predict(X_test_prepared)
#accuracy_score(y_test, y_test_pred)

#Lets try with linear kernel
svm_clf = svm.SVC(kernel='linear')
svm_clf.fit(X_train_prepared, y_train.values.ravel())
y_test_pred = svm_clf.predict(X_test_prepared)
#accuracy_score(y_test, y_test_pred)
#We can find as well coefficients of feature importance
svm_clf.coef_[0]

#And confusion matrix
#y_test is not defined here (there are no trustworthy labels for the test
#set), so the matrix is built from cross-validated predictions on the
#training set instead. ConfusionMatrixDisplay is used because
#plot_confusion_matrix has been removed from scikit-learn.
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import cross_val_predict
y_train_labels = y_train.values.ravel()
y_train_cv_pred = cross_val_predict(svm_clf, X_train_prepared, y_train_labels, cv=5)
ConfusionMatrixDisplay.from_predictions(y_train_labels, y_train_cv_pred,
                                        cmap=plt.cm.Blues)
plt.show()

#And directly calculating numbers and graphing it in a different way
from sklearn.metrics import confusion_matrix
#conf_mx = confusion_matrix(y_test, y_test_pred)
#plt.matshow(conf_mx, cmap=plt.cm.gray)

#—– GET READY FOR SUBMISSION TO KAGGLE
#Use the sample submission file as a template and overwrite its 'Survived'
#column with the latest predictions.
#NOTE(review): this assumes gender_submission.csv lists passengers in the same
#row order as test.csv — confirm before submitting.
y_test_withId = pd.read_csv('titanicData/gender_submission.csv')
y_test_withId['Survived'] = y_test_pred
y_test_withId.to_csv('submission.csv', index=False)