The Titanic
Kaggle: Titanic
1 2 3 4
| A simple implementation with basic data cleaning, one-hot encoding, and a LightGBM classifier.
Score: 0.76794 Rank: 11846/17593
|
Import Packages / Load Dataset
1
| %cd /content/drive/My Drive/Kaggle/titanic
|
/content/drive/My Drive/Kaggle/titanic
1 2 3 4 5 6 7 8 9
| import numpy as np import pandas as pd import matplotlib.pyplot as plt %matplotlib inline
import re import lightgbm import xgboost import os
|
1
# Load the Kaggle training split from the current working directory.
train_csv_path = os.getcwd() + '/train.csv'
df_train = pd.read_csv(train_csv_path)
|
Data Exploration
|
PassengerId
|
Survived
|
Pclass
|
Age
|
SibSp
|
Parch
|
Fare
|
count
|
891.000000
|
891.000000
|
891.000000
|
714.000000
|
891.000000
|
891.000000
|
891.000000
|
mean
|
446.000000
|
0.383838
|
2.308642
|
29.699118
|
0.523008
|
0.381594
|
32.204208
|
std
|
257.353842
|
0.486592
|
0.836071
|
14.526497
|
1.102743
|
0.806057
|
49.693429
|
min
|
1.000000
|
0.000000
|
1.000000
|
0.420000
|
0.000000
|
0.000000
|
0.000000
|
25%
|
223.500000
|
0.000000
|
2.000000
|
20.125000
|
0.000000
|
0.000000
|
7.910400
|
50%
|
446.000000
|
0.000000
|
3.000000
|
28.000000
|
0.000000
|
0.000000
|
14.454200
|
75%
|
668.500000
|
1.000000
|
3.000000
|
38.000000
|
1.000000
|
0.000000
|
31.000000
|
max
|
891.000000
|
1.000000
|
3.000000
|
80.000000
|
8.000000
|
6.000000
|
512.329200
|
|
PassengerId
|
Survived
|
Pclass
|
Name
|
Sex
|
Age
|
SibSp
|
Parch
|
Ticket
|
Fare
|
Cabin
|
Embarked
|
0
|
1
|
0
|
3
|
Braund, Mr. Owen Harris
|
male
|
22.0
|
1
|
0
|
A/5 21171
|
7.2500
|
NaN
|
S
|
1
|
2
|
1
|
1
|
Cumings, Mrs. John Bradley (Florence Briggs Th...
|
female
|
38.0
|
1
|
0
|
PC 17599
|
71.2833
|
C85
|
C
|
2
|
3
|
1
|
3
|
Heikkinen, Miss. Laina
|
female
|
26.0
|
0
|
0
|
STON/O2. 3101282
|
7.9250
|
NaN
|
S
|
3
|
4
|
1
|
1
|
Futrelle, Mrs. Jacques Heath (Lily May Peel)
|
female
|
35.0
|
1
|
0
|
113803
|
53.1000
|
C123
|
S
|
4
|
5
|
0
|
3
|
Allen, Mr. William Henry
|
male
|
35.0
|
0
|
0
|
373450
|
8.0500
|
NaN
|
S
|
PassengerId 0
Survived 0
Pclass 0
Name 0
Sex 0
Age 177
SibSp 0
Parch 0
Ticket 0
Fare 0
Cabin 687
Embarked 2
dtype: int64
PassengerId 0.000000
Survived 0.000000
Pclass 0.000000
Name 0.000000
Sex 0.000000
Age 0.198653
SibSp 0.000000
Parch 0.000000
Ticket 0.000000
Fare 0.000000
Cabin 0.771044
Embarked 0.002245
dtype: float64
1
# List the distinct values found in the Embarked column.
df_train["Embarked"].unique()
|
array(['S', 'C', 'Q', nan], dtype=object)
1 2
# Distinct first characters (deck letters) of every non-missing cabin code.
known_cabins = df_train["Cabin"].dropna()
known_cabins.str[0].unique()
|
array(['C', 'E', 'G', 'D', 'A', 'B', 'F', 'T'], dtype=object)
Preprocessing
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
# Build the cleaned training frame used for modelling.
df_clean = df_train.copy()

# Encode Sex as 1 for "male" and 0 otherwise.
df_clean["Sex"] = (df_clean["Sex"] == "male").astype(int)

# Reduce Cabin to its leading deck letter; missing (or non-alphabetic) cabins
# become "Unk". The vectorised string accessor replaces the fragile
# `x is np.nan` identity test.
df_clean["Cabin"] = (
    df_clean["Cabin"].str.extract(r"^([a-zA-Z])", expand=False).fillna("Unk")
)

# Pull the honorific that precedes the first "." in Name (e.g. "Mr", "Miss").
# The pattern is a raw string so "\." is not read as a string escape.
df_clean["Title"] = df_clean["Name"].str.extract(
    r"([A-Z][a-zA-Z]+)\.", expand=False
)

# Collapse titles seen fewer than 10 times (and names with no title at all)
# into "Other". The counts are computed once instead of once per row.
title_counts = df_clean["Title"].value_counts()
is_common_title = df_clean["Title"].map(title_counts) >= 10
df_clean["Title"] = df_clean["Title"].where(is_common_title, "Other")

# Median Age per (Pclass, Title). Kept as a frame with columns
# Pclass / Title / Age because the test-set preprocessing merges against it.
# Selecting "Age" before aggregating avoids taking a median of text columns.
byPclassAndTitle = (
    df_clean.groupby(["Pclass", "Title"])["Age"].median().reset_index()
)

# Fill missing ages with their group's median. `transform` returns values
# aligned to the original rows, so the index stays unique and the row order is
# untouched (the earlier split/merge/concat reset the index of imputed rows).
group_median_age = df_clean.groupby(["Pclass", "Title"])["Age"].transform("median")
df_clean["Age"] = df_clean["Age"].fillna(group_median_age)

# Mark missing ports of embarkation explicitly.
df_clean["Embarked"] = df_clean["Embarked"].fillna("unk")

df_clean = df_clean.sort_values("PassengerId")
|
1
# Frequency of each Title after rare titles were merged into "Other".
df_clean["Title"].value_counts()
|
Mr 517
Miss 182
Mrs 125
Master 40
Other 27
Name: Title, dtype: int64
1
# Frequency of each Embarked value, with missing entries shown as "unk".
df_clean["Embarked"].value_counts()
|
S 644
C 168
Q 77
unk 2
Name: Embarked, dtype: int64
Model
1 2
| from sklearn.model_selection import train_test_split from sklearn.preprocessing import OneHotEncoder
|
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked', 'Title'],
dtype='object')
1 2 3 4 5 6 7 8 9 10
|
# Columns fed to the model. The categorical ones are one-hot encoded when the
# design matrix is built; every other kept column is used as-is.
kept_cols = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Title']
cat_cols = ['Title']
num_cols = [col for col in kept_cols if col not in cat_cols]
|
1
# Design matrix: the numeric columns unchanged, plus one-hot indicator columns
# for the categorical ones (first level of each dropped).
numeric_part = df_clean[num_cols]
dummy_part = pd.get_dummies(df_clean[cat_cols], drop_first=True)
X = pd.concat([numeric_part, dummy_part], axis=1)
|
|
Pclass
|
Sex
|
Age
|
SibSp
|
Parch
|
Fare
|
Title_Miss
|
Title_Mr
|
Title_Mrs
|
Title_Other
|
0
|
3
|
1
|
22.0
|
1
|
0
|
7.2500
|
0
|
1
|
0
|
0
|
1
|
1
|
0
|
38.0
|
1
|
0
|
71.2833
|
0
|
0
|
1
|
0
|
2
|
3
|
0
|
26.0
|
0
|
0
|
7.9250
|
1
|
0
|
0
|
0
|
3
|
1
|
0
|
35.0
|
1
|
0
|
53.1000
|
0
|
0
|
1
|
0
|
4
|
3
|
1
|
35.0
|
0
|
0
|
8.0500
|
0
|
1
|
0
|
0
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
...
|
886
|
2
|
1
|
27.0
|
0
|
0
|
13.0000
|
0
|
0
|
0
|
1
|
887
|
1
|
0
|
19.0
|
0
|
0
|
30.0000
|
1
|
0
|
0
|
0
|
176
|
3
|
0
|
18.0
|
1
|
2
|
23.4500
|
1
|
0
|
0
|
0
|
889
|
1
|
1
|
26.0
|
0
|
0
|
30.0000
|
0
|
1
|
0
|
0
|
890
|
3
|
1
|
32.0
|
0
|
0
|
7.7500
|
0
|
1
|
0
|
0
|
891 rows × 10 columns
1
# Target vector: the Survived column of the cleaned training frame.
y = df_clean.loc[:, 'Survived']
|
1 2 3 4 5 6 7 8 9
# Hold out 20% of the rows for validation. The seed is fixed so the split is
# the same on every run instead of changing with each execution of the cell.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=0
)

clf = lightgbm.LGBMClassifier(
    max_depth=5,
    min_child_weight=0.1,
    n_jobs=-1,
    num_leaves=15,
    random_state=0,  # seed any randomised steps inside the booster
)
clf.fit(X=X_train, y=y_train)

# Score the fitted model on the held-out rows.
clf.score(X_val, y_val)
|
0.8715083798882681
Apply the same preprocessing to the test data
1
# Load the Kaggle test split from the current working directory.
test_csv_path = os.getcwd() + '/test.csv'
df_test = pd.read_csv(test_csv_path)
|
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
# Apply the training-time cleaning steps to the test frame.
df_test_clean = df_test.copy()

# Encode Sex as 1 for "male" and 0 otherwise.
df_test_clean["Sex"] = (df_test_clean["Sex"] == "male").astype(int)

# Reduce Cabin to its leading deck letter; missing (or non-alphabetic) cabins
# become "Unk". The vectorised string accessor replaces the fragile
# `x is np.nan` identity test.
df_test_clean["Cabin"] = (
    df_test_clean["Cabin"].str.extract(r"^([a-zA-Z])", expand=False).fillna("Unk")
)

# Pull the honorific that precedes the first "." in Name. The pattern is a raw
# string so "\." is not read as a string escape.
df_test_clean["Title"] = df_test_clean["Name"].str.extract(
    r"([A-Z][a-zA-Z]+)\.", expand=False
)

# Keep only these four titles; everything else (including names with no
# title at all) becomes "Other".
common_titles = ["Mr", "Miss", "Mrs", "Master"]
df_test_clean["Title"] = df_test_clean["Title"].where(
    df_test_clean["Title"].isin(common_titles), "Other"
)

# Fill missing ages with the median from `byPclassAndTitle` for the
# passenger's (Pclass, Title) group. The left merge keeps every test row in
# its original order with a unique index (the earlier split/merge/concat reset
# the index of imputed rows). Groups with no median in that table stay NaN.
train_median_age = byPclassAndTitle.rename(columns={"Age": "TrainMedianAge"})
df_test_clean = df_test_clean.merge(
    train_median_age,
    how="left",
    on=["Pclass", "Title"],
    validate="many_to_one",  # each (Pclass, Title) has exactly one median
)
age_fallback = df_test_clean.pop("TrainMedianAge")
df_test_clean["Age"] = df_test_clean["Age"].fillna(age_fallback)

# Mark missing ports of embarkation explicitly.
df_test_clean["Embarked"] = df_test_clean["Embarked"].fillna("unk")

df_test_clean = df_test_clean.sort_values("PassengerId")
|
1
# Test design matrix: the numeric columns unchanged, plus one-hot indicator
# columns for the categorical ones (first level of each dropped).
test_numeric_part = df_test_clean[num_cols]
test_dummy_part = pd.get_dummies(df_test_clean[cat_cols], drop_first=True)
X_test = pd.concat([test_numeric_part, test_dummy_part], axis=1)
|
1
# Align the test columns with the training design matrix, in the same order.
# Columns that are not in X_train are dropped, and any X_train column missing
# from the test frame is added as an all-zero column instead of raising KeyError.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
|
array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1,
1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1,
0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1])
1
# Load gender_submission.csv from the current working directory.
reference_csv_path = os.getcwd() + '/gender_submission.csv'
y_test_truth = pd.read_csv(reference_csv_path)
|
1
# Score the model's test predictions against the Survived column of
# gender_submission.csv.
# NOTE(review): that file looks like Kaggle's sample submission rather than the
# true test labels, so this is probably agreement with a baseline and not
# test accuracy (the leaderboard score quoted at the top is lower) — confirm.
reference_labels = y_test_truth["Survived"]
clf.score(X_test, reference_labels)
|
0.8827751196172249
Export Prediction
1 2 3 4
# Write the submission file: PassengerId plus the predicted Survived label.
# The frame loaded from gender_submission.csv is reused as a template, and the
# predictions overwrite its Survived column by position, so X_test must be in
# the same row order as that file.
y_submission = y_test_truth.copy()
y_submission["Survived"] = clf.predict(X_test)
y_submission.set_index("PassengerId", inplace=True)
y_submission.to_csv(os.getcwd()+'/Submission.csv')
|