In [1]:
import pandas as pd

df = pd.read_csv('data/train.csv')
df.head(10)
Out[1]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
In [2]:
df[['Sex', 'Age', 'SibSp', 'Parch']].head()
Out[2]:
Sex Age SibSp Parch
0 male 22.0 1 0
1 female 38.0 1 0
2 female 26.0 0 0
3 female 35.0 1 0
4 male 35.0 0 0
In [3]:
# 테스트셋
df_test = pd.read_csv('data/test.csv')
df_test.head()
Out[3]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [4]:
# 전체 나이 평균, 즉 df['Age'].mean()을 빈칸에 넣기
df['Age'] = df['Age'].fillna(df['Age'].mean())
df.head(10)
Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.000000 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.000000 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.000000 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.000000 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.000000 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male 29.699118 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.000000 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.000000 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.000000 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.000000 1 0 237736 30.0708 NaN C
In [5]:
# 테스트셋에도 똑같이 적용해야함
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].mean());
# 마지막에 ; 을 뒤에 붙이면 out에 결과표시 생략
In [6]:
df.loc[df['Age'] < 10, 'Age'] = 0
df.loc[(df['Age'] >= 10) & (df['Age'] < 20), 'Age'] = 1
df.loc[(df['Age'] >= 20) & (df['Age'] < 30), 'Age'] = 2
df.loc[(df['Age'] >= 30) & (df['Age'] < 40), 'Age'] = 3
df.loc[(df['Age'] >= 40) & (df['Age'] < 50), 'Age'] = 4
df.loc[df['Age'] >= 50, 'Age'] = 5
In [7]:
df['FamilySize'] = df['SibSp'] + df['Parch']
df.head()
Out[7]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S 0
In [8]:
# 테스트셋도 똑같이 적용
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch']
In [9]:
# 더이상 필요 없는 열은 버리고, 필요한 필드만 남기기
train = df[['Survived', 'Sex', 'Age', 'FamilySize']]
test = df_test[['Sex', 'Age', 'FamilySize']] # test데이터는 애초에 Survived가 없음

train.head()
Out[9]:
Survived Sex Age FamilySize
0 0 male 2.0 1
1 1 female 3.0 1
2 1 female 2.0 0
3 1 female 3.0 1
4 0 male 3.0 0
In [10]:
# 운임도 빈칸(NaN)을 운임평균으로 채우기
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())
In [11]:
df.isnull().sum()
Out[11]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
FamilySize       0
dtype: int64
In [12]:
df['Embarked'].value_counts()
Out[12]:
S    644
C    168
Q     77
Name: Embarked, dtype: int64
In [13]:
df['Embarked'] = df['Embarked'].fillna('S')
df_test['Embarked'] = df_test['Embarked'].fillna('S')
In [14]:
df.isnull().sum()
Out[14]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
FamilySize       0
dtype: int64
In [15]:
# 필요한 필드만 남기기
train = df[['Survived', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
test = df_test[['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']] # test데이터는 애초에 Survived가 없음

train.head()
Out[15]:
Survived Sex Age FamilySize Fare Embarked
0 0 male 2.0 1 7.2500 S
1 1 female 3.0 1 71.2833 C
2 1 female 2.0 0 7.9250 S
3 1 female 3.0 1 53.1000 S
4 0 male 3.0 0 8.0500 S
In [16]:
df.loc[df['Sex'] == 'male', 'Sex'] = 0
df.loc[df['Sex'] == 'female', 'Sex'] = 1

df_test.loc[df_test['Sex'] == 'male', 'Sex'] = 0
df_test.loc[df_test['Sex'] == 'female', 'Sex'] = 1

df.head()
Out[16]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN S 0
In [17]:
df.loc[df['Embarked'] == 'S', 'Embarked'] = 0
df.loc[df['Embarked'] == 'C', 'Embarked'] = 1
df.loc[df['Embarked'] == 'Q', 'Embarked'] = 2

df_test.loc[df_test['Embarked'] == 'S', 'Embarked'] = 0
df_test.loc[df_test['Embarked'] == 'C', 'Embarked'] = 1
df_test.loc[df_test['Embarked'] == 'Q', 'Embarked'] = 2

df.head()
Out[17]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN 0 0
In [18]:
train = df[['Survived', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
test = df_test[['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']] # test데이터는 애초에 Survived가 없음

train.head()
Out[18]:
Survived Sex Age FamilySize Fare Embarked
0 0 0 2.0 1 7.2500 0
1 1 1 3.0 1 71.2833 1
2 1 1 2.0 0 7.9250 0
3 1 1 3.0 1 53.1000 0
4 0 0 3.0 0 8.0500 0

과제 1

In [19]:
df[['Name', 'Age']].head(10)
Out[19]:
Name Age
0 Braund, Mr. Owen Harris 2.0
1 Cumings, Mrs. John Bradley (Florence Briggs Th... 3.0
2 Heikkinen, Miss. Laina 2.0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 3.0
4 Allen, Mr. William Henry 3.0
5 Moran, Mr. James 2.0
6 McCarthy, Mr. Timothy J 5.0
7 Palsson, Master. Gosta Leonard 0.0
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 2.0
9 Nasser, Mrs. Nicholas (Adele Achem) 1.0
In [20]:
df.loc[ df['Name'].str.contains('Mr\.'), 'Name' ] = 'Mr'
df.loc[ df['Name'].str.contains('Mrs\.'), 'Name' ] = 'Mrs'
df.loc[ df['Name'].str.contains('Miss\.'), 'Name' ] = 'Miss'
df.head()
Out[20]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Mr 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 Mrs 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 Miss 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 Mrs 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 Mr 0 3.0 0 0 373450 8.0500 NaN 0 0
In [22]:
df['Name'] = df['Name'].map({
    'Mr': 0,
    'Mrs': 1,
    'Miss': 2
})

df.head(10)
Out[22]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 NaN 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 NaN 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 NaN 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 NaN 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 NaN 0 3.0 0 0 373450 8.0500 NaN 0 0
5 6 0 3 NaN 0 2.0 0 0 330877 8.4583 NaN 2 0
6 7 0 1 NaN 0 5.0 0 0 17463 51.8625 E46 0 0
7 8 0 3 NaN 0 0.0 3 1 349909 21.0750 NaN 0 4
8 9 1 3 NaN 1 2.0 0 2 347742 11.1333 NaN 0 2
9 10 1 2 NaN 1 1.0 1 0 237736 30.0708 NaN 1 1
In [23]:
df['Name'] = df['Name'].fillna(3)
df.head(10)
Out[23]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 3.0 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 3.0 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 3.0 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 3.0 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 3.0 0 3.0 0 0 373450 8.0500 NaN 0 0
5 6 0 3 3.0 0 2.0 0 0 330877 8.4583 NaN 2 0
6 7 0 1 3.0 0 5.0 0 0 17463 51.8625 E46 0 0
7 8 0 3 3.0 0 0.0 3 1 349909 21.0750 NaN 0 4
8 9 1 3 3.0 1 2.0 0 2 347742 11.1333 NaN 0 2
9 10 1 2 3.0 1 1.0 1 0 237736 30.0708 NaN 1 1
In [24]:
df['Name'].value_counts()
Out[24]:
3.0    891
Name: Name, dtype: int64
In [25]:
df['Age'] = df['Age'].fillna( df.groupby('Name')['Age'].transform('mean') )
df.head(10)
Out[25]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 3.0 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 3.0 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 3.0 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 3.0 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 3.0 0 3.0 0 0 373450 8.0500 NaN 0 0
5 6 0 3 3.0 0 2.0 0 0 330877 8.4583 NaN 2 0
6 7 0 1 3.0 0 5.0 0 0 17463 51.8625 E46 0 0
7 8 0 3 3.0 0 0.0 3 1 349909 21.0750 NaN 0 4
8 9 1 3 3.0 1 2.0 0 2 347742 11.1333 NaN 0 2
9 10 1 2 3.0 1 1.0 1 0 237736 30.0708 NaN 1 1
In [26]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()
In [27]:
facet = sns.FacetGrid(df, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age')
facet.add_legend()

plt.show()

과제 2

In [29]:
df['Fare'] = df['Fare'].fillna( df.groupby('Pclass')['Fare'].transform('mean') )
In [30]:
df_test['Fare'] = df_test['Fare'].fillna( df_test.groupby('Pclass')['Fare'].transform('mean') )
In [31]:
print(df_test.isnull().sum())
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
FamilySize       0
dtype: int64
In [32]:
x_train = train[['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
y_train = train['Survived']
In [33]:
x_train
Out[33]:
Sex Age FamilySize Fare Embarked
0 0 2.0 1 7.2500 0
1 1 3.0 1 71.2833 1
2 1 2.0 0 7.9250 0
3 1 3.0 1 53.1000 0
4 0 3.0 0 8.0500 0
5 0 2.0 0 8.4583 2
6 0 5.0 0 51.8625 0
7 0 0.0 4 21.0750 0
8 1 2.0 2 11.1333 0
9 1 1.0 1 30.0708 1
10 1 0.0 2 16.7000 0
11 1 5.0 0 26.5500 0
12 0 2.0 0 8.0500 0
13 0 3.0 6 31.2750 0
14 1 1.0 0 7.8542 0
15 1 5.0 0 16.0000 0
16 0 0.0 5 29.1250 2
17 0 2.0 0 13.0000 0
18 1 3.0 1 18.0000 0
19 1 2.0 0 7.2250 1
20 0 3.0 0 26.0000 0
21 0 3.0 0 13.0000 0
22 1 1.0 0 8.0292 2
23 0 2.0 0 35.5000 0
24 1 0.0 4 21.0750 0
25 1 3.0 6 31.3875 0
26 0 2.0 0 7.2250 1
27 0 1.0 5 263.0000 0
28 1 2.0 0 7.8792 2
29 0 2.0 0 7.8958 0
... ... ... ... ... ...
861 0 2.0 1 11.5000 0
862 1 4.0 0 25.9292 0
863 1 2.0 10 69.5500 0
864 0 2.0 0 13.0000 0
865 1 4.0 0 13.0000 0
866 1 2.0 1 13.8583 1
867 0 3.0 0 50.4958 0
868 0 2.0 0 9.5000 0
869 0 0.0 2 11.1333 0
870 0 2.0 0 7.8958 0
871 1 4.0 2 52.5542 0
872 0 3.0 0 5.0000 0
873 0 4.0 0 9.0000 0
874 1 2.0 1 24.0000 1
875 1 1.0 0 7.2250 1
876 0 2.0 0 9.8458 0
877 0 1.0 0 7.8958 0
878 0 2.0 0 7.8958 0
879 1 5.0 1 83.1583 1
880 1 2.0 1 26.0000 0
881 0 3.0 0 7.8958 0
882 1 2.0 0 10.5167 0
883 0 2.0 0 10.5000 0
884 0 2.0 0 7.0500 0
885 1 3.0 5 29.1250 2
886 0 2.0 0 13.0000 0
887 1 1.0 0 30.0000 0
888 1 2.0 3 23.4500 0
889 0 2.0 0 30.0000 1
890 0 3.0 0 7.7500 2

891 rows × 5 columns

In [34]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

print('training set accuracy:', tree.score(x_train, y_train))
training set accuracy: 0.9450056116722784
In [36]:
x_test = test[['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
In [37]:
x_test
Out[37]:
Sex Age FamilySize Fare Embarked
0 0 34.50000 0 7.8292 2
1 1 47.00000 1 7.0000 0
2 0 62.00000 0 9.6875 2
3 0 27.00000 0 8.6625 0
4 1 22.00000 2 12.2875 0
5 0 14.00000 0 9.2250 0
6 1 30.00000 0 7.6292 2
7 0 26.00000 2 29.0000 0
8 1 18.00000 0 7.2292 1
9 0 21.00000 2 24.1500 0
10 0 30.27259 0 7.8958 0
11 0 46.00000 0 26.0000 0
12 1 23.00000 1 82.2667 0
13 0 63.00000 1 26.0000 0
14 1 47.00000 1 61.1750 0
15 1 24.00000 1 27.7208 1
16 0 35.00000 0 12.3500 2
17 0 21.00000 0 7.2250 1
18 1 27.00000 1 7.9250 0
19 1 45.00000 0 7.2250 1
20 0 55.00000 1 59.4000 1
21 0 9.00000 1 3.1708 0
22 1 30.27259 0 31.6833 0
23 0 21.00000 1 61.3792 1
24 1 48.00000 4 262.3750 1
25 0 50.00000 1 14.5000 0
26 1 22.00000 1 61.9792 1
27 0 22.50000 0 7.2250 1
28 0 41.00000 0 30.5000 0
29 0 30.27259 2 21.6792 1
... ... ... ... ... ...
388 0 21.00000 0 7.7500 2
389 0 6.00000 4 21.0750 0
390 0 23.00000 0 93.5000 0
391 1 51.00000 1 39.4000 0
392 0 13.00000 2 20.2500 0
393 0 47.00000 0 10.5000 0
394 0 29.00000 4 22.0250 0
395 1 18.00000 1 60.0000 0
396 0 24.00000 0 7.2500 2
397 1 48.00000 2 79.2000 1
398 0 22.00000 0 7.7750 0
399 0 31.00000 0 7.7333 2
400 1 30.00000 0 164.8667 0
401 0 38.00000 1 21.0000 0
402 1 22.00000 1 59.4000 1
403 0 17.00000 0 47.1000 0
404 0 43.00000 1 27.7208 1
405 0 20.00000 0 13.8625 1
406 0 23.00000 1 10.5000 0
407 0 50.00000 2 211.5000 1
408 1 30.27259 0 7.7208 2
409 1 3.00000 2 13.7750 0
410 1 30.27259 0 7.7500 2
411 1 37.00000 1 90.0000 2
412 1 28.00000 0 7.7750 0
413 0 30.27259 0 8.0500 0
414 1 39.00000 0 108.9000 1
415 0 38.50000 0 7.2500 0
416 0 30.27259 0 8.0500 0
417 0 30.27259 2 22.3583 1

418 rows × 5 columns

In [38]:
prediction = tree.predict(x_test) # 즉 y_test가 만들어집니다.
prediction
Out[38]:
array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0],
      dtype=int64)
In [41]:
submit = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': prediction
})

submit.to_csv('submit.csv', index=False)
In [42]:
my_prediction = pd.read_csv('submit.csv')
my_prediction.head()
Out[42]:
PassengerId Survived
0 892 0
1 893 0
2 894 0
3 895 0
4 896 1