In [1]:
import pandas as pd

df = pd.read_csv('data/train.csv')
df.head(10)
Out[1]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
In [2]:
df[['Sex', 'Age', 'SibSp', 'Parch']].head()
Out[2]:
Sex Age SibSp Parch
0 male 22.0 1 0
1 female 38.0 1 0
2 female 26.0 0 0
3 female 35.0 1 0
4 male 35.0 0 0
In [3]:
# 테스트셋
df_test = pd.read_csv('data/test.csv')
df_test.head()
Out[3]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [4]:
# 전체 나이 평균, 즉 df['Age'].mean()을 빈칸에 넣기

df['Age'] = df['Age'].fillna(df['Age'].mean())
df.head(10)
Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.000000 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.000000 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.000000 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.000000 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.000000 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male 29.699118 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.000000 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.000000 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.000000 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.000000 1 0 237736 30.0708 NaN C
In [5]:
# 테스트셋에도 똑같이 적용해야함
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].mean());
# 마지막에 ; 을 뒤에 붙이면 out에 결과표시 생략
In [6]:
df.loc[df['Age'] < 10, 'Age'] = 0
df.loc[(df['Age'] >= 10) & (df['Age'] < 20), 'Age'] = 1
df.loc[(df['Age'] >= 20) & (df['Age'] < 30), 'Age'] = 2
df.loc[(df['Age'] >= 30) & (df['Age'] < 40), 'Age'] = 3
df.loc[(df['Age'] >= 40) & (df['Age'] < 50), 'Age'] = 4
df.loc[df['Age'] >= 50, 'Age'] = 5
In [7]:
df_test.loc[df['Age'] < 10, 'Age'] = 0
df_test.loc[(df['Age'] >= 10) & (df['Age'] < 20), 'Age'] = 1
df_test.loc[(df['Age'] >= 20) & (df['Age'] < 30), 'Age'] = 2
df_test.loc[(df['Age'] >= 30) & (df['Age'] < 40), 'Age'] = 3
df_test.loc[(df['Age'] >= 40) & (df['Age'] < 50), 'Age'] = 4
df_test.loc[df['Age'] >= 50, 'Age'] = 5
In [8]:
df
Out[8]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 2.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 1.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 2.0 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 2.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 3.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

In [9]:
df['FamilySize'] = df['SibSp'] + df['Parch']
df.head()
Out[9]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S 0
In [10]:
# 테스트셋도 똑같이 적용
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch']
In [11]:
# 더이상 필요 없는 열은 버리고, 필요한 필드만 남기기
train = df[['Survived', 'Sex', 'Age', 'FamilySize']]
test = df_test[['Sex', 'Age', 'FamilySize']] # test데이터는 애초에 Survived가 없음

train.head()
Out[11]:
Survived Sex Age FamilySize
0 0 male 2.0 1
1 1 female 3.0 1
2 1 female 2.0 0
3 1 female 3.0 1
4 0 male 3.0 0
In [12]:
# 운임도 빈칸(NaN)을 운임평균으로 채우기
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())
In [13]:
df.isnull().sum()
Out[13]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
FamilySize       0
dtype: int64
In [14]:
df['Embarked'].value_counts()
Out[14]:
S    644
C    168
Q     77
Name: Embarked, dtype: int64
In [15]:
df['Embarked'] = df['Embarked'].fillna('S')
df_test['Embarked'] = df_test['Embarked'].fillna('S')
In [16]:
df.isnull().sum()
Out[16]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
FamilySize       0
dtype: int64
In [17]:
# 필요한 필드만 남기기
train = df[['Survived', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
test = df_test[['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']] # test데이터는 애초에 Survived가 없음

train.head()
Out[17]:
Survived Sex Age FamilySize Fare Embarked
0 0 male 2.0 1 7.2500 S
1 1 female 3.0 1 71.2833 C
2 1 female 2.0 0 7.9250 S
3 1 female 3.0 1 53.1000 S
4 0 male 3.0 0 8.0500 S
In [18]:
df.loc[df['Sex'] == 'male', 'Sex'] = 0
df.loc[df['Sex'] == 'female', 'Sex'] = 1

df_test.loc[df_test['Sex'] == 'male', 'Sex'] = 0
df_test.loc[df_test['Sex'] == 'female', 'Sex'] = 1

df.head()
Out[18]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN S 0
In [19]:
df.loc[df['Embarked'] == 'S', 'Embarked'] = 0
df.loc[df['Embarked'] == 'C', 'Embarked'] = 1
df.loc[df['Embarked'] == 'Q', 'Embarked'] = 2

df_test.loc[df_test['Embarked'] == 'S', 'Embarked'] = 0
df_test.loc[df_test['Embarked'] == 'C', 'Embarked'] = 1
df_test.loc[df_test['Embarked'] == 'Q', 'Embarked'] = 2

df.head()
Out[19]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN 0 0
In [20]:
train = df[['Survived', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
test = df_test[['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']] # test데이터는 애초에 Survived가 없음

train.head()
Out[20]:
Survived Sex Age FamilySize Fare Embarked
0 0 0 2.0 1 7.2500 0
1 1 1 3.0 1 71.2833 1
2 1 1 2.0 0 7.9250 0
3 1 1 3.0 1 53.1000 0
4 0 0 3.0 0 8.0500 0

Stage 3

In [21]:
x_train = train[['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
y_train = train['Survived']

x_train
Out[21]:
Sex Age FamilySize Fare Embarked
0 0 2.0 1 7.2500 0
1 1 3.0 1 71.2833 1
2 1 2.0 0 7.9250 0
3 1 3.0 1 53.1000 0
4 0 3.0 0 8.0500 0
... ... ... ... ... ...
886 0 2.0 0 13.0000 0
887 1 1.0 0 30.0000 0
888 1 2.0 3 23.4500 0
889 0 2.0 0 30.0000 1
890 0 3.0 0 7.7500 2

891 rows × 5 columns

In [22]:
y_train
Out[22]:
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64
In [23]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

print('training set accuracy:', tree.score(x_train, y_train))
training set accuracy: 0.9450056116722784
In [24]:
x_test = test[['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]

x_test
Out[24]:
Sex Age FamilySize Fare Embarked
0 0 0.0 0 7.8292 2
1 1 0.0 1 7.0000 0
2 0 0.0 0 9.6875 2
3 0 0.0 0 8.6625 0
4 1 0.0 2 12.2875 0
... ... ... ... ... ...
413 0 0.0 0 8.0500 0
414 1 0.0 0 108.9000 1
415 0 0.0 0 7.2500 0
416 0 0.0 0 8.0500 0
417 0 0.0 2 22.3583 1

418 rows × 5 columns

In [25]:
prediction = tree.predict(x_test)

prediction
Out[25]:
array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1],
      dtype=int64)
In [26]:
x_test
Out[26]:
Sex Age FamilySize Fare Embarked
0 0 0.0 0 7.8292 2
1 1 0.0 1 7.0000 0
2 0 0.0 0 9.6875 2
3 0 0.0 0 8.6625 0
4 1 0.0 2 12.2875 0
... ... ... ... ... ...
413 0 0.0 0 8.0500 0
414 1 0.0 0 108.9000 1
415 0 0.0 0 7.2500 0
416 0 0.0 0 8.0500 0
417 0 0.0 2 22.3583 1

418 rows × 5 columns

In [27]:
submit = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': prediction,    
})

submit.to_csv('submit.csv', index = False)
In [28]:
my_prediction = pd.read_csv('submit.csv')
my_prediction.head()
Out[28]:
PassengerId Survived
0 892 1
1 893 1
2 894 1
3 895 1
4 896 1

Week 4 - Stage 1

In [29]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

print('training set accuracy: ', tree.score(x_train, y_train))
training set accuracy:  0.9450056116722784
In [30]:
tree1 = DecisionTreeClassifier()
tree1.fit(x_train, y_train)

print('training set accuracy: ', tree1.score(x_train, y_train))
training set accuracy:  0.9450056116722784
In [31]:
tree2 = DecisionTreeClassifier()
tree2.fit(x_train, y_train)

print('training set accuracy: ', tree2.score(x_train, y_train))
training set accuracy:  0.9450056116722784
In [32]:
tree3 = DecisionTreeClassifier()
tree3.fit(x_train, y_train)

print('training set accuracy: ', tree3.score(x_train, y_train))
training set accuracy:  0.9450056116722784
In [33]:
x_valid = x_train[0:100]
y_valid = y_train[0:100]

x_train = x_train[100:]
y_train = y_train[100:]
In [34]:
tree1 = DecisionTreeClassifier()
tree1.fit(x_train, y_train)

print('training set accuracy: ', tree1.score(x_train, y_train))
print('validation set accuracy: ', tree1.score(x_valid, y_valid))

tree2 = DecisionTreeClassifier()
tree2.fit(x_train, y_train)

print('training set accuracy: ', tree2.score(x_train, y_train))
print('validation set accuracy: ', tree2.score(x_valid, y_valid))

tree3 = DecisionTreeClassifier()
tree3.fit(x_train, y_train)

print('training set accuracy: ', tree3.score(x_train, y_train))
print('validation set accuracy: ', tree3.score(x_valid, y_valid))
training set accuracy:  0.9443742098609356
validation set accuracy:  0.79
training set accuracy:  0.9443742098609356
validation set accuracy:  0.79
training set accuracy:  0.9443742098609356
validation set accuracy:  0.79
In [35]:
prediction1 = tree1.predict(test)
prediction2 = tree2.predict(test)
prediction3 = tree3.predict(test)

ensemble = (prediction1 + prediction2 + prediction3) / 3
ensemble[ensemble > 0.5] = 1
ensemble[ensemble <= 0.5] = 0

ensemble = ensemble.astype(int)

ensemble
Out[35]:
array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1])
In [36]:
submit = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': ensemble
})

submit.to_csv('submit_ensemble.csv', index=False)

Challenge 2

In [37]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier()
forest.fit(x_train, y_train)

forest.score(x_train, y_train)
Out[37]:
0.943109987357775
In [38]:
forest.score(x_valid, y_valid)
Out[38]:
0.79
In [39]:
prediction = tree.predict(x_test)
prediction

submit = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': ensemble
})

submit.to_csv('submit_ensemble.csv', index=False)

Homework 1

In [40]:
from sklearn.ensemble import RandomForestClassifier

# validation data
forest = RandomForestClassifier(n_estimators = 100)
forest.fit(x_train, y_train)

print('training set accuracy: ', forest.score(x_train, y_train))

prediction = tree.predict(x_test)
prediction
training set accuracy:  0.9443742098609356
Out[40]:
array([1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1],
      dtype=int64)
In [41]:
submit = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': prediction
})

submit.to_csv('submit.csv', index=False)
In [ ]: