In [1]:
import pandas as pd

df = pd.read_csv("../../COALA_DS_DATA/COALA_DS_DATA/week3/data/train.csv")

df
Out[1]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

In [2]:
df['Fare'].value_counts()
Out[2]:
8.0500     43
13.0000    42
7.8958     38
7.7500     34
26.0000    31
           ..
8.4583      1
9.8375      1
8.3625      1
14.1083     1
17.4000     1
Name: Fare, Length: 248, dtype: int64
In [3]:
df[['Pclass', 'Fare']]
Out[3]:
Pclass Fare
0 3 7.2500
1 1 71.2833
2 3 7.9250
3 1 53.1000
4 3 8.0500
... ... ...
886 2 13.0000
887 1 30.0000
888 3 23.4500
889 1 30.0000
890 3 7.7500

891 rows × 2 columns

In [4]:
df['Fare'] = df['Fare'].fillna(df.groupby('Pclass')['Fare'].transform("mean"))
df.head()
Out[4]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [5]:
df.loc[df['SibSp'] + df["Parch"] == 0, 'isAlone'] = 1
df.loc[df['SibSp'] + df["Parch"] > 0, 'isAlone'] = 0

df
Out[5]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked isAlone
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S 0.0
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C 0.0
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 1.0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S 0.0
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.0 0 0 211536 13.0000 NaN S 1.0
887 888 1 1 Graham, Miss. Margaret Edith female 19.0 0 0 112053 30.0000 B42 S 1.0
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female NaN 1 2 W./C. 6607 23.4500 NaN S 0.0
889 890 1 1 Behr, Mr. Karl Howell male 26.0 0 0 111369 30.0000 C148 C 1.0
890 891 0 3 Dooley, Mr. Patrick male 32.0 0 0 370376 7.7500 NaN Q 1.0

891 rows × 13 columns

In [6]:
df.loc[df['Name'].str.contains("Mr\."), 'Name'] = 'Mr'
df.loc[df['Name'].str.contains("Mrs\."), 'Name'] = 'Mrs'
df.loc[df['Name'].str.contains("Miss\."), 'Name'] = 'Miss'
df[['Name', 'Age']]
Out[6]:
Name Age
0 Mr 22.0
1 Mrs 38.0
2 Miss 26.0
3 Mrs 35.0
4 Mr 35.0
... ... ...
886 Montvila, Rev. Juozas 27.0
887 Miss 19.0
888 Miss NaN
889 Mr 26.0
890 Mr 32.0

891 rows × 2 columns

In [7]:
df['Name'] = df['Name'].map({
    'Mr': 0,
    'Mrs': 1,
    'Miss': 2
})

df['Name'] = df['Name'].fillna(3)

df
Out[7]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked isAlone
0 1 0 3 0.0 male 22.0 1 0 A/5 21171 7.2500 NaN S 0.0
1 2 1 1 1.0 female 38.0 1 0 PC 17599 71.2833 C85 C 0.0
2 3 1 3 2.0 female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S 1.0
3 4 1 1 1.0 female 35.0 1 0 113803 53.1000 C123 S 0.0
4 5 0 3 0.0 male 35.0 0 0 373450 8.0500 NaN S 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 3.0 male 27.0 0 0 211536 13.0000 NaN S 1.0
887 888 1 1 2.0 female 19.0 0 0 112053 30.0000 B42 S 1.0
888 889 0 3 2.0 female NaN 1 2 W./C. 6607 23.4500 NaN S 0.0
889 890 1 1 0.0 male 26.0 0 0 111369 30.0000 C148 C 1.0
890 891 0 3 0.0 male 32.0 0 0 370376 7.7500 NaN Q 1.0

891 rows × 13 columns

In [8]:
df_test = pd.read_csv("../../COALA_DS_DATA/COALA_DS_DATA/week3/data/test.csv")

df_test
Out[8]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
... ... ... ... ... ... ... ... ... ... ... ...
413 1305 3 Spector, Mr. Woolf male NaN 0 0 A.5. 3236 8.0500 NaN S
414 1306 1 Oliva y Ocana, Dona. Fermina female 39.0 0 0 PC 17758 108.9000 C105 C
415 1307 3 Saether, Mr. Simon Sivertsen male 38.5 0 0 SOTON/O.Q. 3101262 7.2500 NaN S
416 1308 3 Ware, Mr. Frederick male NaN 0 0 359309 8.0500 NaN S
417 1309 3 Peter, Master. Michael J male NaN 1 1 2668 22.3583 NaN C

418 rows × 11 columns

In [9]:
df_test['Fare'].isnull().sum()
Out[9]:
1
In [10]:
df_test['Fare'] = df_test['Fare'].fillna(df_test.groupby('Pclass')['Fare'].transform("mean"))
df_test.head()
Out[10]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [11]:
df_test.loc[df_test['SibSp'] + df_test["Parch"] == 0, 'isAlone'] = 1
df_test.loc[df_test['SibSp'] + df_test["Parch"] > 0, 'isAlone'] = 0

df_test
Out[11]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked isAlone
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q 1.0
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S 0.0
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q 1.0
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S 1.0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S 0.0
... ... ... ... ... ... ... ... ... ... ... ... ...
413 1305 3 Spector, Mr. Woolf male NaN 0 0 A.5. 3236 8.0500 NaN S 1.0
414 1306 1 Oliva y Ocana, Dona. Fermina female 39.0 0 0 PC 17758 108.9000 C105 C 1.0
415 1307 3 Saether, Mr. Simon Sivertsen male 38.5 0 0 SOTON/O.Q. 3101262 7.2500 NaN S 1.0
416 1308 3 Ware, Mr. Frederick male NaN 0 0 359309 8.0500 NaN S 1.0
417 1309 3 Peter, Master. Michael J male NaN 1 1 2668 22.3583 NaN C 0.0

418 rows × 12 columns

In [12]:
df['Embarked'] = df['Embarked'].fillna('S')
df_test['Embarked'] = df_test['Embarked'].fillna('S')

df.loc[df['Embarked'] == 'S', 'Embarked'] = 0
df.loc[df['Embarked'] == 'C', 'Embarked'] = 1
df.loc[df['Embarked'] == 'Q', 'Embarked'] = 2

df_test.loc[df_test['Embarked'] == 'S', 'Embarked'] = 0
df_test.loc[df_test['Embarked'] == 'C', 'Embarked'] = 1
df_test.loc[df_test['Embarked'] == 'Q', 'Embarked'] = 2


df.head()
Out[12]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked isAlone
0 1 0 3 0.0 male 22.0 1 0 A/5 21171 7.2500 NaN 0 0.0
1 2 1 1 1.0 female 38.0 1 0 PC 17599 71.2833 C85 1 0.0
2 3 1 3 2.0 female 26.0 0 0 STON/O2. 3101282 7.9250 NaN 0 1.0
3 4 1 1 1.0 female 35.0 1 0 113803 53.1000 C123 0 0.0
4 5 0 3 0.0 male 35.0 0 0 373450 8.0500 NaN 0 1.0
In [13]:
df.loc[df['Sex'] == 'male', 'Sex'] = 0
df.loc[df['Sex'] == 'female', 'Sex'] = 1

df_test.loc[df_test['Sex'] == 'male', 'Sex'] = 0
df_test.loc[df_test['Sex'] == 'female', 'Sex'] = 1

df.head()
Out[13]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked isAlone
0 1 0 3 0.0 0 22.0 1 0 A/5 21171 7.2500 NaN 0 0.0
1 2 1 1 1.0 1 38.0 1 0 PC 17599 71.2833 C85 1 0.0
2 3 1 3 2.0 1 26.0 0 0 STON/O2. 3101282 7.9250 NaN 0 1.0
3 4 1 1 1.0 1 35.0 1 0 113803 53.1000 C123 0 0.0
4 5 0 3 0.0 0 35.0 0 0 373450 8.0500 NaN 0 1.0
In [14]:
df['FamilySize'] = df['SibSp'] + df['Parch']
df_test['FamilySize'] = df_test['SibSp'] + df_test['Parch']
In [15]:
df_test.isnull().sum()
Out[15]:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
isAlone          0
FamilySize       0
dtype: int64
In [16]:
df_test['Name'] = df_test['Name'].map({
    'Mr': 0,
    'Mrs': 1,
    'Miss': 2
})

df_test['Name'] = df_test['Name'].fillna(3)
In [17]:
df_test['Age'] = df_test['Age'].fillna(df_test.groupby('Name')['Age'].transform('mean'))
df['Age'] = df['Age'].fillna(df.groupby('Name')['Age'].transform('mean'))
In [18]:
x_train = df[['Name', 'Sex', 'Age', 'Fare', 'FamilySize', 'Embarked', 'isAlone']]
y_train = df['Survived'] 

x_train
Out[18]:
Name Sex Age Fare FamilySize Embarked isAlone
0 0.0 0 22.000000 7.2500 1 0 0.0
1 1.0 1 38.000000 71.2833 1 1 0.0
2 2.0 1 26.000000 7.9250 0 0 1.0
3 1.0 1 35.000000 53.1000 1 0 0.0
4 0.0 0 35.000000 8.0500 0 0 1.0
... ... ... ... ... ... ... ...
886 3.0 0 27.000000 13.0000 0 0 1.0
887 2.0 1 19.000000 30.0000 0 0 1.0
888 2.0 1 21.773973 23.4500 3 0 0.0
889 0.0 0 26.000000 30.0000 0 1 1.0
890 0.0 0 32.000000 7.7500 0 2 1.0

891 rows × 7 columns

In [19]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

print('training set accuracy:', tree.score(x_train, y_train))
training set accuracy: 0.9842873176206509
In [20]:
x_test = df_test[['Name', 'Sex', 'Age', 'Fare', 'FamilySize', 'Embarked', 'isAlone']]
prediction = tree.predict(x_test) 
prediction
Out[20]:
array([0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0],
      dtype=int64)
In [21]:
submit = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': prediction
})

submit.to_csv('submit.csv', index=False)
In [22]:
my_prediction = pd.read_csv('submit.csv')
my_prediction.head()
Out[22]:
PassengerId Survived
0 892 0
1 893 0
2 894 0
3 895 1
4 896 0

Random Forest


Decision Tree는 일부 데이터에 대해 과적합(Overfitting) 현상으로 실제 데이터 예측 성공 확률이 낮아질 수 있음.

그것을 보완하기 위해 Random Forest는 여러 Decision Tree의 결과를 종합, 평균을 계산(Ensemble)하여 예측하는 모델

Stage1

In [23]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

print('training set accuracy : ', tree.score(x_train, y_train))
training set accuracy :  0.9842873176206509
In [24]:
tree1 = DecisionTreeClassifier()
tree1.fit(x_train, y_train)

print('training set accuracy : ', tree1.score(x_train, y_train))
training set accuracy :  0.9842873176206509
In [25]:
tree2 = DecisionTreeClassifier()
tree2.fit(x_train, y_train)

print('training set accuracy : ', tree2.score(x_train, y_train))
training set accuracy :  0.9842873176206509
In [26]:
# 위의 3개의 트리는 모두 다른 트리다.
# 문제나 데이터가 복잡해지면 정확도는 달라짐
# 연결해보기 (Ensemble)

# Validation_Set -> 훈련에서 제외시킬 테스트(학습된 모델의 중간 성능)용 데이터
x_valid = x_train[:100]
y_valid = y_train[:100]

x_train = x_train[100:]
y_train = y_train[100:]
In [27]:
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

print('training set accuracy : ', tree.score(x_train, y_train))
print('validation set accuracy : ', tree.score(x_valid, y_valid))

tree1 = DecisionTreeClassifier()
tree1.fit(x_train, y_train)

print('training set accuracy : ', tree1.score(x_train, y_train))
print('validation set accuracy : ', tree1.score(x_valid, y_valid))

tree2 = DecisionTreeClassifier()
tree2.fit(x_train, y_train)

print('training set accuracy : ', tree2.score(x_train, y_train))
print('validation set accuracy : ', tree2.score(x_valid, y_valid))

# validation accuracy 는 매번 달라지는 것을 보면, 서로 다른 트리임을 알 수 있다.
# val_accuracy 가 더 합리적인 성능 지표
training set accuracy :  0.9848293299620733
validation set accuracy :  0.79
training set accuracy :  0.9848293299620733
validation set accuracy :  0.78
training set accuracy :  0.9848293299620733
validation set accuracy :  0.78
In [28]:
# 여러개의 Decision Tree Ensemble 하기
prediction1 = tree.predict(df_test[['Name', 'Sex', 'Age', 'Fare', 'FamilySize', 'Embarked', 'isAlone']])
prediction2 = tree1.predict(df_test[['Name', 'Sex', 'Age', 'Fare', 'FamilySize', 'Embarked', 'isAlone']])
prediction3 = tree2.predict(df_test[['Name', 'Sex', 'Age', 'Fare', 'FamilySize', 'Embarked', 'isAlone']])

ensemble = (prediction1 + prediction2 + prediction3) / 3
ensemble[ensemble > 0.5] = 1
ensemble[ensemble <= 0.5] = 0

# 평균 내는 작업으로 인해 survived 데이터가 정수에서 소수로 바뀜
# 다시 정수로 변환
ensemble = ensemble.astype(int)

ensemble
Out[28]:
array([0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0])
In [29]:
submit = pd.DataFrame({
    'PassengerId' : df_test['PassengerId'],
    'Survived' : ensemble
})

submit.to_csv("submit_ensemble.csv", index = False)

Ch2

In [30]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators = 3000)
rfc.fit(x_train, y_train)

print('training set accuracy : ', rfc.score(x_train, y_train))
training set accuracy :  0.9848293299620733
In [31]:
print('validation set accuracy : ', rfc.score(x_valid, y_valid))
validation set accuracy :  0.75
In [32]:
prediction2 = rfc.predict(x_test)
print(prediction)

submit = pd.DataFrame({
    'PassengerId' : df_test["PassengerId"],
    'Survived' : prediction
})

submit.to_csv('submit_rfc.csv', index = False)
[0 0 0 1 0 1 0 0 1 0 0 0 1 0 1 1 0 1 1 0 1 0 1 0 1 0 1 1 1 0 0 0 1 0 0 1 0
 0 0 1 0 0 0 1 1 0 1 0 1 1 0 0 1 1 1 0 0 0 1 1 1 0 1 1 1 1 1 1 0 1 1 0 0 0
 1 1 0 1 0 1 1 1 0 0 0 0 1 1 0 1 0 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0
 0 1 1 1 0 0 1 1 1 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0
 0 0 1 0 0 1 0 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 0 1 0 1
 0 1 1 0 0 0 0 1 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 1 0 0 0 0 1 1 1 0 1 1 0 1 0
 0 1 1 1 1 0 0 0 1 1 0 0 1 1 1 1 1 1 1 1 1 0 0 1 1 0 1 0 0 0 0 0 1 0 0 0 1
 0 1 1 1 1 0 0 0 0 0 1 1 0 1 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 1 1
 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 1 0 0 1 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 1 1 0
 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 1 0 1 0 0 1 0 1 1 1 1 0 0 1 1 0
 0 1 0 0 1 1 0 0 1 1 0 1 1 1 0 1 1 0 0 0 0 1 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0
 1 0 1 0 1 0 0 1 0 0 0]

HW1

In [33]:
x_all = df[['Name', 'Sex', 'Age', 'Fare', 'FamilySize', 'Embarked', 'isAlone']]
y_all = df['Survived']
In [34]:
from sklearn.ensemble import RandomForestClassifier

rfc2 = RandomForestClassifier(n_estimators = 3000)
rfc2.fit(x_all, y_all)

print('training set accuracy : ', rfc2.score(x_all, y_all))
training set accuracy :  0.9842873176206509
In [35]:
prediction2 = rfc2.predict(x_test)
print(prediction2)

submit = pd.DataFrame({
    'PassengerId' : df_test["PassengerId"],
    'Survived' : prediction
})

submit.to_csv('submit_rfc2.csv', index = False)
[0 0 0 0 1 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 1 1 1 1 0 1 0 1 1 0 1 1 1 1 0 0
 0 0 1 1 0 0 1 1 0 1 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 1 0
 1 1 0 1 0 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 1 1 1 1 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0
 0 0 1 0 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1
 0 1 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0
 1 0 1 1 0 0 0 0 0 1 0 0 1 0 1 0 1 1 1 1 1 0 0 0 1 0 1 1 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0
 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1 0 0 1 0 0 0 0 1 0 1 1 0 0 1 1 0
 1 0 0 0 0 0 0 0 0 0 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 0 1 1 0 1 0 1 1 1 0
 1 1 0 0 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 0 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 1 0 1 0 1 0 0 1 0 0 1]
In [36]:
from sklearn.ensemble import RandomForestClassifier

rfc3 = RandomForestClassifier(n_estimators = 100, max_depth = 25)
rfc3.fit(x_all, y_all)

print("training set accuracy :", rfc3.score(x_all, y_all))
training set accuracy : 0.9842873176206509
In [37]:
prediction3 = rfc3.predict(x_test)
print(prediction3)

submit = pd.DataFrame({
    'PassengerId' : df_test["PassengerId"],
    'Survived' : prediction
})

submit.to_csv('submit_rfc3.csv', index = False)
[0 0 0 0 1 0 0 0 1 1 0 0 1 0 1 1 0 0 0 1 1 1 1 1 1 0 1 0 0 1 0 1 1 1 1 0 0
 0 0 1 1 0 0 1 1 0 1 0 1 1 0 0 1 1 0 0 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 0 0 0
 1 1 0 1 0 1 1 0 0 0 0 1 1 0 0 1 1 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
 0 1 1 1 1 0 1 1 1 1 0 1 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 1 0 0 1 0 0
 0 1 1 0 0 1 0 0 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 0 0 0 0 1 1 1 1 1 0 1 1 0 1
 0 1 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 0 0 0 1 1 1 0 0 1 1 0 1 0
 1 0 1 1 0 0 0 0 1 1 0 0 1 0 1 0 1 1 1 1 1 0 0 1 1 1 1 0 1 0 1 0 0 0 0 0 1
 0 0 0 1 1 0 0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 0 0
 1 1 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 1 1 1 0 0 1 0 0 0 0 1 0 1 1 0 0 1 1 0
 1 0 0 0 1 0 0 0 0 0 1 1 1 0 1 0 1 1 0 0 1 1 0 1 0 0 1 0 1 1 0 1 0 1 1 1 0
 1 1 0 0 1 1 0 0 1 0 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 1 0 1 0 0 1 0 1 0 0 0 0
 1 0 1 0 1 0 0 1 0 0 1]