In [1]:
import pandas as pd
# Load the training set and preview its first five rows.
data = pd.read_csv('data/train.csv')
data.head()
Out[1]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [2]:
# Load the test set and preview its first five rows.
# A cell only displays its last expression, so the preview is kept as the final line.
data_test = pd.read_csv('data/test.csv')
data_test.head()
Out[2]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [3]:
# Replace missing Age values with the mean age of the same dataset
# (the training mean for `data`, the test mean for `data_test`).
train_age_mean = data['Age'].mean()
test_age_mean = data_test['Age'].mean()
data['Age'] = data['Age'].fillna(train_age_mean)
data_test['Age'] = data_test['Age'].fillna(test_age_mean)
In [4]:
# General pattern for a conditional write: data.loc[condition, column] = value to store

# Replace each age with the number of its band, decided in one pass from the original value:
#   0: below 10, 1: 10-19, 2: 20-29, 3: 30-39, 4: 40-49, 5: 50 and above
# NOTE: Age is overwritten in place, so running this cell a second time without
# reloading the data would re-bin the band numbers themselves (all of them fall below 10).
age_band_edges = [float('-inf'), 10, 20, 30, 40, 50, float('inf')]
# right=False closes each band on the left, e.g. [10, 20), i.e. `>= 10` and `< 20`.
# labels=False yields the band number; the cast keeps Age a float column as before.
data['Age'] = pd.cut(data['Age'], bins=age_band_edges, right=False, labels=False).astype(float)
In [5]:
# Replace each test-set age with the number of its band, decided in one pass from the original value:
#   0: below 10, 1: 10-19, 2: 20-29, 3: 30-39, 4: 40-49, 5: 50 and above
# NOTE: Age is overwritten in place, so running this cell a second time without
# reloading the data would re-bin the band numbers themselves (all of them fall below 10).
age_band_edges = [float('-inf'), 10, 20, 30, 40, 50, float('inf')]
# right=False closes each band on the left, e.g. [10, 20), i.e. `>= 10` and `< 20`.
# labels=False yields the band number; the cast keeps Age a float column as before.
data_test['Age'] = pd.cut(data_test['Age'], bins=age_band_edges, right=False, labels=False).astype(float)
In [6]:
# FamilySize = SibSp + Parch (the passenger is not counted).
family_size = data['SibSp'].add(data['Parch'])
data['FamilySize'] = family_size
# IsAlone is 1 when FamilySize is 0 and 0 when it is 1 or more.
# The first masked write creates the column, which is why it ends up as floats (1.0 / 0.0).
data.loc[family_size.eq(0), 'IsAlone'] = 1
data.loc[family_size.ge(1), 'IsAlone'] = 0
data.head(5)
Out[6]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize IsAlone
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S 1 0.0
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C 1 0.0
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0 1.0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S 1 0.0
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S 0 1.0
In [7]:
# Same FamilySize feature for the test set: SibSp + Parch.
data_test['FamilySize'] = data_test['SibSp'].add(data_test['Parch'])

# First feature selection; the training frame additionally keeps the Survived label.
feature_cols = ['Sex', 'Age', 'FamilySize']
train = data[['Survived'] + feature_cols]
test = data_test[feature_cols]

train.head(5)
Out[7]:
Survived Sex Age FamilySize
0 0 male 2.0 1
1 1 female 3.0 1
2 1 female 2.0 0
3 1 female 3.0 1
4 0 male 3.0 0
In [8]:
# Replace missing Fare values with the mean fare of the same dataset.
for frame in (data, data_test):
    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].mean())
In [9]:
# Count the missing values left in each training column.
data.isna().sum()
Out[9]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
FamilySize       0
IsAlone          0
dtype: int64
In [10]:
# Count the missing values left in each test column.
data_test.isna().sum()
Out[10]:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
FamilySize       0
dtype: int64
In [11]:
# Frequency of each Embarked value in the training set (used to pick a fill value for the gaps).
data['Embarked'].value_counts()
Out[11]:
S    644
C    168
Q     77
Name: Embarked, dtype: int64
In [12]:
# Fill missing Embarked entries with 'S', the most frequent value in the training set.
embarked_fill = 'S'
for frame in (data, data_test):
    frame['Embarked'] = frame['Embarked'].fillna(embarked_fill)
In [13]:
# Re-check the test set's missing-value counts after the Embarked fill.
data_test.isna().sum()
Out[13]:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
FamilySize       0
dtype: int64
In [14]:
# Widen the feature selection with Fare and Embarked; the training frame also keeps Survived.
feature_cols = ['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']
train = data[['Survived'] + feature_cols]
test = data_test[feature_cols]

train.head(5)
Out[14]:
Survived Sex Age FamilySize Fare Embarked
0 0 male 2.0 1 7.2500 S
1 1 female 3.0 1 71.2833 C
2 1 female 2.0 0 7.9250 S
3 1 female 3.0 1 53.1000 S
4 0 male 3.0 0 8.0500 S

챌린지1

In [15]:
# Encode Sex as an integer feature: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
# Values that are not keys (such as 0/1 from an earlier run of this cell) pass through
# unchanged, so the cell can be re-run safely. The explicit cast guarantees an integer
# column instead of leaving the dtype to pandas' inference; it raises if any value is
# neither a known label nor already an integer code.
data['Sex'] = data['Sex'].map(lambda value: sex_codes.get(value, value)).astype('int64')

data.head()
Out[15]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize IsAlone
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN S 1 0.0
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 C 1 0.0
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0 1.0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 S 1 0.0
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN S 0 1.0
In [16]:
# Encode Sex in the test set as an integer feature: male -> 0, female -> 1.
sex_codes = {'male': 0, 'female': 1}
# Values that are not keys (such as 0/1 from an earlier run of this cell) pass through
# unchanged, so the cell can be re-run safely. The explicit cast guarantees an integer
# column instead of leaving the dtype to pandas' inference; it raises if any value is
# neither a known label nor already an integer code.
data_test['Sex'] = data_test['Sex'].map(lambda value: sex_codes.get(value, value)).astype('int64')

data_test.head()
Out[16]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 892 3 Kelly, Mr. James 0 3.0 0 0 330911 7.8292 NaN Q 0
1 893 3 Wilkes, Mrs. James (Ellen Needs) 1 4.0 1 0 363272 7.0000 NaN S 1
2 894 2 Myles, Mr. Thomas Francis 0 5.0 0 0 240276 9.6875 NaN Q 0
3 895 3 Wirz, Mr. Albert 0 2.0 0 0 315154 8.6625 NaN S 0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 2.0 1 1 3101298 12.2875 NaN S 2
In [17]:
# Embarked frequencies in the training set after the missing entries were filled with 'S'.
data['Embarked'].value_counts()
Out[17]:
S    646
C    168
Q     77
Name: Embarked, dtype: int64
In [18]:
# Encode Embarked as an integer feature: S -> 0, C -> 1, Q -> 2.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
# Values that are not keys (such as codes from an earlier run of this cell) pass through
# unchanged, so the cell can be re-run safely. The explicit cast guarantees an integer
# column instead of leaving the dtype to pandas' inference; it raises if any value is
# neither a known label nor already an integer code.
data['Embarked'] = data['Embarked'].map(lambda value: embarked_codes.get(value, value)).astype('int64')
In [19]:
# Show the training frame after encoding (the notebook renders only the first and last rows).
data
Out[19]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize IsAlone
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1 0.0
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 1 1 0.0
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0 1.0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 0 1 0.0
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN 0 0 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas 0 2.0 0 0 211536 13.0000 NaN 0 0 1.0
887 888 1 1 Graham, Miss. Margaret Edith 1 1.0 0 0 112053 30.0000 B42 0 0 1.0
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" 1 2.0 1 2 W./C. 6607 23.4500 NaN 0 3 0.0
889 890 1 1 Behr, Mr. Karl Howell 0 2.0 0 0 111369 30.0000 C148 1 0 1.0
890 891 0 3 Dooley, Mr. Patrick 0 3.0 0 0 370376 7.7500 NaN 2 0 1.0

891 rows × 14 columns

In [20]:
# Encode Embarked in the test set as an integer feature: S -> 0, C -> 1, Q -> 2.
embarked_codes = {'S': 0, 'C': 1, 'Q': 2}
# Values that are not keys (such as codes from an earlier run of this cell) pass through
# unchanged, so the cell can be re-run safely. The explicit cast guarantees an integer
# column instead of leaving the dtype to pandas' inference; it raises if any value is
# neither a known label nor already an integer code.
data_test['Embarked'] = data_test['Embarked'].map(lambda value: embarked_codes.get(value, value)).astype('int64')
In [21]:
# NOTE(review): `train` was last assigned before Sex and Embarked were encoded on `data`,
# so the frame shown here still holds the original string values.
train
Out[21]:
Survived Sex Age FamilySize Fare Embarked
0 0 male 2.0 1 7.2500 S
1 1 female 3.0 1 71.2833 C
2 1 female 2.0 0 7.9250 S
3 1 female 3.0 1 53.1000 S
4 0 male 3.0 0 8.0500 S
... ... ... ... ... ... ...
886 0 male 2.0 0 13.0000 S
887 1 female 1.0 0 30.0000 S
888 0 female 2.0 3 23.4500 S
889 1 male 2.0 0 30.0000 C
890 0 male 3.0 0 7.7500 Q

891 rows × 6 columns

In [22]:
# Re-select the features now that Sex and Embarked are encoded, so that train/test
# carry the numeric values; the training frame also keeps Survived.
feature_cols = ['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']
train = data[['Survived'] + feature_cols]
test = data_test[feature_cols]

train.head(5)
Out[22]:
Survived Sex Age FamilySize Fare Embarked
0 0 0 2.0 1 7.2500 0
1 1 1 3.0 1 71.2833 1
2 1 1 2.0 0 7.9250 0
3 1 1 3.0 1 53.1000 0
4 0 0 3.0 0 8.0500 0
In [23]:
# Show the re-selected training frame (label plus encoded features).
train
Out[23]:
Survived Sex Age FamilySize Fare Embarked
0 0 0 2.0 1 7.2500 0
1 1 1 3.0 1 71.2833 1
2 1 1 2.0 0 7.9250 0
3 1 1 3.0 1 53.1000 0
4 0 0 3.0 0 8.0500 0
... ... ... ... ... ... ...
886 0 0 2.0 0 13.0000 0
887 1 1 1.0 0 30.0000 0
888 0 1 2.0 3 23.4500 0
889 1 0 2.0 0 30.0000 1
890 0 0 3.0 0 7.7500 2

891 rows × 6 columns

In [24]:
# Show the re-selected test frame (encoded features only).
test
Out[24]:
Sex Age FamilySize Fare Embarked
0 0 3.0 0 7.8292 2
1 1 4.0 1 7.0000 0
2 0 5.0 0 9.6875 2
3 0 2.0 0 8.6625 0
4 1 2.0 2 12.2875 0
... ... ... ... ... ...
413 0 3.0 0 8.0500 0
414 1 3.0 0 108.9000 1
415 0 3.0 0 7.2500 0
416 0 3.0 0 8.0500 0
417 0 3.0 2 22.3583 1

418 rows × 5 columns

--------------

In [25]:
# Number of training passengers per Pclass value.
data['Pclass'].value_counts()
Out[25]:
3    491
1    216
2    184
Name: Pclass, dtype: int64
In [36]:
# Fill missing fares with the mean fare of the passenger's own Pclass group.
# NOTE(review): an earlier cell already fills every missing Fare with the overall mean,
# so when the notebook runs top to bottom there is nothing left for this cell to fill.
train_class_fare = data.groupby('Pclass')['Fare'].transform('mean')
test_class_fare = data_test.groupby('Pclass')['Fare'].transform('mean')
data['Fare'] = data['Fare'].fillna(train_class_fare)
data_test['Fare'] = data_test['Fare'].fillna(test_class_fare)

print(data_test.isnull().sum())
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
FamilySize       0
dtype: int64
In [37]:
# Open question: why is Pclass not included in the features?
# Alternative considered: x_data = data[['Sex', 'Age', 'Pclass', 'Fare', 'Embarked']]
# Split the training data into the label (Survived) and the model's input features.
y_train = data['Survived']
feature_cols = ['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']
x_train = data[feature_cols]

x_train
Out[37]:
Sex Age FamilySize Fare Embarked
0 0 2.0 1 7.2500 0
1 1 3.0 1 71.2833 1
2 1 2.0 0 7.9250 0
3 1 3.0 1 53.1000 0
4 0 3.0 0 8.0500 0
... ... ... ... ... ...
886 0 2.0 0 13.0000 0
887 1 1.0 0 30.0000 0
888 1 2.0 3 23.4500 0
889 0 2.0 0 30.0000 1
890 0 3.0 0 7.7500 2

891 rows × 5 columns

In [28]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed so the fitted tree, and therefore the predictions, are the same on every run;
# without it scikit-learn may break ties between equally good splits differently each time.
tree = DecisionTreeClassifier(random_state=0)
tree.fit(x_train, y_train)

# Accuracy measured on the same rows the tree was fitted on.
print('training set accuracy:', tree.score(x_train, y_train))
training set accuracy: 0.9450056116722784
In [38]:
# Select the test-set inputs: the same feature columns, in the same order, as x_train.
feature_cols = ['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']
x_test = data_test[feature_cols]

x_test
Out[38]:
Sex Age FamilySize Fare Embarked
0 0 3.0 0 7.8292 2
1 1 4.0 1 7.0000 0
2 0 5.0 0 9.6875 2
3 0 2.0 0 8.6625 0
4 1 2.0 2 12.2875 0
... ... ... ... ... ...
413 0 3.0 0 8.0500 0
414 1 3.0 0 108.9000 1
415 0 3.0 0 7.2500 0
416 0 3.0 0 8.0500 0
417 0 3.0 2 22.3583 1

418 rows × 5 columns

In [39]:
# NOTE: the result differs from the reference answer :(
# Predict a Survived value (0 or 1) for every row of x_test with the fitted tree.
prediction = tree.predict(x_test)
prediction
Out[39]:
array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0],
      dtype=int64)
In [31]:
# Build the submission table (one row per test passenger: PassengerId plus the predicted
# Survived value) and write it to submit.csv without the index column.
submit = data_test[['PassengerId']].assign(Survived=prediction)

submit.to_csv('submit.csv', index=False)
In [32]:
# Question: is there no step that compares against the actual survived/died outcomes?
# Read the submission file back to check what was written.
my_prediction = pd.read_csv('submit.csv')
my_prediction.head()
Out[32]:
PassengerId Survived
0 892 0
1 893 0
2 894 0
3 895 0
4 896 1
In [33]:
# Preview the first rows of the training frame (label plus encoded features).
train.head()
Out[33]:
Survived Sex Age FamilySize Fare Embarked
0 0 0 2.0 1 7.2500 0
1 1 1 3.0 1 71.2833 1
2 1 1 2.0 0 7.9250 0
3 1 1 3.0 1 53.1000 0
4 0 0 3.0 0 8.0500 0
In [34]:
# Print each training column's non-null count and dtype.
train.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 6 columns):
Survived      891 non-null int64
Sex           891 non-null int64
Age           891 non-null float64
FamilySize    891 non-null int64
Fare          891 non-null float64
Embarked      891 non-null int64
dtypes: float64(2), int64(4)
memory usage: 41.9 KB
In [35]:
# Summary statistics (count, mean, std, min, quartiles, max) for each training column.
train.describe()
Out[35]:
Survived Sex Age FamilySize Fare Embarked
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 0.383838 0.352413 2.382716 0.904602 32.204208 0.361392
std 0.486592 0.477990 1.258005 1.613459 49.693429 0.635673
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 0.000000 0.000000 2.000000 0.000000 7.910400 0.000000
50% 0.000000 0.000000 2.000000 0.000000 14.454200 0.000000
75% 1.000000 1.000000 3.000000 1.000000 31.000000 1.000000
max 1.000000 1.000000 5.000000 10.000000 512.329200 2.000000
In [ ]: