In [1]:
import pandas as pd
In [2]:
# Read data/train.csv into the `data` frame that every later cell mutates.
data = pd.read_csv('data/train.csv')
In [3]:
# Preview the first five rows of the training frame.
data.head(5)
Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [4]:
# Peek at the sex, age and family-count columns only.
data.loc[:, ['Sex', 'Age', 'SibSp', 'Parch']].head()
Out[4]:
Sex Age SibSp Parch
0 male 22.0 1 0
1 female 38.0 1 0
2 female 26.0 0 0
3 female 35.0 1 0
4 male 35.0 0 0
In [5]:
# Read data/test.csv into `data_test` and preview its first rows.
data_test = pd.read_csv('data/test.csv')
data_test.head()
Out[5]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [6]:
# Fill missing training ages with the mean of the ages that are present.
train_age_mean = data['Age'].mean()
data['Age'] = data['Age'].fillna(train_age_mean)
In [7]:
# Show the frame after the Age fill; a frame this long is rendered truncated
# (first and last rows only).
data
Out[7]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.000000 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.000000 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.000000 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.000000 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.000000 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.000000 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.000000 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 29.699118 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.000000 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.000000 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

In [8]:
# Fill missing test ages with the mean of the test set's own known ages.
test_age_mean = data_test['Age'].mean()
data_test['Age'] = data_test['Age'].fillna(test_age_mean)
In [9]:
# Replace Age with a decade code: 0 for <10, 1 for [10, 20), 2 for [20, 30),
# 3 for [30, 40), 4 for [40, 50) and 5 for >=50.
# right=False makes every bin closed on the left and open on the right;
# labels=False returns the bin position instead of an interval label.
age_edges = [float('-inf'), 10, 20, 30, 40, 50, float('inf')]
data['Age'] = pd.cut(data['Age'], bins=age_edges, right=False, labels=False).astype(float)
In [10]:
# Apply the same decade coding to the test ages: 0 for <10, 1 for [10, 20),
# ... 4 for [40, 50) and 5 for >=50 (left-closed, right-open bins).
age_edges = [float('-inf'), 10, 20, 30, 40, 50, float('inf')]
data_test['Age'] = pd.cut(data_test['Age'], bins=age_edges, right=False, labels=False).astype(float)
In [11]:
# FamilySize = siblings/spouses + parents/children travelling with the passenger
# (the passenger is not counted).
data['FamilySize'] = data['SibSp'].add(data['Parch'])
data.head()
Out[11]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S 0
In [12]:
# Same FamilySize feature for the test set.
data_test['FamilySize'] = data_test['SibSp'].add(data_test['Parch'])

# First feature selection: the test frame has no Survived column, so the
# label is only prepended for the training frame.
feature_cols = ['Sex', 'Age', 'FamilySize']
train = data[['Survived'] + feature_cols]
test = data_test[feature_cols]

train.head()
Out[12]:
Survived Sex Age FamilySize
0 0 male 2.0 1
1 1 female 3.0 1
2 1 female 2.0 0
3 1 female 3.0 1
4 0 male 3.0 0
In [13]:
# Fill missing fares in each frame with that frame's own mean fare.
for frame in (data, data_test):
    frame['Fare'] = frame['Fare'].fillna(frame['Fare'].mean())
In [14]:
# Missing-value count per training column.
data.isna().sum()
Out[14]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
FamilySize       0
dtype: int64
In [15]:
# Missing-value count per test column.
data_test.isna().sum()
Out[15]:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
FamilySize       0
dtype: int64
In [16]:
# Frequency of each embarkation value; used to pick a fill value for the
# missing entries (value_counts leaves NaN out by default).
data['Embarked'].value_counts()
Out[16]:
S    644
C    168
Q     77
Name: Embarked, dtype: int64
In [17]:
# 'S' is the most frequent embarkation value in the training counts, so use
# it for any missing entry in both frames.
for frame in (data, data_test):
    frame['Embarked'] = frame['Embarked'].fillna('S')
In [18]:
# Re-check the test frame's missing-value counts after the fills.
data_test.isna().sum()
Out[18]:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
FamilySize       0
dtype: int64
In [19]:
# Rebuild the modelling frames now that Fare and Embarked are filled.
feature_cols = ['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']
train = data[['Survived'] + feature_cols]
test = data_test[feature_cols]

train.head()
Out[19]:
Survived Sex Age FamilySize Fare Embarked
0 0 male 2.0 1 7.2500 S
1 1 female 3.0 1 71.2833 C
2 1 female 2.0 0 7.9250 S
3 1 female 3.0 1 53.1000 S
4 0 male 3.0 0 8.0500 S

In [20]:
# Encode Sex in place in both frames: male -> 0, female -> 1.
sex_codes = (('male', 0), ('female', 1))
for frame in (data, data_test):
    for label, code in sex_codes:
        frame.loc[frame['Sex'] == label, 'Sex'] = code
data.head()
Out[20]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN S 0
In [21]:
# Encode Embarked in place in both frames: S -> 0, C -> 1, Q -> 2.
port_codes = (('S', 0), ('C', 1), ('Q', 2))
for frame in (data, data_test):
    for port, code in port_codes:
        frame.loc[frame['Embarked'] == port, 'Embarked'] = code
data.head()
Out[21]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN 0 0
In [22]:
# Exploratory detour (the author labelled it wasted effort): Cabin value frequencies.
data['Cabin'].value_counts()
Out[22]:
C23 C25 C27    4
B96 B98        4
G6             4
F33            3
F2             3
              ..
C111           1
A16            1
T              1
B42            1
C95            1
Name: Cabin, Length: 147, dtype: int64
In [23]:
# Exploratory detour (the author labelled it wasted effort): give every
# missing Cabin the placeholder 'B96' in both frames. Cabin is not among the
# columns selected for train/test afterwards.
for frame in (data, data_test):
    frame['Cabin'] = frame['Cabin'].fillna('B96')
data.head()
Out[23]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 B96 0 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 B96 0 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 B96 0 0
In [24]:
# Re-select the modelling frames so they pick up the numeric Sex and
# Embarked codes (the earlier selections were copies taken before encoding).
feature_cols = ['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']
train = data[['Survived'] + feature_cols]
test = data_test[feature_cols]

train.head()
Out[24]:
Survived Sex Age FamilySize Fare Embarked
0 0 0 2.0 1 7.2500 0
1 1 1 3.0 1 71.2833 1
2 1 1 2.0 0 7.9250 0
3 1 1 3.0 1 53.1000 0
4 0 0 3.0 0 8.0500 0
In [25]:
# Look at names next to the age codes before extracting titles.
data.loc[:, ['Name', 'Age']].head(10)
Out[25]:
Name Age
0 Braund, Mr. Owen Harris 2.0
1 Cumings, Mrs. John Bradley (Florence Briggs Th... 3.0
2 Heikkinen, Miss. Laina 2.0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 3.0
4 Allen, Mr. William Henry 3.0
5 Moran, Mr. James 2.0
6 McCarthy, Mr. Timothy J 5.0
7 Palsson, Master. Gosta Leonard 0.0
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 2.0
9 Nasser, Mrs. Nicholas (Adele Achem) 1.0
In [26]:
# Collapse each training name to the honorific it contains ('Mr', 'Mrs' or
# 'Miss'); names with none of these are left unchanged.
# The pattern is built from a raw string so the regex escape `\.` (a literal
# dot) is not read as an invalid Python string escape, which newer Python
# versions warn about.
# Titles are handled in this order, and the mask is recomputed each time, so
# a name already rewritten to 'Mr' cannot match a later pattern.
for title in ('Mr', 'Mrs', 'Miss'):
    data.loc[data['Name'].str.contains(title + r'\.'), 'Name'] = title
data.head()
Out[26]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Mr 0 2.0 1 0 A/5 21171 7.2500 B96 0 1
1 2 1 1 Mrs 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 Miss 1 2.0 0 0 STON/O2. 3101282 7.9250 B96 0 0
3 4 1 1 Mrs 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 Mr 0 3.0 0 0 373450 8.0500 B96 0 0
In [27]:
# Integer code for each honorific kept by the title-extraction cell.
title_codes = {'Mr': 0, 'Mrs': 1, 'Miss': 2}

# Series.map looks every value up in the dict. Values that are not keys
# (names that matched none of the three titles) come back as NaN, which is
# why Name becomes a float column with missing entries; those are filled in
# a later cell.
data['Name'] = data['Name'].map(title_codes)

# The test frame's Name column still holds full names at this point, so
# reduce it to titles the same way as the training frame (same order, same
# patterns) and encode it with the same codes. Without this the train and
# test Name features would not be comparable.
for title in title_codes:
    data_test.loc[data_test['Name'].str.contains(title + r'\.', na=False), 'Name'] = title
data_test['Name'] = data_test['Name'].map(title_codes)

data.head(10)
Out[27]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 0.0 0 2.0 1 0 A/5 21171 7.2500 B96 0 1
1 2 1 1 1.0 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 2.0 1 2.0 0 0 STON/O2. 3101282 7.9250 B96 0 0
3 4 1 1 1.0 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 0.0 0 3.0 0 0 373450 8.0500 B96 0 0
5 6 0 3 0.0 0 2.0 0 0 330877 8.4583 B96 2 0
6 7 0 1 0.0 0 5.0 0 0 17463 51.8625 E46 0 0
7 8 0 3 NaN 0 0.0 3 1 349909 21.0750 B96 0 4
8 9 1 3 1.0 1 2.0 0 2 347742 11.1333 B96 0 2
9 10 1 2 1.0 1 1.0 1 0 237736 30.0708 B96 1 1
In [28]:
# Count how many names mapped to no title code (shown as missing Name values).
data.isna().sum()
Out[28]:
PassengerId     0
Survived        0
Pclass          0
Name           67
Sex             0
Age             0
SibSp           0
Parch           0
Ticket          0
Fare            0
Cabin           0
Embarked        0
FamilySize      0
dtype: int64
In [29]:
# Any Name still missing after the title encoding gets the catch-all code 3.
for frame in (data, data_test):
    frame['Name'] = frame['Name'].fillna(3)

data.head(10)
Out[29]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 0.0 0 2.0 1 0 A/5 21171 7.2500 B96 0 1
1 2 1 1 1.0 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 2.0 1 2.0 0 0 STON/O2. 3101282 7.9250 B96 0 0
3 4 1 1 1.0 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 0.0 0 3.0 0 0 373450 8.0500 B96 0 0
5 6 0 3 0.0 0 2.0 0 0 330877 8.4583 B96 2 0
6 7 0 1 0.0 0 5.0 0 0 17463 51.8625 E46 0 0
7 8 0 3 3.0 0 0.0 3 1 349909 21.0750 B96 0 4
8 9 1 3 1.0 1 2.0 0 2 347742 11.1333 B96 0 2
9 10 1 2 1.0 1 1.0 1 0 237736 30.0708 B96 1 1
In [30]:
# Distribution of the title codes, including the fill value 3.
data['Name'].value_counts()
Out[30]:
0.0    517
2.0    182
1.0    125
3.0     67
Name: Name, dtype: int64
In [32]:
# For every row, the mean Age of the rows sharing its Name (title code);
# transform('mean') returns a Series aligned to data's index.
group_mean_age = data.groupby('Name')['Age'].transform('mean')

# fillna only touches rows whose Age is missing. Age was already filled with
# the overall mean right after loading, and the latest null count shows 0
# for Age, so nothing is replaced here -- that is why head() looks unchanged.
data['Age'] = data['Age'].fillna(group_mean_age)
data.head(10)
Out[32]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 0.0 0 2.0 1 0 A/5 21171 7.2500 B96 0 1
1 2 1 1 1.0 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 2.0 1 2.0 0 0 STON/O2. 3101282 7.9250 B96 0 0
3 4 1 1 1.0 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 0.0 0 3.0 0 0 373450 8.0500 B96 0 0
5 6 0 3 0.0 0 2.0 0 0 330877 8.4583 B96 2 0
6 7 0 1 0.0 0 5.0 0 0 17463 51.8625 E46 0 0
7 8 0 3 3.0 0 0.0 3 1 349909 21.0750 B96 0 4
8 9 1 3 1.0 1 2.0 0 2 347742 11.1333 B96 0 2
9 10 1 2 1.0 1 1.0 1 0 237736 30.0708 B96 1 1
In [33]:
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the matplotlib figures drawn afterwards.
sns.set()
In [35]:
# One KDE curve of the Age codes per Survived value, drawn on a wide facet
# (aspect=4) with a legend identifying the hue levels.
facet = (
    sns.FacetGrid(data, hue='Survived', aspect=4)
    .map(sns.kdeplot, 'Age')
    .add_legend()
)

plt.show()
In [ ]: