In [1]:
import pandas as pd
In [2]:
df = pd.read_csv('data/train.csv')
In [3]:
df.head(5)
Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [4]:
df[['Sex','Age','SibSp','Parch']].head()
Out[4]:
Sex Age SibSp Parch
0 male 22.0 1 0
1 female 38.0 1 0
2 female 26.0 0 0
3 female 35.0 1 0
4 male 35.0 0 0
In [5]:
#testset
df_test = pd.read_csv('data/test.csv')
df_test.head()
Out[5]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [6]:
df['Age']=df['Age'].fillna(df['Age'].mean())
In [7]:
df
Out[7]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.000000 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.000000 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.000000 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.000000 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.000000 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.000000 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.000000 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 29.699118 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.000000 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.000000 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

In [8]:
#df.loc[조건,열]= 넣고싶은 값

df.loc[df['Age']<10,'Age']=0
df.loc[(df['Age']>=10)&(df['Age']<20),'Age']=1
df.loc[(df['Age']>=20)&(df['Age']<30),'Age']=2
df.loc[(df['Age']>=30)&(df['Age']<40),'Age']=3
df.loc[(df['Age']>=40)&(df['Age']<50),'Age']=4
df.loc[df['Age']>=50,'Age']=5
In [9]:
df
Out[9]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 2.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 1.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 2.0 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 2.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 3.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

In [10]:
#df_test.loc[조건,열]= 넣고싶은 값

df_test.loc[df_test['Age']<10,'Age']=0
df_test.loc[(df_test['Age']>=10)&(df_test['Age']<20),'Age']=1
df_test.loc[(df_test['Age']>=20)&(df_test['Age']<30),'Age']=2
df_test.loc[(df_test['Age']>=30)&(df_test['Age']<40),'Age']=3
df_test.loc[(df_test['Age']>=40)&(df_test['Age']<50),'Age']=4
df_test.loc[df_test['Age']>=50,'Age']=5
In [11]:
df['FamilySize']=df['SibSp']+df['Parch']
df
Out[11]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S 0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 2.0 0 0 211536 13.0000 NaN S 0
887 888 1 1 Graham, Miss. Margaret Edith female 1.0 0 0 112053 30.0000 B42 S 0
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 2.0 1 2 W./C. 6607 23.4500 NaN S 3
889 890 1 1 Behr, Mr. Karl Howell male 2.0 0 0 111369 30.0000 C148 C 0
890 891 0 3 Dooley, Mr. Patrick male 3.0 0 0 370376 7.7500 NaN Q 0

891 rows × 13 columns

In [12]:
df_test['FamilySize']=df_test['SibSp']+df_test['Parch']
df_test
Out[12]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 892 3 Kelly, Mr. James male 3.0 0 0 330911 7.8292 NaN Q 0
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 4.0 1 0 363272 7.0000 NaN S 1
2 894 2 Myles, Mr. Thomas Francis male 5.0 0 0 240276 9.6875 NaN Q 0
3 895 3 Wirz, Mr. Albert male 2.0 0 0 315154 8.6625 NaN S 0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 2.0 1 1 3101298 12.2875 NaN S 2
... ... ... ... ... ... ... ... ... ... ... ... ...
413 1305 3 Spector, Mr. Woolf male NaN 0 0 A.5. 3236 8.0500 NaN S 0
414 1306 1 Oliva y Ocana, Dona. Fermina female 3.0 0 0 PC 17758 108.9000 C105 C 0
415 1307 3 Saether, Mr. Simon Sivertsen male 3.0 0 0 SOTON/O.Q. 3101262 7.2500 NaN S 0
416 1308 3 Ware, Mr. Frederick male NaN 0 0 359309 8.0500 NaN S 0
417 1309 3 Peter, Master. Michael J male NaN 1 1 2668 22.3583 NaN C 2

418 rows × 12 columns

In [13]:
train = df[['Survived','Sex','Age','FamilySize']]
test = df_test[['Sex','Age','FamilySize']]
                
train.head()
Out[13]:
Survived Sex Age FamilySize
0 0 male 2.0 1
1 1 female 3.0 1
2 1 female 2.0 0
3 1 female 3.0 1
4 0 male 3.0 0

미션1

In [14]:
df['Fare']=df['Fare'].fillna(df['Fare'].mean())
df_test['Fare']=df_test['Fare'].fillna(df_test['Fare'].mean())

함께실습7

In [15]:
df.isnull().sum()
Out[15]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
FamilySize       0
dtype: int64
In [16]:
df_test.isnull().sum()
Out[16]:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
FamilySize       0
dtype: int64
In [17]:
df['Embarked'].value_counts()
Out[17]:
S    644
C    168
Q     77
Name: Embarked, dtype: int64
In [18]:
df['Pclass'].value_counts()
Out[18]:
3    491
1    216
2    184
Name: Pclass, dtype: int64
In [19]:
df['Embarked']=df['Embarked'].fillna('S')
df_test['Embarked']=df_test['Embarked'].fillna('S')
In [20]:
df.isnull().sum()
Out[20]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
FamilySize       0
dtype: int64
In [21]:
train = df[['Survived','Sex','Age','FamilySize','Fare','Embarked']]
test = df_test[['Sex','Age','FamilySize','Fare','Embarked']]
In [22]:
df['Cabin']=df['Cabin'].str[0]
df.head()
Out[22]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C C 1
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C S 1
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S 0
In [23]:
df['Cabin'].value_counts()
Out[23]:
C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin, dtype: int64
In [24]:
df['Cabin']=df['Cabin'].fillna('C')

df_test['Cabin']=df_test['Cabin'].str[0]
df_test['Cabin']=df_test['Cabin'].fillna('C')

df_test.isnull().sum()
df['Cabin'].value_counts()
Out[24]:
C    746
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64
In [25]:
df['Cabin']=df['Cabin'].map({
    'C':0,
    'B':1,
    'D':2,
    'E':3,
    'A':4,
    'F':5,
    'G':6,
    'T':7
})
In [26]:
df_test['Cabin']=df_test['Cabin'].map({
    'C':0,
    'B':1,
    'D':2,
    'E':3,
    'A':4,
    'F':5,
    'G':6,
    'T':7
})
df['Cabin'].value_counts()
Out[26]:
0    746
1     47
2     33
3     32
4     15
5     13
6      4
7      1
Name: Cabin, dtype: int64
In [27]:
df['Embarked'].value_counts()
Out[27]:
S    646
C    168
Q     77
Name: Embarked, dtype: int64
In [28]:
df['Embarked']=df['Embarked'].map({
    'S':0,
    'C':1,
    'Q':2
})
df.head()

df_test['Embarked']=df_test['Embarked'].map({
    'S':0,
    'C':1,
    'Q':2
})
df.head()
Out[28]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 0 0 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 0 1 1
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 0 0 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 0 0 1
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 0 0 0
In [29]:
df['Sex']=df['Sex'].map({
    'male':0,
    'female':1
})

df_test['Sex']=df_test['Sex'].map({
    'male':0,
    'female':1
})
In [30]:
df_test
Out[30]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 892 3 Kelly, Mr. James 0 3.0 0 0 330911 7.8292 0 2 0
1 893 3 Wilkes, Mrs. James (Ellen Needs) 1 4.0 1 0 363272 7.0000 0 0 1
2 894 2 Myles, Mr. Thomas Francis 0 5.0 0 0 240276 9.6875 0 2 0
3 895 3 Wirz, Mr. Albert 0 2.0 0 0 315154 8.6625 0 0 0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 2.0 1 1 3101298 12.2875 0 0 2
... ... ... ... ... ... ... ... ... ... ... ... ...
413 1305 3 Spector, Mr. Woolf 0 NaN 0 0 A.5. 3236 8.0500 0 0 0
414 1306 1 Oliva y Ocana, Dona. Fermina 1 3.0 0 0 PC 17758 108.9000 0 1 0
415 1307 3 Saether, Mr. Simon Sivertsen 0 3.0 0 0 SOTON/O.Q. 3101262 7.2500 0 0 0
416 1308 3 Ware, Mr. Frederick 0 NaN 0 0 359309 8.0500 0 0 0
417 1309 3 Peter, Master. Michael J 0 NaN 1 1 2668 22.3583 0 1 2

418 rows × 12 columns

미션1 빈칸을 더 현명하게 처리하기

In [31]:
df[['Name','Age']].head(10)
Out[31]:
Name Age
0 Braund, Mr. Owen Harris 2.0
1 Cumings, Mrs. John Bradley (Florence Briggs Th... 3.0
2 Heikkinen, Miss. Laina 2.0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 3.0
4 Allen, Mr. William Henry 3.0
5 Moran, Mr. James 2.0
6 McCarthy, Mr. Timothy J 5.0
7 Palsson, Master. Gosta Leonard 0.0
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 2.0
9 Nasser, Mrs. Nicholas (Adele Achem) 1.0
In [32]:
df.loc[df['Name'].str.contains('Mr\.'),'Name']='Mr'
df.loc[df['Name'].str.contains('Mrs\.'),'Name']='Mrs'
df.loc[df['Name'].str.contains('Miss\.'),'Name']='Miss'
df.head()
Out[32]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Mr 0 2.0 1 0 A/5 21171 7.2500 0 0 1
1 2 1 1 Mrs 1 3.0 1 0 PC 17599 71.2833 0 1 1
2 3 1 3 Miss 1 2.0 0 0 STON/O2. 3101282 7.9250 0 0 0
3 4 1 1 Mrs 1 3.0 1 0 113803 53.1000 0 0 1
4 5 0 3 Mr 0 3.0 0 0 373450 8.0500 0 0 0
In [33]:
df['Name']=df['Name'].map({
    'Mr':0,
    'Mrs':1,
    'Miss':2
})

df.head(10)
Out[33]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 0.0 0 2.0 1 0 A/5 21171 7.2500 0 0 1
1 2 1 1 1.0 1 3.0 1 0 PC 17599 71.2833 0 1 1
2 3 1 3 2.0 1 2.0 0 0 STON/O2. 3101282 7.9250 0 0 0
3 4 1 1 1.0 1 3.0 1 0 113803 53.1000 0 0 1
4 5 0 3 0.0 0 3.0 0 0 373450 8.0500 0 0 0
5 6 0 3 0.0 0 2.0 0 0 330877 8.4583 0 2 0
6 7 0 1 0.0 0 5.0 0 0 17463 51.8625 3 0 0
7 8 0 3 NaN 0 0.0 3 1 349909 21.0750 0 0 4
8 9 1 3 1.0 1 2.0 0 2 347742 11.1333 0 0 2
9 10 1 2 1.0 1 1.0 1 0 237736 30.0708 0 1 1
In [34]:
df['Name']=df['Name'].fillna(3)
df.head(10)
Out[34]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 0.0 0 2.0 1 0 A/5 21171 7.2500 0 0 1
1 2 1 1 1.0 1 3.0 1 0 PC 17599 71.2833 0 1 1
2 3 1 3 2.0 1 2.0 0 0 STON/O2. 3101282 7.9250 0 0 0
3 4 1 1 1.0 1 3.0 1 0 113803 53.1000 0 0 1
4 5 0 3 0.0 0 3.0 0 0 373450 8.0500 0 0 0
5 6 0 3 0.0 0 2.0 0 0 330877 8.4583 0 2 0
6 7 0 1 0.0 0 5.0 0 0 17463 51.8625 3 0 0
7 8 0 3 3.0 0 0.0 3 1 349909 21.0750 0 0 4
8 9 1 3 1.0 1 2.0 0 2 347742 11.1333 0 0 2
9 10 1 2 1.0 1 1.0 1 0 237736 30.0708 0 1 1
In [36]:
df['Name'].value_counts()
Out[36]:
0.0    517
2.0    182
1.0    125
3.0     67
Name: Name, dtype: int64
In [37]:
df['Age']=df['Age'].fillna(df.groupby('Name')['Age'].transform('mean'))
df.head(10)
Out[37]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 0.0 0 2.0 1 0 A/5 21171 7.2500 0 0 1
1 2 1 1 1.0 1 3.0 1 0 PC 17599 71.2833 0 1 1
2 3 1 3 2.0 1 2.0 0 0 STON/O2. 3101282 7.9250 0 0 0
3 4 1 1 1.0 1 3.0 1 0 113803 53.1000 0 0 1
4 5 0 3 0.0 0 3.0 0 0 373450 8.0500 0 0 0
5 6 0 3 0.0 0 2.0 0 0 330877 8.4583 0 2 0
6 7 0 1 0.0 0 5.0 0 0 17463 51.8625 3 0 0
7 8 0 3 3.0 0 0.0 3 1 349909 21.0750 0 0 4
8 9 1 3 1.0 1 2.0 0 2 347742 11.1333 0 0 2
9 10 1 2 1.0 1 1.0 1 0 237736 30.0708 0 1 1

미션2

In [38]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
In [39]:
facet = sns.FacetGrid(df, hue='Survived', aspect=4)
facet.map(sns.kdeplot, 'Age')
facet.add_legend()

plt.show()
In [ ]: