In [1]:
import pandas as pd
In [2]:
# Load the training data from a path relative to the notebook's working directory.
df = pd.read_csv('data/train.csv')
In [3]:
# Preview the first five rows of the training frame.
df.head(5)
Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [4]:
# Look at the demographic / family columns only.
df[['Sex','Age','SibSp','Parch']]
Out[4]:
Sex Age SibSp Parch
0 male 22.0 1 0
1 female 38.0 1 0
2 female 26.0 0 0
3 female 35.0 1 0
4 male 35.0 0 0
... ... ... ... ...
886 male 27.0 0 0
887 female 19.0 0 0
888 female NaN 1 2
889 male 26.0 0 0
890 male 32.0 0 0

891 rows × 4 columns

In [5]:
# Load the test set (it has no 'Survived' column) and preview its first rows.
df_test = pd.read_csv('data/test.csv')
df_test.head()
Out[5]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [6]:
# Preview only: show 'Age' with missing entries replaced by the column mean.
# Nothing is assigned here, so df itself is left unchanged.
mean_age = df['Age'].mean()
df['Age'].fillna(mean_age)
Out[6]:
0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64
In [7]:
# Persist the imputation: missing ages become the mean of the known ages.
age_col = df['Age']
df['Age'] = age_col.fillna(age_col.mean())
In [8]:
# Display the 'Age' column after the mean fill.
df['Age']
Out[8]:
0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64
In [9]:
# fillna returns a new Series, so the result has to be assigned back.
# Test-set gaps are filled with the mean age of the training frame.
train_age_mean = df['Age'].mean()
df_test['Age'] = df_test['Age'].fillna(train_age_mean)
In [10]:
# Pattern used here: df.loc[condition, column] = value to assign
#
# Replace each age with a decade-band code written straight back into 'Age':
#   0: under 10, 1: 10-19, 2: 20-29, 3: 30-39, 4: 40-49, 5: 50 and over.
# NOTE: the codes overwrite the ages and are all below 10, so running this cell
# a second time would move every row into band 0.
df.loc[df['Age'] < 10, 'Age'] = 0
for band, lower in enumerate(range(10, 50, 10), start=1):
    in_band = (df['Age'] >= lower) & (df['Age'] < lower + 10)
    df.loc[in_band, 'Age'] = band
df.loc[df['Age'] >= 50, 'Age'] = 5
In [11]:
# Pattern used here: df_test.loc[condition, column] = value to assign
#
# Same decade-band coding as the training frame, written back into 'Age':
#   0: under 10, 1: 10-19, 2: 20-29, 3: 30-39, 4: 40-49, 5: 50 and over.
# NOTE: the codes overwrite the ages and are all below 10, so running this cell
# a second time would move every row into band 0.
df_test.loc[df_test['Age'] < 10, 'Age'] = 0
for band, lower in enumerate(range(10, 50, 10), start=1):
    in_band = (df_test['Age'] >= lower) & (df_test['Age'] < lower + 10)
    df_test.loc[in_band, 'Age'] = band
df_test.loc[df_test['Age'] >= 50, 'Age'] = 5
In [12]:
# Inspect the test frame after the age banding.
df_test
Out[12]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 3.0 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 4.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 5.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 2.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 2.0 1 1 3101298 12.2875 NaN S
... ... ... ... ... ... ... ... ... ... ... ...
413 1305 3 Spector, Mr. Woolf male 2.0 0 0 A.5. 3236 8.0500 NaN S
414 1306 1 Oliva y Ocana, Dona. Fermina female 3.0 0 0 PC 17758 108.9000 C105 C
415 1307 3 Saether, Mr. Simon Sivertsen male 3.0 0 0 SOTON/O.Q. 3101262 7.2500 NaN S
416 1308 3 Ware, Mr. Frederick male 2.0 0 0 359309 8.0500 NaN S
417 1309 3 Peter, Master. Michael J male 2.0 1 1 2668 22.3583 NaN C

418 rows × 11 columns

In [13]:
# Family size = siblings/spouses aboard + parents/children aboard.
siblings_spouses = df['SibSp']
parents_children = df['Parch']
df['Familysize'] = siblings_spouses + parents_children
df.head()
Out[13]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Familysize
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S 0
In [14]:
# Same derived feature for the test frame: SibSp + Parch.
siblings_spouses = df_test['SibSp']
parents_children = df_test['Parch']
df_test['Familysize'] = siblings_spouses + parents_children
In [15]:
# Feature subsets for modelling. The training subset keeps the 'Survived' label;
# the test subset has no label column.
feature_cols = ['Sex', 'Age', 'Familysize']
train = df[['Survived'] + feature_cols]
# Fix: the test features must come from df_test (test.csv), not the training frame.
test = df_test[feature_cols]

train.head()
Out[15]:
Survived Sex Age Familysize
0 0 male 2.0 1
1 1 female 3.0 1
2 1 female 2.0 0
3 1 female 3.0 1
4 0 male 3.0 0

미션1

In [16]:
# Impute missing fares: the training frame with its own mean fare, then the
# test frame with the (filled) training mean.
train_fare = df['Fare']
df['Fare'] = train_fare.fillna(train_fare.mean())
train_fare_mean = df['Fare'].mean()
df_test['Fare'] = df_test['Fare'].fillna(train_fare_mean)
# Jupyter tip: with a cell selected, press b to add a cell and d to delete one.

함께실습 7

In [17]:
# Find missing values
In [18]:
# Count missing values per column in the training frame.
df.isnull().sum()
Out[18]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
Familysize       0
dtype: int64
In [19]:
# Count missing values per column in the test frame.
df_test.isnull().sum()
Out[19]:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
Familysize       0
dtype: int64
In [20]:
# Frequency of each embarkation port; used to pick a fill value for the missing entries.
df['Embarked'].value_counts()
Out[20]:
S    644
C    168
Q     77
Name: Embarked, dtype: int64
In [21]:
# Frequency of each passenger class.
df['Pclass'].value_counts()
Out[21]:
3    491
1    216
2    184
Name: Pclass, dtype: int64
In [22]:
# Fill missing embarkation ports with 'S', the most frequent port in the training data.
most_common_port = 'S'
df['Embarked'] = df['Embarked'].fillna(most_common_port)
df_test['Embarked'] = df_test['Embarked'].fillna(most_common_port)

# Kept as the last expression so the remaining-null check is actually displayed;
# in the middle of the cell its result was silently discarded.
df.isnull().sum()
In [23]:
# Feature subsets including fare and embarkation port. The training subset keeps
# the 'Survived' label; the test subset has no label column.
feature_cols = ['Sex', 'Age', 'Familysize', 'Fare', 'Embarked']
train = df[['Survived'] + feature_cols]
# Fix: the test features must come from df_test (test.csv), not the training frame.
test = df_test[feature_cols]

train.head()
Out[23]:
Survived Sex Age Familysize Fare Embarked
0 0 male 2.0 1 7.2500 S
1 1 female 3.0 1 71.2833 C
2 1 female 2.0 0 7.9250 S
3 1 female 3.0 1 53.1000 S
4 0 male 3.0 0 8.0500 S
In [24]:
# Encode sex in place in the training frame: male -> 0, female -> 1.
for label, code in (('male', 0), ('female', 1)):
    df.loc[df['Sex'] == label, 'Sex'] = code
In [25]:
# Encode sex in place in the test frame: male -> 0, female -> 1.
for label, code in (('male', 0), ('female', 1)):
    df_test.loc[df_test['Sex'] == label, 'Sex'] = code
In [26]:
# Inspect the training frame after encoding 'Sex'.
df
Out[26]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Familysize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN S 0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas 0 2.0 0 0 211536 13.0000 NaN S 0
887 888 1 1 Graham, Miss. Margaret Edith 1 1.0 0 0 112053 30.0000 B42 S 0
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" 1 2.0 1 2 W./C. 6607 23.4500 NaN S 3
889 890 1 1 Behr, Mr. Karl Howell 0 2.0 0 0 111369 30.0000 C148 C 0
890 891 0 3 Dooley, Mr. Patrick 0 3.0 0 0 370376 7.7500 NaN Q 0

891 rows × 13 columns

In [27]:
# Encode the embarkation port in place in the training frame: S -> 0, C -> 1, Q -> 2.
for code, port in enumerate(('S', 'C', 'Q')):
    df.loc[df['Embarked'] == port, 'Embarked'] = code
In [28]:
# Encode the embarkation port in place in the test frame: S -> 0, C -> 1, Q -> 2.
for code, port in enumerate(('S', 'C', 'Q')):
    df_test.loc[df_test['Embarked'] == port, 'Embarked'] = code
In [29]:
# Rebuild the feature subsets now that 'Sex' and 'Embarked' are numerically encoded.
# The training subset keeps the 'Survived' label; the test subset has no label column.
feature_cols = ['Sex', 'Age', 'Familysize', 'Fare', 'Embarked']
train = df[['Survived'] + feature_cols]
# Fix: the test features must come from df_test (test.csv), not the training frame.
test = df_test[feature_cols]
In [30]:
# Inspect the encoded training subset.
train
Out[30]:
Survived Sex Age Familysize Fare Embarked
0 0 0 2.0 1 7.2500 0
1 1 1 3.0 1 71.2833 1
2 1 1 2.0 0 7.9250 0
3 1 1 3.0 1 53.1000 0
4 0 0 3.0 0 8.0500 0
... ... ... ... ... ... ...
886 0 0 2.0 0 13.0000 0
887 1 1 1.0 0 30.0000 0
888 0 1 2.0 3 23.4500 0
889 1 0 2.0 0 30.0000 1
890 0 0 3.0 0 7.7500 2

891 rows × 6 columns

미션 1

In [31]:
# Look at names next to the banded age codes for the first ten passengers.
df[['Name','Age']].head(10)
Out[31]:
Name Age
0 Braund, Mr. Owen Harris 2.0
1 Cumings, Mrs. John Bradley (Florence Briggs Th... 3.0
2 Heikkinen, Miss. Laina 2.0
3 Futrelle, Mrs. Jacques Heath (Lily May Peel) 3.0
4 Allen, Mr. William Henry 3.0
5 Moran, Mr. James 2.0
6 McCarthy, Mr. Timothy J 5.0
7 Palsson, Master. Gosta Leonard 0.0
8 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) 2.0
9 Nasser, Mrs. Nicholas (Adele Achem) 1.0
In [32]:
# Collapse each full name to its honorific when it carries one of the three
# common titles. Names with any other title (Master., Dr., ...) are left as is.
# The pattern is built from a raw string so that '\.' (a literal dot in the regex)
# is not parsed as an invalid Python string escape, which raises a warning on
# recent Python versions.
for title in ('Mr', 'Mrs', 'Miss'):
    df.loc[df['Name'].str.contains(title + r'\.'), 'Name'] = title
In [33]:
# Check how many names were collapsed to each title and which full names remain.
df['Name'].value_counts()
Out[33]:
Mr                                                   517
Miss                                                 182
Mrs                                                  125
Mallet, Master. Andre                                  1
Skoog, Master. Harald                                  1
                                                    ... 
Uruchurtu, Don. Manuel E                               1
Minahan, Dr. William Edward                            1
Moubarek, Master. Halim Gonios ("William George")      1
Leader, Dr. Alice (Farnham)                            1
Weir, Col. John                                        1
Name: Name, Length: 70, dtype: int64
In [34]:
# Encode the collapsed titles as integers. Names that were not collapsed to
# 'Mr', 'Mrs' or 'Miss' have no entry in the mapping and become NaN.
title_codes = {'Mr': 0, 'Mrs': 1, 'Miss': 2}
df['Name'] = df['Name'].map(title_codes)

df.head(10)
Out[34]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Familysize
0 1 0 3 0.0 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 1.0 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 2.0 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 1.0 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 0.0 0 3.0 0 0 373450 8.0500 NaN 0 0
5 6 0 3 0.0 0 2.0 0 0 330877 8.4583 NaN 2 0
6 7 0 1 0.0 0 5.0 0 0 17463 51.8625 E46 0 0
7 8 0 3 NaN 0 0.0 3 1 349909 21.0750 NaN 0 4
8 9 1 3 1.0 1 2.0 0 2 347742 11.1333 NaN 0 2
9 10 1 2 1.0 1 1.0 1 0 237736 30.0708 NaN 1 1
In [35]:
# Give every remaining (unmapped) title the catch-all code 3.
other_title_code = 3
df['Name'] = df['Name'].fillna(other_title_code)
df
Out[35]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Familysize
0 1 0 3 0.0 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 1.0 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 2.0 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 1.0 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 0.0 0 3.0 0 0 373450 8.0500 NaN 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 3.0 0 2.0 0 0 211536 13.0000 NaN 0 0
887 888 1 1 2.0 1 1.0 0 0 112053 30.0000 B42 0 0
888 889 0 3 2.0 1 2.0 1 2 W./C. 6607 23.4500 NaN 0 3
889 890 1 1 0.0 0 2.0 0 0 111369 30.0000 C148 1 0
890 891 0 3 0.0 0 3.0 0 0 370376 7.7500 NaN 2 0

891 rows × 13 columns

In [37]:
# Distribution of the encoded titles (0 = Mr, 1 = Mrs, 2 = Miss, 3 = other).
df['Name'].value_counts()
Out[37]:
0.0    517
2.0    182
1.0    125
3.0     67
Name: Name, dtype: int64
In [38]:
# Fill missing ages with the mean 'Age' of the rows sharing the same title code.
# NOTE(review): 'Age' was already mean-filled and replaced by band codes earlier
# in this notebook, and the earlier isnull() summary shows 0 missing ages, so this
# fill appears to change nothing. Confirm whether it was meant to run on raw ages.
title_mean_age = df.groupby('Name')['Age'].transform('mean')
df['Age'] = df['Age'].fillna(title_mean_age)
df.head(10)
Out[38]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Familysize
0 1 0 3 0.0 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 1.0 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 2.0 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 1.0 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 0.0 0 3.0 0 0 373450 8.0500 NaN 0 0
5 6 0 3 0.0 0 2.0 0 0 330877 8.4583 NaN 2 0
6 7 0 1 0.0 0 5.0 0 0 17463 51.8625 E46 0 0
7 8 0 3 3.0 0 0.0 3 1 349909 21.0750 NaN 0 4
8 9 1 3 1.0 1 2.0 0 2 347742 11.1333 NaN 0 2
9 10 1 2 1.0 1 1.0 1 0 237736 30.0708 NaN 1 1

미션2

In [39]:
# Plotting libraries for the visualisation cell that follows.
import matplotlib.pyplot as plt
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import seaborn as sns
# Apply seaborn's default plot styling.
sns.set()
In [41]:
# Kernel-density estimate of the 'Age' column, one curve per 'Survived' value.
facet = sns.FacetGrid(df, hue="Survived", aspect=4)
facet.map(sns.kdeplot, 'Age')
facet.add_legend()

# Render the figure explicitly; a bare `plt` expression only echoes the module's repr.
plt.show()
Out[41]:
<module 'matplotlib.pyplot' from 'C:\\ProgramData\\Anaconda3\\lib\\site-packages\\matplotlib\\pyplot.py'>
In [ ]: