In [1]:
import pandas as pd
In [2]:
df = pd.read_csv('data/train.csv')
df.head()
Out[2]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [3]:
# 대괄호 두번해줘야함
df[['Sex','Age','SibSp','Parch']]
Out[3]:
Sex Age SibSp Parch
0 male 22.0 1 0
1 female 38.0 1 0
2 female 26.0 0 0
3 female 35.0 1 0
4 male 35.0 0 0
... ... ... ... ...
886 male 27.0 0 0
887 female 19.0 0 0
888 female NaN 1 2
889 male 26.0 0 0
890 male 32.0 0 0

891 rows × 4 columns

In [4]:
df_test = pd.read_csv('data/test.csv')
df_test.head()
Out[4]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [5]:
df['Age'].fillna(df['Age'].mean())
Out[5]:
0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64
In [6]:
df['Age'] = df['Age'].fillna(df['Age'].mean())
In [7]:
df['Age']
Out[7]:
0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64
In [8]:
df_test['Age'] = df_test['Age'].fillna(df_test['Age'].mean())
In [9]:
    #df.loc[조건, 열] = 넣고싶은 값
In [10]:
df.loc[df["Age"] < 10,'Age'] = 0
df.loc[(df["Age"] >=10) & (df['Age']<=20),'Age'] = 1
df.loc[(df["Age"] >=20) & (df['Age']<=30),'Age'] = 2
df.loc[(df["Age"] >=30) & (df['Age']<=40),'Age'] = 3
df.loc[(df["Age"] >=40) & (df['Age']<=50),'Age'] = 4
df.loc[(df["Age"] >=50) ,'Age'] = 5
In [11]:
df_test.loc[df_test["Age"] < 10,'Age'] = 0
df_test.loc[(df_test["Age"] >=10) & (df_test['Age']<=20),'Age'] = 1
df_test.loc[(df_test["Age"] >=20) & (df_test['Age']<=30),'Age'] = 2
df_test.loc[(df_test["Age"] >=30) & (df_test['Age']<=40),'Age'] = 3
df_test.loc[(df_test["Age"] >=40) & (df_test['Age']<=50),'Age'] = 4
df_test.loc[(df_test["Age"] >=50) ,'Age'] = 5
In [12]:
df_test
Out[12]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 3.0 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 4.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 5.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 2.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 2.0 1 1 3101298 12.2875 NaN S
... ... ... ... ... ... ... ... ... ... ... ...
413 1305 3 Spector, Mr. Woolf male 3.0 0 0 A.5. 3236 8.0500 NaN S
414 1306 1 Oliva y Ocana, Dona. Fermina female 3.0 0 0 PC 17758 108.9000 C105 C
415 1307 3 Saether, Mr. Simon Sivertsen male 3.0 0 0 SOTON/O.Q. 3101262 7.2500 NaN S
416 1308 3 Ware, Mr. Frederick male 3.0 0 0 359309 8.0500 NaN S
417 1309 3 Peter, Master. Michael J male 3.0 1 1 2668 22.3583 NaN C

418 rows × 11 columns

In [13]:
#열만들기
df['FamilySize'] = df['Parch'] + df['SibSp']
df.head()
Out[13]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S 0
In [14]:
df_test['FamilySize'] = df_test['Parch'] + df_test['SibSp']
df.head()
Out[14]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S 0
In [15]:
train = df[['Survived','Sex','Age','FamilySize']]
test = df[['Sex','Age','FamilySize']]

train.head()
Out[15]:
Survived Sex Age FamilySize
0 0 male 2.0 1
1 1 female 3.0 1
2 1 female 2.0 0
3 1 female 3.0 1
4 0 male 3.0 0
In [16]:
df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean())
In [17]:
#빈칸 보는법
df.isnull().sum()
Out[17]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
FamilySize       0
dtype: int64
In [18]:
#값 보는것
df['Embarked'].value_counts()
Out[18]:
S    644
C    168
Q     77
Name: Embarked, dtype: int64
In [19]:
df['Embarked']= df['Embarked'].fillna('S')
In [20]:
df.isnull().sum()
Out[20]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
FamilySize       0
dtype: int64
In [21]:
train = df[['Survived','Sex','Age','FamilySize','Fare','Embarked']]
test = df_test[['Sex','Age','FamilySize','Fare','Embarked']]

train.head()
Out[21]:
Survived Sex Age FamilySize Fare Embarked
0 0 male 2.0 1 7.2500 S
1 1 female 3.0 1 71.2833 C
2 1 female 2.0 0 7.9250 S
3 1 female 3.0 1 53.1000 S
4 0 male 3.0 0 8.0500 S
In [22]:
#문자를 숫자로 바꿀때 

df.loc[df['Sex']=='male','Sex'] = 0
df.loc[df['Sex']=='female','Sex'] = 1
df.head()
Out[22]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN S 0
In [23]:
#문자를 숫자로 바꿀때 

df_test.loc[df_test['Sex']=='male','Sex'] = 0
df_test.loc[df_test['Sex']=='female','Sex'] = 1
df_test.head()
Out[23]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 892 3 Kelly, Mr. James 0 3.0 0 0 330911 7.8292 NaN Q 0
1 893 3 Wilkes, Mrs. James (Ellen Needs) 1 4.0 1 0 363272 7.0000 NaN S 1
2 894 2 Myles, Mr. Thomas Francis 0 5.0 0 0 240276 9.6875 NaN Q 0
3 895 3 Wirz, Mr. Albert 0 2.0 0 0 315154 8.6625 NaN S 0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 2.0 1 1 3101298 12.2875 NaN S 2
In [24]:
df.loc[df['Embarked']=='S','Embarked'] = 0
df.loc[df['Embarked']=='C','Embarked'] = 1
df.loc[df['Embarked']=='Q','Embarked'] = 2
df.head()
Out[24]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN 0 0
In [25]:
df_test.loc[df_test['Embarked']=='S','Embarked'] = 0
df_test.loc[df_test['Embarked']=='C','Embarked'] = 1
df_test.loc[df_test['Embarked']=='Q','Embarked'] = 2
df_test.head()
Out[25]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 892 3 Kelly, Mr. James 0 3.0 0 0 330911 7.8292 NaN 2 0
1 893 3 Wilkes, Mrs. James (Ellen Needs) 1 4.0 1 0 363272 7.0000 NaN 0 1
2 894 2 Myles, Mr. Thomas Francis 0 5.0 0 0 240276 9.6875 NaN 2 0
3 895 3 Wirz, Mr. Albert 0 2.0 0 0 315154 8.6625 NaN 0 0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 2.0 1 1 3101298 12.2875 NaN 0 2
In [26]:
train = df[['Survived','Sex','Age','FamilySize','Fare','Embarked']]
test = df_test[['Sex','Age','FamilySize','Fare','Embarked']]

train.head()
Out[26]:
Survived Sex Age FamilySize Fare Embarked
0 0 0 2.0 1 7.2500 0
1 1 1 3.0 1 71.2833 1
2 1 1 2.0 0 7.9250 0
3 1 1 3.0 1 53.1000 0
4 0 0 3.0 0 8.0500 0
In [27]:
df['Fare'] = df['Fare'].fillna( df.groupby('Pclass')['Fare'].transform('mean') )

# df_test['Fare'] = df_test['Fare'].fillna(df_test['Fare'].mean()) # 기존
df_test['Fare'] = df_test['Fare'].fillna( df_test.groupby('Pclass')['Fare'].transform('mean') )

# print(df.isnull().sum())
print(df_test.isnull().sum())
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
FamilySize       0
dtype: int64
In [31]:
x_train = train[['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
y_train = train['Survived']

x_train
Out[31]:
Sex Age FamilySize Fare Embarked
0 0 2.0 1 7.2500 0
1 1 3.0 1 71.2833 1
2 1 2.0 0 7.9250 0
3 1 3.0 1 53.1000 0
4 0 3.0 0 8.0500 0
... ... ... ... ... ...
886 0 2.0 0 13.0000 0
887 1 1.0 0 30.0000 0
888 1 2.0 3 23.4500 0
889 0 2.0 0 30.0000 1
890 0 3.0 0 7.7500 2

891 rows × 5 columns

In [30]:
y_train
Out[30]:
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64
In [32]:
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

print( tree.score(x_train, y_train))
0.9461279461279462
In [33]:
x_test = test[['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
x_test
Out[33]:
Sex Age FamilySize Fare Embarked
0 0 3.0 0 7.8292 2
1 1 4.0 1 7.0000 0
2 0 5.0 0 9.6875 2
3 0 2.0 0 8.6625 0
4 1 2.0 2 12.2875 0
... ... ... ... ... ...
413 0 3.0 0 8.0500 0
414 1 3.0 0 108.9000 1
415 0 3.0 0 7.2500 0
416 0 3.0 0 8.0500 0
417 0 3.0 2 22.3583 1

418 rows × 5 columns

In [34]:
prediction = tree.predict(x_test) # 즉 y_test가 만들어집니다.
prediction
Out[34]:
array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0],
      dtype=int64)
In [37]:
submit = pd.DataFrame({
    'PassengerId': df_test['PassengerId'],
    'Survived': prediction
})

submit.to_csv('submit3.csv', index=False)
In [36]:
my_prediction = pd.read_csv('submit3.csv')
my_prediction.head()
Out[36]:
PassengerId Survived
0 892 0
1 893 0
2 894 0
3 895 0
4 896 1
In [ ]: