여러 특성을 지닌 데이터의 Decision Tree

자율1

In [1]:
# Toy dataset: each key holds one attribute, and the lists are aligned by
# position (index i describes the i-th organism).
data = {
    'weight': [28, 2, 9, 6],
    'movable': [True, True, True, False],
    # Ground-truth label per organism; key spelling corrected from 'categorty'.
    'category': ['animal', 'plant', 'animal', 'plant']
}
In [2]:
# Hand-written decision tree: split on `movable` first, then on `weight`.
# Once the code is finished, run it with every number from 1 to 4.
TargetInput = int(input('번호 입력(1~4): '))

# Reject out-of-range numbers explicitly. Without this check, 0 or a negative
# number would silently wrap around to the end of the lists (negative
# indexing) and classify the wrong organism.
sample_count = len(data['weight'])
if not 1 <= TargetInput <= sample_count:
    raise ValueError('번호는 1~%d 사이여야 합니다: %d' % (sample_count, TargetInput))

print(TargetInput,'번 생물: 동물/식물을 분류합니다.')

target_index = TargetInput - 1  # the prompt is 1-based, the lists are 0-based
if data['movable'][target_index]:
    # Movable organisms weighing 6 or less are classified as plants.
    if data['weight'][target_index] <= 6:
        print('식물')
    else:
        print('동물')
else:
    # Anything that cannot move is classified as a plant.
    print('식물')
번호 입력(1~4): 1
1 번 생물: 동물/식물을 분류합니다.
동물

자율2

In [3]:
# Alternative hand-written decision tree: split on `weight` first, then on
# `movable`.
TargetInput = int(input('번호 입력(1~4): '))

# Reject out-of-range numbers explicitly. Without this check, 0 or a negative
# number would silently wrap around to the end of the lists (negative
# indexing) and classify the wrong organism.
sample_count = len(data['weight'])
if not 1 <= TargetInput <= sample_count:
    raise ValueError('번호는 1~%d 사이여야 합니다: %d' % (sample_count, TargetInput))

print(TargetInput,'번 생물: 동물/식물을 분류합니다.')

target_index = TargetInput - 1  # the prompt is 1-based, the lists are 0-based
if data['weight'][target_index] >= 6:
    # Among organisms weighing 6 or more, only the movable ones are animals.
    if data['movable'][target_index]:
        print('동물')
    else:
        print('식물')
else:
    # Anything lighter than 6 is classified as a plant.
    print('식물')
번호 입력(1~4): 2
2 번 생물: 동물/식물을 분류합니다.
식물

자율3

Cabin data의 첫 글자만 남기고 번호로 변경

In [4]:
import pandas as pd
In [5]:
# Load the training set from data/train.csv into a DataFrame.
df = pd.read_csv('data/train.csv')
In [6]:
# Preview the first five rows to see the available columns.
df.head()
Out[6]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [7]:
# Frequency of each raw Cabin string (value_counts skips NaN by default).
print(df['Cabin'].value_counts())
B96 B98        4
G6             4
C23 C25 C27    4
F33            3
F2             3
              ..
C95            1
A34            1
C47            1
A23            1
C91            1
Name: Cabin, Length: 147, dtype: int64
In [8]:
# Reduce every Cabin value to its first character; NaN entries stay NaN.
df['Cabin'] = df['Cabin'].str.get(0)
deck_counts = df['Cabin'].value_counts()
print(deck_counts)
# Number of rows whose Cabin is still missing.
df['Cabin'].isna().sum()
C    59
B    47
D    33
E    32
A    15
F    13
G     4
T     1
Name: Cabin, dtype: int64
Out[8]:
687
In [9]:
# Replace missing Cabin letters (NaN) with 'C', the most frequent letter.
cabin_filled = df['Cabin'].fillna('C')
df['Cabin'] = cabin_filled
df['Cabin'].value_counts()
Out[9]:
C    746
B     47
D     33
E     32
A     15
F     13
G      4
T      1
Name: Cabin, dtype: int64
In [10]:
# Apply the same Cabin preprocessing to the test set: keep only the first
# letter, then fill missing values with 'C'.
df_test = pd.read_csv('data/test.csv')
df_test['Cabin'] = df_test['Cabin'].str[0].fillna('C')
In [11]:
# Count the remaining missing values per column in the test set.
df_test.isnull().sum()
Out[11]:
PassengerId     0
Pclass          0
Name            0
Sex             0
Age            86
SibSp           0
Parch           0
Ticket          0
Fare            1
Cabin           0
Embarked        0
dtype: int64
In [12]:
# Convert the Cabin letter into an integer code with Series.map().
# The same lookup table is applied to both the training and the test set.
cabin_codes = {
    'C': 0,
    'B': 1,
    'D': 2,
    'E': 3,
    'A': 4,
    'F': 5,
    'G': 6,
    'T': 7
}

df['Cabin'] = df['Cabin'].map(cabin_codes)
print(df['Cabin'].value_counts())

df_test['Cabin'] = df_test['Cabin'].map(cabin_codes)
df_test['Cabin'].value_counts()
0    746
1     47
2     33
3     32
4     15
5     13
6      4
7      1
Name: Cabin, dtype: int64
Out[12]:
0    362
1     18
2     13
3      9
5      8
4      7
6      1
Name: Cabin, dtype: int64

Name에서 유용한 정보 남기기

In [13]:
# Look at a few raw names; each one embeds a title such as "Mr." or "Mrs.".
df['Name'].head()
Out[13]:
0                              Braund, Mr. Owen Harris
1    Cumings, Mrs. John Bradley (Florence Briggs Th...
2                               Heikkinen, Miss. Laina
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)
4                             Allen, Mr. William Henry
Name: Name, dtype: object
In [14]:
# Simple approach: fill missing ages with the overall mean age, e.g.
#   df['Age'].fillna(df['Age'].mean())
#
# Improved approach: reduce each name to its title (Mr, Mrs, Miss, ...) so the
# title can later be used as the grouping key for a per-title mean age.
#
# The patterns are raw strings: `\.` must reach the regex engine as an escaped
# literal dot. In a normal string literal `'\.'` is an invalid escape sequence,
# which Python reports with a warning.
# Rows already rewritten to a bare title no longer contain "<title>." and are
# therefore left alone by the later iterations.
for title in ('Mr', 'Miss', 'Mrs', 'Master', 'Dr', 'Major', 'Rev'):
    df.loc[df['Name'].str.contains(title + r'\.'), 'Name'] = title
In [15]:
# Reduce each test-set name to its title, in the same order as for the
# training set.
# The patterns are raw strings: `\.` must reach the regex engine as an escaped
# literal dot. In a normal string literal `'\.'` is an invalid escape sequence,
# which Python reports with a warning.
for title in ('Mr', 'Miss', 'Mrs', 'Master', 'Dr', 'Major', 'Rev'):
    df_test.loc[df_test['Name'].str.contains(title + r'\.'), 'Name'] = title
In [16]:
# Second pass: also catch names where "Mr" / "Mrs" is followed by a space
# instead of a period, then show how the Name column is distributed.
for title in ('Mr', 'Mrs'):
    has_title = df['Name'].str.contains(title + ' ')
    df.loc[has_title, 'Name'] = title
df['Name'].value_counts()
Out[16]:
Mr                                                          518
Miss                                                        182
Mrs                                                         127
Master                                                       40
Dr                                                            7
Rev                                                           6
Major                                                         2
Reuchlin, Jonkheer. John George                               1
Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards)      1
Simonius-Blumer, Col. Oberst Alfons                           1
Aubart, Mme. Leontine Pauline                                 1
Crosby, Capt. Edward Gifford                                  1
Uruchurtu, Don. Manuel E                                      1
Reynaldo, Ms. Encarnacion                                     1
Weir, Col. John                                               1
Sagesser, Mlle. Emma                                          1
Name: Name, dtype: int64
In [17]:
# Second pass on the test set: catch "Mr" / "Mrs" followed by a space.
for title in ('Mr', 'Mrs'):
    has_title = df_test['Name'].str.contains(title + ' ')
    df_test.loc[has_title, 'Name'] = title
In [18]:
# Encode each title as its position in this list (Mr -> 0 ... Major -> 6).
# Names that were not reduced to one of these titles map to NaN.
title_order = ['Mr', 'Mrs', 'Miss', 'Master', 'Dr', 'Rev', 'Major']
title_codes = {title: code for code, title in enumerate(title_order)}
df['Name'] = df['Name'].map(title_codes)
In [19]:
# Encode each test-set title as its position in this list
# (Mr -> 0 ... Major -> 6). Any other name maps to NaN.
title_order = ['Mr', 'Mrs', 'Miss', 'Master', 'Dr', 'Rev', 'Major']
title_codes = {title: code for code, title in enumerate(title_order)}
df_test['Name'] = df_test['Name'].map(title_codes)
In [20]:
# Distribution of the encoded titles (unmapped names are NaN and not counted).
df['Name'].value_counts()
Out[20]:
0.0    518
2.0    182
1.0    127
3.0     40
4.0      7
5.0      6
6.0      2
Name: Name, dtype: int64
In [21]:
# Put every name that did not match a known title into a catch-all code 7.
for frame in (df, df_test):
    frame['Name'] = frame['Name'].fillna(7)
df['Name'].value_counts()
Out[21]:
0.0    518
2.0    182
1.0    127
3.0     40
7.0      9
4.0      7
5.0      6
6.0      2
Name: Name, dtype: int64

이름으로 나이 빈칸 채우기

In [22]:
# Fill missing ages with the mean age of passengers sharing the same encoded
# title. Each data set uses the means computed from its own rows.
train_title_mean_age = df.groupby('Name')['Age'].transform('mean')
test_title_mean_age = df_test.groupby('Name')['Age'].transform('mean')

df['Age'] = df['Age'].fillna(train_title_mean_age)
df_test['Age'] = df_test['Age'].fillna(test_title_mean_age)
df.head(10)
Out[22]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 0.0 male 22.000000 1 0 A/5 21171 7.2500 0 S
1 2 1 1 1.0 female 38.000000 1 0 PC 17599 71.2833 0 C
2 3 1 3 2.0 female 26.000000 0 0 STON/O2. 3101282 7.9250 0 S
3 4 1 1 1.0 female 35.000000 1 0 113803 53.1000 0 S
4 5 0 3 0.0 male 35.000000 0 0 373450 8.0500 0 S
5 6 0 3 0.0 male 32.409774 0 0 330877 8.4583 0 Q
6 7 0 1 0.0 male 54.000000 0 0 17463 51.8625 3 S
7 8 0 3 3.0 male 2.000000 3 1 349909 21.0750 0 S
8 9 1 3 1.0 female 27.000000 0 2 347742 11.1333 0 S
9 10 1 2 1.0 female 14.000000 1 0 237736 30.0708 0 C
In [23]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
In [24]:
# Kernel-density plot of Age with one curve per Survived value.
age_grid = sns.FacetGrid(df, hue='Survived', aspect=4)
age_grid.map(sns.kdeplot, 'Age').add_legend()

plt.show()

HW2

Pclass에 따른 Fare 빈칸을 평균으로 채우고 IsAlone 열 추가하기

미션1) Pclass별로 Fare의 평균을 구하여, Fare가 비어 있는 경우 그 값으로 대체합니다.
Stage4에서 변형된 Name에 따라 Age를 평균 내어 빈칸을 채운 것과 동일한 방식입니다.
미션2) 혼자 왔는지 알려주는 IsAlone 데이터 추가하기
Challenge1과 완전히 동일합니다. 복습하면서 한번 더 진행해주세요.

In [25]:
# Number of training passengers in each Pclass.
df['Pclass'].value_counts()
Out[25]:
3    491
1    216
2    184
Name: Pclass, dtype: int64
In [26]:
# Count missing Fare values in the training set.
df['Fare'].isnull().sum()
# No missing Fare values.
Out[26]:
0
In [27]:
# Fill missing fares with the mean fare of the passenger's Pclass.
train_class_mean_fare = df.groupby('Pclass')['Fare'].transform('mean')
df['Fare'] = df['Fare'].fillna(train_class_mean_fare)
# The test set contains one missing Fare.
df_test['Fare'].isna().sum()
Out[27]:
1
In [28]:
# Fill the missing test-set fare with the mean fare of the same Pclass
# (computed from the test set), then re-check every column for missing values.
test_class_mean_fare = df_test.groupby('Pclass')['Fare'].transform('mean')
df_test['Fare'] = df_test['Fare'].fillna(test_class_mean_fare)
df_test.isna().sum()
Out[28]:
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64
In [29]:
# FamilySize = SibSp + Parch, computed row by row for both data sets.
for frame in (df, df_test):
    frame['FamilySize'] = frame['SibSp'].add(frame['Parch'])
In [43]:
# Create the IsAlone column in both frames, initialised to 0.
for frame in (df, df_test):
    frame['IsAlone'] = 0
In [44]:
# IsAlone is 1 when FamilySize is 0 and 0 otherwise.
# A single vectorised comparison replaces the two masked assignments per
# frame; the abandoned row-by-row loop (commented out and not valid Python)
# has been dropped.
for frame in (df, df_test):
    frame['IsAlone'] = (frame['FamilySize'] == 0).astype('int64')
In [32]:
# Display the training frame to inspect the new FamilySize and IsAlone columns.
df
Out[32]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize IsAlone
0 1 0 3 0.0 male 22.000000 1 0 A/5 21171 7.2500 0 S 1 0
1 2 1 1 1.0 female 38.000000 1 0 PC 17599 71.2833 0 C 1 0
2 3 1 3 2.0 female 26.000000 0 0 STON/O2. 3101282 7.9250 0 S 0 1
3 4 1 1 1.0 female 35.000000 1 0 113803 53.1000 0 S 1 0
4 5 0 3 0.0 male 35.000000 0 0 373450 8.0500 0 S 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 5.0 male 27.000000 0 0 211536 13.0000 0 S 0 1
887 888 1 1 2.0 female 19.000000 0 0 112053 30.0000 1 S 0 1
888 889 0 3 2.0 female 21.773973 1 2 W./C. 6607 23.4500 0 S 3 0
889 890 1 1 0.0 male 26.000000 0 0 111369 30.0000 0 C 0 1
890 891 0 3 0.0 male 32.000000 0 0 370376 7.7500 0 Q 0 1

891 rows × 14 columns

수업시간에 다룬 내용들 다시 반복해서 데이터 가공하기

Sex

In [33]:
# Report missing Sex values, then encode male -> 0 and female -> 1,
# first for the training set and then for the test set.
for message, frame in (('trainset에서 Sex 빈칸', df), ('testset에서 Sex 빈칸', df_test)):
    print(message, frame['Sex'].isnull().sum())
    is_male = frame['Sex'] == 'male'
    frame.loc[is_male, 'Sex'] = 0
    is_female = frame['Sex'] == 'female'
    frame.loc[is_female, 'Sex'] = 1
trainset에서 Sex 빈칸 0
testset에서 Sex 빈칸 0
In [38]:
# Confirm that Sex now holds the numeric codes.
df.head()
Out[38]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize IsAlone
0 1 0 3 0.0 0 22.0 1 0 A/5 21171 7.2500 0 S 1 0
1 2 1 1 1.0 1 38.0 1 0 PC 17599 71.2833 0 C 1 0
2 3 1 3 2.0 1 26.0 0 0 STON/O2. 3101282 7.9250 0 S 0 1
3 4 1 1 1.0 1 35.0 1 0 113803 53.1000 0 S 1 0
4 5 0 3 0.0 0 35.0 0 0 373450 8.0500 0 S 0 1

Embarked

In [57]:
# Encode the port of embarkation as an integer code: S -> 0, C -> 1, Q -> 2.
# Missing ports are filled with 'S' *before* encoding. Filling only in a later
# cell is too late: the feature frames are selected from df / df_test right
# after this cell, so they would otherwise still contain NaN.
for frame in (df, df_test):
    frame['Embarked'] = frame['Embarked'].fillna('S')
    frame.loc[frame['Embarked'] == 'S', 'Embarked'] = 0
    frame.loc[frame['Embarked'] == 'C', 'Embarked'] = 1
    frame.loc[frame['Embarked'] == 'Q', 'Embarked'] = 2
In [58]:
# Keep only the modelling columns: the test frame holds the features, and the
# training frame holds the target (Survived) plus the same features.
feature_columns = ['Name', 'Sex', 'Age', 'FamilySize', 'Fare', 'Cabin', 'Embarked']
train = df[['Survived'] + feature_columns]
test = df_test[feature_columns]
In [59]:
# Check both frames for remaining missing values.
print(df.isnull().sum())
# Embarked has already been encoded (S -> 0, C -> 1, Q -> 2) by this point, so
# a missing port must be filled with the code for 'S' (0). Filling with the
# string 'S' would mix text into an otherwise numeric column.
# NOTE(review): `train` was selected from df in an earlier cell, so this fill
# presumably does not reach it; confirm the fill happens before that selection.
df['Embarked'] = df['Embarked'].fillna(0)
print(df_test.isnull().sum())
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
FamilySize     0
IsAlone        0
dtype: int64
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
FamilySize     0
IsAlone        0
dtype: int64
In [60]:
# Split the training frame into the feature matrix and the target vector.
feature_names = ['Name', 'Sex', 'Age', 'FamilySize', 'Fare', 'Cabin', 'Embarked']
y_train = train['Survived']
x_train = train[feature_names]
In [61]:
# Print the feature matrix and the target vector as a sanity check.
print(x_train)
print(y_train)
     Name Sex        Age  FamilySize     Fare  Cabin  Embarked
0     0.0   0  22.000000           1   7.2500      0         0
1     1.0   1  38.000000           1  71.2833      0         1
2     2.0   1  26.000000           0   7.9250      0         0
3     1.0   1  35.000000           1  53.1000      0         0
4     0.0   0  35.000000           0   8.0500      0         0
..    ...  ..        ...         ...      ...    ...       ...
886   5.0   0  27.000000           0  13.0000      0         0
887   2.0   1  19.000000           0  30.0000      1         0
888   2.0   1  21.773973           3  23.4500      0         0
889   0.0   0  26.000000           0  30.0000      0         1
890   0.0   0  32.000000           0   7.7500      0         2

[891 rows x 7 columns]
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64
In [62]:
from sklearn.tree import DecisionTreeClassifier

# Fix the random seed so that the fitted tree, and therefore the predictions,
# are reproducible from run to run.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(x_train, y_train)

# This is the accuracy on the same data the tree was fitted on, so it says
# nothing about performance on unseen passengers.
tree.score(x_train, y_train)
Out[62]:
0.9865319865319865
In [63]:
# Test features: the same columns, in the same order, as x_train.
x_test = test[['Name', 'Sex', 'Age', 'FamilySize', 'Fare', 'Cabin', 'Embarked']]
In [65]:
# Predict Survived for every row of the test features and show the raw array.
prediction = tree.predict(x_test)
prediction
Out[65]:
array([0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1],
      dtype=int64)
In [66]:
# Build the submission table (PassengerId + predicted Survived), write it to
# submit.csv without the index column, then read it back to check the result.
submission_columns = {
    'PassengerId': df_test['PassengerId'],
    'Survived': prediction
}
submit = pd.DataFrame(submission_columns)
submit.to_csv('submit.csv', index=False)

my_prediction = pd.read_csv('submit.csv')
my_prediction.head()
Out[66]:
PassengerId Survived
0 892 0
1 893 1
2 894 1
3 895 1
4 896 0
In [ ]: