In [1]:
import pandas as pd
In [2]:
# Load the training split into `df` and preview its first ten rows.
df = pd.read_csv(filepath_or_buffer='data/train.csv')
df.iloc[:10]
Out[2]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
5 6 0 3 Moran, Mr. James male NaN 0 0 330877 8.4583 NaN Q
6 7 0 1 McCarthy, Mr. Timothy J male 54.0 0 0 17463 51.8625 E46 S
7 8 0 3 Palsson, Master. Gosta Leonard male 2.0 3 1 349909 21.0750 NaN S
8 9 1 3 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27.0 0 2 347742 11.1333 NaN S
9 10 1 2 Nasser, Mrs. Nicholas (Adele Achem) female 14.0 1 0 237736 30.0708 NaN C
In [3]:
# Preview the Sex, Age, SibSp and Parch columns for the first six rows.
df.loc[:, ['Sex', 'Age', 'SibSp', 'Parch']].iloc[:6]
Out[3]:
Sex Age SibSp Parch
0 male 22.0 1 0
1 female 38.0 1 0
2 female 26.0 0 0
3 female 35.0 1 0
4 male 35.0 0 0
5 male NaN 0 0
In [4]:
# Test split: load the test CSV (it has no Survived column) and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer='data/test.csv')
df_test.iloc[:5]
Out[4]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [5]:
# Impute missing training ages with the mean of the observed ages.
df['Age'] = df['Age'].where(df['Age'].notna(), df['Age'].mean())
In [6]:
# Display the Age column after the mean fill; filled rows carry the column mean
# (e.g. 29.699118 in the output) instead of NaN.
df['Age']
Out[6]:
0      22.000000
1      38.000000
2      26.000000
3      35.000000
4      35.000000
         ...    
886    27.000000
887    19.000000
888    29.699118
889    26.000000
890    32.000000
Name: Age, Length: 891, dtype: float64
In [7]:
# Show the training frame (pandas truncates the display) to inspect the filled Age column.
df
Out[7]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.000000 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.000000 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.000000 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.000000 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.000000 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 27.000000 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 19.000000 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 29.699118 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 26.000000 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 32.000000 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

In [8]:
# Fill missing test ages with the *training* Age mean. The training column has already been
# mean-filled, which leaves its mean unchanged. This must run before Age is binned, otherwise
# the mean would be taken over bin codes rather than years.
df_test['Age'] = df_test['Age'].where(df_test['Age'].notna(), df['Age'].mean())
In [9]:
# Replace Age (in years) with a decade bin code, stored as float like the original column:
#   0: under 10, 1: 10-19, 2: 20-29, 3: 30-39, 4: 40-49, 5: 50 and over.
# Bins are closed on the left ([lo, hi)), so the last bin starts at exactly 50. The earlier
# rule-by-rule version tested `Age > 50` for the last bin, which left passengers aged exactly 50
# unbinned, keeping the raw value 50.0 next to the codes 0-5.
# NOTE: Age is overwritten in place, so run this cell once per fresh load of `df`;
# running it again would re-bin the codes themselves (all of them are < 10).
df['Age'] = pd.cut(
    df['Age'],
    bins=[float('-inf'), 10, 20, 30, 40, 50, float('inf')],
    right=False,   # intervals are [lo, hi)
    labels=False,  # return the integer bin index instead of Interval labels
).astype(float)
In [10]:
# Inspect the training frame after binning: Age now holds bin codes instead of years.
df
Out[10]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S
... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas male 2.0 0 0 211536 13.0000 NaN S
887 888 1 1 Graham, Miss. Margaret Edith female 1.0 0 0 112053 30.0000 B42 S
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female 2.0 1 2 W./C. 6607 23.4500 NaN S
889 890 1 1 Behr, Mr. Karl Howell male 2.0 0 0 111369 30.0000 C148 C
890 891 0 3 Dooley, Mr. Patrick male 3.0 0 0 370376 7.7500 NaN Q

891 rows × 12 columns

In [11]:
# Replace test-set Age (in years) with a decade bin code, stored as float:
#   0: under 10, 1: 10-19, 2: 20-29, 3: 30-39, 4: 40-49, 5: 50 and over.
# Bins are closed on the left ([lo, hi)), so the last bin starts at exactly 50. The earlier
# rule-by-rule version tested `Age > 50` for the last bin, which left passengers aged exactly 50
# unbinned, keeping the raw value 50.0 next to the codes 0-5.
# NOTE: Age is overwritten in place, so run this cell once per fresh load of `df_test`.
df_test['Age'] = pd.cut(
    df_test['Age'],
    bins=[float('-inf'), 10, 20, 30, 40, 50, float('inf')],
    right=False,   # intervals are [lo, hi)
    labels=False,  # return the integer bin index instead of Interval labels
).astype(float)
In [12]:
# FamilySize is the sum of the SibSp and Parch columns (the passenger is not added in).
df['FamilySize'] = df['SibSp'].add(df['Parch'])
df.iloc[:5]
Out[12]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris male 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina female 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry male 3.0 0 0 373450 8.0500 NaN S 0
In [13]:
# Same FamilySize feature (SibSp + Parch) for the test split.
df_test['FamilySize'] = df_test['SibSp'].add(df_test['Parch'])
df_test.iloc[:5]
Out[13]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 892 3 Kelly, Mr. James male 3.0 0 0 330911 7.8292 NaN Q 0
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 4.0 1 0 363272 7.0000 NaN S 1
2 894 2 Myles, Mr. Thomas Francis male 5.0 0 0 240276 9.6875 NaN Q 0
3 895 3 Wirz, Mr. Albert male 2.0 0 0 315154 8.6625 NaN S 0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 2.0 1 1 3101298 12.2875 NaN S 2
In [14]:
# Keep the label plus the features prepared so far; `test` carries the features only.
train = df.loc[:, ['Survived', 'Sex', 'Age', 'FamilySize']]
test = df_test.loc[:, ['Sex', 'Age', 'FamilySize']]
train.iloc[:5]
Out[14]:
Survived Sex Age FamilySize
0 0 male 2.0 1
1 1 female 3.0 1
2 1 female 2.0 0
3 1 female 3.0 1
4 0 male 3.0 0

미션1

In [15]:
# Fill missing fares in both splits with the *training* mean fare.
# The test split previously used its own mean; using the training mean matches how test Age
# is imputed and keeps statistics computed from test data out of the preprocessing.
train_fare_mean = df['Fare'].mean()
df['Fare'] = df['Fare'].fillna(train_fare_mean)
df_test['Fare'] = df_test['Fare'].fillna(train_fare_mean)

함께 실습7

In [16]:
# Count missing values per column in the training frame.
df.isnull().sum()
Out[16]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
FamilySize       0
dtype: int64
In [17]:
# Count missing values per column in the test frame.
df_test.isnull().sum()
Out[17]:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          327
Embarked         0
FamilySize       0
dtype: int64
In [18]:
# Frequency of each Embarked value in the training data (missing values are not counted).
df['Embarked'].value_counts()
Out[18]:
S    644
C    168
Q     77
Name: Embarked, dtype: int64
In [19]:
# Frequency of each Pclass value in the training data.
df['Pclass'].value_counts()
Out[19]:
3    491
1    216
2    184
Name: Pclass, dtype: int64
In [20]:
# Fill missing Embarked values with 'S', the most frequent value in the training data.
df['Embarked'] = df['Embarked'].where(df['Embarked'].notna(), 'S')
df_test['Embarked'] = df_test['Embarked'].where(df_test['Embarked'].notna(), 'S')
In [21]:
# Re-check missing values after the fills: only Cabin should still have any.
df.isnull().sum()
Out[21]:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
FamilySize       0
dtype: int64
In [22]:
# Rebuild the modelling frames, now including Fare and Embarked.
train = df.loc[:, ['Survived', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
test = df_test.loc[:, ['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
train.iloc[:5]
Out[22]:
Survived Sex Age FamilySize Fare Embarked
0 0 male 2.0 1 7.2500 S
1 1 female 3.0 1 71.2833 C
2 1 female 2.0 0 7.9250 S
3 1 female 3.0 1 53.1000 S
4 0 male 3.0 0 8.0500 S

챌린지1

In [23]:
# Pattern used: df.loc[condition, column] = value
# Encode Sex in place: 'male' -> 0, 'female' -> 1.
for sex_label, sex_code in (('male', 0), ('female', 1)):
    df.loc[df['Sex'] == sex_label, 'Sex'] = sex_code
df.iloc[:5]
Out[23]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN S 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 C 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN S 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 S 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN S 0
In [24]:
# Pattern used: df_test.loc[condition, column] = value
# Encode Sex in the test split with the same mapping: 'male' -> 0, 'female' -> 1.
for sex_label, sex_code in (('male', 0), ('female', 1)):
    df_test.loc[df_test['Sex'] == sex_label, 'Sex'] = sex_code
df_test.iloc[:5]
Out[24]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 892 3 Kelly, Mr. James 0 3.0 0 0 330911 7.8292 NaN Q 0
1 893 3 Wilkes, Mrs. James (Ellen Needs) 1 4.0 1 0 363272 7.0000 NaN S 1
2 894 2 Myles, Mr. Thomas Francis 0 5.0 0 0 240276 9.6875 NaN Q 0
3 895 3 Wirz, Mr. Albert 0 2.0 0 0 315154 8.6625 NaN S 0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 2.0 1 1 3101298 12.2875 NaN S 2
In [25]:
# Embarked frequencies after the fill: the previously missing rows are now counted under 'S'.
df['Embarked'].value_counts()
Out[25]:
S    646
C    168
Q     77
Name: Embarked, dtype: int64
In [26]:
# Encode Embarked in place: 'S' -> 0, 'C' -> 1, 'Q' -> 2.
for port_code, port_label in enumerate(('S', 'C', 'Q')):
    df.loc[df['Embarked'] == port_label, 'Embarked'] = port_code
In [27]:
# Inspect the training frame after encoding: Sex and Embarked now hold integer codes.
df
Out[27]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 1 0 3 Braund, Mr. Owen Harris 0 2.0 1 0 A/5 21171 7.2500 NaN 0 1
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 1 3.0 1 0 PC 17599 71.2833 C85 1 1
2 3 1 3 Heikkinen, Miss. Laina 1 2.0 0 0 STON/O2. 3101282 7.9250 NaN 0 0
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) 1 3.0 1 0 113803 53.1000 C123 0 1
4 5 0 3 Allen, Mr. William Henry 0 3.0 0 0 373450 8.0500 NaN 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
886 887 0 2 Montvila, Rev. Juozas 0 2.0 0 0 211536 13.0000 NaN 0 0
887 888 1 1 Graham, Miss. Margaret Edith 1 1.0 0 0 112053 30.0000 B42 0 0
888 889 0 3 Johnston, Miss. Catherine Helen "Carrie" 1 2.0 1 2 W./C. 6607 23.4500 NaN 0 3
889 890 1 1 Behr, Mr. Karl Howell 0 2.0 0 0 111369 30.0000 C148 1 0
890 891 0 3 Dooley, Mr. Patrick 0 3.0 0 0 370376 7.7500 NaN 2 0

891 rows × 13 columns

In [28]:
# Encode Embarked in the test split with the same mapping: 'S' -> 0, 'C' -> 1, 'Q' -> 2.
for port_code, port_label in enumerate(('S', 'C', 'Q')):
    df_test.loc[df_test['Embarked'] == port_label, 'Embarked'] = port_code
In [29]:
# Inspect the test frame after encoding: Sex and Embarked now hold integer codes.
df_test
Out[29]:
PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked FamilySize
0 892 3 Kelly, Mr. James 0 3.0 0 0 330911 7.8292 NaN 2 0
1 893 3 Wilkes, Mrs. James (Ellen Needs) 1 4.0 1 0 363272 7.0000 NaN 0 1
2 894 2 Myles, Mr. Thomas Francis 0 5.0 0 0 240276 9.6875 NaN 2 0
3 895 3 Wirz, Mr. Albert 0 2.0 0 0 315154 8.6625 NaN 0 0
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) 1 2.0 1 1 3101298 12.2875 NaN 0 2
... ... ... ... ... ... ... ... ... ... ... ... ...
413 1305 3 Spector, Mr. Woolf 0 2.0 0 0 A.5. 3236 8.0500 NaN 0 0
414 1306 1 Oliva y Ocana, Dona. Fermina 1 3.0 0 0 PC 17758 108.9000 C105 1 0
415 1307 3 Saether, Mr. Simon Sivertsen 0 3.0 0 0 SOTON/O.Q. 3101262 7.2500 NaN 0 0
416 1308 3 Ware, Mr. Frederick 0 2.0 0 0 359309 8.0500 NaN 0 0
417 1309 3 Peter, Master. Michael J 0 2.0 1 1 2668 22.3583 NaN 1 2

418 rows × 12 columns

In [30]:
# Rebuild the modelling frames from the encoded columns; these are the versions used for fitting.
train = df.loc[:, ['Survived', 'Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
test = df_test.loc[:, ['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
test.iloc[:5]
Out[30]:
Sex Age FamilySize Fare Embarked
0 0 3.0 0 7.8292 2
1 1 4.0 1 7.0000 0
2 0 5.0 0 9.6875 2
3 0 2.0 0 8.6625 0
4 1 2.0 2 12.2875 0

stage 3

In [31]:
# Separate the feature matrix from the Survived label.
x_train = train.loc[:, ['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
y_train = train.loc[:, 'Survived']
x_train
Out[31]:
Sex Age FamilySize Fare Embarked
0 0 2.0 1 7.2500 0
1 1 3.0 1 71.2833 1
2 1 2.0 0 7.9250 0
3 1 3.0 1 53.1000 0
4 0 3.0 0 8.0500 0
... ... ... ... ... ...
886 0 2.0 0 13.0000 0
887 1 1.0 0 30.0000 0
888 1 2.0 3 23.4500 0
889 0 2.0 0 30.0000 1
890 0 3.0 0 7.7500 2

891 rows × 5 columns

In [32]:
# Display the label vector (Survived) that the classifiers are fitted against.
y_train
Out[32]:
0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64
In [33]:
from sklearn.tree import DecisionTreeClassifier

# Fix the seed: scikit-learn permutes the candidate features at every split, so an unseeded
# tree can differ from run to run and the predictions made from it would not be reproducible.
tree = DecisionTreeClassifier(random_state=42)
tree.fit(x_train, y_train)

# Accuracy on the same rows the tree was fitted on: an optimistic figure, not an estimate
# of how well the model generalizes.
tree.score(x_train, y_train)
Out[33]:
0.9461279461279462
In [34]:
# Feature matrix for the test passengers, with the columns in the same order as x_train.
x_test = test.loc[:, ['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']]
x_test
Out[34]:
Sex Age FamilySize Fare Embarked
0 0 3.0 0 7.8292 2
1 1 4.0 1 7.0000 0
2 0 5.0 0 9.6875 2
3 0 2.0 0 8.6625 0
4 1 2.0 2 12.2875 0
... ... ... ... ... ...
413 0 2.0 0 8.0500 0
414 1 3.0 0 108.9000 1
415 0 3.0 0 7.2500 0
416 0 2.0 0 8.0500 0
417 0 2.0 2 22.3583 1

418 rows × 5 columns

In [35]:
# Predict Survived for every test passenger with the fitted decision tree.
prediction = tree.predict(X=x_test)
prediction
Out[35]:
array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0],
      dtype=int64)
In [36]:
# Show the test feature matrix again (same frame as displayed when x_test was built).
x_test
Out[36]:
Sex Age FamilySize Fare Embarked
0 0 3.0 0 7.8292 2
1 1 4.0 1 7.0000 0
2 0 5.0 0 9.6875 2
3 0 2.0 0 8.6625 0
4 1 2.0 2 12.2875 0
... ... ... ... ... ...
413 0 2.0 0 8.0500 0
414 1 3.0 0 108.9000 1
415 0 3.0 0 7.2500 0
416 0 2.0 0 8.0500 0
417 0 2.0 2 22.3583 1

418 rows × 5 columns

In [37]:
# Build the submission table (PassengerId plus the predicted Survived) and save it without the index.
submit = df_test[['PassengerId']].copy()
submit['Survived'] = prediction
submit.to_csv('submit.csv', index=False)
In [38]:
# Read the file just written back in to check its contents.
my_prediction = pd.read_csv(filepath_or_buffer='submit.csv')
my_prediction
Out[38]:
PassengerId Survived
0 892 0
1 893 0
2 894 0
3 895 0
4 896 1
... ... ...
413 1305 0
414 1306 1
415 1307 0
416 1308 0
417 1309 0

418 rows × 2 columns

week4 - Stage1

In [39]:
from sklearn.tree import DecisionTreeClassifier
# Refit an unseeded decision tree (rebinding `tree`) and report its accuracy on the fitting data.
tree = DecisionTreeClassifier().fit(x_train, y_train)
print('traning set accuracy: ', tree.score(X=x_train, y=y_train))
traning set accuracy:  0.9461279461279462
In [40]:
# First of three identically configured, unseeded trees; prints accuracy on the fitting data.
tree1 = DecisionTreeClassifier().fit(x_train, y_train)
print('traning set accuracy: ', tree1.score(X=x_train, y=y_train))
traning set accuracy:  0.9461279461279462
In [41]:
# Second identically configured, unseeded tree; prints accuracy on the fitting data.
tree2 = DecisionTreeClassifier().fit(x_train, y_train)
print('traning set accuracy: ', tree2.score(X=x_train, y=y_train))
traning set accuracy:  0.9461279461279462
In [42]:
# Third identically configured, unseeded tree; prints accuracy on the fitting data.
tree3 = DecisionTreeClassifier().fit(x_train, y_train)
print('traning set accuracy: ', tree3.score(X=x_train, y=y_train))
traning set accuracy:  0.9461279461279462
In [43]:
# Hold out the first 100 training rows (in file order, no shuffling) for validation and
# fit on the remaining rows.
# The split is taken from `train` rather than from `x_train` / `y_train` themselves, so
# re-running this cell reproduces the same split instead of cutting another 100 rows off
# the already shortened training set.
feature_cols = ['Sex', 'Age', 'FamilySize', 'Fare', 'Embarked']
x_valid = train[feature_cols].iloc[:100]
y_valid = train['Survived'].iloc[:100]
x_train = train[feature_cols].iloc[100:]
y_train = train['Survived'].iloc[100:]
In [44]:
# Fit three identically configured, unseeded trees on the reduced training set and print
# training and validation accuracy for each, in order.
fitted_trees = []
for _ in range(3):
    candidate_tree = DecisionTreeClassifier()
    candidate_tree.fit(x_train, y_train)
    print('traning set accuracy: ', candidate_tree.score(x_train, y_train))
    print('validation set accuracy: ', candidate_tree.score(x_valid, y_valid))
    fitted_trees.append(candidate_tree)
tree1, tree2, tree3 = fitted_trees
traning set accuracy:  0.9456384323640961
validation set accuracy:  0.78
traning set accuracy:  0.9456384323640961
validation set accuracy:  0.77
traning set accuracy:  0.9456384323640961
validation set accuracy:  0.77

challenge 2->hw1

In [45]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: bootstrap sampling and per-split feature sampling are random, so without
# a fixed random_state the predictions made from this model change between runs.
forest = RandomForestClassifier(n_estimators=100, random_state=42)
# x_train / y_train here are the rows left after the 100-row validation hold-out.
forest.fit(x_train, y_train)

# Accuracy on the fitting data only; not an estimate of generalization.
forest.score(x_train, y_train)
Out[45]:
0.9456384323640961
In [48]:
# Report the forest's accuracy on its fitting data, then predict the test passengers.
# `prediction` is rebound here, replacing the decision tree's predictions.
forest_train_accuracy = forest.score(x_train, y_train)
print('training set accuracy:', forest_train_accuracy)
prediction = forest.predict(X=x_test)
prediction
training set accuracy: 0.9456384323640961
Out[48]:
array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1],
      dtype=int64)
In [49]:
# Build the submission table from the current `prediction` and save it without the index.
# This writes to the same 'submit.csv' path as the earlier submission cell, replacing that file.
submit = df_test[['PassengerId']].copy()
submit['Survived'] = prediction
submit.to_csv('submit.csv', index=False)
In [ ]: