MNIST http://yann.lecun.com/exdb/mnist/
28*28 픽셀의 손글씨 숫자 이미지를 입력받아 실제로 의미하는 숫자를 인식하기
이미지 한 장을 784(=28*28)개의 특징 데이터로 구성한 후 머신러닝으로 어떤 숫자인지 추측할 수 있다
28*28 사이즈의 이미지로부터 label 값을 얻어낸다
import pandas as pd

# Load the digit dataset; it carries a 'label' column alongside the
# feature columns used further down.
df = pd.read_csv("data/digit.csv")

# Quick inspection: first ten rows, summary statistics, samples per label.
df.head(n=10)
df.describe()
df["label"].value_counts()
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
sns.countplot(data=df,x='label')
plt.show()
sns.catplot(data=df,x='label',kind='count')
plt.show()
import numpy as np

# Keep only the pixel columns (everything except the label).
numbers = df.drop(columns=["label"])

# Reshape one sample row into a 28x28 grid and display it as an image.
nth = 5
pixels = numbers.iloc[nth].to_numpy()
img = np.reshape(pixels, (28, 28))
plt.imshow(img)
plt.show()
from sklearn.model_selection import train_test_split

# Features are every column except the label; the target is the label itself.
train_data = df.drop(['label'], axis=1)
target_data = df['label']

# Pin the shuffle seed so the partitions -- and therefore every score computed
# from them -- are identical on each run. Without it each execution draws a
# new random split and reported accuracies cannot be reproduced.
x_train, x_test, y_train, y_test = train_test_split(
    train_data, target_data, test_size=0.2, random_state=42
)

# Carve a validation set out of the remaining training portion, giving
# roughly 64% train / 16% validation / 20% test overall.
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42
)

# Sanity-check the resulting sizes.
print(train_data.shape)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)
from sklearn.tree import DecisionTreeClassifier

# Baseline model: a single decision tree with default hyper-parameters.
# The seed is fixed because tree construction shuffles candidate features
# when choosing splits, so unseeded fits can differ from run to run.
tree = DecisionTreeClassifier(random_state=42).fit(x_train, y_train)

# Compare accuracy on the training data with accuracy on held-out validation
# data; a large gap indicates overfitting.
print('train set score:', tree.score(x_train, y_train))
print('valid set score:', tree.score(x_valid, y_valid))
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The seed is fixed because the
# forest draws bootstrap samples and feature subsets at random; without it the
# scores below change on every run.
forest = RandomForestClassifier(random_state=42).fit(x_train, y_train)

# Accuracy on each of the three partitions.
print('train set score:', forest.score(x_train, y_train))
print('valid set score:', forest.score(x_valid, y_valid))
print('test set score:', forest.score(x_test, y_test))

# Predicted labels for the whole test set (displayed as notebook cell output).
prediction = forest.predict(x_test)
prediction
결과 확인: RandomForestClassifier 모델은 테스트 세트에서 약 93.8%의 정확도로 손글씨 숫자를 인식(OCR)할 수 있다.
import random  # used to pick random test samples

# Display four randomly chosen test images together with the forest's
# prediction for each one.
for _ in range(4):
    n = random.randrange(len(x_test))
    # Select the row as a one-row DataFrame (double brackets) so the column
    # names the model was fitted with are still attached at prediction time.
    sample = x_test.iloc[[n]]
    img = np.reshape(sample.to_numpy(), [28, 28])
    plt.imshow(img)
    plt.show()
    # predict() returns an array with one entry per row; take the single value.
    result = forest.predict(sample)[0]
    print("인식된 숫자는", result,"입니다")
from sklearn.svm import SVC

# Support vector classifier with default settings, fitted on the training split.
model = SVC()
model.fit(x_train, y_train)

# Accuracy on the training data and on the held-out validation data.
print("train set score:", model.score(x_train, y_train))
print("valid set score:", model.score(x_valid, y_valid))
시간이 오래 걸림. Support Vector Machine은 분류 모델에 적합하다.