Support Vector Machine


  • Classification model

    from sklearn.svm import SVC
    model = SVC()
  • Regression model (the hyperparameters shared by both are sketched below)

    from sklearn.svm import SVR
    model = SVR()
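Both SVC and SVR expose the same core hyperparameters. A minimal sketch of how they are typically set, using scikit-learn's default values purely as an illustration:

from sklearn.svm import SVC, SVR

# kernel -- 'rbf' by default; 'linear', 'poly', 'sigmoid' are also available
# C      -- regularization strength (smaller C = stronger regularization)
# gamma  -- kernel coefficient for 'rbf', 'poly', 'sigmoid'
clf = SVC(kernel='rbf', C=1.0, gamma='scale')
reg = SVR(kernel='rbf', C=1.0, gamma='scale', epsilon=0.1)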

SVR regression model - wine

In [1]:
import pandas as pd

df = pd.read_csv("../../COALA_DS_DATA/COALA_DS_DATA/week6/data/wine.csv")
df
Out[1]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 4.617195
1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 4.782987
2 7.8 0.760 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 4.868157
3 11.2 0.280 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 5.929590
4 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 4.714931
... ... ... ... ... ... ... ... ... ... ... ... ...
1594 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 5.420490
1595 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 5.730746
1596 6.3 0.510 0.13 2.3 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6.337004
1597 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 4.697053
1598 6.0 0.310 0.47 3.6 0.067 18.0 42.0 0.99549 3.39 0.66 11.0 5.624180

1599 rows × 12 columns

In [2]:
df['quality'] = df['quality'].round()  # round the noisy quality scores to the nearest integer
df
Out[2]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5.0
1 7.8 0.880 0.00 2.6 0.098 25.0 67.0 0.99680 3.20 0.68 9.8 5.0
2 7.8 0.760 0.04 2.3 0.092 15.0 54.0 0.99700 3.26 0.65 9.8 5.0
3 11.2 0.280 0.56 1.9 0.075 17.0 60.0 0.99800 3.16 0.58 9.8 6.0
4 7.4 0.700 0.00 1.9 0.076 11.0 34.0 0.99780 3.51 0.56 9.4 5.0
... ... ... ... ... ... ... ... ... ... ... ... ...
1594 6.2 0.600 0.08 2.0 0.090 32.0 44.0 0.99490 3.45 0.58 10.5 5.0
1595 5.9 0.550 0.10 2.2 0.062 39.0 51.0 0.99512 3.52 0.76 11.2 6.0
1596 6.3 0.510 0.13 2.3 0.076 29.0 40.0 0.99574 3.42 0.75 11.0 6.0
1597 5.9 0.645 0.12 2.0 0.075 32.0 44.0 0.99547 3.57 0.71 10.2 5.0
1598 6.0 0.310 0.47 3.6 0.067 18.0 42.0 0.99549 3.39 0.66 11.0 6.0

1599 rows × 12 columns
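A quick check of the rounded target, just to confirm that quality now takes a small set of integer scores (this cell is an added sketch, not part of the original run):

print(df['quality'].value_counts().sort_index())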

In [3]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_data = scaler.fit_transform(df) # the return value is an ndarray
scaled_data = pd.DataFrame(scaled_data, columns = df.columns)

scaled_data
Out[3]:
fixed acidity volatile acidity citric acid residual sugar chlorides free sulfur dioxide total sulfur dioxide density pH sulphates alcohol quality
0 -0.528360 0.961877 -1.391472 -0.453218 -0.243707 -0.466193 -0.379133 0.558274 1.288643 -0.579207 -0.960246 -0.787823
1 -0.298547 1.967442 -1.391472 0.043416 0.223875 0.872638 0.624363 0.028261 -0.719933 0.128950 -0.584777 -0.787823
2 -0.298547 1.297065 -1.186070 -0.169427 0.096353 -0.083669 0.229047 0.134264 -0.331177 -0.048089 -0.584777 -0.787823
3 1.654856 -1.384443 1.484154 -0.453218 -0.264960 0.107592 0.411500 0.664277 -0.979104 -0.461180 -0.584777 0.450848
4 -0.528360 0.961877 -1.391472 -0.453218 -0.243707 -0.466193 -0.379133 0.558274 1.288643 -0.579207 -0.960246 -0.787823
... ... ... ... ... ... ... ... ... ... ... ... ...
1594 -1.217796 0.403229 -0.980669 -0.382271 0.053845 1.542054 -0.075043 -0.978765 0.899886 -0.461180 0.072294 -0.787823
1595 -1.390155 0.123905 -0.877968 -0.240375 -0.541259 2.211469 0.137820 -0.862162 1.353436 0.601055 0.729364 0.450848
1596 -1.160343 -0.099554 -0.723916 -0.169427 -0.243707 1.255161 -0.196679 -0.533554 0.705508 0.542042 0.541630 0.450848
1597 -1.390155 0.654620 -0.775267 -0.382271 -0.264960 1.542054 -0.075043 -0.676657 1.677400 0.305990 -0.209308 -0.787823
1598 -1.332702 -1.216849 1.021999 0.752894 -0.434990 0.203223 -0.135861 -0.666057 0.511130 0.010924 0.541630 0.450848

1599 rows × 12 columns
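StandardScaler standardizes each column to zero mean and unit variance, z = (x - mean) / std. A minimal sketch verifying this by hand for one feature (StandardScaler uses the population standard deviation, ddof=0):

import numpy as np

col = df['fixed acidity']
manual = (col - col.mean()) / col.std(ddof=0)  # divide by the population std, as StandardScaler does
print(np.allclose(manual.values, scaled_data['fixed acidity'].values))  # expected: True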

In [4]:
from sklearn.model_selection import train_test_split

train_data = scaled_data.drop('quality', axis = 1)
target_data = scaled_data['quality']


x_train, x_test, y_train, y_test = train_test_split(train_data, target_data, test_size = 0.2)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size = 0.2)
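Because no random_state is passed, the splits (and therefore the scores below) change on every run. A sketch of the same two-stage split made reproducible (random_state=42 is an arbitrary choice):

x_train, x_test, y_train, y_test = train_test_split(
    train_data, target_data, test_size=0.2, random_state=42)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train, y_train, test_size=0.2, random_state=42)

print(x_train.shape, x_valid.shape, x_test.shape)
# approximately (1023, 11) (256, 11) (320, 11) for the 1599-row wine data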
In [5]:
from sklearn.svm import SVR

model = SVR()
model.fit(x_train, y_train)
C:\Users\kis03\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
Out[5]:
SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1,
    gamma='auto_deprecated', kernel='rbf', max_iter=-1, shrinking=True,
    tol=0.001, verbose=False)
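The FutureWarning above is about the default value of gamma changing from 'auto' to 'scale' in scikit-learn 0.22. A sketch of silencing it by setting gamma explicitly, fitted under a separate name so it does not overwrite the model scored below:

model_explicit = SVR(gamma='auto')  # 'auto' reproduces the old default; 'scale' is the new one
model_explicit.fit(x_train, y_train)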
In [6]:
print('training set R2 score :', model.score(x_train, y_train))
print('valid set R2 score :', model.score(x_valid, y_valid))
training set R2 score : 0.5379378598946503
valid set R2 score : 0.38087133320787514
In [7]:
print("test set R2 score :", model.score(x_test, y_test))
test set R2 score : 0.34993300597270593
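For a regression model, score() returns the R² coefficient of determination, not classification accuracy. A sketch of computing R² and the mean squared error explicitly via sklearn.metrics:

from sklearn.metrics import r2_score, mean_squared_error

pred = model.predict(x_test)
print('R2 :', r2_score(y_test, pred))   # same value as model.score(x_test, y_test)
print('MSE:', mean_squared_error(y_test, pred))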

SVC classification model - handwritten digits

In [8]:
import pandas as pd

df1 = pd.read_csv("../../COALA_DS_DATA/COALA_DS_DATA/week6/data/digit.csv")
df1
Out[8]:
pixel 1,1 pixel 1,2 pixel 1,3 pixel 1,4 pixel 1,5 pixel 1,6 pixel 1,7 pixel 1,8 pixel 1,9 pixel 1,10 ... pixel 28,20 pixel 28,21 pixel 28,22 pixel 28,23 pixel 28,24 pixel 28,25 pixel 28,26 pixel 28,27 pixel 28,28 label
0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4
1 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8
2 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 8
3 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7
4 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 4
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9995 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 9
9996 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 7
9997 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0
9998 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 5
9999 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 6

10000 rows × 785 columns

In [9]:
train_data = df1.drop('label', axis = 1)
target_data = df1['label']

print(train_data.shape, target_data.shape)
(10000, 784) (10000,)
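The 784 feature columns are a flattened 28x28 image. A sketch of viewing one sample (assumes matplotlib is available in this environment):

import matplotlib.pyplot as plt

sample = train_data.iloc[0].values.reshape(28, 28)
plt.imshow(sample, cmap='gray')
plt.title('label: {}'.format(target_data.iloc[0]))
plt.show()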
In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(train_data, target_data, test_size = 0.2)

print(train_data.shape, x_train.shape, x_test.shape)
(10000, 784) (8000, 784) (2000, 784)
In [11]:
from sklearn.svm import SVC

model2 = SVC()
model2.fit(x_train, y_train)
C:\Users\kis03\Anaconda3\lib\site-packages\sklearn\svm\base.py:193: FutureWarning: The default value of gamma will change from 'auto' to 'scale' in version 0.22 to account better for unscaled features. Set gamma explicitly to 'auto' or 'scale' to avoid this warning.
  "avoid this warning.", FutureWarning)
Out[11]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)
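As with SVR, the FutureWarning can be avoided by setting gamma explicitly. SVMs are also sensitive to feature scale, so if the pixel columns hold raw intensities (an assumption about this dataset), rescaling them to [0, 1] is a common tweak. A sketch of both, kept under separate names so model2 above is untouched:

from sklearn.preprocessing import MinMaxScaler

pixel_scaler = MinMaxScaler()
x_train_01 = pixel_scaler.fit_transform(x_train)
x_test_01 = pixel_scaler.transform(x_test)

model2_scaled = SVC(gamma='scale')
model2_scaled.fit(x_train_01, y_train)
print(model2_scaled.score(x_test_01, y_test))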
In [13]:
print('training set accuracy :', model2.score(x_train, y_train))
print("test set accuracy :", model2.score(x_test, y_test))
training set accuracy : 0.922375
test set accuracy : 0.9155
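Beyond overall accuracy, a per-class breakdown shows which digits the classifier confuses. A sketch using sklearn.metrics on the fitted model2:

from sklearn.metrics import confusion_matrix, classification_report

pred = model2.predict(x_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))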