logo

모델 성능 비교 및 선택 전략

모델 성능 비교 및 선택 전략

예제 데이터

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
# Synthetic, imbalanced two-class dataset with two informative features.
X, y = make_classification(
    n_samples=1000,          # number of samples
    n_features=2,            # number of features (dimensions)
    n_informative=2,
    n_redundant=0,
    n_repeated=0,
    n_classes=2,             # number of classes
    n_clusters_per_class=1,  # clusters per class
    weights=[0.8, 0.2],      # class proportions
    class_sep=0.5,           # separation between the classes
    flip_y=0.0,              # fraction of labels assigned at random
    random_state=0,
)

# Hold out 33% of the samples for evaluation; the seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

선형 판별 분석 (LDA)

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Create a linear discriminant analysis classifier with default settings.
model = LinearDiscriminantAnalysis()
# Fit on the training split. The call is left as a bare expression so that a
# notebook cell displays the fitted estimator.
model.fit(x_train, y_train)

:::info[output]

LinearDiscriminantAnalysis()

:::

혼동 행렬

from sklearn.metrics import confusion_matrix
import numpy as np

# Predicted probability of the positive class (second column of predict_proba).
y_prob = model.predict_proba(x_test)[:, 1]

# Label a sample positive when its probability exceeds the cut-off.
threshold = 0.5
y_pred = (y_prob > threshold).astype(int)

# Build the confusion matrix (rows: true class, columns: predicted class).
confusion_matrix(y_test, y_pred)

:::info[output]

array([[252,   5],
       [ 33,  40]], dtype=int64)

:::

정확도

# Explicit imports make it clear where each metric used below comes from and
# avoid accidentally shadowing names that are already defined.
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Accuracy: fraction of test samples whose predicted label equals the true label.
accuracy_score(y_test, y_pred)

:::info[output]

0.8848484848484849

:::

정밀도

# Precision: of the samples predicted positive, the fraction that are truly positive.
precision_score(y_test, y_pred)

:::info[output]

0.8888888888888888

:::

재현도

# Recall: of the truly positive samples, the fraction that were predicted positive.
recall_score(y_test, y_pred)

:::info[output]

0.547945205479452

:::

문턱을 낮췄을 때

# Lower the cut-off to 0.4 so that more samples are labelled positive,
# then recompute the confusion matrix.
threshold = 0.4
y_pred = (y_prob > threshold).astype(int)
confusion_matrix(y_test, y_pred)

:::info[output]

array([[247,  10],
       [ 27,  46]], dtype=int64)

:::

# Precision for the predictions made with the lowered (0.4) threshold.
precision_score(y_test, y_pred)

:::info[output]

0.8214285714285714

:::

# Recall for the predictions made with the lowered (0.4) threshold.
recall_score(y_test, y_pred)

:::info[output]

0.6301369863013698

:::

문턱을 높였을 때

# Raise the cut-off to 0.6 so that fewer samples are labelled positive,
# then recompute the confusion matrix.
threshold = 0.6
y_pred = (y_prob > threshold).astype(int)
confusion_matrix(y_test, y_pred)

:::info[output]

array([[253,   4],
       [ 39,  34]], dtype=int64)

:::

# Precision for the predictions made with the raised (0.6) threshold.
precision_score(y_test, y_pred)

:::info[output]

0.8947368421052632

:::

# Recall for the predictions made with the raised (0.6) threshold.
recall_score(y_test, y_pred)

:::info[output]

0.4657534246575342

:::

퀴즈

Previous
불완전한 데이터의 약한 지도