0331

model_training.ipynb

0401

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import lightgbm as lgb
import optuna

# 1. Optimized data loading
print("[1/6] Loading and validating data...")
df = pd.read_csv("preprocessed_data.csv", usecols=['상호명_전처리', '카테고리'])
df["상호명_전처리"] = df["상호명_전처리"].fillna("").astype(str)

# 2. Data split (stratified sampling)
print("[2/6] Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    df["상호명_전처리"],
    df["카테고리"],
    test_size=0.2,
    random_state=42,
    stratify=df["카테고리"]
)

# 3. Lightweight text vectorization
print("[3/6] Vectorizing text...")
vectorizer = TfidfVectorizer(
    max_features=8000,   # reduced feature count
    ngram_range=(1, 2),  # narrowed n-gram range
    min_df=5,            # more aggressive min_df
    max_df=0.9,
    stop_words=["점","센터","코너"],  # note: ignored when analyzer != 'word'; sklearn emits a warning
    analyzer='char_wb',
    dtype=np.float32
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
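
# Optional sanity check (not part of the original training flow): inspect the
# learned char_wb vocabulary. get_feature_names_out() exists in
# scikit-learn >= 1.0; older releases spell it get_feature_names().
print(X_train_tfidf.shape)                      # (n_train_docs, <= 8000 features)
print(vectorizer.get_feature_names_out()[:20])  # a few example char n-grams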

# 4. Efficient resampling
print("[4/6] Resampling data...")
sampling_strategy = {
    '기타': 6000, '식비': 8000, '쇼핑': 6000,
    '문화': 4000, '교육': 3000, '교통': 2500,
    '의료': 2000, '주거': 2000
}

under = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_under, y_under = under.fit_resample(X_train_tfidf, y_train)

over = SMOTE(sampling_strategy='auto', k_neighbors=3)  # reduced k_neighbors
X_res, y_res = over.fit_resample(X_under, y_under)
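
# Optional sanity check: confirm the resampling matched the strategy dict by
# printing class counts at each stage (Counter works on Series and arrays).
from collections import Counter
print("train:      ", Counter(y_train))
print("undersample:", Counter(y_under))
print("smote:      ", Counter(y_res))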

# 5. Optimized hyperparameter tuning
print("[5/6] Training model...")
def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 30, 60),
        'max_depth': trial.suggest_int('max_depth', 5, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.15),
        'n_estimators': trial.suggest_int('n_estimators', 200, 400),
        'min_child_samples': trial.suggest_int('min_child_samples', 30, 80),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'verbosity': -1  # suppress logging
    }
    model = lgb.LGBMClassifier(**params, random_state=42)
    return np.mean(cross_val_score(model, X_res, y_res, cv=3, n_jobs=1, scoring='f1_weighted'))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10, show_progress_bar=True)  # reduced trial count
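
# Optional: inspect the winning trial before refitting on the resampled set
# (study.best_value is the CV weighted-F1 the search maximized).
print("best CV weighted-F1:", study.best_value)
print("best params:", study.best_params)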

best_model = lgb.LGBMClassifier(
    **study.best_params,
    verbosity=-1,
    n_jobs=-1  # use all cores
)
best_model.fit(X_res, y_res)

# 6. Evaluation and saving
print("[6/6] Evaluating and saving model...")
y_pred = best_model.predict(X_test_tfidf)

print("\\n🔥 최종 성능 리포트:")
print(classification_report(y_test, y_pred))

# Save the model
best_model.booster_.save_model('optimized_model.txt')
🔥 Final performance report:
              precision    recall  f1-score   support

          교육       0.77      0.65      0.71     10196
          교통       0.42      0.77      0.54      1367
          기타       0.79      0.62      0.70     32929
          문화       0.59      0.66      0.62      6154
          쇼핑       0.33      0.64      0.43     12900
          식비       0.82      0.71      0.76     35582
          의료       0.87      0.80      0.83      6541
          주거       0.87      0.86      0.86      5038

    accuracy                           0.68    110707
   macro avg       0.68      0.71      0.68    110707
weighted avg       0.74      0.68      0.70    110707
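
One caveat on the save step above: booster_.save_model writes only the LightGBM trees, so optimized_model.txt carries neither the fitted TF-IDF vocabulary nor the classifier's class-label mapping, and inference cannot be reproduced from that file alone. A minimal persistence sketch, assuming joblib is available (file names here are illustrative):

import joblib

# The booster file alone cannot serve predictions: the TF-IDF vocabulary and
# the sklearn wrapper's classes_ mapping live outside optimized_model.txt.
joblib.dump(vectorizer, "tfidf_vectorizer.joblib")  # illustrative file name
joblib.dump(best_model, "lgbm_classifier.joblib")   # preserves classes_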

0402

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import lightgbm as lgb
import optuna

# 1. Optimized data loading
print("[1/6] Loading and validating data...")
df = pd.read_csv("preprocessed_data.csv", usecols=['상호명_전처리', '카테고리'])
df["상호명_전처리"] = df["상호명_전처리"].fillna("").astype(str)

# 2. Data split (stratified sampling)
print("[2/6] Splitting data...")
X_train, X_test, y_train, y_test = train_test_split(
    df["상호명_전처리"],
    df["카테고리"],
    test_size=0.2,
    random_state=42,
    stratify=df["카테고리"]
)

# 3. Strengthened text vectorization
print("[3/6] Vectorizing text...")
vectorizer = TfidfVectorizer(
    max_features=10000,  # increased feature count
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.9,
    stop_words=["점","센터","코너"],  # note: ignored when analyzer != 'word'; sklearn emits a warning
    analyzer='char_wb',
    dtype=np.float32
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# 4. Adjusted resampling strategy
print("[4/6] Resampling data...")
sampling_strategy = {
    '기타': 6000, '식비': 8000, '쇼핑': 8000,  # increased 쇼핑 samples
    '문화': 4000, '교육': 3000, '교통': 2500,
    '의료': 2000, '주거': 2000
}

under = RandomUnderSampler(sampling_strategy=sampling_strategy)
X_under, y_under = under.fit_resample(X_train_tfidf, y_train)

over = SMOTE(sampling_strategy='auto', k_neighbors=3)
X_res, y_res = over.fit_resample(X_under, y_under)

# 5. Strengthened hyperparameter tuning
print("[5/6] Training model...")
def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 50, 100),  # widened range
        'max_depth': trial.suggest_int('max_depth', 8, 15),      # deeper trees
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 300, 500),
        'min_child_samples': trial.suggest_int('min_child_samples', 20, 50),  # narrowed range
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 0.1),   # added L1 regularization
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 0.1), # added L2 regularization
        'verbosity': -1
    }
    model = lgb.LGBMClassifier(**params, random_state=42)
    return np.mean(cross_val_score(model, X_res, y_res, cv=3, n_jobs=1, scoring='f1_weighted'))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10, show_progress_bar=True)

best_model = lgb.LGBMClassifier(
    **study.best_params,
    verbosity=-1,
    n_jobs=-1
)
best_model.fit(X_res, y_res)
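
# Optional follow-up: map LightGBM feature importances back to the TF-IDF
# vocabulary to see which character n-grams drive the tuned model.
names = vectorizer.get_feature_names_out()
order = np.argsort(best_model.feature_importances_)[::-1][:15]
for i in order:
    print(f"{names[i]!r}: {best_model.feature_importances_[i]}")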

# 6. Evaluation and saving
print("[6/6] Evaluating and saving model...")
y_pred = best_model.predict(X_test_tfidf)

print("\\n🔥 최종 성능 리포트:")
print(classification_report(y_test, y_pred))

best_model.booster_.save_model('optimized_model.txt')
🔥 Final performance report:
              precision    recall  f1-score   support

          교육       0.79      0.65      0.72     10196
          교통       0.46      0.78      0.58      1367
          기타       0.80      0.63      0.70     32929
          문화       0.60      0.68      0.64      6154
          쇼핑       0.34      0.68      0.45     12900
          식비       0.85      0.72      0.78     35582
          의료       0.87      0.80      0.84      6541
          주거       0.89      0.85      0.87      5038

    accuracy                           0.69    110707
   macro avg       0.70      0.72      0.70    110707
weighted avg       0.75      0.69      0.71    110707