In [1]:
import setup_env
--------------------------------------------------------------------------------
=== Hardware Acceleration ===
PyTorch version: 2.9.0a0+145a3a7bda.nv25.10
Using NVIDIA GPU (CUDA)
CUDA version: 13.0
GPU name: NVIDIA GeForce RTX 5070 Ti
GPU count: 1
Total GPU memory: 15.92 GB
Allocated memory: 0.00 GB
Free memory: 15.92 GB
Device: cuda
=== Matplotlib Settings ===
✅ Font: NanumGothic
=== System Info ===
OS: Ubuntu 24.04.3 LTS (Noble Numbat)
Kernel: 6.6.87.2-microsoft-standard-WSL2
Architecture: x86_64
Python: 3.12.3
Working directory: /workspace/ai-deeplearning/tutorial
=== Library Versions ===
NumPy: 2.1.0
Pandas: 3.0.0
Matplotlib: 3.10.7
Scikit-learn: 1.7.2
OpenCV: Not installed → !pip install -q opencv-python
Pillow: 12.0.0
Seaborn: 0.13.2
TensorFlow: Not installed → !pip install -q tensorflow
Transformers: 4.40.1
TorchVision: 0.24.0a0+094e7af5
=== Environment setup completed ===
--------------------------------------------------------------------------------
=== Visualizing Test Plot (Wide View) ===
=== GPU Usage Code Snippet ===
Device set to: cuda
----------------------------------------
# 아래 코드를 복사해서 모델과 데이터를 GPU로 보내세요:
model = YourModel().to(device)
data = data.to(device)
----------------------------------------
=== Environment setup completed ===
--------------------------------------------------------------------------------
In [2]:
import os

# Create the data directory (no-op if it already exists).
os.makedirs('./data/ag_news', exist_ok=True)

# AG News CSVs hosted in the CharCnn_Keras repository.
base_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv"

# Skip files already downloaded on a previous run so the cell is re-runnable.
# BUG FIX: the f-string placeholders had been garbled to a literal "(unknown)";
# restored to {filename} (the cell's own output confirms per-file messages).
for filename in ['train.csv', 'test.csv']:
    filepath = f'./data/ag_news/{filename}'
    if os.path.exists(filepath):
        print(f"✅ {filename} 이미 존재 → 스킵")
    else:
        print(f"⬇️ {filename} 다운로드 중...")
        os.system(f'wget -P ./data/ag_news {base_url}/{filename}')
        print(f"✅ {filename} 다운로드 완료")
✅ train.csv 이미 존재 → 스킵 ✅ test.csv 이미 존재 → 스킵
In [3]:
import pandas as pd

# Load the raw AG News CSVs; the files carry no header row.
train_path = './data/ag_news/train.csv'
test_path = './data/ag_news/test.csv'
df_train = pd.read_csv(train_path, header=None)
df_test = pd.read_csv(test_path, header=None)

# Quick sanity checks: shape, a few rows, null counts, and class balance.
print("컬럼 수:", df_train.shape)
print("\n샘플 데이터:")
print(df_train.head(3))
print("\n결측값:", df_train.isnull().sum().tolist())
print("레이블 분포:", df_train[0].value_counts().to_dict())
컬럼 수: (120000, 3)
샘플 데이터:
0 1 \
0 3 Wall St. Bears Claw Back Into the Black (Reuters)
1 3 Carlyle Looks Toward Commercial Aerospace (Reuters)
2 3 Oil and Economy Cloud Stocks' Outlook (Reuters)
2
0 Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
1 Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.
2 Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
결측값: [0, 0, 0]
레이블 분포: {3: 30000, 4: 30000, 2: 30000, 1: 30000}
In [4]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Reload the raw CSVs, this time with explicit column names.
df_train = pd.read_csv('./data/ag_news/train.csv', header=None, names=['label', 'title', 'body'])
df_test = pd.read_csv('./data/ag_news/test.csv', header=None, names=['label', 'title', 'body'])
def preprocess(title, body):
    """Merge title and body into one normalized lowercase string.

    Any character outside [a-z0-9] and whitespace becomes a space, and
    runs of whitespace collapse to a single space. Non-string inputs
    (e.g. NaN) are coerced via str().
    """
    combined = f"{title} {body}".lower()
    combined = re.sub(r'[^a-z0-9\s]', ' ', combined)
    return re.sub(r'\s+', ' ', combined).strip()
# Build a single normalized text field from title + body for every row.
df_train['text'] = df_train.apply(lambda r: preprocess(r['title'], r['body']), axis=1)
df_test['text'] = df_test.apply(lambda r: preprocess(r['title'], r['body']), axis=1)
# Shift labels from 1..4 to 0..3 (CrossEntropyLoss expects 0-based classes).
df_train['label'] = df_train['label'] - 1
df_test['label'] = df_test['label'] - 1
# Stratified 80/20 train/validation split with a fixed seed for reproducibility.
df_tr, df_val = train_test_split(df_train, test_size=0.2, random_state=42, stratify=df_train['label'])
label_names = ['World', 'Sports', 'Business', 'Sci/Tech']  # index = shifted label
print(f"훈련셋: {len(df_tr):,}개")
print(f"검증셋: {len(df_val):,}개")
print(f"테스트셋: {len(df_test):,}개")
# ── 1. Dataset size comparison: label distribution per split ──
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
splits = {'Train': df_tr, 'Val': df_val, 'Test': df_test}
colors = ['#4C72B0', '#DD8452', '#55A868']
for ax, (name, df), color in zip(axes, splits.items(), colors):
    counts = df['label'].value_counts().sort_index()
    ax.bar([label_names[i] for i in counts.index], counts.values, color=color, edgecolor='white')
    ax.set_title(f'{name} ({len(df):,}개)')
    ax.set_ylabel('개수')
    # Annotate each bar with its exact count.
    for i, v in enumerate(counts.values):
        ax.text(i, v + 100, str(v), ha='center', fontweight='bold', fontsize=9)
plt.suptitle('데이터셋 분할 및 레이블 분포', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()
# ── 2. Text length distribution per split ──
# NOTE(review): this assigns 'text_len' onto the split frames in place;
# since df_tr/df_val derive from df_train, pandas may emit a
# SettingWithCopyWarning here — confirm and use .copy() after the split if so.
for df, name in [(df_tr, 'Train'), (df_val, 'Val'), (df_test, 'Test')]:
    df['text_len'] = df['text'].apply(lambda x: len(x.split()))
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (name, df), color in zip(axes, splits.items(), colors):
    ax.hist(df['text_len'], bins=50, color=color, edgecolor='white')
    ax.axvline(df['text_len'].mean(), color='red', linestyle='--', label=f'평균: {df["text_len"].mean():.0f}')
    ax.set_title(f'{name} 텍스트 길이 분포')
    ax.set_xlabel('단어 수')
    ax.set_ylabel('빈도')
    ax.legend()
plt.suptitle('텍스트 길이 분포', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()
# ── 3. Split ratio pie chart ──
fig, ax = plt.subplots(figsize=(6, 6))
sizes = [len(df_tr), len(df_val), len(df_test)]
labels = [f'Train\n{len(df_tr):,}개', f'Val\n{len(df_val):,}개', f'Test\n{len(df_test):,}개']
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',
       startangle=90, wedgeprops=dict(edgecolor='white', linewidth=2))
ax.set_title('데이터셋 분할 비율', fontsize=13, fontweight='bold')
plt.show()
훈련셋: 96,000개 검증셋: 24,000개 테스트셋: 7,600개
In [5]:
from collections import Counter
from nltk.corpus import stopwords
import nltk

# Fetch the NLTK English stopword list (no-op when already downloaded).
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Extra tokens to strip (news sources, HTML remnants, low-information words).
custom_stopwords = {
    'reuters', 'ap', 'afp', 'gt', 'lt', 'quot', 'amp',  # news sources & HTML entities
    'said', 'say', 'says',  # overly common verbs
    'monday', 'tuesday', 'wednesday', 'thursday',  # weekdays
    'friday', 'saturday', 'sunday',
    'new', 'one', 'two', 'three', 'year', 'first',  # overly generic words
}
def clean_text(text, remove_stopwords=True, min_len=2):
    """Whitespace-tokenize `text` and drop noise tokens.

    Removes NLTK + custom stopwords (when `remove_stopwords` is True),
    tokens shorter than `min_len`, and pure-digit tokens; returns the
    surviving tokens rejoined with single spaces.
    """
    kept = []
    for tok in text.split():
        if remove_stopwords and (tok in stop_words or tok in custom_stopwords):
            continue
        if len(tok) < min_len or tok.isdigit():
            continue
        kept.append(tok)
    return ' '.join(kept)
# Re-apply cleaning to every split (train, validation, test — same order as before).
for frame in (df_tr, df_val, df_test):
    frame['text_clean'] = frame['text'].apply(clean_text)

# Rebuild the vocabulary counter from the cleaned training texts only.
counter = Counter(
    token
    for doc in df_tr['text_clean']
    for token in doc.split()
)
print(f"정제 후 고유 단어 수: {len(counter):,}")
print("\n상위 20개 단어:")
for word, count in counter.most_common(20):
    print(f" {word:15s}: {count:,}")
[nltk_data] Downloading package stopwords to /root/nltk_data... [nltk_data] Package stopwords is already up-to-date!
정제 후 고유 단어 수: 58,586 상위 20개 단어: us : 10,548 world : 6,905 company : 6,151 oil : 6,020 inc : 5,522 last : 5,281 iraq : 5,100 york : 5,055 yesterday : 4,867 microsoft : 4,814 president : 4,805 game : 4,691 million : 4,643 week : 4,584 time : 4,397 corp : 4,131 united : 4,097 stocks : 3,990 com : 3,987 prices : 3,921
In [6]:
# Vocabulary size cap (keep only the top 10000 tokens).
MAX_VOCAB = 10000

# Indices 0 and 1 are reserved for padding and out-of-vocabulary tokens,
# so the most frequent words start at index 2.
vocab = {'<PAD>': 0, '<UNK>': 1}
for idx, (word, _) in enumerate(counter.most_common(MAX_VOCAB - 2), start=2):
    vocab[word] = idx

print(f"사전 크기: {len(vocab):,}")
print(f"예시: 'world' → {vocab.get('world')}, 'game' → {vocab.get('game')}")
사전 크기: 10,000 예시: 'world' → 3, 'game' → 13
In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorization (fit on the training split ONLY to avoid leakage).
tfidf = TfidfVectorizer(max_features=10000)
# NOTE(review): .toarray() densifies to roughly 96k x 10k float32 (~3.8 GB);
# consider keeping the scipy sparse matrix if memory becomes a problem.
X_train_tfidf = tfidf.fit_transform(df_tr['text_clean']).toarray().astype('float32')
X_val_tfidf = tfidf.transform(df_val['text_clean']).toarray().astype('float32')
X_test_tfidf = tfidf.transform(df_test['text_clean']).toarray().astype('float32')
print(f"X_train shape: {X_train_tfidf.shape}")
print(f"X_val shape: {X_val_tfidf.shape}")
print(f"X_test shape: {X_test_tfidf.shape}")
# Visualize the 15 terms with the highest mean TF-IDF weight on the train split.
feature_names = tfidf.get_feature_names_out()
mean_tfidf = X_train_tfidf.mean(axis=0)
top_idx = mean_tfidf.argsort()[-15:][::-1]
plt.figure(figsize=(12, 4))
plt.bar([feature_names[i] for i in top_idx], mean_tfidf[top_idx], color='steelblue')
plt.title('TF-IDF 평균값 상위 15개 단어')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
X_train shape: (96000, 10000) X_val shape: (24000, 10000) X_test shape: (7600, 10000)
In [12]:
# Re-declared after a kernel interrupt (see execution counts jumping to In[12]).
class TextCNN(nn.Module):
    """1-D CNN over a TF-IDF vector treated as a single-channel sequence.

    Note: `input_dim` is accepted but not used by any layer — the adaptive
    pooling makes the classifier head independent of the input length.
    Submodule names (conv1/conv2/pool/fc) are kept stable so state_dict
    checkpoints remain compatible.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # Two conv stages; channel widths were halved from an earlier model
        # (128→64, 256→128) per the original comments.
        self.conv1 = nn.Conv1d(1, 64, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=3, padding=1)
        # Pool each of the 128 channels down to a fixed 32 positions.
        self.pool = nn.AdaptiveMaxPool1d(32)
        self.fc = nn.Sequential(
            nn.Linear(128 * 32, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        )

    def forward(self, x):
        # (batch, features) -> (batch, 1, features) so Conv1d sees one channel.
        feats = torch.relu(self.conv1(x.unsqueeze(1)))
        feats = torch.relu(self.conv2(feats))
        pooled = self.pool(feats).flatten(1)  # same as .view(batch, -1)
        return self.fc(pooled)
# Instantiate on the global `device` (set by setup_env in the first cell).
# NOTE(review): input_dim=10000 is not referenced inside TextCNN.__init__.
model_cnn = TextCNN(input_dim=10000, num_classes=4).to(device)
total_params = sum(p.numel() for p in model_cnn.parameters())
print(f"총 파라미터 수: {total_params:,}")
총 파라미터 수: 550,020
In [13]:
from torch.utils.data import TensorDataset, DataLoader

# Integer class labels (0..3) for each split.
y_train = df_tr['label'].values
y_val = df_val['label'].values
y_test = df_test['label'].values

# Move all splits to the device up front; the DataLoaders then only slice
# resident tensors. NOTE(review): this keeps the full dense TF-IDF matrices
# in GPU memory — fits here, but CPU tensors + pin_memory would scale better.
X_train_t = torch.FloatTensor(X_train_tfidf).to(device)
X_val_t = torch.FloatTensor(X_val_tfidf).to(device)
X_test_t = torch.FloatTensor(X_test_tfidf).to(device)
y_train_t = torch.LongTensor(y_train).to(device)
y_val_t = torch.LongTensor(y_val).to(device)
y_test_t = torch.LongTensor(y_test).to(device)

# Only the training loader shuffles; val/test keep row order for evaluation.
train_loader = DataLoader(TensorDataset(X_train_t, y_train_t), batch_size=256, shuffle=True)
val_loader = DataLoader(TensorDataset(X_val_t, y_val_t), batch_size=256, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test_t, y_test_t), batch_size=256, shuffle=False)
print(f"Train 배치 수: {len(train_loader)}")
print(f"Val 배치 수: {len(val_loader)}")
print(f"Test 배치 수: {len(test_loader)}")
Train 배치 수: 375 Val 배치 수: 94 Test 배치 수: 30
In [15]:
from tqdm import tqdm

# Optimizer / loss for the CNN defined above.
optimizer = torch.optim.Adam(model_cnn.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
EPOCHS = 30
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

for epoch in range(EPOCHS):
    # ── Training ──
    model_cnn.train()
    total_loss, correct, total = 0, 0, 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1:2d}/{EPOCHS}", leave=False)
    for n_batches, (X_b, y_b) in enumerate(pbar, start=1):
        optimizer.zero_grad()
        out = model_cnn(X_b)
        loss = criterion(out, y_b)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        correct += (out.argmax(1) == y_b).sum().item()
        total += len(y_b)
        # BUG FIX: previously divided by len(pbar) (the TOTAL batch count),
        # which understated the running average mid-epoch; divide by the
        # number of batches processed so far instead.
        pbar.set_postfix({'loss': f'{total_loss/n_batches:.4f}', 'acc': f'{correct/total:.4f}'})
    train_loss = total_loss / len(train_loader)
    train_acc = correct / total

    # ── Validation (no gradients, dropout disabled via eval()) ──
    model_cnn.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for X_b, y_b in val_loader:
            out = model_cnn(X_b)
            val_loss += criterion(out, y_b).item()
            val_correct += (out.argmax(1) == y_b).sum().item()
            val_total += len(y_b)
    val_loss = val_loss / len(val_loader)
    val_acc = val_correct / val_total

    # Record per-epoch metrics for the learning-curve plots below.
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    print(f"\rEpoch {epoch+1:2d}/{EPOCHS} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}", end='', flush=True)
    print()
Epoch 1/30 | Train Loss: 0.9674 | Train Acc: 0.6015 | Val Loss: 0.9552 | Val Acc: 0.6099
Epoch 2/30 | Train Loss: 0.9578 | Train Acc: 0.6066 | Val Loss: 0.9513 | Val Acc: 0.6108
Epoch 3/30 | Train Loss: 0.9490 | Train Acc: 0.6124 | Val Loss: 0.9434 | Val Acc: 0.6145
Epoch 4/30 | Train Loss: 0.9400 | Train Acc: 0.6149 | Val Loss: 0.9406 | Val Acc: 0.6156
Epoch 5/30 | Train Loss: 0.9314 | Train Acc: 0.6188 | Val Loss: 0.9352 | Val Acc: 0.6188
Epoch 6/30 | Train Loss: 0.9223 | Train Acc: 0.6255 | Val Loss: 0.9338 | Val Acc: 0.6200
Epoch 7/30 | Train Loss: 0.9153 | Train Acc: 0.6270 | Val Loss: 0.9311 | Val Acc: 0.6210
Epoch 8/30 | Train Loss: 0.9054 | Train Acc: 0.6308 | Val Loss: 0.9267 | Val Acc: 0.6238
Epoch 9/30 | Train Loss: 0.8995 | Train Acc: 0.6355 | Val Loss: 0.9288 | Val Acc: 0.6237
Epoch 10/30 | Train Loss: 0.8925 | Train Acc: 0.6380 | Val Loss: 0.9257 | Val Acc: 0.6241
Epoch 11/30 | Train Loss: 0.8875 | Train Acc: 0.6399 | Val Loss: 0.9251 | Val Acc: 0.6260
Epoch 12/30 | Train Loss: 0.8764 | Train Acc: 0.6457 | Val Loss: 0.9271 | Val Acc: 0.6240
Epoch 13/30 | Train Loss: 0.8740 | Train Acc: 0.6446 | Val Loss: 0.9262 | Val Acc: 0.6241
Epoch 14/30 | Train Loss: 0.8665 | Train Acc: 0.6492 | Val Loss: 0.9226 | Val Acc: 0.6266
Epoch 15/30 | Train Loss: 0.8591 | Train Acc: 0.6529 | Val Loss: 0.9235 | Val Acc: 0.6272
Epoch 16/30 | Train Loss: 0.8511 | Train Acc: 0.6561 | Val Loss: 0.9220 | Val Acc: 0.6275
Epoch 17/30 | Train Loss: 0.8451 | Train Acc: 0.6583 | Val Loss: 0.9220 | Val Acc: 0.6287
Epoch 18/30 | Train Loss: 0.8396 | Train Acc: 0.6605 | Val Loss: 0.9256 | Val Acc: 0.6266
Epoch 19/30 | Train Loss: 0.8353 | Train Acc: 0.6628 | Val Loss: 0.9247 | Val Acc: 0.6243
Epoch 20/30 | Train Loss: 0.8298 | Train Acc: 0.6651 | Val Loss: 0.9213 | Val Acc: 0.6275
Epoch 21/30 | Train Loss: 0.8227 | Train Acc: 0.6683 | Val Loss: 0.9243 | Val Acc: 0.6285
Epoch 22/30 | Train Loss: 0.8178 | Train Acc: 0.6682 | Val Loss: 0.9239 | Val Acc: 0.6300
Epoch 23/30 | Train Loss: 0.8130 | Train Acc: 0.6721 | Val Loss: 0.9235 | Val Acc: 0.6302
Epoch 24/30 | Train Loss: 0.8092 | Train Acc: 0.6734 | Val Loss: 0.9225 | Val Acc: 0.6276
Epoch 25/30 | Train Loss: 0.8047 | Train Acc: 0.6761 | Val Loss: 0.9248 | Val Acc: 0.6281
Epoch 26/30 | Train Loss: 0.7999 | Train Acc: 0.6785 | Val Loss: 0.9264 | Val Acc: 0.6273
Epoch 27/30 | Train Loss: 0.7944 | Train Acc: 0.6798 | Val Loss: 0.9253 | Val Acc: 0.6285
Epoch 28/30 | Train Loss: 0.7886 | Train Acc: 0.6830 | Val Loss: 0.9256 | Val Acc: 0.6285
Epoch 29/30 | Train Loss: 0.7842 | Train Acc: 0.6841 | Val Loss: 0.9316 | Val Acc: 0.6308
Epoch 30/30 | Train Loss: 0.7823 | Train Acc: 0.6857 | Val Loss: 0.9298 | Val Acc: 0.6303
In [17]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

label_names = ['World', 'Sports', 'Business', 'Sci/Tech']
# ── Learning curves (requires `history` / `EPOCHS` from the training cell) ──
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
epochs = range(1, EPOCHS + 1)
axes[0].plot(epochs, history['train_loss'], 'b-o', markersize=5, label='Train Loss')
axes[0].plot(epochs, history['val_loss'], 'r-o', markersize=5, label='Val Loss')
axes[0].set_title('Loss 곡선', fontsize=13, fontweight='bold')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].legend()
axes[0].grid(alpha=0.3)
axes[1].plot(epochs, history['train_acc'], 'b-o', markersize=5, label='Train Acc')
axes[1].plot(epochs, history['val_acc'], 'r-o', markersize=5, label='Val Acc')
axes[1].set_title('Accuracy 곡선', fontsize=13, fontweight='bold')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy')
axes[1].set_ylim(0, 1)  # fixed scale keeps separate runs visually comparable
axes[1].legend()
axes[1].grid(alpha=0.3)
plt.suptitle(f'TF-IDF + CNN 학습 결과 | 최고 Val Acc: {max(history["val_acc"]):.4f}',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
In [18]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

# ── Test-set evaluation ──
# NOTE(review): relies on label_names / y_test / test_loader / model_cnn from
# earlier cells (hidden notebook state) — a fresh kernel must run those first.
model_cnn.eval()
all_preds = []
with torch.no_grad():
    for X_b, y_b in test_loader:
        preds = model_cnn(X_b).argmax(1).cpu().numpy()
        all_preds.extend(preds)
all_preds = np.array(all_preds)
test_acc = (all_preds == y_test).mean()
print(f"✅ 테스트 정확도: {test_acc:.4f} ({test_acc*100:.2f}%)")
print()
print(classification_report(y_test, all_preds, target_names=label_names))
# ── Confusion matrices: raw counts (left) and row-normalized rates (right) ──
cm = confusion_matrix(y_test, all_preds)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Oranges',
            xticklabels=label_names, yticklabels=label_names, ax=axes[0])
axes[0].set_title('혼동행렬 (개수)', fontsize=13, fontweight='bold')
axes[0].set_ylabel('실제 레이블')
axes[0].set_xlabel('예측 레이블')
# Normalize each row by its true-class total, i.e. per-class recall.
cm_pct = cm.astype(float) / cm.sum(axis=1, keepdims=True)
sns.heatmap(cm_pct, annot=True, fmt='.2%', cmap='Oranges',
            xticklabels=label_names, yticklabels=label_names, ax=axes[1])
axes[1].set_title('혼동행렬 (비율)', fontsize=13, fontweight='bold')
axes[1].set_ylabel('실제 레이블')
axes[1].set_xlabel('예측 레이블')
plt.suptitle(f'TF-IDF + CNN 테스트 결과 | 정확도: {test_acc*100:.2f}%',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
✅ 테스트 정확도: 0.6264 (62.64%)
precision recall f1-score support
World 0.66 0.61 0.64 1900
Sports 0.63 0.74 0.68 1900
Business 0.64 0.59 0.61 1900
Sci/Tech 0.57 0.57 0.57 1900
accuracy 0.63 7600
macro avg 0.63 0.63 0.63 7600
weighted avg 0.63 0.63 0.63 7600