In [14]:
import setup_env
--------------------------------------------------------------------------------
=== Hardware Acceleration ===
PyTorch version: 2.9.0a0+145a3a7bda.nv25.10
Using NVIDIA GPU (CUDA)
CUDA version: 13.0
GPU name: NVIDIA GeForce RTX 5070 Ti
GPU count: 1
Total GPU memory: 15.92 GB
Allocated memory: 1.88 GB
Free memory: 14.04 GB
Device: cuda
=== Matplotlib Settings ===
✅ Font: NanumGothic
=== System Info ===
OS: Ubuntu 24.04.3 LTS (Noble Numbat)
Kernel: 6.6.87.2-microsoft-standard-WSL2
Architecture: x86_64
Python: 3.12.3
Working directory: /workspace/ai-deeplearning/tutorial
=== Library Versions ===
NumPy: 2.1.0
Pandas: 3.0.0
Matplotlib: 3.10.7
Scikit-learn: 1.7.2
OpenCV: Not installed → !pip install -q opencv-python
Pillow: 12.0.0
Seaborn: 0.13.2
TensorFlow: Not installed → !pip install -q tensorflow
Transformers: 4.40.1
TorchVision: 0.24.0a0+094e7af5
=== Environment setup completed ===
--------------------------------------------------------------------------------
=== Visualizing Test Plot (Wide View) ===
=== GPU Usage Code Snippet === Device set to: cuda ---------------------------------------- # 아래 코드를 복사해서 모델과 데이터를 GPU로 보내세요: model = YourModel().to(device) data = data.to(device) ---------------------------------------- === Environment setup completed === --------------------------------------------------------------------------------
In [2]:
import os

# Make sure the target directory exists (no-op if it already does).
os.makedirs('./data/ag_news', exist_ok=True)

# AG News CSVs mirrored in the CharCnn_Keras repository.
base_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv"

# Download each split unless it is already on disk.
for filename in ['train.csv', 'test.csv']:
    # BUG FIX: the f-string placeholders had been corrupted to "(unknown)";
    # restore {filename} so the path, messages, and wget URL are correct
    # (the cell output "✅ train.csv 이미 존재 → 스킵" confirms this was the intent).
    filepath = f'./data/ag_news/{filename}'
    if os.path.exists(filepath):
        print(f"✅ {filename} 이미 존재 → 스킵")
    else:
        print(f"⬇️ {filename} 다운로드 중...")
        # Check wget's exit status instead of assuming success.
        ret = os.system(f'wget -P ./data/ag_news {base_url}/{filename}')
        if ret == 0:
            print(f"✅ {filename} 다운로드 완료")
        else:
            print(f"❌ {filename} 다운로드 실패 (exit code {ret})")
✅ train.csv 이미 존재 → 스킵 ✅ test.csv 이미 존재 → 스킵
In [3]:
import pandas as pd

# The AG News CSVs ship without a header row, so read them positionally:
# column 0 = class label (1..4), 1 = title, 2 = body.
df_train = pd.read_csv('./data/ag_news/train.csv', header=None)
df_test = pd.read_csv('./data/ag_news/test.csv', header=None)

# Quick structural sanity checks on the training split.
print("컬럼 수:", df_train.shape)
print("\n샘플 데이터:")
print(df_train.head(3))
print("\n결측값:", df_train.isnull().sum().tolist())
# Label distribution — output shows a perfectly balanced 4-class dataset.
print("레이블 분포:", df_train[0].value_counts().to_dict())
컬럼 수: (120000, 3)
샘플 데이터:
0 1 \
0 3 Wall St. Bears Claw Back Into the Black (Reuters)
1 3 Carlyle Looks Toward Commercial Aerospace (Reuters)
2 3 Oil and Economy Cloud Stocks' Outlook (Reuters)
2
0 Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
1 Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.
2 Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.
결측값: [0, 0, 0]
레이블 분포: {3: 30000, 4: 30000, 2: 30000, 1: 30000}
In [5]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
# Reload with explicit column names; files still have no header row.
df_train = pd.read_csv('./data/ag_news/train.csv', header=None, names=['label', 'title', 'body'])
df_test = pd.read_csv('./data/ag_news/test.csv', header=None, names=['label', 'title', 'body'])
# BERT-style models need only minimal preprocessing.
def preprocess(title, body):
    """Join a headline and body into a single input string.

    Both arguments are coerced with str() (so NaN floats from missing
    cells become the text 'nan' rather than raising), then outer
    whitespace is trimmed.
    """
    joined = ' '.join((str(title), str(body)))
    return joined.strip()
# Build the model input by concatenating title and body for every row.
df_train['text'] = df_train.apply(lambda r: preprocess(r['title'], r['body']), axis=1)
df_test['text'] = df_test.apply(lambda r: preprocess(r['title'], r['body']), axis=1)

# Shift labels from 1..4 to 0..3 (CrossEntropyLoss expects 0-based classes).
df_train['label'] = df_train['label'] - 1
df_test['label'] = df_test['label'] - 1

# 80/20 stratified split preserves the class balance in both subsets.
df_tr, df_val = train_test_split(df_train, test_size=0.2, random_state=42, stratify=df_train['label'])

label_names = ['World', 'Sports', 'Business', 'Sci/Tech']
print(f"훈련셋: {len(df_tr):,}개")
print(f"검증셋: {len(df_val):,}개")
print(f"테스트셋: {len(df_test):,}개")

# ── 1. Dataset sizes and label distribution ──
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
splits = {'Train': df_tr, 'Val': df_val, 'Test': df_test}
colors = ['#4C72B0', '#DD8452', '#55A868']
for ax, (name, df), color in zip(axes, splits.items(), colors):
    counts = df['label'].value_counts().sort_index()
    ax.bar([label_names[i] for i in counts.index], counts.values, color=color, edgecolor='white')
    ax.set_title(f'{name} ({len(df):,}개)')
    ax.set_ylabel('개수')
    # Annotate each bar with its exact count.
    for i, v in enumerate(counts.values):
        ax.text(i, v + 100, str(v), ha='center', fontweight='bold', fontsize=9)
plt.suptitle('데이터셋 분할 및 레이블 분포', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

# ── 2. Text length distribution (whitespace token counts) ──
# NOTE(review): df_tr/df_val derive from df_train via train_test_split;
# assigning a new column here can trigger SettingWithCopyWarning — consider
# taking .copy() after the split. Confirm before changing.
for df, name in [(df_tr, 'Train'), (df_val, 'Val'), (df_test, 'Test')]:
    df['text_len'] = df['text'].apply(lambda x: len(x.split()))
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (name, df), color in zip(axes, splits.items(), colors):
    ax.hist(df['text_len'], bins=50, color=color, edgecolor='white')
    # Dashed red line marks the mean word count of the split.
    ax.axvline(df['text_len'].mean(), color='red', linestyle='--', label=f'평균: {df["text_len"].mean():.0f}')
    ax.set_title(f'{name} 텍스트 길이 분포')
    ax.set_xlabel('단어 수')
    ax.set_ylabel('빈도')
    ax.legend()
plt.suptitle('텍스트 길이 분포', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

# ── 3. Split-ratio pie chart ──
fig, ax = plt.subplots(figsize=(6, 6))
sizes = [len(df_tr), len(df_val), len(df_test)]
labels = [f'Train\n{len(df_tr):,}개', f'Val\n{len(df_val):,}개', f'Test\n{len(df_test):,}개']
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',
       startangle=90, wedgeprops=dict(edgecolor='white', linewidth=2))
ax.set_title('데이터셋 분할 비율', fontsize=13, fontweight='bold')
plt.show()
훈련셋: 96,000개 검증셋: 24,000개 테스트셋: 7,600개
In [6]:
from transformers import BertTokenizer
import torch
from torch.utils.data import Dataset
# Pretrained BERT (uncased) WordPiece tokenizer; downloaded once, then cached.
# NOTE(review): a later cell fine-tunes RoBERTa, which normally pairs with its
# own BPE tokenizer — confirm this BERT-tokenizer/RoBERTa-model pairing is intentional.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LEN = 128  # BERT typically uses 128-512; 128 covers short news title+body texts
class AGNewsBERTDataset(Dataset):
    """Torch Dataset over a DataFrame with 'text' and 'label' columns.

    Tokenization happens lazily, per item, in __getitem__: each text is
    encoded to a fixed-length sequence (padded/truncated to `max_len`).
    """

    def __init__(self, df, tokenizer, max_len):
        self.texts = df['text'].tolist()
        self.labels = df['label'].tolist()
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of examples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Encode example `idx`; returns 1-D tensors of length max_len plus the label."""
        enc = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # The tokenizer returns [1, max_len] tensors; drop the batch axis so
        # the DataLoader can stack items itself.
        item = {
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'label': torch.tensor(self.labels[idx], dtype=torch.long),
        }
        return item
# Build one Dataset per split (tokenization happens lazily in __getitem__).
train_dataset = AGNewsBERTDataset(df_tr, tokenizer, MAX_LEN)
val_dataset = AGNewsBERTDataset(df_val, tokenizer, MAX_LEN)
test_dataset = AGNewsBERTDataset(df_test, tokenizer, MAX_LEN)
print(f"Train 샘플 수: {len(train_dataset):,}")
print(f"Val 샘플 수: {len(val_dataset):,}")
print(f"Test 샘플 수: {len(test_dataset):,}")
# Inspect one encoded sample to verify length and tokenization.
sample = train_dataset[0]
print(f"\nInput IDs 길이: {len(sample['input_ids'])}")
print(f"첫 10개 토큰: {sample['input_ids'][:10]}")
print(f"디코딩: {tokenizer.decode(sample['input_ids'][:20])}")
Train 샘플 수: 96,000 Val 샘플 수: 24,000 Test 샘플 수: 7,600 Input IDs 길이: 128 첫 10개 토큰: tensor([ 101, 18856, 28418, 15608, 12422, 2055, 6745, 4544, 1010, 2758]) 디코딩: [CLS] clijsters unsure about latest injury, says hewitt tokyo ( reuters ) - belgian kim clij
In [7]:
from torch.utils.data import DataLoader

# Shuffle only the training split; evaluation order must stay fixed because
# the test cell compares predictions against df_test's row order.
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)
print(f"Train 배치 수: {len(train_loader)}")
print(f"Val 배치 수: {len(val_loader)}")
print(f"Test 배치 수: {len(test_loader)}")
# Inspect one batch's tensor shapes: [64, 128] ids/masks, [64] labels.
sample_batch = next(iter(train_loader))
print(f"\n배치 구조:")
print(f" input_ids shape: {sample_batch['input_ids'].shape}")
print(f" attention_mask shape: {sample_batch['attention_mask'].shape}")
print(f" label shape: {sample_batch['label'].shape}")
Train 배치 수: 1500 Val 배치 수: 375 Test 배치 수: 119 배치 구조: input_ids shape: torch.Size([64, 128]) attention_mask shape: torch.Size([64, 128]) label shape: torch.Size([64])
In [10]:
import os
import torch
import torch.nn as nn
from transformers import RobertaModel, RobertaTokenizer
# Local directory caching the RoBERTa checkpoint.
ROBERTA_PATH = './data/roberta'
# Download the pretrained model once; later runs reuse the local copy.
if not os.path.exists(ROBERTA_PATH):
    print(f"'{ROBERTA_PATH}' 폴더가 없습니다. RoBERTa 모델을 다운로드합니다...")
    os.makedirs(ROBERTA_PATH, exist_ok=True)
    # Fetch tokenizer and encoder weights from the Hugging Face hub.
    tokenizer_download = RobertaTokenizer.from_pretrained('roberta-base')
    model_download = RobertaModel.from_pretrained('roberta-base')
    # Save to disk so subsequent runs can work offline.
    tokenizer_download.save_pretrained(ROBERTA_PATH)
    model_download.save_pretrained(ROBERTA_PATH)
    print(f"✅ RoBERTa 모델 다운로드 완료: {ROBERTA_PATH}")
else:
    print(f"✅ RoBERTa 모델이 이미 존재합니다: {ROBERTA_PATH}")
# Load the tokenizer from the local copy.
# NOTE(review): this rebinds `tokenizer` (previously a BertTokenizer). The
# datasets above were built with the BERT tokenizer, so the RoBERTa model is
# being fed BERT token ids — confirm this mismatch is intentional.
print(f"로컬에서 RoBERTa 모델 로드 중...")
tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_PATH)
class RoBERTaClassifier(nn.Module):
    """RoBERTa encoder plus a small dropout+linear head for 4-way classification.

    Args:
        roberta_path: local directory holding the pretrained checkpoint.
        num_classes: number of output classes (default 4 for AG News).
        dropout: dropout probability applied before the linear head.
    """

    def __init__(self, roberta_path, num_classes=4, dropout=0.3):
        super().__init__()
        # Backbone loaded from the locally cached checkpoint.
        self.roberta = RobertaModel.from_pretrained(roberta_path)
        hidden = self.roberta.config.hidden_size  # 768 for roberta-base
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return raw class logits of shape [batch, num_classes]."""
        encoded = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output: [batch, hidden] summary of the first token.
        # NOTE(review): RoBERTa's pooler weights are randomly initialized
        # (see the load warning in the cell output) — they get trained here.
        pooled = encoded.pooler_output
        return self.fc(self.dropout(pooled))
# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_roberta = RoBERTaClassifier(roberta_path=ROBERTA_PATH, num_classes=4).to(device)
# Parameter counts; total == trainable here, i.e. full fine-tuning (no frozen layers).
total_params = sum(p.numel() for p in model_roberta.parameters())
trainable_params = sum(p.numel() for p in model_roberta.parameters() if p.requires_grad)
print(f"총 파라미터 수: {total_params:,}")
print(f"학습 가능 파라미터: {trainable_params:,}")
print(f"GPU 사용: {torch.cuda.is_available()}")
'./data/roberta' 폴더가 없습니다. RoBERTa 모델을 다운로드합니다...
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight'] You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
✅ RoBERTa 모델 다운로드 완료: ./data/roberta 로컬에서 RoBERTa 모델 로드 중... 총 파라미터 수: 124,648,708 학습 가능 파라미터: 124,648,708 GPU 사용: True
In [11]:
from tqdm import tqdm

# Standard fine-tuning setup: AdamW with a small LR + cross-entropy.
optimizer = torch.optim.AdamW(model_roberta.parameters(), lr=2e-5)  # pretrained transformers need a small lr
criterion = nn.CrossEntropyLoss()

EPOCHS = 5  # a few epochs suffice when starting from pretrained weights

history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

for epoch in range(EPOCHS):
    # ---- training phase ----
    model_roberta.train()
    total_loss, correct, total = 0, 0, 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1:2d}/{EPOCHS}", leave=False)
    for num_batches, batch in enumerate(pbar, start=1):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        out = model_roberta(input_ids, attention_mask)
        loss = criterion(out, labels)
        loss.backward()
        # Gradient clipping stabilizes transformer fine-tuning.
        torch.nn.utils.clip_grad_norm_(model_roberta.parameters(), 1.0)
        optimizer.step()

        total_loss += loss.item()
        correct += (out.argmax(1) == labels).sum().item()
        total += len(labels)
        # BUG FIX: the running average previously divided by len(pbar) — the
        # FULL number of batches in the epoch — so the progress bar understated
        # the loss mid-epoch. Divide by batches processed so far instead.
        pbar.set_postfix({'loss': f'{total_loss/num_batches:.4f}', 'acc': f'{correct/total:.4f}'})
    train_loss = total_loss / len(train_loader)
    train_acc = correct / total

    # ---- validation phase (no gradients) ----
    model_roberta.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            out = model_roberta(input_ids, attention_mask)
            val_loss += criterion(out, labels).item()
            val_correct += (out.argmax(1) == labels).sum().item()
            val_total += len(labels)
    val_loss = val_loss / len(val_loader)
    val_acc = val_correct / val_total

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    print(f"\rEpoch {epoch+1:2d}/{EPOCHS} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}", end='', flush=True)
print()
Epoch 1/5 | Train Loss: 0.7173 | Train Acc: 0.7032 | Val Loss: 0.3631 | Val Acc: 0.8741
Epoch 2/5 | Train Loss: 0.3344 | Train Acc: 0.8826 | Val Loss: 0.2929 | Val Acc: 0.9023
Epoch 3/5 | Train Loss: 0.2510 | Train Acc: 0.9132 | Val Loss: 0.2826 | Val Acc: 0.9036
Epoch 4/5 | Train Loss: 0.2036 | Train Acc: 0.9299 | Val Loss: 0.2406 | Val Acc: 0.9204
Epoch 5/5 | Train Loss: 0.1668 | Train Acc: 0.9434 | Val Loss: 0.2606 | Val Acc: 0.9207
In [12]:
import matplotlib.pyplot as plt

# Plot training curves side by side: loss (left) and accuracy (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
epochs = range(1, EPOCHS + 1)
axes[0].plot(epochs, history['train_loss'], 'b-o', markersize=5, label='Train Loss')
axes[0].plot(epochs, history['val_loss'], 'r-o', markersize=5, label='Val Loss')
axes[0].set_title('Loss 곡선', fontsize=13, fontweight='bold')
axes[0].set_xlabel('Epoch')
axes[0].set_ylabel('Loss')
axes[0].legend()
axes[0].grid(alpha=0.3)
axes[1].plot(epochs, history['train_acc'], 'b-o', markersize=5, label='Train Acc')
axes[1].plot(epochs, history['val_acc'], 'r-o', markersize=5, label='Val Acc')
axes[1].set_title('Accuracy 곡선', fontsize=13, fontweight='bold')
axes[1].set_xlabel('Epoch')
axes[1].set_ylabel('Accuracy')
# Fixed 0-1 range keeps the accuracy plateau visually honest.
axes[1].set_ylim(0, 1)
axes[1].legend()
axes[1].grid(alpha=0.3)
plt.suptitle(f'RoBERTa (사전학습 모델 파인튜닝) 학습 결과 | 최고 Val Acc: {max(history["val_acc"]):.4f}',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
In [13]:
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
label_names = ['World', 'Sports', 'Business', 'Sci/Tech']
# Collect test-set predictions (eval mode, no gradients).
model_roberta.eval()
all_preds = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        out = model_roberta(input_ids, attention_mask)
        all_preds.extend(out.argmax(1).cpu().numpy())
all_preds = np.array(all_preds)
# Prediction order matches df_test rows because test_loader has shuffle=False.
y_test_arr = df_test['label'].values
test_acc = (all_preds == y_test_arr).mean()
print(f"✅ 테스트 정확도: {test_acc:.4f} ({test_acc*100:.2f}%)")
print()
print(classification_report(y_test_arr, all_preds, target_names=label_names))
# Confusion matrices: raw counts (left) and row-normalized rates (right).
cm = confusion_matrix(y_test_arr, all_preds)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens',
            xticklabels=label_names, yticklabels=label_names, ax=axes[0])
axes[0].set_title('혼동행렬 (개수)', fontsize=13, fontweight='bold')
axes[0].set_ylabel('실제 레이블')
axes[0].set_xlabel('예측 레이블')
# Dividing each row by its true-class total gives per-class recall rates.
cm_pct = cm.astype(float) / cm.sum(axis=1, keepdims=True)
sns.heatmap(cm_pct, annot=True, fmt='.2%', cmap='Greens',
            xticklabels=label_names, yticklabels=label_names, ax=axes[1])
axes[1].set_title('혼동행렬 (비율)', fontsize=13, fontweight='bold')
axes[1].set_ylabel('실제 레이블')
axes[1].set_xlabel('예측 레이블')
plt.suptitle(f'RoBERTa (사전학습 모델 파인튜닝) 테스트 결과 | 정확도: {test_acc*100:.2f}%',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
✅ 테스트 정확도: 0.9184 (91.84%)
precision recall f1-score support
World 0.94 0.91 0.93 1900
Sports 0.95 0.98 0.96 1900
Business 0.90 0.88 0.89 1900
Sci/Tech 0.89 0.90 0.89 1900
accuracy 0.92 7600
macro avg 0.92 0.92 0.92 7600
weighted avg 0.92 0.92 0.92 7600