In [1]:
import setup_env
--------------------------------------------------------------------------------
=== Hardware Acceleration ===
PyTorch version: 2.9.0a0+145a3a7bda.nv25.10
Using NVIDIA GPU (CUDA)
   CUDA version: 13.0
   GPU name: NVIDIA GeForce RTX 5070 Ti
   GPU count: 1
   Total GPU memory: 15.92 GB
   Allocated memory: 0.00 GB
   Free memory: 15.92 GB
Device: cuda

=== Matplotlib Settings ===
✅ Font: NanumGothic

=== System Info ===
OS: Ubuntu 24.04.3 LTS (Noble Numbat)
    Kernel: 6.6.87.2-microsoft-standard-WSL2
Architecture: x86_64
Python: 3.12.3
Working directory: /workspace/ai-deeplearning/tutorial

=== Library Versions ===
NumPy: 2.1.0
Pandas: 3.0.0
Matplotlib: 3.10.7
Scikit-learn: 1.7.2
OpenCV: Not installed → !pip install -q opencv-python
Pillow: 12.0.0
Seaborn: 0.13.2
TensorFlow: Not installed → !pip install -q tensorflow
Transformers: 4.40.1
TorchVision: 0.24.0a0+094e7af5

=== Environment setup completed ===
--------------------------------------------------------------------------------

=== Visualizing Test Plot (Wide View) ===
No description has been provided for this image
=== GPU Usage Code Snippet ===
Device set to: cuda
----------------------------------------
# 아래 코드를 복사해서 모델과 데이터를 GPU로 보내세요:
model = YourModel().to(device)
data = data.to(device)
----------------------------------------

=== Environment setup completed ===
--------------------------------------------------------------------------------
In [2]:
import os

# Create the target directory (no-op if it already exists).
os.makedirs('./data/ag_news', exist_ok=True)

# Skip files that were already downloaded.
base_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv"

for filename in ['train.csv', 'test.csv']:
    # BUG FIX: the loop variable `filename` was never interpolated into the
    # path/messages (a literal placeholder was used instead), so every
    # iteration checked and fetched the same bogus path. The recorded cell
    # output ("train.csv ...", "test.csv ...") shows the intended behavior.
    filepath = f'./data/ag_news/{filename}'
    if os.path.exists(filepath):
        print(f"✅ {filename} 이미 존재 → 스킵")
    else:
        print(f"⬇️ {filename} 다운로드 중...")
        os.system(f'wget -P ./data/ag_news {base_url}/{filename}')
        print(f"✅ {filename} 다운로드 완료")
✅ train.csv 이미 존재 → 스킵
✅ test.csv 이미 존재 → 스킵
In [3]:
import pandas as pd

# Load the raw AG News splits; the CSVs ship without a header row, so
# columns are referenced by integer position (0=label, 1=title, 2=body).
csv_dir = './data/ag_news'
df_train = pd.read_csv(f'{csv_dir}/train.csv', header=None)
df_test = pd.read_csv(f'{csv_dir}/test.csv', header=None)

# Quick sanity report on the training split: shape, a few rows,
# missing-value counts, and the class balance.
print("컬럼 수:", df_train.shape)
print("\n샘플 데이터:")
print(df_train.head(3))
print("\n결측값:", df_train.isnull().sum().tolist())
print("레이블 분포:", df_train[0].value_counts().to_dict())
컬럼 수: (120000, 3)

샘플 데이터:
   0                                                    1  \
0  3    Wall St. Bears Claw Back Into the Black (Reuters)   
1  3  Carlyle Looks Toward Commercial Aerospace (Reuters)   
2  3      Oil and Economy Cloud Stocks' Outlook (Reuters)   

                                                                                                                                                                                                                        2  
0                                                                                                                          Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.  
1  Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.  
2                                Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.  

결측값: [0, 0, 0]
레이블 분포: {3: 30000, 4: 30000, 2: 30000, 1: 30000}
In [4]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Reload the raw AG News CSVs with explicit column names.
# Columns: label (1-4 per the raw data), headline title, article body.
df_train = pd.read_csv('./data/ag_news/train.csv', header=None, names=['label', 'title', 'body'])
df_test  = pd.read_csv('./data/ag_news/test.csv',  header=None, names=['label', 'title', 'body'])

def preprocess(title, body):
    """Join title and body into one lowercase string, replace every
    non-alphanumeric character with a space, and collapse whitespace."""
    combined = (str(title) + ' ' + str(body)).lower()
    cleaned = re.sub(r'[^a-z0-9\s]', ' ', combined)
    # split()/join collapses runs of whitespace and trims both ends.
    return ' '.join(cleaned.split())

# Build the model input text from title + body, and shift labels to 0-based.
df_train['text'] = df_train.apply(lambda r: preprocess(r['title'], r['body']), axis=1)
df_test['text']  = df_test.apply(lambda r: preprocess(r['title'], r['body']), axis=1)
df_train['label'] = df_train['label'] - 1
df_test['label']  = df_test['label'] - 1

# Stratified 80/20 train/validation split keeps the class balance intact.
df_tr, df_val = train_test_split(df_train, test_size=0.2, random_state=42, stratify=df_train['label'])

label_names = ['World', 'Sports', 'Business', 'Sci/Tech']
print(f"훈련셋:   {len(df_tr):,}개")
print(f"검증셋:   {len(df_val):,}개")
print(f"테스트셋: {len(df_test):,}개")

# ── 1. Dataset sizes and per-split label distribution ──
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

splits = {'Train': df_tr, 'Val': df_val, 'Test': df_test}
colors = ['#4C72B0', '#DD8452', '#55A868']
for ax, (name, df), color in zip(axes, splits.items(), colors):
    counts = df['label'].value_counts().sort_index()
    ax.bar([label_names[i] for i in counts.index], counts.values, color=color, edgecolor='white')
    ax.set_title(f'{name} ({len(df):,}개)')
    ax.set_ylabel('개수')
    for i, v in enumerate(counts.values):
        ax.text(i, v + 100, str(v), ha='center', fontweight='bold', fontsize=9)

plt.suptitle('데이터셋 분할 및 레이블 분포', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

# ── 2. Text length distribution ──
# FIX: the original loop unpacked an unused `name` variable, and counted
# words with a Python-level apply; vectorized string ops do the same thing.
for df in (df_tr, df_val, df_test):
    df['text_len'] = df['text'].str.split().str.len()

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (name, df), color in zip(axes, splits.items(), colors):
    ax.hist(df['text_len'], bins=50, color=color, edgecolor='white')
    ax.axvline(df['text_len'].mean(), color='red', linestyle='--', label=f'평균: {df["text_len"].mean():.0f}')
    ax.set_title(f'{name} 텍스트 길이 분포')
    ax.set_xlabel('단어 수')
    ax.set_ylabel('빈도')
    ax.legend()

plt.suptitle('텍스트 길이 분포', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

# ── 3. Split ratio pie chart ──
fig, ax = plt.subplots(figsize=(6, 6))
sizes = [len(df_tr), len(df_val), len(df_test)]
labels = [f'Train\n{len(df_tr):,}개', f'Val\n{len(df_val):,}개', f'Test\n{len(df_test):,}개']
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',
       startangle=90, wedgeprops=dict(edgecolor='white', linewidth=2))
ax.set_title('데이터셋 분할 비율', fontsize=13, fontweight='bold')
plt.show()
훈련셋:   96,000개
검증셋:   24,000개
테스트셋: 7,600개
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [5]:
from collections import Counter

# Count every token in the training split. Stop words are intentionally
# kept: the RNN consumes word order, so function words carry signal.
counter_rnn = Counter(token for text in df_tr['text'] for token in text.split())

MAX_VOCAB = 20000
# Indices 0/1 are reserved for padding and unknown tokens; the remaining
# slots go to the most frequent words, capping the table at MAX_VOCAB.
vocab = {'<PAD>': 0, '<UNK>': 1}
for idx, (word, _) in enumerate(counter_rnn.most_common(MAX_VOCAB - 2), start=2):
    vocab[word] = idx

print(f"전체 고유 단어 수: {len(counter_rnn):,}")
print(f"사전 크기: {len(vocab):,}")
전체 고유 단어 수: 60,018
사전 크기: 20,000
In [6]:
# Text → index conversion with fixed-length padding.
MAX_LEN = 64

def text_to_ids(text, vocab, max_len):
    """Map a whitespace-tokenized string to a list of exactly `max_len` ids.

    Tokens missing from `vocab` map to 1 (<UNK>); the tail is padded
    with 0 (<PAD>). Texts longer than `max_len` tokens are truncated.
    """
    ids = [vocab.get(token, 1) for token in text.split()[:max_len]]
    padding = [0] * (max_len - len(ids))
    return ids + padding

# Sanity-check the conversion on the first training example.
# NOTE: relies on `df_tr`, `vocab`, and MAX_LEN defined in earlier cells.
sample = text_to_ids(df_tr['text'].iloc[0], vocab, MAX_LEN)
print(f"원본: {df_tr['text'].iloc[0][:60]}...")
print(f"변환: {sample[:10]}...")
print(f"길이: {len(sample)}")
원본: clijsters unsure about latest injury says hewitt tokyo reute...
변환: [5358, 12329, 67, 322, 890, 85, 1904, 424, 22, 4301]...
길이: 64
In [7]:
import torch
from torch.utils.data import Dataset, DataLoader

class AGNewsDataset(Dataset):
    """Pre-tokenized AG News split: each item is (token-id tensor, label tensor).

    NOTE: relies on the module-level `text_to_ids`, `vocab`, and MAX_LEN
    defined in earlier cells.
    """

    def __init__(self, texts, labels):
        # Convert every text up front so __getitem__ stays cheap in training.
        self.X = [text_to_ids(t, vocab, MAX_LEN) for t in texts]
        self.y = labels.tolist()

    def __len__(self): return len(self.y)

    def __getitem__(self, idx):
        # FIX: `torch` is used here but was only imported in a *later* cell,
        # so a fresh Restart-&-Run-All raised NameError; it is imported above.
        return (torch.tensor(self.X[idx], dtype=torch.long),
                torch.tensor(self.y[idx], dtype=torch.long))

# Materialize one dataset per split, then wrap each in a DataLoader.
# Only the training loader shuffles; evaluation order stays deterministic.
BATCH_SIZE = 256

train_dataset = AGNewsDataset(df_tr['text'].values, df_tr['label'].values)
val_dataset = AGNewsDataset(df_val['text'].values, df_val['label'].values)
test_dataset = AGNewsDataset(df_test['text'].values, df_test['label'].values)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"Train 배치 수: {len(train_loader)}")
print(f"Val   배치 수: {len(val_loader)}")
print(f"Test  배치 수: {len(test_loader)}")
Train 배치 수: 375
Val   배치 수: 94
Test  배치 수: 30
In [9]:
import os

fasttext_dir = './data/fasttext'
fasttext_file = f'{fasttext_dir}/crawl-300d-2M.vec'

os.makedirs(fasttext_dir, exist_ok=True)

# Fetch and unpack the pre-trained FastText vectors once; re-runs skip.
if not os.path.exists(fasttext_file):
    print("⬇️ FastText 다운로드 중... (600MB, 시간이 걸릴 수 있어)")
    zip_url = 'https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip'
    os.system(f'wget -P {fasttext_dir} {zip_url}')
    os.system(f'unzip {fasttext_dir}/crawl-300d-2M.vec.zip -d {fasttext_dir}')
    # Remove the archive once the .vec file has been extracted.
    os.system(f'rm {fasttext_dir}/crawl-300d-2M.vec.zip')
    print("✅ FastText 다운로드 완료!")
else:
    print("✅ FastText 파일 이미 존재 → 스킵")
✅ FastText 파일 이미 존재 → 스킵
In [10]:
import numpy as np

EMBED_DIM = 300  # FastText crawl vectors are 300-dimensional

def load_fasttext(fasttext_path, vocab, embed_dim):
    """Build a (len(vocab), embed_dim) float32 embedding matrix from a .vec file.

    Words found in the file get their pre-trained vector; missing words keep
    a small uniform random init, and row 0 (<PAD>) stays all-zero. Prints a
    coverage summary and returns the numpy matrix.
    """
    embedding_matrix = np.random.uniform(-0.1, 0.1, (len(vocab), embed_dim)).astype('float32')
    embedding_matrix[0] = 0  # <PAD> must embed to the zero vector

    found = 0
    with open(fasttext_path, 'r', encoding='utf-8') as f:
        next(f)  # first line is the "<count> <dim>" header
        for line in f:
            parts = line.rstrip().split(' ')
            # FIX: parse the vector from the right so tokens containing
            # spaces (or malformed lines) don't crash the float conversion;
            # lines that don't carry a full vector are skipped.
            if len(parts) <= embed_dim:
                continue
            word = ' '.join(parts[:-embed_dim])
            if word in vocab:
                embedding_matrix[vocab[word]] = np.array(parts[-embed_dim:], dtype='float32')
                found += 1

    print(f"전체 사전 크기:      {len(vocab):,}")
    print(f"FastText 커버 단어:  {found:,} ({found/len(vocab)*100:.1f}%)")
    print(f"임베딩 행렬 shape:   {embedding_matrix.shape}")
    return embedding_matrix

# Load pre-trained vectors for the 20k-word vocabulary built earlier.
embedding_matrix = load_fasttext('./data/fasttext/crawl-300d-2M.vec', vocab, EMBED_DIM)
전체 사전 크기:      20,000
FastText 커버 단어:  19,003 (95.0%)
임베딩 행렬 shape:   (20000, 300)
In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Attention(nn.Module):
    """Attention pooling: score each timestep with a learned linear
    projection, softmax over the sequence, and return the weighted sum."""

    def __init__(self, hidden_dim):
        super().__init__()
        # Produces one scalar score per timestep.
        self.attn = nn.Linear(hidden_dim, 1)

    def forward(self, lstm_output):
        """lstm_output: [batch, seq, hidden] → (context [batch, hidden], weights [batch, seq])."""
        raw_scores = self.attn(lstm_output)                  # [batch, seq, 1]
        weights = F.softmax(raw_scores.squeeze(-1), dim=-1)  # normalized over seq
        # Weighted sum over the sequence dimension.
        context = torch.sum(lstm_output * weights.unsqueeze(-1), dim=1)
        return context, weights

class LSTMAttentionClassifier(nn.Module):
    """2-layer LSTM over (optionally pre-trained) word embeddings, pooled by
    attention, followed by a linear classification head.

    Depends on the `Attention` module defined alongside it.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, embedding_matrix=None):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            # Initialize from the pre-trained matrix and keep the weights
            # trainable so they get fine-tuned with the rest of the model.
            self.embedding.weight.data.copy_(torch.FloatTensor(embedding_matrix))
            self.embedding.weight.requires_grad = True

        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=2,
            batch_first=True,
            dropout=0.3,  # applied between the two LSTM layers
        )
        self.attention = Attention(hidden_dim)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """x: [batch, seq] token ids → (logits [batch, output_dim], attention weights [batch, seq])."""
        embedded = self.dropout(self.embedding(x))
        lstm_out, _ = self.lstm(embedded)
        context, weights = self.attention(lstm_out)
        logits = self.fc(self.dropout(context))
        return logits, weights

# Instantiate the classifier and move it to the active device.
# NOTE(review): `device` is presumably exported by the `setup_env` import in
# the first cell (its output prints "Device: cuda") — confirm it is defined there.
model_attn = LSTMAttentionClassifier(
    vocab_size=len(vocab),
    embed_dim=EMBED_DIM,
    hidden_dim=128,
    output_dim=4,  # four AG News classes
    embedding_matrix=embedding_matrix
).to(device)

# Parameter counts; the two match because the embeddings are fine-tuned too.
total_params = sum(p.numel() for p in model_attn.parameters())
train_params = sum(p.numel() for p in model_attn.parameters() if p.requires_grad)
print(f"총 파라미터 수:       {total_params:,}")
print(f"학습가능 파라미터 수: {train_params:,}")
총 파라미터 수:       6,352,901
학습가능 파라미터 수: 6,352,901
In [12]:
from torch.utils.data import Dataset, DataLoader

# NOTE(review): this cell duplicates the DataLoader construction from the
# earlier dataset cell (same datasets, same batch size, same shuffle flags);
# consider deleting one of the two copies.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=256, shuffle=False)
test_loader  = DataLoader(test_dataset,  batch_size=256, shuffle=False)

print(f"Train 배치 수: {len(train_loader)}")
print(f"Val   배치 수: {len(val_loader)}")
print(f"Test  배치 수: {len(test_loader)}")
Train 배치 수: 375
Val   배치 수: 94
Test  배치 수: 30
In [13]:
from tqdm import tqdm

# Train the attention model: Adam + cross-entropy, gradient clipping,
# per-epoch validation. Metrics are accumulated in `history` for plotting.
optimizer = torch.optim.Adam(model_attn.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
EPOCHS = 10

history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

for epoch in range(EPOCHS):
    # ── training phase ──
    model_attn.train()
    total_loss, correct, total = 0, 0, 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1:2d}/{EPOCHS}", leave=False)
    for batch_idx, (X_b, y_b) in enumerate(pbar, start=1):
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        out, _ = model_attn(X_b)
        loss = criterion(out, y_b)
        loss.backward()
        # Clip gradients to stabilize LSTM training.
        torch.nn.utils.clip_grad_norm_(model_attn.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
        correct += (out.argmax(1) == y_b).sum().item()
        total += len(y_b)
        # FIX: divide by batches processed so far — the original divided by
        # the total batch count, so the running loss was under-reported
        # early in each epoch.
        pbar.set_postfix({'loss': f'{total_loss/batch_idx:.4f}', 'acc': f'{correct/total:.4f}'})

    train_loss = total_loss / len(train_loader)
    train_acc  = correct / total

    # ── validation phase (no gradient tracking) ──
    model_attn.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for X_b, y_b in val_loader:
            X_b, y_b = X_b.to(device), y_b.to(device)
            out, _ = model_attn(X_b)
            val_loss    += criterion(out, y_b).item()
            val_correct += (out.argmax(1) == y_b).sum().item()
            val_total   += len(y_b)

    val_loss = val_loss / len(val_loader)
    val_acc  = val_correct / val_total

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    print(f"\rEpoch {epoch+1:2d}/{EPOCHS} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}", end='', flush=True)

print()
                                                                                        
Epoch  1/10 | Train Loss: 0.3720 | Train Acc: 0.8682 | Val Loss: 0.2407 | Val Acc: 0.9202
                                                                                        
Epoch  2/10 | Train Loss: 0.2233 | Train Acc: 0.9278 | Val Loss: 0.2244 | Val Acc: 0.9249
                                                                                        
Epoch  3/10 | Train Loss: 0.1865 | Train Acc: 0.9392 | Val Loss: 0.2278 | Val Acc: 0.9255
                                                                                        
Epoch  4/10 | Train Loss: 0.1569 | Train Acc: 0.9477 | Val Loss: 0.2466 | Val Acc: 0.9199
                                                                                        
Epoch  5/10 | Train Loss: 0.1346 | Train Acc: 0.9540 | Val Loss: 0.2469 | Val Acc: 0.9207
                                                                                        
Epoch  6/10 | Train Loss: 0.1117 | Train Acc: 0.9614 | Val Loss: 0.2600 | Val Acc: 0.9226
                                                                                        
Epoch  7/10 | Train Loss: 0.0955 | Train Acc: 0.9667 | Val Loss: 0.2917 | Val Acc: 0.9150
                                                                                        
Epoch  8/10 | Train Loss: 0.0807 | Train Acc: 0.9721 | Val Loss: 0.3049 | Val Acc: 0.9151
                                                                                        
Epoch  9/10 | Train Loss: 0.0679 | Train Acc: 0.9761 | Val Loss: 0.3272 | Val Acc: 0.9170
                                                                                        
Epoch 10/10 | Train Loss: 0.0565 | Train Acc: 0.9801 | Val Loss: 0.3520 | Val Acc: 0.9143
In [14]:
import matplotlib.pyplot as plt

# Learning curves: loss (left) and accuracy (right), train vs. validation.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
epochs = range(1, EPOCHS + 1)

curve_specs = [
    (axes[0], 'train_loss', 'val_loss', 'Train Loss', 'Val Loss', 'Loss 곡선', 'Loss'),
    (axes[1], 'train_acc', 'val_acc', 'Train Acc', 'Val Acc', 'Accuracy 곡선', 'Accuracy'),
]
for ax, train_key, val_key, train_label, val_label, title, ylabel in curve_specs:
    ax.plot(epochs, history[train_key], 'b-o', markersize=5, label=train_label)
    ax.plot(epochs, history[val_key], 'r-o', markersize=5, label=val_label)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.legend()
    ax.grid(alpha=0.3)

# Pin the accuracy axis to [0, 1] so epochs are comparable across runs.
axes[1].set_ylim(0, 1)

plt.suptitle(f'LSTM + Attention (FastText fine-tune) 학습 결과 | 최고 Val Acc: {max(history["val_acc"]):.4f}',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [15]:
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

label_names = ['World', 'Sports', 'Business', 'Sci/Tech']

# ── Collect predictions (and attention weights) over the full test set ──
model_attn.eval()
all_preds, all_weights = [], []
with torch.no_grad():
    for X_b, y_b in test_loader:
        X_b = X_b.to(device)
        out, weights = model_attn(X_b)
        all_preds.extend(out.argmax(1).cpu().numpy())
        all_weights.append(weights.cpu().numpy())

# NOTE: relies on test_loader preserving dataset order (shuffle=False),
# so predictions line up with df_test rows.
all_preds  = np.array(all_preds)
y_test_arr = df_test['label'].values
test_acc   = (all_preds == y_test_arr).mean()
print(f"✅ 테스트 정확도: {test_acc:.4f} ({test_acc*100:.2f}%)")
print()
print(classification_report(y_test_arr, all_preds, target_names=label_names))

# ── Confusion matrices: raw counts (left) and row-normalized rates (right) ──
cm = confusion_matrix(y_test_arr, all_preds)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

sns.heatmap(cm, annot=True, fmt='d', cmap='Purples',
            xticklabels=label_names, yticklabels=label_names, ax=axes[0])
axes[0].set_title('혼동행렬 (개수)', fontsize=13, fontweight='bold')
axes[0].set_ylabel('실제 레이블')
axes[0].set_xlabel('예측 레이블')

# Normalizing each row by the true-class total gives per-class recall.
cm_pct = cm.astype(float) / cm.sum(axis=1, keepdims=True)
sns.heatmap(cm_pct, annot=True, fmt='.2%', cmap='Purples',
            xticklabels=label_names, yticklabels=label_names, ax=axes[1])
axes[1].set_title('혼동행렬 (비율)', fontsize=13, fontweight='bold')
axes[1].set_ylabel('실제 레이블')
axes[1].set_xlabel('예측 레이블')

plt.suptitle(f'LSTM + Attention (FastText fine-tune) 테스트 결과 | 정확도: {test_acc*100:.2f}%',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# ── Attention weight visualization (one sample) ──
# Uses sample 0 of the first test batch; weights are truncated to the
# sample's actual token count so padding positions are not plotted.
sample_idx = 0
sample_text = df_test['text'].iloc[sample_idx].split()[:MAX_LEN]
sample_weights = all_weights[0][sample_idx][:len(sample_text)]

plt.figure(figsize=(14, 3))
plt.bar(range(len(sample_text)), sample_weights, color='purple', alpha=0.7)
plt.xticks(range(len(sample_text)), sample_text, rotation=45, ha='right', fontsize=9)
plt.title(f'Attention 가중치 시각화 (예측: {label_names[all_preds[sample_idx]]} / 실제: {label_names[y_test_arr[sample_idx]]})')
plt.ylabel('Attention 가중치')
plt.tight_layout()
plt.show()
✅ 테스트 정확도: 0.9155 (91.55%)

              precision    recall  f1-score   support

       World       0.93      0.91      0.92      1900
      Sports       0.97      0.98      0.97      1900
    Business       0.90      0.87      0.88      1900
    Sci/Tech       0.87      0.91      0.89      1900

    accuracy                           0.92      7600
   macro avg       0.92      0.92      0.92      7600
weighted avg       0.92      0.92      0.92      7600

No description has been provided for this image
No description has been provided for this image