In [1]:
import setup_env
--------------------------------------------------------------------------------
=== Hardware Acceleration ===
PyTorch version: 2.9.0a0+145a3a7bda.nv25.10
Using NVIDIA GPU (CUDA)
   CUDA version: 13.0
   GPU name: NVIDIA GeForce RTX 5070 Ti
   GPU count: 1
   Total GPU memory: 15.92 GB
   Allocated memory: 0.00 GB
   Free memory: 15.92 GB
Device: cuda

=== Matplotlib Settings ===
✅ Font: NanumGothic

=== System Info ===
OS: Ubuntu 24.04.3 LTS (Noble Numbat)
    Kernel: 6.6.87.2-microsoft-standard-WSL2
Architecture: x86_64
Python: 3.12.3
Working directory: /workspace/ai-deeplearning/tutorial

=== Library Versions ===
NumPy: 2.1.0
Pandas: 3.0.0
Matplotlib: 3.10.7
Scikit-learn: 1.7.2
OpenCV: Not installed → !pip install -q opencv-python
Pillow: 12.0.0
Seaborn: 0.13.2
TensorFlow: Not installed → !pip install -q tensorflow
Transformers: 4.40.1
TorchVision: 0.24.0a0+094e7af5

=== Environment setup completed ===
--------------------------------------------------------------------------------

=== Visualizing Test Plot (Wide View) ===
No description has been provided for this image
=== GPU Usage Code Snippet ===
Device set to: cuda
----------------------------------------
# 아래 코드를 복사해서 모델과 데이터를 GPU로 보내세요:
model = YourModel().to(device)
data = data.to(device)
----------------------------------------

=== Environment setup completed ===
--------------------------------------------------------------------------------
In [2]:
import os

# Create the target directory (no-op if it already exists).
os.makedirs('./data/ag_news', exist_ok=True)

# Skip any file that was already downloaded.
base_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv"

for filename in ['train.csv', 'test.csv']:
    # BUGFIX: the interpolation placeholders had been mangled to a literal
    # "(unknown)"; restore the {filename} substitution in path, messages,
    # and the wget command (the printed output confirms this was the intent).
    filepath = f'./data/ag_news/{filename}'
    if os.path.exists(filepath):
        print(f"✅ {filename} 이미 존재 → 스킵")
    else:
        print(f"⬇️ {filename} 다운로드 중...")
        # base_url and filename are fixed literals, so no shell-injection risk here.
        os.system(f'wget -P ./data/ag_news {base_url}/{filename}')
        print(f"✅ {filename} 다운로드 완료")
✅ train.csv 이미 존재 → 스킵
✅ test.csv 이미 존재 → 스킵
In [3]:
import pandas as pd

# Quick sanity check of the raw files: shape, a few rows, nulls, label balance.
train_path = './data/ag_news/train.csv'
test_path = './data/ag_news/test.csv'

# The raw CSVs ship without a header row.
df_train = pd.read_csv(train_path, header=None)
df_test = pd.read_csv(test_path, header=None)

null_counts = df_train.isnull().sum().tolist()
label_counts = df_train[0].value_counts().to_dict()

print("컬럼 수:", df_train.shape)
print("\n샘플 데이터:")
print(df_train.head(3))
print("\n결측값:", null_counts)
print("레이블 분포:", label_counts)
컬럼 수: (120000, 3)

샘플 데이터:
   0                                                    1  \
0  3    Wall St. Bears Claw Back Into the Black (Reuters)   
1  3  Carlyle Looks Toward Commercial Aerospace (Reuters)   
2  3      Oil and Economy Cloud Stocks' Outlook (Reuters)   

                                                                                                                                                                                                                        2  
0                                                                                                                          Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.  
1  Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.  
2                                Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.  

결측값: [0, 0, 0]
레이블 분포: {3: 30000, 4: 30000, 2: 30000, 1: 30000}
In [4]:
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Re-read with explicit column names; the first column is the class index 1-4
# (World / Sports / Business / Sci-Tech).
df_train = pd.read_csv('./data/ag_news/train.csv', header=None, names=['label', 'title', 'body'])
df_test  = pd.read_csv('./data/ag_news/test.csv',  header=None, names=['label', 'title', 'body'])

def preprocess(title, body):
    """Normalize one article: join title+body, lowercase, keep only [a-z0-9],
    and collapse runs of whitespace into single spaces."""
    combined = f"{title} {body}".lower()
    combined = re.sub(r'[^a-z0-9\s]', ' ', combined)
    return re.sub(r'\s+', ' ', combined).strip()

# Build the model input text column and shift labels from 1-4 to 0-3.
df_train['text'] = [preprocess(t, b) for t, b in zip(df_train['title'], df_train['body'])]
df_test['text'] = [preprocess(t, b) for t, b in zip(df_test['title'], df_test['body'])]
df_train['label'] = df_train['label'] - 1
df_test['label'] = df_test['label'] - 1

# Stratified 80/20 split keeps the label balance identical in both subsets.
df_tr, df_val = train_test_split(df_train, test_size=0.2, random_state=42, stratify=df_train['label'])

label_names = ['World', 'Sports', 'Business', 'Sci/Tech']
print(f"훈련셋:   {len(df_tr):,}개")
print(f"검증셋:   {len(df_val):,}개")
print(f"테스트셋: {len(df_test):,}개")

# ── 1. Dataset size comparison ──
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# NOTE: `splits` and `colors` are reused by the following plot cells.
splits = {'Train': df_tr, 'Val': df_val, 'Test': df_test}
colors = ['#4C72B0', '#DD8452', '#55A868']
for ax, (name, df), color in zip(axes, splits.items(), colors):
    counts = df['label'].value_counts().sort_index()
    class_names = [label_names[i] for i in counts.index]
    ax.bar(class_names, counts.values, color=color, edgecolor='white')
    ax.set_title(f'{name} ({len(df):,}개)')
    ax.set_ylabel('개수')
    # Annotate each bar with its exact count.
    for pos, count in enumerate(counts.values):
        ax.text(pos, count + 100, str(count), ha='center', fontweight='bold', fontsize=9)

plt.suptitle('데이터셋 분할 및 레이블 분포', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

# ── 2. Text length distribution ──
# FIX: the original loop mutated df_tr/df_val/df_test in place by adding a
# 'text_len' column (with an unused loop variable), risking pandas'
# SettingWithCopyWarning on the train_test_split results; the column was never
# used outside this cell, so compute the word counts locally instead.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, (name, df), color in zip(axes, splits.items(), colors):
    # Whitespace word count per article; .str.split() matches str.split().
    text_len = df['text'].str.split().str.len()
    ax.hist(text_len, bins=50, color=color, edgecolor='white')
    ax.axvline(text_len.mean(), color='red', linestyle='--', label=f'평균: {text_len.mean():.0f}')
    ax.set_title(f'{name} 텍스트 길이 분포')
    ax.set_xlabel('단어 수')
    ax.set_ylabel('빈도')
    ax.legend()

plt.suptitle('텍스트 길이 분포', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

# ── 3. Split ratio pie chart ──
fig, ax = plt.subplots(figsize=(6, 6))
split_sizes = {'Train': len(df_tr), 'Val': len(df_val), 'Test': len(df_test)}
sizes = list(split_sizes.values())
labels = [f'{split}\n{count:,}개' for split, count in split_sizes.items()]
ax.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%',
       startangle=90, wedgeprops=dict(edgecolor='white', linewidth=2))
ax.set_title('데이터셋 분할 비율', fontsize=13, fontweight='bold')
plt.show()
훈련셋:   96,000개
검증셋:   24,000개
테스트셋: 7,600개
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [5]:
from collections import Counter

# Count tokens from the 'text' column (stopwords deliberately kept for the RNN).
counter_rnn = Counter(token for text in df_tr['text'] for token in text.split())

MAX_VOCAB = 20000
# Indices 0 and 1 are reserved for padding and out-of-vocabulary tokens.
vocab = {'<PAD>': 0, '<UNK>': 1}
# Assign ids 2..MAX_VOCAB-1 to the most frequent words.
for rank, (word, _) in enumerate(counter_rnn.most_common(MAX_VOCAB - 2), start=2):
    vocab[word] = rank

print(f"전체 고유 단어 수: {len(counter_rnn):,}")
print(f"사전 크기: {len(vocab):,}")
전체 고유 단어 수: 60,018
사전 크기: 20,000
In [6]:
# Encode text as a fixed-length sequence of vocabulary ids.
MAX_LEN = 64

def text_to_ids(text, vocab, max_len):
    """Map whitespace tokens to vocab ids, truncated/padded to `max_len`.

    Unknown tokens map to <UNK> (id 1); the tail is filled with <PAD> (id 0).
    """
    ids = [vocab.get(token, 1) for token in text.split()[:max_len]]
    padding = [0] * (max_len - len(ids))
    return ids + padding

# Sanity-check the encoding on one training example.
example_text = df_tr['text'].iloc[0]
sample = text_to_ids(example_text, vocab, MAX_LEN)
print(f"원본: {example_text[:60]}...")
print(f"변환: {sample[:10]}...")
print(f"길이: {len(sample)}")
원본: clijsters unsure about latest injury says hewitt tokyo reute...
변환: [5358, 12329, 67, 322, 890, 85, 1904, 424, 22, 4301]...
길이: 64
In [7]:
# FIX: `torch` is used in __getitem__ below but was never imported in this or
# any earlier cell — under Restart & Run All it only worked because
# __getitem__ is lazy and a later cell happens to import torch first.
import torch
from torch.utils.data import Dataset, DataLoader


class AGNewsDataset(Dataset):
    """One AG News split, pre-encoded to fixed-length id sequences.

    NOTE: relies on the module-level `vocab`, `MAX_LEN`, and `text_to_ids`
    defined in earlier cells.
    """

    def __init__(self, texts, labels):
        # Encode everything up front so __getitem__ is cheap.
        self.X = [text_to_ids(t, vocab, MAX_LEN) for t in texts]
        self.y = labels.tolist()

    def __len__(self): return len(self.y)

    def __getitem__(self, idx):
        # Returns (token_ids, label) as long tensors, as CrossEntropyLoss expects.
        return (torch.tensor(self.X[idx], dtype=torch.long),
                torch.tensor(self.y[idx], dtype=torch.long))


train_dataset = AGNewsDataset(df_tr['text'].values,   df_tr['label'].values)
val_dataset   = AGNewsDataset(df_val['text'].values,  df_val['label'].values)
test_dataset  = AGNewsDataset(df_test['text'].values, df_test['label'].values)

# Only the training loader shuffles; evaluation order must stay fixed so
# predictions line up with df_test['label'] in the evaluation cell.
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)
val_loader   = DataLoader(val_dataset,   batch_size=256, shuffle=False)
test_loader  = DataLoader(test_dataset,  batch_size=256, shuffle=False)

print(f"Train 배치 수: {len(train_loader)}")
print(f"Val   배치 수: {len(val_loader)}")
print(f"Test  배치 수: {len(test_loader)}")
Train 배치 수: 375
Val   배치 수: 94
Test  배치 수: 30
In [9]:
!pip install gensim
Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Requirement already satisfied: numpy>=1.18.5 in /usr/local/lib/python3.12/dist-packages (from gensim) (2.1.0)
Requirement already satisfied: scipy>=1.7.0 in /usr/local/lib/python3.12/dist-packages (from gensim) (1.16.2)
Collecting smart_open>=1.8.1 (from gensim)
  Downloading smart_open-7.5.0-py3-none-any.whl.metadata (24 kB)
Requirement already satisfied: wrapt in /usr/local/lib/python3.12/dist-packages (from smart_open>=1.8.1->gensim) (1.17.3)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 27.9/27.9 MB 12.6 MB/s  0:00:02m0:00:0100:01
Downloading smart_open-7.5.0-py3-none-any.whl (63 kB)
Installing collected packages: smart_open, gensim
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2/2 [gensim]2m1/2 [gensim]
Successfully installed gensim-4.4.0 smart_open-7.5.0
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
In [10]:
# gensim availability check (install via the cell above if missing)
# !pip install gensim

from gensim.models import Word2Vec

# Train Word2Vec on the training split only (no leakage from val/test).
sentences = [text.split() for text in df_tr['text']]

# NOTE: workers=4 makes training non-deterministic even with a fixed seed.
print("Word2Vec 학습 중...")
w2v_model = Word2Vec(
    sentences,
    vector_size=100,   # embedding dimension (must match EMBED_DIM below)
    window=5,          # context window size
    min_count=2,       # drop words seen fewer than 2 times
    workers=4,         # parallel training threads
    epochs=5
)

# Nearest neighbors are a quick qualitative check of embedding quality.
print(f"학습 완료!")
print(f"Word2Vec 단어 수: {len(w2v_model.wv):,}")
print(f"\n'sports' 유사 단어: {w2v_model.wv.most_similar('sports', topn=5)}")
print(f"'technology' 유사 단어: {w2v_model.wv.most_similar('technology', topn=5)}")
Word2Vec 학습 중...
학습 완료!
Word2Vec 단어 수: 40,638

'sports' 유사 단어: [('cbc', 0.6642742156982422), ('iosn', 0.5549443364143372), ('businessweek', 0.5540317893028259), ('cnnfn', 0.5530335307121277), ('configuring', 0.5493690371513367)]
'technology' 유사 단어: [('capabilities', 0.7666119933128357), ('content', 0.7508335113525391), ('networking', 0.7501261234283447), ('computing', 0.747771143913269), ('tools', 0.743340015411377)]
In [11]:
import numpy as np

EMBED_DIM = 100  # must equal the Word2Vec vector_size above

# Initialize every row with small random values, then overwrite with the
# learned Word2Vec vector where one exists (rare words filtered by min_count
# keep their random initialization).
embedding_matrix = np.random.uniform(-0.1, 0.1, (len(vocab), EMBED_DIM)).astype('float32')
embedding_matrix[0] = 0  # <PAD> (index 0) is the all-zero vector

found = 0
for word, idx in vocab.items():
    if word in w2v_model.wv:
        embedding_matrix[idx] = w2v_model.wv[word]
        found += 1

print(f"전체 사전 크기: {len(vocab):,}")
print(f"Word2Vec 커버 단어: {found:,} ({found/len(vocab)*100:.1f}%)")
print(f"임베딩 행렬 shape: {embedding_matrix.shape}")
전체 사전 크기: 20,000
Word2Vec 커버 단어: 19,998 (100.0%)
임베딩 행렬 shape: (20000, 100)
In [13]:
import torch
import torch.nn as nn

class RNNClassifier(nn.Module):
    """Vanilla RNN text classifier with an optionally pretrained, frozen embedding.

    Takes (batch, seq_len) long tensors of token ids and returns
    (batch, output_dim) logits from the final hidden state.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, embedding_matrix=None):
        super().__init__()
        # padding_idx=0 keeps the <PAD> row's gradient at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        if embedding_matrix is not None:
            # Load the pretrained vectors and freeze them for the whole run.
            self.embedding.weight.data.copy_(torch.FloatTensor(embedding_matrix))
            self.embedding.weight.requires_grad = False
        self.rnn = nn.RNN(embed_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(0.3)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.dropout(self.embedding(x))
        # hidden: (num_layers=1, batch, hidden_dim); the per-step outputs are unused.
        _, hidden = self.rnn(embedded)
        last_hidden = hidden.squeeze(0)
        # NOTE(review): the final hidden state is taken after the full padded
        # sequence, so <PAD> steps are processed too — packing by length would
        # change (likely improve) behavior, hence not done here.
        return self.fc(self.dropout(last_hidden))

# Build the classifier with the Word2Vec-initialized embedding and move it to
# the active device (set up in the environment cell).
model_rnn = RNNClassifier(
    vocab_size=len(vocab),
    embed_dim=EMBED_DIM,
    hidden_dim=128,
    output_dim=4,
    embedding_matrix=embedding_matrix,
).to(device)

# The frozen embedding accounts for the gap between total and trainable counts.
n_total = sum(p.numel() for p in model_rnn.parameters())
n_trainable = sum(p.numel() for p in model_rnn.parameters() if p.requires_grad)
print(f"총 파라미터 수:      {n_total:,}")
print(f"학습가능 파라미터 수: {n_trainable:,}")
총 파라미터 수:      2,029,956
학습가능 파라미터 수: 29,956
In [15]:
from tqdm import tqdm

optimizer = torch.optim.Adam(model_rnn.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
EPOCHS = 20

history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

for epoch in range(EPOCHS):
    # ── Training ──
    model_rnn.train()
    total_loss, correct, total = 0, 0, 0
    pbar = tqdm(train_loader, desc=f"Epoch {epoch+1:2d}/{EPOCHS}", leave=False)
    for batch_idx, (X_b, y_b) in enumerate(pbar, start=1):
        X_b, y_b = X_b.to(device), y_b.to(device)
        optimizer.zero_grad()
        out = model_rnn(X_b)
        loss = criterion(out, y_b)
        loss.backward()
        # Gradient clipping — plain RNNs are prone to exploding gradients.
        torch.nn.utils.clip_grad_norm_(model_rnn.parameters(), 1.0)
        optimizer.step()
        total_loss += loss.item()
        correct += (out.argmax(1) == y_b).sum().item()
        total += len(y_b)
        # BUGFIX: divide by batches seen so far (batch_idx), not len(pbar)
        # (the FULL batch count), so the bar shows a true running-average loss.
        pbar.set_postfix({'loss': f'{total_loss/batch_idx:.4f}', 'acc': f'{correct/total:.4f}'})

    train_loss = total_loss / len(train_loader)
    train_acc  = correct / total

    # ── Validation (no gradients, no dropout) ──
    model_rnn.eval()
    val_loss, val_correct, val_total = 0, 0, 0
    with torch.no_grad():
        for X_b, y_b in val_loader:
            X_b, y_b = X_b.to(device), y_b.to(device)
            out = model_rnn(X_b)
            val_loss    += criterion(out, y_b).item()
            val_correct += (out.argmax(1) == y_b).sum().item()
            val_total   += len(y_b)

    val_loss = val_loss / len(val_loader)
    val_acc  = val_correct / val_total

    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

    print(f"\rEpoch {epoch+1:2d}/{EPOCHS} | Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}", end='', flush=True)

print()
                                                                                        
Epoch  1/20 | Train Loss: 0.8791 | Train Acc: 0.6295 | Val Loss: 0.9421 | Val Acc: 0.5438
                                                                                        
Epoch  2/20 | Train Loss: 0.9802 | Train Acc: 0.5657 | Val Loss: 0.9661 | Val Acc: 0.5941
                                                                                        
Epoch  3/20 | Train Loss: 0.9456 | Train Acc: 0.6250 | Val Loss: 0.9263 | Val Acc: 0.6468
                                                                                        
Epoch  4/20 | Train Loss: 0.9006 | Train Acc: 0.6533 | Val Loss: 0.8826 | Val Acc: 0.6352
                                                                                        
Epoch  5/20 | Train Loss: 0.9009 | Train Acc: 0.6680 | Val Loss: 0.8747 | Val Acc: 0.6897
                                                                                        
Epoch  6/20 | Train Loss: 0.8695 | Train Acc: 0.6915 | Val Loss: 0.8353 | Val Acc: 0.7086
                                                                                        
Epoch  7/20 | Train Loss: 0.8362 | Train Acc: 0.7058 | Val Loss: 0.8027 | Val Acc: 0.7127
                                                                                        
Epoch  8/20 | Train Loss: 0.8347 | Train Acc: 0.6978 | Val Loss: 0.8571 | Val Acc: 0.6614
                                                                                        
Epoch  9/20 | Train Loss: 0.8899 | Train Acc: 0.6334 | Val Loss: 0.8602 | Val Acc: 0.6676
                                                                                        
Epoch 10/20 | Train Loss: 0.8259 | Train Acc: 0.6893 | Val Loss: 0.7868 | Val Acc: 0.7040
                                                                                        
Epoch 11/20 | Train Loss: 0.8088 | Train Acc: 0.6941 | Val Loss: 0.8154 | Val Acc: 0.6883
                                                                                        
Epoch 12/20 | Train Loss: 0.8912 | Train Acc: 0.6532 | Val Loss: 0.8828 | Val Acc: 0.6536
                                                                                        
Epoch 13/20 | Train Loss: 0.8543 | Train Acc: 0.6752 | Val Loss: 0.8043 | Val Acc: 0.7065
                                                                                        
Epoch 14/20 | Train Loss: 0.8224 | Train Acc: 0.7035 | Val Loss: 0.7902 | Val Acc: 0.7256
                                                                                        
Epoch 15/20 | Train Loss: 0.8139 | Train Acc: 0.7055 | Val Loss: 0.7889 | Val Acc: 0.7027
                                                                                        
Epoch 16/20 | Train Loss: 0.8301 | Train Acc: 0.6959 | Val Loss: 0.8439 | Val Acc: 0.7019
                                                                                        
Epoch 17/20 | Train Loss: 0.8074 | Train Acc: 0.7097 | Val Loss: 0.7262 | Val Acc: 0.7506
                                                                                        
Epoch 18/20 | Train Loss: 0.8242 | Train Acc: 0.6996 | Val Loss: 0.7825 | Val Acc: 0.7089
                                                                                        
Epoch 19/20 | Train Loss: 0.8064 | Train Acc: 0.7039 | Val Loss: 0.7763 | Val Acc: 0.7254
                                                                                        
Epoch 20/20 | Train Loss: 0.7921 | Train Acc: 0.7106 | Val Loss: 0.7429 | Val Acc: 0.7431
In [16]:
import matplotlib.pyplot as plt

# Learning curves: loss (left panel) and accuracy (right panel) per epoch.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
epochs = range(1, EPOCHS + 1)

panels = [
    ('train_loss', 'val_loss', 'Train Loss', 'Val Loss', 'Loss 곡선', 'Loss', None),
    ('train_acc', 'val_acc', 'Train Acc', 'Val Acc', 'Accuracy 곡선', 'Accuracy', (0, 1)),
]
for ax, (tr_key, va_key, tr_lbl, va_lbl, title, ylabel, ylim) in zip(axes, panels):
    ax.plot(epochs, history[tr_key], 'b-o', markersize=5, label=tr_lbl)
    ax.plot(epochs, history[va_key], 'r-o', markersize=5, label=va_lbl)
    ax.set_title(title, fontsize=13, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    if ylim is not None:
        ax.set_ylim(*ylim)
    ax.legend()
    ax.grid(alpha=0.3)

plt.suptitle(f'RNN (Word2Vec) 학습 결과 | 최고 Val Acc: {max(history["val_acc"]):.4f}',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [17]:
import seaborn as sns
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

label_names = ['World', 'Sports', 'Business', 'Sci/Tech']

# Collect predictions over the whole test set (eval mode: dropout disabled).
model_rnn.eval()
all_preds = []
with torch.no_grad():
    for X_b, y_b in test_loader:
        X_b = X_b.to(device)
        preds = model_rnn(X_b).argmax(1).cpu().numpy()
        all_preds.extend(preds)

# Comparing against df_test['label'] directly is valid only because
# test_loader was built with shuffle=False, preserving row order.
all_preds = np.array(all_preds)
y_test_arr = df_test['label'].values
test_acc = (all_preds == y_test_arr).mean()
print(f"✅ 테스트 정확도: {test_acc:.4f} ({test_acc*100:.2f}%)")
print()
print(classification_report(y_test_arr, all_preds, target_names=label_names))

# Confusion matrices: raw counts (left) and row-normalized recall (right).
cm = confusion_matrix(y_test_arr, all_preds)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names, ax=axes[0])
axes[0].set_title('혼동행렬 (개수)', fontsize=13, fontweight='bold')
axes[0].set_ylabel('실제 레이블')
axes[0].set_xlabel('예측 레이블')

# Normalizing by each true-label row turns cells into per-class recall shares.
cm_pct = cm.astype(float) / cm.sum(axis=1, keepdims=True)
sns.heatmap(cm_pct, annot=True, fmt='.2%', cmap='Blues',
            xticklabels=label_names, yticklabels=label_names, ax=axes[1])
axes[1].set_title('혼동행렬 (비율)', fontsize=13, fontweight='bold')
axes[1].set_ylabel('실제 레이블')
axes[1].set_xlabel('예측 레이블')

plt.suptitle(f'RNN (Word2Vec) 테스트 결과 | 정확도: {test_acc*100:.2f}%',
             fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()
✅ 테스트 정확도: 0.7388 (73.88%)

              precision    recall  f1-score   support

       World       0.72      0.80      0.76      1900
      Sports       0.85      0.83      0.84      1900
    Business       0.68      0.66      0.67      1900
    Sci/Tech       0.71      0.67      0.69      1900

    accuracy                           0.74      7600
   macro avg       0.74      0.74      0.74      7600
weighted avg       0.74      0.74      0.74      7600

No description has been provided for this image