In [1]:
import setup_env
from setup_env import device
--------------------------------------------------------------------------------
=== Hardware Acceleration ===
PyTorch version: 2.9.0a0+145a3a7bda.nv25.10
Using NVIDIA GPU (CUDA)
   CUDA version: 13.0
   GPU name: NVIDIA GeForce RTX 5070 Ti
   GPU count: 1
   Total GPU memory: 15.92 GB
   Allocated memory: 0.00 GB
   Free memory: 15.92 GB
Device: cuda

=== Matplotlib Settings ===
✅ Font: NanumGothic

=== System Info ===
OS: Ubuntu 24.04.3 LTS (Noble Numbat)
    Kernel: 6.6.87.2-microsoft-standard-WSL2
Architecture: x86_64
Python: 3.12.3
Working directory: /workspace/ai-deeplearning/tutorial

=== Library Versions ===
NumPy: 2.1.0
Pandas: 3.0.0
Matplotlib: 3.10.7
Scikit-learn: 1.7.2
OpenCV: Not installed → !pip install -q opencv-python
Pillow: 12.0.0
Seaborn: 0.13.2
TensorFlow: Not installed → !pip install -q tensorflow
Transformers: 5.2.0
TorchVision: 0.24.0a0+094e7af5

=== Environment setup completed ===
--------------------------------------------------------------------------------

=== Visualizing Test Plot (Wide View) ===
No description has been provided for this image
=== GPU Usage Code Snippet ===
Device set to: cuda
----------------------------------------
# 아래 코드를 복사해서 모델과 데이터를 GPU로 보내세요:
model = YourModel().to(device)
data = data.to(device)
----------------------------------------

=== Environment setup completed ===
--------------------------------------------------------------------------------

KLUE Dataset¶

  • ynat은 연합뉴스 헤드라인 7만건을 정치·경제·사회·생활문화·세계·IT과학·스포츠 7개 카테고리로 분류한 데이터
    • https://huggingface.co/datasets/klue/klue
    • https://klue-benchmark.com/

1. dataset¶


In [25]:
!pip install -q datasets

import datasets
print("datasets version : " +datasets.__version__)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
datasets version : 2.19.0
In [26]:
!pip install -q ipywidgets
import ipywidgets
print("ipywidgets version : " + ipywidgets.__version__)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
ipywidgets version : 8.1.8
In [2]:
from datasets import load_dataset

dataset = load_dataset("klue", "ynat", cache_dir="./data/ynat")
print(dataset)
DatasetDict({
    train: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 45678
    })
    validation: Dataset({
        features: ['guid', 'title', 'label', 'url', 'date'],
        num_rows: 9107
    })
})

2. tokenizer¶

  • 허깅페이스 transformers는 BERT, GPT, RoBERTa 등 트랜스포머 기반 모델과 토크나이저를 쉽게 불러쓸 수 있게 해주는 패키지
  • BERT: 정적 마스킹, NSP 태스크 포함, 적은 데이터로 학습.
  • RoBERTa: 동적 마스킹, NSP 제거, 더 많은 데이터+더 오래 학습.

In [28]:
!pip install -q transformers

import transformers
print("transformers version : " + transformers.__version__)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
transformers version : 5.2.0
In [29]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")

print("어휘사전 크기:", tokenizer.vocab_size)
print("최대 입력 길이:", tokenizer.model_max_length)

vocab = tokenizer.get_vocab()
print(list(vocab.items())[:10])
어휘사전 크기: 32000
최대 입력 길이: 512
[('소박', 11778), ('##칵', 3351), ('흘렀', 11091), ('끈질기', 21291), ('내쫓', 20015), ('타선', 10872), ('기고', 17073), ('삼계', 19248), ('##의', 2079), ('##MC', 11505)]
In [30]:
import matplotlib.pyplot as plt

lengths = [len(tokenizer(t)["input_ids"]) for t in dataset["train"]["title"]]
print("평균:", sum(lengths)/len(lengths))
print("최대:", max(lengths))
print("최소:", min(lengths))
plt.hist(lengths, bins=30)
plt.show()
평균: 15.392836814221289
최대: 30
최소: 4
No description has been provided for this image
In [31]:
from torch.utils.data import Dataset, DataLoader

class GPTDataset(Dataset):
    def __init__(self, titles, tokenizer, max_len=30):
        self.input_ids = []
        for title in titles:
            tokens = tokenizer(
                title,                        # 입력 문장
                max_length=max_len,           # 최대 토큰 수
                truncation=True,              # 초과시 자르기
                padding="max_length",         # 부족하면 max_length까지 패딩
                return_tensors="pt"           # 파이토치 텐서로 반환
            )
            self.input_ids.append(tokens["input_ids"].squeeze())
    
    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        x = self.input_ids[idx][:-1]
        y = self.input_ids[idx][1:]
        return x, y

train_dataset = GPTDataset(dataset["train"]["title"], tokenizer)
val_dataset = GPTDataset(dataset["validation"]["title"], tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

print("train 배치 수:", len(train_loader))
print("val 배치 수:", len(val_loader))
train 배치 수: 1428
val 배치 수: 285
In [32]:
!pip install torchinfo -q

import torchinfo
print(torchinfo.__version__)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
1.8.0
In [33]:
import torch
import torch.nn as nn
from torchinfo import summary

class GPTDecoder(nn.Module):
    def __init__(self, vocab_size, d_model=512, num_heads=8, num_layers=4, max_len=30, dropout=0.1):
        super().__init__()
        # 토큰 인덱스 → 512차원 벡터로 변환
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        # 위치(0,1,2...) → 512차원 벡터로 변환
        self.pos_embedding = nn.Embedding(max_len, d_model)
        
        # 디코더 레이어 1개 정의 (헤드8개, 드롭아웃)
        decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout, batch_first=True)
        # 디코더 레이어 4개 쌓기
        self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
        
        # 512차원 → 어휘사전 크기로 변환 (각 토큰 확률 출력)
        self.fc_out = nn.Linear(d_model, vocab_size)
        self.d_model = d_model

    def forward(self, x):
        # 입력 시퀀스 길이 확인
        seq_len = x.size(1)
        # 위치 인덱스 생성 [0,1,2,...,seq_len-1]
        positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
        
        # 토큰 임베딩 + 위치 임베딩 더하기
        x = self.token_embedding(x) + self.pos_embedding(positions)
        
        # 미래 토큰 못보게 마스크 생성
        mask = nn.Transformer.generate_square_subsequent_mask(seq_len, device=x.device)
        
        # 디코더 통과 (GPT는 인코더 없으니 x를 메모리로도 넘김)
        out = self.decoder(x, x, tgt_mask=mask)
        # 각 위치에서 다음 토큰 확률 출력
        return self.fc_out(out)

model = GPTDecoder(vocab_size=tokenizer.vocab_size)

summary(model, input_size=(1, 29), dtypes=[torch.long])
Out[33]:
===============================================================================================
Layer (type:depth-idx)                        Output Shape              Param #
===============================================================================================
GPTDecoder                                    [1, 29, 32000]            --
├─Embedding: 1-1                              [1, 29, 512]              16,384,000
├─Embedding: 1-2                              [1, 29, 512]              15,360
├─TransformerDecoder: 1-3                     [1, 29, 512]              --
│    └─ModuleList: 2-1                        --                        --
│    │    └─TransformerDecoderLayer: 3-1      [1, 29, 512]              4,204,032
│    │    └─TransformerDecoderLayer: 3-2      [1, 29, 512]              4,204,032
│    │    └─TransformerDecoderLayer: 3-3      [1, 29, 512]              4,204,032
│    │    └─TransformerDecoderLayer: 3-4      [1, 29, 512]              4,204,032
├─Linear: 1-4                                 [1, 29, 32000]            16,416,000
===============================================================================================
Total params: 49,631,488
Trainable params: 49,631,488
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 41.23
===============================================================================================
Input size (MB): 0.00
Forward/backward pass size (MB): 11.46
Params size (MB): 164.91
Estimated Total Size (MB): 176.37
===============================================================================================
In [34]:
import torch.optim as optim
from tqdm import tqdm

optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=1)

history = {"train_loss": [], "val_loss": []}

def train(model, train_loader, val_loader, optimizer, criterion, epochs=10):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1} 훈련"):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(x).view(-1, tokenizer.vocab_size)
            loss = criterion(output, y.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x, y in tqdm(val_loader, desc=f"Epoch {epoch+1} 검증"):
                x, y = x.to(device), y.to(device)
                output = model(x).view(-1, tokenizer.vocab_size)
                val_loss += criterion(output, y.view(-1)).item()

        train_l = total_loss/len(train_loader)
        val_l = val_loss/len(val_loader)
        history["train_loss"].append(train_l)
        history["val_loss"].append(val_l)
        print(f"Epoch {epoch+1} | Train Loss: {train_l:.4f} | Val Loss: {val_l:.4f}")

train(model, train_loader, val_loader, optimizer, criterion)
Epoch 1 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.11it/s]
Epoch 1 검증: 100%|██████████| 285/285 [00:01<00:00, 215.63it/s]
Epoch 1 | Train Loss: 3.1326 | Val Loss: 1.3673
Epoch 2 훈련: 100%|██████████| 1428/1428 [00:25<00:00, 56.42it/s]
Epoch 2 검증: 100%|██████████| 285/285 [00:01<00:00, 200.95it/s]
Epoch 2 | Train Loss: 0.5289 | Val Loss: 0.6701
Epoch 3 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 52.97it/s]
Epoch 3 검증: 100%|██████████| 285/285 [00:01<00:00, 200.79it/s]
Epoch 3 | Train Loss: 0.1800 | Val Loss: 0.4438
Epoch 4 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.27it/s]
Epoch 4 검증: 100%|██████████| 285/285 [00:01<00:00, 209.30it/s]
Epoch 4 | Train Loss: 0.0709 | Val Loss: 0.3658
Epoch 5 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.16it/s]
Epoch 5 검증: 100%|██████████| 285/285 [00:01<00:00, 206.60it/s]
Epoch 5 | Train Loss: 0.0290 | Val Loss: 0.3241
Epoch 6 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.30it/s]
Epoch 6 검증: 100%|██████████| 285/285 [00:01<00:00, 203.61it/s]
Epoch 6 | Train Loss: 0.0115 | Val Loss: 0.3091
Epoch 7 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.27it/s]
Epoch 7 검증: 100%|██████████| 285/285 [00:01<00:00, 199.93it/s]
Epoch 7 | Train Loss: 0.0028 | Val Loss: 0.3105
Epoch 8 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.95it/s]
Epoch 8 검증: 100%|██████████| 285/285 [00:01<00:00, 221.30it/s]
Epoch 8 | Train Loss: 0.0011 | Val Loss: 0.3149
Epoch 9 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 54.65it/s]
Epoch 9 검증: 100%|██████████| 285/285 [00:01<00:00, 235.70it/s]
Epoch 9 | Train Loss: 0.0007 | Val Loss: 0.3153
Epoch 10 훈련: 100%|██████████| 1428/1428 [00:25<00:00, 55.34it/s]
Epoch 10 검증: 100%|██████████| 285/285 [00:01<00:00, 215.60it/s]
Epoch 10 | Train Loss: 0.0006 | Val Loss: 0.3175

In [39]:
import matplotlib.pyplot as plt
import torch

ppl_train = [torch.exp(torch.tensor(l)).item() for l in history["train_loss"]]
ppl_val = [torch.exp(torch.tensor(l)).item() for l in history["val_loss"]]

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

ax1.plot(history["train_loss"], label="Train Loss")
ax1.plot(history["val_loss"], label="Val Loss")
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Loss")
ax1.legend()
ax1.set_title("학습 곡선 (Loss)")
ax1.grid(True)

ax2.plot(ppl_train, label="Train PPL")
ax2.plot(ppl_val, label="Val PPL")
ax2.set_xlabel("Epoch")
ax2.set_ylabel("Perplexity")
ax2.legend()
ax2.set_title("학습 곡선 (PPL)")
ax2.grid(True)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [43]:
def generate(model, tokenizer, prompt, max_new_tokens=20, temperature=0.8, top_k=50, device=device):
    model.eval()
    tokens = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
    
    with torch.no_grad():
        for _ in range(max_new_tokens):
            output = model(tokens[:, -29:])
            logits = output[:, -1, :] / temperature
            
            top_k_logits, top_k_idx = torch.topk(logits, top_k)
            probs = torch.softmax(top_k_logits, dim=-1)
            next_token = top_k_idx[0][torch.multinomial(probs[0], 1)]
            
            tokens = torch.cat([tokens, next_token.view(1, 1)], dim=1)
            if next_token.item() == tokenizer.sep_token_id:
                break
    
    return tokenizer.decode(tokens[0], skip_special_tokens=True)

print(generate(model, tokenizer, "유튜브"))
유튜브 유튜브 유튜브 유튜브 유튜브


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2", cache_dir="./data/kogpt2")
model = AutoModelForCausalLM.from_pretrained("skt/kogpt2-base-v2", cache_dir="./data/kogpt2")

print("토크나이저 어휘사전 크기:", tokenizer.vocab_size)
print("모델 구조:", model)
The tied weights mapping and config for this model specifies to tie transformer.wte.weight to lm_head.weight, but both are present in the checkpoints, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warning
GPT2LMHeadModel LOAD REPORT from: skt/kogpt2-base-v2
Key                                     | Status     |  | 
----------------------------------------+------------+--+-
transformer.h.{0...11}.attn.masked_bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
토크나이저 어휘사전 크기: 51200
모델 구조: GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=51200, bias=False)
)
In [45]:
lengths = [len(tokenizer(t)["input_ids"]) for t in dataset["train"]["title"]]
print("평균:", sum(lengths)/len(lengths))
print("최대:", max(lengths))
print("최소:", min(lengths))

import matplotlib.pyplot as plt
plt.hist(lengths, bins=30)
plt.title("토큰 길이 분포")
plt.grid(True)
plt.show()
평균: 26.731621349446122
최대: 47
최소: 4
No description has been provided for this image
In [6]:
from torch.utils.data import Dataset, DataLoader

tokenizer.pad_token = tokenizer.eos_token

class GPTDataset(Dataset):
    def __init__(self, titles, tokenizer, max_len=47):
        self.input_ids = []
        self.labels = []
        for title in titles:
            tokens = tokenizer(
                title,
                max_length=max_len,
                truncation=True,
                padding="max_length",
                return_tensors="pt"
            )
            ids = tokens["input_ids"].squeeze()
            label = ids.clone()
            label[label == tokenizer.pad_token_id] = -100  # 패딩 무시
            self.input_ids.append(ids)
            self.labels.append(label)

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.labels[idx]

train_dataset = GPTDataset(dataset["train"]["title"], tokenizer)
val_dataset = GPTDataset(dataset["validation"]["title"], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
In [8]:
import torch.optim as optim
from tqdm import tqdm

model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)

history = {"train_loss": [], "val_loss": []}

def train(model, train_loader, val_loader, optimizer, epochs=5):
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1} 훈련"):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()
            output = model(x, labels=y)
            loss = output.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        model.eval()
        val_loss = 0
        with torch.no_grad():
            for x, y in tqdm(val_loader, desc=f"Epoch {epoch+1} 검증"):
                x, y = x.to(device), y.to(device)
                output = model(x, labels=y)
                val_loss += output.loss.item()

        train_l = total_loss/len(train_loader)
        val_l = val_loss/len(val_loader)
        history["train_loss"].append(train_l)
        history["val_loss"].append(val_l)
        print(f"Epoch {epoch+1} | Train Loss: {train_l:.4f} | Val Loss: {val_l:.4f}")

train(model, train_loader, val_loader, optimizer)
Epoch 1 훈련:   0%|          | 0/1428 [00:00<?, ?it/s]/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [64,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [65,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [66,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [67,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [68,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [69,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [70,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [71,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [72,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [73,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [74,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [75,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [76,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [77,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [78,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [79,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [80,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [81,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [82,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [83,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [84,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [85,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [86,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [87,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [88,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [89,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [90,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [91,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [92,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [93,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [94,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [95,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [96,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [97,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [98,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [99,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [100,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [101,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [102,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [103,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [104,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [105,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [106,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [107,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [108,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [109,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [110,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [111,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [112,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [113,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [114,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [115,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [116,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [117,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [118,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [119,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [120,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [121,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [122,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [123,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [124,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [125,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [126,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [127,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
Epoch 1 훈련:   0%|          | 0/1428 [00:00<?, ?it/s]
---------------------------------------------------------------------------
AcceleratorError                          Traceback (most recent call last)
Cell In[8], line 36
     33         history["val_loss"].append(val_l)
     34         print(f"Epoch {epoch+1} | Train Loss: {train_l:.4f} | Val Loss: {val_l:.4f}")
---> 36 train(model, train_loader, val_loader, optimizer)

Cell In[8], line 16, in train(model, train_loader, val_loader, optimizer, epochs)
     14 x, y = x.to(device), y.to(device)
     15 optimizer.zero_grad()
---> 16 output = model(x, labels=y)
     17 loss = output.loss
     18 loss.backward()

File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1775, in Module._wrapped_call_impl(self, *args, **kwargs)
   1773     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1774 else:
-> 1775     return self._call_impl(*args, **kwargs)

File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1786, in Module._call_impl(self, *args, **kwargs)
   1781 # If we don't have any hooks, we want to skip the rest of the logic in
   1782 # this function, and just call forward.
   1783 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1784         or _global_backward_pre_hooks or _global_backward_hooks
   1785         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1786     return forward_call(*args, **kwargs)
   1788 result = None
   1789 called_always_called_hooks = set()

File /usr/local/lib/python3.12/dist-packages/transformers/models/gpt2/modeling_gpt2.py:759, in GPT2LMHeadModel.forward(self, input_ids, past_key_values, cache_position, attention_mask, token_type_ids, position_ids, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, use_cache, output_attentions, output_hidden_states, return_dict, logits_to_keep, **kwargs)
    739 r"""
    740 input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
    741     `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
   (...)    755     are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
    756 """
    757 return_dict = return_dict if return_dict is not None else self.config.use_return_dict
--> 759 transformer_outputs = self.transformer(
    760     input_ids,
    761     past_key_values=past_key_values,
    762     attention_mask=attention_mask,
    763     cache_position=cache_position,
    764     token_type_ids=token_type_ids,
    765     position_ids=position_ids,
    766     inputs_embeds=inputs_embeds,
    767     encoder_hidden_states=encoder_hidden_states,
    768     encoder_attention_mask=encoder_attention_mask,
    769     use_cache=use_cache,
    770     output_attentions=output_attentions,
    771     output_hidden_states=output_hidden_states,
    772     return_dict=return_dict,
    773 )
    774 hidden_states = transformer_outputs[0]
    776 slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep

File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1775, in Module._wrapped_call_impl(self, *args, **kwargs)
   1773     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1774 else:
-> 1775     return self._call_impl(*args, **kwargs)

File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1786, in Module._call_impl(self, *args, **kwargs)
   1781 # If we don't have any hooks, we want to skip the rest of the logic in
   1782 # this function, and just call forward.
   1783 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1784         or _global_backward_pre_hooks or _global_backward_hooks
   1785         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1786     return forward_call(*args, **kwargs)
   1788 result = None
   1789 called_always_called_hooks = set()

File /usr/local/lib/python3.12/dist-packages/transformers/models/gpt2/modeling_gpt2.py:612, in GPT2Model.forward(self, input_ids, past_key_values, cache_position, attention_mask, token_type_ids, position_ids, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict, **kwargs)
    610 if cache_position is None:
    611     past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
--> 612     cache_position = torch.arange(
    613         past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
    614     )
    615 if position_ids is None:
    616     position_ids = cache_position.unsqueeze(0)

AcceleratorError: CUDA error: device-side assert triggered
Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
In [49]:
print("토크나이저 vocab_size:", tokenizer.vocab_size)
print("모델 vocab_size:", model.config.vocab_size)
토크나이저 vocab_size: 51200
모델 vocab_size: 51200
In [50]:
print("pad_token_id:", tokenizer.pad_token_id)
print("eos_token_id:", tokenizer.eos_token_id)
pad_token_id: 51200
eos_token_id: 51200
In [53]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

device = torch.device("cpu")
model = model.to(device)
train(model, train_loader, val_loader, optimizer)
---------------------------------------------------------------------------
AcceleratorError                          Traceback (most recent call last)
Cell In[53], line 5
      2 os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
      4 device = torch.device("cpu")
----> 5 model = model.to(device)
      6 train(model, train_loader, val_loader, optimizer)

File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:3529, in PreTrainedModel.to(self, *args, **kwargs)
   3524     if dtype_present_in_args:
   3525         raise ValueError(
   3526             "You cannot cast a GPTQ model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired"
   3527             " `dtype` by passing the correct `dtype` argument."
   3528         )
-> 3529 return super().to(*args, **kwargs)

File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1371, in Module.to(self, *args, **kwargs)
   1368         else:
   1369             raise
-> 1371 return self._apply(convert)

File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:930, in Module._apply(self, fn, recurse)
    928 if recurse:
    929     for module in self.children():
--> 930         module._apply(fn)
    932 def compute_should_use_set_data(tensor, tensor_applied) -> bool:
    933     if torch._has_compatible_shallow_copy_type(tensor, tensor_applied):
    934         # If the new tensor has compatible tensor type as the existing tensor,
    935         # the current behavior is to change the tensor in-place using `.data =`,
   (...)    940         # global flag to let the user control whether they want the future
    941         # behavior of overwriting the existing tensor or not.

File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:930, in Module._apply(self, fn, recurse)
    928 if recurse:
    929     for module in self.children():
--> 930         module._apply(fn)
    932 def compute_should_use_set_data(tensor, tensor_applied) -> bool:
    933     if torch._has_compatible_shallow_copy_type(tensor, tensor_applied):
    934         # If the new tensor has compatible tensor type as the existing tensor,
    935         # the current behavior is to change the tensor in-place using `.data =`,
   (...)    940         # global flag to let the user control whether they want the future
    941         # behavior of overwriting the existing tensor or not.

File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:957, in Module._apply(self, fn, recurse)
    953 # Tensors stored in modules are graph leaves, and we don't want to
    954 # track autograd history of `param_applied`, so we have to use
    955 # `with torch.no_grad():`
    956 with torch.no_grad():
--> 957     param_applied = fn(param)
    958 p_should_use_set_data = compute_should_use_set_data(param, param_applied)
    960 from torch._subclasses.fake_tensor import FakeTensor

File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1357, in Module.to.<locals>.convert(t)
   1350     if convert_to_format is not None and t.dim() in (4, 5):
   1351         return t.to(
   1352             device,
   1353             dtype if t.is_floating_point() or t.is_complex() else None,
   1354             non_blocking,
   1355             memory_format=convert_to_format,
   1356         )
-> 1357     return t.to(
   1358         device,
   1359         dtype if t.is_floating_point() or t.is_complex() else None,
   1360         non_blocking,
   1361     )
   1362 except NotImplementedError as e:
   1363     if str(e) == "Cannot copy out of meta tensor; no data!":

AcceleratorError: CUDA error: device-side assert triggered
Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
In [ ]:
 
In [ ]: