In [1]:
import setup_env
from setup_env import device
--------------------------------------------------------------------------------
=== Hardware Acceleration ===
PyTorch version: 2.9.0a0+145a3a7bda.nv25.10
Using NVIDIA GPU (CUDA)
CUDA version: 13.0
GPU name: NVIDIA GeForce RTX 5070 Ti
GPU count: 1
Total GPU memory: 15.92 GB
Allocated memory: 0.00 GB
Free memory: 15.92 GB
Device: cuda
=== Matplotlib Settings ===
✅ Font: NanumGothic
=== System Info ===
OS: Ubuntu 24.04.3 LTS (Noble Numbat)
Kernel: 6.6.87.2-microsoft-standard-WSL2
Architecture: x86_64
Python: 3.12.3
Working directory: /workspace/ai-deeplearning/tutorial
=== Library Versions ===
NumPy: 2.1.0
Pandas: 3.0.0
Matplotlib: 3.10.7
Scikit-learn: 1.7.2
OpenCV: Not installed → !pip install -q opencv-python
Pillow: 12.0.0
Seaborn: 0.13.2
TensorFlow: Not installed → !pip install -q tensorflow
Transformers: 5.2.0
TorchVision: 0.24.0a0+094e7af5
=== Environment setup completed ===
--------------------------------------------------------------------------------
=== Visualizing Test Plot (Wide View) ===
=== GPU Usage Code Snippet === Device set to: cuda ---------------------------------------- # 아래 코드를 복사해서 모델과 데이터를 GPU로 보내세요: model = YourModel().to(device) data = data.to(device) ---------------------------------------- === Environment setup completed === --------------------------------------------------------------------------------
1. dataset¶
In [25]:
!pip install -q datasets
import datasets
print("datasets version : " +datasets.__version__)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning. datasets version : 2.19.0
In [26]:
!pip install -q ipywidgets
import ipywidgets
print("ipywidgets version : " + ipywidgets.__version__)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning. ipywidgets version : 8.1.8
In [2]:
from datasets import load_dataset
dataset = load_dataset("klue", "ynat", cache_dir="./data/ynat")
print(dataset)
DatasetDict({
train: Dataset({
features: ['guid', 'title', 'label', 'url', 'date'],
num_rows: 45678
})
validation: Dataset({
features: ['guid', 'title', 'label', 'url', 'date'],
num_rows: 9107
})
})
2. tokenizer¶
- 허깅페이스 transformers는 BERT, GPT, RoBERTa 등 트랜스포머 기반 모델과 토크나이저를 쉽게 불러쓸 수 있게 해주는 패키지
- BERT: 정적 마스킹, NSP 태스크 포함, 적은 데이터로 학습.
- RoBERTa: 동적 마스킹, NSP 제거, 더 많은 데이터+더 오래 학습.
In [28]:
!pip install -q transformers
import transformers
print("transformers version : " + transformers.__version__)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning. transformers version : 5.2.0
In [29]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-base")
print("어휘사전 크기:", tokenizer.vocab_size)
print("최대 입력 길이:", tokenizer.model_max_length)
vocab = tokenizer.get_vocab()
print(list(vocab.items())[:10])
어휘사전 크기: 32000
최대 입력 길이: 512
[('소박', 11778), ('##칵', 3351), ('흘렀', 11091), ('끈질기', 21291), ('내쫓', 20015), ('타선', 10872), ('기고', 17073), ('삼계', 19248), ('##의', 2079), ('##MC', 11505)]
In [30]:
import matplotlib.pyplot as plt
lengths = [len(tokenizer(t)["input_ids"]) for t in dataset["train"]["title"]]
print("평균:", sum(lengths)/len(lengths))
print("최대:", max(lengths))
print("최소:", min(lengths))
plt.hist(lengths, bins=30)
plt.show()
평균: 15.392836814221289 최대: 30 최소: 4
In [31]:
from torch.utils.data import Dataset, DataLoader
class GPTDataset(Dataset):
def __init__(self, titles, tokenizer, max_len=30):
self.input_ids = []
for title in titles:
tokens = tokenizer(
title, # 입력 문장
max_length=max_len, # 최대 토큰 수
truncation=True, # 초과시 자르기
padding="max_length", # 부족하면 max_length까지 패딩
return_tensors="pt" # 파이토치 텐서로 반환
)
self.input_ids.append(tokens["input_ids"].squeeze())
def __len__(self):
return len(self.input_ids)
def __getitem__(self, idx):
x = self.input_ids[idx][:-1]
y = self.input_ids[idx][1:]
return x, y
train_dataset = GPTDataset(dataset["train"]["title"], tokenizer)
val_dataset = GPTDataset(dataset["validation"]["title"], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
print("train 배치 수:", len(train_loader))
print("val 배치 수:", len(val_loader))
train 배치 수: 1428 val 배치 수: 285
In [32]:
!pip install torchinfo -q
import torchinfo
print(torchinfo.__version__)
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning. 1.8.0
In [33]:
import torch
import torch.nn as nn
from torchinfo import summary
class GPTDecoder(nn.Module):
def __init__(self, vocab_size, d_model=512, num_heads=8, num_layers=4, max_len=30, dropout=0.1):
super().__init__()
# 토큰 인덱스 → 512차원 벡터로 변환
self.token_embedding = nn.Embedding(vocab_size, d_model)
# 위치(0,1,2...) → 512차원 벡터로 변환
self.pos_embedding = nn.Embedding(max_len, d_model)
# 디코더 레이어 1개 정의 (헤드8개, 드롭아웃)
decoder_layer = nn.TransformerDecoderLayer(d_model=d_model, nhead=num_heads, dropout=dropout, batch_first=True)
# 디코더 레이어 4개 쌓기
self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)
# 512차원 → 어휘사전 크기로 변환 (각 토큰 확률 출력)
self.fc_out = nn.Linear(d_model, vocab_size)
self.d_model = d_model
def forward(self, x):
# 입력 시퀀스 길이 확인
seq_len = x.size(1)
# 위치 인덱스 생성 [0,1,2,...,seq_len-1]
positions = torch.arange(seq_len, device=x.device).unsqueeze(0)
# 토큰 임베딩 + 위치 임베딩 더하기
x = self.token_embedding(x) + self.pos_embedding(positions)
# 미래 토큰 못보게 마스크 생성
mask = nn.Transformer.generate_square_subsequent_mask(seq_len, device=x.device)
# 디코더 통과 (GPT는 인코더 없으니 x를 메모리로도 넘김)
out = self.decoder(x, x, tgt_mask=mask)
# 각 위치에서 다음 토큰 확률 출력
return self.fc_out(out)
model = GPTDecoder(vocab_size=tokenizer.vocab_size)
summary(model, input_size=(1, 29), dtypes=[torch.long])
Out[33]:
=============================================================================================== Layer (type:depth-idx) Output Shape Param # =============================================================================================== GPTDecoder [1, 29, 32000] -- ├─Embedding: 1-1 [1, 29, 512] 16,384,000 ├─Embedding: 1-2 [1, 29, 512] 15,360 ├─TransformerDecoder: 1-3 [1, 29, 512] -- │ └─ModuleList: 2-1 -- -- │ │ └─TransformerDecoderLayer: 3-1 [1, 29, 512] 4,204,032 │ │ └─TransformerDecoderLayer: 3-2 [1, 29, 512] 4,204,032 │ │ └─TransformerDecoderLayer: 3-3 [1, 29, 512] 4,204,032 │ │ └─TransformerDecoderLayer: 3-4 [1, 29, 512] 4,204,032 ├─Linear: 1-4 [1, 29, 32000] 16,416,000 =============================================================================================== Total params: 49,631,488 Trainable params: 49,631,488 Non-trainable params: 0 Total mult-adds (Units.MEGABYTES): 41.23 =============================================================================================== Input size (MB): 0.00 Forward/backward pass size (MB): 11.46 Params size (MB): 164.91 Estimated Total Size (MB): 176.37 ===============================================================================================
In [34]:
import torch.optim as optim
from tqdm import tqdm
optimizer = optim.Adam(model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss(ignore_index=1)
history = {"train_loss": [], "val_loss": []}
def train(model, train_loader, val_loader, optimizer, criterion, epochs=10):
for epoch in range(epochs):
model.train()
total_loss = 0
for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1} 훈련"):
x, y = x.to(device), y.to(device)
optimizer.zero_grad()
output = model(x).view(-1, tokenizer.vocab_size)
loss = criterion(output, y.view(-1))
loss.backward()
optimizer.step()
total_loss += loss.item()
model.eval()
val_loss = 0
with torch.no_grad():
for x, y in tqdm(val_loader, desc=f"Epoch {epoch+1} 검증"):
x, y = x.to(device), y.to(device)
output = model(x).view(-1, tokenizer.vocab_size)
val_loss += criterion(output, y.view(-1)).item()
train_l = total_loss/len(train_loader)
val_l = val_loss/len(val_loader)
history["train_loss"].append(train_l)
history["val_loss"].append(val_l)
print(f"Epoch {epoch+1} | Train Loss: {train_l:.4f} | Val Loss: {val_l:.4f}")
train(model, train_loader, val_loader, optimizer, criterion)
Epoch 1 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.11it/s] Epoch 1 검증: 100%|██████████| 285/285 [00:01<00:00, 215.63it/s]
Epoch 1 | Train Loss: 3.1326 | Val Loss: 1.3673
Epoch 2 훈련: 100%|██████████| 1428/1428 [00:25<00:00, 56.42it/s] Epoch 2 검증: 100%|██████████| 285/285 [00:01<00:00, 200.95it/s]
Epoch 2 | Train Loss: 0.5289 | Val Loss: 0.6701
Epoch 3 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 52.97it/s] Epoch 3 검증: 100%|██████████| 285/285 [00:01<00:00, 200.79it/s]
Epoch 3 | Train Loss: 0.1800 | Val Loss: 0.4438
Epoch 4 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.27it/s] Epoch 4 검증: 100%|██████████| 285/285 [00:01<00:00, 209.30it/s]
Epoch 4 | Train Loss: 0.0709 | Val Loss: 0.3658
Epoch 5 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.16it/s] Epoch 5 검증: 100%|██████████| 285/285 [00:01<00:00, 206.60it/s]
Epoch 5 | Train Loss: 0.0290 | Val Loss: 0.3241
Epoch 6 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.30it/s] Epoch 6 검증: 100%|██████████| 285/285 [00:01<00:00, 203.61it/s]
Epoch 6 | Train Loss: 0.0115 | Val Loss: 0.3091
Epoch 7 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.27it/s] Epoch 7 검증: 100%|██████████| 285/285 [00:01<00:00, 199.93it/s]
Epoch 7 | Train Loss: 0.0028 | Val Loss: 0.3105
Epoch 8 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 53.95it/s] Epoch 8 검증: 100%|██████████| 285/285 [00:01<00:00, 221.30it/s]
Epoch 8 | Train Loss: 0.0011 | Val Loss: 0.3149
Epoch 9 훈련: 100%|██████████| 1428/1428 [00:26<00:00, 54.65it/s] Epoch 9 검증: 100%|██████████| 285/285 [00:01<00:00, 235.70it/s]
Epoch 9 | Train Loss: 0.0007 | Val Loss: 0.3153
Epoch 10 훈련: 100%|██████████| 1428/1428 [00:25<00:00, 55.34it/s] Epoch 10 검증: 100%|██████████| 285/285 [00:01<00:00, 215.60it/s]
Epoch 10 | Train Loss: 0.0006 | Val Loss: 0.3175
In [39]:
import matplotlib.pyplot as plt
import torch
ppl_train = [torch.exp(torch.tensor(l)).item() for l in history["train_loss"]]
ppl_val = [torch.exp(torch.tensor(l)).item() for l in history["val_loss"]]
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
ax1.plot(history["train_loss"], label="Train Loss")
ax1.plot(history["val_loss"], label="Val Loss")
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Loss")
ax1.legend()
ax1.set_title("학습 곡선 (Loss)")
ax1.grid(True)
ax2.plot(ppl_train, label="Train PPL")
ax2.plot(ppl_val, label="Val PPL")
ax2.set_xlabel("Epoch")
ax2.set_ylabel("Perplexity")
ax2.legend()
ax2.set_title("학습 곡선 (PPL)")
ax2.grid(True)
plt.tight_layout()
plt.show()
In [43]:
def generate(model, tokenizer, prompt, max_new_tokens=20, temperature=0.8, top_k=50, device=device):
model.eval()
tokens = tokenizer(prompt, return_tensors="pt")["input_ids"].to(device)
with torch.no_grad():
for _ in range(max_new_tokens):
output = model(tokens[:, -29:])
logits = output[:, -1, :] / temperature
top_k_logits, top_k_idx = torch.topk(logits, top_k)
probs = torch.softmax(top_k_logits, dim=-1)
next_token = top_k_idx[0][torch.multinomial(probs[0], 1)]
tokens = torch.cat([tokens, next_token.view(1, 1)], dim=1)
if next_token.item() == tokenizer.sep_token_id:
break
return tokenizer.decode(tokens[0], skip_special_tokens=True)
print(generate(model, tokenizer, "유튜브"))
유튜브 유튜브 유튜브 유튜브 유튜브
In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2", cache_dir="./data/kogpt2")
model = AutoModelForCausalLM.from_pretrained("skt/kogpt2-base-v2", cache_dir="./data/kogpt2")
print("토크나이저 어휘사전 크기:", tokenizer.vocab_size)
print("모델 구조:", model)
The tied weights mapping and config for this model specifies to tie transformer.wte.weight to lm_head.weight, but both are present in the checkpoints, so we will NOT tie them. You should update the config with `tie_word_embeddings=False` to silence this warning
GPT2LMHeadModel LOAD REPORT from: skt/kogpt2-base-v2
Key | Status | |
----------------------------------------+------------+--+-
transformer.h.{0...11}.attn.masked_bias | UNEXPECTED | |
Notes:
- UNEXPECTED :can be ignored when loading from different task/architecture; not ok if you expect identical arch.
토크나이저 어휘사전 크기: 51200
모델 구조: GPT2LMHeadModel(
(transformer): GPT2Model(
(wte): Embedding(51200, 768)
(wpe): Embedding(1024, 768)
(drop): Dropout(p=0.1, inplace=False)
(h): ModuleList(
(0-11): 12 x GPT2Block(
(ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(attn): GPT2Attention(
(c_attn): Conv1D(nf=2304, nx=768)
(c_proj): Conv1D(nf=768, nx=768)
(attn_dropout): Dropout(p=0.1, inplace=False)
(resid_dropout): Dropout(p=0.1, inplace=False)
)
(ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
(mlp): GPT2MLP(
(c_fc): Conv1D(nf=3072, nx=768)
(c_proj): Conv1D(nf=768, nx=3072)
(act): NewGELUActivation()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
(ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)
(lm_head): Linear(in_features=768, out_features=51200, bias=False)
)
In [45]:
lengths = [len(tokenizer(t)["input_ids"]) for t in dataset["train"]["title"]]
print("평균:", sum(lengths)/len(lengths))
print("최대:", max(lengths))
print("최소:", min(lengths))
import matplotlib.pyplot as plt
plt.hist(lengths, bins=30)
plt.title("토큰 길이 분포")
plt.grid(True)
plt.show()
평균: 26.731621349446122 최대: 47 최소: 4
In [6]:
from torch.utils.data import Dataset, DataLoader
tokenizer.pad_token = tokenizer.eos_token
class GPTDataset(Dataset):
def __init__(self, titles, tokenizer, max_len=47):
self.input_ids = []
self.labels = []
for title in titles:
tokens = tokenizer(
title,
max_length=max_len,
truncation=True,
padding="max_length",
return_tensors="pt"
)
ids = tokens["input_ids"].squeeze()
label = ids.clone()
label[label == tokenizer.pad_token_id] = -100 # 패딩 무시
self.input_ids.append(ids)
self.labels.append(label)
def __len__(self):
return len(self.input_ids)
def __getitem__(self, idx):
return self.input_ids[idx], self.labels[idx]
train_dataset = GPTDataset(dataset["train"]["title"], tokenizer)
val_dataset = GPTDataset(dataset["validation"]["title"], tokenizer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
In [8]:
import torch.optim as optim
from tqdm import tqdm
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-4)
history = {"train_loss": [], "val_loss": []}
def train(model, train_loader, val_loader, optimizer, epochs=5):
for epoch in range(epochs):
model.train()
total_loss = 0
for x, y in tqdm(train_loader, desc=f"Epoch {epoch+1} 훈련"):
x, y = x.to(device), y.to(device)
optimizer.zero_grad()
output = model(x, labels=y)
loss = output.loss
loss.backward()
optimizer.step()
total_loss += loss.item()
model.eval()
val_loss = 0
with torch.no_grad():
for x, y in tqdm(val_loader, desc=f"Epoch {epoch+1} 검증"):
x, y = x.to(device), y.to(device)
output = model(x, labels=y)
val_loss += output.loss.item()
train_l = total_loss/len(train_loader)
val_l = val_loss/len(val_loader)
history["train_loss"].append(train_l)
history["val_loss"].append(val_l)
print(f"Epoch {epoch+1} | Train Loss: {train_l:.4f} | Val Loss: {val_l:.4f}")
train(model, train_loader, val_loader, optimizer)
Epoch 1 훈련: 0%| | 0/1428 [00:00<?, ?it/s]/opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [64,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [65,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [66,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [67,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [68,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [69,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [70,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [71,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [72,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [73,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [74,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [75,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [76,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [77,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [78,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [79,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [80,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [81,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [82,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [83,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [84,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [85,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [86,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [87,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [88,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [89,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [90,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [91,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [92,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [93,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [94,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [95,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [96,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [97,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [98,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [99,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [100,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [101,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [102,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [103,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [104,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [105,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [106,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [107,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [108,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [109,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [110,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [111,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [112,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [113,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [114,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [115,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [116,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [117,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [118,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [119,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [120,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [121,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [122,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [123,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [124,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [125,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [126,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. /opt/pytorch/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [34,0,0], thread: [127,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed. Epoch 1 훈련: 0%| | 0/1428 [00:00<?, ?it/s]
--------------------------------------------------------------------------- AcceleratorError Traceback (most recent call last) Cell In[8], line 36 33 history["val_loss"].append(val_l) 34 print(f"Epoch {epoch+1} | Train Loss: {train_l:.4f} | Val Loss: {val_l:.4f}") ---> 36 train(model, train_loader, val_loader, optimizer) Cell In[8], line 16, in train(model, train_loader, val_loader, optimizer, epochs) 14 x, y = x.to(device), y.to(device) 15 optimizer.zero_grad() ---> 16 output = model(x, labels=y) 17 loss = output.loss 18 loss.backward() File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1775, in Module._wrapped_call_impl(self, *args, **kwargs) 1773 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1774 else: -> 1775 return self._call_impl(*args, **kwargs) File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1786, in Module._call_impl(self, *args, **kwargs) 1781 # If we don't have any hooks, we want to skip the rest of the logic in 1782 # this function, and just call forward. 1783 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1784 or _global_backward_pre_hooks or _global_backward_hooks 1785 or _global_forward_hooks or _global_forward_pre_hooks): -> 1786 return forward_call(*args, **kwargs) 1788 result = None 1789 called_always_called_hooks = set() File /usr/local/lib/python3.12/dist-packages/transformers/models/gpt2/modeling_gpt2.py:759, in GPT2LMHeadModel.forward(self, input_ids, past_key_values, cache_position, attention_mask, token_type_ids, position_ids, inputs_embeds, encoder_hidden_states, encoder_attention_mask, labels, use_cache, output_attentions, output_hidden_states, return_dict, logits_to_keep, **kwargs) 739 r""" 740 input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): 741 `input_ids_length` = `sequence_length` if `past_key_values` is `None` else (...) 755 are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` 756 """ 757 return_dict = return_dict if return_dict is not None else self.config.use_return_dict --> 759 transformer_outputs = self.transformer( 760 input_ids, 761 past_key_values=past_key_values, 762 attention_mask=attention_mask, 763 cache_position=cache_position, 764 token_type_ids=token_type_ids, 765 position_ids=position_ids, 766 inputs_embeds=inputs_embeds, 767 encoder_hidden_states=encoder_hidden_states, 768 encoder_attention_mask=encoder_attention_mask, 769 use_cache=use_cache, 770 output_attentions=output_attentions, 771 output_hidden_states=output_hidden_states, 772 return_dict=return_dict, 773 ) 774 hidden_states = transformer_outputs[0] 776 slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1775, in Module._wrapped_call_impl(self, *args, **kwargs) 1773 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc] 1774 else: -> 1775 return self._call_impl(*args, **kwargs) File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1786, in Module._call_impl(self, *args, **kwargs) 1781 # If we don't have any hooks, we want to skip the rest of the logic in 1782 # this function, and just call forward. 1783 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks 1784 or _global_backward_pre_hooks or _global_backward_hooks 1785 or _global_forward_hooks or _global_forward_pre_hooks): -> 1786 return forward_call(*args, **kwargs) 1788 result = None 1789 called_always_called_hooks = set() File /usr/local/lib/python3.12/dist-packages/transformers/models/gpt2/modeling_gpt2.py:612, in GPT2Model.forward(self, input_ids, past_key_values, cache_position, attention_mask, token_type_ids, position_ids, inputs_embeds, encoder_hidden_states, encoder_attention_mask, use_cache, output_attentions, output_hidden_states, return_dict, **kwargs) 610 if cache_position is None: 611 past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 --> 612 cache_position = torch.arange( 613 past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device 614 ) 615 if position_ids is None: 616 position_ids = cache_position.unsqueeze(0) AcceleratorError: CUDA error: device-side assert triggered Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information. CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1 Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
In [49]:
print("토크나이저 vocab_size:", tokenizer.vocab_size)
print("모델 vocab_size:", model.config.vocab_size)
토크나이저 vocab_size: 51200 모델 vocab_size: 51200
In [50]:
print("pad_token_id:", tokenizer.pad_token_id)
print("eos_token_id:", tokenizer.eos_token_id)
pad_token_id: 51200 eos_token_id: 51200
In [53]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
device = torch.device("cpu")
model = model.to(device)
train(model, train_loader, val_loader, optimizer)
--------------------------------------------------------------------------- AcceleratorError Traceback (most recent call last) Cell In[53], line 5 2 os.environ["CUDA_LAUNCH_BLOCKING"] = "1" 4 device = torch.device("cpu") ----> 5 model = model.to(device) 6 train(model, train_loader, val_loader, optimizer) File /usr/local/lib/python3.12/dist-packages/transformers/modeling_utils.py:3529, in PreTrainedModel.to(self, *args, **kwargs) 3524 if dtype_present_in_args: 3525 raise ValueError( 3526 "You cannot cast a GPTQ model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired" 3527 " `dtype` by passing the correct `dtype` argument." 3528 ) -> 3529 return super().to(*args, **kwargs) File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1371, in Module.to(self, *args, **kwargs) 1368 else: 1369 raise -> 1371 return self._apply(convert) File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:930, in Module._apply(self, fn, recurse) 928 if recurse: 929 for module in self.children(): --> 930 module._apply(fn) 932 def compute_should_use_set_data(tensor, tensor_applied) -> bool: 933 if torch._has_compatible_shallow_copy_type(tensor, tensor_applied): 934 # If the new tensor has compatible tensor type as the existing tensor, 935 # the current behavior is to change the tensor in-place using `.data =`, (...) 940 # global flag to let the user control whether they want the future 941 # behavior of overwriting the existing tensor or not. File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:930, in Module._apply(self, fn, recurse) 928 if recurse: 929 for module in self.children(): --> 930 module._apply(fn) 932 def compute_should_use_set_data(tensor, tensor_applied) -> bool: 933 if torch._has_compatible_shallow_copy_type(tensor, tensor_applied): 934 # If the new tensor has compatible tensor type as the existing tensor, 935 # the current behavior is to change the tensor in-place using `.data =`, (...) 940 # global flag to let the user control whether they want the future 941 # behavior of overwriting the existing tensor or not. File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:957, in Module._apply(self, fn, recurse) 953 # Tensors stored in modules are graph leaves, and we don't want to 954 # track autograd history of `param_applied`, so we have to use 955 # `with torch.no_grad():` 956 with torch.no_grad(): --> 957 param_applied = fn(param) 958 p_should_use_set_data = compute_should_use_set_data(param, param_applied) 960 from torch._subclasses.fake_tensor import FakeTensor File /usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py:1357, in Module.to.<locals>.convert(t) 1350 if convert_to_format is not None and t.dim() in (4, 5): 1351 return t.to( 1352 device, 1353 dtype if t.is_floating_point() or t.is_complex() else None, 1354 non_blocking, 1355 memory_format=convert_to_format, 1356 ) -> 1357 return t.to( 1358 device, 1359 dtype if t.is_floating_point() or t.is_complex() else None, 1360 non_blocking, 1361 ) 1362 except NotImplementedError as e: 1363 if str(e) == "Cannot copy out of meta tensor; no data!": AcceleratorError: CUDA error: device-side assert triggered Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information. CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect. For debugging consider passing CUDA_LAUNCH_BLOCKING=1 Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
In [ ]:
In [ ]: