import setup_env
from setup_env import device

--------------------------------------------------------------------------------
=== Hardware Acceleration ===
PyTorch version: 2.9.0a0+145a3a7bda.nv25.10
Using NVIDIA GPU (CUDA)
   CUDA version: 13.0
   GPU name: NVIDIA GeForce RTX 5070 Ti
   GPU count: 1
   Total GPU memory: 15.92 GB
   Allocated memory: 0.00 GB
   Free memory: 15.92 GB
Device: cuda

=== Matplotlib Settings ===
✅ Font: NanumGothic

=== System Info ===
OS: Ubuntu 24.04.3 LTS (Noble Numbat)
    Kernel: 6.6.87.2-microsoft-standard-WSL2
Architecture: x86_64
Python: 3.12.3
Working directory: /workspace/ai-deeplearning/tutorial

=== Library Versions ===
NumPy: 2.1.0
Pandas: 3.0.0
Matplotlib: 3.10.7
Scikit-learn: 1.7.2
OpenCV: Not installed → !pip install -q opencv-python
Pillow: 12.0.0
Seaborn: 0.13.2
TensorFlow: Not installed → !pip install -q tensorflow
Transformers: 5.2.0
TorchVision: 0.24.0a0+094e7af5

=== Environment setup completed ===
--------------------------------------------------------------------------------

=== Visualizing Test Plot (Wide View) ===

=== GPU Usage Code Snippet ===
Device set to: cuda
----------------------------------------
# 아래 코드를 복사해서 모델과 데이터를 GPU로 보내세요:
model = YourModel().to(device)
data = data.to(device)
----------------------------------------

=== Environment setup completed ===
--------------------------------------------------------------------------------

from datasets import load_dataset

# Fetch the "en-ko" configuration of the "opus100" dataset via the `datasets` loader.
ds = load_dataset(path="opus100", name="en-ko")

import os
from datasets import load_dataset

# Local directory that holds the on-disk copy of the dataset.
data_dir = "./data/opus100_en_ko"

# Download and persist the dataset only when the directory is not there yet;
# otherwise report that the step is skipped.
if not os.path.exists(data_dir):
    print("다운로드 중...")
    ds = load_dataset("opus100", "en-ko")
    ds.save_to_disk(data_dir)
    print("완료:", data_dir)
else:
    print("이미 존재합니다. 스킵합니다.")

이미 존재합니다. 스킵합니다.

from datasets import load_from_disk

# Read the dataset back from its local directory.
ds = load_from_disk("./data/opus100_en_ko")

# Show the split layout (heading line first, then the dataset repr).
print("데이터셋 구조:", ds, sep="\n")

# Print the first three training pairs, each followed by a blank line.
print("\n--- train 샘플 3개 ---")
train_split = ds["train"]
for row_idx in range(3):
    translation = train_split[row_idx]["translation"]
    print(f"EN: {translation['en']}\nKO: {translation['ko']}\n")

데이터셋 구조:
DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

--- train 샘플 3개 ---
EN: They're shaped like a bus.
KO: 할머니처럼 만들었지만.. ? 엉망이지만..

EN: I ain't fishing' 'em out.
KO: 그거 꺼내려다가는

EN: You are torturing god's creatures in an age where we have the technology that no longer requires us to.
KO: 선생님은 이 기술력이 있는 시대에 그러지 않아도 되는데도 신의 피조물을 괴롭히고 있다고요

from transformers import pipeline

# Build a translation pipeline around the distilled 600M NLLB-200 checkpoint.
translator = pipeline(
    task="translation",
    model="facebook/nllb-200-distilled-600M",
)

# Left as a bare final expression so the notebook displays the result.
translator("Hello, how are you?", src_lang="eng_Latn", tgt_lang="kor_Hang")

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import os

# Local directory that holds the saved tokenizer and model files.
model_dir = "./data/nllb-200-distilled-600M"

# Fetch the checkpoint from the hub and store it locally only on the first run.
if not os.path.exists(model_dir):
    print("다운로드 중...")
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    model_nllb = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
    tokenizer.save_pretrained(model_dir)
    model_nllb.save_pretrained(model_dir)
    print("완료:", model_dir)
else:
    print("이미 존재합니다. 스킵합니다.")

이미 존재합니다. 스킵합니다.

# 1. Load the model and tokenizer from the local checkpoint directory
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

model_dir = "./data/nllb-200-distilled-600M"
tokenizer_nllb = AutoTokenizer.from_pretrained(model_dir)

# Instantiate the seq2seq model, then move it to the selected device.
model_nllb = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
model_nllb = model_nllb.to(device)

# 2. Translation helper
def translate_nllb(sentence, src_lang="eng_Latn", tgt_lang="kor_Hang"):
    """Translate *sentence* with the module-level NLLB model and tokenizer.

    Args:
        sentence: Text to translate.
        src_lang: NLLB language code of the input text. It is applied to
            ``tokenizer_nllb.src_lang`` before tokenizing, so the tokenizer
            keeps this value after the call.
        tgt_lang: NLLB language code of the desired output; its token id is
            passed to ``generate`` as ``forced_bos_token_id``.

    Returns:
        The decoded translation of the first generated sequence, with
        special tokens stripped.
    """
    # Make the source-language argument effective; previously it was ignored
    # and whatever language the tokenizer currently held was used.
    tokenizer_nllb.src_lang = src_lang
    inputs = tokenizer_nllb(sentence, return_tensors="pt").to(device)

    # Look the language tag up through the vocabulary, the same way the
    # evaluation loops in this notebook do, instead of `lang_code_to_id`.
    target_bos_id = tokenizer_nllb.convert_tokens_to_ids(tgt_lang)

    translated = model_nllb.generate(
        **inputs,
        forced_bos_token_id=target_bos_id,
        max_length=200,
    )
    return tokenizer_nllb.decode(translated[0], skip_special_tokens=True)

from torch.utils.data import Dataset, DataLoader

class NLLBDataset(Dataset):
    """Map-style dataset that tokenizes translation pairs on access.

    Every record of ``data`` must be indexable as
    ``record["translation"]["en"]`` (source text) and
    ``record["translation"]["ko"]`` (target text).

    Args:
        data: Indexable, sized collection of translation records.
        tokenizer: Callable tokenizer exposing a writable ``src_lang``
            attribute and a ``pad_token_id``.
        src_lang: Language code set on the tokenizer while encoding the source.
        tgt_lang: Language code set on the tokenizer while encoding the target.
        max_len: Fixed length every sequence is truncated / padded to.
    """

    def __init__(self, data, tokenizer, src_lang="eng_Latn", tgt_lang="kor_Hang", max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.max_len = max_len

    def __len__(self):
        """Return the number of translation records."""
        return len(self.data)

    def _encode(self, text, lang):
        """Tokenize *text* to ``max_len`` with the tokenizer language set to *lang*."""
        self.tokenizer.src_lang = lang
        return self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_tensors="pt",
        )

    def __getitem__(self, idx):
        """Return the tokenized pair at *idx*.

        Returns:
            dict with 1-D tensors ``input_ids``, ``attention_mask`` and
            ``labels``; padding positions in ``labels`` are replaced by -100.
        """
        pair = self.data[idx]["translation"]

        # The tokenizer is shared with the rest of the notebook, so remember
        # its language and put it back afterwards instead of leaving it
        # switched to the target language.
        previous_lang = self.tokenizer.src_lang
        try:
            inputs = self._encode(pair["en"], self.src_lang)
            # The target is encoded by temporarily treating tgt_lang as the
            # tokenizer's source language.
            labels = self._encode(pair["ko"], self.tgt_lang)
        finally:
            self.tokenizer.src_lang = previous_lang

        # squeeze(0) drops only the batch dimension; a bare squeeze() would
        # also collapse the sequence dimension when max_len == 1.
        label_ids = labels["input_ids"].squeeze(0)
        label_ids = label_ids.masked_fill(label_ids == self.tokenizer.pad_token_id, -100)
        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": label_ids,
        }

# Rebuild the datasets and their DataLoaders.
# Training uses the first 300,000 rows of the train split; validation uses the full split.
train_nllb = NLLBDataset(
    data=ds["train"].select(range(300_000)),
    tokenizer=tokenizer_nllb,
)
valid_nllb = NLLBDataset(data=ds["validation"], tokenizer=tokenizer_nllb)

# Only the training loader shuffles; both use batches of 16 and 4 worker processes.
train_loader_nllb = DataLoader(
    dataset=train_nllb,
    batch_size=16,
    shuffle=True,
    num_workers=4,
)
valid_loader_nllb = DataLoader(
    dataset=valid_nllb,
    batch_size=16,
    shuffle=False,
    num_workers=4,
)

import torch
from tqdm.auto import tqdm

# 1. Put the model in evaluation mode before generating.
model_nllb.eval()

all_sources = []
all_predictions = []
all_targets = []

# Id of the Korean language tag that every generated sequence is forced to start with.
# (Looked up through convert_tokens_to_ids; the original note says this method
# was chosen to resolve an error.)
korean_bos_id = tokenizer_nllb.convert_tokens_to_ids("kor_Hang")

# 3. Run generation without tracking gradients.
with torch.no_grad():
    # Iterate over the validation loader batch by batch.
    for batch in tqdm(valid_loader_nllb, desc="Pre-trained 모델 측정 중"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Generate translations for the whole batch.
        generated_tokens = model_nllb.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            forced_bos_token_id=korean_bos_id,
            max_length=200
        )

        # Decode the source sentences and the model output back to text.
        decoded_sources = tokenizer_nllb.batch_decode(input_ids, skip_special_tokens=True)
        decoded_preds = tokenizer_nllb.batch_decode(generated_tokens, skip_special_tokens=True)

        # Swap the -100 placeholders back to the pad id so the references can be decoded.
        labels = labels.masked_fill(labels == -100, tokenizer_nllb.pad_token_id)
        decoded_labels = tokenizer_nllb.batch_decode(labels, skip_special_tokens=True)

        all_sources.extend(decoded_sources)
        all_predictions.extend(decoded_preds)
        all_targets.extend(decoded_labels)

# Show the first five results.
print("\n=== Pre-trained (Base) 결과 샘플 ===")
for sample_idx in range(5):
    print(f"원본(EN): {all_sources[sample_idx]}")
    print(f"정답(KO): {all_targets[sample_idx]}")
    print(f"모델(KO): {all_predictions[sample_idx]}")
    print("-" * 30)

Pre-trained 모델 측정 중: 100%|██████████| 16/16 [12:21<00:00, 46.32s/it]

=== Pre-trained (Base) 결과 샘플 ===
원본(EN): Yeah, a lot of it.
정답(KO): 네, 무척요.
모델(KO): - 그래, 많이요
------------------------------
원본(EN): I'll set up some tests. Shep,
정답(KO): 날 뚫어지게 쳐다보는데 그만 해요
모델(KO): 몇 가지 테스트를 할게요
------------------------------
원본(EN): Look, I don't like it any more than you do, but if you help me, I promise to keep you safe.
정답(KO): 이봐 나도 너만큼 안 내켜 그래도 날 도우면 내가 보호해주지
모델(KO): 난 너보다 싫어하지만 네가 도와준다면 널 안전하게 지켜줄게
------------------------------
원본(EN): Like, what does that even mean?
정답(KO): 뭔 뜻이야?
모델(KO): 그게 무슨 뜻일까요?
------------------------------
원본(EN): She becomes the story.
정답(KO): 리즈가 영웅이 되고 있어요
모델(KO): 그녀는 이야기가 될 것입니다.
------------------------------

import torch
from tqdm.auto import tqdm
from nltk.translate.bleu_score import corpus_bleu

# Evaluate the model on the validation loader and collect decoded text.
model_nllb.eval()
all_sources, all_predictions, all_targets = [], [], []
tgt_id = tokenizer_nllb.convert_tokens_to_ids("kor_Hang")

with torch.no_grad():
    for batch in tqdm(valid_loader_nllb, desc="Baseline 측정"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        gen = model_nllb.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            forced_bos_token_id=tgt_id,
            max_length=200,
        )

        all_sources.extend(tokenizer_nllb.batch_decode(input_ids, skip_special_tokens=True))
        all_predictions.extend(tokenizer_nllb.batch_decode(gen, skip_special_tokens=True))

        # Work on a copy of the labels and turn -100 back into the pad id before decoding.
        labels = batch['labels'].clone()
        labels[labels == -100] = tokenizer_nllb.pad_token_id
        all_targets.extend(tokenizer_nllb.batch_decode(labels, skip_special_tokens=True))

# Corpus BLEU over whitespace-split tokens (one reference per hypothesis), scaled to 0-100.
references = [[target.split()] for target in all_targets]
hypotheses = [prediction.split() for prediction in all_predictions]
score = corpus_bleu(references, hypotheses) * 100
print(f"\nBaseline BLEU Score: {score:.2f}")

# Print the first five source / reference / prediction triples.
for i in range(5):
    print(f"EN: {all_sources[i]}\nKO 정답: {all_targets[i]}\nKO 모델: {all_predictions[i]}\n{'-'*30}")

Baseline 측정: 100%|██████████| 16/16 [12:07<00:00, 45.45s/it]

Baseline BLEU Score: 0.88
EN: Yeah, a lot of it.
KO 정답: 네, 무척요.
KO 모델: - 그래, 많이요
------------------------------
EN: I'll set up some tests. Shep,
KO 정답: 날 뚫어지게 쳐다보는데 그만 해요
KO 모델: 몇 가지 테스트를 할게요
------------------------------
EN: Look, I don't like it any more than you do, but if you help me, I promise to keep you safe.
KO 정답: 이봐 나도 너만큼 안 내켜 그래도 날 도우면 내가 보호해주지
KO 모델: 난 너보다 싫어하지만 네가 도와준다면 널 안전하게 지켜줄게
------------------------------
EN: Like, what does that even mean?
KO 정답: 뭔 뜻이야?
KO 모델: 그게 무슨 뜻일까요?
------------------------------
EN: She becomes the story.
KO 정답: 리즈가 영웅이 되고 있어요
KO 모델: 그녀는 이야기가 될 것입니다.
------------------------------

from peft import LoraConfig, get_peft_model, TaskType

# LoRA settings for the seq2seq translation model.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    bias="none",
    # Author's rationale: with 100k+ training pairs a high rank can be used well.
    r=32,
    # Kept at alpha = 2 * r.
    lora_alpha=64,
    # Low dropout because training data is plentiful.
    lora_dropout=0.05,
    # Adapt every attention projection plus the feed-forward layers
    # (noted by the author as key to translation quality).
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "out_proj",
        "fc1",
        "fc2",
    ],
)

# Wrap the model with the adapters and report how many parameters are trainable.
model_nllb = get_peft_model(model_nllb, peft_config)
model_nllb.print_trainable_parameters()

trainable params: 17,301,504 || all params: 632,375,296 || trainable%: 2.7360

import math
import os
import time
from datetime import timedelta

import torch
import torch.optim as optim
from transformers import get_cosine_schedule_with_warmup
from tqdm.auto import tqdm

# Single pass over train_loader_nllb with gradient accumulation, followed by
# saving the adapter and tokenizer.
torch.cuda.empty_cache()
accumulation_steps = 4

optimizer = optim.AdamW(
    model_nllb.parameters(),
    lr=5e-5,
    weight_decay=0.01
)

# One optimizer step per accumulation group. Round up so that a trailing
# partial group (loader length not divisible by accumulation_steps) is counted;
# flooring here left the scheduler one step short and the last micro-batches
# without an optimizer step.
num_batches = len(train_loader_nllb)
total_steps = math.ceil(num_batches / accumulation_steps)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(total_steps * 0.06),
    num_training_steps=total_steps
)

model_nllb.train()
progress_bar = tqdm(train_loader_nllb, desc="LoRA Training")
optimizer.zero_grad()

total_start = time.time()
step_start = time.time()

for i, batch in enumerate(progress_bar):
    input_ids      = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels         = batch['labels'].to(device)

    outputs = model_nllb(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    # Scale so the gradients summed over a full group average the micro-batches.
    loss = outputs.loss / accumulation_steps
    loss.backward()

    group_complete = (i + 1) % accumulation_steps == 0
    is_last_batch = (i + 1) == num_batches
    # Step on every full group, and also on the final batch so leftover
    # gradients from a partial group are applied rather than discarded.
    # (A partial group keeps the same 1/accumulation_steps scaling, so its
    # update is proportionally smaller.)
    if group_complete or is_last_batch:
        torch.nn.utils.clip_grad_norm_(model_nllb.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        step_elapsed  = time.time() - step_start
        total_elapsed = time.time() - total_start
        progress_bar.set_postfix({
            # Loss of the most recent micro-batch, un-scaled for display.
            'loss'     : f"{loss.item() * accumulation_steps:.4f}",
            'lr'       : f"{scheduler.get_last_lr()[0]:.2e}",
            'step_time': f"{step_elapsed:.2f}s",
            'total'    : str(timedelta(seconds=int(total_elapsed)))
        })
        step_start = time.time()

total_elapsed = time.time() - total_start
print(f"\n✅ 학습 완료 | 총 시간: {timedelta(seconds=int(total_elapsed))}")

# Save the model and tokenizer
save_path = "./data/lora_nllb_model"
os.makedirs(save_path, exist_ok=True)

save_start = time.time()
model_nllb.save_pretrained(save_path)
tokenizer_nllb.save_pretrained(save_path)
print(f"💾 저장 완료: {save_path} ({time.time()-save_start:.2f}s)")

LoRA Training: 100%|██████████| 18750/18750 [3:27:14<00:00,  1.51it/s, loss=3.0595, lr=0.00e+00, step_time=2.67s, total=3:27:12]

✅ 학습 완료 | 총 시간: 3:27:14
💾 저장 완료: ./data/lora_nllb_model (0.69s)

from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# `base_model` was referenced here without ever being defined in this notebook.
# Load the untouched base checkpoint from its local directory first, then
# attach the saved LoRA adapter on top of it.
base_model = AutoModelForSeq2SeqLM.from_pretrained("./data/nllb-200-distilled-600M")
model_nllb = PeftModel.from_pretrained(base_model, "./data/lora_nllb_model")

# Move the combined model to the selected device, since later cells send
# their input tensors there.
model_nllb = model_nllb.to(device)

tokenizer_nllb = AutoTokenizer.from_pretrained("./data/lora_nllb_model")

import torch
from tqdm.auto import tqdm
from nltk.translate.bleu_score import corpus_bleu

# Evaluate the LoRA fine-tuned model on the validation loader.
# The progress-bar and score labels used to say "Baseline" (copied from the
# pre-fine-tuning cell), which made the two results indistinguishable.
model_nllb.eval()
all_sources, all_predictions, all_targets = [], [], []
tgt_id = tokenizer_nllb.convert_tokens_to_ids("kor_Hang")

with torch.no_grad():
    for batch in tqdm(valid_loader_nllb, desc="LoRA 측정"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        gen = model_nllb.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            forced_bos_token_id=tgt_id,
            max_length=200,
        )

        all_sources.extend(tokenizer_nllb.batch_decode(input_ids, skip_special_tokens=True))
        all_predictions.extend(tokenizer_nllb.batch_decode(gen, skip_special_tokens=True))

        # Work on a copy and turn -100 back into the pad id so the references decode cleanly.
        labels = batch['labels'].clone()
        labels[labels == -100] = tokenizer_nllb.pad_token_id
        all_targets.extend(tokenizer_nllb.batch_decode(labels, skip_special_tokens=True))

# Corpus BLEU over whitespace-split tokens (one reference per hypothesis), scaled to 0-100.
score = corpus_bleu([[t.split()] for t in all_targets], [p.split() for p in all_predictions]) * 100
print(f"\nLoRA BLEU Score: {score:.2f}")

# Print up to five samples; the bound avoids an IndexError on tiny evaluation sets.
for i in range(min(5, len(all_predictions))):
    print(f"EN: {all_sources[i]}\nKO 정답: {all_targets[i]}\nKO 모델: {all_predictions[i]}\n{'-'*30}")

Baseline 측정: 100%|██████████| 125/125 [01:43<00:00,  1.20it/s]

Baseline BLEU Score: 1.85
EN: Yeah, a lot of it.
KO 정답: 네, 무척요.
KO 모델: 그래, 많이요
------------------------------
EN: I'll set up some tests. Shep,
KO 정답: 날 뚫어지게 쳐다보는데 그만 해요
KO 모델: 검사 좀 할게요
------------------------------
EN: Look, I don't like it any more than you do, but if you help me, I promise to keep you safe.
KO 정답: 이봐 나도 너만큼 안 내켜 그래도 날 도우면 내가 보호해주지
KO 모델: 난 너보다 싫어하지만 네가 도와준다면 널 지켜줄게
------------------------------
EN: Like, what does that even mean?
KO 정답: 뭔 뜻이야?
KO 모델: 그게 무슨 뜻인지?
------------------------------
EN: She becomes the story.
KO 정답: 리즈가 영웅이 되고 있어요
KO 모델: 그녀는 이야기로 변한다
------------------------------

!pip install -q --upgrade transformers peft accelerate

WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.

# pip install peft
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForSeq2SeqLM

# 1. LoRA settings
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,                    # rank
    lora_alpha=32,           # scaling
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"]  # apply only to the attention Q and V projections
)

# 2. Apply LoRA to the model
# This cell previously relied on `model_nllb` already existing and raised
# NameError when run in a fresh kernel. Load the base checkpoint from its
# local directory here so the cell works on its own and wraps a model that
# carries no earlier adapter.
# NOTE(review): the model is left on its default device; move it with
# `.to(device)` before training or generation.
base_model_nllb = AutoModelForSeq2SeqLM.from_pretrained("./data/nllb-200-distilled-600M")
model_lora = get_peft_model(base_model_nllb, lora_config)
model_lora.print_trainable_parameters()  # show the trainable parameter count

/usr/local/lib/python3.12/dist-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
Cell In[1], line 14
      5 lora_config = LoraConfig(
      6     task_type=TaskType.SEQ_2_SEQ_LM,
      7     r=16,                    # rank
   (...)     10     target_modules=["q_proj", "v_proj"]  # Attention Q, V에만 적용
     11 )
     13 # 2. 모델에 LoRA 적용
---> 14 model_lora = get_peft_model(model_nllb, lora_config)
     15 model_lora.print_trainable_parameters()  # 학습 파라미터 확인

NameError: name 'model_nllb' is not defined

	예시
번역	"나는 밥을 먹었다" → "I ate rice"
요약	긴 문장 → 짧은 문장
챗봇	질문 → 대답

연도	기술	핵심 방식	한계
~2007	규칙 기반 (SYSTRAN)	언어학자가 문법 규칙 수작업	관용어, 문맥, 예외 처리 불가
2007~2016	통계 기반 (SMT)	대용량 병렬 코퍼스에서 확률 학습	구절 단위 번역 → 문맥 유실
2016.11	신경망 번역 (GNMT)	Seq2Seq + Attention, 문장 전체 처리	LSTM 순차처리 → 병렬화 불가, 긴 문장 약함
2017	Transformer	Attention Is All You Need, LSTM 제거	추론 속도 문제

컬럼	예시
`translation.en`	"I ate rice with a friend yesterday."
`translation.ko`	"나는 어제 친구와 밥을 먹었다."

	내용
지원 언어	200개
모델 크기	600M / 1.3B / 3.3B
구조	Transformer 인코더-디코더
학습 데이터	850억 개 문장 쌍

모델	출시	지원 언어	영→한 품질
GNMT (Google)	2016	103개	보통
M2M-100 (Meta)	2020	100개	좋음
NLLB-200 (Meta)	2022	200개	최고

Seq2Seq (Sequence-to-Sequence)

opus100 데이터셋

데이터 구조

데이터 크기

검증 방식

왜 영→한인가?

NLLB-200 (No Language Left Behind)

특징

기존 모델과 비교

핵심 차별점

Fine-tuning 방법 비교

LoRA (Low-Rank Adaptation)

🚀 LoRA (Low-Rank Adaptation) 요약

1. 정체 및 개발자

2. 작동 원리 (핵심 알고리즘)

3. 주요 특징

방법	학습 파라미터	학습 속도	메모리 사용량	예상 성능
Pre-trained (Base)	0% (없음)	N/A (즉시 실행)	최소 (추론만)	기준점 (Baseline)
Full Fine-tuning	100%	느림	매우 많음	최고
Last N Layers	일부 (상위층)	보통	보통	좋음
LoRA	1~5%	빠름	적음	Full FT와 거의 동등

split	문장 쌍 수
train	1,000,000
validation	2,000
test	2,000