pip install nltk

!pip install nltk

Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Requirement already satisfied: click in /usr/local/lib/python3.12/dist-packages (from nltk) (8.3.0)
Requirement already satisfied: joblib in /usr/local/lib/python3.12/dist-packages (from nltk) (1.5.2)
Requirement already satisfied: regex>=2021.8.3 in /usr/local/lib/python3.12/dist-packages (from nltk) (2025.9.18)
Requirement already satisfied: tqdm in /usr/local/lib/python3.12/dist-packages (from nltk) (4.67.1)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 12.0 MB/s  0:00:00
Installing collected packages: nltk
Successfully installed nltk-3.9.2
WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.

import nltk
# Fetch the NLTK data packages used by the examples in this notebook.
nltk.download('punkt')                      # tokenization (pickle file format)
nltk.download('punkt_tab')                  # tokenization (tabular file format)
nltk.download('stopwords')                  # stopwords
nltk.download('averaged_perceptron_tagger') # POS tagging
nltk.download('averaged_perceptron_tagger_eng')  # English POS tagging
nltk.download('wordnet')                    # originally annotated as "stemming" — NOTE(review): WordNet is normally used for lemmatization; confirm it is needed here

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!

True

import nltk
from nltk.tokenize import word_tokenize

# Sentences that show how word_tokenize handles different surface forms.
# The e-mail example is restored to a real address: the previous literal was
# a scrape-obfuscation placeholder and did not match the recorded tokens.
tokenize_examples = [
    "I love Python!",              # Example 1: punctuation is split off
    "I don't know",                # Example 2: contraction
    "We're happy",                 # Example 3: contraction (2)
    "Hello, world!",               # Example 4: special characters
    "Price: $100.50",              # Example 5: number + symbol
    "I live in U.S.A.",            # Example 6: abbreviation
    "Email me at test@email.com",  # Example 7: e-mail address
]

# Print each sentence in quotes followed by its token list.
for sentence in tokenize_examples:
    print(f'"{sentence}" :', word_tokenize(sentence))

"I love Python!" : ['I', 'love', 'Python', '!']
"I don't know" : ['I', 'do', "n't", 'know']
"We're happy" : ['We', "'re", 'happy']
"Hello, world!" : ['Hello', ',', 'world', '!']
"Price: $100.50" : ['Price', ':', '$', '100.50']
"I live in U.S.A." : ['I', 'live', 'in', 'U.S.A', '.']
"Email me at test@email.com" : ['Email', 'me', 'at', 'test', '@', 'email.com']

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# English stopword list, kept as a set for fast membership tests
stop_words = set(stopwords.words('english'))
print(f"불용어 개수: {len(stop_words)}")
print(f"불용어 예시: {list(stop_words)[:100]}")


def _show_stopword_removal(sentence):
    """Tokenize the lower-cased sentence and print it before/after filtering.

    Returns a ``(all_tokens, kept_tokens)`` tuple so callers can keep both lists.
    """
    all_tokens = word_tokenize(sentence.lower())
    kept_tokens = [tok for tok in all_tokens if tok not in stop_words]
    print(f"\n원본: {all_tokens}")
    print(f"불용어 제거 후: {kept_tokens}")
    return all_tokens, kept_tokens


# Example 1: before / after stopword removal
text = "I love Python and it is amazing"
tokens, filtered = _show_stopword_removal(text)

# Example 2: the kind of filtering used for document classification / search
text2 = "The cat is on the table"
tokens2, filtered2 = _show_stopword_removal(text2)

불용어 개수: 198
불용어 예시: ["needn't", 'is', 'after', "wasn't", 'because', "it'd", 'during', 'wasn', 'no', 'with', 'be', "won't", 'so', 'above', 'don', 've', 'when', 'again', 'did', 'over', 'where', 'up', "i'm", 'y', 'once', 'have', 'can', 'under', 'such', "shouldn't", 'their', 'should', 'down', "it'll", "i've", 'mightn', 'by', 'were', 'why', "she'll", 'ourselves', 'your', 'haven', 'his', "we'd", 'was', 'will', "should've", 'as', 'needn', "we'll", "haven't", 'ours', 'some', 'weren', 'nor', 'its', 'here', 'about', "they're", "it's", "aren't", 'if', 'just', 'being', 'himself', "you'd", "isn't", "he's", 'her', 'has', 'through', 'between', 'for', 'off', 'at', 'yours', "she'd", 'same', "we're", 'mustn', "i'll", 'doing', 'this', "didn't", 'isn', 'the', "wouldn't", 'having', 'then', 'more', 'doesn', 'very', 'these', "weren't", 'didn', 'now', 'my', "mightn't", 'on']

원본: ['i', 'love', 'python', 'and', 'it', 'is', 'amazing']
불용어 제거 후: ['love', 'python', 'amazing']

원본: ['the', 'cat', 'is', 'on', 'the', 'table']
불용어 제거 후: ['cat', 'table']

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize

# Korean descriptions for the POS tags that appear in the examples below.
# VBZ and VBN were missing, so "jumps" used to print its raw tag as the
# description; tags not listed here still fall back to the raw tag.
pos_dict = {
    'NN': '명사',
    'NNP': '고유명사',
    'NNS': '복수명사',
    'VB': '동사원형',
    'VBP': '현재동사',
    'VBZ': '3인칭 단수 현재동사',
    'VBD': '과거동사',
    'VBG': '현재분사',
    'VBN': '과거분사',
    'JJ': '형용사',
    'RB': '부사',
    'PRP': '대명사',
    'DT': '관사',
    'IN': '전치사',
    'MD': '조동사',
    'CC': '접속사',
    '.': '마침표'
}


def _print_tagged(tagged_tokens):
    """Print one line per (word, tag) pair with the Korean description of the tag."""
    for word, tag in tagged_tokens:
        korean = pos_dict.get(tag, tag)
        print(f'"{word}" → {tag} ({korean})')


# Example 1: basic POS tagging
text = "I love Python programming"
tokens = word_tokenize(text)
tagged = pos_tag(tokens)

print("예시 1:")
_print_tagged(tagged)

# Example 2: the same word used in two different roles
text2 = "They can fish. I have a can."
tokens2 = word_tokenize(text2)
tagged2 = pos_tag(tokens2)

print("\n예시 2 (같은 단어 'can'):")
_print_tagged(tagged2)

# Example 3: a sentence with a wider mix of tags
text3 = "The quick brown fox jumps over the lazy dog"
tokens3 = word_tokenize(text3)
tagged3 = pos_tag(tokens3)

print("\n예시 3:")
_print_tagged(tagged3)

예시 1:
"I" → PRP (대명사)
"love" → VBP (현재동사)
"Python" → NNP (고유명사)
"programming" → NN (명사)

예시 2 (같은 단어 'can'):
"They" → PRP (대명사)
"can" → MD (조동사)
"fish" → VB (동사원형)
"." → . (마침표)
"I" → PRP (대명사)
"have" → VBP (현재동사)
"a" → DT (관사)
"can" → MD (조동사)
"." → . (마침표)

예시 3:
"The" → DT (관사)
"quick" → JJ (형용사)
"brown" → NN (명사)
"fox" → NN (명사)
"jumps" → VBZ (VBZ)
"over" → IN (전치사)
"the" → DT (관사)
"lazy" → JJ (형용사)
"dog" → NN (명사)

import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# One Porter stemmer instance shared by every example
stemmer = PorterStemmer()


def _print_stems(heading, words, lowercase=False):
    """Print ``heading`` and then one ``"word" → "stem"`` line per word.

    When ``lowercase`` is true the word is lower-cased before it is stemmed;
    the word itself is always printed exactly as given.
    """
    print(heading)
    for original in words:
        stem = stemmer.stem(original.lower() if lowercase else original)
        print(f'"{original}" → "{stem}"')


# Example 1: verb forms
words1 = ["running", "runs", "run", "runner"]
_print_stems("예시 1 (동사):", words1)

# Example 2: adjective forms
words2 = ["happy", "happier", "happiest", "happiness"]
_print_stems("\n예시 2 (형용사):", words2)

# Example 3: variations of "study"
words3 = ["studies", "studying", "studied", "study"]
_print_stems("\n예시 3 (study 변형):", words3)

# Example 4: stem every token of a sentence
text = "I am running and jumping quickly"
tokens = word_tokenize(text)
_print_stems("\n예시 4 (문장):", tokens, lowercase=True)

예시 1 (동사):
"running" → "run"
"runs" → "run"
"run" → "run"
"runner" → "runner"

예시 2 (형용사):
"happy" → "happi"
"happier" → "happier"
"happiest" → "happiest"
"happiness" → "happi"

예시 3 (study 변형):
"studies" → "studi"
"studying" → "studi"
"studied" → "studi"
"study" → "studi"

예시 4 (문장):
"I" → "i"
"am" → "am"
"running" → "run"
"and" → "and"
"jumping" → "jump"
"quickly" → "quickli"

import setup_env

--------------------------------------------------------------------------------
=== Hardware Acceleration ===
PyTorch version: 2.9.0a0+145a3a7bda.nv25.10
Using NVIDIA GPU (CUDA)
   CUDA version: 13.0
   GPU name: NVIDIA GeForce RTX 5070 Ti
   GPU count: 1
   Total GPU memory: 15.92 GB
   Allocated memory: 0.00 GB
   Free memory: 15.92 GB
Device: cuda

=== Matplotlib Settings ===
✅ Font: NanumGothic

=== System Info ===
OS: Ubuntu 24.04.3 LTS (Noble Numbat)
    Kernel: 6.6.87.2-microsoft-standard-WSL2
Architecture: x86_64
Python: 3.12.3
Working directory: /workspace/ai-deeplearning/tutorial

=== Library Versions ===
NumPy: 2.1.0
Pandas: 3.0.0
Matplotlib: 3.10.7
Scikit-learn: 1.7.2
OpenCV: Not installed → !pip install -q opencv-python
Pillow: 12.0.0
Seaborn: 0.13.2
TensorFlow: Not installed → !pip install -q tensorflow
Transformers: 4.40.1
TorchVision: 0.24.0a0+094e7af5

=== Environment setup completed ===
--------------------------------------------------------------------------------

=== Visualizing Test Plot (Wide View) ===

=== GPU Usage Code Snippet ===
Device set to: cuda
----------------------------------------
# 아래 코드를 복사해서 모델과 데이터를 GPU로 보내세요:
model = YourModel().to(device)
data = data.to(device)
----------------------------------------

=== Environment setup completed ===
--------------------------------------------------------------------------------

import os

import urllib.request

# Make sure the target directory exists
os.makedirs('./data/ag_news', exist_ok=True)

base_url = "https://raw.githubusercontent.com/mhjabreel/CharCnn_Keras/master/data/ag_news_csv"

# Skip files that were already downloaded; fetch the rest.
for filename in ['train.csv', 'test.csv']:
    filepath = f'./data/ag_news/{filename}'
    if os.path.exists(filepath):
        print(f"✅ {filename} 이미 존재 → 스킵")
        continue

    print(f"⬇️ {filename} 다운로드 중...")
    # Download to a temporary name first so an interrupted transfer is never
    # mistaken for a complete file by the existence check above.
    partial_path = filepath + '.part'
    try:
        urllib.request.urlretrieve(f'{base_url}/{filename}', partial_path)
    except OSError as err:
        # URLError / HTTPError / ContentTooShortError are all OSError subclasses
        if os.path.exists(partial_path):
            os.remove(partial_path)
        print(f"❌ {filename} 다운로드 실패: {err}")
    else:
        os.replace(partial_path, filepath)
        print(f"✅ {filename} 다운로드 완료")

✅ train.csv 이미 존재 → 스킵
✅ test.csv 이미 존재 → 스킵

import pandas as pd

# The CSVs are read with header=None, so the columns are numbered instead of named
df_train = pd.read_csv('./data/ag_news/train.csv', header=None)
df_test  = pd.read_csv('./data/ag_news/test.csv',  header=None)

# DataFrame.shape is (rows, columns), so label it as such rather than as a column count
print("데이터 크기 (행, 열):", df_train.shape)
print("\n샘플 데이터:")
print(df_train.head(3))
# Missing-value count per column
print("\n결측값:", df_train.isnull().sum().tolist())
# Column 0 is treated as the class label here
print("레이블 분포:", df_train[0].value_counts().to_dict())

컬럼 수: (120000, 3)

샘플 데이터:
   0                                                    1  \
0  3    Wall St. Bears Claw Back Into the Black (Reuters)   
1  3  Carlyle Looks Toward Commercial Aerospace (Reuters)   
2  3      Oil and Economy Cloud Stocks' Outlook (Reuters)   

                                                                                                                                                                                                                        2  
0                                                                                                                          Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.  
1  Reuters - Private investment firm Carlyle Group,\which has a reputation for making well-timed and occasionally\controversial plays in the defense industry, has quietly placed\its bets on another part of the market.  
2                                Reuters - Soaring crude prices plus worries\about the economy and the outlook for earnings are expected to\hang over the stock market next week during the depth of the\summer doldrums.  

결측값: [0, 0, 0]
레이블 분포: {3: 30000, 4: 30000, 2: 30000, 1: 30000}

import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

# Reload both splits, naming the three header-less columns
ag_news_columns = ['label', 'title', 'body']
df_train = pd.read_csv('./data/ag_news/train.csv', names=ag_news_columns, header=None)
df_test = pd.read_csv('./data/ag_news/test.csv', names=ag_news_columns, header=None)

def preprocess(title, body):
    """Merge a title and a body into one normalized, lower-case string.

    Missing fields (``None`` or a float NaN) are skipped instead of being
    stringified into the literal words "none" / "nan". Every character other
    than ``a-z``, ``0-9`` and whitespace is replaced by a space, then runs of
    whitespace are collapsed to a single space and the ends are trimmed.

    Args:
        title: Headline; any value accepted by ``str()``.
        body: Article text; any value accepted by ``str()``.

    Returns:
        The cleaned text, or an empty string when nothing usable remains.
    """
    def _is_missing(value):
        # NaN is the only float that compares unequal to itself
        return value is None or (isinstance(value, float) and value != value)

    parts = [str(part) for part in (title, body) if not _is_missing(part)]
    text = ' '.join(parts).lower()
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    return re.sub(r'\s+', ' ', text).strip()

# Build the combined text column and shift the labels down by one for both
# the training and the test frame.
for frame in (df_train, df_test):
    frame['text'] = frame.apply(lambda row: preprocess(row['title'], row['body']), axis=1)
    frame['label'] = frame['label'] - 1

# Hold out 20% of the training data for validation, stratified by label
df_tr, df_val = train_test_split(
    df_train,
    test_size=0.2,
    stratify=df_train['label'],
    random_state=42,
)

# Display names, indexed by the shifted label value
label_names = ['World', 'Sports', 'Business', 'Sci/Tech']

n_train, n_val, n_test = len(df_tr), len(df_val), len(df_test)
print(f"훈련셋:   {n_train:,}개")
print(f"검증셋:   {n_val:,}개")
print(f"테스트셋: {n_test:,}개")

# Splits and their plot colours, shared by all three figures below
splits = {'Train': df_tr, 'Val': df_val, 'Test': df_test}
colors = ['#4C72B0', '#DD8452', '#55A868']

# ── 1. Label distribution per split ──
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, color, (split_name, frame) in zip(axes, colors, splits.items()):
    per_label = frame['label'].value_counts().sort_index()
    class_names = [label_names[idx] for idx in per_label.index]
    ax.bar(class_names, per_label.values, color=color, edgecolor='white')
    ax.set_title(f'{split_name} ({len(frame):,}개)')
    ax.set_ylabel('개수')
    # Write the exact count just above each bar
    for bar_pos, count in enumerate(per_label.values):
        ax.text(bar_pos, count + 100, str(count), ha='center', fontweight='bold', fontsize=9)

plt.suptitle('데이터셋 분할 및 레이블 분포', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

# ── 2. Text length distribution ──
# Length is the number of whitespace-separated words in the cleaned text
for frame in splits.values():
    frame['text_len'] = frame['text'].apply(lambda doc: len(doc.split()))

fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for ax, color, (split_name, frame) in zip(axes, colors, splits.items()):
    lengths = frame['text_len']
    mean_len = lengths.mean()
    ax.hist(lengths, bins=50, color=color, edgecolor='white')
    # Dashed red line marks the mean length
    ax.axvline(mean_len, color='red', linestyle='--', label=f'평균: {mean_len:.0f}')
    ax.set_title(f'{split_name} 텍스트 길이 분포')
    ax.set_xlabel('단어 수')
    ax.set_ylabel('빈도')
    ax.legend()

plt.suptitle('텍스트 길이 분포', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.show()

# ── 3. Pie chart of split proportions ──
sizes = [len(frame) for frame in splits.values()]
labels = [f'{split_name}\n{len(frame):,}개' for split_name, frame in splits.items()]

fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    sizes,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    startangle=90,
    wedgeprops=dict(edgecolor='white', linewidth=2),
)
ax.set_title('데이터셋 분할 비율', fontsize=13, fontweight='bold')
plt.show()

훈련셋:   96,000개
검증셋:   24,000개
테스트셋: 7,600개

연도	모델	핵심 아이디어	한계	인간 기준 돌파
1986	RNN	입력+이전hidden을 같이 계산해서 순차처리	긴 문장 앞부분 hidden이 희석됨	-
1997	LSTM	hidden(단기기억)+cell(장기기억)으로 분리, 게이트로 뭘 기억할지 선택	여전히 순차처리, 느림	-
2013	Word2Vec	단어를 의미있는 벡터로 변환 (왕-남자+여자=여왕)	문맥 무시, 같은 단어=같은 벡터	-
2014	GloVe	Word2Vec 개선, 단어 동시출현 통계 활용	여전히 정적임베딩 한계	-
2014	Seq2Seq	LSTM 인코더(압축)+디코더(복원) 구조로 번역	마지막 hidden 1개만 디코더에 전달, 정보손실	-
2015	Attention	인코더의 모든 hidden을 저장 후 디코더가 중요한 hidden에 집중	RNN 기반이라 여전히 순차처리 느림	-
2016	FastText	부분단어(subword) 단위 학습, 처음보는 단어도 이해	여전히 정적임베딩 한계	-
2017	Transformer	Attention만으로 순차처리 제거, 모든 단어 병렬처리	데이터/연산량 많이 필요	-
2018	BERT	Transformer로 양방향 사전학습, 문맥임베딩(동적임베딩) 등장	-	독해(SQuAD) 인간 91.2% 초월
2019	RoBERTa 등	BERT 개선	-	문장분류(GLUE) 87점 초월
2020	GPT-3	초거대 Transformer, 다음단어맞추기로 사전학습, few-shot 학습	엄청난 컴퓨팅 비용	번역(BLEU) 인간번역 수준 도달
2021+	GPT-4 등	멀티모달, 초거대화	긴 문맥/상식/추론은 여전히 인간 우위 영역 존재	상식추론 ~90% 초월

Step	모델	임베딩	임베딩 학습 여부	비고
0	MLP	원핫인코딩	-	기초 베이스라인
1	CNN	TF-IDF	-	단어 중요도 반영
2	RNN	Word2Vec	고정 (freeze)	순서 처리 시작
3	LSTM	GloVe	고정 (pre-trained)	장기기억 추가
4	Attention	FastText	학습 중 업데이트 (fine-tune)	중요 단어 집중 + 부분단어 학습
5	Transformer	학습가능 임베딩	처음부터 학습	소형 구조, 오버핏 감수 (개념 시연)
6	BERT	문맥임베딩 (사전학습)	파인튜닝	최종 최강 모델

Step	모델	태스크	임베딩	비고
0	Seq2Seq (LSTM)	번역/요약	GloVe 고정	인코더-디코더 기초
1	Seq2Seq + Attention	번역/요약	GloVe fine-tune	중요 단어 집중
2	오토인코더	문장 압축/복원	학습가능 임베딩	비지도학습
3	Transformer (풀버전)	번역	학습가능 임베딩	인코더+디코더 풀구조
4	GPT	텍스트 생성	문맥임베딩 (사전학습)	디코더만, 파인튜닝

NLTK

설치

주요 기능

1. 토큰화

2. 불용어

3. 품사(POS, Part of Speech): 단어의 문법적 역할

4. 어간

시리즈 1 - 텍스트 분류 (AG News)

시리즈 2 - 생성/시퀀스 모델 (별도 튜토리얼)

AG News Dataset

개요

데이터셋 구성

카테고리 (4개 클래스)