import setup_env
from setup_env import device

!pip install chromadb langchain langchain-community -q

import chromadb
import langchain
print(f"chromadb: {chromadb.__version__}")
print(f"langchain: {langchain.__version__}")

WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
chromadb: 1.5.1
langchain: 1.2.10

import chromadb

# Location of the persistent store and the collection name used by this cell.
db_dir = "./chromadb"
collection_name = "rag_collection"

# Build a persistent client rooted at db_dir, then get (or create) the collection.
client = chromadb.PersistentClient(path=db_dir)
collection = client.get_or_create_collection(name=collection_name)

# Print the collection handle as a quick sanity check.
print(collection)

Collection(name=rag_collection)

!pip install python-docx pypdf openpyxl python-pptx -q

import docx, pypdf, openpyxl, pptx
print(f"python-docx: {docx.__version__}")
print(f"pypdf: {pypdf.__version__}")
print(f"openpyxl: {openpyxl.__version__}")
print(f"python-pptx: {pptx.__version__}")

WARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager, possibly rendering your system unusable. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv. Use the --root-user-action option if you know what you are doing and want to suppress this warning.
python-docx: 1.2.0
pypdf: 6.7.1
openpyxl: 3.1.5
python-pptx: 1.0.2

import time  # no longer used by this cell; kept so the cell's imports are unchanged
from docx import Document

# File name of the document being ingested. It doubles as the record id so
# that re-running this cell overwrites the existing record instead of adding
# another copy (a fresh timestamp id made `upsert` behave like a plain insert).
source_name = "02.식당리스트.docx"

# Read the .docx and join its non-blank paragraphs into one newline-separated string.
doc = Document(f"./chromadb/data/{source_name}")
text = "\n".join(para.text for para in doc.paragraphs if para.text.strip())

# Store the whole document as a single record tagged with its source and category.
collection.upsert(
    documents=[text],
    ids=[source_name],
    metadatas=[{"source": source_name, "category": "식당"}],
)
print("저장 완료")

# Preview the first 200 characters of what was stored.
print(text[:200])

저장 완료
1.식사 안내
코로나 이후 연세대 교내 지침이 강화되어 건물 내 음식물 반입 및 취식 불가하므로 외부 식당만 이용 가능합니다. 이 점 널리 양해해 주시고, 코로나 확산 방지를 위해 해당사항 꼭 준수 부탁드립니다.
2. 식사 이용안내
1) 이용방법: 금요일 저녁 및 토요일 점심 식사 가능(식당에 비치된 정보대학원 장부에 과정명, 성명 기재하고 주문)
2) 1

import time
from docx import Document

source_name = "02.식당리스트.docx"

# Drop every record previously stored for this file so the per-line records
# below replace the earlier version.
collection.delete(where={"source": source_name})

# Read the .docx and keep each non-blank paragraph, stripped, as its own entry.
doc = Document(f"./chromadb/data/{source_name}")
lines = [para.text.strip() for para in doc.paragraphs if para.text.strip()]

# Add all lines in one call. Ids are "<batch timestamp>_<index>" so they are
# unique within the batch; a bare millisecond timestamp per line could repeat
# inside a fast loop, and `add` does not store a record whose id already exists.
# The guard skips the call when the document has no non-blank paragraphs.
if lines:
    batch_id = int(time.time() * 1000)
    collection.add(
        documents=lines,
        ids=[f"{batch_id}_{index}" for index in range(len(lines))],
        metadatas=[{"source": source_name, "category": "식당"} for _ in lines],
    )

print(f"{len(lines)}개 줄 저장 완료")

7개 줄 저장 완료

# Start a Chroma server over the ./chromadb directory on port 55533 (shell command; presumably run in a separate terminal).
# NOTE(review): --host 0.0.0.0 listens on all interfaces while the client below connects via 127.0.0.1 — confirm the wider bind is intended.
chroma run --path ./chromadb --host 0.0.0.0 --port 55533

import chromadb

# Address of the Chroma server this cell talks to.
server_host, server_port = "127.0.0.1", 55533

# Connect over HTTP and get (or create) the same collection name used earlier.
client = chromadb.HttpClient(host=server_host, port=server_port)
collection = client.get_or_create_collection(name="rag_collection")

# Fetch the collection's records with no filter and dump the raw result dict.
results = collection.get()
print(results)

{'ids': ['1771749845606'], 'embeddings': None, 'metadatas': [{'category': '식당', 'source': '02.식당리스트.docx'}], 'documents': ['1.식사 안내\n코로나 이후 연세대 교내 지침이 강화되어 건물 내 음식물 반입 및 취식 불가하므로 외부 식당만 이용 가능합니다. 이 점 널리 양해해 주시고, 코로나 확산 방지를 위해 해당사항 꼭 준수 부탁드립니다.\n2. 식사 이용안내\n1) 이용방법: 금요일 저녁 및 토요일 점심 식사 가능(식당에 비치된 정보대학원 장부에 과정명, 성명 기재하고 주문)\n2) 1인당 주문 가능 금액: 장부에 기재된 이용가능금액 확인(초과금액 개별 결제)\n3) 식당 리스트\n※ 식사가능 식당리스트는 추후 업데이트 예정\xa0'], 'data': None, 'uris': None, 'included': ['metadatas', 'documents']}

import time

# Millisecond-resolution timestamp used as the record id.
record_id = str(int(time.time() * 1000))

# A hand-entered contact record and its metadata tags.
contact_entry = "주진규 전화번호 010-1234-5678"
contact_meta = {"source": "manual", "category": "연락처"}

collection.add(documents=[contact_entry], ids=[record_id], metadatas=[contact_meta])
print("저장 완료")

저장 완료

벡터DB	유형	특징	언어	비고
Faiss	로컬	Meta 개발, 빠른 유사도 검색, 대용량 처리 강점	Python	오픈소스
Chroma	로컬/클라우드	설치 간단, LangChain 연동 쉬움	Python	오픈소스
Pinecone	클라우드	완전관리형, 실시간 업데이트	Python/JS	유료
Weaviate	로컬/클라우드	GraphQL 지원, 멀티모달 가능	Python/JS	오픈소스
Milvus	로컬/클라우드	대규모 분산처리 강점	Python	오픈소스
Qdrant	로컬/클라우드	Rust 기반으로 빠름, 필터링 강점	Python	오픈소스
pgvector	로컬	PostgreSQL 확장, 기존 DB에 벡터 추가	Python	오픈소스
Redis	로컬/클라우드	기존 Redis에 벡터 검색 추가	Python	오픈소스
공통 기능
저장	임베딩 벡터 저장	-	-	BERT 등으로 변환한 벡터
검색	유사도 기반 검색	cosine similarity, L2 distance	-	질문과 가장 유사한 문서 검색
RAG 연동	검색결과 디코더에 전달	-	-	LangChain으로 쉽게 연동

항목	설명	예시
ids	각 데이터의 고유 식별자	`"doc_1"`
documents	실제 텍스트 내용	`"2026년 2월 주식 수익 3천6백만원"`
embeddings	텍스트를 변환한 벡터	`[0.1, 0.3, -0.7, ...]`
metadatas	부가 정보 (날짜, 출처 등)	`{"date": "2026-02", "category": "주식"}`