
9.3 向量索引构建示例
文档进入向量库前,应先清洗、切分、打标签、嵌入,再写入索引。以下示例展示一种最简流程,真实环境中可替换为 Milvus 或 Qdrant SDK。
代码清单 9-2 文档切分与索引写入
from dataclasses import dataclass
from typing import Iterable
import hashlib
@dataclass
class Chunk:
    """One piece of a document, carrying its text and descriptive metadata."""

    # Identifier of this chunk.
    chunk_id: str
    # Text content of the chunk.
    text: str
    # Free-form key/value metadata that travels with the chunk.
    metadata: dict
def chunk_document(doc_id: str, title: str, text: str, source_type: str) -> list[Chunk]:
    """Split a document into paragraph-level chunks.

    The text is split on blank-line separators (two consecutive newline
    characters). Each piece is stripped of surrounding whitespace and pieces
    that end up empty are dropped.

    Args:
        doc_id: Identifier of the source document; stored in each chunk's
            metadata and mixed into the chunk ID.
        title: Document title, prepended to every chunk's text in square
            brackets on its own line.
        text: Full document text.
        source_type: Label stored in each chunk's metadata under
            ``"source_type"``.

    Returns:
        The chunks in document order; ``metadata["seq"]`` holds each chunk's
        zero-based position among the non-empty paragraphs.
    """
    paragraphs = [p.strip() for p in text.split("\n\n") if p.strip()]
    chunks = []
    for i, part in enumerate(paragraphs):
        # The ID is derived from the document ID, the position and the first
        # 100 characters, so identical input always produces identical IDs.
        # MD5 is used only as a fingerprint here, not for security, hence
        # usedforsecurity=False (keeps the call usable on FIPS-restricted builds).
        cid = hashlib.md5(
            f"{doc_id}-{i}-{part[:100]}".encode(),
            usedforsecurity=False,
        ).hexdigest()
        chunks.append(
            Chunk(
                chunk_id=cid,
                text=f"[{title}]\n{part}",
                metadata={"doc_id": doc_id, "source_type": source_type, "seq": i},
            )
        )
    return chunks
def index_chunks(chunks: Iterable["Chunk"], embed_fn, store) -> None:
    """Embed every chunk and write the resulting rows to the store.

    NOTE(review): the extracted listing lost this function's name and
    parameter list, the ``for``/``in`` keywords of the loop, and all
    dictionary-key string literals. The name ``index_chunks``, the parameter
    order, and the keys ``"id"``, ``"vector"``, ``"payload"`` and ``"text"``
    are reconstructions -- confirm them against the original listing.

    Args:
        chunks: Chunks to index; each must expose ``chunk_id``, ``text`` and
            ``metadata``.
        embed_fn: Callable invoked once per chunk with the chunk's text; its
            return value is stored as the row's vector.
        store: Object with an ``upsert(rows)`` method; it is called exactly
            once with the full list of rows.
    """
    rows = [
        {
            "id": c.chunk_id,
            "vector": embed_fn(c.text),
            # The dict union builds a new dict, so the chunk's own metadata
            # is left unmodified while the text is stored next to it.
            "payload": c.metadata | {"text": c.text},
        }
        for c in chunks
    ]
    store.upsert(rows)


