SpringAI 深入解析 RAG 检索增强工作流程与调优

SpringAI 深入解析 RAG 检索增强工作流程与调优 | 极客日志

{
 "帅哥":{
  "name": "小雷"
 }
}

// Reads documents from a JSON file on the classpath.
@Component
class MyJsonReader {
    private final Resource resource;

    /**
     * @param resource the JSON resource to read; injected from {@code classpath:products.json}
     */
    MyJsonReader(@Value("classpath:products.json") Resource resource) {
        this.resource = resource;
    }

    /**
     * Basic usage: reads the resource with a reader that has no field selection.
     *
     * @return the documents produced by the reader
     */
    List<Document> loadBasicJsonDocuments() {
        return new JsonReader(resource).get();
    }

    /**
     * Reads the resource, naming the JSON fields ("description" and "features")
     * the reader should use as document content.
     *
     * @return the documents produced by the reader
     */
    List<Document> loadJsonWithSpecificFields() {
        return new JsonReader(resource, "description", "features").get();
    }

    /**
     * Reads the resource through a JSON pointer so only the content under
     * {@code /items} is extracted.
     *
     * @return the documents produced by the reader for the pointed-to content
     */
    List<Document> loadJsonWithPointer() {
        final String itemsPointer = "/items"; // targets the content of the items array
        return new JsonReader(resource).get(itemsPointer);
    }
}

public class MsgEmailParser {
    private MsgEmailParser() {
        // Static helper holder; the private constructor blocks instantiation.
    }

    /**
     * Builds a {@code Document} from an MSG email element.
     *
     * <p>The subject is copied into the metadata under the key {@code "subject"} only when
     * {@code StringUtils.hasText} accepts it. The document content is the element's text,
     * or the empty string when {@code StringUtils.hasText} rejects the text.
     *
     * @param element MSG email element to convert; must not be {@code null}
     * @return a new document carrying the element's text and metadata
     * @throws IllegalArgumentException if {@code element} is {@code null}
     */
    public static Document convertToDocument(MsgEmailElement element) {
        if (element == null) {
            throw new IllegalArgumentException("MsgEmailElement cannot be null");
        }

        // Collect metadata, skipping values that carry no text.
        Map<String, Object> attributes = new HashMap<>();
        if (StringUtils.hasText(element.getSubject())) {
            attributes.put("subject", element.getSubject());
        }
        // ... further metadata entries are omitted in this excerpt

        // Never hand a text-less body to the document; fall back to "".
        String body;
        if (StringUtils.hasText(element.getText())) {
            body = element.getText();
        } else {
            body = "";
        }
        return new Document(body, attributes);
    }
}

/**
 * A document transformation step: takes a list of documents and returns a list of documents.
 * Because it extends {@code Function}, it can be used wherever a
 * {@code Function<List<Document>, List<Document>>} is expected.
 */
public interface DocumentTransformer extends Function<List<Document>, List<Document>> {

    /**
     * Named alias for {@code apply}: forwards the list to {@code apply} and returns its result.
     *
     * @param documents the documents to transform
     * @return whatever {@code apply} returns for {@code documents}
     */
    default List<Document> transform(List<Document> documents) {
        return this.apply(documents);
    }
}

@Component
class MyTokenTextSplitter {

    /**
     * Splits the documents with a {@code TokenTextSplitter} created by its no-arg constructor.
     *
     * @param documents the documents to split
     * @return the documents returned by the splitter
     */
    public List<Document> splitDocuments(List<Document> documents) {
        return new TokenTextSplitter().apply(documents);
    }

    /**
     * Splits the documents with an explicitly configured {@code TokenTextSplitter}.
     *
     * @param documents the documents to split
     * @return the documents returned by the splitter
     */
    public List<Document> splitCustomized(List<Document> documents) {
        // NOTE(review): the meaning of each argument is defined by the TokenTextSplitter
        // constructor, which is not visible here; confirm against the library version in use.
        return new TokenTextSplitter(1000, 400, 10, 5000, true).apply(documents);
    }
}

@Component
class MyDocumentEnricher {
    private final ChatModel chatModel;

    /**
     * @param chatModel the chat model handed to every enricher this class creates
     */
    MyDocumentEnricher(ChatModel chatModel) {
        this.chatModel = chatModel;
    }

    /**
     * Keyword metadata enrichment: runs the documents through a
     * {@code KeywordMetadataEnricher} built with the chat model and the value 5.
     *
     * @param documents the documents to enrich
     * @return the documents returned by the enricher
     */
    List<Document> enrichDocumentsByKeyword(List<Document> documents) {
        return new KeywordMetadataEnricher(this.chatModel, 5).apply(documents);
    }

    /**
     * Summary metadata enrichment: runs the documents through a
     * {@code SummaryMetadataEnricher} configured with the PREVIOUS, CURRENT and NEXT summary types.
     *
     * @param documents the documents to enrich
     * @return the documents returned by the enricher
     */
    List<Document> enrichDocumentsBySummary(List<Document> documents) {
        List<SummaryType> summaryTypes =
                List.of(SummaryType.PREVIOUS, SummaryType.CURRENT, SummaryType.NEXT);
        return new SummaryMetadataEnricher(this.chatModel, summaryTypes).apply(documents);
    }
}

// Build a DefaultContentFormatter through its builder, setting the metadata template,
// the metadata separator, the text template, and two lists of metadata keys to exclude
// (one for inference, one for embedding, going by the builder method names).
DefaultContentFormatter formatter = DefaultContentFormatter.builder()
    .withMetadataTemplate("{key}: {value}")
    .withMetadataSeparator("\n")
    .withTextTemplate("{metadata_string}\n\n{content}")
    .withExcludedInferenceMetadataKeys("embedding", "vector_id")
    .withExcludedEmbedMetadataKeys("source_url", "timestamp")
    .build();
// Format a document with the formatter, using the INFERENCE metadata mode.
// `document` is expected to be defined by the surrounding code.
String formattedText = formatter.format(document, MetadataMode.INFERENCE);

@Component
class MyDocumentWriter {

    /**
     * Writes the documents through a {@code FileDocumentWriter} targeting {@code output.txt},
     * configured with {@code MetadataMode.ALL}.
     *
     * @param documents the documents to write
     */
    public void writeDocuments(List<Document> documents) {
        // NOTE(review): the two boolean flags are positional constructor arguments whose
        // meaning is defined by FileDocumentWriter; confirm against the library version in use.
        new FileDocumentWriter("output.txt", true, MetadataMode.ALL, false).accept(documents);
    }
}

@Component
class MyVectorStoreWriter {
    private final VectorStore vectorStore;

    /**
     * @param vectorStore the vector store that receives the documents
     */
    MyVectorStoreWriter(VectorStore vectorStore) {
        this.vectorStore = vectorStore;
    }

    /**
     * Hands the documents to the vector store via its {@code accept} method.
     *
     * @param documents the documents to store
     */
    public void storeDocuments(List<Document> documents) {
        this.vectorStore.accept(documents);
    }
}

// Extract: read documents from a PDF file.
// The variable is declared with the concrete reader type, because the `PDFReader`
// type used in the original declaration is not a Spring AI type.
PagePdfDocumentReader pdfReader = new PagePdfDocumentReader("knowledge_base.pdf");
List<Document> documents = pdfReader.read();

// Transform: split the text, then add a summary of each chunk as metadata.
// TokenTextSplitter has no (int, int) constructor, so the splitter is configured via its builder.
// NOTE(review): the original second argument (50) is mapped to minChunkSizeChars here;
// confirm that this is the intended setting.
TokenTextSplitter splitter = TokenTextSplitter.builder()
    .withChunkSize(500)
    .withMinChunkSizeChars(50)
    .build();
List<Document> splitDocuments = splitter.apply(documents);
SummaryMetadataEnricher enricher = new SummaryMetadataEnricher(chatModel, List.of(SummaryType.CURRENT));
List<Document> enrichedDocuments = enricher.apply(splitDocuments);

// Load: write the enriched documents to the vector store.
vectorStore.write(enrichedDocuments);
// Alternative: the same pipeline as one chained call.
// Use this INSTEAD of the step-by-step form above; running both repeats the read,
// split and enrich steps and writes the documents a second time.
vectorStore.write(enricher.apply(splitter.apply(pdfReader.read())));

/**
 * A store that documents can be added to; it is also usable as a {@code DocumentWriter}.
 */
public interface VectorStore extends DocumentWriter {

    /**
     * Adds the given documents to the store.
     *
     * @param documents the documents to add
     */
    void add(List<Document> documents);

    /**
     * Returns a name for this store.
     *
     * @return the simple class name of the implementing class
     */
    default String getName() {
        return getClass().getSimpleName();
    }
}

SpringAI 深入解析 RAG 检索增强工作流程与调优

SpringAI 深入解析 RAG 检索增强工作流程与调优

本节重点

一、RAG 核心特性

RAG 检索增强工作流程

一、建立索引

1. 文档预处理和切割 ETL

2. 向量转换和存储

二、检索增强

1. 文档过滤和检索

2. 查询增强和关联

更多推荐文章

相关免费在线工具

文档收集和切割 - ETL

文档

ETL

抽取（Extract）

转换（Transform）

1. TextSplitter 文本分割器

2. MetadataEnricher 元数据增强器

3. ContentFormatter 内容格式化工具

加载（Load）

ETL 流程示例

向量转换和存储

VectorStore 接口介绍

更多推荐文章

相关免费在线工具

SpringAI 深入解析 RAG 检索增强工作流程与调优

SpringAI 深入解析 RAG 检索增强工作流程与调优

本节重点

一、RAG 核心特性

RAG 检索增强工作流程

一、建立索引

1. 文档预处理和切割 ETL

2. 向量转换和存储

二、检索增强

1. 文档过滤和检索

2. 查询增强和关联

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

文档收集和切割 - ETL

文档

ETL

抽取（Extract）

转换（Transform）

1. TextSplitter 文本分割器

2. MetadataEnricher 元数据增强器

3. ContentFormatter 内容格式化工具

加载（Load）

ETL 流程示例

向量转换和存储

VectorStore 接口介绍

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具