C++ 基于正倒排索引的 Boost 搜索引擎实现

正倒排索引在搜索引擎项目中至关重要，两者协同工作以实现内容搜索。搜索引擎在对文档进行处理和索引构建时，会先创建正排索引，然后基于正排索引进一步生成倒排索引。当用户输入查询关键词时，搜索引擎会利用倒排索引快速定位包含该关键词的文档，再结合正排索引等其他信息进行结果展示。

1. 正倒排索引结构

1.1 正排索引

正排索引按文档 ID 存储每篇文档的标题、内容和 URL，可以通过文档 ID 直接取得对应的文档信息。

// Record kept in the forward index: one entry per document.
struct DocInfo {
    std::string title;    // document title
    std::string content;  // document body text
    std::string url;      // document URL
    int doc_id;           // document id
};
// Alias under which the rest of the index code refers to the record.
using DocInfo1 = DocInfo;

1.2 倒排索引

倒排索引存储文档 ID、关键字以及该关键字在该文档中的权重。InvertedList 通常称为倒排拉链：一个关键字可能出现在多个文档中，因此每个关键字对应一组 InvertedElem。

// One entry of the inverted index: links a keyword to a document.
struct InvertedElem {
    int doc_id;        // id of the document that contains the keyword
    std::string word;  // the keyword itself
    int weight;        // weight of the keyword within that document
};
// Inverted list: all entries recorded for a single keyword.
using InvertedList = std::vector<InvertedElem>;

2. 正倒排索引类（Index）的 private 部分

2.1 准备工作

正排索引使用 vector，因为其下标即为文档 ID，使用便捷且减少代码量。倒排索引使用哈希表（unordered_map），通过关键字映射到倒排文档列表。

private:
    // Forward index: a vector is used because an element's position doubles as its document id.
    std::vector<DocInfo1> forward_index;
    // Inverted index: hash map from a keyword to the inverted list of documents containing it.
    std::unordered_map<std::string, InvertedList> inverted_index;

2.2 单例模式

采用单例模式管理索引实例，好处包括：

- 减少资源浪费：避免反复创建或销毁实例导致的内存占用过高和 IO 开销增加。
- 确保全局逻辑统一：防止因多实例初始化参数不同导致搜索结果不稳定。
- 简化资源管理与调用：降低模块间耦合度，减少代码冗余。

需禁用拷贝构造函数和赋值运算符，并添加互斥锁以防止多线程并发创建多个实例。

private:
    Index() {};
    ( Index&) = ;
    Index& =( Index&) = ;
    
     Index* instance;
     std::mutex log;

:
    ~();

    {
         (instance == ) {
            log.();
             (instance == ) {
                instance =  ();
            }
            log.();
        }
         instance;
    }

#pragma once #include <iostream> #include <string> #include <vector> #include <unordered_map> #include <fstream> #include <mutex> #include "usuallytool.hpp" #include <boost/algorithm/string.hpp> #include "log.hpp" namespace ns_index { typedef struct DocInfo { std::string title; std::string content; std::string url; int doc_id; } DocInfo1; struct InvertedElem { int doc_id; std::string word; int weight; }; typedef std::vector<InvertedElem> InvertedList; class Index { private: std::vector<DocInfo1> forward_index; std::unordered_map<std::string, InvertedList> inverted_index; Index() {}; Index(const Index&) = delete; Index& operator=(const Index&) = delete; static Index* instance; static std::mutex log; public: ~Index(); static Index* Getinstance() { if (instance == nullptr) { log.lock(); if (instance == nullptr) { instance = new Index(); } log.unlock(); } return instance; } DocInfo1* GetForwardIndex(uint64_t doc_id) { if (doc_id >= forward_index.size()) { std::cout << "doc_id out range, error!" 
<< std::endl; return nullptr; } return &forward_index[doc_id]; } InvertedList* GetInvertedList(const std::string& word) { auto iter = inverted_index.find(word); if (iter == inverted_index.end()) { std::cout << word << "get error" << std::endl; return nullptr; } return &(iter->second); } bool BuildIndex(const std::string& input) { std::ifstream in(input, std::ios::in | std::ios::binary); if (!in.is_open()) { std::cout << input << "open error" << std::endl; return false; } int count = 0; std::string line; while (std::getline(in, line)) { DocInfo1* doc = BuildForwardIndex(line); if (doc == nullptr) { std::cout << "BuildIndex error" << std::endl; continue; } BuildInvertedIndex(*doc); count++; if (count % 50 == 0) LOG1(NORMAL, "索引建立到：" + std::to_string(count)); } return true; } private: DocInfo1* BuildForwardIndex(const std::string& line) { std::vector<std::string> results; ns_util::StringUtil::Split(line, &results, "\3"); if (results.size() != 3) return nullptr; DocInfo1 doc; doc.title = results[0]; doc.content = results[1]; doc.url = results[2]; doc.doc_id = forward_index.size(); forward_index.push_back(doc); return &forward_index.back(); } bool BuildInvertedIndex(const DocInfo1& doc) { struct word_cnt { int title_cnt; int content_cnt; word_cnt() : title_cnt(0), content_cnt(0) {} }; std::unordered_map<std::string, word_cnt> word_map; std::vector<std::string> title_words; ns_util::JiebaUtil::CutString(doc.title, &title_words); for (auto& tw : title_words) { boost::to_lower(tw); word_map[tw].title_cnt++; } std::vector<std::string> content_words; ns_util::JiebaUtil::CutString(doc.content, &content_words); for (auto& cw : content_words) { boost::to_lower(cw); word_map[cw].content_cnt++; } #define X 10 #define Y 1 for (auto& word_pair : word_map) { InvertedElem item; item.doc_id = doc.doc_id; item.word = word_pair.first; item.weight = X * word_pair.second.title_cnt + Y * word_pair.second.content_cnt; inverted_index[word_pair.first].push_back(item); } return true; } }; 
Index* Index::instance = nullptr; std::mutex Index::log; }

C++ 基于正倒排索引的 Boost 搜索引擎实现