C++ 高并发内存池：基于基数树的性能优化与测试

文章配图

1. 使用基数树进行优化

在之前的实现中，PageCache 存在一个比较严重的性能瓶颈：查找页 ID 到 Span 的映射时需要加锁。由于 PageCache 中不断存在修改操作，如果在一个线程查询的过程中，另一个线程同时把这个 Span 拿走了，就会引发数据竞争问题。

更糟糕的是，这把锁直接锁住了整个 PageCache。没抢到锁的线程会阻塞等待，造成了严重的性能浪费。为了解决这个问题，我们引入了 Google 团队常用的数据结构：基数树（Radix Tree）。

文章配图

感兴趣的可以阅读相关技术文章，例如 Linux Kernel 中的基数树实现。基数树的核心优势在于：所需空间在写入之前就已经开辟好，之后建立映射只是给固定位置赋值，不会改动树本身的结构。同时，对同一个位置的读和写不会同时发生：一个线程读取某个页号的映射时，不会有另一个线程正在修改这个页号的映射，因此读取映射时可以不再加锁。

TCMalloc 源码中有三个基数树的模板，适用于不同的场景。这里我们主要使用前两个模板。需要注意的是，该项目暂时只能在 32 位平台下使用基数树：64 位平台下页号的位数要多得多，一层或两层基数树需要预先开辟的空间过大，需要改用第三个（三层）模板。

TCMalloc 基数树实现

#pragma once
#include <cstddef>
#include <cstdint>
#include <cstring>

#include "Common.h"
#include "ObjectPool.h"

// Single-level array
template <int BITS>
class TCMalloc_PageMap1 {
private:
    static const int LENGTH = 1 << BITS;
    void** array_;
public:
    typedef uintptr_t Number;
    explicit TCMalloc_PageMap1() {
        size_t size = sizeof(void*) << BITS;
        size_t alignSize = SizeClass::_RoundUp(size,  << PAGE_SHIFT);
        array_ = (**)(alignSize >> PAGE_SHIFT);
        (array_, , (*) << BITS);
    }

    
    
    {
         ((k >> BITS) > ) {  ; }
         array_[k];
    }

    
    
    {
        array_[k] = v;
    }
};


 < BITS>
  {
:
      PAGE_ID ROOT_BITS = ;
      PAGE_ID ROOT_LENGTH = (PAGE_ID) << ROOT_BITS;
      PAGE_ID LEAF_BITS = BITS - ROOT_BITS;
      PAGE_ID LEAF_LENGTH = (PAGE_ID) << LEAF_BITS;

      {
        * values[LEAF_LENGTH];
    };
    Leaf* root_[ROOT_LENGTH];
    * (*allocator_)();

:
      Number;
    {
        (root_, , (root_));
        ();
    }

    {
         Number i1 = k >> LEAF_BITS;
         Number i2 = k & (LEAF_LENGTH - );
         ((k >> BITS) >  || root_[i1] == ) {  ; }
         root_[i1]->values[i2];
    }

    {
         Number i1 = k >> LEAF_BITS;
         Number i2 = k & (LEAF_LENGTH - );
         ((k >> BITS) !=  || i1 >= ROOT_LENGTH) { ; }
         (root_[i1] == ) {
             (!(k, )) { ; }
        }
         (i2 >= LEAF_LENGTH) { ; }
        root_[i1]->values[i2] = v;
    }

    {
         (Number key = start; key <= start + n - ;) {
             Number i1 = key >> LEAF_BITS;
             (i1 >= ROOT_LENGTH)  ;
             (root_[i1] == ) {
                 ObjectPool<Leaf> leafPool;
                Leaf* leaf = (Leaf*)leafPool.();
                (leaf, , (*leaf));
                root_[i1] = leaf;
            }
            key = ((key >> LEAF_BITS) + ) << LEAF_BITS;
        }
         ;
    }

    {
        (, (PAGE_ID) << BITS);
    }
};

// ntimes 一轮申请和释放内存的次数 // rounds 轮次 void BenchmarkMalloc(size_t ntimes, size_t nworks, size_t rounds) { std::vector<std::thread> vthread(nworks); std::atomic<size_t> malloc_costtime = 0; std::atomic<size_t> free_costtime = 0; for (size_t k = 0; k < nworks; ++k) { vthread[k] = std::thread([&]() { std::vector<void*> v; v.reserve(ntimes); for (size_t j = 0; j < rounds; ++j) { size_t begin1 = clock(); for (size_t i = 0; i < ntimes; i++) { v.push_back(malloc((16 + i) % 8192 + 1)); } size_t end1 = clock(); size_t begin2 = clock(); for (size_t i = 0; i < ntimes; i++) { free(v[i]); } size_t end2 = clock(); v.clear(); malloc_costtime += (end1 - begin1); free_costtime += (end2 - begin2); } }); } for (auto& t : vthread) { t.join(); } printf("%zu 个线程并发执行%zu 轮次，每轮次 malloc %zu 次：花费：%zu ms\n", nworks, rounds, ntimes, malloc_costtime.load()); printf("%zu 个线程并发执行%zu 轮次，每轮次 free %zu 次：花费：%zu ms\n", nworks, rounds, ntimes, free_costtime.load()); printf("%zu 个线程并发 malloc&free %zu 次，总计花费：%zu ms\n", nworks, nworks * rounds * ntimes, malloc_costtime.load() + free_costtime.load()); } void BenchmarkConcurrentMalloc(size_t ntimes, size_t nworks, size_t rounds) { std::vector<std::thread> vthread(nworks); std::atomic<size_t> malloc_costtime = 0; std::atomic<size_t> free_costtime = 0; for (size_t k = 0; k < nworks; ++k) { vthread[k] = std::thread([&]() { std::vector<void*> v; v.reserve(ntimes); for (size_t j = 0; j < rounds; ++j) { size_t begin1 = clock(); for (size_t i = 0; i < ntimes; i++) { v.push_back(ConcurrentAlloc((16 + i) % 8192 + 1)); } size_t end1 = clock(); size_t begin2 = clock(); for (size_t i = 0; i < ntimes; i++) { ConcurrentDealloc(v[i]); } size_t end2 = clock(); v.clear(); malloc_costtime += (end1 - begin1); free_costtime += (end2 - begin2); } }); } for (auto& t : vthread) { t.join(); } printf("%zu 个线程并发执行%zu 轮次，每轮次 concurrent alloc %zu 次：花费：%zu ms\n", nworks, rounds, ntimes, malloc_costtime.load()); printf("%zu 个线程并发执行%zu 轮次，每轮次 concurrent dealloc %zu 次：花费：%zu ms\n", nworks, 
rounds, ntimes, free_costtime.load()); printf("%zu 个线程并发 concurrent alloc&dealloc %zu 次，总计花费：%zu ms\n", nworks, nworks * rounds * ntimes, malloc_costtime.load() + free_costtime.load()); } int main() { size_t n = 10000; std::cout << "==========================================================" << std::endl; BenchmarkConcurrentMalloc(n, 4, 10); std::cout << std::endl << std::endl; BenchmarkMalloc(n, 4, 10); std::cout << "==========================================================" << std::endl; return 0; }

C++ 高并发内存池：基于基数树的性能优化与测试

1. 使用基数树进行优化

TCMalloc 基数树实现

更多推荐文章

相关免费在线工具

2. 性能测试

更多推荐文章

相关免费在线工具

C++ 高并发内存池：基于基数树的性能优化与测试

1. 使用基数树进行优化

TCMalloc 基数树实现

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具

2. 性能测试

微信扫一扫，关注极客日志

更多推荐文章

相关免费在线工具