ClickHouse 的 inline memcpy 实现

ClickHouse 的 inline memcpy 实现其实很简单的，但是毕竟名声大，还有注释写得特别好，忍不住摘下来……以后水群吹牛逼用！

提前省流

不考虑注释中提到的玄学，只看代码的话，至少需要关注以下几点：

__restrict 非标准关键字安排上，避免阻挡别名优化。
分支顺序很重要，小尺寸的执行流先安排（理由：抄的 glibc），其次才是中、大尺寸。
ClickHouse 认为的小尺寸是 16 字节以内，中尺寸是 128 字节以内。
平凡操作（8 字节内）可以靠 __builtin_memcpy 操作。
重叠 move 技巧：只需 __builtin_memcpy 两次。例子：to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
大尺寸靠 SIMD 和循环展开。
中尺寸靠 SIMD，没有展开（作者评论：不够展开）。
注意内存位置可以任意的，SIMD 只处理了 dst 对齐（见 padding）。It’s not possible to have both src and dst aligned. So, we will use aligned stores and unaligned loads.

考虑注释中的讨论：

不使用 glibc memcpy 是为了避免 @PLT 跳转和可移植性，并且实测有性能提升。
所有通用实现中，处理不规则（奇数）尺寸靠的就是重叠 mov 操作。
不要混用不同指令集的 SIMD 操作，不要使用降频明显的（AVX512）指令集。
当前实现选择了循环展开 8 次，不算最优解。
mm prefetch 对于 AMD 平台是负优化（截至 2021 年）。
互联网上的 memcpy 基准测试大多是错误的。
大尺寸除了 SIMD 以外可考虑 rep movsb 操作。
超大尺寸（大于 L3 一半）可以考虑跳过缓存的 non-temporal store 操作。
但是作者说该实现实际不考虑超大尺寸，因为很复杂。

先看代码

static inline void * inline_memcpy(void * __restrict dst_, const void * __restrict src_, size_t size)
{
    /// We will use pointer arithmetic, so char pointer will be used.
    /// Note that __restrict makes sense (otherwise compiler will reload data from memory
    /// instead of using the value of registers due to possible aliasing).
    char * __restrict dst = reinterpret_cast<char * __restrict>(dst_);
    const char * __restrict src = reinterpret_cast<const char * __restrict>(src_);

    /// Standard memcpy returns the original value of dst. It is rarely used but we have to do it.
    /// If you use memcpy with small but non-constant sizes, you can call inline_memcpy directly
    /// for inlining and removing this single instruction.
    void * ret = dst;

tail:
    /// Small sizes and tails after the loop for large sizes.
    /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
    /// This order of branches is from the disassembly of glibc's code.
    /// We copy chunks of possibly uneven size with two overlapping movs.
    /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
    if (size <= 16)
    {
        if (size >= 8)
        {
            /// Chunks of 8..16 bytes.
            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
            __builtin_memcpy(dst, src, 8);
        }
        else if (size >= 4)
        {
            /// Chunks of 4..7 bytes.
            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
            __builtin_memcpy(dst, src, 4);
        }
        else if (size >= 2)
        {
            /// Chunks of 2..3 bytes.
            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
            __builtin_memcpy(dst, src, 2);
        }
        else if (size >= 1)
        {
            /// A single byte.
            *dst = *src;
        }
        /// No bytes remaining.
    }
    else
    {
        /// Medium and large sizes.
        if (size <= 128)
        {
            /// Medium size, not enough for full loop unrolling.

            /// We will copy the last 16 bytes.
            _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + size - 16), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + size - 16)));

            /// Then we will copy every 16 bytes from the beginning in a loop.
            /// The last loop iteration will possibly overwrite some part of already copied last 16 bytes.
            /// This is Ok, similar to the code for small sizes above.
            while (size > 16)
            {
                _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
                dst += 16;
                src += 16;
                size -= 16;
            }
        }
        else
        {
            /// Large size with fully unrolled loop.

            /// Align destination to 16 bytes boundary.
            size_t padding = (16 - (reinterpret_cast<size_t>(dst) & 15)) & 15;

            /// If not aligned - we will copy first 16 bytes with unaligned stores.
            if (padding > 0)
            {
                __m128i head = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
                _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), head);
                dst += padding;
                src += padding;
                size -= padding;
            }

            /// Aligned unrolled copy. We will use half of available SSE registers.
            /// It's not possible to have both src and dst aligned.
            /// So, we will use aligned stores and unaligned loads.
            __m128i c0, c1, c2, c3, c4, c5, c6, c7;

            while (size >= 128)
            {
                c0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 0);
                c1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 1);
                c2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 2);
                c3 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 3);
                c4 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 4);
                c5 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 5);
                c6 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 6);
                c7 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src) + 7);
                src += 128;
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 0), c0);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 1), c1);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 2), c2);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 3), c3);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 4), c4);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 5), c5);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 6), c6);
                _mm_store_si128((reinterpret_cast<__m128i*>(dst) + 7), c7);
                dst += 128;

                size -= 128;
            }

            /// The latest remaining 0..127 bytes will be processed as usual.
            goto tail;
        }
    }

    return ret;
}

再看注释

原文可以看文件，这里直接机翻并自动加粗。

自定义 memcpy 实现用于 ClickHouse

该实现相较于使用 glibc 的 memcpy，有以下优势：

避免对特定版本的 glibc 符号的依赖，如 memcpy@@GLIBC_2.14，确保移植性。
避免因共享链接导致的 PLT 间接调用，提高效率。
可以直接调用 inline_memcpy，优化内联或进行跨过程分析。
在当前 CPU 上的性能测试中有更好表现：某些查询性能提高多达 25%，所有查询平均提高 0.7% 到 1%。

编写自定义 memcpy 非常困难，原因包括：

最优实现取决于特定 CPU 模型。
最优实现依赖于大小参数的分布。
还与并发复制数据的线程数相关。
还与调用代码如何使用复制的数据以及不同 memcpy 调用之间的关系有关。
由于场景范围广泛，进行正确测试尤其困难。编写自定义 memcpy 时，有可能会在不具代表性的微基准测试中进行过度优化，从而导致实际应用场景性能下降。

互联网上的 memcpy 基准测试大多是错误的。

详细说明

小尺寸时，代码中分支的顺序很重要。
有些实现使用特定的分支顺序（如 glibc 中的实现），也有使用跳转表（例如 Cosmopolitan libc 中的汇编代码）。
另有一些实现使用 Duff 设备（见 https://github.com/skywind3000/FastMemcpy/）。
复制不规则大小时也很重要。
几乎所有实现，包括该实现，使用两次重叠的 mov 指令。
编译时需要禁用 -ftree-loop-distribute-patterns，否则编译器可能会将内部循环替换为调用 memcpy，导致无限递归。
对于较大的尺寸，选择使用的指令也很关键：
- SSE、AVX 或 AVX-512；
- rep movsb；
  性能会根据尺寸阈值、CPU 模型和“erms”标志（增强型 rep movsb）的使用而有所不同。
使用 AVX-512 可能会由于降频而导致性能下降。
如果大部分代码使用 SSE，使用 AVX 可能会导致性能下降（还取决于是否使用了 vzeroupper 指令）。
但在某些情况下，使用 AVX 会带来性能提升。
循环展开的次数也会影响性能。
本实现将循环展开 8 次（根据可用寄存器数），但这并不总是最优解。
是否使用对齐或不对齐的加载/存储也很关键。
本实现使用不对齐的加载和对齐的存储。
预取指令的使用：
在一些 Intel CPU 上，预取指令可能会加速性能，但在 AMD 上可能会降低性能。设置正确的偏移量来进行预取并不直观。
对于非常大的数据量（超过 L3 缓存的一半），可以使用非临时（绕过缓存）存储。
但具体的阈值不明确——当多个线程同时进行 memcpy 时，最佳阈值可能会更低，因为 L3 缓存是共享的（L2 缓存也部分共享）。
对于非常大的 memcpy，一般意味着代码中使用了不符合缓存友好的算法，或是测试了不现实的场景，因此我们不会关注使用非临时存储。
在最近的 Intel CPU 上，“erms” 特性使得 rep movsb 成为最有利的实现，甚至比使用最宽寄存器的非临时对齐展开存储还要好。
memcpy 可以用汇编、C 或 C++ 编写，后者也可以使用内联汇编。
汇编实现更有利，因为它能确保编译器不会使代码变得更差，并能确保分支顺序、代码布局和所需寄存器的使用。如果它被放在单独的翻译单元中，则无法进行内联（但可以通过内联汇编来解决这个问题）。
有时 C 或 C++ 代码可以被编译器进一步优化。例如，clang 能够在启用 -mavx 时，将 SSE 指令替换为 AVX 代码。
请注意，编译器可能会将简单的代码替换为 memcpy，反之亦然：
- 已知小尺寸的 memcpy 会被替换为简单的指令，不会调用 memcpy；这由 -fbuiltin-memcpy 控制，可以通过调用 __builtin_memcpy 来手动确保。
- 复制字节的循环 可以被识别并替换为 memcpy 调用，这由 -ftree-loop-distribute-patterns 控制。
- 同时，复制字节的循环也可以被展开、剥离和向量化，从而生成类似于优秀 memcpy 实现的内联代码。

此描述截至 2021 年 3 月更新。

已阅·总结

构造需要 SFENCE 的测试样例

#include <immintrin.h>
#include <iostream>
#include <thread>
#include <atomic>

void nta_copy(int *dest, const int *src, size_t count) {
    for (size_t i = 0; i < count; i += 4) {
        __m128i data = _mm_load_si128(reinterpret_cast<const __m128i *>(src + i));
        // 被 GCC 气晕，签名都对不上
        // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
        _mm_stream_si128(reinterpret_cast<__m128i *>(dest + i), data); // Non-temporal store
    }
    // 注释掉这个
    // _mm_sfence();
}

// 本地运行这段代码，是潜在并发问题的，并且 TSAN 无法检测出来
// 总之：非临时指令并不遵循与常规存储指令相同的一致性规则
int main() {
    for(auto _{10}; _--;) {
        alignas(16) int a[] = {1,2,3,4,5,6,7,8};
        alignas(16) int b[] = {0,0,0,0,0,0,0,0};
        static_assert(std::size(a) % 4 == 0);

        std::atomic<int> atm {};

        std::thread green {[&] {
            nta_copy(b, a, std::size(a));
            // Acquire-Release 对于非临时操作没用
            atm.store(1, std::memory_order_release);
            while(atm.load(std::memory_order_acquire));
        }};
        while(atm.load(std::memory_order_acquire) == 0);
        // 有可能输出 [0,0,3,4,5,6,7,8]
        for(auto v : b) std::cout << v << ' ';
        std::cout << std::endl;
        atm.store(0, std::memory_order_release);
        green.join();
    }
}

内建领域大神：builtin 无特殊含义

GCC 文档没说清楚的问题，不到五分钟就被 wanghenshui 秒了

Because the WC protocol uses a weakly-ordered memory consistency model, a fencing operation implemented with the SFENCE or MFENCE instruction should be used in conjunction with MOVNTI instructions if multiple processors might use different memory types to read/write the destination memory locations.
来源：MOVNTI – felixcloutier

个人吐槽一点皮毛，只是觉得实现和注释有点割裂感：

不太理解为啥用 __builtin_memcpy。反汇编得知，编译器也是选择的 mov 指令。
不使用 AVX512 是英特尔的问题。Clang 19 + -march=znver5 + -O3 仍然会把 8 次 SIMD 展开直接生成为 2 次 AVX512 指令（vmovups (%rsi), %zmm0 + vmovups 64(%rsi), %zmm1）。
试了几个平台都没看到 rep movsb 操作。网上有人评论这个操作可能很慢。
作者提到互联网上的 memcpy 基准测试大多是错误的！没有任何后续说明。
作者提到使用非临时存储操作，也提到多线程场景，但是没提到该场景 SFENCE 的必要性。
作者提到应该优先使用内联汇编以避免编译器负优化，但是这份代码完全是纯血版 C++ 啊。