如何在Visual Studio 2019社区中不让最快的人惊慌并发现漏洞

一篇关于在现代处理器中使用“奇怪的” popcount指令的文章提示了我。 这与计算单位数无关,而与检测C线( 零终止线 )末端的符号有关。
以空值终止的字符串是一种用编程语言表示字符串的方式,其中不引入特殊的字符串类型,而是使用字符数组,并且字符串的末尾是遇到的第一个特殊的空字符(ASCII码为NUL,值为0)。

为了确定此类术语的长度,应用了标准函数。

size_t __cdecl strlen(char const* str) 

其运算算法可以用C描述为:

 size_t strlen_algo(const char* str) { size_t length = 0; while (*str++) length++; return length; } 

让我们看看MS Visual Studio 2019社区编译器(Release,x86)将其转换为:

 08811F7h: mov al,byte ptr [ecx] inc ecx test al,al jne main+0D7h (08811F7h) 

即,从存储器中加载一个字节并与零进行比较。 如果您在Release中构建项目,则用相同的代码替换strlen调用,该算法是正确的,但是在我看来,速度还不够。 如果在Debug中调用标准strlen来编译代码,会发生什么情况? -如预期的那样,将调用strlen库函数,但由人工在汇编器上手动编写。

MS Visual Studio中标准strlen函数的代码破坏器
 ;strlen.asm - contains strlen() routine ; ; Copyright (c) Microsoft Corporation. All rights reserved. ; ;Purpose: ; strlen returns the length of a null-terminated string, ; not including the null byte itself. ; ;******************************************************************************* .xlist include cruntime.inc .list page ;*** ;strlen - return the length of a null-terminated string ; ;Purpose: ; Finds the length in bytes of the given string, not including ; the final null character. ; ; Algorithm: ; int strlen (const char * str) ; { ; int length = 0; ; ; while( *str++ ) ; ++length; ; ; return( length ); ; } ; ;Entry: ; const char * str - string whose length is to be computed ; ;Exit: ; EAX = length of the string "str", exclusive of the final null byte ; ;Uses: ; EAX, ECX, EDX ; ;Exceptions: ; ;******************************************************************************* CODESEG public strlen strlen proc \ buf:ptr byte OPTION PROLOGUE:NONE, EPILOGUE:NONE .FPO ( 0, 1, 0, 0, 0, 0 ) string equ [esp + 4] mov ecx,string ; ecx -> string test ecx,3 ; test if string is aligned on 32 bits je short main_loop str_misaligned: ; simple byte loop until string is aligned mov al,byte ptr [ecx] add ecx,1 test al,al je short byte_3 test ecx,3 jne short str_misaligned add eax,dword ptr 0 ; 5 byte nop to align label below align 16 ; should be redundant main_loop: mov eax,dword ptr [ecx] ; read 4 bytes mov edx,7efefeffh add edx,eax xor eax,-1 xor eax,edx add ecx,4 test eax,81010100h je short main_loop ; found zero byte in the loop mov eax,[ecx - 4] test al,al ; is it byte 0 je short byte_0 test ah,ah ; is it byte 1 je short byte_1 test eax,00ff0000h ; is it byte 2 je short byte_2 test eax,0ff000000h ; is it byte 3 je short byte_3 jmp short main_loop ; taken if bits 24-30 are clear and bit ; 31 is set byte_3: lea eax,[ecx - 1] mov ecx,string sub eax,ecx ret byte_2: lea eax,[ecx - 2] mov ecx,string sub eax,ecx ret byte_1: lea eax,[ecx - 3] mov ecx,string sub eax,ecx ret byte_0: lea eax,[ecx - 4] mov ecx,string sub eax,ecx ret strlen endp end 


表1-以秒为单位的出色基准测试的运行时间(MS VS 2019社区,C ++ cl版本:19.22.27905)

大块,1K大块,1K,*异常调用小块,10个元素小块,10个元素,* strlen调用
调试,x867.257.253.063.06
发行,x869.03.90.150.12
调试,x646.06.03.43.4
发行版,x648.52.30.150.11

*强制编译器调用strlen库函数

因此,我们可以得出结论,即使在较小的字符串上,用MS编译器替换字节比较也是无效的,而在较大的字符串上,Debug领先于Release!

基准代码
 #include <iostream> #include <string> #include <vector> #include <chrono> #include <nmmintrin.h> inline size_t strlen_algo(const char* str) { size_t length = 0; while (*str++) length++; return length; } inline size_t strlen_sse4(const char* str) { size_t length = 0; int res = 0; //align to 16 bytes while (((size_t(str+length)) & (sizeof(__m128i) - 1)) != 0) { if (str[length] == 0) return length; length++; } __m128i z128 = _mm_setzero_si128(); for(; ; length += 16) { __m128i data = _mm_load_si128((__m128i*)(str + length)); if ((res = _mm_cmpistri(z128, data, _SIDD_CMP_EQUAL_EACH)) != 16) break; } /*while (str[length]) length++;*/ return length + res; } #define _DISABLE_ASM_BSF //https://www.strchr.com/sse2_optimised_strlen #ifndef WORDS_BIGENDIAN #if 0 #elif 0 #else static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes, by Nazo, post: 2009/07/20 03:40 { // this is current winner for speed static const unsigned char table[256] = { 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, }; if ((unsigned char)x) return table[(unsigned char)x]; return table[x >> 8] + 8; // t[x / 256] + 8 } #endif #else #if 0 static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes { register int i = 0; if (!(x & (1 << 15))) i++; else return i; if (!(x & (1 << 14))) i++; else return i; if (!(x & (1 << 13))) i++; else return i; if (!(x & (1 << 12))) i++; else return i; if (!(x & (1 << 11))) i++; else return i; if (!(x & (1 << 10))) i++; else return i; if (!(x & (1 << 9))) i++; else return i; if (!(x & (1 << 8))) i++; else return i; if (!(x & (1 << 7))) i++; else return i; if (!(x & (1 << 6))) i++; else return i; if (!(x & (1 << 5))) i++; else return i; if (!(x & (1 << 4))) i++; else return i; if (!(x & (1 << 3))) i++; else return i; if (!(x & (1 << 2))) i++; else return i; if (!(x & (1 << 1))) i++; else return i; if (!(x & (1 << 0))) i++; return i; } #else static inline int count_bits_to_0(unsigned int x) // counting trailing zeroes { // http://www.hackersdelight.org/: nlz1() shortened for 16-bit mask register int n = 0; if (x <= 0x000000FFU) { n = n + 8; x = x << 8; } if (x <= 0x00000FFFU) { n = n + 4; x = x << 4; } if (x <= 0x00003FFFU) { n = n + 2; x = x << 2; } if (x <= 0x00007FFFU) { n = n + 1; } return n; } #endif #endif size_t strlen2(const char* str) { register size_t len = 0; // align to 16 bytes while ((((intptr_t)str) & (sizeof(__m128i) - 1)) != 0) { if (*str++ == 0) return len; ++len; } // search for 0 __m128i xmm0 = _mm_setzero_si128(); __m128i xmm1; int mask = 0; for (;;) { xmm1 = _mm_load_si128((__m128i*)str); xmm1 = _mm_cmpeq_epi8(xmm1, xmm0); if ((mask = _mm_movemask_epi8(xmm1)) != 0) { // got 0 somewhere within 16 bytes in xmm1, or within 16 bits in mask // find index of first set bit #ifndef _DISABLE_ASM_BSF // define it to disable ASM #if (_MSC_VER >= 1300) // make sure <intrin.h> is included unsigned long pos; _BitScanForward(&pos, mask); len += (size_t)pos; #elif defined(_MSC_VER) // earlier MSVC's do not have _BitScanForward, use inline asm __asm bsf edx, mask; edx = bsf(mask) __asm add edx, len; edx += len __asm mov len, edx; len = edx #elif ((__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))) // modern GCC has built-in __builtin_ctz len += __builtin_ctz(mask); #elif defined(__GNUC__) // older GCC shall use inline asm unsigned int pos; asm("bsf %1, %0" : "=r" (pos) : "rm" (mask)); len += (size_t)pos; #else // none of choices exist, use local BSF implementation len += count_bits_to_0(mask); #endif #else len += count_bits_to_0(mask); #endif break; } str += sizeof(__m128i); len += sizeof(__m128i); } return len; } int main() { std::vector<std::string> vstr; const int str_num = 1024; const int str_size = 1024; size_t len_result = 0; srand(0); for (int i = 0; i < str_num; i++) { std::string str1; for (int j = 0; j < str_size; j++) { str1.push_back('0' + rand() % 78); } vstr.push_back(std::move(str1)); } auto strlen_func = strlen; //auto strlen_func = strlen_algo; //auto strlen_func = strlen_sse4; //auto strlen_func = strlen2; auto time_std = std::chrono::steady_clock::now(); for (int k = 0; k < 10*1000; k++) { for (int i = 0; i < str_num; i++) { const char* str_for_test = vstr[i].c_str(); len_result += strlen_func(str_for_test); //len_result += strlen(str_for_test); } for (int i = 0; i < str_num; i++) { const char* str_for_test = vstr[i].c_str(); len_result -= strlen_func(str_for_test); //len_result -= strlen(str_for_test); } } auto finish = std::chrono::steady_clock::now(); double elapsed_seconds = std::chrono::duration_cast<std::chrono::duration<double>>(finish - time_std).count(); std::cout << "\n" << len_result; std::cout << "\n\nTime: " << elapsed_seconds; return 0; } 


弦乐
 len_result += strlen(str_for_test); 
编译
调试:调用strlen库函数;
发布:字节比较。

如果您发表评论并撰写

 len_result += strlen_func(str_for_test); 

在哪里

 auto strlen_func = strlen; 

我们将强制编译器始终调用库函数。

由于在字节比较之前将库函数实现了2.3倍的加速(对于Release,x86、1k)?

由于比较的结果不是一个字节,而是四个字节。这里的所有魔术:

 main_loop: mov eax,dword ptr [ecx] ; read 4 bytes mov edx,7efefeffh add edx,eax xor eax,-1 xor eax,edx add ecx,4 test eax,81010100h je short main_loop 

使用现代处理器的矢量指令是否可以更快地执行操作? 让我们尝试一下。

使用Intel Intrinsics指南 ,我们可以找到仅用于处理字符串的固有_mm_cmpistri SSE4.2。 两个长度为128位的向量和一个操作掩码被馈送到输入。 作为掩码,我们使用:_SIDD_UBYTE_OPS = 0-数据类型,_SIDD_CMP_EQUAL_EACH = 8-字节对齐操作,我们将与零向量进行比较。 返回的值将是第一个成对的不相等元素的数量(也就是说,如果该元素从左到右匹配,则计数停止,如果有人确认该行为,我将很高兴)。

 size_t length = 0; int res = 0; //align to 16 bytes while (((size_t(str+length)) & (sizeof(__m128i) - 1)) != 0) { if (str[length] == 0) return length; length++; } __m128i z128 = _mm_setzero_si128(); for(; ; length += 16) { __m128i data = _mm_load_si128((__m128i*)(str + length)); if ((res = _mm_cmpistri(z128, data, _SIDD_CMP_EQUAL_EACH)) != 16) break; } /*while (str[length]) length++;*/ return length + res; 

代号

 while (((size_t(str+length)) & (sizeof(__m128i) - 1)) != 0) { if (str[length] == 0) return length; length++; } 

用于均衡加载的行的地址,大多数SSE指令需要使用该地址(以16的倍数为单位)。 对于我们使用的pcmpistri指令,严格不需要对齐,也不会抛出访问异常。
内在的
 __m128i data = _mm_load_si128((__m128i*)(str + length)); if ((res = _mm_cmpistri(z128, data, _SIDD_CMP_EQUAL_EACH)) != 16) 

编译成:
 pcmpistri xmm1,xmmword ptr [eax+edx],8 

但是,在我们的情况下,对齐16也是有用的,它可以使速度稍有提高,因此我们可以确保16字节的读取周期不会进入可能未分配的页面( 4K内存页面 )。

周期:
 while (str[length]) length++; 

如果检测到字符串的结尾,则“获取”字符串的大小(因为我不确定_mm_cmpistri的运算算法)。
picul评论后被删除,从而增加了一些细线。

我们做得最快吗? -不幸的是,不,来自https://www.strchr.com/sse2_optimised_strlen的人甚至在不使用SSE4.2的情况下运行得更快。

表2-基准测试的运行时间(以秒为单位)(发布)
字符数MS字节比较斯特伦女士上证4.2上证2
10,x860.150.120.120.125
1K,x869.03.91.651.42
10,x640.150.110.080.1
1K,x648.52.31.61.32

结论:

在我看来,MS始终需要调用strlen库,而不需要进行替换字节比较。

UPD
添加了x64测试。
删除了strlen_SSE4中的最后一个循环

Source: https://habr.com/ru/post/zh-CN467699/


All Articles