<charconv> is very fast, but it could be faster. One issue is that MSVC's optimizer often generates slower code for <charconv> than Clang/LLVM. We should create reduced test cases and file bugs on Developer Community for the optimizer team to investigate and fix.
At the bottom of this issue, I've prepared a self-contained test case using <charconv>. Here's a table of my timing results (Intel Core i7-8700, VS 2019 16.8 Preview 2, Clang/LLVM 10.0.0). The measured times are average nanoseconds per floating-point value.
| Scenario | MSVC x86 | LLVM x86 | LLVM x86 Speedup Ratio | MSVC x64 | LLVM x64 | LLVM x64 Speedup Ratio |
|----------|-------:|---------:|-----------------------:|-------:|---------:|-----------------------:|
| float plain shortest | 48.3 | 45.5 | 1.06 | 41.9 | 35.7 | :warning: 1.17 |
| double plain shortest | 92.5 | 71.8 | :x: 1.29 | 49.1 | 39.4 | :x: 1.25 |
| float scientific shortest | 47.2 | 41.4 | :warning: 1.14 | 38.3 | 31.6 | :x: 1.21 |
| double scientific shortest | 92.6 | 72.5 | :x: 1.28 | 48.0 | 39.6 | :x: 1.21 |
| float fixed shortest | 58.9 | 51.1 | :warning: 1.15 | 47.6 | 38.8 | :x: 1.23 |
| double fixed shortest | 338.5 | 298.4 | :warning: 1.13 | 145.9 | 111.6 | :x: 1.31 |
| float general shortest | 48.8 | 42.5 | :warning: 1.15 | 39.3 | 32.5 | :x: 1.21 |
| double general shortest | 92.9 | 72.4 | :x: 1.28 | 48.6 | 38.5 | :x: 1.26 |
| float hex shortest | 25.8 | 26.4 | :heavy_check_mark: 0.98 | 21.0 | 23.8 | :heavy_check_mark: 0.88 |
| double hex shortest | 109.3 | 53.8 | :boom: 2.03 | 27.7 | 26.6 | 1.04 |
| float scientific 8 | 116.5 | 102.3 | :warning: 1.14 | 63.8 | 58.0 | :warning: 1.10 |
| double scientific 16 | 145.5 | 128.7 | :warning: 1.13 | 77.5 | 65.1 | :warning: 1.19 |
| float fixed 6 (lossy) | 83.8 | 79.3 | 1.06 | 45.2 | 41.2 | :warning: 1.10 |
| double fixed 6 (lossy) | 294.7 | 270.6 | 1.09 | 121.4 | 94.1 | :x: 1.29 |
| float general 9 | 134.5 | 118.8 | :warning: 1.13 | 79.5 | 70.7 | :warning: 1.12 |
| double general 17 | 176.2 | 153.8 | :warning: 1.15 | 97.6 | 81.4 | :x: 1.20 |
| float hex 6 | 25.7 | 21.2 | :x: 1.21 | 21.0 | 17.4 | :x: 1.21 |
| double hex 13 | 103.0 | 26.4 | :boom: 3.90 | 26.9 | 30.6 | :heavy_check_mark: 0.88 |
The "LLVM Speedup Ratio" is the MSVC time divided by the LLVM time, so higher values mean that MSVC is slower than what LLVM has proven is possible on this hardware. I've marked values >= 2.0 with :boom:, values >= 1.2 with :x:, values >= 1.1 with :warning:, and values < 1.0 with :heavy_check_mark: (this is when MSVC is faster).
There are at least two separate issues here:
x86 double hex :boom:. This codepath performs bitwise operations with uint64_t. Something terrible is happening here, causing code that should be ultra-fast to instead be extremely slow (relatively speaking). This seems specific to the integer type being wider than the machine's preferred width. The good news is that because this codepath is so simple, it should be easy to extract into a minimal test case, and because the issue is so severe, the cause will hopefully be easy for the optimizer devs to find and fix. The functions are: https://github.com/microsoft/STL/blob/b90b3e05d2eb77e3178fb83c2e1eb7d539debf21/stl/inc/charconv#L2079-L2432
Decimal slowness, from moderate :warning: to significant :x:. These optimizer issues affect @ulfjack's amazing Ryu and Ryu Printf algorithms (used for the "shortest" and "precision" scenarios, respectively, although shortest does need to invoke Ryu Printf in a specific situation). When I worked on importing them to <charconv>, I reported as many optimizer issues as I could, MSVC (and LLVM) were able to fix some of them, and I was able to improve the source workarounds for other issues. However, there are still persistent issues with MSVC's optimizer.
Bonus issue: Conversely, MSVC appears to be generating noticeably faster code for the double hex 13 scenario on x64 :heavy_check_mark:. This indicates that LLVM has room for improvement; an issue could be reduced and reported to them.
<charconv> test case ("C1XX/C2" refers to the MSVC compiler front-end/back-end, like "Clang/LLVM"):
C:\Temp>type cppcon.cpp
// cl /std:c++17 /EHsc /nologo /W4 /MT /O2 cppcon.cpp && cppcon
// clang-cl /std:c++17 /EHsc /nologo /W4 /MT /O2 cppcon.cpp && cppcon
#include <charconv>
#include <chrono>
#include <random>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <system_error>
#include <type_traits>
#include <vector>
using namespace std;
using namespace std::chrono;
void verify(const bool b) {
if (!b) {
puts("FAIL");
exit(EXIT_FAILURE);
}
}
enum class RoundTrip { Sci, Fix, Gen, Hex, Lossy };
constexpr size_t N = 2'000'000; // how many floating-point values to test
constexpr size_t K = 5; // how many times to repeat the test, for cleaner timing
constexpr size_t BufSize = 2'000; // more than enough
unsigned int global_dummy = 0;
constexpr chars_format chars_format_from_RoundTrip(const RoundTrip rt) {
switch (rt) {
case RoundTrip::Sci:
return chars_format::scientific;
case RoundTrip::Fix:
return chars_format::fixed;
case RoundTrip::Gen:
return chars_format::general;
case RoundTrip::Hex:
return chars_format::hex;
case RoundTrip::Lossy:
default:
puts("FAIL");
exit(EXIT_FAILURE);
}
}
template <RoundTrip RT, typename Floating, typename... Args>
void test_to_chars(const char* const str, const vector<Floating>& vec, const Args&... args) {
char buf[BufSize];
const auto start = steady_clock::now();
for (size_t k = 0; k < K; ++k) {
for (const auto& elem : vec) {
const auto result = to_chars(buf, buf + BufSize, elem, args...);
global_dummy += static_cast<unsigned int>(result.ptr - buf);
global_dummy += static_cast<unsigned int>(buf[0]);
}
}
const auto finish = steady_clock::now();
printf("%6.1f ns | %s\n", duration<double, nano>{finish - start}.count() / (N * K), str);
for (const auto& elem : vec) {
const auto result = to_chars(buf, buf + BufSize, elem, args...);
verify(result.ec == errc{});
if constexpr (RT == RoundTrip::Lossy) {
// skip lossy conversions
} else {
Floating round_trip;
const auto from_result = from_chars(buf, result.ptr, round_trip, chars_format_from_RoundTrip(RT));
verify(from_result.ec == errc{});
verify(from_result.ptr == result.ptr);
verify(round_trip == elem);
}
}
}
int main() {
#if defined(__clang__) && defined(_M_IX86)
const char* const toolset = "Clang/LLVM x86 + MSVC STL";
#elif defined(__clang__) && defined(_M_X64)
const char* const toolset = "Clang/LLVM x64 + MSVC STL";
#elif !defined(__clang__) && defined(_M_IX86)
const char* const toolset = "C1XX/C2 x86 + MSVC STL";
#elif !defined(__clang__) && defined(_M_X64)
const char* const toolset = "C1XX/C2 x64 + MSVC STL";
#else
const char* const toolset = "Unknown Toolset";
#endif
puts(toolset);
vector<float> vec_flt;
vector<double> vec_dbl;
{
mt19937_64 mt64;
vec_flt.reserve(N);
while (vec_flt.size() < N) {
const uint32_t val = static_cast<uint32_t>(mt64());
constexpr uint32_t inf_nan = 0x7F800000U;
if ((val & inf_nan) == inf_nan) {
continue; // skip INF/NAN
}
float flt;
static_assert(sizeof(flt) == sizeof(val));
memcpy(&flt, &val, sizeof(flt));
vec_flt.push_back(flt);
}
vec_dbl.reserve(N);
while (vec_dbl.size() < N) {
const uint64_t val = mt64();
constexpr uint64_t inf_nan = 0x7FF0000000000000ULL;
if ((val & inf_nan) == inf_nan) {
continue; // skip INF/NAN
}
double dbl;
static_assert(sizeof(dbl) == sizeof(val));
memcpy(&dbl, &val, sizeof(dbl));
vec_dbl.push_back(dbl);
}
}
test_to_chars<RoundTrip::Gen>("STL float plain shortest", vec_flt);
test_to_chars<RoundTrip::Gen>("STL double plain shortest", vec_dbl);
test_to_chars<RoundTrip::Sci>("STL float scientific shortest", vec_flt, chars_format::scientific);
test_to_chars<RoundTrip::Sci>("STL double scientific shortest", vec_dbl, chars_format::scientific);
test_to_chars<RoundTrip::Fix>("STL float fixed shortest", vec_flt, chars_format::fixed);
test_to_chars<RoundTrip::Fix>("STL double fixed shortest", vec_dbl, chars_format::fixed);
test_to_chars<RoundTrip::Gen>("STL float general shortest", vec_flt, chars_format::general);
test_to_chars<RoundTrip::Gen>("STL double general shortest", vec_dbl, chars_format::general);
test_to_chars<RoundTrip::Hex>("STL float hex shortest", vec_flt, chars_format::hex);
test_to_chars<RoundTrip::Hex>("STL double hex shortest", vec_dbl, chars_format::hex);
test_to_chars<RoundTrip::Sci>("STL float scientific 8", vec_flt, chars_format::scientific, 8);
test_to_chars<RoundTrip::Sci>("STL double scientific 16", vec_dbl, chars_format::scientific, 16);
test_to_chars<RoundTrip::Lossy>("STL float fixed 6 (lossy)", vec_flt, chars_format::fixed, 6);
test_to_chars<RoundTrip::Lossy>("STL double fixed 6 (lossy)", vec_dbl, chars_format::fixed, 6);
test_to_chars<RoundTrip::Gen>("STL float general 9", vec_flt, chars_format::general, 9);
test_to_chars<RoundTrip::Gen>("STL double general 17", vec_dbl, chars_format::general, 17);
test_to_chars<RoundTrip::Hex>("STL float hex 6", vec_flt, chars_format::hex, 6);
test_to_chars<RoundTrip::Hex>("STL double hex 13", vec_dbl, chars_format::hex, 13);
puts("----------");
printf("global_dummy: %u\n", global_dummy);
}
@statementreply wrote:
On x86 double hex to_chars performance:
MSVC generates 1.7x faster code with /arch:SSE
Oh I guess I found out the culprit. MSVC’s SSE2 codegen writes two adjacent 32-bit memory, and then performs a 64-bit load.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80833
Looks similar to this GCC bug report
@statementreply wrote:
On x86 double hex to_chars performance:
MSVC generates 1.7x faster code with /arch:SSE
Oh I guess I found out the culprit. MSVC’s SSE2 codegen writes two adjacent 32-bit memory, and then performs a 64-bit load.
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80833
Looks similar to this GCC bug report
Filed DevCom-1167526.
C:\Users\He\source\test\perf>type shift64_repo.cpp
using uint64 = unsigned long long;
extern const char* const digits;
char* test_func(char* str, uint64 value, int precision) noexcept {
int shift = 60;
for (;;) {
*str++ = digits[value >> shift];
--precision;
if (precision == 0) {
break;
}
const uint64 mask = (uint64{1} << shift) - 1;
value &= mask;
shift -= 4;
}
return str;
}
md5-3103ccd3c8202b809ed13eee030a0080
C:\Users\He\source\test\perf>cl /EHsc /W4 /WX /O2 /c /Fa shift64_repo.cpp
用于 x86 的 Microsoft (R) C/C++ 优化编译器 19.28.29213 版
版权所有(C) Microsoft Corporation。保留所有权利。
shift64_repo.cpp
C:\Users\He\source\test\perf>type shift64_repo.asm
md5-3103ccd3c8202b809ed13eee030a0080
; Listing generated by Microsoft (R) Optimizing Compiler Version 19.28.29213.0
TITLE C:\Users\He\source\test\perf\shift64_repo.cpp
.686P
.XMM
include listing.inc
.model flat
INCLUDELIB LIBCMT
INCLUDELIB OLDNAMES
PUBLIC ?test_func@@YAPADPAD_KH@Z ; test_func
EXTRN ?digits@@3QBDB:DWORD ; digits
; Function compile flags: /Ogtpy
; COMDAT ?test_func@@YAPADPAD_KH@Z
_TEXT SEGMENT
_str$ = 8 ; size = 4
_value$ = 12 ; size = 8
_precision$ = 20 ; size = 4
?test_func@@YAPADPAD_KH@Z PROC ; test_func, COMDAT
; File C:\Users\He\source\test\perf\shift64_repo.cpp
; Line 7
mov eax, DWORD PTR ?digits@@3QBDB ; digits
push ebx
mov ebx, DWORD PTR _value$[esp+4]
mov ecx, ebx
push ebp
mov ebp, DWORD PTR _precision$[esp+4]
push esi
mov esi, DWORD PTR _str$[esp+8]
shr ecx, 28 ; 0000001cH
push edi
mov edi, 60 ; 0000003cH
mov al, BYTE PTR [ecx+eax]
mov BYTE PTR [esi], al
inc esi
sub ebp, 1
; Line 10
je SHORT $LN13@test_func
npad 6
$LL2@test_func:
; Line 14
xor ecx, ecx
xor edx, edx
mov eax, edi
bts ecx, eax
cmp eax, 32 ; 00000020H
cmovae edx, ecx
xor ecx, edx
cmp eax, 64 ; 00000040H
cmovae edx, ecx
add ecx, -1
adc edx, -1
; Line 15
and DWORD PTR _value$[esp+12], ecx
mov ecx, DWORD PTR ?digits@@3QBDB ; digits
and ebx, edx
; Line 16
sub edi, 4
mov DWORD PTR _value$[esp+16], ebx
movq xmm1, QWORD PTR _value$[esp+12]
movd xmm0, edi
psrlq xmm1, xmm0
movd edx, xmm1
mov cl, BYTE PTR [edx+ecx]
mov BYTE PTR [esi], cl
inc esi
sub ebp, 1
jne SHORT $LL2@test_func
$LN13@test_func:
pop edi
; Line 20
mov eax, esi
pop esi
pop ebp
pop ebx
ret 0
?test_func@@YAPADPAD_KH@Z ENDP ; test_func
_TEXT ENDS
END
md5-f328d590e3d20cb8dd7f233a4bd6ca9b
and DWORD PTR _value$[esp+12], ecx
mov ecx, DWORD PTR ?digits@@3QBDB
and ebx, edx
sub edi, 4
mov DWORD PTR _value$[esp+16], ebx
movq xmm1, QWORD PTR _value$[esp+12]
Most helpful comment
Filed DevCom-1167526.