Based on https://github.com/rust-lang/rust/issues/73462
/// Example: convert a byte literal to its decimal string representation.
/// `b'a'` has the numeric value 97, so this returns `"97"`.
pub fn conv() -> String {
    let byte: u8 = b'a';
    byte.to_string()
}
/// Example: convert an integer literal to its decimal string representation.
/// Returns `"10"`.
pub fn conv() -> String {
    let value = 10;
    format!("{}", value)
}
Not sure if we can do much about integer to string but I bet we can do something for byte to string.
test tests::bench_byte_to_string ... bench: 51,257 ns/iter (+/- 1,623)
test tests::bench_char_to_string ... bench: 43,246 ns/iter (+/- 7,624)
test tests::bench_int_to_string ... bench: 52,218 ns/iter (+/- 3,077)
test tests::bench_str_to_string ... bench: 15,701 ns/iter (+/- 380)
(note that char benchmark is not updated with the latest specialization update by @lzutao)
Oh, it looks like we don't know whether the u8 holds a valid UTF-8 byte or not — an arbitrary u8 byte may well be invalid. I wonder if one could have a specialized ToString for i8 and the other integer types.
Looking at what u8.to_string() most likely uses, I don't think we need buf = [MaybeUninit::<u8>::uninit(); 39] for u8, but I am not sure whether using a different buffer size for each type will actually speed it up.
I also don't think we can parse 4 characters at a time for u8 or i8. Same, not sure if separating them is good.
I have a rough plan but it isn't quite clear.
One of these things is porting the similar functions in C++ to make it more efficient:
See:
Maybe we could also try porting the go version in https://golang.org/src/strconv/itoa.go?s=1028:1051#L24, and then see which one is the fastest implementation.
I have sort of a patch, but I don't know how long the benchmark will run with ./x.py bench src/libcore. I wish there were a way to benchmark only certain functions; as it is, it took too long.
I wrote a patch to use different stack size to parse different type of integer, not sure if this will improve parsing performance.
Patch
diff --git a/src/libcore/fmt/num.rs b/src/libcore/fmt/num.rs
index 7d77e33d743..1457da8c667 100644
--- a/src/libcore/fmt/num.rs
+++ b/src/libcore/fmt/num.rs
@@ -187,10 +187,10 @@ static DEC_DIGITS_LUT: &[u8; 200] = b"0001020304050607080910111213141516171819\
8081828384858687888990919293949596979899";
macro_rules! impl_Display {
- ($($t:ident),* as $u:ident via $conv_fn:ident named $name:ident) => {
+ ($($t:ident),* as $u:ident via $conv_fn:ident named $name:ident max $max_len:literal) => {
fn $name(mut n: $u, is_nonnegative: bool, f: &mut fmt::Formatter<'_>) -> fmt::Result {
- // 2^128 is about 3*10^38, so 39 gives an extra byte of space
- let mut buf = [MaybeUninit::<u8>::uninit(); 39];
+ // $max_len decimal digits suffice for the widest type in this group
+ let mut buf = [MaybeUninit::<u8>::uninit(); $max_len];
let mut curr = buf.len() as isize;
let buf_ptr = MaybeUninit::first_ptr_mut(&mut buf);
let lut_ptr = DEC_DIGITS_LUT.as_ptr();
@@ -198,28 +198,30 @@ macro_rules! impl_Display {
// SAFETY: Since `d1` and `d2` are always less than or equal to `198`, we
// can copy from `lut_ptr[d1..d1 + 1]` and `lut_ptr[d2..d2 + 1]`. To show
// that it's OK to copy into `buf_ptr`, notice that at the beginning
- // `curr == buf.len() == 39 > log(n)` since `n < 2^128 < 10^39`, and at
- // each step this is kept the same as `n` is divided. Since `n` is always
- // non-negative, this means that `curr > 0` so `buf_ptr[curr..curr + 1]`
- // is safe to access.
+ // `curr == buf.len() == $max_len > log(n)` since `n < 10^$max_len`,
+ // and at each step this is kept the same as `n` is divided. Since `n`
+ // is always non-negative, this means that `curr > 0` so
+ // `buf_ptr[curr..curr + 1]` is safe to access.
unsafe {
// need at least 16 bits for the 4-characters-at-a-time to work.
assert!(crate::mem::size_of::<$u>() >= 2);
// eagerly decode 4 characters at a time
- while n >= 10000 {
- let rem = (n % 10000) as isize;
- n /= 10000;
-
- let d1 = (rem / 100) << 1;
- let d2 = (rem % 100) << 1;
- curr -= 4;
-
- // We are allowed to copy to `buf_ptr[curr..curr + 3]` here since
- // otherwise `curr < 0`. But then `n` was originally at least `10000^10`
- // which is `10^40 > 2^128 > n`.
- ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2);
- ptr::copy_nonoverlapping(lut_ptr.offset(d2), buf_ptr.offset(curr + 2), 2);
+ if $max_len > 4 {
+ while n >= 10000 {
+ let rem = (n % 10000) as isize;
+ n /= 10000;
+
+ let d1 = (rem / 100) << 1;
+ let d2 = (rem % 100) << 1;
+ curr -= 4;
+
+ // We are allowed to copy to `buf_ptr[curr..curr + 3]` here since
+ // otherwise `curr < 0`. But then `n` was originally at least `10000^10`
+ // which is `10^40 > 2^128 > n`.
+ ptr::copy_nonoverlapping(lut_ptr.offset(d1), buf_ptr.offset(curr), 2);
+ ptr::copy_nonoverlapping(lut_ptr.offset(d2), buf_ptr.offset(curr + 2), 2);
+ }
}
// if we reach here numbers are <= 9999, so at most 4 chars long
@@ -440,10 +442,10 @@ macro_rules! impl_Exp {
#[cfg(any(target_pointer_width = "64", target_arch = "wasm32"))]
mod imp {
use super::*;
- impl_Display!(
- i8, u8, i16, u16, i32, u32, i64, u64, usize, isize
- as u64 via to_u64 named fmt_u64
- );
+ impl_Display!(i8, u8 as u64 via to_u64 named fmt_u8 max 3);
+ impl_Display!(i16, u16 as u64 via to_u64 named fmt_u16 max 5);
+ impl_Display!(i32, u32 as u64 via to_u64 named fmt_u32 max 10);
+ impl_Display!(i64, u64, usize, isize as u64 via to_u64 named fmt_u64 max 20);
impl_Exp!(
i8, u8, i16, u16, i32, u32, i64, u64, usize, isize
as u64 via to_u64 named exp_u64
@@ -453,11 +455,13 @@ mod imp {
#[cfg(not(any(target_pointer_width = "64", target_arch = "wasm32")))]
mod imp {
use super::*;
- impl_Display!(i8, u8, i16, u16, i32, u32, isize, usize as u32 via to_u32 named fmt_u32);
- impl_Display!(i64, u64 as u64 via to_u64 named fmt_u64);
+ impl_Display!(i8, u8 as u32 via to_u32 named fmt_u8 max 3);
+ impl_Display!(i16, u16 as u32 via to_u32 named fmt_u16 max 5);
+ impl_Display!(i32, u32, isize, usize as u32 via to_u32 named fmt_u32 max 10);
+ impl_Display!(i64, u64 as u64 via to_u64 named fmt_u64 max 20);
impl_Exp!(i8, u8, i16, u16, i32, u32, isize, usize as u32 via to_u32 named exp_u32);
impl_Exp!(i64, u64 as u64 via to_u64 named exp_u64);
}
-impl_Display!(i128, u128 as u128 via to_u128 named fmt_u128);
+impl_Display!(i128, u128 as u128 via to_u128 named fmt_u128 max 39);
impl_Exp!(i128, u128 as u128 via to_u128 named exp_u128);
@lzutao How do I know if it is faster quickly?
I may build stage1 rustc and run your benchmark. Also comparing with result of master rustc.
@lzutao How do you do that? I use ./x.py bench src/libcore but it takes so long even when I run that on a shared compile farm with 16 cores.
The hard-coded number is not bad, but I would go with creating a NumLimits trait similar to std::numeric_limits to request the number of decimal digits for an integer.
How do you do that?
I do ./x.py build --stage=1 src/rustc and use rustup toolchain link stage1 build/<target>/stage1
to create a custom toolchain. Then I use that toolchain* to bench custom benchmark (not libcore).
*: with rustup override or cargo +stage1.
But rust does not have numeric limits right? So I need to add something like MAX and MIN but keep it only for internal use?
Yeah, my plan is to add that trait to Rust. But you are free to hardcode that number to macro
unless lib/compiler team push back.
rustup toolchain set?
My mistake rustup toolchain link
Btw, why did you want to run bench on src/libcore?
@lzutao I am thinking of trying out the benchmark first and then tweak it, later only add it as a trait, but I don't know if trait can carry value. But still, making all of what you say when it isn't faster isn't very useful for me.
Btw, why do you want to run bench on src/libcore?
Because I want to test whether Display for u8 actually improved, which should also improve the speed of u8.to_string(). I saw that there might be some benchmarks for it there.
EDIT: @lzutao By the way, it's rustup toolchain link stage1 build/x86_64-unknown-linux-gnu/stage1.
But still, making all of what you say when it isn't faster isn't very useful for me
It is one brick of the procedure. You might be interested in reading the source code of the to_string function in C++.
It is one brick of the procedure. You might be interested in reading the source code of the to_string function in C++.
Thanks, that's very helpful. I was looking for that here and there but I cannot find it.
@lzutao But how should I proceed with this? The method you specified does not seem to work for libcore — it fails with a bunch of errors.
My idea is to work it out in two parts: first, identify whether this speeds up integer-to-string conversion; second, work out the numeric limits you mentioned. I don't know yet if it is faster, but I will try running the benchmark here, though it will probably take a long time.
But how should I proceed with this?
You shouldn't have to. That is my very long term/rough plan and I may have to write an RFC/demo to convince the compiler team.
But I don't know if it is faster
Probably writing a micro benchmark to compare C++ to_string<int> and Rust {int}.to_string.
Honestly I don't know, that is in my own hope.
is intended to allow the fastest possible implementation that is useful in common high-throughput contexts such as text-based interchange (JSON or XML).
I tried running a benchmark and here are the results.
Summary of cargo benchcmp old.txt new.txt --threshold 5
name old ns/iter new ns/iter diff ns/iter diff % speedup
ascii::long::case10_mask_mult_bool_lookup_table 36,285 (192 MB/s) 33,184 (210 MB/s) -3,101 -8.55% x 1.09
ascii::medium::is_ascii_alphanumeric 150 (213 MB/s) 162 (197 MB/s) 12 8.00% x 0.93
ascii::medium::is_ascii_control 136 (235 MB/s) 126 (253 MB/s) -10 -7.35% x 1.08
ascii::medium::is_ascii_uppercase 136 (235 MB/s) 129 (248 MB/s) -7 -5.15% x 1.05
ascii::medium::is_ascii_whitespace 134 (238 MB/s) 127 (251 MB/s) -7 -5.22% x 1.06
ascii::short::case00_alloc_only 133 (52 MB/s) 126 (55 MB/s) -7 -5.26% x 1.06
ascii::short::case03_branch_and_subtract 157 (44 MB/s) 146 (47 MB/s) -11 -7.01% x 1.08
ascii::short::is_ascii_alphabetic 148 (47 MB/s) 138 (50 MB/s) -10 -6.76% x 1.07
ascii::short::is_ascii_digit 180 (38 MB/s) 133 (52 MB/s) -47 -26.11% x 1.35
ascii::short::is_ascii_punctuation 145 (48 MB/s) 132 (53 MB/s) -13 -8.97% x 1.10
fmt::write_vec_macro2 154,290 166,191 11,901 7.71% x 0.93
iter::bench_enumerate_ref_sum 20,468,050 22,914,835 2,446,785 11.95% x 0.89
iter::bench_filter_map_ref_sum 24,912,476 22,021,140 -2,891,336 -11.61% x 1.13
iter::bench_flat_map_chain_ref_sum 136,840,693 166,930,595 30,089,902 21.99% x 0.82
iter::bench_flat_map_sum 16,049,100 16,852,320 803,220 5.00% x 0.95
iter::bench_inspect_chain_sum 32,036,133 36,677,341 4,641,208 14.49% x 0.87
iter::bench_take_while_chain_ref_sum 25,880,655 27,220,579 1,339,924 5.18% x 0.95
iter::bench_zip_copy 1,710 1,964 254 14.85% x 0.87
num::bench_i32_from_str_radix_36 272,111 293,915 21,804 8.01% x 0.93
num::bench_u16_from_str_radix_36 197,457 224,978 27,521 13.94% x 0.88
num::dec2flt::bench_short_decimal 217 233 16 7.37% x 0.93
slice::rotate_16_usize_5 18,022 19,973 1,951 10.83% x 0.90
slice::rotate_64_usize_5 318,033 347,852 29,819 9.38% x 0.91
slice::rotate_u8 1,941 2,108 167 8.60% x 0.92
But still, I don't know why the results differ so much — maybe because it's a shared compile farm.