The simplest echo server I can think of transfers less than 1 MB/s. It doesn't matter whether it's evented or not, or whether it uses streams or not.
// To use:
// 1. Start the server with "zig build run"
// 2. Then test it with "time dd if=/dev/zero | nc -v -v -n 127.0.0.1 9001"
// 3. Press ctrl+c and dd will output the speed
const std = @import("std");
const net = std.net;
pub const io_mode = .evented;
pub fn main() anyerror!void {
    const allocator = std.heap.direct_allocator;
    const req_listen_addr = try net.Address.parseIp4("127.0.0.1", 9001);
    std.event.Loop.instance.?.beginOneEvent();
    var server = net.StreamServer.init(.{});
    defer server.deinit();
    try server.listen(req_listen_addr);
    std.debug.warn("listening at {}\n", server.listen_address);
    while (true) {
        const conn = try server.accept();
        std.debug.warn("connected to {}\n", conn.address);
        echo(conn) catch |err| {
            std.debug.warn("connection dropped {}\n", conn.address);
        };
    }
}
pub fn echo(conn: net.StreamServer.Connection) !void {
    var in_stream = &conn.file.inStream().stream;
    var out_stream = &conn.file.outStream().stream;
    while (true) {
        var c: u8 = try in_stream.readByte();
        try out_stream.writeByte(c);
    }
}
dd if=/dev/zero | nc -v -v -n 127.0.0.1 9001
Connection to 127.0.0.1 9001 port [tcp/*] succeeded!
^C17529+0 records in
17528+0 records out
8974336 bytes (9.0 MB, 8.6 MiB) copied, 17.2288 s, 521 kB/s
It also says 104 (ECONNRESET) is an unexpected errno when aborting.
Sorry for all the noise. Streams should use a bigger buffer with os.read to avoid serious performance hits.
pub fn echo(conn: net.StreamServer.Connection) !void {
    while (true) {
        var buffer: [8096]u8 = undefined;
        var n = try conn.file.read(buffer[0..]);
        //try conn.file.write(buffer[0..n]);
    }
}
Gives a throughput of around 350 MB/s, while a single-byte buffer (as used by the stream API) reads at around 1.7 MB/s.
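For completeness, a real echo loop (rather than the read-only benchmark above) would write back only the n bytes that were actually read and stop once the peer closes. A rough sketch against the same era's API, using a hypothetical echoBuffered name and assuming File.write takes a slice and writes all of it:
pub fn echoBuffered(conn: net.StreamServer.Connection) !void {
    var buffer: [8192]u8 = undefined;
    while (true) {
        const n = try conn.file.read(buffer[0..]);
        if (n == 0) return; // peer closed the connection
        // assumption: this era's File.write writes the whole slice
        try conn.file.write(buffer[0..n]);
    }
}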
Did you mean to re-open?
Yes, it'd be nice if the InStream used some sort of intermediate buffer as anything using readByte will be slow (as discovered above).
You might be looking for std.io.BufferedInStream
Doh... yes. Thanks.
Using a buffered in stream is still slow
pub fn echo(conn: net.StreamServer.Connection) !void {
    var in_stream = &std.io.BufferedInStream(std.fs.File.ReadError).init(&conn.file.inStream().stream).stream;
    //var out_stream = &conn.file.outStream().stream;
    while (true) {
        var c = try in_stream.readByte();
        //try conn.file.write(buffer[0..]);
    }
}
Reading using file.read with a buf[4096] directly:
(base) jrm@inx:~$ dd if=/dev/zero | nc -v -v -n 127.0.0.1 9002
Connection to 127.0.0.1 9002 port [tcp/*] succeeded!
^C5646434+0 records in
5646434+0 records out
2890974208 bytes (2.9 GB, 2.7 GiB) copied, 7.81546 s, 370 MB/s
Reading using a buffered in stream:
(base) jrm@inx:~$ dd if=/dev/zero | nc -v -v -n 127.0.0.1 9002
Connection to 127.0.0.1 9002 port [tcp/*] succeeded!
^C334972+0 records in
334971+0 records out
171505152 bytes (172 MB, 164 MiB) copied, 7.0789 s, 24.2 MB/s
(base) jrm@inx:~$ dd if=/dev/zero | nc -v -v -n 127.0.0.1 9002
Connection to 127.0.0.1 9002 port [tcp/*] succeeded!
^C497788+0 records in
497787+0 records out
254866944 bytes (255 MB, 243 MiB) copied, 10.3641 s, 24.6 MB/s
Is this due to LLVM not knowing how to optimize zig's current interface pattern? Or something else?
I've been debugging this all morning and found several things.
// To use:
// 1. Start the server with "zig build run"
// 2. Then test it with "time dd if=/dev/zero | nc -v -v -n 127.0.0.1 9002"
// 3. Press ctrl+c and dd will output the speed
const std = @import("std");
const net = std.net;
const os = std.os;
const math = std.math;
const mem = std.mem;
const File = std.fs.File;
const assert = std.debug.assert;
//pub const io_mode = .evented;
const buffer_size = 4096;
pub fn main() anyerror!void {
    const allocator = std.heap.direct_allocator;
    const req_listen_addr = try net.Address.parseIp4("127.0.0.1", 9002);
    //std.event.Loop.instance.?.beginOneEvent();
    var server = net.StreamServer.init(.{});
    defer server.deinit();
    try server.listen(req_listen_addr);
    std.debug.warn("listening at {}\n", server.listen_address);
    const conn = try server.accept();
    std.debug.warn("connected to {}\n", conn.address);

    // 9008128 bytes (9.0 MB, 8.6 MiB) copied, 4.68951 s, 1.9 MB/s
    //try readUnbuffered(conn);

    // 1767279616 bytes (1.8 GB, 1.6 GiB) copied, 4.97003 s, 356 MB/s
    //try readSlice(conn);

    // 963648512 bytes (964 MB, 919 MiB) copied, 7.86233 s, 123 MB/s
    //try readBufferedInStream(conn);

    // 1465704448 bytes (1.5 GB, 1.4 GiB) copied, 7.90966 s, 185 MB/s
    //try readBufferedReader(conn);

    // 1600124928 bytes (1.6 GB, 1.5 GiB) copied, 6.77311 s, 236 MB/s
    try readBufferedReaderSingle(conn);

    // 2934166528 bytes (2.9 GB, 2.7 GiB) copied, 8.03281 s, 365 MB/s
    //try readRaw(conn);
}
pub fn readUnbuffered(conn: net.StreamServer.Connection) !void {
    const in_stream = &conn.file.inStream().stream;
    while (true) {
        var c = try in_stream.readByte();
    }
}

pub fn readSlice(conn: net.StreamServer.Connection) !void {
    const in_stream = &conn.file.inStream().stream;
    var buffer: [4096]u8 = undefined;
    while (true) {
        var c = try in_stream.read(buffer[0..]);
    }
}

pub fn readBufferedInStream(conn: net.StreamServer.Connection) !void {
    const in_stream = &std.io.BufferedInStream(File.ReadError).init(&conn.file.inStream().stream).stream;
    while (true) {
        var c = try in_stream.readByte();
    }
}

pub fn readBufferedReader(conn: net.StreamServer.Connection) !void {
    const in_stream = &BufferedReader.init(&conn.file.inStream().stream).stream;
    while (true) {
        var c = try in_stream.readByte();
    }
}

pub fn readBufferedReaderSingle(conn: net.StreamServer.Connection) !void {
    const in_stream = &BufferedReader.init(&conn.file.inStream().stream).stream;
    while (true) {
        var dest: [1]u8 = undefined;
        var c = try in_stream.read(dest[0..]);
    }
}

pub fn readRaw(conn: net.StreamServer.Connection) !void {
    var buffer: [4096]u8 = undefined;
    var start_index: usize = buffer_size;
    var end_index: usize = buffer_size;
    while (true) {
        var dest: [1]u8 = undefined;
        if (start_index == end_index) {
            start_index = 0;
            end_index = try os.read(conn.file.handle, buffer[0..]);
        }
        dest[0] = buffer[start_index];
        start_index += 1;
    }
}
const BufferedReader = struct {
    pub const Stream = File.InStream.Stream;

    unbuffered_in_stream: *Stream,
    stream: Stream,
    buffer: [buffer_size]u8,
    start_index: usize,
    end_index: usize,

    pub fn init(unbuffered_in_stream: *Stream) BufferedReader {
        return BufferedReader{
            .unbuffered_in_stream = unbuffered_in_stream,
            .buffer = undefined,
            // Initialize these two fields to buffer_size so that
            // in `readFn` we treat the state as being able to read
            // more from the unbuffered stream. If we set them to 0
            // and 0, the code would think we already hit EOF.
            .start_index = buffer_size,
            .end_index = buffer_size,
            .stream = Stream{ .readFn = readFn },
        };
    }

    fn readFn(in_stream: *Stream, dest: []u8) !usize {
        const self = @fieldParentPtr(BufferedReader, "stream", in_stream);

        // Hot path for one byte reads
        if (dest.len == 1 and self.end_index > self.start_index) {
            dest[0] = self.buffer[self.start_index];
            self.start_index += 1;
            return 1;
        }

        var dest_index: usize = 0;
        while (true) {
            const dest_space = dest.len - dest_index;
            if (dest_space == 0) {
                return dest_index;
            }
            const amt_buffered = self.end_index - self.start_index;
            if (amt_buffered == 0) {
                assert(self.end_index <= buffer_size);
                // Make sure the last read actually gave us some data
                if (self.end_index == 0) {
                    // reading from the unbuffered stream returned nothing
                    // so we have nothing left to read.
                    return dest_index;
                }
                // we can read more data from the unbuffered stream
                if (dest_space < buffer_size) {
                    self.start_index = 0;
                    self.end_index = try self.unbuffered_in_stream.read(self.buffer[0..]);
                    // Shortcut
                    if (self.end_index >= dest_space) {
                        mem.copy(u8, dest[dest_index..], self.buffer[0..dest_space]);
                        self.start_index = dest_space;
                        return dest.len;
                    }
                } else {
                    // asking for so much data that buffering is actually less efficient.
                    // forward the request directly to the unbuffered stream
                    const amt_read = try self.unbuffered_in_stream.read(dest[dest_index..]);
                    return dest_index + amt_read;
                }
            }
            const copy_amount = math.min(dest_space, amt_buffered);
            const copy_end_index = self.start_index + copy_amount;
            mem.copy(u8, dest[dest_index..], self.buffer[self.start_index..copy_end_index]);
            self.start_index = copy_end_index;
            dest_index += copy_amount;
        }
    }
};
It'd be good to do some comparisons to other languages.
Also, nc -l 9002 > /dev/null reads at around 350-360 MB/s, so I'd expect that to be the ideal range (the same as the raw reads above).
There seems to be a significant impact when going through functions. Maybe some inlining could speed things up?
I believe that's where LLVM not being able to devirtualize Zig's interface pattern comes in. This is an issue I plan to address one way or another (different interface pattern? send a patch to improve LLVM?).
The mixin pattern should fix this
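To make that concrete, here is a rough sketch (illustrative names, not the actual std redesign) of a mixin-style reader: the concrete read function is a comptime parameter, so readByte resolves to a direct, inlinable call instead of going through a runtime function pointer.
const std = @import("std");

// Sketch only: `readFn` is comptime-known, so every call through this
// wrapper is statically dispatched and can be inlined by LLVM.
fn MixinReader(comptime Context: type, comptime readFn: fn (ctx: Context, buf: []u8) anyerror!usize) type {
    return struct {
        context: Context,

        const Self = @This();

        pub fn read(self: Self, buf: []u8) !usize {
            return readFn(self.context, buf);
        }

        pub fn readByte(self: Self) !u8 {
            var b: [1]u8 = undefined;
            const n = try self.read(b[0..]);
            if (n == 0) return error.EndOfStream;
            return b[0];
        }
    };
}

// Hypothetical usage: wrap File.read so readByte becomes a direct call.
fn fileRead(file: std.fs.File, buf: []u8) anyerror!usize {
    return file.read(buf);
}
const FileReader = MixinReader(std.fs.File, fileRead);
The key point is that there is no runtime readFn pointer anywhere in this pattern, so the optimizer can see through every call.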
Some more updates... I was testing this wrong. Netcat can do ~2.2 GB/s; the limit in my tests above was dd's small default block size of 512 bytes.
A better command to test with is dd bs=64k count=10G if=/dev/zero iflag=count_bytes | nc -v -N 127.0.0.1 9002, which copies 10 GB using a 64k block size (netcat uses a buffer of 16384) and then stops.
Piping this into nc -l 9002 > /dev/null got:
10737418240 bytes (11 GB, 10 GiB) copied, 4.73732 s, 2.3 GB/s
The readSlice function is slightly faster :)
10737418240 bytes (11 GB, 10 GiB) copied, 4.58902 s, 2.3 GB/s
However, even with the latest changes, the BufferedInStream is still horribly slow:
5749080064 bytes (5.7 GB, 5.4 GiB) copied, 23.602 s, 244 MB/s
@andrewrk The latest read test script shows that the __inheritance pattern does appear to have a significant negative performance impact__.
This impact is not seen when doing buffered reads because the call count is significantly lower. E.g. a stream.read(buf[0..buf_size]) is called once per buf_size bytes, while stream.readByte is called once per byte (i.e. buf_size times as many calls).
To demonstrate this I added two functions to a BufferedReader struct that are completely identical except that one uses @fieldParentPtr and the other does not. The one using @fieldParentPtr reads at ~600 MB/s while the other reads at ~1.4 GB/s.
To try it yourself, set READ_FN = 7 and run it 2 or 3 times, then change to READ_FN = 8 and run it several times again.
Likewise, the readRawStream is almost twice as slow at 1.3 GB/s as readRaw, which reads at around ~2.4 GB/s.
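For reference, a reduced illustration of that comparison (the two bodies are identical; the names and the trivial copy loop are illustrative, not the exact functions from the test script, and the buffer refill logic is omitted):
const buffer_size = 4096;

const BufferedReader = struct {
    buffer: [buffer_size]u8,
    start_index: usize,
    end_index: usize,
    stream: Stream,

    pub const Stream = struct {
        readFn: fn (stream: *Stream, dest: []u8) anyerror!usize,
    };

    pub fn init() BufferedReader {
        return BufferedReader{
            .buffer = undefined,
            .start_index = 0,
            .end_index = 0,
            .stream = Stream{ .readFn = readViaFieldParentPtr },
        };
    }

    // Variant A: invoked through stream.readFn and recovers the parent with
    // @fieldParentPtr, i.e. the interface pattern the streams used here.
    fn readViaFieldParentPtr(stream: *Stream, dest: []u8) anyerror!usize {
        const self = @fieldParentPtr(BufferedReader, "stream", stream);
        var i: usize = 0;
        while (i < dest.len and self.start_index < self.end_index) : (i += 1) {
            dest[i] = self.buffer[self.start_index];
            self.start_index += 1;
        }
        return i;
    }

    // Variant B: identical body, but takes the concrete type directly, so the
    // call is statically dispatched and inlinable.
    fn readDirect(self: *BufferedReader, dest: []u8) anyerror!usize {
        var i: usize = 0;
        while (i < dest.len and self.start_index < self.end_index) : (i += 1) {
            dest[i] = self.buffer[self.start_index];
            self.start_index += 1;
        }
        return i;
    }
};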
I expect #4710 to have an impact on this issue
Indeed, I ran the test again; now readRaw and readRawStream both have the same good performance.
I didn't try the @fieldParentPtr versions because they aren't needed with the redesign. The new design is much better!
Thank you and great work!!