Describe the situation
When using the COMPACT format for parts, queries are significantly slower.
How to reproduce
-- CREATE TABLE statements for all tables involved.
-- (The prose that was fused onto the CREATE line is now a comment so the
-- statement parses; a terminating semicolon was also added.)
CREATE TABLE foo
(
    `f1` String,
    `f2` String,
    `f3` Int64,
    `f4` Float64,
    `f5` Int32,
    `f6` Int32,
    `date` Date
)
ENGINE = MergeTree()
ORDER BY (f2, f3, date, f5, f6)
SETTINGS index_granularity = 8192;
Contains 30B records
-- Query used to compare WIDE vs COMPACT part performance.
-- NOTE(review): the original upper bound was the malformed literal
-- '2020-06-7'; zero-padded to '2020-06-07' so the Date comparison is
-- unambiguous. Range is inclusive on both ends, as in the original.
SELECT
    f2,
    sum(f4) AS total_f4
FROM foo AS guid_1
WHERE (f3 = 643)
  AND (date >= '2020-06-01')
  AND (date <= '2020-06-07')
GROUP BY f2
ORDER BY sum(f4) DESC NULLS FIRST
LIMIT 20;
-- Expected performance:
WIDE parts - query execution time - 12.35
COMPACT parts - query execution time - 27.57
Please let me know if you need any additional input
Compact parts should be slightly slower on read. They are intended for small data parts (under ~10MiB) to speed up INSERTs, and are not meant to be enabled by default for all data.
There are known issues, see https://github.com/ClickHouse/ClickHouse/pull/12492
We don't have FireTree engine in ClickHouse (it was in your report before you edited it).
- We don't have a FireTree engine in ClickHouse (it was in your report before you edited it).
Search-and-replace issue :) that's why it was edited :)
- Compact parts should be slightly slower on read. They are intended for small data parts (under ~10MiB) to speed up INSERTs, and are not meant to be enabled by default for all data.
- There are known issues, see #12492
So, is it plainly wrong to use it for more formidable parts, like 2-10Gb?
Yes.
Reading a column from big compact part will involve more disk seeks.
It should be noticeably slower for HDD and just a little slower on SSD.
I'm not 100% sure that 2x difference is Ok, though.
I expect that #12492 will eliminate most of the difference, especially for SSD.
But about 1.1x difference is expected.
Select is slower by no more than 30% in case of simulated test...
--- compact parts comparison
-- Table forced to use COMPACT parts: min_bytes_for_wide_part is set to an
-- effectively unreachable threshold, so every part stays compact.
DROP TABLE IF EXISTS foo_compact;

CREATE TABLE foo_compact
(
    `f1` String,
    `f2` String,
    `f3` Int64,
    `f4` Float64,
    `f5` Int32,
    `f6` Int32,
    `date` Date
)
ENGINE = MergeTree()
ORDER BY (f2, f3, date, f5, f6)
SETTINGS
    index_granularity = 8192,
    min_bytes_for_wide_part = 100000000000;
-- Table forced to use WIDE parts: with min_bytes_for_wide_part = 0 every
-- part is written in the wide (one-file-per-column) format.
DROP TABLE IF EXISTS foo_wide;

CREATE TABLE foo_wide
(
    `f1` String,
    `f2` String,
    `f3` Int64,
    `f4` Float64,
    `f5` Int32,
    `f6` Int32,
    `date` Date
)
ENGINE = MergeTree()
ORDER BY (f2, f3, date, f5, f6)
SETTINGS
    index_granularity = 8192,
    min_bytes_for_wide_part = 0;
-- Load 10M identical pseudo-random rows into both tables, so the only
-- difference between them is the part storage format.
SET max_insert_block_size = 100000;

INSERT INTO foo_compact
SELECT
    toString(cityHash64(number % 100000, 1)) AS f1,
    toString(cityHash64(number % 1000, 2)) AS f2,
    toInt64(600 + cityHash64(number, 3) % 100) AS f3,
    toFloat64(cityHash64(number, 4)) AS f4,
    toInt32(cityHash64(number, 5)) AS f5,
    toInt32(cityHash64(number, 6)) AS f6,
    toDate('2020-06-01') + cityHash64(number, 7) % 15 AS `date`
FROM numbers(10000000);

INSERT INTO foo_wide
SELECT
    toString(cityHash64(number % 100000, 1)) AS f1,
    toString(cityHash64(number % 1000, 2)) AS f2,
    toInt64(600 + cityHash64(number, 3) % 100) AS f3,
    toFloat64(cityHash64(number, 4)) AS f4,
    toInt32(cityHash64(number, 5)) AS f5,
    toInt32(cityHash64(number, 6)) AS f6,
    toDate('2020-06-01') + cityHash64(number, 7) % 15 AS `date`
FROM numbers(10000000);
-- Inspect the parts produced for each table (the part_type column in
-- system.parts shows Compact vs Wide). LIKE carried no wildcard, so it is
-- replaced with a plain equality comparison.
SELECT * FROM system.parts WHERE table = 'foo_compact' AND active;
SELECT * FROM system.parts WHERE table = 'foo_wide' AND active;

-- Timing queries. NOTE(review): the original upper bound was the malformed
-- literal '2020-06-7'; zero-padded to '2020-06-07'. Measured output lines
-- are kept below as comments.
SELECT f2, sum(f4) AS total_f4
FROM foo_wide
WHERE (f3 = 643)
  AND (date >= '2020-06-01')
  AND (date <= '2020-06-07')
GROUP BY f2
ORDER BY sum(f4) DESC NULLS FIRST
LIMIT 20
FORMAT Null;
-- Elapsed: 0.045 sec. Processed 10.00 million rows, 458.33 MB (224.56 million rows/s., 10.29 GB/s.)

SELECT f2, sum(f4) AS total_f4
FROM foo_compact
WHERE (f3 = 643)
  AND (date >= '2020-06-01')
  AND (date <= '2020-06-07')
GROUP BY f2
ORDER BY sum(f4) DESC NULLS FIRST
LIMIT 20
FORMAT Null;
-- Elapsed: 0.064 sec. Processed 10.00 million rows, 464.09 MB (156.04 million rows/s., 7.24 GB/s.)
Yes.
Reading a column from big compact part will involve more disk seeks.
It should be noticeably slower for HDD and just a little slower on SSD. I'm not 100% sure that a 2x difference is OK, though.
I expect that #12492 will eliminate most of the difference, especially for SSD. But about a 1.1x difference is expected.
I'm aware of seeks but since we are running on NVMe, I assumed the difference will be negligible.
OK, I think we will wait for the fix and check again.
Select is slower by no more than 30% in case of simulated test...
-- Full reproduction script (collapsed one-liner in the original paste),
-- reformatted. All literals are preserved byte-for-byte, including the
-- non-padded date '2020-06-7' that appeared in the original.
drop table if exists foo_compact;
CREATE TABLE foo_compact
(
    `f1` String,
    `f2` String,
    `f3` Int64,
    `f4` Float64,
    `f5` Int32,
    `f6` Int32,
    `date` Date
)
ENGINE = MergeTree()
ORDER BY (f2, f3, date, f5, f6)
SETTINGS index_granularity = 8192, min_bytes_for_wide_part = 100000000000;

--- compact parts comparison
drop table if exists foo_wide;
CREATE TABLE foo_wide
(
    `f1` String,
    `f2` String,
    `f3` Int64,
    `f4` Float64,
    `f5` Int32,
    `f6` Int32,
    `date` Date
)
ENGINE = MergeTree()
ORDER BY (f2, f3, date, f5, f6)
SETTINGS index_granularity = 8192, min_bytes_for_wide_part = 0;

set max_insert_block_size = 100000;

insert into foo_compact
select
    toString(cityHash64(number % 100000, 1)) f1,
    toString(cityHash64(number % 1000, 2)) f2,
    toInt64(600 + cityHash64(number, 3) % 100) f3,
    toFloat64(cityHash64(number, 4)) f4,
    toInt32(cityHash64(number, 5)) f5,
    toInt32(cityHash64(number, 6)) f6,
    toDate('2020-06-01') + cityHash64(number, 7) % 15 as `date`
from numbers(10000000);

insert into foo_wide
select
    toString(cityHash64(number % 100000, 1)) f1,
    toString(cityHash64(number % 1000, 2)) f2,
    toInt64(600 + cityHash64(number, 3) % 100) f3,
    toFloat64(cityHash64(number, 4)) f4,
    toInt32(cityHash64(number, 5)) f5,
    toInt32(cityHash64(number, 6)) f6,
    toDate('2020-06-01') + cityHash64(number, 7) % 15 as `date`
from numbers(10000000);

select * from system.parts where table like 'foo_compact' and active;
select * from system.parts where table like 'foo_wide' and active;

SELECT f2, sum(f4) AS total_f4
FROM foo_wide
WHERE (f3 = 643) AND ((date <= '2020-06-7') AND (date >= '2020-06-01'))
GROUP BY f2
ORDER BY sum(f4) DESC NULLS FIRST
LIMIT 20
Format Null;
-- Elapsed: 0.045 sec. Processed 10.00 million rows, 458.33 MB (224.56 million rows/s., 10.29 GB/s.)

SELECT f2, sum(f4) AS total_f4
FROM foo_compact
WHERE (f3 = 643) AND ((date <= '2020-06-7') AND (date >= '2020-06-01'))
GROUP BY f2
ORDER BY sum(f4) DESC NULLS FIRST
LIMIT 20
Format Null;
-- Elapsed: 0.064 sec. Processed 10.00 million rows, 464.09 MB (156.04 million rows/s., 7.24 GB/s.)
you are assuming the slowdown is linear. I'm not sure it is
Yes.
Reading a column from big compact part will involve more disk seeks.
It should be noticeably slower for HDD and just a little slower on SSD.I'm not 100% sure that 2x difference is Ok, though.
I expect that #12492 will eliminate most of the difference, especially for SSD.But about 1.1x difference is expected.
I see it was merged to master. I also saw the perf test, but I didn't get what the delta between compact and wide is. Do you have some numbers?
@kreuzerkrieg I don't have these numbers, maybe @CurtizJ has?
I have some.
E.g. query on table hits from test datasets: select min(URL), min(UserID) from hits_wide group by CounterID % 1000. UserID and CounterID have UInt64 type, URL has String type.
-- Row count of the benchmark dataset (the test `hits` table, wide parts).
SELECT count()
FROM hits_wide
┌─count()─┐
│ 8873898 │
└─────────┘
On VM with HDD:
clickhouse benchmark <<< "select min(URL), min(UserID) from hits_wide group by CounterID % 1000" --cumulative --max_threads 24
localhost:9000, queries 487, QPS: 13.235, RPS: 117450231.992, MiB/s: 11037.477, result RPS: 13235.472, result MiB/s: 0.911.
0.000% 0.066 sec.
10.000% 0.070 sec.
20.000% 0.071 sec.
30.000% 0.072 sec.
40.000% 0.073 sec.
50.000% 0.074 sec.
60.000% 0.075 sec.
70.000% 0.077 sec.
80.000% 0.078 sec.
90.000% 0.082 sec.
95.000% 0.087 sec.
99.000% 0.105 sec.
99.900% 0.130 sec.
99.990% 0.130 sec.
md5-4d5cff8dcf02a80681377ab481ec3acc
clickhouse benchmark <<< "select min(URL), min(UserID) from hits_comp group by CounterID % 1000" --cumulative --max_threads 24
md5-4d5cff8dcf02a80681377ab481ec3acc
localhost:9000, queries 327, QPS: 10.205, RPS: 90561924.268, MiB/s: 8510.627, result RPS: 10205.428, result MiB/s: 0.702.
0.000% 0.086 sec.
10.000% 0.091 sec.
20.000% 0.092 sec.
30.000% 0.094 sec.
40.000% 0.095 sec.
50.000% 0.096 sec.
60.000% 0.097 sec.
70.000% 0.099 sec.
80.000% 0.101 sec.
90.000% 0.106 sec.
95.000% 0.109 sec.
99.000% 0.147 sec.
99.900% 0.199 sec.
99.990% 0.199 sec.
Also a synthetic test from the performance tests.
-- Synthetic table forced to COMPACT parts: min_rows_for_wide_part is set to
-- an unreachable threshold. Populated with 50M rows of random data.
-- A terminating semicolon was added (the original statement had none,
-- which is invalid when run as a script).
CREATE TABLE mt_comp_parts
ENGINE = MergeTree
ORDER BY (c1, c2)
SETTINGS min_rows_for_wide_part = 1000000000
AS SELECT *
FROM generateRandom('c1 UInt32, c2 UInt64, s1 String, arr1 Array(UInt32), c3 UInt64, s2 String', 0, 30, 30)
LIMIT 50000000;
-- Companion table using the default (WIDE) part format, filled with the same
-- kind of random data for a like-for-like comparison.
-- A terminating semicolon was added (the original statement had none,
-- which is invalid when run as a script).
CREATE TABLE mt_wide_parts
ENGINE = MergeTree
ORDER BY (c1, c2)
AS SELECT *
FROM generateRandom('c1 UInt32, c2 UInt64, s1 String, arr1 Array(UInt32), c3 UInt64, s2 String', 0, 30, 30)
LIMIT 50000000;
md5-c87baa32c2c52945c64df764f22eb5f5
clickhouse benchmark <<< "SELECT count() FROM mt_wide_parts WHERE NOT ignore(c1, s1, c3)" --cumulative --max_threads 24
md5-4d5cff8dcf02a80681377ab481ec3acc
localhost:9000, queries 139, QPS: 10.470, RPS: 523513466.646, MiB/s: 17973.499, result RPS: 10.470, result MiB/s: 0.000.
0.000% 0.085 sec.
10.000% 0.090 sec.
20.000% 0.091 sec.
30.000% 0.092 sec.
40.000% 0.093 sec.
50.000% 0.094 sec.
60.000% 0.095 sec.
70.000% 0.096 sec.
80.000% 0.099 sec.
90.000% 0.104 sec.
95.000% 0.106 sec.
99.000% 0.121 sec.
99.900% 0.129 sec.
99.990% 0.129 sec.
md5-4d5cff8dcf02a80681377ab481ec3acc
clickhouse benchmark <<< "SELECT count() FROM mt_comp_parts WHERE NOT ignore(c1, s1, c3)" --cumulative --max_threads 24
md5-4d5cff8dcf02a80681377ab481ec3acc
localhost:9000, queries 350, QPS: 9.273, RPS: 463670572.120, MiB/s: 15919.058, result RPS: 9.273, result MiB/s: 0.000.
0.000% 0.096 sec.
10.000% 0.102 sec.
20.000% 0.103 sec.
30.000% 0.104 sec.
40.000% 0.105 sec.
50.000% 0.106 sec.
60.000% 0.107 sec.
70.000% 0.109 sec.
80.000% 0.111 sec.
90.000% 0.116 sec.
95.000% 0.122 sec.
99.000% 0.135 sec.
99.900% 0.145 sec.
99.990% 0.145 sec.
Most helpful comment
Also a synthetic test from the performance tests.