InfluxDB: Never-ending compaction

Created on 23 Jul 2019 · 8 comments · Source: influxdata/influxdb

__Steps to reproduce:__

  1. The DB runs for some time (weeks, maybe)
  2. ...
  3. Compaction enters an infinite loop of failures

__Expected behavior:__
Compaction should finish successfully.

__Actual behavior:__
Compaction aborts (crashes?) and the same compaction is retried endlessly.

__Environment info:__

  • System info (uname -srm):
    Linux 4.14.127-bpr2+ armv7l

  • InfluxDB version (influxd version):
    InfluxDB v1.7.7 (git: 1.7 f8fdf652f348fc9980997fe1c972e2b79ddd13b0)

  • Other relevant environment details (container runtime, disk, memory, process info):

~# df -h /containers/
Filesystem      Size  Used Avail Use% Mounted on
/dev/mmcblk0     60G  7.9G   51G  14% /containers

/dev/mmcblk0 on /containers type f2fs (rw,relatime,lazytime,background_gc=on,no_heap,user_xattr,inline_xattr,acl,inline_data,inline_dentry,flush_merge,extent_cache,mode=adaptive,active_logs=6)

# free -h
              total        used        free      shared  buff/cache   available
Mem:          2.0Gi       602Mi        99Mi       1.0Mi       1.3Gi       1.3Gi
Swap:         1.0Gi       227Mi       778Mi

# ps aux | grep influxdb
influxdb 23991  5.0  9.5 2834780 195952 ?      Ssl  21:54   1:36 /usr/bin/influxd -config /etc/influxdb/influxdb.conf

__Config:__

[meta]
  dir = "/containers/system/monitoring/influxdb/meta"
  retention-autocreate = true
  logging-enabled = true
  pprof-enabled = false
  lease-duration = "30m0s"
  bind-address = "127.0.0.1:8088"
  enabled = false

[data]
  enabled = true
  dir = "/containers/system/monitoring/influxdb/data"
  wal-dir = "/containers/system/monitoring/influxdb/wal"
  wal-logging-enabled = true
  query-log-enabled = false
  cache-snapshot-memory-size = 512000
  compact-min-file-count = 10
  compact-full-write-cold-duration = "24h"
  max-points-per-block = 256
  max-concurrent-compactions = 4
  max-series-per-database = 4096

__Logs:__
Snippet of the errors from the log; the same abort repeats roughly every two seconds:

ts=2019-07-23T19:20:43.238047Z lvl=info msg="Aborted compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4I~sl000 op_name=tsm1_compact_group error="compaction in progress: open /containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000002.tsm.tmp: file exists"
ts=2019-07-23T19:20:44.239585Z lvl=info msg="TSM compaction (end)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4I~sl000 op_name=tsm1_compact_group op_event=end op_elapsed=1007.711ms
ts=2019-07-23T19:20:45.233311Z lvl=info msg="TSM compaction (start)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group op_event=start
ts=2019-07-23T19:20:45.234088Z lvl=info msg="Beginning compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group tsm1_files_n=4
ts=2019-07-23T19:20:45.234606Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group tsm1_index=0 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000003-000000002.tsm
ts=2019-07-23T19:20:45.235190Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group tsm1_index=1 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000012-000000002.tsm
ts=2019-07-23T19:20:45.235759Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group tsm1_index=2 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000021-000000002.tsm
ts=2019-07-23T19:20:45.236324Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group tsm1_index=3 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000001.tsm
ts=2019-07-23T19:20:45.238585Z lvl=info msg="Aborted compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group error="compaction in progress: open /containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000002.tsm.tmp: file exists"
ts=2019-07-23T19:20:46.241812Z lvl=info msg="TSM compaction (end)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4J7gl000 op_name=tsm1_compact_group op_event=end op_elapsed=1010.110ms
ts=2019-07-23T19:20:47.233691Z lvl=info msg="TSM compaction (start)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group op_event=start
ts=2019-07-23T19:20:47.234447Z lvl=info msg="Beginning compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group tsm1_files_n=4
ts=2019-07-23T19:20:47.234995Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group tsm1_index=0 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000003-000000002.tsm
ts=2019-07-23T19:20:47.235585Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group tsm1_index=1 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000012-000000002.tsm
ts=2019-07-23T19:20:47.236159Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group tsm1_index=2 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000021-000000002.tsm
ts=2019-07-23T19:20:47.236735Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group tsm1_index=3 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000001.tsm
ts=2019-07-23T19:20:47.238685Z lvl=info msg="Aborted compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group error="compaction in progress: open /containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000002.tsm.tmp: file exists"
ts=2019-07-23T19:20:48.240944Z lvl=info msg="TSM compaction (end)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JFWG000 op_name=tsm1_compact_group op_event=end op_elapsed=1007.408ms
ts=2019-07-23T19:20:49.233927Z lvl=info msg="TSM compaction (start)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group op_event=start
ts=2019-07-23T19:20:49.242853Z lvl=info msg="Beginning compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group tsm1_files_n=4
ts=2019-07-23T19:20:49.243677Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group tsm1_index=0 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000003-000000002.tsm
ts=2019-07-23T19:20:49.244268Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group tsm1_index=1 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000012-000000002.tsm
ts=2019-07-23T19:20:49.244826Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group tsm1_index=2 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000021-000000002.tsm
ts=2019-07-23T19:20:49.245365Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group tsm1_index=3 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000001.tsm
ts=2019-07-23T19:20:49.253546Z lvl=info msg="Aborted compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group error="compaction in progress: open /containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000002.tsm.tmp: file exists"
ts=2019-07-23T19:20:50.255163Z lvl=info msg="TSM compaction (end)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JNKG000 op_name=tsm1_compact_group op_event=end op_elapsed=1021.392ms
ts=2019-07-23T19:20:51.232560Z lvl=info msg="TSM compaction (start)" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group op_event=start
ts=2019-07-23T19:20:51.233319Z lvl=info msg="Beginning compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group tsm1_files_n=4
ts=2019-07-23T19:20:51.233842Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group tsm1_index=0 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000003-000000002.tsm
ts=2019-07-23T19:20:51.234417Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group tsm1_index=1 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000012-000000002.tsm
ts=2019-07-23T19:20:51.234986Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group tsm1_index=2 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000021-000000002.tsm
ts=2019-07-23T19:20:51.235551Z lvl=info msg="Compacting file" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group tsm1_index=3 tsm1_file=/containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000001.tsm
ts=2019-07-23T19:20:51.237402Z lvl=info msg="Aborted compaction" log_id=0Gp2pItl000 engine=tsm1 tsm1_strategy=full tsm1_optimize=false trace_id=0Gp4JV7l000 op_name=tsm1_compact_group error="compaction in progress: open /containers/system/monitoring/influxdb/data/collectd/autogen/216/000000022-000000002.tsm.tmp: file exists"

.....
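
Side note on the error itself: "file exists" means an earlier compaction left its output file behind, and every retry then trips over it. A possible cleanup (untested here, and assuming influxd runs under systemd and no compaction is genuinely still in progress) is to stop the service, look for stray .tsm.tmp files under the data directory shown in the logs, remove them, and restart:

~# systemctl stop influxdb
~# find /containers/system/monitoring/influxdb/data -name '*.tsm.tmp' -ls
# only if the listed files are stale leftovers from an interrupted compaction:
~# find /containers/system/monitoring/influxdb/data -name '*.tsm.tmp' -delete
~# systemctl start influxdb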

Attachment: vars.txt

Label: 1.x


All 8 comments

I've also seen the "Aborted compaction" a lot. See:
https://github.com/influxdata/influxdb/issues/14608

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.

What does "needs repro" mean? Needs reproducing?

I have the same issue, and this behavior blows up the 'syslog' and 'daemon.log' files.

as am I

Is this on a 32-bit platform such as an RPi3? If so, look at #11339.

Your options are to move to a 64-bit kernel (and try your luck with the 2GB of RAM you have) or to move to a larger device entirely. An experimental 64-bit kernel is available for Raspbian, which I have been using successfully for many months now on an RPi 4 with 4GB RAM, where InfluxDB has not had any issues. The RPi3 is a dead end for InfluxDB, as the devs have indicated there will be no 32-bit "niceties" for the compaction (a fix was actually provided in an earlier branch, but it appears it will not be merged). The compaction fails simply because it tries to load everything into RAM, which, once the database is large enough, simply cannot be allocated in a 32-bit environment.
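
A quick way to check which case applies (the armv7l output below matches the environment info at the top of this issue; a 64-bit kernel would report aarch64 or x86_64, and 64, instead):

# uname -m
armv7l
# getconf LONG_BIT
32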

It's more about virtual address space, not /physical/ memory consumption.

As for me, I settled on an aggressive retention policy to keep the DB small. Every technology has its own limitations, but situations like this are painful.

Okay, dear InfluxDB, if I cannot have this much data, please tell me so with a message I can understand, or hint at how I can fix or work around it. Just don't _destroy_ my logs/device/storage by retrying something that cannot be done, in a never-ending loop.
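
For anyone who wants to try the same workaround: a sketch of shortening retention with the influx CLI, where the database "collectd" and retention policy "autogen" come from the shard paths in the logs above, and the 14-day duration is only an example value, not a recommendation:

$ influx -execute 'ALTER RETENTION POLICY "autogen" ON "collectd" DURATION 14d SHARD DURATION 1d DEFAULT'

Data older than the new duration is dropped as its shards expire, which should help keep individual shards small enough for compaction to fit in a 32-bit address space.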

__Setup__

  • Running a single instance of OSS InfluxDB v1.7.7 on 64-bit Ubuntu Server 18.04 (x86) with 1 CPU and 1 GB of memory (a t2.micro instance), as opposed to the Linux 4.14 system from the issue with 2 GB of memory.
  • index-version was set to tsi1; I believe the customer's index type was inmem.
  • cache-snapshot-memory-size = "25k" and cache-snapshot-write-cold-duration = "1m" were set to speed up compactions; this differs from cache-snapshot-memory-size = 512000 in the customer settings (see the config sketch below).
    I do not think my deviations from the customer's settings are significant enough to affect the repro.
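
As a sketch, the relevant non-default [data] settings used in this repro attempt would look like this in influxdb.conf (values as described above, chosen only to speed up the test, not recommended for production):

[data]
  index-version = "tsi1"
  cache-snapshot-memory-size = "25k"
  cache-snapshot-write-cold-duration = "1m"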

__Result__

  • Data was written and compacted successfully for at least a few hours; I did not see any compaction errors.

__Introduce tmp file__

  • Before the next compaction took place, I created a tmp file by hand and was able to see the same error as in the customer logs (a sketch of the command follows the log snippet below):
    The compaction failure is looping and is still going. A snippet from my log:
Jul 24 17:39:50 ip-172-31-10-105 influxd[1853]: ts=2020-07-24T17:39:50.009141Z lvl=info msg="Beginning compaction" log_id=0OCOj8Hl000 engine=tsm1 tsm1_level=1 tsm1_strategy=level trace_id=0OCUP6U0000 op_name=tsm1_compact_group tsm1_files_n=8
Jul 24 17:39:50 ip-172-31-10-105 influxd[1853]: ts=2020-07-24T17:39:50.009306Z lvl=info msg="Compacting file" log_id=0OCOj8Hl000 engine=tsm1 tsm1_level=1 tsm1_strategy=level trace_id=0OCUP6U0000 op_name=tsm1_compact_group tsm1_index=0 tsm1_file=/var/lib/influxdb/data/_internal/monitor/1/000000521-000000001.tsm
Jul 24 17:39:50 ip-172-31-10-105 influxd[1853]: ts=2020-07-24T17:39:50.009443Z lvl=info msg="Compacting file" log_id=0OCOj8Hl000 engine=tsm1 tsm1_level=1 tsm1_strategy=level trace_id=0OCUP6U0000 op_name=tsm1_compact_group tsm1_index=1 tsm1_file=/var/lib/influxdb/data/_internal/monitor/1/000000522-000000001.tsm
Jul 24 17:39:50 ip-172-31-10-105 influxd[1853]: ts=2020-07-24T17:39:50.009607Z lvl=info msg="Compacting file" log_id=0OCOj8Hl000 engine=tsm1 tsm1_level=1 tsm1_strategy=level trace_id=0OCUP6U0000 op_name=tsm1_compact_group tsm1_index=2 tsm1_file=/var/lib/influxdb/data/_internal/monitor/1/000000523-000000001.tsm
Jul 24 17:39:50 ip-172-31-10-105 influxd[1853]: ts=2020-07-24T17:39:50.009964Z lvl=info msg="Compacting file" log_id=0OCOj8Hl000 engine=tsm1 tsm1_level=1 tsm1_strategy=level trace_id=0OCUP6U0000 op_name=tsm1_compact_group tsm1_index=3 tsm1_file=/var/lib/influxdb/data/_internal/monitor/1/000000524-000000001.tsm
Jul 24 17:39:50 ip-172-31-10-105 influxd[1853]: ts=2020-07-24T17:39:50.010119Z lvl=info msg="Compacting file" log_id=0OCOj8Hl000 engine=tsm1 tsm1_level=1 tsm1_strategy=level trace_id=0OCUP6U0000 op_name=tsm1_compact_group tsm1_index=4 tsm1_file=/var/lib/influxdb/data/_internal/monitor/1/000000525-000000001.tsm
Jul 24 17:39:50 ip-172-31-10-105 influxd[1853]: ts=2020-07-24T17:39:50.010256Z lvl=info msg="Compacting file" log_id=0OCOj8Hl000 engine=tsm1 tsm1_level=1 tsm1_strategy=level trace_id=0OCUP6U0000 op_name=tsm1_compact_group tsm1_index=5 tsm1_file=/var/lib/influxdb/data/_internal/monitor/1/000000526-000000001.tsm
Jul 24 17:39:50 ip-172-31-10-105 influxd[1853]: ts=2020-07-24T17:39:50.010386Z lvl=info msg="Compacting file" log_id=0OCOj8Hl000 engine=tsm1 tsm1_level=1 tsm1_strategy=level trace_id=0OCUP6U0000 op_name=tsm1_compact_group tsm1_index=6 tsm1_file=/var/lib/influxdb/data/_internal/monitor/1/000000527-000000001.tsm
Jul 24 17:39:50 ip-172-31-10-105 influxd[1853]: ts=2020-07-24T17:39:50.010530Z lvl=info msg="Compacting file" log_id=0OCOj8Hl000 engine=tsm1 tsm1_level=1 tsm1_strategy=level trace_id=0OCUP6U0000 op_name=tsm1_compact_group tsm1_index=7 tsm1_file=/var/lib/influxdb/data/_internal/monitor/1/000000528-000000001.tsm
Jul 24 17:39:50 ip-172-31-10-105 influxd[1853]: ts=2020-07-24T17:39:50.010771Z lvl=info msg="Aborted compaction" log_id=0OCOj8Hl000 engine=tsm1 tsm1_level=1 tsm1_strategy=level trace_id=0OCUP6U0000 op_name=tsm1_compact_group error="compaction in progress: open /var/lib/influxdb/data/_internal/monitor/1/000000528-000000002.tsm.tmp: file exists"
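
For completeness, the stray file in this test was created by hand; assuming the shard path from the error above, something like the following is enough to trigger the loop:

# touch /var/lib/influxdb/data/_internal/monitor/1/000000528-000000002.tsm.tmp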
