We've been running CockroachDB with Docker Compose for weeks now, and today we got a panic in production. We're using lib/pq and xo/xo to generate DB code, but our usage boils down to basic CRUD queries on single tables, plus some transactions to create users, user invites, and so on.
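For context, the transactional part of our usage looks roughly like this (a minimal sketch with made-up table and column names, not our actual xo-generated code):

```go
package main

import (
	"database/sql"

	_ "github.com/lib/pq" // Postgres wire-protocol driver, also used for CockroachDB
)

// createUserWithInvite shows the shape of our transactional writes:
// a couple of single-table inserts wrapped in one transaction.
// The table and column names here are illustrative only.
func createUserWithInvite(db *sql.DB, username, inviteCode string) error {
	tx, err := db.Begin()
	if err != nil {
		return err
	}
	defer tx.Rollback() // no-op if the transaction was committed

	if _, err := tx.Exec(`INSERT INTO users (username) VALUES ($1)`, username); err != nil {
		return err
	}
	if _, err := tx.Exec(`INSERT INTO user_invites (username, code) VALUES ($1, $2)`, username, inviteCode); err != nil {
		return err
	}
	return tx.Commit()
}
```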
Linux xxxx-3 4.4.0-101-generic #124-Ubuntu SMP Fri Nov 10 18:29:59 UTC 2017 x86_64 x86_64 x86_64 GNU/Linux
panic while executing "SELECT c.reg_no,c.name,aa.username,aa.password ,(sum(male_required_count) + sum(female_required_count) + sum(both_gender_required_count)) Total_Positions\n,sum(male_required_count)male_required_count,sum(female_required_count)female_required_count ,sum(both_gender_required_count)both_gender_required_count \nFROM company c \njoin job j on j.company_id=c.id\njoin auth.user aa on aa.username=c.reg_no\nWHERE c.created_by IS NOT NULL \ngroup by c.reg_no,c.name,aa.username,aa.password ;": flow: unexpected 7085 leftover bytes
goroutine 534871127 [running]:
runtime/debug.Stack(0x7f131b28d760, 0xc42034a1b0, 0xc420022c00)
/usr/local/go/src/runtime/debug/stack.go:24 +0x79
github.com/cockroachdb/cockroach/pkg/util/log.ReportPanic(0x7f131b28d760, 0xc42034a1b0, 0xc420022c00, 0x1cb7d80, 0xc4817408c0, 0x1)
/go/src/github.com/cockroachdb/cockroach/pkg/util/log/crash_reporting.go:129 +0x180
github.com/cockroachdb/cockroach/pkg/util/stop.(*Stopper).Recover(0xc4207bc6c0, 0x7f131b28d760, 0xc42034a1b0)
/go/src/github.com/cockroachdb/cockroach/pkg/util/stop/stopper.go:179 +0xd4
panic(0x1cb7d80, 0xc4817408c0)
/usr/local/go/src/runtime/panic.go:489 +0x2cf
github.com/cockroachdb/cockroach/pkg/sql/pgwire.(*v3Conn).serve.func1(0xc456cb3500, 0x7f131b28d760, 0xc4be7bdc20)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/pgwire/v3.go:423 +0xa1
panic(0x1cb7d80, 0xc4817408c0)
/usr/local/go/src/runtime/panic.go:489 +0x2cf
github.com/cockroachdb/cockroach/pkg/sql.(*Session).maybeRecover(0xc480b99000, 0x1ea1b74, 0x9, 0xc4a874b229, 0x1e1)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/session.go:1381 +0x28d
panic(0x1bdbbc0, 0xc43c89da20)
/usr/local/go/src/runtime/panic.go:489 +0x2cf
github.com/cockroachdb/cockroach/pkg/sql/mon.(*BytesMonitor).doStop(0xc4aa79a000, 0x7f131b28d760, 0xc4b0a97d10, 0xc46a9db401)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/mon/bytes_usage.go:390 +0x27e
github.com/cockroachdb/cockroach/pkg/sql/mon.(*BytesMonitor).Stop(0xc4aa79a000, 0x7f131b28d760, 0xc4b0a97d10)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/mon/bytes_usage.go:375 +0x44
github.com/cockroachdb/cockroach/pkg/sql/parser.(*EvalContext).Stop(0xc4aff7a550, 0x7f131b28d760, 0xc4b0a97d10)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/parser/eval.go:1930 +0x46
github.com/cockroachdb/cockroach/pkg/sql/distsqlrun.(*Flow).Cleanup(0xc4aff7a500, 0x7f131b28d760, 0xc4b0a97d10)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/distsqlrun/flow.go:426 +0x81
github.com/cockroachdb/cockroach/pkg/sql.(*distSQLPlanner).Run(0xc4203d6480, 0xc46a9dbd00, 0xc4adfe9680, 0xc46a9dbd28, 0xc453c5d0e0, 0x34425cf64a96fa29, 0x19075075b6f5aae, 0x1, 0xed1c47ae1, 0x18370f4f, ...)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/distsql_running.go:212 +0x7fa
github.com/cockroachdb/cockroach/pkg/sql.(*distSQLPlanner).PlanAndRun(0xc4203d6480, 0x7f131a607a68, 0xc4bdd8e1c0, 0xc4adfe9680, 0x2d74400, 0xc4c5930900, 0xc453c5d0e0, 0x34425cf64a96fa29, 0x19075075b6f5aae, 0x1, ...)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/distsql_running.go:445 +0x209
github.com/cockroachdb/cockroach/pkg/sql.(*Executor).execDistSQL(0xc42044a240, 0xc480b99208, 0x2d74400, 0xc4c5930900, 0x2d7e460, 0xc456cb3500, 0x0, 0x15002df424a07e18)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/executor.go:1960 +0x2e9
github.com/cockroachdb/cockroach/pkg/sql.(*Executor).execStmt(0xc42044a240, 0x2d75b40, 0xc46c5461e0, 0x0, 0x0, 0x0, 0x15002df424a07e18, 0x1, 0xc48b709f80, 0xc480b99208, ...)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/executor.go:2152 +0x3a1
github.com/cockroachdb/cockroach/pkg/sql.(*Executor).execStmtInOpenTxn(0xc42044a240, 0xc480b99000, 0x2d75b40, 0xc46c5461e0, 0x0, 0x0, 0x0, 0x15002df424a07e18, 0x1, 0xc48b709f80, ...)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/executor.go:1793 +0x72b
github.com/cockroachdb/cockroach/pkg/sql.(*Executor).execSingleStatement(0xc42044a240, 0xc480b99000, 0x2d75b40, 0xc46c5461e0, 0x0, 0x0, 0x0, 0x15002df424a07e18, 0x1, 0xc48b709f80, ...)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/executor.go:1320 +0x92b
github.com/cockroachdb/cockroach/pkg/sql.runTxnAttempt(0xc42044a240, 0xc480b99000, 0xc4bdd8e140, 0x1, 0x1, 0x0, 0x1, 0xda0001, 0x0, 0x2d77e60, ...)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/executor.go:1192 +0x37f
github.com/cockroachdb/cockroach/pkg/sql.runWithAutoRetry(0xc42044a240, 0xc480b99000, 0xc4bdd8e140, 0x1, 0x1, 0x18360101, 0x0, 0x0, 0x3ff0000000000000, 0x3263fa8, ...)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/executor.go:1016 +0x1f0
github.com/cockroachdb/cockroach/pkg/sql.(*Executor).execParsed(0xc42044a240, 0xc480b99000, 0xc4bdd8e140, 0x1, 0x1, 0x0, 0x0, 0xc420018000, 0x60a1d3)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/executor.go:834 +0x26a
github.com/cockroachdb/cockroach/pkg/sql.(*Executor).execRequest(0xc42044a240, 0xc480b99000, 0xc4a874b229, 0x1e1, 0x0, 0x0, 0x1e1, 0xdd7)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/executor.go:757 +0x493
github.com/cockroachdb/cockroach/pkg/sql.(*Executor).ExecuteStatements(0xc42044a240, 0xc480b99000, 0xc4a874b229, 0x1e1, 0x0, 0x0, 0x0)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/executor.go:638 +0x129
github.com/cockroachdb/cockroach/pkg/sql/pgwire.(*v3Conn).handleSimpleQuery(0xc456cb3500, 0xc456cb3528, 0x0, 0x0)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/pgwire/v3.go:612 +0x148
github.com/cockroachdb/cockroach/pkg/sql/pgwire.(*v3Conn).serve(0xc456cb3500, 0x7f131b28d760, 0xc4be7bdc20, 0xc5138b8b70, 0x5400, 0xc4200b2368, 0x0, 0x0)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/pgwire/v3.go:503 +0xbd0
github.com/cockroachdb/cockroach/pkg/sql/pgwire.(*Server).ServeConn(0xc4200b2200, 0x7f131a607a68, 0xc4bb874940, 0x2d7e640, 0xc46816e000, 0x0, 0x0)
/go/src/github.com/cockroachdb/cockroach/pkg/sql/pgwire/server.go:421 +0x96c
github.com/cockroachdb/cockroach/pkg/server.(*Server).Start.func9.1(0x2d7e640, 0xc46816e000)
/go/src/github.com/cockroachdb/cockroach/pkg/server/server.go:749 +0x149
github.com/cockroachdb/cockroach/pkg/util/netutil.(*Server).ServeWith.func1(0xc4207bc6c0, 0x7f131b28d760, 0xc42034a1b0, 0xc420724070, 0x2d7e640, 0xc46816e000, 0xc4219f3ce0)
/go/src/github.com/cockroachdb/cockroach/pkg/util/netutil/net.go:140 +0xa9
created by github.com/cockroachdb/cockroach/pkg/util/netutil.(*Server).ServeWith
/go/src/github.com/cockroachdb/cockroach/pkg/util/netutil/net.go:142 +0x267
Client:
 Version:       17.11.0-ce
 API version:   1.34
 Go version:    go1.8.3
 Git commit:    1caf76c
 Built:         Mon Nov 20 18:37:39 2017
 OS/Arch:       linux/amd64

Server:
 Version:       17.11.0-ce
 API version:   1.34 (minimum version 1.12)
 Go version:    go1.8.3
 Git commit:    1caf76c
 Built:         Mon Nov 20 18:36:09 2017
 OS/Arch:       linux/amd64
 Experimental:  false
Additionally, we get a lot of transactions rejected for no apparent reason. Is that because we're running 4 nodes on the same host?
More info about the query:
"SELECT c.reg_no,c.name,aa.username,aa.password ,(sum(male_required_count) + sum(female_required_count) + sum(both_gender_required_count)) Total_Positions
,sum(male_required_count)male_required_count,sum(female_required_count)female_required_count ,sum(both_gender_required_count)both_gender_required_count
FROM company c \njoin job j on j.company_id=c.id
join auth.user aa on aa.username=c.reg_no
WHERE c.created_by IS NOT NULL
group by c.reg_no,c.name,aa.username,aa.password ;"
Also notice the weird whitespace after aa.password.
(edit1) The above two paragraphs might be wrong; we're still investigating that query.
(edit2) Yes, the query might not have had anything to do with the panic. It may simply have been a symptom of us restarting the database (a not-so-graceful shutdown). The database had not been accepting connections for hours at that point and showed no resource utilization, although it normally takes up 90%+ of cores and RAM. Could the memory saturation have caused the initial problem?
Hi @andreis,
Thanks for the report. This is a bug - memory saturation shouldn't have anything to do with it.
The cause of the bug appears to be an internal mis-accounting of memory in DistSQL.
Assigning @asubiotto to take a look.
Hi @andreis, did this happen when you shut down the node? If not, when did it happen? It would also be helpful to get more extensive logs from the node on which the panic occurred, if you have them.
My suspicion is that this is a race condition between an executing flow and a shutdown. The panic happens when Stopping the flow's memory monitor in flow.Cleanup. However, we clear the account associated with this monitor on the line above: f.EvalCtx.ActiveMemAcc.Close. What might be happening is that some processor (probably the hash joiner in this case) creates another account (for its buckets) and gets to add some memory before the monitor is Closed, since the flow context isn't cancelled until the end of Cleanup.
edit: this theory is wrong; we don't proceed to flow.Cleanup until flow.Wait returns, which means there are no running processors when evalCtx.Stop gets executed.
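For anyone following along, here is a minimal self-contained sketch of the accounting invariant at play (hypothetical, heavily simplified types, not the actual mon.BytesMonitor API): a monitor panics on Stop if bytes registered by its accounts were never released, which is exactly the shape of the panic in the stack trace above.

```go
package main

import "fmt"

// BytesTracker is a simplified stand-in for a memory monitor: it tracks
// bytes registered by child accounts and panics on Stop if any are left.
type BytesTracker struct {
	used int64
}

// Grow registers an allocation against the tracker.
func (m *BytesTracker) Grow(n int64) { m.used += n }

// Shrink releases previously registered bytes.
func (m *BytesTracker) Shrink(n int64) { m.used -= n }

// Stop asserts that every account was closed cleanly, mirroring the
// "unexpected N leftover bytes" message from the report.
func (m *BytesTracker) Stop() {
	if m.used != 0 {
		panic(fmt.Sprintf("flow: unexpected %d leftover bytes", m.used))
	}
}

func main() {
	m := &BytesTracker{}
	m.Grow(7085) // an account grows but is never shrunk/closed...
	m.Stop()     // ...so Stop panics, as in the stack trace above
}
```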
Pretty sure it happened when restarting the Docker container, so yes, on shutdown.
edit: 11,999 stars!! Congrats :)
edit: 12,000 🎊 ✨ 🚀
I've taken a cursory look at the hashjoiner code and I don't immediately see any leaks, but that code sure is complicated.
I've hit this kind of memory accounting error on another occasion; these errors are hard to debug because there's no information about where the leak might be.
@knz what do you think of making BytesMonitors maintain a list of open children (monitors and BoundAccounts) to be printed on this panic?
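Roughly, I'm imagining something like this (a hypothetical, heavily simplified sketch; the real BytesMonitor/BoundAccount interplay is more involved):

```go
package main

import "fmt"

// account is a hypothetical child that tracks its own usage and origin.
type account struct {
	name string
	used int64
}

// trackedMonitor is a hypothetical monitor that remembers which children
// are still open, so a leftover-bytes panic can name the likely leak site.
type trackedMonitor struct {
	open map[*account]struct{}
}

func newTrackedMonitor() *trackedMonitor {
	return &trackedMonitor{open: make(map[*account]struct{})}
}

// OpenAccount registers a named child, e.g. "hashjoiner-buckets".
func (m *trackedMonitor) OpenAccount(name string) *account {
	a := &account{name: name}
	m.open[a] = struct{}{}
	return a
}

// CloseAccount releases the child's bytes and forgets it.
func (m *trackedMonitor) CloseAccount(a *account) {
	a.used = 0
	delete(m.open, a)
}

// Stop panics with the list of still-open children instead of a bare count.
func (m *trackedMonitor) Stop() {
	var leftover int64
	var names []string
	for a := range m.open {
		leftover += a.used
		names = append(names, fmt.Sprintf("%s (%d bytes)", a.name, a.used))
	}
	if leftover != 0 {
		panic(fmt.Sprintf("unexpected %d leftover bytes; open children: %v", leftover, names))
	}
}

func main() {
	m := newTrackedMonitor()
	a := m.OpenAccount("hashjoiner-buckets")
	a.used = 7085 // the account grows but is never closed...
	m.Stop()      // ...so the panic now names the suspect
}
```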
@andreimatei bad idea.
To investigate this, there's a trace option: --vmodule=bytes_usage=2.
You can enable that, then look at where the bytes are allocated and deallocated.
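For example, when starting the node (the exact invocation depends on your deployment):

```
cockroach start --vmodule=bytes_usage=2 <other flags>
```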
@knz I don't understand why you think that's a bad idea. We hit these problems a lot, and we can't retroactively turn on --vmodule=bytes_usage=2 for reported panics.
@jordanlewis feel free to take over the concerns and the design. My opinion is not evidence-based.
Just be sure to run before/after benchmarks with benchmem enabled if you change anything in that infrastructure.
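For instance, a micro-benchmark along these lines (hypothetical; the real target would be the account open/close path), run with -benchmem before and after the change:

```go
// In a _test.go file; run with: go test -bench . -benchmem
package main

import "testing"

// BenchmarkChildTracking measures the core bookkeeping cost the proposal
// would add: registering and unregistering a child in a map on every
// account open/close. -benchmem reports any allocations per operation.
func BenchmarkChildTracking(b *testing.B) {
	open := make(map[string]int64)
	for i := 0; i < b.N; i++ {
		open["hashjoiner-buckets"] = 7085  // open a child account
		delete(open, "hashjoiner-buckets") // close it
	}
}
```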
@jordanlewis do you think we should reevaluate the status of this issue (w.r.t. the 2.0 milestone) given the new changes to the memory accounting workflow? This issue, while not entirely resolved in the accounting logic, would at least no longer result in a panic.
But it's still something to fix and I think I can get to it in the 2.0 timeframe. Let's keep it on this milestone and I'll move it back if it turns out I can't get to it in time.
I tried reproducing this by running a similar query against v1.1.3 and shutting down the server while bypassing draining, but had no luck. Since this issue won't produce a panic in 2.0, I'm closing it, as there's not much more I can do.