Seen on a number of trybot runs in the past few weeks:
https://storage.googleapis.com/go-build-log/02de4ef8/windows-amd64-gce_80a1a98e.log
ok net 22.767s
runtime: gp: gp=0xc0422dcb60, goid=2299, gp->atomicstatus=8
runtime: g: g=0xa5cfe0, goid=0, g->atomicstatus=0
fatal error: bad g->status in ready
runtime stack:
runtime.throw(0x867c02, 0x16)
C:/workdir/go/src/runtime/panic.go:566 +0x9c
runtime.ready(0xc0422dcb60, 0x3, 0xc04237f501)
C:/workdir/go/src/runtime/proc.go:540 +0x14b
runtime.goready.func1()
C:/workdir/go/src/runtime/proc.go:270 +0x3f
runtime.systemstack(0xa5d020)
C:/workdir/go/src/runtime/asm_amd64.s:298 +0x7e
runtime.mstart()
C:/workdir/go/src/runtime/proc.go:1079
goroutine 2304 [running]:
runtime.systemstack_switch()
C:/workdir/go/src/runtime/asm_amd64.s:252 fp=0xc0420eba08 sp=0xc0420eba00
runtime.goready(0xc0422dcb60, 0x3)
C:/workdir/go/src/runtime/proc.go:271 +0x52 fp=0xc0420eba38 sp=0xc0420eba08
runtime.closechan(0xc04233b800)
C:/workdir/go/src/runtime/chan.go:371 +0x1c5 fp=0xc0420ebab8 sp=0xc0420eba38
net/http.(*persistConn).closeLocked(0xc04233c6c0, 0xa2e460, 0xc0420f0010)
C:/workdir/go/src/net/http/transport.go:1933 +0xb1 fp=0xc0420ebb00 sp=0xc0420ebab8
net/http.(*persistConn).close(0xc04233c6c0, 0xa2e460, 0xc0420f0010)
C:/workdir/go/src/net/http/transport.go:1913 +0x8c fp=0xc0420ebb30 sp=0xc0420ebb00
net/http.(*persistConn).readLoop.func1(0xc04233c6c0, 0xc0420ebc40)
C:/workdir/go/src/net/http/transport.go:1397 +0x48 fp=0xc0420ebb58 sp=0xc0420ebb30
net/http.(*persistConn).readLoop(0xc04233c6c0)
C:/workdir/go/src/net/http/transport.go:1470 +0x11d7 fp=0xc0420ebf98 sp=0xc0420ebb58
runtime.goexit()
C:/workdir/go/src/runtime/asm_amd64.s:2104 +0x1 fp=0xc0420ebfa0 sp=0xc0420ebf98
created by net/http.(*Transport).dialConn
C:/workdir/go/src/net/http/transport.go:1072 +0x4f2
goroutine 1 [chan receive]:
testing.(*T).Run(0xc04205a0c0, 0x8653e3, 0x13, 0x881ac8, 0xc04206dc01)
C:/workdir/go/src/testing/testing.go:647 +0x31c
testing.RunTests.func1(0xc04205a0c0)
C:/workdir/go/src/testing/testing.go:793 +0x6e
testing.tRunner(0xc04205a0c0, 0xc0420ede00)
C:/workdir/go/src/testing/testing.go:610 +0x88
testing.RunTests(0x880f40, 0xa58660, 0x188, 0x188, 0x64)
C:/workdir/go/src/testing/testing.go:799 +0x2fc
testing.(*M).Run(0xc0420edee8, 0x404212)
C:/workdir/go/src/testing/testing.go:743 +0x8c
net/http_test.TestMain(0xc0420edee8)
C:/workdir/go/src/net/http/main_test.go:19 +0x32
main.main()
net/http/_test/_testmain.go:880 +0xcd
goroutine 1837 [chan receive]:
testing.(*T).Parallel(0xc0424460c0)
C:/workdir/go/src/testing/testing.go:552 +0x156
net/http_test.setParallel(0xc0424460c0)
C:/workdir/go/src/net/http/main_test.go:87 +0x4a
net/http_test.TestServerTimeouts(0xc0424460c0)
C:/workdir/go/src/net/http/serve_test.go:462 +0xa8
testing.tRunner(0xc0424460c0, 0x8820e0)
C:/workdir/go/src/testing/testing.go:610 +0x88
created by testing.(*T).Run
C:/workdir/go/src/testing/testing.go:646 +0x2f2
goroutine 2138 [runnable]:
time.Sleep(0xf4240)
C:/workdir/go/src/runtime/time.go:59 +0xef
net/http_test.TestTimeoutHandlerRace.func1(0xa31720, 0xc04225d400, 0xc0421325a0)
C:/workdir/go/src/net/http/serve_test.go:1888 +0xe0
net/http.HandlerFunc.ServeHTTP(0x8821d8, 0xa31720, 0xc04225d400, 0xc0421325a0)
C:/workdir/go/src/net/http/server.go:1726 +0x4b
net/http.(*timeoutHandler).ServeHTTP.func1(0xc042303080, 0xc04225d400, 0xc0421325a0, 0xc04221b620)
C:/workdir/go/src/net/http/server.go:2531 +0x5a
created by net/http.(*timeoutHandler).ServeHTTP
C:/workdir/go/src/net/http/server.go:2533 +0x174
goroutine 2140 [runnable]:
time.Sleep(0xf4240)
C:/workdir/go/src/runtime/time.go:59 +0xef
net/http_test.TestTimeoutHandlerRace.func1(0xa31720, 0xc04225d540, 0xc042132870)
C:/workdir/go/src/net/http/serve_test.go:1888 +0xe0
net/http.HandlerFunc.ServeHTTP(0x8821d8, 0xa31720, 0xc04225d540, 0xc042132870)
C:/workdir/go/src/net/http/server.go:1726 +0x4b
net/http.(*timeoutHandler).ServeHTTP.func1(0xc042303080, 0xc04225d540, 0xc042132870, 0xc04221b860)
C:/workdir/go/src/net/http/server.go:2531 +0x5a
created by net/http.(*timeoutHandler).ServeHTTP
C:/workdir/go/src/net/http/server.go:2533 +0x174
goroutine 2305 [select]:
net/http.(*persistConn).writeLoop(0xc04233c6c0)
C:/workdir/go/src/net/http/transport.go:1659 +0x3c4
created by net/http.(*Transport).dialConn
C:/workdir/go/src/net/http/transport.go:1073 +0x517
goroutine 2300 [IO wait]:
net.runtime_pollWait(0x223bc28, 0x72, 0xc0420f0848)
C:/workdir/go/src/runtime/netpoll.go:160 +0x60
net.(*pollDesc).wait(0xc04237f4f0, 0x72, 0xc0420f0848, 0x0)
C:/workdir/go/src/net/fd_poll_runtime.go:75 +0x3f
net.(*ioSrv).ExecIO(0xc042022020, 0xc04237f3e0, 0x85ea59, 0x8, 0xc04233f120, 0xc042352c00, 0x0, 0x0)
C:/workdir/go/src/net/fd_windows.go:171 +0x18a
net.(*netFD).acceptOne(0xc04237f380, 0xc0422d1180, 0x2, 0x2, 0xc04237f3e0, 0xc04217d050, 0xc0420bde40, 0xc0420bddc0)
C:/workdir/go/src/net/fd_windows.go:529 +0x1f8
net.(*netFD).accept(0xc04237f380, 0x0, 0x0, 0x0)
C:/workdir/go/src/net/fd_windows.go:559 +0x131
net.(*TCPListener).accept(0xc042022368, 0x434c8e, 0xc0420bde70, 0x4c1a24)
C:/workdir/go/src/net/tcpsock_posix.go:132 +0x35
net.(*TCPListener).Accept(0xc042022368, 0x881158, 0xc04202ab00, 0xa32060, 0xc04217cfc0)
C:/workdir/go/src/net/tcpsock.go:225 +0x50
net/http.(*Server).Serve(0xc042417880, 0xa31520, 0xc042022368, 0x0, 0x0)
C:/workdir/go/src/net/http/server.go:2273 +0x1d5
net/http/httptest.(*Server).goServe.func1(0xc04233b5c0)
C:/workdir/go/src/net/http/httptest/server.go:236 +0x74
created by net/http/httptest.(*Server).goServe
C:/workdir/go/src/net/http/httptest/server.go:237 +0x63
goroutine 2122 [runnable]:
time.Sleep(0xf4240)
C:/workdir/go/src/runtime/time.go:59 +0xef
net/http_test.TestTimeoutHandlerRace.func1(0xa31720, 0xc04204b9a0, 0xc04225aa50)
C:/workdir/go/src/net/http/serve_test.go:1888 +0xe0
net/http.HandlerFunc.ServeHTTP(0x8821d8, 0xa31720, 0xc04204b9a0, 0xc04225aa50)
C:/workdir/go/src/net/http/server.go:1726 +0x4b
net/http.(*timeoutHandler).ServeHTTP.func1(0xc042303080, 0xc04204b9a0, 0xc04225aa50, 0xc04213ec60)
C:/workdir/go/src/net/http/server.go:2531 +0x5a
created by net/http.(*timeoutHandler).ServeHTTP
C:/workdir/go/src/net/http/server.go:2533 +0x174
goroutine 2299 [semacquire]:
sync.runtime_Semacquire(0xc04233c794)
C:/workdir/go/src/runtime/sema.go:47 +0x37
sync.(*Mutex).Lock(0xc04233c790)
C:/workdir/go/src/sync/mutex.go:85 +0xa3
net/http.(*persistConn).canceled(0xc04233c6c0, 0x0, 0x0)
C:/workdir/go/src/net/http/transport.go:1290 +0x56
net/http.(*persistConn).mapRoundTripErrorAfterClosed(0xc04233c6c0, 0x0, 0x0, 0x85d801)
C:/workdir/go/src/net/http/transport.go:1369 +0x36
net/http.(*persistConn).roundTrip(0xc04233c6c0, 0xc04230a5a0, 0x0, 0x0, 0x0)
C:/workdir/go/src/net/http/transport.go:1869 +0xa41
net/http.(*Transport).RoundTrip(0xc04221d2c0, 0xc04221d3b0, 0xc04221d2c0, 0x0, 0x0)
C:/workdir/go/src/net/http/transport.go:381 +0x4f5
net/http.send(0xc04221d3b0, 0xa2c760, 0xc04221d2c0, 0x0, 0x0, 0x0, 0x8, 0xc0422e1c38, 0xc042022380)
C:/workdir/go/src/net/http/client.go:256 +0x166
net/http.(*Client).send(0xc042145080, 0xc04221d3b0, 0x0, 0x0, 0x0, 0xc042022380, 0x0, 0x1)
C:/workdir/go/src/net/http/client.go:146 +0x109
net/http.(*Client).doFollowingRedirects(0xc042145080, 0xc04221d3b0, 0x8814e0, 0x16, 0x0, 0x0)
C:/workdir/go/src/net/http/client.go:529 +0x5e8
net/http.(*Client).Get(0xc042145080, 0xc04230a4e0, 0x16, 0xc04205b080, 0xc04239a9a0, 0x0)
C:/workdir/go/src/net/http/client.go:419 +0x9a
net/http_test.testHandlerPanic(0xc04205b080, 0xc0422e0000, 0x7d8260, 0xc042283c40)
C:/workdir/go/src/net/http/serve_test.go:2238 +0x304
net/http_test.TestHandlerPanic_h1(0xc04205b080)
C:/workdir/go/src/net/http/serve_test.go:2178 +0x85
testing.tRunner(0xc04205b080, 0x881ac8)
C:/workdir/go/src/testing/testing.go:610 +0x88
created by testing.(*T).Run
C:/workdir/go/src/testing/testing.go:646 +0x2f2
goroutine 2136 [runnable]:
time.Sleep(0xf4240)
C:/workdir/go/src/runtime/time.go:59 +0xef
net/http_test.TestTimeoutHandlerRace.func1(0xa31720, 0xc04225d360, 0xc0421324b0)
C:/workdir/go/src/net/http/serve_test.go:1888 +0xe0
net/http.HandlerFunc.ServeHTTP(0x8821d8, 0xa31720, 0xc04225d360, 0xc0421324b0)
C:/workdir/go/src/net/http/server.go:1726 +0x4b
net/http.(*timeoutHandler).ServeHTTP.func1(0xc042303080, 0xc04225d360, 0xc0421324b0, 0xc04221b3e0)
C:/workdir/go/src/net/http/server.go:2531 +0x5a
created by net/http.(*timeoutHandler).ServeHTTP
C:/workdir/go/src/net/http/server.go:2533 +0x174
goroutine 1971 [chan receive]:
testing.(*T).Parallel(0xc04205bc80)
C:/workdir/go/src/testing/testing.go:552 +0x156
net/http_test.setParallel(0xc04205bc80)
C:/workdir/go/src/net/http/main_test.go:87 +0x4a
net/http_test.TestTLSHandshakeTimeout(0xc04205bc80)
C:/workdir/go/src/net/http/serve_test.go:1026 +0x5d
testing.tRunner(0xc04205bc80, 0x8821a8)
C:/workdir/go/src/testing/testing.go:610 +0x88
created by testing.(*T).Run
C:/workdir/go/src/testing/testing.go:646 +0x2f2
FAIL net/http 3.258s
2016/09/06 21:33:51 Failed: exit status 1
/cc @mdempsky (it was his most recent trybot failure)
Relevant part seems to be that goroutine 2304 is closing pc.closech and thinks it should ready goroutine 2299 to run. In the trace, goroutine 2299 is executing the body of a 'case <-pc.closech' in a select. But clearly it started that code ahead of when goroutine 2304 thought it should. Maybe the synchronization is wrong in the case of a channel being closed and the select might end up both queueing on the channel and noticing the channel is closed and returning without dequeueing. Easy to imagine that being a race that would be rare.
Oddly, I can't find a single instance of this crash or any "status in ready" crash on the dashboard logs, back to the beginning of the year.
atomicstatus=8 is actually _Gcopystack, but that only happens when the goroutine is copying its own stack: it flips from _Grunning to _Gcopystack and then back. Oddly, the stack trace shows the goroutine blocked in semacquire inside a Mutex.Lock, not actually running. Maybe it ran ahead and got to the lock between the start of the panic and the dump, but that seems like a stretch. So the dump shows semacquire, we somehow caught it in _Gcopystack, and between those it must have been _Grunning. A lot of ducks in a row. Hard to believe.
This just happened on a CL at tip in a trybot run:
https://storage.googleapis.com/go-build-log/5a25a022/windows-amd64-gce_5723b9ab.log
(in https://go-review.googlesource.com/c/34656/ PS1)
I've marked this Go 1.8 again. This seems not good.
@aclements, any update here?
Another trybot failure: https://storage.googleapis.com/go-build-log/e77471fe/windows-amd64-gce_24f4464c.log
@bradfitz, do you know if this is only happening on windows-amd64?
Some observations:
This has happened once on the builders (non-trybot): 2016-11-11T19:46:17-39e3cbf/windows-amd64-gce
Logs 1 and 2 fail in closechan, but 3 and 4 fail in send from a select.
Log 1 received from pc.closech on goroutine 2299 just before this panic, which is the channel that's being closed by goroutine 2304. The select is in roundTrip (based on parent commit 8259cf3). According to the traceback, 2304 is in semacquire, which means it had to be in _Grunning (2) a moment ago and is in _Gwaiting (4) as of the traceback. But when the panic was printed, it was in _Gcopystack (8), which is only used for synchronous stack growth. Perhaps something in the select path caused stack growth while the goroutine was visible to the close?
Log 3 panics on goroutine 6749 when sending on sc.wantWriteFrameCh from writeFrameFromHandler (based on parent commit 9def857). The offending goroutine is 6730. I'm not quite sure what 6730 is doing; there are a lot of mentions of "close" in the traceback, which might have closed sc.wantWriteFrameCh, but as far as I can tell this channel never gets closed. Unwinding all the way to serve shows we're in a receive case on sc.wantWriteFrameCh. 6730 is also in the process of panicking, but I'm not sure what that's due to.
I've only seen it happen on windows-amd64, but you have the greppable logs.
I suppose we could run the net/http tests in a loop on some Windows VMs.
Austin and I found this. CL to follow.
Oh, and the bug was introduced in Go 1.4.
CL https://golang.org/cl/34835 mentions this issue.
CL https://golang.org/cl/35637 mentions this issue.
Most helpful comment
Austin and I found this. CL to follow.