go version? λ go version
go version go1.13 windows/amd64
And Go 1.14-RC1.
Yes.
go env? go env Output
λ go env
set GO111MODULE=
set GOARCH=amd64
set GOBIN=
set GOCACHE=C:\Users\klaus\AppData\Local\go-build
set GOENV=C:\Users\klaus\AppData\Roaming\go\env
set GOEXE=.exe
set GOFLAGS=
set GOHOSTARCH=amd64
set GOHOSTOS=windows
set GONOPROXY=
set GONOSUMDB=
set GOOS=windows
set GOPATH=c:\gopath
set GOPRIVATE=
set GOPROXY=https://proxy.golang.org,direct
set GOROOT=c:\go
set GOSUMDB=sum.golang.org
set GOTMPDIR=
set GOTOOLDIR=c:\go\pkg\tool\windows_amd64
set GCCGO=gccgo
set AR=ar
set CC=gcc
set CXX=g++
set CGO_ENABLED=1
set GOMOD=
set CGO_CFLAGS=-g -O2
set CGO_CPPFLAGS=
set CGO_CXXFLAGS=-g -O2
set CGO_FFLAGS=-g -O2
set CGO_LDFLAGS=-g -O2
set PKG_CONFIG=pkg-config
set GOGCCFLAGS=-m64 -mthreads -fmessage-length=0 -fdebug-prefix-map=c:\temp\wintemp\go-build453787042=/tmp/go-build -gno-record-gcc-switches
Isolated code: reproducer.zip
go test -bench=. -test.benchtime=10s
Most of the code is needed for the test setup, only (*tokens).EstimatedBits and mFastLog2 is run during the benchmark.
λ benchcmp go113.txt go114.txt
benchmark old ns/op new ns/op delta
Benchmark_tokens_EstimatedBits-12 663 716 +7.99%
benchmark old MB/s new MB/s speedup
Benchmark_tokens_EstimatedBits-12 1.51 1.40 0.93x
Equivalent performance.
8% performance regression.
For convenience, here's the benchmark (without testdata and other files in .zip):
package flate
import (
"io/ioutil"
"testing"
)
// Benchmark_tokens_EstimatedBits measures the cost of a single
// (*tokens).EstimatedBits call over a pre-loaded token set.
func Benchmark_tokens_EstimatedBits(b *testing.B) {
	// loadTestTokens is defined elsewhere in the package; it fills a
	// *tokens from test data before the timer starts.
	tok := loadTestTokens(b)
	b.ResetTimer()
	// One "byte", one token iteration.
	b.SetBytes(1)
	for i := 0; i < b.N; i++ {
		_ = tok.EstimatedBits()
	}
}
// ...
token.go (with tokens.EstimatedBits code)
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package flate
import (
"bytes"
"encoding/binary"
"fmt"
"io"
"math"
)
// EstimatedBits returns a minimum size, in bits, estimated by an *optimal*
// entropy encoding of the block: the Shannon entropy of the literal, length
// and offset histograms, plus the extra bits required by the length and
// offset codes.
func (t *tokens) EstimatedBits() int {
	shannon := float32(0)
	bits := int(0)
	nMatches := 0
	if t.nLits > 0 {
		invTotal := 1.0 / float32(t.nLits)
		for _, v := range t.litHist[:] {
			if v > 0 {
				n := float32(v)
				// Shannon term: -p*log2(p) scaled by the total count.
				shannon += -mFastLog2(n*invTotal) * n
			}
		}
		// Just add 15 for EOB
		shannon += 15
		// extraHist[1:] holds the length codes; index 0 is EOB.
		for i, v := range t.extraHist[1 : literalCount-256] {
			if v > 0 {
				n := float32(v)
				shannon += -mFastLog2(n*invTotal) * n
				// i&31: lengthExtraBits has length 32, the mask lets the
				// compiler elide the bounds check.
				bits += int(lengthExtraBits[i&31]) * int(v)
				nMatches += int(v)
			}
		}
	}
	if nMatches > 0 {
		invTotal := 1.0 / float32(nMatches)
		for i, v := range t.offHist[:offsetCodeCount] {
			if v > 0 {
				n := float32(v)
				shannon += -mFastLog2(n*invTotal) * n
				bits += int(offsetExtraBits[i&31]) * int(v)
			}
		}
	}
	return int(shannon) + bits
}
// from https://stackoverflow.com/a/28730362
// mFastLog2 returns a fast approximation of log2(val) for positive val,
// using the float bit pattern: the exponent field gives the integer part
// and a small polynomial refines the mantissa's contribution.
// Adapted from https://stackoverflow.com/a/28730362
func mFastLog2(val float32) float32 {
	raw := int32(math.Float32bits(val))
	// Biased exponent minus 128: the integer part of the logarithm (off by
	// one, corrected by the polynomial below).
	approx := (float32)(((raw >> 23) & 255) - 128)
	// Keep sign and mantissa bits, then force the exponent to 127 so the
	// value lands in [1, 2).
	raw &= -0x7f800001
	raw += 127 << 23
	m := math.Float32frombits(uint32(raw))
	// Quadratic polynomial approximating log2 over [1, 2).
	approx += ((-0.34484843)*m+2.02466578)*m - 0.67487759
	return approx
}
const (
	lengthShift = 22                     // bit position of the length field inside a match token
	offsetMask  = 1<<lengthShift - 1     // low bits of a token holding the offset
	typeMask    = 3 << 30                // bits distinguishing literal from match tokens
	matchType   = 1 << 30                // tokens >= matchType encode a match
	literalCount = 286                   // number of literal/length codes (0..255 literals, EOB, length codes)
	maxMatchLength = 258 // The longest match for the compressor
	maxStoreBlockSize = 65535            // largest stored (uncompressed) block
	debugDecode = false                  // enables extra validation panics in Add* methods
	baseMatchOffset = 1 // The smallest match offset
	baseMatchLength = 3 // The smallest match length per the RFC section 3.2.5
	maxMatchOffset = 1 << 15 // The largest match offset
	offsetCodeCount = 30                 // number of offset codes used in the 32 KiB window
	endBlockMarker = 256                 // literal/length code marking end of block
)
// The length code for length X (MIN_MATCH_LENGTH <= X <= MAX_MATCH_LENGTH)
// is lengthCodes[length - MIN_MATCH_LENGTH]
var lengthCodes = [256]uint8{
	0, 1, 2, 3, 4, 5, 6, 7, 8, 8,
	9, 9, 10, 10, 11, 11, 12, 12, 12, 12,
	13, 13, 13, 13, 14, 14, 14, 14, 15, 15,
	15, 15, 16, 16, 16, 16, 16, 16, 16, 16,
	17, 17, 17, 17, 17, 17, 17, 17, 18, 18,
	18, 18, 18, 18, 18, 18, 19, 19, 19, 19,
	19, 19, 19, 19, 20, 20, 20, 20, 20, 20,
	20, 20, 20, 20, 20, 20, 20, 20, 20, 20,
	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
	21, 21, 21, 21, 21, 21, 22, 22, 22, 22,
	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
	22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
	23, 23, 23, 23, 23, 23, 23, 23, 24, 24,
	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
	24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	25, 25, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 27, 28,
}

// lengthCodes1 is length codes, but starting at 1.
// Using the pre-offset table avoids an add in the hot AddMatch path.
var lengthCodes1 = [256]uint8{
	1, 2, 3, 4, 5, 6, 7, 8, 9, 9,
	10, 10, 11, 11, 12, 12, 13, 13, 13, 13,
	14, 14, 14, 14, 15, 15, 15, 15, 16, 16,
	16, 16, 17, 17, 17, 17, 17, 17, 17, 17,
	18, 18, 18, 18, 18, 18, 18, 18, 19, 19,
	19, 19, 19, 19, 19, 19, 20, 20, 20, 20,
	20, 20, 20, 20, 21, 21, 21, 21, 21, 21,
	21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
	22, 22, 22, 22, 22, 22, 22, 22, 22, 22,
	22, 22, 22, 22, 22, 22, 23, 23, 23, 23,
	23, 23, 23, 23, 23, 23, 23, 23, 23, 23,
	23, 23, 24, 24, 24, 24, 24, 24, 24, 24,
	24, 24, 24, 24, 24, 24, 24, 24, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 28, 28, 28, 28, 28, 28,
	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
	28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
	28, 28, 28, 28, 28, 29,
}

// offsetCodes maps the low 8 bits of an offset (offsets below 256)
// to its offset code; larger offsets are handled via offsetCodes14.
var offsetCodes = [256]uint32{
	0, 1, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7,
	8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 9, 9, 9,
	10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
	11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
	12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
	13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13,
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
	14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14,
	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
	15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
}

// offsetCodes14 are offsetCodes, but with 14 added.
// Indexed by off>>7 for offsets >= 256; the pre-added bias avoids an
// add in the hot offsetCode path.
var offsetCodes14 = [256]uint32{
	14, 15, 16, 17, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
	22, 22, 22, 22, 22, 22, 22, 22, 23, 23, 23, 23, 23, 23, 23, 23,
	24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24,
	25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26,
	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
	28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
	29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29, 29,
}

// The number of extra bits needed by length code X - LENGTH_CODES_START.
var lengthExtraBits = [32]int8{
	/* 257 */ 0, 0, 0,
	/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
	/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
	/* 280 */ 4, 5, 5, 5, 5, 0,
}

// offset code word extra bits.
var offsetExtraBits = [64]int8{
	0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
	4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
	9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
	/* extended window */
	14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
}
// token packs a literal byte or a (length, offset) match into one uint32.
// The top two bits (typeMask) select the kind; for matches the length sits
// above lengthShift and the offset in the low offsetMask bits.
type token uint32

// tokens buffers encoded tokens for one block together with the
// histograms needed for size estimation and Huffman coding.
type tokens struct {
	nLits     int          // number of literals plus matches added
	extraHist [32]uint16   // codes 256->maxnumlit
	offHist   [32]uint16   // offset codes
	litHist   [256]uint16  // codes 0->255
	n         uint16       // Must be able to contain maxStoreBlockSize
	tokens    [maxStoreBlockSize + 1]token
}
// Reset clears the token buffer and all histograms so t can be reused.
// It is a no-op when t is already empty.
func (t *tokens) Reset() {
	if t.n == 0 {
		return
	}
	t.n = 0
	t.nLits = 0
	// Zero each histogram by assigning a fresh zero-valued array.
	t.litHist = [256]uint16{}
	t.extraHist = [32]uint16{}
	t.offHist = [32]uint16{}
}
// indexTokens replaces the contents of t with the supplied tokens,
// rebuilding the literal, length and offset histograms.
func (t *tokens) indexTokens(in []token) {
	t.Reset()
	for _, tok := range in {
		if tok >= matchType {
			t.AddMatch(uint32(tok.length()), tok.offset())
		} else {
			t.AddLiteral(tok.literal())
		}
	}
}
// AddLiteral appends a single literal byte and updates the literal histogram.
func (t *tokens) AddLiteral(lit byte) {
	t.tokens[t.n] = token(lit)
	t.litHist[lit]++
	t.n++
	t.nLits++
}
// AddMatch adds a match to the tokens.
// xlength must already have baseMatchLength subtracted; xoffset must have
// baseMatchOffset subtracted (validated only when debugDecode is set).
// This function is very sensitive to inlining and right on the border.
func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
	if debugDecode {
		if xlength >= maxMatchLength+baseMatchLength {
			panic(fmt.Errorf("invalid length: %v", xlength))
		}
		if xoffset >= maxMatchOffset+baseMatchOffset {
			panic(fmt.Errorf("invalid offset: %v", xoffset))
		}
	}
	t.nLits++
	// &31 lets the compiler elide the bounds check (extraHist is length 32).
	lengthCode := lengthCodes1[uint8(xlength)] & 31
	t.tokens[t.n] = token(matchType | xlength<<lengthShift | xoffset)
	t.extraHist[lengthCode]++
	t.offHist[offsetCode(xoffset)&31]++
	t.n++
}
// AddMatchLong adds a match to the tokens, potentially longer than max match length.
// Length should NOT have the base subtracted, only offset should.
// Long matches are split into multiple tokens of at most maxMatchLength bytes.
func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
	if debugDecode {
		if xoffset >= maxMatchOffset+baseMatchOffset {
			panic(fmt.Errorf("invalid offset: %v", xoffset))
		}
	}
	// The offset code is identical for every emitted token.
	oc := offsetCode(xoffset) & 31
	for xlength > 0 {
		xl := xlength
		if xl > maxMatchLength {
			// We need to have at least baseMatchLength left over for next loop.
			xl = maxMatchLength - baseMatchLength
		}
		xlength -= xl
		xl -= baseMatchLength
		t.nLits++
		// &31 lets the compiler elide the bounds check (extraHist is length 32).
		lengthCode := lengthCodes1[uint8(xl)] & 31
		t.tokens[t.n] = token(matchType | uint32(xl)<<lengthShift | xoffset)
		t.extraHist[lengthCode]++
		t.offHist[oc]++
		t.n++
	}
}
// AddEOB appends the end-of-block marker token.
func (t *tokens) AddEOB() {
	t.tokens[t.n] = token(endBlockMarker)
	// EOB is counted at index 0 of the extra-length histogram.
	t.extraHist[0]++
	t.n++
}
// Slice returns the tokens added so far as a slice of the internal buffer.
func (t *tokens) Slice() []token {
	return t.tokens[:t.n]
}
// VarInt returns the tokens as varint encoded bytes.
func (t *tokens) VarInt() []byte {
	// Worst case: every token needs MaxVarintLen32 bytes.
	buf := make([]byte, binary.MaxVarintLen32*int(t.n))
	n := 0
	for i := uint16(0); i < t.n; i++ {
		n += binary.PutUvarint(buf[n:], uint64(t.tokens[i]))
	}
	return buf[:n]
}
// FromVarInt restores t to the varint encoded tokens provided.
// Any data in t is removed.
func (t *tokens) FromVarInt(b []byte) error {
	r := bytes.NewReader(b)
	var decoded []token
	for {
		v, err := binary.ReadUvarint(r)
		if err != nil {
			// EOF marks the end of the encoded stream; anything else is a
			// real decoding failure.
			if err == io.EOF {
				break
			}
			return err
		}
		decoded = append(decoded, token(v))
	}
	t.indexTokens(decoded)
	return nil
}
// typ returns the type bits of a token (literalType or matchType).
func (t token) typ() uint32 { return uint32(t) & typeMask }

// literal returns the literal byte of a literal token.
func (t token) literal() uint8 { return uint8(t) }

// offset returns the (base-subtracted) offset of a match token.
func (t token) offset() uint32 { return uint32(t) & offsetMask }

// length returns the (base-subtracted) length of a match token.
func (t token) length() uint8 { return uint8(t >> lengthShift) }
// offsetCode returns the offset code corresponding to a specific offset.
// Offsets below 256 are looked up directly in offsetCodes; larger offsets
// use offsetCodes14 (offsetCodes pre-biased by 14) indexed by off>>7.
// The uint8 conversions clamp the index so the compiler can elide bounds
// checks on the 256-entry tables.
func offsetCode(off uint32) uint32 {
	if off < uint32(len(offsetCodes)) {
		return offsetCodes[uint8(off)]
	}
	return offsetCodes14[uint8(off>>7)]
}
/cc @aclements @randall77 @ianlancetaylor @mknyszek
I get a startling regression on a Mac laptop -- 70% slowdown -- will look further to see if I am making some obvious mistake.
@dr2chase I can reproduce your result on my Mac laptop too:
flate $ go1.12.16 test -bench=. -test.benchtime=10s
goos: darwin
goarch: amd64
pkg: go.test/flate
Benchmark_tokens_EstimatedBits-8 20000000 906 ns/op 1.10 MB/s
PASS
ok go.test/flate 19.106s
flate $ go1.13.7 test -bench=. -test.benchtime=10s
goos: darwin
goarch: amd64
pkg: go.test/flate
Benchmark_tokens_EstimatedBits-8 16228050 745 ns/op 1.34 MB/s
PASS
ok go.test/flate 12.925s
flate $ go1.14rc1 test -bench=. -test.benchtime=10s
goos: darwin
goarch: amd64
pkg: go.test/flate
Benchmark_tokens_EstimatedBits-8 4280259 2876 ns/op 0.35 MB/s
PASS
ok go.test/flate 15.204s
Tentatively marking this as a release blocker, until we understand what's causing this and whether this is representative of many Go programs. /cc @cagedmantis @toothrot
Charming. Turning off asynchronous preemption eliminates a whole lot of the regression, but not all of it. (Why did I try this? Because attempting to collect profiles made both of them equally, terribly slow.)
benchstat 20200207T165050.Go-1.13.stdout 20200207T165050.Go-1.14.stdout
name \ time/op 20200207T165050.Go-1.13 20200207T165050.Go-1.14 delta
_tokens_EstimatedBits-4 791ns ± 9% 911ns ± 9% +15.06% (p=0.016 n=5+5)
I have to quit for the night, what I find in 1.14 vs 1.13 (ignoring likely interactions with preemption) is an extra instruction in the GOSSAFUNC output, comparing final output for
GOSSAFUNC='(*tokens).EstimatedBits' go build token.go
I doubt we're going to fix that extra instruction, but the other slowdown seems worth further look.
The inner loops are
1.13:
v106 00017 (+25) INCQ CX
v18 00018 (+25) CMPQ CX, $256
b6 00019 (25) JGE 55
v41 00020 (25) MOVWLZX 136(AX)(CX*2), DX
v178 00021 (+26) TESTW DX, DX
b7 00022 (26) JLS 53
v46 00023 (+27) XORPS X2, X2
v46 00024 (27) CVTSL2SS DX, X2
v289 00025 (+28) MOVUPS X2, X3
v48 00026 (28) MULSS X1, X2
# $GOROOT/src/math/unsafe.go
v60 00027 (+12) MOVL X2, DX
# /Users/drchase/work/gocode/src/github.com/dr2chase/benchmarks/klauspost/token.go
v301 00028 (+58) MOVL DX, BX
v65 00029 (58) SARL $23, DX
v67 00030 (58) MOVBLZX DX, DX
v69 00031 (58) ADDL $-128, DX
v70 00032 (58) XORPS X2, X2
v70 00033 (58) CVTSL2SS DX, X2
v72 00034 (+59) ANDL $-2139095041, BX
v74 00035 (+60) LEAL 1065353216(BX), DX
# $GOROOT/src/math/unsafe.go
v85 00036 (+18) MOVL DX, X4
# /Users/drchase/work/gocode/src/github.com/dr2chase/benchmarks/klauspost/token.go
v276 00037 (+62) MOVSS $(-0.34484842419624329), X5
v90 00038 (62) MULSS X4, X5
v274 00039 (62) MOVSS $(2.0246658325195312), X6
v92 00040 (62) ADDSS X5, X6
v93 00041 (62) MULSS X4, X6
v273 00042 (62) MOVSS $(0.6748775839805603), X4
v95 00043 (62) SUBSS X4, X6
v96 00044 (62) ADDSS X6, X2
v272 00045 (+28) MOVSS $(-0.0), X5
v100 00046 (28) PXOR X5, X2
v102 00047 (28) MULSS X2, X3
v103 00048 (28) ADDSS X3, X0
v50 00049 (?) NOP
v55 00050 (+57) XCHGL AX, AX
v80 00051 (+61) XCHGL AX, AX
b14 00052 (28) JMP 17
v172 00053 (28) MOVSS $(0.6748775839805603), X4
b40 00054 (26) JMP 17
and 1.14
1.14x
v106 00019 (+25) INCQ CX
v117 00020 (+25) CMPQ CX, $256
b6 00021 (25) JGE 58
v41 00022 (25) MOVWLZX 136(AX)(CX*2), DX
v139 00023 (+26) TESTW DX, DX
b7 00024 (26) JLS 55
v46 00025 (+27) XORPS X2, X2
v46 00026 (27) CVTSL2SS DX, X2
v252 00027 (+28) MOVUPS X2, X3
v48 00028 (28) MULSS X1, X2
# $GOROOT/src/math/unsafe.go
v60 00029 (+12) MOVL X2, DX
# /Users/drchase/work/gocode/src/github.com/dr2chase/benchmarks/klauspost/token.go
v173 00030 (+58) MOVL DX, BX
v65 00031 (58) SARL $23, DX
v67 00032 (58) MOVBLZX DX, DX
v69 00033 (58) ADDL $-128, DX
v70 00034 (58) XORPS X2, X2
v70 00035 (58) CVTSL2SS DX, X2
v72 00036 (+59) ANDL $-2139095041, BX
v74 00037 (+60) LEAL 1065353216(BX), DX
# $GOROOT/src/math/unsafe.go
v85 00038 (+18) MOVL DX, X4
# /Users/drchase/work/gocode/src/github.com/dr2chase/benchmarks/klauspost/token.go
v171 00039 (+62) MOVSS $(-0.34484842419624329), X5
v90 00040 (62) MULSS X4, X5
v153 00041 (62) MOVSS $(2.0246658325195312), X6
v92 00042 (62) ADDSS X6, X5
v93 00043 (62) MULSS X5, X4
v151 00044 (62) MOVSS $(0.6748775839805603), X5
v95 00045 (62) SUBSS X5, X4
v96 00046 (62) ADDSS X4, X2
v79 00047 (+28) MOVSS $(-0.0), X4
v100 00048 (28) PXOR X4, X2
v102 00049 (28) MULSS X2, X3
v103 00050 (28) ADDSS X3, X0
v50 00051 (?) NOP
v55 00052 (+57) XCHGL AX, AX
v80 00053 (+61) XCHGL AX, AX
b14 00054 (28) JMP 19
v221 00055 (28) MOVSS $(0.6748775839805603), X5
v263 00056 (28) MOVSS $(2.0246658325195312), X6
b29 00057 (26) JMP 19
It might be worthwhile quickly eliminating #35881 as the source of the regression. Is your CPU listed in the description of #35881? Are you running a recent version of the microcode? If so, you could try
Adding GOAMD64="alignedjumps" gives a very small performance reduction, but within the margin of error - but I am also not seeing the 70% reduction. Only have my laptop on battery so benchmarks are a bit shaky.
λ benchstat unaligned.txt aligned.txt
name old time/op new time/op delta
_tokens_EstimatedBits-12 694ns ± 2% 695ns ± 2% ~ (p=0.921 n=9+10)
I did a as-careful-as-possible laptop benchmark, and with asynchronous preemption OFF, the branch alignment fix makes 1.13 and 1.14 effectively equal (on my laptop).
We still don't entirely understand what's wrong with preemption in this benchmark, will keep looking, will also see how the latest version of the branch alignment batch compares.
FYI: Just got attracted by this issue, I ran the reproducer and my benchmark shows that async preemption is the only problem (disable async preemption makes the program even faster with go114) on my MacBook Pro 15inch 2019 (power from charger):
$ go113 test -v -run=none -bench=. -test.benchtime=10s -benchmem -count=5 . | tee bench.113.txt
$ go114 test -v -run=none -bench=. -test.benchtime=10s -benchmem -count=5 . | tee bench.114.txt
$ GODEBUG=asyncpreemptoff=1 go114 test -v -run=none -bench=. -test.benchtime=10s -benchmem -count=5 . | tee bench.114.asyncpreeptoff.txt
$ benchstat bench.113.txt bench.114.txt
name old time/op new time/op delta
_tokens_EstimatedBits-12 735ns ± 2% 2505ns ± 2% +240.91% (p=0.008 n=5+5)
name old speed new speed delta
_tokens_EstimatedBits-12 1.36MB/s ± 1% 0.40MB/s ± 4% -70.88% (p=0.008 n=5+5)
name old alloc/op new alloc/op delta
_tokens_EstimatedBits-12 0.00B 0.00B ~ (all equal)
name old allocs/op new allocs/op delta
_tokens_EstimatedBits-12 0.00 0.00 ~ (all equal)
$ benchstat bench.113.txt bench.114.asyncpreeptoff.txt
name old time/op new time/op delta
_tokens_EstimatedBits-12 735ns ± 2% 650ns ± 1% -11.49% (p=0.008 n=5+5)
name old speed new speed delta
_tokens_EstimatedBits-12 1.36MB/s ± 1% 1.54MB/s ± 0% +13.24% (p=0.016 n=5+4)
name old alloc/op new alloc/op delta
_tokens_EstimatedBits-12 0.00B 0.00B ~ (all equal)
name old allocs/op new allocs/op delta
_tokens_EstimatedBits-12 0.00 0.00 ~ (all equal)
Go version, processor and kernel spec:
$ go113 version
go version go1.13.4 darwin/amd64
$ go114 version
go version devel +ee3a3717aa Thu Feb 6 01:09:50 2020 +0000 darwin/amd64
$ sysctl -n machdep.cpu.brand_string
Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
$ uname -a
Darwin ou.local 19.3.0 Darwin Kernel Version 19.3.0: Thu Jan 9 20:58:23 PST 2020; root:xnu-6153.81.5~1/RELEASE_X86_64 x86_64
Just a hypothesis: async preemption is preempting the for i := 0; i < b.N; i++ { ... } loop which causes a inaccurate measurement.
The alignment issue depends on your CPU's microcode version, which Apple has updated in the last few months. No idea how that works for Linux users. For example:
sysctl -a | egrep machdep.cpu.'(family|model|extfamily|stepping|microcode)'
machdep.cpu.family: 6
machdep.cpu.model: 142
machdep.cpu.extfamily: 0
machdep.cpu.stepping: 9
machdep.cpu.microcode_version: 202
On Linux the microcode version can be found in /proc/cpuinfo.
Re the large slowdown on Darwin: there's something strange going on with signals here.
If I build this benchmark with 1.13.6, I get:
% ./113.test -test.bench .\* -test.benchtime=1000000x
Benchmark_tokens_EstimatedBits-16 1000000 657 ns/op 1.52 MB/s
% ./113.test -test.bench .\* -test.benchtime=1000000x -test.cpuprofile=113.prof
Benchmark_tokens_EstimatedBits-16 1000000 2551 ns/op 0.39 MB/s
i.e., just turning on the profiler leads to an almost 4x slowdown.
1.14 is different because it uses signals all the time. So we always see the slow behavior unless both the profiler and the async preempt are off:
% ./tip.test -test.bench .\* -test.benchtime=1000000x
Benchmark_tokens_EstimatedBits-16 1000000 2498 ns/op 0.40 MB/s
% ./tip.test -test.bench .\* -test.benchtime=1000000x -test.cpuprofile=tip.prof
Benchmark_tokens_EstimatedBits-16 1000000 2536 ns/op 0.39 MB/s
% GODEBUG=asyncpreemptoff=1 ./tip.test -test.bench .\* -test.benchtime=1000000x -test.cpuprofile=tip.prof
Benchmark_tokens_EstimatedBits-16 1000000 2530 ns/op 0.40 MB/s
% GODEBUG=asyncpreemptoff=1 ./tip.test -test.bench .\* -test.benchtime=1000000x
Benchmark_tokens_EstimatedBits-16 1000000 680 ns/op 1.47 MB/s
Not sure what's going on here yet, but something about Darwin and signals is bad.
This doesn't seem to happen for other benchmarks, though. Something about this particular benchmark is triggering the bad behavior.
So this really isn't new with 1.14. It just appears more often due to more use of signals.
I've split out the Darwin-specific weirdness into #37174 .
Change https://golang.org/cl/219229 mentions this issue: obj/x86: nop-only, align functions and jumps for better performance
The remaining performance issue is related to unaligned jumps. We will not be making changes to our jumps before the 1.14 release, but likely will address it in a point release.
I'll remove the release-blocker label, and update the milestone to 1.15. Please comment if I am mistaken.
@cherrymui @dr2chase Is this still an issue for 1.15? If so, what should we set the milestone to? Thanks.
benchstat -geomean go1.{13,15}.log
name \ time/op go1.13 go1.15 delta
_tokens_EstimatedBits-4 673ns ± 2% 673ns ± 2% ~ (p=0.577 n=24+24)
Oops, no, because of a finger-fumble that is not correct. Shut down Chrome and the IDE, try again.
Okay, now it's closed. There's a 0.3% slowdown (2 parts in 673), which is close enough.
benchstat -geomean go1.{13,15}.log
name \ time/op go1.13 go1.15 delta
_tokens_EstimatedBits-4 673ns ± 2% 675ns ± 2% ~ (p=0.166 n=24+24)
Most helpful comment
For convenience, here's the benchmark (without testdata and other files in .zip):
token.go (with tokens.EstimatedBits code)
/cc @aclements @randall77 @ianlancetaylor @mknyszek