The refcount ops emitted by this code are split across basic blocks. Numba's refcount pruning pass is not able to prune them.
import numba
import numpy as np
from timeit import default_timer as timer
def bench(nrt):
    """Time ten calls of a jitted nested-loop kernel for one `_nrt` setting.

    Prints a header line naming the setting, then one line per call with
    the elapsed time in milliseconds.

    Args:
        nrt: Value forwarded unchanged to ``numba.jit`` as its ``_nrt``
            keyword (presumably toggles Numba's reference-counting
            runtime -- confirm against the Numba version in use).
    """
    print("set _nrt=%s" % nrt)
    # One decorator shared by every kernel below, so all of them are
    # compiled with the same nopython/nogil/_nrt options.
    jit = numba.jit(nopython=True, nogil=True, _nrt=nrt)

    # Adds 1 to the cell at (y, x) when `field` is not NaN.
    @jit
    def _append_non_na(x, y, agg, field):
        if not np.isnan(field):
            agg[y, x] += 1

    # When `field` is not NaN: overwrites the cell with `field` if the
    # cell currently holds NaN, otherwise adds `field` to it.
    @jit
    def _append(x, y, agg, field):
        if not np.isnan(field):
            if np.isnan(agg[y, x]):
                agg[y, x] = field
            else:
                agg[y, x] += field

    # Applies both updates to the same cell, one after the other.
    @jit
    def append(x, y, agg, field):
        _append_non_na(x, y, agg, field)
        _append(x, y, agg, field)

    # Visits every cell of the 2-D array; `i` indexes axis 0 and `j`
    # indexes axis 1, and they are passed to `append` as (x=j, y=i).
    @jit
    def extend(arr, field):
        for i in range(arr.shape[0]):
            for j in range(arr.shape[1]):
                append(j, i, arr, field)

    # The same 10x100 array is reused for all ten timed calls, so the
    # cell values keep accumulating from one call to the next.
    arr = np.ones((10, 100))
    for i in range(10):
        # NOTE(review): the first call presumably includes JIT compilation
        # time, since no signature is given to `jit` -- confirm.
        ts = timer()
        extend(arr, 10)
        te = timer()
        # `timer` differences are in seconds; scale to milliseconds.
        print("%.5fms" % ((te - ts) * 1000))
    # Debugging aid, left disabled:
    # extend.inspect_cfg(extend.signatures[0]).display(view=True)
# Run the benchmark twice: first with `_nrt=True`, then with `_nrt=False`.
for _nrt_flag in (True, False):
    bench(nrt=_nrt_flag)
Output
set _nrt=True
224.90708ms
0.02676ms
0.02478ms
0.02456ms
0.02460ms
0.02464ms
0.02724ms
0.02477ms
0.02457ms
0.02448ms
set _nrt=False
157.52096ms
0.00259ms
0.00072ms
0.00065ms
0.00062ms
0.00062ms
0.00065ms
0.00062ms
0.00065ms
0.00066ms
With the patch #2352, the new numbers are:
set _nrt=True
259.69325ms
0.00284ms
0.00110ms
0.00089ms
0.00083ms
0.00080ms
0.00082ms
0.00080ms
0.00083ms
0.00081ms
set _nrt=False
156.33411ms
0.00239ms
0.00079ms
0.00066ms
0.00065ms
0.00063ms
0.00067ms
0.00061ms
0.00065ms
0.00061ms
Most helpful comment
With the patch #2352, the new numbers are: