Calling a hybridized Gluon network concurrently from multiple Python threads (one per GPU) crashes with AttributeError: 'NoneType' object has no attribute '__exit__' inside Gluon's name-scope handling during _build_cache. Environment info, a reproduction script, and the full tracebacks are below.
----------Python Info----------
('Version :', '2.7.15')
('Compiler :', 'GCC 7.2.0')
('Build :', ('default', 'May 1 2018 23:32:55'))
('Arch :', ('64bit', ''))
------------Pip Info-----------
('Version :', '18.1')
('Directory :', '/home/kohill/anaconda2/lib/python2.7/site-packages/pip')
----------MXNet Info-----------
('Version :', '1.3.0')
('Directory :', '/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet')
('Commit Hash :', 'b3be92f4a48bce62a5a8424271871c2f81c8f7f1')
----------System Info----------
('Platform :', 'Linux-4.15.0-36-generic-x86_64-with-debian-stretch-sid')
('system :', 'Linux')
('node :', 'heils-server')
('release :', '4.15.0-36-generic')
('version :', '#39~16.04.1-Ubuntu SMP Tue Sep 25 08:59:23 UTC 2018')
----------Hardware Info----------
('machine :', 'x86_64')
('processor :', 'x86_64')
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
CPU(s): 40
On-line CPU(s) list: 0-39
Thread(s) per core: 2
Core(s) per socket: 10
Socket(s): 2
NUMA node(s): 2
Vendor ID: GenuineIntel
CPU family: 6
Model: 63
Model name: Intel(R) Xeon(R) CPU E5-2650 v3 @ 2.30GHz
Stepping: 2
CPU MHz: 1197.546
CPU max MHz: 3000.0000
CPU min MHz: 1200.0000
BogoMIPS: 4591.39
Virtualization: VT-x
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 25600K
NUMA node0 CPU(s): 0-9,20-29
NUMA node1 CPU(s): 10-19,30-39
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush dts acpi mmx fxsr sse sse2 ss ht tm pbe syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon pebs bts rep_good nopl xtopology nonstop_tsc cpuid aperfmperf pni pclmulqdq dtes64 monitor ds_cpl vmx smx est tm2 ssse3 sdbg fma cx16 xtpr pdcm pcid dca sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand lahf_lm abm cpuid_fault epb invpcid_single pti intel_ppin ssbd ibrs ibpb stibp tpr_shadow vnmi flexpriority ept vpid fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid cqm xsaveopt cqm_llc cqm_occup_llc dtherm ida arat pln pts flush_l1d
----------Network Test----------
Setting timeout: 10
Timing for MXNet: https://github.com/apache/incubator-mxnet, DNS: 0.0046 sec, LOAD: 0.8261 sec.
Timing for PYPI: https://pypi.python.org/pypi/pip, DNS: 0.0039 sec, LOAD: 3.3043 sec.
Timing for FashionMNIST: https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/fashion-mnist/train-labels-idx1-ubyte.gz, DNS: 0.3213 sec, LOAD: 1.9182 sec.
Timing for Conda: https://repo.continuum.io/pkgs/free/, DNS: 0.0029 sec, LOAD: 0.6460 sec.
Timing for Gluon Tutorial(en): http://gluon.mxnet.io, DNS: 0.0032 sec, LOAD: 1.9723 sec.
Timing for Gluon Tutorial(cn): https://zh.gluon.ai, DNS: 0.5289 sec, LOAD: 1.0473 sec.
Package used (Python/R/Scala/Julia): Python
/home/kohill/anaconda2/bin/python /home/kohill/Desktop/mx-detection/bug_example.py
Model file is not found. Downloading.
Downloading /home/kohill/.mxnet/models/resnet18_v1b-2d9d980c.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/resnet18_v1b-2d9d980c.zip...
42433KB [00:46, 909.00KB/s]
/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py:421: UserWarning: load_params is deprecated. Please use load_parameters.
warnings.warn("load_params is deprecated. Please use load_parameters.")
Exception in thread Thread-5:
Traceback (most recent call last):
File "/home/kohill/anaconda2/lib/python2.7/threading.py", line 801, in __bootstrap_inner
self.run()
File "/home/kohill/anaconda2/lib/python2.7/threading.py", line 754, in run
self.__target(*self.__args, **self.__kwargs)
File "/home/kohill/Desktop/mx-detection/bug_example.py", line 10, in worker
outnd = module(inputs) # type: mx.nd.NDArray
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 541, in __call__
out = self.forward(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 908, in forward
return self._call_cached_op(x, *args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 798, in _call_cached_op
self._build_cache(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 750, in _build_cache
data, out = self._get_graph(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 742, in _get_graph
out = self.hybrid_forward(symbol, *grouped_inputs, **params) # pylint: disable=no-value-for-parameter
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 85, in __exit__
self._name_scope.__exit__(ptype, value, trace)
AttributeError: 'NoneType' object has no attribute '__exit__'
Exception in thread Thread-3:
Traceback (most recent call last):
File "/home/kohill/anaconda2/lib/python2.7/threading.py", line 801, in __bootstrap_inner
self.run()
File "/home/kohill/anaconda2/lib/python2.7/threading.py", line 754, in run
self.__target(*self.__args, **self.__kwargs)
File "/home/kohill/Desktop/mx-detection/bug_example.py", line 10, in worker
outnd = module(inputs) # type: mx.nd.NDArray
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 541, in __call__
out = self.forward(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 908, in forward
return self._call_cached_op(x, *args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 798, in _call_cached_op
self._build_cache(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 750, in _build_cache
data, out = self._get_graph(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 742, in _get_graph
out = self.hybrid_forward(symbol, *grouped_inputs, **params) # pylint: disable=no-value-for-parameter
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 85, in __exit__
self._name_scope.__exit__(ptype, value, trace)
AttributeError: 'NoneType' object has no attribute '__exit__'
Exception in thread Thread-4:
Traceback (most recent call last):
File "/home/kohill/anaconda2/lib/python2.7/threading.py", line 801, in __bootstrap_inner
self.run()
File "/home/kohill/anaconda2/lib/python2.7/threading.py", line 754, in run
self.__target(*self.__args, **self.__kwargs)
File "/home/kohill/Desktop/mx-detection/bug_example.py", line 10, in worker
outnd = module(inputs) # type: mx.nd.NDArray
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 541, in __call__
out = self.forward(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 908, in forward
return self._call_cached_op(x, *args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 798, in _call_cached_op
self._build_cache(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 750, in _build_cache
data, out = self._get_graph(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 742, in _get_graph
out = self.hybrid_forward(symbol, *grouped_inputs, **params) # pylint: disable=no-value-for-parameter
File "/home/kohill/anaconda2/lib/python2.7/site-packages/gluoncv/model_zoo/resnetv1b.py", line 235, in hybrid_forward
x = self.conv1(x)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 541, in __call__
out = self.forward(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 925, in forward
return self.hybrid_forward(symbol, x, *args, **params)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 85, in __exit__
self._name_scope.__exit__(ptype, value, trace)
AttributeError: 'NoneType' object has no attribute '__exit__'
Exception in thread Thread-2:
Traceback (most recent call last):
File "/home/kohill/anaconda2/lib/python2.7/threading.py", line 801, in __bootstrap_inner
self.run()
File "/home/kohill/anaconda2/lib/python2.7/threading.py", line 754, in run
self.__target(*self.__args, **self.__kwargs)
File "/home/kohill/Desktop/mx-detection/bug_example.py", line 10, in worker
outnd = module(inputs) # type: mx.nd.NDArray
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 541, in __call__
out = self.forward(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 908, in forward
return self._call_cached_op(x, *args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 798, in _call_cached_op
self._build_cache(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 750, in _build_cache
data, out = self._get_graph(*args)
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 742, in _get_graph
out = self.hybrid_forward(symbol, *grouped_inputs, **params) # pylint: disable=no-value-for-parameter
File "/home/kohill/anaconda2/lib/python2.7/site-packages/mxnet/gluon/block.py", line 85, in __exit__
self._name_scope.__exit__(ptype, value, trace)
AttributeError: 'NoneType' object has no attribute '__exit__'
Process finished with exit code 0
import gluoncv
import threading
import mxnet as mx

net = gluoncv.model_zoo.resnet18_v1b(pretrained=True)
net.hybridize()
ctx_list = [mx.gpu(x) for x in [0, 1, 2, 3]]

def worker(module, inputs, i, lock, outputs):
    outnd = module(inputs)  # type: mx.nd.NDArray
    outnd.wait_to_read()
    with lock:
        outputs[i] = outnd

threads = []
outputs = []
lock = threading.Lock()
for i in range(len(ctx_list)):
    thread = threading.Thread(target=worker, args=(net, mx.random.randn(1, 3, 368, 368), i, lock, outputs))
    threads.append(thread)
for thread in threads:
    thread.start()
for thread in threads:
    thread.join()
Steps to reproduce:
1. Save the script above as bug_example.py.
2. Run python bug_example.py on a machine with at least 4 GPUs.
Workaround: run one forward pass before starting the threads.
The latest version of MXNet also has this bug.
@mxnet-label-bot add [Gluon, Thread Safety, Bug]
@kohillyang
Thanks for submitting the issue. I have added the labels so that community members can provide the help.
I'm facing the same problem. Has this issue been fixed?
MXNet in general is not thread-safe. You can accomplish the above using multiprocessing:
import multiprocessing as mp
import gluoncv
import mxnet as mx

net = gluoncv.model_zoo.resnet18_v1b(pretrained=True)
net.hybridize()

def worker(module, data, outputs):
    outnd = module(data)  # type: mx.nd.NDArray
    outnd.wait_to_read()
    outputs.put(outnd)

ps = []
outputs = mp.Queue(5)
for i in range(3):
    input1 = mx.random.randn(1, 3, 368, 368)
    p = mp.Process(target=worker, args=(net, input1, outputs))
    ps.append(p)
for p in ps:
    p.start()
for p in ps:
    p.join()
while not outputs.empty():
    print(outputs.get().shape)
But unlike PyTorch, it is not possible to optimize the network across workers when using Process instead. An inconvenient workaround I found is to run inference once before handing the network to the sub-threads.
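For reference, here is a minimal sketch of that workaround (the input shape is just a placeholder taken from the script above):

import threading
import gluoncv
import mxnet as mx

net = gluoncv.model_zoo.resnet18_v1b(pretrained=True)
net.hybridize()

# Warm-up: one forward pass on the main thread, so the cached graph is
# built before any worker thread calls the network.
dummy_input = mx.random.randn(1, 3, 368, 368)
net(dummy_input).wait_to_read()

def worker(data):
    net(data).wait_to_read()

threads = [threading.Thread(target=worker, args=(mx.random.randn(1, 3, 368, 368),))
           for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()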
@kohillyang
In my opinion, supporting multi-threading in Python would hurt performance, because we would need to add locks to keep MXNet thread-safe.
I think it's better to use multi-processing in Python: because of the GIL, Python threads only give a fake form of multi-threading anyway. We could pass NDArray objects through a Pipe, as the Gluon DataLoader does.
Could you please provide some projects which use multi-threading to optimize a network? We may support multi-threading in Python if it is necessary. Thank you!
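For illustration, a minimal sketch of passing an NDArray between processes through a Pipe; converting through NumPy on both ends is my assumption here, not necessarily what DataLoader does internally:

import multiprocessing as mp
import mxnet as mx

def producer(conn):
    # Serialize through NumPy so the child does not share MXNet state
    # with the parent process.
    arr = mx.nd.ones((2, 3))
    conn.send(arr.asnumpy())
    conn.close()

if __name__ == '__main__':
    parent_conn, child_conn = mp.Pipe()
    p = mp.Process(target=producer, args=(child_conn,))
    p.start()
    out = mx.nd.array(parent_conn.recv())  # reconstruct an NDArray
    p.join()
    print(out.shape)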
I submitted a PR just now, which may support a multi-threading environment for Gluon:
https://github.com/apache/incubator-mxnet/pull/14344
BTW, in the test case in this issue, outputs should be initialized as outputs = [None for _ in range(len(ctx_list))]; otherwise outputs[i] = outnd fails with an IndexError on the empty list.
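For clarity, the corrected setup lines in the reproduction script would read:

threads = []
outputs = [None for _ in range(len(ctx_list))]  # one slot per context, instead of []
lock = threading.Lock()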
@wkcn, Glad to see this issue is going to be resolved.
One case where threading is needed is when some operators are written in numpy, especially when the batch size is small, which is common in object detection. According to my test on 8x P40, multi-threading gives a speed improvement of about 15%.
Anyway, at least according to my test, MXNet already supports multi-threaded training; for example, https://github.com/dmlc/gluon-cv/blob/master/gluoncv/utils/parallel.py, and https://github.com/dmlc/gluon-cv/blob/master/scripts/segmentation/train.py uses parallel.py to speed training up. There may be no extra work needed.
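In spirit, the pattern in parallel.py is one Python thread per GPU that scatters the inputs and gathers the outputs; a simplified sketch of that pattern (not the actual gluon-cv API; error handling omitted) looks like this:

import threading
import mxnet as mx

def parallel_apply(net, inputs, ctx_list):
    # One worker thread per (input, ctx) pair; results are gathered in order.
    results = [None] * len(inputs)
    lock = threading.Lock()

    def _worker(i, x, ctx):
        out = net(x.as_in_context(ctx))
        out.wait_to_read()  # force the computation inside this thread
        with lock:
            results[i] = out

    threads = [threading.Thread(target=_worker, args=(i, x, ctx))
               for i, (x, ctx) in enumerate(zip(inputs, ctx_list))]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return results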
@kohillyang Thank you!
In object detection, do you mean that the proposal and proposal_target layers are custom operators written in numpy, and that using multi-threading to execute these NumPy operators in parallel can accelerate them?
Yes. Another case is code written as a Block rather than a HybridBlock, when it can hardly be packaged into a single operator and asnumpy is called (sometimes because dynamic shape inference is almost impossible). In that case, if more than one GPU is used and multi-threading is not available, the network cannot easily be parallelized.
Since dynamic networks are becoming more and more popular, I think supporting multi-threading is needed.
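For instance, a hypothetical Block of that kind (the thresholding logic is made up; the point is the asnumpy() call and the data-dependent output shape, which prevent hybridization):

import mxnet as mx
from mxnet import gluon

class TopScores(gluon.Block):
    def forward(self, scores, threshold=0.5):
        # asnumpy() leaves the MXNet graph, so this cannot be a HybridBlock.
        mask = scores.asnumpy() > threshold
        keep = mask.nonzero()[0]  # data-dependent length: dynamic shape
        # Assumes at least one score passes the threshold.
        return mx.nd.take(scores, mx.nd.array(keep, ctx=scores.context))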
@kohillyang Hi! Could you please provide example code showing how to run an operator written in numpy in parallel? Thanks!
I see. There is only one thread executing custom operators.
Did you modify src/operator/custom/custom-inl.h to support multi-threading?
I didn't modify src/operator/custom/custom-inl.h, but there can be more than one thread executing a custom operator. I mean, although there is only one network, it has an independent copy on each GPU, so the copies can be treated as several independent networks when forwarding. If we have n GPUs, we launch n threads, one per GPU, to run inference and back-propagation on these copies; there should then be n threads executing the custom operator. Since the GIL is released while C++ code runs, and since, to the best of my knowledge, MXNet has no lock that forces custom operators onto a single thread in this case, multi-threading can speed these operators up.
But I'm not sure whether MXNet forces only one thread to execute custom operators.
@kohillyang
I submitted a PR to support multi-threading for Custom Operator
https://github.com/apache/incubator-mxnet/pull/14363, but I do not know how you accelerated it without modifying custom-inl.h.
Could you upload your code? Did you use Python multi-threading to implement it?
@wkcn
I have to admit that I was wrong. I tried to write a small test case and found it is impossible to run the same CustomOp in different threads. Worse, I found my program sometimes crashes when multi-threading is used.
Here is my test code:
import os
# os.environ["MXNET_ENGINE_TYPE"] = "NaiveEngine"
import mxnet as mx
import time
import threading
import numpy as np
import cv2

cv2.setNumThreads(1)  # Sometimes we need this to avoid deadlock, especially in multi-processing environments.

class TestOP(mx.operator.CustomOp):
    def __init__(self, *args, **kwargs):
        super(TestOP, self).__init__(*args, **kwargs)
        print("init")

    def forward(self, is_train, req, in_data, out_data, aux):
        try:
            x = in_data[0].asnumpy()
            print("ss")
            x = np.ones(shape=(1024, 1024, 300))
            x_resized = cv2.resize(x, (0, 0), fx=0.5, fy=0.5)
            x_resized_sum = x_resized.sum()
            print('ee', x_resized_sum)
        except Exception as e:
            print(e)

@mx.operator.register("test_op")
class TestOPProp(mx.operator.CustomOpProp):
    def __init__(self):
        super(TestOPProp, self).__init__()

    def list_arguments(self):
        return ['x']

    def list_outputs(self):
        return ['y']

    def infer_shape(self, in_shape):
        return in_shape, in_shape

    def create_operator(self, ctx, shapes, dtypes):
        return TestOP()

ctx_list = [mx.gpu(x) for x in [0, 1, 2, 3]]
x_list = [mx.nd.ones(shape=(1, 2), ctx=c) for c in ctx_list]
data = mx.sym.var(name="data")
y = mx.sym.Custom(data, op_type="test_op")
y = mx.sym.identity(y, name="identity")
sym_block = mx.gluon.SymbolBlock(outputs=y, inputs=data)
sym_block.collect_params().reset_ctx(ctx_list)

def forward(x, ctx):
    # print("enter", x)
    re = sym_block(x)
    re.wait_to_read()
    # print("exit")
    return re

# for x, c in zip(x_list, ctx_list):
#     forward(x, c)
# mx.nd.waitall()

threads = []
for x, c in zip(x_list, ctx_list):
    t = threading.Thread(target=forward, args=(x, c))
    t.daemon = True
    threads.append(t)  # keep a reference so the join loop below actually waits
    t.start()

for t in threads:
    t.join()
mx.nd.waitall()
It crashes without any exception or output.
If the line print("enter", x) is uncommented, it does not crash, but the CPU usage is less than 100% and the outputs appear in order, so I am sure there is only one thread executing the CustomOp.
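(For anyone reproducing this: a quick way to check how many threads enter the operator is to log the thread name inside forward. This is hypothetical instrumentation, replacing the forward body of TestOP above; the assign call just copies the input through.)

import threading
import mxnet as mx

class TestOP(mx.operator.CustomOp):
    def forward(self, is_train, req, in_data, out_data, aux):
        # If only one thread name ever appears across all GPUs,
        # CustomOp execution is single-threaded.
        print("CustomOp running on", threading.current_thread().name)
        self.assign(out_data[0], req[0], in_data[0])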
Thanks for your report! I will check it.
I have closed the previous PR, since I found it is too complex to support multi-threading. The issue is still under consideration.
There are some bugs when running MXNet with multi-threading or multi-processing, e.g.
https://github.com/apache/incubator-mxnet/issues/14396