Test fails on master:
======================================================================
ERROR: test_gluon_model_zoo_gpu.test_training
----------------------------------------------------------------------
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/nose/case.py", line 197, in runTest
self.test(*self.arg)
File "/work/mxnet/tests/python/gpu/../unittest/common.py", line 173, in test_new
orig_test(*args, **kwargs)
File "/work/mxnet/tests/python/gpu/test_gluon_model_zoo_gpu.py", line 161, in test_training
max_val = np.max(np.abs(cpu_out.asnumpy()))
File "/work/mxnet/python/mxnet/ndarray/ndarray.py", line 1980, in asnumpy
ctypes.c_size_t(data.size)))
File "/work/mxnet/python/mxnet/base.py", line 252, in check_call
raise MXNetError(py_str(_LIB.MXGetLastError()))
MXNetError: [20:11:53] src/operator/nn/mkldnn/mkldnn_base.cc:528: Check failed: similar
Stack trace returned 10 entries:
[bt] (0) /work/mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::StackTrace[abi:cxx11]()+0x60) [0x7f0c25d010f0]
[bt] (1) /work/mxnet/python/mxnet/../../lib/libmxnet.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x32) [0x7f0c25d016e2]
[bt] (2) /work/mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::OpCheck::Run(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::TBlob, std::allocator<mxnet::TBlob> > const&)>, nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&)+0x1d61) [0x7f0c25db66c1]
[bt] (3) /work/mxnet/python/mxnet/../../lib/libmxnet.so(+0x506d49b) [0x7f0c28f2049b]
[bt] (4) /work/mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&), void (*)(nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&)>::_M_invoke(std::_Any_data const&, nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&)+0x20) [0x7f0c25e1bb60]
[bt] (5) /work/mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::imperative::PushFComputeEx(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}::operator()(mxnet::RunContext) const+0x150) [0x7f0c292511b0]
[bt] (6) /work/mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void (mxnet::RunContext), mxnet::imperative::PushFComputeEx(std::function<void (nnvm::NodeAttrs const&, mxnet::OpContext const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&, std::vector<mxnet::NDArray, std::allocator<mxnet::NDArray> > const&)> const&, nnvm::Op const*, nnvm::NodeAttrs const&, mxnet::Context const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::engine::Var*, std::allocator<mxnet::engine::Var*> > const&, std::vector<mxnet::Resource, std::allocator<mxnet::Resource> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::NDArray*, std::allocator<mxnet::NDArray*> > const&, std::vector<mxnet::OpReqType, std::allocator<mxnet::OpReqType> > const&)::{lambda(mxnet::RunContext)#1}>::_M_invoke(std::_Any_data const&, mxnet::RunContext&&)+0x1e) [0x7f0c292512fe]
[bt] (7) /work/mxnet/python/mxnet/../../lib/libmxnet.so(+0x5bbe66b) [0x7f0c29a7166b]
[bt] (8) /work/mxnet/python/mxnet/../../lib/libmxnet.so(mxnet::engine::ThreadedEngine::ExecuteOprBlock(mxnet::RunContext, mxnet::engine::OprBlock*)+0xc8f) [0x7f0c29a6c18f]
[bt] (9) /work/mxnet/python/mxnet/../../lib/libmxnet.so(std::_Function_handler<void (std::shared_ptr<dmlc::ManualEvent>), mxnet::engine::ThreadedEnginePerDevice::PushToExecute(mxnet::engine::OprBlock*, bool)::{lambda()#1}::operator()() const::{lambda(std::shared_ptr<dmlc::ManualEvent>)#1}>::_M_invoke(std::_Any_data const&, std::shared_ptr<dmlc::ManualEvent>&&)+0x13a) [0x7f0c29a817fa]
-------------------- >> begin captured logging << --------------------
root: INFO: data/val-5k-256.rec exists, skipping download
common: INFO: Setting test np/mx/python random seeds, use MXNET_TEST_SEED=241519495 to reproduce.
--------------------- >> end captured logging << ---------------------
test_gluon_model_zoo_gpu.test_training out of memory
https://github.com/apache/incubator-mxnet/issues/10323
Flaky test_gluon_model_zoo_gpu.test_training @ Python3: MKLDNN-GPU
https://github.com/apache/incubator-mxnet/issues/9820
@mxnet-label-bot add [Gluon, GPU, Model Zoo, MKLDNN]
@pengxin99 will take a look at this issue :)
@juliusshufan
@pengzhao-intel @TaoLv does this error mean that the output of the CPU model does not match the output of the GPU model?
We ran this test case on our machine and can't reproduce this error. Our environment:
make -j USE_MKLDNN=1 USE_BLAS=mkl USE_OPENCV=1 USE_CUDA=1 USE_CUDNN=1 USE_CUDA_PATH=/usr/local/cuda-9.0
We got an OK result as follows:
[INFO] Setting module np/mx/python random seeds, use MXNET_MODULE_SEED=1237749972 to reproduce.
[WARNING] * test-level seed set: all "@with_seed()" tests run deterministically *
[INFO] Setting test np/mx/python random seeds, use MXNET_TEST_SEED=241519495 to reproduce.
[13:58:44] src/io/iter_image_recordio_2.cc:172: ImageRecordIOParser2: data/val-5k-256.rec, use 1 threads for decoding..
testing resnet18_v1
[13:58:46] src/operator/nn/mkldnn/mkldnn_base.cc:74: Allocate 147456 bytes with malloc directly
[13:58:46] src/operator/nn/mkldnn/mkldnn_base.cc:74: Allocate 8028160 bytes with malloc directly
[13:58:47] src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:97: Running performance tests to find the best convolution algorithm, this can take a while... (setting env variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)
resnet18_v1: CPU 3.3956544, GPU 3.395655
testing densenet121
[13:58:51] src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:97: Running performance tests to find the best convolution algorithm, this can take a while... (setting env variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)
densenet121: CPU 2.911241, GPU 2.9112394
.
Ran 1 test in 8.308s
OK
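For readers following along, here is a minimal, simplified sketch (not the exact test code) of the CPU-vs-GPU output check that test_training relies on; resnet18_v1 is used purely for illustration, and running it requires a CUDA-enabled MXNet build with a visible GPU:
# Simplified sketch: run the same network, with identical parameters, on both
# contexts and compare the outputs under the same tolerances as the test.
import numpy as np
import mxnet as mx
from mxnet.gluon.model_zoo.vision import resnet18_v1
from mxnet.test_utils import assert_almost_equal

ctxs = [mx.cpu(0), mx.gpu(0)]
net = resnet18_v1(pretrained=False)
net.initialize(mx.init.Xavier(), ctx=ctxs)   # same parameters broadcast to both contexts

data = np.random.uniform(size=(2, 3, 224, 224)).astype('float32')
cpu_out = net(mx.nd.array(data, ctx=ctxs[0])).asnumpy()
gpu_out = net(mx.nd.array(data, ctx=ctxs[1])).asnumpy()

# The real test normalizes by the largest CPU output magnitude before comparing
# (see the max_val line in the traceback above).
max_val = np.max(np.abs(cpu_out))
assert_almost_equal(cpu_out / max_val, gpu_out / max_val, rtol=1e-3, atol=1e-3)
If the two contexts diverge beyond rtol/atol, assert_almost_equal raises an AssertionError like the one pasted later in this thread.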
@pengxin99 Though I think you have the right setup, we are testing the build with the following parameters:
build_ubuntu_gpu_mkldnn() {
set -ex
build_ccache_wrappers
make \
DEV=1 \
ENABLE_TESTCOVERAGE=1 \
USE_CPP_PACKAGE=1 \
USE_BLAS=openblas \
USE_MKLDNN=1 \
USE_CUDA=1 \
USE_CUDA_PATH=/usr/local/cuda \
USE_CUDNN=1 \
CUDA_ARCH="$CI_CUDA_COMPUTE_CAPABILITIES" \
-j$(nproc)
}
I see that DEV and USE_BLAS differ. Can this be an issue?
@lebeg I want to test whether DEV and USE_BLAS affect the results, but I cannot compile successfully with these two parameters:
make -j20 DEV=1 ENABLE_TESTCOVERAGE=1 USE_CPP_PACKAGE=1 USE_MKLDNN=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDNN=1 USE_CUDA_PATH=/usr/local/cuda-9.0/
In file included from /home/pengxiny/case_test/incubator-mxnet/3rdparty/mshadow/mshadow/tensor.h:16:0,
from include/mxnet/./base.h:32,
from include/mxnet/operator_util.h:43,
from src/operator/tensor/./elemwise_unary_op.h:28,
from src/operator/tensor/elemwise_binary_broadcast_op_basic.cc:25:
/home/pengxiny/case_test/incubator-mxnet/3rdparty/mshadow/mshadow/./base.h:162:23: fatal error: cblas.h: No such file or directory
#include <cblas.h>
^
Could you tell me what DEV means?
The flag DEV should not be relevant, but for openblas to work you probably need to install OpenBLAS first. For Ubuntu it would be as simple as: sudo apt install libopenblas-dev
@lebeg Thanks. I built with openblas but still can't reproduce this issue. I would like to know whether this issue appears every time in your environment?
make -j20 DEV=1 USE_MKLDNN=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDNN=1 USE_CUDA_PATH=/usr/local/cuda-9.0/
[INFO] Setting module np/mx/python random seeds, use MXNET_MODULE_SEED=1185757949 to reproduce.
[WARNING] * test-level seed set: all "@with_seed()" tests run deterministically *
[INFO] Setting test np/mx/python random seeds, use MXNET_TEST_SEED=241519495 to reproduce.
[10:39:00] src/io/iter_image_recordio_2.cc:172: ImageRecordIOParser2: data/val-5k-256.rec, use 1 threads for decoding..
testing resnet18_v1
[10:39:02] src/operator/nn/mkldnn/mkldnn_base.cc:74: Allocate 147456 bytes with malloc directly
[10:39:02] src/operator/nn/mkldnn/mkldnn_base.cc:74: Allocate 8028160 bytes with malloc directly
[10:39:04] src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:97: Running performance tests to find the best convolution algorithm, this can take a while... (setting env variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)
resnet18_v1: CPU 3.3956556, GPU 3.395655
testing densenet121
[10:39:07] src/operator/nn/./cudnn/./cudnn_algoreg-inl.h:97: Running performance tests to find the best convolution algorithm, this can take a while... (setting env variable MXNET_CUDNN_AUTOTUNE_DEFAULT to 0 to disable)
densenet121: CPU 2.91124, GPU 2.9112394
.
Ran 1 test in 9.157s
OK
Maybe it is not an issue anymore; feel free to open a PR that reverts the disabling of the test.
Flaky test found in an unrelated PR: http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/mxnet-validation%2Funix-gpu/detail/PR-13901/2/pipeline
@pengxin99 could you check this test again?
@pengzhao-intel @ChaiBapchya
A quick reply: I used MXNet built from source (latest version),
build code: make -j20 DEV=1 USE_MKLDNN=1 USE_BLAS=openblas USE_CUDA=1 USE_CUDNN=1 USE_CUDA_PATH=/usr/local/cuda-9.0/
test seed set: MXNET_TEST_SEED=1844955066
and the test passed as follows:
test_gluon_model_zoo_gpu.test_training
... [INFO] Setting test np/mx/python random seeds, use MXNET_TEST_SEED=1844955066 to reproduce.
[16:53:44] src/io/iter_image_recordio_2.cc:172: ImageRecordIOParser2: data/val-5k-256.rec, use 1 threads for decoding..
testing resnet18_v1
resnet18_v1: CPU 5.159359, GPU 5.1593604
testing densenet121
densenet121: CPU 3.6354995, GPU 3.6354992
ok
Ran 2 tests in 25.003s
OK
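For anyone else trying to reproduce this locally, here is a small, hypothetical helper showing how the seed from the run above can be pinned before re-running only this test (assumes nose is installed and that the script is executed from the MXNet source root):
# Hypothetical reproduction helper: pin the reported test seed and re-run
# only test_training via nose.
import os
import nose

os.environ["MXNET_TEST_SEED"] = "1844955066"   # seed from the run above
nose.run(argv=[
    "nosetests", "--verbose",
    "tests/python/gpu/test_gluon_model_zoo_gpu.py:test_training",
])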
Yet another time
Unrelated PR https://github.com/apache/incubator-mxnet/pull/15541
http://jenkins.mxnet-ci.amazon-ml.com/blue/organizations/jenkins/mxnet-validation%2Funix-gpu/detail/PR-15541/2/pipeline
======================================================================
FAIL: test_gluon_model_zoo_gpu.test_training
----------------------------------------------------------------------
Traceback (most recent call last):
File "/usr/local/lib/python2.7/dist-packages/nose/case.py", line 197, in runTest
self.test(*self.arg)
File "/work/mxnet/tests/python/gpu/../unittest/common.py", line 177, in test_new
orig_test(*args, **kwargs)
File "/work/mxnet/tests/python/gpu/test_gluon_model_zoo_gpu.py", line 181, in test_training
rtol=1e-3, atol=1e-3)
File "/work/mxnet/python/mxnet/test_utils.py", line 510, in assert_almost_equal
raise AssertionError(msg)
AssertionError:
Items are not equal:
Error 1.071543 exceeds tolerance rtol=0.001000, atol=0.001000. Location of maximum error:(263,), a=2.995600, b=2.999886
a: array([1.4665551, 1.2490531, 1.574928 , ..., 1.324182 , 1.3332574,
1.6793004], dtype=float32)
b: array([1.4677135, 1.2497671, 1.5763084, ..., 1.3250492, 1.3341433,
1.680894 ], dtype=float32)
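A quick sanity check on the reported numbers, assuming (this is an assumption, not confirmed against the source) that mxnet.test_utils computes the violation as err = |a - b| / (atol + rtol * |b|):
# Assumed error metric; a and b are taken from the assertion message above
# (location of maximum error: (263,)).
a, b = 2.995600, 2.999886
rtol, atol = 1e-3, 1e-3
err = abs(a - b) / (atol + rtol * abs(b))
print(err)   # ~1.0715, matching the reported "Error 1.071543" up to rounding
In other words, the worst element differs by roughly 0.14%, just outside the combined budget of rtol=0.1% plus atol=0.001.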
@TaoLv please have Ruilin take a look at this issue.
@ruilinzhuintel
@pengzhao-intel @ChaiBapchya @TaoLv
Sorry for the late reply. I re-ran this test case on the latest MXNet repo.
My hardware environment:
CPU: Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz
GPU: Tesla P100
My MXNet build environment:
CPATH=/usr/include/openblas make -j USE_MKLDNN=1 USE_BLAS=openblas USE_OPENCV=1 USE_CUDA=1 USE_CUDNN=1 USE_CUDA_PATH=/usr/local/cuda-9.0
MXNET_TEST_SEED=1844955066
And the test passed, as shown below:
.[INFO] Setting test np/mx/python random seeds, use MXNET_TEST_SEED=1844955066 to reproduce.
[15:13:42] src/io/iter_image_recordio_2.cc:178: ImageRecordIOParser2: data/val-5k-256.rec, use 1 threads for decoding..
testing resnet18_v1
resnet18_v1: CPU 5.159359, GPU 5.1593604
testing densenet121
densenet121: CPU 3.635501, GPU 3.6354992.
Ran 2 tests in 24.117s
OK
@wuxun-zhang please double-check whether the issue still exists, thanks.
@pengzhao-intel Sure. Will take a look at this.
@pengzhao-intel Just tested with mxnet master (commit: ef19b09c297f5dbc100b1283c401074231f883d1) on V100 and cannot reproduce this issue.
Alright, then we can close this (can reopen if it resurfaces)