alibaba / x-deeplearning

An industrial deep learning framework for high-dimension sparse data

Geek Repo:Geek Repo

Github PK Tool:Github PK Tool

xdl-algorithm-solution\TDMServing\evaluation 部署运行(可执行程序&动态库)打分异常,已解决,分享下

ustcdane opened this issue · comments

背景
训练TDM模型用的是官方提供的XDL1.1 GPU版本 ubuntu-gpu-mxnet1.3 ,模型转换及TDMServing用的是官方给的1.2 master版本,模型转换 按照官方教程, 其中model_converter_example.sh 需要针对训练时网络结构改写graph_ulf.txt。
blaze编译方式:
#cmake .. -DUSE_CUDA=0 -DUSE_MKL=1 -DMKL_PATH=/search/Daniel/intel -DSYMBOL_EXPORT_CTL=0 && make -j64 && make install

TDMServing 编译方式:
cmake .. && make -j32 && make install

出现的问题
1. 编译机器上运行 ,线下线上打分一致
编译机器环境:CentOS Linux release 7.4.1708 (Core) gcc 版本 4.8.5 20150623 (Red Hat 4.8.5-39) (GCC)
xdl-algorithm-solution/TDMServing/build/evaluation
./tdm_evaluation
输出打分正常,如下:
score:0.992156
score:0.991907
score:0.990124
score:0.989871
score:0.989138
score:0.989032
score:0.988131
score:0.987107
score:0.985953
score:0.984634
score:0.983796
score:0.983113
score:0.982551
score:0.982071
score:0.982002
score:0.981920
score:0.981669
score:0.981479
score:0.981422
score:0.980451
score:0.980342

2. 把 tdm_evaluation 及其依赖的动态库拷贝到其它机器上运行,出现打分都为1的情况
新机器环境:CentOS Linux release 7.6.1810 (Core) gcc version 4.8.5 20150623 (Red Hat 4.8.5-36)
从编译机器拷贝的依赖动态库:
libs/
├── libblaze.so
├── libglog.so.0
├── libiomp5.so
├── libmkl_avx2.so
├── libmkl_avx512_mic.so
├── libmkl_avx512.so
├── libmkl_avx.so
├── libmkl_core.so
├── libmkl_def.so
├── libmkl_gf_ilp64.so
├── libmkl_gf_lp64.so
├── libmkl_gnu_thread.so
├── libmkl_intel_ilp64.so
├── libmkl_intel_lp64.so
├── libmkl_intel_thread.so
├── libmkl_mc3.so
├── libmkl_mc.so
├── libmkl_rt.so
├── libmkl_sequential.so
├── libmkl_tbb_thread.so
├── libmkl_vml_avx2.so
├── libmkl_vml_avx512_mic.so
├── libmkl_vml_avx512.so
├── libmkl_vml_avx.so
├── libmkl_vml_cmpt.so
├── libmkl_vml_def.so
├── libmkl_vml_mc2.so
├── libmkl_vml_mc3.so
├── libmkl_vml_mc.so
├── libprotobuf.so.16
└── libtdm_serving.so
打分结果出现异常:
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
score:1.000000
自己通过多次尝试,和几个用XDL同学在自己机器上的尝试,在新机器上打分全是1.

3. 解决方法
怀疑可能是依赖的动态库的原因。随后自己在TDMServing下添加了一个目录 online_test(目录结构如下),并在下面的 CMakeLists.txt 中限定了依赖库的路径。

TDMServing/
├── benchmark
├── build
├── cmake
├── CMakeLists.txt
├── docs
├── evaluation
├── examples
├── LICENSE
├── my_install
├── online_test
│   ├── build
│   ├── CMakeLists.txt
│   └── tdm_evaluation.cpp
├── tdm-serving
├── test
└── tools

注意: 我blaze和TDMServing install 目录有所改动
x-deeplearning/blaze/CMakeLists.txt
#set(CMAKE_INSTALL_PREFIX /opt/blaze/)
set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}/blaze_install/blaze/)

x-deeplearning/xdl-algorithm-solution/TDMServing/CMakeLists.txt
#set(CMAKE_INSTALL_PREFIX /opt/tdm-serving/)
set(CMAKE_INSTALL_PREFIX ${PROJECT_SOURCE_DIR}/my_install)
install(DIRECTORY build/proto/ DESTINATION proto)

CMakeLists.txt 内容如下:

# Standalone build of tdm_evaluation.cpp against already-installed
# glog / blaze / TDMServing / MKL / protobuf libraries.
cmake_minimum_required (VERSION 2.8)

project (tdm_serving_test)

enable_testing()

#The default tdm-serving install prefix
set(CMAKE_INSTALL_PREFIX /search/Daniel/XDL/x-deeplearning/xdl-algorithm-solution/TDMServing/online_test/my_install)

# Echo the configuration switches so they show up in the configure log.
message(STATUS "BUILD_UNIT_TESTS=${BUILD_UNIT_TESTS}")
message(STATUS "WITH_DEBUG_SYMBOLS=${WITH_DEBUG_SYMBOLS}")
message(STATUS "CMAKE_INSTALL_PREFIX=${CMAKE_INSTALL_PREFIX}")

# Default to a Release build, but only when the user did not request a build
# type (e.g. -DCMAKE_BUILD_TYPE=Debug). Forcing Release unconditionally would
# make the Debug flags below unreachable.
if(NOT CMAKE_BUILD_TYPE)
  set(CMAKE_BUILD_TYPE "Release")
endif()

# Common compiler flags: warnings, position-independent code, C++11, the
# pre-C++11 libstdc++ ABI, and OpenMP.
set(CMAKE_CXX_FLAGS "-Wall -W -fPIC -Wno-unused-but-set-variable -std=c++11 -D_GLIBCXX_USE_CXX11_ABI=0 -fopenmp ")
set(CMAKE_CXX_FLAGS_DEBUG " -O0")
set(CMAKE_CXX_FLAGS_RELEASE " -O2 -DNDEBUG")

# Optionally add debug symbols on top of whichever build type is active.
if(WITH_DEBUG_SYMBOLS)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g")
endif()

# glog: locate the header directory and the library. The local install tree
# is searched in addition to CMake's default locations.
find_path(
  GLOG_INCLUDE_PATH
  NAMES glog/logging.h
  PATHS /search/Daniel/glog_install/include
)
find_library(
  GLOG_LIB
  NAMES glog
  PATHS /search/Daniel/glog_install/lib
)

# Abort configuration unless both the headers and the library were found.
if(NOT (GLOG_INCLUDE_PATH AND GLOG_LIB))
  message(FATAL_ERROR "Fail to find glog")
endif()
message("-- Found glog: " ${GLOG_LIB})

# blaze
# Built and installed beforehand with:
#   cmake .. -DUSE_CUDA=0 -DUSE_MKL=1 -DMKL_PATH=/search/Daniel/intel -DSYMBOL_EXPORT_CTL=0 && make -j64 && make install
# "make install" populates blaze_install/blaze with: bin conf include lib tools
find_path(
  BLAZE_INCLUDE_PATH
  NAMES blaze/include/predictor.h
  PATHS /search/Daniel/XDL/x-deeplearning/blaze/blaze_install/
)
# The shared library is used here; lib/libblaze_static.a is the static
# alternative that was previously tried.
find_library(
  BLAZE_LIB
  NAMES lib/libblaze.so
  PATHS /search/Daniel/XDL/x-deeplearning/blaze/blaze_install/blaze
)

# Abort configuration unless both the headers and the library were found.
if(NOT (BLAZE_INCLUDE_PATH AND BLAZE_LIB))
  message(FATAL_ERROR "Fail to find blaze")
endif()

message("-- Found blaze: " ${BLAZE_LIB})

# TDMServing
# Built and installed beforehand with:
#   cmake .. && make -j32 && make install
# "make install" populates my_install with: include lib proto
find_path(
  TDMServing_INCLUDE_PATH
  NAMES include/search_manager.h
  PATHS /search/Daniel/XDL/x-deeplearning/xdl-algorithm-solution/TDMServing/my_install/
)
find_library(
  TDMServing_LIB
  NAMES lib/libtdm_serving.so
  PATHS /search/Daniel/XDL/x-deeplearning/xdl-algorithm-solution/TDMServing/my_install/
)

# Abort configuration unless both the headers and the library were found.
if(NOT (TDMServing_INCLUDE_PATH AND TDMServing_LIB))
  message(FATAL_ERROR "Fail to find TDMServing")
endif()

message("-- Found TDMServing: " ${TDMServing_LIB})

# MKL and Intel OpenMP runtime: NO_DEFAULT_PATH restricts the search to the
# listed directory only, so a copy installed elsewhere on the system is never
# picked up.
find_library(MKL_INTEL_CORE
  NAMES libmkl_core.so
  PATHS /search/Daniel/intel/mkl/lib/intel64/
  NO_DEFAULT_PATH)
find_library(MKL_INTEL_LP64
  NAMES libmkl_intel_lp64.so
  PATHS /search/Daniel/intel/mkl/lib/intel64/
  NO_DEFAULT_PATH)
find_library(MKL_INTEL_IOMP5
  NAMES libiomp5.so
  PATHS /search/Daniel/intel/compilers_and_libraries_2019.3.199/linux/compiler/lib/intel64_lin/
  NO_DEFAULT_PATH)

# Fail early with a clear message instead of passing a *-NOTFOUND value on
# to target_link_libraries().
if((NOT MKL_INTEL_CORE) OR (NOT MKL_INTEL_LP64) OR (NOT MKL_INTEL_IOMP5))
  message(FATAL_ERROR "Fail to find MKL")
endif()

message("-- Found MKL CORE: " ${MKL_INTEL_CORE})
message("-- Found MKL LP64: " ${MKL_INTEL_LP64})
message("-- Found INTEL IOMP5: " ${MKL_INTEL_IOMP5})

# protobuf: likewise restricted to /usr/local/lib/.
find_library(PROTO_L
  NAMES libprotobuf.so
  PATHS /usr/local/lib/
  NO_DEFAULT_PATH)
if(NOT PROTO_L)
  message(FATAL_ERROR "Fail to find protobuf")
endif()
message("-- Found protobuf: " ${PROTO_L})

# Header search path for the executable: build tree, this directory, and the
# TDMServing install/source trees one level up.
include_directories(
  ${PROJECT_BINARY_DIR}/
  ${PROJECT_SOURCE_DIR}/
  ${PROJECT_SOURCE_DIR}/../my_install/
  ${PROJECT_SOURCE_DIR}/../tdm-serving/
)
# PROTOBUF_INCLUDE_DIRS is not set anywhere in this file; only add it when it
# was supplied from outside. Unguarded, "${PROTOBUF_INCLUDE_DIRS}/" expands
# to "/" and puts the filesystem root on the include path.
if(PROTOBUF_INCLUDE_DIRS)
  include_directories(${PROTOBUF_INCLUDE_DIRS}/)
endif()
include_directories(
  ${GLOG_INCLUDE_PATH}/
  ${BLAZE_INCLUDE_PATH}/
  ${TDMServing_INCLUDE_PATH}
)

# Single-source executable named after the project.
set(UTEST_SOURCE_FILES ${PROJECT_SOURCE_DIR}/tdm_evaluation.cpp)
add_executable(${PROJECT_NAME} ${UTEST_SOURCE_FILES})

# Link against the explicitly located libraries. PROTOBUF_LIB is not set in
# this file (it expands to nothing unless supplied externally); PROTO_L is
# the protobuf library located by find_library().
target_link_libraries(${PROJECT_NAME}
  ${GLOG_LIB}
  ${PROTOBUF_LIB}
  ${BLAZE_LIB}
  ${TDMServing_LIB}
  ${MKL_INTEL_CORE}
  ${MKL_INTEL_LP64}
  ${MKL_INTEL_IOMP5}
  ${PROTO_L}
)

cmake .. && make -j8

跨机器运行:
# Run from the directory that contains libs/ (the bundled shared libraries).
pw=$(pwd)
deps="$pw/libs"
# Preload the MKL runtime dispatcher from the bundled copy.
export LD_PRELOAD="$deps/libmkl_rt.so"
# Append the bundled libraries; avoid a leading ':' (which would add the
# current directory to the search path) when LD_LIBRARY_PATH is empty.
export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:+$LD_LIBRARY_PATH:}$deps"
运行 ./tdm_evaluation
输出打分正常,如下:
score:0.992156
score:0.991907
score:0.990124
score:0.989871
score:0.989138
score:0.989032
score:0.988131
score:0.987107
score:0.985953
score:0.984634
score:0.983796
score:0.983113
score:0.982551
score:0.982071
score:0.982002
score:0.981920
score:0.981669
score:0.981479
score:0.981422
score:0.980451
score:0.980342
在如下两台其它机器上都能正常打分了。
CentOS Linux release 7.6.1810 (Core) gcc version 4.8.5 20150623 (Red Hat 4.8.5-36)
CentOS Linux release 7.1.1503 (Core) gcc 版本 4.8.3 20140911 (Red Hat 4.8.3-9) (GCC)

总结
目前出现这种错误的原因还不是很明确,请了解这种状况的同学给个解释,谢谢~~

我这边根据以上的修改,使用的master最新版本的blaze代码编译,和线下(官方镜像提供的xdl1.1)打分也对齐了,赞。具体底层原因还不太清楚。