You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Reproduces the problem - code/configuration sample
自定义配置文件如下:
from opencompass.runners import DLCRunner, LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base
from opencompass.partitioners import SizePartitioner, NaivePartitioner
with read_base():
    from .datasets.mmlu.mmlu_clean_ppl import mmlu_datasets
04/18 16:57:52 - OpenCompass - WARNING - SlurmRunner is not used, so the partition argument is ignored.
04/18 16:57:52 - OpenCompass - INFO - Partitioned into 25 tasks.
04/18 16:57:52 - OpenCompass - WARNING - To ensure the integrity of the log results, the log displayed by DLCRunner has a 10-second delay.
[ ] 0/25, elapsed: 0s, ETA:multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/miniconda3/envs/opencompass/lib/python3.10/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/opencompass/runners/dlc.py", line 264, in _launch
return_code = _run_within_retry()
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/opencompass/runners/dlc.py", line 229, in _run_within_retry
raise RuntimeError(
RuntimeError: Failed to get job info for dlc56wtmugbosjtd
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/run.py", line 4, in <module>
main()
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/opencompass/cli/main.py", line 309, in main
runner(tasks)
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/opencompass/runners/base.py", line 38, in __call__
status = self.launch(tasks)
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/opencompass/runners/dlc.py", line 70, in launch
status = track_parallel_progress(self._launch,
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/miniconda3/envs/opencompass/lib/python3.10/site-packages/mmengine/utils/progressbar.py", line 200, in track_parallel_progress
for result in gen:
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/miniconda3/envs/opencompass/lib/python3.10/multiprocessing/pool.py", line 873, in next
raise value
RuntimeError: Failed to get job info for dlc56wtmugbosjtd
Prerequisite
Type
I'm evaluating with the officially supported tasks/models/datasets.
Environment
{'CUDA available': True,
'CUDA_HOME': '/usr/local/cuda',
'GCC': 'gcc (Ubuntu 9.4.0-1ubuntu1~20.04.1) 9.4.0',
'GPU 0,1,2': 'NVIDIA A100-SXM4-80GB',
'MMEngine': '0.10.3',
'MUSA available': False,
'NVCC': 'Cuda compilation tools, release 12.1, V12.1.66',
'OpenCV': '4.9.0',
'PyTorch': '2.2.2',
'PyTorch compiling details': 'PyTorch built with:\n'
' - GCC 9.3\n'
' - C++ Version: 201703\n'
' - Intel(R) oneAPI Math Kernel Library Version '
'2023.1-Product Build 20230303 for Intel(R) 64 '
'architecture applications\n'
' - Intel(R) MKL-DNN v3.3.2 (Git Hash '
'2dc95a2ad0841e29db8b22fbccaf3e5da7992b01)\n'
' - OpenMP 201511 (a.k.a. OpenMP 4.5)\n'
' - LAPACK is enabled (usually provided by '
'MKL)\n'
' - NNPACK is enabled\n'
' - CPU capability usage: AVX512\n'
' - CUDA Runtime 12.1\n'
' - NVCC architecture flags: '
'-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_60,code=sm_60;-gencode;arch=compute_61,code=sm_61;-gencode;arch=compute_70,code=sm_70;-gencode;arch=compute_75,code=sm_75;-gencode;arch=compute_80,code=sm_80;-gencode;arch=compute_86,code=sm_86;-gencode;arch=compute_90,code=sm_90\n'
' - CuDNN 8.9.2\n'
' - Magma 2.6.1\n'
' - Build settings: BLAS_INFO=mkl, '
'BUILD_TYPE=Release, CUDA_VERSION=12.1, '
'CUDNN_VERSION=8.9.2, '
'CXX_COMPILER=/opt/rh/devtoolset-9/root/usr/bin/c++, '
'CXX_FLAGS= -D_GLIBCXX_USE_CXX11_ABI=0 '
'-fabi-version=11 -fvisibility-inlines-hidden '
'-DUSE_PTHREADPOOL -DNDEBUG -DUSE_KINETO '
'-DLIBKINETO_NOROCTRACER -DUSE_FBGEMM '
'-DUSE_QNNPACK -DUSE_PYTORCH_QNNPACK '
'-DUSE_XNNPACK -DSYMBOLICATE_MOBILE_DEBUG_HANDLE '
'-O2 -fPIC -Wall -Wextra -Werror=return-type '
'-Werror=non-virtual-dtor -Werror=bool-operation '
'-Wnarrowing -Wno-missing-field-initializers '
'-Wno-type-limits -Wno-array-bounds '
'-Wno-unknown-pragmas -Wno-unused-parameter '
'-Wno-unused-function -Wno-unused-result '
'-Wno-strict-overflow -Wno-strict-aliasing '
'-Wno-stringop-overflow -Wsuggest-override '
'-Wno-psabi -Wno-error=pedantic '
'-Wno-error=old-style-cast -Wno-missing-braces '
'-fdiagnostics-color=always -faligned-new '
'-Wno-unused-but-set-variable '
'-Wno-maybe-uninitialized -fno-math-errno '
'-fno-trapping-math -Werror=format '
'-Wno-stringop-overflow, LAPACK_INFO=mkl, '
'PERF_WITH_AVX=1, PERF_WITH_AVX2=1, '
'PERF_WITH_AVX512=1, TORCH_VERSION=2.2.2, '
'USE_CUDA=ON, USE_CUDNN=ON, USE_EXCEPTION_PTR=1, '
'USE_GFLAGS=OFF, USE_GLOG=OFF, USE_MKL=ON, '
'USE_MKLDNN=ON, USE_MPI=OFF, USE_NCCL=ON, '
'USE_NNPACK=ON, USE_OPENMP=ON, USE_ROCM=OFF, '
'USE_ROCM_KERNEL_ASSERT=OFF, \n',
'Python': '3.10.14 (main, Mar 21 2024, 16:24:04) [GCC 11.2.0]',
'TorchVision': '0.17.2',
'numpy_random_seed': 2147483648,
'opencompass': '0.2.3+bd7c11b',
'sys.platform': 'linux'}
Reproduces the problem - code/configuration sample
自定义配置文件如下:
from opencompass.runners import DLCRunner, LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base
from opencompass.partitioners import SizePartitioner, NaivePartitioner
with read_base():
    from .datasets.mmlu.mmlu_clean_ppl import mmlu_datasets
datasets = [*mmlu_datasets]
models = [
dict(
type=HuggingFaceCausalLM,
abbr='7b-model',
path="model_path",
tokenizer_path='model_path',
tokenizer_kwargs=dict(padding_side='left',
truncation_side='left',
trust_remote_code=True,
use_fast=False,),
max_out_len=100,
max_seq_len=2048,
batch_size=8,
model_kwargs=dict(device_map='auto', trust_remote_code=True),
run_cfg=dict(num_gpus=1, num_procs=1),
)
]
infer = dict(
partitioner=dict(type=SizePartitioner, max_task_size=5000),
runner=dict(
type=DLCRunner,
task=dict(type=OpenICLInferTask),
max_num_workers=4,
aliyun_cfg=dict(
dlc_config_path="/root/.dlc/config",
python_env_path="/root/miniconda3/envs/opencompass",
workspace_id='ws***',
worker_image='image-url',
),
retry=2,
),
)
eval = dict(
partitioner=dict(type=NaivePartitioner),
runner=dict(
type=LocalRunner,
max_num_workers=4,
task=dict(type=OpenICLEvalTask)),
)
Reproduces the problem - command or script
python run.py configs/eval_test.py
Reproduces the problem - error message
04/18 16:57:52 - OpenCompass - WARNING - SlurmRunner is not used, so the partition argument is ignored.
04/18 16:57:52 - OpenCompass - INFO - Partitioned into 25 tasks.
04/18 16:57:52 - OpenCompass - WARNING - To ensure the integrity of the log results, the log displayed by DLCRunner has a 10-second delay.
[ ] 0/25, elapsed: 0s, ETA:multiprocessing.pool.RemoteTraceback:
"""
Traceback (most recent call last):
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/miniconda3/envs/opencompass/lib/python3.10/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/opencompass/runners/dlc.py", line 264, in _launch
return_code = _run_within_retry()
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/opencompass/runners/dlc.py", line 229, in _run_within_retry
raise RuntimeError(
RuntimeError: Failed to get job info for dlc56wtmugbosjtd
"""
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/run.py", line 4, in <module>
main()
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/opencompass/cli/main.py", line 309, in main
runner(tasks)
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/opencompass/runners/base.py", line 38, in __call__
status = self.launch(tasks)
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/opencompass/opencompass/runners/dlc.py", line 70, in launch
status = track_parallel_progress(self._launch,
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/miniconda3/envs/opencompass/lib/python3.10/site-packages/mmengine/utils/progressbar.py", line 200, in track_parallel_progress
for result in gen:
File "/cpfs01/user/zhangshusen/glusterfs2/zhangshusen/miniconda3/envs/opencompass/lib/python3.10/multiprocessing/pool.py", line 873, in next
raise value
RuntimeError: Failed to get job info for dlc56wtmugbosjtd
Other information
dlc上看到提交的infer任务运行成功了,但是运行脚本侧获取不到dlc运行结果,导致后续eval任务失败
The text was updated successfully, but these errors were encountered: