Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

secretflow API编程有问题 #1205

Closed
LeoneChen opened this issue Mar 21, 2024 · 15 comments
Closed

secretflow API编程有问题 #1205

LeoneChen opened this issue Mar 21, 2024 · 15 comments
Labels

Comments

@LeoneChen
Copy link

下面是一段使用sf API的代码,运行有问题(在sf.init里加上debug_mode=True后直接卡住了)

import secretflow as sf
import jax.numpy as jnp
import numpy as np
# <import libraries that get_alice_data, get_bob_data, and func will use>
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Initialize the secretflow framework for parties 'alice' and 'bob' using a
# local address, create one PYU device per participant, and build an SPU
# device from the testing cluster definition covering both parties.
sf.init(['alice', 'bob'], address='local')
alice, bob = sf.PYU('alice'), sf.PYU('bob')
spu_node_device = sf.SPU(sf.utils.testing.cluster_def(['alice', 'bob']))

def func(data_alice, data_bob):
    """Fit an SVC on Alice's samples and use it to label Bob's samples.

    Both arguments are indexed with 'data' (features); Alice's argument is
    also indexed with 'target' (training labels). The predicted labels for
    Bob's features are returned.

    NOTE: scikit-learn's input validation calls np.asarray on its inputs.
    When this function is traced by JAX (as happens when it is compiled for
    the SPU device) the inputs are tracers and that conversion raises
    jax.errors.TracerArrayConversionError.
    """
    classifier = SVC()

    # Train on Alice's features and labels.
    train_features, train_labels = data_alice['data'], data_alice['target']
    classifier.fit(train_features, train_labels)

    # Label Bob's features with the fitted model.
    return classifier.predict(data_bob['data'])

def get_alice_data():
    """Return the training portion of the iris dataset as Alice's data.

    The dataset is split with test_size=0.2 and random_state=42; the
    training features and labels are returned under the keys 'data' and
    'target'.
    """
    iris = load_iris()

    # Only the training half of the split is needed here.
    features_train, _, labels_train, _ = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42
    )

    return {"data": features_train, "target": labels_train}

def get_bob_data():
    """Return the held-out portion of the iris dataset as Bob's data.

    The dataset is split with test_size=0.2 and random_state=42; the test
    features and labels are returned under the keys 'data' and 'target'.
    """
    iris = load_iris()

    # Only the test half of the split is needed here.
    _, features_test, _, labels_test = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42
    )

    return {"data": features_test, "target": labels_test}

# Get data from PYU nodes which have data: each loader is wrapped by its
# party's PYU device and then invoked.
data_alice = alice(get_alice_data)()
data_bob = bob(get_bob_data)()

# Wrap func with the SPU device and call it on both parties' data.
result = spu_node_device(func)(data_alice, data_bob)

# Bring the SPU result back to the driver. In the reported run, the
# compilation error of func surfaces from this call.
revealed_result = sf.reveal(result)

print("== RESULT BEGIN ==\n%s\n== RESULT END ==" % revealed_result)
sf.shutdown()

data_alice 在 get_alice_data 里的形式是

(_run pid=617599) {'data': array([[4.6, 3.6, 1. , 0.2],
(_run pid=617599)        [5.7, 4.4, 1.5, 0.4],
(_run pid=617599)        [6.7, 3.1, 4.4, 1.4],
(_run pid=617599)        [4.8, 3.4, 1.6, 0.2],
(_run pid=617599)        [4.4, 3.2, 1.3, 0.2],
(_run pid=617599)        [6.3, 2.5, 5. , 1.9],
(_run pid=617599)        [6.4, 3.2, 4.5, 1.5],
(_run pid=617599)        [5.2, 3.5, 1.5, 0.2],
(_run pid=617599)        [5. , 3.6, 1.4, 0.2],
(_run pid=617599)        [5.2, 4.1, 1.5, 0.1],
(_run pid=617599)        [5.8, 2.7, 5.1, 1.9],
(_run pid=617599)        [6. , 3.4, 4.5, 1.6],
(_run pid=617599)        [6.7, 3.1, 4.7, 1.5],
(_run pid=617599)        [5.4, 3.9, 1.3, 0.4],
(_run pid=617599)        [5.4, 3.7, 1.5, 0.2],
(_run pid=617599)        [5.5, 2.4, 3.7, 1. ],
(_run pid=617599)        [6.3, 2.8, 5.1, 1.5],
(_run pid=617599)        [6.4, 3.1, 5.5, 1.8],
(_run pid=617599)        [6.6, 3. , 4.4, 1.4],
(_run pid=617599)        [7.2, 3.6, 6.1, 2.5],
(_run pid=617599)        [5.7, 2.9, 4.2, 1.3],
(_run pid=617599)        [7.6, 3. , 6.6, 2.1],
(_run pid=617599)        [5.6, 3. , 4.5, 1.5],
(_run pid=617599)        [5.1, 3.5, 1.4, 0.2],
(_run pid=617599)        [7.7, 2.8, 6.7, 2. ],
(_run pid=617599)        [5.8, 2.7, 4.1, 1. ],
(_run pid=617599)        [5.2, 3.4, 1.4, 0.2],
(_run pid=617599)        [5. , 3.5, 1.3, 0.3],
(_run pid=617599)        [5.1, 3.8, 1.9, 0.4],
(_run pid=617599)        [5. , 2. , 3.5, 1. ],
(_run pid=617599)        [6.3, 2.7, 4.9, 1.8],
(_run pid=617599)        [4.8, 3.4, 1.9, 0.2],
(_run pid=617599)        [5. , 3. , 1.6, 0.2],
(_run pid=617599)        [5.1, 3.3, 1.7, 0.5],
(_run pid=617599)        [5.6, 2.7, 4.2, 1.3],
(_run pid=617599)        [5.1, 3.4, 1.5, 0.2],
(_run pid=617599)        [5.7, 3. , 4.2, 1.2],
(_run pid=617599)        [7.7, 3.8, 6.7, 2.2],
(_run pid=617599)        [4.6, 3.2, 1.4, 0.2],
(_run pid=617599)        [6.2, 2.9, 4.3, 1.3],
(_run pid=617599)        [5.7, 2.5, 5. , 2. ],
(_run pid=617599)        [5.5, 4.2, 1.4, 0.2],
(_run pid=617599)        [6. , 3. , 4.8, 1.8],
(_run pid=617599)        [5.8, 2.7, 5.1, 1.9],
(_run pid=617599)        [6. , 2.2, 4. , 1. ],
(_run pid=617599)        [5.4, 3. , 4.5, 1.5],
(_run pid=617599)        [6.2, 3.4, 5.4, 2.3],
(_run pid=617599)        [5.5, 2.3, 4. , 1.3],
(_run pid=617599)        [5.4, 3.9, 1.7, 0.4],
(_run pid=617599)        [5. , 2.3, 3.3, 1. ],
(_run pid=617599)        [6.4, 2.7, 5.3, 1.9],
(_run pid=617599)        [5. , 3.3, 1.4, 0.2],
(_run pid=617599)        [5. , 3.2, 1.2, 0.2],
(_run pid=617599)        [5.5, 2.4, 3.8, 1.1],
(_run pid=617599)        [6.7, 3. , 5. , 1.7],
(_run pid=617599)        [4.9, 3.1, 1.5, 0.2],
(_run pid=617599)        [5.8, 2.8, 5.1, 2.4],
(_run pid=617599)        [5. , 3.4, 1.5, 0.2],
(_run pid=617599)        [5. , 3.5, 1.6, 0.6],
(_run pid=617599)        [5.9, 3.2, 4.8, 1.8],
(_run pid=617599)        [5.1, 2.5, 3. , 1.1],
(_run pid=617599)        [6.9, 3.2, 5.7, 2.3],
(_run pid=617599)        [6. , 2.7, 5.1, 1.6],
(_run pid=617599)        [6.1, 2.6, 5.6, 1.4],
(_run pid=617599)        [7.7, 3. , 6.1, 2.3],
(_run pid=617599)        [5.5, 2.5, 4. , 1.3],
(_run pid=617599)        [4.4, 2.9, 1.4, 0.2],
(_run pid=617599)        [4.3, 3. , 1.1, 0.1],
(_run pid=617599)        [6. , 2.2, 5. , 1.5],
(_run pid=617599)        [7.2, 3.2, 6. , 1.8],
(_run pid=617599)        [4.6, 3.1, 1.5, 0.2],
(_run pid=617599)        [5.1, 3.5, 1.4, 0.3],
(_run pid=617599)        [4.4, 3. , 1.3, 0.2],
(_run pid=617599)        [6.3, 2.5, 4.9, 1.5],
(_run pid=617599)        [6.3, 3.4, 5.6, 2.4],
(_run pid=617599)        [4.6, 3.4, 1.4, 0.3],
(_run pid=617599)        [6.8, 3. , 5.5, 2.1],
(_run pid=617599)        [6.3, 3.3, 6. , 2.5],
(_run pid=617599)        [4.7, 3.2, 1.3, 0.2],
(_run pid=617599)        [6.1, 2.9, 4.7, 1.4],
(_run pid=617599)        [6.5, 2.8, 4.6, 1.5],
(_run pid=617599)        [6.2, 2.8, 4.8, 1.8],
(_run pid=617599)        [7. , 3.2, 4.7, 1.4],
(_run pid=617599)        [6.4, 3.2, 5.3, 2.3],
(_run pid=617599)        [5.1, 3.8, 1.6, 0.2],
(_run pid=617599)        [6.9, 3.1, 5.4, 2.1],
(_run pid=617599)        [5.9, 3. , 4.2, 1.5],
(_run pid=617599)        [6.5, 3. , 5.2, 2. ],
(_run pid=617599)        [5.7, 2.6, 3.5, 1. ],
(_run pid=617599)        [5.2, 2.7, 3.9, 1.4],
(_run pid=617599)        [6.1, 3. , 4.6, 1.4],
(_run pid=617599)        [4.5, 2.3, 1.3, 0.3],
(_run pid=617599)        [6.6, 2.9, 4.6, 1.3],
(_run pid=617599)        [5.5, 2.6, 4.4, 1.2],
(_run pid=617599)        [5.3, 3.7, 1.5, 0.2],
(_run pid=617599)        [5.6, 3. , 4.1, 1.3],
(_run pid=617599)        [7.3, 2.9, 6.3, 1.8],
(_run pid=617599)        [6.7, 3.3, 5.7, 2.1],
(_run pid=617599)        [5.1, 3.7, 1.5, 0.4],
(_run pid=617599)        [4.9, 2.4, 3.3, 1. ],
(_run pid=617599)        [6.7, 3.3, 5.7, 2.5],
(_run pid=617599)        [7.2, 3. , 5.8, 1.6],
(_run pid=617599)        [4.9, 3.6, 1.4, 0.1],
(_run pid=617599)        [6.7, 3.1, 5.6, 2.4],
(_run pid=617599)        [4.9, 3. , 1.4, 0.2],
(_run pid=617599)        [6.9, 3.1, 4.9, 1.5],
(_run pid=617599)        [7.4, 2.8, 6.1, 1.9],
(_run pid=617599)        [6.3, 2.9, 5.6, 1.8],
(_run pid=617599)        [5.7, 2.8, 4.1, 1.3],
(_run pid=617599)        [6.5, 3. , 5.5, 1.8],
(_run pid=617599)        [6.3, 2.3, 4.4, 1.3],
(_run pid=617599)        [6.4, 2.9, 4.3, 1.3],
(_run pid=617599)        [5.6, 2.8, 4.9, 2. ],
(_run pid=617599)        [5.9, 3. , 5.1, 1.8],
(_run pid=617599)        [5.4, 3.4, 1.7, 0.2],
(_run pid=617599)        [6.1, 2.8, 4. , 1.3],
(_run pid=617599)        [4.9, 2.5, 4.5, 1.7],
(_run pid=617599)        [5.8, 4. , 1.2, 0.2],
(_run pid=617599)        [5.8, 2.6, 4. , 1.2],
(_run pid=617599)        [7.1, 3. , 5.9, 2.1]]), 'target': array([0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2,
(_run pid=617599)        1, 0, 2, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2,
(_run pid=617599)        1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1,
(_run pid=617599)        0, 0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2,
(_run pid=617599)        1, 1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2,
(_run pid=617599)        1, 1, 2, 2, 0, 1, 2, 0, 1, 2])}

func里的形式是

{'data': Traced<ShapedArray(float32[120,4])>with<DynamicJaxprTrace(level=1/0)>, 'target': Traced<ShapedArray(int32[120])>with<DynamicJaxprTrace(level=1/0)>}

报错输出为:

(sf) ➜  /home/leone/secretflow-all git:(master) ✗ JAX_PLATFORMS=cpu python /home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py
2024-03-21 22:32:11,755 INFO worker.py:1538 -- Started a local Ray instance.
(_run pid=613777) WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
(_run pid=613770) WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
2024-03-21 22:32:14,804 ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::SPURuntime.run() (pid=614761, ip=192.168.50.91, repr=SPURuntime(device_id=None, party=bob))
  At least one of the input arguments for this task could not be computed:
ray.exceptions.RayTaskError: ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1614, in _spu_compile
    executable, output_tree = spu_fe.compile(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 219, in compile
    ir_text, output = _jax_compilation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 122, in _jax_compilation
    cfn, output = jax.xla_computation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/traceback_util.py", line 166, in reraise_with_filtered_traceback
    return fun(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/api.py", line 544, in computation_maker
    jaxpr, out_avals, consts = pe.trace_to_jaxpr_dynamic(jaxtree_fun, avals)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/profiler.py", line 314, in wrapper
    return func(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/interpreters/partial_eval.py", line 2155, in trace_to_jaxpr_dynamic
    jaxpr, out_avals, consts = trace_to_subjaxpr_dynamic(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/interpreters/partial_eval.py", line 2177, in trace_to_subjaxpr_dynamic
    ans = fun.call_wrapped(*in_tracers_)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/linear_util.py", line 188, in call_wrapped
    ans = self.f(*args, **dict(self.params, **kwargs))
  File "/home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py", line 19, in func
    model.fit(data_alice['data'], data_alice['target'])
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/svm/_base.py", line 173, in fit
    X, y = self._validate_data(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/core.py", line 598, in __array__
    raise TracerArrayConversionError(self)
jax._src.traceback_util.UnfilteredStackTrace: jax.errors.TracerArrayConversionError: The numpy.ndarray conversion method __array__() was called on the JAX Tracer object Traced<ShapedArray(float32[120,4])>with<DynamicJaxprTrace(level=1/0)>
See https://jax.readthedocs.io/en/latest/errors.html#jax.errors.TracerArrayConversionError

The stack trace below excludes JAX-internal frames.
The preceding is the original exception that occurred, unmodified.

--------------------

The above exception was the direct cause of the following exception:

ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1614, in _spu_compile
    executable, output_tree = spu_fe.compile(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 219, in compile
    ir_text, output = _jax_compilation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 122, in _jax_compilation
    cfn, output = jax.xla_computation(
  File "/home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py", line 19, in func
    model.fit(data_alice['data'], data_alice['target'])
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/svm/_base.py", line 173, in fit
    X, y = self._validate_data(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
jax.errors.TracerArrayConversionError: The numpy.ndarray conversion method __array__() was called on the JAX Tracer object Traced<ShapedArray(float32[120,4])>with<DynamicJaxprTrace(level=1/0)>
See https://jax.readthedocs.io/en/latest/errors.html#jax.errors.TracerArrayConversionError

During handling of the above exception, another exception occurred:

ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1629, in _spu_compile
    raise ray.exceptions.WorkerCrashedError()
ray.exceptions.WorkerCrashedError: The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
Traceback (most recent call last):
  File "/home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py", line 59, in <module>
    revealed_result = sf.reveal(result)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/driver.py", line 153, in reveal
    info, shares_chunk = x.device.outfeed_shares(x.shares_name)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1909, in outfeed_shares
    shares_chunk_count = sfd.get(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/distributed/primitive.py", line 158, in get
    return ray.get(object_refs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/_private/worker.py", line 2309, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError: ray::SPURuntime.outfeed_shares_chunk_count() (pid=614757, ip=192.168.50.91, repr=SPURuntime(device_id=None, party=alice))
  At least one of the input arguments for this task could not be computed:
ray.exceptions.RayTaskError: ray::SPURuntime.run() (pid=614757, ip=192.168.50.91, repr=SPURuntime(device_id=None, party=alice))
  At least one of the input arguments for this task could not be computed:
ray.exceptions.RayTaskError: ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1614, in _spu_compile
    executable, output_tree = spu_fe.compile(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 219, in compile
    ir_text, output = _jax_compilation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 122, in _jax_compilation
    cfn, output = jax.xla_computation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/traceback_util.py", line 166, in reraise_with_filtered_traceback
    return fun(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/api.py", line 544, in computation_maker
    jaxpr, out_avals, consts = pe.trace_to_jaxpr_dynamic(jaxtree_fun, avals)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/profiler.py", line 314, in wrapper
    return func(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/interpreters/partial_eval.py", line 2155, in trace_to_jaxpr_dynamic
    jaxpr, out_avals, consts = trace_to_subjaxpr_dynamic(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/interpreters/partial_eval.py", line 2177, in trace_to_subjaxpr_dynamic
    ans = fun.call_wrapped(*in_tracers_)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/linear_util.py", line 188, in call_wrapped
    ans = self.f(*args, **dict(self.params, **kwargs))
  File "/home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py", line 19, in func
    model.fit(data_alice['data'], data_alice['target'])
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/svm/_base.py", line 173, in fit
    X, y = self._validate_data(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/core.py", line 598, in __array__
    raise TracerArrayConversionError(self)
jax._src.traceback_util.UnfilteredStackTrace: jax.errors.TracerArrayConversionError: The numpy.ndarray conversion method __array__() was called on the JAX Tracer object Traced<ShapedArray(float32[120,4])>with<DynamicJaxprTrace(level=1/0)>
See https://jax.readthedocs.io/en/latest/errors.html#jax.errors.TracerArrayConversionError

The stack trace below excludes JAX-internal frames.
The preceding is the original exception that occurred, unmodified.

--------------------

The above exception was the direct cause of the following exception:

ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1614, in _spu_compile
    executable, output_tree = spu_fe.compile(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 219, in compile
    ir_text, output = _jax_compilation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 122, in _jax_compilation
    cfn, output = jax.xla_computation(
  File "/home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py", line 19, in func
    model.fit(data_alice['data'], data_alice['target'])
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/svm/_base.py", line 173, in fit
    X, y = self._validate_data(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
jax.errors.TracerArrayConversionError: The numpy.ndarray conversion method __array__() was called on the JAX Tracer object Traced<ShapedArray(float32[120,4])>with<DynamicJaxprTrace(level=1/0)>
See https://jax.readthedocs.io/en/latest/errors.html#jax.errors.TracerArrayConversionError

During handling of the above exception, another exception occurred:

ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1629, in _spu_compile
    raise ray.exceptions.WorkerCrashedError()
ray.exceptions.WorkerCrashedError: The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
(sf) ➜  /home/leone/secretflow-all git:(master) ✗ 
@ian-huu
Copy link
Member

ian-huu commented Mar 22, 2024

debug mode 是单进程同步执行的,spu (至少需要两个进程)无法在 debug 模式使用,用到 spu 的地方需要改成 pyu 来模拟

@LeoneChen
Copy link
Author

但是非debug模式的上述问题要怎么解决呢

@LeoneChen
Copy link
Author

还有这个case中

import secretflow as sf
import jax.numpy as jnp

# Additional libraries that get_alice_data, get_bob_data, and func will use
import scipy.special

# Initialize the secretflow framework for parties 'alice', 'bob' and
# 'charlie' using a local address, create one PYU device per participant, and
# build an SPU device from the testing cluster definition covering all three.
sf.init(["alice", "bob", "charlie"], address="local")
alice, bob, charlie = sf.PYU("alice"), sf.PYU("bob"), sf.PYU("charlie")
spu_node_device = sf.SPU(sf.utils.testing.cluster_def(["alice", "bob", "charlie"]))


def func(data_alice, data_bob, data_charlie):
    """Compute two element-wise results from the three parties' arrays.

    Returns:
        A pair ``(result1, result2)`` where
        ``result1 = clip(ceil((alice + bob) * log(alice) / sqrt(bob)), 0, 10)``
        and ``result2 = around(exp(bob - charlie), decimals=2)``.
    """
    # First output: element-wise computation on Alice's and Bob's data.
    scaled = jnp.multiply(jnp.add(data_alice, data_bob), jnp.log(data_alice))
    normalized = jnp.divide(scaled, jnp.sqrt(data_bob))
    result1 = jnp.clip(jnp.ceil(normalized), 0, 10)

    # Second output: element-wise computation on Bob's and Charlie's data.
    # NOTE: in the reported run, the rounding step is the operation the SPU
    # compiler rejects ("failed to legalize operation
    # 'stablehlo.round_nearest_even'").
    difference = jnp.subtract(data_bob, data_charlie)
    result2 = jnp.around(jnp.exp(difference), decimals=2)

    return result1, result2


def get_alice_data():
    """Return Alice's input array, ``[1, 2, 3]``."""
    alice_values = jnp.array([1, 2, 3])
    return alice_values


def get_bob_data():
    """Return Bob's input array, ``[4, 5, 6]``."""
    bob_values = jnp.array([4, 5, 6])
    return bob_values


def get_charlie_data():
    """Return Charlie's input array, ``[7, 8, 9]``."""
    charlie_values = jnp.array([7, 8, 9])
    return charlie_values


# Get data from PYU nodes which have data: each loader is wrapped by its
# party's PYU device and then invoked.
data_alice = alice(get_alice_data)()
data_bob = bob(get_bob_data)()
data_charlie = charlie(get_charlie_data)()

# Wrap func with the SPU device and call it on all three inputs; the two
# returned objects are unpacked separately. In the reported run, the SPU
# compilation error is raised from this statement.
# NOTE(review): FROM_COMPILER presumably lets the compiler determine the
# number of outputs -- confirm against the secretflow documentation.
result1, result2 = spu_node_device(
    func, num_returns_policy=sf.device.SPUCompilerNumReturnsPolicy.FROM_COMPILER
)(data_alice, data_bob, data_charlie)

# Bring both SPU results back to the driver.
revealed_result1 = sf.reveal(result1)
revealed_result2 = sf.reveal(result2)

print("== RESULT BEGIN ==")
print("Result 1:")
print(revealed_result1)
print("Result 2:")
print(revealed_result2)
print("== RESULT END ==")

sf.shutdown()

报错error: failed to legalize operation 'stablehlo.round_nearest_even',是因为操作不支持吗

2024-03-22 16:28:32,954 INFO worker.py:1538 -- Started a local Ray instance.
WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
(_run pid=663353) WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
(_run pid=663360) WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
(_run pid=663348) WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
(_spu_compile pid=663360) loc(fused["xla_computation(func)/jit(main)/jit(round)/round[rounding_method=1]", "/home/leone/secretflow-all/secretflow-test/seeds/8c2220bd3b4bdf8f0c811ed1376e22c42fd08f967aeca9ae40ffdb15200a3703_spu_code.py":24:0]): error: failed to legalize operation 'stablehlo.round_nearest_even'
Traceback (most recent call last):
  File "/home/leone/secretflow-all/secretflow-test/seeds/8c2220bd3b4bdf8f0c811ed1376e22c42fd08f967aeca9ae40ffdb15200a3703_spu_code.py", line 49, in <module>
    result1, result2 = spu_node_device(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1837, in wrapper
    executable, out_shape = sfd.get([executable, out_shape])
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/distributed/primitive.py", line 158, in get
    return ray.get(object_refs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/_private/worker.py", line 2309, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError: ray::_spu_compile() (pid=663360, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 289, in compile
    mlir = spu_api.compile(source, copts)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/api.py", line 170, in compile
    return _spu_compilation(source.SerializeToString(), copts.SerializeToString())
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/cachetools/__init__.py", line 737, in wrapper
    v = func(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/api.py", line 156, in _spu_compilation
    return libspu.compile(source, options_str)
RuntimeError: what: 
        [libspu/compiler/front_end/fe.cc:82] Run front end pipeline failed
stacktrace: 
#0 spu::compiler::FE::doit()+0x746f97a559f4
#1 spu::compiler::compile[abi:cxx11]()+0x746f979fa6ba
#2 spu::pybind11_init_libspu()::{lambda()#4}::operator()()+0x746f978f1508
#3 pybind11::detail::argument_loader<>::call_impl<>()+0x746f978fd737
#4 _ZNO8pybind116detail15argument_loaderIJRKNS_5bytesERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEE4callIS2_NS0_9void_typeERZN3spuL20pybind11_init_libspuERNS_7module_EEUlS4_SC_E2_EENSt9enable_ifIXntsrSt7is_voidIT_E5valueESN_E4typeEOT1_+0x746f978fb9c3
#5 pybind11::cpp_function::initialize<>()::{lambda()#3}::operator()()+0x746f978fa391
#6 pybind11::cpp_function::initialize<>()::{lambda()#3}::_FUN()+0x746f978fa451
#7 pybind11::cpp_function::dispatcher()+0x746f97911216
#8 cfunction_call+0x507387



During handling of the above exception, another exception occurred:

ray::_spu_compile() (pid=663360, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1629, in _spu_compile
    raise ray.exceptions.WorkerCrashedError()
ray.exceptions.WorkerCrashedError: The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
Exception ignored in: <function SPUObject.__del__ at 0x71bf00933b80>
Traceback (most recent call last):
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 190, in __del__
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 138, in remote
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 425, in _start_span
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 184, in _remote
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 171, in invocation
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 1172, in _actor_method_call
AttributeError: 'Worker' object has no attribute 'core_worker'
Exception ignored in: <function SPUObject.__del__ at 0x71bf00933b80>
Traceback (most recent call last):
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 190, in __del__
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 138, in remote
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 425, in _start_span
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 184, in _remote
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 171, in invocation
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 1172, in _actor_method_call
AttributeError: 'Worker' object has no attribute 'core_worker'
Exception ignored in: <function SPUObject.__del__ at 0x71bf00933b80>
Traceback (most recent call last):
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 190, in __del__
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 138, in remote
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/util/tracing/tracing_helper.py", line 425, in _start_span
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 184, in _remote
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 171, in invocation
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/actor.py", line 1172, in _actor_method_call
AttributeError: 'Worker' object has no attribute 'core_worker'
(sf) ➜  /home/leone/secretflow-all git:(master) ✗ JAX_PLATFORMS=cpu python secretflow-test/seeds/8c2220bd3b4bdf8f0c811ed1376e22c42fd08f967aeca9ae40ffdb15200a3703_spu_code.py

@anakinxc
Copy link
Contributor

round_nearest_even 现在还不支持,欢迎到 spu repo 下面提个 issue

@LeoneChen
Copy link
Author

jax._src.traceback_util.UnfilteredStackTrace: jax.errors.TracerArrayConversionError: The numpy.ndarray conversion method __array__() was called on the JAX Tracer object Traced<ShapedArray(float32[120,4])>with<DynamicJaxprTrace(level=1/0)>这个问题麻烦帮忙看看要怎么解决?

gpt是这么说的
image

@anakinxc
Copy link
Contributor

  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array

这个看起来是个 sklearn 和 jax 的问题,建议先试一下直接用 jax jit 能不能跑

@LeoneChen
Copy link
Author

LeoneChen commented Mar 25, 2024

我在 SPU 版本的 func、get_alice_data、get_bob_data 前面都加了 @jax.jit,还是一样有问题。

明文版本的代码就没问题

import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def func(data_alice, data_bob):
    """Fit an SVC on Alice's samples and use it to label Bob's samples.

    Both arguments are indexed with 'data' (features); Alice's argument is
    also indexed with 'target' (training labels). The predicted labels for
    Bob's features are returned.
    """
    classifier = SVC()

    # Train on Alice's features and labels.
    train_features, train_labels = data_alice['data'], data_alice['target']
    classifier.fit(train_features, train_labels)

    # Label Bob's features with the fitted model.
    return classifier.predict(data_bob['data'])

def get_alice_data():
    """Return the training portion of the iris dataset as Alice's data.

    The dataset is split with test_size=0.2 and random_state=42; the
    training features and labels are returned under the keys 'data' and
    'target'.
    """
    iris = load_iris()

    # Only the training half of the split is needed here.
    features_train, _, labels_train, _ = train_test_split(
        iris.data, iris.target, test_size=0.2, random_state=42
    )

    return {"data": features_train, "target": labels_train}

def get_bob_data():
    """Return the held-out test portion of the iris dataset.

    The dataset is split with ``test_size=0.2`` and ``random_state=42``;
    only the test features and labels are kept.

    Returns:
        dict with keys "data" (test features) and "target" (test labels).
    """
    dataset = load_iris()

    # train_test_split yields (X_train, X_test, y_train, y_test); the
    # training halves are not needed here.
    split = train_test_split(
        dataset.data, dataset.target, test_size=0.2, random_state=42
    )
    test_features, test_labels = split[1], split[3]

    return {"data": test_features, "target": test_labels}

# Build both datasets by calling the loaders directly (plain Python calls).
data_alice = get_alice_data()
data_bob = get_bob_data()

# Train on Alice's data and predict labels for Bob's features.
result = func(data_alice, data_bob)

# Print the predictions between explicit begin/end markers.
print("== RESULT BEGIN ==\n%s\n== RESULT END ==" % result)

@anakinxc
Copy link
Contributor

我在 SPU 版本的 `func`、`get_alice_data`、`get_bob_data` 前面都加了 `@jax.jit`,还是一样有问题。

明文版本的代码就没问题

import numpy as np
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def func(data_alice, data_bob):
    """Fit an SVC on Alice's samples and predict labels for Bob's features.

    Args:
        data_alice: Mapping with 'data' (features) and 'target' (labels)
            used for training.
        data_bob: Mapping whose 'data' entry holds the features to classify.

    Returns:
        The labels predicted by the trained model for ``data_bob['data']``.
    """
    classifier = SVC()
    train_features, train_labels = data_alice['data'], data_alice['target']
    classifier.fit(train_features, train_labels)

    return classifier.predict(data_bob['data'])

def get_alice_data():
    """Return the training portion of the iris dataset.

    The dataset is split with ``test_size=0.2`` and ``random_state=42``;
    only the training features and labels are kept.

    Returns:
        dict with keys "data" (training features) and "target" (training
        labels).
    """
    dataset = load_iris()

    # train_test_split yields (X_train, X_test, y_train, y_test); the test
    # halves are not needed here.
    split = train_test_split(
        dataset.data, dataset.target, test_size=0.2, random_state=42
    )
    train_features, train_labels = split[0], split[2]

    return {"data": train_features, "target": train_labels}

def get_bob_data():
    """Return the held-out test portion of the iris dataset.

    The dataset is split with ``test_size=0.2`` and ``random_state=42``;
    only the test features and labels are kept.

    Returns:
        dict with keys "data" (test features) and "target" (test labels).
    """
    dataset = load_iris()

    # train_test_split yields (X_train, X_test, y_train, y_test); the
    # training halves are not needed here.
    split = train_test_split(
        dataset.data, dataset.target, test_size=0.2, random_state=42
    )
    test_features, test_labels = split[1], split[3]

    return {"data": test_features, "target": test_labels}

# Build both datasets by calling the loaders directly (plain Python calls).
data_alice = get_alice_data()
data_bob = get_bob_data()

# Train on Alice's data and predict labels for Bob's features.
result = func(data_alice, data_bob)

# Print the predictions between explicit begin/end markers.
print("== RESULT BEGIN ==\n%s\n== RESULT END ==" % result)

jax 的问题,只要你在你的 func 上加 @jax.jit 就会报错

@LeoneChen
Copy link
Author

所以这问题是不是没法整?这和spu先编译后执行的原理有关吗?

@da-niao-dan
Copy link
Member

SPU 可执行的代码是 jax jit 可执行代码的子集,jax jit 失败的代码无法在 SPU 执行。解决问题的办法是习惯 jax 的编程模式。

@anakinxc
Copy link
Contributor

所以这问题是不是没法整?这和spu先编译后执行的原理有关吗?

嗯,SPU 执行的前置条件是 jax 可以 jit 这个 function

@LeoneChen
Copy link
Author

绝了。。。 感谢大佬问答

@gxcuit
Copy link

gxcuit commented Apr 16, 2024

下面是一段使用sf API的代码,运行有问题(在sf.init里加上debug_mode=True后直接卡住了)

import secretflow as sf
import jax.numpy as jnp
import numpy as np
# <import libraries that get_alice_data, get_bob_data, and func will use>
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Initialize the secretflow framework for the two parties 'alice' and 'bob'
# using address='local'.
sf.init(['alice', 'bob'], address='local')
# One PYU device per participant.
alice, bob = sf.PYU('alice'), sf.PYU('bob')
# SPU device built from the testing helper's cluster definition for both parties.
spu_node_device = sf.SPU(sf.utils.testing.cluster_def(['alice', 'bob']))

def func(data_alice, data_bob):
    """Fit an SVC on Alice's samples and predict labels for Bob's features.

    NOTE(review): when this function is handed to the SPU device it is traced
    by JAX, so the 'data'/'target' entries are JAX tracers rather than NumPy
    arrays. scikit-learn's input validation then calls ``np.asarray`` on a
    tracer and the ``fit`` call raises
    ``jax.errors.TracerArrayConversionError``. The function only works as
    written when called with concrete arrays.

    Args:
        data_alice: Mapping with 'data' (features) and 'target' (labels)
            used for training.
        data_bob: Mapping whose 'data' entry holds the features to classify.

    Returns:
        The labels predicted by the trained model for ``data_bob['data']``.
    """
    classifier = SVC()
    train_features, train_labels = data_alice['data'], data_alice['target']
    classifier.fit(train_features, train_labels)

    return classifier.predict(data_bob['data'])

def get_alice_data():
    """Return the training portion of the iris dataset.

    The dataset is split with ``test_size=0.2`` and ``random_state=42``;
    only the training features and labels are kept.

    Returns:
        dict with keys "data" (training features) and "target" (training
        labels).
    """
    dataset = load_iris()

    # train_test_split yields (X_train, X_test, y_train, y_test); the test
    # halves are not needed here.
    split = train_test_split(
        dataset.data, dataset.target, test_size=0.2, random_state=42
    )
    train_features, train_labels = split[0], split[2]

    return {"data": train_features, "target": train_labels}

def get_bob_data():
    """Return the held-out test portion of the iris dataset.

    The dataset is split with ``test_size=0.2`` and ``random_state=42``;
    only the test features and labels are kept.

    Returns:
        dict with keys "data" (test features) and "target" (test labels).
    """
    dataset = load_iris()

    # train_test_split yields (X_train, X_test, y_train, y_test); the
    # training halves are not needed here.
    split = train_test_split(
        dataset.data, dataset.target, test_size=0.2, random_state=42
    )
    test_features, test_labels = split[1], split[3]

    return {"data": test_features, "target": test_labels}

# Run each loader on its owner's PYU device so the data stays with that party.
data_alice = alice(get_alice_data)()
data_bob = bob(get_bob_data)()

# Submit func to the SPU device with both parties' data as inputs.
result = spu_node_device(func)(data_alice, data_bob)

# NOTE(review): in the reported run the SPU compile error is raised from this
# sf.reveal call, not from the spu_node_device(func)(...) line above.
revealed_result = sf.reveal(result)

# Print the revealed predictions between explicit begin/end markers.
print("== RESULT BEGIN ==\n%s\n== RESULT END ==" % revealed_result)
sf.shutdown()

`data_alice` 在 `get_alice_data` 里的形式是

(_run pid=617599) {'data': array([[4.6, 3.6, 1. , 0.2],
(_run pid=617599)        [5.7, 4.4, 1.5, 0.4],
(_run pid=617599)        [6.7, 3.1, 4.4, 1.4],
(_run pid=617599)        [4.8, 3.4, 1.6, 0.2],
(_run pid=617599)        [4.4, 3.2, 1.3, 0.2],
(_run pid=617599)        [6.3, 2.5, 5. , 1.9],
(_run pid=617599)        [6.4, 3.2, 4.5, 1.5],
(_run pid=617599)        [5.2, 3.5, 1.5, 0.2],
(_run pid=617599)        [5. , 3.6, 1.4, 0.2],
(_run pid=617599)        [5.2, 4.1, 1.5, 0.1],
(_run pid=617599)        [5.8, 2.7, 5.1, 1.9],
(_run pid=617599)        [6. , 3.4, 4.5, 1.6],
(_run pid=617599)        [6.7, 3.1, 4.7, 1.5],
(_run pid=617599)        [5.4, 3.9, 1.3, 0.4],
(_run pid=617599)        [5.4, 3.7, 1.5, 0.2],
(_run pid=617599)        [5.5, 2.4, 3.7, 1. ],
(_run pid=617599)        [6.3, 2.8, 5.1, 1.5],
(_run pid=617599)        [6.4, 3.1, 5.5, 1.8],
(_run pid=617599)        [6.6, 3. , 4.4, 1.4],
(_run pid=617599)        [7.2, 3.6, 6.1, 2.5],
(_run pid=617599)        [5.7, 2.9, 4.2, 1.3],
(_run pid=617599)        [7.6, 3. , 6.6, 2.1],
(_run pid=617599)        [5.6, 3. , 4.5, 1.5],
(_run pid=617599)        [5.1, 3.5, 1.4, 0.2],
(_run pid=617599)        [7.7, 2.8, 6.7, 2. ],
(_run pid=617599)        [5.8, 2.7, 4.1, 1. ],
(_run pid=617599)        [5.2, 3.4, 1.4, 0.2],
(_run pid=617599)        [5. , 3.5, 1.3, 0.3],
(_run pid=617599)        [5.1, 3.8, 1.9, 0.4],
(_run pid=617599)        [5. , 2. , 3.5, 1. ],
(_run pid=617599)        [6.3, 2.7, 4.9, 1.8],
(_run pid=617599)        [4.8, 3.4, 1.9, 0.2],
(_run pid=617599)        [5. , 3. , 1.6, 0.2],
(_run pid=617599)        [5.1, 3.3, 1.7, 0.5],
(_run pid=617599)        [5.6, 2.7, 4.2, 1.3],
(_run pid=617599)        [5.1, 3.4, 1.5, 0.2],
(_run pid=617599)        [5.7, 3. , 4.2, 1.2],
(_run pid=617599)        [7.7, 3.8, 6.7, 2.2],
(_run pid=617599)        [4.6, 3.2, 1.4, 0.2],
(_run pid=617599)        [6.2, 2.9, 4.3, 1.3],
(_run pid=617599)        [5.7, 2.5, 5. , 2. ],
(_run pid=617599)        [5.5, 4.2, 1.4, 0.2],
(_run pid=617599)        [6. , 3. , 4.8, 1.8],
(_run pid=617599)        [5.8, 2.7, 5.1, 1.9],
(_run pid=617599)        [6. , 2.2, 4. , 1. ],
(_run pid=617599)        [5.4, 3. , 4.5, 1.5],
(_run pid=617599)        [6.2, 3.4, 5.4, 2.3],
(_run pid=617599)        [5.5, 2.3, 4. , 1.3],
(_run pid=617599)        [5.4, 3.9, 1.7, 0.4],
(_run pid=617599)        [5. , 2.3, 3.3, 1. ],
(_run pid=617599)        [6.4, 2.7, 5.3, 1.9],
(_run pid=617599)        [5. , 3.3, 1.4, 0.2],
(_run pid=617599)        [5. , 3.2, 1.2, 0.2],
(_run pid=617599)        [5.5, 2.4, 3.8, 1.1],
(_run pid=617599)        [6.7, 3. , 5. , 1.7],
(_run pid=617599)        [4.9, 3.1, 1.5, 0.2],
(_run pid=617599)        [5.8, 2.8, 5.1, 2.4],
(_run pid=617599)        [5. , 3.4, 1.5, 0.2],
(_run pid=617599)        [5. , 3.5, 1.6, 0.6],
(_run pid=617599)        [5.9, 3.2, 4.8, 1.8],
(_run pid=617599)        [5.1, 2.5, 3. , 1.1],
(_run pid=617599)        [6.9, 3.2, 5.7, 2.3],
(_run pid=617599)        [6. , 2.7, 5.1, 1.6],
(_run pid=617599)        [6.1, 2.6, 5.6, 1.4],
(_run pid=617599)        [7.7, 3. , 6.1, 2.3],
(_run pid=617599)        [5.5, 2.5, 4. , 1.3],
(_run pid=617599)        [4.4, 2.9, 1.4, 0.2],
(_run pid=617599)        [4.3, 3. , 1.1, 0.1],
(_run pid=617599)        [6. , 2.2, 5. , 1.5],
(_run pid=617599)        [7.2, 3.2, 6. , 1.8],
(_run pid=617599)        [4.6, 3.1, 1.5, 0.2],
(_run pid=617599)        [5.1, 3.5, 1.4, 0.3],
(_run pid=617599)        [4.4, 3. , 1.3, 0.2],
(_run pid=617599)        [6.3, 2.5, 4.9, 1.5],
(_run pid=617599)        [6.3, 3.4, 5.6, 2.4],
(_run pid=617599)        [4.6, 3.4, 1.4, 0.3],
(_run pid=617599)        [6.8, 3. , 5.5, 2.1],
(_run pid=617599)        [6.3, 3.3, 6. , 2.5],
(_run pid=617599)        [4.7, 3.2, 1.3, 0.2],
(_run pid=617599)        [6.1, 2.9, 4.7, 1.4],
(_run pid=617599)        [6.5, 2.8, 4.6, 1.5],
(_run pid=617599)        [6.2, 2.8, 4.8, 1.8],
(_run pid=617599)        [7. , 3.2, 4.7, 1.4],
(_run pid=617599)        [6.4, 3.2, 5.3, 2.3],
(_run pid=617599)        [5.1, 3.8, 1.6, 0.2],
(_run pid=617599)        [6.9, 3.1, 5.4, 2.1],
(_run pid=617599)        [5.9, 3. , 4.2, 1.5],
(_run pid=617599)        [6.5, 3. , 5.2, 2. ],
(_run pid=617599)        [5.7, 2.6, 3.5, 1. ],
(_run pid=617599)        [5.2, 2.7, 3.9, 1.4],
(_run pid=617599)        [6.1, 3. , 4.6, 1.4],
(_run pid=617599)        [4.5, 2.3, 1.3, 0.3],
(_run pid=617599)        [6.6, 2.9, 4.6, 1.3],
(_run pid=617599)        [5.5, 2.6, 4.4, 1.2],
(_run pid=617599)        [5.3, 3.7, 1.5, 0.2],
(_run pid=617599)        [5.6, 3. , 4.1, 1.3],
(_run pid=617599)        [7.3, 2.9, 6.3, 1.8],
(_run pid=617599)        [6.7, 3.3, 5.7, 2.1],
(_run pid=617599)        [5.1, 3.7, 1.5, 0.4],
(_run pid=617599)        [4.9, 2.4, 3.3, 1. ],
(_run pid=617599)        [6.7, 3.3, 5.7, 2.5],
(_run pid=617599)        [7.2, 3. , 5.8, 1.6],
(_run pid=617599)        [4.9, 3.6, 1.4, 0.1],
(_run pid=617599)        [6.7, 3.1, 5.6, 2.4],
(_run pid=617599)        [4.9, 3. , 1.4, 0.2],
(_run pid=617599)        [6.9, 3.1, 4.9, 1.5],
(_run pid=617599)        [7.4, 2.8, 6.1, 1.9],
(_run pid=617599)        [6.3, 2.9, 5.6, 1.8],
(_run pid=617599)        [5.7, 2.8, 4.1, 1.3],
(_run pid=617599)        [6.5, 3. , 5.5, 1.8],
(_run pid=617599)        [6.3, 2.3, 4.4, 1.3],
(_run pid=617599)        [6.4, 2.9, 4.3, 1.3],
(_run pid=617599)        [5.6, 2.8, 4.9, 2. ],
(_run pid=617599)        [5.9, 3. , 5.1, 1.8],
(_run pid=617599)        [5.4, 3.4, 1.7, 0.2],
(_run pid=617599)        [6.1, 2.8, 4. , 1.3],
(_run pid=617599)        [4.9, 2.5, 4.5, 1.7],
(_run pid=617599)        [5.8, 4. , 1.2, 0.2],
(_run pid=617599)        [5.8, 2.6, 4. , 1.2],
(_run pid=617599)        [7.1, 3. , 5.9, 2.1]]), 'target': array([0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0, 0, 1, 2, 2, 1, 2, 1, 2,
(_run pid=617599)        1, 0, 2, 1, 0, 0, 0, 1, 2, 0, 0, 0, 1, 0, 1, 2, 0, 1, 2, 0, 2, 2,
(_run pid=617599)        1, 1, 2, 1, 0, 1, 2, 0, 0, 1, 1, 0, 2, 0, 0, 1, 1, 2, 1, 2, 2, 1,
(_run pid=617599)        0, 0, 2, 2, 0, 0, 0, 1, 2, 0, 2, 2, 0, 1, 1, 2, 1, 2, 0, 2, 1, 2,
(_run pid=617599)        1, 1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 2, 0, 1, 2, 2, 1, 2,
(_run pid=617599)        1, 1, 2, 2, 0, 1, 2, 0, 1, 2])}

func里的形式是

{'data': Traced<ShapedArray(float32[120,4])>with<DynamicJaxprTrace(level=1/0)>, 'target': Traced<ShapedArray(int32[120])>with<DynamicJaxprTrace(level=1/0)>}

报错输出为:

(sf) ➜  /home/leone/secretflow-all git:(master) ✗ JAX_PLATFORMS=cpu python /home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py
2024-03-21 22:32:11,755 INFO worker.py:1538 -- Started a local Ray instance.
(_run pid=613777) WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
(_run pid=613770) WARNING:jax._src.xla_bridge:No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
2024-03-21 22:32:14,804 ERROR worker.py:400 -- Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::SPURuntime.run() (pid=614761, ip=192.168.50.91, repr=SPURuntime(device_id=None, party=bob))
  At least one of the input arguments for this task could not be computed:
ray.exceptions.RayTaskError: ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1614, in _spu_compile
    executable, output_tree = spu_fe.compile(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 219, in compile
    ir_text, output = _jax_compilation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 122, in _jax_compilation
    cfn, output = jax.xla_computation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/traceback_util.py", line 166, in reraise_with_filtered_traceback
    return fun(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/api.py", line 544, in computation_maker
    jaxpr, out_avals, consts = pe.trace_to_jaxpr_dynamic(jaxtree_fun, avals)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/profiler.py", line 314, in wrapper
    return func(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/interpreters/partial_eval.py", line 2155, in trace_to_jaxpr_dynamic
    jaxpr, out_avals, consts = trace_to_subjaxpr_dynamic(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/interpreters/partial_eval.py", line 2177, in trace_to_subjaxpr_dynamic
    ans = fun.call_wrapped(*in_tracers_)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/linear_util.py", line 188, in call_wrapped
    ans = self.f(*args, **dict(self.params, **kwargs))
  File "/home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py", line 19, in func
    model.fit(data_alice['data'], data_alice['target'])
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/svm/_base.py", line 173, in fit
    X, y = self._validate_data(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/core.py", line 598, in __array__
    raise TracerArrayConversionError(self)
jax._src.traceback_util.UnfilteredStackTrace: jax.errors.TracerArrayConversionError: The numpy.ndarray conversion method __array__() was called on the JAX Tracer object Traced<ShapedArray(float32[120,4])>with<DynamicJaxprTrace(level=1/0)>
See https://jax.readthedocs.io/en/latest/errors.html#jax.errors.TracerArrayConversionError

The stack trace below excludes JAX-internal frames.
The preceding is the original exception that occurred, unmodified.

--------------------

The above exception was the direct cause of the following exception:

ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1614, in _spu_compile
    executable, output_tree = spu_fe.compile(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 219, in compile
    ir_text, output = _jax_compilation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 122, in _jax_compilation
    cfn, output = jax.xla_computation(
  File "/home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py", line 19, in func
    model.fit(data_alice['data'], data_alice['target'])
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/svm/_base.py", line 173, in fit
    X, y = self._validate_data(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
jax.errors.TracerArrayConversionError: The numpy.ndarray conversion method __array__() was called on the JAX Tracer object Traced<ShapedArray(float32[120,4])>with<DynamicJaxprTrace(level=1/0)>
See https://jax.readthedocs.io/en/latest/errors.html#jax.errors.TracerArrayConversionError

During handling of the above exception, another exception occurred:

ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1629, in _spu_compile
    raise ray.exceptions.WorkerCrashedError()
ray.exceptions.WorkerCrashedError: The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
Traceback (most recent call last):
  File "/home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py", line 59, in <module>
    revealed_result = sf.reveal(result)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/driver.py", line 153, in reveal
    info, shares_chunk = x.device.outfeed_shares(x.shares_name)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1909, in outfeed_shares
    shares_chunk_count = sfd.get(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/distributed/primitive.py", line 158, in get
    return ray.get(object_refs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/ray/_private/worker.py", line 2309, in get
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError: ray::SPURuntime.outfeed_shares_chunk_count() (pid=614757, ip=192.168.50.91, repr=SPURuntime(device_id=None, party=alice))
  At least one of the input arguments for this task could not be computed:
ray.exceptions.RayTaskError: ray::SPURuntime.run() (pid=614757, ip=192.168.50.91, repr=SPURuntime(device_id=None, party=alice))
  At least one of the input arguments for this task could not be computed:
ray.exceptions.RayTaskError: ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1614, in _spu_compile
    executable, output_tree = spu_fe.compile(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 219, in compile
    ir_text, output = _jax_compilation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 122, in _jax_compilation
    cfn, output = jax.xla_computation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/traceback_util.py", line 166, in reraise_with_filtered_traceback
    return fun(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/api.py", line 544, in computation_maker
    jaxpr, out_avals, consts = pe.trace_to_jaxpr_dynamic(jaxtree_fun, avals)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/profiler.py", line 314, in wrapper
    return func(*args, **kwargs)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/interpreters/partial_eval.py", line 2155, in trace_to_jaxpr_dynamic
    jaxpr, out_avals, consts = trace_to_subjaxpr_dynamic(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/interpreters/partial_eval.py", line 2177, in trace_to_subjaxpr_dynamic
    ans = fun.call_wrapped(*in_tracers_)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/linear_util.py", line 188, in call_wrapped
    ans = self.f(*args, **dict(self.params, **kwargs))
  File "/home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py", line 19, in func
    model.fit(data_alice['data'], data_alice['target'])
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/svm/_base.py", line 173, in fit
    X, y = self._validate_data(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/jax/_src/core.py", line 598, in __array__
    raise TracerArrayConversionError(self)
jax._src.traceback_util.UnfilteredStackTrace: jax.errors.TracerArrayConversionError: The numpy.ndarray conversion method __array__() was called on the JAX Tracer object Traced<ShapedArray(float32[120,4])>with<DynamicJaxprTrace(level=1/0)>
See https://jax.readthedocs.io/en/latest/errors.html#jax.errors.TracerArrayConversionError

The stack trace below excludes JAX-internal frames.
The preceding is the original exception that occurred, unmodified.

--------------------

The above exception was the direct cause of the following exception:

ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1614, in _spu_compile
    executable, output_tree = spu_fe.compile(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 219, in compile
    ir_text, output = _jax_compilation(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/spu/utils/frontend.py", line 122, in _jax_compilation
    cfn, output = jax.xla_computation(
  File "/home/leone/secretflow-all/secretflow-test/seeds/02c4c7370e9ca7887f0db07e6473266f92d354ca87aa6464c17e5f286366e20c_spu_code.py", line 19, in func
    model.fit(data_alice['data'], data_alice['target'])
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/svm/_base.py", line 173, in fit
    X, y = self._validate_data(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/base.py", line 596, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 1074, in check_X_y
    X = check_array(
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/sklearn/utils/validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
jax.errors.TracerArrayConversionError: The numpy.ndarray conversion method __array__() was called on the JAX Tracer object Traced<ShapedArray(float32[120,4])>with<DynamicJaxprTrace(level=1/0)>
See https://jax.readthedocs.io/en/latest/errors.html#jax.errors.TracerArrayConversionError

During handling of the above exception, another exception occurred:

ray::_spu_compile() (pid=613770, ip=192.168.50.91)
  File "/home/leone/anaconda3/envs/sf/lib/python3.9/site-packages/secretflow/device/device/spu.py", line 1629, in _spu_compile
    raise ray.exceptions.WorkerCrashedError()
ray.exceptions.WorkerCrashedError: The worker died unexpectedly while executing this task. Check python-core-worker-*.log files for more information.
(sf) ➜  /home/leone/secretflow-all git:(master) ✗ 

您好,麻烦问一下,`(_run pid=617599)` 这个日志是在什么位置?我是单机仿真模式,控制台并没有输出日志。

@zhouaihui zhouaihui added the module: SPU SPU related issues label Apr 18, 2024
@LeoneChen
Copy link
Author

这个是在里面加print就行了

Copy link

Stale issue message. Please comment to remove stale tag. Otherwise this issue will be closed soon.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

6 participants