You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
got exception:
Generating train split: 1834 examples [00:00, 5227.98 examples/s]
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 2011, in _prepare_split_single
writer.write_table(table)
File "/usr/local/lib/python3.11/dist-packages/datasets/arrow_writer.py", line 585, in write_table
pa_table = table_cast(pa_table, self._schema)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 2295, in table_cast
return cast_table_to_schema(table, schema)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 2254, in cast_table_to_schema
arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 2254, in
arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 1802, in wrapper
return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 1802, in
return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 2018, in cast_array_to_feature
casted_array_values = _c(array.values, feature[0])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 1804, in wrapper
return func(array, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 2115, in cast_array_to_feature
raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{feature}")
TypeError: Couldn't cast array of type
struct<m.name: string, x.name: string, p.name: string, n.name: string, h.name: string, name: string, c: int64, collect(r.name): list<item: string>, q.name: string, rel.name: string, count(p): int64, 1: int64, p.location: string, max(n.name): null, mn.name: string, p.time: int64, min(q.name): string>
to
{'q.name': Value(dtype='string', id=None), 'mn.name': Value(dtype='string', id=None), 'x.name': Value(dtype='string', id=None), 'p.name': Value(dtype='string', id=None), 'n.name': Value(dtype='string', id=None), 'name': Value(dtype='string', id=None), 'm.name': Value(dtype='string', id=None), 'h.name': Value(dtype='string', id=None), 'count(p)': Value(dtype='int64', id=None), 'rel.name': Value(dtype='string', id=None), 'c': Value(dtype='int64', id=None), 'collect(r.name)': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), '1': Value(dtype='int64', id=None), 'p.location': Value(dtype='string', id=None), 'substring(h.name,0,5)': Value(dtype='string', id=None), 'p.time': Value(dtype='int64', id=None)}
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/ubuntu/llm/train-2.py", line 150, in
dataset = load_dataset("Doraemon-AI/text-to-neo4j-cypher-chinese")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/load.py", line 2609, in load_dataset
builder_instance.download_and_prepare(
File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 1027, in download_and_prepare
self._download_and_prepare(
File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 1122, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 1882, in _prepare_split
for job_id, done, content in self._prepare_split_single(
File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 2038, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
I encountered this same issue when loading a customized dataset for ORPO training, in which there were three columns and two of them were lists.
I debugged and found that it might be caused by the type-infer mechanism and because in some chunks one of the columns is always an empty list ([]), it was regarded as list<item: null>, however in some other chunk it was list<item: string>. This triggered a TypeError running the function table_cast().
I temporarily fixed this by re-dumping the file into a regular JSON format instead of lines of JSON dict. I didn't dig deeper for the lack of knowledge and programming ability but I do hope some developer of this repo will find and fix it.
Describe the bug
dataset = load_dataset("Doraemon-AI/text-to-neo4j-cypher-chinese")
got exception:
Generating train split: 1834 examples [00:00, 5227.98 examples/s]
Traceback (most recent call last):
File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 2011, in _prepare_split_single
writer.write_table(table)
File "/usr/local/lib/python3.11/dist-packages/datasets/arrow_writer.py", line 585, in write_table
pa_table = table_cast(pa_table, self._schema)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 2295, in table_cast
return cast_table_to_schema(table, schema)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 2254, in cast_table_to_schema
arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 2254, in
arrays = [cast_array_to_feature(table[name], feature) for name, feature in features.items()]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 1802, in wrapper
return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 1802, in
return pa.chunked_array([func(chunk, *args, **kwargs) for chunk in array.chunks])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 2018, in cast_array_to_feature
casted_array_values = _c(array.values, feature[0])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 1804, in wrapper
return func(array, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/table.py", line 2115, in cast_array_to_feature
raise TypeError(f"Couldn't cast array of type\n{array.type}\nto\n{feature}")
TypeError: Couldn't cast array of type
struct<m.name: string, x.name: string, p.name: string, n.name: string, h.name: string, name: string, c: int64, collect(r.name): list<item: string>, q.name: string, rel.name: string, count(p): int64, 1: int64, p.location: string, max(n.name): null, mn.name: string, p.time: int64, min(q.name): string>
to
{'q.name': Value(dtype='string', id=None), 'mn.name': Value(dtype='string', id=None), 'x.name': Value(dtype='string', id=None), 'p.name': Value(dtype='string', id=None), 'n.name': Value(dtype='string', id=None), 'name': Value(dtype='string', id=None), 'm.name': Value(dtype='string', id=None), 'h.name': Value(dtype='string', id=None), 'count(p)': Value(dtype='int64', id=None), 'rel.name': Value(dtype='string', id=None), 'c': Value(dtype='int64', id=None), 'collect(r.name)': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None), '1': Value(dtype='int64', id=None), 'p.location': Value(dtype='string', id=None), 'substring(h.name,0,5)': Value(dtype='string', id=None), 'p.time': Value(dtype='int64', id=None)}
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/home/ubuntu/llm/train-2.py", line 150, in
dataset = load_dataset("Doraemon-AI/text-to-neo4j-cypher-chinese")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.11/dist-packages/datasets/load.py", line 2609, in load_dataset
builder_instance.download_and_prepare(
File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 1027, in download_and_prepare
self._download_and_prepare(
File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 1122, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 1882, in _prepare_split
for job_id, done, content in self._prepare_split_single(
File "/usr/local/lib/python3.11/dist-packages/datasets/builder.py", line 2038, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
Steps to reproduce the bug
dataset = load_dataset("Doraemon-AI/text-to-neo4j-cypher-chinese")
Expected behavior
no exception
Environment info
python 3.11
datasets 2.19.0
The text was updated successfully, but these errors were encountered: