Skip to content

Commit cafae09

Browse files
committed
Renamed the new `dataset_id_suffix` argument to `fingerprint`
1 parent 07df063 commit cafae09

File tree

4 files changed

+16
-11
lines changed

4 files changed

+16
-11
lines changed

src/datasets/arrow_dataset.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1047,7 +1047,7 @@ def from_generator(
10471047
gen_kwargs: Optional[dict] = None,
10481048
num_proc: Optional[int] = None,
10491049
split: NamedSplit = Split.TRAIN,
1050-
dataset_id_suffix: Optional[str] = None,
1050+
fingerprint: Optional[str] = None,
10511051
**kwargs,
10521052
):
10531053
"""Create a Dataset from a generator.
@@ -1074,9 +1074,9 @@ def from_generator(
10741074
Split name to be assigned to the dataset.
10751075
10761076
<Added version="2.21.0"/>
1077-
dataset_id_suffix (`str`, *optional*):
1078-
Suffix that will be used to generate dataset ID.
1079-
By default `dataset_id_suffix` is generated by hashing all the args which can be slow in case of a large dataset.
1077+
fingerprint (`str`, *optional*):
1078+
Fingerprint that will be used to generate the dataset ID.
1079+
By default, `fingerprint` is generated by hashing all the args, which can be slow for a large dataset.
10801080
10811081
<Added version="3.6.0"/>
10821082
**kwargs (additional keyword arguments):
@@ -1116,7 +1116,7 @@ def from_generator(
11161116
gen_kwargs=gen_kwargs,
11171117
num_proc=num_proc,
11181118
split=split,
1119-
dataset_id_suffix=dataset_id_suffix,
1119+
fingerprint=fingerprint,
11201120
**kwargs,
11211121
).read()
11221122

src/datasets/builder.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,7 @@ def create_config_id(
141141
self,
142142
config_kwargs: dict,
143143
custom_features: Optional[Features] = None,
144+
fingerprint: Optional[str] = None,
144145
) -> str:
145146
"""
146147
The config id is used to build the cache directory.
@@ -155,8 +156,9 @@ def create_config_id(
155156
"""
156157
# Possibly add a suffix to the name to handle custom features/data_files/config_kwargs
157158
suffix: Optional[str] = None
158-
if "dataset_id_suffix" in config_kwargs and config_kwargs["dataset_id_suffix"] is not None:
159-
suffix = config_kwargs["dataset_id_suffix"]
159+
160+
if fingerprint is not None:
161+
suffix = fingerprint
160162
else:
161163
config_kwargs_to_add_to_suffix = config_kwargs.copy()
162164
# name and version are already used to build the cache directory
@@ -316,6 +318,7 @@ def __init__(
316318
data_dir: Optional[str] = None,
317319
storage_options: Optional[dict] = None,
318320
writer_batch_size: Optional[int] = None,
321+
fingerprint: Optional[str] = None,
319322
**config_kwargs,
320323
):
321324
# DatasetBuilder name
@@ -346,6 +349,7 @@ def __init__(
346349
self.config, self.config_id = self._create_builder_config(
347350
config_name=config_name,
348351
custom_features=features,
352+
fingerprint=fingerprint,
349353
**config_kwargs,
350354
)
351355

@@ -536,7 +540,7 @@ def get_exported_dataset_info(self) -> DatasetInfo:
536540
return self.get_all_exported_dataset_infos().get(self.config.name, DatasetInfo())
537541

538542
def _create_builder_config(
539-
self, config_name=None, custom_features=None, **config_kwargs
543+
self, config_name=None, custom_features=None, fingerprint=None, **config_kwargs
540544
) -> tuple[BuilderConfig, str]:
541545
"""Create and validate BuilderConfig object as well as a unique config id for this config.
542546
Raises ValueError if there are multiple builder configs and config_name and DEFAULT_CONFIG_NAME are None.
@@ -607,6 +611,7 @@ def _create_builder_config(
607611
config_id = builder_config.create_config_id(
608612
config_kwargs,
609613
custom_features=custom_features,
614+
fingerprint=fingerprint,
610615
)
611616
is_custom = (config_id not in self.builder_configs) and config_id != "default"
612617
if is_custom:

src/datasets/io/generator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def __init__(
1616
gen_kwargs: Optional[dict] = None,
1717
num_proc: Optional[int] = None,
1818
split: NamedSplit = Split.TRAIN,
19-
dataset_id_suffix: Optional[str] = None,
19+
fingerprint: Optional[str] = None,
2020
**kwargs,
2121
):
2222
super().__init__(
@@ -33,7 +33,7 @@ def __init__(
3333
generator=generator,
3434
gen_kwargs=gen_kwargs,
3535
split=split,
36-
dataset_id_suffix=dataset_id_suffix,
36+
fingerprint=fingerprint,
3737
**kwargs,
3838
)
3939

src/datasets/packaged_modules/generator/generator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ class GeneratorConfig(datasets.BuilderConfig):
1010
gen_kwargs: Optional[dict] = None
1111
features: Optional[datasets.Features] = None
1212
split: datasets.NamedSplit = datasets.Split.TRAIN
13-
dataset_id_suffix: Optional[str] = None
13+
fingerprint: Optional[str] = None
1414

1515
def __post_init__(self):
1616
super().__post_init__()

0 commit comments

Comments
 (0)