winglian committed
Commit
fe28543
1 Parent(s): 0d2e34f

optimize the iteration when tokenizing large datasets (#332)

Files changed (1)
  1. src/axolotl/utils/data.py +10 -1
src/axolotl/utils/data.py CHANGED
@@ -1,5 +1,6 @@
 """Module containing data utilities"""
 import functools
+import itertools
 import logging
 from hashlib import md5
 from pathlib import Path
@@ -264,8 +265,16 @@ def load_tokenized_prepared_datasets(
         LOG.info("tokenizing, merging, and shuffling master dataset")

         samples: List[int] = []
+        chunk_size = 1000
         for d in datasets:
-            samples = samples + list(d)
+            d_iter = iter(d)
+            while True:
+                chunk = list(itertools.islice(d_iter, chunk_size))
+                if not chunk:
+                    break
+                samples.extend(chunk)
+
+        LOG.info("shuffle")
         dataset = Dataset.from_list(samples).shuffle(seed=seed)
         if cfg.local_rank == 0:
             LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")