Jan Philipp Harries committed
Commit: 2f586d1
Parent: 9845c5e

Fix pretraining with iterable/streaming Dataset (#556)

* return without packing prep/len
* fix remove columns
* fix encode arguments
* add error when max steps not set
* fix test

---------

Co-authored-by: Jan Philipp Harries <jphme@users.noreply.github.com>

src/axolotl/utils/config.py CHANGED
@@ -191,6 +191,10 @@ def validate_config(cfg):
         LOG.warning(
             "You probably want to disable group_by_length as it will force a streamed dataset to download completely."
         )
+    if cfg.pretraining_dataset and not cfg.max_steps:
+        raise ValueError(
+            "max_steps must be set when using iterable pretraining_dataset, Trainer can't infer length and schedule optimizer/learning rate without it!"
+        )
 
     if any([cfg.adam_beta1, cfg.adam_beta2, cfg.adam_epsilon]) and (
         not cfg.optimizer or "adamw" not in cfg.optimizer
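
For context: a streamed pretraining dataset is an IterableDataset, which has no __len__, so the Trainer cannot infer the total number of steps or build the optimizer/learning-rate schedule from it; that is what the new validation guards against. A minimal sketch (the dataset id is only illustrative, not part of this commit):

from datasets import load_dataset

# Streaming gives an IterableDataset: its length is unknown up front, so the
# number of training steps (and the LR schedule) cannot be derived from the
# data alone -- max_steps has to be set explicitly.
dataset = load_dataset("allenai/c4", "en", split="train", streaming=True)
print(hasattr(dataset, "__len__"))  # False
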
src/axolotl/utils/data.py CHANGED
@@ -3,7 +3,7 @@ import functools
 import hashlib
 import logging
 from pathlib import Path
-from typing import Tuple, Union
+from typing import Dict, List, Tuple, Union
 
 import torch
 from datasets import (
@@ -74,6 +74,7 @@ def prepare_dataset(cfg, tokenizer):
         # https://discuss.huggingface.co/t/how-to-use-huggingface-trainer-streaming-datasets-without-wrapping-it-with-torchdatas-iterablewrapper/25230
         train_dataset = train_dataset.with_format("torch")
         eval_dataset = None
+        return train_dataset, eval_dataset, cfg.max_steps
 
     with zero_first(is_main_process()):
         train_dataset, eval_dataset = process_datasets_for_packing(
@@ -527,9 +528,11 @@ def load_prepare_datasets(
     return train_dataset, eval_dataset
 
 
-def encode_pretraining(tokenizer, max_tokens, examples):
+def encode_pretraining(
+    tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: List[str]
+) -> Dict[str, List]:
     res = tokenizer(
-        examples["text"],
+        examples,
         truncation=True,
         max_length=max_tokens - 2,
         add_special_tokens=True,
@@ -637,6 +640,12 @@ def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42):
     encode = functools.partial(encode_pretraining, tokenizer, max_tokens)
     dataset = load_dataset(path, streaming=True, split="train")
     dataset = dataset.shuffle(seed=seed, buffer_size=10_000)
-    # TODO dynamically figure out which columns/features to remove
-    dataset = dataset.map(encode, batched=True, remove_columns=["text", "meta"])
+    dataset = dataset.map(
+        encode,
+        batched=True,
+        input_columns="text",
+        remove_columns=[
+            "text",
+        ],
+    )
     return dataset
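
The signature change above follows from how datasets' map() behaves when input_columns is set: the mapped function then receives the values of that column directly (a list of strings when batched=True) rather than a dict of all columns, which is why encode_pretraining now tokenizes examples as-is and only "text" has to be removed afterwards. A self-contained toy sketch of that behaviour (the data and function name here are made up):

from datasets import Dataset

# Toy data -- mirrors the input_columns / remove_columns combination used
# in load_pretraining_dataset above.
ds = Dataset.from_dict({"text": ["hello world", "foo bar"], "meta": ["a", "b"]})

def shout(texts):
    # With input_columns="text" and batched=True, texts is a List[str],
    # not a dict like {"text": [...], "meta": [...]}.
    return {"shouted": [t.upper() for t in texts]}

out = ds.map(shout, batched=True, input_columns="text", remove_columns=["text"])
print(sorted(out.column_names))  # ['meta', 'shouted'] -- "text" was dropped
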
tests/test_data.py CHANGED
@@ -35,7 +35,7 @@ class TestEncodePretraining(unittest.TestCase):
                 "hello, hello",
             ]
         }
-        result = encode_pretraining(self.tokenizer, self.max_tokens, examples)
+        result = encode_pretraining(self.tokenizer, self.max_tokens, examples["text"])
 
         self.assertEqual(len(result["input_ids"]), 3)
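
The updated test reflects the new call convention: encode_pretraining now takes the raw list of strings rather than the examples dict. A rough usage sketch (the tokenizer choice, the pad-token handling, and the sample texts are assumptions, not taken from this commit):

from transformers import AutoTokenizer
from axolotl.utils.data import encode_pretraining

# Assumption: encode_pretraining appends eos/pad tokens, so the tokenizer
# needs a pad token; gpt2 is used here only as a readily available example.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

texts = ["hello, hello", "the quick brown fox", "lorem ipsum"]
result = encode_pretraining(tokenizer, 2048, texts)  # max_tokens=2048
print(list(result.keys()))  # includes "input_ids", per the unit test above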