winglian committed
Commit 0d28df0
1 Parent(s): 84c7bc4

move filter to before saving so it doesn't happen every time, update runpod manual script

Files changed (3):
  1. README.md +1 -1
  2. scripts/setup-runpod.sh +3 -3
  3. src/axolotl/utils/data.py +12 -12
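
The data.py change below moves the bad-row filter ahead of `save_to_disk`, so it runs once at preparation time rather than on every cached load. A minimal sketch of that load-or-prepare flow, with hypothetical helper names (`load_or_prepare`, `build_dataset`) that are illustrative rather than axolotl's actual API:

```python
import os

from datasets import Dataset, load_from_disk


def load_or_prepare(prepared_ds_path: str, build_dataset) -> Dataset:
    # Hypothetical helper sketching the commit's rationale; not the repo's code.
    if os.path.exists(prepared_ds_path):
        # Later runs take this branch and never re-run the filter.
        return load_from_disk(prepared_ds_path)
    dataset = build_dataset()
    # Filter once, before saving, so the cached copy is already clean.
    # 2048 stands in for cfg.sequence_len.
    dataset = dataset.filter(lambda d: 0 < len(d["input_ids"]) < 2048)
    dataset.save_to_disk(prepared_ds_path)
    return dataset
```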
README.md CHANGED
@@ -155,7 +155,7 @@ use_cpu: false
 - Once you start your runpod, and SSH into it:
 ```shell
 export TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-source <(curl -s https://raw.githubusercontent.com/winglian/axolotl/main/scripts/setup-runpod.sh)
+source <(curl -s https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/dev/scripts/setup-runpod.sh)
 ```

 - Once the setup script completes
scripts/setup-runpod.sh CHANGED
@@ -29,14 +29,14 @@ fi
 # install flash-attn and deepspeed from pre-built wheels for this specific container b/c these take forever to install
 mkdir -p /workspace/wheels
 cd /workspace/wheels
-curl -L -O https://github.com/winglian/axolotl/raw/wheels/wheels/deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
-curl -L -O https://github.com/winglian/axolotl/raw/wheels/wheels/flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
+curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
+curl -L -O https://github.com/OpenAccess-AI-Collective/axolotl/raw/wheels/wheels/flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
 pip install deepspeed-0.9.2%2B7ddc3b01-cp38-cp38-linux_x86_64.whl
 pip install flash_attn-1.0.4-cp38-cp38-linux_x86_64.whl
 pip install "peft @ git+https://github.com/huggingface/peft.git@main" --force-reinstall --no-dependencies

 cd /workspace/
-git clone https://github.com/winglian/axolotl.git
+git clone https://github.com/OpenAccess-AI-Collective/axolotl.git
 cd axolotl
 pip install -e .[int4]
 mkdir -p ~/.cache/huggingface/accelerate/
src/axolotl/utils/data.py CHANGED
@@ -198,6 +198,18 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
         )
         dataset = Dataset.from_list([_ for _ in constant_len_dataset])

+        # filter out bad data
+        dataset = Dataset.from_list(
+            [
+                d
+                for d in dataset
+                if len(d["input_ids"]) < cfg.sequence_len
+                and len(d["input_ids"]) > 0
+                and len(d["input_ids"]) == len(d["attention_mask"])
+                and len(d["input_ids"]) == len(d["labels"])
+            ]
+        )
+
         if cfg.local_rank == 0:
             logging.info(
                 f"Saving packed prepared dataset to disk... {prepared_ds_path}"
@@ -208,18 +220,6 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
             tokenizer, cfg, default_dataset_prepared_path
         )

-        # filter out bad data
-        dataset = Dataset.from_list(
-            [
-                d
-                for d in dataset
-                if len(d["input_ids"]) < cfg.sequence_len
-                and len(d["input_ids"]) > 0
-                and len(d["input_ids"]) == len(d["attention_mask"])
-                and len(d["input_ids"]) == len(d["labels"])
-            ]
-        )
-
         if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
             logging.info(
                 f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"