Nanobit committed on
Commit
cf68153
1 Parent(s): bd3c5a5

Add CompletionPrompt type

Browse files
src/axolotl/prompt_tokenizers.py CHANGED
@@ -125,6 +125,25 @@ class NomicGPT4AllPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
125
  )
126
 
127
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
128
  class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy):
129
  def parse_instruction_fields(self, prompt) -> (str, str, str, str, str):
130
  raise NotImplementedError
 
125
  )
126
 
127
 
128
+ class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
129
+ def parse_instruction_fields(self, prompt) -> (str):
130
+ return (
131
+ prompt["text"]
132
+ )
133
+
134
+ def tokenize_prompt(self, prompt):
135
+ text = self.parse_instruction_fields(prompt)
136
+ full_prompt = self._build_full_prompt(text)
137
+ tokenized_full_prompt = self._tokenize(full_prompt)
138
+
139
+ return tokenized_full_prompt
140
+
141
+ def _build_full_prompt(self, text):
142
+ return self.prompter.build_prompt(
143
+ text
144
+ )
145
+
146
+
147
  class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy):
148
  def parse_instruction_fields(self, prompt) -> (str, str, str, str, str):
149
  raise NotImplementedError
src/axolotl/prompters.py CHANGED
@@ -35,6 +35,17 @@ class JeopardyPrompter(AlpacaPrompter):
35
  prompt_input = "Below is a Jeopardy clue paired with input providing the category of the clue. Write a concise response that best answers tbe clue given the category.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
38
  class GPTeacherPrompter(AlpacaPrompter):
39
  ...
40
 
 
35
  prompt_input = "Below is a Jeopardy clue paired with input providing the category of the clue. Write a concise response that best answers tbe clue given the category.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
36
 
37
 
38
+ class CompletionPrompter(AlpacaPrompter):
39
+ def build_prompt(
40
+ self,
41
+ text: str
42
+ ) -> str:
43
+ return text
44
+
45
+ def get_response(self, output: str) -> str:
46
+ return output.strip()
47
+
48
+
49
  class GPTeacherPrompter(AlpacaPrompter):
50
  ...
51
 
src/axolotl/utils/data.py CHANGED
@@ -11,13 +11,17 @@ from axolotl.prompt_tokenizers import (
11
  GPTeacherPromptTokenizingStrategy,
12
  OpenAssistantPromptTokenizingStrategy,
13
  AlpacaReflectionPTStrategy,
14
- ShareGPTPromptTokenizingStrategy, JeopardyPromptTokenizingStrategy,
 
 
15
  )
16
  from axolotl.prompters import (
17
  AlpacaPrompter,
18
  GPTeacherPrompter,
19
  ReflectAlpacaPrompter,
20
- ShareGPTPrompter, JeopardyPrompter,
 
 
21
  )
22
 
23
 
@@ -118,6 +122,15 @@ def load_prepare_datasets(tokenizer, cfg, default_dataset_prepared_path):
118
  )
119
  ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
120
  datasets.append(ds_wrapper)
 
 
 
 
 
 
 
 
 
121
  else:
122
  logging.error(f"unhandled prompt tokenization strategy: {d.type}")
123
  logging.info("tokenizing, merging, and shuffling master dataset")
 
11
  GPTeacherPromptTokenizingStrategy,
12
  OpenAssistantPromptTokenizingStrategy,
13
  AlpacaReflectionPTStrategy,
14
+ ShareGPTPromptTokenizingStrategy,
15
+ JeopardyPromptTokenizingStrategy,
16
+ CompletionPromptTokenizingStrategy,
17
  )
18
  from axolotl.prompters import (
19
  AlpacaPrompter,
20
  GPTeacherPrompter,
21
  ReflectAlpacaPrompter,
22
+ ShareGPTPrompter,
23
+ JeopardyPrompter,
24
+ CompletionPrompter,
25
  )
26
 
27
 
 
122
  )
123
  ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
124
  datasets.append(ds_wrapper)
125
+ elif d.type == "completion":
126
+ ds_strategy = CompletionPromptTokenizingStrategy(
127
+ CompletionPrompter(),
128
+ tokenizer,
129
+ cfg.train_on_inputs,
130
+ cfg.sequence_len,
131
+ )
132
+ ds_wrapper = TokenizedPromptDataset(ds_strategy, ds["train"])
133
+ datasets.append(ds_wrapper)
134
  else:
135
  logging.error(f"unhandled prompt tokenization strategy: {d.type}")
136
  logging.info("tokenizing, merging, and shuffling master dataset")