winglian committed on
Commit
1e5014a
2 Parent(s): 4066c78 78a1e1f

Merge pull request #255 from OpenAccess-AI-Collective/open-orca-prompts

Browse files
README.md CHANGED
@@ -195,6 +195,10 @@ Have dataset(s) in one of the following format (JSONL recommended):
195
  ```json
196
  {"message_1": "...", "message_2": "..."}
197
  ```
 
 
 
 
198
  - `context_qa`: in context question answering from an article
199
  ```json
200
  {"article": "...", "question": "...", "answer": "..."}
 
195
  ```json
196
  {"message_1": "...", "message_2": "..."}
197
  ```
198
+ - `alpaca_w_system.load_open_orca`: support for Open Orca datasets with included system prompts (instruct format)
199
+ ```json
200
+ {"system_prompt": "...", "question": "...", "response": "..."}
201
+ ```
202
  - `context_qa`: in context question answering from an article
203
  ```json
204
  {"article": "...", "question": "...", "answer": "..."}
src/axolotl/prompt_strategies/alpaca_w_system.py CHANGED
@@ -75,6 +75,20 @@ class SystemDataPrompter(AlpacaPrompter):
75
  yield res
76
 
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def load(tokenizer, cfg):
79
  return load_chat(tokenizer, cfg)
80
 
@@ -95,3 +109,12 @@ def load_chat(tokenizer, cfg):
95
  cfg.train_on_inputs,
96
  cfg.sequence_len,
97
  )
 
 
 
 
 
 
 
 
 
 
75
  yield res
76
 
77
 
78
class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
    """Tokenizing strategy for OpenOrca-format rows.

    OpenOrca rows carry ``system_prompt``/``question``/``response`` keys;
    this maps them onto the (instruction, input, response, system) tuple
    that the base strategy consumes.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
        instruction = prompt["question"]
        response = prompt["response"]
        system = prompt["system_prompt"]
        # OpenOrca has no separate "input" field, so that slot stays empty.
        return instruction, "", response, system
90
+
91
+
92
def load(tokenizer, cfg):
    """Default entry point — delegates to the chat-style loader."""
    return load_chat(tokenizer, cfg)
94
 
 
109
  cfg.train_on_inputs,
110
  cfg.sequence_len,
111
  )
112
+
113
+
114
def load_open_orca(tokenizer, cfg):
    """Build the OpenOrca tokenizing strategy.

    Uses a SystemDataPrompter in instruct style, wiring in the tokenizer
    and the ``train_on_inputs`` / ``sequence_len`` settings from ``cfg``.
    """
    prompter = SystemDataPrompter(PromptStyle.INSTRUCT.value)
    return OpenOrcaPromptTokenizingStrategy(
        prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )