PathFinderKR/Waktaverse-Llama-3-KO-8B-Instruct

Text Generation

text-generation-inference

Inference Endpoints

Model card | Files and versions | Community

PathFinderKR committed on May 25

Commit

dd3197e

•

1 Parent(s): 7d5cfda

Update README.md

Files changed (1) hide show

README.md +11 -13

README.md CHANGED Viewed

@@ -86,7 +86,7 @@ model = AutoModelForCausalLM.from_pretrained(
 ################################################################################
 num_return_sequences=1
 max_new_tokens=1024
-temperature=0.9
 top_p=0.9
 repetition_penalty=1.1
@@ -172,17 +172,17 @@ The model training used LoRA for computational efficiency. 0.04 billion paramete
 # bitsandbytes parameters
 ################################################################################
 load_in_4bit=True
-bnb_4bit_compute_dtype=torch_dtype
 bnb_4bit_quant_type="nf4"
-bnb_4bit_use_double_quant=False
 ################################################################################
 # LoRA parameters
 ################################################################################
 task_type="CAUSAL_LM"
 target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
-r=16
-lora_alpha=32
 lora_dropout=0.1
 bias="none"
@@ -190,20 +190,20 @@ bias="none"
 # TrainingArguments parameters
 ################################################################################
 num_train_epochs=2
-per_device_train_batch_size=1
-gradient_accumulation_steps=1
 gradient_checkpointing=True
 learning_rate=2e-5
 lr_scheduler_type="cosine"
 warmup_ratio=0.1
-optim = "adamw_torch"
 weight_decay=0.01
 ################################################################################
 # SFT parameters
 ################################################################################
-max_seq_length=1024
-packing=True
 ```
@@ -285,9 +285,7 @@ packing=True
 ### Training Details
-- **Training time:** 32 hours
-- **VRAM usage:** 12.8 GB
-- **GPU power usage:** 300 W

 ################################################################################
 num_return_sequences=1
 max_new_tokens=1024
+temperature=0.6
 top_p=0.9
 repetition_penalty=1.1
 # bitsandbytes parameters
 ################################################################################
 load_in_4bit=True
+bnb_4bit_compute_dtype=torch.bfloat16
 bnb_4bit_quant_type="nf4"
+bnb_4bit_use_double_quant=True
 ################################################################################
 # LoRA parameters
 ################################################################################
 task_type="CAUSAL_LM"
 target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]
+r=8
+lora_alpha=16
 lora_dropout=0.1
 bias="none"
 # TrainingArguments parameters
 ################################################################################
 num_train_epochs=2
+per_device_train_batch_size=4
+gradient_accumulation_steps=2
 gradient_checkpointing=True
 learning_rate=2e-5
 lr_scheduler_type="cosine"
 warmup_ratio=0.1
+optim = "paged_adamw_8bit"
 weight_decay=0.01
 ################################################################################
 # SFT parameters
 ################################################################################
+max_seq_length=4096
+packing=False
 ```
 ### Training Details
+- **Training time:** 80 hours