Update README.md
README.md
CHANGED
@@ -1,7 +1,38 @@
## Usage

This model is [THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b) quantized with [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ).

Installing AutoGPTQ via pip is required.

(The quantization script will be released later.)

```bash
pip install auto-gptq
```

Since the upstream auto-gptq library does not support quantizing ChatGLM models, a manual registration (hack) is required:

```python
from auto_gptq.modeling._base import BaseGPTQForCausalLM
from auto_gptq.modeling.auto import GPTQ_CAUSAL_LM_MODEL_MAP

class ChatGLMGPTQForCausalLM(BaseGPTQForCausalLM):
    layer_type = ["GLMBlock", "TransformerLayer", "GLU"]

    layers_block_names = ["transformer.encoder.layers",
                          "transformer.vision.transformer.layers",
                          "transformer.vision.linear_proj"]

    outside_layer_modules = ["transformer.output_layer"]

    inside_layer_modules = [
        ["self_attention.query_key_value", "self_attention.dense", "mlp.dense_h_to_4h", "mlp.dense_4h_to_h"],
        ["attention.query_key_value", "attention.dense", "mlp.fc1", "mlp.fc2"],
        ["linear_proj", "dense_h_to_4h", "gate_proj", "dense_4h_to_h"],
    ]

GPTQ_CAUSAL_LM_MODEL_MAP['chatglm'] = ChatGLMGPTQForCausalLM
```
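
Roughly speaking, `layers_block_names` points the quantizer at the three module paths whose blocks are processed one at a time (the GLM text encoder, the vision transformer, and the vision projection), `inside_layer_modules` lists the linear submodules quantized inside each of those block types, and `outside_layer_modules` names modules kept outside the block-wise loop. Registering the class under the `'chatglm'` key is what lets `AutoGPTQForCausalLM` resolve the `model_type` declared in the glm-4v-9b config.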

The complete model loading code is as follows:

### Load model
```python
@@ -16,6 +47,27 @@ import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM
from auto_gptq import AutoGPTQForCausalLM

from auto_gptq.modeling._base import BaseGPTQForCausalLM
from auto_gptq.modeling.auto import GPTQ_CAUSAL_LM_MODEL_MAP

class ChatGLMGPTQForCausalLM(BaseGPTQForCausalLM):
    layer_type = ["GLMBlock", "TransformerLayer", "GLU"]

    layers_block_names = ["transformer.encoder.layers",
                          "transformer.vision.transformer.layers",
                          "transformer.vision.linear_proj"]

    outside_layer_modules = ["transformer.output_layer"]

    inside_layer_modules = [
        ["self_attention.query_key_value", "self_attention.dense", "mlp.dense_h_to_4h", "mlp.dense_4h_to_h"],
        ["attention.query_key_value", "attention.dense", "mlp.fc1", "mlp.fc2"],
        ["linear_proj", "dense_h_to_4h", "gate_proj", "dense_4h_to_h"],
    ]

GPTQ_CAUSAL_LM_MODEL_MAP['chatglm'] = ChatGLMGPTQForCausalLM

device = 'cuda:0'
quantized_model_dir = 'alexwww94/glm-4v-9b-gptq'
trust_remote_code = True
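
# A minimal sketch (not the README's own call, which is not shown in this hunk):
# once the ChatGLM class is registered above, the quantized checkpoint can be
# loaded through AutoGPTQ roughly like this; the argument choices here are
# assumptions, not the author's exact settings.
tokenizer = AutoTokenizer.from_pretrained(
    quantized_model_dir,
    trust_remote_code=trust_remote_code,
)
model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir,
    device=device,
    trust_remote_code=trust_remote_code,
)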

@@ -39,6 +91,25 @@ model = AutoGPTQForCausalLM.from_quantized(
You can also load the model with Hugging Face Transformers, but inference will be slower.

```python
import os
import json
import random
import time

import torch
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM

device = 'cuda:0'
quantized_model_dir = 'alexwww94/glm-4v-9b-gptq-4bit'
trust_remote_code = True

tokenizer = AutoTokenizer.from_pretrained(
    quantized_model_dir,
    trust_remote_code=trust_remote_code,
)

model = AutoModelForCausalLM.from_pretrained(
    quantized_model_dir,
    torch_dtype=torch.float16,