alexwww94 committed
Commit 760ebb4
1 Parent(s): fd5b301

Update README.md

Files changed (1):
  1. README.md +72 -1
README.md CHANGED
## Usage
This model is quantized using [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ) for [THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b).

Installing AutoGPTQ via pip is required.

(The quantization script will be released later.)

```bash
pip install auto-gptq
```

Since the original auto-gptq library does not support quantizing ChatGLM models, the model class has to be registered manually (a small hack):
```python
from auto_gptq.modeling._base import BaseGPTQForCausalLM
from auto_gptq.modeling.auto import GPTQ_CAUSAL_LM_MODEL_MAP

class ChatGLMGPTQForCausalLM(BaseGPTQForCausalLM):
    # Transformer block classes used by glm-4v: language-model blocks,
    # vision-transformer layers, and the vision GLU projector
    layer_type = ["GLMBlock", "TransformerLayer", "GLU"]

    # Where those blocks live in the model
    layers_block_names = ["transformer.encoder.layers",
                          "transformer.vision.transformer.layers",
                          "transformer.vision.linear_proj"]

    # Modules outside the repeated blocks
    outside_layer_modules = ["transformer.output_layer"]

    # Linear submodules inside each block type, in the same order as layer_type
    inside_layer_modules = [
        ["self_attention.query_key_value", "self_attention.dense", "mlp.dense_h_to_4h", "mlp.dense_4h_to_h"],
        ["attention.query_key_value", "attention.dense", "mlp.fc1", "mlp.fc2"],
        ["linear_proj", "dense_h_to_4h", "gate_proj", "dense_4h_to_h"],
    ]

# Register the class so auto-gptq can resolve the "chatglm" model type
GPTQ_CAUSAL_LM_MODEL_MAP['chatglm'] = ChatGLMGPTQForCausalLM
```
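
For context, once this mapping is registered, ChatGLM-style checkpoints can be passed through the standard auto-gptq quantization API. The sketch below only illustrates that API; it is not the author's quantization script (which has not been released), and the bit width, group size, and calibration handling are assumptions:

```python
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# Illustrative settings - not necessarily those used for this checkpoint.
quantize_config = BaseQuantizeConfig(bits=4, group_size=128, desc_act=False)

# Assumes the ChatGLMGPTQForCausalLM registration above has already been run.
model = AutoGPTQForCausalLM.from_pretrained(
    'THUDM/glm-4v-9b',
    quantize_config,
    trust_remote_code=True,
)

# `examples` would be a list of tokenized calibration samples
# (dicts with input_ids / attention_mask; glm-4v would also need image inputs).
# model.quantize(examples)
# model.save_quantized('glm-4v-9b-gptq')
```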

The complete model-loading code is as follows:

### Load model
```python
# ... (earlier lines of this block are not shown in this commit view; they end with `import datasets`)
from transformers import AutoTokenizer, AutoModelForCausalLM
from auto_gptq import AutoGPTQForCausalLM

from auto_gptq.modeling._base import BaseGPTQForCausalLM
from auto_gptq.modeling.auto import GPTQ_CAUSAL_LM_MODEL_MAP

class ChatGLMGPTQForCausalLM(BaseGPTQForCausalLM):
    layer_type = ["GLMBlock", "TransformerLayer", "GLU"]

    layers_block_names = ["transformer.encoder.layers",
                          "transformer.vision.transformer.layers",
                          "transformer.vision.linear_proj"]

    outside_layer_modules = ["transformer.output_layer"]

    inside_layer_modules = [
        ["self_attention.query_key_value", "self_attention.dense", "mlp.dense_h_to_4h", "mlp.dense_4h_to_h"],
        ["attention.query_key_value", "attention.dense", "mlp.fc1", "mlp.fc2"],
        ["linear_proj", "dense_h_to_4h", "gate_proj", "dense_4h_to_h"],
    ]

GPTQ_CAUSAL_LM_MODEL_MAP['chatglm'] = ChatGLMGPTQForCausalLM

device = 'cuda:0'
quantized_model_dir = 'alexwww94/glm-4v-9b-gptq'
trust_remote_code = True
# ... (the rest of this block is not shown in this commit view; the checkpoint is then
#      loaded with AutoGPTQForCausalLM.from_quantized(...))
```
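
The call that actually loads the quantized checkpoint falls in the part of the README this commit view does not show (the hunk context indicates it uses `AutoGPTQForCausalLM.from_quantized`). A minimal sketch of that loading step, continuing the variables defined above; the specific keyword arguments are assumptions, not the README's exact code:

```python
# Sketch under assumptions - not the README's exact call.
tokenizer = AutoTokenizer.from_pretrained(
    quantized_model_dir,
    trust_remote_code=trust_remote_code,
)

model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir,
    device=device,
    trust_remote_code=trust_remote_code,
    torch_dtype=torch.float16,
    use_safetensors=True,
)
```

Inference then typically follows the base glm-4v-9b usage: build inputs with the tokenizer's chat template (including the image) and call `model.generate`.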
 
You can also load the model using HuggingFace Transformers, but it will slow down inference.

```python
import os

import json
import random
import time

import torch
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM

device = 'cuda:0'
quantized_model_dir = 'alexwww94/glm-4v-9b-gptq-4bit'
trust_remote_code = True

tokenizer = AutoTokenizer.from_pretrained(
    quantized_model_dir,
    trust_remote_code=trust_remote_code,
)

model = AutoModelForCausalLM.from_pretrained(
    quantized_model_dir,
    torch_dtype=torch.float16,
    # ... (remaining arguments are not shown in this commit view)
```