---
license: other
language:
- zh
- en
tags:
- chatglm
- glm-4v
- quantization
- auto-gptq
- 4bit
---
## Usage
This model is a 4-bit GPTQ quantization of [THUDM/glm-4v-9b](https://huggingface.co/THUDM/glm-4v-9b), produced with [AutoGPTQ](https://github.com/AutoGPTQ/AutoGPTQ).

AutoGPTQ is required; install it with pip (the quantization script itself will be released later):

```bash
pip install auto-gptq
```

Since the stock auto-gptq library does not support quantizing ChatGLM models, the model class has to be registered manually (a hack):
```python
from auto_gptq.modeling._base import BaseGPTQForCausalLM
from auto_gptq.modeling.auto import GPTQ_CAUSAL_LM_MODEL_MAP


class ChatGLMGPTQForCausalLM(BaseGPTQForCausalLM):
    # Transformer block classes: language blocks, vision blocks, and the vision projection (GLU)
    layer_type = ["GLMBlock", "TransformerLayer", "GLU"]

    # Where those blocks live in the model
    layers_block_names = [
        "transformer.encoder.layers",
        "transformer.vision.transformer.layers",
        "transformer.vision.linear_proj",
    ]

    # Modules outside the quantized blocks
    outside_layer_modules = ["transformer.output_layer"]

    # Linear submodules to quantize, one group per block type above
    inside_layer_modules = [
        ["self_attention.query_key_value", "self_attention.dense", "mlp.dense_h_to_4h", "mlp.dense_4h_to_h"],
        ["attention.query_key_value", "attention.dense", "mlp.fc1", "mlp.fc2"],
        ["linear_proj", "dense_h_to_4h", "gate_proj", "dense_4h_to_h"],
    ]


GPTQ_CAUSAL_LM_MODEL_MAP['chatglm'] = ChatGLMGPTQForCausalLM
```
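With this registration in place, `AutoGPTQForCausalLM.from_quantized` resolves the `chatglm` model type read from the checkpoint's `config.json` to the custom class above; the class itself only tells AutoGPTQ where the quantized linear layers live in the language and vision towers.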

The complete model loading code is as follows:

### Load model
```python
import os

import json
import random
import time

import torch
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM
from auto_gptq import AutoGPTQForCausalLM


# Same ChatGLM registration hack as above
from auto_gptq.modeling._base import BaseGPTQForCausalLM
from auto_gptq.modeling.auto import GPTQ_CAUSAL_LM_MODEL_MAP


class ChatGLMGPTQForCausalLM(BaseGPTQForCausalLM):
    layer_type = ["GLMBlock", "TransformerLayer", "GLU"]

    layers_block_names = [
        "transformer.encoder.layers",
        "transformer.vision.transformer.layers",
        "transformer.vision.linear_proj",
    ]

    outside_layer_modules = ["transformer.output_layer"]

    inside_layer_modules = [
        ["self_attention.query_key_value", "self_attention.dense", "mlp.dense_h_to_4h", "mlp.dense_4h_to_h"],
        ["attention.query_key_value", "attention.dense", "mlp.fc1", "mlp.fc2"],
        ["linear_proj", "dense_h_to_4h", "gate_proj", "dense_4h_to_h"],
    ]


GPTQ_CAUSAL_LM_MODEL_MAP['chatglm'] = ChatGLMGPTQForCausalLM

device = 'cuda:0'
quantized_model_dir = 'alexwww94/glm-4v-9b-gptq'
trust_remote_code = True

tokenizer = AutoTokenizer.from_pretrained(
    quantized_model_dir,
    trust_remote_code=trust_remote_code,
)

model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir,
    device=device,
    trust_remote_code=trust_remote_code,
    torch_dtype=torch.float16,
    use_cache=True,
    inject_fused_mlp=True,
    inject_fused_attention=True,
)
```
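
As a quick sanity check after loading, you can run a text-only prompt through the same chat template used in the inference test below. This is a minimal sketch: the prompt string and generation settings are illustrative, and it assumes the GLM-4V chat template accepts a turn without an image.

```python
# Illustrative text-only check; the query and gen_kwargs are placeholders.
query = {"role": "user", "content": "Describe the capabilities of GLM-4V in one sentence."}

inputs = tokenizer.apply_chat_template(
    [query],
    add_generation_prompt=True,
    tokenize=True,
    return_tensors="pt",
    return_dict=True,
).to(device)

gen_kwargs = {"max_length": 512, "do_sample": True, "top_k": 1}
with torch.inference_mode():
    outputs = model.generate(**inputs, **gen_kwargs)
    outputs = outputs[:, inputs["input_ids"].shape[1]:]
    print(tokenizer.decode(outputs[0]).split('<|endoftext|>')[0])
```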

You can also load the model with HuggingFace Transformers directly, but inference will be slower.

```python
import os

import json
import random
import time

import torch
import datasets
from transformers import AutoTokenizer, AutoModelForCausalLM

device = 'cuda:0'
quantized_model_dir = 'alexwww94/glm-4v-9b-gptq-4bit'
trust_remote_code = True

tokenizer = AutoTokenizer.from_pretrained(
    quantized_model_dir,
    trust_remote_code=trust_remote_code,
)

model = AutoModelForCausalLM.from_pretrained(
    quantized_model_dir,
    torch_dtype=torch.float16,
    device_map="auto",
    low_cpu_mem_usage=True,
    trust_remote_code=trust_remote_code,
    use_cache=True
).eval()
```
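
Whichever loading path you use, generation works the same way: build inputs with `tokenizer.apply_chat_template` and call `model.generate`, as in the inference test below.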

### Inference test
Load the CogVLM-SFT-311K-subset-gptq dataset as test data; this is the dataset that was used for quantization (calibration).

```python
dataset = datasets.load_dataset('alexwww94/CogVLM-SFT-311K-subset-gptq')

for example in dataset['single']:
    conversation = json.loads(example['labels_zh'])['conversations']
    prompt = conversation[0]   # e.g. "为什么马会被围栏限制在一个区域内?" ("Why are horses kept inside a fenced area?")
    answer = conversation[1]
    image = example['image']
    print(f"prompt: {prompt}")
    print("-" * 42)
    print(f"golden: {answer}")
    print("-" * 42)

    start = time.time()

    # Attach the image to the user turn and build chat-mode inputs
    prompt.update({'image': image})
    inputs = tokenizer.apply_chat_template([prompt],
                                           add_generation_prompt=True, tokenize=True, return_tensors="pt",
                                           return_dict=True, dtype=torch.bfloat16)  # chat mode
    inputs = inputs.to(device)
    inputs['images'] = inputs['images'].half()

    gen_kwargs = {"max_length": 2500, "do_sample": True, "top_k": 1}
    with torch.inference_mode():
        outputs = model.generate(**inputs, **gen_kwargs)
        outputs = outputs[:, inputs['input_ids'].shape[1]:]
        generated_text = tokenizer.decode(outputs[0]).split('<|endoftext|>')[0]

    end = time.time()
    print(f"quant: {generated_text}")
    num_new_tokens = len(tokenizer(generated_text)["input_ids"])
    print(f"generated {num_new_tokens} tokens in {end - start:.4f}s, {num_new_tokens / (end - start):.2f} tokens/s")
    print("=" * 42)

    # break
```

### Metrics
(to be released later)