adams-story committed on
Commit
65bd4c2
1 Parent(s): a1de053

Upload 2 files

Browse files
Files changed (2) hide show
  1. config.json +41 -331
  2. pytorch_model.bin +2 -2
config.json CHANGED
@@ -1,336 +1,46 @@
1
  {
2
  "architectures": [
3
- "VQCLIPModel"
4
  ],
5
- "clip_config_dict": {
6
- "_commit_hash": "8d052a0f05efbaefbc9e8786ba291cfdf93e5bff",
7
- "_name_or_path": "clip-vit-large-patch14/",
8
- "add_cross_attention": false,
9
- "architectures": [
10
- "CLIPModel"
11
- ],
12
- "bad_words_ids": null,
13
- "begin_suppress_tokens": null,
14
- "bos_token_id": null,
15
- "chunk_size_feed_forward": 0,
16
- "cross_attention_hidden_size": null,
17
- "decoder_start_token_id": null,
18
- "diversity_penalty": 0.0,
19
- "do_sample": false,
20
- "early_stopping": false,
21
- "encoder_no_repeat_ngram_size": 0,
22
- "eos_token_id": null,
23
- "exponential_decay_length_penalty": null,
24
- "finetuning_task": null,
25
- "forced_bos_token_id": null,
26
- "forced_eos_token_id": null,
27
- "id2label": {
28
- "0": "LABEL_0",
29
- "1": "LABEL_1"
30
- },
31
- "initializer_factor": 1.0,
32
- "is_decoder": false,
33
- "is_encoder_decoder": false,
34
- "label2id": {
35
- "LABEL_0": 0,
36
- "LABEL_1": 1
37
- },
38
- "length_penalty": 1.0,
39
- "logit_scale_init_value": 2.6592,
40
- "max_length": 20,
41
- "min_length": 0,
42
- "model_type": "clip",
43
- "no_repeat_ngram_size": 0,
44
- "num_beam_groups": 1,
45
- "num_beams": 1,
46
- "num_return_sequences": 1,
47
- "output_attentions": false,
48
- "output_hidden_states": false,
49
- "output_scores": false,
50
- "pad_token_id": null,
51
- "prefix": null,
52
- "problem_type": null,
53
- "projection_dim": 768,
54
- "pruned_heads": {},
55
- "remove_invalid_values": false,
56
- "repetition_penalty": 1.0,
57
- "return_dict": true,
58
- "return_dict_in_generate": false,
59
- "sep_token_id": null,
60
- "suppress_tokens": null,
61
- "task_specific_params": null,
62
- "temperature": 1.0,
63
- "text_config": {
64
- "_name_or_path": "",
65
- "add_cross_attention": false,
66
- "architectures": null,
67
- "attention_dropout": 0.0,
68
- "bad_words_ids": null,
69
- "begin_suppress_tokens": null,
70
- "bos_token_id": 0,
71
- "chunk_size_feed_forward": 0,
72
- "cross_attention_hidden_size": null,
73
- "decoder_start_token_id": null,
74
- "diversity_penalty": 0.0,
75
- "do_sample": false,
76
- "dropout": 0.0,
77
- "early_stopping": false,
78
- "encoder_no_repeat_ngram_size": 0,
79
- "eos_token_id": 2,
80
- "exponential_decay_length_penalty": null,
81
- "finetuning_task": null,
82
- "forced_bos_token_id": null,
83
- "forced_eos_token_id": null,
84
- "hidden_act": "quick_gelu",
85
- "hidden_size": 768,
86
- "id2label": {
87
- "0": "LABEL_0",
88
- "1": "LABEL_1"
89
- },
90
- "initializer_factor": 1.0,
91
- "initializer_range": 0.02,
92
- "intermediate_size": 3072,
93
- "is_decoder": false,
94
- "is_encoder_decoder": false,
95
- "label2id": {
96
- "LABEL_0": 0,
97
- "LABEL_1": 1
98
- },
99
- "layer_norm_eps": 1e-05,
100
- "length_penalty": 1.0,
101
- "max_length": 20,
102
- "max_position_embeddings": 77,
103
- "min_length": 0,
104
- "model_type": "clip_text_model",
105
- "no_repeat_ngram_size": 0,
106
- "num_attention_heads": 12,
107
- "num_beam_groups": 1,
108
- "num_beams": 1,
109
- "num_hidden_layers": 12,
110
- "num_return_sequences": 1,
111
- "output_attentions": false,
112
- "output_hidden_states": false,
113
- "output_scores": false,
114
- "pad_token_id": 1,
115
- "prefix": null,
116
- "problem_type": null,
117
- "projection_dim": 768,
118
- "pruned_heads": {},
119
- "remove_invalid_values": false,
120
- "repetition_penalty": 1.0,
121
- "return_dict": true,
122
- "return_dict_in_generate": false,
123
- "sep_token_id": null,
124
- "suppress_tokens": null,
125
- "task_specific_params": null,
126
- "temperature": 1.0,
127
- "tf_legacy_loss": false,
128
- "tie_encoder_decoder": false,
129
- "tie_word_embeddings": true,
130
- "tokenizer_class": null,
131
- "top_k": 50,
132
- "top_p": 1.0,
133
- "torch_dtype": null,
134
- "torchscript": false,
135
- "transformers_version": "4.30.1",
136
- "typical_p": 1.0,
137
- "use_bfloat16": false,
138
- "vocab_size": 49408
139
- },
140
- "tf_legacy_loss": false,
141
- "tie_encoder_decoder": false,
142
- "tie_word_embeddings": true,
143
- "tokenizer_class": null,
144
- "top_k": 50,
145
- "top_p": 1.0,
146
- "torch_dtype": "float32",
147
- "torchscript": false,
148
- "transformers_version": null,
149
- "typical_p": 1.0,
150
- "use_bfloat16": false,
151
- "vision_config": {
152
- "_name_or_path": "",
153
- "add_cross_attention": false,
154
- "architectures": null,
155
- "attention_dropout": 0.0,
156
- "bad_words_ids": null,
157
- "begin_suppress_tokens": null,
158
- "bos_token_id": null,
159
- "chunk_size_feed_forward": 0,
160
- "cross_attention_hidden_size": null,
161
- "decoder_start_token_id": null,
162
- "diversity_penalty": 0.0,
163
- "do_sample": false,
164
- "dropout": 0.0,
165
- "early_stopping": false,
166
- "encoder_no_repeat_ngram_size": 0,
167
- "eos_token_id": null,
168
- "exponential_decay_length_penalty": null,
169
- "finetuning_task": null,
170
- "forced_bos_token_id": null,
171
- "forced_eos_token_id": null,
172
- "hidden_act": "quick_gelu",
173
- "hidden_size": 1024,
174
- "id2label": {
175
- "0": "LABEL_0",
176
- "1": "LABEL_1"
177
- },
178
- "image_size": 224,
179
- "initializer_factor": 1.0,
180
- "initializer_range": 0.02,
181
- "intermediate_size": 4096,
182
- "is_decoder": false,
183
- "is_encoder_decoder": false,
184
- "label2id": {
185
- "LABEL_0": 0,
186
- "LABEL_1": 1
187
- },
188
- "layer_norm_eps": 1e-05,
189
- "length_penalty": 1.0,
190
- "max_length": 20,
191
- "min_length": 0,
192
- "model_type": "clip_vision_model",
193
- "no_repeat_ngram_size": 0,
194
- "num_attention_heads": 16,
195
- "num_beam_groups": 1,
196
- "num_beams": 1,
197
- "num_channels": 3,
198
- "num_hidden_layers": 24,
199
- "num_return_sequences": 1,
200
- "output_attentions": false,
201
- "output_hidden_states": false,
202
- "output_scores": false,
203
- "pad_token_id": null,
204
- "patch_size": 14,
205
- "prefix": null,
206
- "problem_type": null,
207
- "projection_dim": 768,
208
- "pruned_heads": {},
209
- "remove_invalid_values": false,
210
- "repetition_penalty": 1.0,
211
- "return_dict": true,
212
- "return_dict_in_generate": false,
213
- "sep_token_id": null,
214
- "suppress_tokens": null,
215
- "task_specific_params": null,
216
- "temperature": 1.0,
217
- "tf_legacy_loss": false,
218
- "tie_encoder_decoder": false,
219
- "tie_word_embeddings": true,
220
- "tokenizer_class": null,
221
- "top_k": 50,
222
- "top_p": 1.0,
223
- "torch_dtype": null,
224
- "torchscript": false,
225
- "transformers_version": "4.30.1",
226
- "typical_p": 1.0,
227
- "use_bfloat16": false
228
- }
229
- },
230
- "model_type": "VQCLIP",
231
- "text_vq_adapter_config_dict": null,
232
  "torch_dtype": "float32",
233
- "transformers_version": "4.30.2",
234
- "vision_vq_adapter_config_dict": {
235
- "_name_or_path": "",
236
- "add_cross_attention": false,
237
- "architectures": null,
238
- "bad_words_ids": null,
239
- "begin_suppress_tokens": null,
240
- "bos_token_id": null,
241
- "chunk_size_feed_forward": 0,
242
- "clip_dim": 768,
243
- "cross_attention_hidden_size": null,
244
- "decoder_start_token_id": null,
245
- "diversity_penalty": 0.0,
246
- "do_sample": false,
247
- "early_stopping": false,
248
- "encoder_no_repeat_ngram_size": 0,
249
- "eos_token_id": null,
250
- "exponential_decay_length_penalty": null,
251
- "finetuning_task": null,
252
- "forced_bos_token_id": null,
253
- "forced_eos_token_id": null,
254
- "id2label": {
255
- "0": "LABEL_0",
256
- "1": "LABEL_1"
257
- },
258
- "is_decoder": false,
259
- "is_encoder_decoder": false,
260
- "is_rq": false,
261
- "label2id": {
262
- "LABEL_0": 0,
263
- "LABEL_1": 1
264
- },
265
- "length_penalty": 1.0,
266
- "max_length": 20,
267
- "min_length": 0,
268
- "mlp_dim": 1028,
269
- "mlp_hidden_dim": 512,
270
- "mlp_layers": 1,
271
- "model_type": "",
272
- "no_repeat_ngram_size": 0,
273
- "num_beam_groups": 1,
274
- "num_beams": 1,
275
- "num_return_sequences": 1,
276
- "output_attentions": false,
277
- "output_hidden_states": false,
278
- "output_scores": false,
279
- "pad_token_id": null,
280
- "prefix": null,
281
- "problem_type": null,
282
- "pruned_heads": {},
283
- "remove_invalid_values": false,
284
- "repetition_penalty": 1.0,
285
- "return_dict": true,
286
- "return_dict_in_generate": false,
287
- "rq_quantize_dropout": true,
288
- "rq_quantize_dropout_cutoff_index": 1,
289
- "rq_quantize_dropout_multiple_of": 4,
290
- "sep_token_id": null,
291
- "suppress_tokens": null,
292
- "task_specific_params": null,
293
- "temperature": 1.0,
294
- "tf_legacy_loss": false,
295
- "tie_encoder_decoder": false,
296
- "tie_word_embeddings": true,
297
- "tokenizer_class": null,
298
- "top_k": 50,
299
- "top_p": 1.0,
300
- "torch_dtype": null,
301
- "torchscript": false,
302
- "transformers_version": "4.30.1",
303
- "typical_p": 1.0,
304
- "use_bfloat16": false,
305
- "vq_accept_image_fmap": false,
306
- "vq_affine_param": false,
307
- "vq_affine_param_batch_decay": 0.99,
308
- "vq_affine_param_codebook_decay": 0.9,
309
- "vq_channel_last": true,
310
- "vq_codebook_dim": 32,
311
- "vq_codebook_size": 64,
312
- "vq_commitment_use_cross_entropy_loss": false,
313
- "vq_commitment_weight": 0.05,
314
- "vq_decay": 0.85,
315
- "vq_ema_update": true,
316
- "vq_eps": 1e-05,
317
- "vq_heads": 32,
318
- "vq_kmeans_init": false,
319
- "vq_kmeans_iters": 20,
320
- "vq_learnable_codebook": false,
321
- "vq_orthogonal_reg_active_codes_only": false,
322
- "vq_orthogonal_reg_max_codes": null,
323
- "vq_orthogonal_reg_weight": 0.0,
324
- "vq_reinmax": false,
325
- "vq_sample_codebook_temp": 1.0,
326
- "vq_separate_codebook_per_head": true,
327
- "vq_stochastic_sample_codes": true,
328
- "vq_straight_through": false,
329
- "vq_sync_affine_param": false,
330
- "vq_sync_codebook": false,
331
- "vq_sync_kmeans": true,
332
- "vq_sync_update_v": 0.0,
333
- "vq_threshold_ema_dead_code": 2,
334
- "vq_use_cosine_sim": false
335
- }
336
  }
 
1
  {
2
  "architectures": [
3
+ "VQAdapterModel"
4
  ],
5
+ "clip_dim": 768,
6
+ "codebook_lr": 10.0,
7
+ "is_rq": false,
8
+ "mlp_dim": 1028,
9
+ "mlp_hidden_dim": 512,
10
+ "mlp_layers": 1,
11
+ "rq_quantize_dropout": true,
12
+ "rq_quantize_dropout_cutoff_index": 1,
13
+ "rq_quantize_dropout_multiple_of": 4,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  "torch_dtype": "float32",
15
+ "transformers_version": "4.31.0.dev0",
16
+ "vq_accept_image_fmap": false,
17
+ "vq_affine_param": false,
18
+ "vq_affine_param_batch_decay": 0.99,
19
+ "vq_affine_param_codebook_decay": 0.9,
20
+ "vq_channel_last": true,
21
+ "vq_codebook_dim": 32,
22
+ "vq_codebook_size": 64,
23
+ "vq_commitment_use_cross_entropy_loss": false,
24
+ "vq_commitment_weight": 0.05,
25
+ "vq_decay": 0.85,
26
+ "vq_ema_update": true,
27
+ "vq_eps": 1e-05,
28
+ "vq_heads": 32,
29
+ "vq_kmeans_init": false,
30
+ "vq_kmeans_iters": 20,
31
+ "vq_learnable_codebook": false,
32
+ "vq_orthogonal_reg_active_codes_only": false,
33
+ "vq_orthogonal_reg_max_codes": null,
34
+ "vq_orthogonal_reg_weight": 0.0,
35
+ "vq_reinmax": false,
36
+ "vq_sample_codebook_temp": 1.0,
37
+ "vq_separate_codebook_per_head": true,
38
+ "vq_stochastic_sample_codes": true,
39
+ "vq_straight_through": false,
40
+ "vq_sync_affine_param": false,
41
+ "vq_sync_codebook": false,
42
+ "vq_sync_kmeans": true,
43
+ "vq_sync_update_v": 0.0,
44
+ "vq_threshold_ema_dead_code": 2,
45
+ "vq_use_cosine_sim": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  }
pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bffff18537e4ef89636f957dd24657dee9769c3f5d0cf4a4bdb3df1e44a57a9e
3
- size 19485348
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:086b610aae0ca0169970562e70b3d940a33b28f4bbf153f42dcdd1203cd5e1d7
3
+ size 19485281