kirp@umich.edu committed
Commit 404638f
1 Parent(s): bef6ac6

Revert "fix makedown"

This reverts commit 69d7e70935038ea5dcc54099c7ef388e9b2949a3.
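If you need the repository exactly as it stands at this commit or at its parent, you can pin the revision when downloading. A minimal sketch, not part of the commit, assuming the repo id `kirp/kosmos2_5` (taken from the download URL in the README diff below) and the short hashes shown in the header above:

```python
from huggingface_hub import hf_hub_download

repo_id = "kirp/kosmos2_5"  # assumed from the README's receipt image URL below

# README.md as of this revert commit and as of its parent; if the short hashes
# are not resolved by the Hub, substitute the full commit SHAs.
at_revert = hf_hub_download(repo_id=repo_id, filename="README.md", revision="404638f")
at_parent = hf_hub_download(repo_id=repo_id, filename="README.md", revision="bef6ac6")
print(at_revert, at_parent)
```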

.gitattributes CHANGED
@@ -34,3 +34,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
  receipt_00008.png filter=lfs diff=lfs merge=lfs -text
+ *.md filter=lfs diff=lfs merge=lfs -text
+ *.py filter=lfs diff=lfs merge=lfs -text
+ *.json filter=lfs diff=lfs merge=lfs -text
+ *.png filter=lfs diff=lfs merge=lfs -text
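The four added rules route Markdown, Python, JSON, and PNG files through Git LFS, which is why the file diffs below end in three-line LFS pointer stubs (`version`, `oid sha256:`, `size`) rather than the real contents. A minimal sketch, not part of the repository, for spotting such pointer files in a local checkout made without `git lfs pull`:

```python
from pathlib import Path

def is_lfs_pointer(path: str) -> bool:
    """Heuristic check for a Git LFS pointer file (spec v1), like the stubs in the diffs below."""
    lines = Path(path).read_text(errors="ignore").splitlines()
    return (
        len(lines) >= 3
        and lines[0].startswith("version https://git-lfs.github.com/spec/")
        and lines[1].startswith("oid sha256:")
        and lines[2].startswith("size ")
    )

print(is_lfs_pointer("README.md"))  # True for a checkout of this commit without LFS smudging
```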
README.md CHANGED
@@ -1,106 +1,3 @@
1
- ---
2
- language: en
3
- license: mit
4
- ---
5
- # Kosmos-2.5
6
-
7
- [Microsoft Document AI](https://www.microsoft.com/en-us/research/project/document-ai/) | [GitHub](https://github.com/microsoft/unilm/tree/master/kosmos-2.5)
8
-
9
- ## Model description
10
-
11
- Kosmos-2.5 is a multimodal literate model for machine reading of text-intensive images. Pre-trained on large-scale text-intensive images, Kosmos-2.5 excels in two distinct yet cooperative transcription tasks: (1) generating spatially-aware text blocks, where each block of text is assigned its spatial coordinates within the image, and (2) producing structured text output that captures styles and structures into the markdown format. This unified multimodal literate capability is achieved through a shared decoder-only auto-regressive Transformer architecture, task-specific prompts, and flexible text representations. We evaluate Kosmos-2.5 on end-to-end document-level text recognition and image-to-markdown text generation. Furthermore, the model can be readily adapted for any text-intensive image understanding task with different prompts through supervised fine-tuning, making it a general-purpose tool for real-world applications involving text-rich images. This work also paves the way for the future scaling of multimodal large language models.
12
-
13
- [Kosmos-2.5: A Multimodal Literate Model](https://arxiv.org/abs/2309.11419)
14
-
15
- ## NOTE:
16
- Since this is a generative model, there is a risk of **hallucination** during generation, and it **cannot** guarantee the accuracy of all OCR/Markdown results in the images.
17
-
18
- ## Use with transformers:
19
- ```python
20
- from PIL import Image
21
- import requests
22
- import torch
23
- from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
24
- import re
25
- repo = "microsoft/kosmos-2.5"
26
- device = "cuda:0"
27
- dtype = torch.bfloat16
28
- model = Kosmos2_5ForConditionalGeneration.from_pretrained(repo, device_map=device, torch_dtype=dtype)
29
- processor = AutoProcessor.from_pretrained(repo)
30
- url = "https://huggingface.co/kirp/kosmos2_5/resolve/main/receipt_00008.png"
31
- image = Image.open(requests.get(url, stream=True).raw)
32
- prompt = "<ocr>" # <md>
33
- inputs = processor(text=prompt, images=image, return_tensors="pt")
34
- height, width = inputs.pop("height"), inputs.pop("width")
35
- raw_width, raw_height = image.size
36
- scale_height = raw_height / height
37
- scale_width = raw_width / width
38
- inputs = {k: v.to(device) if v is not None else None for k, v in inputs.items()}
39
- inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
40
- generated_ids = model.generate(
41
- **inputs,
42
- max_new_tokens=1024,
43
- )
44
- generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
45
- def postprocess(y, scale_height, scale_width):
46
- y = y.replace(prompt, "")
47
- if "<md>" in prompt:
48
- return y
49
- pattern = r"<bbox><x_\d+><y_\d+><x_\d+><y_\d+></bbox>"
50
- bboxs_raw = re.findall(pattern, y)
51
- lines = re.split(pattern, y)[1:]
52
- bboxs = [re.findall(r"\d+", i) for i in bboxs_raw]
53
- bboxs = [[int(j) for j in i] for i in bboxs]
54
- info = ""
55
- for i in range(len(lines)):
56
- box = bboxs[i]
57
- x0, y0, x1, y1 = box
58
- if not (x0 >= x1 or y0 >= y1):
59
- x0 = int(x0 * scale_width)
60
- y0 = int(y0 * scale_height)
61
- x1 = int(x1 * scale_width)
62
- y1 = int(y1 * scale_height)
63
- info += f"{x0},{y0},{x1},{y0},{x1},{y1},{x0},{y1},{lines[i]}"
64
- return info
65
- output_text = postprocess(generated_text[0], scale_height, scale_width)
66
- print(output_text)
67
- ```
68
- ```text
69
- 55,595,71,595,71,629,55,629,1
70
- 82,595,481,595,481,635,82,635,[REG] BLACK SAKURA
71
- 716,590,841,590,841,629,716,629,45,455
72
- 55,637,71,637,71,672,55,672,1
73
- 82,637,486,637,486,675,82,675,COOKIE DOH SAUCES
74
- 818,632,843,632,843,668,818,668,0
75
- 51,683,71,683,71,719,51,719,1
76
- 82,683,371,683,371,719,82,719,NATA DE COCO
77
- 820,677,845,677,845,713,820,713,0
78
- 32,770,851,770,851,811,32,811,Sub Total 45,455
79
- 28,811,853,811,853,858,28,858,PB1 (10%) 4,545
80
- 28,857,855,857,855,905,28,905,Rounding 0
81
- 24,905,858,905,858,956,24,956,Total 50,000
82
- 17,1096,868,1096,868,1150,17,1150,Card Payment 50,000
83
- ```
84
-
85
-
86
-
87
- ## Citation
88
-
89
- If you find Kosmos-2.5 useful in your research, please cite the following paper:
90
-
91
- ```
92
- @article{lv2023kosmos,
93
- title={Kosmos-2.5: A multimodal literate model},
94
- author={Lv, Tengchao and Huang, Yupan and Chen, Jingye and Cui, Lei and Ma, Shuming and Chang, Yaoyao and Huang, Shaohan and Wang, Wenhui and Dong, Li and Luo, Weiyao and others},
95
- journal={arXiv preprint arXiv:2309.11419},
96
- year={2023}
97
- }
98
- ```
99
-
100
- ## License
101
- The content of this project itself is licensed under the [MIT](https://github.com/microsoft/unilm/blob/master/kosmos-2.5/LICENSE)
102
-
103
- [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct)
104
-
105
-
106
-
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d1c384c76fca9be88593a39f73619c7594ac476eb1fb278be62f702d1d6ef1c
3
+ size 4782
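Each line of the `<ocr>` output shown in the removed README is eight corner coordinates (four x,y pairs in original-image space) followed by the recognized text. A minimal sketch, assuming only that format, for drawing the boxes back onto the receipt with PIL:

```python
from PIL import Image, ImageDraw

def draw_ocr_boxes(image: Image.Image, ocr_text: str) -> Image.Image:
    """Draw the quadrilaterals from lines shaped like x0,y0,x1,y0,x1,y1,x0,y1,text."""
    out = image.copy()
    draw = ImageDraw.Draw(out)
    for line in ocr_text.strip().splitlines():
        parts = line.split(",", 8)  # first eight fields are coordinates, the rest is text
        if len(parts) < 9:
            continue
        draw.polygon([int(p) for p in parts[:8]], outline="red")
    return out

# With the variables from the README snippet: draw_ocr_boxes(image, output_text).save("boxes.png")
```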
config.json CHANGED
@@ -1,158 +1,3 @@
1
- {
2
- "architectures": [
3
- "Kosmos2_5ForConditionalGeneration"
4
- ],
5
- "auto_map": {
6
- "AutoConfig": "configuration_kosmos2_5.Kosmos2_5Config",
7
- "AutoProcessor": "processing_kosmos2_5.Kosmos2_5Processor",
8
- "AutoImageProcessor": "image_processing_kosmos2_5.Kosmos2_5ImageProcessor",
9
- "AutoModel": "modeling_kosmos2_5.Kosmos2_5Model",
10
- "AutoModelForVision2Seq": "modeling_kosmos2_5.Kosmos2_5ForConditionalGeneration"
11
- },
12
- "latent_query_num": 2048,
13
- "model_type": "kosmos-2.5",
14
- "text_config": {
15
- "_name_or_path": "",
16
- "activation_dropout": 0.0,
17
- "activation_function": "gelu",
18
- "add_cross_attention": false,
19
- "architectures": null,
20
- "attention_dropout": 0.0,
21
- "attention_heads": 16,
22
- "bad_words_ids": null,
23
- "begin_suppress_tokens": null,
24
- "bos_token_id": 0,
25
- "chunk_size_feed_forward": 0,
26
- "cross_attention_hidden_size": null,
27
- "decoder_start_token_id": null,
28
- "dropout": 0,
29
- "early_stopping": false,
30
- "embed_dim": 1536,
31
- "pad_token_id": 1,
32
- "eos_token_id": 2,
33
- "exponential_decay_length_penalty": null,
34
- "ffn_dim": 6144,
35
- "finetuning_task": null,
36
- "forced_bos_token_id": null,
37
- "forced_eos_token_id": null,
38
- "id2label": {
39
- "0": "LABEL_0",
40
- "1": "LABEL_1"
41
- },
42
- "init_std": 0.02,
43
- "is_decoder": false,
44
- "is_encoder_decoder": false,
45
- "label2id": {
46
- "LABEL_0": 0,
47
- "LABEL_1": 1
48
- },
49
- "layer_norm_eps": 1e-05,
50
- "layerdrop": 0.0,
51
- "layers": 24,
52
- "max_length": 20,
53
- "max_position_embeddings": 4096,
54
- "min_length": 0,
55
- "model_type": "kosmos_2_5_text_model",
56
- "num_return_sequences": 1,
57
- "output_attentions": false,
58
- "output_hidden_states": false,
59
- "output_scores": false,
60
- "prefix": null,
61
- "problem_type": null,
62
- "pruned_heads": {},
63
- "remove_invalid_values": false,
64
- "return_dict": true,
65
- "return_dict_in_generate": false,
66
- "scale_embedding": true,
67
- "sep_token_id": null,
68
- "suppress_tokens": null,
69
- "task_specific_params": null,
70
- "tf_legacy_loss": false,
71
- "tie_encoder_decoder": false,
72
- "tie_word_embeddings": true,
73
- "tokenizer_class": null,
74
- "torch_dtype": null,
75
- "torchscript": false,
76
- "use_bfloat16": false,
77
- "use_cache": true,
78
- "vocab_size": 108481
79
- },
80
- "torch_dtype": "float32",
81
- "transformers_version": "4.42.0.dev0",
82
- "vision_config": {
83
- "_name_or_path": "",
84
- "add_cross_attention": false,
85
- "architectures": null,
86
- "attention_dropout": 0.0,
87
- "bad_words_ids": null,
88
- "begin_suppress_tokens": null,
89
- "bos_token_id": null,
90
- "chunk_size_feed_forward": 0,
91
- "cross_attention_hidden_size": null,
92
- "d_ff": 3968,
93
- "d_kv": 64,
94
- "decoder_start_token_id": null,
95
- "dense_act_fn": "gelu_new",
96
- "diversity_penalty": 0.0,
97
- "do_sample": false,
98
- "dropout_rate": 0.0,
99
- "early_stopping": false,
100
- "encoder_no_repeat_ngram_size": 0,
101
- "eos_token_id": null,
102
- "exponential_decay_length_penalty": null,
103
- "finetuning_task": null,
104
- "forced_bos_token_id": null,
105
- "forced_eos_token_id": null,
106
- "hidden_size": 1536,
107
- "id2label": {
108
- "0": "LABEL_0",
109
- "1": "LABEL_1"
110
- },
111
- "initializer_factor": 1.0,
112
- "initializer_range": 1e-10,
113
- "is_decoder": false,
114
- "is_encoder_decoder": false,
115
- "label2id": {
116
- "LABEL_0": 0,
117
- "LABEL_1": 1
118
- },
119
- "layer_norm_eps": 1e-06,
120
- "length_penalty": 1.0,
121
- "max_length": 4096,
122
- "min_length": 0,
123
- "model_type": "kosmos_2_5_vision_model",
124
- "no_repeat_ngram_size": 0,
125
- "num_attention_heads": 24,
126
- "num_beam_groups": 1,
127
- "num_beams": 1,
128
- "num_hidden_layers": 18,
129
- "num_return_sequences": 1,
130
- "output_attentions": false,
131
- "output_hidden_states": false,
132
- "output_scores": false,
133
- "pad_token_id": null,
134
- "patch_embed_hidden_size": 768,
135
- "prefix": null,
136
- "problem_type": null,
137
- "pruned_heads": {},
138
- "remove_invalid_values": false,
139
- "repetition_penalty": 1.0,
140
- "return_dict": true,
141
- "return_dict_in_generate": false,
142
- "sep_token_id": null,
143
- "seq_len": 4096,
144
- "suppress_tokens": null,
145
- "task_specific_params": null,
146
- "temperature": 1.0,
147
- "tf_legacy_loss": false,
148
- "tie_encoder_decoder": false,
149
- "tie_word_embeddings": true,
150
- "tokenizer_class": null,
151
- "top_k": 50,
152
- "top_p": 1.0,
153
- "torch_dtype": null,
154
- "torchscript": false,
155
- "typical_p": 1.0,
156
- "use_bfloat16": false
157
- }
158
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d785d66f4c8c97fc80676509cf9a887b783f6ce59ff8b6e569ede5cf4d65da0b
3
+ size 4398
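The removed config.json (now an LFS pointer) registers the custom classes through `auto_map` and nests a text and a vision sub-config. A minimal sketch, assuming the repo's remote code still resolves, for inspecting those nested values:

```python
from transformers import AutoConfig

# trust_remote_code is needed because config.json maps to configuration_kosmos2_5.Kosmos2_5Config.
config = AutoConfig.from_pretrained("kirp/kosmos2_5", trust_remote_code=True)

print(config.model_type)        # "kosmos-2.5"
print(config.latent_query_num)  # 2048
print(config.text_config.embed_dim, config.text_config.layers)                   # 1536, 24
print(config.vision_config.hidden_size, config.vision_config.num_hidden_layers)  # 1536, 18
```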
configuration_kosmos2_5.py CHANGED
@@ -1,330 +1,3 @@
1
- # coding=utf-8
2
- # Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """KOSMOS-2.5 model configuration"""
16
-
17
- import os
18
- from typing import Union
19
-
20
- from transformers.configuration_utils import PretrainedConfig
21
- from transformers.utils import logging
22
-
23
-
24
- logger = logging.get_logger(__name__)
25
-
26
-
27
- class Kosmos2_5TextConfig(PretrainedConfig):
28
- r"""
29
- This is the configuration class to store the configuration of a [`Kosmos2_5TextModel`]. It is used to instantiate a
30
- KOSMOS-2.5 text decoder according to the specified arguments, defining the model architecture. Instantiating a
31
- configuration with the defaults will yield a similar configuration to that of the text decoder of the KOSMOS-2.5
32
- [microsoft/KOSMOS-2.5](https://huggingface.co/microsoft/KOSMOS-2.5) architecture.
33
-
34
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
35
- documentation from [`PretrainedConfig`] for more information.
36
-
37
- Args:
38
- vocab_size (`int`, *optional*, defaults to 108481):
39
- Vocabulary size of the Kosmos2_5 model. Defines the number of different tokens that can be represented by the
40
- `inputs_ids` passed when calling [`Kosmos2_5Model`].
41
- max_position_embeddings (`int`, *optional*, defaults to 2048):
42
- The maximum sequence length that this model might ever be used with. Typically set this to something large
43
- just in case (e.g., 512 or 1024 or 2048).
44
- embed_dim (`int`, *optional*, defaults to 2048):
45
- Dimensionality of the layers and the pooler layer.
46
- layers (`int`, *optional*, defaults to 24):
47
- Number of hidden layers in the Transformer encoder.
48
- ffn_dim (`int`, *optional*, defaults to 8192):
49
- Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
50
- attention_heads (`int`, *optional*, defaults to 32):
51
- Number of attention heads for each attention layer in the Transformer encoder.
52
- activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
53
- The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
54
- `"relu"`, `"silu"` and `"gelu_new"` are supported.
55
- dropout (`float`, *optional*, defaults to 0.1):
56
- The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
57
- attention_dropout (`float`, *optional*, defaults to 0.1):
58
- The dropout ratio for the attention probabilities.
59
- activation_dropout (`float`, *optional*, defaults to 0.0):
60
- The dropout ratio for activations inside the fully connected layer.
61
- layerdrop (`float`, *optional*, defaults to 0.0):
62
- The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556)
63
- for more details.
64
- layer_norm_eps (`float`, *optional*, defaults to 1e-5):
65
- The epsilon used by the layer normalization layers.
66
- init_std (`float`, *optional*, defaults to 0.02):
67
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
68
- scale_embedding (`bool`, *optional*, defaults to `True`):
69
- Scale embeddings by dividing by sqrt(embed_dim).
70
- use_cache (`bool`, *optional*, defaults to `True`):
71
- Whether or not the model should return the last key/values attentions (not used by all models).
72
- ```"""
73
-
74
- model_type = "kosmos_2_5_text_model"
75
- keys_to_ignore_at_inference = ["past_key_values"]
76
- attribute_map = {
77
- "num_attention_heads": "attention_heads",
78
- "hidden_size": "embed_dim",
79
- "num_hidden_layers": "layers",
80
- }
81
-
82
- def __init__(
83
- self,
84
- vocab_size=108481,
85
- max_position_embeddings=4096,
86
- embed_dim=1536,
87
- layers=24,
88
- ffn_dim=6144,
89
- attention_heads=16,
90
- activation_function="gelu",
91
- dropout=0.1,
92
- attention_dropout=0,
93
- activation_dropout=0.0,
94
- layerdrop=0.0,
95
- layer_norm_eps=1e-5,
96
- init_std=0.02,
97
- scale_embedding=True,
98
- use_cache=True,
99
- pad_token_id=1,
100
- bos_token_id=0,
101
- eos_token_id=2,
102
- **kwargs,
103
- ):
104
- super().__init__(
105
- pad_token_id=pad_token_id,
106
- bos_token_id=bos_token_id,
107
- eos_token_id=eos_token_id,
108
- **kwargs,
109
- )
110
-
111
- self.vocab_size = vocab_size
112
- self.max_position_embeddings = max_position_embeddings
113
- self.embed_dim = embed_dim
114
- self.layers = layers
115
- self.ffn_dim = ffn_dim
116
- self.attention_heads = attention_heads
117
- self.activation_function = activation_function
118
- self.dropout = dropout
119
- self.attention_dropout = attention_dropout
120
- self.activation_dropout = activation_dropout
121
- self.layerdrop = layerdrop
122
- self.layer_norm_eps = layer_norm_eps
123
- self.init_std = init_std
124
- self.scale_embedding = scale_embedding
125
- self.use_cache = use_cache
126
-
127
- @classmethod
128
- def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
129
- cls._set_token_in_kwargs(kwargs)
130
-
131
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
132
-
133
- # get the text config dict if we are loading from Kosmos2_5Config
134
- if config_dict.get("model_type") == "kosmos-2.5":
135
- config_dict = config_dict["text_config"]
136
-
137
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
138
- logger.warning(
139
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
140
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
141
- )
142
-
143
- return cls.from_dict(config_dict, **kwargs)
144
-
145
-
146
- class Kosmos2_5VisionConfig(PretrainedConfig):
147
- r"""
148
- This is the configuration class to store the configuration of a [`Kosmos2_5VisionModel`]. It is used to
149
- instantiate a Kosmos2_5 vision model according to the specified arguments, defining the model architecture.
150
- Instantiating a configuration with the defaults will yield a similar configuration to that of the kosmos-2.5
151
- [microsoft/kosmos-2.5](https://huggingface.co/microsoft/kosmos-2.5) architecture.
152
-
153
- Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
154
- documentation from [`PretrainedConfig`] for more information.
155
-
156
- Args:
157
- hidden_size (`int`, *optional*, defaults to 768):
158
- Dimensionality of the encoder layers and the pooler layer.
159
- patch_embed_hidden_size (`int`, *optional*, defaults to 768):
160
- Dimensionality of the input patch_embedding layer in the Transformer encoder.
161
- d_ff (`int`, *optional*, defaults to 2048):
162
- Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
163
- d_kv (`int`, *optional*, defaults to 64):
164
- Dimensionality of the key, query, value projections per attention head.
165
- num_hidden_layers (`int`, *optional*, defaults to 12):
166
- Number of hidden layers in the Transformer encoder.
167
- num_attention_heads (`int`, *optional*, defaults to 12):
168
- Number of attention heads for each attention layer in the Transformer encoder.
169
- dense_act_fn (`str` or `function`, *optional*, defaults to `"gelu_new"`):
170
- The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
171
- `"relu"`, `"selu"` and `"gelu_new"` are supported.
172
- layer_norm_eps (`float`, *optional*, defaults to 1e-06):
173
- The epsilon used by the layer normalization layers.
174
- dropout_rate (`float`, *optional*, defaults to 0.0):
175
- The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
176
- attention_dropout (`float`, *optional*, defaults to 0.0):
177
- The dropout ratio for the attention probabilities.
178
- initializer_range (`float`, *optional*, defaults to 1e-10):
179
- The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
180
- initializer_factor (`float`, *optional*, defaults to 1.0):
181
- A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
182
- testing).
183
- seq_len (`int`, *optional*, defaults to 4096):
184
- Maximum sequence length (here number of patches) supported by the model.
185
- Example:
186
-
187
- ```python
188
- >>> from transformers import Kosmos2_5VisionConfig, Kosmos2_5VisionModel
189
-
190
- >>> # Initializing a Kosmos2_5VisionConfig with microsoft/kosmos-2.5 style configuration
191
- >>> configuration = Kosmos2_5VisionConfig()
192
-
193
- >>> # Initializing a Kosmos2_5VisionModel (with random weights) from the microsoft/kosmos-2.5 style configuration
194
- >>> model = Kosmos2_5VisionModel(configuration)
195
-
196
- >>> # Accessing the model configuration
197
- >>> configuration = model.config
198
- ```"""
199
-
200
- model_type = "kosmos_2_5_vision_model"
201
-
202
- def __init__(
203
- self,
204
- hidden_size=1536,
205
- patch_embed_hidden_size=768,
206
- d_ff=3968,
207
- d_kv=64,
208
- num_hidden_layers=18,
209
- num_attention_heads=24,
210
- dense_act_fn="gelu_new",
211
- layer_norm_eps=1e-6,
212
- dropout_rate=0.0,
213
- attention_dropout=0.0,
214
- initializer_range=1e-10,
215
- initializer_factor=1.0,
216
- seq_len=4096,
217
- **kwargs,
218
- ):
219
- super().__init__(**kwargs)
220
-
221
- self.hidden_size = hidden_size
222
- self.patch_embed_hidden_size = patch_embed_hidden_size
223
- self.d_ff = d_ff
224
- self.dropout_rate = dropout_rate
225
- self.num_hidden_layers = num_hidden_layers
226
- self.num_attention_heads = num_attention_heads
227
- self.initializer_range = initializer_range
228
- self.initializer_factor = initializer_factor
229
- self.attention_dropout = attention_dropout
230
- self.layer_norm_eps = layer_norm_eps
231
- self.dense_act_fn = dense_act_fn
232
- self.seq_len = seq_len
233
- self.d_kv = d_kv
234
-
235
- @classmethod
236
- def from_pretrained(
237
- cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs
238
- ) -> "PretrainedConfig":
239
- cls._set_token_in_kwargs(kwargs)
240
-
241
- config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
242
-
243
- # get the vision config dict if we are loading from Kosmos2_5Config
244
- if config_dict.get("model_type") == "kosmos-2.5":
245
- config_dict = config_dict["vision_config"]
246
-
247
- if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
248
- logger.warning(
249
- f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
250
- f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
251
- )
252
-
253
- return cls.from_dict(config_dict, **kwargs)
254
-
255
-
256
- class Kosmos2_5Config(PretrainedConfig):
257
- r"""
258
- This is the configuration class to store the configuration of a [`Kosmos2_5Model`]. It is used to instantiate a
259
- KOSMOS-2.5 model according to the specified arguments, defining the model architecture. Instantiating a configuration
260
- with the defaults will yield a similar configuration to that of the KOSMOS-2.5
261
- [microsoft/KOSMOS-2.5-patch14-224](https://huggingface.co/microsoft/KOSMOS-2.5-patch14-224) architecture.
262
-
263
- Args:
264
- text_config (`dict`, *optional*):
265
- Dictionary of configuration options used to initialize [`Kosmos2_5TextConfig`].
266
- vision_config (`dict`, *optional*):
267
- Dictionary of configuration options used to initialize [`Kosmos2_5VisionConfig`].
268
- latent_query_num (`int`, *optional*, defaults to 2048):
269
- The number of latent query tokens that represent the image features used in the text decoder component.
270
- kwargs (*optional*):
271
- Dictionary of keyword arguments.
272
-
273
- Example:
274
-
275
- ```python
276
- >>> from .. import Kosmos2_5Config, Kosmos2_5Model
277
-
278
- >>> # Initializing a KOSMOS-2.5 KOSMOS-2.5-patch14-224 style configuration
279
- >>> configuration = Kosmos2_5Config()
280
-
281
- >>> # Initializing a model (with random weights) from the KOSMOS-2.5-patch14-224 style configuration
282
- >>> model = Kosmos2_5Model(configuration)
283
-
284
- >>> # Accessing the model configuration
285
- >>> configuration = model.config
286
- ```"""
287
-
288
- model_type = "kosmos-2.5"
289
- is_composition = True
290
-
291
- def __init__(
292
- self,
293
- text_config=None,
294
- vision_config=None,
295
- latent_query_num=2048,
296
- **kwargs,
297
- ):
298
- super().__init__(**kwargs)
299
- if text_config is None:
300
- text_config = {}
301
- logger.info("text_config is None. Initializing the Kosmos2_5TextConfig with default values.")
302
- if vision_config is None:
303
- vision_config = {}
304
- logger.info("vision_config is None. Initializing the Kosmos2_5VisionConfig with default values.")
305
-
306
- self.text_config = Kosmos2_5TextConfig(**text_config)
307
- self.vision_config = Kosmos2_5VisionConfig(**vision_config)
308
-
309
- self.latent_query_num = latent_query_num
310
-
311
- @classmethod
312
- def from_text_vision_configs(
313
- cls,
314
- text_config: Kosmos2_5TextConfig,
315
- vision_config: Kosmos2_5VisionConfig,
316
- **kwargs,
317
- ):
318
- r"""
319
- Instantiate a [`Kosmos2_5Config`] (or a derived class) from a KOSMOS-2.5 text model configuration and a KOSMOS-2.5
320
- vision model configuration.
321
-
322
- Returns:
323
- [`Kosmos2_5Config`]: An instance of a configuration object
324
- """
325
-
326
- return cls(
327
- text_config=text_config.to_dict(),
328
- vision_config=vision_config.to_dict(),
329
- **kwargs,
330
- )
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ad443b3012c42bce3b3a7b83debeab403dc5fbd249a5b7a1b8e4d266dc838ff9
3
+ size 14660
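The removed module defines `Kosmos2_5TextConfig`, `Kosmos2_5VisionConfig`, and the composite `Kosmos2_5Config`, plus a `from_text_vision_configs` helper. A minimal sketch of how the pieces compose, assuming the file above is importable locally as `configuration_kosmos2_5`:

```python
from configuration_kosmos2_5 import (
    Kosmos2_5Config,
    Kosmos2_5TextConfig,
    Kosmos2_5VisionConfig,
)

text_config = Kosmos2_5TextConfig()      # defaults: embed_dim=1536, layers=24, ffn_dim=6144, ...
vision_config = Kosmos2_5VisionConfig()  # defaults: hidden_size=1536, num_hidden_layers=18, ...

# Composes the two sub-configs exactly as the classmethod's docstring describes.
config = Kosmos2_5Config.from_text_vision_configs(text_config, vision_config)
print(config.latent_query_num)           # 2048 by default
```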
generation_config.json CHANGED
@@ -1,9 +1,3 @@
- {
-   "_from_model_config": false,
-   "bos_token_id": 0,
-   "eos_token_id": 2,
-   "pad_token_id": 1,
-   "transformers_version": "4.42.0.dev0",
-   "num_beam" : 1,
-   "do_sample": false
- }

+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6ff1351b6a1bc18f890c14bb6f08bdb7db7b056fd7df44ab4cc90d9f832d0091
+ size 178
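The removed generation_config.json pins greedy decoding and the special-token ids (the key spelled `"num_beam"` is presumably meant to be `num_beams`). A minimal sketch of the equivalent object built directly in transformers:

```python
from transformers import GenerationConfig

generation_config = GenerationConfig(
    bos_token_id=0,
    eos_token_id=2,
    pad_token_id=1,
    num_beams=1,      # the JSON above spells this key "num_beam"
    do_sample=False,  # greedy decoding
)
print(generation_config)
```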
image_processing_kosmos2_5.py CHANGED
@@ -1,343 +1,3 @@
1
- # coding=utf-8
2
- # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """Image processor class for Kosmos2_5."""
16
-
17
- import math
18
- from typing import Dict, Optional, Union
19
- from transformers import AutoImageProcessor
20
- import numpy as np
21
-
22
- from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
23
- from transformers.image_transforms import (
24
- convert_to_rgb,
25
- normalize,
26
- to_channel_dimension_format,
27
- )
28
- from transformers.image_utils import (
29
- ChannelDimension,
30
- ImageInput,
31
- get_image_size,
32
- infer_channel_dimension_format,
33
- make_list_of_images,
34
- to_numpy_array,
35
- valid_images,
36
- )
37
- from transformers.utils import TensorType, is_torch_available, logging
38
- from transformers.utils.import_utils import requires_backends
39
-
40
-
41
- if is_torch_available():
42
- import torch
43
-
44
- logger = logging.get_logger(__name__)
45
- DEFAULT_FONT_PATH = "ybelkada/fonts"
46
-
47
-
48
- # adapted from: https://discuss.pytorch.org/t/tf-image-extract-patches-in-pytorch/171409/2
49
- def torch_extract_patches(image_tensor, patch_height, patch_width):
50
- """
51
- Utility function to extract patches from a given image tensor. Returns a tensor of shape (1, `image_height // patch_height`,
52
- `image_width // patch_width`, `num_channels` x `patch_height` x `patch_width`)
53
-
54
- Args:
55
- image_tensor (torch.Tensor):
56
- The image tensor to extract patches from.
57
- patch_height (int):
58
- The height of the patches to extract.
59
- patch_width (int):
60
- The width of the patches to extract.
61
- """
62
- requires_backends(torch_extract_patches, ["torch"])
63
-
64
- image_tensor = image_tensor.unsqueeze(0)
65
- patches = torch.nn.functional.unfold(image_tensor, (patch_height, patch_width), stride=(patch_height, patch_width))
66
- patches = patches.reshape(image_tensor.size(0), image_tensor.size(1), patch_height, patch_width, -1)
67
- patches = patches.permute(0, 4, 2, 3, 1).reshape(
68
- image_tensor.size(2) // patch_height,
69
- image_tensor.size(3) // patch_width,
70
- image_tensor.size(1) * patch_height * patch_width,
71
- )
72
- return patches.unsqueeze(0)
73
-
74
-
75
- class Kosmos2_5ImageProcessor(BaseImageProcessor):
76
- r"""
77
- Constructs a Kosmos2_5 image processor.
78
-
79
- Args:
80
- do_convert_rgb (`bool`, *optional*, defaults to `True`):
81
- Whether to convert the image to RGB.
82
- do_normalize (`bool`, *optional*, defaults to `True`):
83
- Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
84
- method. According to Kosmos2_5 paper and code, the image is normalized with its own mean and standard
85
- deviation.
86
- patch_size (`Dict[str, int]`, *optional*, defaults to `{"height": 16, "width": 16}`):
87
- The patch size to use for the image. According to Kosmos2_5 paper and code, the patch size is 16x16.
88
- max_patches (`int`, *optional*, defaults to 4096):
89
- The maximum number of patches to extract from the image as per the [Kosmos2_5
90
- paper](https://arxiv.org/pdf/2309.11419).
91
- """
92
-
93
- model_input_names = ["flattened_patches"]
94
-
95
- def __init__(
96
- self,
97
- do_convert_rgb: bool = True,
98
- do_normalize: bool = True,
99
- patch_size: Dict[str, int] = None,
100
- max_patches: int = 4096,
101
- **kwargs,
102
- ) -> None:
103
- super().__init__(**kwargs)
104
- self.patch_size = patch_size if patch_size is not None else {"height": 16, "width": 16}
105
- self.do_normalize = do_normalize
106
- self.do_convert_rgb = do_convert_rgb
107
- self.max_patches = max_patches
108
-
109
- def extract_flattened_patches(
110
- self,
111
- image: np.ndarray,
112
- max_patches: int,
113
- patch_size: dict,
114
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
115
- **kwargs,
116
- ) -> np.ndarray:
117
- """
118
- Extract flattened patches from an image.
119
-
120
- Args:
121
- image (`np.ndarray`):
122
- Image to extract flattened patches from.
123
- max_patches (`int`):
124
- Maximum number of patches to extract.
125
- patch_size (`dict`):
126
- Dictionary containing the patch height and width.
127
-
128
- Returns:
129
- result (`np.ndarray`):
130
- A sequence of `max_patches` flattened patches.
131
- """
132
- requires_backends(self.extract_flattened_patches, "torch")
133
-
134
- # convert to torch
135
- image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_data_format)
136
- image = torch.from_numpy(image)
137
-
138
- patch_height, patch_width = patch_size["height"], patch_size["width"]
139
- image_height, image_width = get_image_size(image, ChannelDimension.FIRST)
140
-
141
- # maximize scale s.t. the resized image yields at most max_patches patches of size patch_height x patch_width
142
- scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
143
- num_feasible_rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
144
- num_feasible_cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
145
- resized_height = max(num_feasible_rows * patch_height, 1)
146
- resized_width = max(num_feasible_cols * patch_width, 1)
147
-
148
- image = torch.nn.functional.interpolate(
149
- image.unsqueeze(0),
150
- size=(resized_height, resized_width),
151
- mode="bilinear",
152
- align_corners=False,
153
- antialias=True,
154
- ).squeeze(0)
155
-
156
- # [1, rows, columns, patch_height * patch_width * image_channels]
157
- patches = torch_extract_patches(image, patch_height, patch_width)
158
-
159
- patches_shape = patches.shape
160
- rows = patches_shape[1]
161
- columns = patches_shape[2]
162
- depth = patches_shape[3]
163
-
164
- # [rows * columns, patch_height * patch_width * image_channels]
165
- patches = patches.reshape([rows * columns, depth])
166
-
167
- # [rows * columns, 1]
168
- row_ids = torch.arange(rows).reshape([rows, 1]).repeat(1, columns).reshape([rows * columns, 1])
169
- col_ids = torch.arange(columns).reshape([1, columns]).repeat(rows, 1).reshape([rows * columns, 1])
170
-
171
- # Offset by 1 so the ids do not contain zeros, which represent padding.
172
- row_ids += 1
173
- col_ids += 1
174
-
175
- # Prepare additional patch features.
176
- # [rows * columns, 1]
177
- row_ids = row_ids.to(torch.float32)
178
- col_ids = col_ids.to(torch.float32)
179
-
180
- # [rows * columns, 2 + patch_height * patch_width * image_channels]
181
- result = torch.cat([row_ids, col_ids, patches], -1)
182
-
183
- # [max_patches, 2 + patch_height * patch_width * image_channels]
184
- result = torch.nn.functional.pad(result, [0, 0, 0, max_patches - (rows * columns)]).float()
185
-
186
- result = to_numpy_array(result)
187
-
188
- return result, resized_width, resized_height, rows, columns
189
-
190
- def normalize(
191
- self,
192
- image: np.ndarray,
193
- data_format: Optional[Union[str, ChannelDimension]] = None,
194
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
195
- **kwargs,
196
- ) -> np.ndarray:
197
- """
198
- Normalize an image. image = (image - image_mean) / image_std.
199
-
200
- The image std is to mimic the tensorflow implementation of the `per_image_standardization`:
201
- https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization
202
-
203
- Args:
204
- image (`np.ndarray`):
205
- Image to normalize.
206
- data_format (`str` or `ChannelDimension`, *optional*):
207
- The channel dimension format for the output image. If unset, the channel dimension format of the input
208
- image is used.
209
- input_data_format (`str` or `ChannelDimension`, *optional*):
210
- The channel dimension format of the input image. If not provided, it will be inferred.
211
- """
212
- if image.dtype == np.uint8:
213
- image = image.astype(np.float32)
214
-
215
- # take mean across the whole `image`
216
- mean = np.mean(image)
217
- std = np.std(image)
218
- adjusted_stddev = max(std, 1.0 / math.sqrt(np.prod(image.shape)))
219
-
220
- return normalize(
221
- image,
222
- mean=mean,
223
- std=adjusted_stddev,
224
- data_format=data_format,
225
- input_data_format=input_data_format,
226
- **kwargs,
227
- )
228
-
229
- def preprocess(
230
- self,
231
- images: ImageInput,
232
- do_convert_rgb: bool = None,
233
- do_normalize: Optional[bool] = None,
234
- max_patches: Optional[int] = None,
235
- patch_size: Optional[Dict[str, int]] = None,
236
- return_tensors: Optional[Union[str, TensorType]] = None,
237
- data_format: ChannelDimension = ChannelDimension.FIRST,
238
- input_data_format: Optional[Union[str, ChannelDimension]] = None,
239
- **kwargs,
240
- ) -> ImageInput:
241
- """
242
- Preprocess an image or batch of images. The processor first computes the maximum possible number of
243
- aspect-ratio preserving patches of size `patch_size` that can be extracted from the image. It then pads the
244
- image with zeros to make the image respect the constraint of `max_patches`. Before extracting the patches the
245
- images are standardized following the tensorflow implementation of `per_image_standardization`
246
- (https://www.tensorflow.org/api_docs/python/tf/image/per_image_standardization).
247
-
248
-
249
- Args:
250
- images (`ImageInput`):
251
- Image to preprocess. Expects a single or batch of images.
252
- do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
253
- Whether to convert the image to RGB.
254
- do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
255
- Whether to normalize the image.
256
- max_patches (`int`, *optional*, defaults to `self.max_patches`):
257
- Maximum number of patches to extract.
258
- patch_size (`dict`, *optional*, defaults to `self.patch_size`):
259
- Dictionary containing the patch height and width.
260
- return_tensors (`str` or `TensorType`, *optional*):
261
- The type of tensors to return. Can be one of:
262
- - Unset: Return a list of `np.ndarray`.
263
- - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
264
- - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
265
- - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
266
- - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
267
- data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
268
- The channel dimension format for the output image. Can be one of:
269
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
270
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
271
- - Unset: Use the channel dimension format of the input image.
272
- input_data_format (`ChannelDimension` or `str`, *optional*):
273
- The channel dimension format for the input image. If unset, the channel dimension format is inferred
274
- from the input image. Can be one of:
275
- - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
276
- - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
277
- - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
278
- """
279
- do_normalize = do_normalize if do_normalize is not None else self.do_normalize
280
- do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
281
- patch_size = patch_size if patch_size is not None else self.patch_size
282
- max_patches = max_patches if max_patches is not None else self.max_patches
283
-
284
- if kwargs.get("data_format", None) is not None:
285
- raise ValueError("data_format is not an accepted input as the outputs are returned as flattened patches, not images")
286
-
287
- images = make_list_of_images(images)
288
-
289
- if not valid_images(images):
290
- raise ValueError(
291
- "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
292
- "torch.Tensor, tf.Tensor or jax.ndarray."
293
- )
294
-
295
- # PIL RGBA images are converted to RGB
296
- if do_convert_rgb:
297
- images = [convert_to_rgb(image) for image in images]
298
-
299
- # All transformations expect numpy arrays.
300
- images = [to_numpy_array(image) for image in images]
301
-
302
- if input_data_format is None:
303
- # We assume that all images have the same channel dimension format.
304
- input_data_format = infer_channel_dimension_format(images[0])
305
-
306
- if do_normalize:
307
- images = [self.normalize(image=image, input_data_format=input_data_format) for image in images]
308
-
309
- # convert to torch tensor and permute
310
- images = [
311
- self.extract_flattened_patches(
312
- image=image,
313
- max_patches=max_patches,
314
- patch_size=patch_size,
315
- input_data_format=input_data_format,
316
- )
317
- for image in images
318
- ]
319
-
320
- width = [image[1] for image in images]
321
- height = [image[2] for image in images]
322
- rows = [image[3] for image in images]
323
- cols = [image[4] for image in images]
324
- images = [image[0] for image in images]
325
-
326
- # create attention mask in numpy
327
- attention_masks = [(image.sum(axis=-1) != 0).astype(np.float32) for image in images]
328
-
329
- encoded_outputs = BatchFeature(
330
- data={
331
- "flattened_patches": images,
332
- "attention_mask": attention_masks,
333
- "width": width,
334
- "height": height,
335
- "rows": rows,
336
- "cols": cols,
337
- },
338
- tensor_type=return_tensors,
339
- )
340
-
341
- return encoded_outputs
342
-
343
- AutoImageProcessor.register("Kosmos2_5ImageProcessor", Kosmos2_5ImageProcessor)
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6e81817457c381706ca63af46381086181ece892a947801e76088af822b99ed5
3
+ size 14573
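`extract_flattened_patches` above rescales the image so that roughly `max_patches` patches of `patch_size` fit, flattens them, and prepends 1-based row/column ids. A minimal sketch of just that sizing arithmetic, using the class defaults (16x16 patches, 4096 max patches):

```python
import math

def patch_grid(image_height: int, image_width: int,
               patch_height: int = 16, patch_width: int = 16, max_patches: int = 4096):
    """Reproduce the resize arithmetic from extract_flattened_patches above."""
    scale = math.sqrt(max_patches * (patch_height / image_height) * (patch_width / image_width))
    rows = max(min(math.floor(scale * image_height / patch_height), max_patches), 1)
    cols = max(min(math.floor(scale * image_width / patch_width), max_patches), 1)
    return rows, cols, rows * patch_height, cols * patch_width

rows, cols, resized_h, resized_w = patch_grid(1280, 960)   # e.g. a 1280x960 (H x W) scan
print(rows, cols, rows * cols)                             # 73 55 4015  (<= 4096 patches)
```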
model.safetensors.index.json CHANGED
@@ -1,621 +1,3 @@
1
- {
2
- "metadata": {
3
- "total_size": 5498585088
4
- },
5
- "weight_map": {
6
- "image_to_text_projection.dense.bias": "model-00002-of-00002.safetensors",
7
- "image_to_text_projection.dense.weight": "model-00002-of-00002.safetensors",
8
- "image_to_text_projection.latent_query": "model-00002-of-00002.safetensors",
9
- "image_to_text_projection.x_attn.k_proj.bias": "model-00002-of-00002.safetensors",
10
- "image_to_text_projection.x_attn.k_proj.weight": "model-00002-of-00002.safetensors",
11
- "image_to_text_projection.x_attn.out_proj.bias": "model-00002-of-00002.safetensors",
12
- "image_to_text_projection.x_attn.out_proj.weight": "model-00002-of-00002.safetensors",
13
- "image_to_text_projection.x_attn.q_proj.bias": "model-00002-of-00002.safetensors",
14
- "image_to_text_projection.x_attn.q_proj.weight": "model-00002-of-00002.safetensors",
15
- "image_to_text_projection.x_attn.v_proj.bias": "model-00002-of-00002.safetensors",
16
- "image_to_text_projection.x_attn.v_proj.weight": "model-00002-of-00002.safetensors",
17
- "text_model.model.embed_tokens.weight": "model-00001-of-00002.safetensors",
18
- "text_model.model.layer_norm.bias": "model-00001-of-00002.safetensors",
19
- "text_model.model.layer_norm.weight": "model-00001-of-00002.safetensors",
20
- "text_model.model.layers.0.ffn.fc1.bias": "model-00001-of-00002.safetensors",
21
- "text_model.model.layers.0.ffn.fc1.weight": "model-00001-of-00002.safetensors",
22
- "text_model.model.layers.0.ffn.fc2.bias": "model-00001-of-00002.safetensors",
23
- "text_model.model.layers.0.ffn.fc2.weight": "model-00001-of-00002.safetensors",
24
- "text_model.model.layers.0.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
25
- "text_model.model.layers.0.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
26
- "text_model.model.layers.0.final_layer_norm.bias": "model-00001-of-00002.safetensors",
27
- "text_model.model.layers.0.final_layer_norm.weight": "model-00001-of-00002.safetensors",
28
- "text_model.model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
29
- "text_model.model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
30
- "text_model.model.layers.0.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
31
- "text_model.model.layers.0.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
32
- "text_model.model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
33
- "text_model.model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
34
- "text_model.model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
35
- "text_model.model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
36
- "text_model.model.layers.0.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
37
- "text_model.model.layers.0.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
38
- "text_model.model.layers.1.ffn.fc1.bias": "model-00001-of-00002.safetensors",
39
- "text_model.model.layers.1.ffn.fc1.weight": "model-00001-of-00002.safetensors",
40
- "text_model.model.layers.1.ffn.fc2.bias": "model-00001-of-00002.safetensors",
41
- "text_model.model.layers.1.ffn.fc2.weight": "model-00001-of-00002.safetensors",
42
- "text_model.model.layers.1.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
43
- "text_model.model.layers.1.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
44
- "text_model.model.layers.1.final_layer_norm.bias": "model-00001-of-00002.safetensors",
45
- "text_model.model.layers.1.final_layer_norm.weight": "model-00001-of-00002.safetensors",
46
- "text_model.model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
47
- "text_model.model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
48
- "text_model.model.layers.1.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
49
- "text_model.model.layers.1.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
50
- "text_model.model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
51
- "text_model.model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
52
- "text_model.model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
53
- "text_model.model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
54
- "text_model.model.layers.1.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
55
- "text_model.model.layers.1.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
56
- "text_model.model.layers.10.ffn.fc1.bias": "model-00001-of-00002.safetensors",
57
- "text_model.model.layers.10.ffn.fc1.weight": "model-00001-of-00002.safetensors",
58
- "text_model.model.layers.10.ffn.fc2.bias": "model-00001-of-00002.safetensors",
59
- "text_model.model.layers.10.ffn.fc2.weight": "model-00001-of-00002.safetensors",
60
- "text_model.model.layers.10.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
61
- "text_model.model.layers.10.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
62
- "text_model.model.layers.10.final_layer_norm.bias": "model-00001-of-00002.safetensors",
63
- "text_model.model.layers.10.final_layer_norm.weight": "model-00001-of-00002.safetensors",
64
- "text_model.model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
65
- "text_model.model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
66
- "text_model.model.layers.10.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
67
- "text_model.model.layers.10.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
68
- "text_model.model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
69
- "text_model.model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
70
- "text_model.model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
71
- "text_model.model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
72
- "text_model.model.layers.10.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
73
- "text_model.model.layers.10.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
74
- "text_model.model.layers.11.ffn.fc1.bias": "model-00001-of-00002.safetensors",
75
- "text_model.model.layers.11.ffn.fc1.weight": "model-00001-of-00002.safetensors",
76
- "text_model.model.layers.11.ffn.fc2.bias": "model-00001-of-00002.safetensors",
77
- "text_model.model.layers.11.ffn.fc2.weight": "model-00001-of-00002.safetensors",
78
- "text_model.model.layers.11.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
79
- "text_model.model.layers.11.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
80
- "text_model.model.layers.11.final_layer_norm.bias": "model-00001-of-00002.safetensors",
81
- "text_model.model.layers.11.final_layer_norm.weight": "model-00001-of-00002.safetensors",
82
- "text_model.model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
83
- "text_model.model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
84
- "text_model.model.layers.11.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
85
- "text_model.model.layers.11.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
86
- "text_model.model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
87
- "text_model.model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
88
- "text_model.model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
89
- "text_model.model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
90
- "text_model.model.layers.11.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
91
- "text_model.model.layers.11.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
92
- "text_model.model.layers.12.ffn.fc1.bias": "model-00001-of-00002.safetensors",
93
- "text_model.model.layers.12.ffn.fc1.weight": "model-00001-of-00002.safetensors",
94
- "text_model.model.layers.12.ffn.fc2.bias": "model-00001-of-00002.safetensors",
95
- "text_model.model.layers.12.ffn.fc2.weight": "model-00001-of-00002.safetensors",
96
- "text_model.model.layers.12.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
97
- "text_model.model.layers.12.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
98
- "text_model.model.layers.12.final_layer_norm.bias": "model-00001-of-00002.safetensors",
99
- "text_model.model.layers.12.final_layer_norm.weight": "model-00001-of-00002.safetensors",
100
- "text_model.model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
101
- "text_model.model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
102
- "text_model.model.layers.12.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
103
- "text_model.model.layers.12.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
104
- "text_model.model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
105
- "text_model.model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
106
- "text_model.model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
107
- "text_model.model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
108
- "text_model.model.layers.12.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
109
- "text_model.model.layers.12.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
110
- "text_model.model.layers.13.ffn.fc1.bias": "model-00001-of-00002.safetensors",
111
- "text_model.model.layers.13.ffn.fc1.weight": "model-00001-of-00002.safetensors",
112
- "text_model.model.layers.13.ffn.fc2.bias": "model-00001-of-00002.safetensors",
113
- "text_model.model.layers.13.ffn.fc2.weight": "model-00001-of-00002.safetensors",
114
- "text_model.model.layers.13.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
115
- "text_model.model.layers.13.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
116
- "text_model.model.layers.13.final_layer_norm.bias": "model-00001-of-00002.safetensors",
117
- "text_model.model.layers.13.final_layer_norm.weight": "model-00001-of-00002.safetensors",
118
- "text_model.model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
119
- "text_model.model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
120
- "text_model.model.layers.13.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
121
- "text_model.model.layers.13.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
122
- "text_model.model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
123
- "text_model.model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
124
- "text_model.model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
125
- "text_model.model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
126
- "text_model.model.layers.13.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
127
- "text_model.model.layers.13.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
128
- "text_model.model.layers.14.ffn.fc1.bias": "model-00001-of-00002.safetensors",
129
- "text_model.model.layers.14.ffn.fc1.weight": "model-00001-of-00002.safetensors",
130
- "text_model.model.layers.14.ffn.fc2.bias": "model-00001-of-00002.safetensors",
131
- "text_model.model.layers.14.ffn.fc2.weight": "model-00001-of-00002.safetensors",
132
- "text_model.model.layers.14.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
133
- "text_model.model.layers.14.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
134
- "text_model.model.layers.14.final_layer_norm.bias": "model-00001-of-00002.safetensors",
135
- "text_model.model.layers.14.final_layer_norm.weight": "model-00001-of-00002.safetensors",
136
- "text_model.model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
137
- "text_model.model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
138
- "text_model.model.layers.14.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
139
- "text_model.model.layers.14.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
140
- "text_model.model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
141
- "text_model.model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
142
- "text_model.model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
143
- "text_model.model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
144
- "text_model.model.layers.14.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
145
- "text_model.model.layers.14.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
146
- "text_model.model.layers.15.ffn.fc1.bias": "model-00001-of-00002.safetensors",
147
- "text_model.model.layers.15.ffn.fc1.weight": "model-00001-of-00002.safetensors",
148
- "text_model.model.layers.15.ffn.fc2.bias": "model-00001-of-00002.safetensors",
149
- "text_model.model.layers.15.ffn.fc2.weight": "model-00001-of-00002.safetensors",
150
- "text_model.model.layers.15.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
151
- "text_model.model.layers.15.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
152
- "text_model.model.layers.15.final_layer_norm.bias": "model-00001-of-00002.safetensors",
153
- "text_model.model.layers.15.final_layer_norm.weight": "model-00001-of-00002.safetensors",
154
- "text_model.model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
155
- "text_model.model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
156
- "text_model.model.layers.15.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
157
- "text_model.model.layers.15.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
158
- "text_model.model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
159
- "text_model.model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
160
- "text_model.model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
161
- "text_model.model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
162
- "text_model.model.layers.15.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
163
- "text_model.model.layers.15.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
164
- "text_model.model.layers.16.ffn.fc1.bias": "model-00001-of-00002.safetensors",
165
- "text_model.model.layers.16.ffn.fc1.weight": "model-00001-of-00002.safetensors",
166
- "text_model.model.layers.16.ffn.fc2.bias": "model-00001-of-00002.safetensors",
167
- "text_model.model.layers.16.ffn.fc2.weight": "model-00001-of-00002.safetensors",
168
- "text_model.model.layers.16.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
169
- "text_model.model.layers.16.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
170
- "text_model.model.layers.16.final_layer_norm.bias": "model-00001-of-00002.safetensors",
171
- "text_model.model.layers.16.final_layer_norm.weight": "model-00001-of-00002.safetensors",
172
- "text_model.model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
173
- "text_model.model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
174
- "text_model.model.layers.16.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
175
- "text_model.model.layers.16.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
176
- "text_model.model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
177
- "text_model.model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
178
- "text_model.model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
179
- "text_model.model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
180
- "text_model.model.layers.16.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
181
- "text_model.model.layers.16.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
182
- "text_model.model.layers.17.ffn.fc1.bias": "model-00001-of-00002.safetensors",
183
- "text_model.model.layers.17.ffn.fc1.weight": "model-00001-of-00002.safetensors",
184
- "text_model.model.layers.17.ffn.fc2.bias": "model-00001-of-00002.safetensors",
185
- "text_model.model.layers.17.ffn.fc2.weight": "model-00001-of-00002.safetensors",
186
- "text_model.model.layers.17.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
187
- "text_model.model.layers.17.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
188
- "text_model.model.layers.17.final_layer_norm.bias": "model-00001-of-00002.safetensors",
189
- "text_model.model.layers.17.final_layer_norm.weight": "model-00001-of-00002.safetensors",
190
- "text_model.model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
191
- "text_model.model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
192
- "text_model.model.layers.17.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
193
- "text_model.model.layers.17.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
194
- "text_model.model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
195
- "text_model.model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
196
- "text_model.model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
197
- "text_model.model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
198
- "text_model.model.layers.17.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
199
- "text_model.model.layers.17.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
200
- "text_model.model.layers.18.ffn.fc1.bias": "model-00001-of-00002.safetensors",
201
- "text_model.model.layers.18.ffn.fc1.weight": "model-00001-of-00002.safetensors",
202
- "text_model.model.layers.18.ffn.fc2.bias": "model-00001-of-00002.safetensors",
203
- "text_model.model.layers.18.ffn.fc2.weight": "model-00001-of-00002.safetensors",
204
- "text_model.model.layers.18.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
205
- "text_model.model.layers.18.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
206
- "text_model.model.layers.18.final_layer_norm.bias": "model-00001-of-00002.safetensors",
207
- "text_model.model.layers.18.final_layer_norm.weight": "model-00001-of-00002.safetensors",
208
- "text_model.model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
209
- "text_model.model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
210
- "text_model.model.layers.18.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
211
- "text_model.model.layers.18.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
212
- "text_model.model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
213
- "text_model.model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
214
- "text_model.model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
215
- "text_model.model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
216
- "text_model.model.layers.18.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
217
- "text_model.model.layers.18.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
218
- "text_model.model.layers.19.ffn.fc1.bias": "model-00001-of-00002.safetensors",
219
- "text_model.model.layers.19.ffn.fc1.weight": "model-00001-of-00002.safetensors",
220
- "text_model.model.layers.19.ffn.fc2.bias": "model-00001-of-00002.safetensors",
221
- "text_model.model.layers.19.ffn.fc2.weight": "model-00001-of-00002.safetensors",
222
- "text_model.model.layers.19.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
223
- "text_model.model.layers.19.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
224
- "text_model.model.layers.19.final_layer_norm.bias": "model-00001-of-00002.safetensors",
225
- "text_model.model.layers.19.final_layer_norm.weight": "model-00001-of-00002.safetensors",
226
- "text_model.model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
227
- "text_model.model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
228
- "text_model.model.layers.19.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
229
- "text_model.model.layers.19.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
230
- "text_model.model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
231
- "text_model.model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
232
- "text_model.model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
233
- "text_model.model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
234
- "text_model.model.layers.19.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
235
- "text_model.model.layers.19.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
236
- "text_model.model.layers.2.ffn.fc1.bias": "model-00001-of-00002.safetensors",
237
- "text_model.model.layers.2.ffn.fc1.weight": "model-00001-of-00002.safetensors",
238
- "text_model.model.layers.2.ffn.fc2.bias": "model-00001-of-00002.safetensors",
239
- "text_model.model.layers.2.ffn.fc2.weight": "model-00001-of-00002.safetensors",
240
- "text_model.model.layers.2.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
241
- "text_model.model.layers.2.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
242
- "text_model.model.layers.2.final_layer_norm.bias": "model-00001-of-00002.safetensors",
243
- "text_model.model.layers.2.final_layer_norm.weight": "model-00001-of-00002.safetensors",
244
- "text_model.model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
245
- "text_model.model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
246
- "text_model.model.layers.2.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
247
- "text_model.model.layers.2.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
248
- "text_model.model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
249
- "text_model.model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
250
- "text_model.model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
251
- "text_model.model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
252
- "text_model.model.layers.2.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
253
- "text_model.model.layers.2.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
254
- "text_model.model.layers.20.ffn.fc1.bias": "model-00001-of-00002.safetensors",
255
- "text_model.model.layers.20.ffn.fc1.weight": "model-00001-of-00002.safetensors",
256
- "text_model.model.layers.20.ffn.fc2.bias": "model-00001-of-00002.safetensors",
257
- "text_model.model.layers.20.ffn.fc2.weight": "model-00001-of-00002.safetensors",
258
- "text_model.model.layers.20.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
259
- "text_model.model.layers.20.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
260
- "text_model.model.layers.20.final_layer_norm.bias": "model-00001-of-00002.safetensors",
261
- "text_model.model.layers.20.final_layer_norm.weight": "model-00001-of-00002.safetensors",
262
- "text_model.model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
263
- "text_model.model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
264
- "text_model.model.layers.20.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
265
- "text_model.model.layers.20.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
266
- "text_model.model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
267
- "text_model.model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
268
- "text_model.model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
269
- "text_model.model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
270
- "text_model.model.layers.20.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
271
- "text_model.model.layers.20.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
272
- "text_model.model.layers.21.ffn.fc1.bias": "model-00001-of-00002.safetensors",
273
- "text_model.model.layers.21.ffn.fc1.weight": "model-00001-of-00002.safetensors",
274
- "text_model.model.layers.21.ffn.fc2.bias": "model-00001-of-00002.safetensors",
275
- "text_model.model.layers.21.ffn.fc2.weight": "model-00001-of-00002.safetensors",
276
- "text_model.model.layers.21.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
277
- "text_model.model.layers.21.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
278
- "text_model.model.layers.21.final_layer_norm.bias": "model-00001-of-00002.safetensors",
279
- "text_model.model.layers.21.final_layer_norm.weight": "model-00001-of-00002.safetensors",
280
- "text_model.model.layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
281
- "text_model.model.layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
282
- "text_model.model.layers.21.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
283
- "text_model.model.layers.21.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
284
- "text_model.model.layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
285
- "text_model.model.layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
286
- "text_model.model.layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
287
- "text_model.model.layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
288
- "text_model.model.layers.21.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
289
- "text_model.model.layers.21.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
290
- "text_model.model.layers.22.ffn.fc1.bias": "model-00001-of-00002.safetensors",
291
- "text_model.model.layers.22.ffn.fc1.weight": "model-00001-of-00002.safetensors",
292
- "text_model.model.layers.22.ffn.fc2.bias": "model-00001-of-00002.safetensors",
293
- "text_model.model.layers.22.ffn.fc2.weight": "model-00001-of-00002.safetensors",
294
- "text_model.model.layers.22.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
295
- "text_model.model.layers.22.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
296
- "text_model.model.layers.22.final_layer_norm.bias": "model-00001-of-00002.safetensors",
297
- "text_model.model.layers.22.final_layer_norm.weight": "model-00001-of-00002.safetensors",
298
- "text_model.model.layers.22.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
299
- "text_model.model.layers.22.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
300
- "text_model.model.layers.22.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
301
- "text_model.model.layers.22.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
302
- "text_model.model.layers.22.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
303
- "text_model.model.layers.22.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
304
- "text_model.model.layers.22.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
305
- "text_model.model.layers.22.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
306
- "text_model.model.layers.22.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
307
- "text_model.model.layers.22.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
308
- "text_model.model.layers.23.ffn.fc1.bias": "model-00001-of-00002.safetensors",
309
- "text_model.model.layers.23.ffn.fc1.weight": "model-00001-of-00002.safetensors",
310
- "text_model.model.layers.23.ffn.fc2.bias": "model-00001-of-00002.safetensors",
311
- "text_model.model.layers.23.ffn.fc2.weight": "model-00001-of-00002.safetensors",
312
- "text_model.model.layers.23.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
313
- "text_model.model.layers.23.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
314
- "text_model.model.layers.23.final_layer_norm.bias": "model-00001-of-00002.safetensors",
315
- "text_model.model.layers.23.final_layer_norm.weight": "model-00001-of-00002.safetensors",
316
- "text_model.model.layers.23.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
317
- "text_model.model.layers.23.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
318
- "text_model.model.layers.23.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
319
- "text_model.model.layers.23.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
320
- "text_model.model.layers.23.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
321
- "text_model.model.layers.23.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
322
- "text_model.model.layers.23.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
323
- "text_model.model.layers.23.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
324
- "text_model.model.layers.23.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
325
- "text_model.model.layers.23.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
326
- "text_model.model.layers.3.ffn.fc1.bias": "model-00001-of-00002.safetensors",
327
- "text_model.model.layers.3.ffn.fc1.weight": "model-00001-of-00002.safetensors",
328
- "text_model.model.layers.3.ffn.fc2.bias": "model-00001-of-00002.safetensors",
329
- "text_model.model.layers.3.ffn.fc2.weight": "model-00001-of-00002.safetensors",
330
- "text_model.model.layers.3.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
331
- "text_model.model.layers.3.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
332
- "text_model.model.layers.3.final_layer_norm.bias": "model-00001-of-00002.safetensors",
333
- "text_model.model.layers.3.final_layer_norm.weight": "model-00001-of-00002.safetensors",
334
- "text_model.model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
335
- "text_model.model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
336
- "text_model.model.layers.3.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
337
- "text_model.model.layers.3.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
338
- "text_model.model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
339
- "text_model.model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
340
- "text_model.model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
341
- "text_model.model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
342
- "text_model.model.layers.3.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
343
- "text_model.model.layers.3.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
344
- "text_model.model.layers.4.ffn.fc1.bias": "model-00001-of-00002.safetensors",
345
- "text_model.model.layers.4.ffn.fc1.weight": "model-00001-of-00002.safetensors",
346
- "text_model.model.layers.4.ffn.fc2.bias": "model-00001-of-00002.safetensors",
347
- "text_model.model.layers.4.ffn.fc2.weight": "model-00001-of-00002.safetensors",
348
- "text_model.model.layers.4.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
349
- "text_model.model.layers.4.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
350
- "text_model.model.layers.4.final_layer_norm.bias": "model-00001-of-00002.safetensors",
351
- "text_model.model.layers.4.final_layer_norm.weight": "model-00001-of-00002.safetensors",
352
- "text_model.model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
353
- "text_model.model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
354
- "text_model.model.layers.4.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
355
- "text_model.model.layers.4.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
356
- "text_model.model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
357
- "text_model.model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
358
- "text_model.model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
359
- "text_model.model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
360
- "text_model.model.layers.4.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
361
- "text_model.model.layers.4.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
362
- "text_model.model.layers.5.ffn.fc1.bias": "model-00001-of-00002.safetensors",
363
- "text_model.model.layers.5.ffn.fc1.weight": "model-00001-of-00002.safetensors",
364
- "text_model.model.layers.5.ffn.fc2.bias": "model-00001-of-00002.safetensors",
365
- "text_model.model.layers.5.ffn.fc2.weight": "model-00001-of-00002.safetensors",
366
- "text_model.model.layers.5.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
367
- "text_model.model.layers.5.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
368
- "text_model.model.layers.5.final_layer_norm.bias": "model-00001-of-00002.safetensors",
369
- "text_model.model.layers.5.final_layer_norm.weight": "model-00001-of-00002.safetensors",
370
- "text_model.model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
371
- "text_model.model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
372
- "text_model.model.layers.5.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
373
- "text_model.model.layers.5.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
374
- "text_model.model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
375
- "text_model.model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
376
- "text_model.model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
377
- "text_model.model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
378
- "text_model.model.layers.5.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
379
- "text_model.model.layers.5.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
380
- "text_model.model.layers.6.ffn.fc1.bias": "model-00001-of-00002.safetensors",
381
- "text_model.model.layers.6.ffn.fc1.weight": "model-00001-of-00002.safetensors",
382
- "text_model.model.layers.6.ffn.fc2.bias": "model-00001-of-00002.safetensors",
383
- "text_model.model.layers.6.ffn.fc2.weight": "model-00001-of-00002.safetensors",
384
- "text_model.model.layers.6.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
385
- "text_model.model.layers.6.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
386
- "text_model.model.layers.6.final_layer_norm.bias": "model-00001-of-00002.safetensors",
387
- "text_model.model.layers.6.final_layer_norm.weight": "model-00001-of-00002.safetensors",
388
- "text_model.model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
389
- "text_model.model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
390
- "text_model.model.layers.6.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
391
- "text_model.model.layers.6.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
392
- "text_model.model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
393
- "text_model.model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
394
- "text_model.model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
395
- "text_model.model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
396
- "text_model.model.layers.6.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
397
- "text_model.model.layers.6.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
398
- "text_model.model.layers.7.ffn.fc1.bias": "model-00001-of-00002.safetensors",
399
- "text_model.model.layers.7.ffn.fc1.weight": "model-00001-of-00002.safetensors",
400
- "text_model.model.layers.7.ffn.fc2.bias": "model-00001-of-00002.safetensors",
401
- "text_model.model.layers.7.ffn.fc2.weight": "model-00001-of-00002.safetensors",
402
- "text_model.model.layers.7.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
403
- "text_model.model.layers.7.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
404
- "text_model.model.layers.7.final_layer_norm.bias": "model-00001-of-00002.safetensors",
405
- "text_model.model.layers.7.final_layer_norm.weight": "model-00001-of-00002.safetensors",
406
- "text_model.model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
407
- "text_model.model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
408
- "text_model.model.layers.7.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
409
- "text_model.model.layers.7.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
410
- "text_model.model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
411
- "text_model.model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
412
- "text_model.model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
413
- "text_model.model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
414
- "text_model.model.layers.7.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
415
- "text_model.model.layers.7.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
416
- "text_model.model.layers.8.ffn.fc1.bias": "model-00001-of-00002.safetensors",
417
- "text_model.model.layers.8.ffn.fc1.weight": "model-00001-of-00002.safetensors",
418
- "text_model.model.layers.8.ffn.fc2.bias": "model-00001-of-00002.safetensors",
419
- "text_model.model.layers.8.ffn.fc2.weight": "model-00001-of-00002.safetensors",
420
- "text_model.model.layers.8.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
421
- "text_model.model.layers.8.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
422
- "text_model.model.layers.8.final_layer_norm.bias": "model-00001-of-00002.safetensors",
423
- "text_model.model.layers.8.final_layer_norm.weight": "model-00001-of-00002.safetensors",
424
- "text_model.model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
425
- "text_model.model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
426
- "text_model.model.layers.8.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
427
- "text_model.model.layers.8.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
428
- "text_model.model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
429
- "text_model.model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
430
- "text_model.model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
431
- "text_model.model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
432
- "text_model.model.layers.8.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
433
- "text_model.model.layers.8.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
434
- "text_model.model.layers.9.ffn.fc1.bias": "model-00001-of-00002.safetensors",
435
- "text_model.model.layers.9.ffn.fc1.weight": "model-00001-of-00002.safetensors",
436
- "text_model.model.layers.9.ffn.fc2.bias": "model-00001-of-00002.safetensors",
437
- "text_model.model.layers.9.ffn.fc2.weight": "model-00001-of-00002.safetensors",
438
- "text_model.model.layers.9.ffn.ffn_layernorm.bias": "model-00001-of-00002.safetensors",
439
- "text_model.model.layers.9.ffn.ffn_layernorm.weight": "model-00001-of-00002.safetensors",
440
- "text_model.model.layers.9.final_layer_norm.bias": "model-00001-of-00002.safetensors",
441
- "text_model.model.layers.9.final_layer_norm.weight": "model-00001-of-00002.safetensors",
442
- "text_model.model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
443
- "text_model.model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
444
- "text_model.model.layers.9.self_attn.out_proj.bias": "model-00001-of-00002.safetensors",
445
- "text_model.model.layers.9.self_attn.out_proj.weight": "model-00001-of-00002.safetensors",
446
- "text_model.model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
447
- "text_model.model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
448
- "text_model.model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
449
- "text_model.model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
450
- "text_model.model.layers.9.self_attn_layer_norm.bias": "model-00001-of-00002.safetensors",
451
- "text_model.model.layers.9.self_attn_layer_norm.weight": "model-00001-of-00002.safetensors",
452
- "text_model.model.segment_emb.weight": "model-00001-of-00002.safetensors",
453
- "vision_model.embeddings.column_embedder.weight": "model-00001-of-00002.safetensors",
454
- "vision_model.embeddings.patch_projection.bias": "model-00001-of-00002.safetensors",
455
- "vision_model.embeddings.patch_projection.weight": "model-00001-of-00002.safetensors",
456
- "vision_model.embeddings.row_embedder.weight": "model-00001-of-00002.safetensors",
457
- "vision_model.encoder.layer.0.attention.key.weight": "model-00001-of-00002.safetensors",
458
- "vision_model.encoder.layer.0.attention.output.weight": "model-00001-of-00002.safetensors",
459
- "vision_model.encoder.layer.0.attention.query.weight": "model-00001-of-00002.safetensors",
460
- "vision_model.encoder.layer.0.attention.value.weight": "model-00001-of-00002.safetensors",
461
- "vision_model.encoder.layer.0.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
462
- "vision_model.encoder.layer.0.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
463
- "vision_model.encoder.layer.0.mlp.wo.weight": "model-00001-of-00002.safetensors",
464
- "vision_model.encoder.layer.0.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
465
- "vision_model.encoder.layer.0.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
466
- "vision_model.encoder.layer.1.attention.key.weight": "model-00001-of-00002.safetensors",
467
- "vision_model.encoder.layer.1.attention.output.weight": "model-00001-of-00002.safetensors",
468
- "vision_model.encoder.layer.1.attention.query.weight": "model-00001-of-00002.safetensors",
469
- "vision_model.encoder.layer.1.attention.value.weight": "model-00001-of-00002.safetensors",
470
- "vision_model.encoder.layer.1.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
471
- "vision_model.encoder.layer.1.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
472
- "vision_model.encoder.layer.1.mlp.wo.weight": "model-00001-of-00002.safetensors",
473
- "vision_model.encoder.layer.1.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
474
- "vision_model.encoder.layer.1.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
475
- "vision_model.encoder.layer.10.attention.key.weight": "model-00001-of-00002.safetensors",
476
- "vision_model.encoder.layer.10.attention.output.weight": "model-00001-of-00002.safetensors",
477
- "vision_model.encoder.layer.10.attention.query.weight": "model-00001-of-00002.safetensors",
478
- "vision_model.encoder.layer.10.attention.value.weight": "model-00001-of-00002.safetensors",
479
- "vision_model.encoder.layer.10.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
480
- "vision_model.encoder.layer.10.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
481
- "vision_model.encoder.layer.10.mlp.wo.weight": "model-00001-of-00002.safetensors",
482
- "vision_model.encoder.layer.10.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
483
- "vision_model.encoder.layer.10.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
484
- "vision_model.encoder.layer.11.attention.key.weight": "model-00001-of-00002.safetensors",
485
- "vision_model.encoder.layer.11.attention.output.weight": "model-00001-of-00002.safetensors",
486
- "vision_model.encoder.layer.11.attention.query.weight": "model-00001-of-00002.safetensors",
487
- "vision_model.encoder.layer.11.attention.value.weight": "model-00001-of-00002.safetensors",
488
- "vision_model.encoder.layer.11.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
489
- "vision_model.encoder.layer.11.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
490
- "vision_model.encoder.layer.11.mlp.wo.weight": "model-00001-of-00002.safetensors",
491
- "vision_model.encoder.layer.11.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
492
- "vision_model.encoder.layer.11.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
493
- "vision_model.encoder.layer.12.attention.key.weight": "model-00001-of-00002.safetensors",
494
- "vision_model.encoder.layer.12.attention.output.weight": "model-00001-of-00002.safetensors",
495
- "vision_model.encoder.layer.12.attention.query.weight": "model-00001-of-00002.safetensors",
496
- "vision_model.encoder.layer.12.attention.value.weight": "model-00001-of-00002.safetensors",
497
- "vision_model.encoder.layer.12.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
498
- "vision_model.encoder.layer.12.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
499
- "vision_model.encoder.layer.12.mlp.wo.weight": "model-00001-of-00002.safetensors",
500
- "vision_model.encoder.layer.12.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
501
- "vision_model.encoder.layer.12.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
502
- "vision_model.encoder.layer.13.attention.key.weight": "model-00001-of-00002.safetensors",
503
- "vision_model.encoder.layer.13.attention.output.weight": "model-00001-of-00002.safetensors",
504
- "vision_model.encoder.layer.13.attention.query.weight": "model-00001-of-00002.safetensors",
505
- "vision_model.encoder.layer.13.attention.value.weight": "model-00001-of-00002.safetensors",
506
- "vision_model.encoder.layer.13.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
507
- "vision_model.encoder.layer.13.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
508
- "vision_model.encoder.layer.13.mlp.wo.weight": "model-00001-of-00002.safetensors",
509
- "vision_model.encoder.layer.13.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
510
- "vision_model.encoder.layer.13.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
511
- "vision_model.encoder.layer.14.attention.key.weight": "model-00002-of-00002.safetensors",
512
- "vision_model.encoder.layer.14.attention.output.weight": "model-00002-of-00002.safetensors",
513
- "vision_model.encoder.layer.14.attention.query.weight": "model-00002-of-00002.safetensors",
514
- "vision_model.encoder.layer.14.attention.value.weight": "model-00002-of-00002.safetensors",
515
- "vision_model.encoder.layer.14.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
516
- "vision_model.encoder.layer.14.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
517
- "vision_model.encoder.layer.14.mlp.wo.weight": "model-00002-of-00002.safetensors",
518
- "vision_model.encoder.layer.14.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
519
- "vision_model.encoder.layer.14.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
520
- "vision_model.encoder.layer.15.attention.key.weight": "model-00002-of-00002.safetensors",
521
- "vision_model.encoder.layer.15.attention.output.weight": "model-00002-of-00002.safetensors",
522
- "vision_model.encoder.layer.15.attention.query.weight": "model-00002-of-00002.safetensors",
523
- "vision_model.encoder.layer.15.attention.value.weight": "model-00002-of-00002.safetensors",
524
- "vision_model.encoder.layer.15.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
525
- "vision_model.encoder.layer.15.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
526
- "vision_model.encoder.layer.15.mlp.wo.weight": "model-00002-of-00002.safetensors",
527
- "vision_model.encoder.layer.15.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
528
- "vision_model.encoder.layer.15.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
529
- "vision_model.encoder.layer.16.attention.key.weight": "model-00002-of-00002.safetensors",
530
- "vision_model.encoder.layer.16.attention.output.weight": "model-00002-of-00002.safetensors",
531
- "vision_model.encoder.layer.16.attention.query.weight": "model-00002-of-00002.safetensors",
532
- "vision_model.encoder.layer.16.attention.value.weight": "model-00002-of-00002.safetensors",
533
- "vision_model.encoder.layer.16.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
534
- "vision_model.encoder.layer.16.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
535
- "vision_model.encoder.layer.16.mlp.wo.weight": "model-00002-of-00002.safetensors",
536
- "vision_model.encoder.layer.16.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
537
- "vision_model.encoder.layer.16.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
538
- "vision_model.encoder.layer.17.attention.key.weight": "model-00002-of-00002.safetensors",
539
- "vision_model.encoder.layer.17.attention.output.weight": "model-00002-of-00002.safetensors",
540
- "vision_model.encoder.layer.17.attention.query.weight": "model-00002-of-00002.safetensors",
541
- "vision_model.encoder.layer.17.attention.value.weight": "model-00002-of-00002.safetensors",
542
- "vision_model.encoder.layer.17.mlp.wi_0.weight": "model-00002-of-00002.safetensors",
543
- "vision_model.encoder.layer.17.mlp.wi_1.weight": "model-00002-of-00002.safetensors",
544
- "vision_model.encoder.layer.17.mlp.wo.weight": "model-00002-of-00002.safetensors",
545
- "vision_model.encoder.layer.17.pre_attention_layer_norm.weight": "model-00002-of-00002.safetensors",
546
- "vision_model.encoder.layer.17.pre_mlp_layer_norm.weight": "model-00002-of-00002.safetensors",
547
- "vision_model.encoder.layer.2.attention.key.weight": "model-00001-of-00002.safetensors",
548
- "vision_model.encoder.layer.2.attention.output.weight": "model-00001-of-00002.safetensors",
549
- "vision_model.encoder.layer.2.attention.query.weight": "model-00001-of-00002.safetensors",
550
- "vision_model.encoder.layer.2.attention.value.weight": "model-00001-of-00002.safetensors",
551
- "vision_model.encoder.layer.2.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
552
- "vision_model.encoder.layer.2.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
553
- "vision_model.encoder.layer.2.mlp.wo.weight": "model-00001-of-00002.safetensors",
554
- "vision_model.encoder.layer.2.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
555
- "vision_model.encoder.layer.2.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
556
- "vision_model.encoder.layer.3.attention.key.weight": "model-00001-of-00002.safetensors",
557
- "vision_model.encoder.layer.3.attention.output.weight": "model-00001-of-00002.safetensors",
558
- "vision_model.encoder.layer.3.attention.query.weight": "model-00001-of-00002.safetensors",
559
- "vision_model.encoder.layer.3.attention.value.weight": "model-00001-of-00002.safetensors",
560
- "vision_model.encoder.layer.3.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
561
- "vision_model.encoder.layer.3.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
562
- "vision_model.encoder.layer.3.mlp.wo.weight": "model-00001-of-00002.safetensors",
563
- "vision_model.encoder.layer.3.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
564
- "vision_model.encoder.layer.3.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
565
- "vision_model.encoder.layer.4.attention.key.weight": "model-00001-of-00002.safetensors",
566
- "vision_model.encoder.layer.4.attention.output.weight": "model-00001-of-00002.safetensors",
567
- "vision_model.encoder.layer.4.attention.query.weight": "model-00001-of-00002.safetensors",
568
- "vision_model.encoder.layer.4.attention.value.weight": "model-00001-of-00002.safetensors",
569
- "vision_model.encoder.layer.4.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
570
- "vision_model.encoder.layer.4.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
571
- "vision_model.encoder.layer.4.mlp.wo.weight": "model-00001-of-00002.safetensors",
572
- "vision_model.encoder.layer.4.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
573
- "vision_model.encoder.layer.4.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
574
- "vision_model.encoder.layer.5.attention.key.weight": "model-00001-of-00002.safetensors",
575
- "vision_model.encoder.layer.5.attention.output.weight": "model-00001-of-00002.safetensors",
576
- "vision_model.encoder.layer.5.attention.query.weight": "model-00001-of-00002.safetensors",
577
- "vision_model.encoder.layer.5.attention.value.weight": "model-00001-of-00002.safetensors",
578
- "vision_model.encoder.layer.5.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
579
- "vision_model.encoder.layer.5.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
580
- "vision_model.encoder.layer.5.mlp.wo.weight": "model-00001-of-00002.safetensors",
581
- "vision_model.encoder.layer.5.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
582
- "vision_model.encoder.layer.5.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
583
- "vision_model.encoder.layer.6.attention.key.weight": "model-00001-of-00002.safetensors",
584
- "vision_model.encoder.layer.6.attention.output.weight": "model-00001-of-00002.safetensors",
585
- "vision_model.encoder.layer.6.attention.query.weight": "model-00001-of-00002.safetensors",
586
- "vision_model.encoder.layer.6.attention.value.weight": "model-00001-of-00002.safetensors",
587
- "vision_model.encoder.layer.6.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
588
- "vision_model.encoder.layer.6.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
589
- "vision_model.encoder.layer.6.mlp.wo.weight": "model-00001-of-00002.safetensors",
590
- "vision_model.encoder.layer.6.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
591
- "vision_model.encoder.layer.6.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
592
- "vision_model.encoder.layer.7.attention.key.weight": "model-00001-of-00002.safetensors",
593
- "vision_model.encoder.layer.7.attention.output.weight": "model-00001-of-00002.safetensors",
594
- "vision_model.encoder.layer.7.attention.query.weight": "model-00001-of-00002.safetensors",
595
- "vision_model.encoder.layer.7.attention.value.weight": "model-00001-of-00002.safetensors",
596
- "vision_model.encoder.layer.7.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
597
- "vision_model.encoder.layer.7.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
598
- "vision_model.encoder.layer.7.mlp.wo.weight": "model-00001-of-00002.safetensors",
599
- "vision_model.encoder.layer.7.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
600
- "vision_model.encoder.layer.7.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
601
- "vision_model.encoder.layer.8.attention.key.weight": "model-00001-of-00002.safetensors",
602
- "vision_model.encoder.layer.8.attention.output.weight": "model-00001-of-00002.safetensors",
603
- "vision_model.encoder.layer.8.attention.query.weight": "model-00001-of-00002.safetensors",
604
- "vision_model.encoder.layer.8.attention.value.weight": "model-00001-of-00002.safetensors",
605
- "vision_model.encoder.layer.8.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
606
- "vision_model.encoder.layer.8.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
607
- "vision_model.encoder.layer.8.mlp.wo.weight": "model-00001-of-00002.safetensors",
608
- "vision_model.encoder.layer.8.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
609
- "vision_model.encoder.layer.8.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
610
- "vision_model.encoder.layer.9.attention.key.weight": "model-00001-of-00002.safetensors",
611
- "vision_model.encoder.layer.9.attention.output.weight": "model-00001-of-00002.safetensors",
612
- "vision_model.encoder.layer.9.attention.query.weight": "model-00001-of-00002.safetensors",
613
- "vision_model.encoder.layer.9.attention.value.weight": "model-00001-of-00002.safetensors",
614
- "vision_model.encoder.layer.9.mlp.wi_0.weight": "model-00001-of-00002.safetensors",
615
- "vision_model.encoder.layer.9.mlp.wi_1.weight": "model-00001-of-00002.safetensors",
616
- "vision_model.encoder.layer.9.mlp.wo.weight": "model-00001-of-00002.safetensors",
617
- "vision_model.encoder.layer.9.pre_attention_layer_norm.weight": "model-00001-of-00002.safetensors",
618
- "vision_model.encoder.layer.9.pre_mlp_layer_norm.weight": "model-00001-of-00002.safetensors",
619
- "vision_model.layernorm.weight": "model-00002-of-00002.safetensors"
620
- }
621
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a1efccef236dea0c422e37d1584fa27b28c3dea5a09a98e2d6ef53c83a4830c
3
+ size 56481
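
The removed lines above are the tail of the shard index in `model.safetensors.index.json`, which maps each parameter name to the safetensors shard that stores it; the three `+` lines replacing it are a Git LFS pointer (version, sha256 oid, byte size). As a minimal sketch only — assuming a checkout where the original index JSON is still materialized rather than an LFS pointer, and assuming the standard `weight_map` key used by Hugging Face shard indexes — resolving which shard holds a given tensor looks like this:

```python
import json

# Hypothetical local path to the original (pre-LFS) shard index.
with open("model.safetensors.index.json", "r", encoding="utf-8") as f:
    index = json.load(f)

# Maps tensor names to shard files, exactly as in the removed lines, e.g.
# "text_model.model.layers.14.ffn.fc1.weight" -> "model-00001-of-00002.safetensors"
weight_map = index["weight_map"]
print(weight_map["text_model.model.layers.14.ffn.fc1.weight"])
```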
 
modeling_kosmos2_5.py CHANGED
The diff for this file is too large to render. See raw diff
 
preprocessor_config.json CHANGED
@@ -1,15 +1,3 @@
1
- {
2
- "do_convert_rgb": true,
3
- "do_normalize": true,
4
- "image_processor_type": "Kosmos2_5ImageProcessor",
5
- "max_patches": 4096,
6
- "patch_size": {
7
- "height": 16,
8
- "width": 16
9
- },
10
- "processor_class": "Kosmos2_5Processor",
11
- "auto_map": {
12
- "AutoProcessor": "processing_kosmos2_5.Kosmos2_5Processor",
13
- "AutoImageProcessor": "image_processing_kosmos2_5.Kosmos2_5ImageProcessor"
14
- }
15
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d46bc213f9d995f6d772767554da4651cc4888a962f96a06313c275409bcc68e
3
+ size 393
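
As with the other files touched by this commit, the removed image-processor configuration (`Kosmos2_5ImageProcessor` with 16×16 patches, `max_patches` 4096, and the `auto_map` entries pointing at the custom processing modules) is replaced by a three-line Git LFS pointer. A minimal sketch, with a hypothetical local path, of reading the fields of such a pointer file:

```python
def parse_lfs_pointer(path: str) -> tuple[str, str, int]:
    """Parse the version / oid / size fields of a Git LFS pointer file."""
    fields = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                key, _, value = line.partition(" ")
                fields[key] = value
    oid = fields["oid"].split(":", 1)[1]  # drop the "sha256:" prefix
    return fields["version"], oid, int(fields["size"])

# Example (hypothetical path): parse_lfs_pointer("preprocessor_config.json")
# would return the spec URL, the sha256 digest, and 393 for the pointer above.
```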
 
processing_kosmos2_5.py CHANGED
@@ -1,147 +1,3 @@
1
- # coding=utf-8
2
- # Copyright 2024 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # Licensed under the Apache License, Version 2.0 (the "License");
5
- # you may not use this file except in compliance with the License.
6
- # You may obtain a copy of the License at
7
- #
8
- # http://www.apache.org/licenses/LICENSE-2.0
9
- #
10
- # Unless required by applicable law or agreed to in writing, software
11
- # distributed under the License is distributed on an "AS IS" BASIS,
12
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
- # See the License for the specific language governing permissions and
14
- # limitations under the License.
15
- """
16
- Processor class for Kosmos2_5.
17
- """
18
-
19
- from typing import List, Optional, Union
20
- import transformers
21
- from transformers.image_processing_utils import BatchFeature
22
- from transformers.processing_utils import ProcessorMixin
23
- from transformers.tokenization_utils_base import PaddingStrategy, TextInput, TruncationStrategy
24
- from transformers.utils import TensorType, is_torch_available
25
- from .image_processing_kosmos2_5 import Kosmos2_5ImageProcessor
26
- transformers.Kosmos2_5ImageProcessor = Kosmos2_5ImageProcessor
27
-
28
- if is_torch_available():
29
- import torch
30
-
31
-
32
- class Kosmos2_5Processor(ProcessorMixin):
33
- r"""
34
- Constructs a Kosmos2_5 processor which wraps a BERT tokenizer and Kosmos2_5 image processor into a single
35
- processor.
36
-
37
- [`Kosmos2_5Processor`] offers all the functionalities of [`Kosmos2_5ImageProcessor`] and [`T5TokenizerFast`]. See
38
- the docstring of [`~Kosmos2_5Processor.__call__`] and [`~Kosmos2_5Processor.decode`] for more information.
39
-
40
- Args:
41
- image_processor (`Kosmos2_5ImageProcessor`):
42
- An instance of [`Kosmos2_5ImageProcessor`]. The image processor is a required input.
43
- tokenizer (Union[`T5TokenizerFast`, `T5Tokenizer`]):
44
- An instance of ['T5TokenizerFast`] or ['T5Tokenizer`]. The tokenizer is a required input.
45
- """
46
-
47
- attributes = ["image_processor", "tokenizer"]
48
- image_processor_class = "Kosmos2_5ImageProcessor"
49
- tokenizer_class = "PreTrainedTokenizerFast"
50
-
51
- def __init__(self, image_processor, tokenizer):
52
- tokenizer.return_token_type_ids = False
53
- self.image_processor = image_processor
54
- self.tokenizer = tokenizer
55
-
56
- def __call__(
57
- self,
58
- images=None,
59
- text: Union[TextInput, List[TextInput]] = None,
60
- add_special_tokens: bool = True,
61
- padding: Union[bool, str, PaddingStrategy] = True,
62
- truncation: Union[bool, str, TruncationStrategy] = True,
63
- max_length: Optional[int] = None,
64
- max_patches: Optional[int] = 4096,
65
- stride: int = 0,
66
- pad_to_multiple_of: Optional[int] = None,
67
- return_attention_mask: Optional[bool] = None,
68
- return_tensors: Optional[Union[str, TensorType]] = "pt",
69
- **kwargs,
70
- ) -> BatchFeature:
71
- """
72
- This method uses [`Kosmos2_5ImageProcessor.preprocess`] method to prepare image(s) for the model, and
73
- [`PreTrainedTokenizerFast.__call__`] to prepare text for the model.
74
-
75
- Please refer to the docstring of the above two methods for more information.
76
-
77
- The rest of this documentation shows the arguments specific to `Kosmos2_5Processor`.
78
- """
79
- if images is None and text is None:
80
- raise ValueError("You have to specify either images or text.")
81
-
82
- encoding = BatchFeature()
83
-
84
- if images is not None:
85
- image_encoding = self.image_processor(
86
- images, return_tensors=return_tensors, max_patches=max_patches, **kwargs
87
- )
88
- image_encoding.pop("rows")
89
- image_encoding.pop("cols")
90
- encoding.update(image_encoding)
91
-
92
- if text is not None:
93
- # use updates or pop
94
- input = self.tokenizer(
95
- text,
96
- add_special_tokens=add_special_tokens,
97
- padding=padding,
98
- truncation=truncation,
99
- max_length=max_length,
100
- stride=stride,
101
- pad_to_multiple_of=pad_to_multiple_of,
102
- return_attention_mask=return_attention_mask,
103
- return_tensors="pt",
104
- )
105
-
106
- batch_size, seq_len = input.input_ids.shape
107
- additional_tokens = [0, 100283] + [0] * 2048 + [100284]
108
- additional_tokens_tensor = torch.tensor(additional_tokens).unsqueeze(0).repeat(batch_size, 1)
109
- input_ids = torch.cat([additional_tokens_tensor, input.input_ids], dim=1)
110
-
111
- image_embeds_position_mask = [0, -1] + [1] * 2048 + [-1] + [0] * seq_len
112
- image_embeds_position_mask = (
113
- torch.LongTensor(image_embeds_position_mask).unsqueeze(0).repeat(batch_size, 1)
114
- )
115
-
116
- added_attention_mask = [1, 1] + [1] * 2048 + [1]
117
- added_attention_mask_tensor = torch.tensor(added_attention_mask).unsqueeze(0).repeat(batch_size, 1)
118
- attention_mask = torch.cat([added_attention_mask_tensor, input.attention_mask], dim=1)
119
- encoding.update(
120
- {
121
- "input_ids": input_ids,
122
- "attention_mask": attention_mask,
123
- "image_embeds_position_mask": image_embeds_position_mask,
124
- }
125
- )
126
-
127
- return encoding
128
-
129
- def batch_decode(self, *args, **kwargs):
130
- """
131
- This method forwards all its arguments to Kosmos2_5TokenizerFast's [`~PreTrainedTokenizer.batch_decode`].
132
- Please refer to the docstring of this method for more information.
133
- """
134
- return self.tokenizer.batch_decode(*args, **kwargs)
135
-
136
- def decode(self, *args, **kwargs):
137
- """
138
- This method forwards all its arguments to Kosmos2_5TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please
139
- refer to the docstring of this method for more information.
140
- """
141
- return self.tokenizer.decode(*args, **kwargs)
142
-
143
- @property
144
- def model_input_names(self):
145
- tokenizer_input_names = self.tokenizer.model_input_names
146
- image_processor_input_names = self.image_processor.model_input_names
147
- return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e1695632edfe24f44f91dfee4558094e9cc43ba9d94a2adfdf6421c92a242360
3
+ size 6211
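
The removed `Kosmos2_5Processor.__call__` above prepends a fixed image prefix to the tokenized text: token ids 100283 and 100284 wrapped around 2048 placeholder positions, plus a matching `image_embeds_position_mask` and an extended attention mask. A minimal standalone restatement of that prefix logic (the helper name is hypothetical; the constants are copied verbatim from the deleted lines):

```python
import torch

def build_image_prefix(input_ids: torch.Tensor, attention_mask: torch.Tensor):
    batch_size, seq_len = input_ids.shape

    # [0, <boi=100283>, 2048 placeholder slots, <eoi=100284>] prepended per sample.
    prefix_ids = torch.tensor([0, 100283] + [0] * 2048 + [100284])
    full_input_ids = torch.cat(
        [prefix_ids.unsqueeze(0).repeat(batch_size, 1), input_ids], dim=1
    )

    # Same values as the deleted code: 1 marks the 2048 image-embedding slots.
    position_mask = torch.LongTensor([0, -1] + [1] * 2048 + [-1] + [0] * seq_len)
    image_embeds_position_mask = position_mask.unsqueeze(0).repeat(batch_size, 1)

    # The whole 2051-token prefix is attended to.
    prefix_attention = torch.ones(batch_size, 2051, dtype=attention_mask.dtype)
    full_attention_mask = torch.cat([prefix_attention, attention_mask], dim=1)

    return full_input_ids, full_attention_mask, image_embeds_position_mask
```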
 
special_tokens_map.json CHANGED
@@ -1,30 +1,3 @@
1
- {
2
- "bos_token": {
3
- "content": "<s>",
4
- "lstrip": false,
5
- "normalized": false,
6
- "rstrip": false,
7
- "single_word": false
8
- },
9
- "eos_token": {
10
- "content": "</s>",
11
- "lstrip": false,
12
- "normalized": false,
13
- "rstrip": false,
14
- "single_word": false
15
- },
16
- "pad_token": {
17
- "content": "<pad>",
18
- "lstrip": false,
19
- "normalized": false,
20
- "rstrip": false,
21
- "single_word": false
22
- },
23
- "unk_token": {
24
- "content": "<unk>",
25
- "lstrip": false,
26
- "normalized": false,
27
- "rstrip": false,
28
- "single_word": false
29
- }
30
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:358c249e2fb29060c6b73157d428853b0c48710deffc8ee670ab1013880946c9
3
+ size 552
 
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff