Fill-Mask
Transformers
PyTorch
Safetensors
English
nomic_bert
custom_code
zpn committed on
Commit cbfed57
1 Parent(s): d12d71b

Update modeling_hf_nomic_bert.py

Files changed (1)
  1. modeling_hf_nomic_bert.py +145 -149
modeling_hf_nomic_bert.py CHANGED
@@ -3,39 +3,34 @@
3
  # https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
4
  # https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py
5
 
 
 
6
  # Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
7
  import os
8
- import logging
 
9
  from functools import partial
10
- from typing import Optional, List, Tuple, Union
11
 
12
  import torch
13
  import torch.nn as nn
14
  import torch.nn.functional as F
15
  from einops import rearrange, repeat
 
16
  from transformers import GPT2Config, PreTrainedModel
17
  from transformers.models.bert.modeling_bert import (
18
  BaseModelOutputWithPoolingAndCrossAttentions,
19
  MaskedLMOutput,
20
- SequenceClassifierOutput
21
- )
22
-
23
- import re
24
- from collections import OrderedDict
25
- from safetensors.torch import load_file as safe_load_file
26
- from transformers.utils import (
27
- SAFE_WEIGHTS_INDEX_NAME,
28
- SAFE_WEIGHTS_NAME,
29
- WEIGHTS_INDEX_NAME,
30
- WEIGHTS_NAME,
31
  )
 
32
  from transformers.utils.hub import cached_file, get_checkpoint_shard_files
33
 
34
-
35
  from .configuration_hf_nomic_bert import NomicBertConfig
36
 
37
  logger = logging.getLogger(__name__)
38
 
 
39
  # adapted from flash attention, added safe serialization option for hf models
40
  def state_dict_from_pretrained(model_name, safe_serialization=False, device=None, dtype=None):
41
  # If not fp32, then we don't want to load directly to the GPU
@@ -50,18 +45,12 @@ def state_dict_from_pretrained(model_name, safe_serialization=False, device=None
50
  safe_weights_index_path = os.path.join(model_name, SAFE_WEIGHTS_INDEX_NAME)
51
 
52
  if os.path.isfile(weights_path):
53
- resolved_archive_file = cached_file(
54
- model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False
55
- )
56
  elif os.path.isfile(weights_index_path):
57
- resolved_archive_file = cached_file(
58
- model_name, WEIGHTS_INDEX_NAME, _raise_exceptions_for_missing_entries=False
59
- )
60
  is_sharded = True
61
  elif os.path.isfile(safe_weights_path):
62
- resolved_archive_file = cached_file(
63
- model_name, SAFE_WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False
64
- )
65
  load_safe = True
66
  elif os.path.isfile(safe_weights_index_path):
67
  resolved_archive_file = cached_file(
@@ -74,8 +63,7 @@ def state_dict_from_pretrained(model_name, safe_serialization=False, device=None
74
  resolved_archive_file = cached_file(model_name, weight_name, _raise_exceptions_for_missing_entries=False)
75
  if resolved_archive_file is None:
76
  weight_index = WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_WEIGHTS_INDEX_NAME
77
- resolved_archive_file = cached_file(model_name, weight_index,
78
- _raise_exceptions_for_missing_entries=False)
79
  if resolved_archive_file is not None:
80
  is_sharded = True
81
 
@@ -92,9 +80,7 @@ def state_dict_from_pretrained(model_name, safe_serialization=False, device=None
92
  if is_sharded:
93
  # resolved_archive_file becomes a list of files that point to the different
94
  # checkpoint shards in this case.
95
- resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
96
- model_name, resolved_archive_file
97
- )
98
  state_dict = {}
99
  for sharded_file in resolved_archive_file:
100
  state_dict.update(loader(sharded_file))
@@ -106,7 +92,7 @@ def state_dict_from_pretrained(model_name, safe_serialization=False, device=None
106
  state_dict = {k: v.to(device=device) for k, v in state_dict.items()}
107
  return state_dict
108
 
109
-
110
  def filter_shapes(state_dict, model):
111
  """
112
  Filters the state dict to match the current model shape.
@@ -118,11 +104,18 @@ def filter_shapes(state_dict, model):
118
  filtered_state_dict[key] = value
119
  return filtered_state_dict
120
 
121
-
122
- def remap_bert_state_dict(state_dict, config, remove_bert=False, remove_cls_weights=False, add_pooling_layer=False):
123
  """
124
  Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
125
  """
 
126
  def add_bert_prefix(key):
127
  # prepend bert. to the key
128
  if key.startswith("bert.") or key.startswith("cls."):
@@ -130,7 +123,7 @@ def remap_bert_state_dict(state_dict, config, remove_bert=False, remove_cls_weig
130
  return f"bert.{key}"
131
 
132
  state_dict = OrderedDict((add_bert_prefix(k), v) for k, v in state_dict.items())
133
-
134
  # LayerNorm
135
  def key_mapping_ln_gamma_beta(key):
136
  key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key)
@@ -195,9 +188,7 @@ def remap_bert_state_dict(state_dict, config, remove_bert=False, remove_cls_weig
195
  bk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.bias")
196
  bv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.bias")
197
  if not (last_layer_subset and d == config.num_hidden_layers - 1):
198
- state_dict[f"bert.encoder.layers.{d}.attn.Wqkv.weight"] = torch.cat(
199
- [Wq, Wk, Wv], dim=0
200
- )
201
  state_dict[f"bert.encoder.layers.{d}.attn.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)
202
  else:
203
  state_dict[f"bert.encoder.layers.{d}.attn.Wq.weight"] = Wq
@@ -217,7 +208,6 @@ def remap_bert_state_dict(state_dict, config, remove_bert=False, remove_cls_weig
217
  def key_mapping_decoder_bias(key):
218
  return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key)
219
 
220
-
221
  # remove nsp weights, we don't use
222
  state_dict.pop("cls.seq_relationship.weight", None)
223
  state_dict.pop("cls.seq_relationship.bias", None)
@@ -226,12 +216,14 @@ def remap_bert_state_dict(state_dict, config, remove_bert=False, remove_cls_weig
226
  state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items())
227
 
228
  if remove_cls_weights:
229
- cls_weights = ["cls.predictions.decoder.bias",
230
- "cls.predictions.transform.dense.weight",
231
- "cls.predictions.transform.dense.bias",
232
- "cls.predictions.transform.layer_norm.weight",
233
- "cls.predictions.transform.layer_norm.bias",
234
- "cls.predictions.decoder.weight"]
235
  for weight in cls_weights:
236
  state_dict.pop(weight, None)
237
 
@@ -257,20 +249,21 @@ def remap_bert_state_dict(state_dict, config, remove_bert=False, remove_cls_weig
257
  )
258
 
259
  if add_pooling_layer is False:
260
- pooler_weights = ["bert.pooler.dense.weight",
261
- "bert.pooler.dense.bias",
262
- ]
 
263
  for key in pooler_weights:
264
  state_dict.pop(key, None)
265
 
266
  if remove_bert:
 
267
  def remove_bert_prefix(key):
268
  key = re.sub(r"^bert.", "", key)
269
  return key
270
 
271
  state_dict = OrderedDict((remove_bert_prefix(k), v) for k, v in state_dict.items())
272
 
273
-
274
  return state_dict
275
 
276
 
@@ -278,6 +271,7 @@ class NomicBertPreTrainedModel(PreTrainedModel):
278
  """An abstract class to handle weights initialization and
279
  a simple interface for dowloading and loading pretrained models.
280
  """
 
281
  config_class = NomicBertConfig
282
  base_model_prefix = "model"
283
  supports_gradient_checkpointing = True
@@ -317,14 +311,13 @@ class NomicBertPreTrainedModel(PreTrainedModel):
317
  if config is None:
318
  config = cls.config_class.from_pretrained(model_name)
319
  remove_cls = cls != NomicBertForPreTraining
320
- remove_bert_prefix = cls != NomicBertForPreTraining
321
  ignore_mismatched_shapes = kwargs.pop("ignore_mismatched_sizes", False)
322
  num_labels = kwargs.pop("num_labels", None)
323
  rotary_scaling_factor = kwargs.pop("rotary_scaling_factor", None)
324
  if rotary_scaling_factor:
325
  config.rotary_scaling_factor = rotary_scaling_factor
326
- else:
327
- config.rotary_scaling_factor = None
328
  if config.n_positions <= 0 and config.rotary_emb_fraction > 0:
329
  config.n_positions = 2048
330
  if num_labels:
@@ -341,26 +334,34 @@ class NomicBertPreTrainedModel(PreTrainedModel):
341
  # Assuming we know what we're doing when loading from disk
342
  # Prob a bad assumption but i'm tired and want to train this asap
343
  if os.path.exists(model_name):
344
- state_dict = torch.load(f"{model_name}/pytorch_model.bin")
345
  if ignore_mismatched_shapes:
346
  state_dict = filter_shapes(state_dict, model)
347
  load_return = model.load_state_dict(state_dict, strict=False)
348
  else:
349
  # TODO: can probably check config class and see if we need to remap from a bert model
350
- state_dict = state_dict_from_pretrained(model_name)
351
- state_dict = remap_bert_state_dict(state_dict,
352
- config,
353
- remove_bert=remove_bert_prefix,
354
- remove_cls_weights=remove_cls,
355
- add_pooling_layer=getattr(config, "add_pooling_layer", False)
356
- )
357
  if ignore_mismatched_shapes:
358
  state_dict = filter_shapes(state_dict, model)
359
 
360
- load_return = model.load_state_dict(
361
- state_dict,
362
- strict=True
363
- )
364
  logger.warning(load_return)
365
  return model
366
 
@@ -380,25 +381,21 @@ def _init_weights(module, initializer_range=0.02):
380
  if module.padding_idx is not None:
381
  nn.init.zeros_(module.weight[module.padding_idx])
382
 
383
-
384
  class NomicBertEmbeddings(nn.Module):
385
- def __init__(
386
- self,
387
- config
388
- ):
389
  """
390
  If max_position_embeddings <= 0, there's no position embeddings
391
  If type_vocab_size <= 0, there's no token type embeddings
392
  """
393
  super().__init__()
394
- self.word_embeddings = nn.Embedding(
395
- config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id
396
- )
397
  self.max_position_embeddings = config.max_position_embeddings if config.rotary_emb_fraction <= 0 else 0
398
  self.type_vocab_size = config.type_vocab_size
399
  if self.max_position_embeddings > 0 and config.rotary_emb_fraction <= 0:
400
  self.position_embeddings = nn.Embedding(
401
- config.max_position_embeddings, config.hidden_size,
 
402
  )
403
  if self.type_vocab_size > 0:
404
  self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
@@ -425,6 +422,7 @@ class NomicBertEmbeddings(nn.Module):
425
  embeddings = embeddings + position_embeddings
426
  return embeddings
427
 
 
428
  class NomicBertMLP(nn.Module):
429
  def __init__(
430
  self,
@@ -442,11 +440,7 @@ class NomicBertMLP(nn.Module):
442
  hidden_features = hidden_features if hidden_features is not None else in_features * 4
443
  self.return_residual = return_residual
444
  self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1)
445
- approximate = (
446
- "tanh"
447
- if activation in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
448
- else "none"
449
- )
450
  self.activation = nn.GELU(approximate=approximate) if activation == "gelu" else activation
451
  self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2)
452
 
@@ -456,7 +450,7 @@ class NomicBertMLP(nn.Module):
456
  y = self.fc2(y)
457
  return y if not self.return_residual else (y, x)
458
 
459
-
460
  class NomciBertGatedMLP(nn.Module):
461
  def __init__(
462
  self,
@@ -474,9 +468,7 @@ class NomciBertGatedMLP(nn.Module):
474
  ):
475
  super().__init__()
476
  out_features = out_features if out_features is not None else in_features
477
- hidden_features = (
478
- hidden_features if hidden_features is not None else int(8 * in_features / 3)
479
- )
480
  hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
481
  self.return_residual = return_residual
482
 
@@ -513,8 +505,8 @@ def apply_rotary_emb(x, cos, sin, offset=0, interleaved=False):
513
  ro_dim = cos.shape[-1] * 2
514
  assert ro_dim <= x.shape[-1]
515
  cos, sin = (
516
- cos[offset: offset + x.shape[1]],
517
- sin[offset: offset + x.shape[1]],
518
  )
519
  cos = repeat(cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
520
  sin = repeat(sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
@@ -571,10 +563,7 @@ class NomicBertRotaryEmbedding(nn.Module):
571
  self._sin_k_cached = None
572
 
573
  def _compute_inv_freq(self, device=None):
574
- return 1.0 / (
575
- self.base
576
- ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
577
- )
578
 
579
  def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
580
  # Reset the tables if the sequence length has changed,
@@ -646,14 +635,10 @@ class NomicBertDynamicNTKRotaryEmbedding(NomicBertRotaryEmbedding):
646
  self.rotary_scaling_factor = rotary_scaling_factor
647
  self.max_position_embeddings = max_position_embeddings
648
 
649
-
650
  def _compute_inv_freq(self, base=None, device=None):
651
  if base is None:
652
  base = self.base
653
- return 1.0 / (
654
- base
655
- ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim)
656
- )
657
 
658
  def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
659
  # Reset the tables if the sequence length has changed,
@@ -704,8 +689,7 @@ class NomicBertDynamicNTKRotaryEmbedding(NomicBertRotaryEmbedding):
704
  self._sin_cached = torch.sin(freqs).to(dtype)
705
  else:
706
  power = (
707
- torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device)
708
- - seqlen // 2
709
  ) / self.scale_base
710
  scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
711
  # We want the multiplication by scale to happen in fp32
@@ -714,6 +698,7 @@ class NomicBertDynamicNTKRotaryEmbedding(NomicBertRotaryEmbedding):
714
  self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
715
  self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
716
 
 
717
  class NomicBertAttention(nn.Module):
718
  """Multi-head self-attention and cross-attention"""
719
 
@@ -754,8 +739,8 @@ class NomicBertAttention(nn.Module):
754
  scale_base=config.rotary_emb_scale_base,
755
  interleaved=config.rotary_emb_interleaved,
756
  rotary_scaling_factor=config.rotary_scaling_factor,
757
- max_position_embeddings=config.n_positions,
758
- )
759
  else:
760
  self.rotary_emb = NomicBertRotaryEmbedding(
761
  dim=self.rotary_emb_dim,
@@ -826,7 +811,7 @@ class NomicBertAttention(nn.Module):
826
  attn_output = self.out_proj(attn_output)
827
 
828
  return attn_output
829
-
830
 
831
  class NomicBertBlock(nn.Module):
832
  def __init__(
@@ -836,17 +821,31 @@ class NomicBertBlock(nn.Module):
836
  super().__init__()
837
  self.prenorm = config.prenorm
838
  self.fused_dropout_add_ln = config.fused_dropout_add_ln
839
-
840
- self.attn = NomicBertAttention(config)
841
  activation = (
842
- F.sigmoid
843
- if config.activation_function == "glu"
844
- else (F.silu if config.activation_function == "swiglu" else F.gelu)
845
  )
846
  if config.activation_function in ["glu", "swiglu", "geglu"]:
847
- self.mlp = NomciBertGatedMLP(config.n_embd, hidden_features=config.n_inner, bias1=config.mlp_fc1_bias, bias2=config.mlp_fc2_bias, activation=activation, fused_bias_fc=config.fused_bias_fc)
848
  else:
849
- self.mlp = NomicBertMLP(config.n_embd, hidden_features=config.n_inner, bias1=config.mlp_fc1_bias, bias2=config.mlp_fc2_bias, activation=activation, fused_bias_fc=config.fused_bias_fc)
850
 
851
  self.dropout1 = nn.Dropout(config.resid_pdrop)
852
  self.norm1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
@@ -880,7 +879,13 @@ class NomicBertBlock(nn.Module):
880
  dropped = self.dropout1(hidden_states)
881
  residual = (dropped + residual) if residual is not None else dropped
882
  hidden_states = self.norm1(residual.to(dtype=self.norm1.weight.dtype))
883
- hidden_states = self.attn(hidden_states, attention_mask=attention_mask, is_padded_inputs=is_padded_inputs, cu_seqlens=cu_seqlens, max_seq_len=max_seq_len)
884
 
885
  dropped = self.dropout2(hidden_states)
886
  residual = (dropped + residual) if residual is not None else dropped
@@ -890,36 +895,29 @@ class NomicBertBlock(nn.Module):
890
  return hidden_states, None, residual
891
  else:
892
  assert residual is None
893
- attn_outputs = self.attn(hidden_states,
894
- attention_mask=attention_mask,
895
- is_padded_inputs=is_padded_inputs,
896
- cu_seqlens=cu_seqlens,
897
- max_seq_len=max_seq_len)
898
- hidden_states = self.norm1(
899
- (self.dropout1(attn_outputs) + hidden_states).to(
900
- dtype=self.norm1.weight.dtype
901
- )
902
  )
 
903
  mlp_out = self.mlp(hidden_states)
904
 
905
- hidden_states = self.norm2(
906
- (self.dropout2(mlp_out) + hidden_states).to(
907
- dtype=self.norm2.weight.dtype
908
- )
909
- )
910
  return hidden_states, None, None
911
 
912
 
913
  class NomicBertEncoder(nn.Module):
914
  def __init__(self, config: GPT2Config):
915
  super().__init__()
916
- self.layers = nn.ModuleList(
917
- [NomicBertBlock(config) for _ in range(config.n_layer)]
918
- )
919
  self.gradient_checkpointing = False
920
  self.config = config
921
 
922
- def forward(self,
 
923
  hidden_states: torch.LongTensor = None,
924
  attention_mask: Optional[torch.Tensor] = None,
925
  position_ids: Optional[torch.LongTensor] = None,
@@ -929,8 +927,8 @@ class NomicBertEncoder(nn.Module):
929
  output_attentions: Optional[bool] = None,
930
  output_hidden_states: Optional[bool] = None,
931
  return_dict: Optional[bool] = None,
932
- is_padded_inputs: Optional[bool] = True,):
933
-
934
  """If subset_mask is not None, we only want output for the subset of the sequence.
935
  This means that we only compute the last layer output for these tokens.
936
  subset_mask: (batch, seqlen), dtype=torch.bool
@@ -938,7 +936,6 @@ class NomicBertEncoder(nn.Module):
938
  hidden_states2 = None
939
  residual = None
940
 
941
-
942
  for _, layer in enumerate(self.layers):
943
  if self.gradient_checkpointing and self.training:
944
 
@@ -998,11 +995,7 @@ class NomicBertPredictionHeadTransform(nn.Module):
998
  def __init__(self, config):
999
  super().__init__()
1000
  self.dense = nn.Linear(config.n_embd, config.n_embd, bias=config.mlp_fc1_bias)
1001
- approximate = (
1002
- "tanh"
1003
- if config.activation_function in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"]
1004
- else "none"
1005
- )
1006
  if config.activation_function == "swiglu":
1007
  self.transform_act_fn = F.silu
1008
  else:
@@ -1047,15 +1040,19 @@ class NomicBertModel(NomicBertPreTrainedModel):
1047
  super().__init__(config)
1048
  self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
1049
  if config.vocab_size % self.pad_vocab_size_multiple != 0:
1050
- config.vocab_size += self.pad_vocab_size_multiple - (
1051
- config.vocab_size % self.pad_vocab_size_multiple
1052
- )
1053
-
1054
- assert config.activation_function in ["gelu", "gelu_new", "gelu_fast", "gelu_pytorch_tanh", "swiglu", "geglu", "glu"]
1055
-
1056
- self.embeddings = NomicBertEmbeddings(
1057
- config
1058
- )
1059
  self.emb_drop = nn.Dropout(config.resid_pdrop)
1060
  self.emb_ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
1061
  self.encoder = NomicBertEncoder(config)
@@ -1069,22 +1066,23 @@ class NomicBertModel(NomicBertPreTrainedModel):
1069
  position_ids=None,
1070
  token_type_ids=None,
1071
  attention_mask=None,
1072
  ):
1073
  if token_type_ids is None:
1074
  token_type_ids = torch.zeros_like(input_ids)
1075
- hidden_states = self.embeddings(
1076
- input_ids, position_ids=position_ids, token_type_ids=token_type_ids
1077
- )
1078
  hidden_states = self.emb_ln(hidden_states)
1079
  hidden_states = self.emb_drop(hidden_states)
1080
 
1081
  attention_mask = self.get_extended_attention_mask(attention_mask, input_ids.shape)
1082
- sequence_output = self.encoder(
1083
- hidden_states, attention_mask=attention_mask
1084
- )
1085
 
1086
  pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1087
 
1088
  return BaseModelOutputWithPoolingAndCrossAttentions(
1089
  last_hidden_state=sequence_output,
1090
  pooler_output=pooled_output,
@@ -1151,10 +1149,10 @@ class NomicBertForPreTraining(NomicBertPreTrainedModel):
1151
  loss=total_loss,
1152
  logits=prediction_scores,
1153
  hidden_states=outputs.hidden_states,
1154
- attentions=None,
1155
  )
1156
 
1157
-
1158
  class NomicBertForSequenceClassification(NomicBertPreTrainedModel):
1159
  def __init__(self, config):
1160
  super().__init__(config)
@@ -1162,9 +1160,7 @@ class NomicBertForSequenceClassification(NomicBertPreTrainedModel):
1162
  self.config = config
1163
 
1164
  self.bert = NomicBertModel(config)
1165
- classifier_dropout = (
1166
- getattr(config, "classifier_dropout", config.embd_pdrop)
1167
- )
1168
  self.dropout = nn.Dropout(classifier_dropout)
1169
  self.classifier = nn.Linear(config.n_embd, config.num_labels)
1170
 
 
3
  # https://github.com/mlcommons/training_results_v2.0/blob/main/HazyResearch/benchmarks/bert/implementations/pytorch/modeling.py
4
  # https://github.com/mlcommons/training_results_v2.1/blob/main/Azure-HazyResearch/benchmarks/bert/implementations/ND96amsr_A100_v4/modeling.py
5
 
6
+ import logging
7
+
8
  # Inspired by https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py
9
  import os
10
+ import re
11
+ from collections import OrderedDict
12
  from functools import partial
13
+ from typing import List, Optional, Tuple, Union
14
 
15
  import torch
16
  import torch.nn as nn
17
  import torch.nn.functional as F
18
  from einops import rearrange, repeat
19
+ from safetensors.torch import load_file as safe_load_file
20
  from transformers import GPT2Config, PreTrainedModel
21
  from transformers.models.bert.modeling_bert import (
22
  BaseModelOutputWithPoolingAndCrossAttentions,
23
  MaskedLMOutput,
24
+ SequenceClassifierOutput,
25
  )
26
+ from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME
27
  from transformers.utils.hub import cached_file, get_checkpoint_shard_files
28
 
 
29
  from .configuration_hf_nomic_bert import NomicBertConfig
30
 
31
  logger = logging.getLogger(__name__)
32
 
33
+
34
  # adapted from flash attention, added safe serialization option for hf models
35
  def state_dict_from_pretrained(model_name, safe_serialization=False, device=None, dtype=None):
36
  # If not fp32, then we don't want to load directly to the GPU
 
45
  safe_weights_index_path = os.path.join(model_name, SAFE_WEIGHTS_INDEX_NAME)
46
 
47
  if os.path.isfile(weights_path):
48
+ resolved_archive_file = cached_file(model_name, WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False)
49
  elif os.path.isfile(weights_index_path):
50
+ resolved_archive_file = cached_file(model_name, WEIGHTS_INDEX_NAME, _raise_exceptions_for_missing_entries=False)
51
  is_sharded = True
52
  elif os.path.isfile(safe_weights_path):
53
+ resolved_archive_file = cached_file(model_name, SAFE_WEIGHTS_NAME, _raise_exceptions_for_missing_entries=False)
54
  load_safe = True
55
  elif os.path.isfile(safe_weights_index_path):
56
  resolved_archive_file = cached_file(
 
63
  resolved_archive_file = cached_file(model_name, weight_name, _raise_exceptions_for_missing_entries=False)
64
  if resolved_archive_file is None:
65
  weight_index = WEIGHTS_INDEX_NAME if not safe_serialization else SAFE_WEIGHTS_INDEX_NAME
66
+ resolved_archive_file = cached_file(model_name, weight_index, _raise_exceptions_for_missing_entries=False)
 
67
  if resolved_archive_file is not None:
68
  is_sharded = True
69
 
 
80
  if is_sharded:
81
  # resolved_archive_file becomes a list of files that point to the different
82
  # checkpoint shards in this case.
83
+ resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(model_name, resolved_archive_file)
84
  state_dict = {}
85
  for sharded_file in resolved_archive_file:
86
  state_dict.update(loader(sharded_file))
 
92
  state_dict = {k: v.to(device=device) for k, v in state_dict.items()}
93
  return state_dict
94
 
95
+
96
  def filter_shapes(state_dict, model):
97
  """
98
  Filters the state dict to match the current model shape.
 
104
  filtered_state_dict[key] = value
105
  return filtered_state_dict
106
 
107
+
108
+ def remap_bert_state_dict(
109
+ state_dict,
110
+ config,
111
+ remove_bert=False,
112
+ remove_cls_weights=False,
113
+ add_pooling_layer=False,
114
+ ):
115
  """
116
  Map the state_dict of a Huggingface BERT model to be flash_attn compatible.
117
  """
118
+
119
  def add_bert_prefix(key):
120
  # prepend bert. to the key
121
  if key.startswith("bert.") or key.startswith("cls."):
 
123
  return f"bert.{key}"
124
 
125
  state_dict = OrderedDict((add_bert_prefix(k), v) for k, v in state_dict.items())
126
+
127
  # LayerNorm
128
  def key_mapping_ln_gamma_beta(key):
129
  key = re.sub(r"LayerNorm.gamma$", "LayerNorm.weight", key)
 
188
  bk = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.key.bias")
189
  bv = state_dict.pop(f"bert.encoder.layers.{d}.attention.self.value.bias")
190
  if not (last_layer_subset and d == config.num_hidden_layers - 1):
191
+ state_dict[f"bert.encoder.layers.{d}.attn.Wqkv.weight"] = torch.cat([Wq, Wk, Wv], dim=0)
192
  state_dict[f"bert.encoder.layers.{d}.attn.Wqkv.bias"] = torch.cat([bq, bk, bv], dim=0)
193
  else:
194
  state_dict[f"bert.encoder.layers.{d}.attn.Wq.weight"] = Wq
 
208
  def key_mapping_decoder_bias(key):
209
  return re.sub(r"^cls.predictions.bias", "cls.predictions.decoder.bias", key)
210
 
 
211
  # remove nsp weights, we don't use
212
  state_dict.pop("cls.seq_relationship.weight", None)
213
  state_dict.pop("cls.seq_relationship.bias", None)
 
216
  state_dict = OrderedDict((key_mapping_decoder_bias(k), v) for k, v in state_dict.items())
217
 
218
  if remove_cls_weights:
219
+ cls_weights = [
220
+ "cls.predictions.decoder.bias",
221
+ "cls.predictions.transform.dense.weight",
222
+ "cls.predictions.transform.dense.bias",
223
+ "cls.predictions.transform.layer_norm.weight",
224
+ "cls.predictions.transform.layer_norm.bias",
225
+ "cls.predictions.decoder.weight",
226
+ ]
227
  for weight in cls_weights:
228
  state_dict.pop(weight, None)
229
 
 
249
  )
250
 
251
  if add_pooling_layer is False:
252
+ pooler_weights = [
253
+ "bert.pooler.dense.weight",
254
+ "bert.pooler.dense.bias",
255
+ ]
256
  for key in pooler_weights:
257
  state_dict.pop(key, None)
258
 
259
  if remove_bert:
260
+
261
  def remove_bert_prefix(key):
262
  key = re.sub(r"^bert.", "", key)
263
  return key
264
 
265
  state_dict = OrderedDict((remove_bert_prefix(k), v) for k, v in state_dict.items())
266
 
 
267
  return state_dict
268
 
269
 
 
271
  """An abstract class to handle weights initialization and
272
  a simple interface for dowloading and loading pretrained models.
273
  """
274
+
275
  config_class = NomicBertConfig
276
  base_model_prefix = "model"
277
  supports_gradient_checkpointing = True
 
311
  if config is None:
312
  config = cls.config_class.from_pretrained(model_name)
313
  remove_cls = cls != NomicBertForPreTraining
314
+ remove_bert_prefix = cls != NomicBertForPreTraining and cls != NomicBertForSequenceClassification
315
  ignore_mismatched_shapes = kwargs.pop("ignore_mismatched_sizes", False)
316
  num_labels = kwargs.pop("num_labels", None)
317
  rotary_scaling_factor = kwargs.pop("rotary_scaling_factor", None)
318
  if rotary_scaling_factor:
319
  config.rotary_scaling_factor = rotary_scaling_factor
320
+
 
321
  if config.n_positions <= 0 and config.rotary_emb_fraction > 0:
322
  config.n_positions = 2048
323
  if num_labels:
 
334
  # Assuming we know what we're doing when loading from disk
335
  # Prob a bad assumption but i'm tired and want to train this asap
336
  if os.path.exists(model_name):
337
+ model_path = f"{model_name}/pytorch_model.bin"
338
+ if os.path.exists(model_path):
339
+ state_dict = torch.load(f"{model_name}/pytorch_model.bin")
340
+ else:
341
+ model_path = f"{model_name}/model.safetensors"
342
+ if not os.path.exists(model_path):
343
+ raise ValueError(f"Model path {model_path} not found")
344
+ state_dict = safe_load_file(model_path)
345
+
346
  if ignore_mismatched_shapes:
347
  state_dict = filter_shapes(state_dict, model)
348
  load_return = model.load_state_dict(state_dict, strict=False)
349
  else:
350
  # TODO: can probably check config class and see if we need to remap from a bert model
351
+ state_dict = state_dict_from_pretrained(
352
+ model_name, safe_serialization=kwargs.get("safe_serialization", False)
353
+ )
354
+ state_dict = remap_bert_state_dict(
355
+ state_dict,
356
+ config,
357
+ remove_bert=remove_bert_prefix,
358
+ remove_cls_weights=remove_cls,
359
+ add_pooling_layer=getattr(config, "add_pooling_layer", False),
360
+ )
361
  if ignore_mismatched_shapes:
362
  state_dict = filter_shapes(state_dict, model)
363
 
364
+ load_return = model.load_state_dict(state_dict, strict=True)
365
  logger.warning(load_return)
366
  return model
367
 
 
381
  if module.padding_idx is not None:
382
  nn.init.zeros_(module.weight[module.padding_idx])
383
 
384
+
385
  class NomicBertEmbeddings(nn.Module):
386
+ def __init__(self, config):
387
  """
388
  If max_position_embeddings <= 0, there's no position embeddings
389
  If type_vocab_size <= 0, there's no token type embeddings
390
  """
391
  super().__init__()
392
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
393
  self.max_position_embeddings = config.max_position_embeddings if config.rotary_emb_fraction <= 0 else 0
394
  self.type_vocab_size = config.type_vocab_size
395
  if self.max_position_embeddings > 0 and config.rotary_emb_fraction <= 0:
396
  self.position_embeddings = nn.Embedding(
397
+ config.max_position_embeddings,
398
+ config.hidden_size,
399
  )
400
  if self.type_vocab_size > 0:
401
  self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
 
422
  embeddings = embeddings + position_embeddings
423
  return embeddings
424
 
425
+
426
  class NomicBertMLP(nn.Module):
427
  def __init__(
428
  self,
 
440
  hidden_features = hidden_features if hidden_features is not None else in_features * 4
441
  self.return_residual = return_residual
442
  self.fc1 = nn.Linear(in_features, hidden_features, bias=bias1)
443
+ approximate = "tanh" if activation in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"] else "none"
444
  self.activation = nn.GELU(approximate=approximate) if activation == "gelu" else activation
445
  self.fc2 = nn.Linear(hidden_features, out_features, bias=bias2)
446
 
 
450
  y = self.fc2(y)
451
  return y if not self.return_residual else (y, x)
452
 
453
+
454
  class NomciBertGatedMLP(nn.Module):
455
  def __init__(
456
  self,
 
468
  ):
469
  super().__init__()
470
  out_features = out_features if out_features is not None else in_features
471
+ hidden_features = hidden_features if hidden_features is not None else int(8 * in_features / 3)
472
  hidden_features = (hidden_features + multiple_of - 1) // multiple_of * multiple_of
473
  self.return_residual = return_residual
474
 
 
505
  ro_dim = cos.shape[-1] * 2
506
  assert ro_dim <= x.shape[-1]
507
  cos, sin = (
508
+ cos[offset : offset + x.shape[1]],
509
+ sin[offset : offset + x.shape[1]],
510
  )
511
  cos = repeat(cos, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
512
  sin = repeat(sin, "... d -> ... 1 (2 d)" if not interleaved else "... d -> ... 1 (d 2)")
 
563
  self._sin_k_cached = None
564
 
565
  def _compute_inv_freq(self, device=None):
566
+ return 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
567
 
568
  def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
569
  # Reset the tables if the sequence length has changed,
 
635
  self.rotary_scaling_factor = rotary_scaling_factor
636
  self.max_position_embeddings = max_position_embeddings
637
 
 
638
  def _compute_inv_freq(self, base=None, device=None):
639
  if base is None:
640
  base = self.base
641
+ return 1.0 / (base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
642
 
643
  def _update_cos_sin_cache(self, seqlen, device=None, dtype=None):
644
  # Reset the tables if the sequence length has changed,
 
689
  self._sin_cached = torch.sin(freqs).to(dtype)
690
  else:
691
  power = (
692
+ torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
 
693
  ) / self.scale_base
694
  scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
695
  # We want the multiplication by scale to happen in fp32
 
698
  self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
699
  self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
700
 
701
+
702
  class NomicBertAttention(nn.Module):
703
  """Multi-head self-attention and cross-attention"""
704
 
 
739
  scale_base=config.rotary_emb_scale_base,
740
  interleaved=config.rotary_emb_interleaved,
741
  rotary_scaling_factor=config.rotary_scaling_factor,
742
+ max_position_embeddings=config.max_trained_positions,
743
+ )
744
  else:
745
  self.rotary_emb = NomicBertRotaryEmbedding(
746
  dim=self.rotary_emb_dim,
 
811
  attn_output = self.out_proj(attn_output)
812
 
813
  return attn_output
814
+
815
 
816
  class NomicBertBlock(nn.Module):
817
  def __init__(
 
821
  super().__init__()
822
  self.prenorm = config.prenorm
823
  self.fused_dropout_add_ln = config.fused_dropout_add_ln
824
+
825
+ self.attn = NomicBertAttention(config)
826
  activation = (
827
+ F.sigmoid
828
+ if config.activation_function == "glu"
829
+ else (F.silu if config.activation_function == "swiglu" else F.gelu)
830
  )
831
  if config.activation_function in ["glu", "swiglu", "geglu"]:
832
+ self.mlp = NomciBertGatedMLP(
833
+ config.n_embd,
834
+ hidden_features=config.n_inner,
835
+ bias1=config.mlp_fc1_bias,
836
+ bias2=config.mlp_fc2_bias,
837
+ activation=activation,
838
+ fused_bias_fc=config.fused_bias_fc,
839
+ )
840
  else:
841
+ self.mlp = NomicBertMLP(
842
+ config.n_embd,
843
+ hidden_features=config.n_inner,
844
+ bias1=config.mlp_fc1_bias,
845
+ bias2=config.mlp_fc2_bias,
846
+ activation=activation,
847
+ fused_bias_fc=config.fused_bias_fc,
848
+ )
849
 
850
  self.dropout1 = nn.Dropout(config.resid_pdrop)
851
  self.norm1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
 
879
  dropped = self.dropout1(hidden_states)
880
  residual = (dropped + residual) if residual is not None else dropped
881
  hidden_states = self.norm1(residual.to(dtype=self.norm1.weight.dtype))
882
+ hidden_states = self.attn(
883
+ hidden_states,
884
+ attention_mask=attention_mask,
885
+ is_padded_inputs=is_padded_inputs,
886
+ cu_seqlens=cu_seqlens,
887
+ max_seq_len=max_seq_len,
888
+ )
889
 
890
  dropped = self.dropout2(hidden_states)
891
  residual = (dropped + residual) if residual is not None else dropped
 
895
  return hidden_states, None, residual
896
  else:
897
  assert residual is None
898
+ attn_outputs = self.attn(
899
+ hidden_states,
900
+ attention_mask=attention_mask,
901
+ is_padded_inputs=is_padded_inputs,
902
+ cu_seqlens=cu_seqlens,
903
+ max_seq_len=max_seq_len,
904
  )
905
+ hidden_states = self.norm1((self.dropout1(attn_outputs) + hidden_states).to(dtype=self.norm1.weight.dtype))
906
  mlp_out = self.mlp(hidden_states)
907
 
908
+ hidden_states = self.norm2((self.dropout2(mlp_out) + hidden_states).to(dtype=self.norm2.weight.dtype))
909
  return hidden_states, None, None
910
 
911
 
912
  class NomicBertEncoder(nn.Module):
913
  def __init__(self, config: GPT2Config):
914
  super().__init__()
915
+ self.layers = nn.ModuleList([NomicBertBlock(config) for _ in range(config.n_layer)])
916
  self.gradient_checkpointing = False
917
  self.config = config
918
 
919
+ def forward(
920
+ self,
921
  hidden_states: torch.LongTensor = None,
922
  attention_mask: Optional[torch.Tensor] = None,
923
  position_ids: Optional[torch.LongTensor] = None,
 
927
  output_attentions: Optional[bool] = None,
928
  output_hidden_states: Optional[bool] = None,
929
  return_dict: Optional[bool] = None,
930
+ is_padded_inputs: Optional[bool] = True,
931
+ ):
932
  """If subset_mask is not None, we only want output for the subset of the sequence.
933
  This means that we only compute the last layer output for these tokens.
934
  subset_mask: (batch, seqlen), dtype=torch.bool
 
936
  hidden_states2 = None
937
  residual = None
938
 
 
939
  for _, layer in enumerate(self.layers):
940
  if self.gradient_checkpointing and self.training:
941
 
 
995
  def __init__(self, config):
996
  super().__init__()
997
  self.dense = nn.Linear(config.n_embd, config.n_embd, bias=config.mlp_fc1_bias)
998
+ approximate = "tanh" if config.activation_function in ["gelu_new", "gelu_fast", "gelu_pytorch_tanh"] else "none"
999
  if config.activation_function == "swiglu":
1000
  self.transform_act_fn = F.silu
1001
  else:
 
1040
  super().__init__(config)
1041
  self.pad_vocab_size_multiple = getattr(config, "pad_vocab_size_multiple", 1)
1042
  if config.vocab_size % self.pad_vocab_size_multiple != 0:
1043
+ config.vocab_size += self.pad_vocab_size_multiple - (config.vocab_size % self.pad_vocab_size_multiple)
1044
+
1045
+ assert config.activation_function in [
1046
+ "gelu",
1047
+ "gelu_new",
1048
+ "gelu_fast",
1049
+ "gelu_pytorch_tanh",
1050
+ "swiglu",
1051
+ "geglu",
1052
+ "glu",
1053
+ ]
1054
+
1055
+ self.embeddings = NomicBertEmbeddings(config)
1056
  self.emb_drop = nn.Dropout(config.resid_pdrop)
1057
  self.emb_ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
1058
  self.encoder = NomicBertEncoder(config)
 
1066
  position_ids=None,
1067
  token_type_ids=None,
1068
  attention_mask=None,
1069
+ return_dict=None,
1070
+ matryoshka_dim=None,
1071
  ):
1072
  if token_type_ids is None:
1073
  token_type_ids = torch.zeros_like(input_ids)
1074
+ hidden_states = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
1075
  hidden_states = self.emb_ln(hidden_states)
1076
  hidden_states = self.emb_drop(hidden_states)
1077
 
1078
  attention_mask = self.get_extended_attention_mask(attention_mask, input_ids.shape)
1079
+ sequence_output = self.encoder(hidden_states, attention_mask=attention_mask, return_dict=return_dict)
1080
 
1081
  pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
1082
 
1083
+ if matryoshka_dim:
1084
+ sequence_output = sequence_output[:, :matryoshka_dim]
1085
+
1086
  return BaseModelOutputWithPoolingAndCrossAttentions(
1087
  last_hidden_state=sequence_output,
1088
  pooler_output=pooled_output,
 
1149
  loss=total_loss,
1150
  logits=prediction_scores,
1151
  hidden_states=outputs.hidden_states,
1152
+ attentions=None,
1153
  )
1154
 
1155
+
1156
  class NomicBertForSequenceClassification(NomicBertPreTrainedModel):
1157
  def __init__(self, config):
1158
  super().__init__(config)
 
1160
  self.config = config
1161
 
1162
  self.bert = NomicBertModel(config)
1163
+ classifier_dropout = getattr(config, "classifier_dropout", config.embd_pdrop)
1164
  self.dropout = nn.Dropout(classifier_dropout)
1165
  self.classifier = nn.Linear(config.n_embd, config.num_labels)
1166