"""
Flash attention monkey patch for cerebras btlm model
"""

import importlib
import logging
from typing import Optional, Tuple

import torch
from accelerate import init_empty_weights
from flash_attn.flash_attn_interface import flash_attn_func
from transformers import AutoConfig, AutoModelForCausalLM

LOG = logging.getLogger("axolotl")


def replace_btlm_attn_with_flash_attn(model_name="cerebras/btlm-3b-8k-base"):
    """Monkey patch BTLMAttention._attn to use flash attention."""
    # this is a wonky hack to get the remotely loaded modeling module
    model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    # instantiating the model (with empty weights) makes modeling_btlm importable
    with init_empty_weights():
        AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    module_name = model_config.__class__.__module__.replace(
        ".configuration_btlm", ".modeling_btlm"
    )
    modeling_btlm = importlib.import_module(module_name)
    modeling_btlm.BTLMAttention._attn = (  # pylint: disable=protected-access
        flashattn_attn
    )

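# Example usage (a minimal sketch; the dtype and loading kwargs below are assumptions,
# not taken from this module -- flash attention generally requires fp16/bf16 inputs):
#
#     from transformers import AutoModelForCausalLM
#
#     replace_btlm_attn_with_flash_attn("cerebras/btlm-3b-8k-base")
#     model = AutoModelForCausalLM.from_pretrained(
#         "cerebras/btlm-3b-8k-base",
#         torch_dtype=torch.bfloat16,
#         trust_remote_code=True,
#     )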

def flashattn_attn(
    self,
    query: torch.Tensor,
    key: Optional[torch.Tensor] = None,
    value: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
    head_mask: Optional[torch.Tensor] = None,
    position_bias: Optional[torch.Tensor] = None,  # pylint: disable=unused-argument
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    """Flash attention drop-in replacement for BTLMAttention._attn."""
    # BTLM scales attention by 1 / head_dim ** attn_scale_power when scale_attn_weights is set
    softmax_scale = (
        1 / (key.size(-1) ** self.attn_scale_power) if self.scale_attn_weights else None
    )

    # flash_attn_func expects (batch, seq_len, num_heads, head_dim);
    # BTLM passes (batch, num_heads, seq_len, head_dim), so swap the two middle dims
    query = query.permute(0, 2, 1, 3)
    key = key.permute(0, 2, 1, 3)
    value = value.permute(0, 2, 1, 3)

    # Perform flash attention
    attn_output = flash_attn_func(
        query,
        key,
        value,
        dropout_p=0.0,  # attention dropout is not applied in this patch
        softmax_scale=softmax_scale,  # preserve BTLM's attention scaling
        causal=not self.is_cross_attention,  # causal mask for self-attention
        return_attn_probs=False,  # attention probabilities are not needed
    )

    # apply the head mask, if provided, to the flash attention output
    if head_mask is not None:
        attn_output *= head_mask

    # restore the (batch, num_heads, seq_len, head_dim) layout expected by the caller
    attn_output = attn_output.permute(0, 2, 1, 3)

    return attn_output, None  # flash attention does not materialize attention weights
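
# Shape reference (a sketch with assumed sizes, not part of the patch):
#
#     batch, n_heads, seq_len, head_dim = 2, 8, 16, 64
#     q_btlm = torch.randn(batch, n_heads, seq_len, head_dim)  # layout BTLM passes in
#     q_flash = q_btlm.permute(0, 2, 1, 3)                      # layout flash_attn_func expects
#     assert q_flash.shape == (batch, seq_len, n_heads, head_dim)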