Spaces:

AI4PD
/

hexviz

Sleeping

App Files Files Community

aksell committed on Mar 22, 2023

Commit

466a8f2

•

1 Parent(s): b7ab123

Implement get_attention for tape BERT

Browse files

Files changed (4) hide show

poetry.lock +118 -1
protention/attention.py +30 -11
pyproject.toml +1 -0
tests/test_attention.py +10 -1

poetry.lock CHANGED Viewed

@@ -171,6 +171,38 @@ category = "main"
 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
 [[package]]
 name = "cachetools"
 version = "5.3.0"
@@ -572,6 +604,14 @@ MarkupSafe = ">=2.0"
 [package.extras]
 i18n = ["Babel (>=2.7)"]
 [[package]]
 name = "jsonpointer"
 version = "2.3"
@@ -749,6 +789,14 @@ category = "main"
 optional = false
 python-versions = "*"
 [[package]]
 name = "markdown-it-py"
 version = "2.2.0"
@@ -1474,6 +1522,36 @@ pygments = ">=2.13.0,<3.0.0"
 [package.extras]
 jupyter = ["ipywidgets (>=7.5.1,<9)"]
 [[package]]
 name = "semver"
 version = "2.13.0"
@@ -1613,6 +1691,37 @@ python-versions = ">=3.8"
 [package.dependencies]
 mpmath = ">=0.19"
 [[package]]
 name = "terminado"
 version = "0.17.1"
@@ -1983,7 +2092,7 @@ testing = ["pytest (>=6)", "pytest-checkdocs (>=2.4)", "flake8 (<5)", "pytest-co
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.10"
-content-hash = "c748285bd150fadef69123d60f0b4ad96d99715916c7e1ab30214132749f8aed"
 [metadata.files]
 altair = []
@@ -2027,6 +2136,8 @@ beautifulsoup4 = []
 biopython = []
 bleach = []
 blinker = []
 cachetools = []
 certifi = []
 cffi = []
@@ -2071,6 +2182,7 @@ ipywidgets = []
 isoduration = []
 jedi = []
 jinja2 = []
 jsonpointer = []
 jsonschema = []
 jupyter-client = []
@@ -2082,6 +2194,7 @@ jupyter-server-terminals = []
 jupyterlab-pygments = []
 jupyterlab-widgets = []
 lit = []
 markdown-it-py = []
 markupsafe = []
 matplotlib-inline = []
@@ -2205,6 +2318,8 @@ requests = []
 rfc3339-validator = []
 rfc3986-validator = []
 rich = []
 semver = []
 send2trash = [
     {file = "Send2Trash-1.8.0-py3-none-any.whl", hash = "sha256:f20eaadfdb517eaca5ce077640cb261c7d2698385a6a0f072a4a5447fd49fa08"},
@@ -2225,6 +2340,8 @@ stack-data = []
 stmol = []
 streamlit = []
 sympy = []
 terminado = []
 tinycss2 = []
 tokenizers = []

 optional = false
 python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*"
+[[package]]
+name = "boto3"
+version = "1.26.95"
+description = "The AWS SDK for Python"
+category = "main"
+optional = false
+python-versions = ">= 3.7"
+[package.dependencies]
+botocore = ">=1.29.95,<1.30.0"
+jmespath = ">=0.7.1,<2.0.0"
+s3transfer = ">=0.6.0,<0.7.0"
+[package.extras]
+crt = ["botocore[crt] (>=1.21.0,<2.0a0)"]
+[[package]]
+name = "botocore"
+version = "1.29.95"
+description = "Low-level, data-driven core of boto 3."
+category = "main"
+optional = false
+python-versions = ">= 3.7"
+[package.dependencies]
+jmespath = ">=0.7.1,<2.0.0"
+python-dateutil = ">=2.1,<3.0.0"
+urllib3 = ">=1.25.4,<1.27"
+[package.extras]
+crt = ["awscrt (==0.16.9)"]
 [[package]]
 name = "cachetools"
 version = "5.3.0"
 [package.extras]
 i18n = ["Babel (>=2.7)"]
+[[package]]
+name = "jmespath"
+version = "1.0.1"
+description = "JSON Matching Expressions"
+category = "main"
+optional = false
+python-versions = ">=3.7"
 [[package]]
 name = "jsonpointer"
 version = "2.3"
 optional = false
 python-versions = "*"
+[[package]]
+name = "lmdb"
+version = "1.4.0"
+description = "Universal Python binding for the LMDB 'Lightning' Database"
+category = "main"
+optional = false
+python-versions = "*"
 [[package]]
 name = "markdown-it-py"
 version = "2.2.0"
 [package.extras]
 jupyter = ["ipywidgets (>=7.5.1,<9)"]
+[[package]]
+name = "s3transfer"
+version = "0.6.0"
+description = "An Amazon S3 Transfer Manager"
+category = "main"
+optional = false
+python-versions = ">= 3.7"
+[package.dependencies]
+botocore = ">=1.12.36,<2.0a.0"
+[package.extras]
+crt = ["botocore[crt] (>=1.20.29,<2.0a.0)"]
+[[package]]
+name = "scipy"
+version = "1.9.3"
+description = "Fundamental algorithms for scientific computing in Python"
+category = "main"
+optional = false
+python-versions = ">=3.8"
+[package.dependencies]
+numpy = ">=1.18.5,<1.26.0"
+[package.extras]
+test = ["pytest", "pytest-cov", "pytest-xdist", "asv", "mpmath", "gmpy2", "threadpoolctl", "scikit-umfpack"]
+doc = ["sphinx (!=4.1.0)", "pydata-sphinx-theme (==0.9.0)", "sphinx-panels (>=0.5.2)", "matplotlib (>2)", "numpydoc", "sphinx-tabs"]
+dev = ["mypy", "typing-extensions", "pycodestyle", "flake8"]
 [[package]]
 name = "semver"
 version = "2.13.0"
 [package.dependencies]
 mpmath = ">=0.19"
+[[package]]
+name = "tape-proteins"
+version = "0.5"
+description = "Repostory of Protein Benchmarking and Modeling"
+category = "main"
+optional = false
+python-versions = "*"
+[package.dependencies]
+biopython = "*"
+boto3 = "*"
+filelock = "*"
+lmdb = "*"
+requests = "*"
+scipy = "*"
+tensorboardX = "*"
+tqdm = "*"
+[[package]]
+name = "tensorboardx"
+version = "2.6"
+description = "TensorBoardX lets you watch Tensors Flow without Tensorflow"
+category = "main"
+optional = false
+python-versions = "*"
+[package.dependencies]
+numpy = "*"
+packaging = "*"
+protobuf = ">=3.8.0,<4"
 [[package]]
 name = "terminado"
 version = "0.17.1"
 [metadata]
 lock-version = "1.1"
 python-versions = "^3.10"
+content-hash = "ad6054ae4a119d961e9941f135489d1b89310303aefc27d3132fbd1ed1c35a0f"
 [metadata.files]
 altair = []
 biopython = []
 bleach = []
 blinker = []
+boto3 = []
+botocore = []
 cachetools = []
 certifi = []
 cffi = []
 isoduration = []
 jedi = []
 jinja2 = []
+jmespath = []
 jsonpointer = []
 jsonschema = []
 jupyter-client = []
 jupyterlab-pygments = []
 jupyterlab-widgets = []
 lit = []
+lmdb = []
 markdown-it-py = []
 markupsafe = []
 matplotlib-inline = []
 rfc3339-validator = []
 rfc3986-validator = []
 rich = []
+s3transfer = []
+scipy = []
 semver = []
 send2trash = [
     {file = "Send2Trash-1.8.0-py3-none-any.whl", hash = "sha256:f20eaadfdb517eaca5ce077640cb261c7d2698385a6a0f072a4a5447fd49fa08"},
 stmol = []
 streamlit = []
 sympy = []
+tape-proteins = []
+tensorboardx = []
 terminado = []
 tinycss2 = []
 tokenizers = []

protention/attention.py CHANGED Viewed

@@ -1,11 +1,16 @@
 from io import StringIO
 from urllib import request
 import torch
 from Bio.PDB import PDBParser, Polypeptide, Structure
 from transformers import T5EncoderModel, T5Tokenizer
 def get_structure(pdb_code: str) -> Structure:
     """
     Get structure from PDB
@@ -46,9 +51,14 @@ def get_protT5() -> tuple[T5Tokenizer, T5EncoderModel]:
     return tokenizer, model
 def get_attention(
-    pdb_code: str, chain_ids: list[str], layer: int, head: int, min_attn: float = 0.2
 ):
     """
     Get attention from T5
@@ -57,13 +67,22 @@ def get_attention(
     structure = get_structure(pdb_code)
     # Get list of sequences
     sequences = get_sequences(structure)
-    # get model
-    tokenizer, model = get_protT5()
-    # call model
-    ## Get sequence
-    # get attention
-    # extract attention

+from enum import Enum
 from io import StringIO
 from urllib import request
 import torch
 from Bio.PDB import PDBParser, Polypeptide, Structure
+from tape import ProteinBertModel, TAPETokenizer
 from transformers import T5EncoderModel, T5Tokenizer
+class Model(str, Enum):
+    """Protein language models that get_attention can be asked to use."""
+    # The value is the same checkpoint name that get_tape_bert passes to
+    # ProteinBertModel.from_pretrained.
+    tape_bert = "bert-base"
 def get_structure(pdb_code: str) -> Structure:
     """
     Get structure from PDB
     return tokenizer, model
+def get_tape_bert() -> tuple[TAPETokenizer, ProteinBertModel]:
+    tokenizer = TAPETokenizer()
+    model = ProteinBertModel.from_pretrained('bert-base', output_attentions=True)
+    return tokenizer, model
 def get_attention(
+    pdb_code: str, model: Model = Model.tape_bert
 ):
     """
     Get attention from T5
     structure = get_structure(pdb_code)
     # Get list of sequences
     sequences = get_sequences(structure)
+    # TODO handle multiple sequences
+    sequence = sequences[0]
+    match model:
+        case model.tape_bert:
+            tokenizer, model = get_tape_bert()
+            token_idxs = tokenizer.encode(sequence).tolist()
+            inputs = torch.tensor(token_idxs).unsqueeze(0)
+            with torch.no_grad():
+                attns = model(inputs)[-1]
+                # Remove attention from <CLS> (first) and <SEP> (last) token
+            attns = [attn[:, :, 1:-1, 1:-1] for attn in attns]
+            attns = torch.stack([attn.squeeze(0) for attn in attns])
+        case model.prot_T5:
+            # Space separate sequences
+            sequences = [" ".join(sequence) for sequence in sequences]
+            tokenizer, model = get_protT5()
+    return attns

pyproject.toml CHANGED Viewed

@@ -12,6 +12,7 @@ biopython = "^1.81"
 transformers = "^4.27.1"
 torch = "^2.0.0"
 sentencepiece = "^0.1.97"
 [tool.poetry.dev-dependencies]
 pytest = "^7.2.2"

 transformers = "^4.27.1"
 torch = "^2.0.0"
 sentencepiece = "^0.1.97"
+tape-proteins = "^0.5"
 [tool.poetry.dev-dependencies]
 pytest = "^7.2.2"

tests/test_attention.py CHANGED Viewed

@@ -1,7 +1,9 @@
 from Bio.PDB.Structure import Structure
 from transformers import T5EncoderModel, T5Tokenizer
-from protention.attention import get_protT5, get_sequences, get_structure
 def test_get_structure():
@@ -33,3 +35,10 @@ def test_get_protT5():
     assert isinstance(tokenizer, T5Tokenizer)
     assert isinstance(model, T5EncoderModel)

+import torch
 from Bio.PDB.Structure import Structure
 from transformers import T5EncoderModel, T5Tokenizer
+from protention.attention import (Model, get_attention, get_protT5,
+                                  get_sequences, get_structure)
 def test_get_structure():
     assert isinstance(tokenizer, T5Tokenizer)
     assert isinstance(model, T5EncoderModel)
+def test_get_attention_tape():
+    result = get_attention("1AKE", model=Model.tape_bert)
+    assert result is not None
+    assert result.shape == torch.Size([12,12,456,456])