# demo_plinder_smina / inference_app.py
# Gradio demo: LigSite + DBSCAN pocket detection followed by Smina docking.
import time
import gradio as gr
from gradio_molecule3d import Molecule3D
import sys
import os
import os
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import DataStructs
from rdkit.Chem import RDConfig
from rdkit.Chem import rdBase
import pickle
from Bio.PDB import *
from Bio import PDB
import requests
import subprocess
import mdtraj as md
from enspara import geometry
from sklearn.cluster import DBSCAN
import pandas as pd
def run_smina(
    ligand_path,
    protein_path,
    out_path,
    pocket_center,
    pocket_size,
    num_poses=1,
    exhaustiveness=1,
    smina_binary="./smina.static",
):
    """
    Perform docking with Smina.

    Parameters
    ----------
    ligand_path: str or pathlib.Path
        Path to ligand PDBQT file that should be docked.
    protein_path: str or pathlib.Path
        Path to protein PDBQT file that should be docked to.
    out_path: str or pathlib.Path
        Path to which docking poses should be saved, SDF or PDB format.
    pocket_center: iterable of float or int
        Coordinates defining the center of the binding site (x, y, z).
    pocket_size: iterable of float or int
        Lengths of edges defining the binding site (x, y, z).
    num_poses: int
        Maximum number of poses to generate.
    exhaustiveness: int
        Accuracy of docking calculations.
    smina_binary: str or pathlib.Path
        Smina executable to invoke. Defaults to ``./smina.static``, which is
        resolved relative to the current working directory.

    Returns
    -------
    output_text: str
        The output of the Smina calculation.

    Raises
    ------
    ValueError
        If ``pocket_center`` or ``pocket_size`` has fewer than three values.
    subprocess.CalledProcessError
        If Smina exits with a non-zero status.
    """
    # Materialise the inputs so that any iterable (list, tuple, NumPy array,
    # generator) is accepted, as the parameter documentation promises.
    # Only the first three components are used.
    center = tuple(pocket_center)[:3]
    size = tuple(pocket_size)[:3]
    if len(center) < 3 or len(size) < 3:
        raise ValueError(
            "pocket_center and pocket_size must each provide x, y and z values"
        )

    command = [
        str(smina_binary),
        "--ligand",
        str(ligand_path),
        "--receptor",
        str(protein_path),
        "--out",
        str(out_path),
    ]
    for axis, value in zip("xyz", center):
        command += [f"--center_{axis}", str(value)]
    for axis, value in zip("xyz", size):
        command += [f"--size_{axis}", str(value)]
    command += [
        "--num_modes",
        str(num_poses),
        "--exhaustiveness",
        str(exhaustiveness),
    ]

    # universal_newlines=True makes check_output return text instead of bytes.
    return subprocess.check_output(command, universal_newlines=True)
# Directory in which all intermediate and output files are written.
# NOTE(review): these paths are shared by every request, so concurrent
# predictions can overwrite each other's files.
_WORK_DIR = "/usr/src/app"


def _run_obabel(arguments):
    """Run ``obabel`` with *arguments*, reporting failures as ``gr.Error``."""
    try:
        # An argument list (no shell) keeps file names with spaces or shell
        # metacharacters intact; check=True stops a failed conversion from
        # silently leaving an older output file in place.
        subprocess.run(["obabel", *arguments], check=True)
    except (OSError, subprocess.CalledProcessError) as err:
        raise gr.Error("Open Babel conversion failed") from err


def _write_ligand_sdf(smiles, sdf_path):
    """Build a 3D, MMFF-optimised conformer from *smiles* and save it as SDF."""
    smiles = smiles.strip() if smiles else ""
    mol = Chem.MolFromSmiles(smiles) if smiles else None
    if mol is None:
        raise gr.Error("could not parse ligand SMILES")

    mol_with_h = Chem.AddHs(mol)
    # EmbedMolecule returns a negative conformer id when embedding fails.
    if AllChem.EmbedMolecule(mol_with_h) < 0:
        raise gr.Error("could not generate 3D coordinates for the ligand")
    AllChem.MMFFOptimizeMolecule(mol_with_h)

    # Close the writer explicitly so the file is complete before it is read
    # by the next tool.
    writer = Chem.SDWriter(sdf_path)
    try:
        writer.write(mol_with_h)
    finally:
        writer.close()


def _locate_pocket_center(protein_path, pocket_pdb_path):
    """Return the centre of the densest LigSite pocket cluster of a structure."""
    structure = md.load(protein_path)
    # run ligsite
    pockets_xyz = geometry.pockets.get_pocket_cells(struct=structure)
    if len(pockets_xyz) == 0:
        raise gr.Error("no binding pocket found in the protein structure")

    labels = DBSCAN(eps=0.15, min_samples=5).fit_predict(pockets_xyz)

    # Cluster sizes, ignoring the noise label (-1).
    unique_labels, counts = np.unique(labels, return_counts=True)
    is_cluster = unique_labels != -1
    if not is_cluster.any():
        raise gr.Error("no binding pocket found in the protein structure")

    # The cluster with the most points is taken as the binding pocket.
    densest_label = unique_labels[is_cluster][np.argmax(counts[is_cluster])]
    cluster_points = pockets_xyz[labels == densest_label]

    # Write the cluster to a PDB file, one pseudo-atom per pocket cell.
    n_points = cluster_points.shape[0]
    top_df = pd.DataFrame(
        {
            "serial": list(range(n_points)),
            "name": "PK",
            "element": "H",
            "resSeq": list(range(n_points)),
            "resName": "PCK",
            "chainID": 0,
        }
    )
    pocket_top = md.Topology.from_dataframe(top_df, np.array([]))
    pocket_trj = md.Trajectory(xyz=cluster_points, topology=pocket_top)
    pocket_trj.save(pocket_pdb_path)

    # NOTE(review): the centre is computed from the re-parsed PDB rather than
    # from cluster_points directly; the round trip presumably also converts
    # mdtraj's length unit to the one used in PDB files -- confirm before
    # simplifying.
    parsed = PDBParser().get_structure("X", pocket_pdb_path)
    coords = [atom.coord for atom in parsed.get_atoms()]
    return np.mean(coords, axis=0)


def predict(input_sequence, input_ligand, input_protein, exhaustiveness):
    """
    Main prediction function that calls ligsite and smina

    Parameters
    ----------
    input_sequence: str
        monomer sequence (not used by this model)
    input_ligand: str
        ligand as SMILES string
    input_protein: gradio.File
        Gradio file object to monomer protein structure as PDB
    exhaustiveness: int
        SMINA parameter

    Returns
    -------
    output_structures: list
        [path to the input protein, path to the docked ligand SDF]
    run_time: float
        run time of the program

    Raises
    ------
    gr.Error
        If no protein is supplied, the SMILES cannot be turned into a 3D
        ligand, a file conversion fails, no pocket is found, or docking fails.
    """
    start_time = time.time()

    if input_protein is None:
        raise gr.Error("need pdb input")
    # Accept both objects exposing ``.name`` and plain path strings.
    protein_path = getattr(input_protein, "name", input_protein)

    ligand_sdf = f"{_WORK_DIR}/ligand.sdf"
    ligand_pdbqt = f"{_WORK_DIR}/ligand.pdbqt"
    receptor_pdbqt = f"{_WORK_DIR}/receptor.pdbqt"
    pocket_pdb = f"{_WORK_DIR}/pockets_dense.pdb"
    docking_pose = f"{_WORK_DIR}/docking_pose.sdf"

    _write_ligand_sdf(input_ligand, ligand_sdf)
    _run_obabel([protein_path, "-xr", "-O", receptor_pdbqt])
    _run_obabel(["-isdf", ligand_sdf, "-O", ligand_pdbqt])

    # Find pocket
    pocket_center = _locate_pocket_center(protein_path, pocket_pdb)

    # run smina
    try:
        run_smina(
            ligand_pdbqt,
            receptor_pdbqt,
            docking_pose,
            pocket_center,
            [10, 10, 10],
            # The UI slider may deliver a float; the docstring contract is int.
            exhaustiveness=int(exhaustiveness),
        )
    except (OSError, subprocess.CalledProcessError) as err:
        raise gr.Error("Smina docking failed") from err

    run_time = time.time() - start_time
    return [protein_path, docking_pose], run_time
# Gradio user interface: three inputs (sequence, ligand SMILES, protein file),
# one docking option, and a 3D viewer plus runtime box as outputs.
with gr.Blocks() as app:
    gr.Markdown("# LigSite + Smina")
    gr.Markdown("Example model using LigSite and DBScan to find a binding pocket in the protein and then SMINA to dock the ligand in the found pocket.")
    with gr.Row():
        input_sequence = gr.Textbox(lines=3, label="Input Protein sequence (FASTA)")
        input_ligand = gr.Textbox(lines=3, label="Input ligand SMILES")
        input_protein = gr.File(label="Input protein monomer")

    # define any options here
    # for automated inference the default options are used
    # step=1 keeps the value a whole number, as `predict` documents an int.
    exhaustiveness = gr.Slider(1, 10, value=1, step=1, label="Exhaustiveness")
    # checkbox_option = gr.Checkbox(label="Checkbox Option")
    # dropdown_option = gr.Dropdown(["Option 1", "Option 2", "Option 3"], label="Radio Option")

    btn = gr.Button("Run Inference")

    gr.Examples(
        [
            [
                "SVKSEYAEAAAVGQEAVAVFNTMKAAFQNGDKEAVAQYLARLASLYTRHEELLNRILEKARREGNKEAVTLMNEFTATFQTGKSIFNAMVAAFKNGDDDSFESYLQALEKVTAKGETLADQIAKAL:SVKSEYAEAAAVGQEAVAVFNTMKAAFQNGDKEAVAQYLARLASLYTRHEELLNRILEKARREGNKEAVTLMNEFTATFQTGKSIFNAMVAAFKNGDDDSFESYLQALEKVTAKGETLADQIAKAL",
                "COc1ccc(cc1)n2c3c(c(n2)C(=O)N)CCN(C3=O)c4ccc(cc4)N5CCCCC5=O",
                "input_test.pdb",
            ],
        ],
        [input_sequence, input_ligand, input_protein],
    )

    # Viewer representations: the first three entries style model 0 (cartoon
    # overall, sticks for residues named UNK or LIG); the last styles model 1
    # as sticks.
    reps = [
        {
            "model": 0,
            "style": "cartoon",
            "color": "whiteCarbon",
        },
        {
            "model": 0,
            "resname": "UNK",
            "style": "stick",
            "color": "greenCarbon",
        },
        {
            "model": 0,
            "resname": "LIG",
            "style": "stick",
            "color": "greenCarbon",
        },
        {
            "model": 1,
            "style": "stick",
            "color": "greenCarbon",
        },
    ]

    out = Molecule3D(reps=reps)
    run_time = gr.Textbox(label="Runtime")

    btn.click(
        predict,
        inputs=[input_sequence, input_ligand, input_protein, exhaustiveness],
        outputs=[out, run_time],
    )

app.launch()