from huggingface_hub import from_pretrained_keras
import numpy as np
import gradio as gr
import transformers
import tensorflow as tf


class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of data."""

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=32,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Load the BERT tokenizer to encode the text.
        # We use the bert-base-uncased pretrained model.
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieve the batch at this index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With the BERT tokenizer's batch_encode_plus, both sentences of each
        # pair are encoded together, separated by the [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=128,
            return_attention_mask=True,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )

        # Convert the batch of encoded features to numpy arrays.
        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        # Targets are only included when the generator is used for
        # training/validation.
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # Shuffle the indexes after each epoch if shuffle is enabled.
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)


model = from_pretrained_keras("keras-io/bert-semantic-similarity")
labels = ["contradiction", "entailment", "neutral"]


def predict(*sentences):
    if len(sentences) != 6:
        return {"error": "Expected 6 sentences"}
    # Pair each user sentence with its expected response.
    sentence_pairs = np.array(
        [[str(sentences[i]), str(expected_responses[i])] for i in range(6)]
    )
    test_data = BertSemanticDataGenerator(
        sentence_pairs,
        labels=None,
        batch_size=1,
        shuffle=False,
        include_targets=False,
    )
    # Note: with batch_size=1, only the first sentence pair (batch 0) is scored.
    probs = model.predict(test_data[0])[0]
    return {label: float(prob) for label, prob in zip(labels, probs)}


expected_responses = [
    "respuesta1a",
    "respuesta2a",
    "respuesta3a",
    "respuesta4a",
    "respuesta5a",
    "respuesta6a",
]

# Each example row must supply a value for all six inputs.
examples = [
    [
        "Two women are observing something together.",
        "A smiling costumed woman is holding an umbrella",
        "A soccer game with multiple males playing",
        "Some men are playing a sport",
        "Another example sentence",
        "One more example for the sixth input",
    ]
]

# Gradio interface
gr.Interface(
    fn=predict,
    title="Semantic Similarity with BERT",
    description="Natural Language Inference by fine-tuning a BERT model on the SNLI Corpus 📰",
    inputs=[gr.Textbox(label=f"Input {i+1}") for i in range(6)],
    examples=examples,
    outputs=gr.Label(num_top_classes=3, label="Semantic similarity"),
    cache_examples=False,
    article="Author: Vu Minh Chien. Based on the keras example from Mohamad Merchant",
).queue().launch(debug=True)
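# --- Usage sketch (illustrative, not part of the original app) ---
# A minimal way to sanity-check the model without the UI is to call
# predict() directly with six sentences (run this in a separate session,
# since launch(debug=True) above blocks). The sentences below are the
# example inputs from this file; the return value maps each NLI label
# to its probability.
#
# result = predict(
#     "Two women are observing something together.",
#     "A smiling costumed woman is holding an umbrella",
#     "A soccer game with multiple males playing",
#     "Some men are playing a sport",
#     "Another example sentence",
#     "One more example for the sixth input",
# )
# print(result)  # {"contradiction": ..., "entailment": ..., "neutral": ...}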