|
import librosa |
|
import requests |
|
import torch |
|
import torch.nn.functional as F |
|
from transformers import AudioClassificationPipeline, AutoConfig, Wav2Vec2Processor |
|
from io import BytesIO |
|
|
|
# Run inference on GPU when one is available; preprocessed inputs are
# moved to this device before the forward pass.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# HuggingFace Hub checkpoint: wav2vec2 XLS-R 300M fine-tuned for
# emotion recognition on Russian speech.
model_name_or_path = "KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru"

# Model config supplies the id2label mapping used when formatting results.
config = AutoConfig.from_pretrained(model_name_or_path)

# Processor bundles the checkpoint's feature extractor; note that both
# from_pretrained calls may download weights/config on first use.
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)

# Sampling rate the feature extractor expects; audio is resampled to
# this rate during preprocessing.
sampling_rate = processor.feature_extractor.sampling_rate
|
|
|
|
|
class SpeechEmotionRecognitionPipeline(AudioClassificationPipeline):
    """Pipeline that classifies the emotion expressed in a speech recording.

    Input may be an http(s) URL, a local file path, or any file-like
    object readable by ``librosa``. Output is a list of
    ``{"label": ..., "score": ...}`` dicts, one per emotion label.
    """

    def _sanitize_parameters(self, **pipeline_parameters):
        # No pipeline-specific parameters are supported: pass nothing
        # extra to preprocess/_forward/postprocess.
        return {}, {}, {}

    def preprocess(self, inputs):
        """Load audio from ``inputs`` and return model-ready input values.

        Returns a batched tensor of input values moved to ``device``.
        """
        if isinstance(inputs, str):
            if inputs.startswith(("http://", "https://")):
                # Fetch remote audio. A timeout prevents hanging forever,
                # and raise_for_status() surfaces HTTP errors instead of
                # handing an error page to the audio decoder.
                response = requests.get(inputs, timeout=30)
                response.raise_for_status()
                inputs = BytesIO(response.content)
            # A local path string is passed straight to librosa.load,
            # which accepts paths as well as file-like objects. This
            # avoids the original explicit open() whose file handle was
            # never closed.

        # Decode and resample to the rate the feature extractor expects.
        speech, _ = librosa.load(inputs, sr=sampling_rate)
        features = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)

        return features.input_values.to(device)

    def _forward(self, model_inputs):
        # Pure inference: disable autograd to avoid building a graph
        # (the outputs are only read, never backpropagated).
        with torch.no_grad():
            return self.model(model_inputs)

    def postprocess(self, model_outputs):
        """Convert raw logits into a list of label/score dicts."""
        logits = model_outputs.logits

        # Softmax over the label dimension; index [0] takes the single
        # item in the batch.
        scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
        # float(...) turns numpy scalars into plain Python floats so the
        # result is JSON-serializable.
        outputs = [{"label": config.id2label[i], "score": round(float(score), 5)}
                   for i, score in enumerate(scores)]
        return outputs
|
|