KELONMYOSA committed
Commit • 50560af
1 Parent(s): b39f025

pipeline

Files changed:
- config.json +18 -1
- emotion_recognition_pipeline.py +32 -0
config.json CHANGED
@@ -10,7 +10,7 @@
   ],
   "attention_dropout": 0.1,
   "auto_map": {
-    "AutoModelForAudioClassification": "emotion_model.Wav2Vec2ForSpeechClassification"
+    "AutoModelForAudioClassification": "KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru--emotion_model.Wav2Vec2ForSpeechClassification"
   },
   "bos_token_id": 1,
   "classifier_proj_size": 256,
@@ -46,6 +46,23 @@
   ],
   "ctc_loss_reduction": "mean",
   "ctc_zero_infinity": true,
+  "custom_pipelines": {
+    "audio-classification": {
+      "default": {
+        "model": {
+          "pt": [
+            "KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru",
+            "main"
+          ]
+        }
+      },
+      "impl": "emotion_recognition_pipeline.SpeechEmotionRecognitionPipeline",
+      "pt": [
+        "Wav2Vec2ForSpeechClassification"
+      ],
+      "tf": []
+    }
+  },
   "diversity_loss_weight": 0.1,
   "do_stable_layer_norm": true,
   "eos_token_id": 2,
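The new "custom_pipelines" entry registers SpeechEmotionRecognitionPipeline as the default audio-classification implementation for this repo, and the repo-prefixed "auto_map" value lets AutoModelForAudioClassification resolve the model class from the repo's remote code. A minimal sketch of how this registration is typically consumed (not part of the commit; assumes Hub access, and trust_remote_code=True is required to run the repo's own code):

from transformers import pipeline

# Loading by repo id picks up the "custom_pipelines" registration in
# config.json; trust_remote_code=True permits downloading and executing
# the implementation in emotion_recognition_pipeline.py.
pipe = pipeline(
    "audio-classification",
    model="KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru",
    trust_remote_code=True,
)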
emotion_recognition_pipeline.py ADDED
@@ -0,0 +1,32 @@
+import librosa
+import torch
+import torch.nn.functional as F
+from transformers import Pipeline, AutoConfig, Wav2Vec2Processor
+
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model_name_or_path = "KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru"
+config = AutoConfig.from_pretrained(model_name_or_path)
+processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
+sampling_rate = processor.feature_extractor.sampling_rate
+
+
+class SpeechEmotionRecognitionPipeline(Pipeline):
+    def _sanitize_parameters(self, **pipeline_parameters):
+        return {}, {}, {}
+
+    def preprocess(self, audio, second_text=None):
+        speech, sr = librosa.load(audio, sr=sampling_rate)
+        features = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
+
+        return features.input_values.to(device)
+
+    def _forward(self, model_inputs):
+        return self.model(model_inputs)
+
+    def postprocess(self, model_outputs):
+        logits = model_outputs.logits
+
+        scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
+        outputs = [{"label": config.id2label[i], "score": round(score, 5)} for i, score in
+                   enumerate(scores)]
+        return outputs
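The added pipeline resamples the input with librosa to the processor's sampling rate (16 kHz for wav2vec2 feature extractors), runs a single forward pass, and returns softmax scores for every label in config.id2label rather than only the top prediction. A short usage sketch, assuming the pipe object from the example above; the file path and the printed labels are illustrative, not taken from the commit:

# "speech.wav" is a hypothetical local audio file in any format librosa can decode
result = pipe("speech.wav")
print(result)
# A list of {"label": ..., "score": ...} dicts covering all emotion classes,
# e.g. [{"label": "neutral", "score": 0.91}, {"label": "anger", "score": 0.02}, ...]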