KELONMYOSA committed on
Commit
50560af
1 Parent(s): b39f025
Files changed (2) hide show
  1. config.json +18 -1
  2. emotion_recognition_pipeline.py +32 -0
config.json CHANGED
@@ -10,7 +10,7 @@
10
  ],
11
  "attention_dropout": 0.1,
12
  "auto_map": {
13
- "AutoModelForAudioClassification": "emotion_model.Wav2Vec2ForSpeechClassification"
14
  },
15
  "bos_token_id": 1,
16
  "classifier_proj_size": 256,
@@ -46,6 +46,23 @@
46
  ],
47
  "ctc_loss_reduction": "mean",
48
  "ctc_zero_infinity": true,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  "diversity_loss_weight": 0.1,
50
  "do_stable_layer_norm": true,
51
  "eos_token_id": 2,
 
10
  ],
11
  "attention_dropout": 0.1,
12
  "auto_map": {
13
+ "AutoModelForAudioClassification": "KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru--emotion_model.Wav2Vec2ForSpeechClassification"
14
  },
15
  "bos_token_id": 1,
16
  "classifier_proj_size": 256,
 
46
  ],
47
  "ctc_loss_reduction": "mean",
48
  "ctc_zero_infinity": true,
49
+ "custom_pipelines": {
50
+ "audio-classification": {
51
+ "default": {
52
+ "model": {
53
+ "pt": [
54
+ "KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru",
55
+ "main"
56
+ ]
57
+ }
58
+ },
59
+ "impl": "emotion_recognition_pipeline.SpeechEmotionRecognitionPipeline",
60
+ "pt": [
61
+ "Wav2Vec2ForSpeechClassification"
62
+ ],
63
+ "tf": []
64
+ }
65
+ },
66
  "diversity_loss_weight": 0.1,
67
  "do_stable_layer_norm": true,
68
  "eos_token_id": 2,
emotion_recognition_pipeline.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import librosa
import torch
import torch.nn.functional as F
from transformers import Pipeline, AutoConfig, Wav2Vec2Processor

# Module-level setup shared by the pipeline class below.
# Pick the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hub repository this custom pipeline belongs to.
model_name_or_path = "KELONMYOSA/wav2vec2-xls-r-300m-emotion-ru"
# NOTE(review): both from_pretrained calls perform network I/O (Hub download)
# at import time of this module.
config = AutoConfig.from_pretrained(model_name_or_path)
processor = Wav2Vec2Processor.from_pretrained(model_name_or_path)
# Target sampling rate expected by the wav2vec2 feature extractor;
# audio is resampled to this rate in preprocess().
sampling_rate = processor.feature_extractor.sampling_rate
13
class SpeechEmotionRecognitionPipeline(Pipeline):
    """Custom audio-classification pipeline mapping a speech file to emotion scores.

    ``preprocess`` loads and resamples the audio, ``_forward`` runs the
    wav2vec2 classification model, and ``postprocess`` converts the logits
    into a list of ``{"label": ..., "score": ...}`` dicts covering every
    label in the model config.
    """

    def _sanitize_parameters(self, **pipeline_parameters):
        # No user-configurable options: nothing is routed to
        # preprocess/_forward/postprocess.
        return {}, {}, {}

    def preprocess(self, audio, second_text=None):
        # ``second_text`` is unused; kept for signature compatibility with callers.
        # Resample the file to the rate the feature extractor expects.
        speech, _ = librosa.load(audio, sr=sampling_rate)
        features = processor(speech, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
        # Fix: move the inputs to the pipeline's own device (self.device, set by
        # transformers.Pipeline) rather than the module-level global, so the
        # tensors always land on the same device as self.model — the global
        # could disagree when the pipeline is built with an explicit `device=`.
        return features.input_values.to(self.device)

    def _forward(self, model_inputs):
        # Pass explicitly as input_values (same positional slot) for clarity.
        return self.model(input_values=model_inputs)

    def postprocess(self, model_outputs):
        logits = model_outputs.logits
        # Softmax over the label dimension; a single clip is processed, so the
        # batch dimension is dropped with [0].
        scores = F.softmax(logits, dim=1).detach().cpu().numpy()[0]
        # Cast numpy float32 -> builtin float so the output is JSON-serializable.
        return [
            {"label": config.id2label[i], "score": round(float(score), 5)}
            for i, score in enumerate(scores)
        ]