ljy266987 committed on
Commit
56c3b64
1 Parent(s): 7c306ab
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,345 @@
1
+ # coding=utf-8
2
+
3
+ import os
4
+ import librosa
5
+ import base64
6
+ import io
7
+ import gradio as gr
8
+ import re
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torchaudio
13
+ from modelscope import HubApi
14
+
15
+ api = HubApi()
16
+
17
+ key = os.environ.get("apikey", "")
18
+ try:
19
+ api.login(key)
20
+ except Exception:
21
+ pass
22
+
23
+ from funasr import AutoModel
24
+
25
+ # model = "/Users/zhifu/Downloads/modelscope_models/SenseVoiceSmall"
26
+ model = "iic/SenseVoiceSmall"
27
+ model = AutoModel(model=model,
28
+ vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
29
+ vad_kwargs={"max_single_segment_time": 30000},
30
+ trust_remote_code=True,
31
+ )
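+ # The FSMN VAD front end segments long recordings into chunks of at most 30 s (max_single_segment_time is in ms) before recognition.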
32
+
33
+ import re
34
+
35
+ emo_dict = {
36
+ "<|HAPPY|>": "😊",
37
+ "<|SAD|>": "😔",
38
+ "<|ANGRY|>": "😡",
39
+ "<|NEUTRAL|>": "",
40
+ "<|FEARFUL|>": "😰",
41
+ "<|DISGUSTED|>": "🤢",
42
+ "<|SURPRISED|>": "😮",
43
+ }
44
+
45
+ event_dict = {
46
+ "<|BGM|>": "🎼",
47
+ "<|Speech|>": "",
48
+ "<|Applause|>": "👏",
49
+ "<|Laughter|>": "😀",
50
+ "<|Cry|>": "😭",
51
+ "<|Sneeze|>": "🤧",
52
+ "<|Breath|>": "",
53
+ "<|Cough|>": "🤧",
54
+ }
55
+
56
+ emoji_dict = {
57
+ "<|nospeech|><|Event_UNK|>": "❓",
58
+ "<|zh|>": "",
59
+ "<|en|>": "",
60
+ "<|yue|>": "",
61
+ "<|ja|>": "",
62
+ "<|ko|>": "",
63
+ "<|nospeech|>": "",
64
+ "<|HAPPY|>": "😊",
65
+ "<|SAD|>": "😔",
66
+ "<|ANGRY|>": "😡",
67
+ "<|NEUTRAL|>": "",
68
+ "<|BGM|>": "🎼",
69
+ "<|Speech|>": "",
70
+ "<|Applause|>": "👏",
71
+ "<|Laughter|>": "😀",
72
+ "<|FEARFUL|>": "😰",
73
+ "<|DISGUSTED|>": "🤢",
74
+ "<|SURPRISED|>": "😮",
75
+ "<|Cry|>": "😭",
76
+ "<|EMO_UNKNOWN|>": "",
77
+ "<|Sneeze|>": "🤧",
78
+ "<|Breath|>": "",
79
+ "<|Cough|>": "😷",
80
+ "<|Sing|>": "",
81
+ "<|Speech_Noise|>": "",
82
+ "<|withitn|>": "",
83
+ "<|woitn|>": "",
84
+ "<|GBG|>": "",
85
+ "<|Event_UNK|>": "",
86
+ }
87
+
88
+ lang_dict = {
89
+ "<|zh|>": "<|lang|>",
90
+ "<|en|>": "<|lang|>",
91
+ "<|yue|>": "<|lang|>",
92
+ "<|ja|>": "<|lang|>",
93
+ "<|ko|>": "<|lang|>",
94
+ "<|nospeech|>": "<|lang|>",
95
+ }
96
+
97
+ emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
98
+ event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷",}
99
+
100
+ def format_str(s):
101
+ for sptk in emoji_dict:
102
+ s = s.replace(sptk, emoji_dict[sptk])
103
+ return s
104
+
105
+
106
+ def format_str_v2(s):
107
+ sptk_dict = {}
108
+ for sptk in emoji_dict:
109
+ sptk_dict[sptk] = s.count(sptk)
110
+ s = s.replace(sptk, "")
111
+ emo = "<|NEUTRAL|>"
112
+ for e in emo_dict:
113
+ if sptk_dict[e] > sptk_dict[emo]:
114
+ emo = e
115
+ for e in event_dict:
116
+ if sptk_dict[e] > 0:
117
+ s = event_dict[e] + s
118
+ s = s + emo_dict[emo]
119
+
120
+ for emoji in emo_set.union(event_set):
121
+ s = s.replace(" " + emoji, emoji)
122
+ s = s.replace(emoji + " ", emoji)
123
+ return s.strip()
124
+
125
+ def format_str_v3(s):
126
+ def get_emo(s):
127
+ return s[-1] if s[-1] in emo_set else None
128
+ def get_event(s):
129
+ return s[0] if s[0] in event_set else None
130
+
131
+ s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
132
+ for lang in lang_dict:
133
+ s = s.replace(lang, "<|lang|>")
134
+ s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
135
+ new_s = " " + s_list[0]
136
+ cur_ent_event = get_event(new_s)
137
+ for i in range(1, len(s_list)):
138
+ if len(s_list[i]) == 0:
139
+ continue
140
+ if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
141
+ s_list[i] = s_list[i][1:]
142
+ #else:
143
+ cur_ent_event = get_event(s_list[i])
144
+ if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
145
+ new_s = new_s[:-1]
146
+ new_s += s_list[i].strip().lstrip()
147
+ new_s = new_s.replace("The.", " ")
148
+ return new_s.strip()
149
+
150
+ def model_inference(input_wav, language, fs=16000):
151
+ # task_abbr = {"Speech Recognition": "ASR", "Rich Text Transcription": ("ASR", "AED", "SER")}
152
+ language_abbr = {"auto": "auto", "zh": "zh", "en": "en", "yue": "yue", "ja": "ja", "ko": "ko",
153
+ "nospeech": "nospeech"}
154
+
155
+ # task = "Speech Recognition" if task is None else task
156
+ language = "auto" if len(language) < 1 else language
157
+ selected_language = language_abbr[language]
158
+ # selected_task = task_abbr.get(task)
159
+
160
+ # print(f"input_wav: {type(input_wav)}, {input_wav[1].shape}, {input_wav}")
161
+
162
+ if isinstance(input_wav, tuple):
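+ # Gradio's Audio component yields a (sample_rate, samples) tuple; the int16 samples are scaled to float32 in [-1, 1], downmixed to mono, and resampled to 16 kHz below.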
163
+ fs, input_wav = input_wav
164
+ input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
165
+ if len(input_wav.shape) > 1:
166
+ input_wav = input_wav.mean(-1)
167
+ if fs != 16000:
168
+ print(f"audio_fs: {fs}")
169
+ resampler = torchaudio.transforms.Resample(fs, 16000)
170
+ input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
171
+ input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
172
+
173
+ # DecodingOptions = {
174
+ # "task": selected_task,
175
+ # "language": selected_language,
176
+ # "fp16": True,
177
+ # "gain_event": True,
178
+ # }
179
+ merge_vad = True #False if selected_task == "ASR" else True
180
+ print(f"language: {language}, merge_vad: {merge_vad}")
181
+ text = model.generate(input=input_wav,
182
+ cache={},
183
+ language=language,
184
+ use_itn=True,
185
+ batch_size_s=0, merge_vad=merge_vad)
186
+ # if len(input_wav) > 16000*30:
187
+ # text = model.generate(input=input_wav, task=selected_task, language=language, batch_size_s=0)
188
+ # else:
189
+ # text = model.inference(input=input_wav, task=selected_task, language=language, batch_size_s=0)
190
+ print(text)
191
+ text = text[0]["text"]
192
+ #text = format_str(text)
193
+ #text = format_str_v2(text)
194
+ text = format_str_v3(text)
195
+ # text = distingush_speech(text)
196
+ # text = "".join(text)
197
+ print(text)
198
+
199
+ return text
200
+
201
+
202
+ audio_examples = [
203
+ ["example/zh.mp3", "zh"],
204
+ ["example/yue.mp3", "yue"],
205
+ ["example/en.mp3", "en"],
206
+ ["example/ja.mp3", "ja"],
207
+ ["example/ko.mp3", "ko"],
208
+ ["example/emo_1.wav", "auto"],
209
+ ["example/emo_2.wav", "auto"],
210
+ ["example/emo_3.wav", "auto"],
211
+ #["example/emo_4.wav", "auto"],
212
+ #["example/event_1.wav", "auto"],
213
+ #["example/event_2.wav", "auto"],
214
+ #["example/event_3.wav", "auto"],
215
+ ["example/rich_1.wav", "auto"],
216
+ ["example/rich_2.wav", "auto"],
217
+ #["example/rich_3.wav", "auto"],
218
+ ["example/longwav_1.wav", "auto"],
219
+ ["example/longwav_2.wav", "auto"],
220
+ ["example/longwav_3.wav", "auto"],
221
+ #["example/longwav_4.wav", "auto"],
222
+ ]
223
+
224
+
225
+ description = """
226
+
227
+ # SenseVoice is a speech foundation model with multiple speech understanding capabilities, including automatic speech recognition (ASR), spoken language identification (LID), speech emotion recognition (SER), and acoustic event classification (AEC) or acoustic event detection (AED).
228
+
229
+ ## Usage
230
+ ### Upload an audio file or record from the microphone, then select the language.
231
+
232
+ *Language*
233
+ - `auto`: the audio language will be detected automatically.
234
+ - A specific language (`zh`, `en`, `yue`, `ja`, `ko`, or `nospeech`) can also be selected from the dropdown.
235
+
236
+ Recommended audio input duration is below 30 seconds. For audio longer than 30 seconds, local deployment from the GitHub repo is recommended.
237
+
238
+ """
239
+
240
+ html_content = """
241
+ <div>
242
+ <h2 style="font-size: 22px;margin-left: 0px;">Voice Understanding Model: SenseVoice-Small</h2>
243
+ <p style="font-size: 18px;margin-left: 20px;">SenseVoice-Small is an encoder-only speech foundation model designed for rapid voice understanding. It encompasses a variety of features including automatic speech recognition (ASR), spoken language identification (LID), speech emotion recognition (SER), and acoustic event detection (AED). SenseVoice-Small supports multilingual recognition for Chinese, English, Cantonese, Japanese, and Korean. Additionally, it offers exceptionally low inference latency, performing 7 times faster than Whisper-small and 17 times faster than Whisper-large.</p>
244
+ <h2 style="font-size: 22px;margin-left: 0px;">Usage</h2> <p style="font-size: 18px;margin-left: 20px;">Upload an audio file or record from the microphone, then select the language. The audio is transcribed into the corresponding text along with the associated emotion (😊 happy, 😡 angry/excited, 😔 sad) and sound-event types (😀 laughter, 🎼 music, 👏 applause, 🤧 cough &amp; sneeze, 😭 cry). Event labels are placed at the beginning of the text and emotion labels at the end.</p>
245
+ <p style="font-size: 18px;margin-left: 20px;">Recommended audio input duration is below 30 seconds. For audio longer than 30 seconds, local deployment is recommended.</p>
246
+ <h2 style="font-size: 22px;margin-left: 0px;">Repo</h2>
247
+ <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/FunAudioLLM/SenseVoice" target="_blank">SenseVoice</a>: multilingual speech understanding model</p>
248
+ <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/modelscope/FunASR" target="_blank">FunASR</a>: fundamental speech recognition toolkit</p>
249
+ <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/modelscope/CosyVoice" target="_blank">CosyVoice</a>: high-quality multilingual TTS model</p>
250
+ </div>
251
+ """
252
+
253
+ # HTML and CSS for the centered results table
254
+ centered_table_html = """
255
+ <style>
256
+ .centered-table {
257
+ margin-left: auto;
258
+ margin-right: auto;
259
+ }
260
+ </style>
261
+ <div class="centered-table">
262
+ <table border="1" style="border-collapse: collapse; width: 100%;">
263
+ <tr>
264
+ <th>Samples</th>
265
+ <th>Speech Recognition</th>
266
+ <th>Rich Text Transcription</th>
267
+ </tr>
268
+
269
+
270
+ <tr>
271
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E9%9F%A9%E8%AF%AD-%E8%8B%B1%E8%AF%AD-%E7%BB%BC%E8%89%BA.wav" target="_blank">韩语-英语-综艺</a></td>
272
+ <td>자, 둘, 셋! 안녕 하 세요?저는 새봄 영원 신화 입니다. 한 주간 잘 지내셨나요? 오늘은 어떤 영상을 보러 들을 거냐 혹시 밴드 음악 좋아해요 네 약간 락 그런 거 그냥 밴드요 밴드 음악 좋아하는 밴드 누구 있어요 저요 뭐 우리나라 아니어도 상관없어요 어...라디오 해군 아 진짜 형은요? FT 아일랜드 그런 그런 그룹 들이 많이 있 는데 그런 관련 된 중국 에 서도 밴드 관련 된 프로그램 이 있 다고 해요?그래서 그 프로그램 영상 을 한번 만나 볼 건데 그렇게 감안 해서 생각 하 시고, 보 면서 어떤 음악 을 하 는지 또 밴드 가, 밴드라는 지칭하는 단어가 여기 악기들이 들어가는 그런 연주잖아요 그래서 느낌이 다르지 안 다르지는 잘 모르겠는데 좀 전 비슷할 것 같아요 그래서 한번 보도록 하겠습니다 틀어주세요. え?うん。 なんか、何なんだ。 여기 왔다 약간 벨로그 こんばんは? 이디엠 이디엠인가 뭐야, 재밌어야? 哦。 Like, what is that? 세일 고려하는 게 그렇게 많다 아 저 뒤에 사람 있는 게 신기한 화면이 아니라 이거 레이저인 거야 화면이 레이저를 잘 할 필요가 없는데, 정전을 잘 못해? I'm 80 now. 是吧。 No touch. てるよ。 패트가 있다는 게 아니라 투명 투명</td>
273
+ <td>자, 둘, 셋 안녕하세요 저는 새봄 영원 신화입니다 자, 여러분들! 한주가 잘 지내셨나요 😀 ?오늘은 어떤 영상을 불러드릴 거냐 혹시 밴드 음악 좋아요? 그냥 밴드요 밴드 밴드 음악 밴드 음악이 락 아닌가 밴드 좋아하는 밴드 누구 있어요 저요 우리나라 아니어도 상관없어요 어...라디오 헤드 아 형은요? 🎵 FG 아일랜드 그런 그런 그룹 들이 많이 있 는데 그런 관련 된 중국 에 서도 밴드 관련 된 프로그램 이 있 다고 해요?그래서, 그 프로그램 영상 을 한번 만나 볼 건데 그렇게 감안 해서 생각 하 시고, 보 면서 어떤 음악 을 하 는지 또 밴드 가, 🎶 밴드라는 지칭하는 단어가 여기 악기들이 들어가는 그런 연주잖아요 그래서 느낌이 다르지 안 다르지는 잘 모르겠는데 좀 전 비슷할 것 같아요 그래서 한번 보도록 하겠습니다 자 틀어주세요. 🎵 들어왔다 약간 갤럭시 온라인? 🎶 😀 What is that. 🎵 되게 화려한데, 렉크가 아 저 뒤에 사람 있는 게 신기해 화면이 아니라 이거 레이저인 거야?레이저를 저렇게 조절이 되냐 누구 저게 와우. 🎶 👏 No touch. 👏 👏 👏 👏 👏 🎵 패스가 있다는 게 아니라 투명 투명. 🎶</td>
274
+
275
+ </tr>
276
+
277
+ <tr>
278
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E8%8B%B1%E6%96%87-%E6%AD%8C%E6%9B%B2.wav" target="_blank">英文-歌曲</a></td>
279
+ <td>What's up, I'm Morris, I'm playing Blood Incantation Hey, this is Jeff Barrett. And this is Isaac Falk, it's Paul Riedel from Blood Incantation here at Amoeba in Hollywood, we're doing What's in My Bag. So, yeah, I found. This CD, it's a. Fates warning, perfect symmetry. 在一。 It's the album they did right around the time of another album, Parallels, where they had a new singer, this is where this band really started going in the progressive direction, where very extended like solo sections and odd time signatures and stuff like that and yeah, very influential, actually parts of this were very influential on Hidden History of the Human Race and we were working on that. First thing in my bag is. Mordred Angels 1999 album, Formulas Fatal to the Flesh. I don't own this on vinyl, I think this is the second. Repress, huge influence on floating containment, if you can tell, one of our favorite records. And it's good to find it here. Yeah, total classic. The first thing I found was in the new arrivals bin. ста 90s reissue of the second Wallenstein album. Classic krautrock on the pills label, great psychedelic Prague krautrock with a little bit of folk and kind of symphonic aspect, killer. And a great cover. Yeah, mother universe. First thing I picked out, I went straight to the jazz section. Finds this record that. Changed my life, I got to see Dave Brubeck before he died when I was about 17 in New Jersey. The saxophone player on this, Paul Desmond, is also one of my favorite jazz players, super important record to me and my dad, which he grew on very quickly. My next thing. Is this Osamu Kitajima album? I have not heard this one, but I'm a really big fan of his stuff from the 70s and 80s, and this album has a 12 minute song, so I'm guessing it's going to be pretty cool, I just want to read the back of this because this was like really interesting to me, it says,"Higher Octave Music is founded upon the vision that we have entered an era of global cooperation and unity in which we have entered an era of global cooperation and unity, We are dedicated to a process of continuous refinement, both artistically and commercially, as in music, so in life. I was like, that sounds great, I'm going to buy that next I got the Carlos Santana John McLaughlin record. Love, devotion, surrender. Morris picked it up at another record store on this tour, they're two amazing guitarists that I love both of their solo works, so I'd like to see them work together. Next is a record I have not heard, but was apparently unavailable on official format until this reissue from Wawa by Bernard Zolotol's like a classic new age. Type of progressive electronics guy. A lot of Terry Riley style tape loops, rather than like. You know, harsh digital sequencers and synthesizer stuff. So I have a couple records from this guy. I never heard this one. It includes a bonus 7". There's also a song called Gliding through the Cosmophonic Dome. So it's probably great. Next step, I got.</td>
280
+ <td>What's up. I'm Morris playing blood incantation. Hey, this is Jeff Barrett, and this is Isaac Falk. It's Paul Riedel from Blood Incantation here at Amoeba in Hollywood. We're doing what's in my bag. 🎼 So, yeah, I found this Cd. It's a fate's warning. Perfect symmetry. 🎵 It's the album they did right around the time of another album, Parallels, where they had a new singer. This is where this band really started going in the progressive direction, where. 🎶 Very extended, like solo sections and odd time signatures and stuff like that. And yeah, very influential. Actually, parts of this were very influential on hidden history of the human race. And we're working on that. The first thing in my bag is more of an angel's album. Formulas fatal to the flesh. 🎼 I don't own this on vinyl. I think this is the second repress. Huge influence on loading, 🎵 if you can tell one of our favorite records. 🎶 And it's good to find it here. Yeah, total classic. The first thing I found was in the new arrivals bin, 🎵 the S reissue of the second Wallenstein album. Classic kraut rock on the pills label. Great psychedelic 🎶 Prague kraut rock with a little bit of folk and kind of symphonic aspect 🎵 killer and a great cover. Yeah, mother universe. First thing I picked out, I went straight to the jazz section. 🎶 😊 To find this record that changed my life, I got to see Dave Brubeck before he died when I was about seventeen 🎵 in New Jersey. 🎶 The saxophone player on this, Paul Desmond, is also one of my favorite jazz players, super important record to me and my dad, which he grew on very quickly. 🎼 🎵 My next thing is this Osamu Kitajima album. 🎶 I have not heard this one, but I'm a really big fan of his stuff from the S And S. And this album has a minute song. So I'm. I'm guessing it's going to be pretty cool. I just want to read the back of this because this was like, really interesting to me. It says higher octave music is founded upon the vision that we have entered an era of. 😊 Global cooperation and unity in which music plays an integral 🎵 part. Our purpose is to help set a new standard of excellence in the music of this era. 🎶 We are dedicated to a process of continuous refinement, both artistically and commercially, as in music. So in life, 🎵 I was like, that sounds great. I'm going to buy that. Next, I got the Carlos Santana John McLaughlin record. 🎶 😊 Love, devotion, surrender. 🎼 Morris picked it up at another record store on this tour. 🎵 There are two amazing guitarists that I love both of their solo works. So I'd like to see them work together. 🎶 😊 Next is a record I have not heard, but was apparently unavailable on official format until this reissue from Wawa. But Bernard Zolotol is like a classic new age. 🎵 Type of progressive electronics guy. A lot of Terry Riley style tape loops, rather than like. You know, harsh digital sequencers and synthesizer stuff. So I have a couple records from the sky. I never heard this one. It includes a bonus seven inch. There's also a song called Gliding through the Cosmophonic Dome. So it's probably great. Next up, I got. 🎶</td>
281
+
282
+ </tr>
283
+
284
+ <tr>
285
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E8%8B%B1%E6%96%87-%E4%B8%AD%E6%96%87-%E8%85%BE%E6%A0%BC%E5%B0%94-%E5%A4%A9%E5%A0%82.wav" target="_blank">英文-中文-腾格尔-天堂</a></td>
286
+ <td>Tangry with his song, heaven. de de de le de Absolute shock, but in a great way. That was awesome. That was awesome. What way to open a song that was awesome, awesome. I'd love to check out some more Mongolian throat singing stuff. That is correct, right, It is Mongolian. Let me know. I'd love to check out more. I think a lot of you want to check out the Who if you guys still want me to, I'd be more than happy to. de de de 蓝蓝的天空,清清的湖水啊。 That is incredible, that is incredible. That is incredible for those of you don't know what I'm saying right now, the way he can make it sound like he's finished a note, you know, he like lowers it so low you can't even hear the note anymore and then he brings it back and you can see his mouth still open the way it makes the way he can like finish a note but not finish it, I don't know how to explain that that is an incredible talent that is amazing. 哦哟这是我的家。</td>
287
+ <td><span style='color: black;'> Tangry with his song. </span><span style='color: black;'> </span><span style='color: black;'> Heaven. </span><span style='color: #0D47A1;'> 🎵</span><span style='color: black;'>absolute.sock but in a great way🎶</span><span style='color: #0D47A1;'>. 🎵 </span><span style='color: black;'> Wow. </span><span style='color: black;'> </span><span style='color: black;'> That was awesome. That was awesome. What way to open a song. That was awesome, awesome. I'd love to check out some more Mongolian 🎶 folk singing stuff. That is correct, right, It is Mongolian. Let me know. I'd love to check out more. I think a lot of you want to check out the Who if you guys still want me to, I'd be more than happy to. </span><span style='color: #0D47A1;'> 😊 🎵蓝蓝的天空。轻轻的呼声。你要。绿绿的草原。🎶 </span><span style='color: black;'> That is incredible. That is incredible. </span><span style='color: black;'> </span><span style='color: black;'> That is incredible for those of you don't know what I'm saying right now, the way he can make it sound like he's finished a note, you know, he like lowers it so low you can't even hear the note anymore and then he brings it back and you can see his mouth still open the way it makes the way he can like finish a note but not finish it, I don't know how to explain that, that is an incredible talent, that is amazing. </span><span style='color: #0D47A1;'> 😡 🎵</span><span style='color: black;'>给我买什么?</span><span style='color: #0D47A1;'>这是我的家。哎,嘿。🎶</span></td>
288
+
289
+ </tr>
290
+
291
+ <tr>
292
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E6%97%A5%E8%AF%AD-%E8%8B%B1%E8%AF%AD-%E8%8B%B9%E6%9E%9C.wav" target="_blank">日语-英语-苹果</a></td>
293
+ <td>And there's another big game coming to Mac this year, to tell you all about it, here's legendary game creator Kojima-san from Kojima Productions. はい、皆さんこんにちは。僕らが現在取り組んでいるマックのプロジェクトを本日ここで皆さんにご紹介できることをとても嬉しく思って。います。 僕自身、1994年に最初の Mac を購入して以来の Apple の大ファンです。 そして、僕とチームが手掛けた作品をマックでお届けすることが、長年の夢でもありましたマックでのゲーム体験は、ついに新しい時代に突入しました。 その時代に合わせて、デストランディングディレクターズカットの Mac 版のリリースが2023年年末に決定したことをここで発表させていただきます。 マック版デストランディングディレクターズカットでは、Apple の最新のテクノロジーを最大限に活かしたゲーム体験を皆さんにお届けしたいと思います。 メタル fx アップスケーリングによる高精度なグラフィックをはじめ、アップルシリコンの素晴らしいパフォーマンスやモダンなレンダリングパイプラインを兼ね備えたメタル3には今回とても驚かされました。 ぜひ多くの皆さんに、この革新的かつエクサイティングな新しいマックの環境に触れていただければと思っています。 このデストランディングディレクターズカットを皮切りに、今後の小島プロダクションタイトルについても Apple プラットフォームへの展開を積極的に行っていく予定です。 デストランディングディレクターズカットの先行予約は近日中に開始する予定です。ぜひ楽しみにお待ちください。それでは、ザインキュー。</td>
294
+ <td>And there's another big game coming to Mac this year to tell you all about it. Here's legendary game creator Kojima-san, from Kojima Productions. 🎵みなさんこんにちは、僕らが現在取り組んでいるマックのプロジェクトを本日ここで皆さんにご紹介できることをとても嬉しく思っています。🎶 僕自身、千九百九十四年に最初の mac を購入して以来の🎵アップルの大ファンです。そして、僕とチームが手掛けた作品を mac でお届けすることが、長年の夢でもありました mac でのゲーム体験は、ついに新しい時代に突入しました。🎶 その時代に合わせて、デストランディングディレクターズカットの🎵mac版のリリースが二千二十三年年末に決定したことをここで発表させていただきます。mac 版デストランディングディレクターズカットでは、apple の最新のテクノロジーを最大限に活かしたゲーム体験を皆さんにお届けしたいと思います。メタル fx アップスケーリングによる高精度なグラフィックをはじめ、アップルシリコンの素晴らしいパフォーマンスやモダンなレンダリングパイプラインを兼ね備えたメタルには今回とても驚かされました。ぜひ多くの皆さんに、この革新的かつエクサイティングな新しいマックの環境に触れていただければと思っています。このデストランディングディレクターズカットを皮切りに、今後の小島プロダクションタイトルについても、アップルプラットフォームへの展開を積極的に行っていく予定です。デストランディングディレクターズカットの先行予約は近日中に開始する予定です。是非楽しみにお待ちください。それでは、ザインキュー。🎶</td>
295
+
296
+ </tr>
297
+
298
+ <tr>
299
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E4%B8%AD%E8%8B%B1-%E5%8D%8E%E8%AF%AD%E4%B9%90%E5%9D%9B.wav" target="_blank">中英-华语乐坛</a></td>
300
+ <td>问你什么想法,我是我对你的表达。问你。 什么看法?我的世界,我的复活。 都说华流才是顶流,而随着华语乐坛的崛起,的确有不少华语歌手真正做到了用作品和歌声征服国际舞台。那么本期视频就为小伙伴们探点了这样火遍全球的四首华语歌曲。话不多说,快来看看有没有你喜欢的吧。 de number four play我呸,由蔡依林演唱,发行于2014年,是一首中西合并,风格十分前卫的歌曲。在这首歌中,蔡依林可谓突破了自己以往的尺度,特别是现场表演,更是气场全开,完全就是女王的风范。 想要挣,他挣我赔,快点去相亲,也是要付我赔,快点去那里来,想要挣我赔,我赔,早赔更赔。 什么都喜欢什么都会。 number three,左手指月,左手指月,指指人心。这是一首暗含佛家禅意的歌曲,除了精妙的作词之外,歌曲超三个八度的高音也只有原唱萨顶顶能演绎出其中的精髓。而她的现场演唱,更是让老外都惊羡不已。 此人是你全部的社会信念。 我的家。 啊,好好的。 number two,光年之外,这是好莱坞大片太空旅客专程邀请段子琪为电影创作的主题曲,而段子琪显然也不负他们所望。这首光年之外,不仅与电影的主题十分契合,而且火爆全网,成为了2017年的年度十大金曲。果然,华语小天后的魅力,你真的可以永远相信。 为爱遥远在空间之外,若能守候未知,为你等待。我没想到,为你活得多么荒凉。 伤多了一小,没有你根本不想逃。 de number one浮夸,或许很多小伙伴不知道的是,原创作者写这首歌,其实一开始就是为了纪念哥哥张国荣,后来被陈奕迅演唱后,更是成为了一个经典浮夸式的演绎。据说在2014年的某颁奖盛典,因为伊森的现场太过浮夸,以至于主办方不得不将这一段给剪掉。 天使帝女马与人类流花吧,一生只想你惊讶,我旧事只为传唱,不怕重聚。 好了,这就是本期节目的全部内容了,喜欢的小伙伴别忘了点赞关注,我们下期见,拜拜。</td>
301
+ <td><span style='color: #0D47A1;'>🎵问你什么想法,我是我对你的表达。问你。什么看法?我的世界,我的复活。</span><span style='color: black;'>都说华流才是顶流,而随着华语乐坛的崛起,的确有不少华语歌手真正做到了用作品和歌声征服国际舞台。那么本期视频就为小伙伴们探点了这样火遍全球的四首华语歌曲。话不多说,快来看看有没有你喜欢的吧。</span><span style='color: #0D47A1;'>😊🎶 </span><span style='color: black;'>number four play我呸,由蔡依林演唱,发行于2014年,是一首中西合并,风格十分前卫的歌曲。在这首歌中,蔡依林可谓突破了自己以往的尺度,特别是现场表演,更是气场全开,完全就是女王的风范。</span><span style='color: black;'> 🎼 </span><span style='color: black;'>number3,左手指月左手指月,指指人心。这是一首暗含佛家禅意的歌曲,除了精妙的作词之外,歌曲超三个八度的高音也只有原唱萨顶顶能演绎出其中的精髓。而她的现场演唱,更是让老外都惊🎵羡不已。</span><span style='color: #0D47A1;'>此人是你全部的社会信念。😊啊,一生。😊🎶 </span><span style='color: black;'>number two,光年之外,这是好莱坞大片太空旅客专程邀请段子琪为电影创作的主题曲,而段子琪显然也不负他们所望。这首光年之外,不仅与电影的主题十分契合,而且火爆全网,成为了2017年的年度十大金曲。果然,华语小天后的魅力,你真的可以永远相信。</span><span style='color: #0D47A1;'> 🎵为爱遥远在空间之外,若能守候未知,为你等待。我没想到,哎,我如同荒凉。</span><span style='color: black;'>number one浮夸,或许很多小伙伴不知道的是,原创作者写这首歌,其实一开始就是为了纪念哥哥张国荣,后来被陈奕迅演唱后,更是成为了一个经典浮夸式的演绎。据说在2014年的某颁奖盛典,因为伊森的现场🎶太过浮夸,以至于主办方不得不将这一段给剪掉。</span><span style='color: #0D47A1;'> 🎵歇斯底里,马儿,眼泪流花罢,一生只想你惊讶,我旧事只为传唱,不怕重聚。好了,这就是本期节目的全部内容了,喜欢的小伙伴别忘了点赞关注我们,下期见,拜拜。🎶</span></td>
302
+
303
+ </tr>
304
+
305
+ <tr>
306
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E4%B8%AD%E6%96%87-%E6%97%A5%E8%AF%AD-%E7%BD%91%E7%BA%A2%E6%AD%8C%E7%BF%BB%E5%94%B1.wav" target="_blank">中文-日语-网红歌翻唱</a></td>
307
+ <td>呼啦啦啦啦啦啦啦啦呼啦啦啦啦啦啦啦。 哈喽各位,这里是音乐萌太郎,我是小凡。几率死生命死之后,今天我们又有三首流行歌曲。 被日本看上了,到底被注入了怎样的灵魂,我们一起来听一下吧。东宝石的这首野狼disco,堪称今年最。 被洗脑的神曲前段时间不但风靡大学校园,就连陈伟霆也出了正宗的岗位教程。没想到转眼间,这首歌却被日语看上了,被软萌的罗丽依一唱,我竟然有。 在停裂暗循环的感觉。 夜叫ぼうか嘘が本当だ思いっきりして誰も忘れた君は一番だ知ってるから。 せえのこっちにりょうを描いてこっちに虹を描くいいね逆にこっちに虹を描いてこっちにりょうを描くすごい 哦,不要。 前段时间,由音雀视听书品赵方静演唱的古风电音盲咒,也凭借洗脑旋律在短时间内成功刷屏,这次更是背翻战神日语版走红网络。短短一周的时间,视频已经快要达到200万的播放了。 幻想が一粒の涙そんなの無理っての私だけじゃいき。 出らないあなたの言葉を思えば胸がギュッとなんだかずっと痛いや。 おお、女のない夜空以外。 听听听听。 还记得那首换装歌曲速吗?这次也被小姐姐翻唱成了日语版,不过对于这首歌还是有些争议。有网友表示,空灵的嗓音也许更适合这首歌的曲风。节目的最后,一起来听听这首日语版的歌曲吧。喜欢的小伙伴记得关注,我们下期见,拜拜。 連れ出されて星の中へ。 的从列。</td>
308
+ <td><span style='color: #0D47A1;'>🎵呼啦啦啦啦啦啦啦啦呼啦啦啦啦啦啦啦啦。😊hello,各位,这里是音乐萌太糖,我是小凡。几率死生命死之后,今天我们又有三首流行歌曲。🎶😊 </span><span style='color: black;'>被日本看上了,到底被注入了怎样的灵魂,我们一起来听一下吧。🎵东宝石的这首野狼disco,堪称今年最。</span><span style='color: black;'>😊</span><span style='color: black;'>被洗脑的神曲前段时间不但风靡大学校园,就连陈伟霆也出了正宗的岗位教程。没想到转眼间,这首歌却被日语看上了,被软萌的罗丽音一唱,我竟然有。</span><span style='color: black;'>😊</span><span style='color: black;'>在停恋爱循环的感觉。</span><span style='color: #0D47A1;'>😊夜叫ぼうか嘘が本当か思いっきりして誰も忘れた君は一番だ知ってるから。せーのこっちにりょうを描いてこっちに虹を描くいいね逆にこっちに虹を描いてこっちにりょうを描くすごい!哦,不要。前段时间,由音雀视听书品赵方静演唱的古风电音盲咒,也凭借洗脑旋律在短时间内成功刷屏,这次更是背翻战场日语版走红网络。短短一周的时间,视频已经快要达到200万的播放了。😊幻想が一粒の涙そんなの無理っての私だけじゃいき。出らないあなたの言葉を思えば胸がギュッとなんだかずっと痛いや。女のない夜空以外。</span><span style='color: black;'>还记得那首换装歌曲速吗?这次也被小姐姐翻唱成了日语版,不过对于这首歌还是有些争议。有网友表示,空灵的嗓音也许更适合这首歌的曲风。节目的最后,一起来听听这首日语版的歌曲吧。喜欢的小伙伴记得关注,我们下期见,拜拜。</span><span style='color: #0D47A1;'>😊連れ出されて星の中へ。的从容。🎶</span></td>
309
+
310
+ </tr>
311
+
312
+ </table>
313
+ </div>
314
+ """
315
+
316
+
317
+ def launch():
318
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
319
+ # gr.Markdown(description)
320
+ gr.HTML(html_content)
321
+ with gr.Row():
322
+ with gr.Column():
323
+ audio_inputs = gr.Audio(label="Upload audio or use the microphone")
324
+
325
+ with gr.Accordion("Configuration"):
326
+ # task_inputs = gr.Radio(choices=["Speech Recognition", "Rich Text Transcription"],
327
+ # value="Speech Recognition", label="Task")
328
+ language_inputs = gr.Dropdown(choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
329
+ value="auto",
330
+ label="Language")
331
+ fn_button = gr.Button("Start", variant="primary")
332
+ text_outputs = gr.Textbox(label="Results")
333
+ gr.Examples(examples=audio_examples, inputs=[audio_inputs, language_inputs], examples_per_page=20)
334
+
335
+ fn_button.click(model_inference, inputs=[audio_inputs, language_inputs], outputs=text_outputs)
336
+ # with gr.Accordion("More examples"):
337
+ # gr.HTML(centered_table_html)
338
+ demo.launch()
339
+
340
+
341
+ if __name__ == "__main__":
342
+ # iface.launch()
343
+ launch()
344
+
345
+
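As a quick illustration of the tag handling in app.py above: SenseVoice emits inline markers such as <|en|>, <|HAPPY|> or <|Laughter|> in its raw transcript, and the format_str* helpers map them to emoji or strip them. The sketch below is a minimal, self-contained approximation on a made-up tagged string; it is not part of the commit, and it omits the per-segment deduplication and event/emotion reordering that format_str_v3 performs.

import re

# Tag-to-emoji mapping, a small subset of the emoji_dict defined in app.py above.
TAG_EMOJI = {
    "<|HAPPY|>": "😊", "<|SAD|>": "😔", "<|ANGRY|>": "😡",
    "<|Laughter|>": "😀", "<|BGM|>": "🎼", "<|Applause|>": "👏",
}

def strip_tags(raw: str) -> str:
    """Replace known tags with emoji, then drop any remaining <|...|> markers."""
    for tag, emoji in TAG_EMOJI.items():
        raw = raw.replace(tag, emoji)
    return re.sub(r"<\|[^|]*\|>", "", raw).strip()

# Hypothetical raw output in the tag format shown above, not real model output.
raw = "<|en|><|HAPPY|><|Speech|><|withitn|>That was an amazing show!<|Laughter|>"
print(strip_tags(raw))  # -> 😊That was an amazing show!😀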
example/emo_1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2729e565c98979826d9335b5563fdc79e9bc1b4ab256f67f38adc8e8c2c1646b
3
+ size 87820
example/emo_2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26de72f71c7428451b4f62920f56b5853f4d235069f4281816587dc1d7ad0e05
3
+ size 117900
example/emo_3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2648a25d1d5012f81da3fd7ea4ec82ca3b6ddeaae89996e474ce81630fdabf69
3
+ size 80780
example/emo_4.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45834a358ef3e95dae6d7e2643204878dbd28f2b3c2055dec588a8064b5472a6
3
+ size 87084
example/en.mp3 ADDED
Binary file (57.4 kB).
 
example/event_1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2de04b4d958ab53a2f66653f66bc640757bb07cacbb1a614c28777f505c9aa20
3
+ size 441044
example/event_2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a254709bbf039a760772033127d7ddd1b1296827c1c4bf91397c4eed47630415
3
+ size 441044
example/event_3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d3237552ea143411a563b9e886f870df5870d31cb3272ae33f2588ba5076fe7
3
+ size 441044
example/ja.mp3 ADDED
Binary file (57.8 kB).
 
example/ko.mp3 ADDED
Binary file (27.9 kB).
 
example/longwav_1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49ff8d0f20e7c1e9a46f2c40e5eadd2df145bd2be83735e454790571b08a12f1
3
+ size 5925776
example/longwav_2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae93488f4091552861fe36f0df4cce01b4f93d4ccdf846979937ab1e4793a4d4
3
+ size 8742988
example/longwav_3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe4c9b3ea090c399630266b54fb1d7ff864162b5f82749cd312db16806932a29
3
+ size 7033468
example/longwav_4.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76efd05d140a38f48d976222a39ec411931db122a54e3f0d3cda3b4a7a1485c8
3
+ size 2783624
example/rich_1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d650ab4f08bd3a5d3d59fd67d085fd0765ddabe4729294f4106fceece321fdf
3
+ size 571456
example/rich_2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3384e2e768ba11d921f3fcbfee5ebc18c70323f18ab41d2c93931e4cf26e45ff
3
+ size 320044
example/rich_3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17ff9c06cfc608e13dec503ae2e5015b1748fd5b9509b2080e31089eec2af7cc
3
+ size 206986
example/yue.mp3 ADDED
Binary file (31.2 kB).
 
example/zh.mp3 ADDED
Binary file (45 kB).
 
model.py ADDED
@@ -0,0 +1,898 @@
1
+ from typing import Iterable, Optional
2
+ import types
3
+ import time
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import Tensor
8
+ from torch import nn
9
+ from torch.cuda.amp import autocast
10
+ from funasr.metrics.compute_acc import compute_accuracy, th_accuracy
11
+ from funasr.losses.label_smoothing_loss import LabelSmoothingLoss
12
+ from funasr.train_utils.device_funcs import force_gatherable
13
+
14
+ from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
15
+ from funasr.utils.datadir_writer import DatadirWriter
16
+ from funasr.models.ctc.ctc import CTC
17
+
18
+ from funasr.register import tables
19
+
20
+
21
+ from funasr.models.paraformer.search import Hypothesis
22
+
23
+
24
+ class SinusoidalPositionEncoder(torch.nn.Module):
25
+ """ """
26
+
27
+ def __init__(self, d_model=80, dropout_rate=0.1):
28
+ super().__init__()
29
+
30
+ def encode(
31
+ self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32
32
+ ):
33
+ batch_size = positions.size(0)
34
+ positions = positions.type(dtype)
35
+ device = positions.device
36
+ log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype, device=device)) / (
37
+ depth / 2 - 1
38
+ )
39
+ inv_timescales = torch.exp(
40
+ torch.arange(depth / 2, device=device).type(dtype) * (-log_timescale_increment)
41
+ )
42
+ inv_timescales = torch.reshape(inv_timescales, [batch_size, -1])
43
+ scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape(
44
+ inv_timescales, [1, 1, -1]
45
+ )
46
+ encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2)
47
+ return encoding.type(dtype)
48
+
49
+ def forward(self, x):
50
+ batch_size, timesteps, input_dim = x.size()
51
+ positions = torch.arange(1, timesteps + 1, device=x.device)[None, :]
52
+ position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
53
+
54
+ return x + position_encoding
55
+
56
+
57
+ class PositionwiseFeedForward(torch.nn.Module):
58
+ """Positionwise feed forward layer.
59
+
60
+ Args:
61
+ idim (int): Input dimension.
62
+ hidden_units (int): The number of hidden units.
63
+ dropout_rate (float): Dropout rate.
64
+
65
+ """
66
+
67
+ def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
68
+ """Construct a PositionwiseFeedForward object."""
69
+ super(PositionwiseFeedForward, self).__init__()
70
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
71
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
72
+ self.dropout = torch.nn.Dropout(dropout_rate)
73
+ self.activation = activation
74
+
75
+ def forward(self, x):
76
+ """Forward function."""
77
+ return self.w_2(self.dropout(self.activation(self.w_1(x))))
78
+
79
+
80
+ class MultiHeadedAttentionSANM(nn.Module):
81
+ """Multi-Head Attention layer.
82
+
83
+ Args:
84
+ n_head (int): The number of heads.
85
+ n_feat (int): The number of features.
86
+ dropout_rate (float): Dropout rate.
87
+
88
+ """
89
+
90
+ def __init__(
91
+ self,
92
+ n_head,
93
+ in_feat,
94
+ n_feat,
95
+ dropout_rate,
96
+ kernel_size,
97
+ sanm_shfit=0,
98
+ lora_list=None,
99
+ lora_rank=8,
100
+ lora_alpha=16,
101
+ lora_dropout=0.1,
102
+ ):
103
+ """Construct a MultiHeadedAttention object."""
104
+ super().__init__()
105
+ assert n_feat % n_head == 0
106
+ # We assume d_v always equals d_k
107
+ self.d_k = n_feat // n_head
108
+ self.h = n_head
109
+ # self.linear_q = nn.Linear(n_feat, n_feat)
110
+ # self.linear_k = nn.Linear(n_feat, n_feat)
111
+ # self.linear_v = nn.Linear(n_feat, n_feat)
112
+
113
+ self.linear_out = nn.Linear(n_feat, n_feat)
114
+ self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3)
115
+ self.attn = None
116
+ self.dropout = nn.Dropout(p=dropout_rate)
117
+
118
+ self.fsmn_block = nn.Conv1d(
119
+ n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False
120
+ )
121
+ # padding
122
+ left_padding = (kernel_size - 1) // 2
123
+ if sanm_shfit > 0:
124
+ left_padding = left_padding + sanm_shfit
125
+ right_padding = kernel_size - 1 - left_padding
126
+ self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0)
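+ # The FSMN branch below is a depthwise 1-D convolution over the value sequence; its output is added to the attention output as a memory term.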
127
+
128
+ def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None):
129
+ b, t, d = inputs.size()
130
+ if mask is not None:
131
+ mask = torch.reshape(mask, (b, -1, 1))
132
+ if mask_shfit_chunk is not None:
133
+ mask = mask * mask_shfit_chunk
134
+ inputs = inputs * mask
135
+
136
+ x = inputs.transpose(1, 2)
137
+ x = self.pad_fn(x)
138
+ x = self.fsmn_block(x)
139
+ x = x.transpose(1, 2)
140
+ x += inputs
141
+ x = self.dropout(x)
142
+ if mask is not None:
143
+ x = x * mask
144
+ return x
145
+
146
+ def forward_qkv(self, x):
147
+ """Transform query, key and value.
148
+
149
+ Args:
150
+ query (torch.Tensor): Query tensor (#batch, time1, size).
151
+ key (torch.Tensor): Key tensor (#batch, time2, size).
152
+ value (torch.Tensor): Value tensor (#batch, time2, size).
153
+
154
+ Returns:
155
+ torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
156
+ torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
157
+ torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
158
+
159
+ """
160
+ b, t, d = x.size()
161
+ q_k_v = self.linear_q_k_v(x)
162
+ q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1)
163
+ q_h = torch.reshape(q, (b, t, self.h, self.d_k)).transpose(
164
+ 1, 2
165
+ ) # (batch, head, time1, d_k)
166
+ k_h = torch.reshape(k, (b, t, self.h, self.d_k)).transpose(
167
+ 1, 2
168
+ ) # (batch, head, time2, d_k)
169
+ v_h = torch.reshape(v, (b, t, self.h, self.d_k)).transpose(
170
+ 1, 2
171
+ ) # (batch, head, time2, d_k)
172
+
173
+ return q_h, k_h, v_h, v
174
+
175
+ def forward_attention(self, value, scores, mask, mask_att_chunk_encoder=None):
176
+ """Compute attention context vector.
177
+
178
+ Args:
179
+ value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
180
+ scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
181
+ mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
182
+
183
+ Returns:
184
+ torch.Tensor: Transformed value (#batch, time1, d_model)
185
+ weighted by the attention score (#batch, time1, time2).
186
+
187
+ """
188
+ n_batch = value.size(0)
189
+ if mask is not None:
190
+ if mask_att_chunk_encoder is not None:
191
+ mask = mask * mask_att_chunk_encoder
192
+
193
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
194
+
195
+ min_value = -float(
196
+ "inf"
197
+ ) # float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min)
198
+ scores = scores.masked_fill(mask, min_value)
199
+ self.attn = torch.softmax(scores, dim=-1).masked_fill(
200
+ mask, 0.0
201
+ ) # (batch, head, time1, time2)
202
+ else:
203
+ self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
204
+
205
+ p_attn = self.dropout(self.attn)
206
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
207
+ x = (
208
+ x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
209
+ ) # (batch, time1, d_model)
210
+
211
+ return self.linear_out(x) # (batch, time1, d_model)
212
+
213
+ def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
214
+ """Compute scaled dot product attention.
215
+
216
+ Args:
217
+ query (torch.Tensor): Query tensor (#batch, time1, size).
218
+ key (torch.Tensor): Key tensor (#batch, time2, size).
219
+ value (torch.Tensor): Value tensor (#batch, time2, size).
220
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
221
+ (#batch, time1, time2).
222
+
223
+ Returns:
224
+ torch.Tensor: Output tensor (#batch, time1, d_model).
225
+
226
+ """
227
+ q_h, k_h, v_h, v = self.forward_qkv(x)
228
+ fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk)
229
+ q_h = q_h * self.d_k ** (-0.5)
230
+ scores = torch.matmul(q_h, k_h.transpose(-2, -1))
231
+ att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
232
+ return att_outs + fsmn_memory
233
+
234
+ def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
235
+ """Compute scaled dot product attention.
236
+
237
+ Args:
238
+ query (torch.Tensor): Query tensor (#batch, time1, size).
239
+ key (torch.Tensor): Key tensor (#batch, time2, size).
240
+ value (torch.Tensor): Value tensor (#batch, time2, size).
241
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
242
+ (#batch, time1, time2).
243
+
244
+ Returns:
245
+ torch.Tensor: Output tensor (#batch, time1, d_model).
246
+
247
+ """
248
+ q_h, k_h, v_h, v = self.forward_qkv(x)
249
+ if chunk_size is not None and look_back > 0 or look_back == -1:
250
+ if cache is not None:
251
+ k_h_stride = k_h[:, :, : -(chunk_size[2]), :]
252
+ v_h_stride = v_h[:, :, : -(chunk_size[2]), :]
253
+ k_h = torch.cat((cache["k"], k_h), dim=2)
254
+ v_h = torch.cat((cache["v"], v_h), dim=2)
255
+
256
+ cache["k"] = torch.cat((cache["k"], k_h_stride), dim=2)
257
+ cache["v"] = torch.cat((cache["v"], v_h_stride), dim=2)
258
+ if look_back != -1:
259
+ cache["k"] = cache["k"][:, :, -(look_back * chunk_size[1]) :, :]
260
+ cache["v"] = cache["v"][:, :, -(look_back * chunk_size[1]) :, :]
261
+ else:
262
+ cache_tmp = {
263
+ "k": k_h[:, :, : -(chunk_size[2]), :],
264
+ "v": v_h[:, :, : -(chunk_size[2]), :],
265
+ }
266
+ cache = cache_tmp
267
+ fsmn_memory = self.forward_fsmn(v, None)
268
+ q_h = q_h * self.d_k ** (-0.5)
269
+ scores = torch.matmul(q_h, k_h.transpose(-2, -1))
270
+ att_outs = self.forward_attention(v_h, scores, None)
271
+ return att_outs + fsmn_memory, cache
272
+
273
+
274
+ class LayerNorm(nn.LayerNorm):
275
+ def __init__(self, *args, **kwargs):
276
+ super().__init__(*args, **kwargs)
277
+
278
+ def forward(self, input):
279
+ output = F.layer_norm(
280
+ input.float(),
281
+ self.normalized_shape,
282
+ self.weight.float() if self.weight is not None else None,
283
+ self.bias.float() if self.bias is not None else None,
284
+ self.eps,
285
+ )
286
+ return output.type_as(input)
287
+
288
+
289
+ def sequence_mask(lengths, maxlen=None, dtype=torch.float32, device=None):
290
+ if maxlen is None:
291
+ maxlen = lengths.max()
292
+ row_vector = torch.arange(0, maxlen, 1).to(lengths.device)
293
+ matrix = torch.unsqueeze(lengths, dim=-1)
294
+ mask = row_vector < matrix
295
+ mask = mask.detach()
296
+
297
+ return mask.type(dtype).to(device) if device is not None else mask.type(dtype)
298
+
299
+
300
+ class EncoderLayerSANM(nn.Module):
301
+ def __init__(
302
+ self,
303
+ in_size,
304
+ size,
305
+ self_attn,
306
+ feed_forward,
307
+ dropout_rate,
308
+ normalize_before=True,
309
+ concat_after=False,
310
+ stochastic_depth_rate=0.0,
311
+ ):
312
+ """Construct an EncoderLayer object."""
313
+ super(EncoderLayerSANM, self).__init__()
314
+ self.self_attn = self_attn
315
+ self.feed_forward = feed_forward
316
+ self.norm1 = LayerNorm(in_size)
317
+ self.norm2 = LayerNorm(size)
318
+ self.dropout = nn.Dropout(dropout_rate)
319
+ self.in_size = in_size
320
+ self.size = size
321
+ self.normalize_before = normalize_before
322
+ self.concat_after = concat_after
323
+ if self.concat_after:
324
+ self.concat_linear = nn.Linear(size + size, size)
325
+ self.stochastic_depth_rate = stochastic_depth_rate
326
+ self.dropout_rate = dropout_rate
327
+
328
+ def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
329
+ """Compute encoded features.
330
+
331
+ Args:
332
+ x_input (torch.Tensor): Input tensor (#batch, time, size).
333
+ mask (torch.Tensor): Mask tensor for the input (#batch, time).
334
+ cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
335
+
336
+ Returns:
337
+ torch.Tensor: Output tensor (#batch, time, size).
338
+ torch.Tensor: Mask tensor (#batch, time).
339
+
340
+ """
341
+ skip_layer = False
342
+ # with stochastic depth, residual connection `x + f(x)` becomes
343
+ # `x <- x + 1 / (1 - p) * f(x)` at training time.
344
+ stoch_layer_coeff = 1.0
345
+ if self.training and self.stochastic_depth_rate > 0:
346
+ skip_layer = torch.rand(1).item() < self.stochastic_depth_rate
347
+ stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
348
+
349
+ if skip_layer:
350
+ if cache is not None:
351
+ x = torch.cat([cache, x], dim=1)
352
+ return x, mask
353
+
354
+ residual = x
355
+ if self.normalize_before:
356
+ x = self.norm1(x)
357
+
358
+ if self.concat_after:
359
+ x_concat = torch.cat(
360
+ (
361
+ x,
362
+ self.self_attn(
363
+ x,
364
+ mask,
365
+ mask_shfit_chunk=mask_shfit_chunk,
366
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
367
+ ),
368
+ ),
369
+ dim=-1,
370
+ )
371
+ if self.in_size == self.size:
372
+ x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
373
+ else:
374
+ x = stoch_layer_coeff * self.concat_linear(x_concat)
375
+ else:
376
+ if self.in_size == self.size:
377
+ x = residual + stoch_layer_coeff * self.dropout(
378
+ self.self_attn(
379
+ x,
380
+ mask,
381
+ mask_shfit_chunk=mask_shfit_chunk,
382
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
383
+ )
384
+ )
385
+ else:
386
+ x = stoch_layer_coeff * self.dropout(
387
+ self.self_attn(
388
+ x,
389
+ mask,
390
+ mask_shfit_chunk=mask_shfit_chunk,
391
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
392
+ )
393
+ )
394
+ if not self.normalize_before:
395
+ x = self.norm1(x)
396
+
397
+ residual = x
398
+ if self.normalize_before:
399
+ x = self.norm2(x)
400
+ x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x))
401
+ if not self.normalize_before:
402
+ x = self.norm2(x)
403
+
404
+ return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
405
+
406
+ def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
407
+ """Compute encoded features.
408
+
409
+ Args:
410
+ x_input (torch.Tensor): Input tensor (#batch, time, size).
411
+ mask (torch.Tensor): Mask tensor for the input (#batch, time).
412
+ cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
413
+
414
+ Returns:
415
+ torch.Tensor: Output tensor (#batch, time, size).
416
+ torch.Tensor: Mask tensor (#batch, time).
417
+
418
+ """
419
+
420
+ residual = x
421
+ if self.normalize_before:
422
+ x = self.norm1(x)
423
+
424
+ if self.in_size == self.size:
425
+ attn, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)
426
+ x = residual + attn
427
+ else:
428
+ x, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)
429
+
430
+ if not self.normalize_before:
431
+ x = self.norm1(x)
432
+
433
+ residual = x
434
+ if self.normalize_before:
435
+ x = self.norm2(x)
436
+ x = residual + self.feed_forward(x)
437
+ if not self.normalize_before:
438
+ x = self.norm2(x)
439
+
440
+ return x, cache
441
+
442
+
443
+ @tables.register("encoder_classes", "SenseVoiceEncoderSmall")
444
+ class SenseVoiceEncoderSmall(nn.Module):
445
+ """
446
+ Author: Speech Lab of DAMO Academy, Alibaba Group
447
+ SCAMA: Streaming chunk-aware multihead attention for online end-to-end speech recognition
448
+ https://arxiv.org/abs/2006.01713
449
+ """
450
+
451
+ def __init__(
452
+ self,
453
+ input_size: int,
454
+ output_size: int = 256,
455
+ attention_heads: int = 4,
456
+ linear_units: int = 2048,
457
+ num_blocks: int = 6,
458
+ tp_blocks: int = 0,
459
+ dropout_rate: float = 0.1,
460
+ positional_dropout_rate: float = 0.1,
461
+ attention_dropout_rate: float = 0.0,
462
+ stochastic_depth_rate: float = 0.0,
463
+ input_layer: Optional[str] = "conv2d",
464
+ pos_enc_class=SinusoidalPositionEncoder,
465
+ normalize_before: bool = True,
466
+ concat_after: bool = False,
467
+ positionwise_layer_type: str = "linear",
468
+ positionwise_conv_kernel_size: int = 1,
469
+ padding_idx: int = -1,
470
+ kernel_size: int = 11,
471
+ sanm_shfit: int = 0,
472
+ selfattention_layer_type: str = "sanm",
473
+ **kwargs,
474
+ ):
475
+ super().__init__()
476
+ self._output_size = output_size
477
+
478
+ self.embed = SinusoidalPositionEncoder()
479
+
480
+ self.normalize_before = normalize_before
481
+
482
+ positionwise_layer = PositionwiseFeedForward
483
+ positionwise_layer_args = (
484
+ output_size,
485
+ linear_units,
486
+ dropout_rate,
487
+ )
488
+
489
+ encoder_selfattn_layer = MultiHeadedAttentionSANM
490
+ encoder_selfattn_layer_args0 = (
491
+ attention_heads,
492
+ input_size,
493
+ output_size,
494
+ attention_dropout_rate,
495
+ kernel_size,
496
+ sanm_shfit,
497
+ )
498
+ encoder_selfattn_layer_args = (
499
+ attention_heads,
500
+ output_size,
501
+ output_size,
502
+ attention_dropout_rate,
503
+ kernel_size,
504
+ sanm_shfit,
505
+ )
506
+
507
+ self.encoders0 = nn.ModuleList(
508
+ [
509
+ EncoderLayerSANM(
510
+ input_size,
511
+ output_size,
512
+ encoder_selfattn_layer(*encoder_selfattn_layer_args0),
513
+ positionwise_layer(*positionwise_layer_args),
514
+ dropout_rate,
515
+ )
516
+ for i in range(1)
517
+ ]
518
+ )
519
+ self.encoders = nn.ModuleList(
520
+ [
521
+ EncoderLayerSANM(
522
+ output_size,
523
+ output_size,
524
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
525
+ positionwise_layer(*positionwise_layer_args),
526
+ dropout_rate,
527
+ )
528
+ for i in range(num_blocks - 1)
529
+ ]
530
+ )
531
+
532
+ self.tp_encoders = nn.ModuleList(
533
+ [
534
+ EncoderLayerSANM(
535
+ output_size,
536
+ output_size,
537
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
538
+ positionwise_layer(*positionwise_layer_args),
539
+ dropout_rate,
540
+ )
541
+ for i in range(tp_blocks)
542
+ ]
543
+ )
544
+
545
+ self.after_norm = LayerNorm(output_size)
546
+
547
+ self.tp_norm = LayerNorm(output_size)
548
+
549
+ def output_size(self) -> int:
550
+ return self._output_size
551
+
552
+ def forward(
553
+ self,
554
+ xs_pad: torch.Tensor,
555
+ ilens: torch.Tensor,
556
+ ):
557
+ """Embed positions in tensor."""
558
+ masks = sequence_mask(ilens, device=ilens.device)[:, None, :]
559
+
560
+ xs_pad *= self.output_size() ** 0.5
561
+
562
+ xs_pad = self.embed(xs_pad)
563
+
564
+ # forward encoder1
565
+ for layer_idx, encoder_layer in enumerate(self.encoders0):
566
+ encoder_outs = encoder_layer(xs_pad, masks)
567
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
568
+
569
+ for layer_idx, encoder_layer in enumerate(self.encoders):
570
+ encoder_outs = encoder_layer(xs_pad, masks)
571
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
572
+
573
+ xs_pad = self.after_norm(xs_pad)
574
+
575
+ # forward encoder2
576
+ olens = masks.squeeze(1).sum(1).int()
577
+
578
+ for layer_idx, encoder_layer in enumerate(self.tp_encoders):
579
+ encoder_outs = encoder_layer(xs_pad, masks)
580
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
581
+
582
+ xs_pad = self.tp_norm(xs_pad)
583
+ return xs_pad, olens
584
+
585
+
586
+ @tables.register("model_classes", "SenseVoiceSmall")
587
+ class SenseVoiceSmall(nn.Module):
588
+ """CTC-attention hybrid Encoder-Decoder model"""
589
+
590
+ def __init__(
591
+ self,
592
+ specaug: str = None,
593
+ specaug_conf: dict = None,
594
+ normalize: str = None,
595
+ normalize_conf: dict = None,
596
+ encoder: str = None,
597
+ encoder_conf: dict = None,
598
+ ctc_conf: dict = None,
599
+ input_size: int = 80,
600
+ vocab_size: int = -1,
601
+ ignore_id: int = -1,
602
+ blank_id: int = 0,
603
+ sos: int = 1,
604
+ eos: int = 2,
605
+ length_normalized_loss: bool = False,
606
+ **kwargs,
607
+ ):
608
+
609
+ super().__init__()
610
+
611
+ if specaug is not None:
612
+ specaug_class = tables.specaug_classes.get(specaug)
613
+ specaug = specaug_class(**specaug_conf)
614
+ if normalize is not None:
615
+ normalize_class = tables.normalize_classes.get(normalize)
616
+ normalize = normalize_class(**normalize_conf)
617
+ encoder_class = tables.encoder_classes.get(encoder)
618
+ encoder = encoder_class(input_size=input_size, **encoder_conf)
619
+ encoder_output_size = encoder.output_size()
620
+
621
+ if ctc_conf is None:
622
+ ctc_conf = {}
623
+ ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf)
624
+
625
+ self.blank_id = blank_id
626
+ self.sos = sos if sos is not None else vocab_size - 1
627
+ self.eos = eos if eos is not None else vocab_size - 1
628
+ self.vocab_size = vocab_size
629
+ self.ignore_id = ignore_id
630
+ self.specaug = specaug
631
+ self.normalize = normalize
632
+ self.encoder = encoder
633
+ self.error_calculator = None
634
+
635
+ self.ctc = ctc
636
+
637
+ self.length_normalized_loss = length_normalized_loss
638
+ self.encoder_output_size = encoder_output_size
639
+
640
+ self.lid_dict = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13}
641
+ self.lid_int_dict = {24884: 3, 24885: 4, 24888: 7, 24892: 11, 24896: 12, 24992: 13}
642
+ self.textnorm_dict = {"withitn": 14, "woitn": 15}
643
+ self.textnorm_int_dict = {25016: 14, 25017: 15}
644
+ self.embed = torch.nn.Embedding(7 + len(self.lid_dict) + len(self.textnorm_dict), input_size)
645
+
646
+ self.criterion_att = LabelSmoothingLoss(
647
+ size=self.vocab_size,
648
+ padding_idx=self.ignore_id,
649
+ smoothing=kwargs.get("lsm_weight", 0.0),
650
+ normalize_length=self.length_normalized_loss,
651
+ )
652
+
653
+ @staticmethod
654
+ def from_pretrained(model:str=None, **kwargs):
655
+ from funasr import AutoModel
656
+ model, kwargs = AutoModel.build_model(model=model, trust_remote_code=True, **kwargs)
657
+
658
+ return model, kwargs
659
+
660
+ def forward(
661
+ self,
662
+ speech: torch.Tensor,
663
+ speech_lengths: torch.Tensor,
664
+ text: torch.Tensor,
665
+ text_lengths: torch.Tensor,
666
+ **kwargs,
667
+ ):
668
+ """Encoder + Decoder + Calc loss
669
+ Args:
670
+ speech: (Batch, Length, ...)
671
+ speech_lengths: (Batch, )
672
+ text: (Batch, Length)
673
+ text_lengths: (Batch,)
674
+ """
675
+ # import pdb;
676
+ # pdb.set_trace()
677
+ if len(text_lengths.size()) > 1:
678
+ text_lengths = text_lengths[:, 0]
679
+ if len(speech_lengths.size()) > 1:
680
+ speech_lengths = speech_lengths[:, 0]
681
+
682
+ batch_size = speech.shape[0]
683
+
684
+ # 1. Encoder
685
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, text)
686
+
687
+ loss_ctc, cer_ctc = None, None
688
+ loss_rich, acc_rich = None, None
689
+ stats = dict()
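+ # The first four encoder positions carry the language/event/emotion/text-norm queries: they get the rich-attribute CE loss, while CTC is computed on the remaining frames.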
690
+
691
+ loss_ctc, cer_ctc = self._calc_ctc_loss(
692
+ encoder_out[:, 4:, :], encoder_out_lens - 4, text[:, 4:], text_lengths - 4
693
+ )
694
+
695
+ loss_rich, acc_rich = self._calc_rich_ce_loss(
696
+ encoder_out[:, :4, :], text[:, :4]
697
+ )
698
+
699
+ loss = loss_ctc
700
+ # Collect total loss stats
701
+ stats["loss"] = torch.clone(loss.detach()) if loss_ctc is not None else None
702
+ stats["loss_rich"] = torch.clone(loss_rich.detach()) if loss_rich is not None else None
703
+ stats["acc_rich"] = acc_rich
704
+
705
+ # force_gatherable: to-device and to-tensor if scalar for DataParallel
706
+ if self.length_normalized_loss:
707
+ batch_size = int((text_lengths + 1).sum())
708
+ loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
709
+ return loss, stats, weight
710
+
711
+ def encode(
712
+ self,
713
+ speech: torch.Tensor,
714
+ speech_lengths: torch.Tensor,
715
+ text: torch.Tensor,
716
+ **kwargs,
717
+ ):
718
+ """Frontend + Encoder. Note that this method is used by asr_inference.py
719
+ Args:
720
+ speech: (Batch, Length, ...)
721
+ speech_lengths: (Batch, )
722
+ ind: int
723
+ """
724
+
725
+ # Data augmentation
726
+ if self.specaug is not None and self.training:
727
+ speech, speech_lengths = self.specaug(speech, speech_lengths)
728
+
729
+ # Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
730
+ if self.normalize is not None:
731
+ speech, speech_lengths = self.normalize(speech, speech_lengths)
732
+
733
+
734
+ lids = torch.LongTensor([[self.lid_int_dict[int(lid)] if torch.rand(1) > 0.2 and int(lid) in self.lid_int_dict else 0 ] for lid in text[:, 0]]).to(speech.device)
735
+ language_query = self.embed(lids)
736
+
737
+ styles = torch.LongTensor([[self.textnorm_int_dict[int(style)]] for style in text[:, 3]]).to(speech.device)
738
+ style_query = self.embed(styles)
739
+ speech = torch.cat((style_query, speech), dim=1)
740
+ speech_lengths += 1
741
+
742
+ event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat(speech.size(0), 1, 1)
743
+ input_query = torch.cat((language_query, event_emo_query), dim=1)
744
+ speech = torch.cat((input_query, speech), dim=1)
745
+ speech_lengths += 3
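+ # The encoder input is now [language, event, emotion, text-norm] query embeddings followed by the fbank frames (four extra positions), matching the index-4 split in forward().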
746
+
747
+ encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths)
748
+
749
+ return encoder_out, encoder_out_lens
750
+
751
+ def _calc_ctc_loss(
752
+ self,
753
+ encoder_out: torch.Tensor,
754
+ encoder_out_lens: torch.Tensor,
755
+ ys_pad: torch.Tensor,
756
+ ys_pad_lens: torch.Tensor,
757
+ ):
758
+ # Calc CTC loss
759
+ loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)
760
+
761
+ # Calc CER using CTC
762
+ cer_ctc = None
763
+ if not self.training and self.error_calculator is not None:
764
+ ys_hat = self.ctc.argmax(encoder_out).data
765
+ cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
766
+ return loss_ctc, cer_ctc
767
+
768
+ def _calc_rich_ce_loss(
769
+ self,
770
+ encoder_out: torch.Tensor,
771
+ ys_pad: torch.Tensor,
772
+ ):
773
+ decoder_out = self.ctc.ctc_lo(encoder_out)
774
+ # 2. Compute attention loss
775
+ loss_rich = self.criterion_att(decoder_out, ys_pad.contiguous())
776
+ acc_rich = th_accuracy(
777
+ decoder_out.view(-1, self.vocab_size),
778
+ ys_pad.contiguous(),
779
+ ignore_label=self.ignore_id,
780
+ )
781
+
782
+ return loss_rich, acc_rich
783
+
784
+
785
+ def inference(
786
+ self,
787
+ data_in,
788
+ data_lengths=None,
789
+ key: list = ["wav_file_tmp_name"],
790
+ tokenizer=None,
791
+ frontend=None,
792
+ **kwargs,
793
+ ):
794
+
795
+
796
+ meta_data = {}
797
+ if (
798
+ isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank"
799
+ ): # fbank
800
+ speech, speech_lengths = data_in, data_lengths
801
+ if len(speech.shape) < 3:
802
+ speech = speech[None, :, :]
803
+ if speech_lengths is None:
804
+ speech_lengths = speech.shape[1]
805
+ else:
806
+ # extract fbank feats
807
+ time1 = time.perf_counter()
808
+ audio_sample_list = load_audio_text_image_video(
809
+ data_in,
810
+ fs=frontend.fs,
811
+ audio_fs=kwargs.get("fs", 16000),
812
+ data_type=kwargs.get("data_type", "sound"),
813
+ tokenizer=tokenizer,
814
+ )
815
+ time2 = time.perf_counter()
816
+ meta_data["load_data"] = f"{time2 - time1:0.3f}"
817
+ speech, speech_lengths = extract_fbank(
818
+ audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend
819
+ )
820
+ time3 = time.perf_counter()
821
+ meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
822
+ meta_data["batch_data_time"] = (
823
+ speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
824
+ )
825
+
826
+ speech = speech.to(device=kwargs["device"])
827
+ speech_lengths = speech_lengths.to(device=kwargs["device"])
828
+
829
+ language = kwargs.get("language", "auto")
830
+ language_query = self.embed(
831
+ torch.LongTensor(
832
+ [[self.lid_dict[language] if language in self.lid_dict else 0]]
833
+ ).to(speech.device)
834
+ ).repeat(speech.size(0), 1, 1)
835
+
836
+ use_itn = kwargs.get("use_itn", False)
837
+ textnorm = kwargs.get("text_norm", None)
838
+ if textnorm is None:
839
+ textnorm = "withitn" if use_itn else "woitn"
840
+ textnorm_query = self.embed(
841
+ torch.LongTensor([[self.textnorm_dict[textnorm]]]).to(speech.device)
842
+ ).repeat(speech.size(0), 1, 1)
843
+ speech = torch.cat((textnorm_query, speech), dim=1)
844
+ speech_lengths += 1
845
+
846
+ event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat(
847
+ speech.size(0), 1, 1
848
+ )
849
+ input_query = torch.cat((language_query, event_emo_query), dim=1)
850
+ speech = torch.cat((input_query, speech), dim=1)
851
+ speech_lengths += 3
852
+
853
+ # Encoder
854
+ encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths)
855
+ if isinstance(encoder_out, tuple):
856
+ encoder_out = encoder_out[0]
857
+
858
+ # c. Project the encoder output to CTC log-probabilities
859
+ ctc_logits = self.ctc.log_softmax(encoder_out)
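+ # Greedy CTC decoding follows: framewise argmax, collapse repeats, drop blank ids, then detokenize.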
860
+
861
+ results = []
862
+ b, n, d = encoder_out.size()
863
+ if isinstance(key[0], (list, tuple)):
864
+ key = key[0]
865
+ if len(key) < b:
866
+ key = key * b
867
+ for i in range(b):
868
+ x = ctc_logits[i, : encoder_out_lens[i].item(), :]
869
+ yseq = x.argmax(dim=-1)
870
+ yseq = torch.unique_consecutive(yseq, dim=-1)
871
+
872
+ ibest_writer = None
873
+ if kwargs.get("output_dir") is not None:
874
+ if not hasattr(self, "writer"):
875
+ self.writer = DatadirWriter(kwargs.get("output_dir"))
876
+ ibest_writer = self.writer[f"1best_recog"]
877
+
878
+ mask = yseq != self.blank_id
879
+ token_int = yseq[mask].tolist()
880
+
881
+ # Change integer-ids to tokens
882
+ text = tokenizer.decode(token_int)
883
+
884
+ result_i = {"key": key[i], "text": text}
885
+ results.append(result_i)
886
+
887
+ if ibest_writer is not None:
888
+ ibest_writer["text"][key[i]] = text
889
+
890
+ return results, meta_data
891
+
892
+ def export(self, **kwargs):
893
+ from .export_meta import export_rebuild_model
894
+
895
+ if "max_seq_len" not in kwargs:
896
+ kwargs["max_seq_len"] = 512
897
+ models = export_rebuild_model(model=self, **kwargs)
898
+ return models
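model.py above also exposes SenseVoiceSmall.from_pretrained and an inference method, so the model can be run without the Gradio front end. A minimal sketch, not part of the commit, assuming (as in FunASR's AutoModel convention) that the returned kwargs already carry the tokenizer, frontend, and device that inference reads; example/zh.mp3 is one of the demo clips added in this commit.

import torch
from model import SenseVoiceSmall

# from_pretrained wraps AutoModel.build_model with trust_remote_code=True (see above).
m, kwargs = SenseVoiceSmall.from_pretrained(model="iic/SenseVoiceSmall")
m.eval()

with torch.no_grad():
    results, meta = m.inference(
        data_in="example/zh.mp3",  # audio path; fbank tensors are also accepted
        language="auto",           # or "zh", "en", "yue", "ja", "ko", "nospeech"
        use_itn=True,              # request inverse text normalization (<|withitn|>)
        **kwargs,
    )

# inference() returns a list of {"key", "text"}; the text still contains <|...|> tags,
# which is why app.py post-processes it with format_str_v3.
print(results[0]["text"])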
requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ -i https://pypi.org/simple
2
+ funasr==1.0.28