ljy266987 committed on
Commit
56c3b64
1 Parent(s): 7c306ab
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.wav filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,345 @@
1
+ # coding=utf-8
2
+
3
+ import os
4
+ import librosa
5
+ import base64
6
+ import io
7
+ import gradio as gr
8
+ import re
9
+
10
+ import numpy as np
11
+ import torch
12
+ import torchaudio
13
+ from modelscope import HubApi
14
+
15
+ api = HubApi()
16
+
17
+ key = os.environ.get("apikey", "")
18
+ try:
19
+ api.login(key)
20
+ except Exception:
21
+ pass
22
+
23
+ from funasr import AutoModel
24
+
25
+ # model = "/Users/zhifu/Downloads/modelscope_models/SenseVoiceSmall"
26
+ model = "iic/SenseVoiceSmall"
27
+ model = AutoModel(model=model,
28
+ vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
29
+ vad_kwargs={"max_single_segment_time": 30000},
30
+ trust_remote_code=True,
31
+ )
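+ # The FSMN VAD front end segments long recordings into chunks of at most 30 s (max_single_segment_time is in ms) before recognition.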
32
+
33
+ import re
34
+
35
+ emo_dict = {
36
+ "<|HAPPY|>": "😊",
37
+ "<|SAD|>": "😔",
38
+ "<|ANGRY|>": "😡",
39
+ "<|NEUTRAL|>": "",
40
+ "<|FEARFUL|>": "😰",
41
+ "<|DISGUSTED|>": "🤢",
42
+ "<|SURPRISED|>": "😮",
43
+ }
44
+
45
+ event_dict = {
46
+ "<|BGM|>": "🎼",
47
+ "<|Speech|>": "",
48
+ "<|Applause|>": "👏",
49
+ "<|Laughter|>": "😀",
50
+ "<|Cry|>": "😭",
51
+ "<|Sneeze|>": "🤧",
52
+ "<|Breath|>": "",
53
+ "<|Cough|>": "🤧",
54
+ }
55
+
56
+ emoji_dict = {
57
+ "<|nospeech|><|Event_UNK|>": "❓",
58
+ "<|zh|>": "",
59
+ "<|en|>": "",
60
+ "<|yue|>": "",
61
+ "<|ja|>": "",
62
+ "<|ko|>": "",
63
+ "<|nospeech|>": "",
64
+ "<|HAPPY|>": "😊",
65
+ "<|SAD|>": "😔",
66
+ "<|ANGRY|>": "😡",
67
+ "<|NEUTRAL|>": "",
68
+ "<|BGM|>": "🎼",
69
+ "<|Speech|>": "",
70
+ "<|Applause|>": "👏",
71
+ "<|Laughter|>": "😀",
72
+ "<|FEARFUL|>": "😰",
73
+ "<|DISGUSTED|>": "🤢",
74
+ "<|SURPRISED|>": "😮",
75
+ "<|Cry|>": "😭",
76
+ "<|EMO_UNKNOWN|>": "",
77
+ "<|Sneeze|>": "🤧",
78
+ "<|Breath|>": "",
79
+ "<|Cough|>": "😷",
80
+ "<|Sing|>": "",
81
+ "<|Speech_Noise|>": "",
82
+ "<|withitn|>": "",
83
+ "<|woitn|>": "",
84
+ "<|GBG|>": "",
85
+ "<|Event_UNK|>": "",
86
+ }
87
+
88
+ lang_dict = {
89
+ "<|zh|>": "<|lang|>",
90
+ "<|en|>": "<|lang|>",
91
+ "<|yue|>": "<|lang|>",
92
+ "<|ja|>": "<|lang|>",
93
+ "<|ko|>": "<|lang|>",
94
+ "<|nospeech|>": "<|lang|>",
95
+ }
96
+
97
+ emo_set = {"😊", "😔", "😡", "😰", "🤢", "😮"}
98
+ event_set = {"🎼", "👏", "😀", "😭", "🤧", "😷",}
99
+
100
+ def format_str(s):
101
+ for sptk in emoji_dict:
102
+ s = s.replace(sptk, emoji_dict[sptk])
103
+ return s
104
+
105
+
106
+ def format_str_v2(s):
107
+ sptk_dict = {}
108
+ for sptk in emoji_dict:
109
+ sptk_dict[sptk] = s.count(sptk)
110
+ s = s.replace(sptk, "")
111
+ emo = "<|NEUTRAL|>"
112
+ for e in emo_dict:
113
+ if sptk_dict[e] > sptk_dict[emo]:
114
+ emo = e
115
+ for e in event_dict:
116
+ if sptk_dict[e] > 0:
117
+ s = event_dict[e] + s
118
+ s = s + emo_dict[emo]
119
+
120
+ for emoji in emo_set.union(event_set):
121
+ s = s.replace(" " + emoji, emoji)
122
+ s = s.replace(emoji + " ", emoji)
123
+ return s.strip()
124
+
125
+ def format_str_v3(s):
126
+ def get_emo(s):
127
+ return s[-1] if s[-1] in emo_set else None
128
+ def get_event(s):
129
+ return s[0] if s[0] in event_set else None
130
+
131
+ s = s.replace("<|nospeech|><|Event_UNK|>", "❓")
132
+ for lang in lang_dict:
133
+ s = s.replace(lang, "<|lang|>")
134
+ s_list = [format_str_v2(s_i).strip(" ") for s_i in s.split("<|lang|>")]
135
+ new_s = " " + s_list[0]
136
+ cur_ent_event = get_event(new_s)
137
+ for i in range(1, len(s_list)):
138
+ if len(s_list[i]) == 0:
139
+ continue
140
+ if get_event(s_list[i]) == cur_ent_event and get_event(s_list[i]) != None:
141
+ s_list[i] = s_list[i][1:]
142
+ #else:
143
+ cur_ent_event = get_event(s_list[i])
144
+ if get_emo(s_list[i]) != None and get_emo(s_list[i]) == get_emo(new_s):
145
+ new_s = new_s[:-1]
146
+ new_s += s_list[i].strip().lstrip()
147
+ new_s = new_s.replace("The.", " ")
148
+ return new_s.strip()
149
+
150
+ def model_inference(input_wav, language, fs=16000):
151
+ # task_abbr = {"Speech Recognition": "ASR", "Rich Text Transcription": ("ASR", "AED", "SER")}
152
+ language_abbr = {"auto": "auto", "zh": "zh", "en": "en", "yue": "yue", "ja": "ja", "ko": "ko",
153
+ "nospeech": "nospeech"}
154
+
155
+ # task = "Speech Recognition" if task is None else task
156
+ language = "auto" if len(language) < 1 else language
157
+ selected_language = language_abbr[language]
158
+ # selected_task = task_abbr.get(task)
159
+
160
+ # print(f"input_wav: {type(input_wav)}, {input_wav[1].shape}, {input_wav}")
161
+
162
+ if isinstance(input_wav, tuple):
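+ # Gradio's Audio component yields a (sample_rate, samples) tuple; the int16 samples are scaled to float32 in [-1, 1], downmixed to mono, and resampled to 16 kHz below.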
163
+ fs, input_wav = input_wav
164
+ input_wav = input_wav.astype(np.float32) / np.iinfo(np.int16).max
165
+ if len(input_wav.shape) > 1:
166
+ input_wav = input_wav.mean(-1)
167
+ if fs != 16000:
168
+ print(f"audio_fs: {fs}")
169
+ resampler = torchaudio.transforms.Resample(fs, 16000)
170
+ input_wav_t = torch.from_numpy(input_wav).to(torch.float32)
171
+ input_wav = resampler(input_wav_t[None, :])[0, :].numpy()
172
+
173
+ # DecodingOptions = {
174
+ # "task": selected_task,
175
+ # "language": selected_language,
176
+ # "fp16": True,
177
+ # "gain_event": True,
178
+ # }
179
+ merge_vad = True #False if selected_task == "ASR" else True
180
+ print(f"language: {language}, merge_vad: {merge_vad}")
181
+ text = model.generate(input=input_wav,
182
+ cache={},
183
+ language=language,
184
+ use_itn=True,
185
+ batch_size_s=0, merge_vad=merge_vad)
186
+ # if len(input_wav) > 16000*30:
187
+ # text = model.generate(input=input_wav, task=selected_task, language=language, batch_size_s=0)
188
+ # else:
189
+ # text = model.inference(input=input_wav, task=selected_task, language=language, batch_size_s=0)
190
+ print(text)
191
+ text = text[0]["text"]
192
+ #text = format_str(text)
193
+ #text = format_str_v2(text)
194
+ text = format_str_v3(text)
195
+ # text = distingush_speech(text)
196
+ # text = "".join(text)
197
+ print(text)
198
+
199
+ return text
200
+
201
+
202
+ audio_examples = [
203
+ ["example/zh.mp3", "zh"],
204
+ ["example/yue.mp3", "yue"],
205
+ ["example/en.mp3", "en"],
206
+ ["example/ja.mp3", "ja"],
207
+ ["example/ko.mp3", "ko"],
208
+ ["example/emo_1.wav", "auto"],
209
+ ["example/emo_2.wav", "auto"],
210
+ ["example/emo_3.wav", "auto"],
211
+ #["example/emo_4.wav", "auto"],
212
+ #["example/event_1.wav", "auto"],
213
+ #["example/event_2.wav", "auto"],
214
+ #["example/event_3.wav", "auto"],
215
+ ["example/rich_1.wav", "auto"],
216
+ ["example/rich_2.wav", "auto"],
217
+ #["example/rich_3.wav", "auto"],
218
+ ["example/longwav_1.wav", "auto"],
219
+ ["example/longwav_2.wav", "auto"],
220
+ ["example/longwav_3.wav", "auto"],
221
+ #["example/longwav_4.wav", "auto"],
222
+ ]
223
+
224
+
225
+ description = """
226
+
227
+ # SenseVoice is a speech foundation model with multiple speech understanding capabilities, including automatic speech recognition (ASR), spoken language identification (LID), speech emotion recognition (SER), and acoustic event classification (AEC) or acoustic event detection (AED).
228
+
229
+ ## Usage
230
+ ### Upload an audio file or record from the microphone, then select the language.
231
+
232
+ *Language*
233
+ - `auto`: the audio language will be detected automatically.
234
+ - A specific language (`zh`, `en`, `yue`, `ja`, `ko`, or `nospeech`) can also be selected from the dropdown.
235
+
236
+ Recommended audio input duration is below 30 seconds. For audio longer than 30 seconds, local deployment from the GitHub repo is recommended.
237
+
238
+ """
239
+
240
+ html_content = """
241
+ <div>
242
+ <h2 style="font-size: 22px;margin-left: 0px;">Voice Understanding Model: SenseVoice-Small</h2>
243
+ <p style="font-size: 18px;margin-left: 20px;">SenseVoice-Small is an encoder-only speech foundation model designed for rapid voice understanding. It encompasses a variety of features including automatic speech recognition (ASR), spoken language identification (LID), speech emotion recognition (SER), and acoustic event detection (AED). SenseVoice-Small supports multilingual recognition for Chinese, English, Cantonese, Japanese, and Korean. Additionally, it offers exceptionally low inference latency, performing 7 times faster than Whisper-small and 17 times faster than Whisper-large.</p>
244
+ <h2 style="font-size: 22px;margin-left: 0px;">Usage</h2> <p style="font-size: 18px;margin-left: 20px;">Upload an audio file or record from the microphone, then select the language. The audio is transcribed into the corresponding text along with the associated emotion (😊 happy, 😡 angry/excited, 😔 sad) and sound-event types (😀 laughter, 🎼 music, 👏 applause, 🤧 cough &amp; sneeze, 😭 cry). Event labels are placed at the beginning of the text and emotion labels at the end.</p>
245
+ <p style="font-size: 18px;margin-left: 20px;">Recommended audio input duration is below 30 seconds. For audio longer than 30 seconds, local deployment is recommended.</p>
246
+ <h2 style="font-size: 22px;margin-left: 0px;">Repo</h2>
247
+ <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/FunAudioLLM/SenseVoice" target="_blank">SenseVoice</a>: multilingual speech understanding model</p>
248
+ <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/modelscope/FunASR" target="_blank">FunASR</a>: fundamental speech recognition toolkit</p>
249
+ <p style="font-size: 18px;margin-left: 20px;"><a href="https://github.com/modelscope/CosyVoice" target="_blank">CosyVoice</a>: high-quality multilingual TTS model</p>
250
+ </div>
251
+ """
252
+
253
+ # HTML and CSS for the centered results table
254
+ centered_table_html = """
255
+ <style>
256
+ .centered-table {
257
+ margin-left: auto;
258
+ margin-right: auto;
259
+ }
260
+ </style>
261
+ <div class="centered-table">
262
+ <table border="1" style="border-collapse: collapse; width: 100%;">
263
+ <tr>
264
+ <th>Samples</th>
265
+ <th>Speech Recognition</th>
266
+ <th>Rich Text Transcription</th>
267
+ </tr>
268
+
269
+
270
+ <tr>
271
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E9%9F%A9%E8%AF%AD-%E8%8B%B1%E8%AF%AD-%E7%BB%BC%E8%89%BA.wav" target="_blank">韩语-英语-综艺</a></td>
272
+ <td>자, 둘, 셋! 안녕 하 세요?저는 새봄 영원 신화 입니다. 한 주간 잘 지내셨나요? 오늘은 어떤 영상을 보러 들을 거냐 혹시 밴드 음악 좋아해요 네 약간 락 그런 거 그냥 밴드요 밴드 음악 좋아하는 밴드 누구 있어요 저요 뭐 우리나라 아니어도 상관없어요 어...라디오 해군 아 진짜 형은요? FT 아일랜드 그런 그런 그룹 들이 많이 있 는데 그런 관련 된 중국 에 서도 밴드 관련 된 프로그램 이 있 다고 해요?그래서 그 프로그램 영상 을 한번 만나 볼 건데 그렇게 감안 해서 생각 하 시고, 보 면서 어떤 음악 을 하 는지 또 밴드 가, 밴드라는 지칭하는 단어가 여기 악기들이 들어가는 그런 연주잖아요 그래서 느낌이 다르지 안 다르지는 잘 모르겠는데 좀 전 비슷할 것 같아요 그래서 한번 보도록 하겠습니다 틀어주세요. え?うん。 なんか、何なんだ。 여기 왔다 약간 벨로그 こんばんは? 이디엠 이디엠인가 뭐야, 재밌어야? 哦。 Like, what is that? 세일 고려하는 게 그렇게 많다 아 저 뒤에 사람 있는 게 신기한 화면이 아니라 이거 레이저인 거야 화면이 레이저를 잘 할 필요가 없는데, 정전을 잘 못해? I'm 80 now. 是吧。 No touch. てるよ。 패트가 있다는 게 아니라 투명 투명</td>
273
+ <td>자, 둘, 셋 안녕하세요 저는 새봄 영원 신화입니다 자, 여러분들! 한주가 잘 지내셨나요 😀 ?오늘은 어떤 영상을 불러드릴 거냐 혹시 밴드 음악 좋아요? 그냥 밴드요 밴드 밴드 음악 밴드 음악이 락 아닌가 밴드 좋아하는 밴드 누구 있어요 저요 우리나라 아니어도 상관없어요 어...라디오 헤드 아 형은요? 🎵 FG 아일랜드 그런 그런 그룹 들이 많이 있 는데 그런 관련 된 중국 에 서도 밴드 관련 된 프로그램 이 있 다고 해요?그래서, 그 프로그램 영상 을 한번 만나 볼 건데 그렇게 감안 해서 생각 하 시고, 보 면서 어떤 음악 을 하 는지 또 밴드 가, 🎶 밴드라는 지칭하는 단어가 여기 악기들이 들어가는 그런 연주잖아요 그래서 느낌이 다르지 안 다르지는 잘 모르겠는데 좀 전 비슷할 것 같아요 그래서 한번 보도록 하겠습니다 자 틀어주세요. 🎵 들어왔다 약간 갤럭시 온라인? 🎶 😀 What is that. 🎵 되게 화려한데, 렉크가 아 저 뒤에 사람 있는 게 신기해 화면이 아니라 이거 레이저인 거야?레이저를 저렇게 조절이 되냐 누구 저게 와우. 🎶 👏 No touch. 👏 👏 👏 👏 👏 🎵 패스가 있다는 게 아니라 투명 투명. 🎶</td>
274
+
275
+ </tr>
276
+
277
+ <tr>
278
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E8%8B%B1%E6%96%87-%E6%AD%8C%E6%9B%B2.wav" target="_blank">英文-歌曲</a></td>
279
+ <td>What's up, I'm Morris, I'm playing Blood Incantation Hey, this is Jeff Barrett. And this is Isaac Falk, it's Paul Riedel from Blood Incantation here at Amoeba in Hollywood, we're doing What's in My Bag. So, yeah, I found. This CD, it's a. Fates warning, perfect symmetry. 在一。 It's the album they did right around the time of another album, Parallels, where they had a new singer, this is where this band really started going in the progressive direction, where very extended like solo sections and odd time signatures and stuff like that and yeah, very influential, actually parts of this were very influential on Hidden History of the Human Race and we were working on that. First thing in my bag is. Mordred Angels 1999 album, Formulas Fatal to the Flesh. I don't own this on vinyl, I think this is the second. Repress, huge influence on floating containment, if you can tell, one of our favorite records. And it's good to find it here. Yeah, total classic. The first thing I found was in the new arrivals bin. ста 90s reissue of the second Wallenstein album. Classic krautrock on the pills label, great psychedelic Prague krautrock with a little bit of folk and kind of symphonic aspect, killer. And a great cover. Yeah, mother universe. First thing I picked out, I went straight to the jazz section. Finds this record that. Changed my life, I got to see Dave Brubeck before he died when I was about 17 in New Jersey. The saxophone player on this, Paul Desmond, is also one of my favorite jazz players, super important record to me and my dad, which he grew on very quickly. My next thing. Is this Osamu Kitajima album? I have not heard this one, but I'm a really big fan of his stuff from the 70s and 80s, and this album has a 12 minute song, so I'm guessing it's going to be pretty cool, I just want to read the back of this because this was like really interesting to me, it says,"Higher Octave Music is founded upon the vision that we have entered an era of global cooperation and unity in which we have entered an era of global cooperation and unity, We are dedicated to a process of continuous refinement, both artistically and commercially, as in music, so in life. I was like, that sounds great, I'm going to buy that next I got the Carlos Santana John McLaughlin record. Love, devotion, surrender. Morris picked it up at another record store on this tour, they're two amazing guitarists that I love both of their solo works, so I'd like to see them work together. Next is a record I have not heard, but was apparently unavailable on official format until this reissue from Wawa by Bernard Zolotol's like a classic new age. Type of progressive electronics guy. A lot of Terry Riley style tape loops, rather than like. You know, harsh digital sequencers and synthesizer stuff. So I have a couple records from this guy. I never heard this one. It includes a bonus 7". There's also a song called Gliding through the Cosmophonic Dome. So it's probably great. Next step, I got.</td>
280
+ <td>What's up. I'm Morris playing blood incantation. Hey, this is Jeff Barrett, and this is Isaac Falk. It's Paul Riedel from Blood Incantation here at Amoeba in Hollywood. We're doing what's in my bag. 🎼 So, yeah, I found this Cd. It's a fate's warning. Perfect symmetry. 🎵 It's the album they did right around the time of another album, Parallels, where they had a new singer. This is where this band really started going in the progressive direction, where. 🎶 Very extended, like solo sections and odd time signatures and stuff like that. And yeah, very influential. Actually, parts of this were very influential on hidden history of the human race. And we're working on that. The first thing in my bag is more of an angel's album. Formulas fatal to the flesh. 🎼 I don't own this on vinyl. I think this is the second repress. Huge influence on loading, 🎵 if you can tell one of our favorite records. 🎶 And it's good to find it here. Yeah, total classic. The first thing I found was in the new arrivals bin, 🎵 the S reissue of the second Wallenstein album. Classic kraut rock on the pills label. Great psychedelic 🎶 Prague kraut rock with a little bit of folk and kind of symphonic aspect 🎵 killer and a great cover. Yeah, mother universe. First thing I picked out, I went straight to the jazz section. 🎶 😊 To find this record that changed my life, I got to see Dave Brubeck before he died when I was about seventeen 🎵 in New Jersey. 🎶 The saxophone player on this, Paul Desmond, is also one of my favorite jazz players, super important record to me and my dad, which he grew on very quickly. 🎼 🎵 My next thing is this Osamu Kitajima album. 🎶 I have not heard this one, but I'm a really big fan of his stuff from the S And S. And this album has a minute song. So I'm. I'm guessing it's going to be pretty cool. I just want to read the back of this because this was like, really interesting to me. It says higher octave music is founded upon the vision that we have entered an era of. 😊 Global cooperation and unity in which music plays an integral 🎵 part. Our purpose is to help set a new standard of excellence in the music of this era. 🎶 We are dedicated to a process of continuous refinement, both artistically and commercially, as in music. So in life, 🎵 I was like, that sounds great. I'm going to buy that. Next, I got the Carlos Santana John McLaughlin record. 🎶 😊 Love, devotion, surrender. 🎼 Morris picked it up at another record store on this tour. 🎵 There are two amazing guitarists that I love both of their solo works. So I'd like to see them work together. 🎶 😊 Next is a record I have not heard, but was apparently unavailable on official format until this reissue from Wawa. But Bernard Zolotol is like a classic new age. 🎵 Type of progressive electronics guy. A lot of Terry Riley style tape loops, rather than like. You know, harsh digital sequencers and synthesizer stuff. So I have a couple records from the sky. I never heard this one. It includes a bonus seven inch. There's also a song called Gliding through the Cosmophonic Dome. So it's probably great. Next up, I got. 🎶</td>
281
+
282
+ </tr>
283
+
284
+ <tr>
285
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E8%8B%B1%E6%96%87-%E4%B8%AD%E6%96%87-%E8%85%BE%E6%A0%BC%E5%B0%94-%E5%A4%A9%E5%A0%82.wav" target="_blank">英文-中文-腾格尔-天堂</a></td>
286
+ <td>Tangry with his song, heaven. de de de le de Absolute shock, but in a great way. That was awesome. That was awesome. What way to open a song that was awesome, awesome. I'd love to check out some more Mongolian throat singing stuff. That is correct, right, It is Mongolian. Let me know. I'd love to check out more. I think a lot of you want to check out the Who if you guys still want me to, I'd be more than happy to. de de de 蓝蓝的天空,清清的湖水啊。 That is incredible, that is incredible. That is incredible for those of you don't know what I'm saying right now, the way he can make it sound like he's finished a note, you know, he like lowers it so low you can't even hear the note anymore and then he brings it back and you can see his mouth still open the way it makes the way he can like finish a note but not finish it, I don't know how to explain that that is an incredible talent that is amazing. 哦哟这是我的家。</td>
287
+ <td><span style='color: black;'> Tangry with his song. </span><span style='color: black;'> </span><span style='color: black;'> Heaven. </span><span style='color: #0D47A1;'> 🎵</span><span style='color: black;'>absolute.sock but in a great way🎶</span><span style='color: #0D47A1;'>. 🎵 </span><span style='color: black;'> Wow. </span><span style='color: black;'> </span><span style='color: black;'> That was awesome. That was awesome. What way to open a song. That was awesome, awesome. I'd love to check out some more Mongolian 🎶 folk singing stuff. That is correct, right, It is Mongolian. Let me know. I'd love to check out more. I think a lot of you want to check out the Who if you guys still want me to, I'd be more than happy to. </span><span style='color: #0D47A1;'> 😊 🎵蓝蓝的天空。轻轻的呼声。你要。绿绿的草原。🎶 </span><span style='color: black;'> That is incredible. That is incredible. </span><span style='color: black;'> </span><span style='color: black;'> That is incredible for those of you don't know what I'm saying right now, the way he can make it sound like he's finished a note, you know, he like lowers it so low you can't even hear the note anymore and then he brings it back and you can see his mouth still open the way it makes the way he can like finish a note but not finish it, I don't know how to explain that, that is an incredible talent, that is amazing. </span><span style='color: #0D47A1;'> 😡 🎵</span><span style='color: black;'>给我买什么?</span><span style='color: #0D47A1;'>这是我的家。哎,嘿。🎶</span></td>
288
+
289
+ </tr>
290
+
291
+ <tr>
292
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E6%97%A5%E8%AF%AD-%E8%8B%B1%E8%AF%AD-%E8%8B%B9%E6%9E%9C.wav" target="_blank">日语-英语-苹果</a></td>
293
+ <td>And there's another big game coming to Mac this year, to tell you all about it, here's legendary game creator Kojima-san from Kojima Productions. はい、皆さんこんにちは。僕らが現在取り組んでいるマックのプロジェクトを本日ここで皆さんにご紹介できることをとても嬉しく思って。います。 僕自身、1994年に最初の Mac を購入して以来の Apple の大ファンです。 そして、僕とチームが手掛けた作品をマックでお届けすることが、長年の夢でもありましたマックでのゲーム体験は、ついに新しい時代に突入しました。 その時代に合わせて、デストランディングディレクターズカットの Mac 版のリリースが2023年年末に決定したことをここで発表させていただきます。 マック版デストランディングディレクターズカットでは、Apple の最新のテクノロジーを最大限に活かしたゲーム体験を皆さんにお届けしたいと思います。 メタル fx アップスケーリングによる高精度なグラフィックをはじめ、アップルシリコンの素晴らしいパフォーマンスやモダンなレンダリングパイプラインを兼ね備えたメタル3には今回とても驚かされました。 ぜひ多くの皆さんに、この革新的かつエクサイティングな新しいマックの環境に触れていただければと思っています。 このデストランディングディレクターズカットを皮切りに、今後の小島プロダクションタイトルについても Apple プラットフォームへの展開を積極的に行っていく予定です。 デストランディングディレクターズカットの先行予約は近日中に開始する予定です。ぜひ楽しみにお待ちください。それでは、ザインキュー。</td>
294
+ <td>And there's another big game coming to Mac this year to tell you all about it. Here's legendary game creator Kojima-san, from Kojima Productions. 🎵みなさんこんにちは、僕らが現在取り組んでいるマックのプロジェクトを本日ここで皆さんにご紹介できることをとても嬉しく思っています。🎶 僕自身、千九百九十四年に最初の mac を購入して以来の🎵アップルの大ファンです。そして、僕とチームが手掛けた作品を mac でお届けすることが、長年の夢でもありました mac でのゲーム体験は、ついに新しい時代に突入しました。🎶 その時代に合わせて、デストランディングディレクターズカットの🎵mac版のリリースが二千二十三年年末に決定したことをここで発表させていただきます。mac 版デストランディングディレクターズカットでは、apple の最新のテクノロジーを最大限に活かしたゲーム体験を皆さんにお届けしたいと思います。メタル fx アップスケーリングによる高精度なグラフィックをはじめ、アップルシリコンの素晴らしいパフォーマンスやモダンなレンダリングパイプラインを兼ね備えたメタルには今回とても驚かされました。ぜひ多くの皆さんに、この革新的かつエクサイティングな新しいマックの環境に触れていただければと思っています。このデストランディングディレクターズカットを皮切りに、今後の小島プロダクションタイトルについても、アップルプラットフォームへの展開を積極的に行っていく予定です。デストランディングディレクターズカットの先行予約は近日中に開始する予定です。是非楽しみにお待ちください。それでは、ザインキュー。🎶</td>
295
+
296
+ </tr>
297
+
298
+ <tr>
299
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E4%B8%AD%E8%8B%B1-%E5%8D%8E%E8%AF%AD%E4%B9%90%E5%9D%9B.wav" target="_blank">中英-华语乐坛</a></td>
300
+ <td>问你什么想法,我是我对你的表达。问你。 什么看法?我的世界,我的复活。 都说华流才是顶流,而随着华语乐坛的崛起,的确有不少华语歌手真正做到了用作品和歌声征服国际舞台。那么本期视频就为小伙伴们探点了这样火遍全球的四首华语歌曲。话不多说,快来看看有没有你喜欢的吧。 de number four play我呸,由蔡依林演唱,发行于2014年,是一首中西合并,风格十分前卫的歌曲。在这首歌中,蔡依林可谓突破了自己以往的尺度,特别是现场表演,更是气场全开,完全就是女王的风范。 想要挣,他挣我赔,快点去相亲,也是要付我赔,快点去那里来,想要挣我赔,我赔,早赔更赔。 什么都喜欢什么都会。 number three,左手指月,左手指月,指指人心。这是一首暗含佛家禅意的歌曲,除了精妙的作词之外,歌曲超三个八度的高音也只有原唱萨顶顶能演绎出其中的精髓。而她的现场演唱,更是让老外都惊羡不已。 此人是你全部的社会信念。 我的家。 啊,好好的。 number two,光年之外,这是好莱坞大片太空旅客专程邀请段子琪为电影创作的主题曲,而段子琪显然也不负他们所望。这首光年之外,不仅与电影的主题十分契合,而且火爆全网,成为了2017年的年度十大金曲。果然,华语小天后的魅力,你真的可以永远相信。 为爱遥远在空间之外,若能守候未知,为你等待。我没想到,为你活得多么荒凉。 伤多了一小,没有你根本不想逃。 de number one浮夸,或许很多小伙伴不知道的是,原创作者写这首歌,其实一开始就是为了纪念哥哥张国荣,后来被陈奕迅演唱后,更是成为了一个经典浮夸式的演绎。据说在2014年的某颁奖盛典,因为伊森的现场太过浮夸,以至于主办方不得不将这一段给剪掉。 天使帝女马与人类流花吧,一生只想你惊讶,我旧事只为传唱,不怕重聚。 好了,这就是本期节目的全部内容了,喜欢的小伙伴别忘了点赞关注,我们下期见,拜拜。</td>
301
+ <td><span style='color: #0D47A1;'>🎵问你什么想法,我是我对你的表达。问你。什么看法?我的世界,我的复活。</span><span style='color: black;'>都说华流才是顶流,而随着华语乐坛的崛起,的确有不少华语歌手真正做到了用作品和歌声征服国际舞台。那么本期视频就为小伙伴们探点了这样火遍全球的四首华语歌曲。话不多说,快来看看有没有你喜欢的吧。</span><span style='color: #0D47A1;'>😊🎶 </span><span style='color: black;'>number four play我呸,由蔡依林演唱,发行于2014年,是一首中西合并,风格十分前卫的歌曲。在这首歌中,蔡依林可谓突破了自己以往的尺度,特别是现场表演,更是气场全开,完全就是女王的风范。</span><span style='color: black;'> 🎼 </span><span style='color: black;'>number3,左手指月左手指月,指指人心。这是一首暗含佛家禅意的歌曲,除了精妙的作词之外,歌曲超三个八度的高音也只有原唱萨顶顶能演绎出其中的精髓。而她的现场演唱,更是让老外都惊🎵羡不已。</span><span style='color: #0D47A1;'>此人是你全部的社会信念。😊啊,一生。😊🎶 </span><span style='color: black;'>number two,光年之外,这是好莱坞大片太空旅客专程邀请段子琪为电影创作的主题曲,而段子琪显然也不负他们所望。这首光年之外,不仅与电影的主题十分契合,而且火爆全网,成为了2017年的年度十大金曲。果然,华语小天后的魅力,你真的可以永远相信。</span><span style='color: #0D47A1;'> 🎵为爱遥远在空间之外,若能守候未知,为你等待。我没想到,哎,我如同荒凉。</span><span style='color: black;'>number one浮夸,或许很多小伙伴不知道的是,原创作者写这首歌,其实一开始就是为了纪念哥哥张国荣,后来被陈奕迅演唱后,更是成为了一个经典浮夸式的演绎。据说在2014年的某颁奖盛典,因为伊森的现场🎶太过浮夸,以至于主办方不得不将这一段给剪掉。</span><span style='color: #0D47A1;'> 🎵歇斯底里,马儿,眼泪流花罢,一生只想你惊讶,我旧事只为传唱,不怕重聚。好了,这就是本期节目的全部内容了,喜欢的小伙伴别忘了点赞关注我们,下期见,拜拜。🎶</span></td>
302
+
303
+ </tr>
304
+
305
+ <tr>
306
+ <td><a href="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/SenseVoice/demo/%E4%B8%AD%E6%96%87-%E6%97%A5%E8%AF%AD-%E7%BD%91%E7%BA%A2%E6%AD%8C%E7%BF%BB%E5%94%B1.wav" target="_blank">中文-日语-网红歌翻唱</a></td>
307
+ <td>呼啦啦啦啦啦啦啦啦呼啦啦啦啦啦啦啦。 哈喽各位,这里是音乐萌太郎,我是小凡。几率死生命死之后,今天我们又有三首流行歌曲。 被日本看上了,到底被注入了怎样的灵魂,我们一起来听一下吧。东宝石的这首野狼disco,堪称今年最。 被洗脑的神曲前段时间不但风靡大学校园,就连陈伟霆也出了正宗的岗位教程。没想到转眼间,这首歌却被日语看上了,被软萌的罗丽依一唱,我竟然有。 在停裂暗循环的感觉。 夜叫ぼうか嘘が本当だ思いっきりして誰も忘れた君は一番だ知ってるから。 せえのこっちにりょうを描いてこっちに虹を描くいいね逆にこっちに虹を描いてこっちにりょうを描くすごい 哦,不要。 前段时间,由音雀视听书品赵方静演唱的古风电音盲咒,也凭借洗脑旋律在短时间内成功刷屏,这次更是背翻战神日语版走红网络。短短一周的时间,视频已经快要达到200万的播放了。 幻想が一粒の涙そんなの無理っての私だけじゃいき。 出らないあなたの言葉を思えば胸がギュッとなんだかずっと痛いや。 おお、女のない夜空以外。 听听听听。 还记得那首换装歌曲速吗?这次也被小姐姐翻唱成了日语版,不过对于这首歌还是有些争议。有网友表示,空灵的嗓音也许更适合这首歌的曲风。节目的最后,一起来听听这首日语版的歌曲吧。喜欢的小伙伴记得关注,我们下期见,拜拜。 連れ出されて星の中へ。 的从列。</td>
308
+ <td><span style='color: #0D47A1;'>🎵呼啦啦啦啦啦啦啦啦呼啦啦啦啦啦啦啦啦。😊hello,各位,这里是音乐萌太糖,我是小凡。几率死生命死之后,今天我们又有三首流行歌曲。🎶😊 </span><span style='color: black;'>被日本看上了,到底被注入了怎样的灵魂,我们一起来听一下吧。🎵东宝石的这首野狼disco,堪称今年最。</span><span style='color: black;'>😊</span><span style='color: black;'>被洗脑的神曲前段时间不但风靡大学校园,就连陈伟霆也出了正宗的岗位教程。没想到转眼间,这首歌却被日语看上了,被软萌的罗丽音一唱,我竟然有。</span><span style='color: black;'>😊</span><span style='color: black;'>在停恋爱循环的感觉。</span><span style='color: #0D47A1;'>😊夜叫ぼうか嘘が本当か思いっきりして誰も忘れた君は一番だ知ってるから。せーのこっちにりょうを描いてこっちに虹を描くいいね逆にこっちに虹を描いてこっちにりょうを描くすごい!哦,不要。前段时间,由音雀视听书品赵方静演唱的古风电音盲咒,也凭借洗脑旋律在短时间内成功刷屏,这次更是背翻战场日语版走红网络。短短一周的时间,视频已经快要达到200万的播放了。😊幻想が一粒の涙そんなの無理っての私だけじゃいき。出らないあなたの言葉を思えば胸がギュッとなんだかずっと痛いや。女のない夜空以外。</span><span style='color: black;'>还记得那首换装歌曲速吗?这次也被小姐姐翻唱成了日语版,不过对于这首歌还是有些争议。有网友表示,空灵的嗓音也许更适合这首歌的曲风。节目的最后,一起来听听这首日语版的歌曲吧。喜欢的小伙伴记得关注,我们下期见,拜拜。</span><span style='color: #0D47A1;'>😊連れ出されて星の中へ。的从容。🎶</span></td>
309
+
310
+ </tr>
311
+
312
+ </table>
313
+ </div>
314
+ """
315
+
316
+
317
+ def launch():
318
+ with gr.Blocks(theme=gr.themes.Soft()) as demo:
319
+ # gr.Markdown(description)
320
+ gr.HTML(html_content)
321
+ with gr.Row():
322
+ with gr.Column():
323
+ audio_inputs = gr.Audio(label="Upload audio or use the microphone")
324
+
325
+ with gr.Accordion("Configuration"):
326
+ # task_inputs = gr.Radio(choices=["Speech Recognition", "Rich Text Transcription"],
327
+ # value="Speech Recognition", label="Task")
328
+ language_inputs = gr.Dropdown(choices=["auto", "zh", "en", "yue", "ja", "ko", "nospeech"],
329
+ value="auto",
330
+ label="Language")
331
+ fn_button = gr.Button("Start", variant="primary")
332
+ text_outputs = gr.Textbox(label="Results")
333
+ gr.Examples(examples=audio_examples, inputs=[audio_inputs, language_inputs], examples_per_page=20)
334
+
335
+ fn_button.click(model_inference, inputs=[audio_inputs, language_inputs], outputs=text_outputs)
336
+ # with gr.Accordion("More examples"):
337
+ # gr.HTML(centered_table_html)
338
+ demo.launch()
339
+
340
+
341
+ if __name__ == "__main__":
342
+ # iface.launch()
343
+ launch()
344
+
345
+
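As a quick illustration of the tag handling in app.py above: SenseVoice emits inline markers such as <|en|>, <|HAPPY|> or <|Laughter|> in its raw transcript, and the format_str* helpers map them to emoji or strip them. The sketch below is a minimal, self-contained approximation on a made-up tagged string; it is not part of the commit, and it omits the per-segment deduplication and event/emotion reordering that format_str_v3 performs.

import re

# Tag-to-emoji mapping, a small subset of the emoji_dict defined in app.py above.
TAG_EMOJI = {
    "<|HAPPY|>": "😊", "<|SAD|>": "😔", "<|ANGRY|>": "😡",
    "<|Laughter|>": "😀", "<|BGM|>": "🎼", "<|Applause|>": "👏",
}

def strip_tags(raw: str) -> str:
    """Replace known tags with emoji, then drop any remaining <|...|> markers."""
    for tag, emoji in TAG_EMOJI.items():
        raw = raw.replace(tag, emoji)
    return re.sub(r"<\|[^|]*\|>", "", raw).strip()

# Hypothetical raw output in the tag format shown above, not real model output.
raw = "<|en|><|HAPPY|><|Speech|><|withitn|>That was an amazing show!<|Laughter|>"
print(strip_tags(raw))  # -> 😊That was an amazing show!😀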
example/emo_1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2729e565c98979826d9335b5563fdc79e9bc1b4ab256f67f38adc8e8c2c1646b
3
+ size 87820
example/emo_2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26de72f71c7428451b4f62920f56b5853f4d235069f4281816587dc1d7ad0e05
3
+ size 117900
example/emo_3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2648a25d1d5012f81da3fd7ea4ec82ca3b6ddeaae89996e474ce81630fdabf69
3
+ size 80780
example/emo_4.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:45834a358ef3e95dae6d7e2643204878dbd28f2b3c2055dec588a8064b5472a6
3
+ size 87084
example/en.mp3 ADDED
Binary file (57.4 kB).
 
example/event_1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2de04b4d958ab53a2f66653f66bc640757bb07cacbb1a614c28777f505c9aa20
3
+ size 441044
example/event_2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a254709bbf039a760772033127d7ddd1b1296827c1c4bf91397c4eed47630415
3
+ size 441044
example/event_3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d3237552ea143411a563b9e886f870df5870d31cb3272ae33f2588ba5076fe7
3
+ size 441044
example/ja.mp3 ADDED
Binary file (57.8 kB).
 
example/ko.mp3 ADDED
Binary file (27.9 kB).
 
example/longwav_1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49ff8d0f20e7c1e9a46f2c40e5eadd2df145bd2be83735e454790571b08a12f1
3
+ size 5925776
example/longwav_2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ae93488f4091552861fe36f0df4cce01b4f93d4ccdf846979937ab1e4793a4d4
3
+ size 8742988
example/longwav_3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe4c9b3ea090c399630266b54fb1d7ff864162b5f82749cd312db16806932a29
3
+ size 7033468
example/longwav_4.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76efd05d140a38f48d976222a39ec411931db122a54e3f0d3cda3b4a7a1485c8
3
+ size 2783624
example/rich_1.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d650ab4f08bd3a5d3d59fd67d085fd0765ddabe4729294f4106fceece321fdf
3
+ size 571456
example/rich_2.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3384e2e768ba11d921f3fcbfee5ebc18c70323f18ab41d2c93931e4cf26e45ff
3
+ size 320044
example/rich_3.wav ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17ff9c06cfc608e13dec503ae2e5015b1748fd5b9509b2080e31089eec2af7cc
3
+ size 206986
example/yue.mp3 ADDED
Binary file (31.2 kB).
 
example/zh.mp3 ADDED
Binary file (45 kB).
 
model.py ADDED
@@ -0,0 +1,898 @@
1
+ from typing import Iterable, Optional
2
+ import types
3
+ import time
4
+ import numpy as np
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import Tensor
8
+ from torch import nn
9
+ from torch.cuda.amp import autocast
10
+ from funasr.metrics.compute_acc import compute_accuracy, th_accuracy
11
+ from funasr.losses.label_smoothing_loss import LabelSmoothingLoss
12
+ from funasr.train_utils.device_funcs import force_gatherable
13
+
14
+ from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
15
+ from funasr.utils.datadir_writer import DatadirWriter
16
+ from funasr.models.ctc.ctc import CTC
17
+
18
+ from funasr.register import tables
19
+
20
+
21
+ from funasr.models.paraformer.search import Hypothesis
22
+
23
+
24
+ class SinusoidalPositionEncoder(torch.nn.Module):
25
+ """ """
26
+
27
+ def __init__(self, d_model=80, dropout_rate=0.1):
28
+ super().__init__()
29
+
30
+ def encode(
31
+ self, positions: torch.Tensor = None, depth: int = None, dtype: torch.dtype = torch.float32
32
+ ):
33
+ batch_size = positions.size(0)
34
+ positions = positions.type(dtype)
35
+ device = positions.device
36
+ log_timescale_increment = torch.log(torch.tensor([10000], dtype=dtype, device=device)) / (
37
+ depth / 2 - 1
38
+ )
39
+ inv_timescales = torch.exp(
40
+ torch.arange(depth / 2, device=device).type(dtype) * (-log_timescale_increment)
41
+ )
42
+ inv_timescales = torch.reshape(inv_timescales, [batch_size, -1])
43
+ scaled_time = torch.reshape(positions, [1, -1, 1]) * torch.reshape(
44
+ inv_timescales, [1, 1, -1]
45
+ )
46
+ encoding = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], dim=2)
47
+ return encoding.type(dtype)
48
+
49
+ def forward(self, x):
50
+ batch_size, timesteps, input_dim = x.size()
51
+ positions = torch.arange(1, timesteps + 1, device=x.device)[None, :]
52
+ position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
53
+
54
+ return x + position_encoding
55
+
56
+
57
+ class PositionwiseFeedForward(torch.nn.Module):
58
+ """Positionwise feed forward layer.
59
+
60
+ Args:
61
+ idim (int): Input dimension.
62
+ hidden_units (int): The number of hidden units.
63
+ dropout_rate (float): Dropout rate.
64
+
65
+ """
66
+
67
+ def __init__(self, idim, hidden_units, dropout_rate, activation=torch.nn.ReLU()):
68
+ """Construct a PositionwiseFeedForward object."""
69
+ super(PositionwiseFeedForward, self).__init__()
70
+ self.w_1 = torch.nn.Linear(idim, hidden_units)
71
+ self.w_2 = torch.nn.Linear(hidden_units, idim)
72
+ self.dropout = torch.nn.Dropout(dropout_rate)
73
+ self.activation = activation
74
+
75
+ def forward(self, x):
76
+ """Forward function."""
77
+ return self.w_2(self.dropout(self.activation(self.w_1(x))))
78
+
79
+
80
+ class MultiHeadedAttentionSANM(nn.Module):
81
+ """Multi-Head Attention layer.
82
+
83
+ Args:
84
+ n_head (int): The number of heads.
85
+ n_feat (int): The number of features.
86
+ dropout_rate (float): Dropout rate.
87
+
88
+ """
89
+
90
+ def __init__(
91
+ self,
92
+ n_head,
93
+ in_feat,
94
+ n_feat,
95
+ dropout_rate,
96
+ kernel_size,
97
+ sanm_shfit=0,
98
+ lora_list=None,
99
+ lora_rank=8,
100
+ lora_alpha=16,
101
+ lora_dropout=0.1,
102
+ ):
103
+ """Construct a MultiHeadedAttention object."""
104
+ super().__init__()
105
+ assert n_feat % n_head == 0
106
+ # We assume d_v always equals d_k
107
+ self.d_k = n_feat // n_head
108
+ self.h = n_head
109
+ # self.linear_q = nn.Linear(n_feat, n_feat)
110
+ # self.linear_k = nn.Linear(n_feat, n_feat)
111
+ # self.linear_v = nn.Linear(n_feat, n_feat)
112
+
113
+ self.linear_out = nn.Linear(n_feat, n_feat)
114
+ self.linear_q_k_v = nn.Linear(in_feat, n_feat * 3)
115
+ self.attn = None
116
+ self.dropout = nn.Dropout(p=dropout_rate)
117
+
118
+ self.fsmn_block = nn.Conv1d(
119
+ n_feat, n_feat, kernel_size, stride=1, padding=0, groups=n_feat, bias=False
120
+ )
121
+ # padding
122
+ left_padding = (kernel_size - 1) // 2
123
+ if sanm_shfit > 0:
124
+ left_padding = left_padding + sanm_shfit
125
+ right_padding = kernel_size - 1 - left_padding
126
+ self.pad_fn = nn.ConstantPad1d((left_padding, right_padding), 0.0)
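+ # The FSMN branch below is a depthwise 1-D convolution over the value sequence; its output is added to the attention output as a memory term.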
127
+
128
+ def forward_fsmn(self, inputs, mask, mask_shfit_chunk=None):
129
+ b, t, d = inputs.size()
130
+ if mask is not None:
131
+ mask = torch.reshape(mask, (b, -1, 1))
132
+ if mask_shfit_chunk is not None:
133
+ mask = mask * mask_shfit_chunk
134
+ inputs = inputs * mask
135
+
136
+ x = inputs.transpose(1, 2)
137
+ x = self.pad_fn(x)
138
+ x = self.fsmn_block(x)
139
+ x = x.transpose(1, 2)
140
+ x += inputs
141
+ x = self.dropout(x)
142
+ if mask is not None:
143
+ x = x * mask
144
+ return x
145
+
146
+ def forward_qkv(self, x):
147
+ """Transform query, key and value.
148
+
149
+ Args:
150
+ query (torch.Tensor): Query tensor (#batch, time1, size).
151
+ key (torch.Tensor): Key tensor (#batch, time2, size).
152
+ value (torch.Tensor): Value tensor (#batch, time2, size).
153
+
154
+ Returns:
155
+ torch.Tensor: Transformed query tensor (#batch, n_head, time1, d_k).
156
+ torch.Tensor: Transformed key tensor (#batch, n_head, time2, d_k).
157
+ torch.Tensor: Transformed value tensor (#batch, n_head, time2, d_k).
158
+
159
+ """
160
+ b, t, d = x.size()
161
+ q_k_v = self.linear_q_k_v(x)
162
+ q, k, v = torch.split(q_k_v, int(self.h * self.d_k), dim=-1)
163
+ q_h = torch.reshape(q, (b, t, self.h, self.d_k)).transpose(
164
+ 1, 2
165
+ ) # (batch, head, time1, d_k)
166
+ k_h = torch.reshape(k, (b, t, self.h, self.d_k)).transpose(
167
+ 1, 2
168
+ ) # (batch, head, time2, d_k)
169
+ v_h = torch.reshape(v, (b, t, self.h, self.d_k)).transpose(
170
+ 1, 2
171
+ ) # (batch, head, time2, d_k)
172
+
173
+ return q_h, k_h, v_h, v
174
+
175
+ def forward_attention(self, value, scores, mask, mask_att_chunk_encoder=None):
176
+ """Compute attention context vector.
177
+
178
+ Args:
179
+ value (torch.Tensor): Transformed value (#batch, n_head, time2, d_k).
180
+ scores (torch.Tensor): Attention score (#batch, n_head, time1, time2).
181
+ mask (torch.Tensor): Mask (#batch, 1, time2) or (#batch, time1, time2).
182
+
183
+ Returns:
184
+ torch.Tensor: Transformed value (#batch, time1, d_model)
185
+ weighted by the attention score (#batch, time1, time2).
186
+
187
+ """
188
+ n_batch = value.size(0)
189
+ if mask is not None:
190
+ if mask_att_chunk_encoder is not None:
191
+ mask = mask * mask_att_chunk_encoder
192
+
193
+ mask = mask.unsqueeze(1).eq(0) # (batch, 1, *, time2)
194
+
195
+ min_value = -float(
196
+ "inf"
197
+ ) # float(numpy.finfo(torch.tensor(0, dtype=scores.dtype).numpy().dtype).min)
198
+ scores = scores.masked_fill(mask, min_value)
199
+ self.attn = torch.softmax(scores, dim=-1).masked_fill(
200
+ mask, 0.0
201
+ ) # (batch, head, time1, time2)
202
+ else:
203
+ self.attn = torch.softmax(scores, dim=-1) # (batch, head, time1, time2)
204
+
205
+ p_attn = self.dropout(self.attn)
206
+ x = torch.matmul(p_attn, value) # (batch, head, time1, d_k)
207
+ x = (
208
+ x.transpose(1, 2).contiguous().view(n_batch, -1, self.h * self.d_k)
209
+ ) # (batch, time1, d_model)
210
+
211
+ return self.linear_out(x) # (batch, time1, d_model)
212
+
213
+ def forward(self, x, mask, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
214
+ """Compute scaled dot product attention.
215
+
216
+ Args:
217
+ query (torch.Tensor): Query tensor (#batch, time1, size).
218
+ key (torch.Tensor): Key tensor (#batch, time2, size).
219
+ value (torch.Tensor): Value tensor (#batch, time2, size).
220
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
221
+ (#batch, time1, time2).
222
+
223
+ Returns:
224
+ torch.Tensor: Output tensor (#batch, time1, d_model).
225
+
226
+ """
227
+ q_h, k_h, v_h, v = self.forward_qkv(x)
228
+ fsmn_memory = self.forward_fsmn(v, mask, mask_shfit_chunk)
229
+ q_h = q_h * self.d_k ** (-0.5)
230
+ scores = torch.matmul(q_h, k_h.transpose(-2, -1))
231
+ att_outs = self.forward_attention(v_h, scores, mask, mask_att_chunk_encoder)
232
+ return att_outs + fsmn_memory
233
+
234
+ def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
235
+ """Compute scaled dot product attention.
236
+
237
+ Args:
238
+ query (torch.Tensor): Query tensor (#batch, time1, size).
239
+ key (torch.Tensor): Key tensor (#batch, time2, size).
240
+ value (torch.Tensor): Value tensor (#batch, time2, size).
241
+ mask (torch.Tensor): Mask tensor (#batch, 1, time2) or
242
+ (#batch, time1, time2).
243
+
244
+ Returns:
245
+ torch.Tensor: Output tensor (#batch, time1, d_model).
246
+
247
+ """
248
+ q_h, k_h, v_h, v = self.forward_qkv(x)
249
+ if chunk_size is not None and look_back > 0 or look_back == -1:
250
+ if cache is not None:
251
+ k_h_stride = k_h[:, :, : -(chunk_size[2]), :]
252
+ v_h_stride = v_h[:, :, : -(chunk_size[2]), :]
253
+ k_h = torch.cat((cache["k"], k_h), dim=2)
254
+ v_h = torch.cat((cache["v"], v_h), dim=2)
255
+
256
+ cache["k"] = torch.cat((cache["k"], k_h_stride), dim=2)
257
+ cache["v"] = torch.cat((cache["v"], v_h_stride), dim=2)
258
+ if look_back != -1:
259
+ cache["k"] = cache["k"][:, :, -(look_back * chunk_size[1]) :, :]
260
+ cache["v"] = cache["v"][:, :, -(look_back * chunk_size[1]) :, :]
261
+ else:
262
+ cache_tmp = {
263
+ "k": k_h[:, :, : -(chunk_size[2]), :],
264
+ "v": v_h[:, :, : -(chunk_size[2]), :],
265
+ }
266
+ cache = cache_tmp
267
+ fsmn_memory = self.forward_fsmn(v, None)
268
+ q_h = q_h * self.d_k ** (-0.5)
269
+ scores = torch.matmul(q_h, k_h.transpose(-2, -1))
270
+ att_outs = self.forward_attention(v_h, scores, None)
271
+ return att_outs + fsmn_memory, cache
272
+
273
+
274
+ class LayerNorm(nn.LayerNorm):
275
+ def __init__(self, *args, **kwargs):
276
+ super().__init__(*args, **kwargs)
277
+
278
+ def forward(self, input):
279
+ output = F.layer_norm(
280
+ input.float(),
281
+ self.normalized_shape,
282
+ self.weight.float() if self.weight is not None else None,
283
+ self.bias.float() if self.bias is not None else None,
284
+ self.eps,
285
+ )
286
+ return output.type_as(input)
287
+
288
+
289
+ def sequence_mask(lengths, maxlen=None, dtype=torch.float32, device=None):
290
+ if maxlen is None:
291
+ maxlen = lengths.max()
292
+ row_vector = torch.arange(0, maxlen, 1).to(lengths.device)
293
+ matrix = torch.unsqueeze(lengths, dim=-1)
294
+ mask = row_vector < matrix
295
+ mask = mask.detach()
296
+
297
+ return mask.type(dtype).to(device) if device is not None else mask.type(dtype)
298
+
299
+
300
+ class EncoderLayerSANM(nn.Module):
301
+ def __init__(
302
+ self,
303
+ in_size,
304
+ size,
305
+ self_attn,
306
+ feed_forward,
307
+ dropout_rate,
308
+ normalize_before=True,
309
+ concat_after=False,
310
+ stochastic_depth_rate=0.0,
311
+ ):
312
+ """Construct an EncoderLayer object."""
313
+ super(EncoderLayerSANM, self).__init__()
314
+ self.self_attn = self_attn
315
+ self.feed_forward = feed_forward
316
+ self.norm1 = LayerNorm(in_size)
317
+ self.norm2 = LayerNorm(size)
318
+ self.dropout = nn.Dropout(dropout_rate)
319
+ self.in_size = in_size
320
+ self.size = size
321
+ self.normalize_before = normalize_before
322
+ self.concat_after = concat_after
323
+ if self.concat_after:
324
+ self.concat_linear = nn.Linear(size + size, size)
325
+ self.stochastic_depth_rate = stochastic_depth_rate
326
+ self.dropout_rate = dropout_rate
327
+
328
+ def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
329
+ """Compute encoded features.
330
+
331
+ Args:
332
+ x_input (torch.Tensor): Input tensor (#batch, time, size).
333
+ mask (torch.Tensor): Mask tensor for the input (#batch, time).
334
+ cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
335
+
336
+ Returns:
337
+ torch.Tensor: Output tensor (#batch, time, size).
338
+ torch.Tensor: Mask tensor (#batch, time).
339
+
340
+ """
341
+ skip_layer = False
342
+ # with stochastic depth, residual connection `x + f(x)` becomes
343
+ # `x <- x + 1 / (1 - p) * f(x)` at training time.
344
+ stoch_layer_coeff = 1.0
345
+ if self.training and self.stochastic_depth_rate > 0:
346
+ skip_layer = torch.rand(1).item() < self.stochastic_depth_rate
347
+ stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
348
+
349
+ if skip_layer:
350
+ if cache is not None:
351
+ x = torch.cat([cache, x], dim=1)
352
+ return x, mask
353
+
354
+ residual = x
355
+ if self.normalize_before:
356
+ x = self.norm1(x)
357
+
358
+ if self.concat_after:
359
+ x_concat = torch.cat(
360
+ (
361
+ x,
362
+ self.self_attn(
363
+ x,
364
+ mask,
365
+ mask_shfit_chunk=mask_shfit_chunk,
366
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
367
+ ),
368
+ ),
369
+ dim=-1,
370
+ )
371
+ if self.in_size == self.size:
372
+ x = residual + stoch_layer_coeff * self.concat_linear(x_concat)
373
+ else:
374
+ x = stoch_layer_coeff * self.concat_linear(x_concat)
375
+ else:
376
+ if self.in_size == self.size:
377
+ x = residual + stoch_layer_coeff * self.dropout(
378
+ self.self_attn(
379
+ x,
380
+ mask,
381
+ mask_shfit_chunk=mask_shfit_chunk,
382
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
383
+ )
384
+ )
385
+ else:
386
+ x = stoch_layer_coeff * self.dropout(
387
+ self.self_attn(
388
+ x,
389
+ mask,
390
+ mask_shfit_chunk=mask_shfit_chunk,
391
+ mask_att_chunk_encoder=mask_att_chunk_encoder,
392
+ )
393
+ )
394
+ if not self.normalize_before:
395
+ x = self.norm1(x)
396
+
397
+ residual = x
398
+ if self.normalize_before:
399
+ x = self.norm2(x)
400
+ x = residual + stoch_layer_coeff * self.dropout(self.feed_forward(x))
401
+ if not self.normalize_before:
402
+ x = self.norm2(x)
403
+
404
+ return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
405
+
406
+ def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
407
+ """Compute encoded features.
408
+
409
+ Args:
410
+ x_input (torch.Tensor): Input tensor (#batch, time, size).
411
+ mask (torch.Tensor): Mask tensor for the input (#batch, time).
412
+ cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
413
+
414
+ Returns:
415
+ torch.Tensor: Output tensor (#batch, time, size).
416
+ torch.Tensor: Mask tensor (#batch, time).
417
+
418
+ """
419
+
420
+ residual = x
421
+ if self.normalize_before:
422
+ x = self.norm1(x)
423
+
424
+ if self.in_size == self.size:
425
+ attn, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)
426
+ x = residual + attn
427
+ else:
428
+ x, cache = self.self_attn.forward_chunk(x, cache, chunk_size, look_back)
429
+
430
+ if not self.normalize_before:
431
+ x = self.norm1(x)
432
+
433
+ residual = x
434
+ if self.normalize_before:
435
+ x = self.norm2(x)
436
+ x = residual + self.feed_forward(x)
437
+ if not self.normalize_before:
438
+ x = self.norm2(x)
439
+
440
+ return x, cache
441
+
442
+
443
+ @tables.register("encoder_classes", "SenseVoiceEncoderSmall")
444
+ class SenseVoiceEncoderSmall(nn.Module):
445
+ """
446
+ Author: Speech Lab of DAMO Academy, Alibaba Group
447
+ SCAMA: Streaming chunk-aware multihead attention for online end-to-end speech recognition
448
+ https://arxiv.org/abs/2006.01713
449
+ """
450
+
451
+ def __init__(
452
+ self,
453
+ input_size: int,
454
+ output_size: int = 256,
455
+ attention_heads: int = 4,
456
+ linear_units: int = 2048,
457
+ num_blocks: int = 6,
458
+ tp_blocks: int = 0,
459
+ dropout_rate: float = 0.1,
460
+ positional_dropout_rate: float = 0.1,
461
+ attention_dropout_rate: float = 0.0,
462
+ stochastic_depth_rate: float = 0.0,
463
+ input_layer: Optional[str] = "conv2d",
464
+ pos_enc_class=SinusoidalPositionEncoder,
465
+ normalize_before: bool = True,
466
+ concat_after: bool = False,
467
+ positionwise_layer_type: str = "linear",
468
+ positionwise_conv_kernel_size: int = 1,
469
+ padding_idx: int = -1,
470
+ kernel_size: int = 11,
471
+ sanm_shfit: int = 0,
472
+ selfattention_layer_type: str = "sanm",
473
+ **kwargs,
474
+ ):
475
+ super().__init__()
476
+ self._output_size = output_size
477
+
478
+ self.embed = SinusoidalPositionEncoder()
479
+
480
+ self.normalize_before = normalize_before
481
+
482
+ positionwise_layer = PositionwiseFeedForward
483
+ positionwise_layer_args = (
484
+ output_size,
485
+ linear_units,
486
+ dropout_rate,
487
+ )
488
+
489
+ encoder_selfattn_layer = MultiHeadedAttentionSANM
490
+ encoder_selfattn_layer_args0 = (
491
+ attention_heads,
492
+ input_size,
493
+ output_size,
494
+ attention_dropout_rate,
495
+ kernel_size,
496
+ sanm_shfit,
497
+ )
498
+ encoder_selfattn_layer_args = (
499
+ attention_heads,
500
+ output_size,
501
+ output_size,
502
+ attention_dropout_rate,
503
+ kernel_size,
504
+ sanm_shfit,
505
+ )
506
+
507
+ self.encoders0 = nn.ModuleList(
508
+ [
509
+ EncoderLayerSANM(
510
+ input_size,
511
+ output_size,
512
+ encoder_selfattn_layer(*encoder_selfattn_layer_args0),
513
+ positionwise_layer(*positionwise_layer_args),
514
+ dropout_rate,
515
+ )
516
+ for i in range(1)
517
+ ]
518
+ )
519
+ self.encoders = nn.ModuleList(
520
+ [
521
+ EncoderLayerSANM(
522
+ output_size,
523
+ output_size,
524
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
525
+ positionwise_layer(*positionwise_layer_args),
526
+ dropout_rate,
527
+ )
528
+ for i in range(num_blocks - 1)
529
+ ]
530
+ )
531
+
532
+ self.tp_encoders = nn.ModuleList(
533
+ [
534
+ EncoderLayerSANM(
535
+ output_size,
536
+ output_size,
537
+ encoder_selfattn_layer(*encoder_selfattn_layer_args),
538
+ positionwise_layer(*positionwise_layer_args),
539
+ dropout_rate,
540
+ )
541
+ for i in range(tp_blocks)
542
+ ]
543
+ )
544
+
545
+ self.after_norm = LayerNorm(output_size)
546
+
547
+ self.tp_norm = LayerNorm(output_size)
548
+
549
+ def output_size(self) -> int:
550
+ return self._output_size
551
+
552
+ def forward(
553
+ self,
554
+ xs_pad: torch.Tensor,
555
+ ilens: torch.Tensor,
556
+ ):
557
+ """Embed positions in tensor."""
558
+ masks = sequence_mask(ilens, device=ilens.device)[:, None, :]
559
+
560
+ xs_pad *= self.output_size() ** 0.5
561
+
562
+ xs_pad = self.embed(xs_pad)
563
+
564
+ # forward encoder1
565
+ for layer_idx, encoder_layer in enumerate(self.encoders0):
566
+ encoder_outs = encoder_layer(xs_pad, masks)
567
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
568
+
569
+ for layer_idx, encoder_layer in enumerate(self.encoders):
570
+ encoder_outs = encoder_layer(xs_pad, masks)
571
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
572
+
573
+ xs_pad = self.after_norm(xs_pad)
574
+
575
+ # forward encoder2
576
+ olens = masks.squeeze(1).sum(1).int()
577
+
578
+ for layer_idx, encoder_layer in enumerate(self.tp_encoders):
579
+ encoder_outs = encoder_layer(xs_pad, masks)
580
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
581
+
582
+ xs_pad = self.tp_norm(xs_pad)
583
+ return xs_pad, olens
584
+
585
+
586
+ @tables.register("model_classes", "SenseVoiceSmall")
587
+ class SenseVoiceSmall(nn.Module):
588
+ """CTC-attention hybrid Encoder-Decoder model"""
589
+
590
+ def __init__(
591
+ self,
592
+ specaug: str = None,
593
+ specaug_conf: dict = None,
594
+ normalize: str = None,
595
+ normalize_conf: dict = None,
596
+ encoder: str = None,
597
+ encoder_conf: dict = None,
598
+ ctc_conf: dict = None,
599
+ input_size: int = 80,
600
+ vocab_size: int = -1,
601
+ ignore_id: int = -1,
602
+ blank_id: int = 0,
603
+ sos: int = 1,
604
+ eos: int = 2,
605
+ length_normalized_loss: bool = False,
606
+ **kwargs,
607
+ ):
608
+
609
+ super().__init__()
610
+
611
+ if specaug is not None:
612
+ specaug_class = tables.specaug_classes.get(specaug)
613
+ specaug = specaug_class(**specaug_conf)
614
+ if normalize is not None:
615
+ normalize_class = tables.normalize_classes.get(normalize)
616
+ normalize = normalize_class(**normalize_conf)
617
+ encoder_class = tables.encoder_classes.get(encoder)
618
+ encoder = encoder_class(input_size=input_size, **encoder_conf)
619
+ encoder_output_size = encoder.output_size()
620
+
621
+ if ctc_conf is None:
622
+ ctc_conf = {}
623
+ ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **ctc_conf)
624
+
625
+ self.blank_id = blank_id
626
+ self.sos = sos if sos is not None else vocab_size - 1
627
+ self.eos = eos if eos is not None else vocab_size - 1
628
+ self.vocab_size = vocab_size
629
+ self.ignore_id = ignore_id
630
+ self.specaug = specaug
631
+ self.normalize = normalize
632
+ self.encoder = encoder
633
+ self.error_calculator = None
634
+
635
+ self.ctc = ctc
636
+
637
+ self.length_normalized_loss = length_normalized_loss
638
+ self.encoder_output_size = encoder_output_size
639
+
640
+ self.lid_dict = {"auto": 0, "zh": 3, "en": 4, "yue": 7, "ja": 11, "ko": 12, "nospeech": 13}
641
+ self.lid_int_dict = {24884: 3, 24885: 4, 24888: 7, 24892: 11, 24896: 12, 24992: 13}
642
+ self.textnorm_dict = {"withitn": 14, "woitn": 15}
643
+ self.textnorm_int_dict = {25016: 14, 25017: 15}
644
+ self.embed = torch.nn.Embedding(7 + len(self.lid_dict) + len(self.textnorm_dict), input_size)
645
+
646
+ self.criterion_att = LabelSmoothingLoss(
647
+ size=self.vocab_size,
648
+ padding_idx=self.ignore_id,
649
+ smoothing=kwargs.get("lsm_weight", 0.0),
650
+ normalize_length=self.length_normalized_loss,
651
+ )
652
+
653
+ @staticmethod
654
+ def from_pretrained(model:str=None, **kwargs):
655
+ from funasr import AutoModel
656
+ model, kwargs = AutoModel.build_model(model=model, trust_remote_code=True, **kwargs)
657
+
658
+ return model, kwargs
659
+
660
+ def forward(
661
+ self,
662
+ speech: torch.Tensor,
663
+ speech_lengths: torch.Tensor,
664
+ text: torch.Tensor,
665
+ text_lengths: torch.Tensor,
666
+ **kwargs,
667
+ ):
668
+ """Encoder + Decoder + Calc loss
669
+ Args:
670
+ speech: (Batch, Length, ...)
671
+ speech_lengths: (Batch, )
672
+ text: (Batch, Length)
673
+ text_lengths: (Batch,)
674
+ """
675
+ # import pdb;
676
+ # pdb.set_trace()
677
+ if len(text_lengths.size()) > 1:
678
+ text_lengths = text_lengths[:, 0]
679
+ if len(speech_lengths.size()) > 1:
680
+ speech_lengths = speech_lengths[:, 0]
681
+
682
+ batch_size = speech.shape[0]
683
+
684
+ # 1. Encoder
685
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, text)
686
+
687
+ loss_ctc, cer_ctc = None, None
688
+ loss_rich, acc_rich = None, None
689
+ stats = dict()
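+ # The first four encoder positions carry the language/event/emotion/text-norm queries: they get the rich-attribute CE loss, while CTC is computed on the remaining frames.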
690
+
691
+ loss_ctc, cer_ctc = self._calc_ctc_loss(
692
+ encoder_out[:, 4:, :], encoder_out_lens - 4, text[:, 4:], text_lengths - 4
693
+ )
694
+
695
+ loss_rich, acc_rich = self._calc_rich_ce_loss(
696
+ encoder_out[:, :4, :], text[:, :4]
697
+ )
698
+
699
+ loss = loss_ctc
700
+ # Collect total loss stats
701
+ stats["loss"] = torch.clone(loss.detach()) if loss_ctc is not None else None
702
+ stats["loss_rich"] = torch.clone(loss_rich.detach()) if loss_rich is not None else None
703
+ stats["acc_rich"] = acc_rich
704
+
705
+ # force_gatherable: to-device and to-tensor if scalar for DataParallel
706
+ if self.length_normalized_loss:
707
+ batch_size = int((text_lengths + 1).sum())
708
+ loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
709
+ return loss, stats, weight
710
+
711
+ def encode(
712
+ self,
713
+ speech: torch.Tensor,
714
+ speech_lengths: torch.Tensor,
715
+ text: torch.Tensor,
716
+ **kwargs,
717
+ ):
718
+ """Frontend + Encoder. Note that this method is used by asr_inference.py
719
+ Args:
720
+ speech: (Batch, Length, ...)
721
+ speech_lengths: (Batch, )
722
+ ind: int
723
+ """
724
+
725
+ # Data augmentation
726
+ if self.specaug is not None and self.training:
727
+ speech, speech_lengths = self.specaug(speech, speech_lengths)
728
+
729
+ # Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
730
+ if self.normalize is not None:
731
+ speech, speech_lengths = self.normalize(speech, speech_lengths)
732
+
733
+
734
+ lids = torch.LongTensor([[self.lid_int_dict[int(lid)] if torch.rand(1) > 0.2 and int(lid) in self.lid_int_dict else 0 ] for lid in text[:, 0]]).to(speech.device)
735
+ language_query = self.embed(lids)
736
+
737
+ styles = torch.LongTensor([[self.textnorm_int_dict[int(style)]] for style in text[:, 3]]).to(speech.device)
738
+ style_query = self.embed(styles)
739
+ speech = torch.cat((style_query, speech), dim=1)
740
+ speech_lengths += 1
741
+
742
+ event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat(speech.size(0), 1, 1)
743
+ input_query = torch.cat((language_query, event_emo_query), dim=1)
744
+ speech = torch.cat((input_query, speech), dim=1)
745
+ speech_lengths += 3
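+ # The encoder input is now [language, event, emotion, text-norm] query embeddings followed by the fbank frames (four extra positions), matching the index-4 split in forward().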
746
+
747
+ encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths)
748
+
749
+ return encoder_out, encoder_out_lens
750
+
751
+ def _calc_ctc_loss(
752
+ self,
753
+ encoder_out: torch.Tensor,
754
+ encoder_out_lens: torch.Tensor,
755
+ ys_pad: torch.Tensor,
756
+ ys_pad_lens: torch.Tensor,
757
+ ):
758
+ # Calc CTC loss
759
+ loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)
760
+
761
+ # Calc CER using CTC
762
+ cer_ctc = None
763
+ if not self.training and self.error_calculator is not None:
764
+ ys_hat = self.ctc.argmax(encoder_out).data
765
+ cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
766
+ return loss_ctc, cer_ctc
767
+
768
+ def _calc_rich_ce_loss(
769
+ self,
770
+ encoder_out: torch.Tensor,
771
+ ys_pad: torch.Tensor,
772
+ ):
773
+ decoder_out = self.ctc.ctc_lo(encoder_out)
774
+ # 2. Compute attention loss
775
+ loss_rich = self.criterion_att(decoder_out, ys_pad.contiguous())
776
+ acc_rich = th_accuracy(
777
+ decoder_out.view(-1, self.vocab_size),
778
+ ys_pad.contiguous(),
779
+ ignore_label=self.ignore_id,
780
+ )
781
+
782
+ return loss_rich, acc_rich
783
+
784
+
785
+ def inference(
786
+ self,
787
+ data_in,
788
+ data_lengths=None,
789
+ key: list = ["wav_file_tmp_name"],
790
+ tokenizer=None,
791
+ frontend=None,
792
+ **kwargs,
793
+ ):
794
+
795
+
796
+ meta_data = {}
797
+ if (
798
+ isinstance(data_in, torch.Tensor) and kwargs.get("data_type", "sound") == "fbank"
799
+ ): # fbank
800
+ speech, speech_lengths = data_in, data_lengths
801
+ if len(speech.shape) < 3:
802
+ speech = speech[None, :, :]
803
+ if speech_lengths is None:
804
+ speech_lengths = speech.shape[1]
805
+ else:
806
+ # extract fbank feats
807
+ time1 = time.perf_counter()
808
+ audio_sample_list = load_audio_text_image_video(
809
+ data_in,
810
+ fs=frontend.fs,
811
+ audio_fs=kwargs.get("fs", 16000),
812
+ data_type=kwargs.get("data_type", "sound"),
813
+ tokenizer=tokenizer,
814
+ )
815
+ time2 = time.perf_counter()
816
+ meta_data["load_data"] = f"{time2 - time1:0.3f}"
817
+ speech, speech_lengths = extract_fbank(
818
+ audio_sample_list, data_type=kwargs.get("data_type", "sound"), frontend=frontend
819
+ )
820
+ time3 = time.perf_counter()
821
+ meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
822
+ meta_data["batch_data_time"] = (
823
+ speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
824
+ )
825
+
826
+ speech = speech.to(device=kwargs["device"])
827
+ speech_lengths = speech_lengths.to(device=kwargs["device"])
828
+
829
+ language = kwargs.get("language", "auto")
830
+ language_query = self.embed(
831
+ torch.LongTensor(
832
+ [[self.lid_dict[language] if language in self.lid_dict else 0]]
833
+ ).to(speech.device)
834
+ ).repeat(speech.size(0), 1, 1)
835
+
836
+ use_itn = kwargs.get("use_itn", False)
837
+ textnorm = kwargs.get("text_norm", None)
838
+ if textnorm is None:
839
+ textnorm = "withitn" if use_itn else "woitn"
840
+ textnorm_query = self.embed(
841
+ torch.LongTensor([[self.textnorm_dict[textnorm]]]).to(speech.device)
842
+ ).repeat(speech.size(0), 1, 1)
843
+ speech = torch.cat((textnorm_query, speech), dim=1)
844
+ speech_lengths += 1
845
+
846
+ event_emo_query = self.embed(torch.LongTensor([[1, 2]]).to(speech.device)).repeat(
847
+ speech.size(0), 1, 1
848
+ )
849
+ input_query = torch.cat((language_query, event_emo_query), dim=1)
850
+ speech = torch.cat((input_query, speech), dim=1)
851
+ speech_lengths += 3
852
+
853
+ # Encoder
854
+ encoder_out, encoder_out_lens = self.encoder(speech, speech_lengths)
855
+ if isinstance(encoder_out, tuple):
856
+ encoder_out = encoder_out[0]
857
+
858
+ # c. Project the encoder output to CTC log-probabilities
859
+ ctc_logits = self.ctc.log_softmax(encoder_out)
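+ # Greedy CTC decoding follows: framewise argmax, collapse repeats, drop blank ids, then detokenize.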
860
+
861
+ results = []
862
+ b, n, d = encoder_out.size()
863
+ if isinstance(key[0], (list, tuple)):
864
+ key = key[0]
865
+ if len(key) < b:
866
+ key = key * b
867
+ for i in range(b):
868
+ x = ctc_logits[i, : encoder_out_lens[i].item(), :]
869
+ yseq = x.argmax(dim=-1)
870
+ yseq = torch.unique_consecutive(yseq, dim=-1)
871
+
872
+ ibest_writer = None
873
+ if kwargs.get("output_dir") is not None:
874
+ if not hasattr(self, "writer"):
875
+ self.writer = DatadirWriter(kwargs.get("output_dir"))
876
+ ibest_writer = self.writer[f"1best_recog"]
877
+
878
+ mask = yseq != self.blank_id
879
+ token_int = yseq[mask].tolist()
880
+
881
+ # Change integer-ids to tokens
882
+ text = tokenizer.decode(token_int)
883
+
884
+ result_i = {"key": key[i], "text": text}
885
+ results.append(result_i)
886
+
887
+ if ibest_writer is not None:
888
+ ibest_writer["text"][key[i]] = text
889
+
890
+ return results, meta_data
891
+
892
+ def export(self, **kwargs):
893
+ from .export_meta import export_rebuild_model
894
+
895
+ if "max_seq_len" not in kwargs:
896
+ kwargs["max_seq_len"] = 512
897
+ models = export_rebuild_model(model=self, **kwargs)
898
+ return models
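model.py above also exposes SenseVoiceSmall.from_pretrained and an inference method, so the model can be run without the Gradio front end. A minimal sketch, not part of the commit, assuming (as in FunASR's AutoModel convention) that the returned kwargs already carry the tokenizer, frontend, and device that inference reads; example/zh.mp3 is one of the demo clips added in this commit.

import torch
from model import SenseVoiceSmall

# from_pretrained wraps AutoModel.build_model with trust_remote_code=True (see above).
m, kwargs = SenseVoiceSmall.from_pretrained(model="iic/SenseVoiceSmall")
m.eval()

with torch.no_grad():
    results, meta = m.inference(
        data_in="example/zh.mp3",  # audio path; fbank tensors are also accepted
        language="auto",           # or "zh", "en", "yue", "ja", "ko", "nospeech"
        use_itn=True,              # request inverse text normalization (<|withitn|>)
        **kwargs,
    )

# inference() returns a list of {"key", "text"}; the text still contains <|...|> tags,
# which is why app.py post-processes it with format_str_v3.
print(results[0]["text"])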
requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ -i https://pypi.org/simple
2
+ funasr==1.0.28