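# app.py: Whisper speech-to-text with pyannote speaker diarization,
# served as a two-tab Gradio app (transcript plus meeting summary).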
import whisper
import gradio as gr
import datetime
import subprocess
import torch
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
from pyannote.audio import Audio
from pyannote.core import Segment
import wave
import contextlib
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Most recent transcript; the summary tab reads this module-level state.
output = ""
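
# Load models once at startup: Whisper for transcription, and a speechbrain
# ECAPA-TDNN model that produces 192-dimensional speaker embeddings.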
model = whisper.load_model("large-v2")
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu')
)
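
# Full pipeline: convert the upload to wav, transcribe with Whisper, then
# cluster per-segment speaker embeddings to assign speaker labels.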
def audio_to_text(audio, num_speakers):
    path, error = convert_to_wav(audio)
    if error is not None:
        return error
    duration = get_duration(path)
    if duration > 4 * 60 * 60:
        return "Audio duration too long (limit is 4 hours)"
    result = model.transcribe(path)
    segments = result["segments"]
    # Clamp the requested speaker count to [1, number of segments].
    num_speakers = min(max(round(num_speakers), 1), len(segments))
    if len(segments) == 1:
        segments[0]['speaker'] = 'SPEAKER 1'
    else:
        embeddings = make_embeddings(path, segments, duration)
        add_speaker_labels(segments, embeddings, num_speakers)
    global output
    output = get_output(segments)
    return output
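
# Convert non-wav uploads with ffmpeg; returns (path, error) so the caller
# can surface conversion failures to the UI.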
def convert_to_wav(path):
    if not path.endswith('.wav'):
        new_path = '.'.join(path.split('.')[:-1]) + '.wav'
        try:
            subprocess.call(['ffmpeg', '-i', path, new_path, '-y'])
        except Exception:
            return path, 'Error: Could not convert file to .wav'
        path = new_path
    return path, None
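
# Duration in seconds, computed from the wav header.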
def get_duration(path):
    with contextlib.closing(wave.open(path, 'r')) as f:
        frames = f.getnframes()
        rate = f.getframerate()
        return frames / float(rate)
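
# Build one 192-dim embedding per Whisper segment; nan_to_num zeroes any
# NaNs so downstream clustering never sees invalid values.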
def make_embeddings(path, segments, duration):
    embeddings = np.zeros(shape=(len(segments), 192))
    for i, segment in enumerate(segments):
        embeddings[i] = segment_embedding(path, segment, duration)
    return np.nan_to_num(embeddings)
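
# Shared pyannote helper for loading and cropping audio files.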
audio = Audio()
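
# Embed a single segment: crop its waveform, add a batch dimension with
# [None], and run the speaker-embedding model.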
def segment_embedding(path, segment, duration):
    start = segment["start"]
    # Whisper overshoots the end timestamp in the last segment
    end = min(duration, segment["end"])
    clip = Segment(start, end)
    waveform, sample_rate = audio.crop(path, clip)
    return embedding_model(waveform[None])
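
# Agglomerative clustering groups the embeddings into num_speakers clusters;
# each segment is labelled with its cluster index.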
def add_speaker_labels(segments, embeddings, num_speakers):
    clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
    labels = clustering.labels_
    for i in range(len(segments)):
        segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)
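
# Format a timestamp in seconds as H:MM:SS.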
def time(secs):
    return datetime.timedelta(seconds=round(secs))
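
# Render labelled segments as a transcript, with a speaker heading and
# timestamp whenever the speaker changes.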
def get_output(segments):
    out = ''
    for (i, segment) in enumerate(segments):
        # Start a new block whenever the speaker changes.
        if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
            if i != 0:
                out += '\n\n'
            out += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n'
        # Whisper prefixes segment text with a space; drop it.
        out += segment["text"][1:] + ' '
    return out
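
# Summarize the last transcript with a BART-based meeting-summary model.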
def text_to_short_summary():
    # Imported lazily so the heavy transformers stack loads only when needed.
    from transformers import pipeline
    summarizer = pipeline("summarization", model="knkarthick/MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM")
    if not output:
        return "No transcript yet; run a transcription in the first tab."
    # The pipeline returns [{'summary_text': ...}]; unwrap it for the Textbox.
    return summarizer(output)[0]['summary_text']
###########################################################################
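# Gradio UI: one tab for transcription, one for summarization.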
app1 = gr.Interface(
    title='AI Voice to Text',
    fn=audio_to_text,
    inputs=[
        gr.Audio(source="upload", type="filepath"),
        gr.Number(value=2, label="Number of Speakers")
    ],
    outputs=gr.Textbox(label='Transcript')
)
app2 = gr.Interface(
    fn=text_to_short_summary,
    inputs=None,
    outputs=gr.Textbox(label='Summary')
)
demo = gr.TabbedInterface([app1, app2], ["Transcript", "Short Summary"])
demo.launch()