import whisper import gradio as gr import datetime import subprocess import torch import pyannote.audio from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding from pyannote.audio import Audio from pyannote.core import Segment import wave import contextlib from sklearn.cluster import AgglomerativeClustering import numpy as np from transformers import AutoTokenizer, AutoModelForSeq2SeqLM ###################################################### def audio_to_text(audio, num_speakers): path, error = convert_to_wav(audio) if error is not None: return error duration = get_duration(path) if duration > 4 * 60 * 60: return "Audio duration too long" result = model.transcribe(path) segments = result["segments"] num_speakers = min(max(round(num_speakers), 1), len(segments)) if len(segments) == 1: segments[0]['speaker'] = 'SPEAKER 1' else: embeddings = make_embeddings(path, segments, duration) add_speaker_labels(segments, embeddings, num_speakers) output = get_output(segments) text_to_short_summary(output) return output def convert_to_wav(path): if path[-3:] != 'wav': new_path = '.'.join(path.split('.')[:-1]) + '.wav' try: subprocess.call(['ffmpeg', '-i', path, new_path, '-y']) except: return path, 'Error: Could not convert file to .wav' path = new_path return path, None def get_duration(path): with contextlib.closing(wave.open(path,'r')) as f: frames = f.getnframes() rate = f.getframerate() return frames / float(rate) def make_embeddings(path, segments, duration): embeddings = np.zeros(shape=(len(segments), 192)) for i, segment in enumerate(segments): embeddings[i] = segment_embedding(path, segment, duration) return np.nan_to_num(embeddings) audio = Audio() def segment_embedding(path, segment, duration): start = segment["start"] # Whisper overshoots the end timestamp in the last segment end = min(duration, segment["end"]) clip = Segment(start, end) waveform, sample_rate = audio.crop(path, clip) return embedding_model(waveform[None]) def add_speaker_labels(segments, embeddings, num_speakers): clustering = AgglomerativeClustering(num_speakers).fit(embeddings) labels = clustering.labels_ for i in range(len(segments)): segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1) def time(secs): return datetime.timedelta(seconds=round(secs)) def get_output(segments): output = '' for (i, segment) in enumerate(segments): if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]: if i != 0: output += '\n\n' output += segment["speaker"] + ' ' + str(time(segment["start"])) + '\n\n' output += segment["text"][1:] + ' ' return output ######################################################################### def text_to_short_summary(text): #tokenizer = AutoTokenizer.from_pretrained("knkarthick/MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM") #model = AutoModelForSeq2SeqLM.from_pretrained("knkarthick/MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM") from transformers import pipeline summarizer = pipeline("summarization", model="knkarthick/MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM") return summarizer(text) ########################################################################### app1=gr.Interface( title = 'AI Voice to Text', fn=audio_to_text, inputs=[ gr.inputs.Audio(source="upload", type="filepath"), gr.inputs.Number(default=2, label="Number of Speakers") ], outputs=[ gr.outputs.Textbox(label='Transcript') ] ) app2=gr.Interface(fn=text_to_short_summary, inputs=gr.Textbox(label="Enter"), outputs=gr.Textbox(label="Output")) demo = gr.TabbedInterface([app1, app2], ["Text", "Detailed_Summary"]) demo.launch()