File size: 4,418 Bytes
ff1e37d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fe075ed
ff1e37d
a65ae92
 
 
 
fe075ed
4ace62c
91a603c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
930ae9b
 
4ace62c
91a603c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cc148c5
91a603c
 
 
cc148c5
 
 
 
7d7e2c1
 
9c96b66
7d7e2c1
 
 
b4b2f4e
 
4ace62c
7d7e2c1
e3759cc
7d7e2c1
91a603c
 
 
17056be
91a603c
 
 
 
 
 
4ace62c
1127158
91a603c
 
 
4ace62c
 
304fe97
4ace62c
 
86459b3
4ace62c
86459b3
4ace62c
91a603c
 
 
 
 
 
 
ff1e37d
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import whisper
import gradio as gr
import datetime

import subprocess

import torch
import pyannote.audio
from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding

from pyannote.audio import Audio
from pyannote.core import Segment

import wave
import contextlib

from sklearn.cluster import AgglomerativeClustering
import numpy as np
# Most recent transcript. audio_to_text() stores its result here and
# text_to_short_summary() reads it, so it needs a defined default before
# either function runs.
output = ""

# Whisper checkpoint used by audio_to_text(); loaded once when the module runs.
model = whisper.load_model("large-v2")

# Speaker-embedding model used by segment_embedding(). It is placed on CUDA
# when torch reports a usable device and on the CPU otherwise.
embedding_model = PretrainedSpeakerEmbedding(
    "speechbrain/spkrec-ecapa-voxceleb",
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
)

def audio_to_text(audio, num_speakers):
    """Transcribe an audio file and tag every segment with a speaker label.

    Args:
        audio: Path of the audio file to transcribe. ``None`` is accepted and
            reported as an error instead of crashing.
        num_speakers: Requested number of speakers. It is rounded and clamped
            to the range ``1 .. number of transcribed segments``.

    Returns:
        The formatted transcript, or a human-readable message when the input
        is missing, cannot be converted, is longer than four hours, or
        contains no transcribable speech.

    Side effects:
        Stores the transcript in the module-level ``output`` variable (an
        empty string when nothing was transcribed).
    """
    global output

    # No file supplied: slicing None in convert_to_wav() would raise.
    if audio is None:
        return "Error: No audio file provided"

    path, error = convert_to_wav(audio)
    if error is not None:
        return error

    duration = get_duration(path)
    if duration > 4 * 60 * 60:
        return "Audio duration too long"

    segments = model.transcribe(path)["segments"]

    # With zero segments the clamp below would yield 0 speakers and the
    # clustering step would be asked for 0 clusters.
    if not segments:
        output = ""
        return "No speech detected in the audio"

    num_speakers = min(max(round(num_speakers), 1), len(segments))
    if num_speakers == 1:
        # A single cluster puts every segment under the same label, so the
        # embedding and clustering passes can be skipped entirely. This also
        # covers the one-segment case.
        for segment in segments:
            segment['speaker'] = 'SPEAKER 1'
    else:
        embeddings = make_embeddings(path, segments, duration)
        add_speaker_labels(segments, embeddings, num_speakers)

    output = get_output(segments)
    return output

def convert_to_wav(path):
    """Make sure the audio at ``path`` is available as a ``.wav`` file.

    Files whose extension is already ``.wav`` (in any letter case) are
    returned untouched. Anything else is converted with ``ffmpeg`` into a
    sibling file that has the same stem and a ``.wav`` extension.

    Args:
        path: Path of the input audio file.

    Returns:
        A ``(path, error)`` tuple. On success ``path`` is the ``.wav`` file
        and ``error`` is ``None``. On failure ``path`` is the original input
        path and ``error`` is a message string.
    """
    import os

    stem, extension = os.path.splitext(path)
    if extension.lower() == '.wav':
        return path, None

    new_path = stem + '.wav'
    try:
        # check=True turns a non-zero ffmpeg exit status into an exception;
        # without it a failed conversion would be reported as a success.
        # '-y' is placed before the files so it is not a trailing option, and
        # stdin is detached so ffmpeg can never wait for keyboard input.
        subprocess.run(
            ['ffmpeg', '-y', '-i', path, new_path],
            check=True,
            stdin=subprocess.DEVNULL,
        )
    except (OSError, subprocess.CalledProcessError):
        # OSError: ffmpeg could not be started (e.g. it is not installed).
        # CalledProcessError: ffmpeg ran but reported a failure.
        return path, 'Error: Could not convert file to .wav'
    return new_path, None

def get_duration(path):
    """Return the playing time, in seconds, of the WAV file at ``path``.

    The value is the file's frame count divided by its frame rate.
    """
    with wave.open(path, 'rb') as wav_file:
        frame_count = wav_file.getnframes()
        frame_rate = wav_file.getframerate()
    return frame_count / float(frame_rate)

def make_embeddings(path, segments, duration):
    """Build one embedding row per transcript segment.

    Args:
        path: Audio file the segments were transcribed from.
        segments: Sequence of segment records; each one is handed to
            segment_embedding().
        duration: Length of the audio, forwarded to segment_embedding().

    Returns:
        An array of shape ``(len(segments), 192)`` in which any NaN produced
        by the embedding step has been replaced via ``np.nan_to_num``.
    """
    matrix = np.zeros(shape=(len(segments), 192))
    for row, item in enumerate(segments):
        matrix[row] = segment_embedding(path, item, duration)
    cleaned = np.nan_to_num(matrix)
    return cleaned

# Shared pyannote Audio helper; segment_embedding() uses it to crop clips.
audio = Audio()

def segment_embedding(path, segment, duration):
    """Compute the speaker embedding for one transcript segment.

    Args:
        path: Audio file to crop from.
        segment: Mapping with ``"start"`` and ``"end"`` timestamps.
        duration: Length of the audio; used as an upper bound for the end
            timestamp.

    Returns:
        Whatever ``embedding_model`` returns for the cropped clip.
    """
    # Whisper overshoots the end timestamp in the last segment
    clip_end = min(duration, segment["end"])
    window = Segment(segment["start"], clip_end)
    waveform, _sample_rate = audio.crop(path, window)
    # Indexing with None prepends an axis (presumably the batch dimension the
    # embedding model expects - confirm against the pyannote version in use).
    batch = waveform[None]
    return embedding_model(batch)

def add_speaker_labels(segments, embeddings, num_speakers):
    """Cluster the embeddings and write a speaker label into each segment.

    Every segment gets a ``"speaker"`` entry of the form ``'SPEAKER <k>'``,
    where ``k`` is the segment's cluster label plus one. The segments are
    modified in place and nothing is returned.
    """
    model_fit = AgglomerativeClustering(num_speakers).fit(embeddings)
    cluster_ids = model_fit.labels_
    for index, entry in enumerate(segments):
        speaker_number = cluster_ids[index] + 1
        entry["speaker"] = 'SPEAKER ' + str(speaker_number)

def time(secs):
    """Convert a number of seconds into a timedelta rounded to whole seconds."""
    whole_seconds = round(secs)
    return datetime.timedelta(seconds=whole_seconds)

def get_output(segments):
    """Render speaker-labelled segments as a readable transcript.

    A header line ``'<speaker> <h:mm:ss>'`` followed by a blank line is
    written whenever the speaker changes (and before the first segment).
    Consecutive speaker turns are separated by a blank line. Each segment's
    text is followed by a single space.

    Args:
        segments: Sequence of mappings with ``"speaker"``, ``"start"`` and
            ``"text"`` entries.

    Returns:
        The transcript as one string; an empty string for no segments.
    """
    pieces = []
    previous_speaker = None
    for position, segment in enumerate(segments):
        speaker = segment["speaker"]
        if position == 0 or speaker != previous_speaker:
            if position != 0:
                pieces.append('\n\n')
            # Formatted inline so this block does not depend on the
            # module-level helper named `time`, which is easy to confuse with
            # the standard-library module of the same name.
            stamp = datetime.timedelta(seconds=round(segment["start"]))
            pieces.append(f'{speaker} {stamp}\n\n')
        # Strip leading whitespace rather than blindly dropping the first
        # character: text that does not start with a space would otherwise
        # lose its first letter.
        pieces.append(segment["text"].lstrip() + ' ')
        previous_speaker = speaker
    return ''.join(pieces)


def text_to_short_summary():
    """Summarize the most recent transcript.

    Reads the module-level ``output`` string and runs it through the
    ``knkarthick/MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM``
    summarization pipeline.

    Returns:
        The summary text, or a short message when there is no transcript to
        summarize yet.
    """
    # Look the transcript up defensively: the global may not exist (or may be
    # empty) when no transcription has been run yet.
    transcript = globals().get("output", "")
    if not isinstance(transcript, str) or not transcript.strip():
        return "No transcript available. Please transcribe an audio file first."

    # Building the pipeline loads the model, so do it once and keep it on the
    # function object for later calls.
    summarizer = getattr(text_to_short_summary, "_summarizer", None)
    if summarizer is None:
        from transformers import pipeline
        summarizer = pipeline("summarization", model="knkarthick/MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM")
        text_to_short_summary._summarizer = summarizer

    # truncation=True keeps transcripts longer than the model's input limit
    # from failing. The pipeline returns a list with one dict per input;
    # hand back only the summary text so the UI shows plain prose.
    result = summarizer(transcript, truncation=True)
    return result[0]["summary_text"]

###########################################################################


# Transcription tab: an uploaded audio file and a speaker count go in, the
# transcript produced by audio_to_text() comes out.
app1 = gr.Interface(
    fn=audio_to_text,
    title='AI Voice to Text',
    inputs=[
        gr.inputs.Audio(source="upload", type="filepath"),
        gr.inputs.Number(default=2, label="Number of Speakers"),
    ],
    outputs=[gr.outputs.Textbox(label='Transcript')],
)

# Summary tab: takes no inputs and shows what text_to_short_summary() returns.
app2 = gr.Interface(
    fn=text_to_short_summary,
    inputs=None,
    outputs=[gr.outputs.Textbox()],
)

# Combine both interfaces into one app with the tabs "Text" and "Short_sum".
demo=gr.TabbedInterface([app1,app2],["Text","Short_sum"])

# Launch the Gradio app as soon as this module is executed.
demo.launch()