In [153]:
import dataclasses
from urllib.parse import parse_qs, urlparse

import dotenv
import numpy as np
import openai
import pandas as pd
import requests
from tqdm.auto import tqdm, trange

In [12]:
# Load key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is where the OpenAI API key comes from — confirm.
dotenv.load_dotenv()

True

In [146]:
def get_youtube_title(url: str) -> str:
    """Look up the title of a YouTube video through the public oEmbed endpoint.

    Args:
        url: A YouTube video URL. ``watch?v=<id>`` URLs (with or without extra
            query parameters such as ``&t=30s``) and ``youtu.be/<id>`` short
            links are recognised; anything else falls back to taking the text
            after the last ``v=``.

    Returns:
        The video title, or the sentinel string ``"Error retrieving title"``
        when the request fails or the endpoint does not answer with HTTP 200.
    """
    parsed = urlparse(url)
    query_ids = parse_qs(parsed.query).get("v")
    if query_ids:
        # Only the `v` parameter is the id; ignore things like `&t=` or `&list=`.
        video_id = query_ids[0]
    elif (parsed.hostname or "").endswith("youtu.be"):
        # Short links carry the id as the path: https://youtu.be/<id>
        video_id = parsed.path.lstrip("/")
    else:
        # Unknown URL shape: keep the historical best-effort extraction.
        video_id = url.split("v=")[-1]

    watch_url = f"http://www.youtube.com/watch?v={video_id}"
    try:
        # `params` URL-encodes the nested watch URL so its own `?`/`&` cannot be
        # confused with oEmbed parameters; the timeout keeps one slow request
        # from hanging the whole title-collection loop.
        response = requests.get(
            "https://www.youtube.com/oembed",
            params={"url": watch_url, "format": "json"},
            timeout=10,
        )
    except requests.RequestException:
        return "Error retrieving title"

    if response.status_code == 200:
        return response.json()['title']
    return "Error retrieving title"

# video_url = "https://www.youtube.com/watch?v=4E2EbGoXlPQ"
# title = get_youtube_title(video_url)
# print(title)


In [154]:
# One video URL per row of links.csv, taken from its "URL" column.
links = pd.read_csv("links.csv")["URL"].tolist()
# Resolve every URL to its title, with a progress bar over the requests.
titles = list(map(get_youtube_title, tqdm(links)))

  0%|          | 0/17 [00:00<?, ?it/s]

In [10]:
# Raw WebVTT transcript of every video; transcripts/<i>.vtt belongs to links[i].
episodes = []

# Derive the number of transcripts from the link list instead of hard-coding it.
for i in range(len(links)):
    filename = f"transcripts/{i}.vtt"
    # WebVTT files are UTF-8 by specification; do not depend on the platform's
    # default text encoding when reading them.
    with open(filename, "r", encoding="utf-8") as file:
        episodes.append(file.read())

In [26]:
import tiktoken
import math

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens `string` occupies under the named tiktoken encoding."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

def required_chunks(text: str, max_tokens: int = 8191, encoding_name: str = "cl100k_base") -> int:
    """Return how many pieces of at most `max_tokens` tokens are needed to hold `text`.

    This is the token count divided by `max_tokens`, rounded up, so a text
    without any tokens needs zero chunks.
    """
    total_tokens = num_tokens_from_string(text, encoding_name)
    full_chunks, leftover = divmod(total_tokens, max_tokens)
    # A non-empty remainder needs one extra, partially filled chunk.
    return full_chunks + 1 if leftover else full_chunks

def split_in_chunks(text: str, max_tokens: int = 8191, encoding_name: str = "cl100k_base") -> list[str]:
    """Split a text into the fewest chunks of (nearly) equal token count.

    The text is tokenised once, the minimum number of chunks that keeps every
    chunk within `max_tokens` is computed, and the tokens are then spread evenly
    over that many chunks (only the last chunk may be shorter).

    Args:
        text: The text to split.
        max_tokens: Upper bound on the number of tokens in any chunk.
        encoding_name: Name of the tiktoken encoding used for tokenisation.

    Returns:
        The chunk strings in document order; an empty list for text that
        produces no tokens.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(text)
    if not tokens:
        # Nothing to split; also avoids dividing by a chunk count of zero.
        return []

    num_chunks = math.ceil(len(tokens) / max_tokens)
    chunk_size = math.ceil(len(tokens) / num_chunks)

    # Decode each slice as a whole: decoding token by token can mangle characters
    # whose UTF-8 bytes span several tokens. A character that straddles a chunk
    # boundary can still be split, which is inherent to cutting on token counts.
    return [
        encoding.decode(tokens[start:start + chunk_size])
        for start in range(0, len(tokens), chunk_size)
    ]
    

[2.1432059577585156,
 0.279819313881089,
 3.282505188621658,
 2.076791600537175,
 2.153216945427909,
 1.9267488707117568,
 1.808448296911244,
 1.7863508729092907,
 2.1497985593944575,
 1.8603345134904163,
 1.9416432670003663,
 2.266634110609205,
 1.9544622146258088,
 0.2497863508729093,
 0.5919912098644854,
 2.005005493834697,
 0.2887315346111586]

In [72]:
# Token-balanced chunks per episode: episode_chunks[i][j] is chunk j of episodes[i].
# Iterating `episodes` directly avoids hard-coding how many transcripts there are.
episode_chunks = [split_in_chunks(episode) for episode in episodes]

In [130]:
# Human-readable label per chunk, in the same episode-major order as the chunks
# themselves; the episode count comes from the data rather than a literal.
chunk_labels = [
    f"Episode {i} - Chunk {j}"
    for i, chunks_in_episode in enumerate(episode_chunks)
    for j in range(len(chunks_in_episode))
]

In [170]:
@dataclasses.dataclass
class Chunk:
    """One chunk of an episode transcript together with metadata about its source video."""
    # Transcript text contained in this chunk.
    text: str
    # Title of the video the chunk was taken from.
    title: str
    # Position of that video in the list of links/transcripts.
    video_idx: int
    # Embedding vector for `text`, or None when none has been attached.
    embedding: np.ndarray | None
    # URL of the source video.
    link: str

In [171]:
# chunk_metadata = [
#     {
#         "title": titles[i],
#         "video_idx": i,
#         "chunk_idx": j,
#         "text": episode_chunks[i][j],
#         "link": links[i],
#     }
#     for i in range(17)
#     for j in range(len(episode_chunks[i]))
# ]

# One Chunk record per transcript chunk, in episode-major order (all chunks of
# video 0, then video 1, ...). Walking `episode_chunks` directly removes the
# hard-coded episode count; titles and links are looked up by video index.
chunk_metadata = [
    Chunk(
        title=titles[video_idx],
        video_idx=video_idx,
        text=chunk_text,
        link=links[video_idx],
        embedding=None
    )
    for video_idx, chunks_in_episode in enumerate(episode_chunks)
    for chunk_text in chunks_in_episode
]

In [172]:
# Flat list of all chunk texts. It is flattened from `episode_chunks` instead of
# re-tokenising every transcript, which also guarantees that chunks[k] is the
# text of chunk_metadata[k] (both follow the same episode-major order).
chunks = [chunk_text for chunks_in_episode in episode_chunks for chunk_text in chunks_in_episode]

In [173]:
# Token count of every chunk under cl100k_base, in the same order as `chunks`.
chunk_token_counts = list(map(lambda piece: num_tokens_from_string(piece, "cl100k_base"), chunks))

In [79]:
from openai import OpenAI
# Shared API client used by the embedding helpers and the chat completion call below.
# NOTE(review): no api_key is passed, so credentials presumably come from the environment — confirm.
client = OpenAI()

In [ ]:
def get_batch_embeddings(texts: list[str], model="text-embedding-3-small") -> np.ndarray:
    """Embed several texts with a single embeddings request.

    Returns an array built from the response's `data` list, one row per entry
    and in the order the entries appear in the response.
    """
    response = client.embeddings.create(input = texts, model=model)
    vectors = [item.embedding for item in response.data]
    return np.array(vectors)

def get_one_embedding(text: str, model="text-embedding-3-small") -> np.ndarray:
    """Embed a single text and return its vector as a NumPy array."""
    response = client.embeddings.create(input = [text], model=model)
    first_item = response.data[0]
    return np.array(first_item.embedding)

In [174]:
# Embed every chunk in one request. The rows are assumed to follow the order of
# `chunks`, which is what lets a row index be used to look up chunk_metadata later.
embeddings = get_batch_embeddings(chunks)

In [103]:
# Embed an example question with the same default model as the chunk embeddings.
q_embedding = get_one_embedding("Which guest worked at Abercrombie and Fitch?")

In [180]:
# Inspect the embedding matrix; NumPy abbreviates large arrays in the displayed repr.
embeddings

array([[ 0.05489639, -0.03947796, -0.00708231, ..., -0.0492712 ,
        -0.0219755 ,  0.00376565],
       [ 0.04212333, -0.04137598, -0.01890454, ..., -0.04766051,
        -0.01154145,  0.00671765],
       [ 0.07459503, -0.04596259, -0.05516139, ..., -0.03984053,
        -0.00084816, -0.00865723],
       ...,
       [ 0.07094361, -0.04828836, -0.03882921, ..., -0.03272748,
        -0.00387197,  0.00732427],
       [ 0.05223813, -0.03542471, -0.05290401, ..., -0.03595741,
         0.00376637, -0.01817847],
       [ 0.02517771, -0.03395098, -0.05561592, ..., -0.00542056,
         0.00621656, -0.00047452]])

In [181]:
def cosine_similarity(query: np.ndarray, embeddings: np.ndarray) -> np.ndarray:
    """Cosine similarity between one query vector and every row of a matrix.

    Args:
        query: 1-D vector.
        embeddings: 2-D array with one candidate vector per row.

    Returns:
        1-D array whose k-th entry is the cosine of the angle between `query`
        and row k of `embeddings`.
    """
    row_norms = np.linalg.norm(embeddings, axis=1)
    # Product of the two vector lengths for every row.
    scale = np.linalg.norm(query) * row_norms
    return np.dot(embeddings, query) / scale

In [182]:
# Similarity of the example question to every chunk embedding (one score per row).
similarities = cosine_similarity(q_embedding, embeddings)

In [191]:
# Row index of the chunk whose embedding scores highest against the question.
best_chunk_idx = np.argmax(similarities)

In [193]:
# Show the metadata record of the best-scoring chunk.
chunk_metadata[best_chunk_idx]

Chunk(text="Title:\n\n1\n00:00:00.380 --> 00:00:11.240\noh I know oh no no no\n\n2\n00:00:06.460 --> 00:00:11.240\n[Music]\n\n3\n00:00:13.200 --> 00:00:16.440\nhey what's going on everybody for first\n\n4\n00:00:14.820 --> 00:00:18.240\nweek Feast I'm Sean Evans and you're\n\n5\n00:00:16.440 --> 00:00:19.980\nwatching hot ones it's the show with hot\n\n6\n00:00:18.240 --> 00:00:21.779\nquestions and even hotter wings and\n\n7\n00:00:19.980 --> 00:00:23.520\ntoday we're joined by Kid Cudi he's a\n\n8\n00:00:21.779 --> 00:00:25.019\nGrammy award-winning artist multi-hyphen\n\n9\n00:00:23.520 --> 00:00:26.820\nEntertainer and true to form he just\n\n10\n00:00:25.019 --> 00:00:28.619\ndropped a brand new album accompanied by\n\n11\n00:00:26.820 --> 00:00:30.599\nan animated TV special of the same name\n\n12\n00:00:28.619 --> 00:00:32.160\nit's called Intergalactic check out the\n\n13\n00:00:30.599 --> 00:00:34.020\nalbum wherever you get your music and\n\n14\n00:00:32.160 --> 00:00:36.300\

In [203]:
# Show the very first chunk record for comparison.
chunk_metadata[0]

Chunk(text="WEBVTT\n\n1\n00:00:00.900 --> 00:00:03.900\nwow\n\n2\n00:00:04.440 --> 00:00:07.700\nthis one's a winner\n\n3\n00:00:13.880 --> 00:00:17.820\nhey what's going on everybody for first\n\n4\n00:00:16.139 --> 00:00:19.619\nweek Feast I'm Sean Evans and you're\n\n5\n00:00:17.820 --> 00:00:21.359\nwatching hot ones it's the show with hot\n\n6\n00:00:19.619 --> 00:00:23.279\nquestions and even hotter wings and\n\n7\n00:00:21.359 --> 00:00:24.539\ntoday we're joined by David Blaine he's\n\n8\n00:00:23.279 --> 00:00:26.100\nknown the world over for his street\n\n9\n00:00:24.539 --> 00:00:27.720\nmagic and endurance stunts that include\n\n10\n00:00:26.100 --> 00:00:29.400\neverything from being buried alive for\n\n11\n00:00:27.720 --> 00:00:30.720\nseven days to encasing himself in a\n\n12\n00:00:29.400 --> 00:00:33.059\nblock of ice to holding his breath\n\n13\n00:00:30.720 --> 00:00:34.620\nunderwater for 17 minutes he also has a\n\n14\n00:00:33.059 --> 00:00:36.239\npair of high p

In [213]:
# System message sent with every chat request (see get_initial_messages).
SYSTEM_PROMPT = """You are an expert trivia bot. You provide correct answers to the posed question, based on the provided context. You have access to a chunk of text from a video transcript, and you can use this information to answer the question. You also have access to some metadata about this chunk of text. All transcripts come from the podcast "Hot Ones", Season 19, which has a total of 17 episodes. With each answer, provide a link with a timestamp to the relevant part of the video based on the transcript."""


# Template for the user message; get_initial_messages fills the placeholders via
# str.format. The trailing .strip() drops the newlines that wrap the literal.
BASE_PROMPT = """
Question: {question}

Relevant chunk of text: {text}
This text comes from the video titled "{title}", which is the video number {video_number} in the dataset and can be found at the following link: {link}.
""".strip()

def get_initial_messages(question: str, chunk: Chunk):
    """Build the two-message conversation (system + user) for one question.

    The user message is BASE_PROMPT filled in with the question and with the
    text, title, video index and link of `chunk`; the system message carries
    SYSTEM_PROMPT unchanged.
    """
    user_text = BASE_PROMPT.format(
        question=question,
        text=chunk.text,
        title=chunk.title,
        video_number=chunk.video_idx,
        link=chunk.link,
    )
    system_message = {
        "role": "system",
        "content": [{"type": "text", "text": SYSTEM_PROMPT}],
    }
    user_message = {
        "role": "user",
        "content": [{"type": "text", "text": user_text}],
    }
    return [system_message, user_message]

In [225]:
def rank_chunks(question: str, embeddings: np.ndarray, model: str = "text-embedding-3-small") -> list[Chunk]:
    """Return the records of the module-level `chunk_metadata` ordered from most to least similar to `question`.

    The question is embedded with `model`, scored against every row of
    `embeddings`, and each row index is used to pick the matching record.
    """
    question_vector = get_one_embedding(question, model)
    scores = cosine_similarity(question_vector, embeddings)

    # argsort is ascending, so walk it back to front to get best-first order.
    ranked = []
    for idx in np.argsort(scores)[::-1]:
        ranked.append(chunk_metadata[idx])
    return ranked

In [233]:
# Rank all chunks against a second example question, best match first.
ranked_chunks = rank_chunks("Who failed making pastries as a teenager?", embeddings)

In [249]:
# Print the raw transcript text of the chunk at rank index 8 for manual inspection.
print(ranked_chunks[8].text)

 so fast and it's a

420
00:16:49.980 --> 00:16:54.180
little trippy it gets a little trippy

421
00:16:52.139 --> 00:16:57.000
okay

422
00:16:54.180 --> 00:16:59.100
I'm like fully serving but I want you to

423
00:16:57.000 --> 00:17:01.079
know and I want everyone back here to

424
00:16:59.100 --> 00:17:03.799
know

425
00:17:01.079 --> 00:17:05.360
I'm not scared so you shouldn't be sad

426
00:17:03.799 --> 00:17:08.600
okay

427
00:17:05.360 --> 00:17:08.600
I'm okay

428
00:17:09.179 --> 00:17:15.299
but I I'm like gushing tears you have

429
00:17:13.079 --> 00:17:16.980
just it's like the one

430
00:17:15.299 --> 00:17:18.780
like it's like out of them believe them

431
00:17:16.980 --> 00:17:20.339
it's out of a blue like this is a part

432
00:17:18.780 --> 00:17:21.959
of the [ __ ] hot one Spiritual

433
00:17:20.339 --> 00:17:23.699
Awakening

434
00:17:21.959 --> 00:17:25.880
good

435
00:17:23.699 --> 00:17:28.679
foreign

436
00:17:25.880 --> 00:17:31.740
like I'm h

In [250]:
# Show the video titles in link order (index = video_idx).
titles

['David Blaine Does Magic While Eating Spicy Wings | Hot Ones',
 'Puss in Boots Can’t Feel His Tail While Eating Spicy Wings | Hot Ones',
 'Will Ferrell Brings the Spirit to the Hot Ones Holiday Extravaganza | Hot Ones',
 'Emma Chamberlain Has a Spiritual Awakening While Eating Spicy Wings | Hot Ones',
 'Israel Adesanya Gives Thanks While Eating Spicy Wings | Hot Ones',
 'Viola Davis Gives a Master Class While Eating Spicy Wings | Hot Ones',
 "Cate Blanchett Pretends No One's Watching While Eating Spicy Wings | Hot Ones",
 'Kid Cudi Goes to the Moon While Eating Spicy Wings | Hot Ones',
 'James Corden Experiences Mouth Karma While Eating Spicy Wings | Hot Ones',
 'Zoe Saldaña Gets Scorched By Spicy Wings | Hot Ones',
 'Kate Hudson Stays Positive While Eating Spicy Wings | Hot Ones',
 'Paul Dano Needs a Burp Cloth While Eating Spicy Wings | Hot Ones',
 'Cole Bennett Needs Lemonade While Eating Spicy Wings | Hot Ones',
 'The Best Da Bomb Reactions of 2022 | Hot Ones',
 'Sean Evans Reveal

In [242]:
# Source-video title of every ranked chunk, best match first. Iterating the list
# itself includes the final chunk (a fixed range(37) stopped one short of the 38
# chunks) and cannot raise IndexError when the number of chunks changes.
[chunk.title for chunk in ranked_chunks]

['Will Ferrell Brings the Spirit to the Hot Ones Holiday Extravaganza | Hot Ones',
 'Israel Adesanya Gives Thanks While Eating Spicy Wings | Hot Ones',
 'David Blaine Does Magic While Eating Spicy Wings | Hot Ones',
 'The Best Last Dab Reactions of 2022 | Hot Ones',
 'Ramy Youssef Lives on a Prayer While Eating Spicy Wings | Hot Ones',
 'David Blaine Does Magic While Eating Spicy Wings | Hot Ones',
 'Viola Davis Gives a Master Class While Eating Spicy Wings | Hot Ones',
 "Cate Blanchett Pretends No One's Watching While Eating Spicy Wings | Hot Ones",
 'Emma Chamberlain Has a Spiritual Awakening While Eating Spicy Wings | Hot Ones',
 'Paul Dano Needs a Burp Cloth While Eating Spicy Wings | Hot Ones',
 'Puss in Boots Can’t Feel His Tail While Eating Spicy Wings | Hot Ones',
 'Kate Hudson Stays Positive While Eating Spicy Wings | Hot Ones',
 'Ramy Youssef Lives on a Prayer While Eating Spicy Wings | Hot Ones',
 "Cate Blanchett Pretends No One's Watching While Eating Spicy Wings | Hot Ones

In [217]:
# Build the prompt from the chunk retrieved for this same question above
# (best_chunk_idx) instead of a hard-coded index that goes stale as soon as the
# chunking or the embeddings change.
messages = get_initial_messages("Which guest worked at Abercrombie and Fitch?", chunk_metadata[best_chunk_idx])

In [218]:
# Send the system + user messages built above to the chat model.
response = client.chat.completions.create(
  model="gpt-4o",
  messages=messages
)

In [224]:
# Text of the first (and here only requested) completion choice.
response.choices[0].message.content

'The guest who worked at Abercrombie and Fitch is Kid Cudi. You can find the relevant discussion in the video at this link: [Kid Cudi on Hot Ones](https://www.youtube.com/watch?v=0allwd60wS4&t=569s).'

In [184]:
# Chunk indices ordered from highest to lowest similarity (argsort is ascending, hence the reversal).
np.argsort(similarities)[::-1] 

array([18, 30, 34,  1, 29,  8,  4, 10,  2, 33, 24, 17, 20, 28, 19, 35, 15,
        7, 14,  5, 37, 22,  6, 36,  0, 26, 31, 16, 23, 25, 27, 12, 21, 13,
        9,  3, 32, 11])