# Alternative way to fetch subtitles only: yt-dlp --write-subs --skip-download [youtube_url]
from pinecone import Pinecone
from pinecone import ServerlessSpec
from youtube_transcript_api import YouTubeTranscriptApi
import os
from dotenv import load_dotenv, find_dotenv
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

_ = load_dotenv(find_dotenv())
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')

# Get youtube ids
def get_youtube_ids(route):
    yt_ids = []
    with open(route, 'r') as file:
        for line in file:
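            # assumes each line is a full watch URL such as "https://www.youtube.com/watch?v=<id>"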
            yt_ids.append(line.split('=')[1].strip())
    return yt_ids

# Get clean transcriptions
def get_clean_transcriptions(yt_ids):
    # get_transcripts returns (transcripts_by_id, failed_ids); we only need the first element
    trans_bruto = YouTubeTranscriptApi.get_transcripts(yt_ids, languages=['es', 'en'])
    # join every snippet of a video's transcript into a single string
    return {k: " ".join(d['text'] for d in v) for k, v in trans_bruto[0].items()}

# Create index
def create_index():
    pc = Pinecone(api_key=PINECONE_API_KEY)
    cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
    region = os.environ.get('PINECONE_REGION') or 'us-east-1'
    spec = ServerlessSpec(cloud=cloud, region=region)
    index_name = "youtube-videos"
    if index_name not in pc.list_indexes().names():
        # create the index if it does not exist
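        # dimension=768 matches the embedding size of the mpnet-base retriever loaded below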
        pc.create_index(index_name, dimension=768, metric="cosine", spec=spec)
    # connect to index we created
    index = pc.Index(index_name)
    return pc, index

# Load retriever model
def load_retriever():
    # set device to GPU if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # load the retriever model from huggingface model hub
    retriever = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base', device=device)
    return retriever

# Create embeddings and upsert them into the index
def create_embeddings(dicc, index, retriever):
    # Passage id
    p_id = 0
    # Iterate over transcriptions
    for yt_id, transcription in dicc.items():
        # Split the transcription into fixed-size passages of 1,000 characters
        passages = [transcription[i:i+1000] for i in range(0, len(transcription), 1000)]
        # For each passage, create an embedding and upsert it into the index
        for passage in tqdm(passages):
            emb = retriever.encode(passage, convert_to_tensor=True)
            meta = {'yt_id': yt_id, 'passage_text': passage}
            to_upsert = [(str(p_id), emb.tolist(), meta)]
            # upsert this record into Pinecone
            _ = index.upsert(vectors=to_upsert)
            p_id += 1
    # check that we have all vectors in the index
    print(index.describe_index_stats())

"""
# Get the video ids
ls_ids = get_youtube_ids('./urls.txt')

# Get the video transcriptions
d_trans = get_clean_transcriptions(ls_ids)

# Create the index
pc, index = create_index()

# Load retriever model
retriever = load_retriever()

# Populate the database
create_embeddings(d_trans, index, retriever)
"""