# yt-dlp --write-subs --skip-download [youtube_url]
from pinecone import Pinecone
from pinecone import ServerlessSpec
from youtube_transcript_api import YouTubeTranscriptApi
import os
from dotenv import load_dotenv, find_dotenv
import torch
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

_ = load_dotenv(find_dotenv())
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')


# Read the YouTube video ids from a file of URLs (one "watch?v=<id>" URL per line)
def get_youtube_ids(route):
    yt_ids = []
    with open(route, 'r') as file:
        for line in file:
            yt_ids.append(line.split('=')[1].strip())
    return yt_ids


# Get clean transcriptions: one plain-text string per video id
def get_clean_transcriptions(yt_ids):
    # get_transcripts returns a tuple: (dict of {video_id: transcript snippets}, list of failed ids)
    trans_bruto = YouTubeTranscriptApi.get_transcripts(yt_ids, languages=['es', 'en'])
    # Join the snippet texts of each transcript into a single string
    return {k: " ".join(d['text'] for d in v) for k, v in trans_bruto[0].items()}


# Create (or connect to) the Pinecone index
def create_index():
    pc = Pinecone(api_key=PINECONE_API_KEY)
    cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
    region = os.environ.get('PINECONE_REGION') or 'us-east-1'
    spec = ServerlessSpec(cloud=cloud, region=region)
    index_name = "youtube-videos"
    if index_name not in pc.list_indexes().names():
        # Create the index if it does not exist (768 dims to match the retriever model)
        pc.create_index(index_name, dimension=768, metric="cosine", spec=spec)
    # Connect to the index we created
    index = pc.Index(index_name)
    return pc, index


# Load the retriever model
def load_retriever():
    # Set device to GPU if available
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    # Load the retriever model (flax-sentence-embeddings/all_datasets_v3_mpnet-base) from the Hugging Face hub
    retriever = SentenceTransformer('flax-sentence-embeddings/all_datasets_v3_mpnet-base', device=device)
    return retriever


# Create embeddings and upsert them into the index
def create_embeddings(dicc, index, retriever):
    # Passage id, unique across all videos
    p_id = 0
    # Iterate over transcriptions
    for yt_id, transcription in dicc.items():
        # Split the transcription into fixed-size passages of 1000 characters
        passages = [transcription[i:i + 1000] for i in range(0, len(transcription), 1000)]
        # For each passage, create an embedding and upsert it into the index
        for passage in tqdm(passages):
            emb = retriever.encode(passage, convert_to_tensor=True)
            meta = {'yt_id': yt_id, 'passage_text': passage}
            to_upsert = [(str(p_id), emb.tolist(), meta)]
            _ = index.upsert(vectors=to_upsert)
            p_id += 1
    # Check that we have all vectors in the index
    print(index.describe_index_stats())


"""
# Get the video ids
ls_ids = get_youtube_ids('./urls.txt')
# Get the video transcriptions
d_trans = get_clean_transcriptions(ls_ids)
# Create the index
pc, index = create_index()
# Load the retriever model
retriever = load_retriever()
# Populate the database
create_embeddings(d_trans, index, retriever)
"""
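
# --- Usage sketch (not part of the original script) ---
# A minimal example of how the populated index could be queried with the same
# retriever model. The helper name, the sample question, and top_k=3 are
# illustrative assumptions, not taken from the code above.
def query_index(index, retriever, question, top_k=3):
    # Embed the question with the same model used for the passages
    q_emb = retriever.encode(question).tolist()
    # Ask Pinecone for the closest passages, returning their stored metadata
    res = index.query(vector=q_emb, top_k=top_k, include_metadata=True)
    # Return (video id, passage text) pairs for the best matches
    return [(m['metadata']['yt_id'], m['metadata']['passage_text']) for m in res['matches']]

# Example (assumes index and retriever were created as above):
# print(query_index(index, retriever, "What is the video about?"))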