import tiktoken
import torch
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pathlib import Path
from pytube import YouTube
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE

# Load environment variables from the project's .env file.
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Length function for the text splitter: counts tokens, not characters."""
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)


def save_audio_file(url):
    """Fetch a YouTube video's metadata and download its audio-only stream."""
    try:
        yt_obj = YouTube(url)
        metadata = (
            f"Title: {yt_obj.title}, Total Time: {yt_obj.length} seconds, "
            f"Number of views: {yt_obj.views}, Rating: {yt_obj.rating}"
        )
        metadata += f"""
Description: {yt_obj.description}
Uploader: {yt_obj.author}
Upload Date: {yt_obj.publish_date}
Thumbnail URL: {yt_obj.thumbnail_url}
Channel URL: {yt_obj.channel_url}
Age Restricted: {yt_obj.age_restricted}
"""
        # Start the transcription file with the video metadata; the transcript
        # is appended later by get_audio_transcription().
        with open("yt_transcription.txt", "w") as f:
            f.write(metadata)

        # itag 139 is YouTube's low-bitrate m4a audio-only stream.
        yt_audio_stream = yt_obj.streams.get_by_itag(139)
        yt_audio_stream.download(output_path="", filename="yt_audio.mp4")
    except Exception as e:
        print(f"Connection Error: {e}")


def get_audio_transcription():
    """Transcribe the downloaded audio with Distil-Whisper and append the text."""
    # A lighter alternative would be:
    # whisper = pipeline("automatic-speech-recognition", "openai/whisper-tiny.en")
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    model_id = "distil-whisper/distil-large-v2"
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        low_cpu_mem_usage=True,
        use_safetensors=True,
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_id)
    whisper = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        torch_dtype=torch_dtype,
        device=device,
    )

    # Chunked long-form transcription: 30 s windows with 5 s stride on each side.
    transcription = whisper(
        "yt_audio.mp4",
        chunk_length_s=30,
        stride_length_s=5,
        batch_size=8,
    )

    # Append the transcript after the metadata written by save_audio_file().
    with open("yt_transcription.txt", "a") as f:
        f.write(transcription["text"])


def get_vectorstore():
    """Split the transcription into token-bounded chunks and embed them in Chroma."""
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}  # normalized vectors for cosine similarity
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )

    with open("yt_transcription.txt", "r") as f:
        data = f.read()

    # Chunk boundaries are measured in tokens via tiktoken_len, so each chunk
    # stays within the embedding model's context budget.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""],
    )
    all_splits = text_splitter.split_text(data)
    docs = [Document(page_content=t) for t in all_splits]

    vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
    return vectorstore, docs
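
# Example usage — a minimal sketch of how these helpers chain together.
# The URL and the query string below are placeholders, and the downstream
# application that consumes the vectorstore is assumed, not shown here.
if __name__ == "__main__":
    video_url = "https://www.youtube.com/watch?v=EXAMPLE_ID"  # hypothetical URL

    save_audio_file(video_url)        # writes yt_transcription.txt (metadata) and yt_audio.mp4
    get_audio_transcription()         # appends the Whisper transcript to yt_transcription.txt
    vectorstore, docs = get_vectorstore()  # chunk, embed, and index the transcript

    # Retrieve the chunks most similar to a question via Chroma's standard
    # similarity_search API, e.g. to feed a RAG prompt.
    results = vectorstore.similarity_search("What is the video about?", k=3)
    for doc in results:
        print(doc.page_content[:100])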