import os
from pathlib import Path

import tiktoken
from dotenv import load_dotenv
from langchain.docstore.document import Document
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pytube import YouTube
from transformers import pipeline

from app_config import VECTOR_MAX_TOKENS, VECTORS_TOKEN_OVERLAP_SIZE

# Load environment variables from the local .env file
env_path = Path('.') / '.env'
load_dotenv(dotenv_path=env_path)

# Tokenizer used to measure chunk lengths in tokens (cl100k_base vocabulary)
tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return the number of tokens in `text`; used as the splitter's length function."""
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)
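
# Illustrative check (an assumption, not part of the app flow): the splitter below
# measures chunks with this same function, so a chunk can be validated as
#   tiktoken_len(chunk) <= VECTOR_MAX_TOKENS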

def save_audio_file(url):
    """Download a YouTube video's audio stream and write its metadata to yt_transcription.txt."""
    try:
        yt_obj = YouTube(url)
        metadata = f"Title: {yt_obj.title}, Total Time: {yt_obj.length} seconds, Number of views: {yt_obj.views}, Rating: {yt_obj.rating}\n"
        metadata += f"""Description: {yt_obj.description}
Uploader: {yt_obj.author}
Upload Date: {yt_obj.publish_date}
Thumbnail URL: {yt_obj.thumbnail_url}
Channel URL: {yt_obj.channel_url}
Age Restricted: {yt_obj.age_restricted}
"""
        # Start the transcript file with the video metadata
        with open("yt_transcription.txt", "w") as f:
            f.write(metadata)
        # itag 139 is an audio-only (m4a) stream
        yt_audio_stream = yt_obj.streams.get_by_itag(139)
        yt_audio_stream.download(filename="yt_audio.mp4")
    except Exception as e:
        print(f"Connection Error: {e}")

def get_audio_transcription():
    """Transcribe the downloaded audio with Whisper and append the text to yt_transcription.txt."""
    whisper = pipeline("automatic-speech-recognition",
                       model="openai/whisper-large-v3")
    # Chunked long-form transcription: 30 s windows with a 5 s stride between chunks
    transcription = whisper("yt_audio.mp4",
                            chunk_length_s=30,
                            stride_length_s=5,
                            batch_size=8)
    with open("yt_transcription.txt", "a") as f:
        f.write(transcription['text'])

def get_vectorstore():
    """Embed the transcript with BGE embeddings and index it in a Chroma vector store."""
    model_name = "BAAI/bge-small-en"
    model_kwargs = {"device": "cpu"}
    encode_kwargs = {"normalize_embeddings": True}
    hf = HuggingFaceBgeEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
    )
    with open("yt_transcription.txt", "r") as f:
        data = f.read()
    # Split on token counts so each chunk fits the embedding model's context window
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=VECTOR_MAX_TOKENS,
        chunk_overlap=VECTORS_TOKEN_OVERLAP_SIZE,
        length_function=tiktoken_len,
        separators=["\n\n\n", "\n\n", "\n", " ", ""]
    )
    all_splits = text_splitter.split_text(data)
    docs = [Document(page_content=t) for t in all_splits]
    vectorstore = Chroma.from_texts(texts=all_splits, embedding=hf)
    return vectorstore, docs
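
# Illustrative end-to-end sketch (an assumption about how these helpers are chained;
# the Space's actual entry point may differ). The URL below is a placeholder.
if __name__ == "__main__":
    save_audio_file("https://www.youtube.com/watch?v=<video_id>")  # download audio + metadata
    get_audio_transcription()                                      # append the Whisper transcript
    vectorstore, docs = get_vectorstore()                          # embed and index the transcript
    # Query the index; similarity_search returns the k most similar chunks
    for doc in vectorstore.similarity_search("What is the video about?", k=3):
        print(doc.page_content[:200])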