import os
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings
from config import VECTOR_DIR, EMBED_MODEL 


# This function first loads the PDF using the LangChain document loader, then
# recursively splits the text into a list of document chunks.
# After that, the raw text is converted into vectors by our embedding model.
# Finally, the vectors are stored in FAISS for semantic vector search,
# and the FAISS vector DB is saved to the local data folder configured as VECTOR_DIR in config.


def ingest_pdf(pdf_path: str) -> str:
    """Index a PDF into a FAISS vector store saved under ``VECTOR_DIR``.

    The PDF is loaded with ``PyPDFLoader``, split into overlapping chunks,
    embedded with the ``EMBED_MODEL`` HuggingFace model, and the resulting
    FAISS index is written to ``VECTOR_DIR``.

    Args:
        pdf_path: Filesystem path of the PDF to ingest.

    Returns:
        A human-readable status message: the number of chunks indexed on
        success, or a short explanation when the file is missing or no
        text could be extracted from it.
    """
    # isfile (not exists) so that a directory path is rejected here instead
    # of failing later inside the loader.
    if not os.path.isfile(pdf_path):
        return "PDF file not found."

    loader = PyPDFLoader(pdf_path)
    docs = loader.load()

    splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=200)
    chunks = splitter.split_documents(docs)

    # A PDF without extractable text (e.g. scanned images) produces no
    # chunks; building a FAISS index from an empty list fails, so stop early.
    if not chunks:
        return "No extractable text found in PDF."

    # Construct the embedding model only once there is something to embed.
    embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
    vectordb = FAISS.from_documents(chunks, embeddings)
    vectordb.save_local(VECTOR_DIR)

    return f"Indexed {len(chunks)} chunks successfully."