Spaces:

shallou
/

pdfchatbot

Sleeping

File size: 5,635 Bytes

12fc98c
 
0128aff
 
 
 
12fc98c
 
b35ee0f
 
4d4e63a
b35ee0f
131ff8a
 
 
 
 
 
 
 
 
0128aff
 
 
12fc98c
0128aff
131ff8a
 
 
 
 
 
 
 
 
 
 
4d4e63a
 
b35ee0f
 
4d4e63a
b35ee0f
4d4e63a
 
12fc98c
4d4e63a
 
 
b35ee0f
131ff8a
 
b35ee0f
131ff8a
4d4e63a
 
b35ee0f
4d4e63a
 
131ff8a
b35ee0f
4d4e63a
131ff8a
4d4e63a
131ff8a
b35ee0f
4d4e63a
 
b35ee0f
4d4e63a
131ff8a
 
 
 
 
 
b35ee0f
b4dece8
131ff8a
 
b4dece8
b35ee0f
4d4e63a
 
 
 
 
 
297e092
4d4e63a
297e092
4d4e63a
 
 
 
 
297e092
4d4e63a
 
 
 
 
 
 
b35ee0f
4d4e63a
12fc98c
 
 
 
 
 
 
 
 
 
 
 
 
 
e452141
12fc98c
8116261
e452141

from dotenv import load_dotenv
import streamlit as st
import pickle
from PyPDF2 import PdfReader
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import os
import numpy as np

# Load environment variables from a .env file.
# NOTE(review): nothing in the visible code reads an environment variable
# directly; presumably this is for settings consumed by the imported
# libraries -- confirm before removing.
load_dotenv()

# Define a function to manually chunk text
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into fixed-size, overlapping character windows.

    Consecutive chunks start ``chunk_size - chunk_overlap`` characters apart,
    so each chunk repeats the last ``chunk_overlap`` characters of the
    previous one. The final chunk(s) may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk; must be positive.
        chunk_overlap: Number of characters shared by neighbouring chunks;
            must be smaller than ``chunk_size``.

    Returns:
        A list of substrings of ``text`` in order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or if ``chunk_overlap``
            is not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    step = chunk_size - chunk_overlap
    # A non-positive step would never advance through the text (the original
    # while-loop spun forever in that case), so reject it up front.
    if step <= 0:
        raise ValueError("chunk_overlap must be smaller than chunk_size")

    return [text[start:start + chunk_size] for start in range(0, len(text), step)]

# Function to generate embeddings using sentence-transformers
def generate_embeddings(text_chunks, model_name='all-MiniLM-L6-v2'):
    """Encode a batch of texts with a sentence-transformers model.

    A new ``SentenceTransformer`` is constructed from ``model_name`` on every
    call, and ``text_chunks`` is passed to its ``encode`` method with
    ``convert_to_tensor=False``.

    Args:
        text_chunks: The texts to embed (callers in this file pass a list of
            strings).
        model_name: Name of the sentence-transformers model to instantiate.

    Returns:
        Whatever ``encode`` returns for the batch -- presumably one embedding
        row per input text; confirm against the sentence-transformers docs.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(text_chunks, convert_to_tensor=False)

# Function to find the most relevant chunk based on the cosine similarity
def find_best_chunk(query_embedding, text_embeddings):
    """Return the row of ``text_embeddings`` most similar to the query.

    Similarity is the cosine of the angle between each row and
    ``query_embedding``. When either vector has zero length the cosine is
    undefined; such rows are scored 0.0 instead of NaN, because ``np.argmax``
    would otherwise select the NaN entry and report a degenerate chunk as the
    best match.

    Args:
        query_embedding: 1-D vector for the query.
        text_embeddings: 2-D array-like with one embedding per row (the norm
            is taken along ``axis=1``).

    Returns:
        A ``(best_index, similarity)`` tuple: the index of the highest-scoring
        row and its cosine similarity.
    """
    text_embeddings = np.asarray(text_embeddings)
    query_embedding = np.asarray(query_embedding)

    dot_products = np.dot(text_embeddings, query_embedding)
    norm_products = (
        np.linalg.norm(text_embeddings, axis=1) * np.linalg.norm(query_embedding)
    )

    # Divide by 1 where the norm product is zero so no 0/0 is ever evaluated,
    # then overwrite those positions with a neutral score.
    has_direction = norm_products > 0
    safe_norms = np.where(has_direction, norm_products, 1)
    cosine_similarities = np.where(has_direction, dot_products / safe_norms, 0.0)

    best_index = np.argmax(cosine_similarities)
    return best_index, cosine_similarities[best_index]

# Main Streamlit app function
def main():
    """Render the PDF question-answering page.

    Flow: the user uploads a PDF, its text is extracted and split with
    ``chunk_text``, the chunks are embedded with ``generate_embeddings``
    (embeddings are cached on disk as ``<pdf name>.pkl``), and for each
    question the most similar chunk is handed to a Hugging Face
    question-answering pipeline whose answer is written to the page.
    """
    st.header("LLM-powered PDF Chatbot 💬")

    # Upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type='pdf')
    if pdf is None:
        return

    pdf_reader = PdfReader(pdf)

    # A page without extractable text (e.g. a scanned image) may yield nothing;
    # treat that as an empty string rather than failing the concatenation.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Split text into chunks
    chunks = chunk_text(text)

    # The upload's name is supplied by the client and is used to build a file
    # path below, so keep only its final component and drop the extension.
    store_name = os.path.splitext(os.path.basename(pdf.name))[0]
    st.write(f'{store_name}')

    if not chunks:
        # Nothing to embed or search; embedding/argmax on empty input would fail.
        st.warning("No extractable text was found in this PDF.")
        return

    # Generate embeddings for the chunks, reusing the on-disk cache when valid
    cache_path = f"{store_name}.pkl"
    text_embeddings = None
    if os.path.exists(cache_path):
        # NOTE(security): pickle.load can execute arbitrary code. This is only
        # safe while the working directory is writable by trusted parties.
        with open(cache_path, "rb") as f:
            cached_embeddings = pickle.load(f)
        # The cache is keyed by file name only, so a different PDF with the
        # same name would otherwise be answered from stale embeddings that do
        # not line up with ``chunks``. Discard the cache on a size mismatch.
        if len(cached_embeddings) == len(chunks):
            text_embeddings = cached_embeddings
            st.write('Embeddings Loaded from the Disk')

    if text_embeddings is None:
        text_embeddings = generate_embeddings(chunks)
        with open(cache_path, "wb") as f:
            pickle.dump(text_embeddings, f)

    # Accept user questions/query
    query = st.text_input("Ask questions about your PDF file:")
    if not query:
        return

    # Generate embeddings for the query
    query_embedding = generate_embeddings([query])[0]

    # Find the best chunk for the query
    best_index, similarity = find_best_chunk(query_embedding, text_embeddings)
    best_chunk = chunks[best_index]

    # Use Hugging Face pipeline for question answering
    qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
    result = qa_pipeline(question=query, context=best_chunk)
    st.write(result['answer'])

def set_bg_from_url(url, opacity=1):
    """Render the author footer and apply a page background image.

    Despite its name this function emits two pieces of raw HTML through
    ``st.markdown(..., unsafe_allow_html=True)``: first a footer (Bootstrap
    stylesheet link plus LinkedIn/GitHub icon links), then a ``<style>`` block
    that sets the ``body`` background and opacity.

    Args:
        url: Address of the background image; interpolated verbatim into the
            CSS ``url('...')`` value, so it must come from a trusted source.
        opacity: Value written to the CSS ``opacity`` property of ``body``.
    """
    # The stylesheet URL previously read "[email protected]" (an e-mail
    # obfuscation artifact), which cannot resolve; restored to the package
    # spec "bootstrap@5.2.0" that the integrity hash below belongs to.
    footer = """
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx" crossorigin="anonymous">
    <footer>
        <div style='visibility: visible;margin-top:7rem;justify-content:center;display:flex;'>
            <p style="font-size:1.1rem;">
                Made by Asmae El-ghezzaz
                &nbsp;
                <a href="https://www.linkedin.com/in/asmae-el-ghezzaz/">
                    <svg xmlns="http://www.w3.org/2000/svg" width="23" height="23" fill="white" class="bi bi-linkedin" viewBox="0 0 16 16">
                        <path d="M0 1.146C0 .513.526 0 1.175 0h13.65C15.474 0 16 .513 16 1.146v13.708c0 .633-.526 1.146-1.175 1.146H1.175C.526 16 0 15.487 0 14.854V1.146zm4.943 12.248V6.169H2.542v7.225h2.401zm-1.2-8.212c.837 0 1.358-.554 1.358-1.248-.015-.709-.52-1.248-1.342-1.248-.822 0-1.359.54-1.359 1.248 0 .694.521 1.248 1.327 1.248h.016zm4.908 8.212V9.359c0-.216.016-.432.08-.586.173-.431.568-.878 1.232-.878.869 0 1.216.662 1.216 1.634v3.865h2.401V9.25c0-2.22-1.184-3.252-2.764-3.252-1.274 0-1.845.7-2.165 1.193v.025h-.016a5.54 5.54 0 0 1 .016-.025V6.169h-2.4c.03.678 0 7.225 0 7.225h2.4z"/>
                    </svg>          
                </a>
                &nbsp;
                <a href="https://github.com/aelghezzaz">
                    <svg xmlns="http://www.w3.org/2000/svg" width="23" height="23" fill="white" class="bi bi-github" viewBox="0 0 16 16">
                        <path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.012 8.012 0 0 0 16 8c0-4.42-3.58-8-8-8z"/>
                    </svg>
                </a>
            </p>
        </div>
    </footer>
    """
    st.markdown(footer, unsafe_allow_html=True)

    # Set background image using HTML and CSS
    st.markdown(
        f"""
        <style>
            body {{
                background: url('{url}') no-repeat center center fixed;
                background-size: cover;
                opacity: {opacity};
            }}
        </style>
        """,
        unsafe_allow_html=True
    )

# Footer and background styling are emitted here, outside the entry-point
# guard, so they run whenever this module is executed or imported and always
# before main().
BACKGROUND_IMAGE_URL = "https://www.1access.com/wp-content/uploads/2019/10/GettyImages-1180389186.jpg"
set_bg_from_url(BACKGROUND_IMAGE_URL, opacity=0.5)

if __name__ == "__main__":
    main()