Spaces:
Sleeping
Sleeping
File size: 5,635 Bytes
12fc98c 0128aff 12fc98c b35ee0f 4d4e63a b35ee0f 131ff8a 0128aff 12fc98c 0128aff 131ff8a 4d4e63a b35ee0f 4d4e63a b35ee0f 4d4e63a 12fc98c 4d4e63a b35ee0f 131ff8a b35ee0f 131ff8a 4d4e63a b35ee0f 4d4e63a 131ff8a b35ee0f 4d4e63a 131ff8a 4d4e63a 131ff8a b35ee0f 4d4e63a b35ee0f 4d4e63a 131ff8a b35ee0f b4dece8 131ff8a b4dece8 b35ee0f 4d4e63a 297e092 4d4e63a 297e092 4d4e63a 297e092 4d4e63a b35ee0f 4d4e63a 12fc98c e452141 12fc98c 8116261 e452141 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
from dotenv import load_dotenv
import streamlit as st
import pickle
from PyPDF2 import PdfReader
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import os
import numpy as np
# Load environment variables from .env file
load_dotenv()
# Define a function to manually chunk text
def chunk_text(text, chunk_size=1000, chunk_overlap=200):
    """Split ``text`` into fixed-size character windows that overlap.

    Consecutive windows start ``chunk_size - chunk_overlap`` characters
    apart, so each window repeats the last ``chunk_overlap`` characters of
    the previous one. The final window may be shorter than ``chunk_size``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared by neighbouring chunks.

    Returns:
        A list of substrings in document order; empty when ``text`` is empty.

    Raises:
        ValueError: If ``chunk_overlap`` is not smaller than ``chunk_size``.
            The window would then never advance, and the original loop
            would spin forever on any non-empty text.
    """
    step = chunk_size - chunk_overlap
    if step <= 0:
        raise ValueError(
            "chunk_overlap must be smaller than chunk_size "
            f"(got chunk_size={chunk_size}, chunk_overlap={chunk_overlap})"
        )
    return [text[start:start + chunk_size] for start in range(0, len(text), step)]
# Function to generate embeddings using sentence-transformers
def generate_embeddings(text_chunks, model_name='all-MiniLM-L6-v2'):
    """Encode a list of text chunks with a sentence-transformers model.

    A new ``SentenceTransformer`` is constructed from ``model_name`` on every
    call, so each call pays the model-loading cost.

    Args:
        text_chunks: The texts to embed, one embedding per entry.
        model_name: Name of the sentence-transformers model to load.

    Returns:
        Whatever ``SentenceTransformer.encode`` returns with
        ``convert_to_tensor=False`` (presumably a NumPy array with one row
        per chunk; confirm against the installed library version).
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(text_chunks, convert_to_tensor=False)
# Function to find the most relevant chunk based on the cosine similarity
def find_best_chunk(query_embedding, text_embeddings):
    """Return the index and cosine similarity of the closest chunk embedding.

    Args:
        query_embedding: 1-D embedding vector of the query.
        text_embeddings: 2-D collection of chunk embeddings, one row per
            chunk, with the same width as ``query_embedding``.

    Returns:
        A ``(best_index, similarity)`` tuple: the row of ``text_embeddings``
        with the highest cosine similarity to the query, and that similarity.

    Note:
        Cosine similarity is undefined for a zero-length vector. Those
        entries are scored 0.0 instead of NaN; otherwise ``np.argmax`` would
        select the NaN entry and report a degenerate chunk as the best match.
    """
    text_embeddings = np.asarray(text_embeddings)
    query_embedding = np.asarray(query_embedding)

    dot_products = np.dot(text_embeddings, query_embedding)
    norm_products = (
        np.linalg.norm(text_embeddings, axis=1) * np.linalg.norm(query_embedding)
    )

    # Divide only where the denominator is non-zero; the rest keep 0.0.
    cosine_similarities = np.zeros_like(dot_products, dtype=float)
    np.divide(
        dot_products,
        norm_products,
        out=cosine_similarities,
        where=norm_products > 0,
    )

    best_index = np.argmax(cosine_similarities)
    return best_index, cosine_similarities[best_index]
# Main Streamlit app function
def main():
    """Render the PDF question-answering page.

    Flow: upload a PDF, extract its text, split it into overlapping chunks,
    embed the chunks (cached on disk as ``<pdf name>.pkl``), then answer each
    user question by running an extractive question-answering model over the
    chunk most similar to the question.
    """
    st.header("LLM-powered PDF Chatbot 💬")

    # Upload a PDF file
    pdf = st.file_uploader("Upload your PDF", type='pdf')
    if pdf is None:
        return

    pdf_reader = PdfReader(pdf)
    # Join once instead of repeated ``+=``; ``or ""`` keeps the join safe if
    # a page yields no text.
    text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Split text into chunks
    chunks = chunk_text(text)
    if not chunks:
        # Without text there is nothing to embed or search (e.g. a scanned PDF).
        st.warning("No extractable text was found in this PDF.")
        return

    # The upload name becomes part of a file path, so keep only its base name.
    store_name = os.path.splitext(os.path.basename(pdf.name))[0]
    st.write(f'{store_name}')
    cache_path = f"{store_name}.pkl"

    text_embeddings = None
    if os.path.exists(cache_path):
        # NOTE(security): pickle.load can execute arbitrary code. Only safe
        # while the working directory holds files written by this app.
        with open(cache_path, "rb") as f:
            text_embeddings = pickle.load(f)
        if len(text_embeddings) == len(chunks):
            st.write('Embeddings Loaded from the Disk')
        else:
            # The cache is keyed by file name only. A different PDF with the
            # same name would misalign embeddings and chunks, so rebuild.
            text_embeddings = None

    if text_embeddings is None:
        text_embeddings = generate_embeddings(chunks)
        with open(cache_path, "wb") as f:
            pickle.dump(text_embeddings, f)

    # Accept user questions/query
    query = st.text_input("Ask questions about your PDF file:")
    if not query:
        return

    # Embed the query and pick the chunk that is most similar to it
    query_embedding = generate_embeddings([query])[0]
    best_index, similarity = find_best_chunk(query_embedding, text_embeddings)
    best_chunk = chunks[best_index]

    # Use Hugging Face pipeline for question answering
    qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")
    result = qa_pipeline(question=query, context=best_chunk)
    st.write(result['answer'])
def set_bg_from_url(url, opacity=1):
    """Render the author footer and inject a page background style.

    Two pieces of raw HTML are written via ``st.markdown``: first a footer
    with the author's LinkedIn and GitHub links, then a ``<style>`` element
    that sets a background image on ``body``.

    Args:
        url: Address of the background image, interpolated into the CSS
            unescaped, so pass trusted values only.
        opacity: Value for the CSS ``opacity`` property of the ``body`` rule.
            It is set on ``body`` itself, not only on the image.
    """
    # The stylesheet version is restored to bootstrap@5.2.0. The previous URL
    # contained an email-obfuscation placeholder and did not resolve.
    footer = """
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.2.0/dist/css/bootstrap.min.css" rel="stylesheet" integrity="sha384-gH2yIJqKdNHPEq0n4Mqa/HGKIhSkIHeL5AyhkYV8i59U5AR6csBvApHHNl/vI1Bx" crossorigin="anonymous">
<footer>
<div style='visibility: visible;margin-top:7rem;justify-content:center;display:flex;'>
<p style="font-size:1.1rem;">
Made by Asmae El-ghezzaz
<a href="https://www.linkedin.com/in/asmae-el-ghezzaz/">
<svg xmlns="http://www.w3.org/2000/svg" width="23" height="23" fill="white" class="bi bi-linkedin" viewBox="0 0 16 16">
<path d="M0 1.146C0 .513.526 0 1.175 0h13.65C15.474 0 16 .513 16 1.146v13.708c0 .633-.526 1.146-1.175 1.146H1.175C.526 16 0 15.487 0 14.854V1.146zm4.943 12.248V6.169H2.542v7.225h2.401zm-1.2-8.212c.837 0 1.358-.554 1.358-1.248-.015-.709-.52-1.248-1.342-1.248-.822 0-1.359.54-1.359 1.248 0 .694.521 1.248 1.327 1.248h.016zm4.908 8.212V9.359c0-.216.016-.432.08-.586.173-.431.568-.878 1.232-.878.869 0 1.216.662 1.216 1.634v3.865h2.401V9.25c0-2.22-1.184-3.252-2.764-3.252-1.274 0-1.845.7-2.165 1.193v.025h-.016a5.54 5.54 0 0 1 .016-.025V6.169h-2.4c.03.678 0 7.225 0 7.225h2.4z"/>
</svg>
</a>
<a href="https://github.com/aelghezzaz">
<svg xmlns="http://www.w3.org/2000/svg" width="23" height="23" fill="white" class="bi bi-github" viewBox="0 0 16 16">
<path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.012 8.012 0 0 0 16 8c0-4.42-3.58-8-8-8z"/>
</svg>
</a>
</p>
</div>
</footer>
"""
    st.markdown(footer, unsafe_allow_html=True)

    # Set background image using HTML and CSS. Doubled braces are literal
    # CSS braces inside the f-string.
    st.markdown(
        f"""
<style>
body {{
background: url('{url}') no-repeat center center fixed;
background-size: cover;
opacity: {opacity};
}}
</style>
""",
        unsafe_allow_html=True
    )
# Footer and background are rendered at module level, outside the entry-point
# guard, so they are emitted whenever this file is executed or imported.
BACKGROUND_IMAGE_URL = "https://www.1access.com/wp-content/uploads/2019/10/GettyImages-1180389186.jpg"
set_bg_from_url(BACKGROUND_IMAGE_URL, opacity=0.5)

# Start the app only when this file is the entry script.
if __name__ == '__main__':
    main()
|