|
import os
import zipfile

import requests
import streamlit as st
from pydantic import BaseModel

from haystack import Pipeline
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.pipelines import ExtractiveQAPipeline
from haystack.pipelines.standard_pipelines import TextIndexingPipeline
|
|
|
|
|
class Config(BaseModel):
    # NOTE(review): this model is never referenced elsewhere in this file —
    # presumably a settings/model stub. Confirm it is still needed.

    class Config:
        # Pydantic v1-style inner config: allow fields typed with arbitrary
        # (non-pydantic-validatable) classes.
        arbitrary_types_allowed = True
|
|
|
|
|
|
|
|
|
|
|
|
|
def download_and_extract(url, output_dir):
    """Download a zip archive from *url* and extract it into *output_dir*.

    Creates *output_dir* if it does not exist. The temporary archive file
    is removed afterwards, even if extraction fails.

    Args:
        url: HTTP(S) URL of the zip archive to fetch.
        output_dir: Directory the archive contents are extracted into.

    Raises:
        requests.HTTPError: if the server responds with an error status.
        zipfile.BadZipFile: if the downloaded payload is not a valid zip.
    """
    os.makedirs(output_dir, exist_ok=True)
    zip_path = os.path.join(output_dir, "temp.zip")

    # Stream the download so a large archive is not held fully in memory,
    # bound the request with a timeout, and fail fast on HTTP errors
    # instead of silently "extracting" an error page.
    response = requests.get(url, stream=True, timeout=60)
    response.raise_for_status()

    try:
        with open(zip_path, "wb") as file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                file.write(chunk)

        with zipfile.ZipFile(zip_path, "r") as zip_ref:
            zip_ref.extractall(output_dir)
    finally:
        # Always clean up the temporary archive.
        if os.path.exists(zip_path):
            os.remove(zip_path)
|
|
|
|
|
|
|
|
|
def initializing_Document_Store():
    """Download the Game of Thrones corpus, index it into an in-memory
    document store, and return ``(document_store, retriever)``.

    NOTE(review): this function mixes Haystack 1.x APIs
    (``TextIndexingPipeline``, ``run_batch``, ``update_embeddings``) with
    Haystack 2.x import paths used at the top of the file — it cannot work
    against a single installed version as written. Verify which major
    version the project pins and port accordingly.
    """
    doc_dir = "data/build_your_first_question_answering_system"

    # Fetch and unpack the corpus of .txt files into doc_dir.
    download_and_extract(
        url="https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip",
        output_dir=doc_dir,
    )

    # NOTE(review): Haystack 2.x InMemoryDocumentStore does not accept an
    # `embedding_dim` keyword — confirm against the installed version.
    document_store = InMemoryDocumentStore(embedding_dim=384)

    retriever = initializing_Retriever(document_store)

    # Index every file found in the corpus directory.
    files_to_index = [doc_dir + "/" + f for f in os.listdir(doc_dir)]
    # NOTE(review): TextIndexingPipeline / run_batch are 1.x-only APIs.
    indexing_pipeline = TextIndexingPipeline(document_store)
    indexing_pipeline.run_batch(file_paths=files_to_index)

    # NOTE(review): update_embeddings(retriever) is also a 1.x-only API;
    # in 2.x, documents are embedded by an embedder component before being
    # written to the store.
    document_store.update_embeddings(retriever)

    return document_store, retriever
|
|
|
|
|
|
|
def initializing_Retriever(document_store):
    """Return an embedding-based retriever bound to *document_store*.

    NOTE(review): `embedding_model` is not a 2.x InMemoryEmbeddingRetriever
    parameter — confirm against the installed Haystack version.
    """
    model_name = "sentence-transformers/all-MiniLM-L6-v2"
    return InMemoryEmbeddingRetriever(
        document_store=document_store,
        embedding_model=model_name,
    )
|
|
|
|
|
|
|
def initializing_Reader():
    """Return an extractive QA reader using the roberta-base-squad2 model."""
    return ExtractiveReader(model="deepset/roberta-base-squad2")
|
|
|
|
|
|
|
TOP_K = 5  # number of answers the reader returns / the UI shows


@st.cache_resource(show_spinner="Indexing documents and loading models...")
def _build_qa_pipeline():
    """Build the retriever -> reader QA pipeline once and cache it.

    Streamlit re-executes the whole script on every user interaction;
    without st.cache_resource the corpus would be re-downloaded and
    re-indexed on every keystroke.
    """
    document_store, retriever = initializing_Document_Store()
    reader = initializing_Reader()
    reader.warm_up()

    qa_pipeline = Pipeline()
    qa_pipeline.add_component(instance=retriever, name="retriever")
    qa_pipeline.add_component(instance=reader, name="reader")
    qa_pipeline.connect("retriever.documents", "reader.documents")
    return qa_pipeline


pipe = _build_qa_pipeline()

st.title("Ask about Game of Thrones!")

user_query = st.text_input(
    label="Ask a question about Game of Thrones!",
    placeholder="Who is the King in the North?",
)

if user_query:
    with st.spinner("Searching for an answer..."):
        try:
            # Bug fixes vs. the original: `pipeline`, `question` and
            # `top_k` were undefined names — the pipeline object is `pipe`
            # and the question text is `user_query`.
            # NOTE(review): a 2.x InMemoryEmbeddingRetriever expects a
            # `query_embedding` input (fed by a text-embedder component),
            # not `query` — confirm against the installed Haystack version.
            result = pipe.run(
                data={
                    "retriever": {"query": user_query, "top_k": 10},
                    "reader": {"query": user_query, "top_k": TOP_K},
                }
            )

            # Haystack 2.x Pipeline.run keys results by component name;
            # fall back to a flat dict for older result shapes.
            answers = result.get("reader", result).get("answers", [])
            for idx, ans in enumerate(answers):
                if ans.document is None:
                    # ExtractiveReader appends a "no answer" candidate
                    # with no backing document; skip it in the UI.
                    continue
                st.info(
                    f"""
                Answer {idx + 1}: "{ans.data}" | Score: {ans.score:0.4f}

                Document: "{ans.document.meta['title']}"

                URL: {ans.document.meta['url']}
                """
                )
                with st.expander("See details", expanded=False):
                    st.write(ans)
                st.divider()

        except Exception as e:
            # Show a friendly message but surface the actual failure
            # instead of silently swallowing it.
            st.error("Sorry, we couldn't find an answer to your question.")
            with st.expander("Error details"):
                st.exception(e)