Ono-Enzo committed on
Commit c6ed97e · verified · 1 Parent(s): d8f9f7d

Update app.py

Files changed (1)
  1. app.py +137 -139
app.py CHANGED
@@ -1,139 +1,137 @@
- import streamlit as st
- from datasets import load_dataset
- from haystack import Pipeline
- from haystack.components.readers import ExtractiveReader
- from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
- from haystack.document_stores.in_memory import InMemoryDocumentStore
-
- from utils import get_unique_docs
-
-
- # Load the dataset
- @st.cache_data(show_spinner=False)
- def load_documents():
-     """
-     Load the documents from the dataset considering only unique documents.
-
-     Returns:
-     - documents: list of dictionaries with the documents.
-     """
-     unique_docs = set()
-     dataset_name = "PedroCJardim/QASports"
-     dataset_split = "basketball"
-     st.caption(f'Fetching "{dataset_name}" dataset')
-     # build the dataset
-     dataset = load_dataset(dataset_name, dataset_split)
-     docs_validation = get_unique_docs(dataset["validation"], unique_docs)
-     docs_train = get_unique_docs(dataset["train"], unique_docs)
-     docs_test = get_unique_docs(dataset["test"], unique_docs)
-     documents = docs_validation + docs_train + docs_test
-     return documents
-
-
- @st.cache_resource(show_spinner=False)
- def get_document_store(documents):
-     """
-     Index the files in the document store.
-
-     Args:
-     - files: list of dictionaries with the documents.
-     """
-     # Create in memory database
-     st.caption(f"Building the Document Store")
-     document_store = InMemoryDocumentStore()
-     document_store.write_documents(documents=documents)
-     return document_store
-
-
- @st.cache_resource(show_spinner=False)
- def get_question_pipeline(_doc_store):
-     """
-     Create the pipeline with the retriever and reader components.
-
-     Args:
-     - doc_store: instance of the document store.
-
-     Returns:
-     - pipe: instance of the pipeline.
-     """
-     st.caption(f"Building the Question Answering pipeline")
-     # Create the retriever and reader
-     retriever = InMemoryBM25Retriever(document_store=_doc_store)
-     reader = ExtractiveReader(model="deepset/roberta-base-squad2")
-     reader.warm_up()
-     # Create the pipeline
-     pipe = Pipeline()
-     pipe.add_component(instance=retriever, name="retriever")
-     pipe.add_component(instance=reader, name="reader")
-     pipe.connect("retriever.documents", "reader.documents")
-     return pipe
-
-
- def search(pipeline, question: str):
-     """
-     Search for the answer to a question in the documents.
-
-     Args:
-     - pipeline: instance of the pipeline.
-     - question: string with the question.
-
-     Returns:
-     - answer: dictionary with the answer.
-     """
-     # Get the answers
-     top_k = 3
-     answer = pipeline.run(
-         data={
-             "retriever": {"query": question, "top_k": 10},
-             "reader": {"query": question, "top_k": top_k},
-         }
-     )
-     max_k = min(top_k, len(answer["reader"]["answers"]))
-     return answer["reader"]["answers"][0:max_k]
-
-
- # Streamlit interface
- _, centering_column, _ = st.columns(3)
- with centering_column:
-     st.image("assets/qasports-logo.png", use_column_width=True)
-
- # Loading status
- with st.status(
-     "Downloading dataset...", expanded=st.session_state.get("expanded", True)
- ) as status:
-     documents = load_documents()
-     status.update(label="Indexing documents...")
-     doc_store = get_document_store(documents)
-     status.update(label="Creating pipeline...")
-     pipe = get_question_pipeline(doc_store)
-     status.update(
-         label="Download and indexing complete!", state="complete", expanded=False
-     )
-     st.session_state["expanded"] = False
-
- st.subheader("🔎 Basketball", divider="rainbow")
- st.caption(
-     """This website presents a collection of documents from the dataset named "QASports", the first large sports question answering dataset for open questions. QASports contains real data of players, teams and matches from the sports soccer, basketball and American football. It counts over 1.5 million questions and answers about 54k preprocessed, cleaned and organized documents from Wikipedia-like sources."""
- )
-
- if user_query := st.text_input(
-     label="Ask a question about Basketball! 🏀",
-     placeholder="How many field goals did Kobe Bryant score?",
- ):
-     # Get the answers
-     with st.spinner("Waiting"):
-         try:
-             answer = search(pipe, user_query)
-             for idx, ans in enumerate(answer):
-                 st.info(
-                     f"""
-                     Answer {idx+1}: "{ans.data}" | Score: {ans.score:0.4f}
-                     Document: "{ans.document.meta["title"]}"
-                     URL: {ans.document.meta["url"]}
-                     """
-                 )
-                 with st.expander("See details", expanded=False):
-                     st.write(ans)
-             st.divider()
-         except Exception as e:
-             st.error("We do not have an answer for your question")
 
+ import streamlit as st
+ from datasets import load_dataset
+ from haystack import Pipeline
+ from haystack.components.embedders import (
+     SentenceTransformersDocumentEmbedder,
+     SentenceTransformersTextEmbedder,
+ )
+ from haystack.components.readers import ExtractiveReader
+ from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
+ from haystack.document_stores.in_memory import InMemoryDocumentStore
+
+ from utils import get_unique_docs
+
+
+ # Load the dataset
+ @st.cache_data(show_spinner=False)
+ def load_documents():
+     """
+     Load the documents from the dataset considering only unique documents.
+     Returns:
+     - documents: list of dictionaries with the documents.
+     """
+     unique_docs = set()
+     dataset_name = "PedroCJardim/QASports"
+     dataset_split = "basketball"
+     st.caption(f'Fetching "{dataset_name}" dataset')
+     # build the dataset
+     dataset = load_dataset(dataset_name, dataset_split)
+     docs_validation = get_unique_docs(dataset["validation"], unique_docs)
+     docs_train = get_unique_docs(dataset["train"], unique_docs)
+     docs_test = get_unique_docs(dataset["test"], unique_docs)
+     documents = docs_validation + docs_train + docs_test
+     return documents
+
+
+ @st.cache_resource(show_spinner=False)
+ def get_document_store(documents):
+     """
+     Index the documents in the document store.
+     Args:
+     - documents: list of dictionaries with the documents.
+     """
+     # Create the in-memory database; all-MiniLM-L6-v2 produces 384-dim
+     # embeddings, which the 2.x InMemoryDocumentStore infers automatically
+     st.caption("Building the Document Store")
+     document_store = InMemoryDocumentStore()
+     doc_embedder = SentenceTransformersDocumentEmbedder(
+         model="sentence-transformers/all-MiniLM-L6-v2"
+     )
+     doc_embedder.warm_up()
+     # Embed the documents before indexing them
+     documents = doc_embedder.run(documents=documents)["documents"]
+     document_store.write_documents(documents=documents)
+     return document_store
+
+
+ @st.cache_resource(show_spinner=False)
+ def get_question_pipeline(_doc_store):
+     """
+     Create the pipeline with the embedder, retriever and reader components.
+     Args:
+     - _doc_store: instance of the document store.
+     Returns:
+     - pipe: instance of the pipeline.
+     """
+     st.caption("Building the Question Answering pipeline")
+     # Embed the query with the same model used for the documents,
+     # then retrieve by embedding similarity and extract the answer
+     text_embedder = SentenceTransformersTextEmbedder(
+         model="sentence-transformers/all-MiniLM-L6-v2"
+     )
+     retriever = InMemoryEmbeddingRetriever(document_store=_doc_store)
+     reader = ExtractiveReader(model="deepset/roberta-base-squad2")
+     reader.warm_up()
+     # Create the pipeline
+     pipe = Pipeline()
+     pipe.add_component(instance=text_embedder, name="text_embedder")
+     pipe.add_component(instance=retriever, name="retriever")
+     pipe.add_component(instance=reader, name="reader")
+     pipe.connect("text_embedder.embedding", "retriever.query_embedding")
+     pipe.connect("retriever.documents", "reader.documents")
+     return pipe
+
+
+ def search(pipeline, question: str):
+     """
+     Search for the answer to a question in the documents.
+     Args:
+     - pipeline: instance of the pipeline.
+     - question: string with the question.
+     Returns:
+     - answer: dictionary with the answer.
+     """
+     # Get the answers; the question text goes to the query embedder,
+     # whose embedding feeds the retriever via the pipeline connection
+     top_k = 3
+     answer = pipeline.run(
+         data={
+             "text_embedder": {"text": question},
+             "retriever": {"top_k": 10},
+             "reader": {"query": question, "top_k": top_k},
+         }
+     )
+     max_k = min(top_k, len(answer["reader"]["answers"]))
+     return answer["reader"]["answers"][0:max_k]
+
+
+ # Streamlit interface
+ _, centering_column, _ = st.columns(3)
+ with centering_column:
+     st.image("assets/qasports-logo.png", use_column_width=True)
+
+ # Loading status
+ with st.status(
+     "Downloading dataset...", expanded=st.session_state.get("expanded", True)
+ ) as status:
+     documents = load_documents()
+     status.update(label="Indexing documents...")
+     doc_store = get_document_store(documents)
+     status.update(label="Creating pipeline...")
+     pipe = get_question_pipeline(doc_store)
+     status.update(
+         label="Download and indexing complete!", state="complete", expanded=False
+     )
+     st.session_state["expanded"] = False
+
+ st.subheader("🔎 Basketball", divider="rainbow")
+ st.caption(
+     """This website presents a collection of documents from the dataset named "QASports", the first large sports question answering dataset for open questions. QASports contains real data about players, teams and matches from soccer, basketball and American football. It comprises over 1.5 million questions and answers about 54k preprocessed, cleaned and organized documents from Wikipedia-like sources."""
+ )
+
+ if user_query := st.text_input(
+     label="Ask a question about Basketball! 🏀",
+     placeholder="How many field goals did Kobe Bryant score?",
+ ):
+     # Get the answers
+     with st.spinner("Waiting"):
+         try:
+             answer = search(pipe, user_query)
+             for idx, ans in enumerate(answer):
+                 st.info(
+                     f"""
+                     Answer {idx+1}: "{ans.data}" | Score: {ans.score:0.4f}
+                     Document: "{ans.document.meta['title']}"
+                     URL: {ans.document.meta['url']}
+                     """
+                 )
+                 with st.expander("See details", expanded=False):
+                     st.write(ans)
+             st.divider()
+         except Exception:
+             st.error("We do not have an answer for your question")
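
The switch from BM25 to dense retrieval means the retriever now consumes a query embedding rather than raw text, which is why the pipeline gains a text_embedder component and search() routes the question through it. Below is a minimal standalone sketch of the same wiring under the assumption that the Haystack 2.x components used above are installed; the two example documents and the question are made up for illustration and stand in for the QASports corpus.

# Sanity-check sketch of the embedding-based QA pipeline
# (runs without Streamlit or the QASports dataset; document contents are made up)
from haystack import Document, Pipeline
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder,
)
from haystack.components.readers import ExtractiveReader
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.document_stores.in_memory import InMemoryDocumentStore

MODEL = "sentence-transformers/all-MiniLM-L6-v2"  # 384-dim embeddings

docs = [
    Document(content="Michael Jordan won six NBA championships with the Bulls."),
    Document(content="Kobe Bryant scored 81 points against the Raptors in 2006."),
]

# Embed the documents once, then index them in the in-memory store
store = InMemoryDocumentStore()
doc_embedder = SentenceTransformersDocumentEmbedder(model=MODEL)
doc_embedder.warm_up()
store.write_documents(doc_embedder.run(documents=docs)["documents"])

# Same graph as app.py: query embedder -> embedding retriever -> reader
pipe = Pipeline()
pipe.add_component(instance=SentenceTransformersTextEmbedder(model=MODEL), name="text_embedder")
pipe.add_component(instance=InMemoryEmbeddingRetriever(document_store=store), name="retriever")
pipe.add_component(instance=ExtractiveReader(model="deepset/roberta-base-squad2"), name="reader")
pipe.connect("text_embedder.embedding", "retriever.query_embedding")
pipe.connect("retriever.documents", "reader.documents")

question = "How many championships did Michael Jordan win?"
result = pipe.run(
    data={
        "text_embedder": {"text": question},
        "reader": {"query": question, "top_k": 1},
    }
)
print(result["reader"]["answers"][0].data)  # best extracted span, e.g. "six"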