Spaces:

ugaray96
/

neural-search

Runtime error

App Files Files Community

ugaray96 committed on Sep 23, 2022

Commit

f026256

unverified ·

2 Parent(s): c9524e4 7786dc7

Merge pull request #10 from ugm2/feature/audio_output

Browse files

Files changed (7) hide show

.gitignore +3 -1
core/pipelines.py +34 -3
core/search_index.py +11 -9
interface/components.py +10 -3
interface/pages.py +1 -1
interface/utils.py +10 -0
requirements.txt +5 -1

.gitignore CHANGED Viewed

@@ -128,4 +128,6 @@ dmypy.json
 # Pyre type checker
 .pyre/
-.vscode/

 # Pyre type checker
 .pyre/
+.vscode/
+data/audio/

core/pipelines.py CHANGED Viewed

@@ -2,14 +2,20 @@
 Haystack Pipelines
 """
 from haystack import Pipeline
 from haystack.document_stores import InMemoryDocumentStore
 from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
 from haystack.nodes.preprocessor import PreProcessor
 from haystack.nodes.ranker import SentenceTransformersRanker
-def keyword_search(index="documents", split_word_length=100):
     """
     **Keyword Search Pipeline**
@@ -19,8 +25,6 @@ def keyword_search(index="documents", split_word_length=100):
       - Documents that have more lexical overlap with the query are more likely to be relevant
       - Words that occur in fewer documents are more significant than words that occur in many documents
-    :warning: **(HAYSTACK BUG) Keyword Search doesn't work if you reindex:** Please refresh page in order to reindex
     """
     document_store = InMemoryDocumentStore(index=index)
     keyword_retriever = TfidfRetriever(document_store=(document_store))
@@ -44,6 +48,15 @@ def keyword_search(index="documents", split_word_length=100):
         document_store, name="DocumentStore", inputs=["Preprocessor"]
     )
     return search_pipeline, index_pipeline
@@ -52,6 +65,7 @@ def dense_passage_retrieval(
     split_word_length=100,
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
 ):
     """
     **Dense Passage Retrieval Pipeline**
@@ -89,6 +103,15 @@ def dense_passage_retrieval(
         document_store, name="DocumentStore", inputs=["DPRRetriever"]
     )
     return search_pipeline, index_pipeline
@@ -98,6 +121,7 @@ def dense_passage_retrieval_ranker(
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
     ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
 ):
     """
     **Dense Passage Retrieval Ranker Pipeline**
@@ -118,4 +142,11 @@ def dense_passage_retrieval_ranker(
     search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
     return search_pipeline, index_pipeline

 Haystack Pipelines
 """
+from pathlib import Path
 from haystack import Pipeline
 from haystack.document_stores import InMemoryDocumentStore
 from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
 from haystack.nodes.preprocessor import PreProcessor
 from haystack.nodes.ranker import SentenceTransformersRanker
+from haystack.nodes.audio.document_to_speech import DocumentToSpeech
+import os
+data_path = "data/"
+os.makedirs(data_path, exist_ok=True)
+def keyword_search(index="documents", split_word_length=100, audio_output=False):
     """
     **Keyword Search Pipeline**
       - Documents that have more lexical overlap with the query are more likely to be relevant
       - Words that occur in fewer documents are more significant than words that occur in many documents
     """
     document_store = InMemoryDocumentStore(index=index)
     keyword_retriever = TfidfRetriever(document_store=(document_store))
         document_store, name="DocumentStore", inputs=["Preprocessor"]
     )
+    if audio_output:
+        doc2speech = DocumentToSpeech(
+            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
+            generated_audio_dir=Path(data_path + "audio"),
+        )
+        search_pipeline.add_node(
+            doc2speech, name="DocumentToSpeech", inputs=["TfidfRetriever"]
+        )
     return search_pipeline, index_pipeline
     split_word_length=100,
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
+    audio_output=False,
 ):
     """
     **Dense Passage Retrieval Pipeline**
         document_store, name="DocumentStore", inputs=["DPRRetriever"]
     )
+    if audio_output:
+        doc2speech = DocumentToSpeech(
+            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
+            generated_audio_dir=Path(data_path + "audio"),
+        )
+        search_pipeline.add_node(
+            doc2speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
+        )
     return search_pipeline, index_pipeline
     query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
     passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
     ranker_model="cross-encoder/ms-marco-MiniLM-L-12-v2",
+    audio_output=False,
 ):
     """
     **Dense Passage Retrieval Ranker Pipeline**
     search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
+    if audio_output:
+        doc2speech = DocumentToSpeech(
+            model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
+            generated_audio_dir=Path(data_path + "audio"),
+        )
+        search_pipeline.add_node(doc2speech, name="DocumentToSpeech", inputs=["Ranker"])
     return search_pipeline, index_pipeline

core/search_index.py CHANGED Viewed

@@ -37,15 +37,17 @@ def search(queries, pipeline):
         for res in matches:
             if not score_is_empty:
                 score_is_empty = True if res.score is None else False
-            query_results.append(
-                {
-                    "text": res.content,
-                    "score": res.score,
-                    "id": res.meta["id"],
-                    "fragment_id": res.id,
-                    "meta": res.meta,
-                }
-            )
         if not score_is_empty:
             query_results = sorted(
                 query_results, key=lambda x: x["score"], reverse=True

         for res in matches:
             if not score_is_empty:
                 score_is_empty = True if res.score is None else False
+            match = {
+                "text": res.content,
+                "id": res.meta["id"],
+                "fragment_id": res.id,
+                "meta": res.meta,
+            }
+            if not score_is_empty:
+                match.update({"score": res.score})
+            if hasattr(res, "content_audio"):
+                match.update({"content_audio": res.content_audio})
+            query_results.append(match)
         if not score_is_empty:
             query_results = sorted(
                 query_results, key=lambda x: x["score"], reverse=True

interface/components.py CHANGED Viewed

@@ -1,5 +1,10 @@
 import streamlit as st
-from interface.utils import get_pipelines, extract_text_from_url, extract_text_from_file
 from interface.draw_pipelines import get_pipeline_graph
@@ -42,7 +47,7 @@ def component_select_pipeline(container):
                 "index_pipeline": index_pipeline,
                 "doc": pipeline_funcs[index_pipe].__doc__,
             }
-            st.session_state["doc_id"] = 0
 def component_show_pipeline(pipeline, pipeline_name):
@@ -65,8 +70,10 @@ def component_show_search_result(container, results):
             st.markdown(f"**Document**: {document['id']}")
             if "_split_id" in document["meta"]:
                 st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
-            if document["score"] is not None:
                 st.markdown(f"**Score**: {document['score']:.3f}")
             st.markdown("---")

 import streamlit as st
+from interface.utils import (
+    get_pipelines,
+    extract_text_from_url,
+    extract_text_from_file,
+    reset_vars_data,
+)
 from interface.draw_pipelines import get_pipeline_graph
                 "index_pipeline": index_pipeline,
                 "doc": pipeline_funcs[index_pipe].__doc__,
             }
+            reset_vars_data()
 def component_show_pipeline(pipeline, pipeline_name):
             st.markdown(f"**Document**: {document['id']}")
             if "_split_id" in document["meta"]:
                 st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
+            if "score" in document:
                 st.markdown(f"**Score**: {document['score']:.3f}")
+            if "content_audio" in document:
+                st.audio(str(document["content_audio"]))
             st.markdown("---")

interface/pages.py CHANGED Viewed

@@ -25,12 +25,12 @@ def page_landing_page(container):
             "\n  - Index raw text, URLs, CSVs, PDFs and Images"
             "\n  - Use Dense Passage Retrieval, Keyword Search pipeline and DPR Ranker pipelines"
             "\n  - Search the indexed documents"
         )
         st.markdown(
             "TODO list:"
             "\n  - File type classification and converter nodes"
             "\n  - Audio to text support for indexing"
-            "\n  - Include text to audio to read responses"
             "\n  - Build other pipelines"
         )
         st.markdown(

             "\n  - Index raw text, URLs, CSVs, PDFs and Images"
             "\n  - Use Dense Passage Retrieval, Keyword Search pipeline and DPR Ranker pipelines"
             "\n  - Search the indexed documents"
+            "\n  - Read your responses out loud using the `audio_output` option!"
         )
         st.markdown(
             "TODO list:"
             "\n  - File type classification and converter nodes"
             "\n  - Audio to text support for indexing"
             "\n  - Build other pipelines"
         )
         st.markdown(

interface/utils.py CHANGED Viewed

@@ -1,5 +1,8 @@
 from io import StringIO
 import core.pipelines as pipelines_functions
 from inspect import getmembers, isfunction, signature
 from newspaper import Article
 from PyPDF2 import PdfFileReader
@@ -23,6 +26,13 @@ def get_pipelines():
     return pipeline_names, pipeline_funcs, pipeline_func_parameters
 @st.experimental_memo
 def extract_text_from_url(url: str):
     article = Article(url)

 from io import StringIO
+import os
+import shutil
 import core.pipelines as pipelines_functions
+from core.pipelines import data_path
 from inspect import getmembers, isfunction, signature
 from newspaper import Article
 from PyPDF2 import PdfFileReader
     return pipeline_names, pipeline_funcs, pipeline_func_parameters
+def reset_vars_data():
+    st.session_state["doc_id"] = 0
+    # Delete data files
+    shutil.rmtree(data_path)
+    os.makedirs(data_path, exist_ok=True)
 @st.experimental_memo
 def extract_text_from_url(url: str):
     article = Article(url)

requirements.txt CHANGED Viewed

@@ -5,4 +5,8 @@ black==22.8.0
 plotly==5.10.0
 newspaper3k==0.2.8
 PyPDF2==2.10.7
-pytesseract==0.3.10

 plotly==5.10.0
 newspaper3k==0.2.8
 PyPDF2==2.10.7
+pytesseract==0.3.10
+soundfile==0.10.3.post1
+espnet
+pydub==0.25.1
+espnet_model_zoo==0.1.7