Spaces:
Runtime error
Runtime error
Enhance audio processing and search functionality
Browse files
- Add NLTK downloads for text processing
- Update text2speech import and pipeline configuration
- Improve audio file handling and path management
- Refactor search result processing and audio playback
- Update caching decorators and utility functions
- Modify requirements to include necessary dependencies
Signed-off-by: Unai Garay <[email protected]>
- app.py +6 -1
- core/pipelines.py +17 -10
- core/search_index.py +13 -5
- interface/components.py +27 -2
- interface/utils.py +12 -10
- requirements.txt +5 -4
app.py
CHANGED
|
@@ -8,11 +8,16 @@ st.set_page_config(
|
|
| 8 |
menu_items={"About": "https://github.com/ugm2/neural-search-demo"},
|
| 9 |
)
|
| 10 |
|
|
|
|
| 11 |
from streamlit_option_menu import option_menu
|
| 12 |
-
|
| 13 |
from interface.components import component_select_pipeline
|
|
|
|
| 14 |
from interface.utils import load_audio_model
|
| 15 |
|
|
|
|
|
|
|
|
|
|
| 16 |
# Initialization of session state
|
| 17 |
for key, value in session_state_variables.items():
|
| 18 |
if key not in st.session_state:
|
|
|
|
| 8 |
menu_items={"About": "https://github.com/ugm2/neural-search-demo"},
|
| 9 |
)
|
| 10 |
|
| 11 |
+
import nltk
|
| 12 |
from streamlit_option_menu import option_menu
|
| 13 |
+
|
| 14 |
from interface.components import component_select_pipeline
|
| 15 |
+
from interface.config import pages, session_state_variables
|
| 16 |
from interface.utils import load_audio_model
|
| 17 |
|
| 18 |
+
nltk.download("punkt_tab")
|
| 19 |
+
nltk.download("averaged_perceptron_tagger_eng")
|
| 20 |
+
|
| 21 |
# Initialization of session state
|
| 22 |
for key, value in session_state_variables.items():
|
| 23 |
if key not in st.session_state:
|
core/pipelines.py
CHANGED
|
@@ -2,17 +2,22 @@
|
|
| 2 |
Haystack Pipelines
|
| 3 |
"""
|
| 4 |
|
|
|
|
| 5 |
from pathlib import Path
|
|
|
|
| 6 |
from haystack import Pipeline
|
| 7 |
from haystack.document_stores import InMemoryDocumentStore
|
| 8 |
-
from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
|
| 9 |
from haystack.nodes.preprocessor import PreProcessor
|
| 10 |
from haystack.nodes.ranker import SentenceTransformersRanker
|
| 11 |
-
from haystack.nodes.audio import DocumentToSpeech
|
| 12 |
-
import os
|
| 13 |
|
| 14 |
data_path = "data/"
|
|
|
|
| 15 |
os.makedirs(data_path, exist_ok=True)
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
index = "documents"
|
| 18 |
|
|
@@ -59,7 +64,7 @@ def keyword_search(
|
|
| 59 |
if audio_output:
|
| 60 |
doc2speech = DocumentToSpeech(
|
| 61 |
model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
|
| 62 |
-
generated_audio_dir=Path(
|
| 63 |
)
|
| 64 |
search_pipeline.add_node(
|
| 65 |
doc2speech, name="DocumentToSpeech", inputs=["TfidfRetriever"]
|
|
@@ -114,12 +119,12 @@ def dense_passage_retrieval(
|
|
| 114 |
)
|
| 115 |
|
| 116 |
if audio_output:
|
| 117 |
-
|
| 118 |
model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
|
| 119 |
-
generated_audio_dir=Path(
|
| 120 |
)
|
| 121 |
search_pipeline.add_node(
|
| 122 |
-
|
| 123 |
)
|
| 124 |
|
| 125 |
return search_pipeline, index_pipeline
|
|
@@ -155,10 +160,12 @@ def dense_passage_retrieval_ranker(
|
|
| 155 |
search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
|
| 156 |
|
| 157 |
if audio_output:
|
| 158 |
-
|
| 159 |
model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
|
| 160 |
-
generated_audio_dir=Path(
|
|
|
|
|
|
|
|
|
|
| 161 |
)
|
| 162 |
-
search_pipeline.add_node(doc2speech, name="DocumentToSpeech", inputs=["Ranker"])
|
| 163 |
|
| 164 |
return search_pipeline, index_pipeline
|
|
|
|
| 2 |
Haystack Pipelines
|
| 3 |
"""
|
| 4 |
|
| 5 |
+
import os
|
| 6 |
from pathlib import Path
|
| 7 |
+
|
| 8 |
from haystack import Pipeline
|
| 9 |
from haystack.document_stores import InMemoryDocumentStore
|
|
|
|
| 10 |
from haystack.nodes.preprocessor import PreProcessor
|
| 11 |
from haystack.nodes.ranker import SentenceTransformersRanker
|
| 12 |
+
from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever
|
| 13 |
+
from text2speech import DocumentToSpeech
|
| 14 |
|
| 15 |
data_path = "data/"
|
| 16 |
+
audio_path = os.path.join(data_path, "audio")
|
| 17 |
os.makedirs(data_path, exist_ok=True)
|
| 18 |
+
os.makedirs(audio_path, exist_ok=True)
|
| 19 |
+
# Ensure proper permissions
|
| 20 |
+
os.chmod(audio_path, 0o777)
|
| 21 |
|
| 22 |
index = "documents"
|
| 23 |
|
|
|
|
| 64 |
if audio_output:
|
| 65 |
doc2speech = DocumentToSpeech(
|
| 66 |
model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
|
| 67 |
+
generated_audio_dir=Path(audio_path),
|
| 68 |
)
|
| 69 |
search_pipeline.add_node(
|
| 70 |
doc2speech, name="DocumentToSpeech", inputs=["TfidfRetriever"]
|
|
|
|
| 119 |
)
|
| 120 |
|
| 121 |
if audio_output:
|
| 122 |
+
document_to_speech = DocumentToSpeech(
|
| 123 |
model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
|
| 124 |
+
generated_audio_dir=Path(audio_path),
|
| 125 |
)
|
| 126 |
search_pipeline.add_node(
|
| 127 |
+
document_to_speech, name="DocumentToSpeech", inputs=["DPRRetriever"]
|
| 128 |
)
|
| 129 |
|
| 130 |
return search_pipeline, index_pipeline
|
|
|
|
| 160 |
search_pipeline.add_node(ranker, name="Ranker", inputs=["DPRRetriever"])
|
| 161 |
|
| 162 |
if audio_output:
|
| 163 |
+
document_to_speech = DocumentToSpeech(
|
| 164 |
model_name_or_path="espnet/kan-bayashi_ljspeech_vits",
|
| 165 |
+
generated_audio_dir=Path(audio_path),
|
| 166 |
+
)
|
| 167 |
+
search_pipeline.add_node(
|
| 168 |
+
document_to_speech, name="DocumentToSpeech", inputs=["Ranker"]
|
| 169 |
)
|
|
|
|
| 170 |
|
| 171 |
return search_pipeline, index_pipeline
|
core/search_index.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
-
from haystack.schema import Document
|
| 2 |
-
from haystack.document_stores import BaseDocumentStore
|
| 3 |
import uuid
|
| 4 |
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
def format_docs(documents):
|
| 7 |
"""Given a list of documents, format the documents and return the documents and doc ids."""
|
|
@@ -37,16 +38,23 @@ def search(queries, pipeline):
|
|
| 37 |
for res in matches:
|
| 38 |
if not score_is_empty:
|
| 39 |
score_is_empty = True if res.score is None else False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
match = {
|
| 41 |
-
"text":
|
| 42 |
"id": res.meta["id"],
|
| 43 |
"fragment_id": res.id,
|
| 44 |
"meta": res.meta,
|
| 45 |
}
|
| 46 |
if not score_is_empty:
|
| 47 |
match.update({"score": res.score})
|
| 48 |
-
if
|
| 49 |
-
|
|
|
|
| 50 |
query_results.append(match)
|
| 51 |
if not score_is_empty:
|
| 52 |
query_results = sorted(
|
|
|
|
|
|
|
|
|
|
| 1 |
import uuid
|
| 2 |
|
| 3 |
+
from haystack.document_stores import BaseDocumentStore
|
| 4 |
+
from haystack.schema import Document
|
| 5 |
+
|
| 6 |
|
| 7 |
def format_docs(documents):
|
| 8 |
"""Given a list of documents, format the documents and return the documents and doc ids."""
|
|
|
|
| 38 |
for res in matches:
|
| 39 |
if not score_is_empty:
|
| 40 |
score_is_empty = True if res.score is None else False
|
| 41 |
+
|
| 42 |
+
# Get the original text from content or meta
|
| 43 |
+
original_text = res.content
|
| 44 |
+
if hasattr(res, "meta") and "content_text" in res.meta:
|
| 45 |
+
original_text = res.meta["content_text"]
|
| 46 |
+
|
| 47 |
match = {
|
| 48 |
+
"text": original_text,
|
| 49 |
"id": res.meta["id"],
|
| 50 |
"fragment_id": res.id,
|
| 51 |
"meta": res.meta,
|
| 52 |
}
|
| 53 |
if not score_is_empty:
|
| 54 |
match.update({"score": res.score})
|
| 55 |
+
if res.content_type == "audio":
|
| 56 |
+
# Add audio path from the content field
|
| 57 |
+
match.update({"content_audio": res.content})
|
| 58 |
query_results.append(match)
|
| 59 |
if not score_is_empty:
|
| 60 |
query_results = sorted(
|
interface/components.py
CHANGED
|
@@ -25,16 +25,35 @@ def component_select_pipeline(container):
|
|
| 25 |
index_pipe = pipeline_names.index(selected_pipeline)
|
| 26 |
st.write("---")
|
| 27 |
st.header("Pipeline Parameters")
|
|
|
|
|
|
|
|
|
|
| 28 |
for parameter, value in pipeline_func_parameters[index_pipe].items():
|
| 29 |
-
if isinstance(value, str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
value = st.text_input(parameter, value)
|
| 31 |
elif isinstance(value, bool):
|
| 32 |
value = st.checkbox(parameter, value)
|
| 33 |
elif isinstance(value, int):
|
|
|
|
|
|
|
| 34 |
value = int(st.number_input(parameter, value=value))
|
| 35 |
elif isinstance(value, float):
|
| 36 |
value = float(st.number_input(parameter, value=value))
|
| 37 |
pipeline_func_parameters[index_pipe][parameter] = value
|
|
|
|
| 38 |
if (
|
| 39 |
st.session_state["pipeline"] is None
|
| 40 |
or st.session_state["pipeline"]["name"] != selected_pipeline
|
|
@@ -93,12 +112,18 @@ def component_show_search_result(container, results):
|
|
| 93 |
st.markdown(f"### Match {idx+1}")
|
| 94 |
st.markdown(f"**Text**: {document['text']}")
|
| 95 |
st.markdown(f"**Document**: {document['id']}")
|
|
|
|
| 96 |
if "_split_id" in document["meta"]:
|
| 97 |
st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
|
| 98 |
if "score" in document:
|
| 99 |
st.markdown(f"**Score**: {document['score']:.3f}")
|
| 100 |
if "content_audio" in document:
|
| 101 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 102 |
st.markdown("---")
|
| 103 |
|
| 104 |
|
|
|
|
| 25 |
index_pipe = pipeline_names.index(selected_pipeline)
|
| 26 |
st.write("---")
|
| 27 |
st.header("Pipeline Parameters")
|
| 28 |
+
|
| 29 |
+
# Process audio_output first to ensure top_k is set correctly
|
| 30 |
+
audio_output_value = False
|
| 31 |
for parameter, value in pipeline_func_parameters[index_pipe].items():
|
| 32 |
+
if parameter == "audio_output":
|
| 33 |
+
audio_output_value = st.checkbox(parameter, value)
|
| 34 |
+
pipeline_func_parameters[index_pipe][
|
| 35 |
+
"audio_output"
|
| 36 |
+
] = audio_output_value
|
| 37 |
+
if audio_output_value:
|
| 38 |
+
pipeline_func_parameters[index_pipe]["top_k"] = 3
|
| 39 |
+
break
|
| 40 |
+
|
| 41 |
+
# Then process all other parameters
|
| 42 |
+
for parameter, value in pipeline_func_parameters[index_pipe].items():
|
| 43 |
+
if parameter == "audio_output":
|
| 44 |
+
continue
|
| 45 |
+
elif isinstance(value, str):
|
| 46 |
value = st.text_input(parameter, value)
|
| 47 |
elif isinstance(value, bool):
|
| 48 |
value = st.checkbox(parameter, value)
|
| 49 |
elif isinstance(value, int):
|
| 50 |
+
if parameter == "top_k" and audio_output_value:
|
| 51 |
+
value = 3
|
| 52 |
value = int(st.number_input(parameter, value=value))
|
| 53 |
elif isinstance(value, float):
|
| 54 |
value = float(st.number_input(parameter, value=value))
|
| 55 |
pipeline_func_parameters[index_pipe][parameter] = value
|
| 56 |
+
|
| 57 |
if (
|
| 58 |
st.session_state["pipeline"] is None
|
| 59 |
or st.session_state["pipeline"]["name"] != selected_pipeline
|
|
|
|
| 112 |
st.markdown(f"### Match {idx+1}")
|
| 113 |
st.markdown(f"**Text**: {document['text']}")
|
| 114 |
st.markdown(f"**Document**: {document['id']}")
|
| 115 |
+
st.json(document)
|
| 116 |
if "_split_id" in document["meta"]:
|
| 117 |
st.markdown(f"**Document Chunk**: {document['meta']['_split_id']}")
|
| 118 |
if "score" in document:
|
| 119 |
st.markdown(f"**Score**: {document['score']:.3f}")
|
| 120 |
if "content_audio" in document:
|
| 121 |
+
try:
|
| 122 |
+
with open(document["content_audio"], "rb") as audio_file:
|
| 123 |
+
audio_bytes = audio_file.read()
|
| 124 |
+
st.audio(audio_bytes, format="audio/wav")
|
| 125 |
+
except Exception as e:
|
| 126 |
+
st.error(f"Error loading audio: {str(e)}")
|
| 127 |
st.markdown("---")
|
| 128 |
|
| 129 |
|
interface/utils.py
CHANGED
|
@@ -1,16 +1,18 @@
|
|
| 1 |
-
from io import StringIO
|
| 2 |
import os
|
| 3 |
import shutil
|
| 4 |
-
import core.pipelines as pipelines_functions
|
| 5 |
-
from core.pipelines import data_path
|
| 6 |
-
from core.audio import audio_to_text, load_model
|
| 7 |
from inspect import getmembers, isfunction, signature
|
| 8 |
-
from
|
| 9 |
-
|
| 10 |
-
import streamlit as st
|
| 11 |
import pandas as pd
|
| 12 |
import pytesseract
|
|
|
|
|
|
|
| 13 |
from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
def get_pipelines():
|
|
@@ -35,7 +37,7 @@ def reset_vars_data():
|
|
| 35 |
os.makedirs(data_path, exist_ok=True)
|
| 36 |
|
| 37 |
|
| 38 |
-
@st.
|
| 39 |
def extract_text_from_url(url: str):
|
| 40 |
article = Article(url)
|
| 41 |
article.download()
|
|
@@ -44,7 +46,7 @@ def extract_text_from_url(url: str):
|
|
| 44 |
return article.text
|
| 45 |
|
| 46 |
|
| 47 |
-
@st.
|
| 48 |
def extract_text_from_file(file):
|
| 49 |
# read text file
|
| 50 |
if file.type == "text/plain":
|
|
@@ -110,6 +112,6 @@ def extract_text_from_file(file):
|
|
| 110 |
return None
|
| 111 |
|
| 112 |
|
| 113 |
-
@st.
|
| 114 |
def load_audio_model():
|
| 115 |
return load_model()
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import shutil
|
|
|
|
|
|
|
|
|
|
| 3 |
from inspect import getmembers, isfunction, signature
|
| 4 |
+
from io import StringIO
|
| 5 |
+
|
|
|
|
| 6 |
import pandas as pd
|
| 7 |
import pytesseract
|
| 8 |
+
import streamlit as st
|
| 9 |
+
from newspaper import Article
|
| 10 |
from PIL import Image
|
| 11 |
+
from PyPDF2 import PdfFileReader
|
| 12 |
+
|
| 13 |
+
import core.pipelines as pipelines_functions
|
| 14 |
+
from core.audio import audio_to_text, load_model
|
| 15 |
+
from core.pipelines import data_path
|
| 16 |
|
| 17 |
|
| 18 |
def get_pipelines():
|
|
|
|
| 37 |
os.makedirs(data_path, exist_ok=True)
|
| 38 |
|
| 39 |
|
| 40 |
+
@st.cache_data
|
| 41 |
def extract_text_from_url(url: str):
|
| 42 |
article = Article(url)
|
| 43 |
article.download()
|
|
|
|
| 46 |
return article.text
|
| 47 |
|
| 48 |
|
| 49 |
+
@st.cache_data
|
| 50 |
def extract_text_from_file(file):
|
| 51 |
# read text file
|
| 52 |
if file.type == "text/plain":
|
|
|
|
| 112 |
return None
|
| 113 |
|
| 114 |
|
| 115 |
+
@st.cache_resource
|
| 116 |
def load_audio_model():
|
| 117 |
return load_model()
|
requirements.txt
CHANGED
|
@@ -1,14 +1,15 @@
|
|
| 1 |
streamlit==1.40.1
|
| 2 |
-
|
| 3 |
-
farm-haystack==1.26.4
|
| 4 |
black==24.8.0
|
| 5 |
plotly==5.24.1
|
| 6 |
newspaper3k==0.2.8
|
| 7 |
PyPDF2==3.0.1
|
| 8 |
pytesseract==0.3.13
|
| 9 |
soundfile==0.13.1
|
| 10 |
-
espnet==
|
| 11 |
pydub==0.25.1
|
| 12 |
espnet_model_zoo==0.1.7
|
| 13 |
openai-whisper==20240930
|
| 14 |
-
|
|
|
|
|
|
|
|
|
| 1 |
streamlit==1.40.1
|
| 2 |
+
farm-haystack[inference]==1.26.4
|
|
|
|
| 3 |
black==24.8.0
|
| 4 |
plotly==5.24.1
|
| 5 |
newspaper3k==0.2.8
|
| 6 |
PyPDF2==3.0.1
|
| 7 |
pytesseract==0.3.13
|
| 8 |
soundfile==0.13.1
|
| 9 |
+
espnet==202304
|
| 10 |
pydub==0.25.1
|
| 11 |
espnet_model_zoo==0.1.7
|
| 12 |
openai-whisper==20240930
|
| 13 |
+
farm-haystack-text2speech==1.1.1
|
| 14 |
+
altair==5.4.1
|
| 15 |
+
lxml_html_clean==0.4.1
|