Spaces:
Runtime error
Runtime error
| """ | |
| Haystack Pipelines | |
| """ | |
| import tokenizers | |
| from haystack import Pipeline | |
| from haystack.document_stores import InMemoryDocumentStore | |
| from haystack.nodes.retriever import DensePassageRetriever, TfidfRetriever | |
| from haystack.nodes.preprocessor import PreProcessor | |
| import streamlit as st | |
def keyword_search(
    index='documents',
):
    """
    Assemble the TF-IDF keyword search pipeline and its indexing pipeline.

    Both pipelines share one ``InMemoryDocumentStore`` (created for the
    given ``index``) and one ``TfidfRetriever`` bound to that store.

    Args:
        index: Name of the index passed to the in-memory document store.

    Returns:
        A ``(search_pipeline, index_pipeline)`` tuple:

        * search pipeline: ``Query -> TfidfRetriever``
        * index pipeline:
          ``File -> Preprocessor -> TfidfRetriever -> DocumentStore``
    """
    store = InMemoryDocumentStore(index=index)
    tfidf = TfidfRetriever(document_store=store)

    # Documents are cleaned, then split into chunks of 100 words without
    # overlap, keeping sentence boundaries intact.
    preprocessor_settings = {
        "clean_empty_lines": True,
        "clean_whitespace": True,
        "clean_header_footer": True,
        "split_by": "word",
        "split_length": 100,
        "split_respect_sentence_boundary": True,
        "split_overlap": 0,
    }
    splitter = PreProcessor(**preprocessor_settings)

    # Query-time graph: a single retriever node fed by the query.
    querying = Pipeline()
    querying.add_node(tfidf, name="TfidfRetriever", inputs=["Query"])

    # Index-time graph: each node consumes the output of the previous one.
    indexing = Pipeline()
    index_nodes = (
        (splitter, "Preprocessor", "File"),
        (tfidf, "TfidfRetriever", "Preprocessor"),
        (store, "DocumentStore", "TfidfRetriever"),
    )
    for component, node_name, upstream in index_nodes:
        indexing.add_node(component, name=node_name, inputs=[upstream])

    return querying, indexing
def dense_passage_retrieval(
    index='documents',
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
):
    """
    Assemble the Dense Passage Retrieval search and indexing pipelines.

    Both pipelines share one ``InMemoryDocumentStore`` (created for the
    given ``index``) and one ``DensePassageRetriever`` bound to that store.

    Args:
        index: Name of the index passed to the in-memory document store.
        query_embedding_model: Model identifier handed to the retriever as
            its query encoder.
        passage_embedding_model: Model identifier handed to the retriever as
            its passage encoder.

    Returns:
        A ``(search_pipeline, index_pipeline)`` tuple:

        * search pipeline: ``Query -> DPRRetriever``
        * index pipeline:
          ``File -> Preprocessor -> DPRRetriever -> DocumentStore``
    """
    store = InMemoryDocumentStore(index=index)

    retriever_kwargs = {
        "document_store": store,
        "query_embedding_model": query_embedding_model,
        "passage_embedding_model": passage_embedding_model,
    }
    dense_retriever = DensePassageRetriever(**retriever_kwargs)

    # Documents are cleaned, then split into chunks of 100 words without
    # overlap, keeping sentence boundaries intact.
    splitter = PreProcessor(
        split_by="word",
        split_length=100,
        split_overlap=0,
        split_respect_sentence_boundary=True,
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
    )

    # Query-time graph: a single retriever node fed by the query.
    querying = Pipeline()
    querying.add_node(dense_retriever, name="DPRRetriever", inputs=["Query"])

    # Index-time graph: chain the nodes, each fed by the one before it.
    indexing = Pipeline()
    upstream = "File"
    for component, node_name in (
        (splitter, "Preprocessor"),
        (dense_retriever, "DPRRetriever"),
        (store, "DocumentStore"),
    ):
        indexing.add_node(component, name=node_name, inputs=[upstream])
        upstream = node_name

    return querying, indexing