Spaces:

mlnotes
/

borrador_constitucion_chile

Runtime error

App Files Files Community

palegre committed on Jun 13, 2022

Commit

b19c8bc

0 Parent(s):

Add application file beta.

Browse files

Files changed (7) hide show

.gitattributes +1 -0
app.py +115 -0
data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf +3 -0
data/articles.csv +0 -0
pdf_to_text.py +87 -0
qa_pipeline_faiss.py +76 -0
requirements.txt +4 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1 @@


1	+ *.pdf filter=lfs diff=lfs merge=lfs -text

app.py ADDED Viewed

	@@ -0,0 +1,115 @@

+# %%
+import os
+from time import sleep
+from haystack.document_stores import ElasticsearchDocumentStore
+from haystack.utils import launch_es
+launch_es()
+sleep(30)
+# %%
+os.environ["HAYSTACK_TELEMETRY_ENABLED"] = "False"
+document_store = ElasticsearchDocumentStore(host="localhost", username="", password="", index="document")
+# %%
+import pandas as pd
+df_document = pd.read_csv("data/articles.csv")
+df_document.head()
+# %%
+articles = []
+for idx, row in df_document.iterrows():
+    article = {
+        "id": idx,
+        "content": row["article"],
+        "meta":{
+            "chapter_name": row["chapter_name"],
+            "article_page": row["article_page"],
+            "article_number": row["article_number"],
+            "article_name": row["article_name"],
+        },
+    }
+    articles.append(article)
+document_store.write_documents(articles, index="document")
+print(f"Loaded {document_store.get_document_count()} documents")
+# %%
+from haystack.nodes import BM25Retriever
+retriever = BM25Retriever(document_store=document_store)
+# %%
+from haystack.nodes import FARMReader
+model_ckpt = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
+reader = FARMReader(
+    model_name_or_path=model_ckpt,
+    progress_bar=False,
+    max_seq_len=384,
+    doc_stride=128,
+    return_no_answer=False,
+    use_gpu=False,
+)
+# %%
+from haystack.pipelines import ExtractiveQAPipeline
+pipe = ExtractiveQAPipeline(reader, retriever)
+# %%
+from textwrap import fill
+def run_qa_pipeline(question):
+    results = pipe.run(
+        query=question,
+        params={
+            "Retriever": {"top_k": 10},
+            "Reader": {"top_k": 5}
+        }
+    )
+    return results
+def results_as_markdown(results):
+    top_answers = []
+    for count, result in enumerate(results["answers"]):
+        article = document_store.get_document_by_id(result.document_id)
+        meta = result.meta
+        formatted_answer = """**Capítulo: {}.\t número: {}.\t nombre: {}.\t página: {}.**
+        {}
+        """.format(
+            meta["chapter_name"],
+            meta["article_number"],
+            meta["article_name"],
+            meta["article_page"],
+            fill(article.content, 80),
+        )
+        top_answers.append(formatted_answer)
+    return "\n\n".join(top_answers)
+def query_qa_pipeline(question):
+    results = run_qa_pipeline(question)
+    return results_as_markdown(results)
+# %%
+import gradio as gr
+# Heading rendered (as bold Markdown) at the top of the page.
+title = "**CONSOLIDADO NORMAS APROBADAS PARA LA PROPUESTA CONSTITUCIONAL POR EL PLENO DE LA CONVENCIÓN**"
+# Used only as the placeholder text of the question box below.
+default_question = "educación gratuita"
+# Page layout: title, then a column holding the question box and the search
+# button, then a row holding the Markdown area for the answers.
+with gr.Blocks() as demo:
+    gr.Markdown(title)
+    with gr.Column():
+        with gr.Row():
+            question = gr.Textbox(lines=2, max_lines=3, label="Pregunta:", placeholder=default_question)
+        with gr.Row():
+            btn = gr.Button("Buscar")
+    with gr.Row():
+        answers = gr.Markdown()
+    # On click, pass the textbox value to query_qa_pipeline and show the
+    # returned Markdown in `answers`.
+    btn.click(
+        fn=query_qa_pipeline,
+        inputs=question,
+        outputs=answers,
+    )
+# NOTE(review): share=True requests a public share link - confirm this is
+# wanted and supported in the environment that hosts the app.
+demo.launch(share=True)
+# %%

data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be2629a7708b19a9eeadb6d416e7c761cfeb483531a992706d2c732894468b18
+size 1469444

data/articles.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

pdf_to_text.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# %%
+import re
+import fitz
+import pandas as pd
+# %%
+# PDF that the text is extracted from.
+document_path = "data/PROPUESTA-DE-BORRADOR-CONSTITUCIONAL-14.05.22-1.pdf"
+# %%
+# Number of leading items skipped in the results of the regex splits below
+# (the text that precedes the first chapter / article heading).
+skip_header_offset = 1
+# Article heading: digits, ".-", an optional space, "Artículo", then the
+# shortest run of characters up to and including the next "." or "-".
+# The whole heading is captured so that re.split keeps it in its output.
+regex_article = re.compile(r"(\d+\.- ?Artículo.+?(?:\.|-))")
+# Chapter heading: "CAPÍTULO (COM <digits>) " at the start of a line, the
+# line break, and the following text up to (not including) the next " \n".
+# Captured for the same reason as above.
+regex_chapters = re.compile(r"(?<=\n)(CAPÍTULO \(COM \d+\) \n.+?)(?= \n)")
+# %%
+document = ""
+page_article = {}
+pdf_page_offset = 1
+with fitz.open(document_path) as doc:
+    for page_idx, page in enumerate(doc, pdf_page_offset):
+        text = page.get_text()
+        document += text
+        articles = regex_article.findall(text)
+        for article in articles:
+            page_article[article] = page_idx
+len(page_article)
+# %%
+chapters = {}
+chapter_name = "header"
+splited_chapters = regex_chapters.split(document)
+for chapter in splited_chapters[skip_header_offset:]:
+    if chapter.startswith("CAPÍTULO"):
+        chapter_name = chapter.replace(" \n", ": ")
+    else:
+        chapters[chapter_name] = chapter
+len(chapters), chapters.keys()
+# %%
+# Cleaned text fragments of this many characters or fewer are dropped by
+# format_article, unless they start with "El Estado".
+minimum_article_length = 65
+def format_article(article):
+    articles = article.lstrip('- ').split("\n \n")
+    formated_articles = []
+    for article in articles:
+        formated_article = article.replace("\n", "").replace("*", "").strip()
+        is_article_single = formated_article.startswith("El Estado")
+        is_article_too_short = len(formated_article) <= minimum_article_length
+        if is_article_too_short and not is_article_single:
+            continue
+        formated_articles.append(formated_article)
+    sentence = " ".join(formated_articles)
+    return sentence
+# %%
+chapter_articles = []
+for chapter_name, chapter in chapters.items():
+    article_name = "header"
+    splited_articles = regex_article.split(chapter)
+    for article in splited_articles[skip_header_offset:]:
+        if regex_article.match(article):
+            article_name = article
+            continue
+        data = {
+            "chapter_name": chapter_name,
+            "article_page": page_article.get(article_name),
+            "article_name": article_name,
+            "article": format_article(article),
+        }
+        chapter_articles.append(data)
+# %%
+df_document = pd.DataFrame.from_dict(chapter_articles)
+df_document["article_number"] = (
+    df_document['article_name']
+    .str.extract(r'(^\d+)', expand=False)
+)
+df_document["article_name"] = (
+    df_document['article_name']
+    .str.extract(r'^\d+\.- ?(.*)', expand=False)
+    .str.rstrip(".-")
+)
+df_document.head()
+# %%
+df_document.to_csv("data/articles.csv", index=False)
+# %%

qa_pipeline_faiss.py ADDED Viewed

	@@ -0,0 +1,76 @@

+# %%
+from haystack.document_stores import FAISSDocumentStore
+document_store = FAISSDocumentStore(faiss_index_factory_str="Flat")
+# %%
+import pandas as pd
+df_document = pd.read_csv("data/articles.csv")
+articles = []
+for idx, row in df_document.iterrows():
+    article = {
+        "content": row["article"],
+        "meta":{
+            "chapter_name": row["chapter_name"],
+            "article_page": row["article_page"],
+            "article_number": row["article_number"],
+            "article_name": row["article_name"],
+        },
+    }
+    articles.append(article)
+document_store.write_documents(articles, index="document")
+print(f"Loaded {document_store.get_document_count()} documents")
+# %%
+from haystack.nodes import DensePassageRetriever
+# Dense retriever over document_store, configured to run on CPU.
+# NOTE(review): query_embedding_model and passage_embedding_model both point
+# to the same "passage_encoder" checkpoint - confirm that a dedicated question
+# encoder is not intended for queries.
+retriever = DensePassageRetriever(
+    document_store=document_store,
+    query_embedding_model="sadakmed/dpr-passage_encoder-spanish",
+    passage_embedding_model="sadakmed/dpr-passage_encoder-spanish",
+    max_seq_len_query=64,
+    max_seq_len_passage=384,
+    batch_size=16,
+    use_gpu=False,
+    # NOTE(review): the documents written in this script carry no explicit
+    # title field - confirm embed_title has the intended effect.
+    embed_title=True,
+    use_fast_tokenizers=True,
+)
+# Compute embeddings for the stored documents using this retriever.
+document_store.update_embeddings(retriever)
+# %%
+from haystack.nodes import FARMReader
+model_ckpt = "mrm8488/distill-bert-base-spanish-wwm-cased-finetuned-spa-squad2-es"
+reader = FARMReader(
+    model_name_or_path=model_ckpt,
+    progress_bar=False,
+    max_seq_len=384,
+    doc_stride=128,
+    return_no_answer=True,
+    use_gpu=False,
+)
+# %%
+from haystack.pipelines import ExtractiveQAPipeline
+pipe = ExtractiveQAPipeline(reader, retriever)
+# %%
+question = "pueblos originarios justicia"
+prediction = pipe.run(
+    query=question,
+    params={
+        "Retriever": {"top_k": 10},
+        "Reader": {"top_k": 5}
+    }
+)
+# %%
+from pprint import pprint
+pprint(prediction)
+# %%
+from haystack.utils import print_answers
+print_answers(prediction, details="minimum")
+# %%

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+PyMuPDF
+# The `haystack.nodes` / `haystack.document_stores` API imported by the code
+# is distributed on PyPI as `farm-haystack`; `haystack` is a different package.
+farm-haystack
+pandas
+gradio