jpangas committed on
Commit
9807ccc
·
verified ·
1 Parent(s): 17c8aa2

Update the app with a better UI

Browse files
Files changed (1) hide show
  1. app.py +98 -18
app.py CHANGED
@@ -1,30 +1,110 @@
1
- import gradio as gr
2
  from grobid_client.grobid_client import GrobidClient
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
 
 
 
 
4
 
5
- def extract_text(file):
6
- client = GrobidClient(config_path="./config.json")
7
- information = client.process_pdf(
8
- "processFulltextDocument",
9
  file.name,
10
- generateIDs=False,
11
- consolidate_header=False,
12
- consolidate_citations=False,
13
- include_raw_citations=False,
14
- include_raw_affiliations=False,
15
- tei_coordinates=False,
16
- segment_sentences=False,
17
  )
18
- return information
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
 
21
- # Ask Dr Ahmad about which LLM to use and if we have a token for it
22
  with gr.Blocks() as demo:
23
  file_input = gr.File(
24
- label="Upload a research paper as a pdf file", file_types=[".pdf"]
 
 
 
 
 
25
  )
26
- text_output = gr.Textbox(label="Extracted Text")
27
 
28
- file_input.upload(fn=extract_text, inputs=file_input, outputs=text_output)
 
 
 
29
 
30
- demo.launch()
 
 
1
  from grobid_client.grobid_client import GrobidClient
2
+ import time
3
+ import gradio as gr
4
+ from langchain_community.document_loaders.generic import GenericLoader
5
+ from langchain_community.document_loaders.parsers import GrobidParser
6
+ from langchain_openai import OpenAIEmbeddings
7
+ from langchain_core.vectorstores import InMemoryVectorStore
8
+ from langchain_openai import ChatOpenAI
9
+ from langchain import hub
10
+ from langchain_core.documents import Document
11
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
12
+ from langgraph.graph import START, StateGraph
13
+ from typing_extensions import List, TypedDict
14
+ import xmltodict
15
+
16
+ qa_graph = None
17
+ current_file = None
18
+
19
+
20
class State(TypedDict):
    """Shared state flowing through the retrieve -> generate LangGraph pipeline."""

    # The user's question, as passed to qa_graph.invoke().
    question: str
    # Documents returned by the vector-store similarity search.
    context: List[Document]
    # The LLM-generated answer text.
    answer: str
24
+
25
+
26
def initiate_graph(file):
    """Build (or reuse) the RAG question-answering graph for an uploaded PDF.

    Parses the PDF with a remote Grobid server, splits the text into chunks,
    embeds the chunks into an in-memory vector store, and compiles a two-step
    LangGraph pipeline (retrieve -> generate) into the module-global
    ``qa_graph``.

    Args:
        file: A Gradio file object; only ``file.name`` (the local path) is used.

    Returns:
        A status message naming the loaded paper.
    """
    global qa_graph, current_file

    # Re-uploading the same file reuses the compiled graph instead of
    # repeating the Grobid parse and the embedding calls. (The original
    # cleared qa_graph on a *different* file but then rebuilt unconditionally,
    # so the cache check never saved any work.)
    if current_file == file.name and qa_graph is not None:
        name = file.name.split("/")[-1]
        return f"The paper {name} has been loaded and is ready for questions!"

    qa_graph = None
    current_file = file.name

    loader = GenericLoader.from_filesystem(
        file.name,
        parser=GrobidParser(
            segment_sentences=False,
            grobid_server="https://jpangas-grobid-paper-extractor.hf.space/api/processFulltextDocument",
        ),
    )
    docs = loader.load()

    embeddings = OpenAIEmbeddings()
    vector_store = InMemoryVectorStore(embeddings)

    llm = ChatOpenAI(model="gpt-4o-mini")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000, chunk_overlap=200, add_start_index=True
    )
    all_splits = text_splitter.split_documents(docs)
    vector_store.add_documents(documents=all_splits)
    prompt = hub.pull("rlm/rag-prompt")

    def retrieve(state: State):
        # Fetch the chunks most similar to the question.
        retrieved_docs = vector_store.similarity_search(state["question"])
        return {"context": retrieved_docs}

    def generate(state: State):
        # Stuff the retrieved chunks into the RAG prompt and query the LLM.
        docs_content = "\n\n".join(doc.page_content for doc in state["context"])
        messages = prompt.invoke(
            {"question": state["question"], "context": docs_content}
        )
        response = llm.invoke(messages)
        return {"answer": response.content}

    graph_builder = StateGraph(State).add_sequence([retrieve, generate])
    graph_builder.add_edge(START, "retrieve")
    qa_graph = graph_builder.compile()

    name = file.name.split("/")[-1]
    return f"The paper {name} has been loaded and is ready for questions!"
72
+
73
+
74
def answer_question(question, history):
    """Answer *question* using the compiled RAG graph.

    *history* is accepted for the chat-interface signature but not used.
    Returns either the model's answer or a prompt to upload a paper first.
    """
    # qa_graph is only read here, so no ``global`` statement is needed.
    if qa_graph is None:
        return "Please upload a PDF file first and wait for it to be loaded!"

    result = qa_graph.invoke({"question": question})
    return result["answer"]
82
+
83
+
84
def slow_echo(message, history):
    """Stream the answer one character at a time for a typewriter effect.

    Args:
        message: The user's chat message.
        history: Prior chat history, forwarded to ``answer_question``.

    Yields:
        Growing prefixes of the answer, one extra character per step.
    """
    answer = answer_question(message, history)

    # BUG FIX: the original compared against "Please upload a PDF file first!",
    # a string answer_question never returns, so the "no paper" notice was
    # typed out slowly like a normal answer. Compare against the actual
    # sentinel and emit it immediately instead.
    if answer == "Please upload a PDF file first and wait for it to be loaded!":
        yield answer
        return

    for i in range(len(answer)):
        time.sleep(0.01)
        yield answer[: i + 1]
93
 
94
 
 
95
# Gradio UI: a PDF uploader with a status box, plus a chat interface whose
# replies are streamed by slow_echo. Uploading a file builds the RAG graph
# via initiate_graph and reports readiness in the status textbox.
with gr.Blocks() as demo:
    file_input = gr.File(
        label="Upload a research paper as a pdf file wait for it to be loaded",
        file_types=[".pdf"],
    )

    textbox = gr.Textbox(
        label="Status of Upload", value="No Paper Uploaded", interactive=False
    )

    chat_interface = gr.ChatInterface(slow_echo, type="messages")

    file_input.upload(fn=initiate_graph, inputs=file_input, outputs=textbox)

# Queueing is required so the generator-based chat handler can stream.
demo.queue()

# BUG FIX: the script previously ended at demo.queue() with no launch() call,
# so running the file locally started no server. (Hugging Face Spaces may
# auto-launch a Blocks named ``demo`` — the guard keeps this safe there too.)
if __name__ == "__main__":
    demo.launch()