jpangas committed
Commit 15f76b2 · verified · 1 Parent(s): de8ed04

Update app.py

Files changed (1)
  1. app.py +90 -87
app.py CHANGED
@@ -13,103 +13,105 @@ from langgraph.graph import START, StateGraph
  from typing_extensions import List, TypedDict
  import xmltodict
 
- qa_graph = None
- current_file = None
-
- class State(TypedDict):
-     question: str
-     context: List[Document]
-     answer: str
-
- def get_extra_docs(file_name):
-     # TODO: Add the code to extract the title, authors and abstract from the PDF file
-     client = GrobidClient(config_path="./config.json")
-     information = client.process_pdf(
-         "processHeaderDocument",
-         file_name,
-         generateIDs=False,
-         consolidate_header=False,
-         consolidate_citations=False,
-         include_raw_citations=False,
-         include_raw_affiliations=False,
-         tei_coordinates=False,
-         segment_sentences=False,
-     )
-     dict_information = xmltodict.parse(information[2])
-     title = dict_information["tei"]["teiHeader"]["fileDesc"]["titleStmt"]["title"]
-     abstract = dict_information["tei"]["teiHeader"]["profileDesc"]["abstract"]["p"]
-     return title
-
- def initiate_graph(file):
-     global qa_graph, current_file
-
-     if current_file != file.name:
-         qa_graph = None
-         current_file = file.name
-
-     loader = GenericLoader.from_filesystem(
-         file.name,
-         parser=GrobidParser(
-             segment_sentences=False,
-             grobid_server="https://jpangas-grobid-paper-extractor.hf.space/api/processFulltextDocument",
-         ),
-     )
-
-     docs = loader.load()
-
-     embeddings = OpenAIEmbeddings()
-     vector_store = InMemoryVectorStore(embeddings)
-
-     llm = ChatOpenAI(model="gpt-4o-mini")
-     text_splitter = RecursiveCharacterTextSplitter(
-         chunk_size=1000, chunk_overlap=200, add_start_index=True
-     )
-     all_splits = text_splitter.split_documents(docs)
-     vector_store.add_documents(documents=all_splits)
-     prompt = hub.pull("rlm/rag-prompt")
-
-     def retrieve(state: State):
-         retrieved_docs = vector_store.similarity_search(state["question"])
-         return {"context": retrieved_docs}
-
-     def generate(state: State):
-         docs_content = "\n\n".join(doc.page_content for doc in state["context"])
-         messages = prompt.invoke(
-             {"question": state["question"], "context": docs_content}
-         )
-         response = llm.invoke(messages)
-         return {"answer": response.content}
-
-     graph_builder = StateGraph(State).add_sequence([retrieve, generate])
-     graph_builder.add_edge(START, "retrieve")
-     qa_graph = graph_builder.compile()
-
-     name = file.name.split("/")[-1]
-     return f"The paper {name} has been loaded and is ready for questions!"
-
- def answer_question(question, history):
-     global qa_graph, current_file
-
-     if qa_graph is None:
-         return "Please upload a PDF file first and wait for it to be loaded!"
-
-     response = qa_graph.invoke({"question": question})
-     return response["answer"]
-
- def slow_echo(message, history):
-     answer = answer_question(message, history)
-     if answer == "Please upload a PDF file first!":
-         yield answer
-         return
-
-     for i in range(len(answer)):
-         time.sleep(0.01)
-         yield answer[: i + 1]
+
+ class PaperQA:
+     def __init__(self):
+         self.qa_graph = None
+         self.current_file = None
+
+     class State(TypedDict):
+         question: str
+         context: List[Document]
+         answer: str
+
+     def get_extra_docs(self, file_name):
+         # TODO: Add the code to extract the title, authors, and abstract from the PDF file
+         client = GrobidClient(config_path="./config.json")
+         information = client.process_pdf(
+             "processHeaderDocument",
+             file_name,
+             generateIDs=False,
+             consolidate_header=False,
+             consolidate_citations=False,
+             include_raw_citations=False,
+             include_raw_affiliations=False,
+             tei_coordinates=False,
+             segment_sentences=False,
+         )
+         dict_information = xmltodict.parse(information[2])
+         title = dict_information["tei"]["teiHeader"]["fileDesc"]["titleStmt"]["title"]
+         abstract = dict_information["tei"]["teiHeader"]["profileDesc"]["abstract"]["p"]
+         return title
+
+     def initiate_graph(self, file):
+         if self.current_file != file.name:
+             self.qa_graph = None
+             self.current_file = file.name
+
+         loader = GenericLoader.from_filesystem(
+             file.name,
+             parser=GrobidParser(
+                 segment_sentences=False,
+                 grobid_server="https://jpangas-grobid-paper-extractor.hf.space/api/processFulltextDocument",
+             ),
+         )
+
+         docs = loader.load()
+
+         embeddings = OpenAIEmbeddings()
+         vector_store = InMemoryVectorStore(embeddings)
+
+         llm = ChatOpenAI(model="gpt-4o-mini")
+         text_splitter = RecursiveCharacterTextSplitter(
+             chunk_size=1000, chunk_overlap=200, add_start_index=True
+         )
+         all_splits = text_splitter.split_documents(docs)
+         vector_store.add_documents(documents=all_splits)
+         prompt = hub.pull("rlm/rag-prompt")
+
+         def retrieve(state: self.State):
+             retrieved_docs = vector_store.similarity_search(state["question"])
+             return {"context": retrieved_docs}
+
+         def generate(state: self.State):
+             docs_content = "\n\n".join(doc.page_content for doc in state["context"])
+             messages = prompt.invoke(
+                 {"question": state["question"], "context": docs_content}
+             )
+             response = llm.invoke(messages)
+             return {"answer": response.content}
+
+         graph_builder = StateGraph(self.State).add_sequence([retrieve, generate])
+         graph_builder.add_edge(START, "retrieve")
+         self.qa_graph = graph_builder.compile()
+
+         name = file.name.split("/")[-1]
+         return f"The paper {name} has been loaded and is ready for questions!"
+
+     def answer_question(self, question, history):
+         if self.qa_graph is None:
+             return "Please upload a PDF file first and wait for it to be loaded!"
+
+         response = self.qa_graph.invoke({"question": question})
+         return response["answer"]
+
+     def slow_echo(self, message, history):
+         answer = self.answer_question(message, history)
+         if answer == "Please upload a PDF file first and wait for it to be loaded!":
+             yield answer
+             return
+
+         for i in range(len(answer)):
+             time.sleep(0.01)
+             yield answer[: i + 1]
 
  def main():
+     qa_app = PaperQA()
+
      with gr.Blocks() as demo:
          file_input = gr.File(
-             label="Upload a research paper as a pdf file and wait for it to be loaded",
+             label="Upload a research paper as a PDF file and wait for it to be loaded",
              file_types=[".pdf"],
          )
 
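The refactor above replaces the module-level qa_graph/current_file globals with a PaperQA class that holds the same state per instance. A minimal smoke-test sketch, not part of this commit: it assumes OPENAI_API_KEY is exported, a local paper.pdf exists, and the Grobid Space is reachable, and it uses SimpleNamespace as a stand-in for Gradio's file object, which only needs to expose the path as .name here.

```python
# Hypothetical smoke test; not part of this commit.
# Assumes OPENAI_API_KEY is exported, ./paper.pdf exists, and the Grobid
# Space used by initiate_graph is reachable.
from types import SimpleNamespace

from app import PaperQA  # the class this commit introduces

qa_app = PaperQA()

# Gradio hands upload callbacks an object whose .name is the file path;
# SimpleNamespace(name=...) is enough to stand in for it here.
status = qa_app.initiate_graph(SimpleNamespace(name="paper.pdf"))
print(status)

# history is unused by answer_question, so an empty list suffices.
print(qa_app.answer_question("What problem does the paper address?", []))
```

Keeping the state on an instance rather than in globals leaves the runtime behavior unchanged for the single instance built in main(), but makes the pipeline testable in isolation like this.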
 
@@ -117,11 +119,12 @@ def main():
              label="Status of Upload", value="No Paper Uploaded", interactive=False
          )
 
-         chat_interface = gr.ChatInterface(slow_echo, type="messages")
-
-         file_input.upload(fn=initiate_graph, inputs=file_input, outputs=textbox)
-
-     demo.queue().launch()
+         chat_interface = gr.ChatInterface(qa_app.slow_echo, type="messages")
+
+         file_input.upload(fn=qa_app.initiate_graph, inputs=file_input, outputs=textbox)
+
+     demo.launch()
 
  if __name__ == "__main__":
      main()
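This second hunk rebinds the UI callbacks to the instance methods: qa_app.slow_echo becomes the chat handler and qa_app.initiate_graph the upload handler. Because slow_echo is a generator, ChatInterface streams each yielded prefix as a partial reply; dropping the explicit demo.queue() call leans on recent Gradio releases enabling queuing by default. A rough sketch of that consumption loop, not from the commit; everything except PaperQA and slow_echo is a stand-in.

```python
# Illustrative only: roughly how gr.ChatInterface consumes the slow_echo
# generator; all names besides PaperQA and slow_echo are hypothetical.
from app import PaperQA

qa_app = PaperQA()
history = []  # ChatInterface supplies prior turns; slow_echo ignores them

# With no PDF loaded, the generator yields the upload prompt once; after
# initiate_graph it yields ever-longer answer prefixes, 0.01 s apart.
for partial in qa_app.slow_echo("Summarize the abstract.", history):
    print(partial, end="\r")  # stand-in for Gradio redrawing the reply bubble
print()
```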