pendrag commited on
Commit
60d7a89
·
1 Parent(s): 37abfe5
Files changed (3) hide show
  1. app.py +228 -0
  2. config.py +3 -0
  3. requirements.txt +4 -0
app.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from openai import OpenAI
2
+ import os
3
+ import requests
4
+ import json
5
+ from config import CONFIG
6
+ import gradio as gr
7
+ import time
8
+ import re
9
+ #export GRADIO_DEBUG=1
10
+
11
def search_inspire(query, size=10):
    """
    Search INSPIRE HEP database using fulltext search.

    Args:
        query (str): Search query (INSPIRE query syntax, e.g. ft "...").
        size (int): Number of results to return.

    Returns:
        dict: Decoded JSON response from the INSPIRE literature API.

    Raises:
        requests.HTTPError: if the API answers with a non-2xx status.
        requests.Timeout: if the API does not answer within 30 seconds.
    """
    base_url = "https://inspirehep.net/api/literature"
    params = {
        "q": query,
        "size": size,
        "format": "json"
    }

    # Timeout keeps the Gradio worker from hanging forever on a stalled
    # connection; raise_for_status surfaces API errors instead of handing
    # an HTML error page to .json() and failing with a confusing decode error.
    response = requests.get(base_url, params=params, timeout=30)
    response.raise_for_status()
    return response.json()
28
+
29
def format_reference(metadata):
    """Format one INSPIRE record's metadata as a single Markdown reference.

    Args:
        metadata (dict): The 'metadata' object of an INSPIRE literature hit.

    Returns:
        str: "Authors (year). *Title*. DOI: ... [INSPIRE record N](url)\n\n",
        with 'N/A' substituted for any missing field.
    """
    # `.get(key, [{}])` is not enough on its own: when the key exists but its
    # value is an empty list, indexing [0] raises IndexError. The `or [{}]`
    # guards cover both the missing-key and empty-list cases.
    authors = ', '.join(author.get('full_name', '') for author in metadata.get('authors', []))
    pub_info = metadata.get('publication_info') or [{}]
    titles = metadata.get('titles') or [{}]
    dois = metadata.get('dois') or [{}]
    # control_number may be absent on malformed records; don't KeyError.
    record = metadata.get('control_number', 'N/A')

    output = f"{authors} "
    output += f"({pub_info[0].get('year', 'N/A')}). "
    output += f"*{titles[0].get('title', 'N/A')}*. "
    output += f"DOI: {dois[0].get('value', 'N/A')}. "
    output += f"[INSPIRE record {record}](https://inspirehep.net/literature/{record})"
    output += "\n\n"
    return output
37
+
38
def format_results(results):
    """Render every search hit as a numbered (0-based) Markdown reference line."""
    parts = [
        f"**[{idx}]** " + format_reference(hit['metadata'])
        for idx, hit in enumerate(results['hits']['hits'])
    ]
    return "".join(parts)
46
+
47
def results_context(results):
    """Build the LLM context: one "Result [i]" section with title and abstract per hit."""
    sections = []
    for idx, hit in enumerate(results['hits']['hits']):
        md = hit['metadata']
        title = md.get('titles', [{}])[0].get('title', 'N/A')
        abstract = md.get('abstracts', [{}])[0].get('value', 'N/A')
        sections.append(
            f"Result [{idx}]\n\n"
            f"Title: {title}\n\n"
            f"Abstract: {abstract}\n\n"
        )
    return "".join(sections)
56
+
57
def user_prompt(query, context):
    """Assemble the RAG user prompt: the query, the retrieved context, and an answer slot."""
    template = """
QUERY: {q}

CONTEXT:

{c}

ANSWER:

"""
    return template.format(q=query, c=context)
70
+
71
def llm_expand_query(query):
    """Expand a natural-language question into an INSPIRE fulltext query.

    Asks GPT-4o for recall-maximising paraphrases of *query*, each prefixed
    with the `ft` operator and joined with OR, following a one-shot example.

    Args:
        query (str): The user's natural-language question.

    Returns:
        str: The expanded query string produced by the model.
    """
    # Fixed prompt typos of the original ("into a the query format",
    # "variantes") — garbled instructions degrade model compliance.
    instruction = f"""
Expand this query into the query format used for a fulltext search
over the INSPIRE HEP database. Propose alternatives of the query to
maximize the recall and join those variants using OR operators and
prepend each variant with the ft prefix. Just provide the expanded
query, without explanations.

Example of query:
how far are black holes?

Expanded query:
ft "how far are black holes" OR ft "distance from black holes" OR ft
"distances to black holes" OR ft "measurement of distance to black
holes" OR ft "remoteness of black holes" OR ft "distance to black
holes" OR ft "how far are singularities" OR ft "distance to
singularities" OR ft "distances to event horizon" OR ft "distance
from Schwarzschild radius" OR ft "black hole distance"

Query: {query}

Expanded query:
"""
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [{"type": "text", "text": instruction}],
            }
        ],
        response_format={"type": "text"},
        temperature=1,
        max_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    return response.choices[0].message.content
119
+
120
def llm_generate_answer(prompt):
    """Ask GPT-4o to answer the RAG prompt, citing result indices like "[0]".

    Args:
        prompt (str): Prompt built by user_prompt() (query + retrieved context).

    Returns:
        str: The model's answer text containing "[i]" citation markers.
    """
    # Fixed prompt typos of the original ("are asked with", "specif") —
    # garbled instructions degrade model compliance.
    system_text = (
        "You are part of a Retrieval Augmented Generation system (RAG) and "
        "are tasked with a query and a context of results. Generate an "
        "answer substantiated by the results provided and citing them using "
        "their index when used to provide an answer text. Do not generate "
        "text that is not grounded in a reference, so all paragraphs should "
        "cite a search result. End the answer with the query and a brief "
        "answer as summary of the previous discussed results. Do not "
        "consider results that are not related to the query and, if no "
        "specific answer can be provided, explain that in the brief answer."
    )
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": [{"type": "text", "text": system_text}],
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": prompt}],
            },
        ],
        response_format={"type": "text"},
        temperature=1,
        max_tokens=2048,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
    )

    return response.choices[0].message.content
164
+
165
def clean_refs(answer, results):
    """Renumber "[i]" citations in *answer* and keep only the cited references.

    The LLM cites context entries by their 0-based result index. This compacts
    the cited indices to consecutive 1-based numbers, in order of first
    citation, and builds a reference list that matches that numbering.

    Args:
        answer (str): LLM answer containing "[i]" citation markers.
        results (dict): Raw INSPIRE response the answer was grounded on.

    Returns:
        tuple: (renumbered answer str, Markdown reference list str)
    """
    hits = results['hits']['hits']

    # Collect cited indices in order of first appearance in the answer.
    unique_ordered = []
    for match in re.finditer(r'\[(\d+)\]', answer):
        ref_num = int(match.group(1))
        if ref_num not in unique_ordered:
            unique_ordered.append(ref_num)

    # Drop hallucinated out-of-range citations: the old code skipped them in
    # the reference list but still renumbered them in the text, producing
    # dangling markers.
    cited = [i for i in unique_ordered if 0 <= i < len(hits)]

    # BUG FIX: the old code numbered the reference list in result-index order
    # but renumbered the in-text markers in citation order, so the two could
    # disagree whenever citations appeared out of order. Build both from the
    # same citation-order list.
    new_results = ""
    for new_i, old_i in enumerate(cited, start=1):
        new_results += f"**[{new_i}]** "
        new_results += format_reference(hits[old_i]['metadata'])

    # Two-phase replacement via a placeholder so a freshly assigned number
    # (e.g. "[1]") is never matched again by a later old-index replacement.
    for new_i, old_i in enumerate(cited, start=1):
        answer = answer.replace(f"[{old_i}]", f"[__NEW_REF_ID_{new_i}]")
    answer = answer.replace("__NEW_REF_ID_", "")

    return answer, new_results
193
+
194
def search(query, progress=gr.Progress()):
    """Run the full RAG pipeline: expand query, search INSPIRE, answer with citations.

    Args:
        query (str): The user's natural-language question.
        progress: Gradio progress tracker; the default-argument instance is
            Gradio's documented idiom for progress injection.

    Returns:
        str: Markdown with the generated answer followed by its references.
    """
    time.sleep(1)  # presumably gives Gradio time to render the first progress tick — TODO confirm
    progress(0, desc="Expanding query...")
    # BUG FIX: keep the original question separate from the expanded ft-query.
    # The old code overwrote `query`, so the LLM prompt's QUERY section showed
    # the machine-generated ft-search blob instead of the user's question.
    expanded = llm_expand_query(query)
    progress(0.25, desc="Searching INSPIRE HEP...")
    results = search_inspire(expanded)
    progress(0.50, desc="Generating answer...")
    context = results_context(results)
    prompt = user_prompt(query, context)
    answer = llm_generate_answer(prompt)
    new_answer, references = clean_refs(answer, results)
    progress(1, desc="Done!")

    return "**Answer**:\n\n" + new_answer + "\n\n**References**:\n\n" + references
209
+
210
# ----------- MAIN ------------------------------------------------------------

# SECURITY FIX: an OpenAI API key was previously hard-coded here and committed
# to the repository — that key is public and MUST be revoked. The OpenAI
# client reads the OPENAI_API_KEY environment variable on its own; set it in
# the deployment environment (e.g. a Space secret), never in source.
client = OpenAI()

with gr.Blocks() as demo:
    gr.Markdown("# INSPIRE HEP Search")
    with gr.Row():
        with gr.Column():
            query = gr.Textbox(label="Search Query")
            search_btn = gr.Button("Search")
            examples = gr.Examples(
                [["Which one is closest star?"],
                 ["In which particles does the Higgs Boson decay to?"]],
                query,
            )
        with gr.Column():
            results = gr.Markdown("Answer will appear here...", label="Search Results")
    search_btn.click(fn=search, inputs=query, outputs=results, api_name="search", show_progress=True)


demo.launch()
#print(search("how far are black holes?"))
config.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
import os

# SECURITY FIX: an OpenAI API key was previously hard-coded here and committed
# to the repository — that key is public and MUST be revoked. Read the key
# from the environment instead of storing it in source.
CONFIG = {
    'OPEN_API_KEY': os.environ.get("OPENAI_API_KEY", "")
}
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio
2
+ openai
3
+ requests
4
+ httpx<0.28