Spaces:

pendrag
/

feynbot-ir

Sleeping

App Files Files Community

pendrag committed on Dec 9, 2024

Commit

60d7a89

1 Parent(s): 37abfe5

updated

Browse files

Files changed (3) hide show

app.py +228 -0
config.py +3 -0
requirements.txt +4 -0

app.py ADDED Viewed

	@@ -0,0 +1,228 @@

+from openai import OpenAI
+import os
+import requests
+import json
+from config import CONFIG
+import gradio as gr
+import time
+import re
+#export GRADIO_DEBUG=1
+def search_inspire(query, size=10, timeout=30):
+    """
+    Search the INSPIRE HEP literature API.
+
+    Args:
+        query (str): Search query, sent as the ``q`` parameter.
+        size (int): Number of results to request.
+        timeout (float): Seconds to wait for the server before giving up.
+
+    Returns:
+        The decoded JSON body of the response.
+
+    Raises:
+        requests.HTTPError: If the server answers with an error status.
+        requests.Timeout: If no response arrives within ``timeout`` seconds.
+    """
+    base_url = "https://inspirehep.net/api/literature"
+    params = {
+        "q": query,
+        "size": size,
+        "format": "json",
+    }
+    # Without a timeout a stalled connection would block the caller forever.
+    response = requests.get(base_url, params=params, timeout=timeout)
+    # Surface 4xx/5xx answers here instead of handing an error payload to
+    # callers that index into the result.
+    response.raise_for_status()
+    return response.json()
+def format_reference(metadata):
+    """
+    Format one record's metadata as a Markdown reference line.
+
+    Args:
+        metadata (dict): Record metadata. ``control_number`` is required;
+            ``authors``, ``publication_info``, ``titles`` and ``dois`` are
+            optional lists of dicts.
+
+    Returns:
+        str: ``authors (year). *title*. DOI: doi. [INSPIRE record N](url)``
+        followed by a blank line. Missing year, title or DOI become ``N/A``.
+
+    Raises:
+        KeyError: If ``control_number`` is absent.
+    """
+    def first(key):
+        # A key may be missing, None or an empty list; treat all three alike
+        # so that indexing [0] can never raise IndexError.
+        entries = metadata.get(key) or [{}]
+        return entries[0]
+
+    authors = ', '.join(
+        author.get('full_name', '') for author in metadata.get('authors') or []
+    )
+    year = first('publication_info').get('year', 'N/A')
+    title = first('titles').get('title', 'N/A')
+    doi = first('dois').get('value', 'N/A')
+    control_number = metadata['control_number']
+    return (
+        f"{authors} ({year}). *{title}*. DOI: {doi}. "
+        f"[INSPIRE record {control_number}](https://inspirehep.net/literature/{control_number})"
+        "\n\n"
+    )
+def format_results(results):
+    """Return a Markdown listing of all hits, each prefixed by its bold index."""
+    entries = [
+        f"**[{position}]** " + format_reference(record['metadata'])
+        for position, record in enumerate(results['hits']['hits'])
+    ]
+    return "".join(entries)
+def results_context(results):
+    """
+    Prepare a context from the results for the LLM.
+
+    Args:
+        results (dict): Search response; hits are read from
+            ``results['hits']['hits']`` and each hit's ``metadata`` is used.
+
+    Returns:
+        str: One ``Result [i]`` section per hit with its title and abstract.
+        A missing or empty ``titles``/``abstracts`` list yields ``N/A``.
+    """
+    def first(metadata, key):
+        # Guard against a key that is present but holds an empty list.
+        entries = metadata.get(key) or [{}]
+        return entries[0]
+
+    sections = []
+    for i, hit in enumerate(results['hits']['hits']):
+        metadata = hit['metadata']
+        sections.append(
+            f"Result [{i}]\n\n"
+            f"Title: {first(metadata, 'titles').get('title', 'N/A')}\n\n"
+            f"Abstract: {first(metadata, 'abstracts').get('value', 'N/A')}\n\n"
+        )
+    return "".join(sections)
+def user_prompt(query, context):
+    """Assemble the QUERY / CONTEXT / ANSWER prompt text sent to the LLM."""
+    # Leading empty piece and trailing two-space piece reproduce the newline
+    # and indentation at both ends of the prompt.
+    pieces = [
+        "",
+        f"  QUERY: {query}",
+        "  CONTEXT:",
+        f"  {context}",
+        "  ANSWER:",
+        "  ",
+    ]
+    return "\n".join(pieces)
+def llm_expand_query(query):
+    """
+    Ask gpt-4o to rewrite a question as an INSPIRE fulltext search.
+
+    The instruction asks for several phrasings of the query, each prefixed
+    with ``ft`` and joined with ``OR``. Uses the module-level ``client``.
+
+    Args:
+        query (str): The question to expand.
+
+    Returns:
+        The content of the first choice in the model's reply.
+    """
+    instructions = f"""
+            Expand this query into a the query format used for a fulltext search
+            over the INSPIRE HEP database. Propose alternatives of the query to
+            maximize the recall and join those variantes using OR operators and
+            prepend each variant with the ft prefix. Just provide the expanded
+            query, without explanations.
+            Example of query:
+            how far are black holes?
+            Expanded query:
+            ft "how far are black holes" OR ft "distance from black holes" OR ft
+            "distances to black holes" OR ft "measurement of distance to black
+            holes"  OR ft "remoteness of black holes"  OR ft "distance to black
+            holes"  OR ft "how far are singularities"  OR ft "distance to
+            singularities"  OR ft "distances to event horizon"  OR ft "distance
+            from Schwarzschild radius" OR ft "black hole distance"
+            Query: {query}
+            Expanded query:
+            """
+    user_message = {
+        "role": "user",
+        "content": [{"type": "text", "text": instructions}],
+    }
+    completion = client.chat.completions.create(
+        model="gpt-4o",
+        messages=[user_message],
+        response_format={"type": "text"},
+        temperature=1,
+        max_tokens=2048,
+        top_p=1,
+        frequency_penalty=0,
+        presence_penalty=0,
+    )
+    return completion.choices[0].message.content
+def llm_generate_answer(prompt):
+    """
+    Ask gpt-4o for an answer grounded in the search results.
+
+    A system message tells the model to cite results by index and to finish
+    with the query and a brief summary; ``prompt`` is sent as the user
+    message. Uses the module-level ``client``.
+
+    Args:
+        prompt (str): Prompt text containing the query and result context.
+
+    Returns:
+        The content of the first choice in the model's reply.
+    """
+    system_text = """You are part of a Retrieval Augmented Generation system
+            (RAG) and are asked with a query and a context of results. Generate an
+            answer substantiated by the results provided and citing them using
+            their index when used to provide an answer text. Do not generate text
+            that is not grounded in a reference, so all paragraphs should cite a
+            search result. End the answer with the query and a brief answer as
+            summary of the previous discussed results. Do not consider results
+            that are not related to the query and, if no specif answer can be
+            provided, explain that in the brief answer."""
+    conversation = [
+        {"role": "system", "content": [{"type": "text", "text": system_text}]},
+        {"role": "user", "content": [{"type": "text", "text": prompt}]},
+    ]
+    completion = client.chat.completions.create(
+        model="gpt-4o",
+        messages=conversation,
+        response_format={"type": "text"},
+        temperature=1,
+        max_tokens=2048,
+        top_p=1,
+        frequency_penalty=0,
+        presence_penalty=0,
+    )
+    return completion.choices[0].message.content
+def clean_refs(answer, results):
+    """
+    Renumber the citations in an answer and list only the cited references.
+
+    Bracketed numbers such as ``[3]`` in ``answer`` are read as zero-based
+    indexes into ``results['hits']['hits']``. Cited hits are renumbered from
+    1 in order of first appearance, and the reference list uses the same
+    numbers in the same order, so text and list always agree.
+
+    Args:
+        answer (str): Text containing bracketed citation indexes.
+        results (dict): Search response the indexes refer to.
+
+    Returns:
+        tuple[str, str]: The answer with renumbered citations, and the
+        Markdown reference list for the cited hits.
+
+    Note:
+        A bracketed number with no matching hit is left untouched and gets
+        no reference entry, so it cannot shift the numbering of real ones.
+    """
+    hits = results['hits']['hits']
+    citation = re.compile(r'\[(\d+)\]')
+    # Old index -> new number; dict order records order of first appearance.
+    renumbered = {}
+    for match in citation.finditer(answer):
+        old = int(match.group(1))
+        if old < len(hits) and old not in renumbered:
+            renumbered[old] = len(renumbered) + 1
+
+    def _renumber(match):
+        old = int(match.group(1))
+        if old in renumbered:
+            return f"[{renumbered[old]}]"
+        return match.group(0)
+
+    # One substitution pass, so a freshly written number is never rewritten.
+    answer = citation.sub(_renumber, answer)
+    new_results = "".join(
+        f"**[{new}]** " + format_reference(hits[old]['metadata'])
+        for old, new in renumbered.items()
+    )
+    return answer, new_results
+def search(query, progress=gr.Progress()):
+    """
+    Answer a question using INSPIRE HEP search results.
+
+    Expands the question into a fulltext search, runs it against INSPIRE,
+    asks the LLM for an answer grounded in the hits, and renumbers the
+    citations to match the returned reference list.
+
+    Args:
+        query (str): The user's question.
+        progress (gr.Progress): Gradio progress tracker updated at each step.
+
+    Returns:
+        str: Markdown with an ``Answer`` section and a ``References`` section.
+    """
+    # NOTE(review): the purpose of this pause is not evident from the code;
+    # presumably it lets the progress UI render first. Confirm before removing.
+    time.sleep(1)
+    progress(0, desc="Expanding query...")
+    expanded_query = llm_expand_query(query)
+    progress(0.25, desc="Searching INSPIRE HEP...")
+    results = search_inspire(expanded_query)
+    progress(0.50, desc="Generating answer...")
+    context = results_context(results)
+    # The answering prompt gets the user's own question. The expanded
+    # "ft ... OR ft ..." string is only meant for the search engine.
+    prompt = user_prompt(query, context)
+    answer = llm_generate_answer(prompt)
+    new_answer, references = clean_refs(answer, results)
+    progress(1, desc="Done!")
+    return "**Answer**:\n\n" + new_answer + "\n\n**References**:\n\n" + references
+# ----------- MAIN ------------------------------------------------------------
+# The OpenAI key is read from the environment (for example a Space secret).
+# It must never be written into the source: anything committed to the
+# repository is readable by everyone who can see it.
+if not os.environ.get("OPENAI_API_KEY"):
+    raise RuntimeError(
+        "OPENAI_API_KEY is not set; provide it as an environment variable or Space secret."
+    )
+# OpenAI() picks the key up from OPENAI_API_KEY.
+client = OpenAI()
+with gr.Blocks() as demo:
+    gr.Markdown("# INSPIRE HEP Search")
+    with gr.Row():
+        with gr.Column():
+            query = gr.Textbox(label="Search Query")
+            search_btn = gr.Button("Search")
+            examples = gr.Examples([["Which one is closest star?"], ["In which particles does the Higgs Boson decay to?"]], query)
+        with gr.Column():
+            results = gr.Markdown("Answer will appear here...", label="Search Results")
+        search_btn.click(fn=search, inputs=query, outputs=results, api_name="search", show_progress=True)
+# Start the server only when run as a script, so importing this module
+# does not launch the app.
+if __name__ == "__main__":
+    demo.launch()

config.py ADDED Viewed

	@@ -0,0 +1,3 @@

+import os
+# Secrets are taken from the environment and never stored in the repository.
+# The value is an empty string when OPENAI_API_KEY is not set.
+CONFIG = {
+    'OPEN_API_KEY': os.environ.get("OPENAI_API_KEY", ""),
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio
+openai
+requests
+httpx<0.28