# RALI-5 / app.py — Hugging Face Space by lemonlemonde
# Commit 8528469 (":sparkles: sort of working", 4.12 kB)
import os
import requests
import json
import pymupdf # PyMuPDF
from openai import OpenAI
from dotenv import load_dotenv
import gradio as gr
from pyvis.network import Network
import networkx as nx
from bs4 import BeautifulSoup
# https://arxiv.org/pdf/2410.06401
def download_and_extract(arxiv_url):
    """Download an arXiv PDF and extract its text to a local markdown file.

    Parameters:
        arxiv_url: direct link to an arXiv PDF, e.g.
            "https://arxiv.org/pdf/2410.06401". The last path segment is
            used as the paper id for the local file names.

    Returns:
        (pdf_path, text_preview, text_path) on success, where text_preview
        is the first 3000 characters of the extracted text.
        (None, error_message, None) when the download fails.
    """
    paper_id = arxiv_url.split("/")[-1]
    os.makedirs("downloads", exist_ok=True)
    pdf_path = f"downloads/{paper_id}.pdf"
    # Bounded timeout so a hung server cannot block the UI forever.
    response = requests.get(arxiv_url, timeout=30)
    if response.status_code != 200:
        # Bug fix: the original returned a 2-tuple here while the success
        # path returns a 3-tuple, so the 3-way unpack in submit() crashed
        # with ValueError on any failed download. Keep the error message in
        # the preview slot so it surfaces in the UI.
        return None, "Failed to download PDF.", None
    with open(pdf_path, "wb") as f:
        f.write(response.content)
    # Extract plain text page-by-page and persist it for the LLM step.
    text_path = f"downloads/{paper_id}_markdown.md"
    doc = pymupdf.open(pdf_path)
    text = "".join(page.get_text() for page in doc)
    doc.close()  # release the PDF file handle promptly
    # Text mode with explicit UTF-8 instead of "wb" + manual .encode().
    with open(text_path, "w", encoding="utf-8") as f:
        f.write(text)
    return pdf_path, text[:3000], text_path  # File + text preview + path
def get_adj_list(text_path, level):
    """Ask the LLM for a JSON adjacency list of concepts in the paper.

    Parameters:
        text_path: path to the extracted markdown/plain-text of the paper
            (as produced by download_and_extract).
        level: audience level string (e.g. "Beginner"), interpolated into
            the prompt.

    Returns:
        The model's JSON adjacency list as a string, with the surrounding
        "```json" / "```" fence (and any preceding reasoning text) removed
        when such a fence is present.
    """
    # File was written as UTF-8; read it back the same way.
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()
    client = OpenAI(
        base_url="https://api.studio.nebius.com/v1/",
        api_key=os.environ.get("NEBIUS_API_KEY")
    )
    response = client.chat.completions.create(
        model="Qwen/Qwen3-235B-A22B",
        max_tokens=10000,
        temperature=0.6,
        top_p=0.95,
        messages=[
            {
                "role": "system",
                "content": f"The markdown version of an arXiv is here: {text}"
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Given the markdown version of an arXiv paper, provide an adjacency list of concepts and key terms within the field of the paper. Make it helpful for a student at an {level} level in the relevant field. Format it in json, encased in '```json' and '```'."""
                    }
                ]
            }
        ]
    )
    # Direct attribute access instead of the to_json()/json.loads round-trip.
    output = response.choices[0].message.content
    # Strip the fenced block the prompt asked for. The original code did
    # output.find("```json") + 7 unconditionally, which silently sliced
    # from index 6 when no fence was present (find() returns -1); only
    # strip when the fence markers actually exist.
    fence_start = output.find("```json")
    if fence_start != -1:
        output = output[fence_start + len("```json"):]
        fence_end = output.find("```")
        if fence_end != -1:
            output = output[:fence_end]
    return output
def gen_concept_graph(adj_list):
    """Render a JSON adjacency list as an embedded interactive pyvis graph.

    Parameters:
        adj_list: JSON string mapping each concept to a list of related
            concepts (as produced by get_adj_list).

    Returns:
        An HTML <iframe> string whose srcdoc carries the pyvis network page.

    Raises:
        json.JSONDecodeError: if adj_list is not valid JSON.
    """
    adj_list_json = json.loads(adj_list)
    # One directed edge per (concept -> related concept) pair.
    G = nx.DiGraph()
    for u, neighbors in adj_list_json.items():
        for v in neighbors:
            G.add_edge(u, v)
    net = Network(notebook=True, cdn_resources="remote")
    net.from_nx(G)
    html = net.generate_html()
    # The iframe quotes srcdoc with single quotes, so the document must be
    # HTML-attribute-escaped. The original replaced ' with ", which can
    # corrupt JS string literals inside the generated page; proper attribute
    # escaping (& first, then ') is decoded back by the browser.
    html = html.replace("&", "&amp;").replace("'", "&#39;")
    # iframe-embedding approach adapted from:
    # https://mahimairaja.medium.com/build-knowledge-graph-from-textdata-using-langchain-under-2min-ce0d0d0e44e8
    return f"""<iframe style="width: 100%; height: 600px;margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
    allow-scripts allow-same-origin allow-popups
    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
    allowpaymentrequest="" frameborder="0" srcdoc='{html}'></iframe>"""
def submit(arxiv_url, level):
    """Pipeline behind the Gradio form.

    Downloads the paper at arxiv_url, asks the LLM for a concept adjacency
    list tuned to the given audience level, and renders it as an
    interactive graph.

    Returns:
        (pdf_path, text_preview, adjacency_list_json, graph_iframe_html)
    """
    pdf_file, preview, markdown_path = download_and_extract(arxiv_url)
    adjacency = get_adj_list(markdown_path, level)
    graph_html = gen_concept_graph(adjacency)
    return pdf_file, preview, adjacency, graph_html
# ============== app wiring ==============
load_dotenv()  # pick up NEBIUS_API_KEY from a local .env, if present

# Form controls, hoisted into named lists for readability.
_INPUTS = [
    gr.Textbox(
        "https://arxiv.org/pdf/2410.06401",
        label="Enter arXiv Link",
        placeholder="https://arxiv.org/pdf/2410.06401",
    ),
    gr.Dropdown(["Beginner", "Intermediate", "Expert"], label="Explanation Level"),
]
_OUTPUTS = [
    gr.File(label="Download PDF"),
    gr.Textbox(label="Preview Extracted Text"),
    gr.Textbox(label="Concept Graph Adj List at Selected Level"),
    gr.HTML(label="Concept Graph at Selected Level"),
]

demo = gr.Interface(
    fn=submit,
    inputs=_INPUTS,
    outputs=_OUTPUTS,
    title="RALI5 - Reading arXiv Like I'm 5",
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)