"""RALI5 - Reading arXiv Like I'm 5.

Download an arXiv PDF, extract its text, ask an LLM for a concept
adjacency list at a chosen expertise level, and render the result as an
interactive pyvis graph inside a Gradio UI.
"""
import json
import os

import gradio as gr
import networkx as nx
import pymupdf  # PyMuPDF
import requests
from bs4 import BeautifulSoup  # noqa: F401 -- kept; may be used elsewhere in the project
from dotenv import load_dotenv
from openai import OpenAI
from pyvis.network import Network

# Example paper: https://arxiv.org/pdf/2410.06401


def download_and_extract(arxiv_url):
    """Download an arXiv PDF and extract its plain text.

    Args:
        arxiv_url: direct link to the PDF, e.g. https://arxiv.org/pdf/2410.06401

    Returns:
        (pdf_path, text_preview, text_path) on success.
        (None, error_message, None) on download failure -- always a
        3-tuple so callers can unpack safely.
    """
    paper_id = arxiv_url.rstrip("/").split("/")[-1]
    os.makedirs("downloads", exist_ok=True)
    pdf_path = f"downloads/{paper_id}.pdf"

    response = requests.get(arxiv_url, timeout=60)
    if response.status_code != 200:
        # BUG FIX: the original returned a 2-tuple here while the success
        # path returned 3 values, crashing the caller on unpack.
        return None, "Failed to download PDF.", None
    with open(pdf_path, "wb") as f:
        f.write(response.content)

    # Extract text page-by-page and persist it for the LLM step.
    text_path = f"downloads/{paper_id}_markdown.md"
    with pymupdf.open(pdf_path) as doc:  # context manager closes the document
        text = "".join(page.get_text() for page in doc)
    with open(text_path, "w", encoding="utf-8") as f:
        f.write(text)

    # File + 3000-char text preview for the UI.
    return pdf_path, text[:3000], text_path


def get_adj_list(text_path, level):
    """Ask the LLM for a JSON adjacency list of the paper's key concepts.

    Args:
        text_path: path to the extracted text of the paper.
        level: target audience level ("Beginner", "Intermediate", "Expert").

    Returns:
        The JSON string found between the ```json fences of the model
        reply (best effort if the fences are missing).
    """
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()

    client = OpenAI(
        base_url="https://api.studio.nebius.com/v1/",
        api_key=os.environ.get("NEBIUS_API_KEY"),
    )
    response = client.chat.completions.create(
        model="Qwen/Qwen3-235B-A22B",
        max_tokens=10000,
        temperature=0.6,
        top_p=0.95,
        messages=[
            {
                "role": "system",
                "content": f"The markdown version of an arXiv is here: {text}",
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Given the markdown version of an arXiv paper, provide an adjacency list of concepts and key terms within the field of the paper. Make it helpful for a student at an {level} level in the relevant field. 
Format it in json, encased in '```json' and '```'.""",
                    }
                ],
            },
        ],
    )

    # Idiom: read the message content directly instead of round-tripping
    # the whole response through json.loads(response.to_json()).
    output = response.choices[0].message.content

    # Strip any leading reasoning: keep only the fenced ```json ... ``` body.
    # Guarded so a missing fence no longer slices from a bogus offset
    # (the original did find(...) + 7 even when find returned -1).
    start = output.find("```json")
    if start != -1:
        output = output[start + len("```json"):]
    end = output.find("```")
    if end != -1:
        output = output[:end]
    return output


def gen_concept_graph(adj_list):
    """Render a JSON adjacency list as an embedded pyvis network graph.

    Args:
        adj_list: JSON string mapping each concept to a list of related concepts.

    Returns:
        An HTML iframe snippet (srcdoc-embedded) suitable for gr.HTML.
    """
    adj_list_json = json.loads(adj_list)

    # Build a directed graph from the {node: [neighbors]} mapping.
    G = nx.DiGraph()
    for u in adj_list_json:
        for v in adj_list_json[u]:
            G.add_edge(u, v)

    net = Network(notebook=True, cdn_resources="remote")
    net.from_nx(G)
    html = net.generate_html()
    # srcdoc is single-quoted below, so swap the document's quotes to
    # double quotes to avoid closing the attribute early.
    html = html.replace("'", "\"")
    # Embedding technique:
    # https://mahimairaja.medium.com/build-knowledge-graph-from-textdata-using-langchain-under-2min-ce0d0d0e44e8
    # BUG FIX: the original returned an empty f-string (f""""""), so the
    # graph never appeared in the UI.
    return f"""<iframe style="width: 100%; height: 620px; margin: 0 auto" name="result" sandbox="allow-scripts allow-same-origin" frameborder="0" srcdoc='{html}'></iframe>"""


def submit(arxiv_url, level):
    """Gradio callback: run the full pipeline for one arXiv URL.

    Returns values for (File, Textbox, Textbox, HTML) outputs.
    """
    pdf_path, text, text_path = download_and_extract(arxiv_url)
    if pdf_path is None:
        # Download failed; `text` carries the error message. Return
        # placeholders instead of crashing on the downstream steps.
        return None, text, "", ""
    adj_list = get_adj_list(text_path, level)
    concept_graph_html = gen_concept_graph(adj_list)
    return pdf_path, text, adj_list, concept_graph_html


# ==============
load_dotenv()

demo = gr.Interface(
    fn=submit,
    inputs=[
        gr.Textbox(
            "https://arxiv.org/pdf/2410.06401",
            label="Enter arXiv Link",
            placeholder="https://arxiv.org/pdf/2410.06401",
        ),
        gr.Dropdown(["Beginner", "Intermediate", "Expert"], label="Explanation Level"),
    ],
    outputs=[
        gr.File(label="Download PDF"),
        gr.Textbox(label="Preview Extracted Text"),
        gr.Textbox(label="Concept Graph Adj List at Selected Level"),
        gr.HTML(label="Concept Graph at Selected Level"),
    ],
    title="RALI5 - Reading arXiv Like I'm 5",
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)