Spaces:
Build error
Build error
| import os | |
| import requests | |
| import json | |
| import pymupdf # PyMuPDF | |
| from openai import OpenAI | |
| from dotenv import load_dotenv | |
| import gradio as gr | |
| from pyvis.network import Network | |
| import networkx as nx | |
| from bs4 import BeautifulSoup | |
| # https://arxiv.org/pdf/2410.06401 | |
def download_and_extract(arxiv_url):
    """Download an arXiv PDF and extract its full text.

    Args:
        arxiv_url: Direct PDF link, e.g. ``https://arxiv.org/pdf/2410.06401``.

    Returns:
        ``(pdf_path, text_preview, text_path)`` on success, where
        ``text_preview`` is the first 3000 characters of the extracted text.
        ``(None, error_message, None)`` when the download fails, so callers
        that unpack three values do not crash.
    """
    paper_id = arxiv_url.split("/")[-1]
    os.makedirs("downloads", exist_ok=True)
    pdf_path = f"downloads/{paper_id}.pdf"

    response = requests.get(arxiv_url)
    if response.status_code != 200:
        # BUG FIX: the original returned a 2-tuple here while every caller
        # unpacks 3 values, raising ValueError on any failed download.
        return None, "Failed to download PDF.", None
    with open(pdf_path, "wb") as f:
        f.write(response.content)

    # Extract plain text page by page and persist it for the LLM step.
    text_path = f"downloads/{paper_id}_markdown.md"
    doc = pymupdf.open(pdf_path)
    text = "".join(page.get_text() for page in doc)
    doc.close()  # release the file handle held by PyMuPDF
    with open(text_path, "w", encoding="utf-8") as f:
        f.write(text)

    return pdf_path, text[:3000], text_path  # file + 3000-char preview
def get_adj_list(text_path, level):
    """Ask an LLM for a JSON adjacency list of concepts in the paper.

    Args:
        text_path: Path to the extracted paper text (UTF-8).
        level: Target audience level ("Beginner" / "Intermediate" / "Expert")
            interpolated into the prompt.

    Returns:
        The JSON payload extracted from the model reply (the content between
        the ```json fences). If no fence is present, the raw reply is
        returned so the failure is at least visible downstream.
    """
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()

    client = OpenAI(
        base_url="https://api.studio.nebius.com/v1/",
        api_key=os.environ.get("NEBIUS_API_KEY")
    )
    response = client.chat.completions.create(
        model="Qwen/Qwen3-235B-A22B",
        max_tokens=10000,
        temperature=0.6,
        top_p=0.95,
        messages=[
            {
                "role": "system",
                "content": f"The markdown version of an arXiv is here: {text}"
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Given the markdown version of an arXiv paper, provide an adjacency list of concepts and key terms within the field of the paper. Make it helpful for a student at an {level} level in the relevant field. Format it in json, encased in '```json' and '```'."""
                    }
                ]
            }
        ]
    )
    # Typed accessor instead of round-tripping the response through JSON.
    output = response.choices[0].message.content

    # Strip any "thinking" preamble: keep only the fenced ```json block.
    # BUG FIX: the original sliced unconditionally, so a reply without a
    # fence produced silent garbage (str.find returns -1).
    start = output.find("```json")
    if start != -1:
        output = output[start + len("```json"):]
        end = output.find("```")
        if end != -1:
            output = output[:end]
    return output.strip()
def gen_concept_graph(adj_list):
    """Render an interactive pyvis concept graph from a JSON adjacency list.

    Args:
        adj_list: JSON string mapping each concept to a list of related
            concepts, as produced by ``get_adj_list``.

    Returns:
        An ``<iframe srcdoc=...>`` HTML snippet embedding the pyvis page,
        suitable for a ``gr.HTML`` output.

    Raises:
        json.JSONDecodeError: If ``adj_list`` is not valid JSON.
    """
    adj_list_json = json.loads(adj_list)

    # Build a directed graph: one edge per (concept -> related concept).
    G = nx.DiGraph()
    for source, neighbors in adj_list_json.items():
        for target in neighbors:
            G.add_edge(source, target)

    net = Network(notebook=True, cdn_resources="remote")
    net.from_nx(G)
    html = net.generate_html()
    # BUG FIX: the original replaced every single quote with a double quote,
    # which corrupts JavaScript string literals inside the generated page.
    # Escape as an HTML entity instead, keeping the srcdoc='...' attribute
    # well-formed without altering the embedded script's semantics.
    html = html.replace("'", "&#39;")
    # Embedding technique adapted from:
    # https://mahimairaja.medium.com/build-knowledge-graph-from-textdata-using-langchain-under-2min-ce0d0d0e44e8
    return f"""<iframe style="width: 100%; height: 600px;margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
display-capture; encrypted-media;" sandbox="allow-modals allow-forms
allow-scripts allow-same-origin allow-popups
allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
allowpaymentrequest="" frameborder="0" srcdoc='{html}'></iframe>"""
def submit(arxiv_url, level):
    """Gradio callback: download a paper and build its concept graph.

    Args:
        arxiv_url: Direct link to an arXiv PDF.
        level: Audience level for the concept explanations.

    Returns:
        ``(pdf_path, text_preview, adj_list_json, graph_html)`` matching the
        four Gradio output components; on download failure the error message
        is surfaced in the text box and the remaining outputs are empty.
    """
    result = download_and_extract(arxiv_url)
    # BUG FIX: the original unpacked three values unconditionally, so a
    # failed download (which yields a shorter tuple) raised ValueError.
    if result[0] is None:
        return None, result[1], "", ""
    pdf_path, text, text_path = result
    adj_list = get_adj_list(text_path, level)
    concept_graph_html = gen_concept_graph(adj_list)
    return pdf_path, text, adj_list, concept_graph_html
# ---------------------------------------------------------------------------
# Gradio UI wiring
# ---------------------------------------------------------------------------
load_dotenv()  # pick up NEBIUS_API_KEY from a local .env file

# Input widgets: arXiv PDF link plus the target explanation level.
_inputs = [
    gr.Textbox("https://arxiv.org/pdf/2410.06401", label="Enter arXiv Link", placeholder="https://arxiv.org/pdf/2410.06401"),
    gr.Dropdown(["Beginner", "Intermediate", "Expert"], label="Explanation Level"),
]

# Output widgets: the downloaded PDF, a text preview, the raw adjacency
# list, and the rendered concept graph.
_outputs = [
    gr.File(label="Download PDF"),
    gr.Textbox(label="Preview Extracted Text"),
    gr.Textbox(label="Concept Graph Adj List at Selected Level"),
    gr.HTML(label="Concept Graph at Selected Level"),
]

demo = gr.Interface(
    fn=submit,
    inputs=_inputs,
    outputs=_outputs,
    title="RALI5 - Reading arXiv Like I'm 5",
)

if __name__ == "__main__":
    # mcp_server=True also exposes the app as an MCP tool endpoint.
    demo.launch(mcp_server=True)