# RALI-5 / app.py — Hugging Face Space by lemonlemonde
# Commit 8528469 (":sparkles: sort of working", 4.12 kB)
import os
import requests
import json
import pymupdf # PyMuPDF
from openai import OpenAI
from dotenv import load_dotenv
import gradio as gr
from pyvis.network import Network
import networkx as nx
from bs4 import BeautifulSoup
# https://arxiv.org/pdf/2410.06401
def download_and_extract(arxiv_url):
    """Download an arXiv PDF and extract its text to a local markdown file.

    Parameters:
        arxiv_url: direct link to an arXiv PDF, e.g.
            "https://arxiv.org/pdf/2410.06401". The last path segment is
            used as the paper id for the local file names.

    Returns:
        (pdf_path, text_preview, text_path) on success, where text_preview
        is the first 3000 characters of the extracted text.
        (None, error_message, None) when the download fails.
    """
    paper_id = arxiv_url.split("/")[-1]
    os.makedirs("downloads", exist_ok=True)
    pdf_path = f"downloads/{paper_id}.pdf"
    # Bounded timeout so a hung server cannot block the UI forever.
    response = requests.get(arxiv_url, timeout=30)
    if response.status_code != 200:
        # Bug fix: the original returned a 2-tuple here while the success
        # path returns a 3-tuple, so the 3-way unpack in submit() crashed
        # with ValueError on any failed download. Keep the error message in
        # the preview slot so it surfaces in the UI.
        return None, "Failed to download PDF.", None
    with open(pdf_path, "wb") as f:
        f.write(response.content)
    # Extract plain text page-by-page and persist it for the LLM step.
    text_path = f"downloads/{paper_id}_markdown.md"
    doc = pymupdf.open(pdf_path)
    text = "".join(page.get_text() for page in doc)
    doc.close()  # release the PDF file handle promptly
    # Text mode with explicit UTF-8 instead of "wb" + manual .encode().
    with open(text_path, "w", encoding="utf-8") as f:
        f.write(text)
    return pdf_path, text[:3000], text_path  # File + text preview + path
def get_adj_list(text_path, level):
    """Ask the LLM for a JSON adjacency list of concepts in the paper.

    Parameters:
        text_path: path to the extracted markdown/plain-text of the paper
            (as produced by download_and_extract).
        level: audience level string (e.g. "Beginner"), interpolated into
            the prompt.

    Returns:
        The model's JSON adjacency list as a string, with the surrounding
        "```json" / "```" fence (and any preceding reasoning text) removed
        when such a fence is present.
    """
    # File was written as UTF-8; read it back the same way.
    with open(text_path, "r", encoding="utf-8") as f:
        text = f.read()
    client = OpenAI(
        base_url="https://api.studio.nebius.com/v1/",
        api_key=os.environ.get("NEBIUS_API_KEY")
    )
    response = client.chat.completions.create(
        model="Qwen/Qwen3-235B-A22B",
        max_tokens=10000,
        temperature=0.6,
        top_p=0.95,
        messages=[
            {
                "role": "system",
                "content": f"The markdown version of an arXiv is here: {text}"
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": f"""Given the markdown version of an arXiv paper, provide an adjacency list of concepts and key terms within the field of the paper. Make it helpful for a student at an {level} level in the relevant field. Format it in json, encased in '```json' and '```'."""
                    }
                ]
            }
        ]
    )
    # Direct attribute access instead of the to_json()/json.loads round-trip.
    output = response.choices[0].message.content
    # Strip the fenced block the prompt asked for. The original code did
    # output.find("```json") + 7 unconditionally, which silently sliced
    # from index 6 when no fence was present (find() returns -1); only
    # strip when the fence markers actually exist.
    fence_start = output.find("```json")
    if fence_start != -1:
        output = output[fence_start + len("```json"):]
        fence_end = output.find("```")
        if fence_end != -1:
            output = output[:fence_end]
    return output
def gen_concept_graph(adj_list):
    """Render a JSON adjacency list as an embedded interactive pyvis graph.

    Parameters:
        adj_list: JSON string mapping each concept to a list of related
            concepts (as produced by get_adj_list).

    Returns:
        An HTML <iframe> string whose srcdoc carries the pyvis network page.

    Raises:
        json.JSONDecodeError: if adj_list is not valid JSON.
    """
    adj_list_json = json.loads(adj_list)
    # One directed edge per (concept -> related concept) pair.
    G = nx.DiGraph()
    for u, neighbors in adj_list_json.items():
        for v in neighbors:
            G.add_edge(u, v)
    net = Network(notebook=True, cdn_resources="remote")
    net.from_nx(G)
    html = net.generate_html()
    # The iframe quotes srcdoc with single quotes, so the document must be
    # HTML-attribute-escaped. The original replaced ' with ", which can
    # corrupt JS string literals inside the generated page; proper attribute
    # escaping (& first, then ') is decoded back by the browser.
    html = html.replace("&", "&amp;").replace("'", "&#39;")
    # iframe-embedding approach adapted from:
    # https://mahimairaja.medium.com/build-knowledge-graph-from-textdata-using-langchain-under-2min-ce0d0d0e44e8
    return f"""<iframe style="width: 100%; height: 600px;margin:0 auto" name="result" allow="midi; geolocation; microphone; camera;
    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
    allow-scripts allow-same-origin allow-popups
    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
    allowpaymentrequest="" frameborder="0" srcdoc='{html}'></iframe>"""
def submit(arxiv_url, level):
    """Pipeline behind the Gradio form.

    Downloads the paper at arxiv_url, asks the LLM for a concept adjacency
    list tuned to the given audience level, and renders it as an
    interactive graph.

    Returns:
        (pdf_path, text_preview, adjacency_list_json, graph_iframe_html)
    """
    pdf_file, preview, markdown_path = download_and_extract(arxiv_url)
    adjacency = get_adj_list(markdown_path, level)
    graph_html = gen_concept_graph(adjacency)
    return pdf_file, preview, adjacency, graph_html
# ============== app wiring ==============
load_dotenv()  # pick up NEBIUS_API_KEY from a local .env, if present

# Form controls, hoisted into named lists for readability.
_INPUTS = [
    gr.Textbox(
        "https://arxiv.org/pdf/2410.06401",
        label="Enter arXiv Link",
        placeholder="https://arxiv.org/pdf/2410.06401",
    ),
    gr.Dropdown(["Beginner", "Intermediate", "Expert"], label="Explanation Level"),
]
_OUTPUTS = [
    gr.File(label="Download PDF"),
    gr.Textbox(label="Preview Extracted Text"),
    gr.Textbox(label="Concept Graph Adj List at Selected Level"),
    gr.HTML(label="Concept Graph at Selected Level"),
]

demo = gr.Interface(
    fn=submit,
    inputs=_INPUTS,
    outputs=_OUTPUTS,
    title="RALI5 - Reading arXiv Like I'm 5",
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)