Commit · c3e313a
1 Parent(s): be9d670

app.py CHANGED
@@ -24,9 +24,7 @@ from typing import List, Dict, Any, Optional
 import gradio as gr
 import requests
 import feedparser
-import spacy
 from bs4 import BeautifulSoup
-from fuzzywuzzy import fuzz
 
 # Configure logging
 logging.basicConfig(
@@ -45,9 +43,6 @@ GITHUB_AUTH = os.environ.get("GITHUB_AUTH")
 if not HF_TOKEN:
     logger.warning("HF_TOKEN not found in environment variables")
 
-# Global spaCy model (loaded lazily)
-nlp = None
-
 
 # Utility functions
 def get_arxiv_id(paper_url: str) -> Optional[str]:
@@ -67,6 +62,60 @@ def extract_links_from_soup(soup, text):
     return html_links + markdown_links
 
 
+def scrape_huggingface_paper_page(paper_url: str) -> Dict[str, Any]:
+    """
+    Scrape HuggingFace paper page to find associated resources
+
+    Returns:
+        Dict containing found resources: {
+            "models": [], "datasets": [], "spaces": [], "code": []
+        }
+    """
+    resources = {"models": [], "datasets": [], "spaces": [], "code": []}
+
+    if not paper_url or "huggingface.co/papers" not in paper_url:
+        return resources
+
+    try:
+        r = requests.get(paper_url, timeout=REQUEST_TIMEOUT)
+        if r.status_code != 200:
+            return resources
+
+        soup = BeautifulSoup(r.text, "html.parser")
+
+        # Find all links on the page
+        links = []
+        for link in soup.find_all("a"):
+            href = link.get("href")
+            if href:
+                # Convert relative URLs to absolute
+                if href.startswith("/"):
+                    href = "https://huggingface.co" + href
+                elif href.startswith("huggingface.co"):
+                    href = "https://" + href
+                links.append(href)
+
+        # Categorize links
+        for link in links:
+            if "huggingface.co/" in link:
+                if "/models/" in link and link not in resources["models"]:
+                    resources["models"].append(link)
+                elif "/datasets/" in link and link not in resources["datasets"]:
+                    resources["datasets"].append(link)
+                elif "/spaces/" in link and link not in resources["spaces"]:
+                    resources["spaces"].append(link)
+            elif "github.com" in link and link not in resources["code"]:
+                resources["code"].append(link)
+
+        logger.info(f"Found {len(resources['models'])} models, {len(resources['datasets'])} datasets, "
+                    f"{len(resources['spaces'])} spaces, {len(resources['code'])} code repos from HF paper page")
+
+    except Exception as e:
+        logger.warning(f"Failed to scrape HuggingFace paper page {paper_url}: {e}")
+
+    return resources
+
+
 def create_row_data(input_data: str) -> Dict[str, Any]:
     """Create standardized row data structure from input."""
     row_data = {
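Note: a minimal usage sketch of the new helper, not part of the commit. It assumes `app.py` is importable with `REQUEST_TIMEOUT` and logging configured as above; the paper URL is just an example built from the arXiv ID used as the UI placeholder later in this commit.

```python
# Hypothetical, standalone check of the new scraper (not in the commit).
from app import scrape_huggingface_paper_page

resources = scrape_huggingface_paper_page("https://huggingface.co/papers/2506.18787")
print("models:  ", resources["models"])
print("datasets:", resources["datasets"])
print("spaces:  ", resources["spaces"])
print("code:    ", resources["code"])
```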
@@ -112,10 +161,25 @@ def infer_paper_from_row(row_data: Dict[str, Any]) -> Optional[str]:
     try:
         url = urlparse(row_data["Paper"])
         if url.scheme in ["http", "https"]:
+            # Convert arXiv PDF to abs format
             if "arxiv.org/pdf/" in row_data["Paper"]:
                 new_url = row_data["Paper"].replace("/pdf/", "/abs/").replace(".pdf", "")
                 logger.info(f"Paper {new_url} inferred from {row_data['Paper']}")
                 return new_url
+
+            # If this is an arXiv URL, try HuggingFace papers first for better resource discovery
+            if "arxiv.org/abs/" in row_data["Paper"]:
+                arxiv_id = row_data["Paper"].split("arxiv.org/abs/")[1]
+                hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
+                try:
+                    # Test if HuggingFace paper page exists and has content
+                    r = requests.get(hf_paper_url, timeout=10)
+                    if r.status_code == 200 and len(r.text) > 1000:  # Basic check for content
+                        logger.info(f"Paper {hf_paper_url} inferred from arXiv (HuggingFace preferred)")
+                        return hf_paper_url
+                except Exception:
+                    pass  # Fall back to original arXiv URL
+
             return row_data["Paper"]
     except Exception:
         pass
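Note: an illustration of the URL rewriting this hunk performs, using the arXiv ID that appears as the UI placeholder later in the commit; not literal code from the change.

```python
# Illustration only: arXiv PDF -> abs -> preferred HuggingFace paper URL.
pdf_url = "https://arxiv.org/pdf/2506.18787.pdf"
abs_url = pdf_url.replace("/pdf/", "/abs/").replace(".pdf", "")  # https://arxiv.org/abs/2506.18787
arxiv_id = abs_url.split("arxiv.org/abs/")[1]                    # 2506.18787
hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"       # used if the page exists and has content
print(abs_url, hf_paper_url)
```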
@@ -246,7 +310,28 @@ def infer_code_from_row(row_data: Dict[str, Any]) -> Optional[str]:
     except Exception:
         pass
 
-    # Try
+    # Try scraping HuggingFace paper page for code links
+    if row_data.get("Paper") is not None:
+        arxiv_id = get_arxiv_id(row_data["Paper"])
+
+        # Try scraping HuggingFace paper page
+        if "huggingface.co/papers" in row_data["Paper"]:
+            resources = scrape_huggingface_paper_page(row_data["Paper"])
+            if resources["code"]:
+                code_url = resources["code"][0]  # Take first code repo found
+                logger.info(f"Code {code_url} inferred from HuggingFace paper page")
+                return code_url
+
+        # If we have arXiv URL, try the HuggingFace version first
+        elif "arxiv.org/abs/" in row_data["Paper"] and arxiv_id:
+            hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
+            resources = scrape_huggingface_paper_page(hf_paper_url)
+            if resources["code"]:
+                code_url = resources["code"][0]
+                logger.info(f"Code {code_url} inferred from HuggingFace paper page (via arXiv)")
+                return code_url
+
+    # Fallback: Try GitHub search for papers
     if row_data.get("Paper") is not None and "arxiv.org" in row_data["Paper"] and GITHUB_AUTH:
         try:
             arxiv_id = get_arxiv_id(row_data["Paper"])
@@ -327,10 +412,29 @@ def infer_model_from_row(row_data: Dict[str, Any]) -> Optional[str]:
 
     if row_data.get("Paper") is not None:
         arxiv_id = get_arxiv_id(row_data["Paper"])
+
+        # First check known mappings
         if arxiv_id is not None and arxiv_id in known_model_mappings:
             model_url = known_model_mappings[arxiv_id]
             logger.info(f"Model {model_url} inferred from Paper (known mapping)")
             return model_url
+
+        # Try scraping HuggingFace paper page
+        if "huggingface.co/papers" in row_data["Paper"]:
+            resources = scrape_huggingface_paper_page(row_data["Paper"])
+            if resources["models"]:
+                model_url = resources["models"][0]  # Take first model found
+                logger.info(f"Model {model_url} inferred from HuggingFace paper page")
+                return model_url
+
+        # If we have arXiv URL, try the HuggingFace version
+        elif "arxiv.org/abs/" in row_data["Paper"] and arxiv_id:
+            hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
+            resources = scrape_huggingface_paper_page(hf_paper_url)
+            if resources["models"]:
+                model_url = resources["models"][0]
+                logger.info(f"Model {model_url} inferred from HuggingFace paper page (via arXiv)")
+                return model_url
 
     return None
 
@@ -347,16 +451,57 @@ def infer_dataset_from_row(row_data: Dict[str, Any]) -> Optional[str]:
 
     if row_data.get("Paper") is not None:
         arxiv_id = get_arxiv_id(row_data["Paper"])
+
+        # First check known mappings
         if arxiv_id is not None and arxiv_id in known_dataset_mappings:
             dataset_url = known_dataset_mappings[arxiv_id]
             logger.info(f"Dataset {dataset_url} inferred from Paper (known mapping)")
             return dataset_url
+
+        # Try scraping HuggingFace paper page
+        if "huggingface.co/papers" in row_data["Paper"]:
+            resources = scrape_huggingface_paper_page(row_data["Paper"])
+            if resources["datasets"]:
+                dataset_url = resources["datasets"][0]  # Take first dataset found
+                logger.info(f"Dataset {dataset_url} inferred from HuggingFace paper page")
+                return dataset_url
+
+        # If we have arXiv URL, try the HuggingFace version
+        elif "arxiv.org/abs/" in row_data["Paper"] and arxiv_id:
+            hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
+            resources = scrape_huggingface_paper_page(hf_paper_url)
+            if resources["datasets"]:
+                dataset_url = resources["datasets"][0]
+                logger.info(f"Dataset {dataset_url} inferred from HuggingFace paper page (via arXiv)")
+                return dataset_url
 
     return None
 
 
 def infer_space_from_row(row_data: Dict[str, Any]) -> Optional[str]:
     """Infer HuggingFace space from row data"""
+    # Try scraping HuggingFace paper page first (most reliable)
+    if row_data.get("Paper") is not None:
+        arxiv_id = get_arxiv_id(row_data["Paper"])
+
+        # Try scraping HuggingFace paper page
+        if "huggingface.co/papers" in row_data["Paper"]:
+            resources = scrape_huggingface_paper_page(row_data["Paper"])
+            if resources["spaces"]:
+                space_url = resources["spaces"][0]  # Take first space found
+                logger.info(f"Space {space_url} inferred from HuggingFace paper page")
+                return space_url
+
+        # If we have arXiv URL, try the HuggingFace version
+        elif "arxiv.org/abs/" in row_data["Paper"] and arxiv_id:
+            hf_paper_url = f"https://huggingface.co/papers/{arxiv_id}"
+            resources = scrape_huggingface_paper_page(hf_paper_url)
+            if resources["spaces"]:
+                space_url = resources["spaces"][0]
+                logger.info(f"Space {space_url} inferred from HuggingFace paper page (via arXiv)")
+                return space_url
+
+    # Fallback: try to infer from model
     if row_data.get("Model") is not None:
         try:
             model_id = row_data["Model"].split("huggingface.co/")[1]
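Note: the code, model, dataset, and space hunks above all follow the same resolution order. The sketch below is only an illustrative summary of that order (known mappings apply to models and datasets only); it is not code from the commit.

```python
# Illustrative summary of the lookup order implemented in the hunks above.
def resolve_resource(kind, paper_url, arxiv_id, known_mappings):
    # 1. Known arXiv-ID -> URL mappings win (models/datasets only).
    if arxiv_id and arxiv_id in known_mappings:
        return known_mappings[arxiv_id]
    # 2. A HuggingFace paper page is scraped directly.
    if "huggingface.co/papers" in paper_url:
        found = scrape_huggingface_paper_page(paper_url)[kind]
        if found:
            return found[0]
    # 3. An arXiv abs URL is mapped to its HuggingFace paper page and scraped.
    elif "arxiv.org/abs/" in paper_url and arxiv_id:
        found = scrape_huggingface_paper_page(
            f"https://huggingface.co/papers/{arxiv_id}")[kind]
        if found:
            return found[0]
    return None
```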
@@ -393,36 +538,6 @@ def infer_license_from_row(row_data: Dict[str, Any]) -> Optional[str]:
     return None
 
 
-def infer_orgs_from_row(row_data: Dict[str, Any]) -> List[str]:
-    """Infer organizations from row data"""
-    global nlp
-    if nlp is None:
-        try:
-            nlp = spacy.load("en_core_web_sm")
-        except OSError as e:
-            logger.warning(f"Could not load spaCy model 'en_core_web_sm': {e}")
-            return row_data.get("Orgs", [])
-
-    orgs_input = row_data.get("Orgs", [])
-    if not orgs_input or not isinstance(orgs_input, list):
-        return []
-
-    orgs = []
-    for org in orgs_input:
-        if not org or not isinstance(org, str):
-            continue
-        doc = nlp(org)
-        for ent in doc.ents:
-            if ent.label_ == "ORG":
-                if ent.text == org and ent.text not in orgs:
-                    orgs.append(ent.text)
-                    break
-                if fuzz.ratio(ent.text, org) > 80 and ent.text not in orgs:
-                    orgs.append(ent.text)
-                    logger.info(f"Org {ent.text} inferred from {org}")
-                    break
-
-    return orgs
 
 
 def infer_field_type(value: str) -> str:
@@ -575,27 +690,6 @@ def classify_research_url(input_data: str) -> str:
     return "Unknown"
 
 
-def infer_organizations(input_data: str) -> List[str]:
-    """
-    Infer affiliated organizations from research paper or project information.
-
-    Args:
-        input_data (str): A URL, paper title, or other research-related input
-
-    Returns:
-        List[str]: A list of organization names, or empty list if no organizations found
-    """
-    if not input_data or not input_data.strip():
-        return []
-
-    try:
-        row_data = create_row_data(input_data.strip())
-        orgs = infer_orgs_from_row(row_data)
-        return orgs if isinstance(orgs, list) else []
-
-    except Exception as e:
-        logger.error(f"Error inferring organizations: {e}")
-        return []
 
 
 def infer_publication_date(input_data: str) -> str:
@@ -734,7 +828,6 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
         "code": None,
         "name": None,
         "authors": [],
-        "organizations": [],
         "date": None,
         "model": None,
         "dataset": None,
@@ -742,7 +835,7 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
         "license": None,
         "field_type": None,
         "success_count": 0,
-        "total_inferences":
+        "total_inferences": 10
     }
 
     inferences = [
@@ -750,7 +843,6 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
         ("code", infer_code_repository),
         ("name", infer_research_name),
         ("authors", infer_authors),
-        ("organizations", infer_organizations),
         ("date", infer_publication_date),
         ("model", infer_model),
         ("dataset", infer_dataset),
@@ -783,41 +875,109 @@ def find_research_relationships(input_data: str) -> Dict[str, Any]:
         return {"error": str(e), "success_count": 0, "total_inferences": 0}
 
 
-
+def format_list_output(items):
+    """Format list items for display"""
+    if not items or not isinstance(items, list):
+        return "None"
+    return "\n".join([f"• {item}" for item in items])
+
+def process_research_relationships(input_data):
+    """Process research input and return formatted results"""
+    if not input_data or not input_data.strip():
+        return "Please enter a valid URL or research name", "", "", "", "", "", "", "", "", ""
+
+    try:
+        result = find_research_relationships(input_data.strip())
+
+        # Extract individual fields with fallback to empty string
+        paper = result.get("paper", "") or ""
+        code = result.get("code", "") or ""
+        name = result.get("name", "") or ""
+        authors = format_list_output(result.get("authors", []))
+        date = result.get("date", "") or ""
+        model = result.get("model", "") or ""
+        dataset = result.get("dataset", "") or ""
+        space = result.get("space", "") or ""
+        license_info = result.get("license", "") or ""
+        field_type = result.get("field_type", "") or ""
+
+        return paper, code, name, authors, date, model, dataset, space, license_info, field_type
+
+    except Exception as e:
+        error_msg = f"Error processing input: {str(e)}"
+        return error_msg, "", "", "", "", "", "", "", "", ""
+
+# Create Gradio interface with both UI and MCP tool exposure
 with gr.Blocks(title="Research Tracker MCP Server") as demo:
-    gr.Markdown("# Research Tracker
+    gr.Markdown("# Research Tracker - Find Research Relationships")
     gr.Markdown("""
-
-
-    **
-    -
-    -
-    - `infer_code_repository` - Discover code repository links
-    - `infer_research_name` - Extract research project names
-    - `classify_research_url` - Classify URL types (paper/code/model/etc.)
-    - `infer_organizations` - Identify affiliated organizations
-    - `infer_publication_date` - Extract publication dates
-    - `infer_model` - Find associated HuggingFace models
-    - `infer_dataset` - Find associated HuggingFace datasets
-    - `infer_space` - Find associated HuggingFace spaces
-    - `infer_license` - Extract license information
-    - `find_research_relationships` - Comprehensive research ecosystem analysis
-
-    **Input Support:**
-    - arXiv paper URLs (https://arxiv.org/abs/...)
+    Enter a research paper URL, GitHub repository, or research name to discover all related resources across platforms.
+
+    **Supported inputs:**
+    - arXiv paper URLs (https://arxiv.org/abs/...) - automatically checks HuggingFace papers first
+    - HuggingFace paper URLs (https://huggingface.co/papers/...) - preferred for better resource discovery
     - GitHub repository URLs (https://github.com/...)
     - HuggingFace model/dataset/space URLs
     - Research paper titles and project names
     - Project page URLs
     """)
 
+    with gr.Row():
+        with gr.Column():
+            input_text = gr.Textbox(
+                label="Paper URL, Repository URL, or Research Name",
+                placeholder="https://arxiv.org/abs/2506.18787",
+                lines=2
+            )
+            submit_btn = gr.Button("Find Research Relationships", variant="primary")
+
+    gr.Markdown("## Research Relationships")
+
+    with gr.Row():
+        with gr.Column():
+            paper_output = gr.Textbox(label="Paper URL", interactive=False)
+            code_output = gr.Textbox(label="Code Repository", interactive=False)
+            name_output = gr.Textbox(label="Research Name", interactive=False)
+            authors_output = gr.Textbox(label="Authors", lines=3, interactive=False)
+
+        with gr.Column():
+            date_output = gr.Textbox(label="Publication Date", interactive=False)
+            model_output = gr.Textbox(label="HuggingFace Model", interactive=False)
+            dataset_output = gr.Textbox(label="HuggingFace Dataset", interactive=False)
+
+        with gr.Column():
+            space_output = gr.Textbox(label="HuggingFace Space", interactive=False)
+            license_output = gr.Textbox(label="License", interactive=False)
+            field_type_output = gr.Textbox(label="Field Type", interactive=False)
+
+    # Connect the interface
+    submit_btn.click(
+        fn=process_research_relationships,
+        inputs=[input_text],
+        outputs=[
+            paper_output, code_output, name_output, authors_output,
+            date_output, model_output, dataset_output,
+            space_output, license_output, field_type_output
+        ]
+    )
+
+    # Also trigger on Enter key
+    input_text.submit(
+        fn=process_research_relationships,
+        inputs=[input_text],
+        outputs=[
+            paper_output, code_output, name_output, authors_output,
+            date_output, model_output, dataset_output,
+            space_output, license_output, field_type_output
+        ]
+    )
+
     # Expose all core functions as MCP tools
     gr.api(infer_authors)
     gr.api(infer_paper_url)
     gr.api(infer_code_repository)
     gr.api(infer_research_name)
     gr.api(classify_research_url)
-    gr.api(infer_organizations)
     gr.api(infer_publication_date)
     gr.api(infer_model)
     gr.api(infer_dataset)
@@ -828,4 +988,4 @@ with gr.Blocks(title="Research Tracker MCP Server") as demo:
 
 if __name__ == "__main__":
     logger.info("Starting Research Tracker MCP Server")
-    demo.launch(mcp_server=True, share=False)
+    demo.launch(mcp_server=True, share=False)
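Note: a hypothetical client-side check, not part of the commit. It assumes the Space is running locally (or at its deployed URL) and that each `gr.api`-registered function is exposed under an endpoint matching its name.

```python
# Hypothetical smoke test against the running server (assumptions noted above).
from gradio_client import Client

client = Client("http://127.0.0.1:7860")  # or the deployed Space URL
paper = client.predict("https://arxiv.org/abs/2506.18787", api_name="/infer_paper_url")
code = client.predict("https://arxiv.org/abs/2506.18787", api_name="/infer_code_repository")
print(paper, code)
```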