# Hugging Face Space: "Paper Claimer" Gradio app
# (non-code status banner from the Space listing page removed)
from functools import lru_cache
from functools import partial
from typing import Optional

import gradio as gr
import httpx
from cytoolz import groupby
from rich import print
@lru_cache(maxsize=128)
def query_author(author_name: str):
    """Search Semantic Scholar for authors matching *author_name*.

    Returns the list of candidate author records (name, url, externalIds and
    paper stubs with externalIds/title/year) from the API response's ``data``
    field. Results are memoised per process so repeated lookups for the same
    name do not re-hit the API.

    Raises:
        httpx.HTTPStatusError: if the API responds with an error status.
    """
    resp = httpx.get(
        "https://api.semanticscholar.org/graph/v1/author/search",
        # params= URL-encodes the name (spaces, accents, &, ...) — the old
        # f-string interpolation sent it raw
        params={
            "query": author_name,
            "fields": "name,url,externalIds,papers.externalIds,papers.title,papers.year",
        },
        # keep the Space from hanging indefinitely on a slow upstream API
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()["data"]
def get_arxiv_paper(papers):
    """Return only the papers that carry an ArXiv identifier.

    A paper qualifies when it has a truthy ``externalIds`` mapping that in
    turn contains a truthy ``ArXiv`` entry; everything else is dropped.
    """
    arxiv_papers = []
    for paper in papers:
        external_ids = paper.get("externalIds")
        if external_ids and external_ids.get("ArXiv"):
            arxiv_papers.append(paper)
    return arxiv_papers
def check_arxiv_in_papers(arxiv_ids, papers):
    """Return True if any paper's ArXiv id is one of *arxiv_ids*.

    Papers without an ``externalIds`` mapping or without a truthy ``ArXiv``
    entry are ignored.

    The original built two intermediate filtered lists before calling
    ``any``; this does one short-circuiting pass instead.
    """
    return any(
        (arxiv_id := (paper.get("externalIds") or {}).get("ArXiv"))
        and arxiv_id in arxiv_ids
        for paper in papers
    )
def get_author_from_options(potential_authors, positive_arxiv_ids):
    """Pick the first candidate author who wrote one of the known papers.

    Disambiguates same-name authors by checking each candidate's paper list
    against the user-supplied ArXiv ids. Returns None when nobody matches.
    """
    wanted_ids = set(positive_arxiv_ids)
    for author in potential_authors:
        if check_arxiv_in_papers(wanted_ids, author["papers"]):
            return author
    return None
def sort_by_date(papers):
    """Sort papers newest-first by publication year.

    Semantic Scholar can return ``"year": null``; a missing or None year is
    treated as 0 (sorts last) instead of letting ``sorted`` raise a
    TypeError when comparing None with an int.
    """
    return sorted(papers, key=lambda paper: paper.get("year") or 0, reverse=True)
def lookup_hf_paper(arxiv_id):
    """Fetch the Hugging Face paper-index record for *arxiv_id*.

    Deliberately does not raise on HTTP error status: callers inspect the
    returned JSON's ``error`` key to decide whether the paper is indexed.
    """
    response = httpx.get(f"https://huggingface.co/api/papers/{arxiv_id}")
    return response.json()
def check_if_index_hf_paper(paper):
    """Return True when this paper is already indexed on Hugging Face.

    The HF papers API reports unindexed papers via an ``error`` key in the
    JSON body, so "indexed" means that key is absent or falsy.
    """
    hf_record = lookup_hf_paper(paper["externalIds"]["ArXiv"])
    return "error" not in hf_record or not hf_record["error"]
def groupby_indexed_by_hf_papers(papers):
    """Partition papers into ``{True: indexed-on-HF, False: not-indexed}``.

    Same contract as ``cytoolz.groupby`` on ``check_if_index_hf_paper``:
    returns a plain dict of lists, preserving input order, and a key is
    simply absent when no paper falls into that bucket.
    """
    buckets = {}
    for paper in papers:
        buckets.setdefault(check_if_index_hf_paper(paper), []).append(paper)
    return buckets
def check_hf_user_in_authors(paper, hf_user_name):
    """Return True if *hf_user_name* is among the paper's HF-linked authors.

    Authors without a linked ``user`` record are skipped; for the rest the
    nested ``user["user"]`` handle is compared to *hf_user_name*.
    """
    for author in paper["authors"]:
        linked_user = author.get("user")
        if linked_user and linked_user["user"] == hf_user_name:
            return True
    return False
def groupby_hf_user_papers(papers, hf_user_name):
    """Split indexed papers by whether *hf_user_name* already claimed them.

    Returns ``{True: claimed, False: unclaimed}`` (keys absent when empty).
    """
    # closure binds hf_user_name for the groupby key function
    def claimed_by_user(paper):
        return check_hf_user_in_authors(paper, hf_user_name)

    return groupby(claimed_by_user, papers)
def get_papers(
    author_name: str, positive_arxiv_ids: str, hf_user_name: Optional[gr.OAuthProfile]
):
    """Build a Markdown report of the user's papers and their claim status.

    Args:
        author_name: the name the user publishes under.
        positive_arxiv_ids: comma-separated ArXiv IDs of papers the user
            definitely authored, used to disambiguate same-name authors.
        hf_user_name: OAuth profile of the logged-in user, or None.

    Returns:
        A Markdown string listing unclaimed-but-indexed papers and papers
        not yet indexed by Hugging Face.

    Raises:
        gr.Error: if the user is not logged in, gave no ArXiv IDs, no author
            matched the name, or no matching author wrote the given papers.
    """
    if not hf_user_name:
        raise gr.Error("You must be logged in to use this Space")
    if not positive_arxiv_ids:
        raise gr.Error("You must enter at least one ArXiv ID")
    hf_user_name = hf_user_name.preferred_username
    # strip whitespace and drop empty entries left by trailing commas
    positive_arxiv_ids = [
        arxiv_id.strip()
        for arxiv_id in positive_arxiv_ids.split(",")
        if arxiv_id.strip()
    ]
    potential_authors = query_author(author_name)
    if not potential_authors:
        raise gr.Error("No authors found with that name")
    author = get_author_from_options(potential_authors, positive_arxiv_ids)
    # previously a non-match fell through and crashed on author["papers"]
    if author is None:
        raise gr.Error("None of the matching authors wrote the ArXiv papers you listed")
    papers = get_arxiv_paper(author["papers"])
    papers = sort_by_date(papers)
    papers_indexed_by_hf = groupby_indexed_by_hf_papers(papers)
    # .get(True, []) avoids a KeyError when no paper is indexed on HF yet
    indexed_papers = [
        lookup_hf_paper(paper["externalIds"]["ArXiv"])
        for paper in papers_indexed_by_hf.get(True, [])
    ]
    already_claimed = groupby_hf_user_papers(indexed_papers, hf_user_name)
    if already_claimed.get(False):
        results = (
            "# Papers already indexed by Hugging Face which you haven't claimed\n"
            + "These papers are already indexed by Hugging Face, but you haven't"
            " claimed them yet. You can claim them by clicking on the link to the"
            " paper and then clicking on your name in the author list.\n"
        )
        for paper in already_claimed[False]:
            url = f"https://huggingface.co/papers/{paper['id']}"
            results += f"- [{paper['title']}]({url})\n"
    else:
        results = "You have claimed all papers indexed by Hugging Face!\n"
    if papers_indexed_by_hf.get(False):
        results += "# Papers not yet indexed by Hugging Face which you can claim\n"
        for paper in papers_indexed_by_hf[False]:
            paper_title = paper["title"]
            arxiv_id = paper["externalIds"]["ArXiv"]
            url = f"https://huggingface.co/papers/{arxiv_id}"
            results += f"- [{paper_title}]({url})\n"
    return results
def get_name(hf_user_name: Optional[gr.OAuthProfile] = None):
    """Return the logged-in user's display name, or "" when logged out."""
    if not hf_user_name:
        return ""
    return hf_user_name.name
# Gradio UI: all components must be created inside the Blocks context so
# they are attached to the page layout.
with gr.Blocks() as demo:
    # Page title banner.
    gr.HTML(
        "<h1 style='text-align:center;'> 📃 Hugging Face Paper Claimer 📃"
        " </h1>"
    )
    # Short usage instructions shown under the title.
    gr.HTML(
        """<div style='text-align:center;'>You can use this Space to help you find arXiv papers you can still claim.
    You need to be logged in to use this Space.
    Once you login your name will be prepopulated but you can change this if the name you publish under is different.</div>"""
    )
    # Caveat about the upstream data source's accuracy.
    gr.Markdown(
        "**NOTE** This Space uses the [Semantic Scholar"
        " API](https://www.semanticscholar.org/product/api) to find papers you have"
        " authored. Occasionaly this API returns false positives i.e. papers which you"
        " did not author"
    )
    # OAuth controls side by side.
    with gr.Row():
        gr.LoginButton(size="sm")
        gr.LogoutButton(size="sm")
    # value=get_name pre-populates the field from the OAuth profile at page
    # load; the user can still edit it.
    author_name = gr.Textbox(
        value=get_name,
        label="The name you publish under",
        interactive=True,
    )
    # Comma-separated ArXiv ids used to disambiguate same-name authors.
    positive_arxiv_ids = gr.Textbox(
        placeholder="1910.01108",
        label=(
            "ArXiv ID for a paper for which you are an author, separate multiple IDs"
            " with commas"
        ),
        interactive=True,
    )
    btn = gr.Button("Get papers")
    # Output component is created inline; get_papers returns a Markdown string.
    btn.click(get_papers, [author_name, positive_arxiv_ids], gr.Markdown())
# debug=True streams server logs to the console — useful inside a Space.
demo.launch(debug=True)