import dataclasses
from multiprocessing import cpu_count

import pandas as pd
import requests
import streamlit as st
from datasets import Dataset, load_dataset
from paperswithcode import PapersWithCodeClient
from tqdm.auto import tqdm

@dataclasses.dataclass
class PaperInfo:
    date: str
    arxiv_id: str
    github: str
    title: str
    paper_page: str
    upvotes: int
    num_comments: int

def get_df() -> pd.DataFrame:
    """
    Load the initial dataset as a Pandas dataframe.
    """
    df = pd.merge(
        left=load_dataset("hysts-bot-data/daily-papers", split="train").to_pandas(),
        right=load_dataset("hysts-bot-data/daily-papers-stats", split="train").to_pandas(),
        on="arxiv_id",
    )
    # reverse the order so that the most recent papers come first
    df = df[::-1].reset_index(drop=True)

    paper_info = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        info = PaperInfo(
            **row,
            paper_page=f"https://huggingface.co/papers/{row.arxiv_id}",
        )
        paper_info.append(info)
    return pd.DataFrame([dataclasses.asdict(info) for info in paper_info])

def get_github_url(client: PapersWithCodeClient, paper_title: str) -> str:
    """
    Get the GitHub URL for a paper via the Papers with Code API.

    Returns an empty string if no official repository is found.
    """
    repo_url = ""
    try:
        # get the paper ID by searching for the title
        results = client.paper_list(q=paper_title).results
        paper_id = results[0].id
        # get the paper itself
        paper = client.paper_get(paper_id=paper_id)
        # get the repositories linked to the paper and keep the official one
        repositories = client.paper_repository_list(paper_id=paper.id).results
        for repo in repositories:
            if repo.is_official:
                repo_url = repo.url
                break
    except Exception:
        # the paper may not be listed, or the API call may fail
        pass
    return repo_url

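# Usage sketch (hypothetical title; assumes network access and that the
# Papers with Code API is reachable):
#
#   client = PapersWithCodeClient()
#   url = get_github_url(client, "Attention Is All You Need")
#   # url holds the official repository URL, or "" if none is listed
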
def add_metadata_batch(batch, client: PapersWithCodeClient):
    """
    Add metadata to a batch of papers.
    """
    # get GitHub URLs for all papers in the batch
    github_urls = []
    for paper_title in batch["title"]:
        github_url = get_github_url(client, paper_title)
        github_urls.append(github_url)

    # overwrite the GitHub links
    batch["github"] = github_urls
    return batch

def add_hf_assets(batch):
    """
    Add the number of Hugging Face assets (models, datasets, Spaces) to a batch of papers.
    """
    num_spaces = []
    num_models = []
    num_datasets = []
    for arxiv_id in batch["arxiv_id"]:
        if arxiv_id != "":
            # list all Hugging Face repositories that reference this arXiv ID
            response = requests.get(f"https://huggingface.co/api/arxiv/{arxiv_id}/repos")
            result = response.json()
            num_spaces_example = len(result["spaces"])
            num_models_example = len(result["models"])
            num_datasets_example = len(result["datasets"])
        else:
            num_spaces_example = 0
            num_models_example = 0
            num_datasets_example = 0
        num_spaces.append(num_spaces_example)
        num_models.append(num_models_example)
        num_datasets.append(num_datasets_example)

    batch["num_models"] = num_models
    batch["num_datasets"] = num_datasets
    batch["num_spaces"] = num_spaces
    return batch

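# Usage sketch (real arXiv ID shown, but the counts depend on the live
# Hugging Face API at the time of the call):
#
#   batch = add_hf_assets({"arxiv_id": ["1706.03762"]})
#   # batch now has "num_models", "num_datasets" and "num_spaces" columns
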
def check_hf_mention(batch):
    """
    Check whether a paper's GitHub README mentions Hugging Face.
    """
    hf_mentions = []
    for github_url in batch["github"]:
        hf_mention = 0
        if github_url != "":
            # fetch the raw README from GitHub
            owner = github_url.split("/")[-2]
            repo = github_url.split("/")[-1]
            branch = "main"
            url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
            response = requests.get(url)
            if response.status_code != 200:
                # fall back to the master branch as a second attempt
                branch = "master"
                url = f"https://raw.githubusercontent.com/{owner}/{repo}/{branch}/README.md"
                response = requests.get(url)
            if response.status_code == 200:
                text = response.text
                if "huggingface" in text.lower() or "hugging face" in text.lower():
                    hf_mention = 1
        hf_mentions.append(hf_mention)

    # add the result as a new column
    batch["hf_mention"] = hf_mentions
    return batch

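# Usage sketch (hypothetical repository; the result depends on the live README):
#
#   batch = check_hf_mention({"github": ["https://github.com/huggingface/transformers"]})
#   # batch["hf_mention"] is [1] if the README mentions Hugging Face, else [0]
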
def get_data() -> pd.DataFrame:
    """
    Load the dataset and enrich it with metadata.
    """
    # step 1. load as Pandas dataframe and convert the date column to datetime
    df = get_df()
    df["date"] = pd.to_datetime(df["date"])

    # step 2. enrich using the Papers with Code API
    dataset = Dataset.from_pandas(df)
    # uncomment to debug on a small subset of the data
    # dataset = dataset.select(range(10))
    dataset = dataset.map(
        add_metadata_batch,
        batched=True,
        batch_size=4,
        num_proc=cpu_count(),
        fn_kwargs={"client": PapersWithCodeClient()},
    )

    # step 3. enrich using the Hugging Face API
    dataset = dataset.map(add_hf_assets, batched=True, batch_size=4, num_proc=cpu_count())

    # step 4. check if Hugging Face is mentioned in the README
    dataset = dataset.map(check_hf_mention, batched=True, batch_size=4, num_proc=cpu_count())

    # return as a Pandas dataframe
    dataframe = dataset.to_pandas()
    print("First few rows of the dataset:")
    print(dataframe.head())
    return dataframe

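# Minimal entry-point sketch: the streamlit import above suggests this module
# backs a Streamlit app, but rendering the dataframe with st.dataframe is an
# assumption, not part of the original code.
if __name__ == "__main__":
    data = get_data()
    st.dataframe(data)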