Spaces:
Paused
Paused
# arxiv_metadata_service.py
from typing import List, Dict, Any | |
import logging | |
from huggingface_dataset_manager import HuggingFaceDatasetManager | |
from arxiv_fetcher import fetch_arxiv_metadata | |
# Configure the root logger once at import time so that the logging.warning()
# call made by the service below is emitted with a timestamp and level prefix.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
class ArxivMetadataService:
    """Fetch arXiv metadata for a query and persist it through a dataset manager.

    The service owns a ``HuggingFaceDatasetManager`` built from the given
    dataset name and delegates all persistence to it.
    """

    def __init__(self, dataset_name: str):
        """Create the dataset manager for ``dataset_name``."""
        self.dataset_manager = HuggingFaceDatasetManager(dataset_name)

    def fetch_and_persist_metadata(self, query: str, max_results: int = 10) -> bool:
        """Fetch metadata for ``query`` and hand it to the dataset manager.

        Args:
            query: Forwarded to ``fetch_arxiv_metadata`` as its first argument.
            max_results: Forwarded to ``fetch_arxiv_metadata`` as its second
                argument.

        Returns:
            ``False`` when the fetch yields nothing (a warning is logged);
            otherwise whatever ``persist_to_dataset`` returns for the fetched
            records.
        """
        records = fetch_arxiv_metadata(query, max_results)
        if records:
            return self.dataset_manager.persist_to_dataset(records)

        # Nothing came back from the fetch, so there is nothing to persist.
        logging.warning("No metadata fetched to persist.")
        return False
def test_arxiv_metadata_service():
    """Run one fetch-and-persist round trip and print the outcome.

    Builds an ``ArxivMetadataService`` for a fixed dataset name, fetches and
    persists metadata for a fixed query, reports success or failure, then
    prints the row count, features and last-modified values reported by the
    dataset manager.

    NOTE(review): this exercises the real service rather than a stub, so it
    presumably needs network access and dataset credentials -- confirm.
    """
    service = ArxivMetadataService("dwb2023/arxiv-papers-dataset")

    test_query = "quantum computing"
    result_limit = 5

    if service.fetch_and_persist_metadata(test_query, result_limit):
        print(f"Successfully fetched and persisted metadata for query: '{test_query}'")
    else:
        print(f"Failed to fetch and persist metadata for query: '{test_query}'")

    # Summarise the dataset regardless of whether the persist step succeeded.
    info = service.dataset_manager.get_dataset_info()
    print("\nDataset Info:")
    print(f"Number of rows: {info.get('num_rows', 'N/A')}")
    print(f"Features: {info.get('features', 'N/A')}")
    print(f"Last modified: {info.get('last_modified', 'N/A')}")
# Script entry point: run the manual smoke test only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    test_arxiv_metadata_service()