# arxiv-rag-mvp / arxiv_metadata_service.py
# donb-hf's picture
# update services
# 84deff7
# arxiv_metadata_service.py
from typing import List, Dict, Any
import logging
from huggingface_dataset_manager import HuggingFaceDatasetManager
from arxiv_fetcher import fetch_arxiv_metadata
# Set up root logging when this module is imported: INFO threshold, with each
# record rendered as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
class ArxivMetadataService:
    """Fetch arXiv metadata for a query and pass it on for persistence.

    Retrieval is delegated to ``fetch_arxiv_metadata``; persistence is
    delegated to a ``HuggingFaceDatasetManager`` created for one dataset name.
    """

    def __init__(self, dataset_name: str):
        """Create the dataset manager for ``dataset_name``."""
        self.dataset_manager = HuggingFaceDatasetManager(dataset_name)

    def fetch_and_persist_metadata(self, query: str, max_results: int = 10) -> bool:
        """Fetch metadata for ``query`` and hand it to the dataset manager.

        Args:
            query: Search string forwarded to ``fetch_arxiv_metadata``.
            max_results: Result limit forwarded to ``fetch_arxiv_metadata``.

        Returns:
            ``False`` (after logging a warning) when the fetch result is
            empty/falsy; otherwise whatever
            ``dataset_manager.persist_to_dataset`` returns for the fetched
            records.
        """
        fetched = fetch_arxiv_metadata(query, max_results)
        if fetched:
            return self.dataset_manager.persist_to_dataset(fetched)

        # Nothing came back, so there is nothing to write.
        logging.warning("No metadata fetched to persist.")
        return False
def test_arxiv_metadata_service():
    """Manual smoke check: fetch, persist, then print the dataset's info.

    Runs one fixed query through ``ArxivMetadataService`` and reports the
    outcome on stdout, followed by selected fields from
    ``dataset_manager.get_dataset_info()``.

    NOTE(review): this builds a real service against a named dataset, so it
    presumably needs network access and credentials — confirm before letting
    a test runner collect it.
    """
    # Fixed query used for the check.
    test_query = "quantum computing"

    # Service bound to the test dataset name.
    service = ArxivMetadataService("dwb2023/arxiv-papers-dataset")

    # Ask for at most five results and report the outcome.
    persisted = service.fetch_and_persist_metadata(test_query, 5)
    if not persisted:
        print(f"Failed to fetch and persist metadata for query: '{test_query}'")
    else:
        print(f"Successfully fetched and persisted metadata for query: '{test_query}'")

    # Dataset summary; missing keys are shown as 'N/A'.
    info = service.dataset_manager.get_dataset_info()
    print("\nDataset Info:")
    print(f"Number of rows: {info.get('num_rows', 'N/A')}")
    print(f"Features: {info.get('features', 'N/A')}")
    print(f"Last modified: {info.get('last_modified', 'N/A')}")
# Run the smoke check only when this file is executed as a script; importing
# the module does not trigger it.
if __name__ == "__main__":
    test_arxiv_metadata_service()