File size: 1,670 Bytes
84deff7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
# arxiv_metadata_service.py

from typing import List, Dict, Any
import logging
from huggingface_dataset_manager import HuggingFaceDatasetManager
from arxiv_fetcher import fetch_arxiv_metadata

# Configure the root logger when this module is imported: INFO level and
# above, each record formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

class ArxivMetadataService:
    """Fetch arXiv metadata for a query and pass it to a dataset manager.

    Each instance owns one ``HuggingFaceDatasetManager`` built from the
    dataset name given at construction; it is exposed as ``dataset_manager``.
    """

    def __init__(self, dataset_name: str):
        """Create the dataset manager for *dataset_name*."""
        self.dataset_manager = HuggingFaceDatasetManager(dataset_name)

    def fetch_and_persist_metadata(self, query: str, max_results: int = 10) -> bool:
        """Fetch metadata for *query* and persist it through the dataset manager.

        Args:
            query: Search string forwarded unchanged to ``fetch_arxiv_metadata``.
            max_results: Result limit forwarded unchanged to
                ``fetch_arxiv_metadata``.

        Returns:
            ``False`` (after logging a warning) when the fetch result is
            empty or otherwise falsy; in every other case, whatever
            ``dataset_manager.persist_to_dataset`` returns for the fetched
            metadata.
        """
        fetched = fetch_arxiv_metadata(query, max_results)
        if fetched:
            return self.dataset_manager.persist_to_dataset(fetched)

        # Nothing came back, so there is nothing to hand to the dataset manager.
        logging.warning("No metadata fetched to persist.")
        return False

def test_arxiv_metadata_service():
    """Run a manual smoke test of ``ArxivMetadataService`` and print a summary.

    Builds a service for the dataset name "dwb2023/arxiv-papers-dataset",
    asks it to fetch and persist up to 5 results for the query
    "quantum computing", and prints whether that call reported success.
    Afterwards it prints the ``num_rows``, ``features`` and ``last_modified``
    entries of the dataset manager's info mapping, showing 'N/A' for any
    key that is absent.

    NOTE(review): presumably this performs real network and dataset I/O
    through the service's collaborators -- confirm before running it in
    automated test suites.
    """
    test_query = "quantum computing"
    service = ArxivMetadataService("dwb2023/arxiv-papers-dataset")

    # Report the outcome of the fetch-and-persist round trip.
    persisted = service.fetch_and_persist_metadata(test_query, 5)
    if not persisted:
        print(f"Failed to fetch and persist metadata for query: '{test_query}'")
    else:
        print(f"Successfully fetched and persisted metadata for query: '{test_query}'")

    # Summarize the dataset as reported by the dataset manager.
    info = service.dataset_manager.get_dataset_info()
    print("\nDataset Info:")
    print(f"Number of rows: {info.get('num_rows', 'N/A')}")
    print(f"Features: {info.get('features', 'N/A')}")
    print(f"Last modified: {info.get('last_modified', 'N/A')}")

# Run the smoke test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    test_arxiv_metadata_service()