from datasets import Dataset, DownloadConfig, concatenate_datasets, load_dataset
from huggingface_hub import login
import gc
import os


def remove_duplicates(original_dataset, new_dataset, unique_key="text", batch_size=1000):
    """
    Removes rows from new_dataset whose unique key already exists in original_dataset.

    Args:
        original_dataset: The original dataset (e.g., dataset['train']).
        new_dataset: The new dataset to be added.
        unique_key: The column name that uniquely identifies each entry.
        batch_size: The batch size used when iterating over large datasets.

    Returns:
        A new dataset with duplicates removed.
    """
    # Collect the unique keys of the original dataset in batches to save memory.
    original_ids = set()
    for batch in original_dataset.iter(batch_size=batch_size):
        original_ids.update(batch[unique_key])

    # With batched=True the filter function receives a batch (a dict of lists)
    # and must return one boolean per row, not a single boolean.
    def filter_function(batch):
        return [key not in original_ids for key in batch[unique_key]]

    deduplicated_new_dataset = new_dataset.filter(
        filter_function, batched=True, batch_size=batch_size
    )
    del original_ids
    return deduplicated_new_dataset


def update_db_hub(texts, topics, dates):
    api_token = os.getenv("hf_key")
    login(token=api_token)
    dataset_name = "Danielrahmai1991/row_data"  # Replace with your Space's repository name.

    new_rows = {
        "text": texts,
        "topic": topics,
        "date": dates,
    }
    new_dataset = Dataset.from_dict(new_rows)

    try:
        # Load the existing dataset; the token is required if the repo is private.
        dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
        # Optionally drop rows that are already present before appending:
        # new_dataset = remove_duplicates(
        #     dataset["train"],
        #     new_dataset,
        #     unique_key="text",
        #     batch_size=1000,  # Adjust based on available memory.
        # )
        updated_dataset = concatenate_datasets([dataset["train"], new_dataset])
    except Exception as e:
        # If the dataset doesn't exist yet (or fails to load), start from the new rows.
        updated_dataset = new_dataset
        print(f"Failed to load dataset: {e}")

    gc.collect()

    print("updated_dataset", updated_dataset)
    # Push the updated dataset back to the Hub.
    try:
        updated_dataset.push_to_hub(dataset_name, private=True)  # Set private=False for a public repo.
        print(f"Updated dataset pushed to the Hugging Face Hub: {dataset_name}")
    except Exception as e:
        print(f"Failed to push dataset: {e}")
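

# A minimal usage sketch (an assumed entry point, not part of the original
# module; the sample texts/topics/dates are illustrative, and a valid token
# is expected in the "hf_key" environment variable). Running the file
# directly appends one row to the Hub dataset.
if __name__ == "__main__":
    update_db_hub(
        texts=["Example article body."],
        topics=["example-topic"],
        dates=["2024-01-01"],
    )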