from datasets import Dataset, DownloadConfig, concatenate_datasets, load_dataset
from huggingface_hub import login
import gc
import os


def remove_duplicates(original_dataset, new_dataset, unique_key="text", batch_size=1000):
    """
    Removes rows from new_dataset whose unique key already exists in original_dataset.

    Args:
        original_dataset: The original dataset (e.g., dataset['train']).
        new_dataset: The new dataset to be added.
        unique_key: The column name that uniquely identifies each entry.
        batch_size: The batch size used when iterating over large datasets.

    Returns:
        A new dataset with duplicates removed.
    """
    # Collect the unique keys of the original dataset in batches to save memory.
    original_ids = set()
    for batch in original_dataset.iter(batch_size=batch_size):
        original_ids.update(batch[unique_key])

    # With batched=True the filter function receives a batch (a dict of lists)
    # and must return one boolean per row, not a single boolean.
    def filter_function(batch):
        return [key not in original_ids for key in batch[unique_key]]

    deduplicated_new_dataset = new_dataset.filter(
        filter_function, batched=True, batch_size=batch_size
    )
    del original_ids
    return deduplicated_new_dataset


def update_db_hub(texts, topics, dates):
    api_token = os.getenv("hf_key")
    login(token=api_token)
    dataset_name = "Danielrahmai1991/row_data"  # Replace with your Space's repository name.

    new_rows = {
        "text": texts,
        "topic": topics,
        "date": dates,
    }
    new_dataset = Dataset.from_dict(new_rows)

    try:
        # Load the existing dataset; the token is required if the repo is private.
        dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
        # Optionally drop rows that are already present before appending:
        # new_dataset = remove_duplicates(
        #     dataset["train"],
        #     new_dataset,
        #     unique_key="text",
        #     batch_size=1000,  # Adjust based on available memory.
        # )
        updated_dataset = concatenate_datasets([dataset["train"], new_dataset])
    except Exception as e:
        # If the dataset doesn't exist yet (or fails to load), start from the new rows.
        updated_dataset = new_dataset
        print(f"Failed to load dataset: {e}")

    gc.collect()

    print("updated_dataset", updated_dataset)
    # Push the updated dataset back to the Hub.
    try:
        updated_dataset.push_to_hub(dataset_name, private=True)  # Set private=False for a public repo.
        print(f"Updated dataset pushed to the Hugging Face Hub: {dataset_name}")
    except Exception as e:
        print(f"Failed to push dataset: {e}")
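

# A minimal usage sketch (an assumed entry point, not part of the original
# module; the sample texts/topics/dates are illustrative, and a valid token
# is expected in the "hf_key" environment variable). Running the file
# directly appends one row to the Hub dataset.
if __name__ == "__main__":
    update_db_hub(
        texts=["Example article body."],
        topics=["example-topic"],
        dates=["2024-01-01"],
    )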