# dataset_interface/utils.py
import gc
import os

from datasets import Dataset, DownloadConfig, concatenate_datasets, load_dataset
from huggingface_hub import login


def remove_duplicates(original_dataset, new_dataset, unique_key="text", batch_size=1000):
    """
    Remove rows from new_dataset whose unique_key value already exists in original_dataset.

    Args:
        original_dataset: The original dataset (e.g., dataset['train']).
        new_dataset: The new dataset to be added.
        unique_key: The column name that uniquely identifies each entry.
        batch_size: The batch size used when iterating and filtering large datasets.

    Returns:
        A new dataset containing only the rows of new_dataset not present in original_dataset.
    """
    # Collect the unique keys of the original dataset in batches to limit memory use.
    original_ids = set()
    for batch in original_dataset.iter(batch_size=batch_size):
        original_ids.update(batch[unique_key])

    # With batched=True the filter function receives a batch (a dict of lists)
    # and must return one boolean per row.
    def filter_function(batch):
        return [value not in original_ids for value in batch[unique_key]]

    deduplicated_new_dataset = new_dataset.filter(
        filter_function, batched=True, batch_size=batch_size
    )
    del original_ids
    return deduplicated_new_dataset
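

# Usage sketch (illustrative only, not part of the original module): deduplicate a
# small in-memory Dataset against an existing one on the "text" column; rows of
# `extra` already present in `base` are dropped.
#
#   base = Dataset.from_dict({"text": ["a", "b"]})
#   extra = Dataset.from_dict({"text": ["b", "c"]})
#   remove_duplicates(base, extra)  # -> Dataset containing only {"text": "c"}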


def update_db_hub(texts, topics, dates):
    """Append new rows of text/topic/date data to the dataset repo on the Hugging Face Hub."""
    api_token = os.getenv("hf_key")
    login(token=api_token)
    dataset_name = "Danielrahmai1991/row_data"

    new_rows = {
        "text": texts,
        "topic": topics,
        "date": dates,
    }
    new_dataset = Dataset.from_dict(new_rows)

    try:
        # Load the existing dataset (the token is required because the repo is private).
        dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
        # Optionally drop rows that already exist before appending:
        # new_dataset = remove_duplicates(dataset["train"], new_dataset, unique_key="text", batch_size=1000)
        updated_dataset = concatenate_datasets([dataset["train"], new_dataset])
    except Exception as e:
        # If the existing dataset cannot be loaded (e.g., it does not exist yet),
        # start from the new rows alone.
        updated_dataset = new_dataset
        print(f"Failed to load dataset: {e}")

    gc.collect()
    print("updated_dataset", updated_dataset)

    # Push the updated dataset back to the Hub.
    try:
        updated_dataset.push_to_hub(dataset_name, private=True)  # Set private=False for a public repo.
        print(f"Updated dataset pushed to the Hugging Face Hub: {dataset_name}")
    except Exception as e:
        print(f"Failed to push dataset: {e}")