from datasets import Dataset, DownloadConfig, concatenate_datasets, load_dataset
from huggingface_hub import login

import gc
import os


def remove_duplicates(original_dataset, new_dataset, unique_key="text", batch_size=1000):
    """
    Removes duplicates from new_dataset that already exist in original_dataset.

    Args:
        original_dataset: The original dataset (e.g., dataset['train']).
        new_dataset: The new dataset to be added.
        unique_key: The column name that uniquely identifies each entry.
        batch_size: The size of batches for processing large datasets.

    Returns:
        A new dataset with duplicates removed.
    """
    # Collect the unique keys of the original dataset in batches to keep memory bounded.
    original_ids = set()
    for batch in original_dataset.iter(batch_size=batch_size):
        original_ids.update(batch[unique_key])

    def filter_function(batch):
        # With batched=True the filter receives a dict of lists and must return a list of booleans.
        return [value not in original_ids for value in batch[unique_key]]

    deduplicated_new_dataset = new_dataset.filter(filter_function, batched=True, batch_size=batch_size)
    del original_ids
    return deduplicated_new_dataset
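
# Illustrative usage sketch for remove_duplicates (assumption: `dataset` is a DatasetDict with a
# "train" split and `new_dataset` shares the same "text" column; update_db_hub below does not
# currently call this helper):
#
#     deduplicated = remove_duplicates(dataset["train"], new_dataset, unique_key="text")
#     updated = concatenate_datasets([dataset["train"], deduplicated])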


def update_db_hub(texts, topics, dates):
    # Authenticate against the Hugging Face Hub with the token stored in the "hf_key" env var.
    api_token = os.getenv("hf_key")
    login(token=api_token)
    dataset_name = "Danielrahmai1991/row_data"

    new_rows = {
        "text": texts,
        "topic": topics,
        "date": dates,
    }
    new_dataset = Dataset.from_dict(new_rows)

    try:
        # Load the existing dataset and append the new rows to its train split.
        dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
        updated_dataset = concatenate_datasets([dataset["train"], new_dataset])
    except Exception as e:
        # If the dataset cannot be loaded (e.g., it does not exist yet), start from the new rows alone.
        updated_dataset = new_dataset
        print(f"Failed to load dataset: {e}")

    gc.collect()

    print("updated_dataset", updated_dataset)

    try:
        updated_dataset.push_to_hub(dataset_name, private=True)
        print(f"Updated dataset pushed to the Hugging Face Hub: {dataset_name}")
    except Exception as e:
        print(f"Failed to push dataset: {e}")
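

if __name__ == "__main__":
    # Minimal smoke-test sketch: the rows below are made-up placeholders, and the "hf_key"
    # environment variable is assumed to hold a Hugging Face token with write access to the
    # target dataset repo.
    update_db_hub(
        texts=["example document"],
        topics=["example topic"],
        dates=["2024-01-01"],
    )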