Update utils.py
utils.py
CHANGED
@@ -4,6 +4,26 @@ import os
 from datasets import load_dataset
 from datasets import DownloadConfig
 
+def remove_duplicates(original_dataset: Dataset, new_dataset: Dataset, unique_key: str = "text") -> Dataset:
+    """
+    Removes duplicates from the new_dataset that already exist in the original_dataset.
+
+    Args:
+        original_dataset (Dataset): The original dataset (e.g., dataset['train']).
+        new_dataset (Dataset): The new dataset to be added.
+        unique_key (str): The column name that uniquely identifies each entry.
+
+    Returns:
+        Dataset: A new dataset with duplicates removed.
+    """
+    # Extract unique keys from the original dataset
+    original_ids = set(original_dataset[unique_key])
+
+    # Filter out rows in the new dataset whose unique key exists in the original dataset
+    filtered_new_dataset = new_dataset.filter(lambda example: example[unique_key] not in original_ids)
+
+    return filtered_new_dataset
+
 
 def update_db_hub(texts, topics, dates):
     api_token = os.getenv("hf_key")
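The new helper materializes the unique_key column of the original split into a Python set once, so the filter callback does an O(1) membership check per row instead of rescanning the original dataset. Note that the hunks shown here do not add the imports the new code relies on (Dataset for the type hints, and concatenate_datasets used in the second hunk); unless they already exist at the top of utils.py, they would need to be added. A minimal sketch of how the helper behaves, using toy in-memory rows invented for illustration:

    from datasets import Dataset

    # Toy rows, assumed for illustration only
    original = Dataset.from_dict({"text": ["a", "b"]})
    incoming = Dataset.from_dict({"text": ["b", "c"]})

    original_ids = set(original["text"])  # {"a", "b"}
    deduped = incoming.filter(lambda ex: ex["text"] not in original_ids)
    print(deduped["text"])  # ['c']: only the genuinely new row survives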
@@ -20,11 +40,12 @@ def update_db_hub(texts, topics, dates):
 
     try:
         # Load the dataset (use_auth_token=True if it's private)
-
+        dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
         # print("Dataset loaded successfully!", dataset)
         # print(dataset)
-        #
-        updated_dataset = new_dataset
+        # deduplicated_new_dataset = remove_duplicates(dataset['train'], new_dataset, unique_key="text")
+        updated_dataset = concatenate_datasets([dataset['train'], new_dataset])
+        # updated_dataset = new_dataset
     except Exception as e:
         updated_dataset = new_dataset
         print(f"Failed to load dataset: {e}")
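Taken together, update_db_hub now attempts to load the existing dataset with an authenticated DownloadConfig, appends the incoming rows with concatenate_datasets, and falls back to keeping only the new rows when the load fails (for example, on the first run). The remove_duplicates call is left commented out, so repeated rows are still appended as-is. A sketch of the resulting flow under stated assumptions: dataset_name, the column names, and the final push_to_hub step are not shown in this diff and are placeholders here.

    import os
    from datasets import (Dataset, DownloadConfig, concatenate_datasets,
                          load_dataset)

    def update_db_hub(texts, topics, dates):
        api_token = os.getenv("hf_key")
        dataset_name = "user/dataset"  # placeholder repo id, defined elsewhere in utils.py
        # Assumed column names; the diff only shows "text" being used as the unique key
        new_dataset = Dataset.from_dict({"text": texts, "topic": topics, "date": dates})

        try:
            # Authenticated load of the existing dataset
            dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
            # Optionally drop rows that already exist before appending:
            # new_dataset = remove_duplicates(dataset["train"], new_dataset, unique_key="text")
            updated_dataset = concatenate_datasets([dataset["train"], new_dataset])
        except Exception as e:
            # First run or load failure: start over from the new rows alone
            updated_dataset = new_dataset
            print(f"Failed to load dataset: {e}")

        # Hypothetical push step, not part of the hunks shown
        updated_dataset.push_to_hub(dataset_name, token=api_token)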