Danielrahmai1991 committed
Commit 5fa49a2 (verified) · Parent: 6d2e16c

Update utils.py

Files changed (1): utils.py +24 -3
utils.py CHANGED
@@ -4,6 +4,26 @@ import os
 from datasets import load_dataset
 from datasets import DownloadConfig
 
+def remove_duplicates(original_dataset: Dataset, new_dataset: Dataset, unique_key: str = "text") -> Dataset:
+    """
+    Removes duplicates from the new_dataset that already exist in the original_dataset.
+
+    Args:
+        original_dataset (Dataset): The original dataset (e.g., dataset['train']).
+        new_dataset (Dataset): The new dataset to be added.
+        unique_key (str): The column name that uniquely identifies each entry.
+
+    Returns:
+        Dataset: A new dataset with duplicates removed.
+    """
+    # Extract unique keys from the original dataset
+    original_ids = set(original_dataset[unique_key])
+
+    # Filter out rows in the new dataset whose unique key exists in the original dataset
+    filtered_new_dataset = new_dataset.filter(lambda example: example[unique_key] not in original_ids)
+
+    return filtered_new_dataset
+
 
 def update_db_hub(texts, topics, dates):
     api_token = os.getenv("hf_key")
@@ -20,11 +40,12 @@ def update_db_hub(texts, topics, dates):
 
     try:
         # Load the dataset (use_auth_token=True if it's private)
-        # dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
+        dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
         # print("Dataset loaded successfully!", dataset)
         # print(dataset)
-        # updated_dataset = concatenate_datasets([dataset['train'], new_dataset])
-        updated_dataset = new_dataset
+        # deduplicated_new_dataset = remove_duplicates(dataset['train'], new_dataset, unique_key="text")
+        updated_dataset = concatenate_datasets([dataset['train'], new_dataset])
+        # updated_dataset = new_dataset
     except Exception as e:
         updated_dataset = new_dataset
         print(f"Failed to load dataset: {e}")