Danielrahmai1991 committed on
Commit
ccd014c
·
verified ·
1 Parent(s): 7b6dab4

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +8 -8
utils.py CHANGED
@@ -49,17 +49,17 @@ def update_db_hub(texts, topics, dates):
49
  dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
50
  # print("Dataset loaded successfully!", dataset)
51
  # print(dataset)
52
- deduplicated_new_dataset = remove_duplicates(
53
- dataset['train'],
54
- new_dataset,
55
- unique_key="text",
56
- batch_size=1000 # Adjust batch size based on available memory
57
- )
58
 
59
- updated_dataset = concatenate_datasets([dataset['train'], deduplicated_new_dataset])
60
 
61
  # updated_dataset = new_dataset
62
- del dataset
63
  except Exception as e:
64
  updated_dataset = new_dataset
65
  print(f"Failed to load dataset: {e}")
 
49
  dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
50
  # print("Dataset loaded successfully!", dataset)
51
  # print(dataset)
52
+ # deduplicated_new_dataset = remove_duplicates(
53
+ # dataset['train'],
54
+ # new_dataset,
55
+ # unique_key="text",
56
+ # batch_size=1000 # Adjust batch size based on available memory
57
+ # )
58
 
59
+ updated_dataset = concatenate_datasets([dataset['train'], new_dataset])
60
 
61
  # updated_dataset = new_dataset
62
+ # del dataset
63
  except Exception as e:
64
  updated_dataset = new_dataset
65
  print(f"Failed to load dataset: {e}")