Danielrahmai1991 committed
Commit 7b6dab4 · verified · 1 Parent(s): 5fa49a2

Update utils.py

Files changed (1)
  1. utils.py +28 -12
utils.py CHANGED
@@ -3,27 +3,33 @@ from huggingface_hub import login
 import os
 from datasets import load_dataset
 from datasets import DownloadConfig
+import gc
 
-def remove_duplicates(original_dataset: Dataset, new_dataset: Dataset, unique_key: str = "text") -> Dataset:
+def remove_duplicates(original_dataset, new_dataset, unique_key="text", batch_size=1000):
     """
     Removes duplicates from the new_dataset that already exist in the original_dataset.
 
     Args:
-        original_dataset (Dataset): The original dataset (e.g., dataset['train']).
-        new_dataset (Dataset): The new dataset to be added.
-        unique_key (str): The column name that uniquely identifies each entry.
+        original_dataset: The original dataset (e.g., dataset['train']).
+        new_dataset: The new dataset to be added.
+        unique_key: The column name that uniquely identifies each entry.
+        batch_size: The size of batches for processing large datasets.
 
     Returns:
-        Dataset: A new dataset with duplicates removed.
+        A new dataset with duplicates removed.
     """
-    # Extract unique keys from the original dataset
-    original_ids = set(original_dataset[unique_key])
+    # Extract unique keys from the original dataset in batches to save memory
+    original_ids = set()
+    for batch in original_dataset.iter(batch_size=batch_size):
+        original_ids.update(batch[unique_key])
 
     # Filter out rows in the new dataset whose unique key exists in the original dataset
-    filtered_new_dataset = new_dataset.filter(lambda example: example[unique_key] not in original_ids)
-
-    return filtered_new_dataset
+    def filter_function(example):
+        return example[unique_key] not in original_ids
 
+    deduplicated_new_dataset = new_dataset.filter(filter_function, batched=True, batch_size=batch_size)
+    del original_ids
+    return deduplicated_new_dataset
 
 def update_db_hub(texts, topics, dates):
     api_token = os.getenv("hf_key")
@@ -43,13 +49,23 @@ def update_db_hub(texts, topics, dates):
         dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
         # print("Dataset loaded successfully!", dataset)
         # print(dataset)
-        # deduplicated_new_dataset = remove_duplicates(dataset['train'], new_dataset, unique_key="text")
-        updated_dataset = concatenate_datasets([dataset['train'], new_dataset])
+        deduplicated_new_dataset = remove_duplicates(
+            dataset['train'],
+            new_dataset,
+            unique_key="text",
+            batch_size=1000  # Adjust batch size based on available memory
+        )
+
+        updated_dataset = concatenate_datasets([dataset['train'], deduplicated_new_dataset])
+
         # updated_dataset = new_dataset
+        del dataset
     except Exception as e:
         updated_dataset = new_dataset
         print(f"Failed to load dataset: {e}")
+
 
+    gc.collect()
     # Replace with your Space's repository name
     # Sample data
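
For reference, below is a minimal, self-contained sketch of the batched de-duplication pattern the new remove_duplicates follows, using hypothetical toy rows rather than data from this repository. One detail worth noting: when datasets.Dataset.filter is called with batched=True, the callback receives a batch as a dict of lists and is expected to return one boolean per row, so the predicate in the sketch builds a list-valued mask over the batch.

from datasets import Dataset

# Hypothetical toy splits standing in for dataset['train'] and new_dataset.
original = Dataset.from_dict({"text": ["a", "b", "c"]})
new = Dataset.from_dict({"text": ["b", "d"]})

# Collect existing keys in batches, as the updated remove_duplicates does.
original_ids = set()
for batch in original.iter(batch_size=2):
    original_ids.update(batch["text"])

# batched=True: the callback gets a dict of lists and returns one boolean per row.
deduplicated = new.filter(
    lambda batch: [t not in original_ids for t in batch["text"]],
    batched=True,
    batch_size=2,
)
print(deduplicated["text"])  # ['d']

Building the key set and running the filter in batches keeps peak memory roughly proportional to batch_size rather than to the size of the full dataset, which is the motivation for this change.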