Danielrahmai1991 committed
Commit 7b6dab4 · verified · 1 Parent(s): 5fa49a2

Update utils.py

Files changed (1)
  1. utils.py +28 -12
utils.py CHANGED
@@ -3,27 +3,33 @@ from huggingface_hub import login
 import os
 from datasets import load_dataset
 from datasets import DownloadConfig
+import gc
 
-def remove_duplicates(original_dataset: Dataset, new_dataset: Dataset, unique_key: str = "text") -> Dataset:
+def remove_duplicates(original_dataset, new_dataset, unique_key="text", batch_size=1000):
     """
     Removes duplicates from the new_dataset that already exist in the original_dataset.
 
     Args:
-        original_dataset (Dataset): The original dataset (e.g., dataset['train']).
-        new_dataset (Dataset): The new dataset to be added.
-        unique_key (str): The column name that uniquely identifies each entry.
+        original_dataset: The original dataset (e.g., dataset['train']).
+        new_dataset: The new dataset to be added.
+        unique_key: The column name that uniquely identifies each entry.
+        batch_size: The size of batches for processing large datasets.
 
     Returns:
-        Dataset: A new dataset with duplicates removed.
+        A new dataset with duplicates removed.
     """
-    # Extract unique keys from the original dataset
-    original_ids = set(original_dataset[unique_key])
+    # Extract unique keys from the original dataset in batches to save memory
+    original_ids = set()
+    for batch in original_dataset.iter(batch_size=batch_size):
+        original_ids.update(batch[unique_key])
 
     # Filter out rows in the new dataset whose unique key exists in the original dataset
-    filtered_new_dataset = new_dataset.filter(lambda example: example[unique_key] not in original_ids)
-
-    return filtered_new_dataset
+    def filter_function(example):
+        return example[unique_key] not in original_ids
 
+    deduplicated_new_dataset = new_dataset.filter(filter_function, batched=True, batch_size=batch_size)
+    del original_ids
+    return deduplicated_new_dataset
 
 def update_db_hub(texts, topics, dates):
     api_token = os.getenv("hf_key")
@@ -43,13 +49,23 @@ def update_db_hub(texts, topics, dates):
         dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
         # print("Dataset loaded successfully!", dataset)
         # print(dataset)
-        # deduplicated_new_dataset = remove_duplicates(dataset['train'], new_dataset, unique_key="text")
-        updated_dataset = concatenate_datasets([dataset['train'], new_dataset])
+        deduplicated_new_dataset = remove_duplicates(
+            dataset['train'],
+            new_dataset,
+            unique_key="text",
+            batch_size=1000  # Adjust batch size based on available memory
+        )
+
+        updated_dataset = concatenate_datasets([dataset['train'], deduplicated_new_dataset])
+
         # updated_dataset = new_dataset
+        del dataset
     except Exception as e:
         updated_dataset = new_dataset
         print(f"Failed to load dataset: {e}")
+
 
+    gc.collect()
     # Replace with your Space's repository name
     # Sample data
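
For reference, below is a minimal, self-contained sketch of the batched de-duplication pattern the new remove_duplicates follows, using hypothetical toy rows rather than data from this repository. One detail worth noting: when datasets.Dataset.filter is called with batched=True, the callback receives a batch as a dict of lists and is expected to return one boolean per row, so the predicate in the sketch builds a list-valued mask over the batch.

from datasets import Dataset

# Hypothetical toy splits standing in for dataset['train'] and new_dataset.
original = Dataset.from_dict({"text": ["a", "b", "c"]})
new = Dataset.from_dict({"text": ["b", "d"]})

# Collect existing keys in batches, as the updated remove_duplicates does.
original_ids = set()
for batch in original.iter(batch_size=2):
    original_ids.update(batch["text"])

# batched=True: the callback gets a dict of lists and returns one boolean per row.
deduplicated = new.filter(
    lambda batch: [t not in original_ids for t in batch["text"]],
    batched=True,
    batch_size=2,
)
print(deduplicated["text"])  # ['d']

Building the key set and running the filter in batches keeps peak memory roughly proportional to batch_size rather than to the size of the full dataset, which is the motivation for this change.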