Update utils.py
utils.py CHANGED
@@ -3,27 +3,33 @@ from huggingface_hub import login
 import os
 from datasets import load_dataset
 from datasets import DownloadConfig
+import gc
 
-def remove_duplicates(original_dataset
+def remove_duplicates(original_dataset, new_dataset, unique_key="text", batch_size=1000):
     """
     Removes duplicates from the new_dataset that already exist in the original_dataset.
 
     Args:
-        original_dataset
-        new_dataset
-        unique_key
+        original_dataset: The original dataset (e.g., dataset['train']).
+        new_dataset: The new dataset to be added.
+        unique_key: The column name that uniquely identifies each entry.
+        batch_size: The size of batches for processing large datasets.
 
     Returns:
-
+        A new dataset with duplicates removed.
     """
-    # Extract unique keys from the original dataset
-    original_ids = set(
+    # Extract unique keys from the original dataset in batches to save memory
+    original_ids = set()
+    for batch in original_dataset.iter(batch_size=batch_size):
+        original_ids.update(batch[unique_key])
 
     # Filter out rows in the new dataset whose unique key exists in the original dataset
-
-
-    return filtered_new_dataset
+    def filter_function(batch):
+        return [value not in original_ids for value in batch[unique_key]]
 
+    deduplicated_new_dataset = new_dataset.filter(filter_function, batched=True, batch_size=batch_size)
+    del original_ids
+    return deduplicated_new_dataset
 
 def update_db_hub(texts, topics, dates):
     api_token = os.getenv("hf_key")
@@ -43,13 +49,23 @@ def update_db_hub(texts, topics, dates):
         dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
         # print("Dataset loaded successfully!", dataset)
         # print(dataset)
-
-
+        deduplicated_new_dataset = remove_duplicates(
+            dataset['train'],
+            new_dataset,
+            unique_key="text",
+            batch_size=1000  # Adjust batch size based on available memory
+        )
+
+        updated_dataset = concatenate_datasets([dataset['train'], deduplicated_new_dataset])
+
         # updated_dataset = new_dataset
+        del dataset
     except Exception as e:
         updated_dataset = new_dataset
         print(f"Failed to load dataset: {e}")
+
 
+    gc.collect()
     # Replace with your Space's repository name
     # Sample data
 
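A quick way to exercise the new remove_duplicates helper is to run it on two small in-memory datasets. This is a minimal sketch, assuming the function above is importable as `from utils import remove_duplicates` and that the installed `datasets` release provides `Dataset.iter`; note that `concatenate_datasets`, used in the second hunk, needs to be imported from `datasets` somewhere in the file (that import is not visible in the changed lines).

from datasets import Dataset, concatenate_datasets
from utils import remove_duplicates  # helper defined in the diff above (import path assumed)

# Toy data; "text" plays the role of the unique_key used in the diff
original = Dataset.from_dict({"text": ["a", "b", "c"], "topic": ["t1", "t2", "t3"]})
new_rows = Dataset.from_dict({"text": ["b", "d"], "topic": ["t2", "t4"]})

# Keep only the rows of new_rows whose "text" does not already appear in original
deduped = remove_duplicates(original, new_rows, unique_key="text", batch_size=2)
print(deduped["text"])  # ['d'] -- 'b' already exists in original

# Mirrors the second hunk: merge the deduplicated rows into the existing split
updated = concatenate_datasets([original, deduped])
print(len(updated))  # 4 rows: a, b, c, d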