import os
import gc

from datasets import Dataset, concatenate_datasets, load_dataset, DownloadConfig
from huggingface_hub import login

def remove_duplicates(original_dataset, new_dataset, unique_key="text", batch_size=1000):
    """
    Removes rows from new_dataset whose unique key already exists in original_dataset.

    Args:
        original_dataset: The original dataset (e.g., dataset['train']).
        new_dataset: The new dataset to be added.
        unique_key: The column name that uniquely identifies each entry.
        batch_size: The batch size used when iterating over large datasets.

    Returns:
        A new dataset with duplicates removed.
    """
    # Extract unique keys from the original dataset in batches to save memory.
    original_ids = set()
    for batch in original_dataset.iter(batch_size=batch_size):
        original_ids.update(batch[unique_key])

    # In batched mode the filter function receives a dict of lists and must
    # return one boolean per row in the batch.
    def filter_function(batch):
        return [value not in original_ids for value in batch[unique_key]]

    deduplicated_new_dataset = new_dataset.filter(
        filter_function, batched=True, batch_size=batch_size
    )
    del original_ids
    return deduplicated_new_dataset
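
# A minimal sketch of the deduplication step in isolation (the "text" values
# below are hypothetical sample rows, not taken from the real dataset):
#   existing = Dataset.from_dict({"text": ["a", "b"]})
#   incoming = Dataset.from_dict({"text": ["b", "c"]})
#   remove_duplicates(existing, incoming)  # keeps only the row with text "c"
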
def update_db_hub(texts, topics, dates):
    api_token = os.getenv("hf_key")
    login(token=api_token)
    dataset_name = "Danielrahmai1991/row_data"

    new_rows = {
        "text": texts,
        "topic": topics,
        "date": dates
    }
    new_dataset = Dataset.from_dict(new_rows)
    try:
        # Load the existing dataset (the token is required if the repository is private).
        dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
        deduplicated_new_dataset = remove_duplicates(
            dataset['train'],
            new_dataset,
            unique_key="text",
            batch_size=1000  # Adjust batch size based on available memory
        )
        updated_dataset = concatenate_datasets([dataset['train'], deduplicated_new_dataset])
        del dataset
    except Exception as e:
        # If loading fails (e.g., the dataset does not exist yet), start from the new rows only.
        updated_dataset = new_dataset
        print(f"Failed to load dataset: {e}")
    gc.collect()
    print("updated_dataset", updated_dataset)
    # Push the updated dataset back to the Hub.
    try:
        updated_dataset.push_to_hub(dataset_name, private=True)  # Set private=False if the dataset is public
        print(f"Updated dataset pushed to the Hugging Face Hub: {dataset_name}")
    except Exception as e:
        print(f"Failed to push dataset: {e}")