import gc
import os

from datasets import Dataset, DownloadConfig, concatenate_datasets, load_dataset
from huggingface_hub import login


def remove_duplicates(original_dataset, new_dataset, unique_key="text", batch_size=1000):
    """
    Removes duplicates from new_dataset that already exist in original_dataset.

    Args:
        original_dataset: The original dataset (e.g., dataset['train']).
        new_dataset: The new dataset to be added.
        unique_key: The column name that uniquely identifies each entry.
        batch_size: The size of batches for processing large datasets.

    Returns:
        A new dataset with duplicates removed.
    """
    # Extract unique keys from the original dataset in batches to save memory
    original_ids = set()
    for batch in original_dataset.iter(batch_size=batch_size):
        original_ids.update(batch[unique_key])

    # Filter out rows in the new dataset whose unique key exists in the original dataset.
    # With batched=True the filter function receives a batch (a dict of lists) and must
    # return one boolean per row, so build a list of booleans here.
    def filter_function(batch):
        return [value not in original_ids for value in batch[unique_key]]

    deduplicated_new_dataset = new_dataset.filter(
        filter_function, batched=True, batch_size=batch_size
    )
    del original_ids
    return deduplicated_new_dataset
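
# Illustrative usage sketch (not part of the original script): deduplicate a small
# in-memory Dataset against an existing one, matching rows on the "text" column.
# The sample rows below are made up for demonstration.
#
#   existing = Dataset.from_dict({"text": ["a", "b"]})
#   incoming = Dataset.from_dict({"text": ["b", "c"]})
#   only_new = remove_duplicates(existing, incoming, unique_key="text")
#   # only_new now contains a single row: {"text": "c"}
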
def update_db_hub(texts, topics, dates):
    api_token = os.getenv("hf_key")
    login(token=api_token)
    dataset_name = "Danielrahmai1991/row_data"

    new_rows = {
        "text": texts,
        "topic": topics,
        "date": dates,
    }
    new_dataset = Dataset.from_dict(new_rows)

    try:
        # Load the existing dataset (the token is required if the repo is private)
        dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
        # Optionally drop rows that already exist upstream before appending:
        # new_dataset = remove_duplicates(
        #     dataset['train'],
        #     new_dataset,
        #     unique_key="text",
        #     batch_size=1000,  # Adjust batch size based on available memory
        # )
        updated_dataset = concatenate_datasets([dataset['train'], new_dataset])
    except Exception as e:
        # If loading fails (e.g., the dataset does not exist yet), start from the new rows only
        updated_dataset = new_dataset
        print(f"Failed to load dataset: {e}")

    gc.collect()
    print("updated_dataset", updated_dataset)

    # Push the updated dataset back to the Hub
    try:
        updated_dataset.push_to_hub(dataset_name, private=True)  # Set private=False if the repo is public
        print(f"Updated dataset pushed to the Hugging Face Hub: {dataset_name}")
    except Exception as e:
        print(f"Failed to push dataset: {e}")