import os
import gc

from datasets import Dataset, concatenate_datasets, load_dataset, DownloadConfig
from huggingface_hub import login

def remove_duplicates(original_dataset, new_dataset, unique_key="text", batch_size=1000):
    """
    Removes rows from new_dataset whose unique key already exists in original_dataset.

    Args:
        original_dataset: The original dataset (e.g., dataset['train']).
        new_dataset: The new dataset to be added.
        unique_key: The column name that uniquely identifies each entry.
        batch_size: The batch size used when iterating over large datasets.

    Returns:
        A new dataset with duplicates removed.
    """
    # Extract unique keys from the original dataset in batches to save memory.
    original_ids = set()
    for batch in original_dataset.iter(batch_size=batch_size):
        original_ids.update(batch[unique_key])

    # In batched mode the filter function receives a dict of lists and must
    # return one boolean per row in the batch.
    def filter_function(batch):
        return [value not in original_ids for value in batch[unique_key]]

    deduplicated_new_dataset = new_dataset.filter(
        filter_function, batched=True, batch_size=batch_size
    )
    del original_ids
    return deduplicated_new_dataset
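
# A minimal sketch of the deduplication step in isolation (the "text" values
# below are hypothetical sample rows, not taken from the real dataset):
#   existing = Dataset.from_dict({"text": ["a", "b"]})
#   incoming = Dataset.from_dict({"text": ["b", "c"]})
#   remove_duplicates(existing, incoming)  # keeps only the row with text "c"
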
def update_db_hub(texts, topics, dates):
    api_token = os.getenv("hf_key")
    login(token=api_token)
    dataset_name = "Danielrahmai1991/row_data"

    new_rows = {
        "text": texts,
        "topic": topics,
        "date": dates
    }
    new_dataset = Dataset.from_dict(new_rows)
    try:
        # Load the existing dataset (the token is required if the repository is private).
        dataset = load_dataset(dataset_name, download_config=DownloadConfig(token=api_token))
        deduplicated_new_dataset = remove_duplicates(
            dataset['train'],
            new_dataset,
            unique_key="text",
            batch_size=1000  # Adjust batch size based on available memory
        )
        updated_dataset = concatenate_datasets([dataset['train'], deduplicated_new_dataset])
        del dataset
    except Exception as e:
        # If loading fails (e.g., the dataset does not exist yet), start from the new rows only.
        updated_dataset = new_dataset
        print(f"Failed to load dataset: {e}")
    gc.collect()
    print("updated_dataset", updated_dataset)
    # Push the updated dataset back to the Hub.
    try:
        updated_dataset.push_to_hub(dataset_name, private=True)  # Set private=False if the dataset is public
        print(f"Updated dataset pushed to the Hugging Face Hub: {dataset_name}")
    except Exception as e:
        print(f"Failed to push dataset: {e}")