"""Gradio app: download card images from Google Drive and publish them.

The pipeline authenticates against Google Drive, downloads every image in a
folder into a local directory under a sequentially numbered name, and
optionally pushes a table describing the downloaded files to the Hugging
Face Hub.
"""

import logging
import os
import shutil

import gradio as gr
import pandas as pd
from datasets import Dataset, load_dataset
from PIL import Image
from pydrive2.auth import GoogleAuth
from pydrive2.drive import GoogleDrive
from tqdm import tqdm

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DatasetManager:
    """Download images from a Google Drive folder and publish their records.

    Every public method reports its outcome as a tuple whose first element is
    a success flag and whose second element is a human-readable message, so
    the Gradio callback can show the message directly instead of handling
    exceptions.
    """

    def __init__(self, dataset_name=None, local_images_dir="downloaded_cards"):
        """Store configuration and make sure the download directory exists.

        Args:
            dataset_name: Optional Hugging Face dataset name kept on the
                instance for reference.
            local_images_dir: Directory that downloaded images are written to.
                It is created if it does not exist.
        """
        self.dataset_name = dataset_name
        self.local_images_dir = local_images_dir
        self.drive = None  # populated by authenticate_drive()

        # Create local directory if it doesn't exist
        os.makedirs(local_images_dir, exist_ok=True)

    def authenticate_drive(self):
        """Authenticate with Google Drive.

        Runs pydrive2's ``LocalWebserverAuth`` flow and stores the resulting
        ``GoogleDrive`` client on ``self.drive``.

        Returns:
            A ``(success, message)`` tuple.
        """
        try:
            gauth = GoogleAuth()
            gauth.LocalWebserverAuth()
            self.drive = GoogleDrive(gauth)
        except Exception as e:
            logger.exception("Google Drive authentication failed")
            return False, f"Authentication failed: {e}"
        return True, "Successfully authenticated with Google Drive"

    def download_and_rename_files(self, drive_folder_id, naming_convention):
        """Download the images in a Drive folder and rename them.

        Each file whose mime type starts with ``image/`` is saved as
        ``<naming_convention>_<position>.jpg`` inside ``local_images_dir``,
        where ``position`` is the file's 1-based position in the folder
        listing. Downloads that Pillow cannot verify are logged and deleted.

        NOTE(review): the ``.jpg`` suffix is applied regardless of the source
        mime type, and skipped non-image files leave gaps in the numbering.
        Both are kept as-is because downstream naming may depend on them.

        Args:
            drive_folder_id: ID of the Google Drive folder to list.
            naming_convention: File name prefix. It must not contain a
                directory component, since it comes from user input and is
                joined onto ``local_images_dir``.

        Returns:
            A ``(success, message, renamed_files)`` tuple. ``renamed_files``
            is a list of dicts with the keys ``file_path``, ``original_name``
            and ``new_name``; it is empty on failure.
        """
        if not self.drive:
            return False, "Google Drive not authenticated", []

        folder_id = (drive_folder_id or "").strip()
        if not folder_id:
            return False, "Google Drive folder ID is required", []

        # The prefix is user supplied; refuse anything that would make
        # os.path.join escape the download directory (e.g. "../../etc/x").
        prefix = f"{naming_convention}"
        if os.path.basename(prefix) != prefix:
            return False, "Naming convention must not contain path separators", []

        try:
            # Backslashes and single quotes must be escaped inside a Drive
            # query string literal, otherwise the ID can break the query.
            escaped_id = folder_id.replace("\\", "\\\\").replace("'", "\\'")
            query = f"'{escaped_id}' in parents and trashed=false"
            file_list = self.drive.ListFile({'q': query}).GetList()

            if not file_list:
                return False, "No files found in the specified folder", []

            renamed_files = []
            progress = tqdm(file_list, desc="Downloading files")
            for position, drive_file in enumerate(progress, start=1):
                if not drive_file['mimeType'].startswith('image/'):
                    continue

                new_filename = f"{prefix}_{position}.jpg"
                file_path = os.path.join(self.local_images_dir, new_filename)

                # A download error here propagates to the outer handler and
                # aborts the whole batch.
                drive_file.GetContentFile(file_path)

                # Verify the image can be opened; drop it otherwise.
                try:
                    with Image.open(file_path) as img:
                        img.verify()
                except Exception as e:
                    logger.error(
                        "Error processing image %s: %s", drive_file['title'], e
                    )
                    if os.path.exists(file_path):
                        os.remove(file_path)
                else:
                    renamed_files.append({
                        'file_path': file_path,
                        'original_name': drive_file['title'],
                        'new_name': new_filename,
                    })

            return (
                True,
                f"Successfully processed {len(renamed_files)} images",
                renamed_files,
            )
        except Exception as e:
            logger.exception("Error downloading files from Google Drive")
            return False, f"Error downloading files: {e}", []

    def update_huggingface_dataset(self, dataset_name, renamed_files):
        """Push the downloaded-file records to a Hugging Face dataset.

        The records in ``renamed_files`` are turned into a pandas DataFrame,
        wrapped in a ``datasets.Dataset`` and pushed with ``push_to_hub``.
        The pushed rows are exactly those records (``file_path``,
        ``original_name``, ``new_name``); this method does not load the image
        files themselves.

        Args:
            dataset_name: Target dataset name, e.g. ``username/dataset-name``.
            renamed_files: Records returned by ``download_and_rename_files``.

        Returns:
            A ``(success, message)`` tuple. An empty ``renamed_files`` is
            reported as a failure without contacting the Hub, so that an
            empty table is never pushed.
        """
        if not renamed_files:
            return False, "No images to upload; Hugging Face dataset was not updated"

        try:
            # Create a DataFrame with the file information
            df = pd.DataFrame(renamed_files)

            # Create a Hugging Face Dataset
            dataset = Dataset.from_pandas(df)

            # Push to Hugging Face Hub
            dataset.push_to_hub(dataset_name)
        except Exception as e:
            logger.exception("Error updating Hugging Face dataset")
            return False, f"Error updating Hugging Face dataset: {e}"

        return (
            True,
            f"Successfully updated dataset '{dataset_name}' "
            f"with {len(renamed_files)} images",
        )


def process_pipeline(folder_id, naming_convention, dataset_name):
    """Main pipeline to process images and update the dataset.

    Authenticates with Google Drive, downloads and renames the images in
    ``folder_id``, and, when ``dataset_name`` is non-blank, pushes the
    resulting records to the Hugging Face Hub.

    Args:
        folder_id: Google Drive folder ID.
        naming_convention: File name prefix for the downloaded images.
        dataset_name: Optional Hugging Face dataset name; blank or
            whitespace-only values skip the upload step.

    Returns:
        A status string for display. If a step fails, that step's message is
        returned; otherwise the download message, followed on a new line by
        the upload message when an upload was requested.
    """
    # A whitespace-only textbox value means "no dataset requested".
    dataset_name = (dataset_name or "").strip()
    manager = DatasetManager(dataset_name=dataset_name or None)

    # Step 1: Authenticate
    auth_success, auth_message = manager.authenticate_drive()
    if not auth_success:
        return auth_message

    # Step 2: Download and rename files
    success, message, renamed_files = manager.download_and_rename_files(
        folder_id, naming_convention
    )
    if not success:
        return message

    # Step 3: Update Hugging Face dataset
    if not dataset_name:
        return message

    # The upload message already says whether it succeeded or failed.
    _, hf_message = manager.update_huggingface_dataset(dataset_name, renamed_files)
    return f"{message}\n{hf_message}"


# Gradio interface
demo = gr.Interface(
    fn=process_pipeline,
    inputs=[
        gr.Textbox(
            label="Google Drive Folder ID",
            placeholder="Enter the folder ID from your Google Drive URL",
        ),
        gr.Textbox(
            label="Naming Convention",
            placeholder="e.g., card",
            value="card",
        ),
        gr.Textbox(
            label="Hugging Face Dataset Name (Optional)",
            placeholder="username/dataset-name",
        ),
    ],
    outputs=gr.Textbox(label="Status"),
    title="Card Image Processor",
    description="Download card images from Google Drive and add them to your Hugging Face dataset",
)

if __name__ == "__main__":
    demo.launch()