# Spaces:
# Running
# Running
"""Download the first rows of an Open Images TSV listing as ``.jpg`` files.

The script reads ``file_path`` with pandas, prints a preview, creates
``output_folder`` and then fetches at most ``max_images`` URLs from the
``ImageURL`` column, saving each response body as ``<ImageID>.jpg``.
Failures are reported and skipped so one bad row does not stop the run.
"""
import os

import pandas as pd
import requests

file_path = "open-images-dataset-train0.tsv"

# Read the TSV file, skipping the first row and supplying explicit column
# names instead.
df = pd.read_csv(
    file_path,
    sep="\t",
    engine="python",
    skiprows=1,
    names=["ImageURL", "Subset", "ImageID"],
)

# Print first few rows to verify
print("First few rows of the cleaned dataset:")
print(df.head())

# Create a fixed category folder (since 'Subset' contains numbers, not real
# categories)
output_folder = "open_images_v7/dataset"
os.makedirs(output_folder, exist_ok=True)

# Limit downloads to the first 100 rows of the listing
max_images = 100

# head() limits by row position, so the cap does not depend on the index
# labels; name=None yields plain (url, id) tuples.
rows = df.head(max_images)[["ImageURL", "ImageID"]].itertuples(
    index=False, name=None
)

for image_url, image_id in rows:
    # The ID comes straight from the data file. Replace path separators so it
    # can neither point outside output_folder nor require missing
    # sub-directories.
    # NOTE(review): this column may hold values containing '/' (e.g. if it is
    # a base64 digest) - confirm against the TSV.
    safe_id = str(image_id).replace("/", "_").replace("\\", "_")

    # Ensure the image filename ends with ".jpg"
    file_name = f"{safe_id}.jpg"
    image_path = os.path.join(output_folder, file_name)

    # Network errors: report and move on to the next row.
    try:
        response = requests.get(image_url, timeout=10)
    except requests.RequestException as e:
        print(f"❌ Error downloading {image_id}: {e}")
        continue

    if response.status_code != 200:
        print(f"❌ Failed: {image_id}")
        continue

    # Filesystem errors are reported the same way as network errors.
    try:
        with open(image_path, "wb") as f:
            f.write(response.content)
    except OSError as e:
        print(f"❌ Error downloading {image_id}: {e}")
        continue

    print(f"✅ Downloaded: {file_name}")