import os
from huggingface_hub import HfApi
import cv2
from pathlib import Path
import pandas as pd
from transformers import pipeline
from transformers import AutoModelForImageClassification
import time
'''
how to use this script:
1. get data from the kaggle competition, including images and the train.csv file
   edit the "base" variable, assuming the following layout
   ceteans/
   ├── images
   │   ├── 00021adfb725ed.jpg
   │   ├── 000562241d384d.jpg
   │   └── ...
   └── train.csv
2. inspect the df_results dataframe to see how the model is performing
'''
# setup for the ML model on huggingface (our wrapper)
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
rev = 'main'
# load the model
cetacean_classifier = AutoModelForImageClassification.from_pretrained(
    "Saving-Willy/cetacean-classifier",
    revision=rev,
    trust_remote_code=True)
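# NOTE (assumption, based on how the output is used below): calling this wrapper on an image
# is expected to return a dict whose 'predictions' entry holds the top-3 predicted species names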
# get ready to load images
base = Path('~/Documents/ceteans/').expanduser()
df = pd.read_csv(base / 'train.csv')
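# NOTE (assumption, inferred from the lookups in the loop below): train.csv should contain at
# least an 'image' column (the jpg filename) and a 'species' column (the ground-truth label)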
i_max = 100 # put a limit on the number of images to classify in this test (or None)
# for each file in the folder base/images, 1/ load image, 2/ classify, 3/ compare against the relevant row in df
# also keep track of the time it takes to classify each image
classifications = []
img_pth = base / 'images'
img_files = list(img_pth.glob('*.jpg'))
for i, img_file in enumerate(img_files):
    # let's check we can get the right target
    img_id = img_file.name  # includes .jpg
    target = df.loc[df['image'] == img_id, 'species'].item()
    #print(img_id, target)
    start_time = time.time()
    image = cv2.imread(str(img_file))
    load_time = time.time() - start_time
    start_time = time.time()
    out = cetacean_classifier(image)  # get top 3 matches
    classify_time = time.time() - start_time
    whale_prediction1 = out['predictions'][0]
    # comparison: exact match on the top prediction, and present anywhere in the top 3
    ok = whale_prediction1 == target
    any_match = target in out['predictions']
    row = [img_id, target, ok, any_match, load_time, classify_time] + list(out['predictions'])
    print(i, row)
    classifications.append(row)
    if i_max is not None and i >= i_max:
        break
df_results = pd.DataFrame(classifications, columns=['img_id', 'target', 'ok', 'any', 'load_time', 'classify_time'] + [f'pred_{i}' for i in range(3)])
# print out a few summary stats
# mean time to load and classify (formatted 3dp), +- std dev (formatted to 2dp),
print(f"Mean load time: {df_results['load_time'].mean():.3f} +- {df_results['load_time'].std():.2f} s")
print(f"Mean classify time: {df_results['classify_time'].mean():.3f} +- {df_results['classify_time'].std():.2f} s")
# accuracy: how often the top prediction is correct, and how often the target appears anywhere in the top 3
print(f"Accuracy: correct with top prediction: {df_results['ok'].sum()} | any of top 3 correct: {df_results['any'].sum()} (of total {df_results.shape[0]})")
# diversity: is the model just predicting one class for everything it sees?
print("Which classes are predicted?")
print(df_results.pred_0.value_counts())
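# optional: per-species hit rate of the top prediction (a quick sketch using the 'target' and
# 'ok' columns built above) - helps spot classes the model never gets right
print("Per-species top-1 hit rate:")
print(df_results.groupby('target')['ok'].mean().sort_values())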