"""Gradio app that classifies a pill image with a ViT model.

The script loads a label encoder and a ViT backbone at import time, exposes
``test2`` as the Gradio callback, and launches the interface.
"""

import io
import json

import gradio as gr
import numpy as np
import PIL
import torch
from PIL import Image
from sklearn.preprocessing import LabelEncoder
from transformers import ViTForImageClassification, ViTImageProcessor, ViTModel

MODEL_ID = 'pillIdentifierAI/pillIdentifier'
ENCODER_PATH = 'encoder.npy'
NUM_LABELS = 2112  # Change this to the appropriate number of classes


def greet(name):
    """Return a greeting string for ``name``."""
    return "Hello " + name + "!!"


# Label encoder: class names are restored from a NumPy file rather than fitted.
# SECURITY: allow_pickle=True unpickles arbitrary objects, so the file at
# ENCODER_PATH must come from a trusted source.
encoder = LabelEncoder()
encoder.classes_ = np.load(ENCODER_PATH, allow_pickle=True)

pretrained_model = ViTModel.from_pretrained(MODEL_ID)

# NOTE(review): the documented ViTImageProcessor argument for the target size
# is `size`; `image_size` does not look like a recognised parameter, so the
# processor's default size is presumably what is applied -- confirm.
# NOTE(review): with do_rescale=False the 0-255 pixel values produced by
# preprocess_image() are normalised with mean/std 0.5 without first being
# scaled to [0, 1] -- confirm this matches how the model was trained.
feature_extractor = ViTImageProcessor(
    image_size=224,
    do_resize=True,
    do_normalize=True,
    do_rescale=False,
    image_mean=[0.5, 0.5, 0.5],
    image_std=[0.5, 0.5, 0.5],
)

config = pretrained_model.config
config.num_labels = NUM_LABELS

# NOTE(review): only the backbone is loaded from the checkpoint above; the
# classification head created here is initialised from `config`, not from
# saved weights. Unless the head is loaded elsewhere, predictions depend on a
# randomly initialised classifier -- confirm whether
# ViTForImageClassification.from_pretrained(MODEL_ID) was intended.
model = ViTForImageClassification(config)
model.vit = pretrained_model
model.eval()


def preprocess_image(image):
    """Convert an image array into a float32 pixel tensor for the model.

    Args:
        image: Array-like image data (as delivered by the Gradio ``"image"``
            input); it is cast to ``uint8`` before being wrapped in a PIL image.

    Returns:
        A ``torch.float32`` tensor holding the processor's ``pixel_values``
        for this single image (no batch dimension).
    """
    pil_image = Image.fromarray(np.uint8(image))
    if pil_image.mode != 'RGB':
        pil_image = pil_image.convert('RGB')

    inputs = feature_extractor(images=[pil_image])
    # The processor is called with a one-element batch; take that element.
    pixel_values = inputs['pixel_values'][0]
    return torch.as_tensor(pixel_values, dtype=torch.float32)


def predict(image_tensor, top_k=5):
    """Return the ``top_k`` most likely classes for one preprocessed image.

    Args:
        image_tensor: Un-batched pixel tensor as returned by
            ``preprocess_image``; a batch dimension is added here.
        top_k: Number of predictions to return. Values larger than the number
            of classes are clamped; values <= 0 yield an empty result.

    Returns:
        A dict mapping rank (1 = most likely) to
        ``{'label': str, 'probability': float}``, where ``probability`` is the
        softmax probability of that class.
    """
    model.eval()

    with torch.no_grad():
        logits = model(pixel_values=image_tensor.unsqueeze(0)).logits
        # Softmax turns raw logits into values that are real probabilities;
        # the ordering of classes is unchanged.
        probabilities = torch.softmax(logits, dim=1)[0]

        k = min(int(top_k), probabilities.shape[0])
        if k <= 0:
            return {}
        top_probs, top_indices = torch.topk(probabilities, k)

    labels = encoder.inverse_transform(top_indices.numpy())
    return {
        rank: {'label': str(label), 'probability': float(prob)}
        for rank, (label, prob) in enumerate(
            zip(labels, top_probs.tolist()), start=1
        )
    }


async def test2(file, top_k: int = 5):
    """Gradio callback: classify an uploaded image and return JSON text.

    Args:
        file: Image data supplied by the Gradio ``"image"`` input, or ``None``
            when the form is submitted without an image.
        top_k: Number of predictions to include.

    Returns:
        A JSON string ``{"predictions": {...}}`` (rank keys are serialised as
        strings), or ``{"error": ...}`` when no image was supplied.
    """
    if file is None:
        return json.dumps({"error": "No image provided."})

    image_tensor = preprocess_image(file)
    predictions = predict(image_tensor, top_k)
    return json.dumps({"predictions": predictions})


iface = gr.Interface(fn=test2, inputs="image", outputs="text")
iface.launch(share=True)