Spaces:
Runtime error
Runtime error
File size: 3,412 Bytes
e48b085 d209a4c 03a7dca 89073fe e48b085 03a7dca 2a13288 03a7dca 9b44783 03a7dca 9b44783 03a7dca 2a13288 03a7dca 2a13288 03a7dca 89073fe 03a7dca 2a13288 03a7dca 2a13288 03a7dca 9b44783 03a7dca d209a4c 03a7dca 2a13288 03a7dca 2e5bff4 03a7dca e48b085 03a7dca 6db1028 5669498 2e5bff4 03a7dca |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
import os
import time
import gradio as gr
import torch
from PIL import Image
from gtts import gTTS
import numpy as np
import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
from huggingface_hub import login
# Authenticate against the Hugging Face Hub when a token is provided through
# the HUGGINGFACE_TOKEN environment variable; otherwise continue anonymously.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")
if hf_token:
    login(token=hf_token)

# Load the YOLOv5 small object detector through torch.hub.
model = torch.hub.load(repo_or_dir='ultralytics/yolov5', model='yolov5s')
# Compute the gray-level co-occurrence matrix (GLCM) contrast of an image
def calculate_glcm_contrast(image):
    """Return the GLCM contrast of *image* for the diagonal neighbour offset.

    Each pixel at (row, col) is paired with the pixel at (row + 1, col + 1).
    The normalised co-occurrence matrix P of those pairs is built and the
    contrast ``sum((i - j) ** 2 * P[i, j])`` is returned.

    Args:
        image: A PIL image or array-like. Two-dimensional input is treated as
            an already-grayscale intensity map of non-negative integers;
            colour input is converted with OpenCV assuming RGB (or RGBA)
            channel order, which is what ``np.array`` yields for PIL images.

    Returns:
        The contrast as a float. ``0.0`` is returned when the image is too
        small to contain any diagonal pixel pair.
    """
    pixels = np.array(image)
    if pixels.ndim == 2:
        # Already single-channel; cvtColor would reject this input.
        gray_image = pixels
    elif pixels.ndim == 3 and pixels.shape[2] == 4:
        gray_image = cv2.cvtColor(pixels, cv2.COLOR_RGBA2GRAY)
    else:
        gray_image = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

    # Cast to a wide integer type before any arithmetic: uint8 values would
    # otherwise wrap around (e.g. 255 + 1 == 0 under NumPy 2 promotion rules).
    source = gray_image[:-1, :-1].astype(np.intp).ravel()
    neighbour = gray_image[1:, 1:].astype(np.intp).ravel()
    if source.size == 0:
        # No pixel pairs: avoid the 0 / 0 normalisation.
        return 0.0

    levels = int(gray_image.max()) + 1
    # Encode each (source, neighbour) pair as one flat index and count them.
    pair_counts = np.bincount(source * levels + neighbour,
                              minlength=levels * levels)
    glcm = pair_counts.reshape(levels, levels) / source.size

    level_index = np.arange(levels)
    squared_distance = (level_index[:, None] - level_index[None, :]) ** 2
    return float(np.sum(squared_distance * glcm))
# Describe an image in English using the BLIP captioning model
_BLIP_MODEL_NAME = "Salesforce/blip-image-captioning-base"
_blip_cache = {}


def _get_blip():
    """Return the BLIP (processor, model) pair, loading it on first use only."""
    if "model" not in _blip_cache:
        # Load both parts before storing so a failed load leaves no partial entry.
        processor = BlipProcessor.from_pretrained(_BLIP_MODEL_NAME)
        captioner = BlipForConditionalGeneration.from_pretrained(_BLIP_MODEL_NAME)
        _blip_cache["processor"] = processor
        _blip_cache["model"] = captioner
    return _blip_cache["processor"], _blip_cache["model"]


def describe_image(image):
    """Generate an English caption for *image* with BLIP.

    The processor and model are loaded once and reused, instead of being
    re-instantiated from the pretrained checkpoint on every request.

    Args:
        image: The image to caption, in a form accepted by ``BlipProcessor``
            (the app passes a PIL image).

    Returns:
        The decoded caption string with special tokens removed.
    """
    processor, captioner = _get_blip()
    inputs = processor(image, return_tensors="pt")
    out = captioner.generate(**inputs)
    description = processor.decode(out[0], skip_special_tokens=True)
    return description
# Translate an English description into Portuguese
_TRANSLATION_MODEL_NAME = 'Helsinki-NLP/opus-mt-tc-big-en-pt'
_translator_cache = {}


def _get_translator():
    """Return the (tokenizer, model) translation pair, loading it on first use only."""
    if "model" not in _translator_cache:
        # Load both parts before storing so a failed load leaves no partial entry.
        tokenizer = AutoTokenizer.from_pretrained(_TRANSLATION_MODEL_NAME)
        translator = AutoModelForSeq2SeqLM.from_pretrained(_TRANSLATION_MODEL_NAME)
        _translator_cache["tokenizer"] = tokenizer
        _translator_cache["model"] = translator
    return _translator_cache["tokenizer"], _translator_cache["model"]


def translate_description(description):
    """Translate *description* from English to Portuguese.

    The tokenizer and model are loaded once and reused, instead of being
    re-instantiated from the pretrained checkpoint on every request.

    Args:
        description: The English text to translate.

    Returns:
        The translated text, decoded with special tokens removed.
    """
    tokenizer, translator = _get_translator()
    encoded = tokenizer(description, return_tensors="pt", padding=True)
    translated = translator.generate(**encoded)
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text
# Main pipeline: detect objects, caption the image, and synthesise speech
def process_image(image):
    """Run detection, captioning, translation and text-to-speech on *image*.

    Args:
        image: The input image (the Gradio interface supplies a PIL image).

    Returns:
        A tuple ``(annotated_image, translated_description, audio_path)``:
        the PIL image with rendered detections, the Portuguese caption, and
        the path of the generated MP3 file.

    Raises:
        gtts.gTTSError: If speech synthesis fails for a reason other than
            rate limiting, or is still rate limited after the last attempt.
    """
    # Imported locally because the module only imports the gTTS class, which
    # does not expose the exception type as an attribute.
    from gtts import gTTSError

    max_attempts = 5
    retry_delay_seconds = 5
    audio_path = "output.mp3"

    # Object detection; render() draws the boxes and returns the annotated frames.
    results = model(image)
    detected_image = results.render()[0]

    # NOTE: mean-RGB and GLCM-contrast analysis are not computed here because
    # neither value is part of the returned outputs.

    # Caption the image and translate the caption to Portuguese.
    description = describe_image(image)
    translated_description = translate_description(description)

    # Text to speech, retrying only when the service answers HTTP 429.
    tts = gTTS(text=translated_description, lang='pt')
    for attempt in range(1, max_attempts + 1):
        try:
            tts.save(audio_path)
            break
        except gTTSError as e:
            # The HTTP response (possibly None) is stored on the `rsp` attribute.
            response = getattr(e, "rsp", None)
            rate_limited = response is not None and response.status_code == 429
            if not rate_limited or attempt == max_attempts:
                # Never fall through and return a missing or stale audio file.
                raise
            print("Too many requests. Waiting before retrying...")
            time.sleep(retry_delay_seconds)

    # Return the annotated image, the description and the audio file path.
    return Image.fromarray(detected_image), translated_description, audio_path
# Example image shipped alongside the app
example_image_path = "example1.JPG"

# Only advertise the example when the file is actually present, so a missing
# asset does not break the interface.
available_examples = [example_image_path] if os.path.exists(example_image_path) else None

# Gradio interface: one image in; annotated image, caption text and audio out
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Image(type="pil"), gr.Textbox(), gr.Audio(type="filepath")],
    examples=available_examples,
)
iface.launch()
|