import os
import time
import gradio as gr
import torch
from PIL import Image
from gtts import gTTS, gTTSError
import numpy as np
import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
from huggingface_hub import login
# Hugging Face token (optional, read from the environment)
hf_token = os.getenv("HUGGINGFACE_TOKEN")
if hf_token:
    login(token=hf_token)
# YOLOv5 object-detection model
yolo_model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
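# Optional tuning sketch (an assumption, not part of the original app):
# YOLOv5 hub models expose a confidence threshold that filters weak detections.
# yolo_model.conf = 0.5  # keep only detections with confidence >= 0.5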
# Compute a gray-level co-occurrence matrix (GLCM) and its contrast
def calculate_glcm_contrast(image):
    # PIL images are RGB, so convert RGB -> grayscale
    gray_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    max_value = int(gray_image.max()) + 1
    glcm = np.zeros((max_value, max_value), dtype=np.float64)
    # Count co-occurrences of gray levels at a (1, 1) diagonal offset
    for i in range(gray_image.shape[0] - 1):
        for j in range(gray_image.shape[1] - 1):
            x = gray_image[i, j]
            y = gray_image[i + 1, j + 1]
            glcm[x, y] += 1
    glcm = glcm / glcm.sum()
    # Contrast: (i - j)^2 weighted by the co-occurrence probabilities
    contrast = 0.0
    for i in range(max_value):
        for j in range(max_value):
            contrast += (i - j) ** 2 * glcm[i, j]
    return contrast
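# The nested Python loops above are slow on large images. A vectorized sketch
# (an assumption, not part of the original app) that computes the same contrast
# with np.add.at:
def calculate_glcm_contrast_vectorized(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    levels = int(gray.max()) + 1
    # Pixel pairs at the same (1, 1) diagonal offset as the loop version
    x = gray[:-1, :-1].ravel()
    y = gray[1:, 1:].ravel()
    glcm = np.zeros((levels, levels), dtype=np.float64)
    np.add.at(glcm, (x, y), 1)
    glcm /= glcm.sum()
    i, j = np.indices(glcm.shape)
    return float(((i - j) ** 2 * glcm).sum())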
# Analyze texture and color temperature
def analyze_image_properties(image):
    # Color analysis (mean RGB); PIL images are already RGB, no conversion needed
    image_rgb = np.array(image)
    avg_color_per_row = np.average(image_rgb, axis=0)
    avg_color = np.average(avg_color_per_row, axis=0)
    # Classify the color temperature (labels stay in Portuguese for the TTS output)
    if avg_color[0] > avg_color[2]:  # more red than blue
        temperature = 'quente'
    else:
        temperature = 'fria'
    # Texture analysis
    texture_contrast = calculate_glcm_contrast(image)
    texture = 'lisa' if texture_contrast < 100 else 'texturizada'
    return temperature, texture
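# Quick sanity check (hypothetical values): a flat reddish image should read as
# warm and smooth, since red > blue and a constant image has zero GLCM contrast.
# >>> analyze_image_properties(Image.new("RGB", (64, 64), (200, 80, 60)))
# ('quente', 'lisa')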
# Describe the image using BLIP
def describe_image(image):
    # Note: the processor and model are reloaded on every call
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs)
    description = processor.decode(out[0], skip_special_tokens=True)
    return description
# Translate the description from English to Portuguese
def translate_description(description):
    model_name = 'Helsinki-NLP/opus-mt-tc-big-en-pt'
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    translated = model.generate(**tokenizer(description, return_tensors="pt", padding=True))
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text
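# Both functions above reload their checkpoints from the Hub on every request.
# A minimal caching sketch (an assumption, not part of the original app): load
# each seq2seq pipeline once per process and reuse it across calls.
from functools import lru_cache

@lru_cache(maxsize=None)
def _load_seq2seq(model_name):  # hypothetical helper name
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    return tokenizer, model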
# Process the image and generate the voice output
def process_image(image):
    # Object detection
    results = yolo_model(image)
    detected_image = results.render()[0]
    # Texture and color-temperature analysis
    temperature, texture = analyze_image_properties(image)
    # Image description
    description = describe_image(image)
    translated_description = translate_description(description)
    # Build the final description (in Portuguese, to match the TTS voice)
    final_description = f"{translated_description}. A textura é {texture} e a temperatura de cor é {temperature}."
    # Text to speech, retrying when the endpoint rate-limits us
    tts = gTTS(text=final_description, lang='pt')
    attempts = 0
    while attempts < 5:
        try:
            tts.save("output.mp3")
            break
        except gTTSError as e:
            # gTTSError messages include the HTTP status; 429 means rate-limited
            if "429" in str(e):
                print("Too many requests. Waiting before retrying...")
                time.sleep(5)
                attempts += 1
            else:
                raise
    # Outputs: annotated image, description text, and audio file path
    return Image.fromarray(detected_image), final_description, "output.mp3"
# Example image for the Gradio demo
example_image_path = "example1.JPG"
# Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Image(type="pil"), gr.Textbox(), gr.Audio(type="filepath")],
    examples=[example_image_path]
)

iface.launch()