Spaces:

rmayormartins
/

inclusion-visually-impaired-image2speech

Runtime error

App Files Files Community

inclusion-visually-impaired-image2speech / app.py

rmayormartins

Subindo arquivos331313

8dbeec6 about 1 year ago

raw

history blame

3.74 kB

	import os
	import time
	import gradio as gr
	import torch
	from PIL import Image
	from gtts import gTTS
	import numpy as np
	import cv2
	from transformers import BlipProcessor, BlipForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM
	from huggingface_hub import login

	# Authenticate against the Hugging Face Hub only when a token is supplied
	# through the environment; otherwise the app continues unauthenticated.
	hf_token = os.environ.get("HUGGINGFACE_TOKEN")
	if hf_token:
	    login(token=hf_token)

	# YOLOv5-small detector loaded through torch.hub; process_image() calls it.
	model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

	# Compute the GLCM contrast of an image
	def calculate_glcm_contrast(image):
	    """Return the GLCM contrast of ``image`` for the diagonal (+1, +1) offset.

	    Every pixel is paired with its lower-right neighbour. GLCM contrast is
	    the co-occurrence-weighted sum of ``(i - j) ** 2`` over all grey-level
	    pairs, which equals the mean squared difference of the paired pixels.
	    It is computed in that form, so no per-pixel Python loop and no explicit
	    co-occurrence matrix are needed.

	    Args:
	        image: A PIL image or array. Colour input is treated as RGB, the
	            channel order ``np.array`` yields for a PIL image. A 2-D array
	            (or single-channel PIL image) is used as grey levels directly.

	    Returns:
	        The contrast as a float. ``0.0`` is returned when the image is
	        fewer than two pixels wide or tall, because it then has no
	        diagonal pixel pairs.
	    """
	    pixels = np.array(image)
	    if pixels.ndim == 2:
	        # Already single-channel: nothing to convert.
	        gray_image = pixels
	    else:
	        gray_image = cv2.cvtColor(pixels, cv2.COLOR_RGB2GRAY)

	    # Widen before subtracting; uint8 arithmetic would wrap around.
	    reference = gray_image[:-1, :-1].astype(np.float64)
	    neighbour = gray_image[1:, 1:].astype(np.float64)
	    if reference.size == 0:
	        return 0.0

	    return float(np.mean((reference - neighbour) ** 2))

	# Classify colour temperature and texture
	def analyze_image_properties(image):
	    """Label an image's colour temperature and texture.

	    Args:
	        image: Image accepted by ``np.array`` and ``cv2.cvtColor``.

	    Returns:
	        A ``(temperature, texture)`` tuple of Portuguese labels.
	        ``temperature`` is 'fria' (cold) when the mean over all pixels and
	        channels is below 128, otherwise 'quente' (warm). ``texture`` is
	        'lisa' (smooth) when the GLCM contrast is below 100, otherwise
	        'texturizada' (textured).
	    """
	    # Colour: average over rows, then over columns, to get one mean per
	    # channel; the label is decided from the mean of those channel means.
	    swapped = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2RGB)
	    channel_means = swapped.mean(axis=0).mean(axis=0)
	    if channel_means.mean() < 128:
	        temperature = 'fria'
	    else:
	        temperature = 'quente'

	    # Texture: threshold the GLCM contrast.
	    if calculate_glcm_contrast(image) < 100:
	        texture = 'lisa'
	    else:
	        texture = 'texturizada'

	    return temperature, texture

	# Caption an image with BLIP
	_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
	_blip_components = None


	def _load_blip():
	    """Return the cached BLIP ``(processor, model)`` pair, loading it on first use."""
	    global _blip_components
	    if _blip_components is None:
	        _blip_components = (
	            BlipProcessor.from_pretrained(_BLIP_CHECKPOINT),
	            BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT),
	        )
	    return _blip_components


	def describe_image(image):
	    """Generate a caption for ``image`` with the BLIP captioning model.

	    The processor and model are loaded once and reused, instead of being
	    re-instantiated from the checkpoint on every call.

	    Args:
	        image: Image accepted by the BLIP processor.

	    Returns:
	        The decoded caption string with special tokens stripped.
	    """
	    processor, captioner = _load_blip()
	    inputs = processor(image, return_tensors="pt")
	    out = captioner.generate(**inputs)
	    return processor.decode(out[0], skip_special_tokens=True)

	# Translate the caption with the en-pt translation model
	_TRANSLATION_CHECKPOINT = 'Helsinki-NLP/opus-mt-tc-big-en-pt'
	_translation_components = None


	def _load_translator():
	    """Return the cached ``(tokenizer, model)`` pair, loading it on first use."""
	    global _translation_components
	    if _translation_components is None:
	        _translation_components = (
	            AutoTokenizer.from_pretrained(_TRANSLATION_CHECKPOINT),
	            AutoModelForSeq2SeqLM.from_pretrained(_TRANSLATION_CHECKPOINT),
	        )
	    return _translation_components


	def translate_description(description):
	    """Translate ``description`` with the Helsinki-NLP en-pt model.

	    The tokenizer and model are loaded once and reused, instead of being
	    re-instantiated from the checkpoint on every call.

	    Args:
	        description: Text to translate.

	    Returns:
	        The first generated sequence, decoded with special tokens stripped.
	    """
	    tokenizer, translator = _load_translator()
	    encoded = tokenizer(description, return_tensors="pt", padding=True)
	    translated = translator.generate(**encoded)
	    return tokenizer.decode(translated[0], skip_special_tokens=True)

	# Full pipeline: detection, analysis, captioning, translation, speech
	def _save_speech(tts, path, max_attempts=5, retry_delay=5):
	    """Write ``tts`` audio to ``path``, retrying while the service answers HTTP 429.

	    Args:
	        tts: A ``gTTS`` instance ready to be saved.
	        path: Destination file path for the audio.
	        max_attempts: Total number of save attempts before giving up.
	        retry_delay: Seconds to wait between rate-limited attempts.

	    Raises:
	        gTTSError: Immediately for any failure other than HTTP 429, or once
	            every one of ``max_attempts`` attempts has been rate-limited.
	    """
	    # The exception is exported by the gtts package; the gTTS class itself
	    # has no `tts` attribute, so `gTTS.tts.gTTSError` cannot be resolved.
	    from gtts import gTTSError

	    for attempt in range(1, max_attempts + 1):
	        try:
	            tts.save(path)
	            return
	        except gTTSError as err:
	            # The HTTP response is stored as `rsp`; it can be None when the
	            # failure happened before any response was received.
	            response = getattr(err, "rsp", None)
	            rate_limited = response is not None and response.status_code == 429
	            if not rate_limited or attempt == max_attempts:
	                # Surface the failure rather than returning a path to a file
	                # that was never written.
	                raise
	            print("Muitas requisicoes...")
	            time.sleep(retry_delay)


	def process_image(image):
	    """Run detection, description and text-to-speech for one image.

	    Args:
	        image: Input image (the interface passes a PIL image).

	    Returns:
	        A tuple ``(annotated_image, final_description, audio_path)``: the
	        first rendered detection frame as a PIL image, the Portuguese
	        description text, and the path of the saved MP3 file.

	    Raises:
	        gTTSError: If the speech file could not be saved.
	    """
	    # Object detection with the module-level YOLOv5 model.
	    results = model(image)
	    detected_image = results.render()[0]

	    # Texture and colour-temperature labels.
	    temperature, texture = analyze_image_properties(image)

	    # Caption, then translate the caption.
	    description = describe_image(image)
	    translated_description = translate_description(description)

	    final_description = f"{translated_description}. A textura é {texture} e a temperatura de cor é {temperature}."

	    # Text-to-speech.
	    audio_path = "output.mp3"
	    _save_speech(gTTS(text=final_description, lang='pt'), audio_path)

	    return Image.fromarray(detected_image), final_description, audio_path

	# Path of the image offered as a ready-made example in the interface.
	example_image_path = "example1.JPG"

	# Gradio interface: one PIL image in; annotated image, description text and
	# audio file path out.
	input_component = gr.Image(type="pil")
	output_components = [
	    gr.Image(type="pil"),
	    gr.Textbox(),
	    gr.Audio(type="filepath"),
	]
	iface = gr.Interface(
	    fn=process_image,
	    inputs=input_component,
	    outputs=output_components,
	    examples=[example_image_path],
	)

	iface.launch()