import gradio as gr
import torch
from PIL import Image
from gtts import gTTS
import numpy as np
import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration, MarianMTModel, MarianTokenizer

# Load the pretrained YOLOv5 model from Torch Hub
model = torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Compute the gray-level co-occurrence matrix (GLCM) and its contrast manually
def calculate_glcm_contrast(image):
    # PIL images are RGB, so convert with COLOR_RGB2GRAY (the original BGR flag
    # would swap the red and blue channel weights)
    gray_image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    # Cast to int so 255 + 1 does not wrap around in uint8
    max_value = int(gray_image.max()) + 1
    glcm = np.zeros((max_value, max_value), dtype=np.float64)

    # Count co-occurrences of gray levels at a (1, 1) diagonal offset
    for i in range(gray_image.shape[0] - 1):
        for j in range(gray_image.shape[1] - 1):
            x = gray_image[i, j]
            y = gray_image[i + 1, j + 1]
            glcm[x, y] += 1

    # Normalize counts into joint probabilities
    glcm = glcm / glcm.sum()

    # Contrast: (i - j)^2 weighted by the co-occurrence probability of (i, j)
    contrast = 0.0
    for i in range(max_value):
        for j in range(max_value):
            contrast += (i - j) ** 2 * glcm[i, j]
    
    return contrast
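
# Note: the nested Python loops above are slow for large images. A vectorized
# sketch of the same computation (same (1, 1) offset, NumPy only; the function
# name is ours, not part of the original app):
def calculate_glcm_contrast_vectorized(image):
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    levels = int(gray.max()) + 1
    x = gray[:-1, :-1].ravel()   # reference pixels
    y = gray[1:, 1:].ravel()     # their down-right diagonal neighbors
    glcm = np.zeros((levels, levels), dtype=np.float64)
    np.add.at(glcm, (x, y), 1)   # accumulate co-occurrence counts in one pass
    glcm /= glcm.sum()
    i, j = np.indices(glcm.shape)
    return float(((i - j) ** 2 * glcm).sum())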

# Generate an English caption for the image with BLIP
def describe_image(image):
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    inputs = processor(image, return_tensors="pt")
    out = model.generate(**inputs)
    description = processor.decode(out[0], skip_special_tokens=True)
    return description

# Translate the English caption to Portuguese with MarianMT
def translate_description(description):
    model_name = 'Helsinki-NLP/opus-mt-en-pt'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name)
    translated = model.generate(**tokenizer(description, return_tensors="pt", padding=True))
    translated_text = tokenizer.decode(translated[0], skip_special_tokens=True)
    return translated_text
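
# Performance note: describe_image() and translate_description() reload their
# pretrained weights on every request, which dominates latency. A minimal
# caching sketch (the _MODEL_CACHE helper below is ours, not part of the app):
_MODEL_CACHE = {}

def _cached(key, loader):
    # Load each model once, then reuse it across calls
    if key not in _MODEL_CACHE:
        _MODEL_CACHE[key] = loader()
    return _MODEL_CACHE[key]

# e.g. inside describe_image():
#     processor, blip = _cached("blip", lambda: (
#         BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base"),
#         BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base"),
#     ))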

# Main pipeline: detect objects, analyze the image, caption it, and synthesize speech
def process_image(image):
    # Object detection with YOLOv5
    results = model(image)
    detected_image = results.render()[0]

    # Color analysis: per-channel RGB mean (computed but not returned to the UI)
    mean_rgb = np.mean(np.array(image), axis=(0, 1))

    # Texture analysis: GLCM contrast (also computed but not returned)
    texture_contrast = calculate_glcm_contrast(image)

    # Caption the image in English, then translate to Portuguese
    description = describe_image(image)
    translated_description = translate_description(description)

    # Text-to-speech for the translated caption
    tts = gTTS(text=translated_description, lang='pt')
    tts.save("output.mp3")
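    # Note: a fixed "output.mp3" can collide when concurrent requests hit the
    # app; a per-request temporary file is a safer sketch:
    #     import tempfile
    #     audio_path = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False).name
    #     tts.save(audio_path)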

    # Return the annotated image, the Portuguese caption, and the audio file
    return Image.fromarray(detected_image), translated_description, "output.mp3"

# Example image shipped alongside the script
example_image_path = "example1.JPG"

# Gradio interface
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="pil"),
    outputs=[gr.Image(type="pil"), gr.Textbox(), gr.Audio(type="filepath")],
    examples=[example_image_path]
)

iface.launch()
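
# Quick smoke test without the UI (assumes example1.JPG sits next to this
# script; run with the launch() call above commented out):
#     img = Image.open(example_image_path).convert("RGB")
#     annotated, caption_pt, audio_path = process_image(img)
#     print(caption_pt, audio_path)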