File size: 2,558 Bytes
cb9846d
13119ab
a295231
241d532
 
086ae79
cb9846d
c94a411
30214b5
 
 
a12b80a
30214b5
d72f733
 
 
c94a411
96f7c65
 
 
 
 
 
 
 
 
 
cb9846d
086ae79
07b76a5
cb9846d
96f7c65
241d532
 
96f7c65
 
241d532
 
 
 
 
 
 
 
 
 
96f7c65
 
241d532
 
72805bd
 
 
 
 
f549bec
 
 
 
 
 
 
96f7c65
 
f549bec
96f7c65
 
f549bec
96f7c65
 
cb9846d
 
 
 
086ae79
 
96f7c65
cb9846d
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import gradio as gr
from transformers import pipeline
import numpy as np
#from google.cloud import speech_v1
from google.cloud import speech
from google.protobuf import timestamp_pb2

import os

# Debug aid for deployments: show what files were shipped alongside the
# script and where we are running from.
archivos = os.listdir()
print("\n".join(archivos))
print(os.getcwd())

# Point the Google Cloud SDK at the service-account key file expected to
# sit next to this script. os.path.join is portable across operating
# systems, unlike the previous manual '/'-joining of path parts.
ruta_credenciales = os.path.join(os.getcwd(), "deploygpt-e9475e7c2c7c.json")
print(ruta_credenciales)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ruta_credenciales

# Legacy local Whisper pipeline, kept for reference; superseded by the
# Google Cloud Speech-to-Text implementation below.
#transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

#def transcribe(audio_bytes):
#    """Transcribe audio bytes to text using a local Whisper pipeline."""
#
#    sr, y = audio_bytes
#    y = y.astype(np.float32)
#    y /= np.max(np.abs(y))
#
#    return transcriber({"sampling_rate": sr, "raw": y})["text"]

def transcribe(audio_bytes):
    """Transcribe a Gradio audio capture using Google Cloud Speech-to-Text.

    Args:
        audio_bytes: ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio`` — ``samples`` is a numpy array of PCM samples.

    Returns:
        list[str]: One transcript string per recognized result; empty when
        nothing was recognized.
    """
    client = speech.SpeechClient()

    print(type(audio_bytes))
    sr, y = audio_bytes
    y = y.astype(np.float32)

    # Normalize to [-1, 1], guarding against silent input: dividing by a
    # zero peak would fill the array with NaNs.
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak

    # LINEAR16 requires 16-bit little-endian PCM *bytes*; a raw float32
    # numpy array is not a valid payload, so rescale and serialize.
    pcm16 = (y * 32767.0).astype(np.int16).tobytes()

    # Derive the channel count from the array shape instead of hard-coding
    # it; the Gradio microphone source typically records mono.
    channels = y.shape[1] if y.ndim > 1 else 1

    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sr,  # report the actual capture rate to the API
        enable_automatic_punctuation=True,
        audio_channel_count=channels,
        language_code="es-AR",
    )
    audio = speech.RecognitionAudio(content=pcm16)

    # Synchronous recognition request (suitable for short clips).
    response = client.recognize(request={"config": config, "audio": audio})

    transcript = []
    for result in response.results:
        print("Transcript: {}".format(result.alternatives[0].transcript))
        transcript.append(result.alternatives[0].transcript)

    return transcript


# Minimal Gradio UI: microphone audio in, transcript text out.
demo = gr.Interface(
    transcribe,
    gr.Audio(sources=["microphone"], streaming=False),
    "text",
    #live=True, # Would hide the Submit button (auto-submits on change).
)

demo.launch()