import gradio as gr
from transformers import pipeline
import numpy as np
from google.cloud import speech

import io
import os
"""Lista los archivos en la carpeta de ejecución."""
archivos = os.listdir()
print("\n".join(archivos))
print(os.getcwd())

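# Tell the Google Cloud client libraries where to find the service-account key;
# they read the GOOGLE_APPLICATION_CREDENTIALS environment variable automatically.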
ruta = os.path.join(os.getcwd(), "deploygpt-e9475e7c2c7c.json")
print(ruta)
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = ruta

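# Local Whisper pipeline (English-only base model), used by the transcribe() variant below.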
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base.en")

def transcribe(audio):
    """Transcribe a (sample_rate, samples) tuple locally with the Whisper pipeline."""
    sr, y = audio
    y = y.astype(np.float32)
    y /= np.max(np.abs(y))  # normalize amplitude to [-1, 1]

    return transcriber({"sampling_rate": sr, "raw": y})["text"]

def transcribe_2(audio_path):
    """Transcribe an audio file to text using Google Cloud Speech-to-Text."""

    # Create a Speech-to-Text client.
    client = speech.SpeechClient()

    # Configure the recognition request.
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        enable_automatic_punctuation=True,
        audio_channel_count=1,
        language_code="es-AR",
    )
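    # sample_rate_hertz is omitted: for WAV input the API reads it from the file header.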

    # Gradio delivers the recording as a path to an audio file on disk
    # (the Audio component below uses type="filepath"), so read the raw bytes.
    with io.open(audio_path, "rb") as audio_file:
        content = audio_file.read()
        audio = speech.RecognitionAudio(content=content)
    
    # Send the request to Google to transcribe the audio.
    response = client.recognize(request={"config": config, "audio": audio})
    # Read the response, keeping the top alternative of each result.
    transcript = []
    for result in response.results:
        print("Transcript: {}".format(result.alternatives[0].transcript))
        transcript.append(result.alternatives[0].transcript)

    return transcript


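# Gradio UI: the microphone recording is saved to disk and handed to transcribe_2
# as a file path (type="filepath"); the returned transcript is rendered as text.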
demo = gr.Interface(
    transcribe_2,
    gr.Audio(sources=["microphone"], type="filepath", streaming=False),
    "text",
    # live=True,  # live mode does not show the Submit button
)

demo.launch()