import logging
import os
from typing import BinaryIO

import hydra
import wandb
from dotenv import load_dotenv
from omegaconf import DictConfig
from openai import OpenAI


# Load OPENAI_API_KEY from a local .env file; OpenAI() reads it from the
# environment, so the key never needs to be passed explicitly.
load_dotenv()
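
# A minimal .env sketch (the value below is a placeholder, not a real key):
#   OPENAI_API_KEY=sk-...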


def speech_to_text(
    audio: BinaryIO, openai_client: OpenAI, configuration: dict
) -> str:
    """Generate a text transcription of an audio file using the OpenAI API.

    Args:
        audio (BinaryIO): audio file containing the query, opened in binary mode
        openai_client (OpenAI): client for the OpenAI connection
        configuration (dict): transcription parameters (model, language,
            response_format, temperature)

    Returns:
        str: transcription text, or an error message if the request fails
    """
    try:
        transcription = openai_client.audio.transcriptions.create(
            model=configuration["model"],
            file=audio,
            language=configuration["language"],
            response_format=configuration["response_format"],
            temperature=configuration["temperature"],
        )

        logging.info("Success: audio converted into text!")
        logging.info(f"Audio transcription: {transcription}")
        return transcription
    except FileNotFoundError as e:
        logging.error(f"Error: not found - {str(e)}")
        return f"error {str(e)}"
    except Exception as e:
        logging.error(f"Error: OpenAI API request failed - {str(e)}")
        return f"error {str(e)}"
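

# A minimal sketch of the Hydra config expected at ../../conf/speech_to_text.yaml.
# Field names are inferred from the cfg accesses below; the experiment and
# dataset values are assumptions, while the openai_parameters mirror the local
# test configuration in __main__.
#
#   main:
#     project_name: lux-voice-processing
#     experiment_name: speech_to_text
#     audio_dataset: audio_recordings:latest
#   openai_parameters:
#     model: whisper-1
#     language: it
#     response_format: text
#     temperature: 0.2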


@hydra.main(config_path="../../conf", config_name="speech_to_text.yaml")
def speech_to_text_on_wandb(cfg: DictConfig):
    openai_client = OpenAI()
    run = wandb.init(
        project=cfg.main.project_name,
        group=cfg.main.experiment_name,
        config=cfg.openai_parameters,
        job_type="train_llm",
    )

    # download artifact
    artifact = run.use_artifact(
        os.path.join("mpoliti08/lux-voice-processing", cfg.main.audio_dataset),
        type="audio",
    )
    artifact_dir = artifact.download()

    table = wandb.Table(columns=["audio_file", "transcript"])

    for filename in os.listdir(artifact_dir):
        file_path = os.path.join(artifact_dir, filename)
        # Use a context manager so each file handle is closed after the request.
        with open(file_path, "rb") as audio:
            transcription_text = speech_to_text(
                audio=audio,
                openai_client=openai_client,
                configuration=cfg.openai_parameters,
            )

        audio_file = wandb.Audio(file_path)
        table.add_data(audio_file, transcription_text)

    run.log({"Table": table})
    run.finish()


if __name__ == "__main__":
    openai_client = OpenAI()
    audio_path = "data/audio_recordings/0.wav"
    configuration = {
        "language": "it",
        "model": "whisper-1",
        "response_format": "text",
        "temperature": 0.2,
    }

    with open(audio_path, "rb") as audio:
        res = speech_to_text(
            audio=audio, openai_client=openai_client, configuration=configuration
        )
    print(res)

    # speech_to_text_on_wandb()
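
# Sketch of how the W&B pipeline could be launched instead (uncomment the call
# above); the script name and the Hydra-style override value are assumptions:
#   python speech_to_text.py main.audio_dataset=audio_recordings:latest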