whisper_fileStream

Sleeping

File size: 3,761 Bytes

2cc8a36
 
0a574ec
2cc8a36
 
943c80d
2cc8a36
943c80d
2cc8a36
943c80d
 
2cc8a36
 
 
 
 
 
 
 
 
943c80d
2cc8a36
943c80d
 
 
 
 
0a574ec
943c80d
 
 
 
 
 
 
 
 
 
2cc8a36
 
 
 
943c80d
 
0a574ec
943c80d
2cc8a36
 
 
 
 
 
943c80d
2cc8a36
 
 
 
 
 
 
 
 
0a574ec
2cc8a36
 
0a574ec
2cc8a36
0a574ec
2cc8a36
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a574ec
2cc8a36
 
 
0a574ec
 
 
 
 
 
 
 
4a60ca6
943c80d


'''
This script transcribes microphone audio with Whisper and then calls an OpenAI
completion model to predict the next few words of the transcript.
'''
import os
# os.system("pip install --upgrade pip")
from pprint import pprint
# os.system("pip install git+https://github.com/openai/whisper.git")
import sys
# print("Sys: ", sys.executable)
# os.system("pip install openai")
import openai
import gradio as gr
import whisper
from transformers import pipeline
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import time

# Few-shot prompt offered as the example value for the prompt textbox.
# It deliberately ends with "Transcript: " so that debug_inference can append
# the transcription followed by "\nPrediction: ". The example predictions are
# spelled correctly because a completion model imitates the text it is shown.
EXAMPLE_PROMPT = """This is a tool for helping someone with memory issues remember the next word. 

The predictions follow a few rules:
1) The predictions are suggestions of ways to continue the transcript as if someone forgot what the next word was.
2) The predictions do not repeat themselves.
3) The predictions focus on suggesting nouns, adjectives, and verbs.
4) The predictions are related to the context in the transcript.
    
EXAMPLES:
Transcript: Tomorrow night we're going out to 
Prediction: The Movies, A Restaurant, A Baseball Game, The Theater, A Party for a friend   
Transcript: I would like to order a cheeseburger with a side of
Prediction: French fries, Milkshake, Apple slices, Side salad, Extra ketchup 
Transcript: My friend Savanah is
Prediction: An electrical engineer, A marine biologist, A classical musician 
Transcript: I need to buy a birthday
Prediction: Present, Gift, Cake, Card
Transcript: """

# Whisper speech-to-text model ("tiny" checkpoint), loaded once when the
# module is executed.
# NOTE: debug_inference has a parameter that is also named `model`, which
# shadows this module-level name inside that function.
model = whisper.load_model("tiny")

# NOTE(review): the explicit API-key assignment below is disabled; presumably
# the key is supplied some other way (e.g. an environment variable) — confirm.
# openai.api_key = os.environ["Openai_APIkey"]

def debug_inference(audio, prompt, model, temperature, state=""):
    """Transcribe a recording with Whisper and request next-word suggestions.

    Args:
        audio: Path to the recorded audio file (the microphone input is
            configured with ``type="filepath"``). A falsy value means that
            nothing was recorded.
        prompt: Few-shot prompt text. The transcription, a newline and the
            marker ``Prediction: `` are appended to it before it is sent.
        model: Name of the OpenAI completion model to query.
        temperature: Sampling temperature forwarded to the completion call.
        state: Session state; returned unchanged.

    Returns:
        A 4-tuple ``(transcription, state, predictions, full_prompt)``.
        ``predictions`` is a list holding one completion per returned choice
        with newlines removed. When no audio is supplied the result is
        ``("", state, [], "")`` and no model is called.
    """
    # Nothing recorded: return empty results instead of failing in load_audio.
    if not audio:
        return "", state, [], ""

    # The ``model`` parameter (an OpenAI model name) shadows the module-level
    # Whisper model, so the speech model is fetched from the module namespace.
    whisper_model = globals()["model"]

    # Load the recording and pad/trim it to the length Whisper expects.
    samples = whisper.pad_or_trim(whisper.load_audio(audio))

    # Log-mel spectrogram, placed on the same device as the speech model.
    mel = whisper.log_mel_spectrogram(samples).to(whisper_model.device)

    # Decode speech to text without half precision.
    options = whisper.DecodingOptions(fp16=False)
    result = whisper.decode(whisper_model, mel, options)
    transcript = result.text
    print("Whisper transcription: ", transcript)

    text = prompt + transcript + "\nPrediction: "

    # NOTE(review): chat-only models such as "gpt-3.5-turbo" are presumably
    # rejected by the Completion endpoint — confirm before offering them.
    response = openai.Completion.create(
        model=model,
        prompt=text,
        temperature=temperature,
        max_tokens=8,
        n=5,
    )

    # One suggestion per returned choice, flattened onto a single line.
    infers = [choice["text"].replace("\n", "") for choice in response["choices"]]
    print("Predictions: ", infers)

    return transcript, state, infers, text

# Microphone demo wiring: the input components are listed in the positional
# order of debug_inference's parameters, the outputs in the order it returns.
demo_inputs = [
    gr.inputs.Audio(source="microphone", type="filepath"),
    gr.inputs.Textbox(lines=15, placeholder="Enter a prompt here"),
    gr.inputs.Dropdown(
        ["text-ada-001", "text-davinci-002", "text-davinci-003", "gpt-3.5-turbo"],
        label="Model",
    ),
    gr.inputs.Slider(minimum=0.0, maximum=1.0, default=0.8, step=0.1, label="Temperature"),
    "state",
]
demo_outputs = ["textbox", "state", "textbox", "textbox"]

# One pre-filled example row followed by one blank row.
demo_examples = [
    ["example_in-the-mood-to-eat.m4a", EXAMPLE_PROMPT, "text-ada-001", 0.8, ""],
    ["", "", "", 0.9, ""],
]

demo = gr.Interface(
    fn=debug_inference,
    inputs=demo_inputs,
    outputs=demo_outputs,
    examples=demo_examples,
    live=False,
)
demo.launch()