File size: 4,591 Bytes
b94dfba
 
 
 
 
 
 
 
 
 
47fcff2
79cade0
1b9ab22
18d6e67
79cade0
0c5007d
1b9ab22
79cade0
246199f
 
0cce7a0
79cade0
 
 
 
 
 
18d6e67
246199f
 
 
79cade0
0cce7a0
efe1573
1b9ab22
47fcff2
18d6e67
 
 
 
1b9ab22
5d4663f
246199f
 
 
1b9ab22
 
efe1573
246199f
3ba965f
246199f
1b9ab22
efe8c50
18d6e67
efe8c50
 
 
 
 
18d6e67
 
efe8c50
 
 
3ba965f
246199f
18d6e67
efe8c50
3ba965f
18d6e67
a604d22
b94dfba
 
 
 
 
 
246199f
a604d22
 
47fcff2
 
3ba965f
 
 
 
 
 
 
 
 
 
b94dfba
3ba965f
 
246199f
 
3ba965f
 
47fcff2
 
 
 
61abdf6
47fcff2
b94dfba
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
The error message you're seeing is likely due to the way you're defining and calling the `generate_response` function in your Gradio app.

In Gradio, a function defined with the `async` keyword is a coroutine: calling it only creates a coroutine object, and that object must be awaited (or driven by an event loop) before its body actually runs and produces a value.

In your case, you're defining the `generate_response` function as an `async` function, but you're not using the `await` keyword to call it. Instead, you're passing it as a callback to the `gr.Button` component.

To fix this issue, define the `generate_response` callback as a regular (synchronous) function, and have it run the async `respond` coroutine to completion on an event loop so that Gradio receives a plain string result.

Here's an updated version of your code that should work:
```python
import gradio as gr
import os
import openai
import tenacity

ACCESS_TOKEN = os.getenv("HF_TOKEN")
openai.api_key = ACCESS_TOKEN

# Retry logic with tenacity for handling API rate limits.
#
# NOTE: the retry decorator must wrap the raw API call, *not* respond()
# itself — respond() catches every exception and returns an error string,
# which would otherwise prevent tenacity from ever seeing (and retrying)
# a failure.  reraise=True makes the final failure propagate as the
# original exception so the APIError handler below still fires.
@tenacity.retry(
    wait=tenacity.wait_exponential(multiplier=1, min=4, max=10),
    stop=tenacity.stop_after_attempt(5),
    reraise=True,
)
def _stream_completion(messages, max_tokens, temperature, top_p):
    """Run one streaming chat-completion call and return the full text.

    Raises whatever the openai client raises so tenacity can retry.
    """
    response = ""
    stream = openai.ChatCompletion.create(
        model="NousResearch/Hermes-3-Llama-3.1-8B",
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        messages=messages,
        stream=True,
    )
    # Stream response and concatenate delta tokens into one string.
    for chunk in stream:
        if 'choices' in chunk and 'delta' in chunk['choices'][0] and 'content' in chunk['choices'][0]['delta']:
            response += chunk['choices'][0]['delta']['content']
    return response


async def respond(
    message,
    system_message,
    max_tokens,
    temperature,
    top_p,
):
    """Generate a single chat response with no conversation history.

    Returns the model's reply as a string, or a human-readable error
    string on failure — callers never see an exception.
    """
    try:
        # Only use the system message and the current message for the response.
        messages = [{"role": "system", "content": system_message},
                    {"role": "user", "content": message}]
        return _stream_completion(messages, max_tokens, temperature, top_p)

    except openai.error.APIError as e:
        # Handle both string and dict types of error bodies.
        error_details = e.body
        if isinstance(error_details, dict):
            error_type = error_details.get("type", "Unknown")
            error_code = error_details.get("code", "Unknown")
            error_param = error_details.get("param", "Unknown")
            error_message = error_details.get("message", "An error occurred.")
            error_str = f"{error_type}: {error_message} (code: {error_code}, param: {error_param})"
        else:
            error_str = f"Error: {error_details}"

        print(f"APIError: {error_str}")
        return error_str

    except Exception as e:
        print(f"Exception: {e}")
        return "Error occurred. Please try again."


# Gradio function to handle user input and response generation without history.
def generate_response(message, system_message, max_tokens, temperature, top_p):
    """Bridge Gradio's synchronous callback to the async respond() coroutine.

    Returns the response string produced by respond().
    """
    import asyncio
    # asyncio.run creates, runs, and *closes* a fresh event loop.  The
    # previous new_event_loop/run_until_complete version leaked the loop
    # (it was never closed) and overwrote the thread's current-loop slot
    # on every call.
    return asyncio.run(respond(message, system_message, max_tokens, temperature, top_p))


def launch_app():
    """Build the single-turn chatbot UI and launch the Gradio server."""
    try:
        with gr.Blocks() as demo:
            gr.Markdown("# Chatbot")
            message = gr.Textbox(label="Message")
            system_message = gr.Textbox(label="System message")
            max_tokens = gr.Slider(minimum=1, maximum=2048, value=2048, step=1, label="Max new tokens")
            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-P")
            response = gr.Text(label="Response")

            # Wire the button to the history-free generate_response callback.
            generate_btn = gr.Button("Generate Response")
            generate_btn.click(
                generate_response,
                inputs=[message, system_message, max_tokens, temperature, top_p],
                outputs=[response],
                show_progress=False,
            )
        demo.launch(show_error=True)
    except KeyError as e:
        print(f"Error: {e}")
        print("Please try again.")

# Launch the app only when run as a script, not when imported as a module.
if __name__ == "__main__":
    launch_app()
```
This code defines `generate_response` as a regular synchronous function that runs the async `respond` coroutine to completion on an event loop, so Gradio's button callback receives a plain string rather than an un-awaited coroutine object.