Spaces:

amd
/

gemma3-27b-mi-amd

Running

App Files Files Community

Lohia, Aditya committed on Mar 28

Commit

f24a24a

1 Parent(s): e553ed7

update space

Browse files

Files changed (2) hide show

app.py +41 -29
gateway.py +84 -57

app.py CHANGED Viewed

@@ -1,15 +1,34 @@
 import os
 import gradio as gr
 from typing import Iterator
 from dialog import get_dialog_box
 from gateway import check_server_health, request_generation
 # CONSTANTS
-MAX_NEW_TOKENS: int = 2048
-# GET ENVIRONMENT VARIABLES
 CLOUD_GATEWAY_API = os.getenv("API_ENDPOINT")
 def toggle_ui():
@@ -18,7 +37,7 @@ def toggle_ui():
     Returns:
         hide/show main ui/dialog
     """
-    health = check_server_health(cloud_gateway_api=CLOUD_GATEWAY_API)
     if health:
         return gr.update(visible=True), gr.update(
             visible=False
@@ -35,9 +54,8 @@ def generate(
     system_prompt: str,
     max_new_tokens: int = 1024,
     temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
 ) -> Iterator[str]:
     """Send a request to backend, fetch the streaming responses and emit to the UI.
@@ -61,14 +79,15 @@ def generate(
     # sample method to yield responses from the llm model
     outputs = []
     for text in request_generation(
         message=message,
         system_prompt=system_prompt,
         max_new_tokens=max_new_tokens,
         temperature=temperature,
-        top_p=top_p,
-        top_k=top_k,
-        repetition_penalty=repetition_penalty,
         cloud_gateway_api=CLOUD_GATEWAY_API,
     ):
         outputs.append(text)
         yield "".join(outputs)
@@ -94,28 +113,21 @@ chat_interface = gr.ChatInterface(
             minimum=0.1,
             maximum=4.0,
             step=0.1,
-            value=1.0,
-        ),
-        gr.Slider(
-            label="Top-p (nucleus sampling)",
-            minimum=0.05,
-            maximum=1.0,
-            step=0.05,
-            value=0.95,
         ),
         gr.Slider(
-            label="Top-k",
-            minimum=1,
-            maximum=1000,
-            step=1,
-            value=64,
         ),
         gr.Slider(
-            label="Repetition penalty",
-            minimum=1.0,
             maximum=2.0,
-            step=0.05,
-            value=1.0,
         ),
     ],
     stop_btn=None,
@@ -134,14 +146,14 @@ chat_interface = gr.ChatInterface(
 with gr.Blocks(css="style.css", fill_height=True) as demo:
     # Get the server status before displaying UI
-    visibility = check_server_health(CLOUD_GATEWAY_API)
     # Container for the main interface
     with gr.Column(visible=visibility, elem_id="main_ui") as main_ui:
         gr.Markdown(
             f"""
-            # Gemma-3 27B Chat
-            This Space is an Alpha release that demonstrates [Gemma-3-27B-It](https://huggingface.co/google/gemma-3-27b-it) model running on AMD MI210 infrastructure. The space is built with Google Gemma 3 [License](https://ai.google.dev/gemma/terms). Feel free to play with it!
             """
         )
         chat_interface.render()

 import os
+import logging
 import gradio as gr
 from typing import Iterator
 from dialog import get_dialog_box
 from gateway import check_server_health, request_generation
# Setup logging
logging.basicConfig(level=logging.INFO)

# CONSTANTS
# Upper bound on generated tokens. Read from the MAX_NEW_TOKENS environment
# variable, defaulting to 2048 when it is not set. os.getenv returns a string
# whenever the variable is set, so convert explicitly to keep the value an int.
try:
    MAX_NEW_TOKENS: int = int(os.getenv("MAX_NEW_TOKENS", "2048"))
except ValueError as err:
    raise EnvironmentError("MAX_NEW_TOKENS must be an integer.") from err

# Validate environment variables: fail at import time with a clear message
# instead of failing later inside a request.
CLOUD_GATEWAY_API = os.getenv("API_ENDPOINT")
if not CLOUD_GATEWAY_API:
    raise EnvironmentError("API_ENDPOINT is not set.")

MODEL_NAME: str = os.getenv("MODEL_NAME")
if not MODEL_NAME:
    raise EnvironmentError("MODEL_NAME is not set.")

# Get API Key. This only checks that a key is present, not that the gateway
# accepts it, so the error mirrors the other "is not set" checks.
API_KEY = os.getenv("API_KEY")
if not API_KEY:
    raise EnvironmentError("API_KEY is not set.")

# Create the authorization header once, avoid declaring it multiple times
HEADER = {"x-api-key": f"{API_KEY}"}
 def toggle_ui():
     Returns:
         hide/show main ui/dialog
     """
+    health = check_server_health(cloud_gateway_api=CLOUD_GATEWAY_API, header=HEADER)
     if health:
         return gr.update(visible=True), gr.update(
             visible=False
     system_prompt: str,
     max_new_tokens: int = 1024,
     temperature: float = 0.6,
+    frequency_penalty: float = 0.0,
+    presence_penalty: float = 0.0,
 ) -> Iterator[str]:
     """Send a request to backend, fetch the streaming responses and emit to the UI.
     # sample method to yield responses from the llm model
     outputs = []
     for text in request_generation(
+        header=HEADER,
         message=message,
         system_prompt=system_prompt,
         max_new_tokens=max_new_tokens,
         temperature=temperature,
+        presence_penalty=presence_penalty,
+        frequency_penalty=frequency_penalty,
         cloud_gateway_api=CLOUD_GATEWAY_API,
+        model_name=MODEL_NAME,
     ):
         outputs.append(text)
         yield "".join(outputs)
             minimum=0.1,
             maximum=4.0,
             step=0.1,
+            value=0.3,
         ),
         gr.Slider(
+            label="Frequency penalty",
+            minimum=-2.0,
+            maximum=2.0,
+            step=0.1,
+            value=0.0,
         ),
         gr.Slider(
+            label="Presence penalty",
+            minimum=-2.0,
             maximum=2.0,
+            step=0.1,
+            value=0.0,
         ),
     ],
     stop_btn=None,
 with gr.Blocks(css="style.css", fill_height=True) as demo:
     # Get the server status before displaying UI
+    visibility = check_server_health(CLOUD_GATEWAY_API, header=HEADER)
     # Container for the main interface
     with gr.Column(visible=visibility, elem_id="main_ui") as main_ui:
         gr.Markdown(
             f"""
+            # Gemma 3 27b Instruct
+            This Space is an Alpha release that demonstrates [Gemma-3-27B-It](https://huggingface.co/google/gemma-3-27b-it) model running on AMD MI300 infrastructure. The space is built with Google Gemma 3 [License](https://ai.google.dev/gemma/terms). Feel free to play with it!
             """
         )
         chat_interface.render()

gateway.py CHANGED Viewed

@@ -1,41 +1,54 @@
 import json
 import requests
-def check_server_health(cloud_gateway_api: str):
     """
     Use the appropriate API endpoint to check the server health.
     Args:
         cloud_gateway_api: API endpoint to probe.
     Returns:
         True if server is active, false otherwise.
     """
     try:
-        response = requests.get(cloud_gateway_api + "/health")
-        if response.status_code == 200:
-            return True
-    except requests.ConnectionError:
-        print("Failed to establish connection to the server.")
-    return False
 def request_generation(
     message: str,
     system_prompt: str,
     cloud_gateway_api: str,
     max_new_tokens: int = 1024,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    top_k: int = 50,
-    repetition_penalty: float = 1.2,
 ):
     """
     Request streaming generation from the cloud gateway API. Uses the simple requests module with stream=True to utilize
     token-by-token generation from LLM.
     Args:
         message: prompt from the user.
         system_prompt: system prompt to append.
         cloud_gateway_api (str): API endpoint to send the request.
@@ -43,7 +56,6 @@ def request_generation(
         temperature: the value used to module the next token probabilities.
         top_p: if set to float<1, only the smallest set of most probable tokens with probabilities that add up to top_p
                 or higher are kept for generation.
-        top_k: the number of highest probability vocabulary tokens to keep for top-k-filtering.
         repetition_penalty: the parameter for repetition penalty. 1.0 means no penalty.
     Returns:
@@ -51,54 +63,69 @@ def request_generation(
     """
     payload = {
-        "model": "google/gemma-3-27b-it",
         "messages": [
-            *(
-                [
-                    {
-                        "role": "system",
-                        "content": [{"type": "text", "text": system_prompt}],
-                    }
-                ]
-                if system_prompt
-                else []
-            ),
-            {"role": "user", "content": [{"type": "text", "text": message}]},
         ],
         "max_tokens": max_new_tokens,
         "temperature": temperature,
-        "top_p": top_p,
-        "repetition_penalty": repetition_penalty,
-        "top_k": top_k,
         "stream": True,  # Enable streaming
     }
-    with requests.post(
-        cloud_gateway_api + "/v1/chat/completions", json=payload, stream=True
-    ) as response:
-        for chunk in response.iter_lines():
-            if chunk:
-                # Convert the chunk from bytes to a string and then parse it as json
-                chunk_str = chunk.decode("utf-8")
-                # Remove the `data: ` prefix from the chunk if it exists
-                if chunk_str.startswith("data: "):
-                    chunk_str = chunk_str[len("data: ") :]
-                # Skip empty chunks
-                if chunk_str.strip() == "[DONE]":
-                    break
-                # Parse the chunk into a JSON object
-                try:
-                    chunk_json = json.loads(chunk_str)
-                    # Extract the "content" field from the choices
-                    content = chunk_json["choices"][0]["delta"].get("content", "")
-                    # Print the generated content as it's streamed
-                    if content:
-                        yield content
-                except json.JSONDecodeError:
-                    # Handle any potential errors in decoding
-                    continue

 import json
+import logging
 import requests
+import urllib3
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+# Setup logging
+logging.basicConfig(level=logging.INFO)
def check_server_health(
    cloud_gateway_api: str, header: dict, timeout: float = 10.0
) -> bool:
    """
    Use the gateway's ``model/info`` endpoint to check the server health.

    Args:
        cloud_gateway_api: API endpoint to probe. ``model/info`` is appended
            to it verbatim, with no separator inserted.
        header: Header for Authorization.
        timeout: seconds to wait for the server before treating it as down.
            Without a timeout an unresponsive server would block the caller
            indefinitely.

    Returns:
        True if the server answered with a non-error HTTP status, False on a
        connection failure, timeout or HTTP error status.
    """
    try:
        response = requests.get(
            cloud_gateway_api + "model/info",
            headers=header,
            # NOTE(review): TLS certificate verification is disabled here, so
            # the API key is sent to an unauthenticated peer. Confirm this is
            # required by the gateway's certificate setup.
            verify=False,
            timeout=timeout,
        )
        response.raise_for_status()
        return True
    except requests.RequestException as e:
        logging.error("Failed to check server health: %s", e)
        return False
 def request_generation(
     header: dict,
     message: str,
     system_prompt: str,
     cloud_gateway_api: str,
     model_name: str,
     max_new_tokens: int = 1024,
     temperature: float = 0.3,
     frequency_penalty: float = 0.0,
     presence_penalty: float = 0.0,
     timeout=(10.0, 300.0),
 ):
     """
     Request streaming generation from the cloud gateway API. Uses the simple requests module with stream=True to utilize
     token-by-token generation from LLM.

     The exchange takes two calls: a POST to ``chat/conversation`` that returns a ``conversationId``, then a streaming
     GET to ``conversation/stream`` carrying that id in the ``X-Conversation-ID`` header.

     Args:
         header: authorization header for the API. It is copied, never modified, so a header dict shared between
             concurrent requests cannot end up carrying another request's conversation id.
         message: prompt from the user.
         system_prompt: system prompt sent as the system message.
         cloud_gateway_api (str): API endpoint to send the request; paths are appended to it verbatim.
         model_name: model identifier sent in the ``model`` field of the payload.
         max_new_tokens: sent to the gateway as ``max_tokens``.
         temperature: the value used to modulate the next token probabilities.
         frequency_penalty: sent to the gateway as ``frequency_penalty``.
         presence_penalty: sent to the gateway as ``presence_penalty``.
         timeout: ``requests`` timeout, either one number or a ``(connect, read)`` tuple. The read timeout bounds the
             wait between streamed chunks, not the whole generation.

     Returns:
         A generator yielding text fragments as they arrive. If a request fails, a single
         "Server not responding" message is yielded instead of raising.
     """
     payload = {
         "model": model_name,
         "messages": [
             {"role": "system", "content": system_prompt},
             {"role": "user", "content": message},
         ],
         "max_tokens": max_new_tokens,
         "temperature": temperature,
         "frequency_penalty": frequency_penalty,
         "presence_penalty": presence_penalty,
         "stream": True,  # Enable streaming
         "serving_runtime": "vllm",
     }
     try:
         # NOTE(review): verify=False disables TLS certificate verification on
         # both requests below. Confirm the gateway's certificate setup needs it.
         response = requests.post(
             cloud_gateway_api + "chat/conversation",
             headers=header,
             json=payload,
             verify=False,
             timeout=timeout,
         )
         response.raise_for_status()
         try:
             conversation_id = response.json()["conversationId"]
         except (ValueError, KeyError, TypeError) as e:
             logging.error("Gateway response carries no conversation id: %s", e)
             yield "Server not responding. Please try again later."
             return
         # Build a per-call header rather than writing into the caller's dict
         stream_header = {**header, "X-Conversation-ID": conversation_id}
         with requests.get(
             cloud_gateway_api + "conversation/stream",
             headers=stream_header,
             verify=False,
             stream=True,
             timeout=timeout,
         ) as stream:
             # Surface HTTP errors instead of parsing an error body as tokens
             stream.raise_for_status()
             for chunk in stream.iter_lines():
                 if not chunk:
                     continue
                 chunk_str = chunk.decode("utf-8")
                 # Remove the `data: ` prefix from the chunk if it exists;
                 # the prefix is stripped at most twice
                 for _ in range(2):
                     if chunk_str.startswith("data: "):
                         chunk_str = chunk_str[len("data: ") :]
                 # End-of-stream marker
                 if chunk_str.strip() == "[DONE]":
                     break
                 content = _extract_content(chunk_str)
                 if content:
                     yield content
     except requests.RequestException as e:
         logging.error("Failed to generate response: %s", e)
         yield "Server not responding. Please try again later."


 def _extract_content(chunk_str: str) -> str:
     """Return the text at choices[0].delta.content of one stream line, or "" if absent or malformed."""
     try:
         chunk_json = json.loads(chunk_str)
     except json.JSONDecodeError:
         # Lines that are not JSON carry no content
         return ""
     choices = chunk_json.get("choices") if isinstance(chunk_json, dict) else None
     first = choices[0] if isinstance(choices, list) and choices else None
     delta = first.get("delta") if isinstance(first, dict) else None
     content = delta.get("content") if isinstance(delta, dict) else None
     return content if isinstance(content, str) else ""