Spaces: Running
Update app.py
app.py CHANGED
@@ -1,10 +1,6 @@
 import spaces
 import os
 import threading
-from collections import deque
-
-import plotly.graph_objs as go
-import pynvml
 
 import gradio as gr
 from huggingface_hub import snapshot_download
@@ -30,100 +26,6 @@ models = [
     },
 ]
 
-# Queues for storing historical data (saving the last 100 GPU utilization and memory usage values)
-gpu_util_history = deque(maxlen=100)
-mem_usage_history = deque(maxlen=100)
-
-
-def initialize_nvml():
-    """
-    Initialize NVML (NVIDIA Management Library).
-    """
-    pynvml.nvmlInit()
-
-
-def get_gpu_info():
-    """
-    Get GPU utilization and memory usage information.
-
-    Returns:
-        dict: A dictionary containing GPU utilization and memory usage information.
-    """
-    handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # Assuming a single GPU setup
-    utilization = pynvml.nvmlDeviceGetUtilizationRates(handle)
-    memory = pynvml.nvmlDeviceGetMemoryInfo(handle)
-
-    gpu_info = {
-        'gpu_util': utilization.gpu,
-        'mem_used': memory.used / 1024**2,  # Convert bytes to MiB
-        'mem_total': memory.total / 1024**2,  # Convert bytes to MiB
-        'mem_percent': (memory.used / memory.total) * 100
-    }
-    return gpu_info
-
-
-def _update_charts(chart_height: int = 200) -> go.Figure:
-    """
-    Update the GPU utilization and memory usage charts.
-
-    Args:
-        chart_height (int, optional): used to set the height of the chart. Defaults to 200.
-
-    Returns:
-        plotly.graph_objs.Figure: The updated figure containing the GPU and memory usage charts.
-    """
-    # obtain GPU information
-    gpu_info = get_gpu_info()
-
-    # records the latest GPU utilization and memory usage values
-    gpu_util = round(gpu_info.get('gpu_util', 0), 1)
-    mem_used = round(gpu_info.get('mem_used', 0) / 1024, 2)  # Convert MiB to GiB
-    gpu_util_history.append(gpu_util)
-    mem_usage_history.append(mem_used)
-
-    # create GPU utilization line chart
-    gpu_trace = go.Scatter(
-        y=list(gpu_util_history),
-        mode='lines+markers',
-        text=list(gpu_util_history),
-        line=dict(shape='spline', color='blue'),  # Make the line smooth and set color
-        yaxis='y1'  # Link to y-axis 1
-    )
-
-    # create memory usage line chart
-    mem_trace = go.Scatter(
-        y=list(mem_usage_history),
-        mode='lines+markers',
-        text=list(mem_usage_history),
-        line=dict(shape='spline', color='red'),  # Make the line smooth and set color
-        yaxis='y2'  # Link to y-axis 2
-    )
-
-    # set the layout of the chart
-    layout = go.Layout(
-        xaxis=dict(title=None, showticklabels=False, ticks=''),
-        yaxis=dict(
-            title='GPU Utilization (%)',
-            range=[-5, 110],
-            titlefont=dict(color='blue'),
-            tickfont=dict(color='blue'),
-        ),
-        yaxis2=dict(title='Memory Usage (GiB)',
-                    range=[0, max(24,
-                                  max(mem_usage_history) + 1)],
-                    titlefont=dict(color='red'),
-                    tickfont=dict(color='red'),
-                    overlaying='y',
-                    side='right'),
-        height=chart_height,  # set the height of the chart
-        margin=dict(l=10, r=10, t=0, b=0),  # set the margin of the chart
-        showlegend=False  # disable the legend
-    )
-
-    fig = go.Figure(data=[gpu_trace, mem_trace], layout=layout)
-    return fig
-
-
 def initialize_history():
     """
     Initializes the GPU utilization and memory usage history.
@@ -134,13 +36,6 @@ def initialize_history():
     mem_usage_history.append(round(gpu_info.get('mem_percent', 0), 1))
 
 
-def enable_gpu_info():
-    pynvml.nvmlInit()
-
-
-def disable_gpu_info():
-    pynvml.nvmlShutdown()
-
 model_choices = [f"{model['name']} ({model['bits']})" for model in models]
 display_to_model = {f"{model['name']} ({model['bits']})": model['name'] for model in models}
 
@@ -159,7 +54,8 @@ def download_models_in_background():
 download_thread = threading.Thread(target=download_models_in_background)
 download_thread.start()
 
-
+loaded_model = None
+loaded_model_name = None
 
 @spaces.GPU
 def respond(
@@ -173,12 +69,16 @@ def respond(
 ):
     model_name = display_to_model[selected_model_display_label]
 
+    global loaded_model
+    global loaded_model_name
+
     # Check if the model is already loaded
-    if model_name not …
+    if model_name is not loaded_model_name:
         # Load and store the model in the cache
-        …
+        loaded_model = get_chat_loop_generator(model_name)
+        loaded_model_name = model_name
 
-    chat_completion = …
+    chat_completion = loaded_model
 
     messages = [{"role": "system", "content": system_message}]
 
@@ -240,4 +140,3 @@ with gr.Blocks(fill_height=True) as demo:
 if __name__ == "__main__":
     share = os.getenv("SHARE_LINK", None) in ["1", "true", "True"]
     demo.launch(share=share)
-    # disable_gpu_info()
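
The commit removes the pynvml/plotly GPU-monitoring code and instead caches the most recently loaded model in two module-level globals, so repeated calls to respond reuse the loaded model rather than reloading it. Below is a minimal sketch of that single-slot cache pattern, assuming a loader with the get_chat_loop_generator name used in the diff; its real implementation is not shown on this page, so a stub stands in for it, and get_cached_model is a hypothetical wrapper added here only for illustration.

# Single-slot model cache: remember the last loaded model and its name.
loaded_model = None
loaded_model_name = None

def get_chat_loop_generator(model_name):
    # Hypothetical stand-in for the repo's actual loader.
    def chat_loop(messages):
        yield f"[{model_name}] processed {len(messages)} messages"
    return chat_loop

def get_cached_model(model_name):
    """Return the generator for model_name, reloading only when the name changes."""
    global loaded_model, loaded_model_name
    # Note: `!=` compares string values; the diff's `is not` compares object
    # identity, which can trigger a reload even when the strings are equal.
    if model_name != loaded_model_name:
        loaded_model = get_chat_loop_generator(model_name)
        loaded_model_name = model_name
    return loaded_model

A single slot suffices for this app because each request uses one selected model; choosing a different model simply replaces the cached one, freeing the previous model for garbage collection before the next load.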