Remove default model selection on startup #18
by junhyeokk
src/app.py +47 -33
src/app.py
CHANGED
@@ -35,23 +35,24 @@ model_list = [
     "tiiuae/falcon-180B",
     "tiiuae/falcon-180B-Chat",
 ]
+
 st.title("Can you run it? LLM version")
 
 percentage_width_main = 80
 st.markdown(
-
-
-
-
-
-
-
+    f"""<style>
+    .appview-container .main .block-container{{
+        max-width: {percentage_width_main}%;}}
+    </style>
+    """,
+    unsafe_allow_html=True,
+)
 
 @st.cache_resource()
 def cache_model_list():
     model_list_info = {}
     for model_name in model_list:
-        if not "tiiuae/falcon" in model_name:
+        if not "tiiuae/falcon" in model_name:  # Exclude Falcon models
             model = get_model(model_name, library="transformers", access_token="")
             model_list_info[model_name] = calculate_memory(model, ["float32", "float16/bfloat16", "int8", "int4"])
             del model
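For context, `@st.cache_resource()` is what keeps `cache_model_list` from re-fetching model metadata on every script rerun. A minimal sketch of the pattern, with a hypothetical loader standing in for the Space's `get_model`/`calculate_memory` pipeline:

```python
import streamlit as st

@st.cache_resource()
def load_memory_table():
    # Hypothetical stand-in: the body runs once per server process;
    # subsequent reruns (every widget interaction) reuse the cached dict.
    return {"example/model-7b": {"float32 (GB)": 28.0, "int8 (GB)": 7.0}}

model_info = load_memory_table()  # cheap after the first call
```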
@@ -85,7 +86,7 @@ def show_gpu_info(info, trainable_params=0, vendor=""):
         if var == 'LoRa Fine-tuning':
             msg += f" ({trainable_params}%)"
         else:
-            if _info['Number of GPUs']==1:
+            if _info['Number of GPUs'] == 1:
                 msg = f"You can run **{var}**"
                 func = st.success
                 icon = "✅"
@@ -95,7 +96,6 @@ def show_gpu_info(info, trainable_params=0, vendor=""):
             icon = "⛔"
     func(msg, icon=icon)
 
-
 def get_name(index):
     row = gpu_specs.iloc[index]
     return f"{row['Product Name']} ({row['RAM (GB)']} GB, {row['Year']})"
@@ -106,16 +106,20 @@ def custom_ceil(a, precision=0):
 gpu_specs = get_gpu_specs()
 model_list_info = cache_model_list()
 
-_, col, _ = st.columns([1,3,1])
+_, col, _ = st.columns([1, 3, 1])
 with col.expander("Information", expanded=True):
-    st.markdown(
+    st.markdown(
+        """- GPU information comes from [TechPowerUp GPU Specs](https://www.techpowerup.com/gpu-specs/)
     - Mainly based on [Model Memory Calculator by hf-accelerate](https://huggingface.co/spaces/hf-accelerate/model-memory-usage)
     using `transformers` library
     - Inference is calculated following [EleutherAI Transformer Math 101](https://blog.eleuther.ai/transformer-math/),
-    where is estimated as """
+    where it is estimated as """
+    )
 
     st.latex(r"""\text{Memory}_\text{Inference} \approx \text{Model Size} \times 1.2""")
-    st.markdown(
+    st.markdown(
+        """- For LoRa Fine-tuning, I'm assuming a **16-bit** dtype of trainable parameters. The formula (in terms of GB) is"""
+    )
     st.latex(r"\text{Memory}_\text{LoRa} \approx \left(\text{Model Size} + \text{ \# trainable Params}_\text{Billions}\times\frac{16}{8} \times 4\right) \times 1.2")
 
 access_token = st.sidebar.text_input("Access token")
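The two `st.latex` formulas above translate directly into arithmetic. A hedged worked example with illustrative numbers (not from the app): a ~7B-parameter model in float16, with LoRA training roughly 1% of its parameters:

```python
model_size_gb = 14.0       # ~7B params at ~2 bytes each (float16)
trainable_params_b = 0.07  # LoRA trains ~1% of 7B parameters

# Memory_Inference ≈ Model Size × 1.2
inference_gb = model_size_gb * 1.2

# Memory_LoRa ≈ (Model Size + #trainable params (B) × 16/8 × 4) × 1.2
lora_gb = (model_size_gb + trainable_params_b * (16 / 8) * 4) * 1.2

print(f"inference ≈ {inference_gb:.1f} GB, LoRA ≈ {lora_gb:.1f} GB")
# inference ≈ 16.8 GB, LoRA ≈ 17.5 GB
```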
@@ -123,9 +127,9 @@ access_token = st.sidebar.text_input("Access token")
 if access_token:
     login(token=access_token)
 
-#model_name = st.sidebar.text_input("Model name", value="mistralai/Mistral-7B-v0.1")
 with st.sidebar.container():
-    model_name
+    model_name = stDatalist("Model name (Press Enter to apply)", model_list)
+
 if not model_name:
     st.info("Please enter a model name")
     st.stop()
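This hunk is the PR's core change: with the commented-out default and the old pre-selected value gone, `stDatalist` (the Space's datalist widget) yields an empty string until the user actually enters a model, and the guard below halts the run. A minimal sketch of that guard, assuming the widget's empty-on-start behavior:

```python
import streamlit as st

model_name = ""  # stand-in for stDatalist(...) before any user input
if not model_name:
    st.info("Please enter a model name")
    st.stop()  # ends this script run; nothing below executes until input arrives
```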
@@ -145,7 +149,6 @@ if model_name not in st.session_state:
     gc.collect()
     st.session_state['actual_model'] = model_name
 
-
 gpu_vendor = st.sidebar.selectbox("GPU Vendor", ["NVIDIA", "AMD", "Intel", "Apple"])
 # year = st.sidebar.selectbox("Filter by Release Year", list(range(2014, 2024))[::-1], index=None)
 gpu_info = gpu_specs[gpu_specs['Vendor'] == gpu_vendor].sort_values('Product Name')
@@ -159,7 +162,7 @@ gpu_info = gpu_info[gpu_info["RAM (GB)"].between(ram[0], ram[1])]
 if len(gpu_info) == 0:
     st.sidebar.error(f"**{gpu_vendor}** has no GPU in that RAM range")
     st.stop()
-gpu = st.sidebar.selectbox("GPU", gpu_info['Product Name'].index.tolist(), format_func=lambda x
+gpu = st.sidebar.selectbox("GPU", gpu_info['Product Name'].index.tolist(), format_func=lambda x: gpu_specs.iloc[x]['Product Name'])
 gpu_spec = gpu_specs.iloc[gpu]
 gpu_spec.name = 'INFO'
 
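Note the completed `format_func` here: the selectbox options are DataFrame row indices, and `format_func` maps each index to a human-readable label. A small self-contained sketch of the same idiom (toy data):

```python
import pandas as pd
import streamlit as st

specs = pd.DataFrame({"Product Name": ["GPU A", "GPU B"], "RAM (GB)": [24, 48]})

# Option values are row indices; the user sees the formatted product name.
choice = st.selectbox(
    "GPU",
    specs.index.tolist(),
    format_func=lambda i: specs.iloc[i]["Product Name"],
)
st.write(specs.iloc[choice])  # the row for the selected GPU
```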
@@ -169,8 +172,8 @@ st.sidebar.dataframe(gpu_spec.T.astype(str))
 
 memory_table = pd.DataFrame(st.session_state[model_name]).set_index('dtype')
 memory_table['LoRA Fine-Tuning (GB)'] = (memory_table["Total Size (GB)"] +
-                                         (memory_table["Parameters (Billion)"]* lora_pct/100 * (16/8)*4)) * 1.2
-
+                                         (memory_table["Parameters (Billion)"] * lora_pct/100 * (16/8) * 4)) * 1.2
+
 _memory_table = memory_table.copy()
 memory_table = memory_table.round(2).T
 _memory_table /= gpu_spec['RAM (GB)']
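For reference, the `_memory_table` manipulation that follows divides each memory figure by the selected GPU's RAM and ceils it, yielding a per-dtype GPU count. A toy sketch of the same arithmetic (hypothetical 24 GB card):

```python
import numpy as np
import pandas as pd

ram_gb = 24.0  # hypothetical GPU RAM
mem = pd.Series({"float32": 28.0, "float16": 14.0}, name="Inference (GB)")
gpus_needed = np.ceil(mem / ram_gb).astype(int)
print(gpus_needed)  # float32 -> 2 GPUs, float16 -> 1 GPU
```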
@@ -178,13 +181,16 @@ _memory_table = _memory_table.apply(np.ceil).astype(int).drop(columns=['Paramete
 _memory_table.columns = ['Inference', 'Full Training Adam', 'LoRa Fine-tuning']
 _memory_table = _memory_table.stack().reset_index()
 _memory_table.columns = ['dtype', 'Variable', 'Number of GPUs']
-col1, col2 = st.columns([1,1.3])
+col1, col2 = st.columns([1, 1.3])
 
 if gpu_vendor == "Apple":
-    col.warning(
-
+    col.warning(
+        """For M1/M2/M3 Apple chips, PyTorch uses [Metal Performance Shaders (MPS)](https://huggingface.co/docs/accelerate/usage_guides/mps) as backend.
+        Remember that Apple M1/M2/M3 chips share memory between CPU and GPU.""",
+        icon="⚠️",
+    )
 with col1:
-    st.write(f"#### [{model_name}](https://huggingface.co/{model_name}) ({custom_ceil(memory_table.iloc[3,0],1):.1f}B)")
+    st.write(f"#### [{model_name}](https://huggingface.co/{model_name}) ({custom_ceil(memory_table.iloc[3, 0], 1):.1f}B)")
 
 dtypes = memory_table.columns.tolist()[::-1]
 tabs = st.tabs(dtypes)
@@ -201,12 +207,20 @@ with col2:
     if gpu_vendor == "Apple":
         st.warning("This graph is irrelevant for M1/M2 chips as they can't run in parallel.", icon="⚠️")
         extra = "⚠️"
-    num_colors= 4
-    colors = [px.colors.sequential.RdBu[int(i*(len(px.colors.sequential.RdBu)-1)/(num_colors-1))] for i in range(num_colors)]
-    fig = px.bar(
-
-
-
-
-
-
+    num_colors = 4
+    colors = [px.colors.sequential.RdBu[int(i * (len(px.colors.sequential.RdBu) - 1) / (num_colors - 1))] for i in range(num_colors)]
+    fig = px.bar(
+        _memory_table,
+        x='Variable',
+        y='Number of GPUs',
+        color='dtype',
+        barmode='group',
+        color_discrete_sequence=colors,
+    )
+    fig.update_layout(
+        title=dict(text=f"{extra} Number of GPUs required for<br> {get_name(gpu)}", font=dict(size=25)),
+        xaxis_tickfont_size=14,
+        yaxis_tickfont_size=16,
+        yaxis_dtick='1'
+    )
+    st.plotly_chart(fig, use_container_width=True)
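The `colors` line above samples `num_colors` evenly spaced entries from plotly's RdBu scale, one per dtype in the grouped bar chart. The sampling in isolation:

```python
import plotly.express as px

scale = px.colors.sequential.RdBu  # a list of CSS color strings
n = 4
# Evenly spaced indices across the scale: first, two interior, last.
colors = [scale[int(i * (len(scale) - 1) / (n - 1))] for i in range(n)]
print(colors)
```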