oobabooga committed · Commit 6441476 · verified · 1 Parent(s): 8f3a4a9

Create app.py

Files changed (1): app.py (+432, -0)
app.py ADDED
@@ -0,0 +1,432 @@
import io
import re
import struct
from enum import IntEnum
from math import exp

import gradio as gr
import requests


class GGUFValueType(IntEnum):
    UINT8 = 0
    INT8 = 1
    UINT16 = 2
    INT16 = 3
    UINT32 = 4
    INT32 = 5
    FLOAT32 = 6
    BOOL = 7
    STRING = 8
    ARRAY = 9
    UINT64 = 10
    INT64 = 11
    FLOAT64 = 12


# struct format strings for the fixed-size GGUF value types (little-endian)
_simple_value_packing = {
    GGUFValueType.UINT8: "<B",
    GGUFValueType.INT8: "<b",
    GGUFValueType.UINT16: "<H",
    GGUFValueType.INT16: "<h",
    GGUFValueType.UINT32: "<I",
    GGUFValueType.INT32: "<i",
    GGUFValueType.FLOAT32: "<f",
    GGUFValueType.UINT64: "<Q",
    GGUFValueType.INT64: "<q",
    GGUFValueType.FLOAT64: "<d",
    GGUFValueType.BOOL: "?",
}

# size in bytes of each fixed-size GGUF value type
value_type_info = {
    GGUFValueType.UINT8: 1,
    GGUFValueType.INT8: 1,
    GGUFValueType.UINT16: 2,
    GGUFValueType.INT16: 2,
    GGUFValueType.UINT32: 4,
    GGUFValueType.INT32: 4,
    GGUFValueType.FLOAT32: 4,
    GGUFValueType.UINT64: 8,
    GGUFValueType.INT64: 8,
    GGUFValueType.FLOAT64: 8,
    GGUFValueType.BOOL: 1,
}


def get_single(value_type, file):
    if value_type == GGUFValueType.STRING:
        value_length = struct.unpack("<Q", file.read(8))[0]
        value = file.read(value_length)
        try:
            value = value.decode('utf-8')
        except UnicodeDecodeError:
            pass  # keep the raw bytes if the string is not valid UTF-8
    else:
        type_str = _simple_value_packing.get(value_type)
        bytes_length = value_type_info.get(value_type)
        value = struct.unpack(type_str, file.read(bytes_length))[0]

    return value


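# Example: get_single reads one value at the file's current offset, e.g.
#   buf = io.BytesIO(struct.pack("<I", 4096))
#   get_single(GGUFValueType.UINT32, buf)  # -> 4096

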
def load_metadata_from_file(file_obj):
    """Load metadata from a file-like object"""
    metadata = {}

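    # GGUF header layout (v2/v3, little-endian): uint32 magic (0x46554747,
    # "GGUF"), uint32 version, uint64 tensor count, uint64 key/value count.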
    GGUF_MAGIC = struct.unpack("<I", file_obj.read(4))[0]
    GGUF_VERSION = struct.unpack("<I", file_obj.read(4))[0]
    ti_data_count = struct.unpack("<Q", file_obj.read(8))[0]
    kv_data_count = struct.unpack("<Q", file_obj.read(8))[0]

    if GGUF_VERSION == 1:
        raise Exception('You are using an outdated GGUF, please download a new one.')

    for _ in range(kv_data_count):
        key_length = struct.unpack("<Q", file_obj.read(8))[0]
        key = file_obj.read(key_length)

        value_type = GGUFValueType(struct.unpack("<I", file_obj.read(4))[0])
        if value_type == GGUFValueType.ARRAY:
            ltype = GGUFValueType(struct.unpack("<I", file_obj.read(4))[0])
            length = struct.unpack("<Q", file_obj.read(8))[0]

            arr = [get_single(ltype, file_obj) for _ in range(length)]
            metadata[key.decode()] = arr
        else:
            value = get_single(value_type, file_obj)
            metadata[key.decode()] = value

    # Extract specific fields needed for VRAM calculation
    extracted_fields = {}
    for key, value in metadata.items():
        if key.endswith('.block_count'):
            extracted_fields['n_layers'] = value
        elif key.endswith('.attention.head_count_kv'):
            extracted_fields['n_kv_heads'] = value
        elif key.endswith('.embedding_length'):
            extracted_fields['embedding_dim'] = value
        elif key.endswith('.context_length'):
            extracted_fields['context_length'] = value
        elif key.endswith('.feed_forward_length'):
            extracted_fields['feed_forward_dim'] = value

    # Add extracted fields to metadata for easy access
    metadata.update(extracted_fields)
    return metadata


def download_gguf_partial(url, max_bytes=25 * 1024 * 1024):
    """Download the first max_bytes from a GGUF URL"""
    try:
        # Set up headers for partial content request
        headers = {'Range': f'bytes=0-{max_bytes - 1}'}

        # Make the request (timeout added so a stalled server can't hang the app)
        response = requests.get(url, headers=headers, stream=True, timeout=30)
        response.raise_for_status()

        # Read the content
        content = response.content

        # Convert to BytesIO for file-like interface
        return io.BytesIO(content)

    except Exception as e:
        raise Exception(f"Failed to download GGUF file: {str(e)}")


def load_metadata(model_url, current_metadata):
    """Load metadata from model URL and return updated metadata dict"""
    if not model_url or model_url.strip() == "":
        # Three outputs are expected: metadata state, slider update, status text
        return {}, gr.update(), "Please enter a model URL"

    try:
        # Get model size first
        model_size_mb = get_model_size_mb_from_url(model_url)

        # Normalize URL for downloading
        normalized_url = normalize_huggingface_url(model_url)

        # Download the first 25MB of the file
        file_obj = download_gguf_partial(normalized_url)

        # Parse the metadata
        metadata = load_metadata_from_file(file_obj)

        # Extract model name from URL if it's a Hugging Face URL
        model_name = model_url
        if "huggingface.co/" in model_url:
            try:
                # Extract model name from URL like https://huggingface.co/user/model
                parts = model_url.split("huggingface.co/")[1].split("/")
                if len(parts) >= 2:
                    model_name = f"{parts[0]}/{parts[1]}"
            except IndexError:
                model_name = model_url

        # Add URL, model name, and size to metadata
        metadata['url'] = model_url
        metadata['model_name'] = model_name
        metadata['model_size_mb'] = model_size_mb
        metadata['loaded'] = True

        return metadata, gr.update(value=metadata["n_layers"], maximum=metadata["n_layers"]), f"Metadata loaded successfully for: {model_name}"

    except Exception as e:
        error_msg = f"Error loading metadata: {str(e)}"
        return {}, gr.update(), error_msg


def normalize_huggingface_url(url: str) -> str:
    """Normalize HuggingFace URL to resolve format for direct access"""
    if 'huggingface.co' not in url:
        return url

    # Remove query parameters first
    base_url = url.split('?')[0]

    # Convert blob URL to resolve URL
    if '/blob/' in base_url:
        base_url = base_url.replace('/blob/', '/resolve/')

    return base_url


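# Example: a "blob" page URL becomes a direct download URL, e.g.
#   normalize_huggingface_url("https://huggingface.co/user/model/blob/main/model.gguf")
#   -> "https://huggingface.co/user/model/resolve/main/model.gguf"

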
def get_model_size_mb_from_url(model_url: str) -> float:
    """Get model size in MB from URL without downloading, handling multi-part files"""
    try:
        # Normalize the URL for direct access
        normalized_url = normalize_huggingface_url(model_url)

        # Get size of the main file
        response = requests.head(normalized_url, allow_redirects=True, timeout=30)
        response.raise_for_status()
        main_file_size = int(response.headers.get('content-length', 0))

        # Extract filename from original URL
        filename = normalized_url.split('/')[-1]

        # Check for multipart pattern (e.g., model-00001-of-00002.gguf)
        match = re.match(r'(.+)-(\d+)-of-(\d+)\.gguf$', filename)

        if match:
            base_pattern = match.group(1)
            total_parts = int(match.group(3))

            total_size = 0
            base_url = '/'.join(normalized_url.split('/')[:-1]) + '/'

            # Get size of all parts
            for part_num in range(1, total_parts + 1):
                part_filename = f"{base_pattern}-{part_num:05d}-of-{total_parts:05d}.gguf"
                part_url = base_url + part_filename

                try:
                    part_response = requests.head(part_url, allow_redirects=True, timeout=30)
                    part_response.raise_for_status()
                    part_size = int(part_response.headers.get('content-length', 0))
                    total_size += part_size
                except requests.RequestException:
                    print(f"Warning: Could not get size of {part_filename}, estimating...")
                    # If we can't get some parts, estimate based on what we have
                    if total_size > 0:
                        avg_size = total_size / (part_num - 1)
                        remaining_parts = total_parts - (part_num - 1)
                        total_size += avg_size * remaining_parts
                    else:
                        # Fall back to main file size * total parts
                        total_size = main_file_size * total_parts
                    break

            return total_size / (1024 ** 2)
        else:
            # Single part file
            return main_file_size / (1024 ** 2)

    except Exception as e:
        print(f"Error getting model size: {e}")
        return 0.0


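# The constants in estimate_vram() below come from a formula fitted to
# measured VRAM usage; see the blog post linked in the UI for the derivation:
# https://oobabooga.github.io/blog/posts/gguf-vram-formula/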
def estimate_vram(metadata, gpu_layers, ctx_size, cache_type):
    """Calculate VRAM usage using the fitted formula"""
    try:
        # Extract required values from metadata
        n_layers = metadata.get('n_layers')
        n_kv_heads = metadata.get('n_kv_heads')
        embedding_dim = metadata.get('embedding_dim')
        context_length = metadata.get('context_length')
        feed_forward_dim = metadata.get('feed_forward_dim')
        size_in_mb = metadata.get('model_size_mb', 0)

        # Check if we have all required fields
        required_fields = [n_layers, n_kv_heads, embedding_dim, context_length, feed_forward_dim]
        if any(field is None for field in required_fields):
            missing = [name for name, field in zip(
                ['n_layers', 'n_kv_heads', 'embedding_dim', 'context_length', 'feed_forward_dim'],
                required_fields) if field is None]
            raise ValueError(f"Missing required metadata fields: {missing}")

        # Ensure gpu_layers doesn't exceed total layers
        if gpu_layers > n_layers:
            gpu_layers = n_layers

        # Convert cache_type to bits per element
        cache_type_map = {'fp16': 16, 'q8_0': 8, 'q4_0': 4}
        cache_type_numeric = cache_type_map.get(cache_type, 16)

        # Derived features
        size_per_layer = size_in_mb / max(n_layers, 1e-6)
        context_per_layer = context_length / max(n_layers, 1e-6)
        ffn_per_embedding = feed_forward_dim / max(embedding_dim, 1e-6)
        kv_cache_factor = n_kv_heads * cache_type_numeric * ctx_size

        # Indicator: 1 if x < y, else 0
        def smaller(x, y):
            return 1 if x < y else 0

        # Calculate VRAM using the fitted model
        vram = (
            (size_per_layer - 21.19195204848197)
            * exp(0.0001047328491557063 * size_in_mb * smaller(ffn_per_embedding, 2.671096993407845))
            + 0.0006621544775632052 * context_per_layer
            + 3.34664386576376e-05 * kv_cache_factor
        ) * (1.363306170123392 + gpu_layers) + 1255.163594536052

        return max(0, vram)  # Ensure non-negative result

    except Exception as e:
        print(f"Error in VRAM calculation: {e}")
        raise


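# Quick sanity check for estimate_vram() (hypothetical metadata values, not
# taken from any real model):
#   meta = {'n_layers': 32, 'n_kv_heads': 8, 'embedding_dim': 4096,
#           'context_length': 131072, 'feed_forward_dim': 14336,
#           'model_size_mb': 4500}
#   estimate_vram(meta, gpu_layers=32, ctx_size=8192, cache_type='fp16')

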
def estimate_vram_wrapper(model_metadata, gpu_layers, ctx_size, cache_type):
    """Wrapper function to estimate VRAM usage"""
    if not model_metadata or 'model_name' not in model_metadata:
        return "<div id=\"vram-info\">Estimated VRAM to load the model:</div>"

    # Use cache_type directly (it's already a string from the radio button)
    try:
        result = estimate_vram(model_metadata, gpu_layers, ctx_size, cache_type)
        conservative = result + 906  # margin for the 95% upper-bound estimate
        return f"""<div id="vram-info">
<div>Expected VRAM usage: <span class="value">{result:.0f} MiB</span></div>
<div>Safe estimate: <span class="value">{conservative:.0f} MiB</span> - 95% chance the VRAM is at most this.</div>
</div>"""
    except Exception as e:
        return f"<div id=\"vram-info\">Estimated VRAM to load the model: <span class=\"value\">Error: {str(e)}</span></div>"


def create_ui():
    """Create the simplified UI"""
    # Custom CSS to limit max width and center the content
    css = """
    body {
        max-width: 810px !important;
        margin: 0 auto !important;
    }

    #vram-info {
        padding: 10px;
        border-radius: 4px;
        background-color: var(--background-fill-secondary);
    }

    #vram-info .value {
        font-weight: bold;
        color: var(--primary-500);
    }
    """

    with gr.Blocks(css=css) as demo:
        # State to hold model metadata
        model_metadata = gr.State(value={})

        gr.Markdown("# Accurate GGUF VRAM Calculator\n\nCalculate VRAM for GGUF models from GPU layers and context length using an accurate formula.\n\nFor an explanation about how this works, consult this blog post: https://oobabooga.github.io/blog/posts/gguf-vram-formula/")
        with gr.Row():
            with gr.Column():
                # Model URL input
                model_url = gr.Textbox(
                    label="GGUF Model URL",
                    placeholder="https://huggingface.co/bartowski/Meta-Llama-3.1-8B-Instruct-GGUF/blob/main/Meta-Llama-3.1-8B-Instruct-IQ2_M.gguf",
                    value=""
                )

                # Load metadata button
                load_metadata_btn = gr.Button("Load metadata", elem_classes='refresh-button')

                # GPU layers slider
                gpu_layers = gr.Slider(
                    label="GPU Layers",
                    minimum=0,
                    maximum=256,
                    value=256,
                    info='`--gpu-layers` in llama.cpp.'
                )

                # Context size slider
                ctx_size = gr.Slider(
                    label='Context Length',
                    minimum=512,
                    maximum=131072,
                    step=256,
                    value=8192,
                    info='`--ctx-size` in llama.cpp.'
                )

                # Cache type radio buttons
                cache_type = gr.Radio(
                    choices=['fp16', 'q8_0', 'q4_0'],
                    value='fp16',
                    label="Cache Type",
                    info='Cache quantization.'
                )

                # VRAM info display
                vram_info = gr.HTML(
                    value="<div id=\"vram-info\">Estimated VRAM to load the model:</div>"
                )

                # Status display
                status = gr.Textbox(
                    label="Status",
                    value="No model loaded",
                    interactive=False
                )

        # Event handlers
        load_metadata_btn.click(
            load_metadata,
            inputs=[model_url, model_metadata],
            outputs=[model_metadata, gpu_layers, status],
            show_progress=True
        ).then(
            estimate_vram_wrapper,
            inputs=[model_metadata, gpu_layers, ctx_size, cache_type],
            outputs=[vram_info],
            show_progress=False
        )

        # Update VRAM estimate when any parameter changes
        for component in [gpu_layers, ctx_size, cache_type]:
            component.change(
                estimate_vram_wrapper,
                inputs=[model_metadata, gpu_layers, ctx_size, cache_type],
                outputs=[vram_info],
                show_progress=False
            )

        # Also update when model_metadata state changes
        model_metadata.change(
            estimate_vram_wrapper,
            inputs=[model_metadata, gpu_layers, ctx_size, cache_type],
            outputs=[vram_info],
            show_progress=False
        )

    return demo


if __name__ == "__main__":
    # Create and launch the app
    demo = create_ui()
    demo.launch()