import React, { useState, useEffect, useRef } from 'react'
import * as d3 from 'd3'
import { useTheme } from '../context/themeContext'
import MODELS from '../utils/models'
import DEVICES from '../utils/devices'

// Quantization precisions the charts understand; used to scale memory estimates.
type Precision = 'fp32' | 'fp16' | 'int8' | 'int4'

/**
 * Props for the bar chart that compares one model's memory footprint
 * against the largest model (at fp32) and, optionally, a device's memory.
 */
interface ModelSizeBarChartProps {
  modelSize: number // in GB
  largestModelSize: number // largest model in full precision (fp32)
  modelPrecision: Precision // enum of fp32, fp16, int8, int4
  deviceMemorySet: boolean // true if device memory is set
}

/**
 * Props for the line chart plotting inference-time memory headroom:
 * how many inputs fit given per-precision available memory.
 */
interface InferenceRuntimeLineChartProps {
  availableMemory: AvailableMemory // in GB
  memoryPerInput: number // in GB
}

// One point on the runtime line chart: max batch size at a given sequence length.
interface LineChartData {
  seqLength: number
  batchSize: number
}

// Memory (GB) left over for inference at each supported precision.
interface AvailableMemory {
  int4: number
  int8: number
  fp16: number
  fp32: number
}

// Table containing the mapping of backends to precisions
// NOTE(review): the JSX body of this component appears garbled in this
// extraction — only its rendered text survives below; recover the original
// markup from version control before editing the component itself.
const BackendPrecisionTable = () => {
  return (
Backend | GPU | CPU | Accuracy |
---|---|---|---|
fast | 16 | 16 | ⭐⭐⭐ |
compress-fast | 4 | 8 | ⭐⭐ |
compress | 4 | 4 | ⭐ |
baseline | 16 | 16 | ⭐⭐⭐ |
`prompt_new_tokens` for input tokens and `max_new_tokens` for output tokens when making a request.
`TAKEOFF_MAX_BATCH_SIZE` to your desired value.