# inference.py
from typing import List, Dict, Optional

from hf_client import get_inference_client
from models import find_model


def chat_completion(
    model_id: str,
    messages: List[Dict[str, str]],
    provider: Optional[str] = None,
    max_tokens: int = 4096,
) -> str:
    """
    Send a chat completion request to the appropriate inference provider.

    Args:
        model_id: The model identifier to use.
        messages: A list of OpenAI-style {'role', 'content'} messages.
        provider: Optional override for provider; uses the model default if None.
        max_tokens: Maximum tokens to generate.

    Returns:
        The assistant's response content.
    """
    # resolve default provider from registry if needed
    if provider is None:
        meta = find_model(model_id)
        provider = meta.default_provider if meta else "auto"

    client = get_inference_client(model_id, provider)
    resp = client.chat.completions.create(
        model=model_id,
        messages=messages,
        max_tokens=max_tokens,
    )
    return resp.choices[0].message.content


def stream_chat_completion(
    model_id: str,
    messages: List[Dict[str, str]],
    provider: Optional[str] = None,
    max_tokens: int = 4096,
):
    """
    Generator for streaming chat completions.

    Yields partial message chunks as strings.
    """
    # resolve default provider from registry if needed
    if provider is None:
        meta = find_model(model_id)
        provider = meta.default_provider if meta else "auto"

    client = get_inference_client(model_id, provider)
    stream = client.chat.completions.create(
        model=model_id,
        messages=messages,
        max_tokens=max_tokens,
        stream=True,
    )
    for chunk in stream:
        delta = getattr(chunk.choices[0].delta, "content", None)
        if delta:
            yield delta
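

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of calling chat_completion and stream_chat_completion.
# It assumes hf_client and models are importable as above; the model id
# "example-org/example-model" is a hypothetical placeholder, so substitute
# any model id known to your models registry.
if __name__ == "__main__":
    example_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Briefly explain what an inference provider is."},
    ]

    # Blocking call: returns the full assistant reply as a single string.
    reply = chat_completion("example-org/example-model", example_messages)
    print(reply)

    # Streaming call: prints partial chunks as they arrive.
    for piece in stream_chat_completion("example-org/example-model", example_messages):
        print(piece, end="", flush=True)
    print()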