File size: 5,643 Bytes
b924465 0ff4ef8 283bd45 b924465 8c7e6f1 f977d49 573aa88 2ac97e2 b61328c 8c7e6f1 b61328c 573aa88 b61328c 60216ec 573aa88 60216ec 5f94ff7 b61328c 5f94ff7 9ab40fd dcf3974 5f94ff7 631cc27 5f94ff7 f977d49 8c7e6f1 f977d49 b61328c 631cc27 b61328c 573aa88 b61328c 60216ec 573aa88 631cc27 9ab40fd 631cc27 8c7e6f1 f977d49 7716903 51a1671 f977d49 60216ec 8c7e6f1 dd66861 b924465 e8b5344 dd66861 d47c403 e8b5344 b924465 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
import type { Conversation, ModelWithTokenizer } from "$lib/types";
import type { InferenceSnippet } from "@huggingface/tasks";
import { type ChatCompletionOutputMessage } from "@huggingface/tasks";
import { HfInference, snippets, type InferenceProvider } from "@huggingface/inference";
export async function handleStreamingResponse(
	hf: HfInference,
	conversation: Conversation,
	onChunk: (content: string) => void,
	abortController: AbortController
): Promise<void> {
	const { model, systemMessage } = conversation;

	// Prepend the system message only when the model's chat template supports
	// it and the prompt text is non-empty.
	const systemPrefix =
		isSystemPromptSupported(model) && systemMessage.content?.length ? [systemMessage] : [];
	const messages = [...systemPrefix, ...conversation.messages];

	const stream = hf.chatCompletionStream(
		{
			model: model.id,
			messages,
			provider: conversation.provider,
			...conversation.config,
		},
		{ signal: abortController.signal }
	);

	// Accumulate the generation so far: onChunk always receives the full text,
	// not just the latest delta.
	let accumulated = "";
	for await (const chunk of stream) {
		const delta = chunk.choices?.[0]?.delta?.content;
		if (delta) {
			accumulated += delta;
			onChunk(accumulated);
		}
	}
}
/**
 * Runs a single (non-streaming) chat completion for `conversation` and
 * returns the model's message plus the completion token count from usage.
 *
 * @throws Error when the response contains no choices.
 */
export async function handleNonStreamingResponse(
	hf: HfInference,
	conversation: Conversation
): Promise<{ message: ChatCompletionOutputMessage; completion_tokens: number }> {
	const { model, systemMessage } = conversation;
	// Prepend the system message only when the model's chat template supports
	// it and the prompt text is non-empty.
	const messages = [
		...(isSystemPromptSupported(model) && systemMessage.content?.length ? [systemMessage] : []),
		...conversation.messages,
	];

	const response = await hf.chatCompletion({
		model: model.id,
		messages,
		provider: conversation.provider,
		...conversation.config,
	});

	// Guard the first choice directly rather than a length check followed by a
	// non-null assertion; an empty `choices` array falls through to the throw.
	const choice = response.choices?.[0];
	if (choice) {
		return { message: choice.message, completion_tokens: response.usage.completion_tokens };
	}
	throw new Error("No response from the model");
}
/**
 * Whether `model`'s chat template mentions the "system" role, i.e. whether a
 * system prompt can be prepended to the conversation.
 *
 * Coerces the optional chain to a real boolean instead of leaking
 * `boolean | undefined` to callers.
 */
export function isSystemPromptSupported(model: ModelWithTokenizer): boolean {
	return Boolean(model?.tokenizerConfig?.chat_template?.includes("system"));
}
/**
 * Per-model default system prompts, keyed by model id.
 *
 * Note: the previous `as const` suffix was a no-op — the explicit index
 * signature annotation already widens the literal — so it has been dropped.
 */
export const defaultSystemMessage: { [key: string]: string } = {
	"Qwen/QwQ-32B-Preview":
		"You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.",
};
/**
 * Manual max-token overrides, keyed by model id, for models whose limit is
 * not otherwise available.
 *
 * Note: the previous `as const` suffix was a no-op — the explicit index
 * signature annotation already widens the literals — so it has been dropped.
 */
export const customMaxTokens: { [key: string]: number } = {
	"01-ai/Yi-1.5-34B-Chat": 2048,
	"HuggingFaceM4/idefics-9b-instruct": 2048,
	"deepseek-ai/DeepSeek-Coder-V2-Instruct": 16384,
	"bigcode/starcoder": 8192,
	"bigcode/starcoderplus": 8192,
	"HuggingFaceH4/starcoderbase-finetuned-oasst1": 8192,
	"google/gemma-7b": 8192,
	"google/gemma-1.1-7b-it": 8192,
	"google/gemma-2b": 8192,
	"google/gemma-1.1-2b-it": 8192,
	"google/gemma-2-27b-it": 8192,
	"google/gemma-2-9b-it": 4096,
	"google/gemma-2-2b-it": 8192,
	"tiiuae/falcon-7b": 8192,
	"tiiuae/falcon-7b-instruct": 8192,
	"timdettmers/guanaco-33b-merged": 2048,
	"mistralai/Mixtral-8x7B-Instruct-v0.1": 32768,
	"Qwen/Qwen2.5-72B-Instruct": 32768,
	"Qwen/Qwen2.5-Coder-32B-Instruct": 32768,
	"meta-llama/Meta-Llama-3-70B-Instruct": 8192,
	"CohereForAI/c4ai-command-r-plus-08-2024": 32768,
	"NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO": 32768,
	"meta-llama/Llama-2-70b-chat-hf": 8192,
	"HuggingFaceH4/zephyr-7b-alpha": 17432,
	"HuggingFaceH4/zephyr-7b-beta": 32768,
	"mistralai/Mistral-7B-Instruct-v0.1": 32768,
	"mistralai/Mistral-7B-Instruct-v0.2": 32768,
	"mistralai/Mistral-7B-Instruct-v0.3": 32768,
	"mistralai/Mistral-Nemo-Instruct-2407": 32768,
	"meta-llama/Meta-Llama-3-8B-Instruct": 8192,
	"mistralai/Mistral-7B-v0.1": 32768,
	"bigcode/starcoder2-3b": 16384,
	"bigcode/starcoder2-15b": 16384,
	"HuggingFaceH4/starchat2-15b-v0.1": 16384,
	"codellama/CodeLlama-7b-hf": 8192,
	"codellama/CodeLlama-13b-hf": 8192,
	"codellama/CodeLlama-34b-Instruct-hf": 8192,
	"meta-llama/Llama-2-7b-chat-hf": 8192,
	"meta-llama/Llama-2-13b-chat-hf": 8192,
	"OpenAssistant/oasst-sft-6-llama-30b": 2048,
	"TheBloke/vicuna-7B-v1.5-GPTQ": 2048,
	"HuggingFaceH4/starchat-beta": 8192,
	"bigcode/octocoder": 8192,
	"vwxyzjn/starcoderbase-triviaqa": 8192,
	"lvwerra/starcoderbase-gsm8k": 8192,
	"NousResearch/Hermes-3-Llama-3.1-8B": 16384,
	"microsoft/Phi-3.5-mini-instruct": 32768,
	"meta-llama/Llama-3.1-70B-Instruct": 32768,
	"meta-llama/Llama-3.1-8B-Instruct": 8192,
};
// Order of the elements in InferenceModal.svelte is determined by this const
export const inferenceSnippetLanguages = ["python", "js", "curl"] as const;
// Union of the literal language ids above: "python" | "js" | "curl".
export type InferenceSnippetLanguage = (typeof inferenceSnippetLanguages)[number];
// Per-language snippet generators from @huggingface/inference, keyed by
// InferenceSnippetLanguage so callers can dispatch on a `language` value.
const GET_SNIPPET_FN = {
	curl: snippets.curl.getCurlInferenceSnippet,
	js: snippets.js.getJsInferenceSnippet,
	python: snippets.python.getPythonInferenceSnippet,
} as const;
export type GetInferenceSnippetReturn = (InferenceSnippet & { language: InferenceSnippetLanguage })[];

/**
 * Generates the inference code snippets for `model` on `provider` in the
 * requested language, tagging each returned snippet with that language.
 */
export function getInferenceSnippet(
	model: ModelWithTokenizer,
	provider: InferenceProvider,
	language: InferenceSnippetLanguage,
	accessToken: string,
	opts?: Record<string, unknown>
): GetInferenceSnippetReturn {
	// Resolve the provider-specific model id, if this provider serves the model.
	const mapping = model.inferenceProviderMapping.find(entry => entry.provider === provider);
	const generate = GET_SNIPPET_FN[language];
	const generated = generate({ ...model, inference: "" }, accessToken, provider, mapping?.providerId, opts);
	return generated.map(snippet => ({ ...snippet, language }));
}
/**
 * Returns true when at least one inference snippet can be generated for
 * `model` on `provider` in the given `language`.
 */
export function hasInferenceSnippet(
	model: ModelWithTokenizer,
	provider: InferenceProvider,
	language: InferenceSnippetLanguage
): boolean {
	const generated = getInferenceSnippet(model, provider, language, "");
	return generated.length > 0;
}
|