Upload folder using huggingface_hub
Files changed:
- src/openai_patch.ts      +5  -0
- src/routes/health.ts     +5  -0
- src/routes/index.ts      +1  -0
- src/routes/responses.ts  +19 -76
- src/schemas.ts           +2  -2
- src/server.ts            +3  -1
src/openai_patch.ts
CHANGED
@@ -9,6 +9,7 @@ import type {
   ResponseOutputText,
 } from "openai/resources/responses/responses";
 
+import type { ChatCompletionChunk } from "openai/resources/chat/completions";
 export interface ReasoningTextContent {
   type: "reasoning_text";
   text: string;
@@ -42,3 +43,7 @@ export type PatchedResponseStreamEvent =
   | PatchedResponseReasoningTextDoneEvent;
 
 export type PatchedResponseContentPart = ResponseOutputText | ResponseOutputRefusal;
+
+export type PatchedDeltaWithReasoning = ChatCompletionChunk.Choice.Delta & {
+  reasoning?: string;
+};
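The new PatchedDeltaWithReasoning type widens the standard chat-completion delta with an optional reasoning field. A minimal TypeScript sketch of how a consumer might read it off a streamed chunk; splitDelta is an illustrative helper, not part of this repo, and the import path is assumed:

    import type { ChatCompletionChunk } from "openai/resources/chat/completions";
    import type { PatchedDeltaWithReasoning } from "./openai_patch";

    // Split a streamed delta into reasoning text vs. visible assistant text.
    // Only `reasoning` is added by the patch; the other fields are the stock chunk shape.
    function splitDelta(chunk: ChatCompletionChunk): { reasoning?: string; text?: string } {
      const delta = chunk.choices[0]?.delta as PatchedDeltaWithReasoning | undefined;
      return {
        reasoning: delta?.reasoning,        // extra field carried by the patched type
        text: delta?.content ?? undefined,  // standard assistant text delta
      };
    }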
src/routes/health.ts
ADDED
@@ -0,0 +1,5 @@
+import type { Request, Response } from "express";
+
+export function getHealth(req: Request, res: Response): void {
+  res.send("OK");
+}
src/routes/index.ts
CHANGED
@@ -1,2 +1,3 @@
 export { postCreateResponse } from "./responses.js";
 export { getLandingPageHtml } from "./landingPageHtml.js";
+export { getHealth } from "./health.js";
src/routes/responses.ts
CHANGED
@@ -15,16 +15,15 @@ import type {
   PatchedResponseReasoningItem,
   PatchedResponseStreamEvent,
   ReasoningTextContent,
+  PatchedDeltaWithReasoning,
 } from "../openai_patch";
 import type {
   ChatCompletionCreateParamsStreaming,
   ChatCompletionMessageParam,
   ChatCompletionTool,
-  ChatCompletionChunk,
 } from "openai/resources/chat/completions.js";
 import type { FunctionParameters } from "openai/resources/shared.js";
 import { callMcpTool, connectMcpServer } from "../mcp.js";
-import type { Stream } from "openai/core/streaming.js";
 
 class StreamingError extends Error {
   constructor(message: string) {
@@ -36,10 +35,6 @@ class StreamingError extends Error {
 type IncompleteResponse = Omit<Response, "incomplete_details" | "output_text" | "parallel_tool_calls">;
 const SEQUENCE_NUMBER_PLACEHOLDER = -1;
 
-// TODO: this depends on the model. To be adapted.
-const REASONING_START_TOKEN = "<think>";
-const REASONING_END_TOKEN = "</think>";
-
 export const postCreateResponse = async (
   req: ValidatedRequest<CreateResponseParams>,
   res: ExpressResponse
@@ -498,7 +493,7 @@ async function* handleOneTurnStream(
     baseURL: process.env.OPENAI_BASE_URL ?? "https://router.huggingface.co/v1",
     apiKey: apiKey,
   });
-  const stream =
+  const stream = await client.chat.completions.create(payload);
   let previousInputTokens = responseObject.usage?.input_tokens ?? 0;
   let previousOutputTokens = responseObject.usage?.output_tokens ?? 0;
   let previousTotalTokens = responseObject.usage?.total_tokens ?? 0;
@@ -516,22 +511,26 @@ async function* handleOneTurnStream(
       };
     }
 
-    const delta = chunk.choices[0].delta;
+    const delta = chunk.choices[0].delta as PatchedDeltaWithReasoning;
 
-    if (delta.content) {
+    if (delta.content || delta.reasoning) {
       let currentOutputItem = responseObject.output.at(-1);
-      let deltaText = delta.content;
 
       // If start or end of reasoning, skip token and update the current text mode
-      if (
+      if (delta.reasoning) {
+        if (currentTextMode === "text") {
+          for await (const event of closeLastOutputItem(responseObject, payload, mcpToolsMapping)) {
+            yield event;
+          }
+        }
         currentTextMode = "reasoning";
-
-
-
-
-
+      } else if (delta.content) {
+        if (currentTextMode === "reasoning") {
+          for await (const event of closeLastOutputItem(responseObject, payload, mcpToolsMapping)) {
+            yield event;
+          }
        }
-
+        currentTextMode = "text";
      }
 
      // If start of a new message, create it
@@ -611,7 +610,7 @@ async function* handleOneTurnStream(
         item_id: currentOutputMessage.id,
         output_index: responseObject.output.length - 1,
         content_index: currentOutputMessage.content.length - 1,
-        delta: delta.content,
+        delta: delta.content as string,
         sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
       };
     } else if (currentTextMode === "reasoning") {
@@ -636,13 +635,13 @@ async function* handleOneTurnStream(
 
       // Add text delta
       const contentPart = currentReasoningItem.content.at(-1) as ReasoningTextContent;
-      contentPart.text += delta.
+      contentPart.text += delta.reasoning;
       yield {
         type: "response.reasoning_text.delta",
         item_id: currentReasoningItem.id,
         output_index: responseObject.output.length - 1,
         content_index: currentReasoningItem.content.length - 1,
-        delta: delta.
+        delta: delta.reasoning as string,
         sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
       };
     }
@@ -992,59 +991,3 @@ async function* closeLastOutputItem(
     }
   }
 }
-
-/*
- * Wrap a chat completion stream to handle reasoning.
- *
- * The reasoning start and end tokens might be sent in a longer text chunk.
- * We want to split that text chunk so that the reasoning token is isolated in a separate chunk.
- *
- * TODO: also adapt for when reasoning token is sent in separate chunks.
- */
-async function* wrapChatCompletionStream(
-  stream: Stream<ChatCompletionChunk & { _request_id?: string | null | undefined }>
-): AsyncGenerator<ChatCompletionChunk & { _request_id?: string | null | undefined }> {
-  function cloneChunkWithContent(baseChunk: ChatCompletionChunk, content: string): ChatCompletionChunk {
-    return {
-      ...baseChunk,
-      choices: [
-        {
-          ...baseChunk.choices[0],
-          delta: {
-            ...baseChunk.choices[0].delta,
-            content,
-          },
-        },
-      ],
-    };
-  }
-
-  function* splitAndYieldChunk(chunk: ChatCompletionChunk, content: string, token: string) {
-    const [beforeContent, afterContent] = content.split(token, 2);
-
-    if (beforeContent) {
-      yield cloneChunkWithContent(chunk, beforeContent);
-    }
-    yield cloneChunkWithContent(chunk, token);
-    if (afterContent) {
-      yield cloneChunkWithContent(chunk, afterContent);
-    }
-  }
-
-  for await (const chunk of stream) {
-    const content = chunk.choices[0].delta.content;
-
-    if (!content) {
-      yield chunk;
-      continue;
-    }
-
-    if (content.includes(REASONING_START_TOKEN)) {
-      yield* splitAndYieldChunk(chunk, content, REASONING_START_TOKEN);
-    } else if (content.includes(REASONING_END_TOKEN)) {
-      yield* splitAndYieldChunk(chunk, content, REASONING_END_TOKEN);
-    } else {
-      yield chunk;
-    }
-  }
-}
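Net effect of the responses.ts changes: reasoning text is no longer recovered by scanning delta.content for <think> / </think> markers (hence the removal of wrapChatCompletionStream and the token constants); it arrives on a dedicated delta.reasoning field and the handler just switches between text and reasoning modes. A simplified sketch of that routing rule in isolation; routeDelta and onModeChange are illustrative names and the callback stands in for closeLastOutputItem(...):

    import type { PatchedDeltaWithReasoning } from "../openai_patch";

    type TextMode = "text" | "reasoning";

    // Decide which mode the next delta belongs to and signal when the mode flips,
    // mirroring the if/else chain in the hunk above (simplified, not the real loop).
    function routeDelta(
      delta: PatchedDeltaWithReasoning,
      currentTextMode: TextMode,
      onModeChange: (from: TextMode, to: TextMode) => void
    ): TextMode {
      if (delta.reasoning) {
        // Reasoning now comes on its own field; no token parsing needed.
        if (currentTextMode === "text") onModeChange("text", "reasoning");
        return "reasoning";
      }
      if (delta.content) {
        if (currentTextMode === "reasoning") onModeChange("reasoning", "text");
        return "text";
      }
      return currentTextMode;
    }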
src/schemas.ts
CHANGED
@@ -101,8 +101,8 @@ export const createResponseParamsSchema = z.object({
       z.object({
         type: z.literal("output_text"),
         text: z.string(),
-        annotations: z.array(z.object({})).optional(), // TODO: incomplete
-        logprobs: z.array(z.object({})).optional(), // TODO: incomplete
+        annotations: z.array(z.object({})).nullable().optional(), // TODO: incomplete
+        logprobs: z.array(z.object({})).nullable().optional(), // TODO: incomplete
       }),
       z.object({
         type: z.literal("refusal"),
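The switch to .nullable().optional() presumably lets clients send an explicit null for annotations and logprobs instead of having to omit the keys. A small sketch of the zod behavior the change relies on (standalone, not code from this repo):

    import { z } from "zod";

    // .optional() alone rejects an explicit null; .nullable().optional() accepts
    // null, undefined, or a real array.
    const onlyOptional = z.array(z.object({})).optional();
    const nullableOptional = z.array(z.object({})).nullable().optional();

    console.log(onlyOptional.safeParse(null).success);          // false
    console.log(nullableOptional.safeParse(null).success);      // true
    console.log(nullableOptional.safeParse(undefined).success); // true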
src/server.ts
CHANGED
@@ -2,7 +2,7 @@ import express, { type Express } from "express";
 import { createResponseParamsSchema } from "./schemas.js";
 import { validateBody } from "./middleware/validation.js";
 import { requestLogger } from "./middleware/logging.js";
-import { getLandingPageHtml, postCreateResponse } from "./routes/index.js";
+import { getLandingPageHtml, postCreateResponse, getHealth } from "./routes/index.js";
 
 export const createApp = (): Express => {
   const app: Express = express();
@@ -14,6 +14,8 @@ export const createApp = (): Express => {
   // Routes
   app.get("/", getLandingPageHtml);
 
+  app.get("/health", getHealth);
+
   app.post("/v1/responses", validateBody(createResponseParamsSchema), postCreateResponse);
 
   return app;
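A quick way to exercise the new route, sketched with supertest; supertest is assumed as a dev dependency and is not added by this commit:

    import request from "supertest"; // assumed dev dependency, not part of this commit
    import { createApp } from "./server.js";

    // Smoke-check the health endpoint wired up above.
    async function checkHealth(): Promise<void> {
      const res = await request(createApp()).get("/health");
      console.log(res.status, res.text); // expected: 200 "OK"
    }

    checkHealth().catch(console.error);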