Wauplin (HF Staff) committed
Commit 5979e6b · verified · 1 Parent(s): 6cb77de

Upload folder using huggingface_hub

src/openai_patch.ts CHANGED
@@ -9,6 +9,7 @@ import type {
   ResponseOutputText,
 } from "openai/resources/responses/responses";
 
+import type { ChatCompletionChunk } from "openai/resources/chat/completions";
 export interface ReasoningTextContent {
   type: "reasoning_text";
   text: string;
@@ -42,3 +43,7 @@ export type PatchedResponseStreamEvent =
   | PatchedResponseReasoningTextDoneEvent;
 
 export type PatchedResponseContentPart = ResponseOutputText | ResponseOutputRefusal;
+
+export type PatchedDeltaWithReasoning = ChatCompletionChunk.Choice.Delta & {
+  reasoning?: string;
+};
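Not part of this commit: a minimal sketch of how the new PatchedDeltaWithReasoning type is meant to be consumed, narrowing a streamed chunk's delta so the provider-specific reasoning field can be read alongside content. The helper name and relative import path are illustrative assumptions:

import type { ChatCompletionChunk } from "openai/resources/chat/completions";
import type { PatchedDeltaWithReasoning } from "./openai_patch";

// Hypothetical helper: "reasoning" is not in the upstream OpenAI delta type,
// so the chunk's delta is cast to the patched intersection type first.
function readDelta(chunk: ChatCompletionChunk): { mode: "reasoning" | "text"; text: string } | undefined {
  const delta = chunk.choices[0]?.delta as PatchedDeltaWithReasoning | undefined;
  if (delta?.reasoning) return { mode: "reasoning", text: delta.reasoning };
  if (delta?.content) return { mode: "text", text: delta.content };
  return undefined;
}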
src/routes/health.ts ADDED
@@ -0,0 +1,5 @@
+import type { Request, Response } from "express";
+
+export function getHealth(req: Request, res: Response): void {
+  res.send("OK");
+}
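As a quick sanity check (not included in the commit), the handler can be exercised directly with a stubbed response object; the stub below is an assumption, not the project's test setup:

import type { Request, Response } from "express";
import { getHealth } from "./health.js";

// Minimal stub: only send() is implemented, just enough to capture the body.
const sent: string[] = [];
const res = { send: (body: string) => void sent.push(body) } as unknown as Response;

getHealth({} as Request, res);
console.log(sent[0]); // expected: "OK"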
src/routes/index.ts CHANGED
@@ -1,2 +1,3 @@
 export { postCreateResponse } from "./responses.js";
 export { getLandingPageHtml } from "./landingPageHtml.js";
+export { getHealth } from "./health.js";
src/routes/responses.ts CHANGED
@@ -15,16 +15,15 @@ import type {
   PatchedResponseReasoningItem,
   PatchedResponseStreamEvent,
   ReasoningTextContent,
+  PatchedDeltaWithReasoning,
 } from "../openai_patch";
 import type {
   ChatCompletionCreateParamsStreaming,
   ChatCompletionMessageParam,
   ChatCompletionTool,
-  ChatCompletionChunk,
 } from "openai/resources/chat/completions.js";
 import type { FunctionParameters } from "openai/resources/shared.js";
 import { callMcpTool, connectMcpServer } from "../mcp.js";
-import type { Stream } from "openai/core/streaming.js";
 
 class StreamingError extends Error {
   constructor(message: string) {
@@ -36,10 +35,6 @@ class StreamingError extends Error {
 type IncompleteResponse = Omit<Response, "incomplete_details" | "output_text" | "parallel_tool_calls">;
 const SEQUENCE_NUMBER_PLACEHOLDER = -1;
 
-// TODO: this depends on the model. To be adapted.
-const REASONING_START_TOKEN = "<think>";
-const REASONING_END_TOKEN = "</think>";
-
 export const postCreateResponse = async (
   req: ValidatedRequest<CreateResponseParams>,
   res: ExpressResponse
@@ -498,7 +493,7 @@ async function* handleOneTurnStream(
     baseURL: process.env.OPENAI_BASE_URL ?? "https://router.huggingface.co/v1",
     apiKey: apiKey,
   });
-  const stream = wrapChatCompletionStream(await client.chat.completions.create(payload));
+  const stream = await client.chat.completions.create(payload);
   let previousInputTokens = responseObject.usage?.input_tokens ?? 0;
   let previousOutputTokens = responseObject.usage?.output_tokens ?? 0;
   let previousTotalTokens = responseObject.usage?.total_tokens ?? 0;
@@ -516,22 +511,26 @@ async function* handleOneTurnStream(
       };
     }
 
-    const delta = chunk.choices[0].delta;
+    const delta = chunk.choices[0].delta as PatchedDeltaWithReasoning;
 
-    if (delta.content) {
+    if (delta.content || delta.reasoning) {
       let currentOutputItem = responseObject.output.at(-1);
-      let deltaText = delta.content;
 
       // If start or end of reasoning, skip token and update the current text mode
-      if (deltaText === REASONING_START_TOKEN) {
+      if (delta.reasoning) {
+        if (currentTextMode === "text") {
+          for await (const event of closeLastOutputItem(responseObject, payload, mcpToolsMapping)) {
+            yield event;
+          }
+        }
         currentTextMode = "reasoning";
-        continue;
-      } else if (deltaText === REASONING_END_TOKEN) {
-        currentTextMode = "text";
-        for await (const event of closeLastOutputItem(responseObject, payload, mcpToolsMapping)) {
-          yield event;
+      } else if (delta.content) {
+        if (currentTextMode === "reasoning") {
+          for await (const event of closeLastOutputItem(responseObject, payload, mcpToolsMapping)) {
+            yield event;
+          }
         }
-        continue;
+        currentTextMode = "text";
       }
 
       // If start of a new message, create it
@@ -611,7 +610,7 @@ async function* handleOneTurnStream(
         item_id: currentOutputMessage.id,
         output_index: responseObject.output.length - 1,
         content_index: currentOutputMessage.content.length - 1,
-        delta: delta.content,
+        delta: delta.content as string,
         sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
       };
     } else if (currentTextMode === "reasoning") {
@@ -636,13 +635,13 @@ async function* handleOneTurnStream(
 
       // Add text delta
       const contentPart = currentReasoningItem.content.at(-1) as ReasoningTextContent;
-      contentPart.text += delta.content;
+      contentPart.text += delta.reasoning;
       yield {
         type: "response.reasoning_text.delta",
         item_id: currentReasoningItem.id,
        output_index: responseObject.output.length - 1,
         content_index: currentReasoningItem.content.length - 1,
-        delta: delta.content,
+        delta: delta.reasoning as string,
         sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
       };
     }
@@ -992,59 +991,3 @@ async function* closeLastOutputItem(
     }
   }
 }
-
-/*
- * Wrap a chat completion stream to handle reasoning.
- *
- * The reasoning start and end tokens might be sent in a longer text chunk.
- * We want to split that text chunk so that the reasoning token is isolated in a separate chunk.
- *
- * TODO: also adapt for when reasoning token is sent in separate chunks.
- */
-async function* wrapChatCompletionStream(
-  stream: Stream<ChatCompletionChunk & { _request_id?: string | null | undefined }>
-): AsyncGenerator<ChatCompletionChunk & { _request_id?: string | null | undefined }> {
-  function cloneChunkWithContent(baseChunk: ChatCompletionChunk, content: string): ChatCompletionChunk {
-    return {
-      ...baseChunk,
-      choices: [
-        {
-          ...baseChunk.choices[0],
-          delta: {
-            ...baseChunk.choices[0].delta,
-            content,
-          },
-        },
-      ],
-    };
-  }
-
-  function* splitAndYieldChunk(chunk: ChatCompletionChunk, content: string, token: string) {
-    const [beforeContent, afterContent] = content.split(token, 2);
-
-    if (beforeContent) {
-      yield cloneChunkWithContent(chunk, beforeContent);
-    }
-    yield cloneChunkWithContent(chunk, token);
-    if (afterContent) {
-      yield cloneChunkWithContent(chunk, afterContent);
-    }
-  }
-
-  for await (const chunk of stream) {
-    const content = chunk.choices[0].delta.content;
-
-    if (!content) {
-      yield chunk;
-      continue;
-    }
-
-    if (content.includes(REASONING_START_TOKEN)) {
-      yield* splitAndYieldChunk(chunk, content, REASONING_START_TOKEN);
-    } else if (content.includes(REASONING_END_TOKEN)) {
-      yield* splitAndYieldChunk(chunk, content, REASONING_END_TOKEN);
-    } else {
-      yield chunk;
-    }
-  }
-}
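In short, this change stops scanning the content stream for inline <think> / </think> markers (and removes the wrapChatCompletionStream splitter); reasoning is now read from a separate reasoning field on the chunk delta, typed via PatchedDeltaWithReasoning. currentTextMode flips whenever the delta switches between reasoning and content, closing the previous output item at each boundary. A standalone sketch of that switching rule, with a simplified onBoundary callback standing in for closeLastOutputItem (an illustrative assumption, not code from this commit):

type TextMode = "text" | "reasoning";

// Mirrors the new rule: a reasoning delta received while in text mode, or a
// content delta received while in reasoning mode, closes the previous item.
function nextMode(
  current: TextMode,
  delta: { content?: string | null; reasoning?: string },
  onBoundary: () => void
): TextMode {
  if (delta.reasoning) {
    if (current === "text") onBoundary();
    return "reasoning";
  }
  if (delta.content) {
    if (current === "reasoning") onBoundary();
    return "text";
  }
  return current;
}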
src/schemas.ts CHANGED
@@ -101,8 +101,8 @@ export const createResponseParamsSchema = z.object({
     z.object({
       type: z.literal("output_text"),
       text: z.string(),
-      annotations: z.array(z.object({})).optional(), // TODO: incomplete
-      logprobs: z.array(z.object({})).optional(), // TODO: incomplete
+      annotations: z.array(z.object({})).nullable().optional(), // TODO: incomplete
+      logprobs: z.array(z.object({})).nullable().optional(), // TODO: incomplete
     }),
     z.object({
       type: z.literal("refusal"),
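The schema change relaxes output_text items so that annotations and logprobs may be an explicit null as well as omitted. A small zod sketch (not part of the commit) of the difference between .optional() and .nullable().optional():

import { z } from "zod";

const strict = z.array(z.object({})).optional();             // accepts a value or a missing key
const relaxed = z.array(z.object({})).nullable().optional(); // additionally accepts an explicit null

console.log(strict.safeParse(null).success);       // false
console.log(relaxed.safeParse(null).success);      // true
console.log(relaxed.safeParse(undefined).success); // true
console.log(relaxed.safeParse([]).success);        // true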
src/server.ts CHANGED
@@ -2,7 +2,7 @@ import express, { type Express } from "express";
 import { createResponseParamsSchema } from "./schemas.js";
 import { validateBody } from "./middleware/validation.js";
 import { requestLogger } from "./middleware/logging.js";
-import { getLandingPageHtml, postCreateResponse } from "./routes/index.js";
+import { getLandingPageHtml, postCreateResponse, getHealth } from "./routes/index.js";
 
 export const createApp = (): Express => {
   const app: Express = express();
@@ -14,6 +14,8 @@ export const createApp = (): Express => {
   // Routes
   app.get("/", getLandingPageHtml);
 
+  app.get("/health", getHealth);
+
   app.post("/v1/responses", validateBody(createResponseParamsSchema), postCreateResponse);
 
   return app;
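Once the app is created, the new route can be probed end to end; the port below is an arbitrary assumption, not the project's configured one, and Node 18+ global fetch is assumed:

import { createApp } from "./server.js";

// Start the app, hit /health, then shut down.
const server = createApp().listen(3000, async () => {
  const body = await (await fetch("http://localhost:3000/health")).text();
  console.log(body); // expected: "OK"
  server.close();
});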