Upload folder using huggingface_hub

- src/openai_patch.ts +44 -0
- src/routes/landingPageHtml.ts +22 -0
- src/routes/responses.ts +321 -141
- src/schemas.ts +4 -4
src/openai_patch.ts
ADDED
@@ -0,0 +1,44 @@
+/*
+ * This file is a patch to the openai library to add support for the reasoning parameter.
+ * Once openai's official JS SDK supports sending back raw CoT, we will remove this file.
+ */
+import type {
+  ResponseReasoningItem as OpenAIResponseReasoningItem,
+  ResponseStreamEvent as OpenAIResponseStreamEvent,
+  ResponseOutputRefusal,
+  ResponseOutputText,
+} from "openai/resources/responses/responses";
+
+export interface ReasoningTextContent {
+  type: "reasoning_text";
+  text: string;
+}
+export type PatchedResponseReasoningItem = OpenAIResponseReasoningItem & {
+  // Raw CoT returned in the reasoning item (in addition to the summary)
+  content: ReasoningTextContent[];
+};
+
+interface PatchedResponseReasoningTextDeltaEvent {
+  type: "response.reasoning_text.delta";
+  sequence_number: number;
+  item_id: string;
+  output_index: number;
+  content_index: number;
+  delta: string;
+}
+
+interface PatchedResponseReasoningTextDoneEvent {
+  type: "response.reasoning_text.done";
+  sequence_number: number;
+  item_id: string;
+  output_index: number;
+  content_index: number;
+  text: string;
+}
+
+export type PatchedResponseStreamEvent =
+  | OpenAIResponseStreamEvent
+  | PatchedResponseReasoningTextDeltaEvent
+  | PatchedResponseReasoningTextDoneEvent;
+
+export type PatchedResponseContentPart = ResponseOutputText | ResponseOutputRefusal;
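For orientation, the patched union above can be narrowed on its `type` discriminant exactly like the stock SDK events. A minimal consumer sketch (the `logEvent` helper is hypothetical and not part of this diff):

import type { PatchedResponseStreamEvent } from "./openai_patch";

// Hypothetical helper: route the patched reasoning events separately from the regular SDK events.
function logEvent(event: PatchedResponseStreamEvent): void {
  switch (event.type) {
    case "response.reasoning_text.delta":
      // Raw CoT text streamed by the patch
      process.stdout.write(event.delta);
      break;
    case "response.reasoning_text.done":
      console.log(`\n[reasoning done after ${event.text.length} chars]`);
      break;
    default:
      // Every other event type is already covered by openai's ResponseStreamEvent
      console.log(event.type);
  }
}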
src/routes/landingPageHtml.ts
CHANGED
@@ -502,6 +502,7 @@ export function getLandingPageHtml(req: Request, res: Response): void {
           <button class="examples-tab" type="button">Function Calling</button>
           <button class="examples-tab" type="button">Structured Output</button>
           <button class="examples-tab" type="button">MCP</button>
+          <button class="examples-tab" type="button">Reasoning</button>
         </div>
         <div class="example-panel active">
           <pre><button class="copy-btn" onclick="copyCode(this)">Copy</button><code class="language-python">from openai import OpenAI
@@ -687,6 +688,27 @@ response = client.responses.create(
 for output in response.output:
     print(output)</code></pre>
         </div>
+        <div class="example-panel">
+          <pre><button class="copy-btn" onclick="copyCode(this)">Copy</button><code class="language-python">from openai import OpenAI
+import os
+
+client = OpenAI(
+    base_url="${baseUrl}",
+    api_key=os.getenv("HF_TOKEN"), # visit https://huggingface.co/settings/tokens
+)
+
+response = client.responses.create(
+    model="deepseek-ai/DeepSeek-R1",
+    instructions="You are a helpful assistant.",
+    input="Say hello to the world.",
+    reasoning={
+        "effort": "low",
+    }
+)
+
+for index, item in enumerate(response.output):
+    print(f"Output #{index}: {item.type}", item.content)</code></pre>
+        </div>
       </section>
       <footer class="more-info-footer">
         <div style="font-weight:600; color:var(--primary-dark); font-size:1.13em; margin-bottom:0.5em;">More Info</div>
src/routes/responses.ts
CHANGED
@@ -5,19 +5,26 @@ import { generateUniqueId } from "../lib/generateUniqueId.js";
 import { OpenAI } from "openai";
 import type {
   Response,
-  ResponseStreamEvent,
   ResponseContentPartAddedEvent,
   ResponseOutputMessage,
   ResponseFunctionToolCall,
   ResponseOutputItem,
 } from "openai/resources/responses/responses";
+import type {
+  PatchedResponseContentPart,
+  PatchedResponseReasoningItem,
+  PatchedResponseStreamEvent,
+  ReasoningTextContent,
+} from "../openai_patch";
 import type {
   ChatCompletionCreateParamsStreaming,
   ChatCompletionMessageParam,
   ChatCompletionTool,
+  ChatCompletionChunk,
 } from "openai/resources/chat/completions.js";
 import type { FunctionParameters } from "openai/resources/shared.js";
 import { callMcpTool, connectMcpServer } from "../mcp.js";
+import type { Stream } from "openai/core/streaming.js";
 
 class StreamingError extends Error {
   constructor(message: string) {
@@ -29,6 +36,10 @@ class StreamingError extends Error {
 type IncompleteResponse = Omit<Response, "incomplete_details" | "output_text" | "parallel_tool_calls">;
 const SEQUENCE_NUMBER_PLACEHOLDER = -1;
 
+// TODO: this depends on the model. To be adapted.
+const REASONING_START_TOKEN = "<think>";
+const REASONING_END_TOKEN = "</think>";
+
 export const postCreateResponse = async (
   req: ValidatedRequest<CreateResponseParams>,
   res: ExpressResponse
@@ -66,7 +77,7 @@
 async function* runCreateResponseStream(
   req: ValidatedRequest<CreateResponseParams>,
   res: ExpressResponse
-): AsyncGenerator<ResponseStreamEvent> {
+): AsyncGenerator<PatchedResponseStreamEvent> {
   let sequenceNumber = 0;
   // Prepare response object that will be iteratively populated
   const responseObject: IncompleteResponse = {
@@ -147,7 +158,7 @@
   req: ValidatedRequest<CreateResponseParams>,
   res: ExpressResponse,
   responseObject: IncompleteResponse
-): AsyncGenerator<ResponseStreamEvent> {
+): AsyncGenerator<PatchedResponseStreamEvent> {
   // Retrieve API key from headers
   const apiKey = req.headers.authorization?.split(" ")[1];
   if (!apiKey) {
@@ -158,6 +169,11 @@
     return;
   }
 
+  // Return early if not supported param
+  if (req.body.reasoning?.summary && req.body.reasoning?.summary !== "auto") {
+    throw new Error(`Not implemented: only 'auto' summary is supported. Got '${req.body.reasoning?.summary}'`);
+  }
+
   // List MCP tools from server (if required) + prepare tools for the LLM
   let tools: ChatCompletionTool[] | undefined = [];
   const mcpToolsMapping: Record<string, McpServerParams> = {};
@@ -351,6 +367,7 @@
         }
         : { type: req.body.text.format.type }
       : undefined,
+    reasoning_effort: req.body.reasoning?.effort,
     temperature: req.body.temperature,
     tool_choice:
       typeof req.body.tool_choice === "string"
@@ -417,7 +434,7 @@
 async function* listMcpToolsStream(
   tool: McpServerParams,
   responseObject: IncompleteResponse
-): AsyncGenerator<ResponseStreamEvent> {
+): AsyncGenerator<PatchedResponseStreamEvent> {
   const outputObject: ResponseOutputItem.McpListTools = {
     id: generateUniqueId("mcpl"),
     type: "mcp_list_tools",
@@ -476,15 +493,16 @@
   payload: ChatCompletionCreateParamsStreaming,
   responseObject: IncompleteResponse,
   mcpToolsMapping: Record<string, McpServerParams>
-): AsyncGenerator<ResponseStreamEvent> {
+): AsyncGenerator<PatchedResponseStreamEvent> {
   const client = new OpenAI({
     baseURL: process.env.OPENAI_BASE_URL ?? "https://router.huggingface.co/v1",
     apiKey: apiKey,
   });
-  const stream = await client.chat.completions.create(payload);
+  const stream = wrapChatCompletionStream(await client.chat.completions.create(payload));
   let previousInputTokens = responseObject.usage?.input_tokens ?? 0;
   let previousOutputTokens = responseObject.usage?.output_tokens ?? 0;
   let previousTotalTokens = responseObject.usage?.total_tokens ?? 0;
+  let currentTextMode: "text" | "reasoning" = "text";
 
   for await (const chunk of stream) {
     if (chunk.usage) {
@@ -502,65 +520,132 @@
 
     if (delta.content) {
       let currentOutputItem = responseObject.output.at(-1);
+      let deltaText = delta.content;
+
+      // If start or end of reasoning, skip token and update the current text mode
+      if (deltaText === REASONING_START_TOKEN) {
+        currentTextMode = "reasoning";
+        continue;
+      } else if (deltaText === REASONING_END_TOKEN) {
+        currentTextMode = "text";
+        for await (const event of closeLastOutputItem(responseObject, payload, mcpToolsMapping)) {
+          yield event;
+        }
+        continue;
+      }
 
       // If start of a new message, create it
-      … (previous message-creation block; removed lines truncated in source)
+      if (currentTextMode === "text") {
+        if (currentOutputItem?.type !== "message" || currentOutputItem?.status !== "in_progress") {
+          const outputObject: ResponseOutputMessage = {
+            id: generateUniqueId("msg"),
+            type: "message",
+            role: "assistant",
+            status: "in_progress",
+            content: [],
+          };
+          responseObject.output.push(outputObject);
+
+          // Response output item added event
+          yield {
+            type: "response.output_item.added",
+            output_index: 0,
+            item: outputObject,
+            sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+          };
+        }
+      } else if (currentTextMode === "reasoning") {
+        if (currentOutputItem?.type !== "reasoning" || currentOutputItem?.status !== "in_progress") {
+          const outputObject: PatchedResponseReasoningItem = {
+            id: generateUniqueId("rs"),
+            type: "reasoning",
+            status: "in_progress",
+            content: [],
+            summary: [],
+          };
+          responseObject.output.push(outputObject);
+
+          // Response output item added event
+          yield {
+            type: "response.output_item.added",
+            output_index: 0,
+            item: outputObject,
+            sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+          };
+        }
       }
 
       // If start of a new content part, create it
-      … (previous content-part creation and output_text delta handling; removed lines truncated in source)
+      if (currentTextMode === "text") {
+        const currentOutputMessage = responseObject.output.at(-1) as ResponseOutputMessage;
+        if (currentOutputMessage.content.length === 0) {
+          // Response content part added event
+          const contentPart: ResponseContentPartAddedEvent["part"] = {
+            type: "output_text",
+            text: "",
+            annotations: [],
+          };
+          currentOutputMessage.content.push(contentPart);
+
+          yield {
+            type: "response.content_part.added",
+            item_id: currentOutputMessage.id,
+            output_index: responseObject.output.length - 1,
+            content_index: currentOutputMessage.content.length - 1,
+            part: contentPart,
+            sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+          };
+        }
+
+        const contentPart = currentOutputMessage.content.at(-1);
+        if (!contentPart || contentPart.type !== "output_text") {
+          throw new StreamingError(
+            `Not implemented: only output_text is supported in response.output[].content[].type. Got ${contentPart?.type}`
+          );
+        }
+
+        // Add text delta
+        contentPart.text += delta.content;
+        yield {
+          type: "response.output_text.delta",
+          item_id: currentOutputMessage.id,
+          output_index: responseObject.output.length - 1,
+          content_index: currentOutputMessage.content.length - 1,
+          delta: delta.content,
+          sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+        };
+      } else if (currentTextMode === "reasoning") {
+        const currentReasoningItem = responseObject.output.at(-1) as PatchedResponseReasoningItem;
+        if (currentReasoningItem.content.length === 0) {
+          // Response content part added event
+          const contentPart: ReasoningTextContent = {
+            type: "reasoning_text",
+            text: "",
+          };
+          currentReasoningItem.content.push(contentPart);
+
+          yield {
+            type: "response.content_part.added",
+            item_id: currentReasoningItem.id,
+            output_index: responseObject.output.length - 1,
+            content_index: currentReasoningItem.content.length - 1,
+            part: contentPart as unknown as PatchedResponseContentPart, // TODO: adapt once openai-node is updated
+            sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+          };
+        }
+
+        // Add text delta
+        const contentPart = currentReasoningItem.content.at(-1) as ReasoningTextContent;
+        contentPart.text += delta.content;
+        yield {
+          type: "response.reasoning_text.delta",
+          item_id: currentReasoningItem.id,
+          output_index: responseObject.output.length - 1,
+          content_index: currentReasoningItem.content.length - 1,
+          delta: delta.content,
+          sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+        };
+      }
     } else if (delta.tool_calls && delta.tool_calls.length > 0) {
       if (delta.tool_calls.length > 1) {
         console.log("Multiple tool calls are not supported. Only the first one will be processed.");
@@ -643,6 +728,117 @@
     }
   }
 
+  for await (const event of closeLastOutputItem(responseObject, payload, mcpToolsMapping)) {
+    yield event;
+  }
+}
+
+/*
+ * Perform an approved MCP tool call and stream the response.
+ */
+async function* callApprovedMCPToolStream(
+  approval_request_id: string,
+  mcpCallId: string,
+  approvalRequest: McpApprovalRequestParams | undefined,
+  mcpToolsMapping: Record<string, McpServerParams>,
+  responseObject: IncompleteResponse,
+  payload: ChatCompletionCreateParamsStreaming
+): AsyncGenerator<PatchedResponseStreamEvent> {
+  if (!approvalRequest) {
+    throw new Error(`MCP approval request '${approval_request_id}' not found`);
+  }
+
+  const outputObject: ResponseOutputItem.McpCall = {
+    type: "mcp_call",
+    id: mcpCallId,
+    name: approvalRequest.name,
+    server_label: approvalRequest.server_label,
+    arguments: approvalRequest.arguments,
+  };
+  responseObject.output.push(outputObject);
+
+  // Response output item added event
+  yield {
+    type: "response.output_item.added",
+    output_index: responseObject.output.length - 1,
+    item: outputObject,
+    sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+  };
+
+  yield {
+    type: "response.mcp_call.in_progress",
+    item_id: outputObject.id,
+    output_index: responseObject.output.length - 1,
+    sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+  };
+
+  const toolParams = mcpToolsMapping[approvalRequest.name];
+  const toolResult = await callMcpTool(toolParams, approvalRequest.name, approvalRequest.arguments);
+
+  if (toolResult.error) {
+    outputObject.error = toolResult.error;
+    yield {
+      type: "response.mcp_call.failed",
+      sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+    };
+  } else {
+    outputObject.output = toolResult.output;
+    yield {
+      type: "response.mcp_call.completed",
+      sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+    };
+  }
+
+  yield {
+    type: "response.output_item.done",
+    output_index: responseObject.output.length - 1,
+    item: outputObject,
+    sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+  };
+
+  // Updating the payload for next LLM call
+  payload.messages.push(
+    {
+      role: "assistant",
+      tool_calls: [
+        {
+          id: outputObject.id,
+          type: "function",
+          function: {
+            name: outputObject.name,
+            arguments: outputObject.arguments,
+            // Hacky: type is not correct in inference.js. Will fix it but in the meantime we need to cast it.
+            // TODO: fix it in the inference.js package. Should be "arguments" and not "parameters".
+          },
+        },
+      ],
+    },
+    {
+      role: "tool",
+      tool_call_id: outputObject.id,
+      content: outputObject.output ? outputObject.output : outputObject.error ? `Error: ${outputObject.error}` : "",
+    }
+  );
+}
+
+function requiresApproval(toolName: string, mcpToolsMapping: Record<string, McpServerParams>): boolean {
+  const toolParams = mcpToolsMapping[toolName];
+  return toolParams.require_approval === "always"
+    ? true
+    : toolParams.require_approval === "never"
+      ? false
+      : toolParams.require_approval.always?.tool_names?.includes(toolName)
+        ? true
+        : toolParams.require_approval.never?.tool_names?.includes(toolName)
+          ? false
+          : true; // behavior is undefined in specs, let's default to true
+}
+
+async function* closeLastOutputItem(
+  responseObject: IncompleteResponse,
+  payload: ChatCompletionCreateParamsStreaming,
+  mcpToolsMapping: Record<string, McpServerParams>
+): AsyncGenerator<PatchedResponseStreamEvent> {
   const lastOutputItem = responseObject.output.at(-1);
   if (lastOutputItem) {
     if (lastOutputItem?.type === "message") {
@@ -669,6 +865,35 @@
         throw new StreamingError("Not implemented: only output_text is supported in streaming mode.");
       }
 
+      // Response output item done event
+      lastOutputItem.status = "completed";
+      yield {
+        type: "response.output_item.done",
+        output_index: responseObject.output.length - 1,
+        item: lastOutputItem,
+        sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+      };
+    } else if (lastOutputItem?.type === "reasoning") {
+      const contentPart = (lastOutputItem as PatchedResponseReasoningItem).content.at(-1);
+      if (contentPart !== undefined) {
+        yield {
+          type: "response.reasoning_text.done",
+          item_id: lastOutputItem.id,
+          output_index: responseObject.output.length - 1,
+          content_index: (lastOutputItem as PatchedResponseReasoningItem).content.length - 1,
+          text: contentPart.text,
+          sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+        };
+
+        yield {
+          type: "response.content_part.done",
+          item_id: lastOutputItem.id,
+          output_index: responseObject.output.length - 1,
+          content_index: (lastOutputItem as PatchedResponseReasoningItem).content.length - 1,
+          part: contentPart as unknown as PatchedResponseContentPart, // TODO: adapt once openai-node is updated
+          sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+        };
+      }
       // Response output item done event
       lastOutputItem.status = "completed";
       yield {
@@ -769,102 +994,57 @@
 }
 
 /*
- * … (removed lines truncated in source: the previous in-place definitions of callApprovedMCPToolStream and requiresApproval, which this diff re-adds earlier in the file)
+ * Wrap a chat completion stream to handle reasoning.
+ *
+ * The reasoning start and end tokens might be sent in a longer text chunk.
+ * We want to split that text chunk so that the reasoning token is isolated in a separate chunk.
+ *
+ * TODO: also adapt for when reasoning token is sent in separate chunks.
 */
+async function* wrapChatCompletionStream(
+  stream: Stream<ChatCompletionChunk & { _request_id?: string | null | undefined }>
+): AsyncGenerator<ChatCompletionChunk & { _request_id?: string | null | undefined }> {
+  function cloneChunkWithContent(baseChunk: ChatCompletionChunk, content: string): ChatCompletionChunk {
+    return {
+      ...baseChunk,
+      choices: [
+        {
+          ...baseChunk.choices[0],
+          delta: {
+            ...baseChunk.choices[0].delta,
+            content,
+          },
+        },
+      ],
+    };
+  }
+
+  function* splitAndYieldChunk(chunk: ChatCompletionChunk, content: string, token: string) {
+    const [beforeContent, afterContent] = content.split(token, 2);
+
+    if (beforeContent) {
+      yield cloneChunkWithContent(chunk, beforeContent);
+    }
+    yield cloneChunkWithContent(chunk, token);
+    if (afterContent) {
+      yield cloneChunkWithContent(chunk, afterContent);
+    }
+  }
+
+  for await (const chunk of stream) {
+    const content = chunk.choices[0].delta.content;
+
+    if (!content) {
+      yield chunk;
+      continue;
+    }
+
+    if (content.includes(REASONING_START_TOKEN)) {
+      yield* splitAndYieldChunk(chunk, content, REASONING_START_TOKEN);
+    } else if (content.includes(REASONING_END_TOKEN)) {
+      yield* splitAndYieldChunk(chunk, content, REASONING_END_TOKEN);
+    } else {
+      yield chunk;
+    }
+  }
 }
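To make the chunk-splitting rule above concrete: when a provider packs a reasoning token inside a larger text delta, wrapChatCompletionStream re-emits that delta as up to three deltas, so the streaming loop can flip currentTextMode on an exact token match. A self-contained sketch of the same splitting rule (the splitOnToken helper is illustrative, not code from this diff):

// Standalone illustration of the splitting rule used by wrapChatCompletionStream.
const REASONING_END_TOKEN = "</think>";

function splitOnToken(content: string, token: string): string[] {
  const [before, after] = content.split(token, 2);
  const parts: string[] = [];
  if (before) parts.push(before);
  parts.push(token);
  if (after) parts.push(after);
  return parts;
}

// A single delta "chain of thought</think>Hello!" becomes three deltas:
// ["chain of thought", "</think>", "Hello!"]
console.log(splitOnToken("chain of thought</think>Hello!", REASONING_END_TOKEN));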
src/schemas.ts
CHANGED
@@ -160,10 +160,10 @@ export const createResponseParamsSchema = z.object({
   model: z.string(),
   // parallel_tool_calls: z.boolean().default(true), // TODO: how to handle this if chat completion doesn't?
   // previous_response_id: z.string().nullable().default(null),
-  … (4 removed lines truncated in source)
+  reasoning: z.object({
+    effort: z.enum(["low", "medium", "high"]).default("medium"),
+    summary: z.enum(["auto", "concise", "detailed"]).nullable().default(null),
+  }),
   // store: z.boolean().default(true),
   stream: z.boolean().default(false),
   temperature: z.number().min(0).max(2).default(1),
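For reference, the defaults in the new reasoning object mean callers may omit effort and summary and still get a fully populated value after validation. A small sketch that mirrors only the added field (the standalone reasoningSchema below is illustrative; the real schema lives in src/schemas.ts):

import { z } from "zod";

// Mirrors the added `reasoning` field for illustration only.
const reasoningSchema = z.object({
  effort: z.enum(["low", "medium", "high"]).default("medium"),
  summary: z.enum(["auto", "concise", "detailed"]).nullable().default(null),
});

console.log(reasoningSchema.parse({}));                // { effort: "medium", summary: null }
console.log(reasoningSchema.parse({ effort: "low" })); // { effort: "low", summary: null }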