Wauplin (HF Staff) committed
Commit ae1f3f8 · verified · 1 parent: b2ce8c6

Upload folder using huggingface_hub

src/openai_patch.ts ADDED
@@ -0,0 +1,44 @@
+ /*
+  * This file is a patch to the openai library to add support for the reasoning parameter.
+  * Once openai's official JS SDK supports sending back raw CoT, we will remove this file.
+  */
+ import type {
+   ResponseReasoningItem as OpenAIResponseReasoningItem,
+   ResponseStreamEvent as OpenAIResponseStreamEvent,
+   ResponseOutputRefusal,
+   ResponseOutputText,
+ } from "openai/resources/responses/responses";
+
+ export interface ReasoningTextContent {
+   type: "reasoning_text";
+   text: string;
+ }
+ export type PatchedResponseReasoningItem = OpenAIResponseReasoningItem & {
+   // Raw CoT returned in reasoning item (in addition to the summary)
+   content: ReasoningTextContent[];
+ };
+
+ interface PatchedResponseReasoningTextDeltaEvent {
+   type: "response.reasoning_text.delta";
+   sequence_number: number;
+   item_id: string;
+   output_index: number;
+   content_index: number;
+   delta: string;
+ }
+
+ interface PatchedResponseReasoningTextDoneEvent {
+   type: "response.reasoning_text.done";
+   sequence_number: number;
+   item_id: string;
+   output_index: number;
+   content_index: number;
+   text: string;
+ }
+
+ export type PatchedResponseStreamEvent =
+   | OpenAIResponseStreamEvent
+   | PatchedResponseReasoningTextDeltaEvent
+   | PatchedResponseReasoningTextDoneEvent;
+
+ export type PatchedResponseContentPart = ResponseOutputText | ResponseOutputRefusal;
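
As a quick orientation for the types above: a minimal, purely illustrative sketch of how a stream consumer might branch on the patched event union. The event names and fields come from the patch itself; the handler function and the import path are assumptions for illustration, not part of the commit.

    import type { PatchedResponseStreamEvent } from "./openai_patch";

    // Illustrative only: route raw-CoT deltas separately from regular text deltas.
    function handleEvent(event: PatchedResponseStreamEvent): void {
      if (event.type === "response.reasoning_text.delta") {
        process.stdout.write(`[reasoning] ${event.delta}`);
      } else if (event.type === "response.output_text.delta") {
        process.stdout.write(event.delta);
      }
    }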
src/routes/landingPageHtml.ts CHANGED
@@ -502,6 +502,7 @@ export function getLandingPageHtml(req: Request, res: Response): void {
            <button class="examples-tab" type="button">Function Calling</button>
            <button class="examples-tab" type="button">Structured Output</button>
            <button class="examples-tab" type="button">MCP</button>
+           <button class="examples-tab" type="button">Reasoning</button>
          </div>
          <div class="example-panel active">
            <pre><button class="copy-btn" onclick="copyCode(this)">Copy</button><code class="language-python">from openai import OpenAI
@@ -687,6 +688,27 @@ response = client.responses.create(
  for output in response.output:
      print(output)</code></pre>
          </div>
+         <div class="example-panel">
+           <pre><button class="copy-btn" onclick="copyCode(this)">Copy</button><code class="language-python">from openai import OpenAI
+ import os
+
+ client = OpenAI(
+     base_url="${baseUrl}",
+     api_key=os.getenv("HF_TOKEN"), # visit https://huggingface.co/settings/tokens
+ )
+
+ response = client.responses.create(
+     model="deepseek-ai/DeepSeek-R1",
+     instructions="You are a helpful assistant.",
+     input="Say hello to the world.",
+     reasoning={
+         "effort": "low",
+     }
+ )
+
+ for index, item in enumerate(response.output):
+     print(f"Output #{index}: {item.type}", item.content)</code></pre>
+         </div>
        </section>
        <footer class="more-info-footer">
          <div style="font-weight:600; color:var(--primary-dark); font-size:1.13em; margin-bottom:0.5em;">More Info</div>
src/routes/responses.ts CHANGED
@@ -5,19 +5,26 @@ import { generateUniqueId } from "../lib/generateUniqueId.js";
  import { OpenAI } from "openai";
  import type {
    Response,
-   ResponseStreamEvent,
    ResponseContentPartAddedEvent,
    ResponseOutputMessage,
    ResponseFunctionToolCall,
    ResponseOutputItem,
  } from "openai/resources/responses/responses";
+ import type {
+   PatchedResponseContentPart,
+   PatchedResponseReasoningItem,
+   PatchedResponseStreamEvent,
+   ReasoningTextContent,
+ } from "../openai_patch";
  import type {
    ChatCompletionCreateParamsStreaming,
    ChatCompletionMessageParam,
    ChatCompletionTool,
+   ChatCompletionChunk,
  } from "openai/resources/chat/completions.js";
  import type { FunctionParameters } from "openai/resources/shared.js";
  import { callMcpTool, connectMcpServer } from "../mcp.js";
+ import type { Stream } from "openai/core/streaming.js";

  class StreamingError extends Error {
    constructor(message: string) {
@@ -29,6 +36,10 @@ class StreamingError extends Error {
  type IncompleteResponse = Omit<Response, "incomplete_details" | "output_text" | "parallel_tool_calls">;
  const SEQUENCE_NUMBER_PLACEHOLDER = -1;

+ // TODO: this depends on the model. To be adapted.
+ const REASONING_START_TOKEN = "<think>";
+ const REASONING_END_TOKEN = "</think>";
+
  export const postCreateResponse = async (
    req: ValidatedRequest<CreateResponseParams>,
    res: ExpressResponse
@@ -66,7 +77,7 @@ export const postCreateResponse = async (
  async function* runCreateResponseStream(
    req: ValidatedRequest<CreateResponseParams>,
    res: ExpressResponse
- ): AsyncGenerator<ResponseStreamEvent> {
+ ): AsyncGenerator<PatchedResponseStreamEvent> {
    let sequenceNumber = 0;
    // Prepare response object that will be iteratively populated
    const responseObject: IncompleteResponse = {
@@ -147,7 +158,7 @@ async function* innerRunStream(
    req: ValidatedRequest<CreateResponseParams>,
    res: ExpressResponse,
    responseObject: IncompleteResponse
- ): AsyncGenerator<ResponseStreamEvent> {
+ ): AsyncGenerator<PatchedResponseStreamEvent> {
    // Retrieve API key from headers
    const apiKey = req.headers.authorization?.split(" ")[1];
    if (!apiKey) {
@@ -158,6 +169,11 @@ async function* innerRunStream(
      return;
    }

+   // Return early if not supported param
+   if (req.body.reasoning?.summary && req.body.reasoning?.summary !== "auto") {
+     throw new Error(`Not implemented: only 'auto' summary is supported. Got '${req.body.reasoning?.summary}'`);
+   }
+
    // List MCP tools from server (if required) + prepare tools for the LLM
    let tools: ChatCompletionTool[] | undefined = [];
    const mcpToolsMapping: Record<string, McpServerParams> = {};
@@ -351,6 +367,7 @@ async function* innerRunStream(
            }
          : { type: req.body.text.format.type }
        : undefined,
+   reasoning_effort: req.body.reasoning?.effort,
    temperature: req.body.temperature,
    tool_choice:
      typeof req.body.tool_choice === "string"
@@ -417,7 +434,7 @@ async function* innerRunStream(
  async function* listMcpToolsStream(
    tool: McpServerParams,
    responseObject: IncompleteResponse
- ): AsyncGenerator<ResponseStreamEvent> {
+ ): AsyncGenerator<PatchedResponseStreamEvent> {
    const outputObject: ResponseOutputItem.McpListTools = {
      id: generateUniqueId("mcpl"),
      type: "mcp_list_tools",
@@ -476,15 +493,16 @@ async function* handleOneTurnStream(
    payload: ChatCompletionCreateParamsStreaming,
    responseObject: IncompleteResponse,
    mcpToolsMapping: Record<string, McpServerParams>
- ): AsyncGenerator<ResponseStreamEvent> {
+ ): AsyncGenerator<PatchedResponseStreamEvent> {
    const client = new OpenAI({
      baseURL: process.env.OPENAI_BASE_URL ?? "https://router.huggingface.co/v1",
      apiKey: apiKey,
    });
-   const stream = await client.chat.completions.create(payload);
+   const stream = wrapChatCompletionStream(await client.chat.completions.create(payload));
    let previousInputTokens = responseObject.usage?.input_tokens ?? 0;
    let previousOutputTokens = responseObject.usage?.output_tokens ?? 0;
    let previousTotalTokens = responseObject.usage?.total_tokens ?? 0;
+   let currentTextMode: "text" | "reasoning" = "text";

    for await (const chunk of stream) {
      if (chunk.usage) {
@@ -502,65 +520,132 @@ async function* handleOneTurnStream(

      if (delta.content) {
        let currentOutputItem = responseObject.output.at(-1);
+       let deltaText = delta.content;
+
+       // If start or end of reasoning, skip token and update the current text mode
+       if (deltaText === REASONING_START_TOKEN) {
+         currentTextMode = "reasoning";
+         continue;
+       } else if (deltaText === REASONING_END_TOKEN) {
+         currentTextMode = "text";
+         for await (const event of closeLastOutputItem(responseObject, payload, mcpToolsMapping)) {
+           yield event;
+         }
+         continue;
+       }

        // If start of a new message, create it
-       if (currentOutputItem?.type !== "message" || currentOutputItem?.status !== "in_progress") {
-         const outputObject: ResponseOutputMessage = {
-           id: generateUniqueId("msg"),
-           type: "message",
-           role: "assistant",
-           status: "in_progress",
-           content: [],
-         };
-         responseObject.output.push(outputObject);
+       if (currentTextMode === "text") {
+         if (currentOutputItem?.type !== "message" || currentOutputItem?.status !== "in_progress") {
+           const outputObject: ResponseOutputMessage = {
+             id: generateUniqueId("msg"),
+             type: "message",
+             role: "assistant",
+             status: "in_progress",
+             content: [],
+           };
+           responseObject.output.push(outputObject);

-         // Response output item added event
-         yield {
-           type: "response.output_item.added",
-           output_index: 0,
-           item: outputObject,
-           sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
-         };
+           // Response output item added event
+           yield {
+             type: "response.output_item.added",
+             output_index: 0,
+             item: outputObject,
+             sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+           };
+         }
+       } else if (currentTextMode === "reasoning") {
+         if (currentOutputItem?.type !== "reasoning" || currentOutputItem?.status !== "in_progress") {
+           const outputObject: PatchedResponseReasoningItem = {
+             id: generateUniqueId("rs"),
+             type: "reasoning",
+             status: "in_progress",
+             content: [],
+             summary: [],
+           };
+           responseObject.output.push(outputObject);
+
+           // Response output item added event
+           yield {
+             type: "response.output_item.added",
+             output_index: 0,
+             item: outputObject,
+             sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+           };
+         }
        }

        // If start of a new content part, create it
-       currentOutputItem = responseObject.output.at(-1) as ResponseOutputMessage;
-       if (currentOutputItem.content.length === 0) {
-         // Response content part added event
-         const contentPart: ResponseContentPartAddedEvent["part"] = {
-           type: "output_text",
-           text: "",
-           annotations: [],
-         };
-         currentOutputItem.content.push(contentPart);
+       if (currentTextMode === "text") {
+         const currentOutputMessage = responseObject.output.at(-1) as ResponseOutputMessage;
+         if (currentOutputMessage.content.length === 0) {
+           // Response content part added event
+           const contentPart: ResponseContentPartAddedEvent["part"] = {
+             type: "output_text",
+             text: "",
+             annotations: [],
+           };
+           currentOutputMessage.content.push(contentPart);
+
+           yield {
+             type: "response.content_part.added",
+             item_id: currentOutputMessage.id,
+             output_index: responseObject.output.length - 1,
+             content_index: currentOutputMessage.content.length - 1,
+             part: contentPart,
+             sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+           };
+         }
+
+         const contentPart = currentOutputMessage.content.at(-1);
+         if (!contentPart || contentPart.type !== "output_text") {
+           throw new StreamingError(
+             `Not implemented: only output_text is supported in response.output[].content[].type. Got ${contentPart?.type}`
+           );
+         }

+         // Add text delta
+         contentPart.text += delta.content;
          yield {
-           type: "response.content_part.added",
-           item_id: currentOutputItem.id,
+           type: "response.output_text.delta",
+           item_id: currentOutputMessage.id,
            output_index: responseObject.output.length - 1,
-           content_index: currentOutputItem.content.length - 1,
-           part: contentPart,
+           content_index: currentOutputMessage.content.length - 1,
+           delta: delta.content,
            sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
          };
-       }
+       } else if (currentTextMode === "reasoning") {
+         const currentReasoningItem = responseObject.output.at(-1) as PatchedResponseReasoningItem;
+         if (currentReasoningItem.content.length === 0) {
+           // Response content part added event
+           const contentPart: ReasoningTextContent = {
+             type: "reasoning_text",
+             text: "",
+           };
+           currentReasoningItem.content.push(contentPart);

-       const contentPart = currentOutputItem.content.at(-1);
-       if (!contentPart || contentPart.type !== "output_text") {
-         throw new StreamingError(
-           `Not implemented: only output_text is supported in response.output[].content[].type. Got ${contentPart?.type}`
-         );
-       }
+           yield {
+             type: "response.content_part.added",
+             item_id: currentReasoningItem.id,
+             output_index: responseObject.output.length - 1,
+             content_index: currentReasoningItem.content.length - 1,
+             part: contentPart as unknown as PatchedResponseContentPart, // TODO: adapt once openai-node is updated
+             sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+           };
+         }

-       // Add text delta
-       contentPart.text += delta.content;
-       yield {
-         type: "response.output_text.delta",
-         item_id: currentOutputItem.id,
-         output_index: responseObject.output.length - 1,
-         content_index: currentOutputItem.content.length - 1,
-         delta: delta.content,
-         sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
-       };
+         // Add text delta
+         const contentPart = currentReasoningItem.content.at(-1) as ReasoningTextContent;
+         contentPart.text += delta.content;
+         yield {
+           type: "response.reasoning_text.delta",
+           item_id: currentReasoningItem.id,
+           output_index: responseObject.output.length - 1,
+           content_index: currentReasoningItem.content.length - 1,
+           delta: delta.content,
+           sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+         };
+       }
      } else if (delta.tool_calls && delta.tool_calls.length > 0) {
        if (delta.tool_calls.length > 1) {
          console.log("Multiple tool calls are not supported. Only the first one will be processed.");
@@ -643,6 +728,117 @@ async function* handleOneTurnStream(
      }
    }

+   for await (const event of closeLastOutputItem(responseObject, payload, mcpToolsMapping)) {
+     yield event;
+   }
+ }
+
+ /*
+  * Perform an approved MCP tool call and stream the response.
+  */
+ async function* callApprovedMCPToolStream(
+   approval_request_id: string,
+   mcpCallId: string,
+   approvalRequest: McpApprovalRequestParams | undefined,
+   mcpToolsMapping: Record<string, McpServerParams>,
+   responseObject: IncompleteResponse,
+   payload: ChatCompletionCreateParamsStreaming
+ ): AsyncGenerator<PatchedResponseStreamEvent> {
+   if (!approvalRequest) {
+     throw new Error(`MCP approval request '${approval_request_id}' not found`);
+   }
+
+   const outputObject: ResponseOutputItem.McpCall = {
+     type: "mcp_call",
+     id: mcpCallId,
+     name: approvalRequest.name,
+     server_label: approvalRequest.server_label,
+     arguments: approvalRequest.arguments,
+   };
+   responseObject.output.push(outputObject);
+
+   // Response output item added event
+   yield {
+     type: "response.output_item.added",
+     output_index: responseObject.output.length - 1,
+     item: outputObject,
+     sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+   };
+
+   yield {
+     type: "response.mcp_call.in_progress",
+     item_id: outputObject.id,
+     output_index: responseObject.output.length - 1,
+     sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+   };
+
+   const toolParams = mcpToolsMapping[approvalRequest.name];
+   const toolResult = await callMcpTool(toolParams, approvalRequest.name, approvalRequest.arguments);
+
+   if (toolResult.error) {
+     outputObject.error = toolResult.error;
+     yield {
+       type: "response.mcp_call.failed",
+       sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+     };
+   } else {
+     outputObject.output = toolResult.output;
+     yield {
+       type: "response.mcp_call.completed",
+       sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+     };
+   }
+
+   yield {
+     type: "response.output_item.done",
+     output_index: responseObject.output.length - 1,
+     item: outputObject,
+     sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+   };
+
+   // Updating the payload for next LLM call
+   payload.messages.push(
+     {
+       role: "assistant",
+       tool_calls: [
+         {
+           id: outputObject.id,
+           type: "function",
+           function: {
+             name: outputObject.name,
+             arguments: outputObject.arguments,
+             // Hacky: type is not correct in inference.js. Will fix it but in the meantime we need to cast it.
+             // TODO: fix it in the inference.js package. Should be "arguments" and not "parameters".
+           },
+         },
+       ],
+     },
+     {
+       role: "tool",
+       tool_call_id: outputObject.id,
+       content: outputObject.output ? outputObject.output : outputObject.error ? `Error: ${outputObject.error}` : "",
+     }
+   );
+ }
+
+ function requiresApproval(toolName: string, mcpToolsMapping: Record<string, McpServerParams>): boolean {
+   const toolParams = mcpToolsMapping[toolName];
+   return toolParams.require_approval === "always"
+     ? true
+     : toolParams.require_approval === "never"
+       ? false
+       : toolParams.require_approval.always?.tool_names?.includes(toolName)
+         ? true
+         : toolParams.require_approval.never?.tool_names?.includes(toolName)
+           ? false
+           : true; // behavior is undefined in specs, let's default to true
+ }
+
+ async function* closeLastOutputItem(
+   responseObject: IncompleteResponse,
+   payload: ChatCompletionCreateParamsStreaming,
+   mcpToolsMapping: Record<string, McpServerParams>
+ ): AsyncGenerator<PatchedResponseStreamEvent> {
    const lastOutputItem = responseObject.output.at(-1);
    if (lastOutputItem) {
      if (lastOutputItem?.type === "message") {
@@ -669,6 +865,35 @@ async function* handleOneTurnStream(
          throw new StreamingError("Not implemented: only output_text is supported in streaming mode.");
        }

+       // Response output item done event
+       lastOutputItem.status = "completed";
+       yield {
+         type: "response.output_item.done",
+         output_index: responseObject.output.length - 1,
+         item: lastOutputItem,
+         sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+       };
+     } else if (lastOutputItem?.type === "reasoning") {
+       const contentPart = (lastOutputItem as PatchedResponseReasoningItem).content.at(-1);
+       if (contentPart !== undefined) {
+         yield {
+           type: "response.reasoning_text.done",
+           item_id: lastOutputItem.id,
+           output_index: responseObject.output.length - 1,
+           content_index: (lastOutputItem as PatchedResponseReasoningItem).content.length - 1,
+           text: contentPart.text,
+           sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+         };
+
+         yield {
+           type: "response.content_part.done",
+           item_id: lastOutputItem.id,
+           output_index: responseObject.output.length - 1,
+           content_index: (lastOutputItem as PatchedResponseReasoningItem).content.length - 1,
+           part: contentPart as unknown as PatchedResponseContentPart, // TODO: adapt once openai-node is updated
+           sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
+         };
+       }
        // Response output item done event
        lastOutputItem.status = "completed";
        yield {
@@ -769,102 +994,57 @@ async function* handleOneTurnStream(
  }

  /*
-  * Perform an approved MCP tool call and stream the response.
+  * Wrap a chat completion stream to handle reasoning.
+  *
+  * The reasoning start and end tokens might be sent in a longer text chunk.
+  * We want to split that text chunk so that the reasoning token is isolated in a separate chunk.
+  *
+  * TODO: also adapt for when reasoning token is sent in separate chunks.
   */
- async function* callApprovedMCPToolStream(
-   approval_request_id: string,
-   mcpCallId: string,
-   approvalRequest: McpApprovalRequestParams | undefined,
-   mcpToolsMapping: Record<string, McpServerParams>,
-   responseObject: IncompleteResponse,
-   payload: ChatCompletionCreateParamsStreaming
- ): AsyncGenerator<ResponseStreamEvent> {
-   if (!approvalRequest) {
-     throw new Error(`MCP approval request '${approval_request_id}' not found`);
+ async function* wrapChatCompletionStream(
+   stream: Stream<ChatCompletionChunk & { _request_id?: string | null | undefined }>
+ ): AsyncGenerator<ChatCompletionChunk & { _request_id?: string | null | undefined }> {
+   function cloneChunkWithContent(baseChunk: ChatCompletionChunk, content: string): ChatCompletionChunk {
+     return {
+       ...baseChunk,
+       choices: [
+         {
+           ...baseChunk.choices[0],
+           delta: {
+             ...baseChunk.choices[0].delta,
+             content,
+           },
+         },
+       ],
+     };
    }

-   const outputObject: ResponseOutputItem.McpCall = {
-     type: "mcp_call",
-     id: mcpCallId,
-     name: approvalRequest.name,
-     server_label: approvalRequest.server_label,
-     arguments: approvalRequest.arguments,
-   };
-   responseObject.output.push(outputObject);
+   function* splitAndYieldChunk(chunk: ChatCompletionChunk, content: string, token: string) {
+     const [beforeContent, afterContent] = content.split(token, 2);

-   // Response output item added event
-   yield {
-     type: "response.output_item.added",
-     output_index: responseObject.output.length - 1,
-     item: outputObject,
-     sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
-   };
-
-   yield {
-     type: "response.mcp_call.in_progress",
-     item_id: outputObject.id,
-     output_index: responseObject.output.length - 1,
-     sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
-   };
-
-   const toolParams = mcpToolsMapping[approvalRequest.name];
-   const toolResult = await callMcpTool(toolParams, approvalRequest.name, approvalRequest.arguments);
-
-   if (toolResult.error) {
-     outputObject.error = toolResult.error;
-     yield {
-       type: "response.mcp_call.failed",
-       sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
-     };
-   } else {
-     outputObject.output = toolResult.output;
-     yield {
-       type: "response.mcp_call.completed",
-       sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
-     };
+     if (beforeContent) {
+       yield cloneChunkWithContent(chunk, beforeContent);
+     }
+     yield cloneChunkWithContent(chunk, token);
+     if (afterContent) {
+       yield cloneChunkWithContent(chunk, afterContent);
+     }
    }

-   yield {
-     type: "response.output_item.done",
-     output_index: responseObject.output.length - 1,
-     item: outputObject,
-     sequence_number: SEQUENCE_NUMBER_PLACEHOLDER,
-   };
+   for await (const chunk of stream) {
+     const content = chunk.choices[0].delta.content;

-   // Updating the payload for next LLM call
-   payload.messages.push(
-     {
-       role: "assistant",
-       tool_calls: [
-         {
-           id: outputObject.id,
-           type: "function",
-           function: {
-             name: outputObject.name,
-             arguments: outputObject.arguments,
-             // Hacky: type is not correct in inference.js. Will fix it but in the meantime we need to cast it.
-             // TODO: fix it in the inference.js package. Should be "arguments" and not "parameters".
-           },
-         },
-       ],
-     },
-     {
-       role: "tool",
-       tool_call_id: outputObject.id,
-       content: outputObject.output ? outputObject.output : outputObject.error ? `Error: ${outputObject.error}` : "",
+     if (!content) {
+       yield chunk;
+       continue;
      }
-   );
- }

- function requiresApproval(toolName: string, mcpToolsMapping: Record<string, McpServerParams>): boolean {
-   const toolParams = mcpToolsMapping[toolName];
-   return toolParams.require_approval === "always"
-     ? true
-     : toolParams.require_approval === "never"
-       ? false
-       : toolParams.require_approval.always?.tool_names?.includes(toolName)
-         ? true
-         : toolParams.require_approval.never?.tool_names?.includes(toolName)
-           ? false
-           : true; // behavior is undefined in specs, let's default to true
+     if (content.includes(REASONING_START_TOKEN)) {
+       yield* splitAndYieldChunk(chunk, content, REASONING_START_TOKEN);
+     } else if (content.includes(REASONING_END_TOKEN)) {
+       yield* splitAndYieldChunk(chunk, content, REASONING_END_TOKEN);
+     } else {
+       yield chunk;
+     }
+   }
  }
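
To make the chunk-splitting behaviour of wrapChatCompletionStream concrete: a stand-alone sketch of the same splitting idea applied to plain strings. The real helper clones ChatCompletionChunk objects via cloneChunkWithContent; the string version below, including its function name and sample input, is only an illustration.

    const REASONING_END_TOKEN = "</think>";

    // Mirror of splitAndYieldChunk: isolate the reasoning token in its own piece,
    // so the downstream loop can flip between "reasoning" and "text" modes on it.
    function* splitOnToken(content: string, token: string): Generator<string> {
      const [before, after] = content.split(token, 2);
      if (before) yield before;
      yield token;
      if (after) yield after;
    }

    // Logs ["Okay.", "</think>", "Hello!"]
    console.log([...splitOnToken("Okay.</think>Hello!", REASONING_END_TOKEN)]);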
src/schemas.ts CHANGED
@@ -160,10 +160,10 @@ export const createResponseParamsSchema = z.object({
    model: z.string(),
    // parallel_tool_calls: z.boolean().default(true), // TODO: how to handle this if chat completion doesn't?
    // previous_response_id: z.string().nullable().default(null),
-   // reasoning: z.object({
-   //   effort: z.enum(["low", "medium", "high"]).default("medium"),
-   //   summary: z.enum(["auto", "concise", "detailed"]).nullable().default(null),
-   // }),
+   reasoning: z.object({
+     effort: z.enum(["low", "medium", "high"]).default("medium"),
+     summary: z.enum(["auto", "concise", "detailed"]).nullable().default(null),
+   }),
    // store: z.boolean().default(true),
    stream: z.boolean().default(false),
    temperature: z.number().min(0).max(2).default(1),
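
For reference, a stand-alone zod excerpt mirroring the two properties of the newly enabled reasoning field; the schema name below is illustrative, since the real field lives inside createResponseParamsSchema.

    import { z } from "zod";

    // Excerpt mirroring the `reasoning` object enabled in createResponseParamsSchema.
    const reasoningSchema = z.object({
      effort: z.enum(["low", "medium", "high"]).default("medium"),
      summary: z.enum(["auto", "concise", "detailed"]).nullable().default(null),
    });

    // Logs { effort: "low", summary: null }; omitted fields fall back to their declared defaults.
    console.log(reasoningSchema.parse({ effort: "low" }));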