fix: native tool type mapping for freeform apply_patch and web_search

OnlyTerp · web-flow · commit b8116262d76b · 2026-06-12T21:14:11.000-04:00
fix: native tool type mapping for freeform apply_patch and web_search

- Maps original Responses tool types through streaming and non-streaming paths
- Emit custom_tool_call for apply_patch (freeform) instead of generic function_call
- Emit web_search_call for web_search tools so Codex Desktop handles them correctly
- Add server-side DuckDuckGo web search for non-streaming BYOK responses
- Add comprehensive tests for all tool type mapping scenarios
diff --git a/codex_shim/server.py b/codex_shim/server.py
@@ -620,7 +620,8 @@ async def _cursor_passthrough(
 
         response = _sse_response()
         await response.prepare(request)
-        state = ResponsesStreamState(slug)
+        tool_types = _build_tool_types(body)
+        state = ResponsesStreamState(slug, tool_types)
         try:
             await state.start(response)
             async for event in iter_cursor_agent_events(prompt, upstream):
@@ -757,10 +758,13 @@ async def _post_openai_chat(
             if upstream.status >= 400:
                 return await _error_response(upstream, slug=route.slug)
             if body.get("stream"):
-                return await self._stream_openai_chat(request, upstream, route, as_responses)
+                return await self._stream_openai_chat(request, upstream, route, as_responses, body)
             payload = await upstream.json(content_type=None)
         if as_responses:
-            return web.json_response(chat_completion_to_response(payload, route.slug))
+            tool_types = _build_tool_types(body)
+            payload = chat_completion_to_response(payload, route.slug, tool_types)
+            intercepted = _maybe_intercept_web_search(payload)
+            return web.json_response(intercepted or payload)
         return web.json_response(payload)
 
     async def _post_openai_chat_as_anthropic(
@@ -788,10 +792,13 @@ async def _post_anthropic(
             if upstream.status >= 400:
                 return await _error_response(upstream)
             if body.get("stream"):
-                return await self._stream_anthropic(request, upstream, route, as_responses)
+                return await self._stream_anthropic(request, upstream, route, as_responses, body)
             payload = await upstream.json(content_type=None)
         if as_responses:
-            return web.json_response(anthropic_to_response(payload, route.slug))
+            tool_types = _build_tool_types(body)
+            payload = anthropic_to_response(payload, route.slug, tool_types)
+            intercepted = _maybe_intercept_web_search(payload)
+            return web.json_response(intercepted or payload)
         return web.json_response(anthropic_to_chat_response(payload, route.slug))
 
     async def _post_anthropic_messages(
@@ -811,12 +818,13 @@ async def _post_anthropic_messages(
         return web.json_response(payload)
 
     async def _stream_openai_chat(
-        self, request: web.Request, upstream, route: ShimModel, as_responses: bool
+        self, request: web.Request, upstream, route: ShimModel, as_responses: bool, body: dict[str, Any] | None = None
     ) -> web.StreamResponse:
         response = _sse_response()
         await response.prepare(request)
         if as_responses:
-            state = ResponsesStreamState(route.slug)
+            tool_types = _build_tool_types(body) if body else {}
+            state = ResponsesStreamState(route.slug, tool_types)
         try:
             if as_responses:
                 await state.start(response)
@@ -873,12 +881,13 @@ async def _stream_openai_chat_as_anthropic(
         return response
 
     async def _stream_anthropic(
-        self, request: web.Request, upstream, route: ShimModel, as_responses: bool
+        self, request: web.Request, upstream, route: ShimModel, as_responses: bool, body: dict[str, Any] | None = None
     ) -> web.StreamResponse:
         response = _sse_response()
         await response.prepare(request)
         if as_responses:
-            state = ResponsesStreamState(route.slug)
+            tool_types = _build_tool_types(body) if body else {}
+            state = ResponsesStreamState(route.slug, tool_types)
         try:
             if as_responses:
                 await state.start(response)
@@ -1207,22 +1216,22 @@ class ResponsesStreamState:
     proper .added / .delta / .done / .completed events plus a final
     `response.completed` with the full reconciled `output` array."""
 
-    def __init__(self, model: str):
+    def __init__(self, model: str, tool_types: dict[str, str] | None = None):
         self.response_id = f"resp_{int(time.time() * 1000)}"
         self.message_item_id = f"msg_{int(time.time() * 1000)}"
         self.model = model
-        self.message_index: int | None = None  # output_index for the assistant message
+        self.message_index: int | None = None
         self.message_text = ""
         self.message_opened = False
         self.message_closed = False
         self.usage: dict[str, Any] | None = None
-        # Tool call state, keyed by upstream "index" (chat-completions) or
-        # anthropic content_block_index. Each entry tracks its assigned
-        # output_index, accumulated arguments, name, etc.
         self.tool_calls: dict[int, dict[str, Any]] = {}
-        # Reasoning (extended thinking) blocks, keyed by upstream index.
         self.reasoning_blocks: dict[Any, dict[str, Any]] = {}
         self.next_output_index = 0
+        # Map sanitized tool name -> original Responses tool type so we can
+        # emit the correct output item type (e.g. custom_tool_call for freeform
+        # apply_patch instead of generic function_call).
+        self.tool_types = tool_types or {}
 
     # ------------------------------------------------------------------
     # Lifecycle
@@ -1493,13 +1502,23 @@ async def _open_tool(self, response: web.StreamResponse, *, key: Any, call_id: s
             await self._close_message(response)
         output_index = self.next_output_index
         self.next_output_index += 1
+        # Determine output item type based on original tool type.
+        # Freeform tools (apply_patch with no schema) emit custom_tool_call
+        # so Codex Desktop knows not to validate against a fixed enum.
+        original_type = self.tool_types.get(name, "")
+        output_type = "function_call"
+        if original_type == "apply_patch":
+            output_type = "custom_tool_call"
+        elif original_type.startswith("web_search"):
+            output_type = "web_search_call"
         state: dict[str, Any] = {
             "id": call_id,
             "call_id": call_id,
             "name": name,
             "arguments": "",
             "output_index": output_index,
             "closed": False,
+            "output_type": output_type,
         }
         self.tool_calls[key] = state
         await _write_sse(
@@ -1509,7 +1528,7 @@ async def _open_tool(self, response: web.StreamResponse, *, key: Any, call_id: s
                 "output_index": output_index,
                 "item": {
                     "id": call_id,
-                    "type": "function_call",
+                    "type": output_type,
                     "status": "in_progress",
                     "call_id": call_id,
                     "name": name,
@@ -1654,7 +1673,7 @@ def _message_item(self, status: str) -> dict[str, Any]:
     def _tool_item(self, state: dict[str, Any], status: str) -> dict[str, Any]:
         return {
             "id": state["id"],
-            "type": "function_call",
+            "type": state.get("output_type", "function_call"),
             "status": status,
             "call_id": state["call_id"],
             "name": state["name"],
@@ -1712,6 +1731,200 @@ def _decode_thinking_payload(encoded: str) -> dict[str, Any] | None:
     return data if isinstance(data, dict) else None
 
 
+def _build_tool_types(body: dict[str, Any]) -> dict[str, str]:
+    """Build a map sanitized tool name -> original tool type from the request tools array.
+
+    Codex Desktop emits native tools like `{"type": "apply_patch"}` and MCP tools
+    like `{"type": "mcp__node_repl", "function": {"name": "js"}}`. When we translate
+    those into chat-completions `function` tools, the original type is lost. We
+    preserve it here so the Responses streaming translator can emit the correct
+    output item type (e.g. `custom_tool_call` for freeform apply_patch instead of
+    generic `function_call`).
+    """
+    tool_types: dict[str, str] = {}
+    for tool in body.get("tools") or []:
+        if not isinstance(tool, dict):
+            continue
+        tool_type = str(tool.get("type") or "").strip().lower()
+        fn = tool.get("function")
+        if isinstance(fn, dict) and fn.get("name"):
+            name = str(fn["name"]).strip()
+        elif tool.get("name"):
+            name = str(tool["name"]).strip()
+        else:
+            name = tool_type
+        clean = re.sub(r"[^a-zA-Z0-9_-]+", "_", name.strip())[:64].strip("_")
+        if clean:
+            tool_types[clean] = tool_type
+    return tool_types
+
+async def _perform_web_search(query: str) -> str:
+    """Execute a web search via DuckDuckGo and return text results.
+
+    This is a server-side fallback for custom models whose provider does not
+    have a native web-search capability.  Codex Desktop expects the shim to
+    return results as a `function_call_output` (or `web_search_call`) item;
+    when the model is BYOK, the Desktop app does not execute the search itself,
+    so the shim must do it and feed the results back into the conversation.
+    """
+    import urllib.parse
+    import urllib.request
+
+    if not query or not query.strip():
+        return "No search query provided."
+
+    # DuckDuckGo lite HTML endpoint (no API key required)
+    url = (
+        "https://html.duckduckgo.com/html/"
+        + "?q="
+        + urllib.parse.quote_plus(query.strip())
+    )
+    req = urllib.request.Request(
+        url,
+        headers={
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+            "AppleWebKit/537.36 (KHTML, like Gecko) "
+            "Chrome/120.0.0.0 Safari/537.36"
+        },
+    )
+    try:
+        with urllib.request.urlopen(req, timeout=10) as resp:
+            html = resp.read().decode("utf-8", errors="replace")
+    except Exception as exc:
+        return f"Web search failed: {exc}"
+
+    # Extract title + snippet from result links
+    results: list[str] = []
+    # Each result is in a `.result` div with `.result__a` (title/link) and `.result__snippet`
+    from html.parser import HTMLParser
+
+    class _ResultParser(HTMLParser):
+        def __init__(self) -> None:
+            super().__init__()
+            self.in_result = False
+            self.in_a = False
+            self.in_snippet = False
+            self.current_title = ""
+            self.current_snippet = ""
+            self.results: list[dict[str, str]] = []
+            self._tag_stack: list[str] = []
+            self._class_stack: list[str] = []
+
+        def _current_class(self) -> str:
+            return self._class_stack[-1] if self._class_stack else ""
+
+        def handle_starttag(self, tag: str, attrs_list: list[tuple[str, str | None]]) -> None:
+            attrs = dict(attrs_list)
+            cls = (attrs.get("class") or "").lower()
+            self._tag_stack.append(tag)
+            self._class_stack.append(cls)
+            if "result" in cls and tag == "div":
+                self.in_result = True
+                self.current_title = ""
+                self.current_snippet = ""
+            if self.in_result and tag == "a" and "result__a" in cls:
+                self.in_a = True
+            if self.in_result and ("result__snippet" in cls or "result__body" in cls):
+                self.in_snippet = True
+
+        def handle_endtag(self, tag: str) -> None:
+            if self._tag_stack and self._tag_stack[-1] == tag:
+                self._tag_stack.pop()
+                self._class_stack.pop()
+            if tag == "div" and self.in_result:
+                if self.current_title or self.current_snippet:
+                    self.results.append(
+                        {
+                            "title": self.current_title.strip(),
+                            "snippet": self.current_snippet.strip(),
+                        }
+                    )
+                self.in_result = False
+            if tag == "a":
+                self.in_a = False
+            if tag in {"div", "span", "p"}:
+                self.in_snippet = False
+
+        def handle_data(self, data: str) -> None:
+            if self.in_a:
+                self.current_title += data
+            if self.in_snippet:
+                self.current_snippet += data
+
+    parser = _ResultParser()
+    parser.feed(html)
+    for r in parser.results[:5]:
+        title = r["title"].replace("\n", " ")
+        snippet = r["snippet"].replace("\n", " ")
+        if title and snippet:
+            results.append(f"{title}\n{snippet}")
+        elif title:
+            results.append(title)
+        elif snippet:
+            results.append(snippet)
+
+    if not results:
+        return "No web search results found."
+    return "\n\n".join(results)
+
+def _maybe_intercept_web_search(payload: dict[str, Any]) -> dict[str, Any] | None:
+    """If the response payload contains a web_search_call, execute it server-side
+    and return a new payload with the results embedded as a function_call_output.
+
+    Returns None if no web_search_call is present (pass through unchanged).
+    """
+    output = payload.get("output") or []
+    if not isinstance(output, list):
+        return None
+    search_calls: list[tuple[int, dict[str, Any]]] = []
+    for i, item in enumerate(output):
+        if isinstance(item, dict) and item.get("type") == "web_search_call":
+            search_calls.append((i, item))
+    if not search_calls:
+        return None
+
+    # Build synthetic search results
+    results: list[dict[str, Any]] = []
+    for idx, call in search_calls:
+        try:
+            args = json.loads(call.get("arguments") or "{}")
+        except json.JSONDecodeError:
+            args = {}
+        query = args.get("query") or ""
+        # Run the search synchronously (non-streaming path only)
+        import asyncio
+        try:
+            loop = asyncio.get_running_loop()
+            result_text = loop.run_until_complete(_perform_web_search(query))
+        except RuntimeError:
+            result_text = "Web search unavailable in this context."
+        results.append({
+            "id": f"wso_{call.get('call_id', '0')}",
+            "type": "function_call_output",
+            "status": "completed",
+            "call_id": call.get("call_id"),
+            "output": result_text,
+        })
+
+    # Replace web_search_call items with their results
+    new_output: list[dict[str, Any]] = []
+    for i, item in enumerate(output):
+        if isinstance(item, dict) and item.get("type") == "web_search_call":
+            # Find matching result
+            for r in results:
+                if r.get("call_id") == item.get("call_id"):
+                    new_output.append(r)
+                    break
+            else:
+                new_output.append(item)
+        else:
+            new_output.append(item)
+
+    new_payload = dict(payload)
+    new_payload["output"] = new_output
+    return new_payload
+
+
 _VERSIONED_BASE_RE = re.compile(r"(?:^|/)v\d+$")
 
 
diff --git a/codex_shim/translate.py b/codex_shim/translate.py
@@ -315,7 +315,7 @@ def anthropic_to_chat_response(payload: dict[str, Any], requested_model: str) ->
     }
 
 
-def chat_completion_to_response(payload: dict[str, Any], requested_model: str) -> dict[str, Any]:
+def chat_completion_to_response(payload: dict[str, Any], requested_model: str, tool_types: dict[str, str] | None = None) -> dict[str, Any]:
     choice = (payload.get("choices") or [{}])[0]
     message = choice.get("message") or {}
     output: list[dict[str, Any]] = []
@@ -340,15 +340,23 @@ def chat_completion_to_response(payload: dict[str, Any], requested_model: str) -
                 "content": [{"type": "output_text", "text": text, "annotations": []}],
             }
         )
+    tool_types = tool_types or {}
     for call in message.get("tool_calls") or []:
         fn = call.get("function") or {}
+        name = fn.get("name", "")
+        original_type = tool_types.get(name, "")
+        item_type = "function_call"
+        if original_type == "apply_patch":
+            item_type = "custom_tool_call"
+        elif original_type.startswith("web_search"):
+            item_type = "web_search_call"
         output.append(
             {
                 "id": call.get("id", "call_0"),
-                "type": "function_call",
+                "type": item_type,
                 "status": "completed",
                 "call_id": call.get("id", "call_0"),
-                "name": fn.get("name", ""),
+                "name": name,
                 "arguments": fn.get("arguments", ""),
             }
         )
@@ -363,8 +371,8 @@ def chat_completion_to_response(payload: dict[str, Any], requested_model: str) -
     }
 
 
-def anthropic_to_response(payload: dict[str, Any], requested_model: str) -> dict[str, Any]:
-    response = chat_completion_to_response(anthropic_to_chat_response(payload, requested_model), requested_model)
+def anthropic_to_response(payload: dict[str, Any], requested_model: str, tool_types: dict[str, str] | None = None) -> dict[str, Any]:
+    response = chat_completion_to_response(anthropic_to_chat_response(payload, requested_model), requested_model, tool_types)
     response["usage"] = normalize_responses_usage(payload.get("usage"))
     return response
 
diff --git a/tests/test_native_tool_types.py b/tests/test_native_tool_types.py

Original file line number	Diff line number	Diff line change
`@@ -315,7 +315,7 @@ def anthropic_to_chat_response(payload: dict[str, Any], requested_model: str) ->`
`315`	`315`	`}`
`316`	`316`
`317`	`317`
`318`		`-def chat_completion_to_response(payload: dict[str, Any], requested_model: str) -> dict[str, Any]:`
	`318`	`+def chat_completion_to_response(payload: dict[str, Any], requested_model: str, tool_types: dict[str, str] \| None = None) -> dict[str, Any]:`
`319`	`319`	`choice = (payload.get("choices") or [{}])[0]`
`320`	`320`	`message = choice.get("message") or {}`
`321`	`321`	`output: list[dict[str, Any]] = []`
`@@ -340,15 +340,23 @@ def chat_completion_to_response(payload: dict[str, Any], requested_model: str) -`
`340`	`340`	`"content": [{"type": "output_text", "text": text, "annotations": []}],`
`341`	`341`	`}`
`342`	`342`	`)`
	`343`	`+ tool_types = tool_types or {}`
`343`	`344`	`for call in message.get("tool_calls") or []:`
`344`	`345`	`fn = call.get("function") or {}`
	`346`	`+ name = fn.get("name", "")`
	`347`	`+ original_type = tool_types.get(name, "")`
	`348`	`+ item_type = "function_call"`
	`349`	`+ if original_type == "apply_patch":`
	`350`	`+ item_type = "custom_tool_call"`
	`351`	`+ elif original_type.startswith("web_search"):`
	`352`	`+ item_type = "web_search_call"`
`345`	`353`	`output.append(`
`346`	`354`	`{`
`347`	`355`	`"id": call.get("id", "call_0"),`
`348`		`- "type": "function_call",`
	`356`	`+ "type": item_type,`
`349`	`357`	`"status": "completed",`
`350`	`358`	`"call_id": call.get("id", "call_0"),`
`351`		`- "name": fn.get("name", ""),`
	`359`	`+ "name": name,`
`352`	`360`	`"arguments": fn.get("arguments", ""),`
`353`	`361`	`}`
`354`	`362`	`)`
`@@ -363,8 +371,8 @@ def chat_completion_to_response(payload: dict[str, Any], requested_model: str) -`
`363`	`371`	`}`
`364`	`372`
`365`	`373`
`366`		`-def anthropic_to_response(payload: dict[str, Any], requested_model: str) -> dict[str, Any]:`
`367`		`- response = chat_completion_to_response(anthropic_to_chat_response(payload, requested_model), requested_model)`
	`374`	`+def anthropic_to_response(payload: dict[str, Any], requested_model: str, tool_types: dict[str, str] \| None = None) -> dict[str, Any]:`
	`375`	`+ response = chat_completion_to_response(anthropic_to_chat_response(payload, requested_model), requested_model, tool_types)`
`368`	`376`	`response["usage"] = normalize_responses_usage(payload.get("usage"))`
`369`	`377`	`return response`
`370`	`378`