fix(frontend): auto-detect force_reasoning when chat template appends <think>

navmarri14 · navmarri14 · commit ef6d869a3016 · 2026-04-28T15:35:57.000-07:00
Signed-off-by: Naveen Marri <nmarri@roblox.com>
diff --git a/components/src/dynamo/frontend/sglang_prepost.py b/components/src/dynamo/frontend/sglang_prepost.py
@@ -5,6 +5,7 @@
 
 import json
 import logging
+import re
 from dataclasses import dataclass
 from typing import Any, TypeAlias
 
@@ -40,6 +41,106 @@ class SglangPreprocessResult:
     reasoning_parser: ReasoningParser | None
     guided_decoding: dict[str, Any] | None
     request: dict[str, Any]
+    force_reasoning: bool = False
+
+
+# --- force_reasoning detection (mirrors sglang's template_manager) -------
+#
+# sglang's template_manager sets ``_force_reasoning`` once at startup by
+# scanning the chat template for ``<|im_start|>assistant\n<think>\n``
+# (the qwen3 pattern). We broaden that to also catch GLM-4.5/5 templates
+# which open a thinking block right before the generation prompt.
+#
+# A static, per-server boolean is plenty: per-request decoding of prompt
+# tails adds latency on the hot path with nothing to show for it. The
+# per-request knobs live downstream (``separate_reasoning``,
+# ``chat_template_kwargs.enable_thinking``), matching sglang's API.
+_FORCE_REASONING_PATTERNS = (
+    # Qwen3 style: the assistant header is immediately followed by an
+    # opened think block. The doubled backslashes match the literal
+    # backslash-n escape as it is written in the template source text.
+    re.compile(r"<\|im_start\|>assistant\\n<think>\\n"),
+    # GLM-4.5/5 style: an opening <think> at most 400 characters after
+    # <|assistant|>. Intervening text may contain a '</think>' literal;
+    # that can never satisfy the '<think>' match because of its '/'.
+    re.compile(r"<\|assistant\|>[\s\S]{0,400}?<think>"),
+    # Catch-all for templates without delimiter tokens.
+    # NOTE(review): this fires on any '<think>' within 200 characters
+    # after the word 'assistant', whether or not that text belongs to the
+    # generation prompt -- confirm this breadth is intended.
+    re.compile(r"\bassistant\b[\s\S]{0,200}?<think>"),
+)
+
+
+def detect_force_reasoning_from_template(chat_template: str | None) -> bool:
+    """Report whether ``chat_template`` matches a force-reasoning pattern.
+
+    Returns False for ``None``, an empty value, or anything that is not a
+    ``str``; otherwise returns True as soon as any entry of
+    ``_FORCE_REASONING_PATTERNS`` is found in the template text.
+
+    Meant to run once at processor startup on ``tokenizer.chat_template``,
+    with the result cached by the caller.
+    """
+    if not chat_template:
+        return False
+    if not isinstance(chat_template, str):
+        return False
+    return any(
+        pattern.search(chat_template) is not None
+        for pattern in _FORCE_REASONING_PATTERNS
+    )
+
+
+# Parser families whose thinking mode is ON unless the request switches it
+# off through chat_template_kwargs. Intended to track sglang's
+# serving_chat._get_reasoning_from_request table.
+_THINKING_BY_DEFAULT = {"qwen3", "glm45", "nemotron_3", "interns1", "kimi_k2"}
+# Parser families whose thinking mode is OFF unless the request switches it on.
+_THINKING_OPT_IN = {"deepseek-v3", "gemma4"}
+
+
+def resolve_request_force_reasoning(
+    request: dict[str, Any],
+    reasoning_parser_name: str | None,
+    template_default: bool,
+) -> bool:
+    """Compute the per-request ``force_reasoning`` value.
+
+    Decision order:
+
+      1. No reasoning parser name -> ``False``.
+      2. Parser in ``_THINKING_BY_DEFAULT`` -> ``True`` unless
+         ``chat_template_kwargs`` carries an explicit ``False`` under
+         ``enable_thinking`` (``thinking`` for ``kimi_k2``).
+      3. Parser in ``_THINKING_OPT_IN`` -> ``True`` only when
+         ``chat_template_kwargs`` carries an explicit ``True`` under
+         ``enable_thinking`` (``thinking`` for ``deepseek-v3``).
+      4. Any other parser -> ``template_default``, the value detected
+         statically from the chat template.
+
+    The flags are compared by identity, so only the real booleans count;
+    truthy/falsy stand-ins such as ``0`` or ``"false"`` are ignored.
+    """
+    if not reasoning_parser_name:
+        return False
+
+    # A missing or null ``chat_template_kwargs`` behaves like an empty mapping.
+    template_kwargs = request.get("chat_template_kwargs") or {}
+
+    if reasoning_parser_name in _THINKING_BY_DEFAULT:
+        key = "enable_thinking"
+        if reasoning_parser_name == "kimi_k2":
+            key = "thinking"
+        return template_kwargs.get(key) is not False
+
+    if reasoning_parser_name in _THINKING_OPT_IN:
+        key = "enable_thinking"
+        if reasoning_parser_name == "deepseek-v3":
+            key = "thinking"
+        return template_kwargs.get(key) is True
+
+    return template_default
+
+
+def _client_wants_separate_reasoning(request: dict[str, Any]) -> bool:
+    """Return whether the client wants reasoning split out (default True).
+
+    Reads ``request["separate_reasoning"]``. When this returns False the
+    caller skips reasoning-parser creation, so thinking text lands in
+    ``delta.content`` instead of ``delta.reasoning_content``.
+
+    Accepted values:
+      * key absent or ``None`` (JSON ``null``) -> True, the default;
+      * ``bool`` -> used as-is;
+      * ``str`` -> False for "0", "false", "no" or "off" (case-insensitive,
+        surrounding whitespace ignored), True for anything else;
+      * any other type -> its truthiness.
+    """
+    value = request.get("separate_reasoning")
+    if value is None:
+        # An explicit null means "not set"; it must not flip the default off.
+        return True
+    if isinstance(value, bool):
+        return value
+    if isinstance(value, str):
+        return value.strip().lower() not in ("0", "false", "no", "off")
+    return bool(value)
 
 
 def convert_tools(tools: list[dict[str, Any]] | None) -> list[SglangTool] | None:
@@ -73,15 +174,48 @@ def _materialize_messages(messages: list[Any]) -> list[dict[str, Any]]:
             normalized.append(msg)
         else:
             normalized.append(dict(msg))
+    _parse_tool_call_arguments(normalized)
     return normalized
 
 
+def _parse_tool_call_arguments(messages: list[dict[str, Any]]) -> None:
+    """Decode JSON-string tool-call arguments into dicts, mutating in place.
+
+    For every dict message with a non-empty ``tool_calls`` value, each dict
+    entry is inspected. The ``arguments`` field is taken from the entry's
+    ``function`` dict when there is one, otherwise from the entry itself.
+
+      * non-string ``arguments`` -> left alone;
+      * empty string -> replaced with ``{}``;
+      * string that decodes to a JSON object -> replaced with that dict;
+      * undecodable JSON, or JSON that is not an object -> left alone so
+        later validation still sees the original text.
+
+    Motivation given by the original author: OpenAI clients send arguments
+    as JSON strings, while some chat templates (GLM-4.5/5 is cited) iterate
+    ``arguments.items()`` and fail on a plain string.
+    """
+    for message in messages:
+        if not isinstance(message, dict):
+            continue
+        calls = message.get("tool_calls")
+        if not calls:
+            continue
+        for call in calls:
+            if not isinstance(call, dict):
+                continue
+            function = call.get("function")
+            # Arguments live under "function" when it is a dict, else on the call.
+            holder = function if isinstance(function, dict) else call
+            raw = holder.get("arguments")
+            if not isinstance(raw, str):
+                continue
+            if not raw:
+                holder["arguments"] = {}
+                continue
+            try:
+                decoded = json.loads(raw)
+            except (ValueError, TypeError):
+                continue
+            if isinstance(decoded, dict):
+                holder["arguments"] = decoded
+
+
 def create_parsers(
     request: dict[str, Any],
     *,
     tool_call_parser_name: str | None,
     reasoning_parser_name: str | None,
     sglang_tools: list[SglangTool] | None = None,
+    force_reasoning: bool = False,
 ) -> tuple[ToolCallParserType | None, ReasoningParser | None]:
     """Create tool call and reasoning parsers for a request.
 
@@ -115,6 +249,7 @@ def create_parsers(
         reasoning_parser = ReasoningParser(
             model_type=reasoning_parser_name,
             stream_reasoning=True,
+            force_reasoning=force_reasoning,
         )
 
     return tool_call_parser, reasoning_parser
@@ -193,19 +328,16 @@ def build_tool_call_guided_decoding(
 
 
 def _normalize_prompt_token_ids(prompt_token_ids: Any) -> list[int]:
-    if isinstance(prompt_token_ids, list):
-        return prompt_token_ids
+    """Flatten ``apply_chat_template`` output to ``list[int]``.
 
-    input_ids = getattr(prompt_token_ids, "input_ids", None)
-    if input_ids is not None and not isinstance(input_ids, str):
-        return list(input_ids)
-
-    if isinstance(prompt_token_ids, dict):
-        dict_input_ids = prompt_token_ids.get("input_ids")
-        if dict_input_ids is not None and not isinstance(dict_input_ids, str):
-            return list(dict_input_ids)
-
-    return list(prompt_token_ids)
+    On transformers v5 the default ``TokenizersBackend`` returns a
+    ``BatchEncoding`` from ``apply_chat_template(..., tokenize=True)``;
+    unwrap to ``.input_ids`` (a flat list for a single conversation).
+    """
+    # Prefer the ``input_ids`` attribute when the object has one; otherwise
+    # keep working on the value itself.
+    ids = getattr(prompt_token_ids, "input_ids", prompt_token_ids)
+    if isinstance(ids, dict):
+        # Mapping form: look the ids up by key.
+        # NOTE(review): when the key is absent this falls back to the
+        # original object; if that is a plain dict, ``list()`` below yields
+        # its keys rather than token ids -- confirm callers never hit this.
+        ids = ids.get("input_ids", prompt_token_ids)
+    # Always builds a new list, even when the input already was a list.
+    return list(ids)
 
 
 def preprocess_chat_request(
@@ -215,13 +347,31 @@ def preprocess_chat_request(
     tool_call_parser_name: str | None,
     reasoning_parser_name: str | None,
     exclude_tools_when_tool_choice_none: bool = True,
+    template_force_reasoning: bool = False,
 ) -> SglangPreprocessResult:
     """Preprocess a chat request using SGLang tokenizer and parser APIs.
 
+    ``template_force_reasoning`` is the static per-server flag derived from
+    the chat template (see :func:`detect_force_reasoning_from_template`);
+    the effective per-request value combines it with client knobs
+    (``separate_reasoning``, ``chat_template_kwargs.enable_thinking``).
+
     Synchronous -- suitable for both main-process and worker-process execution.
     """
     messages = _materialize_messages(request.get("messages", []))
 
+    # Per-request client escape hatch: skip reasoning parsing entirely when
+    # the client sends ``separate_reasoning=False`` -- thinking text then
+    # lands in ``delta.content`` instead of ``delta.reasoning_content``.
+    effective_reasoning_parser_name = (
+        reasoning_parser_name if _client_wants_separate_reasoning(request) else None
+    )
+    force_reasoning = resolve_request_force_reasoning(
+        request,
+        effective_reasoning_parser_name,
+        template_force_reasoning,
+    )
+
     # Convert tools to SGLang format (done once, shared with parser creation)
     sglang_tools = convert_tools(request.get("tools"))
 
@@ -266,8 +416,9 @@ def preprocess_chat_request(
     tool_call_parser, reasoning_parser = create_parsers(
         request,
         tool_call_parser_name=tool_call_parser_name,
-        reasoning_parser_name=reasoning_parser_name,
+        reasoning_parser_name=effective_reasoning_parser_name,
         sglang_tools=sglang_tools,
+        force_reasoning=force_reasoning,
     )
     guided_decoding = build_tool_call_guided_decoding(
         request,
@@ -281,6 +432,7 @@ def preprocess_chat_request(
         reasoning_parser=reasoning_parser,
         guided_decoding=guided_decoding,
         request=request,
+        force_reasoning=force_reasoning,
     )
 
 
diff --git a/components/src/dynamo/frontend/sglang_processor.py b/components/src/dynamo/frontend/sglang_processor.py
@@ -33,9 +33,11 @@
 from .sglang_prepost import (
     SglangStreamingPostProcessor,
     ToolCallParserType,
+    _client_wants_separate_reasoning,
     _get_history_tool_calls_count,
     convert_tools,
     create_parsers,
+    detect_force_reasoning_from_template,
     preprocess_chat_request,
 )
 from .utils import PreprocessError, extract_mm_urls, random_uuid, worker_warmup
@@ -104,6 +106,7 @@ def _map_finish_reason(raw: str | None) -> str | None:
 _w_tool_call_parser_name: str | None = None
 _w_reasoning_parser_name: str | None = None
 _w_exclude_tools_when_tool_choice_none: bool = True
+_w_template_force_reasoning: bool = False
 
 
 @dataclass
@@ -113,6 +116,12 @@ class SglangPreprocessWorkerResult:
     prompt_token_ids: list[int]
     dynamo_preproc: dict[str, Any]
     request: dict[str, Any]
+    force_reasoning: bool = False
+    # ``effective_reasoning_parser_name`` is None when the request opted out
+    # via ``separate_reasoning=False``; the main process must skip creating
+    # a reasoning parser in that case so the pool path matches the inline
+    # path byte-for-byte.
+    effective_reasoning_parser_name: str | None = None
 
 
 def _init_worker(
@@ -121,14 +130,16 @@ def _init_worker(
     reasoning_parser_name: str | None,
     exclude_tools_when_tool_choice_none: bool = True,
     trust_remote_code: bool = False,
+    template_force_reasoning: bool = False,
 ) -> None:
     """Initialize a worker process with its own tokenizer."""
     global _w_tokenizer, _w_tool_call_parser_name, _w_reasoning_parser_name
-    global _w_exclude_tools_when_tool_choice_none
+    global _w_exclude_tools_when_tool_choice_none, _w_template_force_reasoning
     _w_tokenizer = get_tokenizer(model_path, trust_remote_code=trust_remote_code)
     _w_tool_call_parser_name = tool_call_parser_name
     _w_reasoning_parser_name = reasoning_parser_name
     _w_exclude_tools_when_tool_choice_none = exclude_tools_when_tool_choice_none
+    _w_template_force_reasoning = template_force_reasoning
 
 
 def _preprocess_worker(
@@ -143,6 +154,7 @@ def _preprocess_worker(
         tool_call_parser_name=_w_tool_call_parser_name,
         reasoning_parser_name=_w_reasoning_parser_name,
         exclude_tools_when_tool_choice_none=_w_exclude_tools_when_tool_choice_none,
+        template_force_reasoning=_w_template_force_reasoning,
     )
 
     n = request.get("n", 1)
@@ -158,10 +170,16 @@ def _preprocess_worker(
         pre.tool_call_parser,
     )
 
+    effective_reasoning_parser_name = (
+        _w_reasoning_parser_name if _client_wants_separate_reasoning(request) else None
+    )
+
     return SglangPreprocessWorkerResult(
         prompt_token_ids=pre.prompt_token_ids,
         dynamo_preproc=dynamo_preproc,
         request=request,
+        force_reasoning=pre.force_reasoning,
+        effective_reasoning_parser_name=effective_reasoning_parser_name,
     )
 
 
@@ -254,6 +272,20 @@ def __init__(
         stream_interval: int = 1,
     ):
         self.tokenizer = tokenizer
+        # Detect force_reasoning once from the chat template, matching
+        # sglang's template_manager. Per-request overrides still apply
+        # (see resolve_request_force_reasoning).
+        self.template_force_reasoning = detect_force_reasoning_from_template(
+            getattr(tokenizer, "chat_template", None)
+        )
+        if self.template_force_reasoning:
+            logger.info(
+                "Detected force-reasoning pattern in chat template; "
+                "thinking tokens will route to delta.reasoning_content by "
+                "default (clients can opt out via "
+                "separate_reasoning=false or "
+                "chat_template_kwargs.enable_thinking=false)."
+            )
         self.router = router
         self.is_kv_router = isinstance(router, KvRouter)
         self.tool_call_parser_name = tool_call_parser_name
@@ -317,6 +349,7 @@ async def _generator_inner(
                 tool_call_parser_name=self.tool_call_parser_name,
                 reasoning_parser_name=self.reasoning_parser_name,
                 exclude_tools_when_tool_choice_none=self.exclude_tools_when_tool_choice_none,
+                template_force_reasoning=self.template_force_reasoning,
             )
 
             if self.debug_perf:
@@ -405,10 +438,15 @@ async def _generator_inner_pool(
             return
 
         # --- Phase 2: Recreate parsers in main process (not picklable) ---
+        # The worker already decided effective_reasoning_parser_name based on
+        # the request's separate_reasoning flag and computed force_reasoning;
+        # we mirror those choices to keep pool- and inline-path outputs
+        # identical.
         tool_call_parser, reasoning_parser = create_parsers(
             request,
             tool_call_parser_name=self.tool_call_parser_name,
-            reasoning_parser_name=self.reasoning_parser_name,
+            reasoning_parser_name=preproc_result.effective_reasoning_parser_name,
+            force_reasoning=preproc_result.force_reasoning,
         )
 
         post = SglangStreamingPostProcessor(
@@ -596,6 +634,13 @@ async def chat_engine_factory(
 
         eos_token_id = getattr(tokenizer, "eos_token_id", None)
 
+        # Static reasoning-template scan (mirrors sglang's template_manager).
+        # Shared with worker-pool processes via initargs so they compute the
+        # same per-request force_reasoning flag as the main process.
+        template_force_reasoning = detect_force_reasoning_from_template(
+            getattr(tokenizer, "chat_template", None)
+        )
+
         tool_call_parser_name = (
             self.tool_call_parser_name
             or _runtime_config_parser_name(mdc, "tool_call_parser")
@@ -643,6 +688,7 @@ async def chat_engine_factory(
                     reasoning_parser_name,
                     self.config.exclude_tools_when_tool_choice_none,
                     self.trust_remote_code,
+                    template_force_reasoning,
                 ),
             )
             futures = [