ai-dynamo
diff --git a/‎components/src/dynamo/frontend/sglang_prepost.py‎
Lines changed: 161 additions & 38 deletions b/‎components/src/dynamo/frontend/sglang_prepost.py‎
Lines changed: 161 additions & 38 deletions
@@ -5,6 +5,7 @@
 
 import json
 import logging
+import re
 from dataclasses import dataclass
 from typing import Any, TypeAlias
 
@@ -40,6 +41,102 @@ class SglangPreprocessResult:
     reasoning_parser: ReasoningParser | None
     guided_decoding: dict[str, Any] | None
     request: dict[str, Any]
+    force_reasoning: bool = False
+
+
+# --- force_reasoning detection (mirrors sglang's template_manager) -------
+#
+# sglang's template_manager sets ``_force_reasoning`` once at startup by
+# scanning the chat template for ``<|im_start|>assistant\n<think>\n``
+# (the qwen3 pattern). We broaden that to also catch GLM-4.5/5 templates
+# which open a thinking block right before the generation prompt.
+#
+# A static, per-server boolean is plenty: per-request decoding of prompt
+# tails adds latency on the hot path with nothing to show for it. The
+# per-request knobs live downstream (``separate_reasoning``,
+# ``chat_template_kwargs.enable_thinking``), matching sglang's API.
+# Patterns are tried in order by ``detect_force_reasoning_from_template``;
+# the first one that matches anywhere in the template text wins.
+_FORCE_REASONING_PATTERNS = (
+    # qwen3-family: <|im_start|>assistant\n<think>\n
+    # The regex's ``\\n`` matches a backslash followed by the letter ``n``
+    # (the escape sequence as it is written in the template source), not an
+    # actual newline character.
+    re.compile(r"<\|im_start\|>assistant\\n<think>\\n"),
+    # GLM-4.5/5 and similar: <|assistant|> followed by an opening <think>
+    # within the generation-prompt block. The template often has Jinja
+    # expressions (including a '</think>' literal) between the two, so we
+    # match the opening tag literally -- '<think>' never matches
+    # '</think>' because the '/' breaks the literal prefix. The lazy
+    # ``{0,400}?`` bounds the gap to at most 400 characters of any kind
+    # (``[\s\S]`` includes newlines).
+    re.compile(r"<\|assistant\|>[\s\S]{0,400}?<think>"),
+    # generic fallback for non-delimiter-style templates: the whole word
+    # ``assistant`` followed by ``<think>`` within 200 characters.
+    # NOTE(review): this is broad -- it would also match a template that
+    # mentions ``<think>`` shortly after ``assistant`` for another reason
+    # (e.g. while rendering history); confirm against the templates in use.
+    re.compile(r"\bassistant\b[\s\S]{0,200}?<think>"),
+)
+
+
+def detect_force_reasoning_from_template(chat_template: str | None) -> bool:
+    """Tell whether a chat template opens a reasoning block by itself.
+
+    The template text is searched with each compiled pattern in
+    ``_FORCE_REASONING_PATTERNS``; a single hit is enough. Empty, ``None``
+    or non-string templates always yield ``False``.
+
+    Meant to run once at processor startup on ``tokenizer.chat_template``,
+    with the result cached on the processor.
+    """
+    usable = bool(chat_template) and isinstance(chat_template, str)
+    if not usable:
+        return False
+    return any(
+        pattern.search(chat_template) for pattern in _FORCE_REASONING_PATTERNS
+    )
+
+
+# Reasoning parsers that default to "thinking on" unless the client
+# explicitly opts out via chat_template_kwargs. Mirrors sglang's
+# serving_chat._get_reasoning_from_request table.
+_THINKING_BY_DEFAULT = {"qwen3", "glm45", "nemotron_3", "interns1", "kimi_k2"}
+# Reasoning parsers that stay "thinking off" unless the client explicitly
+# opts in via chat_template_kwargs.
+_THINKING_OPT_IN = {"deepseek-v3", "gemma4"}
+
+
+def resolve_request_force_reasoning(
+    request: dict[str, Any],
+    reasoning_parser_name: str | None,
+    template_default: bool,
+) -> bool:
+    """Resolve the effective force_reasoning flag for a single request.
+
+    Mirrors sglang.srt.entrypoints.openai.serving_chat._get_reasoning_from_request
+    combined with template_manager.force_reasoning:
+
+      * no reasoning parser configured: always ``False``.
+      * opt-out families (``glm45``/``qwen3``/``kimi_k2``/...): on by
+        default, ``chat_template_kwargs.enable_thinking=False`` (or
+        ``thinking=False`` for ``kimi_k2``) disables it.
+      * opt-in families: off by default; ``deepseek-v3`` is enabled by
+        ``chat_template_kwargs.thinking=True`` and ``gemma4`` by
+        ``chat_template_kwargs.enable_thinking=True``. Only that one key
+        is consulted per parser.
+      * anything else: follow the statically-detected template default.
+
+    Only the literal booleans ``True``/``False`` count as an explicit
+    client choice; any other value (including ``None``) leaves the family
+    default in place.
+
+    Args:
+        request: The chat request; only ``chat_template_kwargs`` is read.
+        reasoning_parser_name: Name of the configured reasoning parser, or
+            ``None``/empty when reasoning parsing is disabled.
+        template_default: Static flag derived from the chat template.
+
+    Returns:
+        Whether the reasoning parser should start in reasoning mode.
+    """
+    if not reasoning_parser_name:
+        return False
+
+    kwargs = request.get("chat_template_kwargs")
+    # A malformed ``chat_template_kwargs`` (string, list, ...) has no
+    # ``.get``; treat it like an absent one instead of raising
+    # AttributeError here.
+    if not hasattr(kwargs, "get"):
+        kwargs = {}
+
+    if reasoning_parser_name in _THINKING_BY_DEFAULT:
+        flag_key = "thinking" if reasoning_parser_name == "kimi_k2" else "enable_thinking"
+        return kwargs.get(flag_key) is not False
+
+    if reasoning_parser_name in _THINKING_OPT_IN:
+        flag_key = "thinking" if reasoning_parser_name == "deepseek-v3" else "enable_thinking"
+        return kwargs.get(flag_key) is True
+
+    return template_default
+
+
+def _client_wants_separate_reasoning(request: dict[str, Any]) -> bool:
+    """Honor the client's ``separate_reasoning`` flag (default True).
+
+    Matches sglang's ChatCompletionRequest.separate_reasoning: a client
+    sending ``separate_reasoning=False`` asks for thinking text to land in
+    ``delta.content`` instead of ``delta.reasoning_content``. We implement
+    that by skipping reasoning-parser creation entirely for the request.
+
+    Value handling:
+
+      * missing key or explicit ``None`` (JSON ``null``): the default, True.
+      * ``bool``: used as-is.
+      * ``str``: False only for ``"0"``, ``"false"``, ``"no"``, ``"off"``
+        (case-insensitive, surrounding whitespace ignored); True otherwise.
+      * anything else: its truthiness.
+    """
+    value = request.get("separate_reasoning")
+    # An explicit null means "not specified"; without this check it would
+    # fall through to ``bool(None)`` and silently disable reasoning
+    # separation, contradicting the documented default.
+    if value is None:
+        return True
+    if isinstance(value, bool):
+        return value
+    if isinstance(value, str):
+        return value.strip().lower() not in ("0", "false", "no", "off")
+    return bool(value)
 
 
 def convert_tools(tools: list[dict[str, Any]] | None) -> list[SglangTool] | None:
@@ -73,22 +170,39 @@ def _materialize_messages(messages: list[Any]) -> list[dict[str, Any]]:
             normalized.append(msg)
         else:
             normalized.append(dict(msg))
+    _parse_tool_call_arguments(normalized)
     return normalized
 
 
-def detect_force_reasoning(tokenizer, prompt_token_ids: list[int]) -> bool:
-    """Check if the chat template's generation prompt ends with ``<think>``.
+def _parse_tool_call_arguments(messages: list[dict[str, Any]]) -> None:
+    """In-place: parse ``tool_calls[*].function.arguments`` JSON strings to dicts.
 
-    When the template appends ``<think>`` to the prompt, the model output
-    starts inside a reasoning block without an explicit opening tag.
-    The reasoning parser must be told to begin in reasoning mode
-    (``force_reasoning=True``) so that it correctly separates reasoning
-    content from normal content.
+    OpenAI sends tool-call arguments as JSON strings, but some chat templates
+    (e.g. GLM-4.5/5) iterate ``arguments.items()`` and raise a Jinja2
+    ``UndefinedError`` on a string. Decoding here keeps the on-wire request
+    OpenAI-compatible while giving the template a mapping to iterate.
+    Malformed JSON is left untouched so downstream validation still sees it.
+
+    Only string ``arguments`` are touched: an empty string becomes ``{}``,
+    a string that decodes to a JSON object is replaced by that dict, and a
+    string that decodes to anything else (list, number, ...) is left as-is.
+
+    NOTE(review): the nested ``tool_calls`` / ``function`` dicts mutated
+    here may be the very objects held by the caller's request (the caller
+    copies messages shallowly at most) -- confirm that the request passed
+    on afterwards is expected to carry decoded arguments.
     """
-    if not prompt_token_ids:
-        return False
-    tail = tokenizer.decode(prompt_token_ids[-10:], skip_special_tokens=False)
-    return tail.rstrip().endswith("<think>")
+    for m in messages:
+        # Non-dict messages and messages without tool calls are skipped.
+        tool_calls = m.get("tool_calls") if isinstance(m, dict) else None
+        if not tool_calls:
+            continue
+        for tc in tool_calls:
+            if not isinstance(tc, dict):
+                continue
+            # Accept both the nested shape ({"function": {"arguments": ...}})
+            # and a flat shape with "arguments" directly on the tool call.
+            target = tc.get("function") if isinstance(tc.get("function"), dict) else tc
+            args = target.get("arguments")
+            # Already-decoded (or missing) arguments need no work.
+            if not isinstance(args, str):
+                continue
+            # Empty string: hand the template an empty mapping.
+            if not args:
+                target["arguments"] = {}
+                continue
+            try:
+                parsed = json.loads(args)
+            except (ValueError, TypeError):
+                # Malformed JSON: keep the original string.
+                continue
+            # Only JSON objects replace the string; other JSON values stay
+            # in their original string form.
+            if isinstance(parsed, dict):
+                target["arguments"] = parsed
 
 
 def create_parsers(
@@ -128,13 +242,11 @@ def create_parsers(
 
     reasoning_parser = None
     if reasoning_parser_name:
-        kwargs: dict[str, Any] = {
-            "model_type": reasoning_parser_name,
-            "stream_reasoning": True,
-        }
-        if force_reasoning:
-            kwargs["force_reasoning"] = True
-        reasoning_parser = ReasoningParser(**kwargs)
+        reasoning_parser = ReasoningParser(
+            model_type=reasoning_parser_name,
+            stream_reasoning=True,
+            force_reasoning=force_reasoning,
+        )
 
     return tool_call_parser, reasoning_parser
 
@@ -212,19 +324,16 @@ def build_tool_call_guided_decoding(
 
 
 def _normalize_prompt_token_ids(prompt_token_ids: Any) -> list[int]:
-    if isinstance(prompt_token_ids, list):
-        return prompt_token_ids
-
-    input_ids = getattr(prompt_token_ids, "input_ids", None)
-    if input_ids is not None and not isinstance(input_ids, str):
-        return list(input_ids)
-
-    if isinstance(prompt_token_ids, dict):
-        dict_input_ids = prompt_token_ids.get("input_ids")
-        if dict_input_ids is not None and not isinstance(dict_input_ids, str):
-            return list(dict_input_ids)
+    """Flatten ``apply_chat_template`` output to ``list[int]``.
 
-    return list(prompt_token_ids)
+    On transformers v5 the default ``TokenizersBackend`` returns a
+    ``BatchEncoding`` from ``apply_chat_template(..., tokenize=True)``;
+    unwrap to ``.input_ids`` (a flat list for a single conversation).
+
+    Accepted inputs: an object exposing ``.input_ids``, a plain dict with
+    an ``"input_ids"`` key, or any iterable of token ids.
+
+    Raises:
+        TypeError: if the value is a dict without ``"input_ids"`` or the
+            token ids turn out to be a string. Converting those with
+            ``list(...)`` would silently yield key names or characters
+            instead of token ids.
+    """
+    ids = getattr(prompt_token_ids, "input_ids", prompt_token_ids)
+    if isinstance(ids, dict):
+        if "input_ids" not in ids:
+            raise TypeError(
+                "apply_chat_template returned a mapping without 'input_ids'"
+            )
+        ids = ids["input_ids"]
+    if isinstance(ids, str):
+        raise TypeError("apply_chat_template returned text, expected token ids")
+    return list(ids)
 
 
 def preprocess_chat_request(
@@ -234,13 +343,33 @@ def preprocess_chat_request(
     tool_call_parser_name: str | None,
     reasoning_parser_name: str | None,
     exclude_tools_when_tool_choice_none: bool = True,
+    template_force_reasoning: bool = False,
 ) -> SglangPreprocessResult:
     """Preprocess a chat request using SGLang tokenizer and parser APIs.
 
+    ``template_force_reasoning`` is the static per-server flag derived from
+    the chat template (see :func:`detect_force_reasoning_from_template`);
+    the effective per-request value combines it with client knobs
+    (``separate_reasoning``, ``chat_template_kwargs.enable_thinking``).
+
     Synchronous -- suitable for both main-process and worker-process execution.
     """
     messages = _materialize_messages(request.get("messages", []))
 
+    # Per-request client escape hatch: skip reasoning parsing entirely when
+    # the client sends ``separate_reasoning=False`` -- thinking text then
+    # lands in ``delta.content`` instead of ``delta.reasoning_content``.
+    effective_reasoning_parser_name = (
+        reasoning_parser_name
+        if _client_wants_separate_reasoning(request)
+        else None
+    )
+    force_reasoning = resolve_request_force_reasoning(
+        request,
+        effective_reasoning_parser_name,
+        template_force_reasoning,
+    )
+
     # Convert tools to SGLang format (done once, shared with parser creation)
     sglang_tools = convert_tools(request.get("tools"))
 
@@ -262,7 +391,6 @@ def preprocess_chat_request(
     template_kwargs: dict[str, Any] = {
         "add_generation_prompt": True,
         "tokenize": True,
-        "return_dict": False,
     }
     # Strip tools from template when tool_choice=none so the model doesn't
     # see them and generate raw XML tool calls in its response.
@@ -283,16 +411,10 @@ def preprocess_chat_request(
         tokenizer.apply_chat_template(messages, **template_kwargs)
     )
 
-    force_reasoning = (
-        detect_force_reasoning(tokenizer, prompt_token_ids)
-        if reasoning_parser_name
-        else False
-    )
-
     tool_call_parser, reasoning_parser = create_parsers(
         request,
         tool_call_parser_name=tool_call_parser_name,
-        reasoning_parser_name=reasoning_parser_name,
+        reasoning_parser_name=effective_reasoning_parser_name,
         sglang_tools=sglang_tools,
         force_reasoning=force_reasoning,
     )
@@ -308,6 +430,7 @@ def preprocess_chat_request(
         reasoning_parser=reasoning_parser,
         guided_decoding=guided_decoding,
         request=request,
+        force_reasoning=force_reasoning,
     )