server: update deepseek reasoning format (pass reasoning_content as diffs) #13933

Merged 7 commits on Jun 2, 2025

Changes from 5 commits

1 change: 1 addition & 0 deletions common/arg.cpp
@@ -2869,6 +2869,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
"(default: deepseek)",
[](common_params & params, const std::string & value) {
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }

Collaborator:
should we add a help message to explain this mode?

else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
else { throw std::invalid_argument("invalid value"); }
}

15 changes: 8 additions & 7 deletions common/chat.cpp
@@ -82,10 +82,10 @@ json common_chat_msg::to_json_oaicompat() const

std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const common_chat_msg & previous_msg, const common_chat_msg & new_msg) {
std::vector<common_chat_msg_diff> diffs;
// if (previous_msg.reasoning_content != current.reasoning_content) {
// auto & diff = diffs.emplace_back();
// diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, current.reasoning_content);
// }
if (previous_msg.reasoning_content != new_msg.reasoning_content) {
auto & diff = diffs.emplace_back();
diff.reasoning_content_delta = string_diff(previous_msg.reasoning_content, new_msg.reasoning_content);
}
if (previous_msg.content != new_msg.content) {
auto & diff = diffs.emplace_back();
diff.content_delta = string_diff(previous_msg.content, new_msg.content);
@@ -385,9 +385,9 @@ json common_chat_tools_to_json_oaicompat(const std::vector<common_chat_tool> & t

template <> json common_chat_msg_diff_to_json_oaicompat(const common_chat_msg_diff & diff) {
json delta = json::object();
// if (!diff.reasoning_content_delta.empty()) {
// delta["reasoning_content"] = msg.reasoning_content;
// }
if (!diff.reasoning_content_delta.empty()) {
delta["reasoning_content"] = diff.reasoning_content_delta;
}
if (!diff.content_delta.empty()) {
delta["content"] = diff.content_delta;
}
@@ -598,6 +598,7 @@ const char * common_reasoning_format_name(common_reasoning_format format) {
switch (format) {
case COMMON_REASONING_FORMAT_NONE: return "none";
case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
default:
throw std::runtime_error("Unknown reasoning format");
}
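
The change in this file means reasoning_content is now carried through the streaming path: compute_diffs emits a reasoning_content_delta whenever the parsed reasoning grows, and common_chat_msg_diff_to_json_oaicompat forwards it as delta["reasoning_content"] in each OpenAI-compatible chunk. A minimal Python sketch of the idea, under the assumption that string_diff returns the suffix of the new string after the previously seen prefix (the actual helper lives in the C++ common code):

def string_diff(previous: str, new: str) -> str:
    # Assumption: streamed text is append-only, so the previous snapshot is a
    # prefix of the new one and the delta is the remaining suffix.
    assert new.startswith(previous)
    return new[len(previous):]

def compute_delta(previous_msg: dict, new_msg: dict) -> dict:
    # Build an OpenAI-compatible `delta` object from two message snapshots.
    delta = {}
    if previous_msg.get("reasoning_content") != new_msg.get("reasoning_content"):
        delta["reasoning_content"] = string_diff(
            previous_msg.get("reasoning_content", ""),
            new_msg.get("reasoning_content", ""),
        )
    if previous_msg.get("content") != new_msg.get("content"):
        delta["content"] = string_diff(
            previous_msg.get("content", ""),
            new_msg.get("content", ""),
        )
    return delta

prev = {"reasoning_content": "I need to", "content": ""}
curr = {"reasoning_content": "I need to add 102 and 7.", "content": ""}
print(compute_delta(prev, curr))  # {'reasoning_content': ' add 102 and 7.'}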

2 changes: 1 addition & 1 deletion common/chat.h
@@ -70,7 +70,7 @@ struct common_chat_msg {
};

struct common_chat_msg_diff {
// std::string reasoning_content_delta;
std::string reasoning_content_delta;
std::string content_delta;
size_t tool_call_index = std::string::npos;
common_chat_tool_call tool_call_delta;

3 changes: 2 additions & 1 deletion common/common.h
@@ -215,7 +215,8 @@ struct common_params_vocoder {

enum common_reasoning_format {
COMMON_REASONING_FORMAT_NONE,
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`
COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
};

struct common_params {
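
In rough terms, the two formats now differ in what a streaming client sees. A hedged sketch with made-up delta payloads (actual chunking depends on the model and chat template):

# reasoning_format = deepseek-legacy: in stream mode the thinking stays inline
# in `content`, wrapped in <think> tags.
legacy_stream_deltas = [
    {"content": "<think>I need to add 102 and 7.</think>"},
    {"content": "The sum is 109."},
]

# reasoning_format = deepseek (the default): the thinking is extracted and
# streamed separately as `reasoning_content` deltas.
deepseek_stream_deltas = [
    {"reasoning_content": "I need to add 102 and 7."},
    {"content": "The sum is 109."},
]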

2 changes: 1 addition & 1 deletion tests/test-chat.cpp
@@ -19,8 +19,8 @@
using json = nlohmann::ordered_json;

static std::ostream & operator<<(std::ostream & os, const common_chat_msg_diff & diff) {
// os << "reasoning_content_delta: " << diff.reasoning_content_delta << '\n';
os << "{ content_delta: " << diff.content_delta << "; ";
os << "reasoning_content_delta: " << diff.reasoning_content_delta << "; ";
if (diff.tool_call_index != std::string::npos) {
os << "tool_call_index: " << diff.tool_call_index << "; ";
os << "tool_call_delta.name: " << diff.tool_call_delta.name << "; ";

Binary file modified tools/server/public/index.html.gz

2 changes: 1 addition & 1 deletion tools/server/server.cpp
@@ -360,7 +360,7 @@ struct server_task {
params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
}
params.oaicompat_chat_syntax.reasoning_format = params_base.reasoning_format;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream;
params.oaicompat_chat_syntax.reasoning_in_content = params.stream && (params_base.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY);
params.oaicompat_chat_syntax.thinking_forced_open = json_value(data, "thinking_forced_open", false);
params.oaicompat_chat_syntax.parse_tool_calls = json_value(data, "parse_tool_calls", false);
}
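
Previously reasoning_in_content was enabled for every streamed request; it is now restricted to the legacy format, so the default deepseek format streams proper reasoning_content deltas instead. A one-line Python restatement of the new condition (names are illustrative, not the server's actual symbols):

def reasoning_in_content(stream: bool, reasoning_format: str) -> bool:
    # Reasoning is only left inline in `content` for streamed requests that
    # explicitly use the legacy format.
    return stream and reasoning_format == "deepseek-legacy"

assert reasoning_in_content(True, "deepseek-legacy") is True
assert reasoning_in_content(True, "deepseek") is False
assert reasoning_in_content(False, "deepseek-legacy") is False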

13 changes: 6 additions & 7 deletions tools/server/tests/unit/test_tool_call.py
@@ -499,13 +499,12 @@ def do_test_calc_result(server: ServerProcess, result_override: str | None, n_pr


@pytest.mark.slow
@pytest.mark.parametrize("n_predict,reasoning_format,stream,expect_reasoning_content,expect_content,hf_repo,template_override", [
(128, 'deepseek', CompletionMode.NORMAL, None, "^The sum of 102 and 7 is 109[\\s\\S]*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(128, None, CompletionMode.NORMAL, None, "^The sum of 102 and 7 is 109[\\s\\S]*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(1024, 'deepseek', CompletionMode.NORMAL, "I need to calculate the sum of 102 and 7[\\s\\S]*", "To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
(1024, 'deepseek', CompletionMode.STREAMED, None, "^<think>I need to calculate [\\s\\S]*?</think>To find the sum of [\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
(1024, 'deepseek', CompletionMode.NORMAL, "First, I [\\s\\S]*", "To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
(1024, 'deepseek', CompletionMode.STREAMED, None, "^<think>First, I [\\s\\S]*?</think>To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
@pytest.mark.parametrize("stream", [CompletionMode.NORMAL, CompletionMode.STREAMED])
@pytest.mark.parametrize("n_predict,reasoning_format,expect_reasoning_content,expect_content,hf_repo,template_override", [
(128, 'deepseek', None, "^The sum of 102 and 7 is 109[\\s\\S]*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(128, None, None, "^The sum of 102 and 7 is 109[\\s\\S]*", "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M", None),
(1024, 'deepseek', "I need to calculate the sum of 102 and 7[\\s\\S]*", "To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
(1024, 'deepseek', "First, I [\\s\\S]*", "To find the sum of[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", ("llama-cpp-deepseek-r1", None)),
# (1024, 'none', CompletionMode.NORMAL, None, "^(<think>\\s*)?I need[\\s\\S]*?</think>\\s*To find[\\s\\S]*", "bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF:Q4_K_M", None),
# (128, 'deepseek', None, "^Okay, let me figure out the sum of 102 and 7[\\s\\S]*", "bartowski/Qwen_QwQ-32B-GGUF:Q4_K_M", None),
])

11 changes: 10 additions & 1 deletion tools/server/tests/utils.py
@@ -308,10 +308,12 @@ def make_any_request(
stream = data.get('stream', False)
if stream:
content: list[str] = []
reasoning_content: list[str] = []
tool_calls: list[dict] = []
finish_reason: Optional[str] = None

content_parts = 0
reasoning_content_parts = 0
tool_call_parts = 0
arguments_parts = 0

@@ -322,6 +324,10 @@
assert len(choice['delta']['content']) > 0, f'Expected non empty content delta!'
content.append(choice['delta']['content'])
content_parts += 1
if choice['delta'].get('reasoning_content') is not None:
assert len(choice['delta']['reasoning_content']) > 0, f'Expected non empty reasoning_content delta!'
reasoning_content.append(choice['delta']['reasoning_content'])
reasoning_content_parts += 1
if choice['delta'].get('finish_reason') is not None:
finish_reason = choice['delta']['finish_reason']
for tc in choice['delta'].get('tool_calls', []):
@@ -349,8 +355,10 @@
tool_call['function']['name'] = tool_call['function'].get('name', '') + fct['name']
if fct.get('arguments') is not None:
tool_call['function']['arguments'] += fct['arguments']
arguments_parts += 1
tool_call_parts += 1

print(f'Streamed response had {content_parts} content parts, {tool_call_parts} tool call parts incl. {arguments_parts} arguments parts')
print(f'Streamed response had {content_parts} content parts, {reasoning_content_parts} reasoning_content parts, {tool_call_parts} tool call parts incl. {arguments_parts} arguments parts')
result = dict(
choices=[
dict(
@@ -359,6 +367,7 @@
message=dict(
role='assistant',
content=''.join(content) if content else None,
reasoning_content=''.join(reasoning_content) if reasoning_content else None,
tool_calls=tool_calls if tool_calls else None,
),
)
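
For reference, a hedged end-to-end sketch of consuming the stream the same way the updated test helper does, against a locally running llama-server (URL, port, and request fields are assumptions; the framing follows the usual OpenAI-compatible "data: ..." / "data: [DONE]" server-sent-events convention):

import json
import requests

# Adjust the URL and request body to your setup; values here are assumptions.
resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "What is 102 + 7?"}],
        "stream": True,
    },
    stream=True,
)

reasoning_parts: list[str] = []
content_parts: list[str] = []
for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    delta = json.loads(payload)["choices"][0]["delta"]
    if delta.get("reasoning_content"):
        reasoning_parts.append(delta["reasoning_content"])
    if delta.get("content"):
        content_parts.append(delta["content"])

print("reasoning_content:", "".join(reasoning_parts))
print("content:", "".join(content_parts))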

42 changes: 5 additions & 37 deletions tools/server/webui/src/components/ChatMessage.tsx
@@ -12,12 +12,6 @@ import {
import ChatInputExtraContextItem from './ChatInputExtraContextItem';
import { BtnWithTooltips } from '../utils/common';

interface SplitMessage {
content: PendingMessage['content'];
thought?: string;
isThinking?: boolean;
}

export default function ChatMessage({
msg,
siblingLeafNodeIds,
@@ -55,32 +49,6 @@ export default function ChatMessage({
const nextSibling = siblingLeafNodeIds[siblingCurrIdx + 1];
const prevSibling = siblingLeafNodeIds[siblingCurrIdx - 1];

// for reasoning model, we split the message into content and thought
// TODO: implement this as remark/rehype plugin in the future
const { content, thought, isThinking }: SplitMessage = useMemo(() => {
if (msg.content === null || msg.role !== 'assistant') {
return { content: msg.content };
}
let actualContent = '';
let thought = '';
let isThinking = false;
let thinkSplit = msg.content.split('<think>', 2);
actualContent += thinkSplit[0];

Collaborator:
If the user doesn't explicitly enable the deepseek reasoning format (which is the default behavior), this code is there to make sure the thinking is always parsed.

Unless we can control the thinking format per-request, I don't think we can remove this code block.

Collaborator (Author):
reverted UI changes

while (thinkSplit[1] !== undefined) {
// <think> tag found
thinkSplit = thinkSplit[1].split('</think>', 2);
thought += thinkSplit[0];
isThinking = true;
if (thinkSplit[1] !== undefined) {
// </think> closing tag found
isThinking = false;
thinkSplit = thinkSplit[1].split('<think>', 2);
actualContent += thinkSplit[0];
}
}
return { content: actualContent, thought, isThinking };
}, [msg]);

if (!viewingChat) return null;

const isUser = msg.role === 'user';
@@ -141,7 +109,7 @@
{/* not editing content, render message */}
{editingContent === null && (
<>
{content === null ? (
{msg.content === null ? (
<>
{/* show loading dots for pending message */}
<span className="loading loading-dots loading-md"></span>
@@ -150,16 +118,16 @@
<>
{/* render message as markdown */}
<div dir="auto" tabIndex={0}>
{thought && (
{msg.reasoningContent && (
<ThoughtProcess
isThinking={!!isThinking && !!isPending}
content={thought}
isThinking={!!msg.reasoningContent && !!isPending}
content={msg.reasoningContent}
open={config.showThoughtInProgress}
/>
)}

<MarkdownDisplay
content={content}
content={msg.content}
isGenerating={isPending}
/>
</div>

14 changes: 10 additions & 4 deletions tools/server/webui/src/utils/app.context.tsx
@@ -186,6 +186,7 @@ export const AppContextProvider = ({
timestamp: pendingId,
role: 'assistant',
content: null,
reasoningContent: null,
parent: leafNodeId,
children: [],
};
@@ -254,12 +255,15 @@
if (chunk.error) {
throw new Error(chunk.error?.message || 'Unknown error');
}
const addedContent = chunk.choices[0].delta.content;
const lastContent = pendingMsg.content || '';
if (addedContent) {
const addedContent = chunk.choices[0].delta.content ?? '';
const addedReasoningContent =
chunk.choices[0].delta.reasoning_content ?? '';
if (addedContent.length > 0 || addedReasoningContent.length > 0) {
pendingMsg = {
...pendingMsg,
content: lastContent + addedContent,
content: (pendingMsg.content ?? '') + addedContent,
reasoningContent:
(pendingMsg.reasoningContent ?? '') + addedReasoningContent,
};
}
const timings = chunk.timings;
@@ -324,6 +328,7 @@
convId,
role: 'user',
content,
reasoningContent: null,
extra,
parent: leafNodeId,
children: [],
@@ -367,6 +372,7 @@
convId,
role: 'user',
content,
reasoningContent: null,
extra,
parent: parentNodeId,
children: [],

3 changes: 3 additions & 0 deletions tools/server/webui/src/utils/storage.ts
@@ -111,6 +111,7 @@ const StorageUtils = {
timestamp: now,
role: 'system',
content: '',
reasoningContent: null,
parent: -1,
children: [],
});
@@ -229,6 +230,7 @@ interface LSMessage {
id: number;
role: 'user' | 'assistant' | 'system';
content: string;
reasoningContent: string | null;
timings?: TimingReport;
}
async function migrationLStoIDB() {
@@ -267,6 +269,7 @@
timestamp: rootId,
role: 'system',
content: '',
reasoningContent: null,
parent: -1,
children: [firstMsg.id],
});

2 changes: 2 additions & 0 deletions tools/server/webui/src/utils/types.ts
@@ -41,6 +41,7 @@ export interface Message {
timestamp: number; // timestamp from Date.now()
role: 'user' | 'assistant' | 'system';
content: string;
reasoningContent: string | null;
timings?: TimingReport;
extra?: MessageExtra[];
// node based system for branching
@@ -112,6 +113,7 @@ export interface ViewingChat {

export type PendingMessage = Omit<Message, 'content'> & {
content: string | null;
reasoningContent: string | null;

Collaborator:
Tbh I'm not very confident adding this, because it is not future-proof.

If you look at how ChatGPT structures its messages, the content is an array instead of a string, and that is what I wanted to do in the near future. It will allow different message parts; for example, a message could contain both reasoning, a text response, and a tool call.

};

export enum CanvasType {