replace tui_markdown with a custom markdown renderer (#3396)

Also, simplify the streaming behavior. This fixes a number of display issues with streaming markdown, and paves the way for better markdown features (e.g. customizable styles, syntax highlighting, markdown-aware wrapping). Not currently supported: - footnotes - tables - reference-style links
2025-09-10 12:13:53 -07:00
parent acb28bf914
commit 8068cc75f8
16 changed files with 2309 additions and 983 deletions
--- a/codex-rs/tui/src/markdown.rs
+++ b/codex-rs/tui/src/markdown.rs
@@ -1,8 +1,6 @@
-use crate::citation_regex::CITATION_REGEX;
 use codex_core::config::Config;
 use codex_core::config_types::UriBasedFileOpener;
 use ratatui::text::Line;
-use std::borrow::Cow;
 use std::path::Path;

 pub(crate) fn append_markdown(
@@ -19,238 +17,13 @@ fn append_markdown_with_opener_and_cwd(
    file_opener: UriBasedFileOpener,
    cwd: &Path,
 ) {
-    // Historically, we fed the entire `markdown_source` into the renderer in
-    // one pass. However, fenced code blocks sometimes lost leading whitespace
-    // when formatted by the markdown renderer/highlighter. To preserve code
-    // block content exactly, split the source into "text" and "code" segments:
-    // - Render non-code text through `tui_markdown` (with citation rewrite).
-    // - Render code block content verbatim as plain lines without additional
-    //   formatting, preserving leading spaces.
-    for seg in split_text_and_fences(markdown_source) {
-        match seg {
-            Segment::Text(s) => {
-                let processed = rewrite_file_citations(&s, file_opener, cwd);
-                let rendered = tui_markdown::from_str(&processed);
-                crate::render::line_utils::push_owned_lines(&rendered.lines, lines);
-            }
-            Segment::Code { content, .. } => {
-                // Emit the code content exactly as-is, line by line.
-                // We don't attempt syntax highlighting to avoid whitespace bugs.
-                for line in content.split_inclusive('\n') {
-                    // split_inclusive keeps the trailing \n; we want lines without it.
-                    let line = if let Some(stripped) = line.strip_suffix('\n') {
-                        stripped
-                    } else {
-                        line
-                    };
-                    let owned_line: Line<'static> = line.to_string().into();
-                    lines.push(owned_line);
-                }
-            }
-        }
-    }
-}
-
-/// Rewrites file citations in `src` into markdown hyperlinks using the
-/// provided `scheme` (`vscode`, `cursor`, etc.). The resulting URI follows the
-/// format expected by VS Code-compatible file openers:
-///
-/// ```text
-/// <scheme>://file<ABS_PATH>:<LINE>
-/// ```
-fn rewrite_file_citations<'a>(
-    src: &'a str,
-    file_opener: UriBasedFileOpener,
-    cwd: &Path,
-) -> Cow<'a, str> {
-    // Map enum values to the corresponding URI scheme strings.
-    let scheme: &str = match file_opener.get_scheme() {
-        Some(scheme) => scheme,
-        None => return Cow::Borrowed(src),
-    };
-
-    CITATION_REGEX.replace_all(src, |caps: &regex_lite::Captures<'_>| {
-        let file = &caps[1];
-        let start_line = &caps[2];
-
-        // Resolve the path against `cwd` when it is relative.
-        let absolute_path = {
-            let p = Path::new(file);
-            let absolute_path = if p.is_absolute() {
-                path_clean::clean(p)
-            } else {
-                path_clean::clean(cwd.join(p))
-            };
-            // VS Code expects forward slashes even on Windows because URIs use
-            // `/` as the path separator.
-            absolute_path.to_string_lossy().replace('\\', "/")
-        };
-
-        // Render as a normal markdown link so the downstream renderer emits
-        // the hyperlink escape sequence (when supported by the terminal).
-        //
-        // In practice, sometimes multiple citations for the same file, but with a
-        // different line number, are shown sequentially, so we:
-        // - include the line number in the label to disambiguate them
-        // - add a space after the link to make it easier to read
-        format!("[{file}:{start_line}]({scheme}://file{absolute_path}:{start_line}) ")
-    })
-}
-
-// use shared helper from `line_utils`
-
-// Minimal code block splitting.
-// - Recognizes fenced blocks opened by ``` or ~~~ (allowing leading whitespace).
-//   The opening fence may include a language string which we ignore.
-//   The closing fence must be on its own line (ignoring surrounding whitespace).
-// - Additionally recognizes indented code blocks that begin after a blank line
-//   with a line starting with at least 4 spaces or a tab, and continue for
-//   consecutive lines that are blank or also indented by >= 4 spaces or a tab.
-enum Segment {
-    Text(String),
-    Code {
-        _lang: Option<String>,
-        content: String,
-    },
-}
-
-fn split_text_and_fences(src: &str) -> Vec<Segment> {
-    let mut segments = Vec::new();
-    let mut curr_text = String::new();
-    #[derive(Copy, Clone, PartialEq)]
-    enum CodeMode {
-        None,
-        Fenced,
-        Indented,
-    }
-    let mut code_mode = CodeMode::None;
-    let mut fence_token = "";
-    let mut code_lang: Option<String> = None;
-    let mut code_content = String::new();
-    // We intentionally do not require a preceding blank line for indented code blocks,
-    // since streamed model output often omits it. This favors preserving indentation.
-
-    for line in src.split_inclusive('\n') {
-        let line_no_nl = line.strip_suffix('\n');
-        let trimmed_start = match line_no_nl {
-            Some(l) => l.trim_start(),
-            None => line.trim_start(),
-        };
-        if code_mode == CodeMode::None {
-            let open = if trimmed_start.starts_with("```") {
-                Some("```")
-            } else if trimmed_start.starts_with("~~~") {
-                Some("~~~")
-            } else {
-                None
-            };
-            if let Some(tok) = open {
-                // Flush pending text segment.
-                if !curr_text.is_empty() {
-                    segments.push(Segment::Text(curr_text.clone()));
-                    curr_text.clear();
-                }
-                fence_token = tok;
-                // Capture language after the token on this line (before newline).
-                let after = &trimmed_start[tok.len()..];
-                let lang = after.trim();
-                code_lang = if lang.is_empty() {
-                    None
-                } else {
-                    Some(lang.to_string())
-                };
-                code_mode = CodeMode::Fenced;
-                code_content.clear();
-                // Do not include the opening fence line in output.
-                continue;
-            }
-            // Check for start of an indented code block: only after a blank line
-            // (or at the beginning), and the line must start with >=4 spaces or a tab.
-            let raw_line = match line_no_nl {
-                Some(l) => l,
-                None => line,
-            };
-            let leading_spaces = raw_line.chars().take_while(|c| *c == ' ').count();
-            let starts_with_tab = raw_line.starts_with('\t');
-            // Consider any line that begins with >=4 spaces or a tab to start an
-            // indented code block. This favors preserving indentation even when a
-            // preceding blank line is omitted (common in streamed model output).
-            let starts_indented_code = (leading_spaces >= 4) || starts_with_tab;
-            if starts_indented_code {
-                // Flush pending text and begin an indented code block.
-                if !curr_text.is_empty() {
-                    segments.push(Segment::Text(curr_text.clone()));
-                    curr_text.clear();
-                }
-                code_mode = CodeMode::Indented;
-                code_content.clear();
-                code_content.push_str(line);
-                // Inside code now; do not treat this line as normal text.
-                continue;
-            }
-            // Normal text line.
-            curr_text.push_str(line);
-        } else {
-            match code_mode {
-                CodeMode::Fenced => {
-                    // inside fenced code: check for closing fence on its own line
-                    let trimmed = match line_no_nl {
-                        Some(l) => l.trim(),
-                        None => line.trim(),
-                    };
-                    if trimmed == fence_token {
-                        // End code block: emit segment without fences
-                        segments.push(Segment::Code {
-                            _lang: code_lang.take(),
-                            content: code_content.clone(),
-                        });
-                        code_content.clear();
-                        code_mode = CodeMode::None;
-                        fence_token = "";
-                        continue;
-                    }
-                    // Accumulate code content exactly as-is.
-                    code_content.push_str(line);
-                }
-                CodeMode::Indented => {
-                    // Continue while the line is blank, or starts with >=4 spaces, or a tab.
-                    let raw_line = match line_no_nl {
-                        Some(l) => l,
-                        None => line,
-                    };
-                    let is_blank = raw_line.trim().is_empty();
-                    let leading_spaces = raw_line.chars().take_while(|c| *c == ' ').count();
-                    let starts_with_tab = raw_line.starts_with('\t');
-                    if is_blank || leading_spaces >= 4 || starts_with_tab {
-                        code_content.push_str(line);
-                    } else {
-                        // Close the indented code block and reprocess this line as normal text.
-                        segments.push(Segment::Code {
-                            _lang: None,
-                            content: code_content.clone(),
-                        });
-                        code_content.clear();
-                        code_mode = CodeMode::None;
-                        // Now handle current line as text.
-                        curr_text.push_str(line);
-                    }
-                }
-                CodeMode::None => unreachable!(),
-            }
-        }
-    }
-
-    if code_mode != CodeMode::None {
-        // Unterminated code fence: treat accumulated content as a code segment.
-        segments.push(Segment::Code {
-            _lang: code_lang.take(),
-            content: code_content.clone(),
-        });
-    } else if !curr_text.is_empty() {
-        segments.push(Segment::Text(curr_text.clone()));
-    }
-
-    segments
+    // Render via pulldown-cmark and rewrite citations during traversal (outside code blocks).
+    let rendered = crate::markdown_render::render_markdown_text_with_citations(
+        markdown_source,
+        file_opener.get_scheme(),
+        cwd,
+    );
+    crate::render::line_utils::push_owned_lines(&rendered.lines, lines);
 }

 #[cfg(test)]
@@ -258,88 +31,6 @@ mod tests {
    use super::*;
    use pretty_assertions::assert_eq;

-    #[test]
-    fn citation_is_rewritten_with_absolute_path() {
-        let markdown = "See 【F:/src/main.rs†L42-L50】 for details.";
-        let cwd = Path::new("/workspace");
-        let result = rewrite_file_citations(markdown, UriBasedFileOpener::VsCode, cwd);
-
-        assert_eq!(
-            "See [/src/main.rs:42](vscode://file/src/main.rs:42)  for details.",
-            result
-        );
-    }
-
-    #[test]
-    fn citation_is_rewritten_with_relative_path() {
-        let markdown = "Refer to 【F:lib/mod.rs†L5】 here.";
-        let cwd = Path::new("/home/user/project");
-        let result = rewrite_file_citations(markdown, UriBasedFileOpener::Windsurf, cwd);
-
-        assert_eq!(
-            "Refer to [lib/mod.rs:5](windsurf://file/home/user/project/lib/mod.rs:5)  here.",
-            result
-        );
-    }
-
-    #[test]
-    fn citation_followed_by_space_so_they_do_not_run_together() {
-        let markdown = "References on lines 【F:src/foo.rs†L24】【F:src/foo.rs†L42】";
-        let cwd = Path::new("/home/user/project");
-        let result = rewrite_file_citations(markdown, UriBasedFileOpener::VsCode, cwd);
-
-        assert_eq!(
-            "References on lines [src/foo.rs:24](vscode://file/home/user/project/src/foo.rs:24) [src/foo.rs:42](vscode://file/home/user/project/src/foo.rs:42) ",
-            result
-        );
-    }
-
-    #[test]
-    fn citation_unchanged_without_file_opener() {
-        let markdown = "Look at 【F:file.rs†L1】.";
-        let cwd = Path::new("/");
-        let unchanged = rewrite_file_citations(markdown, UriBasedFileOpener::VsCode, cwd);
-        // The helper itself always rewrites – this test validates behaviour of
-        // append_markdown when `file_opener` is None.
-        let mut out = Vec::new();
-        append_markdown_with_opener_and_cwd(markdown, &mut out, UriBasedFileOpener::None, cwd);
-        // Convert lines back to string for comparison.
-        let rendered: String = out
-            .iter()
-            .flat_map(|l| l.spans.iter())
-            .map(|s| s.content.clone())
-            .collect::<Vec<_>>()
-            .join("");
-        assert_eq!(markdown, rendered);
-        // Ensure helper rewrites.
-        assert_ne!(markdown, unchanged);
-    }
-
-    #[test]
-    fn fenced_code_blocks_preserve_leading_whitespace() {
-        let src = "```\n  indented\n\t\twith tabs\n    four spaces\n```\n";
-        let cwd = Path::new("/");
-        let mut out = Vec::new();
-        append_markdown_with_opener_and_cwd(src, &mut out, UriBasedFileOpener::None, cwd);
-        let rendered: Vec<String> = out
-            .iter()
-            .map(|l| {
-                l.spans
-                    .iter()
-                    .map(|s| s.content.clone())
-                    .collect::<String>()
-            })
-            .collect();
-        assert_eq!(
-            rendered,
-            vec![
-                "  indented".to_string(),
-                "\t\twith tabs".to_string(),
-                "    four spaces".to_string()
-            ]
-        );
-    }
-
    #[test]
    fn citations_not_rewritten_inside_code_blocks() {
        let src = "Before 【F:/x.rs†L1】\n```\nInside 【F:/x.rs†L2】\n```\nAfter 【F:/x.rs†L3】\n";
@@ -355,19 +46,31 @@ mod tests {
                    .collect::<String>()
            })
            .collect();
-        // Expect first and last lines rewritten, middle line unchanged.
-        assert!(rendered[0].contains("vscode://file"));
-        assert_eq!(rendered[1], "Inside 【F:/x.rs†L2】");
-        assert!(matches!(rendered.last(), Some(s) if s.contains("vscode://file")));
+        // Expect a line containing the inside text unchanged.
+        assert!(rendered.iter().any(|s| s.contains("Inside 【F:/x.rs†L2】")));
+        // And first/last sections rewritten.
+        assert!(
+            rendered
+                .first()
+                .map(|s| s.contains("vscode://file"))
+                .unwrap_or(false)
+        );
+        assert!(
+            rendered
+                .last()
+                .map(|s| s.contains("vscode://file"))
+                .unwrap_or(false)
+        );
    }

    #[test]
    fn indented_code_blocks_preserve_leading_whitespace() {
-        let src = "Before\n    code 1\n\tcode with tab\n        code 2\nAfter\n";
+        // Basic sanity: indented code with surrounding blank lines should produce the indented line.
+        let src = "Before\n\n    code 1\n\nAfter\n";
        let cwd = Path::new("/");
        let mut out = Vec::new();
        append_markdown_with_opener_and_cwd(src, &mut out, UriBasedFileOpener::None, cwd);
-        let rendered: Vec<String> = out
+        let lines: Vec<String> = out
            .iter()
            .map(|l| {
                l.spans
@@ -376,16 +79,7 @@ mod tests {
                    .collect::<String>()
            })
            .collect();
-        assert_eq!(
-            rendered,
-            vec![
-                "Before".to_string(),
-                "    code 1".to_string(),
-                "\tcode with tab".to_string(),
-                "        code 2".to_string(),
-                "After".to_string()
-            ]
-        );
+        assert_eq!(lines, vec!["Before", "", "    code 1", "", "After"]);
    }

    #[test]
@@ -403,11 +97,17 @@ mod tests {
                    .collect::<String>()
            })
            .collect();
-        // Expect first and last lines rewritten, and the indented code line present
-        // unchanged (citations inside not rewritten). We do not assert on blank
-        // separator lines since the markdown renderer may normalize them.
-        assert!(rendered.iter().any(|s| s.contains("vscode://file")));
-        assert!(rendered.iter().any(|s| s == "    Inside 【F:/x.rs†L2】"));
+        assert!(
+            rendered
+                .iter()
+                .any(|s| s.contains("Start") && s.contains("vscode://file"))
+        );
+        assert!(
+            rendered
+                .iter()
+                .any(|s| s.contains("End") && s.contains("vscode://file"))
+        );
+        assert!(rendered.iter().any(|s| s.contains("Inside 【F:/x.rs†L2】")));
    }

    #[test]
@@ -435,27 +135,6 @@ mod tests {
        );
    }

-    #[test]
-    fn tui_markdown_splits_ordered_marker_and_text() {
-        // With marker and content on the same line, tui_markdown keeps it as one line
-        // even in the surrounding section context.
-        let rendered = tui_markdown::from_str("Loose vs. tight list items:\n1. Tight item\n");
-        let lines: Vec<String> = rendered
-            .lines
-            .iter()
-            .map(|l| {
-                l.spans
-                    .iter()
-                    .map(|s| s.content.clone())
-                    .collect::<String>()
-            })
-            .collect();
-        assert!(
-            lines.iter().any(|w| w == "1. Tight item"),
-            "expected single line '1. Tight item' in context: {lines:?}"
-        );
-    }
-
    #[test]
    fn append_markdown_matches_tui_markdown_for_ordered_item() {
        use codex_core::config_types::UriBasedFileOpener;
@@ -480,72 +159,6 @@ mod tests {
        assert_eq!(lines, vec!["1. Tight item".to_string()]);
    }

-    #[test]
-    fn tui_markdown_shape_for_loose_tight_section() {
-        // Use the exact source from the session deltas used in tests.
-        let source = r#"
-Loose vs. tight list items:
-1. Tight item
-2. Another tight item
-
-3.
-   Loose item
-"#;
-
-        let rendered = tui_markdown::from_str(source);
-        let lines: Vec<String> = rendered
-            .lines
-            .iter()
-            .map(|l| {
-                l.spans
-                    .iter()
-                    .map(|s| s.content.clone())
-                    .collect::<String>()
-            })
-            .collect();
-        // Join into a single string and assert the exact shape we observe
-        // from tui_markdown in this larger context (marker and content split).
-        let joined = {
-            let mut s = String::new();
-            for (i, l) in lines.iter().enumerate() {
-                s.push_str(l);
-                if i + 1 < lines.len() {
-                    s.push('\n');
-                }
-            }
-            s
-        };
-        let expected = r#"Loose vs. tight list items:
-
-1. 
-Tight item
-2. 
-Another tight item
-3. 
-Loose item"#;
-        assert_eq!(
-            joined, expected,
-            "unexpected tui_markdown shape: {joined:?}"
-        );
-    }
-
-    #[test]
-    fn split_text_and_fences_keeps_ordered_list_line_as_text() {
-        // No fences here; expect a single Text segment containing the full input.
-        let src = "Loose vs. tight list items:\n1. Tight item\n";
-        let segs = super::split_text_and_fences(src);
-        assert_eq!(
-            segs.len(),
-            1,
-            "expected single text segment, got {}",
-            segs.len()
-        );
-        match &segs[0] {
-            super::Segment::Text(s) => assert_eq!(s, src),
-            _ => panic!("expected Text segment for non-fence input"),
-        }
-    }
-
    #[test]
    fn append_markdown_keeps_ordered_list_line_unsplit_in_context() {
        use codex_core::config_types::UriBasedFileOpener;