Skip to content

Commit 069819b

Browse files
cscheid and claude
authored
Feature/llms txt fixes (#14067)
* Fix llms-txt anchor links and breadcrumb leaking Links with anchors (e.g. about.html#section) were not being converted to .llms.md because the Lua pattern only matched .html at end-of-string. Now also matches .html followed by # and rewrites both cases. Breadcrumbs from sidebar navigation were leaking into .llms.md output. Add quarto-page-breadcrumbs to droppable_classes in the Lua filter. Adds test coverage for both fixes: anchor link conversion in both directions, .html# negative matches, sidebar config to trigger breadcrumbs, and breadcrumb text negative match. Co-Authored-By: Claude Opus 4.6 <[email protected]> * Convert tabsets to headings with content in llms-txt output Tabsets were rendered as a bullet list of empty links followed by disconnected content. Add handle_tabset() to the Lua filter that extracts tab titles from the nav BulletList and pairs them with their tab-pane contents, outputting each as a heading + content. Co-Authored-By: Claude Opus 4.6 <[email protected]> * Preserve code annotations in llms-txt output Save original code block text (with annotation markers) before code-annotation.lua strips them, then restore during HTML-to-markdown conversion. Annotation definition lists are converted to ordered lists. Co-Authored-By: Claude Opus 4.6 <[email protected]> * Support conditional content for llms-txt format Allow users to include/exclude content in .llms.md output using content-visible/content-hidden with when-format="llms-txt". A pre-filter intercepts ConditionalBlock nodes before they are cleared, wrapping them in marker divs that llms.lua and the HTML finalizer handle independently. 
Co-Authored-By: Claude Opus 4.6 <[email protected]> * move local declarations into function to avoid hitting max # locals Lua limit * Fix llms-txt breadcrumb leaking and link prefix - Remove breadcrumbs from extracted HTML before Pandoc conversion, since Pandoc strips <nav> wrappers and loses the droppable class - Strip ./ prefix from converted .llms.md links for cleaner output - Fix test regex to match code annotation markers with space Co-Authored-By: Claude Opus 4.6 <[email protected]> --------- Co-authored-by: Claude Opus 4.6 <[email protected]>
1 parent baf8fec commit 069819b

9 files changed

Lines changed: 295 additions & 5 deletions

File tree

src/project/types/website/website-llms.ts

Lines changed: 91 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ import { basename, join, relative } from "../../../deno_ral/path.ts";
88
import { existsSync } from "../../../deno_ral/fs.ts";
99
import { pathWithForwardSlashes } from "../../../core/path.ts";
1010

11-
import { Document, Element } from "../../../core/deno-dom.ts";
11+
import { Document, Element, Node } from "../../../core/deno-dom.ts";
1212
import { execProcess } from "../../../core/process.ts";
1313
import { pandocBinaryPath, resourcePath } from "../../../core/resources.ts";
1414

@@ -83,9 +83,36 @@ export function llmsHtmlFinalizer(
8383

8484
// Convert HTML to markdown using Pandoc with the llms.lua filter
8585
await convertHtmlToLlmsMarkdown(htmlContent, llmsOutputPath);
86+
87+
// Clean up conditional content markers from the original HTML doc
88+
cleanupConditionalContent(doc);
8689
};
8790
}
8891

92+
/**
 * Strip the llms-specific conditional-content markers from an HTML document.
 *
 * Two marker classes are handled:
 * - `.llms-conditional-content`: the element and everything inside it is
 *   deleted, since that content is meant for the llms output only.
 * - `.llms-hidden-content`: only the wrapper is dropped; its children are
 *   hoisted into the wrapper's parent at the wrapper's position.
 *
 * @param doc The document to clean up; it is modified in place.
 */
function cleanupConditionalContent(doc: Document): void {
  // Pass 1: llms-only content must not survive into the HTML output.
  const llmsOnly = doc.querySelectorAll(".llms-conditional-content");
  for (const node of llmsOnly) {
    (node as Element).remove();
  }

  // Pass 2: unwrap llms-hidden markers, keeping what they contain.
  const hiddenWrappers = doc.querySelectorAll(".llms-hidden-content");
  for (const node of hiddenWrappers) {
    const wrapper = node as Element;
    const host = wrapper.parentElement;
    if (!host) {
      // No parent element to receive the children; leave the wrapper alone.
      continue;
    }

    // insertBefore moves the node, so firstChild advances on every turn.
    let child = wrapper.firstChild;
    while (child) {
      host.insertBefore(child as Node, wrapper as Node);
      child = wrapper.firstChild;
    }
    wrapper.remove();
  }
}
115+
89116
/**
90117
* Extract the main content from an HTML document, removing navigation,
91118
* sidebars, footers, scripts, and styles.
@@ -104,6 +131,7 @@ function extractMainContent(doc: Document): string {
104131
".sidebar",
105132
".quarto-search",
106133
"nav.navbar",
134+
".quarto-page-breadcrumbs",
107135
"script",
108136
"style",
109137
"link[rel='stylesheet']",
@@ -128,6 +156,9 @@ function extractMainContent(doc: Document): string {
128156
return "";
129157
}
130158

159+
// Preprocess annotated code blocks before converting to markdown
160+
preprocessAnnotatedCodeBlocks(clone, main as Element);
161+
131162
// Return a minimal HTML document with just the content
132163
return `<!DOCTYPE html>
133164
<html>
@@ -138,6 +169,65 @@ ${main.innerHTML}
138169
</html>`;
139170
}
140171

172+
/**
 * Prepare annotated code blocks inside `container` for llms output.
 *
 * Three passes are made over the container:
 * 1. Every element carrying a `data-llms-code-original` attribute gets the
 *    saved text written back into its <code> element, and the attribute is
 *    then removed.
 * 2. Annotation gutter elements are deleted.
 * 3. Each `dl.code-annotation-container-grid` is replaced by an <ol> holding
 *    one <li> per <dd> of the original list.
 *
 * @param doc       Document used to create the replacement <ol>/<li> elements.
 * @param container Element whose descendants are rewritten in place.
 */
function preprocessAnnotatedCodeBlocks(doc: Document, container: Element): void {
  // Pass 1: put back the text saved by llms-code-annotations.lua
  // (captured before code-annotation.lua stripped the markers).
  for (const node of container.querySelectorAll("[data-llms-code-original]")) {
    const holder = node as Element;
    const saved = holder.getAttribute("data-llms-code-original");
    if (saved) {
      // The attribute may sit on the <code> itself or on a wrapper around it.
      let target: Element | null;
      if (holder.tagName === "CODE") {
        target = holder;
      } else {
        target = holder.querySelector("code") as Element | null;
      }

      if (target) {
        // Setting textContent discards highlighting spans and annotation buttons.
        target.textContent = saved;
      }

      holder.removeAttribute("data-llms-code-original");
    }
  }

  // Pass 2: drop the annotation gutter elements.
  for (
    const node of container.querySelectorAll(
      ".code-annotation-gutter, .code-annotation-gutter-bg",
    )
  ) {
    (node as Element).remove();
  }

  // Pass 3: definition lists become ordered lists. Only the <dd> elements
  // are carried over; the <dt> elements are dropped with the list.
  for (
    const node of container.querySelectorAll(
      "dl.code-annotation-container-grid",
    )
  ) {
    const dl = node as Element;
    const ol = doc.createElement("ol");
    for (const ddNode of dl.querySelectorAll("dd")) {
      const li = doc.createElement("li");
      li.innerHTML = (ddNode as Element).innerHTML;
      ol.appendChild(li);
    }

    // When the list sits directly inside a cell-annotation wrapper, the
    // wrapper is what gets replaced; otherwise the list itself is.
    const wrapper = dl.parentElement;
    let replaced: Element = dl;
    if (wrapper && wrapper.classList.contains("cell-annotation")) {
      replaced = wrapper;
    }
    replaced.parentElement?.replaceChild(ol, replaced);
  }
}
230+
141231
/**
142232
* Convert HTML content to markdown using Pandoc with the llms.lua filter.
143233
*/

src/project/types/website/website.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import { projectOffset, projectOutputDir } from "../../project-shared.ts";
3232
import { isHtmlFileOutput } from "../../../config/format.ts";
3333

3434
import {
35+
kFilterParams,
3536
kIncludeInHeader,
3637
kPageTitle,
3738
kTitle,
@@ -358,6 +359,8 @@ export const websiteProjectType: ProjectType = {
358359

359360
// Add llms.txt finalizer if enabled
360361
if (websiteConfigBoolean(kLlmsTxt, false, project.config)) {
362+
extras[kFilterParams] = extras[kFilterParams] || {};
363+
extras[kFilterParams]["llms-txt"] = true;
361364
extras.html[kHtmlFinalizers]?.push(
362365
llmsHtmlFinalizer(source, project, format),
363366
);

src/resources/filters/llms/llms.lua

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ local skippable_classes = {
1515
["quarto-float"] = true,
1616
["quarto-float-fig"] = true,
1717
["figure"] = true,
18+
["llms-conditional-content"] = true,
1819
}
1920
local droppable_classes = {
2021
["navbar-container"] = true,
@@ -25,6 +26,8 @@ local droppable_classes = {
2526
["listing-categories"] = true,
2627
["quarto-listing-category"] = true, -- category filter sidebar
2728
["listing-category"] = true, -- individual category badges
29+
["quarto-page-breadcrumbs"] = true, -- breadcrumb navigation
30+
["llms-hidden-content"] = true,
2831
}
2932
local droppable_ids = {
3033
["quarto-header"] = true,
@@ -70,6 +73,53 @@ local function clean_element(el)
7073
end
7174
end
7275

76+
-- Convert a tabset Div into a flat list of blocks: for each tab, a level-2
-- heading holding the tab title followed by the blocks of that tab's pane.
--
-- Titles come from the first BulletList directly inside the div that holds
-- at least one Link; panes are the "tab-pane" Divs inside the "tab-content"
-- Div. Title i is paired with pane i.
--
-- @param div the tabset Div
-- @return the generated blocks, or div.content when nothing was generated
local function handle_tabset(div)
  -- Inline content of the first Link in a list item (a list of blocks),
  -- or nil when the item has no Link in a Plain/Para block.
  -- Kept nested so no extra file-level local is introduced.
  local function first_link_content(item)
    for _, inner_block in ipairs(item) do
      if inner_block.t == "Plain" or inner_block.t == "Para" then
        for _, inline in ipairs(inner_block.content) do
          if inline.t == "Link" then
            return inline.content
          end
        end
      end
    end
    return nil
  end

  -- titles[i] is the title of nav item i, or false when that item has no
  -- link. Exactly one slot per nav item keeps titles index-aligned with
  -- panes even if an item holds several links or none at all.
  local titles = nil
  local panes = {}

  for _, block in ipairs(div.content) do
    if block.t == "BulletList" and titles == nil then
      local found = {}
      local any_link = false
      for idx, item in ipairs(block.content) do
        local content = first_link_content(item)
        if content then
          any_link = true
        end
        -- store false (not nil) so the table stays a proper sequence
        found[idx] = content or false
      end
      -- A list without any link is not the tab nav; keep looking.
      if any_link then
        titles = found
      end
    elseif block.t == "Div" and block.classes:includes("tab-content") then
      for _, inner in ipairs(block.content) do
        if inner.t == "Div" and inner.classes:includes("tab-pane") then
          table.insert(panes, inner.content)
        end
      end
    end
  end

  titles = titles or {}

  -- Build output: heading + content for each tab
  local result = pandoc.Blocks({})
  for i = 1, math.max(#titles, #panes) do
    if titles[i] then
      result:insert(pandoc.Header(2, titles[i]))
    end
    if panes[i] then
      result:extend(panes[i])
    end
  end

  if #result > 0 then
    return result
  end
  -- Fallback: return content as-is
  return div.content
end
122+
73123
local function handle_callout(div)
74124
local kind = "NOTE" -- NOTE, TIP, IMPORTANT, WARNING, CAUTION
75125
div.classes:map(function(cls)
@@ -140,8 +190,10 @@ function Link(link)
140190
return link.content
141191
end
142192

143-
if link.target and link.target:match("%.html$") then
193+
if link.target and (link.target:match("%.html$") or link.target:match("%.html#")) then
194+
link.target = link.target:gsub("%.html#", ".llms.md#")
144195
link.target = link.target:gsub("%.html$", ".llms.md")
196+
link.target = link.target:gsub("^%./", "")
145197
if link.classes:includes("btn") then
146198
link.attr = pandoc.Attr()
147199
end
@@ -174,6 +226,10 @@ end
174226

175227
function Div(div)
176228

229+
if div.classes:includes("panel-tabset") then
230+
return handle_tabset(div)
231+
end
232+
177233
if div.classes:includes("callout") then
178234
return handle_callout(div)
179235
end

src/resources/filters/main.lua

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,8 @@ import("./quarto-pre/bibliography-formats.lua")
143143
import("./quarto-pre/book-links.lua")
144144
import("./quarto-pre/book-numbering.lua")
145145
import("./quarto-pre/code-annotation.lua")
146+
import("./quarto-pre/llms-code-annotations.lua")
147+
import("./quarto-pre/llms-conditional-content.lua")
146148
import("./quarto-pre/code-filename.lua")
147149
import("./quarto-pre/contentsshortcode.lua")
148150
import("./quarto-pre/engine-escape.lua")
@@ -321,6 +323,15 @@ local quarto_pre_filters = {
321323
traverser = 'jog',
322324
},
323325

326+
{ name = "pre-llms-conditional-content",
327+
filter = filterIf(
328+
function() return param("llms-txt", false) end,
329+
llms_resolve_conditional_content()
330+
),
331+
flags = { "has_conditional_content" },
332+
traverser = 'jog',
333+
},
334+
324335
{ name = "pre-combined-hidden",
325336
filter = combineFilters({
326337
hidden(),
@@ -336,6 +347,15 @@ local quarto_pre_filters = {
336347
traverser = 'jog',
337348
},
338349

350+
{ name = "pre-llms-save-code-annotations",
351+
filter = filterIf(
352+
function() return param("llms-txt", false) end,
353+
llms_save_code_annotations()
354+
),
355+
flags = { "has_code_annotations" },
356+
traverser = 'jog',
357+
},
358+
339359
{ name = "pre-code-annotations",
340360
filter = code_annotations(),
341361
flags = { "has_code_annotations" },
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
-- llms-code-annotations.lua
2+
-- Copyright (C) 2020-2026 Posit Software, PBC
3+
--
4+
-- Saves original CodeBlock text before code-annotation.lua strips markers.
5+
-- Only runs when llms-txt is enabled (guarded by filterIf in main.lua).
6+
7+
-- Build the filter table that remembers the untouched text of code blocks
-- containing annotation-style markers.
--
-- @return a filter table with a single CodeBlock handler
function llms_save_code_annotations()
  -- Copy the block's text into its "data-llms-code-original" attribute when
  -- the text contains at least one "<digits>" marker. The block is always
  -- returned, modified or not.
  local function remember_original(code_block)
    local has_marker = code_block.text:match("<%d+>") ~= nil
    if has_marker then
      code_block.attributes["data-llms-code-original"] = code_block.text
    end
    return code_block
  end

  return { CodeBlock = remember_original }
end
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
-- llms-conditional-content.lua
2+
-- Copyright (C) 2020-2026 Posit Software, PBC
3+
--
4+
-- Pre-filter that intercepts ConditionalBlock nodes referencing llms-txt
5+
-- and replaces them with marker Divs so content can be included/excluded
6+
-- from llms.md output independently of the HTML format.
7+
-- Only runs when llms-txt is enabled (guarded by filterIf in main.lua).
8+
9+
-- Build the filter table that turns ConditionalBlock nodes mentioning the
-- "llms-txt" format into marker Divs whenever their llms visibility differs
-- from the visibility reported by is_visible().
--
-- @return a filter table with a single ConditionalBlock handler
function llms_resolve_conditional_content()
  -- True when `value` is one of the entries of the sequence `list`.
  -- A missing list contains nothing.
  local function list_contains(list, value)
    if not list then return false end
    for _, entry in ipairs(list) do
      if entry == value then return true end
    end
    return false
  end

  -- Decide whether a ConditionalBlock is visible for llms-txt output.
  -- Returns true (include), false (exclude), or nil when neither the
  -- when-format nor the unless-format condition lists "llms-txt".
  local function is_llms_visible(tbl)
    local constants = require("modules/constants")
    local condition = tbl.condition
    local in_when = list_contains(condition[constants.kWhenFormat], "llms-txt")
    local in_unless = list_contains(condition[constants.kUnlessFormat], "llms-txt")

    if not (in_when or in_unless) then
      return nil
    end

    if tbl.behavior == constants.kContentVisible then
      -- content-visible: shown for llms only when llms-txt is a when-format
      return in_when
    end
    -- content-hidden: shown for llms only when llms-txt is an unless-format
    return in_unless
  end

  return {
    ConditionalBlock = function(tbl)
      local for_llms = is_llms_visible(tbl)
      if for_llms == nil then
        return nil
      end

      -- is_visible comes from content-hidden.lua; when both verdicts agree
      -- the block is left untouched.
      if for_llms == is_visible(tbl) then
        return nil
      end

      -- Replace the block with a clone of its original node, tagged with
      -- the marker class that matches the llms verdict.
      local marker = tbl.original_node:clone()
      local marker_class = for_llms and "llms-conditional-content" or "llms-hidden-content"
      marker.classes:insert(marker_class)
      return marker
    end
  }
end

tests/docs/smoke-all/website/llms-txt/_quarto.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,11 @@ website:
1111
- href: index.qmd
1212
text: Home
1313
- about.qmd
14+
sidebar:
15+
contents:
16+
- section: Info
17+
contents:
18+
- about.qmd
1419

1520
format:
1621
html:

0 commit comments

Comments
 (0)