|
#' Build docs for LLMs
#'
#' @description
#' `build_llm_docs()` creates an `llms.txt` at the root of your site
#' that contains the contents of your `README.md`, your reference index,
#' and your articles index. It also creates a `.md` file for every existing
#' `.html` file in your site. Together, this gives an LLM an overview of your
#' package and the ability to find out more by following links.
#'
#' If you don't want these files generated for your site, you can opt-out by
#' adding the following to your `_pkgdown.yml`:
#'
#' ```yaml
#' llm-docs: false
#' ```
#'
#' @family site components
#' @inheritParams build_site
#' @return `NULL`, invisibly. Called for its side effect of writing files
#'   into the site's destination directory.
#' @export
build_llm_docs <- function(pkg = ".") {
  pkg <- as_pkgdown(pkg)
  # Only an explicit `llm-docs: false` opts out; an absent key means "build".
  if (isFALSE(pkg$meta$`llm-docs`)) {
    return(invisible())
  }

  cli::cli_rule("Building docs for llms")

  # Write a `.md` sibling for every path reported by get_site_paths().
  # The lambda argument is deliberately not named `path`, so that it does not
  # shadow the path() helper called inside it.
  html_paths <- get_site_paths(pkg)
  purrr::walk(html_paths, \(html_path) {
    src_path <- path(pkg$dst_path, html_path)
    dst_path <- path_ext_set(src_path, "md")
    convert_md(src_path, dst_path, full_url(pkg, html_path))
  })

  # Assemble llms.txt from the converted home page and the two index pages.
  # Every piece is optional: a page that was not converted is simply skipped
  # instead of aborting the whole build.
  index <- c(
    read_file_if_exists(path(pkg$dst_path, "index.md")),
    "",
    read_file_if_exists(path(pkg$dst_path, "reference", "index.md")),
    "",
    read_file_if_exists(path(pkg$dst_path, "articles", "index.md"))
  )
  write_lines(index, path(pkg$dst_path, "llms.txt"))

  invisible()
}
| 46 | + |
# Absolute URL of the directory containing `path`, resolved against the
# site's configured `url`. Returns NULL when no `url` is configured.
full_url <- function(pkg, path) {
  site_url <- pkg$meta$url
  if (is.null(site_url)) {
    return()
  }

  base_url <- paste0(site_url, "/")
  # In development mode the site prefix is appended to the base URL
  if (pkg$development$in_dev) {
    base_url <- paste0(base_url, pkg$prefix)
  }

  page_dir <- paste0(path_dir(path), "/")
  xml2::url_absolute(page_dir, base_url)
}
| 59 | + |
# Convert the <main> element of the HTML file at `src_path` to markdown at
# `dst_path`. `url`, when supplied, is passed on to create_absolute_links().
# Pages without a <main> element are skipped.
convert_md <- function(src_path, dst_path, url = NULL) {
  page <- xml2::read_html(src_path)
  main_html <- xml2::xml_find_first(page, ".//main")
  if (length(main_html) == 0) {
    return()
  }

  # Each step modifies `main_html` in place; they are applied in this order.
  simplifiers <- list(
    simplify_page_header,
    simplify_anchors,
    simplify_code,
    simplify_popovers_to_footnotes,
    simplify_lifecycle_badges,
    simplify_dls
  )
  for (simplify in simplifiers) {
    simplify(main_html)
  }
  create_absolute_links(main_html, url)

  # pandoc reads from a file, so round-trip the cleaned HTML through a
  # temporary file that is removed when this function exits.
  tmp_html <- file_temp()
  xml2::write_html(main_html, tmp_html, format = FALSE)
  on.exit(file_delete(tmp_html), add = TRUE)

  rmarkdown::pandoc_convert(
    input = tmp_html,
    output = dst_path,
    from = "html",
    to = "gfm+definition_lists-raw_html"
  )
}
| 86 | + |
| 87 | +# Helpers --------------------------------------------------------------------- |
| 88 | + |
# Simplify the page header (which includes logo + source link): drop the
# `page-header` div and place the first <h1> at the top of `html`.
simplify_page_header <- function(html) {
  h1 <- xml2::xml_find_first(html, ".//h1")
  # No <h1> found: website for a package without README/index.md
  if (length(h1) == 0) {
    return(invisible())
  }

  page_header <- xml2::xml_find_first(html, ".//div[@class='page-header']")
  xml2::xml_remove(page_header)
  xml2::xml_add_child(html, h1, .where = 0)
  invisible()
}
| 99 | + |
# Drop internal anchors, i.e. every <a class="anchor"> below `html`.
simplify_anchors <- function(html) {
  anchors <- xml2::xml_find_all(html, ".//a[@class='anchor']")
  xml2::xml_remove(anchors)
  invisible()
}
| 105 | + |
# Strip extraneous classes from <pre> code blocks: "sourceCode" and "downlit"
# are removed from the class attribute and the remainder is trimmed.
simplify_code <- function(html) {
  blocks <- xml2::xml_find_all(html, ".//pre[contains(@class, 'sourceCode')]")

  purrr::walk(blocks, \(block) {
    classes <- xml2::xml_attr(block, "class")
    remaining <- trimws(gsub("sourceCode|downlit", "", classes))
    xml2::xml_attr(block, "class") <- remaining
  })
  invisible()
}
| 118 | + |
# Rewrite popover-style footnote references (<a class="footnote-ref"> carrying
# their content in `data-bs-content`) as numbered links into a footnotes
# section containing one <li> per reference.
simplify_popovers_to_footnotes <- function(main_html) {
  popover_refs <- xml2::xml_find_all(main_html, ".//a[@class='footnote-ref']")
  if (length(popover_refs) == 0) {
    return()
  }

  # Reuse an existing footnotes section if there is one, otherwise append a
  # new section (with <hr> and <ol>) to `main_html`.
  footnotes_section <- xml2::xml_find_first(
    main_html,
    ".//section[@class='footnotes']"
  )
  if (length(footnotes_section) > 0) {
    footnotes_ol <- xml2::xml_find_first(footnotes_section, ".//ol")
  } else {
    footnotes_section <- xml2::xml_add_child(
      main_html,
      "section",
      id = "footnotes",
      class = "footnotes footnotes-end-of-document",
      role = "doc-endnotes"
    )
    xml2::xml_add_child(footnotes_section, "hr")
    footnotes_ol <- xml2::xml_add_child(footnotes_section, "ol")
  }

  for (i in seq_along(popover_refs)) {
    ref <- popover_refs[[i]]
    # Read the popover content before the attributes are replaced below
    text_content <- xml2::xml_attr(ref, "data-bs-content")
    fn_id <- paste0("fn", i)
    fnref_id <- paste0("fnref", i)

    # Replace all attributes so the reference becomes a plain link to the note
    xml2::xml_attrs(ref) <- list(
      href = paste0("#", fn_id),
      id = fnref_id,
      role = "doc-noteref",
      class = "footnote-ref"
    )

    # The popover content is an HTML string: parse it and move the children
    # of the parsed <body> into the new list item.
    fn_li <- xml2::xml_add_child(footnotes_ol, "li", id = fn_id)
    parsed_doc <- xml2::read_html(text_content)
    parsed_body <- xml2::xml_find_first(parsed_doc, ".//body")
    parsed_content <- xml2::xml_children(parsed_body)
    for (j in seq_along(parsed_content)) {
      xml2::xml_add_child(fn_li, parsed_content[[j]])
    }
  }

  invisible(popover_refs)
}
| 162 | + |
# Replace lifecycle badges below `html` with plain <strong> text.
#
# Two forms are handled: <span> elements whose class contains "lifecycle"
# (replaced by their text wrapped in square brackets) and links wrapping a
# `figures/lifecycle-*` image (replaced by the image's lower-cased alt text).
# The XPath expressions are relative (`.//`) so that, like the other helpers
# in this file, only descendants of `html` are touched rather than the whole
# document the node belongs to.
simplify_lifecycle_badges <- function(html) {
  # on reference index
  badges <- xml2::xml_find_all(html, ".//span[contains(@class, 'lifecycle')]")
  xml2::xml_replace(badges, "strong", paste0("[", xml2::xml_text(badges), "]"))

  # on individual pages
  badges <- xml2::xml_find_all(
    html,
    ".//a[.//img[starts-with(@src, 'figures/lifecycle-')]]"
  )
  imgs <- xml2::xml_find_first(badges, ".//img")
  xml2::xml_replace(badges, "strong", tolower(xml2::xml_attr(imgs, "alt")))

  invisible()
}
| 178 | + |
# Drop the class attribute from every link below `main_html` and rewrite
# site-internal links: they are made absolute against `url` (when supplied)
# and a trailing `.html` becomes `.md`.
#
# A link counts as internal when it has an href that is neither a same-page
# fragment (`#...`), nor protocol-relative (`//...`), nor prefixed by a URI
# scheme (`https:`, `http:`, `mailto:`, ...). Links without an href are left
# alone.
create_absolute_links <- function(main_html, url = NULL) {
  a <- xml2::xml_find_all(main_html, ".//a")
  if (length(a) == 0) {
    return(invisible())
  }
  xml2::xml_attr(a, "class") <- NULL

  href <- xml2::xml_attr(a, "href")
  # xml_attr() gives NA for <a> without href; grepl() maps NA to FALSE and the
  # leading !is.na() keeps NA out of the logical index used for assignment.
  is_external <- grepl("^([A-Za-z][A-Za-z0-9+.-]*:|//)", href)
  is_internal <- !is.na(href) & !is_external & !startsWith(href, "#")
  if (!any(is_internal)) {
    return(invisible())
  }

  internal <- href[is_internal]
  if (!is.null(url)) {
    internal <- xml2::url_absolute(internal, url)
  }
  # Point at the markdown sibling of each page; the dot is matched literally
  # so that only a real `.html` extension is rewritten.
  internal <- sub("\\.html$", ".md", internal)

  xml2::xml_attr(a[is_internal], "href") <- internal

  invisible()
}
| 194 | + |
# Return the result of read_lines() for `path` when the file exists;
# otherwise return NULL invisibly.
read_file_if_exists <- function(path) {
  if (!file_exists(path)) {
    return(invisible())
  }
  read_lines(path)
}
0 commit comments