From 23fa85b0848fe57c8372c431fa1820c8646bcd3d Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Wed, 17 Dec 2025 12:49:18 +0100
Subject: [PATCH 001/105] spec: initial spec commit

---
 spec/.gitignore                    |    1 +
 spec/README.md                     |   11 +
 spec/book.typ                      |   15 +
 spec/ebook.typ                     |    8 +
 spec/sample_page.typ               |    7 +
 spec/templates/ebook.typ           |   37 +
 spec/templates/page.typ            |  159 ++++
 spec/templates/theme-style.toml    |   30 +
 spec/templates/tokyo-night.tmTheme | 1308 ++++++++++++++++++++++++++++
 9 files changed, 1576 insertions(+)
 create mode 100644 spec/.gitignore
 create mode 100644 spec/README.md
 create mode 100644 spec/book.typ
 create mode 100644 spec/ebook.typ
 create mode 100644 spec/sample_page.typ
 create mode 100644 spec/templates/ebook.typ
 create mode 100644 spec/templates/page.typ
 create mode 100644 spec/templates/theme-style.toml
 create mode 100644 spec/templates/tokyo-night.tmTheme

diff --git a/spec/.gitignore b/spec/.gitignore
new file mode 100644
index 000000000..4be6e160a
--- /dev/null
+++ b/spec/.gitignore
@@ -0,0 +1 @@
+dist/*
\ No newline at end of file
diff --git a/spec/README.md b/spec/README.md
new file mode 100644
index 000000000..d841017cb
--- /dev/null
+++ b/spec/README.md
@@ -0,0 +1,11 @@
+# LambdaVM specification
+This repository contains the specification for [`LambdaVM`](https://github.com/yetanotherco/lambda_vm).
+The specification is written in [`Typst`](https://typst.app/) and can be rendered by [`shiroa`](https://myriad-dreamin.github.io/shiroa/) as either a file (pdf) or a wiki (html).
+
+## Installation & Development setup
+1. [Install `Typst`](https://github.com/typst/typst?tab=readme-ov-file#installation)
+2. [Install `shiroa`](https://myriad-dreamin.github.io/shiroa/guide/installation.html)
+3. Clone this repository
+4. Open the `spec` directory of the repository in a terminal and execute `shiroa serve`.
+
+At this point, the wiki version is hosted locally and is actively updated as you modify the specification files.
\ No newline at end of file
diff --git a/spec/book.typ b/spec/book.typ
new file mode 100644
index 000000000..b196e3fc6
--- /dev/null
+++ b/spec/book.typ
@@ -0,0 +1,15 @@
+
+#import "@preview/shiroa:0.3.1": *
+
+#show: book
+
+#book-meta(
+  title: "Lambda VM specification",
+  summary: [
+    #prefix-chapter("sample_page.typ")[Sample page]
+  ]
+)
+
+// Re-export the page template's `project` as `book-page`, so pages can `#import "/book.typ": book-page`.
+#import "/templates/page.typ": project
+#let book-page = project
diff --git a/spec/ebook.typ b/spec/ebook.typ
new file mode 100644
index 000000000..abddf2701
--- /dev/null
+++ b/spec/ebook.typ
@@ -0,0 +1,8 @@
+#import "@preview/shiroa:0.3.1": *
+
+#import "/templates/ebook.typ"
+
+#show: ebook.project.with(title: "Lambda VM specification", spec: "book.typ")
+
+// Register the function that `ebook.project` calls to include files (such as the book spec) by path.
+#ebook.resolve-inclusion(it => include it)
diff --git a/spec/sample_page.typ b/spec/sample_page.typ
new file mode 100644
index 000000000..6eb300c4e
--- /dev/null
+++ b/spec/sample_page.typ
@@ -0,0 +1,7 @@
+#import "/book.typ": book-page
+
+#show: book-page.with(title: "Hello, typst")
+
+= Hello, typst
+
+Sample page
diff --git a/spec/templates/ebook.typ b/spec/templates/ebook.typ
new file mode 100644
index 000000000..44e0312d3
--- /dev/null
+++ b/spec/templates/ebook.typ
@@ -0,0 +1,37 @@
+#import "@preview/shiroa:0.3.1": *
+#import "/templates/page.typ": part-style, project
+
+#let _page-project = project
+
+#let _resolve-inclusion-state = state("_resolve-inclusion", none)
+
+#let resolve-inclusion(inc) = _resolve-inclusion-state.update(it => inc)
+
+#let project(title: "", authors: (), spec: "", content) = {
+  // Set the document metadata (author and title) first, before any show rule or content.
+  set document(
+    author: authors,
+    title: title,
+  )
+
+  // Apply the shared page template (`project` from /templates/page.typ, aliased above as `_page-project`).
+  show: _page-project
+
+  if title != "" {
+    heading(title)
+  }
+  // With the final state values: load the book spec through the registered resolver, then render each summary entry.
+  context {
+    let inc = _resolve-inclusion-state.final()
+    external-book(spec: inc(spec))
+
+    let mt = book-meta-state.final()
+    let styles = (inc: inc, part: part-style, chapter: it => it)
+
+    if mt != none {
+      mt.summary.map(it => visit-summary(it, styles)).sum()
+    }
+  }
+
+  content
+}
diff --git a/spec/templates/page.typ b/spec/templates/page.typ
new file mode 100644
index 000000000..1f7f88ea0
--- /dev/null
+++ b/spec/templates/page.typ
@@ -0,0 +1,159 @@
+// This is important for shiroa to produce a responsive layout
+// and multiple targets.
+#import "@preview/shiroa:0.3.1": (
+  get-page-width, is-html-target, is-pdf-target, is-web-target, plain-text, shiroa-sys-target, templates,
+)
+#import templates: *
+
+/// The site theme to use. When rendering to static HTML, `starlight` is suggested;
+/// otherwise, since `starlight` with dynamic SVG HTML is not supported, `mdbook` is used.
+/// The `is-html-target(exclude-wrapper: true)` call is currently somewhat internal, so you shouldn't use it elsewhere.
+#let web-theme = if is-html-target(exclude-wrapper: true) { "starlight" } else { "mdbook" }
+#let is-starlight-theme = web-theme == "starlight"
+
+// Metadata (note: the three target helpers below are rebound from the imported functions to their results)
+#let page-width = get-page-width()
+#let is-html-target = is-html-target()
+#let is-pdf-target = is-pdf-target()
+#let is-web-target = is-web-target()
+#let sys-is-html-target = ("target" in dictionary(std))
+
+// Theme (Colors)
+#let themes = theme-box-styles-from(toml("theme-style.toml"), read: it => read(it))
+#let (
+  default-theme: (
+    style: theme-style,
+    is-dark: is-dark-theme,
+    is-light: is-light-theme,
+    main-color: main-color,
+    dash-color: dash-color,
+    code-extra-colors: code-extra-colors,
+  ),
+) = themes;
+#let (
+  default-theme: default-theme,
+) = themes;
+#let theme-box = theme-box.with(themes: themes)
+
+// Fonts
+#let main-font = (
+  // "Charter",
+  // "Source Han Serif SC",
+  // "Source Han Serif TC",
+  // shiroa's embedded font
+  "Libertinus Serif",
+)
+#let code-font = (
+  // "BlexMono Nerd Font Mono",
+  // shiroa's embedded font
+  "DejaVu Sans Mono",
+)
+
+// Sizes
+#let main-size = if is-web-target {
+  16pt
+} else {
+  10.5pt
+}
+#let heading-sizes = if is-web-target {
+  (2, 1.5, 1.17, 1, 0.83).map(it => it * main-size)
+} else {
+  (26pt, 22pt, 14pt, 12pt, main-size)
+}
+#let list-indent = 0.5em
+
+// Put your custom CSS here.
+#let extra-css = ```css
+.site-title {
+  font-size: 1.2rem;
+  font-weight: 600;
+  font-style: italic;
+}
+```
+
+/// The project show rule that is used by all pages.
+///
+/// Example:
+/// ```typ
+/// #show: project
+/// ```
+///
+/// - title (str): The title of the page.
+/// - description (auto): The description of the page.
+///   - If description is `auto`, it will be generated from the plain body.
+///   - If description is `none`, an error is raised to force migration. In future, `none` will mean the description is not generated.
+///   - Hint: use `""` to generate an empty description.
+/// - authors (array | str): The author(s) of the page.
+/// - kind (str): The kind of the page.
+/// - plain-body (content): The plain body of the page.
+#let project(title: "Typst Book", description: auto, authors: (), kind: "page", plain-body) = {
+  // Set basic document metadata (author, title); skipped when rendering the PDF target.
+  set document(
+    author: authors,
+    title: title,
+  ) if not is-pdf-target
+
+  // Set page numbering, number alignment and width; skipped when either HTML-target flag is set.
+  set page(
+    numbering: none,
+    number-align: center,
+    width: page-width,
+  ) if not (sys-is-html-target or is-html-target)
+
+  // For the web target (when not HTML): shrink the margins and use an automatic page height.
+  set page(
+    margin: (
+      // keep a small top margin
+      top: 20pt,
+      // reserved for the heading style.
+      // If you apply a different heading style, you may remove it.
+      left: 20pt,
+      // Typst ends the page at the baseline of the last line of text, so keep a little room below it.
+      bottom: 0.5em,
+      // no margin on the remaining sides.
+      rest: 0pt,
+    ),
+    height: auto,
+  ) if is-web-target and not is-html-target
+
+  let common = (
+    web-theme: web-theme,
+  )
+  // Apply the template rules, passing the book metadata included from /book.typ plus the page's own fields.
+  show: template-rules.with(
+    book-meta: include "/book.typ",
+    title: title,
+    description: description,
+    plain-body: plain-body,
+    extra-assets: (extra-css,),
+    ..common,
+  )
+
+  // Main text: font, size, color and language.
+  set text(
+    font: main-font,
+    size: main-size,
+    fill: main-color,
+    lang: "en",
+  )
+
+  // Markup rules, parameterized by the themes and sizes defined above.
+  show: markup-rules.with(
+    ..common,
+    themes: themes,
+    heading-sizes: heading-sizes,
+    list-indent: list-indent,
+    main-size: main-size,
+  )
+  // Math (equation) rules.
+  show: equation-rules.with(..common, theme-box: theme-box)
+  // Code block rules.
+  show: code-block-rules.with(..common, themes: themes, code-font: code-font)
+
+  // Main body, with justified paragraphs.
+  set par(justify: true)
+
+  plain-body
+}
+
+#let part-style = heading
diff --git a/spec/templates/theme-style.toml b/spec/templates/theme-style.toml
new file mode 100644
index 000000000..128d0b171
--- /dev/null
+++ b/spec/templates/theme-style.toml
@@ -0,0 +1,30 @@
+
+[light]
+color-scheme = "light"
+main-color = "#000"
+dash-color = "#20609f"
+code-theme = ""
+
+[rust]
+color-scheme = "light"
+main-color = "#262625"
+dash-color = "#2b79a2"
+code-theme = ""
+
+[coal]
+color-scheme = "dark"
+main-color = "#98a3ad"
+dash-color = "#2b79a2"
+code-theme = "tokyo-night.tmTheme"
+
+[navy]
+color-scheme = "dark"
+main-color = "#bcbdd0"
+dash-color = "#2b79a2"
+code-theme = "tokyo-night.tmTheme"
+
+[ayu]
+color-scheme = "dark"
+main-color = "#c5c5c5"
+dash-color = "#0096cf"
+code-theme = "tokyo-night.tmTheme"
diff --git a/spec/templates/tokyo-night.tmTheme b/spec/templates/tokyo-night.tmTheme
new file mode 100644
index 000000000..24829e7c4
--- /dev/null
+++ b/spec/templates/tokyo-night.tmTheme
@@ -0,0 +1,1308 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
+<plist version="1.0">
+  <dict>
+    <key>name</key>
+    <string>Tokyo Night</string>
+    <key>settings</key>
+    <array>
+      <dict>
+        <key>settings</key>
+        <dict>
+          <key>caret</key>
+          <string>#c0caf5</string>
+          <key>selection</key>
+          <string>#515c7e4d</string>
+          <key>lineHighlight</key>
+          <string>#1e202e</string>
+          <key>foreground</key>
+          <string>#a9b1d6</string>
+          <key>background</key>
+          <string>#1a1b26</string>
+          <key>invisibles</key>
+          <string>#363b54</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Italics - Comments, Storage, Keyword Flow, Vue attributes, Decorators</string>
+        <key>scope</key>
+        <string>comment,meta.var.expr storage.type,keyword.control.flow,keyword.control.return,meta.directive.vue punctuation.separator.key-value.html,meta.directive.vue entity.other.attribute-name.html,tag.decorator.js entity.name.tag.js,tag.decorator.js punctuation.definition.tag.js,storage.modifier</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>italic</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Fix YAML block scalar</string>
+        <key>scope</key>
+        <string>keyword.control.flow.block-scalar.literal</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string/>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Comment</string>
+        <key>scope</key>
+        <string>comment,comment.block.documentation,punctuation.definition.comment,comment.block.documentation punctuation</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#444b6a</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Comment Doc</string>
+        <key>scope</key>
+        <string>keyword.operator.assignment.jsdoc,comment.block.documentation variable,comment.block.documentation storage,comment.block.documentation keyword,comment.block.documentation support,comment.block.documentation markup,comment.block.documentation markup.inline.raw.string.markdown,meta.other.type.phpdoc.php keyword.other.type.php,meta.other.type.phpdoc.php support.other.namespace.php,meta.other.type.phpdoc.php punctuation.separator.inheritance.php,meta.other.type.phpdoc.php support.class,keyword.other.phpdoc.php,log.date</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#5a638c</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Comment Doc Emphasized</string>
+        <key>scope</key>
+        <string>meta.other.type.phpdoc.php support.class,comment.block.documentation storage.type,comment.block.documentation punctuation.definition.block.tag,comment.block.documentation entity.name.type.instance</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#646e9c</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Number, Boolean, Undefined, Null</string>
+        <key>scope</key>
+        <string>variable.other.constant,punctuation.definition.constant,constant.language,constant.numeric,support.constant</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#ff9e64</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>String, Symbols</string>
+        <key>scope</key>
+        <string>string,constant.other.symbol,constant.other.key,meta.attribute-selector</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string/>
+          <key>foreground</key>
+          <string>#9ece6a</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Colors</string>
+        <key>scope</key>
+        <string>constant.other.color,constant.other.color.rgb-value.hex punctuation.definition.constant</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#9aa5ce</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Invalid</string>
+        <key>scope</key>
+        <string>invalid,invalid.illegal</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#ff5370</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Invalid deprecated</string>
+        <key>scope</key>
+        <string>invalid.deprecated</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Storage Type</string>
+        <key>scope</key>
+        <string>storage.type</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Storage - modifier, var, const, let</string>
+        <key>scope</key>
+        <string>meta.var.expr storage.type,storage.modifier</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#9d7cd8</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Interpolation, PHP tags, Smarty tags</string>
+        <key>scope</key>
+        <string>punctuation.definition.template-expression,punctuation.section.embedded,meta.embedded.line.tag.smarty,support.constant.handlebars,punctuation.section.tag.twig</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7dcfff</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Blade, Twig, Smarty Handlebars keywords</string>
+        <key>scope</key>
+        <string>keyword.control.smarty,keyword.control.twig,support.constant.handlebars keyword.control,keyword.operator.comparison.twig,keyword.blade,entity.name.function.blade</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#0db9d7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Spread</string>
+        <key>scope</key>
+        <string>keyword.operator.spread,keyword.operator.rest</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#f7768e</string>
+          <key>fontStyle</key>
+          <string>bold</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Operator, Misc</string>
+        <key>scope</key>
+        <string>keyword.operator,keyword.control.as,keyword.other,keyword.operator.bitwise.shift,punctuation,expression.embbeded.vue punctuation.definition.tag,text.html.twig meta.tag.inline.any.html,meta.tag.template.value.twig meta.function.arguments.twig,meta.directive.vue punctuation.separator.key-value.html,punctuation.definition.constant.markdown,punctuation.definition.string,punctuation.support.type.property-name,text.html.vue-html meta.tag,meta.attribute.directive,punctuation.definition.keyword,punctuation.terminator.rule,punctuation.definition.entity,punctuation.separator.inheritance.php,keyword.other.template,keyword.other.substitution,entity.name.operator,meta.property-list punctuation.separator.key-value,meta.at-rule.mixin punctuation.separator.key-value,meta.at-rule.function variable.parameter.url</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#89ddff</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Import, Export, From, Default</string>
+        <key>scope</key>
+        <string>keyword.control.import,keyword.control.export,keyword.control.from,keyword.control.default,meta.import keyword.other</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7dcfff</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Keyword</string>
+        <key>scope</key>
+        <string>keyword,keyword.control,keyword.other.important</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Keyword SQL</string>
+        <key>scope</key>
+        <string>keyword.other.DML</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7dcfff</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Keyword Operator Logical, Arrow, Ternary, Comparison</string>
+        <key>scope</key>
+        <string>keyword.operator.logical,storage.type.function,keyword.operator.bitwise,keyword.operator.ternary,keyword.operator.comparison,keyword.operator.relational,keyword.operator.or.regexp</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Tag</string>
+        <key>scope</key>
+        <string>entity.name.tag</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#f7768e</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Tag - Custom</string>
+        <key>scope</key>
+        <string>entity.name.tag support.class.component,meta.tag.custom entity.name.tag,meta.tag</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#de5971</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Tag Punctuation</string>
+        <key>scope</key>
+        <string>punctuation.definition.tag</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#ba3c97</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Globals, PHP Constants, etc</string>
+        <key>scope</key>
+        <string>constant.other.php,variable.other.global.safer,variable.other.global.safer punctuation.definition.variable,variable.other.global,variable.other.global punctuation.definition.variable,constant.other</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#e0af68</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Variables</string>
+        <key>scope</key>
+        <string>variable,support.variable,string constant.other.placeholder,variable.parameter.handlebars,variable.other.object</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#c0caf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Variable Array Key</string>
+        <key>scope</key>
+        <string>meta.array.literal variable</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7dcfff</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Object Key</string>
+        <key>scope</key>
+        <string>meta.object-literal.key,entity.name.type.hcl,string.alias.graphql,string.unquoted.graphql,string.unquoted.alias.graphql,meta.group.braces.curly constant.other.object.key.js string.unquoted.label.js,meta.field.declaration.ts variable.object.property,meta.block entity.name.label</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#73daca</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Object Property</string>
+        <key>scope</key>
+        <string>variable.other.property,support.variable.property,support.variable.property.dom,meta.function-call variable.other.object.property</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7dcfff</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Object Property</string>
+        <key>scope</key>
+        <string>variable.other.object.property</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#c0caf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Object Literal Member lvl 3 (Vue Prop Validation)</string>
+        <key>scope</key>
+        <string>meta.objectliteral meta.object.member meta.objectliteral meta.object.member meta.objectliteral meta.object.member meta.object-literal.key</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#41a6b5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>C-related Block Level Variables</string>
+        <key>scope</key>
+        <string>source.cpp meta.block variable.other</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#f7768e</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Other Variable</string>
+        <key>scope</key>
+        <string>support.other.variable</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#f7768e</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Methods</string>
+        <key>scope</key>
+        <string>meta.class-method.js entity.name.function.js,entity.name.method.js,variable.function.constructor,keyword.other.special-method,storage.type.cs</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7aa2f7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Function Definition</string>
+        <key>scope</key>
+        <string>entity.name.function,variable.other.enummember,meta.function-call,meta.function-call entity.name.function,variable.function,meta.definition.method entity.name.function,meta.object-literal entity.name.function</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7aa2f7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Function Argument</string>
+        <key>scope</key>
+        <string>variable.parameter.function.language.special,variable.parameter,meta.function.parameters punctuation.definition.variable,meta.function.parameter variable</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#e0af68</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Constant, Tag Attribute</string>
+        <key>scope</key>
+        <string>keyword.other.type.php,storage.type.php,constant.character,constant.escape,keyword.other.unit</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Variable Definition</string>
+        <key>scope</key>
+        <string>meta.definition.variable variable.other.constant,meta.definition.variable variable.other.readwrite,variable.declaration.hcl variable.other.readwrite.hcl,meta.mapping.key.hcl variable.other.readwrite.hcl,variable.other.declaration</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Inherited Class</string>
+        <key>scope</key>
+        <string>entity.other.inherited-class</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string/>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Class, Support, DOM, etc</string>
+        <key>scope</key>
+        <string>support.class,support.type,variable.other.readwrite.alias,support.orther.namespace.use.php,meta.use.php,support.other.namespace.php,support.type.sys-types,support.variable.dom,support.constant.math,support.type.object.module,support.constant.json,entity.name.namespace,meta.import.qualifier,variable.other.constant.object</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#0db9d7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Class Name</string>
+        <key>scope</key>
+        <string>entity.name</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#c0caf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Support Function</string>
+        <key>scope</key>
+        <string>support.function</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#0db9d7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS Class and Support</string>
+        <key>scope</key>
+        <string>source.css support.type.property-name,source.sass support.type.property-name,source.scss support.type.property-name,source.less support.type.property-name,source.stylus support.type.property-name,source.postcss support.type.property-name,support.type.property-name.css,support.type.vendored.property-name,support.type.map.key</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7aa2f7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS Font</string>
+        <key>scope</key>
+        <string>support.constant.font-name,meta.definition.variable</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#9ece6a</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS Class</string>
+        <key>scope</key>
+        <string>entity.other.attribute-name.class,meta.at-rule.mixin.scss entity.name.function.scss</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#9ece6a</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS ID</string>
+        <key>scope</key>
+        <string>entity.other.attribute-name.id</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#fc7b7b</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS Tag</string>
+        <key>scope</key>
+        <string>entity.name.tag.css</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#0db9d7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS Tag Reference, Pseudo &amp; Class Punctuation</string>
+        <key>scope</key>
+        <string>entity.other.attribute-name.pseudo-class punctuation.definition.entity,entity.other.attribute-name.pseudo-element punctuation.definition.entity,entity.other.attribute-name.class punctuation.definition.entity,entity.name.tag.reference</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#e0af68</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS Punctuation</string>
+        <key>scope</key>
+        <string>meta.property-list</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#9abdf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS at-rule fix</string>
+        <key>scope</key>
+        <string>meta.property-list meta.at-rule.if,meta.at-rule.return variable.parameter.url,meta.property-list meta.at-rule.else</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#ff9e64</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS Parent Selector Entity</string>
+        <key>scope</key>
+        <string>entity.other.attribute-name.parent-selector-suffix punctuation.definition.entity.css</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#73daca</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS Punctuation comma fix</string>
+        <key>scope</key>
+        <string>meta.property-list meta.property-list</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#9abdf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>SCSS @</string>
+        <key>scope</key>
+        <string>meta.at-rule.mixin keyword.control.at-rule.mixin,meta.at-rule.include entity.name.function.scss,meta.at-rule.include keyword.control.at-rule.include</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>SCSS Mixins, Extends, Include Keyword</string>
+        <key>scope</key>
+        <string>keyword.control.at-rule.include punctuation.definition.keyword,keyword.control.at-rule.mixin punctuation.definition.keyword,meta.at-rule.include keyword.control.at-rule.include,keyword.control.at-rule.extend punctuation.definition.keyword,meta.at-rule.extend keyword.control.at-rule.extend,entity.other.attribute-name.placeholder.css punctuation.definition.entity.css,meta.at-rule.media keyword.control.at-rule.media,meta.at-rule.mixin keyword.control.at-rule.mixin,meta.at-rule.function keyword.control.at-rule.function,keyword.control punctuation.definition.keyword</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#9d7cd8</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>SCSS Include Mixin Argument</string>
+        <key>scope</key>
+        <string>meta.property-list meta.at-rule.include</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#c0caf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS value</string>
+        <key>scope</key>
+        <string>support.constant.property-value</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#ff9e64</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Sub-methods</string>
+        <key>scope</key>
+        <string>entity.name.module.js,variable.import.parameter.js,variable.other.class.js</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#c0caf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Language methods</string>
+        <key>scope</key>
+        <string>variable.language</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#f7768e</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Variable punctuation</string>
+        <key>scope</key>
+        <string>variable.other punctuation.definition.variable</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#c0caf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Keyword this with Punctuation, ES7 Bind Operator</string>
+        <key>scope</key>
+        <string>source.js constant.other.object.key.js string.unquoted.label.js,variable.language.this punctuation.definition.variable,keyword.other.this</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#f7768e</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>HTML Attributes</string>
+        <key>scope</key>
+        <string>entity.other.attribute-name,text.html.basic entity.other.attribute-name.html,text.html.basic entity.other.attribute-name</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>HTML Character Entity</string>
+        <key>scope</key>
+        <string>text.html constant.character.entity</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#0DB9D7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Vue (Vetur / deprecated) Template attributes</string>
+        <key>scope</key>
+        <string>entity.other.attribute-name.id.html,meta.directive.vue entity.other.attribute-name.html</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS ID's</string>
+        <key>scope</key>
+        <string>source.sass keyword.control</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7aa2f7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS pseudo selectors</string>
+        <key>scope</key>
+        <string>entity.other.attribute-name.pseudo-class,entity.other.attribute-name.pseudo-element,entity.other.attribute-name.placeholder,meta.property-list meta.property-value</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Inserted</string>
+        <key>scope</key>
+        <string>markup.inserted</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#449dab</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Deleted</string>
+        <key>scope</key>
+        <string>markup.deleted</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#914c54</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Changed</string>
+        <key>scope</key>
+        <string>markup.changed</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#6183bb</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Regular Expressions</string>
+        <key>scope</key>
+        <string>string.regexp</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#b4f9f8</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Regular Expressions - Punctuation</string>
+        <key>scope</key>
+        <string>punctuation.definition.group</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#f7768e</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Regular Expressions - Character Class</string>
+        <key>scope</key>
+        <string>constant.other.character-class.regexp</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Regular Expressions - Character Class Set</string>
+        <key>scope</key>
+        <string>constant.other.character-class.set.regexp,punctuation.definition.character-class.regexp</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#e0af68</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Regular Expressions - Quantifier</string>
+        <key>scope</key>
+        <string>keyword.operator.quantifier.regexp</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#89ddff</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Regular Expressions - Backslash</string>
+        <key>scope</key>
+        <string>constant.character.escape.backslash</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#c0caf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Escape Characters</string>
+        <key>scope</key>
+        <string>constant.character.escape</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#89ddff</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Decorators</string>
+        <key>scope</key>
+        <string>tag.decorator.js entity.name.tag.js,tag.decorator.js punctuation.definition.tag.js</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7aa2f7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>CSS Units</string>
+        <key>scope</key>
+        <string>keyword.other.unit</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#f7768e</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>JSON Key - Level 0</string>
+        <key>scope</key>
+        <string>source.json meta.structure.dictionary.json support.type.property-name.json</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7aa2f7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>JSON Key - Level 1</string>
+        <key>scope</key>
+        <string>source.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json support.type.property-name.json</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#0db9d7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>JSON Key - Level 2</string>
+        <key>scope</key>
+        <string>source.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json support.type.property-name.json</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7dcfff</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>JSON Key - Level 3</string>
+        <key>scope</key>
+        <string>source.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json support.type.property-name.json</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>JSON Key - Level 4</string>
+        <key>scope</key>
+        <string>source.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json support.type.property-name.json</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#e0af68</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>JSON Key - Level 5</string>
+        <key>scope</key>
+        <string>source.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json support.type.property-name.json</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#0db9d7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>JSON Key - Level 6</string>
+        <key>scope</key>
+        <string>source.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json support.type.property-name.json</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#73daca</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>JSON Key - Level 7</string>
+        <key>scope</key>
+        <string>source.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json support.type.property-name.json</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#f7768e</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>JSON Key - Level 8</string>
+        <key>scope</key>
+        <string>source.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json meta.structure.dictionary.value.json meta.structure.dictionary.json support.type.property-name.json</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#9ece6a</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Plain Punctuation</string>
+        <key>scope</key>
+        <string>punctuation.definition.list_item.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#9abdf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Block Punctuation</string>
+        <key>scope</key>
+        <string>meta.block,meta.brace,punctuation.definition.block,punctuation.definition.use,punctuation.definition.class,punctuation.definition.begin.bracket,punctuation.definition.end.bracket,punctuation.definition.switch-expression.begin.bracket,punctuation.definition.switch-expression.end.bracket,punctuation.definition.section.switch-block.begin.bracket,punctuation.definition.section.switch-block.end.bracket,punctuation.definition.group.shell,punctuation.definition.parameters,punctuation.definition.arguments,punctuation.definition.dictionary,punctuation.definition.array,punctuation.section</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#9abdf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Plain</string>
+        <key>scope</key>
+        <string>meta.jsx.children,meta.embedded.block</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#c0caf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>HTML text</string>
+        <key>scope</key>
+        <string>text.html,text.log</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#9aa5ce</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Markup Raw Inline</string>
+        <key>scope</key>
+        <string>text.html.markdown markup.inline.raw.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#bb9af7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Markup Raw Inline Punctuation</string>
+        <key>scope</key>
+        <string>text.html.markdown markup.inline.raw.markdown punctuation.definition.raw.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#4E5579</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Heading 1</string>
+        <key>scope</key>
+        <string>heading.1.markdown entity.name,heading.1.markdown punctuation.definition.heading.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>bold</string>
+          <key>foreground</key>
+          <string>#89ddff</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Heading 2</string>
+        <key>scope</key>
+        <string>heading.2.markdown entity.name,heading.2.markdown punctuation.definition.heading.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>bold</string>
+          <key>foreground</key>
+          <string>#61bdf2</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Heading 3</string>
+        <key>scope</key>
+        <string>heading.3.markdown entity.name,heading.3.markdown punctuation.definition.heading.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>bold</string>
+          <key>foreground</key>
+          <string>#7aa2f7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Heading 4</string>
+        <key>scope</key>
+        <string>heading.4.markdown entity.name,heading.4.markdown punctuation.definition.heading.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>bold</string>
+          <key>foreground</key>
+          <string>#6d91de</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Heading 5</string>
+        <key>scope</key>
+        <string>heading.5.markdown entity.name,heading.5.markdown punctuation.definition.heading.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>bold</string>
+          <key>foreground</key>
+          <string>#9aa5ce</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Heading 6</string>
+        <key>scope</key>
+        <string>heading.6.markdown entity.name,heading.6.markdown punctuation.definition.heading.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>bold</string>
+          <key>foreground</key>
+          <string>#747ca1</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markup - Italic</string>
+        <key>scope</key>
+        <string>markup.italic,markup.italic punctuation</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>italic</string>
+          <key>foreground</key>
+          <string>#c0caf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markup - Bold</string>
+        <key>scope</key>
+        <string>markup.bold,markup.bold punctuation</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>bold</string>
+          <key>foreground</key>
+          <string>#c0caf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markup - Bold-Italic</string>
+        <key>scope</key>
+        <string>markup.bold markup.italic,markup.bold markup.italic punctuation</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>bold italic</string>
+          <key>foreground</key>
+          <string>#c0caf5</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markup - Underline</string>
+        <key>scope</key>
+        <string>markup.underline,markup.underline punctuation</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>underline</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Blockquote</string>
+        <key>scope</key>
+        <string>markup.quote punctuation.definition.blockquote.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#4e5579</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markup - Quote</string>
+        <key>scope</key>
+        <string>markup.quote</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>italic</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Link</string>
+        <key>scope</key>
+        <string>string.other.link,markup.underline.link,constant.other.reference.link.markdown,string.other.link.description.title.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#73daca</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Fenced Code Block</string>
+        <key>scope</key>
+        <string>markup.fenced_code.block.markdown,markup.inline.raw.string.markdown,variable.language.fenced.markdown</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#89ddff</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markdown - Separator</string>
+        <key>scope</key>
+        <string>meta.separator</string>
+        <key>settings</key>
+        <dict>
+          <key>fontStyle</key>
+          <string>bold</string>
+          <key>foreground</key>
+          <string>#444b6a</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Markup - Table</string>
+        <key>scope</key>
+        <string>markup.table</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#c0cefc</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Token - Info</string>
+        <key>scope</key>
+        <string>token.info-token</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#0db9d7</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Token - Warn</string>
+        <key>scope</key>
+        <string>token.warn-token</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#ffdb69</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Token - Error</string>
+        <key>scope</key>
+        <string>token.error-token</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#db4b4b</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Token - Debug</string>
+        <key>scope</key>
+        <string>token.debug-token</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#b267e6</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Apache Tag</string>
+        <key>scope</key>
+        <string>entity.tag.apacheconf</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#f7768e</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>Preprocessor</string>
+        <key>scope</key>
+        <string>meta.preprocessor</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#73daca</string>
+        </dict>
+      </dict>
+      <dict>
+        <key>name</key>
+        <string>ENV value</string>
+        <key>scope</key>
+        <string>source.env</string>
+        <key>settings</key>
+        <dict>
+          <key>foreground</key>
+          <string>#7aa2f7</string>
+        </dict>
+      </dict>
+    </array>
+  </dict>
+</plist>

From 27da436930f28d8a6fad2a0a946cc665e4a0dbcb Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Wed, 24 Dec 2025 15:20:56 +0100
Subject: [PATCH 002/105] spec: Basic chip data format and layout

See the original yetanotherco/lambda_vm_spec #1
for more details, if it still exists.

* Introduce `config` and "variables"

* chip column-to-table rendering

* restructuring

* some basic interactions idea

* Sample lt chip design

* Update formatting

* Interpret variable indexing

* BRANCH draft

* Fix indexing + render template

* Render labels for references to constraints

* Rendering chip assumptions

* Add an editorconfig for consistency in indentation and trailing newlines

* The constraint range index found its way back home

* Finish (?) LT chip

* Improve lisp rendering

* support constraint group rendering

* Support "^" type setting

* dvrm

* add dvrm assumptions

* Rendering virtual column definitions and polynomials for arith constraints

* ignore ebook.pdf

* Split LT and BRANCH into groups

* Nicer mutual recursion in expression formatting

* Use negation instead of mult by -1 in lt

* Format expr.typ

* Simplify subtraction expression

* Remove parentheses using precedence rules

* fmt

* improve dvrm readability

* fix lt parentheses

* move `extended_n_sub_r` def from constraint to var

* Set div chip word types to HL

* divrem fixes

* more dvrm tweaks

* Specify grammar

* add docs

* Drop chip files

* Improve `chip` readability

* minor fixes

---------

Co-authored-by: Erik Takke <erik.takke@3milabs.tech>
---
 spec/.editorconfig   |  10 +++
 spec/.gitignore      |   3 +-
 spec/book.typ        |   2 +-
 spec/chip.typ        | 163 +++++++++++++++++++++++++++++++++++++++++++
 spec/expr.typ        | 119 +++++++++++++++++++++++++++++++
 spec/sample_page.typ |   7 --
 spec/src.typ         |  58 +++++++++++++++
 spec/src/config.toml | 113 ++++++++++++++++++++++++++++++
 spec/variables.typ   |  20 ++++++
 9 files changed, 486 insertions(+), 9 deletions(-)
 create mode 100644 spec/.editorconfig
 create mode 100644 spec/chip.typ
 create mode 100644 spec/expr.typ
 delete mode 100644 spec/sample_page.typ
 create mode 100644 spec/src.typ
 create mode 100644 spec/src/config.toml
 create mode 100644 spec/variables.typ

diff --git a/spec/.editorconfig b/spec/.editorconfig
new file mode 100644
index 000000000..dbb9605a4
--- /dev/null
+++ b/spec/.editorconfig
@@ -0,0 +1,10 @@
+root = true
+
+[*]
+end_of_line = lf
+insert_final_newline = true
+charset = utf-8
+
+[*.typ]
+indent_style = space
+indent_size = 2
diff --git a/spec/.gitignore b/spec/.gitignore
index 4be6e160a..73218d5ba 100644
--- a/spec/.gitignore
+++ b/spec/.gitignore
@@ -1 +1,2 @@
-dist/*
\ No newline at end of file
+dist/*
+ebook.pdf
diff --git a/spec/book.typ b/spec/book.typ
index b196e3fc6..f50d7ed29 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -6,7 +6,7 @@
 #book-meta(
   title: "Lambda VM specification",
   summary: [
-    #prefix-chapter("sample_page.typ")[Sample page]
+    #chapter("variables.typ")[Variables]
   ]
 )
 
diff --git a/spec/chip.typ b/spec/chip.typ
new file mode 100644
index 000000000..c694db90a
--- /dev/null
+++ b/spec/chip.typ
@@ -0,0 +1,163 @@
+#import "expr.typ": expr_to_code, expr_to_math
+
+/// Returns how many variables `chip` declares, counted across all of its variable categories.
+#let total_nr_variables(chip) = {
+  chip.variables.values().flatten().len()
+}
+
+/// Computes the total number of columns instantiated by `chip`: for every variable in a category listed in `config.variables.categories.instantiated`, one column per subtype of its type. Yields 0 when no such variable exists.
+#let total_nr_instantiated_columns(chip, config) = {
+  return chip
+  .variables
+  .pairs()
+  .filter(pair => pair.at(0) in config.variables.categories.instantiated)
+  .map(pair => pair.at(1))
+  .flatten()
+  .map(var => config.variables.types.filter(type => type.label == var.type).at(0).subtypes.len())
+  .sum(default: 0)
+}
+
+/// Generates a figure with a table listing `chip`'s columns (label, type, description), grouped by variable category. `config` is not used here.
+#let render_chip_column_table(chip, config) = {
+  // Each category contributes a header row, one or more rows per variable, and a blank spacer row.
+  figure(table(
+    columns: (auto, auto, 1fr),
+    inset: 6pt,
+    align: left + top,
+    stroke: none,
+    table.header([*Label*], [*Type*], [*Description*]),
+    table.hline(stroke: stroke(thickness: 2pt)),
+    ..for (cat, vars) in chip.variables.pairs() {
+      ([#emph(cat)], [], [], table.hline(stroke: .6pt)) // category header row, underlined
+      for var in vars {
+        ([#raw(var.name)], [#raw(var.type)], [#eval(var.desc, mode: "markup")]) // `desc` is evaluated as Typst markup
+        for (i, poly) in var.at("polys", default: ()).enumerate() {
+          (if i == 0 { emph[def] }, [], expr_to_math(("=", ("idx", var.name, i), poly))) // `name[i] = poly`; "def" shown on the first row only
+        }
+        if "poly" in var {
+          (emph[def], [], expr_to_math(var.poly)) // single definition, rendered as given
+        }
+      }
+      ([], [], []) // empty spacer row after each category
+    },
+  ), caption: [Column overview of #chip.name chip.])
+}
+
+/// Yields a label named after `constraint.ref`; yields nothing when `constraint` has no `ref` key.
+#let cref(constraint) = {
+  if "ref" not in constraint { return }
+  label(constraint.ref)
+}
+
+/// Renders `obj.range` (index name, lower bound, upper bound) as `name ∈ [lo,hi]`; empty content if `obj` has no `range`.
+#let interval(obj) = {
+  if "range" not in obj { return [] }
+  let r = obj.range
+  [#raw(r.at(0)) #sym.in` [`#r.at(1)`,`#r.at(2)`]`]
+}
+
+#let args_interaction_like(input, output) = { // Formats an argument list as code: the rendered `output` and "; " (only when `output` is not none), then the rendered `input` entries joined by ", ".
+  if output != none {
+    expr_to_code(output) + `; `
+  } else {
+    ``
+  } + input.map(expr_to_code).join(`, `) // the whole if/else above is the left operand of this `+`
+}
+
+#let render_chip_assumptions(chip, config) = { // Generates a figure with a table of `chip.assumptions` (tag, range, description). `config` is not used here.
+  let tag(assumption) = { // Builds the tag cell for one assumption.
+    let index = if "range" in assumption { "." + assumption.range.at(0) } else { "" } // ".<range variable>" suffix, or nothing
+    let lbl = [#chip.name\-A] // tag prefix "<chip name>-A"
+    show figure: (it) => align(left, block[#lbl#context it.counter.display()#index]) // display the figure as its left-aligned tag text only
+    [#figure(kind: "assumption", numbering: (i) => [#lbl#i#index], supplement: [], [])#cref(assumption)] // empty figure of kind "assumption" plus optional label from `cref`
+  }
+
+  figure(table(
+    columns: (auto, auto, 1fr),
+    inset: 6pt,
+    align: (top + left, top + left, top + left),
+    stroke: none,
+    table.header([*Tag*], [*Range*], [*Description*]),
+    table.hline(stroke: stroke(thickness: 2pt)),
+    ..for assumption in chip.assumptions {
+      ([#tag(assumption)], [#interval(assumption)], [#eval(assumption.desc, mode: "markup")]) // one row per assumption; `desc` is evaluated as Typst markup
+    },
+  ), caption: [Assumption overview of #chip.name chip.])
+}
+
+/// Generates a table listing `chip`'s constraints. `groups` selects the constraint groups to render: `none` (all groups), a single group name, or an array of group names. `config` is not used here.
+#let render_constraint_table(chip, config, groups: none) = {
+  let all_groups = chip.constraint_groups.map(group => group.name);
+  if groups == none {
+    // render all
+    groups = all_groups
+  } else if type(groups) == str {
+    groups = (groups,)
+  }
+  assert(groups.all(group => group in all_groups), message: "unknown group")
+
+  /// Render the constraint's tag. `group` is the *name* of the constraint's group.
+  let tag(constraint, group) = {
+    let index = if "range" in constraint { "." + constraint.range.at(0) } else { "" }
+    let prefix = chip.constraint_groups.find(g => g.name == group).at("prefix", default: none) // look up the group record by name; its `prefix` is optional
+    let lbl = [#chip.name\-C#prefix]
+    show figure: (it) => align(left, block[#lbl#context it.counter.display()#index])
+    [#figure(kind: "constraint", numbering: (i) => [#lbl#i#index], supplement: [], [])#cref(constraint)]
+  }
+
+  /// Generates a representation of `constraint`, depending on its `kind`.
+  let repr_constraint(constraint) = {
+    let kind = constraint.kind
+
+    if kind == "interaction" {
+      raw(constraint.tag) + `[` + args_interaction_like(constraint.input, constraint.at("output", default: none)) + `]`
+    } else if kind == "arith" {
+      [#eval(constraint.constraint)]
+    } else if kind == "template" {
+      raw(constraint.tag) + `<` + args_interaction_like(constraint.input, constraint.at("output", default: none)) + `>`
+    } else {
+      assert(false, message: "illegal constraint format: " + kind)
+    }
+  }
+
+  // Whether `constraint` is an arith constraint that carries polynomial constraints (`poly` or `polys`)
+  let has_polynomial_constraints(constraint) = {
+    constraint.at("kind") == "arith" and ("poly" in constraint or "polys" in constraint)
+  }
+
+  // Renders one extra table row per polynomial constraint of an arith `constraint`
+  let render_polynomial_constraints(constraint) = {
+    assert(constraint.kind == "arith", message: "Only arith needs extra rows")
+    let polys = if "poly" in constraint {
+      (constraint.poly,)
+    } else {
+      constraint.polys
+    }
+
+    (..for poly in polys {
+      ([_polynomial constraint_], [], $#expr_to_math(poly) = 0$, [])
+    },)
+  }
+
+  figure(table(
+    columns: (auto, auto, 1fr, auto),
+    inset: 6pt,
+    align: (top + left, top + left, top + left, top + center),
+    stroke: none,
+    table.header([*Tag*], [*Range*], [*Description*], [*Multiplicity*]),
+    table.hline(stroke: stroke(thickness: 2pt)),
+    ..for group in groups {
+      for constraint in chip.constraints.at(group) {
+        (
+          [#tag(constraint, group)],
+          [#interval(constraint)],
+          [#repr_constraint(constraint)],
+          [#expr_to_math(constraint.at("multiplicity", default: ""))],
+        )
+        if has_polynomial_constraints(constraint) {
+          render_polynomial_constraints(constraint)
+        }
+      }
+    },
+  ), caption: [Constraint overview of #chip.name chip.])
+}
diff --git a/spec/expr.typ b/spec/expr.typ
new file mode 100644
index 000000000..df1ddb2e6
--- /dev/null
+++ b/spec/expr.typ
@@ -0,0 +1,119 @@
+// Grammar
+// <expr> ::= ()                           ; ""
+//          | var                          ; str(var)
+//          | int                          ; int
+//          | ["idx", expr1, expr2]        ; expr1[expr2]
+//          | ["not", expr]                ; !expr
+//          | ["+", expr1, expr2, ...]     ; expr1 + expr2 + ...
+//          | ["*", expr1, expr2, ...]     ; expr1 * expr2 * ...
+//          | ["/", expr1, expr2]          ; expr1 / expr2
+//          | ["^", expr1, expr2]          ; expr1^expr2
+//          | ["=", expr1, expr2]          ; expr1 = expr2
+//          | ["-", expr]                  ; -expr
+//          | ["-", expr1, expr2, ...]     ; expr1 - expr2 - ...
+// 
+// 
+// To limit the number of parentheses that are placed in an expression,
+// the formatter passes `pp` (for Parent Precedence) to each recursive subcall,
+// and wraps itself in parentheses when `pp < expr.precedence`.
+//
+// Precedence values:
+// 0 : ^
+// 1 : neg (e.g., 5 =>  -5)
+// 2 : *
+// 3 : /
+// 4 : not (e.g., 5 => 1-5)
+// 5 : +
+// 6 : -
+// 7 : []
+// 8 : =
+// 10: <the void outside every expression>
+#let MAX_PRECEDENCE = 10
+
+// Mutual recursion through a trick from https://github.com/typst/typst/issues/744
+#let make_expr_formatter(dict, empty: none, var: raw, num: str) = {
+  let res(pp, expr) = {
+    if expr == none {
+      empty
+    } else if type(expr) == str {
+      var(expr)
+    } else if type(expr) == int {
+      num(expr)
+    } else if type(expr) == array {
+      (dict.at(expr.at(0), default: (e) => {
+        assert(false, "Invalid expression: " + repr(e))
+      }))(pp, res, expr)
+    }
+  }
+  res.with(MAX_PRECEDENCE)
+}
+
+// Wrap code `expr` if `apply = true`
+#let cwrap(expr, apply) = {
+  if apply {
+    `(` + expr + `)`
+  } else {
+    expr
+  }
+}
+
+// Typeset an expression as code
+#let expr_to_code = make_expr_formatter(
+  (
+    "idx": (pp, rec, e) => rec(0, e.at(1)) + `[` + rec(10, e.at(2)) + `]`,
+    "not": (pp, rec, e) => cwrap(`1 - ` + rec(4, e.at(1)), pp < 4),
+    "+": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(5)).join(` + `), pp < 5),
+    "*": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(2)).join(` ` + sym.dot + ` `), pp < 2),
+    "/": (pp, rec, e) => cwrap(rec(3, e.at(1)), pp < 3) + ` / ` + rec(3, e.at(2)),
+    "^": (pp, rec, e) => {
+      assert(type(e.at(1)) == int and type(e.at(2)) == int, message: "Can only exponentiate constants")
+      rec(0, e.at(1)) + `^` + rec(0, e.at(2))
+    },
+    "=": (pp, rec, e) => rec(8, e.at(1)) + ` = ` + rec(8, e.at(2)),
+    "-": (pp, rec, e) => {
+      if e.len() == 2 {
+        // Negation
+        cwrap(`-` + rec(1, e.at(1)), pp < 1)
+      } else {
+        // Subtraction
+        cwrap(e.slice(1).map(rec.with(6)).join(` - `), pp < 6)
+      }
+    },
+  ),
+)
+
+// Wrap math `expr` if `apply = true`
+#let mwrap(expr, apply) = {
+  if apply {
+    $($ + expr + $)$
+  } else {
+    expr
+  }
+}
+
+// Typeset an expression as math
+#let expr_to_math = make_expr_formatter(
+  (
+    "idx": (pp, rec, e) => $#rec(7, e.at(1))_(#rec(7, e.at(2)))$,
+    "not": (pp, rec, e) => mwrap($1 - #rec(4, e.at(1))$, pp < 4),
+    "+": (pp, rec, e) => mwrap($#e.slice(1).map(rec.with(5)).join($+$)$, pp < 5),
+    "*": (pp, rec, e) => mwrap($#e.slice(1).map(rec.with(3)).join($dot$)$, pp < 3),
+    "/": (pp, rec, e) => $#rec(3, e.at(1)) / #rec(3, e.at(2))$,
+    "^": (pp, rec, e) => {
+      assert(type(e.at(1)) == int and type(e.at(2)) == int, message: "Can only exponentiate constants")
+      $#e.at(1)^#e.at(2)$
+    },
+    "=": (pp, rec, e) => $#rec(8, e.at(1)) = #rec(8, e.at(2))$,
+    "-": (pp, rec, e) => {
+      if e.len() == 2 {
+        // Negation
+        mwrap($-#rec(1, e.at(1))$, pp < 1)
+      } else {
+        // Subtraction
+        mwrap($#e.slice(1).map(rec.with(6)).join($-$)$, pp < 6)
+      }
+    },
+  ),
+  var: v => if v.len() == 1 { $#v$ } else { $#raw(v)$ },
+  num: n => math.equation[#n],
+)
diff --git a/spec/sample_page.typ b/spec/sample_page.typ
deleted file mode 100644
index 6eb300c4e..000000000
--- a/spec/sample_page.typ
+++ /dev/null
@@ -1,7 +0,0 @@
-#import "/book.typ": book-page
-
-#show: book-page.with(title: "Hello, typst")
-
-= Hello, typst
-
-Sample page
diff --git a/spec/src.typ b/spec/src.typ
new file mode 100644
index 000000000..f791bc1ca
--- /dev/null
+++ b/spec/src.typ
@@ -0,0 +1,58 @@
+/// Path to the config file.
+#let CONFIG_PATH = "src/config.toml"
+
+/// Check the configuration object for internal consistency.
+#let _check_config(config) = {
+  // Check that variable subtypes are listed, or "none"
+  let types = config.variables.types
+  for type in types {
+    for subtype in type.subtypes {
+      assert(
+        subtype in types.map(type => type.label),
+        message: "subtype '" + subtype + "' does not exist.",
+      )
+    }
+  }
+
+  // Check that `instantiated` variables are a subset of `all`
+  let categories = config.variables.categories
+  for category in categories.instantiated {
+    assert(
+      category in categories.all,
+      message: "category '" + category + "' part of `instantiated`, but not `all`.",
+    )
+  }
+}
+
+/// Load the configuration file.
+#let load_config() = {
+  let config = toml(CONFIG_PATH)
+  _check_config(config)
+  return config
+}
+
+/// Check a chip object for internal consistency.
+#let _check_chip(chip, config) = {
+  // Check that all variable categories are valid
+  for category in chip.variables.keys() {
+    assert(category in config.variables.categories.all)
+  }
+
+  for var in chip.variables.values().flatten() {
+    // Check that all variable types are valid
+    assert(
+      var.type in config.variables.types.map(type => type.label),
+      message: "found invalid var type:" + var.type,
+    )
+  }
+}
+
+/// Load a chip object from file
+///
+/// - path(str): path to file containing chip data
+/// - config: configuration data this chip needs to match with
+#let load_chip(path, config) = {
+  let chip = toml(path)
+  _check_chip(chip, config)
+  return chip
+}
diff --git a/spec/src/config.toml b/spec/src/config.toml
new file mode 100644
index 000000000..1977b9155
--- /dev/null
+++ b/spec/src/config.toml
@@ -0,0 +1,113 @@
+[metadata]
+version = 1
+
+[[variables.types]]
+label = "BaseField"
+subtypes = ["BaseField"]
+desc = "Variable that can assume any value in the base field."
+
+[[variables.types]]
+label = "Bit"
+subtypes = ["BaseField"]
+desc = "Variable that can only assume values in the set ${0,1}$."
+
+[[variables.types]]
+label = "B4"
+subtypes = ["BaseField"]
+desc = "Variable that can only assume values in the range $[0, 2^4)$."
+
+[[variables.types]]
+label = "Byte"
+subtypes = ["BaseField"]
+count = 1
+desc = "Variable that can only assume values in the range $[0, 2^8)$."
+
+[[variables.types]]
+label = "Half"
+subtypes = ["BaseField"]
+desc = "Variable that can only assume values in the range $[0, 2^16)$."
+
+[[variables.types]]
+label = "Word"
+subtypes = ["BaseField"]
+desc = "Variable that can only assume values in the range $[0, 2^32)$."
+
+[[variables.types]]
+label = "WordHL"
+subtypes = ["Half", "Half"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^32)$. \\
+       Represented as an array of two `Half` variables.\
+       """
+
+[[variables.types]]
+label = "WordBL"
+subtypes = ["Byte", "Byte", "Byte", "Byte"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^32)$. \\
+       Represented as an array of four `Byte` variables.\
+       """
+
+[[variables.types]]
+label = "DWordBL"
+subtypes = ["Byte", "Byte", "Byte", "Byte", "Byte", "Byte", "Byte", "Byte"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^64)$. \\
+       Represented as an array of eight `Byte` variables.\
+       """
+
+[[variables.types]]
+label = "DWordHL"
+subtypes = ["Half", "Half", "Half", "Half"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^64)$. \\
+       Represented as an array of four `Half` variables.\
+       """
+
+[[variables.types]]
+label = "DWordWL"
+subtypes = ["Word", "Word"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^64)$. \\
+       Represented as an array of two `Word` variables.\
+       """
+
+[[variables.types]]
+label = "DWordHHW"
+subtypes = ["Word", "Half", "Half"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^64)$. \\
+       Represented as a `Word` and two `Half` variables. \
+       The `Word` is the least significant digit.
+       """
+
+# TODO: Having to define these manually will get tedious
+[[variables.types]]
+label = "Bit[3]"
+subtypes = ["Bit", "Bit", "Bit"]
+desc = "Three bits"
+
+[[variables.types]]
+label = "Byte[2]"
+subtypes = ["Byte", "Byte"]
+desc = "Two bytes"
+
+[[variables.types]]
+label = "Byte[8]"
+subtypes = ["Byte", "Byte", "Byte", "Byte", "Byte", "Byte", "Byte", "Byte"]
+desc = "Eight bytes"
+
+[[variables.types]]
+label = "Half[3]"
+subtypes = ["Half", "Half", "Half"]
+desc = "Three halfwords"
+
+[[variables.types]]
+label = "Half[8]"
+subtypes = ["Half", "Half", "Half", "Half", "Half", "Half", "Half", "Half"]
+desc = "Eight halfwords"
+
+
+[variables.categories]
+all = ["input", "output", "auxiliary", "virtual", "multiplicity"]
+instantiated = ["input", "output", "auxiliary", "multiplicity"]
diff --git a/spec/variables.typ b/spec/variables.typ
new file mode 100644
index 000000000..42e7bc379
--- /dev/null
+++ b/spec/variables.typ
@@ -0,0 +1,20 @@
+#import "/book.typ": book-page
+#import "/src.typ": load_config
+
+#show: book-page.with(title: "Variables")
+
+#let config = load_config()
+
+= Variables
+While this VM operates on 64-bit words, the proving system's base field has fewer than $2^64$ elements available and thus cannot represent all words natively.
+To this end, we introduce the concept of "variables" as an abstraction layer on top of the VM's field elements. The following table lists all variable types used in this VM.
+
+#table(
+  columns: (auto, 1fr, auto),
+  inset: 7pt,
+  align: (top+left, top+left, top+center, ),
+  table.header([*Name*], [*Description*], [*\#Columns*]),
+  ..for type in config.variables.types {
+    ([#raw(type.label)], [#eval(type.desc, mode: "markup")], [#type.subtypes.len()])
+  },
+)

From efd78680c76b7ab5d31d0664244f1d14776ef3ce Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 30 Dec 2025 15:01:31 +0100
Subject: [PATCH 003/105] spec: Fix some chip rendering pain points (#83)

This fixes the following pain points:
- Assumptions and constraints requiring a `ref` for rendering to succeed
- The `desc` field of `arith` constraints not being rendered
- The `constraint` field of an `arith` constraint using eval in code mode
- Long tables (columns and constraints) didn't break across pages
- Template constraints did not have conditions rendered
- Constraint groups didn't get the proper prefix if specified
- The default branch of expression rendering has missing arguments

It also introduces a nice visual todo macro

---------

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/book.typ |  7 +++++++
 spec/chip.typ | 43 +++++++++++++++++++++++++++++++++----------
 spec/expr.typ |  4 ++--
 3 files changed, 42 insertions(+), 12 deletions(-)

diff --git a/spec/book.typ b/spec/book.typ
index f50d7ed29..8b5cae160 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -13,3 +13,10 @@
 // re-export page template
 #import "/templates/page.typ": project
 #let book-page = project
+
+#let todo(background: white, foreground: black, name: none, body) = block(fill: background, outset: 0.5em, radius: 20%, stroke: black)[
+  #set text(fill: foreground)
+  *TODO #if name != none { [(#name)] }*: #body
+]
+#let rj = todo.with(background: teal, name: "Robin")
+#let et = todo.with(background: rgb("d4aa3a"), name: "Erik")
diff --git a/spec/chip.typ b/spec/chip.typ
index c694db90a..cb8a8e4cf 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -20,6 +20,7 @@
 /// Generates a table listing `chip`'s columns.
 #let render_chip_column_table(chip, config) = {
   // Group variables by category
+  show figure: set block(breakable: true)
   figure(table(
     columns: (auto, auto, 1fr),
     inset: 6pt,
@@ -43,9 +44,11 @@
   ), caption: [Column overview of #chip.name chip.])
 }
 
-#let cref(constraint) = {
-  if "ref" in constraint {
-    label(constraint.ref)
+#let cref(obj, body) = {
+  if "ref" in obj {
+    [#body#label(obj.ref)]
+  } else {
+    body
   }
 }
 
@@ -69,7 +72,7 @@
     let index = if "range" in assumption { "." + assumption.range.at(0) } else { "" }
     let lbl = [#chip.name\-A]
     show figure: (it) => align(left, block[#lbl#context it.counter.display()#index])
-    [#figure(kind: "assumption", numbering: (i) => [#lbl#i#index], supplement: [], [])#cref(assumption)]
+    cref(assumption)[#figure(kind: "assumption", numbering: (i) => [#lbl#i#index], supplement: [], [])]
   }
 
   figure(table(
@@ -96,13 +99,16 @@
   }
   assert(groups.all(group => group in all_groups), message: "unknown group")
 
+  // Find the group definition in the constraint_groups
+  let lookup_group(name) = chip.constraint_groups.filter((g) => g.name == name).at(0, default: (name: name))
+
   /// Render the contraint's tag.
   let tag(constraint, group) = {
     let index = if "range" in constraint { "." + constraint.range.at(0) } else { "" }
     let prefix = if "prefix" in group { group.prefix }
     let lbl = [#chip.name\-C#prefix]
     show figure: (it) => align(left, block[#lbl#context it.counter.display()#index])
-    [#figure(kind: "constraint", numbering: (i) => [#lbl#i#index], supplement: [], [])#cref(constraint)]
+    cref(constraint)[#figure(kind: "constraint", numbering: (i) => [#lbl#i#index], supplement: [], [])]
   }
 
   /// Generates a representation of `constraint`
@@ -112,17 +118,25 @@
     if kind == "interaction" {
       raw(constraint.tag) + `[` + args_interaction_like(constraint.input, constraint.at("output", default: none)) + `]`
     } else if kind == "arith" {
-      [#eval(constraint.constraint)]
+      [#eval(constraint.constraint, mode: "markup")]
     } else if kind == "template" {
-      raw(constraint.tag) + `<` + args_interaction_like(constraint.input, constraint.at("output", default: none)) + `>`
+      let cond = if "cond" in constraint {
+        $#expr_to_math(constraint.cond) arrow.r.double$ + " "
+      }
+      cond + raw(constraint.tag) + `<` + args_interaction_like(constraint.input, constraint.at("output", default: none)) + `>`
     } else {
       assert(false, message: "illegal constraint format: " + kind)
     }
   }
 
-  // Whether constraints has polynomial constraints
+  // Whether constraint has polynomial constraints
   let has_polynomial_constraints(constraint) = {
-    constraint.at("kind") == "arith" and ("poly" in constraint or "polys" in constraint)
+    constraint.kind == "arith" and ("poly" in constraint or "polys" in constraint)
+  }
+
+  // Whether constraint has a "desc" field we need to render separately
+  let has_extra_description(constraint) = {
+    constraint.kind == "arith" and "desc" in constraint
   }
 
   // Rendering polynomial constraints
@@ -139,6 +153,12 @@
     },)
   }
 
+  // Rendering the additional "desc" field for arith constraints
+  let render_extra_description(constraint) = {
+    ([_description_], [], eval(constraint.desc, mode: "markup"), [])
+  }
+
+  show figure: set block(breakable: true)
   figure(table(
     columns: (auto, auto, 1fr, auto),
     inset: 6pt,
@@ -149,11 +169,14 @@
     ..for group in groups {
       for constraint in chip.constraints.at(group) {
         (
-          [#tag(constraint, group)],
+          [#tag(constraint, lookup_group(group))],
           [#interval(constraint)],
           [#repr_constraint(constraint)],
           [#expr_to_math(constraint.at("multiplicity", default: ""))],
         )
+        if has_extra_description(constraint) {
+          render_extra_description(constraint)
+        }
         if has_polynomial_constraints(constraint) {
           render_polynomial_constraints(constraint)
         }
diff --git a/spec/expr.typ b/spec/expr.typ
index df1ddb2e6..1c08655fb 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -40,8 +40,8 @@
     } else if type(expr) == int {
       num(expr)
     } else if type(expr) == array {
-      (dict.at(expr.at(0), default: (e) => {
-        assert(false, "Invalid expression: " + repr(e))
+      (dict.at(expr.at(0), default: (pp, rec, e) => {
+        assert(false, message: "Invalid expression: " + repr(e))
       }))(pp, res, expr)
     }
   }

From 3084695bdfd2cc6272626c9bdf6539214183ce66 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Tue, 30 Dec 2025 17:11:11 +0100
Subject: [PATCH 004/105] spec: support array-like types (#85)

Support array-like variable types.

Typed as:
```toml
[[variables.auxiliary]]
name = "var"
type = ["Bit", 5]
desc = "five bits"
```

---------

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>
---
 spec/chip.typ        | 13 ++++++++++---
 spec/expr.typ        | 21 +++++++++++++++++++++
 spec/src.typ         | 12 ++++++++----
 spec/src/config.toml | 27 ---------------------------
 4 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index cb8a8e4cf..41bd44a29 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -1,4 +1,4 @@
-#import "expr.typ": expr_to_code, expr_to_math
+#import "expr.typ": expr_to_code, expr_to_math, type_to_code
 
 /// Computes the total number of variables in a `chip`
 #let total_nr_variables(chip) = {
@@ -13,7 +13,14 @@
   .filter(pair => pair.at(0) in config.variables.categories.instantiated)
   .map(pair => pair.at(1))
   .flatten()
-  .map(var => config.variables.types.filter(type => type.label == var.type).at(0).subtypes.len())
+  .map(var => {
+    let (label, factor) = if type(var.type) == array {
+      (var.type.at(0), var.type.at(1))
+    } else {
+      (var.type, 1)
+    }
+    config.variables.types.filter(type => type.label == label).first().subtypes.len() * factor
+  })
   .sum()
 }
 
@@ -31,7 +38,7 @@
     ..for (cat, vars) in chip.variables.pairs() {
       ([#emph(cat)], [], [], table.hline(stroke: .6pt))
       for var in vars {
-        ([#raw(var.name)], [#raw(var.type)], [#eval(var.desc, mode: "markup")])
+        ([#raw(var.name)], [#type_to_code(var.type)], [#eval(var.desc, mode: "markup")])
         for (i, poly) in var.at("polys", default: ()).enumerate() {
           (if i == 0 { emph[def] }, [], expr_to_math(("=", ("idx", var.name, i), poly)))
         }
diff --git a/spec/expr.typ b/spec/expr.typ
index 1c08655fb..8b207fd4b 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -117,3 +117,24 @@
   var: v => if v.len() == 1 { $#v$ } else { $#raw(v)$ },
   num: n => math.equation[#n],
 )
+
+// Check that a type expression is structurally valid, without validating against a set of known base types
+#let check_array_type(typ) = {
+  assert(type(typ.at(0)) == str, message: "Array types need to have a regular type as base")
+  assert(type(typ.at(1)) == int, message: "Array types need to have a constant dimension")
+}
+
+// Render a type to code
+#let type_to_code(typ) = {
+  if type(typ) == array {
+    check_array_type(typ)
+    return raw(typ.at(0) + "[" + str(typ.at(1)) + "]")
+  } else if type(typ) == string {
+    return raw(typ)
+  } else {
+    assert(false, message: "Unknown format for type: " + repr(typ))
+  }
+}
+
+// Render a type to math
+#let type_to_math(typ) = render_type_to_code(typ) // The code version looks reasonable enough in math too
diff --git a/spec/src.typ b/spec/src.typ
index f791bc1ca..75a81a5f9 100644
--- a/spec/src.typ
+++ b/spec/src.typ
@@ -38,12 +38,16 @@
     assert(category in config.variables.categories.all)
   }
 
+  let all_labels = config.variables.types.map(type => type.label);
   for var in chip.variables.values().flatten() {
+    let type_label = if type(var.type) == array {
+      var.type.at(0)
+    } else {
+      var.type
+    }
+
     // Check that all variable types are valid
-    assert(
-      var.type in config.variables.types.map(type => type.label),
-      message: "found invalid var type:" + var.type,
-    )
+    assert(type_label in all_labels, message: "found invalid var type:" + repr(var.type))
   }
 }
 
diff --git a/spec/src/config.toml b/spec/src/config.toml
index 1977b9155..29c0b9d83 100644
--- a/spec/src/config.toml
+++ b/spec/src/config.toml
@@ -81,33 +81,6 @@ desc = """\
        The `Word` is the least significant digit.
        """
 
-# TODO: Having to define these manually will get tedious
-[[variables.types]]
-label = "Bit[3]"
-subtypes = ["Bit", "Bit", "Bit"]
-desc = "Three bits"
-
-[[variables.types]]
-label = "Byte[2]"
-subtypes = ["Byte", "Byte"]
-desc = "Two bytes"
-
-[[variables.types]]
-label = "Byte[8]"
-subtypes = ["Byte", "Byte", "Byte", "Byte", "Byte", "Byte", "Byte", "Byte"]
-desc = "Eight bytes"
-
-[[variables.types]]
-label = "Half[3]"
-subtypes = ["Half", "Half", "Half"]
-desc = "Three halfwords"
-
-[[variables.types]]
-label = "Half[8]"
-subtypes = ["Half", "Half", "Half", "Half", "Half", "Half", "Half", "Half"]
-desc = "Eight halfwords"
-
-
 [variables.categories]
 all = ["input", "output", "auxiliary", "virtual", "multiplicity"]
 instantiated = ["input", "output", "auxiliary", "multiplicity"]

From 5e6a7d88cd884e2ea806d23cf9bdaa4c6b5f0821 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 30 Dec 2025 17:38:16 +0100
Subject: [PATCH 005/105] spec: Fixup wrong type sanity check for array types
 (#86)

---
 spec/expr.typ | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/expr.typ b/spec/expr.typ
index 8b207fd4b..d5a2e4d8d 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -129,7 +129,7 @@
   if type(typ) == array {
     check_array_type(typ)
     return raw(typ.at(0) + "[" + str(typ.at(1)) + "]")
-  } else if type(typ) == string {
+  } else if type(typ) == str {
     return raw(typ)
   } else {
     assert(false, message: "Unknown format for type: " + repr(typ))

From 5157849a3709060054cb6fda74970b91023344d8 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 30 Dec 2025 17:33:57 +0100
Subject: [PATCH 006/105] Make precedence a lookup table instead of hardcoding
 it

---
 spec/expr.typ | 65 +++++++++++++++++++++++++++------------------------
 1 file changed, 34 insertions(+), 31 deletions(-)

diff --git a/spec/expr.typ b/spec/expr.typ
index d5a2e4d8d..c88168c27 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -16,19 +16,21 @@
 // To limit the number of parentheses that are placed in an expression,
 // the formatter passes `pp` (for Parent Precedence) to each recursive subcall,
 // and wraps itself in parentheses when `pp < expr.precedence`.
-//
-// Precedence values:
-// 0 : ^
-// 1 : neg (e.g., 5 =>  -5)
-// 2 : *
-// 3 : /
-// 4 : not (e.g., 5 => 1-5)
-// 5 : +
-// 6 : -
-// 7 : []
-// 8 : =
-// 10: <the void outside every expression>
-#let MAX_PRECEDENCE = 10
+
+#let PREC = (
+  "MIN": -1, // <the most secret heart of any expression>
+  "pow": 0,  // ^
+  "neg": 1,  // Unary -
+  "mul": 2,  // *
+  "div": 3,  // /
+  "not": 4,  // not
+  "add": 5,  // +
+  "sub": 6,  // -
+  "idx": 7,  // []
+  "cast": 8, // cast
+  "eq": 9,   // =
+  "MAX": 10, // <the void outside every expression>
+)
 
 // Mutual recursion through a trick from https://github.com/typst/typst/issues/744
 #let make_expr_formatter(dict, empty: none, var: raw, num: str) = {
@@ -45,7 +47,7 @@
       }))(pp, res, expr)
     }
   }
-  res.with(MAX_PRECEDENCE)
+  res.with(PREC.MAX)
 }
 
 // Wrap code `expr` if `apply = true`
@@ -60,23 +62,24 @@
 // Typeset an expression as code
 #let expr_to_code = make_expr_formatter(
   (
-    "idx": (pp, rec, e) => rec(0, e.at(1)) + `[` + rec(10, e.at(2)) + `]`,
-    "not": (pp, rec, e) => cwrap(`1 - ` + rec(4, e.at(1)), pp < 4),
-    "+": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(5)).join(` + `), pp < 5),
-    "*": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(2)).join(` ` + sym.dot + ` `), pp < 2),
-    "/": (pp, rec, e) => cwrap(rec(3, e.at(1)), pp < 3) + ` / ` + rec(3, e.at(2)),
+    "idx": (pp, rec, e) => rec(PREC.MIN, e.at(1)) + `[` + rec(PREC.MAX, e.at(2)) + `]`,
+    "not": (pp, rec, e) => cwrap(`1 - ` + rec(PREC.not, e.at(1)), pp < PREC.not),
+    "+": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(PREC.add)).join(` + `), pp < PREC.add),
+    "*": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(PREC.mul)).join(` ` + sym.dot + ` `), pp < PREC.mul),
+    "/": (pp, rec, e) => cwrap(rec(PREC.div, e.at(1)) + ` / ` + rec(PREC.div, e.at(2)), pp < PREC.div), // wrap the whole quotient, not only the numerator
     "^": (pp, rec, e) => {
       assert(type(e.at(1)) == int and type(e.at(2)) == int, message: "Can only exponentiate constants")
-      rec(0, e.at(1)) + `^` + rec(0, e.at(2))
+      // technically wrong associativity, but it's a constant
+      rec(PREC.pow, e.at(1)) + `^` + rec(PREC.pow, e.at(2))
     },
-    "=": (pp, rec, e) => rec(8, e.at(1)) + ` = ` + rec(8, e.at(2)),
+    "=": (pp, rec, e) => rec(PREC.eq, e.at(1)) + ` = ` + rec(PREC.eq, e.at(2)),
     "-": (pp, rec, e) => {
       if e.len() == 2 {
         // Negation
-        cwrap(`-` + rec(1, e.at(1)), pp < 1)
+        cwrap(`-` + rec(PREC.neg, e.at(1)), pp < PREC.neg)
       } else {
         // Subtraction
-        cwrap(e.slice(1).map(rec.with(6)).join(` - `), pp < 6)
+        cwrap(e.slice(1).map(rec.with(PREC.sub)).join(` - `), pp < PREC.sub)
       }
     },
   ),
@@ -94,23 +97,23 @@
 // Typeset an expression as math
 #let expr_to_math = make_expr_formatter(
   (
-    "idx": (pp, rec, e) => $#rec(7, e.at(1))_(#rec(7, e.at(2)))$,
-    "not": (pp, rec, e) => mwrap($1 - #rec(4, e.at(1))$, pp < 4),
-    "+": (pp, rec, e) => mwrap($#e.slice(1).map(rec.with(5)).join($+$)$, pp < 5),
-    "*": (pp, rec, e) => mwrap($#e.slice(1).map(rec.with(3)).join($dot$)$, pp < 3),
-    "/": (pp, rec, e) => $#rec(3, e.at(1)) / #rec(3, e.at(2))$,
+    "idx": (pp, rec, e) => $#rec(PREC.idx, e.at(1))_(#rec(PREC.idx, e.at(2)))$,
+    "not": (pp, rec, e) => mwrap($1 - #rec(PREC.not, e.at(1))$, pp < PREC.not),
+    "+": (pp, rec, e) => mwrap($#e.slice(1).map(rec.with(PREC.add)).join($+$)$, pp < PREC.add),
+    "*": (pp, rec, e) => mwrap($#e.slice(1).map(rec.with(PREC.mul)).join($dot$)$, pp < PREC.mul),
+    "/": (pp, rec, e) => $#rec(PREC.div, e.at(1)) / #rec(PREC.div, e.at(2))$,
     "^": (pp, rec, e) => {
       assert(type(e.at(1)) == int and type(e.at(2)) == int, message: "Can only exponentiate constants")
       $#e.at(1)^#e.at(2)$
     },
-    "=": (pp, rec, e) => $#rec(8, e.at(1)) = #rec(8, e.at(2))$,
+    "=": (pp, rec, e) => $#rec(PREC.eq, e.at(1)) = #rec(PREC.eq, e.at(2))$,
     "-": (pp, rec, e) => {
       if e.len() == 2 {
         // Negation
-        mwrap($-#rec(1, e.at(1))$, pp < 1)
+        mwrap($-#rec(PREC.neg, e.at(1))$, pp < PREC.neg)
       } else {
         // Subtraction
-        mwrap($#e.slice(1).map(rec.with(6)).join($-$)$, pp < 6)
+        mwrap($#e.slice(1).map(rec.with(PREC.sub)).join($-$)$, pp < PREC.sub)
       }
     },
   ),

From 9f9c2b9c77204bcda28339ecab67f2126c0a0df2 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 30 Dec 2025 17:49:54 +0100
Subject: [PATCH 007/105] Render type cast expressions

---
 spec/expr.typ | 58 ++++++++++++++++++++++++++++++++-------------------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/spec/expr.typ b/spec/expr.typ
index c88168c27..88d66d5d9 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -1,4 +1,30 @@
-// Grammar
+// Types and array types
+// <type> ::= str
+//          | [str, int]
+
+// Check that a type expression is structurally valid, without validating against a set of known base types
+#let check_array_type(typ) = {
+  assert(type(typ.at(0)) == str, message: "Array types need to have a regular type as base")
+  assert(type(typ.at(1)) == int, message: "Array types need to have a constant dimension")
+}
+
+// Render a type to code
+#let type_to_code(typ) = {
+  if type(typ) == array {
+    check_array_type(typ)
+    return raw(typ.at(0) + "[" + str(typ.at(1)) + "]")
+  } else if type(typ) == str {
+    return raw(typ)
+  } else {
+    assert(false, message: "Unknown format for type: " + repr(typ))
+  }
+}
+
+// Render a type to math
+#let type_to_math(typ) = type_to_code(typ) // The code version looks reasonable enough in math too
+
+
+// Expression grammar
 // <expr> ::= ()                           ; ""
 //          | var                          ; str(var)
 //          | int                          ; int
@@ -11,6 +37,7 @@
 //          | ["=", expr1, expr2]          ; expr1 = expr2
 //          | ["-", expr]                  ; -expr
 //          | ["-", expr1, expr2, ...]     ; expr1 - expr2 - ...
+//          | ["cast", expr, type]         ; expr as type
 // 
 // 
 // To limit the number of parentheses that are placed in an expression,
@@ -82,6 +109,10 @@
         cwrap(e.slice(1).map(rec.with(PREC.sub)).join(` - `), pp < PREC.sub)
       }
     },
+    "cast": (pp, rec, e) => {
+      assert(e.len() == 3, message: "Invalid type cast: " + repr(e))
+      cwrap(rec(PREC.cast, e.at(1)) + ` as ` + type_to_code(e.at(2)), pp < PREC.cast)
+    },
   ),
 )
 
@@ -116,28 +147,11 @@
         mwrap($#e.slice(1).map(rec.with(PREC.sub)).join($-$)$, pp < PREC.sub)
       }
     },
+    "cast": (pp, rec, e) => { // ["cast", expr, type] is typeset as `expr :: type`
+      assert(e.len() == 3, message: "Invalid type cast: " + repr(e))
+      mwrap($#rec(PREC.cast, e.at(1)) colon.double #type_to_math(e.at(2))$, pp < PREC.cast) // math wrapper, consistent with the other math entries
+    },
   ),
   var: v => if v.len() == 1 { $#v$ } else { $#raw(v)$ },
   num: n => math.equation[#n],
 )
-
-// Check that a type expression is structurally valid, without validating against a set of known base types
-#let check_array_type(typ) = {
-  assert(type(typ.at(0)) == str, message: "Array types need to have a regular type as base")
-  assert(type(typ.at(1)) == int, message: "Array types need to have a constant dimension")
-}
-
-// Render a type to code
-#let type_to_code(typ) = {
-  if type(typ) == array {
-    check_array_type(typ)
-    return raw(typ.at(0) + "[" + str(typ.at(1)) + "]")
-  } else if type(typ) == str {
-    return raw(typ)
-  } else {
-    assert(false, message: "Unknown format for type: " + repr(typ))
-  }
-}
-
-// Render a type to math
-#let type_to_math(typ) = render_type_to_code(typ) // The code version looks reasonable enough in math too

From 9ad78c3190b0c88d0a1618caedb1fae81eadd6e1 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 30 Dec 2025 14:31:53 +0100
Subject: [PATCH 008/105] spec: Allow desc field on non-arith constraints as
 clarification

---
 spec/chip.typ | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index 41bd44a29..d207cec8f 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -143,7 +143,7 @@
 
   // Whether constraint has a "desc" field we need to render separately
   let has_extra_description(constraint) = {
-    constraint.kind == "arith" and "desc" in constraint
+    "desc" in constraint
   }
 
   // Rendering polynomial constraints

From 03d30111040479df53223228ab77a1fe1e042eae Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Wed, 31 Dec 2025 10:51:52 +0100
Subject: [PATCH 009/105] spec: Modify cast operator precedence

---
 spec/expr.typ | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/spec/expr.typ b/spec/expr.typ
index 88d66d5d9..c005e5a35 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -48,13 +48,13 @@
   "MIN": -1, // <the most secret heart of any expression>
   "pow": 0,  // ^
   "neg": 1,  // Unary -
-  "mul": 2,  // *
-  "div": 3,  // /
-  "not": 4,  // not
-  "add": 5,  // +
-  "sub": 6,  // -
-  "idx": 7,  // []
-  "cast": 8, // cast
+  "cast": 2, // cast
+  "mul": 3,  // *
+  "div": 4,  // /
+  "not": 5,  // not
+  "add": 6,  // +
+  "sub": 7,  // -
+  "idx": 8,  // []
   "eq": 9,   // =
   "MAX": 10, // <the void outside every expression>
 )

From f3f21bcdf00a776f24956da3323b73a217436b8f Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Fri, 2 Jan 2026 10:43:57 +0100
Subject: [PATCH 010/105] spec: improve definitions (#91)

Updated definition rendering:
* definitions are only allowed for virtual variables
* definitions are now labelled with `def` rather than `poly` or `polys`
* more flexible definitions possible for array-type virtuals.
---
 spec/chip.typ | 75 ++++++++++++++++++++++++++++++++++++++++++++-------
 spec/expr.typ |  4 ++-
 spec/src.typ  | 12 ++++++++-
 3 files changed, 79 insertions(+), 12 deletions(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index d207cec8f..713f39b87 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -26,27 +26,82 @@
 
 /// Generates a table listing `chip`'s columns.
 #let render_chip_column_table(chip, config) = {
+
+  // Render a definition's range
+  let render_def_range(idx, range) = {
+    if type(range) == array {
+      if range.len() == 1 {
+        [#raw(idx) `=` #range.at(0)]
+      } else if range.len() == 2 {
+        [#raw(idx) #sym.in `[`#range.at(0)`,`#range.at(1)`]`]
+      } else {
+        assert(false, message: "invalid range: " + repr(range) + repr(range.len()))
+      }
+    } else {
+      [#raw(idx) `=` #range]
+    }
+  }
+
+  // Render definition `def`
+  let render_definition(def, var_name) = {
+    if type(def) in (array, str) {
+      return (
+        [],
+        table.cell(align: right, emph[definition]), 
+        table.cell(colspan: 2, expr_to_math(def))
+      )
+    }
+
+    assert(type(def) == dictionary, message: "invalid definition: " + repr(def))
+
+    if "poly" in def {
+      (
+        [],
+        table.cell(align: right, emph[definition]), 
+        expr_to_math((":=", ("idx", var_name, def.idx), def.poly)),
+        render_def_range(def.idx, def.range)
+      )
+    } else if "polys" in def {
+      (
+        [],
+        table.cell(align: right, emph[definition]), 
+        table.cell(colspan: 2, expr_to_math(("idx", var_name, def.idx)))
+      )
+      for (i, poly) in def.polys.enumerate() {
+        (
+          [],
+          [],              
+          expr_to_math((":=", "  ", poly.poly)),
+          render_def_range(def.idx, poly.range), 
+        )
+      }
+    } else {
+      assert(false, message: "invalid definition: " + repr(def))
+    }
+  }
+
   // Group variables by category
   show figure: set block(breakable: true)
   figure(table(
-    columns: (auto, auto, 1fr),
+    columns: (auto, auto, 1fr, auto),
     inset: 6pt,
     align: left + top,
     stroke: none,
-    table.header([*Label*], [*Type*], [*Description*]),
+    table.header([*Label*], [*Type*], table.cell(colspan: 2, [*Description*])),
     table.hline(stroke: stroke(thickness: 2pt)),
     ..for (cat, vars) in chip.variables.pairs() {
-      ([#emph(cat)], [], [], table.hline(stroke: .6pt))
+      (table.cell(colspan: 4, emph(cat)), table.hline(stroke: .6pt))
       for var in vars {
-        ([#raw(var.name)], [#type_to_code(var.type)], [#eval(var.desc, mode: "markup")])
-        for (i, poly) in var.at("polys", default: ()).enumerate() {
-          (if i == 0 { emph[def] }, [], expr_to_math(("=", ("idx", var.name, i), poly)))
-        }
-        if "poly" in var {
-          (emph[def], [], expr_to_math(var.poly))
+        (
+          [#raw(var.name)], 
+          [#type_to_code(var.type)], 
+          table.cell(colspan: 2, [#eval(var.desc, mode: "markup")])
+        )
+        if "def" in var {
+          render_definition(var.def, var.name)
         }
       }
-      ([], [], [])
+      (table.cell(colspan: 4, []), )
     },
   ), caption: [Column overview of #chip.name chip.])
 }
diff --git a/spec/expr.typ b/spec/expr.typ
index c005e5a35..ae7bd0792 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -55,7 +55,7 @@
   "add": 6,  // +
   "sub": 7,  // -
   "idx": 8,  // []
-  "eq": 9,   // =
+  "eq": 9,   // = and :=
   "MAX": 10, // <the void outside every expression>
 )
 
@@ -100,6 +100,7 @@
       rec(PREC.pow, e.at(1)) + `^` + rec(PREC.pow, e.at(2))
     },
     "=": (pp, rec, e) => rec(PREC.eq, e.at(1)) + ` = ` + rec(PREC.eq, e.at(2)),
+    ":=": (pp, rec, e) => rec(PREC.eq, e.at(1)) + ` := ` + rec(PREC.eq, e.at(2)),
     "-": (pp, rec, e) => {
       if e.len() == 2 {
         // Negation
@@ -138,6 +139,7 @@
       $#e.at(1)^#e.at(2)$
     },
     "=": (pp, rec, e) => $#rec(PREC.eq, e.at(1)) = #rec(PREC.eq, e.at(2))$,
+    ":=": (pp, rec, e) => $#rec(PREC.eq, e.at(1)) := #rec(PREC.eq, e.at(2))$,
     "-": (pp, rec, e) => {
       if e.len() == 2 {
         // Negation
diff --git a/spec/src.typ b/spec/src.typ
index 75a81a5f9..7c9e68487 100644
--- a/spec/src.typ
+++ b/spec/src.typ
@@ -38,8 +38,18 @@
     assert(category in config.variables.categories.all)
   }
 
+  // Check that `def` is only contained in `virtual` variables
+  let non_virtual_vars = chip.variables.pairs().filter(x => x.first() != "virtual").map(x => x.last()).flatten();
+  for var in non_virtual_vars {
+    assert(
+      "def" not in var,
+      message: "illegal `def` in non-virtual var: " + repr(var.name)
+    )
+  }
+
+  let all_vars = chip.variables.values().flatten()
   let all_labels = config.variables.types.map(type => type.label);
-  for var in chip.variables.values().flatten() {
+  for var in all_vars {
     let type_label = if type(var.type) == array {
       var.type.at(0)
     } else {

From 442e32719acf52d91d31972d298096dbbd541c3b Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Fri, 2 Jan 2026 15:26:29 +0100
Subject: [PATCH 011/105] spec: update table rendering (#93)

* spec: update `description` printing
* spec: update `polynomial constraint` printing
---
 spec/chip.typ | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index 713f39b87..30b93373c 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -211,13 +211,13 @@
     }
 
     (..for poly in polys {
-      ([_polynomial constraint_], [], $#expr_to_math(poly) = 0$, [])
+      (table.cell(align: right, colspan: 2, [_polynomial constraint_]), $#expr_to_math(poly) = 0$, [])
     },)
   }
 
   // Rendering the additional "desc" field for arith constraints
   let render_extra_description(constraint) = {
-    ([_description_], [], eval(constraint.desc, mode: "markup"), [])
+    (table.cell(align: right, colspan: 2, [_description_]), eval(constraint.desc, mode: "markup"), [])
   }
 
   show figure: set block(breakable: true)

From c271c9309fd91b26f5d52e42dcd48d0ce93aeb54 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Sat, 3 Jan 2026 14:55:47 +0100
Subject: [PATCH 012/105] spec: introduce "condition" column type

---
 spec/src.typ         | 5 ++++-
 spec/src/config.toml | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/spec/src.typ b/spec/src.typ
index 7c9e68487..8200b47c1 100644
--- a/spec/src.typ
+++ b/spec/src.typ
@@ -35,7 +35,10 @@
 #let _check_chip(chip, config) = {
   // Check that all variable categories are valid
   for category in chip.variables.keys() {
-    assert(category in config.variables.categories.all)
+    assert(
+      category in config.variables.categories.all, 
+      message: "invalid category: " + repr(category)
+    )
   }
 
   // Check that `def` is only contained in `virtual` variables
diff --git a/spec/src/config.toml b/spec/src/config.toml
index 29c0b9d83..28abdcbfc 100644
--- a/spec/src/config.toml
+++ b/spec/src/config.toml
@@ -82,5 +82,5 @@ desc = """\
        """
 
 [variables.categories]
-all = ["input", "output", "auxiliary", "virtual", "multiplicity"]
+all = ["input", "output", "auxiliary", "virtual", "multiplicity", "condition"]
 instantiated = ["input", "output", "auxiliary", "multiplicity"]

From a0801d9806e66e2b175a4d19d962d4c05cddf2d2 Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Sat, 3 Jan 2026 16:24:33 +0100
Subject: [PATCH 013/105] spec: is_bit template

---
 spec/book.typ        |  1 +
 spec/is_bit.typ      | 47 ++++++++++++++++++++++++++++++++++++++++++++
 spec/src/is_bit.toml | 20 +++++++++++++++++++
 3 files changed, 68 insertions(+)
 create mode 100644 spec/is_bit.typ
 create mode 100644 spec/src/is_bit.toml

diff --git a/spec/book.typ b/spec/book.typ
index 8b5cae160..dcf1d0c1b 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -7,6 +7,7 @@
   title: "Lambda VM specification",
   summary: [
     #chapter("variables.typ")[Variables]
+    #chapter("is_bit.typ")[IS_BIT template]
   ]
 )
 
diff --git a/spec/is_bit.typ b/spec/is_bit.typ
new file mode 100644
index 000000000..c379080cd
--- /dev/null
+++ b/spec/is_bit.typ
@@ -0,0 +1,47 @@
+#import "/book.typ": book-page
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": render_chip_column_table, render_constraint_table
+
+#show: book-page.with(title: "IS_BIT template")
+
+#let config = load_config()
+#let chip = load_chip("src/is_bit.toml", config)
+
+#let is_bit = raw(chip.name)
+
+#let highlighted_code(code) = {
+  box(
+    inset: (left: 4pt, right: 4pt), 
+    outset: (top: 4pt, bottom: 4pt), 
+    radius: 2pt,
+    fill: luma(230), 
+    raw(code))
+}
+
+= #is_bit template
+#is_bit is a constraint template that is used to assert that a variable lies in the range ${0, 1}$ if some second variable is non-zero.
+Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
+
+== Interface
+The #is_bit constraint template has the following interface:
+#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => IS_BIT<X>"))
+where `cond` is any value described by an expression _of degree at most $1$_.
+Note that #highlighted_code("IS_BIT<X>") can be used to denote the _unconditional_ application of the #is_bit template to `X`.
+
+== Variables
+The #is_bit template operates on two variables: `cond` and `X`:
+#render_chip_column_table(chip, config)
+
+== Constraints
+It takes only one constraint to enforce that `X` must be either $0$ or $1$ whenever $#`cond` eq.not 0$:
+#render_constraint_table(chip, config)
+*Note*: 
+- In case of _unconditional_ template application, `cond` can be dropped from the constraint, simplifying it to $#`X` (1- #`X`) = 0$.
+- As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression.
+  This is to make sure that @isbit:c:isbit's expression has degree at most 3.
+
+== Proof of correctness
+If `cond` is $0$, @isbit:c:isbit is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to $0$ regardless. 
+When $#`cond` eq.not 0$, it follows that the statement can only be proven when $#`X` (1-#`X`) equiv 0 mod p$, with $p$ the modulus of the field.
+Because `BaseField` is a prime field, this equality is only satisfied if either $#`X` equiv 0 mod p$ or $1-#`X` equiv 0 mod p$.
+Hence, it is proven that when $#`cond` eq.not 0$, @isbit:c:isbit is only satisfied if $#`X` in {0, 1}$. #align(right, $qed$)
diff --git a/spec/src/is_bit.toml b/spec/src/is_bit.toml
new file mode 100644
index 000000000..47e96a27e
--- /dev/null
+++ b/spec/src/is_bit.toml
@@ -0,0 +1,20 @@
+name = "IS_BIT"
+
+[[variables.condition]]
+name = "cond"
+type = "BaseField"
+desc = "Whether the constraint should be applied ($eq.not 0$) or not ($0$)."
+
+[[variables.input]]
+name = "X"
+type = "BaseField"
+desc = "Value for which to assert that it lies in the range ${0, 1}$."
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "arith"
+constraint = "$#`cond` => #`X` (1-#`X`) = 0$"
+poly = ["*", "cond", "X", ["not", "X"]]
+ref = "isbit:c:isbit"

From 37d4cc056b84db732cd8b05b29a518edd5271552 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Mon, 5 Jan 2026 11:42:10 +0100
Subject: [PATCH 014/105] spec: CPU chip for RV64IMC (#88)

* spec: Initial CPU version to handle RV64IMC

* Address review comments

* Add word_instr as input to SHIFT

---------

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/book.typ        |   2 +-
 spec/cpu.typ         |  84 +++++
 spec/src/config.toml |  16 +-
 spec/src/cpu.toml    | 727 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 827 insertions(+), 2 deletions(-)
 create mode 100644 spec/cpu.typ
 create mode 100644 spec/src/cpu.toml

diff --git a/spec/book.typ b/spec/book.typ
index dcf1d0c1b..c4c3a8c6d 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -1,4 +1,3 @@
-
 #import "@preview/shiroa:0.3.1": *
 
 #show: book
@@ -8,6 +7,7 @@
   summary: [
     #chapter("variables.typ")[Variables]
     #chapter("is_bit.typ")[IS_BIT template]
+    #chapter("cpu.typ")[CPU chip]
   ]
 )
 
diff --git a/spec/cpu.typ b/spec/cpu.typ
new file mode 100644
index 000000000..00a33f5a2
--- /dev/null
+++ b/spec/cpu.typ
@@ -0,0 +1,84 @@
+#import "/book.typ": book-page, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_column_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/cpu.toml", config)
+
+#show: book-page.with(title: "CPU chip")
+
+== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+
+The `CPU` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
+#render_chip_column_table(chip, config)
+
+== Assumptions
+#render_chip_assumptions(chip, config)
+
+== Constraints
+First, we perform a decoding lookup for the current PC.
+
+#render_constraint_table(chip, config, groups: "decode")
+
+#rj[All casts for interactions will have to be reviewed once other chip interfaces stabilise]
+
+=== Range checks
+
+We constrain all columns to have the appropriate ranges.
+The flags and register indices looked up from the decoding need to be checked,
+as they are communicated through the interaction in a packed form.
+In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`.
+Similarly, since `next_pc` will propagate through the memory argument and be looked up
+in the instruction decoding on the next cycle, it is forced to be in the correct range.#rj[is this true, do we need this elsewhere for chip assumptions?]
+For the auxiliary columns, we need to check the limbs of `arg1`, `arg2`, and `res`.
+The ranges of the other auxiliary columns are enforced through later constraints.
+#rj[Make sure we argue for every column here]
+#rj[is `rvd` still sufficiently constrained? (can also be done through the memory argument like `pc`?)]
+
+#render_constraint_table(chip, config, groups: "range")
+
+=== ALU
+
+The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
+
+#render_constraint_table(chip, config, groups: "alu")
+
+=== Memory
+
+The interactions with memory are handled here, both for register loading and storing and for the `LOAD` and `STORE` instructions.
+Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs.
+The timestamps are ensured to be disjoint for disjoint memory locations.
+One consequence of that is that `next_pc` is written at `timestamp + 1`
+to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction.
+
+#render_constraint_table(chip, config, groups: "mem")
+
+=== System
+
+The interactions with the wider system.
+
+#render_constraint_table(chip, config, groups: "sys")
+
+=== Input and output to the ALU
+
+We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values,
+including the appropriate sign/zero extension, depending on `word_instr`.
+
+#render_constraint_table(chip, config, groups: "ext")
+
+=== Other constraints
+
+#rj[proper ref to IsZero/IsEqual]
+For @cpu:c:is_equal, refer to the logic of IsZero or IsEqual, in combination with the subtraction of @cpu:c:sub.
+
+#render_constraint_table(chip, config, groups: "misc")
+
+#rj[Document the choice to not have a multiplicity column here for padding]
diff --git a/spec/src/config.toml b/spec/src/config.toml
index 28abdcbfc..b66639e2a 100644
--- a/spec/src/config.toml
+++ b/spec/src/config.toml
@@ -78,9 +78,23 @@ subtypes = ["Word", "Half", "Half"]
 desc = """\
        Variable that can only assume values in the range $[0, 2^64)$. \\
        Represented as a `Word` and two `Half` variables.\
-       The `Word` is the least significant digit.
+       The `Word` is the *least* significant digit.
        """
 
+[[variables.types]]
+label = "DWordWHH"
+subtypes = ["Half", "Half", "Word"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^64)$. \\
+       Represented as a `Word` and two `Half` variables. \
+       The `Word` is the *most* significant digit.
+       """
+
+[[variables.types]]
+label = "Timestamp"
+subtypes = ["DWordWL"]
+desc = "A preprocessed column holding timestamps as `DWordWL`. Row `i` of the column contains the value $2^2 dot (i + 1)$. Used in the CPU chip, see there for more details about the magic number."
+
 [variables.categories]
 all = ["input", "output", "auxiliary", "virtual", "multiplicity", "condition"]
 instantiated = ["input", "output", "auxiliary", "multiplicity"]
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
new file mode 100644
index 000000000..562a657d0
--- /dev/null
+++ b/spec/src/cpu.toml
@@ -0,0 +1,727 @@
+name = "CPU"
+
+
+# Input
+# Let's call the variables coming from DECODE input
+
+[[variables.input]]
+name = "timestamp"
+type = "Timestamp"
+desc = "A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough."
+
+[[variables.input]]
+name = "pc"
+type = "DWordWL"
+desc = "The program counter"
+
+[[variables.input]]
+name = "rs1"
+type = "Byte"
+desc = "Source register 1 index"
+
+[[variables.input]]
+name = "rs2"
+type = "Byte"
+desc = "Source register 2 index"
+
+[[variables.input]]
+name = "rd"
+type = "Byte"
+desc = "Destination register index"
+
+[[variables.input]]
+name = "write_register"
+type = "Bit"
+desc = "Whether to write back to the destination register"
+
+# TODO: can we compress this to a single value? (1: is it worth it, 2: does it work)
+[[variables.input]]
+name = "memory_2bytes"
+type = "Bit"
+desc = "Whether the memory access (read or write) touches at least 2 bytes"
+
+[[variables.input]]
+name = "memory_4bytes"
+type = "Bit"
+desc = "Whether the memory access (read or write) touches at least 4 bytes"
+
+[[variables.input]]
+name = "memory_8bytes"
+type = "Bit"
+desc = "Whether the memory access (read or write) touches at least 8 bytes"
+
+# TODO: Are there usecases where it's nicer to just have this as a length constant?
+[[variables.input]]
+name = "c_type_instruction"
+type = "Bit"
+desc = "Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4"
+
+# TODO: Should this just be a word? (CHECK: effect on computation/extension of arg2)
+# TODO: make sure decode correctly extends this (may be zero for unsigned and word_instr?)
+[[variables.input]]
+name = "imm"
+type = "DWordWL"
+desc = "The fully extended 64-bit version of the immediate"
+
+[[variables.input]]
+name = "signed"
+type = "Bit"
+desc = "Indicates whether we're dealing with a signed or unsigned instruction"
+
+[[variables.input]]
+name = "mp_selector"
+type = "Bit"
+desc = """Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used
+    - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`,
+    - as flag for inverting the condition of conditional branches (see `branch_cond`), and
+    - as direction (left or right) for `SHIFT`"""
+
+[[variables.input]]
+name = "muldiv_selector"
+type = "Bit"
+desc = "Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted"
+
+[[variables.input]]
+name = "word_instr"
+type = "Bit"
+desc = "Whether the instruction is a \\*W instruction, requiring the inputs and outputs to be (sign) extended"
+
+[[variables.input]]
+name = "ADD"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "SUB"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "SLT"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "AND"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "OR"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "XOR"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "SHIFT"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "JALR"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "BEQ"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "BLT"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "LOAD"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "STORE"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "MUL"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "DIVREM"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "ECALL"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+[[variables.input]]
+name = "EBREAK"
+type = "Bit"
+desc = "One-hot ALU selector flag"
+
+
+# Output
+[[variables.output]]
+name = "next_pc"
+type = "DWordWL"
+desc = "The program counter for the next instruction"
+
+[[variables.output]]
+name = "rvd"
+type = "DWordWL"
+desc = "The value to (maybe) be written back to register `rd`"
+
+# Auxiliary
+[[variables.auxiliary]]
+name = "rv1"
+type = "DWordWHH"
+desc = "The value of register `rs1`"
+
+[[variables.auxiliary]]
+name = "rv2"
+type = "DWordWHH"
+desc = "The value of register `rs2`"
+
+[[variables.auxiliary]]
+name = "rv1_sign_bit"
+type = "Bit"
+desc = "The sign bit of `rv1` if seen as a 32-bit word"
+
+[[variables.auxiliary]]
+name = "arg1"
+type = "DWordBL"
+desc = "The extended version of `rv1`, depending on `word_instr`"
+
+[[variables.auxiliary]]
+name = "arg2_sign_bit"
+type = "Bit"
+desc = "The sign bit of `arg2` if seen as a 32-bit word"
+
+[[variables.auxiliary]]
+name = "arg2"
+type = "DWordBL"
+desc = "A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls"
+
+[[variables.auxiliary]]
+name = "res_sign_bit"
+type = "Bit"
+desc = "The sign bit of `res`, if seen as a 32-bit word"
+
+[[variables.auxiliary]]
+name = "res"
+type = "DWordBL"
+desc = "The ALU result"
+
+[[variables.auxiliary]]
+name = "is_equal"
+type = "Bit"
+desc = "Whether `rv1` and `arg2` are equal"
+
+[[variables.auxiliary]]
+name = "branch_cond"
+type = "Bit"
+desc = "Whether a branch is taken, i.e., the branch condition"
+
+# Virtual
+[[variables.virtual]]
+name = "packed_decode"
+type = "BaseField"
+desc = "A packed representation of all bit flags and register indices obtained from the decoding"
+poly = ["+",
+    ["*", ["^", 2, 0], "write_register"],
+    ["*", ["^", 2, 1], "memory_2bytes"],
+    ["*", ["^", 2, 2], "memory_4bytes"],
+    ["*", ["^", 2, 3], "memory_8bytes"],
+    ["*", ["^", 2, 4], "c_type_instruction"],
+    ["*", ["^", 2, 5], "signed"],
+    ["*", ["^", 2, 6], "mp_selector"],
+    ["*", ["^", 2, 7], "muldiv_selector"],
+    ["*", ["^", 2, 8], "word_instr"],
+    ["*", ["^", 2, 9], "ADD"],
+    ["*", ["^", 2, 10], "SUB"],
+    ["*", ["^", 2, 11], "SLT"],
+    ["*", ["^", 2, 12], "AND"],
+    ["*", ["^", 2, 13], "OR"],
+    ["*", ["^", 2, 14], "XOR"],
+    ["*", ["^", 2, 15], "SHIFT"],
+    ["*", ["^", 2, 16], "JALR"],
+    ["*", ["^", 2, 17], "BEQ"],
+    ["*", ["^", 2, 18], "BLT"],
+    ["*", ["^", 2, 19], "LOAD"],
+    ["*", ["^", 2, 20], "STORE"],
+    ["*", ["^", 2, 21], "MUL"],
+    ["*", ["^", 2, 22], "DIVREM"],
+    ["*", ["^", 2, 23], "ECALL"],
+    ["*", ["^", 2, 24], "EBREAK"],
+    ["*", ["^", 2, 25], "rs1"],
+    ["*", ["^", 2, 33], "rs2"],
+    ["*", ["^", 2, 41], "rd"],
+]
+
+
+[[assumptions]]
+desc = "The flags are a one-hot vector in the decoding"
+ref = "cpu:a:one-hot"
+
+[[assumptions]]
+desc = "When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`."
+ref = "cpu:a:arg2-multiplex"
+
+[[constraint_groups]]
+name = "decode"
+
+[[constraints.decode]]
+kind = "interaction"
+tag = "DECODE"
+input = ["pc", "imm", "packed_decode"]
+
+
+[[constraint_groups]]
+name = "range"
+prefix = "R"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["write_register"]
+ref = "cpu:c:range_write_register"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["memory_2bytes"]
+ref = "cpu:c:range_memory_2bytes"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["memory_4bytes"]
+ref = "cpu:c:range_memory_4bytes"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["memory_8bytes"]
+ref = "cpu:c:range_memory_8bytes"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["c_type_instruction"]
+ref = "cpu:c:range_c_kind_instruction"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["signed"]
+ref = "cpu:c:range_signed"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["mp_selector"]
+ref = "cpu:c:range_mp_selector"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["muldiv_selector"]
+ref = "cpu:c:range_muldiv_selector"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["word_instr"]
+ref = "cpu:c:range_word_instr"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["ADD"]
+ref = "cpu:c:range_ADD"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["SUB"]
+ref = "cpu:c:range_SUB"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["SLT"]
+ref = "cpu:c:range_SLT"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["AND"]
+ref = "cpu:c:range_AND"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["OR"]
+ref = "cpu:c:range_OR"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["XOR"]
+ref = "cpu:c:range_XOR"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["SHIFT"]
+ref = "cpu:c:range_SHIFT"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["JALR"]
+ref = "cpu:c:range_JALR"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["BEQ"]
+ref = "cpu:c:range_BEQ"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["BLT"]
+ref = "cpu:c:range_BLT"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["LOAD"]
+ref = "cpu:c:range_LOAD"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["STORE"]
+ref = "cpu:c:range_STORE"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["MUL"]
+ref = "cpu:c:range_MUL"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["DIVREM"]
+ref = "cpu:c:range_DIVREM"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["ECALL"]
+ref = "cpu:c:range_ECALL"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["EBREAK"]
+ref = "cpu:c:range_EBREAK"
+
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = ["rs1"]
+
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = ["rs2"]
+
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = ["rd"]
+
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = [["idx", "arg1", "i"]]
+range = ["i", 0, 7]
+
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = [["idx", "arg2", "i"]]
+range = ["i", 0, 7]
+
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = [["idx", "res", "i"]]
+range = ["i", 0, 7]
+
+
+[[constraint_groups]]
+name = "alu"
+prefix = "A"
+
+[[constraints.alu]]
+kind = "template"
+tag = "ADD"
+cond = ["+", "ADD", "LOAD", "STORE"]
+input = [["cast", "arg1", "DWordWL"], ["cast", "arg2", "DWordWL"]]
+output = ["cast", "res", "DWordWL"]
+
+[[constraints.alu]]
+kind = "template"
+tag = "SUB"
+cond = ["+", "SUB", "BEQ"]
+input = [["cast", "arg1", "DWordWL"], ["cast", "arg2", "DWordWL"]]
+output = ["cast", "res", "DWordWL"]
+ref = "cpu:c:sub"
+
+[[constraints.alu]]
+kind = "interaction"
+tag = "LT"
+input = [["cast", "arg1", "DWordHHW"], ["cast", "arg2", "DWordHHW"], "signed"]
+output = ["idx", "res", 0]
+multiplicity = ["+", "SLT", "BLT"]
+
+[[constraints.alu]]
+kind = "arith"
+constraint = "$#`SLT` + #`BLT` => #`res[i]` = 0$"
+poly = ["*", ["+", "SLT", "BLT"], ["idx", "res", "i"]]
+range = ["i", 1, 7]
+
+[[constraints.alu]]
+kind = "interaction"
+tag = "AND_BYTE"
+input = [["idx", "arg1", "i"], ["idx", "arg2", "i"]]
+output = ["idx", "res", "i"]
+multiplicity = "AND"
+range = ["i", 0, 7]
+
+[[constraints.alu]]
+kind = "interaction"
+tag = "OR_BYTE"
+input = [["idx", "arg1", "i"], ["idx", "arg2", "i"]]
+output = ["idx", "res", "i"]
+multiplicity = "OR"
+range = ["i", 0, 7]
+
+[[constraints.alu]]
+kind = "interaction"
+tag = "XOR_BYTE"
+input = [["idx", "arg1", "i"], ["idx", "arg2", "i"]]
+output = ["idx", "res", "i"]
+multiplicity = "XOR"
+range = ["i", 0, 7]
+
+[[constraints.alu]]
+kind = "interaction"
+tag = "SHIFT"
+input = [["cast", "arg1", "DWordHL"], ["idx", "arg2", 0], "mp_selector", "signed", "word_instr"]
+output = ["cast", "res", "DWordHL"]
+multiplicity = "SHIFT"
+
+[[constraints.alu]]
+kind = "template"
+tag = "ADD"
+input = ["pc", ["cast", ["+", ["*", 2, "c_type_instruction"], ["*", 4, ["not", "c_type_instruction"]]], "DWordWL"]]
+output = ["cast", "res", "DWordWL"]
+cond = "JALR"
+
+# TODO: no types available, so no casting yet
+[[constraints.alu]]
+kind = "interaction"
+tag = "MUL"
+input = ["arg1", "signed", "arg2", "mp_selector", "muldiv_selector"]
+output = "res"
+multiplicity = "MUL"
+
+# TODO: no types available, so no casting yet
+[[constraints.alu]]
+kind = "interaction"
+tag = "DVRM"
+input = ["arg1", "arg2", "signed", "muldiv_selector"]
+output = "res"
+multiplicity = "DIVREM"
+
+
+[[constraint_groups]]
+name = "mem"
+prefix = "M"
+
+# TODO: no types available, so no casting yet
+[[constraints.mem]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, "rs1"], "rv1", ["+", "timestamp", 0], 1, 0, 0]
+output = "rv1"
+
+# TODO: no types available, so no casting yet
+[[constraints.mem]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, "rs2"], "rv2", ["+", "timestamp", 1], 1, 0, 0]
+output = "rv2"
+
+# TODO: no types available, so no casting yet
+[[constraints.mem]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, "rd"], "rvd", ["+", "timestamp", 2], 1, 0, 0]
+
+# TODO: no types available, so no casting yet
+[[constraints.mem]]
+kind = "interaction"
+tag = "LOAD"
+input = [0, "res", ["+", "timestamp", 0], "memory_2bytes", "memory_4bytes", "memory_8bytes", "signed"]
+output = "rvd"
+multiplicity = "LOAD"
+
+# TODO: no types available, so no casting yet
+[[constraints.mem]]
+kind = "interaction"
+tag = "MEMW"
+input = [0, "res", "rv2", ["+", "timestamp", 1], "memory_2bytes", "memory_4bytes", "memory_8bytes"]
+multiplicity = "STORE"
+
+# TODO: no types available, so no casting yet
+[[constraints.mem]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, 255], "next_pc", ["+", "timestamp", 1], 1, 0, 0]
+output = "pc"
+
+
+[[constraint_groups]]
+name = "sys"
+prefix = "S"
+
+[[constraints.sys]]
+kind = "arith"
+constraint = "`!EBREAK`"
+desc = "We treat `EBREAK` as an unprovable trap"
+poly = ["not", "EBREAK"]
+
+# TODO: no types available, so no casting yet
+[[constraints.sys]]
+kind = "interaction"
+tag = "ECALL"
+input = ["rv1", "pc", "timestamp", "rv2"]
+output = "rvd"
+multiplicity = "ECALL"
+
+
+[[constraint_groups]]
+name = "ext"
+prefix = "E"
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$(#`rv1_sign_bit` or #`arg2_sign_bit` or #`res_sign_bit`) => #`word_instr`$"
+poly = ["*", ["+", "rv1_sign_bit", "arg2_sign_bit", "res_sign_bit"], ["not", "word_instr"]]
+
+[[constraints.ext]]
+kind = "interaction"
+tag = "MSB16"
+input = [["idx", "rv1", 1]]
+output = "rv1_sign_bit"
+multiplicity = "word_instr"
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`arg1[:4]` = #`rv1[:2]`$"
+poly = ["-", ["idx", ["cast", "arg1", "DWordWL"], 0], ["idx", ["cast", "rv1", "DWordWL"], 0]]
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`arg1[4:]` = #`rv1[2]` dot (1 - #`word_instr`) + (2^(32) - 1) dot #`rv1_sign_bit` dot #`signed`$"
+poly = ["-", ["idx", ["cast", "arg1", "DWordWL"], 1], ["*", ["not", "word_instr"], ["idx", "rv1", 2]], ["*", "signed", "rv1_sign_bit", ["-", ["^", 2, 32], 1]]]
+
+[[constraints.ext]]
+kind = "interaction"
+tag = "MSB16"
+input = [["idx", "rv2", 1]]
+output = "arg2_sign_bit"
+multiplicity = "word_instr"
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`arg2[:4]` = (1 - #`STORE` - #`LOAD`) dot #`rv2[:2]` + (1 - #`BEQ` - #`BLT`) dot #`imm[0]`$"
+poly = ["-", ["idx", ["cast", "arg2", "DWordWL"], 0], ["*", ["-", 1, "STORE", "LOAD"], ["idx", ["cast", "rv2", "DWordWL"], 0]], ["*", ["-", 1, "BEQ", "BLT"], ["idx", "imm", 0]]]
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`arg2[4:]` = (1 - #`STORE` - #`LOAD`) dot ((1 - #`word_instr`) dot #`rv2[2]` + #`signed` dot #`arg2_sign_bit` dot (2^(32) - 1)) + (1 - #`BEQ` - #`BLT`) dot #`imm[1]`$"
+poly = ["-", ["idx", ["cast", "arg2", "DWordWL"], 1], ["*", ["-", 1, "STORE", "LOAD"], ["not", "word_instr"], ["idx", "rv2", 2]], ["*", ["-", 1, "STORE", "LOAD"], "signed", "arg2_sign_bit", ["-", ["^", 2, 32], 1]], ["*", ["-", 1, "BEQ", "BLT"], ["idx", "imm", 1]]]
+
+[[constraints.ext]]
+kind = "interaction"
+tag = "MSB8"
+input = [["idx", "res", 3]]
+output = "res_sign_bit"
+multiplicity = "word_instr"
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`!LOAD` => #`rvd[0]` = #`res[:4]`$"
+poly = ["*", ["not", "LOAD"], ["-", ["idx", "rvd", 0], ["idx", ["cast", "res", "DWordWL"], 0]]]
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`!LOAD` => #`rvd[1]` = (1 - #`word_instr`) dot #`res[4:]` + #`res_sign_bit` dot (2^(32) - 1)$"
+desc = "_Sign_ extend the output if it wasn't a `LOAD`. Only `LOAD` has both `write_register = 1` and `rvd ≠ res`. `LOAD` and `word_instr` are disjoint"
+poly = ["*", ["not", "LOAD"], ["-", ["idx", "rvd", 1], ["*", ["not", "word_instr"], ["idx", ["cast", "res", "DWordWL"], 1]], ["*", "res_sign_bit", ["-", ["^", 2, 32], 1]]]]
+
+
+
+[[constraint_groups]]
+name = "misc"
+prefix = "O"
+
+[[constraints.misc]]
+kind = "interaction"
+tag = "ZERO"
+input = [["+", ["idx", "res", 0], ["idx", "res", 1], ["idx", "res", 2], ["idx", "res", 3], ["idx", "res", 4], ["idx", "res", 5], ["idx", "res", 6], ["idx", "res", 7]]]
+output = "is_equal"
+multiplicity = "BEQ"
+ref = "cpu:c:is_equal"
+
+[[constraints.misc]]
+kind = "arith"
+constraint = "$#`branch_cond` = #`JALR` or (#`BLT` and (#`res` xor #`invert`)) or (#`BEQ` and (#`is_equal` xor #`invert`))$"
+desc = "where `invert` is represented by `mp_selector`"
+poly = ["+",
+        ["-", "branch_cond"],
+        "JALR",
+        ["*", ["idx", "res", 0], ["not", "mp_selector"], "BLT"],
+        ["*", ["not", ["idx", "res", 0]], "mp_selector", "BLT"],
+        ["*", "is_equal", ["not", "mp_selector"], "BEQ"],
+        ["*", ["not", "is_equal"], "mp_selector", "BEQ"]
+    ]
+
+[[constraints.misc]]
+kind = "interaction"
+tag = "BRANCH"
+input = ["pc", ["idx", "imm", 0], ["cast", "arg1", "DWordWL"], "JALR"]
+output = "next_pc"
+multiplicity = "branch_cond"
+
+[[constraints.misc]]
+kind = "template"
+tag = "ADD"
+input = ["pc", ["cast", ["+", ["*", 2, "c_type_instruction"], ["*", 4, ["not", "c_type_instruction"]]], "DWordWL"]]
+output = "next_pc"
+desc = "Increment `pc` to `next_pc` if we're not branching"

From 4b8c801bbc646cd9eaef367c3cf687990e2f21c0 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Mon, 5 Jan 2026 15:50:41 +0100
Subject: [PATCH 015/105] spec: improve multi-poly definition rendering (#98)

---
 spec/chip.typ | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index 30b93373c..26d566e4f 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -71,7 +71,7 @@
         (
           [],
           [],              
-          expr_to_math((":=", "  ", poly.poly)),
+          table.cell(inset: (left: 1.5em), expr_to_math((":=", "", poly.poly))),
           render_def_range(def.idx, poly.range), 
         )
       }

From 217d2d72244c46d51b3164298192e9e0e8618ce6 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Mon, 5 Jan 2026 15:50:52 +0100
Subject: [PATCH 016/105] spec: BRANCH chip (#92)

* spec: init BRANCH chip

* Small cleanup

* Clean up variable naming and generally address review comments

* outdated comment

---------

Co-authored-by: Erik Takke <erik.takke@3milabs.tech>
---
 spec/book.typ        |   1 +
 spec/branch.typ      |  38 ++++++++++++
 spec/src/branch.toml | 140 +++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 179 insertions(+)
 create mode 100644 spec/branch.typ
 create mode 100644 spec/src/branch.toml

diff --git a/spec/book.typ b/spec/book.typ
index c4c3a8c6d..af747c88c 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -8,6 +8,7 @@
     #chapter("variables.typ")[Variables]
     #chapter("is_bit.typ")[IS_BIT template]
     #chapter("cpu.typ")[CPU chip]
+    #chapter("branch.typ")[BRANCH]
   ]
 )
 
diff --git a/spec/branch.typ b/spec/branch.typ
new file mode 100644
index 000000000..d01e9fa03
--- /dev/null
+++ b/spec/branch.typ
@@ -0,0 +1,38 @@
+#import "/book.typ": book-page, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_column_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/branch.toml", config)
+
+#show: book-page.with(title: "BRANCH chip")
+
+== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+
+The `BRANCH` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
+#render_chip_column_table(chip, config)
+
+== Assumptions
+
+#render_chip_assumptions(chip, config)
+
+== Constraints
+
+#rj[Check correspondence with CPU for passing in `offset` as word or dword]
+We constrain `next_pc` to be $#`base_address` + #`offset`$,
+where `base_address` equals `pc` when $#`JALR` = 0$ and `register` otherwise.
+
+The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
+#render_constraint_table(chip, config, groups: "all")
+
+This chip contributes the following to the lookup argument.
+#render_constraint_table(chip, config, groups: "output")
+
diff --git a/spec/src/branch.toml b/spec/src/branch.toml
new file mode 100644
index 000000000..b93639602
--- /dev/null
+++ b/spec/src/branch.toml
@@ -0,0 +1,140 @@
+name = "BRANCH"
+
+
+# Input
+
+[[variables.input]]
+name = "pc"
+type = "DWordWL"
+desc = "The current pc, used as base address when `!JALR`"
+
+[[variables.input]]
+name = "offset"
+type = "Word"
+desc = "The offset from the base address to jump to"
+
+[[variables.input]]
+name = "register"
+type = "DWordWL"
+desc = "The base address to use when `JALR`"
+
+[[variables.input]]
+name = "JALR"
+type = "Bit"
+desc = "Selects between `pc` and `register` as base address, needed for the `JALR` instruction"
+
+
+# Output
+
+[[variables.output]]
+name = "next_pc_high"
+type = ["Half", 3]
+desc = "The upper part of the next pc"
+
+[[variables.output]]
+name = "next_pc_low"
+type = ["Byte", 2]
+desc = "The lower part of the next pc"
+
+
+# Auxiliary
+
+[[variables.auxiliary]]
+name = "unmasked_low_byte"
+type = "Byte"
+desc = "The low byte of the next pc, before masking the LSB. Used to constrain the raw addition."
+
+
+# Virtual
+
+[[variables.virtual]]
+name = "next_pc_unmasked"
+type = "DWordWL"
+desc = "The combination of `next_pc_high`, `next_pc_low[1]` and `unmasked_low_byte` to constrain the addition. This is the computed value for the next pc, before masking off the LSB as required by the ISA."
+def = {idx = "i", polys = [
+  {range = [0], poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 0]], ["*", ["^", 2, 8], ["idx", "next_pc_low", 1]], ["idx", "unmasked_low_byte", 0]]},
+  {range = [1], poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 2]], ["idx", "next_pc_high", 1]]},
+]}
+
+[[variables.virtual]]
+name = "next_pc"
+type = "DWordWL"
+desc = "The computed next pc, after masking off the LSB as required by the ISA."
+def = {idx = "i", polys = [
+  {range = [0], poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 0]], ["*", ["^", 2, 8], ["idx", "next_pc_low", 1]], ["idx", "next_pc_low", 0]]},
+  {range = [1], poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 2]], ["idx", "next_pc_high", 1]]},
+]}
+
+
+# Multiplicity
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+
+
+[[assumptions]]
+desc = "`pc` is range checked, `IS_WORD[pc[i]]`"
+range = ["i", 0, 1]
+
+[[assumptions]]
+desc = "`offset` is range checked, `IS_WORD[offset]`"
+
+[[assumptions]]
+desc = "`register` is range checked, `IS_WORD[register[i]]`"
+range = ["i", 0, 1]
+
+[[assumptions]]
+desc = "`IS_BIT<JALR>`"
+
+
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "template"
+tag = "ADD"
+input = ["pc", ["cast", "offset", "DWordWL"]]
+output = "next_pc_unmasked"
+cond = ["not", "JALR"]
+
+[[constraints.all]]
+kind = "template"
+tag = "ADD"
+input = ["register", ["cast", "offset", "DWordWL"]]
+output = "next_pc_unmasked"
+cond = "JALR"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = [["idx", "next_pc_low", 1]]
+multiplicity = "μ"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "AND_BYTE"
+input = [["idx", "unmasked_low_byte", 0], 254]
+output = ["idx", "next_pc_low", 0]
+multiplicity = "μ"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "IS_HALFWORD"
+input = [["idx", "next_pc_high", "i"]]
+range = ["i", 0, 2]
+multiplicity = "μ"
+
+
+[[constraint_groups]]
+name = "output"
+desc = "Each row contributes the following to the LogUp sum"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "BRANCH"
+input = ["pc", "offset", "register", "JALR"]
+output = "next_pc"
+multiplicity = "-μ"

From e53d2a8d473abb1d398bbdc1299c06d0304fe357 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Mon, 5 Jan 2026 17:10:49 +0100
Subject: [PATCH 017/105] spec: conditionally render constraint table headers
 (#94)

* spec: conditionally render constraint table headers

* spec: simplify `selected_constraints` expression

* spec: repurpose `selected_constraints`
---
 spec/chip.typ | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index 26d566e4f..e5b40dadd 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -160,6 +160,7 @@
     groups = (groups,)
   }
   assert(groups.all(group => group in all_groups), message: "unknown group")
+  let selected_constraints = groups.map(g => (g: chip.constraints.at(g))).join()
 
   // Find the group definition in the constraint_groups
   let lookup_group(name) = chip.constraint_groups.filter((g) => g.name == name).at(0, default: (name: name))
@@ -220,16 +221,29 @@
     (table.cell(align: right, colspan: 2, [_description_]), eval(constraint.desc, mode: "markup"), [])
   }
 
+  // Whether there is at least one constraint with a range
+  // This can be used to see whether the "Range" label should be displayed
+  let do_display_range = selected_constraints.values().flatten().any(x => "range" in x)
+
+  // Whether there is at least one constraint with a multiplicity
+  // This can be used to see whether the "Multiplicity" label should be displayed
+  let do_display_multiplicity = selected_constraints.values().flatten().any(x => "multiplicity" in x)
+
   show figure: set block(breakable: true)
   figure(table(
     columns: (auto, auto, 1fr, auto),
     inset: 6pt,
     align: (top + left, top + left, top + left, top + center),
     stroke: none,
-    table.header([*Tag*], [*Range*], [*Description*], [*Multiplicity*]),
+    table.header(
+      [*Tag*], 
+      if do_display_range {[*Range*]} else {[]}, 
+      [*Description*], 
+      if do_display_multiplicity {[*Multiplicity*]} else {[]},
+    ),
     table.hline(stroke: stroke(thickness: 2pt)),
-    ..for group in groups {
-      for constraint in chip.constraints.at(group) {
+    ..for (group, group_constraints) in selected_constraints.pairs() {
+      for constraint in group_constraints {
         (
           [#tag(constraint, lookup_group(group))],
           [#interval(constraint)],
@@ -243,6 +257,6 @@
           render_polynomial_constraints(constraint)
         }
       }
-    },
+    }
   ), caption: [Constraint overview of #chip.name chip.])
 }

From 1b838509024500d738cdf558ab26bbba3f61dd3e Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Mon, 5 Jan 2026 17:17:11 +0100
Subject: [PATCH 018/105] spec: do not print index in assumption/constraint ref
 (#96)

---
 spec/chip.typ | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index e5b40dadd..8be99ac2a 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -134,7 +134,7 @@
     let index = if "range" in assumption { "." + assumption.range.at(0) } else { "" }
     let lbl = [#chip.name\-A]
     show figure: (it) => align(left, block[#lbl#context it.counter.display()#index])
-    cref(assumption)[#figure(kind: "assumption", numbering: (i) => [#lbl#i#index], supplement: [], [])]
+    cref(assumption)[#figure(kind: "assumption", numbering: (i) => [#lbl#i], supplement: [], [])]
   }
 
   figure(table(
@@ -171,7 +171,7 @@
     let prefix = if "prefix" in group { group.prefix }
     let lbl = [#chip.name\-C#prefix]
     show figure: (it) => align(left, block[#lbl#context it.counter.display()#index])
-    cref(constraint)[#figure(kind: "constraint", numbering: (i) => [#lbl#i#index], supplement: [], [])]
+    cref(constraint)[#figure(kind: "constraint", numbering: (i) => [#lbl#i], supplement: [], [])]
   }
 
   /// Generates a representation of `constraint`

From 07000c76d22c08948b1a9249b73c066b5dc87267 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Wed, 7 Jan 2026 13:24:00 +0100
Subject: [PATCH 019/105] spec: Make constraint numbering restart when
 displaying multiple chips in one document (#108)

---
 spec/chip.typ | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index 8be99ac2a..e807e13ed 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -134,7 +134,7 @@
     let index = if "range" in assumption { "." + assumption.range.at(0) } else { "" }
     let lbl = [#chip.name\-A]
     show figure: (it) => align(left, block[#lbl#context it.counter.display()#index])
-    cref(assumption)[#figure(kind: "assumption", numbering: (i) => [#lbl#i], supplement: [], [])]
+    cref(assumption)[#figure(kind: chip.name + "assumption", numbering: (i) => [#lbl#i], supplement: [], [])]
   }
 
   figure(table(
@@ -171,7 +171,7 @@
     let prefix = if "prefix" in group { group.prefix }
     let lbl = [#chip.name\-C#prefix]
     show figure: (it) => align(left, block[#lbl#context it.counter.display()#index])
-    cref(constraint)[#figure(kind: "constraint", numbering: (i) => [#lbl#i], supplement: [], [])]
+    cref(constraint)[#figure(kind: chip.name + "constraint", numbering: (i) => [#lbl#i], supplement: [], [])]
   }
 
   /// Generates a representation of `constraint`

From 7e842e511fe9a5bb63d7c9fa468e81bf3338c18e Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Thu, 8 Jan 2026 11:17:27 +0100
Subject: [PATCH 020/105] spec: Introduce LT chip (#90)

Co-authored-by: Erik Takke <erik.takke@3milabs.tech>
---
 spec/book.typ    |   1 +
 spec/lt.typ      |  79 ++++++++++++++++++++++++++
 spec/src/lt.toml | 143 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 223 insertions(+)
 create mode 100644 spec/lt.typ
 create mode 100644 spec/src/lt.toml

diff --git a/spec/book.typ b/spec/book.typ
index af747c88c..7787acbce 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -9,6 +9,7 @@
     #chapter("is_bit.typ")[IS_BIT template]
     #chapter("cpu.typ")[CPU chip]
     #chapter("branch.typ")[BRANCH]
+    #chapter("lt.typ")[LT],
   ]
 )
 
diff --git a/spec/lt.typ b/spec/lt.typ
new file mode 100644
index 000000000..ff3b6dae3
--- /dev/null
+++ b/spec/lt.typ
@@ -0,0 +1,79 @@
+#import "/book.typ": book-page, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_column_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/lt.toml", config)
+
+#show: book-page.with(title: "LT chip")
+
+== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+
+The `LT` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
+#render_chip_column_table(chip, config)
+
+== Assumptions
+We assume the inputs `lhs`, `rhs` and `signed` are appropriately range checked.
+#render_chip_assumptions(chip, config)
+
+== Constraints
+We first constrain that all variables correspond to their definition.
+For the defining constraint of `lt`, @lt:c:lt, observe that it is a choice
+between two options, depending on the input flag `signed`.
+In the case of unsigned comparison, we simply need `unsigned_lt`, indicating
+that a wraparound (carry bit) modulo $2^64$ is needed to go from `rhs` to `lhs` via addition.
+For the case of signed comparison, we first need some case analysis.
+
+We split $a < b$ into four disjoint cases, conditioned on the sign of $a$ and $b$.
+Recall that the sign of a number in two's complement can be read off from the MSB,
+being $1$ for a negative number and $0$ for a non-negative one.
+For this analysis, we denote the MSB of $a$ as $A$ and the MSB of $b$ as $B$.
+The four disjoint cases then become:
+
++ $dash(A) and B and (a < b)$
++ $A and dash(B) and (a < b)$
++ $A and B and (a < b)$
++ $dash(A) and dash(B) and (a < b)$
+
+The first case is evidently false, while the second case simplifies to $A and dash(B)$.
+For the third and fourth case, observe that when $A = B$, the $<$ relation is preserved
+by the modular correspondence between $[-2^(63), 2^(63))$ and $[0, 2^(64))$.
+Importantly, this modular correspondence is merely a reinterpretation of the
+bits or values of $a$ and $b$, due to the representation in two's complement.
+Hence, we can introduce the value $C = #`unsigned_lt`$, that accurately represents
+the relation $a < b$ when $A = B$.
+
+Combining our three remaining cases, we obtain the boolean formula $A dash(B) or A B C or dash(A) dash(B) C$.
+Since the cases are disjoint, this can be computed with the binary-valued polynomial
+$P(A, B, C) = A (1 - B) + A B C + (1 - A) (1 - B) C$.
+
+The polynomial $P$ can be simplified to a total degree of two.
+We claim that the polynomial $Q(A, B, C) = A (1 - B) + A C + (1 - B) C$
+is, for the purposes of this chip, equivalent to $P$.
+An exhaustive check shows that $P(A, B, C) != Q(A, B, C)$ only for the triple $(A, B, C) = (1, 0, 1)$.
+This is, however, impossible due to the correctness of `ADD`.
+In more detail, if we let $s$ be the (range-checked) difference $a - b$
+(so the equivalent of the #`lhs_sub_rhs` column),
+and $x'$ denote the most significant word of a variable $x$,
+we need $C dot 2^32 + a' = b' + s' + #`carry[0]`$, by the definition of `carry`.
+However, the left hand side of this is at least $3 dot 2^31$, as $(A, C) = (1, 1)$,
+and the right hand side is at most $(2^31 - 1) + (2^32 - 1) + 1 = 3 dot 2^31 - 1$.
+Therefore, we can use $Q$ to constrain `lt` when `signed = 1`.
+
+#render_constraint_table(chip, config, groups: "defs")
+
+And then we constrain the subtraction.
+
+#render_constraint_table(chip, config, groups: "sub")
+
+The chip contributes the following to the lookup argument.
+
+#render_constraint_table(chip, config, groups: "output")
diff --git a/spec/src/lt.toml b/spec/src/lt.toml
new file mode 100644
index 000000000..1a441c2b3
--- /dev/null
+++ b/spec/src/lt.toml
@@ -0,0 +1,143 @@
+name = "LT"
+
+
+# Input
+
+[[variables.input]]
+name = "lhs"
+type = "DWordHHW"
+desc = "The left operand"
+
+[[variables.input]]
+name = "rhs"
+type = "DWordHHW"
+desc = "The right operand"
+
+[[variables.input]]
+name = "signed"
+type = "Bit"
+desc = "whether to interpret `lhs` and `rhs` as signed integers (1) or not (0)"
+
+# Output
+
+[[variables.output]]
+name = "lt"
+type = "Bit"
+desc = "Whether $#`lhs` < #`rhs`$, taking `signed` into account"
+
+
+# Auxiliary
+
+[[variables.auxiliary]]
+name = "lhs_sub_rhs"
+type = "DWordHL"
+desc = "$#`lhs` - #`rhs`$"
+
+[[variables.auxiliary]]
+name = "lhs_msb"
+type = "Bit"
+desc = "The most significant bit of `lhs`"
+
+[[variables.auxiliary]]
+name = "rhs_msb"
+type = "Bit"
+desc = "The most significant bit of `rhs`"
+
+# Virtual
+
+[[variables.virtual]]
+name = "carry"
+type = ["Bit", 2]
+desc = "The carry for adding `lhs_sub_rhs` back to `rhs`"
+def = {idx = "i", polys = [
+  {range = [0], poly = ["*", ["^", 2, -32], ["-", ["+", ["idx", "rhs", 0], ["idx", ["cast", "lhs_sub_rhs", "DWordWL"], 0]], ["idx", "lhs", 0]]]},
+  {range = [1], poly = ["*", ["^", 2, -32], ["-", ["+", ["idx", ["cast", "rhs", "DWordWL"], 1], ["idx", ["cast", "lhs_sub_rhs", "DWordWL"], 1], ["idx", "carry", 0]], ["idx", ["cast", "lhs", "DWordWL"], 1]]]},
+]}
+
+[[variables.virtual]]
+name = "unsigned_lt"
+type = "Bit"
+desc = "Whether $#`lhs` < #`rhs`$, as unsigned integers"
+def = ["idx", "carry", 1]
+
+
+# Multiplicity
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+
+
+[[assumptions]]
+desc = "`IS_HALFWORD[lhs[i]]` and `IS_WORD[lhs[0]]`"
+range = ["i", 1, 2]
+ref = "lt:a:range_lhs"
+
+[[assumptions]]
+desc = "`IS_HALFWORD[rhs[i]]` and `IS_WORD[rhs[0]]`"
+range = ["i", 1, 2]
+ref = "lt:a:range_rhs"
+
+[[assumptions]]
+desc = "`IS_BIT<signed>`"
+ref = "lt:a:range_signed"
+
+
+[[constraint_groups]]
+name = "defs"
+desc = "Enforce that variables have been correctly computed"
+
+[[constraints.defs]]
+kind = "interaction"
+tag = "MSB16"
+input = [["idx", "lhs", 2]]
+output = "lhs_msb"
+multiplicity = "μ"
+ref = "lt:c:lhs_msb"
+
+[[constraints.defs]]
+kind = "interaction"
+tag = "MSB16"
+input = [["idx", "rhs", 2]]
+output = "rhs_msb"
+multiplicity = "μ"
+ref = "lt:c:rhs_msb"
+
+[[constraints.defs]]
+kind = "arith"
+constraint = "$#`lt` = #`signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - #`signed`) dot #`unsigned_lt`$"
+desc = "Where $A = #`lhs_msb`$, $B = #`rhs_msb`$ and $C = #`carry[1]`$"
+poly = ["-", "lt", ["*", "signed", ["+", ["*", "lhs_msb", ["not", "rhs_msb"]], ["*", "lhs_msb", ["idx", "carry", 1]], ["*", ["not", "rhs_msb"], ["idx", "carry", 1]]]], ["*", ["-", 1, "signed"], "unsigned_lt"]]
+ref = "lt:c:lt"
+
+
+[[constraint_groups]]
+name = "sub"
+desc = "Constrain the subtraction"
+
+[[constraints.sub]]
+kind = "template"
+tag = "IS_BIT"
+input = [["idx", "carry", "i"]]
+range = ["i", 0, 1]
+
+[[constraints.sub]]
+kind = "interaction"
+tag = "IS_HALFWORD"
+input = [["idx", "lhs_sub_rhs", "i"]]
+range = ["i", 0, 3]
+multiplicity = "μ"
+ref = "lt:c:lhs_sub_rhs_range"
+
+
+[[constraint_groups]]
+name = "output"
+desc = "Each row contributes the following to the LogUp sum"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "LT"
+input = ["lhs", "rhs", "signed"]
+output = "lt"
+multiplicity = "-μ"

From 4af29ef1afdccb5ccaba7ff9be2bf70a473d650a Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Thu, 8 Jan 2026 13:31:10 +0100
Subject: [PATCH 021/105] spec: Fix constraint group lookup (#105)

---
 spec/chip.typ | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index e807e13ed..6914aa1f6 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -160,7 +160,7 @@
     groups = (groups,)
   }
   assert(groups.all(group => group in all_groups), message: "unknown group")
-  let selected_constraints = groups.map(g => (g: chip.constraints.at(g))).join()
+  let selected_constraints = groups.map(g => ((g): chip.constraints.at(g))).join()
 
   // Find the group definition in the constraint_groups
   let lookup_group(name) = chip.constraint_groups.filter((g) => g.name == name).at(0, default: (name: name))

From 9d07e5d0c0903f9e516ed27845983c1e247783ef Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Fri, 9 Jan 2026 09:13:28 +0100
Subject: [PATCH 022/105] spec: `SHIFT` chip (#84)

* spec: rough draft SHIFT chip

* various minor fixes

* implement right-limb shifting

* Update rendering "polynomial constriant" in table

* fix degree 4 issues

* Further update to SHIFT chip

* Clean up SHIFT

* spec/shift: add assumption

* spec/shift: Add lookup constraint

* spec/shift: make extension virtual
Kudos to Robin for uncovering this!

* spec/shift: Simplify limb-situation
Kudos to Robin for pointing this out!

* spec/SHIFT: fix typo

* Turn `limb_shift_x` into array

* spec: support "sum" expression in math

* Simplify limb-shifting constraint

* spec: attempt at refactoring `shift`

* spec: overhaul SHIFT

* spec: SHIFT: rename `extensions` as `extension`

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

* spec: SHIFT: make `shift` of type Byte

* spec: SHIFT: replace variable '0x' with constant 0x

* spec: SHIFT: remove "cheaper" remark

* spec: SHIFT: fix `shifted` description

* spec: SHIFT: make output a DWordWL

* spec: SHIFT

* spec: SHIFT: introduce explanation; update some constraint elaborations

* Apply suggestions from the code review

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

* spec: SHIFT: update `bits_shift` desc

* spec: SHIFT: update `limb_shift` desc

* spec: SHIFT: add missing IS_BIT constraint for limb_shift

* spec: SHIFT: update description

* spec: SHIFT: fix sum's expr-to-math

* Minor language pass

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

---------

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>
---
 spec/book.typ       |   3 +-
 spec/expr.typ       |  22 +++-
 spec/shift.typ      | 175 +++++++++++++++++++++++++++
 spec/src/shift.toml | 283 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 476 insertions(+), 7 deletions(-)
 create mode 100644 spec/shift.typ
 create mode 100644 spec/src/shift.toml

diff --git a/spec/book.typ b/spec/book.typ
index 7787acbce..6652ed646 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -8,8 +8,9 @@
     #chapter("variables.typ")[Variables]
     #chapter("is_bit.typ")[IS_BIT template]
     #chapter("cpu.typ")[CPU chip]
+    #chapter("shift.typ")[SHIFT chip]
     #chapter("branch.typ")[BRANCH]
-    #chapter("lt.typ")[LT],
+    #chapter("lt.typ")[LT]
   ]
 )
 
diff --git a/spec/expr.typ b/spec/expr.typ
index ae7bd0792..547f2cad2 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -31,6 +31,7 @@
 //          | ["idx", expr1, expr2]        ; expr1[expr2]
 //          | ["not", expr]                ; !expr
 //          | ["+", expr1, expr2, ...]     ; expr1 + expr2 + ...
+//          | ["sum", expr1, expr2, expr3] ; Σ_expr1^expr2 expr3
 //          | ["*", expr1, expr2, ...]     ; expr1 * expr2 * ...
 //          | ["/", expr1, expr2]          ; expr1 / expr2
 //          | ["^", expr1, expr2]          ; expr1^expr2
@@ -51,12 +52,13 @@
   "cast": 2, // cast
   "mul": 3,  // *
   "div": 4,  // /
-  "not": 5,  // not
-  "add": 6,  // +
-  "sub": 7,  // -
-  "idx": 8,  // []
-  "eq": 9,   // = and :=
-  "MAX": 10, // <the void outside every expression>
+  "sum": 5,  // Σ
+  "not": 6,  // not
+  "add": 7,  // +
+  "sub": 8,  // -
+  "idx": 9,  // []
+  "eq": 10,   // = and :=
+  "MAX": 11, // <the void outside every expression>
 )
 
 // Mutual recursion through a trick from https://github.com/typst/typst/issues/744
@@ -92,6 +94,7 @@
     "idx": (pp, rec, e) => rec(PREC.MIN, e.at(1)) + `[` + rec(PREC.MAX, e.at(2)) + `]`,
     "not": (pp, rec, e) => cwrap(`1 - ` + rec(PREC.not, e.at(1)), pp < PREC.not),
     "+": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(PREC.add)).join(` + `), pp < PREC.add),
+    "sum": (pp, rec, e) => assert(false, message: "sum is unsupported in code."),
     "*": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(PREC.mul)).join(` ` + sym.dot + ` `), pp < PREC.mul),
     "/": (pp, rec, e) => cwrap(rec(PREC.div, e.at(1)), pp < PREC.div) + ` / ` + rec(PREC.div, e.at(2)),
     "^": (pp, rec, e) => {
@@ -132,6 +135,13 @@
     "idx": (pp, rec, e) => $#rec(PREC.idx, e.at(1))_(#rec(PREC.idx, e.at(2)))$,
     "not": (pp, rec, e) => mwrap($1 - #rec(PREC.not, e.at(1))$, pp < PREC.not),
     "+": (pp, rec, e) => mwrap($#e.slice(1).map(rec.with(PREC.add)).join($+$)$, pp < PREC.add),
+    "sum": (pp, rec, e) => {
+      assert(e.len() == 4, message: "invalid sum:" + repr(e))
+      mwrap(
+        $sum_(#rec(PREC.MAX, e.at(1)))^#rec(PREC.MAX, e.at(2)) #rec(if pp <= PREC.sub {PREC.MAX} else {PREC.sum}, e.at(3))$, 
+        pp <= PREC.sub
+      )
+    },
     "*": (pp, rec, e) => mwrap($#e.slice(1).map(rec.with(PREC.mul)).join($dot$)$, pp < PREC.mul),
     "/": (pp, rec, e) => $#rec(PREC.div, e.at(1)) / #rec(PREC.div, e.at(2))$,
     "^": (pp, rec, e) => {
diff --git a/spec/shift.typ b/spec/shift.typ
new file mode 100644
index 000000000..3555d64e4
--- /dev/null
+++ b/spec/shift.typ
@@ -0,0 +1,175 @@
+#import "/book.typ": book-page, et
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_column_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+  render_chip_assumptions,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/shift.toml", config)
+
+#let shift = raw(chip.name)
+
+#show: book-page.with(title: "SHIFT chip")
+
+= #shift chip
+
+== Interface
+The #shift chip has the following interface:
+#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(240), 
+```
+// param in: the value being shifted
+// param shift: the number of bits to shift `in` by
+// param direction: whether to shift left (0) or right (1) 
+// param signed: whether to interpret `in` as a signed (1) or unsigned (0) integer
+// param word_instr: whether to execute the SLL/SR* (0) or SLLW/SR*W (1) instruction
+// out shifted: the resulting value
+SHIFT[shifted: DWord; in: DWord, shift: Byte, direction: Bit, signed: Bit, word_instr: Bit]
+```
+)
+In other words, the #shift chip is designed to constrain that 
+$ 
+#`shifted` := cases(
+  #`in` #`<<` #`s` " if" #`direction` = 0,
+  #`in` #`>>` #`s` " if" #`direction` = 1 and #`signed` = 0,
+  #`in` #`>>>` #`s` "if" #`direction` = 1 and #`signed` = 1,
+) 
+$
+where
+$ 
+#`s` := cases(
+  #`shift` mod 32 "if" #`word_instr` = 1,
+  #`shift` mod 64 "if" #`word_instr` = 0,
+) 
+$
+Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.
+
+== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+
+The `SHIFT` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
+#render_chip_column_table(chip, config)
+
+== Assumptions
+#render_chip_assumptions(chip, config)
+
+== Explanation
+This chip has a rather complex design as a result of designing it to fit in as few columns as possible.
+We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
+
+The chip's design revolves around a two-phase shifting process:
+1. shift `in` by $x := #`shift` mod 16$ bits, 
+2. shift that result by $(#`shift`-x) mod 64$ (or $mod 32$ if $ #`word_instr` = 1$).
+The intermediate value representing the state between the two phases is stored in the scratch variables `X` and `Y`.
+The definition of `shifted` describes how one can combine the `X`, `Y` and `extension` variables to construct the output value as described using `Half`-limbs.
+The output variable `out` is equivalent to `shifted`, but expressed using `Word`-limbs.
+
+In the following, we cover how these two phases were designed to complement one another.
+Here, we start with discussing the _logical_ left/right shift operations only; the modifications required to compute the _arithmetic_ right shift will be discussed at the end.
+
+=== First phase
+We zoom in on the first step.
+Here, we make use of the two lookup operations 
+- $#`HWSL[x: Half, y: B4]` := (#`x` #`<<` #`y`) mod 2^16$ (short for "HalfWord Shift Left"), and
+- $#`HWSLC[x: Half, y: B4]` := #`x` #`>>` (16-#`y`)$ (short for "HalfWord Shift Left's Carry")
+Note here that one can use these two lookups to compute `out: Half[4] := in << y` as:
+$
+  #`out[`i#`]` = cases(
+    #`HWSL[in[`0#`], y]` &"if" i = 0,
+    #`HWSL[in[`i#`], y] | HWSLC[in[`i-1#`], y]` &"if" i in [1, 3]   
+  )
+$
+as long as $#`y` < 16$.
+Observing that 
+$#`HWSL[x,` 16-#`y]` = (#`x` #`<<` (16-#`y`)) mod 2^16$, and
+$#`HWSLC[x,` 16-#`y]` = #`x` #`>>` #`y`$ for $#`y` in [1, 15]$,
+one can also use these lookups to compute `out := in >> y` as
+$
+  #`out[`i#`]` = cases(
+    #`HWSLC[in[`i#`],` 16-#`y] | HWSL[in[`i+1#`],` 16-#`y]` &"if" i in [0, 2],
+    #`HWSLC[in[`3#`],` 16-#`y]` &"if" i = 3
+  )
+$
+as long as $0 < #`y` < 16$.
+
+Observe now that the values being looked up are (almost) independent from the direction of the shift: only the shift-amount varies slightly.
+When we now define
+$
+  #`bit_shift` := cases(
+    #`shift` mod 16 & "when shifting left",
+    (16-#`shift`) mod 16 & "when shifting right"
+  ),  
+$
+it only takes some rearranging and combining of the values $#`X[`i#`] := HWSL[in[`i#`], bit_shift]`$ and $#`Y[`i#`] := HWSLC[in[`i#`], bit_shift]`$ to form the limbs of $#`in <</>> shift` mod 16$.
+In the remaining case that $#`right` = 1$ and $#`shift` = 0 mod 16$, the limbs of $#`in <</>> shift` mod 16$ simply match those of `in`.
+
+=== Second phase
+Since we're operating on 16-bit limbs, all the limbs in $#`in <</>> shift`$ must also occur somewhere in $#`in <</>> shift` mod 16$.
+The number of full-limbs we still need to shift is determined by the fifth and sixth least significant bits of `shift`.
+With `limb_shift` containing a unary encoding of the integer represented by these two bits, we find that the intermediate value needs to be shifted over by $i$ limbs (to the `left` or `right`) when $#`limb_shift[`i#`]` = 1$.
+These things combined yield `shifted`'s definition.
+
+Of course, when $#`word_instr` = 1$ and, thus, only $#`shift` mod 32$ should be considered, the bit-mask for the lookup constraining `limb_shift` is adjusted appropriately (see @shift:c:limb_shift_lookup).
+
+=== Arithmetic right shift
+Lastly, we discuss the case of performing the _arithmetic_ right shift.
+Here, `extension` is constrained to contain a repetition of `in`'s most significant bit.
+Copies of this variable are used for any full limbs shifted in when $#`right` = #`signed` = 1$.
+Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of $#`in >>> shift` mod 16$ as the appropriate intermediate.
+
+== Constraints
+First, we constrain `bit_shift` based on whether we are left or right-shifting.
+@shift:c:zbs makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. 
+This flag is used to indicate the special case that $#`right` = 1$ and $#`shift` = 0 mod 16$.
+#render_constraint_table(chip, config, groups: "bit_shift")
+
+Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively.
+When `zbs = 1`, the output cannot be used to compose $#`in >>/>>> shift` mod 16$.
+To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
+
+The case of `left`-shifting and $#`bit_shift` = 0$ will be used for padding rows.
+To prevent unnecessary lookups in padding rows, we override $#`X[i]` := #`in[i]`$ and $#`Y[i]` := 0$ here.
+#render_constraint_table(chip, config, groups: "intra_limb_shift")
+
+=== Full-limb shifting
+Next, we constrain that `limb_shift` is a proper unary encoding of the fifth (and sixth if $#`word_instr` = 0$) bit of `shift`.
+For this to be the case, three requirements must be satisfied:
++ *unary(0)*: $#`limb_shift[`i#`]` in {0, 1}$ for $i in [0, 3]$,
++ *unary(1)*: $#`limb_shift[`i#`]` = 1$ for exactly one $i$, and
++ *proper encoding*: $#`limb_shift[`i#`]` = 1 <=> 1/16 (#`shift &` (48-32 dot #`word_instr`)) = i$
+The first requirement is enforced by constraint @shift:c:limb_shift_is_bit.
+To construct a constraint for the second and third requirement, observe that
+$
+1/16 dot (#`shift &` (48-32 dot #`word_instr`)) in cases(
+  {0, 1, 2, 3} &"if" #`word_instr` = 0,
+  {0, 1} &"if" #`word_instr` = 1
+)
+$
+Observe moreover that, assuming *unary(0)*, the expression
+$
+  1/16 dot (1 + sum_(i=0)^3 (16i-1) dot #`limb_shift[`i#`]`)
+$
+can evaluate to $i$ if and only if $#`limb_shift[`i#`]` = 1$, while the others are $0$.
+This means that the relation
+$
+  1 + sum_(i=0)^3 (16i-1) dot #`limb_shift[`i#`]` = #`shift &` (48-32 dot #`word_instr`)
+$
+enforces both *unary(1)* and *proper encoding*.
+This is the exact relation @shift:c:limb_shift_lookup enforces.
+
+
+Hereafter, one must only check that `out` is the proper cast of `shifted` into a `DWordWL`.
+#render_constraint_table(chip, config, groups: "limb_shifting")
+
+=== Miscellaneous 
+#render_constraint_table(chip, config, groups: ("left_flag", "is_negative"))
+*Note*: `is_negative` is not used when `signed = 0`.
+As such, there is no problem with it being unconstrained in this case.
+
+=== Lookups
+This chip adds the following interaction to the lookup.
+#render_constraint_table(chip, config, groups: "lookups")
diff --git a/spec/src/shift.toml b/spec/src/shift.toml
new file mode 100644
index 000000000..e2ddfa12b
--- /dev/null
+++ b/spec/src/shift.toml
@@ -0,0 +1,283 @@
+name = "SHIFT"
+
+# Input
+
+[[variables.input]]
+name = "in"
+type = "DWordHL"
+desc = "The value being shifted"
+
+[[variables.input]]
+name = "shift"
+type = "Byte"
+desc = "Number of bits to shift `in` by."
+
+[[variables.input]]
+name = "direction"
+type = "Bit"
+desc = "Whether to shift left (0) or right (1)."
+
+[[variables.input]]
+name = "signed"
+type = "Bit"
+desc = "Whether to interpret `in` as a signed integer."
+
+[[variables.input]]
+name = "word_instr"
+type = "Bit"
+desc = "Whether this is a Word-instruction (1) or not (0)."
+
+
+# Output
+
+[[variables.output]]
+name = "out"
+type = "DWordWL"
+desc = "$#`in <</>>/>>>` (#`shift` mod 32 dot (2 - #`word_instr`))$"
+
+# Auxiliary
+
+[[variables.auxiliary]]
+name = "is_negative"
+type = "Bit"
+desc = "Whether `in` is negative"
+
+[[variables.auxiliary]]
+name = "bit_shift"
+type = "Byte"
+desc = "Value by which to shift `in` to obtain `X` and `Y`"
+
+[[variables.auxiliary]]
+name = "zbs"
+type = "Bit"
+desc = "Whether `bit_shift` is zero (1) or not (0)."
+
+[[variables.auxiliary]]
+name = "X"
+type = ["Half", 5]
+desc = "scratch variable."
+
+[[variables.auxiliary]]
+name = "Y"
+type = ["Half", 4]
+desc = "scratch variable."
+
+[[variables.auxiliary]]
+name = "limb_shift"
+type = ["Bit", 4]
+desc = "One-hot vector indicating whether $floor.l #`shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $#`word_instr` = 1$ and $4$ otherwise."
+
+# Virtual
+
+[[variables.virtual]]
+name = "extension"
+type = "Half"
+desc = "sign extension of `in`."
+def = ["*", 65535, "is_negative"]
+
+[[variables.virtual]]
+name = "left"
+type = "Bit"
+desc = "Whether to perform a left-shift."
+def = ["-", "μ", "direction"]
+
+[[variables.virtual]]
+name = "right"
+type = "Bit"
+desc = "Whether to perform a right-shift."
+def = "direction"
+
+[[variables.virtual]]
+name = "intra_limb_left"
+type = "DWordHL"
+desc = "`in << (shift % 16)` if `left`"
+def = {idx="i", polys=[
+    {range=0, poly=["idx", "X", 0]},
+    {range=[1, 3], poly=["+", ["idx", "X", "i"], ["idx", "Y", ["-", "i", 1]]]},
+]}
+
+[[variables.virtual]]
+name = "intra_limb_right"
+type = "DWordHL"
+desc = "`in >>> (shift % 16)` if `right` and `signed`;\\ `in >> (shift % 16)` if `right` and `!signed`"
+def = {idx="i", range=[0, 3], poly=["+", ["idx", "Y", "i"], ["idx", "X", ["+", "i", 1]]]}
+
+[[variables.virtual]]
+name = "shifted"
+type = "DWordHL"
+desc = "$#`in <</>>/>>>` (#`shift` mod 32 dot (2 - #`word_instr`))$"
+def = {idx="i", range=[0, 3], poly=["+", ["*", "left", ["sum", ["=", "j", 0], "i", ["*", ["idx", "limb_shift", "j"], ["idx", "intra_limb_left", ["-", "i", "j"]]]]], ["*", "right", ["+", ["sum", ["=", "j", 0], ["-", 3, "i"], ["*", ["idx", "limb_shift", "j"], ["idx", "intra_limb_right", ["+", "i", "j"]]]], ["*", "extension", ["sum", ["=", "j", ["-", 3, "i"]], 3, ["idx", "limb_shift", "j"]]]]]]}
+
+# Multiplicities
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+
+
+
+# Assumptions
+
+[[assumptions]]
+desc = "`IS_HALFWORD[in[i]]`"
+range = ["i", 0, 3]
+ref = "shift:a:range_in"
+
+[[assumptions]]
+desc = "`IS_BYTE[shift]`"
+ref = "shift:a:range_shift"
+
+[[assumptions]]
+desc = "`IS_BIT<direction>`"
+ref = "shift:a:direction"
+
+[[assumptions]]
+desc = "`IS_BIT<signed>`"
+ref = "shift:a:signed"
+
+[[assumptions]]
+desc = "`IS_BIT<word_instr>`"
+ref = "shift:a:word_instr"
+
+# Constraints
+
+[[constraint_groups]]
+name = "left_flag"
+
+[[constraints.left_flag]]
+kind = "arith"
+desc = "enforces `left` is `Bit`."
+constraint = "$#`direction` => #`μ` = 1$"
+poly = ["*", "direction", ["not", "μ"]]
+ref = "shift:c:direction_implies_mu"
+
+
+[[constraint_groups]]
+name = "is_negative"
+
+[[constraints.is_negative]]
+kind = "interaction"
+tag = "MSB16"
+input = [["idx", "in", 3]]
+output = "is_negative"
+multiplicity = "signed"
+ref = "shift:c:is_negative_if_signed"
+
+
+[[constraint_groups]]
+name = "bit_shift"
+
+[[constraints.bit_shift]]
+kind = "interaction"
+tag = "AND_BYTE"
+input = ["shift", 0x0F]
+output = "bit_shift"
+ref = "shift:c:bit_shift_if_left"
+multiplicity = "left"
+
+[[constraints.bit_shift]]
+kind = "interaction"
+tag = "AND_BYTE"
+input = [["-", ["^", 2, 8], "shift"], 0x0F]
+output = "bit_shift"
+ref = "shift:c:bit_shift_if_right"
+multiplicity = "right"
+
+[[constraints.bit_shift]]
+kind = "template"
+tag = "IsZero"
+input = ["bit_shift"]
+output = "zbs"
+ref = "shift:c:zbs"
+multiplicity = "μ"
+
+
+[[constraint_groups]]
+name = "intra_limb_shift"
+
+[[constraints.intra_limb_shift]]
+kind = "interaction"
+tag = "HWSL"
+input = [["idx", "in", "i"], "bit_shift"]
+output = ["idx", "X", "i"]
+range = ["i", 0, 3]
+ref = "shift:c:hwsl_if_not_zero"
+multiplicity = ["not", "zbs"]
+
+[[constraints.intra_limb_shift]]
+kind = "arith"
+constraint = "$#`zbs` => #`X[i]` = #`in[i]` dot #`left`$"
+poly = ["*", "zbs", ["-", ["idx", "X", "i"], ["*", ["idx", "in", "i"], "left"]]]
+range = ["i", 0, 3]
+ref = "shift:c:zbs_implies_X"
+
+[[constraints.intra_limb_shift]]
+kind = "interaction"
+tag = "HWSL"
+input = ["extension", "bit_shift"]
+output = ["idx", "X", 4]
+ref = "shift:c:hwsl_x4_if_not_zero"
+multiplicity = ["not", "zbs"]
+
+[[constraints.intra_limb_shift]]
+kind = "arith"
+constraint = "$#`zbs` => #`X[4]` = 0$"
+poly = ["*", "zbs", ["idx", "X", 4]]
+ref = "shift:c:zbs_implies_X_4"
+
+[[constraints.intra_limb_shift]]
+kind = "interaction"
+tag = "HWSLC"
+input = [["idx", "in", "i"], "bit_shift"]
+output = ["idx", "Y", "i"]
+range = ["i", 0, 3]
+ref = "shift:c:hwslc_if_not_zero"
+multiplicity = ["not", "zbs"]
+
+[[constraints.intra_limb_shift]]
+kind = "arith"
+constraint = "$#`zbs` => #`Y[i]` = #`in[i]` dot #`right`$"
+poly = ["*", "zbs", ["-", ["idx", "Y", "i"], ["*", ["idx", "in", "i"], "right"]]]
+range = ["i", 0, 3]
+ref = "shift:c:zbs_implies_Y"
+
+
+[[constraint_groups]]
+name = "limb_shifting"
+
+[[constraints.limb_shifting]]
+kind = "template"
+tag = "IS_BIT"
+input = [["idx", "limb_shift", "i"]]
+range = ["i", 0, 3]
+ref = "shift:c:limb_shift_is_bit"
+
+[[constraints.limb_shifting]]
+kind = "interaction"
+tag = "AND_BYTE"
+input = ["shift", ["-", 0x30, ["*", 0x20, "word_instr"]]]
+output = ["+", ["-", 1, ["idx", "limb_shift", 0]], ["*", 15, ["idx", "limb_shift", 1]], ["*", 31, ["idx", "limb_shift", 2]], ["*", 47, ["idx", "limb_shift", 3]]]
+ref = "shift:c:limb_shift_lookup"
+multiplicity = "μ"
+
+[[constraints.limb_shifting]]
+kind = "arith"
+constraint = "$#`out[:2]` = #`shifted[:4]`$"
+poly = ["-", ["idx", "out", "i"], ["idx", ["cast", "shifted", "DWordWL"], "i"]]
+range = ["i", 0, 1]
+ref = "shift:c:out_eq_shifted"
+
+
+# Lookups
+
+[[constraint_groups]]
+name = "lookups"
+
+[[constraints.lookups]]
+kind = "interaction"
+tag = "SHIFT"
+input = ["in", "shift", "direction", "signed", "word_instr"]
+output = "out"
+multiplicity = "-μ"
+ref = "shift:c:lookup"

From 795a7220aca598ba70fa085f4ef5b43e90f51ee1 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Fri, 9 Jan 2026 09:17:05 +0100
Subject: [PATCH 023/105] spec: `ADD` template (#97)

* spec: ADD draft

* spec: ADD: fix `carry` size

* spec: ADD: clarify sum is mod 2^64

* spec: introduce `SUB` template notation.

* Fix assumption indices

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

* Fix typos

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

---------

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>
---
 spec/add.typ      | 48 +++++++++++++++++++++++++++++++++++
 spec/book.typ     |  1 +
 spec/src/add.toml | 64 +++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 113 insertions(+)
 create mode 100644 spec/add.typ
 create mode 100644 spec/src/add.toml

diff --git a/spec/add.typ b/spec/add.typ
new file mode 100644
index 000000000..0dade7b01
--- /dev/null
+++ b/spec/add.typ
@@ -0,0 +1,48 @@
+#import "/book.typ": book-page, et
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": render_chip_column_table, render_chip_assumptions, render_constraint_table
+
+#show: book-page.with(title: "ADD/SUB")
+
+#let config = load_config()
+#let chip = load_chip("src/add.toml", config)
+
+#let add = raw(chip.name)
+
+#let highlighted_code(code) = {
+  box(
+    inset: (left: 4pt, right: 4pt), 
+    outset: (top: 4pt, bottom: 4pt), 
+    radius: 2pt,
+    fill: luma(230), 
+    raw(code))
+}
+
+= #add template
+#add is a constraint template that is used to assert that $#`sum` = #`lhs` + #`rhs` mod 2^64$, under the condition that `cond` is non-zero.
+
+== Notation
+The #add constraint template has the following interface:
+#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => ADD<sum; lhs, rhs>"))
+where `cond` is any value described by an expression _of degree at most $1$_.
+#highlighted_code("ADD<sum; lhs, rhs>") can be used to denote the _unconditional_ application of the #add template to `lhs`, `rhs`, and `sum`.
+
+#let sub = raw("SUB")
+=== #sub
+For ease of notation, we moreover introduce the #sub constraint template.
+Its interface
+#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => SUB<diff; lhs, rhs>"))
+maps onto the #add template as 
+#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => ADD<lhs; rhs, diff>"))
+It constrains that $#`diff` = #`lhs` - #`rhs` mod 2^64$ when the expression `cond` is non-zero.
+As with #add, #highlighted_code("SUB<diff; lhs, rhs>") can be used to denote the _unconditional_ application of the template.
+
+== Variables
+#render_chip_column_table(chip, config)
+
+== Assumptions
+#render_chip_assumptions(chip, config)
+
+== Constraints
+This template introduces the following constraints:
+#render_constraint_table(chip, config)
diff --git a/spec/book.typ b/spec/book.typ
index 6652ed646..3e001e1e0 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -7,6 +7,7 @@
   summary: [
     #chapter("variables.typ")[Variables]
     #chapter("is_bit.typ")[IS_BIT template]
+    #chapter("add.typ")[ADD template]
     #chapter("cpu.typ")[CPU chip]
     #chapter("shift.typ")[SHIFT chip]
     #chapter("branch.typ")[BRANCH]
diff --git a/spec/src/add.toml b/spec/src/add.toml
new file mode 100644
index 000000000..a0ccf6942
--- /dev/null
+++ b/spec/src/add.toml
@@ -0,0 +1,64 @@
+name = "ADD"
+
+# Variables
+
+[[variables.condition]]
+name = "cond"
+type = "BaseField"
+desc = "Whether the relation should be enforced ($eq.not 0$) or not ($0$)."
+
+[[variables.input]]
+name = "lhs"
+type = "DWordWL"
+desc = "left-hand operand"
+
+[[variables.input]]
+name = "rhs"
+type = "DWordWL"
+desc = "right-hand operand"
+
+[[variables.output]]
+name = "sum"
+type = "DWordWL"
+desc = "$#`lhs` + #`rhs`$"
+
+[[variables.virtual]]
+name = "carry"
+desc = "Carry values used to constrain the addition"
+type = ["Bit", 2]
+def = {idx="i", polys=[
+    {range=0, poly=["*", ["^", 2, -32], ["-", ["+", ["idx", "lhs", 0], ["idx", "rhs", 0]], ["idx", "sum", 0]]]},
+    {range=1, poly=["*", ["^", 2, -32], ["-", ["+", ["idx", "lhs", 1], ["idx", "rhs", 1], ["idx", "carry", 0]], ["idx", "sum", 1]]]},
+]}
+
+
+# Assumptions
+
+[[assumptions]]
+desc = "`IS_WORD[lhs[i]]`"
+range = ["i", 0, 1]
+ref = "add:a:lhs"
+
+[[assumptions]]
+desc = "`IS_WORD[rhs[i]]`"
+range = ["i", 0, 1]
+ref = "add:a:rhs"
+
+[[assumptions]]
+desc = "`IS_WORD[sum[i]]`"
+range = ["i", 0, 1]
+ref = "add:a:sum"
+
+# Constraints
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = [["idx", "carry", "i"]]
+range = ["i", 0, 1]
+cond = "cond"
+ref = "add:c:carry"
+

From 1ba94213b6bf11f0245438b06f4f282717fe380f Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Fri, 9 Jan 2026 12:21:50 +0100
Subject: [PATCH 024/105] spec: have column table subheaders repeat on page
 wrap (#121)

---
 spec/chip.typ | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index 6914aa1f6..84a575c92 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -90,7 +90,7 @@
     table.header([*Label*], [*Type*], table.cell(colspan: 2, [*Description*])),
     table.hline(stroke: stroke(thickness: 2pt)),
     ..for (cat, vars) in chip.variables.pairs() {
-      (table.cell(colspan: 4, emph(cat)), table.hline(stroke: .6pt))
+      (table.header(level:2, table.cell(colspan: 4, emph(cat))), table.hline(stroke: .6pt))
       for var in vars {
         (
           [#raw(var.name)], 

From 62fc94b5904e6fe73d6c8912c283523bb10cea77 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Fri, 9 Jan 2026 12:24:20 +0100
Subject: [PATCH 025/105] spec: drop `dot` when multiplying constant with
 one-letter variable. (#120)

---
 spec/expr.typ | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/spec/expr.typ b/spec/expr.typ
index 547f2cad2..745d95d23 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -142,7 +142,15 @@
         pp <= PREC.sub
       )
     },
-    "*": (pp, rec, e) => mwrap($#e.slice(1).map(rec.with(PREC.mul)).join($dot$)$, pp < PREC.mul),
+    "*": (pp, rec, e) => {
+      if e.len() == 3 and type(e.at(1)) == int and type(e.at(2)) == str and e.at(2).len() == 1 {
+        // multiplication of a constant with one-letter variable. 
+        // Dropping the "dot"
+        mwrap($#e.slice(1).map(rec.with(PREC.mul)).join($$)$, pp < PREC.mul)
+      } else {
+        mwrap($#e.slice(1).map(rec.with(PREC.mul)).join($dot$)$, pp < PREC.mul)
+      }
+    },
     "/": (pp, rec, e) => $#rec(PREC.div, e.at(1)) / #rec(PREC.div, e.at(2))$,
     "^": (pp, rec, e) => {
       assert(type(e.at(1)) == int and type(e.at(2)) == int, message: "Can only exponentiate constants")

From bf9662fe8e1bed0e49a4713ab8a3b58b09195611 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Tue, 13 Jan 2026 15:08:13 +0100
Subject: [PATCH 026/105] spec: `MUL` chip (#122)

* spec: support "sum" expression

* spec: introduce "QuadHL" type

* spec: introduce MUL chip

* spec: Introduce QuadWL

* spec: introduce B20[4]

* spec: simplify MUL to 26 columns

* spec: Fix expr-sum bug

* spec: simplify MUL to 22 columns

* spec: improve MUL readability

* spec: MUL: fix indexing

* spec: MUL: refactor

* spec: drop B20

* spec: MUL: fix raw_product relation

* spec: MUL: fix IS_B19 check range

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

* spec: MUL: add missing res range check assumption

* spec: MUL: remove superfluous/invalid constraints

* spec: MUL: leverage SIGN template

* spec: MUL: fix index mistake

* spec: MUL: update description

* spec: permit non-constant exponents

* spec: MUL: drop `limb_product`

* spec: MUL: minor tweaks

* spec: MUL: bump headers

* spec: MUL: update description

* spec: MUL: update to IS_B20

* spec: MUL: remove 'eloquent'

* Apply suggestions from code review

Thanks Robin!

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

* spec: MUL: define padding

---------

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>
---
 spec/book.typ        |   1 +
 spec/expr.typ        |   4 +-
 spec/mul.typ         |  94 +++++++++++++++++++++++
 spec/src/config.toml |  31 ++++++++
 spec/src/mul.toml    | 179 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 307 insertions(+), 2 deletions(-)
 create mode 100644 spec/mul.typ
 create mode 100644 spec/src/mul.toml

diff --git a/spec/book.typ b/spec/book.typ
index 3e001e1e0..01a362879 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -12,6 +12,7 @@
     #chapter("shift.typ")[SHIFT chip]
     #chapter("branch.typ")[BRANCH]
     #chapter("lt.typ")[LT]
+    #chapter("mul.typ")[MUL chip]
   ]
 )
 
diff --git a/spec/expr.typ b/spec/expr.typ
index 745d95d23..1044001e8 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -153,8 +153,8 @@
     },
     "/": (pp, rec, e) => $#rec(PREC.div, e.at(1)) / #rec(PREC.div, e.at(2))$,
     "^": (pp, rec, e) => {
-      assert(type(e.at(1)) == int and type(e.at(2)) == int, message: "Can only exponentiate constants")
-      $#e.at(1)^#e.at(2)$
+      assert(type(e.at(1)) == int, message: "Can only exponentiate constants")
+      $#e.at(1)^#rec(PREC.MAX, e.at(2))$
     },
     "=": (pp, rec, e) => $#rec(PREC.eq, e.at(1)) = #rec(PREC.eq, e.at(2))$,
     ":=": (pp, rec, e) => $#rec(PREC.eq, e.at(1)) := #rec(PREC.eq, e.at(2))$,
diff --git a/spec/mul.typ b/spec/mul.typ
new file mode 100644
index 000000000..92fafe26b
--- /dev/null
+++ b/spec/mul.typ
@@ -0,0 +1,94 @@
+#import "/book.typ": book-page
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_column_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+  render_chip_assumptions,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/mul.toml", config)
+
+#show: book-page.with(title: "MUL chip")
+
+#let mul = raw(chip.name)
+
+= #mul chip
+
+== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+
+The `MUL` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
+#render_chip_column_table(chip, config)
+
+#let stackrel(top, bottom) = {
+ $mat(delim: #none, top; bottom)$
+}
+
+== Assumptions
+The following range checks are assumed to be performed/enforced outside of this chip:
+#render_chip_assumptions(chip, config)
+
+== Constraints
+
+=== Overview
+When `lhs` and `rhs` are _unsigned_ integers, computing their product $mod 2^128$ comes down to evaluating
+$
+(sum_(j=0)^3 2^(16j) dot #`lhs`_j) dot (sum_(i=0)^3 2^(16i) dot #`rhs`_i) mod 2^128.
+$
+If `lhs` and `rhs` are signed instead, the computation remains nearly identical: 
+based on their signs, one must either zero or one-extend `lhs` and `rhs` --- forming `lhs_ext` and `rhs_ext` respectively --- and compute their product $mod 2^128$:
+$
+(sum_(j=0)^7 2^(16j) dot #`lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot #`rhs_ext`_i) mod 2^128
+$
+where `lhs_ext` and `rhs_ext` are treated as _unsigned_ integers.
+Note that by setting the extension limbs of `lhs` and/or `rhs` to $0$ when the integer is (i) unsigned or (ii) signed and non-negative, this second formula still applies.
+For the purposes of constraining the multiplication operation, we rewrite this formula as
+#show math.equation: set block(breakable: true)
+$
+  &(sum_(j=0)^7 2^(16j) dot #`lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot #`rhs_ext`_i) mod 2^128 \
+  &equiv sum_(j=0)^7 sum_(i=0)^7 2^(16(i+j)) dot #`lhs_ext`_j dot #`rhs_ext`_i mod 2^128 \
+  &stackrel(triangle, equiv) sum_(j=0)^7 sum_(i=0)^(7-j) 2^(16(i+j)) dot #`lhs_ext`_j dot #`rhs_ext`_i mod 2^128 \
+  &stackrel(square, equiv) sum_(j=0)^7 sum_(i=j)^(7) 2^(16i) dot #`lhs_ext`_j dot #`rhs_ext`_(i-j) mod 2^128 \
+  &stackrel(penta, equiv) sum_(i=0)^7 sum_(j=0)^(i) 2^(16i) dot #`lhs_ext`_j dot #`rhs_ext`_(i-j) mod 2^128 \
+  &equiv sum_(i=0)^3 sum_(k=0)^1 sum_(j=0)^(2i+k) 2^(16(2i+k)) dot #`lhs_ext`_j dot #`rhs_ext`_(2i+k-j) mod 2^128 \
+  &equiv sum_(i=0)^3 2^(32i) dot sum_(k=0)^1 2^(16k) dot sum_(j=0)^(2i+k) #`lhs_ext`_j dot #`rhs_ext`_(2i+k-j) mod 2^128
+$
+where at step
+- $triangle$ we can ignore $i > 7-j$, since that makes $2^(16(i+j)) equiv 0 mod 2^128$,
+- $square$ we rewrite the second summation such that $i$ iterates from $j$ to 7, rather than $0$ to $7-j$, and
+- $penta$ we swap the sums.
+
+We let `raw_product` capture the second summation in this last formula (see @mul:c:raw_product).
+By construction, $#`raw_product`_i < 2^51$ for all $i in [0, 3]$, far exceeding the 32 bits that fit in a single `Word`-limb.
+What remains then is to reduce each limb of `raw_product` $mod 2^32$, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
+
+This reduce-and-carry operation is constrained by @mul:a:res and @mul:c:carry, combined with `carry`'s definition.
+@mul:c:carry and `carry`'s definition enforce that
+$
+  forall i in [0, 3]: #`raw_product`_i + #`carry`_(i-1) - #`res`_i in { k dot 2^32 | k in [0, 2^20) }
+$
+with $#`carry`_(-1) = 0$ for simplicity.
+In other words: $#`res`_i equiv #`raw_product`_i + #`carry`_(i-1) (mod 2^32)$.
+With @mul:a:res forcing $#`res`_i < 2^32$, $#`res`_i$ can only assume one value: $#`raw_product`_i + #`carry`_(i-1) mod 2^32$.
+
+*Note*: one may have observed that @mul:c:carry requires $#`carry`_i in [0, 2^20)$, while no limb of a valid carry value would ever exceed $2^19$.
+This is indeed the case.
+However, there is some slack in how tight one has to constrain the `carry` values.
+In fact, in this situation it suffices to assert that $#`carry`_i < frac(p, 2^32, style: "skewed") approx 2^31$, where $p$ denotes the field's modulus.
+Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
+
+=== Definitions
+We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `carry` is appropriately range checked.
+#render_constraint_table(chip, config, groups: "def")
+
+=== Product
+@mul:c:raw_product defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
+#render_constraint_table(chip, config, groups: "prod")
+
+=== Lookup
+The #mul chip contributes the following to the lookup:
+#render_constraint_table(chip, config, groups: "lookup")
\ No newline at end of file
diff --git a/spec/src/config.toml b/spec/src/config.toml
index b66639e2a..389e4b16a 100644
--- a/spec/src/config.toml
+++ b/spec/src/config.toml
@@ -27,6 +27,11 @@ label = "Half"
 subtypes = ["BaseField"]
 desc = "Variable that can only assume values in the range $[0, 2^16)$."
 
+[[variables.types]]
+label = "B20"
+subtypes = ["BaseField"]
+desc = "Variable that can only assume values in the range $[0, 2^20)$."
+
 [[variables.types]]
 label = "Word"
 subtypes = ["BaseField"]
@@ -48,6 +53,16 @@ desc = """\
        Represented as an array of four `Byte` variables.\
        """
 
+[[variables.types]]
+label = "B35"
+subtypes = ["BaseField"]
+desc = "Variable that can only assume values in the range $[0, 2^35)$."
+
+[[variables.types]]
+label = "B51"
+subtypes = ["BaseField"]
+desc = "Variable that can only assume values in the range $[0, 2^51)$."
+
 [[variables.types]]
 label = "DWordBL"
 subtypes = ["Byte", "Byte", "Byte", "Byte", "Byte", "Byte", "Byte", "Byte"]
@@ -81,6 +96,22 @@ desc = """\
        The `Word` is the *least* significant digit.
        """
 
+[[variables.types]]
+label = "QuadHL"
+subtypes = ["Half", "Half", "Half", "Half", "Half", "Half", "Half", "Half"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^128)$. \\
+       Represented as an array of eight `Half` variables.\
+       """
+
+[[variables.types]]
+label = "QuadWL"
+subtypes = ["Word", "Word", "Word", "Word"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^128)$. \\
+       Represented as an array of four `Word` variables.\
+       """
+
 [[variables.types]]
 label = "DWordWHH"
 subtypes = ["Half", "Half", "Word"]
diff --git a/spec/src/mul.toml b/spec/src/mul.toml
new file mode 100644
index 000000000..bf9ffc276
--- /dev/null
+++ b/spec/src/mul.toml
@@ -0,0 +1,179 @@
+name = "MUL"
+
+
+# Input
+
+[[variables.input]]
+name = "lhs"
+type = "DWordHL"
+desc = "the left-hand operand."
+pad = 0
+
+[[variables.input]]
+name = "lhs_signed"
+type = "Bit"
+desc = "whether to interpret `lhs` as a signed integer (1) or not (0)."
+pad = 0
+
+[[variables.input]]
+name = "rhs"
+type = "DWordHL"
+desc = "the right-hand operand."
+pad = 0
+
+[[variables.input]]
+name = "rhs_signed"
+type = "Bit"
+desc = "whether to interpret `rhs` as a signed integer (1) or not (0)."
+pad = 0
+
+
+# Output
+
+[[variables.output]]
+name = "res"
+type = "QuadWL"
+desc = "the (extended) multiplication result"
+pad = 0
+
+# Auxiliary
+
+[[variables.auxiliary]]
+name = "lhs_is_negative"
+type = "Bit"
+desc = "whether `lhs` is negative (1) or not (0)"
+pad = 0
+
+[[variables.auxiliary]]
+name = "rhs_is_negative"
+type = "Bit"
+desc = "whether `rhs` is negative (1) or not (0)"
+pad = 0
+
+[[variables.auxiliary]]
+name = "raw_product"
+type = ["B51", 4]
+desc = "raw multiplication output"
+pad = 0
+
+# Virtual
+
+[[variables.virtual]]
+name = "lhs_ext"
+type = ["Half", 8]
+desc = "sign-extended value of `lhs`"
+def = {idx="i", polys=[
+    {range=[0, 3], poly=["idx", "lhs", "i"]},
+    {range=[4, 7], poly=["*", 0xFFFF, "lhs_is_negative"]},
+]}
+
+[[variables.virtual]]
+name = "rhs_ext"
+type = ["Half", 8]
+desc = "sign-extended value of `rhs`"
+def = {idx="i", polys=[
+    {range=[0, 3], poly=["idx", "rhs", "i"]},
+    {range=[4, 7], poly=["*", 0xFFFF, "rhs_is_negative"]},
+]}
+
+[[variables.virtual]]
+name = "carry"
+type = ["B20", 4]
+desc = "carry values"
+def = {idx="i", polys=[
+    {range=0, poly=["*", ["^", 2, -32], ["-", ["idx", "raw_product", 0], ["idx", "res", 0]]]},
+    {range=[1, 3], poly=["*", ["^", 2, -32], ["-", ["+", ["idx", "raw_product", "i"], ["idx", "carry", ["-", "i", 1]]], ["idx", "res", "i"]]]},
+]}
+
+[[variables.virtual]]
+name = "μ_sum"
+type = "BaseField"
+desc = "sum of multiplicities"
+def = ["+", "μ_lo", "μ_hi"]
+
+# Multiplicity
+
+[[variables.multiplicity]]
+name = "μ_lo"
+type = "BaseField"
+desc = ""
+pad = 0
+
+[[variables.multiplicity]]
+name = "μ_hi"
+type = "BaseField"
+desc = ""
+pad = 0
+
+# Assumptions
+
+[[assumptions]]
+desc = "`IS_HALF[lhs[i]]`"
+range = ["i", 0, 3]
+
+[[assumptions]]
+desc = "`IS_HALF[rhs[i]]`"
+range = ["i", 0, 3]
+
+[[assumptions]]
+desc = "`IS_WORD[res[i]]`"
+range = ["i", 0, 3]
+ref = "mul:a:res"
+
+
+# Constraints
+
+[[constraint_groups]]
+name = "def"
+
+[[constraints.def]]
+kind = "template"
+tag = "SIGN"
+input = [["idx", "lhs", 3], "lhs_signed"]
+output = "lhs_is_negative"
+ref = "mul:c:lhs_is_negative"
+
+[[constraints.def]]
+kind = "template"
+tag = "SIGN"
+input = [["idx", "rhs", 3], "rhs_signed"]
+output = "rhs_is_negative"
+ref = "mul:c:rhs_is_negative"
+
+[[constraints.def]]
+kind = "interaction"
+tag = "IS_B20"
+input = [["idx", "carry", "i"]]
+range = ["i", 0, 3]
+multiplicity = "μ_sum"
+ref = "mul:c:carry"
+
+[[constraint_groups]]
+name = "prod"
+
+
+[[constraints.prod]]
+kind = "arith"
+constraint = "$#`raw_product[i]` = sum_(#`k`=0)^1 2^(16k) sum_(#`j`=0)^(2i+k) #`lhs_ext[j]` dot #`rhs_ext[2i+k-j]`$"
+poly = ["-", ["sum", ["=", "k", 0], "1", ["*", ["^", 2, ["*", 16, "k"]], ["sum", ["=", "j", 0], ["+", ["*", 2, "i"], "k"], ["*", ["idx", "lhs_ext", "j"], ["idx", "rhs_ext", ["-", ["+", ["*", 2, "i"], "k"], "j"]]]]]], ["idx", "raw_product", "i"]]
+range = ["i", 0, 3]
+ref = "mul:c:raw_product"
+
+[[constraint_groups]]
+name = "lookup"
+
+[[constraints.lookup]]
+kind = "interaction"
+tag = "MUL"
+input = ["lhs", "lhs_signed", "rhs", "rhs_signed", "0"]
+output = ["idx", "res", "0:4"]
+multiplicity = ["-", "μ_lo"]
+ref = "mul:c:lookup_lo"
+
+[[constraints.lookup]]
+kind = "interaction"
+tag = "MUL"
+input = ["lhs", "lhs_signed", "rhs", "rhs_signed", "1"]
+output = ["idx", "res", "4:8"]
+multiplicity = ["-", "μ_hi"]
+ref = "mul:c:lookup_hi"
\ No newline at end of file

From 11a0c64e45a3f090ac22b76c1a1abfba8ed98154 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 13 Jan 2026 16:30:07 +0100
Subject: [PATCH 027/105] spec: Add support for specifying padding values of
 columns (#133)

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/branch.typ      |  6 ++++++
 spec/chip.typ        | 26 ++++++++++++++++++++++++++
 spec/lt.typ          | 11 +++++++++--
 spec/mul.typ         |  9 ++++++++-
 spec/shift.typ       |  7 +++++++
 spec/src/branch.toml |  8 ++++++++
 spec/src/config.toml |  1 +
 spec/src/lt.toml     |  8 ++++++++
 spec/src/shift.toml  | 13 +++++++++++++
 9 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/spec/branch.typ b/spec/branch.typ
index d01e9fa03..a18c252b7 100644
--- a/spec/branch.typ
+++ b/spec/branch.typ
@@ -6,6 +6,7 @@
   total_nr_variables,
   total_nr_instantiated_columns,
   render_constraint_table,
+  render_chip_padding_table,
 )
 
 #let config = load_config()
@@ -36,3 +37,8 @@ The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed impli
 This chip contributes the following to the lookup argument.
 #render_constraint_table(chip, config, groups: "output")
 
+== Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+#render_chip_padding_table(chip, config)
diff --git a/spec/chip.typ b/spec/chip.typ
index 84a575c92..ab709c404 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -24,6 +24,32 @@
   .sum()
 }
 
+#let render_chip_padding_table(chip, config) = {
+  // Whether `var` is a preprocessed variable.
+  let is_preprocessed(var) = {
+    config.variables.types
+    .filter(t => t.label == var.type)
+    .any(t => t.at("preprocessed", default: false)) // `any`, not `all`: a type with no matching label (e.g. an array type) must not count as preprocessed
+  }
+
+  let instantiated_vars = config.variables.categories.instantiated.map(c => chip.variables.at(c)).flatten()
+
+  show figure: set block(breakable: true)
+  figure(table(
+    columns: (auto, auto, auto),
+    inset: 6pt,
+    align: (right + top, center + top, left + top),
+    stroke: none,
+    table.header([*Column*], [], [*Padding value*]),
+    table.hline(stroke: stroke(thickness: 2pt)),
+    ..for var in instantiated_vars {
+      if not is_preprocessed(var) { // one row per non-preprocessed variable: name := pad
+        ([#raw(var.name)], [$:=$], [#expr_to_math(var.pad)],)
+      }
+    },
+  ), caption: [Overview of padding values for #chip.name chip.])
+}
+
 /// Generates a table listing `chip`'s columns.
 #let render_chip_column_table(chip, config) = {
 
diff --git a/spec/lt.typ b/spec/lt.typ
index ff3b6dae3..3b57a62e3 100644
--- a/spec/lt.typ
+++ b/spec/lt.typ
@@ -3,9 +3,10 @@
 #import "/chip.typ": (
   render_chip_assumptions,
   render_chip_column_table,
-  total_nr_variables,
-  total_nr_instantiated_columns,
+  render_chip_padding_table,
   render_constraint_table,
+  total_nr_instantiated_columns,
+  total_nr_variables,
 )
 
 #let config = load_config()
@@ -77,3 +78,9 @@ And then we constrain the subtraction.
 The chip contributes the following to the lookup argument.
 
 #render_constraint_table(chip, config, groups: "output")
+
+== Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+#render_chip_padding_table(chip, config)
diff --git a/spec/mul.typ b/spec/mul.typ
index 92fafe26b..1892994f0 100644
--- a/spec/mul.typ
+++ b/spec/mul.typ
@@ -6,6 +6,7 @@
   total_nr_instantiated_columns,
   render_constraint_table,
   render_chip_assumptions,
+  render_chip_padding_table,
 )
 
 #let config = load_config()
@@ -91,4 +92,10 @@ We constrain `lhs_is_negative` and `rhs_is_negative` according to their definiti
 
 === Lookup
 The #mul chip contributes the following to the lookup:
-#render_constraint_table(chip, config, groups: "lookup")
\ No newline at end of file
+#render_constraint_table(chip, config, groups: "lookup")
+
+== Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+#render_chip_padding_table(chip, config)
diff --git a/spec/shift.typ b/spec/shift.typ
index 3555d64e4..70aebc97c 100644
--- a/spec/shift.typ
+++ b/spec/shift.typ
@@ -6,6 +6,7 @@
   total_nr_instantiated_columns,
   render_constraint_table,
   render_chip_assumptions,
+  render_chip_padding_table,
 )
 
 #let config = load_config()
@@ -173,3 +174,9 @@ As such, there is no problem with it being unconstrained in this case.
 === Lookups
 This chip adds the following interaction to the lookup.
 #render_constraint_table(chip, config, groups: "lookups")
+
+== Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+#render_chip_padding_table(chip, config)
diff --git a/spec/src/branch.toml b/spec/src/branch.toml
index b93639602..d6620dcfd 100644
--- a/spec/src/branch.toml
+++ b/spec/src/branch.toml
@@ -7,21 +7,25 @@ name = "BRANCH"
 name = "pc"
 type = "DWordWL"
 desc = "The current pc, used as base address when `!JALR`"
+pad = 0
 
 [[variables.input]]
 name = "offset"
 type = "Word"
 desc = "The offset from the base address to jump to"
+pad = 0
 
 [[variables.input]]
 name = "register"
 type = "DWordWL"
 desc = "The base address to use when `JALR`"
+pad = 0
 
 [[variables.input]]
 name = "JALR"
 type = "Bit"
 desc = "Selects between `pc` and `register` as base address, needed for the `JALR` instruction"
+pad = 0
 
 
 # Output
@@ -30,11 +34,13 @@ desc = "Selects between `pc` and `register` as base address, needed for the `JAL
 name = "next_pc_high"
 type = ["Half", 3]
 desc = "The upper part of the next pc"
+pad = 0 # TODO(#128): improve handling for arrays
 
 [[variables.output]]
 name = "next_pc_low"
 type = ["Byte", 2]
 desc = "The lower part of the next pc"
+pad = 0
 
 
 # Auxiliary
@@ -43,6 +49,7 @@ desc = "The lower part of the next pc"
 name = "unmasked_low_byte"
 type = "Byte"
 desc = "The low byte of the next pc, before masking the LSB. Used to constraint the raw addition."
+pad = 0
 
 
 # Virtual
@@ -72,6 +79,7 @@ def = {idx = "i", polys = [
 name = "μ"
 type = "Bit"
 desc = ""
+pad = 0
 
 
 [[assumptions]]
diff --git a/spec/src/config.toml b/spec/src/config.toml
index 389e4b16a..68f1683de 100644
--- a/spec/src/config.toml
+++ b/spec/src/config.toml
@@ -125,6 +125,7 @@ desc = """\
 label = "Timestamp"
 subtypes = ["DWordWL"]
 desc = "A preprocessed column holding timestamps as `DWordWL`. Row `i` of the column contains the value $2^2 dot (i + 1)$. Used in the CPU chip, see there for more details about the magic number."
+preprocessed = true
 
 [variables.categories]
 all = ["input", "output", "auxiliary", "virtual", "multiplicity", "condition"]
diff --git a/spec/src/lt.toml b/spec/src/lt.toml
index 1a441c2b3..0ee06abc9 100644
--- a/spec/src/lt.toml
+++ b/spec/src/lt.toml
@@ -7,16 +7,19 @@ name = "LT"
 name = "lhs"
 type = "DWordHHW"
 desc = "The left operand"
+pad = 0
 
 [[variables.input]]
 name = "rhs"
 type = "DWordHHW"
 desc = "The right operand"
+pad = 0
 
 [[variables.input]]
 name = "signed"
 type = "Bit"
 desc = "whether to interpret `lhs` and `rhs` as signed integers (1) or not (0)"
+pad = 0
 
 # Output
 
@@ -24,6 +27,7 @@ desc = "whether to interpret `lhs` and `rhs` as signed integers (1) or not (0)"
 name = "lt"
 type = "Bit"
 desc = "Whether $#`lhs` < #`rhs`$, taking `signed` into account"
+pad = 0
 
 
 # Auxiliary
@@ -32,16 +36,19 @@ desc = "Whether $#`lhs` < #`rhs`$, taking `signed` into account"
 name = "lhs_sub_rhs"
 type = "DWordHL"
 desc = "$#`lhs` - #`rhs`$"
+pad = 0
 
 [[variables.auxiliary]]
 name = "lhs_msb"
 type = "Bit"
 desc = "The most significant bit of `lhs`"
+pad = 0
 
 [[variables.auxiliary]]
 name = "rhs_msb"
 type = "Bit"
 desc = "The most significant bit of `rhs`"
+pad = 0
 
 # Virtual
 
@@ -67,6 +74,7 @@ def = ["idx", "carry", 1]
 name = "μ"
 type = "Bit"
 desc = ""
+pad = 0
 
 
 [[assumptions]]
diff --git a/spec/src/shift.toml b/spec/src/shift.toml
index e2ddfa12b..ad6172af8 100644
--- a/spec/src/shift.toml
+++ b/spec/src/shift.toml
@@ -6,26 +6,31 @@ name = "SHIFT"
 name = "in"
 type = "DWordHL"
 desc = "The value being shifted"
+pad = 0
 
 [[variables.input]]
 name = "shift"
 type = "Byte"
 desc = "Number of bits to shift `in` by."
+pad = 0
 
 [[variables.input]]
 name = "direction"
 type = "Bit"
 desc = "Whether to shift left (0) or right (1)."
+pad = 0
 
 [[variables.input]]
 name = "signed"
 type = "Bit"
 desc = "Whether to interpret `in` as a signed integer."
+pad = 0
 
 [[variables.input]]
 name = "word_instr"
 type = "Bit"
 desc = "Whether this is a Word-instruction (1) or not (0)."
+pad = 0
 
 
 # Output
@@ -34,6 +39,7 @@ desc = "Whether this is a Word-instruction (1) or not (0)."
 name = "out"
 type = "DWordWL"
 desc = "$#`in <</>>/>>>` (#`shift` mod 32 dot (2 - #`word_instr`))$"
+pad = 0
 
 # Auxiliary
 
@@ -41,31 +47,37 @@ desc = "$#`in <</>>/>>>` (#`shift` mod 32 dot (2 - #`word_instr`))$"
 name = "is_negative"
 type = "Bit"
 desc = "Whether `in` is negative"
+pad = 0
 
 [[variables.auxiliary]]
 name = "bit_shift"
 type = "Byte"
 desc = "Value by which to shift `in` to obtain `X` and `Y`"
+pad = 0
 
 [[variables.auxiliary]]
 name = "zbs"
 type = "Bit"
 desc = "Whether `bit_shift` is zero (1) or not (0)."
+pad = 1
 
 [[variables.auxiliary]]
 name = "X"
 type = ["Half", 5]
 desc = "scratch variable."
+pad = 0 # TODO: array
 
 [[variables.auxiliary]]
 name = "Y"
 type = ["Half", 4]
 desc = "scratch variable."
+pad = 0 # TODO: array
 
 [[variables.auxiliary]]
 name = "limb_shift"
 type = ["Bit", 4]
 desc = "One-hot vector indicating whether $floor.l #`shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $#`word_instr` = 1$ and $4$ otherwise."
+pad = 0 # TODO: array
 
 # Virtual
 
@@ -114,6 +126,7 @@ def = {idx="i", range=[0, 3], poly=["+", ["*", "left", ["sum", ["=", "j", 0], "i
 name = "μ"
 type = "Bit"
 desc = ""
+pad = 0
 
 
 

From eb3297a6d34e82bee9c8d4c862ffca23f10af1b4 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 13 Jan 2026 16:34:38 +0100
Subject: [PATCH 028/105] spec: update range specifications to iters concept
 (#130)

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/chip.typ        | 88 ++++++++++++++++++++++++++++++--------------
 spec/expr.typ        | 14 ++++++-
 spec/src/add.toml    | 14 +++----
 spec/src/branch.toml | 14 +++----
 spec/src/cpu.toml    | 14 +++----
 spec/src/lt.toml     | 12 +++---
 spec/src/shift.toml  | 22 +++++------
 7 files changed, 111 insertions(+), 67 deletions(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index ab709c404..4e7d6a143 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -24,6 +24,31 @@
   .sum()
 }
 
+// Return a list of iterators needed by `obj`. Taken from `iters` or `iter`.
+// Prepend `name` to every iterator, if given.
+#let iters_of(obj, name: none) = {
+  let clean_iter(it) = {
+    let arr = if type(it) == array {
+      it
+    } else {
+      (it,)
+    }
+    if name != none {
+      (name,) + arr
+    } else {
+      arr
+    }
+  }
+
+  (if "iters" in obj {
+    obj.iters
+  } else if "iter" in obj {
+    (obj.iter,)
+  } else {
+    ()
+  }).map(clean_iter)
+}
+
 #let render_chip_padding_table(chip, config) = {
   // Whether `var` is a preprocessed variable.
   let is_preprocessed(var) = {
@@ -53,19 +78,17 @@
 /// Generates a table listing `chip`'s columns.
 #let render_chip_column_table(chip, config) = {
 
-  // Render a definition's range
-  let render_def_range(idx, range) = {
-    if type(range) == array {
-      if range.len() == 1 {
-        [#raw(idx) `=` #range.at(0)]
-      } else if range.len() == 2 {
-        [#raw(idx) #sym.in `[`#range.at(0)`,`#range.at(1)`]`]
+  // Render a definition's iterators: `(name, value)` as `name = value`, `(name, lo, hi)` as `name ∈ [lo, hi]`
+  let render_def_iters(iters) = {
+    (..for (name, ..args) in iters {
+      if args.len() == 1 {
+        ([#raw(name) = #expr_to_code(args.at(0))],)
+      } else if args.len() == 2 {
+        ([#raw(name) #sym.in `[`#expr_to_code(args.at(0)), #expr_to_code(args.at(1))`]`],)
       } else {
-        assert(false, message: "invalid range: " + repr(range) + repr(range.len()))
+        assert(false, message: "Invalid def range: " + repr((name, ..args))) // `repr` takes a single value, so pass the iterator as one array
       }
-    } else {
-      [#raw(idx) `=` #range]
-    }
+    }).join("\n")
   }
 
   // Render definition `def`
@@ -80,25 +103,38 @@
 
     assert(type(def) == dictionary, message: "invalid definition: " + repr(def))
 
+    let idx = def.at("idx", default: none)
+    let gather_indices(obj) = iters_of(obj, name: idx).map(it => it.first())
+    let index_all(expr, indices) = {
+      for index in indices {
+        expr = ("idx", expr, index)
+      }
+      expr
+    }
+
     if "poly" in def {
       (
         [],
         table.cell(align: right, emph[definition]), 
-        expr_to_math((":=", ("idx", var_name, def.idx), def.poly)),
-        render_def_range(def.idx, def.range)
+        expr_to_math((":=", index_all(var_name, gather_indices(def)), def.poly)),
+        render_def_iters(iters_of(def, name: idx))
       )
     } else if "polys" in def {
+      assert(
+        def.polys.map(gather_indices).dedup().len() == 1,
+        message: "Can only do multiple polys if they're indexed identically"
+      )
       (
         [],
         table.cell(align: right, emph[definition]), 
-        table.cell(colspan: 2, expr_to_math(("idx", var_name, def.idx)))
+        table.cell(colspan: 2, expr_to_math(index_all(var_name, gather_indices(def.polys.first()))))
       )
       for (i, poly) in def.polys.enumerate() {
         (
           [],
           [],              
           table.cell(inset: (left: 1.5em), expr_to_math((":=", "", poly.poly))),
-          render_def_range(def.idx, poly.range), 
+          render_def_iters(iters_of(poly, name: idx)),
         )
       }
     } else {
@@ -140,11 +176,9 @@
   }
 }
 
-// Render a range if `obj` contains one.
-#let interval(obj) = {
-  if "range" in obj {
-    [#raw(obj.range.at(0)) #sym.in` [`#obj.range.at(1)`,`#obj.range.at(2)`]`]
-  } else { return [] }
+// Render the iterators of `obj`.
+#let iters(obj) = {
+  iters_of(obj).map(iter => [#raw(iter.at(0)) #sym.in `[`#expr_to_code(iter.at(1)), #expr_to_code(iter.at(2))`]`]).join("\n")
 }
 
 #let args_interaction_like(input, output) = {
@@ -157,9 +191,9 @@
 
 #let render_chip_assumptions(chip, config) = {
   let tag(assumption) = {
-    let index = if "range" in assumption { "." + assumption.range.at(0) } else { "" }
+    let with_index(x) = ((x,) + iters_of(assumption).map(it => it.at(0))).join(".")
     let lbl = [#chip.name\-A]
-    show figure: (it) => align(left, block[#lbl#context it.counter.display()#index])
+    show figure: (it) => align(left, block[#lbl#context with_index(it.counter.display())])
     cref(assumption)[#figure(kind: chip.name + "assumption", numbering: (i) => [#lbl#i], supplement: [], [])]
   }
 
@@ -171,7 +205,7 @@
     table.header([*Tag*], [*Range*], [*Description*]),
     table.hline(stroke: stroke(thickness: 2pt)),
     ..for assumption in chip.assumptions {
-      ([#tag(assumption)], [#interval(assumption)], [#eval(assumption.desc, mode: "markup")])
+      ([#tag(assumption)], [#iters(assumption)], [#eval(assumption.desc, mode: "markup")])
     },
   ), caption: [Assumption overview of #chip.name chip.])
 }
@@ -193,10 +227,10 @@
 
   /// Render the contraint's tag.
   let tag(constraint, group) = {
-    let index = if "range" in constraint { "." + constraint.range.at(0) } else { "" }
+    let with_index(x) = ((x,) + iters_of(constraint).map(it => it.at(0))).join(".")
     let prefix = if "prefix" in group { group.prefix }
     let lbl = [#chip.name\-C#prefix]
-    show figure: (it) => align(left, block[#lbl#context it.counter.display()#index])
+    show figure: (it) => align(left, block[#lbl#context with_index(it.counter.display())])
     cref(constraint)[#figure(kind: chip.name + "constraint", numbering: (i) => [#lbl#i], supplement: [], [])]
   }
 
@@ -249,7 +283,7 @@
 
   // Whether there is at least one constraint with a range
   // This can be used to see whether the "Range" label should be displayed
-  let do_display_range = selected_constraints.values().flatten().any(x => "range" in x)
+  let do_display_range = selected_constraints.values().flatten().any(x => iters_of(x).len() > 0)
 
   // Whether there is at least one constraint with a multiplicity
   // This can be used to see whether the "Multiplicity" label should be displayed
@@ -272,7 +306,7 @@
       for constraint in group_constraints {
         (
           [#tag(constraint, lookup_group(group))],
-          [#interval(constraint)],
+          [#iters(constraint)],
           [#repr_constraint(constraint)],
           [#expr_to_math(constraint.at("multiplicity", default: ""))],
         )
diff --git a/spec/expr.typ b/spec/expr.typ
index 1044001e8..bf705b462 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -129,10 +129,22 @@
   }
 }
 
+#let flat_idxs(expr) = {
+  if expr.at(0) != "idx" {
+    (expr, ())
+  } else {
+    let (sub, gathered) = flat_idxs(expr.at(1))
+    (sub, gathered + (expr.at(2),))
+  }
+}
+
 // Typeset an expression as math
 #let expr_to_math = make_expr_formatter(
   (
-    "idx": (pp, rec, e) => $#rec(PREC.idx, e.at(1))_(#rec(PREC.idx, e.at(2)))$,
+    "idx": (pp, rec, e) => {
+      let (val, idxs) = flat_idxs(e)
+      $#rec(PREC.idx, val)_(#idxs.map(idx => rec(PREC.idx, idx)).join($, $))$
+    },
     "not": (pp, rec, e) => mwrap($1 - #rec(PREC.not, e.at(1))$, pp < PREC.not),
     "+": (pp, rec, e) => mwrap($#e.slice(1).map(rec.with(PREC.add)).join($+$)$, pp < PREC.add),
     "sum": (pp, rec, e) => {
diff --git a/spec/src/add.toml b/spec/src/add.toml
index a0ccf6942..c928a8b32 100644
--- a/spec/src/add.toml
+++ b/spec/src/add.toml
@@ -27,26 +27,25 @@ name = "carry"
 desc = "Carry values used to constrain the addition"
 type = ["Bit", 2]
 def = {idx="i", polys=[
-    {range=0, poly=["*", ["^", 2, -32], ["-", ["+", ["idx", "lhs", 0], ["idx", "rhs", 0]], ["idx", "sum", 0]]]},
-    {range=1, poly=["*", ["^", 2, -32], ["-", ["+", ["idx", "lhs", 1], ["idx", "rhs", 1], ["idx", "carry", 0]], ["idx", "sum", 1]]]},
+    {iter=0, poly=["*", ["^", 2, -32], ["-", ["+", ["idx", "lhs", 0], ["idx", "rhs", 0]], ["idx", "sum", 0]]]},
+    {iter=1, poly=["*", ["^", 2, -32], ["-", ["+", ["idx", "lhs", 1], ["idx", "rhs", 1], ["idx", "carry", 0]], ["idx", "sum", 1]]]},
 ]}
 
-
 # Assumptions
 
 [[assumptions]]
 desc = "`IS_WORD[lhs[i]]`"
-range = ["i", 0, 1]
+iter = ["i", 0, 1]
 ref = "add:a:lhs"
 
 [[assumptions]]
 desc = "`IS_WORD[rhs[i]]`"
-range = ["i", 0, 1]
+iter = ["i", 0, 1]
 ref = "add:a:rhs"
 
 [[assumptions]]
 desc = "`IS_WORD[sum[i]]`"
-range = ["i", 0, 1]
+iter = ["i", 0, 1]
 ref = "add:a:sum"
 
 # Constraints
@@ -58,7 +57,6 @@ name = "all"
 kind = "template"
 tag = "IS_BIT"
 input = [["idx", "carry", "i"]]
-range = ["i", 0, 1]
+iter = ["i", 0, 1]
 cond = "cond"
 ref = "add:c:carry"
-
diff --git a/spec/src/branch.toml b/spec/src/branch.toml
index d6620dcfd..e66974c8e 100644
--- a/spec/src/branch.toml
+++ b/spec/src/branch.toml
@@ -59,8 +59,8 @@ name = "next_pc_unmasked"
 type = "DWordWL"
 desc = "The combination of `next_pc_high`, `next_pc_low[1]` and `unmasked_low_byte` to constrain the addition. This is the computed value for the next pc, before masking off the LSB as required by the ISA."
 def = {idx = "i", polys = [
-  {range = [0], poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 0]], ["*", ["^", 2, 8], ["idx", "next_pc_low", 1]], ["idx", "unmasked_low_byte", 0]]},
-  {range = [1], poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 2]], ["idx", "next_pc_high", 1]]},
+  {iter = 0, poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 0]], ["*", ["^", 2, 8], ["idx", "next_pc_low", 1]], ["idx", "unmasked_low_byte", 0]]},
+  {iter = 1, poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 2]], ["idx", "next_pc_high", 1]]},
 ]}
 
 [[variables.virtual]]
@@ -68,8 +68,8 @@ name = "next_pc"
 type = "DWordWL"
 desc = "The computed next pc, after masking off the LSB as required by the ISA."
 def = {idx = "i", polys = [
-  {range = [0], poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 0]], ["*", ["^", 2, 8], ["idx", "next_pc_low", 1]], ["idx", "next_pc_low", 0]]},
-  {range = [1], poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 2]], ["idx", "next_pc_high", 1]]},
+  {iter = 0, poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 0]], ["*", ["^", 2, 8], ["idx", "next_pc_low", 1]], ["idx", "next_pc_low", 0]]},
+  {iter = 1, poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 2]], ["idx", "next_pc_high", 1]]},
 ]}
 
 
@@ -84,14 +84,14 @@ pad = 0
 
 [[assumptions]]
 desc = "`pc` is range checked, `IS_WORD[pc[i]]`"
-range = ["i", 0, 1]
+iter = ["i", 0, 1]
 
 [[assumptions]]
 desc = "`offset` is range checked, `IS_WORD[offset]`"
 
 [[assumptions]]
 desc = "`register` is range checked, `IS_WORD[register[i]]`"
-range = ["i", 0, 1]
+iter = ["i", 0, 1]
 
 [[assumptions]]
 desc = "`IS_BIT<JALR>`"
@@ -132,7 +132,7 @@ multiplicity = "μ"
 kind = "interaction"
 tag = "IS_HALFWORD"
 input = [["idx", "next_pc_high", "i"]]
-range = ["i", 0, 2]
+iter = ["i", 0, 2]
 multiplicity = "μ"
 
 
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index 562a657d0..747497d44 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -456,19 +456,19 @@ input = ["rd"]
 kind = "interaction"
 tag = "IS_BYTE"
 input = [["idx", "arg1", "i"]]
-range = ["i", 0, 7]
+iter = ["i", 0, 7]
 
 [[constraints.range]]
 kind = "interaction"
 tag = "IS_BYTE"
 input = [["idx", "arg2", "i"]]
-range = ["i", 0, 7]
+iter = ["i", 0, 7]
 
 [[constraints.range]]
 kind = "interaction"
 tag = "IS_BYTE"
 input = [["idx", "res", "i"]]
-range = ["i", 0, 7]
+iter = ["i", 0, 7]
 
 
 [[constraint_groups]]
@@ -501,7 +501,7 @@ multiplicity = ["+", "SLT", "BLT"]
 kind = "arith"
 constraint = "$#`SLT` + #`BLT` => #`res[i]` = 0$"
 poly = ["*", ["+", "SLT", "BLT"], ["idx", "res", "i"]]
-range = ["i", 1, 7]
+iter = ["i", 1, 7]
 
 [[constraints.alu]]
 kind = "interaction"
@@ -509,7 +509,7 @@ tag = "AND_BYTE"
 input = [["idx", "arg1", "i"], ["idx", "arg2", "i"]]
 output = ["idx", "res", "i"]
 multiplicity = "AND"
-range = ["i", 0, 7]
+iter = ["i", 0, 7]
 
 [[constraints.alu]]
 kind = "interaction"
@@ -517,7 +517,7 @@ tag = "OR_BYTE"
 input = [["idx", "arg1", "i"], ["idx", "arg2", "i"]]
 output = ["idx", "res", "i"]
 multiplicity = "OR"
-range = ["i", 0, 7]
+iter = ["i", 0, 7]
 
 [[constraints.alu]]
 kind = "interaction"
@@ -525,7 +525,7 @@ tag = "XOR_BYTE"
 input = [["idx", "arg1", "i"], ["idx", "arg2", "i"]]
 output = ["idx", "res", "i"]
 multiplicity = "XOR"
-range = ["i", 0, 7]
+iter = ["i", 0, 7]
 
 [[constraints.alu]]
 kind = "interaction"
diff --git a/spec/src/lt.toml b/spec/src/lt.toml
index 0ee06abc9..3836cdd13 100644
--- a/spec/src/lt.toml
+++ b/spec/src/lt.toml
@@ -57,8 +57,8 @@ name = "carry"
 type = ["Bit", 2]
 desc = "The carry for adding `lhs_sub_rhs` back to `rhs`"
 def = {idx = "i", polys = [
-  {range = [0], poly = ["*", ["^", 2, -32], ["-", ["+", ["idx", "rhs", 0], ["idx", ["cast", "lhs_sub_rhs", "DWordWL"], 0]], ["idx", "lhs", 0]]]},
-  {range = [1], poly = ["*", ["^", 2, -32], ["-", ["+", ["idx", ["cast", "rhs", "DWordWL"], 1], ["idx", ["cast", "lhs_sub_rhs", "DWordWL"], 1], ["idx", "carry", 0]], ["idx", ["cast", "lhs", "DWordWL"], 1]]]},
+  {iter = 0, poly = ["*", ["^", 2, -32], ["-", ["+", ["idx", "rhs", 0], ["idx", ["cast", "lhs_sub_rhs", "DWordWL"], 0]], ["idx", "lhs", 0]]]},
+  {iter = 1, poly = ["*", ["^", 2, -32], ["-", ["+", ["idx", ["cast", "rhs", "DWordWL"], 1], ["idx", ["cast", "lhs_sub_rhs", "DWordWL"], 1], ["idx", "carry", 0]], ["idx", ["cast", "lhs", "DWordWL"], 1]]]},
 ]}
 
 [[variables.virtual]]
@@ -79,12 +79,12 @@ pad = 0
 
 [[assumptions]]
 desc = "`IS_HALFWORD[lhs[i]]` and `IS_WORD[lhs[0]]`"
-range = ["i", 1, 2]
+iter = ["i", 1, 2]
 ref = "lt:a:range_lhs"
 
 [[assumptions]]
 desc = "`IS_HALFWORD[rhs[i]]` and `IS_WORD[rhs[0]]`"
-range = ["i", 1, 2]
+iter = ["i", 1, 2]
 ref = "lt:a:range_rhs"
 
 [[assumptions]]
@@ -128,13 +128,13 @@ desc = "Constrain the subtraction"
 kind = "template"
 tag = "IS_BIT"
 input = [["idx", "carry", "i"]]
-range = ["i", 0, 1]
+iter = ["i", 0, 1]
 
 [[constraints.sub]]
 kind = "interaction"
 tag = "IS_HALFWORD"
 input = [["idx", "lhs_sub_rhs", "i"]]
-range = ["i", 0, 3]
+iter = ["i", 0, 3]
 multiplicity = "μ"
 ref = "lt:c:lhs_sub_rhs_range"
 
diff --git a/spec/src/shift.toml b/spec/src/shift.toml
index ad6172af8..4b7044e7d 100644
--- a/spec/src/shift.toml
+++ b/spec/src/shift.toml
@@ -104,21 +104,21 @@ name = "intra_limb_left"
 type = "DWordHL"
 desc = "`in << (shift % 16)` if `left`"
 def = {idx="i", polys=[
-    {range=0, poly=["idx", "X", 0]},
-    {range=[1, 3], poly=["+", ["idx", "X", "i"], ["idx", "Y", ["-", "i", 1]]]},
+    {iter=0, poly=["idx", "X", 0]},
+    {iter=[1, 3], poly=["+", ["idx", "X", "i"], ["idx", "Y", ["-", "i", 1]]]},
 ]}
 
 [[variables.virtual]]
 name = "intra_limb_right"
 type = "DWordHL"
 desc = "`in >>> (shift % 16)` if `right` and `signed`;\\ `in >> (shift % 16)` if `right` and `!signed`"
-def = {idx="i", range=[0, 3], poly=["+", ["idx", "Y", "i"], ["idx", "X", ["+", "i", 1]]]}
+def = {idx="i", iter=[0, 3], poly=["+", ["idx", "Y", "i"], ["idx", "X", ["+", "i", 1]]]}
 
 [[variables.virtual]]
 name = "shifted"
 type = "DWordHL"
 desc = "$#`in <</>>/>>>` (#`shift` mod 32 dot (2 - #`word_instr`))$"
-def = {idx="i", range=[0, 3], poly=["+", ["*", "left", ["sum", ["=", "j", 0], "i", ["*", ["idx", "limb_shift", "j"], ["idx", "intra_limb_left", ["-", "i", "j"]]]]], ["*", "right", ["+", ["sum", ["=", "j", 0], ["-", 3, "i"], ["*", ["idx", "limb_shift", "j"], ["idx", "intra_limb_right", ["+", "i", "j"]]]], ["*", "extension", ["sum", ["=", "j", ["-", 3, "i"]], 3, ["idx", "limb_shift", "j"]]]]]]}
+def = {idx="i", iter=[0, 3], poly=["+", ["*", "left", ["sum", ["=", "j", 0], "i", ["*", ["idx", "limb_shift", "j"], ["idx", "intra_limb_left", ["-", "i", "j"]]]]], ["*", "right", ["+", ["sum", ["=", "j", 0], ["-", 3, "i"], ["*", ["idx", "limb_shift", "j"], ["idx", "intra_limb_right", ["+", "i", "j"]]]], ["*", "extension", ["sum", ["=", "j", ["-", 3, "i"]], 3, ["idx", "limb_shift", "j"]]]]]]}
 
 # Multiplicities
 
@@ -134,7 +134,7 @@ pad = 0
 
 [[assumptions]]
 desc = "`IS_HALFWORD[in[i]]`"
-range = ["i", 0, 3]
+iter = ["i", 0, 3]
 ref = "shift:a:range_in"
 
 [[assumptions]]
@@ -214,7 +214,7 @@ kind = "interaction"
 tag = "HWSL"
 input = [["idx", "in", "i"], "bit_shift"]
 output = ["idx", "X", "i"]
-range = ["i", 0, 3]
+iter = ["i", 0, 3]
 ref = "shift:c:hwsl_if_not_zero"
 multiplicity = ["not", "zbs"]
 
@@ -222,7 +222,7 @@ multiplicity = ["not", "zbs"]
 kind = "arith"
 constraint = "$#`zbs` => #`X[i]` = #`in[i]` dot #`left`$"
 poly = ["*", "zbs", ["-", ["idx", "X", "i"], ["*", ["idx", "in", "i"], "left"]]]
-range = ["i", 0, 3]
+iter = ["i", 0, 3]
 ref = "shift:c:zbs_implies_X"
 
 [[constraints.intra_limb_shift]]
@@ -244,7 +244,7 @@ kind = "interaction"
 tag = "HWSLC"
 input = [["idx", "in", "i"], "bit_shift"]
 output = ["idx", "Y", "i"]
-range = ["i", 0, 3]
+iter = ["i", 0, 3]
 ref = "shift:c:hwslc_if_not_zero"
 multiplicity = ["not", "zbs"]
 
@@ -252,7 +252,7 @@ multiplicity = ["not", "zbs"]
 kind = "arith"
 constraint = "$#`zbs` => #`Y[i]` = #`in[i]` dot #`right`$"
 poly = ["*", "zbs", ["-", ["idx", "Y", "i"], ["*", ["idx", "in", "i"], "right"]]]
-range = ["i", 0, 3]
+iter = ["i", 0, 3]
 ref = "shift:c:zbs_implies_Y"
 
 
@@ -263,7 +263,7 @@ name = "limb_shifting"
 kind = "template"
 tag = "IS_BIT"
 input = [["idx", "limb_shift", "i"]]
-range = ["i", 0, 3]
+iter = ["i", 0, 3]
 ref = "shift:c:limb_shift_is_bit"
 
 [[constraints.limb_shifting]]
@@ -278,7 +278,7 @@ multiplicity = "μ"
 kind = "arith"
 constraint = "$#`out[:2]` = #`shifted[:4]`$"
 poly = ["-", ["idx", "out", "i"], ["idx", ["cast", "shifted", "DWordWL"], "i"]]
-range = ["i", 0, 1]
+iter = ["i", 0, 1]
 ref = "shift:c:out_eq_shifted"
 
 

From 22fa781b300246c7c98117227e54ebe5e5c88e50 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Wed, 14 Jan 2026 10:12:24 +0100
Subject: [PATCH 029/105] spec: `BITWISE` chip (#138)

* spec: introduce BITWISE

* spec: BITWISE: outline optimizations

* spec: BITWISE: fix SLL naming mismatch

* spec: BITWISE: fix length computation mistake

* spec: drop `dot` in `expr_to_code` when multiplying constant with single-letter variable
---
 spec/bitwise.typ      |  44 ++++++++++
 spec/book.typ         |   1 +
 spec/expr.typ         |  10 ++-
 spec/src/bitwise.toml | 200 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 254 insertions(+), 1 deletion(-)
 create mode 100644 spec/bitwise.typ
 create mode 100644 spec/src/bitwise.toml

diff --git a/spec/bitwise.typ b/spec/bitwise.typ
new file mode 100644
index 000000000..34ec6dd10
--- /dev/null
+++ b/spec/bitwise.typ
@@ -0,0 +1,44 @@
+#import "/book.typ": book-page, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_column_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/bitwise.toml", config)
+
+#let bitwise = raw(chip.name)
+
+#show: book-page.with(title: "BITWISE chip")
+
+= #bitwise chip
+
+== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+#let nr_precomputed = ("input", "output").map(c => chip.variables.at(c)).flatten().len()
+
+The #bitwise chip is comprised of #nr_variables variables that are expressed using #nr_columns columns.
+Of these, the _input_ and _output_ variables (#nr_precomputed in total) are precomputed.
+#render_chip_column_table(chip, config)
+
+*Note*: This table contains one row for every possible value of `(X, Y, Z)`.
+As such, it has length $2^8 dot 2^8 dot 2^4 = 2^(20)$.
+
+== Lookup
+This chip adds the following interactions to the lookup:
+#render_constraint_table(chip, config)
+
+== Areas of Optimization
+The following ideas may prove to be optimizations for the #bitwise chip:
++ Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. 
+  When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`.
++ Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`.
+  Note: currently, `MSB8` also implicitly range checks the input `X` (the lookup fails if `X` is not a `Byte`).
+  This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check.
++ Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, `ZERO`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`) lookups in separate tables.
++ Combine `HWSL` and `HWSLC` into a single lookup (see also \#119).
diff --git a/spec/book.typ b/spec/book.typ
index 01a362879..1bf944862 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -13,6 +13,7 @@
     #chapter("branch.typ")[BRANCH]
     #chapter("lt.typ")[LT]
     #chapter("mul.typ")[MUL chip]
+    #chapter("bitwise.typ")[BITWISE]
   ]
 )
 
diff --git a/spec/expr.typ b/spec/expr.typ
index bf705b462..751493619 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -95,7 +95,15 @@
     "not": (pp, rec, e) => cwrap(`1 - ` + rec(PREC.not, e.at(1)), pp < PREC.not),
     "+": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(PREC.add)).join(` + `), pp < PREC.add),
     "sum": (pp, rec, e) => assert(false, message: "sum is unsupported in code."),
-    "*": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(PREC.mul)).join(` ` + sym.dot + ` `), pp < PREC.mul),
+    "*": (pp, rec, e) => {
+      // A constant times a one-letter variable is rendered without the "dot" (e.g. `256Y`).
+      // `clusters()` counts user-perceived characters; `len()` counts UTF-8 bytes, so a name like `μ` would not match.
+      if e.len() == 3 and type(e.at(1)) == int and type(e.at(2)) == str and e.at(2).clusters().len() == 1 {
+        cwrap(e.slice(1).map(rec.with(PREC.mul)).join(``), pp < PREC.mul)
+      } else {
+        cwrap(e.slice(1).map(rec.with(PREC.mul)).join(` ` + sym.dot + ` `), pp < PREC.mul)
+      }
+    },
     "/": (pp, rec, e) => cwrap(rec(PREC.div, e.at(1)), pp < PREC.div) + ` / ` + rec(PREC.div, e.at(2)),
     "^": (pp, rec, e) => {
       assert(type(e.at(1)) == int and type(e.at(2)) == int, message: "Can only exponentiate constants")
diff --git a/spec/src/bitwise.toml b/spec/src/bitwise.toml
new file mode 100644
index 000000000..2eeec4059
--- /dev/null
+++ b/spec/src/bitwise.toml
@@ -0,0 +1,200 @@
+name = "BITWISE"
+
+[[variables.input]]
+name = "X"
+type = "Byte"
+desc = ""
+precomputed = "true"
+
+[[variables.input]]
+name = "Y"
+type = "Byte"
+desc = ""
+precomputed = "true"
+
+[[variables.input]]
+name = "Z"
+type = "B4"
+desc = ""
+precomputed = "true"
+
+[[variables.output]]
+name = "AND"
+type = "Byte"
+desc = "the binary AND of `X` and `Y`"
+precomputed = "true"
+
+[[variables.output]]
+name = "OR"
+type = "Byte"
+desc = "the binary OR of `X` and `Y`"
+precomputed = "true"
+
+[[variables.output]]
+name = "XOR"
+type = "Byte"
+desc = "the binary XOR of `X` and `Y`"
+precomputed = "true"
+
+[[variables.output]]
+name = "MSB8"
+type = "Bit"
+desc = "the most significant bit of `X`"
+precomputed = "true"
+
+[[variables.output]]
+name = "MSB16"
+type = "Bit"
+desc = "the most significant bit of `Y`"
+precomputed = "true"
+
+[[variables.output]]
+name = "ZERO"
+type = "Bit"
+desc = "whether $#`X` = 0 and #`Y` = 0$"
+precomputed = "true"
+
+[[variables.output]]
+name = "SLL"
+type = "Half"
+desc = "`X||Y` logically left-shifted by `Z`: $((#`X` + 256#`Y`) #`<<` #`Z`) mod 2^16$"
+precomputed = "true"
+
+[[variables.output]]
+name = "SLLC"
+type = "Half"
+desc = "`X||Y` logically right-shifted by `16 - Z`: $(#`X` + 256#`Y`) #`>>` (16 - #`Z`)$"
+precomputed = "true"
+
+[[variables.multiplicity]]
+name = "μ_AND"
+type = "BaseField"
+desc = ""
+
+[[variables.multiplicity]]
+name = "μ_OR"
+type = "BaseField"
+desc = ""
+
+[[variables.multiplicity]]
+name = "μ_XOR"
+type = "BaseField"
+desc = ""
+
+[[variables.multiplicity]]
+name = "μ_MSB8"
+type = "BaseField"
+desc = ""
+
+[[variables.multiplicity]]
+name = "μ_MSB16"
+type = "BaseField"
+desc = ""
+
+[[variables.multiplicity]]
+name = "μ_ZERO"
+type = "BaseField"
+desc = ""
+
+[[variables.multiplicity]]
+name = "μ_IS_BYTE"
+type = "BaseField"
+desc = ""
+
+[[variables.multiplicity]]
+name = "μ_IS_HALF"
+type = "BaseField"
+desc = ""
+
+[[variables.multiplicity]]
+name = "μ_IS_B20"
+type = "BaseField"
+desc = ""
+
+[[variables.multiplicity]]
+name = "μ_HWSL"
+type = "BaseField"
+desc = ""
+
+[[variables.multiplicity]]
+name = "μ_HWSLC"
+type = "BaseField"
+desc = ""
+
+
+[[constraint_groups]]
+name = "contributions"
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "AND_BYTE"
+input = ["X", "Y"]
+output = "AND"
+multiplicity = ["-", "μ_AND"]
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "OR_BYTE"
+input = ["X", "Y"]
+output = "OR"
+multiplicity = ["-", "μ_OR"]
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "XOR_BYTE"
+input = ["X", "Y"]
+output = "XOR"
+multiplicity = ["-", "μ_XOR"]
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "MSB8"
+input = ["X"]
+output = "MSB8"
+multiplicity = ["-", "μ_MSB8"]
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "MSB16"
+input = [["+", "X", ["*", 256, "Y"]]]
+output = "MSB16"
+multiplicity = ["-", "μ_MSB16"]
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "ZERO"
+input = [["+", "X", ["*", 256, "Y"]]]
+output = "ZERO"
+multiplicity = ["-", "μ_ZERO"]
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = ["X"]
+multiplicity = ["-", "μ_IS_BYTE"]
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["+", "X", ["*", 256, "Y"]]]
+multiplicity = ["-", "μ_IS_HALF"]
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "IS_B20"
+input = [["+", "X", ["*", 256, "Y"], ["*", 65536, "Z"]]]
+multiplicity = ["-", "μ_IS_B20"]
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "HWSL"
+input = [["+", "X", ["*", 256, "Y"]], "Z"]
+output = "SLL"
+multiplicity = ["-", "μ_HWSL"]
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "HWSLC"
+input = [["+", "X", ["*", 256, "Y"]], "Z"]
+output = "SLLC"
+multiplicity = ["-", "μ_HWSLC"]
\ No newline at end of file

From 52d152243b440ca7448224ef1ebbdaa0a7f04dea Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Thu, 15 Jan 2026 14:36:41 +0100
Subject: [PATCH 030/105] spec: Initial inefficient MEMW chip (#104)

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/book.typ      |   1 +
 spec/chip.typ      |   1 +
 spec/memw.typ      |  59 ++++++++++
 spec/src/cpu.toml  |   6 +-
 spec/src/memw.toml | 288 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 352 insertions(+), 3 deletions(-)
 create mode 100644 spec/memw.typ
 create mode 100644 spec/src/memw.toml

diff --git a/spec/book.typ b/spec/book.typ
index 1bf944862..841ffb133 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -11,6 +11,7 @@
     #chapter("cpu.typ")[CPU chip]
     #chapter("shift.typ")[SHIFT chip]
     #chapter("branch.typ")[BRANCH]
+    #chapter("memw.typ")[MEMW]
     #chapter("lt.typ")[LT]
     #chapter("mul.typ")[MUL chip]
     #chapter("bitwise.typ")[BITWISE]
diff --git a/spec/chip.typ b/spec/chip.typ
index 4e7d6a143..b24153098 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -113,6 +113,7 @@
     }
 
     if "poly" in def {
+      // assert(false, message: repr(index_all(var_name, gather_indices(def))))
       (
         [],
         table.cell(align: right, emph[definition]), 
diff --git a/spec/memw.typ b/spec/memw.typ
new file mode 100644
index 000000000..bcf6a64b0
--- /dev/null
+++ b/spec/memw.typ
@@ -0,0 +1,59 @@
+#import "/book.typ": book-page, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_column_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/memw.toml", config)
+
+#show: book-page.with(title: "MEMW chip")
+
+== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+
+The `MEMW` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
+#render_chip_column_table(chip, config)
+
+== Assumptions
+
+#render_chip_assumptions(chip, config)
+
+Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns,
+as these are not necessary for the correctness of this chip in isolation.
+These properties are necessary for the consistency of the system as a whole, and therefore
+we document them here, keeping the type information as a reading aid.
+
+== Constraints
+
+#render_constraint_table(chip, config, groups: "consistency")
+
+As long as `timestamp` is properly range-checked, the presence of `old_timestamp`
+in the memory argument automatically ensures appropriate range checking
+(as long as no external entities provide negative multiplicities without range checking the timestamp).
+This ensures the assumptions for `LT` are satisfied.
+
+We additionally check that the address does not overflow
+for more significant bytes of the access.
+#render_constraint_table(chip, config, groups: "overflow")
+
+The chip adds the following tuples to the lookup argument,
+to effectuate that part of the memory argument.
+#render_constraint_table(chip, config, groups: "memory")
+
+This chip contributes the following to the lookup argument.
+#render_constraint_table(chip, config, groups: "output")
+
+
+== Future optimization ideas
+
+- Fast path for aligned memory access where all bytes have the same old timestamp
+- MEMB chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs)
+- Compute `base_address[1] + 1` once and have high words of `address_add` as Words
+- Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1)
+- Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALFWORD` lookups may make some GKR things faster if there are known zeroes.
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index 747497d44..97db6d6f0 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -38,17 +38,17 @@ desc = "Whether to write back to the destination register"
 [[variables.input]]
 name = "memory_2bytes"
 type = "Bit"
-desc = "Whether the memory access (read or write) touches at least 2 bytes"
+desc = "Whether the memory access (read or write) touches exactly 2 bytes"
 
 [[variables.input]]
 name = "memory_4bytes"
 type = "Bit"
-desc = "Whether the memory access (read or write) touches at least 4 bytes"
+desc = "Whether the memory access (read or write) touches exactly 4 bytes"
 
 [[variables.input]]
 name = "memory_8bytes"
 type = "Bit"
-desc = "Whether the memory access (read or write) touches at least 8 bytes"
+desc = "Whether the memory access (read or write) touches exactly 8 bytes"
 
 # TODO: Are there usecases where it's nicer to just have this as a length constant?
 [[variables.input]]
diff --git a/spec/src/memw.toml b/spec/src/memw.toml
new file mode 100644
index 000000000..9aa9cd592
--- /dev/null
+++ b/spec/src/memw.toml
@@ -0,0 +1,288 @@
+name = "MEMW"
+
+# Input
+
+[[variables.input]]
+name = "is_register"
+type = "Bit"
+desc = "Whether the address represents a register index"
+
+[[variables.input]]
+name = "base_address"
+type = "DWordWL"
+desc = "The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is"
+
+[[variables.input]]
+name = "value"
+type = ["BaseField", 8]
+desc = "The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s"
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "The timestamp at which this memory access is said to occur"
+
+[[variables.input]]
+name = "write2"
+type = "Bit"
+desc = "Whether to write exactly 2 values"
+
+[[variables.input]]
+name = "write4"
+type = "Bit"
+desc = "Whether to write exactly 4 values"
+
+[[variables.input]]
+name = "write8"
+type = "Bit"
+desc = "Whether to write exactly 8 values"
+
+# Output
+
+[[variables.output]]
+name = "old"
+type = ["BaseField", 8]
+desc = """The old value written at `base_address`. See `value` for information about representation.
+Only the elements corresponding to the `writeN` bits are guaranteed"""
+
+# Auxiliary
+
+[[variables.auxiliary]]
+name = "address_add"
+type = ["DWordHL", 7]
+desc = "`address_add[i] = base_address + i + 1`"
+
+[[variables.auxiliary]]
+name = "old_timestamp"
+type = ["DWordWL", 8]
+desc = "The timestamp at which the address was last accessed"
+
+# Virtual
+
+[[variables.virtual]]
+name = "w2"
+type = "Bit"
+desc = "writing at least 2 bytes"
+def = ["+", "write2", "write4", "write8"]
+
+[[variables.virtual]]
+name = "w4"
+type = "Bit"
+desc = "writing at least 4 bytes"
+def = ["+", "write4", "write8"]
+
+[[variables.virtual]]
+name = "μ_sum"
+type = "Bit"
+desc = ""
+def = ["+", "μ_read", "μ_write"]
+
+# Multiplicity
+
+[[variables.multiplicity]]
+name = "μ_read"
+type = "Bit"
+desc = "Whether we are performing a read (and hence return `old`)"
+
+[[variables.multiplicity]]
+name = "μ_write"
+type = "Bit"
+desc = "Whether we are performing a write (and hence do not return `old`)"
+
+
+[[assumptions]]
+desc = "`IS_WORD[base_address[i]]`"
+iter = ["i", 0, 1]
+
+[[assumptions]]
+desc = "`IS_BIT<write2>`"
+
+[[assumptions]]
+desc = "`IS_BIT<write4>`"
+
+[[assumptions]]
+desc = "`IS_BIT<write8>`"
+
+[[assumptions]]
+desc = "`IS_BIT<write2 + write4 + write8>`"
+
+[[assumptions]]
+desc = "`IS_WORD[timestamp[i]]`"
+iter = ["i", 0, 1]
+
+
+[[constraint_groups]]
+name = "consistency"
+
+[[constraints.consistency]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ_sum"]
+
+[[constraints.consistency]]
+kind = "arith"
+constraint = "$#`w2` => #`μ_sum`$"
+poly = ["*", "w2", ["not", "μ_sum"]]
+
+[[constraints.consistency]]
+kind = "template"
+tag = "ADD"
+input = ["base_address", 1]
+output = ["cast", ["idx", "address_add", 0], "DWordWL"]
+multiplicity = "w2"
+
+[[constraints.consistency]]
+kind = "template"
+tag = "ADD"
+input = ["base_address", ["+", "i", 1]]
+output = ["cast", ["idx", "address_add", "i"], "DWordWL"]
+iter = ["i", 1, 2]
+multiplicity = "w4"
+
+[[constraints.consistency]]
+kind = "template"
+tag = "ADD"
+input = ["base_address", ["+", "i", 1]]
+output = ["cast", ["idx", "address_add", "i"], "DWordWL"]
+iter = ["i", 3, 6]
+multiplicity = "write8"
+
+[[constraints.consistency]]
+kind = "interaction"
+tag = "IS_HALFWORD"
+input = [["idx", ["idx", "address_add", "i"], "j"]]
+iters = [
+  ["i", 0, 6],
+  ["j", 0, 3],
+]
+
+[[constraints.consistency]]
+kind = "interaction"
+tag = "LT"
+input = [["idx", "old_timestamp", 0], "timestamp"]
+output = 1
+multiplicity = "μ_sum"
+
+[[constraints.consistency]]
+kind = "interaction"
+tag = "LT"
+input = [["idx", "old_timestamp", 1], "timestamp"]
+output = 1
+multiplicity = "w2"
+
+[[constraints.consistency]]
+kind = "interaction"
+tag = "LT"
+input = [["idx", "old_timestamp", "i"], "timestamp"]
+output = 1
+iter = ["i", 2, 3]
+multiplicity = "w4"
+
+[[constraints.consistency]]
+kind = "interaction"
+tag = "LT"
+input = [["idx", "old_timestamp", "i"], "timestamp"]
+output = 1
+iter = ["i", 4, 7]
+multiplicity = "write8"
+
+
+[[constraint_groups]]
+name = "overflow"
+prefix = "R"
+
+[[constraints.overflow]]
+kind = "interaction"
+tag = "LT"
+input = ["base_address", ["cast", ["idx", "address_add", 0], "DWordWL"]]
+output = 1
+multiplicity = "write2"
+
+[[constraints.overflow]]
+kind = "interaction"
+tag = "LT"
+input = ["base_address", ["cast", ["idx", "address_add", 2], "DWordWL"]]
+output = 1
+multiplicity = "write4"
+
+[[constraints.overflow]]
+kind = "interaction"
+tag = "LT"
+input = ["base_address", ["cast", ["idx", "address_add", 6], "DWordWL"]]
+output = 1
+multiplicity = "write8"
+
+
+[[constraint_groups]]
+name = "memory"
+prefix = "M"
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", "base_address", ["idx", "old_timestamp", 0], ["idx", "old", 0]]
+multiplicity = "μ_sum"
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", "base_address", "timestamp", ["idx", "value", 0]]
+multiplicity = ["-", "μ_sum"]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["idx", "address_add", 0], ["idx", "old_timestamp", 1], ["idx", "old", 1]]
+multiplicity = "w2"
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["idx", "address_add", 0], "timestamp", ["idx", "value", 1]]
+multiplicity = ["-", "w2"]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["idx", "address_add", ["-", "i", 1]], ["idx", "old_timestamp", "i"], ["idx", "old", "i"]]
+multiplicity = "w4"
+iter = ["i", 2, 3]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["idx", "address_add", ["-", "i", 1]], "timestamp", ["idx", "value", "i"]]
+multiplicity = ["-", "w4"]
+iter = ["i", 2, 3]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["idx", "address_add", ["-", "i", 1]], ["idx", "old_timestamp", "i"], ["idx", "old", "i"]]
+multiplicity = "write8"
+iter = ["i", 4, 7]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["idx", "address_add", ["-", "i", 1]], "timestamp", ["idx", "value", "i"]]
+multiplicity = ["-", "write8"]
+iter = ["i", 4, 7]
+
+
+[[constraint_groups]]
+name = "output"
+prefix = "O"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "MEMW"
+input = ["is_register", "base_address", "value", "timestamp", "write2", "write4", "write8"]
+output = "old"
+multiplicity = "μ_read"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "MEMW"
+input = ["is_register", "base_address", "value", "timestamp", "write2", "write4", "write8"]
+multiplicity = "μ_write"

From 11cd790e747b344c58e6e3ff39513b0258b7916e Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Thu, 15 Jan 2026 15:38:39 +0100
Subject: [PATCH 031/105] spec: LOAD chip (#144)

---
 spec/book.typ      |   1 +
 spec/load.typ      |  42 ++++++++++++
 spec/src/cpu.toml  |   1 -
 spec/src/load.toml | 160 +++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 203 insertions(+), 1 deletion(-)
 create mode 100644 spec/load.typ
 create mode 100644 spec/src/load.toml

diff --git a/spec/book.typ b/spec/book.typ
index 841ffb133..3363c0c26 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -14,6 +14,7 @@
     #chapter("memw.typ")[MEMW]
     #chapter("lt.typ")[LT]
     #chapter("mul.typ")[MUL chip]
+    #chapter("load.typ")[LOAD chip]
     #chapter("bitwise.typ")[BITWISE]
   ]
 )
diff --git a/spec/load.typ b/spec/load.typ
new file mode 100644
index 000000000..931611108
--- /dev/null
+++ b/spec/load.typ
@@ -0,0 +1,42 @@
+#import "/book.typ": book-page, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_column_table,
+  render_chip_padding_table,
+  render_constraint_table,
+  total_nr_instantiated_columns,
+  total_nr_variables,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/load.toml", config)
+
+#show: book-page.with(title: "LOAD chip")
+
+== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+
+The `LOAD` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
+#render_chip_column_table(chip, config)
+
+== Assumptions
+#render_chip_assumptions(chip, config)
+
+== Constraints
+The chip delegates the actual memory interaction to the `MEMW` chip,
+and ensures correctness of the requested sign/zero extension.
+The output `res` is correctly range-checked as long as the memory contents are.
+
+#render_constraint_table(chip, config, groups: "all")
+
+The chip contributes the following to the lookup argument.
+
+#render_constraint_table(chip, config, groups: "output")
+
+== Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+#render_chip_padding_table(chip, config)
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index 97db6d6f0..52154751e 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -582,7 +582,6 @@ kind = "interaction"
 tag = "MEMW"
 input = [1, ["*", 2, "rd"], "rvd", ["+", "timestamp", 2], 1, 0, 0]
 
-# TODO: no types available, so no casting yet
 [[constraints.mem]]
 kind = "interaction"
 tag = "LOAD"
diff --git a/spec/src/load.toml b/spec/src/load.toml
new file mode 100644
index 000000000..fcbd2b87f
--- /dev/null
+++ b/spec/src/load.toml
@@ -0,0 +1,160 @@
+name = "LOAD"
+
+# Input
+
+[[variables.input]]
+name = "base_address"
+type = "DWordWL"
+desc = "The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is"
+pad = 0
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "The timestamp at which this memory access is said to occur"
+pad = 0
+
+[[variables.input]]
+name = "read2"
+type = "Bit"
+desc = "Whether to read exactly 2 bytes"
+pad = 0
+
+[[variables.input]]
+name = "read4"
+type = "Bit"
+desc = "Whether to read exactly 4 bytes"
+pad = 0
+
+[[variables.input]]
+name = "read8"
+type = "Bit"
+desc = "Whether to read exactly 8 bytes"
+pad = 0
+
+[[variables.input]]
+name = "signed"
+type = "Bit"
+desc = "Whether to sign-extend (1) or zero-extend (0)"
+pad = 0
+
+# Output
+
+[[variables.output]]
+name = "res"
+type = "DWordBL"
+desc = "The result of reading (up to) 8 bytes from `base_address`, extended corresponding to `signed`."
+pad = 0
+
+# Auxiliary
+
+[[variables.auxiliary]]
+name = "sign_bit"
+type = "Bit"
+desc = "The sign bit extracted from the bytes retrieved from memory"
+pad = 0
+
+# Virtual
+
+[[variables.virtual]]
+name = "read1"
+type = "Bit"
+desc = "Whether to read exactly 1 byte"
+def = ["-", "μ", "read2", "read4", "read8"]
+
+# Multiplicity
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+pad = 0
+
+
+[[assumptions]]
+desc = "`IS_WORD[base_address[i]]`"
+iter = ["i", 0, 1]
+
+[[assumptions]]
+desc = "`IS_BIT<signed>`"
+
+[[assumptions]]
+desc = "`IS_BIT<read2>`"
+
+[[assumptions]]
+desc = "`IS_BIT<read4>`"
+
+[[assumptions]]
+desc = "`IS_BIT<read8>`"
+
+[[assumptions]]
+desc = "`IS_BIT<read2 + read4 + read8>`"
+
+[[assumptions]]
+desc = "`IS_WORD[timestamp[i]]`"
+iter = ["i", 0, 1]
+
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "arith"
+constraint = "$#`read2` + #`read4` + #`read8` => #`μ`$"
+poly = ["*", ["+", "read2", "read4", "read8"], ["not", "μ"]]
+
+[[constraints.all]]
+kind = "interaction"
+tag = "MEMW"
+input = [0, "base_address", ["cast", "res", ["BaseField", 8]], "timestamp", "read2", "read4", "read8"]
+output = "res"
+multiplicity = "μ"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "MSB8"
+input = [["idx", "res", 0]]
+output = "sign_bit"
+multiplicity = "read1"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "MSB8"
+input = [["idx", "res", 1]]
+output = "sign_bit"
+multiplicity = "read2"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "MSB8"
+input = [["idx", "res", 3]]
+output = "sign_bit"
+multiplicity = "read4"
+
+[[constraints.all]]
+kind = "arith"
+constraint = "$!#`read8` => #`res`_i = #`signed` dot #`sign_bit` dot 255$"
+poly = ["*", ["not", "read8"], ["-", ["idx", "res", "i"], ["*", "signed", "sign_bit", 255]]]
+iter = ["i", 4, 7]
+
+[[constraints.all]]
+kind = "arith"
+constraint = "$!(#`read4` + #`read8`) => #`res`_i = #`signed` dot #`sign_bit` dot 255$"
+poly = ["*", ["-", 1, "read4", "read8"], ["-", ["idx", "res", "i"], ["*", "signed", "sign_bit", 255]]]
+iter = ["i", 2, 3]
+
+[[constraints.all]]
+kind = "arith"
+constraint = "$!(#`read2` + #`read4` + #`read8`) => #`res`_1 = #`signed` dot #`sign_bit` dot 255$"
+poly = ["*", ["-", 1, "read2", "read4", "read8"], ["-", ["idx", "res", 1], ["*", "signed", "sign_bit", 255]]]
+
+
+[[constraint_groups]]
+name = "output"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "LOAD"
+input = ["base_address", "timestamp", "read2", "read4", "read8"]
+output = ["cast", "res", "DWordWL"]
+multiplicity = ["-", "μ"]

From 760f4461dfee0bd002c21f925976d3d212f243b4 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <56092489+ColoCarletti@users.noreply.github.com>
Date: Tue, 20 Jan 2026 12:46:27 -0300
Subject: [PATCH 032/105] fix CPU-CA41 typo (#189)

---
 spec/src/cpu.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index 52154751e..63ed2005f 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -538,7 +538,7 @@ multiplicity = "SHIFT"
 kind = "template"
 tag = "ADD"
 input = ["pc", ["cast", ["+", ["*", 2, "c_type_instruction"], ["*", 4, ["not", "c_type_instruction"]]], "DWordWL"]]
-output = ["cast", "DWordWL", "res"]
+output = ["cast", "res", "DWordWL"]
 cond = "JALR"
 
 # TODO: no types available, so no casting yet

From a358eed8662b10bcd11c39bb6981b5ede3c948ce Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Wed, 21 Jan 2026 09:51:43 +0100
Subject: [PATCH 033/105] spec: `DECODE` (#143)

* spec: DECODE: decode basics

* spec: DECODE: update table + add *W instructions

* spec: fix padding table for chips that don't have all types of variables

* spec: introduce B49

* spec: DECODE: split-off decode uncompressed

* spec: DECODE: overhaul decode

* Apply suggestions from code review

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

* Apply suggestion from @RobinJadoul

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

* Fix `ADDI` flag mistakes

* spec: DECODE: make `packed_encode` a `BaseField`; remove superfluous `B49`

* spec: DECODE: set `mem_xB` when reading/writing _exactly_ `x` bytes

* spec: DECODE: update `mp_selector` description.

* Apply suggestions from code review

* spec: DECODE: merge uncompressed page into decode.typ

---------

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>
---
 spec/book.typ                     |   1 +
 spec/chip.typ                     |   3 +-
 spec/decode.typ                   | 219 ++++++++++++++++++++++++++++++
 spec/src/decode.toml              |  58 ++++++++
 spec/src/decode_uncompressed.toml | 157 +++++++++++++++++++++
 5 files changed, 437 insertions(+), 1 deletion(-)
 create mode 100644 spec/decode.typ
 create mode 100644 spec/src/decode.toml
 create mode 100644 spec/src/decode_uncompressed.toml

diff --git a/spec/book.typ b/spec/book.typ
index 3363c0c26..15f90f276 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -8,6 +8,7 @@
     #chapter("variables.typ")[Variables]
     #chapter("is_bit.typ")[IS_BIT template]
     #chapter("add.typ")[ADD template]
+    #chapter("decode.typ")[DECODE chip]
     #chapter("cpu.typ")[CPU chip]
     #chapter("shift.typ")[SHIFT chip]
     #chapter("branch.typ")[BRANCH]
diff --git a/spec/chip.typ b/spec/chip.typ
index b24153098..8e2c4ac33 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -57,7 +57,7 @@
     .all(t => t.at("preprocessed", default: false))
   }
 
-  let instantiated_vars = config.variables.categories.instantiated.map(c => chip.variables.at(c)).flatten()
+  let instantiated_vars = config.variables.categories.instantiated.map(c => chip.variables.at(c, default: ())).flatten()
 
   show figure: set block(breakable: true)
   figure(table(
@@ -198,6 +198,7 @@
     cref(assumption)[#figure(kind: chip.name + "assumption", numbering: (i) => [#lbl#i], supplement: [], [])]
   }
 
+  show figure: set block(breakable: true)
   figure(table(
     columns: (auto, auto, 1fr),
     inset: 6pt,
diff --git a/spec/decode.typ b/spec/decode.typ
new file mode 100644
index 000000000..24846d2c1
--- /dev/null
+++ b/spec/decode.typ
@@ -0,0 +1,219 @@
+#import "/book.typ": book-page, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_column_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+  render_chip_padding_table,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/decode.toml", config)
+#show: book-page.with(title: "DECODE chip")
+
+#let decode = raw(chip.name)
+
+= #decode table
+All `RV64IMC` instructions are to be decoded to a format that can be interpreted by the VM.
+This section outlines the decoding table being used in the VM.
+For reasons of efficiency, data in this table is significantly compressed.
+Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it.
+Instructions on how to compress the uncompressed table into the compressed decode table can be derived from the `packed_decode` variable provided below.
+
+== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+
+The #decode table is comprised of #nr_variables variables that are expressed using #nr_columns columns:
+#render_chip_column_table(chip, config)
+
+== Padding
+The #decode table must be padded to a length that is a power of two.
+Empty rows with the following content can be added to achieve this:
+#render_chip_padding_table(chip, config)
+
+
+== Decoding
+For the purposes of explaining decoding, we decompress #decode's `packed_decode` variable into its constituent variables.
+Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
+
+#let config = load_config()
+#let uncompressed_chip = load_chip("src/decode_uncompressed.toml", config)
+
+#render_chip_column_table(uncompressed_chip, config)
+
+We will illustrate how each instruction should be expressed in this (uncompressed) decoding table.
+The columns of the accompanying table represent the following:
+- *`operation`*: the assembly operation being encoded,
+- *`op-flag`*: which of the `ALU` selector flags to set (each operation sets exactly one),
+- *`w_reg`*, *`w_instr`*, *`signed`*: whether to set the `write_register`, `word_instr` or `signed` flag, respectively,
+- *other*: the other flags that should be set or variables that should be given specific values.
+
+For the purpose of brevity and readability, the table uses the following rules-of-thumb:
++ `rd`, `rs1`, `rs2`, and `imm` are mapped to the values provided by the instruction;
+  when a value is not specified by an instruction it defaults to $0$.
++ Any flag that is not listed is set to $0$, with the exception of the `c_type` flag. 
+  *The `c_type` flag is set independently of the below table*, as explained below.
+
+Further clarification is provided in the notes following the table.
+
+=== C-type instructions
+The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than a 32-bit one), saving \~25% in code size.
+The execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by $2$ rather than $4$.
+To indicate an instruction is provided in compressed form, the `c_type` flag is introduced.
+*This flag should be set to $1$ whenever the decoded instruction is provided in compressed form and $0$ otherwise.*
+
+/// Add a reference to one or more notes following this table.
+#let ref_note(..refs) = {
+  super("[" + refs.pos().map(r => ref(r)).join(",") + "]")
+}
+
+#let decoding_table(lines) = {
+  show figure: set block(breakable: true)
+
+  figure(table(
+    columns: (auto, auto, 40pt, 40pt, 40pt, 1fr, 15pt),
+    stroke: 0pt,
+    inset: (right: .5em),
+    align: (left, right, center, center, center, left, right),
+    fill: (_, y) =>
+      if calc.odd(y) and y <= lines.len() { luma(245) }
+      else { white },
+    table.header([*Operation*], [*op-flag*], [*`w_reg`*], [*`w_instr`*], [*`signed`*], [*other*], []),
+    table.hline(stroke: 1.5pt),
+    table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt),
+    ..lines.flatten(),
+    table.hline(stroke: 1.5pt),
+    table.footer([*Operation*], [*op-flag*], [*`w_reg`*], [*`w_instr`*], [*`signed`*], [*other*]),
+    ),  
+    caption: [Decoding table]
+  )
+}
+
+#let decoding = (
+    // OP-IMM
+  ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [$#`rd` eq.not 0$], [`[W]`], [], [], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [$#`rd` eq.not 0$], [], [#sym.not`[U]`], [], [#ref_note(<note_w_reg>, <note_signed>)]),
+  ([`ANDI      rd, rs1, imm`], [`AND`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
+  ([`ORI       rd, rs1, imm`], [`OR`],  [$#`rd` eq.not 0$],  [], [], [], [#ref_note(<note_w_reg>)]),
+  ([`XORI      rd, rs1, imm`], [`XOR`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
+  ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [], [], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [], [`mp_selector`], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  // OP
+  ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [$#`rd` eq.not 0$], [`[W]`], [], [], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [$#`rd` eq.not 0$], [`[W]`], [], [], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [$#`rd` eq.not 0$], [], [#sym.not`[U]`], [], [#ref_note(<note_w_reg>, <note_signed>)]),
+  ([`AND       rd, rs1, rs2`], [`AND`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
+  ([`OR        rd, rs1, rs2`], [`OR`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
+  ([`XOR       rd, rs1, rs2`], [`XOR`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
+  ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [], [], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [], [`mp_selector`], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  // OP - M
+  ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [$#`rd` eq.not 0$], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  ([`MULH      rd, rs1, rs2`], [`MUL`], [$#`rd` eq.not 0$], [], [1], [`mp_selector`, `muldiv_selector`], [#ref_note(<note_w_reg>)]),
+  ([`MULHU     rd, rs1, rs2`], [`MUL`], [$#`rd` eq.not 0$], [], [], [`muldiv_selector`], [#ref_note(<note_w_reg>)]),
+  ([`MULHSU    rd, rs1, rs2`], [`MUL`], [$#`rd` eq.not 0$], [], [1], [`muldiv_selector`], [#ref_note(<note_w_reg>)]),
+  ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [$#`rd` eq.not 0$], [`[W]`], [#sym.not`[U]`], [], [#ref_note(<note_w_reg>, <note_word_instr>, <note_signed>)]),
+  ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [$#`rd` eq.not 0$], [`[W]`], [#sym.not`[U]`], [`muldiv_selector`], [#ref_note(<note_w_reg>, <note_word_instr>, <note_signed>)]),
+  // LUI/AUIPC
+  ([`LUI       rd, imm`], [`ADD`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>, <note-lui>)]),
+  ([`AUIPC     rd, imm`], [`ADD`], [$#`rd` eq.not 0$], [], [], [`rs1 := x255`], [#ref_note(<note_w_reg>, <note-auipc>)]),
+  ([`JAL       rd, imm`], [`JALR`], [$#`rd` eq.not 0$], [], [], [`rs1 := x255`], [#ref_note(<note_w_reg>, <note-jal>)]),
+  // Branching
+  ([`JALR      rd, rs1, imm`], [`JALR`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
+  ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], [], []),
+  ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [], [`mp_selector`], []),
+  ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
+  ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [], [#sym.not`[U]`], [`mp_selector`], [#ref_note(<note_signed>)]),
+  // LOAD
+  ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [], [`mem_8B`], []),
+  ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [], [#sym.not`[U]`], [`mem_4B`], [#ref_note(<note_signed>)]),
+  ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [], [#sym.not`[U]`], [`mem_2B`], [#ref_note(<note_signed>)]),
+  ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
+  // STORE
+  ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [], [`mem_8B`], []),
+  ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [], [`mem_4B`], []),
+  ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [], [`mem_2B`], []),
+  ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], [], []),
+  // ECALL/EBREAK
+  ([`ECALL`], [`ECALL`], [1], [], [], [$#`rs1` := #`x17`$, $#`rs2` := #`x11`$, $#`rd` := #`x10`$], [#ref_note(<note-ecall>)]),
+  ([`EBREAK`], [`EBREAK`], [], [], [], [], []),
+  // FENCE
+  ([`FENCE`], [`ADD`], [], [], [], [], [#ref_note(<note-fence>)]),
+)
+
+#decoding_table(decoding)
+
+// Construct a note that can be referenced through `lbl`
+#let referenceable_note(lbl, note) = {
+  show figure: (it) => align(left, [#it])
+  [#figure(kind: "note", supplement: [], [#note]) #label(lbl)]
+}
+
+==== Notes
+We note the following about the above decoding table:
+#enum(numbering: "[1]",
+  enum.item(
+    referenceable_note(
+      "note_w_reg",
+      [`write_register`: $#`rd` eq.not 0$ indicates that $#`write_register` = 1$ when $#`rd` eq.not 0$ and $0$ otherwise.]
+    )
+  ),
+  enum.item(
+    referenceable_note(
+      "note_word_instr",
+      [`word_instr`: `[W]` indicates that $#`word_instr` = 1$ for the `W`-variant of the operation, and $0$ for the non-`W`-variant.]
+    )
+  ),
+  enum.item(
+    referenceable_note(
+      "note_signed",
+      [`signed`: #sym.not`[U]` indicates that $#`signed` = 1$ for the *non-`U`*-variant of the operation, and $0$ for the `U`-variant.]
+    )
+  ),
+  enum.item(
+    referenceable_note(
+      "note-lui",
+      [`LUI`: this operation loads the 20-bit `imm` in the upper bits of `rd`.
+      Observe that this can be represented using `ADDI rd, x0, imm`.
+      As such, *we expect the decoding to take care of writing the immediate in bit range $[12:32]$ of `imm` and extending it to 64 bits.*]
+    )
+  ),
+  enum.item(
+    referenceable_note(
+      "note-auipc",
+      [`AUIPC`: this operation adds the 20-bit immediate to the upper bits of `pc` and stores the result in `rd`. 
+      Given that the `pc` is stored in `x255`, this operation can be represented using `ADDI rd, x255, imm`.
+      As such, *we expect the decoding to take care of writing the immediate in bit range $[12:32]$ of `imm` and extending it to 64 bits.*]
+    )
+  ),
+  enum.item(
+    referenceable_note(
+      "note-jal",
+      [`JAL`: this operation stores `pc + 4` in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`.
+      Note that this can be represented using `JALR rd, x255, imm`.
+      As such, *we expect the decoding to take care of writing the immediate in bit range $[1:21]$ of `imm` and extending it to 64 bits; the least significant bit should always be 0.*]
+    )
+  ),
+  enum.item(
+    referenceable_note(
+      "note-ecall",
+      [`ECALL`:
+      "On RISC-V a system call has its own instruction: `ECALL`. A system call can have up to 7 arguments and has 1 return value. The arguments are in registers A0-A6, in that order, and the return value is written into A0 before giving back control to the guest. A7 contains the system call number." #link("https://libriscv.no/docs/concepts/syscalls/#the-risc-v-system-call-abi")[[source]]
+      As such,
+      - syscall number in A7 (= register `x17`)
+      - first syscall argument in A1 (= register `x11`)
+      - syscall output in A0 (= register `x10`)]
+    )
+  ),
+  enum.item(
+    referenceable_note(
+      "note-fence",
+      [`FENCE`: currently, the VM interprets this operation as `ADDI x0, x0, 0`, i.e., a no-op.]
+    )
+  )
+)
diff --git a/spec/src/decode.toml b/spec/src/decode.toml
new file mode 100644
index 000000000..6c01e4f6c
--- /dev/null
+++ b/spec/src/decode.toml
@@ -0,0 +1,58 @@
+name = "DECODE"
+
+[[variables.output]]
+name = "pc"
+type = "DWordWL"
+desc = "value of the program counter this instruction is associated with."
+# TODO(#136): fix this when padding the CPU
+pad = 1
+
+[[variables.output]]
+name = "packed_decode"
+type = "BaseField"
+desc = """Ordered concatenation of several small variables.
+The `decode (uncompressed)` section explains the purpose of each variable.\\
+A list of each variable and the bit(-range) in which it is located:\\
+[0:7] `rs1`, \\
+[8:15] `rs2`, \\
+[16:23] `rd`, \\
+[24] `write_register`, \\
+[25] `mem_2B`, \\
+[26] `mem_4B`, \\
+[27] `mem_8B`, \\
+[28] `c_type`, \\
+[29] `signed`, \\
+[30] `mp_selector`, \\
+[31] `muldiv_selector`, \\
+[32] `word_instr`, \\
+[33] `ADD`, \\
+[34] `SUB`, \\
+[35] `SLT`, \\
+[36] `AND`, \\
+[37] `OR`, \\
+[38] `XOR`, \\
+[39] `SHIFT`, \\
+[40] `JALR`, \\
+[41] `BEQ`, \\
+[42] `BLT`, \\
+[43] `LOAD`, \\
+[44] `STORE`, \\
+[45] `MUL`, \\
+[46] `DIVREM`, \\
+[47] `ECALL`, \\
+[48] `EBREAK`; \\
+the remaining bits are set to zero.
+"""
+pad = 0
+
+[[variables.output]]
+name = "imm"
+type = "DWordWL"
+desc = "the *fully extended (!)* 64-bit version of the immediate."
+pad = 0
+
+[[variables.multiplicity]]
+name = "μ"
+type = "BaseField"
+desc = "The multiplicity with which this instruction is looked up in the `CPU` table."
+pad = 0
diff --git a/spec/src/decode_uncompressed.toml b/spec/src/decode_uncompressed.toml
new file mode 100644
index 000000000..8457005f8
--- /dev/null
+++ b/spec/src/decode_uncompressed.toml
@@ -0,0 +1,157 @@
+name = "DECODE"
+
+[[variables.output]]
+name = "pc"
+type = "DWordWL"
+desc = "value of the program counter this instruction is associated with."
+
+[[variables.output]]
+name = "rs1"
+type = "Byte"
+desc = "index of source register 1."
+
+[[variables.output]]
+name = "rs2"
+type = "Byte"
+desc = "index of source register 2."
+
+[[variables.output]]
+name = "rd"
+type = "Byte"
+desc = "index of destination register."
+
+[[variables.output]]
+name = "write_register"
+type = "Bit"
+desc = "whether the result should be written to `rd` ($=0$ for memory write and when $#`rd` = #`x0`$)."
+
+[[variables.output]]
+name = "mem_2B"
+type = "Bit"
+desc = "whether the memory access (read or write) touches exactly $2$ bytes."
+
+[[variables.output]]
+name = "mem_4B"
+type = "Bit"
+desc = "whether the memory access (read or write) touches exactly $4$ bytes."
+
+[[variables.output]]
+name = "mem_8B"
+type = "Bit"
+desc = "whether the memory access (read or write) touches exactly $8$ bytes."
+
+[[variables.output]]
+name = "c_type"
+type = "Bit"
+desc = "Whether the instruction is of type `C`, i.e., whether it is $2$ bytes long instead of $4$."
+
+[[variables.output]]
+name = "imm"
+type = "DWordWL"
+desc = "the *fully extended (!)* 64-bit version of the immediate."
+
+[[variables.output]]
+name = "signed"
+type = "Bit"
+desc = "selector used to indicate signed or unsigned input interpretation."
+
+[[variables.output]]
+name = "mp_selector"
+type = "Bit"
+desc = """Multi-purpose selector used by the CPU to configure several ALU operations in different ways.
+          See the `CPU` chip for more details."""
+
+[[variables.output]]
+name = "muldiv_selector"
+type = "Bit"
+desc = "selects which output of `MUL` (lo/hi) or `DVRM` (quo/rem) is wanted."
+
+[[variables.output]]
+name = "word_instr"
+type = "Bit"
+desc = "Whether the instruction is a `*W` instruction, requiring the inputs and outputs to be (sign) extended."
+
+[[variables.output]]
+name = "ADD"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "SUB"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "SLT"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "AND"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "OR"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "XOR"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "SHIFT"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "JALR"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "BEQ"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "BLT"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "LOAD"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "STORE"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "MUL"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "DIVREM"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "ECALL"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.output]]
+name = "EBREAK"
+type = "Bit"
+desc = "ALU selector flag"
+
+[[variables.multiplicity]]
+name = "μ"
+type = "BaseField"
+desc = "The multiplicity with which this instruction is looked up in the `CPU` table."

From 1be9a481d9ef084a63d909f6c05343108041a24b Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Wed, 21 Jan 2026 11:37:15 +0100
Subject: [PATCH 034/105] =?UTF-8?q?=C2=A0spec:=20placeholder=20chapters=20?=
 =?UTF-8?q?for=20chips=20to=20come=20(#190)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 spec/book.typ  |  2 ++
 spec/dvrm.typ  | 17 +++++++++++++++++
 spec/ecall.typ | 17 +++++++++++++++++
 3 files changed, 36 insertions(+)
 create mode 100644 spec/dvrm.typ
 create mode 100644 spec/ecall.typ

diff --git a/spec/book.typ b/spec/book.typ
index 15f90f276..d12bafc09 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -15,7 +15,9 @@
     #chapter("memw.typ")[MEMW]
     #chapter("lt.typ")[LT]
     #chapter("mul.typ")[MUL chip]
+    #chapter("dvrm.typ")[DVRM chip]
     #chapter("load.typ")[LOAD chip]
+    #chapter("ecall.typ")[ECALL chips]
     #chapter("bitwise.typ")[BITWISE]
   ]
 )
diff --git a/spec/dvrm.typ b/spec/dvrm.typ
new file mode 100644
index 000000000..69e79cee2
--- /dev/null
+++ b/spec/dvrm.typ
@@ -0,0 +1,17 @@
+#import "/book.typ": book-page
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_column_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+  render_chip_assumptions,
+  render_chip_padding_table,
+)
+
+#let config = load_config()
+// #let chip = load_chip("src/dvrm.toml", config)
+
+#show: book-page.with(title: "DVRM chip")
+
+*placeholder chapter: WIP*
diff --git a/spec/ecall.typ b/spec/ecall.typ
new file mode 100644
index 000000000..fee25768c
--- /dev/null
+++ b/spec/ecall.typ
@@ -0,0 +1,17 @@
+#import "/book.typ": book-page
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_column_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+  render_chip_assumptions,
+  render_chip_padding_table,
+)
+
+#let config = load_config()
+
+#show: book-page.with(title: "ECALL chips")
+
+*placeholder chapter: WIP*
+

From 2d39c55b890cde2cb7080327fcf1f22e485d4f16 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Wed, 21 Jan 2026 17:11:55 +0100
Subject: [PATCH 035/105] fix(spec): Use a better precedence value for "idx"
 (#197)

---
 spec/expr.typ | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/spec/expr.typ b/spec/expr.typ
index 751493619..a0530525b 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -47,16 +47,16 @@
 
 #let PREC = (
   "MIN": -1, // <the most secret heart of any expression>
-  "pow": 0,  // ^
-  "neg": 1,  // Unary -
-  "cast": 2, // cast
-  "mul": 3,  // *
-  "div": 4,  // /
-  "sum": 5,  // Σ
-  "not": 6,  // not
-  "add": 7,  // +
-  "sub": 8,  // -
-  "idx": 9,  // []
+  "idx": 0,  // []
+  "pow": 1,  // ^
+  "neg": 2,  // Unary -
+  "cast": 3, // cast
+  "mul": 4,  // *
+  "div": 5,  // /
+  "sum": 6,  // Σ
+  "not": 7,  // not
+  "add": 8,  // +
+  "sub": 9,  // -
   "eq": 10,   // = and :=
   "MAX": 11, // <the void outside every expression>
 )

From 9cb3aff9270018202eae72b27318332c4f85321c Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Wed, 21 Jan 2026 17:12:42 +0100
Subject: [PATCH 036/105] fix(spec): Missing `write_register` multiplicity.
 (#196)

---
 spec/src/cpu.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index 63ed2005f..a8345c820 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -581,6 +581,7 @@ output = "rv2"
 kind = "interaction"
 tag = "MEMW"
 input = [1, ["*", 2, "rd"], "rvd", ["+", "timestamp", 2], 1, 0, 0]
+multiplicity = "write_register"
 
 [[constraints.mem]]
 kind = "interaction"

From e68549f134a9f37ee51852a969981ca0e6d69f75 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Wed, 21 Jan 2026 18:03:59 +0100
Subject: [PATCH 037/105] spec: Initial version of memory argument (#164)

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/book.typ        |  15 +++
 spec/ebook.typ       |   4 +
 spec/memory.typ      | 233 +++++++++++++++++++++++++++++++++++++++++++
 spec/src/config.toml |   6 ++
 spec/src/page.toml   |  57 +++++++++++
 5 files changed, 315 insertions(+)
 create mode 100644 spec/memory.typ
 create mode 100644 spec/src/page.toml

diff --git a/spec/book.typ b/spec/book.typ
index d12bafc09..29e61350c 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -5,6 +5,7 @@
 #book-meta(
   title: "Lambda VM specification",
   summary: [
+    #chapter("memory.typ")[Memory argument]
     #chapter("variables.typ")[Variables]
     #chapter("is_bit.typ")[IS_BIT template]
     #chapter("add.typ")[ADD template]
@@ -32,3 +33,17 @@
 ]
 #let rj = todo.with(background: teal, name: "Robin")
 #let et = todo.with(background: rgb("d4aa3a"), name: "Erik")
+
+#let style = state("style", (
+  foreground: white,
+))
+
+#let aside(title, body) = context figure(
+  block(inset: (left: 1em, right: 1em, bottom: 1em), stroke: style.final().foreground, breakable: false)[
+    #block(inset: (left: 1em, right: 1em, top: .75em, bottom: .75em),
+           width: 100% + 2em,
+           fill: rgb("55aaff"),
+           stroke: style.final().foreground,
+           align(center, strong(text(fill: black, title))))
+    #align(left, body)
+])
diff --git a/spec/ebook.typ b/spec/ebook.typ
index abddf2701..410e926bb 100644
--- a/spec/ebook.typ
+++ b/spec/ebook.typ
@@ -1,8 +1,12 @@
 #import "@preview/shiroa:0.3.1": *
+#import "/book.typ": style
 
 #import "/templates/ebook.typ"
 
 #show: ebook.project.with(title: "typst-book", spec: "book.typ")
+#style.update((
+  foreground: black,
+))
 
 // set a resolver for inclusion
 #ebook.resolve-inclusion(it => include it)
diff --git a/spec/memory.typ b/spec/memory.typ
new file mode 100644
index 000000000..6687d733d
--- /dev/null
+++ b/spec/memory.typ
@@ -0,0 +1,233 @@
+#import "/book.typ": book-page, rj, aside
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_column_table,
+  render_chip_padding_table,
+  render_constraint_table,
+  total_nr_instantiated_columns,
+  total_nr_variables,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/page.toml", config)
+
+#show: book-page.with(title: "Memory argument")
+
+= Memory argument
+
+As part of fully proving the correct execution of a RISC-V program,
+the VM must ensure that memory reads and writes are consistent.
+That is, every byte read from some address corresponds to the byte that was last written to that address
+--- or the initial value if nothing has been written yet.
+We consider "memory" in a broad sense here:
+both RAM and the general purpose registers can be seen as instantiations of memory
+and are therefore handled simultaneously.
+#footnote[
+  While RAM is byte addressed, we do choose to store registers as a `DWordWL` over two word addresses.
+]
+
+On a high level, we ensure memory consistency by an interacting system of
+reads and writes to a lookup argument, combined with an initialization and finalization scheme.
+The initialization and finalization schemes together ensure both that (1) the necessary preconditions
+for the lookup system are satisfied, and (2) the program is executed with the correct
+initial memory and register contents as specified by the ELF binary and the ISA.
+
+== Memory types
+
+A commonly made distinction of memory types is that of _read-only_ and _read-write_ memory,
+with the more restrictive read-only variant often allowing for more efficient solutions
+(be that regarding prover time, verifier time or proof size) via table lookup proofs.
+Naturally, the VM’s main memory and registers should be handled by a read-write system
+as the guest program/environment can issue instructions that write to memory.
+While there are some subsystems that can be modelled as read-only memory
+---e.g., the program memory and instruction decoding---
+we opt to integrate these into the proof system via chip interactions (relying on techniques derived from table lookup arguments).
+As such, we only concern ourselves with read-write memory, moving forward.
+
+== Memory operations
+
+Every memory operation has some conceptual attributes that are relevant to mention or discuss:
+
+- The type of operation (read or write)
+- The memory address --- this is an address in the broad sense:
+  main memory and registers have their own dedicated part of the unified address space.
+- The value being read from or written to the memory address
+- When the value was read or written, see the below paragraph
+
+Since we will have to ensure that memory accesses are temporally consistent within the execution of the VM,
+we additionally consider a _timestamp_ for every memory access, that should be strictly increasing.
+As such, it should never be possible for the system to generate accesses to the same address at identical timestamps.
+Multiple memory accesses can (and indeed will, consider e.g. register reads) occur in a single execution cycle of the VM,
+so we cannot use the cycle counter directly as timestamp for register accesses.
+We can, however, statically bound the maximal number of memory accesses made during a single execution cycle by a granularity constant $k$
+and derive timestamps from the cycle counter.
+The $i$th possible memory access in cycle $c$ will obtain as timestamp the value $k dot c + i$.
+For simplicity, we will always reserve a timestamp for every possible memory access, and leave the timestamp unused if an instruction does not use it.
+
+
+#aside[Note on "simultaneous" memory accesses][
+  For reasons of completeness (since temporal integrity as discussed below is a security necessity),
+  we cannot deal with multiple accesses to the same address at identical timestamps.
+  However, if multiple accesses are guaranteed to be independent (that is, to different addresses), they can still share a timestamp
+  --- consider, e.g., the case of reading a word as 4 bytes with the `LW` load instruction.
+  This property is already taken into account where possible in the design of the system.
+  For instance, in the CPU chip, we can ensure that there are at most 3 memory accesses not guaranteed
+  to be independent, so a timestamp granularity of 4 timestamps per cycle is enough.
+]
+
+
+== Permutation argument
+
+We can conceptually organise the state of the memory as a collection of "tokens" that represent tuples
+$(serif("timestamp"), serif("address"), serif("value"))$,
+meaning the current value written to $serif("address")$ is $serif("value")$,
+last written to memory at $serif("timestamp")$.
+Having exactly one value associated with any address will be ensured (see further down in this document)
+by the interaction of memory initialization, memory finalization, and the effects of memory operations.
+
+Each memory operation will then do two things:
+
+- Consume the current token in the memory
+- Emit a new token to replace it
+
+Naturally, for a read operation, the _values_ embedded in the consumed and emitted tokens must be identical.
+From the need to consume a token even on the first memory access,
+we can see the necessity for a memory initialization procedure
+---in addition to having to make sure the initial memory content lines up with what the binary dictates.
+
+So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens),
+this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument:
+consuming a token corresponds to a "receive" and emitting a new token is a "send".
+#rj[properly link/refer to the logup spec]
+
+== Temporal integrity
+
+To ensure temporal integrity, every memory operation needs to be constrained for the newly emitted token
+to have a strictly greater timestamp than the consumed token.
+This raises the question of how to represent timestamps and cleanly perform this check,
+as over a finite field the “less than” relation is ill-defined
+(though it is common and natural to consider it as the less than relation over the natural lift of the field into the integers).
+We choose to represent timestamps as machine words, using the existing `LT` chip functionality for comparisons.
+#rj[Properly link/refer to the LT chip]
+
+#aside[Note on options and trade-offs for timestamp representation][
+ #grid(columns: (1fr, 1fr), gutter: 1em)[#align(center, emph[Machine word])][#align(center, emph[Field element])][
+    - Clean definition of “less-than”, using the already existing `LT` functionality in the ALU
+    - Harder to perform increments, needing extra constraints beyond field arithmetic
+      - But this can be alleviated by providing a precomputed column that has a fixed increment per CPU row
+  ][
+    - Comparison is more annoying, but can work by:
+      - Decomposition into a machine word and chip interaction with the LT chip
+      - Bit decomposition and comparison constraints
+      - Range-checking the difference to be sufficiently small w.r.t. the field characteristic.
+    - Increments and basic arithmetic operations are cheap
+  ]
+]
+
+#rj[reference to CPU chip/timestamp column and MEMW chip]
+
+== Initialization and Finalization
+
+Because the LogUp argument handling token consumption and emission needs to be fully balanced
+--- every token emitted should be consumed, and vice versa ---
+we need to have a system to emit the initial tokens and consume the final tokens.
+This needs to ensure that every address has at most a single initializing emission, and at most one finalizing consumption.
+Having at most one initialization will, through the correctness of the lookup argument,
+immediately lead to having at most one correct finalization, and vice versa.
+
+The initialization will need to correspond to a fixed initial register state for the VM,
+as well as the memory loaded from the program binary, zero-initialization of memory elsewhere, and private input provided by the prover.
+The contribution of initialization with static data from the ELF executable and the initial register state to the sum
+can be handled directly by the verifier, ensuring correctness corresponding to the ELF binary being proven.
+This leaves only zero-initialization and prover input as prover-side concerns for initialization,
+alongside the finalization of the entire used memory.
+
+For our chosen scheme (which we refer to as "paged initialization/finalization"),
+the available memory range is split into equally (power-of-two) sized "pages".
+Each address can then be represented as `address = page_base_address + page_offset`,
+with `page_base_address` being "page-aligned", and `page_offset` belonging to a limited range (the page size).
+As such, initialization or finalization of a page is represented by a table with columns `page`, `offset`, `value`, and ---for finalization--- `timestamp`.
+The `page` column is a preprocessed, constant value (which can be entirely virtualized/inlined into the constraints for this table),
+and the `offset` column is a preprocessed column containing its row index.
+Depending on the type of initialization, `value` can be a prover-committed column (input data), or a precomputed, constant column containing `0` (free memory space).
+This table then feeds into the LogUp system in the normal way,
+emitting the initial tokens for all addresses in a page, without consuming any tokens.
+Since the `offset` column is always the same, it can be reused across all paged initialization and finalization tables.
+
+Concretely, each page gets an associated `PAGE` table, consisting of #total_nr_variables(chip) variables
+over #total_nr_instantiated_columns(chip, config) columns.
+For each such table, the `page` variable is instantiated as the constant base address of the page.
+The `offset` column is preprocessed, which helps the verifier ensure that each page has a single fixed size,
+but the verifier should still check that no pages overlap and all `page` values are page-aligned.
+
+=== Page initialization
+
+#rj[check whether we need `fini` to be range-checked]
+We present here a set of constraints on the `PAGE` table that
+
++ enforces the initial and final values of each address are bytes
++ adds the initial and final interaction to the LogUp argument
+
+For zero-initialized pages, `init` can be a constant `0`,
+and hence doesn't need a column, nor a range check.
+
+#render_chip_column_table(chip, config)
+#render_constraint_table(chip, config)
+
+
+#aside[Note on alternatives and trade-offs][
+  We identify a few alternatives that would achieve the desired initialization/finalization functionalities, and consider their respective trade-offs.
+
+  _"Free-zero" initialization_
+
+  Zero-initialization could be achieved by allowing the `MEMW` chip to output a zero
+  without consuming a token from the lookup argument.
+  This would in turn be made secure by finalization consuming at most one token per address:
+  if an address is initialized more than once, the proof cannot be finalized.
+  - This requires fewer pages (and hence tables) for zero-initialization.
+  - But it comes at the cost of added complexity in the `MEMW` chip, and likely some extra columns to handle this.
+    Keeping track of initialized addresses, and potentially having to initialize only some of the bytes in a word-read
+    may make bookkeeping challenging.
+  - This is an alternative form of sparse initialization (see below), so it is incompatible with paged finalization.
+    Paged finalization can be made into a compatible sparse form by adding a bit-checked multiplicity column.
+
+  _Sparse initialization/finalization_
+
+  One or more STARK tables (depending on the amount of memory used) consisting of `(address, value)` columns are introduced,
+  where for zero-initialization, `value` can be constant zero.
+  Transition constraints ensure that `address` is strictly increasing, enforcing the "at most once" property;
+  `value` is range-checked to consist of bytes.
+  Similar to paged finalization, an additional `timestamp` column is added, containing the final timestamp at which each address was accessed.
+  This table is then further used to contribute to the LogUp sum as with any other interactions.
+  - The transition constraints can be chosen to only apply on finalization, as at-most-once finalization is enough to ensure consistency.
+  - Sparse initialization is incompatible with paged finalization, see also the remark under free-zero initialization above.
+  - This would require transition constraints, which currently are not needed elsewhere in the VM design
+    - Additionally, for memory use exceeding the capacity of a single initialization/finalization table, some form of transition constraint between tables is needed
+    - Alternatively, transition constraints could potentially be avoided by more integration into the LogUp system, but this could turn out more costly in practice
+  - This is compatible with the above "free zero" initialization
+  - Since a prover-committed address column is needed (rather than a precomputed one), the number of required columns increases.
+    - As an optimization, the address column could potentially be used simultaneously for initialization and finalization
+  - Sparse initialization/finalization reduces the cost for sparse memory access patterns,
+    where only a few addresses would be accessed per page.
+      Most programs and compilers should however favor a memory locality that makes paged initialization/finalization comparable.
+]
+
+=== Register initialization/finalization
+
+#rj[Properly link/reference ECALL/HALT chip]
+The initial and final state of registers can be entirely known by
+the verifier, since the relevant initialization values are either zero,
+or embedded in the ELF, and the final values can be set to a known value
+by the HALT ecall.
+Additionally, as the number of registers is small, the verifier can directly
+add the required balancing terms to the LogUp sum.
+
+== Notes and considerations
+
+- Register reads and writes may interact within a single cycle, so a correct and fixed ordering needs to be ensured
+- Correctness of initialization and completeness of finalization need to be ensured
+
+== Future topics of interest
+
+- Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research)
diff --git a/spec/src/config.toml b/spec/src/config.toml
index 68f1683de..d836f80e5 100644
--- a/spec/src/config.toml
+++ b/spec/src/config.toml
@@ -127,6 +127,12 @@ subtypes = ["DWordWL"]
 desc = "A preprocessed column holding timestamps as `DWordWL`. Row `i` of the column contains the value $2^2 dot (i + 1)$. Used in the CPU chip, see there for more details about the magic number."
 preprocessed = true
 
+[[variables.types]]
+label = "RowIndex"
+subtypes = ["Word"]
+desc = "A preprocessed column holding the row index (zero-indexed)."
+preprocessed = true
+
 [variables.categories]
 all = ["input", "output", "auxiliary", "virtual", "multiplicity", "condition"]
 instantiated = ["input", "output", "auxiliary", "multiplicity"]
diff --git a/spec/src/page.toml b/spec/src/page.toml
new file mode 100644
index 000000000..8053d63df
--- /dev/null
+++ b/spec/src/page.toml
@@ -0,0 +1,57 @@
+name = "PAGE"
+
+# Input
+
+[[variables.input]]
+name = "offset"
+type = "RowIndex"
+desc = "The offset from the page base address."
+
+[[variables.input]]
+name = "init"
+type = "Byte"
+desc = "The initial value of this address. Can be replaced by a constant zero for zero-initialization"
+
+[[variables.input]]
+name = "fini"
+type = "Byte"
+desc = "The final value this address took"
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "The timestamp at which this address was last accessed"
+
+# Virtual
+
+[[variables.virtual]]
+name = "address"
+type = "DWordWL"
+desc = "Adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table."
+def = ["+", "page", ["cast", "offset", "DWordWL"]]
+
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = ["init"]
+
+[[constraints.all]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = ["fini"]
+
+[[constraints.all]]
+kind = "interaction"
+tag = "memory"
+input = [0, "address", 0, "init"]
+multiplicity = -1
+
+[[constraints.all]]
+kind = "interaction"
+tag = "memory"
+input = [0, "address", "timestamp", "fini"]
+multiplicity = 1

From 4d9cba1e75b216db895d98d4a1823864c8e73935 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Wed, 21 Jan 2026 17:02:40 -0300
Subject: [PATCH 038/105] add script to create marckdown from spec

---
 docs/spec/add.md                    |  50 +++
 docs/spec/bitwise.md                |  58 ++++
 docs/spec/branch.md                 |  78 +++++
 docs/spec/cpu.md                    | 189 +++++++++++
 docs/spec/decode.md                 |  17 +
 docs/spec/decode_uncompressed.md    |  44 +++
 docs/spec/is_bit.md                 |  24 ++
 docs/spec/load.md                   |  80 +++++
 docs/spec/lt.md                     |  85 +++++
 docs/spec/memw.md                   | 115 +++++++
 docs/spec/mul.md                    |  97 ++++++
 docs/spec/shift.md                  | 141 ++++++++
 scripts/extract_and_convert_spec.sh |  55 ++++
 scripts/spec_to_md.py               | 494 ++++++++++++++++++++++++++++
 14 files changed, 1527 insertions(+)
 create mode 100644 docs/spec/add.md
 create mode 100644 docs/spec/bitwise.md
 create mode 100644 docs/spec/branch.md
 create mode 100644 docs/spec/cpu.md
 create mode 100644 docs/spec/decode.md
 create mode 100644 docs/spec/decode_uncompressed.md
 create mode 100644 docs/spec/is_bit.md
 create mode 100644 docs/spec/load.md
 create mode 100644 docs/spec/lt.md
 create mode 100644 docs/spec/memw.md
 create mode 100644 docs/spec/mul.md
 create mode 100644 docs/spec/shift.md
 create mode 100755 scripts/extract_and_convert_spec.sh
 create mode 100755 scripts/spec_to_md.py

diff --git a/docs/spec/add.md b/docs/spec/add.md
new file mode 100644
index 000000000..27e570787
--- /dev/null
+++ b/docs/spec/add.md
@@ -0,0 +1,50 @@
+# ADD Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs` | `DWordWL` | left-hand operator |
+| `rhs` | `DWordWL` | right-hand operator |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `sum` | `DWordWL` | $`lhs` + `rhs`$ |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `carry` | `Bit[2]` | Carry values used to constrain the addition |
+
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * (lhs[0] + rhs[0] - sum[0])
+carry (when iter=1) := 2^-32 * (lhs[1] + rhs[1] + carry[0] - sum[1])
+```
+
+### Condition
+
+| Name | Type | Description |
+|------|------|-------------|
+| `cond` | `BaseField` | Whether the relation should be enforced ($eq.not 0$) or not ($0$). |
+
+## Assumptions
+
+| Ref | Range | Description |
+|-----|-------|-------------|
+| `add:a:lhs` | i ∈ [0, 1] | `IS_WORD[lhs[i]]` |
+| `add:a:rhs` | i ∈ [0, 1] | `IS_WORD[rhs[i]]` |
+| `add:a:sum` | i ∈ [0, 1] | `IS_WORD[sum[i]]` |
+
+## Constraints
+
+### all
+
+| Ref | Kind | Range | Description |
+|-----|------|-------|-------------|
+| `add:c:carry` | template | i ∈ [0, 1] | cond ⇒ `IS_BIT<carry[i]>` |
diff --git a/docs/spec/bitwise.md b/docs/spec/bitwise.md
new file mode 100644
index 000000000..0dd3038e8
--- /dev/null
+++ b/docs/spec/bitwise.md
@@ -0,0 +1,58 @@
+# BITWISE Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `X` | `Byte` |  |
+| `Y` | `Byte` |  |
+| `Z` | `B4` |  |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `AND` | `Byte` | the binary AND of `X` and `Y` |
+| `OR` | `Byte` | the binary OR of `X` and `Y` |
+| `XOR` | `Byte` | the binary XOR of `X` and `Y` |
+| `MSB8` | `Bit` | the most significant bit of `X` |
+| `MSB16` | `Bit` | the most significant bit of `Y` |
+| `ZERO` | `Bit` | whether $`X` = 0 and `Y` = 0$ |
+| `SLL` | `Half` | `X\|\|Y` logically left-shifted by `Z`: $((`X` + 256`Y`) `<<` `Z`) mod 2^16$ |
+| `SLLC` | `Half` | `X\|\|Y` logically right-shifted by `16 - Z`: $(`X` + 256`Y`) `>>` (16 - `Z`)$ |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_AND` | `BaseField` |  |
+| `μ_OR` | `BaseField` |  |
+| `μ_XOR` | `BaseField` |  |
+| `μ_MSB8` | `BaseField` |  |
+| `μ_MSB16` | `BaseField` |  |
+| `μ_ZERO` | `BaseField` |  |
+| `μ_IS_BYTE` | `BaseField` |  |
+| `μ_IS_HALF` | `BaseField` |  |
+| `μ_IS_B20` | `BaseField` |  |
+| `μ_HWSL` | `BaseField` |  |
+| `μ_HWSLC` | `BaseField` |  |
+
+## Constraints
+
+### contributions
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `1` | interaction | `AND_BYTE[AND; X, Y]` | -μ_AND |
+| `2` | interaction | `OR_BYTE[OR; X, Y]` | -μ_OR |
+| `3` | interaction | `XOR_BYTE[XOR; X, Y]` | -μ_XOR |
+| `4` | interaction | `MSB8[MSB8; X]` | -μ_MSB8 |
+| `5` | interaction | `MSB16[MSB16; X + 256 * Y]` | -μ_MSB16 |
+| `6` | interaction | `ZERO[ZERO; X + 256 * Y]` | -μ_ZERO |
+| `7` | interaction | `IS_BYTE[X]` | -μ_IS_BYTE |
+| `8` | interaction | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
+| `9` | interaction | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
+| `10` | interaction | `HWSL[SLL; X + 256 * Y, Z]` | -μ_HWSL |
+| `11` | interaction | `HWSLC[SLLC; X + 256 * Y, Z]` | -μ_HWSLC |
diff --git a/docs/spec/branch.md b/docs/spec/branch.md
new file mode 100644
index 000000000..d5d465319
--- /dev/null
+++ b/docs/spec/branch.md
@@ -0,0 +1,78 @@
+# BRANCH Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | The current pc, used as base address when `!JALR` |
+| `offset` | `Word` | The offset from the base address to jump to |
+| `register` | `DWordWL` | The base address to use when `JALR` |
+| `JALR` | `Bit` | Selects between `pc` and `register` as base address, needed for the `JALR` instruction |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `next_pc_high` | `Half[3]` | The upper part of the next pc |
+| `next_pc_low` | `Byte[2]` | The lower part of the next pc |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `unmasked_low_byte` | `Byte` | The low byte of the next pc, before masking the LSB. Used to constrain the raw addition. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `next_pc_unmasked` | `DWordWL` | The combination of `next_pc_high`, `next_pc_low[1]` and `unmasked_low_byte` to constrain the addition. This is the computed value for the next pc, before masking off the LSB as required by the ISA. |
+| `next_pc` | `DWordWL` | The computed next pc, after masking off the LSB as required by the ISA. |
+
+**Definition of `next_pc_unmasked`:**
+```
+next_pc_unmasked (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + unmasked_low_byte[0]
+next_pc_unmasked (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
+```
+
+**Definition of `next_pc`:**
+```
+next_pc (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + next_pc_low[0]
+next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+## Assumptions
+
+| Ref | Range | Description |
+|-----|-------|-------------|
+| `A1` | i ∈ [0, 1] | `pc` is range checked, `IS_WORD[pc[i]]` |
+| `A2` |  | `offset` is range checked, `IS_WORD[offset]` |
+| `A3` | i ∈ [0, 1] | `register` is range checked, `IS_WORD[register[i]]` |
+| `A4` |  | `IS_BIT<JALR>` |
+
+## Constraints
+
+### all
+
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `1` | template |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
+| `2` | template |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
+| `3` | interaction |  | `IS_BYTE[next_pc_low[1]]` | μ |
+| `4` | interaction |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte[0], 254]` | μ |
+| `5` | interaction | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
+
+### output
+_Each row contributes the following to the LogUp sum_
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `1` | interaction | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
new file mode 100644
index 000000000..315cc86c4
--- /dev/null
+++ b/docs/spec/cpu.md
@@ -0,0 +1,189 @@
+# CPU Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
+| `pc` | `DWordWL` | The program counter |
+| `rs1` | `Byte` | Source register 1 index |
+| `rs2` | `Byte` | Source register 2 index |
+| `rd` | `Byte` | Destination register index |
+| `write_register` | `Bit` | Whether to write back to the destination register |
+| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
+| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
+| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
+| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
+| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
+| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
+| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and     - as flag for inverting the condition of conditional branches (see `branch_cond`)     - as direction (left or right) for `SHIFT` |
+| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
+| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
+| `ADD` | `Bit` | One-hot ALU selector flag |
+| `SUB` | `Bit` | One-hot ALU selector flag |
+| `SLT` | `Bit` | One-hot ALU selector flag |
+| `AND` | `Bit` | One-hot ALU selector flag |
+| `OR` | `Bit` | One-hot ALU selector flag |
+| `XOR` | `Bit` | One-hot ALU selector flag |
+| `SHIFT` | `Bit` | One-hot ALU selector flag |
+| `JALR` | `Bit` | One-hot ALU selector flag |
+| `BEQ` | `Bit` | One-hot ALU selector flag |
+| `BLT` | `Bit` | One-hot ALU selector flag |
+| `LOAD` | `Bit` | One-hot ALU selector flag |
+| `STORE` | `Bit` | One-hot ALU selector flag |
+| `MUL` | `Bit` | One-hot ALU selector flag |
+| `DIVREM` | `Bit` | One-hot ALU selector flag |
+| `ECALL` | `Bit` | One-hot ALU selector flag |
+| `EBREAK` | `Bit` | One-hot ALU selector flag |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `next_pc` | `DWordWL` | The program counter for the next instruction |
+| `rvd` | `DWordWL` | The value to (maybe) be written back to register `rd` |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `rv1` | `DWordWHH` | The value of register `rs1` |
+| `rv2` | `DWordWHH` | The value of register `rs2` |
+| `rv1_sign_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word |
+| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `c_type_instruction` |
+| `arg2_sign_bit` | `Bit` | The sign bit of `arg2` if seen as a 32-bit word |
+| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
+| `res_sign_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word |
+| `res` | `DWordBL` | The ALU result |
+| `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
+| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
+
+## Assumptions
+
+| Ref | Range | Description |
+|-----|-------|-------------|
+| `cpu:a:one-hot` |  | The flags are a one-hot vector in the decoding |
+| `cpu:a:arg2-multiplex` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+
+## Constraints
+
+### decode
+
+| Ref | Kind | Description |
+|-----|------|-------------|
+| `1` | interaction | `DECODE[pc, imm, packed_decode]` |
+
+### range
+
+| Ref | Kind | Range | Description |
+|-----|------|-------|-------------|
+| `cpu:c:range_write_register` | template |  | `IS_BIT<write_register>` |
+| `cpu:c:range_memory_2bytes` | template |  | `IS_BIT<memory_2bytes>` |
+| `cpu:c:range_memory_4bytes` | template |  | `IS_BIT<memory_4bytes>` |
+| `cpu:c:range_memory_8bytes` | template |  | `IS_BIT<memory_8bytes>` |
+| `cpu:c:range_c_type_instruction` | template |  | `IS_BIT<c_type_instruction>` |
+| `cpu:c:range_signed` | template |  | `IS_BIT<signed>` |
+| `cpu:c:range_mp_selector` | template |  | `IS_BIT<mp_selector>` |
+| `cpu:c:range_muldiv_selector` | template |  | `IS_BIT<muldiv_selector>` |
+| `cpu:c:range_word_instr` | template |  | `IS_BIT<word_instr>` |
+| `cpu:c:range_ADD` | template |  | `IS_BIT<ADD>` |
+| `cpu:c:range_SUB` | template |  | `IS_BIT<SUB>` |
+| `cpu:c:range_SLT` | template |  | `IS_BIT<SLT>` |
+| `cpu:c:range_AND` | template |  | `IS_BIT<AND>` |
+| `cpu:c:range_OR` | template |  | `IS_BIT<OR>` |
+| `cpu:c:range_XOR` | template |  | `IS_BIT<XOR>` |
+| `cpu:c:range_SHIFT` | template |  | `IS_BIT<SHIFT>` |
+| `cpu:c:range_JALR` | template |  | `IS_BIT<JALR>` |
+| `cpu:c:range_BEQ` | template |  | `IS_BIT<BEQ>` |
+| `cpu:c:range_BLT` | template |  | `IS_BIT<BLT>` |
+| `cpu:c:range_LOAD` | template |  | `IS_BIT<LOAD>` |
+| `cpu:c:range_STORE` | template |  | `IS_BIT<STORE>` |
+| `cpu:c:range_MUL` | template |  | `IS_BIT<MUL>` |
+| `cpu:c:range_DIVREM` | template |  | `IS_BIT<DIVREM>` |
+| `cpu:c:range_ECALL` | template |  | `IS_BIT<ECALL>` |
+| `cpu:c:range_EBREAK` | template |  | `IS_BIT<EBREAK>` |
+| `R26` | interaction |  | `IS_BYTE[rs1]` |
+| `R27` | interaction |  | `IS_BYTE[rs2]` |
+| `R28` | interaction |  | `IS_BYTE[rd]` |
+| `R29` | interaction | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` |
+| `R30` | interaction | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` |
+| `R31` | interaction | i ∈ [0, 7] | `IS_BYTE[res[i]]` |
+
+### alu
+
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `A1` | template |  | ADD + LOAD + STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `cpu:c:sub` | template |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `A3` | interaction |  | `LT[res[0]; arg1::DWordHHW, arg2::DWordHHW, signed]` | SLT + BLT |
+| `A4` | arith | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
+| | | _polynomial:_ `(SLT + BLT) * res[i] = 0` | |
+| `A5` | interaction | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
+| `A6` | interaction | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
+| `A7` | interaction | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
+| `A8` | interaction |  | `SHIFT[res::DWordHL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
+| `A9` | template |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
+| `A10` | interaction |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
+| `A11` | interaction |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
+
+### mem
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `M1` | interaction | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0, 1, 0, 0]` |  |
+| `M2` | interaction | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1, 1, 0, 0]` |  |
+| `M3` | interaction | `MEMW[1, 2 * rd, rvd, timestamp + 2, 1, 0, 0]` | write_register |
+| `M4` | interaction | `LOAD[rvd; 0, res, timestamp + 0, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
+| `M5` | interaction | `MEMW[0, res, rv2, timestamp + 1, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
+| `M6` | interaction | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1, 1, 0, 0]` |  |
+
+### sys
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `S1` | arith | `!EBREAK` |  |
+| | | _polynomial:_ `EBREAK = 0` | |
+| | | _note:_ We treat `EBREAK` as an unprovable trap | |
+| `S2` | interaction | `ECALL[rvd; rv1, pc, timestamp, rv2]` | ECALL |
+
+### ext
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `E1` | arith | (`rv1_sign_bit` or `arg2_sign_bit` or `res_sign_bit`) => `word_instr` |  |
+| | | _polynomial:_ `(rv1_sign_bit + arg2_sign_bit + res_sign_bit) * (1 - word_instr) = 0` | |
+| `E2` | interaction | `MSB16[rv1_sign_bit; rv1[1]]` | word_instr |
+| `E3` | arith | `arg1[:4]` = `rv1[:2]` |  |
+| | | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` | |
+| `E4` | arith | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_sign_bit` dot `signed` |  |
+| | | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_sign_bit * (2^32 - 1) = 0` | |
+| `E5` | interaction | `MSB16[arg2_sign_bit; rv2[1]]` | word_instr |
+| `E6` | arith | `arg2[:4]` = (1 - `STORE` - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT`) dot `imm[0]` |  |
+| | | _polynomial:_ `(arg2::DWordWL)[0] - (1 - STORE - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT) * imm[0] = 0` | |
+| `E7` | arith | `arg2[4:]` = (1 - `STORE` - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `arg2_sign_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT`) dot `imm[1]` |  |
+| | | _polynomial:_ `(arg2::DWordWL)[1] - (1 - STORE - LOAD) * (1 - word_instr) * rv2[2] - (1 - STORE - LOAD) * signed * arg2_sign_bit * (2^32 - 1) - (1 - BEQ - BLT) * imm[1] = 0` | |
+| `E8` | interaction | `MSB8[res_sign_bit; res[3]]` | word_instr |
+| `E9` | arith | `!LOAD` => `rvd[0]` = `res[:4]` |  |
+| | | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` | |
+| `E10` | arith | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
+| | | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_sign_bit * (2^32 - 1)) = 0` | |
+| | | _note:_ _Sign_ extend the output if it wasn't a `LOAD`. Only `LOAD` has both `write_register = 1` and `rvd ≠ res`. `LOAD` and `word_instr` are disjoint | |
+
+### misc
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `cpu:c:is_equal` | interaction | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
+| `O2` | arith | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
+| | | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
+| | | _note:_ where `invert` is represented by `mp_selector` | |
+| `O3` | interaction | `BRANCH[next_pc; pc, imm[0], arg1::DWordWL, JALR]` | branch_cond |
+| `O4` | template | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
diff --git a/docs/spec/decode.md b/docs/spec/decode.md
new file mode 100644
index 000000000..fc5ced687
--- /dev/null
+++ b/docs/spec/decode.md
@@ -0,0 +1,17 @@
+# DECODE Chip
+
+## Columns
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
+| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0:7] `rs1`, \ [8:15] `rs2`, \ [16:23] `rd`, \ [24] `write_register`, \ [25] `memory_2bytes`, \ [26] `memory_4bytes`, \ [27] `memory_8bytes`, \ [28] `c_type`, \ [29] `signed`, \ [30] `mp_selector`, \ [31] `muldiv_selector`, \ [32] `word_instr`, \ [33] `ADD`, \ [34] `SUB`, \ [35] `SLT`, \ [36] `AND`, \ [37] `OR`, \ [38] `XOR`, \ [39] `SHIFT`, \ [40] `JALR`, \ [41] `BEQ`, \ [42] `BLT`, \ [43] `LOAD`, \ [44] `STORE`, \ [45] `MUL`, \ [46] `DIVREM`, \ [47] `ECALL`, \ [48] `EBREAK`; \ the remaining bits are set to zero.  |
+| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
diff --git a/docs/spec/decode_uncompressed.md b/docs/spec/decode_uncompressed.md
new file mode 100644
index 000000000..9e3aebc77
--- /dev/null
+++ b/docs/spec/decode_uncompressed.md
@@ -0,0 +1,44 @@
+# DECODE Chip
+
+## Columns
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
+| `rs1` | `Byte` | index of source register 1. |
+| `rs2` | `Byte` | index of source register 2. |
+| `rd` | `Byte` | index of destination register. |
+| `write_register` | `Bit` | whether the result should be written to `rd` ($=0$ for memory write and when $`rd` = `x0`$). |
+| `mem_2B` | `Bit` | whether the memory access (read or write) touches exactly $2$ bytes. |
+| `mem_4B` | `Bit` | whether the memory access (read or write) touches exactly $4$ bytes. |
+| `mem_8B` | `Bit` | whether the memory access (read or write) touches exactly $8$ bytes. |
+| `c_type` | `Bit` | Whether the instruction is of type `C`, i.e., whether it is $2$ bytes long instead of $4$. |
+| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+| `signed` | `Bit` | selector used to indicate signed or unsigned input interpretation. |
+| `mp_selector` | `Bit` | Multi-purpose selector used by the CPU to configure several ALU operations in different ways.            See the `CPU` chip for more details. |
+| `muldiv_selector` | `Bit` | selects which output of `MUL` (lo/hi) or `DVRM` (quo/rem) is wanted. |
+| `word_instr` | `Bit` | Whether the instruction is a `*W` instruction, requiring the inputs and outputs to be (sign) extended. |
+| `ADD` | `Bit` | ALU selector flag |
+| `SUB` | `Bit` | ALU selector flag |
+| `SLT` | `Bit` | ALU selector flag |
+| `AND` | `Bit` | ALU selector flag |
+| `OR` | `Bit` | ALU selector flag |
+| `XOR` | `Bit` | ALU selector flag |
+| `SHIFT` | `Bit` | ALU selector flag |
+| `JALR` | `Bit` | ALU selector flag |
+| `BEQ` | `Bit` | ALU selector flag |
+| `BLT` | `Bit` | ALU selector flag |
+| `LOAD` | `Bit` | ALU selector flag |
+| `STORE` | `Bit` | ALU selector flag |
+| `MUL` | `Bit` | ALU selector flag |
+| `DIVREM` | `Bit` | ALU selector flag |
+| `ECALL` | `Bit` | ALU selector flag |
+| `EBREAK` | `Bit` | ALU selector flag |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
diff --git a/docs/spec/is_bit.md b/docs/spec/is_bit.md
new file mode 100644
index 000000000..1c4a3182b
--- /dev/null
+++ b/docs/spec/is_bit.md
@@ -0,0 +1,24 @@
+# IS_BIT Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `X` | `BaseField` | Value for which to assert that it lies in the range ${0, 1}$. |
+
+### Condition
+
+| Name | Type | Description |
+|------|------|-------------|
+| `cond` | `BaseField` | Whether the constraint should be applied ($eq.not 0$) or not ($0$). |
+
+## Constraints
+
+### all
+
+| Ref | Kind | Description |
+|-----|------|-------------|
+| `isbit:c:isbit` | arith | `cond` => `X` (1-`X`) = 0 |
+| | | _polynomial:_ `cond * X * (1 - X) = 0` |
diff --git a/docs/spec/load.md b/docs/spec/load.md
new file mode 100644
index 000000000..1f14a1adc
--- /dev/null
+++ b/docs/spec/load.md
@@ -0,0 +1,80 @@
+# LOAD Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `base_address` | `DWordWL` | The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `read2` | `Bit` | Whether to read exactly 2 bytes |
+| `read4` | `Bit` | Whether to read exactly 4 bytes |
+| `read8` | `Bit` | Whether to read exactly 8 bytes |
+| `signed` | `Bit` | Whether to sign-extend (1) or zero-extend (0) |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `res` | `DWordBL` | The result of reading (up to) 8 bytes from `base_address`, extended corresponding to `signed`. |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `sign_bit` | `Bit` | The sign bit extracted from the bytes retrieved from memory |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `read1` | `Bit` | Whether to read exactly 1 byte |
+
+**Definition of `read1`:**
+```
+read1 := μ - read2 - read4 - read8
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+## Assumptions
+
+| Ref | Range | Description |
+|-----|-------|-------------|
+| `A1` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `A2` |  | `IS_BIT<signed>` |
+| `A3` |  | `IS_BIT<read2>` |
+| `A4` |  | `IS_BIT<read4>` |
+| `A5` |  | `IS_BIT<read8>` |
+| `A6` |  | `IS_BIT<read2 + read4 + read8>` |
+| `A7` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+## Constraints
+
+### all
+
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `1` | arith |  | `read2` + `read4` + `read8` => `μ` |  |
+| | | _polynomial:_ `(read2 + read4 + read8) * (1 - μ) = 0` | |
+| `2` | interaction |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
+| `3` | interaction |  | `MSB8[sign_bit; res[0]]` | read1 |
+| `4` | interaction |  | `MSB8[sign_bit; res[1]]` | read2 |
+| `5` | interaction |  | `MSB8[sign_bit; res[3]]` | read4 |
+| `6` | arith | i ∈ [4, 7] | !`read8` => `res[i]` = `signed` · `sign_bit` · 255 |  |
+| | | _polynomial:_ `(1 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
+| `7` | arith | i ∈ [2, 3] | !(`read4` + `read8`) => `res[i]` = `signed` · `sign_bit` · 255 |  |
+| | | _polynomial:_ `(1 - read4 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
+| `8` | arith |  | !(`read2` + `read4` + `read8`) => `res[1]` = `signed` · `sign_bit` · 255 |  |
+| | | _polynomial:_ `(1 - read2 - read4 - read8) * (res[1] - signed * sign_bit * 255) = 0` | |
+
+### output
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `1` | interaction | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8]` | -μ |
diff --git a/docs/spec/lt.md b/docs/spec/lt.md
new file mode 100644
index 000000000..f2cbb1e9b
--- /dev/null
+++ b/docs/spec/lt.md
@@ -0,0 +1,85 @@
+# LT Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs` | `DWordHHW` | The left operand |
+| `rhs` | `DWordHHW` | The right operand |
+| `signed` | `Bit` | whether to interpret `lhs` and `rhs` as signed integers (1) or not (0) |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lt` | `Bit` | Whether `lhs` < `rhs`, taking `signed` into account |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs_sub_rhs` | `DWordHL` | `lhs` - `rhs` |
+| `lhs_msb` | `Bit` | The most significant bit of `lhs` |
+| `rhs_msb` | `Bit` | The most significant bit of `rhs` |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `carry` | `Bit[2]` | The carry for adding `lhs_sub_rhs` back to `rhs` |
+| `unsigned_lt` | `Bit` | Whether `lhs` < `rhs`, as unsigned integers |
+
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * (rhs[0] + (lhs_sub_rhs::DWordWL)[0] - lhs[0])
+carry (when iter=1) := 2^-32 * ((rhs::DWordWL)[1] + (lhs_sub_rhs::DWordWL)[1] + carry[0] - (lhs::DWordWL)[1])
+```
+
+**Definition of `unsigned_lt`:**
+```
+unsigned_lt := carry[1]
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+## Assumptions
+
+| Ref | Range | Description |
+|-----|-------|-------------|
+| `lt:a:range_lhs` | i ∈ [1, 2] | `IS_HALFWORD[lhs[i]]` and `IS_WORD[lhs[0]]` |
+| `lt:a:range_rhs` | i ∈ [1, 2] | `IS_HALFWORD[rhs[i]]` and `IS_WORD[rhs[0]]` |
+| `lt:a:range_signed` |  | `IS_BIT<signed>` |
+
+## Constraints
+
+### defs
+_Enforce that variables have been correctly computed_
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `lt:c:lhs_msb` | interaction | `MSB16[lhs_msb; lhs[2]]` | μ |
+| `lt:c:rhs_msb` | interaction | `MSB16[rhs_msb; rhs[2]]` | μ |
+| `lt:c:lt` | arith | `lt` = `signed` · (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) · `unsigned_lt` |  |
+| | | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
+| | | _note:_ Where A = `lhs_msb`, B = `rhs_msb` and C = `carry[1]` | |
+
+### sub
+_Constrain the subtraction_
+
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `1` | template | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
+| `lt:c:lhs_sub_rhs_range` | interaction | i ∈ [0, 3] | `IS_HALFWORD[lhs_sub_rhs[i]]` | μ |
+
+### output
+_Each row contributes the following to the LogUp sum_
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `1` | interaction | `LT[lt; lhs, rhs, signed]` | -μ |
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
new file mode 100644
index 000000000..ddee6b852
--- /dev/null
+++ b/docs/spec/memw.md
@@ -0,0 +1,115 @@
+# MEMW Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `is_register` | `Bit` | Whether the address represents a register index |
+| `base_address` | `DWordWL` | The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is |
+| `value` | `BaseField[8]` | The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `write2` | `Bit` | Whether to write exactly 2 values |
+| `write4` | `Bit` | Whether to write exactly 4 values |
+| `write8` | `Bit` | Whether to write exactly 8 values |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old` | `BaseField[8]` | The old value written at `base_address`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `address_add` | `DWordHL[7]` | `address_add[i] = base_address + i + 1` |
+| `old_timestamp` | `DWordWL[8]` | The timestamp at which the address was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `w2` | `Bit` | writing at least 2 bytes |
+| `w4` | `Bit` | writing at least 4 bytes |
+| `μ_sum` | `Bit` |  |
+
+**Definition of `w2`:**
+```
+w2 := write2 + write4 + write8
+```
+
+**Definition of `w4`:**
+```
+w4 := write4 + write8
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_read + μ_write
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_read` | `Bit` | Whether we are performing a read (and hence return `old`) |
+| `μ_write` | `Bit` | Whether we are performing a write (and hence do not return `old`) |
+
+## Assumptions
+
+| Ref | Range | Description |
+|-----|-------|-------------|
+| `A1` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `A2` |  | `IS_BIT<write2>` |
+| `A3` |  | `IS_BIT<write4>` |
+| `A4` |  | `IS_BIT<write8>` |
+| `A5` |  | `IS_BIT<write2 + write4 + write8>` |
+| `A6` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+## Constraints
+
+### consistency
+
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `1` | template |  | `IS_BIT<μ_sum>` |  |
+| `2` | arith |  | `w2` => `μ_sum` |  |
+| | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
+| `3` | template |  | `ADD<address_add[0]::DWordWL; base_address, 1>` | w2 |
+| `4` | template | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
+| `5` | template | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
+| `6` | interaction | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
+| `7` | interaction |  | `LT[1; old_timestamp[0], timestamp]` | μ_sum |
+| `8` | interaction |  | `LT[1; old_timestamp[1], timestamp]` | w2 |
+| `9` | interaction | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp]` | w4 |
+| `10` | interaction | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp]` | write8 |
+
+### overflow
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `R1` | interaction | `LT[1; base_address, address_add[0]::DWordWL]` | write2 |
+| `R2` | interaction | `LT[1; base_address, address_add[2]::DWordWL]` | write4 |
+| `R3` | interaction | `LT[1; base_address, address_add[6]::DWordWL]` | write8 |
+
+### memory
+
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `M1` | interaction |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `M2` | interaction |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `M3` | interaction |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `M4` | interaction |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `M5` | interaction | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `M6` | interaction | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `M7` | interaction | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `M8` | interaction | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+
+### output
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `O1` | interaction | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
+| `O2` | interaction | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
diff --git a/docs/spec/mul.md b/docs/spec/mul.md
new file mode 100644
index 000000000..c0efb1d4f
--- /dev/null
+++ b/docs/spec/mul.md
@@ -0,0 +1,97 @@
+# MUL Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs` | `DWordHL` | the left hand operand. |
+| `lhs_signed` | `Bit` | whether to interpret `lhs` as a signed integer (1) or not (0). |
+| `rhs` | `DWordHL` | the right hand operand. |
+| `rhs_signed` | `Bit` | whether to interpret `rhs` as a signed integer (1) or not (0). |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `res` | `QuadWL` | the (extended) multiplication result |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs_is_negative` | `Bit` | whether `lhs` is negative (1) or not (0) |
+| `rhs_is_negative` | `Bit` | whether `rhs` is negative (1) or not (0) |
+| `raw_product` | `B51[4]` | raw multiplication output |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs_ext` | `Half[8]` | sign-extended value of `lhs` |
+| `rhs_ext` | `Half[8]` | sign-extended value of `rhs` |
+| `carry` | `B20[4]` | carry values |
+| `μ_sum` | `BaseField` | sum of multiplicities |
+
+**Definition of `lhs_ext`:**
+```
+lhs_ext (when iter=[0, 3]) := lhs[i]
+lhs_ext (when iter=[4, 7]) := 65535 * lhs_is_negative
+```
+
+**Definition of `rhs_ext`:**
+```
+rhs_ext (when iter=[0, 3]) := rhs[i]
+rhs_ext (when iter=[4, 7]) := 65535 * rhs_is_negative
+```
+
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * (raw_product[0] - res[0])
+carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_lo + μ_hi
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_lo` | `BaseField` |  |
+| `μ_hi` | `BaseField` |  |
+
+## Assumptions
+
+| Ref | Range | Description |
+|-----|-------|-------------|
+| `A1` |  | `IS_HALF[lhs[i]]` |
+| `A2` |  | `IS_HALF[rhs[i]]` |
+| `mul:a:res` |  | `IS_WORD[res[i]]` |
+
+## Constraints
+
+### def
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `mul:c:lhs_is_negative` | template | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
+| `mul:c:rhs_is_negative` | template | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
+| `mul:c:carry` | interaction | `IS_B20[carry[i]]` | μ_sum |
+
+### prod
+
+| Ref | Kind | Description |
+|-----|------|-------------|
+| `mul:c:raw_product` | arith | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
+| | | _polynomial:_ `Σ_(k = 0)^(1) (2^(16 * k) * Σ_(j = 0)^(2 * i + k) (lhs_ext[j] * rhs_ext[2 * i + k - j])) - raw_product[i] = 0` |
+
+### lookup
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `mul:c:lookup_lo` | interaction | `MUL[res[0:4]; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
+| `mul:c:lookup_hi` | interaction | `MUL[res[4:8]; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
diff --git a/docs/spec/shift.md b/docs/spec/shift.md
new file mode 100644
index 000000000..cde491157
--- /dev/null
+++ b/docs/spec/shift.md
@@ -0,0 +1,141 @@
+# SHIFT Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `in` | `DWordHL` | The value being shifted |
+| `shift` | `Byte` | Number of bits to shift `in` by. |
+| `direction` | `Bit` | Whether to shift left (0) or right (1). |
+| `signed` | `Bit` | Whether to interpret `in` as a signed integer. |
+| `word_instr` | `Bit` | Whether this is a Word-instruction (1) or not (0). |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `out` | `DWordWL` | `in` shifted (`<<`, `>>` or `>>>`) by `shift` mod (32 · (2 - `word_instr`)) bits |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `is_negative` | `Bit` | Whether `in` is negative |
+| `bit_shift` | `Byte` | Value by which to shift `in` to obtain `X` and `Y` |
+| `zbs` | `Bit` | Whether `bit_shift` is zero (1) or not (0). |
+| `X` | `Half[5]` | scratch variable. |
+| `Y` | `Half[4]` | scratch variable. |
+| `limb_shift` | `Bit[4]` | One-hot vector indicating whether ⌊`shift` / 16⌋ ≡ i (mod s), where s = 2 when `word_instr` = 1 and s = 4 otherwise. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `extension` | `Half` | sign extension of `in`. |
+| `left` | `Bit` | Whether to perform a left-shift. |
+| `right` | `Bit` | Whether to perform a right-shift. |
+| `intra_limb_left` | `DWordHL` | `in << (shift % 16)` if `left` |
+| `intra_limb_right` | `DWordHL` | `in >>> (shift % 16)` if `right` and `signed`; `in >> (shift % 16)` if `right` and `!signed` |
+| `shifted` | `DWordHL` | `in` shifted (`<<`, `>>` or `>>>`) by `shift` mod (32 · (2 - `word_instr`)) bits |
+
+**Definition of `extension`:**
+```
+extension := 65535 * is_negative
+```
+
+**Definition of `left`:**
+```
+left := μ - direction
+```
+
+**Definition of `right`:**
+```
+right := direction
+```
+
+**Definition of `intra_limb_left`:**
+```
+intra_limb_left (when iter=0) := X[0]
+intra_limb_left (when iter=[1, 3]) := X[i] + Y[i - 1]
+```
+
+**Definition of `intra_limb_right`:**
+```
+intra_limb_right := Y[i] + X[i + 1]
+```
+
+**Definition of `shifted`:**
+```
+shifted := left * Σ_(j = 0)^(i) (limb_shift[j] * intra_limb_left[i - j]) + right * (Σ_(j = 0)^(3 - i) (limb_shift[j] * intra_limb_right[i + j]) + extension * Σ_(j = 3 - i)^(3) limb_shift[j])
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+## Assumptions
+
+| Ref | Range | Description |
+|-----|-------|-------------|
+| `shift:a:range_in` | i ∈ [0, 3] | `IS_HALFWORD[in[i]]` |
+| `shift:a:range_shift` |  | `IS_BYTE[shift]` |
+| `shift:a:direction` |  | `IS_BIT<direction>` |
+| `shift:a:signed` |  | `IS_BIT<signed>` |
+| `shift:a:word_instr` |  | `IS_BIT<word_instr>` |
+
+## Constraints
+
+### left_flag
+
+| Ref | Kind | Description |
+|-----|------|-------------|
+| `shift:c:direction_implies_mu` | arith | `direction` => `μ` = 1 |
+| | | _polynomial:_ `direction * (1 - μ) = 0` |
+| | | _note:_ enforces `left` is `Bit`. |
+
+### is_negative
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `shift:c:is_negative_if_signed` | interaction | `MSB16[is_negative; in[3]]` | signed |
+
+### bit_shift
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `shift:c:bit_shift_if_left` | interaction | `AND_BYTE[bit_shift; shift, 15]` | left |
+| `shift:c:bit_shift_if_right` | interaction | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
+| `shift:c:zbs` | template | `IsZero<zbs; bit_shift>` | μ |
+
+### intra_limb_shift
+
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `shift:c:hwsl_if_not_zero` | interaction | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
+| `shift:c:zbs_implies_X` | arith | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
+| | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
+| `shift:c:hwsl_x4_if_not_zero` | interaction |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
+| `shift:c:zbs_implies_X_4` | arith |  | `zbs` => `X[4]` = 0 |  |
+| | | _polynomial:_ `zbs * X[4] = 0` | |
+| `shift:c:hwslc_if_not_zero` | interaction | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
+| `shift:c:zbs_implies_Y` | arith | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
+
+### limb_shifting
+
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `shift:c:limb_shift_is_bit` | template | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `shift:c:limb_shift_lookup` | interaction |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
+| `shift:c:out_eq_shifted` | arith | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+| | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
+
+### lookups
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `shift:c:lookup` | interaction | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
diff --git a/scripts/extract_and_convert_spec.sh b/scripts/extract_and_convert_spec.sh
new file mode 100755
index 000000000..1ddb0bafa
--- /dev/null
+++ b/scripts/extract_and_convert_spec.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Extract spec TOML files from spec/main branch and convert to Markdown
+#
+# Usage:
+#   ./scripts/extract_and_convert_spec.sh [output_dir]
+#
+# Default output directory: docs/spec
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+OUTPUT_DIR="${1:-$REPO_ROOT/docs/spec}"
+TEMP_DIR=$(mktemp -d)
+# Remove the scratch directory on every exit path, including `set -e` aborts.
+trap 'rm -rf "$TEMP_DIR"' EXIT
+
+echo "Extracting spec files from origin/spec/main..."
+
+# Create temp directory structure
+mkdir -p "$TEMP_DIR/src"
+
+# Extract config
+git show origin/spec/main:spec/src/config.toml > "$TEMP_DIR/src/config.toml" 2>/dev/null || {
+    echo "Error: Could not find spec/src/config.toml in origin/spec/main"
+    echo "Make sure to fetch the branch: git fetch origin spec/main"
+    exit 1
+}
+
+# Extract all chip TOML files; read paths line by line so they are never word-split or globbed
+while IFS= read -r file; do
+    filename=$(basename "$file")
+    # Best effort: a failing `git show` must not abort the run under `set -e`
+    git show "origin/spec/main:$file" > "$TEMP_DIR/src/$filename" 2>/dev/null || true
+done < <(git ls-tree -r origin/spec/main --name-only | grep '^spec/src/.*\.toml$' | grep -v config.toml | grep -v page.toml)
+
+# List extracted files
+echo "Extracted files:"
+ls -la "$TEMP_DIR/src/"
+
+# Create output directory
+mkdir -p "$OUTPUT_DIR"
+
+# Run the Python converter
+echo ""
+echo "Converting to Markdown..."
+python3 "$SCRIPT_DIR/spec_to_md.py" \
+    "$TEMP_DIR/src/config.toml" \
+    "$TEMP_DIR/src/"*.toml \
+    --output-dir "$OUTPUT_DIR"
+
+# Report the result (the EXIT trap removes the temp directory)
+echo ""
+echo "Done! Markdown files written to: $OUTPUT_DIR"
+ls -la "$OUTPUT_DIR"
diff --git a/scripts/spec_to_md.py b/scripts/spec_to_md.py
new file mode 100755
index 000000000..09becab09
--- /dev/null
+++ b/scripts/spec_to_md.py
@@ -0,0 +1,494 @@
+#!/usr/bin/env python3
+"""
+Convert Typst spec TOML files to Markdown.
+
+Usage:
+    # First, extract spec files from the spec/main branch:
+    git show origin/spec/main:spec/src/config.toml > /tmp/spec/config.toml
+    git show origin/spec/main:spec/src/cpu.toml > /tmp/spec/cpu.toml
+    # etc.
+
+    # Then run:
+    python scripts/spec_to_md.py /tmp/spec/config.toml /tmp/spec/cpu.toml
+
+    # Or convert all chips:
+    python scripts/spec_to_md.py /tmp/spec/config.toml /tmp/spec/*.toml
+
+    # Output to a specific directory:
+    python scripts/spec_to_md.py --output-dir docs/spec /tmp/spec/config.toml /tmp/spec/*.toml
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+# Python 3.11+ has tomllib in stdlib, fallback to tomli for older versions
+try:
+    import tomllib
+except ImportError:
+    try:
+        import tomli as tomllib
+    except ImportError:
+        print("Error: Please install tomli: pip install tomli", file=sys.stderr)
+        sys.exit(1)
+
+
+# =============================================================================
+# Expression Rendering
+# =============================================================================
+
+def expr_to_text(expr: any, parent_prec: int = 100) -> str:
+    """
+    Convert a polynomial expression to readable text.
+
+    Expression grammar (from spec/expr.typ):
+        <expr> ::= str                           ; variable name
+                 | int                           ; constant
+                 | ["idx", expr1, expr2]         ; expr1[expr2]
+                 | ["not", expr]                 ; 1 - expr
+                 | ["+", expr1, expr2, ...]      ; expr1 + expr2 + ...
+                 | ["sum", expr1, expr2, expr3]  ; sum from expr1 to expr2 of expr3
+                 | ["*", expr1, expr2, ...]      ; expr1 * expr2 * ...
+                 | ["/", expr1, expr2]           ; expr1 / expr2
+                 | ["^", expr1, expr2]           ; expr1^expr2
+                 | ["=", expr1, expr2]           ; expr1 = expr2
+                 | [":=", expr1, expr2]          ; expr1 := expr2
+                 | ["-", expr]                   ; -expr (unary)
+                 | ["-", expr1, expr2, ...]      ; expr1 - expr2 - ... (binary)
+                 | ["cast", expr, type]          ; expr::type
+    """
+    PREC = {
+        "idx": 0,
+        "pow": 1,
+        "neg": 2,
+        "cast": 3,
+        "mul": 4,
+        "div": 5,
+        "sum": 6,
+        "not": 7,
+        "add": 8,
+        "sub": 9,
+        "eq": 10,
+    }
+
+    def wrap(s: str, prec: int) -> str:
+        return f"({s})" if parent_prec < prec else s
+
+    if expr is None or expr == "":
+        return ""
+
+    if isinstance(expr, str):
+        return expr
+
+    if isinstance(expr, (int, float)):
+        return str(expr)
+
+    if isinstance(expr, list) and len(expr) > 0:
+        op = expr[0]
+
+        if op == "idx":
+            # expr1[expr2]
+            base = expr_to_text(expr[1], PREC["idx"])
+            idx = expr_to_text(expr[2], 100)
+            return f"{base}[{idx}]"
+
+        elif op == "not":
+            # 1 - expr
+            inner = expr_to_text(expr[1], PREC["not"])
+            return wrap(f"1 - {inner}", PREC["not"])
+
+        elif op == "+":
+            # expr1 + expr2 + ...
+            parts = [expr_to_text(e, PREC["add"]) for e in expr[1:]]
+            return wrap(" + ".join(parts), PREC["add"])
+
+        elif op == "sum":
+            # Σ from expr1 to expr2 of expr3
+            var = expr_to_text(expr[1], 100)
+            upper = expr_to_text(expr[2], 100)
+            body = expr_to_text(expr[3], PREC["sum"])
+            return f"Σ_{var}^{upper} {body}"
+
+        elif op == "*":
+            # expr1 * expr2 * ...
+            parts = [expr_to_text(e, PREC["mul"]) for e in expr[1:]]
+            return wrap(" * ".join(parts), PREC["mul"])
+
+        elif op == "/":
+            # expr1 / expr2
+            num = expr_to_text(expr[1], PREC["div"])
+            den = expr_to_text(expr[2], PREC["div"])
+            return wrap(f"{num} / {den}", PREC["div"])
+
+        elif op == "^":
+            # expr1^expr2
+            base = expr_to_text(expr[1], PREC["pow"])
+            exp = expr_to_text(expr[2], PREC["pow"])
+            return f"{base}^{exp}"
+
+        elif op == "=":
+            # expr1 = expr2
+            lhs = expr_to_text(expr[1], PREC["eq"])
+            rhs = expr_to_text(expr[2], PREC["eq"])
+            return f"{lhs} = {rhs}"
+
+        elif op == ":=":
+            # expr1 := expr2
+            lhs = expr_to_text(expr[1], PREC["eq"])
+            rhs = expr_to_text(expr[2], PREC["eq"])
+            return f"{lhs} := {rhs}"
+
+        elif op == "-":
+            if len(expr) == 2:
+                # Unary negation
+                inner = expr_to_text(expr[1], PREC["neg"])
+                return wrap(f"-{inner}", PREC["neg"])
+            else:
+                # Binary subtraction
+                parts = [expr_to_text(e, PREC["sub"]) for e in expr[1:]]
+                return wrap(" - ".join(parts), PREC["sub"])
+
+        elif op == "cast":
+            # expr::type
+            inner = expr_to_text(expr[1], PREC["cast"])
+            type_str = type_to_text(expr[2])
+            return wrap(f"{inner}::{type_str}", PREC["cast"])
+
+        else:
+            # Unknown operator, render as-is
+            return str(expr)
+
+    return str(expr)
+
+
+def type_to_text(typ: any) -> str:
+    """Convert a type to text."""
+    if isinstance(typ, str):
+        return typ
+    if isinstance(typ, list) and len(typ) == 2:
+        return f"{typ[0]}[{typ[1]}]"
+    return str(typ)
+
+
+def iters_to_text(obj: dict) -> str:
+    """Extract iterator ranges from a constraint/assumption."""
+    iters = []
+
+    if "iter" in obj:
+        it = obj["iter"]
+        if isinstance(it, list) and len(it) == 3:
+            iters.append(f"{it[0]} ∈ [{it[1]}, {it[2]}]")
+        elif isinstance(it, list) and len(it) == 2:
+            iters.append(f"{it[0]} = {it[1]}")
+
+    if "iters" in obj:
+        for it in obj["iters"]:
+            if isinstance(it, list) and len(it) == 3:
+                iters.append(f"{it[0]} ∈ [{it[1]}, {it[2]}]")
+            elif isinstance(it, list) and len(it) == 2:
+                iters.append(f"{it[0]} = {it[1]}")
+
+    return ", ".join(iters)
+
+
+# =============================================================================
+# Markdown Generation
+# =============================================================================
+
+def escape_md(s: str) -> str:
+    """Escape pipe characters for Markdown tables."""
+    if s is None:
+        return ""
+    return str(s).replace("|", "\\|").replace("\n", " ")
+
+
+def render_variables_table(variables: dict, config: dict) -> str:
+    """Render variables as Markdown tables, grouped by category."""
+    lines = []
+
+    category_order = ["input", "output", "auxiliary", "virtual", "multiplicity", "condition"]
+
+    for category in category_order:
+        if category not in variables:
+            continue
+
+        vars_list = variables[category]
+        if not vars_list:
+            continue
+
+        lines.append(f"### {category.capitalize()}")
+        lines.append("")
+        lines.append("| Name | Type | Description |")
+        lines.append("|------|------|-------------|")
+
+        for var in vars_list:
+            name = f"`{var['name']}`"
+            typ = f"`{type_to_text(var.get('type', ''))}`"
+            desc = escape_md(var.get('desc', ''))
+            # Clean up Typst markup in descriptions
+            desc = desc.replace('#`', '`').replace('`#', '`')
+            lines.append(f"| {name} | {typ} | {desc} |")
+
+        # Add definition if present (for virtual variables)
+        for var in vars_list:
+            if "def" in var:
+                defn = var["def"]
+                lines.append("")
+                lines.append(f"**Definition of `{var['name']}`:**")
+                if isinstance(defn, dict):
+                    if "poly" in defn:
+                        lines.append(f"```")
+                        lines.append(f"{var['name']} := {expr_to_text(defn['poly'])}")
+                        lines.append(f"```")
+                    elif "polys" in defn:
+                        lines.append(f"```")
+                        for i, p in enumerate(defn["polys"]):
+                            iter_str = ""
+                            if "iter" in p:
+                                iter_str = f" (when iter={p['iter']})"
+                            lines.append(f"{var['name']}{iter_str} := {expr_to_text(p['poly'])}")
+                        lines.append(f"```")
+                elif isinstance(defn, (list, str)):
+                    lines.append(f"```")
+                    lines.append(f"{var['name']} := {expr_to_text(defn)}")
+                    lines.append(f"```")
+
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+def render_assumptions_table(assumptions: list) -> str:
+    """Render assumptions as a Markdown table."""
+    if not assumptions:
+        return ""
+
+    lines = []
+    lines.append("## Assumptions")
+    lines.append("")
+    lines.append("| Ref | Range | Description |")
+    lines.append("|-----|-------|-------------|")
+
+    for i, assumption in enumerate(assumptions, 1):
+        ref = assumption.get("ref", f"A{i}")
+        iters = iters_to_text(assumption)
+        desc = escape_md(assumption.get("desc", ""))
+        lines.append(f"| `{ref}` | {iters} | {desc} |")
+
+    lines.append("")
+    return "\n".join(lines)
+
+
+def render_constraints_table(constraints: dict, constraint_groups: list) -> str:
+    """Render constraints as Markdown tables, grouped by constraint group."""
+    if not constraints:
+        return ""
+
+    lines = []
+    lines.append("## Constraints")
+    lines.append("")
+
+    # Build group lookup
+    group_info = {g["name"]: g for g in constraint_groups}
+
+    for group_name, group_constraints in constraints.items():
+        if not group_constraints:
+            continue
+
+        group = group_info.get(group_name, {"name": group_name})
+        prefix = group.get("prefix", "")
+        group_desc = group.get("desc", "")
+
+        lines.append(f"### {group_name}")
+        if group_desc:
+            lines.append(f"_{group_desc}_")
+        lines.append("")
+
+        # Determine columns needed
+        has_multiplicity = any("multiplicity" in c for c in group_constraints)
+        has_iter = any(iters_to_text(c) for c in group_constraints)
+
+        # Build header
+        if has_iter and has_multiplicity:
+            header = "| Ref | Kind | Range | Description | Multiplicity |"
+            separator = "|-----|------|-------|-------------|--------------|"
+        elif has_iter:
+            header = "| Ref | Kind | Range | Description |"
+            separator = "|-----|------|-------|-------------|"
+        elif has_multiplicity:
+            header = "| Ref | Kind | Description | Multiplicity |"
+            separator = "|-----|------|-------------|--------------|"
+        else:
+            header = "| Ref | Kind | Description |"
+            separator = "|-----|------|-------------|"
+
+        lines.append(header)
+        lines.append(separator)
+
+        for i, constraint in enumerate(group_constraints, 1):
+            ref = constraint.get("ref", f"{prefix}{i}")
+            kind = constraint.get("kind", "")
+            tag = constraint.get("tag", "")
+            iters = iters_to_text(constraint)
+            mult = expr_to_text(constraint.get("multiplicity", ""))
+
+            # Build description based on kind
+            if kind == "interaction":
+                inputs = ", ".join(expr_to_text(inp) for inp in constraint.get("input", []))
+                output = constraint.get("output")
+                if output:
+                    desc = f"`{tag}[{expr_to_text(output)}; {inputs}]`"
+                else:
+                    desc = f"`{tag}[{inputs}]`"
+
+            elif kind == "arith":
+                desc = escape_md(constraint.get("constraint", ""))
+                # Clean up Typst math markup
+                desc = desc.replace("$", "").replace("#", "")
+
+            elif kind == "template":
+                inputs = ", ".join(expr_to_text(inp) for inp in constraint.get("input", []))
+                output = constraint.get("output")
+                cond = constraint.get("cond")
+                cond_str = f"{expr_to_text(cond)} ⇒ " if cond else ""
+                if output:
+                    desc = f"{cond_str}`{tag}<{expr_to_text(output)}; {inputs}>`"
+                else:
+                    desc = f"{cond_str}`{tag}<{inputs}>`"
+
+            else:
+                desc = str(constraint)
+
+            # Build row
+            row = f"| `{ref}` | {kind} |"
+            if has_iter:
+                row += f" {iters} |"
+            row += f" {desc} |"
+            if has_multiplicity:
+                row += f" {mult} |"
+
+            lines.append(row)
+
+            # Add polynomial constraint if present
+            if kind == "arith" and ("poly" in constraint or "polys" in constraint):
+                if "poly" in constraint:
+                    poly_str = expr_to_text(constraint["poly"])
+                    lines.append(f"| | | _polynomial:_ `{poly_str} = 0` |" + (" |" if has_multiplicity else ""))
+                elif "polys" in constraint:
+                    for poly in constraint["polys"]:
+                        poly_str = expr_to_text(poly)
+                        lines.append(f"| | | _polynomial:_ `{poly_str} = 0` |" + (" |" if has_multiplicity else ""))
+
+            # Add description if present
+            if "desc" in constraint and kind == "arith":
+                desc_text = escape_md(constraint["desc"])
+                lines.append(f"| | | _note:_ {desc_text} |" + (" |" if has_multiplicity else ""))
+
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+def chip_to_markdown(chip: dict, config: dict) -> str:
+    """Convert a chip TOML to Markdown."""
+    lines = []
+
+    name = chip.get("name", "Unknown")
+    lines.append(f"# {name} Chip")
+    lines.append("")
+
+    # Variables
+    variables = chip.get("variables", {})
+    if variables:
+        lines.append("## Columns")
+        lines.append("")
+        lines.append(render_variables_table(variables, config))
+
+    # Assumptions
+    assumptions = chip.get("assumptions", [])
+    if assumptions:
+        lines.append(render_assumptions_table(assumptions))
+
+    # Constraints
+    constraints = chip.get("constraints", {})
+    constraint_groups = chip.get("constraint_groups", [])
+    if constraints:
+        lines.append(render_constraints_table(constraints, constraint_groups))
+
+    return "\n".join(lines)
+
+
+# =============================================================================
+# Main
+# =============================================================================
+
+def load_toml(path: Path) -> dict:
+    """Load a TOML file."""
+    with open(path, "rb") as f:
+        return tomllib.load(f)
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Convert Typst spec TOML files to Markdown",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+    parser.add_argument(
+        "config",
+        type=Path,
+        help="Path to config.toml"
+    )
+    parser.add_argument(
+        "chips",
+        type=Path,
+        nargs="+",
+        help="Paths to chip TOML files (e.g., cpu.toml, lt.toml)"
+    )
+    parser.add_argument(
+        "--output-dir", "-o",
+        type=Path,
+        default=None,
+        help="Output directory for Markdown files (default: stdout)"
+    )
+
+    args = parser.parse_args()
+
+    # Load config
+    config = load_toml(args.config)
+
+    # Process each chip
+    for chip_path in args.chips:
+        # Skip config.toml if passed as chip
+        if chip_path.name == "config.toml":
+            continue
+
+        # Skip non-chip TOML files
+        if chip_path.name in ("page.toml", "theme-style.toml"):
+            continue
+
+        try:
+            chip = load_toml(chip_path)
+        except Exception as e:
+            print(f"Warning: Failed to load {chip_path}: {e}", file=sys.stderr)
+            continue
+
+        # Check if it's a valid chip file (has 'name' field)
+        if "name" not in chip:
+            continue
+
+        md_content = chip_to_markdown(chip, config)
+
+        if args.output_dir:
+            args.output_dir.mkdir(parents=True, exist_ok=True)
+            output_path = args.output_dir / f"{chip_path.stem}.md"
+            with open(output_path, "w") as f:
+                f.write(md_content)
+            print(f"Generated: {output_path}")
+        else:
+            print(md_content)
+            print("\n" + "=" * 80 + "\n")
+
+
+if __name__ == "__main__":
+    main()

From 4ca2a4e304b1aba3d03d7c3e4f9fa1fe16aa633f Mon Sep 17 00:00:00 2001
From: Cyprien de Saint Guilhem <c.desaintguilhem@gmail.com>
Date: Thu, 22 Jan 2026 08:48:10 -0800
Subject: [PATCH 039/105] fix(spec): Correct typo in spec README and align
 style (#210)

---
 spec/README.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/spec/README.md b/spec/README.md
index d841017cb..127e528c8 100644
--- a/spec/README.md
+++ b/spec/README.md
@@ -3,9 +3,9 @@ This repository contains specification for [`LambdaVM`](https://github.com/yetan
 The specification is written in [`Typst`](https://typst.app/) and can be rendered by [`shiroa`](https://myriad-dreamin.github.io/shiroa/) as either a file (pdf) or a wiki (html).
 
 ## Installation & Development setup
-1. [Install `Typst`](https://github.com/typst/typst?tab=readme-ov-file#installation)
-2. [Install `shiroa`](https://myriad-dreamin.github.io/shiroa/guide/installation.html)
-3. Clone this reposity
+1. [Install `Typst`](https://github.com/typst/typst?tab=readme-ov-file#installation).
+2. [Install `shiroa`](https://myriad-dreamin.github.io/shiroa/guide/installation.html).
+3. Clone this repository.
 4. Open the repository in a terminal and execute `shiroa serve`.
 
-At this point, the wiki version is hosted locally and is actively updated as you modify the specification files.
\ No newline at end of file
+At this point, the wiki version is hosted locally and is actively updated as you modify the specification files.

From 5e6fdc3eeb8630bde17699fb7fe806b2fe092c14 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Fri, 23 Jan 2026 12:05:47 +0100
Subject: [PATCH 040/105] spec: CPU padding (#195)

* spec: CPU fast path for x0 reads

* Do not write/read pc when in a padding row

* specify padding for the CPU

* Apply suggestions from code review

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* spec: DECODE: update padding row

* spec: DECODE: explain 'one more instruction'

* spec: CPU: fix c_type_instruction typo

* Apply suggestions from code review

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* spec: align `packed_decode` in  `DECODE` and `CPU`

* spec: DECODE: add `read_registerX` to `packed_decode`

* spec: DECODE: specify `read_register1` and `2`

* spec: DECODE: update pc padding value

* spec: DECODE: several small fixes

* spec: DECODE: fix ECALL's rs2 value

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

* spec: DECODE: minor rewording

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

* spec: DECODE: minor fix

---------

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
Co-authored-by: Erik Takke <erik.takke@3milabs.tech>
---
 spec/cpu.typ                      |  10 ++
 spec/decode.typ                   | 125 +++++++++++++-----------
 spec/src/cpu.toml                 | 156 +++++++++++++++++++++++-------
 spec/src/decode.toml              |  63 ++++++------
 spec/src/decode_uncompressed.toml |  10 ++
 5 files changed, 241 insertions(+), 123 deletions(-)

diff --git a/spec/cpu.typ b/spec/cpu.typ
index 00a33f5a2..784d750d2 100644
--- a/spec/cpu.typ
+++ b/spec/cpu.typ
@@ -6,6 +6,7 @@
   total_nr_variables,
   total_nr_instantiated_columns,
   render_constraint_table,
+  render_chip_padding_table,
 )
 
 #let config = load_config()
@@ -82,3 +83,12 @@ For @cpu:c:is_equal, refer to the logic of IsZero or IsEqual, in combination wit
 #render_constraint_table(chip, config, groups: "misc")
 
 #rj[Document the choice to not have a multiplicity column here for padding]
+
+== Padding
+
+The CPU can be padded with the following values, which have a corresponding row
+in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
+
+#render_chip_padding_table(chip, config)
+
+This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
diff --git a/spec/decode.typ b/spec/decode.typ
index 24846d2c1..d8332e033 100644
--- a/spec/decode.typ
+++ b/spec/decode.typ
@@ -32,8 +32,13 @@ The #decode table is comprised of #nr_variables variables that are expressed usi
 == Padding
 The #decode table must be padded to a length that is a power of two.
 Empty rows with the following content can be added to achieve this:
+
 #render_chip_padding_table(chip, config)
 
+Note that this row sets the `EBREAK` flag.
+Given that `CPU` asserts that `EBREAK = 0` (see @cpu:c:ebreak_traps), using this "padding-instruction" would immediately make the CPU table unprovable.
+Note moreover that the `pc` is set to $7$.
+This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _$4$_ (i.e., the max `pc`-increment) greater than _$1$_ (i.e., the `pc`-value used in the #link(<cpu-padding-decode-row>)[additional instruction] referred to by `CPU`-padding lines).
 
 == Decoding
 For the purposes of explaining decoding, we decompress #decode's `packed_decode` variable into its constituent variables.
@@ -46,16 +51,17 @@ Note that the below table is _not_ used in practice: it is solely used for the p
 
 We will illustrate how each instruction should be expressed in this (uncompressed) decoding table.
 The columns of the accompanying table represent the following:
-- *`operation`*: the assembly operation being encoded,
+- *`operation`*: the assembly operation being encoded.
 - *`op-flag`*: which of the "`ALU` selector flags" operation flags to set. Each operation sets exactly one.
-- *`w_reg`*, *`w_instr`*, *`signed`*: whether to set the `write_register`, `word_instr` or `signed` flag, respectively,
+- *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively.
 - *other*: the other flags that should be set or variables that should be given specific values.
 
 For the purpose of brevity and readability, the table uses the following rules-of-thumb:
 + `rd`, `rs1`, `rs2`, and `imm` are mapped to the values provided by the instruction;
   when a value is not specified by an instruction it defaults to $0$.
++ `read_register1`, `read_register2` and `write_register` are set to $1$ when, respectively, $#`rs1` != 0$, $#`rs2` != 0$, or $#`rd` != 0$.
 + Any flag that is not listed is set to $0$, with the exception of the `c_type` flag. 
-  *The `c_type` flag is set independently of the below table*, as explained below.
+  *The `c_type` flag is set independently of the below table*, as explained next.
 
 Further clarification is provided in the notes following the table.
 
@@ -74,19 +80,19 @@ To indicate an instruction is provided in compressed form, the `c_type` flag is
   show figure: set block(breakable: true)
 
   figure(table(
-    columns: (auto, auto, 40pt, 40pt, 40pt, 1fr, 15pt),
+    columns: (auto, auto, 40pt, 40pt, 1fr, 15pt),
     stroke: 0pt,
     inset: (right: .5em),
-    align: (left, right, center, center, center, left, right),
+    align: (left, right, center, center, left, right),
     fill: (_, y) =>
       if calc.odd(y) and y <= lines.len() { luma(245) }
       else { white },
-    table.header([*Operation*], [*op-flag*], [*`w_reg`*], [*`w_instr`*], [*`signed`*], [*other*], []),
+    table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []),
     table.hline(stroke: 1.5pt),
     table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt),
     ..lines.flatten(),
     table.hline(stroke: 1.5pt),
-    table.footer([*Operation*], [*op-flag*], [*`w_reg`*], [*`w_instr`*], [*`signed`*], [*other*]),
+    table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]),
     ),  
     caption: [Decoding table]
   )
@@ -94,56 +100,56 @@ To indicate an instruction is provided in compressed form, the `c_type` flag is
 
 #let decoding = (
     // OP-IMM
-  ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [$#`rd` eq.not 0$], [`[W]`], [], [], [#ref_note(<note_w_reg>, <note_word_instr>)]),
-  ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [$#`rd` eq.not 0$], [], [#sym.not`[U]`], [], [#ref_note(<note_w_reg>, <note_signed>)]),
-  ([`ANDI      rd, rs1, imm`], [`AND`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
-  ([`ORI       rd, rs1, imm`], [`OR`],  [$#`rd` eq.not 0$],  [], [], [], [#ref_note(<note_w_reg>)]),
-  ([`XORI      rd, rs1, imm`], [`XOR`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
-  ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [], [], [#ref_note(<note_w_reg>)]),
-  ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [], [`mp_selector`], [#ref_note(<note_w_reg>, <note_word_instr>)]),
-  ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], [#ref_note(<note_word_instr>)]),
+  ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
+  ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []),
+  ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []),
+  ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []),
+  ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []),
+  ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], [#ref_note(<note_word_instr>)]),
+  ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_word_instr>)]),
   // OP
-  ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [$#`rd` eq.not 0$], [`[W]`], [], [], [#ref_note(<note_w_reg>, <note_word_instr>)]),
-  ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [$#`rd` eq.not 0$], [`[W]`], [], [], [#ref_note(<note_w_reg>, <note_word_instr>)]),
-  ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [$#`rd` eq.not 0$], [], [#sym.not`[U]`], [], [#ref_note(<note_w_reg>, <note_signed>)]),
-  ([`AND       rd, rs1, rs2`], [`AND`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
-  ([`OR        rd, rs1, rs2`], [`OR`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
-  ([`XOR       rd, rs1, rs2`], [`XOR`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
-  ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [], [], [#ref_note(<note_w_reg>, <note_word_instr>)]),
-  ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [], [`mp_selector`], [#ref_note(<note_w_reg>, <note_word_instr>)]),
-  ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [$#`rd` eq.not 0$], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_w_reg>, <note_word_instr>)]),
+  ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], [#ref_note(<note_word_instr>)]),
+  ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], [#ref_note(<note_word_instr>)]),
+  ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
+  ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []),
+  ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []),
+  ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []),
+  ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], [#ref_note(<note_word_instr>)]),
+  ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], [#ref_note(<note_word_instr>)]),
+  ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_word_instr>)]),
   // OP - M
-  ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [$#`rd` eq.not 0$], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_w_reg>, <note_word_instr>)]),
-  ([`MULH      rd, rs1, rs2`], [`MUL`], [$#`rd` eq.not 0$], [], [1], [`mp_selector`, `muldiv_selector`], [#ref_note(<note_w_reg>)]),
-  ([`MULHU     rd, rs1, rs2`], [`MUL`], [$#`rd` eq.not 0$], [], [], [`muldiv_selector`], [#ref_note(<note_w_reg>)]),
-  ([`MULHSU    rd, rs1, rs2`], [`MUL`], [$#`rd` eq.not 0$], [], [1], [`muldiv_selector`], [#ref_note(<note_w_reg>)]),
-  ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [$#`rd` eq.not 0$], [`[W]`], [#sym.not`[U]`], [], [#ref_note(<note_w_reg>, <note_word_instr>, <note_signed>)]),
-  ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [$#`rd` eq.not 0$], [`[W]`], [#sym.not`[U]`], [`muldiv_selector`], [#ref_note(<note_w_reg>, <note_word_instr>, <note_signed>)]),
+  ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_word_instr>)]),
+  ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []),
+  ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []),
+  ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []),
+  ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [#sym.not`[U]`], [], [#ref_note(<note_word_instr>, <note_signed>)]),
+  ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [#sym.not`[U]`], [`muldiv_selector`], [#ref_note(<note_word_instr>, <note_signed>)]),
   // LUI/AUIPC
-  ([`LUI       rd, imm`], [`ADD`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>, <note-lui>)]),
-  ([`AUIPC     rd, imm`], [`ADD`], [$#`rd` eq.not 0$], [], [], [`rs1 := x255`], [#ref_note(<note_w_reg>, <note-auipc>)]),
-  ([`JAL       rd, imm`], [`JALR`], [$#`rd` eq.not 0$], [], [], [`rs1 := x255`], [#ref_note(<note_w_reg>, <note-jal>)]),
+  ([`LUI       rd, imm`], [`ADD`], [], [], [], [#ref_note(<note-lui>)]),
+  ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], [#ref_note(<note-auipc>)]),
+  ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], [#ref_note(<note-jal>)]),
   // Branching
-  ([`JALR      rd, rs1, imm`], [`JALR`], [$#`rd` eq.not 0$], [], [], [], [#ref_note(<note_w_reg>)]),
-  ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], [], []),
-  ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [], [`mp_selector`], []),
-  ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
-  ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [], [#sym.not`[U]`], [`mp_selector`], [#ref_note(<note_signed>)]),
+  ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []),
+  ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []),
+  ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []),
+  ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
+  ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [#sym.not`[U]`], [`mp_selector`], [#ref_note(<note_signed>)]),
   // LOAD
-  ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [], [`mem_8B`], []),
-  ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [], [#sym.not`[U]`], [`mem_4B`], [#ref_note(<note_signed>)]),
-  ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [], [#sym.not`[U]`], [`mem_2B`], [#ref_note(<note_signed>)]),
-  ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
+  ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []),
+  ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [#sym.not`[U]`], [`mem_4B`], [#ref_note(<note_signed>)]),
+  ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [#sym.not`[U]`], [`mem_2B`], [#ref_note(<note_signed>)]),
+  ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
   // STORE
-  ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [], [`mem_8B`], []),
-  ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [], [`mem_4B`], []),
-  ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [], [`mem_2B`], []),
-  ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], [], []),
+  ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []),
+  ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []),
+  ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []),
+  ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []),
   // ECALL/EBREAK
-  ([`ECALL`], [`ECALL`], [1], [], [], [$#`rs1` := #`x17`$, $#`rs2` := #`x11`$, $#`rd` := #`x10`$], [#ref_note(<note-ecall>)]),
-  ([`EBREAK`], [`EBREAK`], [], [], [], [], []),
+  ([`ECALL`], [`ECALL`], [], [], [$#`rs1` := #`x17`$, $#`rs2` := #`x10`$, $#`rd` := #`x10`$], [#ref_note(<note-ecall>)]),
+  ([`EBREAK`], [`EBREAK`], [], [], [], []),
   // FENCE
-  ([`FENCE`], [`ADD`], [], [], [], [], [#ref_note(<note-fence>)]),
+  ([`FENCE`], [`ADD`], [], [], [], [#ref_note(<note-fence>)]),
 )
 
 #decoding_table(decoding)
@@ -157,16 +163,10 @@ To indicate an instruction is provided in compressed form, the `c_type` flag is
 ==== Notes
 We note the following about the above decoding table:
 #enum(numbering: "[1]",
-  enum.item(
-    referenceable_note(
-      "note_w_reg",
-      [`write_register`: $#`rd` eq.not 0$ indicates that $#`write_register` = 1$ when $#`rd` eq.not 0$ and $0$ otherwise.]
-    )
-  ),
   enum.item(
     referenceable_note(
       "note_word_instr",
-      [`word_instr`: `[W]` indicates that $#`word_instr` = 0$ for the `W`-variant of the operation, and $0$ for the non-`W`-variant.]
+      [`word_instr`: `[W]` indicates that $#`word_instr` = 1$ for the `W`-variant of the operation, and $0$ for the non-`W`-variant.]
     )
   ),
   enum.item(
@@ -194,9 +194,9 @@ We note the following about the above decoding table:
   enum.item(
     referenceable_note(
       "note-jal",
-      [`JAL`: this operation stores `pc + 4` in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`.
+      [`JAL`: this operation stores $#`pc` + 4$ in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`.
       Note that this can be represented using `JALR rd, x255, imm`.
-      As such, *we expect the decoding to take care of writing the immediate in bit range $[1:13]$ of `imm` and extending it to 64 bits; the least significant bit should always be 0.*]
+      As such, *we expect the decoding to take care of writing the immediate in bit range $[1:21]$ of `imm` and extending it to 64 bits; the least significant bit should always be 0.*]
     )
   ),
   enum.item(
@@ -206,7 +206,7 @@ We note the following about the above decoding table:
       "On RISC-V a system call has its own instruction: `ECALL`. A system call can have up to 7 arguments and has 1 return value. The arguments are in registers A0-A6, in that order, and the return value is written into A0 before giving back control to the guest. A7 contains the system call number." #link("https://libriscv.no/docs/concepts/syscalls/#the-risc-v-system-call-abi")[[source]]
       As such,
       - syscall number in A7 (= register `x17`)
-      - first syscall argument in A1 (= register `x11`)
+      - first syscall argument in A0 (= register `x10`)
       - syscall output in A0 (= register `x10`)]
     )
   ),
@@ -217,3 +217,10 @@ We note the following about the above decoding table:
     )
   )
 )
+
+== One more instruction <cpu-padding-decode-row>
+In addition to decoding all instructions provided in the ELF and adding a corresponding entry to the #decode table, one must include an entry that has $#`pc` = 1$ and every other variable set to $0$.
+Note that this will never conflict with any entry in the ELF, since it has an odd `pc` value.
+
+This entry is used to pad the `CPU` table.
+More details on this matter are provided in the `CPU` chip.
\ No newline at end of file
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index a8345c820..49a78ee15 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -13,48 +13,69 @@ desc = "A preprocessed timestamp to coordinate the memory argument. Since we hav
 name = "pc"
 type = "DWordWL"
 desc = "The program counter"
+pad = 1
 
 [[variables.input]]
 name = "rs1"
 type = "Byte"
 desc = "Source register 1 index"
+pad = 0
 
 [[variables.input]]
 name = "rs2"
 type = "Byte"
 desc = "Source register 2 index"
+pad = 0
 
 [[variables.input]]
 name = "rd"
 type = "Byte"
 desc = "Destination register index"
+pad = 0
+
+[[variables.input]]
+name = "read_register1"
+type = "Bit"
+desc = "Whether to read from `rs1` (1) or to place a 0 in `rv1` (0)"
+pad = 0
+
+[[variables.input]]
+name = "read_register2"
+type = "Bit"
+desc = "Whether to read from `rs2` (1) or to place a 0 in `rv2` (0)"
+pad = 0
 
 [[variables.input]]
 name = "write_register"
 type = "Bit"
 desc = "Whether to write back to the destination register"
+pad = 0
 
 # TODO: can we compress this to a single value? (1: is it worth it, 2: does it work)
 [[variables.input]]
 name = "memory_2bytes"
 type = "Bit"
 desc = "Whether the memory access (read or write) touches exactly 2 bytes"
+pad = 0
 
 [[variables.input]]
 name = "memory_4bytes"
 type = "Bit"
 desc = "Whether the memory access (read or write) touches exactly 4 bytes"
+pad = 0
 
 [[variables.input]]
 name = "memory_8bytes"
 type = "Bit"
 desc = "Whether the memory access (read or write) touches exactly 8 bytes"
+pad = 0
 
 # TODO: Are there usecases where it's nicer to just have this as a length constant?
 [[variables.input]]
 name = "c_type_instruction"
 type = "Bit"
 desc = "Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4"
+pad = 0
 
 # TODO: Should this just be a word? (CHECK: effect on computation/extension of arg2)
 # TODO: make sure decode correctly extends this (may be zero for unsigned and word_instr?)
@@ -62,11 +83,13 @@ desc = "Whether the instruction is of C type, i.e., whether it is 2 bytes long i
 name = "imm"
 type = "DWordWL"
 desc = "The fully extended 64-bit version of the immediate"
+pad = 0
 
 [[variables.input]]
 name = "signed"
 type = "Bit"
 desc = "Indicates whether we're dealing with a signed or unsigned instruction"
+pad = 0
 
 [[variables.input]]
 name = "mp_selector"
@@ -75,96 +98,115 @@ desc = """Multi-purpose selector used by different ALU operations for different
     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and
     - as flag for inverting the condition of conditional branches (see `branch_cond`)
     - as direction (left or right) for `SHIFT`"""
+pad = 0
 
 [[variables.input]]
 name = "muldiv_selector"
 type = "Bit"
 desc = "Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted"
+pad = 0
 
 [[variables.input]]
 name = "word_instr"
 type = "Bit"
 desc = "Whether the instruction is a \\*W instruction, requiring the inputs and outputs to be (sign) extended"
+pad = 0
 
 [[variables.input]]
 name = "ADD"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "SUB"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "SLT"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "AND"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "OR"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "XOR"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "SHIFT"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "JALR"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "BEQ"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "BLT"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "LOAD"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "STORE"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "MUL"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "DIVREM"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "ECALL"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 [[variables.input]]
 name = "EBREAK"
 type = "Bit"
 desc = "One-hot ALU selector flag"
+pad = 0
 
 
 # Output
@@ -172,102 +214,122 @@ desc = "One-hot ALU selector flag"
 name = "next_pc"
 type = "DWordWL"
 desc = "The program counter for the next instruction"
+pad = 5
 
 [[variables.output]]
 name = "rvd"
 type = "DWordWL"
 desc = "The value to (maybe) be written back to rvd"
+pad = 0
 
 # Auxiliary
 [[variables.auxiliary]]
 name = "rv1"
 type = "DWordWHH"
 desc = "The value of register `rs1`"
+pad = 0
 
 [[variables.auxiliary]]
 name = "rv2"
 type = "DWordWHH"
 desc = "The value of register `rs2`"
+pad = 0
 
 [[variables.auxiliary]]
 name = "rv1_sign_bit"
 type = "Bit"
 desc = "The sign bit of `rv1` if seen as a 32-bit word"
+pad = 0
 
 [[variables.auxiliary]]
 name = "arg1"
 type = "DWordBL"
-desc = "The extended version of `rv1`, depending on `c_type_instruction`"
+desc = "The extended version of `rv1`, depending on `word_instr`"
+pad = 0
 
 [[variables.auxiliary]]
 name = "arg2_sign_bit"
 type = "Bit"
 desc = "The sign bit of `arg2` if seen as a 32-bit word"
+pad = 0
 
 [[variables.auxiliary]]
 name = "arg2"
 type = "DWordBL"
 desc = "A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls"
+pad = 0
 
 [[variables.auxiliary]]
 name = "res_sign_bit"
 type = "Bit"
 desc = "The sign bit of `res`, if seen as a 32-bit word"
+pad = 0
 
 [[variables.auxiliary]]
 name = "res"
 type = "DWordBL"
 desc = "The ALU result"
+pad = 0
 
 [[variables.auxiliary]]
 name = "is_equal"
 type = "Bit"
 desc = "Whether `rv1` and `arg2` are equal"
+pad = 0
 
 [[variables.auxiliary]]
 name = "branch_cond"
 type = "Bit"
 desc = "Whether a branch is taken, i.e., the branch condition"
+pad = 0
 
 # Virtual
 [[variables.virtual]]
 name = "packed_decode"
 type = "BaseField"
 desc = "A packed representation of all bit flags and register indices obtained from the decoding"
-poly = ["+",
-    ["*", ["^", 2, 0], "write_register"],
-    ["*", ["^", 2, 1], "memory_2bytes"],
-    ["*", ["^", 2, 2], "memory_4bytes"],
-    ["*", ["^", 2, 3], "memory_8bytes"],
-    ["*", ["^", 2, 4], "c_type_instruction"],
-    ["*", ["^", 2, 5], "signed"],
-    ["*", ["^", 2, 6], "mp_selector"],
-    ["*", ["^", 2, 7], "muldiv_selector"],
-    ["*", ["^", 2, 8], "word_instr"],
-    ["*", ["^", 2, 9], "ADD"],
-    ["*", ["^", 2, 10], "SUB"],
-    ["*", ["^", 2, 11], "SLT"],
-    ["*", ["^", 2, 12], "AND"],
-    ["*", ["^", 2, 13], "OR"],
-    ["*", ["^", 2, 14], "XOR"],
-    ["*", ["^", 2, 15], "SHIFT"],
-    ["*", ["^", 2, 16], "JALR"],
-    ["*", ["^", 2, 17], "BEQ"],
-    ["*", ["^", 2, 18], "BLT"],
-    ["*", ["^", 2, 19], "LOAD"],
-    ["*", ["^", 2, 20], "STORE"],
-    ["*", ["^", 2, 21], "MUL"],
-    ["*", ["^", 2, 22], "DIVREM"],
-    ["*", ["^", 2, 23], "ECALL"],
-    ["*", ["^", 2, 24], "EBREAK"],
-    ["*", ["^", 2, 25], "rs1"],
-    ["*", ["^", 2, 33], "rs2"],
-    ["*", ["^", 2, 41], "rd"],
+def = ["+",
+    ["*", ["^", 2, 0], "read_register1"],
+    ["*", ["^", 2, 1], "read_register2"],
+    ["*", ["^", 2, 2], "write_register"],
+    ["*", ["^", 2, 3], "memory_2bytes"],
+    ["*", ["^", 2, 4], "memory_4bytes"],
+    ["*", ["^", 2, 5], "memory_8bytes"],
+    ["*", ["^", 2, 6], "c_type_instruction"],
+    ["*", ["^", 2, 7], "signed"],
+    ["*", ["^", 2, 8], "mp_selector"],
+    ["*", ["^", 2, 9], "muldiv_selector"],
+    ["*", ["^", 2, 10], "word_instr"],
+    ["*", ["^", 2, 11], "ADD"],
+    ["*", ["^", 2, 12], "SUB"],
+    ["*", ["^", 2, 13], "SLT"],
+    ["*", ["^", 2, 14], "AND"],
+    ["*", ["^", 2, 15], "OR"],
+    ["*", ["^", 2, 16], "XOR"],
+    ["*", ["^", 2, 17], "SHIFT"],
+    ["*", ["^", 2, 18], "JALR"],
+    ["*", ["^", 2, 19], "BEQ"],
+    ["*", ["^", 2, 20], "BLT"],
+    ["*", ["^", 2, 21], "LOAD"],
+    ["*", ["^", 2, 22], "STORE"],
+    ["*", ["^", 2, 23], "MUL"],
+    ["*", ["^", 2, 24], "DIVREM"],
+    ["*", ["^", 2, 25], "ECALL"],
+    ["*", ["^", 2, 26], "EBREAK"],
+    ["*", ["^", 2, 27], "rs1"],
+    ["*", ["^", 2, 35], "rs2"],
+    ["*", ["^", 2, 43], "rd"],
 ]
 
+[[variables.virtual]]
+name = "pad"
+type = "Bit"
+desc = "When no flags are set, we must be in a padding row."
+def = ["-", 1, "ADD", "SUB", "SLT", "AND", "OR", "XOR", "SHIFT", "JALR", "BEQ", "BLT", "LOAD", "STORE", "MUL", "DIVREM", "ECALL", "EBREAK"]
+
 
 [[assumptions]]
-desc = "The flags are a one-hot vector in the decoding"
+desc = "At most one ALU selector flag is 1 by the decoding, and every other flag is 0."
 ref = "cpu:a:one-hot"
 
 [[assumptions]]
@@ -287,6 +349,18 @@ input = ["pc", "imm", "packed_decode"]
 name = "range"
 prefix = "R"
 
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["read_register1"]
+ref = "cpu:c:range_read_register1"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["read_register2"]
+ref = "cpu:c:range_read_register2"
+
 [[constraints.range]]
 kind = "template"
 tag = "IS_BIT"
@@ -314,8 +388,8 @@ ref = "cpu:c:range_memory_8bytes"
 [[constraints.range]]
 kind = "template"
 tag = "IS_BIT"
-input = ["c_kind_instruction"]
-ref = "cpu:c:range_c_kind_instruction"
+input = ["c_type_instruction"]
+ref = "cpu:c:range_c_type_instruction"
 
 [[constraints.range]]
 kind = "template"
@@ -568,6 +642,13 @@ kind = "interaction"
 tag = "MEMW"
 input = [1, ["*", 2, "rs1"], "rv1", ["+", "timestamp", 0], 1, 0, 0]
 output = "rv1"
+multiplicity = "read_register1"
+
+[[constraints.mem]]
+kind = "arith"
+constraint = "$#`!read_register1` => #`rv1[i]` = 0$"
+poly = ["*", ["not", "read_register1"], ["idx", "rv1", "i"]]
+iter = ["i", 0, 2]
 
 # TODO: no types available, so no casting yet
 [[constraints.mem]]
@@ -575,6 +656,13 @@ kind = "interaction"
 tag = "MEMW"
 input = [1, ["*", 2, "rs2"], "rv2", ["+", "timestamp", 1], 1, 0, 0]
 output = "rv2"
+multiplicity = "read_register2"
+
+[[constraints.mem]]
+kind = "arith"
+constraint = "$#`!read_register2` => #`rv2[i]` = 0$"
+poly = ["*", ["not", "read_register2"], ["idx", "rv2", "i"]]
+iter = ["i", 0, 2]
 
 # TODO: no types available, so no casting yet
 [[constraints.mem]]
@@ -603,6 +691,7 @@ kind = "interaction"
 tag = "MEMW"
 input = [1, ["*", 2, 255], "next_pc", ["+", "timestamp", 1], 1, 0, 0]
 output = "pc"
+multiplicity = ["not", "pad"]
 
 
 [[constraint_groups]]
@@ -614,6 +703,7 @@ kind = "arith"
 constraint = "`!EBREAK`"
 desc = "We treat `EBREAK` as an unprovable trap"
 poly = ["not", "EBREAK"]
+ref = "cpu:c:ebreak_traps"
 
 # TODO: no types available, so no casting yet
 [[constraints.sys]]
diff --git a/spec/src/decode.toml b/spec/src/decode.toml
index 6c01e4f6c..367db1568 100644
--- a/spec/src/decode.toml
+++ b/spec/src/decode.toml
@@ -4,8 +4,7 @@ name = "DECODE"
 name = "pc"
 type = "DWordWL"
 desc = "value of the program counter this instruction is associated with."
-# TODO(#136): fix this when padding the CPU
-pad = 1
+pad = 7
 
 [[variables.output]]
 name = "packed_decode"
@@ -13,37 +12,39 @@ type = "BaseField"
 desc = """Ordered concatenation of several small variables.
 The `decode (uncompressed)` section explains the purpose of each variable.\\
 A list of each variable and the bit(-range) in which it is located:\\
-[0:7] `rs1`, \\
-[8:15] `rs2`, \\
-[16:23] `rd`, \\
-[24] `write_register`, \\
-[25] `memory_2bytes`, \\
-[26] `memory_4bytes`, \\
-[27] `memory_8bytes`, \\
-[28] `c_type`, \\
-[29] `signed`, \\
-[30] `mp_selector`, \\
-[31] `muldiv_selector`, \\
-[32] `word_instr`, \\
-[33] `ADD`, \\
-[34] `SUB`, \\
-[35] `SLT`, \\
-[36] `AND`, \\
-[37] `OR`, \\
-[38] `XOR`, \\
-[39] `SHIFT`, \\
-[40] `JALR`, \\
-[41] `BEQ`, \\
-[42] `BLT`, \\
-[43] `LOAD`, \\
-[44] `STORE`, \\
-[45] `MUL`, \\
-[46] `DIVREM`, \\
-[47] `ECALL`, \\
-[48] `EBREAK`; \\
+[0] `read_register1`, \\
+[1] `read_register2`, \\
+[2] `write_register`, \\
+[3] `memory_2bytes`, \\
+[4] `memory_4bytes`, \\
+[5] `memory_8bytes`, \\
+[6] `c_type`, \\
+[7] `signed`, \\
+[8] `mp_selector`, \\
+[9] `muldiv_selector`, \\
+[10] `word_instr`, \\
+[11] `ADD`, \\
+[12] `SUB`, \\
+[13] `SLT`, \\
+[14] `AND`, \\
+[15] `OR`, \\
+[16] `XOR`, \\
+[17] `SHIFT`, \\
+[18] `JALR`, \\
+[19] `BEQ`, \\
+[20] `BLT`, \\
+[21] `LOAD`, \\
+[22] `STORE`, \\
+[23] `MUL`, \\
+[24] `DIVREM`, \\
+[25] `ECALL`, \\
+[26] `EBREAK`, \\
+[27:34] `rs1`, \\
+[35:42] `rs2`, \\
+[43:50] `rd`; \\
 the remaining bits are set to zero.
 """
-pad = 0
+pad = ["^", 2, 26]
 
 [[variables.output]]
 name = "imm"
diff --git a/spec/src/decode_uncompressed.toml b/spec/src/decode_uncompressed.toml
index 8457005f8..0f6c931c2 100644
--- a/spec/src/decode_uncompressed.toml
+++ b/spec/src/decode_uncompressed.toml
@@ -20,6 +20,16 @@ name = "rd"
 type = "Byte"
 desc = "index of destination register."
 
+[[variables.output]]
+name = "read_register1"
+type = "Bit"
+desc = "whether to load the contents of address `rs1` (1) or `0` (0) into `rv1`."
+
+[[variables.output]]
+name = "read_register2"
+type = "Bit"
+desc = "whether to load the contents of address `rs2` (1) or `0` (0) into `rv2`."
+
 [[variables.output]]
 name = "write_register"
 type = "Bit"

From 52bba0749953e2ce0b9e72a5a424075b583d5ed3 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Fri, 23 Jan 2026 10:34:08 -0300
Subject: [PATCH 041/105] update md docs

---
 docs/spec/cpu.md                 | 55 +++++++++++++++++++++-----------
 docs/spec/decode.md              |  2 +-
 docs/spec/decode_uncompressed.md |  2 ++
 3 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index 315cc86c4..960133d60 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -11,6 +11,8 @@
 | `rs1` | `Byte` | Source register 1 index |
 | `rs2` | `Byte` | Source register 2 index |
 | `rd` | `Byte` | Destination register index |
+| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
+| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
 | `write_register` | `Bit` | Whether to write back to the destination register |
 | `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
 | `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
@@ -52,7 +54,7 @@
 | `rv1` | `DWordWHH` | The value of register `rs1` |
 | `rv2` | `DWordWHH` | The value of register `rs2` |
 | `rv1_sign_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word |
-| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `c_type_instruction` |
+| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
 | `arg2_sign_bit` | `Bit` | The sign bit of `arg2` if seen as a 32-bit word |
 | `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
 | `res_sign_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word |
@@ -65,12 +67,23 @@
 | Name | Type | Description |
 |------|------|-------------|
 | `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
+| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
+
+**Definition of `packed_decode`:**
+```
+packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
+```
+
+**Definition of `pad`:**
+```
+pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
+```
 
 ## Assumptions
 
 | Ref | Range | Description |
 |-----|-------|-------------|
-| `cpu:a:one-hot` |  | The flags are a one-hot vector in the decoding |
+| `cpu:a:one-hot` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
 | `cpu:a:arg2-multiplex` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
 
 ## Constraints
@@ -85,11 +98,13 @@
 
 | Ref | Kind | Range | Description |
 |-----|------|-------|-------------|
+| `cpu:c:range_read_register1` | template |  | `IS_BIT<read_register1>` |
+| `cpu:c:range_read_register2` | template |  | `IS_BIT<read_register2>` |
 | `cpu:c:range_write_register` | template |  | `IS_BIT<write_register>` |
 | `cpu:c:range_memory_2bytes` | template |  | `IS_BIT<memory_2bytes>` |
 | `cpu:c:range_memory_4bytes` | template |  | `IS_BIT<memory_4bytes>` |
 | `cpu:c:range_memory_8bytes` | template |  | `IS_BIT<memory_8bytes>` |
-| `cpu:c:range_c_kind_instruction` | template |  | `IS_BIT<c_kind_instruction>` |
+| `cpu:c:range_c_type_instruction` | template |  | `IS_BIT<c_type_instruction>` |
 | `cpu:c:range_signed` | template |  | `IS_BIT<signed>` |
 | `cpu:c:range_mp_selector` | template |  | `IS_BIT<mp_selector>` |
 | `cpu:c:range_muldiv_selector` | template |  | `IS_BIT<muldiv_selector>` |
@@ -110,12 +125,12 @@
 | `cpu:c:range_DIVREM` | template |  | `IS_BIT<DIVREM>` |
 | `cpu:c:range_ECALL` | template |  | `IS_BIT<ECALL>` |
 | `cpu:c:range_EBREAK` | template |  | `IS_BIT<EBREAK>` |
-| `R26` | interaction |  | `IS_BYTE[rs1]` |
-| `R27` | interaction |  | `IS_BYTE[rs2]` |
-| `R28` | interaction |  | `IS_BYTE[rd]` |
-| `R29` | interaction | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` |
-| `R30` | interaction | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` |
-| `R31` | interaction | i ∈ [0, 7] | `IS_BYTE[res[i]]` |
+| `R28` | interaction |  | `IS_BYTE[rs1]` |
+| `R29` | interaction |  | `IS_BYTE[rs2]` |
+| `R30` | interaction |  | `IS_BYTE[rd]` |
+| `R31` | interaction | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` |
+| `R32` | interaction | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` |
+| `R33` | interaction | i ∈ [0, 7] | `IS_BYTE[res[i]]` |
 
 ### alu
 
@@ -136,20 +151,24 @@
 
 ### mem
 
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `M1` | interaction | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0, 1, 0, 0]` |  |
-| `M2` | interaction | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1, 1, 0, 0]` |  |
-| `M3` | interaction | `MEMW[1, 2 * rd, rvd, timestamp + 2, 1, 0, 0]` | write_register |
-| `M4` | interaction | `LOAD[rvd; 0, res, timestamp + 0, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
-| `M5` | interaction | `MEMW[0, res, rv2, timestamp + 1, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
-| `M6` | interaction | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1, 1, 0, 0]` |  |
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `M1` | interaction |  | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0, 1, 0, 0]` | read_register1 |
+| `M2` | arith | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
+| | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
+| `M3` | interaction |  | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1, 1, 0, 0]` | read_register2 |
+| `M4` | arith | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
+| | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
+| `M5` | interaction |  | `MEMW[1, 2 * rd, rvd, timestamp + 2, 1, 0, 0]` | write_register |
+| `M6` | interaction |  | `LOAD[rvd; 0, res, timestamp + 0, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
+| `M7` | interaction |  | `MEMW[0, res, rv2, timestamp + 1, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
+| `M8` | interaction |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1, 1, 0, 0]` | 1 - pad |
 
 ### sys
 
 | Ref | Kind | Description | Multiplicity |
 |-----|------|-------------|--------------|
-| `S1` | arith | `!EBREAK` |  |
+| `cpu:c:ebreak_traps` | arith | `!EBREAK` |  |
 | | | _polynomial:_ `1 - EBREAK = 0` | |
 | | | _note:_ We treat `EBREAK` as an unprovable trap | |
 | `S2` | interaction | `ECALL[rvd; rv1, pc, timestamp, rv2]` | ECALL |
diff --git a/docs/spec/decode.md b/docs/spec/decode.md
index fc5ced687..9bf1fbcb7 100644
--- a/docs/spec/decode.md
+++ b/docs/spec/decode.md
@@ -7,7 +7,7 @@
 | Name | Type | Description |
 |------|------|-------------|
 | `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
-| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0:7] `rs1`, \ [8:15] `rs2`, \ [16:23] `rd`, \ [24] `write_register`, \ [25] `memory_2bytes`, \ [26] `memory_4bytes`, \ [27] `memory_8bytes`, \ [28] `c_type`, \ [29] `signed`, \ [30] `mp_selector`, \ [31] `muldiv_selector`, \ [32] `word_instr`, \ [33] `ADD`, \ [34] `SUB`, \ [35] `SLT`, \ [36] `AND`, \ [37] `OR`, \ [38] `XOR`, \ [39] `SHIFT`, \ [40] `JALR`, \ [41] `BEQ`, \ [42] `BLT`, \ [43] `LOAD`, \ [44] `STORE`, \ [45] `MUL`, \ [46] `DIVREM`, \ [47] `ECALL`, \ [48] `EBREAK`; \ the remaining bits are set to zero.  |
+| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`, \ [27:34] `rs1`, \ [35:42] `rs2`, \ [43:50] `rd`; \ the remaining bits are set to zero.  |
 | `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
 
 ### Multiplicity
diff --git a/docs/spec/decode_uncompressed.md b/docs/spec/decode_uncompressed.md
index 9e3aebc77..4bf226594 100644
--- a/docs/spec/decode_uncompressed.md
+++ b/docs/spec/decode_uncompressed.md
@@ -10,6 +10,8 @@
 | `rs1` | `Byte` | index of source register 1. |
 | `rs2` | `Byte` | index of source register 2. |
 | `rd` | `Byte` | index of destination register. |
+| `read_register1` | `Bit` | whether to load the contents of address `rs1` (1) or `0` (0) into `rv1`. |
+| `read_register2` | `Bit` | whether to load the contents of address `rs2` (1) or `0` (0) into `rv2`. |
 | `write_register` | `Bit` | whether the result should be written to `rd` ($=0$ for memory write and when $`rd` = `x0`$. |
 | `mem_2B` | `Bit` | whether the memory access (read or write) touches exactly $2$ bytes. |
 | `mem_4B` | `Bit` | whether the memory access (read or write) touches exactly $4$ bytes. |

From 335ea11434cb01612b73d333d577cebfeef8bd3a Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Mon, 26 Jan 2026 17:55:14 -0300
Subject: [PATCH 042/105] Add readme

---
 spec/README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/spec/README.md b/spec/README.md
index 127e528c8..5df147522 100644
--- a/spec/README.md
+++ b/spec/README.md
@@ -9,3 +9,17 @@ The specification is written in [`Typst`](https://typst.app/) and can be rendere
 4. Open the repository in a terminal and execute `shiroa serve`.
 
 At this point, the wiki version is hosted locally and is actively updated as you modify the specification files.
+
+## Converting to Markdown
+
+To convert the spec TOML files to Markdown (for documentation or LLM consumption):
+
+```bash
+# From the repository root:
+python3 scripts/spec_to_md.py spec/src/config.toml spec/src/*.toml
+
+# Or output to a specific directory:
+python3 scripts/spec_to_md.py --output-dir docs/spec spec/src/config.toml spec/src/*.toml
+```
+
+This generates one Markdown file per chip (cpu.md, add.md, lt.md, etc.) with tables for columns, constraints, and assumptions.

From a184f95be3d22f8f166731576387a52ef7433251 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Tue, 27 Jan 2026 14:51:09 +0100
Subject: [PATCH 043/105] spec: update `ECALL` signature (#244)

* spec: update `ECALL` signature

* spec: CPU/ECALL: cast rv1 to DWordWL
---
 spec/decode.typ   | 9 +++------
 spec/src/cpu.toml | 4 +---
 2 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/spec/decode.typ b/spec/decode.typ
index d8332e033..218cb84a3 100644
--- a/spec/decode.typ
+++ b/spec/decode.typ
@@ -146,7 +146,7 @@ To indicate an instruction is provided in compressed form, the `c_type` flag is
   ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []),
   ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []),
   // ECALL/EBREAK
-  ([`ECALL`], [`ECALL`], [], [], [$#`rs1` := #`x17`$, $#`rs2` := #`x10`$, $#`rd` := #`x10`$], [#ref_note(<note-ecall>)]),
+  ([`ECALL`], [`ECALL`], [], [], [$#`rs1` := #`x17`$], [#ref_note(<note-ecall>)]),
   ([`EBREAK`], [`EBREAK`], [], [], [], []),
   // FENCE
   ([`FENCE`], [`ADD`], [], [], [], [#ref_note(<note-fence>)]),
@@ -203,11 +203,8 @@ We note the following about the above decoding table:
     referenceable_note(
       "note-ecall",
       [`ECALL`:
-      "On RISC-V a system call has its own instruction: `ECALL`. A system call can have up to 7 arguments and has 1 return value. The arguments are in registers A0-A6, in that order, and the return value is written into A0 before giving back control to the guest. A7 contains the system call number." #link("https://libriscv.no/docs/concepts/syscalls/#the-risc-v-system-call-abi")[[source]]
-      As such,
-      - syscall number in A7 (= register `x17`)
-      - first syscall argument in A0 (= register `x10`)
-      - syscall output in A0 (= register `x10`)]
+      "On RISC-V a system call has its own instruction: `ECALL`. [...] A7 [= register `x17`] contains the system call number." #link("https://libriscv.no/docs/concepts/syscalls/#the-risc-v-system-call-abi")[[source]]
+      ]
     )
   ),
   enum.item(
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index 49a78ee15..d8609e935 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -705,12 +705,10 @@ desc = "We treat `EBREAK` as an unprovable trap"
 poly = ["not", "EBREAK"]
 ref = "cpu:c:ebreak_traps"
 
-# TODO: no types available, so no casting yet
 [[constraints.sys]]
 kind = "interaction"
 tag = "ECALL"
-input = ["rv1", "pc", "timestamp", "rv2"]
-output = "rvd"
+input = ["timestamp", ["cast", "rv1", "DWordWL"]]
 multiplicity = "ECALL"
 
 

From 37a9a9fb5dc22ce48713fa6263d383c52d11a75c Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 27 Jan 2026 15:13:46 +0100
Subject: [PATCH 044/105] spec: Allow for cross referencing between different
 chapters, both in pdf and web mode (#225)

* spec: Allow for cross referencing between different chapters, both in pdf and web mode

* Improve PDF organization

The PDF now no longer depends on shiroa trickery
to compile, so errors are more clearly visible instead
of being hidden behind layout iterations.
Additionally, we can now have nice chapter headings
and references to them.

* Allow xref by specifying only the label

* document strip-all

* It does work, after all; with only ~7GB of RAM usage for the entire thing

* small cleanup

* Update spec/book.typ

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* Address some review comments

* less repetition for file names

---------

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/add.typ             |   5 +-
 spec/bitwise.typ         |   4 +-
 spec/book.typ            | 146 +++++++++++++++++++++++++++++++++------
 spec/branch.typ          |   2 +-
 spec/cpu.typ             |   2 +-
 spec/decode.typ          |   7 +-
 spec/dvrm.typ            |   2 +-
 spec/ebook.typ           |  19 +++--
 spec/ecall.typ           |   2 +-
 spec/is_bit.typ          |   5 +-
 spec/load.typ            |   2 +-
 spec/lt.typ              |   2 +-
 spec/memory.typ          |   8 +--
 spec/memw.typ            |   2 +-
 spec/mul.typ             |   4 +-
 spec/shift.typ           |   4 +-
 spec/templates/ebook.typ |  37 ----------
 spec/templates/page.typ  |  44 +++++++-----
 spec/variables.typ       |   3 +-
 19 files changed, 185 insertions(+), 115 deletions(-)
 delete mode 100644 spec/templates/ebook.typ

diff --git a/spec/add.typ b/spec/add.typ
index 0dade7b01..981a0ccb1 100644
--- a/spec/add.typ
+++ b/spec/add.typ
@@ -2,11 +2,11 @@
 #import "/src.typ": load_config, load_chip
 #import "/chip.typ": render_chip_column_table, render_chip_assumptions, render_constraint_table
 
-#show: book-page.with(title: "ADD/SUB")
-
 #let config = load_config()
 #let chip = load_chip("src/add.toml", config)
 
+#show: book-page(chip.name)
+
 #let add = raw(chip.name)
 
 #let highlighted_code(code) = {
@@ -18,7 +18,6 @@
     raw(code))
 }
 
-= #add template
 #add is a constraint template that is used to assert that $#`sum` = #`lhs` + #`rhs` mod 2^64$, under the condition that `cond` is non-zero.
 
 == Notation
diff --git a/spec/bitwise.typ b/spec/bitwise.typ
index 34ec6dd10..9b5b4a638 100644
--- a/spec/bitwise.typ
+++ b/spec/bitwise.typ
@@ -13,9 +13,7 @@
 
 #let bitwise = raw(chip.name)
 
-#show: book-page.with(title: "BRANCH chip")
-
-= #bitwise chip
+#show: book-page(chip.name)
 
 == Columns
 #let nr_variables = total_nr_variables(chip)
diff --git a/spec/book.typ b/spec/book.typ
index 29e61350c..b194b7f70 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -1,31 +1,35 @@
 #import "@preview/shiroa:0.3.1": *
+#import "/templates/page.typ": project
 
 #show: book
 
-#book-meta(
+#let meta = (
   title: "Lambda VM specification",
-  summary: [
-    #chapter("memory.typ")[Memory argument]
-    #chapter("variables.typ")[Variables]
-    #chapter("is_bit.typ")[IS_BIT template]
-    #chapter("add.typ")[ADD template]
-    #chapter("decode.typ")[DECODE chip]
-    #chapter("cpu.typ")[CPU chip]
-    #chapter("shift.typ")[SHIFT chip]
-    #chapter("branch.typ")[BRANCH]
-    #chapter("memw.typ")[MEMW]
-    #chapter("lt.typ")[LT]
-    #chapter("mul.typ")[MUL chip]
-    #chapter("dvrm.typ")[DVRM chip]
-    #chapter("load.typ")[LOAD chip]
-    #chapter("ecall.typ")[ECALL chips]
-    #chapter("bitwise.typ")[BITWISE]
-  ]
+  authors: ("3MI Labs", "Aligned"),
+  summary: (
+    ("memory.typ", [Memory argument], <memory>),
+    ("variables.typ", [Variables], <vars>),
+    ("is_bit.typ", [IS_BIT template], <isbit>),
+    ("add.typ", [ADD/SUB template], <add>),
+    ("decode.typ", [DECODE table], <decode>),
+    ("cpu.typ", [CPU chip], <cpu>),
+    ("shift.typ", [SHIFT chip], <shift>),
+    ("branch.typ", [BRANCH chip], <branch>),
+    ("memw.typ", [MEMW chip], <memw>),
+    ("lt.typ", [LT chip], <lt>),
+    ("mul.typ", [MUL chip], <mul>),
+    ("dvrm.typ", [DVRM chip], <dvrm>),
+    ("load.typ", [LOAD chip], <load>),
+    ("ecall.typ", [ECALL chips], <ecall>),
+    ("bitwise.typ", [BITWISE chips], <bitwise>),
+  )
+)
+#book-meta(
+  title: meta.title,
+  authors: meta.authors,
+  summary: meta.summary.map(((ch, title, _ref)) => chapter(ch, title)).join()
 )
 
-// re-export page template
-#import "/templates/page.typ": project
-#let book-page = project
 
 #let todo(background: white, foreground: black, name: none, body) = block(fill: background, outset: 0.5em, radius: 20%, stroke: black)[
   #set text(fill: foreground)
@@ -47,3 +51,103 @@
            align(center, strong(text(fill: black, title))))
     #align(left, body)
 ])
+
+
+#let is-shiroa = "x-target" in sys.inputs
+
+// Strip styling to keep only "pure" content.
+// This is useful to avoid errors on the `set document(...)` in `project`
+// when invisibly including other chapters to resolve xrefs.
+#let strip-all(content) = {
+  if repr(content.func()) == "sequence" {
+    for c in content.children {
+      strip-all(c)
+    }
+  } else if repr(content.func()) == "styled" {
+    strip-all(content.child)
+  } else {
+    content
+  }
+}
+
+#let _toplevel = state("_toplevel", none)
+#let _xref-included = state("_xref-included", (:))
+
+// Invisibly include another chapter, so that its labels can be resolved
+#let xref-include(f) = {
+  context if f not in _xref-included.get() {
+    hide(box(width: 0%, height: 0%, strip-all(include "/" + f)))
+  }
+  context _xref-included.update(x => x + ((f): true))
+}
+
+// Generate a cross-link for references to other chapters.
+// Leaves the ref untouched if it can't be resolved or points to the current chapter.
+#let xref(rf) = {
+  assert(is-shiroa, message: "xref should only be used when compiling for shiroa")
+  let lbl = rf.target
+  let found = meta.summary.find(((_, _, tag)) => str(lbl).starts-with(str(tag)))
+  context if found != none and found.at(0) != _toplevel.final() {
+    let (ch, title, ref) = found
+    if ref == lbl {
+      cross-link("/" + ch, [Chapter #(meta.summary.position(x => x == found) + 1)])
+    } else {
+      // Because shiroa does weird url escaping
+      let shiroa-label = label(str(lbl).replace(":", "%3A"))
+      xref-include(ch)
+      // The ideal would be to use `rf` directly as content argument to `cross-link`,
+      // as that would inherit any/all formatting of the ref we want or need.
+      // Unfortunately the ref link seems to take precedence over the cross-link hyperlink
+      // when clicking.
+      // There may still be some way around it by messing with some html output
+      let link-content = context {
+        let fig = query(lbl).first()
+        let counter = if fig.has("counter") {
+          fig.counter
+        } else {
+          counter(fig.func())
+        }
+
+        let supplement = if rf.supplement == auto {
+          fig.fields().at("supplement", default: none)
+        } else {
+          rf.supplement
+        }
+        [#supplement#numbering(fig.numbering, ..counter.at(lbl))]
+      }
+      cross-link("/" + ch, reference: shiroa-label, link-content)
+    }
+  } else {
+    rf
+  }
+}
+
+#let book-page(file, ..args) = {
+  let file = if file.ends-with(".typ") {
+    file
+  } else {
+    lower(file) + ".typ"
+  }
+  assert(meta.summary.find(((f, _, _)) => f == file) != none, message: "Couldn't resolve typst source file " + file)
+  if is-shiroa {
+    (body) => {
+      context _xref-included.update(x => x + ((file): true))
+      context _toplevel.update(s => {
+        if s == none {
+          file
+        } else {
+          s
+        }
+      })
+      let cond() = _toplevel.final() == file
+      project.with(..args, title: context meta.summary.find(x => x.at(0) == _toplevel.final()).at(1), cond: cond)([
+        #show ref: it => context if _toplevel.final() == file {
+          xref(it)
+        }
+        #body
+      ])
+    }
+  } else {
+    (body) => body
+  }
+}
diff --git a/spec/branch.typ b/spec/branch.typ
index a18c252b7..f448f2da4 100644
--- a/spec/branch.typ
+++ b/spec/branch.typ
@@ -12,7 +12,7 @@
 #let config = load_config()
 #let chip = load_chip("src/branch.toml", config)
 
-#show: book-page.with(title: "BRANCH chip")
+#show: book-page(chip.name)
 
 == Columns
 #let nr_variables = total_nr_variables(chip)
diff --git a/spec/cpu.typ b/spec/cpu.typ
index 784d750d2..0afa75f62 100644
--- a/spec/cpu.typ
+++ b/spec/cpu.typ
@@ -12,7 +12,7 @@
 #let config = load_config()
 #let chip = load_chip("src/cpu.toml", config)
 
-#show: book-page.with(title: "CPU chip")
+#show: book-page(chip.name)
 
 == Columns
 #let nr_variables = total_nr_variables(chip)
diff --git a/spec/decode.typ b/spec/decode.typ
index 218cb84a3..586625226 100644
--- a/spec/decode.typ
+++ b/spec/decode.typ
@@ -1,4 +1,4 @@
-#import "/book.typ": book-page, rj
+#import "/book.typ": book-page, rj, xref
 #import "/src.typ": load_config, load_chip
 #import "/chip.typ": (
   render_chip_assumptions,
@@ -11,11 +11,10 @@
 
 #let config = load_config()
 #let chip = load_chip("src/decode.toml", config)
-#show: book-page.with(title: "DECODE chip")
+#show: book-page(chip.name)
 
 #let decode = raw(chip.name)
 
-= #decode table
 All `RV64IMC` instruction are to be decoded to a format that can be interpreted by the VM.
 This section outlines the decoding table being used in the VM.
 For reasons of efficiency, data in this table is significantly compressed.
@@ -220,4 +219,4 @@ In addition to decoding all instructions provided in the ELF and adding a corres
 Note that this will never conflict with any entry in the ELF, since it has an odd `pc` value.
 
 This entry is used to pad the `CPU` table.
-More details on this matter are provided in the `CPU` chip.
\ No newline at end of file
+More details on this matter are provided in the `CPU` chip.
diff --git a/spec/dvrm.typ b/spec/dvrm.typ
index 69e79cee2..f1d9a3a4c 100644
--- a/spec/dvrm.typ
+++ b/spec/dvrm.typ
@@ -12,6 +12,6 @@
 #let config = load_config()
 // #let chip = load_chip("src/dvrm.toml", config)
 
-#show: book-page.with(title: "DVRM chip")
+#show: book-page("dvrm.typ")
 
 *placeholder chapter: WIP*
diff --git a/spec/ebook.typ b/spec/ebook.typ
index 410e926bb..835751163 100644
--- a/spec/ebook.typ
+++ b/spec/ebook.typ
@@ -1,12 +1,19 @@
-#import "@preview/shiroa:0.3.1": *
-#import "/book.typ": style
+#import "/book.typ": style, meta
 
-#import "/templates/ebook.typ"
+#set document(author: meta.authors, title: meta.title)
 
-#show: ebook.project.with(title: "typst-book", spec: "book.typ")
 #style.update((
   foreground: black,
 ))
 
-// set a resolver for inclusion
-#ebook.resolve-inclusion(it => include it)
+#align(center, title(meta.title))
+#pagebreak(weak: true)
+#outline()
+
+#show heading: set heading(numbering: "1.1")
+
+#meta.summary.map(((ch, title, ref)) => [
+  #pagebreak(weak: true)
+  #heading(supplement: [Chapter], level: 1, title)#ref
+  #include ch
+]).join()
diff --git a/spec/ecall.typ b/spec/ecall.typ
index fee25768c..3bf2d3b69 100644
--- a/spec/ecall.typ
+++ b/spec/ecall.typ
@@ -11,7 +11,7 @@
 
 #let config = load_config()
 
-#show: book-page.with(title: "ECALL chips")
+#show: book-page("ecall.typ")
 
 *placeholder chapter: WIP*
 
diff --git a/spec/is_bit.typ b/spec/is_bit.typ
index c379080cd..a12d62108 100644
--- a/spec/is_bit.typ
+++ b/spec/is_bit.typ
@@ -2,11 +2,11 @@
 #import "/src.typ": load_config, load_chip
 #import "/chip.typ": render_chip_column_table, render_constraint_table
 
-#show: book-page.with(title: "IS_BIT template")
-
 #let config = load_config()
 #let chip = load_chip("src/is_bit.toml", config)
 
+#show: book-page(chip.name)
+
 #let is_bit = raw(chip.name)
 
 #let highlighted_code(code) = {
@@ -18,7 +18,6 @@
     raw(code))
 }
 
-= #is_bit template
 #is_bit is a constraint template that is used to assert that a variable lies in the range ${0, 1}$ if some second variable is non-zero.
 Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
 
diff --git a/spec/load.typ b/spec/load.typ
index 931611108..71c274d1b 100644
--- a/spec/load.typ
+++ b/spec/load.typ
@@ -12,7 +12,7 @@
 #let config = load_config()
 #let chip = load_chip("src/load.toml", config)
 
-#show: book-page.with(title: "LOAD chip")
+#show: book-page(chip.name)
 
 == Columns
 #let nr_variables = total_nr_variables(chip)
diff --git a/spec/lt.typ b/spec/lt.typ
index 3b57a62e3..cb4f9c141 100644
--- a/spec/lt.typ
+++ b/spec/lt.typ
@@ -12,7 +12,7 @@
 #let config = load_config()
 #let chip = load_chip("src/lt.toml", config)
 
-#show: book-page.with(title: "LT chip")
+#show: book-page(chip.name)
 
 == Columns
 #let nr_variables = total_nr_variables(chip)
diff --git a/spec/memory.typ b/spec/memory.typ
index 6687d733d..ec8735e49 100644
--- a/spec/memory.typ
+++ b/spec/memory.typ
@@ -1,4 +1,4 @@
-#import "/book.typ": book-page, rj, aside
+#import "/book.typ": book-page, rj, aside, xref
 #import "/src.typ": load_config, load_chip
 #import "/chip.typ": (
   render_chip_assumptions,
@@ -12,9 +12,7 @@
 #let config = load_config()
 #let chip = load_chip("src/page.toml", config)
 
-#show: book-page.with(title: "Memory argument")
-
-= Memory argument
+#show: book-page("memory.typ")
 
 As part of fully proving the correct execution of a RISC-V program,
 the VM must ensure that memory reads and writes are consistent.
@@ -108,7 +106,7 @@ to have a strictly greater timestamp than the consumed token.
 This raises the question of how to represent timestamps and cleanly perform this check,
 as over a finite field the “less than” relation is ill-defined
 (though it is common and natural to consider it as the less than relation over the natural lift of the field into the integers).
-We choose to represent timestamps as machine words, using the existing `LT` chip functionality for comparisons.
+We choose to represent timestamps as machine words, using the existing `LT` chip (@lt) functionality for comparisons.
 #rj[Properly link/refer to the LT chip]
 
 #aside[Note on options and trade-offs for timestamp representation][
diff --git a/spec/memw.typ b/spec/memw.typ
index bcf6a64b0..77b786bf6 100644
--- a/spec/memw.typ
+++ b/spec/memw.typ
@@ -11,7 +11,7 @@
 #let config = load_config()
 #let chip = load_chip("src/memw.toml", config)
 
-#show: book-page.with(title: "MEMW chip")
+#show: book-page(chip.name)
 
 == Columns
 #let nr_variables = total_nr_variables(chip)
diff --git a/spec/mul.typ b/spec/mul.typ
index 1892994f0..bc5898fa0 100644
--- a/spec/mul.typ
+++ b/spec/mul.typ
@@ -12,12 +12,10 @@
 #let config = load_config()
 #let chip = load_chip("src/mul.toml", config)
 
-#show: book-page.with(title: "MUL chip")
+#show: book-page(chip.name)
 
 #let mul = raw(chip.name)
 
-= #mul chip
-
 == Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
diff --git a/spec/shift.typ b/spec/shift.typ
index 70aebc97c..177ce6104 100644
--- a/spec/shift.typ
+++ b/spec/shift.typ
@@ -14,9 +14,7 @@
 
 #let shift = raw(chip.name)
 
-#show: book-page.with(title: "SHIFT chip")
-
-= #shift chip
+#show: book-page(chip.name)
 
 == Interface
 The #shift chip has the following interface:
diff --git a/spec/templates/ebook.typ b/spec/templates/ebook.typ
deleted file mode 100644
index 44e0312d3..000000000
--- a/spec/templates/ebook.typ
+++ /dev/null
@@ -1,37 +0,0 @@
-#import "@preview/shiroa:0.3.1": *
-#import "/templates/page.typ": part-style, project
-
-#let _page-project = project
-
-#let _resolve-inclusion-state = state("_resolve-inclusion", none)
-
-#let resolve-inclusion(inc) = _resolve-inclusion-state.update(it => inc)
-
-#let project(title: "", authors: (), spec: "", content) = {
-  // Set document metadata early
-  set document(
-    author: authors,
-    title: title,
-  )
-
-  // Inherit from gh-pages
-  show: _page-project
-
-  if title != "" {
-    heading(title)
-  }
-
-  context {
-    let inc = _resolve-inclusion-state.final()
-    external-book(spec: inc(spec))
-
-    let mt = book-meta-state.final()
-    let styles = (inc: inc, part: part-style, chapter: it => it)
-
-    if mt != none {
-      mt.summary.map(it => visit-summary(it, styles)).sum()
-    }
-  }
-
-  content
-}
diff --git a/spec/templates/page.typ b/spec/templates/page.typ
index 1f7f88ea0..4ec7b27ac 100644
--- a/spec/templates/page.typ
+++ b/spec/templates/page.typ
@@ -85,8 +85,11 @@
 ///   - Hint: use `""` to generate an empty description.
 /// - authors (array | str): The author(s) of the page.
 /// - kind (str): The kind of the page.
+/// - cond (function): A predicate that can be used inside of `context`
+///                    to check whether display rules should be applied.
+///                    Useful for including other chapters invisibly to figure out information about their labels
 /// - plain-body (content): The plain body of the page.
-#let project(title: "Typst Book", description: auto, authors: (), kind: "page", plain-body) = {
+#let project(title: "Typst Book", description: auto, authors: (), kind: "page", cond: none, plain-body) = {
   // set basic document metadata
   set document(
     author: authors,
@@ -137,23 +140,28 @@
     lang: "en",
   )
 
-  // markup setting
-  show: markup-rules.with(
-    ..common,
-    themes: themes,
-    heading-sizes: heading-sizes,
-    list-indent: list-indent,
-    main-size: main-size,
-  )
-  // math setting
-  show: equation-rules.with(..common, theme-box: theme-box)
-  // code block setting
-  show: code-block-rules.with(..common, themes: themes, code-font: code-font)
-
-  // Main body.
-  set par(justify: true)
-
-  plain-body
+  // `cond` defaults to `none`: treat a missing predicate as "apply the display rules" rather than calling `none`.
+  context if cond == none or cond() {
+    // markup setting
+    show: markup-rules.with(
+      ..common,
+      themes: themes,
+      heading-sizes: heading-sizes,
+      list-indent: list-indent,
+      main-size: main-size,
+    )
+    // math setting
+    show: equation-rules.with(..common, theme-box: theme-box)
+    // code block setting
+    show: code-block-rules.with(..common, themes: themes, code-font: code-font)
+
+    // Main body.
+    set par(justify: true)
+    plain-body
+  } else {
+    // Predicate rejected styling: emit the body without the display rules above.
+    plain-body
+  }
 }
 
 #let part-style = heading
diff --git a/spec/variables.typ b/spec/variables.typ
index 42e7bc379..d62fec7ac 100644
--- a/spec/variables.typ
+++ b/spec/variables.typ
@@ -1,11 +1,10 @@
 #import "/book.typ": book-page
 #import "/src.typ": load_config
 
-#show: book-page.with(title: "Variables")
+#show: book-page("variables.typ")
 
 #let config = load_config()
 
-= Variables
 While this VM operates on 64-bit words, the proving system's base field has fewer than $2^64$ elements available and thus cannot represent all words natively.
 To this end, we introduce the concept of "variables" as an abstraction layer on top of the VM's field elements. The following table lists all variable types used in this VM.
 

From 5084c80dcb5ba347092f8f5e001ce0f9ec1f5d30 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 27 Jan 2026 16:06:22 +0100
Subject: [PATCH 045/105] spec: Update LT interaction signature so that it can
 be used properly for timestamps (#246)

* spec: Update LT interaction signature so that it can be used properly for timestamps

* fix(spec): add missing signed argument to LT from MEMW

* Update spec/src/lt.toml

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* Update spec/src/lt.toml

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

---------

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/lt.typ        |  5 +++--
 spec/src/cpu.toml  |  2 +-
 spec/src/lt.toml   | 22 +++++++++++++++++-----
 spec/src/memw.toml | 14 +++++++-------
 4 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/spec/lt.typ b/spec/lt.typ
index cb4f9c141..ea36eb4dc 100644
--- a/spec/lt.typ
+++ b/spec/lt.typ
@@ -22,7 +22,7 @@ The `LT` chip is comprised of #nr_variables variables that are expressed using #
 #render_chip_column_table(chip, config)
 
 == Assumptions
-We assume the inputs `lhs`, `rhs` and `signed` are appropriately range checked.
+We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
 #render_chip_assumptions(chip, config)
 
 == Constraints
@@ -71,7 +71,8 @@ Therefore, we can use $Q$ to constrain `lt` when `signed = 1`.
 
 #render_constraint_table(chip, config, groups: "defs")
 
-And then we constrain the subtraction.
+And then we constrain the subtraction,
+taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
 
 #render_constraint_table(chip, config, groups: "sub")
 
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index d8609e935..c151b6eff 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -567,7 +567,7 @@ ref = "cpu:c:sub"
 [[constraints.alu]]
 kind = "interaction"
 tag = "LT"
-input = [["cast", "arg1", "DWordHHW"], ["cast", "arg2", "DWordHHW"], "signed"]
+input = [["cast", "arg1", "DWordWL"], ["cast", "arg2", "DWordWL"], "signed"]
 output = ["idx", "res", 0]
 multiplicity = ["+", "SLT", "BLT"]
 
diff --git a/spec/src/lt.toml b/spec/src/lt.toml
index 3836cdd13..10497b637 100644
--- a/spec/src/lt.toml
+++ b/spec/src/lt.toml
@@ -78,13 +78,11 @@ pad = 0
 
 
 [[assumptions]]
-desc = "`IS_HALFWORD[lhs[i]]` and `IS_WORD[lhs[0]]`"
-iter = ["i", 1, 2]
+desc = "`IS_WORD[lhs[0]]`"
 ref = "lt:a:range_lhs"
 
 [[assumptions]]
-desc = "`IS_HALFWORD[rhs[i]]` and `IS_WORD[rhs[0]]`"
-iter = ["i", 1, 2]
+desc = "`IS_WORD[rhs[0]]`"
 ref = "lt:a:range_rhs"
 
 [[assumptions]]
@@ -130,6 +128,20 @@ tag = "IS_BIT"
 input = [["idx", "carry", "i"]]
 iter = ["i", 0, 1]
 
+[[constraints.defs]]
+kind = "interaction"
+tag = "IS_HALFWORD"
+input = [["idx", "lhs", 1]]
+multiplicity = "μ"
+ref = "lt:c:range_lhs"
+
+[[constraints.defs]]
+kind = "interaction"
+tag = "IS_HALFWORD"
+input = [["idx", "rhs", 1]]
+multiplicity = "μ"
+ref = "lt:c:range_rhs"
+
 [[constraints.sub]]
 kind = "interaction"
 tag = "IS_HALFWORD"
@@ -146,6 +158,6 @@ desc = "Each row contributes the following to the LogUp sum"
 [[constraints.output]]
 kind = "interaction"
 tag = "LT"
-input = ["lhs", "rhs", "signed"]
+input = [["cast", "lhs", "DWordWL"], ["cast", "rhs", "DWordWL"], "signed"]
 output = "lt"
 multiplicity = "-μ"
diff --git a/spec/src/memw.toml b/spec/src/memw.toml
index 9aa9cd592..f7276a9cd 100644
--- a/spec/src/memw.toml
+++ b/spec/src/memw.toml
@@ -159,21 +159,21 @@ iters = [
 [[constraints.consistency]]
 kind = "interaction"
 tag = "LT"
-input = [["idx", "old_timestamp", 0], "timestamp"]
+input = [["idx", "old_timestamp", 0], "timestamp", 0]
 output = 1
 multiplicity = "μ_sum"
 
 [[constraints.consistency]]
 kind = "interaction"
 tag = "LT"
-input = [["idx", "old_timestamp", 1], "timestamp"]
+input = [["idx", "old_timestamp", 1], "timestamp", 0]
 output = 1
 multiplicity = "w2"
 
 [[constraints.consistency]]
 kind = "interaction"
 tag = "LT"
-input = [["idx", "old_timestamp", "i"], "timestamp"]
+input = [["idx", "old_timestamp", "i"], "timestamp", 0]
 output = 1
 iter = ["i", 2, 3]
 multiplicity = "w4"
@@ -181,7 +181,7 @@ multiplicity = "w4"
 [[constraints.consistency]]
 kind = "interaction"
 tag = "LT"
-input = [["idx", "old_timestamp", "i"], "timestamp"]
+input = [["idx", "old_timestamp", "i"], "timestamp", 0]
 output = 1
 iter = ["i", 4, 7]
 multiplicity = "write8"
@@ -194,21 +194,21 @@ prefix = "R"
 [[constraints.overflow]]
 kind = "interaction"
 tag = "LT"
-input = ["base_address", ["cast", ["idx", "address_add", 0], "DWordWL"]]
+input = ["base_address", ["cast", ["idx", "address_add", 0], "DWordWL"], 0]
 output = 1
 multiplicity = "write2"
 
 [[constraints.overflow]]
 kind = "interaction"
 tag = "LT"
-input = ["base_address", ["cast", ["idx", "address_add", 2], "DWordWL"]]
+input = ["base_address", ["cast", ["idx", "address_add", 2], "DWordWL"], 0]
 output = 1
 multiplicity = "write4"
 
 [[constraints.overflow]]
 kind = "interaction"
 tag = "LT"
-input = ["base_address", ["cast", ["idx", "address_add", 6], "DWordWL"]]
+input = ["base_address", ["cast", ["idx", "address_add", 6], "DWordWL"], 0]
 output = 1
 multiplicity = "write8"
 

From c72eefef3c43faee9e0a463581a58397eb5b68a7 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Tue, 27 Jan 2026 16:43:19 +0100
Subject: [PATCH 046/105] spec: `HALT` chip (#235)

* spec: HALT: first draft

* spec: HALT: add link to sys call number

* spec: HALT: update ECALL signature

* spec: HALT: minor update

* spec: HALT: document cleanup verification alternative

* adapt to new chapter format

* spec: HALT: fix MEMW register indexing

* spec: HALT: move halt.typ into ecall.typ

---------

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>
---
 spec/ecall.typ     | 44 ++++++++++++++++++++++++++++++++++--
 spec/src/halt.toml | 56 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 98 insertions(+), 2 deletions(-)
 create mode 100644 spec/src/halt.toml

diff --git a/spec/ecall.typ b/spec/ecall.typ
index 3bf2d3b69..9f2d96048 100644
--- a/spec/ecall.typ
+++ b/spec/ecall.typ
@@ -1,4 +1,4 @@
-#import "/book.typ": book-page
+#import "/book.typ": book-page, aside
 #import "/src.typ": load_config, load_chip
 #import "/chip.typ": (
   render_chip_column_table,
@@ -13,5 +13,45 @@
 
 #show: book-page("ecall.typ")
 
-*placeholder chapter: WIP*
+#let config = load_config()
+#let chip = load_chip("src/halt.toml", config)
+#let halt = raw(chip.name)
+== #halt chip
+
+=== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+
+The #halt chip leverages #nr_variables variable, spanning #nr_columns columns:
+#render_chip_column_table(chip, config)
+
+=== Assumptions
+It is assumed the input is range checked:
+#render_chip_assumptions(chip, config)
+
+=== Constraints
+The #halt chip:
++ makes sure register `x10` (containing the exit code) equals $0$ (@halt:c:read_zero_exit_code),
++ writes $0$ to all other registers (@halt:c:zeroize_registers_lo/@halt:c:zeroize_registers_hi), and
++ sets `pc` equal to $1$ (@halt:c:pc).
+Note that the writes performed by all these interactions are accompanied by the timestamp $2^64-1$, the maximum timestamp.
+This prevents any other operation involving memory from being executed hereafter.
+#render_constraint_table(chip, config, groups: "all")
+
+#aside("Note on register clean up",
+[
+  Observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument.
+  Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there.
+])
+
+==== Lookup
+The #halt chip contributes the following interaction to the lookup argument:
+#render_constraint_table(chip, config, groups: "lookup")
+
+*Note*: #link("https://github.com/riscv-collab/riscv-gnu-toolchain/blob/master/linux-headers/include/asm-generic/unistd.h#L258")[$93$ is the system call number corresponding to `sys_exit`.]
+
+=== Padding
+This chip should only contain a single row.
+Given that $2^0 = 1$, this chip does not need to be padded.
+As such, no padding is defined.
 
diff --git a/spec/src/halt.toml b/spec/src/halt.toml
new file mode 100644
index 000000000..b0606e3e4
--- /dev/null
+++ b/spec/src/halt.toml
@@ -0,0 +1,56 @@
+name = "HALT"
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "timestamp at which to halt the program"
+
+
+[[assumptions]]
+desc = "`IS_WORD[timestamp[i]]`"
+iter = ["i", 0, 1]
+
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, "i"], 0, ["-", ["^", 2, 64], 1], 1, 0, 0]
+iter = ["i", 1, 9]
+multiplicity = 1
+ref = "halt:c:zeroize_registers_lo"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, 10], 0, ["-", ["^", 2, 64], 1], 1, 0, 0]
+output = 0
+multiplicity = 1
+ref = "halt:c:read_zero_exit_code"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, "i"], 0, ["-", ["^", 2, 64], 1], 1, 0, 0]
+iter = ["i", 11, 31]
+multiplicity = 1
+ref = "halt:c:zeroize_registers_hi"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, 255], 1, ["-", ["^", 2, 64], 1], 1, 0, 0]
+multiplicity = 1
+ref = "halt:c:pc"
+
+[[constraint_groups]]
+name = "lookup"
+
+[[constraints.lookup]]
+kind = "interaction"
+tag = "ECALL"
+input = ["timestamp", 93]
+multiplicity = ["-", 1]
+ref = "halt:c:lookup"
\ No newline at end of file

From 19c016e7b3d107de9360b311c0cfeaf48792e979 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Wed, 28 Jan 2026 10:45:31 -0300
Subject: [PATCH 047/105] update

---
 docs/spec/cpu.md  |  4 ++--
 docs/spec/halt.md | 32 ++++++++++++++++++++++++++++++++
 docs/spec/lt.md   |  8 +++++---
 docs/spec/memw.md | 14 +++++++-------
 4 files changed, 46 insertions(+), 12 deletions(-)
 create mode 100644 docs/spec/halt.md

diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index 960133d60..66c539635 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -138,7 +138,7 @@ pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD -
 |-----|------|-------|-------------|--------------|
 | `A1` | template |  | ADD + LOAD + STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
 | `cpu:c:sub` | template |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `A3` | interaction |  | `LT[res[0]; arg1::DWordHHW, arg2::DWordHHW, signed]` | SLT + BLT |
+| `A3` | interaction |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
 | `A4` | arith | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
 | | | _polynomial:_ `(SLT + BLT) * res[i] = 0` | |
 | `A5` | interaction | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
@@ -171,7 +171,7 @@ pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD -
 | `cpu:c:ebreak_traps` | arith | `!EBREAK` |  |
 | | | _polynomial:_ `1 - EBREAK = 0` | |
 | | | _note:_ We treat `EBREAK` as an unprovable trap | |
-| `S2` | interaction | `ECALL[rvd; rv1, pc, timestamp, rv2]` | ECALL |
+| `S2` | interaction | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
 
 ### ext
 
diff --git a/docs/spec/halt.md b/docs/spec/halt.md
new file mode 100644
index 000000000..72ecac037
--- /dev/null
+++ b/docs/spec/halt.md
@@ -0,0 +1,32 @@
+# HALT Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which to halt the program |
+
+## Assumptions
+
+| Ref | Range | Description |
+|-----|-------|-------------|
+| `A1` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+## Constraints
+
+### all
+
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `halt:c:zeroize_registers_lo` | interaction | i ∈ [1, 9] | `MEMW[1, 2 * i, 0, 2^64 - 1, 1, 0, 0]` | 1 |
+| `halt:c:read_zero_exit_code` | interaction |  | `MEMW[1, 2 * 10, 0, 2^64 - 1, 1, 0, 0]` | 1 |
+| `halt:c:zeroize_registers_hi` | interaction | i ∈ [11, 31] | `MEMW[1, 2 * i, 0, 2^64 - 1, 1, 0, 0]` | 1 |
+| `halt:c:pc` | interaction |  | `MEMW[1, 2 * 255, 1, 2^64 - 1, 1, 0, 0]` | 1 |
+
+### lookup
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `halt:c:lookup` | interaction | `ECALL[timestamp, 93]` | -1 |
diff --git a/docs/spec/lt.md b/docs/spec/lt.md
index f2cbb1e9b..b72ed2041 100644
--- a/docs/spec/lt.md
+++ b/docs/spec/lt.md
@@ -52,8 +52,8 @@ unsigned_lt := carry[1]
 
 | Ref | Range | Description |
 |-----|-------|-------------|
-| `lt:a:range_lhs` | i ∈ [1, 2] | `IS_HALFWORD[lhs[i]]` and `IS_WORD[lhs[0]]` |
-| `lt:a:range_rhs` | i ∈ [1, 2] | `IS_HALFWORD[rhs[i]]` and `IS_WORD[rhs[0]]` |
+| `lt:a:range_lhs` |  | `IS_WORD[lhs[0]]` |
+| `lt:a:range_rhs` |  | `IS_WORD[rhs[0]]` |
 | `lt:a:range_signed` |  | `IS_BIT<signed>` |
 
 ## Constraints
@@ -68,6 +68,8 @@ _Enforce that variables have been correctly computed_
 | `lt:c:lt` | arith | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
 | | | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
 | | | _note:_ Where $A = #`lhs_msb`$, $B = #`rhs_msb`$ and $C = #`carry[1]`$ | |
+| `lt:c:range_lhs` | interaction | `IS_HALFWORD[lhs[1]]` | μ |
+| `lt:c:range_rhs` | interaction | `IS_HALFWORD[rhs[1]]` | μ |
 
 ### sub
 _Constrain the subtraction_
@@ -82,4 +84,4 @@ _Each row contributes the following to the LogUp sum_
 
 | Ref | Kind | Description | Multiplicity |
 |-----|------|-------------|--------------|
-| `1` | interaction | `LT[lt; lhs, rhs, signed]` | -μ |
+| `1` | interaction | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
index ddee6b852..84246b8e0 100644
--- a/docs/spec/memw.md
+++ b/docs/spec/memw.md
@@ -81,18 +81,18 @@ w4 := write4 + write8
 | `4` | template | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
 | `5` | template | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
 | `6` | interaction | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
-| `7` | interaction |  | `LT[1; old_timestamp[0], timestamp]` | μ_sum |
-| `8` | interaction |  | `LT[1; old_timestamp[1], timestamp]` | w2 |
-| `9` | interaction | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp]` | w4 |
-| `10` | interaction | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp]` | write8 |
+| `7` | interaction |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `8` | interaction |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `9` | interaction | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `10` | interaction | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
 
 ### overflow
 
 | Ref | Kind | Description | Multiplicity |
 |-----|------|-------------|--------------|
-| `R1` | interaction | `LT[1; base_address, address_add[0]::DWordWL]` | write2 |
-| `R2` | interaction | `LT[1; base_address, address_add[2]::DWordWL]` | write4 |
-| `R3` | interaction | `LT[1; base_address, address_add[6]::DWordWL]` | write8 |
+| `R1` | interaction | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
+| `R2` | interaction | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
+| `R3` | interaction | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
 
 ### memory
 

From c902acdc83d7a61022b336f0196f8bca9cb7e7a7 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Thu, 29 Jan 2026 13:45:19 +0100
Subject: [PATCH 048/105] spec: minor `MUL` fixes (#223)

* spec: MUL: fix missing iters

* spec: MUL: fix res slice in lookup contribution

* spec: MUL: split `res` into `lo` and `hi`

* spec: MUL: replace `range` by `iter`

* spec: MUL: update `lo` and `hi` types + introduce `res` as virtual

* spec: MUL: add note on future optimization
---
 spec/mul.typ      | 16 ++++++++---
 spec/src/mul.toml | 68 ++++++++++++++++++++++++++++++++---------------
 2 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/spec/mul.typ b/spec/mul.typ
index bc5898fa0..b2fb53d92 100644
--- a/spec/mul.typ
+++ b/spec/mul.typ
@@ -32,7 +32,6 @@ The following range checks are assumed to be performed/enforced outside of this
 #render_chip_assumptions(chip, config)
 
 == Constraints
-
 === Overview
 When `lhs` and `rhs` are _unsigned_ integers, computing their product $mod 2^128$ comes down to evaluating
 $
@@ -65,14 +64,14 @@ We let `raw_product` capture the second summation in this last formula (see @mul
 By construction, $#`raw_product`_i < 2^51$ for all $i in [0, 3]$, far exceeding the 32-bits that fit in a single `Word`-limb.
 What remains then is to reduce each limb of `raw_product` $mod 2^32$, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
 
-This reduce-and-carry operation is constrained @mul:a:res and @mul:c:carry, combined with `carry`'s definition.
+This reduce-and-carry operation is constrained by @mul:c:range_lo/@mul:c:range_hi and @mul:c:carry, combined with `carry`'s definition.
 @mul:c:carry and `carry`'s definition enforce that
 $
   forall i in [0, 3]: #`raw_product`_i + #`carry`_(i-1) - #`res`_i in { k dot 2^32 | k in [0, 2^20) }
 $
 with $#`carry`_(-1) = 0$ for simplicity.
 In other words: $#`res`_i equiv #`raw_product`_i + #`carry`_(i-1) (mod 2^32)$.
-With @mul:a:res forcing $#`res`_i < 2^32$, $#`res`_i$ can only assume one value: $#`raw_product`_i + #`carry`_(i-1) mod 2^32$.
+With @mul:c:range_lo/@mul:c:range_hi forcing $#`res`_i < 2^32$, $#`res`_i$ can only assume one value: $#`raw_product`_i + #`carry`_(i-1) mod 2^32$.
 
 *Note*: one may have observed that @mul:c:carry requires $#`carry`_i in [0, 2^20)$, while no limb of a valid carry value would ever exceed $2^19$.
 This is indeed the case.
@@ -81,7 +80,7 @@ In fact, in this situation it suffices to assert that $#`carry`_i < frac(p, 2^32
 Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
 
 === Definitions
-We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `carry` is appropriately range checked.
+We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
 #render_constraint_table(chip, config, groups: "def")
 
 === Product
@@ -97,3 +96,12 @@ The #mul chip contributes the following to the lookup:
 The table can be padded to the next power of two with the following value assignments:
 
 #render_chip_padding_table(chip, config)
+
+== Notes
+- `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because their values are range checked inside this chip, using `IS_HALF` lookups.
+  Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere.
+
+  As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, 
+  where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`;
+  the value sent into the lookup could then be assumed range-checked by the other side of the relation.
+  This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
\ No newline at end of file
diff --git a/spec/src/mul.toml b/spec/src/mul.toml
index bf9ffc276..e987b0f75 100644
--- a/spec/src/mul.toml
+++ b/spec/src/mul.toml
@@ -31,9 +31,15 @@ pad = 0
 # Output
 
 [[variables.output]]
-name = "res"
-type = "QuadWL"
-desc = "the (extended) multiplication result"
+name = "lo"
+type = "DWordHL"
+desc = "the lower limbs of the (extended) multiplication result"
+pad = 0
+
+[[variables.output]]
+name = "hi"
+type = "DWordHL"
+desc = "the upper limbs of the (extended) multiplication result"
 pad = 0
 
 # Auxiliary
@@ -63,8 +69,8 @@ name = "lhs_ext"
 type = ["Half", 8]
 desc = "sign-extended value of `lhs`"
 def = {idx="i", polys=[
-    {range=[0, 3], poly=["idx", "lhs", "i"]},
-    {range=[4, 7], poly=["*", 0xFFFF, "lhs_is_negative"]},
+    {iter=[0, 3], poly=["idx", "lhs", "i"]},
+    {iter=[4, 7], poly=["*", 0xFFFF, "lhs_is_negative"]},
 ]}
 
 [[variables.virtual]]
@@ -72,17 +78,27 @@ name = "rhs_ext"
 type = ["Half", 8]
 desc = "sign-extended value of `rhs`"
 def = {idx="i", polys=[
-    {range=[0, 3], poly=["idx", "rhs", "i"]},
-    {range=[4, 7], poly=["*", 0xFFFF, "rhs_is_negative"]},
+    {iter=[0, 3], poly=["idx", "rhs", "i"]},
+    {iter=[4, 7], poly=["*", 0xFFFF, "rhs_is_negative"]},
 ]}
 
+[[variables.virtual]]
+name = "res"
+type = "QuadWL"
+desc = "concatenation of `lo` and `hi`."
+def = {idx="i", polys=[
+    {iter=[0, 1], poly=["idx", ["cast", "lo", "DWordWL"], "i"]},
+    {iter=[2, 3], poly=["idx", ["cast", "hi", "DWordWL"], ["-", "i", 2]]},
+]}
+
+
 [[variables.virtual]]
 name = "carry"
 type = ["B20", 4]
 desc = "carry values"
 def = {idx="i", polys=[
-    {range=0, poly=["*", ["^", 2, -32], ["-", ["idx", "raw_product", 0], ["idx", "res", 0]]]},
-    {range=[1, 3], poly=["*", ["^", 2, -32], ["-", ["+", ["idx", "raw_product", "i"], ["idx", "carry", ["-", "i", 1]]], ["idx", "res", "i"]]]},
+    {iter=0, poly=["*", ["^", 2, -32], ["-", ["idx", "raw_product", 0], ["idx", "res", 0]]]},
+    {iter=[1, 3], poly=["*", ["^", 2, -32], ["-", ["+", ["idx", "raw_product", "i"], ["idx", "carry", ["-", "i", 1]]], ["idx", "res", "i"]]]},
 ]}
 
 [[variables.virtual]]
@@ -109,17 +125,11 @@ pad = 0
 
 [[assumptions]]
 desc = "`IS_HALF[lhs[i]]`"
-range = ["i", 0, 3]
+iter = ["i", 0, 3]
 
 [[assumptions]]
 desc = "`IS_HALF[rhs[i]]`"
-range = ["i", 0, 3]
-
-[[assumptions]]
-desc = "`IS_WORD[res[i]]`"
-range = ["i", 0, 3]
-ref = "mul:a:res"
-
+iter = ["i", 0, 3]
 
 # Constraints
 
@@ -140,11 +150,27 @@ input = [["idx", "rhs", 3], "rhs_signed"]
 output = "rhs_is_negative"
 ref = "mul:c:rhs_is_negative"
 
+[[constraints.def]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "lo", "i"]]
+iter = ["i", 0, 3]
+multiplicity = "μ_sum"
+ref = "mul:c:range_lo"
+
+[[constraints.def]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "hi", "i"]]
+iter = ["i", 0, 3]
+multiplicity = "μ_sum"
+ref = "mul:c:range_hi"
+
 [[constraints.def]]
 kind = "interaction"
 tag = "IS_B20"
 input = [["idx", "carry", "i"]]
-range = ["i", 0, 3]
+iter = ["i", 0, 3]
 multiplicity = "μ_sum"
 ref = "mul:c:carry"
 
@@ -156,7 +182,7 @@ name = "prod"
 kind = "arith"
 constraint = "$#`raw_product[i]` = sum_(#`k`=0)^1 2^(16k) sum_(#`j`=0)^(2i+k) #`lhs_ext[j]` dot #`rhs_ext[2i+k-j]`$"
 poly = ["-", ["sum", ["=", "k", 0], "1", ["*", ["^", 2, ["*", 16, "k"]], ["sum", ["=", "j", 0], ["+", ["*", 2, "i"], "k"], ["*", ["idx", "lhs_ext", "j"], ["idx", "rhs_ext", ["-", ["+", ["*", 2, "i"], "k"], "j"]]]]]], ["idx", "raw_product", "i"]]
-range = ["i", 0, 3]
+iter = ["i", 0, 3]
 ref = "mul:c:raw_product"
 
 [[constraint_groups]]
@@ -166,7 +192,7 @@ name = "lookup"
 kind = "interaction"
 tag = "MUL"
 input = ["lhs", "lhs_signed", "rhs", "rhs_signed", "0"]
-output = ["idx", "res", "0:4"]
+output = ["cast", "lo", "DWordWL"]
 multiplicity = ["-", "μ_lo"]
 ref = "mul:c:lookup_lo"
 
@@ -174,6 +200,6 @@ ref = "mul:c:lookup_lo"
 kind = "interaction"
 tag = "MUL"
 input = ["lhs", "lhs_signed", "rhs", "rhs_signed", "1"]
-output = ["idx", "res", "4:8"]
+output = ["cast", "hi", "DWordWL"]
 multiplicity = ["-", "μ_hi"]
 ref = "mul:c:lookup_hi"
\ No newline at end of file

From f85cc3893a1c6193becf325a23bc59d077f490de Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Thu, 29 Jan 2026 15:47:31 -0300
Subject: [PATCH 049/105] update spec md

---
 docs/spec/cpu.md  |  4 ++--
 docs/spec/halt.md | 32 +++++++++++++++++++++++++++++++
 docs/spec/lt.md   |  8 +++++---
 docs/spec/memw.md | 14 +++++++-------
 docs/spec/mul.md  | 49 ++++++++++++++++++++++++++++-------------------
 5 files changed, 75 insertions(+), 32 deletions(-)
 create mode 100644 docs/spec/halt.md

diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index 960133d60..66c539635 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -138,7 +138,7 @@ pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD -
 |-----|------|-------|-------------|--------------|
 | `A1` | template |  | ADD + LOAD + STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
 | `cpu:c:sub` | template |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `A3` | interaction |  | `LT[res[0]; arg1::DWordHHW, arg2::DWordHHW, signed]` | SLT + BLT |
+| `A3` | interaction |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
 | `A4` | arith | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
 | | | _polynomial:_ `(SLT + BLT) * res[i] = 0` | |
 | `A5` | interaction | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
@@ -171,7 +171,7 @@ pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD -
 | `cpu:c:ebreak_traps` | arith | `!EBREAK` |  |
 | | | _polynomial:_ `1 - EBREAK = 0` | |
 | | | _note:_ We treat `EBREAK` as an unprovable trap | |
-| `S2` | interaction | `ECALL[rvd; rv1, pc, timestamp, rv2]` | ECALL |
+| `S2` | interaction | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
 
 ### ext
 
diff --git a/docs/spec/halt.md b/docs/spec/halt.md
new file mode 100644
index 000000000..72ecac037
--- /dev/null
+++ b/docs/spec/halt.md
@@ -0,0 +1,32 @@
+# HALT Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which to halt the program |
+
+## Assumptions
+
+| Ref | Range | Description |
+|-----|-------|-------------|
+| `A1` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+## Constraints
+
+### all
+
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `halt:c:zeroize_registers_lo` | interaction | i ∈ [1, 9] | `MEMW[1, 2 * i, 0, 2^64 - 1, 1, 0, 0]` | 1 |
+| `halt:c:read_zero_exit_code` | interaction |  | `MEMW[1, 2 * 10, 0, 2^64 - 1, 1, 0, 0]` | 1 |
+| `halt:c:zeroize_registers_hi` | interaction | i ∈ [11, 31] | `MEMW[1, 2 * i, 0, 2^64 - 1, 1, 0, 0]` | 1 |
+| `halt:c:pc` | interaction |  | `MEMW[1, 2 * 255, 1, 2^64 - 1, 1, 0, 0]` | 1 |
+
+### lookup
+
+| Ref | Kind | Description | Multiplicity |
+|-----|------|-------------|--------------|
+| `halt:c:lookup` | interaction | `ECALL[timestamp, 93]` | -1 |
diff --git a/docs/spec/lt.md b/docs/spec/lt.md
index f2cbb1e9b..b72ed2041 100644
--- a/docs/spec/lt.md
+++ b/docs/spec/lt.md
@@ -52,8 +52,8 @@ unsigned_lt := carry[1]
 
 | Ref | Range | Description |
 |-----|-------|-------------|
-| `lt:a:range_lhs` | i ∈ [1, 2] | `IS_HALFWORD[lhs[i]]` and `IS_WORD[lhs[0]]` |
-| `lt:a:range_rhs` | i ∈ [1, 2] | `IS_HALFWORD[rhs[i]]` and `IS_WORD[rhs[0]]` |
+| `lt:a:range_lhs` |  | `IS_WORD[lhs[0]]` |
+| `lt:a:range_rhs` |  | `IS_WORD[rhs[0]]` |
 | `lt:a:range_signed` |  | `IS_BIT<signed>` |
 
 ## Constraints
@@ -68,6 +68,8 @@ _Enforce that variables have been correctly computed_
 | `lt:c:lt` | arith | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
 | | | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
 | | | _note:_ Where $A = #`lhs_msb`$, $B = #`rhs_msb`$ and $C = #`carry[1]`$ | |
+| `lt:c:range_lhs` | interaction | `IS_HALFWORD[lhs[1]]` | μ |
+| `lt:c:range_rhs` | interaction | `IS_HALFWORD[rhs[1]]` | μ |
 
 ### sub
 _Constrain the subtraction_
@@ -82,4 +84,4 @@ _Each row contributes the following to the LogUp sum_
 
 | Ref | Kind | Description | Multiplicity |
 |-----|------|-------------|--------------|
-| `1` | interaction | `LT[lt; lhs, rhs, signed]` | -μ |
+| `1` | interaction | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
index ddee6b852..84246b8e0 100644
--- a/docs/spec/memw.md
+++ b/docs/spec/memw.md
@@ -81,18 +81,18 @@ w4 := write4 + write8
 | `4` | template | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
 | `5` | template | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
 | `6` | interaction | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
-| `7` | interaction |  | `LT[1; old_timestamp[0], timestamp]` | μ_sum |
-| `8` | interaction |  | `LT[1; old_timestamp[1], timestamp]` | w2 |
-| `9` | interaction | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp]` | w4 |
-| `10` | interaction | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp]` | write8 |
+| `7` | interaction |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `8` | interaction |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `9` | interaction | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `10` | interaction | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
 
 ### overflow
 
 | Ref | Kind | Description | Multiplicity |
 |-----|------|-------------|--------------|
-| `R1` | interaction | `LT[1; base_address, address_add[0]::DWordWL]` | write2 |
-| `R2` | interaction | `LT[1; base_address, address_add[2]::DWordWL]` | write4 |
-| `R3` | interaction | `LT[1; base_address, address_add[6]::DWordWL]` | write8 |
+| `R1` | interaction | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
+| `R2` | interaction | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
+| `R3` | interaction | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
 
 ### memory
 
diff --git a/docs/spec/mul.md b/docs/spec/mul.md
index c0efb1d4f..0df284d98 100644
--- a/docs/spec/mul.md
+++ b/docs/spec/mul.md
@@ -15,7 +15,8 @@
 
 | Name | Type | Description |
 |------|------|-------------|
-| `res` | `QuadWL` | the (extended) multiplication result |
+| `lo` | `DWordHL` | the lower limbs of the (extended) multiplication result |
+| `hi` | `DWordHL` | the upper limbs of the (extended) multiplication result |
 
 ### Auxiliary
 
@@ -31,25 +32,32 @@
 |------|------|-------------|
 | `lhs_ext` | `Half[8]` | sign-extended value of `lhs` |
 | `rhs_ext` | `Half[8]` | sign-extended value of `rhs` |
+| `res` | `QuadWL` | concatenation of `lo` and `hi`. |
 | `carry` | `B20[4]` | carry values |
 | `μ_sum` | `BaseField` | sum of multiplicies |
 
 **Definition of `lhs_ext`:**
 ```
-lhs_ext := lhs[i]
-lhs_ext := 65535 * lhs_is_negative
+lhs_ext (when iter=[0, 3]) := lhs[i]
+lhs_ext (when iter=[4, 7]) := 65535 * lhs_is_negative
 ```
 
 **Definition of `rhs_ext`:**
 ```
-rhs_ext := rhs[i]
-rhs_ext := 65535 * rhs_is_negative
+rhs_ext (when iter=[0, 3]) := rhs[i]
+rhs_ext (when iter=[4, 7]) := 65535 * rhs_is_negative
+```
+
+**Definition of `res`:**
+```
+res (when iter=[0, 1]) := (lo::DWordWL)[i]
+res (when iter=[2, 3]) := (hi::DWordWL)[i - 2]
 ```
 
 **Definition of `carry`:**
 ```
-carry := 2^-32 * (raw_product[0] - res[0])
-carry := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
+carry (when iter=0) := 2^-32 * (raw_product[0] - res[0])
+carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
 ```
 
 **Definition of `μ_sum`:**
@@ -68,30 +76,31 @@ carry := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
 
 | Ref | Range | Description |
 |-----|-------|-------------|
-| `A1` |  | `IS_HALF[lhs[i]]` |
-| `A2` |  | `IS_HALF[rhs[i]]` |
-| `mul:a:res` |  | `IS_WORD[res[i]]` |
+| `A1` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` |
+| `A2` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
 
 ## Constraints
 
 ### def
 
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `mul:c:lhs_is_negative` | template | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
-| `mul:c:rhs_is_negative` | template | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
-| `mul:c:carry` | interaction | `IS_B20[carry[i]]` | μ_sum |
+| Ref | Kind | Range | Description | Multiplicity |
+|-----|------|-------|-------------|--------------|
+| `mul:c:lhs_is_negative` | template |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
+| `mul:c:rhs_is_negative` | template |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
+| `mul:c:range_lo` | interaction | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
+| `mul:c:range_hi` | interaction | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
+| `mul:c:carry` | interaction | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
 
 ### prod
 
-| Ref | Kind | Description |
-|-----|------|-------------|
-| `mul:c:raw_product` | arith | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
+| Ref | Kind | Range | Description |
+|-----|------|-------|-------------|
+| `mul:c:raw_product` | arith | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
 | | | _polynomial:_ `Σ_k = 0^1 2^(16 * k) * Σ_j = 0^2 * i + k lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
 
 ### lookup
 
 | Ref | Kind | Description | Multiplicity |
 |-----|------|-------------|--------------|
-| `mul:c:lookup_lo` | interaction | `MUL[res[0:4]; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
-| `mul:c:lookup_hi` | interaction | `MUL[res[4:8]; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
+| `mul:c:lookup_lo` | interaction | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
+| `mul:c:lookup_hi` | interaction | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |

From 6801f606bb13d3730bce0d36c538490c2a1c2f83 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Fri, 30 Jan 2026 13:23:06 -0300
Subject: [PATCH 050/105] Full typst to markdown

---
 .gitignore             |   6 +
 scripts/README.md      |  44 +++
 scripts/spec_to_md.py  | 494 -----------------------------
 scripts/typst_to_md.py | 686 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 736 insertions(+), 494 deletions(-)
 create mode 100644 scripts/README.md
 delete mode 100755 scripts/spec_to_md.py
 create mode 100644 scripts/typst_to_md.py

diff --git a/.gitignore b/.gitignore
index 8fd6665a6..1da4b778a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,12 @@
 .vscode
 .DS_Store
 
+# Python
+__pycache__/
+*.pyc
+*.pyo
+.venv/
+
 # Compiled ELF artifacts (built by CI/make)
 executor/program_artifacts/
 
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 000000000..14b6acb42
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,44 @@
+# Scripts
+
+## typst_to_md.py
+
+Converts the Typst specification to Markdown format.
+
+### What it does
+
+1. Parses `.typ` files for prose content (headings, paragraphs, notes)
+2. Parses `.toml` files for structured data (variables, constraints, assumptions)
+3. Detects `#render_constraint_table()` calls to insert tables at correct positions
+4. Reads constraint group prefixes from TOML (e.g., `prefix = "R"` → `CR`)
+5. Maintains continuous constraint numbering across groups (CPU-C1 → CPU-CR2 → ...)
+
+### Usage
+
+```bash
+cd scripts
+source .venv/bin/activate
+python typst_to_md.py                          # Output to ../spec/
+python typst_to_md.py -o ../others/spec_md     # Output to specific directory
+```
+
+### Requirements
+
+Python 3.8+ with `tomli` (or Python 3.11+ which has `tomllib` built-in):
+
+```bash
+cd scripts
+python -m venv .venv
+source .venv/bin/activate
+pip install tomli
+```
+
+### Output
+
+Generates 16 markdown files:
+- 15 individual chapter files (`cpu.md`, `memw.md`, etc.)
+- 1 combined file (`spec_full.md`)
+
+### Notes
+
+- Math expressions are kept in Typst notation (not converted to LaTeX); their meaning is unchanged
+- The script reads from `../spec/` (typst source) and `../spec/src/` (TOML data)
diff --git a/scripts/spec_to_md.py b/scripts/spec_to_md.py
deleted file mode 100755
index 09becab09..000000000
--- a/scripts/spec_to_md.py
+++ /dev/null
@@ -1,494 +0,0 @@
-#!/usr/bin/env python3
-"""
-Convert Typst spec TOML files to Markdown.
-
-Usage:
-    # First, extract spec files from the spec/main branch:
-    git show origin/spec/main:spec/src/config.toml > /tmp/spec/config.toml
-    git show origin/spec/main:spec/src/cpu.toml > /tmp/spec/cpu.toml
-    # etc.
-
-    # Then run:
-    python scripts/spec_to_md.py /tmp/spec/config.toml /tmp/spec/cpu.toml
-
-    # Or convert all chips:
-    python scripts/spec_to_md.py /tmp/spec/config.toml /tmp/spec/*.toml
-
-    # Output to a specific directory:
-    python scripts/spec_to_md.py --output-dir docs/spec /tmp/spec/config.toml /tmp/spec/*.toml
-"""
-
-import argparse
-import sys
-from pathlib import Path
-
-# Python 3.11+ has tomllib in stdlib, fallback to tomli for older versions
-try:
-    import tomllib
-except ImportError:
-    try:
-        import tomli as tomllib
-    except ImportError:
-        print("Error: Please install tomli: pip install tomli", file=sys.stderr)
-        sys.exit(1)
-
-
-# =============================================================================
-# Expression Rendering
-# =============================================================================
-
-def expr_to_text(expr: any, parent_prec: int = 100) -> str:
-    """
-    Convert a polynomial expression to readable text.
-
-    Expression grammar (from spec/expr.typ):
-        <expr> ::= str                           ; variable name
-                 | int                           ; constant
-                 | ["idx", expr1, expr2]         ; expr1[expr2]
-                 | ["not", expr]                 ; 1 - expr
-                 | ["+", expr1, expr2, ...]      ; expr1 + expr2 + ...
-                 | ["sum", expr1, expr2, expr3]  ; sum from expr1 to expr2 of expr3
-                 | ["*", expr1, expr2, ...]      ; expr1 * expr2 * ...
-                 | ["/", expr1, expr2]           ; expr1 / expr2
-                 | ["^", expr1, expr2]           ; expr1^expr2
-                 | ["=", expr1, expr2]           ; expr1 = expr2
-                 | [":=", expr1, expr2]          ; expr1 := expr2
-                 | ["-", expr]                   ; -expr (unary)
-                 | ["-", expr1, expr2, ...]      ; expr1 - expr2 - ... (binary)
-                 | ["cast", expr, type]          ; expr::type
-    """
-    PREC = {
-        "idx": 0,
-        "pow": 1,
-        "neg": 2,
-        "cast": 3,
-        "mul": 4,
-        "div": 5,
-        "sum": 6,
-        "not": 7,
-        "add": 8,
-        "sub": 9,
-        "eq": 10,
-    }
-
-    def wrap(s: str, prec: int) -> str:
-        return f"({s})" if parent_prec < prec else s
-
-    if expr is None or expr == "":
-        return ""
-
-    if isinstance(expr, str):
-        return expr
-
-    if isinstance(expr, (int, float)):
-        return str(expr)
-
-    if isinstance(expr, list) and len(expr) > 0:
-        op = expr[0]
-
-        if op == "idx":
-            # expr1[expr2]
-            base = expr_to_text(expr[1], PREC["idx"])
-            idx = expr_to_text(expr[2], 100)
-            return f"{base}[{idx}]"
-
-        elif op == "not":
-            # 1 - expr
-            inner = expr_to_text(expr[1], PREC["not"])
-            return wrap(f"1 - {inner}", PREC["not"])
-
-        elif op == "+":
-            # expr1 + expr2 + ...
-            parts = [expr_to_text(e, PREC["add"]) for e in expr[1:]]
-            return wrap(" + ".join(parts), PREC["add"])
-
-        elif op == "sum":
-            # Σ from expr1 to expr2 of expr3
-            var = expr_to_text(expr[1], 100)
-            upper = expr_to_text(expr[2], 100)
-            body = expr_to_text(expr[3], PREC["sum"])
-            return f"Σ_{var}^{upper} {body}"
-
-        elif op == "*":
-            # expr1 * expr2 * ...
-            parts = [expr_to_text(e, PREC["mul"]) for e in expr[1:]]
-            return wrap(" * ".join(parts), PREC["mul"])
-
-        elif op == "/":
-            # expr1 / expr2
-            num = expr_to_text(expr[1], PREC["div"])
-            den = expr_to_text(expr[2], PREC["div"])
-            return wrap(f"{num} / {den}", PREC["div"])
-
-        elif op == "^":
-            # expr1^expr2
-            base = expr_to_text(expr[1], PREC["pow"])
-            exp = expr_to_text(expr[2], PREC["pow"])
-            return f"{base}^{exp}"
-
-        elif op == "=":
-            # expr1 = expr2
-            lhs = expr_to_text(expr[1], PREC["eq"])
-            rhs = expr_to_text(expr[2], PREC["eq"])
-            return f"{lhs} = {rhs}"
-
-        elif op == ":=":
-            # expr1 := expr2
-            lhs = expr_to_text(expr[1], PREC["eq"])
-            rhs = expr_to_text(expr[2], PREC["eq"])
-            return f"{lhs} := {rhs}"
-
-        elif op == "-":
-            if len(expr) == 2:
-                # Unary negation
-                inner = expr_to_text(expr[1], PREC["neg"])
-                return wrap(f"-{inner}", PREC["neg"])
-            else:
-                # Binary subtraction
-                parts = [expr_to_text(e, PREC["sub"]) for e in expr[1:]]
-                return wrap(" - ".join(parts), PREC["sub"])
-
-        elif op == "cast":
-            # expr::type
-            inner = expr_to_text(expr[1], PREC["cast"])
-            type_str = type_to_text(expr[2])
-            return wrap(f"{inner}::{type_str}", PREC["cast"])
-
-        else:
-            # Unknown operator, render as-is
-            return str(expr)
-
-    return str(expr)
-
-
-def type_to_text(typ: any) -> str:
-    """Convert a type to text."""
-    if isinstance(typ, str):
-        return typ
-    if isinstance(typ, list) and len(typ) == 2:
-        return f"{typ[0]}[{typ[1]}]"
-    return str(typ)
-
-
-def iters_to_text(obj: dict) -> str:
-    """Extract iterator ranges from a constraint/assumption."""
-    iters = []
-
-    if "iter" in obj:
-        it = obj["iter"]
-        if isinstance(it, list) and len(it) == 3:
-            iters.append(f"{it[0]} ∈ [{it[1]}, {it[2]}]")
-        elif isinstance(it, list) and len(it) == 2:
-            iters.append(f"{it[0]} = {it[1]}")
-
-    if "iters" in obj:
-        for it in obj["iters"]:
-            if isinstance(it, list) and len(it) == 3:
-                iters.append(f"{it[0]} ∈ [{it[1]}, {it[2]}]")
-            elif isinstance(it, list) and len(it) == 2:
-                iters.append(f"{it[0]} = {it[1]}")
-
-    return ", ".join(iters)
-
-
-# =============================================================================
-# Markdown Generation
-# =============================================================================
-
-def escape_md(s: str) -> str:
-    """Escape pipe characters for Markdown tables."""
-    if s is None:
-        return ""
-    return str(s).replace("|", "\\|").replace("\n", " ")
-
-
-def render_variables_table(variables: dict, config: dict) -> str:
-    """Render variables as Markdown tables, grouped by category."""
-    lines = []
-
-    category_order = ["input", "output", "auxiliary", "virtual", "multiplicity", "condition"]
-
-    for category in category_order:
-        if category not in variables:
-            continue
-
-        vars_list = variables[category]
-        if not vars_list:
-            continue
-
-        lines.append(f"### {category.capitalize()}")
-        lines.append("")
-        lines.append("| Name | Type | Description |")
-        lines.append("|------|------|-------------|")
-
-        for var in vars_list:
-            name = f"`{var['name']}`"
-            typ = f"`{type_to_text(var.get('type', ''))}`"
-            desc = escape_md(var.get('desc', ''))
-            # Clean up Typst markup in descriptions
-            desc = desc.replace('#`', '`').replace('`#', '`')
-            lines.append(f"| {name} | {typ} | {desc} |")
-
-        # Add definition if present (for virtual variables)
-        for var in vars_list:
-            if "def" in var:
-                defn = var["def"]
-                lines.append("")
-                lines.append(f"**Definition of `{var['name']}`:**")
-                if isinstance(defn, dict):
-                    if "poly" in defn:
-                        lines.append(f"```")
-                        lines.append(f"{var['name']} := {expr_to_text(defn['poly'])}")
-                        lines.append(f"```")
-                    elif "polys" in defn:
-                        lines.append(f"```")
-                        for i, p in enumerate(defn["polys"]):
-                            iter_str = ""
-                            if "iter" in p:
-                                iter_str = f" (when iter={p['iter']})"
-                            lines.append(f"{var['name']}{iter_str} := {expr_to_text(p['poly'])}")
-                        lines.append(f"```")
-                elif isinstance(defn, (list, str)):
-                    lines.append(f"```")
-                    lines.append(f"{var['name']} := {expr_to_text(defn)}")
-                    lines.append(f"```")
-
-        lines.append("")
-
-    return "\n".join(lines)
-
-
-def render_assumptions_table(assumptions: list) -> str:
-    """Render assumptions as a Markdown table."""
-    if not assumptions:
-        return ""
-
-    lines = []
-    lines.append("## Assumptions")
-    lines.append("")
-    lines.append("| Ref | Range | Description |")
-    lines.append("|-----|-------|-------------|")
-
-    for i, assumption in enumerate(assumptions, 1):
-        ref = assumption.get("ref", f"A{i}")
-        iters = iters_to_text(assumption)
-        desc = escape_md(assumption.get("desc", ""))
-        lines.append(f"| `{ref}` | {iters} | {desc} |")
-
-    lines.append("")
-    return "\n".join(lines)
-
-
-def render_constraints_table(constraints: dict, constraint_groups: list) -> str:
-    """Render constraints as Markdown tables, grouped by constraint group."""
-    if not constraints:
-        return ""
-
-    lines = []
-    lines.append("## Constraints")
-    lines.append("")
-
-    # Build group lookup
-    group_info = {g["name"]: g for g in constraint_groups}
-
-    for group_name, group_constraints in constraints.items():
-        if not group_constraints:
-            continue
-
-        group = group_info.get(group_name, {"name": group_name})
-        prefix = group.get("prefix", "")
-        group_desc = group.get("desc", "")
-
-        lines.append(f"### {group_name}")
-        if group_desc:
-            lines.append(f"_{group_desc}_")
-        lines.append("")
-
-        # Determine columns needed
-        has_multiplicity = any("multiplicity" in c for c in group_constraints)
-        has_iter = any(iters_to_text(c) for c in group_constraints)
-
-        # Build header
-        if has_iter and has_multiplicity:
-            header = "| Ref | Kind | Range | Description | Multiplicity |"
-            separator = "|-----|------|-------|-------------|--------------|"
-        elif has_iter:
-            header = "| Ref | Kind | Range | Description |"
-            separator = "|-----|------|-------|-------------|"
-        elif has_multiplicity:
-            header = "| Ref | Kind | Description | Multiplicity |"
-            separator = "|-----|------|-------------|--------------|"
-        else:
-            header = "| Ref | Kind | Description |"
-            separator = "|-----|------|-------------|"
-
-        lines.append(header)
-        lines.append(separator)
-
-        for i, constraint in enumerate(group_constraints, 1):
-            ref = constraint.get("ref", f"{prefix}{i}")
-            kind = constraint.get("kind", "")
-            tag = constraint.get("tag", "")
-            iters = iters_to_text(constraint)
-            mult = expr_to_text(constraint.get("multiplicity", ""))
-
-            # Build description based on kind
-            if kind == "interaction":
-                inputs = ", ".join(expr_to_text(inp) for inp in constraint.get("input", []))
-                output = constraint.get("output")
-                if output:
-                    desc = f"`{tag}[{expr_to_text(output)}; {inputs}]`"
-                else:
-                    desc = f"`{tag}[{inputs}]`"
-
-            elif kind == "arith":
-                desc = escape_md(constraint.get("constraint", ""))
-                # Clean up Typst math markup
-                desc = desc.replace("$", "").replace("#", "")
-
-            elif kind == "template":
-                inputs = ", ".join(expr_to_text(inp) for inp in constraint.get("input", []))
-                output = constraint.get("output")
-                cond = constraint.get("cond")
-                cond_str = f"{expr_to_text(cond)} ⇒ " if cond else ""
-                if output:
-                    desc = f"{cond_str}`{tag}<{expr_to_text(output)}; {inputs}>`"
-                else:
-                    desc = f"{cond_str}`{tag}<{inputs}>`"
-
-            else:
-                desc = str(constraint)
-
-            # Build row
-            row = f"| `{ref}` | {kind} |"
-            if has_iter:
-                row += f" {iters} |"
-            row += f" {desc} |"
-            if has_multiplicity:
-                row += f" {mult} |"
-
-            lines.append(row)
-
-            # Add polynomial constraint if present
-            if kind == "arith" and ("poly" in constraint or "polys" in constraint):
-                if "poly" in constraint:
-                    poly_str = expr_to_text(constraint["poly"])
-                    lines.append(f"| | | _polynomial:_ `{poly_str} = 0` |" + (" |" if has_multiplicity else ""))
-                elif "polys" in constraint:
-                    for poly in constraint["polys"]:
-                        poly_str = expr_to_text(poly)
-                        lines.append(f"| | | _polynomial:_ `{poly_str} = 0` |" + (" |" if has_multiplicity else ""))
-
-            # Add description if present
-            if "desc" in constraint and kind == "arith":
-                desc_text = escape_md(constraint["desc"])
-                lines.append(f"| | | _note:_ {desc_text} |" + (" |" if has_multiplicity else ""))
-
-        lines.append("")
-
-    return "\n".join(lines)
-
-
-def chip_to_markdown(chip: dict, config: dict) -> str:
-    """Convert a chip TOML to Markdown."""
-    lines = []
-
-    name = chip.get("name", "Unknown")
-    lines.append(f"# {name} Chip")
-    lines.append("")
-
-    # Variables
-    variables = chip.get("variables", {})
-    if variables:
-        lines.append("## Columns")
-        lines.append("")
-        lines.append(render_variables_table(variables, config))
-
-    # Assumptions
-    assumptions = chip.get("assumptions", [])
-    if assumptions:
-        lines.append(render_assumptions_table(assumptions))
-
-    # Constraints
-    constraints = chip.get("constraints", {})
-    constraint_groups = chip.get("constraint_groups", [])
-    if constraints:
-        lines.append(render_constraints_table(constraints, constraint_groups))
-
-    return "\n".join(lines)
-
-
-# =============================================================================
-# Main
-# =============================================================================
-
-def load_toml(path: Path) -> dict:
-    """Load a TOML file."""
-    with open(path, "rb") as f:
-        return tomllib.load(f)
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Convert Typst spec TOML files to Markdown",
-        formatter_class=argparse.RawDescriptionHelpFormatter,
-        epilog=__doc__
-    )
-    parser.add_argument(
-        "config",
-        type=Path,
-        help="Path to config.toml"
-    )
-    parser.add_argument(
-        "chips",
-        type=Path,
-        nargs="+",
-        help="Paths to chip TOML files (e.g., cpu.toml, lt.toml)"
-    )
-    parser.add_argument(
-        "--output-dir", "-o",
-        type=Path,
-        default=None,
-        help="Output directory for Markdown files (default: stdout)"
-    )
-
-    args = parser.parse_args()
-
-    # Load config
-    config = load_toml(args.config)
-
-    # Process each chip
-    for chip_path in args.chips:
-        # Skip config.toml if passed as chip
-        if chip_path.name == "config.toml":
-            continue
-
-        # Skip non-chip TOML files
-        if chip_path.name in ("page.toml", "theme-style.toml"):
-            continue
-
-        try:
-            chip = load_toml(chip_path)
-        except Exception as e:
-            print(f"Warning: Failed to load {chip_path}: {e}", file=sys.stderr)
-            continue
-
-        # Check if it's a valid chip file (has 'name' field)
-        if "name" not in chip:
-            continue
-
-        md_content = chip_to_markdown(chip, config)
-
-        if args.output_dir:
-            args.output_dir.mkdir(parents=True, exist_ok=True)
-            output_path = args.output_dir / f"{chip_path.stem}.md"
-            with open(output_path, "w") as f:
-                f.write(md_content)
-            print(f"Generated: {output_path}")
-        else:
-            print(md_content)
-            print("\n" + "=" * 80 + "\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/typst_to_md.py b/scripts/typst_to_md.py
new file mode 100644
index 000000000..ecc17f104
--- /dev/null
+++ b/scripts/typst_to_md.py
@@ -0,0 +1,686 @@
+#!/usr/bin/env python3
+"""
+Convert Typst spec files to Markdown by parsing both .typ prose and .toml data.
+
+This script:
+1. Parses .typ files for prose content (headings, paragraphs, notes)
+2. Parses .toml files for structured data (variables, constraints, assumptions)
+3. Detects #render_constraint_table() calls to insert tables at correct positions
+4. Reads constraint group prefixes from TOML (e.g., "R" -> "CR")
+5. Maintains continuous constraint numbering across groups
+
+Usage:
+    cd scripts
+    source .venv/bin/activate
+    python typst_to_md.py                              # Output to spec/
+    python typst_to_md.py -o ../others/spec_new_md     # Output to specific dir
+
+Requirements:
+    pip install tomli  (or use Python 3.11+ which has tomllib built-in)
+"""
+
+import argparse
+import re
+import sys
+from pathlib import Path
+
+try:
+    import tomllib
+except ImportError:
+    try:
+        import tomli as tomllib
+    except ImportError:
+        print("Error: Please install tomli: pip install tomli", file=sys.stderr)
+        sys.exit(1)
+
+
+# =============================================================================
+# Expression Rendering (from TOML constraint expressions)
+# =============================================================================
+
+def type_to_text(typ) -> str:
+    """Convert a type to text."""
+    if isinstance(typ, str):  # plain type name: returned unchanged
+        return typ
+    if isinstance(typ, list) and len(typ) == 2:  # 2-item list renders as "first[second]" (presumably [element type, length] - TODO confirm)
+        return f"{typ[0]}[{typ[1]}]"
+    return str(typ)  # any other shape: fall back to str()
+
+
+def expr_to_text(expr, parent_prec: int = 100) -> str:
+    """
+    Convert a polynomial expression to readable text.
+
+    Expression grammar (from spec/expr.typ):
+        <expr> ::= str                           ; variable name
+                 | int                           ; constant
+                 | ["idx", expr1, expr2]         ; expr1[expr2]
+                 | ["not", expr]                 ; 1 - expr
+                 | ["+", expr1, expr2, ...]      ; expr1 + expr2 + ...
+                 | ["sum", expr1, expr2, expr3]  ; sum from expr1 to expr2 of expr3
+                 | ["*", expr1, expr2, ...]      ; expr1 * expr2 * ...
+                 | ["/", expr1, expr2]           ; expr1 / expr2
+                 | ["^", expr1, expr2]           ; expr1^expr2
+                 | ["=", expr1, expr2]           ; expr1 = expr2
+                 | [":=", expr1, expr2]          ; expr1 := expr2
+                 | ["-", expr]                   ; -expr (unary)
+                 | ["-", expr1, expr2, ...]      ; expr1 - expr2 - ... (binary)
+                 | ["cast", expr, type]          ; expr::type
+    """
+    PREC = {  # lower number = binds tighter; compared against parent_prec in wrap()
+        "idx": 0, "pow": 1, "neg": 2, "cast": 3, "mul": 4,
+        "div": 5, "sum": 6, "not": 7, "add": 8, "sub": 9, "eq": 10,
+    }
+
+    def wrap(s: str, prec: int) -> str:
+        return f"({s})" if parent_prec < prec else s  # parenthesize only when the enclosing operator's number is lower than this one's
+
+    if expr is None or expr == "":  # missing / empty expression renders as empty text
+        return ""
+    if isinstance(expr, str):  # variable name: emitted verbatim
+        return expr
+    if isinstance(expr, (int, float)):  # numeric constant
+        return str(expr)
+
+    if isinstance(expr, list) and len(expr) > 0:
+        op = expr[0]  # first element selects the operator; the rest are operands
+
+        if op == "idx":
+            base = expr_to_text(expr[1], PREC["idx"])
+            idx = expr_to_text(expr[2], 100)  # 100 exceeds every PREC value, so the index's outermost operator is never wrapped
+            return f"{base}[{idx}]"
+        elif op == "not":
+            inner = expr_to_text(expr[1], PREC["not"])
+            return wrap(f"1 - {inner}", PREC["not"])
+        elif op == "+":
+            parts = [expr_to_text(e, PREC["add"]) for e in expr[1:]]
+            return wrap(" + ".join(parts), PREC["add"])
+        elif op == "sum":
+            var = expr_to_text(expr[1], 100)
+            upper = expr_to_text(expr[2], 100)
+            body = expr_to_text(expr[3], PREC["sum"])
+            return f"Σ_{var}^{upper} {body}"  # NOTE(review): bounds are emitted bare, so compound bounds read ambiguously (e.g. "Σ_j = 0^2 * i + k")
+        elif op == "*":
+            parts = [expr_to_text(e, PREC["mul"]) for e in expr[1:]]
+            return wrap(" * ".join(parts), PREC["mul"])
+        elif op == "/":
+            num = expr_to_text(expr[1], PREC["div"])
+            den = expr_to_text(expr[2], PREC["div"])  # NOTE(review): a "*" denominator stays bare (mul=4 < div=5), giving "a / b * c"
+            return wrap(f"{num} / {den}", PREC["div"])
+        elif op == "^":
+            base = expr_to_text(expr[1], PREC["pow"])
+            exp = expr_to_text(expr[2], PREC["pow"])
+            return f"{base}^{exp}"  # the power itself is not passed through wrap()
+        elif op == "=":
+            lhs = expr_to_text(expr[1], PREC["eq"])
+            rhs = expr_to_text(expr[2], PREC["eq"])
+            return f"{lhs} = {rhs}"
+        elif op == ":=":
+            lhs = expr_to_text(expr[1], PREC["eq"])
+            rhs = expr_to_text(expr[2], PREC["eq"])
+            return f"{lhs} := {rhs}"
+        elif op == "-":
+            if len(expr) == 2:  # exactly one operand: unary negation
+                inner = expr_to_text(expr[1], PREC["neg"])
+                return wrap(f"-{inner}", PREC["neg"])
+            else:
+                parts = [expr_to_text(e, PREC["sub"]) for e in expr[1:]]  # NOTE(review): no operand is parenthesized here, so ["-", a, ["+", b, c]] renders "a - b + c"
+                return wrap(" - ".join(parts), PREC["sub"])
+        elif op == "cast":
+            inner = expr_to_text(expr[1], PREC["cast"])
+            type_str = type_to_text(expr[2])
+            return wrap(f"{inner}::{type_str}", PREC["cast"])
+        else:
+            return str(expr)  # unknown operator: fall back to the list's str()
+
+    return str(expr)  # empty list or any other type: str() fallback
+
+
+def iters_to_text(obj: dict) -> str:
+    """Extract iterator ranges from a constraint/assumption."""
+    iters = []
+
+    if "iter" in obj:  # single iterator spec
+        it = obj["iter"]
+        if isinstance(it, list) and len(it) == 3:  # [var, lo, hi] -> "var ∈ [lo, hi]"
+            iters.append(f"{it[0]} ∈ [{it[1]}, {it[2]}]")
+        elif isinstance(it, list) and len(it) == 2:  # [var, value] -> "var = value"
+            iters.append(f"{it[0]} = {it[1]}")
+
+    if "iters" in obj:  # list of iterator specs, each in one of the same two shapes
+        for it in obj["iters"]:
+            if isinstance(it, list) and len(it) == 3:
+                iters.append(f"{it[0]} ∈ [{it[1]}, {it[2]}]")
+            elif isinstance(it, list) and len(it) == 2:
+                iters.append(f"{it[0]} = {it[1]}")
+
+    return ", ".join(iters)  # "" when no iterator spec was recognised
+
+
+# Chapters in book.typ order as (file stem, Markdown title); main() reads <stem>.typ and src/<stem>.toml for each
+CHAPTERS = [
+    ("memory", "Memory Argument"),
+    ("variables", "Variables"),
+    ("is_bit", "IS_BIT Template"),
+    ("add", "ADD/SUB Template"),
+    ("decode", "DECODE Table"),
+    ("cpu", "CPU Chip"),
+    ("shift", "SHIFT Chip"),
+    ("branch", "BRANCH Chip"),
+    ("memw", "MEMW Chip"),
+    ("lt", "LT Chip"),
+    ("mul", "MUL Chip"),
+    ("dvrm", "DVRM Chip"),
+    ("load", "LOAD Chip"),
+    ("ecall", "ECALL Chips"),
+    ("bitwise", "BITWISE Chips"),
+]
+
+
+def load_toml(path: Path) -> dict:
+    """Parse the TOML file at path and return it as a dict; return {} when the file does not exist."""
+    if not path.exists():
+        return {}
+    with open(path, "rb") as f:  # tomllib.load requires a binary file object
+        return tomllib.load(f)
+
+
+def parse_typst_prose(content: str) -> list:
+    """
+    Parse Typst source into an ordered list of (type, content) tuples. Types are 'para',
+    'note', 'h<level>' (headings of level 2 and deeper) and 'render_constraints' (group name).
+    """
+    elements = []
+
+    # Remove single-line #import, #let and #show: lines wherever they occur
+    content = re.sub(r'^#import[^\n]*\n', '', content, flags=re.MULTILINE)
+    content = re.sub(r'^#let[^\n]*\n', '', content, flags=re.MULTILINE)
+    content = re.sub(r'^#show:[^\n]*\n', '', content, flags=re.MULTILINE)
+
+    # Remove multi-line import blocks
+    content = re.sub(r'#import[^)]+\)', '', content)
+
+    lines = content.split('\n')
+    i = 0
+    current_para = []
+
+    while i < len(lines):
+        line = lines[i]
+        stripped = line.strip()
+
+        # An empty line ends the paragraph being collected
+        if not stripped:
+            if current_para:
+                elements.append(('para', ' '.join(current_para)))
+                current_para = []
+            i += 1
+            continue
+
+        # Capture render_constraint_table calls to know which group to render
+        if stripped.startswith('#render_constraint_table'):
+            if current_para:
+                elements.append(('para', ' '.join(current_para)))
+                current_para = []
+            # Extract group name: #render_constraint_table(chip, config, groups: "range")
+            match = re.search(r'groups:\s*"([^"]+)"', stripped)
+            if match:
+                elements.append(('render_constraints', match.group(1)))
+            i += 1
+            continue
+
+        # Skip other function calls (table renders, etc.)
+        if stripped.startswith('#render_') or stripped.startswith('#total_'):
+            if current_para:
+                elements.append(('para', ' '.join(current_para)))
+                current_para = []
+            i += 1
+            continue
+
+        # Skip lines that are just function names (from multi-line imports)
+        if re.match(r'^[a-z_]+,?\s*$', stripped) or stripped == ')':
+            i += 1
+            continue
+
+        # Skip other Typst commands we don't need
+        if stripped.startswith('#') and not stripped.startswith('#rj[') and not stripped.startswith('#et['):
+            if current_para:
+                elements.append(('para', ' '.join(current_para)))
+                current_para = []
+            i += 1
+            continue
+
+        # Headings
+        if stripped.startswith('=='):
+            if current_para:
+                elements.append(('para', ' '.join(current_para)))
+                current_para = []
+
+            level = len(re.match(r'^=+', stripped).group())
+            title = stripped[level:].strip()
+            elements.append((f'h{level}', title))
+            i += 1
+            continue
+
+        # TODO/review notes - flush pending prose first so the note keeps its source position
+        todo_match = re.match(r'#(rj|et)\[([^\]]*)\]', stripped)
+        if todo_match:
+            if current_para:
+                elements.append(('para', ' '.join(current_para)))
+                current_para = []
+            elements.append(('note', todo_match.group(2)))
+            i += 1
+            continue
+
+        # Regular text (prose): clean up inline Typst markup
+        text = re.sub(r'#`([^`]*)`', r'`\1`', stripped)  # #`code` -> `code`
+        text = re.sub(r'@(\w+:\w+:\w+)', r'[\1]', text)  # @ref:to:thing -> [ref:to:thing]
+        text = re.sub(r'@(\w+)', r'[\1]', text)  # @ref -> [ref]
+        text = re.sub(r'#total_nr_\w+\([^)]+\)', 'N', text)  # #total_nr_xxx(chip) -> N
+        text = re.sub(r'#\w+\([^)]*\)', '', text)  # Remove other function calls
+        text = re.sub(r'\$([^$]+)\$', r'`\1`', text)  # $math$ -> `math`
+
+        if text and not text.startswith('#'):
+            current_para.append(text)
+
+        i += 1
+
+    if current_para:
+        elements.append(('para', ' '.join(current_para)))
+
+    return elements
+
+
+def render_variables_table(chip: dict, config: dict) -> str:
+    """Render chip["variables"] as one Markdown table per category, each followed by code blocks for entries with a "def"; config is not read."""
+    variables = chip.get("variables", {})
+    if not variables:
+        return ""
+
+    lines = []
+    category_order = ["input", "output", "auxiliary", "virtual", "multiplicity", "condition"]  # categories outside this list are not rendered
+
+    for category in category_order:
+        if category not in variables:
+            continue
+
+        vars_list = variables[category]
+        if not vars_list:
+            continue
+
+        lines.append(f"### {category.capitalize()}")
+        lines.append("")
+        lines.append("| Name | Type | Description |")
+        lines.append("|------|------|-------------|")
+
+        for var in vars_list:
+            name = f"`{var['name']}`"
+            typ = f"`{type_to_text(var.get('type', ''))}`"
+            desc = var.get('desc', '').replace('|', '\\|').replace('\n', ' ')  # escape pipes and flatten newlines so the row stays intact
+            desc = re.sub(r'#`([^`]*)`', r'`\1`', desc)  # #`code` -> `code`
+            lines.append(f"| {name} | {typ} | {desc} |")
+
+        # Add a definition block for every variable of this category that carries a "def"
+        for var in vars_list:
+            if "def" in var:
+                defn = var["def"]
+                lines.append("")
+                lines.append(f"**Definition of `{var['name']}`:**")
+                if isinstance(defn, dict):  # table form: a single "poly" or a list of "polys"
+                    if "poly" in defn:
+                        lines.append("```")
+                        lines.append(f"{var['name']} := {expr_to_text(defn['poly'])}")
+                        lines.append("```")
+                    elif "polys" in defn:
+                        lines.append("```")
+                        for p in defn["polys"]:
+                            iter_str = ""
+                            if "iter" in p:
+                                iter_str = f" (when iter={p['iter']})"
+                            lines.append(f"{var['name']}{iter_str} := {expr_to_text(p['poly'])}")
+                        lines.append("```")
+                elif isinstance(defn, (list, str)):  # bare expression form
+                    lines.append("```")
+                    lines.append(f"{var['name']} := {expr_to_text(defn)}")
+                    lines.append("```")
+
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+def render_constraints_table(chip: dict, config: dict, group_filter: str = None, skip_heading: bool = False, start_counter: int = None) -> str:
+    """Render chip["constraints"] (every group, or only group_filter) as Markdown tables with numbered tags; returns "" if there are none; config is not read."""
+    constraints = chip.get("constraints", {})
+    constraint_groups = chip.get("constraint_groups", [])
+
+    if not constraints:
+        return ""
+
+    chip_name = chip.get("name", "").upper()
+    lines = []
+    group_info = {g["name"]: g for g in constraint_groups}
+
+    # Calculate starting counter based on constraints before the filtered group (an explicit start_counter wins)
+    if start_counter is not None:
+        global_counter = start_counter
+    elif group_filter:
+        # Count constraints in all groups that come before this one
+        global_counter = 1
+        for cg in constraint_groups:  # NOTE(review): if group_filter is not listed in constraint_groups, every listed group gets counted
+            if cg["name"] == group_filter:
+                break
+            group_constraints = constraints.get(cg["name"], [])
+            global_counter += len(group_constraints)
+    else:
+        global_counter = 1
+
+    for group_name, group_constraints in constraints.items():
+        if group_filter and group_name != group_filter:
+            continue
+        if not group_constraints:
+            continue
+
+        group = group_info.get(group_name, {"name": group_name})
+        # Get prefix from TOML constraint_groups (e.g., "R" -> "CR", "M" -> "CM")
+        # The base is always "C" for Constraint, plus the group's prefix if defined
+        group_prefix = "C" + group.get("prefix", "")
+
+        # Decide which optional columns (Range, Multiplicity) this group's table needs
+        has_mult = any("multiplicity" in c for c in group_constraints)
+        has_iter = any(iters_to_text(c) for c in group_constraints)
+        has_poly = any(c.get("kind") == "arith" and ("poly" in c or "polys" in c) for c in group_constraints)  # NOTE(review): never read below
+
+        if not skip_heading:
+            lines.append(f"### {group_name}")
+            lines.append("")
+
+        # Build header based on columns needed
+        if has_iter and has_mult:
+            lines.append("| Tag | Range | Description | Multiplicity |")
+            lines.append("|-----|-------|-------------|--------------|")
+        elif has_iter:
+            lines.append("| Tag | Range | Description |")
+            lines.append("|-----|-------|-------------|")
+        elif has_mult:
+            lines.append("| Tag | Description | Multiplicity |")
+            lines.append("|-----|-------------|--------------|")
+        else:
+            lines.append("| Tag | Description |")
+            lines.append("|-----|-------------|")
+
+        for i, constraint in enumerate(group_constraints, 1):  # NOTE(review): i is unused; tags are numbered by global_counter
+            # Always auto-generate ref with chip and group prefix (like shiroa does)
+            iters = iters_to_text(constraint)
+            iter_suffix = ".i" if iters else ""
+
+            ref = f"{chip_name}-{group_prefix}{global_counter}{iter_suffix}" if chip_name else f"{group_prefix}{global_counter}{iter_suffix}"
+
+            kind = constraint.get("kind", "")
+            tag = constraint.get("tag", "")
+
+            # Build description based on kind; a "cond" expression is prepended as "cond ⇒ "
+            cond = constraint.get("cond")
+            cond_str = f"{expr_to_text(cond)} ⇒ " if cond else ""
+
+            if kind == "interaction":  # rendered as `tag[output; inputs]`
+                inputs = ", ".join(expr_to_text(inp) for inp in constraint.get("input", []))
+                output = constraint.get("output")
+                if output:
+                    desc = f"{cond_str}`{tag}[{expr_to_text(output)}; {inputs}]`"
+                else:
+                    desc = f"{cond_str}`{tag}[{inputs}]`"
+            elif kind == "arith":  # free-text "constraint" field with "$" and "#" characters removed
+                desc = constraint.get("constraint", "")
+                desc = desc.replace("$", "").replace("#", "")
+                if cond_str:
+                    desc = f"{cond_str}{desc}"
+            elif kind == "template":  # rendered as `tag<output; inputs>`
+                inputs = ", ".join(expr_to_text(inp) for inp in constraint.get("input", []))
+                output = constraint.get("output")
+                if output:
+                    desc = f"{cond_str}`{tag}<{expr_to_text(output)}; {inputs}>`"
+                else:
+                    desc = f"{cond_str}`{tag}<{inputs}>`"
+            else:  # unknown kind: fall back to the raw entry
+                desc = str(constraint)
+
+            # Multiplicity cell text
+            mult = expr_to_text(constraint.get("multiplicity", ""))
+
+            # Build row based on columns
+            if has_iter and has_mult:
+                lines.append(f"| `{ref}` | {iters} | {desc} | {mult} |")
+            elif has_iter:
+                lines.append(f"| `{ref}` | {iters} | {desc} |")
+            elif has_mult:
+                lines.append(f"| `{ref}` | {desc} | {mult} |")
+            else:
+                lines.append(f"| `{ref}` | {desc} |")
+
+            # Add polynomial constraint if present (for arith constraints), as extra rows with an empty Tag cell
+            if kind == "arith" and ("poly" in constraint or "polys" in constraint):
+                if "poly" in constraint:
+                    poly_str = expr_to_text(constraint["poly"])
+                    if has_iter and has_mult:
+                        lines.append(f"| | | _polynomial:_ `{poly_str} = 0` | |")
+                    elif has_iter:
+                        lines.append(f"| | | _polynomial:_ `{poly_str} = 0` |")
+                    elif has_mult:
+                        lines.append(f"| | _polynomial:_ `{poly_str} = 0` | |")
+                    else:
+                        lines.append(f"| | _polynomial:_ `{poly_str} = 0` |")
+                elif "polys" in constraint:
+                    for poly in constraint["polys"]:
+                        poly_str = expr_to_text(poly)
+                        if has_iter and has_mult:
+                            lines.append(f"| | | _polynomial:_ `{poly_str} = 0` | |")
+                        elif has_iter:
+                            lines.append(f"| | | _polynomial:_ `{poly_str} = 0` |")
+                        elif has_mult:
+                            lines.append(f"| | _polynomial:_ `{poly_str} = 0` | |")
+                        else:
+                            lines.append(f"| | _polynomial:_ `{poly_str} = 0` |")
+
+            global_counter += 1  # numbering continues across groups within one call
+
+        lines.append("")
+
+    return "\n".join(lines)
+
+
+def render_assumptions_table(chip: dict, config: dict) -> str:
+    """Render chip["assumptions"] as a Markdown table tagged <CHIP>-A<n> (A<n> without a chip name); returns "" if none; config is not read."""
+    assumptions = chip.get("assumptions", [])
+    if not assumptions:
+        return ""
+
+    chip_name = chip.get("name", "").upper()
+    prefix = f"{chip_name}-A" if chip_name else "A"
+
+    lines = []
+    lines.append("| Tag | Range | Description |")
+    lines.append("|-----|-------|-------------|")
+
+    for i, assumption in enumerate(assumptions, 1):
+        iters = iters_to_text(assumption)
+        iter_suffix = ".i" if iters else ""  # iterated assumptions carry a ".i" tag suffix
+        ref = f"{prefix}{i}{iter_suffix}"
+        desc = assumption.get("desc", "").replace("|", "\\|").replace("\n", " ")  # escape pipes, keep the row on one line
+        lines.append(f"| `{ref}` | {iters} | {desc} |")
+
+    lines.append("")
+    return "\n".join(lines)
+
+
+def convert_chapter(typ_path: Path, toml_path: Path, title: str, config: dict) -> str:
+    """Build one chapter's Markdown: "# title", the Typst prose from typ_path, and tables rendered from the TOML at toml_path."""
+    lines = [f"# {title}", ""]
+
+    # Load TOML data
+    chip = load_toml(toml_path)
+
+    # Track what sections we've rendered from TOML
+    rendered_columns = False
+    rendered_assumptions = False
+    rendered_constraints = False
+    rendered_constraint_groups = set()
+
+    # Parse Typst prose (read as UTF-8 explicitly; the locale default cannot be relied on for the spec's symbols)
+    if typ_path.exists():
+        typst_content = typ_path.read_text(encoding="utf-8")
+        elements = parse_typst_prose(typst_content)
+
+        for elem_type, content in elements:
+            if elem_type.startswith('h'):
+                level = int(elem_type[1])
+                lines.append("")
+                lines.append("#" * level + " " + content)
+                lines.append("")
+
+                # Render TOML data after relevant headings
+                content_lower = content.lower()
+                if 'column' in content_lower and chip and not rendered_columns:
+                    lines.append(render_variables_table(chip, config))
+                    rendered_columns = True
+                elif 'assumption' in content_lower and chip and not rendered_assumptions:
+                    lines.append(render_assumptions_table(chip, config))
+                    rendered_assumptions = True
+                elif content_lower == "constraints" and chip:
+                    # Mark that we've hit the Constraints section
+                    rendered_constraints = True
+
+            elif elem_type == 'render_constraints' and chip:
+                # Render the constraint group specified in the typst file
+                group_name = content
+                if group_name not in rendered_constraint_groups:
+                    # Skip heading since prose already has the section title
+                    group_table = render_constraints_table(chip, config, group_filter=group_name, skip_heading=True)
+                    if group_table.strip():
+                        lines.append(group_table)
+                        rendered_constraint_groups.add(group_name)
+
+            elif elem_type == 'para':
+                lines.append(content)
+                lines.append("")
+
+            elif elem_type == 'note':
+                lines.append(f"> **Note:** {content}")
+                lines.append("")
+
+    # Render any TOML data that wasn't triggered by prose headings
+    if chip:
+        if chip.get("variables") and not rendered_columns:
+            lines.append("## Columns")
+            lines.append("")
+            lines.append(render_variables_table(chip, config))
+
+        if chip.get("assumptions") and not rendered_assumptions:
+            lines.append("## Assumptions")
+            lines.append("")
+            lines.append(render_assumptions_table(chip, config))
+
+        if chip.get("constraints"):
+            # Groups not rendered inline yet, kept in TOML order so the generated file is reproducible
+            all_groups = list(chip.get("constraints", {}).keys())
+            remaining_groups = [g for g in all_groups if g not in rendered_constraint_groups]
+
+            if remaining_groups and not rendered_constraints:
+                # No prose Constraints section existed, add one
+                lines.append("## Constraints")
+                lines.append("")
+
+            # Render any constraint groups not already rendered inline
+            for group_name in remaining_groups:
+                group_table = render_constraints_table(chip, config, group_filter=group_name)
+                if group_table.strip():
+                    lines.append(group_table)
+
+    result = "\n".join(lines)
+    result = re.sub(r'\n{3,}', '\n\n', result)  # collapse runs of blank lines
+    # Clean up remaining Typst artifacts
+    result = re.sub(r'#\w+\[[^\]]*\]', '', result)  # #rj[...], #et[...]
+    result = re.sub(r'#\w+', '', result)  # #nr_variables etc
+    return result.strip()
+
+
+def main():
+    """CLI entry point: convert each chapter in CHAPTERS to <name>.md plus a combined spec_full.md; returns the exit status."""
+    parser = argparse.ArgumentParser(
+        description="Convert Typst spec to Markdown",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=__doc__
+    )
+    parser.add_argument(
+        "--spec-dir", "-s",
+        type=Path,
+        default=None,
+        help="Path to spec directory (default: ../spec)"
+    )
+    parser.add_argument(
+        "--output-dir", "-o",
+        type=Path,
+        default=None,
+        help="Output directory (default: spec directory)"
+    )
+
+    args = parser.parse_args()
+
+    script_dir = Path(__file__).parent
+    spec_dir = args.spec_dir
+    if spec_dir is None:
+        spec_dir = script_dir / "../spec"
+    spec_dir = spec_dir.resolve()
+
+    output_dir = args.output_dir
+    if output_dir is None:
+        output_dir = spec_dir
+    output_dir = output_dir.resolve()
+
+    if not spec_dir.exists():
+        print(f"ERROR: Spec directory not found: {spec_dir}", file=sys.stderr)
+        return 1
+
+    # Load config
+    config_path = spec_dir / "src" / "config.toml"
+    config = load_toml(config_path)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    print(f"Reading from: {spec_dir}")
+    print(f"Writing to: {output_dir}")
+    print()
+
+    all_content = []
+
+    for name, title in CHAPTERS:
+        typ_path = spec_dir / f"{name}.typ"
+        toml_path = spec_dir / "src" / f"{name}.toml"
+
+        print(f"Converting: {name} ({title})")
+
+        try:
+            markdown = convert_chapter(typ_path, toml_path, title, config)
+
+            output_file = output_dir / f"{name}.md"
+            output_file.write_text(markdown, encoding="utf-8")  # output holds non-ASCII symbols; do not depend on the locale
+
+            all_content.append(markdown)
+
+        except Exception as e:
+            print(f"  ERROR: {e}", file=sys.stderr)
+            import traceback
+            traceback.print_exc()
+
+    # Combined file (only chapters that converted successfully)
+    combined_file = output_dir / "spec_full.md"
+    combined = "# Lambda VM Specification\n\n"
+    combined += "\n\n---\n\n".join(all_content)
+    combined_file.write_text(combined, encoding="utf-8")
+    print(f"\nCombined: {combined_file}")
+
+    print(f"\nDone! Converted {len(all_content)} chapters.")
+    return 0 if len(all_content) == len(CHAPTERS) else 1  # non-zero when any chapter failed to convert
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # main()'s return value becomes the process exit status

From 25713fa683b08f46fffde372c5cd14906452ad94 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Fri, 30 Jan 2026 13:27:03 -0300
Subject: [PATCH 051/105] new typst to md

---
 docs/spec/add.md       |   48 +-
 docs/spec/bitwise.md   |   40 +-
 docs/spec/branch.md    |   45 +-
 docs/spec/cpu.md       |  263 ++++----
 docs/spec/decode.md    |   48 +-
 docs/spec/dvrm.md      |    5 +
 docs/spec/ecall.md     |   27 +
 docs/spec/is_bit.md    |   36 +-
 docs/spec/load.md      |   52 +-
 docs/spec/lt.md        |   76 ++-
 docs/spec/memory.md    |   89 +++
 docs/spec/memw.md      |  106 +--
 docs/spec/mul.md       |   74 ++-
 docs/spec/shift.md     |  141 ++--
 docs/spec/spec_full.md | 1389 ++++++++++++++++++++++++++++++++++++++++
 docs/spec/variables.md |    5 +
 scripts/README.md      |    2 +-
 scripts/typst_to_md.py |    2 +-
 18 files changed, 2109 insertions(+), 339 deletions(-)
 create mode 100644 docs/spec/dvrm.md
 create mode 100644 docs/spec/ecall.md
 create mode 100644 docs/spec/memory.md
 create mode 100644 docs/spec/spec_full.md
 create mode 100644 docs/spec/variables.md

diff --git a/docs/spec/add.md b/docs/spec/add.md
index 27e570787..8711c8493 100644
--- a/docs/spec/add.md
+++ b/docs/spec/add.md
@@ -1,4 +1,34 @@
-# ADD Chip
+# ADD/SUB Template
+
+box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
+
+## Notation
+
+The  constraint template has the following interface:
+
+where `cond` is any value described by an expression _of degree at most `1`_.
+
+### 
+
+For ease of notation, we moreover introduce the  constraint template. Its interface
+
+maps onto the  template as
+
+It constrains that ``diff` = `lhs` - `rhs` mod 2^64` when the expression `cond` is non-zero. As with ,  can be used to denote the _unconditional_ application of the template.
+
+## Variables
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `ADD-A1.i` | i ∈ [0, 1] | `IS_WORD[lhs[i]]` |
+| `ADD-A2.i` | i ∈ [0, 1] | `IS_WORD[rhs[i]]` |
+| `ADD-A3.i` | i ∈ [0, 1] | `IS_WORD[sum[i]]` |
+
+## Constraints
+
+This template introduces the following constraints
 
 ## Columns
 
@@ -33,18 +63,8 @@ carry (when iter=1) := 2^-32 * (lhs[1] + rhs[1] + carry[0] - sum[1])
 |------|------|-------------|
 | `cond` | `BaseField` | Whether the relation should be enforced ($eq.not 0$) or not ($0$). |
 
-## Assumptions
-
-| Ref | Range | Description |
-|-----|-------|-------------|
-| `add:a:lhs` | i ∈ [0, 1] | `IS_WORD[lhs[i]]` |
-| `add:a:rhs` | i ∈ [0, 1] | `IS_WORD[rhs[i]]` |
-| `add:a:sum` | i ∈ [0, 1] | `IS_WORD[sum[i]]` |
-
-## Constraints
-
 ### all
 
-| Ref | Kind | Range | Description |
-|-----|------|-------|-------------|
-| `add:c:carry` | template | i ∈ [0, 1] | cond ⇒ `IS_BIT<carry[i]>` |
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `ADD-C1.i` | i ∈ [0, 1] | cond ⇒ `IS_BIT<carry[i]>` |
\ No newline at end of file
diff --git a/docs/spec/bitwise.md b/docs/spec/bitwise.md
index 0dd3038e8..93b0f1f1e 100644
--- a/docs/spec/bitwise.md
+++ b/docs/spec/bitwise.md
@@ -1,4 +1,4 @@
-# BITWISE Chip
+# BITWISE Chips
 
 ## Columns
 
@@ -39,20 +39,32 @@
 | `μ_HWSL` | `BaseField` |  |
 | `μ_HWSLC` | `BaseField` |  |
 
+The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
+
+*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
+
+## Lookup
+
+This chip adds the following interactions to the lookup:
+
+## Areas of Optimization
+
+The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, `ZERO`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`) lookups in separate tables. + Combine `HWSL` and `HWSLC` into a single lookup (see also \).
+
 ## Constraints
 
 ### contributions
 
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `1` | interaction | `AND_BYTE[AND; X, Y]` | -μ_AND |
-| `2` | interaction | `OR_BYTE[OR; X, Y]` | -μ_OR |
-| `3` | interaction | `XOR_BYTE[XOR; X, Y]` | -μ_XOR |
-| `4` | interaction | `MSB8[MSB8; X]` | -μ_MSB8 |
-| `5` | interaction | `MSB16[MSB16; X + 256 * Y]` | -μ_MSB16 |
-| `6` | interaction | `ZERO[ZERO; X + 256 * Y]` | -μ_ZERO |
-| `7` | interaction | `IS_BYTE[X]` | -μ_IS_BYTE |
-| `8` | interaction | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
-| `9` | interaction | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
-| `10` | interaction | `HWSL[SLL; X + 256 * Y, Z]` | -μ_HWSL |
-| `11` | interaction | `HWSLC[SLLC; X + 256 * Y, Z]` | -μ_HWSLC |
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `BITWISE-C1` | `AND_BYTE[AND; X, Y]` | -μ_AND |
+| `BITWISE-C2` | `OR_BYTE[OR; X, Y]` | -μ_OR |
+| `BITWISE-C3` | `XOR_BYTE[XOR; X, Y]` | -μ_XOR |
+| `BITWISE-C4` | `MSB8[MSB8; X]` | -μ_MSB8 |
+| `BITWISE-C5` | `MSB16[MSB16; X + 256 * Y]` | -μ_MSB16 |
+| `BITWISE-C6` | `ZERO[ZERO; X + 256 * Y]` | -μ_ZERO |
+| `BITWISE-C7` | `IS_BYTE[X]` | -μ_IS_BYTE |
+| `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
+| `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
+| `BITWISE-C10` | `HWSL[SLL; X + 256 * Y, Z]` | -μ_HWSL |
+| `BITWISE-C11` | `HWSLC[SLLC; X + 256 * Y, Z]` | -μ_HWSLC |
\ No newline at end of file
diff --git a/docs/spec/branch.md b/docs/spec/branch.md
index d5d465319..80199d934 100644
--- a/docs/spec/branch.md
+++ b/docs/spec/branch.md
@@ -49,30 +49,39 @@ next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
+The `BRANCH` chip is comprised of  variables that are expressed using  columns:
+
 ## Assumptions
 
-| Ref | Range | Description |
+| Tag | Range | Description |
 |-----|-------|-------------|
-| `A1` | i ∈ [0, 1] | `pc` is range checked, `IS_WORD[pc[i]]` |
-| `A2` |  | `offset` is range checked, `IS_WORD[offset]` |
-| `A3` | i ∈ [0, 1] | `register` is range checked, `IS_WORD[register[i]]` |
-| `A4` |  | `IS_BIT<JALR>` |
+| `BRANCH-A1.i` | i ∈ [0, 1] | `pc` is range checked, `IS_WORD[pc[i]]` |
+| `BRANCH-A2` |  | `offset` is range checked, `IS_WORD[offset]` |
+| `BRANCH-A3.i` | i ∈ [0, 1] | `register` is range checked, `IS_WORD[register[i]]` |
+| `BRANCH-A4` |  | `IS_BIT<JALR>` |
 
 ## Constraints
 
-### all
+> **Note:** Check correspondence with CPU for passing in `offset` as word or dword
+
+We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
+
+The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
+| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
+| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
+| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte[0], 254]` | μ |
+| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
+
+This chip contributes the following to the lookup argument.
 
-| Ref | Kind | Range | Description | Multiplicity |
-|-----|------|-------|-------------|--------------|
-| `1` | template |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
-| `2` | template |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
-| `3` | interaction |  | `IS_BYTE[next_pc_low[1]]` | μ |
-| `4` | interaction |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte[0], 254]` | μ |
-| `5` | interaction | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
 
-### output
-_Each row contributes the following to the LogUp sum_
+## Padding
 
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `1` | interaction | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
+The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index 66c539635..0d3a4b364 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -79,130 +79,157 @@ packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_regis
 pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
 ```
 
+The `CPU` chip is comprised of the variables listed above, expressed using the columns given in the tables above.
+
 ## Assumptions
 
-| Ref | Range | Description |
+| Tag | Range | Description |
 |-----|-------|-------------|
-| `cpu:a:one-hot` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
-| `cpu:a:arg2-multiplex` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
+| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
 
 ## Constraints
 
-### decode
-
-| Ref | Kind | Description |
-|-----|------|-------------|
-| `1` | interaction | `DECODE[pc, imm, packed_decode]` |
-
-### range
-
-| Ref | Kind | Range | Description |
-|-----|------|-------|-------------|
-| `cpu:c:range_read_register1` | template |  | `IS_BIT<read_register1>` |
-| `cpu:c:range_read_register2` | template |  | `IS_BIT<read_register2>` |
-| `cpu:c:range_write_register` | template |  | `IS_BIT<write_register>` |
-| `cpu:c:range_memory_2bytes` | template |  | `IS_BIT<memory_2bytes>` |
-| `cpu:c:range_memory_4bytes` | template |  | `IS_BIT<memory_4bytes>` |
-| `cpu:c:range_memory_8bytes` | template |  | `IS_BIT<memory_8bytes>` |
-| `cpu:c:range_c_type_instruction` | template |  | `IS_BIT<c_type_instruction>` |
-| `cpu:c:range_signed` | template |  | `IS_BIT<signed>` |
-| `cpu:c:range_mp_selector` | template |  | `IS_BIT<mp_selector>` |
-| `cpu:c:range_muldiv_selector` | template |  | `IS_BIT<muldiv_selector>` |
-| `cpu:c:range_word_instr` | template |  | `IS_BIT<word_instr>` |
-| `cpu:c:range_ADD` | template |  | `IS_BIT<ADD>` |
-| `cpu:c:range_SUB` | template |  | `IS_BIT<SUB>` |
-| `cpu:c:range_SLT` | template |  | `IS_BIT<SLT>` |
-| `cpu:c:range_AND` | template |  | `IS_BIT<AND>` |
-| `cpu:c:range_OR` | template |  | `IS_BIT<OR>` |
-| `cpu:c:range_XOR` | template |  | `IS_BIT<XOR>` |
-| `cpu:c:range_SHIFT` | template |  | `IS_BIT<SHIFT>` |
-| `cpu:c:range_JALR` | template |  | `IS_BIT<JALR>` |
-| `cpu:c:range_BEQ` | template |  | `IS_BIT<BEQ>` |
-| `cpu:c:range_BLT` | template |  | `IS_BIT<BLT>` |
-| `cpu:c:range_LOAD` | template |  | `IS_BIT<LOAD>` |
-| `cpu:c:range_STORE` | template |  | `IS_BIT<STORE>` |
-| `cpu:c:range_MUL` | template |  | `IS_BIT<MUL>` |
-| `cpu:c:range_DIVREM` | template |  | `IS_BIT<DIVREM>` |
-| `cpu:c:range_ECALL` | template |  | `IS_BIT<ECALL>` |
-| `cpu:c:range_EBREAK` | template |  | `IS_BIT<EBREAK>` |
-| `R28` | interaction |  | `IS_BYTE[rs1]` |
-| `R29` | interaction |  | `IS_BYTE[rs2]` |
-| `R30` | interaction |  | `IS_BYTE[rd]` |
-| `R31` | interaction | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` |
-| `R32` | interaction | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` |
-| `R33` | interaction | i ∈ [0, 7] | `IS_BYTE[res[i]]` |
-
-### alu
-
-| Ref | Kind | Range | Description | Multiplicity |
-|-----|------|-------|-------------|--------------|
-| `A1` | template |  | ADD + LOAD + STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `cpu:c:sub` | template |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `A3` | interaction |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
-| `A4` | arith | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
+First, we perform a decoding lookup for the current PC.
+
+| Tag | Description |
+|-----|-------------|
+| `CPU-C1` | `DECODE[pc, imm, packed_decode]` |
+
+> **Note:** All casts for interactions will have to be reviewed once other chip interfaces stabilise
+
+### Range checks
+
+> **Note:** Make sure we argue for every column here
+
+> **Note:** is `rvd` still sufficiently constrained? (can also be done through the memory argument like `pc`?)
+
+We constrain all columns to have the appropriate ranges. The flags and register indices looked up from the decoding need to be checked, as they are communicated through the interaction in a packed form. In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`. Similarly, since `next_pc` will propagate through the memory argument and be looked up in the instruction decoding on the next cycle, it is forced to be in the correct range. For the auxiliary columns, we need to check the limbs of `arg1`, `arg2`, and `res`. The ranges of the other auxiliary columns are enforced through later constraints.
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `CPU-CR2` |  | `IS_BIT<read_register1>` |
+| `CPU-CR3` |  | `IS_BIT<read_register2>` |
+| `CPU-CR4` |  | `IS_BIT<write_register>` |
+| `CPU-CR5` |  | `IS_BIT<memory_2bytes>` |
+| `CPU-CR6` |  | `IS_BIT<memory_4bytes>` |
+| `CPU-CR7` |  | `IS_BIT<memory_8bytes>` |
+| `CPU-CR8` |  | `IS_BIT<c_type_instruction>` |
+| `CPU-CR9` |  | `IS_BIT<signed>` |
+| `CPU-CR10` |  | `IS_BIT<mp_selector>` |
+| `CPU-CR11` |  | `IS_BIT<muldiv_selector>` |
+| `CPU-CR12` |  | `IS_BIT<word_instr>` |
+| `CPU-CR13` |  | `IS_BIT<ADD>` |
+| `CPU-CR14` |  | `IS_BIT<SUB>` |
+| `CPU-CR15` |  | `IS_BIT<SLT>` |
+| `CPU-CR16` |  | `IS_BIT<AND>` |
+| `CPU-CR17` |  | `IS_BIT<OR>` |
+| `CPU-CR18` |  | `IS_BIT<XOR>` |
+| `CPU-CR19` |  | `IS_BIT<SHIFT>` |
+| `CPU-CR20` |  | `IS_BIT<JALR>` |
+| `CPU-CR21` |  | `IS_BIT<BEQ>` |
+| `CPU-CR22` |  | `IS_BIT<BLT>` |
+| `CPU-CR23` |  | `IS_BIT<LOAD>` |
+| `CPU-CR24` |  | `IS_BIT<STORE>` |
+| `CPU-CR25` |  | `IS_BIT<MUL>` |
+| `CPU-CR26` |  | `IS_BIT<DIVREM>` |
+| `CPU-CR27` |  | `IS_BIT<ECALL>` |
+| `CPU-CR28` |  | `IS_BIT<EBREAK>` |
+| `CPU-CR29` |  | `IS_BYTE[rs1]` |
+| `CPU-CR30` |  | `IS_BYTE[rs2]` |
+| `CPU-CR31` |  | `IS_BYTE[rd]` |
+| `CPU-CR32.i` | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` |
+| `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` |
+| `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` |
+
+### ALU
+
+The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU-CA35` |  | ADD + LOAD + STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `CPU-CA36` |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `CPU-CA37` |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
+| `CPU-CA38.i` | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
 | | | _polynomial:_ `(SLT + BLT) * res[i] = 0` | |
-| `A5` | interaction | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
-| `A6` | interaction | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
-| `A7` | interaction | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
-| `A8` | interaction |  | `SHIFT[res::DWordHL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
-| `A9` | template |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
-| `A10` | interaction |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
-| `A11` | interaction |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
-
-### mem
-
-| Ref | Kind | Range | Description | Multiplicity |
-|-----|------|-------|-------------|--------------|
-| `M1` | interaction |  | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0, 1, 0, 0]` | read_register1 |
-| `M2` | arith | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
+| `CPU-CA39.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
+| `CPU-CA40.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
+| `CPU-CA41.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
+| `CPU-CA42` |  | `SHIFT[res::DWordHL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
+| `CPU-CA43` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
+| `CPU-CA44` |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
+| `CPU-CA45` |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
+
+### Memory
+
+This section handles the interactions with memory, both for register loading and storing and for the `LOAD` and `STORE` instructions. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The timestamps are ensured to be disjoint for disjoint memory locations. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU-CM46` |  | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0, 1, 0, 0]` | read_register1 |
+| `CPU-CM47.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
-| `M3` | interaction |  | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1, 1, 0, 0]` | read_register2 |
-| `M4` | arith | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
+| `CPU-CM48` |  | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1, 1, 0, 0]` | read_register2 |
+| `CPU-CM49.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
-| `M5` | interaction |  | `MEMW[1, 2 * rd, rvd, timestamp + 2, 1, 0, 0]` | write_register |
-| `M6` | interaction |  | `LOAD[rvd; 0, res, timestamp + 0, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
-| `M7` | interaction |  | `MEMW[0, res, rv2, timestamp + 1, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
-| `M8` | interaction |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1, 1, 0, 0]` | 1 - pad |
-
-### sys
-
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `cpu:c:ebreak_traps` | arith | `!EBREAK` |  |
-| | | _polynomial:_ `1 - EBREAK = 0` | |
-| | | _note:_ We treat `EBREAK` as an unprovable trap | |
-| `S2` | interaction | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
-
-### ext
-
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `E1` | arith | (`rv1_sign_bit` or `arg2_sign_bit` or `res_sign_bit`) => `word_instr` |  |
-| | | _polynomial:_ `(rv1_sign_bit + arg2_sign_bit + res_sign_bit) * (1 - word_instr) = 0` | |
-| `E2` | interaction | `MSB16[rv1_sign_bit; rv1[1]]` | word_instr |
-| `E3` | arith | `arg1[:4]` = `rv1[:2]` |  |
-| | | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` | |
-| `E4` | arith | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_sign_bit` dot `signed` |  |
-| | | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_sign_bit * (2^32 - 1) = 0` | |
-| `E5` | interaction | `MSB16[arg2_sign_bit; rv2[1]]` | word_instr |
-| `E6` | arith | `arg2[:4]` = (1 - `STORE` - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT`) dot `imm[0]` |  |
-| | | _polynomial:_ `(arg2::DWordWL)[0] - (1 - STORE - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT) * imm[0] = 0` | |
-| `E7` | arith | `arg2[4:]` = (1 - `STORE` - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `arg2_sign_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT`) dot `imm[1]` |  |
-| | | _polynomial:_ `(arg2::DWordWL)[1] - (1 - STORE - LOAD) * (1 - word_instr) * rv2[2] - (1 - STORE - LOAD) * signed * arg2_sign_bit * (2^32 - 1) - (1 - BEQ - BLT) * imm[1] = 0` | |
-| `E8` | interaction | `MSB8[res_sign_bit; res[3]]` | word_instr |
-| `E9` | arith | `!LOAD` => `rvd[0]` = `res[:4]` |  |
-| | | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` | |
-| `E10` | arith | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
-| | | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_sign_bit * (2^32 - 1)) = 0` | |
-| | | _note:_ _Sign_ extend the output if it wasn't a `LOAD`. Only `LOAD` has both `write_register = 1` and `rvd ≠ res`. `LOAD` and `word_instr` are disjoint | |
-
-### misc
-
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `cpu:c:is_equal` | interaction | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
-| `O2` | arith | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
-| | | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| | | _note:_ where `invert` is represented by `mp_selector` | |
-| `O3` | interaction | `BRANCH[next_pc; pc, imm[0], arg1::DWordWL, JALR]` | branch_cond |
-| `O4` | template | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
+| `CPU-CM50` |  | `MEMW[1, 2 * rd, rvd, timestamp + 2, 1, 0, 0]` | write_register |
+| `CPU-CM51` |  | `LOAD[rvd; 0, res, timestamp + 0, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
+| `CPU-CM52` |  | `MEMW[0, res, rv2, timestamp + 1, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
+| `CPU-CM53` |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1, 1, 0, 0]` | 1 - pad |
+
+### System
+
+The interactions with the wider system.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-CS54` | `!EBREAK` |  |
+| | _polynomial:_ `1 - EBREAK = 0` | |
+| `CPU-CS55` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
+
+### Input and output to the ALU
+
+We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, including the appropriate sign/zero extension, depending on `word_instr`.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-CE56` | (`rv1_sign_bit` or `arg2_sign_bit` or `res_sign_bit`) => `word_instr` |  |
+| | _polynomial:_ `(rv1_sign_bit + arg2_sign_bit + res_sign_bit) * (1 - word_instr) = 0` | |
+| `CPU-CE57` | `MSB16[rv1_sign_bit; rv1[1]]` | word_instr |
+| `CPU-CE58` | `arg1[:4]` = `rv1[:2]` |  |
+| | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` | |
+| `CPU-CE59` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_sign_bit` dot `signed` |  |
+| | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_sign_bit * (2^32 - 1) = 0` | |
+| `CPU-CE60` | `MSB16[arg2_sign_bit; rv2[1]]` | word_instr |
+| `CPU-CE61` | `arg2[:4]` = (1 - `STORE` - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT`) dot `imm[0]` |  |
+| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - STORE - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT) * imm[0] = 0` | |
+| `CPU-CE62` | `arg2[4:]` = (1 - `STORE` - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `arg2_sign_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT`) dot `imm[1]` |  |
+| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - STORE - LOAD) * (1 - word_instr) * rv2[2] - (1 - STORE - LOAD) * signed * arg2_sign_bit * (2^32 - 1) - (1 - BEQ - BLT) * imm[1] = 0` | |
+| `CPU-CE63` | `MSB8[res_sign_bit; res[3]]` | word_instr |
+| `CPU-CE64` | `!LOAD` => `rvd[0]` = `res[:4]` |  |
+| | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` | |
+| `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
+| | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_sign_bit * (2^32 - 1)) = 0` | |
+
+### Other constraints
+
+> **Note:** proper ref to IsZero/IsEqual
+
+For `CPU-CO66`, refer to the logic of IsZero or IsEqual, in combination with the subtraction of `CPU-CA36`.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-CO66` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
+| `CPU-CO67` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
+| | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
+| `CPU-CO68` | `BRANCH[next_pc; pc, imm[0], arg1::DWordWL, JALR]` | branch_cond |
+| `CPU-CO69` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
+
+> **Note:** Document the choice to not have a multiplicity column here for padding
+
+## Padding
+
+The CPU can be padded with the following values, which have a corresponding row in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
+
+This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
\ No newline at end of file
diff --git a/docs/spec/decode.md b/docs/spec/decode.md
index 9bf1fbcb7..7e3fc6722 100644
--- a/docs/spec/decode.md
+++ b/docs/spec/decode.md
@@ -1,4 +1,6 @@
-# DECODE Chip
+# DECODE Table
+
+All `RV64IMC` instructions are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table can be derived from the `packed_decode` variable provided below.
 
 ## Columns
 
@@ -15,3 +17,47 @@
 | Name | Type | Description |
 |------|------|-------------|
 | `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
+
+The `DECODE` table is comprised of the variables listed above, expressed using the columns given in the tables above.
+
+## Padding
+
+The `DECODE` table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
+
+Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see `CPU-CS54`), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
+
+## Decoding
+
+For the purposes of explaining decoding, we decompress the `DECODE` table's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
+
+We will illustrate how each instruction should be expressed in this (uncompressed) decoding table. The columns of the accompanying table represent the following: *`operation`*: the assembly operation being encoded; *`op-flag`*: which of the `ALU` selector flags to set (each operation sets exactly one); *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively; *other*: the other flags that should be set or variables that should be given specific values.
+
+For the purpose of brevity and readability, the table uses the following rules-of-thumb: (1) `rd`, `rs1`, `rs2`, and `imm` are mapped to the values provided by the instruction; when a value is not specified by an instruction it defaults to `0`. (2) `read_register1`, `read_register2` and `write_register` are set to `1` when respectively `rs1 != 0`, `rs2 != 0`, or `rd != 0`. (3) Any flag that is not listed is set to `0`, with the exception of the `c_type` flag. *The `c_type` flag is set independently of the below table*, as explained next.
+
+Further clarification is provided in the notes following the table.
+
+### C-type instructions
+
+The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32 bits), saving \~25% in code size. The execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
+
+/// Add a reference to one or more notes following this table. super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
+
+show figure: set block(breakable: true)
+
+figure(table( columns: (auto, auto, 40pt, 40pt, 1fr, 15pt), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => if calc.odd(y) and y <= lines.len() { luma(245) } else { white }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), ), caption: [Decoding table] }
+
+// OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
+
+// Construct a note that can be referenced through `lbl` show figure: (it) => align(left, []) [ ] }
+
+#### Notes
+
+We note the following about the above decoding table:
+
+enum.item( referenceable_note( "note_word_instr", [`word_instr`: `[W]` indicates that ``word_instr` = 1` for the `W`-variant of the operation, and `0` for the non-`W`-variant.] ), enum.item( referenceable_note( "note_signed", [`signed`: .not`[U]` indicates that ``signed` = 1` for the *non-`U`*-variant of the operation, and `0` for the `U`-variant.] ), enum.item( referenceable_note( "note-lui", [`LUI`: this operation loads the 20-bit `imm` in the upper bits of `rd`. Observe that this can be represented using `ADDI rd, x0, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-auipc", [`AUIPC`: this operation adds the 20-bit immediate to the upper bits of `pc` and stores the result in `rd`. Given that the `pc` is stored in `x255`, this operation can be represented using `ADDI rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-jal", [`JAL`: this operation stores ``pc` + 4` in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`. Note that this can be represented using `JALR rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[1:21]` of `imm` and extending it to 64 bits; the least significant bit should always be 0.*] ), enum.item( referenceable_note( "note-ecall", [`ECALL`: "On RISC-V a system call has its own instruction: `ECALL`. [...] A7 [= register `x17`] contains the system call number." [[source]] ] ), enum.item( referenceable_note( "note-fence", [`FENCE`: currently, the VM interprets this operation as `ADDI x0 x0 0`; a no-op.]
+
+## One more instruction <cpu-padding-decode-row>
+
+In addition to decoding all instructions provided in the ELF and adding a corresponding entry to the `DECODE` table, one must include an entry that has `pc = 1` and every other variable set to `0`. Note that this will never conflict with any entry in the ELF, since it has an odd `pc` value.
+
+This entry is used to pad the `CPU` table. More details on this matter are provided in the `CPU` chip.
\ No newline at end of file
diff --git a/docs/spec/dvrm.md b/docs/spec/dvrm.md
new file mode 100644
index 000000000..fc32ae17f
--- /dev/null
+++ b/docs/spec/dvrm.md
@@ -0,0 +1,5 @@
+# DVRM Chip
+
+//  chip = load_chip("src/dvrm.toml", config)
+
+*placeholder chapter: WIP*
\ No newline at end of file
diff --git a/docs/spec/ecall.md b/docs/spec/ecall.md
new file mode 100644
index 000000000..2d9891074
--- /dev/null
+++ b/docs/spec/ecall.md
@@ -0,0 +1,27 @@
+# ECALL Chips
+
+## `HALT` chip
+
+### Columns
+
+The  chip leverages  variable, spanning  columns:
+
+### Assumptions
+
+It is assumed the input is range checked:
+
+### Constraints
+
+The `HALT` chip (1) makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), (2) writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and (3) sets `pc` equal to `1` ([halt:c:pc]). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`, the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
+
+Observe that — in its current state — this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there.
+
+#### Lookup
+
+The HALT chip contributes the following interaction to the lookup-argument:
+
+*Note*: `93` is the system call number corresponding to `sys_exit`.
+
+### Padding
+
+This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
\ No newline at end of file
diff --git a/docs/spec/is_bit.md b/docs/spec/is_bit.md
index 1c4a3182b..bb6e0090f 100644
--- a/docs/spec/is_bit.md
+++ b/docs/spec/is_bit.md
@@ -1,4 +1,28 @@
-# IS_BIT Chip
+# IS_BIT Template
+
+box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
+
+Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
+
+## Interface
+
+The `IS_BIT` constraint template has the following interface:
+
+where `cond` is any value described by an expression _of degree at most `1`_. Note that `IS_BIT<X>` can be used to denote the _unconditional_ application of the `IS_BIT` template to `X`.
+
+## Variables
+
+The `IS_BIT` template operates on two variables: `cond` and `X`:
+
+## Constraints
+
+It takes only one constraint to enforce that `X` must be either `0` or `1` whenever `cond ≠ 0`:
+
+*Note*: (1) In case of _unconditional_ template application, `cond` can be dropped from the constraint, simplifying it to `X (1 - X) = 0`. (2) As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression. This is to make sure that the expression of `IS_BIT-C1` has degree at most 3.
+
+## Proof of correctness
+
+If `cond` is `0`, `IS_BIT-C1` is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When `cond ≠ 0`, it follows that the statement can only be proven when `X (1 - X) ≡ 0 mod p`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either `X ≡ 0 mod p` or `1 - X ≡ 0 mod p`. Hence, it is proven that when `cond ≠ 0`, `IS_BIT-C1` is only satisfied if `X ∈ {0, 1}`.
 
 ## Columns
 
@@ -14,11 +38,9 @@
 |------|------|-------------|
 | `cond` | `BaseField` | Whether the constraint should be applied ($eq.not 0$) or not ($0$). |
 
-## Constraints
-
 ### all
 
-| Ref | Kind | Description |
-|-----|------|-------------|
-| `isbit:c:isbit` | arith | `cond` => `X` (1-`X`) = 0 |
-| | | _polynomial:_ `cond * X * (1 - X) = 0` |
+| Tag | Description |
+|-----|-------------|
+| `IS_BIT-C1` | `cond` => `X` (1-`X`) = 0 |
+| | _polynomial:_ `cond * X * (1 - X) = 0` |
\ No newline at end of file
diff --git a/docs/spec/load.md b/docs/spec/load.md
index 1f14a1adc..6f519564a 100644
--- a/docs/spec/load.md
+++ b/docs/spec/load.md
@@ -42,39 +42,45 @@ read1 := μ - read2 - read4 - read8
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
+The `LOAD` chip is comprised of  variables that are expressed using  columns:
+
 ## Assumptions
 
-| Ref | Range | Description |
+| Tag | Range | Description |
 |-----|-------|-------------|
-| `A1` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
-| `A2` |  | `IS_BIT<signed>` |
-| `A3` |  | `IS_BIT<read2>` |
-| `A4` |  | `IS_BIT<read4>` |
-| `A5` |  | `IS_BIT<read8>` |
-| `A6` |  | `IS_BIT<read2 + read4 + read8>` |
-| `A7` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+| `LOAD-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `LOAD-A2` |  | `IS_BIT<signed>` |
+| `LOAD-A3` |  | `IS_BIT<read2>` |
+| `LOAD-A4` |  | `IS_BIT<read4>` |
+| `LOAD-A5` |  | `IS_BIT<read8>` |
+| `LOAD-A6` |  | `IS_BIT<read2 + read4 + read8>` |
+| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
 ## Constraints
 
-### all
+The chip delegates the actual memory interaction to the `MEMW` chip, and ensures correctness of the requested sign/zero extension. The output `res` is correctly range-checked as long as the memory contents are.
 
-| Ref | Kind | Range | Description | Multiplicity |
-|-----|------|-------|-------------|--------------|
-| `1` | arith |  | `read2` + `read4` + `read8` => `μ` |  |
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `LOAD-C1` |  | `read2` + `read4` + `read8` => `μ` |  |
 | | | _polynomial:_ `(read2 + read4 + read8) * (1 - μ) = 0` | |
-| `2` | interaction |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
-| `3` | interaction |  | `MSB8[sign_bit; res[0]]` | read1 |
-| `4` | interaction |  | `MSB8[sign_bit; res[1]]` | read2 |
-| `5` | interaction |  | `MSB8[sign_bit; res[3]]` | read4 |
-| `6` | arith | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| `LOAD-C2` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
+| `LOAD-C3` |  | `MSB8[sign_bit; res[0]]` | read1 |
+| `LOAD-C4` |  | `MSB8[sign_bit; res[1]]` | read2 |
+| `LOAD-C5` |  | `MSB8[sign_bit; res[3]]` | read4 |
+| `LOAD-C6.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
 | | | _polynomial:_ `(1 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `7` | arith | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| `LOAD-C7.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
 | | | _polynomial:_ `(1 - read4 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `8` | arith |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
+| `LOAD-C8` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
 | | | _polynomial:_ `(1 - read2 - read4 - read8) * (res[1] - signed * sign_bit * 255) = 0` | |
 
-### output
+The chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8]` | -μ |
+
+## Padding
 
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `1` | interaction | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8]` | -μ |
+The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
diff --git a/docs/spec/lt.md b/docs/spec/lt.md
index b72ed2041..a8f65057a 100644
--- a/docs/spec/lt.md
+++ b/docs/spec/lt.md
@@ -48,40 +48,54 @@ unsigned_lt := carry[1]
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
+The `LT` chip is comprised of  variables that are expressed using  columns:
+
 ## Assumptions
 
-| Ref | Range | Description |
+| Tag | Range | Description |
 |-----|-------|-------------|
-| `lt:a:range_lhs` |  | `IS_WORD[lhs[0]]` |
-| `lt:a:range_rhs` |  | `IS_WORD[rhs[0]]` |
-| `lt:a:range_signed` |  | `IS_BIT<signed>` |
+| `LT-A1` |  | `IS_WORD[lhs[0]]` |
+| `LT-A2` |  | `IS_WORD[rhs[0]]` |
+| `LT-A3` |  | `IS_BIT<signed>` |
+
+We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
 
 ## Constraints
 
-### defs
-_Enforce that variables have been correctly computed_
-
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `lt:c:lhs_msb` | interaction | `MSB16[lhs_msb; lhs[2]]` | μ |
-| `lt:c:rhs_msb` | interaction | `MSB16[rhs_msb; rhs[2]]` | μ |
-| `lt:c:lt` | arith | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
-| | | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
-| | | _note:_ Where $A = #`lhs_msb`$, $B = #`rhs_msb`$ and $C = #`carry[1]`$ | |
-| `lt:c:range_lhs` | interaction | `IS_HALFWORD[lhs[1]]` | μ |
-| `lt:c:range_rhs` | interaction | `IS_HALFWORD[rhs[1]]` | μ |
-
-### sub
-_Constrain the subtraction_
-
-| Ref | Kind | Range | Description | Multiplicity |
-|-----|------|-------|-------------|--------------|
-| `1` | template | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
-| `lt:c:lhs_sub_rhs_range` | interaction | i ∈ [0, 3] | `IS_HALFWORD[lhs_sub_rhs[i]]` | μ |
-
-### output
-_Each row contributes the following to the LogUp sum_
-
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `1` | interaction | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
+We first constrain that all variables correspond to their definition. For the defining constraint of `lt`, `LT-C3`, observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
+
+We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a positive one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
+
+(1) `dash(A) and B and (a < b)`, (2) `A and dash(B) and (a < b)`, (3) `A and B and (a < b)`, and (4) `dash(A) and dash(B) and (a < b)`.
+
+The first case is evidently false, while the second case simplifies to `A and dash(B)`. For the third and fourth case, observe that when `A = B`, the `<` relation is preserved by the modular correspondence between `[-2^(63), 2^(63))` and `[0, 2^(64))`. Importantly, this modular correspondence is merely a reinterpretation of the bits or values of `a` and `b`, due to the representation in two's complement. Hence, we can introduce the value `C = unsigned_lt`, that accurately represents the relation `a < b` when `A = B`.
+
+Combining our three remaining cases, we obtain the boolean formula `A dash(B) or A B C or dash(A) dash(B) C`. Since the cases are disjoint, this can be computed with the binary-valued polynomial `P(A, B, C) = A (1 - B) + A B C + (1 - A) (1 - B) C`.
+
+The polynomial `P` can be simplified to a total degree of two. We claim that the polynomial `Q(A, B, C) = A (1 - B) + A C + (1 - B) C` is, for the purposes of this chip, equivalent to `P`. An exhaustive check shows that `P(A, B, C) != Q(A, B, C)` only for the triple `(A, B, C) = (1, 0, 1)`. This is, however, impossible due to the correctness of `ADD`. In more detail, if we let `s` be the (range-checked) difference `a - b` (so the equivalent of the `lhs_sub_rhs` column), and `x'` denote the most significant word of a variable `x`, we need `C dot 2^32 + a' = b' + s' + carry[0]`, by the definition of `carry`. However, the left hand side of this is at least `3 dot 2^31`, as `(A, C) = (1, 1)`, and the right hand side is at most `(2^31 - 1) + (2^32 - 1) + 1 = 3 dot 2^31 - 1`. Therefore, we can use `Q` to constrain `lt` when `signed = 1`.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
+| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
+| `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
+| | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
+| `LT-C4` | `IS_HALFWORD[lhs[1]]` | μ |
+| `LT-C5` | `IS_HALFWORD[rhs[1]]` | μ |
+
+And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
+| `LT-C7.i` | i ∈ [0, 3] | `IS_HALFWORD[lhs_sub_rhs[i]]` | μ |
+
+The chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
diff --git a/docs/spec/memory.md b/docs/spec/memory.md
new file mode 100644
index 000000000..6a5ea151c
--- /dev/null
+++ b/docs/spec/memory.md
@@ -0,0 +1,89 @@
+# Memory Argument
+
+As part of fully proving the correct execution of a RISC-V program, the VM must ensure that memory reads and writes are consistent. That is, every byte read from some address corresponds to the byte that was last written to that address --- or the initial value if nothing has been written yet. We consider "memory" in a broad sense here: both RAM and the general purpose registers can be seen as instantiations of memory and are therefore handled simultaneously.
+
+While RAM is byte addressed, we do choose to store registers as a `DWordWL` over two word addresses.
+
+On a high level, we ensure memory consistency by an interacting system of reads and writes to a lookup argument, combined with an initialization and finalization scheme. The initialization and finalization schemes together ensure both that (1) the necessary preconditions for the lookup system are satisfied, and (2) the program is executed with the correct initial memory and register contents as specified by the ELF binary and the ISA.
+
+## Memory types
+
+A commonly made distinction of memory types is that of _read-only_ and _read-write_ memory, with the more restrictive read-only variant often allowing for more efficient solutions (be that regarding prover time, verifier time or proof size) via table lookup proofs. Naturally, the VM’s main memory and registers should be handled by a read-write system as the guest program/environment can issue instructions that write to memory. While there are some subsystems that can be modelled as read-only memory ---e.g., the program memory and instruction decoding--- we opt to integrate these into the proof system via chip interactions (relying on techniques derived from table lookup arguments). As such, we only concern ourselves with read-write memory, moving forward.
+
+## Memory operations
+
+Every memory operation has some conceptual attributes that are relevant to mention or discuss:
+
+- The type of operation (read or write) - The memory address --- this is an address in the broad sense: main memory and registers have their own dedicated part of the unified address space. - The value being read from or written to the memory address - When the value was read or written, see the below paragraph
+
+Since we will have to ensure that memory accesses are temporally consistent within the execution of the VM, we additionally consider a _timestamp_ for every memory access, that should be strictly increasing. As such, it should never be possible for the system to generate accesses to the same address at identical timestamps. Multiple memory accesses can (and indeed will, consider e.g. register reads) occur in a single execution cycle of the VM, so we cannot use the cycle counter directly as timestamp for register accesses. We can, however, statically bound the maximal number of memory accesses made during a single execution cycle by a granularity constant `k` and derive timestamps from the cycle counter. The `i`th possible memory access in cycle `c` will obtain as timestamp the value `k dot c + i`. For simplicity, we will always reserve a timestamp for every possible memory access, and leave the timestamp unused if an instruction does not use it.
+
+For reasons of completeness (since temporal integrity as discussed below is a security necessity), we cannot deal with multiple accesses to the same address at identical timestamps. However, if multiple accesses are guaranteed to be independent (that is, to different addresses), they can still share a timestamp --- consider, e.g., the case of reading a word as 4 bytes with the `LW` load instruction. This property is already taken into account where possible in the design of the system. For instance, in the CPU chip, we can ensure that there are at most 3 memory accesses not guaranteed to be independent, so a timestamp granularity of 4 timestamps per cycle is enough.
+
+## Permutation argument
+
+We can conceptually organise the state of the memory as a collection of "tokens" that represent tuples `(serif("timestamp"), serif("address"), serif("value"))`, meaning the current value written to `serif("address")` is `serif("value")`, last written to memory at `serif("timestamp")`. Having exactly one value associated with any address will be ensured (see further down in this document) by the interaction of memory initialization, memory finalization, and the effects of memory operations.
+
+Each memory operation will then do two things:
+
+- Consume the current token in the memory - Emit a new token to replace it
+
+Naturally, for a read operation, the _values_ embedded in the consumed and emitted tokens must be identical. From the need to consume a token even on the first memory access, we can see the necessity for a memory initialization procedure ---in addition to having to make sure the initial memory content lines up with what the binary dictates.
+
+> **Note:** properly link/refer to the logup spec
+
+So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens), this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument: consuming a token corresponds to a "receive" and emitting a new token is a "send".
+
+## Temporal integrity
+
+> **Note:** Properly link/refer to the LT chip
+
+To ensure temporal integrity, every memory operation needs to be constrained for the newly emitted token to have a strictly greater timestamp than the consumed token. This raises the question of how to represent timestamps and cleanly perform this check, as over a finite field the “less than” relation is ill-defined (though it is common and natural to consider it as the less than relation over the natural lift of the field into the integers). We choose to represent timestamps as machine words, using the existing `LT` chip ([lt]) functionality for comparisons.
+
+Representing timestamps as machine words has the following trade-offs: (1) it gives a clean definition of “less-than”, using the already existing `LT` functionality in the ALU; (2) it is harder to perform increments, needing extra constraints beyond field arithmetic, but this can be alleviated by providing a precomputed column that has a fixed increment per CPU row. By contrast, if timestamps were represented as field elements: (1) comparison is more annoying, but can work by (a) decomposition into a machine word and chip interaction with the `LT` chip, (b) bit decomposition and comparison constraints, or (c) range-checking the difference to be sufficiently small w.r.t. the field characteristic; (2) increments and basic arithmetic operations are cheap.
+
+> **Note:** reference to CPU chip/timestamp column and MEMW chip
+
+## Initialization and Finalization
+
+Because the LogUp argument handling token consumption and emission needs to be fully balanced --- every token emitted should be consumed, and vice versa --- we need to have a system to emit the initial tokens and consume the final tokens. This needs to ensure that every address has at most a single initializing emission, and at most one finalizing consumption. Having at most one initialization will, through the correctness of the lookup argument, immediately lead to having at most one correct finalization, and vice versa.
+
+The initialization will need to correspond to a fixed initial register state for the VM, as well as the memory loaded from the program binary, zero-initialization of memory elsewhere, and private input provided by the prover. The contribution of initialization with static data from the ELF executable and the initial register state to the sum can be handled directly by the verifier, ensuring correctness corresponding to the ELF binary being proven. This leaves only zero-initialization and prover input as prover-side concerns for initialization, alongside the finalization of the entire used memory.
+
+For our chosen scheme (which we refer to as "paged initialization/finalization"), the available memory range is split into equally (power-of-two) sized "pages". Each address can then be represented as `address = page_base_address + page_offset`, with `page_base_address` being "page-aligned", and `page_offset` belonging to a limited range (the page size). As such, initialization or finalization of a page is represented by a table with columns `page`, `offset`, `value`, and ---for finalization--- `timestamp`. The `page` column is a preprocessed, constant value (which can be entirely virtualized/inlined into the constraints for this table), and the `offset` column is a preprocessed column containing its row index. Depending on the type of initialization, `value` can be a prover-committed column (input data), or a precomputed, constant column containing `0` (free memory space). This table then feeds into the LogUp system in the normal way, emitting the initial tokens for all addresses in a page, without consuming any tokens. Since the `offset` column is always the same, it can be reused across all paged initialization and finalization tables.
+
+Concretely, each page gets an associated `PAGE` table, consisting of N variables over N columns. For each such table, the `page` variable is instantiated as the constant base address of the page. The `offset` column is preprocessed, which helps the verifier ensure that each page has a single fixed size, but the verifier should still check that no pages overlap and all `page` values are page-aligned.
+
+### Page initialization
+
+> **Note:** check whether we need `fini` to be range-checked
+
+We present here a set of constraints on the `PAGE` table that
+
+(1) enforces that the initial and final values of each address are bytes, and (2) adds the initial and final interactions to the LogUp argument.
+
+For zero-initialized pages, `init` can be a constant `0`, and hence doesn't need a column, nor a range check.
+
+We identify a few alternatives that would achieve the desired initialization/finalization functionalities, and consider their respective trade-offs.
+
+_"Free-zero" initialization_
+
+Zero-initialization could be achieved by allowing the `MEMW` chip to output a zero without consuming a token from the lookup argument. This would in turn be made secure by finalization consuming at most one token per address: if an address is initialized more than once, the proof cannot be finalized. - This requires fewer pages (and hence tables) for zero-initialization. - But it comes at a cost of added complexity in the `MEMW` chip, and likely some extra columns to handle this. Keeping track of initialized addresses, and potentially having to initialize only some of the bytes in a word-read may make bookkeeping challenging. - This is an alternative form of sparse initialization (see below), so it is incompatible with paged finalization. Paged finalization can be made into a compatible sparse form by adding a bit-checked multiplicity column.
+
+_Sparse initialization/finalization_
+
+One or more STARK tables (depending on the amount of memory used) consisting of `(address, value)` columns are introduced, where for zero-initialization, `value` can be constant zero. Transition constraints ensure that `address` is strictly increasing, enforcing the "at most once" property; `value` is range-checked to consist of bytes. Similar to paged finalization, an additional `timestamp` column is added, containing the final timestamp each address was accessed. This table is then further used to contribute to the LogUp sum as with any other interactions. - The transition constraints can be chosen to only apply on finalization, as at-most-once finalization is enough to ensure consistency. - Sparse initialization is incompatible with paged finalization, see also the remark under free-zero initialization above. - This would require transition constraints, which currently are not needed elsewhere in the VM design - Additionally, for memory use exceeding the capacity of a single initialization/finalization table, some form of transition constraint between tables is needed - Alternatively, transition constraints could potentially be avoided by more integration into the LogUp system, but this could turn out more costly in practice - This is compatible with the above "free zero" initialization - Since a prover-committed address column is needed (rather than a precomputed one), the number of required columns increases. - As an optimization, the address column could potentially be used simultaneously for initialization and finalization - Sparse initialization/finalization reduces the cost for sparse memory access patterns, where only a few addresses would be accessed per page. Most programs and compilers should however favor a memory locality that makes paged initialization/finalization comparable.
+
+### Register initialization/finalization
+
+> **Note:** Properly link/reference ECALL/HALT chip
+
+The initial and final state of registers can be entirely known by the verifier, since the relevant initialization values are either zero, or embedded in the ELF, and the final values can be set to a known value by the HALT ecall. Since, additionally, the number of registers is small, the verifier can directly add the required balancing terms to the LogUp sum.
+
+## Notes and considerations
+
+- Register reads and writes may interact within a single cycle, so a correct and fixed ordering needs to be ensured - Correctness of initialization and completeness of finalization need to be ensured
+
+## Future topics of interest
+
+- Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research)
\ No newline at end of file
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
index 84246b8e0..613c96612 100644
--- a/docs/spec/memw.md
+++ b/docs/spec/memw.md
@@ -57,59 +57,67 @@ w4 := write4 + write8
 | `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
 | `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
 
+The `MEMW` chip is comprised of  variables that are expressed using  columns:
+
 ## Assumptions
 
-| Ref | Range | Description |
+| Tag | Range | Description |
 |-----|-------|-------------|
-| `A1` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
-| `A2` |  | `IS_BIT<write2>` |
-| `A3` |  | `IS_BIT<write4>` |
-| `A4` |  | `IS_BIT<write8>` |
-| `A5` |  | `IS_BIT<write2 + write4 + write8>` |
-| `A6` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `MEMW-A2` |  | `IS_BIT<write2>` |
+| `MEMW-A3` |  | `IS_BIT<write4>` |
+| `MEMW-A4` |  | `IS_BIT<write8>` |
+| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
+| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
-## Constraints
+Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. These properties are necessary for the consistency of the system as a whole, and therefore we document it here, keeping the type information as a reading help.
 
-### consistency
+## Constraints
 
-| Ref | Kind | Range | Description | Multiplicity |
-|-----|------|-------|-------------|--------------|
-| `1` | template |  | `IS_BIT<μ_sum>` |  |
-| `2` | arith |  | `w2` => `μ_sum` |  |
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
+| `MEMW-C2` |  | `w2` => `μ_sum` |  |
 | | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `3` | template |  | `ADD<address_add[0]::DWordWL; base_address, 1>` | w2 |
-| `4` | template | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
-| `5` | template | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
-| `6` | interaction | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
-| `7` | interaction |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `8` | interaction |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `9` | interaction | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `10` | interaction | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
-
-### overflow
-
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `R1` | interaction | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
-| `R2` | interaction | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
-| `R3` | interaction | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
-
-### memory
-
-| Ref | Kind | Range | Description | Multiplicity |
-|-----|------|-------|-------------|--------------|
-| `M1` | interaction |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `M2` | interaction |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `M3` | interaction |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `M4` | interaction |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `M5` | interaction | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `M6` | interaction | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `M7` | interaction | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `M8` | interaction | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
-
-### output
-
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `O1` | interaction | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
-| `O2` | interaction | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+| `MEMW-C3` |  | `ADD<address_add[0]::DWordWL; base_address, 1>` | w2 |
+| `MEMW-C4.i` | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
+| `MEMW-C5.i` | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
+| `MEMW-C6.i` | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
+| `MEMW-C7` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `MEMW-C8` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `MEMW-C9.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `MEMW-C10.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+
+As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
+
+We additionally check that the address does not overflow for more significant bytes of the access.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW-CR11` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
+| `MEMW-CR12` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
+| `MEMW-CR13` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
+
+The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW-CM14` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM15` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM16` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM17` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM18.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM19.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM20.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM21.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+
+This chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW-CO22` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
+| `MEMW-CO23` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+
+## Future optimization ideas
+
+- Fast path for aligned memory access where all bytes have the same old timestamp - MEMB chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Compute `base_address[1] + 1` once and have high words of `address_add` as Words - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALFWORD` lookups may make some GKR things faster if there are known zeroes.
\ No newline at end of file
diff --git a/docs/spec/mul.md b/docs/spec/mul.md
index 0df284d98..25f7b02e3 100644
--- a/docs/spec/mul.md
+++ b/docs/spec/mul.md
@@ -72,35 +72,69 @@ carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
 | `μ_lo` | `BaseField` |  |
 | `μ_hi` | `BaseField` |  |
 
+The `MUL` chip is comprised of  variables that are expressed using  columns:
+
+`mat(delim: , top; bottom)` }
+
 ## Assumptions
 
-| Ref | Range | Description |
+| Tag | Range | Description |
 |-----|-------|-------------|
-| `A1` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` |
-| `A2` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
+| `MUL-A1.i` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` |
+| `MUL-A2.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
+
+The following range checks are assumed to be performed/enforced outside of this chip:
 
 ## Constraints
 
-### def
+### Overview
+
+When `lhs` and `rhs` are _unsigned_ integers, computing their product `mod 2^128` comes down to evaluating $ (sum_(j=0)^3 2^(16j) dot `lhs`_j) dot (sum_(i=0)^3 2^(16i) dot `rhs`_i) mod 2^128. $ If `lhs` and `rhs` are signed instead, the computation remains nearly identical: based on their signs, one must either zero or one-extend `lhs` and `rhs` --- forming `lhs_ext` and `rhs_ext` respectively --- and compute their product `mod 2^128`: $ (sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128. $ where `lhs_ext` and `rhs_ext` are treated as _unsigned_ integers. Note that by setting the extension limbs of `lhs` and/or `rhs` to `0` when the integer is (i) unsigned or (ii) signed and non-negative, this second formula still applies. For the purposes of constraining the multiplication operation, we rewrite this formula as
+
+$ &(sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128 \ &equiv sum_(j=0)^7 sum_(i=0)^7 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(triangle, equiv) sum_(j=0)^7 sum_(i=0)^(7-j) 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(square, equiv) sum_(j=0)^7 sum_(i=j)^(7) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &stackrel(penta, equiv) sum_(i=0)^7 sum_(j=0)^(i) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &equiv sum_(i=0)^3 sum_(k=0)^1 sum_(j=0)^(2i+k) 2^(16(2i+k)) dot `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 \ &equiv sum_(i=0)^3 2^(32i) dot sum_(k=0)^1 2^(16k) dot sum_(j=0)^(2i+k) `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 $ where at step - `triangle` we can ignore `i > 7-j`, since that makes `2^(16(i+j)) equiv 0 mod 2^128`, - `square` we rewrite the second summation such that `i` iterates from `j` to 7, rather than `0` to `7-j`, and - `penta` we swap the sums.
+
+We let `raw_product` capture the second summation in this last formula (see [mul:c:raw_product]). By construction, ``raw_product`_i < 2^51` for all `i in [0, 3]`, far exceeding the 32 bits that fit in a single `Word`-limb. What remains then is to reduce each limb of `raw_product` `mod 2^32`, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
+
+This reduce-and-carry operation is constrained by [mul:c:range_lo]/[mul:c:range_hi] and [mul:c:carry], combined with `carry`'s definition. [mul:c:carry] and `carry`'s definition enforce that $ forall i in [0, 3]: `raw_product`_i + `carry`_(i-1) - `res`_i in { k dot 2^32 | k in [0, 2^20) } $ with ``carry`_(-1) = 0` for simplicity. In other words: ``res`_i equiv `raw_product`_i + `carry`_(i-1) (mod 2^32)`. With [mul:c:range_lo]/[mul:c:range_hi] forcing ``res`_i < 2^32`, ``res`_i` can only assume one value: ``raw_product`_i + `carry`_(i-1) mod 2^32`.
+
+*Note*: one may have observed that [mul:c:carry] requires ``carry`_i in [0, 2^20)`, while no limb of a valid carry value would ever exceed `2^19`. This is indeed the case. However, there is some slack in how tight one has to constrain the `carry` values. In fact, in this situation it suffices to assert that ``carry`_i < frac(p, 2^32, style: "skewed") approx 2^31`, where `p` denotes the field's modulus. Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
+
+### Definitions
 
-| Ref | Kind | Range | Description | Multiplicity |
-|-----|------|-------|-------------|--------------|
-| `mul:c:lhs_is_negative` | template |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
-| `mul:c:rhs_is_negative` | template |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
-| `mul:c:range_lo` | interaction | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
-| `mul:c:range_hi` | interaction | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
-| `mul:c:carry` | interaction | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
+We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
 
-### prod
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
+| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
+| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
+| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
+| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
 
-| Ref | Kind | Range | Description |
-|-----|------|-------|-------------|
-| `mul:c:raw_product` | arith | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
+### Product
+
+[mul:c:raw_product] defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
 | | | _polynomial:_ `Σ_k = 0^1 2^(16 * k) * Σ_j = 0^2 * i + k lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
 
-### lookup
+### Lookup
+
+The `MUL` chip contributes the following to the lookup:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
+| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+## Notes
+
+- `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere.
 
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `mul:c:lookup_lo` | interaction | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
-| `mul:c:lookup_hi` | interaction | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
+As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
\ No newline at end of file
diff --git a/docs/spec/shift.md b/docs/spec/shift.md
index cde491157..bd9b9980a 100644
--- a/docs/spec/shift.md
+++ b/docs/spec/shift.md
@@ -1,5 +1,15 @@
 # SHIFT Chip
 
+## Interface
+
+The `SHIFT` chip has the following interface:
+
+``` // param in: the value being shifted // param shift: the number of bits to shift `in` by // param direction: whether to shift left (0) or right (1) // param signed: whether to interpret `in` as a signed (1) or unsigned (0) integer // param word_instr: whether to execute the SLL/SR* (0) or SLLW/SR*W (1) instruction // out shifted: the resulting value SHIFT[shifted: DWord; in: DWord, shift: Byte, direction: Bit, signed: Bit, word_instr: Bit] ``` In other words, the `SHIFT` chip is designed to constrain that
+
+$ `shifted` = cases( (`in` `<<` s) mod 2^64 & "if" `direction` = 0, `in` `>>` s & "if" `direction` = 1 "and" `signed` = 0, `in` `>>>` s & "if" `direction` = 1 "and" `signed` = 1 ), quad "where" s := `shift` mod (64 - 32 dot `word_instr`). $
+
+Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.
+
 ## Columns
 
 ### Input
@@ -77,65 +87,112 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
+The `SHIFT` chip is comprised of  variables that are expressed using  columns:
+
 ## Assumptions
 
-| Ref | Range | Description |
+| Tag | Range | Description |
 |-----|-------|-------------|
-| `shift:a:range_in` | i ∈ [0, 3] | `IS_HALFWORD[in[i]]` |
-| `shift:a:range_shift` |  | `IS_BYTE[shift]` |
-| `shift:a:direction` |  | `IS_BIT<direction>` |
-| `shift:a:signed` |  | `IS_BIT<signed>` |
-| `shift:a:word_instr` |  | `IS_BIT<word_instr>` |
+| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALFWORD[in[i]]` |
+| `SHIFT-A2` |  | `IS_BYTE[shift]` |
+| `SHIFT-A3` |  | `IS_BIT<direction>` |
+| `SHIFT-A4` |  | `IS_BIT<signed>` |
+| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
 
-## Constraints
+## Explanation
 
-### left_flag
+This chip has a rather complex design as a result of designing it to fit in as few columns as possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
 
-| Ref | Kind | Description |
-|-----|------|-------------|
-| `shift:c:direction_implies_mu` | arith | `direction` => `μ` = 1 |
-| | | _polynomial:_ `direction * (1 - μ) = 0` |
-| | | _note:_ enforces `left` is `Bit`. |
+The chip's design revolves around a two-phase shifting process: 1. shift `in` by `x := `shift` mod 16` bits, 2. shift that result by `(`shift`-x) mod 64` (or `mod 32` if ` `word_instr` = 1`). The intermediate value representing the state between the two phases is stored in the scratch variables `X` and `Y`. The definition of `shifted` describes how one can combine the `X`, `Y` and `extension` variables to construct the output value as described using `Half`-limbs. The output variable `out` is equivalent to `shifted`, but expressed using `Word`-limbs.
 
-### is_negative
+In the following, we cover how these two phases were designed to complement one another. Here, we start with discussing the _logical_ left/right shift operations only; the modifications required to compute the _arithmetic_ right shift will be discussed at the end.
 
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `shift:c:is_negative_if_signed` | interaction | `MSB16[is_negative; in[3]]` | signed |
+### First phase
 
-### bit_shift
+We zoom in on the first step. Here, we make use of the two lookup operations - ``HWSL[x: Half, y: B4]` := (`x` `<<` `y`) mod 2^16` (short for "HalfWord Shift Left"), and - ``HWSLC[x: Half, y: B4]` := `x` `>>` (16-`y`)` (short for "HalfWord Shift Left's Carry"). Note here that one can use these two lookups to compute `out: Half[4] := in << y` as: $ `out[i]` = `HWSL[in[i], y]` + `HWSLC[in[i-1], y]` quad "for" i in [0, 3], $
 
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `shift:c:bit_shift_if_left` | interaction | `AND_BYTE[bit_shift; shift, 15]` | left |
-| `shift:c:bit_shift_if_right` | interaction | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
-| `shift:c:zbs` | template | `IsZero<zbs; bit_shift>` | μ |
+where the `HWSLC` term is omitted for `i = 0`, as long as ``y` < 16`. Observing that ``HWSL[x,` 16-`y]` = (`x` `<<` (16-`y`)) mod 2^16`, and ``HWSLC[x,` 16-`y]` = `x` `>>` `y`` for ``y` in [1, 15]`, one can also use these lookups to compute `out := in >> y` as $ `out[i]` = `HWSLC[in[i], 16-y]` + `HWSL[in[i+1], 16-y]` quad "for" i in [0, 3], $
 
-### intra_limb_shift
+where the `HWSL` term is omitted for `i = 3`, as long as `0 < `y` < 16`.
+
+Observe now that the values being looked up are (almost) independent from the direction of the shift: only the shift-amount varies slightly. When we now define $ `bit_shift` := cases( `shift` mod 16 & "when shifting left",
+
+(16-`shift`) mod 16 & "when shifting right" ), $ it only takes some rearranging and combining of the values ``X[`i`] := HWSL[in[`i`], bit_shift]`` and ``Y[`i`] := HWSLC[in[`i`], bit_shift]`` to form the limbs of ``in <</>> shift` mod 16`. In the remaining case that ``right` = 1` and ``shift` = 0 mod 16`, the limbs of ``in <</>> shift` mod 16` simply match those of `in`.
+
+### Second phase
+
+Since we're operating on 16-bit limbs, all the limbs in ``in <</>> shift`` must also occur somewhere in ``in <</>> shift` mod 16`. The number of full-limbs we still need to shift is determined by the fifth and sixth least significant bit of `shift`. With `limb_shift` containing a unary decoding of the integer represented by these two bits, we find that the intermediate value needs to be shifted over by `i` limbs (to the `left` or `right`) when ``limb_shift[`i`]` = 1`. These things combined yield `shifted`'s definition.
+
+Of course, when ``word_instr` = 1` and, thus, only ``shift` mod 32` should be considered, the bit-mask for the lookup constraining `limb_shift` is adjusted appropriately (see [shift:c:limb_shift_lookup]).
+
+### Arithmetic right shift
+
+Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `extension` is constrained to contain a repetition of `in`'s most significant bit. Copies of this variable are used for any full limbs shifted in when ``right` = `signed` = 1`. Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of ``in >>> shift` mod 16` as the appropriate intermediate.
+
+## Constraints
 
-| Ref | Kind | Range | Description | Multiplicity |
-|-----|------|-------|-------------|--------------|
-| `shift:c:hwsl_if_not_zero` | interaction | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
-| `shift:c:zbs_implies_X` | arith | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
+First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that ``right` = 1` and ``shift` = 0 mod 16`.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C3` | `AND_BYTE[bit_shift; shift, 15]` | left |
+| `SHIFT-C4` | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
+| `SHIFT-C5` | `IsZero<zbs; bit_shift>` | μ |
+
+Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
+
+The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows. To prevent unnecessary lookups in padding rows, we override ``X[i]` := `in[i]`` and ``Y[i]` := 0` here.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C6.i` | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C7.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
 | | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
-| `shift:c:hwsl_x4_if_not_zero` | interaction |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
-| `shift:c:zbs_implies_X_4` | arith |  | `zbs` => `X[4]` = 0 |  |
+| `SHIFT-C8` |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
+| `SHIFT-C9` |  | `zbs` => `X[4]` = 0 |  |
 | | | _polynomial:_ `zbs * X[4] = 0` | |
-| `shift:c:hwslc_if_not_zero` | interaction | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
-| `shift:c:zbs_implies_Y` | arith | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| `SHIFT-C10.i` | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C11.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
 | | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
 
-### limb_shifting
+### Full-limb shifting
+
+Next, we constrain that `limb_shift` is a proper unary encoding of the fifth (and sixth if ``word_instr` = 0`) bit of `shift`. For this to be the case, three requirements must be satisfied: + *unary(0)*: ``limb_shift[`i`]` in {0, 1}` for `i in [0, 3]`, + *unary(1)*: ``limb_shift[`i`]` = 1` for exactly one `i`, and + *proper encoding*: ``limb_shift[`i`]` = 1 <=> 1/16 (`shift &` (48-32 dot `word_instr`)) = i`. The first requirement is enforced by constraint [shift:c:limb_shift_is_bit]. To construct a constraint for the second and third requirement, observe that $ 1/16 dot (`shift &` (48-32 dot `word_instr`)) in cases( {0, 1, 2, 3} &"if" `word_instr` = 0, {0, 1} &"if" `word_instr` = 1 ) $ Observe moreover that, assuming *unary(0)*, the expression $ 1/16 dot (1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]`) $ can evaluate to `i` if and only if ``limb_shift[`i`]` = 1`, while the others are `0`. This means that the relation $ 1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]` = `shift &` (48-32 dot `word_instr`) $ enforces both *unary(1)* and *proper encoding*. This is the exact relation [shift:c:limb_shift_lookup] enforces.
 
-| Ref | Kind | Range | Description | Multiplicity |
-|-----|------|-------|-------------|--------------|
-| `shift:c:limb_shift_is_bit` | template | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
-| `shift:c:limb_shift_lookup` | interaction |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
-| `shift:c:out_eq_shifted` | arith | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+Hereafter, one must only check that `out` is the proper cast of `shifted` into a `DWordWL`.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C12.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `SHIFT-C13` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
+| `SHIFT-C14.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
 | | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
 
-### lookups
+### Miscellaneous
+
+*Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
+
+### Lookups
+
+This chip adds the following interaction to the lookup.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C15` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+### is_negative
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
+
+### left_flag
 
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `shift:c:lookup` | interaction | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
+| Tag | Description |
+|-----|-------------|
+| `SHIFT-C1` | `direction` => `μ` = 1 |
+| | _polynomial:_ `direction * (1 - μ) = 0` |
\ No newline at end of file
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
new file mode 100644
index 000000000..29ec7e963
--- /dev/null
+++ b/docs/spec/spec_full.md
@@ -0,0 +1,1389 @@
+# Lambda VM Specification
+
+# Memory Argument
+
+As part of fully proving the correct execution of a RISC-V program, the VM must ensure that memory reads and writes are consistent. That is, every byte read from some address corresponds to the byte that was last written to that address --- or the initial value if nothing has been written yet. We consider "memory" in a broad sense here: both RAM and the general purpose registers can be seen as instantiations of memory and are therefore handled simultaneously.
+
+While RAM is byte addressed, we do choose to store registers as a `DWordWL` over two word addresses.
+
+On a high level, we ensure memory consistency by an interacting system of reads and writes to a lookup argument, combined with an initialization and finalization scheme. The initialization and finalization schemes together ensure both that (1) the necessary preconditions for the lookup system are satisfied, and (2) the program is executed with the correct initial memory and register contents as specified by the ELF binary and the ISA.
+
+## Memory types
+
+A commonly made distinction of memory types is that of _read-only_ and _read-write_ memory, with the more restrictive read-only variant often allowing for more efficient solutions (be that regarding prover time, verifier time or proof size) via table lookup proofs. Naturally, the VM’s main memory and registers should be handled by a read-write system as the guest program/environment can issue instructions that write to memory. While there are some subsystems that can be modelled as read-only memory ---e.g., the program memory and instruction decoding--- we opt to integrate these into the proof system via chip interactions (relying on techniques derived from table lookup arguments). As such, we only concern ourselves with read-write memory, moving forward.
+
+## Memory operations
+
+Every memory operation has some conceptual attributes that are relevant to mention or discuss:
+
+- The type of operation (read or write) - The memory address --- this is an address in the broad sense: main memory and registers have their own dedicated part of the unified address space. - The value being read from or written to the memory address - When the value was read or written, see the below paragraph
+
+Since we will have to ensure that memory accesses are temporally consistent within the execution of the VM, we additionally consider a _timestamp_ for every memory access, which should be strictly increasing. As such, it should never be possible for the system to generate accesses to the same address at identical timestamps. Multiple memory accesses can (and indeed will, consider e.g. register reads) occur in a single execution cycle of the VM, so we cannot use the cycle counter directly as timestamp for register accesses. We can, however, statically bound the maximal number of memory accesses made during a single execution cycle by a granularity constant `k` and derive timestamps from the cycle counter. The `i`th possible memory access in cycle `c` will obtain as timestamp the value `k * c + i`. For simplicity, we will always reserve a timestamp for every possible memory access, and leave the timestamp unused if an instruction does not use it.
+
+For reasons of completeness (since temporal integrity as discussed below is a security necessity), we cannot deal with multiple accesses to the same address at identical timestamps. However, if multiple accesses are guaranteed to be independent (that is, to different addresses), they can still share a timestamp --- consider, e.g., the case of reading a word as 4 bytes with the `LW` load instruction. This property is already taken into account where possible in the design of the system. For instance, in the CPU chip, we can ensure that there are at most 3 memory accesses not guaranteed to be independent, so a timestamp granularity of 4 timestamps per cycle is enough.
+
+## Permutation argument
+
+We can conceptually organise the state of the memory as a collection of "tokens" that represent tuples `(timestamp, address, value)`, meaning the current value written to `address` is `value`, last written to memory at `timestamp`. Having exactly one value associated with any address will be ensured (see further down in this document) by the interaction of memory initialization, memory finalization, and the effects of memory operations.
+
+Each memory operation will then do two things:
+
+- Consume the current token in the memory - Emit a new token to replace it
+
+Naturally, for a read operation, the _values_ embedded in the consumed and emitted tokens must be identical. From the need to consume a token even on the first memory access, we can see the necessity for a memory initialization procedure ---in addition to having to make sure the initial memory content lines up with what the binary dictates.
+
+> **Note:** properly link/refer to the logup spec
+
+So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens), this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument: consuming a token corresponds to a "receive" and emitting a new token is a "send".
+
+## Temporal integrity
+
+> **Note:** Properly link/refer to the LT chip
+
+To ensure temporal integrity, every memory operation needs to be constrained for the newly emitted token to have a strictly greater timestamp than the consumed token. This raises the question of how to represent timestamps and cleanly perform this check, as over a finite field the “less than” relation is ill-defined (though it is common and natural to consider it as the less than relation over the natural lift of the field into the integers). We choose to represent timestamps as machine words, using the existing `LT` chip ([lt]) functionality for comparisons.
+
+- Clean definition of “less-than”, using the already existing `LT` functionality in the ALU - Harder to perform increments, needing extra constraints beyond field arithmetic - But this can be alleviated by providing a precomputed column that has a fixed increment per CPU row ][ - Comparison is more annoying, but can work by: - Decomposition into a machine word and chip interaction with the LT chip - Bit decomposition and comparison constraints - Range-checking the difference to be sufficiently small w.r.t. the field characteristic. - Increments and basic arithmetic operations are cheap ] ]
+
+> **Note:** reference to CPU chip/timestamp column and MEMW chip
+
+## Initialization and Finalization
+
+Because the LogUp argument handling token consumption and emission needs to be fully balanced --- every token emitted should be consumed, and vice versa --- we need to have a system to emit the initial tokens and consume the final tokens. This needs to ensure that every address has at most a single initializing emission, and at most one finalizing consumption. Having at most one initialization will, through the correctness of the lookup argument, immediately lead to having at most one correct finalization, and vice versa.
+
+The initialization will need to correspond to a fixed initial register state for the VM, as well as the memory loaded from the program binary, zero-initialization of memory elsewhere, and private input provided by the prover. The contribution of initialization with static data from the ELF executable and the initial register state to the sum can be handled directly by the verifier, ensuring correctness corresponding to the ELF binary being proven. This leaves only zero-initialization and prover input as prover-side concerns for initialization, alongside the finalization of the entire used memory.
+
+For our chosen scheme (which we refer to as "paged initialization/finalization"), the available memory range is split into equally (power-of-two) sized "pages". Each address can then be represented as `address = page_base_address + page_offset`, with `page_base_address` being "page-aligned", and `page_offset` belonging to a limited range (the page size). As such, initialization or finalization of a page is represented by a table with columns `page`, `offset`, `value`, and ---for finalization--- `timestamp`. The `page` column is a preprocessed, constant value (which can be entirely virtualized/inlined into the constraints for this table), and the `offset` column is a preprocessed column containing its row index. Depending on the type of initialization, `value` can be a prover-committed column (input data), or a precomputed, constant column containing `0` (free memory space). This table then feeds into the LogUp system in the normal way, emitting the initial tokens for all addresses in a page, without consuming any tokens. Since the `offset` column is always the same, it can be reused across all paged initialization and finalization tables.
+
+Concretely, each page gets an associated `PAGE` table, consisting of N variables over N columns. For each such table, the `page` variable is instantiated as the constant base address of the page. The `offset` column is preprocessed, which helps the verifier ensure that each page has a single fixed size, but the verifier should still check that no pages overlap and all `page` values are page-aligned.
+
+### Page initialization
+
+> **Note:** check whether we need `fini` to be range-checked
+
+We present here a set of constraints on the `PAGE` table that
+
++ enforces the initial and final values of each address are bytes + adds the initial and final interaction to the LogUp argument
+
+For zero-initialized pages, `init` can be a constant `0`, and hence doesn't need a column, nor a range check.
+
+We identify a few alternatives that would achieve the desired initialization/finalization functionalities, and consider their respective trade-offs.
+
+_"Free-zero" initialization_
+
+Zero-initialization could be achieved by allowing the `MEMW` chip to output a zero without consuming a token from the lookup argument. This would in turn be made secure by finalization consuming at most one token per address: if an address is initialized more than once, the proof cannot be finalized. - This requires fewer pages (and hence tables) for zero-initialization. - But it comes at a cost of added complexity in the `MEMW` chip, and likely some extra columns to handle this. Keeping track of initialized addresses, and potentially having to initialize only some of the bytes in a word-read may make bookkeeping challenging. - This is an alternative form of sparse initialization (see below), so it is incompatible with paged finalization. Paged finalization can be made into a compatible sparse form by adding a bit-checked multiplicity column.
+
+_Sparse initialization/finalization_
+
+One or more STARK tables (depending on the amount of memory used) consisting of `(address, value)` columns are introduced, where for zero-initialization, `value` can be constant zero. Transition constraints ensure that `address` is strictly increasing, enforcing the "at most once" property; `value` is range-checked to consist of bytes. Similar to paged finalization, an additional `timestamp` column is added, containing the final timestamp at which each address was accessed. This table is then further used to contribute to the LogUp sum as with any other interactions. - The transition constraints can be chosen to only apply on finalization, as at-most-once finalization is enough to ensure consistency. - Sparse initialization is incompatible with paged finalization, see also the remark under free-zero initialization above. - This would require transition constraints, which currently are not needed elsewhere in the VM design - Additionally, for memory use exceeding the capacity of a single initialization/finalization table, some form of transition constraint between tables is needed - Alternatively, transition constraints could potentially be avoided by more integration into the LogUp system, but this could turn out more costly in practice - This is compatible with the above "free zero" initialization - Since a prover-committed address column is needed (rather than a precomputed one), the number of required columns increases. - As an optimization, the address column could potentially be used simultaneously for initialization and finalization - Sparse initialization/finalization reduces the cost for sparse memory access patterns, where only a few addresses would be accessed per page. Most programs and compilers should however favor a memory locality that makes paged initialization/finalization comparable.
+
+### Register initialization/finalization
+
+> **Note:** Properly link/reference ECALL/HALT chip
+
+The initial and final state of registers can be entirely known by the verifier, since the relevant initialization values are either zero, or embedded in the ELF, and the final values can be set to a known value by the HALT ecall. Since, additionally, the number of registers is small, the verifier can directly add the required balancing terms to the LogUp sum.
+
+## Notes and considerations
+
+- Register reads and writes may interact within a single cycle, so a correct and fixed ordering needs to be ensured - Correctness of initialization and completeness of finalization need to be ensured
+
+## Future topics of interest
+
+- Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research)
+
+---
+
+# Variables
+
+While this VM operates on 64-bit words, the proving system's base field has fewer than `2^64` elements available and thus cannot represent all words natively. To this end, we introduce the concept of "variables" as an abstraction layer on top of the VM's field elements. The following table lists all variable types used in this VM.
+
+columns: (auto, 1fr, auto), inset: 7pt, align: (top+left, top+left, top+center, ), table.header([*Name*], [*Description*], [*\*]), ..for type in config.variables.types { ([], [], [.subtypes.len()]) },
+
+---
+
+# IS_BIT Template
+
+box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
+
+Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
+
+## Interface
+
+The `IS_BIT` constraint template has the following interface:
+
+where `cond` is any value described by an expression _of degree at most `1`_. Note that `IS_BIT<X>` can be used to denote the _unconditional_ application of the `IS_BIT` template to `X`.
+
+## Variables
+
+The `IS_BIT` template operates on two variables: `cond` and `X`:
+
+## Constraints
+
+It takes only one constraint to enforce that `X` must be either `0` or `1` whenever `cond ≠ 0`:
+
+*Note*: - In case of _unconditional_ template application, `cond` can be dropped from the constraint, simplifying it to `X (1 - X) = 0`. - As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression. This is to make sure that the expression of `IS_BIT-C1` has degree at most 3.
+
+## Proof of correctness
+
+If `cond` is `0`, `IS_BIT-C1` is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When `cond ≠ 0`, it follows that the statement can only be proven when `X (1 - X) ≡ 0 mod p`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either `X ≡ 0 mod p` or `1 - X ≡ 0 mod p`. Hence, it is proven that when `cond ≠ 0`, `IS_BIT-C1` is only satisfied if `X ∈ {0, 1}`.
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `X` | `BaseField` | Value for which to assert that it lies in the range $\{0, 1\}$. |
+
+### Condition
+
+| Name | Type | Description |
+|------|------|-------------|
+| `cond` | `BaseField` | Whether the constraint should be applied ($\neq 0$) or not ($0$). |
+
+### all
+
+| Tag | Description |
+|-----|-------------|
+| `IS_BIT-C1` | `cond` => `X` (1-`X`) = 0 |
+| | _polynomial:_ `cond * X * (1 - X) = 0` |
+
+---
+
+# ADD/SUB Template
+
+box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
+
+## Notation
+
+The `ADD` constraint template has the following interface:
+
+where `cond` is any value described by an expression _of degree at most `1`_.
+
+### `SUB`
+
+For ease of notation, we moreover introduce the `SUB` constraint template. Its interface
+
+maps onto the `ADD` template as
+
+It constrains that `diff = lhs - rhs mod 2^64` when the expression `cond` is non-zero. As with `ADD`, the condition can be omitted to denote the _unconditional_ application of the template.
+
+## Variables
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `ADD-A1.i` | i ∈ [0, 1] | `IS_WORD[lhs[i]]` |
+| `ADD-A2.i` | i ∈ [0, 1] | `IS_WORD[rhs[i]]` |
+| `ADD-A3.i` | i ∈ [0, 1] | `IS_WORD[sum[i]]` |
+
+## Constraints
+
+This template introduces the following constraints
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs` | `DWordWL` | left-hand operator |
+| `rhs` | `DWordWL` | right-hand operator |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `sum` | `DWordWL` | `lhs + rhs` |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `carry` | `Bit[2]` | Carry values used to constrain the addition |
+
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * (lhs[0] + rhs[0] - sum[0])
+carry (when iter=1) := 2^-32 * (lhs[1] + rhs[1] + carry[0] - sum[1])
+```
+
+### Condition
+
+| Name | Type | Description |
+|------|------|-------------|
+| `cond` | `BaseField` | Whether the relation should be enforced ($\neq 0$) or not ($0$). |
+
+### all
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `ADD-C1.i` | i ∈ [0, 1] | cond ⇒ `IS_BIT<carry[i]>` |
+
+---
+
+# DECODE Table
+
+All `RV64IMC` instructions are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table, can be derived from the `packed_decode` variable provided below.
+
+## Columns
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
+| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`; \ [27:35] `rs1`, \ [35:43] `rs2`, \ [43:51] `rd`, \ the remaining bits are set to zero.  |
+| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
+
+The  table is comprised of  variables that are expressed using  columns:
+
+## Padding
+
+The `DECODE` table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
+
+Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see [cpu:c:ebreak_traps]), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
+
+## Decoding
+
+For the purposes of explaining decoding, we decompress the `packed_decode` variable of `DECODE` into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
+
+We will illustrate how each instruction should be expressed in this (uncompressed) decoding table. The columns of the accompanying table represent the following: - *`operation`*: the assembly operation being encoded. - *`op-flag`*: which of the "`ALU` selector flags" operation flags to set. Each operation sets exactly one. - *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively. - *other*: the other flags that should be set or variables that should be given specific values.
+
+For the purpose of brevity and readability, the table uses the following rules-of-thumb: + `rd`, `rs1`, `rs2`, and `imm` are mapped to the values provided by the instruction; when a value is not specified by an instruction it defaults to `0`. + `read_register1`, `read_register2` and `write_register` are set to `1` when respectively `rs1 != 0`, `rs2 != 0`, or `rd != 0`. + Any flag that is not listed is set to `0`, with the exception of the `c_type` flag. *The `c_type` flag is set independently of the below table*, as explained next.
+
+Further clarification is provided in the notes following the table.
+
+### C-type instructions
+
+The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32 bits), saving \~25% in code size. The execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
+
+/// Add a reference to one or more notes following this table. super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
+
+show figure: set block(breakable: true)
+
+figure(table( columns: (auto, auto, 40pt, 40pt, 1fr, 15pt), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => if calc.odd(y) and y <= lines.len() { luma(245) } else { white }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), ), caption: [Decoding table] }
+
+// OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
+
+// Construct a note that can be referenced through `lbl` show figure: (it) => align(left, []) [ ] }
+
+#### Notes
+
+We note the following about the above decoding table:
+
+enum.item( referenceable_note( "note_word_instr", [`word_instr`: `[W]` indicates that ``word_instr` = 1` for the `W`-variant of the operation, and `0` for the non-`W`-variant.] ), enum.item( referenceable_note( "note_signed", [`signed`: .not`[U]` indicates that ``signed` = 1` for the *non-`U`*-variant of the operation, and `0` for the `U`-variant.] ), enum.item( referenceable_note( "note-lui", [`LUI`: this operation loads the 20-bit `imm` in the upper bits of `rd`. Observe that this can be represented using `ADDI rd, x0, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-auipc", [`AUIPC`: this operation adds the 20-bit immediate to the upper bits of `pc` and stores the result in `rd`. Given that the `pc` is stored in `x255`, this operation can be represented using `ADDI rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-jal", [`JAL`: this operation stores ``pc` + 4` in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`. Note that this can be represented using `JALR rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[1:21]` of `imm` and extending it to 64 bits; the least significant bit should always be 0.*] ), enum.item( referenceable_note( "note-ecall", [`ECALL`: "On RISC-V a system call has its own instruction: `ECALL`. [...] A7 [= register `x17`] contains the system call number." [[source]] ] ), enum.item( referenceable_note( "note-fence", [`FENCE`: currently, the VM interprets this operation as `ADDI x0 x0 0`; a no-op.]
+
+## One more instruction <cpu-padding-decode-row>
+
+In addition to decoding all instructions provided in the ELF and adding a corresponding entry to the `DECODE` table, one must include an entry that has `pc = 1` and every other variable set to `0`. Note that this will never conflict with any entry in the ELF, since it has an odd `pc` value.
+
+This entry is used to pad the `CPU` table. More details on this matter are provided in the `CPU` chip.
+
+---
+
+# CPU Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
+| `pc` | `DWordWL` | The program counter |
+| `rs1` | `Byte` | Source register 1 index |
+| `rs2` | `Byte` | Source register 2 index |
+| `rd` | `Byte` | Destination register index |
+| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
+| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
+| `write_register` | `Bit` | Whether to write back to the destination register |
+| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
+| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
+| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
+| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
+| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
+| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
+| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and     - as flag for inverting the condition of conditional branches (see `branch_cond`)     - as direction (left or right) for `SHIFT` |
+| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
+| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
+| `ADD` | `Bit` | One-hot ALU selector flag |
+| `SUB` | `Bit` | One-hot ALU selector flag |
+| `SLT` | `Bit` | One-hot ALU selector flag |
+| `AND` | `Bit` | One-hot ALU selector flag |
+| `OR` | `Bit` | One-hot ALU selector flag |
+| `XOR` | `Bit` | One-hot ALU selector flag |
+| `SHIFT` | `Bit` | One-hot ALU selector flag |
+| `JALR` | `Bit` | One-hot ALU selector flag |
+| `BEQ` | `Bit` | One-hot ALU selector flag |
+| `BLT` | `Bit` | One-hot ALU selector flag |
+| `LOAD` | `Bit` | One-hot ALU selector flag |
+| `STORE` | `Bit` | One-hot ALU selector flag |
+| `MUL` | `Bit` | One-hot ALU selector flag |
+| `DIVREM` | `Bit` | One-hot ALU selector flag |
+| `ECALL` | `Bit` | One-hot ALU selector flag |
+| `EBREAK` | `Bit` | One-hot ALU selector flag |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `next_pc` | `DWordWL` | The program counter for the next instruction |
+| `rvd` | `DWordWL` | The value to (maybe) be written back to register `rd` |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `rv1` | `DWordWHH` | The value of register `rs1` |
+| `rv2` | `DWordWHH` | The value of register `rs2` |
+| `rv1_sign_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word |
+| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
+| `arg2_sign_bit` | `Bit` | The sign bit of `arg2` if seen as a 32-bit word |
+| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
+| `res_sign_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word |
+| `res` | `DWordBL` | The ALU result |
+| `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
+| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
+| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
+
+**Definition of `packed_decode`:**
+```
+packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
+```
+
+**Definition of `pad`:**
+```
+pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
+```
+
+The `CPU` chip is comprised of  variables that are expressed using  columns:
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
+| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+
+## Constraints
+
+First, we perform a decoding lookup for the current PC.
+
+| Tag | Description |
+|-----|-------------|
+| `CPU-C1` | `DECODE[pc, imm, packed_decode]` |
+
+> **Note:** All casts for interactions will have to be reviewed once other chip interfaces stabilise
+
+### Range checks
+
+> **Note:** Make sure we argue for every column here
+
+> **Note:** is `rvd` still sufficiently constrained? (can also be done through the memory argument like `pc`?)
+
+We constrain all columns to have the appropriate ranges. The flags and register indices looked up from the decoding need to be checked, as they are communicated through the interaction in a packed form. In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`. Similarly, since `next_pc` will propagate through the memory argument and be looked up in the instruction decoding on the next cycle, it is forced to be in the correct range. For the auxiliary columns, we need to check the limbs of `arg1`, `arg2`, and `res`. The ranges of the other auxiliary columns are enforced through later constraints.
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `CPU-CR2` |  | `IS_BIT<read_register1>` |
+| `CPU-CR3` |  | `IS_BIT<read_register2>` |
+| `CPU-CR4` |  | `IS_BIT<write_register>` |
+| `CPU-CR5` |  | `IS_BIT<memory_2bytes>` |
+| `CPU-CR6` |  | `IS_BIT<memory_4bytes>` |
+| `CPU-CR7` |  | `IS_BIT<memory_8bytes>` |
+| `CPU-CR8` |  | `IS_BIT<c_type_instruction>` |
+| `CPU-CR9` |  | `IS_BIT<signed>` |
+| `CPU-CR10` |  | `IS_BIT<mp_selector>` |
+| `CPU-CR11` |  | `IS_BIT<muldiv_selector>` |
+| `CPU-CR12` |  | `IS_BIT<word_instr>` |
+| `CPU-CR13` |  | `IS_BIT<ADD>` |
+| `CPU-CR14` |  | `IS_BIT<SUB>` |
+| `CPU-CR15` |  | `IS_BIT<SLT>` |
+| `CPU-CR16` |  | `IS_BIT<AND>` |
+| `CPU-CR17` |  | `IS_BIT<OR>` |
+| `CPU-CR18` |  | `IS_BIT<XOR>` |
+| `CPU-CR19` |  | `IS_BIT<SHIFT>` |
+| `CPU-CR20` |  | `IS_BIT<JALR>` |
+| `CPU-CR21` |  | `IS_BIT<BEQ>` |
+| `CPU-CR22` |  | `IS_BIT<BLT>` |
+| `CPU-CR23` |  | `IS_BIT<LOAD>` |
+| `CPU-CR24` |  | `IS_BIT<STORE>` |
+| `CPU-CR25` |  | `IS_BIT<MUL>` |
+| `CPU-CR26` |  | `IS_BIT<DIVREM>` |
+| `CPU-CR27` |  | `IS_BIT<ECALL>` |
+| `CPU-CR28` |  | `IS_BIT<EBREAK>` |
+| `CPU-CR29` |  | `IS_BYTE[rs1]` |
+| `CPU-CR30` |  | `IS_BYTE[rs2]` |
+| `CPU-CR31` |  | `IS_BYTE[rd]` |
+| `CPU-CR32.i` | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` |
+| `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` |
+| `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` |
+
+### ALU
+
+The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU-CA35` |  | ADD + LOAD + STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `CPU-CA36` |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `CPU-CA37` |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
+| `CPU-CA38.i` | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
+| | | _polynomial:_ `(SLT + BLT) * res[i] = 0` | |
+| `CPU-CA39.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
+| `CPU-CA40.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
+| `CPU-CA41.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
+| `CPU-CA42` |  | `SHIFT[res::DWordHL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
+| `CPU-CA43` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
+| `CPU-CA44` |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
+| `CPU-CA45` |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
+
+### Memory
+
+This section handles the interactions with the memory, both for register loading and storing and for the `LOAD` and `STORE` instructions. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The timestamps are ensured to be disjoint for disjoint memory locations. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU-CM46` |  | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0, 1, 0, 0]` | read_register1 |
+| `CPU-CM47.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
+| | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
+| `CPU-CM48` |  | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1, 1, 0, 0]` | read_register2 |
+| `CPU-CM49.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
+| | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
+| `CPU-CM50` |  | `MEMW[1, 2 * rd, rvd, timestamp + 2, 1, 0, 0]` | write_register |
+| `CPU-CM51` |  | `LOAD[rvd; 0, res, timestamp + 0, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
+| `CPU-CM52` |  | `MEMW[0, res, rv2, timestamp + 1, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
+| `CPU-CM53` |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1, 1, 0, 0]` | 1 - pad |
+
+### System
+
+The interactions with the wider system.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-CS54` | `!EBREAK` |  |
+| | _polynomial:_ `EBREAK = 0` | |
+| `CPU-CS55` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
+
+### Input and output to the ALU
+
+We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, including the appropriate sign/zero extension, depending on `word_instr`.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-CE56` | (`rv1_sign_bit` or `arg2_sign_bit` or `res_sign_bit`) => `word_instr` |  |
+| | _polynomial:_ `(rv1_sign_bit + arg2_sign_bit + res_sign_bit) * (1 - word_instr) = 0` | |
+| `CPU-CE57` | `MSB16[rv1_sign_bit; rv1[1]]` | word_instr |
+| `CPU-CE58` | `arg1[:4]` = `rv1[:2]` |  |
+| | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` | |
+| `CPU-CE59` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_sign_bit` dot `signed` |  |
+| | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_sign_bit * (2^32 - 1) = 0` | |
+| `CPU-CE60` | `MSB16[arg2_sign_bit; rv2[1]]` | word_instr |
+| `CPU-CE61` | `arg2[:4]` = (1 - `STORE` - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT`) dot `imm[0]` |  |
+| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - STORE - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT) * imm[0] = 0` | |
+| `CPU-CE62` | `arg2[4:]` = (1 - `STORE` - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `arg2_sign_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT`) dot `imm[1]` |  |
+| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - STORE - LOAD) * (1 - word_instr) * rv2[2] - (1 - STORE - LOAD) * signed * arg2_sign_bit * (2^32 - 1) - (1 - BEQ - BLT) * imm[1] = 0` | |
+| `CPU-CE63` | `MSB8[res_sign_bit; res[3]]` | word_instr |
+| `CPU-CE64` | `!LOAD` => `rvd[0]` = `res[:4]` |  |
+| | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` | |
+| `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
+| | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_sign_bit * (2^32 - 1)) = 0` | |
+
+### Other constraints
+
+> **Note:** proper ref to IsZero/IsEqual
+
+For [cpu:c:is_equal], refer to the logic of IsZero or IsEqual, in combination with the subtraction of [cpu:c:sub].
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-CO66` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
+| `CPU-CO67` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
+| | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
+| `CPU-CO68` | `BRANCH[next_pc; pc, imm[0], arg1::DWordWL, JALR]` | branch_cond |
+| `CPU-CO69` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
+
+> **Note:** Document the choice to not have a multiplicity column here for padding
+
+## Padding
+
+The CPU can be padded with the following values, which have a corresponding row in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
+
+This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
+
+---
+
+# SHIFT Chip
+
+## Interface
+
+The `SHIFT` chip has the following interface:
+
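+```
+// param in: the value being shifted
+// param shift: the number of bits to shift `in` by
+// param direction: whether to shift left (0) or right (1)
+// param signed: whether to interpret `in` as a signed (1) or unsigned (0) integer
+// param word_instr: whether to execute the SLL/SR* (0) or SLLW/SR*W (1) instruction
+// out shifted: the resulting value
+SHIFT[shifted: DWord; in: DWord, shift: Byte, direction: Bit, signed: Bit, word_instr: Bit]
+```
+
+In other words, writing
+
+$$
+s := \texttt{shift} \bmod \bigl(32 \cdot (2 - \texttt{word\_instr})\bigr),
+$$
+
+the `SHIFT` chip is designed to constrain that
+
+$$
+\texttt{shifted} =
+\begin{cases}
+\texttt{in} \mathbin{\texttt{<<}} s & \text{if } \texttt{direction} = 0, \\
+\texttt{in} \mathbin{\texttt{>>}} s & \text{if } \texttt{direction} = 1 \text{ and } \texttt{signed} = 0, \\
+\texttt{in} \mathbin{\texttt{>>>}} s & \text{if } \texttt{direction} = 1 \text{ and } \texttt{signed} = 1.
+\end{cases}
+$$
+
+Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.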
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `in` | `DWordHL` | The value being shifted |
+| `shift` | `Byte` | Number of bits to shift `in` by. |
+| `direction` | `Bit` | Whether to shift left (0) or right (1). |
+| `signed` | `Bit` | Whether to interpret `in` as a signed integer. |
+| `word_instr` | `Bit` | Whether this is a Word-instruction (1) or not (0). |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `out` | `DWordWL` | `in <</>>/>>> (shift mod (32 * (2 - word_instr)))` |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `is_negative` | `Bit` | Whether `in` is negative |
+| `bit_shift` | `Byte` | Value by which to shift `in` to obtain `X` and `Y` |
+| `zbs` | `Bit` | Whether `bit_shift` is zero (1) or not (0). |
+| `X` | `Half[5]` | scratch variable. |
+| `Y` | `Half[4]` | scratch variable. |
+| `limb_shift` | `Bit[4]` | One-hot vector in which `limb_shift[i] = 1` indicates that $\lfloor \texttt{shift} / 16 \rfloor \equiv i \pmod{s}$, where $s = 2$ when `word_instr = 1` and $s = 4$ otherwise. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `extension` | `Half` | sign extension of `in`. |
+| `left` | `Bit` | Whether to perform a left-shift. |
+| `right` | `Bit` | Whether to perform a right-shift. |
+| `intra_limb_left` | `DWordHL` | `in << (shift % 16)` if `left` |
+| `intra_limb_right` | `DWordHL` | `in >>> (shift % 16)` if `right` and `signed`;\ `in >> (shift % 16)` if `right` and `!signed` |
+| `shifted` | `DWordHL` | `in <</>>/>>> (shift mod (32 * (2 - word_instr)))` |
+
+**Definition of `extension`:**
+```
+extension := 65535 * is_negative
+```
+
+**Definition of `left`:**
+```
+left := μ - direction
+```
+
+**Definition of `right`:**
+```
+right := direction
+```
+
+**Definition of `intra_limb_left`:**
+```
+intra_limb_left (when iter=0) := X[0]
+intra_limb_left (when iter=[1, 3]) := X[i] + Y[i - 1]
+```
+
+**Definition of `intra_limb_right`:**
+```
+intra_limb_right := Y[i] + X[i + 1]
+```
+
+**Definition of `shifted`:**
+```
+shifted := left * Σ_{j=0}^{i} limb_shift[j] * intra_limb_left[i - j] + right * (Σ_{j=0}^{3-i} limb_shift[j] * intra_limb_right[i + j] + extension * Σ_{j=3-i}^{3} limb_shift[j])
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+The `SHIFT` chip is comprised of  variables that are expressed using  columns:
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALFWORD[in[i]]` |
+| `SHIFT-A2` |  | `IS_BYTE[shift]` |
+| `SHIFT-A3` |  | `IS_BIT<direction>` |
+| `SHIFT-A4` |  | `IS_BIT<signed>` |
+| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
+
+## Explanation
+
+This chip has a rather complex design as a result of designing it to fit in as few columns as possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
+
+The chip's design revolves around a two-phase shifting process: (1) shift `in` by `x := shift mod 16` bits, then (2) shift that result by `(shift - x) mod 64` bits (or `mod 32` if `word_instr = 1`). The intermediate value representing the state between the two phases is stored in the scratch variables `X` and `Y`. The definition of `shifted` describes how one can combine the `X`, `Y` and `extension` variables to construct the output value as described using `Half`-limbs. The output variable `out` is equivalent to `shifted`, but expressed using `Word`-limbs.
+
+In the following, we cover how these two phases were designed to complement one another. Here, we start with discussing the _logical_ left/right shift operations only; the modifications required to compute the _arithmetic_ right shift will be discussed at the end.
+
+### First phase
+
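+We zoom in on the first step. Here, we make use of the two lookup operations
+
+- `HWSL[x: Half, y: B4] := (x << y) mod 2^16` (short for "HalfWord Shift Left"), and
+- `HWSLC[x: Half, y: B4] := x >> (16 - y)` (short for "HalfWord Shift Left's Carry").
+
+Note here that one can use these two lookups to compute `out: Half[4] := in << y` as
+
+$$
+\texttt{out}[i] =
+\begin{cases}
+\texttt{HWSL}[\texttt{in}[0], y] & \text{if } i = 0, \\
+\texttt{HWSL}[\texttt{in}[i], y] + \texttt{HWSLC}[\texttt{in}[i-1], y] & \text{if } i \in [1, 3],
+\end{cases}
+$$
+
+as long as `y < 16`. Observing that `HWSL[x, 16 - y] = (x << (16 - y)) mod 2^16` and `HWSLC[x, 16 - y] = x >> y` for `y in [1, 15]`, one can also use these lookups to compute `out := in >> y` as
+
+$$
+\texttt{out}[i] =
+\begin{cases}
+\texttt{HWSLC}[\texttt{in}[i], 16 - y] + \texttt{HWSL}[\texttt{in}[i+1], 16 - y] & \text{if } i \in [0, 2], \\
+\texttt{HWSLC}[\texttt{in}[3], 16 - y] & \text{if } i = 3,
+\end{cases}
+$$
+
+as long as `0 < y < 16`.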
+
+Observe now that the values being looked up are (almost) independent from the direction of the shift: only the shift-amount varies slightly. When we now define
+
+$$
+\texttt{bit\_shift} :=
+\begin{cases}
+\texttt{shift} \bmod 16 & \text{when shifting left}, \\
+(16 - \texttt{shift}) \bmod 16 & \text{when shifting right},
+\end{cases}
+$$
+
+it only takes some rearranging and combining of the values `X[i] := HWSL[in[i], bit_shift]` and `Y[i] := HWSLC[in[i], bit_shift]` to form the limbs of `in <</>> (shift mod 16)`. In the remaining case that `right = 1` and `shift = 0 mod 16`, the limbs of `in <</>> (shift mod 16)` simply match those of `in`.
+
+### Second phase
+
+Since we're operating on 16-bit limbs, all the limbs in `in <</>> shift` must also occur somewhere in `in <</>> (shift mod 16)`. The number of full limbs we still need to shift is determined by the fifth and sixth least significant bits of `shift`. With `limb_shift` containing a unary decoding of the integer represented by these two bits, we find that the intermediate value needs to be shifted over by `i` limbs (to the `left` or `right`) when `limb_shift[i] = 1`. These things combined yield `shifted`'s definition.
+
+Of course, when `word_instr = 1` and, thus, only `shift mod 32` should be considered, the bit-mask for the lookup constraining `limb_shift` is adjusted appropriately (see [shift:c:limb_shift_lookup]).
+
+### Arithmetic right shift
+
+Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `extension` is constrained to contain a repetition of `in`'s most significant bit. Copies of this variable are used for any full limbs shifted in when `right = signed = 1`. Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of `in >>> (shift mod 16)` as the appropriate intermediate.
+
+## Constraints
+
+First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that `right = 1` and `shift = 0 mod 16`.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C3` | `AND_BYTE[bit_shift; shift, 15]` | left |
+| `SHIFT-C4` | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
+| `SHIFT-C5` | `IsZero<zbs; bit_shift>` | μ |
+
+Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose `in >>/>>> (shift mod 16)`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
+
+The case of `left`-shifting and `bit_shift = 0` will be used for padding rows. To prevent unnecessary lookups in padding rows, we override `X[i] := in[i]` and `Y[i] := 0` here.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C6.i` | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C7.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
+| | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
+| `SHIFT-C8` |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
+| `SHIFT-C9` |  | `zbs` => `X[4]` = 0 |  |
+| | | _polynomial:_ `zbs * X[4] = 0` | |
+| `SHIFT-C10.i` | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C11.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
+
+### Full-limb shifting
+
+Next, we constrain that `limb_shift` is a proper unary encoding of the fifth (and, if `word_instr = 0`, sixth) least significant bit of `shift`. For this to be the case, three requirements must be satisfied:
+
+1. *unary(0)*: `limb_shift[i] in {0, 1}` for `i in [0, 3]`,
+2. *unary(1)*: `limb_shift[i] = 1` for exactly one `i`, and
+3. *proper encoding*: `limb_shift[i] = 1 <=> 1/16 * (shift & (48 - 32 * word_instr)) = i`.
+
+The first requirement is enforced by constraint [shift:c:limb_shift_is_bit]. To construct a constraint for the second and third requirement, observe that
+
+$$
+\tfrac{1}{16} \cdot \bigl(\texttt{shift} \mathbin{\&} (48 - 32 \cdot \texttt{word\_instr})\bigr) \in
+\begin{cases}
+\{0, 1, 2, 3\} & \text{if } \texttt{word\_instr} = 0, \\
+\{0, 1\} & \text{if } \texttt{word\_instr} = 1.
+\end{cases}
+$$
+
+Observe moreover that, assuming *unary(0)*, the expression
+
+$$
+\tfrac{1}{16} \cdot \Bigl(1 + \sum_{i=0}^{3} (16i - 1) \cdot \texttt{limb\_shift}[i]\Bigr)
+$$
+
+can evaluate to `i` if and only if `limb_shift[i] = 1`, while the others are `0`. This means that the relation
+
+$$
+1 + \sum_{i=0}^{3} (16i - 1) \cdot \texttt{limb\_shift}[i] = \texttt{shift} \mathbin{\&} (48 - 32 \cdot \texttt{word\_instr})
+$$
+
+enforces both *unary(1)* and *proper encoding*. This is the exact relation [shift:c:limb_shift_lookup] enforces.
+
+Hereafter, one must only check that `out` is the proper cast of `shifted` into a `DWordWL`.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C12.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `SHIFT-C13` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
+| `SHIFT-C14.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+| | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
+
+### Miscellaneous
+
+*Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
+
+### Lookups
+
+This chip adds the following interaction to the lookup.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C15` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+### is_negative
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
+
+### left_flag
+
+| Tag | Description |
+|-----|-------------|
+| `SHIFT-C1` | `direction` => `μ` = 1 |
+| | _polynomial:_ `direction * (1 - μ) = 0` |
+
+---
+
+# BRANCH Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | The current pc, used as base address when `!JALR` |
+| `offset` | `Word` | The offset from the base address to jump to |
+| `register` | `DWordWL` | The base address to use when `JALR` |
+| `JALR` | `Bit` | Selects between `pc` and `register` as base address, needed for the `JALR` instruction |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `next_pc_high` | `Half[3]` | The upper part of the next pc |
+| `next_pc_low` | `Byte[2]` | The lower part of the next pc |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `unmasked_low_byte` | `Byte` | The low byte of the next pc, before masking the LSB. Used to constrain the raw addition. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `next_pc_unmasked` | `DWordWL` | The combination of `next_pc_high`, `next_pc_low[1]` and `unmasked_low_byte` to constrain the addition. This is the computed value for the next pc, before masking off the LSB as required by the ISA. |
+| `next_pc` | `DWordWL` | The computed next pc, after masking off the LSB as required by the ISA. |
+
+**Definition of `next_pc_unmasked`:**
+```
+next_pc_unmasked (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + unmasked_low_byte[0]
+next_pc_unmasked (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
+```
+
+**Definition of `next_pc`:**
+```
+next_pc (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + next_pc_low[0]
+next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+The `BRANCH` chip is comprised of  variables that are expressed using  columns:
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `BRANCH-A1.i` | i ∈ [0, 1] | `pc` is range checked, `IS_WORD[pc[i]]` |
+| `BRANCH-A2` |  | `offset` is range checked, `IS_WORD[offset]` |
+| `BRANCH-A3.i` | i ∈ [0, 1] | `register` is range checked, `IS_WORD[register[i]]` |
+| `BRANCH-A4` |  | `IS_BIT<JALR>` |
+
+## Constraints
+
+> **Note:** Check correspondence with CPU for passing in `offset` as word or dword
+
+We constrain `next_pc` to be `base_address + offset`, where `base_address` equals `pc` when `JALR = 0` and `register` otherwise.
+
+The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
+| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
+| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
+| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte[0], 254]` | μ |
+| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
+
+This chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+---
+
+# MEMW Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `is_register` | `Bit` | Whether the address represents a register index |
+| `base_address` | `DWordWL` | The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is |
+| `value` | `BaseField[8]` | The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `write2` | `Bit` | Whether to write exactly 2 values |
+| `write4` | `Bit` | Whether to write exactly 4 values |
+| `write8` | `Bit` | Whether to write exactly 8 values |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old` | `BaseField[8]` | The old value written at `base_address`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `address_add` | `DWordHL[7]` | `address_add[i] = base_address + i + 1` |
+| `old_timestamp` | `DWordWL[8]` | The timestamp at which the address was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `w2` | `Bit` | writing at least 2 bytes |
+| `w4` | `Bit` | writing at least 4 bytes |
+| `μ_sum` | `Bit` |  |
+
+**Definition of `w2`:**
+```
+w2 := write2 + write4 + write8
+```
+
+**Definition of `w4`:**
+```
+w4 := write4 + write8
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_read + μ_write
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_read` | `Bit` | Whether we are performing a read (and hence return `old`) |
+| `μ_write` | `Bit` | Whether we are performing a write (and hence do not return `old`) |
+
+The `MEMW` chip is comprised of  variables that are expressed using  columns:
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `MEMW-A2` |  | `IS_BIT<write2>` |
+| `MEMW-A3` |  | `IS_BIT<write4>` |
+| `MEMW-A4` |  | `IS_BIT<write8>` |
+| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
+| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. These properties are necessary for the consistency of the system as a whole, and therefore we document them here, keeping the type information as a reading aid.
+
+## Constraints
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
+| `MEMW-C2` |  | `w2` => `μ_sum` |  |
+| | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
+| `MEMW-C3` |  | `ADD<address_add[0]::DWordWL; base_address, 1>` | w2 |
+| `MEMW-C4.i` | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
+| `MEMW-C5.i` | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
+| `MEMW-C6.i` | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
+| `MEMW-C7` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `MEMW-C8` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `MEMW-C9.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `MEMW-C10.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+
+As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
+
+We additionally check that the address does not overflow for more significant bytes of the access.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW-CR11` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
+| `MEMW-CR12` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
+| `MEMW-CR13` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
+
+The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW-CM14` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM15` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM16` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM17` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM18.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM19.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM20.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM21.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+
+This chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW-CO22` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
+| `MEMW-CO23` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+
+## Future optimization ideas
+
+- Fast path for aligned memory access where all bytes have the same old timestamp
+- MEMB chip that does a one-byte write to remove `old_timestamp` from here (uncertain tradeoffs)
+- Compute `base_address[1] + 1` once and have high words of `address_add` as Words
+- Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1)
+- Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALFWORD` lookups may make some GKR things faster if there are known zeroes.
+
+---
+
+# LT Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs` | `DWordHHW` | The left operand |
+| `rhs` | `DWordHHW` | The right operand |
+| `signed` | `Bit` | whether to interpret `lhs` and `rhs` as signed integers (1) or not (0) |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lt` | `Bit` | Whether $`lhs` < `rhs`$, taking `signed` into account |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs_sub_rhs` | `DWordHL` | $`lhs` - `rhs`$ |
+| `lhs_msb` | `Bit` | The most significant bit of `lhs` |
+| `rhs_msb` | `Bit` | The most significant bit of `rhs` |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `carry` | `Bit[2]` | The carry for adding `lhs_sub_rhs` back to `rhs` |
+| `unsigned_lt` | `Bit` | Whether $`lhs` < `rhs`$, as unsigned integers |
+
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * (rhs[0] + (lhs_sub_rhs::DWordWL)[0] - lhs[0])
+carry (when iter=1) := 2^-32 * ((rhs::DWordWL)[1] + (lhs_sub_rhs::DWordWL)[1] + carry[0] - (lhs::DWordWL)[1])
+```
+
+**Definition of `unsigned_lt`:**
+```
+unsigned_lt := carry[1]
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+The `LT` chip is comprised of  variables that are expressed using  columns:
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `LT-A1` |  | `IS_WORD[lhs[0]]` |
+| `LT-A2` |  | `IS_WORD[rhs[0]]` |
+| `LT-A3` |  | `IS_BIT<signed>` |
+
+We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
+
+## Constraints
+
+We first constrain that all variables correspond to their definition. For the defining constraint of `lt`, [lt:c:lt], observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
+
+We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a positive one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
+
++ `dash(A) and B and (a < b)` + `A and dash(B) and (a < b)` + `A and B and (a < b)` + `dash(A) and dash(B) and (a < b)`
+
+The first case is evidently false, while the second case simplifies to `A and dash(B)`. For the third and fourth case, observe that when `A = B`, the `<` relation is preserved by the modular correspondence between `[-2^(63), 2^(63))` and `[0, 2^(64))`. Importantly, this modular correspondence is merely a reinterpretation of the bits or values of `a` and `b`, due to the representation in two's complement. Hence, we can introduce the value `C = unsigned_lt`, which accurately represents the relation `a < b` when `A = B`.
+
+Combining our three remaining cases, we obtain the boolean formula `A dash(B) or A B C or dash(A) dash(B) C`. Since the cases are disjoint, this can be computed with the binary-valued polynomial `P(A, B, C) = A (1 - B) + A B C + (1 - A) (1 - B) C`.
+
+The polynomial `P` can be simplified to a total degree of two. We claim that the polynomial `Q(A, B, C) = A (1 - B) + A C + (1 - B) C` is, for the purposes of this chip, equivalent to `P`. An exhaustive check shows that `P(A, B, C) != Q(A, B, C)` only for the triple `(A, B, C) = (1, 0, 1)`. This is, however, impossible due to the correctness of `ADD`. In more detail, if we let `s` be the (range-checked) difference `a - b` (so the equivalent of the `lhs_sub_rhs` column), and `x'` denote the most significant word of a variable `x`, we need `C dot 2^32 + a' = b' + s' + carry[0]`, by the definition of `carry`. However, the left hand side of this is at least `3 dot 2^31`, as `(A, C) = (1, 1)`, and the right hand side is at most `(2^31 - 1) + (2^32 - 1) + 1 = 3 dot 2^31 - 1`, as `B = 0`. Therefore, we can use `Q` to constrain `lt` when `signed = 1`.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
+| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
+| `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
+| | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
+| `LT-C4` | `IS_HALFWORD[lhs[1]]` | μ |
+| `LT-C5` | `IS_HALFWORD[rhs[1]]` | μ |
+
+And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
+| `LT-C7.i` | i ∈ [0, 3] | `IS_HALFWORD[lhs_sub_rhs[i]]` | μ |
+
+The chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+---
+
+# MUL Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs` | `DWordHL` | the left hand operand. |
+| `lhs_signed` | `Bit` | whether to interpret `lhs` as a signed integer (1) or not (0). |
+| `rhs` | `DWordHL` | the right hand operand. |
+| `rhs_signed` | `Bit` | whether to interpret `rhs` as a signed integer (1) or not (0). |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lo` | `DWordHL` | the lower limbs of the (extended) multiplication result |
+| `hi` | `DWordHL` | the upper limbs of the (extended) multiplication result |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs_is_negative` | `Bit` | whether `lhs` is negative (1) or not (0) |
+| `rhs_is_negative` | `Bit` | whether `rhs` is negative (1) or not (0) |
+| `raw_product` | `B51[4]` | raw multiplication output |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs_ext` | `Half[8]` | sign-extended value of `lhs` |
+| `rhs_ext` | `Half[8]` | sign-extended value of `rhs` |
+| `res` | `QuadWL` | concatenation of `lo` and `hi`. |
+| `carry` | `B20[4]` | carry values |
+| `μ_sum` | `BaseField` | sum of multiplicities |
+
+**Definition of `lhs_ext`:**
+```
+lhs_ext (when iter=[0, 3]) := lhs[i]
+lhs_ext (when iter=[4, 7]) := 65535 * lhs_is_negative
+```
+
+**Definition of `rhs_ext`:**
+```
+rhs_ext (when iter=[0, 3]) := rhs[i]
+rhs_ext (when iter=[4, 7]) := 65535 * rhs_is_negative
+```
+
+**Definition of `res`:**
+```
+res (when iter=[0, 1]) := (lo::DWordWL)[i]
+res (when iter=[2, 3]) := (hi::DWordWL)[i - 2]
+```
+
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * (raw_product[0] - res[0])
+carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_lo + μ_hi
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_lo` | `BaseField` |  |
+| `μ_hi` | `BaseField` |  |
+
+The `MUL` chip is comprised of  variables that are expressed using  columns:
+
+`mat(delim: , top; bottom)` }
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MUL-A1.i` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` |
+| `MUL-A2.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
+
+The following range checks are assumed to be performed/enforced outside of this chip:
+
+## Constraints
+
+### Overview
+
+When `lhs` and `rhs` are _unsigned_ integers, computing their product `mod 2^128` comes down to evaluating $ (sum_(j=0)^3 2^(16j) dot `lhs`_j) dot (sum_(i=0)^3 2^(16i) dot `rhs`_i) mod 2^128. $ If `lhs` and `rhs` are signed instead, the computation remains nearly identical: based on their signs, one must either zero or one-extend `lhs` and `rhs` --- forming `lhs_ext` and `rhs_ext` respectively --- and compute their product `mod 2^128`: $ (sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128. $ where `lhs_ext` and `rhs_ext` are treated as _unsigned_ integers. Note that by setting the extension limbs of `lhs` and/or `rhs` to `0` when the integer is (i) unsigned or (ii) signed and non-negative, this second formula still applies. For the purposes of constraining the multiplication operation, we rewrite this formula as
+
+$ &(sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128 \ &equiv sum_(j=0)^7 sum_(i=0)^7 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(triangle, equiv) sum_(j=0)^7 sum_(i=0)^(7-j) 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(square, equiv) sum_(j=0)^7 sum_(i=j)^(7) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &stackrel(penta, equiv) sum_(i=0)^7 sum_(j=0)^(i) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &equiv sum_(i=0)^3 sum_(k=0)^1 sum_(j=0)^(2i+k) 2^(16(2i+k)) dot `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 \ &equiv sum_(i=0)^3 2^(32i) dot sum_(k=0)^1 2^(16k) dot sum_(j=0)^(2i+k) `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 $ where at step - `triangle` we can ignore `i > 7-j`, since that makes `2^(16(i+j)) equiv 0 mod 2^128`, - `square` we rewrite the second summation such that `i` iterates from `j` to 7, rather than `0` to `7-j`, and - `penta` we swap the sums.
+
+We let `raw_product` capture the second summation in this last formula (see [mul:c:raw_product]). By construction, `raw_product[i] < 2^51` for all `i in [0, 3]`, far exceeding the 32 bits that fit in a single `Word`-limb. What remains then is to reduce each limb of `raw_product` `mod 2^32`, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
+
+This reduce-and-carry operation is constrained by [mul:c:range_lo]/[mul:c:range_hi] and [mul:c:carry], combined with `carry`'s definition. [mul:c:carry] and `carry`'s definition enforce that $ forall i in [0, 3]: `raw_product`_i + `carry`_(i-1) - `res`_i in { k dot 2^32 | k in [0, 2^20) } $ with `carry[-1] = 0` for simplicity. In other words: `res[i] equiv raw_product[i] + carry[i-1] (mod 2^32)`. With [mul:c:range_lo]/[mul:c:range_hi] forcing `res[i] < 2^32`, `res[i]` can only assume one value: `(raw_product[i] + carry[i-1]) mod 2^32`.
+
+*Note*: one may have observed that [mul:c:carry] requires `carry[i] in [0, 2^20)`, while no limb of a valid carry value would ever exceed `2^19`. This is indeed the case. However, there is some slack in how tightly one has to constrain the `carry` values. In fact, in this situation it suffices to assert that `carry[i] < p / 2^32 approx 2^31`, where `p` denotes the field's modulus. Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
+
+### Definitions
+
+We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
+| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
+| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
+| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
+| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
+
+### Product
+
+[mul:c:raw_product] defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
+| | | _polynomial:_ `Σ_{k=0}^{1} 2^(16 * k) * Σ_{j=0}^{2 * i + k} lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
+
+### Lookup
+
+The `MUL` chip contributes the following to the lookup:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
+| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+## Notes
+
+- `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere.
+
+As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
+
+---
+
+# DVRM Chip
+
+//  chip = load_chip("src/dvrm.toml", config)
+
+*placeholder chapter: WIP*
+
+---
+
+# LOAD Chip
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `base_address` | `DWordWL` | The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `read2` | `Bit` | Whether to read exactly 2 bytes |
+| `read4` | `Bit` | Whether to read exactly 4 bytes |
+| `read8` | `Bit` | Whether to read exactly 8 bytes |
+| `signed` | `Bit` | Whether to sign-extend (1) or zero-extend (0) |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `res` | `DWordBL` | The result of reading (up to) 8 bytes from `base_address`, extended corresponding to `signed`. |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `sign_bit` | `Bit` | The sign bit extracted from the bytes retrieved from memory |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `read1` | `Bit` | Whether to read exactly 1 byte |
+
+**Definition of `read1`:**
+```
+read1 := μ - read2 - read4 - read8
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+The `LOAD` chip is comprised of  variables that are expressed using  columns:
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `LOAD-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `LOAD-A2` |  | `IS_BIT<signed>` |
+| `LOAD-A3` |  | `IS_BIT<read2>` |
+| `LOAD-A4` |  | `IS_BIT<read4>` |
+| `LOAD-A5` |  | `IS_BIT<read8>` |
+| `LOAD-A6` |  | `IS_BIT<read2 + read4 + read8>` |
+| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+## Constraints
+
+The chip delegates the actual memory interaction to the `MEMW` chip, and ensures correctness of the requested sign/zero extension. The output `res` is correctly range-checked as long as the memory contents are.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `LOAD-C1` |  | `read2` + `read4` + `read8` => `μ` |  |
+| | | _polynomial:_ `(read2 + read4 + read8) * (1 - μ) = 0` | |
+| `LOAD-C2` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
+| `LOAD-C3` |  | `MSB8[sign_bit; res[0]]` | read1 |
+| `LOAD-C4` |  | `MSB8[sign_bit; res[1]]` | read2 |
+| `LOAD-C5` |  | `MSB8[sign_bit; res[3]]` | read4 |
+| `LOAD-C6.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
+| `LOAD-C7.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read4 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
+| `LOAD-C8` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read2 - read4 - read8) * (res[1] - signed * sign_bit * 255) = 0` | |
+
+The chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+---
+
+# ECALL Chips
+
+## HALT chip
+
+### Columns
+
+The  chip leverages  variable, spanning  columns:
+
+### Assumptions
+
+It is assumed the input is range checked:
+
+### Constraints
+
+The `HALT` chip: + makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), + writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and + sets `pc` equal to `1` ([halt:c:pc]). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`; the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
+
+Observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there.
+
+#### Lookup
+
+The HALT chip contributes the following interaction to the lookup-argument:
+
+*Note*: `93` is the system call number corresponding to `sys_exit`.
+
+### Padding
+
+This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
+
+---
+
+# BITWISE Chips
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `X` | `Byte` |  |
+| `Y` | `Byte` |  |
+| `Z` | `B4` |  |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `AND` | `Byte` | the binary AND of `X` and `Y` |
+| `OR` | `Byte` | the binary OR of `X` and `Y` |
+| `XOR` | `Byte` | the binary XOR of `X` and `Y` |
+| `MSB8` | `Bit` | the most significant bit of `X` |
+| `MSB16` | `Bit` | the most significant bit of `Y` |
+| `ZERO` | `Bit` | whether $`X` = 0 and `Y` = 0$ |
+| `SLL` | `Half` | `X\|\|Y` logically left-shifted by `Z`: $((`X` + 256`Y`) `<<` `Z`) mod 2^16$ |
+| `SLLC` | `Half` | `X\|\|Y` logically right-shifted by `16 - Z`: $(`X` + 256`Y`) `>>` (16 - `Z`)$ |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_AND` | `BaseField` |  |
+| `μ_OR` | `BaseField` |  |
+| `μ_XOR` | `BaseField` |  |
+| `μ_MSB8` | `BaseField` |  |
+| `μ_MSB16` | `BaseField` |  |
+| `μ_ZERO` | `BaseField` |  |
+| `μ_IS_BYTE` | `BaseField` |  |
+| `μ_IS_HALF` | `BaseField` |  |
+| `μ_IS_B20` | `BaseField` |  |
+| `μ_HWSL` | `BaseField` |  |
+| `μ_HWSLC` | `BaseField` |  |
+
+The `BITWISE` chip is comprised of the variables listed above. Of these, the _input_ and _output_ variables (11 in total) are precomputed.
+
+*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
+
+## Lookup
+
+This chip adds the following interactions to the lookup:
+
+## Areas of Optimization
+
+The following ideas may prove to be optimizations for the `BITWISE` chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicitly range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, `ZERO`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`) lookups in separate tables. + Combine `HWSL` and `HWSLC` into a single lookup.
+
+## Constraints
+
+### contributions
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `BITWISE-C1` | `AND_BYTE[AND; X, Y]` | -μ_AND |
+| `BITWISE-C2` | `OR_BYTE[OR; X, Y]` | -μ_OR |
+| `BITWISE-C3` | `XOR_BYTE[XOR; X, Y]` | -μ_XOR |
+| `BITWISE-C4` | `MSB8[MSB8; X]` | -μ_MSB8 |
+| `BITWISE-C5` | `MSB16[MSB16; X + 256 * Y]` | -μ_MSB16 |
+| `BITWISE-C6` | `ZERO[ZERO; X + 256 * Y]` | -μ_ZERO |
+| `BITWISE-C7` | `IS_BYTE[X]` | -μ_IS_BYTE |
+| `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
+| `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
+| `BITWISE-C10` | `HWSL[SLL; X + 256 * Y, Z]` | -μ_HWSL |
+| `BITWISE-C11` | `HWSLC[SLLC; X + 256 * Y, Z]` | -μ_HWSLC |
\ No newline at end of file
diff --git a/docs/spec/variables.md b/docs/spec/variables.md
new file mode 100644
index 000000000..5181a6337
--- /dev/null
+++ b/docs/spec/variables.md
@@ -0,0 +1,5 @@
+# Variables
+
+While this VM operates on 64-bit words, the proving system's base field has fewer than `2^64` elements available and thus cannot represent all words natively. To this end, we introduce the concept of "variables" as an abstraction layer on top of the VM's field elements. The following table lists all variable types used in this VM.
+
+columns: (auto, 1fr, auto), inset: 7pt, align: (top+left, top+left, top+center, ), table.header([*Name*], [*Description*], [*\*]), ..for type in config.variables.types { ([], [], [.subtypes.len()]) },
\ No newline at end of file
diff --git a/scripts/README.md b/scripts/README.md
index 14b6acb42..820acfc62 100644
--- a/scripts/README.md
+++ b/scripts/README.md
@@ -17,7 +17,7 @@ Converts the Typst specification to Markdown format.
 ```bash
 cd scripts
 source .venv/bin/activate
-python typst_to_md.py                          # Output to ../spec/
+python typst_to_md.py                          # Output to ../docs/spec/
 python typst_to_md.py -o ../others/spec_md     # Output to specific directory
 ```
 
diff --git a/scripts/typst_to_md.py b/scripts/typst_to_md.py
index ecc17f104..d1864428e 100644
--- a/scripts/typst_to_md.py
+++ b/scripts/typst_to_md.py
@@ -633,7 +633,7 @@ def main():
 
     output_dir = args.output_dir
     if output_dir is None:
-        output_dir = spec_dir
+        output_dir = script_dir / "../docs/spec"
     output_dir = output_dir.resolve()
 
     if not spec_dir.exists():

From d5000f0d432c297dae38228c2ffa436687a20ac2 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Tue, 3 Feb 2026 10:53:16 +0100
Subject: [PATCH 052/105] spec: `SIGN` (#279)

* spec: introduce SIGN template

* Update spec/src/sign.toml

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 spec/book.typ      |  1 +
 spec/sign.typ      | 42 ++++++++++++++++++++++++++++++++++++++++++
 spec/src/sign.toml | 42 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 85 insertions(+)
 create mode 100644 spec/sign.typ
 create mode 100644 spec/src/sign.toml

diff --git a/spec/book.typ b/spec/book.typ
index b194b7f70..2fd6d8767 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -10,6 +10,7 @@
     ("memory.typ", [Memory argument], <memory>),
     ("variables.typ", [Variables], <vars>),
     ("is_bit.typ", [IS_BIT template], <isbit>),
+    ("sign.typ", [SIGN template], <sign>),
     ("add.typ", [ADD/SUB template], <add>),
     ("decode.typ", [DECODE table], <decode>),
     ("cpu.typ", [CPU chip], <cpu>),
diff --git a/spec/sign.typ b/spec/sign.typ
new file mode 100644
index 000000000..6f8993f53
--- /dev/null
+++ b/spec/sign.typ
@@ -0,0 +1,42 @@
+#import "/book.typ": book-page
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": render_chip_column_table,   render_chip_assumptions, render_constraint_table
+
+
+#let config = load_config()
+#let chip = load_chip("src/sign.toml", config)
+#show: book-page(chip.name)
+
+#let sign = raw(chip.name)
+
+#let highlighted_code(code) = {
+  box(
+    inset: (left: 4pt, right: 4pt), 
+    outset: (top: 4pt, bottom: 4pt), 
+    radius: 2pt,
+    fill: luma(230), 
+    raw(code))
+}
+
+#sign is a constraint template that is used to extract a `Half`word's sign.
+
+== Interface
+The #sign constraint template has the following interface:
+#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("SIGN<sign; X, signed>"))
+It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are $1$, and $0$ otherwise.
+
+== Variables
+The #sign template operates on three variables:
+#render_chip_column_table(chip, config)
+
+== Assumptions
+The #sign template operates on the following assumptions:
+#render_chip_assumptions(chip, config)
+
+== Constraints
+It takes only two constraints to compute the `sign` of `X`, given whether `X` represents a `signed` value or not. 
+When $#`signed` = 1$, the sign of `X` is equal to its most significant bit. 
+This value is extracted in @sign:c:sign_if_signed.
+If `X` is unsigned (i.e., $#`signed` = 0$), its sign is always $0$.
+This is constrained by @sign:c:sign_if_unsigned.
+#render_constraint_table(chip, config)
diff --git a/spec/src/sign.toml b/spec/src/sign.toml
new file mode 100644
index 000000000..ca799e0cc
--- /dev/null
+++ b/spec/src/sign.toml
@@ -0,0 +1,42 @@
+name = "SIGN"
+
+[[variables.input]]
+name = "X"
+type = "Half"
+desc = "Value for which to extract its sign."
+
+[[variables.input]]
+name = "signed"
+type = "Bit"
+desc = "Whether `X` represents a signed value (1) or not (0)"
+
+[[variables.output]]
+name = "sign"
+type = "Bit"
+desc = "Sign of `X`"
+
+
+[[assumptions]]
+desc = "`IS_HALF[X]`"
+
+[[assumptions]]
+desc = "`IS_BIT<signed>`"
+
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "MSB16"
+input = ["X"]
+output = "sign"
+multiplicity = "signed"
+ref = "sign:c:sign_if_signed"
+
+[[constraints.all]]
+kind = "arith"
+constraint = "$not#`signed` => #`sign` = 0$"
+poly = ["*", ["not", "signed"], "sign"]
+ref = "sign:c:sign_if_unsigned"
+

From 485b93ff81d752115387790688220195fcdcd907 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Tue, 3 Feb 2026 10:57:16 +0100
Subject: [PATCH 053/105] spec: drop `IsZero` template (#278)

---
 spec/cpu.typ        | 5 ++---
 spec/src/shift.toml | 4 ++--
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/spec/cpu.typ b/spec/cpu.typ
index 0afa75f62..9036593ac 100644
--- a/spec/cpu.typ
+++ b/spec/cpu.typ
@@ -76,9 +76,8 @@ including the appropriate sign/zero extension, depending on `word_instr`.
 #render_constraint_table(chip, config, groups: "ext")
 
 === Other constraints
-
-#rj[proper ref to IsZero/IsEqual]
-For @cpu:c:is_equal, refer to the logic of IsZero or IsEqual, in combination with the subtraction of @cpu:c:sub.
+For @cpu:c:is_equal, note that @cpu:c:sub sets `res` to be the difference between `arg1` and `arg2` whenever `BEQ` is $1$.
+Given that this difference is $0$ when both are equal, @cpu:c:is_equal ensures `is_equal` is set to $1$ if and only if $#`arg1` = #`arg2`$ and `BEQ` is set.
 
 #render_constraint_table(chip, config, groups: "misc")
 
diff --git a/spec/src/shift.toml b/spec/src/shift.toml
index 4b7044e7d..591efb839 100644
--- a/spec/src/shift.toml
+++ b/spec/src/shift.toml
@@ -198,8 +198,8 @@ ref = "shift:c:bit_shift_if_right"
 multiplicity = "right"
 
 [[constraints.bit_shift]]
-kind = "template"
-tag = "IsZero"
+kind = "interaction"
+tag = "ZERO"
 input = ["bit_shift"]
 output = "zbs"
 ref = "shift:c:zbs"

From 9bb2eed3c2d636db21dcad84dacf04bbe028928f Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Tue, 3 Feb 2026 11:03:05 +0100
Subject: [PATCH 054/105] spec: fix header levels (#264)

* spec: offset headers in PDF

* spec: decrement header levels for chip descriptions

* spec: move heading offset to ebook.typ
---
 spec/add.typ     | 10 +++++-----
 spec/bitwise.typ |  6 +++---
 spec/branch.typ  |  8 ++++----
 spec/cpu.typ     | 20 ++++++++++----------
 spec/decode.typ  | 10 +++++-----
 spec/ebook.typ   |  1 +
 spec/ecall.typ   | 12 ++++++------
 spec/is_bit.typ  |  8 ++++----
 spec/load.typ    |  8 ++++----
 spec/lt.typ      |  8 ++++----
 spec/memory.typ  | 18 +++++++++---------
 spec/memw.typ    |  8 ++++----
 spec/mul.typ     | 18 +++++++++---------
 spec/shift.typ   | 24 ++++++++++++------------
 spec/sign.typ    |  8 ++++----
 15 files changed, 84 insertions(+), 83 deletions(-)

diff --git a/spec/add.typ b/spec/add.typ
index 981a0ccb1..241ea8621 100644
--- a/spec/add.typ
+++ b/spec/add.typ
@@ -20,14 +20,14 @@
 
 #add is a constraint template that is used to assert that $#`sum` = #`lhs` + #`rhs` mod 2^64$, under the condition that `cond` is non-zero.
 
-== Notation
+= Notation
 The #add constraint template has the following interface:
 #block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => ADD<sum; lhs, rhs>"))
 where `cond` is any value described by an expression _of degree at most $1$_.
 #highlighted_code("ADD<sum; lhs, rhs>") can be used to denote the _unconditional_ application of the #add template to `lhs`, `rhs`, and `sum`.
 
 #let sub = raw("SUB")
-=== #sub
+== #sub
 For ease of notation, we moreover introduce the #sub constraint template.
 Its interface
 #block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => SUB<diff; lhs, rhs>"))
@@ -36,12 +36,12 @@ maps onto the #add template as
 It constrains that $#`diff` = #`lhs` - #`rhs` mod 2^64$ when the expression `cond` is non-zero.
 As with #add, #highlighted_code("SUB<diff; lhs, rhs>") can be used to denote the _unconditional_ application of the template.
 
-== Variables
+= Variables
 #render_chip_column_table(chip, config)
 
-== Assumptions
+= Assumptions
 #render_chip_assumptions(chip, config)
 
-== Constraints
+= Constraints
 This template introduces the following constraints
 #render_constraint_table(chip, config)
diff --git a/spec/bitwise.typ b/spec/bitwise.typ
index 9b5b4a638..36fe3b6e0 100644
--- a/spec/bitwise.typ
+++ b/spec/bitwise.typ
@@ -15,7 +15,7 @@
 
 #show: book-page(chip.name)
 
-== Columns
+= Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
 #let nr_precomputed = ("input", "output").map(c => chip.variables.at(c)).flatten().len()
@@ -27,11 +27,11 @@ Of these, the _input_ and _output_ variables (#nr_precomputed in total) are prec
 *Note*: This table contains one row for every possible value of `(X, Y, Z)`.
 As such, it has length $2^8 dot 2^8 dot 2^4 = 2^(20)$.
 
-== Lookup
+= Lookup
 This chip adds the following interactions to the lookup:
 #render_constraint_table(chip, config)
 
-== Areas of Optimization
+= Areas of Optimization
 The following ideas may prove to be optimizations for the #bitwise chip:
 + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. 
   When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`.
diff --git a/spec/branch.typ b/spec/branch.typ
index f448f2da4..3e944ca63 100644
--- a/spec/branch.typ
+++ b/spec/branch.typ
@@ -14,18 +14,18 @@
 
 #show: book-page(chip.name)
 
-== Columns
+= Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
 
 The `BRANCH` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
 #render_chip_column_table(chip, config)
 
-== Assumptions
+= Assumptions
 
 #render_chip_assumptions(chip, config)
 
-== Constraints
+= Constraints
 
 #rj[Check correspondence with CPU for passing in `offset` as word or dword]
 We constrain `next_pc` to be $#`base_address` + #`offset`$,
@@ -37,7 +37,7 @@ The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed impli
 This chip contributes the following to the lookup argument.
 #render_constraint_table(chip, config, groups: "output")
 
-== Padding
+= Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
diff --git a/spec/cpu.typ b/spec/cpu.typ
index 9036593ac..ed6126388 100644
--- a/spec/cpu.typ
+++ b/spec/cpu.typ
@@ -14,24 +14,24 @@
 
 #show: book-page(chip.name)
 
-== Columns
+= Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
 
 The `CPU` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
 #render_chip_column_table(chip, config)
 
-== Assumptions
+= Assumptions
 #render_chip_assumptions(chip, config)
 
-== Constraints
+= Constraints
 First, we perform a decoding lookup for the current PC.
 
 #render_constraint_table(chip, config, groups: "decode")
 
 #rj[All casts for interactions will have to be reviewed once other chip interfaces stabilise]
 
-=== Range checks
+== Range checks
 
 We constrain all columns to have the appropriate ranges.
 The flags and register indices looked up from the decoding need to be checked,
@@ -46,13 +46,13 @@ The ranges of the other auxiliary columns are enforced through later constraints
 
 #render_constraint_table(chip, config, groups: "range")
 
-=== ALU
+== ALU
 
 The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
 
 #render_constraint_table(chip, config, groups: "alu")
 
-=== Memory
+== Memory
 
 The interactions with the memory, both for register loading and storing, as for `LOAD` and `STORE` instructions are handled.
 Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs.
@@ -62,20 +62,20 @@ to ensure the access is disjoint with the `pc` read into `rv1` as part of the `A
 
 #render_constraint_table(chip, config, groups: "mem")
 
-=== System
+== System
 
 The interactions with the wider system.
 
 #render_constraint_table(chip, config, groups: "sys")
 
-=== Input and output to the ALU
+== Input and output to the ALU
 
 We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values,
 including the appropriate sign/zero extension, depending on `word_instr`.
 
 #render_constraint_table(chip, config, groups: "ext")
 
-=== Other constraints
+== Other constraints
 For @cpu:c:is_equal, note that @cpu:c:sub sets `res` to be the difference between `arg1` and `arg2` whenever `BEQ` is $1$.
 Given that this difference is $0$ when both are equal, @cpu:c:is_equal ensures `is_equal` is set to $1$ if and only if $#`arg1` = #`arg2`$ and `BEQ` is set.
 
@@ -83,7 +83,7 @@ Given that this difference is $0$ when both are equal, @cpu:c:is_equal ensures `
 
 #rj[Document the choice to not have a multiplicity column here for padding]
 
-== Padding
+= Padding
 
 The CPU can be padded with the following values, which have a corresponding row
 in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
diff --git a/spec/decode.typ b/spec/decode.typ
index 586625226..87f6083f5 100644
--- a/spec/decode.typ
+++ b/spec/decode.typ
@@ -21,14 +21,14 @@ For reasons of efficiency, data in this table is significantly compressed.
 Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it.
 Instructions on how to compress the uncompressed table to form the compressed decode table, can be derived from the `packed_decode` variable provided below.
 
-== Columns
+= Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
 
 The #decode table is comprised of #nr_variables variables that are expressed using #nr_columns columns:
 #render_chip_column_table(chip, config)
 
-== Padding
+= Padding
 The #decode table must be padded to a length that is a power of two.
 Empty rows with the following content can be added to achieve this:
 
@@ -39,7 +39,7 @@ Given that `CPU` asserts that `EBREAK = 0` (see @cpu:c:ebreak_traps), using this
 Note moreover that the `pc` is set to $7$.
 This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _$4$_ (i.e., the max `pc`-increment) greater than _$1$_ (i.e., the `pc`-value used in the #link(<cpu-padding-decode-row>)[additional instruction] referred to by `CPU`-padding lines).
 
-== Decoding
+= Decoding
 For the purposes of explaining decoding, we decompress #decode's `packed_decode` variable into its constituent variables.
 Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
 
@@ -64,7 +64,7 @@ For the purpose of brevity and readability, the table uses the following rules-o
 
 Further clarification is provided in the notes following the table.
 
-=== C-type instructions
+== C-type instructions
 The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size.
 This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by $2$ rather than $4$.
 To indicate an instruction is provided in compressed form, the `c_type` flag is introduced.
@@ -159,7 +159,7 @@ To indicate an instruction is provided in compressed form, the `c_type` flag is
   [#figure(kind: "note", supplement: [], [#note]) #label(lbl)]
 }
 
-==== Notes
+== Notes
 We note the following about the above decoding table:
 #enum(numbering: "[1]",
   enum.item(
diff --git a/spec/ebook.typ b/spec/ebook.typ
index 835751163..f9ba76046 100644
--- a/spec/ebook.typ
+++ b/spec/ebook.typ
@@ -15,5 +15,6 @@
 #meta.summary.map(((ch, title, ref)) => [
   #pagebreak(weak: true)
   #heading(supplement: [Chapter], level: 1, title)#ref
+  #set heading(offset: 1)
   #include ch
 ]).join()
diff --git a/spec/ecall.typ b/spec/ecall.typ
index 9f2d96048..6908f768b 100644
--- a/spec/ecall.typ
+++ b/spec/ecall.typ
@@ -16,20 +16,20 @@
 #let config = load_config()
 #let chip = load_chip("src/halt.toml", config)
 #let halt = raw(chip.name)
-== #halt chip
+= #halt chip
 
-=== Columns
+== Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
 
 The #halt chip leverages #nr_variables variable, spanning #nr_columns columns:
 #render_chip_column_table(chip, config)
 
-=== Assumptions
+== Assumptions
 It is assumed the input is range checked:
 #render_chip_assumptions(chip, config)
 
-=== Constraints
+== Constraints
 The #halt chip:
 + makes sure register `x10` (containing the exit code) equals $0$ (@halt:c:read_zero_exit_code),
 + writes $0$ to all other registers (@halt:c:zeroize_registers_lo/@halt:c:zeroize_registers_hi), and
@@ -44,13 +44,13 @@ This prevents any other operation involving memory from being executed hereafter
   Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there.
 ])
 
-==== Lookup
+=== Lookup
 The HALT chip contributes the following interaction to the lookup-argument:
 #render_constraint_table(chip, config, groups: "lookup")
 
 *Note*: #link("https://github.com/riscv-collab/riscv-gnu-toolchain/blob/master/linux-headers/include/asm-generic/unistd.h#L258")[$93$ is the system call number corresponding to `sys_exit`.]
 
-=== Padding
+== Padding
 This chip should only contain a single row.
 Given that $2^0 = 1$, this chip does not need to be padded.
 As such, no padding is defined.
diff --git a/spec/is_bit.typ b/spec/is_bit.typ
index a12d62108..33477d377 100644
--- a/spec/is_bit.typ
+++ b/spec/is_bit.typ
@@ -21,17 +21,17 @@
 #is_bit is a constraint template that is used to assert that a variable lies in the range ${0, 1}$ if some second variable is non-zero.
 Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
 
-== Interface
+= Interface
 The #is_bit constraint template has the following interface:
 #block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => IS_BIT<X>"))
 where `cond` is any value described by an expression _of degree at most $1$_.
 Note that #highlighted_code("IS_BIT<X>") can be used to denote the _unconditional_ application of the #is_bit template to `X`.
 
-== Variables
+= Variables
 The #is_bit template operates on two variables: `cond` and `X`:
 #render_chip_column_table(chip, config)
 
-== Constraints
+= Constraints
 It takes only one constraint to enforce that `X` must be either $0$ or $1$ whenever $#`cond` eq.not 0$:
 #render_constraint_table(chip, config)
 *Note*: 
@@ -39,7 +39,7 @@ It takes only one constraint to enforce that `X` must be either $0$ or $1$ whene
 - As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression.
   This is to make sure that @isbit:c:isbit's expression has degree at most 3.
 
-== Proof of correctness
+= Proof of correctness
 If `cond` is $0$, @isbit:c:isbit is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to $0$ regardless. 
 When $#`cond` eq.not 0$, it follows that the statement can only be proven when $#`X` (1-#`X`) equiv 0 mod p$, with $p$ the modulus of the field.
 Because `BaseField` is a prime field, this equality is only satisfied if either $#`X` equiv 0 mod p$ or $1-#`X` equiv 0 mod p$.
diff --git a/spec/load.typ b/spec/load.typ
index 71c274d1b..bccb830f8 100644
--- a/spec/load.typ
+++ b/spec/load.typ
@@ -14,17 +14,17 @@
 
 #show: book-page(chip.name)
 
-== Columns
+= Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
 
 The `LOAD` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
 #render_chip_column_table(chip, config)
 
-== Assumptions
+= Assumptions
 #render_chip_assumptions(chip, config)
 
-== Constraints
+= Constraints
 The chip delegates the actual memory interaction to the `MEMW` chip,
 and ensures correctness of the requested sign/zero extension.
 The output `res` is correctly range-checked as long as the memory contents are.
@@ -35,7 +35,7 @@ The chip contributes the following to the lookup argument.
 
 #render_constraint_table(chip, config, groups: "output")
 
-== Padding
+= Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
diff --git a/spec/lt.typ b/spec/lt.typ
index ea36eb4dc..3447efd70 100644
--- a/spec/lt.typ
+++ b/spec/lt.typ
@@ -14,18 +14,18 @@
 
 #show: book-page(chip.name)
 
-== Columns
+= Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
 
 The `LT` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
 #render_chip_column_table(chip, config)
 
-== Assumptions
+= Assumptions
 We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
 #render_chip_assumptions(chip, config)
 
-== Constraints
+= Constraints
 We first constrain that all variables correspond to their definition.
 For the defining constraint of `lt`, @lt:c:lt, observe that it is a choice
 between two options, depending on the input flag `signed`.
@@ -80,7 +80,7 @@ The chip contributes the following to the lookup argument.
 
 #render_constraint_table(chip, config, groups: "output")
 
-== Padding
+= Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
diff --git a/spec/memory.typ b/spec/memory.typ
index ec8735e49..1fcb7b54e 100644
--- a/spec/memory.typ
+++ b/spec/memory.typ
@@ -31,7 +31,7 @@ The initialization and finalization schemes together ensure both that (1) the ne
 for the lookup system are satisfied, and (2) the program is executed with the correct
 initial memory and register contents as specified by the ELF binary and the ISA.
 
-== Memory types
+= Memory types
 
 A commonly made distinction of memory types is that of _read-only_ and _read-write_ memory,
 with the more restrictive read-only variant often allowing for more efficient solutions
@@ -43,7 +43,7 @@ While there are some subsystems that can be modelled as read-only memory
 we opt to integrate these into the proof system via chip interactions (relying on techniques derived from table lookup arguments).
 As such, we only concern ourselves with read-write memory, moving forward.
 
-== Memory operations
+= Memory operations
 
 Every memory operation has some conceptual attributes that are relevant to mention or discuss:
 
@@ -75,7 +75,7 @@ For simplicity, we will always reserve a timestamp for every possible memory acc
 ]
 
 
-== Permutation argument
+= Permutation argument
 
 We can conceptually organise the state of the memory as a collection of "tokens" that represent tuples
 $(serif("timestamp"), serif("address"), serif("value"))$,
@@ -99,7 +99,7 @@ this "balancing" act of tokens can be integrated (with sufficient domain separat
 consuming a token corresponds to a "receive" and emitting a new token is a "send".
 #rj[properly link/refer to the logup spec]
 
-== Temporal integrity
+= Temporal integrity
 
 To ensure temporal integrity, every memory operation needs to be constrained for the newly emitted token
 to have a strictly greater timestamp than the consumed token.
@@ -125,7 +125,7 @@ We choose to represent timestamps as machine words, using the existing `LT` chip
 
 #rj[reference to CPU chip/timestamp column and MEMW chip]
 
-== Initialization and Finalization
+= Initialization and Finalization
 
 Because the LogUp argument handling token consumption and emission needs to be fully balanced
 --- every token emitted should be consumed, and vice versa ---
@@ -159,7 +159,7 @@ For each such table, the `page` variable is instantiated as the constant base ad
 The `offset` column is preprocessed, which helps the verifier ensure that each page has a single fixed size,
 but the verifier should still check that no pages overlap and all `page` values are page-aligned.
 
-=== Page initialization
+== Page initialization
 
 #rj[check whether we need `fini` to be range-checked]
 We present here a set of constraints on the `PAGE` table that
@@ -211,7 +211,7 @@ and hence doesn't need a column, nor a range check.
       Most programs and compilers should however favor a memory locality that makes paged initialization/finalization comparable.
 ]
 
-=== Register initialization/finalization
+== Register initialization/finalization
 
 #rj[Properly link/reference ECALL/HALT chip]
 The initial and final state of registers can be entirely known by
@@ -221,11 +221,11 @@ by the HALT ecall.
 As additionally, the number of registers is small, the verifier can directly
 add the required balancing terms to the LogUp sum.
 
-== Notes and considerations
+= Notes and considerations
 
 - Register reads and writes may interact within a single cycle, so a correct and fixed ordering needs to be ensured
 - Correctness of initialization and completeness of finalization need to be ensured
 
-== Future topics of interest
+= Future topics of interest
 
 - Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research)
diff --git a/spec/memw.typ b/spec/memw.typ
index 77b786bf6..a3dfda42c 100644
--- a/spec/memw.typ
+++ b/spec/memw.typ
@@ -13,14 +13,14 @@
 
 #show: book-page(chip.name)
 
-== Columns
+= Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
 
 The `MEMW` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
 #render_chip_column_table(chip, config)
 
-== Assumptions
+= Assumptions
 
 #render_chip_assumptions(chip, config)
 
@@ -29,7 +29,7 @@ as these are not necessary for the correctness of this chip in isolation.
 These properties are necessary for the consistency of the system as a whole, and therefore
 we document it here, keeping the type information as a reading help.
 
-== Constraints
+= Constraints
 
 #render_constraint_table(chip, config, groups: "consistency")
 
@@ -50,7 +50,7 @@ This chip contributes the following to the lookup argument.
 #render_constraint_table(chip, config, groups: "output")
 
 
-== Future optimization ideas
+= Future optimization ideas
 
 - Fast path for aligned memory access where all bytes have the same old timestamp
 - MEMB chip that deals does a one-byte write to remove old_timestamp from here (uncertain tradeoffs)
diff --git a/spec/mul.typ b/spec/mul.typ
index b2fb53d92..a2fb7d1fc 100644
--- a/spec/mul.typ
+++ b/spec/mul.typ
@@ -16,7 +16,7 @@
 
 #let mul = raw(chip.name)
 
-== Columns
+= Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
 
@@ -27,12 +27,12 @@ The `MUL` chip is comprised of #nr_variables variables that are expressed using
  $mat(delim: #none, top; bottom)$
 }
 
-== Assumptions
+= Assumptions
 The following range checks are assumed to be performed/enforced outside of this chip:
 #render_chip_assumptions(chip, config)
 
-== Constraints
-=== Overview
+= Constraints
+== Overview
 When `lhs` and `rhs` are _unsigned_ integers, computing their product $mod 2^128$ comes down to evaluating
 $
 (sum_(j=0)^3 2^(16j) dot #`lhs`_j) dot (sum_(i=0)^3 2^(16i) dot #`rhs`_i) mod 2^128.
@@ -79,25 +79,25 @@ However, there is some slack in how tight one has to constrain the `carry` value
 In fact, in this situation it suffices to assert that $#`carry`_i < frac(p, 2^32, style: "skewed") approx 2^31$, where $p$ denotes the field's modulus.
 Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
 
-=== Definitions
+== Definitions
 We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
 #render_constraint_table(chip, config, groups: "def")
 
-=== Product
+== Product
 @mul:c:raw_product defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
 #render_constraint_table(chip, config, groups: "prod")
 
-=== Lookup
+== Lookup
 The #mul chip contributes the following to the lookup:
 #render_constraint_table(chip, config, groups: "lookup")
 
-== Padding
+= Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
 #render_chip_padding_table(chip, config)
 
-== Notes
+= Notes
 - `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked.
   Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere.
 
diff --git a/spec/shift.typ b/spec/shift.typ
index 177ce6104..a2a3ec968 100644
--- a/spec/shift.typ
+++ b/spec/shift.typ
@@ -16,7 +16,7 @@
 
 #show: book-page(chip.name)
 
-== Interface
+= Interface
 The #shift chip has the following interface:
 #block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(240), 
 ```
@@ -46,17 +46,17 @@ $
 $
 Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.
 
-== Columns
+= Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
 
 The `SHIFT` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
 #render_chip_column_table(chip, config)
 
-== Assumptions
+= Assumptions
 #render_chip_assumptions(chip, config)
 
-== Explanation
+= Explanation
 This chip has a rather complex design as a result of designing it to fit in as few columns possible.
 We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
 
@@ -70,7 +70,7 @@ The output variable `out` is equivalent to `shifted`, but expressed using `Word`
 In the following, we cover how these two phases were designed to complement one another.
 Here, we start with discussing the _logical_ left/right shift operations only; the modifications required to compute the _arithmetic_ right shift will be discussed at the end.
 
-=== First phase
+== First phase
 We zoom in on the first step.
 Here, we make use of the two lookup operations 
 - $#`HWSL[x: Half, y: B4]` := (#`x` #`<<` #`y`) mod 2^16$ (short for "HalfWord Shift Left"), and
@@ -106,7 +106,7 @@ $
 it only takes some rearranging and combining of the values $#`X[`i#`] := HWSL[in[`i#`], bit_shift]`$ and $#`Y[`i#`] := HWSLC[in[`i#`], bit_shift]`$ to form the limbs of $#`in <</>> shift` mod 16$.
 In the remaining case that $#`right` = 1$ and $#`shift` = 0 mod 16$, the limbs of $#`in <</>> shift` mod 16$ simply match those of `in`.
 
-=== Second phase
+== Second phase
 Since we're operating on 16-bit limbs, all the limbs in $#`in <</>> shift`$ must also occur somewhere in $#`in <</>> shift` mod 16$.
 The number of full-limbs we still need to shift is determined by the fifth and sixth least significant bit of `shift`.
 With `limb_shift` containing a unary decoding of the integer represented by these two bits, we find that the intermediate value needs to be shifted over by $i$ limbs (to the `left` or `right`) when $#`limb_shift[`i#`]` = 1$.
@@ -114,13 +114,13 @@ These things combined yield `shifted`'s definition.
 
 Of course, when $#`word_instr` = 1$ and, thus, only $#`shift` mod 32$ should be considered, the bit-mask for the lookup constraining `limb_shift` is adjusted appropriately (see @shift:c:limb_shift_lookup).
 
-=== Arithmetic right shift
+== Arithmetic right shift
 Lastly, we discuss the case of performing the _arithmetic_ right shift.
 Here, `extension` is constrained to contain a repetition of `in`'s most significant bit.
 Copies of this variable are used for any full limbs shifted in when $#`right` = #`signed` = 1$.
 Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of $#`in >>> shift` mod 16$ as the appropriate intermediate.
 
-== Constraints
+= Constraints
 First, we constrain `bit_shift` based on whether we are left or right-shifting.
 @shift:c:zbs makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. 
 This flag is used to indicate the special case that $#`right` = 1$ and $#`shift` = 0 mod 16$.
@@ -134,7 +134,7 @@ The case of `left`-shifting and $#`bit_shift` = 0$ will be used for padding rows
 To prevent unnecessary lookups in padding rows, we override $#`X[i]` := #`in[i]`$ and $#`Y[i]` := 0$ here.
 #render_constraint_table(chip, config, groups: "intra_limb_shift")
 
-=== Full-limb shifting
+== Full-limb shifting
 Next, we constrain that `limb_shift` is a proper unary encoding of the fifth (and sixth if $#`word_instr` = 0$) bit of `shift`.
 For this to be the case, three requirements must be satisfied:
 + *unary(0)*: $#`limb_shift[`i#`]` in {0, 1}$ for $i in [0, 3]$,
@@ -164,16 +164,16 @@ This is the exact relation @shift:c:limb_shift_lookup enforces.
 Hereafter, one must only check that `out` is the proper cast of `shifted` into a `DWordWL`.
 #render_constraint_table(chip, config, groups: "limb_shifting")
 
-=== Miscellaneous 
+== Miscellaneous 
 #render_constraint_table(chip, config, groups: ("left_flag", "is_negative"))
 *Note*: `is_negative` is not used when `signed = 0`.
 As such, there is no problem with it being unconstrained in this case.
 
-=== Lookups
+== Lookups
 This chip adds the following interaction to the lookup.
 #render_constraint_table(chip, config, groups: "lookups")
 
-== Padding
+= Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
diff --git a/spec/sign.typ b/spec/sign.typ
index 6f8993f53..dcc941e47 100644
--- a/spec/sign.typ
+++ b/spec/sign.typ
@@ -20,20 +20,20 @@
 
 #sign is a constraint template that is used to extract a `Half`word's sign.
 
-== Interface
+= Interface
 The #sign constraint template has the following interface:
 #block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("SIGN<sign; X, signed>"))
 It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are $1$, and $0$ otherwise.
 
-== Variables
+= Variables
 The #sign template operates on three variables:
 #render_chip_column_table(chip, config)
 
-== Assumptions
+= Assumptions
 The #sign template operates on the following assumptions:
 #render_chip_assumptions(chip, config)
 
-== Constraints
+= Constraints
 It takes only two constraints to compute the `sign` of `X`, given whether `X` represents a `signed` value or not. 
 When $#`signed` = 1$, the sign of `X` is equal to its most significant bit. 
 This value is extracted in @sign:c:sign_if_signed.

From ed3d8839aabfe5f5416eb3c069d3f7e7c9a9e510 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Thu, 5 Feb 2026 11:25:03 +0100
Subject: [PATCH 055/105] spec: LOAD: fix LOAD-C9 signature (#284)

---
 spec/src/load.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/spec/src/load.toml b/spec/src/load.toml
index fcbd2b87f..f8a974c9a 100644
--- a/spec/src/load.toml
+++ b/spec/src/load.toml
@@ -155,6 +155,6 @@ name = "output"
 [[constraints.output]]
 kind = "interaction"
 tag = "LOAD"
-input = ["base_address", "timestamp", "read2", "read4", "read8"]
+input = ["base_address", "timestamp", "read2", "read4", "read8", "signed"]
 output = ["cast", "res", "DWordWL"]
 multiplicity = ["-", "μ"]

From 76a008c681b23ca84a9ac1ac33ebd878cb4cc1a3 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Thu, 5 Feb 2026 11:52:38 +0100
Subject: [PATCH 056/105] spec: `NEG` template (#270)

* spec: tweak code-rendering "not"

* spec: introduce NEG template

* Update spec/book.typ

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* spec: update NEG

* spec: NEG: refactor

* spec: NEG: fix range-assumption on x

* spec: NEG: update cond

* spec: tweak math-rendering "not"
Analogous to 801f5ee9

* spec: NEG: add non-zero x case distinction

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
---
 spec/book.typ     |  1 +
 spec/expr.typ     |  4 +--
 spec/neg.typ      | 78 +++++++++++++++++++++++++++++++++++++++++++++++
 spec/src/neg.toml | 53 ++++++++++++++++++++++++++++++++
 4 files changed, 134 insertions(+), 2 deletions(-)
 create mode 100644 spec/neg.typ
 create mode 100644 spec/src/neg.toml

diff --git a/spec/book.typ b/spec/book.typ
index 2fd6d8767..64d78e15a 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -12,6 +12,7 @@
     ("is_bit.typ", [IS_BIT template], <isbit>),
     ("sign.typ", [SIGN template], <sign>),
     ("add.typ", [ADD/SUB template], <add>),
+    ("neg.typ", [NEG template], <neg>),
     ("decode.typ", [DECODE table], <decode>),
     ("cpu.typ", [CPU chip], <cpu>),
     ("shift.typ", [SHIFT chip], <shift>),
diff --git a/spec/expr.typ b/spec/expr.typ
index a0530525b..1c6c7942e 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -92,7 +92,7 @@
 #let expr_to_code = make_expr_formatter(
   (
     "idx": (pp, rec, e) => rec(PREC.MIN, e.at(1)) + `[` + rec(PREC.MAX, e.at(2)) + `]`,
-    "not": (pp, rec, e) => cwrap(`1 - ` + rec(PREC.not, e.at(1)), pp < PREC.not),
+    "not": (pp, rec, e) => cwrap(rec(PREC.not, 1) + ` - ` + rec(PREC.not, e.at(1)), pp < PREC.not),
     "+": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(PREC.add)).join(` + `), pp < PREC.add),
     "sum": (pp, rec, e) => assert(false, message: "sum is unsupported in code."),
     "*": (pp, rec, e) => {
@@ -153,7 +153,7 @@
       let (val, idxs) = flat_idxs(e)
       $#rec(PREC.idx, val)_(#idxs.map(idx => rec(PREC.idx, idx)).join($, $))$
     },
-    "not": (pp, rec, e) => mwrap($1 - #rec(PREC.not, e.at(1))$, pp < PREC.not),
+    "not": (pp, rec, e) => mwrap(rec(PREC.not, 1) + $ - #rec(PREC.not, e.at(1))$, pp < PREC.not),
     "+": (pp, rec, e) => mwrap($#e.slice(1).map(rec.with(PREC.add)).join($+$)$, pp < PREC.add),
     "sum": (pp, rec, e) => {
       assert(e.len() == 4, message: "invalid sum:" + repr(e))
diff --git a/spec/neg.typ b/spec/neg.typ
new file mode 100644
index 000000000..ac8554689
--- /dev/null
+++ b/spec/neg.typ
@@ -0,0 +1,78 @@
+#import "/book.typ": book-page, et
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": render_chip_column_table, render_chip_assumptions, render_constraint_table
+
+#let config = load_config()
+#let chip = load_chip("src/neg.toml", config)
+#show: book-page(chip.name)
+
+#let neg = raw(chip.name)
+
+#let highlighted_code(code) = {
+  box(
+    inset: (left: 4pt, right: 4pt), 
+    outset: (top: 4pt, bottom: 4pt), 
+    radius: 2pt,
+    fill: luma(230), 
+    raw(code))
+}
+
+#neg is a constraint template that is used to assert that $#`neg` = -#`x`$, under the condition that `cond` is non-zero.
+
+= Notation
+The #neg constraint template has the following interface:
+#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => NEG<neg; x>"))
+where `cond` is a bit value (i.e., lies in ${0, 1}$) described by an expression _of degree at most $1$_.
+#highlighted_code("NEG<neg; x>") can be used to denote the _unconditional_ application of the #neg template to `x` and `neg` (which is equivalent to $#`cond` = 1$).
+
+= Variables
+#render_chip_column_table(chip, config)
+
+= Assumptions
+#render_chip_assumptions(chip, config)
+
+= Constraints
+We constrain this equality using two constraints:
+#render_constraint_table(chip, config)
+The constraints force the `carry` values to be fixed.
+Writing out `carry`'s definition, we then find that
+$
+  #`neg`_0 &= 2^32 dot #`carry`_0 - (#`x as DWordWL`)_0
+ = cases(
+  2^32 - (#`x as DWordWL`)_0 & "if" (#`x as DWordWL`)_0 != 0,
+  0 & "if" (#`x as DWordWL`)_0 = 0
+ ),\
+  #`neg`_1 &= 2^32 dot #`carry`_1 - (#`x as DWordWL`)_1 - #`carry`_0 = cases(
+  2^32 - (#`x as DWordWL`)_1 - #`carry`_0 & "if" #`x` != 0,
+  0 & "if" #`x` = 0
+ )
+$
+Clearly, $#`neg` = 0$ when $#`x` = 0$ (and `cond` is set).
+For non-zero `x`, we distinguish two cases.
+When $(#`x as DWordWL`)_0 = 0$,
+$
+  #`neg` 
+  &= 2^32 dot #`neg`_1 + #`neg`_0\
+  &= 2^32 dot (2^32 - (#`x as DWordWL`)_1) + 0\
+  &= 2^32 dot (2^32 - (#`x as DWordWL`)_1) - (#`x as DWordWL`)_0\
+  &= 2^64 - (2^32 dot (#`x as DWordWL`)_1 + (#`x as DWordWL`)_0)\
+  &= 2^64 - #`x`\
+  &equiv -#`x` mod 2^64,
+$
+while when $(#`x as DWordWL`)_0 != 0$,
+$
+  #`neg` 
+  &= 2^32 dot #`neg`_1 + #`neg`_0\
+  &= 2^32 dot (2^32 - (#`x as DWordWL`)_1 - 1) + (2^32 - (#`x as DWordWL`)_0)  \
+  &= 2^64 - 2^32 dot (#`x as DWordWL`)_1 - 2^32 + 2^32 - (#`x as DWordWL`)_0  \
+  &= 2^64 - ((#`x as DWordWL`)_0 + 2^32 dot (#`x as DWordWL`)_1) \
+  &= 2^64 - #`x`\
+  &equiv -#`x` mod 2^64
+$
+when `cond` is set.
+When `cond` is not set, the two lookups are not executed, allowing `neg` to take any value in either case.
+
+= Note
+It is worth noting that this construction does _not_ require the limbs of `neg` to be range checked, 
+thus allowing it to be represented by the unrangecheckable `DWordWL` rather than a `DWordHL`.
+The input value `x` is still assumed to be range-checked, however.
diff --git a/spec/src/neg.toml b/spec/src/neg.toml
new file mode 100644
index 000000000..2cd70f354
--- /dev/null
+++ b/spec/src/neg.toml
@@ -0,0 +1,53 @@
+name = "NEG"
+
+[[variables.condition]]
+name = "cond"
+type = "Bit"
+desc = "condition on whether to negate x"
+
+[[variables.input]]
+name = "x"
+type = "DWordHL"
+desc = "value to compute negation of"
+
+[[variables.output]]
+name = "neg"
+type = "DWordWL"
+desc = "negation of `x` if $#`cond` != 0$; unconstrained otherwise."
+
+[[variables.virtual]]
+name = "carry"
+type = ["Bit", 2]
+desc = "carries of the addition $#`neg` + #`x`$."
+def = {idx="i", polys=[
+    {iter=0, poly=["*", ["^", 2, -32], ["+", ["idx", ["cast", "x", "DWordWL"], 0], ["idx", "neg", 0]]]},
+    {iter=1, poly=["*", ["^", 2, -32], ["+", ["idx", ["cast", "x", "DWordWL"], 1], ["idx", "neg", 1], ["idx", "carry", 0]]]}
+]}
+
+
+[[assumptions]]
+desc = "`IS_HALF[x[i]]`"
+iter = ["i", 0, 3]
+
+[[assumptions]]
+desc = "`IS_BIT<cond>`"
+
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "ZERO"
+input = [["+", ["idx", "x", 0], ["idx", "x", 1]]]
+output = ["not", ["idx", "carry", 0]]
+multiplicity = "cond"
+ref = "neg:c:carry_0"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "ZERO"
+input = [["+", ["idx", "x", 0], ["idx", "x", 1], ["idx", "x", 2], ["idx", "x", 3]]]
+output = ["not", ["idx", "carry", 1]]
+multiplicity = "cond"
+ref = "neg:c:carry_1"

From f86e427129f06e7e51546795fffdea95d34062ed Mon Sep 17 00:00:00 2001
From: Erik Takke <erik.takke@3milabs.tech>
Date: Wed, 24 Dec 2025 15:06:52 +0100
Subject: [PATCH 057/105] spec: Introduce DVRM chip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

spec: DVRM: introduce `μ_sum`

spec: DVRM: apply SIGN template

spec: DVRM: fix `n_sub_r_is_negative`

spec: DVRM: range check `n_sub_r`

spec: DVRM: add missing LT constraint

spec: DVRM: add missing abs_* range checks required by SUB calls.

spec: DVRM: fix LT lookup

spec: support variable labelling

spec: DVRM: completely refactor DVRM chip

spec: DVRM: make multiplicities binary

spec: DVRM: spec padding

spec: DVRM: remove superfluous TODOs

spec: DVRM: drop msb lookup for `sign_r`

spec: DVRM: replace `range=` by `iter=`

spec: DVRM: replace range assumptions for q and r by constraints

Apply suggestions from code review

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

spec: DVRM: drop bit checks for multiplicities

spec:DVRM: complete refactor

spec: DVRM: update padding

spec: DVRM: fix minor discrepancy

spec: DVRM: drop superfluous `q_if_overflow`

spec: DVRM: fix typos

spec: DVRM: fix casting

spec: ZERO: expand lookup to B20

spec: DVRM: abandon `IsZero` and `IsEqual` templates

spec: DVRM: fix typo

spec: expr: update constant rendering in expr_to_math

Update spec/bitwise.typ

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

spec: DVRM: replace [Half, x] by xHL

spec: DVRM: use QuadHL-sub to constrain `extended_n_sub_r`

spec: drop support variable labelling

This reverts commit c8d68968d58ade883fc5fee1118fbd1957af5a3a (and removes a bit more).

spec: DVRM: fix dvrm:c:div_by_zero

Update spec/dvrm.typ

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>
---
 spec/bitwise.typ      |   2 +-
 spec/chip.typ         |   2 +-
 spec/dvrm.typ         | 135 +++++++++++++-
 spec/src/bitwise.toml |   4 +-
 spec/src/dvrm.toml    | 404 ++++++++++++++++++++++++++++++++++++++++++
 spec/src/mul.toml     |   3 +-
 6 files changed, 541 insertions(+), 9 deletions(-)
 create mode 100644 spec/src/dvrm.toml

diff --git a/spec/bitwise.typ b/spec/bitwise.typ
index 36fe3b6e0..ef1e3a671 100644
--- a/spec/bitwise.typ
+++ b/spec/bitwise.typ
@@ -38,5 +38,5 @@ The following ideas may prove to be optimizations for the #bitwise chip:
 + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`.
   Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`).
   This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check.
-+ Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, `ZERO`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`) lookups in separate tables.
++ Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`, `ZERO`) lookups in separate tables.
 + Combine `HWSL` and `HWSLC` into a single lookup (see also \#119).
diff --git a/spec/chip.typ b/spec/chip.typ
index 8e2c4ac33..10479943e 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -156,7 +156,7 @@
       (table.header(level:2, table.cell(colspan: 4, emph(cat))), table.hline(stroke: .6pt))
       for var in vars {
         (
-          [#raw(var.name)], 
+          [#raw(var.name)],
           [#type_to_code(var.type)], 
           table.cell(colspan: 2, [#eval(var.desc, mode: "markup")])
         )
diff --git a/spec/dvrm.typ b/spec/dvrm.typ
index f1d9a3a4c..e68f4bee8 100644
--- a/spec/dvrm.typ
+++ b/spec/dvrm.typ
@@ -5,13 +5,140 @@
   total_nr_variables,
   total_nr_instantiated_columns,
   render_constraint_table,
-  render_chip_assumptions,
   render_chip_padding_table,
+  render_chip_assumptions
 )
 
+
 #let config = load_config()
-// #let chip = load_chip("src/dvrm.toml", config)
+#let chip = load_chip("src/dvrm.toml", config)
+
+#show: book-page(chip.name)
+
+#let dvrm = raw(chip.name)
+
+= Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+
+The `DVRM` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
+#render_chip_column_table(chip, config)
+
+= Assumptions
+#render_chip_assumptions(chip, config)
+
+= Constraints
+From the ISA, we gather five requirements for the `DIV[U][W]` and `REM[U][W]` instructions:
+#enum(numbering: "R1.",
+  enum.item([
+    _For both signed and unsigned division, except in the case of_ overflow, _it holds that $#`n` = #`q` #`d` + #`r`$._
+  ]),
+  enum.item([
+    _`DIV` and `DIVU` perform [...] signed and unsigned integer division [...] rounding towards zero._
+  ]),
+  enum.item([
+    _For `REM`, the sign of a nonzero [remainder] equals the sign of the [numerator]._
+  ]),
+  enum.item([
+    In case of _division-by-zero_, $#`r` = #`n`$ and $#`q` = 2^64-1$ (unsigned) or $#`q` = -1$ (signed).
+  ]),
+  enum.item([
+    In case of _overflow_, $#`q` = #`n`$ and $#`r` = 0$
+  ]),
+)
+where _overflow_ occurs when $#`n` = -2^(63)$ and $#`d` = -1$ (and, hence, $#`signed` = 1$), and _division-by-zero_ indicates that $#`d` = 0$.
+In the following, we list the constraints associated with the #dvrm chip, and explain how these together enforce all five of these requirements.
+
+== R3: Sign remainder equals sign numerator
+We start with R3, which is straightforwardly asserted by constraint @dvrm:c:sign_r_equals_sign_n.
+#render_constraint_table(chip, config, groups:("sign_equality", ))
+
+== R2: rounding towards zero
+R2 states that "_[in] signed and unsigned integer division [the quotient is] round[ed] towards zero._"
+In other words,
++ the sign of $#`n`-#`qd`$ must match that of `n` (unless $#`qd` = #`n`$), and 
++ $|#`n`-#`qd`|  < |#`d`|$ (unless $#`d` = 0$).
+
+Leveraging R1 #footnote([Note: we need not worry about the _overflow_ case in applying this relation, since R5 requires specific values for `q` and `r` in this case.]), we can rewrite these as
++ the sign of $#`r`$ must match that of `n` (unless $#`r` = 0$), and 
++ $|#`r`|  < |#`d`|$ (unless $#`d` = 0$).
+
+Focusing on the first statement, we observe that this trivially holds when $#`signed` = 0$, 
+while R3 deals with the case that $#`signed` = 1$.
+The second statement is enforced by @dvrm:c:abs_r_lt_abs_d.
+@dvrm:c:abs_r_if_negative and @dvrm:c:abs_r_if_nonnegative (resp. @dvrm:c:abs_d_if_negative and @dvrm:c:abs_d_if_nonnegative) are included to ensure that `abs_r` (resp. `abs_d`) is the absolute value of `r` (resp. `d`).
+@dvrm:c:abs_r_range_check and @dvrm:c:abs_d_range_check are needed to uphold assumption @add:a:lhs of the `SUB` template.
+
+#render_constraint_table(chip, config, groups:("abs_diff", ))
+
+== R5: overflow
+The ISA requires that $#`q` = #`n`$ and $#`r` = 0$ in the event of overflow (i.e., when $#`n` = -2^63$ and $#`d` = -1$).
+We note that the second half of this requirement is already satisfied by R2: since $#`d` = -1 != 0$, R2 requires that $|#`r`| < |#`d`| = 1$, to which $#`r` = 0$ is the only satisfying value.
+
+We moreover find that R1 can be leveraged to enforce the correct value of `q`.
+While $#`n` = #`qd` + #`r`$ (R1) does _not_ hold in the case of overflow, the relation $#`n` = |#`q`|#`d` + #`r`$ _does_.
+We moreover note that the 64-bit _signed_ two's complement representation of $-2^63$ is identical to the 64-bit _unsigned_ representation of $|-2^63| = 2^63$.
+As such, by interpreting `q` as an unsigned integer when $#`overflow` = 1$, it follows that R1 will enforce $#`q` = #`0x80...00`$.
+
+In summary, in case of overflow R2 enforces that $#`r` = 0$.
+Moreover, it suffices to interpret `q` as an unsigned integer (@dvrm:c:sign_q); R1 will ensure it contains the correct value.
+
+#render_constraint_table(chip, config, groups:("overflow", ))
+
+We highlight @dvrm:c:overflow.
+Recall that the `overflow` flag should be set if and only if (i) $#`signed` = 1$, (ii) $#`n` = #`0x80...00`$, and (iii) $#`d` = #`0xFF...FF`$.
+These requirements are equivalent to the state where:
+$
+  forall i in [0, 3]:&& 65535 - #`d`_i &= 0,\
+  forall i in [0, 2]:&& #`n`_i &= 0,\
+  && #`n`_3 - 2^15 dot #`sign_n` &= 0,\
+  && 1 - #`sign_n` &= 0,\
+$
+where $#`signed` = 1$ follows from the last equality.
+The requirement is phrased in this way, because the left-hand sides of the above expressions are $>= 0$ by construction.
+Given that the sum of these expressions does not exceed $2^19$ (and thus never wraps in the field), we can now say that the `overflow` bit should be set to $1$ if and only if their sum evaluates to $0$.
+The `ZERO` lookup guarantees this to be the case.
+
+== R1: $#`n` = #`qd` + #`r`$
+Rewriting R1, we find the constraint $not#`overflow` => #`n` - #`r` = #`qd`$.
+#footnote([Recall that @dvrm:c:sign_q allows us to assert this equality even when `overflow` is set.])
+Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality $mod 2^128$, rather than $mod 2^64$.
+To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to $#`qd` mod 2^128$ using constraints @dvrm:c:mul_lower and @dvrm:c:mul_upper;
+@dvrm:c:q_range is included to uphold assumption @mul:a:rhs.
+
+#render_constraint_table(chip, config, groups:("equality", ))
+
+It now remains to enforce that `extended_n_sub_r` is the _signed_ 128-bit representation of $#`n`-#`r`$.
+Here, we introduce `extended_n` and `extended_r`.
+By their definition, these variables contain the signed 128-bit representations of `n` and `r`.
+The `carry` variable has been defined such that it mimics those in the `ADD` chip,
+except that here we add two `QuadHL`s rather than two `DWordHL`s, thus needing four carry bits instead of two.
+With this in place, @dvrm:c:n_sub_r (mimicking @add:c:carry) ensures `extended_n_sub_r` must contain the correct value.
+
+Lastly, observe that $#`n` - #`r` in (-2^64, 2^64)$, _regardless_ of the value of `signed`.
+Moreover, note that the upper halves of the 128-bit representations of all values in this range are either `0xFF...FF` (negative) or `0x00...00` (non-negative).
+This means that we do not need to store all 128 bits of `extended_n_sub_r`.
+Rather, we need only store the lower 64 bits, and a separate bit (`sign_n_sub_r`) indicating whether the top limbs are all-ones or all-zeroes.
+The prover is free to select the value for `sign_n_sub_r`; only one of the two will fit the proof.
+
+#render_constraint_table(chip, config, groups:("n_sub_r", ))
+
+== R4: division-by-zero
+R4 requires that $#`q` = 2^64-1$ (unsigned) or $-1$ (signed) and $#`r` = #`n`$ when $#`d` = 0$.
+Recalling R1, we see that $#`n` = #`q` #`d` + #`r` = #`r`$ when $#`d` = 0$, already enforces the latter.
+Next, we note that, in two's complement, the _unsigned_ value $2^64-1$ and _signed_ value $-1$ are both represented by the 64-bit string `0xFF...FF`.
+Hence, only @dvrm:c:q_if_div_by_zero is required to completely constrain R4; @dvrm:c:div_by_zero just ensures the `div_by_zero` flag is set when $#`d` = 0$.
+
+#render_constraint_table(chip, config, groups:("div_by_zero", ))
+
+== Other
+The following constraints are included to enforce that the values of `sign_n`, `sign_r` and `sign_d` are correct.
+#render_constraint_table(chip, config, groups:("defs", ))
 
-#show: book-page("dvrm.typ")
+== Output
+Lastly, this chip contributes the following to the lookup:
+#render_constraint_table(chip, config, groups:("output", ))
 
-*placeholder chapter: WIP*
+= Padding
+To pad the #dvrm table, we use the following data, representing the unsigned division $frac(0, 0, style: "horizontal")$:
+#render_chip_padding_table(chip, config)
diff --git a/spec/src/bitwise.toml b/spec/src/bitwise.toml
index 2eeec4059..9b4a3f951 100644
--- a/spec/src/bitwise.toml
+++ b/spec/src/bitwise.toml
@@ -51,7 +51,7 @@ precomputed = "true"
 [[variables.output]]
 name = "ZERO"
 type = "Bit"
-desc = "whether $#`X` = 0 and #`Y` = 0$"
+desc = "whether $#`X` = 0$, $#`Y` = 0$ and $#`Z` = 0$."
 precomputed = "true"
 
 [[variables.output]]
@@ -163,7 +163,7 @@ multiplicity = ["-", "μ_MSB16"]
 [[constraints.contributions]]
 kind = "interaction"
 tag = "ZERO"
-input = [["+", "X", ["*", 256, "Y"]]]
+input = [["+", "X", ["*", 256, "Y"], ["*", 65536, "Z"]]]
 output = "ZERO"
 multiplicity = ["-", "μ_ZERO"]
 
diff --git a/spec/src/dvrm.toml b/spec/src/dvrm.toml
new file mode 100644
index 000000000..ceeabf1e2
--- /dev/null
+++ b/spec/src/dvrm.toml
@@ -0,0 +1,404 @@
+name = "DVRM"
+
+# Input
+
+[[variables.input]]
+name = "n"
+type = "DWordHL"
+desc = "The numerator"
+pad = 0
+
+[[variables.input]]
+name = "d"
+type = "DWordHL"
+desc = "The denominator"
+pad = 0
+
+[[variables.input]]
+name = "signed"
+type = "Bit"
+desc = "Whether to interpret the input as signed (1) or unsigned (0) integers."
+pad = 0
+
+
+# Output
+
+[[variables.output]]
+name = "q"
+type = "DWordHL"
+desc = "The quotient; $#`n` / #`d`$ rounded towards zero."
+pad = 0
+
+[[variables.output]]
+name = "r"
+type = "DWordHL"
+desc = "The remainder; $#`n` - #`q` #`d`$."
+pad = 0
+
+# Auxiliary
+
+[[variables.auxiliary]]
+name = "div_by_zero"
+type = "Bit"
+desc = "Whether $#`d`=0$."
+pad = 1  # NOTE(review): with `q` padded to 0, dvrm:c:q_if_div_by_zero (div_by_zero * (q[i] - 65535) = 0) looks violated on padding rows -- confirm the intended pad of `q`/`div_by_zero`
+
+[[variables.auxiliary]]
+name = "overflow"
+type = "Bit"
+desc = "Whether $#`n` = -2^63$ and $#`d`=-1$."
+pad = 0
+
+[[variables.auxiliary]]
+name = "abs_r"
+type = "DWordHL"
+desc = "Absolute value of `r`."
+pad = 0
+
+[[variables.auxiliary]]
+name = "abs_d"
+type = "DWordHL"
+desc = "Absolute value of `d`."
+pad = 0
+
+[[variables.auxiliary]]
+name = "n_sub_r"
+type = "DWordHL"
+desc = "$#`n`-#`r`$."
+pad = 0
+
+[[variables.auxiliary]]
+name = "sign_n_sub_r"
+type = "Bit"
+desc = "Sign of `n_sub_r`."
+pad = 0
+
+[[variables.auxiliary]]
+name = "sign_n"
+type = "Bit"
+desc = "Sign of `n`."
+pad = 0
+
+[[variables.auxiliary]]
+name = "sign_d"
+type = "Bit"
+desc = "Sign of `d`."
+pad = 0
+
+[[variables.auxiliary]]
+name = "sign_q"
+type = "Bit"
+desc = "Sign of `q`."
+pad = 0
+
+[[variables.auxiliary]]
+name = "sign_r"
+type = "Bit"
+desc = "Sign of `r`."
+pad = 0
+
+# Virtual
+
+[[variables.virtual]]
+name = "extended_n"
+type = "QuadHL"
+desc = "sign-extended value of `n`."
+def = {idx="i", polys = [
+  {iter=[0, 3], poly=["idx", "n", "i"]},
+  {iter=[4, 7], poly=["*", 0xFFFF, "sign_n"]}
+]}
+
+[[variables.virtual]]
+name = "extended_r"
+type = "QuadHL"
+desc = "sign-extended value of `r`."
+def = {idx="i", polys = [
+  {iter=[0, 3], poly=["idx", "r", "i"]},
+  {iter=[4, 7], poly=["*", 0xFFFF, "sign_r"]}
+]}
+
+[[variables.virtual]]
+name = "extension_n_sub_r"
+type = "DWordHL"
+desc = "sign-extension limbs of `n_sub_r`."
+def = {idx="i", iter=[0, 3], poly=["*", 0xFFFF, "sign_n_sub_r"]}
+
+[[variables.virtual]]
+name = "extended_n_sub_r"
+type = "QuadHL"
+desc = "sign-extended value of `n_sub_r`."
+def = {idx="i", polys = [
+  {iter=[0, 3], poly=["idx", "n_sub_r", "i"]},
+  {iter=[4, 7], poly=["idx", "extension_n_sub_r", ["-", "i", 4]]}
+]}
+
+[[variables.virtual]]
+name = "carry"
+type = ["Bit", 4]
+desc = "carries for adding `extended_n_sub_r` to `extended_r`, forming `extended_n`."
+def = {idx="i", polys = [
+    {iter=0, poly=["*", 
+        ["^", 2, -32], 
+        ["-", 
+            ["+", 
+                ["idx", ["cast", "extended_n_sub_r", "QuadWL"], "i"], 
+                ["idx", ["cast", "extended_r", "QuadWL"], "i"]
+            ], 
+            ["idx", ["cast", "extended_n", "QuadWL"], "i"]
+        ]
+    ]},
+    {iter=[1, 3], poly=["*", 
+        ["^", 2, -32], 
+        ["-", 
+            ["+", 
+                ["idx", ["cast", "extended_n_sub_r", "QuadWL"], "i"], 
+                ["idx", ["cast", "extended_r", "QuadWL"], "i"],
+                ["idx", "carry", ["-", "i", 1]],
+            ], 
+            ["idx", ["cast", "extended_n", "QuadWL"], "i"]
+        ]
+    ]},
+]}
+
+[[variables.virtual]]
+name = "μ_sum"
+type = "BaseField"
+desc = "sum of multiplicities"
+def = ["+", "μ_q", "μ_r"]
+
+
+# Multiplicities
+
+[[variables.multiplicity]]
+name = "μ_q"
+type = "BaseField"
+desc = ""
+pad = 0
+
+[[variables.multiplicity]]
+name = "μ_r"
+type = "BaseField"
+desc = ""
+pad = 0
+
+
+# Assumptions
+
+[[assumptions]]
+desc = "`IS_HALF[n[i]]`"
+iter = ["i", 0, 3]
+ref = "dvrm:a:range_n"  # assumption labels use this chip's own `dvrm:a:` prefix
+
+[[assumptions]]
+desc = "`IS_HALF[d[i]]`"
+iter = ["i", 0, 3]
+ref = "dvrm:a:range_d"
+
+[[assumptions]]
+desc = "`IS_BIT<signed>`"
+ref = "dvrm:a:range_signed"
+
+# Constraints
+
+[[constraint_groups]]
+name = "sign_equality"
+
+[[constraints.sign_equality]]
+kind = "arith"
+constraint = "$#`r` eq.not 0 => #`sign_r` = #`sign_n`$"
+poly = ["*", ["sum", ["=", "i", 0], 3, ["idx", "r", "i"]], ["-", "sign_r", "sign_n"]]
+ref = "dvrm:c:sign_r_equals_sign_n"
+
+[[constraint_groups]]
+name = "abs_diff"
+
+[[constraints.abs_diff]]
+kind = "interaction"
+tag = "LT"
+input = [["cast", "abs_r", "DWordWL"], ["cast", "abs_d", "DWordWL"], 0]
+output = ["not", "div_by_zero"]
+multiplicity = "μ_sum"
+ref ="dvrm:c:abs_r_lt_abs_d"
+
+[[constraints.abs_diff]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "abs_r", "i"]]
+iter = ["i", 0, 3]
+multiplicity = "sign_r"
+ref = "dvrm:c:abs_r_range_check"
+
+[[constraints.abs_diff]]
+kind = "template"
+tag = "SUB"
+input = [0, ["cast", "r", "DWordWL"]]
+output = ["cast", "abs_r", "DWordWL"]
+cond = "sign_r"
+ref = "dvrm:c:abs_r_if_negative"
+
+[[constraints.abs_diff]]
+kind = "arith"
+constraint = "$not#`sign_r` => #`abs_r[i]`=#`r[i]`$"
+iter = ["i", 0, 3]
+poly = ["*", ["-", 1, "sign_r"], ["-", ["idx", "abs_r", "i"], ["idx", "r", "i"]]]
+ref = "dvrm:c:abs_r_if_nonnegative"
+
+[[constraints.abs_diff]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "abs_d", "i"]]
+iter = ["i", 0, 3]
+multiplicity = "sign_d"
+ref = "dvrm:c:abs_d_range_check"
+
+[[constraints.abs_diff]]
+kind = "template"
+tag = "SUB"
+input = [0, ["cast", "d", "DWordWL"]]
+output = ["cast", "abs_d", "DWordWL"]
+cond = "sign_d"
+ref = "dvrm:c:abs_d_if_negative"
+
+[[constraints.abs_diff]]
+kind = "arith"
+constraint = "$not#`sign_d` => #`abs_d[i]`=#`d[i]`$"
+iter = ["i", 0, 3]
+poly = ["*", ["-", 1, "sign_d"], ["-", ["idx", "abs_d", "i"], ["idx", "d", "i"]]]
+ref = "dvrm:c:abs_d_if_nonnegative"
+
+[[constraint_groups]]
+name = "overflow"
+
+[[constraints.overflow]]
+kind = "arith"
+constraint = "$#`sign_q` = #`signed` dot (1- #`overflow`)$"
+poly = ["-", ["*", "signed", ["-", 1, "overflow"]], "sign_q"]
+ref = "dvrm:c:sign_q"
+
+[[constraints.overflow]]
+kind = "interaction"
+tag = "ZERO"
+input = [["+", ["idx", "n", 0], ["idx", "n", 1], ["idx", "n", 2], ["-", ["idx", "n", 3], ["*", ["^", 2, 15], "sign_n"]], ["-", 1, "sign_n"], ["-", 65535, ["idx", "d", 0]], ["-", 65535, ["idx", "d", 1]], ["-", 65535, ["idx", "d", 2]], ["-", 65535, ["idx", "d", 3]]]]
+output = "overflow"
+multiplicity = "μ_sum"
+ref = "dvrm:c:overflow"
+
+[[constraint_groups]]
+name = "n_sub_r"
+
+[[constraints.n_sub_r]]
+kind = "template"
+tag = "IS_BIT"
+input = [["idx", "carry", "i"]]
+iter = ["i", 0, 3]
+ref = "dvrm:c:n_sub_r"
+
+[[constraints.n_sub_r]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "r", "i"]]
+iter = ["i", 0, 3]
+multiplicity = "μ_sum"
+ref = "dvrm:c:r_range"
+
+[[constraints.n_sub_r]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "n_sub_r", "i"]]
+iter = ["i", 0, 3]
+multiplicity = "μ_sum"
+ref = "dvrm:c:n_sub_r_range"
+
+[[constraints.n_sub_r]]
+kind = "template"
+tag = "IS_BIT"
+input = ["sign_n_sub_r"]
+ref = "dvrm:c:sign_n_sub_r_is_bit"
+
+[[constraint_groups]]
+name = "equality"
+
+[[constraints.equality]]
+kind = "interaction"
+tag = "MUL"
+input = ["d", "signed", "q", "sign_q", 0]
+output = ["cast", "n_sub_r", "DWordWL"]
+multiplicity = "μ_sum"
+ref = "dvrm:c:mul_lower"
+
+[[constraints.equality]]
+kind = "interaction"
+tag = "MUL"
+input = ["d", "signed", "q", "sign_q", 1]
+output = ["cast", "extension_n_sub_r", "DWordWL"]
+multiplicity = "μ_sum"
+ref = "dvrm:c:mul_upper"
+
+[[constraints.equality]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "q", "i"]]
+iter = ["i", 0, 3]
+multiplicity = "μ_sum"
+ref = "dvrm:c:q_range"
+
+
+[[constraint_groups]]
+name = "defs"
+
+[[constraints.defs]]
+kind = "template"
+tag = "SIGN"
+input = [["idx", "n", 3], "signed"]
+output = "sign_n"
+ref = "dvrm:c:sign_n"
+
+[[constraints.defs]]
+kind = "template"
+tag = "SIGN"
+input = [["idx", "r", 3], "signed"]
+output = "sign_r"
+ref = "dvrm:c:sign_r"
+
+[[constraints.defs]]
+kind = "template"
+tag = "SIGN"
+input = [["idx", "d", 3], "signed"]
+output = "sign_d"
+ref = "dvrm:c:sign_d"
+
+[[constraint_groups]]
+name = "div_by_zero"
+
+[[constraints.div_by_zero]]
+kind = "arith"
+iter = ["i", 0, 3]
+constraint = "$#`div_by_zero` => #`q[i]` = 65535$"
+poly = ["*", "div_by_zero", ["-", ["idx", "q", "i"], 65535]]
+ref = "dvrm:c:q_if_div_by_zero"
+
+[[constraints.div_by_zero]]
+kind = "interaction"
+tag = "ZERO"
+input = [["+", ["idx", "d", 0], ["idx", "d", 1], ["idx", "d", 2], ["idx", "d", 3]]]
+output = "div_by_zero"
+ref = "dvrm:c:div_by_zero"
+multiplicity = "μ_sum"
+
+[[constraint_groups]]
+name = "output"
+desc = "Each row contributes the following to the LogUp sum"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "DVRM"
+input = ["n", "d", "signed", "0"]
+output = ["cast", "q", "DWordWL"]
+multiplicity = ["-", "μ_q"]  # expression form of -μ_q, matching ["-", "μ_hi"] in mul.toml
+
+[[constraints.output]]
+kind = "interaction"
+tag = "DVRM"
+input = ["n", "d", "signed", "1"]
+output = ["cast", "r", "DWordWL"]
+multiplicity = ["-", "μ_r"]  # expression form of -μ_r, matching ["-", "μ_hi"] in mul.toml
\ No newline at end of file
diff --git a/spec/src/mul.toml b/spec/src/mul.toml
index e987b0f75..238bfe01f 100644
--- a/spec/src/mul.toml
+++ b/spec/src/mul.toml
@@ -130,6 +130,7 @@ iter = ["i", 0, 3]
 [[assumptions]]
 desc = "`IS_HALF[rhs[i]]`"
 iter = ["i", 0, 3]
+ref = "mul:a:rhs"
 
 # Constraints
 
@@ -202,4 +203,4 @@ tag = "MUL"
 input = ["lhs", "lhs_signed", "rhs", "rhs_signed", "1"]
 output = ["cast", "hi", "DWordWL"]
 multiplicity = ["-", "μ_hi"]
-ref = "mul:c:lookup_hi"
\ No newline at end of file
+ref = "mul:c:lookup_hi"

From 8f0e8d3c0a3389d982766862e35c4442c5dd599c Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Thu, 5 Feb 2026 11:26:55 +0100
Subject: [PATCH 058/105] spec: signatures (#280)

* spec: list all interaction signatures

* Update spec/signatures.typ

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>

* spec: signatures: fix LOAD signature

* spec: signatures: make IS_BIT's cond a BaseField

* spec: signatures: make ECALL's syscallnr a DWordWL

* spec: signatures: preemptively introduce NEG signature (see #270)

* spec: signatures: fix DWordDL typo

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

---------

Co-authored-by: greptile-apps[bot] <165735046+greptile-apps[bot]@users.noreply.github.com>
Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>
---
 spec/book.typ            |   1 +
 spec/signatures.typ      |  90 ++++++++++++++++++++
 spec/src.typ             |  51 +++++++++++
 spec/src/signatures.toml | 178 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 320 insertions(+)
 create mode 100644 spec/signatures.typ
 create mode 100644 spec/src/signatures.toml

diff --git a/spec/book.typ b/spec/book.typ
index 64d78e15a..076d31cf3 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -9,6 +9,7 @@
   summary: (
     ("memory.typ", [Memory argument], <memory>),
     ("variables.typ", [Variables], <vars>),
+    ("signatures.typ", [Signatures], <signatures>),
     ("is_bit.typ", [IS_BIT template], <isbit>),
     ("sign.typ", [SIGN template], <sign>),
     ("add.typ", [ADD/SUB template], <add>),
diff --git a/spec/signatures.typ b/spec/signatures.typ
new file mode 100644
index 000000000..5673cdcf6
--- /dev/null
+++ b/spec/signatures.typ
@@ -0,0 +1,90 @@
+#import "/book.typ": book-page
+#import "/src.typ": load_signatures, load_config
+
+#show: book-page("signatures.typ")
+
+#let config = load_config()
+#let signatures = load_signatures(config)
+
+// Render a signature as `cond => TAG[output; inputs]`; templates use `<`/`>`.
+#let render_signature(sig) = {
+  // A variable is either a bare type label or a `(label, count)` pair.
+  let render_var(v) = if type(v) != array {
+    raw(v)
+  } else {
+    raw(v.at(0)) + `[` + raw(str(v.at(1))) + `]`
+  }
+
+  // Pick the delimiters that match the signature kind.
+  let (lb, rb) = if sig.kind == "interaction" {
+    (`[`, `]`)
+  } else if sig.kind == "template" {
+    (`<`, `>`)
+  }
+
+  // Optional `cond => ` prefix in front of the tag.
+  let cond = sig.at("cond", default: none)
+  let cond_str = if cond == none { `` } else { raw(cond) + ` => ` }
+
+  // Optional `output; ` prefix in front of the inputs.
+  let output = sig.at("output", default: none)
+  let output_str = if output == none {
+    ``
+  } else {
+    render_var(output) + `; `
+  }
+
+  // Comma-separated list of inputs.
+  let input_str = sig.input.map(render_var).join(`, `)
+  return [#cond_str#raw(sig.tag)#lb#output_str#input_str#rb]
+}
+
+// Compute the bus size of an interaction: the sum, over its inputs and its
+// (optional) output, of `subtypes.len() * count` for each variable's type.
+#let interaction_bus_size(sig) = {
+  let vars = sig.input + if "output" in sig { (sig.output, ) } else { () }
+
+  return vars.map(v => {
+    // A bare label counts once; a `(label, count)` pair counts `count` times.
+    let (label, factor) = if type(v) == array { (v.at(0), v.at(1)) } else { (v, 1) }
+    // Named `t` rather than `type` so the built-in `type` is not shadowed.
+    let var_type = config.variables.types.find(t => t.label == label)
+    assert(var_type != none, message: "Unknown var type: " + repr(label))
+    var_type.subtypes.len() * factor
+  }).sum(default: 0) // `default` keeps an interaction without variables valid
+}
+
+#let interactions = signatures.signatures.filter(s => s.kind == "interaction")
+The following lists signatures of the #interactions.len() interactions in this VM.
+#figure(
+  table(
+    columns: (1fr, auto),
+    inset: 7pt,
+    align: (top+left, center),
+    stroke: none,
+    table.header([*Signature*], [*Bus size*]),
+    table.hline(stroke: 1pt),
+    table.vline(stroke: 1pt, x: 1),
+    ..for sig in interactions {
+      ([#render_signature(sig)], [#interaction_bus_size(sig)])
+    },
+  ),
+  caption: "Signature overview of interactions",
+)
+
+#let templates = signatures.signatures.filter(s => s.kind == "template")
+Below, we list the signatures of the #templates.len() templates in this VM.
+#figure(
+  table(
+    columns: 1fr,
+    inset: 7pt,
+    align: (top+left, center),
+    stroke: none,
+    table.header([*Signature*]),
+    table.hline(stroke: 1pt),
+    ..for sig in templates {
+      ([#render_signature(sig)], )
+    },
+  ),
+  caption: "Signature overview of templates",
+)
diff --git a/spec/src.typ b/spec/src.typ
index 8200b47c1..6328c4665 100644
--- a/spec/src.typ
+++ b/spec/src.typ
@@ -1,5 +1,7 @@
 /// Path to the config file.
 #let CONFIG_PATH = "src/config.toml"
+/// Path to the signatures file
+#let SIGNATURES_PATH = "src/signatures.toml"
 
 /// Check the configuration object for internal consistency.
 #let _check_config(config) = {
@@ -31,6 +33,55 @@
   return config
 }
 
+
+// Validate the `signatures` overview: every signature needs a string `tag`,
+// a `kind` ("interaction" or "template"), an `input` array, and may carry an
+// `output` and (templates only) a `cond`; all variables must use known types.
+#let _check_signatures(signatures, config) = {
+  let var_labels = config.variables.types.map(t => t.label)
+
+  // Verify that `var` is a known type label or a `(label, count)` pair.
+  let verify_variable(var) = {
+    let msg = "Invalid var type: " + repr(var)
+    if type(var) == array {
+      // Check the shape first: a malformed pair should fail here, not in `at`.
+      assert(var.len() == 2, message: msg)
+      assert(var.at(0) in var_labels, message: msg)
+      assert(type(var.at(1)) == int, message: msg)
+    } else {
+      assert(type(var) == str and var in var_labels, message: msg)
+    }
+  }
+
+  assert("signatures" in signatures, message: "No signatures listed")
+  for sig in signatures.signatures {
+    assert("tag" in sig, message: "No tag associated with " + repr(sig))
+    assert(type(sig.tag) == str, message: "Tag is not of type str: " + repr(sig.tag))
+    assert("kind" in sig, message: "No kind associated with " + repr(sig))
+    assert(type(sig.kind) == str, message: "kind is not of type str: " + repr(sig.kind))
+    assert(sig.kind in ("interaction", "template"), message: "Invalid kind: " + repr(sig.kind))
+
+    if "cond" in sig {
+      assert(sig.kind != "interaction", message: "Invalid condition for interaction: " + repr(sig))
+      verify_variable(sig.cond)
+    }
+
+    assert("input" in sig, message: "No input associated with " + repr(sig))
+    assert(type(sig.input) == array, message: "Invalid input type: " + repr(sig.input))
+    // Plain loops (not `map`) so the validator yields no array of `none`s.
+    for var in sig.input { verify_variable(var) }
+    if "output" in sig { verify_variable(sig.output) }
+  }
+}
+
+// Read `SIGNATURES_PATH`, validate its contents against `config`, and return them.
+#let load_signatures(config) = {
+  let parsed = toml(SIGNATURES_PATH)
+  _check_signatures(parsed, config)
+  return parsed
+}
+
+
 /// Check a chip object for internal consistency.
 #let _check_chip(chip, config) = {
   // Check that all variable categories are valid
diff --git a/spec/src/signatures.toml b/spec/src/signatures.toml
new file mode 100644
index 000000000..ba233f1e6
--- /dev/null
+++ b/spec/src/signatures.toml
@@ -0,0 +1,178 @@
+# cond => IS_BIT<X>
+[[signatures]]
+tag = "IS_BIT"
+kind = "template"
+input = ["BaseField"]
+cond = "BaseField"
+
+# cond => ADD<sum; lhs, rhs>
+[[signatures]]
+tag = "ADD"
+kind = "template"
+input = ["DWordWL", "DWordWL"]
+output = "DWordWL"
+cond = "BaseField"
+
+# cond => SUB<diff; lhs, rhs>
+[[signatures]]
+tag = "SUB"
+kind = "template"
+input = ["DWordWL", "DWordWL"]
+output = "DWordWL"
+cond = "BaseField"
+
+# cond => NEG<neg; X>
+[[signatures]]
+tag = "NEG"
+kind = "template"
+input = ["DWordHL"]
+output = "DWordWL"
+cond = "Bit"
+
+# SIGN<sign; X, signed>
+[[signatures]]
+tag = "SIGN"
+kind = "template"
+input = ["Half", "Bit"]
+output = "Bit"
+
+# DECODE[pc, imm, packed_decode]
+[[signatures]]
+tag = "DECODE"
+kind = "interaction"
+input = ["DWordWL", "DWordWL", "BaseField"]
+
+# SHIFT[out; in, shift, direction, signed, word_instr]
+[[signatures]]
+tag = "SHIFT"
+kind = "interaction"
+input = ["DWordHL", "Byte", "Bit", "Bit", "Bit"]
+output = "DWordWL"
+
+# BRANCH[next_pc; pc, offset, register, JALR]
+[[signatures]]
+tag = "BRANCH"
+kind = "interaction"
+input = ["DWordWL", "Word", "DWordWL", "Bit"]
+output = "DWordWL"
+
+# MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]
+[[signatures]]
+tag = "MEMW"
+kind = "interaction"
+input = ["Bit", "DWordWL", ["BaseField", 8], "DWordWL", "Bit", "Bit", "Bit"]
+output = ["BaseField", 8]
+
+# MEMW[is_register, base_address, value, timestamp, write2, write4, write8]
+[[signatures]]
+tag = "MEMW"
+kind = "interaction"
+input = ["Bit", "DWordWL", ["BaseField", 8], "DWordWL", "Bit", "Bit", "Bit"]
+
+# LT[lt; lhs, rhs, signed]
+[[signatures]]
+tag = "LT"
+kind = "interaction"
+input = ["DWordWL", "DWordWL", "Bit"]
+output = "Bit"
+
+# MUL[lo/hi; lhs, lhs_signed, rhs, rhs_signed, 0/1]
+[[signatures]]
+tag = "MUL"
+kind = "interaction"
+input = ["DWordHL", "Bit", "DWordHL", "Bit", "Bit"]
+output = "DWordWL"
+
+# DVRM[q/r; n, d, signed, 0/1]
+[[signatures]]
+tag = "DVRM"
+kind = "interaction"
+input = ["DWordHL", "DWordHL", "Bit", "Bit"]
+output = "DWordWL"
+
+# LOAD[res; base_address, timestamp, read2, read4, read8, signed]
+[[signatures]]
+tag = "LOAD"
+kind = "interaction"
+input = ["DWordWL", "DWordWL", "Bit", "Bit", "Bit", "Bit"]
+output = "DWordWL"
+
+# ECALL[timestamp, syscallnr]
+[[signatures]]
+tag = "ECALL"
+kind = "interaction"
+input = ["DWordWL", "DWordWL"]
+
+# AND_BYTE[res; X, Y]
+[[signatures]]
+tag = "AND_BYTE"
+kind = "interaction"
+input = ["Byte", "Byte"]
+output = "Byte"
+
+# OR_BYTE[res; X, Y]
+[[signatures]]
+tag = "OR_BYTE"
+kind = "interaction"
+input = ["Byte", "Byte"]
+output = "Byte"
+
+# XOR_BYTE[res; X, Y]
+[[signatures]]
+tag = "XOR_BYTE"
+kind = "interaction"
+input = ["Byte", "Byte"]
+output = "Byte"
+
+# MSB8[msb; X]
+[[signatures]]
+tag = "MSB8"
+kind = "interaction"
+input = ["Byte"]
+output = "Bit"
+
+# MSB16[msb; X]
+[[signatures]]
+tag = "MSB16"
+kind = "interaction"
+input = ["Half"]
+output = "Bit"
+
+# ZERO[is_zero; X]
+[[signatures]]
+tag = "ZERO"
+kind = "interaction"
+input = ["B20"]
+output = "Bit"
+
+# IS_BYTE[X]
+[[signatures]]
+tag = "IS_BYTE"
+kind = "interaction"
+input = ["Byte"]
+
+# IS_HALF[X]
+[[signatures]]
+tag = "IS_HALF"
+kind = "interaction"
+input = ["Half"]
+
+# IS_B20[X]
+[[signatures]]
+tag = "IS_B20"
+kind = "interaction"
+input = ["B20"]
+
+# HWSL[res; X, shift]
+[[signatures]]
+tag = "HWSL"
+kind = "interaction"
+input = ["Half", "B4"]
+output = "Half"
+
+# HWSLC[res; X, shift]
+[[signatures]]
+tag = "HWSLC"
+kind = "interaction"
+input = ["Half", "B4"]
+output = "Half"
\ No newline at end of file

From 1b2cfb99b35b0678b86ec42ccddd878279ae24ce Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Fri, 6 Feb 2026 08:31:42 +0100
Subject: [PATCH 059/105] spec: Leverage `NEG` in `DVRM` (#287)

* spec: DVRM: use NEG template for abs_r and abs_d
This saves 4 columns.

* Apply suggestions from @RobinJadoul

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

---------

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>
---
 spec/dvrm.typ      |  1 -
 spec/src/dvrm.toml | 46 +++++++++++++++-------------------------------
 2 files changed, 15 insertions(+), 32 deletions(-)

diff --git a/spec/dvrm.typ b/spec/dvrm.typ
index e68f4bee8..54e71d771 100644
--- a/spec/dvrm.typ
+++ b/spec/dvrm.typ
@@ -67,7 +67,6 @@ Focusing on the first statement, we observe that this trivially holds when $#`si
 while R3 deals with the case that $#`signed` = 1$.
 The second statement is enforced by @dvrm:c:abs_r_lt_abs_d.
 @dvrm:c:abs_r_if_negative and @dvrm:c:abs_r_if_nonnegative (resp. @dvrm:c:abs_d_if_negative and @dvrm:c:abs_d_if_nonnegative) are included to ensure that `abs_r` (resp. `abs_d`) is the absolute values of `r` (resp. `d`).
-@dvrm:c:abs_r_range_check and @dvrm:c:abs_d_range_check are required to uphold assumption @add:a:lhs required by the `SUB` chip.
 
 #render_constraint_table(chip, config, groups:("abs_diff", ))
 
diff --git a/spec/src/dvrm.toml b/spec/src/dvrm.toml
index ceeabf1e2..d93449228 100644
--- a/spec/src/dvrm.toml
+++ b/spec/src/dvrm.toml
@@ -51,13 +51,13 @@ pad = 0
 
 [[variables.auxiliary]]
 name = "abs_r"
-type = "DWordHL"
+type = "DWordWL"
 desc = "Absolute value of `r`."
 pad = 0
 
 [[variables.auxiliary]]
 name = "abs_d"
-type = "DWordHL"
+type = "DWordWL"
 desc = "Absolute value of `d`."
 pad = 0
 
@@ -215,55 +215,39 @@ name = "abs_diff"
 [[constraints.abs_diff]]
 kind = "interaction"
 tag = "LT"
-input = [["cast", "abs_r", "DWordWL"], ["cast", "abs_d", "DWordWL"], 0]
+input = ["abs_r", "abs_d", 0]
 output = ["not", "div_by_zero"]
 multiplicity = "μ_sum"
 ref ="dvrm:c:abs_r_lt_abs_d"
 
-[[constraints.abs_diff]]
-kind = "interaction"
-tag = "IS_HALF"
-input = [["idx", "abs_r", "i"]]
-iter = ["i", 0, 3]
-multiplicity = "sign_r"
-ref = "dvrm:c:abs_r_range_check"
-
 [[constraints.abs_diff]]
 kind = "template"
-tag = "SUB"
-input = [0, ["cast", "r", "DWordWL"]]
-output = ["cast", "abs_r", "DWordWL"]
+tag = "NEG"
+input = ["r"]
+output = "abs_r"
 cond = "sign_r"
 ref = "dvrm:c:abs_r_if_negative"
 
 [[constraints.abs_diff]]
 kind = "arith"
-constraint = "$not#`sign_r` => #`abs_r[i]`=#`r[i]`$"
-iter = ["i", 0, 3]
-poly = ["*", ["-", 1, "sign_r"], ["-", ["idx", "abs_r", "i"], ["idx", "r", "i"]]]
+constraint = "$not#`sign_r` => #`abs_r` = #`r`$"
+poly = ["*", ["not", "sign_r"], ["-", ["idx", "abs_r", "i"], ["idx", ["cast", "r", "DWordWL"], "i"]]]
+iter = ["i", 0, 1]
 ref = "dvrm:c:abs_r_if_nonnegative"
 
-[[constraints.abs_diff]]
-kind = "interaction"
-tag = "IS_HALF"
-input = [["idx", "abs_d", "i"]]
-iter = ["i", 0, 3]
-multiplicity = "sign_d"
-ref = "dvrm:c:abs_d_range_check"
-
 [[constraints.abs_diff]]
 kind = "template"
-tag = "SUB"
-input = [0, ["cast", "d", "DWordWL"]]
-output = ["cast", "abs_d", "DWordWL"]
+tag = "NEG"
+input = ["d"]
+output = "abs_d"
 cond = "sign_d"
 ref = "dvrm:c:abs_d_if_negative"
 
 [[constraints.abs_diff]]
 kind = "arith"
-constraint = "$not#`sign_d` => #`abs_d[i]`=#`d[i]`$"
-iter = ["i", 0, 3]
-poly = ["*", ["-", 1, "sign_d"], ["-", ["idx", "abs_d", "i"], ["idx", "d", "i"]]]
+constraint = "$not#`sign_d` => #`abs_d` = #`d`$"
+iter = ["i", 0, 1]
+poly = ["*", ["not", "sign_d"], ["-", ["idx", "abs_d", "i"], ["idx", ["cast", "d", "DWordWL"], "i"]]]
 ref = "dvrm:c:abs_d_if_nonnegative"
 
 [[constraint_groups]]

From 39b147995915f823b5e36421ccc6f03bc99daf59 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Fri, 6 Feb 2026 08:49:35 -0300
Subject: [PATCH 060/105] update docs

---
 docs/spec/add.md                    |   30 +-
 docs/spec/bitwise.md                |   26 +-
 docs/spec/branch.md                 |   62 +-
 docs/spec/cpu.md                    |  196 ++---
 docs/spec/decode.md                 |   46 +-
 docs/spec/decode_uncompressed.md    |   46 --
 docs/spec/dvrm.md                   |  215 ++++-
 docs/spec/ecall.md                  |   12 +-
 docs/spec/halt.md                   |   32 -
 docs/spec/is_bit.md                 |   18 +-
 docs/spec/load.md                   |   66 +-
 docs/spec/lt.md                     |   90 +--
 docs/spec/memory.md                 |   18 +-
 docs/spec/memw.md                   |  114 +--
 docs/spec/mul.md                    |  122 +--
 docs/spec/neg.md                    |   73 ++
 docs/spec/shift.md                  |  176 ++--
 docs/spec/sign.md                   |   45 ++
 docs/spec/signatures.md             |   23 +
 docs/spec/spec_full.md              | 1161 ++++++++++++++++++---------
 scripts/extract_and_convert_spec.sh |   14 +-
 scripts/typst_to_md.py              |    3 +
 22 files changed, 1617 insertions(+), 971 deletions(-)
 delete mode 100644 docs/spec/decode_uncompressed.md
 delete mode 100644 docs/spec/halt.md
 create mode 100644 docs/spec/neg.md
 create mode 100644 docs/spec/sign.md
 create mode 100644 docs/spec/signatures.md

diff --git a/docs/spec/add.md b/docs/spec/add.md
index 8711c8493..051a55a27 100644
--- a/docs/spec/add.md
+++ b/docs/spec/add.md
@@ -2,13 +2,11 @@
 
 box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
 
-## Notation
-
-The  constraint template has the following interface:
+= Notation The  constraint template has the following interface:
 
 where `cond` is any value described by an expression _of degree at most `1`_.
 
-### 
+## 
 
 For ease of notation, we moreover introduce the  constraint template. Its interface
 
@@ -16,19 +14,11 @@ maps onto the  template as
 
 It constrains that ``diff` = `lhs` - `rhs` mod 2^64` when the expression `cond` is non-zero. As with ,  can be used to denote the _unconditional_ application of the template.
 
-## Variables
-
-## Assumptions
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `ADD-A1.i` | i ∈ [0, 1] | `IS_WORD[lhs[i]]` |
-| `ADD-A2.i` | i ∈ [0, 1] | `IS_WORD[rhs[i]]` |
-| `ADD-A3.i` | i ∈ [0, 1] | `IS_WORD[sum[i]]` |
+= Variables
 
-## Constraints
+= Assumptions
 
-This template introduces the following constraints
+= Constraints This template introduces the following constraints
 
 ## Columns
 
@@ -63,6 +53,16 @@ carry (when iter=1) := 2^-32 * (lhs[1] + rhs[1] + carry[0] - sum[1])
 |------|------|-------------|
 | `cond` | `BaseField` | Whether the relation should be enforced ($eq.not 0$) or not ($0$). |
 
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `ADD-A1.i` | i ∈ [0, 1] | `IS_WORD[lhs[i]]` |
+| `ADD-A2.i` | i ∈ [0, 1] | `IS_WORD[rhs[i]]` |
+| `ADD-A3.i` | i ∈ [0, 1] | `IS_WORD[sum[i]]` |
+
+## Constraints
+
 ### all
 
 | Tag | Range | Description |
diff --git a/docs/spec/bitwise.md b/docs/spec/bitwise.md
index 93b0f1f1e..50d61d8d4 100644
--- a/docs/spec/bitwise.md
+++ b/docs/spec/bitwise.md
@@ -1,5 +1,15 @@
 # BITWISE Chips
 
+= Columns
+
+The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
+
+*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
+
+= Lookup This chip adds the following interactions to the lookup:
+
+= Areas of Optimization The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicitly range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`, `ZERO`) lookups in separate tables. + Combine `HWSL` and `HWSLC` into a single lookup (see also \).
+
 ## Columns
 
 ### Input
@@ -19,7 +29,7 @@
 | `XOR` | `Byte` | the binary XOR of `X` and `Y` |
 | `MSB8` | `Bit` | the most significant bit of `X` |
 | `MSB16` | `Bit` | the most significant bit of `Y` |
-| `ZERO` | `Bit` | whether $`X` = 0 and `Y` = 0$ |
+| `ZERO` | `Bit` | whether $`X` = 0$, $`Y` = 0$ and $`Z` = 0$. |
 | `SLL` | `Half` | `X\|\|Y` logically left-shifted by `Z`: $((`X` + 256`Y`) `<<` `Z`) mod 2^16$ |
 | `SLLC` | `Half` | `X\|\|Y` logically right-shifted by `Z`: $(`X` + 256`Y`) `>>` (16 - `Z`)$ |
 
@@ -39,18 +49,6 @@
 | `μ_HWSL` | `BaseField` |  |
 | `μ_HWSLC` | `BaseField` |  |
 
-The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
-
-*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
-
-## Lookup
-
-This chip adds the following interactions to the lookup:
-
-## Areas of Optimization
-
-The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, `ZERO`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`) lookups in separate tables. + Combine `HWSL` and `HWSLC` into a single lookup (see also \).
-
 ## Constraints
 
 ### contributions
@@ -62,7 +60,7 @@ The following ideas may prove to be optimizations for the  chip: + Extend `IS_BY
 | `BITWISE-C3` | `XOR_BYTE[XOR; X, Y]` | -μ_XOR |
 | `BITWISE-C4` | `MSB8[MSB8; X]` | -μ_MSB8 |
 | `BITWISE-C5` | `MSB16[MSB16; X + 256 * Y]` | -μ_MSB16 |
-| `BITWISE-C6` | `ZERO[ZERO; X + 256 * Y]` | -μ_ZERO |
+| `BITWISE-C6` | `ZERO[ZERO; X + 256 * Y + 65536 * Z]` | -μ_ZERO |
 | `BITWISE-C7` | `IS_BYTE[X]` | -μ_IS_BYTE |
 | `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
 | `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
diff --git a/docs/spec/branch.md b/docs/spec/branch.md
index 80199d934..9d4a07a76 100644
--- a/docs/spec/branch.md
+++ b/docs/spec/branch.md
@@ -1,5 +1,37 @@
 # BRANCH Chip
 
+= Columns
+
+The `BRANCH` chip is comprised of  variables that are expressed using  columns:
+
+= Assumptions
+
+= Constraints
+
+> **Note:** Check correspondence with CPU for passing in `offset` as word or dword
+
+We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
+
+The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
+| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
+| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
+| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte[0], 254]` | μ |
+| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
+
+This chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
+
+= Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
 ## Columns
 
 ### Input
@@ -49,8 +81,6 @@ next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
-The `BRANCH` chip is comprised of  variables that are expressed using  columns:
-
 ## Assumptions
 
 | Tag | Range | Description |
@@ -58,30 +88,4 @@ The `BRANCH` chip is comprised of  variables that are expressed using  columns:
 | `BRANCH-A1.i` | i ∈ [0, 1] | `pc` is range checked, `IS_WORD[pc[i]]` |
 | `BRANCH-A2` |  | `offset` is range checked, `IS_WORD[offset]` |
 | `BRANCH-A3.i` | i ∈ [0, 1] | `register` is range checked, `IS_WORD[register[i]]` |
-| `BRANCH-A4` |  | `IS_BIT<JALR>` |
-
-## Constraints
-
-> **Note:** Check correspondence with CPU for passing in `offset` as word or dword
-
-We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
-
-The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
-| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
-| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
-| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte[0], 254]` | μ |
-| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
-
-This chip contributes the following to the lookup argument.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
-
-## Padding
-
-The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
+| `BRANCH-A4` |  | `IS_BIT<JALR>` |
\ No newline at end of file
diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index 0d3a4b364..756c67c9e 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -1,96 +1,12 @@
 # CPU Chip
 
-## Columns
-
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
-| `pc` | `DWordWL` | The program counter |
-| `rs1` | `Byte` | Source register 1 index |
-| `rs2` | `Byte` | Source register 2 index |
-| `rd` | `Byte` | Destination register index |
-| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
-| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
-| `write_register` | `Bit` | Whether to write back to the destination register |
-| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
-| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
-| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
-| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
-| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
-| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
-| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and     - as flag for inverting the condition of conditional branches (see `branch_cond`)     - as direction (left or right) for `SHIFT` |
-| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
-| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
-| `ADD` | `Bit` | One-hot ALU selector flag |
-| `SUB` | `Bit` | One-hot ALU selector flag |
-| `SLT` | `Bit` | One-hot ALU selector flag |
-| `AND` | `Bit` | One-hot ALU selector flag |
-| `OR` | `Bit` | One-hot ALU selector flag |
-| `XOR` | `Bit` | One-hot ALU selector flag |
-| `SHIFT` | `Bit` | One-hot ALU selector flag |
-| `JALR` | `Bit` | One-hot ALU selector flag |
-| `BEQ` | `Bit` | One-hot ALU selector flag |
-| `BLT` | `Bit` | One-hot ALU selector flag |
-| `LOAD` | `Bit` | One-hot ALU selector flag |
-| `STORE` | `Bit` | One-hot ALU selector flag |
-| `MUL` | `Bit` | One-hot ALU selector flag |
-| `DIVREM` | `Bit` | One-hot ALU selector flag |
-| `ECALL` | `Bit` | One-hot ALU selector flag |
-| `EBREAK` | `Bit` | One-hot ALU selector flag |
-
-### Output
-
-| Name | Type | Description |
-|------|------|-------------|
-| `next_pc` | `DWordWL` | The program counter for the next instruction |
-| `rvd` | `DWordWL` | The value to (maybe) be written back to rvd |
-
-### Auxiliary
-
-| Name | Type | Description |
-|------|------|-------------|
-| `rv1` | `DWordWHH` | The value of register `rs1` |
-| `rv2` | `DWordWHH` | The value of register `rs2` |
-| `rv1_sign_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word |
-| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
-| `arg2_sign_bit` | `Bit` | The sign bit of `arg2` if seen as a 32-bit word |
-| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
-| `res_sign_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word |
-| `res` | `DWordBL` | The ALU result |
-| `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
-| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
-
-### Virtual
-
-| Name | Type | Description |
-|------|------|-------------|
-| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
-| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
-
-**Definition of `packed_decode`:**
-```
-packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
-```
-
-**Definition of `pad`:**
-```
-pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
-```
+= Columns
 
 The `CPU` chip is comprised of  variables that are expressed using  columns:
 
-## Assumptions
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
-| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+= Assumptions
 
-## Constraints
-
-First, we perform a decoding lookup for the current PC.
+= Constraints First, we perform a decoding lookup for the current PC.
 
 | Tag | Description |
 |-----|-------------|
@@ -98,7 +14,7 @@ First, we perform a decoding lookup for the current PC.
 
 > **Note:** All casts for interactions will have to be reviewed once other chip interfaces stabilise
 
-### Range checks
+## Range checks
 
 > **Note:** Make sure we argue for every column here
 
@@ -142,7 +58,7 @@ We constrain all columns to have the appropriate ranges. The flags and register
 | `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` |
 | `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` |
 
-### ALU
+## ALU
 
 The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
 
@@ -161,7 +77,7 @@ The ALU functionality is then obtained through judicious dispatching to the corr
 | `CPU-CA44` |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
 | `CPU-CA45` |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
 
-### Memory
+## Memory
 
 The interactions with the memory, both for register loading and storing, as for `LOAD` and `STORE` instructions are handled. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The timestamps are ensured to be disjoint for disjoint memory locations. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction.
 
@@ -178,7 +94,7 @@ The interactions with the memory, both for register loading and storing, as for
 | `CPU-CM52` |  | `MEMW[0, res, rv2, timestamp + 1, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
 | `CPU-CM53` |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1, 1, 0, 0]` | 1 - pad |
 
-### System
+## System
 
 The interactions with the wider system.
 
@@ -188,7 +104,7 @@ The interactions with the wider system.
 | | _polynomial:_ `1 - EBREAK = 0` | |
 | `CPU-CS55` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
 
-### Input and output to the ALU
+## Input and output to the ALU
 
 We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, including the appropriate sign/zero extension, depending on `word_instr`.
 
@@ -212,11 +128,9 @@ We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, includ
 | `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
 | | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_sign_bit * (2^32 - 1)) = 0` | |
 
-### Other constraints
-
-> **Note:** proper ref to IsZero/IsEqual
+## Other constraints
 
-For [cpu:c:is_equal], refer to the logic of IsZero or IsEqual, in combination with the subtraction of [cpu:c:sub].
+For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference between `arg1` and `arg2` whenever `BEQ` is `1`. Given that this difference is `0` when both are equal, [cpu:c:is_equal] ensures `is_equal` is set to `1` if and only if ``arg1` = `arg2`` and `BEQ` is set.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
@@ -228,8 +142,94 @@ For [cpu:c:is_equal], refer to the logic of IsZero or IsEqual, in combination wi
 
 > **Note:** Document the choice to not have a multiplicity column here for padding
 
-## Padding
+= Padding
 
 The CPU can be padded with the following values, which have a corresponding row in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
 
-This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
\ No newline at end of file
+This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
+| `pc` | `DWordWL` | The program counter |
+| `rs1` | `Byte` | Source register 1 index |
+| `rs2` | `Byte` | Source register 2 index |
+| `rd` | `Byte` | Destination register index |
+| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
+| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
+| `write_register` | `Bit` | Whether to write back to the destination register |
+| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
+| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
+| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
+| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
+| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
+| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
+| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and     - as flag for inverting the condition of conditional branches (see `branch_cond`)     - as direction (left or right) for `SHIFT` |
+| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
+| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
+| `ADD` | `Bit` | One-hot ALU selector flag |
+| `SUB` | `Bit` | One-hot ALU selector flag |
+| `SLT` | `Bit` | One-hot ALU selector flag |
+| `AND` | `Bit` | One-hot ALU selector flag |
+| `OR` | `Bit` | One-hot ALU selector flag |
+| `XOR` | `Bit` | One-hot ALU selector flag |
+| `SHIFT` | `Bit` | One-hot ALU selector flag |
+| `JALR` | `Bit` | One-hot ALU selector flag |
+| `BEQ` | `Bit` | One-hot ALU selector flag |
+| `BLT` | `Bit` | One-hot ALU selector flag |
+| `LOAD` | `Bit` | One-hot ALU selector flag |
+| `STORE` | `Bit` | One-hot ALU selector flag |
+| `MUL` | `Bit` | One-hot ALU selector flag |
+| `DIVREM` | `Bit` | One-hot ALU selector flag |
+| `ECALL` | `Bit` | One-hot ALU selector flag |
+| `EBREAK` | `Bit` | One-hot ALU selector flag |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `next_pc` | `DWordWL` | The program counter for the next instruction |
+| `rvd` | `DWordWL` | The value to (maybe) be written back to rvd |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `rv1` | `DWordWHH` | The value of register `rs1` |
+| `rv2` | `DWordWHH` | The value of register `rs2` |
+| `rv1_sign_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word |
+| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
+| `arg2_sign_bit` | `Bit` | The sign bit of `arg2` if seen as a 32-bit word |
+| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
+| `res_sign_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word |
+| `res` | `DWordBL` | The ALU result |
+| `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
+| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
+| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
+
+**Definition of `packed_decode`:**
+```
+packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
+```
+
+**Definition of `pad`:**
+```
+pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
+```
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
+| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
\ No newline at end of file
diff --git a/docs/spec/decode.md b/docs/spec/decode.md
index 7e3fc6722..36b942ef4 100644
--- a/docs/spec/decode.md
+++ b/docs/spec/decode.md
@@ -2,33 +2,15 @@
 
 All `RV64IMC` instruction are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table, can be derived from the `packed_decode` variable provided below.
 
-## Columns
-
-### Output
-
-| Name | Type | Description |
-|------|------|-------------|
-| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
-| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`; \ [27:35] `rs1`, \ [35:43] `rs2`, \ [43:51] `rd`, \ the remaining bits are set to zero.  |
-| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
-
-### Multiplicity
-
-| Name | Type | Description |
-|------|------|-------------|
-| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
+= Columns
 
 The  table is comprised of  variables that are expressed using  columns:
 
-## Padding
-
-The  table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
+= Padding The  table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
 
 Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see [cpu:c:ebreak_traps]), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
 
-## Decoding
-
-For the purposes of explaining decoding, we decompress 's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
+= Decoding For the purposes of explaining decoding, we decompress 's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
 
 We will illustrate how each instruction should be expressed in this (uncompressed) decoding table. The columns of the accompanying table represent the following: - *`operation`*: the assembly operation being encoded. - *`op-flag`*: which of the "`ALU` selector flags" operation flags to set. Each operation sets exactly one. - *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively. - *other*: the other flags that should be set or variables that should be given specific values.
 
@@ -36,7 +18,7 @@ For the purpose of brevity and readability, the table uses the following rules-o
 
 Further clarification is provided in the notes following the table.
 
-### C-type instructions
+## C-type instructions
 
 The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
 
@@ -50,7 +32,7 @@ figure(table( columns: (auto, auto, 40pt, 40pt, 1fr, 15pt), stroke: 0pt, inset:
 
 // Construct a note that can be referenced through `lbl` show figure: (it) => align(left, []) [ ] }
 
-#### Notes
+## Notes
 
 We note the following about the above decoding table:
 
@@ -60,4 +42,20 @@ enum.item( referenceable_note( "note_word_instr", [`word_instr`: `[W]` indicates
 
 In addition to decoding all instructions provided in the ELF and adding a corresponding entry to the  table, one must include an entry that has ``pc` = 1` and every other variable set to `0`. Note that this will never conflict with any entry in the ELF, since it has an odd `pc` value.
 
-This entry is used to pad the `CPU` table. More details on this matter are provided in the `CPU` chip.
\ No newline at end of file
+This entry is used to pad the `CPU` table. More details on this matter are provided in the `CPU` chip.
+
+## Columns
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
+| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`; \ [27:35] `rs1`, \ [35:43] `rs2`, \ [43:51] `rd`, \ the remaining bits are set to zero.  |
+| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
\ No newline at end of file
diff --git a/docs/spec/decode_uncompressed.md b/docs/spec/decode_uncompressed.md
deleted file mode 100644
index 4bf226594..000000000
--- a/docs/spec/decode_uncompressed.md
+++ /dev/null
@@ -1,46 +0,0 @@
-# DECODE Chip
-
-## Columns
-
-### Output
-
-| Name | Type | Description |
-|------|------|-------------|
-| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
-| `rs1` | `Byte` | index of source register 1. |
-| `rs2` | `Byte` | index of source register 2. |
-| `rd` | `Byte` | index of destination register. |
-| `read_register1` | `Bit` | whether to load the contents of address `rs1` (1) or `0` (0) into `rv1`. |
-| `read_register2` | `Bit` | whether to load the contents of address `rs2` (1) or `0` (0) into `rv2`. |
-| `write_register` | `Bit` | whether the result should be written to `rd` ($=0$ for memory write and when $`rd` = `x0`$. |
-| `mem_2B` | `Bit` | whether the memory access (read or write) touches exactly $2$ bytes. |
-| `mem_4B` | `Bit` | whether the memory access (read or write) touches exactly $4$ bytes. |
-| `mem_8B` | `Bit` | whether the memory access (read or write) touches exactly $8$ bytes. |
-| `c_type` | `Bit` | Whether the instruction is of type `C`, i.e., whether it is $2$ bytes long instead of $4$. |
-| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
-| `signed` | `Bit` | selector used to indicate signed or unsigned input interpretation. |
-| `mp_selector` | `Bit` | Multi-purpose selector used by the CPU to to configure several ALU operations in different ways.            See the `CPU` chip for more details. |
-| `muldiv_selector` | `Bit` | selects which output of `MUL` (lo/hi) or `DVRM` (quo/rem) is wanted. |
-| `word_instr` | `Bit` | Whether the instruction is a `*W` instruction, requiring the inputs and outputs to be (sign) extended. |
-| `ADD` | `Bit` | ALU selector flag |
-| `SUB` | `Bit` | ALU selector flag |
-| `SLT` | `Bit` | ALU selector flag |
-| `AND` | `Bit` | ALU selector flag |
-| `OR` | `Bit` | ALU selector flag |
-| `XOR` | `Bit` | ALU selector flag |
-| `SHIFT` | `Bit` | ALU selector flag |
-| `JALR` | `Bit` | ALU selector flag |
-| `BEQ` | `Bit` | ALU selector flag |
-| `BLT` | `Bit` | ALU selector flag |
-| `LOAD` | `Bit` | ALU selector flag |
-| `STORE` | `Bit` | ALU selector flag |
-| `MUL` | `Bit` | ALU selector flag |
-| `DIVREM` | `Bit` | ALU selector flag |
-| `ECALL` | `Bit` | ALU selector flag |
-| `EBREAK` | `Bit` | ALU selector flag |
-
-### Multiplicity
-
-| Name | Type | Description |
-|------|------|-------------|
-| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
diff --git a/docs/spec/dvrm.md b/docs/spec/dvrm.md
index fc32ae17f..7295a9f45 100644
--- a/docs/spec/dvrm.md
+++ b/docs/spec/dvrm.md
@@ -1,5 +1,216 @@
 # DVRM Chip
 
-//  chip = load_chip("src/dvrm.toml", config)
+= Columns
 
-*placeholder chapter: WIP*
\ No newline at end of file
+The `DVRM` chip is comprised of  variables that are expressed using  columns:
+
+= Assumptions
+
+= Constraints From the ISA, we gather five requirements for the `DIV[U][W]` and `REM[U][W]` instructions:
+
+enum.item([ _For both signed and unsigned division, except in the case of_ overflow, _it holds that ``n` = `q` `d` + `r``._ ]), enum.item([ _`DIV` and `DIVU` perform [...] signed and unsigned integer division [...] rounding towards zero._ ]), enum.item([ _For `REM`, the sign of a nonzero [remainder] equals the sign of the [numerator]._ ]), enum.item([ In case of _division-by-zero_, ``r` = `n`` and ``q` = 2^64-1` (unsigned) or ``q` = -1` (signed). ]), enum.item([ In case of _overflow_, ``q` = `n`` and ``r` = 0` ]), where _overflow_ occurs when ``n` = -2^(63)` and ``d` = -1` (and, hence, ``signed` = 1`), and _division-by-zero_ indicates that ``d` = 0`. In the following, we list the constraints associated with the  chip, and explain how these together enforce all five of these requirements.
+
+## R3: Sign remainder equals sign numerator
+
+We start with R3, which is straightforwardly asserted by constraint [dvrm:c:sign_r_equals_sign_n].
+
+## R2: rounding towards zero
+
+R2 states that "_[in] signed and unsigned integer division [the quotient is] round[ed] towards zero._" In other words, + the sign of ``n`-`qd`` must match that of `n` (unless ``qd` = `n``), and + `|`n`-`qd`|  < |`d`|` (unless ``d` = 0`).
+
+Leveraging R1 , we can rewrite these as + the sign of ``r`` must match that of `n` (unless ``r` = 0`), and + `|`r`|  < |`d`|` (unless ``d` = 0`).
+
+Focusing on the first statement, we observe that this trivially holds when ``signed` = 0`, while R3 deals with the case that ``signed` = 1`. The second statement is enforced by [dvrm:c:abs_r_lt_abs_d]. [dvrm:c:abs_r_if_negative] and [dvrm:c:abs_r_if_nonnegative] (resp. [dvrm:c:abs_d_if_negative] and [dvrm:c:abs_d_if_nonnegative]) are included to ensure that `abs_r` (resp. `abs_d`) is the absolute value of `r` (resp. `d`).
+
+## R5: overflow
+
+The ISA requires that ``q` = `n`` and ``r` = 0` in the event of overflow (i.e., when ``n` = -2^63` and ``d` = -1`). We note that the second half of this requirement is already satisfied by R2: since ``d` = -1 != 0`, R2 requires that `|`r`| < |`d`| = 1`, to which ``r` = 0` is the only satisfying value.
+
+We moreover find that R1 can be leveraged to enforce the correct value of `q`. While ``n` = `qd` + `r`` (R1) does _not_ hold in the case of overflow, the relation ``n` = |`q`|`d` + `r`` _does_. We moreover note that the 64-bit _signed_ two's complement representation of `-2^63` is identical to the 64-bit _unsigned_ representation of `|-2^63| = 2^63`. As such, by interpreting `q` as an unsigned integer when ``overflow` = 1`, it follows that R1 will enforce ``q` = `0x80...00``.
+
+In summary, in case of overflow, R2 enforces that ``r` = 0`. Moreover, it suffices to interpret `q` as an unsigned integer ([dvrm:c:sign_q]); R1 will ensure it contains the correct value.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C7` | `sign_q` = `signed` dot (1- `overflow`) |  |
+| | _polynomial:_ `signed * (1 - overflow) - sign_q = 0` | |
+| `DVRM-C8` | `ZERO[overflow; n[0] + n[1] + n[2] + (n[3] - 2^15 * sign_n) + (1 - sign_n) + (65535 - d[0]) + (65535 - d[1]) + (65535 - d[2]) + (65535 - d[3])]` | μ_sum |
+
+We highlight [dvrm:c:overflow]. Recall that the `overflow` flag should be set if and only if (i) ``signed` = 1`, (ii) ``n` = `0x80...00``, and (iii) ``d` = `0xFF...FF``. These requirements are equivalent to the state where: $ forall i in [0, 3]:&& 65535 - `d`_i &= 0,\ forall i in [0, 2]:&& `n`_i &= 0,\ && `n`_3 - 2^15 dot `sign_n` &= 0,\ && 1 - `sign_n` &= 0,\ $ where ``signed` = 1` follows from the last equality. The requirement is phrased in this way, because the left-hand sides of the above expressions are `>= 0` by construction. Given that the sum of these expressions does not exceed `2^19` (and thus never wraps in the field), we can now say that the `overflow` bit should be set to `1` if and only if their sum evaluates to `0`. The `ZERO` lookup guarantees this to be the case.
+
+## R1: $#`n` = #`qd` + #`r`$
+
+Rewriting R1, we find the constraint `not`overflow` => `n` - `r` = `qd``.
+
+Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality `mod 2^128`, rather than `mod 2^64`. To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to ``qd` mod 2^128` using constraints [dvrm:c:mul_lower] and [dvrm:c:mul_upper]; [dvrm:c:q_range] is included to uphold assumption [mul:a:rhs].
+
+It now remains to enforce that `extended_n_sub_r` is the _signed_ 128-bit representation of ``n`-`r``. Here, we introduce `extended_n` and `extended_r`. By their definition, these variables contain the signed 128-bit representations of `n` and `r`. The `carry` variable has been defined such that it mimics those in the `ADD` chip, except that here we add two `QuadHL`s rather than two `DWordHL`, thus needing four carry bits instead of two. With this in place, [dvrm:c:n_sub_r] (mimicking [add:c:carry]) ensures `extended_n_sub_r` must contain the correct value.
+
+Lastly, observe that ``n` - `r` in (-2^64, 2^64)`, _regardless_ of the value of `signed`. Moreover, note that the upper halves (i.e., the top 64 bits) of the 128-bit representations of all values in this range are either `0xFF...FF` (negative) or `0x00...00` (non-negative). This means that we do not need to store all 128 bits of `extended_n_sub_r`. Rather, we need only store the lower 64 bits, and a separate bit (`sign_n_sub_r`) indicating whether the top limbs are all-ones or all-zeroes. The prover is free to select the value for `sign_n_sub_r`; only one of the two will fit the proof.
+
+## R4: division-by-zero
+
+R4 requires that ``q` = 2^64-1` (unsigned) or ``q` = -1` (signed) and ``r` = `n`` when ``d` = 0`. Recalling R1, we see that ``n` = `q` `d` + `r` = `r`` when ``d` = 0`, which already enforces the latter. Next, we note that, in 64-bit two's complement, the _unsigned_ value `2^64-1` and _signed_ value `-1` are both represented by the bit string `0xFF...FF`. Hence, only [dvrm:c:q_if_div_by_zero] is required to completely constrain R4; [dvrm:c:div_by_zero] just ensures the `div_by_zero` flag is set when ``d` = 0`.
+
+## Other
+
+The following constraints are included to enforce that the values of `sign_n`, `sign_r` and `sign_d` are correct.
+
+## Output
+
+Lastly, this chip contributes the following to the lookup:
+
+= Padding To pad the `DVRM` table, we use the following data, representing the unsigned division `0 / 0`:
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `n` | `DWordHL` | The numerator |
+| `d` | `DWordHL` | The denominator |
+| `signed` | `Bit` | Whether to interpret the input as signed (1) or unsigned (0) integers. |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `q` | `DWordHL` | The quotient; $`n` / `d`$ rounded towards zero. |
+| `r` | `DWordHL` | The remainder; $`n` - `q` `d`$. |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `div_by_zero` | `Bit` | Whether $`d`=0$. |
+| `overflow` | `Bit` | Whether $`n` = -2^63$ and $`d`=-1$. |
+| `abs_r` | `DWordWL` | Absolute value of `r`. |
+| `abs_d` | `DWordWL` | Absolute value of `d`. |
+| `n_sub_r` | `DWordHL` | $`n`-`r`$. |
+| `sign_n_sub_r` | `Bit` | Sign of `n_sub_r`. |
+| `sign_n` | `Bit` | Sign of `n`. |
+| `sign_d` | `Bit` | Sign of `d`. |
+| `sign_q` | `Bit` | Sign of `q`. |
+| `sign_r` | `Bit` | Sign of `r`. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `extended_n` | `QuadHL` | sign-extended value of `n`. |
+| `extended_r` | `QuadHL` | sign-extended value of `r`. |
+| `extension_n_sub_r` | `DWordHL` | sign-extension limbs of `n_sub_r`. |
+| `extended_n_sub_r` | `QuadHL` | sign-extended value of `n_sub_r`. |
+| `carry` | `Bit[4]` | carries for adding `extended_n_sub_r` to `extended_r`, forming `extended_n`. |
+| `μ_sum` | `BaseField` | sum of multiplicities |
+
+**Definition of `extended_n`:**
+```
+extended_n (when iter=[0, 3]) := n[i]
+extended_n (when iter=[4, 7]) := 65535 * sign_n
+```
+
+**Definition of `extended_r`:**
+```
+extended_r (when iter=[0, 3]) := r[i]
+extended_r (when iter=[4, 7]) := 65535 * sign_r
+```
+
+**Definition of `extension_n_sub_r`:**
+```
+extension_n_sub_r := 65535 * sign_n_sub_r
+```
+
+**Definition of `extended_n_sub_r`:**
+```
+extended_n_sub_r (when iter=[0, 3]) := n_sub_r[i]
+extended_n_sub_r (when iter=[4, 7]) := extension_n_sub_r[i - 4]
+```
+
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] - (extended_n::QuadWL)[i])
+carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] + carry[i - 1] - (extended_n::QuadWL)[i])
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_q + μ_r
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
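+| `μ_q` | `BaseField` | Multiplicity of the quotient output interaction (`DVRM-C21`). |
+| `μ_r` | `BaseField` | Multiplicity of the remainder output interaction (`DVRM-C22`). |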
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `DVRM-A1.i` | i ∈ [0, 3] | `IS_HALF[n[i]]` |
+| `DVRM-A2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` |
+| `DVRM-A3` |  | `IS_BIT<signed>` |
+
+## Constraints
+
+### equality
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
+| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
+| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+
+### defs
+
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
+
+### n_sub_r
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C9.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
+| `DVRM-C10.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
+| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
+| `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
+
+### output
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
+| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+
+### abs_diff
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C2` |  | `LT[1 - div_by_zero; abs_r, abs_d, 0]` | μ_sum |
+| `DVRM-C3` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
+| `DVRM-C4.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
+| | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
+| `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
+| `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
+| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
+
+### div_by_zero
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C19.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
+| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
+| `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
+
+### sign_equality
+
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C1` | `r` ≠ 0 ⇒ `sign_r` = `sign_n` |
+| | _polynomial:_ `(Σ_{i=0}^{3} r[i]) * (sign_r - sign_n) = 0` |
\ No newline at end of file
diff --git a/docs/spec/ecall.md b/docs/spec/ecall.md
index 2d9891074..a25eb052e 100644
--- a/docs/spec/ecall.md
+++ b/docs/spec/ecall.md
@@ -1,27 +1,27 @@
 # ECALL Chips
 
-##  chip
+=  chip
 
-### Columns
+## Columns
 
 The  chip leverages  variable, spanning  columns:
 
-### Assumptions
+## Assumptions
 
 It is assumed the input is range checked:
 
-### Constraints
+## Constraints
 
 The  chip: + makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), + writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and + sets `pc` equal to `1` ([halt:c:pc]). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`; the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
 
 [ Observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there. ])
 
-#### Lookup
+### Lookup
 
 The HALT chip contributes the following interaction to the lookup-argument:
 
 *Note*: [`93` is the system call number corresponding to `sys_exit`.]
 
-### Padding
+## Padding
 
 This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
\ No newline at end of file
diff --git a/docs/spec/halt.md b/docs/spec/halt.md
deleted file mode 100644
index 72ecac037..000000000
--- a/docs/spec/halt.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# HALT Chip
-
-## Columns
-
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `timestamp` | `DWordWL` | timestamp at which to halt the program |
-
-## Assumptions
-
-| Ref | Range | Description |
-|-----|-------|-------------|
-| `A1` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
-
-## Constraints
-
-### all
-
-| Ref | Kind | Range | Description | Multiplicity |
-|-----|------|-------|-------------|--------------|
-| `halt:c:zeroize_registers_lo` | interaction | i ∈ [1, 9] | `MEMW[1, 2 * i, 0, 2^64 - 1, 1, 0, 0]` | 1 |
-| `halt:c:read_zero_exit_code` | interaction |  | `MEMW[1, 2 * 10, 0, 2^64 - 1, 1, 0, 0]` | 1 |
-| `halt:c:zeroize_registers_hi` | interaction | i ∈ [11, 31] | `MEMW[1, 2 * i, 0, 2^64 - 1, 1, 0, 0]` | 1 |
-| `halt:c:pc` | interaction |  | `MEMW[1, 2 * 255, 1, 2^64 - 1, 1, 0, 0]` | 1 |
-
-### lookup
-
-| Ref | Kind | Description | Multiplicity |
-|-----|------|-------------|--------------|
-| `halt:c:lookup` | interaction | `ECALL[timestamp, 93]` | -1 |
diff --git a/docs/spec/is_bit.md b/docs/spec/is_bit.md
index bb6e0090f..b04830776 100644
--- a/docs/spec/is_bit.md
+++ b/docs/spec/is_bit.md
@@ -4,25 +4,17 @@ box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2p
 
 Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
 
-## Interface
-
-The  constraint template has the following interface:
+= Interface The  constraint template has the following interface:
 
 where `cond` is any value described by an expression _of degree at most `1`_. Note that  can be used to denote the _unconditional_ application of the  template to `X`.
 
-## Variables
-
-The  template operates on two variables: `cond` and `X`:
-
-## Constraints
+= Variables The  template operates on two variables: `cond` and `X`:
 
-It takes only one constraint to enforce that `X` must be either `0` or `1` whenever ``cond` eq.not 0`:
+= Constraints It takes only one constraint to enforce that `X` must be either `0` or `1` whenever ``cond` eq.not 0`:
 
 *Note*: - In case of _unconditional_ template application, `cond` can be dropped from the constraint, simplifying it to ``X` (1- `X`) = 0`. - As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression. This is to make sure that [isbit:c:isbit]'s expression has degree at most 3.
 
-## Proof of correctness
-
-If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When ``cond` eq.not 0`, it follows that the statement can only be proven when ``X` (1-`X`) equiv 0 mod p`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either ``X` equiv 0 mod p` or `1-`X` equiv 0 mod p`. Hence, it is proven that when ``cond` eq.not 0`, [isbit:c:isbit] is only satisfied if ``X` in {0, 1}`. 
+= Proof of correctness If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When ``cond` eq.not 0`, it follows that the statement can only be proven when ``X` (1-`X`) equiv 0 mod p`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either ``X` equiv 0 mod p` or `1-`X` equiv 0 mod p`. Hence, it is proven that when ``cond` eq.not 0`, [isbit:c:isbit] is only satisfied if ``X` in {0, 1}`. 
 
 ## Columns
 
@@ -38,6 +30,8 @@ If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any val
 |------|------|-------------|
 | `cond` | `BaseField` | Whether the constraint should be applied ($eq.not 0$) or not ($0$). |
 
+## Constraints
+
 ### all
 
 | Tag | Description |
diff --git a/docs/spec/load.md b/docs/spec/load.md
index 6f519564a..9c9df5644 100644
--- a/docs/spec/load.md
+++ b/docs/spec/load.md
@@ -1,5 +1,38 @@
 # LOAD Chip
 
+= Columns
+
+The `LOAD` chip is comprised of  variables that are expressed using  columns:
+
+= Assumptions
+
+= Constraints The chip delegates the actual memory interaction to the `MEMW` chip, and ensures correctness of the requested sign/zero extension. The output `res` is correctly range-checked as long as the memory contents are.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `LOAD-C1` |  | `read2` + `read4` + `read8` => `μ` |  |
+| | | _polynomial:_ `(read2 + read4 + read8) * (1 - μ) = 0` | |
+| `LOAD-C2` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
+| `LOAD-C3` |  | `MSB8[sign_bit; res[0]]` | read1 |
+| `LOAD-C4` |  | `MSB8[sign_bit; res[1]]` | read2 |
+| `LOAD-C5` |  | `MSB8[sign_bit; res[3]]` | read4 |
+| `LOAD-C6.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
+| `LOAD-C7.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read4 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
+| `LOAD-C8` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read2 - read4 - read8) * (res[1] - signed * sign_bit * 255) = 0` | |
+
+The chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8, signed]` | -μ |
+
+= Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
 ## Columns
 
 ### Input
@@ -42,8 +75,6 @@ read1 := μ - read2 - read4 - read8
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
-The `LOAD` chip is comprised of  variables that are expressed using  columns:
-
 ## Assumptions
 
 | Tag | Range | Description |
@@ -54,33 +85,4 @@ The `LOAD` chip is comprised of  variables that are expressed using  columns:
 | `LOAD-A4` |  | `IS_BIT<read4>` |
 | `LOAD-A5` |  | `IS_BIT<read8>` |
 | `LOAD-A6` |  | `IS_BIT<read2 + read4 + read8>` |
-| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
-
-## Constraints
-
-The chip delegates the actual memory interaction to the `MEMW` chip, and ensures correctness of the requested sign/zero extension. The output `res` is correctly range-checked as long as the memory contents are.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `LOAD-C1` |  | `read2` + `read4` + `read8` => `μ` |  |
-| | | _polynomial:_ `(read2 + read4 + read8) * (1 - μ) = 0` | |
-| `LOAD-C2` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
-| `LOAD-C3` |  | `MSB8[sign_bit; res[0]]` | read1 |
-| `LOAD-C4` |  | `MSB8[sign_bit; res[1]]` | read2 |
-| `LOAD-C5` |  | `MSB8[sign_bit; res[3]]` | read4 |
-| `LOAD-C6.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
-| | | _polynomial:_ `(1 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `LOAD-C7.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
-| | | _polynomial:_ `(1 - read4 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `LOAD-C8` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
-| | | _polynomial:_ `(1 - read2 - read4 - read8) * (res[1] - signed * sign_bit * 255) = 0` | |
-
-The chip contributes the following to the lookup argument.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8]` | -μ |
-
-## Padding
-
-The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
+| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
\ No newline at end of file
diff --git a/docs/spec/lt.md b/docs/spec/lt.md
index a8f65057a..2c77368d2 100644
--- a/docs/spec/lt.md
+++ b/docs/spec/lt.md
@@ -1,5 +1,49 @@
 # LT Chip
 
+= Columns
+
+The `LT` chip is comprised of the following variables and the columns used to express them:
+
+= Assumptions We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
+
+= Constraints We first constrain that all variables correspond to their definition. For the defining constraint of `lt`, `LT-C3`, observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
+
+We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a positive one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
+
+(1) `dash(A) and B and (a < b)`; (2) `A and dash(B) and (a < b)`; (3) `A and B and (a < b)`; (4) `dash(A) and dash(B) and (a < b)`.
+
+The first case is evidently false, while the second case simplifies to `A and dash(B)`. For the third and fourth case, observe that when `A = B`, the `<` relation is preserved by the modular correspondence between `[-2^(63), 2^(63))` and `[0, 2^(64))`. Importantly, this modular correspondence is merely a reinterpretation of the bits or values of `a` and `b`, due to the representation in two's complement. Hence, we can introduce the value `C = unsigned_lt`, that accurately represents the relation `a < b` when `A = B`.
+
+Combining our three remaining cases, we obtain the boolean formula `A dash(B) or A B C or dash(A) dash(B) C`. Since the cases are disjoint, this can be computed with the binary-valued polynomial `P(A, B, C) = A (1 - B) + A B C + (1 - A) (1 - B) C`.
+
+The polynomial `P` can be simplified to a total degree of two. We claim that the polynomial `Q(A, B, C) = A (1 - B) + A C + (1 - B) C` is, for the purposes of this chip, equivalent to `P`. An exhaustive check shows that `P(A, B, C) != Q(A, B, C)` only for the triple `(A, B, C) = (1, 0, 1)`. This is, however, impossible due to the correctness of `ADD`. In more detail, if we let `s` be the (range-checked) difference `a - b` (so the equivalent of the `lhs_sub_rhs` column), and `x'` denote the most significant word of a variable `x`, we need `C · 2^32 + a' = b' + s' + carry[0]`, by the definition of `carry`. However, the left hand side of this is at least `3 · 2^31`, as `(A, C) = (1, 1)`, and the right hand side is at most `(2^31 - 1) + (2^32 - 1) + 1 = 3 · 2^31 - 1`. Therefore, we can use `Q` to constrain `lt` when `signed = 1`.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
+| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
+| `LT-C3` | `lt` = `signed` · (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) · `unsigned_lt` |  |
+| | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
+| `LT-C4` | `IS_HALFWORD[lhs[1]]` | μ |
+| `LT-C5` | `IS_HALFWORD[rhs[1]]` | μ |
+
+And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
+| `LT-C7.i` | i ∈ [0, 3] | `IS_HALFWORD[lhs_sub_rhs[i]]` | μ |
+
+The chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
+
+= Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
 ## Columns
 
 ### Input
@@ -48,54 +92,10 @@ unsigned_lt := carry[1]
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
-The `LT` chip is comprised of  variables that are expressed using  columns:
-
 ## Assumptions
 
 | Tag | Range | Description |
 |-----|-------|-------------|
 | `LT-A1` |  | `IS_WORD[lhs[0]]` |
 | `LT-A2` |  | `IS_WORD[rhs[0]]` |
-| `LT-A3` |  | `IS_BIT<signed>` |
-
-We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
-
-## Constraints
-
-We first constrain that all variables correspond to their definition. For the defining constraint of `lt`, [lt:c:lt], observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
-
-We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a positive one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
-
-+ `dash(A) and B and (a < b)` + `A and dash(B) and (a < b)` + `A and B and (a < b)` + `dash(A) and dash(B) and (a < b)`
-
-The first case is evidently false, while the second case simplifies to `A and dash(B)`. For the third and fourth case, observe that when `A = B`, the `<` relation is preserved by the modular correspondence between `[-2^(31), 2^(31))` and `[0, 2^(64))`. Importantly, this modular correspondence is merely a reinterpretation of the bits or values of `a` and `b`, due to the representation in two's complement. Hence, we can introduce the value `C = `unsigned_lt``, that accurately represents the relation `a < b` when `A = B`.
-
-Combining our three remaining cases, we obtain the boolean formula `A dash(B) or A B C or dash(A) dash(B) C`. Since the cases are disjoint, this can be computed with the binary-valued polynomial `P(A, B, C) = A (1 - B) + A B C + (1 - A) (1 - B) C`.
-
-The polynomial `P` can be simplified to a total degree of two. We claim that the polynomial `Q(A, B, C) = A (1 - B) + A C + (1 - B) C` is, for the purposes of this chip, equivalent to `P`. An exhaustive check shows that `P(A, B, C) != Q(A, B, C)` only for the triple `(A, B, C) = (1, 0, 1)`. This is, however, impossible due to the correctness of `ADD`. In more detail, if we let `s` be the (range-checked) difference `a - b` (so the equivalent of the `lhs_sub_rhs` column), and `x'` denote the most significant word of a variable `x`, we need `c dot 2^32 + a' = b' + s' + `carry[0]``, by the definition of `carry`. However, the left hand side of this is at least `3 dot 2^31`, as `(A, C) = (1, 1)`, and the right hand side is at most `(2^31 - 1) + (2^32 - 1) + 1 = 3 dot 2^31 - 1`. Therefore, we can use `Q` to constrain `lt` when `signed = 1`.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
-| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
-| `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
-| | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
-| `LT-C4` | `IS_HALFWORD[lhs[1]]` | μ |
-| `LT-C5` | `IS_HALFWORD[rhs[1]]` | μ |
-
-And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
-| `LT-C7.i` | i ∈ [0, 3] | `IS_HALFWORD[lhs_sub_rhs[i]]` | μ |
-
-The chip contributes the following to the lookup argument.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
-
-## Padding
-
-The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
+| `LT-A3` |  | `IS_BIT<signed>` |
\ No newline at end of file
diff --git a/docs/spec/memory.md b/docs/spec/memory.md
index 6a5ea151c..d5517248c 100644
--- a/docs/spec/memory.md
+++ b/docs/spec/memory.md
@@ -6,11 +6,11 @@ While RAM is byte addressed, we do choose to store registers as a `DWordWL` over
 
 On a high level, we ensure memory consistency by an interacting system of reads and writes to a lookup argument, combined with an initialization and finalization scheme. The initialization and finalization schemes together ensure both that (1) the necessary preconditions for the lookup system are satisfied, and (2) the program is executed with the correct initial memory and register contents as specified by the ELF binary and the ISA.
 
-## Memory types
+= Memory types
 
 A commonly made distinction of memory types is that of _read-only_ and _read-write_ memory, with the more restrictive read-only variant often allowing for more efficient solutions (be that regarding prover time, verifier time or proof size) via table lookup proofs. Naturally, the VM’s main memory and registers should be handled by a read-write system as the guest program/environment can issue instructions that write to memory. While there are some subsystems that can be modelled as read-only memory ---e.g., the program memory and instruction decoding--- we opt to integrate these into the proof system via chip interactions (relying on techniques derived from table lookup arguments). As such, we only concern ourselves with read-write memory, moving forward.
 
-## Memory operations
+= Memory operations
 
 Every memory operation has some conceptual attributes that are relevant to mention or discuss:
 
@@ -20,7 +20,7 @@ Since we will have to ensure that memory accesses are temporally consistent with
 
 For reasons of completeness (since temporal integrity as discussed below is a security necessity), we cannot deal with multiple accesses to the same address at identical timestamps. However, if multiple accesses are guaranteed to be independent (that is, to different addresses), they can still share a timestamp --- consider, e.g., the case of reading a word as 4 bytes with the `LW` load instruction. This property is already taken into account where possible in the design of the system. For instance, in the CPU chip, we can ensure that there are at most 3 memory accesses not guaranteed to be independent, so a timestamp granularity of 4 timestamps per cycle is enough. ]
 
-## Permutation argument
+= Permutation argument
 
 We can conceptually organise the state of the memory as a collection of "tokens" that represent tuples `(serif("timestamp"), serif("address"), serif("value"))`, meaning the current value written to `serif("address")` is `serif("value")`, last written to memory at `serif("timestamp")`. Having exactly one value associated with any address will be ensured (see further down in this document) by the interaction of memory initialization, memory finalization, and the effects of memory operations.
 
@@ -34,7 +34,7 @@ Naturally, for a read operation, the _values_ embedded in the consumed and emitt
 
 So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens), this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument: consuming a token corresponds to a "receive" and emitting a new token is a "send".
 
-## Temporal integrity
+= Temporal integrity
 
 > **Note:** Properly link/refer to the LT chip
 
@@ -44,7 +44,7 @@ To ensure temporal integrity, every memory operation needs to be constrained for
 
 > **Note:** reference to CPU chip/timestamp column and MEMW chip
 
-## Initialization and Finalization
+= Initialization and Finalization
 
 Because the LogUp argument handling token consumption and emission needs to be fully balanced --- every token emitted should be consumed, and vice versa --- we need to have a system to emit the initial tokens and consume the final tokens. This needs to ensure that every address has at most a single initializing emission, and at most one finalizing consumption. Having at most one initialization will, through the correctness of the lookup argument, immediately lead to having at most one correct finalization, and vice versa.
 
@@ -54,7 +54,7 @@ For our chosen scheme (which we refer to as "paged initialization/finalization")
 
 Concretely, each page gets an associated `PAGE` table, consisting of N variables over N columns. For each such table, the `page` variable is instantiated as the constant base address of the page. The `offset` column is preprocessed, which helps the verifier ensure that each page has a single fixed size, but the verifier should still check that no pages overlap and all `page` values are page-aligned.
 
-### Page initialization
+## Page initialization
 
 > **Note:** check whether we need `fini` to be range-checked
 
@@ -74,16 +74,16 @@ _Sparse initialization/finalization_
 
 One or more STARK tables (depending on the amount of memory used) consisting of `(address, value)` columns are introduced, where for zero-initialization, `value` can be constant zero. Transition constraints ensure that `address` is strictly increasing, enforcing the "at most once" property; `value` is range-checked to consist of bytes. Similar to paged finalization, an additional `timestamp` column is added, containing the final timestamp each address was accessed. This table is then further used to contribute to the LogUp sum as with any other interactions. - The transition constraints can be chosen to only apply on finalization, as at-most-once finalization is enough to ensure consistency. - Sparse initialization is incompatible with paged finalization, see also the remark under free-zero initialization above. - This would require transition constraints, which currently are not needed elsewhere in the VM design - Additionally, for memory use exceeding the capacity of a single initialization/finalization table, some form of transition constraint between tables is needed - Alternatively, transition constraints could potentially be avoided by more integration into the LogUp system, but this could turn out more costly in practice - This is compatible with the above "free zero" initialization - Since a prover-committed address column is needed (rather than a precomputed one), the number of required columns increases. - As an optimization, the address column could potentially be used simultaneously for initialization and finalization - Sparse initialization/finalization reduces the cost for sparse memory access patterns, where only a few addresses would be accessed per page. Most programs and compilers should however favor a memory locality that makes paged initialization/finalization comparable. ]
 
-### Register initialization/finalization
+## Register initialization/finalization
 
 > **Note:** Properly link/reference ECALL/HALT chip
 
 The initial and final state of registers can be entirely known by the verifier, since the relevant initialization values are either zero, or embedded in the ELF, and the final values can be set to a known value by the HALT ecall. As additionally, the number of registers is small, the verifier can directly add the required balancing terms to the LogUp sum.
 
-## Notes and considerations
+= Notes and considerations
 
 - Register reads and writes may interact within a single cycle, so a correct and fixed ordering needs to be ensured - Correctness of initialization and completeness of finalization need to be ensured
 
-## Future topics of interest
+= Future topics of interest
 
 - Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research)
\ No newline at end of file
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
index 613c96612..520abffa4 100644
--- a/docs/spec/memw.md
+++ b/docs/spec/memw.md
@@ -1,5 +1,63 @@
 # MEMW Chip
 
+= Columns
+
+The `MEMW` chip is comprised of the following variables and the columns used to express them:
+
+= Assumptions
+
+Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. These properties are necessary for the consistency of the system as a whole, and therefore we document it here, keeping the type information as a reading help.
+
+= Constraints
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
+| `MEMW-C2` |  | `w2` => `μ_sum` |  |
+| | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
+| `MEMW-C3` |  | `ADD<address_add[0]::DWordWL; base_address, 1>` | w2 |
+| `MEMW-C4.i` | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
+| `MEMW-C5.i` | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
+| `MEMW-C6.i` | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
+| `MEMW-C7` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `MEMW-C8` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `MEMW-C9.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `MEMW-C10.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+
+As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
+
+We additionally check that the address does not overflow for more significant bytes of the access.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW-CR11` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
+| `MEMW-CR12` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
+| `MEMW-CR13` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
+
+The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW-CM14` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM15` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM16` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM17` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM18.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM19.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM20.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM21.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+
+This chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW-CO22` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
+| `MEMW-CO23` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+
+= Future optimization ideas
+
+- Fast path for aligned memory access where all bytes have the same old timestamp - MEMB chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Compute `base_address[1] + 1` once and have high words of `address_add` as Words - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALFWORD` lookups may make some GKR things faster if there are known zeroes.
+
 ## Columns
 
 ### Input
@@ -57,8 +115,6 @@ w4 := write4 + write8
 | `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
 | `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
 
-The `MEMW` chip is comprised of  variables that are expressed using  columns:
-
 ## Assumptions
 
 | Tag | Range | Description |
@@ -68,56 +124,4 @@ The `MEMW` chip is comprised of  variables that are expressed using  columns:
 | `MEMW-A3` |  | `IS_BIT<write4>` |
 | `MEMW-A4` |  | `IS_BIT<write8>` |
 | `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
-| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
-
-Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. These properties are necessary for the consistency of the system as a whole, and therefore we document it here, keeping the type information as a reading help.
-
-## Constraints
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
-| `MEMW-C2` |  | `w2` => `μ_sum` |  |
-| | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C3` |  | `ADD<address_add[0]::DWordWL; base_address, 1>` | w2 |
-| `MEMW-C4.i` | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
-| `MEMW-C5.i` | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
-| `MEMW-C6.i` | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
-| `MEMW-C7` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `MEMW-C8` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `MEMW-C9.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `MEMW-C10.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
-
-As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
-
-We additionally check that the address does not overflow for more significant bytes of the access.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `MEMW-CR11` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
-| `MEMW-CR12` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
-| `MEMW-CR13` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
-
-The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `MEMW-CM14` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `MEMW-CM15` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM16` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM17` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `MEMW-CM18.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM19.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `MEMW-CM20.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM21.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
-
-This chip contributes the following to the lookup argument.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `MEMW-CO22` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
-| `MEMW-CO23` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
-
-## Future optimization ideas
-
-- Fast path for aligned memory access where all bytes have the same old timestamp - MEMB chip that deals does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Compute `base_address[1] + 1` once and have high words of `address_add` as Words - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALFWORD` lookups may make some GKR things faster if there are known zeroes.
\ No newline at end of file
+| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
\ No newline at end of file
diff --git a/docs/spec/mul.md b/docs/spec/mul.md
index 25f7b02e3..894f2ee03 100644
--- a/docs/spec/mul.md
+++ b/docs/spec/mul.md
@@ -1,5 +1,65 @@
 # MUL Chip
 
+= Columns
+
+The `MUL` chip is comprised of the following variables and the columns used to express them:
+
+`mat(delim: , top; bottom)` }
+
+= Assumptions The following range checks are assumed to be performed/enforced outside of this chip:
+
+= Constraints
+
+## Overview
+
+When `lhs` and `rhs` are _unsigned_ integers, computing their product `mod 2^128` comes down to evaluating $$ \left(\sum_{j=0}^{3} 2^{16j} \cdot \mathtt{lhs}_j\right) \cdot \left(\sum_{i=0}^{3} 2^{16i} \cdot \mathtt{rhs}_i\right) \bmod 2^{128}. $$ If `lhs` and `rhs` are signed instead, the computation remains nearly identical: based on their signs, one must either zero- or one-extend `lhs` and `rhs` --- forming `lhs_ext` and `rhs_ext` respectively --- and compute their product `mod 2^128`: $$ \left(\sum_{j=0}^{7} 2^{16j} \cdot \mathtt{lhs\_ext}_j\right) \cdot \left(\sum_{i=0}^{7} 2^{16i} \cdot \mathtt{rhs\_ext}_i\right) \bmod 2^{128}, $$ where `lhs_ext` and `rhs_ext` are treated as _unsigned_ integers. Note that by setting the extension limbs of `lhs` and/or `rhs` to `0` when the integer is (i) unsigned or (ii) signed and non-negative, this second formula still applies. For the purposes of constraining the multiplication operation, we rewrite this formula as
+
+$$ \begin{aligned} &\left(\sum_{j=0}^{7} 2^{16j} \cdot \mathtt{lhs\_ext}_j\right) \cdot \left(\sum_{i=0}^{7} 2^{16i} \cdot \mathtt{rhs\_ext}_i\right) \bmod 2^{128} \\ &\equiv \sum_{j=0}^{7} \sum_{i=0}^{7} 2^{16(i+j)} \cdot \mathtt{lhs\_ext}_j \cdot \mathtt{rhs\_ext}_i \bmod 2^{128} \\ &\stackrel{\triangle}{\equiv} \sum_{j=0}^{7} \sum_{i=0}^{7-j} 2^{16(i+j)} \cdot \mathtt{lhs\_ext}_j \cdot \mathtt{rhs\_ext}_i \bmod 2^{128} \\ &\stackrel{\square}{\equiv} \sum_{j=0}^{7} \sum_{i=j}^{7} 2^{16i} \cdot \mathtt{lhs\_ext}_j \cdot \mathtt{rhs\_ext}_{i-j} \bmod 2^{128} \\ &\stackrel{\text{⬠}}{\equiv} \sum_{i=0}^{7} \sum_{j=0}^{i} 2^{16i} \cdot \mathtt{lhs\_ext}_j \cdot \mathtt{rhs\_ext}_{i-j} \bmod 2^{128} \\ &\equiv \sum_{i=0}^{3} \sum_{k=0}^{1} \sum_{j=0}^{2i+k} 2^{16(2i+k)} \cdot \mathtt{lhs\_ext}_j \cdot \mathtt{rhs\_ext}_{2i+k-j} \bmod 2^{128} \\ &\equiv \sum_{i=0}^{3} 2^{32i} \cdot \sum_{k=0}^{1} 2^{16k} \cdot \sum_{j=0}^{2i+k} \mathtt{lhs\_ext}_j \cdot \mathtt{rhs\_ext}_{2i+k-j} \bmod 2^{128} \end{aligned} $$ where at step $\triangle$ we can ignore `i > 7-j`, since that makes `2^(16(i+j)) ≡ 0 mod 2^128`; at step $\square$ we rewrite the second summation such that `i` iterates from `j` to `7`, rather than from `0` to `7-j`; and at step ⬠ we swap the sums.
+
+We let `raw_product` capture the second summation in this last formula (see `MUL-C6`). By construction, `raw_product[i] < 2^51` for all `i ∈ [0, 3]`, far exceeding the 32 bits that fit in a single `Word`-limb. What remains then is to reduce each limb of `raw_product` `mod 2^32`, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
+
+This reduce-and-carry operation is constrained by `MUL-C3`/`MUL-C4` and `MUL-C5`, combined with `carry`'s definition. `MUL-C5` and `carry`'s definition enforce that $$ \forall i \in [0, 3]: \mathtt{raw\_product}_i + \mathtt{carry}_{i-1} - \mathtt{res}_i \in \{ k \cdot 2^{32} \mid k \in [0, 2^{20}) \} $$ with `carry[-1] = 0` for simplicity. In other words: `res[i] ≡ raw_product[i] + carry[i-1] (mod 2^32)`. With `MUL-C3`/`MUL-C4` forcing `res[i] < 2^32`, `res[i]` can only assume one value: `(raw_product[i] + carry[i-1]) mod 2^32`.
+
+*Note*: one may have observed that `MUL-C5` requires `carry[i] ∈ [0, 2^20)`, while no limb of a valid carry value would ever exceed `2^19`. This is indeed the case. However, there is some slack in how tightly one has to constrain the `carry` values. In fact, in this situation it suffices to assert that `carry[i] < p / 2^32 ≈ 2^31`, where `p` denotes the field's modulus. Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
+
+## Definitions
+
+We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
+| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
+| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
+| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
+| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
+
+## Product
+
+`MUL-C6` defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i] = Σ_{k=0}^{1} 2^(16k) · Σ_{j=0}^{2i+k} lhs_ext[j] · rhs_ext[2i+k-j]` |
+| | | _polynomial:_ `Σ_{k=0}^{1} 2^(16 * k) * Σ_{j=0}^{2 * i + k} lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
+
+## Lookup
+
+The `MUL` chip contributes the following to the lookup:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
+| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
+
+= Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+= Notes - `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere.
+
+As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
+
 ## Columns
 
 ### Input
@@ -72,69 +132,9 @@ carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
 | `μ_lo` | `BaseField` |  |
 | `μ_hi` | `BaseField` |  |
 
-The `MUL` chip is comprised of  variables that are expressed using  columns:
-
-`mat(delim: , top; bottom)` }
-
 ## Assumptions
 
 | Tag | Range | Description |
 |-----|-------|-------------|
 | `MUL-A1.i` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` |
-| `MUL-A2.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
-
-The following range checks are assumed to be performed/enforced outside of this chip:
-
-## Constraints
-
-### Overview
-
-When `lhs` and `rhs` are _unsigned_ integers, computing their product `mod 2^128` comes down to evaluating $ (sum_(j=0)^3 2^(16j) dot `lhs`_j) dot (sum_(i=0)^3 2^(16i) dot `rhs`_i) mod 2^128. $ If `lhs` and `rhs` are signed instead, the computation remains nearly identical: based on their signs, one must either zero or one-extend `lhs` and `rhs` --- forming `lhs_ext` and `rhs_ext` respectively --- and compute their product `mod 2^128`: $ (sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128. $ where `lhs_ext` and `rhs_ext` are treated as _unsigned_ integers. Note that by setting the extension limbs of `lhs` and/or `rhs` to `0` when the integer is (i) unsigned or (ii) signed and non-negative, this second formula still applies. For the purposes of constraining the multiplication operation, we rewrite this formula as
-
-$ &(sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128 \ &equiv sum_(j=0)^7 sum_(i=0)^7 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(triangle, equiv) sum_(j=0)^7 sum_(i=0)^(7-j) 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(square, equiv) sum_(j=0)^7 sum_(i=j)^(7) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &stackrel(penta, equiv) sum_(i=0)^7 sum_(j=0)^(i) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &equiv sum_(i=0)^3 sum_(k=0)^1 sum_(j=0)^(2i+k) 2^(16(2i+k)) dot `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 \ &equiv sum_(i=0)^3 2^(32i) dot sum_(k=0)^1 2^(16k) dot sum_(j=0)^(2i+k) `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 $ where at step - `triangle` we can ignore `i > 7-j`, since that makes `2^(16(i+j)) equiv 0 mod 2^128`, - `square` we rewrite the second summation such that `i` iterates from `j` to 7, rather than `0` to `7-j`, and - `penta` we swap the sums.
-
-We let `raw_product` capture the second summation in this last formula (see [mul:c:raw_product]). By construction, ``raw_product`_i < 2^51` for all `i in [0, 3]`, far exceeding the 32-bits that fit in a single `Word`-limb. What remains then is to reduce each limb of `raw_product` `mod 2^32`, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
-
-This reduce-and-carry operation is constrained by [mul:c:range_lo]/[mul:c:range_hi] and [mul:c:carry], combined with `carry`'s definition. [mul:c:carry] and `carry`'s definition enforce that $ forall i in [0, 3]: `raw_product`_i + `carry`_(i-1) - `res`_i in { k dot 2^32 | k in [0, 2^20) } $ with ``carry`_(-1) = 0` for simplicity. In other words: ``res`_i equiv `raw_product`_i + `carry`_(i-1) (mod 2^32)`. With [mul:c:range_lo]/[mul:c:range_hi] forcing ``res`_i < 2^32`, ``res`_i` can only assume one value: ``raw_product`_i + `carry`_(i-1) mod 2^32`.
-
-*Note*: one may have observed that [mul:c:carry] requires ``carry`_i in [0, 2^20)`, while no limb of a valid carry value would ever exceed `2^19`. This is indeed the case. However, there is some slack in how tight one has to constrain the `carry` values. In fact, in this situation it suffices to assert that ``carry`_i < frac(p, 2^32, style: "skewed") approx 2^31`, where `p` denotes the field's modulus. Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
-
-### Definitions
-
-We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
-| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
-| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
-| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
-| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
-
-### Product
-
-[mul:c:raw_product] defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
-| | | _polynomial:_ `Σ_k = 0^1 2^(16 * k) * Σ_j = 0^2 * i + k lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
-
-### Lookup
-
-The  chip contributes the following to the lookup:
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
-| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
-
-## Padding
-
-The table can be padded to the next power of two with the following value assignments:
-
-## Notes
-
-- `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere.
-
-As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
\ No newline at end of file
+| `MUL-A2.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
\ No newline at end of file
diff --git a/docs/spec/neg.md b/docs/spec/neg.md
new file mode 100644
index 000000000..8535c1066
--- /dev/null
+++ b/docs/spec/neg.md
@@ -0,0 +1,73 @@
+# NEG Template
+
+box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
+
+**Notation.** The `NEG` constraint template has the following interface:
+
+where `cond` is a bit value (i.e., lies in `{0, 1}`)  described by an expression _of degree at most `1`_.
+
+= Variables
+
+= Assumptions
+
+= Constraints We constrain this equality using two constraints:
+
+The constraints force the `carry` values to be fixed: when `cond` is set, `NEG-C1` forces `carry[0] = 1` exactly when `(x as DWordWL)_0 != 0`, and `NEG-C2` forces `carry[1] = 1` exactly when `x != 0`. Writing out `carry`'s definition, and using $x_i$ as shorthand for `(x as DWordWL)_i`, we then find that
+
+$$ \mathtt{neg}_0 = \begin{cases} 2^{32} - x_0 & \text{if } x_0 \neq 0, \\ 0 & \text{if } x_0 = 0, \end{cases} \qquad \mathtt{neg}_1 = \begin{cases} 2^{32} - x_1 - \mathtt{carry}_0 & \text{if } \mathtt{x} \neq 0, \\ 0 & \text{if } \mathtt{x} = 0. \end{cases} $$
+
+Clearly, `neg = 0` when `x = 0` (and `cond` is set). For non-zero `x`, we distinguish two cases. When `(x as DWordWL)_0 = 0` (so `carry[0] = 0`), $$ \begin{aligned} \mathtt{neg} &= 2^{32} \cdot \mathtt{neg}_1 + \mathtt{neg}_0 \\ &= 2^{32} \cdot (2^{32} - x_1) + 0 \\ &= 2^{32} \cdot (2^{32} - x_1) - x_0 \\ &= 2^{64} - (2^{32} \cdot x_1 + x_0) \\ &= 2^{64} - \mathtt{x} \\ &\equiv -\mathtt{x} \pmod{2^{64}}, \end{aligned} $$
+
+while when `(x as DWordWL)_0 != 0` (so `carry[0] = 1`),
+
+$$ \begin{aligned} \mathtt{neg} &= 2^{32} \cdot \mathtt{neg}_1 + \mathtt{neg}_0 \\ &= 2^{32} \cdot (2^{32} - x_1 - 1) + (2^{32} - x_0) \\ &= 2^{64} - 2^{32} \cdot x_1 - 2^{32} + 2^{32} - x_0 \\ &= 2^{64} - (x_0 + 2^{32} \cdot x_1) \\ &= 2^{64} - \mathtt{x} \\ &\equiv -\mathtt{x} \pmod{2^{64}} \end{aligned} $$ when `cond` is set. When `cond` is not set, the two lookups are not executed, allowing `neg` to take any value in either case.
+
+**Note:** It is worth noting that this construction does _not_ require the limbs of `neg` to be range checked, thus allowing it to be represented by the unrangecheckable `DWordWL` rather than a `DWordHL`. The input value `x` is still assumed to be range-checked, however.
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `x` | `DWordHL` | value to compute negation of |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `neg` | `DWordWL` | negation of `x` if $`cond` != 0$; unconstrained otherwise. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `carry` | `Bit[2]` | carries of the addition $`neg` + `x`$. |
+
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * ((x::DWordWL)[0] + neg[0])
+carry (when iter=1) := 2^-32 * ((x::DWordWL)[1] + neg[1] + carry[0])
+```
+
+### Condition
+
+| Name | Type | Description |
+|------|------|-------------|
+| `cond` | `Bit` | condition on whether to negate x |
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `NEG-A1.i` | i ∈ [0, 3] | `IS_HALF[x[i]]` |
+| `NEG-A2` |  | `IS_BIT<cond>` |
+
+## Constraints
+
+### all
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `NEG-C1` | `ZERO[1 - carry[0]; x[0] + x[1]]` | cond |
+| `NEG-C2` | `ZERO[1 - carry[1]; x[0] + x[1] + x[2] + x[3]]` | cond |
\ No newline at end of file
diff --git a/docs/spec/shift.md b/docs/spec/shift.md
index bd9b9980a..cbee34a08 100644
--- a/docs/spec/shift.md
+++ b/docs/spec/shift.md
@@ -1,8 +1,6 @@
 # SHIFT Chip
 
-## Interface
-
-The  chip has the following interface:
+**Interface.** The `SHIFT` chip has the following interface:
 
 ``` // param in: the value being shifted // param shift: the number of bits to shift `in` by // param direction: whether to shift left (0) or right (1) // param signed: whether to interpret `in` as a signed (1) or unsigned (0) integer // param word_instr: whether to execute the SLL/SR* (0) or SLLW/SR*W (1) instruction // out shifted: the resulting value SHIFT[shifted: DWord; in: DWord, shift: Byte, direction: Bit, signed: Bit, word_instr: Bit] ``` In other words, the  chip is designed to constrain that $
 
@@ -10,6 +8,93 @@ $ $
 
 $ Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.
 
+= Columns
+
+The `SHIFT` chip is comprised of  variables that are expressed using  columns:
+
+= Assumptions
+
+**Explanation.** This chip has a rather complex design as a result of designing it to fit in as few columns as possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
+
+The chip's design revolves around a two-phase shifting process: 1. shift `in` by `x := `shift` mod 16` bits, 2. shift that result by `(`shift`-x) mod 64` (or `mod 32` if ` `word_instr` = 1`). The intermediate value representing the state between the two phases is stored in the scratch variables `X` and `Y`. The definition of `shifted` describes how one can combine the `X`, `Y` and `extension` variables to construct the output value as described using `Half`-limbs. The output variable `out` is equivalent to `shifted`, but expressed using `Word`-limbs.
+
+In the following, we cover how these two phases were designed to complement one another. Here, we start with discussing the _logical_ left/right shift operations only; the modifications required to compute the _arithmetic_ right shift will be discussed at the end.
+
+## First phase
+
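+We zoom in on the first step. Here, we make use of two lookup operations: `HWSL[x: Half, y: B4] := (x << y) mod 2^16` (short for "HalfWord Shift Left"), and `HWSLC[x: Half, y: B4] := x >> (16 - y)` (short for "HalfWord Shift Left's Carry"). Note here that one can use these two lookups to compute `out: Half[4] := in << y` limb by limb as
+
+$$ \mathtt{out}_i = \mathtt{HWSL}[\mathtt{in}_i, \mathtt{y}] + \mathtt{HWSLC}[\mathtt{in}_{i-1}, \mathtt{y}] \quad \text{for } i \in [0, 3], \text{ where the } \mathtt{HWSLC} \text{ term is } 0 \text{ for } i = 0, $$
+
+as long as `y < 16`. Observing that `HWSL[x, 16 - y] = (x << (16 - y)) mod 2^16` and `HWSLC[x, 16 - y] = x >> y` for `y in [1, 15]`, one can also use these lookups to compute `out := in >> y` as $$ \mathtt{out}_i = \mathtt{HWSLC}[\mathtt{in}_i, 16 - \mathtt{y}] + \mathtt{HWSL}[\mathtt{in}_{i+1}, 16 - \mathtt{y}] \quad \text{for } i \in [0, 3], \text{ where the } \mathtt{HWSL} \text{ term is } 0 \text{ for } i = 3, $$ as long as `0 < y < 16`.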
+
+Observe now that the values being looked up are (almost) independent of the direction of the shift: only the shift amount varies slightly. When we now define
+
+$$ \mathtt{bit\_shift} := \begin{cases} \mathtt{shift} \bmod 16 & \text{when shifting left}, \\ (16 - \mathtt{shift}) \bmod 16 & \text{when shifting right}, \end{cases} $$ it only takes some rearranging and combining of the values `X[i] := HWSL[in[i], bit_shift]` and `Y[i] := HWSLC[in[i], bit_shift]` to form the limbs of `in <</>> (shift mod 16)`. In the remaining case that `right = 1` and `shift = 0 mod 16`, the limbs of `in <</>> (shift mod 16)` simply match those of `in`.
+
+## Second phase
+
+Since we're operating on 16-bit limbs, all the limbs in ``in <</>> shift`` must also occur somewhere in ``in <</>> shift` mod 16`. The number of full-limbs we still need to shift is determined by the fifth and sixth least significant bit of `shift`. With `limb_shift` containing a unary decoding of the integer represented by these two bits, we find that the intermediate value needs to be shifted over by `i` limbs (to the `left` or `right`) when ``limb_shift[`i`]` = 1`. These things combined yield `shifted`'s definition.
+
+Of course, when ``word_instr` = 1` and, thus, only ``shift` mod 32` should be considered, the bit-mask for the lookup constraining `limb_shift` is adjusted appropriately (see [shift:c:limb_shift_lookup]).
+
+## Arithmetic right shift
+
+Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `extension` is constrained to contain a repetition of `in`'s most significant bit. Copies of this variable are used for any full limbs shifted in when ``right` = `signed` = 1`. Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of ``in >>> shift` mod 16` as the appropriate intermediate.
+
+= Constraints First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that ``right` = 1` and ``shift` = 0 mod 16`.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C3` | `AND_BYTE[bit_shift; shift, 15]` | left |
+| `SHIFT-C4` | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
+| `SHIFT-C5` | `ZERO[zbs; bit_shift]` | μ |
+
+Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
+
+The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows. To prevent unnecessary lookups in padding rows, we override ``X[i]` := `in[i]`` and ``Y[i]` := 0` here.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C6.i` | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C7.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
+| | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
+| `SHIFT-C8` |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
+| `SHIFT-C9` |  | `zbs` => `X[4]` = 0 |  |
+| | | _polynomial:_ `zbs * X[4] = 0` | |
+| `SHIFT-C10.i` | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C11.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
+
+## Full-limb shifting
+
+Next, we constrain that `limb_shift` is a proper unary encoding of the fifth (and sixth if ``word_instr` = 0`) bit of `shift`. For this to be the case, three requirements must be satisfied: (1) *unary(0)*: ``limb_shift[`i`]` in {0, 1}` for `i in [0, 3]`, (2) *unary(1)*: ``limb_shift[`i`]` = 1` for exactly one `i`, and (3) *proper encoding*: ``limb_shift[`i`]` = 1 <=> 1/16 (`shift &` (48-32 dot `word_instr`)) = i`. The first requirement is enforced by constraint [shift:c:limb_shift_is_bit]. To construct a constraint for the second and third requirement, observe that $ 1/16 dot (`shift &` (48-32 dot `word_instr`)) in cases( {0, 1, 2, 3} &"if" `word_instr` = 0, {0, 1} &"if" `word_instr` = 1 ) $ Observe moreover that, assuming *unary(0)*, the expression $ 1/16 dot (1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]`) $ can evaluate to `i` if and only if ``limb_shift[`i`]` = 1`, while the others are `0`. This means that the relation $ 1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]` = `shift &` (48-32 dot `word_instr`) $ enforces both *unary(1)* and *proper encoding*. This is the exact relation [shift:c:limb_shift_lookup] enforces.
+
+Hereafter, one must only check that `out` is the proper cast of `shifted` into a `DWordWL`.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C12.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `SHIFT-C13` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
+| `SHIFT-C14.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+| | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
+
+## Miscellaneous
+
+*Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
+
+## Lookups
+
+This chip adds the following interaction to the lookup.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C15` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
+
+# Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
 ## Columns
 
 ### Input
@@ -87,8 +172,6 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
-The `SHIFT` chip is comprised of  variables that are expressed using  columns:
-
 ## Assumptions
 
 | Tag | Range | Description |
@@ -99,91 +182,8 @@ The `SHIFT` chip is comprised of  variables that are expressed using  columns:
 | `SHIFT-A4` |  | `IS_BIT<signed>` |
 | `SHIFT-A5` |  | `IS_BIT<word_instr>` |
 
-## Explanation
-
-This chip has a rather complex design as a result of designing it to fit in as few columns possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
-
-The chip's design revolves around a two-phase shifting process: 1. shift `in` by `x := `shift` mod 16` bits, 2. shift that result by `(`shift`-x) mod 64` (or `mod 32` if ` `word_instr` = 1`). The intermediate value representing the state between the two phases is stored in the scratch variables `X` and `Y`. The definition of `shifted` describes how one can combine the `X`, `Y` and `extension` variables to construct the output value as described using `Half`-limbs. The output variable `out` is equivalent to `shifted`, but expressed using `Word`-limbs.
-
-In the following, we cover how these two phases were designed to complement one another. Here, we start with discussing the _logical_ left/right shift operations only; the modifications required to compute the _arithmetic_ right shift will be discussed at the end.
-
-### First phase
-
-We zoom in on the first step. Here, we make use of the two lookup operations - ``HWSL[x: Half, y: B4]` := (`x` `<<` `y`) mod 2^16` (short for "HalfWord Shift Left"), and - ``HWSLC[x: Half, y: B4]` := `x` `>>` (16-`y`)` (short for "HalfWord Shift Left's Carry") Note here that one can use these two lookups to compute `out: Half[4] := in << y` as: $
-
-$ as long as ``y` < 16`. Observing that ``HWSL[x,` 16-`y]` = (`x` `<<` (16-`y`)) mod 2^16`, and ``HWSLC[x,` 16-`y]` = `x` `>>` `y`` for ``y` in [1, 15]`, one can also use these lookups to compute `out := in >> y` as $
-
-$ as long as `0 < `y` < 16`.
-
-Observe now that the values being looked up are (almost) independent from the direction of the shift: only the shift-amount varies slightly. When we now define $
-
-(16-`shift`) mod 16 & "when shifting right" ), $ it only takes some rearranging and combining of the values ``X[`i`] := HWSL[in[`i`], bit_shift]`` and ``Y[`i`] := HWSLC[in[`i`], bit_shift]`` to form the limbs of ``in <</>> shift` mod 16`. In the remaining case that ``right` = 1` and ``shift` = 0 mod 16`, the limbs of ``in <</>> shift` mod 16` simply match those of `in`.
-
-### Second phase
-
-Since we're operating on 16-bit limbs, all the limbs in ``in <</>> shift`` must also occur somewhere in ``in <</>> shift` mod 16`. The number of full-limbs we still need to shift is determined by the fifth and sixth least significant bit of `shift`. With `limb_shift` containing a unary decoding of the integer represented by these two bits, we find that the intermediate value needs to be shifted over by `i` limbs (to the `left` or `right`) when ``limb_shift[`i`]` = 1`. These things combined yield `shifted`'s definition.
-
-Of course, when ``word_instr` = 1` and, thus, only ``shift` mod 32` should be considered, the bit-mask for the lookup constraining `limb_shift` is adjusted appropriately (see [shift:c:limb_shift_lookup]).
-
-### Arithmetic right shift
-
-Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `extension` is constrained to contain a repetition of `in`'s most significant bit. Copies of this variable are used for any full limbs shifted in when ``right` = `signed` = 1`. Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of ``in >>> shift` mod 16` as the appropriate intermediate.
-
 ## Constraints
 
-First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that ``right` = 1` and ``shift` = 0 mod 16`.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C3` | `AND_BYTE[bit_shift; shift, 15]` | left |
-| `SHIFT-C4` | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
-| `SHIFT-C5` | `IsZero<zbs; bit_shift>` | μ |
-
-Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
-
-The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows. To prevent unnecessary lookups in padding rows, we override ``X[i]` := `in[i]`` and ``Y[i]` := 0` here.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `SHIFT-C6.i` | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C7.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
-| | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
-| `SHIFT-C8` |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
-| `SHIFT-C9` |  | `zbs` => `X[4]` = 0 |  |
-| | | _polynomial:_ `zbs * X[4] = 0` | |
-| `SHIFT-C10.i` | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C11.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
-| | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
-
-### Full-limb shifting
-
-Next, we constrain that `limb_shift` is a proper unary encoding of the fifth (and sixth if ``word_instr` = 0`) bit of `shift`. For this to be the case, three requirements must be satisfied: + *unary(0)*: ``limb_shift[`i`]` in {0, 1}` for `i in [0, 3]`, + *unary(1)*: ``limb_shift[`i`]` = 1` for exactly one `i`, and + *proper encoding*: ``limb_shift[`i`]` = 1 <=> 1/16 (`shift &` (48-32 dot `word_instr`)) = i` The first requirement is enforced by constraint [shift:c:limb_shift_is_bit]. To construct a constraint for the second and third requirement, observe that $ 1/16 dot (`shift &` (48-32 dot `word_instr`)) in cases( {0, 1, 2, 3} &"if" `word_instr` = 0, {0, 1} &"if" `word_instr` = 1 $ Observe moreover that, assuming *unary(0)*, the expression $ 1/16 dot (1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]`) $ can evaluate to `i` if and only if ``limb_shift[`i`]` = 1`, while the others are `0`. This means that the relation $ 1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]` = `shift &` (48-32 dot `word_instr`) $ enforces both *unary(1)* and *proper encoding*. This is the exact relation [shift:c:limb_shift_lookup] enforces.
-
-Hereafter, one must only check that `out` is the proper cast of `shifted` into a `DWordWL`.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `SHIFT-C12.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
-| `SHIFT-C13` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
-| `SHIFT-C14.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
-| | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
-
-### Miscellaneous
-
-*Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
-
-### Lookups
-
-This chip adds the following interaction to the lookup.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C15` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
-
-## Padding
-
-The table can be padded to the next power of two with the following value assignments:
-
 ### is_negative
 
 | Tag | Description | Multiplicity |
diff --git a/docs/spec/sign.md b/docs/spec/sign.md
new file mode 100644
index 000000000..a656215bf
--- /dev/null
+++ b/docs/spec/sign.md
@@ -0,0 +1,45 @@
+# SIGN Template
+
+box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
+
+**Interface.** The `SIGN` constraint template has the following interface:
+
+It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are `1`, and `0` otherwise.
+
+**Variables.** The `SIGN` template operates on three variables:
+
+**Assumptions.** The `SIGN` template operates on the following assumptions:
+
+**Constraints.** It takes only two constraints to compute the `sign` of `X`, given whether `X` represents a `signed` value or not. When `signed = 1`, the sign of `X` is equal to its most significant bit. This value is extracted in `SIGN-C1`. If `X` is unsigned (i.e., `signed = 0`), its sign is always `0`. This is constrained by `SIGN-C2`.
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `X` | `Half` | Value for which to extract its sign. |
+| `signed` | `Bit` | Whether `X` represents a signed value (1) or not (0) |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `sign` | `Bit` | Sign of `X` |
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `SIGN-A1` |  | `IS_HALF[X]` |
+| `SIGN-A2` |  | `IS_BIT<signed>` |
+
+## Constraints
+
+### all
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SIGN-C1` | `MSB16[sign; X]` | signed |
+| `SIGN-C2` | not `signed` => `sign` = 0 |  |
+| | _polynomial:_ `(1 - signed) * sign = 0` | |
\ No newline at end of file
diff --git a/docs/spec/signatures.md b/docs/spec/signatures.md
new file mode 100644
index 000000000..98cbcdec1
--- /dev/null
+++ b/docs/spec/signatures.md
@@ -0,0 +1,23 @@
+# Signatures
+
+// Render a signature let (lb, rb) = if sig.kind == "interaction" { (`[`, `]`) } else if sig.kind == "template" { (`<`, `>`) }
+
+let cond = sig.at("cond", default: none) let cond_str = if cond != none { raw(cond) + ` => ` } else {``}
+
+let input_str = sig.input.map(elt => { if type(elt) == array { raw(elt.at(0)) + `[` + raw(str(elt.at(1))) + `]` } else { raw(elt) } }).join(`, `)
+
+let output = sig.at("output", default: none) let output_str = if output != none { if type(output) == array { raw(output.at(0)) + `[` + raw(str(output.at(1))) + `]` } else { raw(output) } + `; ` } else {``}
+
+return [] }
+
+// Compute the bus size of an interaction let vars = sig.input + if "output" in sig { (sig.output, )} else {()}
+
+return vars.map(v => { let (label, factor) = if type(v) == array { (v.at(0), v.at(1)) } else { (v, 1) } config.variables.types.filter(type => type.label == label).first().subtypes.len() * factor }) .sum() }
+
+The following lists the signatures of the interactions in this VM.
+
+table( columns: (1fr, auto), inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*], [*Bus size*]), table.hline(stroke: 1pt), table.vline(stroke: 1pt, x: 1), ..for sig in interactions { ([], []) }, ), caption: "Signature overview of interactions",
+
+Below, we list the signatures of the templates in this VM.
+
+table( columns: 1fr, inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*]), table.hline(stroke: 1pt), ..for sig in templates { ([], ) }, ), caption: "Signature overview of templates",
\ No newline at end of file
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index 29ec7e963..fd9b196e5 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -8,11 +8,11 @@ While RAM is byte addressed, we do choose to store registers as a `DWordWL` over
 
 On a high level, we ensure memory consistency by an interacting system of reads and writes to a lookup argument, combined with an initialization and finalization scheme. The initialization and finalization schemes together ensure both that (1) the necessary preconditions for the lookup system are satisfied, and (2) the program is executed with the correct initial memory and register contents as specified by the ELF binary and the ISA.
 
-## Memory types
+# Memory types
 
 A commonly made distinction of memory types is that of _read-only_ and _read-write_ memory, with the more restrictive read-only variant often allowing for more efficient solutions (be that regarding prover time, verifier time or proof size) via table lookup proofs. Naturally, the VM’s main memory and registers should be handled by a read-write system as the guest program/environment can issue instructions that write to memory. While there are some subsystems that can be modelled as read-only memory ---e.g., the program memory and instruction decoding--- we opt to integrate these into the proof system via chip interactions (relying on techniques derived from table lookup arguments). As such, we only concern ourselves with read-write memory, moving forward.
 
-## Memory operations
+# Memory operations
 
 Every memory operation has some conceptual attributes that are relevant to mention or discuss:
 
@@ -22,7 +22,7 @@ Since we will have to ensure that memory accesses are temporally consistent with
 
 For reasons of completeness (since temporal integrity as discussed below is a security necessity), we cannot deal with multiple accesses to the same address at identical timestamps. However, if multiple accesses are guaranteed to be independent (that is, to different addresses), they can still share a timestamp --- consider, e.g., the case of reading a word as 4 bytes with the `LW` load instruction. This property is already taken into account where possible in the design of the system. For instance, in the CPU chip, we can ensure that there are at most 3 memory accesses not guaranteed to be independent, so a timestamp granularity of 4 timestamps per cycle is enough. ]
 
-## Permutation argument
+# Permutation argument
 
 We can conceptually organise the state of the memory as a collection of "tokens" that represent tuples `(serif("timestamp"), serif("address"), serif("value"))`, meaning the current value written to `serif("address")` is `serif("value")`, last written to memory at `serif("timestamp")`. Having exactly one value associated with any address will be ensured (see further down in this document) by the interaction of memory initialization, memory finalization, and the effects of memory operations.
 
@@ -36,7 +36,7 @@ Naturally, for a read operation, the _values_ embedded in the consumed and emitt
 
 So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens), this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument: consuming a token corresponds to a "receive" and emitting a new token is a "send".
 
-## Temporal integrity
+= Temporal integrity
 
 > **Note:** Properly link/refer to the LT chip
 
@@ -46,7 +46,7 @@ To ensure temporal integrity, every memory operation needs to be constrained for
 
 > **Note:** reference to CPU chip/timestamp column and MEMW chip
 
-## Initialization and Finalization
+= Initialization and Finalization
 
 Because the LogUp argument handling token consumption and emission needs to be fully balanced --- every token emitted should be consumed, and vice versa --- we need to have a system to emit the initial tokens and consume the final tokens. This needs to ensure that every address has at most a single initializing emission, and at most one finalizing consumption. Having at most one initialization will, through the correctness of the lookup argument, immediately lead to having at most one correct finalization, and vice versa.
 
@@ -56,7 +56,7 @@ For our chosen scheme (which we refer to as "paged initialization/finalization")
 
 Concretely, each page gets an associated `PAGE` table, consisting of N variables over N columns. For each such table, the `page` variable is instantiated as the constant base address of the page. The `offset` column is preprocessed, which helps the verifier ensure that each page has a single fixed size, but the verifier should still check that no pages overlap and all `page` values are page-aligned.
 
-### Page initialization
+## Page initialization
 
 > **Note:** check whether we need `fini` to be range-checked
 
@@ -76,17 +76,17 @@ _Sparse initialization/finalization_
 
 One or more STARK tables (depending on the amount of memory used) consisting of `(address, value)` columns are introduced, where for zero-initialization, `value` can be constant zero. Transition constraints ensure that `address` is strictly increasing, enforcing the "at most once" property; `value` is range-checked to consist of bytes. Similar to paged finalization, an additional `timestamp` column is added, containing the final timestamp each address was accessed. This table is then further used to contribute to the LogUp sum as with any other interactions. - The transition constraints can be chosen to only apply on finalization, as at-most-once finalization is enough to ensure consistency. - Sparse initialization is incompatible with paged finalization, see also the remark under free-zero initialization above. - This would require transition constraints, which currently are not needed elsewhere in the VM design - Additionally, for memory use exceeding the capacity of a single initialization/finalization table, some form of transition constraint between tables is needed - Alternatively, transition constraints could potentially be avoided by more integration into the LogUp system, but this could turn out more costly in practice - This is compatible with the above "free zero" initialization - Since a prover-committed address column is needed (rather than a precomputed one), the number of required columns increases. - As an optimization, the address column could potentially be used simultaneously for initialization and finalization - Sparse initialization/finalization reduces the cost for sparse memory access patterns, where only a few addresses would be accessed per page. Most programs and compilers should however favor a memory locality that makes paged initialization/finalization comparable. ]
 
-### Register initialization/finalization
+## Register initialization/finalization
 
 > **Note:** Properly link/reference ECALL/HALT chip
 
 The initial and final state of registers can be entirely known by the verifier, since the relevant initialization values are either zero, or embedded in the ELF, and the final values can be set to a known value by the HALT ecall. As additionally, the number of registers is small, the verifier can directly add the required balancing terms to the LogUp sum.
 
-## Notes and considerations
+= Notes and considerations
 
 - Register reads and writes may interact within a single cycle, so a correct and fixed ordering needs to be ensured - Correctness of initialization and completeness of finalization need to be ensured
 
-## Future topics of interest
+= Future topics of interest
 
 - Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research)
 
@@ -100,31 +100,49 @@ columns: (auto, 1fr, auto), inset: 7pt, align: (top+left, top+left, top+center,
 
 ---
 
+# Signatures
+
+// Render a signature let (lb, rb) = if sig.kind == "interaction" { (`[`, `]`) } else if sig.kind == "template" { (`<`, `>`) }
+
+let cond = sig.at("cond", default: none) let cond_str = if cond != none { raw(cond) + ` => ` } else {``}
+
+let input_str = sig.input.map(elt => { if type(elt) == array { raw(elt.at(0)) + `[` + raw(str(elt.at(1))) + `]` } else { raw(elt) } }).join(`, `)
+
+let output = sig.at("output", default: none) let output_str = if output != none { if type(output) == array { raw(output.at(0)) + `[` + raw(str(output.at(1))) + `]` } else { raw(output) } + `; ` } else {``}
+
+return [] }
+
+// Compute the bus size of an interaction let vars = sig.input + if "output" in sig { (sig.output, )} else {()}
+
+return vars.map(v => { let (label, factor) = if type(v) == array { (v.at(0), v.at(1)) } else { (v, 1) } config.variables.types.filter(type => type.label == label).first().subtypes.len() * factor }) .sum() }
+
+The following table lists the signatures of all interactions in this VM.
+
+table( columns: (1fr, auto), inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*], [*Bus size*]), table.hline(stroke: 1pt), table.vline(stroke: 1pt, x: 1), ..for sig in interactions { ([], []) }, ), caption: "Signature overview of interactions",
+
+Below, we list the signatures of all templates in this VM.
+
+table( columns: 1fr, inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*]), table.hline(stroke: 1pt), ..for sig in templates { ([], ) }, ), caption: "Signature overview of templates",
+
+---
+
 # IS_BIT Template
 
 box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
 
 Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
 
-## Interface
-
-The  constraint template has the following interface:
+= Interface The `IS_BIT` constraint template has the following interface:
 
 where `cond` is any value described by an expression _of degree at most `1`_. Note that `IS_BIT<X>` can be used to denote the _unconditional_ application of the `IS_BIT` template to `X`.
 
-## Variables
-
-The  template operates on two variables: `cond` and `X`:
+= Variables The `IS_BIT` template operates on two variables: `cond` and `X`:
 
-## Constraints
-
-It takes only one constraint to enforce that `X` must be either `0` or `1` whenever ``cond` eq.not 0`:
+= Constraints It takes only one constraint to enforce that `X` must be either `0` or `1` whenever $`cond` eq.not 0$:
 
 *Note*: - In case of _unconditional_ template application, `cond` can be dropped from the constraint, simplifying it to $`X` (1 - `X`) = 0$. - As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression. This is to make sure that [isbit:c:isbit]'s expression has degree at most 3.
 
-## Proof of correctness
-
-If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When ``cond` eq.not 0`, it follows that the statement can only be proven when ``X` (1-`X`) equiv 0 mod p`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either ``X` equiv 0 mod p` or `1-`X` equiv 0 mod p`. Hence, it is proven that when ``cond` eq.not 0`, [isbit:c:isbit] is only satisfied if ``X` in {0, 1}`. 
+= Proof of correctness If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When $`cond` eq.not 0$, it follows that the constraint can only be satisfied when $`X` (1 - `X`) equiv 0 mod p$, with $p$ the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either $`X` equiv 0 mod p$ or $1 - `X` equiv 0 mod p$. Hence, it is proven that when $`cond` eq.not 0$, [isbit:c:isbit] is only satisfied if $`X` in {0, 1}$.
 
 ## Columns
 
@@ -140,6 +158,8 @@ If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any val
 |------|------|-------------|
 | `cond` | `BaseField` | Whether the constraint should be applied ($eq.not 0$) or not ($0$). |
 
+## Constraints
+
 ### all
 
 | Tag | Description |
@@ -149,37 +169,75 @@ If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any val
 
 ---
 
-# ADD/SUB Template
+# SIGN Template
 
 box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
 
-## Notation
+= Interface The `SIGN` constraint template has the following interface:
 
-The  constraint template has the following interface:
+It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are `1`, and `0` otherwise.
 
-where `cond` is any value described by an expression _of degree at most `1`_.
+= Variables The `SIGN` template operates on three variables:
 
-### 
+= Assumptions The `SIGN` template operates under the following assumptions:
 
-For ease of notation, we moreover introduce the  constraint template. Its interface
+= Constraints It takes only two constraints to compute the `sign` of `X`, given whether `X` represents a `signed` value or not. When $`signed` = 1$, the sign of `X` is equal to its most significant bit. This value is extracted in [sign:c:sign_if_signed]. If `X` is unsigned (i.e., $`signed` = 0$), its sign is always `0`. This is constrained by [sign:c:sign_if_unsigned].
 
-maps onto the  template as
+## Columns
 
-It constrains that ``diff` = `lhs` - `rhs` mod 2^64` when the expression `cond` is non-zero. As with ,  can be used to denote the _unconditional_ application of the template.
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `X` | `Half` | Value for which to extract its sign. |
+| `signed` | `Bit` | Whether `X` represents a signed value (1) or not (0) |
+
+### Output
 
-## Variables
+| Name | Type | Description |
+|------|------|-------------|
+| `sign` | `Bit` | Sign of `X` |
 
 ## Assumptions
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `ADD-A1.i` | i ∈ [0, 1] | `IS_WORD[lhs[i]]` |
-| `ADD-A2.i` | i ∈ [0, 1] | `IS_WORD[rhs[i]]` |
-| `ADD-A3.i` | i ∈ [0, 1] | `IS_WORD[sum[i]]` |
+| `SIGN-A1` |  | `IS_HALF[X]` |
+| `SIGN-A2` |  | `IS_BIT<signed>` |
 
 ## Constraints
 
-This template introduces the following constraints
+### all
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SIGN-C1` | `MSB16[sign; X]` | signed |
+| `SIGN-C2` | not`signed` => `sign` = 0 |  |
+| | _polynomial:_ `(1 - signed) * sign = 0` | |
+
+---
+
+# ADD/SUB Template
+
+box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
+
+= Notation The `ADD` constraint template has the following interface:
+
+where `cond` is any value described by an expression _of degree at most `1`_.
+
+## 
+
+For ease of notation, we moreover introduce the `SUB` constraint template. Its interface
+
+maps onto the `ADD` template as
+
+It constrains that $`diff` = `lhs` - `rhs` mod 2^64$ when the expression `cond` is non-zero. As with `ADD`, `cond` can be omitted to denote the _unconditional_ application of the template.
+
+= Variables
+
+= Assumptions
+
+= Constraints This template introduces the following constraints
 
 ## Columns
 
@@ -214,6 +272,16 @@ carry (when iter=1) := 2^-32 * (lhs[1] + rhs[1] + carry[0] - sum[1])
 |------|------|-------------|
 | `cond` | `BaseField` | Whether the relation should be enforced ($eq.not 0$) or not ($0$). |
 
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `ADD-A1.i` | i ∈ [0, 1] | `IS_WORD[lhs[i]]` |
+| `ADD-A2.i` | i ∈ [0, 1] | `IS_WORD[rhs[i]]` |
+| `ADD-A3.i` | i ∈ [0, 1] | `IS_WORD[sum[i]]` |
+
+## Constraints
+
 ### all
 
 | Tag | Range | Description |
@@ -222,37 +290,95 @@ carry (when iter=1) := 2^-32 * (lhs[1] + rhs[1] + carry[0] - sum[1])
 
 ---
 
-# DECODE Table
+# NEG Template
 
-All `RV64IMC` instruction are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table, can be derived from the `packed_decode` variable provided below.
+box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
+
+= Notation The `NEG` constraint template has the following interface:
+
+where `cond` is a bit value (i.e., lies in `{0, 1}`) described by an expression _of degree at most `1`_.
+
+= Variables
+
+= Assumptions
+
+= Constraints We constrain this equality using two constraints:
+
+The constraints force the `carry` values to be fixed. Writing out `carry`'s definition, we then find that $
+`neg`_0
+&= cases( 2^32 - (`x as DWordWL`)_0 & "if" (`x as DWordWL`)_0 != 0, 0 & "if" (`x as DWordWL`)_0 = 0 ),\
+`neg`_1
+&= cases( 2^32 - (`x as DWordWL`)_1 - `carry`_0 & "if" `x` != 0, 0 & "if" `x` = 0 ) $ Clearly, $`neg` = 0$ when $`x` = 0$ (and `cond` is set). For non-zero `x`, we distinguish two cases. When $(`x as DWordWL`)_0 = 0$ (and hence $`carry`_0 = 0$), $
+`neg`
+&= 2^32 dot `neg`_1 + `neg`_0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1) + 0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1) - (`x as DWordWL`)_0\ &= 2^64 - (2^32 dot (`x as DWordWL`)_1 + (`x as DWordWL`)_0)\ &= 2^64 - `x`\ &equiv -`x` mod 2^64, $ while when $(`x as DWordWL`)_0 != 0$ (and hence $`carry`_0 = 1$), $
+`neg`
+&= 2^32 dot `neg`_1 + `neg`_0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1 - 1) + (2^32 - (`x as DWordWL`)_0)  \ &= 2^64 - 2^32 dot (`x as DWordWL`)_1 - 2^32 + 2^32 - (`x as DWordWL`)_0  \ &= 2^64 - ((`x as DWordWL`)_0 + 2^32 dot (`x as DWordWL`)_1) \ &= 2^64 - `x`\ &equiv -`x` mod 2^64 $ when `cond` is set. When `cond` is not set, the two lookups are not executed, allowing `neg` to take any value in either case.
+
+= Note It is worth noting that this construction does _not_ require the limbs of `neg` to be range-checked, thus allowing it to be represented by the unrangecheckable `DWordWL` rather than a `DWordHL`. The input value `x` is still assumed to be range-checked, however.
 
 ## Columns
 
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `x` | `DWordHL` | value to compute negation of |
+
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
-| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`; \ [27:35] `rs1`, \ [35:43] `rs2`, \ [43:51] `rd`, \ the remaining bits are set to zero.  |
-| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+| `neg` | `DWordWL` | negation of `x` if $`cond` != 0$; unconstrained otherwise. |
 
-### Multiplicity
+### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
-| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
+| `carry` | `Bit[2]` | carries of the addition $`neg` + `x`$. |
 
-The  table is comprised of  variables that are expressed using  columns:
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * ((x::DWordWL)[0] + neg[0])
+carry (when iter=1) := 2^-32 * ((x::DWordWL)[1] + neg[1] + carry[0])
+```
 
-## Padding
+### Condition
+
+| Name | Type | Description |
+|------|------|-------------|
+| `cond` | `Bit` | condition on whether to negate x |
 
-The  table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
+## Assumptions
 
-Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see [cpu:c:ebreak_traps]), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `NEG-A1.i` | i ∈ [0, 3] | `IS_HALF[x[i]]` |
+| `NEG-A2` |  | `IS_BIT<cond>` |
+
+## Constraints
+
+### all
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `NEG-C1` | `ZERO[1 - carry[0]; x[0] + x[1]]` | cond |
+| `NEG-C2` | `ZERO[1 - carry[1]; x[0] + x[1] + x[2] + x[3]]` | cond |
+
+---
+
+# DECODE Table
 
-## Decoding
+All `RV64IMC` instructions are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table can be derived from the `packed_decode` variable provided below.
+
+= Columns
+
+The `DECODE` table is comprised of the following variables and the columns used to express them:
 
-For the purposes of explaining decoding, we decompress 's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
+= Padding The `DECODE` table must be padded to a length that is a power of two. Padding rows with the following content can be added to achieve this:
+
+Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see [cpu:c:ebreak_traps]), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
+
+= Decoding For the purposes of explaining decoding, we decompress `DECODE`'s `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
 
 We will illustrate how each instruction should be expressed in this (uncompressed) decoding table. The columns of the accompanying table represent the following: - *`operation`*: the assembly operation being encoded. - *`op-flag`*: which of the "`ALU` selector flags" operation flags to set. Each operation sets exactly one. - *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively. - *other*: the other flags that should be set or variables that should be given specific values.
 
@@ -260,7 +386,7 @@ For the purpose of brevity and readability, the table uses the following rules-o
 
 Further clarification is provided in the notes following the table.
 
-### C-type instructions
+## C-type instructions
 
 The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32 bits), saving \~25% in code size. The execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
 
@@ -274,7 +400,7 @@ figure(table( columns: (auto, auto, 40pt, 40pt, 1fr, 15pt), stroke: 0pt, inset:
 
 // Construct a note that can be referenced through `lbl` show figure: (it) => align(left, []) [ ] }
 
-#### Notes
+## Notes
 
 We note the following about the above decoding table:
 
@@ -286,101 +412,33 @@ In addition to decoding all instructions provided in the ELF and adding a corres
 
 This entry is used to pad the `CPU` table. More details on this matter are provided in the `CPU` chip.
 
----
-
-# CPU Chip
-
 ## Columns
 
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
-| `pc` | `DWordWL` | The program counter |
-| `rs1` | `Byte` | Source register 1 index |
-| `rs2` | `Byte` | Source register 2 index |
-| `rd` | `Byte` | Destination register index |
-| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
-| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
-| `write_register` | `Bit` | Whether to write back to the destination register |
-| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
-| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
-| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
-| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
-| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
-| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
-| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and     - as flag for inverting the condition of conditional branches (see `branch_cond`)     - as direction (left or right) for `SHIFT` |
-| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
-| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
-| `ADD` | `Bit` | One-hot ALU selector flag |
-| `SUB` | `Bit` | One-hot ALU selector flag |
-| `SLT` | `Bit` | One-hot ALU selector flag |
-| `AND` | `Bit` | One-hot ALU selector flag |
-| `OR` | `Bit` | One-hot ALU selector flag |
-| `XOR` | `Bit` | One-hot ALU selector flag |
-| `SHIFT` | `Bit` | One-hot ALU selector flag |
-| `JALR` | `Bit` | One-hot ALU selector flag |
-| `BEQ` | `Bit` | One-hot ALU selector flag |
-| `BLT` | `Bit` | One-hot ALU selector flag |
-| `LOAD` | `Bit` | One-hot ALU selector flag |
-| `STORE` | `Bit` | One-hot ALU selector flag |
-| `MUL` | `Bit` | One-hot ALU selector flag |
-| `DIVREM` | `Bit` | One-hot ALU selector flag |
-| `ECALL` | `Bit` | One-hot ALU selector flag |
-| `EBREAK` | `Bit` | One-hot ALU selector flag |
-
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `next_pc` | `DWordWL` | The program counter for the next instruction |
-| `rvd` | `DWordWL` | The value to (maybe) be written back to rvd |
+| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
+| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`; \ [27:35] `rs1`, \ [35:43] `rs2`, \ [43:51] `rd`, \ the remaining bits are set to zero.  |
+| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
 
-### Auxiliary
+### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `rv1` | `DWordWHH` | The value of register `rs1` |
-| `rv2` | `DWordWHH` | The value of register `rs2` |
-| `rv1_sign_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word |
-| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
-| `arg2_sign_bit` | `Bit` | The sign bit of `arg2` if seen as a 32-bit word |
-| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
-| `res_sign_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word |
-| `res` | `DWordBL` | The ALU result |
-| `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
-| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
-
-### Virtual
+| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
 
-| Name | Type | Description |
-|------|------|-------------|
-| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
-| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
+---
 
-**Definition of `packed_decode`:**
-```
-packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
-```
+# CPU Chip
 
-**Definition of `pad`:**
-```
-pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
-```
+= Columns
 
 The `CPU` chip is comprised of the following variables and the columns used to express them:
 
-## Assumptions
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
-| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
-
-## Constraints
+= Assumptions
 
-First, we perform a decoding lookup for the current PC.
+= Constraints First, we perform a decoding lookup for the current PC.
 
 | Tag | Description |
 |-----|-------------|
@@ -388,7 +446,7 @@ First, we perform a decoding lookup for the current PC.
 
 > **Note:** All casts for interactions will have to be reviewed once other chip interfaces stabilise
 
-### Range checks
+## Range checks
 
 > **Note:** Make sure we argue for every column here
 
@@ -432,7 +490,7 @@ We constrain all columns to have the appropriate ranges. The flags and register
 | `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` |
 | `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` |
 
-### ALU
+## ALU
 
 The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
 
@@ -451,7 +509,7 @@ The ALU functionality is then obtained through judicious dispatching to the corr
 | `CPU-CA44` |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
 | `CPU-CA45` |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
 
-### Memory
+## Memory
 
 The interactions with the memory are handled here, both for register loading and storing and for `LOAD` and `STORE` instructions. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The timestamps are ensured to be disjoint for disjoint memory locations. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction.
 
@@ -468,7 +526,7 @@ The interactions with the memory, both for register loading and storing, as for
 | `CPU-CM52` |  | `MEMW[0, res, rv2, timestamp + 1, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
 | `CPU-CM53` |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1, 1, 0, 0]` | 1 - pad |
 
-### System
+## System
 
 The interactions with the wider system.
 
@@ -478,7 +536,7 @@ The interactions with the wider system.
 | | _polynomial:_ `1 - EBREAK = 0` | |
 | `CPU-CS55` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
 
-### Input and output to the ALU
+## Input and output to the ALU
 
 We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, including the appropriate sign/zero extension, depending on `word_instr`.
 
@@ -502,11 +560,9 @@ We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, includ
 | `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
 | | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_sign_bit * (2^32 - 1)) = 0` | |
 
-### Other constraints
+## Other constraints
 
-> **Note:** proper ref to IsZero/IsEqual
-
-For [cpu:c:is_equal], refer to the logic of IsZero or IsEqual, in combination with the subtraction of [cpu:c:sub].
+For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference between `arg1` and `arg2` whenever `BEQ` is `1`. Given that this difference is `0` if and only if both are equal, [cpu:c:is_equal] ensures `is_equal` is set to `1` if and only if $`arg1` = `arg2`$ and `BEQ` is set.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
@@ -518,124 +574,123 @@ For [cpu:c:is_equal], refer to the logic of IsZero or IsEqual, in combination wi
 
 > **Note:** Document the choice to not have a multiplicity column here for padding
 
-## Padding
+= Padding
 
 The CPU can be padded with the following values, which have a corresponding row in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
 
 This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
 
----
-
-# SHIFT Chip
-
-## Interface
-
-The  chip has the following interface:
-
-``` // param in: the value being shifted // param shift: the number of bits to shift `in` by // param direction: whether to shift left (0) or right (1) // param signed: whether to interpret `in` as a signed (1) or unsigned (0) integer // param word_instr: whether to execute the SLL/SR* (0) or SLLW/SR*W (1) instruction // out shifted: the resulting value SHIFT[shifted: DWord; in: DWord, shift: Byte, direction: Bit, signed: Bit, word_instr: Bit] ``` In other words, the  chip is designed to constrain that $
-
-$ $
-
-$ Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.
-
 ## Columns
 
 ### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `in` | `DWordHL` | The value being shifted |
-| `shift` | `Byte` | Number of bits to shift `in` by. |
-| `direction` | `Bit` | Whether to shift left (0) or right (1). |
-| `signed` | `Bit` | Whether to interpret `in` as a signed integer. |
-| `word_instr` | `Bit` | Whether this is a Word-instruction (1) or not (0). |
-
-### Output
-
-| Name | Type | Description |
-|------|------|-------------|
-| `out` | `DWordWL` | $`in <</>>/>>>` (`shift` mod 32 dot (2 - `word_instr`))$ |
-
-### Auxiliary
-
-| Name | Type | Description |
-|------|------|-------------|
-| `is_negative` | `Bit` | Whether `in` is negative |
-| `bit_shift` | `Byte` | Value by which to shift `in` to obtain `X` and `Y` |
-| `zbs` | `Bit` | Whether `bit_shift` is zero (1) or not (0). |
-| `X` | `Half[5]` | scratch variable. |
-| `Y` | `Half[4]` | scratch variable. |
-| `limb_shift` | `Bit[4]` | One-hot vector indicating whether $floor.l `shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $`word_instr` = 1$ and $4$ otherwise. |
-
-### Virtual
-
-| Name | Type | Description |
-|------|------|-------------|
-| `extension` | `Half` | sign extension of `in`. |
-| `left` | `Bit` | Whether to perform a left-shift. |
-| `right` | `Bit` | Whether to perform a right-shift. |
-| `intra_limb_left` | `DWordHL` | `in << (shift % 16)` if `left` |
-| `intra_limb_right` | `DWordHL` | `in >>> (shift % 16)` if `right` and `signed`;\ `in >> (shift % 16)` if `right` and `!signed` |
-| `shifted` | `DWordHL` | $`in <</>>/>>>` (`shift` mod 32 dot (2 - `word_instr`))$ |
+| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
+| `pc` | `DWordWL` | The program counter |
+| `rs1` | `Byte` | Source register 1 index |
+| `rs2` | `Byte` | Source register 2 index |
+| `rd` | `Byte` | Destination register index |
+| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
+| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
+| `write_register` | `Bit` | Whether to write back to the destination register |
+| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
+| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
+| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
+| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
+| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
+| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
+| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used (1) by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, (2) as a flag for inverting the condition of conditional branches (see `branch_cond`), and (3) as the direction (left or right) for `SHIFT`. |
+| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
+| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
+| `ADD` | `Bit` | One-hot ALU selector flag |
+| `SUB` | `Bit` | One-hot ALU selector flag |
+| `SLT` | `Bit` | One-hot ALU selector flag |
+| `AND` | `Bit` | One-hot ALU selector flag |
+| `OR` | `Bit` | One-hot ALU selector flag |
+| `XOR` | `Bit` | One-hot ALU selector flag |
+| `SHIFT` | `Bit` | One-hot ALU selector flag |
+| `JALR` | `Bit` | One-hot ALU selector flag |
+| `BEQ` | `Bit` | One-hot ALU selector flag |
+| `BLT` | `Bit` | One-hot ALU selector flag |
+| `LOAD` | `Bit` | One-hot ALU selector flag |
+| `STORE` | `Bit` | One-hot ALU selector flag |
+| `MUL` | `Bit` | One-hot ALU selector flag |
+| `DIVREM` | `Bit` | One-hot ALU selector flag |
+| `ECALL` | `Bit` | One-hot ALU selector flag |
+| `EBREAK` | `Bit` | One-hot ALU selector flag |
 
-**Definition of `extension`:**
-```
-extension := 65535 * is_negative
-```
+### Output
 
-**Definition of `left`:**
-```
-left := μ - direction
-```
+| Name | Type | Description |
+|------|------|-------------|
+| `next_pc` | `DWordWL` | The program counter for the next instruction |
+| `rvd` | `DWordWL` | The value to (maybe) be written back to the destination register `rd` |
 
-**Definition of `right`:**
-```
-right := direction
-```
+### Auxiliary
 
-**Definition of `intra_limb_left`:**
-```
-intra_limb_left (when iter=0) := X[0]
-intra_limb_left (when iter=[1, 3]) := X[i] + Y[i - 1]
-```
+| Name | Type | Description |
+|------|------|-------------|
+| `rv1` | `DWordWHH` | The value of register `rs1` |
+| `rv2` | `DWordWHH` | The value of register `rs2` |
+| `rv1_sign_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word |
+| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
+| `arg2_sign_bit` | `Bit` | The sign bit of `arg2` if seen as a 32-bit word |
+| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
+| `res_sign_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word |
+| `res` | `DWordBL` | The ALU result |
+| `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
+| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
 
-**Definition of `intra_limb_right`:**
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
+| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
+
+**Definition of `packed_decode`:**
 ```
-intra_limb_right := Y[i] + X[i + 1]
+packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
 ```
 
-**Definition of `shifted`:**
+**Definition of `pad`:**
 ```
-shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (Σ_j = 0^3 - i limb_shift[j] * intra_limb_right[i + j] + extension * Σ_j = 3 - i^3 limb_shift[j])
+pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
 ```
 
-### Multiplicity
-
-| Name | Type | Description |
-|------|------|-------------|
-| `μ` | `Bit` |  |
-
-The `SHIFT` chip is comprised of  variables that are expressed using  columns:
-
 ## Assumptions
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALFWORD[in[i]]` |
-| `SHIFT-A2` |  | `IS_BYTE[shift]` |
-| `SHIFT-A3` |  | `IS_BIT<direction>` |
-| `SHIFT-A4` |  | `IS_BIT<signed>` |
-| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
+| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
+| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+
+---
+
+# SHIFT Chip
+
+= Interface The `SHIFT` chip has the following interface:
+
+``` // param in: the value being shifted // param shift: the number of bits to shift `in` by // param direction: whether to shift left (0) or right (1) // param signed: whether to interpret `in` as a signed (1) or unsigned (0) integer // param word_instr: whether to execute the SLL/SR* (0) or SLLW/SR*W (1) instruction // out shifted: the resulting value SHIFT[shifted: DWord; in: DWord, shift: Byte, direction: Bit, signed: Bit, word_instr: Bit] ``` In other words, the `SHIFT` chip is designed to constrain that $
+
+$ $
 
-## Explanation
+$ Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.
+
+= Columns
+
+The `SHIFT` chip is comprised of the variables listed under the `Columns` heading of this chapter, together with the columns used to express them.
+
+= Assumptions
 
-This chip has a rather complex design as a result of designing it to fit in as few columns possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
+= Explanation This chip has a rather complex design as a result of designing it to fit in as few columns as possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
 
 The chip's design revolves around a two-phase shifting process: 1. shift `in` by `x := `shift` mod 16` bits, 2. shift that result by `(`shift`-x) mod 64` (or `mod 32` if ` `word_instr` = 1`). The intermediate value representing the state between the two phases is stored in the scratch variables `X` and `Y`. The definition of `shifted` describes how one can combine the `X`, `Y` and `extension` variables to construct the output value as described using `Half`-limbs. The output variable `out` is equivalent to `shifted`, but expressed using `Word`-limbs.
 
 In the following, we cover how these two phases were designed to complement one another. Here, we start with discussing the _logical_ left/right shift operations only; the modifications required to compute the _arithmetic_ right shift will be discussed at the end.
 
-### First phase
+## First phase
 
 We zoom in on the first step. Here, we make use of the two lookup operations - ``HWSL[x: Half, y: B4]` := (`x` `<<` `y`) mod 2^16` (short for "HalfWord Shift Left"), and - ``HWSLC[x: Half, y: B4]` := `x` `>>` (16-`y`)` (short for "HalfWord Shift Left's Carry") Note here that one can use these two lookups to compute `out: Half[4] := in << y` as: $
 
@@ -647,25 +702,23 @@ Observe now that the values being looked up are (almost) independent from the di
 
 (16-`shift`) mod 16 & "when shifting right" ), $ it only takes some rearranging and combining of the values ``X[`i`] := HWSL[in[`i`], bit_shift]`` and ``Y[`i`] := HWSLC[in[`i`], bit_shift]`` to form the limbs of ``in <</>> shift` mod 16`. In the remaining case that ``right` = 1` and ``shift` = 0 mod 16`, the limbs of ``in <</>> shift` mod 16` simply match those of `in`.
 
-### Second phase
+## Second phase
 
 Since we're operating on 16-bit limbs, all the limbs in ``in <</>> shift`` must also occur somewhere in ``in <</>> shift` mod 16`. The number of full-limbs we still need to shift is determined by the fifth and sixth least significant bit of `shift`. With `limb_shift` containing a unary decoding of the integer represented by these two bits, we find that the intermediate value needs to be shifted over by `i` limbs (to the `left` or `right`) when ``limb_shift[`i`]` = 1`. These things combined yield `shifted`'s definition.
 
 Of course, when ``word_instr` = 1` and, thus, only ``shift` mod 32` should be considered, the bit-mask for the lookup constraining `limb_shift` is adjusted appropriately (see [shift:c:limb_shift_lookup]).
 
-### Arithmetic right shift
+## Arithmetic right shift
 
 Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `extension` is constrained to contain a repetition of `in`'s most significant bit. Copies of this variable are used for any full limbs shifted in when ``right` = `signed` = 1`. Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of ``in >>> shift` mod 16` as the appropriate intermediate.
 
-## Constraints
-
-First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that ``right` = 1` and ``shift` = 0 mod 16`.
+= Constraints First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that `right = 1` and `shift = 0 mod 16`.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
 | `SHIFT-C3` | `AND_BYTE[bit_shift; shift, 15]` | left |
 | `SHIFT-C4` | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
-| `SHIFT-C5` | `IsZero<zbs; bit_shift>` | μ |
+| `SHIFT-C5` | `ZERO[zbs; bit_shift]` | μ |
 
 Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
 
@@ -683,7 +736,7 @@ The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows.
 | `SHIFT-C11.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
 | | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
 
-### Full-limb shifting
+## Full-limb shifting
 
 Next, we constrain that `limb_shift` is a proper unary encoding of the fifth (and sixth if ``word_instr` = 0`) bit of `shift`. For this to be the case, three requirements must be satisfied: + *unary(0)*: ``limb_shift[`i`]` in {0, 1}` for `i in [0, 3]`, + *unary(1)*: ``limb_shift[`i`]` = 1` for exactly one `i`, and + *proper encoding*: ``limb_shift[`i`]` = 1 <=> 1/16 (`shift &` (48-32 dot `word_instr`)) = i` The first requirement is enforced by constraint [shift:c:limb_shift_is_bit]. To construct a constraint for the second and third requirement, observe that $ 1/16 dot (`shift &` (48-32 dot `word_instr`)) in cases( {0, 1, 2, 3} &"if" `word_instr` = 0, {0, 1} &"if" `word_instr` = 1 $ Observe moreover that, assuming *unary(0)*, the expression $ 1/16 dot (1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]`) $ can evaluate to `i` if and only if ``limb_shift[`i`]` = 1`, while the others are `0`. This means that the relation $ 1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]` = `shift &` (48-32 dot `word_instr`) $ enforces both *unary(1)* and *proper encoding*. This is the exact relation [shift:c:limb_shift_lookup] enforces.
 
@@ -696,11 +749,11 @@ Hereafter, one must only check that `out` is the proper cast of `shifted` into a
 | `SHIFT-C14.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
 | | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
 
-### Miscellaneous
+## Miscellaneous
 
 *Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
 
-### Lookups
+## Lookups
 
 This chip adds the following interaction to the lookup.
 
@@ -708,10 +761,99 @@ This chip adds the following interaction to the lookup.
 |-----|-------------|--------------|
 | `SHIFT-C15` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
 
-## Padding
+= Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `in` | `DWordHL` | The value being shifted |
+| `shift` | `Byte` | Number of bits to shift `in` by. |
+| `direction` | `Bit` | Whether to shift left (0) or right (1). |
+| `signed` | `Bit` | Whether to interpret `in` as a signed integer. |
+| `word_instr` | `Bit` | Whether this is a Word-instruction (1) or not (0). |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `out` | `DWordWL` | $`in <</>>/>>>` (`shift` mod 32 dot (2 - `word_instr`))$ |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `is_negative` | `Bit` | Whether `in` is negative |
+| `bit_shift` | `Byte` | Value by which to shift `in` to obtain `X` and `Y` |
+| `zbs` | `Bit` | Whether `bit_shift` is zero (1) or not (0). |
+| `X` | `Half[5]` | scratch variable. |
+| `Y` | `Half[4]` | scratch variable. |
+| `limb_shift` | `Bit[4]` | One-hot vector indicating whether $floor.l `shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $`word_instr` = 1$ and $4$ otherwise. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `extension` | `Half` | sign extension of `in`. |
+| `left` | `Bit` | Whether to perform a left-shift. |
+| `right` | `Bit` | Whether to perform a right-shift. |
+| `intra_limb_left` | `DWordHL` | `in << (shift % 16)` if `left` |
+| `intra_limb_right` | `DWordHL` | `in >>> (shift % 16)` if `right` and `signed`;\ `in >> (shift % 16)` if `right` and `!signed` |
+| `shifted` | `DWordHL` | $`in <</>>/>>>` (`shift` mod 32 dot (2 - `word_instr`))$ |
+
+**Definition of `extension`:**
+```
+extension := 65535 * is_negative
+```
+
+**Definition of `left`:**
+```
+left := μ - direction
+```
+
+**Definition of `right`:**
+```
+right := direction
+```
+
+**Definition of `intra_limb_left`:**
+```
+intra_limb_left (when iter=0) := X[0]
+intra_limb_left (when iter=[1, 3]) := X[i] + Y[i - 1]
+```
+
+**Definition of `intra_limb_right`:**
+```
+intra_limb_right := Y[i] + X[i + 1]
+```
+
+**Definition of `shifted`:**
+```
+shifted := left * Σ_{j=0}^{i} limb_shift[j] * intra_limb_left[i - j] + right * (Σ_{j=0}^{3-i} limb_shift[j] * intra_limb_right[i + j] + extension * Σ_{j=3-i}^{3} limb_shift[j])
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALFWORD[in[i]]` |
+| `SHIFT-A2` |  | `IS_BYTE[shift]` |
+| `SHIFT-A3` |  | `IS_BIT<direction>` |
+| `SHIFT-A4` |  | `IS_BIT<signed>` |
+| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
+
+## Constraints
+
 ### is_negative
 
 | Tag | Description | Multiplicity |
@@ -729,6 +871,38 @@ The table can be padded to the next power of two with the following value assign
 
 # BRANCH Chip
 
+= Columns
+
+The `BRANCH` chip is comprised of the variables listed under the `Columns` heading of this chapter, together with the columns used to express them.
+
+= Assumptions
+
+= Constraints
+
+> **Note:** Check correspondence with CPU for passing in `offset` as word or dword
+
+We constrain `next_pc` to be `base_address + offset`, where `base_address` equals `pc` when `JALR = 0` and `register` otherwise.
+
+The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
+| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
+| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
+| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte[0], 254]` | μ |
+| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
+
+This chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
+
+= Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
 ## Columns
 
 ### Input
@@ -778,8 +952,6 @@ next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
-The `BRANCH` chip is comprised of  variables that are expressed using  columns:
-
 ## Assumptions
 
 | Tag | Range | Description |
@@ -789,35 +961,67 @@ The `BRANCH` chip is comprised of  variables that are expressed using  columns:
 | `BRANCH-A3.i` | i ∈ [0, 1] | `register` is range checked, `IS_WORD[register[i]]` |
 | `BRANCH-A4` |  | `IS_BIT<JALR>` |
 
-## Constraints
+---
 
-> **Note:** Check correspondence with CPU for passing in `offset` as word or dword
+# MEMW Chip
 
-We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
+= Columns
 
-The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
+The `MEMW` chip is comprised of the variables listed under the `Columns` heading of this chapter, together with the columns used to express them.
+
+= Assumptions
+
+Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. These properties are, however, necessary for the consistency of the system as a whole; we therefore document them here, keeping the type information as a reading aid.
+
+= Constraints
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
-| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
-| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
-| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte[0], 254]` | μ |
-| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
+| `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
+| `MEMW-C2` |  | `w2` => `μ_sum` |  |
+| | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
+| `MEMW-C3` |  | `ADD<address_add[0]::DWordWL; base_address, 1>` | w2 |
+| `MEMW-C4.i` | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
+| `MEMW-C5.i` | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
+| `MEMW-C6.i` | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
+| `MEMW-C7` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `MEMW-C8` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `MEMW-C9.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `MEMW-C10.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
 
-This chip contributes the following to the lookup argument.
+As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
+
+We additionally check that the address does not overflow for more significant bytes of the access.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
+| `MEMW-CR11` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
+| `MEMW-CR12` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
+| `MEMW-CR13` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
 
-## Padding
+The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
 
-The table can be padded to the next power of two with the following value assignments:
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW-CM14` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM15` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM16` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM17` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM18.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM19.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM20.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM21.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
 
----
+This chip contributes the following to the lookup argument.
 
-# MEMW Chip
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW-CO22` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
+| `MEMW-CO23` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+
+= Future optimization ideas
+
+- Fast path for aligned memory access where all bytes have the same old timestamp - MEMB chip that does a one-byte write to remove `old_timestamp` from here (uncertain tradeoffs) - Compute `base_address[1] + 1` once and have high words of `address_add` as Words - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALFWORD` lookups may make some GKR things faster if there are known zeroes.
 
 ## Columns
 
@@ -876,8 +1080,6 @@ w4 := write4 + write8
 | `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
 | `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
 
-The `MEMW` chip is comprised of  variables that are expressed using  columns:
-
 ## Assumptions
 
 | Tag | Range | Description |
@@ -889,61 +1091,53 @@ The `MEMW` chip is comprised of  variables that are expressed using  columns:
 | `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
 | `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
-Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. These properties are necessary for the consistency of the system as a whole, and therefore we document it here, keeping the type information as a reading help.
+---
 
-## Constraints
+# LT Chip
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
-| `MEMW-C2` |  | `w2` => `μ_sum` |  |
-| | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C3` |  | `ADD<address_add[0]::DWordWL; base_address, 1>` | w2 |
-| `MEMW-C4.i` | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
-| `MEMW-C5.i` | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
-| `MEMW-C6.i` | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
-| `MEMW-C7` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `MEMW-C8` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `MEMW-C9.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `MEMW-C10.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+= Columns
 
-As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
+The `LT` chip is comprised of the variables listed under the `Columns` heading of this chapter, together with the columns used to express them.
 
-We additionally check that the address does not overflow for more significant bytes of the access.
+= Assumptions We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
+
+= Constraints We first constrain that all variables correspond to their definition. For the defining constraint of `lt`, [lt:c:lt], observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
+
+We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a non-negative one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
+
++ `dash(A) and B and (a < b)` + `A and dash(B) and (a < b)` + `A and B and (a < b)` + `dash(A) and dash(B) and (a < b)`
+
+The first case is evidently false, while the second case simplifies to `A and dash(B)` (here `dash(X)` denotes the negation of `X`). For the third and fourth case, observe that when `A = B`, the `<` relation is preserved by the modular correspondence between `[-2^(63), 2^(63))` and `[0, 2^(64))`. Importantly, this modular correspondence is merely a reinterpretation of the bits or values of `a` and `b`, due to the representation in two's complement. Hence, we can introduce the value `C = unsigned_lt`, that accurately represents the relation `a < b` when `A = B`.
+
+Combining our three remaining cases, we obtain the boolean formula `A dash(B) or A B C or dash(A) dash(B) C`. Since the cases are disjoint, this can be computed with the binary-valued polynomial `P(A, B, C) = A (1 - B) + A B C + (1 - A) (1 - B) C`.
+
+The polynomial `P` can be simplified to a total degree of two. We claim that the polynomial `Q(A, B, C) = A (1 - B) + A C + (1 - B) C` is, for the purposes of this chip, equivalent to `P`. An exhaustive check shows that `P(A, B, C) != Q(A, B, C)` only for the triple `(A, B, C) = (1, 0, 1)`. This is, however, impossible due to the correctness of `ADD`. In more detail, if we let `s` be the (range-checked) difference `a - b` (so the equivalent of the `lhs_sub_rhs` column), and `x'` denote the most significant word of a variable `x`, we need `C * 2^32 + a' = b' + s' + carry[0]`, by the definition of `carry`. However, the left hand side of this is at least `3 * 2^31`, as `(A, C) = (1, 1)`, and the right hand side is at most `(2^31 - 1) + (2^32 - 1) + 1 = 3 * 2^31 - 1`, as `B = 0`. Therefore, we can use `Q` to constrain `lt` when `signed = 1`.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CR11` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
-| `MEMW-CR12` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
-| `MEMW-CR13` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
+| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
+| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
+| `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
+| | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
+| `LT-C4` | `IS_HALFWORD[lhs[1]]` | μ |
+| `LT-C5` | `IS_HALFWORD[rhs[1]]` | μ |
 
-The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
+And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-CM14` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `MEMW-CM15` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM16` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM17` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `MEMW-CM18.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM19.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `MEMW-CM20.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM21.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
+| `LT-C7.i` | i ∈ [0, 3] | `IS_HALFWORD[lhs_sub_rhs[i]]` | μ |
 
-This chip contributes the following to the lookup argument.
+The chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CO22` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
-| `MEMW-CO23` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
-
-## Future optimization ideas
-
-- Fast path for aligned memory access where all bytes have the same old timestamp - MEMB chip that deals does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Compute `base_address[1] + 1` once and have high words of `address_add` as Words - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALFWORD` lookups may make some GKR things faster if there are known zeroes.
+| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
 
----
+= Padding
 
-# LT Chip
+The table can be padded to the next power of two with the following value assignments:
 
 ## Columns
 
@@ -993,8 +1187,6 @@ unsigned_lt := carry[1]
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
-The `LT` chip is comprised of  variables that are expressed using  columns:
-
 ## Assumptions
 
 | Tag | Range | Description |
@@ -1003,51 +1195,69 @@ The `LT` chip is comprised of  variables that are expressed using  columns:
 | `LT-A2` |  | `IS_WORD[rhs[0]]` |
 | `LT-A3` |  | `IS_BIT<signed>` |
 
-We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
+---
 
-## Constraints
+# MUL Chip
 
-We first constrain that all variables correspond to their definition. For the defining constraint of `lt`, [lt:c:lt], observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
+= Columns
 
-We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a positive one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
+The `MUL` chip is comprised of  variables that are expressed using  columns:
 
-+ `dash(A) and B and (a < b)` + `A and dash(B) and (a < b)` + `A and B and (a < b)` + `dash(A) and dash(B) and (a < b)`
+`mat(delim: , top; bottom)` }
 
-The first case is evidently false, while the second case simplifies to `A and dash(B)`. For the third and fourth case, observe that when `A = B`, the `<` relation is preserved by the modular correspondence between `[-2^(31), 2^(31))` and `[0, 2^(64))`. Importantly, this modular correspondence is merely a reinterpretation of the bits or values of `a` and `b`, due to the representation in two's complement. Hence, we can introduce the value `C = `unsigned_lt``, that accurately represents the relation `a < b` when `A = B`.
+= Assumptions The following range checks are assumed to be performed/enforced outside of this chip:
 
-Combining our three remaining cases, we obtain the boolean formula `A dash(B) or A B C or dash(A) dash(B) C`. Since the cases are disjoint, this can be computed with the binary-valued polynomial `P(A, B, C) = A (1 - B) + A B C + (1 - A) (1 - B) C`.
+= Constraints
 
-The polynomial `P` can be simplified to a total degree of two. We claim that the polynomial `Q(A, B, C) = A (1 - B) + A C + (1 - B) C` is, for the purposes of this chip, equivalent to `P`. An exhaustive check shows that `P(A, B, C) != Q(A, B, C)` only for the triple `(A, B, C) = (1, 0, 1)`. This is, however, impossible due to the correctness of `ADD`. In more detail, if we let `s` be the (range-checked) difference `a - b` (so the equivalent of the `lhs_sub_rhs` column), and `x'` denote the most significant word of a variable `x`, we need `c dot 2^32 + a' = b' + s' + `carry[0]``, by the definition of `carry`. However, the left hand side of this is at least `3 dot 2^31`, as `(A, C) = (1, 1)`, and the right hand side is at most `(2^31 - 1) + (2^32 - 1) + 1 = 3 dot 2^31 - 1`. Therefore, we can use `Q` to constrain `lt` when `signed = 1`.
+## Overview
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
-| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
-| `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
-| | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
-| `LT-C4` | `IS_HALFWORD[lhs[1]]` | μ |
-| `LT-C5` | `IS_HALFWORD[rhs[1]]` | μ |
+When `lhs` and `rhs` are _unsigned_ integers, computing their product `mod 2^128` comes down to evaluating $ (sum_(j=0)^3 2^(16j) dot `lhs`_j) dot (sum_(i=0)^3 2^(16i) dot `rhs`_i) mod 2^128. $ If `lhs` and `rhs` are signed instead, the computation remains nearly identical: based on their signs, one must either zero or one-extend `lhs` and `rhs` --- forming `lhs_ext` and `rhs_ext` respectively --- and compute their product `mod 2^128`: $ (sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128. $ where `lhs_ext` and `rhs_ext` are treated as _unsigned_ integers. Note that by setting the extension limbs of `lhs` and/or `rhs` to `0` when the integer is (i) unsigned or (ii) signed and non-negative, this second formula still applies. For the purposes of constraining the multiplication operation, we rewrite this formula as
 
-And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
+$ &(sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128 \ &equiv sum_(j=0)^7 sum_(i=0)^7 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(triangle, equiv) sum_(j=0)^7 sum_(i=0)^(7-j) 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(square, equiv) sum_(j=0)^7 sum_(i=j)^(7) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &stackrel(penta, equiv) sum_(i=0)^7 sum_(j=0)^(i) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &equiv sum_(i=0)^3 sum_(k=0)^1 sum_(j=0)^(2i+k) 2^(16(2i+k)) dot `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 \ &equiv sum_(i=0)^3 2^(32i) dot sum_(k=0)^1 2^(16k) dot sum_(j=0)^(2i+k) `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 $ where at step - `triangle` we can ignore `i > 7-j`, since that makes `2^(16(i+j)) equiv 0 mod 2^128`, - `square` we rewrite the second summation such that `i` iterates from `j` to 7, rather than `0` to `7-j`, and - `penta` we swap the sums.
+
+We let `raw_product` capture the second summation in this last formula (see [mul:c:raw_product]). By construction, ``raw_product`_i < 2^51` for all `i in [0, 3]`, far exceeding the 32-bits that fit in a single `Word`-limb. What remains then is to reduce each limb of `raw_product` `mod 2^32`, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
+
+This reduce-and-carry operation is constrained by [mul:c:range_lo]/[mul:c:range_hi] and [mul:c:carry], combined with `carry`'s definition. [mul:c:carry] and `carry`'s definition enforce that $ forall i in [0, 3]: `raw_product`_i + `carry`_(i-1) - `res`_i in { k dot 2^32 | k in [0, 2^20) } $ with ``carry`_(-1) = 0` for simplicity. In other words: ``res`_i equiv `raw_product`_i + `carry`_(i-1) (mod 2^32)`. With [mul:c:range_lo]/[mul:c:range_hi] forcing ``res`_i < 2^32`, ``res`_i` can only assume one value: ``raw_product`_i + `carry`_(i-1) mod 2^32`.
+
+*Note*: one may have observed that [mul:c:carry] requires ``carry`_i in [0, 2^20)`, while no limb of a valid carry value would ever exceed `2^19`. This is indeed the case. However, there is some slack in how tightly one has to constrain the `carry` values. In fact, in this situation it suffices to assert that ``carry`_i < frac(p, 2^32, style: "skewed") approx 2^31`, where `p` denotes the field's modulus. Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
+
+## Definitions
+
+We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
-| `LT-C7.i` | i ∈ [0, 3] | `IS_HALFWORD[lhs_sub_rhs[i]]` | μ |
+| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
+| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
+| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
+| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
+| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
 
-The chip contributes the following to the lookup argument.
+## Product
+
+[mul:c:raw_product] defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
+| | | _polynomial:_ `Σ_{k=0}^{1} 2^(16 * k) * Σ_{j=0}^{2 * i + k} lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
+
+## Lookup
+
+The `MUL` chip contributes the following to the lookup:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
+| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
+| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
 
-## Padding
+= Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
----
+= Notes - `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere.
 
-# MUL Chip
+As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
 
 ## Columns
 
@@ -1121,10 +1331,6 @@ carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
 | `μ_lo` | `BaseField` |  |
 | `μ_hi` | `BaseField` |  |
 
-The `MUL` chip is comprised of  variables that are expressed using  columns:
-
-`mat(delim: , top; bottom)` }
-
 ## Assumptions
 
 | Tag | Range | Description |
@@ -1132,73 +1338,71 @@ The `MUL` chip is comprised of  variables that are expressed using  columns:
 | `MUL-A1.i` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` |
 | `MUL-A2.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
 
-The following range checks are assumed to be performed/enforced outside of this chip:
+---
 
-## Constraints
+# DVRM Chip
 
-### Overview
+= Columns
 
-When `lhs` and `rhs` are _unsigned_ integers, computing their product `mod 2^128` comes down to evaluating $ (sum_(j=0)^3 2^(16j) dot `lhs`_j) dot (sum_(i=0)^3 2^(16i) dot `rhs`_i) mod 2^128. $ If `lhs` and `rhs` are signed instead, the computation remains nearly identical: based on their signs, one must either zero or one-extend `lhs` and `rhs` --- forming `lhs_ext` and `rhs_ext` respectively --- and compute their product `mod 2^128`: $ (sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128. $ where `lhs_ext` and `rhs_ext` are treated as _unsigned_ integers. Note that by setting the extension limbs of `lhs` and/or `rhs` to `0` when the integer is (i) unsigned or (ii) signed and non-negative, this second formula still applies. For the purposes of constraining the multiplication operation, we rewrite this formula as
+The `DVRM` chip is comprised of  variables that are expressed using  columns:
 
-$ &(sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128 \ &equiv sum_(j=0)^7 sum_(i=0)^7 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(triangle, equiv) sum_(j=0)^7 sum_(i=0)^(7-j) 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(square, equiv) sum_(j=0)^7 sum_(i=j)^(7) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &stackrel(penta, equiv) sum_(i=0)^7 sum_(j=0)^(i) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &equiv sum_(i=0)^3 sum_(k=0)^1 sum_(j=0)^(2i+k) 2^(16(2i+k)) dot `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 \ &equiv sum_(i=0)^3 2^(32i) dot sum_(k=0)^1 2^(16k) dot sum_(j=0)^(2i+k) `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 $ where at step - `triangle` we can ignore `i > 7-j`, since that makes `2^(16(i+j)) equiv 0 mod 2^128`, - `square` we rewrite the second summation such that `i` iterates from `j` to 7, rather than `0` to `7-j`, and - `penta` we swap the sums.
+= Assumptions
 
-We let `raw_product` capture the second summation in this last formula (see [mul:c:raw_product]). By construction, ``raw_product`_i < 2^51` for all `i in [0, 3]`, far exceeding the 32-bits that fit in a single `Word`-limb. What remains then is to reduce each limb of `raw_product` `mod 2^32`, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
+= Constraints From the ISA, we gather five requirements for the `DIV[U][W]` and `REM[U][W]` instructions:
 
-This reduce-and-carry operation is constrained by [mul:c:range_lo]/[mul:c:range_hi] and [mul:c:carry], combined with `carry`'s definition. [mul:c:carry] and `carry`'s definition enforce that $ forall i in [0, 3]: `raw_product`_i + `carry`_(i-1) - `res`_i in { k dot 2^32 | k in [0, 2^20) } $ with ``carry`_(-1) = 0` for simplicity. In other words: ``res`_i equiv `raw_product`_i + `carry`_(i-1) (mod 2^32)`. With [mul:c:range_lo]/[mul:c:range_hi] forcing ``res`_i < 2^32`, ``res`_i` can only assume one value: ``raw_product`_i + `carry`_(i-1) mod 2^32`.
+**R1.** _For both signed and unsigned division, except in the case of_ overflow, _it holds that ``n` = `q` `d` + `r``._ **R2.** _`DIV` and `DIVU` perform [...] signed and unsigned integer division [...] rounding towards zero._ **R3.** _For `REM`, the sign of a nonzero [remainder] equals the sign of the [numerator]._ **R4.** In case of _division-by-zero_, ``r` = `n`` and ``q` = 2^64-1` (unsigned) or ``q` = -1` (signed). **R5.** In case of _overflow_, ``q` = `n`` and ``r` = 0`. Here, _overflow_ occurs when ``n` = -2^(63)` and ``d` = -1` (and, hence, ``signed` = 1`), and _division-by-zero_ indicates that ``d` = 0`. In the following, we list the constraints associated with the `DVRM` chip, and explain how these together enforce all five of these requirements.
 
-*Note*: one may have observed that [mul:c:carry] requires ``carry`_i in [0, 2^20)`, while no limb of a valid carry value would ever exceed `2^19`. This is indeed the case. However, there is some slack in how tight one has to constrain the `carry` values. In fact, in this situation it suffices to assert that ``carry`_i < frac(p, 2^32, style: "skewed") approx 2^31`, where `p` denotes the field's modulus. Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
+## R3: Sign remainder equals sign numerator
 
-### Definitions
+We start with R3, which is straightforwardly asserted by constraint [dvrm:c:sign_r_equals_sign_n].
 
-We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
+## R2: rounding towards zero
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
-| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
-| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
-| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
-| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
+R2 states that "_[in] signed and unsigned integer division [the quotient is] round[ed] towards zero._" In other words, (i) the sign of ``n`-`qd`` must match that of `n` (unless ``qd` = `n``), and (ii) `|`n`-`qd`|  < |`d`|` (unless ``d` = 0`).
 
-### Product
+Leveraging R1, we can rewrite these as (i) the sign of ``r`` must match that of `n` (unless ``r` = 0`), and (ii) `|`r`|  < |`d`|` (unless ``d` = 0`).
 
-[mul:c:raw_product] defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
+Focusing on the first statement, we observe that this trivially holds when ``signed` = 0`, while R3 deals with the case that ``signed` = 1`. The second statement is enforced by [dvrm:c:abs_r_lt_abs_d]. [dvrm:c:abs_r_if_negative] and [dvrm:c:abs_r_if_nonnegative] (resp. [dvrm:c:abs_d_if_negative] and [dvrm:c:abs_d_if_nonnegative]) are included to ensure that `abs_r` (resp. `abs_d`) is the absolute value of `r` (resp. `d`).
 
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
-| | | _polynomial:_ `Σ_k = 0^1 2^(16 * k) * Σ_j = 0^2 * i + k lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
+## R5: overflow
 
-### Lookup
+The ISA requires that ``q` = `n`` and ``r` = 0` in the event of overflow (i.e., when ``n` = -2^63` and ``d` = -1`). We note that the second half of this requirement is already satisfied by R2: since ``d` = -1 != 0`, R2 requires that `|`r`| < |`d`| = 1`, to which ``r` = 0` is the only satisfying value.
 
-The  chip contributes the following to the lookup:
+We moreover find that R1 can be leveraged to enforce the correct value of `q`. While ``n` = `qd` + `r`` (R1) does _not_ hold in the case of overflow, the relation ``n` = |`q`|`d` + `r`` _does_. We moreover note that the 64-bit _signed_ two's complement representation of `-2^63` is identical to the 64-bit _unsigned_ representation of `|-2^63| = 2^63`. As such, by interpreting `q` as an unsigned integer when ``overflow` = 1`, it follows that R1 will enforce ``q` = `0x80...00``.
+
+In summary, in case of overflow R2 enforces that ``r` = 0`. Moreover, it suffices to interpret `q` as an unsigned integer ([dvrm:c:sign_q]); R1 will ensure it contains the correct value.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
-| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
+| `DVRM-C7` | `sign_q` = `signed` dot (1- `overflow`) |  |
+| | _polynomial:_ `signed * (1 - overflow) - sign_q = 0` | |
+| `DVRM-C8` | `ZERO[overflow; n[0] + n[1] + n[2] + (n[3] - 2^15 * sign_n) + (1 - sign_n) + (65535 - d[0]) + (65535 - d[1]) + (65535 - d[2]) + (65535 - d[3])]` | μ_sum |
 
-## Padding
+We highlight [dvrm:c:overflow]. Recall that the `overflow` flag should be set if and only if (i) ``signed` = 1`, (ii) ``n` = `0x80...00``, and (iii) ``d` = `0xFF...FF``. These requirements are equivalent to the state where: $ forall i in [0, 3]:&& 65535 - `d`_i &= 0,\ forall i in [0, 2]:&& `n`_i &= 0,\ && `n`_3 - 2^15 dot `sign_n` &= 0,\ && 1 - `sign_n` &= 0,\ $ where ``signed` = 1` follows from the last equality. The requirement is phrased in this way, because the left-hand sides of the above expressions are `>= 0` by construction. Given that the sum of these expressions does not exceed `2^19` (and thus never wraps in the field), we can now say that the `overflow` bit should be set to `1` if and only if their sum evaluates to `0`. The `ZERO` lookup guarantees this to be the case.
 
-The table can be padded to the next power of two with the following value assignments:
+## R1: $#`n` = #`qd` + #`r`$
 
-## Notes
+Rewriting R1, we find the constraint `not`overflow` => `n` - `r` = `qd``.
 
-- `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere.
+Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality `mod 2^128`, rather than `mod 2^64`. To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to ``qd` mod 2^128` using constraints [dvrm:c:mul_lower] and [dvrm:c:mul_upper]; [dvrm:c:q_range] is included to uphold assumption [mul:a:rhs].
 
-As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
+It now remains to enforce that `extended_n_sub_r` is the _signed_ 128-bit representation of ``n`-`r``. Here, we introduce `extended_n` and `extended_r`. By their definition, these variables contain the signed 128-bit representations of `n` and `r`. The `carry` variable has been defined such that it mimics those in the `ADD` chip, except that here we add two `QuadHL`s rather than two `DWordHL`s, thus needing four carry bits instead of two. With this in place, [dvrm:c:n_sub_r] (mimicking [add:c:carry]) ensures `extended_n_sub_r` must contain the correct value.
 
----
+Lastly, observe that ``n` - `r` in (-2^64, 2^64)`, _regardless_ of the value of `signed`. Moreover, note that the upper halves of the 128-bit representations of all values in this range are either `0xFF...FF` (negative) or `0x00...00` (non-negative). This means that we do not need to store all 128 bits of `extended_n_sub_r`. Rather, we need only store the lower 64 bits, and a separate bit (`sign_n_sub_r`) indicating whether the top limbs are all-ones or all-zeroes. The prover is free to select the value for `sign_n_sub_r`; only one of the two will fit the proof.
 
-# DVRM Chip
+## R4: division-by-zero
 
-//  chip = load_chip("src/dvrm.toml", config)
+R4 requires that ``q` = 2^64-1` (unsigned) or `-1` (signed) and ``r` = n` when ``d` = 0`. Recalling R1, we see that ``n` = `q` `d` + `r` = `r`` holds when ``d` = 0`, which already enforces the latter. Next, we note that, in two's complement, the _unsigned_ value `2^64-1` and _signed_ value `-1` are both represented by the 64-bit string `0xFF...FF`. Hence, only [dvrm:c:q_if_div_by_zero] is required to completely constrain R4; [dvrm:c:div_by_zero] just ensures the `div_by_zero` flag is set when ``d` = 0`.
 
-*placeholder chapter: WIP*
+## Other
 
----
+The following constraints are included to enforce that the values of `sign_n`, `sign_r` and `sign_d` are correct.
 
-# LOAD Chip
+## Output
+
+Lastly, this chip contributes the following to the lookup:
+
+= Padding To pad the `DVRM` table, we use the following data, representing the unsigned division `frac(0, 0, style: "horizontal")`:
 
 ## Columns
 
@@ -1206,59 +1410,164 @@ As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store
 
 | Name | Type | Description |
 |------|------|-------------|
-| `base_address` | `DWordWL` | The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is |
-| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
-| `read2` | `Bit` | Whether to read exactly 2 bytes |
-| `read4` | `Bit` | Whether to read exactly 4 bytes |
-| `read8` | `Bit` | Whether to read exactly 8 bytes |
-| `signed` | `Bit` | Whether to sign-extend (1) or zero-extend (0) |
+| `n` | `DWordHL` | The numerator |
+| `d` | `DWordHL` | The denominator |
+| `signed` | `Bit` | Whether to interpret the input as signed (1) or unsigned (0) integers. |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `res` | `DWordBL` | The result of reading (up to) 8 bytes from `base_address`, extended corresponding to `signed`. |
+| `q` | `DWordHL` | The quotient; $`n` / `d`$ rounded towards zero. |
+| `r` | `DWordHL` | The remainder; $`n` - `q` `d`$. |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `sign_bit` | `Bit` | The sign bit extracted from the bytes retrieved from memory |
+| `div_by_zero` | `Bit` | Whether $`d`=0$. |
+| `overflow` | `Bit` | Whether $`n` = -2^63$ and $`d`=-1$. |
+| `abs_r` | `DWordWL` | Absolute value of `r`. |
+| `abs_d` | `DWordWL` | Absolute value of `d`. |
+| `n_sub_r` | `DWordHL` | $`n`-`r`$. |
+| `sign_n_sub_r` | `Bit` | Sign of `n_sub_r`. |
+| `sign_n` | `Bit` | Sign of `n`. |
+| `sign_d` | `Bit` | Sign of `d`. |
+| `sign_q` | `Bit` | Sign of `q`. |
+| `sign_r` | `Bit` | Sign of `r`. |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
-| `read1` | `Bit` | Whether to read exactly 1 byte |
+| `extended_n` | `QuadHL` | sign-extended value of `n`. |
+| `extended_r` | `QuadHL` | sign-extended value of `r`. |
+| `extension_n_sub_r` | `DWordHL` | sign-extension limbs of `n_sub_r`. |
+| `extended_n_sub_r` | `QuadHL` | sign-extended value of `n_sub_r`. |
+| `carry` | `Bit[4]` | carries for adding `extended_n_sub_r` to `extended_r`, forming `extended_n`. |
+| `μ_sum` | `BaseField` | sum of multiplicities |
+
+**Definition of `extended_n`:**
+```
+extended_n (when iter=[0, 3]) := n[i]
+extended_n (when iter=[4, 7]) := 65535 * sign_n
+```
 
-**Definition of `read1`:**
+**Definition of `extended_r`:**
 ```
-read1 := μ - read2 - read4 - read8
+extended_r (when iter=[0, 3]) := r[i]
+extended_r (when iter=[4, 7]) := 65535 * sign_r
+```
+
+**Definition of `extension_n_sub_r`:**
+```
+extension_n_sub_r := 65535 * sign_n_sub_r
+```
+
+**Definition of `extended_n_sub_r`:**
+```
+extended_n_sub_r (when iter=[0, 3]) := n_sub_r[i]
+extended_n_sub_r (when iter=[4, 7]) := extension_n_sub_r[i - 4]
+```
+
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] - (extended_n::QuadWL)[i])
+carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] + carry[i - 1] - (extended_n::QuadWL)[i])
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_q + μ_r
 ```
 
 ### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `μ` | `Bit` |  |
-
-The `LOAD` chip is comprised of  variables that are expressed using  columns:
+| `μ_q` | `BaseField` |  |
+| `μ_r` | `BaseField` |  |
 
 ## Assumptions
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `LOAD-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
-| `LOAD-A2` |  | `IS_BIT<signed>` |
-| `LOAD-A3` |  | `IS_BIT<read2>` |
-| `LOAD-A4` |  | `IS_BIT<read4>` |
-| `LOAD-A5` |  | `IS_BIT<read8>` |
-| `LOAD-A6` |  | `IS_BIT<read2 + read4 + read8>` |
-| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+| `DVRM-A1.i` | i ∈ [0, 3] | `IS_HALF[n[i]]` |
+| `DVRM-A2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` |
+| `DVRM-A3` |  | `IS_BIT<signed>` |
 
 ## Constraints
 
-The chip delegates the actual memory interaction to the `MEMW` chip, and ensures correctness of the requested sign/zero extension. The output `res` is correctly range-checked as long as the memory contents are.
+### equality
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
+| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
+| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+
+### defs
+
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
+
+### n_sub_r
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C9.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
+| `DVRM-C10.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
+| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
+| `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
+
+### output
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
+| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+
+### abs_diff
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C2` |  | `LT[1 - div_by_zero; abs_r, abs_d, 0]` | μ_sum |
+| `DVRM-C3` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
+| `DVRM-C4.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
+| | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
+| `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
+| `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
+| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
+
+### div_by_zero
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C19.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
+| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
+| `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
+
+### sign_equality
+
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C1` | `r` ≠ 0 ⇒ `sign_r` = `sign_n` |
+| | _polynomial:_ `Σ_{i=0}^{3} r[i] * (sign_r - sign_n) = 0` |
+
+---
+
+# LOAD Chip
+
+= Columns
+
+The `LOAD` chip is comprised of  variables that are expressed using  columns:
+
+= Assumptions
+
+= Constraints The chip delegates the actual memory interaction to the `MEMW` chip, and ensures correctness of the requested sign/zero extension. The output `res` is correctly range-checked as long as the memory contents are.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
@@ -1279,39 +1588,93 @@ The chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8]` | -μ |
+| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8, signed]` | -μ |
 
-## Padding
+= Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `base_address` | `DWordWL` | The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `read2` | `Bit` | Whether to read exactly 2 bytes |
+| `read4` | `Bit` | Whether to read exactly 4 bytes |
+| `read8` | `Bit` | Whether to read exactly 8 bytes |
+| `signed` | `Bit` | Whether to sign-extend (1) or zero-extend (0) |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `res` | `DWordBL` | The result of reading (up to) 8 bytes from `base_address`, extended corresponding to `signed`. |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `sign_bit` | `Bit` | The sign bit extracted from the bytes retrieved from memory |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `read1` | `Bit` | Whether to read exactly 1 byte |
+
+**Definition of `read1`:**
+```
+read1 := μ - read2 - read4 - read8
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `LOAD-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `LOAD-A2` |  | `IS_BIT<signed>` |
+| `LOAD-A3` |  | `IS_BIT<read2>` |
+| `LOAD-A4` |  | `IS_BIT<read4>` |
+| `LOAD-A5` |  | `IS_BIT<read8>` |
+| `LOAD-A6` |  | `IS_BIT<read2 + read4 + read8>` |
+| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
 ---
 
 # ECALL Chips
 
-##  chip
+=  chip
 
-### Columns
+## Columns
 
 The  chip leverages  variable, spanning  columns:
 
-### Assumptions
+## Assumptions
 
 It is assumed the input is range checked:
 
-### Constraints
+## Constraints
 
 The  chip: + makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), + writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and + sets `pc` equal to `1` ([halt:c:pc]). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`; the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
 
 [ Observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there. ])
 
-#### Lookup
+### Lookup
 
 The HALT chip contributes the following interaction to the lookup-argument:
 
 *Note*: [`93` is the system call number corresponding to `sys_exit`.]
 
-### Padding
+## Padding
 
 This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
 
@@ -1319,6 +1682,16 @@ This chip should only contain a single row. Given that `2^0 = 1`, this chip does
 
 # BITWISE Chips
 
+= Columns
+
+The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
+
+*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
+
+= Lookup This chip adds the following interactions to the lookup:
+
+= Areas of Optimization The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicitly range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`, `ZERO`) lookups in separate tables. + Combine `HWSL` and `HWSLC` into a single lookup (see also \).
+
 ## Columns
 
 ### Input
@@ -1338,7 +1711,7 @@ This chip should only contain a single row. Given that `2^0 = 1`, this chip does
 | `XOR` | `Byte` | the binary XOR of `X` and `Y` |
 | `MSB8` | `Bit` | the most significant bit of `X` |
 | `MSB16` | `Bit` | the most significant bit of `Y` |
-| `ZERO` | `Bit` | whether $`X` = 0 and `Y` = 0$ |
+| `ZERO` | `Bit` | whether $`X` = 0$, $`Y` = 0$ and $`Z` = 0$. |
 | `SLL` | `Half` | `X\|\|Y` logically left-shifted by `Z`: $((`X` + 256`Y`) `<<` `Z`) mod 2^16$ |
 | `SLLC` | `Half` | `X\|\|Y` logically right-shifted by `Z`: $(`X` + 256`Y`) `>>` (16 - `Z`)$ |
 
@@ -1358,18 +1731,6 @@ This chip should only contain a single row. Given that `2^0 = 1`, this chip does
 | `μ_HWSL` | `BaseField` |  |
 | `μ_HWSLC` | `BaseField` |  |
 
-The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
-
-*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
-
-## Lookup
-
-This chip adds the following interactions to the lookup:
-
-## Areas of Optimization
-
-The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, `ZERO`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`) lookups in separate tables. + Combine `HWSL` and `HWSLC` into a single lookup (see also \).
-
 ## Constraints
 
 ### contributions
@@ -1381,7 +1742,7 @@ The following ideas may prove to be optimizations for the  chip: + Extend `IS_BY
 | `BITWISE-C3` | `XOR_BYTE[XOR; X, Y]` | -μ_XOR |
 | `BITWISE-C4` | `MSB8[MSB8; X]` | -μ_MSB8 |
 | `BITWISE-C5` | `MSB16[MSB16; X + 256 * Y]` | -μ_MSB16 |
-| `BITWISE-C6` | `ZERO[ZERO; X + 256 * Y]` | -μ_ZERO |
+| `BITWISE-C6` | `ZERO[ZERO; X + 256 * Y + 65536 * Z]` | -μ_ZERO |
 | `BITWISE-C7` | `IS_BYTE[X]` | -μ_IS_BYTE |
 | `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
 | `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
diff --git a/scripts/extract_and_convert_spec.sh b/scripts/extract_and_convert_spec.sh
index 1ddb0bafa..be8b1859b 100755
--- a/scripts/extract_and_convert_spec.sh
+++ b/scripts/extract_and_convert_spec.sh
@@ -32,9 +32,18 @@ for file in $(git ls-tree -r origin/spec/main --name-only | grep '^spec/src/.*\.
     git show "origin/spec/main:$file" > "$TEMP_DIR/src/$filename" 2>/dev/null || true
 done
 
+# Extract all Typst (.typ) files
+for file in $(git ls-tree -r origin/spec/main --name-only | grep '^spec/.*\.typ$'); do
+    filename=$(basename "$file")
+    git show "origin/spec/main:$file" > "$TEMP_DIR/$filename" 2>/dev/null || true
+done
+
 # List extracted files
 echo "Extracted files:"
 ls -la "$TEMP_DIR/src/"
+echo ""
+echo "Extracted .typ files:"
+ls -la "$TEMP_DIR/"*.typ 2>/dev/null || echo "(none)"
 
 # Create output directory
 mkdir -p "$OUTPUT_DIR"
@@ -42,9 +51,8 @@ mkdir -p "$OUTPUT_DIR"
 # Run the Python converter
 echo ""
 echo "Converting to Markdown..."
-python3 "$SCRIPT_DIR/spec_to_md.py" \
-    "$TEMP_DIR/src/config.toml" \
-    "$TEMP_DIR/src/"*.toml \
+python3 "$SCRIPT_DIR/typst_to_md.py" \
+    --spec-dir "$TEMP_DIR" \
     --output-dir "$OUTPUT_DIR"
 
 # Cleanup
diff --git a/scripts/typst_to_md.py b/scripts/typst_to_md.py
index d1864428e..16aa96fcb 100644
--- a/scripts/typst_to_md.py
+++ b/scripts/typst_to_md.py
@@ -161,8 +161,11 @@ def iters_to_text(obj: dict) -> str:
 CHAPTERS = [
     ("memory", "Memory Argument"),
     ("variables", "Variables"),
+    ("signatures", "Signatures"),
     ("is_bit", "IS_BIT Template"),
+    ("sign", "SIGN Template"),
     ("add", "ADD/SUB Template"),
+    ("neg", "NEG Template"),
     ("decode", "DECODE Table"),
     ("cpu", "CPU Chip"),
     ("shift", "SHIFT Chip"),

From 278f932b9f947e9d7951b9520567c6e1b91900e0 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Mon, 9 Feb 2026 14:24:32 -0300
Subject: [PATCH 061/105] update

---
 docs/spec/dvrm.md      | 40 ++++++++++++++++----------------
 docs/spec/shift.md     | 14 ++++++------
 docs/spec/spec_full.md | 52 +++++++++++++++++++++---------------------
 3 files changed, 53 insertions(+), 53 deletions(-)

diff --git a/docs/spec/dvrm.md b/docs/spec/dvrm.md
index 7295a9f45..5c846f3cb 100644
--- a/docs/spec/dvrm.md
+++ b/docs/spec/dvrm.md
@@ -156,6 +156,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 
 ## Constraints
 
+### output
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
+| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+
 ### equality
 
 | Tag | Range | Description | Multiplicity |
@@ -164,13 +171,12 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
 | `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
 
-### defs
+### sign_equality
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
+| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
+| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
 
 ### n_sub_r
 
@@ -181,12 +187,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
 | `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
 
-### output
+### div_by_zero
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
-| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C19.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
+| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
+| `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
 
 ### abs_diff
 
@@ -200,17 +207,10 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
 | | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
-### div_by_zero
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C19.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
-| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
-| `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
-
-### sign_equality
+### defs
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
-| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
\ No newline at end of file
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
\ No newline at end of file
diff --git a/docs/spec/shift.md b/docs/spec/shift.md
index cbee34a08..a3130f953 100644
--- a/docs/spec/shift.md
+++ b/docs/spec/shift.md
@@ -184,15 +184,15 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 
 ## Constraints
 
-### is_negative
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
-
 ### left_flag
 
 | Tag | Description |
 |-----|-------------|
 | `SHIFT-C1` | `direction` => `μ` = 1 |
-| | _polynomial:_ `direction * (1 - μ) = 0` |
\ No newline at end of file
+| | _polynomial:_ `direction * (1 - μ) = 0` |
+
+### is_negative
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
\ No newline at end of file
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index fd9b196e5..d3eb2c93e 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -854,12 +854,6 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 
 ## Constraints
 
-### is_negative
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
-
 ### left_flag
 
 | Tag | Description |
@@ -867,6 +861,12 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 | `SHIFT-C1` | `direction` => `μ` = 1 |
 | | _polynomial:_ `direction * (1 - μ) = 0` |
 
+### is_negative
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
+
 ---
 
 # BRANCH Chip
@@ -1498,6 +1498,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 
 ## Constraints
 
+### output
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
+| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+
 ### equality
 
 | Tag | Range | Description | Multiplicity |
@@ -1506,13 +1513,12 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
 | `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
 
-### defs
+### sign_equality
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
+| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
+| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
 
 ### n_sub_r
 
@@ -1523,12 +1529,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
 | `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
 
-### output
+### div_by_zero
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
-| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C19.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
+| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
+| `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
 
 ### abs_diff
 
@@ -1542,20 +1549,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
 | | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
-### div_by_zero
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C19.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
-| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
-| `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
-
-### sign_equality
+### defs
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
-| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
 
 ---
 

From 742a5bdb4ee109fbdb74ebdb73ab7b934478907a Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 10 Feb 2026 15:58:39 +0100
Subject: [PATCH 062/105] spec: Add initial tooling to check data formats,
 prepare for more elaborate type checking (#271)

* spec: Add initial tooling to check data formats, prepare for more elaborate type checking

* Initial type checking

* ruff format

* Update some more typing mismatches

* Move to range-based type checks

* Avoid casting to more limbs by leveraging scalar-array mult and literal casts

* toml fixes to pass type checks

* Type check virtual definitions properly now

* ruff format

* Make typst compile by turning big range values to string

* Switch some isinstance checks around to make both mypy and ty work

* Fix issues after rebasing on spec/main

* Address review comments

* Review comments

* lit -> const
---
 spec/chip.typ         |   1 -
 spec/memory.typ       |   1 +
 spec/src/bitwise.toml |  24 +-
 spec/src/branch.toml  |   8 +-
 spec/src/config.toml  |  48 +-
 spec/src/cpu.toml     |  25 +-
 spec/src/dvrm.toml    |   8 +-
 spec/src/is_bit.toml  |   2 +-
 spec/src/lt.toml      |   2 +-
 spec/src/memw.toml    |  29 +-
 spec/src/mul.toml     |   6 +-
 spec/src/page.toml    |  10 +-
 spec/src/shift.toml   |   2 +-
 spec/tooling/chip.py  | 988 ++++++++++++++++++++++++++++++++++++++++++
 14 files changed, 1082 insertions(+), 72 deletions(-)
 create mode 100644 spec/tooling/chip.py

diff --git a/spec/chip.typ b/spec/chip.typ
index 10479943e..4749b886e 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -113,7 +113,6 @@
     }
 
     if "poly" in def {
-      // assert(false, message: repr(index_all(var_name, gather_indices(def))))
       (
         [],
         table.cell(align: right, emph[definition]), 
diff --git a/spec/memory.typ b/spec/memory.typ
index 1fcb7b54e..62059de37 100644
--- a/spec/memory.typ
+++ b/spec/memory.typ
@@ -229,3 +229,4 @@ add the required balancing terms to the LogUp sum.
 = Future topics of interest
 
 - Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research)
+- Double check whether IS_BYTE constraints are needed for fini
diff --git a/spec/src/bitwise.toml b/spec/src/bitwise.toml
index 9b4a3f951..75e8faee4 100644
--- a/spec/src/bitwise.toml
+++ b/spec/src/bitwise.toml
@@ -4,67 +4,67 @@ name = "BITWISE"
 name = "X"
 type = "Byte"
 desc = ""
-precomputed = "true"
+precomputed = true
 
 [[variables.input]]
 name = "Y"
 type = "Byte"
 desc = ""
-precomputed = "true"
+precomputed = true
 
 [[variables.input]]
 name = "Z"
 type = "B4"
 desc = ""
-precomputed = "true"
+precomputed = true
 
 [[variables.output]]
 name = "AND"
 type = "Byte"
 desc = "the binary AND of `X` and `Y`"
-precomputed = "true"
+precomputed = true
 
 [[variables.output]]
 name = "OR"
 type = "Byte"
 desc = "the binary OR of `X` and `Y`"
-precomputed = "true"
+precomputed = true
 
 [[variables.output]]
 name = "XOR"
 type = "Byte"
 desc = "the binary XOR of `X` and `Y`"
-precomputed = "true"
+precomputed = true
 
 [[variables.output]]
 name = "MSB8"
 type = "Bit"
 desc = "the most significant bit of `X`"
-precomputed = "true"
+precomputed = true
 
 [[variables.output]]
 name = "MSB16"
 type = "Bit"
 desc = "the most significant bit of `Y`"
-precomputed = "true"
+precomputed = true
 
 [[variables.output]]
 name = "ZERO"
 type = "Bit"
 desc = "whether $#`X` = 0$, $#`Y` = 0$ and $#`Z` = 0$."
-precomputed = "true"
+precomputed = true
 
 [[variables.output]]
 name = "SLL"
 type = "Half"
 desc = "`X||Y` logically left-shifted by `Z`: $((#`X` + 256#`Y`) #`<<` #`Z`) mod 2^16$"
-precomputed = "true"
+precomputed = true
 
 [[variables.output]]
 name = "SLLC"
 type = "Half"
 desc = "`X||Y` logically right-shifted by `Z`: $(#`X` + 256#`Y`) #`>>` (16 - #`Z`)$"
-precomputed = "true"
+precomputed = true
 
 [[variables.multiplicity]]
 name = "μ_AND"
@@ -197,4 +197,4 @@ kind = "interaction"
 tag = "HWSLC"
 input = [["+", "X", ["*", 256, "Y"]], "Z"]
 output = "SLLC"
-multiplicity = ["-", "μ_HWSLC"]
\ No newline at end of file
+multiplicity = ["-", "μ_HWSLC"]
diff --git a/spec/src/branch.toml b/spec/src/branch.toml
index e66974c8e..beb3c1922 100644
--- a/spec/src/branch.toml
+++ b/spec/src/branch.toml
@@ -11,7 +11,7 @@ pad = 0
 
 [[variables.input]]
 name = "offset"
-type = "Word"
+type = "DWordWL"
 desc = "The offset from the base address to jump to"
 pad = 0
 
@@ -59,7 +59,7 @@ name = "next_pc_unmasked"
 type = "DWordWL"
 desc = "The combination of `next_pc_high`, `next_pc_low[1]` and `unmasked_low_byte` to constrain the addition. This is the computed value for the next pc, before masking off the LSB as required by the ISA."
 def = {idx = "i", polys = [
-  {iter = 0, poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 0]], ["*", ["^", 2, 8], ["idx", "next_pc_low", 1]], ["idx", "unmasked_low_byte", 0]]},
+  {iter = 0, poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 0]], ["*", ["^", 2, 8], ["idx", "next_pc_low", 1]], "unmasked_low_byte"]},
   {iter = 1, poly = ["+", ["*", ["^", 2, 16], ["idx", "next_pc_high", 2]], ["idx", "next_pc_high", 1]]},
 ]}
 
@@ -124,7 +124,7 @@ multiplicity = "μ"
 [[constraints.all]]
 kind = "interaction"
 tag = "AND_BYTE"
-input = [["idx", "unmasked_low_byte", 0], 254]
+input = ["unmasked_low_byte", 254]
 output = ["idx", "next_pc_low", 0]
 multiplicity = "μ"
 
@@ -145,4 +145,4 @@ kind = "interaction"
 tag = "BRANCH"
 input = ["pc", "offset", "register", "JALR"]
 output = "next_pc"
-multiplicity = "-μ"
+multiplicity = ["-", "μ"]
diff --git a/spec/src/config.toml b/spec/src/config.toml
index d836f80e5..0f6ef11d6 100644
--- a/spec/src/config.toml
+++ b/spec/src/config.toml
@@ -4,63 +4,49 @@ version = 1
 [[variables.types]]
 label = "BaseField"
 subtypes = ["BaseField"]
+range = [0, "18446744069414584320"]
 desc = "Variable that can assume any value in the base field."
 
 [[variables.types]]
 label = "Bit"
 subtypes = ["BaseField"]
+range = [0, 1]
 desc = "Variable that can only assume values in the set ${0,1}$."
 
 [[variables.types]]
 label = "B4"
 subtypes = ["BaseField"]
+range = [0, 15]
 desc = "Variable that can only assume values in the range $[0, 2^4)$."
 
 [[variables.types]]
 label = "Byte"
 subtypes = ["BaseField"]
-count = 1
+range = [0, 255]
 desc = "Variable that can only assume values in the range $[0, 2^8)$."
 
 [[variables.types]]
 label = "Half"
 subtypes = ["BaseField"]
+range = [0, 65535]
 desc = "Variable that can only assume values in the range $[0, 2^16)$."
 
 [[variables.types]]
 label = "B20"
 subtypes = ["BaseField"]
+range = [0, 1048575]
 desc = "Variable that can only assume values in the range $[0, 2^20)$."
 
 [[variables.types]]
 label = "Word"
 subtypes = ["BaseField"]
+range = [0, 4294967295]
 desc = "Variable that can only assume values in the range $[0, 2^32)$."
 
-[[variables.types]]
-label = "WordHL"
-subtypes = ["Half", "Half"]
-desc = """\
-       Variable that can only assume values in the range $[0, 2^32)$. \\
-       Represented as an array of two `Half` variables.\
-       """
-
-[[variables.types]]
-label = "WordBL"
-subtypes = ["Byte", "Byte", "Byte", "Byte"]
-desc = """\
-       Variable that can only assume values in the range $[0, 2^32)$. \\
-       Represented as an array of four `Byte` variables.\
-       """
-
-[[variables.types]]
-label = "B35"
-subtypes = ["BaseField"]
-desc = "Variable that can only assume values in the range $[0, 2^35)$."
-
 [[variables.types]]
 label = "B51"
 subtypes = ["BaseField"]
+range = [0, 2251799813685247]
 desc = "Variable that can only assume values in the range $[0, 2^51)$."
 
 [[variables.types]]
@@ -96,6 +82,15 @@ desc = """\
        The `Word` is the *least* significant digit.
        """
 
+[[variables.types]]
+label = "DWordWHH"
+subtypes = ["Half", "Half", "Word"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^64)$. \\
+       Represented as a `Word` and two `Half` variables.\
+       The `Word` is the *most* significant digit.
+       """
+
 [[variables.types]]
 label = "QuadHL"
 subtypes = ["Half", "Half", "Half", "Half", "Half", "Half", "Half", "Half"]
@@ -112,15 +107,6 @@ desc = """\
        Represented as an array of four `Word` variables.\
        """
 
-[[variables.types]]
-label = "DWordWHH"
-subtypes = ["Half", "Half", "Word"]
-desc = """\
-       Variable that can only assume values in the range $[0, 2^64)$. \\
-       Represented as a `Word` and two `Half` variables.\
-       The `Word` is the *most* significant digit.
-       """
-
 [[variables.types]]
 label = "Timestamp"
 subtypes = ["DWordWL"]
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index c151b6eff..994fda508 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -343,6 +343,7 @@ name = "decode"
 kind = "interaction"
 tag = "DECODE"
 input = ["pc", "imm", "packed_decode"]
+multiplicity = 1
 
 
 [[constraint_groups]]
@@ -515,34 +516,40 @@ ref = "cpu:c:range_EBREAK"
 kind = "interaction"
 tag = "IS_BYTE"
 input = ["rs1"]
+multiplicity = 1
 
 [[constraints.range]]
 kind = "interaction"
 tag = "IS_BYTE"
 input = ["rs2"]
+multiplicity = 1
 
 [[constraints.range]]
 kind = "interaction"
 tag = "IS_BYTE"
 input = ["rd"]
+multiplicity = 1
 
 [[constraints.range]]
 kind = "interaction"
 tag = "IS_BYTE"
 input = [["idx", "arg1", "i"]]
 iter = ["i", 0, 7]
+multiplicity = 1
 
 [[constraints.range]]
 kind = "interaction"
 tag = "IS_BYTE"
 input = [["idx", "arg2", "i"]]
 iter = ["i", 0, 7]
+multiplicity = 1
 
 [[constraints.range]]
 kind = "interaction"
 tag = "IS_BYTE"
 input = [["idx", "res", "i"]]
 iter = ["i", 0, 7]
+multiplicity = 1
 
 
 [[constraint_groups]]
@@ -611,7 +618,7 @@ multiplicity = "SHIFT"
 [[constraints.alu]]
 kind = "template"
 tag = "ADD"
-input = ["pc", ["cast", ["+", ["*", 2, "c_type_instruction"], ["*", 4, ["not", "c_type_instruction"]]], "DWordWL"]]
+input = ["pc", ["*", ["+", ["*", 2, "c_type_instruction"], ["*", 4, ["not", "c_type_instruction"]]], ["cast", 1, "DWordWL"]]]
 output = ["cast", "res", "DWordWL"]
 cond = "JALR"
 
@@ -640,7 +647,7 @@ prefix = "M"
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, "rs1"], "rv1", ["+", "timestamp", 0], 1, 0, 0]
+input = [1, ["*", 2, "rs1"], "rv1", ["+", "timestamp", ["cast", 0, "DWordWL"]], 1, 0, 0]
 output = "rv1"
 multiplicity = "read_register1"
 
@@ -654,7 +661,7 @@ iter = ["i", 0, 2]
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, "rs2"], "rv2", ["+", "timestamp", 1], 1, 0, 0]
+input = [1, ["*", 2, "rs2"], "rv2", ["+", "timestamp", ["cast", 1, "DWordWL"]], 1, 0, 0]
 output = "rv2"
 multiplicity = "read_register2"
 
@@ -668,13 +675,13 @@ iter = ["i", 0, 2]
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, "rd"], "rvd", ["+", "timestamp", 2], 1, 0, 0]
+input = [1, ["*", 2, "rd"], "rvd", ["+", "timestamp", ["cast", 2, "DWordWL"]], 1, 0, 0]
 multiplicity = "write_register"
 
 [[constraints.mem]]
 kind = "interaction"
 tag = "LOAD"
-input = [0, "res", ["+", "timestamp", 0], "memory_2bytes", "memory_4bytes", "memory_8bytes", "signed"]
+input = [0, "res", ["+", "timestamp", ["cast", 0, "DWordWL"]], "memory_2bytes", "memory_4bytes", "memory_8bytes", "signed"]
 output = "rvd"
 multiplicity = "LOAD"
 
@@ -682,14 +689,14 @@ multiplicity = "LOAD"
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [0, "res", "rv2", ["+", "timestamp", 1], "memory_2bytes", "memory_4bytes", "memory_8bytes"]
+input = [0, "res", "rv2", ["+", "timestamp", ["cast", 1, "DWordWL"]], "memory_2bytes", "memory_4bytes", "memory_8bytes"]
 multiplicity = "STORE"
 
 # TODO: no types available, so no casting yet
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, 255], "next_pc", ["+", "timestamp", 1], 1, 0, 0]
+input = [1, ["*", 2, 255], "next_pc", ["+", "timestamp", ["cast", 1, "DWordWL"]], 1, 0, 0]
 output = "pc"
 multiplicity = ["not", "pad"]
 
@@ -795,7 +802,7 @@ poly = ["+",
         ["-", "branch_cond"],
         "JALR",
         ["*", ["idx", "res", 0], ["not", "mp_selector"], "BLT"],
-        ["*", ["not", ["idx", "res", 0]], "mp_selector", "BLT"],
+        ["*", ["-", 1, ["idx", "res", 0]], "mp_selector", "BLT"],
         ["*", "is_equal", ["not", "mp_selector"], "BEQ"],
         ["*", ["not", "is_equal"], "mp_selector", "BEQ"]
     ]
@@ -810,6 +817,6 @@ multiplicity = "branch_cond"
 [[constraints.misc]]
 kind = "template"
 tag = "ADD"
-input = ["pc", ["cast", ["+", ["*", 2, "c_type_instruction"], ["*", 4, ["not", "c_type_instruction"]]], "DWordWL"]]
+input = ["pc", ["*", ["+", ["*", 2, "c_type_instruction"], ["*", 4, ["not", "c_type_instruction"]]], ["cast", 1, "DWordWL"]]]
 output = "next_pc"
 desc = "Increment `pc` to `next_pc` if we're not branching"
diff --git a/spec/src/dvrm.toml b/spec/src/dvrm.toml
index d93449228..52583907c 100644
--- a/spec/src/dvrm.toml
+++ b/spec/src/dvrm.toml
@@ -376,13 +376,13 @@ desc = "Each row contributes the following to the LogUp sum"
 [[constraints.output]]
 kind = "interaction"
 tag = "DVRM"
-input = ["n", "d", "signed", "0"]
+input = ["n", "d", "signed", 0]
 output = ["cast", "q", "DWordWL"]
-multiplicity = "-μ_q"
+multiplicity = ["-", "μ_q"]
 
 [[constraints.output]]
 kind = "interaction"
 tag = "DVRM"
-input = ["n", "d", "signed", "1"]
+input = ["n", "d", "signed", 1]
 output = ["cast", "r", "DWordWL"]
-multiplicity = "-μ_r"
\ No newline at end of file
+multiplicity = ["-", "μ_r"]
diff --git a/spec/src/is_bit.toml b/spec/src/is_bit.toml
index 47e96a27e..a72b5f648 100644
--- a/spec/src/is_bit.toml
+++ b/spec/src/is_bit.toml
@@ -16,5 +16,5 @@ name = "all"
 [[constraints.all]]
 kind = "arith"
 constraint = "$#`cond` => #`X` (1-#`X`) = 0$"
-poly = ["*", "cond", "X", ["not", "X"]]
+poly = ["*", "cond", "X", ["-", 1, "X"]]
 ref = "isbit:c:isbit"
diff --git a/spec/src/lt.toml b/spec/src/lt.toml
index 10497b637..1941dbb7a 100644
--- a/spec/src/lt.toml
+++ b/spec/src/lt.toml
@@ -160,4 +160,4 @@ kind = "interaction"
 tag = "LT"
 input = [["cast", "lhs", "DWordWL"], ["cast", "rhs", "DWordWL"], "signed"]
 output = "lt"
-multiplicity = "-μ"
+multiplicity = ["-", "μ"]
diff --git a/spec/src/memw.toml b/spec/src/memw.toml
index f7276a9cd..af005c2b4 100644
--- a/spec/src/memw.toml
+++ b/spec/src/memw.toml
@@ -129,7 +129,7 @@ kind = "template"
 tag = "ADD"
 input = ["base_address", 1]
 output = ["cast", ["idx", "address_add", 0], "DWordWL"]
-multiplicity = "w2"
+cond = "w2"
 
 [[constraints.consistency]]
 kind = "template"
@@ -137,7 +137,7 @@ tag = "ADD"
 input = ["base_address", ["+", "i", 1]]
 output = ["cast", ["idx", "address_add", "i"], "DWordWL"]
 iter = ["i", 1, 2]
-multiplicity = "w4"
+cond = "w4"
 
 [[constraints.consistency]]
 kind = "template"
@@ -145,16 +145,37 @@ tag = "ADD"
 input = ["base_address", ["+", "i", 1]]
 output = ["cast", ["idx", "address_add", "i"], "DWordWL"]
 iter = ["i", 3, 6]
-multiplicity = "write8"
+cond = "write8"
+
+[[constraints.consistency]]
+kind = "interaction"
+tag = "IS_HALFWORD"
+input = [["idx", ["idx", "address_add", "i"], "j"]]
+iters = [
+  ["i", 0, 0],
+  ["j", 0, 3],
+]
+multiplicity = "w2"
 
 [[constraints.consistency]]
 kind = "interaction"
 tag = "IS_HALFWORD"
 input = [["idx", ["idx", "address_add", "i"], "j"]]
 iters = [
-  ["i", 0, 6],
+  ["i", 1, 2],
   ["j", 0, 3],
 ]
+multiplicity = "w4"
+
+[[constraints.consistency]]
+kind = "interaction"
+tag = "IS_HALFWORD"
+input = [["idx", ["idx", "address_add", "i"], "j"]]
+iters = [
+  ["i", 3, 6],
+  ["j", 0, 3],
+]
+multiplicity = "write8"
 
 [[constraints.consistency]]
 kind = "interaction"
diff --git a/spec/src/mul.toml b/spec/src/mul.toml
index 238bfe01f..a798c682d 100644
--- a/spec/src/mul.toml
+++ b/spec/src/mul.toml
@@ -182,7 +182,7 @@ name = "prod"
 [[constraints.prod]]
 kind = "arith"
 constraint = "$#`raw_product[i]` = sum_(#`k`=0)^1 2^(16k) sum_(#`j`=0)^(2i+k) #`lhs_ext[j]` dot #`rhs_ext[2i+k-j]`$"
-poly = ["-", ["sum", ["=", "k", 0], "1", ["*", ["^", 2, ["*", 16, "k"]], ["sum", ["=", "j", 0], ["+", ["*", 2, "i"], "k"], ["*", ["idx", "lhs_ext", "j"], ["idx", "rhs_ext", ["-", ["+", ["*", 2, "i"], "k"], "j"]]]]]], ["idx", "raw_product", "i"]]
+poly = ["-", ["sum", ["=", "k", 0], 1, ["*", ["^", 2, ["*", 16, "k"]], ["sum", ["=", "j", 0], ["+", ["*", 2, "i"], "k"], ["*", ["idx", "lhs_ext", "j"], ["idx", "rhs_ext", ["-", ["+", ["*", 2, "i"], "k"], "j"]]]]]], ["idx", "raw_product", "i"]]
 iter = ["i", 0, 3]
 ref = "mul:c:raw_product"
 
@@ -192,7 +192,7 @@ name = "lookup"
 [[constraints.lookup]]
 kind = "interaction"
 tag = "MUL"
-input = ["lhs", "lhs_signed", "rhs", "rhs_signed", "0"]
+input = ["lhs", "lhs_signed", "rhs", "rhs_signed", 0]
 output = ["cast", "lo", "DWordWL"]
 multiplicity = ["-", "μ_lo"]
 ref = "mul:c:lookup_lo"
@@ -200,7 +200,7 @@ ref = "mul:c:lookup_lo"
 [[constraints.lookup]]
 kind = "interaction"
 tag = "MUL"
-input = ["lhs", "lhs_signed", "rhs", "rhs_signed", "1"]
+input = ["lhs", "lhs_signed", "rhs", "rhs_signed", 1]
 output = ["cast", "hi", "DWordWL"]
 multiplicity = ["-", "μ_hi"]
 ref = "mul:c:lookup_hi"
diff --git a/spec/src/page.toml b/spec/src/page.toml
index 8053d63df..21ec76757 100644
--- a/spec/src/page.toml
+++ b/spec/src/page.toml
@@ -2,6 +2,12 @@ name = "PAGE"
 
 # Input
 
+# TODO: add `page` as a "constant" column or something similar
+[[variables.input]]
+name = "page"
+type = "DWordWL"
+desc = "Constant column containing the page base address; should be integrated into the constraints directly"
+
 [[variables.input]]
 name = "offset"
 type = "RowIndex"
@@ -28,7 +34,7 @@ desc = "The timestamp at which this address was last accessed"
 name = "address"
 type = "DWordWL"
 desc = "Adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table."
-def = ["+", "page", ["cast", "offset", "DWordWL"]]
+def = ["+", "page", ["*", "offset", ["cast", 1, "DWordWL"]]]
 
 
 [[constraint_groups]]
@@ -38,11 +44,13 @@ name = "all"
 kind = "interaction"
 tag = "IS_BYTE"
 input = ["init"]
+multiplicity = 1
 
 [[constraints.all]]
 kind = "interaction"
 tag = "IS_BYTE"
 input = ["fini"]
+multiplicity = 1
 
 [[constraints.all]]
 kind = "interaction"
diff --git a/spec/src/shift.toml b/spec/src/shift.toml
index 591efb839..bd6c471a6 100644
--- a/spec/src/shift.toml
+++ b/spec/src/shift.toml
@@ -292,5 +292,5 @@ kind = "interaction"
 tag = "SHIFT"
 input = ["in", "shift", "direction", "signed", "word_instr"]
 output = "out"
-multiplicity = "-μ"
+multiplicity = ["-", "μ"]
 ref = "shift:c:lookup"
diff --git a/spec/tooling/chip.py b/spec/tooling/chip.py
new file mode 100644
index 000000000..8a15ae338
--- /dev/null
+++ b/spec/tooling/chip.py
@@ -0,0 +1,988 @@
+import sys
+import tomllib
+from collections.abc import Callable, Iterable
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Never, Optional, Self
+
+
+class ErrorReporter:  # Prints errors to stderr and remembers whether one was reported
+    reported: bool  # True once error() ran since construction / the last update_location()
+    location: str  # Prefix printed in front of every error message
+
+    def __init__(self, location: str):
+        self.reported = False
+        self.location = location
+
+    def update_location(self, loc: str):  # Switch location; this also clears `reported`
+        self.reported = False
+        self.location = loc
+
+    def error(self, message: str):  # Report unconditionally
+        self.reported = True
+        print(f"ERROR {self.location}: {message}", file=sys.stderr)
+
+    def asserts(self, condition: bool, message: str):  # Report only when `condition` is falsy
+        if not condition:
+            self.error(message)
+
+
+reporter = ErrorReporter("unknown")
+
+
+def assert_no_unexpected(data: dict, possible_keys: Iterable[str]):  # Report every key of `data` not in `possible_keys`
+    for key in data.keys():
+        reporter.asserts(key in possible_keys, f"Unexpected key: {key!r}")
+
+
+@dataclass(frozen=True)
+class Range:  # Integer interval with inclusive bounds `low` and `high`
+    low: int
+    high: int
+
+    @classmethod
+    def const(cls, x: int) -> Self:  # Interval holding only the value `x`
+        return cls(x, x)
+
+    def is_bool(self):  # True when the interval lies within [0, 1]
+        return self.low >= 0 and self.high <= 1
+
+    def is_const(self):  # True when the interval holds exactly one value
+        return self.low == self.high
+
+    def get_const(self) -> int:  # The single value; asserts is_const()
+        assert self.is_const()
+        return self.low
+
+
+type Type = list[Type] | Range
+
+DEFAULT_TYPE: Type = Range.const(0)
+
+type Expr = (
+    LitExpr
+    | VarExpr
+    | IdxExpr
+    | CastExpr
+    | MulExpr
+    | AddExpr
+    | SubExpr
+    | PowExpr
+    | SumExpr
+    | NotExpr
+    | DummyExpr
+)
+
+
+@dataclass
+class Environment:
+    config: "Config"
+    valmap: dict[str, Range]
+    typemap: dict[str, Type]
+
+    def with_val(self, key: str, val: Range) -> Self:
+        return type(self)(self.config, {**self.valmap, key: val}, self.typemap)
+
+
+@dataclass
+class LitExpr:
+    lit: int
+
+    def typecheck(self, _env: Environment) -> Type:
+        return Range.const(self.lit)
+
+
+@dataclass
+class VarExpr:
+    name: str
+
+    def typecheck(self, env: Environment) -> Type:
+        if self.name in env.valmap:
+            return env.valmap[self.name]
+        if self.name in env.typemap:
+            return env.typemap[self.name]
+        reporter.error(f"Unknown variable: {self.name!r}")
+        return DEFAULT_TYPE
+
+
+@dataclass
+class ArrExpr:
+    elems: list[Expr]
+
+    def typecheck(self, env: Environment) -> Type:
+        reporter.asserts(self.elems != [], f"Empty array: {self!r}")
+        return [e.typecheck(env) for e in self.elems]
+
+
+@dataclass
+class IdxExpr:
+    base: Expr
+    idx: Expr
+
+    def typecheck(self, env: Environment) -> Type:
+        base = self.base.typecheck(env)
+        idx = self.idx.typecheck(env)
+        if not isinstance(idx, Range) or not idx.is_const():
+            reporter.error(f"Invalid index: {idx!r}")
+            return Range.const(-1)
+        idxconst = idx.get_const()
+        if isinstance(base, Range):
+            reporter.error(f"Indexing into non-array type: {self!r}")
+            return DEFAULT_TYPE
+        if not (0 <= idxconst < len(base)):
+            reporter.error(f"Index out of range {self!r}")
+            idxconst = 0
+        return base[idxconst]
+
+
+@dataclass
+class CastExpr:
+    base: Expr
+    type: Type
+
+    def typecheck(self, env: Environment) -> Type:
+        base = self.base.typecheck(env)
+        # TODO? Detect more sorts of invalid casts
+        baselen = len(base) if isinstance(base, list) else 1
+        castlen = len(self.type) if isinstance(self.type, list) else 1
+        reporter.asserts(
+            baselen >= castlen or (isinstance(base, Range) and base.is_const()),
+            f"Casting from fewer columns to more: {self!r} {base} {self.type}",
+        )
+        return self.type
+
+
+@dataclass
+class MulExpr:
+    factors: list[Expr]
+
+    def type_match(self, a: Type, b: Type) -> Type:
+        if isinstance(a, list) and isinstance(b, list):
+            reporter.error(f"Multiplication of non-scalar types: {self!r}")
+            return DEFAULT_TYPE
+        elif not isinstance(a, Range):
+            return [self.type_match(x, b) for x in a]
+        elif isinstance(b, list):
+            return self.type_match(b, a)
+        else:
+            extrema = [x * y for x in [a.low, a.high] for y in [b.low, b.high]]
+            return Range(min(extrema), max(extrema))
+
+    def typecheck(self, env: Environment) -> Type:
+        reporter.asserts(self.factors != [], f"Empty product: {self!r}")
+        t: Type = Range.const(1)
+        for f in self.factors:
+            t = self.type_match(t, f.typecheck(env))
+        return t
+
+
+@dataclass
+class AddExpr:
+    terms: list[Expr]
+
+    def type_match(self, a: Type, b: Type) -> Type:
+        if isinstance(a, list) and isinstance(b, list):
+            if len(a) != len(b):
+                reporter.error(f"Adding array types of different length {self!r}")
+                return [DEFAULT_TYPE for _ in b]
+            return [self.type_match(x, y) for x, y in zip(a, b)]
+        elif isinstance(a, list) or isinstance(b, list):
+            reporter.error(f"Adding of scalar and array types {self!r}")
+            return DEFAULT_TYPE
+        else:
+            return Range(a.low + b.low, a.high + b.high)
+
+    def typecheck(self, env: Environment) -> Type:
+        if not self.terms:
+            reporter.error("Empty add")
+            return Range.const(0)
+        t: Type = self.terms[0].typecheck(env)
+        for term in self.terms[1:]:
+            t = self.type_match(t, term.typecheck(env))
+        return t
+
+
+@dataclass
+class SubExpr:
+    head: Expr
+    subs: list[Expr]
+
+    def type_match(self, a: Type, b: Type) -> Type:
+        if isinstance(a, list) and isinstance(b, list):
+            if len(a) != len(b):
+                reporter.error(f"Subtracting array types of different length {self!r}")
+                return [DEFAULT_TYPE for _ in a]
+            return [self.type_match(x, y) for x, y in zip(a, b)]
+        elif isinstance(a, list) or isinstance(b, list):
+            reporter.error(f"Subtraction of scalar and array types {self!r}")
+            return DEFAULT_TYPE
+        else:
+            return Range(a.low - b.high, a.high - b.low)
+
+    def typecheck(self, env: Environment) -> Type:
+        t = self.head.typecheck(env)
+        if not self.subs:
+            if not isinstance(t, Range):
+                reporter.error(f"Negating a non-scalar type: {self!r}")
+                return t
+            return Range(-t.high, -t.low)
+        for term in self.subs:
+            t = self.type_match(t, term.typecheck(env))
+        return t
+
+
+@dataclass
+class PowExpr:
+    base: Expr
+    exp: Expr
+
+    def typecheck(self, env: Environment) -> Type:
+        base = self.base.typecheck(env)
+        exp = self.exp.typecheck(env)
+        if isinstance(base, list) or not base.is_const():
+            reporter.error(f"Invalid exponentiation with non-const base: {self.base!r}")
+            return DEFAULT_TYPE
+        if isinstance(exp, list) or not exp.is_const():
+            reporter.error(
+                f"Invalid exponentiation with non-const exponent: {self.exp!r}"
+            )
+            return DEFAULT_TYPE
+        val = pow(base.get_const(), exp.get_const(), env.config.variables.prime)
+        return Range.const(val)
+
+
+@dataclass
+class SumExpr:  # Built from ["sum", ["=", var, start], stop, terms]: `terms` summed over the iterator
+    iter: "Iter"
+    terms: Expr
+
+    def type_match(self, a: Type, b: Type) -> Type:  # Type of a + b; arrays combine element-wise
+        if isinstance(a, list) and isinstance(b, list):
+            if len(a) != len(b):
+                reporter.error(f"Summing array types of different length {self!r}")
+                return [DEFAULT_TYPE for _ in b]
+            return [self.type_match(x, y) for x, y in zip(a, b)]
+        elif isinstance(a, list) or isinstance(b, list):
+            reporter.error(f"Summing of scalar and array types {self!r}")
+            return DEFAULT_TYPE
+        else:
+            return Range(a.low + b.low, a.high + b.high)
+
+    def typecheck(self, env: Environment) -> Type:
+        t: Optional[Type] = None  # seeded by the first summand so array-typed terms can be summed
+        for tc in self.iter.typecheck(env, lambda e: [self.terms.typecheck(e)]):
+            t = tc if t is None else self.type_match(t, tc)
+        return Range.const(0) if t is None else t  # an empty sum is the constant 0
+
+
+@dataclass
+class NotExpr:
+    inner: Expr
+
+    def typecheck(self, env: Environment) -> Type:
+        inner = self.inner.typecheck(env)
+        if isinstance(inner, list) or not inner.is_bool():
+            reporter.error(f"Not a bool passed to `not`: {self.inner!r}")
+            return Range(0, 1)
+        return Range(1 - inner.high, 1 - inner.low)
+
+
+@dataclass
+class DummyExpr:
+    def typecheck(self, _env: Environment) -> Type:
+        return DEFAULT_TYPE
+
+
+def build_expr(config: Optional["Config"], data: object) -> Expr:
+    # Does this need config, or do we delay any config-checking to when we use the expr?
+    match data:
+        case int(x):
+            return LitExpr(x)
+        case str(x):
+            reporter.asserts(
+                x.isidentifier(), f"Invalid identifier name for variable {x!r}"
+            )
+            return VarExpr(x)
+        case ["idx", x, y]:
+            return IdxExpr(build_expr(config, x), build_expr(config, y))
+        case ["cast", x, t]:
+            assert config is not None
+            assert isinstance(t, (list, str))
+            return CastExpr(build_expr(config, x), build_type(config, t))
+        case ["*", *factors]:
+            return MulExpr([build_expr(config, f) for f in factors])
+        case ["+", *terms]:
+            return AddExpr([build_expr(config, t) for t in terms])
+        case ["-", head, *subs]:
+            return SubExpr(
+                build_expr(config, head), [build_expr(config, s) for s in subs]
+            )
+        case ["^", base, exp]:
+            return PowExpr(build_expr(config, base), build_expr(config, exp))
+        case ["sum", ["=", str(var), start], stop, terms]:
+            assert config is not None
+            return SumExpr(Iter(config, var, start, stop), build_expr(config, terms))
+        case ["not", e]:
+            return NotExpr(build_expr(config, e))
+        case other:
+            reporter.error(f"Unknown expression: {other!r}")
+            return DummyExpr()
+
+
+@dataclass
+class Iter:  # Loop variable `name` running over the constants start..stop, both inclusive
+    name: str
+    start: Expr
+    stop: Expr
+
+    def __init__(self, config: "Config", name: str, start: object, stop: object):
+        self.name = name
+        reporter.asserts(
+            isinstance(self.name, str), f"iter name is not a string: {self.name!r}"
+        )
+        reporter.asserts(
+            self.name.isidentifier(), f"Not a valid identifier: {self.name!r}"
+        )
+        self.start = build_expr(config, start)  # bounds are raw expression data, parsed here
+        self.stop = build_expr(config, stop)
+
+    def typecheck[T](
+        self, env: Environment, callback: Callable[[Environment], Iterable[T]]
+    ) -> Iterable[T]:
+        start = self.start.typecheck(env)
+        if isinstance(start, list) or not start.is_const():
+            reporter.error(f"Starting value of iterator not a const: {self!r}")
+            start = Range.const(0)  # recover with 0 so checking can continue
+        stop = self.stop.typecheck(env)
+        if isinstance(stop, list) or not stop.is_const():
+            reporter.error(f"Ending value of iterator not a const: {self!r}")
+            stop = Range.const(start.get_const())  # recover with a single-step loop
+
+        # While it's tempting to replace this loop by an assignment of Range(start, stop + 1) to self.name,
+        # that would break both detection of consts and narrowing down to the correct type for indexing
+        # heterogeneous array types
+        for i in range(start.get_const(), stop.get_const() + 1):
+            yield from callback(env.with_val(self.name, Range.const(i)))
+
+
+def iters_of(obj: dict, name=None, config: Optional["Config"] = None) -> list[Iter]:
+    """Return a list of iterators needed by `obj`. Taken from `iters` or `iter`.
+    Prepend `name` to every iterator, if given. `config` is handed to each `Iter`.
+    Adapted from the corresponding typst implementation."""
+    # Without an explicit `config`, use the module-level global of that name (set by the
+    # script entry point), or None when it does not exist (module imported, not run)
+    cfg = config if config is not None else globals().get("config")
+
+    def clean_iter(it):
+        arr = list(it) if isinstance(it, list) else [it]  # copy: never mutate the parsed data
+        if name is not None:
+            arr = [name] + arr
+        if len(arr) == 2:
+            arr.append(arr[-1])  # Assume single-element range
+        if len(arr) != 3:
+            reporter.error(f"Invalid length iter: {arr!r}")
+            return Iter(cfg, "_", 0, 0)
+        return Iter(cfg, *arr)
+
+    if "iters" in obj:
+        reporter.asserts(
+            "iter" not in obj, f"Object has both `iters` and `iter`: {obj!r}"
+        )
+        return [clean_iter(it) for it in obj["iters"]]
+    elif "iter" in obj:
+        return [clean_iter(obj["iter"])]
+    else:
+        return []
+
+
+@dataclass
+class TypeConfig:
+    label: str
+    subtypes: list[Type]
+    range: Optional[Range]
+    desc: str
+    preprocessed: bool
+
+    def __init__(self, default_name: str, lookup: Callable[[str], Type], data: dict):
+        assert_no_unexpected(data, type(self).__annotations__.keys())
+        self.label = data["label"]
+        if "range" in data:
+            reporter.asserts(
+                data["subtypes"] == [default_name],
+                f"Specified a range on a non-base composite type: {data!r}",
+            )
+            reporter.asserts(
+                isinstance(data["range"], list) and len(data["range"]) == 2,
+                f"Invalid range: {data!r}",
+            )
+            start, stop = data["range"]
+            if not isinstance(start, int) and not (
+                isinstance(start, str) and start.isdigit()
+            ):
+                reporter.error(f"Range start not an int: {data!r}")
+                start = 0
+            if not isinstance(stop, int) and not (
+                isinstance(stop, str) and stop.isdigit()
+            ):
+                reporter.error(f"Range end not an int: {data!r}")
+                stop = start
+            reporter.asserts(int(start) <= int(stop), f"Inverted range: {data!r}")
+            self.range = Range(int(start), int(stop))
+            self.subtypes = []
+        else:
+            self.range = None
+            self.subtypes = [lookup(tp) for tp in data["subtypes"]]
+        self.desc = data["desc"]
+        self.preprocessed = data.get("preprocessed", False)
+
+    def as_type(self) -> Type:
+        return self.range or self.subtypes[:]
+
+
+@dataclass
+class ConfigCategories:
+    all: list[str]
+    instantiated: list[str]
+
+    def __init__(self, data: dict):
+        assert_no_unexpected(data, type(self).__annotations__.keys())
+        self.all = data["all"]
+        self.instantiated = data["instantiated"]
+        reporter.asserts(
+            all(isinstance(v, str) for v in self.all),
+            f"Something's not a string: {self.all}",
+        )
+        reporter.asserts(
+            all(isinstance(v, str) for v in self.instantiated),
+            f"Something's not a string: {self.instantiated}",
+        )
+        reporter.asserts(
+            set(self.instantiated) <= set(self.all),
+            f"Instantiated not a subset of all: {self!r}",
+        )
+
+
+@dataclass
+class ConfigVariables:
+    types: list[TypeConfig]
+    categories: ConfigCategories
+    prime: int
+
+    def __init__(self, data: dict):
+        assert_no_unexpected(data, type(self).__annotations__.keys())
+        self.types = []
+        base_type = data["types"][0]["label"]
+        for tp in data["types"]:
+            self.types.append(TypeConfig(base_type, self.lookup_type, tp))
+        self.categories = ConfigCategories(data["categories"])
+        basefield = self.lookup_type(base_type)
+        assert isinstance(basefield, Range)
+        self.prime = basefield.high + 1
+
+    def lookup_type(self, typename: str) -> Type:
+        matches = [t for t in self.types if t.label == typename]
+        if len(matches) != 1:
+            reporter.error(f"Couldn't lookup type by name: {typename!r}")
+            return DEFAULT_TYPE
+        return matches[0].as_type()
+
+
+@dataclass
+class ConfigMetadata:
+    version: int
+
+    def __init__(self, data: dict):
+        assert_no_unexpected(data, type(self).__annotations__.keys())
+        self.version = data["version"]
+        reporter.asserts(
+            isinstance(self.version, int), f"version {self.version!r} is not an int"
+        )
+
+
+@dataclass
+class Config:  # Top-level configuration, built from the `metadata` and `variables` tables
+    metadata: ConfigMetadata
+    variables: ConfigVariables
+
+    def __init__(self, data: dict):
+        """Construct a Config from toml-parsed data"""
+        assert_no_unexpected(data, type(self).__annotations__.keys())
+        self.metadata = ConfigMetadata(data["metadata"])
+        self.variables = ConfigVariables(data["variables"])
+
+    @classmethod
+    def from_file(cls, filename: str | Path) -> Self:  # Parse the TOML file at `filename`
+        reporter.update_location(str(filename))
+        return cls(tomllib.loads(Path(filename).read_bytes().decode()))  # read_bytes() closes the file
+
+    @classmethod
+    def from_string(cls, s: str) -> Self:  # Parse TOML source text `s`
+        reporter.update_location("<string>")
+        return cls(tomllib.loads(s))
+
+
+def build_type(config: Config, data: list | str):  # `[T, n]` -> list of n copies of T; a string is looked up by label
+    if isinstance(data, list):
+        if len(data) != 2:
+            reporter.error(f"Invalid type: {data!r}")
+            return DEFAULT_TYPE
+        return [build_type(config, data[0]) for _ in range(data[1])]
+    else:
+        return config.variables.lookup_type(data)
+
+
+@dataclass
+class Variable:
+    category: str
+    name: str
+    type: Type
+    desc: str
+    pad: Expr
+    precomputed: bool
+
+    def __init__(self, config: Config, category: str, data: dict):
+        self.category = category
+        assert_no_unexpected(data, Variable.__annotations__.keys())
+        self.name = data["name"]
+        reporter.asserts(isinstance(self.name, str), f"{self.name!r} is not a string")
+        reporter.asserts(self.name.isidentifier(), f"Invalid identifier: {self.name!r}")
+        self.type = build_type(config, data["type"])
+        self.desc = data["desc"]
+        reporter.asserts(isinstance(self.desc, str), f"{self.desc!r} is not a string")
+        self.pad = build_expr(None, data.get("pad", 0))
+        self.precomputed = data.get("precomputed", False)
+        reporter.asserts(
+            isinstance(self.precomputed, bool),
+            f"precomputed is not a bool: {self.precomputed!r}",
+        )
+
+
+def all_iters[T](
+    its: list[Iter], env: Environment, callback: Callable[[Environment], Iterable[T]]
+) -> Iterable[T]:
+    if not its:
+        yield from callback(env)
+    else:
+        yield from its[0].typecheck(env, lambda e: all_iters(its[1:], e, callback))
+
+
+@dataclass
+class PolyWithIters:
+    poly: Expr
+    iters: list[Iter]
+
+
+@dataclass
+class VirtualDef:
+    # A list of polynomials with each a set of iters they range over
+    defs: list[PolyWithIters]
+
+    def __init__(self, config: Config, name: str, tp: Type, data: dict):
+        if "poly" in data:
+            idx = data.get("idx", None)
+            self.defs = [
+                PolyWithIters(
+                    build_expr(config, data["poly"]), iters_of(data, name=idx)
+                )
+            ]
+        elif "polys" in data:
+            idx = data.get("idx", None)
+            self.defs = [
+                PolyWithIters(
+                    build_expr(config, poly["poly"]), iters_of(poly, name=idx)
+                )
+                for poly in data["polys"]
+            ]
+        else:
+            self.defs = [PolyWithIters(build_expr(config, data), [])]
+
+
+@dataclass
+class VirtualVariable(Variable):
+    def_: VirtualDef
+
+    def __init__(self, config: Config, category: str, data: dict):
+        assert_no_unexpected(data, set(Variable.__annotations__.keys()) | {"def"})
+        reporter.asserts("def" in data, f"Missing def for virtual column: {data!r}")
+        def_ = data.pop("def", {})
+        super().__init__(config, category, data)
+        self.def_ = VirtualDef(config, self.name, self.type, def_)
+
+    def typecheck(self, env: Environment) -> Type:
+        def structure_matches(a: Type, b: Type) -> bool:
+            if isinstance(a, Range) and isinstance(b, (Range, type(None))):
+                return True
+            elif isinstance(a, list) and isinstance(b, list):
+                return len(a) == len(b) and all(
+                    structure_matches(x, y) for x, y in zip(a, b)
+                )
+            else:
+                return False
+
+        def handle_iters(
+            env: Environment,
+            iters: list[Iter],
+            poly: Expr,
+            expected: Type,
+            indices: list[int],
+            seen: set[tuple],
+        ):
+            if not iters:
+                # Check not doubly defined
+                for s in seen:
+                    ln = min(len(s), len(indices))
+                    if s[:ln] == tuple(indices[:ln]):
+                        reporter.error(
+                            f"Double definition for virtual column: {self!r} at index {indices}"
+                        )
+                        break
+
+                val = poly.typecheck(env)
+                # check val structure matches assigned
+                reporter.asserts(
+                    structure_matches(val, expected),
+                    f"Invalid structure for definition to virtual column: {self!r}",
+                )
+                # Check type fits?
+
+                seen.add(tuple(indices))
+            else:
+                it, *its = iters
+                # Some duplicated code/concepts from Iter.typecheck
+                # But threading the extra needed state through overly complicates everything
+                start = it.start.typecheck(env)
+                if isinstance(start, list) or not start.is_const():
+                    reporter.error(
+                        f"Starting value of virtual def iter not a const: {self!r}"
+                    )
+                    start = Range.const(0)
+                stop = it.stop.typecheck(env)
+                if isinstance(stop, list) or not stop.is_const():
+                    reporter.error(
+                        f"Ending value of virtual def iter not a const: {self!r}"
+                    )
+                    stop = Range.const(start.get_const())
+
+                if isinstance(expected, Range):
+                    reporter.error(
+                        f"Virtual definition has an iter for a scalar: {self!r}"
+                    )
+                    return
+
+                if not 0 <= start.get_const() <= stop.get_const() < len(expected):
+                    reporter.error(
+                        f"Virtual definition index [{start.get_const()}, {stop.get_const()}] out of range for {expected}: {self!r}"
+                    )
+                    return
+
+                for i in range(start.get_const(), stop.get_const() + 1):
+                    handle_iters(
+                        env.with_val(it.name, Range.const(i)),
+                        its,
+                        poly,
+                        expected[i],
+                        indices + [i],
+                        seen,
+                    )
+
+        def is_covered(seen: set[tuple], indices: list[int]) -> bool:
+            for s in seen:
+                if len(s) <= len(indices) and s == tuple(indices[: len(s)]):
+                    return True
+            return False
+
+        def check_covered(t: Type, seen: set[tuple], indices: list[int]):
+            if isinstance(t, Range):
+                reporter.asserts(
+                    is_covered(seen, indices),
+                    f"Virtual column {self.name!r} not completely defined",
+                )
+            else:
+                for i, elt in enumerate(t):
+                    check_covered(elt, seen, indices + [i])
+
+        # Special case for better error messages
+        if isinstance(self.type, Range):
+            reporter.asserts(
+                len(self.def_.defs) == 1 and not self.def_.defs[0].iters,
+                f"Invalid def for scalar column: {self!r}",
+            )
+            assigned_type = self.def_.defs[0].poly.typecheck(env)
+            if not isinstance(assigned_type, Range):
+                reporter.error(
+                    f"Assigning non-scalar type to scalar virtual column: {self!r}"
+                )
+                return self.type
+            # Check type fits?
+            # Leaving this out because it produces too much noise with one-hot assumptions
+            # reporter.asserts(self.type.low <= assigned_type.low <= assigned_type.high <= self.type.high, f"Definition may not fit in virtual column: {self!r}")
+        else:
+            # Check no indices are covered twice
+            seen: set[tuple] = set()
+            for poly_iters in self.def_.defs:
+                handle_iters(
+                    env, poly_iters.iters, poly_iters.poly, self.type, [], seen
+                )
+            # Check everything is covered
+            check_covered(self.type, seen, [])
+        return self.type
+
+
+@dataclass
+class Assumption:
+    desc: str
+    iters: list[Iter]
+
+    def __init__(self, config: Config, data: dict):
+        assert_no_unexpected(
+            data, set(self.__annotations__.keys()) | {"iter", "iters", "ref"}
+        )
+        self.desc = data["desc"]
+        self.iters = iters_of(data)
+
+
+@dataclass
+class ArithConstraint:
+    constraint: str
+    desc: str
+    poly: Expr
+    iters: list[Iter]
+
+    def __init__(self, config: Config, data: dict):
+        assert_no_unexpected(
+            data, set(self.__annotations__.keys()) | {"kind", "ref", "iter", "iters"}
+        )
+        assert data["kind"] == "arith"
+        self.constraint = data["constraint"]
+        reporter.asserts(
+            isinstance(self.constraint, str),
+            f"Constraint not a string: {self.constraint!r}",
+        )
+        self.desc = data.get("desc", "")
+        reporter.asserts(
+            isinstance(self.desc, str), f"desc is not a string: {self.desc!r}"
+        )
+        self.poly = build_expr(config, data["poly"])
+        self.iters = iters_of(data)
+
+    def typecheck(self, env: Environment) -> Iterable[Never]:
+        # TODO? Should we check that there's no overflow of the modulus?
+        #   This would probably struggle due to things like one-hot invariants
+
+        def check_includes_zero(t: Type):
+            if isinstance(t, Range):
+                reporter.asserts(
+                    t.low <= 0 <= t.high,
+                    f"Unsatisfiable constraint, 0 not in range: {self!r} {t}",
+                )
+            else:
+                reporter.error(
+                    f"Non-scalar value for polynomial constraint: {self!r} {t}"
+                )
+
+        for t in all_iters(self.iters, env, lambda e: [self.poly.typecheck(e)]):
+            check_includes_zero(t)
+        return []
+
+
+@dataclass
+class Signature:
+    tag: str
+    input: list[Type]
+    output: Optional[Type]
+
+
+@dataclass
+class InteractionLike:
+    kind: str
+    conditional_name: str
+    conditional_required: bool
+    signature: type[Signature]
+
+    tag: str
+    desc: str
+    input: list[Expr]
+    output: Optional[Expr]
+    conditional: Optional[Expr]
+    iters: list[Iter]
+
+    def __init__(self, config: Config, data: dict):
+        assert_no_unexpected(
+            data,
+            {
+                "tag",
+                "desc",
+                "input",
+                "output",
+                self.conditional_name,
+                "kind",
+                "ref",
+                "iter",
+                "iters",
+            },
+        )
+        assert data["kind"] == self.kind
+        self.tag = data["tag"]
+        reporter.asserts(
+            isinstance(self.tag, str), f"tag is not a string: {self.tag!r}"
+        )
+        self.desc = data.get("desc", "")
+        reporter.asserts(
+            isinstance(self.desc, str), f"Description is not a string: {self.desc!r}"
+        )
+        self.input = [build_expr(config, inp) for inp in data["input"]]
+        if "output" in data:
+            self.output = build_expr(config, data["output"])
+        else:
+            self.output = None
+        if self.conditional_name in data:
+            self.conditional = build_expr(config, data[self.conditional_name])
+        else:
+            reporter.asserts(
+                not self.conditional_required,
+                f"Missing {self.conditional_name}: {data!r}",
+            )
+            self.conditional = None
+        self.iters = iters_of(data)
+
+    def typecheck(self, env: Environment) -> Iterable[Signature]:
+        def callback(e: Environment) -> Iterable[Signature]:
+            # TODO: Should we be able to check cond/multiplicity somehow?
+            if self.conditional is not None:
+                self.conditional.typecheck(e)
+            return [
+                self.signature(
+                    self.tag,
+                    [inp.typecheck(e) for inp in self.input],
+                    self.output.typecheck(e) if self.output else None,
+                )
+            ]
+
+        return all_iters(self.iters, env, callback)
+
+
+class TemplateSignature(Signature):
+    pass
+
+
+class TemplateConstraint(InteractionLike):
+    kind = "template"
+    conditional_name = "cond"
+    conditional_required = False
+    signature = TemplateSignature
+
+
+class InteractionSignature(Signature):
+    pass
+
+
+class InteractionConstraint(InteractionLike):
+    kind = "interaction"
+    conditional_name = "multiplicity"
+    conditional_required = True
+    signature = InteractionSignature
+
+
+@dataclass
+class DummyConstraint:
+    def typecheck(self, env: Environment) -> list[Never]:
+        return []
+
+
+type Constraint = (
+    ArithConstraint | TemplateConstraint | InteractionConstraint | DummyConstraint
+)
+
+
+def build_constraint(config, data: dict) -> Constraint:
+    match data["kind"]:
+        case "arith":
+            return ArithConstraint(config, data)
+        case "template":
+            return TemplateConstraint(config, data)
+        case "interaction":
+            return InteractionConstraint(config, data)
+        case other:
+            reporter.error(f"Unknown constraint kind: {other!r}")
+            return DummyConstraint()
+
+
+@dataclass
+class Chip:
+    config: Config
+    name: str
+    variables: list[Variable]
+    assumptions: list[Assumption]
+    constraints: list[Constraint]
+
+    def __init__(self, config: Config, data: dict):
+        """Construct a chip from toml-parsed data"""
+        assert_no_unexpected(
+            data, set(type(self).__annotations__.keys()) | {"constraint_groups"}
+        )
+        assert_no_unexpected(data["variables"], config.variables.categories.all)
+        self.config = config
+        self.name = data["name"]
+        reporter.asserts(
+            isinstance(self.name, str), f"name is not a string: {self.name!r}"
+        )
+        reporter.asserts(self.name.isidentifier(), f"Invalid identifier: {self.name!r}")
+        self.variables = [
+            (Variable if cat != "virtual" else VirtualVariable)(config, cat, var)
+            for cat, vars in data["variables"].items()
+            for var in vars
+        ]
+        self.assumptions = [Assumption(config, a) for a in data.get("assumptions", [])]
+        constraint_groups = [grp["name"] for grp in data.get("constraint_groups", [])]
+        assert_no_unexpected(data.get("constraints", {}), constraint_groups)
+        self.constraints = [
+            build_constraint(config, constraint)
+            for group in data.get("constraints", {}).values()
+            for constraint in group
+        ]
+
+    @classmethod
+    def from_file(cls, config: Config, filename: str | Path) -> Self:  # Parse the chip TOML file at `filename`
+        reporter.update_location(str(filename))
+        return cls(config, tomllib.loads(Path(filename).read_bytes().decode()))  # read_bytes() closes the file
+
+    @classmethod
+    def from_string(cls, config: Config, s: str) -> Self:
+        reporter.update_location("<string>")
+        return cls(config, tomllib.loads(s))
+
+    def typecheck(self) -> Iterable[Signature]:
+        typemap = {}
+        for v in self.variables:
+            if isinstance(v.type, list) and len(v.type) == 1:
+                t = v.type[0]
+            else:
+                t = v.type
+            typemap[v.name] = t
+
+        env = Environment(self.config, {}, typemap)
+        for v in self.variables:
+            if isinstance(v, VirtualVariable):
+                v.typecheck(env)
+        for c in self.constraints:
+            yield from c.typecheck(env)
+
+
+if __name__ == "__main__":
+    config = Config.from_file(sys.argv[1])
+    signatures = sys.argv[2]  # Later
+    if reporter.reported:
+        sys.exit(1)
+    reported = False  # sticky flag: update_location() resets reporter.reported for each file/chip
+    chips: list[Chip] = []
+    for file in sys.argv[3:]:
+        if file in sys.argv[1:3]:
+            continue
+        chips.append(Chip.from_file(config, file))
+        reported |= reporter.reported
+    for chip in [] if reported else chips:  # typecheck only when every chip parsed cleanly
+        reporter.update_location(f"Chip {chip.name}")
+        # TODO: do something with the signatures; list() forces the generator for its side effects
+        list(chip.typecheck())
+        reported |= reporter.reported
+    sys.exit(1 if reported else 0)  # reported errors must fail the run, not just be printed

From 26ae8331d7cd6095539744b47e47cd990ce0050a Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 10 Feb 2026 16:43:15 +0100
Subject: [PATCH 063/105] spec: Introduce array expressions (#295)

Closes #135
---
 spec/chip.typ        | 4 ++--
 spec/expr.typ        | 3 +++
 spec/src/branch.toml | 2 +-
 spec/src/shift.toml  | 6 +++---
 spec/tooling/chip.py | 3 +++
 5 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/spec/chip.typ b/spec/chip.typ
index 4749b886e..f3ac28a74 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -52,9 +52,9 @@
 #let render_chip_padding_table(chip, config) = {
   // Whether `var` is a preprocessed variable.
   let is_preprocessed(var) = {
-    config.variables.types
+    let type = config.variables.types
     .filter(t => t.label == var.type)
-    .all(t => t.at("preprocessed", default: false))
+    type.len() > 0 and type.all(t => t.at("preprocessed", default: false))
   }
 
   let instantiated_vars = config.variables.categories.instantiated.map(c => chip.variables.at(c, default: ())).flatten()
diff --git a/spec/expr.typ b/spec/expr.typ
index 1c6c7942e..c4f84eb59 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -28,6 +28,7 @@
 // <expr> ::= ()                           ; ""
 //          | var                          ; str(var)
 //          | int                          ; int
+//          | ["arr", expr, ...]           ; [expr, ...]
 //          | ["idx", expr1, expr2]        ; expr1[expr2]
 //          | ["not", expr]                ; !expr
 //          | ["+", expr1, expr2, ...]     ; expr1 + expr2 + ...
@@ -91,6 +92,7 @@
 // Typeset an expression as code
 #let expr_to_code = make_expr_formatter(
   (
+    "arr": (pp, rec, e) => `[` + e.slice(1).map(rec.with(PREC.MAX)).join(`, `) + `]`,
     "idx": (pp, rec, e) => rec(PREC.MIN, e.at(1)) + `[` + rec(PREC.MAX, e.at(2)) + `]`,
     "not": (pp, rec, e) => cwrap(rec(PREC.not, 1) + ` - ` + rec(PREC.not, e.at(1)), pp < PREC.not),
     "+": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(PREC.add)).join(` + `), pp < PREC.add),
@@ -149,6 +151,7 @@
 // Typeset an expression as math
 #let expr_to_math = make_expr_formatter(
   (
+    "arr": (pp, rec, e) => $[#e.slice(1).map(rec.with(PREC.MAX)).join($, $)]$,
     "idx": (pp, rec, e) => {
       let (val, idxs) = flat_idxs(e)
       $#rec(PREC.idx, val)_(#idxs.map(idx => rec(PREC.idx, idx)).join($, $))$
diff --git a/spec/src/branch.toml b/spec/src/branch.toml
index beb3c1922..34bcdf8cb 100644
--- a/spec/src/branch.toml
+++ b/spec/src/branch.toml
@@ -34,7 +34,7 @@ pad = 0
 name = "next_pc_high"
 type = ["Half", 3]
 desc = "The upper part of the next pc"
-pad = 0 # TODO(#128): improve handling for arrays
+pad = ["arr", 0, 0, 0]
 
 [[variables.output]]
 name = "next_pc_low"
diff --git a/spec/src/shift.toml b/spec/src/shift.toml
index bd6c471a6..5faed54c7 100644
--- a/spec/src/shift.toml
+++ b/spec/src/shift.toml
@@ -65,19 +65,19 @@ pad = 1
 name = "X"
 type = ["Half", 5]
 desc = "scratch variable."
-pad = 0 # TODO: array
+pad = ["arr", 0, 0, 0, 0, 0]
 
 [[variables.auxiliary]]
 name = "Y"
 type = ["Half", 4]
 desc = "scratch variable."
-pad = 0 # TODO: array
+pad = ["arr", 0, 0, 0, 0]
 
 [[variables.auxiliary]]
 name = "limb_shift"
 type = ["Bit", 4]
 desc = "One-hot vector indicating whether $floor.l #`shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $#`word_instr` = 1$ and $4$ otherwise."
-pad = 0 # TODO: array
+pad = ["arr", 0, 0, 0, 0]
 
 # Virtual
 
diff --git a/spec/tooling/chip.py b/spec/tooling/chip.py
index 8a15ae338..6a78dc091 100644
--- a/spec/tooling/chip.py
+++ b/spec/tooling/chip.py
@@ -62,6 +62,7 @@ def get_const(self) -> int:
 type Expr = (
     LitExpr
     | VarExpr
+    | ArrExpr
     | IdxExpr
     | CastExpr
     | MulExpr
@@ -303,6 +304,8 @@ def build_expr(config: Optional["Config"], data: object) -> Expr:
                 x.isidentifier(), f"Invalid identifier name for variable {x!r}"
             )
             return VarExpr(x)
+        case ["arr", *elems]:
+            return ArrExpr([build_expr(config, e) for e in elems])
         case ["idx", x, y]:
             return IdxExpr(build_expr(config, x), build_expr(config, y))
         case ["cast", x, t]:

From 9fd3bf813273b7c10053ffeb70e9728c237cf341 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 10 Feb 2026 16:46:19 +0100
Subject: [PATCH 064/105] spec: separate ALU path for STORE to enable byte
 representation of rv2 to exist in arg2 (#308)

* spec: separate ALU path for STORE to enable byte representation of rv2 to exist in arg2

* Apply review suggestion

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* Update spec/src/cpu.toml

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

---------

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/src/cpu.toml | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index 994fda508..283597246 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -559,10 +559,17 @@ prefix = "A"
 [[constraints.alu]]
 kind = "template"
 tag = "ADD"
-cond = ["+", "ADD", "LOAD", "STORE"]
+cond = ["+", "ADD", "LOAD"]
 input = [["cast", "arg1", "DWordWL"], ["cast", "arg2", "DWordWL"]]
 output = ["cast", "res", "DWordWL"]
 
+[[constraints.alu]]
+kind = "template"
+tag = "ADD"
+cond = "STORE"
+input = [["cast", "arg1", "DWordWL"], "imm"]
+output = ["cast", "res", "DWordWL"]
+
 [[constraints.alu]]
 kind = "template"
 tag = "SUB"
@@ -689,7 +696,7 @@ multiplicity = "LOAD"
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [0, "res", "rv2", ["+", "timestamp", ["cast", 1, "DWordWL"]], "memory_2bytes", "memory_4bytes", "memory_8bytes"]
+input = [0, "res", ["cast", "arg2", ["Byte", 8]], ["+", "timestamp", ["cast", 1, "DWordWL"]], "memory_2bytes", "memory_4bytes", "memory_8bytes"]
 multiplicity = "STORE"
 
 # TODO: no types available, so no casting yet
@@ -754,13 +761,13 @@ multiplicity = "word_instr"
 
 [[constraints.ext]]
 kind = "arith"
-constraint = "$#`arg2[:4]` = (1 - #`STORE` - #`LOAD`) dot #`rv2[:2]` + (1 - #`BEQ` - #`BLT`) dot #`imm[0]`$"
-poly = ["-", ["idx", ["cast", "arg2", "DWordWL"], 0], ["*", ["-", 1, "STORE", "LOAD"], ["idx", ["cast", "rv2", "DWordWL"], 0]], ["*", ["-", 1, "BEQ", "BLT"], ["idx", "imm", 0]]]
+constraint = "$#`arg2[:4]` = (1 - #`LOAD`) dot #`rv2[:2]` + (1 - #`BEQ` - #`BLT` - #`STORE`) dot #`imm[0]`$"
+poly = ["-", ["idx", ["cast", "arg2", "DWordWL"], 0], ["*", ["not", "LOAD"], ["idx", ["cast", "rv2", "DWordWL"], 0]], ["*", ["-", 1, "BEQ", "BLT", "STORE"], ["idx", "imm", 0]]]
 
 [[constraints.ext]]
 kind = "arith"
-constraint = "$#`arg2[4:]` = (1 - #`STORE` - #`LOAD`) dot ((1 - #`word_instr`) dot #`rv2[2]` + #`signed` dot #`arg2_sign_bit` dot (2^(32) - 1)) + (1 - #`BEQ` - #`BLT`) dot #`imm[1]`$"
-poly = ["-", ["idx", ["cast", "arg2", "DWordWL"], 1], ["*", ["-", 1, "STORE", "LOAD"], ["not", "word_instr"], ["idx", "rv2", 2]], ["*", ["-", 1, "STORE", "LOAD"], "signed", "arg2_sign_bit", ["-", ["^", 2, 32], 1]], ["*", ["-", 1, "BEQ", "BLT"], ["idx", "imm", 1]]]
+constraint = "$#`arg2[4:]` = (1 - #`LOAD`) dot ((1 - #`word_instr`) dot #`rv2[2]` + #`signed` dot #`arg2_sign_bit` dot (2^(32) - 1)) + (1 - #`BEQ` - #`BLT` - #`STORE`) dot #`imm[1]`$"
+poly = ["-", ["idx", ["cast", "arg2", "DWordWL"], 1], ["*", ["not", "LOAD"], ["not", "word_instr"], ["idx", "rv2", 2]], ["*", ["not", "LOAD"], "signed", "arg2_sign_bit", ["-", ["^", 2, 32], 1]], ["*", ["-", 1, "BEQ", "BLT", "STORE"], ["idx", "imm", 1]]]
 
 [[constraints.ext]]
 kind = "interaction"

From f19e9a49c66d4c08f0f9870d038984b3047be96a Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Wed, 11 Feb 2026 16:44:12 -0300
Subject: [PATCH 065/105] update_docs

---
 docs/spec/branch.md    |   6 +-
 docs/spec/cpu.md       | 151 +++++++++++++-------------
 docs/spec/dvrm.md      |  40 +++----
 docs/spec/memory.md    |   2 +-
 docs/spec/memw.md      |  44 ++++----
 docs/spec/spec_full.md | 241 +++++++++++++++++++++--------------------
 6 files changed, 245 insertions(+), 239 deletions(-)

diff --git a/docs/spec/branch.md b/docs/spec/branch.md
index 9d4a07a76..37fa8b918 100644
--- a/docs/spec/branch.md
+++ b/docs/spec/branch.md
@@ -19,7 +19,7 @@ The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed impli
 | `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
 | `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
 | `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
-| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte[0], 254]` | μ |
+| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte, 254]` | μ |
 | `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
 
 This chip contributes the following to the lookup argument.
@@ -39,7 +39,7 @@ The table can be padded to the next power of two with the following value assign
 | Name | Type | Description |
 |------|------|-------------|
 | `pc` | `DWordWL` | The current pc, used as base address when `!JALR` |
-| `offset` | `Word` | The offset from the base address to jump to |
+| `offset` | `DWordWL` | The offset from the base address to jump to |
 | `register` | `DWordWL` | The base address to use when `JALR` |
 | `JALR` | `Bit` | Selects between `pc` and `register` as base address, needed for the `JALR` instruction |
 
@@ -65,7 +65,7 @@ The table can be padded to the next power of two with the following value assign
 
 **Definition of `next_pc_unmasked`:**
 ```
-next_pc_unmasked (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + unmasked_low_byte[0]
+next_pc_unmasked (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + unmasked_low_byte
 next_pc_unmasked (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 ```
 
diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index 756c67c9e..0bd5e0d6d 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -8,9 +8,9 @@ The `CPU` chip is comprised of  variables that are expressed using  columns:
 
 = Constraints First, we perform a decoding lookup for the current PC.
 
-| Tag | Description |
-|-----|-------------|
-| `CPU-C1` | `DECODE[pc, imm, packed_decode]` |
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-C1` | `DECODE[pc, imm, packed_decode]` | 1 |
 
 > **Note:** All casts for interactions will have to be reviewed once other chip interfaces stabilise
 
@@ -22,41 +22,41 @@ The `CPU` chip is comprised of  variables that are expressed using  columns:
 
 We constrain all columns to have the appropriate ranges. The flags and register indices looked up from the decoding need to be checked, as they are communicated through the interaction in a packed form. In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`. Similarly, since `next_pc` will propagate through the memory argument and be looked up in the instruction decoding on the next cycle, it is forced to be in the correct range. For the auxiliary columns, we need to check the limbs of `arg1`, `arg2`, and `res`. The ranges of the other auxiliary columns are enforced through later constraints.
 
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `CPU-CR2` |  | `IS_BIT<read_register1>` |
-| `CPU-CR3` |  | `IS_BIT<read_register2>` |
-| `CPU-CR4` |  | `IS_BIT<write_register>` |
-| `CPU-CR5` |  | `IS_BIT<memory_2bytes>` |
-| `CPU-CR6` |  | `IS_BIT<memory_4bytes>` |
-| `CPU-CR7` |  | `IS_BIT<memory_8bytes>` |
-| `CPU-CR8` |  | `IS_BIT<c_type_instruction>` |
-| `CPU-CR9` |  | `IS_BIT<signed>` |
-| `CPU-CR10` |  | `IS_BIT<mp_selector>` |
-| `CPU-CR11` |  | `IS_BIT<muldiv_selector>` |
-| `CPU-CR12` |  | `IS_BIT<word_instr>` |
-| `CPU-CR13` |  | `IS_BIT<ADD>` |
-| `CPU-CR14` |  | `IS_BIT<SUB>` |
-| `CPU-CR15` |  | `IS_BIT<SLT>` |
-| `CPU-CR16` |  | `IS_BIT<AND>` |
-| `CPU-CR17` |  | `IS_BIT<OR>` |
-| `CPU-CR18` |  | `IS_BIT<XOR>` |
-| `CPU-CR19` |  | `IS_BIT<SHIFT>` |
-| `CPU-CR20` |  | `IS_BIT<JALR>` |
-| `CPU-CR21` |  | `IS_BIT<BEQ>` |
-| `CPU-CR22` |  | `IS_BIT<BLT>` |
-| `CPU-CR23` |  | `IS_BIT<LOAD>` |
-| `CPU-CR24` |  | `IS_BIT<STORE>` |
-| `CPU-CR25` |  | `IS_BIT<MUL>` |
-| `CPU-CR26` |  | `IS_BIT<DIVREM>` |
-| `CPU-CR27` |  | `IS_BIT<ECALL>` |
-| `CPU-CR28` |  | `IS_BIT<EBREAK>` |
-| `CPU-CR29` |  | `IS_BYTE[rs1]` |
-| `CPU-CR30` |  | `IS_BYTE[rs2]` |
-| `CPU-CR31` |  | `IS_BYTE[rd]` |
-| `CPU-CR32.i` | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` |
-| `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` |
-| `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` |
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU-CR2` |  | `IS_BIT<read_register1>` |  |
+| `CPU-CR3` |  | `IS_BIT<read_register2>` |  |
+| `CPU-CR4` |  | `IS_BIT<write_register>` |  |
+| `CPU-CR5` |  | `IS_BIT<memory_2bytes>` |  |
+| `CPU-CR6` |  | `IS_BIT<memory_4bytes>` |  |
+| `CPU-CR7` |  | `IS_BIT<memory_8bytes>` |  |
+| `CPU-CR8` |  | `IS_BIT<c_type_instruction>` |  |
+| `CPU-CR9` |  | `IS_BIT<signed>` |  |
+| `CPU-CR10` |  | `IS_BIT<mp_selector>` |  |
+| `CPU-CR11` |  | `IS_BIT<muldiv_selector>` |  |
+| `CPU-CR12` |  | `IS_BIT<word_instr>` |  |
+| `CPU-CR13` |  | `IS_BIT<ADD>` |  |
+| `CPU-CR14` |  | `IS_BIT<SUB>` |  |
+| `CPU-CR15` |  | `IS_BIT<SLT>` |  |
+| `CPU-CR16` |  | `IS_BIT<AND>` |  |
+| `CPU-CR17` |  | `IS_BIT<OR>` |  |
+| `CPU-CR18` |  | `IS_BIT<XOR>` |  |
+| `CPU-CR19` |  | `IS_BIT<SHIFT>` |  |
+| `CPU-CR20` |  | `IS_BIT<JALR>` |  |
+| `CPU-CR21` |  | `IS_BIT<BEQ>` |  |
+| `CPU-CR22` |  | `IS_BIT<BLT>` |  |
+| `CPU-CR23` |  | `IS_BIT<LOAD>` |  |
+| `CPU-CR24` |  | `IS_BIT<STORE>` |  |
+| `CPU-CR25` |  | `IS_BIT<MUL>` |  |
+| `CPU-CR26` |  | `IS_BIT<DIVREM>` |  |
+| `CPU-CR27` |  | `IS_BIT<ECALL>` |  |
+| `CPU-CR28` |  | `IS_BIT<EBREAK>` |  |
+| `CPU-CR29` |  | `IS_BYTE[rs1]` | 1 |
+| `CPU-CR30` |  | `IS_BYTE[rs2]` | 1 |
+| `CPU-CR31` |  | `IS_BYTE[rd]` | 1 |
+| `CPU-CR32.i` | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` | 1 |
+| `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` | 1 |
+| `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` | 1 |
 
 ## ALU
 
@@ -64,18 +64,19 @@ The ALU functionality is then obtained through judicious dispatching to the corr
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `CPU-CA35` |  | ADD + LOAD + STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `CPU-CA36` |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `CPU-CA37` |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
-| `CPU-CA38.i` | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
+| `CPU-CA35` |  | ADD + LOAD ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `CPU-CA36` |  | STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, imm>` |  |
+| `CPU-CA37` |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `CPU-CA38` |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
+| `CPU-CA39.i` | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
 | | | _polynomial:_ `(SLT + BLT) * res[i] = 0` | |
-| `CPU-CA39.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
-| `CPU-CA40.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
-| `CPU-CA41.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
-| `CPU-CA42` |  | `SHIFT[res::DWordHL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
-| `CPU-CA43` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
-| `CPU-CA44` |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
-| `CPU-CA45` |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
+| `CPU-CA40.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
+| `CPU-CA41.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
+| `CPU-CA42.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
+| `CPU-CA43` |  | `SHIFT[res::DWordHL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
+| `CPU-CA44` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
+| `CPU-CA45` |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
+| `CPU-CA46` |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
 
 ## Memory
 
@@ -83,16 +84,16 @@ The interactions with the memory, both for register loading and storing, as for
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `CPU-CM46` |  | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0, 1, 0, 0]` | read_register1 |
-| `CPU-CM47.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
+| `CPU-CM47` |  | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
+| `CPU-CM48.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
-| `CPU-CM48` |  | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1, 1, 0, 0]` | read_register2 |
-| `CPU-CM49.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
+| `CPU-CM49` |  | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
+| `CPU-CM50.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
-| `CPU-CM50` |  | `MEMW[1, 2 * rd, rvd, timestamp + 2, 1, 0, 0]` | write_register |
-| `CPU-CM51` |  | `LOAD[rvd; 0, res, timestamp + 0, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
-| `CPU-CM52` |  | `MEMW[0, res, rv2, timestamp + 1, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
-| `CPU-CM53` |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1, 1, 0, 0]` | 1 - pad |
+| `CPU-CM51` |  | `MEMW[1, 2 * rd, rvd, timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
+| `CPU-CM52` |  | `LOAD[rvd; 0, res, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
+| `CPU-CM53` |  | `MEMW[0, res, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
+| `CPU-CM54` |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1::DWordWL, 1, 0, 0]` | 1 - pad |
 
 ## System
 
@@ -100,9 +101,9 @@ The interactions with the wider system.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CS54` | `!EBREAK` |  |
+| `CPU-CS55` | `!EBREAK` |  |
 | | _polynomial:_ `1 - EBREAK = 0` | |
-| `CPU-CS55` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
+| `CPU-CS56` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
 
 ## Input and output to the ALU
 
@@ -110,22 +111,22 @@ We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, includ
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CE56` | (`rv1_sign_bit` or `arg2_sign_bit` or `res_sign_bit`) => `word_instr` |  |
+| `CPU-CE57` | (`rv1_sign_bit` or `arg2_sign_bit` or `res_sign_bit`) => `word_instr` |  |
 | | _polynomial:_ `(rv1_sign_bit + arg2_sign_bit + res_sign_bit) * (1 - word_instr) = 0` | |
-| `CPU-CE57` | `MSB16[rv1_sign_bit; rv1[1]]` | word_instr |
-| `CPU-CE58` | `arg1[:4]` = `rv1[:2]` |  |
+| `CPU-CE58` | `MSB16[rv1_sign_bit; rv1[1]]` | word_instr |
+| `CPU-CE59` | `arg1[:4]` = `rv1[:2]` |  |
 | | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` | |
-| `CPU-CE59` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_sign_bit` dot `signed` |  |
+| `CPU-CE60` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_sign_bit` dot `signed` |  |
 | | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_sign_bit * (2^32 - 1) = 0` | |
-| `CPU-CE60` | `MSB16[arg2_sign_bit; rv2[1]]` | word_instr |
-| `CPU-CE61` | `arg2[:4]` = (1 - `STORE` - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT`) dot `imm[0]` |  |
-| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - STORE - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT) * imm[0] = 0` | |
-| `CPU-CE62` | `arg2[4:]` = (1 - `STORE` - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `arg2_sign_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT`) dot `imm[1]` |  |
-| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - STORE - LOAD) * (1 - word_instr) * rv2[2] - (1 - STORE - LOAD) * signed * arg2_sign_bit * (2^32 - 1) - (1 - BEQ - BLT) * imm[1] = 0` | |
-| `CPU-CE63` | `MSB8[res_sign_bit; res[3]]` | word_instr |
-| `CPU-CE64` | `!LOAD` => `rvd[0]` = `res[:4]` |  |
+| `CPU-CE61` | `MSB16[arg2_sign_bit; rv2[1]]` | word_instr |
+| `CPU-CE62` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |  |
+| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` | |
+| `CPU-CE63` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `arg2_sign_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |  |
+| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * arg2_sign_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` | |
+| `CPU-CE64` | `MSB8[res_sign_bit; res[3]]` | word_instr |
+| `CPU-CE65` | `!LOAD` => `rvd[0]` = `res[:4]` |  |
 | | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` | |
-| `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
+| `CPU-CE66` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
 | | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_sign_bit * (2^32 - 1)) = 0` | |
 
 ## Other constraints
@@ -134,11 +135,11 @@ For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference betw
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CO66` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
-| `CPU-CO67` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
+| `CPU-CO67` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
+| `CPU-CO68` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
 | | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| `CPU-CO68` | `BRANCH[next_pc; pc, imm[0], arg1::DWordWL, JALR]` | branch_cond |
-| `CPU-CO69` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
+| `CPU-CO69` | `BRANCH[next_pc; pc, imm[0], arg1::DWordWL, JALR]` | branch_cond |
+| `CPU-CO70` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
 
 > **Note:** Document the choice to not have a multiplicity column here for padding
 
diff --git a/docs/spec/dvrm.md b/docs/spec/dvrm.md
index 5c846f3cb..56e6b3257 100644
--- a/docs/spec/dvrm.md
+++ b/docs/spec/dvrm.md
@@ -163,20 +163,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
 | `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
 
-### equality
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
-| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
-| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
-
-### sign_equality
+### defs
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
-| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
 
 ### n_sub_r
 
@@ -187,6 +180,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
 | `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
 
+### sign_equality
+
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
+| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
+
 ### div_by_zero
 
 | Tag | Range | Description | Multiplicity |
@@ -195,6 +195,14 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
 | `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
 
+### equality
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
+| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
+| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+
 ### abs_diff
 
 | Tag | Range | Description | Multiplicity |
@@ -205,12 +213,4 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
 | `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
 | `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
-| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
-
-### defs
-
-| Tag | Description |
-|-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
\ No newline at end of file
+| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
\ No newline at end of file
diff --git a/docs/spec/memory.md b/docs/spec/memory.md
index d5517248c..be95ed1e6 100644
--- a/docs/spec/memory.md
+++ b/docs/spec/memory.md
@@ -86,4 +86,4 @@ The initial and final state of registers can be entirely known by the verifier,
 
 = Future topics of interest
 
-- Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research)
\ No newline at end of file
+- Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research); double-check whether `IS_BYTE` constraints are needed for fini
\ No newline at end of file
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
index 520abffa4..b6223086d 100644
--- a/docs/spec/memw.md
+++ b/docs/spec/memw.md
@@ -15,14 +15,16 @@ Our assumptions do not explicitly cover any range checks for the `is_register` a
 | `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
 | `MEMW-C2` |  | `w2` => `μ_sum` |  |
 | | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C3` |  | `ADD<address_add[0]::DWordWL; base_address, 1>` | w2 |
-| `MEMW-C4.i` | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
-| `MEMW-C5.i` | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
-| `MEMW-C6.i` | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
-| `MEMW-C7` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `MEMW-C8` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `MEMW-C9.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `MEMW-C10.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+| `MEMW-C3` |  | w2 ⇒ `ADD<address_add[0]::DWordWL; base_address, 1>` |  |
+| `MEMW-C4.i` | i ∈ [1, 2] | w4 ⇒ `ADD<address_add[i]::DWordWL; base_address, i + 1>` |  |
+| `MEMW-C5.i` | i ∈ [3, 6] | write8 ⇒ `ADD<address_add[i]::DWordWL; base_address, i + 1>` |  |
+| `MEMW-C6.i` | i ∈ [0, 0], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | w2 |
+| `MEMW-C7.i` | i ∈ [1, 2], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | w4 |
+| `MEMW-C8.i` | i ∈ [3, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | write8 |
+| `MEMW-C9` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `MEMW-C10` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `MEMW-C11.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `MEMW-C12.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
 
 As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
 
@@ -30,29 +32,29 @@ We additionally check that the address does not overflow for more significant by
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CR11` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
-| `MEMW-CR12` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
-| `MEMW-CR13` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
+| `MEMW-CR13` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
+| `MEMW-CR14` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
+| `MEMW-CR15` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
 
 The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-CM14` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `MEMW-CM15` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM16` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM17` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `MEMW-CM18.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM19.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `MEMW-CM20.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM21.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+| `MEMW-CM16` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM17` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM18` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM19` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM20.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM21.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM22.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM23.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
 
 This chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CO22` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
-| `MEMW-CO23` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+| `MEMW-CO24` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
+| `MEMW-CO25` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
 
 = Future optimization ideas
 
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index d3eb2c93e..6d9f79290 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -88,7 +88,7 @@ The initial and final state of registers can be entirely known by the verifier,
 
 = Future topics of interest
 
-- Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research)
+- Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research); double-check whether `IS_BYTE` constraints are needed for fini
 
 ---
 
@@ -440,9 +440,9 @@ The `CPU` chip is comprised of  variables that are expressed using  columns:
 
 = Constraints First, we perform a decoding lookup for the current PC.
 
-| Tag | Description |
-|-----|-------------|
-| `CPU-C1` | `DECODE[pc, imm, packed_decode]` |
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-C1` | `DECODE[pc, imm, packed_decode]` | 1 |
 
 > **Note:** All casts for interactions will have to be reviewed once other chip interfaces stabilise
 
@@ -454,41 +454,41 @@ The `CPU` chip is comprised of  variables that are expressed using  columns:
 
 We constrain all columns to have the appropriate ranges. The flags and register indices looked up from the decoding need to be checked, as they are communicated through the interaction in a packed form. In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`. Similarly, since `next_pc` will propagate through the memory argument and be looked up in the instruction decoding on the next cycle, it is forced to be in the correct range. For the auxiliary columns, we need to check the limbs of `arg1`, `arg2`, and `res`. The ranges of the other auxiliary columns are enforced through later constraints.
 
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `CPU-CR2` |  | `IS_BIT<read_register1>` |
-| `CPU-CR3` |  | `IS_BIT<read_register2>` |
-| `CPU-CR4` |  | `IS_BIT<write_register>` |
-| `CPU-CR5` |  | `IS_BIT<memory_2bytes>` |
-| `CPU-CR6` |  | `IS_BIT<memory_4bytes>` |
-| `CPU-CR7` |  | `IS_BIT<memory_8bytes>` |
-| `CPU-CR8` |  | `IS_BIT<c_type_instruction>` |
-| `CPU-CR9` |  | `IS_BIT<signed>` |
-| `CPU-CR10` |  | `IS_BIT<mp_selector>` |
-| `CPU-CR11` |  | `IS_BIT<muldiv_selector>` |
-| `CPU-CR12` |  | `IS_BIT<word_instr>` |
-| `CPU-CR13` |  | `IS_BIT<ADD>` |
-| `CPU-CR14` |  | `IS_BIT<SUB>` |
-| `CPU-CR15` |  | `IS_BIT<SLT>` |
-| `CPU-CR16` |  | `IS_BIT<AND>` |
-| `CPU-CR17` |  | `IS_BIT<OR>` |
-| `CPU-CR18` |  | `IS_BIT<XOR>` |
-| `CPU-CR19` |  | `IS_BIT<SHIFT>` |
-| `CPU-CR20` |  | `IS_BIT<JALR>` |
-| `CPU-CR21` |  | `IS_BIT<BEQ>` |
-| `CPU-CR22` |  | `IS_BIT<BLT>` |
-| `CPU-CR23` |  | `IS_BIT<LOAD>` |
-| `CPU-CR24` |  | `IS_BIT<STORE>` |
-| `CPU-CR25` |  | `IS_BIT<MUL>` |
-| `CPU-CR26` |  | `IS_BIT<DIVREM>` |
-| `CPU-CR27` |  | `IS_BIT<ECALL>` |
-| `CPU-CR28` |  | `IS_BIT<EBREAK>` |
-| `CPU-CR29` |  | `IS_BYTE[rs1]` |
-| `CPU-CR30` |  | `IS_BYTE[rs2]` |
-| `CPU-CR31` |  | `IS_BYTE[rd]` |
-| `CPU-CR32.i` | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` |
-| `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` |
-| `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` |
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU-CR2` |  | `IS_BIT<read_register1>` |  |
+| `CPU-CR3` |  | `IS_BIT<read_register2>` |  |
+| `CPU-CR4` |  | `IS_BIT<write_register>` |  |
+| `CPU-CR5` |  | `IS_BIT<memory_2bytes>` |  |
+| `CPU-CR6` |  | `IS_BIT<memory_4bytes>` |  |
+| `CPU-CR7` |  | `IS_BIT<memory_8bytes>` |  |
+| `CPU-CR8` |  | `IS_BIT<c_type_instruction>` |  |
+| `CPU-CR9` |  | `IS_BIT<signed>` |  |
+| `CPU-CR10` |  | `IS_BIT<mp_selector>` |  |
+| `CPU-CR11` |  | `IS_BIT<muldiv_selector>` |  |
+| `CPU-CR12` |  | `IS_BIT<word_instr>` |  |
+| `CPU-CR13` |  | `IS_BIT<ADD>` |  |
+| `CPU-CR14` |  | `IS_BIT<SUB>` |  |
+| `CPU-CR15` |  | `IS_BIT<SLT>` |  |
+| `CPU-CR16` |  | `IS_BIT<AND>` |  |
+| `CPU-CR17` |  | `IS_BIT<OR>` |  |
+| `CPU-CR18` |  | `IS_BIT<XOR>` |  |
+| `CPU-CR19` |  | `IS_BIT<SHIFT>` |  |
+| `CPU-CR20` |  | `IS_BIT<JALR>` |  |
+| `CPU-CR21` |  | `IS_BIT<BEQ>` |  |
+| `CPU-CR22` |  | `IS_BIT<BLT>` |  |
+| `CPU-CR23` |  | `IS_BIT<LOAD>` |  |
+| `CPU-CR24` |  | `IS_BIT<STORE>` |  |
+| `CPU-CR25` |  | `IS_BIT<MUL>` |  |
+| `CPU-CR26` |  | `IS_BIT<DIVREM>` |  |
+| `CPU-CR27` |  | `IS_BIT<ECALL>` |  |
+| `CPU-CR28` |  | `IS_BIT<EBREAK>` |  |
+| `CPU-CR29` |  | `IS_BYTE[rs1]` | 1 |
+| `CPU-CR30` |  | `IS_BYTE[rs2]` | 1 |
+| `CPU-CR31` |  | `IS_BYTE[rd]` | 1 |
+| `CPU-CR32.i` | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` | 1 |
+| `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` | 1 |
+| `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` | 1 |
 
 ## ALU
 
@@ -496,18 +496,19 @@ The ALU functionality is then obtained through judicious dispatching to the corr
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `CPU-CA35` |  | ADD + LOAD + STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `CPU-CA36` |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `CPU-CA37` |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
-| `CPU-CA38.i` | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
+| `CPU-CA35` |  | ADD + LOAD ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `CPU-CA36` |  | STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, imm>` |  |
+| `CPU-CA37` |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `CPU-CA38` |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
+| `CPU-CA39.i` | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
 | | | _polynomial:_ `(SLT + BLT) * res[i] = 0` | |
-| `CPU-CA39.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
-| `CPU-CA40.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
-| `CPU-CA41.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
-| `CPU-CA42` |  | `SHIFT[res::DWordHL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
-| `CPU-CA43` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
-| `CPU-CA44` |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
-| `CPU-CA45` |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
+| `CPU-CA40.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
+| `CPU-CA41.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
+| `CPU-CA42.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
+| `CPU-CA43` |  | `SHIFT[res::DWordHL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
+| `CPU-CA44` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
+| `CPU-CA45` |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
+| `CPU-CA46` |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
 
 ## Memory
 
@@ -515,16 +516,16 @@ The interactions with the memory, both for register loading and storing, as for
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `CPU-CM46` |  | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0, 1, 0, 0]` | read_register1 |
-| `CPU-CM47.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
+| `CPU-CM47` |  | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
+| `CPU-CM48.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
-| `CPU-CM48` |  | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1, 1, 0, 0]` | read_register2 |
-| `CPU-CM49.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
+| `CPU-CM49` |  | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
+| `CPU-CM50.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
-| `CPU-CM50` |  | `MEMW[1, 2 * rd, rvd, timestamp + 2, 1, 0, 0]` | write_register |
-| `CPU-CM51` |  | `LOAD[rvd; 0, res, timestamp + 0, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
-| `CPU-CM52` |  | `MEMW[0, res, rv2, timestamp + 1, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
-| `CPU-CM53` |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1, 1, 0, 0]` | 1 - pad |
+| `CPU-CM51` |  | `MEMW[1, 2 * rd, rvd, timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
+| `CPU-CM52` |  | `LOAD[rvd; 0, res, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
+| `CPU-CM53` |  | `MEMW[0, res, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
+| `CPU-CM54` |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1::DWordWL, 1, 0, 0]` | 1 - pad |
 
 ## System
 
@@ -532,9 +533,9 @@ The interactions with the wider system.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CS54` | `!EBREAK` |  |
+| `CPU-CS55` | `!EBREAK` |  |
 | | _polynomial:_ `1 - EBREAK = 0` | |
-| `CPU-CS55` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
+| `CPU-CS56` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
 
 ## Input and output to the ALU
 
@@ -542,22 +543,22 @@ We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, includ
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CE56` | (`rv1_sign_bit` or `arg2_sign_bit` or `res_sign_bit`) => `word_instr` |  |
+| `CPU-CE57` | (`rv1_sign_bit` or `arg2_sign_bit` or `res_sign_bit`) => `word_instr` |  |
 | | _polynomial:_ `(rv1_sign_bit + arg2_sign_bit + res_sign_bit) * (1 - word_instr) = 0` | |
-| `CPU-CE57` | `MSB16[rv1_sign_bit; rv1[1]]` | word_instr |
-| `CPU-CE58` | `arg1[:4]` = `rv1[:2]` |  |
+| `CPU-CE58` | `MSB16[rv1_sign_bit; rv1[1]]` | word_instr |
+| `CPU-CE59` | `arg1[:4]` = `rv1[:2]` |  |
 | | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` | |
-| `CPU-CE59` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_sign_bit` dot `signed` |  |
+| `CPU-CE60` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_sign_bit` dot `signed` |  |
 | | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_sign_bit * (2^32 - 1) = 0` | |
-| `CPU-CE60` | `MSB16[arg2_sign_bit; rv2[1]]` | word_instr |
-| `CPU-CE61` | `arg2[:4]` = (1 - `STORE` - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT`) dot `imm[0]` |  |
-| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - STORE - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT) * imm[0] = 0` | |
-| `CPU-CE62` | `arg2[4:]` = (1 - `STORE` - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `arg2_sign_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT`) dot `imm[1]` |  |
-| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - STORE - LOAD) * (1 - word_instr) * rv2[2] - (1 - STORE - LOAD) * signed * arg2_sign_bit * (2^32 - 1) - (1 - BEQ - BLT) * imm[1] = 0` | |
-| `CPU-CE63` | `MSB8[res_sign_bit; res[3]]` | word_instr |
-| `CPU-CE64` | `!LOAD` => `rvd[0]` = `res[:4]` |  |
+| `CPU-CE61` | `MSB16[arg2_sign_bit; rv2[1]]` | word_instr |
+| `CPU-CE62` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |  |
+| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` | |
+| `CPU-CE63` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `arg2_sign_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |  |
+| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * arg2_sign_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` | |
+| `CPU-CE64` | `MSB8[res_sign_bit; res[3]]` | word_instr |
+| `CPU-CE65` | `!LOAD` => `rvd[0]` = `res[:4]` |  |
 | | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` | |
-| `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
+| `CPU-CE66` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
 | | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_sign_bit * (2^32 - 1)) = 0` | |
 
 ## Other constraints
@@ -566,11 +567,11 @@ For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference betw
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CO66` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
-| `CPU-CO67` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
+| `CPU-CO67` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
+| `CPU-CO68` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
 | | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| `CPU-CO68` | `BRANCH[next_pc; pc, imm[0], arg1::DWordWL, JALR]` | branch_cond |
-| `CPU-CO69` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction))::DWordWL>` |  |
+| `CPU-CO69` | `BRANCH[next_pc; pc, imm[0], arg1::DWordWL, JALR]` | branch_cond |
+| `CPU-CO70` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
 
 > **Note:** Document the choice to not have a multiplicity column here for padding
 
@@ -890,7 +891,7 @@ The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed impli
 | `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
 | `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
 | `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
-| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte[0], 254]` | μ |
+| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte, 254]` | μ |
 | `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
 
 This chip contributes the following to the lookup argument.
@@ -910,7 +911,7 @@ The table can be padded to the next power of two with the following value assign
 | Name | Type | Description |
 |------|------|-------------|
 | `pc` | `DWordWL` | The current pc, used as base address when `!JALR` |
-| `offset` | `Word` | The offset from the base address to jump to |
+| `offset` | `DWordWL` | The offset from the base address to jump to |
 | `register` | `DWordWL` | The base address to use when `JALR` |
 | `JALR` | `Bit` | Selects between `pc` and `register` as base address, needed for the `JALR` instruction |
 
@@ -936,7 +937,7 @@ The table can be padded to the next power of two with the following value assign
 
 **Definition of `next_pc_unmasked`:**
 ```
-next_pc_unmasked (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + unmasked_low_byte[0]
+next_pc_unmasked (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + unmasked_low_byte
 next_pc_unmasked (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 ```
 
@@ -980,14 +981,16 @@ Our assumptions do not explicitly cover any range checks for the `is_register` a
 | `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
 | `MEMW-C2` |  | `w2` => `μ_sum` |  |
 | | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C3` |  | `ADD<address_add[0]::DWordWL; base_address, 1>` | w2 |
-| `MEMW-C4.i` | i ∈ [1, 2] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | w4 |
-| `MEMW-C5.i` | i ∈ [3, 6] | `ADD<address_add[i]::DWordWL; base_address, i + 1>` | write8 |
-| `MEMW-C6.i` | i ∈ [0, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` |  |
-| `MEMW-C7` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `MEMW-C8` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `MEMW-C9.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `MEMW-C10.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+| `MEMW-C3` |  | w2 ⇒ `ADD<address_add[0]::DWordWL; base_address, 1>` |  |
+| `MEMW-C4.i` | i ∈ [1, 2] | w4 ⇒ `ADD<address_add[i]::DWordWL; base_address, i + 1>` |  |
+| `MEMW-C5.i` | i ∈ [3, 6] | write8 ⇒ `ADD<address_add[i]::DWordWL; base_address, i + 1>` |  |
+| `MEMW-C6.i` | i ∈ [0, 0], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | w2 |
+| `MEMW-C7.i` | i ∈ [1, 2], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | w4 |
+| `MEMW-C8.i` | i ∈ [3, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | write8 |
+| `MEMW-C9` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `MEMW-C10` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `MEMW-C11.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `MEMW-C12.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
 
 As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
 
@@ -995,29 +998,29 @@ We additionally check that the address does not overflow for more significant by
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CR11` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
-| `MEMW-CR12` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
-| `MEMW-CR13` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
+| `MEMW-CR13` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
+| `MEMW-CR14` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
+| `MEMW-CR15` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
 
 The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-CM14` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `MEMW-CM15` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM16` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM17` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `MEMW-CM18.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM19.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `MEMW-CM20.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM21.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+| `MEMW-CM16` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM17` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM18` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM19` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM20.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM21.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM22.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM23.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
 
 This chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CO22` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
-| `MEMW-CO23` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+| `MEMW-CO24` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
+| `MEMW-CO25` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
 
 = Future optimization ideas
 
@@ -1505,20 +1508,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
 | `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
 
-### equality
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
-| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
-| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
-
-### sign_equality
+### defs
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
-| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
 
 ### n_sub_r
 
@@ -1529,6 +1525,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
 | `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
 
+### sign_equality
+
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
+| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
+
 ### div_by_zero
 
 | Tag | Range | Description | Multiplicity |
@@ -1537,6 +1540,14 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
 | `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
 
+### equality
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
+| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
+| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+
 ### abs_diff
 
 | Tag | Range | Description | Multiplicity |
@@ -1549,14 +1560,6 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
 | | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
-### defs
-
-| Tag | Description |
-|-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
-
 ---
 
 # LOAD Chip

From 9deb62f2aa487da75e8f10e75d5d400d2a33a18f Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Wed, 11 Feb 2026 17:43:24 -0300
Subject: [PATCH 066/105] update script

---
 scripts/extract_and_convert_spec.sh | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/scripts/extract_and_convert_spec.sh b/scripts/extract_and_convert_spec.sh
index be8b1859b..ee3ac62de 100755
--- a/scripts/extract_and_convert_spec.sh
+++ b/scripts/extract_and_convert_spec.sh
@@ -1,41 +1,43 @@
 #!/bin/bash
-# Extract spec TOML files from spec/main branch and convert to Markdown
+# Extract spec TOML files from a spec branch and convert to Markdown
 #
 # Usage:
-#   ./scripts/extract_and_convert_spec.sh [output_dir]
+#   ./scripts/extract_and_convert_spec.sh [branch] [output_dir]
 #
+# Default branch: origin/spec/main
 # Default output directory: docs/spec
 
 set -e
 
 SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
-OUTPUT_DIR="${1:-$REPO_ROOT/docs/spec}"
+BRANCH="${1:-origin/spec/main}"
+OUTPUT_DIR="${2:-$REPO_ROOT/docs/spec}"
 TEMP_DIR=$(mktemp -d)
 
-echo "Extracting spec files from origin/spec/main..."
+echo "Extracting spec files from $BRANCH..."
 
 # Create temp directory structure
 mkdir -p "$TEMP_DIR/src"
 
 # Extract config
-git show origin/spec/main:spec/src/config.toml > "$TEMP_DIR/src/config.toml" 2>/dev/null || {
-    echo "Error: Could not find spec/src/config.toml in origin/spec/main"
-    echo "Make sure to fetch the branch: git fetch origin spec/main"
+git show "$BRANCH:spec/src/config.toml" > "$TEMP_DIR/src/config.toml" 2>/dev/null || {
+    echo "Error: Could not find spec/src/config.toml in $BRANCH"
+    echo "Make sure to fetch the branch: git fetch origin <branch-name>"
     rm -rf "$TEMP_DIR"
     exit 1
 }
 
 # Extract all chip TOML files
-for file in $(git ls-tree -r origin/spec/main --name-only | grep '^spec/src/.*\.toml$' | grep -v config.toml | grep -v page.toml); do
+for file in $(git ls-tree -r "$BRANCH" --name-only | grep '^spec/src/.*\.toml$' | grep -v config.toml | grep -v page.toml); do
     filename=$(basename "$file")
-    git show "origin/spec/main:$file" > "$TEMP_DIR/src/$filename" 2>/dev/null || true
+    git show "$BRANCH:$file" > "$TEMP_DIR/src/$filename" 2>/dev/null || true
 done
 
 # Extract all Typst (.typ) files
-for file in $(git ls-tree -r origin/spec/main --name-only | grep '^spec/.*\.typ$'); do
+for file in $(git ls-tree -r "$BRANCH" --name-only | grep '^spec/.*\.typ$'); do
     filename=$(basename "$file")
-    git show "origin/spec/main:$file" > "$TEMP_DIR/$filename" 2>/dev/null || true
+    git show "$BRANCH:$file" > "$TEMP_DIR/$filename" 2>/dev/null || true
 done
 
 # List extracted files

From fd662fe9777415d33046b4b1d107e184d40dc26c Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Wed, 11 Feb 2026 17:43:43 -0300
Subject: [PATCH 067/105] update

---
 docs/spec/dvrm.md      | 30 +++++++++++++++---------------
 docs/spec/spec_full.md | 28 ++++++++++++++--------------
 2 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/docs/spec/dvrm.md b/docs/spec/dvrm.md
index 56e6b3257..1ce0fc969 100644
--- a/docs/spec/dvrm.md
+++ b/docs/spec/dvrm.md
@@ -156,20 +156,12 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 
 ## Constraints
 
-### output
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
-| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
-
-### defs
+### sign_equality
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
+| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
+| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
 
 ### n_sub_r
 
@@ -180,12 +172,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
 | `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
 
-### sign_equality
+### defs
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
-| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
 
 ### div_by_zero
 
@@ -213,4 +206,11 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
 | `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
 | `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
-| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
\ No newline at end of file
+| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
+
+### output
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
+| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
\ No newline at end of file
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index 6d9f79290..42836b3fa 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -1501,20 +1501,12 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 
 ## Constraints
 
-### output
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
-| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
-
-### defs
+### sign_equality
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
+| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
+| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
 
 ### n_sub_r
 
@@ -1525,12 +1517,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
 | `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
 
-### sign_equality
+### defs
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
-| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
 
 ### div_by_zero
 
@@ -1560,6 +1553,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
 | | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
+### output
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
+| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+
 ---
 
 # LOAD Chip

From 172cf3e12ce5f7a4de43fc29c09f6a9d3a14e1f1 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Thu, 12 Feb 2026 10:39:11 +0100
Subject: [PATCH 068/105] spec: `COMMIT` chip (#283)

* spec: update footnote numbering

* spec: COMMIT: specify commit chip

* spec: COMMIT: fix typos

* Move footnote numbering to a more general spot and allow easy future updates

* Update common-formatting location

* spec: COMMIT: update citation links

* spec: COMMIT: deal with committing 0 bytes

* spec: COMMIT: list future improvement

* Fix typos

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>

* spec: COMMIT: rearrange CNB multiplicity

* spec: COMMIT: update padding strategy
permitting ADD and SUB constraints of lower degree

* spec: COMMIT: list two possible optimizations

---------

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>
---
 spec/book.typ        |   8 +-
 spec/ebook.typ       |   3 +-
 spec/ecall.typ       | 115 +++++++++++++++++++++-
 spec/src/commit.toml | 221 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 342 insertions(+), 5 deletions(-)
 create mode 100644 spec/src/commit.toml

diff --git a/spec/book.typ b/spec/book.typ
index 076d31cf3..7a55323cd 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -33,6 +33,11 @@
   summary: meta.summary.map(((ch, title, _ref)) => chapter(ch, title)).join()
 )
 
+#let common-formatting(body) = {
+  set footnote(numbering: "[1]")
+  body
+}
+
 
 #let todo(background: white, foreground: black, name: none, body) = block(fill: background, outset: 0.5em, radius: 20%, stroke: black)[
   #set text(fill: foreground)
@@ -134,6 +139,7 @@
   assert(meta.summary.find(((f, _, _)) => f == file) != none, message: "Couldn't resolve typst source file " + file)
   if is-shiroa {
     (body) => {
+      show: common-formatting
       context _xref-included.update(x => x + ((file): true))
       context _toplevel.update(s => {
         if s == none {
@@ -151,6 +157,6 @@
       ])
     }
   } else {
-    (body) => body
+    body => body
   }
 }
diff --git a/spec/ebook.typ b/spec/ebook.typ
index f9ba76046..e1b15253e 100644
--- a/spec/ebook.typ
+++ b/spec/ebook.typ
@@ -1,4 +1,4 @@
-#import "/book.typ": style, meta
+#import "/book.typ": style, meta, common-formatting
 
 #set document(author: meta.authors, title: meta.title)
 
@@ -10,6 +10,7 @@
 #pagebreak(weak: true)
 #outline()
 
+#show: common-formatting
 #show heading: set heading(numbering: "1.1")
 
 #meta.summary.map(((ch, title, ref)) => [
diff --git a/spec/ecall.typ b/spec/ecall.typ
index 6908f768b..dd3544ef3 100644
--- a/spec/ecall.typ
+++ b/spec/ecall.typ
@@ -12,6 +12,15 @@
 #let config = load_config()
 
 #show: book-page("ecall.typ")
+= About `ECALL`
+When `ECALL` is executed, it is assumed that:
+- register `A7` contains the system call number
+  #footnote([The RISC-V system call ABI; libriscv.no, #link("https://web.archive.org/web/20260128152107/https://libriscv.no/docs/concepts/syscalls/#the-risc-v-system-call-abi")[[src]]]),
+- the arguments are located in registers `A0`-`A6`, and
+- the return value is written to `A0`,
+where `A0`-`A7` are symbolic names for the registers `x10`-`x17`
+#footnote([RISC-V - Register sets; en.wikipedia.org, #link("https://web.archive.org/web/20260209053447/https://en.wikipedia.org/wiki/RISC-V#Register_sets")[[src]]]).
+
 
 #let config = load_config()
 #let chip = load_chip("src/halt.toml", config)
@@ -45,13 +54,113 @@ This prevents any other operation involving memory from being executed hereafter
 ])
 
 === Lookup
-The HALT chip contributes the following interaction to the lookup-argument:
+In this VM, halting is considered equivalent to executing a `sys_exit`.
+Hence, this chip responds to `ECALL`s with system call number 93.
+#footnote([RISC-V GNU-toolchain, `unistd.h`; version 2026-01-23, #link("https://github.com/riscv-collab/riscv-gnu-toolchain/blob/2026.01.23/linux-headers/include/asm-generic/unistd.h#L258")[[src]]])
+The HALT chip therefore contributes the following interaction to the lookup-argument:
 #render_constraint_table(chip, config, groups: "lookup")
 
-*Note*: #link("https://github.com/riscv-collab/riscv-gnu-toolchain/blob/master/linux-headers/include/asm-generic/unistd.h#L258")[$93$ is the system call number corresponding to `sys_exit`.]
-
 == Padding
 This chip should only contain a single row.
 Given that $2^0 = 1$, this chip does not need to be padded.
 As such, no padding is defined.
 
+
+#let config = load_config()
+#let chip = load_chip("src/commit.toml", config)
+#let commit = raw(chip.name)
+= #commit chip
+
+== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+
+The #commit chip leverages #nr_variables variables, spanning #nr_columns columns:
+#render_chip_column_table(chip, config)
+
+== Constraints
+In this VM, committing is considered equivalent to writing a value to `stdout`.
+Hence, this chip responds to `ECALL`s with system call number 64.
+#footnote([RISC-V GNU-toolchain, `unistd.h`; version 2026-01-23, #link("https://github.com/riscv-collab/riscv-gnu-toolchain/blob/2026.01.23/linux-headers/include/asm-generic/unistd.h#L174")[[src]]])
+Since we do not know how many bytes are to be committed, this chip employs a recursive design:
+each iteration commits one byte, and recursively "calls" itself to commit the remaining bytes.
+As such, only the call from the CPU to this chip (i.e., the `first` call in the recursion tree) should accept the `ECALL`; later recursive calls should not.
+This is why @commit:c:receive_ecall has multiplicity $-#`first`$.
+#render_constraint_table(chip, config, groups: "incoming")
+
+The `write` operation --- writing to a file descriptor --- has the following signature:
+#footnote([Linux man-page on `write`; man7.org, version 6.16, 2025-10-29. #link("https://man7.org/linux/man-pages/man2/write.2.html")[[src]]])
+#[
+#show raw.where(block: true): it => block(it, fill: luma(230), inset: 1em, width: 100%, radius: 5pt)
+```c
+ssize_t write(size_t count; int fd, const void buf[count], size_t count);
+```
+]
+That is to say,
+- `A0` contains the file descriptor,
+- `A1` contains the address of `buf`'s first byte, 
+- `A2` contains `count`, and
+- the written count should be written to `A0`.
+
+@commit:c:read_address reads `address` from `x11` (=`A1`) and @commit:c:read_count reads `count` from `x12` (=`A2`).
+Since we only support writing to `stdout` (which corresponds to $#`fd` = 1$
+#footnote([The Open Group Standard for Information Technology --- Portable Operating System Interface (POSIX) Base Specifications, `unistd.h`; The Open Group, issue 8, #link("https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/unistd.h.html")[[src]]]))
+we assert that `x10` contains $1$ in @commit:c:read_fd_write_count.
+Note that this constraint _also_ writes `count` to `A0`; 
+in this VM it is impossible for a commit to be interrupted or fail.
+Lastly, the `index` is read from `x254`#footnote([In this VM, register 254 is reserved for containing the commitment index.]); in the same operation, $#`index` + #`count`$ is written back to this location by @commit:c:read_index.
+This, too, relies on the fact that a commit cannot be interrupted or fail: the `index` for the next commit sequence can therefore be updated up front.
+Again, each of these memory interactions only takes place when this is the `first` call in the recursion tree.
+
+#render_constraint_table(chip, config, groups: "read_input")
+
+*Note*: the observant reader will notice that @commit:c:read_index casts `count` to a `BaseField`, potentially losing information.
+This is indeed correct.
+However, since it is practically impossible to commit more than $2^64-2^32$ bytes in a single VM execution, it was decided to permit this.
+
+Next, we read the `value` located at buffer address `address` and commit to it under the given `index`.
+This is only performed when we have not yet reached the `end` of the commit sequence.
+#render_constraint_table(chip, config, groups: "commit")
+
+In parallel, we compute $#`address_incr` = #`address` + 1$ (@commit:c:address_incr) as the address of the next byte to commit, and $#`count_decr` = #`count` - 1$ (@commit:c:count_decr) as the number of bytes that still have to be committed after committing this byte.
+@commit:c:range_address_incr and @commit:c:range_count_decr are included to satisfy @add:a:sum and @add:a:rhs, respectively.
+#render_constraint_table(chip, config, groups: "incr_decr")
+
+When `count` hits $0$, we should stop performing further recursive calls.
+We use the `end` bit to indicate these circumstances.
+
+#render_constraint_table(chip, config, groups: "end")
+
+*Note*: 
++ Rather than setting $#`end` = 1$ when $#`count` = 0$, we do so when $#`count_decr` = -1$.
+  This technique allows `count` to be stored in a `DWordWL` rather than a `DWordHL`, saving two columns.
++ $forall i in [0, 3]: 65535 - #`count_decr`_i >= 0$ as a result of @commit:c:range_count_decr.
+ Hence, 
+  $
+  sum_(i=0)^3 (65535 - #`count_decr`_i) = 0 arrow.l.r.double.long forall i in [0, 3]: #`count_decr`_i = 65535
+  $
+
+When this was not the `end` byte to commit in this recursion sequence, we recursively _Commit the Next Byte_ (`CNB`), specifying the timestamp, address to continue reading and the number of bytes that should still be committed (@commit:c:send_commit_next_byte).
+Since that certainly won't be the `first` call in the sequence, we read `address_incr` and `count_decr` from the previous recursion level into `address` and `count` and continue executing the commit.
+#render_constraint_table(chip, config, groups: "lookups")
+
+Lastly, we must make sure `first`, `end` and `μ` are bits (@commit:c:range_first, @commit:c:range_end, @commit:c:range_mu), and that either $#`first` = 1$ or $#`end` = 1$ implies $#`μ` = 1$ (@commit:c:first_or_end_implies_mu).
+These are required to ensure the multiplicities $-(#`μ` - #`first`)$ and $#`μ` - #`end`$ are binary.
+#render_constraint_table(chip, config, groups: "bits")
+
+== Padding
+To pad this chip, use the below data.
+#render_chip_padding_table(chip, config)
+
+== Notes/optimizations
+- The current version only supports writing to `stdout`.
+  This chip could potentially be extended to support writing to arbitrary `fd`s.
+- One might be able to replace @commit:c:end by `end => count = 0`.
+  While loosening the constraint (`count = 0 => end` is no longer enforced), this should not cause any problems:
+  if the prover does not set `end` when `count=0`, they simply cannot complete the proof.
+  First of all, one would have to recursively work through all $2^64$ values of `count`, something that is practically infeasible.
+  Moreover, if this is done with a sequence that originally has $#`count` > 0$, one will inevitably have to read a memory address twice at the same timestamp, which is impossible to prove.
+  In addition to dropping the `ZERO` lookup, this optimization might also permit moving `count_decr` from a `DWordHL` to a `DWordWL`, saving two columns.
+- Given that it is practically infeasible to commit more than $#`p`-1 = 2^64-2^32$ bytes in a program, it might suffice to store `count_decr` in a `BaseField`.
+  Note that this would probably involve having an extra (virtual) column storing `count` in `BaseField` form as well.
+  Moreover, one might need to add a lookup to `LT` to ensure $#`count` <= #`p`-1$ when being read from memory at the beginning of each commitment sequence.
diff --git a/spec/src/commit.toml b/spec/src/commit.toml
new file mode 100644
index 000000000..5d8325363
--- /dev/null
+++ b/spec/src/commit.toml
@@ -0,0 +1,221 @@
+name = "COMMIT"
+
+# Variables
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "timestamp at which to commit"
+pad = 0
+
+[[variables.auxiliary]]
+name = "index"
+type = "BaseField"
+desc = "Index of value being committed."
+pad = 0
+
+[[variables.auxiliary]]
+name = "address"
+type = "DWordWL"
+desc = "Address of first byte to commit."
+pad = ["arr", 0, 0, 0, 0]
+
+[[variables.auxiliary]]
+name = "address_incr"
+type = "DWordHL"
+desc = "$#`address` + 1$"
+pad = ["arr", 1, 0, 0, 0]
+
+[[variables.auxiliary]]
+name = "count"
+type = "DWordWL"
+desc = "number of bytes to commit"
+pad = ["arr", 1, 0, 0, 0]
+
+[[variables.auxiliary]]
+name = "count_decr"
+type = "DWordHL"
+desc = "$#`count` - 1$"
+pad = ["arr", 0, 0, 0, 0]
+
+[[variables.auxiliary]]
+name = "first"
+type = "Bit"
+desc = "Whether this is the first commitment in this sequence."
+pad = 0
+
+[[variables.auxiliary]]
+name = "end"
+type = "Bit"
+desc = "Whether this is the end of the commitment sequence."
+pad = 0
+
+[[variables.auxiliary]]
+name = "value"
+type = "Byte"
+desc = "Byte stored at `address`."
+pad = 0
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+pad = 0
+
+# Assumptions
+
+
+# Constraints
+
+[[constraint_groups]]
+name = "incoming"
+
+[[constraints.incoming]]
+kind = "interaction"
+tag = "ECALL"
+input = ["timestamp",64]
+multiplicity = ["-", "first"]
+ref = "commit:c:receive_ecall"
+
+[[constraint_groups]]
+name = "read_input"
+
+[[constraints.read_input]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, 11], "address", "timestamp", 1, 0, 0]
+output = "address"
+multiplicity = "first"
+ref = "commit:c:read_address"
+
+[[constraints.read_input]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, 12], "count", "timestamp", 1, 0, 0]
+output = "count"
+multiplicity = "first"
+ref = "commit:c:read_count"
+
+[[constraints.read_input]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, 10], "count", "timestamp", 1, 0, 0]
+output = 1
+multiplicity = "first"
+ref = "commit:c:read_fd_write_count"
+
+[[constraints.read_input]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", 2, 254], ["+", "index", ["cast", "count", "BaseField"]], "timestamp", 0, 0, 0]
+output = "index"
+multiplicity = "first"
+ref = "commit:c:read_index"
+
+
+[[constraint_groups]]
+name = "incr_decr"
+
+[[constraints.incr_decr]]
+kind = "template"
+tag = "ADD"
+input = ["address", ["cast", 1, "DWordWL"]]
+output = ["cast", "address_incr", "DWordWL"]
+ref = "commit:c:address_incr"
+
+[[constraints.incr_decr]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "address_incr", "i"]]
+iter = ["i", 0, 3]
+multiplicity = "μ"
+ref = "commit:c:range_address_incr"
+
+[[constraints.incr_decr]]
+kind = "template"
+tag = "SUB"
+input = ["count", ["cast", 1, "DWordWL"]]
+output = ["cast", "count_decr", "DWordWL"]
+ref = "commit:c:count_decr"
+
+[[constraints.incr_decr]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "count_decr", "i"]]
+iter = ["i", 0, 3]
+multiplicity = "μ"
+ref = "commit:c:range_count_decr"
+
+
+[[constraint_groups]]
+name = "commit"
+
+[[constraints.commit]]
+kind = "interaction"
+tag = "MEWM"
+input = [0, "address", "value", "timestamp", 0, 0, 0]
+output = "value"
+multiplicity = ["-", "μ", "end"]
+ref = "commit:c:read_value"
+
+[[constraints.commit]]
+kind = "interaction"
+tag = "COMMIT"
+input = ["index", "value"]
+multiplicity = ["-", "μ", "end"]
+ref = "commit:c:commit_value"
+
+[[constraint_groups]]
+name = "end"
+
+[[constraints.end]]
+kind = "interaction"
+tag = "ZERO"
+input = [["+", ["-", 0xFFFF, ["idx", "count_decr", 0]], ["-", 0xFFFF, ["idx", "count_decr", 1]], ["-", 0xFFFF, ["idx", "count_decr", 2]], ["-", 0xFFFF, ["idx", "count_decr", 3]]]]
+output = "end"
+multiplicity = "μ"
+ref = "commit:c:end"
+
+[[constraint_groups]]
+name = "bits"
+
+[[constraints.bits]]
+kind = "template"
+tag = "IS_BIT"
+input = ["first"]
+ref = "commit:c:range_first"
+
+[[constraints.bits]]
+kind = "template"
+tag = "IS_BIT"
+input = ["end"]
+ref = "commit:c:range_end"
+
+[[constraints.bits]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ"]
+ref = "commit:c:range_mu"
+
+[[constraints.bits]]
+kind = "arith"
+constraint = "$#`first` + #`end` => #`μ` = 1$"
+poly = ["*", ["+", "first", "end"], ["not", "μ"]]
+ref = "commit:c:first_or_end_implies_mu"
+
+[[constraint_groups]]
+name = "lookups"
+
+[[constraints.lookups]]
+kind = "interaction"
+tag = "CNB"
+input = ["timestamp", ["+", "index", 1], ["cast", "address_incr", "DWordWL"], "count_decr"]
+multiplicity = ["-", "μ", "end"]
+ref = "commit:c:send_commit_next_byte"
+
+[[constraints.lookups]]
+kind = "interaction"
+tag = "CNB"
+input = ["timestamp", "index", "address", "count"]
+multiplicity = ["-", ["-", "μ", "first"]]
+ref = "commit:c:receive_commit_next_byte"

From 8af2cb7daf54eb3c08bfab2a85c42bdd095eb9c6 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Thu, 12 Feb 2026 14:46:27 +0100
Subject: [PATCH 069/105] spec: Typecheck signatures and make all chips pass
 (#312)

* spec: Typecheck signatures and make all chips pass

* Apply suggestion from @RobinJadoul

* Apply suggestion from @RobinJadoul

* Apply suggestion from @RobinJadoul

* s/IS_HALFWORD/IS_HALF

* Ensure constants being casted fit into the first limb
---
 spec/memw.typ            |  2 +-
 spec/src/branch.toml     |  2 +-
 spec/src/cpu.toml        | 39 +++++++----------
 spec/src/halt.toml       | 14 +++---
 spec/src/lt.toml         |  6 +--
 spec/src/memw.toml       | 24 +++++------
 spec/src/page.toml       |  2 +-
 spec/src/shift.toml      |  2 +-
 spec/src/signatures.toml | 10 ++++-
 spec/tooling/chip.py     | 93 ++++++++++++++++++++++++++++++++++------
 10 files changed, 128 insertions(+), 66 deletions(-)

diff --git a/spec/memw.typ b/spec/memw.typ
index a3dfda42c..6c12f00a7 100644
--- a/spec/memw.typ
+++ b/spec/memw.typ
@@ -56,4 +56,4 @@ This chip contributes the following to the lookup argument.
 - MEMB chip that deals does a one-byte write to remove old_timestamp from here (uncertain tradeoffs)
 - Compute `base_address[1] + 1` once and have high words of `address_add` as Words
 - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1)
-- Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALFWORD` lookups may make some GKR things faster if there are known zeroes.
+- Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
diff --git a/spec/src/branch.toml b/spec/src/branch.toml
index 34bcdf8cb..a98974678 100644
--- a/spec/src/branch.toml
+++ b/spec/src/branch.toml
@@ -130,7 +130,7 @@ multiplicity = "μ"
 
 [[constraints.all]]
 kind = "interaction"
-tag = "IS_HALFWORD"
+tag = "IS_HALF"
 input = [["idx", "next_pc_high", "i"]]
 iter = ["i", 0, 2]
 multiplicity = "μ"
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index 283597246..634d00bf9 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -77,8 +77,6 @@ type = "Bit"
 desc = "Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4"
 pad = 0
 
-# TODO: Should this just be a word? (CHECK: effect on computation/extension of arg2)
-# TODO: make sure decode correctly extends this (may be zero for unsigned and word_instr?)
 [[variables.input]]
 name = "imm"
 type = "DWordWL"
@@ -619,7 +617,7 @@ iter = ["i", 0, 7]
 kind = "interaction"
 tag = "SHIFT"
 input = [["cast", "arg1", "DWordHL"], ["idx", "arg2", 0], "mp_selector", "signed", "word_instr"]
-output = ["cast", "res", "DWordHL"]
+output = ["cast", "res", "DWordWL"]
 multiplicity = "SHIFT"
 
 [[constraints.alu]]
@@ -629,20 +627,18 @@ input = ["pc", ["*", ["+", ["*", 2, "c_type_instruction"], ["*", 4, ["not", "c_t
 output = ["cast", "res", "DWordWL"]
 cond = "JALR"
 
-# TODO: no types available, so no casting yet
 [[constraints.alu]]
 kind = "interaction"
 tag = "MUL"
-input = ["arg1", "signed", "arg2", "mp_selector", "muldiv_selector"]
-output = "res"
+input = [["cast", "arg1", "DWordHL"], "signed", ["cast", "arg2", "DWordHL"], "mp_selector", "muldiv_selector"]
+output = ["cast", "res", "DWordWL"]
 multiplicity = "MUL"
 
-# TODO: no types available, so no casting yet
 [[constraints.alu]]
 kind = "interaction"
 tag = "DVRM"
-input = ["arg1", "arg2", "signed", "muldiv_selector"]
-output = "res"
+input = [["cast", "arg1", "DWordHL"], ["cast", "arg2", "DWordHL"], "signed", "muldiv_selector"]
+output = ["cast", "res", "DWordWL"]
 multiplicity = "DIVREM"
 
 
@@ -650,12 +646,11 @@ multiplicity = "DIVREM"
 name = "mem"
 prefix = "M"
 
-# TODO: no types available, so no casting yet
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, "rs1"], "rv1", ["+", "timestamp", ["cast", 0, "DWordWL"]], 1, 0, 0]
-output = "rv1"
+input = [1, ["*", ["cast", 2, "DWordWL"], "rs1"], ["arr", ["idx", ["cast", "rv1", "DWordWL"], 0], ["idx", ["cast", "rv1", "DWordWL"], 1], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 0, "DWordWL"]], 1, 0, 0]
+output = ["arr", ["idx", ["cast", "rv1", "DWordWL"], 0], ["idx", ["cast", "rv1", "DWordWL"], 1], 0, 0, 0, 0, 0, 0]
 multiplicity = "read_register1"
 
 [[constraints.mem]]
@@ -664,12 +659,11 @@ constraint = "$#`!read_register1` => #`rv1[i]` = 0$"
 poly = ["*", ["not", "read_register1"], ["idx", "rv1", "i"]]
 iter = ["i", 0, 2]
 
-# TODO: no types available, so no casting yet
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, "rs2"], "rv2", ["+", "timestamp", ["cast", 1, "DWordWL"]], 1, 0, 0]
-output = "rv2"
+input = [1, ["*", ["cast", 2, "DWordWL"], "rs2"], ["arr", ["idx", ["cast", "rv2", "DWordWL"], 0], ["idx", ["cast", "rv2", "DWordWL"], 1], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 1, "DWordWL"]], 1, 0, 0]
+output = ["arr", ["idx", ["cast", "rv2", "DWordWL"], 0], ["idx", ["cast", "rv2", "DWordWL"], 1], 0, 0, 0, 0, 0, 0]
 multiplicity = "read_register2"
 
 [[constraints.mem]]
@@ -678,33 +672,30 @@ constraint = "$#`!read_register2` => #`rv2[i]` = 0$"
 poly = ["*", ["not", "read_register2"], ["idx", "rv2", "i"]]
 iter = ["i", 0, 2]
 
-# TODO: no types available, so no casting yet
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, "rd"], "rvd", ["+", "timestamp", ["cast", 2, "DWordWL"]], 1, 0, 0]
+input = [1, ["*", ["cast", 2, "DWordWL"], "rd"], ["arr", ["idx", "rvd", 0], ["idx", "rvd", 1], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 2, "DWordWL"]], 1, 0, 0]
 multiplicity = "write_register"
 
 [[constraints.mem]]
 kind = "interaction"
 tag = "LOAD"
-input = [0, "res", ["+", "timestamp", ["cast", 0, "DWordWL"]], "memory_2bytes", "memory_4bytes", "memory_8bytes", "signed"]
+input = [["cast", "res", "DWordWL"], ["+", "timestamp", ["cast", 0, "DWordWL"]], "memory_2bytes", "memory_4bytes", "memory_8bytes", "signed"]
 output = "rvd"
 multiplicity = "LOAD"
 
-# TODO: no types available, so no casting yet
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [0, "res", ["cast", "arg2", ["Byte", 8]], ["+", "timestamp", ["cast", 1, "DWordWL"]], "memory_2bytes", "memory_4bytes", "memory_8bytes"]
+input = [0, ["cast", "res", "DWordWL"], ["cast", "arg2", ["Byte", 8]], ["+", "timestamp", ["cast", 1, "DWordWL"]], "memory_2bytes", "memory_4bytes", "memory_8bytes"]
 multiplicity = "STORE"
 
-# TODO: no types available, so no casting yet
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, 255], "next_pc", ["+", "timestamp", ["cast", 1, "DWordWL"]], 1, 0, 0]
-output = "pc"
+input = [1, ["cast", ["*", 2, 255], "DWordWL"], ["arr", ["idx", "next_pc", 0], ["idx", "next_pc", 1], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 1, "DWordWL"]], 1, 0, 0]
+output = ["arr", ["idx", "pc", 0], ["idx", "pc", 1], 0, 0, 0, 0, 0, 0]
 multiplicity = ["not", "pad"]
 
 
@@ -817,7 +808,7 @@ poly = ["+",
 [[constraints.misc]]
 kind = "interaction"
 tag = "BRANCH"
-input = ["pc", ["idx", "imm", 0], ["cast", "arg1", "DWordWL"], "JALR"]
+input = ["pc", "imm", ["cast", "arg1", "DWordWL"], "JALR"]
 output = "next_pc"
 multiplicity = "branch_cond"
 
diff --git a/spec/src/halt.toml b/spec/src/halt.toml
index b0606e3e4..9fee04877 100644
--- a/spec/src/halt.toml
+++ b/spec/src/halt.toml
@@ -17,7 +17,7 @@ name = "all"
 [[constraints.all]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, "i"], 0, ["-", ["^", 2, 64], 1], 1, 0, 0]
+input = [1, ["cast", ["*", 2, "i"], "DWordWL"], ["cast", 0, ["BaseField", 8]], ["cast", ["-", ["^", 2, 64], 1], "DWordWL"], 1, 0, 0]
 iter = ["i", 1, 9]
 multiplicity = 1
 ref = "halt:c:zeroize_registers_lo"
@@ -25,15 +25,15 @@ ref = "halt:c:zeroize_registers_lo"
 [[constraints.all]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, 10], 0, ["-", ["^", 2, 64], 1], 1, 0, 0]
-output = 0
+input = [1, ["cast", ["*", 2, 10], "DWordWL"], ["cast", 0, ["BaseField", 8]], ["cast", ["-", ["^", 2, 64], 1], "DWordWL"], 1, 0, 0]
+output = ["cast", 0, ["BaseField", 8]]
 multiplicity = 1
 ref = "halt:c:read_zero_exit_code"
 
 [[constraints.all]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, "i"], 0, ["-", ["^", 2, 64], 1], 1, 0, 0]
+input = [1, ["cast", ["*", 2, "i"], "DWordWL"], ["cast", 0, ["BaseField", 8]], ["cast", ["-", ["^", 2, 64], 1], "DWordWL"], 1, 0, 0]
 iter = ["i", 11, 31]
 multiplicity = 1
 ref = "halt:c:zeroize_registers_hi"
@@ -41,7 +41,7 @@ ref = "halt:c:zeroize_registers_hi"
 [[constraints.all]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, 255], 1, ["-", ["^", 2, 64], 1], 1, 0, 0]
+input = [1, ["cast", ["*", 2, 255], "DWordWL"], ["arr", 1, 0, 0, 0, 0, 0, 0, 0], ["cast", ["-", ["^", 2, 64], 1], "DWordWL"], 1, 0, 0]
 multiplicity = 1
 ref = "halt:c:pc"
 
@@ -51,6 +51,6 @@ name = "lookup"
 [[constraints.lookup]]
 kind = "interaction"
 tag = "ECALL"
-input = ["timestamp", 93]
+input = ["timestamp", ["cast", 93, "DWordWL"]]
 multiplicity = ["-", 1]
-ref = "halt:c:lookup"
\ No newline at end of file
+ref = "halt:c:lookup"
diff --git a/spec/src/lt.toml b/spec/src/lt.toml
index 1941dbb7a..70d25c919 100644
--- a/spec/src/lt.toml
+++ b/spec/src/lt.toml
@@ -130,21 +130,21 @@ iter = ["i", 0, 1]
 
 [[constraints.defs]]
 kind = "interaction"
-tag = "IS_HALFWORD"
+tag = "IS_HALF"
 input = [["idx", "lhs", 1]]
 multiplicity = "μ"
 ref = "lt:c:range_lhs"
 
 [[constraints.defs]]
 kind = "interaction"
-tag = "IS_HALFWORD"
+tag = "IS_HALF"
 input = [["idx", "rhs", 1]]
 multiplicity = "μ"
 ref = "lt:c:range_rhs"
 
 [[constraints.sub]]
 kind = "interaction"
-tag = "IS_HALFWORD"
+tag = "IS_HALF"
 input = [["idx", "lhs_sub_rhs", "i"]]
 iter = ["i", 0, 3]
 multiplicity = "μ"
diff --git a/spec/src/memw.toml b/spec/src/memw.toml
index af005c2b4..0ae3b9410 100644
--- a/spec/src/memw.toml
+++ b/spec/src/memw.toml
@@ -127,14 +127,14 @@ poly = ["*", "w2", ["not", "μ_sum"]]
 [[constraints.consistency]]
 kind = "template"
 tag = "ADD"
-input = ["base_address", 1]
+input = ["base_address", ["cast", 1, "DWordWL"]]
 output = ["cast", ["idx", "address_add", 0], "DWordWL"]
 cond = "w2"
 
 [[constraints.consistency]]
 kind = "template"
 tag = "ADD"
-input = ["base_address", ["+", "i", 1]]
+input = ["base_address", ["cast", ["+", "i", 1], "DWordWL"]]
 output = ["cast", ["idx", "address_add", "i"], "DWordWL"]
 iter = ["i", 1, 2]
 cond = "w4"
@@ -142,14 +142,14 @@ cond = "w4"
 [[constraints.consistency]]
 kind = "template"
 tag = "ADD"
-input = ["base_address", ["+", "i", 1]]
+input = ["base_address", ["cast", ["+", "i", 1], "DWordWL"]]
 output = ["cast", ["idx", "address_add", "i"], "DWordWL"]
 iter = ["i", 3, 6]
 cond = "write8"
 
 [[constraints.consistency]]
 kind = "interaction"
-tag = "IS_HALFWORD"
+tag = "IS_HALF"
 input = [["idx", ["idx", "address_add", "i"], "j"]]
 iters = [
   ["i", 0, 0],
@@ -159,7 +159,7 @@ multiplicity = "w2"
 
 [[constraints.consistency]]
 kind = "interaction"
-tag = "IS_HALFWORD"
+tag = "IS_HALF"
 input = [["idx", ["idx", "address_add", "i"], "j"]]
 iters = [
   ["i", 1, 2],
@@ -169,7 +169,7 @@ multiplicity = "w4"
 
 [[constraints.consistency]]
 kind = "interaction"
-tag = "IS_HALFWORD"
+tag = "IS_HALF"
 input = [["idx", ["idx", "address_add", "i"], "j"]]
 iters = [
   ["i", 3, 6],
@@ -253,40 +253,40 @@ multiplicity = ["-", "μ_sum"]
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["idx", "address_add", 0], ["idx", "old_timestamp", 1], ["idx", "old", 1]]
+input = ["is_register", ["cast", ["idx", "address_add", 0], "DWordWL"], ["idx", "old_timestamp", 1], ["idx", "old", 1]]
 multiplicity = "w2"
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["idx", "address_add", 0], "timestamp", ["idx", "value", 1]]
+input = ["is_register", ["cast", ["idx", "address_add", 0], "DWordWL"], "timestamp", ["idx", "value", 1]]
 multiplicity = ["-", "w2"]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["idx", "address_add", ["-", "i", 1]], ["idx", "old_timestamp", "i"], ["idx", "old", "i"]]
+input = ["is_register", ["cast", ["idx", "address_add", ["-", "i", 1]], "DWordWL"], ["idx", "old_timestamp", "i"], ["idx", "old", "i"]]
 multiplicity = "w4"
 iter = ["i", 2, 3]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["idx", "address_add", ["-", "i", 1]], "timestamp", ["idx", "value", "i"]]
+input = ["is_register", ["cast", ["idx", "address_add", ["-", "i", 1]], "DWordWL"], "timestamp", ["idx", "value", "i"]]
 multiplicity = ["-", "w4"]
 iter = ["i", 2, 3]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["idx", "address_add", ["-", "i", 1]], ["idx", "old_timestamp", "i"], ["idx", "old", "i"]]
+input = ["is_register", ["cast", ["idx", "address_add", ["-", "i", 1]], "DWordWL"], ["idx", "old_timestamp", "i"], ["idx", "old", "i"]]
 multiplicity = "write8"
 iter = ["i", 4, 7]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["idx", "address_add", ["-", "i", 1]], "timestamp", ["idx", "value", "i"]]
+input = ["is_register", ["cast", ["idx", "address_add", ["-", "i", 1]], "DWordWL"], "timestamp", ["idx", "value", "i"]]
 multiplicity = ["-", "write8"]
 iter = ["i", 4, 7]
 
diff --git a/spec/src/page.toml b/spec/src/page.toml
index 21ec76757..e937cfaa4 100644
--- a/spec/src/page.toml
+++ b/spec/src/page.toml
@@ -55,7 +55,7 @@ multiplicity = 1
 [[constraints.all]]
 kind = "interaction"
 tag = "memory"
-input = [0, "address", 0, "init"]
+input = [0, "address", ["cast", 0, "DWordWL"], "init"]
 multiplicity = -1
 
 [[constraints.all]]
diff --git a/spec/src/shift.toml b/spec/src/shift.toml
index 5faed54c7..45c00b064 100644
--- a/spec/src/shift.toml
+++ b/spec/src/shift.toml
@@ -133,7 +133,7 @@ pad = 0
 # Assumptions
 
 [[assumptions]]
-desc = "`IS_HALFWORD[in[i]]`"
+desc = "`IS_HALF[in[i]]`"
 iter = ["i", 0, 3]
 ref = "shift:a:range_in"
 
diff --git a/spec/src/signatures.toml b/spec/src/signatures.toml
index ba233f1e6..66b5bd6cd 100644
--- a/spec/src/signatures.toml
+++ b/spec/src/signatures.toml
@@ -53,7 +53,7 @@ output = "DWordWL"
 [[signatures]]
 tag = "BRANCH"
 kind = "interaction"
-input = ["DWordWL", "Word", "DWordWL", "Bit"]
+input = ["DWordWL", "DWordWL", "DWordWL", "Bit"]
 output = "DWordWL"
 
 # MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]
@@ -175,4 +175,10 @@ output = "Half"
 tag = "HWSLC"
 kind = "interaction"
 input = ["Half", "B4"]
-output = "Half"
\ No newline at end of file
+output = "Half"
+
+# The actual memory tokens, see MEMW and PAGE
+[[signatures]]
+tag = "memory"
+kind = "interaction"
+input = ["Bit", "DWordWL", "DWordWL", "BaseField"]
diff --git a/spec/tooling/chip.py b/spec/tooling/chip.py
index 6a78dc091..58deb4b3c 100644
--- a/spec/tooling/chip.py
+++ b/spec/tooling/chip.py
@@ -59,6 +59,23 @@ def get_const(self) -> int:
 
 DEFAULT_TYPE: Type = Range.const(0)
 
+
+def structure_matches(a: Type, b: Type) -> bool:
+    """Compare the list nesting of two types; a Range matches a Range or None."""
+    if isinstance(a, list):
+        if not isinstance(b, list) or len(a) != len(b):
+            return False
+        return all(structure_matches(x, y) for x, y in zip(a, b))
+    return isinstance(a, Range) and isinstance(b, (Range, type(None)))
+
+
+def constant_fits(cst: int, target: Type) -> bool:
+    """Check `cst` against the bounds of the first limb of `target`."""
+    while not isinstance(target, Range):
+        target = target[0]  # descend into the first element until a Range is hit
+    return target.low <= cst <= target.high
+
+
 type Expr = (
     LitExpr
     | VarExpr
@@ -150,6 +167,11 @@ def typecheck(self, env: Environment) -> Type:
             baselen >= castlen or (isinstance(base, Range) and base.is_const()),
             f"Casting from fewer columns to more: {self!r} {base} {self.type}",
         )
+        if isinstance(base, Range) and base.is_const():
+            reporter.asserts(
+                constant_fits(base.get_const(), self.type),
+                f"Casting const to type it doesn't fit: {self!r}",
+            )
         return self.type
 
 
@@ -612,16 +634,6 @@ def __init__(self, config: Config, category: str, data: dict):
         self.def_ = VirtualDef(config, self.name, self.type, def_)
 
     def typecheck(self, env: Environment) -> Type:
-        def structure_matches(a: Type, b: Type) -> bool:
-            if isinstance(a, Range) and isinstance(b, (Range, type(None))):
-                return True
-            elif isinstance(a, list) and isinstance(b, list):
-                return len(a) == len(b) and all(
-                    structure_matches(x, y) for x, y in zip(a, b)
-                )
-            else:
-                return False
-
         def handle_iters(
             env: Environment,
             iters: list[Iter],
@@ -794,6 +806,21 @@ class Signature:
     input: list[Type]
     output: Optional[Type]
 
+    def matches(self, other: Self) -> bool:
+        """Whether `other` is a signature of the same class, tag and shape.
+
+        Only the structure of the input/output types is compared (see
+        `structure_matches`); output must be present on both sides or neither.
+        """
+        if not isinstance(other, type(self)) or self.tag != other.tag:
+            return False
+        mine, theirs = self.output, other.output
+        if (mine is None) != (theirs is None):
+            return False
+        if mine is not None and not structure_matches(mine, theirs):
+            return False
+        return structure_matches(self.input, other.input)
+
 
 @dataclass
 class InteractionLike:
@@ -971,9 +998,49 @@ def typecheck(self) -> Iterable[Signature]:
             yield from c.typecheck(env)
 
 
+def build_signature(config: Config, data: dict) -> Signature:
+    """Build the template/interaction `Signature` described by one TOML entry."""
+    assert_no_unexpected(
+        data, {"tag", "kind", "input", "output", "cond", "multiplicity"}
+    )
+    Sig: type[Signature]
+    match data["kind"]:
+        case "template":
+            reporter.asserts(
+                "multiplicity" not in data,
+                f"Template signature with multiplicity: {data!r}",
+            )
+            Sig = TemplateSignature
+        case "interaction":
+            reporter.asserts(
+                "cond" not in data, f"Interaction signature with cond: {data!r}"
+            )
+            Sig = InteractionSignature
+        case kind:  # without this arm, `Sig` would be unbound below
+            raise ValueError(f"Unknown signature kind: {kind!r}")
+    tag = data["tag"]
+    reporter.asserts(isinstance(tag, str), f"Signature tag not a string: {tag!r}")
+    input = [build_type(config, inp) for inp in data["input"]]
+    output = build_type(config, data["output"]) if "output" in data else None
+    return Sig(tag, input, output)
+
+
+def read_signatures(config, filename) -> list[Signature]:
+    with open(filename, "rb") as f:  # `with` closes the file; bare open() leaked it
+        assert_no_unexpected(data := tomllib.load(f), {"signatures"})
+    return [build_signature(config, sig) for sig in data["signatures"]]
+
+
+def check_signatures(found: Iterable[Signature], expected: list[Signature]):
+    """Report each found signature that matches none of the expected ones."""
+    for candidate in found:
+        known = any(candidate.matches(exp) for exp in expected)
+        reporter.asserts(known, f"Unexpected signature: {candidate}")
+
+
 if __name__ == "__main__":
     config = Config.from_file(sys.argv[1])
-    signatures = sys.argv[2]  # Later
+    signatures = read_signatures(config, sys.argv[2])
     if reporter.reported:
         sys.exit(1)
     reported = False
@@ -986,6 +1053,4 @@ def typecheck(self) -> Iterable[Signature]:
     if not reported:
         for chip in chips:
             reporter.update_location(f"Chip {chip.name}")
-            # TODO: do something with the signatures
-            # Use list for the sideeffect of forcing the generator until we use the content
-            list(chip.typecheck())
+            check_signatures(chip.typecheck(), signatures)

From 5b81913b865538e311a1ea8d2f6ed5bc009e7531 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Thu, 12 Feb 2026 15:57:32 +0100
Subject: [PATCH 070/105] spec: Variable category for constants (#327)

Closes #303
---
 spec/src/config.toml | 2 +-
 spec/src/page.toml   | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/spec/src/config.toml b/spec/src/config.toml
index 0f6ef11d6..fd0885d40 100644
--- a/spec/src/config.toml
+++ b/spec/src/config.toml
@@ -120,5 +120,5 @@ desc = "A preprocessed column holding the row index (zero-indexed)."
 preprocessed = true
 
 [variables.categories]
-all = ["input", "output", "auxiliary", "virtual", "multiplicity", "condition"]
+all = ["constant", "input", "output", "auxiliary", "virtual", "multiplicity", "condition"]
 instantiated = ["input", "output", "auxiliary", "multiplicity"]
diff --git a/spec/src/page.toml b/spec/src/page.toml
index e937cfaa4..dff939558 100644
--- a/spec/src/page.toml
+++ b/spec/src/page.toml
@@ -2,8 +2,7 @@ name = "PAGE"
 
 # Input
 
-# TODO: add `page` as "constant" column or smth
-[[variables.input]]
+[[variables.constant]]
 name = "page"
 type = "DWordWL"
 desc = "Constant column containing the page base address; should be integrated into the constraints directly"

From 68457ea6a12a48ac26cd170561a87a6c76f936ea Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Fri, 13 Feb 2026 14:27:36 +0100
Subject: [PATCH 071/105] spec: Fix interaction signatures for COMMIT (#328)

---
 spec/src/commit.toml     | 26 +++++++++++++-------------
 spec/src/signatures.toml | 12 ++++++++++++
 2 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/spec/src/commit.toml b/spec/src/commit.toml
index 5d8325363..89fa133c6 100644
--- a/spec/src/commit.toml
+++ b/spec/src/commit.toml
@@ -73,7 +73,7 @@ name = "incoming"
 [[constraints.incoming]]
 kind = "interaction"
 tag = "ECALL"
-input = ["timestamp",64]
+input = ["timestamp", ["cast", 64, "DWordWL"]]
 multiplicity = ["-", "first"]
 ref = "commit:c:receive_ecall"
 
@@ -83,32 +83,32 @@ name = "read_input"
 [[constraints.read_input]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, 11], "address", "timestamp", 1, 0, 0]
-output = "address"
+input = [1, ["cast", ["*", 2, 11], "DWordWL"], ["arr", ["idx", "address", 0], ["idx", "address", 1], 0, 0, 0, 0, 0, 0], "timestamp", 1, 0, 0]
+output = ["arr", ["idx", "address", 0], ["idx", "address", 1], 0, 0, 0, 0, 0, 0]
 multiplicity = "first"
 ref = "commit:c:read_address"
 
 [[constraints.read_input]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, 12], "count", "timestamp", 1, 0, 0]
-output = "count"
+input = [1, ["cast", ["*", 2, 12], "DWordWL"], ["arr", ["idx", "count", 0], ["idx", "count", 1], 0, 0, 0, 0, 0, 0], "timestamp", 1, 0, 0]
+output = ["arr", ["idx", "count", 0], ["idx", "count", 1], 0, 0, 0, 0, 0, 0]
 multiplicity = "first"
 ref = "commit:c:read_count"
 
 [[constraints.read_input]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, 10], "count", "timestamp", 1, 0, 0]
-output = 1
+input = [1, ["cast", ["*", 2, 10], "DWordWL"], ["arr", ["idx", "count", 0], ["idx", "count", 1], 0, 0, 0, 0, 0, 0], "timestamp", 1, 0, 0]
+output = ["arr", 1, 0, 0, 0, 0, 0, 0, 0]
 multiplicity = "first"
 ref = "commit:c:read_fd_write_count"
 
 [[constraints.read_input]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", 2, 254], ["+", "index", ["cast", "count", "BaseField"]], "timestamp", 0, 0, 0]
-output = "index"
+input = [1, ["cast", ["*", 2, 254], "DWordWL"], ["arr", ["+", "index", ["cast", "count", "BaseField"]], 0, 0, 0, 0, 0, 0, 0], "timestamp", 0, 0, 0]
+output = ["arr", "index", 0, 0, 0, 0, 0, 0, 0]
 multiplicity = "first"
 ref = "commit:c:read_index"
 
@@ -152,9 +152,9 @@ name = "commit"
 
 [[constraints.commit]]
 kind = "interaction"
-tag = "MEWM"
-input = [0, "address", "value", "timestamp", 0, 0, 0]
-output = "value"
+tag = "MEMW"
+input = [0, "address", ["arr", "value", 0, 0, 0, 0, 0, 0, 0], "timestamp", 0, 0, 0]
+output = ["arr", "value", 0, 0, 0, 0, 0, 0, 0]
 multiplicity = ["-", "μ", "end"]
 ref = "commit:c:read_value"
 
@@ -209,7 +209,7 @@ name = "lookups"
 [[constraints.lookups]]
 kind = "interaction"
 tag = "CNB"
-input = ["timestamp", ["+", "index", 1], ["cast", "address_incr", "DWordWL"], "count_decr"]
+input = ["timestamp", ["+", "index", 1], ["cast", "address_incr", "DWordWL"], ["cast", "count_decr", "DWordWL"]]
 multiplicity = ["-", "μ", "end"]
 ref = "commit:c:send_commit_next_byte"
 
diff --git a/spec/src/signatures.toml b/spec/src/signatures.toml
index 66b5bd6cd..69a839d2e 100644
--- a/spec/src/signatures.toml
+++ b/spec/src/signatures.toml
@@ -103,6 +103,18 @@ tag = "ECALL"
 kind = "interaction"
 input = ["DWordWL", "DWordWL"]
 
+# CNB[timestamp, index, address, count]
+[[signatures]]
+tag = "CNB"
+kind = "interaction"
+input = ["DWordWL", "BaseField", "DWordWL", "DWordWL"]
+
+# COMMIT[index, value]
+[[signatures]]
+tag = "COMMIT"
+kind = "interaction"
+input = ["BaseField", "Byte"]
+
 # AND_BYTE[res; X, Y]
 [[signatures]]
 tag = "AND_BYTE"

From b0bdd6017399b67760cbcd5835359c5dce9f25cc Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 17 Feb 2026 15:10:32 +0100
Subject: [PATCH 072/105] spec: Cleanup, uniformize chapters, make colors work
 better on web. (#336)

* spec: Cleanup, uniformize chapters, make colors work better on web.

* Fix double scroll bar

* Improve decode table

* Remove `style` state and make aside box grey.

Having multiple web themes makes the style approach
almost always wrong, since we cannot rely on the
scheme being dark or light, in contrast to a regular PDF.

* Apply suggestions from code review

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* Update spec/cpu.typ

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

---------

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/add.typ         | 34 ++++++++--------------------------
 spec/bitwise.typ     |  4 ++++
 spec/book.typ        | 20 +++++++++-----------
 spec/branch.typ      |  4 +++-
 spec/cpu.typ         |  6 ++++--
 spec/decode.typ      |  7 ++++---
 spec/dvrm.typ        |  2 ++
 spec/ebook.typ       |  7 ++-----
 spec/ecall.typ       |  9 +++++----
 spec/is_bit.typ      | 15 ---------------
 spec/load.typ        |  4 ++++
 spec/lt.typ          |  3 +++
 spec/memory.typ      |  8 +++-----
 spec/memw.typ        |  7 +++++++
 spec/mul.typ         |  5 ++++-
 spec/neg.typ         | 16 +---------------
 spec/shift.typ       | 15 +--------------
 spec/sign.typ        | 13 -------------
 spec/src/config.toml |  2 +-
 19 files changed, 65 insertions(+), 116 deletions(-)

diff --git a/spec/add.typ b/spec/add.typ
index 241ea8621..d2afb1788 100644
--- a/spec/add.typ
+++ b/spec/add.typ
@@ -8,33 +8,15 @@
 #show: book-page(chip.name)
 
 #let add = raw(chip.name)
-
-#let highlighted_code(code) = {
-  box(
-    inset: (left: 4pt, right: 4pt), 
-    outset: (top: 4pt, bottom: 4pt), 
-    radius: 2pt,
-    fill: luma(230), 
-    raw(code))
-}
-
-#add is a constraint template that is used to assert that $#`sum` = #`lhs` + #`rhs` mod 2^64$, under the condition that `cond` is non-zero.
-
-= Notation
-The #add constraint template has the following interface:
-#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => ADD<sum; lhs, rhs>"))
-where `cond` is any value described by an expression _of degree at most $1$_.
-#highlighted_code("ADD<sum; lhs, rhs>") can be used to denote the _unconditional_ application of the #add template to `lhs`, `rhs`, and `sum`.
-
 #let sub = raw("SUB")
-== #sub
-For ease of notation, we moreover introduce the #sub constraint template.
-Its interface
-#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => SUB<diff; lhs, rhs>"))
-maps onto the #add template as 
-#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => ADD<lhs; rhs, diff>"))
-It constrains that $#`diff` = #`lhs` - #`rhs` mod 2^64$ when the expression `cond` is non-zero.
-As with #add, #highlighted_code("SUB<diff; lhs, rhs>") can be used to denote the _unconditional_ application of the template.
+
+#add is a constraint template that is used to assert that $#`sum` equiv #`lhs` + #`rhs` (mod 2^64)$, under the condition that `cond` is non-zero.
+For ease of notation, we moreover introduce the #sub constraint template
+$
+#`SUB<diff; lhs, rhs>` := #`ADD<lhs; rhs, diff>`,
+$
+in both conditional and unconditional versions.
+It constrains that $#`diff` equiv #`lhs` - #`rhs` (mod 2^64)$ when the expression `cond` is non-zero.
 
 = Variables
 #render_chip_column_table(chip, config)
diff --git a/spec/bitwise.typ b/spec/bitwise.typ
index ef1e3a671..d0b3d89e2 100644
--- a/spec/bitwise.typ
+++ b/spec/bitwise.typ
@@ -14,6 +14,10 @@
 #let bitwise = raw(chip.name)
 
 #show: book-page(chip.name)
+#let bitwise = raw(chip.name)
+
+The #bitwise chips deal with precomputed lookup tables for bitwise boolean operations
+and convenience functionalities over small domains.
 
 = Columns
 #let nr_variables = total_nr_variables(chip)
diff --git a/spec/book.typ b/spec/book.typ
index 7a55323cd..338b2679b 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -35,6 +35,7 @@
 
 #let common-formatting(body) = {
   set footnote(numbering: "[1]")
+  show raw.where(block: true): it => block(it, inset: 1em, width: 100%, radius: 5pt)
   body
 }
 
@@ -46,16 +47,12 @@
 #let rj = todo.with(background: teal, name: "Robin")
 #let et = todo.with(background: rgb("d4aa3a"), name: "Erik")
 
-#let style = state("style", (
-  foreground: white,
-))
-
 #let aside(title, body) = context figure(
-  block(inset: (left: 1em, right: 1em, bottom: 1em), stroke: style.final().foreground, breakable: false)[
+  block(inset: (left: 1em, right: 1em, bottom: 1em), stroke: luma(50%), breakable: false)[
     #block(inset: (left: 1em, right: 1em, top: .75em, bottom: .75em),
            width: 100% + 2em,
            fill: rgb("55aaff"),
-           stroke: style.final().foreground,
+           stroke: luma(50%),
            align(center, strong(text(fill: black, title))))
     #align(left, body)
 ])
@@ -83,10 +80,9 @@
 
 // Invisibly include another chapter, so that its labels can be resolved
 #let xref-include(f) = {
-  context if f not in _xref-included.get() {
-    hide(box(width: 0%, height: 0%, strip-all(include "/" + f)))
+  context {
+    place(hide(box(width: auto, height: 0%, strip-all(include "/" + f))))
   }
-  context _xref-included.update(x => x + ((f): true))
 }
 
 // Generate a cross-link for references to other chapters.
@@ -102,7 +98,7 @@
     } else {
       // Because shiroa does weird url escaping
       let shiroa-label = label(str(lbl).replace(":", "%3A"))
-      xref-include(ch)
+      context _xref-included.update(x => x + ((ch): true))
       // The ideal would be to use `rf` directly as content argument to `cross-link`,
       // as that would inherit any/all formatting of the ref we want or need.
       // Unfortunately the ref link seems to take precedence over the cross-link hyperlink
@@ -140,7 +136,6 @@
   if is-shiroa {
     (body) => {
       show: common-formatting
-      context _xref-included.update(x => x + ((file): true))
       context _toplevel.update(s => {
         if s == none {
           file
@@ -153,6 +148,9 @@
         #show ref: it => context if _toplevel.final() == file {
           xref(it)
         }
+        #context _xref-included.final().pairs().map(((key, value)) => context if value and cond() {
+          xref-include(key)
+        }).join()
         #body
       ])
     }
diff --git a/spec/branch.typ b/spec/branch.typ
index 3e944ca63..90503e862 100644
--- a/spec/branch.typ
+++ b/spec/branch.typ
@@ -13,6 +13,9 @@
 #let chip = load_chip("src/branch.toml", config)
 
 #show: book-page(chip.name)
+#let branch = raw(chip.name)
+
+The #branch chip computes the target address of a branching instruction.
 
 = Columns
 #let nr_variables = total_nr_variables(chip)
@@ -27,7 +30,6 @@ The `BRANCH` chip is comprised of #nr_variables variables that are expressed usi
 
 = Constraints
 
-#rj[Check correspondence with CPU for passing in `offset` as word or dword]
 We constrain `next_pc` to be $#`base_address` + #`offset`$,
 where `base_address` equals `pc` when $#`JALR` = 0$ and `register` otherwise.
 
diff --git a/spec/cpu.typ b/spec/cpu.typ
index ed6126388..08fe1533d 100644
--- a/spec/cpu.typ
+++ b/spec/cpu.typ
@@ -13,6 +13,10 @@
 #let chip = load_chip("src/cpu.toml", config)
 
 #show: book-page(chip.name)
+#let cpu = raw(chip.name)
+
+The #cpu chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations.
+It bases its decisions on the entry of the `DECODE` table (@decode) corresponding to the current program counter (PC).
 
 = Columns
 #let nr_variables = total_nr_variables(chip)
@@ -29,8 +33,6 @@ First, we perform a decoding lookup for the current PC.
 
 #render_constraint_table(chip, config, groups: "decode")
 
-#rj[All casts for interactions will have to be reviewed once other chip interfaces stabilise]
-
 == Range checks
 
 We constrain all columns to have the appropriate ranges.
diff --git a/spec/decode.typ b/spec/decode.typ
index 87f6083f5..f57d7c76f 100644
--- a/spec/decode.typ
+++ b/spec/decode.typ
@@ -79,13 +79,14 @@ To indicate an instruction is provided in compressed form, the `c_type` flag is
   show figure: set block(breakable: true)
 
   figure(table(
-    columns: (auto, auto, 40pt, 40pt, 1fr, 15pt),
+    columns: (auto, auto, auto, auto, 1fr, auto),
     stroke: 0pt,
     inset: (right: .5em),
     align: (left, right, center, center, left, right),
     fill: (_, y) =>
-      if calc.odd(y) and y <= lines.len() { luma(245) }
-      else { white },
+      // Overlay a low-opacity fill color to distinguish the different rows better
+      if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) }
+      else { color.rgb(255, 255, 255, 20) },
     table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []),
     table.hline(stroke: 1.5pt),
     table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt),
diff --git a/spec/dvrm.typ b/spec/dvrm.typ
index 54e71d771..920aec075 100644
--- a/spec/dvrm.typ
+++ b/spec/dvrm.typ
@@ -17,6 +17,8 @@
 
 #let dvrm = raw(chip.name)
 
+The #dvrm chip provides division and remainder functionality, both signed and unsigned.
+
 = Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
diff --git a/spec/ebook.typ b/spec/ebook.typ
index e1b15253e..0e08536fd 100644
--- a/spec/ebook.typ
+++ b/spec/ebook.typ
@@ -1,17 +1,14 @@
-#import "/book.typ": style, meta, common-formatting
+#import "/book.typ": meta, common-formatting
 
 #set document(author: meta.authors, title: meta.title)
 
-#style.update((
-  foreground: black,
-))
-
 #align(center, title(meta.title))
 #pagebreak(weak: true)
 #outline()
 
 #show: common-formatting
 #show heading: set heading(numbering: "1.1")
+#show raw.where(block: true): set block(fill: luma(230))
 
 #meta.summary.map(((ch, title, ref)) => [
   #pagebreak(weak: true)
diff --git a/spec/ecall.typ b/spec/ecall.typ
index dd3544ef3..3b82019db 100644
--- a/spec/ecall.typ
+++ b/spec/ecall.typ
@@ -12,7 +12,9 @@
 #let config = load_config()
 
 #show: book-page("ecall.typ")
-= About `ECALL`
+
+ECALLs provide system-level functionalities to the guest program.
+
 When `ECALL` is executed, it is assumed that:
 - register `A7` contains the system call number
   #footnote([The RISC-V system call ABI; libriscv.no, #link("https://web.archive.org/web/20260128152107/https://libriscv.no/docs/concepts/syscalls/#the-risc-v-system-call-abi")[[src]]]),
@@ -90,12 +92,11 @@ This is why @commit:c:receive_ecall has multiplicity $-#`first`$.
 
 The `write` operation --- writing to a file descriptor --- has the following signature:
 #footnote([Linux man-page on `write`; man7.org, version 6.16, 2025-10-29. #link("https://man7.org/linux/man-pages/man2/write.2.html")[[src]]])
-#[
-#show raw.where(block: true): it => block(it, fill: luma(230), inset: 1em, width: 100%, radius: 5pt)
+
 ```c
 ssize_t write(size_t count; int fd, const void buf[count], size_t count);
 ```
-]
+
 That is to say,
 - `A0` contains the file descriptor,
 - `A1` contains the address of `buf`'s first byte, 
diff --git a/spec/is_bit.typ b/spec/is_bit.typ
index 33477d377..b09242fe4 100644
--- a/spec/is_bit.typ
+++ b/spec/is_bit.typ
@@ -9,24 +9,9 @@
 
 #let is_bit = raw(chip.name)
 
-#let highlighted_code(code) = {
-  box(
-    inset: (left: 4pt, right: 4pt), 
-    outset: (top: 4pt, bottom: 4pt), 
-    radius: 2pt,
-    fill: luma(230), 
-    raw(code))
-}
-
 #is_bit is a constraint template that is used to assert that a variable lies in the range ${0, 1}$ if some second variable is non-zero.
 Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
 
-= Interface
-The #is_bit constraint template has the following interface:
-#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => IS_BIT<X>"))
-where `cond` is any value described by an expression _of degree at most $1$_.
-Note that #highlighted_code("IS_BIT<X>") can be used to denote the _unconditional_ application of the #is_bit template to `X`.
-
 = Variables
 The #is_bit template operates on two variables: `cond` and `X`:
 #render_chip_column_table(chip, config)
diff --git a/spec/load.typ b/spec/load.typ
index bccb830f8..b12e1c04d 100644
--- a/spec/load.typ
+++ b/spec/load.typ
@@ -13,6 +13,10 @@
 #let chip = load_chip("src/load.toml", config)
 
 #show: book-page(chip.name)
+#let load = raw(chip.name)
+
+The #load chip provides functionality to read values from memory and sign-extend them where appropriate.
+It delegates low-level memory handling to the `MEMW` chip (@memw).
 
 = Columns
 #let nr_variables = total_nr_variables(chip)
diff --git a/spec/lt.typ b/spec/lt.typ
index 3447efd70..8e55b390b 100644
--- a/spec/lt.typ
+++ b/spec/lt.typ
@@ -13,6 +13,9 @@
 #let chip = load_chip("src/lt.toml", config)
 
 #show: book-page(chip.name)
+#let lt = raw(chip.name)
+
+The #lt chip constrains an indicator bit for the less-than relation, signed or unsigned.
 
 = Columns
 #let nr_variables = total_nr_variables(chip)
diff --git a/spec/memory.typ b/spec/memory.typ
index 62059de37..778183dab 100644
--- a/spec/memory.typ
+++ b/spec/memory.typ
@@ -107,7 +107,8 @@ This raises the question of how to represent timestamps and cleanly perform this
 as over a finite field the “less than” relation is ill-defined
 (though it is common and natural to consider it as the less than relation over the natural lift of the field into the integers).
 We choose to represent timestamps as machine words, using the existing `LT` chip (@lt) functionality for comparisons.
-#rj[Properly link/refer to the LT chip]
+The full implementation of the timestamp system can be seen in the `timestamp` column of the `CPU` (@cpu) and `MEMW` chips (@memw).
+The `CPU` merely passes in the current timestamp, while `MEMW` can recall the previously written timestamp and constrain the correct sequencing.
 
 #aside[Note on options and trade-offs for timestamp representation][
  #grid(columns: (1fr, 1fr), gutter: 1em)[#align(center, emph[Machine word])][#align(center, emph[Field element])][
@@ -123,8 +124,6 @@ We choose to represent timestamps as machine words, using the existing `LT` chip
   ]
 ]
 
-#rj[reference to CPU chip/timestamp column and MEMW chip]
-
 = Initialization and Finalization
 
 Because the LogUp argument handling token consumption and emission needs to be fully balanced
@@ -213,11 +212,10 @@ and hence doesn't need a column, nor a range check.
 
 == Register initialization/finalization
 
-#rj[Properly link/reference ECALL/HALT chip]
 The initial and final state of registers can be entirely known by
 the verifier, since the relevant initialization values are either zero,
 or embedded in the ELF, and the final values can be set to a known value
-by the HALT ecall.
+by the `HALT` ecall (@ecall).
 As additionally, the number of registers is small, the verifier can directly
 add the required balancing terms to the LogUp sum.
 
diff --git a/spec/memw.typ b/spec/memw.typ
index 6c12f00a7..4b644218a 100644
--- a/spec/memw.typ
+++ b/spec/memw.typ
@@ -13,6 +13,13 @@
 
 #show: book-page(chip.name)
 
+#let memw = raw(chip.name)
+
+The #memw chip is used to read and write memory locations (both RAM and registers)
+in chunks of 1, 2, 4 or 8 values.
+It introduces the old value and last-accessed timestamps of memory addresses internally,
+in order to satisfy the design of the memory argument (@memory).
+
 = Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
diff --git a/spec/mul.typ b/spec/mul.typ
index a2fb7d1fc..68ab4a2b8 100644
--- a/spec/mul.typ
+++ b/spec/mul.typ
@@ -16,6 +16,9 @@
 
 #let mul = raw(chip.name)
 
+The #mul chip constrains multiplication, both signed and unsigned,
+as well as providing access to the low and high halves of the multiplication result.
+
 = Columns
 #let nr_variables = total_nr_variables(chip)
 #let nr_columns = total_nr_instantiated_columns(chip, config)
@@ -104,4 +107,4 @@ The table can be padded to the next power of two with the following value assign
   As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, 
   where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`;
   the value sent into the lookup could then be assumed range-checked by the other side of the relation.
-  This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
\ No newline at end of file
+  This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
diff --git a/spec/neg.typ b/spec/neg.typ
index ac8554689..d700892cb 100644
--- a/spec/neg.typ
+++ b/spec/neg.typ
@@ -8,22 +8,8 @@
 
 #let neg = raw(chip.name)
 
-#let highlighted_code(code) = {
-  box(
-    inset: (left: 4pt, right: 4pt), 
-    outset: (top: 4pt, bottom: 4pt), 
-    radius: 2pt,
-    fill: luma(230), 
-    raw(code))
-}
-
 #neg is a constraint template that is used to assert that $#`neg` = -#`x`$, under the condition that `cond` is non-zero.
-
-= Notation
-The #neg constraint template has the following interface:
-#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("cond => NEG<neg; x>"))
-where `cond` is a bit value (i.e., lies in ${0, 1}$)  described by an expression _of degree at most $1$_.
-#highlighted_code("NEG<neg; x>") can be used to denote the _unconditional_ application of the #neg template to `x` and `neg` (which is equivalent to $#`cond` = 1$).
+It requires `cond` to be a bit.
 
 = Variables
 #render_chip_column_table(chip, config)
diff --git a/spec/shift.typ b/spec/shift.typ
index a2a3ec968..b705adb32 100644
--- a/spec/shift.typ
+++ b/spec/shift.typ
@@ -16,20 +16,7 @@
 
 #show: book-page(chip.name)
 
-= Interface
-The #shift chip has the following interface:
-#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(240), 
-```
-// param in: the value being shifted
-// param shift: the number of bits to shift `in` by
-// param direction: whether to shift left (0) or right (1) 
-// param signed: whether to interpret `in` as a signed (1) or unsigned (0) integer
-// param word_instr: whether to execute the SLL/SR* (0) or SLLW/SR*W (1) instruction
-// out shifted: the resulting value
-SHIFT[shifted: DWord; in: DWord, shift: Byte, direction: Bit, signed: Bit, word_instr: Bit]
-```
-)
-In other words, the #shift chip is designed to constrain that 
+The #shift chip is designed to constrain that 
 $ 
 #`shifted` := cases(
   #`in` #`<<` #`s` " if" #`direction` = 0,
diff --git a/spec/sign.typ b/spec/sign.typ
index dcc941e47..fc1b8d0a5 100644
--- a/spec/sign.typ
+++ b/spec/sign.typ
@@ -9,20 +9,7 @@
 
 #let sign = raw(chip.name)
 
-#let highlighted_code(code) = {
-  box(
-    inset: (left: 4pt, right: 4pt), 
-    outset: (top: 4pt, bottom: 4pt), 
-    radius: 2pt,
-    fill: luma(230), 
-    raw(code))
-}
-
 #sign is a constraint template that is used to extract a `Half`word's sign.
-
-= Interface
-The #sign constraint template has the following interface:
-#block(radius: 5pt, width: 100%, inset: 1.5em, fill: luma(230), raw("SIGN<sign; X, signed>"))
 It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are $1$, and $0$ otherwise.
 
 = Variables
diff --git a/spec/src/config.toml b/spec/src/config.toml
index fd0885d40..7e6389e3b 100644
--- a/spec/src/config.toml
+++ b/spec/src/config.toml
@@ -110,7 +110,7 @@ desc = """\
 [[variables.types]]
 label = "Timestamp"
 subtypes = ["DWordWL"]
-desc = "A preprocessed column holding timestamps as `DWordWL`. Row `i` of the column contains the value $2^2 dot (i + 1)$. Used in the CPU chip, see there for more details about the magic number."
+desc = "A preprocessed column holding timestamps as `DWordWL`. Row `i` of the column contains the value $2^2 dot (i + 1)$. Used in the CPU chip (@cpu), see there for more details about the magic number."
 preprocessed = true
 
 [[variables.types]]

From 2a259a510db7f63a334bd68bff5ad302d4c157e8 Mon Sep 17 00:00:00 2001
From: Cyprien de Saint Guilhem <c.desaintguilhem@gmail.com>
Date: Mon, 23 Feb 2026 08:06:35 -0800
Subject: [PATCH 073/105] spec: LogUp: Vanilla protocol description (#243)

---------

Co-authored-by: Robin Jadoul <robin.jadoul@3milabs.tech>
---
 spec/book.typ   |   4 +-
 spec/logup.typ  | 146 ++++++++++++++++++++++++++++++++++++++++++++++++
 spec/memory.typ |   3 +-
 3 files changed, 150 insertions(+), 3 deletions(-)
 create mode 100644 spec/logup.typ

diff --git a/spec/book.typ b/spec/book.typ
index 338b2679b..190e63f17 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -7,6 +7,7 @@
   title: "Lambda VM specification",
   authors: ("3MI Labs", "Aligned"),
   summary: (
+    ("logup.typ", [LogUp argument], <logup>),
     ("memory.typ", [Memory argument], <memory>),
     ("variables.typ", [Variables], <vars>),
     ("signatures.typ", [Signatures], <signatures>),
@@ -40,12 +41,13 @@
 }
 
 
-#let todo(background: white, foreground: black, name: none, body) = block(fill: background, outset: 0.5em, radius: 20%, stroke: black)[
+#let todo(background: white, foreground: black, name: none, body) = block(fill: background, outset: 0.4em, radius: 20%, stroke: black)[
   #set text(fill: foreground)
   *TODO #if name != none { [(#name)] }*: #body
 ]
 #let rj = todo.with(background: teal, name: "Robin")
 #let et = todo.with(background: rgb("d4aa3a"), name: "Erik")
+#let cdsg = todo.with(background: olive, name: "Cyprien")
 
 #let aside(title, body) = context figure(
   block(inset: (left: 1em, right: 1em, bottom: 1em), stroke: luma(50%), breakable: false)[
diff --git a/spec/logup.typ b/spec/logup.typ
new file mode 100644
index 000000000..7bb9a085d
--- /dev/null
+++ b/spec/logup.typ
@@ -0,0 +1,146 @@
+#import "/book.typ": book-page, aside, cdsg
+
+#show: book-page("logup")
+#set heading(numbering: "1.")
+#show link: underline
+
+#show "constraint choice": link(<constraint_choices>)[constraint choice]
+
+The _LogUp_ proof system conducts a permutation check based on summing logarithmic derivatives. This check ensures that whatever tuple is sent to be "looked-up" by a _source table_ is indeed received in the expected _destination table_.
+
+= Notation
+
+#let BaseField = math.FF
+#let ExtensionField = math.GG
+
+== VM Notation
+
+=== Preliminary notation
+- $NN$: the set of non-negative natural integers.
+- $BaseField$: the base finite field used by the arithmetisation.
+- $ExtensionField$: a finite extension of $BaseField$ of cryptographic size.
+- $[n]$ for $n in NN$: the set of integers ${0, dots, n - 1}$.
+- $X[i]$ for tuple $X$: the $i$-th element of $X$, starting at $0$.
+
+=== Arithmetisation notation
+
+#let numTables = $sans(t)$
+#let Table = $T$
+#let TableSet = ${Table_i}_(i in [t])$
+#let numColumns = $sans(m)$
+#let numRows = $sans(N)$
+
+- $numTables in NN$: number of tables $Table_i$ in the arithmetisation of the VM.
+- $TableSet$: set of all tables $Table_i$ in the arithmetisation of the VM.
+- $numColumns_i in NN$: number of _columns_ in table $Table_i$ (not the number of variables).
+- $numRows_i in NN$: number of _rows_ in table $Table_i$.
+
+== Interaction Notation
+
+#let Interaction = $I$
+#let id = $sans(id)$
+#let numElements = $ell$
+#let weightFunction = $w$
+#let multiplicity = $mu$
+
+The $j$-th _interaction_ $Interaction_j$ of table $Table_i$ is defined by the following tuple:
+
+#table(
+  columns: (auto, auto),
+  inset: 6pt,
+  align: horizon,
+  stroke: none,
+  table.header([*Symbol*], [*Description*]),
+  table.hline(stroke: 1pt),
+  table.vline(stroke: 1pt, x: 1),
+  [$id_(i,j) in FF$], 
+  [the _type identifier_ of the interaction, usually the identifier of the chip that is constraining the relation expected to hold within the looked-up tuple.],
+  [$numElements_(i,j) in NN$], 
+  [the _length_ of the tuple of elements being looked-up.],
+  [
+    $weightFunction_(i,j) : FF^(numColumns_i) & arrow FF^(numElements_(i,j) + 1) \
+    R & mapsto arrow(t)_(i,j) || mu_(i,j)$
+  ],
+  [the _weight function_ that maps a row $R$ of table $Table_i$ to the looked-up tuple $arrow(t)_(i,j)$ and its multiplicity $mu_(i,j) in BaseField$.],
+)
+
+
+= Vanilla LogUp
+
+== Protocol Description
+
+#let logupChallenge = math.alpha
+#let fingerprintCoeff = math.beta
+
+#set enum(numbering: "1.a.i.1.a.")
+
++ Prover commits to all traces.
+
++ Verifier samples a random _(global) LogUp challenge_ $logupChallenge in ExtensionField$ and a random _fingerprint coefficient_ $fingerprintCoeff in ExtensionField$ and sends them to the Prover.
+
++ Prover commits to (i) interaction contribution, (ii) table running sum columns, and (iii) each table's contribution:
+
+  + For each table $Table_i$, populate the interaction contribution columns and compute the _table (LogUp) contribution_:
+
+    + For each interaction $Interaction_j$ of table $Table_i$, initialize an empty _interaction contribution column_ of length $numRows_i$.
+
+    + Initialise a _table running sum column_ $S_i in ExtensionField^(numRows_i)$ with the first value $S_i [0]$ populated according to the constraint choice.
+
+    + *Constrain* the first row if required by selected constraint choice.
+
+    + For each $j$-th row $R_j in BaseField^(numColumns_i)$ of $Table_i$, for $j in [numRows_i - 1]$:
+      + For each $k$-th interaction $Interaction_k$ of table $Table_i$:
+        + Compute the _interaction contribution numerator_ $ n_(j,k) = mu_(i,k) = w_(i,k)(R_j)[numElements_(i,k)] $
+        + If $n_(j,k) eq.not 0$, compute the _interaction contribution denominator_ $ d_(j,k) = logupChallenge + fingerprintCoeff dot id_(i,k) + sum_(l = 0)^(numElements_(i,k) - 1) fingerprintCoeff^(l + 2) dot weightFunction_(i,k) (R_j)[l]. $
+        + Save the _interaction contribution_ as $n_(j,k)/d_(j,k) in ExtensionField$ in the corresponding interaction contribution column for this interaction.
+        + *Constrain* the interaction contribution column according to the definitions of $n_(j,k)$ and~$d_(j,k)$.
+
+      + Compute the _row contribution_ as the sum $s_(j) = sum_k n_(j,k) / d_(j,k)$ and compute the next row's table running sum value $S_i [j+1] = S_i [j] + s_(j)$.
+
+      + *Constrain* the transition of the running sum column as indicated by the constraint choice.
+
+    + *Constrain* the last row if required by selected constraint choice.
+
+  + Batch-commit to every table's interaction contribution columns and running sum columns with the column commitment scheme and commit to the table's overall contribution $S_i [numRows_i - 1]$ by sending it in the clear to the verifier.
+
++ Verifier checks that the sum of every table's overall contribution is equal to zero: $sum_i S_i [numRows_i - 1] = 0_ExtensionField$, and delegates the checks of the constraints to the STARK.
+
+== Running Sum Constraint Choices <constraint_choices>
+
+#cdsg[Write the constraints in this section more formally after STARK description has been written.]
+
+=== Choice 1: transitions looking back
+
+tl;dr: implicit $0_ExtensionField$ initial value, explicit final value.
+
++ (*Boundary, first row*) Constrain first row of running sum column to equal the sum of the first row of every interaction contribution column. (This is analogous to an implicit $-1$-th row initialised at $0_ExtensionField$.)
++ (*Transition, looking back, applied to rows $1, dots, numRows_i - 1$*) For each row _other than the first_, constrain the _current_ running sum value to equal the sum of every current interaction contribution column added to the _previous_ running sum value.
++ (*Boundary, last row*) Constrain last row of running sum column to equal the claimed table contribution.
+
+Total constraints: 2 boundary + 1 transition over $numRows_i - 1$ rows.
+
+=== Choice 2: transitions looking forward
+
+tl;dr: explicit $0_ExtensionField$ initial value, implicit final value.
+
++ (*Boundary, first row*) Constrain first row of running sum column to equal $0_ExtensionField$.
++ (*Transition, looking forward, applied to rows $0, dots, numRows_i - 2$*) For each row _other than the last_, constrain the _next_ running sum value to equal the sum of every current interaction contribution column added to the _current_ running sum value.
++ (*Boundary, last row*) Constrain last row of running sum column added to sum of last row of every interaction column to equal the claimed table contribution. (That is, the claimed table contribution is implicit in the last row of the table, but not written to last value of running sum column.)
+
+Total constraints: 2 boundary + 1 transition over $numRows_i - 1$ rows.
+
+=== Choice 3: circular transitions looking back/forward
+
++ For each row, constrain the _current/next_ (wrapping to first on last if "next") running sum value to equal the sum of every current interaction contribution value added to the _previous/current_ (wrapping to last on first if "previous") running sum value added to claimed table contribution divided by $numRows_i$.
+
+Total constraints: 1 _circular_ transition over $numRows_i$ rows.
+
+#aside("Justification")[
+  This single circular constraint checks that each row's contribution $s_(i,j)$ is added to the running sum column, either in the current row's cell or in the next row's.
+  In order to avoid boundary constraints, the look-back or peek-forward into the running sum column wraps around the beginning or end of the table.
+
+  This alone implies that the difference between the first and last rows' values will be the table's overall real contribution $sum_j s_(i,j)$, which will be incompatible with the circularity of the constraint.
+  Since boundary constraints are avoided, the way to check that $sum_j s_(i,j)$ equals the claimed contribution $L_i$ is to remove a fraction of $L_i$ at each row in such a way that $L_i$ is removed completely after summing all $numRows_i$ rows; i.e., the constraint subtracts the public term $L_i / numRows_i$ from the running sum at every row.
+
+  If the expected equality $sum_j s_(i,j) = L_i$ holds, then the circularity of the constraint will also hold.
+]
diff --git a/spec/memory.typ b/spec/memory.typ
index 778183dab..183bb95fa 100644
--- a/spec/memory.typ
+++ b/spec/memory.typ
@@ -95,9 +95,8 @@ we can see the necessity for a memory initialization procedure
 ---in addition to having to make sure the initial memory content lines up with what the binary dictates.
 
 So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens),
-this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument:
+this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument (@logup):
 consuming a token corresponds to a "receive" and emitting a new token is a "send".
-#rj[properly link/refer to the logup spec]
 
 = Temporal integrity
 

From a3cbdc91ec3fd31abc5573b15fe7242e94301bbb Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Wed, 25 Feb 2026 11:18:01 -0300
Subject: [PATCH 074/105] update

---
 docs/spec/add.md       |  14 +--
 docs/spec/bitwise.md   |   2 +
 docs/spec/branch.md    |   6 +-
 docs/spec/cpu.md       |  24 ++---
 docs/spec/decode.md    |   2 +-
 docs/spec/dvrm.md      |  40 +++----
 docs/spec/ecall.md     |  56 +++++++++-
 docs/spec/is_bit.md    |   6 --
 docs/spec/load.md      |   2 +
 docs/spec/lt.md        |   8 +-
 docs/spec/memory.md    |  14 +--
 docs/spec/memw.md      |  28 ++---
 docs/spec/mul.md       |   2 +
 docs/spec/neg.md       |   6 +-
 docs/spec/shift.md     |  20 ++--
 docs/spec/sign.md      |   4 -
 docs/spec/spec_full.md | 230 +++++++++++++++++++++++------------------
 17 files changed, 260 insertions(+), 204 deletions(-)

diff --git a/docs/spec/add.md b/docs/spec/add.md
index 051a55a27..f1f2a3191 100644
--- a/docs/spec/add.md
+++ b/docs/spec/add.md
@@ -1,18 +1,8 @@
 # ADD/SUB Template
 
-box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
+For ease of notation, we moreover introduce the  constraint template $
 
-= Notation The  constraint template has the following interface:
-
-where `cond` is any value described by an expression _of degree at most `1`_.
-
-## 
-
-For ease of notation, we moreover introduce the  constraint template. Its interface
-
-maps onto the  template as
-
-It constrains that ``diff` = `lhs` - `rhs` mod 2^64` when the expression `cond` is non-zero. As with ,  can be used to denote the _unconditional_ application of the template.
+$ in both conditional and unconditional versions. It constrains that ``diff` equiv `lhs` - `rhs` (mod 2^64)` when the expression `cond` is non-zero.
 
 = Variables
 
diff --git a/docs/spec/bitwise.md b/docs/spec/bitwise.md
index 50d61d8d4..aede36293 100644
--- a/docs/spec/bitwise.md
+++ b/docs/spec/bitwise.md
@@ -1,5 +1,7 @@
 # BITWISE Chips
 
+The  chips deal with precomputed lookup tables for bitwise boolean operations and convenience functionalities over small domains.
+
 = Columns
 
 The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
diff --git a/docs/spec/branch.md b/docs/spec/branch.md
index 37fa8b918..a7605e329 100644
--- a/docs/spec/branch.md
+++ b/docs/spec/branch.md
@@ -1,5 +1,7 @@
 # BRANCH Chip
 
+The `BRANCH` chip computes the target address of a branching instruction.
+
 = Columns
 
 The `BRANCH` chip is comprised of  variables that are expressed using  columns:
@@ -8,8 +10,6 @@ The `BRANCH` chip is comprised of  variables that are expressed using  columns:
 
 = Constraints
 
-> **Note:** Check correspondence with CPU for passing in `offset` as word or dword
-
 We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
 
 The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
@@ -20,7 +20,7 @@ The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed impli
 | `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
 | `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
 | `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte, 254]` | μ |
-| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
+| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALF[next_pc_high[i]]` | μ |
 
 This chip contributes the following to the lookup argument.
 
diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index 0bd5e0d6d..e0a2af9c2 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -1,5 +1,7 @@
 # CPU Chip
 
+The `CPU` chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations. It bases its decisions on the entry of the `DECODE` table ([decode]) corresponding to the current program counter (PC).
+
 = Columns
 
 The `CPU` chip is comprised of  variables that are expressed using  columns:
@@ -12,8 +14,6 @@ The `CPU` chip is comprised of  variables that are expressed using  columns:
 |-----|-------------|--------------|
 | `CPU-C1` | `DECODE[pc, imm, packed_decode]` | 1 |
 
-> **Note:** All casts for interactions will have to be reviewed once other chip interfaces stabilise
-
 ## Range checks
 
 > **Note:** Make sure we argue for every column here
@@ -73,10 +73,10 @@ The ALU functionality is then obtained through judicious dispatching to the corr
 | `CPU-CA40.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
 | `CPU-CA41.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
 | `CPU-CA42.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
-| `CPU-CA43` |  | `SHIFT[res::DWordHL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
+| `CPU-CA43` |  | `SHIFT[res::DWordWL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
 | `CPU-CA44` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
-| `CPU-CA45` |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
-| `CPU-CA46` |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
+| `CPU-CA45` |  | `MUL[res::DWordWL; arg1::DWordHL, signed, arg2::DWordHL, mp_selector, muldiv_selector]` | MUL |
+| `CPU-CA46` |  | `DVRM[res::DWordWL; arg1::DWordHL, arg2::DWordHL, signed, muldiv_selector]` | DIVREM |
 
 ## Memory
 
@@ -84,16 +84,16 @@ The interactions with the memory, both for register loading and storing, as for
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `CPU-CM47` |  | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
+| `CPU-CM47` |  | `MEMW[['arr', ['idx', ['cast', 'rv1', 'DWordWL'], 0], ['idx', ['cast', 'rv1', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs1, ['arr', ['idx', ['cast', 'rv1', 'DWordWL'], 0], ['idx', ['cast', 'rv1', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
 | `CPU-CM48.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
-| `CPU-CM49` |  | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
+| `CPU-CM49` |  | `MEMW[['arr', ['idx', ['cast', 'rv2', 'DWordWL'], 0], ['idx', ['cast', 'rv2', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs2, ['arr', ['idx', ['cast', 'rv2', 'DWordWL'], 0], ['idx', ['cast', 'rv2', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
 | `CPU-CM50.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
-| `CPU-CM51` |  | `MEMW[1, 2 * rd, rvd, timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
-| `CPU-CM52` |  | `LOAD[rvd; 0, res, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
-| `CPU-CM53` |  | `MEMW[0, res, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
-| `CPU-CM54` |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1::DWordWL, 1, 0, 0]` | 1 - pad |
+| `CPU-CM51` |  | `MEMW[1, 2::DWordWL * rd, ['arr', ['idx', 'rvd', 0], ['idx', 'rvd', 1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
+| `CPU-CM52` |  | `LOAD[rvd; res::DWordWL, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
+| `CPU-CM53` |  | `MEMW[0, res::DWordWL, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
+| `CPU-CM54` |  | `MEMW[['arr', ['idx', 'pc', 0], ['idx', 'pc', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 255)::DWordWL, ['arr', ['idx', 'next_pc', 0], ['idx', 'next_pc', 1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | 1 - pad |
 
 ## System
 
@@ -138,7 +138,7 @@ For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference betw
 | `CPU-CO67` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
 | `CPU-CO68` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
 | | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| `CPU-CO69` | `BRANCH[next_pc; pc, imm[0], arg1::DWordWL, JALR]` | branch_cond |
+| `CPU-CO69` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
 | `CPU-CO70` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
 
 > **Note:** Document the choice to not have a multiplicity column here for padding
diff --git a/docs/spec/decode.md b/docs/spec/decode.md
index 36b942ef4..26ce8b81e 100644
--- a/docs/spec/decode.md
+++ b/docs/spec/decode.md
@@ -26,7 +26,7 @@ The `RV64C` extension for compressed instructions specifies that \~50% of all in
 
 show figure: set block(breakable: true)
 
-figure(table( columns: (auto, auto, 40pt, 40pt, 1fr, 15pt), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => if calc.odd(y) and y <= lines.len() { luma(245) } else { white }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), ), caption: [Decoding table] }
+figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), ), caption: [Decoding table] }
 
 // OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
 
diff --git a/docs/spec/dvrm.md b/docs/spec/dvrm.md
index 1ce0fc969..f6da41e92 100644
--- a/docs/spec/dvrm.md
+++ b/docs/spec/dvrm.md
@@ -1,5 +1,7 @@
 # DVRM Chip
 
+The `DVRM` chip provides division and remainder functionality, both signed and unsigned.
+
 = Columns
 
 The `DVRM` chip is comprised of  variables that are expressed using  columns:
@@ -163,6 +165,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
 | | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
 
+### output
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
+| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+
 ### n_sub_r
 
 | Tag | Range | Description | Multiplicity |
@@ -172,14 +181,6 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
 | `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
 
-### defs
-
-| Tag | Description |
-|-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
-
 ### div_by_zero
 
 | Tag | Range | Description | Multiplicity |
@@ -188,13 +189,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
 | `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
 
-### equality
+### defs
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
-| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
-| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
 
 ### abs_diff
 
@@ -208,9 +209,10 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
 | | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
-### output
+### equality
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
-| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
\ No newline at end of file
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
+| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
+| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
\ No newline at end of file
diff --git a/docs/spec/ecall.md b/docs/spec/ecall.md
index a25eb052e..d9b182039 100644
--- a/docs/spec/ecall.md
+++ b/docs/spec/ecall.md
@@ -1,5 +1,11 @@
 # ECALL Chips
 
+ECALLs provide system-level functionalities to the guest program.
+
+When `ECALL` is executed, it is assumed that: - register `A7` contains the system call number
+
+- the arguments are located in registers `A0`-`A6`, and - the return value is written to `A0`, where `A0`-`A7` are symbolic names for the registers `x10`-`x17`
+
 =  chip
 
 ## Columns
@@ -18,10 +24,54 @@ The  chip: + makes sure register `x10` (containing the exit code) equals `0` ([h
 
 ### Lookup
 
-The HALT chip contributes the following interaction to the lookup-argument:
+In this VM, halting is considered equivalent to executing a `sys_exit`. Hence, this chip responds to `ECALL`s with system call number 93.
 
-*Note*: [`93` is the system call number corresponding to `sys_exit`.]
+The HALT chip therefore contributes the following interaction to the lookup-argument:
 
 ## Padding
 
-This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
\ No newline at end of file
+This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
+
+=  chip
+
+## Columns
+
+The  chip leverages  variables, spanning  columns:
+
+## Constraints
+
+In this VM, committing is considered equivalent to writing a value to `stdout`. Hence, this chip responds to `ECALL`s with system call number 64.
+
+Since we do not know how many bytes are to be committed, this chip employs a recursive design: each iteration commits one byte, and recursively "calls" itself to commit the remaining bytes. As such, only the call from the CPU to this chip (i.e., the `first` in the recursion tree) should accept the `ECALL`; later recursive calls should not. This is why [commit:c:receive_ecall] has multiplicity `-`first``.
+
+The `write` operation --- writing to a file descriptor --- has the following signature:
+
+```c ssize_t write(size_t count; int fd, const void buf[count], size_t count); ```
+
+That is to say, - `A0` contains the file descriptor, - `A1` contains the address of `buf`'s first byte, - `A2` contains `count`, and - the written count should be written to `A0`.
+
+[commit:c:read_address] reads `address` from `x11` (=`A1`) and [commit:c:read_count] reads `count` from `x12` (=`A2`). Since we only support writing to `stdout` (which corresponds to ``fd` = 1`),
+
+we assert that `x10` contains `1` in [commit:c:read_fd_write_count]. Note that this constraint _also_ writes `count` to `A0`; in this VM it is impossible for a commit to be interrupted or fail. Lastly, the `index` is read from `x254`; in the same operation, ``index` + `count`` is written back to this location by [commit:c:read_index]. This, too, leverages the fact that a commit will not be interrupted or fail to update the `index` for the next commit sequence. Again, each of these memory interactions only takes place when this is the `first` call in the recursion tree.
+
+*Note*: the observant reader will notice that [commit:c:read_index] casts `count` to a `BaseField`, potentially losing information. This is indeed correct. However, since it is practically impossible to commit more than `2^64-2^32` bytes in a single VM execution, it was decided to permit this.
+
+Next, we read the `value` located at buffer address `address` and commit to it under the given `index`. This is only performed when we have not yet reached the `end` of the commit sequence.
+
+In parallel, we compute ``address_incr` = `address` + 1` ([commit:c:address_incr]) as the address of the next byte to commit, and ``count_decr` = `count` - 1` ([commit:c:count_decr]) as the number of bytes that still have to be committed after committing this byte. [commit:c:range_address_incr] and [commit:c:range_count_decr] are included to satisfy [add:a:sum] and [add:a:rhs], respectively.
+
+When `count` hits `0`, we should stop performing further recursive calls. We use the `end` bit to indicate these circumstances.
+
+*Note*: + Rather than setting ``end` = 1` when ``count` = 0`, we do so when ``count_decr` = -1`. This technique allows `count` to be stored in a `DWordWL` rather than a `DWordHL`, saving two columns. + `forall i in [0, 3]: 65535 - `count_decr`_i >= 0` as a result of [commit:c:range_count_decr]. Hence, $ sum_(i=0)^3 65535 - `count_decr`_i = 0 arrow.l.r.double.long forall i in [0, 3]: `count_decr`_i = 65535 $
+
+When this was not the `end` byte to commit in this recursion sequence, we recursively _Commit the Next Byte_ (`CNB`), specifying the timestamp, address to continue reading and the number of bytes that should still be committed ([commit:c:send_commit_next_byte]). Since that certainly won't be the `first` call in the sequence, we read `address_incr` and `count_decr` from the previous recursion level into `address` and `count` and continue executing the commit.
+
+Lastly, we must make sure `first`, `end` and `μ` are bits ([commit:c:range_first], [commit:c:range_end], [commit:c:range_mu]), and that either ``first` = 1` or ``end` = 1` implies ``μ` = 1` ([commit:c:first_or_end_implies_mu]). These are required to ensure the multiplicities `-(`μ` - `first`)` and ``μ` - `end`` are binary.
+
+## Padding
+
+To pad this chip, use the below data.
+
+## Notes/optimizations
+
+- The current version only supports writing to `stdout`. This chip could potentially be extended to support writing to arbitrary `fd`s - One might be able to replace [commit:c:end] by `end => count = 0`. While loosening the constraint (`count = 0 => end` is no longer enforced), this should not cause any problems: if the prover does not set `end` when `count=0`, they simply cannot complete the proof. First of all, one would have to recursively work through all `2^64` values of `count`, something that is practically infeasible. Moreover, if this is done with a sequence that originally has ``count` > 0`, one will inevitably have to read a memory address twice at the same timestamp, which is impossible to prove. In addition to dropping the `ZERO` lookup, this optimization might also permit moving `count_decr` from a `DWordHL` to a `DWordWL`, saving two columns. - Given that it is practically infeasible to commit more than ``p`-1 = 2^64-2^32` bytes in a program, it might suffice to store `count_decr` in a `BaseField`. Note that this would probably involve having an extra (virtual) column storing `count` in `BaseField` form as well. Moreover, one might need to add a lookup to `LT` to ensure ``count` <= `p`-1` when being read from memory at the beginning of each commitment sequence.
\ No newline at end of file
diff --git a/docs/spec/is_bit.md b/docs/spec/is_bit.md
index b04830776..389867944 100644
--- a/docs/spec/is_bit.md
+++ b/docs/spec/is_bit.md
@@ -1,13 +1,7 @@
 # IS_BIT Template
 
-box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
-
 Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
 
-= Interface The  constraint template has the following interface:
-
-where `cond` is any value described by an expression _of degree at most `1`_. Note that  can be used to denote the _unconditional_ application of the  template to `X`.
-
 = Variables The  template operates on two variables: `cond` and `X`:
 
 = Constraints It takes only one constraint to enforce that `X` must be either `0` or `1` whenever ``cond` eq.not 0`:
diff --git a/docs/spec/load.md b/docs/spec/load.md
index 9c9df5644..8f75cd39b 100644
--- a/docs/spec/load.md
+++ b/docs/spec/load.md
@@ -1,5 +1,7 @@
 # LOAD Chip
 
+The `LOAD` chip provides functionality to read values from memory and sign-extend them where appropriate. It delegates low-level memory handling to the `MEMW` chip ([memw]).
+
 = Columns
 
 The `LOAD` chip is comprised of  variables that are expressed using  columns:
diff --git a/docs/spec/lt.md b/docs/spec/lt.md
index 2c77368d2..1eba8c181 100644
--- a/docs/spec/lt.md
+++ b/docs/spec/lt.md
@@ -1,5 +1,7 @@
 # LT Chip
 
+The `LT` chip constrains an indicator bit for the less-than relation, signed or unsigned.
+
 = Columns
 
 The `LT` chip is comprised of  variables that are expressed using  columns:
@@ -24,15 +26,15 @@ The polynomial `P` can be simplified to a total degree of two. We claim that the
 | `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
 | `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
 | | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
-| `LT-C4` | `IS_HALFWORD[lhs[1]]` | μ |
-| `LT-C5` | `IS_HALFWORD[rhs[1]]` | μ |
+| `LT-C4` | `IS_HALF[lhs[1]]` | μ |
+| `LT-C5` | `IS_HALF[rhs[1]]` | μ |
 
 And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
 | `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
-| `LT-C7.i` | i ∈ [0, 3] | `IS_HALFWORD[lhs_sub_rhs[i]]` | μ |
+| `LT-C7.i` | i ∈ [0, 3] | `IS_HALF[lhs_sub_rhs[i]]` | μ |
 
 The chip contributes the following to the lookup argument.
 
diff --git a/docs/spec/memory.md b/docs/spec/memory.md
index be95ed1e6..a38debcfa 100644
--- a/docs/spec/memory.md
+++ b/docs/spec/memory.md
@@ -30,20 +30,14 @@ Each memory operation will then do two things:
 
 Naturally, for a read operation, the _values_ embedded in the consumed and emitted tokens must be identical. From the need to consume a token even on the first memory access, we can see the necessity for a memory initialization procedure ---in addition to having to make sure the initial memory content lines up with what the binary dictates.
 
-> **Note:** properly link/refer to the logup spec
-
-So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens), this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument: consuming a token corresponds to a "receive" and emitting a new token is a "send".
+So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens), this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument ([logup]): consuming a token corresponds to a "receive" and emitting a new token is a "send".
 
 = Temporal integrity
 
-> **Note:** Properly link/refer to the LT chip
-
-To ensure temporal integrity, every memory operation needs to be constrained for the newly emitted token to have a strictly greater timestamp than the consumed token. This raises the question of how to represent timestamps and cleanly perform this check, as over a finite field the “less than” relation is ill-defined (though it is common and natural to consider it as the less than relation over the natural lift of the field into the integers). We choose to represent timestamps as machine words, using the existing `LT` chip ([lt]) functionality for comparisons.
+To ensure temporal integrity, every memory operation needs to be constrained for the newly emitted token to have a strictly greater timestamp than the consumed token. This raises the question of how to represent timestamps and cleanly perform this check, as over a finite field the “less than” relation is ill-defined (though it is common and natural to consider it as the less than relation over the natural lift of the field into the integers). We choose to represent timestamps as machine words, using the existing `LT` chip ([lt]) functionality for comparisons. The full implementation of the timestamp system can be seen in the `timestamp` column of the `CPU` ([cpu]) and `MEMW` chips ([memw]). The `CPU` merely passes in the current timestamp, while `MEMW` can recall the previously written timestamp and constrain the correct sequencing.
 
 - Clean definition of “less-than”, using the already existing `LT` functionality in the ALU - Harder to perform increments, needing extra constraints beyond field arithmetic - But this can be alleviated by providing a precomputed column that has a fixed increment per CPU row ][ - Comparison is more annoying, but can work by: - Decomposition into a machine word and chip interaction with the LT chip - Bit decomposition and comparison constraints - Range-checking the difference to be sufficiently small w.r.t. the field characteristic. - Increments and basic arithmetic operations are cheap ] ]
 
-> **Note:** reference to CPU chip/timestamp column and MEMW chip
-
 = Initialization and Finalization
 
 Because the LogUp argument handling token consumption and emission needs to be fully balanced --- every token emitted should be consumed, and vice versa --- we need to have a system to emit the initial tokens and consume the final tokens. This needs to ensure that every address has at most a single initializing emission, and at most one finalizing consumption. Having at most one initialization will, through the correctness of the lookup argument, immediately lead to having at most one correct finalization, and vice versa.
@@ -76,9 +70,7 @@ One or more STARK tables (depending on the amount of memory used) consisting of
 
 ## Register initialization/finalization
 
-> **Note:** Properly link/reference ECALL/HALT chip
-
-The initial and final state of registers can be entirely known by the verifier, since the relevant initialization values are either zero, or embedded in the ELF, and the final values can be set to a known value by the HALT ecall. As additionally, the number of registers is small, the verifier can directly add the required balancing terms to the LogUp sum.
+The initial and final state of registers can be entirely known by the verifier, since the relevant initialization values are either zero, or embedded in the ELF, and the final values can be set to a known value by the `HALT` ecall ([ecall]). Since, additionally, the number of registers is small, the verifier can directly add the required balancing terms to the LogUp sum.
 
 = Notes and considerations
 
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
index b6223086d..c78f56a32 100644
--- a/docs/spec/memw.md
+++ b/docs/spec/memw.md
@@ -1,5 +1,7 @@
 # MEMW Chip
 
+The `MEMW` chip is used to read and write memory locations (both RAM and registers) in chunks of 1, 2, 4 or 8 values. It introduces the old value and last-accessed timestamps of memory addresses internally, in order to satisfy the design of the memory argument ([memory]).
+
 = Columns
 
 The `MEMW` chip is comprised of  variables that are expressed using  columns:
@@ -15,12 +17,12 @@ Our assumptions do not explicitly cover any range checks for the `is_register` a
 | `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
 | `MEMW-C2` |  | `w2` => `μ_sum` |  |
 | | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C3` |  | w2 ⇒ `ADD<address_add[0]::DWordWL; base_address, 1>` |  |
-| `MEMW-C4.i` | i ∈ [1, 2] | w4 ⇒ `ADD<address_add[i]::DWordWL; base_address, i + 1>` |  |
-| `MEMW-C5.i` | i ∈ [3, 6] | write8 ⇒ `ADD<address_add[i]::DWordWL; base_address, i + 1>` |  |
-| `MEMW-C6.i` | i ∈ [0, 0], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | w2 |
-| `MEMW-C7.i` | i ∈ [1, 2], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | w4 |
-| `MEMW-C8.i` | i ∈ [3, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | write8 |
+| `MEMW-C3` |  | w2 ⇒ `ADD<address_add[0]::DWordWL; base_address, 1::DWordWL>` |  |
+| `MEMW-C4.i` | i ∈ [1, 2] | w4 ⇒ `ADD<address_add[i]::DWordWL; base_address, (i + 1)::DWordWL>` |  |
+| `MEMW-C5.i` | i ∈ [3, 6] | write8 ⇒ `ADD<address_add[i]::DWordWL; base_address, (i + 1)::DWordWL>` |  |
+| `MEMW-C6.i` | i ∈ [0, 0], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | w2 |
+| `MEMW-C7.i` | i ∈ [1, 2], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | w4 |
+| `MEMW-C8.i` | i ∈ [3, 6], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | write8 |
 | `MEMW-C9` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
 | `MEMW-C10` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
 | `MEMW-C11.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
@@ -42,12 +44,12 @@ The chip adds the following tuples to the lookup argument, to effectuate that pa
 |-----|-------|-------------|--------------|
 | `MEMW-CM16` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
 | `MEMW-CM17` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM18` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM19` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `MEMW-CM20.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM21.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `MEMW-CM22.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM23.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+| `MEMW-CM18` |  | `memory[is_register, address_add[0]::DWordWL, old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM19` |  | `memory[is_register, address_add[0]::DWordWL, timestamp, value[1]]` | -w2 |
+| `MEMW-CM20.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1]::DWordWL, old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM21.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1]::DWordWL, timestamp, value[i]]` | -w4 |
+| `MEMW-CM22.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1]::DWordWL, old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM23.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1]::DWordWL, timestamp, value[i]]` | -write8 |
 
 This chip contributes the following to the lookup argument.
 
@@ -58,7 +60,7 @@ This chip contributes the following to the lookup argument.
 
 = Future optimization ideas
 
-- Fast path for aligned memory access where all bytes have the same old timestamp - MEMB chip that deals does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Compute `base_address[1] + 1` once and have high words of `address_add` as Words - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALFWORD` lookups may make some GKR things faster if there are known zeroes.
+- Fast path for aligned memory access where all bytes have the same old timestamp - MEMB chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Compute `base_address[1] + 1` once and have high words of `address_add` as Words - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
 
 ## Columns
 
diff --git a/docs/spec/mul.md b/docs/spec/mul.md
index 894f2ee03..30f9a57a5 100644
--- a/docs/spec/mul.md
+++ b/docs/spec/mul.md
@@ -1,5 +1,7 @@
 # MUL Chip
 
+The `MUL` chip constrains multiplication, both signed and unsigned, and provides access to the low and high halves of the multiplication result.
+
 = Columns
 
 The `MUL` chip is comprised of  variables that are expressed using  columns:
diff --git a/docs/spec/neg.md b/docs/spec/neg.md
index 8535c1066..fd638d975 100644
--- a/docs/spec/neg.md
+++ b/docs/spec/neg.md
@@ -1,10 +1,6 @@
 # NEG Template
 
-box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
-
-= Notation The  constraint template has the following interface:
-
-where `cond` is a bit value (i.e., lies in `{0, 1}`)  described by an expression _of degree at most `1`_.
+It requires `cond` to be a bit.
 
 = Variables
 
diff --git a/docs/spec/shift.md b/docs/spec/shift.md
index a3130f953..9c3a8dd39 100644
--- a/docs/spec/shift.md
+++ b/docs/spec/shift.md
@@ -1,8 +1,6 @@
 # SHIFT Chip
 
-= Interface The  chip has the following interface:
-
-``` // param in: the value being shifted // param shift: the number of bits to shift `in` by // param direction: whether to shift left (0) or right (1) // param signed: whether to interpret `in` as a signed (1) or unsigned (0) integer // param word_instr: whether to execute the SLL/SR* (0) or SLLW/SR*W (1) instruction // out shifted: the resulting value SHIFT[shifted: DWord; in: DWord, shift: Byte, direction: Bit, signed: Bit, word_instr: Bit] ``` In other words, the  chip is designed to constrain that $
+The `SHIFT` chip is designed to constrain that $
 
 $ $
 
@@ -176,7 +174,7 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALFWORD[in[i]]` |
+| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALF[in[i]]` |
 | `SHIFT-A2` |  | `IS_BYTE[shift]` |
 | `SHIFT-A3` |  | `IS_BIT<direction>` |
 | `SHIFT-A4` |  | `IS_BIT<signed>` |
@@ -184,15 +182,15 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 
 ## Constraints
 
+### is_negative
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
+
 ### left_flag
 
 | Tag | Description |
 |-----|-------------|
 | `SHIFT-C1` | `direction` => `μ` = 1 |
-| | _polynomial:_ `direction * (1 - μ) = 0` |
-
-### is_negative
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
\ No newline at end of file
+| | _polynomial:_ `direction * (1 - μ) = 0` |
\ No newline at end of file
diff --git a/docs/spec/sign.md b/docs/spec/sign.md
index a656215bf..df9cac450 100644
--- a/docs/spec/sign.md
+++ b/docs/spec/sign.md
@@ -1,9 +1,5 @@
 # SIGN Template
 
-box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
-
-= Interface The  constraint template has the following interface:
-
 It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are `1`, and `0` otherwise.
 
 = Variables The  template operates on three variables:
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index 42836b3fa..8536bc7db 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -32,20 +32,14 @@ Each memory operation will then do two things:
 
 Naturally, for a read operation, the _values_ embedded in the consumed and emitted tokens must be identical. From the need to consume a token even on the first memory access, we can see the necessity for a memory initialization procedure ---in addition to having to make sure the initial memory content lines up with what the binary dictates.
 
-> **Note:** properly link/refer to the logup spec
-
-So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens), this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument: consuming a token corresponds to a "receive" and emitting a new token is a "send".
+So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens), this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument ([logup]): consuming a token corresponds to a "receive" and emitting a new token is a "send".
 
 = Temporal integrity
 
-> **Note:** Properly link/refer to the LT chip
-
-To ensure temporal integrity, every memory operation needs to be constrained for the newly emitted token to have a strictly greater timestamp than the consumed token. This raises the question of how to represent timestamps and cleanly perform this check, as over a finite field the “less than” relation is ill-defined (though it is common and natural to consider it as the less than relation over the natural lift of the field into the integers). We choose to represent timestamps as machine words, using the existing `LT` chip ([lt]) functionality for comparisons.
+To ensure temporal integrity, every memory operation needs to be constrained for the newly emitted token to have a strictly greater timestamp than the consumed token. This raises the question of how to represent timestamps and cleanly perform this check, as over a finite field the “less than” relation is ill-defined (though it is common and natural to consider it as the less than relation over the natural lift of the field into the integers). We choose to represent timestamps as machine words, using the existing `LT` chip ([lt]) functionality for comparisons. The full implementation of the timestamp system can be seen in the `timestamp` column of the `CPU` ([cpu]) and `MEMW` chips ([memw]). The `CPU` merely passes in the current timestamp, while `MEMW` can recall the previously written timestamp and constrain the correct sequencing.
 
 - Clean definition of “less-than”, using the already existing `LT` functionality in the ALU - Harder to perform increments, needing extra constraints beyond field arithmetic - But this can be alleviated by providing a precomputed column that has a fixed increment per CPU row ][ - Comparison is more annoying, but can work by: - Decomposition into a machine word and chip interaction with the LT chip - Bit decomposition and comparison constraints - Range-checking the difference to be sufficiently small w.r.t. the field characteristic. - Increments and basic arithmetic operations are cheap ] ]
 
-> **Note:** reference to CPU chip/timestamp column and MEMW chip
-
 = Initialization and Finalization
 
 Because the LogUp argument handling token consumption and emission needs to be fully balanced --- every token emitted should be consumed, and vice versa --- we need to have a system to emit the initial tokens and consume the final tokens. This needs to ensure that every address has at most a single initializing emission, and at most one finalizing consumption. Having at most one initialization will, through the correctness of the lookup argument, immediately lead to having at most one correct finalization, and vice versa.
@@ -78,9 +72,7 @@ One or more STARK tables (depending on the amount of memory used) consisting of
 
 ## Register initialization/finalization
 
-> **Note:** Properly link/reference ECALL/HALT chip
-
-The initial and final state of registers can be entirely known by the verifier, since the relevant initialization values are either zero, or embedded in the ELF, and the final values can be set to a known value by the HALT ecall. As additionally, the number of registers is small, the verifier can directly add the required balancing terms to the LogUp sum.
+The initial and final state of registers can be entirely known by the verifier, since the relevant initialization values are either zero, or embedded in the ELF, and the final values can be set to a known value by the `HALT` ecall ([ecall]). Additionally, since the number of registers is small, the verifier can directly add the required balancing terms to the LogUp sum.
 
 = Notes and considerations
 
@@ -128,14 +120,8 @@ table( columns: 1fr, inset: 7pt, align: (top+left, center), stroke: none, table.
 
 # IS_BIT Template
 
-box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
-
 Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
 
-= Interface The  constraint template has the following interface:
-
-where `cond` is any value described by an expression _of degree at most `1`_. Note that  can be used to denote the _unconditional_ application of the  template to `X`.
-
 = Variables The  template operates on two variables: `cond` and `X`:
 
 = Constraints It takes only one constraint to enforce that `X` must be either `0` or `1` whenever ``cond` eq.not 0`:
@@ -171,10 +157,6 @@ where `cond` is any value described by an expression _of degree at most `1`_. No
 
 # SIGN Template
 
-box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
-
-= Interface The  constraint template has the following interface:
-
 It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are `1`, and `0` otherwise.
 
 = Variables The  template operates on three variables:
@@ -219,19 +201,9 @@ It constrains that `sign` is set to `1` when both `X`'s most significant bit and
 
 # ADD/SUB Template
 
-box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
-
-= Notation The  constraint template has the following interface:
-
-where `cond` is any value described by an expression _of degree at most `1`_.
-
-## 
-
-For ease of notation, we moreover introduce the  constraint template. Its interface
-
-maps onto the  template as
+For ease of notation, we moreover introduce the `SUB` constraint template $
 
-It constrains that ``diff` = `lhs` - `rhs` mod 2^64` when the expression `cond` is non-zero. As with ,  can be used to denote the _unconditional_ application of the template.
+$ in both conditional and unconditional versions. It constrains that `diff ≡ lhs - rhs (mod 2^64)` when the expression `cond` is non-zero.
 
 = Variables
 
@@ -292,11 +264,7 @@ carry (when iter=1) := 2^-32 * (lhs[1] + rhs[1] + carry[0] - sum[1])
 
 # NEG Template
 
-box( inset: (left: 4pt, right: 4pt), outset: (top: 4pt, bottom: 4pt), radius: 2pt, fill: luma(230), raw(code)) }
-
-= Notation The  constraint template has the following interface:
-
-where `cond` is a bit value (i.e., lies in `{0, 1}`)  described by an expression _of degree at most `1`_.
+It requires `cond` to be a bit.
 
 = Variables
 
@@ -394,7 +362,7 @@ The `RV64C` extension for compressed instructions specifies that \~50% of all in
 
 show figure: set block(breakable: true)
 
-figure(table( columns: (auto, auto, 40pt, 40pt, 1fr, 15pt), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => if calc.odd(y) and y <= lines.len() { luma(245) } else { white }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), ), caption: [Decoding table] }
+figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), ), caption: [Decoding table] }
 
 // OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
 
@@ -432,6 +400,8 @@ This entry is used to pad the `CPU` table. More details on this matter are provi
 
 # CPU Chip
 
+The `CPU` chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations. It bases its decisions on the entry of the `DECODE` table ([decode]) corresponding to the current program counter (PC).
+
 = Columns
 
 The `CPU` chip is comprised of  variables that are expressed using  columns:
@@ -444,8 +414,6 @@ The `CPU` chip is comprised of  variables that are expressed using  columns:
 |-----|-------------|--------------|
 | `CPU-C1` | `DECODE[pc, imm, packed_decode]` | 1 |
 
-> **Note:** All casts for interactions will have to be reviewed once other chip interfaces stabilise
-
 ## Range checks
 
 > **Note:** Make sure we argue for every column here
@@ -505,10 +473,10 @@ The ALU functionality is then obtained through judicious dispatching to the corr
 | `CPU-CA40.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
 | `CPU-CA41.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
 | `CPU-CA42.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
-| `CPU-CA43` |  | `SHIFT[res::DWordHL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
+| `CPU-CA43` |  | `SHIFT[res::DWordWL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
 | `CPU-CA44` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
-| `CPU-CA45` |  | `MUL[res; arg1, signed, arg2, mp_selector, muldiv_selector]` | MUL |
-| `CPU-CA46` |  | `DVRM[res; arg1, arg2, signed, muldiv_selector]` | DIVREM |
+| `CPU-CA45` |  | `MUL[res::DWordWL; arg1::DWordHL, signed, arg2::DWordHL, mp_selector, muldiv_selector]` | MUL |
+| `CPU-CA46` |  | `DVRM[res::DWordWL; arg1::DWordHL, arg2::DWordHL, signed, muldiv_selector]` | DIVREM |
 
 ## Memory
 
@@ -516,16 +484,16 @@ The interactions with the memory, both for register loading and storing, as for
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `CPU-CM47` |  | `MEMW[rv1; 1, 2 * rs1, rv1, timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
+| `CPU-CM47` |  | `MEMW[[(rv1::DWordWL)[0], (rv1::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs1, [(rv1::DWordWL)[0], (rv1::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
 | `CPU-CM48.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
-| `CPU-CM49` |  | `MEMW[rv2; 1, 2 * rs2, rv2, timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
+| `CPU-CM49` |  | `MEMW[[(rv2::DWordWL)[0], (rv2::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs2, [(rv2::DWordWL)[0], (rv2::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
 | `CPU-CM50.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
-| `CPU-CM51` |  | `MEMW[1, 2 * rd, rvd, timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
-| `CPU-CM52` |  | `LOAD[rvd; 0, res, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
-| `CPU-CM53` |  | `MEMW[0, res, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
-| `CPU-CM54` |  | `MEMW[pc; 1, 2 * 255, next_pc, timestamp + 1::DWordWL, 1, 0, 0]` | 1 - pad |
+| `CPU-CM51` |  | `MEMW[1, 2::DWordWL * rd, [rvd[0], rvd[1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
+| `CPU-CM52` |  | `LOAD[rvd; res::DWordWL, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
+| `CPU-CM53` |  | `MEMW[0, res::DWordWL, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
+| `CPU-CM54` |  | `MEMW[[pc[0], pc[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 255)::DWordWL, [next_pc[0], next_pc[1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | 1 - pad |
 
 ## System
 
@@ -570,7 +538,7 @@ For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference betw
 | `CPU-CO67` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
 | `CPU-CO68` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
 | | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| `CPU-CO69` | `BRANCH[next_pc; pc, imm[0], arg1::DWordWL, JALR]` | branch_cond |
+| `CPU-CO69` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
 | `CPU-CO70` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
 
 > **Note:** Document the choice to not have a multiplicity column here for padding
@@ -671,9 +639,7 @@ pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD -
 
 # SHIFT Chip
 
-= Interface The  chip has the following interface:
-
-``` // param in: the value being shifted // param shift: the number of bits to shift `in` by // param direction: whether to shift left (0) or right (1) // param signed: whether to interpret `in` as a signed (1) or unsigned (0) integer // param word_instr: whether to execute the SLL/SR* (0) or SLLW/SR*W (1) instruction // out shifted: the resulting value SHIFT[shifted: DWord; in: DWord, shift: Byte, direction: Bit, signed: Bit, word_instr: Bit] ``` In other words, the  chip is designed to constrain that $
+The `SHIFT` chip is designed to constrain that $
 
 $ $
 
@@ -847,7 +813,7 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALFWORD[in[i]]` |
+| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALF[in[i]]` |
 | `SHIFT-A2` |  | `IS_BYTE[shift]` |
 | `SHIFT-A3` |  | `IS_BIT<direction>` |
 | `SHIFT-A4` |  | `IS_BIT<signed>` |
@@ -855,6 +821,12 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 
 ## Constraints
 
+### is_negative
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
+
 ### left_flag
 
 | Tag | Description |
@@ -862,16 +834,12 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 | `SHIFT-C1` | `direction` => `μ` = 1 |
 | | _polynomial:_ `direction * (1 - μ) = 0` |
 
-### is_negative
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
-
 ---
 
 # BRANCH Chip
 
+The `BRANCH` chip computes the target address of a branching instruction.
+
 = Columns
 
 The `BRANCH` chip is comprised of  variables that are expressed using  columns:
@@ -880,8 +848,6 @@ The `BRANCH` chip is comprised of  variables that are expressed using  columns:
 
 = Constraints
 
-> **Note:** Check correspondence with CPU for passing in `offset` as word or dword
-
 We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
 
 The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
@@ -892,7 +858,7 @@ The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed impli
 | `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
 | `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
 | `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte, 254]` | μ |
-| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALFWORD[next_pc_high[i]]` | μ |
+| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALF[next_pc_high[i]]` | μ |
 
 This chip contributes the following to the lookup argument.
 
@@ -966,6 +932,8 @@ next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 
 # MEMW Chip
 
+The `MEMW` chip is used to read and write memory locations (both RAM and registers) in chunks of 1, 2, 4 or 8 values. It introduces the old value and last-accessed timestamps of memory addresses internally, in order to satisfy the design of the memory argument ([memory]).
+
 = Columns
 
 The `MEMW` chip is comprised of  variables that are expressed using  columns:
@@ -981,12 +949,12 @@ Our assumptions do not explicitly cover any range checks for the `is_register` a
 | `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
 | `MEMW-C2` |  | `w2` => `μ_sum` |  |
 | | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C3` |  | w2 ⇒ `ADD<address_add[0]::DWordWL; base_address, 1>` |  |
-| `MEMW-C4.i` | i ∈ [1, 2] | w4 ⇒ `ADD<address_add[i]::DWordWL; base_address, i + 1>` |  |
-| `MEMW-C5.i` | i ∈ [3, 6] | write8 ⇒ `ADD<address_add[i]::DWordWL; base_address, i + 1>` |  |
-| `MEMW-C6.i` | i ∈ [0, 0], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | w2 |
-| `MEMW-C7.i` | i ∈ [1, 2], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | w4 |
-| `MEMW-C8.i` | i ∈ [3, 6], j ∈ [0, 3] | `IS_HALFWORD[address_add[i][j]]` | write8 |
+| `MEMW-C3` |  | w2 ⇒ `ADD<address_add[0]::DWordWL; base_address, 1::DWordWL>` |  |
+| `MEMW-C4.i` | i ∈ [1, 2] | w4 ⇒ `ADD<address_add[i]::DWordWL; base_address, (i + 1)::DWordWL>` |  |
+| `MEMW-C5.i` | i ∈ [3, 6] | write8 ⇒ `ADD<address_add[i]::DWordWL; base_address, (i + 1)::DWordWL>` |  |
+| `MEMW-C6.i` | i ∈ [0, 0], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | w2 |
+| `MEMW-C7.i` | i ∈ [1, 2], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | w4 |
+| `MEMW-C8.i` | i ∈ [3, 6], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | write8 |
 | `MEMW-C9` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
 | `MEMW-C10` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
 | `MEMW-C11.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
@@ -1008,12 +976,12 @@ The chip adds the following tuples to the lookup argument, to effectuate that pa
 |-----|-------|-------------|--------------|
 | `MEMW-CM16` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
 | `MEMW-CM17` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM18` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM19` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `MEMW-CM20.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM21.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `MEMW-CM22.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM23.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+| `MEMW-CM18` |  | `memory[is_register, address_add[0]::DWordWL, old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM19` |  | `memory[is_register, address_add[0]::DWordWL, timestamp, value[1]]` | -w2 |
+| `MEMW-CM20.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1]::DWordWL, old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM21.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1]::DWordWL, timestamp, value[i]]` | -w4 |
+| `MEMW-CM22.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1]::DWordWL, old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM23.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1]::DWordWL, timestamp, value[i]]` | -write8 |
 
 This chip contributes the following to the lookup argument.
 
@@ -1024,7 +992,7 @@ This chip contributes the following to the lookup argument.
 
 = Future optimization ideas
 
-- Fast path for aligned memory access where all bytes have the same old timestamp - MEMB chip that deals does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Compute `base_address[1] + 1` once and have high words of `address_add` as Words - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALFWORD` lookups may make some GKR things faster if there are known zeroes.
+- Fast path for aligned memory access where all bytes have the same old timestamp - MEMB chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Compute `base_address[1] + 1` once and have high words of `address_add` as Words - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
 
 ## Columns
 
@@ -1098,6 +1066,8 @@ w4 := write4 + write8
 
 # LT Chip
 
+The `LT` chip constrains an indicator bit for the less-than relation, signed or unsigned.
+
 = Columns
 
 The `LT` chip is comprised of  variables that are expressed using  columns:
@@ -1122,15 +1092,15 @@ The polynomial `P` can be simplified to a total degree of two. We claim that the
 | `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
 | `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
 | | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
-| `LT-C4` | `IS_HALFWORD[lhs[1]]` | μ |
-| `LT-C5` | `IS_HALFWORD[rhs[1]]` | μ |
+| `LT-C4` | `IS_HALF[lhs[1]]` | μ |
+| `LT-C5` | `IS_HALF[rhs[1]]` | μ |
 
 And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
 | `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
-| `LT-C7.i` | i ∈ [0, 3] | `IS_HALFWORD[lhs_sub_rhs[i]]` | μ |
+| `LT-C7.i` | i ∈ [0, 3] | `IS_HALF[lhs_sub_rhs[i]]` | μ |
 
 The chip contributes the following to the lookup argument.
 
@@ -1202,6 +1172,8 @@ unsigned_lt := carry[1]
 
 # MUL Chip
 
+The `MUL` chip constrains multiplication, both signed and unsigned, as well as providing access to the low and high halves of the multiplication result.
+
 = Columns
 
 The `MUL` chip is comprised of  variables that are expressed using  columns:
@@ -1345,6 +1317,8 @@ carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
 
 # DVRM Chip
 
+The `DVRM` chip provides division and remainder functionality, both signed and unsigned.
+
 = Columns
 
 The `DVRM` chip is comprised of  variables that are expressed using  columns:
@@ -1508,6 +1482,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
 | | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
 
+### output
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
+| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+
 ### n_sub_r
 
 | Tag | Range | Description | Multiplicity |
@@ -1517,14 +1498,6 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
 | `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
 
-### defs
-
-| Tag | Description |
-|-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
-
 ### div_by_zero
 
 | Tag | Range | Description | Multiplicity |
@@ -1533,13 +1506,13 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
 | `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
 
-### equality
+### defs
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
-| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
-| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
 
 ### abs_diff
 
@@ -1553,17 +1526,20 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
 | | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
-### output
+### equality
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
-| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
+| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
+| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
 
 ---
 
 # LOAD Chip
 
+The `LOAD` chip provides functionality to read values from memory and sign-extend them where appropriate. It delegates low-level memory handling to the `MEMW` chip ([memw]).
+
 = Columns
 
 The `LOAD` chip is comprised of  variables that are expressed using  columns:
@@ -1655,6 +1631,12 @@ read1 := μ - read2 - read4 - read8
 
 # ECALL Chips
 
+ECALLs provide system-level functionalities to the guest program.
+
+When `ECALL` is executed, it is assumed that: - register `A7` contains the system call number
+
+- the arguments are located in registers `A0`-`A6`, and - the return value is written to `A0`, where `A0`-`A7` are symbolic names for the registers `x10`-`x17`
+
 =  chip
 
 ## Columns
@@ -1673,18 +1655,64 @@ The  chip: + makes sure register `x10` (containing the exit code) equals `0` ([h
 
 ### Lookup
 
-The HALT chip contributes the following interaction to the lookup-argument:
+In this VM, halting is considered equivalent to executing a `sys_exit`. Hence, this chip responds to `ECALL`s with system call number 93.
 
-*Note*: [`93` is the system call number corresponding to `sys_exit`.]
+The HALT chip therefore contributes the following interaction to the lookup-argument:
 
 ## Padding
 
 This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
 
+=  chip
+
+## Columns
+
+The  chip leverages  variables, spanning  columns:
+
+## Constraints
+
+In this VM, committing is considered equivalent to writing a value to `stdout`. Hence, this chip responds to `ECALL`s with system call number 64.
+
+Since we do not know how many bytes are to be committed, this chip employs a recursive design: each iteration commits one byte, and recursively "calls" itself to commit the remaining bytes. As such, only the call from the CPU to this chip (i.e., the `first` in the recursion tree) should accept the `ECALL`; later recursive calls should not. This is why [commit:c:receive_ecall] has multiplicity `-`first``.
+
+The `write` operation --- writing to a file descriptor --- has the following signature:
+
+```c ssize_t write(size_t count; int fd, const void buf[count], size_t count); ```
+
+That is to say, - `A0` contains the file descriptor, - `A1` contains the address of `buf`'s first byte, - `A2` contains `count`, and - the written count should be written to `A0`.
+
+[commit:c:read_address] reads `address` from `x11` (=`A1`) and [commit:c:read_count] reads `count` from `x12` (=`A2`). Since we only support writing to `stdout` (which corresponds to ``fd` = 1`),
+
+we assert that `x10` contains `1` in [commit:c:read_fd_write_count]. Note that this constraint _also_ writes `count` to `A0`; in this VM it is impossible for a commit to be interrupted or fail. Lastly, the `index` is read from `x254`; in the same operation, ``index` + `count`` is written back to this location by [commit:c:read_index]. This, too, leverages the fact that a commit will not be interrupted or fail to update the `index` for the next commit sequence. Again, each of these memory interactions only take place when this is the `first` call in the recursion tree.
+
+*Note*: the observant reader will notice that [commit:c:read_index] casts `count` to a `BaseField`, potentially losing information. This is indeed correct. However, since it is practically impossible to commit more than `2^64-2^32` bytes in a single VM execution, it was decided to permit this.
+
+Next, we read the `value` located at buffer address `address` and commit to it under the given `index`. This is only performed when we have not yet reached the `end` of the commit sequence.
+
+In parallel, we compute ``address_incr` = `address` + 1` ([commit:c:address_incr]) as the address of the next byte to commit, and ``count_decr` = `count` - 1` ([commit:c:count_decr]) as the number of bytes that still have to be committed after committing this byte. [commit:c:range_address_incr] and [commit:c:range_count_decr] are included to satisfy [add:a:sum] and [add:a:rhs], respectively.
+
+When `count` hits `0`, we should stop performing further recursive calls. We use the `end` bit to indicate these circumstances.
+
+*Note*: + Rather than setting ``end` = 1` when ``count` = 0`, we do so when ``count_decr` = -1`. This technique allows `count` to be stored in a `DWordWL` rather than a `DWordHL`, saving two columns. + For all $i \in [0, 3]$, $65535 - \texttt{count\_decr}_i \geq 0$ as a result of [commit:c:range_count_decr]. Hence, $$\sum_{i=0}^{3} \left(65535 - \texttt{count\_decr}_i\right) = 0 \iff \forall i \in [0, 3]: \texttt{count\_decr}_i = 65535$$
+
+When this was not the `end` byte to commit in this recursion sequence, we recursively _Commit the Next Byte_ (`CNB`), specifying the timestamp, address to continue reading and the number of bytes that should still be committed ([commit:c:send_commit_next_byte]). Since that certainly won't be the `first` call in the sequence, we read `address_incr` and `count_decr` from the previous recursion level into `address` and `count` and continue executing the commit.
+
+Lastly, we must make sure `first`, `end` and `μ` are bits ([commit:c:range_first], [commit:c:range_end], [commit:c:range_mu]), and that either ``first` = 1` or ``end` = 1` implies ``μ` = 1` ([commit:c:first_or_end_implies_mu]). These are required to ensure the multiplicities `-(`μ` - `first`)` and ``μ` - `end`` are binary.
+
+## Padding
+
+To pad this chip, use the below data.
+
+## Notes/optimizations
+
+- The current version only supports writing to `stdout`. This chip could potentially be extended to support writing to arbitrary `fd`s - One might be able to replace [commit:c:end] by `end => count = 0`. While loosening the constraint (`count = 0 => end` is no longer enforced), this should not cause any problems: if the prover does not set `end` when `count=0`, they simply cannot complete the proof. First of all, one would have to recursively work through all `2^64` values of `count`, something that is practically infeasible. Moreover, if this is done with a sequence that originally has ``count` > 0`, one will inevitably have to read a memory address twice at the same timestamp, which is impossible to prove. In addition to dropping the `ZERO` lookup, this optimization might also permit moving `count_decr` from a `DWordHL` to a `DWordWL`, saving two columns. - Given that it is practically infeasible to commit more than ``p`-1 = 2^64-2^32` bytes in a program, it might suffice to store `count_decr` in a `BaseField`. Note that this would probably involve having an extra (virtual) column storing `count` in `BaseField` form as well. Moreover, one might need to add a lookup to `LT` to ensure ``count` <= `p`-1` when being read from memory at the beginning of each commitment sequence.
+
 ---
 
 # BITWISE Chips
 
+The `BITWISE` chips deal with precomputed lookup tables for bitwise boolean operations and convenience functionalities over small domains.
+
 = Columns
 
 The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.

From fa26492f99849c9bb3f41c529d22d44cf46b62de Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Fri, 27 Feb 2026 11:27:54 +0100
Subject: [PATCH 075/105] spec: Add a version and title/front pages (#367)

---
 spec/book.typ  |  3 ++-
 spec/ebook.typ |  2 ++
 spec/front.typ | 11 +++++++++++
 3 files changed, 15 insertions(+), 1 deletion(-)
 create mode 100644 spec/front.typ

diff --git a/spec/book.typ b/spec/book.typ
index 190e63f17..bf8b044ec 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -6,6 +6,7 @@
 #let meta = (
   title: "Lambda VM specification",
   authors: ("3MI Labs", "Aligned"),
+  version: "0.2",
   summary: (
     ("logup.typ", [LogUp argument], <logup>),
     ("memory.typ", [Memory argument], <memory>),
@@ -31,7 +32,7 @@
 #book-meta(
   title: meta.title,
   authors: meta.authors,
-  summary: meta.summary.map(((ch, title, _ref)) => chapter(ch, title)).join()
+  summary: prefix-chapter("front.typ", meta.title) + meta.summary.map(((ch, title, _ref)) => chapter(ch, title)).join()
 )
 
 #let common-formatting(body) = {
diff --git a/spec/ebook.typ b/spec/ebook.typ
index 0e08536fd..c176dcec3 100644
--- a/spec/ebook.typ
+++ b/spec/ebook.typ
@@ -3,6 +3,8 @@
 #set document(author: meta.authors, title: meta.title)
 
 #align(center, title(meta.title))
+#align(center, text(style: "italic", fill: luma(40%))[Version #meta.version])
+#align(center, meta.authors.join(", "))
 #pagebreak(weak: true)
 #outline()
 
diff --git a/spec/front.typ b/spec/front.typ
new file mode 100644
index 000000000..d78b0a38e
--- /dev/null
+++ b/spec/front.typ
@@ -0,0 +1,11 @@
+#import "/book.typ": project, meta
+
+#show: project.with(title: "", cond: () => true)
+
+#align(center, title(meta.title))
+#align(center)[_Version #meta.version _]
+#align(center, meta.authors.join(", "))
+
+
+This is the specification for the #link("https://github.com/yetanotherco/lambda_vm/")[Lambda verifiable vm].
+

From b4cbb21e75f8337dc21394d46ccee793735a61f0 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Tue, 3 Mar 2026 14:53:07 -0300
Subject: [PATCH 076/105] update

---
 docs/spec/dvrm.md      | 42 ++++++++++++++++----------------
 docs/spec/shift.md     | 14 +++++------
 docs/spec/spec_full.md | 54 +++++++++++++++++++++---------------------
 3 files changed, 55 insertions(+), 55 deletions(-)

diff --git a/docs/spec/dvrm.md b/docs/spec/dvrm.md
index f6da41e92..65305a1f8 100644
--- a/docs/spec/dvrm.md
+++ b/docs/spec/dvrm.md
@@ -158,6 +158,14 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 
 ## Constraints
 
+### equality
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
+| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
+| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+
 ### sign_equality
 
 | Tag | Description |
@@ -165,6 +173,14 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
 | | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
 
+### defs
+
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
+
 ### output
 
 | Tag | Description | Multiplicity |
@@ -172,15 +188,6 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
 | `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
 
-### n_sub_r
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C9.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
-| `DVRM-C10.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
-| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
-| `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
-
 ### div_by_zero
 
 | Tag | Range | Description | Multiplicity |
@@ -189,14 +196,6 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
 | `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
 
-### defs
-
-| Tag | Description |
-|-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
-
 ### abs_diff
 
 | Tag | Range | Description | Multiplicity |
@@ -209,10 +208,11 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
 | | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
-### equality
+### n_sub_r
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
-| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
-| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
\ No newline at end of file
+| `DVRM-C9.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
+| `DVRM-C10.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
+| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
+| `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
\ No newline at end of file
diff --git a/docs/spec/shift.md b/docs/spec/shift.md
index 9c3a8dd39..78842188b 100644
--- a/docs/spec/shift.md
+++ b/docs/spec/shift.md
@@ -182,15 +182,15 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 
 ## Constraints
 
-### is_negative
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
-
 ### left_flag
 
 | Tag | Description |
 |-----|-------------|
 | `SHIFT-C1` | `direction` => `μ` = 1 |
-| | _polynomial:_ `direction * (1 - μ) = 0` |
\ No newline at end of file
+| | _polynomial:_ `direction * (1 - μ) = 0` |
+
+### is_negative
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
\ No newline at end of file
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index 8536bc7db..83cace1ca 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -821,12 +821,6 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 
 ## Constraints
 
-### is_negative
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
-
 ### left_flag
 
 | Tag | Description |
@@ -834,6 +828,12 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 | `SHIFT-C1` | `direction` => `μ` = 1 |
 | | _polynomial:_ `direction * (1 - μ) = 0` |
 
+### is_negative
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
+
 ---
 
 # BRANCH Chip
@@ -1475,6 +1475,14 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 
 ## Constraints
 
+### equality
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
+| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
+| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+
 ### sign_equality
 
 | Tag | Description |
@@ -1482,6 +1490,14 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
 | | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
 
+### defs
+
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
+
 ### output
 
 | Tag | Description | Multiplicity |
@@ -1489,15 +1505,6 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
 | `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
 
-### n_sub_r
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C9.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
-| `DVRM-C10.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
-| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
-| `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
-
 ### div_by_zero
 
 | Tag | Range | Description | Multiplicity |
@@ -1506,14 +1513,6 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
 | `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
 
-### defs
-
-| Tag | Description |
-|-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
-
 ### abs_diff
 
 | Tag | Range | Description | Multiplicity |
@@ -1526,13 +1525,14 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
 | | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
-### equality
+### n_sub_r
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
-| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
-| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+| `DVRM-C9.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
+| `DVRM-C10.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
+| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
+| `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
 
 ---
 

From 0fbf92788693b5d1f5606267d90748d6ad40acc0 Mon Sep 17 00:00:00 2001
From: Nicole <nicole.graus@lambdaclass.com>
Date: Fri, 6 Mar 2026 15:57:28 -0300
Subject: [PATCH 077/105] fix constraint numbering

---
 docs/spec/dvrm.md      | 108 ++++++++++++-----------------
 docs/spec/shift.md     |  50 ++++++-------
 docs/spec/spec_full.md | 154 ++++++++++++++++++-----------------------
 scripts/typst_to_md.py |  45 +++++++-----
 4 files changed, 163 insertions(+), 194 deletions(-)

diff --git a/docs/spec/dvrm.md b/docs/spec/dvrm.md
index 65305a1f8..2fde0208a 100644
--- a/docs/spec/dvrm.md
+++ b/docs/spec/dvrm.md
@@ -16,6 +16,11 @@ enum.item([ _For both signed and unsigned division, except in the case of_ overf
 
 We start with R3, which is straightforwardly asserted by constraint [dvrm:c:sign_r_equals_sign_n].
 
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C1` | `r` ≠ 0 => `sign_r` = `sign_n` |
+| | _polynomial:_ `(Σ_{i=0}^{3} r[i]) * (sign_r - sign_n) = 0` |
+
 ## R2: rounding towards zero
 
 R2 states that "_[in] signed and unsigned integer division [the quotient is] round[ed] towards zero._" In other words, + the sign of ``n`-`qd`` must match that of `n` (unless ``qd` = `n``), and + `|`n`-`qd`|  < |`d`|` (unless ``d` = 0`).
@@ -24,6 +29,16 @@ Leveraging R1 , we can rewrite these as + the sign of ``r`` must match that of `
 
 Focusing on the first statement, we observe that this trivially holds when ``signed` = 0`, while R3 deals with the case that ``signed` = 1`. The second statement is enforced by [dvrm:c:abs_r_lt_abs_d]. [dvrm:c:abs_r_if_negative] and [dvrm:c:abs_r_if_nonnegative] (resp. [dvrm:c:abs_d_if_negative] and [dvrm:c:abs_d_if_nonnegative]) are included to ensure that `abs_r` (resp. `abs_d`) is the absolute values of `r` (resp. `d`).
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C2` |  | `LT[1 - div_by_zero; abs_r, abs_d, 0]` | μ_sum |
+| `DVRM-C3` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
+| `DVRM-C4.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
+| | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
+| `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
+| `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
+| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
+
 ## R5: overflow
 
 The ISA requires that ``q` = `n`` and ``r` = 0` in the event of overflow (i.e., when ``n` = -2^63` and ``d` = -1`). We note that the second half of this requirement is already satisfied by R2: since ``d` = -1 != 0`, R2 requires that `|`r`| < |`d`| = 1`, to which ``r` = 0` is the only satisfying value.
@@ -46,22 +61,52 @@ Rewriting R1, we find the constraint `not`overflow` => `n` - `r` = `qd``.
 
 Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality `mod 2^128`, rather than `mod 2^64`. To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to ``qd` mod 2^128` using constraints [dvrm:c:mul_lower] and [dvrm:c:mul_upper]; [dvrm:c:q_range] is included to uphold assumption [mul:a:rhs].
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C9` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
+| `DVRM-C10` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
+| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+
 It now remains to enforce that `extended_n_sub_r` is the _signed_ 128-bit representation of ``n`-`r``. Here, we introduce `extended_n` and `extended_r`. By their definition, these variables contain the signed 128-bit representations of `n` and `r`. The `carry` variable has been defined such that it mimics those in the `ADD` chip, except that here we add two `QuadHL`s rather than two `DWordHL`, thus needing four carry bits instead of two. With this in place, [dvrm:c:n_sub_r] (mimicking [add:c:carry]) ensures `extended_n_sub_r` must contain the correct value.
 
 Lastly, observe that ``n` - `r` in (-2^64, 2^64)`, _regardless_ of the value of `signed`. Moreover, note that the upper halves of the 128-bit representations of all values in this range are either `0xFFFFFFFF` (negative) or `0x00000000` (non-negative). This means that we do not need to store all 128 bits of `extended_n_sub_r`. Rather, we need only store the lower 64-bits, and a separate bit (`sign_n_sub_r`) indicating whether the top limbs are all-ones or all-zeroes. The prover is free to select the value for `sign_n_sub_r`; only one of the two will fit the proof.
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C12.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
+| `DVRM-C13.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
+| `DVRM-C14.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
+| `DVRM-C15` |  | `IS_BIT<sign_n_sub_r>` |  |
+
 ## R4: division-by-zero
 
 R4 requires that ``q` = 2^64-1` (unsigned) or `-1` (signed) and ``r` = n` when ``d` = 0`. Recalling R1, we see that ``n` = `q` `d` + `r` = `r`` when ``d` = 0`, already enforces the latter. Next, we note that, in two's complement, the _unsigned_ value `2^64-1` and _signed_ value `-1` are both represented by the bit string `0xFFFFFFFF`. Hence, only [dvrm:c:q_if_div_by_zero] is required to completely constrain R4; [dvrm:c:div_by_zero] just ensures the `div_by_zero` flag is set when ``d` = 0`.
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C16.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
+| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
+| `DVRM-C17` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
+
 ## Other
 
 The following constraints are included to enforce the values of `sign_n`, `sign_r` and `sign_d` are correct.
 
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C18` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C19` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C20` | `SIGN<sign_d; d[3], signed>` |
+
 ## Output
 
 Lastly, this chip contributes the following to the lookup:
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
+| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+
 = Padding To pad the  table, we use the following data, representing the unsigned division `frac(0, 0, style: "horizontal")`:
 
 ## Columns
@@ -154,65 +199,4 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 |-----|-------|-------------|
 | `DVRM-A1.i` | i ∈ [0, 3] | `IS_HALF[n[i]]` |
 | `DVRM-A2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` |
-| `DVRM-A3` |  | `IS_BIT<signed>` |
-
-## Constraints
-
-### equality
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
-| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
-| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
-
-### sign_equality
-
-| Tag | Description |
-|-----|-------------|
-| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
-| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
-
-### defs
-
-| Tag | Description |
-|-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
-
-### output
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
-| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
-
-### div_by_zero
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C19.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
-| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
-| `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
-
-### abs_diff
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C2` |  | `LT[1 - div_by_zero; abs_r, abs_d, 0]` | μ_sum |
-| `DVRM-C3` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
-| `DVRM-C4.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
-| | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
-| `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
-| `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
-| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
-
-### n_sub_r
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C9.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
-| `DVRM-C10.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
-| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
-| `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
\ No newline at end of file
+| `DVRM-A3` |  | `IS_BIT<signed>` |
\ No newline at end of file
diff --git a/docs/spec/shift.md b/docs/spec/shift.md
index 78842188b..76c147141 100644
--- a/docs/spec/shift.md
+++ b/docs/spec/shift.md
@@ -44,9 +44,9 @@ Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `e
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C3` | `AND_BYTE[bit_shift; shift, 15]` | left |
-| `SHIFT-C4` | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
-| `SHIFT-C5` | `ZERO[zbs; bit_shift]` | μ |
+| `SHIFT-C1` | `AND_BYTE[bit_shift; shift, 15]` | left |
+| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
+| `SHIFT-C3` | `ZERO[zbs; bit_shift]` | μ |
 
 Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
 
@@ -54,14 +54,14 @@ The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C6.i` | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C7.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
+| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C5.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
 | | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
-| `SHIFT-C8` |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
-| `SHIFT-C9` |  | `zbs` => `X[4]` = 0 |  |
+| `SHIFT-C6` |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
+| `SHIFT-C7` |  | `zbs` => `X[4]` = 0 |  |
 | | | _polynomial:_ `zbs * X[4] = 0` | |
-| `SHIFT-C10.i` | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C11.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| `SHIFT-C8.i` | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C9.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
 | | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
 
 ## Full-limb shifting
@@ -72,13 +72,22 @@ Hereafter, one must only check that `out` is the proper cast of `shifted` into a
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C12.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
-| `SHIFT-C13` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
-| `SHIFT-C14.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+| `SHIFT-C10.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `SHIFT-C11` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
+| `SHIFT-C12.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
 | | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
 
 ## Miscellaneous
 
+| Tag | Description |
+|-----|-------------|
+| `SHIFT-C13` | `direction` => `μ` = 1 |
+| | _polynomial:_ `direction * (1 - μ) = 0` |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C14` | `MSB16[is_negative; in[3]]` | signed |
+
 *Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
 
 ## Lookups
@@ -178,19 +187,4 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 | `SHIFT-A2` |  | `IS_BYTE[shift]` |
 | `SHIFT-A3` |  | `IS_BIT<direction>` |
 | `SHIFT-A4` |  | `IS_BIT<signed>` |
-| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
-
-## Constraints
-
-### left_flag
-
-| Tag | Description |
-|-----|-------------|
-| `SHIFT-C1` | `direction` => `μ` = 1 |
-| | _polynomial:_ `direction * (1 - μ) = 0` |
-
-### is_negative
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
\ No newline at end of file
+| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
\ No newline at end of file
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index 83cace1ca..05c19a3cc 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -683,9 +683,9 @@ Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `e
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C3` | `AND_BYTE[bit_shift; shift, 15]` | left |
-| `SHIFT-C4` | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
-| `SHIFT-C5` | `ZERO[zbs; bit_shift]` | μ |
+| `SHIFT-C1` | `AND_BYTE[bit_shift; shift, 15]` | left |
+| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
+| `SHIFT-C3` | `ZERO[zbs; bit_shift]` | μ |
 
 Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
 
@@ -693,14 +693,14 @@ The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C6.i` | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C7.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
+| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C5.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
 | | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
-| `SHIFT-C8` |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
-| `SHIFT-C9` |  | `zbs` => `X[4]` = 0 |  |
+| `SHIFT-C6` |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
+| `SHIFT-C7` |  | `zbs` => `X[4]` = 0 |  |
 | | | _polynomial:_ `zbs * X[4] = 0` | |
-| `SHIFT-C10.i` | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C11.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| `SHIFT-C8.i` | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C9.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
 | | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
 
 ## Full-limb shifting
@@ -711,13 +711,22 @@ Hereafter, one must only check that `out` is the proper cast of `shifted` into a
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C12.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
-| `SHIFT-C13` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
-| `SHIFT-C14.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+| `SHIFT-C10.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `SHIFT-C11` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
+| `SHIFT-C12.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
 | | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
 
 ## Miscellaneous
 
+| Tag | Description |
+|-----|-------------|
+| `SHIFT-C13` | `direction` => `μ` = 1 |
+| | _polynomial:_ `direction * (1 - μ) = 0` |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C14` | `MSB16[is_negative; in[3]]` | signed |
+
 *Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
 
 ## Lookups
@@ -819,21 +828,6 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 | `SHIFT-A4` |  | `IS_BIT<signed>` |
 | `SHIFT-A5` |  | `IS_BIT<word_instr>` |
 
-## Constraints
-
-### left_flag
-
-| Tag | Description |
-|-----|-------------|
-| `SHIFT-C1` | `direction` => `μ` = 1 |
-| | _polynomial:_ `direction * (1 - μ) = 0` |
-
-### is_negative
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C2` | `MSB16[is_negative; in[3]]` | signed |
-
 ---
 
 # BRANCH Chip
@@ -1333,6 +1327,11 @@ enum.item([ _For both signed and unsigned division, except in the case of_ overf
 
 We start with R3, which is straightforwardly asserted by constraint [dvrm:c:sign_r_equals_sign_n].
 
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
+| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
+
 ## R2: rounding towards zero
 
 R2 states that "_[in] signed and unsigned integer division [the quotient is] round[ed] towards zero._" In other words, + the sign of ``n`-`qd`` must match that of `n` (unless ``qd` = `n``), and + `|`n`-`qd`|  < |`d`|` (unless ``d` = 0`).
@@ -1341,6 +1340,16 @@ Leveraging R1 , we can rewrite these as + the sign of ``r`` must match that of `
 
 Focusing on the first statement, we observe that this trivially holds when ``signed` = 0`, while R3 deals with the case that ``signed` = 1`. The second statement is enforced by [dvrm:c:abs_r_lt_abs_d]. [dvrm:c:abs_r_if_negative] and [dvrm:c:abs_r_if_nonnegative] (resp. [dvrm:c:abs_d_if_negative] and [dvrm:c:abs_d_if_nonnegative]) are included to ensure that `abs_r` (resp. `abs_d`) is the absolute values of `r` (resp. `d`).
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C2` |  | `LT[1 - div_by_zero; abs_r, abs_d, 0]` | μ_sum |
+| `DVRM-C3` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
+| `DVRM-C4.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
+| | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
+| `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
+| `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
+| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
+
 ## R5: overflow
 
 The ISA requires that ``q` = `n`` and ``r` = 0` in the event of overflow (i.e., when ``n` = -2^63` and ``d` = -1`). We note that the second half of this requirement is already satisfied by R2: since ``d` = -1 != 0`, R2 requires that `|`r`| < |`d`| = 1`, to which ``r` = 0` is the only satisfying value.
@@ -1363,22 +1372,52 @@ Rewriting R1, we find the constraint `not`overflow` => `n` - `r` = `qd``.
 
 Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality `mod 2^128`, rather than `mod 2^64`. To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to ``qd` mod 2^128` using constraints [dvrm:c:mul_lower] and [dvrm:c:mul_upper]; [dvrm:c:q_range] is included to uphold assumption [mul:a:rhs].
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C9` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
+| `DVRM-C10` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
+| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+
 It now remains to enforce that `extended_n_sub_r` is the _signed_ 128-bit representation of ``n`-`r``. Here, we introduce `extended_n` and `extended_r`. By their definition, these variables contain the signed 128-bit representations of `n` and `r`. The `carry` variable has been defined such that it mimics those in the `ADD` chip, except that here we add two `QuadHL`s rather than two `DWordHL`, thus needing four carry bits instead of two. With this in place, [dvrm:c:n_sub_r] (mimicking [add:c:carry]) ensures `extended_n_sub_r` must contain the correct value.
 
 Lastly, observe that ``n` - `r` in (-2^64, 2^64)`, _regardless_ of the value of `signed`. Moreover, note that the upper halves of the 128-bit representations of all values in this range are either `0xFFFFFFFF` (negative) or `0x00000000` (non-negative). This means that we do not need to store all 128 bits of `extended_n_sub_r`. Rather, we need only store the lower 64-bits, and a separate bit (`sign_n_sub_r`) indicating whether the top limbs are all-ones or all-zeroes. The prover is free to select the value for `sign_n_sub_r`; only one of the two will fit the proof.
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C12.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
+| `DVRM-C13.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
+| `DVRM-C14.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
+| `DVRM-C15` |  | `IS_BIT<sign_n_sub_r>` |  |
+
 ## R4: division-by-zero
 
 R4 requires that ``q` = 2^64-1` (unsigned) or `-1` (signed) and ``r` = n` when ``d` = 0`. Recalling R1, we see that ``n` = `q` `d` + `r` = `r`` when ``d` = 0`, already enforces the latter. Next, we note that, in two's complement, the _unsigned_ value `2^64-1` and _signed_ value `-1` are both represented by the bit string `0xFFFFFFFF`. Hence, only [dvrm:c:q_if_div_by_zero] is required to completely constrain R4; [dvrm:c:div_by_zero] just ensures the `div_by_zero` flag is set when ``d` = 0`.
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C16.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
+| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
+| `DVRM-C17` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
+
 ## Other
 
 The following constraints are included to enforce the values of `sign_n`, `sign_r` and `sign_d` are correct.
 
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C18` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C19` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C20` | `SIGN<sign_d; d[3], signed>` |
+
 ## Output
 
 Lastly, this chip contributes the following to the lookup:
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
+| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+
 = Padding To pad the  table, we use the following data, representing the unsigned division `frac(0, 0, style: "horizontal")`:
 
 ## Columns
@@ -1473,67 +1512,6 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `DVRM-A2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` |
 | `DVRM-A3` |  | `IS_BIT<signed>` |
 
-## Constraints
-
-### equality
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C13` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
-| `DVRM-C14` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
-| `DVRM-C15.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
-
-### sign_equality
-
-| Tag | Description |
-|-----|-------------|
-| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
-| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
-
-### defs
-
-| Tag | Description |
-|-----|-------------|
-| `DVRM-C16` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C17` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C18` | `SIGN<sign_d; d[3], signed>` |
-
-### output
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
-| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
-
-### div_by_zero
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C19.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
-| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
-| `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
-
-### abs_diff
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C2` |  | `LT[1 - div_by_zero; abs_r, abs_d, 0]` | μ_sum |
-| `DVRM-C3` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
-| `DVRM-C4.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
-| | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
-| `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
-| `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
-| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
-
-### n_sub_r
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C9.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
-| `DVRM-C10.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
-| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
-| `DVRM-C12` |  | `IS_BIT<sign_n_sub_r>` |  |
-
 ---
 
 # LOAD Chip
diff --git a/scripts/typst_to_md.py b/scripts/typst_to_md.py
index 16aa96fcb..dc40acaeb 100644
--- a/scripts/typst_to_md.py
+++ b/scripts/typst_to_md.py
@@ -224,10 +224,17 @@ def parse_typst_prose(content: str) -> list:
             if current_para:
                 elements.append(('para', ' '.join(current_para)))
                 current_para = []
-            # Extract group name: #render_constraint_table(chip, config, groups: "range")
-            match = re.search(r'groups:\s*"([^"]+)"', stripped)
-            if match:
-                elements.append(('render_constraints', match.group(1)))
+            # Extract group names: handles both single `groups: "g"` and array `groups: ("g1", "g2")`
+            groups = []
+            array_match = re.search(r'groups:\s*\(([^)]*)\)', stripped)
+            if array_match:
+                groups = re.findall(r'"([^"]+)"', array_match.group(1))
+            else:
+                single_match = re.search(r'groups:\s*"([^"]+)"', stripped)
+                if single_match:
+                    groups = [single_match.group(1)]
+            if groups:
+                elements.append(('render_constraints', groups))
             i += 1
             continue
 
@@ -527,6 +534,8 @@ def convert_chapter(typ_path: Path, toml_path: Path, title: str, config: dict) -
     rendered_assumptions = False
     rendered_constraints = False
     rendered_constraint_groups = set()
+    # Counter incremented in render order (matching Typst's global figure counter)
+    constraint_counter = 1
 
     # Parse Typst prose
     if typ_path.exists():
@@ -553,14 +562,17 @@ def convert_chapter(typ_path: Path, toml_path: Path, title: str, config: dict) -
                     rendered_constraints = True
 
             elif elem_type == 'render_constraints' and chip:
-                # Render the constraint group specified in the typst file
-                group_name = content
-                if group_name not in rendered_constraint_groups:
-                    # Skip heading since prose already has the section title
-                    group_table = render_constraints_table(chip, config, group_filter=group_name, skip_heading=True)
-                    if group_table.strip():
-                        lines.append(group_table)
+                # content is a list of group names to render (in order)
+                group_names = content
+                for group_name in group_names:
+                    if group_name not in rendered_constraint_groups:
+                        # Use the running render-order counter so numbering matches Typst
+                        group_table = render_constraints_table(chip, config, group_filter=group_name, skip_heading=True, start_counter=constraint_counter)
+                        if group_table.strip():
+                            lines.append(group_table)
                         rendered_constraint_groups.add(group_name)
+                    # Always advance the counter for this group (rendered or already seen)
+                    constraint_counter += len(chip.get("constraints", {}).get(group_name, []))
 
             elif elem_type == 'para':
                 lines.append(content)
@@ -583,20 +595,21 @@ def convert_chapter(typ_path: Path, toml_path: Path, title: str, config: dict) -
             lines.append(render_assumptions_table(chip, config))
 
         if chip.get("constraints"):
-            # Get all constraint groups from TOML
-            all_groups = set(chip.get("constraints", {}).keys())
-            remaining_groups = all_groups - rendered_constraint_groups
+            # Get remaining groups in TOML order
+            all_groups_ordered = [cg["name"] for cg in chip.get("constraint_groups", [])]
+            remaining_groups = [g for g in all_groups_ordered if g not in rendered_constraint_groups]
 
             if remaining_groups and not rendered_constraints:
                 # No prose Constraints section existed, add one
                 lines.append("## Constraints")
                 lines.append("")
 
-            # Render any constraint groups not already rendered inline
+            # Render any constraint groups not already rendered inline, continuing the counter
             for group_name in remaining_groups:
-                group_table = render_constraints_table(chip, config, group_filter=group_name)
+                group_table = render_constraints_table(chip, config, group_filter=group_name, start_counter=constraint_counter)
                 if group_table.strip():
                     lines.append(group_table)
+                constraint_counter += len(chip.get("constraints", {}).get(group_name, []))
 
     result = "\n".join(lines)
     result = re.sub(r'\n{3,}', '\n\n', result)

From 40c40aba4af220b7357ccfad342ec33f938e7b77 Mon Sep 17 00:00:00 2001
From: Nicole <nicole.graus@lambdaclass.com>
Date: Fri, 6 Mar 2026 16:27:54 -0300
Subject: [PATCH 078/105] fix ecall.md

---
 docs/spec/decode.md     |   8 ++-
 docs/spec/ecall.md      |  86 +++++++++++++++++++++++++
 docs/spec/memory.md     |  35 ++++++++++-
 docs/spec/signatures.md |   8 ++-
 docs/spec/spec_full.md  | 135 ++++++++++++++++++++++++++++++++++++++--
 scripts/typst_to_md.py  |  80 +++++++++++++++++-------
 6 files changed, 319 insertions(+), 33 deletions(-)

diff --git a/docs/spec/decode.md b/docs/spec/decode.md
index 26ce8b81e..188598ce2 100644
--- a/docs/spec/decode.md
+++ b/docs/spec/decode.md
@@ -22,7 +22,9 @@ Further clarification is provided in the notes following the table.
 
 The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
 
-/// Add a reference to one or more notes following this table. super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
+/// Add a reference to one or more notes following this table.
+
+super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
 
 show figure: set block(breakable: true)
 
@@ -30,7 +32,9 @@ figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset:
 
 // OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
 
-// Construct a note that can be referenced through `lbl` show figure: (it) => align(left, []) [ ] }
+// Construct a note that can be referenced through `lbl`
+
+show figure: (it) => align(left, []) [ ] }
 
 ## Notes
 
diff --git a/docs/spec/ecall.md b/docs/spec/ecall.md
index d9b182039..62ebb0b7b 100644
--- a/docs/spec/ecall.md
+++ b/docs/spec/ecall.md
@@ -10,16 +10,33 @@ When `ECALL` is executed, it is assumed that: - register `A7` contains the syste
 
 ## Columns
 
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which to halt the program |
+
 The  chip leverages  variable, spanning  columns:
 
 ## Assumptions
 
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `HALT-A1.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
 It is assumed the input is range checked:
 
 ## Constraints
 
 The  chip: + makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), + writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and + sets `pc` equal to `1` ([halt:c:pc]). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`; the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `HALT-C1.i` | i ∈ [1, 9] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C2` |  | `MEMW[0::BaseField[8]; 1, (2 * 10)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C3.i` | i ∈ [11, 31] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C4` |  | `MEMW[1, (2 * 255)::DWordWL, [1, 0, 0, 0, 0, 0, 0, 0], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+
 [ Observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there. ])
 
 ### Lookup
@@ -28,6 +45,10 @@ In this VM, halting is considered equivalent to executing a `sys_exit`. Hence, t
 
 The HALT chip therefore contributes the following interaction to the lookup-argument:
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `HALT-C5` | `ECALL[timestamp, 93::DWordWL]` | -1 |
+
 ## Padding
 
 This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
@@ -36,6 +57,31 @@ This chip should only contain a single row. Given that `2^0 = 1`, this chip does
 
 ## Columns
 
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which to commit |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `index` | `BaseField` | Index of value being committed. |
+| `address` | `DWordWL` | Address of first byte to commit. |
+| `address_incr` | `DWordHL` | `address + 1` |
+| `count` | `DWordWL` | number of bytes to commit |
+| `count_decr` | `DWordHL` | `count - 1` |
+| `first` | `Bit` | Whether this is the first commitment in this sequence. |
+| `end` | `Bit` | Whether this is the end of the commitment sequence. |
+| `value` | `Byte` | Byte stored at `address`. |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
 The  chip leverages  variables, spanning  columns:
 
 ## Constraints
@@ -44,6 +90,10 @@ In this VM, committing is considered equivalent to writing a value to `stdout`.
 
 Since we do not know how many bytes are to be committed, this chip employs a recursive design: each iteration commits one byte, and recursively "calls" itself to commit the remaining bytes. As such, only the call from the CPU to this chip (i.e., the `first` in the recursion tree) should accept the `ECALL`; later recursive calls should not. This is why [commit:c:receive_ecall] has multiplicity `-`first``.
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C1` | `ECALL[timestamp, 64::DWordWL]` | -first |
+
 The `write` operation --- writing to a file descriptor --- has the following signature:
 
 ```c ssize_t write(size_t count; int fd, const void buf[count], size_t count); ```
@@ -54,20 +104,56 @@ That is to say, - `A0` contains the file descriptor, - `A1` contains the address
 
 we assert that `x10` contains `1` in [commit:c:read_fd_write_count]. Note that this constraint _also_ writes `count` to `A0`; in this VM it is impossible for a commit to be interrupted or fail. Lastly, the `index` is read from `x254`; in the same operation, ``index` + `count`` is written back to this location by [commit:c:read_index]. This, too, leverages the fact that a commit will not be interrupted or fail to update the `index` for the next commit sequence. Again, each of these memory interactions only take place when this is the `first` call in the recursion tree.
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C2` | `MEMW[['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C3` | `MEMW[['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 12)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C4` | `MEMW[['arr', 1, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C5` | `MEMW[['arr', 'index', 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 254)::DWordWL, ['arr', ['+', 'index', ['cast', 'count', 'BaseField']], 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | first |
+
 *Note*: the observant reader will notice that [commit:c:read_index] casts `count` to a `BaseField`, potentiallly losing information. This is indeed correct. However, since it is practically impossible to commit more than `2^64-2^32` bytes in a single VM execution, it was decided to permit this.
 
 Next, we read the `value` located at buffer address `address` and commit to it under the given `index`. This is only performed when we have not yet reached the `end` of the commit sequence.
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C6` | `MEMW[['arr', 'value', 0, 0, 0, 0, 0, 0, 0]; 0, address, ['arr', 'value', 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | μ - end |
+| `COMMIT-C7` | `COMMIT[index, value]` | μ - end |
+
 In parallel, we compute ``address_incr` = `address` + 1` ([commit:c:address_incr]) as address of the next byte to commit, and ``count_decr` = `count` - 1` ([commit:c:count_decr]) as the number of bytes that still has to be committed after committing this byte. [commit:c:range_address_incr] and [commit:c:range_count_decr] are included to satisfy [add:a:sum] respectively [add:a:rhs].
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `COMMIT-C8` |  | `ADD<address_incr::DWordWL; address, 1::DWordWL>` |  |
+| `COMMIT-C9.i` | i ∈ [0, 3] | `IS_HALF[address_incr[i]]` | μ |
+| `COMMIT-C10` |  | `SUB<count_decr::DWordWL; count, 1::DWordWL>` |  |
+| `COMMIT-C11.i` | i ∈ [0, 3] | `IS_HALF[count_decr[i]]` | μ |
+
 When `count` hits `0`, we should stop performing further recursive calls. We use the `end` bit to indicate these circumstances.
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C12` | `ZERO[end; (65535 - count_decr[0]) + (65535 - count_decr[1]) + (65535 - count_decr[2]) + (65535 - count_decr[3])]` | μ |
+
 *Note*: + Rather than setting ``end` = 1` when ``count` = 0`, we do so when ``count_decr` = -1`. This technique allows `count` to be stored in a `DWordWL` rather than a `DWordHL`, saving two columns. + `forall i in [0, 3]: 65535 - `count_decr`_i >= 0` as a result of [commit:c:range_count_decr]. Hence, $ sum_(i=0)^3 65535 - `count_decr`_i = 0 arrow.l.r.double.long forall i in [0, 3]: `count_decr`_i = 65535 $
 
 When this was not the `end` byte to commit in this recursion sequence, we recursively _Commit the Next Byte_ (`CNB`), specifying the timestamp, address to continue reading and the number of bytes that should still be committed ([commit:c:send_commit_next_byte]). Since that certainly won't be the `first` call in the sequence, we read `address_incr` and `count_decr` from the previous recursion level into `address` and `count` and continue executing the commit.
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C13` | `CNB[timestamp, index + 1, address_incr::DWordWL, count_decr::DWordWL]` | μ - end |
+| `COMMIT-C14` | `CNB[timestamp, index, address, count]` | -(μ - first) |
+
 Lastly, we must make sure `first`, `end` and `μ` are bits ([commit:c:range_first], [commit:c:range_end], [commit:c:range_mu]), and that when either ``first` = 1` or ``end` = 1` imply that ``μ` = 1` ([commit:c:first_or_end_implies_mu]). These are required to ensure the multiplicities `-(`μ` - `first`)` and ``μ` - `end`` are binary.
 
+| Tag | Description |
+|-----|-------------|
+| `COMMIT-C15` | `IS_BIT<first>` |
+| `COMMIT-C16` | `IS_BIT<end>` |
+| `COMMIT-C17` | `IS_BIT<μ>` |
+| `COMMIT-C18` | `first` + `end` => `μ` = 1 |
+| | _polynomial:_ `(first + end) * (1 - μ) = 0` |
+
 ## Padding
 
 To pad this chip, use the below data.
diff --git a/docs/spec/memory.md b/docs/spec/memory.md
index a38debcfa..9faad803f 100644
--- a/docs/spec/memory.md
+++ b/docs/spec/memory.md
@@ -78,4 +78,37 @@ The initial and final state of registers can be entirely known by the verifier,
 
 = Future topics of interest
 
-- Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research) - Double check whether IS_BYTE constraints are needed for fini
\ No newline at end of file
+- Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research) - Double check whether IS_BYTE constraints are needed for fini
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `offset` | `RowIndex` | The offset from the page base address. |
+| `init` | `Byte` | The initial value of this address. Can be replaced by a constant zero for zero-initialization |
+| `fini` | `Byte` | The final value this address took |
+| `timestamp` | `DWordWL` | The timestamp at which this address was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `address` | `DWordWL` | Adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table. |
+
+**Definition of `address`:**
+```
+address := page + offset * 1::DWordWL
+```
+
+## Constraints
+
+### all
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `PAGE-C1` | `IS_BYTE[init]` | 1 |
+| `PAGE-C2` | `IS_BYTE[fini]` | 1 |
+| `PAGE-C3` | `memory[0, address, 0::DWordWL, init]` | -1 |
+| `PAGE-C4` | `memory[0, address, timestamp, fini]` | 1 |
\ No newline at end of file
diff --git a/docs/spec/signatures.md b/docs/spec/signatures.md
index 98cbcdec1..b691ff739 100644
--- a/docs/spec/signatures.md
+++ b/docs/spec/signatures.md
@@ -1,6 +1,8 @@
 # Signatures
 
-// Render a signature let (lb, rb) = if sig.kind == "interaction" { (`[`, `]`) } else if sig.kind == "template" { (`<`, `>`) }
+// Render a signature
+
+let (lb, rb) = if sig.kind == "interaction" { (`[`, `]`) } else if sig.kind == "template" { (`<`, `>`) }
 
 let cond = sig.at("cond", default: none) let cond_str = if cond != none { raw(cond) + ` => ` } else {``}
 
@@ -10,7 +12,9 @@ let output = sig.at("output", default: none) let output_str = if output != none
 
 return [] }
 
-// Compute the bus size of an interaction let vars = sig.input + if "output" in sig { (sig.output, )} else {()}
+// Compute the bus size of an interaction
+
+let vars = sig.input + if "output" in sig { (sig.output, )} else {()}
 
 return vars.map(v => { let (label, factor) = if type(v) == array { (v.at(0), v.at(1)) } else { (v, 1) } config.variables.types.filter(type => type.label == label).first().subtypes.len() * factor }) .sum() }
 
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index 05c19a3cc..564d372dd 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -82,6 +82,39 @@ The initial and final state of registers can be entirely known by the verifier,
 
 - Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research) - Double check whether IS_BYTE constraints are needed for fini
 
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `offset` | `RowIndex` | The offset from the page base address. |
+| `init` | `Byte` | The initial value of this address. Can be replaced by a constant zero for zero-initialization |
+| `fini` | `Byte` | The final value this address took |
+| `timestamp` | `DWordWL` | The timestamp at which this address was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `address` | `DWordWL` | Adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table. |
+
+**Definition of `address`:**
+```
+address := page + offset * 1::DWordWL
+```
+
+## Constraints
+
+### all
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `PAGE-C1` | `IS_BYTE[init]` | 1 |
+| `PAGE-C2` | `IS_BYTE[fini]` | 1 |
+| `PAGE-C3` | `memory[0, address, 0::DWordWL, init]` | -1 |
+| `PAGE-C4` | `memory[0, address, timestamp, fini]` | 1 |
+
 ---
 
 # Variables
@@ -94,7 +127,9 @@ columns: (auto, 1fr, auto), inset: 7pt, align: (top+left, top+left, top+center,
 
 # Signatures
 
-// Render a signature let (lb, rb) = if sig.kind == "interaction" { (`[`, `]`) } else if sig.kind == "template" { (`<`, `>`) }
+// Render a signature
+
+let (lb, rb) = if sig.kind == "interaction" { (`[`, `]`) } else if sig.kind == "template" { (`<`, `>`) }
 
 let cond = sig.at("cond", default: none) let cond_str = if cond != none { raw(cond) + ` => ` } else {``}
 
@@ -104,7 +139,9 @@ let output = sig.at("output", default: none) let output_str = if output != none
 
 return [] }
 
-// Compute the bus size of an interaction let vars = sig.input + if "output" in sig { (sig.output, )} else {()}
+// Compute the bus size of an interaction
+
+let vars = sig.input + if "output" in sig { (sig.output, )} else {()}
 
 return vars.map(v => { let (label, factor) = if type(v) == array { (v.at(0), v.at(1)) } else { (v, 1) } config.variables.types.filter(type => type.label == label).first().subtypes.len() * factor }) .sum() }
 
@@ -358,7 +395,9 @@ Further clarification is provided in the notes following the table.
 
 The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
 
-/// Add a reference to one or more notes following this table. super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
+/// Add a reference to one or more notes following this table.
+
+super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
 
 show figure: set block(breakable: true)
 
@@ -366,7 +405,9 @@ figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset:
 
 // OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
 
-// Construct a note that can be referenced through `lbl` show figure: (it) => align(left, []) [ ] }
+// Construct a note that can be referenced through `lbl`
+
+show figure: (it) => align(left, []) [ ] }
 
 ## Notes
 
@@ -1619,16 +1660,33 @@ When `ECALL` is executed, it is assumed that: - register `A7` contains the syste
 
 ## Columns
 
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which to halt the program |
+
 The  chip leverages  variable, spanning  columns:
 
 ## Assumptions
 
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `HALT-A1.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
 It is assumed the input is range checked:
 
 ## Constraints
 
 The  chip: + makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), + writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and + sets `pc` equal to `1` ([halt:c:pc]). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`; the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `HALT-C1.i` | i ∈ [1, 9] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C2` |  | `MEMW[0::BaseField[8]; 1, (2 * 10)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C3.i` | i ∈ [11, 31] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C4` |  | `MEMW[1, (2 * 255)::DWordWL, ['arr', 1, 0, 0, 0, 0, 0, 0, 0], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+
 [ Observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there. ])
 
 ### Lookup
@@ -1637,6 +1695,10 @@ In this VM, halting is considered equivalent to executing a `sys_exit`. Hence, t
 
 The HALT chip therefore contributes the following interaction to the lookup-argument:
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `HALT-C5` | `ECALL[timestamp, 93::DWordWL]` | -1 |
+
 ## Padding
 
 This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
@@ -1645,6 +1707,31 @@ This chip should only contain a single row. Given that `2^0 = 1`, this chip does
 
 ## Columns
 
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which to commit |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `index` | `BaseField` | Index of value being committed. |
+| `address` | `DWordWL` | Address of first byte to commit. |
+| `address_incr` | `DWordHL` | $`address` + 1$ |
+| `count` | `DWordWL` | number of bytes to commit |
+| `count_decr` | `DWordHL` | $`count` - 1$ |
+| `first` | `Bit` | Whether this is the first commitment in this sequence. |
+| `end` | `Bit` | Whether this is the end of the commitment sequence. |
+| `value` | `Byte` | Byte stored at `address`. |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
 The  chip leverages  variables, spanning  columns:
 
 ## Constraints
@@ -1653,6 +1740,10 @@ In this VM, committing is considered equivalent to writing a value to `stdout`.
 
 Since we do not know how many bytes are to be committed, this chip employs a recursive design: each iteration commits one byte, and recursively "calls" itself to commit the remaining bytes. As such, only the call from the CPU to this chip (i.e., the `first` in the recursion tree) should accept the `ECALL`; later recursive calls should not. This is why [commit:c:receive_ecall] has multiplicity `-`first``.
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C1` | `ECALL[timestamp, 64::DWordWL]` | -first |
+
 The `write` operation --- writing to a file descriptor --- has the following signature:
 
 ```c ssize_t write(size_t count; int fd, const void buf[count], size_t count); ```
@@ -1663,20 +1754,56 @@ That is to say, - `A0` contains the file descriptor, - `A1` contains the address
 
 we assert that `x10` contains `1` in [commit:c:read_fd_write_count]. Note that this constraint _also_ writes `count` to `A0`; in this VM it is impossible for a commit to be interrupted or fail. Lastly, the `index` is read from `x254`; in the same operation, ``index` + `count`` is written back to this location by [commit:c:read_index]. This, too, leverages the fact that a commit will not be interrupted or fail to update the `index` for the next commit sequence. Again, each of these memory interactions only take place when this is the `first` call in the recursion tree.
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C2` | `MEMW[['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C3` | `MEMW[['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 12)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C4` | `MEMW[['arr', 1, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C5` | `MEMW[['arr', 'index', 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 254)::DWordWL, ['arr', ['+', 'index', ['cast', 'count', 'BaseField']], 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | first |
+
 *Note*: the observant reader will notice that [commit:c:read_index] casts `count` to a `BaseField`, potentiallly losing information. This is indeed correct. However, since it is practically impossible to commit more than `2^64-2^32` bytes in a single VM execution, it was decided to permit this.
 
 Next, we read the `value` located at buffer address `address` and commit to it under the given `index`. This is only performed when we have not yet reached the `end` of the commit sequence.
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C6` | `MEMW[['arr', 'value', 0, 0, 0, 0, 0, 0, 0]; 0, address, ['arr', 'value', 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | μ - end |
+| `COMMIT-C7` | `COMMIT[index, value]` | μ - end |
+
 In parallel, we compute ``address_incr` = `address` + 1` ([commit:c:address_incr]) as address of the next byte to commit, and ``count_decr` = `count` - 1` ([commit:c:count_decr]) as the number of bytes that still has to be committed after committing this byte. [commit:c:range_address_incr] and [commit:c:range_count_decr] are included to satisfy [add:a:sum] respectively [add:a:rhs].
 
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `COMMIT-C8` |  | `ADD<address_incr::DWordWL; address, 1::DWordWL>` |  |
+| `COMMIT-C9.i` | i ∈ [0, 3] | `IS_HALF[address_incr[i]]` | μ |
+| `COMMIT-C10` |  | `SUB<count_decr::DWordWL; count, 1::DWordWL>` |  |
+| `COMMIT-C11.i` | i ∈ [0, 3] | `IS_HALF[count_decr[i]]` | μ |
+
 When `count` hits `0`, we should stop performing further recursive calls. We use the `end` bit to indicate these circumstances.
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C12` | `ZERO[end; (65535 - count_decr[0]) + (65535 - count_decr[1]) + (65535 - count_decr[2]) + (65535 - count_decr[3])]` | μ |
+
 *Note*: + Rather than setting ``end` = 1` when ``count` = 0`, we do so when ``count_decr` = -1`. This technique allows `count` to be stored in a `DWordWL` rather than a `DWordHL`, saving two columns. + `forall i in [0, 3]: 65535 - `count_decr`_i >= 0` as a result of [commit:c:range_count_decr]. Hence, $ sum_(i=0)^3 65535 - `count_decr`_i = 0 arrow.l.r.double.long forall i in [0, 3]: `count_decr`_i = 65535 $
 
 When this was not the `end` byte to commit in this recursion sequence, we recursively _Commit the Next Byte_ (`CNB`), specifying the timestamp, address to continue reading and the number of bytes that should still be committed ([commit:c:send_commit_next_byte]). Since that certainly won't be the `first` call in the sequence, we read `address_incr` and `count_decr` from the previous recursion level into `address` and `count` and continue executing the commit.
 
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C13` | `CNB[timestamp, index + 1, address_incr::DWordWL, count_decr::DWordWL]` | μ - end |
+| `COMMIT-C14` | `CNB[timestamp, index, address, count]` | -(μ - first) |
+
 Lastly, we must make sure `first`, `end` and `μ` are bits ([commit:c:range_first], [commit:c:range_end], [commit:c:range_mu]), and that when either ``first` = 1` or ``end` = 1` imply that ``μ` = 1` ([commit:c:first_or_end_implies_mu]). These are required to ensure the multiplicities `-(`μ` - `first`)` and ``μ` - `end`` are binary.
 
+| Tag | Description |
+|-----|-------------|
+| `COMMIT-C15` | `IS_BIT<first>` |
+| `COMMIT-C16` | `IS_BIT<end>` |
+| `COMMIT-C17` | `IS_BIT<μ>` |
+| `COMMIT-C18` | `first` + `end` => `μ` = 1 |
+| | _polynomial:_ `(first + end) * (1 - μ) = 0` |
+
 ## Padding
 
 To pad this chip, use the below data.
diff --git a/scripts/typst_to_md.py b/scripts/typst_to_md.py
index dc40acaeb..be8b09dea 100644
--- a/scripts/typst_to_md.py
+++ b/scripts/typst_to_md.py
@@ -195,12 +195,9 @@ def parse_typst_prose(content: str) -> list:
     """
     elements = []
 
-    # Remove imports and let bindings at the start
+    # Remove multi-line import blocks and top-level #import/#show lines
     content = re.sub(r'^#import[^\n]*\n', '', content, flags=re.MULTILINE)
-    content = re.sub(r'^#let[^\n]*\n', '', content, flags=re.MULTILINE)
     content = re.sub(r'^#show:[^\n]*\n', '', content, flags=re.MULTILINE)
-
-    # Remove multi-line import blocks
     content = re.sub(r'#import[^)]+\)', '', content)
 
     lines = content.split('\n')
@@ -251,6 +248,16 @@ def parse_typst_prose(content: str) -> list:
             i += 1
             continue
 
+        # Detect chip switches: #let chip = load_chip("src/foo.toml", config)
+        load_chip_match = re.match(r'#let\s+chip\s*=\s*load_chip\("([^"]+)"', stripped)
+        if load_chip_match:
+            if current_para:
+                elements.append(('para', ' '.join(current_para)))
+                current_para = []
+            elements.append(('load_chip', load_chip_match.group(1)))
+            i += 1
+            continue
+
         # Skip other Typst commands we don't need
         if stripped.startswith('#') and not stripped.startswith('#rj[') and not stripped.startswith('#et['):
             if current_para:
@@ -522,20 +529,23 @@ def render_assumptions_table(chip: dict, config: dict) -> str:
     return "\n".join(lines)
 
 
-def convert_chapter(typ_path: Path, toml_path: Path, title: str, config: dict) -> str:
+def convert_chapter(typ_path: Path, toml_path: Path, title: str, config: dict, spec_dir: Path = None) -> str:
     """Convert a chapter from .typ and .toml to Markdown."""
     lines = [f"# {title}", ""]
 
-    # Load TOML data
+    # Load TOML data (may be empty for multi-chip files like ecall)
     chip = load_toml(toml_path)
 
-    # Track what sections we've rendered from TOML
-    rendered_columns = False
-    rendered_assumptions = False
-    rendered_constraints = False
-    rendered_constraint_groups = set()
-    # Counter incremented in render order (matching Typst's global figure counter)
-    constraint_counter = 1
+    def reset_chip_state():
+        return {
+            'rendered_columns': False,
+            'rendered_assumptions': False,
+            'rendered_constraints': False,
+            'rendered_constraint_groups': set(),
+            'constraint_counter': 1,
+        }
+
+    state = reset_chip_state()
 
     # Parse Typst prose
     if typ_path.exists():
@@ -543,36 +553,52 @@ def convert_chapter(typ_path: Path, toml_path: Path, title: str, config: dict) -
         elements = parse_typst_prose(typst_content)
 
         for elem_type, content in elements:
+            if elem_type == 'load_chip':
+                # Multi-chip file: switch active chip and reset per-chip state
+                chip_toml_path = spec_dir / content if spec_dir else Path(content)
+                chip = load_toml(chip_toml_path)
+                state = reset_chip_state()
+                continue
+
+            rendered_columns = state['rendered_columns']
+            rendered_assumptions = state['rendered_assumptions']
+            rendered_constraints = state['rendered_constraints']
+            rendered_constraint_groups = state['rendered_constraint_groups']
+            constraint_counter = state['constraint_counter']
+
             if elem_type.startswith('h'):
                 level = int(elem_type[1])
                 lines.append("")
-                lines.append("#" * level + " " + content)
+                # Replace Typst variable references in headings with chip name if available
+                heading_text = content
+                if chip and chip.get('name'):
+                    heading_text = re.sub(r'`[^`]*`\s*chip\b', f"`{chip['name']}` chip", heading_text)
+                lines.append("#" * level + " " + heading_text)
                 lines.append("")
 
                 # Render TOML data after relevant headings
                 content_lower = content.lower()
                 if 'column' in content_lower and chip and not rendered_columns:
                     lines.append(render_variables_table(chip, config))
-                    rendered_columns = True
+                    state['rendered_columns'] = True
                 elif 'assumption' in content_lower and chip and not rendered_assumptions:
                     lines.append(render_assumptions_table(chip, config))
-                    rendered_assumptions = True
+                    state['rendered_assumptions'] = True
                 elif content_lower == "constraints" and chip:
-                    # Mark that we've hit the Constraints section
-                    rendered_constraints = True
+                    state['rendered_constraints'] = True
 
             elif elem_type == 'render_constraints' and chip:
                 # content is a list of group names to render (in order)
                 group_names = content
                 for group_name in group_names:
-                    if group_name not in rendered_constraint_groups:
+                    if group_name not in state['rendered_constraint_groups']:
                         # Use the running render-order counter so numbering matches Typst
-                        group_table = render_constraints_table(chip, config, group_filter=group_name, skip_heading=True, start_counter=constraint_counter)
+                        group_table = render_constraints_table(chip, config, group_filter=group_name, skip_heading=True, start_counter=state['constraint_counter'])
                         if group_table.strip():
                             lines.append(group_table)
-                        rendered_constraint_groups.add(group_name)
+                        state['rendered_constraint_groups'].add(group_name)
                     # Always advance the counter for this group (rendered or already seen)
-                    constraint_counter += len(chip.get("constraints", {}).get(group_name, []))
+                    state['constraint_counter'] += len(chip.get("constraints", {}).get(group_name, []))
 
             elif elem_type == 'para':
                 lines.append(content)
@@ -582,7 +608,13 @@ def convert_chapter(typ_path: Path, toml_path: Path, title: str, config: dict) -
                 lines.append(f"> **Note:** {content}")
                 lines.append("")
 
-    # Render any TOML data that wasn't triggered by prose headings
+    # Render any TOML data that wasn't triggered by prose headings (for the last active chip)
+    rendered_columns = state['rendered_columns']
+    rendered_assumptions = state['rendered_assumptions']
+    rendered_constraints = state['rendered_constraints']
+    rendered_constraint_groups = state['rendered_constraint_groups']
+    constraint_counter = state['constraint_counter']
+
     if chip:
         if chip.get("variables") and not rendered_columns:
             lines.append("## Columns")
@@ -675,7 +707,7 @@ def main():
         print(f"Converting: {name} ({title})")
 
         try:
-            markdown = convert_chapter(typ_path, toml_path, title, config)
+            markdown = convert_chapter(typ_path, toml_path, title, config, spec_dir=spec_dir)
 
             output_file = output_dir / f"{name}.md"
             output_file.write_text(markdown)

From 0b71ac2aed638765688d2c65be21dc74e274447e Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Tue, 10 Mar 2026 15:43:50 -0300
Subject: [PATCH 079/105] docs: remove generated PAGE Columns/Constraints sections from memory.md and spec_full.md

---
 docs/spec/memory.md    | 35 +----------------------------------
 docs/spec/spec_full.md | 33 ---------------------------------
 2 files changed, 1 insertion(+), 67 deletions(-)

diff --git a/docs/spec/memory.md b/docs/spec/memory.md
index 9faad803f..a38debcfa 100644
--- a/docs/spec/memory.md
+++ b/docs/spec/memory.md
@@ -78,37 +78,4 @@ The initial and final state of registers can be entirely known by the verifier,
 
 = Future topics of interest
 
-- Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research) - Double check whether IS_BYTE constraints are needed for fini
-
-## Columns
-
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `offset` | `RowIndex` | The offset from the page base address. |
-| `init` | `Byte` | The initial value of this address. Can be replaced by a constant zero for zero-initialization |
-| `fini` | `Byte` | The final value this address took |
-| `timestamp` | `DWordWL` | The timestamp at which this address was last accessed |
-
-### Virtual
-
-| Name | Type | Description |
-|------|------|-------------|
-| `address` | `DWordWL` | Adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table. |
-
-**Definition of `address`:**
-```
-address := page + offset * 1::DWordWL
-```
-
-## Constraints
-
-### all
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `PAGE-C1` | `IS_BYTE[init]` | 1 |
-| `PAGE-C2` | `IS_BYTE[fini]` | 1 |
-| `PAGE-C3` | `memory[0, address, 0::DWordWL, init]` | -1 |
-| `PAGE-C4` | `memory[0, address, timestamp, fini]` | 1 |
\ No newline at end of file
+- Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research) - Double check whether IS_BYTE constraints are needed for fini
\ No newline at end of file
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index 564d372dd..a1a8d6792 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -82,39 +82,6 @@ The initial and final state of registers can be entirely known by the verifier,
 
 - Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research) - Double check whether IS_BYTE constraints are needed for fini
 
-## Columns
-
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `offset` | `RowIndex` | The offset from the page base address. |
-| `init` | `Byte` | The initial value of this address. Can be replaced by a constant zero for zero-initialization |
-| `fini` | `Byte` | The final value this address took |
-| `timestamp` | `DWordWL` | The timestamp at which this address was last accessed |
-
-### Virtual
-
-| Name | Type | Description |
-|------|------|-------------|
-| `address` | `DWordWL` | Adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table. |
-
-**Definition of `address`:**
-```
-address := page + offset * 1::DWordWL
-```
-
-## Constraints
-
-### all
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `PAGE-C1` | `IS_BYTE[init]` | 1 |
-| `PAGE-C2` | `IS_BYTE[fini]` | 1 |
-| `PAGE-C3` | `memory[0, address, 0::DWordWL, init]` | -1 |
-| `PAGE-C4` | `memory[0, address, timestamp, fini]` | 1 |
-
 ---
 
 # Variables

From 376a726900b36b01e821d1b5bab6c0389daaf98e Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Fri, 13 Mar 2026 12:50:09 +0100
Subject: [PATCH 080/105] spec: Losing some MEMW weight (#398)

---
 spec/memw.typ              |  42 +++++--
 spec/src/memw.toml         | 107 ++++-------------
 spec/src/memw_aligned.toml | 228 +++++++++++++++++++++++++++++++++++++
 spec/tooling/chip.py       |   2 +-
 4 files changed, 285 insertions(+), 94 deletions(-)
 create mode 100644 spec/src/memw_aligned.toml

diff --git a/spec/memw.typ b/spec/memw.typ
index 4b644218a..57907e26c 100644
--- a/spec/memw.typ
+++ b/spec/memw.typ
@@ -38,6 +38,13 @@ we document it here, keeping the type information as a reading help.
 
 = Constraints
 
+We can compute the addresses for the later bytes based on a single bit each,
+indicating whether adding `i` to `base_address` overflows the lower limb.
+We can safely assume that additions for which this bit is not correctly set
+will have either an overflow on the upper or lower word, and hence not match
+any existing memory tokens, which are only initialized for correctly formatted
+and range-checked doublewords (see @memory).
+
 #render_constraint_table(chip, config, groups: "consistency")
 
 As long as `timestamp` is properly range-checked, the presence of `old_timestamp`
@@ -45,9 +52,9 @@ in the memory argument automatically ensures appropriate range checking
 (as long as no external entities provide negative multiplicities without range checking the timestamp).
 This ensures the assumptions for `LT` are satisfied.
 
-We additionally check that the address does not overflow
-for more significant bytes of the access.
-#render_constraint_table(chip, config, groups: "overflow")
+There is no need to check that the address does not overflow,
+as our address calculations are not performed modulo $2^64$ here,
+and any overflow will result in an address without matching initialization.
 
 The chip adds the following tuples to the lookup argument,
 to effectuate that part of the memory argument.
@@ -56,11 +63,32 @@ to effectuate that part of the memory argument.
 This chip contributes the following to the lookup argument.
 #render_constraint_table(chip, config, groups: "output")
 
+= Read-size aligned fast path
+
+#let alignedchip = load_chip("src/memw_aligned.toml", config)
+#let aligned = raw(alignedchip.name)
+
+When a memory access happens at an address with proper alignment
+(that is, enough trailing zeros) for its access size, and all accessed
+elements were last accessed at the same timestamp, we can 
+instead use the #aligned chip to save on total column count.
+The saving comes from only requiring a single old timestamp to be stored,
+as well as being able to guarantee that all values of `add_limb_overflow` would be zero.
+A minor extra cost is introduced in the form of a check that the alignment is indeed correct,
+and the corresponding decomposition of the `base_address`.
+
+Further logic remains essentially the same, so we briefly present the relevant tables for this chip.
+#let nr_variables = total_nr_variables(alignedchip)
+#let nr_columns = total_nr_instantiated_columns(alignedchip, config)
+
+The #aligned chip only needs #nr_variables variables, expressed through #nr_columns columns.
+#render_chip_column_table(alignedchip, config)
+#render_chip_assumptions(alignedchip, config)
+#render_constraint_table(alignedchip, config)
+
 
 = Future optimization ideas
 
-- Fast path for aligned memory access where all bytes have the same old timestamp
-- MEMB chip that deals does a one-byte write to remove old_timestamp from here (uncertain tradeoffs)
-- Compute `base_address[1] + 1` once and have high words of `address_add` as Words
-- Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1)
+- `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs)
+- Additional fast path for registers? (Always guaranteed same timestamp, alignment could be an assumption, always only two values)
 - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
diff --git a/spec/src/memw.toml b/spec/src/memw.toml
index 0ae3b9410..c9519e115 100644
--- a/spec/src/memw.toml
+++ b/spec/src/memw.toml
@@ -48,9 +48,9 @@ Only the elements corresponding to the `writeN` bits are guaranteed"""
 # Auxiliary
 
 [[variables.auxiliary]]
-name = "address_add"
-type = ["DWordHL", 7]
-desc = "`address_add[i] = base_address + i + 1`"
+name = "add_limb_overflow"
+type = ["Bit", 7]
+desc = "Whether adding `i + 1` to `base_address[0]` as a field element reaches or exceeds $2^32$"
 
 [[variables.auxiliary]]
 name = "old_timestamp"
@@ -71,6 +71,15 @@ type = "Bit"
 desc = "writing at least 4 bytes"
 def = ["+", "write4", "write8"]
 
+[[variables.virtual]]
+name = "address_add"
+type = ["DWordWL", 7]
+desc = "`address_add[i] = base_address + i + 1`"
+def.iter = ["i", 0, 6]
+def.poly = ["arr",
+  ["+", ["idx", "base_address", 0], "i", 1, ["*", ["-", ["^", 2, 32]], ["idx", "add_limb_overflow", "i"]]],
+  ["+", ["idx", "base_address", 1], ["idx", "add_limb_overflow", "i"]]]
+
 [[variables.virtual]]
 name = "μ_sum"
 type = "Bit"
@@ -126,56 +135,9 @@ poly = ["*", "w2", ["not", "μ_sum"]]
 
 [[constraints.consistency]]
 kind = "template"
-tag = "ADD"
-input = ["base_address", ["cast", 1, "DWordWL"]]
-output = ["cast", ["idx", "address_add", 0], "DWordWL"]
-cond = "w2"
-
-[[constraints.consistency]]
-kind = "template"
-tag = "ADD"
-input = ["base_address", ["cast", ["+", "i", 1], "DWordWL"]]
-output = ["cast", ["idx", "address_add", "i"], "DWordWL"]
-iter = ["i", 1, 2]
-cond = "w4"
-
-[[constraints.consistency]]
-kind = "template"
-tag = "ADD"
-input = ["base_address", ["cast", ["+", "i", 1], "DWordWL"]]
-output = ["cast", ["idx", "address_add", "i"], "DWordWL"]
-iter = ["i", 3, 6]
-cond = "write8"
-
-[[constraints.consistency]]
-kind = "interaction"
-tag = "IS_HALF"
-input = [["idx", ["idx", "address_add", "i"], "j"]]
-iters = [
-  ["i", 0, 0],
-  ["j", 0, 3],
-]
-multiplicity = "w2"
-
-[[constraints.consistency]]
-kind = "interaction"
-tag = "IS_HALF"
-input = [["idx", ["idx", "address_add", "i"], "j"]]
-iters = [
-  ["i", 1, 2],
-  ["j", 0, 3],
-]
-multiplicity = "w4"
-
-[[constraints.consistency]]
-kind = "interaction"
-tag = "IS_HALF"
-input = [["idx", ["idx", "address_add", "i"], "j"]]
-iters = [
-  ["i", 3, 6],
-  ["j", 0, 3],
-]
-multiplicity = "write8"
+tag = "IS_BIT"
+input = [["idx", "add_limb_overflow", "i"]]
+iter = ["i", 0, 6]
 
 [[constraints.consistency]]
 kind = "interaction"
@@ -207,33 +169,6 @@ output = 1
 iter = ["i", 4, 7]
 multiplicity = "write8"
 
-
-[[constraint_groups]]
-name = "overflow"
-prefix = "R"
-
-[[constraints.overflow]]
-kind = "interaction"
-tag = "LT"
-input = ["base_address", ["cast", ["idx", "address_add", 0], "DWordWL"], 0]
-output = 1
-multiplicity = "write2"
-
-[[constraints.overflow]]
-kind = "interaction"
-tag = "LT"
-input = ["base_address", ["cast", ["idx", "address_add", 2], "DWordWL"], 0]
-output = 1
-multiplicity = "write4"
-
-[[constraints.overflow]]
-kind = "interaction"
-tag = "LT"
-input = ["base_address", ["cast", ["idx", "address_add", 6], "DWordWL"], 0]
-output = 1
-multiplicity = "write8"
-
-
 [[constraint_groups]]
 name = "memory"
 prefix = "M"
@@ -253,40 +188,40 @@ multiplicity = ["-", "μ_sum"]
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["cast", ["idx", "address_add", 0], "DWordWL"], ["idx", "old_timestamp", 1], ["idx", "old", 1]]
+input = ["is_register", ["idx", "address_add", 0], ["idx", "old_timestamp", 1], ["idx", "old", 1]]
 multiplicity = "w2"
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["cast", ["idx", "address_add", 0], "DWordWL"], "timestamp", ["idx", "value", 1]]
+input = ["is_register", ["idx", "address_add", 0], "timestamp", ["idx", "value", 1]]
 multiplicity = ["-", "w2"]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["cast", ["idx", "address_add", ["-", "i", 1]], "DWordWL"], ["idx", "old_timestamp", "i"], ["idx", "old", "i"]]
+input = ["is_register", ["idx", "address_add", ["-", "i", 1]], ["idx", "old_timestamp", "i"], ["idx", "old", "i"]]
 multiplicity = "w4"
 iter = ["i", 2, 3]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["cast", ["idx", "address_add", ["-", "i", 1]], "DWordWL"], "timestamp", ["idx", "value", "i"]]
+input = ["is_register", ["idx", "address_add", ["-", "i", 1]], "timestamp", ["idx", "value", "i"]]
 multiplicity = ["-", "w4"]
 iter = ["i", 2, 3]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["cast", ["idx", "address_add", ["-", "i", 1]], "DWordWL"], ["idx", "old_timestamp", "i"], ["idx", "old", "i"]]
+input = ["is_register", ["idx", "address_add", ["-", "i", 1]], ["idx", "old_timestamp", "i"], ["idx", "old", "i"]]
 multiplicity = "write8"
 iter = ["i", 4, 7]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["cast", ["idx", "address_add", ["-", "i", 1]], "DWordWL"], "timestamp", ["idx", "value", "i"]]
+input = ["is_register", ["idx", "address_add", ["-", "i", 1]], "timestamp", ["idx", "value", "i"]]
 multiplicity = ["-", "write8"]
 iter = ["i", 4, 7]
 
diff --git a/spec/src/memw_aligned.toml b/spec/src/memw_aligned.toml
new file mode 100644
index 000000000..715f57c85
--- /dev/null
+++ b/spec/src/memw_aligned.toml
@@ -0,0 +1,228 @@
+name = "MEMW-A"
+
+# Input
+
+[[variables.input]]
+name = "is_register"
+type = "Bit"
+desc = "Whether the address represents a register index"
+
+[[variables.input]]
+name = "base_address_high"
+type = "Word"
+desc = "The high word of the base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is"
+
+[[variables.input]]
+name = "base_address_mid"
+type = "Half"
+desc = "The middle halfword of the base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is"
+
+[[variables.input]]
+name = "base_address_low"
+type = ["Byte", 2]
+desc = "The low bytes of the base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is"
+
+[[variables.input]]
+name = "value"
+type = ["BaseField", 8]
+desc = "The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s"
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "The timestamp at which this memory access is said to occur"
+
+[[variables.input]]
+name = "write2"
+type = "Bit"
+desc = "Whether to write exactly 2 values"
+
+[[variables.input]]
+name = "write4"
+type = "Bit"
+desc = "Whether to write exactly 4 values"
+
+[[variables.input]]
+name = "write8"
+type = "Bit"
+desc = "Whether to write exactly 8 values"
+
+# Output
+
+[[variables.output]]
+name = "old"
+type = ["BaseField", 8]
+desc = """The old value written at `base_address`. See `value` for information about representation.
+Only the elements corresponding to the `writeN` bits are guaranteed"""
+
+# Auxiliary
+
+[[variables.auxiliary]]
+name = "old_timestamp"
+type = "DWordWL"
+desc = "The timestamp at which the address was last accessed"
+
+# Virtual
+
+[[variables.virtual]]
+name = "base_address"
+type = "DWordWL"
+desc = "Recomposing the base address from its parts"
+defs = {idx = "i", polys = [["+", ["*", ["^", 2, 16], "base_address_mid"], ["*", ["^", 2, 8], ["idx", "base_address_low", 1]], ["idx", "base_address_low", 0]], "base_address_high"]}
+
+[[variables.virtual]]
+name = "w2"
+type = "Bit"
+desc = "writing at least 2 bytes"
+def = ["+", "write2", "write4", "write8"]
+
+[[variables.virtual]]
+name = "w4"
+type = "Bit"
+desc = "writing at least 4 bytes"
+def = ["+", "write4", "write8"]
+
+[[variables.virtual]]
+name = "μ_sum"
+type = "Bit"
+desc = ""
+def = ["+", "μ_read", "μ_write"]
+
+# Multiplicity
+
+[[variables.multiplicity]]
+name = "μ_read"
+type = "Bit"
+desc = "Whether we are performing a read (and hence return `old`)"
+
+[[variables.multiplicity]]
+name = "μ_write"
+type = "Bit"
+desc = "Whether we are performing a write (and hence do not return `old`)"
+
+[[assumptions]]
+desc = "`IS_WORD[base_address_high]`"
+
+[[assumptions]]
+desc = "`IS_HALF[base_address_mid]`"
+
+[[assumptions]]
+desc = "`IS_BYTE[base_address_low[i]]`"
+iter = ["i", 0, 1]
+
+[[assumptions]]
+desc = "`IS_BIT<write2>`"
+
+[[assumptions]]
+desc = "`IS_BIT<write4>`"
+
+[[assumptions]]
+desc = "`IS_BIT<write8>`"
+
+[[assumptions]]
+desc = "`IS_BIT<write2 + write4 + write8>`"
+
+[[assumptions]]
+desc = "`IS_WORD[timestamp[i]]`"
+iter = ["i", 0, 1]
+
+
+[[constraint_groups]]
+name = "consistency"
+
+[[constraints.consistency]]
+kind = "template"
+tag = "AND_BYTE"
+input = [["idx", "base_address_low", 0], ["+", ["*", "write2", 1], ["*", "write4", 3], ["*", "write8", 7]]]
+output = 0
+
+[[constraints.consistency]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ_sum"]
+
+[[constraints.consistency]]
+kind = "arith"
+constraint = "$#`w2` => #`μ_sum`$"
+poly = ["*", "w2", ["not", "μ_sum"]]
+
+[[constraints.consistency]]
+kind = "interaction"
+tag = "LT"
+input = ["old_timestamp", "timestamp", 0]
+output = 1
+multiplicity = "μ_sum"
+
+[[constraint_groups]]
+name = "memory"
+prefix = "M"
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", "base_address", "old_timestamp", ["idx", "old", 0]]
+multiplicity = "μ_sum"
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", "base_address", "timestamp", ["idx", "value", 0]]
+multiplicity = ["-", "μ_sum"]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["+", "base_address", ["cast", 1, "DWordWL"]], "old_timestamp", ["idx", "old", 1]]
+multiplicity = "w2"
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["+", "base_address", ["cast", 1, "DWordWL"]], "timestamp", ["idx", "value", 1]]
+multiplicity = ["-", "w2"]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["+", "base_address", ["cast", "i", "DWordWL"]], "old_timestamp", ["idx", "old", "i"]]
+multiplicity = "w4"
+iter = ["i", 2, 3]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["+", "base_address", ["cast", "i", "DWordWL"]], "timestamp", ["idx", "value", "i"]]
+multiplicity = ["-", "w4"]
+iter = ["i", 2, 3]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["+", "base_address", ["cast", "i", "DWordWL"]], "old_timestamp", ["idx", "old", "i"]]
+multiplicity = "write8"
+iter = ["i", 4, 7]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "memory"
+input = ["is_register", ["+", "base_address", ["cast", "i", "DWordWL"]], "timestamp", ["idx", "value", "i"]]
+multiplicity = ["-", "write8"]
+iter = ["i", 4, 7]
+
+
+[[constraint_groups]]
+name = "output"
+prefix = "O"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "MEMW"
+input = ["is_register", "base_address", "value", "timestamp", "write2", "write4", "write8"]
+output = "old"
+multiplicity = "μ_read"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "MEMW"
+input = ["is_register", "base_address", "value", "timestamp", "write2", "write4", "write8"]
+multiplicity = "μ_write"
diff --git a/spec/tooling/chip.py b/spec/tooling/chip.py
index 58deb4b3c..688743754 100644
--- a/spec/tooling/chip.py
+++ b/spec/tooling/chip.py
@@ -627,7 +627,7 @@ class VirtualVariable(Variable):
     def_: VirtualDef
 
     def __init__(self, config: Config, category: str, data: dict):
-        assert_no_unexpected(data, set(Variable.__annotations__.keys()) | {"def"})
+        assert_no_unexpected(data, (set(Variable.__annotations__.keys()) | {"def"}) - {"pad"})
         reporter.asserts("def" in data, f"Missing def for virtual column: {data!r}")
         def_ = data.pop("def", {})
         super().__init__(config, category, data)

From 42b4c9f58d299dcad942a0da3bef9fda8a42402f Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Fri, 13 Mar 2026 12:50:31 +0100
Subject: [PATCH 081/105] spec: Some fixes and improvements for SHIFT (#400)

* spec: Some fixes for SHIFT

Closes: #389

* spec: Merge HWSL with HWSLC, to simplify SHIFT

Closes: #119

* typo

Co-authored-by: Cyprien de Saint Guilhem <c.desaintguilhem@gmail.com>

---------

Co-authored-by: Cyprien de Saint Guilhem <c.desaintguilhem@gmail.com>
---
 spec/bitwise.typ         |  3 +--
 spec/shift.typ           | 23 +++++++++----------
 spec/src/bitwise.toml    | 14 +-----------
 spec/src/shift.toml      | 48 ++++++++++++++++++++--------------------
 spec/src/signatures.toml |  9 +-------
 5 files changed, 38 insertions(+), 59 deletions(-)

diff --git a/spec/bitwise.typ b/spec/bitwise.typ
index d0b3d89e2..06a2ce822 100644
--- a/spec/bitwise.typ
+++ b/spec/bitwise.typ
@@ -42,5 +42,4 @@ The following ideas may prove to be optimizations for the #bitwise chip:
 + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`.
   Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`).
   This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check.
-+ Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`, `ZERO`) lookups in separate tables.
-+ Combine `HWSL` and `HWSLC` into a single lookup (see also \#119).
++ Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
diff --git a/spec/shift.typ b/spec/shift.typ
index b705adb32..289ade1d7 100644
--- a/spec/shift.typ
+++ b/spec/shift.typ
@@ -59,25 +59,24 @@ Here, we start with discussing the _logical_ left/right shift operations only; t
 
 == First phase
 We zoom in on the first step.
-Here, we make use of the two lookup operations 
-- $#`HWSL[x: Half, y: B4]` := (#`x` #`<<` #`y`) mod 2^16$ (short for "HalfWord Shift Left"), and
-- $#`HWSLC[x: Half, y: B4]` := #`x` #`>>` (16-#`y`)$ (short for "HalfWord Shift Left's Carry")
-Note here that one can use these two lookups to compute `out: Half[4] := in << y` as:
+Here, we make use of the lookup operation `HWSL` (short for "HalfWord Shift Left"):
+$ #`HWSL[x: Half, y: B4]` := [(#`x` #`<<` #`y`) mod 2^16, #`x` #`>>` (16 - #`y`)]. $
+One can use this to compute `out: Half[4] := in << y` as:
 $
   #`out[`i#`]` = cases(
-    #`HWSL[in[`0#`], y]` &"if" i = 0,
-    #`HWSL[in[`i#`], y] | HWSLC[in[`i-1#`], y]` &"if" i in [1, 3]   
+    #`HWSL[in[`0#`], y]`_0 &"if" i = 0,
+    #`HWSL[in[`i#`], y]`_0 | #`HWSL[in[`i-1#`], y]`_1 &"if" i in [1, 3]   
   )
 $
 as long as $#`y` < 16$.
 Observing that 
-$#`HWSL[x,` 16-#`y]` = (#`x` #`<<` (16-#`y`)) mod 2^16$, and
-$#`HWSLC[x,` 16-#`y]` = #`x` #`>>` #`y`$ for $#`y` in [1, 15]$,
-one can also use these lookups to compute `out := in >> y` as
+$#`HWSL[x,` 16-#`y]`_0 = (#`x` #`<<` (16-#`y`)) mod 2^16$, and
+$#`HWSL[x,` 16-#`y]`_1 = #`x` #`>>` #`y`$ for $#`y` in [1, 15]$,
+one can also use it to compute `out := in >> y` as
 $
   #`out[`i#`]` = cases(
-    #`HWSLC[in[`i#`],` 16-#`y] | HWSL[in[`i+1#`], y]` &"if" i in [0, 2],
-    #`HWSLC[in[`3#`],` 16-#`y]` &"if" i = 3
+    #`HWSL[in[`i#`],` 16-#`y]`_1 | #`HWSL[in[`i+1#`], y]`_0 &"if" i in [0, 2],
+    #`HWSL[in[`3#`],` 16-#`y]`_1 &"if" i = 3
   )
 $
 as long as $0 < #`y` < 16$.
@@ -90,7 +89,7 @@ $
     (16-#`shift`) mod 16 & "when shifting right"
   ),  
 $
-it only takes some rearranging and combining of the values $#`X[`i#`] := HWSL[in[`i#`], bit_shift]`$ and $#`Y[`i#`] := HWSLC[in[`i#`], bit_shift]`$ to form the limbs of $#`in <</>> shift` mod 16$.
+it only takes some rearranging and combining of the values $#`X[`i#`] := HWSL[in[`i#`], bit_shift]`_0$ and $#`Y[`i#`] := HWSL[in[`i#`], bit_shift]`_1$ to form the limbs of $#`in <</>> shift` mod 16$.
 In the remaining case that $#`right` = 1$ and $#`shift` = 0 mod 16$, the limbs of $#`in <</>> shift` mod 16$ simply match those of `in`.
 
 == Second phase
diff --git a/spec/src/bitwise.toml b/spec/src/bitwise.toml
index 75e8faee4..67d73facd 100644
--- a/spec/src/bitwise.toml
+++ b/spec/src/bitwise.toml
@@ -116,11 +116,6 @@ name = "μ_HWSL"
 type = "BaseField"
 desc = ""
 
-[[variables.multiplicity]]
-name = "μ_HWSLC"
-type = "BaseField"
-desc = ""
-
 
 [[constraint_groups]]
 name = "contributions"
@@ -189,12 +184,5 @@ multiplicity = ["-", "μ_IS_B20"]
 kind = "interaction"
 tag = "HWSL"
 input = [["+", "X", ["*", 256, "Y"]], "Z"]
-output = "SLL"
+output = ["arr", "SLL", "SLLC"]
 multiplicity = ["-", "μ_HWSL"]
-
-[[constraints.contributions]]
-kind = "interaction"
-tag = "HWSLC"
-input = [["+", "X", ["*", 256, "Y"]], "Z"]
-output = "SLLC"
-multiplicity = ["-", "μ_HWSLC"]
diff --git a/spec/src/shift.toml b/spec/src/shift.toml
index 45c00b064..bbe22a5d9 100644
--- a/spec/src/shift.toml
+++ b/spec/src/shift.toml
@@ -74,13 +74,22 @@ desc = "scratch variable."
 pad = ["arr", 0, 0, 0, 0]
 
 [[variables.auxiliary]]
-name = "limb_shift"
-type = ["Bit", 4]
-desc = "One-hot vector indicating whether $floor.l #`shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $#`word_instr` = 1$ and $4$ otherwise."
-pad = ["arr", 0, 0, 0, 0]
+name = "limb_shift_raw"
+type = ["Bit", 3]
+desc = "One-hot vector indicating whether $floor.l #`shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $#`word_instr` = 1$ and $4$ otherwise. These columns store the first 3 values, and the 4th is derived from the one-hot property."
+pad = ["arr", 0, 0, 0]
 
 # Virtual
 
+[[variables.virtual]]
+name = "limb_shift"
+type = ["Bit", 4]
+desc = ""
+def = {idx = "i", polys = [
+    {iter = [0, 2], poly = ["idx", "limb_shift_raw", "i"]},
+    {iter = 3, poly = ["-", 1, ["sum", ["=", "j", 0], 2, ["idx", "limb_shift_raw", "j"]]]},
+]}
+
 [[variables.virtual]]
 name = "extension"
 type = "Half"
@@ -118,7 +127,7 @@ def = {idx="i", iter=[0, 3], poly=["+", ["idx", "Y", "i"], ["idx", "X", ["+", "i
 name = "shifted"
 type = "DWordHL"
 desc = "$#`in <</>>/>>>` (#`shift` mod 32 dot (2 - #`word_instr`))$"
-def = {idx="i", iter=[0, 3], poly=["+", ["*", "left", ["sum", ["=", "j", 0], "i", ["*", ["idx", "limb_shift", "j"], ["idx", "intra_limb_left", ["-", "i", "j"]]]]], ["*", "right", ["+", ["sum", ["=", "j", 0], ["-", 3, "i"], ["*", ["idx", "limb_shift", "j"], ["idx", "intra_limb_right", ["+", "i", "j"]]]], ["*", "extension", ["sum", ["=", "j", ["-", 3, "i"]], 3, ["idx", "limb_shift", "j"]]]]]]}
+def = {idx="i", iter=[0, 3], poly=["+", ["*", "left", ["sum", ["=", "j", 0], "i", ["*", ["idx", "limb_shift", "j"], ["idx", "intra_limb_left", ["-", "i", "j"]]]]], ["*", "right", ["+", ["sum", ["=", "j", 0], ["-", 3, "i"], ["*", ["idx", "limb_shift", "j"], ["idx", "intra_limb_right", ["+", "i", "j"]]]], ["*", "extension", ["sum", ["=", "j", ["-", 4, "i"]], 3, ["idx", "limb_shift", "j"]]]]]]}
 
 # Multiplicities
 
@@ -192,7 +201,7 @@ multiplicity = "left"
 [[constraints.bit_shift]]
 kind = "interaction"
 tag = "AND_BYTE"
-input = [["-", ["^", 2, 8], "shift"], 0x0F]
+input = [["-", ["^", 2, 8], ["*", 16, "zbs"], "shift"], 0x0F]
 output = "bit_shift"
 ref = "shift:c:bit_shift_if_right"
 multiplicity = "right"
@@ -213,7 +222,7 @@ name = "intra_limb_shift"
 kind = "interaction"
 tag = "HWSL"
 input = [["idx", "in", "i"], "bit_shift"]
-output = ["idx", "X", "i"]
+output = ["arr", ["idx", "X", "i"], ["idx", "Y", "i"]]
 iter = ["i", 0, 3]
 ref = "shift:c:hwsl_if_not_zero"
 multiplicity = ["not", "zbs"]
@@ -225,11 +234,18 @@ poly = ["*", "zbs", ["-", ["idx", "X", "i"], ["*", ["idx", "in", "i"], "left"]]]
 iter = ["i", 0, 3]
 ref = "shift:c:zbs_implies_X"
 
+[[constraints.intra_limb_shift]]
+kind = "arith"
+constraint = "$#`zbs` => #`Y[i]` = #`in[i]` dot #`right`$"
+poly = ["*", "zbs", ["-", ["idx", "Y", "i"], ["*", ["idx", "in", "i"], "right"]]]
+iter = ["i", 0, 3]
+ref = "shift:c:zbs_implies_Y"
+
 [[constraints.intra_limb_shift]]
 kind = "interaction"
 tag = "HWSL"
 input = ["extension", "bit_shift"]
-output = ["idx", "X", 4]
+output = ["arr", ["idx", "X", 4], ["-", "extension", ["idx", "X", 4]]]
 ref = "shift:c:hwsl_x4_if_not_zero"
 multiplicity = ["not", "zbs"]
 
@@ -239,22 +255,6 @@ constraint = "$#`zbs` => #`X[4]` = 0$"
 poly = ["*", "zbs", ["idx", "X", 4]]
 ref = "shift:c:zbs_implies_X_4"
 
-[[constraints.intra_limb_shift]]
-kind = "interaction"
-tag = "HWSLC"
-input = [["idx", "in", "i"], "bit_shift"]
-output = ["idx", "Y", "i"]
-iter = ["i", 0, 3]
-ref = "shift:c:hwslc_if_not_zero"
-multiplicity = ["not", "zbs"]
-
-[[constraints.intra_limb_shift]]
-kind = "arith"
-constraint = "$#`zbs` => #`Y[i]` = #`in[i]` dot #`right`$"
-poly = ["*", "zbs", ["-", ["idx", "Y", "i"], ["*", ["idx", "in", "i"], "right"]]]
-iter = ["i", 0, 3]
-ref = "shift:c:zbs_implies_Y"
-
 
 [[constraint_groups]]
 name = "limb_shifting"
diff --git a/spec/src/signatures.toml b/spec/src/signatures.toml
index 69a839d2e..17ecd3933 100644
--- a/spec/src/signatures.toml
+++ b/spec/src/signatures.toml
@@ -180,14 +180,7 @@ input = ["B20"]
 tag = "HWSL"
 kind = "interaction"
 input = ["Half", "B4"]
-output = "Half"
-
-# HWSLC[res; X, shift]
-[[signatures]]
-tag = "HWSLC"
-kind = "interaction"
-input = ["Half", "B4"]
-output = "Half"
+output = ["Half", 2]
 
 # The actual memory tokens, see MEMW and PAGE
 [[signatures]]

From 268ac3b44913c57a733312fefc33611ee130a8c2 Mon Sep 17 00:00:00 2001
From: Nicole <nicole.graus@lambdaclass.com>
Date: Fri, 13 Mar 2026 15:36:20 -0300
Subject: [PATCH 082/105] update spec

---
 docs/spec/bitwise.md   |   6 +--
 docs/spec/memw.md      |  63 ++++++++++++-----------
 docs/spec/shift.md     |  42 +++++++++-------
 docs/spec/spec_full.md | 111 ++++++++++++++++++++++-------------------
 4 files changed, 120 insertions(+), 102 deletions(-)

diff --git a/docs/spec/bitwise.md b/docs/spec/bitwise.md
index aede36293..91f2b127c 100644
--- a/docs/spec/bitwise.md
+++ b/docs/spec/bitwise.md
@@ -10,7 +10,7 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 
 = Lookup This chip adds the following interactions to the lookup:
 
-= Areas of Optimization The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`, `ZERO`) lookups in separate tables. + Combine `HWSL` and `HWSLC` into a single lookup (see also \).
+= Areas of Optimization The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
 
 ## Columns
 
@@ -49,7 +49,6 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 | `μ_IS_HALF` | `BaseField` |  |
 | `μ_IS_B20` | `BaseField` |  |
 | `μ_HWSL` | `BaseField` |  |
-| `μ_HWSLC` | `BaseField` |  |
 
 ## Constraints
 
@@ -66,5 +65,4 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 | `BITWISE-C7` | `IS_BYTE[X]` | -μ_IS_BYTE |
 | `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
 | `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
-| `BITWISE-C10` | `HWSL[SLL; X + 256 * Y, Z]` | -μ_HWSL |
-| `BITWISE-C11` | `HWSLC[SLLC; X + 256 * Y, Z]` | -μ_HWSLC |
\ No newline at end of file
+| `BITWISE-C10` | `HWSL[[SLL, SLLC]; X + 256 * Y, Z]` | -μ_HWSL |
\ No newline at end of file
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
index c78f56a32..4862b9ebf 100644
--- a/docs/spec/memw.md
+++ b/docs/spec/memw.md
@@ -12,55 +12,54 @@ Our assumptions do not explicitly cover any range checks for the `is_register` a
 
 = Constraints
 
+We can compute the addresses for the later bytes based on a single bit each, indicating whether adding `i` to `base_address` overflows the lower limb. We can safely assume that additions for which this bit is not correctly set will have either an overflow on the upper or lower word, and hence not match any existing memory tokens, which are only initialized for correctly formatted and range-checked doublewords (see [memory]).
+
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
 | `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
 | `MEMW-C2` |  | `w2` => `μ_sum` |  |
 | | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C3` |  | w2 ⇒ `ADD<address_add[0]::DWordWL; base_address, 1::DWordWL>` |  |
-| `MEMW-C4.i` | i ∈ [1, 2] | w4 ⇒ `ADD<address_add[i]::DWordWL; base_address, (i + 1)::DWordWL>` |  |
-| `MEMW-C5.i` | i ∈ [3, 6] | write8 ⇒ `ADD<address_add[i]::DWordWL; base_address, (i + 1)::DWordWL>` |  |
-| `MEMW-C6.i` | i ∈ [0, 0], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | w2 |
-| `MEMW-C7.i` | i ∈ [1, 2], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | w4 |
-| `MEMW-C8.i` | i ∈ [3, 6], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | write8 |
-| `MEMW-C9` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `MEMW-C10` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `MEMW-C11.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `MEMW-C12.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+| `MEMW-C3.i` | i ∈ [0, 6] | `IS_BIT<add_limb_overflow[i]>` |  |
+| `MEMW-C4` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `MEMW-C5` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `MEMW-C6.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `MEMW-C7.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
 
 As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
 
-We additionally check that the address does not overflow for more significant bytes of the access.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `MEMW-CR13` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
-| `MEMW-CR14` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
-| `MEMW-CR15` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
+There is no need to check that the address does not overflow, as our address calculations are not performed modulo `2^64` here, and any overflow will result in an address without matching initialization.
 
 The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-CM16` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `MEMW-CM17` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM18` |  | `memory[is_register, address_add[0]::DWordWL, old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM19` |  | `memory[is_register, address_add[0]::DWordWL, timestamp, value[1]]` | -w2 |
-| `MEMW-CM20.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1]::DWordWL, old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM21.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1]::DWordWL, timestamp, value[i]]` | -w4 |
-| `MEMW-CM22.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1]::DWordWL, old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM23.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1]::DWordWL, timestamp, value[i]]` | -write8 |
+| `MEMW-CM8` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM9` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM10` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM11` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM12.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM13.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM14.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM15.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
 
 This chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CO24` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
-| `MEMW-CO25` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+| `MEMW-CO16` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
+| `MEMW-CO17` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+
+= Read-size aligned fast path
+
+When a memory access happens at an address with proper alignment (that is, enough trailing zeros) for its access size, and all accessed elements were last accessed at the same timestamp, we can instead use the  chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all values of `add_limb_overflow` would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
+
+Further logic remains essentially the same, so we briefly present the relevant tables for this chip.
+
+The  chip only needs  variables, expressed through  columns.
 
 = Future optimization ideas
 
-- Fast path for aligned memory access where all bytes have the same old timestamp - MEMB chip that deals does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Compute `base_address[1] + 1` once and have high words of `address_add` as Words - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
+- `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Additional fast path for registers? (Always guaranteed same timestamp, alignment could be an assumption, always only two values) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
 
 ## Columns
 
@@ -86,7 +85,7 @@ This chip contributes the following to the lookup argument.
 
 | Name | Type | Description |
 |------|------|-------------|
-| `address_add` | `DWordHL[7]` | `address_add[i] = base_address + i + 1` |
+| `add_limb_overflow` | `Bit[7]` | Whether `base_address[0] + i + 1`, computed as a field element, reaches or exceeds $2^32$ (i.e. carries out of the lower limb) |
 | `old_timestamp` | `DWordWL[8]` | The timestamp at which the address was last accessed |
 
 ### Virtual
@@ -95,6 +94,7 @@ This chip contributes the following to the lookup argument.
 |------|------|-------------|
 | `w2` | `Bit` | writing at least 2 bytes |
 | `w4` | `Bit` | writing at least 4 bytes |
+| `address_add` | `DWordWL[7]` | `address_add[i] = base_address + i + 1` |
 | `μ_sum` | `Bit` |  |
 
 **Definition of `w2`:**
@@ -107,6 +107,11 @@ w2 := write2 + write4 + write8
 w4 := write4 + write8
 ```
 
+**Definition of `address_add`:**
+```
+address_add[i] := [base_address[0] + i + 1 - 2^32 * add_limb_overflow[i], base_address[1] + add_limb_overflow[i]]
+```
+
 **Definition of `μ_sum`:**
 ```
 μ_sum := μ_read + μ_write
diff --git a/docs/spec/shift.md b/docs/spec/shift.md
index 76c147141..32679168f 100644
--- a/docs/spec/shift.md
+++ b/docs/spec/shift.md
@@ -20,15 +20,15 @@ In the following, we cover how these two phases were designed to complement one
 
 ## First phase
 
-We zoom in on the first step. Here, we make use of the two lookup operations - ``HWSL[x: Half, y: B4]` := (`x` `<<` `y`) mod 2^16` (short for "HalfWord Shift Left"), and - ``HWSLC[x: Half, y: B4]` := `x` `>>` (16-`y`)` (short for "HalfWord Shift Left's Carry") Note here that one can use these two lookups to compute `out: Half[4] := in << y` as: $
+We zoom in on the first step. Here, we make use of the lookup operation `HWSL` (short for "HalfWord Shift Left"): `HWSL[x: Half, y: B4] := [(x << y) mod 2^16, x >> (16 - y)]`. One can use this to compute `out: Half[4] := in << y` as: $
 
-$ as long as ``y` < 16`. Observing that ``HWSL[x,` 16-`y]` = (`x` `<<` (16-`y`)) mod 2^16`, and ``HWSLC[x,` 16-`y]` = `x` `>>` `y`` for ``y` in [1, 15]`, one can also use these lookups to compute `out := in >> y` as $
+$ as long as `y < 16`. Observing that `HWSL[x, 16 - y]_0 = (x << (16 - y)) mod 2^16`, and `HWSL[x, 16 - y]_1 = x >> y` for `y ∈ [1, 15]`, one can also use it to compute `out := in >> y` as $
 
 $ as long as `0 < `y` < 16`.
 
 Observe now that the values being looked up are (almost) independent from the direction of the shift: only the shift-amount varies slightly. When we now define $
 
-(16-`shift`) mod 16 & "when shifting right" ), $ it only takes some rearranging and combining of the values ``X[`i`] := HWSL[in[`i`], bit_shift]`` and ``Y[`i`] := HWSLC[in[`i`], bit_shift]`` to form the limbs of ``in <</>> shift` mod 16`. In the remaining case that ``right` = 1` and ``shift` = 0 mod 16`, the limbs of ``in <</>> shift` mod 16` simply match those of `in`.
+(16-`shift`) mod 16 & "when shifting right" ), $ it only takes some rearranging and combining of the values `X[i] := HWSL[in[i], bit_shift]_0` and `Y[i] := HWSL[in[i], bit_shift]_1` to form the limbs of `in <</>> (shift mod 16)`. In the remaining case that `right = 1` and `shift = 0 mod 16`, the limbs of `in <</>> (shift mod 16)` simply match those of `in`.
 
 ## Second phase
 
@@ -45,7 +45,7 @@ Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `e
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
 | `SHIFT-C1` | `AND_BYTE[bit_shift; shift, 15]` | left |
-| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
+| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - 16 * zbs - shift, 15]` | right |
 | `SHIFT-C3` | `ZERO[zbs; bit_shift]` | μ |
 
 Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
@@ -54,15 +54,14 @@ The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[[X[i], Y[i]]; in[i], bit_shift]` | 1 - zbs |
 | `SHIFT-C5.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
 | | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
-| `SHIFT-C6` |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
-| `SHIFT-C7` |  | `zbs` => `X[4]` = 0 |  |
-| | | _polynomial:_ `zbs * X[4] = 0` | |
-| `SHIFT-C8.i` | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C9.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| `SHIFT-C6.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
 | | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
+| `SHIFT-C7` |  | `HWSL[[X[4], extension - X[4]]; extension, bit_shift]` | 1 - zbs |
+| `SHIFT-C8` |  | `zbs` => `X[4]` = 0 |  |
+| | | _polynomial:_ `zbs * X[4] = 0` | |
 
 ## Full-limb shifting
 
@@ -72,21 +71,21 @@ Hereafter, one must only check that `out` is the proper cast of `shifted` into a
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C10.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
-| `SHIFT-C11` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
-| `SHIFT-C12.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+| `SHIFT-C9.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `SHIFT-C10` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
+| `SHIFT-C11.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
 | | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
 
 ## Miscellaneous
 
 | Tag | Description |
 |-----|-------------|
-| `SHIFT-C13` | `direction` => `μ` = 1 |
+| `SHIFT-C12` | `direction` => `μ` = 1 |
 | | _polynomial:_ `direction * (1 - μ) = 0` |
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C14` | `MSB16[is_negative; in[3]]` | signed |
+| `SHIFT-C13` | `MSB16[is_negative; in[3]]` | signed |
 
 *Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
 
@@ -96,7 +95,7 @@ This chip adds the following interaction to the lookup.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C15` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
+| `SHIFT-C14` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
 
 = Padding
 
@@ -129,12 +128,13 @@ The table can be padded to the next power of two with the following value assign
 | `zbs` | `Bit` | Whether `bit_shift` is zero (1) or not (0). |
 | `X` | `Half[5]` | scratch variable. |
 | `Y` | `Half[4]` | scratch variable. |
-| `limb_shift` | `Bit[4]` | One-hot vector indicating whether $floor.l `shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $`word_instr` = 1$ and $4$ otherwise. |
+| `limb_shift_raw` | `Bit[3]` | One-hot vector indicating whether $floor.l `shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $`word_instr` = 1$ and $4$ otherwise. These columns store the first 3 values, and the 4th is derived from the one-hot property. |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
+| `limb_shift` | `Bit[4]` |  |
 | `extension` | `Half` | sign extension of `in`. |
 | `left` | `Bit` | Whether to perform a left-shift. |
 | `right` | `Bit` | Whether to perform a right-shift. |
@@ -142,6 +142,12 @@ The table can be padded to the next power of two with the following value assign
 | `intra_limb_right` | `DWordHL` | `in >>> (shift % 16)` if `right` and `signed`;\ `in >> (shift % 16)` if `right` and `!signed` |
 | `shifted` | `DWordHL` | $`in <</>>/>>>` (`shift` mod 32 dot (2 - `word_instr`))$ |
 
+**Definition of `limb_shift`:**
+```
+limb_shift[i] (for i ∈ [0, 2]) := limb_shift_raw[i]
+limb_shift[3] := 1 - Σ_{j=0}^{2} limb_shift_raw[j]
+```
+
 **Definition of `extension`:**
 ```
 extension := 65535 * is_negative
@@ -170,7 +176,7 @@ intra_limb_right := Y[i] + X[i + 1]
 
 **Definition of `shifted`:**
 ```
-shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (Σ_j = 0^3 - i limb_shift[j] * intra_limb_right[i + j] + extension * Σ_j = 3 - i^3 limb_shift[j])
+shifted[i] := left * Σ_{j=0}^{i} limb_shift[j] * intra_limb_left[i - j] + right * (Σ_{j=0}^{3-i} limb_shift[j] * intra_limb_right[i + j] + extension * Σ_{j=4-i}^{3} limb_shift[j])
 ```
 
 ### Multiplicity
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index a1a8d6792..17ed0a923 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -667,15 +667,15 @@ In the following, we cover how these two phases were designed to complement one
 
 ## First phase
 
-We zoom in on the first step. Here, we make use of the two lookup operations - ``HWSL[x: Half, y: B4]` := (`x` `<<` `y`) mod 2^16` (short for "HalfWord Shift Left"), and - ``HWSLC[x: Half, y: B4]` := `x` `>>` (16-`y`)` (short for "HalfWord Shift Left's Carry") Note here that one can use these two lookups to compute `out: Half[4] := in << y` as: $
+We zoom in on the first step. Here, we make use of the lookup operation `HWSL` (short for "HalfWord Shift Left"): `HWSL[x: Half, y: B4] := [(x << y) mod 2^16, x >> (16 - y)]`. One can use this to compute `out: Half[4] := in << y` as: $
 
-$ as long as ``y` < 16`. Observing that ``HWSL[x,` 16-`y]` = (`x` `<<` (16-`y`)) mod 2^16`, and ``HWSLC[x,` 16-`y]` = `x` `>>` `y`` for ``y` in [1, 15]`, one can also use these lookups to compute `out := in >> y` as $
+$ as long as `y < 16`. Observing that `HWSL[x, 16 - y]_0 = (x << (16 - y)) mod 2^16`, and `HWSL[x, 16 - y]_1 = x >> y` for `y ∈ [1, 15]`, one can also use it to compute `out := in >> y` as $
 
 $ as long as `0 < `y` < 16`.
 
 Observe now that the values being looked up are (almost) independent from the direction of the shift: only the shift-amount varies slightly. When we now define $
 
-(16-`shift`) mod 16 & "when shifting right" ), $ it only takes some rearranging and combining of the values ``X[`i`] := HWSL[in[`i`], bit_shift]`` and ``Y[`i`] := HWSLC[in[`i`], bit_shift]`` to form the limbs of ``in <</>> shift` mod 16`. In the remaining case that ``right` = 1` and ``shift` = 0 mod 16`, the limbs of ``in <</>> shift` mod 16` simply match those of `in`.
+(16-`shift`) mod 16 & "when shifting right" ), $ it only takes some rearranging and combining of the values `X[i] := HWSL[in[i], bit_shift]_0` and `Y[i] := HWSL[in[i], bit_shift]_1` to form the limbs of `in <</>> (shift mod 16)`. In the remaining case that `right = 1` and `shift = 0 mod 16`, the limbs of `in <</>> (shift mod 16)` simply match those of `in`.
 
 ## Second phase
 
@@ -692,7 +692,7 @@ Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `e
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
 | `SHIFT-C1` | `AND_BYTE[bit_shift; shift, 15]` | left |
-| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - shift, 15]` | right |
+| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - 16 * zbs - shift, 15]` | right |
 | `SHIFT-C3` | `ZERO[zbs; bit_shift]` | μ |
 
 Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
@@ -701,15 +701,14 @@ The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[X[i]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[[X[i], Y[i]]; in[i], bit_shift]` | 1 - zbs |
 | `SHIFT-C5.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
 | | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
-| `SHIFT-C6` |  | `HWSL[X[4]; extension, bit_shift]` | 1 - zbs |
-| `SHIFT-C7` |  | `zbs` => `X[4]` = 0 |  |
-| | | _polynomial:_ `zbs * X[4] = 0` | |
-| `SHIFT-C8.i` | i ∈ [0, 3] | `HWSLC[Y[i]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C9.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| `SHIFT-C6.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
 | | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
+| `SHIFT-C7` |  | `HWSL[[X[4], extension - X[4]]; extension, bit_shift]` | 1 - zbs |
+| `SHIFT-C8` |  | `zbs` => `X[4]` = 0 |  |
+| | | _polynomial:_ `zbs * X[4] = 0` | |
 
 ## Full-limb shifting
 
@@ -719,21 +718,21 @@ Hereafter, one must only check that `out` is the proper cast of `shifted` into a
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C10.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
-| `SHIFT-C11` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
-| `SHIFT-C12.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+| `SHIFT-C9.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `SHIFT-C10` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
+| `SHIFT-C11.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
 | | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
 
 ## Miscellaneous
 
 | Tag | Description |
 |-----|-------------|
-| `SHIFT-C13` | `direction` => `μ` = 1 |
+| `SHIFT-C12` | `direction` => `μ` = 1 |
 | | _polynomial:_ `direction * (1 - μ) = 0` |
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C14` | `MSB16[is_negative; in[3]]` | signed |
+| `SHIFT-C13` | `MSB16[is_negative; in[3]]` | signed |
 
 *Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
 
@@ -743,7 +742,7 @@ This chip adds the following interaction to the lookup.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C15` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
+| `SHIFT-C14` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
 
 = Padding
 
@@ -776,12 +775,13 @@ The table can be padded to the next power of two with the following value assign
 | `zbs` | `Bit` | Whether `bit_shift` is zero (1) or not (0). |
 | `X` | `Half[5]` | scratch variable. |
 | `Y` | `Half[4]` | scratch variable. |
-| `limb_shift` | `Bit[4]` | One-hot vector indicating whether $floor.l `shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $`word_instr` = 1$ and $4$ otherwise. |
+| `limb_shift_raw` | `Bit[3]` | One-hot vector indicating whether $floor.l `shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $`word_instr` = 1$ and $4$ otherwise. These columns store the first 3 values, and the 4th is derived from the one-hot property. |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
+| `limb_shift` | `Bit[4]` |  |
 | `extension` | `Half` | sign extension of `in`. |
 | `left` | `Bit` | Whether to perform a left-shift. |
 | `right` | `Bit` | Whether to perform a right-shift. |
@@ -789,6 +789,12 @@ The table can be padded to the next power of two with the following value assign
 | `intra_limb_right` | `DWordHL` | `in >>> (shift % 16)` if `right` and `signed`;\ `in >> (shift % 16)` if `right` and `!signed` |
 | `shifted` | `DWordHL` | $`in <</>>/>>>` (`shift` mod 32 dot (2 - `word_instr`))$ |
 
+**Definition of `limb_shift`:**
+```
+limb_shift[i] (for i ∈ [0, 2]) := limb_shift_raw[i]
+limb_shift[3] := 1 - Σ_{j=0}^{2} limb_shift_raw[j]
+```
+
 **Definition of `extension`:**
 ```
 extension := 65535 * is_negative
@@ -817,7 +823,7 @@ intra_limb_right := Y[i] + X[i + 1]
 
 **Definition of `shifted`:**
 ```
-shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (Σ_j = 0^3 - i limb_shift[j] * intra_limb_right[i + j] + extension * Σ_j = 3 - i^3 limb_shift[j])
+shifted[i] := left * Σ_{j=0}^{i} limb_shift[j] * intra_limb_left[i - j] + right * (Σ_{j=0}^{3-i} limb_shift[j] * intra_limb_right[i + j] + extension * Σ_{j=4-i}^{3} limb_shift[j])
 ```
 
 ### Multiplicity
@@ -946,55 +952,54 @@ Our assumptions do not explicitly cover any range checks for the `is_register` a
 
 = Constraints
 
+We can compute the addresses for the later bytes based on a single bit each, indicating whether adding `i` to `base_address` overflows the lower limb. We can safely assume that additions for which this bit is not correctly set will have either an overflow on the upper or lower word, and hence not match any existing memory tokens, which are only initialized for correctly formatted and range-checked doublewords (see [memory]).
+
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
 | `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
 | `MEMW-C2` |  | `w2` => `μ_sum` |  |
 | | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C3` |  | w2 ⇒ `ADD<address_add[0]::DWordWL; base_address, 1::DWordWL>` |  |
-| `MEMW-C4.i` | i ∈ [1, 2] | w4 ⇒ `ADD<address_add[i]::DWordWL; base_address, (i + 1)::DWordWL>` |  |
-| `MEMW-C5.i` | i ∈ [3, 6] | write8 ⇒ `ADD<address_add[i]::DWordWL; base_address, (i + 1)::DWordWL>` |  |
-| `MEMW-C6.i` | i ∈ [0, 0], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | w2 |
-| `MEMW-C7.i` | i ∈ [1, 2], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | w4 |
-| `MEMW-C8.i` | i ∈ [3, 6], j ∈ [0, 3] | `IS_HALF[address_add[i][j]]` | write8 |
-| `MEMW-C9` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `MEMW-C10` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `MEMW-C11.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `MEMW-C12.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+| `MEMW-C3.i` | i ∈ [0, 6] | `IS_BIT<add_limb_overflow[i]>` |  |
+| `MEMW-C4` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `MEMW-C5` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `MEMW-C6.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `MEMW-C7.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
 
 As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
 
-We additionally check that the address does not overflow for more significant bytes of the access.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `MEMW-CR13` | `LT[1; base_address, address_add[0]::DWordWL, 0]` | write2 |
-| `MEMW-CR14` | `LT[1; base_address, address_add[2]::DWordWL, 0]` | write4 |
-| `MEMW-CR15` | `LT[1; base_address, address_add[6]::DWordWL, 0]` | write8 |
+There is no need to check that the address does not overflow, as our address calculations are not performed modulo `2^64` here, and any overflow will result in an address without matching initialization.
 
 The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-CM16` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `MEMW-CM17` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM18` |  | `memory[is_register, address_add[0]::DWordWL, old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM19` |  | `memory[is_register, address_add[0]::DWordWL, timestamp, value[1]]` | -w2 |
-| `MEMW-CM20.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1]::DWordWL, old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM21.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1]::DWordWL, timestamp, value[i]]` | -w4 |
-| `MEMW-CM22.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1]::DWordWL, old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM23.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1]::DWordWL, timestamp, value[i]]` | -write8 |
+| `MEMW-CM8` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM9` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM10` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM11` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM12.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM13.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM14.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM15.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
 
 This chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CO24` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
-| `MEMW-CO25` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+| `MEMW-CO16` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
+| `MEMW-CO17` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+
+= Read-size aligned fast path
+
+When a memory access happens at an address with proper alignment (that is, enough trailing zeros) for its access size, and all accessed elements were last accessed at the same timestamp, we can instead use the  chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all values of `add_limb_overflow` would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
+
+Further logic remains essentially the same, so we briefly present the relevant tables for this chip.
+
+The  chip only needs  variables, expressed through  columns.
 
 = Future optimization ideas
 
-- Fast path for aligned memory access where all bytes have the same old timestamp - MEMB chip that deals does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Compute `base_address[1] + 1` once and have high words of `address_add` as Words - Improve overflow trapping somehow so we don't need `LT` (could tie into previous one by checking carry bit of the +1) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
+- `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Additional fast path for registers? (Always guaranteed same timestamp, alignment could be an assumption, always only two values) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
 
 ## Columns
 
@@ -1020,7 +1025,7 @@ This chip contributes the following to the lookup argument.
 
 | Name | Type | Description |
 |------|------|-------------|
-| `address_add` | `DWordHL[7]` | `address_add[i] = base_address + i + 1` |
+| `add_limb_overflow` | `Bit[7]` | Whether `base_address[0] + i + 1`, computed as a field element, reaches or exceeds $2^32$ (i.e. carries out of the lower limb) |
 | `old_timestamp` | `DWordWL[8]` | The timestamp at which the address was last accessed |
 
 ### Virtual
@@ -1029,6 +1034,7 @@ This chip contributes the following to the lookup argument.
 |------|------|-------------|
 | `w2` | `Bit` | writing at least 2 bytes |
 | `w4` | `Bit` | writing at least 4 bytes |
+| `address_add` | `DWordWL[7]` | `address_add[i] = base_address + i + 1` |
 | `μ_sum` | `Bit` |  |
 
 **Definition of `w2`:**
@@ -1041,6 +1047,11 @@ w2 := write2 + write4 + write8
 w4 := write4 + write8
 ```
 
+**Definition of `address_add`:**
+```
+address_add[i] := [base_address[0] + i + 1 - 2^32 * add_limb_overflow[i], base_address[1] + add_limb_overflow[i]]
+```
+
 **Definition of `μ_sum`:**
 ```
 μ_sum := μ_read + μ_write
@@ -1793,7 +1804,7 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 
 = Lookup This chip adds the following interactions to the lookup:
 
-= Areas of Optimization The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `HWSLC`, `IS_B20`, `ZERO`) lookups in separate tables. + Combine `HWSL` and `HWSLC` into a single lookup (see also \).
+= Areas of Optimization The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
 
 ## Columns
 
@@ -1832,7 +1843,6 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 | `μ_IS_HALF` | `BaseField` |  |
 | `μ_IS_B20` | `BaseField` |  |
 | `μ_HWSL` | `BaseField` |  |
-| `μ_HWSLC` | `BaseField` |  |
 
 ## Constraints
 
@@ -1849,5 +1859,4 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 | `BITWISE-C7` | `IS_BYTE[X]` | -μ_IS_BYTE |
 | `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
 | `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
-| `BITWISE-C10` | `HWSL[SLL; X + 256 * Y, Z]` | -μ_HWSL |
-| `BITWISE-C11` | `HWSLC[SLLC; X + 256 * Y, Z]` | -μ_HWSLC |
\ No newline at end of file
+| `BITWISE-C10` | `HWSL[['arr', 'SLL', 'SLLC']; X + 256 * Y, Z]` | -μ_HWSL |
\ No newline at end of file

From 5cf536911966cf9ffe4d9ab8b46a7fe49e97e1e9 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Mon, 16 Mar 2026 10:25:19 +0100
Subject: [PATCH 083/105] Fix type checking for MEMW_A (#423)

---
 spec/src/memw_aligned.toml | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/spec/src/memw_aligned.toml b/spec/src/memw_aligned.toml
index 715f57c85..8df912111 100644
--- a/spec/src/memw_aligned.toml
+++ b/spec/src/memw_aligned.toml
@@ -1,4 +1,4 @@
-name = "MEMW-A"
+name = "MEMW_A"
 
 # Input
 
@@ -68,7 +68,9 @@ desc = "The timestamp at which the address was last accessed"
 name = "base_address"
 type = "DWordWL"
 desc = "Recomposing the base address from its parts"
-defs = {idx = "i", polys = [["+", ["*", ["^", 2, 16], "base_address_mid"], ["*", ["^", 2, 8], ["idx", "base_address_low", 1]], ["idx", "base_address_low", 0]], "base_address_high"]}
+def = {idx = "i", polys = [
+  { iter = 0, poly = ["+", ["*", ["^", 2, 16], "base_address_mid"], ["*", ["^", 2, 8], ["idx", "base_address_low", 1]], ["idx", "base_address_low", 0]] },
+  { iter = 1, poly = "base_address_high" }]}
 
 [[variables.virtual]]
 name = "w2"
@@ -131,10 +133,11 @@ iter = ["i", 0, 1]
 name = "consistency"
 
 [[constraints.consistency]]
-kind = "template"
+kind = "interaction"
 tag = "AND_BYTE"
 input = [["idx", "base_address_low", 0], ["+", ["*", "write2", 1], ["*", "write4", 3], ["*", "write8", 7]]]
 output = 0
+multiplicity = "μ_sum"
 
 [[constraints.consistency]]
 kind = "template"

From 8f7ddeae14a57f85be1d7610d71c75ffed6091f9 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Mon, 23 Mar 2026 17:23:22 +0100
Subject: [PATCH 084/105] Spec/memw update (#434)

* spec/memw: read/write from/to -> read from/write to

* spec/memw: rename add_limb_overflow as carry

* spec/memw: minor var desc updates

* spec/memw: remove superfluous minus symbol

* spec/memw: update description

* spec/memw_a: minor optimization

* Apply suggestions from code review

Co-authored-by: Robin Jadoul <robin.jadoul@gmail.com>

* spec/MEMW: fix interaction typing

* spec/MEMW: drop superfluous notes

* spec/MEMW: update alignment requirement

* spec/MEMW: intentionally separate carry's prose and .toml descriptions

---------

Co-authored-by: Robin Jadoul <robin.jadoul@gmail.com>
---
 spec/memw.typ              | 29 +++++++++---------
 spec/src/memw.toml         | 18 ++++++------
 spec/src/memw_aligned.toml | 60 ++++++++++++--------------------------
 3 files changed, 43 insertions(+), 64 deletions(-)

diff --git a/spec/memw.typ b/spec/memw.typ
index 57907e26c..f70496aa8 100644
--- a/spec/memw.typ
+++ b/spec/memw.typ
@@ -33,26 +33,27 @@ The `MEMW` chip is comprised of #nr_variables variables that are expressed using
 
 Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns,
 as these are not necessary for the correctness of this chip in isolation.
-These properties are necessary for the consistency of the system as a whole, and therefore
+Still, these properties are necessary for the consistency of the system as a whole, and therefore
 we document it here, keeping the type information as a reading help.
 
 = Constraints
 
-We can compute the addresses for the later bytes based on a single bit each,
-indicating whether adding `i` to `base_address` overflows the lower limb.
-We can safely assume that additions for which this bit is not correctly set
-will have either an overflow on the upper or lower word, and hence not match
-any existing memory tokens, which are only initialized for correctly formatted
-and range-checked doublewords (see @memory).
+Depending on the values of `write2`, `write4` and `write8`, the addresses following `base_address` need to be constructed.
+Rather than computing these in full (which would require the later addresses to be instantiated), 
+it suffices to know the `carry`: the bit indicating whether $#`base_address`_0 + t >= 2^32$, i.e., whether adding $t in [1, 7]$ to `base_address` requires a carry from the lower to the upper limb.
+Note that it is safe for the prover to choose these bits: additions for which this bit is not correctly set
+will yield an address where either the lower or upper limb is out of bounds.
+As such, the constructed address will not match any existing memory tokens, 
+which are only initialized for correctly formatted and range-checked doublewords (see @memory).
 
 #render_constraint_table(chip, config, groups: "consistency")
 
 As long as `timestamp` is properly range-checked, the presence of `old_timestamp`
-in the memory argument automatically ensures appropriate range checking
-(as long as no external entities provide negative multiplicities without range checking the timestamp).
+in the memory argument automatically ensures it is appropriately range checked
+(this assumes no external entities provide negative multiplicities without range checking the timestamp).
 This ensures the assumptions for `LT` are satisfied.
 
-There is no need to check that the address does not overflow,
+There is no need to check that the additions do not overflow,
 as our address calculations are not performed modulo $2^64$ here,
 and any overflow will result in an address without matching initialization.
 
@@ -60,7 +61,7 @@ The chip adds the following tuples to the lookup argument,
 to effectuate that part of the memory argument.
 #render_constraint_table(chip, config, groups: "memory")
 
-This chip contributes the following to the lookup argument.
+This chip contributes the following to the lookup argument:
 #render_constraint_table(chip, config, groups: "output")
 
 = Read-size aligned fast path
@@ -68,9 +69,9 @@ This chip contributes the following to the lookup argument.
 #let alignedchip = load_chip("src/memw_aligned.toml", config)
 #let aligned = raw(alignedchip.name)
 
-When a memory access happens at an address with proper alignment
-(that is, enough trailing zeros) for its access size, and all accessed
-elements were last accessed at the same timestamp, we can 
+When a memory access happens at an address with proper alignment for its access size
+(i.e., adding the access size to `base_address`'s lowest limb does not overflow), 
+and all accessed elements were last accessed at the same timestamp, we can 
 instead use the #aligned chip to save on total column count.
 The saving comes from only requiring a single old timestamp to be stored,
 as well as being able to guarantee that all values of `add_limb_overflow` would be zero.
diff --git a/spec/src/memw.toml b/spec/src/memw.toml
index c9519e115..4905fc5aa 100644
--- a/spec/src/memw.toml
+++ b/spec/src/memw.toml
@@ -10,17 +10,17 @@ desc = "Whether the address represents a register index"
 [[variables.input]]
 name = "base_address"
 type = "DWordWL"
-desc = "The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is"
+desc = "The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access"
 
 [[variables.input]]
 name = "value"
 type = ["BaseField", 8]
-desc = "The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s"
+desc = "The values to store in memory. For RAM, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s"
 
 [[variables.input]]
 name = "timestamp"
 type = "DWordWL"
-desc = "The timestamp at which this memory access is said to occur"
+desc = "The timestamp at which this memory access occurs"
 
 [[variables.input]]
 name = "write2"
@@ -48,14 +48,14 @@ Only the elements corresponding to the `writeN` bits are guaranteed"""
 # Auxiliary
 
 [[variables.auxiliary]]
-name = "add_limb_overflow"
+name = "carry"
 type = ["Bit", 7]
-desc = "Whether adding `i` to `base_address[0]` as a field element exceeds $2^32$"
+desc = "Whether `base_address[0] + i + 1` $>= 2^32$"
 
 [[variables.auxiliary]]
 name = "old_timestamp"
 type = ["DWordWL", 8]
-desc = "The timestamp at which the address was last accessed"
+desc = "The timestamp at which address `base_address + i` was last accessed"
 
 # Virtual
 
@@ -77,8 +77,8 @@ type = ["DWordWL", 7]
 desc = "`address_add[i] = base_address + i + 1`"
 def.iter = ["i", 0, 6]
 def.poly = ["arr",
-  ["+", ["idx", "base_address", 0], "i", 1, ["*", ["-", ["^", 2, 32]], ["idx", "add_limb_overflow", "i"]]],
-  ["+", ["idx", "base_address", 1], ["idx", "add_limb_overflow", "i"]]]
+  ["-", ["+", ["idx", "base_address", 0], "i", 1], ["*", ["^", 2, 32], ["idx", "carry", "i"]]],
+  ["+", ["idx", "base_address", 1], ["idx", "carry", "i"]]]
 
 [[variables.virtual]]
 name = "μ_sum"
@@ -136,7 +136,7 @@ poly = ["*", "w2", ["not", "μ_sum"]]
 [[constraints.consistency]]
 kind = "template"
 tag = "IS_BIT"
-input = [["idx", "add_limb_overflow", "i"]]
+input = [["idx", "carry", "i"]]
 iter = ["i", 0, 6]
 
 [[constraints.consistency]]
diff --git a/spec/src/memw_aligned.toml b/spec/src/memw_aligned.toml
index 8df912111..be6bb1603 100644
--- a/spec/src/memw_aligned.toml
+++ b/spec/src/memw_aligned.toml
@@ -8,19 +8,9 @@ type = "Bit"
 desc = "Whether the address represents a register index"
 
 [[variables.input]]
-name = "base_address_high"
-type = "Word"
-desc = "The high word of the base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is"
-
-[[variables.input]]
-name = "base_address_mid"
-type = "Half"
-desc = "The middle halfword of the base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is"
-
-[[variables.input]]
-name = "base_address_low"
-type = ["Byte", 2]
-desc = "The low bytes of the base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is"
+name = "base_address"
+type = "DWordWHH"
+desc = "The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access"
 
 [[variables.input]]
 name = "value"
@@ -52,7 +42,7 @@ desc = "Whether to write exactly 8 values"
 [[variables.output]]
 name = "old"
 type = ["BaseField", 8]
-desc = """The old value written at `base_address`. See `value` for information about representation.
+desc = """The old value written at `base_address + i`. See `value` for information about representation.
 Only the elements corresponding to the `writeN` bits are guaranteed"""
 
 # Auxiliary
@@ -64,14 +54,6 @@ desc = "The timestamp at which the address was last accessed"
 
 # Virtual
 
-[[variables.virtual]]
-name = "base_address"
-type = "DWordWL"
-desc = "Recomposing the base address from its parts"
-def = {idx = "i", polys = [
-  { iter = 0, poly = ["+", ["*", ["^", 2, 16], "base_address_mid"], ["*", ["^", 2, 8], ["idx", "base_address_low", 1]], ["idx", "base_address_low", 0]] },
-  { iter = 1, poly = "base_address_high" }]}
-
 [[variables.virtual]]
 name = "w2"
 type = "Bit"
@@ -103,14 +85,11 @@ type = "Bit"
 desc = "Whether we are performing a write (and hence not return `out`)"
 
 [[assumptions]]
-desc = "`IS_WORD[base_address_high]`"
-
-[[assumptions]]
-desc = "`IS_HALF[base_address_mid]`"
+desc = "`IS_HALF[base_address[i]]`"
+iter = ["i", 0, 1]
 
 [[assumptions]]
-desc = "`IS_BYTE[base_address_low[i]]`"
-iter = ["i", 0, 1]
+desc = "`IS_WORD[base_address[2]]`"
 
 [[assumptions]]
 desc = "`IS_BIT<write2>`"
@@ -134,9 +113,8 @@ name = "consistency"
 
 [[constraints.consistency]]
 kind = "interaction"
-tag = "AND_BYTE"
-input = [["idx", "base_address_low", 0], ["+", ["*", "write2", 1], ["*", "write4", 3], ["*", "write8", 7]]]
-output = 0
+tag = "IS_HALF"
+input = [["+", ["idx", "base_address", 0], "write2", ["*", 3, "write4"], ["*", 7, "write8"]]]
 multiplicity = "μ_sum"
 
 [[constraints.consistency]]
@@ -163,52 +141,52 @@ prefix = "M"
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", "base_address", "old_timestamp", ["idx", "old", 0]]
+input = ["is_register", ["cast", "base_address", "DWordWL"], "old_timestamp", ["idx", "old", 0]]
 multiplicity = "μ_sum"
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", "base_address", "timestamp", ["idx", "value", 0]]
+input = ["is_register", ["cast", "base_address", "DWordWL"], "timestamp", ["idx", "value", 0]]
 multiplicity = ["-", "μ_sum"]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["+", "base_address", ["cast", 1, "DWordWL"]], "old_timestamp", ["idx", "old", 1]]
+input = ["is_register", ["+", ["cast", "base_address", "DWordWL"], ["cast", 1, "DWordWL"]], "old_timestamp", ["idx", "old", 1]]
 multiplicity = "w2"
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["+", "base_address", ["cast", 1, "DWordWL"]], "timestamp", ["idx", "value", 1]]
+input = ["is_register", ["+", ["cast", "base_address", "DWordWL"], ["cast", 1, "DWordWL"]], "timestamp", ["idx", "value", 1]]
 multiplicity = ["-", "w2"]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["+", "base_address", ["cast", "i", "DWordWL"]], "old_timestamp", ["idx", "old", "i"]]
+input = ["is_register", ["+", ["cast", "base_address", "DWordWL"], ["cast", "i", "DWordWL"]], "old_timestamp", ["idx", "old", "i"]]
 multiplicity = "w4"
 iter = ["i", 2, 3]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["+", "base_address", ["cast", "i", "DWordWL"]], "timestamp", ["idx", "value", "i"]]
+input = ["is_register", ["+", ["cast", "base_address", "DWordWL"], ["cast", "i", "DWordWL"]], "timestamp", ["idx", "value", "i"]]
 multiplicity = ["-", "w4"]
 iter = ["i", 2, 3]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["+", "base_address", ["cast", "i", "DWordWL"]], "old_timestamp", ["idx", "old", "i"]]
+input = ["is_register", ["+", ["cast", "base_address", "DWordWL"], ["cast", "i", "DWordWL"]], "old_timestamp", ["idx", "old", "i"]]
 multiplicity = "write8"
 iter = ["i", 4, 7]
 
 [[constraints.memory]]
 kind = "interaction"
 tag = "memory"
-input = ["is_register", ["+", "base_address", ["cast", "i", "DWordWL"]], "timestamp", ["idx", "value", "i"]]
+input = ["is_register", ["+", ["cast", "base_address", "DWordWL"], ["cast", "i", "DWordWL"]], "timestamp", ["idx", "value", "i"]]
 multiplicity = ["-", "write8"]
 iter = ["i", 4, 7]
 
@@ -220,12 +198,12 @@ prefix = "O"
 [[constraints.output]]
 kind = "interaction"
 tag = "MEMW"
-input = ["is_register", "base_address", "value", "timestamp", "write2", "write4", "write8"]
+input = ["is_register", ["cast", "base_address", "DWordWL"], "value", "timestamp", "write2", "write4", "write8"]
 output = "old"
 multiplicity = "μ_read"
 
 [[constraints.output]]
 kind = "interaction"
 tag = "MEMW"
-input = ["is_register", "base_address", "value", "timestamp", "write2", "write4", "write8"]
+input = ["is_register", ["cast", "base_address", "DWordWL"], "value", "timestamp", "write2", "write4", "write8"]
 multiplicity = "μ_write"

From a9c073a94628b016280722e86491728368deff37 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Mon, 23 Mar 2026 17:31:35 +0100
Subject: [PATCH 085/105] spec/MEMW(_A): minor update (#459)

* spec/memw: read/write from/to -> read from/write to

* spec/memw: rename add_limb_overflow as carry

* spec/memw: minor var desc updates

* spec/memw: remove superfluous minus symbol

* spec/memw: update description

* spec/memw_a: minor optimization

* Apply suggestions from code review

Co-authored-by: Robin Jadoul <robin.jadoul@gmail.com>

* spec/MEMW: fix interaction typing

* spec/MEMW: drop superfluous notes

* spec/MEMW: update alignment requirement

* spec/MEMW: intentionally separate carry's prose and .toml descriptions

* spec/MEMW: fix multiplicities

* spec/MEMW_A: padding

* spec/MEMW: padding

* spec/MEMW: bit check multiplicities

* spec/MEMW: simplify padding

---------

Co-authored-by: Robin Jadoul <robin.jadoul@gmail.com>
---
 spec/memw.typ              |  8 ++++++++
 spec/src/memw.toml         | 27 ++++++++++++++++++++++++---
 spec/src/memw_aligned.toml | 25 +++++++++++++++++++++++--
 3 files changed, 55 insertions(+), 5 deletions(-)

diff --git a/spec/memw.typ b/spec/memw.typ
index f70496aa8..b1f95a491 100644
--- a/spec/memw.typ
+++ b/spec/memw.typ
@@ -6,6 +6,7 @@
   total_nr_variables,
   total_nr_instantiated_columns,
   render_constraint_table,
+  render_chip_padding_table,
 )
 
 #let config = load_config()
@@ -64,6 +65,10 @@ to effectuate that part of the memory argument.
 This chip contributes the following to the lookup argument:
 #render_constraint_table(chip, config, groups: "output")
 
+= Padding
+The table can be padded to the next power of two with the following value assignments:
+#render_chip_padding_table(chip, config)
+
 = Read-size aligned fast path
 
 #let alignedchip = load_chip("src/memw_aligned.toml", config)
@@ -87,6 +92,9 @@ The #aligned chip only needs #nr_variables variables, expressed through #nr_colu
 #render_chip_assumptions(alignedchip, config)
 #render_constraint_table(alignedchip, config)
 
+== Padding
+The table can be padded to the next power of two with the following value assignments:
+#render_chip_padding_table(alignedchip, config)
 
 = Future optimization ideas
 
diff --git a/spec/src/memw.toml b/spec/src/memw.toml
index 4905fc5aa..1cc0dd3c2 100644
--- a/spec/src/memw.toml
+++ b/spec/src/memw.toml
@@ -6,36 +6,43 @@ name = "MEMW"
 name = "is_register"
 type = "Bit"
 desc = "Whether the address represents a register index"
+pad = 0
 
 [[variables.input]]
 name = "base_address"
 type = "DWordWL"
 desc = "The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access"
+pad = 0
 
 [[variables.input]]
 name = "value"
 type = ["BaseField", 8]
 desc = "The values to store in memory. For RAM, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s"
+pad = 0
 
 [[variables.input]]
 name = "timestamp"
 type = "DWordWL"
 desc = "The timestamp at which this memory access occurs"
+pad = 0
 
 [[variables.input]]
 name = "write2"
 type = "Bit"
 desc = "Whether to write exactly 2 values"
+pad = 0
 
 [[variables.input]]
 name = "write4"
 type = "Bit"
 desc = "Whether to write exactly 4 values"
+pad = 0
 
 [[variables.input]]
 name = "write8"
 type = "Bit"
 desc = "Whether to write exactly 8 values"
+pad = 0
 
 # Output
 
@@ -44,6 +51,7 @@ name = "old"
 type = ["BaseField", 8]
 desc = """The old value written at `base_address`. See `value` for information about representation.
 Only the elements corresponding to the `writeN` bits are guaranteed"""
+pad = 0
 
 # Auxiliary
 
@@ -51,11 +59,13 @@ Only the elements corresponding to the `writeN` bits are guaranteed"""
 name = "carry"
 type = ["Bit", 7]
 desc = "Whether `base_address[0] + i + 1` $>= 2^32$"
+pad = 0
 
 [[variables.auxiliary]]
 name = "old_timestamp"
 type = ["DWordWL", 8]
 desc = "The timestamp at which address `base_address + i` was last accessed"
+pad = 0
 
 # Virtual
 
@@ -92,12 +102,13 @@ def = ["+", "μ_read", "μ_write"]
 name = "μ_read"
 type = "Bit"
 desc = "Whether we are performing a read (and hence return `out`)"
+pad = 0
 
 [[variables.multiplicity]]
 name = "μ_write"
 type = "Bit"
 desc = "Whether we are performing a write (and hence not return `out`)"
-
+pad = 0
 
 [[assumptions]]
 desc = "`IS_WORD[base_address[i]]`"
@@ -123,6 +134,16 @@ iter = ["i", 0, 1]
 [[constraint_groups]]
 name = "consistency"
 
+[[constraints.consistency]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ_read"]
+
+[[constraints.consistency]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ_write"]
+
 [[constraints.consistency]]
 kind = "template"
 tag = "IS_BIT"
@@ -235,10 +256,10 @@ kind = "interaction"
 tag = "MEMW"
 input = ["is_register", "base_address", "value", "timestamp", "write2", "write4", "write8"]
 output = "old"
-multiplicity = "μ_read"
+multiplicity = ["-", "μ_read"]
 
 [[constraints.output]]
 kind = "interaction"
 tag = "MEMW"
 input = ["is_register", "base_address", "value", "timestamp", "write2", "write4", "write8"]
-multiplicity = "μ_write"
+multiplicity = ["-", "μ_write"]
diff --git a/spec/src/memw_aligned.toml b/spec/src/memw_aligned.toml
index be6bb1603..93a636aba 100644
--- a/spec/src/memw_aligned.toml
+++ b/spec/src/memw_aligned.toml
@@ -6,36 +6,43 @@ name = "MEMW_A"
 name = "is_register"
 type = "Bit"
 desc = "Whether the address represents a register index"
+pad = 0
 
 [[variables.input]]
 name = "base_address"
 type = "DWordWHH"
 desc = "The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access"
+pad = 0
 
 [[variables.input]]
 name = "value"
 type = ["BaseField", 8]
 desc = "The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s"
+pad = 0
 
 [[variables.input]]
 name = "timestamp"
 type = "DWordWL"
 desc = "The timestamp at which this memory access is said to occur"
+pad = 0
 
 [[variables.input]]
 name = "write2"
 type = "Bit"
 desc = "Whether to write exactly 2 values"
+pad = 0
 
 [[variables.input]]
 name = "write4"
 type = "Bit"
 desc = "Whether to write exactly 4 values"
+pad = 0
 
 [[variables.input]]
 name = "write8"
 type = "Bit"
 desc = "Whether to write exactly 8 values"
+pad = 0
 
 # Output
 
@@ -44,6 +51,7 @@ name = "old"
 type = ["BaseField", 8]
 desc = """The old value written at `base_address + i`. See `value` for information about representation.
 Only the elements corresponding to the `writeN` bits are guaranteed"""
+pad = 0
 
 # Auxiliary
 
@@ -51,6 +59,7 @@ Only the elements corresponding to the `writeN` bits are guaranteed"""
 name = "old_timestamp"
 type = "DWordWL"
 desc = "The timestamp at which the address was last accessed"
+pad = 0
 
 # Virtual
 
@@ -78,11 +87,13 @@ def = ["+", "μ_read", "μ_write"]
 name = "μ_read"
 type = "Bit"
 desc = "Whether we are performing a read (and hence return `out`)"
+pad = 0
 
 [[variables.multiplicity]]
 name = "μ_write"
 type = "Bit"
 desc = "Whether we are performing a write (and hence not return `out`)"
+pad = 0
 
 [[assumptions]]
 desc = "`IS_HALF[base_address[i]]`"
@@ -117,6 +128,16 @@ tag = "IS_HALF"
 input = [["+", ["idx", "base_address", 0], "write2", ["*", 3, "write4"], ["*", 7, "write8"]]]
 multiplicity = "μ_sum"
 
+[[constraints.consistency]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ_read"]
+
+[[constraints.consistency]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ_write"]
+
 [[constraints.consistency]]
 kind = "template"
 tag = "IS_BIT"
@@ -200,10 +221,10 @@ kind = "interaction"
 tag = "MEMW"
 input = ["is_register", ["cast", "base_address", "DWordWL"], "value", "timestamp", "write2", "write4", "write8"]
 output = "old"
-multiplicity = "μ_read"
+multiplicity = ["-", "μ_read"]
 
 [[constraints.output]]
 kind = "interaction"
 tag = "MEMW"
 input = ["is_register", ["cast", "base_address", "DWordWL"], "value", "timestamp", "write2", "write4", "write8"]
-multiplicity = "μ_write"
+multiplicity = ["-", "μ_write"]

From 7d4518f82542a2e11d4dc44d14813812b0135b22 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Tue, 24 Mar 2026 12:25:04 +0100
Subject: [PATCH 086/105] spec/MEMW_R: register access fast path (#457)

---------

Co-authored-by: Robin Jadoul <robin.jadoul@gmail.com>
---
 spec/memw.typ               |  60 ++++++++++++++-
 spec/src/memw_register.toml | 141 ++++++++++++++++++++++++++++++++++++
 2 files changed, 198 insertions(+), 3 deletions(-)
 create mode 100644 spec/src/memw_register.toml

diff --git a/spec/memw.typ b/spec/memw.typ
index b1f95a491..1c7a1e6e5 100644
--- a/spec/memw.typ
+++ b/spec/memw.typ
@@ -6,7 +6,7 @@
   total_nr_variables,
   total_nr_instantiated_columns,
   render_constraint_table,
-  render_chip_padding_table,
+  render_chip_padding_table
 )
 
 #let config = load_config()
@@ -96,8 +96,62 @@ The #aligned chip only needs #nr_variables variables, expressed through #nr_colu
 The table can be padded to the next power of two with the following value assignments:
 #render_chip_padding_table(alignedchip, config)
 
-= Future optimization ideas
+= Register fast-path
+
+#let config = load_config()
+#let register_chip = load_chip("src/memw_register.toml", config)
+#let reg = raw(register_chip.name)
+
+The #reg chip provides a fast-path for accessing registers.
+This fast-path leverages that registers
++ can be addressed using a `Byte`, rather than a full `DWord`,
++ are constantly accessed, i.e., $#`timestamp` - #`old_timestamp`$ is small, and
++ have a fixed access pattern
+to achieve a footprint that is significantly smaller than both #memw and #aligned.
+
+Note: as a result of hard optimization, this chip can only be used for register accesses for which 
++ $#`timestamp` - #`old_timestamp` in [1, 2^16]$, and
++ $#`timestamp[0]` > #`old_timestamp[0]`$
+If either of these rules does not apply to your access, you should fall back to using `MEMW_A`.
+
+Note moreover that this chip does not guard against misaligned register access faults: to access the register with a given `address`, one must provide $2 dot #`address`$ in the lookup. 
+
+== Columns
+#let nr_variables = total_nr_variables(register_chip)
+#let nr_columns = total_nr_instantiated_columns(register_chip, config)
+
+The #reg chip is comprised of #nr_variables variables that are expressed using #nr_columns columns:
+#render_chip_column_table(register_chip, config)
+
+== Assumptions
+The following range checks are assumed to be performed/enforced outside of this chip:
+#render_chip_assumptions(register_chip, config)
 
+== Constraints
+Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the time.
+Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that
+$#`old_timestamp[1]` = #`timestamp[1]`$; #aligned can be used for accesses where this is not the case.
+
+Verifying that $#`timestamp` > #`old_timestamp`$ now simplifies to verifying that $#`timestamp[0]` - #`old_timestamp[0]` > 0$.
+For most accesses, this value will be small enough to fit in a `Half`.
+This chip thus enforces this by means of the following constraint:
+#render_constraint_table(register_chip, config, groups: "diff")
+
+With $#`old_timestamp`<#`timestamp`$ asserted, `old` is read from the register (@regw:c:read_old) and `val` is written back (@regw:c:write_val).
+#render_constraint_table(register_chip, config, groups: "interactions")
+
+This chip can either just write ($#`μ_write` = 1$), or both read and write ($#`μ_read` = 1$) in the same cycle.
+It must be asserted that at most one of these two options is selected:
+#render_constraint_table(register_chip, config, groups: "multiplicities")
+
+Lastly, this chip contributes the following interactions to the logup:
+#render_constraint_table(register_chip, config, groups: "output")
+
+== Padding
+The table can be padded to the next power of two with the following value assignments:
+#render_chip_padding_table(register_chip, config)
+
+= Future optimization ideas
 - `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs)
-- Additional fast path for registers? (Always guaranteed same timestamp, alignment could be an assumption, always only two values)
 - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
+- For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
\ No newline at end of file
diff --git a/spec/src/memw_register.toml b/spec/src/memw_register.toml
new file mode 100644
index 000000000..3e7cdcf28
--- /dev/null
+++ b/spec/src/memw_register.toml
@@ -0,0 +1,141 @@
+name = "MEMW_R"
+
+# Variables
+
+[[variables.input]]
+name = "address"
+type = "Byte"
+desc = "address of the register being accessed"
+pad = 0
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "timestamp at which the access takes place"
+pad = 0
+
+[[variables.input]]
+name = "val"
+type = "DWordWL"
+desc = "value being written to this register"
+pad = 0
+
+[[variables.output]]
+name = "old"
+type = "DWordWL"
+desc = "value of this register at `old_timestamp`."
+pad = 0
+
+[[variables.auxiliary]]
+name = "old_timestamp_lo"
+type = "Word"
+desc = "the lower limb of `old_timestamp`"
+pad = 0
+
+[[variables.virtual]]
+name = "old_timestamp"
+type = "DWordWL"
+desc = "timestamp at which this register was last accessed"
+def = ["cast", ["arr", "old_timestamp_lo", ["idx", "timestamp", 1]], "DWordWL"]
+
+[[variables.virtual]]
+name = "μ_sum"
+type = "Bit"
+desc = ""
+def = ["+", "μ_read", "μ_write"]
+
+[[variables.multiplicity]]
+name = "μ_read"
+type = "Bit"
+desc = "Whether we are performing a read (and hence return `old`)"
+pad = 0
+
+[[variables.multiplicity]]
+name = "μ_write"
+type = "Bit"
+desc = "Whether we are performing a write (and hence not return `old`)"
+pad = 0
+
+
+
+# Assumptions
+
+[[assumptions]]
+desc = "`IS_WORD[val[i]]`"
+iter = ["i", 0, 1]
+ref = "regw:a:val"
+
+[[assumptions]]
+desc = "`IS_WORD[timestamp[i]]`"
+iter = ["i", 0, 1]
+ref = "regw:a:timestamp"
+
+# Constraints
+
+[[constraint_groups]]
+name = "diff"
+
+[[constraints.diff]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["-", ["idx", "timestamp", 0], ["idx", "old_timestamp", 0], 1]]
+multiplicity = "μ_sum"
+ref = "regw:c:diff"
+
+
+[[constraint_groups]]
+name = "multiplicities"
+
+[[constraints.multiplicities]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ_read"]
+ref = "regw:c:μ_read_is_bit"
+
+[[constraints.multiplicities]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ_write"]
+ref = "regw:c:μ_write_is_bit"
+
+[[constraints.multiplicities]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ_sum"]
+ref = "regw:c:μ_sum_is_bit"
+
+[[constraint_groups]]
+name = "interactions"
+
+[[constraints.interactions]]
+kind = "interaction"
+tag = "memory"
+input = [1, ["arr", ["cast", ["+", ["*", 2, "address"], "i"], "Word"], 0], "old_timestamp", ["idx", "old", "i"]]
+iter = ["i", 0, 1]
+multiplicity = "μ_sum"
+ref = "regw:c:read_old"
+
+[[constraints.interactions]]
+kind = "interaction"
+tag = "memory"
+input = [1, ["arr", ["cast", ["+", ["*", 2, "address"], "i"], "Word"], 0], "timestamp", ["idx", "val", "i"]]
+iter = ["i", 0, 1]
+multiplicity = ["-", "μ_sum"]
+ref = "regw:c:write_val"
+
+
+[[constraint_groups]]
+name = "output"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["arr", ["cast", ["*", 2, "address"], "Word"], 0], ["arr", ["idx", "val", 0], ["idx", "val", 1], 0, 0, 0, 0, 0, 0], "timestamp", 1, 0, 0]
+output = ["arr", ["idx", "old", 0], ["idx", "old", 1], 0, 0, 0, 0, 0, 0]
+multiplicity = ["-", "μ_read"]
+
+[[constraints.output]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["arr", ["cast", ["*", 2, "address"], "Word"], 0], ["arr", ["idx", "val", 0], ["idx", "val", 1], 0, 0, 0, 0, 0, 0], "timestamp", 1, 0, 0]
+multiplicity = ["-", "μ_write"]

From 12474262a3b498c5da6dd61175aff90356b37419 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Wed, 25 Mar 2026 10:51:09 +0100
Subject: [PATCH 087/105] spec: Fix CPU sign bit constraints for `word_instr`
 (#435)

---
 spec/sign.typ      |  2 ++
 spec/src/cpu.toml  | 57 +++++++++++++++++++---------------------------
 spec/src/sign.toml |  3 ---
 3 files changed, 26 insertions(+), 36 deletions(-)

diff --git a/spec/sign.typ b/spec/sign.typ
index fc1b8d0a5..7135ba9c6 100644
--- a/spec/sign.typ
+++ b/spec/sign.typ
@@ -20,6 +20,8 @@ The #sign template operates on three variables:
 The #sign template operates on the following assumptions:
 #render_chip_assumptions(chip, config)
 
+If `sign` is set to $1$, `X` will be range-checked to be a halfword, and hence proving may fail if this is not ensured.
+
 = Constraints
 It takes only two constraints to compute the `sign` of `X`, given whether `X` represents a `signed` value or not. 
 When $#`signed` = 1$, the sign of `X` is equal to its most significant bit. 
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index 634d00bf9..a455b854f 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -234,9 +234,9 @@ desc = "The value of register `rs2`"
 pad = 0
 
 [[variables.auxiliary]]
-name = "rv1_sign_bit"
+name = "rv1_ext_bit"
 type = "Bit"
-desc = "The sign bit of `rv1` if seen as a 32-bit word"
+desc = "The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr`"
 pad = 0
 
 [[variables.auxiliary]]
@@ -246,9 +246,9 @@ desc = "The extended version of `rv1`, depending on `word_instr`"
 pad = 0
 
 [[variables.auxiliary]]
-name = "arg2_sign_bit"
+name = "rv2_ext_bit"
 type = "Bit"
-desc = "The sign bit of `arg2` if seen as a 32-bit word"
+desc = "The sign bit of `rv2` if seen as a 32-bit word, used for sign extension with `word_instr`"
 pad = 0
 
 [[variables.auxiliary]]
@@ -258,9 +258,9 @@ desc = "A multiplexed version of `rv2` and `imm`, to be used as second argument
 pad = 0
 
 [[variables.auxiliary]]
-name = "res_sign_bit"
+name = "res_ext_bit"
 type = "Bit"
-desc = "The sign bit of `res`, if seen as a 32-bit word"
+desc = "The sign bit of `res`, if seen as a 32-bit word, used for sign extension with `word_instr`"
 pad = 0
 
 [[variables.auxiliary]]
@@ -722,16 +722,10 @@ name = "ext"
 prefix = "E"
 
 [[constraints.ext]]
-kind = "arith"
-constraint = "$(#`rv1_sign_bit` or #`arg2_sign_bit` or #`res_sign_bit`) => #`word_instr`$"
-poly = ["*", ["+", "rv1_sign_bit", "arg2_sign_bit", "res_sign_bit"], ["not", "word_instr"]]
-
-[[constraints.ext]]
-kind = "interaction"
-tag = "MSB16"
-input = [["idx", "rv1", 1]]
-output = "rv1_sign_bit"
-multiplicity = "word_instr"
+kind = "template"
+tag = "SIGN"
+input = [["idx", "rv1", 1], "word_instr"]
+output = "rv1_ext_bit"
 
 [[constraints.ext]]
 kind = "arith"
@@ -740,15 +734,14 @@ poly = ["-", ["idx", ["cast", "arg1", "DWordWL"], 0], ["idx", ["cast", "rv1", "D
 
 [[constraints.ext]]
 kind = "arith"
-constraint = "$#`arg1[4:]` = #`rv1[2]` dot (1 - #`word_instr`) + (2^(32) - 1) dot #`rv1_sign_bit` dot #`signed`$"
-poly = ["-", ["idx", ["cast", "arg1", "DWordWL"], 1], ["*", ["not", "word_instr"], ["idx", "rv1", 2]], ["*", "signed", "rv1_sign_bit", ["-", ["^", 2, 32], 1]]]
+constraint = "$#`arg1[4:]` = #`rv1[2]` dot (1 - #`word_instr`) + (2^(32) - 1) dot #`rv1_ext_bit` dot #`signed`$"
+poly = ["-", ["idx", ["cast", "arg1", "DWordWL"], 1], ["*", ["not", "word_instr"], ["idx", "rv1", 2]], ["*", "signed", "rv1_ext_bit", ["-", ["^", 2, 32], 1]]]
 
 [[constraints.ext]]
-kind = "interaction"
-tag = "MSB16"
-input = [["idx", "rv2", 1]]
-output = "arg2_sign_bit"
-multiplicity = "word_instr"
+kind = "template"
+tag = "SIGN"
+input = [["idx", "rv2", 1], "word_instr"]
+output = "rv2_ext_bit"
 
 [[constraints.ext]]
 kind = "arith"
@@ -757,15 +750,14 @@ poly = ["-", ["idx", ["cast", "arg2", "DWordWL"], 0], ["*", ["not", "LOAD"], ["i
 
 [[constraints.ext]]
 kind = "arith"
-constraint = "$#`arg2[4:]` = (1 - #`LOAD`) dot ((1 - #`word_instr`) dot #`rv2[2]` + #`signed` dot #`arg2_sign_bit` dot (2^(32) - 1)) + (1 - #`BEQ` - #`BLT` - #`STORE`) dot #`imm[1]`$"
-poly = ["-", ["idx", ["cast", "arg2", "DWordWL"], 1], ["*", ["not", "LOAD"], ["not", "word_instr"], ["idx", "rv2", 2]], ["*", ["not", "LOAD"], "signed", "arg2_sign_bit", ["-", ["^", 2, 32], 1]], ["*", ["-", 1, "BEQ", "BLT", "STORE"], ["idx", "imm", 1]]]
+constraint = "$#`arg2[4:]` = (1 - #`LOAD`) dot ((1 - #`word_instr`) dot #`rv2[2]` + #`signed` dot #`rv2_ext_bit` dot (2^(32) - 1)) + (1 - #`BEQ` - #`BLT` - #`STORE`) dot #`imm[1]`$"
+poly = ["-", ["idx", ["cast", "arg2", "DWordWL"], 1], ["*", ["not", "LOAD"], ["not", "word_instr"], ["idx", "rv2", 2]], ["*", ["not", "LOAD"], "signed", "rv2_ext_bit", ["-", ["^", 2, 32], 1]], ["*", ["-", 1, "BEQ", "BLT", "STORE"], ["idx", "imm", 1]]]
 
 [[constraints.ext]]
-kind = "interaction"
-tag = "MSB8"
-input = [["idx", "res", 3]]
-output = "res_sign_bit"
-multiplicity = "word_instr"
+kind = "template"
+tag = "SIGN"
+input = [["idx", ["cast", "res", "DWordHL"], 1],  "word_instr"]
+output = "res_ext_bit"
 
 [[constraints.ext]]
 kind = "arith"
@@ -774,10 +766,9 @@ poly = ["*", ["not", "LOAD"], ["-", ["idx", "rvd", 0], ["idx", ["cast", "res", "
 
 [[constraints.ext]]
 kind = "arith"
-constraint = "$#`!LOAD` => #`rvd[1]` = (1 - #`word_instr`) dot #`res[4:]` + #`res_sign_bit` dot (2^(32) - 1)$"
+constraint = "$#`!LOAD` => #`rvd[1]` = (1 - #`word_instr`) dot #`res[4:]` + #`res_ext_bit` dot (2^(32) - 1)$"
 desc = "_Sign_ extend the output if it wasn't a `LOAD`. Only `LOAD` has both `write_register = 1` and `rvd ≠ res`. `LOAD` and `word_instr` are disjoint"
-poly = ["*", ["not", "LOAD"], ["-", ["idx", "rvd", 1], ["*", ["not", "word_instr"], ["idx", ["cast", "res", "DWordWL"], 1]], ["*", "res_sign_bit", ["-", ["^", 2, 32], 1]]]]
-
+poly = ["*", ["not", "LOAD"], ["-", ["idx", "rvd", 1], ["*", ["not", "word_instr"], ["idx", ["cast", "res", "DWordWL"], 1]], ["*", "res_ext_bit", ["-", ["^", 2, 32], 1]]]]
 
 
 [[constraint_groups]]
diff --git a/spec/src/sign.toml b/spec/src/sign.toml
index ca799e0cc..24e99bd0e 100644
--- a/spec/src/sign.toml
+++ b/spec/src/sign.toml
@@ -16,9 +16,6 @@ type = "Bit"
 desc = "Sign of `X`"
 
 
-[[assumptions]]
-desc = "`IS_HALF[X]`"
-
 [[assumptions]]
 desc = "`IS_BIT<signed>`"
 

From 3124c1abbc99c82f3d0323fb69de04e8e0ad2811 Mon Sep 17 00:00:00 2001
From: Nicole <nicole.graus@lambdaclass.com>
Date: Wed, 25 Mar 2026 10:01:10 -0300
Subject: [PATCH 088/105] update spec

---
 docs/spec/cpu.md       |  50 +++++++-------
 docs/spec/memw.md      |  98 +++++++++++++++++---------
 docs/spec/sign.md      |   5 +-
 docs/spec/spec_full.md | 151 +++++++++++++++++++++++++----------------
 4 files changed, 185 insertions(+), 119 deletions(-)

diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index e0a2af9c2..04fb1045e 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -109,25 +109,23 @@ The interactions with the wider system.
 
 We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, including the appropriate sign/zero extension, depending on `word_instr`.
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `CPU-CE57` | (`rv1_sign_bit` or `arg2_sign_bit` or `res_sign_bit`) => `word_instr` |  |
-| | _polynomial:_ `(rv1_sign_bit + arg2_sign_bit + res_sign_bit) * (1 - word_instr) = 0` | |
-| `CPU-CE58` | `MSB16[rv1_sign_bit; rv1[1]]` | word_instr |
-| `CPU-CE59` | `arg1[:4]` = `rv1[:2]` |  |
-| | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` | |
-| `CPU-CE60` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_sign_bit` dot `signed` |  |
-| | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_sign_bit * (2^32 - 1) = 0` | |
-| `CPU-CE61` | `MSB16[arg2_sign_bit; rv2[1]]` | word_instr |
-| `CPU-CE62` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |  |
-| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` | |
-| `CPU-CE63` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `arg2_sign_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |  |
-| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * arg2_sign_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` | |
-| `CPU-CE64` | `MSB8[res_sign_bit; res[3]]` | word_instr |
-| `CPU-CE65` | `!LOAD` => `rvd[0]` = `res[:4]` |  |
-| | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` | |
-| `CPU-CE66` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
-| | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_sign_bit * (2^32 - 1)) = 0` | |
+| Tag | Description |
+|-----|-------------|
+| `CPU-CE57` | `SIGN<rv1_ext_bit; rv1[1], word_instr>` |
+| `CPU-CE58` | `arg1[:4]` = `rv1[:2]` |
+| | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` |
+| `CPU-CE59` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_ext_bit` dot `signed` |
+| | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_ext_bit * (2^32 - 1) = 0` |
+| `CPU-CE60` | `SIGN<rv2_ext_bit; rv2[1], word_instr>` |
+| `CPU-CE61` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |
+| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` |
+| `CPU-CE62` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `rv2_ext_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |
+| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * rv2_ext_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` |
+| `CPU-CE63` | `SIGN<res_ext_bit; (res::DWordHL)[1], word_instr>` |
+| `CPU-CE64` | `!LOAD` => `rvd[0]` = `res[:4]` |
+| | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` |
+| `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_ext_bit` dot (2^(32) - 1) |
+| | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_ext_bit * (2^32 - 1)) = 0` |
 
 ## Other constraints
 
@@ -135,11 +133,11 @@ For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference betw
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CO67` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
-| `CPU-CO68` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
+| `CPU-CO66` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
+| `CPU-CO67` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
 | | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| `CPU-CO69` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
-| `CPU-CO70` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
+| `CPU-CO68` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
+| `CPU-CO69` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
 
 > **Note:** Document the choice to not have a multiplicity column here for padding
 
@@ -202,11 +200,11 @@ This approach minimizes the number of dependent lookups, increasing only multipl
 |------|------|-------------|
 | `rv1` | `DWordWHH` | The value of register `rs1` |
 | `rv2` | `DWordWHH` | The value of register `rs2` |
-| `rv1_sign_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word |
+| `rv1_ext_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr` |
 | `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
-| `arg2_sign_bit` | `Bit` | The sign bit of `arg2` if seen as a 32-bit word |
+| `rv2_ext_bit` | `Bit` | The sign bit of `rv2` if seen as a 32-bit word, used for sign extension with `word_instr` |
 | `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
-| `res_sign_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word |
+| `res_ext_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word, used for sign extension with `word_instr` |
 | `res` | `DWordBL` | The ALU result |
 | `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
 | `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
index 4862b9ebf..d44fa38af 100644
--- a/docs/spec/memw.md
+++ b/docs/spec/memw.md
@@ -8,58 +8,70 @@ The `MEMW` chip is comprised of  variables that are expressed using  columns:
 
 = Assumptions
 
-Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. These properties are necessary for the consistency of the system as a whole, and therefore we document it here, keeping the type information as a reading help.
+Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. Still, these properties are necessary for the consistency of the system as a whole, and therefore we document them here, keeping the type information as a reading help.
 
 = Constraints
 
-We can compute the addresses for the later bytes based on a single bit each, indicating whether adding `i` to `base_address` overflows the lower limb. We can safely assume that additions for which this bit is not correctly set will have either an overflow on the upper or lower word, and hence not match any existing memory tokens, which are only initialized for correctly formatted and range-checked doublewords (see [memory]).
+Depending on the values of `write2`, `write4` and `write8`, the addresses following `base_address` need to be constructed. Rather than computing these in full (which would require the later addresses to be instantiated), it suffices to know the `carry`: the bit indicating whether ``base_address`_0 + t >= 2^32`, i.e., whether adding `t in [1, 7]` to `base_address` requires a carry from the lower to the upper limb. Note that it is safe for the prover to choose these bits: additions for which this bit is not correctly set will yield an address where either the lower or upper limb is out of bounds. As such, the constructed address will not match any existing memory tokens, which are only initialized for correctly formatted and range-checked doublewords (see [memory]).
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
-| `MEMW-C2` |  | `w2` => `μ_sum` |  |
+| `MEMW-C1` |  | `IS_BIT<μ_read>` |  |
+| `MEMW-C2` |  | `IS_BIT<μ_write>` |  |
+| `MEMW-C3` |  | `IS_BIT<μ_sum>` |  |
+| `MEMW-C4` |  | `w2` => `μ_sum` |  |
 | | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C3.i` | i ∈ [0, 6] | `IS_BIT<add_limb_overflow[i]>` |  |
-| `MEMW-C4` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `MEMW-C5` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `MEMW-C6.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `MEMW-C7.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+| `MEMW-C5.i` | i ∈ [0, 6] | `IS_BIT<carry[i]>` |  |
+| `MEMW-C6` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `MEMW-C7` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `MEMW-C8.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `MEMW-C9.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
 
-As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
+As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures it is appropriately range checked (this assumes no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
 
-There is no need to check that the address does not overflow, as our address calculations are not performed modulo `2^64` here, and any overflow will result in an address without matching initialization.
+There is no need to check that the additions do not overflow, as our address calculations are not performed modulo `2^64` here, and any overflow will result in an address without matching initialization.
 
 The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-CM8` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `MEMW-CM9` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM10` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM11` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `MEMW-CM12.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM13.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `MEMW-CM14.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM15.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+| `MEMW-CM10` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM11` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM12` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM13` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM14.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM15.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM16.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM17.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
 
-This chip contributes the following to the lookup argument.
+This chip contributes the following to the lookup argument:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CO16` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
-| `MEMW-CO17` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+| `MEMW-CO18` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_read |
+| `MEMW-CO19` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_write |
+
+= Padding The table can be padded to the next power of two with the following value assignments:
 
 = Read-size aligned fast path
 
-When a memory access happens at an address with proper alignment (that is, enough trailing zeros) for its access size, and all accessed elements were last accessed at the same timestamp, we can instead use the  chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all values of `add_limb_overflow` would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
+When a memory access happens at an address with proper alignment for its access size (i.e., adding the access size to `base_address`'s lowest limb does not overflow), and all accessed elements were last accessed at the same timestamp, we can instead use the  chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all values of `add_limb_overflow` would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
 
 Further logic remains essentially the same, so we briefly present the relevant tables for this chip.
 
 The  chip only needs  variables, expressed through  columns.
 
-= Future optimization ideas
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+= Register fast-path
+
+The  chip provides a fast-path for accessing registers. This fast-path leverages that registers + can be addressed using a `Byte`, rather than a full `DWord`, + are constantly accessed, i.e., ``timestamp` - `old_timestamp`` is small, and + have a fixed access pattern to achieve a footprint that is significantly smaller than both  and .
 
-- `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Additional fast path for registers? (Always guaranteed same timestamp, alignment could be an assumption, always only two values) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
+Note: as a result of hard optimization, this chip can only be used for register accesses for which (1) ``timestamp` - `old_timestamp` in [1, 2^16]`, and (2) ``timestamp[0]` > `old_timestamp[0]``. If either of these rules does not apply to your access, you should fall back to using `MEMW_A`.
+
+Note moreover that this chip does not guard against misaligned register access faults: to access a register with a given `address`, one must provide `2 dot `address`` in the lookup.
 
 ## Columns
 
@@ -68,9 +80,9 @@ The  chip only needs  variables, expressed through  columns.
 | Name | Type | Description |
 |------|------|-------------|
 | `is_register` | `Bit` | Whether the address represents a register index |
-| `base_address` | `DWordWL` | The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is |
-| `value` | `BaseField[8]` | The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
-| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `base_address` | `DWordWL` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
+| `value` | `BaseField[8]` | The values to store in memory. For RAM, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access occurs |
 | `write2` | `Bit` | Whether to write exactly 2 values |
 | `write4` | `Bit` | Whether to write exactly 4 values |
 | `write8` | `Bit` | Whether to write exactly 8 values |
@@ -85,8 +97,8 @@ The  chip only needs  variables, expressed through  columns.
 
 | Name | Type | Description |
 |------|------|-------------|
-| `add_limb_overflow` | `Bit[7]` | Whether adding `i` to `base_address[0]` as a field element exceeds $2^32$ |
-| `old_timestamp` | `DWordWL[8]` | The timestamp at which the address was last accessed |
+| `carry` | `Bit[7]` | Whether `base_address[0] + i + 1` $>= 2^32$ |
+| `old_timestamp` | `DWordWL[8]` | The timestamp at which address `base_address + i` was last accessed |
 
 ### Virtual
 
@@ -109,7 +121,7 @@ w4 := write4 + write8
 
 **Definition of `address_add`:**
 ```
-address_add := ['arr', ['+', ['idx', 'base_address', 0], 'i', 1, ['*', ['-', ['^', 2, 32]], ['idx', 'add_limb_overflow', 'i']]], ['+', ['idx', 'base_address', 1], ['idx', 'add_limb_overflow', 'i']]]
+address_add := ['arr', ['-', ['+', ['idx', 'base_address', 0], 'i', 1], ['*', ['^', 2, 32], ['idx', 'carry', 'i']]], ['+', ['idx', 'base_address', 1], ['idx', 'carry', 'i']]]
 ```
 
 **Definition of `μ_sum`:**
@@ -124,6 +136,8 @@ address_add := ['arr', ['+', ['idx', 'base_address', 0], 'i', 1, ['*', ['-', ['^
 | `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
 | `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
 
+The  chip is comprised of  variables that are expressed using  columns:
+
 ## Assumptions
 
 | Tag | Range | Description |
@@ -133,4 +147,24 @@ address_add := ['arr', ['+', ['idx', 'base_address', 0], 'i', 1, ['*', ['-', ['^
 | `MEMW-A3` |  | `IS_BIT<write4>` |
 | `MEMW-A4` |  | `IS_BIT<write8>` |
 | `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
-| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
\ No newline at end of file
+| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+The following range checks are assumed to be performed/enforced outside of this chip:
+
+## Constraints
+
+Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the time. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that ``old_timestamp[1]` = `timestamp[1]``;  can be used for accesses where this is not the case.
+
+Verifying that ``timestamp` > `old_timestamp`` now simplifies to verifying that ``timestamp[0]` - `old_timestamp[0]` > 0`. For most accesses, this value will be small enough to fit in a `Half`. This chip thus enforces this by means of the following constraint:
+
+With ``old_timestamp`<`timestamp`` asserted, `old` is read from the register ([regw:c:read_old]) and `val` is written back ([regw:c:write_val]).
+
+This chip can either just write (``μ_write` = 1`), or both read and write (``μ_read` = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
+
+Lastly, this chip contributes the following interactions to the logup:
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+= Future optimization ideas - `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes. - For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
\ No newline at end of file
diff --git a/docs/spec/sign.md b/docs/spec/sign.md
index df9cac450..592c78489 100644
--- a/docs/spec/sign.md
+++ b/docs/spec/sign.md
@@ -6,6 +6,8 @@ It constrains that `sign` is set to `1` when both `X`'s most significant bit and
 
 = Assumptions The  template operates on the following assumptions:
 
+If `sign` is set to `1`, `X` will be range-checked to be a halfword, and hence proving may fail if this is not ensured.
+
 = Constraints It takes only two constraints to compute the `sign` of `X`, given whether `X` represents a `signed` value or not. When ``signed` = 1`, the sign of `X` is equal to its most significant bit. This value is extracted in [sign:c:sign_if_signed]. If `X` is unsigned (i.e., ``signed` = 0`), its sign is always `0`. This is constrained by [sign:c:sign_if_unsigned].
 
 ## Columns
@@ -27,8 +29,7 @@ It constrains that `sign` is set to `1` when both `X`'s most significant bit and
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `SIGN-A1` |  | `IS_HALF[X]` |
-| `SIGN-A2` |  | `IS_BIT<signed>` |
+| `SIGN-A1` |  | `IS_BIT<signed>` |
 
 ## Constraints
 
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index 17ed0a923..86ccf58c8 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -167,6 +167,8 @@ It constrains that `sign` is set to `1` when both `X`'s most significant bit and
 
 = Assumptions The  template operates on the following assumptions:
 
+If `sign` is set to `1`, `X` will be range-checked to be a halfword, and hence proving may fail if this is not ensured.
+
 = Constraints It takes only two constraints to compute the `sign` of `X`, given whether `X` represents a `signed` value or not. When ``signed` = 1`, the sign of `X` is equal to its most significant bit. This value is extracted in [sign:c:sign_if_signed]. If `X` is unsigned (i.e., ``signed` = 0`), its sign is always `0`. This is constrained by [sign:c:sign_if_unsigned].
 
 ## Columns
@@ -188,8 +190,7 @@ It constrains that `sign` is set to `1` when both `X`'s most significant bit and
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `SIGN-A1` |  | `IS_HALF[X]` |
-| `SIGN-A2` |  | `IS_BIT<signed>` |
+| `SIGN-A1` |  | `IS_BIT<signed>` |
 
 ## Constraints
 
@@ -517,25 +518,23 @@ The interactions with the wider system.
 
 We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, including the appropriate sign/zero extension, depending on `word_instr`.
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `CPU-CE57` | (`rv1_sign_bit` or `arg2_sign_bit` or `res_sign_bit`) => `word_instr` |  |
-| | _polynomial:_ `(rv1_sign_bit + arg2_sign_bit + res_sign_bit) * (1 - word_instr) = 0` | |
-| `CPU-CE58` | `MSB16[rv1_sign_bit; rv1[1]]` | word_instr |
-| `CPU-CE59` | `arg1[:4]` = `rv1[:2]` |  |
-| | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` | |
-| `CPU-CE60` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_sign_bit` dot `signed` |  |
-| | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_sign_bit * (2^32 - 1) = 0` | |
-| `CPU-CE61` | `MSB16[arg2_sign_bit; rv2[1]]` | word_instr |
-| `CPU-CE62` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |  |
-| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` | |
-| `CPU-CE63` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `arg2_sign_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |  |
-| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * arg2_sign_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` | |
-| `CPU-CE64` | `MSB8[res_sign_bit; res[3]]` | word_instr |
-| `CPU-CE65` | `!LOAD` => `rvd[0]` = `res[:4]` |  |
-| | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` | |
-| `CPU-CE66` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_sign_bit` dot (2^(32) - 1) |  |
-| | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_sign_bit * (2^32 - 1)) = 0` | |
+| Tag | Description |
+|-----|-------------|
+| `CPU-CE57` | `SIGN<rv1_ext_bit; rv1[1], word_instr>` |
+| `CPU-CE58` | `arg1[:4]` = `rv1[:2]` |
+| | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` |
+| `CPU-CE59` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_ext_bit` dot `signed` |
+| | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_ext_bit * (2^32 - 1) = 0` |
+| `CPU-CE60` | `SIGN<rv2_ext_bit; rv2[1], word_instr>` |
+| `CPU-CE61` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |
+| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` |
+| `CPU-CE62` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `rv2_ext_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |
+| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * rv2_ext_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` |
+| `CPU-CE63` | `SIGN<res_ext_bit; (res::DWordHL)[1], word_instr>` |
+| `CPU-CE64` | `!LOAD` => `rvd[0]` = `res[:4]` |
+| | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` |
+| `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_ext_bit` dot (2^(32) - 1) |
+| | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_ext_bit * (2^32 - 1)) = 0` |
 
 ## Other constraints
 
@@ -543,11 +542,11 @@ For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference betw
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CO67` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
-| `CPU-CO68` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
+| `CPU-CO66` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
+| `CPU-CO67` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
 | | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| `CPU-CO69` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
-| `CPU-CO70` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
+| `CPU-CO68` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
+| `CPU-CO69` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
 
 > **Note:** Document the choice to not have a multiplicity column here for padding
 
@@ -610,11 +609,11 @@ This approach minimizes the number of dependent lookups, increasing only multipl
 |------|------|-------------|
 | `rv1` | `DWordWHH` | The value of register `rs1` |
 | `rv2` | `DWordWHH` | The value of register `rs2` |
-| `rv1_sign_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word |
+| `rv1_ext_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr` |
 | `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
-| `arg2_sign_bit` | `Bit` | The sign bit of `arg2` if seen as a 32-bit word |
+| `rv2_ext_bit` | `Bit` | The sign bit of `rv2` if seen as a 32-bit word, used for sign extension with `word_instr` |
 | `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
-| `res_sign_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word |
+| `res_ext_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word, used for sign extension with `word_instr` |
 | `res` | `DWordBL` | The ALU result |
 | `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
 | `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
@@ -948,58 +947,70 @@ The `MEMW` chip is comprised of  variables that are expressed using  columns:
 
 = Assumptions
 
-Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. These properties are necessary for the consistency of the system as a whole, and therefore we document it here, keeping the type information as a reading help.
+Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. Still, these properties are necessary for the consistency of the system as a whole, and therefore we document it here, keeping the type information as a reading help.
 
 = Constraints
 
-We can compute the addresses for the later bytes based on a single bit each, indicating whether adding `i` to `base_address` overflows the lower limb. We can safely assume that additions for which this bit is not correctly set will have either an overflow on the upper or lower word, and hence not match any existing memory tokens, which are only initialized for correctly formatted and range-checked doublewords (see [memory]).
+Depending on the values of `write2`, `write4` and `write8`, the addresses following `base_address` need to be constructed. Rather than computing these in full (which would require the later addresses to be instantiated), it suffices to know the `carry`: the bit indicating whether `base_address[0] + t >= 2^32`, i.e., whether adding `t in [1, 7]` to `base_address` requires a carry from the lower to the upper limb. Note that it is safe for the prover to choose these bits: additions for which this bit is not correctly set will yield an address where either the lower or upper limb is out of bounds. As such, the constructed address will not match any existing memory tokens, which are only initialized for correctly formatted and range-checked doublewords (see [memory]).
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-C1` |  | `IS_BIT<μ_sum>` |  |
-| `MEMW-C2` |  | `w2` => `μ_sum` |  |
+| `MEMW-C1` |  | `IS_BIT<μ_read>` |  |
+| `MEMW-C2` |  | `IS_BIT<μ_write>` |  |
+| `MEMW-C3` |  | `IS_BIT<μ_sum>` |  |
+| `MEMW-C4` |  | `w2` => `μ_sum` |  |
 | | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C3.i` | i ∈ [0, 6] | `IS_BIT<add_limb_overflow[i]>` |  |
-| `MEMW-C4` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `MEMW-C5` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `MEMW-C6.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `MEMW-C7.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+| `MEMW-C5.i` | i ∈ [0, 6] | `IS_BIT<carry[i]>` |  |
+| `MEMW-C6` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `MEMW-C7` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `MEMW-C8.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `MEMW-C9.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
 
-As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures appropriate range checking (as long as no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
+As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures it is appropriately range checked (this assumes no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
 
-There is no need to check that the address does not overflow, as our address calculations are not performed modulo `2^64` here, and any overflow will result in an address without matching initialization.
+There is no need to check that the additions do not overflow, as our address calculations are not performed modulo `2^64` here, and any overflow will result in an address without matching initialization.
 
 The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-CM8` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `MEMW-CM9` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM10` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM11` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `MEMW-CM12.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM13.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `MEMW-CM14.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM15.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+| `MEMW-CM10` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM11` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM12` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM13` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM14.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM15.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM16.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM17.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
 
-This chip contributes the following to the lookup argument.
+This chip contributes the following to the lookup argument:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CO16` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | μ_read |
-| `MEMW-CO17` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | μ_write |
+| `MEMW-CO18` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_read |
+| `MEMW-CO19` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_write |
+
+= Padding The table can be padded to the next power of two with the following value assignments:
 
 = Read-size aligned fast path
 
-When a memory access happens at an address with proper alignment (that is, enough trailing zeros) for its access size, and all accessed elements were last accessed at the same timestamp, we can instead use the  chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all values of `add_limb_overflow` would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
+When a memory access happens at an address with proper alignment for its access size (i.e., adding the access size to `base_address`'s lowest limb does not overflow), and all accessed elements were last accessed at the same timestamp, we can instead use the  chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all values of `carry` would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
 
 Further logic remains essentially the same, so we briefly present the relevant tables for this chip.
 
 The  chip only needs  variables, expressed through  columns.
 
-= Future optimization ideas
+## Padding
 
-- `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Additional fast path for registers? (Always guaranteed same timestamp, alignment could be an assumption, always only two values) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
+The table can be padded to the next power of two with the following value assignments:
+
+= Register fast-path
+
+The  chip provides a fast-path for accessing registers. This fast-path leverages that registers + can be addressed using a `Byte`, rather than a full `DWord`, + are constantly accessed, i.e., ``timestamp` - `old_timestamp`` is small, and + have a fixed access pattern to achieve a footprint that is significantly smaller than both  and .
+
+Note: as a result of hard optimization, this chip can only be used for register accesses for which + ``timestamp` - `old_timestamp` in [1, 2^16]`, and + ``timestamp[0]` > `old_timestamp[0]`` If either of these rules does not apply to your access, you should fall back to using `MEMW_A`.
+
+Note moreover that this chip does not guard against misaligned register access faults: to access the register with a given `address`, one must provide 2 dot `address` in the lookup.
 
 ## Columns
 
@@ -1008,9 +1019,9 @@ The  chip only needs  variables, expressed through  columns.
 | Name | Type | Description |
 |------|------|-------------|
 | `is_register` | `Bit` | Whether the address represents a register index |
-| `base_address` | `DWordWL` | The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is |
-| `value` | `BaseField[8]` | The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
-| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `base_address` | `DWordWL` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
+| `value` | `BaseField[8]` | The values to store in memory. For RAM, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access occurs |
 | `write2` | `Bit` | Whether to write exactly 2 values |
 | `write4` | `Bit` | Whether to write exactly 4 values |
 | `write8` | `Bit` | Whether to write exactly 8 values |
@@ -1025,8 +1036,8 @@ The  chip only needs  variables, expressed through  columns.
 
 | Name | Type | Description |
 |------|------|-------------|
-| `add_limb_overflow` | `Bit[7]` | Whether adding `i` to `base_address[0]` as a field element exceeds $2^32$ |
-| `old_timestamp` | `DWordWL[8]` | The timestamp at which the address was last accessed |
+| `carry` | `Bit[7]` | Whether `base_address[0] + i + 1` $>= 2^32$ |
+| `old_timestamp` | `DWordWL[8]` | The timestamp at which address `base_address + i` was last accessed |
 
 ### Virtual
 
@@ -1049,7 +1060,7 @@ w4 := write4 + write8
 
 **Definition of `address_add`:**
 ```
-address_add := ['arr', ['+', ['idx', 'base_address', 0], 'i', 1, ['*', ['-', ['^', 2, 32]], ['idx', 'add_limb_overflow', 'i']]], ['+', ['idx', 'base_address', 1], ['idx', 'add_limb_overflow', 'i']]]
+address_add := ['arr', ['-', ['+', ['idx', 'base_address', 0], 'i', 1], ['*', ['^', 2, 32], ['idx', 'carry', 'i']]], ['+', ['idx', 'base_address', 1], ['idx', 'carry', 'i']]]
 ```
 
 **Definition of `μ_sum`:**
@@ -1064,6 +1075,8 @@ address_add := ['arr', ['+', ['idx', 'base_address', 0], 'i', 1, ['*', ['-', ['^
 | `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
 | `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
 
+The  chip is comprised of  variables that are expressed using  columns:
+
 ## Assumptions
 
 | Tag | Range | Description |
@@ -1075,6 +1088,26 @@ address_add := ['arr', ['+', ['idx', 'base_address', 0], 'i', 1, ['*', ['-', ['^
 | `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
 | `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
+The following range checks are assumed to be performed/enforced outside of this chip:
+
+## Constraints
+
+Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the time. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that ``old_timestamp[1]` = `timestamp[1]``;  can be used for accesses where this is not the case.
+
+Verifying that ``timestamp` > `old_timestamp`` now simplifies to verifying that ``timestamp[0]` - `old_timestamp[0]` > 0`. For most accesses, this value will be small enough to fit in a `Half`. This chip thus enforces this by means of the following constraint:
+
+With ``old_timestamp`<`timestamp`` asserted, `old` is read from the register ([regw:c:read_old]) and `val` is written back ([regw:c:write_val]).
+
+This chip can either just write (``μ_write` = 1`), or both read and write (``μ_read` = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
+
+Lastly, this chip contributes the following interactions to the logup:
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+= Future optimization ideas - `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes. - For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
+
 ---
 
 # LT Chip

From caa053c0772166f6e23daef9efea32445fac6236 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Fri, 10 Apr 2026 17:28:03 +0200
Subject: [PATCH 089/105] spec: SHA256 accelerator (#372)

* spec: First sha256 accelerator draft

* Types checked

* Update typst description

* HWSLC -> HWSL

* Apply suggestions from code review

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
Co-authored-by: Robin Jadoul <robin.jadoul@gmail.com>

* preliminary changes after review

* fix out_e carry check

* rotxor with 2 fewer columns

* Correct count of bytes for out range checks

* Explicit tables for SHA256_K

* whoops

* cosmetic + explanation

* Apply suggestions from code review

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* Update spec/sha256.typ

* Replace base_addr by entry in addr

* review comments

* structure

* Apply suggestions from code review

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* Update spec/sha256.typ

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* spec/sha256: rebase fixes

---------

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
Co-authored-by: Erik Takke <erik.takke@3milabs.tech>
---
 spec/book.typ                |   1 +
 spec/sha256.typ              | 204 +++++++++++++++++++++++++
 spec/src/config.toml         |  16 ++
 spec/src/rotxor.toml         | 186 ++++++++++++++++++++++
 spec/src/sha256.toml         | 270 ++++++++++++++++++++++++++++++++
 spec/src/sha256consts.toml   |  28 ++++
 spec/src/sha256msgsched.toml | 163 ++++++++++++++++++++
 spec/src/sha256round.toml    | 288 +++++++++++++++++++++++++++++++++++
 spec/src/signatures.toml     |  24 +++
 9 files changed, 1180 insertions(+)
 create mode 100644 spec/sha256.typ
 create mode 100644 spec/src/rotxor.toml
 create mode 100644 spec/src/sha256.toml
 create mode 100644 spec/src/sha256consts.toml
 create mode 100644 spec/src/sha256msgsched.toml
 create mode 100644 spec/src/sha256round.toml

diff --git a/spec/book.typ b/spec/book.typ
index bcc5fec19..3de02e363 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -42,6 +42,7 @@
       ("about_ecalls.typ", [About `ECALL`], <ecall>),
       ("halt.typ", [`HALT` chip], <halt>),
       ("commit.typ", [`COMMIT` chip], <commit>),
+      ("sha256.typ", [`SHA256` accelerator], <sha256>),
     ))
   )
 )
diff --git a/spec/sha256.typ b/spec/sha256.typ
new file mode 100644
index 000000000..d72d5fc7a
--- /dev/null
+++ b/spec/sha256.typ
@@ -0,0 +1,204 @@
+#import "/book.typ": book-page, aside, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_variable_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+  render_chip_assumptions,
+  render_chip_padding_table,
+)
+
+#let config = load_config()
+
+#show: book-page("sha256.typ")
+
+#let sha256chip = load_chip("src/sha256.toml", config)
+#let sha256msgschedchip = load_chip("src/sha256msgsched.toml", config)
+#let sha256roundchip = load_chip("src/sha256round.toml", config)
+#let rotxorchip = load_chip("src/rotxor.toml", config)
+#let sha256 = raw(sha256chip.name)
+#let sha256msgsched = raw(sha256msgschedchip.name)
+#let sha256round = raw(sha256roundchip.name)
+#let rotxor = raw(rotxorchip.name)
+
+The following chips constitute an accelerator for the SHA256 compression function; 
+other aspects of SHA256 hashing (such as repeated compression invocation, 
+input padding and state initialization) fall outside the scope of this accelerator.
+
+The base #sha256 chip provides the `ECALL` interface, interacts with memory and then delegates to the #sha256msgsched and #sha256round chips
+to perform the message schedule and the compression rounds, respectively.
+The `SHA256_M` interaction signature is used to represent the output of the message schedule.
+The `SHA256_K` interaction signature is used to represent the `k` constants.
+It could either be instantiated with a (short) precomputed table, or through hardcoded LogUp contributions in this chip.
+For this exposition, we choose the former option, and present a table further below.
+Additionally, we introduce a #rotxor chip to perform the common action of computing the XOR of three rotations (or shifts) of a word.
+
+Most of the structure and variable naming follows the pseudocode of the Wikipedia page#footnote(link("https://web.archive.org/web/20260320010021/https://en.wikipedia.org/wiki/SHA-2#Pseudocode")).
+
+= #sha256 chip
+
+== Columns
+#let nr_variables = total_nr_variables(sha256chip)
+#let nr_columns = total_nr_instantiated_columns(sha256chip, config)
+
+The #sha256 chip leverages #nr_variables variables, spanning #nr_columns columns:
+#render_chip_variable_table(sha256chip, config)
+
+== Constraints
+
+The first responsibility of the chip is to read the current state and message chunk from memory,
+passed as arguments through pointers.
+Since the memory ranges could overlap, we read the chunk first (in @sha256:c:read_chunk, at timestamp `timestamp`), before reading and writing the state (in @sha256:c:read_state, at timestamp `timestamp + 1`).
+The addresses containing the state and the current chunk are passed in as arguments `A0 = x10` and `A1 = x11`, respectively.
+Note that following the SHA256 spec, this state and the chunks are read and written as big-endian.
+#render_constraint_table(sha256chip, config, groups: "memory")
+
+Then we prepare the message schedule, by emitting the input chunk with multiplicities
+corresponding to the number of times it will be read during a compression evaluation.
+The #sha256msgsched chip is then invoked implicitly, both by itself and by #sha256round, setting the `amount`
+column appropriately for the number of times the `w` value is required.
+#render_constraint_table(sha256chip, config, groups: "sched")
+
+And finally, we provide the boundaries for the #sha256round chip and the
+final addition of the compression to the old state.
+Observe that we embed the addition into the upper 32 bits of a double word,
+in order to satisfy and use the `ADD` chip.
+#render_constraint_table(sha256chip, config, groups: "compress")
+
+In this VM, we assign syscall number -1 to the #sha256 accelerator.
+The chip therefore contributes the following interaction to the lookup-argument:
+#render_constraint_table(sha256chip, config, groups: "lookup")
+
+== Padding
+
+#render_chip_padding_table(sha256chip, config)
+
+= #sha256msgsched chip
+
+== Columns
+
+#let nr_variables = total_nr_variables(sha256msgschedchip)
+#let nr_columns = total_nr_instantiated_columns(sha256msgschedchip, config)
+
+The #sha256msgsched chip leverages #nr_variables variables, spanning #nr_columns columns:
+#render_chip_variable_table(sha256msgschedchip, config)
+
+== Assumptions
+
+#render_chip_assumptions(sha256msgschedchip, config)
+
+== Constraints
+
+First, we gather the dependencies from earlier in the message schedule.
+
+#render_constraint_table(sha256msgschedchip, config, groups: "lookback")
+
+Then, we calculate the result.
+It suffices to check that the carry of adding four range-checked words
+into a range-checked word is not too big, following the logic from @add.
+In this case, using the `IS_BYTE` constraint allows us to add multiple words together
+at the same time, without needing to store and range-check intermediate results.
+#render_constraint_table(sha256msgschedchip, config, groups: "calc")
+
+Finally, we contribute to the LogUp.
+#render_constraint_table(sha256msgschedchip, config, groups: "output")
+
+= #sha256round chip
+
+== Columns
+
+#let nr_variables = total_nr_variables(sha256roundchip)
+#let nr_columns = total_nr_instantiated_columns(sha256roundchip, config)
+
+The #sha256round chip leverages #nr_variables variables, spanning #nr_columns columns:
+#render_chip_variable_table(sha256roundchip, config)
+
+== Assumptions
+
+#render_chip_assumptions(sha256roundchip, config)
+
+== Constraints
+
+First, we compute the necessary intermediate values.
+#let bitand = math.class("binary", math.amp)
+To compute `maj`, observe that $ (a bitand b) xor (a bitand c) xor (b bitand c) = (a bitand b) xor (c bitand (a xor b)), $
+by distribution.
+Additionally, since for this form, $(a bitand b)$ and $(a xor b)$ are disjoint, so are $(a bitand b)$ and $(c bitand (a xor b))$,
+and hence we can replace that top-level XOR with a field addition to compute $(a bitand b) + (c bitand (a xor b))$,
+needing fewer intermediate columns.
+Similarly, `ch` can be written as $(e bitand f) + ((2^32 - 1 - e) bitand g)$.
+#render_constraint_table(sha256roundchip, config, groups: "value")
+
+Then we constrain the addition for the new state, constraining additions with the same `IS_BYTE` trick as before.
+#render_constraint_table(sha256roundchip, config, groups: "addition")
+
+Finally, we chain the rounds together through the interactions.
+#render_constraint_table(sha256roundchip, config, groups: "output")
+
+== Padding
+
+#render_chip_padding_table(sha256roundchip, config)
+
+= #rotxor chip
+
+
+This chip takes as input `a`, `r0`, `r1`, `r2` (4-bit values) and a bit `last_rot` to compute
+$
+  cases(
+    (a >>> (16 + r_0)) xor (a >>> (16 + r_0 - r_1)) xor (a >>> r_2) quad "if" #`last_rot`,
+    (a >>> (16 + r_0)) xor (a >>> (16 + r_0 - r_1)) xor (a >> r_2) quad "if" #`!last_rot`
+  ),
+$
+where we let $>>>$ denote right rotation and $>>$ logical shift right.
+We choose this representation so that all shift amounts required fit into 4 bits,
+making the usage of `HWSL` more straightforward and avoiding extra columns to represent more bits.
+
+== Columns
+
+#let nr_variables = total_nr_variables(rotxorchip)
+#let nr_columns = total_nr_instantiated_columns(rotxorchip, config)
+The #rotxor chip leverages #nr_variables variables, spanning #nr_columns columns:
+
+#render_chip_variable_table(rotxorchip, config)
+
+== Assumptions
+
+Range checking for all elements is inherited from the bitwise lookups.
+We can safely assume that no `r_i` will be zero, and avoid extra work due to right rotation needing `16 - shift` as arguments to the `HWSL` interactions.
+#render_chip_assumptions(rotxorchip, config)
+
+== Constraints
+
+We first compute all rotations (or shifts) of `a`.
+`a1` is computed as a left rotation of `a0`, in order to not need
+additional columns to represent the full right-rotation amounts.
+#render_constraint_table(rotxorchip, config, groups: "shift")
+
+Then we compute the bitwise XOR of the results.
+#render_constraint_table(rotxorchip, config, groups: "xor")
+
+And finally, we contribute to the lookup argument.
+#render_constraint_table(rotxorchip, config, groups: "output")
+
+== Padding
+
+#render_chip_padding_table(rotxorchip, config)
+
+= Constant lookup
+
+#let sha256_kchip = load_chip("src/sha256consts.toml", config)
+#let sha256_k = raw(sha256_kchip.name)
+
+As mentioned, we provide the round constants through a short precomputed lookup table: #sha256_k.
+
+#render_chip_variable_table(sha256_kchip, config)
+#render_constraint_table(sha256_kchip, config)
+
+= Notes/optimizations
+- This could instead be designed following the #link("https://github.com/riscv/riscv-crypto")[RISC-V Crypto Scalar extension `Zknh`],
+  for wider compatibility, but this design is likely to be more efficient.
+  It is still possible, if desired, to expose #rotxor (or a selection of parameter instantiations thereof)
+  as implementation for these primitives.
+- The message schedule could be exposed as its own ECALL instead, but the direct integration leads to better efficiency.
+- Some of these chips could be made narrower, at the cost of introducing some extra lookups and extra tables to compute and store intermediate results.
diff --git a/spec/src/config.toml b/spec/src/config.toml
index d9dcaec37..9ced2ce0d 100644
--- a/spec/src/config.toml
+++ b/spec/src/config.toml
@@ -91,6 +91,22 @@ desc = """\
        The `Word` is the *most* significant digit.
        """
 
+[[variables.types]]
+label = "WordBL"
+subtypes = ["Byte", "Byte", "Byte", "Byte"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^32)$. \\
+       Represented as an array of four `Byte` variables.\
+       """
+
+[[variables.types]]
+label = "WordHL"
+subtypes = ["Half", "Half"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^32)$. \\
+       Represented as an array of two `Half` variables.\
+       """
+
 [[variables.types]]
 label = "QuadHL"
 subtypes = ["Half", "Half", "Half", "Half", "Half", "Half", "Half", "Half"]
diff --git a/spec/src/rotxor.toml b/spec/src/rotxor.toml
new file mode 100644
index 000000000..730e9bda5
--- /dev/null
+++ b/spec/src/rotxor.toml
@@ -0,0 +1,186 @@
+name = "ROTXOR"
+
+[[variables.input]]
+name = "a"
+type = "WordHL"
+desc = "The input value"
+pad = 0
+
+[[variables.input]]
+name = "r0"
+type = "Byte"
+desc = "The first amount of rotation, low nibble"
+pad = 0
+
+[[variables.input]]
+name = "r1"
+type = "Byte"
+desc = "The second amount of rotation, low nibble"
+pad = 0
+
+[[variables.input]]
+name = "r2"
+type = "Byte"
+desc = "The third amount of rotation, low nibble"
+pad = 0
+
+[[variables.input]]
+name = "last_rot"
+type = "Bit"
+desc = "Whether the rotation by `r2` is a rotation (1) or just a shift (0)"
+pad = 0
+
+[[variables.output]]
+name = "out"
+type = "WordBL"
+desc = "The output"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a0_left"
+type = "WordHL"
+desc = "`a << (16 - r0)`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a0_right"
+type = "WordHL"
+desc = "`a >> r0`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a1_left"
+type = "WordHL"
+desc = "`a0 << r1`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a1_right"
+type = "WordHL"
+desc = "`a0 >> (16 - r1)`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a2_left"
+type = "WordHL"
+desc = "`a << (16 - r2)`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a2_right"
+type = "WordHL"
+desc = "`a >> r2`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a0"
+type = "WordBL"
+desc = "`a >>> (16 + r0)`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a1"
+type = "WordBL"
+desc = "`a >>> (16 + r0 - r1)` (which is `a0 <<< r1`)"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a2"
+type = "WordBL"
+desc = "`a >>> r2` or `a >> r2`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a01"
+type = "WordBL"
+desc = "$a_0 xor a_1$"
+pad = 0
+
+[[variables.multiplicity]]
+name = "μ"
+type = "BaseField"
+desc = ""
+pad = 0
+
+
+[[assumptions]]
+desc = "$#`r0`, #`r1`, #`r2` in [1, 15]$"
+
+
+[[constraint_groups]]
+name = "shift"
+
+[[constraints.shift]]
+kind = "interaction"
+tag = "HWSL"
+input = [["idx", "a", "i"], ["-", 16, "r0"]]
+output = ["arr", ["idx", "a0_left", "i"], ["idx", "a0_right", "i"]]
+iter = ["i", 0, 1]
+multiplicity = "μ"
+
+[[constraints.shift]]
+kind = "interaction"
+tag = "HWSL"
+input = [["idx", ["cast", "a0", "WordHL"], "i"], "r1"]
+output = ["arr", ["idx", "a1_left", "i"], ["idx", "a1_right", "i"]]
+iter = ["i", 0, 1]
+multiplicity = "μ"
+
+[[constraints.shift]]
+kind = "interaction"
+tag = "HWSL"
+input = [["idx", "a", "i"], ["-", 16, "r2"]]
+output = ["arr", ["idx", "a2_left", "i"], ["idx", "a2_right", "i"]]
+iter = ["i", 0, 1]
+multiplicity = "μ"
+
+[[constraints.shift]]
+kind = "arith"
+constraint = "$#`a0[i]` = #`a0_left[i]` + #`a0_right[1 - i]`$"
+poly = ["-", ["idx", ["cast", "a0", "WordHL"], "i"], ["idx", "a0_left", "i"], ["idx", "a0_right", ["-", 1, "i"]]]
+iter = ["i", 0, 1]
+
+[[constraints.shift]]
+kind = "arith"
+constraint = "$#`a1[i]` = #`a1_left[i]` + #`a1_right[1 - i]`$"
+poly = ["-", ["idx", ["cast", "a1", "WordHL"], "i"], ["idx", "a1_left", "i"], ["idx", "a1_right", ["-", 1, "i"]]]
+iter = ["i", 0, 1]
+
+[[constraints.shift]]
+kind = "arith"
+constraint = "$#`a2[0]` = #`a2_left[1]` + #`a2_right[0]`$"
+poly = ["-", ["idx", ["cast", "a2", "WordHL"], 0], ["idx", "a2_left", 1], ["idx", "a2_right", 0]]
+
+[[constraints.shift]]
+kind = "arith"
+constraint = "$#`a2[1]` = #`last_rot` dot #`a2_left[0]` + #`a2_right[1]`$"
+poly = ["-", ["idx", ["cast", "a2", "WordHL"], 1], ["*", "last_rot", ["idx", "a2_left", 0]], ["idx", "a2_right", 1]]
+
+[[constraint_groups]]
+name = "xor"
+
+[[constraints.xor]]
+kind = "interaction"
+tag = "XOR_BYTE"
+input = [["idx", "a0", "i"], ["idx", "a1", "i"]]
+output = ["idx", "a01", "i"]
+multiplicity = "μ"
+iter = ["i", 0, 3]
+
+[[constraints.xor]]
+kind = "interaction"
+tag = "XOR_BYTE"
+input = [["idx", "a01", "i"], ["idx", "a2", "i"]]
+output = ["idx", "out", "i"]
+multiplicity = "μ"
+iter = ["i", 0, 3]
+
+[[constraint_groups]]
+name = "output"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "ROTXOR"
+input = [["cast", "a", "Word"], "r0", "r1", "r2", "last_rot"]
+output = ["cast", "out", "Word"]
+multiplicity = ["-", "μ"]
diff --git a/spec/src/sha256.toml b/spec/src/sha256.toml
new file mode 100644
index 000000000..4cd4de9ba
--- /dev/null
+++ b/spec/src/sha256.toml
@@ -0,0 +1,270 @@
+name = "SHA256"
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "Timestamp at which the ECALL is invoked. Used as unique identifier for this invocation."
+pad = 0
+
+[[variables.input]]
+name = "h"
+type = ["Byte", 32]
+desc = "The state of the hash function."
+pad = 0
+
+[[variables.input]]
+name = "h_addr"
+type = ["DWordHL", 4]
+desc = "The addresses of the doublewords of `h`"
+pad = ["arr", 0, 8, 16, 24]
+
+[[variables.input]]
+name = "m"
+type = ["Byte", 64]
+desc = "The input chunk."
+pad = 0
+
+[[variables.input]]
+name = "m_addr"
+type = ["DWordHL", 8]
+desc = "The addresses of the doublewords of `m`"
+pad = ["arr", 0, 8, 16, 24, 32, 40, 48, 56]
+
+[[variables.output]]
+name = "out"
+type = ["Byte", 32]
+desc = "The new state."
+pad = 0
+
+[[variables.auxiliary]]
+name = "last_round_out"
+type = ["Word", 8]
+desc = "The output from the last compression round"
+pad = 0
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+pad = 0
+
+
+[[constraint_groups]]
+name = "memory"
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["cast", ["*", 2, 11], "DWordWL"], ["arr", ["idx", ["cast", ["idx", "m_addr", 0], "DWordWL"], 0], ["idx", ["cast", ["idx", "m_addr", 0], "DWordWL"], 1], 0, 0, 0, 0, 0, 0], "timestamp", 1, 0, 0]
+output = ["arr", ["idx", ["cast", ["idx", "m_addr", 0], "DWordWL"], 0], ["idx", ["cast", ["idx", "m_addr", 0], "DWordWL"], 1], 0, 0, 0, 0, 0, 0]
+multiplicity = "μ"
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", ["idx", "m_addr", "i"], "j"]]
+multiplicity = "μ"
+iters = [["i", 0, 7], ["j", 0, 3]]
+
+[[constraints.memory]]
+kind = "template"
+tag = "ADD"
+input = [["cast", ["idx", "m_addr", 0], "DWordWL"], ["cast", ["*", 8, "i"], "DWordWL"]]
+output = ["cast", ["idx", "m_addr", "i"], "DWordWL"]
+iter = ["i", 1, 7]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "MEMW"
+input = [0, ["cast", ["idx", "m_addr", "i"], "DWordWL"], ["arr",
+    ["idx", "m", ["+", ["*", 8, "i"], 3]],
+    ["idx", "m", ["+", ["*", 8, "i"], 2]],
+    ["idx", "m", ["+", ["*", 8, "i"], 1]],
+    ["idx", "m", ["+", ["*", 8, "i"], 0]],
+    ["idx", "m", ["+", ["*", 8, "i"], 7]],
+    ["idx", "m", ["+", ["*", 8, "i"], 6]],
+    ["idx", "m", ["+", ["*", 8, "i"], 5]],
+    ["idx", "m", ["+", ["*", 8, "i"], 4]]],
+  "timestamp", 0, 0, 1]
+output = ["arr",
+    ["idx", "m", ["+", ["*", 8, "i"], 3]],
+    ["idx", "m", ["+", ["*", 8, "i"], 2]],
+    ["idx", "m", ["+", ["*", 8, "i"], 1]],
+    ["idx", "m", ["+", ["*", 8, "i"], 0]],
+    ["idx", "m", ["+", ["*", 8, "i"], 7]],
+    ["idx", "m", ["+", ["*", 8, "i"], 6]],
+    ["idx", "m", ["+", ["*", 8, "i"], 5]],
+    ["idx", "m", ["+", ["*", 8, "i"], 4]]]
+multiplicity = "μ"
+iter = ["i", 0, 7]
+ref = "sha256:c:read_chunk"
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["cast", ["*", 2, 10], "DWordWL"], ["arr", ["idx", ["cast", ["idx", "h_addr", 0], "DWordWL"], 0], ["idx", ["cast", ["idx", "h_addr", 0], "DWordWL"], 1], 0, 0, 0, 0, 0, 0], "timestamp", 1, 0, 0]
+output = ["arr", ["idx", ["cast", ["idx", "h_addr", 0], "DWordWL"], 0], ["idx", ["cast", ["idx", "h_addr", 0], "DWordWL"], 1], 0, 0, 0, 0, 0, 0]
+multiplicity = "μ"
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", ["idx", "h_addr", "i"], "j"]]
+multiplicity = "μ"
+iters = [["i", 0, 3], ["j", 0, 3]]
+
+[[constraints.memory]]
+kind = "template"
+tag = "ADD"
+input = [["cast", ["idx", "h_addr", 0], "DWordWL"], ["cast", ["*", 8, "i"], "DWordWL"]]
+output = ["cast", ["idx", "h_addr", "i"], "DWordWL"]
+iter = ["i", 1, 3]
+
+[[constraints.memory]]
+kind = "interaction"
+tag = "MEMW"
+input = [
+  0,
+  ["cast", ["idx", "h_addr", "i"], "DWordWL"],
+  ["arr",
+      ["idx", "out", ["+", ["*", 8, "i"], 3]],
+      ["idx", "out", ["+", ["*", 8, "i"], 2]],
+      ["idx", "out", ["+", ["*", 8, "i"], 1]],
+      ["idx", "out", ["+", ["*", 8, "i"], 0]],
+      ["idx", "out", ["+", ["*", 8, "i"], 7]],
+      ["idx", "out", ["+", ["*", 8, "i"], 6]],
+      ["idx", "out", ["+", ["*", 8, "i"], 5]],
+      ["idx", "out", ["+", ["*", 8, "i"], 4]]],
+  ["+", "timestamp", ["cast", 1, "DWordWL"]],
+  0, 0, 1
+]
+output = ["arr",
+  ["idx", "h", ["+", ["*", 8, "i"], 3]],
+  ["idx", "h", ["+", ["*", 8, "i"], 2]],
+  ["idx", "h", ["+", ["*", 8, "i"], 1]],
+  ["idx", "h", ["+", ["*", 8, "i"], 0]],
+  ["idx", "h", ["+", ["*", 8, "i"], 7]],
+  ["idx", "h", ["+", ["*", 8, "i"], 6]],
+  ["idx", "h", ["+", ["*", 8, "i"], 5]],
+  ["idx", "h", ["+", ["*", 8, "i"], 4]],
+]
+iter = ["i", 0, 3]
+multiplicity = "μ"
+ref = "sha256:c:read_state"
+
+[[constraint_groups]]
+name = "sched"
+
+[[constraints.sched]]
+kind = "interaction"
+tag = "SHA256_M"
+input = ["timestamp", "i"]
+output = ["+",
+  ["*", ["^", 2, 0], ["idx", "m", ["+", ["*", 4, "i"], 3]]],
+  ["*", ["^", 2, 8], ["idx", "m", ["+", ["*", 4, "i"], 2]]],
+  ["*", ["^", 2, 16], ["idx", "m", ["+", ["*", 4, "i"], 1]]],
+  ["*", ["^", 2, 24], ["idx", "m", ["+", ["*", 4, "i"], 0]]],
+]
+multiplicity = ["*", -2, "μ"]
+iter = ["i", 0, 0]
+
+[[constraints.sched]]
+kind = "interaction"
+tag = "SHA256_M"
+input = ["timestamp", "i"]
+output = ["+",
+  ["*", ["^", 2, 0], ["idx", "m", ["+", ["*", 4, "i"], 3]]],
+  ["*", ["^", 2, 8], ["idx", "m", ["+", ["*", 4, "i"], 2]]],
+  ["*", ["^", 2, 16], ["idx", "m", ["+", ["*", 4, "i"], 1]]],
+  ["*", ["^", 2, 24], ["idx", "m", ["+", ["*", 4, "i"], 0]]],
+]
+multiplicity = ["*", -3, "μ"]
+iter = ["i", 1, 8]
+
+[[constraints.sched]]
+kind = "interaction"
+tag = "SHA256_M"
+input = ["timestamp", "i"]
+output = ["+",
+  ["*", ["^", 2, 0], ["idx", "m", ["+", ["*", 4, "i"], 3]]],
+  ["*", ["^", 2, 8], ["idx", "m", ["+", ["*", 4, "i"], 2]]],
+  ["*", ["^", 2, 16], ["idx", "m", ["+", ["*", 4, "i"], 1]]],
+  ["*", ["^", 2, 24], ["idx", "m", ["+", ["*", 4, "i"], 0]]],
+]
+multiplicity = ["*", -4, "μ"]
+iter = ["i", 9, 13]
+
+[[constraints.sched]]
+kind = "interaction"
+tag = "SHA256_M"
+input = ["timestamp", "i"]
+output = ["+",
+  ["*", ["^", 2, 0], ["idx", "m", ["+", ["*", 4, "i"], 3]]],
+  ["*", ["^", 2, 8], ["idx", "m", ["+", ["*", 4, "i"], 2]]],
+  ["*", ["^", 2, 16], ["idx", "m", ["+", ["*", 4, "i"], 1]]],
+  ["*", ["^", 2, 24], ["idx", "m", ["+", ["*", 4, "i"], 0]]],
+]
+multiplicity = ["*", -5, "μ"]
+iter = ["i", 14, 15]
+
+[[constraint_groups]]
+name = "compress"
+
+[[constraints.compress]]
+kind = "interaction"
+tag = "SHA256ROUND"
+input = ["timestamp", ["arr",
+  ["+", ["*", ["^", 2, 0], ["idx", "h", 3]], ["*", ["^", 2, 8], ["idx", "h", 2]], ["*", ["^", 2, 16], ["idx", "h", 1]], ["*", ["^", 2, 24], ["idx", "h", 0]]],
+  ["+", ["*", ["^", 2, 0], ["idx", "h", 7]], ["*", ["^", 2, 8], ["idx", "h", 6]], ["*", ["^", 2, 16], ["idx", "h", 5]], ["*", ["^", 2, 24], ["idx", "h", 4]]],
+  ["+", ["*", ["^", 2, 0], ["idx", "h", 11]], ["*", ["^", 2, 8], ["idx", "h", 10]], ["*", ["^", 2, 16], ["idx", "h", 9]], ["*", ["^", 2, 24], ["idx", "h", 8]]],
+  ["+", ["*", ["^", 2, 0], ["idx", "h", 15]], ["*", ["^", 2, 8], ["idx", "h", 14]], ["*", ["^", 2, 16], ["idx", "h", 13]], ["*", ["^", 2, 24], ["idx", "h", 12]]],
+  ["+", ["*", ["^", 2, 0], ["idx", "h", 19]], ["*", ["^", 2, 8], ["idx", "h", 18]], ["*", ["^", 2, 16], ["idx", "h", 17]], ["*", ["^", 2, 24], ["idx", "h", 16]]],
+  ["+", ["*", ["^", 2, 0], ["idx", "h", 23]], ["*", ["^", 2, 8], ["idx", "h", 22]], ["*", ["^", 2, 16], ["idx", "h", 21]], ["*", ["^", 2, 24], ["idx", "h", 20]]],
+  ["+", ["*", ["^", 2, 0], ["idx", "h", 27]], ["*", ["^", 2, 8], ["idx", "h", 26]], ["*", ["^", 2, 16], ["idx", "h", 25]], ["*", ["^", 2, 24], ["idx", "h", 24]]],
+  ["+", ["*", ["^", 2, 0], ["idx", "h", 31]], ["*", ["^", 2, 8], ["idx", "h", 30]], ["*", ["^", 2, 16], ["idx", "h", 29]], ["*", ["^", 2, 24], ["idx", "h", 28]]],
+], 0]
+multiplicity = "μ"
+
+[[constraints.compress]]
+kind = "interaction"
+tag = "SHA256ROUND"
+input = ["timestamp", "last_round_out", 64]
+multiplicity = ["-", "μ"]
+
+[[constraints.compress]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = [["idx", "out", "i"]]
+multiplicity = "μ"
+iter = ["i", 0, 31]
+
+[[constraints.compress]]
+kind = "template"
+tag = "ADD"
+input = [["arr", 0, ["idx", "last_round_out", "i"]], ["arr", 0, ["+",
+  ["*", ["^", 2, 0], ["idx", "h", ["+", ["*", 4, "i"], 3]]],
+  ["*", ["^", 2, 8], ["idx", "h", ["+", ["*", 4, "i"], 2]]],
+  ["*", ["^", 2, 16], ["idx", "h", ["+", ["*", 4, "i"], 1]]],
+  ["*", ["^", 2, 24], ["idx", "h", ["+", ["*", 4, "i"], 0]]],
+]]]
+output = ["arr", 0, ["+",
+  ["*", ["^", 2, 0], ["idx", "out", ["+", ["*", 4, "i"], 3]]],
+  ["*", ["^", 2, 8], ["idx", "out", ["+", ["*", 4, "i"], 2]]],
+  ["*", ["^", 2, 16], ["idx", "out", ["+", ["*", 4, "i"], 1]]],
+  ["*", ["^", 2, 24], ["idx", "out", ["+", ["*", 4, "i"], 0]]],
+]]
+iter = ["i", 0, 7]
+
+[[constraint_groups]]
+name = "lookup"
+
+[[constraints.lookup]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ"]
+
+[[constraints.lookup]]
+kind = "interaction"
+tag = "ECALL"
+input = ["timestamp", ["arr", ["-", ["^", 2, 32], 1], ["-", ["^", 2, 32], 1]]]
+multiplicity = ["-", "μ"]
diff --git a/spec/src/sha256consts.toml b/spec/src/sha256consts.toml
new file mode 100644
index 000000000..17fe6fb0f
--- /dev/null
+++ b/spec/src/sha256consts.toml
@@ -0,0 +1,28 @@
+name = "SHA256_K"
+
+[[variables.input]]
+name = "index"
+type = "BaseField"
+desc = ""
+precomputed = true
+
+[[variables.input]]
+name = "K"
+type = "Word"
+desc = ""
+precomputed = true
+
+[[variables.multiplicity]]
+name = "μ"
+type = "BaseField"
+desc = ""
+
+[[constraint_groups]]
+name = "contributions"
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "SHA256_K"
+input = ["index"]
+output = "K"
+multiplicity = ["-", "μ"]
diff --git a/spec/src/sha256msgsched.toml b/spec/src/sha256msgsched.toml
new file mode 100644
index 000000000..79664a797
--- /dev/null
+++ b/spec/src/sha256msgsched.toml
@@ -0,0 +1,163 @@
+name = "SHA256MSGSCHED"
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "The timestamp/identifier for this execution of the message schedule"
+pad = 0
+
+[[variables.input]]
+name = "index"
+type = "BaseField"
+desc = "The index of the output word"
+pad = 0
+
+[[variables.input]]
+name = "amount"
+type = "BaseField"
+desc = "The multiplicity with which to output the resulting word"
+pad = 0
+
+[[variables.output]]
+name = "out"
+type = "WordHL"
+desc = "The output, `w[index]`"
+
+[[variables.auxiliary]]
+name = "back2"
+type = "Word"
+desc = "`w[index - 2]`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "back7"
+type = "Word"
+desc = "`w[index - 7]`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "back15"
+type = "Word"
+desc = "`w[index - 15]`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "back16"
+type = "Word"
+desc = "`w[index - 16]`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "s0"
+type = "Word"
+desc = "$#`back15` >>> 7 xor #`back15` >>> 18 xor #`back15` >> 3$"
+pad = 0
+
+[[variables.auxiliary]]
+name = "s1"
+type = "Word"
+desc = "$#`back2` >>> 17 xor #`back2` >>> 19 xor #`back2` >> 10$"
+pad = 0
+
+[[variables.virtual]]
+name = "carry"
+type = "Byte"
+desc = "The carry of computing `out`"
+def = ["*", ["^", 2, -32], ["-", ["+", "back16", "s0", "back7", "s1"], ["cast", "out", "Word"]]]
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+pad = 0
+
+[[assumptions]]
+desc = "#`IS_WORD[SHA256_M[timestamp, i]]` for $0 <= i < #`index`$"
+
+
+[[constraint_groups]]
+name = "lookback"
+
+[[constraints.lookback]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = [["-", "index", 16]]
+multiplicity = "μ"
+
+[[constraints.lookback]]
+kind = "interaction"
+tag = "SHA256_M"
+input = ["timestamp", ["-", "index", 2]]
+output = "back2"
+multiplicity = "μ"
+
+[[constraints.lookback]]
+kind = "interaction"
+tag = "SHA256_M"
+input = ["timestamp", ["-", "index", 7]]
+output = "back7"
+multiplicity = "μ"
+
+[[constraints.lookback]]
+kind = "interaction"
+tag = "SHA256_M"
+input = ["timestamp", ["-", "index", 15]]
+output = "back15"
+multiplicity = "μ"
+
+[[constraints.lookback]]
+kind = "interaction"
+tag = "SHA256_M"
+input = ["timestamp", ["-", "index", 16]]
+output = "back16"
+multiplicity = "μ"
+
+[[constraint_groups]]
+name = "calc"
+
+[[constraints.calc]]
+kind = "interaction"
+tag = "ROTXOR"
+input = ["back15", 2, 11, 3, 0]
+output = "s0"
+multiplicity = "μ"
+
+[[constraints.calc]]
+kind = "interaction"
+tag = "ROTXOR"
+input = ["back2", 3, 2, 10, 0]
+output = "s1"
+multiplicity = "μ"
+
+[[constraints.calc]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = ["carry"]
+multiplicity = "μ"
+
+[[constraints.calc]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "out", "i"]]
+iter = ["i", 0, 1]
+multiplicity = "μ"
+
+[[constraint_groups]]
+name = "output"
+
+[[constraints.output]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ"]
+
+[[constraints.output]]
+kind = "arith"
+constraint = "$#`μ` = 0 => #`amount` = 0$"
+poly = ["*", ["not", "μ"], "amount"]
+
+[[constraints.output]]
+kind = "interaction"
+tag = "SHA256_M"
+input = ["timestamp", "index"]
+output = ["cast", "out", "Word"]
+multiplicity = ["-", "amount"]
diff --git a/spec/src/sha256round.toml b/spec/src/sha256round.toml
new file mode 100644
index 000000000..8ec93ea36
--- /dev/null
+++ b/spec/src/sha256round.toml
@@ -0,0 +1,288 @@
+name = "SHA256ROUND"
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "The timestamp/identifier for this execution of the round function"
+pad = 0
+
+[[variables.input]]
+name = "a"
+type = "WordBL"
+desc = "State element"
+pad = 0
+
+[[variables.input]]
+name = "b"
+type = "WordBL"
+desc = "State element"
+pad = 0
+
+[[variables.input]]
+name = "c"
+type = "WordBL"
+desc = "State element"
+pad = 0
+
+[[variables.input]]
+name = "d"
+type = "Word"
+desc = "State element"
+pad = 0
+
+[[variables.input]]
+name = "e"
+type = "WordBL"
+desc = "State element"
+pad = 0
+
+[[variables.input]]
+name = "f"
+type = "WordBL"
+desc = "State element"
+pad = 0
+
+[[variables.input]]
+name = "g"
+type = "WordBL"
+desc = "State element"
+pad = 0
+
+[[variables.input]]
+name = "h"
+type = "Word"
+desc = "State element"
+pad = 0
+
+[[variables.input]]
+name = "index"
+type = "BaseField"
+desc = "The round number/index"
+pad = 0
+
+[[variables.output]]
+name = "out_a"
+type = "WordHL"
+desc = "$#`temp1` + #`temp2`$"
+pad = 0
+
+[[variables.output]]
+name = "out_e"
+type = "WordHL"
+desc = "$#`d` + #`temp1`$"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a_and_b"
+type = "WordBL"
+desc = "$#`a` class(\"binary\", amp) #`b`$. Part of `maj`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "a_xor_b"
+type = "WordBL"
+desc = "$#`a` xor #`b`$. Part of `maj`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "c_and_a_xor_b"
+type = "WordBL"
+desc = "$#`c` class(\"binary\", amp) (#`a` xor #`b`)$. Part of `maj`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "e_and_f"
+type = "WordBL"
+desc = "$#`e` class(\"binary\", amp) #`f`$. Part of `ch`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "not_e_and_g"
+type = "WordBL"
+desc = "$(not #`e`) class(\"binary\", amp) #`g`$. Part of `ch`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "kval"
+type = "Word"
+desc = "`k[index]`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "S0"
+type = "Word"
+desc = "Transformation of `a`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "S1"
+type = "Word"
+desc = "Transformation of `e`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "wval"
+type = "Word"
+desc = "`w[index]`"
+pad = 0
+
+[[variables.virtual]]
+name = "carry_a"
+type = "Byte"
+desc = "The carry from `out_a`"
+def = ["*", ["^", 2, -32], ["-", ["+", "temp1", "temp2"], ["cast", "out_a", "Word"]]]
+
+[[variables.virtual]]
+name = "carry_e"
+type = "Byte"
+desc = "The carry from `out_e`"
+def = ["*", ["^", 2, -32], ["-", ["+", "d", "temp1"], ["cast", "out_e", "Word"]]]
+
+[[variables.virtual]]
+name = "ch"
+type = "Word"
+desc = "ch value"
+def = ["+", ["cast", "e_and_f", "Word"], ["cast", "not_e_and_g", "Word"]]
+
+[[variables.virtual]]
+name = "maj"
+type = "Word"
+desc = "maj value"
+def = ["+", ["cast", "a_and_b", "Word"], ["cast", "c_and_a_xor_b", "Word"]]
+
+[[variables.virtual]]
+name = "temp1"
+type = "BaseField"
+desc = "`temp1` value"
+def = ["+", "h", "S1", "ch", "kval", "wval"]
+
+[[variables.virtual]]
+name = "temp2"
+type = "BaseField"
+desc = "`temp2` value"
+def = ["+", "S0", "maj"]
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+pad = 0
+
+[[assumptions]]
+desc = "All state values are valid words"
+
+
+[[constraint_groups]]
+name = "value"
+
+[[constraints.value]]
+kind = "interaction"
+tag = "AND_BYTE"
+input = [["idx", "a", "i"], ["idx", "b", "i"]]
+output = ["idx", "a_and_b", "i"]
+multiplicity = "μ"
+iter = ["i", 0, 3]
+
+[[constraints.value]]
+kind = "interaction"
+tag = "XOR_BYTE"
+input = [["idx", "a", "i"], ["idx", "b", "i"]]
+output = ["idx", "a_xor_b", "i"]
+multiplicity = "μ"
+iter = ["i", 0, 3]
+
+[[constraints.value]]
+kind = "interaction"
+tag = "AND_BYTE"
+input = [["idx", "c", "i"], ["idx", "a_xor_b", "i"]]
+output = ["idx", "c_and_a_xor_b", "i"]
+multiplicity = "μ"
+iter = ["i", 0, 3]
+
+[[constraints.value]]
+kind = "interaction"
+tag = "AND_BYTE"
+input = [["idx", "e", "i"], ["idx", "f", "i"]]
+output = ["idx", "e_and_f", "i"]
+multiplicity = "μ"
+iter = ["i", 0, 3]
+
+[[constraints.value]]
+kind = "interaction"
+tag = "AND_BYTE"
+input = [["-", 255, ["idx", "e", "i"]], ["idx", "g", "i"]]
+output = ["idx", "not_e_and_g", "i"]
+multiplicity = "μ"
+iter = ["i", 0, 3]
+
+[[constraints.value]]
+kind = "interaction"
+tag = "SHA256_K"
+input = ["index"]
+output = "kval"
+multiplicity = "μ"
+
+[[constraints.value]]
+kind = "interaction"
+tag = "SHA256_M"
+input = ["timestamp", "index"]
+output = "wval"
+multiplicity = "μ"
+
+[[constraints.value]]
+kind = "interaction"
+tag = "ROTXOR"
+input = [["cast", "a", "Word"], 6, 9, 2, 1]
+output = "S0"
+multiplicity = "μ"
+
+[[constraints.value]]
+kind = "interaction"
+tag = "ROTXOR"
+input = [["cast", "e", "Word"], 9, 14, 6, 1]
+output = "S1"
+multiplicity = "μ"
+
+[[constraint_groups]]
+name = "addition"
+
+[[constraints.addition]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "out_a", "i"]]
+multiplicity = "μ"
+iter = ["i", 0, 1]
+
+[[constraints.addition]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = ["carry_a"]
+multiplicity = "μ"
+
+[[constraints.addition]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "out_e", "i"]]
+multiplicity = "μ"
+iter = ["i", 0, 1]
+
+[[constraints.addition]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = ["carry_e"]
+multiplicity = "μ"
+
+[[constraint_groups]]
+name = "output"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "SHA256ROUND"
+input = ["timestamp", ["arr", ["cast", "a", "Word"], ["cast", "b", "Word"], ["cast", "c", "Word"], "d", ["cast", "e", "Word"], ["cast", "f", "Word"], ["cast", "g", "Word"], "h"], "index"]
+multiplicity = ["-", "μ"]
+
+[[constraints.output]]
+kind = "interaction"
+tag = "SHA256ROUND"
+input = ["timestamp", ["arr", ["cast", "out_a", "Word"], ["cast", "a", "Word"], ["cast", "b", "Word"], ["cast", "c", "Word"], ["cast", "out_e", "Word"], ["cast", "e", "Word"], ["cast", "f", "Word"], ["cast", "g", "Word"]], ["+", "index", 1]]
+multiplicity = "μ"
diff --git a/spec/src/signatures.toml b/spec/src/signatures.toml
index 17ecd3933..33f97cebf 100644
--- a/spec/src/signatures.toml
+++ b/spec/src/signatures.toml
@@ -187,3 +187,27 @@ output = ["Half", 2]
 tag = "memory"
 kind = "interaction"
 input = ["Bit", "DWordWL", "DWordWL", "BaseField"]
+
+# SHA256 things
+[[signatures]]
+tag = "SHA256_K"
+kind = "interaction"
+input = ["BaseField"]
+output = "Word"
+
+[[signatures]]
+tag = "SHA256_M"
+kind = "interaction"
+input = ["DWordWL", "BaseField"]
+output = "Word"
+
+[[signatures]]
+tag = "SHA256ROUND"
+kind = "interaction"
+input = ["DWordWL", ["Word", 8], "BaseField"]
+
+[[signatures]]
+tag = "ROTXOR"
+kind = "interaction"
+input = ["Word", "Byte", "Byte", "Byte", "Bit"]
+output = "Word"

From 435c1166d19261205e8f2aa9b969569b1ee3ae9c Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Tue, 14 Apr 2026 10:56:28 -0300
Subject: [PATCH 090/105] update_md

---
 docs/spec/add.md        |   2 +-
 docs/spec/bitwise.md    |   4 +-
 docs/spec/branch.md     |   4 +-
 docs/spec/cpu.md        |   4 +-
 docs/spec/decode.md     |   4 +-
 docs/spec/dvrm.md       |   4 +-
 docs/spec/ecall.md      | 164 +---------------------
 docs/spec/is_bit.md     |   6 +-
 docs/spec/load.md       |   4 +-
 docs/spec/lt.md         |   4 +-
 docs/spec/memw.md       |  76 ++++++-----
 docs/spec/mul.md        |   8 +-
 docs/spec/neg.md        |   6 +-
 docs/spec/shift.md      |   4 +-
 docs/spec/sign.md       |   2 +-
 docs/spec/signatures.md |   4 +-
 docs/spec/spec_full.md  | 296 ++++++++++------------------------------
 17 files changed, 140 insertions(+), 456 deletions(-)

diff --git a/docs/spec/add.md b/docs/spec/add.md
index f1f2a3191..a021bf7c9 100644
--- a/docs/spec/add.md
+++ b/docs/spec/add.md
@@ -4,7 +4,7 @@ For ease of notation, we moreover introduce the  constraint template $
 
 $ in both conditional and unconditional versions. It constrains that ``diff` equiv `lhs` - `rhs` (mod 2^64)` when the expression `cond` is non-zero.
 
-= Variables
+= Variables This template introduces  interaction(s).
 
 = Assumptions
 
diff --git a/docs/spec/bitwise.md b/docs/spec/bitwise.md
index 91f2b127c..53bea4b90 100644
--- a/docs/spec/bitwise.md
+++ b/docs/spec/bitwise.md
@@ -2,7 +2,7 @@
 
 The  chips deal with precomputed lookup tables for bitwise boolean operations and convenience functionalities over small domains.
 
-= Columns
+= Variables
 
 The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
 
@@ -10,7 +10,7 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 
 = Lookup This chip adds the following interactions to the lookup:
 
-= Areas of Optimization The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
+= Notes/Optimizations The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicitly range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
 
 ## Columns
 
diff --git a/docs/spec/branch.md b/docs/spec/branch.md
index a7605e329..c68c2c4a2 100644
--- a/docs/spec/branch.md
+++ b/docs/spec/branch.md
@@ -2,9 +2,9 @@
 
 The  chip computes the target address of a branching instruction.
 
-= Columns
+= Variables
 
-The `BRANCH` chip is comprised of  variables that are expressed using  columns:
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 = Assumptions
 
diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index 04fb1045e..d203e1e1e 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -2,9 +2,9 @@
 
 The  chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations. It bases its decisions on the entry of the `DECODE` table ([decode]) corresponding the the current program counter (PC).
 
-= Columns
+= Variables
 
-The `CPU` chip is comprised of  variables that are expressed using  columns:
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 = Assumptions
 
diff --git a/docs/spec/decode.md b/docs/spec/decode.md
index 188598ce2..8112035ff 100644
--- a/docs/spec/decode.md
+++ b/docs/spec/decode.md
@@ -2,7 +2,7 @@
 
 All `RV64IMC` instruction are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table, can be derived from the `packed_decode` variable provided below.
 
-= Columns
+= Variables
 
 The  table is comprised of  variables that are expressed using  columns:
 
@@ -28,7 +28,7 @@ super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
 
 show figure: set block(breakable: true)
 
-figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), ), caption: [Decoding table] }
+figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), )) }
 
 // OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
 
diff --git a/docs/spec/dvrm.md b/docs/spec/dvrm.md
index 2fde0208a..7ec791c08 100644
--- a/docs/spec/dvrm.md
+++ b/docs/spec/dvrm.md
@@ -2,9 +2,9 @@
 
 The  chip provides division and remainder functionality, both signed and unsigned.
 
-= Columns
+= Variables
 
-The `DVRM` chip is comprised of  variables that are expressed using  columns:
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 = Assumptions
 
diff --git a/docs/spec/ecall.md b/docs/spec/ecall.md
index 62ebb0b7b..934c2e75f 100644
--- a/docs/spec/ecall.md
+++ b/docs/spec/ecall.md
@@ -1,163 +1 @@
-# ECALL Chips
-
-ECALLs provide system-level functionalities to the guest program.
-
-When `ECALL` is executed, it is assumed that: - register `A7` contains the system call number
-
-- the arguments are located in registers `A0`-`A6`, and - the return value is written to `A0`, where `A0`-`A7` are symbolic names for the registers `x10`-`x17`
-
-=  chip
-
-## Columns
-
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `timestamp` | `DWordWL` | timestamp at which to halt the program |
-
-The  chip leverages  variable, spanning  columns:
-
-## Assumptions
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `HALT-A1.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
-
-It is assumed the input is range checked:
-
-## Constraints
-
-The  chip: + makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), + writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and + sets `pc` equal to `1` ([halt:c:pc]). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`; the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `HALT-C1.i` | i ∈ [1, 9] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
-| `HALT-C2` |  | `MEMW[0::BaseField[8]; 1, (2 * 10)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
-| `HALT-C3.i` | i ∈ [11, 31] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
-| `HALT-C4` |  | `MEMW[1, (2 * 255)::DWordWL, ['arr', 1, 0, 0, 0, 0, 0, 0, 0], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
-
-[ Observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there. ])
-
-### Lookup
-
-In this VM, halting is considered equivalent to executing a `sys_exit`. Hence, this chip responds to `ECALL`s with system call number 93.
-
-The HALT chip therefore contributes the following interaction to the lookup-argument:
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `HALT-C5` | `ECALL[timestamp, 93::DWordWL]` | -1 |
-
-## Padding
-
-This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
-
-=  chip
-
-## Columns
-
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `timestamp` | `DWordWL` | timestamp at which to commit |
-
-### Auxiliary
-
-| Name | Type | Description |
-|------|------|-------------|
-| `index` | `BaseField` | Index of value being committed. |
-| `address` | `DWordWL` | Address of first byte to commit. |
-| `address_incr` | `DWordHL` | $`address` + 1$ |
-| `count` | `DWordWL` | number of bytes to commit |
-| `count_decr` | `DWordHL` | $`count` - 1$ |
-| `first` | `Bit` | Whether this is the first commitment in this sequence. |
-| `end` | `Bit` | Whether this is the end of the commitment sequence. |
-| `value` | `Byte` | Byte stored at `address`. |
-
-### Multiplicity
-
-| Name | Type | Description |
-|------|------|-------------|
-| `μ` | `Bit` |  |
-
-The  chip leverages  variables, spanning  columns:
-
-## Constraints
-
-In this VM, committing is considered equivalent to writing a value to `stdout`. Hence, this chip responds to `ECALL`s with system call number 64.
-
-Since we do not know how many bytes are to be committed, this chip employs a recursive design: each iteration commits one byte, and recursively "calls" itself to commit the remaining bytes. As such, only the call from the CPU to this chip (i.e., the `first` in the recursion tree) should accept the `ECALL`; later recursive calls should not. This is why [commit:c:receive_ecall] has multiplicity `-`first``.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `COMMIT-C1` | `ECALL[timestamp, 64::DWordWL]` | -first |
-
-The `write` operation --- writing to a file descriptor --- has the following signature:
-
-```c ssize_t write(size_t count; int fd, const void buf[count], size_t count); ```
-
-That is to say, - `A0` contains the file descriptor, - `A1` contains the address of `buf`'s first byte, - `A2` contains `count`, and - the written count should be written to `A0`.
-
-[commit:c:read_address] reads `address` from `x11` (=`A1`) and [commit:c:read_count] reads `count` from `x12` (=`A2`). Since we only support writing to `stdout` (which corresponds to ``fd` = 1`
-
-we assert that `x10` contains `1` in [commit:c:read_fd_write_count]. Note that this constraint _also_ writes `count` to `A0`; in this VM it is impossible for a commit to be interrupted or fail. Lastly, the `index` is read from `x254`; in the same operation, ``index` + `count`` is written back to this location by [commit:c:read_index]. This, too, leverages the fact that a commit will not be interrupted or fail to update the `index` for the next commit sequence. Again, each of these memory interactions only take place when this is the `first` call in the recursion tree.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `COMMIT-C2` | `MEMW[['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C3` | `MEMW[['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 12)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C4` | `MEMW[['arr', 1, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C5` | `MEMW[['arr', 'index', 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 254)::DWordWL, ['arr', ['+', 'index', ['cast', 'count', 'BaseField']], 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | first |
-
-*Note*: the observant reader will notice that [commit:c:read_index] casts `count` to a `BaseField`, potentiallly losing information. This is indeed correct. However, since it is practically impossible to commit more than `2^64-2^32` bytes in a single VM execution, it was decided to permit this.
-
-Next, we read the `value` located at buffer address `address` and commit to it under the given `index`. This is only performed when we have not yet reached the `end` of the commit sequence.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `COMMIT-C6` | `MEMW[['arr', 'value', 0, 0, 0, 0, 0, 0, 0]; 0, address, ['arr', 'value', 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | μ - end |
-| `COMMIT-C7` | `COMMIT[index, value]` | μ - end |
-
-In parallel, we compute ``address_incr` = `address` + 1` ([commit:c:address_incr]) as address of the next byte to commit, and ``count_decr` = `count` - 1` ([commit:c:count_decr]) as the number of bytes that still has to be committed after committing this byte. [commit:c:range_address_incr] and [commit:c:range_count_decr] are included to satisfy [add:a:sum] respectively [add:a:rhs].
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `COMMIT-C8` |  | `ADD<address_incr::DWordWL; address, 1::DWordWL>` |  |
-| `COMMIT-C9.i` | i ∈ [0, 3] | `IS_HALF[address_incr[i]]` | μ |
-| `COMMIT-C10` |  | `SUB<count_decr::DWordWL; count, 1::DWordWL>` |  |
-| `COMMIT-C11.i` | i ∈ [0, 3] | `IS_HALF[count_decr[i]]` | μ |
-
-When `count` hits `0`, we should stop performing further recursive calls. We use the `end` bit to indicate these circumstances.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `COMMIT-C12` | `ZERO[end; (65535 - count_decr[0]) + (65535 - count_decr[1]) + (65535 - count_decr[2]) + (65535 - count_decr[3])]` | μ |
-
-*Note*: + Rather than setting ``end` = 1` when ``count` = 0`, we do so when ``count_decr` = -1`. This technique allows `count` to be stored in a `DWordWL` rather than a `DWordHL`, saving two columns. + `forall i in [0, 3]: 65535 - `count_decr`_i >= 0` as a result of [commit:c:range_count_decr]. Hence, $ sum_(i=0)^3 65535 - `count_decr`_i = 0 arrow.l.r.double.long forall i in [0, 3]: `count_decr`_i = 65535 $
-
-When this was not the `end` byte to commit in this recursion sequence, we recursively _Commit the Next Byte_ (`CNB`), specifying the timestamp, address to continue reading and the number of bytes that should still be committed ([commit:c:send_commit_next_byte]). Since that certainly won't be the `first` call in the sequence, we read `address_incr` and `count_decr` from the previous recursion level into `address` and `count` and continue executing the commit.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `COMMIT-C13` | `CNB[timestamp, index + 1, address_incr::DWordWL, count_decr::DWordWL]` | μ - end |
-| `COMMIT-C14` | `CNB[timestamp, index, address, count]` | -(μ - first) |
-
-Lastly, we must make sure `first`, `end` and `μ` are bits ([commit:c:range_first], [commit:c:range_end], [commit:c:range_mu]), and that when either ``first` = 1` or ``end` = 1` imply that ``μ` = 1` ([commit:c:first_or_end_implies_mu]). These are required to ensure the multiplicities `-(`μ` - `first`)` and ``μ` - `end`` are binary.
-
-| Tag | Description |
-|-----|-------------|
-| `COMMIT-C15` | `IS_BIT<first>` |
-| `COMMIT-C16` | `IS_BIT<end>` |
-| `COMMIT-C17` | `IS_BIT<μ>` |
-| `COMMIT-C18` | `first` + `end` => `μ` = 1 |
-| | _polynomial:_ `(first + end) * (1 - μ) = 0` |
-
-## Padding
-
-To pad this chip, use the below data.
-
-## Notes/optimizations
-
-- The current version only supports writing to `stdout`. This chip could potentially be extended to support writing to arbitrary `fd`s - One might be able to replace [commit:c:end] by `end => count = 0`. While loosening the constraint (`count = 0 => end` is no longer enforced), this should not cause any problems: if the prover does not set `end` when `count=0`, they simply cannot complete the proof. First of all, one would have to recursively work through all `2^64` values of `count`, something that is practically infeasible. Moreover, if this is done with a sequence that originally has ``count` > 0`, one will inevitably have to read a memory address twice at the same timestamp, which is impossible to prove. In addition to dropping the `ZERO` lookup, this optimization might also permit moving `count_decr` from a `DWordHL` to a `DWordWL`, saving two columns. - Given that it is practically infeasible to commit more than ``p`-1 = 2^64-2^32` bytes in a program, it might suffice to store `count_decr` in a `BaseField`. Note that this would probably involve having an extra (virtual) column storing `count` in `BaseField` form as well. Moreover, one might need to add a lookup to `LT` to ensure ``count` <= `p`-1` when being read from memory at the beginning of each commitment sequence.
\ No newline at end of file
+# ECALL Chips
\ No newline at end of file
diff --git a/docs/spec/is_bit.md b/docs/spec/is_bit.md
index 389867944..eb0070036 100644
--- a/docs/spec/is_bit.md
+++ b/docs/spec/is_bit.md
@@ -2,13 +2,15 @@
 
 Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
 
-= Variables The  template operates on two variables: `cond` and `X`:
+= Variables The `IS_BIT` template operates on the following variables:
 
 = Constraints It takes only one constraint to enforce that `X` must be either `0` or `1` whenever ``cond` eq.not 0`:
 
 *Note*: - In case of _unconditional_ template application, `cond` can be dropped from the constraint, simplifying it to ``X` (1- `X`) = 0`. - As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression. This is to make sure that [isbit:c:isbit]'s expression has degree at most 3.
 
-= Proof of correctness If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When ``cond` eq.not 0`, it follows that the statement can only be proven when ``X` (1-`X`) equiv 0 mod p`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either ``X` equiv 0 mod p` or `1-`X` equiv 0 mod p`. Hence, it is proven that when ``cond` eq.not 0`, [isbit:c:isbit] is only satisfied if ``X` in {0, 1}`. 
+## Correctness argument
+
+If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When `cond ≠ 0`, it follows that the statement can only be proven when `X (1 - X) ≡ 0 (mod p)`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either `X ≡ 0 (mod p)` or `1 - X ≡ 0 (mod p)`. Hence, it is proven that when `cond ≠ 0`, [isbit:c:isbit] is only satisfied if `X ∈ {0, 1}`.
 
 ## Columns
 
diff --git a/docs/spec/load.md b/docs/spec/load.md
index 8f75cd39b..62fce1bf7 100644
--- a/docs/spec/load.md
+++ b/docs/spec/load.md
@@ -2,9 +2,9 @@
 
 The  chip provides functionality to read values from memory and sign-extend them where appropriate. It delegates low-level memory handling to the `MEMW` chip ([memw]).
 
-= Columns
+= Variables
 
-The `LOAD` chip is comprised of  variables that are expressed using  columns:
+The `LOAD` chip is comprised of variables that are expressed using columns, and leverages interaction(s):
 
 = Assumptions
 
diff --git a/docs/spec/lt.md b/docs/spec/lt.md
index 1eba8c181..7fd0d806f 100644
--- a/docs/spec/lt.md
+++ b/docs/spec/lt.md
@@ -2,9 +2,9 @@
 
 The  chip constrains an indicator bit for the less-than relation, signed or unsigned.
 
-= Columns
+= Variables
 
-The `LT` chip is comprised of  variables that are expressed using  columns:
+The `LT` chip is comprised of variables that are expressed using columns, and leverages interaction(s):
 
 = Assumptions We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
 
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
index d44fa38af..b3d47d788 100644
--- a/docs/spec/memw.md
+++ b/docs/spec/memw.md
@@ -2,9 +2,9 @@
 
 The  chip is used to read and write memory locations (both RAM and registers) in chunks of 1, 2, 4 or 8 values. It introduces the old value and last-accessed timestamps of memory addresses internally, in order to satisfy the design of the memory argument ([memory]).
 
-= Columns
+= Variables
 
-The `MEMW` chip is comprised of  variables that are expressed using  columns:
+The `MEMW` chip is comprised of variables that are expressed using columns, and leverages interaction(s):
 
 = Assumptions
 
@@ -59,7 +59,7 @@ When a memory access happens at an address with proper alignment for its access
 
 Further logic remains essentially the same, so we briefly present the relevant tables for this chip.
 
-The  chip only needs  variables, expressed through  columns.
+The  chip only needs  variables, expressed through  columns; it leverages  interactions.
 
 ## Padding
 
@@ -73,6 +73,41 @@ Note: as a result of hard optimization, this chip can only be used for register
 
 Note moreover that this chip does not guard against misaligned register access faults: to access register with a given `address`, one must provide `2 dot `address`` in the lookup.
 
+## Variables
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interactions:
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `MEMW-A2` |  | `IS_BIT<write2>` |
+| `MEMW-A3` |  | `IS_BIT<write4>` |
+| `MEMW-A4` |  | `IS_BIT<write8>` |
+| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
+| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+The following range checks are assumed to be performed/enforced outside of this chip:
+
+## Constraints
+
+Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the time. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that `old_timestamp[1] = timestamp[1]`; the general `MEMW` chip can be used for accesses where this is not the case.
+
+Verifying that ``timestamp` > `old_timestamp`` now simplifies to verifying that ``timestamp[0]` - `old_timestamp[0]` > 0`. For most accesses, this value will be small enough to fit in a `Half`. This chip thus enforces this by means of the following constraint:
+
+With ``old_timestamp`<`timestamp`` asserted, `old` is read from the register ([regw:c:read_old]) and `val` is written back ([regw:c:write_val]).
+
+This chip can either just write (``μ_write` = 1`), or both read and write (``μ_read` = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
+
+Lastly, this chip contributes the following interactions to the logup:
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+= Notes/optimizations The following ideas may prove to be optimizations for this chip: - `MEMB` chip that does a one-byte write to remove `old_timestamp` from here (uncertain tradeoffs) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes. - For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
+
 ## Columns
 
 ### Input
@@ -134,37 +169,4 @@ address_add := ['arr', ['-', ['+', ['idx', 'base_address', 0], 'i', 1], ['*', ['
 | Name | Type | Description |
 |------|------|-------------|
 | `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
-| `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
-
-The  chip is comprised of  variables that are expressed using  columns:
-
-## Assumptions
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
-| `MEMW-A2` |  | `IS_BIT<write2>` |
-| `MEMW-A3` |  | `IS_BIT<write4>` |
-| `MEMW-A4` |  | `IS_BIT<write8>` |
-| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
-| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
-
-The following range checks are assumed to be performed/enforced outside of this chip:
-
-## Constraints
-
-Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the times. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that ``old_timestamp[1]` = `timestamp[1]``;  can be used for accesses where this is not the case.
-
-Verifying that ``timestamp` > `old_timestamp`` now simplifies to verifying that ``timestamp[0]` - `old_timestamp[0]` > 0`. For most accesses, this value will be small enough to fit in a `Half`. This chip thus enforces this by means of the following constraint:
-
-With ``old_timestamp`<`timestamp`` asserted, `old` is read from the register ([regw:c:read_old]) and `val` is written back ([regw:c:write_val]).
-
-This chip can either just write (``μ_write` = 1`), or both read and write (``μ_read` = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
-
-Lastly, this chip contributes the following interactions to the logup:
-
-## Padding
-
-The table can be padded to the next power of two with the following value assignments:
-
-= Future optimization ideas - `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes. - For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
\ No newline at end of file
+| `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
\ No newline at end of file
diff --git a/docs/spec/mul.md b/docs/spec/mul.md
index 30f9a57a5..fa459cde8 100644
--- a/docs/spec/mul.md
+++ b/docs/spec/mul.md
@@ -2,9 +2,9 @@
 
 The  chip constrains multiplication, both signed and unsigned, as well as providing access to the low and high halfs of the multiplication result.
 
-= Columns
+= Variables
 
-The `MUL` chip is comprised of  variables that are expressed using  columns:
+The `MUL` chip is comprised of variables that are expressed using columns, and leverages interaction(s):
 
 `mat(delim: , top; bottom)` }
 
@@ -58,9 +58,7 @@ The  chip contributes the following to the lookup:
 
 The table can be padded to the next power of two with the following value assignments:
 
-= Notes - `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere.
-
-As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
+= Notes/optimizations - `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere. - As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
 
 ## Columns
 
diff --git a/docs/spec/neg.md b/docs/spec/neg.md
index fd638d975..bd7685bf8 100644
--- a/docs/spec/neg.md
+++ b/docs/spec/neg.md
@@ -2,12 +2,14 @@
 
 It requires `cond` to be a bit.
 
-= Variables
+= Variables This template introduces  interaction(s).
 
 = Assumptions
 
 = Constraints We constrain this equality using two constraints:
 
+## Correctness argument
+
 The constraints force the `carry` values to be fixed. Writing `carry`'s definition, we then find that $
 
 = cases( 2^32 - (`x as DWordWL`)_0 & "if" (`x as DWordWL`)_0 != 0, 0 & "if" (`x as DWordWL`)_0 = 0 ),\
@@ -18,7 +20,7 @@ The constraints force the `carry` values to be fixed. Writing `carry`'s definiti
 
 &= 2^32 dot `neg`_1 + `neg`_0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1 - 1) + (2^32 - (`x as DWordWL`)_0)  \ &= 2^64 - 2^32 dot (`x as DWordWL`)_1 - 2^32 + 2^32 - (`x as DWordWL`)_0  \ &= 2^64 - ((`x as DWordWL`)_0 + 2^32 dot (`x as DWordWL`)_1) \ &= 2^64 - `x`\ &equiv -x mod 2^64 $ when `cond` is set. When `cond` is not set, the two lookups are not executed, allowing `neg` to take any value in either case.
 
-= Note It is worth noting that this construction does _not_ require the limbs of `neg` to be range checked, thus allowing it be represented by the unrangecheckable `DWordWL` rather than a `DWordHL`. The input value `x` is still assumed to be range-checked, however.
+It is worth noting that this construction does _not_ require the limbs of `neg` to be range checked, thus allowing it to be represented by the unrangecheckable `DWordWL` rather than a `DWordHL`. The input value `x` is still assumed to be range-checked, however.
 
 ## Columns
 
diff --git a/docs/spec/shift.md b/docs/spec/shift.md
index 32679168f..0b52509ec 100644
--- a/docs/spec/shift.md
+++ b/docs/spec/shift.md
@@ -6,9 +6,9 @@ $ $
 
 $ Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.
 
-= Columns
+= Variables
 
-The `SHIFT` chip is comprised of  variables that are expressed using  columns:
+The `SHIFT` chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 = Assumptions
 
diff --git a/docs/spec/sign.md b/docs/spec/sign.md
index 592c78489..a0aac8eb7 100644
--- a/docs/spec/sign.md
+++ b/docs/spec/sign.md
@@ -2,7 +2,7 @@
 
 It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are `1`, and `0` otherwise.
 
-= Variables The  template operates on three variables:
+= Variables The `SIGN` template introduces the following interaction(s):
 
 = Assumptions The  template operates on the following assumptions:
 
diff --git a/docs/spec/signatures.md b/docs/spec/signatures.md
index b691ff739..fd6e45e63 100644
--- a/docs/spec/signatures.md
+++ b/docs/spec/signatures.md
@@ -20,8 +20,8 @@ return vars.map(v => { let (label, factor) = if type(v) == array { (v.at(0), v.a
 
 The following lists signatures of the .len() interactions in this VM.
 
-table( columns: (1fr, auto), inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*], [*Bus size*]), table.hline(stroke: 1pt), table.vline(stroke: 1pt, x: 1), ..for sig in interactions { ([], []) }, ), caption: "Signature overview of interactions",
+columns: (1fr, auto), inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*], [*Bus size*]), table.hline(stroke: 1pt), table.vline(stroke: 1pt, x: 1), ..for sig in interactions { ([], []) }, ))
 
 Below, we list the signatures of the .len() templates in this VM.
 
-table( columns: 1fr, inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*]), table.hline(stroke: 1pt), ..for sig in templates { ([], ) }, ), caption: "Signature overview of templates",
\ No newline at end of file
+columns: 1fr, inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*]), table.hline(stroke: 1pt), ..for sig in templates { ([], ) }, ))
\ No newline at end of file
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index 86ccf58c8..b651377e4 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -114,11 +114,11 @@ return vars.map(v => { let (label, factor) = if type(v) == array { (v.at(0), v.a
 
 The following lists signatures of the .len() interactions in this VM.
 
-table( columns: (1fr, auto), inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*], [*Bus size*]), table.hline(stroke: 1pt), table.vline(stroke: 1pt, x: 1), ..for sig in interactions { ([], []) }, ), caption: "Signature overview of interactions",
+columns: (1fr, auto), inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*], [*Bus size*]), table.hline(stroke: 1pt), table.vline(stroke: 1pt, x: 1), ..for sig in interactions { ([], []) }, ))
 
 Below, we list the signatures of the .len() templates in this VM.
 
-table( columns: 1fr, inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*]), table.hline(stroke: 1pt), ..for sig in templates { ([], ) }, ), caption: "Signature overview of templates",
+columns: 1fr, inset: 7pt, align: (top+left, center), stroke: none, table.header([*Signature*]), table.hline(stroke: 1pt), ..for sig in templates { ([], ) }, ))
 
 ---
 
@@ -126,13 +126,15 @@ table( columns: 1fr, inset: 7pt, align: (top+left, center), stroke: none, table.
 
 Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
 
-= Variables The  template operates on two variables: `cond` and `X`:
+= Variables The `IS_BIT` template operates on the following variables:
 
 = Constraints It takes only one constraint to enforce that `X` must be either `0` or `1` whenever ``cond` eq.not 0`:
 
 *Note*: - In case of _unconditional_ template application, `cond` can be dropped from the constraint, simplifying it to ``X` (1- `X`) = 0`. - As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression. This is to make sure that [isbit:c:isbit]'s expression has degree at most 3.
 
-= Proof of correctness If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When ``cond` eq.not 0`, it follows that the statement can only be proven when ``X` (1-`X`) equiv 0 mod p`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either ``X` equiv 0 mod p` or `1-`X` equiv 0 mod p`. Hence, it is proven that when ``cond` eq.not 0`, [isbit:c:isbit] is only satisfied if ``X` in {0, 1}`. 
+## Correctness argument
+
+If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When `cond ≠ 0`, it follows that the statement can only be proven when `X (1 - X) ≡ 0 (mod p)`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either `X ≡ 0 (mod p)` or `1 - X ≡ 0 (mod p)`. Hence, it is proven that when `cond ≠ 0`, [isbit:c:isbit] is only satisfied if `X ∈ {0, 1}`.
 
 ## Columns
 
@@ -163,7 +165,7 @@ Barring exceptional cases, this template is used to assert that a variable of ty
 
 It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are `1`, and `0` otherwise.
 
-= Variables The  template operates on three variables:
+= Variables The `SIGN` template introduces the following interaction(s):
 
 = Assumptions The  template operates on the following assumptions:
 
@@ -210,7 +212,7 @@ For ease of notation, we moreover introduce the  constraint template $
 
 $ in both conditional and unconditional versions. It constrains that ``diff` equiv `lhs` - `rhs` (mod 2^64)` when the expression `cond` is non-zero.
 
-= Variables
+= Variables This template introduces  interaction(s).
 
 = Assumptions
 
@@ -271,12 +273,14 @@ carry (when iter=1) := 2^-32 * (lhs[1] + rhs[1] + carry[0] - sum[1])
 
 It requires `cond` to be a bit.
 
-= Variables
+= Variables This template introduces  interaction(s).
 
 = Assumptions
 
 = Constraints We constrain this equality using two constraints:
 
+## Correctness argument
+
 The constraints force the `carry` values to be fixed. Writing `carry`'s definition, we then find that $
 
 = cases( 2^32 - (`x as DWordWL`)_0 & "if" (`x as DWordWL`)_0 != 0, 0 & "if" (`x as DWordWL`)_0 = 0 ),\
@@ -287,7 +291,7 @@ The constraints force the `carry` values to be fixed. Writing `carry`'s definiti
 
 &= 2^32 dot `neg`_1 + `neg`_0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1 - 1) + (2^32 - (`x as DWordWL`)_0)  \ &= 2^64 - 2^32 dot (`x as DWordWL`)_1 - 2^32 + 2^32 - (`x as DWordWL`)_0  \ &= 2^64 - ((`x as DWordWL`)_0 + 2^32 dot (`x as DWordWL`)_1) \ &= 2^64 - `x`\ &equiv -x mod 2^64 $ when `cond` is set. When `cond` is not set, the two lookups are not executed, allowing `neg` to take any value in either case.
 
-= Note It is worth noting that this construction does _not_ require the limbs of `neg` to be range checked, thus allowing it be represented by the unrangecheckable `DWordWL` rather than a `DWordHL`. The input value `x` is still assumed to be range-checked, however.
+It is worth noting that this construction does _not_ require the limbs of `neg` to be range checked, thus allowing it to be represented by the unrangecheckable `DWordWL` rather than a `DWordHL`. The input value `x` is still assumed to be range-checked, however.
 
 ## Columns
 
@@ -343,7 +347,7 @@ carry (when iter=1) := 2^-32 * ((x::DWordWL)[1] + neg[1] + carry[0])
 
 All `RV64IMC` instruction are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table, can be derived from the `packed_decode` variable provided below.
 
-= Columns
+= Variables
 
 The  table is comprised of  variables that are expressed using  columns:
 
@@ -369,7 +373,7 @@ super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
 
 show figure: set block(breakable: true)
 
-figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), ), caption: [Decoding table] }
+figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), )) }
 
 // OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
 
@@ -411,9 +415,9 @@ This entry is used to pad the `CPU` table. More details on this matter are provi
 
 The  chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations. It bases its decisions on the entry of the `DECODE` table ([decode]) corresponding the the current program counter (PC).
 
-= Columns
+= Variables
 
-The `CPU` chip is comprised of  variables that are expressed using  columns:
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 = Assumptions
 
@@ -652,9 +656,9 @@ $ $
 
 $ Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.
 
-= Columns
+= Variables
 
-The `SHIFT` chip is comprised of  variables that are expressed using  columns:
+The `SHIFT` chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 = Assumptions
 
@@ -847,9 +851,9 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 
 The  chip computes the target address of a branching instruction.
 
-= Columns
+= Variables
 
-The `BRANCH` chip is comprised of  variables that are expressed using  columns:
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 = Assumptions
 
@@ -941,9 +945,9 @@ next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 
 The  chip is used to read and write memory locations (both RAM and registers) in chunks of 1, 2, 4 or 8 values. It introduces the old value and last-accessed timestamps of memory addresses internally, in order to satisfy the design of the memory argument ([memory]).
 
-= Columns
+= Variables
 
-The `MEMW` chip is comprised of  variables that are expressed using  columns:
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 = Assumptions
 
@@ -998,7 +1002,7 @@ When a memory access happens at an address with proper alignment for its access
 
 Further logic remains essentially the same, so we briefly present the relevant tables for this chip.
 
-The  chip only needs  variables, expressed through  columns.
+The  chip only needs  variables, expressed through  columns; it leverages  interactions.
 
 ## Padding
 
@@ -1012,6 +1016,41 @@ Note: as a result of hard optimization, this chip can only be used for register
 
 Note moreover that this chip does not guard against misaligned register access faults: to access register with a given `address`, one must provide `2 dot `address`` in the lookup.
 
+## Variables
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interactions:
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `MEMW-A2` |  | `IS_BIT<write2>` |
+| `MEMW-A3` |  | `IS_BIT<write4>` |
+| `MEMW-A4` |  | `IS_BIT<write8>` |
+| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
+| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+The following range checks are assumed to be performed/enforced outside of this chip:
+
+## Constraints
+
+Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the time. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that ``old_timestamp[1]` = `timestamp[1]``;  can be used for accesses where this is not the case.
+
+Verifying that ``timestamp` > `old_timestamp`` now simplifies to verifying that ``timestamp[0]` - `old_timestamp[0]` > 0`. For most accesses, this value will be small enough to fit in a `Half`. This chip thus enforces this by means of the following constraint:
+
+With ``old_timestamp`<`timestamp`` asserted, `old` is read from the register ([regw:c:read_old]) and `val` is written back ([regw:c:write_val]).
+
+This chip can either just write (``μ_write` = 1`), or both read and write (``μ_read` = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
+
+Lastly, this chip contributes the following interactions to the logup:
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+= Notes/optimizations The following ideas may prove to be optimizations for the // chip: - `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes. - For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
+
 ## Columns
 
 ### Input
@@ -1075,48 +1114,15 @@ address_add := ['arr', ['-', ['+', ['idx', 'base_address', 0], 'i', 1], ['*', ['
 | `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
 | `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
 
-The  chip is comprised of  variables that are expressed using  columns:
-
-## Assumptions
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
-| `MEMW-A2` |  | `IS_BIT<write2>` |
-| `MEMW-A3` |  | `IS_BIT<write4>` |
-| `MEMW-A4` |  | `IS_BIT<write8>` |
-| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
-| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
-
-The following range checks are assumed to be performed/enforced outside of this chip:
-
-## Constraints
-
-Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the times. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that ``old_timestamp[1]` = `timestamp[1]``;  can be used for accesses where this is not the case.
-
-Verifying that ``timestamp` > `old_timestamp`` now simplifies to verifying that ``timestamp[0]` - `old_timestamp[0]` > 0`. For most accesses, this value will be small enough to fit in a `Half`. This chip thus enforces this by means of the following constraint:
-
-With ``old_timestamp`<`timestamp`` asserted, `old` is read from the register ([regw:c:read_old]) and `val` is written back ([regw:c:write_val]).
-
-This chip can either just write (``μ_write` = 1`), or both read and write (``μ_read` = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
-
-Lastly, this chip contributes the following interactions to the logup:
-
-## Padding
-
-The table can be padded to the next power of two with the following value assignments:
-
-= Future optimization ideas - `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes. - For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
-
 ---
 
 # LT Chip
 
 The  chip constrains an indicator bit for the less-than relation, signed or unsigned.
 
-= Columns
+= Variables
 
-The `LT` chip is comprised of  variables that are expressed using  columns:
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 = Assumptions We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
 
@@ -1220,9 +1226,9 @@ unsigned_lt := carry[1]
 
 The  chip constrains multiplication, both signed and unsigned, as well as providing access to the low and high halfs of the multiplication result.
 
-= Columns
+= Variables
 
-The `MUL` chip is comprised of  variables that are expressed using  columns:
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 `mat(delim: , top; bottom)` }
 
@@ -1276,9 +1282,7 @@ The  chip contributes the following to the lookup:
 
 The table can be padded to the next power of two with the following value assignments:
 
-= Notes - `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere.
-
-As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
+= Notes/optimizations - `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere. - As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
 
 ## Columns
 
@@ -1365,9 +1369,9 @@ carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
 
 The  chip provides division and remainder functionality, both signed and unsigned.
 
-= Columns
+= Variables
 
-The `DVRM` chip is comprised of  variables that are expressed using  columns:
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 = Assumptions
 
@@ -1570,9 +1574,9 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 
 The  chip provides functionality to read values from memory and sign-extend them where appropriate. It delegates low-level memory handling to the `MEMW` chip ([memw]).
 
-= Columns
+= Variables
 
-The `LOAD` chip is comprised of  variables that are expressed using  columns:
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 = Assumptions
 
@@ -1661,175 +1665,13 @@ read1 := μ - read2 - read4 - read8
 
 # ECALL Chips
 
-ECALLs provide system-level functionalities to the guest program.
-
-When `ECALL` is executed, it is assumed that: - register `A7` contains the system call number
-
-- the arguments are located in registers `A0`-`A6`, and - the return value is written to `A0`, where `A0`-`A7` are symbolic names for the registers `x10`-`x17`
-
-=  chip
-
-## Columns
-
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `timestamp` | `DWordWL` | timestamp at which to halt the program |
-
-The  chip leverages  variable, spanning  columns:
-
-## Assumptions
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `HALT-A1.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
-
-It is assumed the input is range checked:
-
-## Constraints
-
-The  chip: + makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), + writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and + sets `pc` equal to `1` ([halt:c:pc]). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`; the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `HALT-C1.i` | i ∈ [1, 9] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
-| `HALT-C2` |  | `MEMW[0::BaseField[8]; 1, (2 * 10)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
-| `HALT-C3.i` | i ∈ [11, 31] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
-| `HALT-C4` |  | `MEMW[1, (2 * 255)::DWordWL, ['arr', 1, 0, 0, 0, 0, 0, 0, 0], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
-
-[ Observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there. ])
-
-### Lookup
-
-In this VM, halting is considered equivalent to executing a `sys_exit`. Hence, this chip responds to `ECALL`s with system call number 93.
-
-The HALT chip therefore contributes the following interaction to the lookup-argument:
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `HALT-C5` | `ECALL[timestamp, 93::DWordWL]` | -1 |
-
-## Padding
-
-This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
-
-=  chip
-
-## Columns
-
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `timestamp` | `DWordWL` | timestamp at which to commit |
-
-### Auxiliary
-
-| Name | Type | Description |
-|------|------|-------------|
-| `index` | `BaseField` | Index of value being committed. |
-| `address` | `DWordWL` | Address of first byte to commit. |
-| `address_incr` | `DWordHL` | $`address` + 1$ |
-| `count` | `DWordWL` | number of bytes to commit |
-| `count_decr` | `DWordHL` | $`count` - 1$ |
-| `first` | `Bit` | Whether this is the first commitment in this sequence. |
-| `end` | `Bit` | Whether this is the end of the commitment sequence. |
-| `value` | `Byte` | Byte stored at `address`. |
-
-### Multiplicity
-
-| Name | Type | Description |
-|------|------|-------------|
-| `μ` | `Bit` |  |
-
-The  chip leverages  variables, spanning  columns:
-
-## Constraints
-
-In this VM, committing is considered equivalent to writing a value to `stdout`. Hence, this chip responds to `ECALL`s with system call number 64.
-
-Since we do not know how many bytes are to be committed, this chip employs a recursive design: each iteration commits one byte, and recursively "calls" itself to commit the remaining bytes. As such, only the call from the CPU to this chip (i.e., the `first` in the recursion tree) should accept the `ECALL`; later recursive calls should not. This is why [commit:c:receive_ecall] has multiplicity `-`first``.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `COMMIT-C1` | `ECALL[timestamp, 64::DWordWL]` | -first |
-
-The `write` operation --- writing to a file descriptor --- has the following signature:
-
-```c ssize_t write(size_t count; int fd, const void buf[count], size_t count); ```
-
-That is to say, - `A0` contains the file descriptor, - `A1` contains the address of `buf`'s first byte, - `A2` contains `count`, and - the written count should be written to `A0`.
-
-[commit:c:read_address] reads `address` from `x11` (=`A1`) and [commit:c:read_count] reads `count` from `x12` (=`A2`). Since we only support writing to `stdout` (which corresponds to ``fd` = 1`
-
-we assert that `x10` contains `1` in [commit:c:read_fd_write_count]. Note that this constraint _also_ writes `count` to `A0`; in this VM it is impossible for a commit to be interrupted or fail. Lastly, the `index` is read from `x254`; in the same operation, ``index` + `count`` is written back to this location by [commit:c:read_index]. This, too, leverages the fact that a commit will not be interrupted or fail to update the `index` for the next commit sequence. Again, each of these memory interactions only take place when this is the `first` call in the recursion tree.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `COMMIT-C2` | `MEMW[['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C3` | `MEMW[['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 12)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C4` | `MEMW[['arr', 1, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C5` | `MEMW[['arr', 'index', 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 254)::DWordWL, ['arr', ['+', 'index', ['cast', 'count', 'BaseField']], 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | first |
-
-*Note*: the observant reader will notice that [commit:c:read_index] casts `count` to a `BaseField`, potentiallly losing information. This is indeed correct. However, since it is practically impossible to commit more than `2^64-2^32` bytes in a single VM execution, it was decided to permit this.
-
-Next, we read the `value` located at buffer address `address` and commit to it under the given `index`. This is only performed when we have not yet reached the `end` of the commit sequence.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `COMMIT-C6` | `MEMW[['arr', 'value', 0, 0, 0, 0, 0, 0, 0]; 0, address, ['arr', 'value', 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | μ - end |
-| `COMMIT-C7` | `COMMIT[index, value]` | μ - end |
-
-In parallel, we compute ``address_incr` = `address` + 1` ([commit:c:address_incr]) as address of the next byte to commit, and ``count_decr` = `count` - 1` ([commit:c:count_decr]) as the number of bytes that still has to be committed after committing this byte. [commit:c:range_address_incr] and [commit:c:range_count_decr] are included to satisfy [add:a:sum] respectively [add:a:rhs].
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `COMMIT-C8` |  | `ADD<address_incr::DWordWL; address, 1::DWordWL>` |  |
-| `COMMIT-C9.i` | i ∈ [0, 3] | `IS_HALF[address_incr[i]]` | μ |
-| `COMMIT-C10` |  | `SUB<count_decr::DWordWL; count, 1::DWordWL>` |  |
-| `COMMIT-C11.i` | i ∈ [0, 3] | `IS_HALF[count_decr[i]]` | μ |
-
-When `count` hits `0`, we should stop performing further recursive calls. We use the `end` bit to indicate these circumstances.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `COMMIT-C12` | `ZERO[end; (65535 - count_decr[0]) + (65535 - count_decr[1]) + (65535 - count_decr[2]) + (65535 - count_decr[3])]` | μ |
-
-*Note*: + Rather than setting ``end` = 1` when ``count` = 0`, we do so when ``count_decr` = -1`. This technique allows `count` to be stored in a `DWordWL` rather than a `DWordHL`, saving two columns. + `forall i in [0, 3]: 65535 - `count_decr`_i >= 0` as a result of [commit:c:range_count_decr]. Hence, $ sum_(i=0)^3 65535 - `count_decr`_i = 0 arrow.l.r.double.long forall i in [0, 3]: `count_decr`_i = 65535 $
-
-When this was not the `end` byte to commit in this recursion sequence, we recursively _Commit the Next Byte_ (`CNB`), specifying the timestamp, address to continue reading and the number of bytes that should still be committed ([commit:c:send_commit_next_byte]). Since that certainly won't be the `first` call in the sequence, we read `address_incr` and `count_decr` from the previous recursion level into `address` and `count` and continue executing the commit.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `COMMIT-C13` | `CNB[timestamp, index + 1, address_incr::DWordWL, count_decr::DWordWL]` | μ - end |
-| `COMMIT-C14` | `CNB[timestamp, index, address, count]` | -(μ - first) |
-
-Lastly, we must make sure `first`, `end` and `μ` are bits ([commit:c:range_first], [commit:c:range_end], [commit:c:range_mu]), and that when either ``first` = 1` or ``end` = 1` imply that ``μ` = 1` ([commit:c:first_or_end_implies_mu]). These are required to ensure the multiplicities `-(`μ` - `first`)` and ``μ` - `end`` are binary.
-
-| Tag | Description |
-|-----|-------------|
-| `COMMIT-C15` | `IS_BIT<first>` |
-| `COMMIT-C16` | `IS_BIT<end>` |
-| `COMMIT-C17` | `IS_BIT<μ>` |
-| `COMMIT-C18` | `first` + `end` => `μ` = 1 |
-| | _polynomial:_ `(first + end) * (1 - μ) = 0` |
-
-## Padding
-
-To pad this chip, use the below data.
-
-## Notes/optimizations
-
-- The current version only supports writing to `stdout`. This chip could potentially be extended to support writing to arbitrary `fd`s - One might be able to replace [commit:c:end] by `end => count = 0`. While loosening the constraint (`count = 0 => end` is no longer enforced), this should not cause any problems: if the prover does not set `end` when `count=0`, they simply cannot complete the proof. First of all, one would have to recursively work through all `2^64` values of `count`, something that is practically infeasible. Moreover, if this is done with a sequence that originally has ``count` > 0`, one will inevitably have to read a memory address twice at the same timestamp, which is impossible to prove. In addition to dropping the `ZERO` lookup, this optimization might also permit moving `count_decr` from a `DWordHL` to a `DWordWL`, saving two columns. - Given that it is practically infeasible to commit more than ``p`-1 = 2^64-2^32` bytes in a program, it might suffice to store `count_decr` in a `BaseField`. Note that this would probably involve having an extra (virtual) column storing `count` in `BaseField` form as well. Moreover, one might need to add a lookup to `LT` to ensure ``count` <= `p`-1` when being read from memory at the beginning of each commitment sequence.
-
 ---
 
 # BITWISE Chips
 
 The  chips deal with precomputed lookup tables for bitwise boolean operations and convenience functionalities over small domains.
 
-= Columns
+= Variables
 
 The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
 
@@ -1837,7 +1679,7 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 
 = Lookup This chip adds the following interactions to the lookup:
 
-= Areas of Optimization The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
+= Notes/Optimizations The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicitly range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
 
 ## Columns
 

From 82ae2dd60b49e17b92a40c5f300286e28aaa2656 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Tue, 14 Apr 2026 11:39:19 -0300
Subject: [PATCH 091/105] fix

---
 docs/spec/about_ecalls.md |    7 +
 docs/spec/add.md          |   10 +-
 docs/spec/bitwise.md      |   22 +-
 docs/spec/branch.md       |   56 +-
 docs/spec/commit.md       |  108 ++
 docs/spec/cpu.md          |  192 ++-
 docs/spec/decode.md       |   91 +-
 docs/spec/dvrm.md         |  204 +--
 docs/spec/ecall.md        |    2 +-
 docs/spec/halt.md         |   46 +
 docs/spec/is_bit.md       |   22 +-
 docs/spec/load.md         |   64 +-
 docs/spec/logup.md        |   81 ++
 docs/spec/lt.md           |   88 +-
 docs/spec/memory.md       |   45 +-
 docs/spec/memw.md         |  275 +++-
 docs/spec/mul.md          |  116 +-
 docs/spec/neg.md          |   42 +-
 docs/spec/sha256.md       |  479 +++++++
 docs/spec/shift.md        |  190 +--
 docs/spec/sign.md         |   16 +-
 docs/spec/spec_full.md    | 2681 +++++++++++++++++++++++++------------
 22 files changed, 3358 insertions(+), 1479 deletions(-)
 create mode 100644 docs/spec/about_ecalls.md
 create mode 100644 docs/spec/commit.md
 create mode 100644 docs/spec/halt.md
 create mode 100644 docs/spec/logup.md
 create mode 100644 docs/spec/sha256.md

diff --git a/docs/spec/about_ecalls.md b/docs/spec/about_ecalls.md
new file mode 100644
index 000000000..a128c5e3a
--- /dev/null
+++ b/docs/spec/about_ecalls.md
@@ -0,0 +1,7 @@
+# About ECALL
+
+ECALLs provide system-level functionalities to the guest program.
+
+When `ECALL` is executed, it is assumed that: - register `A7` contains the system call number
+
+- the arguments are located in registers `A0`-`A6`, and - the return value is written to `A0`, where `A0`-`A7` are symbolic names for the registers `x10`-`x17`
\ No newline at end of file
diff --git a/docs/spec/add.md b/docs/spec/add.md
index a021bf7c9..69d772aab 100644
--- a/docs/spec/add.md
+++ b/docs/spec/add.md
@@ -4,13 +4,9 @@ For ease of notation, we moreover introduce the  constraint template $
 
 $ in both conditional and unconditional versions. It constrains that ``diff` equiv `lhs` - `rhs` (mod 2^64)` when the expression `cond` is non-zero.
 
-= Variables This template introduces  interaction(s).
+## Variables
 
-= Assumptions
-
-= Constraints This template introduces the following constraints
-
-## Columns
+This template introduces  interaction(s).
 
 ### Input
 
@@ -53,7 +49,7 @@ carry (when iter=1) := 2^-32 * (lhs[1] + rhs[1] + carry[0] - sum[1])
 
 ## Constraints
 
-### all
+This template introduces the following constraints
 
 | Tag | Range | Description |
 |-----|-------|-------------|
diff --git a/docs/spec/bitwise.md b/docs/spec/bitwise.md
index 53bea4b90..8d78d0e35 100644
--- a/docs/spec/bitwise.md
+++ b/docs/spec/bitwise.md
@@ -2,18 +2,10 @@
 
 The  chips deal with precomputed lookup tables for bitwise boolean operations and convenience functionalities over small domains.
 
-= Variables
+## Variables
 
 The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
 
-*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
-
-= Lookup This chip adds the following interactions to the lookup:
-
-= Notes/Optimizations The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
-
-## Columns
-
 ### Input
 
 | Name | Type | Description |
@@ -50,9 +42,11 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 | `μ_IS_B20` | `BaseField` |  |
 | `μ_HWSL` | `BaseField` |  |
 
-## Constraints
+*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
+
+## Lookup
 
-### contributions
+This chip adds the following interactions to the lookup:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
@@ -65,4 +59,8 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 | `BITWISE-C7` | `IS_BYTE[X]` | -μ_IS_BYTE |
 | `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
 | `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
-| `BITWISE-C10` | `HWSL[['arr', 'SLL', 'SLLC']; X + 256 * Y, Z]` | -μ_HWSL |
\ No newline at end of file
+| `BITWISE-C10` | `HWSL[['arr', 'SLL', 'SLLC']; X + 256 * Y, Z]` | -μ_HWSL |
+
+## Notes/Optimizations
+
+The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicitly range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
\ No newline at end of file
diff --git a/docs/spec/branch.md b/docs/spec/branch.md
index c68c2c4a2..fd4aba45f 100644
--- a/docs/spec/branch.md
+++ b/docs/spec/branch.md
@@ -2,38 +2,10 @@
 
 The  chip computes the target address of a branching instruction.
 
-= Variables
+## Variables
 
 The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
-= Assumptions
-
-= Constraints
-
-We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
-
-The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
-| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
-| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
-| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte, 254]` | μ |
-| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALF[next_pc_high[i]]` | μ |
-
-This chip contributes the following to the lookup argument.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
-
-= Padding
-
-The table can be padded to the next power of two with the following value assignments:
-
-## Columns
-
 ### Input
 
 | Name | Type | Description |
@@ -88,4 +60,28 @@ next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 | `BRANCH-A1.i` | i ∈ [0, 1] | `pc` is range checked, `IS_WORD[pc[i]]` |
 | `BRANCH-A2` |  | `offset` is range checked, `IS_WORD[offset]` |
 | `BRANCH-A3.i` | i ∈ [0, 1] | `register` is range checked, `IS_WORD[register[i]]` |
-| `BRANCH-A4` |  | `IS_BIT<JALR>` |
\ No newline at end of file
+| `BRANCH-A4` |  | `IS_BIT<JALR>` |
+
+## Constraints
+
+We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
+
+The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
+| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
+| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
+| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte, 254]` | μ |
+| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALF[next_pc_high[i]]` | μ |
+
+This chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
diff --git a/docs/spec/commit.md b/docs/spec/commit.md
new file mode 100644
index 000000000..83f921d3b
--- /dev/null
+++ b/docs/spec/commit.md
@@ -0,0 +1,108 @@
+# COMMIT Chip
+
+## Variables
+
+The  chip leverages  variables, spanning  columns and leverages  interactions:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which to commit |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `index` | `BaseField` | Index of value being committed. |
+| `address` | `DWordWL` | Address of first byte to commit. |
+| `address_incr` | `DWordHL` | $`address` + 1$ |
+| `count` | `DWordWL` | number of bytes to commit |
+| `count_decr` | `DWordHL` | $`count` - 1$ |
+| `first` | `Bit` | Whether this is the first commitment in this sequence. |
+| `end` | `Bit` | Whether this is the end of the commitment sequence. |
+| `value` | `Byte` | Byte stored at `address`. |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+## Constraints
+
+In this VM, committing is considered equivalent to writing a value to `stdout`. Hence, this chip responds to `ECALL`s with system call number 64.
+
+Since we do not know how many bytes are to be committed, this chip employs a recursive design: each iteration commits one byte, and recursively "calls" itself to commit the remaining bytes. As such, only the call from the CPU to this chip (i.e., the `first` in the recursion tree) should accept the `ECALL`; later recursive calls should not. This is why [commit:c:receive_ecall] has multiplicity `-`first``.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C1` | `ECALL[timestamp, 64::DWordWL]` | -first |
+
+The `write` operation --- writing to a file descriptor --- has the following signature:
+
+`ssize_t write(size_t count; int fd, const void buf[count], size_t count);`
+
+That is to say, - `A0` contains the file descriptor, - `A1` contains the address of `buf`'s first byte, - `A2` contains `count`, and - the written count should be written to `A0`.
+
+[commit:c:read_address] reads `address` from `x11` (=`A1`) and [commit:c:read_count] reads `count` from `x12` (=`A2`). Since we only support writing to `stdout` (which corresponds to ``fd` = 1`),
+
+we assert that `x10` contains `1` in [commit:c:read_fd_write_count]. Note that this constraint _also_ writes `count` to `A0`; in this VM it is impossible for a commit to be interrupted or fail. Lastly, the `index` is read from `x254`; in the same operation, ``index` + `count`` is written back to this location by [commit:c:read_index]. This, too, leverages the fact that a commit will not be interrupted or fail to update the `index` for the next commit sequence. Again, each of these memory interactions only take place when this is the `first` call in the recursion tree.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C2` | `MEMW[['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C3` | `MEMW[['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 12)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C4` | `MEMW[['arr', 1, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C5` | `MEMW[['arr', 'index', 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 254)::DWordWL, ['arr', ['+', 'index', ['cast', 'count', 'BaseField']], 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | first |
+
+*Note*: the observant reader will notice that [commit:c:read_index] casts `count` to a `BaseField`, potentially losing information. This is indeed correct. However, since it is practically impossible to commit more than `2^64-2^32` bytes in a single VM execution, it was decided to permit this.
+
+Next, we read the `value` located at buffer address `address` and commit to it under the given `index`. This is only performed when we have not yet reached the `end` of the commit sequence.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C6` | `MEMW[['arr', 'value', 0, 0, 0, 0, 0, 0, 0]; 0, address, ['arr', 'value', 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | μ - end |
+| `COMMIT-C7` | `COMMIT[index, value]` | μ - end |
+
+In parallel, we compute ``address_incr` = `address` + 1` ([commit:c:address_incr]) as the address of the next byte to commit, and ``count_decr` = `count` - 1` ([commit:c:count_decr]) as the number of bytes that still have to be committed after committing this byte. [commit:c:range_address_incr] and [commit:c:range_count_decr] are included to satisfy [add:a:sum] and [add:a:rhs], respectively.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `COMMIT-C8` |  | `ADD<address_incr::DWordWL; address, 1::DWordWL>` |  |
+| `COMMIT-C9.i` | i ∈ [0, 3] | `IS_HALF[address_incr[i]]` | μ |
+| `COMMIT-C10` |  | `SUB<count_decr::DWordWL; count, 1::DWordWL>` |  |
+| `COMMIT-C11.i` | i ∈ [0, 3] | `IS_HALF[count_decr[i]]` | μ |
+
+When `count` hits `0`, we should stop performing further recursive calls. We use the `end` bit to indicate these circumstances.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C12` | `ZERO[end; (65535 - count_decr[0]) + (65535 - count_decr[1]) + (65535 - count_decr[2]) + (65535 - count_decr[3])]` | μ |
+
+*Note*: + Rather than setting ``end` = 1` when ``count` = 0`, we do so when ``count_decr` = -1`. This technique allows `count` to be stored in a `DWordWL` rather than a `DWordHL`, saving two columns. + $\forall i \in [0, 3]: 65535 - \texttt{count\_decr}_i \geq 0$ as a result of [commit:c:range_count_decr]. Hence, $\sum_{i=0}^{3} (65535 - \texttt{count\_decr}_i) = 0 \iff \forall i \in [0, 3]: \texttt{count\_decr}_i = 65535$
+
+When this was not the `end` byte to commit in this recursion sequence, we recursively _Commit the Next Byte_ (`CNB`), specifying the timestamp, address to continue reading and the number of bytes that should still be committed ([commit:c:send_commit_next_byte]). Since that certainly won't be the `first` call in the sequence, we read `address_incr` and `count_decr` from the previous recursion level into `address` and `count` and continue executing the commit.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C13` | `CNB[timestamp, index + 1, address_incr::DWordWL, count_decr::DWordWL]` | μ - end |
+| `COMMIT-C14` | `CNB[timestamp, index, address, count]` | -(μ - first) |
+
+Lastly, we must make sure `first`, `end` and `μ` are bits ([commit:c:range_first], [commit:c:range_end], [commit:c:range_mu]), and that either ``first` = 1` or ``end` = 1` implies that ``μ` = 1` ([commit:c:first_or_end_implies_mu]). These are required to ensure the multiplicities `-(`μ` - `first`)` and ``μ` - `end`` are binary.
+
+| Tag | Description |
+|-----|-------------|
+| `COMMIT-C15` | `IS_BIT<first>` |
+| `COMMIT-C16` | `IS_BIT<end>` |
+| `COMMIT-C17` | `IS_BIT<μ>` |
+| `COMMIT-C18` | `first` + `end` => `μ` = 1 |
+| | _polynomial:_ `(first + end) * (1 - μ) = 0` |
+
+## Padding
+
+To pad this chip, use the below data.
+
+## Notes/optimizations
+
+- The current version only supports writing to `stdout`. This chip could potentially be extended to support writing to arbitrary `fd`s - One might be able to replace [commit:c:end] by `end => count = 0`. While loosening the constraint (`count = 0 => end` is no longer enforced), this should not cause any problems: if the prover does not set `end` when `count=0`, they simply cannot complete the proof. First of all, one would have to recursively work through all `2^64` values of `count`, something that is practically infeasible. Moreover, if this is done with a sequence that originally has ``count` > 0`, one will inevitably have to read a memory address twice at the same timestamp, which is impossible to prove. In addition to dropping the `ZERO` lookup, this optimization might also permit moving `count_decr` from a `DWordHL` to a `DWordWL`, saving two columns. - Given that it is practically infeasible to commit more than ``p`-1 = 2^64-2^32` bytes in a program, it might suffice to store `count_decr` in a `BaseField`. Note that this would probably involve having an extra (virtual) column storing `count` in `BaseField` form as well. Moreover, one might need to add a lookup to `LT` to ensure ``count` <= `p`-1` when being read from memory at the beginning of each commitment sequence.
\ No newline at end of file
diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index d203e1e1e..0383d28ef 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -2,19 +2,103 @@
 
 The  chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations. It bases its decisions on the entry of the `DECODE` table ([decode]) corresponding the the current program counter (PC).
 
-= Variables
+## Variables
 
 The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
-= Assumptions
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
+| `pc` | `DWordWL` | The program counter |
+| `rs1` | `Byte` | Source register 1 index |
+| `rs2` | `Byte` | Source register 2 index |
+| `rd` | `Byte` | Destination register index |
+| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
+| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
+| `write_register` | `Bit` | Whether to write back to the destination register |
+| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
+| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
+| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
+| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
+| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
+| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
+| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and     - as flag for inverting the condition of conditional branches (see `branch_cond`)     - as direction (left or right) for `SHIFT` |
+| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
+| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
+| `ADD` | `Bit` | One-hot ALU selector flag |
+| `SUB` | `Bit` | One-hot ALU selector flag |
+| `SLT` | `Bit` | One-hot ALU selector flag |
+| `AND` | `Bit` | One-hot ALU selector flag |
+| `OR` | `Bit` | One-hot ALU selector flag |
+| `XOR` | `Bit` | One-hot ALU selector flag |
+| `SHIFT` | `Bit` | One-hot ALU selector flag |
+| `JALR` | `Bit` | One-hot ALU selector flag |
+| `BEQ` | `Bit` | One-hot ALU selector flag |
+| `BLT` | `Bit` | One-hot ALU selector flag |
+| `LOAD` | `Bit` | One-hot ALU selector flag |
+| `STORE` | `Bit` | One-hot ALU selector flag |
+| `MUL` | `Bit` | One-hot ALU selector flag |
+| `DIVREM` | `Bit` | One-hot ALU selector flag |
+| `ECALL` | `Bit` | One-hot ALU selector flag |
+| `EBREAK` | `Bit` | One-hot ALU selector flag |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `next_pc` | `DWordWL` | The program counter for the next instruction |
+| `rvd` | `DWordWL` | The value to (maybe) be written back to rvd |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `rv1` | `DWordWHH` | The value of register `rs1` |
+| `rv2` | `DWordWHH` | The value of register `rs2` |
+| `rv1_ext_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr` |
+| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
+| `rv2_ext_bit` | `Bit` | The sign bit of `rv2` if seen as a 32-bit word, used for sign extension with `word_instr` |
+| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
+| `res_ext_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word, used for sign extension with `word_instr` |
+| `res` | `DWordBL` | The ALU result |
+| `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
+| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
+| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
 
-= Constraints First, we perform a decoding lookup for the current PC.
+**Definition of `packed_decode`:**
+```
+packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
+```
+
+**Definition of `pad`:**
+```
+pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
+```
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
+| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+
+## Constraints
+
+First, we perform a decoding lookup for the current PC.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
 | `CPU-C1` | `DECODE[pc, imm, packed_decode]` | 1 |
 
-## Range checks
+### Range checks
 
 > **Note:** Make sure we argue for every column here
 
@@ -58,7 +142,7 @@ We constrain all columns to have the appropriate ranges. The flags and register
 | `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` | 1 |
 | `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` | 1 |
 
-## ALU
+### ALU
 
 The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
 
@@ -78,7 +162,7 @@ The ALU functionality is then obtained through judicious dispatching to the corr
 | `CPU-CA45` |  | `MUL[res::DWordWL; arg1::DWordHL, signed, arg2::DWordHL, mp_selector, muldiv_selector]` | MUL |
 | `CPU-CA46` |  | `DVRM[res::DWordWL; arg1::DWordHL, arg2::DWordHL, signed, muldiv_selector]` | DIVREM |
 
-## Memory
+### Memory
 
 The interactions with the memory, both for register loading and storing, as for `LOAD` and `STORE` instructions are handled. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The timestamps are ensured to be disjoint for disjoint memory locations. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction.
 
@@ -95,7 +179,7 @@ The interactions with the memory, both for register loading and storing, as for
 | `CPU-CM53` |  | `MEMW[0, res::DWordWL, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
 | `CPU-CM54` |  | `MEMW[['arr', ['idx', 'pc', 0], ['idx', 'pc', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 255)::DWordWL, ['arr', ['idx', 'next_pc', 0], ['idx', 'next_pc', 1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | 1 - pad |
 
-## System
+### System
 
 The interactions with the wider system.
 
@@ -105,7 +189,7 @@ The interactions with the wider system.
 | | _polynomial:_ `1 - EBREAK = 0` | |
 | `CPU-CS56` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
 
-## Input and output to the ALU
+### Input and output to the ALU
 
 We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, including the appropriate sign/zero extension, depending on `word_instr`.
 
@@ -127,7 +211,7 @@ We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, includ
 | `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_ext_bit` dot (2^(32) - 1) |
 | | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_ext_bit * (2^32 - 1)) = 0` |
 
-## Other constraints
+### Other constraints
 
 For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference between `arg1` and `arg2` whenever `BEQ` is `1`. Given that this difference is `0` when both are equal, [cpu:c:is_equal] ensures `is_equal` is set to `1` if and only if ``arg1` = `arg2`` and `BEQ` is set.
 
@@ -141,94 +225,8 @@ For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference betw
 
 > **Note:** Document the choice to not have a multiplicity column here for padding
 
-= Padding
+## Padding
 
 The CPU can be padded with the following values, which have a corresponding row in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
 
-This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
-
-## Columns
-
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
-| `pc` | `DWordWL` | The program counter |
-| `rs1` | `Byte` | Source register 1 index |
-| `rs2` | `Byte` | Source register 2 index |
-| `rd` | `Byte` | Destination register index |
-| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
-| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
-| `write_register` | `Bit` | Whether to write back to the destination register |
-| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
-| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
-| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
-| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
-| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
-| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
-| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and     - as flag for inverting the condition of conditional branches (see `branch_cond`)     - as direction (left or right) for `SHIFT` |
-| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
-| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
-| `ADD` | `Bit` | One-hot ALU selector flag |
-| `SUB` | `Bit` | One-hot ALU selector flag |
-| `SLT` | `Bit` | One-hot ALU selector flag |
-| `AND` | `Bit` | One-hot ALU selector flag |
-| `OR` | `Bit` | One-hot ALU selector flag |
-| `XOR` | `Bit` | One-hot ALU selector flag |
-| `SHIFT` | `Bit` | One-hot ALU selector flag |
-| `JALR` | `Bit` | One-hot ALU selector flag |
-| `BEQ` | `Bit` | One-hot ALU selector flag |
-| `BLT` | `Bit` | One-hot ALU selector flag |
-| `LOAD` | `Bit` | One-hot ALU selector flag |
-| `STORE` | `Bit` | One-hot ALU selector flag |
-| `MUL` | `Bit` | One-hot ALU selector flag |
-| `DIVREM` | `Bit` | One-hot ALU selector flag |
-| `ECALL` | `Bit` | One-hot ALU selector flag |
-| `EBREAK` | `Bit` | One-hot ALU selector flag |
-
-### Output
-
-| Name | Type | Description |
-|------|------|-------------|
-| `next_pc` | `DWordWL` | The program counter for the next instruction |
-| `rvd` | `DWordWL` | The value to (maybe) be written back to rvd |
-
-### Auxiliary
-
-| Name | Type | Description |
-|------|------|-------------|
-| `rv1` | `DWordWHH` | The value of register `rs1` |
-| `rv2` | `DWordWHH` | The value of register `rs2` |
-| `rv1_ext_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
-| `rv2_ext_bit` | `Bit` | The sign bit of `rv2` if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
-| `res_ext_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `res` | `DWordBL` | The ALU result |
-| `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
-| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
-
-### Virtual
-
-| Name | Type | Description |
-|------|------|-------------|
-| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
-| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
-
-**Definition of `packed_decode`:**
-```
-packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
-```
-
-**Definition of `pad`:**
-```
-pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
-```
-
-## Assumptions
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
-| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
\ No newline at end of file
+This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
\ No newline at end of file
diff --git a/docs/spec/decode.md b/docs/spec/decode.md
index 8112035ff..83b51cba1 100644
--- a/docs/spec/decode.md
+++ b/docs/spec/decode.md
@@ -2,15 +2,76 @@
 
 All `RV64IMC` instruction are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table, can be derived from the `packed_decode` variable provided below.
 
-= Variables
+## Variables
 
 The  table is comprised of  variables that are expressed using  columns:
 
-= Padding The  table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
+| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`; \ [27:35] `rs1`, \ [35:43] `rs2`, \ [43:51] `rd`, \ the remaining bits are set to zero.  |
+| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
+
+## Padding
+
+The  table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
 
 Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see [cpu:c:ebreak_traps]), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
 
-= Decoding For the purposes of explaining decoding, we decompress 's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
+## Decoding
+
+For the purposes of explaining decoding, we decompress 's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
+| `rs1` | `Byte` | index of source register 1. |
+| `rs2` | `Byte` | index of source register 2. |
+| `rd` | `Byte` | index of destination register. |
+| `read_register1` | `Bit` | whether to load the contents of address `rs1` (1) or `0` (0) into `rv1`. |
+| `read_register2` | `Bit` | whether to load the contents of address `rs2` (1) or `0` (0) into `rv2`. |
+| `write_register` | `Bit` | whether the result should be written to `rd` ($=0$ for memory write and when $`rd` = `x0`$). |
+| `mem_2B` | `Bit` | whether the memory access (read or write) touches exactly $2$ bytes. |
+| `mem_4B` | `Bit` | whether the memory access (read or write) touches exactly $4$ bytes. |
+| `mem_8B` | `Bit` | whether the memory access (read or write) touches exactly $8$ bytes. |
+| `c_type` | `Bit` | Whether the instruction is of type `C`, i.e., whether it is $2$ bytes long instead of $4$. |
+| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+| `signed` | `Bit` | selector used to indicate signed or unsigned input interpretation. |
+| `mp_selector` | `Bit` | Multi-purpose selector used by the CPU to configure several ALU operations in different ways. See the `CPU` chip for more details. |
+| `muldiv_selector` | `Bit` | selects which output of `MUL` (lo/hi) or `DVRM` (quo/rem) is wanted. |
+| `word_instr` | `Bit` | Whether the instruction is a `*W` instruction, requiring the inputs and outputs to be (sign) extended. |
+| `ADD` | `Bit` | ALU selector flag |
+| `SUB` | `Bit` | ALU selector flag |
+| `SLT` | `Bit` | ALU selector flag |
+| `AND` | `Bit` | ALU selector flag |
+| `OR` | `Bit` | ALU selector flag |
+| `XOR` | `Bit` | ALU selector flag |
+| `SHIFT` | `Bit` | ALU selector flag |
+| `JALR` | `Bit` | ALU selector flag |
+| `BEQ` | `Bit` | ALU selector flag |
+| `BLT` | `Bit` | ALU selector flag |
+| `LOAD` | `Bit` | ALU selector flag |
+| `STORE` | `Bit` | ALU selector flag |
+| `MUL` | `Bit` | ALU selector flag |
+| `DIVREM` | `Bit` | ALU selector flag |
+| `ECALL` | `Bit` | ALU selector flag |
+| `EBREAK` | `Bit` | ALU selector flag |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
 
 We will illustrate how each instruction should be expressed in this (uncompressed) decoding table. The columns of the accompanying table represent the following: - *`operation`*: the assembly operation being encoded. - *`op-flag`*: which of the "`ALU` selector flags" operation flags to set. Each operation sets exactly one. - *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively. - *other*: the other flags that should be set or variables that should be given specific values.
 
@@ -18,7 +79,7 @@ For the purpose of brevity and readability, the table uses the following rules-o
 
 Further clarification is provided in the notes following the table.
 
-## C-type instructions
+### C-type instructions
 
 The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
 
@@ -36,30 +97,14 @@ figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset:
 
 show figure: (it) => align(left, []) [ ] }
 
-## Notes
+### Notes
 
 We note the following about the above decoding table:
 
 enum.item( referenceable_note( "note_word_instr", [`word_instr`: `[W]` indicates that ``word_instr` = 1` for the `W`-variant of the operation, and `0` for the non-`W`-variant.] ), enum.item( referenceable_note( "note_signed", [`signed`: .not`[U]` indicates that ``signed` = 1` for the *non-`U`*-variant of the operation, and `0` for the `U`-variant.] ), enum.item( referenceable_note( "note-lui", [`LUI`: this operation loads the 20-bit `imm` in the upper bits of `rd`. Observe that this can be represented using `ADDI rd, x0, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-auipc", [`AUIPC`: this operation adds the 20-bit immediate to the upper bits of `pc` and stores the result in `rd`. Given that the `pc` is stored in `x255`, this operation can be represented using `ADDI rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-jal", [`JAL`: this operation stores ``pc` + 4` in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`. Note that this can be represented using `JALR rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[1:21]` of `imm` and extending it to 64 bits; the least significant bit should always be 0.*] ), enum.item( referenceable_note( "note-ecall", [`ECALL`: "On RISC-V a system call has its own instruction: `ECALL`. [...] A7 [= register `x17`] contains the system call number." [[source]] ] ), enum.item( referenceable_note( "note-fence", [`FENCE`: currently, the VM interprets this operation as `ADDI x0 x0 0`; a no-op.]
 
-## One more instruction <cpu-padding-decode-row>
+### One more instruction <cpu-padding-decode-row>
 
 In addition to decoding all instructions provided in the ELF and adding a corresponding entry to the  table, one must include an entry that has ``pc` = 1` and every other variable set to `0`. Note that this will never conflict with any entry in the ELF, since it has an odd `pc` value.
 
-This entry is used to pad the `CPU` table. More details on this matter are provided in the `CPU` chip.
-
-## Columns
-
-### Output
-
-| Name | Type | Description |
-|------|------|-------------|
-| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
-| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`; \ [27:35] `rs1`, \ [35:43] `rs2`, \ [43:51] `rd`, \ the remaining bits are set to zero.  |
-| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
-
-### Multiplicity
-
-| Name | Type | Description |
-|------|------|-------------|
-| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
\ No newline at end of file
+This entry is used to pad the `CPU` table. More details on this matter are provided in the `CPU` chip.
\ No newline at end of file
diff --git a/docs/spec/dvrm.md b/docs/spec/dvrm.md
index 7ec791c08..a57fa5ef1 100644
--- a/docs/spec/dvrm.md
+++ b/docs/spec/dvrm.md
@@ -2,17 +2,107 @@
 
 The  chip provides division and remainder functionality, both signed and unsigned.
 
-= Variables
+## Variables
 
 The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
-= Assumptions
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `n` | `DWordHL` | The numerator |
+| `d` | `DWordHL` | The denominator |
+| `signed` | `Bit` | Whether to interpret the input as signed (1) or unsigned (0) integers. |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `q` | `DWordHL` | The quotient; $`n` / `d`$ rounded towards zero. |
+| `r` | `DWordHL` | The remainder; $`n` - `q` `d`$. |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `div_by_zero` | `Bit` | Whether $`d`=0$. |
+| `overflow` | `Bit` | Whether $`n` = -2^63$ and $`d`=-1$. |
+| `abs_r` | `DWordWL` | Absolute value of `r`. |
+| `abs_d` | `DWordWL` | Absolute value of `d`. |
+| `n_sub_r` | `DWordHL` | $`n`-`r`$. |
+| `sign_n_sub_r` | `Bit` | Sign of `n_sub_r`. |
+| `sign_n` | `Bit` | Sign of `n`. |
+| `sign_d` | `Bit` | Sign of `d`. |
+| `sign_q` | `Bit` | Sign of `q`. |
+| `sign_r` | `Bit` | Sign of `r`. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `extended_n` | `QuadHL` | sign-extended value of `n`. |
+| `extended_r` | `QuadHL` | sign-extended value of `r`. |
+| `extension_n_sub_r` | `DWordHL` | sign-extension limbs of `n_sub_r`. |
+| `extended_n_sub_r` | `QuadHL` | sign-extended value of `n_sub_r`. |
+| `carry` | `Bit[4]` | carries for adding `extended_n_sub_r` to `extended_r`, forming `extended_n`. |
+| `μ_sum` | `BaseField` | sum of multiplicities |
+
+**Definition of `extended_n`:**
+```
+extended_n (when iter=[0, 3]) := n[i]
+extended_n (when iter=[4, 7]) := 65535 * sign_n
+```
+
+**Definition of `extended_r`:**
+```
+extended_r (when iter=[0, 3]) := r[i]
+extended_r (when iter=[4, 7]) := 65535 * sign_r
+```
+
+**Definition of `extension_n_sub_r`:**
+```
+extension_n_sub_r := 65535 * sign_n_sub_r
+```
 
-= Constraints From the ISA, we gather five requirements for the `DIV[U][W]` and `REM[U][W]` instructions:
+**Definition of `extended_n_sub_r`:**
+```
+extended_n_sub_r (when iter=[0, 3]) := n_sub_r[i]
+extended_n_sub_r (when iter=[4, 7]) := extension_n_sub_r[i - 4]
+```
+
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] - (extended_n::QuadWL)[i])
+carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] + carry[i - 1] - (extended_n::QuadWL)[i])
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_q + μ_r
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_q` | `BaseField` |  |
+| `μ_r` | `BaseField` |  |
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `DVRM-A1.i` | i ∈ [0, 3] | `IS_HALF[n[i]]` |
+| `DVRM-A2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` |
+| `DVRM-A3` |  | `IS_BIT<signed>` |
+
+## Constraints
+
+From the ISA, we gather five requirements for the `DIV[U][W]` and `REM[U][W]` instructions:
 
 enum.item([ _For both signed and unsigned division, except in the case of_ overflow, _it holds that ``n` = `q` `d` + `r``._ ]), enum.item([ _`DIV` and `DIVU` perform [...] signed and unsigned integer division [...] rounding towards zero._ ]), enum.item([ _For `REM`, the sign of a nonzero [remainder] equals the sign of the [numerator]._ ]), enum.item([ In case of _division-by-zero_, ``r` = `n`` and ``q` = 2^64-1` (unsigned) or ``q` = -1` (signed). ]), enum.item([ In case of _overflow_, ``q` = `n`` and ``r` = 0` ]), where _overflow_ occurs when ``n` = -2^(63)` and ``d` = -1` (and, hence, ``signed` = 1`), and _division-by-zero_ indicates that ``d` = 0`. In the following, we list the constraints associated with the  chip, and explain how these together enforce all five of these requirements.
 
-## R3: Sign remainder equals sign numerator
+### R3: Sign remainder equals sign numerator
 
 We start with R3, which is straightforwardly asserted by constraint [dvrm:c:sign_r_equals_sign_n].
 
@@ -21,7 +111,7 @@ We start with R3, which is straightforwardly asserted by constraint [dvrm:c:sign
 | `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
 | | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
 
-## R2: rounding towards zero
+### R2: rounding towards zero
 
 R2 states that "_[in] signed and unsigned integer division [the quotient is] round[ed] towards zero._" In other words, + the sign of ``n`-`qd`` must match that of `n` (unless ``qd` = `n``), and + `|`n`-`qd`|  < |`d`|` (unless ``d` = 0`).
 
@@ -39,7 +129,7 @@ Focusing on the first statement, we observe that this trivially holds when ``sig
 | `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
 | | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
-## R5: overflow
+### R5: overflow
 
 The ISA requires that ``q` = `n`` and ``r` = 0` in the event of overflow (i.e., when ``n` = -2^63` and ``d` = -1`). We note that the second half of this requirement is already satisfied by R2: since ``d` = -1 != 0`, R2 requires that `|`r`| < |`d`| = 1`, to which ``r` = 0` is the only satisfying value.
 
@@ -55,7 +145,7 @@ In summary, in case of overflow R2 enforces that ``r` = 0`. Moreover it suffices
 
 We highlight [dvrm:c:overflow]. Recall that the `overflow` flag should be set if and only if (i) ``signed` = 1`, (ii) ``n` = `0x80...00``, and (iii) ``d` = `0xFF...FF``. These requirements are equivalent to the state where: $ forall i in [0, 3]:&& 65535 - `d`_i &= 0,\ forall i in [0, 2]:&& `n`_i &= 0,\ && `n`_3 - 2^15 dot `sign_n` &= 0,\ && 1 - `sign_n` &= 0,\ $ where ``signed` = 1` follows from the last equality. The requirement is phrased in this way, because the left-hand sides of the above expressions are `>= 0` by construction. Given that the sum of these expressions does not exceed `2^19` (and thus never wraps in the field), we can now say that the `overflow` bit should be set to `1` if and only if their sum evaluates to `0`. The `ZERO` lookup guarantees this to be the case.
 
-## R1: $#`n` = #`qd` + #`r`$
+### R1: $#`n` = #`qd` + #`r`$
 
 Rewriting R1, we find the constraint `not`overflow` => `n` - `r` = `qd``.
 
@@ -78,7 +168,7 @@ Lastly, observe that ``n` - `r` in (-2^64, 2^64)`, _regardless_ of the value of
 | `DVRM-C14.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
 | `DVRM-C15` |  | `IS_BIT<sign_n_sub_r>` |  |
 
-## R4: division-by-zero
+### R4: division-by-zero
 
 R4 requires that ``q` = 2^64-1` (unsigned) or `-1` (signed) and ``r` = n` when ``d` = 0`. Recalling R1, we see that ``n` = `q` `d` + `r` = `r`` when ``d` = 0`, already enforces the latter. Next, we note that, in two's complement, the _unsigned_ value `2^64-1` and _signed_ value `-1` are both represented by the bit string `0xFFFFFFFF`. Hence, only [dvrm:c:q_if_div_by_zero] is required to completely constrain R4; [dvrm:c:div_by_zero] just ensures the `div_by_zero` flag is set when ``d` = 0`.
 
@@ -88,7 +178,7 @@ R4 requires that ``q` = 2^64-1` (unsigned) or `-1` (signed) and ``r` = n` when `
 | | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
 | `DVRM-C17` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
 
-## Other
+### Other
 
 The following constraints are included to enforce the values of `sign_n`, `sign_r` and `sign_d` are correct.
 
@@ -98,7 +188,7 @@ The following constraints are included to enforce the values of `sign_n`, `sign_
 | `DVRM-C19` | `SIGN<sign_r; r[3], signed>` |
 | `DVRM-C20` | `SIGN<sign_d; d[3], signed>` |
 
-## Output
+### Output
 
 Lastly, this chip contributes the following to the lookup:
 
@@ -107,96 +197,6 @@ Lastly, this chip contributes the following to the lookup:
 | `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
 | `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
 
-= Padding To pad the  table, we use the following data, representing the unsigned division `frac(0, 0, style: "horizontal")`:
+## Padding
 
-## Columns
-
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `n` | `DWordHL` | The numerator |
-| `d` | `DWordHL` | The denominator |
-| `signed` | `Bit` | Whether to interpret the input as signed (1) or unsigned (0) integers. |
-
-### Output
-
-| Name | Type | Description |
-|------|------|-------------|
-| `q` | `DWordHL` | The quotient; $`n` / `d`$ rounded towards zero. |
-| `r` | `DWordHL` | The remainder; $`n` - `q` `d`$. |
-
-### Auxiliary
-
-| Name | Type | Description |
-|------|------|-------------|
-| `div_by_zero` | `Bit` | Whether $`d`=0$. |
-| `overflow` | `Bit` | Whether $`n` = -2^63$ and $`d`=-1$. |
-| `abs_r` | `DWordWL` | Absolute value of `r`. |
-| `abs_d` | `DWordWL` | Absolute value of `d`. |
-| `n_sub_r` | `DWordHL` | $`n`-`r`$. |
-| `sign_n_sub_r` | `Bit` | Sign of `n_sub_r`. |
-| `sign_n` | `Bit` | Sign of `n`. |
-| `sign_d` | `Bit` | Sign of `d`. |
-| `sign_q` | `Bit` | Sign of `q`. |
-| `sign_r` | `Bit` | Sign of `r`. |
-
-### Virtual
-
-| Name | Type | Description |
-|------|------|-------------|
-| `extended_n` | `QuadHL` | sign-extended value of `n`. |
-| `extended_r` | `QuadHL` | sign-extended value of `r`. |
-| `extension_n_sub_r` | `DWordHL` | sign-extension limbs of `n_sub_r`. |
-| `extended_n_sub_r` | `QuadHL` | sign-extended value of `n_sub_r`. |
-| `carry` | `Bit[4]` | carries for adding `extended_n_sub_r` to `extended_r`, forming `extended_n`. |
-| `μ_sum` | `BaseField` | sum of multiplicities |
-
-**Definition of `extended_n`:**
-```
-extended_n (when iter=[0, 3]) := n[i]
-extended_n (when iter=[4, 7]) := 65535 * sign_n
-```
-
-**Definition of `extended_r`:**
-```
-extended_r (when iter=[0, 3]) := r[i]
-extended_r (when iter=[4, 7]) := 65535 * sign_r
-```
-
-**Definition of `extension_n_sub_r`:**
-```
-extension_n_sub_r := 65535 * sign_n_sub_r
-```
-
-**Definition of `extended_n_sub_r`:**
-```
-extended_n_sub_r (when iter=[0, 3]) := n_sub_r[i]
-extended_n_sub_r (when iter=[4, 7]) := extension_n_sub_r[i - 4]
-```
-
-**Definition of `carry`:**
-```
-carry (when iter=0) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] - (extended_n::QuadWL)[i])
-carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] + carry[i - 1] - (extended_n::QuadWL)[i])
-```
-
-**Definition of `μ_sum`:**
-```
-μ_sum := μ_q + μ_r
-```
-
-### Multiplicity
-
-| Name | Type | Description |
-|------|------|-------------|
-| `μ_q` | `BaseField` |  |
-| `μ_r` | `BaseField` |  |
-
-## Assumptions
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `DVRM-A1.i` | i ∈ [0, 3] | `IS_HALF[n[i]]` |
-| `DVRM-A2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` |
-| `DVRM-A3` |  | `IS_BIT<signed>` |
\ No newline at end of file
+To pad the `DVRM` table, we use the following data, representing the unsigned division `0 / 0`:
\ No newline at end of file
diff --git a/docs/spec/ecall.md b/docs/spec/ecall.md
index 934c2e75f..7e90cb363 100644
--- a/docs/spec/ecall.md
+++ b/docs/spec/ecall.md
@@ -1 +1 @@
-# ECALL Chips
\ No newline at end of file
+# ECALL Chips
diff --git a/docs/spec/halt.md b/docs/spec/halt.md
new file mode 100644
index 000000000..1c516c628
--- /dev/null
+++ b/docs/spec/halt.md
@@ -0,0 +1,46 @@
+# HALT Chip
+
+## Variables
+
+The `HALT` chip uses a single variable, `timestamp`, spanning two columns, and leverages the interactions listed under Constraints:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which to halt the program |
+
+## Assumptions
+
+It is assumed the input is range checked:
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `HALT-A1.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+## Constraints
+
+The `HALT` chip: (1) makes sure register `x10` (containing the exit code) equals `0` (`HALT-C2`), (2) writes `0` to all other registers (`HALT-C1`/`HALT-C3`), and (3) sets `pc` equal to `1` (`HALT-C4`). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`; the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `HALT-C1.i` | i ∈ [1, 9] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C2` |  | `MEMW[0::BaseField[8]; 1, (2 * 10)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C3.i` | i ∈ [11, 31] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C4` |  | `MEMW[1, (2 * 255)::DWordWL, ['arr', 1, 0, 0, 0, 0, 0, 0, 0], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+
+*Note*: Observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there.
+
+### Lookup
+
+In this VM, halting is considered equivalent to executing a `sys_exit`. Hence, this chip responds to `ECALL`s with system call number 93.
+
+The HALT chip therefore contributes the following interaction to the lookup-argument:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `HALT-C5` | `ECALL[timestamp, 93::DWordWL]` | -1 |
+
+## Padding
+
+This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
\ No newline at end of file
diff --git a/docs/spec/is_bit.md b/docs/spec/is_bit.md
index eb0070036..5d604ec7f 100644
--- a/docs/spec/is_bit.md
+++ b/docs/spec/is_bit.md
@@ -2,17 +2,9 @@
 
 Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
 
-= Variables The  template operates on  variables:
+## Variables
 
-= Constraints It takes only one constraint to enforce that `X` must be either `0` or `1` whenever ``cond` eq.not 0`:
-
-*Note*: - In case of _unconditional_ template application, `cond` can be dropped from the constraint, simplifying it to ``X` (1- `X`) = 0`. - As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression. This is to make sure that [isbit:c:isbit]'s expression has degree at most 3.
-
-## Correctness argument
-
-If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When ``cond` eq.not 0`, it follows that the statement can only be proven when ``X` (1-`X`) equiv 0 mod p`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either ``X` equiv 0 mod p` or `1-`X` equiv 0 mod p`. Hence, it is proven that when ``cond` eq.not 0`, [isbit:c:isbit] is only satisfied if ``X` in {0, 1}`. 
-
-## Columns
+The  template operates on  variables:
 
 ### Input
 
@@ -28,9 +20,15 @@ If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any val
 
 ## Constraints
 
-### all
+It takes only one constraint to enforce that `X` must be either `0` or `1` whenever `cond ≠ 0`:
 
 | Tag | Description |
 |-----|-------------|
 | `IS_BIT-C1` | `cond` => `X` (1-`X`) = 0 |
-| | _polynomial:_ `cond * X * (1 - X) = 0` |
\ No newline at end of file
+| | _polynomial:_ `cond * X * (1 - X) = 0` |
+
+*Note*: - In case of _unconditional_ template application, `cond` can be dropped from the constraint, simplifying it to `X (1 - X) = 0`. - As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression. This is to make sure that the expression of `IS_BIT-C1` has degree at most 3.
+
+### Correctness argument
+
+If `cond` is `0`, `IS_BIT-C1` is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When `cond ≠ 0`, it follows that the statement can only be proven when `X (1 - X) ≡ 0 mod p`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either `X ≡ 0 mod p` or `1 - X ≡ 0 mod p`. Hence, it is proven that when `cond ≠ 0`, `IS_BIT-C1` is only satisfied if `X ∈ {0, 1}`.
\ No newline at end of file
diff --git a/docs/spec/load.md b/docs/spec/load.md
index 62fce1bf7..51f80997d 100644
--- a/docs/spec/load.md
+++ b/docs/spec/load.md
@@ -2,41 +2,10 @@
 
 The  chip provides functionality to read values from memory and sign-extend them where appropriate. It delegates low-level memory handling to the `MEMW` chip ([memw]).
 
-= Variables
+## Variables
 
 The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
-= Assumptions
-
-= Constraints The chip delegates the actual memory interaction to the `MEMW` chip, and ensures correctness of the requested sign/zero extension. The output `res` is correctly range-checked as long as the memory contents are.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `LOAD-C1` |  | `read2` + `read4` + `read8` => `μ` |  |
-| | | _polynomial:_ `(read2 + read4 + read8) * (1 - μ) = 0` | |
-| `LOAD-C2` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
-| `LOAD-C3` |  | `MSB8[sign_bit; res[0]]` | read1 |
-| `LOAD-C4` |  | `MSB8[sign_bit; res[1]]` | read2 |
-| `LOAD-C5` |  | `MSB8[sign_bit; res[3]]` | read4 |
-| `LOAD-C6.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
-| | | _polynomial:_ `(1 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `LOAD-C7.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
-| | | _polynomial:_ `(1 - read4 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `LOAD-C8` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
-| | | _polynomial:_ `(1 - read2 - read4 - read8) * (res[1] - signed * sign_bit * 255) = 0` | |
-
-The chip contributes the following to the lookup argument.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8, signed]` | -μ |
-
-= Padding
-
-The table can be padded to the next power of two with the following value assignments:
-
-## Columns
-
 ### Input
 
 | Name | Type | Description |
@@ -87,4 +56,33 @@ read1 := μ - read2 - read4 - read8
 | `LOAD-A4` |  | `IS_BIT<read4>` |
 | `LOAD-A5` |  | `IS_BIT<read8>` |
 | `LOAD-A6` |  | `IS_BIT<read2 + read4 + read8>` |
-| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
\ No newline at end of file
+| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+## Constraints
+
+The chip delegates the actual memory interaction to the `MEMW` chip, and ensures correctness of the requested sign/zero extension. The output `res` is correctly range-checked as long as the memory contents are.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `LOAD-C1` |  | `read2` + `read4` + `read8` => `μ` |  |
+| | | _polynomial:_ `(read2 + read4 + read8) * (1 - μ) = 0` | |
+| `LOAD-C2` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
+| `LOAD-C3` |  | `MSB8[sign_bit; res[0]]` | read1 |
+| `LOAD-C4` |  | `MSB8[sign_bit; res[1]]` | read2 |
+| `LOAD-C5` |  | `MSB8[sign_bit; res[3]]` | read4 |
+| `LOAD-C6.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
+| `LOAD-C7.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read4 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
+| `LOAD-C8` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read2 - read4 - read8) * (res[1] - signed * sign_bit * 255) = 0` | |
+
+The chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8, signed]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
diff --git a/docs/spec/logup.md b/docs/spec/logup.md
new file mode 100644
index 000000000..ac4bcb2d2
--- /dev/null
+++ b/docs/spec/logup.md
@@ -0,0 +1,81 @@
+# LogUp Argument
+
+The _LogUp_ proof system conducts a permutation check based on summing partial derivatives. This check ensures that whatever tuple is sent to be "looked-up" by a _source table_ is indeed received in the expected _destination table_.
+
+## Notation
+
+### VM Notation
+
+#### Preliminary notation
+
+- `NN`: the set of natural numbers, i.e., the non-negative integers. - `BaseField`: the base finite field used by the arithmetisation. - `ExtensionField`: a finite extension of `BaseField` of cryptographic size. - `[n]` for `n in NN`: the set of integers `{0, dots, n - 1}`. - `X[i]` for tuple `X`: the `i`-th element of `X`, starting at `0`.
+
+#### Arithmetisation notation
+
+- `numTables in NN`: number of tables `Table_i` in the arithmetisation of the VM. - `TableSet`: set of all tables `Table_i` in the arithmetisation of the VM. - `numColumns_i in NN`: number of _columns_ in table `Table_i` (not the number of variables). - `numRows_i in NN`: number of _rows_ in table `Table_i`.
+
+### Interaction Notation
+
+The `j`-th _interaction_ `Interaction_j` of table `Table_i` is defined by the following tuple:
+
+- `id_(i,j) in FF`: the _type identifier_ of the interaction, usually the identifier of the chip that is constraining the relation expected to hold within the looked-up tuple. - `numElements_(i,j) in NN`: the _length_ of the tuple of elements being looked-up. - `weightFunction_(i,j) : FF^(numColumns_i) -> FF^(numElements_(i,j) + 1)`, with `R |-> arrow(t)_(i,j) || mu_(i,j)`: the _weight function_ that maps a row `R` of table `Table_i` to the looked-up tuple `arrow(t)_(i,j)` and its multiplicity `mu_(i,j) in BaseField`.
+
+## Vanilla LogUp
+
+### Protocol Description
+
++ Prover commits to all traces.
+
++ Verifier samples a random _(global) LogUp challenge_ `logupChallenge in ExtensionField` and a random _fingerprint coefficient_ `fingerprintCoeff in ExtensionField` and sends them to the Prover.
+
++ Prover commits to (i) interaction contribution, (ii) table running sum columns, and (iii) each table's contribution:
+
++ For each table `Table_i`, populate the interaction contribution columns and compute the _table (LogUp) contribution_:
+
++ For each interaction `Interaction_j` of table `Table_i`, initialize an empty _interaction contribution column_ of length `numRows_i`.
+
++ Initialise a _table running sum column_ `S_i in ExtensionField^(numRows_i)` with the first value `S_i [0]` populated according to the constraint choice.
+
++ *Constrain* the first row if required by selected constraint choice.
+
++ For each `j`-th row `R_j in BaseField^(numColumns_i)` of `Table_i`, for `j in [numRows_i - 1]`: + For each `k`-th interaction `Interaction_k` of table `Table_i`: + Compute the _interaction contribution numerator_ `n_(j,k) = mu_(i,k) = weightFunction_(i,k) (R_j)[numElements_(i,k)]`. + If `n_(j,k) eq.not 0`, compute the _interaction contribution denominator_ `d_(j,k) = logupChallenge + fingerprintCoeff dot id_(i,k) + sum_(l = 0)^(numElements_(i,k) - 1) fingerprintCoeff^(l + 2) dot weightFunction_(i,k) (R_j)[l]`. + Save the _interaction contribution_ as `n_(j,k)/d_(j,k) in ExtensionField` in the corresponding interaction contribution column for this interaction. + *Constrain* the interaction contribution column according to the definitions of `n_(j,k)` and `d_(j,k)`.
+
++ Compute the _row contribution_ as the sum `s_(j) = sum_k n_(j,k) / d_(j,k)` and compute the next row's table running sum value `S_i [j+1] = S_i [j] + s_(j)`.
+
++ *Constrain* the transition of the running sum column as indicated by the constraint choice.
+
++ *Constrain* the last row if required by selected constraint choice.
+
++ Batch-commit to every table's interaction contribution columns and running sum columns with the column commitment scheme and commit to the table's overall contribution `S_i [numRows_i - 1]` by sending it in the clear to the verifier.
+
++ Verifier checks that the sum of every table's overall contribution is equal to zero: `sum_i S_i [numRows_i - 1] = 0_ExtensionField`, and delegates the checks of the constraints to the STARK.
+
+### Running Sum Constraint Choices <constraint_choices>
+
+#### Choice 1: transitions looking back
+
+tl;dr: implicit `0_ExtensionField` initial value, explicit final value.
+
++ (*Boundary, first row*) Constrain first row of running sum column to equal the sum of the first row of every interaction contribution column. (This is analogous to an implicit `-1`-th row initialised at `0_ExtensionField`.) + (*Transition, looking back, applied to rows `1, dots, numRows_i - 1`*) For each row _other than the first_, constrain the _current_ running sum value to equal the sum of every current interaction contribution column added to the _previous_ running sum value. + (*Boundary, last row*) Constrain last row of running sum column to equal the claimed table contribution.
+
+Total constraints: 2 boundary + 1 transition over `numRows_i - 1` rows.
+
+#### Choice 2: transitions looking forward
+
+tl;dr: explicit `0_ExtensionField` initial value, implicit final value.
+
++ (*Boundary, first row*) Constrain first row of running sum column to equal `0_ExtensionField`. + (*Transition, looking forward, applied to rows `0, dots, numRows_i - 2`*) For each row _other than the last_, constrain the _next_ running sum value to equal the sum of every current interaction contribution column added to the _current_ running sum value. + (*Boundary, last row*) Constrain last row of running sum column added to sum of last row of every interaction column to equal the claimed table contribution. (That is, the claimed table contribution is implicit in the last row of the table, but not written to last value of running sum column.)
+
+Total constraints: 2 boundary + 1 transition over `numRows_i - 1` rows.
+
+#### Choice 3: circular transitions looking back/forward
+
++ For each row, constrain the _current/next_ (wrapping to first on last if "next") running sum value to equal the sum of every current interaction contribution value added to the _previous/current_ (wrapping to last on first if "previous") running sum value added to claimed table contribution divided by `numRows_i`.
+
+Total constraints: 1 _circular_ transition over `numRows_i` rows.
+
+This single circular constraint checks that each row's contribution `s_(i,j)` is added to the running sum column, either in the current row's cell or in the next row's. In order to avoid boundary constraints, the look-back or peek-forward into the running sum column wraps around the beginning or end of the table.
+
+This alone implies that the difference between the first and last rows' values will be the table's overall real contribution `sum_j s_(i,j)`, which will be incompatible with the circularity of the constraint. Since boundary constraints are avoided, the way to check that `sum_j s_(i,j)` equals the claimed contribution `L_i` is to remove a fraction of `L_i` at each row in such a way that `L_i` is removed completely after summing all `numRows_i` rows; i.e., the constraint subtracts the public term `L_i / numRows_i` from the running sum at every row.
+
+If the expected equality `sum_j s_(i,j) = L_i` holds, then the circularity of the constraint will also hold.
\ No newline at end of file
diff --git a/docs/spec/lt.md b/docs/spec/lt.md
index 7fd0d806f..a043d4db9 100644
--- a/docs/spec/lt.md
+++ b/docs/spec/lt.md
@@ -2,52 +2,10 @@
 
 The  chip constrains an indicator bit for the less-than relation, signed or unsigned.
 
-= Variables
+## Variables
 
 The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
-= Assumptions We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
-
-= Constraints We first constrain that all variables correspond to their definition. For the defining constraint of `lt`, [lt:c:lt], observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
-
-We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a positive one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
-
-+ `dash(A) and B and (a < b)` + `A and dash(B) and (a < b)` + `A and B and (a < b)` + `dash(A) and dash(B) and (a < b)`
-
-The first case is evidently false, while the second case simplifies to `A and dash(B)`. For the third and fourth case, observe that when `A = B`, the `<` relation is preserved by the modular correspondence between `[-2^(31), 2^(31))` and `[0, 2^(64))`. Importantly, this modular correspondence is merely a reinterpretation of the bits or values of `a` and `b`, due to the representation in two's complement. Hence, we can introduce the value `C = `unsigned_lt``, that accurately represents the relation `a < b` when `A = B`.
-
-Combining our three remaining cases, we obtain the boolean formula `A dash(B) or A B C or dash(A) dash(B) C`. Since the cases are disjoint, this can be computed with the binary-valued polynomial `P(A, B, C) = A (1 - B) + A B C + (1 - A) (1 - B) C`.
-
-The polynomial `P` can be simplified to a total degree of two. We claim that the polynomial `Q(A, B, C) = A (1 - B) + A C + (1 - B) C` is, for the purposes of this chip, equivalent to `P`. An exhaustive check shows that `P(A, B, C) != Q(A, B, C)` only for the triple `(A, B, C) = (1, 0, 1)`. This is, however, impossible due to the correctness of `ADD`. In more detail, if we let `s` be the (range-checked) difference `a - b` (so the equivalent of the `lhs_sub_rhs` column), and `x'` denote the most significant word of a variable `x`, we need `c dot 2^32 + a' = b' + s' + `carry[0]``, by the definition of `carry`. However, the left hand side of this is at least `3 dot 2^31`, as `(A, C) = (1, 1)`, and the right hand side is at most `(2^31 - 1) + (2^32 - 1) + 1 = 3 dot 2^31 - 1`. Therefore, we can use `Q` to constrain `lt` when `signed = 1`.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
-| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
-| `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
-| | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
-| `LT-C4` | `IS_HALF[lhs[1]]` | μ |
-| `LT-C5` | `IS_HALF[rhs[1]]` | μ |
-
-And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
-| `LT-C7.i` | i ∈ [0, 3] | `IS_HALF[lhs_sub_rhs[i]]` | μ |
-
-The chip contributes the following to the lookup argument.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
-
-= Padding
-
-The table can be padded to the next power of two with the following value assignments:
-
-## Columns
-
 ### Input
 
 | Name | Type | Description |
@@ -96,8 +54,50 @@ unsigned_lt := carry[1]
 
 ## Assumptions
 
+We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
+
 | Tag | Range | Description |
 |-----|-------|-------------|
 | `LT-A1` |  | `IS_WORD[lhs[0]]` |
 | `LT-A2` |  | `IS_WORD[rhs[0]]` |
-| `LT-A3` |  | `IS_BIT<signed>` |
\ No newline at end of file
+| `LT-A3` |  | `IS_BIT<signed>` |
+
+## Constraints
+
+We first constrain that all variables correspond to their definition. For the defining constraint of `lt` (`LT-C3`), observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
+
+We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a positive one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
+
+(1) `dash(A) and B and (a < b)`, (2) `A and dash(B) and (a < b)`, (3) `A and B and (a < b)`, and (4) `dash(A) and dash(B) and (a < b)`.
+
+The first case is evidently false, while the second case simplifies to `A and dash(B)`. For the third and fourth case, observe that when `A = B`, the `<` relation is preserved by the modular correspondence between `[-2^(63), 2^(63))` and `[0, 2^(64))`. Importantly, this modular correspondence is merely a reinterpretation of the bits or values of `a` and `b`, due to the representation in two's complement. Hence, we can introduce the value `C = unsigned_lt`, which accurately represents the relation `a < b` when `A = B`.
+
+Combining our three remaining cases, we obtain the boolean formula `A dash(B) or A B C or dash(A) dash(B) C`. Since the cases are disjoint, this can be computed with the binary-valued polynomial `P(A, B, C) = A (1 - B) + A B C + (1 - A) (1 - B) C`.
+
+The polynomial `P` can be simplified to a total degree of two. We claim that the polynomial `Q(A, B, C) = A (1 - B) + A C + (1 - B) C` is, for the purposes of this chip, equivalent to `P`. An exhaustive check shows that `P(A, B, C) != Q(A, B, C)` only for the triple `(A, B, C) = (1, 0, 1)`. This is, however, impossible due to the correctness of `ADD`. In more detail, if we let `s` be the (range-checked) difference `a - b` (so the equivalent of the `lhs_sub_rhs` column), and `x'` denote the most significant word of a variable `x`, we need `C dot 2^32 + a' = b' + s' + carry[0]`, by the definition of `carry`. However, the left hand side of this is at least `3 dot 2^31`, as `(A, C) = (1, 1)`, and the right hand side is at most `(2^31 - 1) + (2^32 - 1) + 1 = 3 dot 2^31 - 1`. Therefore, we can use `Q` to constrain `lt` when `signed = 1`.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
+| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
+| `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
+| | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
+| `LT-C4` | `IS_HALF[lhs[1]]` | μ |
+| `LT-C5` | `IS_HALF[rhs[1]]` | μ |
+
+And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
+| `LT-C7.i` | i ∈ [0, 3] | `IS_HALF[lhs_sub_rhs[i]]` | μ |
+
+The chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
diff --git a/docs/spec/memory.md b/docs/spec/memory.md
index a38debcfa..b623c3414 100644
--- a/docs/spec/memory.md
+++ b/docs/spec/memory.md
@@ -6,11 +6,11 @@ While RAM is byte addressed, we do choose to store registers as a `DWordWL` over
 
 On a high level, we ensure memory consistency by an interacting system of reads and writes to a lookup argument, combined with an initialization and finalization scheme. The initialization and finalization schemes together ensure both that (1) the necessary preconditions for the lookup system are satisfied, and (2) the program is executed with the correct initial memory and register contents as specified by the ELF binary and the ISA.
 
-= Memory types
+## Memory types
 
 A commonly made distinction of memory types is that of _read-only_ and _read-write_ memory, with the more restrictive read-only variant often allowing for more efficient solutions (be that regarding prover time, verifier time or proof size) via table lookup proofs. Naturally, the VM’s main memory and registers should be handled by a read-write system as the guest program/environment can issue instructions that write to memory. While there are some subsystems that can be modelled as read-only memory ---e.g., the program memory and instruction decoding--- we opt to integrate these into the proof system via chip interactions (relying on techniques derived from table lookup arguments). As such, we only concern ourselves with read-write memory, moving forward.
 
-= Memory operations
+## Memory operations
 
 Every memory operation has some conceptual attributes that are relevant to mention or discuss:
 
@@ -20,7 +20,7 @@ Since we will have to ensure that memory accesses are temporally consistent with
 
 For reasons of completeness (since temporal integrity as discussed below is a security necessity), we cannot deal with multiple accesses to the same address at identical timestamps. However, if multiple accesses are guaranteed to be independent (that is, to different addresses), they can still share a timestamp --- consider, e.g., the case of reading a word as 4 bytes with the `LW` load instruction. This property is already taken into account where possible in the design of the system. For instance, in the CPU chip, we can ensure that there are at most 3 memory accesses not guaranteed to be independent, so a timestamp granularity of 4 timestamps per cycle is enough. ]
 
-= Permutation argument
+## Permutation argument
 
 We can conceptually organise the state of the memory as a collection of "tokens" that represent tuples `(serif("timestamp"), serif("address"), serif("value"))`, meaning the current value written to `serif("address")` is `serif("value")`, last written to memory at `serif("timestamp")`. Having exactly one value associated with any address will be ensured (see further down in this document) by the interaction of memory initialization, memory finalization, and the effects of memory operations.
 
@@ -32,13 +32,13 @@ Naturally, for a read operation, the _values_ embedded in the consumed and emitt
 
 So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens), this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument ([logup]): consuming a token corresponds to a "receive" and emitting a new token is a "send".
 
-= Temporal integrity
+## Temporal integrity
 
 To ensure temporal integrity, every memory operation needs to be constrained for the newly emitted token to have a strictly greater timestamp than the consumed token. This raises the question of how to represent timestamps and cleanly perform this check, as over a finite field the “less than” relation is ill-defined (though it is common and natural to consider it as the less than relation over the natural lift of the field into the integers). We choose to represent timestamps as machine words, using the existing `LT` chip ([lt]) functionality for comparisons. The full implementation of the timestamp system can be seen in the `timestamp` column of the `CPU` ([cpu]) and `MEMW` chips ([memw]). The `CPU` merely passes in the current timestamp, while `MEMW` can recall the previously written timestamp and constrain the correct sequencing.
 
 - Clean definition of “less-than”, using the already existing `LT` functionality in the ALU - Harder to perform increments, needing extra constraints beyond field arithmetic - But this can be alleviated by providing a precomputed column that has a fixed increment per CPU row ][ - Comparison is more annoying, but can work by: - Decomposition into a machine word and chip interaction with the LT chip - Bit decomposition and comparison constraints - Range-checking the difference to be sufficiently small w.r.t. the field characteristic. - Increments and basic arithmetic operations are cheap ] ]
 
-= Initialization and Finalization
+## Initialization and Finalization
 
 Because the LogUp argument handling token consumption and emission needs to be fully balanced --- every token emitted should be consumed, and vice versa --- we need to have a system to emit the initial tokens and consume the final tokens. This needs to ensure that every address has at most a single initializing emission, and at most one finalizing consumption. Having at most one initialization will, through the correctness of the lookup argument, immediately lead to having at most one correct finalization, and vice versa.
 
@@ -48,7 +48,7 @@ For our chosen scheme (which we refer to as "paged initialization/finalization")
 
 Concretely, each page gets an associated `PAGE` table, consisting of N variables over N columns. For each such table, the `page` variable is instantiated as the constant base address of the page. The `offset` column is preprocessed, which helps the verifier ensure that each page has a single fixed size, but the verifier should still check that no pages overlap and all `page` values are page-aligned.
 
-## Page initialization
+### Page initialization
 
 > **Note:** check whether we need `fini` to be range-checked
 
@@ -58,6 +58,33 @@ We present here a set of constraints on the `PAGE` table that
 
 For zero-initialized pages, `init` can be a constant `0`, and hence doesn't need a column, nor a range check.
 
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `offset` | `RowIndex` | The offset from the page base address. |
+| `init` | `Byte` | The initial value of this address. Can be replaced by a constant zero for zero-initialization |
+| `fini` | `Byte` | The final value this address took |
+| `timestamp` | `DWordWL` | The timestamp at which this address was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `address` | `DWordWL` | Adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table. |
+
+**Definition of `address`:**
+```
+address := page + offset * 1::DWordWL
+```
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `PAGE-C1` | `IS_BYTE[init]` | 1 |
+| `PAGE-C2` | `IS_BYTE[fini]` | 1 |
+| `PAGE-C3` | `memory[0, address, 0::DWordWL, init]` | -1 |
+| `PAGE-C4` | `memory[0, address, timestamp, fini]` | 1 |
+
 We identify a few alternatives that would achieve the desired initialization/finalization functionalities, and consider their respective trade-offs.
 
 _"Free-zero" initialization_
@@ -68,14 +95,14 @@ _Sparse initialization/finalization_
 
 One or more STARK tables (depending on the amount of memory used) consisting of `(address, value)` columns are introduced, where for zero-initialization, `value` can be constant zero. Transition constraints ensure that `address` is strictly increasing, enforcing the "at most once" property; `value` is range-checked to consist of bytes. Similar to paged finalization, an additional `timestamp` column is added, containing the final timestamp each address was accessed. This table is then further used to contribute to the LogUp sum as with any other interactions. - The transition constraints can be chosen to only apply on finalization, as at-most-once finalization is enough to ensure consistency. - Sparse initialization is incompatible with paged finalization, see also the remark under free-zero initialization above. - This would require transition constraints, which currently are not needed elsewhere in the VM design - Additionally, for memory use exceeding the capacity of a single initialization/finalization table, some form of transition constraint between tables is needed - Alternatively, transition constraints could potentially be avoided by more integration into the LogUp system, but this could turn out more costly in practice - This is compatible with the above "free zero" initialization - Since a prover-committed address column is needed (rather than a precomputed one), the number of required columns increases. - As an optimization, the address column could potentially be used simultaneously for initialization and finalization - Sparse initialization/finalization reduces the cost for sparse memory access patterns, where only a few addresses would be accessed per page. Most programs and compilers should however favor a memory locality that makes paged initialization/finalization comparable. ]
 
-## Register initialization/finalization
+### Register initialization/finalization
 
 The initial and final state of registers can be entirely known by the verifier, since the relevant initialization values are either zero, or embedded in the ELF, and the final values can be set to a known value by the `HALT` ecall ([ecall]). As additionally, the number of registers is small, the verifier can directly add the required balancing terms to the LogUp sum.
 
-= Notes and considerations
+## Notes and considerations
 
 - Register reads and writes may interact within a single cycle, so a correct and fixed ordering needs to be ensured - Correctness of initialization and completeness of finalization need to be ensured
 
-= Future topics of interest
+## Future topics of interest
 
 - Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research) - Double check whether IS_BYTE constraints are needed for fini
\ No newline at end of file
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
index b3d47d788..2fc1ce831 100644
--- a/docs/spec/memw.md
+++ b/docs/spec/memw.md
@@ -2,15 +2,85 @@
 
 The  chip is used to read and write memory locations (both RAM and registers) in chunks of 1, 2, 4 or 8 values. It introduces the old value and last-accessed timestamps of memory addresses internally, in order to satisfy the design of the memory argument ([memory]).
 
-= Variables
+## Variables
 
 The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
-= Assumptions
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `is_register` | `Bit` | Whether the address represents a register index |
+| `base_address` | `DWordWL` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
+| `value` | `BaseField[8]` | The values to store in memory. For RAM, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access occurs |
+| `write2` | `Bit` | Whether to write exactly 2 values |
+| `write4` | `Bit` | Whether to write exactly 4 values |
+| `write8` | `Bit` | Whether to write exactly 8 values |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old` | `BaseField[8]` | The old value written at `base_address`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `carry` | `Bit[7]` | Whether `base_address[0] + i + 1` $\geq 2^{32}$ |
+| `old_timestamp` | `DWordWL[8]` | The timestamp at which address `base_address + i` was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `w2` | `Bit` | writing at least 2 bytes |
+| `w4` | `Bit` | writing at least 4 bytes |
+| `address_add` | `DWordWL[7]` | `address_add[i] = base_address + i + 1` |
+| `μ_sum` | `Bit` |  |
+
+**Definition of `w2`:**
+```
+w2 := write2 + write4 + write8
+```
+
+**Definition of `w4`:**
+```
+w4 := write4 + write8
+```
+
+**Definition of `address_add`:**
+```
+address_add[i] := [base_address[0] + i + 1 - 2^32 * carry[i], base_address[1] + carry[i]]
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_read + μ_write
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_read` | `Bit` | Whether we are performing a read (and hence return `old`) |
+| `μ_write` | `Bit` | Whether we are performing a write (and hence do not return `old`) |
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `MEMW-A2` |  | `IS_BIT<write2>` |
+| `MEMW-A3` |  | `IS_BIT<write4>` |
+| `MEMW-A4` |  | `IS_BIT<write8>` |
+| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
+| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
 Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. Still, these properties are necessary for the consistency of the system as a whole, and therefore we document it here, keeping the type information as a reading help.
 
-= Constraints
+## Constraints
 
 Depending on the values of `write2`, `write4` and `write8`, the addresses following `base_address` need to be constructed. Rather than computing these in full (which would require the later addresses to be instantiated), it suffices to know the `carry`: the bit indicating whether ``base_address`_0 + t >= 2^32`, i.e., whether adding `t in [1, 7]` to `base_address` requires a carry from the lower to the upper limb. Note that it is safe for the prover to chose these bits: additions for which this bit is not correctly set will yield an address where either the lower or upper limb is out of bounds. As such, the constructed address will not match any existing memory tokens, which are only initialized for correctly formatted and range-checked doublewords (see [memory]).
 
@@ -51,9 +121,11 @@ This chip contributes the following to the lookup argument:
 | `MEMW-CO18` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_read |
 | `MEMW-CO19` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_write |
 
-= Padding The table can be padded to the next power of two with the following value assignments:
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
 
-= Read-size aligned fast path
+## Read-size aligned fast path
 
 When a memory access happens at an address with proper alignment for its access size (i.e., adding the access size to `base_address`'s lowest limb does not overflow), and all accessed elements were last accessed at the same timestamp, we can instead use the  chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all values of `add_limb_overflow` would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
 
@@ -61,102 +133,142 @@ Further logic remains essentially the same, so we briefly present the relevant t
 
 The  chip only needs  variables, expressed through  columns; it leverages  interactions.
 
-## Padding
+### Input
 
-The table can be padded to the next power of two with the following value assignments:
+| Name | Type | Description |
+|------|------|-------------|
+| `is_register` | `Bit` | Whether the address represents a register index |
+| `base_address` | `DWordWHH` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
+| `value` | `BaseField[8]` | The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `write2` | `Bit` | Whether to write exactly 2 values |
+| `write4` | `Bit` | Whether to write exactly 4 values |
+| `write8` | `Bit` | Whether to write exactly 8 values |
 
-= Register fast-path
+### Output
 
-The  chip provides a fast-path for accessing registers. This fast-path leverages that registers + can be addressed using a `Byte`, rather than a full `DWord`, + are constantly accessed, i.e., ``timestamp` - `old_timestamp`` is small, and + have a fixed access pattern to achieve a footprint that is significantly smaller than both  and .
+| Name | Type | Description |
+|------|------|-------------|
+| `old` | `BaseField[8]` | The old value written at `base_address + i`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
 
-Note: as a result of hard optimization, this chip can only be used for register accesses for which + ``timestamp` - `old_timestamp` in [1, 2^16]`, and + ``timestamp[0]` > `old_timestamp[0]`` If either of these rules does not apply to your access, you should fall back to using `MEMW_A`.
+### Auxiliary
 
-Note moreover that this chip does not guard against misaligned register access faults: to access register with a given `address`, one must provide `2 dot `address`` in the lookup.
+| Name | Type | Description |
+|------|------|-------------|
+| `old_timestamp` | `DWordWL` | The timestamp at which the address was last accessed |
 
-## Variables
+### Virtual
 
-The  chip is comprised of  variables that are expressed using  columns and leverages  interactions:
+| Name | Type | Description |
+|------|------|-------------|
+| `w2` | `Bit` | writing at least 2 bytes |
+| `w4` | `Bit` | writing at least 4 bytes |
+| `μ_sum` | `Bit` |  |
 
-## Assumptions
+**Definition of `w2`:**
+```
+w2 := write2 + write4 + write8
+```
+
+**Definition of `w4`:**
+```
+w4 := write4 + write8
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_read + μ_write
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_read` | `Bit` | Whether we are performing a read (and hence return `old`) |
+| `μ_write` | `Bit` | Whether we are performing a write (and hence do not return `old`) |
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
-| `MEMW-A2` |  | `IS_BIT<write2>` |
-| `MEMW-A3` |  | `IS_BIT<write4>` |
-| `MEMW-A4` |  | `IS_BIT<write8>` |
-| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
-| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+| `MEMW_A-A1.i` | i ∈ [0, 1] | `IS_HALF[base_address[i]]` |
+| `MEMW_A-A2` |  | `IS_WORD[base_address[2]]` |
+| `MEMW_A-A3` |  | `IS_BIT<write2>` |
+| `MEMW_A-A4` |  | `IS_BIT<write4>` |
+| `MEMW_A-A5` |  | `IS_BIT<write8>` |
+| `MEMW_A-A6` |  | `IS_BIT<write2 + write4 + write8>` |
+| `MEMW_A-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
-The following range checks are assumed to be performed/enforced outside of this chip:
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW_A-C1` | `IS_HALF[base_address[0] + write2 + 3 * write4 + 7 * write8]` | μ_sum |
+| `MEMW_A-C2` | `IS_BIT<μ_read>` |  |
+| `MEMW_A-C3` | `IS_BIT<μ_write>` |  |
+| `MEMW_A-C4` | `IS_BIT<μ_sum>` |  |
+| `MEMW_A-C5` | `w2` => `μ_sum` |  |
+| | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
+| `MEMW_A-C6` | `LT[1; old_timestamp, timestamp, 0]` | μ_sum |
 
-## Constraints
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW_A-CM7` |  | `memory[is_register, base_address::DWordWL, old_timestamp, old[0]]` | μ_sum |
+| `MEMW_A-CM8` |  | `memory[is_register, base_address::DWordWL, timestamp, value[0]]` | -μ_sum |
+| `MEMW_A-CM9` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, old_timestamp, old[1]]` | w2 |
+| `MEMW_A-CM10` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, timestamp, value[1]]` | -w2 |
+| `MEMW_A-CM11.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | w4 |
+| `MEMW_A-CM12.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -w4 |
+| `MEMW_A-CM13.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | write8 |
+| `MEMW_A-CM14.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -write8 |
 
-Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the times. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that ``old_timestamp[1]` = `timestamp[1]``;  can be used for accesses where this is not the case.
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW_A-CO15` | `MEMW[old; is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_read |
+| `MEMW_A-CO16` | `MEMW[is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_write |
 
-Verifying that ``timestamp` > `old_timestamp`` now simplifies to verifying that ``timestamp[0]` - `old_timestamp[0]` > 0`. For most accesses, this value will be small enough to fit in a `Half`. This chip thus enforces this by means of the following constraint:
+### Padding
 
-With ``old_timestamp`<`timestamp`` asserted, `old` is read from the register ([regw:c:read_old]) and `val` is written back ([regw:c:write_val]).
+The table can be padded to the next power of two with the following value assignments:
 
-This chip can either just write (``μ_write` = 1`), or both read and write (``μ_read` = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
+## Register fast-path
 
-Lastly, this chip contributes the following interactions to the logup:
+The `MEMW_R` chip provides a fast-path for accessing registers. This fast-path leverages that registers (1) can be addressed using a `Byte`, rather than a full `DWord`, (2) are constantly accessed, i.e., `timestamp - old_timestamp` is small, and (3) have a fixed access pattern, to achieve a footprint that is significantly smaller than that of both `MEMW` and `MEMW_A`.
 
-## Padding
+Note: as a result of hard optimization, this chip can only be used for register accesses for which (1) `timestamp - old_timestamp in [1, 2^16]`, and (2) `timestamp[0] > old_timestamp[0]`. If either of these rules does not apply to your access, you should fall back to using `MEMW_A`.
 
-The table can be padded to the next power of two with the following value assignments:
+Note moreover that this chip does not guard against misaligned register access faults: to access a register with a given `address`, one must provide `2 dot address` in the lookup.
 
-= Notes/optimizations The following ideas may prove to be optimizations for the // chip: - `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes. - For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
+### Variables
 
-## Columns
+The  chip is comprised of  variables that are expressed using  columns and leverages  interactions:
 
 ### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `is_register` | `Bit` | Whether the address represents a register index |
-| `base_address` | `DWordWL` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
-| `value` | `BaseField[8]` | The values to store in memory. For RAM, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
-| `timestamp` | `DWordWL` | The timestamp at which this memory access occurs |
-| `write2` | `Bit` | Whether to write exactly 2 values |
-| `write4` | `Bit` | Whether to write exactly 4 values |
-| `write8` | `Bit` | Whether to write exactly 8 values |
+| `address` | `Byte` | address of the register being accessed |
+| `timestamp` | `DWordWL` | timestamp at which the access takes place |
+| `val` | `DWordWL` | value being written to this register |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `old` | `BaseField[8]` | The old value written at `base_address`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
+| `old` | `DWordWL` | value of this register at `old_timestamp`. |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `carry` | `Bit[7]` | Whether `base_address[0] + i + 1` $>= 2^32$ |
-| `old_timestamp` | `DWordWL[8]` | The timestamp at which address `base_address + i` was last accessed |
+| `old_timestamp_lo` | `Word` | the lower limb of `old_timestamp` |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
-| `w2` | `Bit` | writing at least 2 bytes |
-| `w4` | `Bit` | writing at least 4 bytes |
-| `address_add` | `DWordWL[7]` | `address_add[i] = base_address + i + 1` |
+| `old_timestamp` | `DWordWL` | timestamp at which this register was last accessed |
 | `μ_sum` | `Bit` |  |
 
-**Definition of `w2`:**
-```
-w2 := write2 + write4 + write8
-```
-
-**Definition of `w4`:**
-```
-w4 := write4 + write8
-```
-
-**Definition of `address_add`:**
+**Definition of `old_timestamp`:**
 ```
-address_add := ['arr', ['-', ['+', ['idx', 'base_address', 0], 'i', 1], ['*', ['^', 2, 32], ['idx', 'carry', 'i']]], ['+', ['idx', 'base_address', 1], ['idx', 'carry', 'i']]]
+old_timestamp := [old_timestamp_lo, timestamp[1]]::DWordWL
 ```
 
 **Definition of `μ_sum`:**
@@ -169,4 +281,53 @@ address_add := ['arr', ['-', ['+', ['idx', 'base_address', 0], 'i', 1], ['*', ['
 | Name | Type | Description |
 |------|------|-------------|
 | `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
-| `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
\ No newline at end of file
+| `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
+
+### Assumptions
+
+The following range checks are assumed to be performed/enforced outside of this chip:
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MEMW_R-A1.i` | i ∈ [0, 1] | `IS_WORD[val[i]]` |
+| `MEMW_R-A2.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+### Constraints
+
+Since most registers are accessed frequently, the difference between `timestamp` and `old_timestamp` is small most of the time. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that `old_timestamp[1] = timestamp[1]`; a different chip can be used for accesses where this is not the case.
+
+Verifying that `timestamp > old_timestamp` now simplifies to verifying that `timestamp[0] - old_timestamp[0] > 0`. For most accesses, this difference will be small enough to fit in a `Half`. This chip thus enforces the inequality by means of the following constraint:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW_R-C1` | `IS_HALF[timestamp[0] - old_timestamp[0] - 1]` | μ_sum |
+
+With `old_timestamp < timestamp` asserted, `old` is read from the register (`MEMW_R-C2`) and `val` is written back (`MEMW_R-C3`).
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW_R-C2.i` | i ∈ [0, 1] | `memory[1, ['arr', ['cast', ['+', ['*', 2, 'address'], 'i'], 'Word'], 0], old_timestamp, old[i]]` | μ_sum |
+| `MEMW_R-C3.i` | i ∈ [0, 1] | `memory[1, ['arr', ['cast', ['+', ['*', 2, 'address'], 'i'], 'Word'], 0], timestamp, val[i]]` | -μ_sum |
+
+This chip can either just write (`μ_write = 1`), or both read and write (`μ_read = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
+
+| Tag | Description |
+|-----|-------------|
+| `MEMW_R-C4` | `IS_BIT<μ_read>` |
+| `MEMW_R-C5` | `IS_BIT<μ_write>` |
+| `MEMW_R-C6` | `IS_BIT<μ_sum>` |
+
+Lastly, this chip contributes the following interactions to the logup:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW_R-C7` | `MEMW[['arr', ['idx', 'old', 0], ['idx', 'old', 1], 0, 0, 0, 0, 0, 0]; 1, ['arr', ['cast', ['*', 2, 'address'], 'Word'], 0], ['arr', ['idx', 'val', 0], ['idx', 'val', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_read |
+| `MEMW_R-C8` | `MEMW[1, ['arr', ['cast', ['*', 2, 'address'], 'Word'], 0], ['arr', ['idx', 'val', 0], ['idx', 'val', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_write |
+
+### Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+## Notes/optimizations
+
+The following ideas may prove to be optimizations for the // chip: - `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes. - For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
\ No newline at end of file
diff --git a/docs/spec/mul.md b/docs/spec/mul.md
index fa459cde8..e8fcf2fbc 100644
--- a/docs/spec/mul.md
+++ b/docs/spec/mul.md
@@ -2,66 +2,10 @@
 
 The  chip constrains multiplication, both signed and unsigned, as well as providing access to the low and high halfs of the multiplication result.
 
-= Variables
+## Variables
 
 The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
-`mat(delim: , top; bottom)` }
-
-= Assumptions The following range checks are assumed to be performed/enforced outside of this chip:
-
-= Constraints
-
-## Overview
-
-When `lhs` and `rhs` are _unsigned_ integers, computing their product `mod 2^128` comes down to evaluating $ (sum_(j=0)^3 2^(16j) dot `lhs`_j) dot (sum_(i=0)^3 2^(16i) dot `rhs`_i) mod 2^128. $ If `lhs` and `rhs` are signed instead, the computation remains nearly identical: based on their signs, one must either zero or one-extend `lhs` and `rhs` --- forming `lhs_ext` and `rhs_ext` respectively --- and compute their product `mod 2^128`: $ (sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128. $ where `lhs_ext` and `rhs_ext` are treated as _unsigned_ integers. Note that by setting the extension limbs of `lhs` and/or `rhs` to `0` when the integer is (i) unsigned or (ii) signed and non-negative, this second formula still applies. For the purposes of constraining the multiplication operation, we rewrite this formula as
-
-$ &(sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128 \ &equiv sum_(j=0)^7 sum_(i=0)^7 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(triangle, equiv) sum_(j=0)^7 sum_(i=0)^(7-j) 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(square, equiv) sum_(j=0)^7 sum_(i=j)^(7) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &stackrel(penta, equiv) sum_(i=0)^7 sum_(j=0)^(i) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &equiv sum_(i=0)^3 sum_(k=0)^1 sum_(j=0)^(2i+k) 2^(16(2i+k)) dot `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 \ &equiv sum_(i=0)^3 2^(32i) dot sum_(k=0)^1 2^(16k) dot sum_(j=0)^(2i+k) `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 $ where at step - `triangle` we can ignore `i > 7-j`, since that makes `2^(16(i+j)) equiv 0 mod 2^128`, - `square` we rewrite the second summation such that `i` iterates from `j` to 7, rather than `0` to `7-j`, and - `penta` we swap the sums.
-
-We let `raw_product` capture the second summation in this last formula (see [mul:c:raw_product]). By construction, ``raw_product`_i < 2^51` for all `i in [0, 3]`, far exceeding the 32-bits that fit in a single `Word`-limb. What remains then is to reduce each limb of `raw_product` `mod 2^32`, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
-
-This reduce-and-carry operation is constrained by [mul:c:range_lo]/[mul:c:range_hi] and [mul:c:carry], combined with `carry`'s definition. [mul:c:carry] and `carry`'s definition enforce that $ forall i in [0, 3]: `raw_product`_i + `carry`_(i-1) - `res`_i in { k dot 2^32 | k in [0, 2^20) } $ with ``carry`_(-1) = 0` for simplicity. In other words: ``res`_i equiv `raw_product`_i + `carry`_(i-1) (mod 2^32)`. With [mul:c:range_lo]/[mul:c:range_hi] forcing ``res`_i < 2^32`, ``res`_i` can only assume one value: ``raw_product`_i + `carry`_(i-1) mod 2^32`.
-
-*Note*: one may have observed that [mul:c:carry] requires ``carry`_i in [0, 2^20)`, while no limb of a valid carry value would ever exceed `2^19`. This is indeed the case. However, there is some slack in how tight one has to constrain the `carry` values. In fact, in this situation it suffices to assert that ``carry`_i < frac(p, 2^32, style: "skewed") approx 2^31`, where `p` denotes the field's modulus. Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
-
-## Definitions
-
-We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
-| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
-| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
-| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
-| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
-
-## Product
-
-[mul:c:raw_product] defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
-| | | _polynomial:_ `Σ_k = 0^1 2^(16 * k) * Σ_j = 0^2 * i + k lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
-
-## Lookup
-
-The  chip contributes the following to the lookup:
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
-| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
-
-= Padding
-
-The table can be padded to the next power of two with the following value assignments:
-
-= Notes/optimizations - `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere. - As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
-
-## Columns
-
 ### Input
 
 | Name | Type | Description |
@@ -132,9 +76,65 @@ carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
 | `μ_lo` | `BaseField` |  |
 | `μ_hi` | `BaseField` |  |
 
+`mat(delim: , top; bottom)` }
+
 ## Assumptions
 
+The following range checks are assumed to be performed/enforced outside of this chip:
+
 | Tag | Range | Description |
 |-----|-------|-------------|
 | `MUL-A1.i` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` |
-| `MUL-A2.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
\ No newline at end of file
+| `MUL-A2.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
+
+## Constraints
+
+### Overview
+
+When `lhs` and `rhs` are _unsigned_ integers, computing their product `mod 2^128` comes down to evaluating $ (sum_(j=0)^3 2^(16j) dot `lhs`_j) dot (sum_(i=0)^3 2^(16i) dot `rhs`_i) mod 2^128. $ If `lhs` and `rhs` are signed instead, the computation remains nearly identical: based on their signs, one must either zero or one-extend `lhs` and `rhs` --- forming `lhs_ext` and `rhs_ext` respectively --- and compute their product `mod 2^128`: $ (sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128. $ where `lhs_ext` and `rhs_ext` are treated as _unsigned_ integers. Note that by setting the extension limbs of `lhs` and/or `rhs` to `0` when the integer is (i) unsigned or (ii) signed and non-negative, this second formula still applies. For the purposes of constraining the multiplication operation, we rewrite this formula as
+
+$ &(sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128 \ &equiv sum_(j=0)^7 sum_(i=0)^7 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(triangle, equiv) sum_(j=0)^7 sum_(i=0)^(7-j) 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(square, equiv) sum_(j=0)^7 sum_(i=j)^(7) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &stackrel(penta, equiv) sum_(i=0)^7 sum_(j=0)^(i) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &equiv sum_(i=0)^3 sum_(k=0)^1 sum_(j=0)^(2i+k) 2^(16(2i+k)) dot `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 \ &equiv sum_(i=0)^3 2^(32i) dot sum_(k=0)^1 2^(16k) dot sum_(j=0)^(2i+k) `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 $ where at step - `triangle` we can ignore `i > 7-j`, since that makes `2^(16(i+j)) equiv 0 mod 2^128`, - `square` we rewrite the second summation such that `i` iterates from `j` to 7, rather than `0` to `7-j`, and - `penta` we swap the sums.
+
+We let `raw_product` capture the second summation in this last formula (see `MUL-C6`). By construction, $\texttt{raw\_product}_i < 2^{51}$ for all $i \in [0, 3]$, far exceeding the 32 bits that fit in a single `Word`-limb. What remains then is to reduce each limb of `raw_product` modulo $2^{32}$, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
+
+This reduce-and-carry operation is constrained by `MUL-C3`/`MUL-C4` and `MUL-C5`, combined with `carry`'s definition. `MUL-C5` and `carry`'s definition enforce that $$\forall i \in [0, 3]:\ \texttt{raw\_product}_i + \texttt{carry}_{i-1} - \texttt{res}_i \in \{\, k \cdot 2^{32} \mid k \in [0, 2^{20}) \,\}$$ with $\texttt{carry}_{-1} = 0$ for simplicity. In other words: $\texttt{res}_i \equiv \texttt{raw\_product}_i + \texttt{carry}_{i-1} \pmod{2^{32}}$. With `MUL-C3`/`MUL-C4` forcing $\texttt{res}_i < 2^{32}$, $\texttt{res}_i$ can only assume one value: $(\texttt{raw\_product}_i + \texttt{carry}_{i-1}) \bmod 2^{32}$.
+
+*Note*: one may have observed that `MUL-C5` requires $\texttt{carry}_i \in [0, 2^{20})$, while no limb of a valid carry value would ever exceed $2^{19}$. This is indeed the case. However, there is some slack in how tightly one has to constrain the `carry` values. In fact, in this situation it suffices to assert that $\texttt{carry}_i < p / 2^{32} \approx 2^{31}$, where $p$ denotes the field's modulus. Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
+
+### Definitions
+
+We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
+| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
+| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
+| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
+| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
+
+### Product
+
+`MUL-C6` defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
+| | | _polynomial:_ `Σ_k = 0^1 2^(16 * k) * Σ_j = 0^2 * i + k lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
+
+### Lookup
+
+The `MUL` chip contributes the following to the lookup:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
+| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+## Notes/optimizations
+
+- `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere. - As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
\ No newline at end of file
diff --git a/docs/spec/neg.md b/docs/spec/neg.md
index bd7685bf8..e46420361 100644
--- a/docs/spec/neg.md
+++ b/docs/spec/neg.md
@@ -2,27 +2,9 @@
 
 It requires `cond` to be a bit.
 
-= Variables This template introduces  interaction(s).
+## Variables
 
-= Assumptions
-
-= Constraints We constrain this equality using two constraints:
-
-## Correctness argument
-
-The constraints force the `carry` values to be fixed. Writing `carry`'s definition, we then find that $
-
-= cases( 2^32 - (`x as DWordWL`)_0 & "if" (`x as DWordWL`)_0 != 0, 0 & "if" (`x as DWordWL`)_0 = 0 ),\
-
-2^32 - (`x as DWordWL`)_1 - 1 & "if" `x` != 0, 0 & "if" `x` = 0 $ Clearly, ``neg` = 0` when ``x` = 0` (and `cond` is set). For non-zero `x`, we distinguish two cases. When `(`x as DWordWL`)_0 = 0`, $
-
-&= 2^32 dot `neg`_1 + `neg`_0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1) + 0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1) + (`x as DWordWL`)_0\ &= 2^64 - (2^32 dot (`x as DWordWL`)_1 + (`x as DWordWL`)_0)\ &= 2^64 - `x`\ &equiv -x mod 2^64, $ while when `(`x as DWordWL`)_0 != 0`, $
-
-&= 2^32 dot `neg`_1 + `neg`_0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1 - 1) + (2^32 - (`x as DWordWL`)_0)  \ &= 2^64 - 2^32 dot (`x as DWordWL`)_1 - 2^32 + 2^32 - (`x as DWordWL`)_0  \ &= 2^64 - ((`x as DWordWL`)_0 + 2^32 dot (`x as DWordWL`)_1) \ &= 2^64 - `x`\ &equiv -x mod 2^64 $ when `cond` is set. When `cond` is not set, the two lookups are not executed, allowing `neg` to take any value in either case.
-
-It is worth noting that this construction does _not_ require the limbs of `neg` to be range checked, thus allowing it be represented by the unrangecheckable `DWordWL` rather than a `DWordHL`. The input value `x` is still assumed to be range-checked, however. ]
-
-## Columns
+This template introduces  interaction(s).
 
 ### Input
 
@@ -63,9 +45,25 @@ carry (when iter=1) := 2^-32 * ((x::DWordWL)[1] + neg[1] + carry[0])
 
 ## Constraints
 
-### all
+We constrain this equality using two constraints:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
 | `NEG-C1` | `ZERO[1 - carry[0]; x[0] + x[1]]` | cond |
-| `NEG-C2` | `ZERO[1 - carry[1]; x[0] + x[1] + x[2] + x[3]]` | cond |
\ No newline at end of file
+| `NEG-C2` | `ZERO[1 - carry[1]; x[0] + x[1] + x[2] + x[3]]` | cond |
+
+### Correctness argument
+
+The constraints force the `carry` values to be fixed. Writing out `carry`'s definition, we then find that
+
+$$\texttt{neg}_0 = \begin{cases} 2^{32} - (\texttt{x as DWordWL})_0 & \text{if } (\texttt{x as DWordWL})_0 \neq 0, \\ 0 & \text{if } (\texttt{x as DWordWL})_0 = 0, \end{cases}$$
+
+$$\texttt{neg}_1 = \begin{cases} 2^{32} - (\texttt{x as DWordWL})_1 - \texttt{carry}_0 & \text{if } \texttt{x} \neq 0, \\ 0 & \text{if } \texttt{x} = 0. \end{cases}$$
+
+Here $\texttt{carry}_0 = 1$ exactly when $(\texttt{x as DWordWL})_0 \neq 0$. Clearly, `neg = 0` when `x = 0` (and `cond` is set). For non-zero `x`, we distinguish two cases. When $(\texttt{x as DWordWL})_0 = 0$,
+
+$$\begin{aligned} \texttt{neg} &= 2^{32} \cdot \texttt{neg}_1 + \texttt{neg}_0 \\ &= 2^{32} \cdot (2^{32} - (\texttt{x as DWordWL})_1) + 0 \\ &= 2^{32} \cdot (2^{32} - (\texttt{x as DWordWL})_1) + (\texttt{x as DWordWL})_0 \\ &= 2^{64} - (2^{32} \cdot (\texttt{x as DWordWL})_1 + (\texttt{x as DWordWL})_0) \\ &= 2^{64} - \texttt{x} \\ &\equiv -\texttt{x} \pmod{2^{64}}, \end{aligned}$$ while when $(\texttt{x as DWordWL})_0 \neq 0$,
+
+$$\begin{aligned} \texttt{neg} &= 2^{32} \cdot \texttt{neg}_1 + \texttt{neg}_0 \\ &= 2^{32} \cdot (2^{32} - (\texttt{x as DWordWL})_1 - 1) + (2^{32} - (\texttt{x as DWordWL})_0) \\ &= 2^{64} - 2^{32} \cdot (\texttt{x as DWordWL})_1 - 2^{32} + 2^{32} - (\texttt{x as DWordWL})_0 \\ &= 2^{64} - ((\texttt{x as DWordWL})_0 + 2^{32} \cdot (\texttt{x as DWordWL})_1) \\ &= 2^{64} - \texttt{x} \\ &\equiv -\texttt{x} \pmod{2^{64}} \end{aligned}$$ when `cond` is set. When `cond` is not set, the two lookups are not executed, allowing `neg` to take any value in either case.
+
+It is worth noting that this construction does _not_ require the limbs of `neg` to be range checked, thus allowing it to be represented by the unrangecheckable `DWordWL` rather than a `DWordHL`. The input value `x` is still assumed to be range-checked, however.
\ No newline at end of file
diff --git a/docs/spec/sha256.md b/docs/spec/sha256.md
new file mode 100644
index 000000000..61b756060
--- /dev/null
+++ b/docs/spec/sha256.md
@@ -0,0 +1,479 @@
+# SHA256 Accelerator
+
+The following chips constitute an accelerator for the SHA256 compression function; other aspects of SHA256 hashing (such as repeated compression invocation, input padding and state initialization) fall outside the scope of this accelerator.
+
+The base `SHA256` chip provides the `ECALL` interface, interacts with memory and then delegates to the `SHA256MSGSCHED` and `SHA256ROUND` chips to perform the message schedule and the compression rounds, respectively. The `SHA256_M` interaction signature is used to represent the output of the message schedule. The `SHA256_K` interaction signature is used to represent the `k` constants. It could either be instantiated with a (short) precomputed table, or through hardcoded LogUp contributions in this chip. For this exposition, we choose the former option, and present a table further below. Additionally, we introduce a `ROTXOR` chip to perform the common action of computing the XOR of three rotations (or shifts) of a word.
+
+Most of the structure and variable naming follows the pseudocode on the Wikipedia page for SHA-2.
+
+## `SHA256` chip
+
+### Columns
+
+The `SHA256` chip leverages the following variables:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | Timestamp at which the ECALL is invoked. Used as unique identifier for this invocation. |
+| `h` | `Byte[32]` | The state of the hash function. |
+| `h_addr` | `DWordHL[4]` | The addresses of the doublewords of `h` |
+| `m` | `Byte[64]` | The input chunk. |
+| `m_addr` | `DWordHL[8]` | The addresses of the doublewords of `m` |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `out` | `Byte[32]` | The new state. |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `last_round_out` | `Word[8]` | The output from the last compression round |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+### Constraints
+
+The first responsibility of the chip is to read the current state and message chunk from memory, passed as arguments through pointers. Since the memory ranges could overlap, we read the chunk first (in `SHA256-C4`, at timestamp `timestamp`), before reading and writing the state (in `SHA256-C8`, at timestamp `timestamp + 1`). The addresses containing the state and the current chunk are passed in as arguments `A0 = x10` and `A1 = x11`, respectively. Note that, following the SHA256 spec, the state and the chunks are read and written as big-endian.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256-C1` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C2.i` | i ∈ [0, 7], j ∈ [0, 3] | `IS_HALF[m_addr[i][j]]` | μ |
+| `SHA256-C3.i` | i ∈ [1, 7] | `ADD<m_addr[i]::DWordWL; m_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
+| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]]; 0, m_addr[i]::DWordWL, ['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]], timestamp, 0, 0, 1]` | μ |
+| `SHA256-C5` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C6.i` | i ∈ [0, 3], j ∈ [0, 3] | `IS_HALF[h_addr[i][j]]` | μ |
+| `SHA256-C7.i` | i ∈ [1, 3] | `ADD<h_addr[i]::DWordWL; h_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
+| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[['arr', ['idx', 'h', ['+', ['*', 8, 'i'], 3]], ['idx', 'h', ['+', ['*', 8, 'i'], 2]], ['idx', 'h', ['+', ['*', 8, 'i'], 1]], ['idx', 'h', ['+', ['*', 8, 'i'], 0]], ['idx', 'h', ['+', ['*', 8, 'i'], 7]], ['idx', 'h', ['+', ['*', 8, 'i'], 6]], ['idx', 'h', ['+', ['*', 8, 'i'], 5]], ['idx', 'h', ['+', ['*', 8, 'i'], 4]]]; 0, h_addr[i]::DWordWL, ['arr', ['idx', 'out', ['+', ['*', 8, 'i'], 3]], ['idx', 'out', ['+', ['*', 8, 'i'], 2]], ['idx', 'out', ['+', ['*', 8, 'i'], 1]], ['idx', 'out', ['+', ['*', 8, 'i'], 0]], ['idx', 'out', ['+', ['*', 8, 'i'], 7]], ['idx', 'out', ['+', ['*', 8, 'i'], 6]], ['idx', 'out', ['+', ['*', 8, 'i'], 5]], ['idx', 'out', ['+', ['*', 8, 'i'], 4]]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
+
+Then we prepare the message schedule, by emitting the input chunk with multiplicities corresponding to the number of times it will be read during a compression evaluation. The `SHA256MSGSCHED` chip itself is implicitly invoked by itself and `SHA256ROUND`, setting the `amount` column appropriately for the number of times the `w` value is required.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256-C9.i` | i ∈ [0, 0] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -2 * μ |
+| `SHA256-C10.i` | i ∈ [1, 8] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -3 * μ |
+| `SHA256-C11.i` | i ∈ [9, 13] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -4 * μ |
+| `SHA256-C12.i` | i ∈ [14, 15] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -5 * μ |
+
+And finally, we provide the boundaries for the `SHA256ROUND` chip and the final addition of the compression to the old state. Observe that we embed the addition into the upper 32 bits of a double word, in order to satisfy and use the `ADD` chip.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256-C13` |  | `SHA256ROUND[timestamp, ['arr', ['+', ['*', ['^', 2, 0], ['idx', 'h', 3]], ['*', ['^', 2, 8], ['idx', 'h', 2]], ['*', ['^', 2, 16], ['idx', 'h', 1]], ['*', ['^', 2, 24], ['idx', 'h', 0]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 7]], ['*', ['^', 2, 8], ['idx', 'h', 6]], ['*', ['^', 2, 16], ['idx', 'h', 5]], ['*', ['^', 2, 24], ['idx', 'h', 4]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 11]], ['*', ['^', 2, 8], ['idx', 'h', 10]], ['*', ['^', 2, 16], ['idx', 'h', 9]], ['*', ['^', 2, 24], ['idx', 'h', 8]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 15]], ['*', ['^', 2, 8], ['idx', 'h', 14]], ['*', ['^', 2, 16], ['idx', 'h', 13]], ['*', ['^', 2, 24], ['idx', 'h', 12]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 19]], ['*', ['^', 2, 8], ['idx', 'h', 18]], ['*', ['^', 2, 16], ['idx', 'h', 17]], ['*', ['^', 2, 24], ['idx', 'h', 16]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 23]], ['*', ['^', 2, 8], ['idx', 'h', 22]], ['*', ['^', 2, 16], ['idx', 'h', 21]], ['*', ['^', 2, 24], ['idx', 'h', 20]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 27]], ['*', ['^', 2, 8], ['idx', 'h', 26]], ['*', ['^', 2, 16], ['idx', 'h', 25]], ['*', ['^', 2, 24], ['idx', 'h', 24]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 31]], ['*', ['^', 2, 8], ['idx', 'h', 30]], ['*', ['^', 2, 16], ['idx', 'h', 29]], ['*', ['^', 2, 24], ['idx', 'h', 28]]]], 0]` | μ |
+| `SHA256-C14` |  | `SHA256ROUND[timestamp, last_round_out, 64]` | -μ |
+| `SHA256-C15.i` | i ∈ [0, 31] | `IS_BYTE[out[i]]` | μ |
+| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'out', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'out', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'out', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'out', ['+', ['*', 4, 'i'], 0]]]]]; ['arr', 0, ['idx', 'last_round_out', 'i']], ['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'h', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'h', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'h', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'h', ['+', ['*', 4, 'i'], 0]]]]]>` |  |
+
+In this VM, we assign syscall number -1 to the `SHA256` accelerator. The chip therefore contributes the following interaction to the lookup-argument:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHA256-C17` | `IS_BIT<μ>` |  |
+| `SHA256-C18` | `ECALL[timestamp, ['arr', ['-', ['^', 2, 32], 1], ['-', ['^', 2, 32], 1]]]` | -μ |
+
+### Padding
+
+## `SHA256MSGSCHED` chip
+
+### Columns
+
+The `SHA256MSGSCHED` chip leverages the following variables:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | The timestamp/identifier for this execution of the message schedule |
+| `index` | `BaseField` | The index of the output word |
+| `amount` | `BaseField` | The multiplicity with which to output the resulting word |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `out` | `WordHL` | The output, `w[index]` |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `back2` | `Word` | `w[index - 2]` |
+| `back7` | `Word` | `w[index - 7]` |
+| `back15` | `Word` | `w[index - 15]` |
+| `back16` | `Word` | `w[index - 16]` |
+| `s0` | `Word` | $(\texttt{back15} \ggg 7) \oplus (\texttt{back15} \ggg 18) \oplus (\texttt{back15} \gg 3)$ |
+| `s1` | `Word` | $(\texttt{back2} \ggg 17) \oplus (\texttt{back2} \ggg 19) \oplus (\texttt{back2} \gg 10)$ |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `carry` | `Byte` | The carry of computing `out` |
+
+**Definition of `carry`:**
+```
+carry := 2^-32 * (back16 + s0 + back7 + s1 - out::Word)
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+### Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `SHA256MSGSCHED-A1` |  | `IS_WORD[SHA256_M[timestamp, i]]` for $0 \le i <$ `index` |
+
+### Constraints
+
+First, we gather the dependencies from earlier in the message schedule.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHA256MSGSCHED-C1` | `IS_BYTE[index - 16]` | μ |
+| `SHA256MSGSCHED-C2` | `SHA256_M[back2; timestamp, index - 2]` | μ |
+| `SHA256MSGSCHED-C3` | `SHA256_M[back7; timestamp, index - 7]` | μ |
+| `SHA256MSGSCHED-C4` | `SHA256_M[back15; timestamp, index - 15]` | μ |
+| `SHA256MSGSCHED-C5` | `SHA256_M[back16; timestamp, index - 16]` | μ |
+
+Then, we calculate the result. It suffices to check that the carry of adding four range-checked words into a range-checked word is not too big, following the logic from the `ADD` chip. In this case, using the `IS_BYTE` constraint allows us to add multiple words together at the same time, without needing to store and range-check intermediate results.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256MSGSCHED-C6` |  | `ROTXOR[s0; back15, 2, 11, 3, 0]` | μ |
+| `SHA256MSGSCHED-C7` |  | `ROTXOR[s1; back2, 3, 2, 10, 0]` | μ |
+| `SHA256MSGSCHED-C8` |  | `IS_BYTE[carry]` | μ |
+| `SHA256MSGSCHED-C9.i` | i ∈ [0, 1] | `IS_HALF[out[i]]` | μ |
+
+Finally, we contribute to the LogUp.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHA256MSGSCHED-C10` | `IS_BIT<μ>` |  |
+| `SHA256MSGSCHED-C11` | `μ` = 0 => `amount` = 0 |  |
+| | _polynomial:_ `(1 - μ) * amount = 0` | |
+| `SHA256MSGSCHED-C12` | `SHA256_M[out::Word; timestamp, index]` | -amount |
+
+## `SHA256ROUND` chip
+
+### Columns
+
+The `SHA256ROUND` chip leverages the following variables:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | The timestamp/identifier for this execution of the round function |
+| `a` | `WordBL` | State element |
+| `b` | `WordBL` | State element |
+| `c` | `WordBL` | State element |
+| `d` | `Word` | State element |
+| `e` | `WordBL` | State element |
+| `f` | `WordBL` | State element |
+| `g` | `WordBL` | State element |
+| `h` | `Word` | State element |
+| `index` | `BaseField` | The round number/index |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `out_a` | `WordHL` | $`temp1` + `temp2`$ |
+| `out_e` | `WordHL` | $`d` + `temp1`$ |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `a_and_b` | `WordBL` | $`a` class("binary", amp) `b`$. Part of `maj` |
+| `a_xor_b` | `WordBL` | $`a` xor `b`$. Part of `maj` |
+| `c_and_a_xor_b` | `WordBL` | $`c` class("binary", amp) (`a` xor `b`)$. Part of `maj` |
+| `e_and_f` | `WordBL` | $`e` class("binary", amp) `f`$. Part of `ch` |
+| `not_e_and_g` | `WordBL` | $(not `e`) class("binary", amp) `g`$. Part of `ch` |
+| `kval` | `Word` | `k[index]` |
+| `S0` | `Word` | Transformation of `a` |
+| `S1` | `Word` | Transformation of `e` |
+| `wval` | `Word` | `w[index]` |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `carry_a` | `Byte` | The carry from `out_a` |
+| `carry_e` | `Byte` | The carry from `out_e` |
+| `ch` | `Word` | ch value |
+| `maj` | `Word` | maj value |
+| `temp1` | `BaseField` | `temp1` value |
+| `temp2` | `BaseField` | `temp2` value |
+
+**Definition of `carry_a`:**
+```
+carry_a := 2^-32 * (temp1 + temp2 - out_a::Word)
+```
+
+**Definition of `carry_e`:**
+```
+carry_e := 2^-32 * (d + temp1 - out_e::Word)
+```
+
+**Definition of `ch`:**
+```
+ch := e_and_f::Word + not_e_and_g::Word
+```
+
+**Definition of `maj`:**
+```
+maj := a_and_b::Word + c_and_a_xor_b::Word
+```
+
+**Definition of `temp1`:**
+```
+temp1 := h + S1 + ch + kval + wval
+```
+
+**Definition of `temp2`:**
+```
+temp2 := S0 + maj
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+### Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `SHA256ROUND-A1` |  | All state values are valid words |
+
+### Constraints
+
+First, we compute the necessary intermediate values.
+
+To compute `maj`, observe that ` (a bitand b) xor (a bitand c) xor (b bitand c) = (a bitand b) xor (c bitand (a xor b)), ` by distribution. Additionally, since for this form, `(a bitand b)` and `(a xor b)` are disjoint, so are `(a bitand b)` and `(c bitand (a xor b))`, and hence we can replace that top-level XOR with a field addition to compute `(a bitand b) + (c bitand (a xor b))`, needing fewer intermediate columns. Similarly, `ch` can be written as `(e bitand f) + ((2^32 - 1 - e) bitand g)`.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256ROUND-C1.i` | i ∈ [0, 3] | `AND_BYTE[a_and_b[i]; a[i], b[i]]` | μ |
+| `SHA256ROUND-C2.i` | i ∈ [0, 3] | `XOR_BYTE[a_xor_b[i]; a[i], b[i]]` | μ |
+| `SHA256ROUND-C3.i` | i ∈ [0, 3] | `AND_BYTE[c_and_a_xor_b[i]; c[i], a_xor_b[i]]` | μ |
+| `SHA256ROUND-C4.i` | i ∈ [0, 3] | `AND_BYTE[e_and_f[i]; e[i], f[i]]` | μ |
+| `SHA256ROUND-C5.i` | i ∈ [0, 3] | `AND_BYTE[not_e_and_g[i]; 255 - e[i], g[i]]` | μ |
+| `SHA256ROUND-C6` |  | `SHA256_K[kval; index]` | μ |
+| `SHA256ROUND-C7` |  | `SHA256_M[wval; timestamp, index]` | μ |
+| `SHA256ROUND-C8` |  | `ROTXOR[S0; a::Word, 6, 9, 2, 1]` | μ |
+| `SHA256ROUND-C9` |  | `ROTXOR[S1; e::Word, 9, 14, 6, 1]` | μ |
+
+Then we constrain the addition for the new state, constraining additions with the same `IS_BYTE` trick as before.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256ROUND-C10.i` | i ∈ [0, 1] | `IS_HALF[out_a[i]]` | μ |
+| `SHA256ROUND-C11` |  | `IS_BYTE[carry_a]` | μ |
+| `SHA256ROUND-C12.i` | i ∈ [0, 1] | `IS_HALF[out_e[i]]` | μ |
+| `SHA256ROUND-C13` |  | `IS_BYTE[carry_e]` | μ |
+
+Finally, we chain the rounds together through the interactions.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHA256ROUND-C14` | `SHA256ROUND[timestamp, [a::Word, b::Word, c::Word, d, e::Word, f::Word, g::Word, h], index]` | -μ |
+| `SHA256ROUND-C15` | `SHA256ROUND[timestamp, [out_a::Word, a::Word, b::Word, c::Word, out_e::Word, e::Word, f::Word, g::Word], index + 1]` | μ |
+
+### Padding
+
+## `ROTXOR` chip
+
+This chip takes as input `a`, `r0`, `r1`, `r2` (4-bit values) and a bit `last_rot` to compute $ cases( (a >>> (16 + r_0)) xor (a >>> (16 + r_0 - r_1)) xor (a >>> r_2) quad "if" `last_rot`, (a >>> (16 + r_0)) xor (a >>> (16 + r_0 - r_1)) xor (a >> r_2) quad "if" `!last_rot` ), $ where we let `>>>` denote right rotation and `>>` logical shift right. We choose this representation so that all shift amounts required fit into 4 bits, making the usage of `HWSL` more straightforward and avoiding extra columns to represent more bits.
+
+### Columns
+
+The `ROTXOR` chip leverages the following variables:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `a` | `WordHL` | The input value |
+| `r0` | `Byte` | The first amount of rotation, low nibble |
+| `r1` | `Byte` | The second amount of rotation, low nibble |
+| `r2` | `Byte` | The third amount of rotation, low nibble |
+| `last_rot` | `Bit` | Whether the rotation by `r2` is a rotation (1) or just a shift (0) |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `out` | `WordBL` | The output |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `a0_left` | `WordHL` | `a << (16 - r0)` |
+| `a0_right` | `WordHL` | `a >> r0` |
+| `a1_left` | `WordHL` | `a0 << r1` |
+| `a1_right` | `WordHL` | `a0 >> (16 - r1)` |
+| `a2_left` | `WordHL` | `a << (16 - r2)` |
+| `a2_right` | `WordHL` | `a >> r2` |
+| `a0` | `WordBL` | `a >>> (16 + r0)` |
+| `a1` | `WordBL` | `a >>> (16 + r0 - r1)` (which is `a0 <<< r1`) |
+| `a2` | `WordBL` | `a >>> r2` or `a >> r2` |
+| `a01` | `WordBL` | $a_0 xor a_1$ |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` |  |
+
+### Assumptions
+
+Range checking for all elements is inherited from the bitwise lookups. We can safely assume that no `r_i` will be zero, and avoid extra work due to right rotation needing `16 - shift` as arguments to the `HWSL` interactions.
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `ROTXOR-A1` |  | $#`r0`, #`r1`, #`r2` in [1, 15]$ |
+
+### Constraints
+
+We first compute all rotations (or shifts) of `a`. `a1` is computed as a left rotation of `a0`, in order to not need additional columns to represent the full right-rotation amounts.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `ROTXOR-C1.i` | i ∈ [0, 1] | `HWSL[[a0_left[i], a0_right[i]]; a[i], 16 - r0]` | μ |
+| `ROTXOR-C2.i` | i ∈ [0, 1] | `HWSL[[a1_left[i], a1_right[i]]; (a0::WordHL)[i], r1]` | μ |
+| `ROTXOR-C3.i` | i ∈ [0, 1] | `HWSL[[a2_left[i], a2_right[i]]; a[i], 16 - r2]` | μ |
+| `ROTXOR-C4.i` | i ∈ [0, 1] | `a0[i]` = `a0_left[i]` + `a0_right[1 - i]` |  |
+| | | _polynomial:_ `(a0::WordHL)[i] - a0_left[i] - a0_right[1 - i] = 0` | |
+| `ROTXOR-C5.i` | i ∈ [0, 1] | `a1[i]` = `a1_left[i]` + `a1_right[1 - i]` |  |
+| | | _polynomial:_ `(a1::WordHL)[i] - a1_left[i] - a1_right[1 - i] = 0` | |
+| `ROTXOR-C6` |  | `a2[0]` = `a2_left[1]` + `a2_right[0]` |  |
+| | | _polynomial:_ `(a2::WordHL)[0] - a2_left[1] - a2_right[0] = 0` | |
+| `ROTXOR-C7` |  | `a2[1]` = `last_rot` dot `a2_left[0]` + `a2_right[1]` |  |
+| | | _polynomial:_ `(a2::WordHL)[1] - last_rot * a2_left[0] - a2_right[1] = 0` | |
+
+Then the bitwise XOR of the results.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `ROTXOR-C8.i` | i ∈ [0, 3] | `XOR_BYTE[a01[i]; a0[i], a1[i]]` | μ |
+| `ROTXOR-C9.i` | i ∈ [0, 3] | `XOR_BYTE[out[i]; a01[i], a2[i]]` | μ |
+
+And finally contribute to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `ROTXOR-C10` | `ROTXOR[out::Word; a::Word, r0, r1, r2, last_rot]` | -μ |
+
+### Padding
+
+## Constant lookup
+
+As mentioned, we provide the round constants through a short precomputed lookup table: `SHA256_K`.
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `index` | `BaseField` |  |
+| `K` | `Word` |  |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` |  |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHA256_K-C1` | `SHA256_K[K; index]` | -μ |
+
+## Notes/optimizations
+
+- This could instead be designed following the [RISC-V Crypto Scalar extension `Zknh`], for wider compatibility, but this design is likely to be more efficient. It is still possible, if desired, to expose `ROTXOR` (or a selection of parameter instantiations thereof) as implementation for these primitives. - The message schedule could be exposed as its own ECALL instead, but the direct integration leads to better efficiency. - Some of these chips could be made narrower, at the cost of introducing some extra lookups and extra tables to compute and store intermediate results.
+
+## Columns
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | Timestamp at which the ECALL is invoked. Used as unique identifier for this invocation. |
+| `h` | `Byte[32]` | The state of the hash function. |
+| `h_addr` | `DWordHL[4]` | The addresses of the doublewords of `h` |
+| `m` | `Byte[64]` | The input chunk. |
+| `m_addr` | `DWordHL[8]` | The addresses of the doublewords of `m` |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `out` | `Byte[32]` | The new state. |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `last_round_out` | `Word[8]` | The output from the last compression round |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+## Constraints
+
+### memory
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256-C1` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C2.i` | i ∈ [0, 7], j ∈ [0, 3] | `IS_HALF[m_addr[i][j]]` | μ |
+| `SHA256-C3.i` | i ∈ [1, 7] | `ADD<m_addr[i]::DWordWL; m_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
+| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]]; 0, m_addr[i]::DWordWL, ['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]], timestamp, 0, 0, 1]` | μ |
+| `SHA256-C5` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C6.i` | i ∈ [0, 3], j ∈ [0, 3] | `IS_HALF[h_addr[i][j]]` | μ |
+| `SHA256-C7.i` | i ∈ [1, 3] | `ADD<h_addr[i]::DWordWL; h_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
+| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[['arr', ['idx', 'h', ['+', ['*', 8, 'i'], 3]], ['idx', 'h', ['+', ['*', 8, 'i'], 2]], ['idx', 'h', ['+', ['*', 8, 'i'], 1]], ['idx', 'h', ['+', ['*', 8, 'i'], 0]], ['idx', 'h', ['+', ['*', 8, 'i'], 7]], ['idx', 'h', ['+', ['*', 8, 'i'], 6]], ['idx', 'h', ['+', ['*', 8, 'i'], 5]], ['idx', 'h', ['+', ['*', 8, 'i'], 4]]]; 0, h_addr[i]::DWordWL, ['arr', ['idx', 'out', ['+', ['*', 8, 'i'], 3]], ['idx', 'out', ['+', ['*', 8, 'i'], 2]], ['idx', 'out', ['+', ['*', 8, 'i'], 1]], ['idx', 'out', ['+', ['*', 8, 'i'], 0]], ['idx', 'out', ['+', ['*', 8, 'i'], 7]], ['idx', 'out', ['+', ['*', 8, 'i'], 6]], ['idx', 'out', ['+', ['*', 8, 'i'], 5]], ['idx', 'out', ['+', ['*', 8, 'i'], 4]]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
+
+### sched
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256-C9.i` | i ∈ [0, 0] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -2 * μ |
+| `SHA256-C10.i` | i ∈ [1, 8] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -3 * μ |
+| `SHA256-C11.i` | i ∈ [9, 13] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -4 * μ |
+| `SHA256-C12.i` | i ∈ [14, 15] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -5 * μ |
+
+### compress
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256-C13` |  | `SHA256ROUND[timestamp, ['arr', ['+', ['*', ['^', 2, 0], ['idx', 'h', 3]], ['*', ['^', 2, 8], ['idx', 'h', 2]], ['*', ['^', 2, 16], ['idx', 'h', 1]], ['*', ['^', 2, 24], ['idx', 'h', 0]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 7]], ['*', ['^', 2, 8], ['idx', 'h', 6]], ['*', ['^', 2, 16], ['idx', 'h', 5]], ['*', ['^', 2, 24], ['idx', 'h', 4]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 11]], ['*', ['^', 2, 8], ['idx', 'h', 10]], ['*', ['^', 2, 16], ['idx', 'h', 9]], ['*', ['^', 2, 24], ['idx', 'h', 8]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 15]], ['*', ['^', 2, 8], ['idx', 'h', 14]], ['*', ['^', 2, 16], ['idx', 'h', 13]], ['*', ['^', 2, 24], ['idx', 'h', 12]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 19]], ['*', ['^', 2, 8], ['idx', 'h', 18]], ['*', ['^', 2, 16], ['idx', 'h', 17]], ['*', ['^', 2, 24], ['idx', 'h', 16]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 23]], ['*', ['^', 2, 8], ['idx', 'h', 22]], ['*', ['^', 2, 16], ['idx', 'h', 21]], ['*', ['^', 2, 24], ['idx', 'h', 20]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 27]], ['*', ['^', 2, 8], ['idx', 'h', 26]], ['*', ['^', 2, 16], ['idx', 'h', 25]], ['*', ['^', 2, 24], ['idx', 'h', 24]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 31]], ['*', ['^', 2, 8], ['idx', 'h', 30]], ['*', ['^', 2, 16], ['idx', 'h', 29]], ['*', ['^', 2, 24], ['idx', 'h', 28]]]], 0]` | μ |
+| `SHA256-C14` |  | `SHA256ROUND[timestamp, last_round_out, 64]` | -μ |
+| `SHA256-C15.i` | i ∈ [0, 31] | `IS_BYTE[out[i]]` | μ |
+| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'out', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'out', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'out', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'out', ['+', ['*', 4, 'i'], 0]]]]]; ['arr', 0, ['idx', 'last_round_out', 'i']], ['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'h', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'h', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'h', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'h', ['+', ['*', 4, 'i'], 0]]]]]>` |  |
+
+### lookup
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHA256-C17` | `IS_BIT<μ>` |  |
+| `SHA256-C18` | `ECALL[timestamp, ['arr', ['-', ['^', 2, 32], 1], ['-', ['^', 2, 32], 1]]]` | -μ |
\ No newline at end of file
diff --git a/docs/spec/shift.md b/docs/spec/shift.md
index 0b52509ec..53c46ca2a 100644
--- a/docs/spec/shift.md
+++ b/docs/spec/shift.md
@@ -6,103 +6,10 @@ $ $
 
 $ Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.
 
-= Variables
+## Variables
 
 The `SHIFT` chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
-= Assumptions
-
-= Explanation This chip has a rather complex design as a result of designing it to fit in as few columns possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
-
-The chip's design revolves around a two-phase shifting process: 1. shift `in` by `x := `shift` mod 16` bits, 2. shift that result by `(`shift`-x) mod 64` (or `mod 32` if ` `word_instr` = 1`). The intermediate value representing the state between the two phases is stored in the scratch variables `X` and `Y`. The definition of `shifted` describes how one can combine the `X`, `Y` and `extension` variables to construct the output value as described using `Half`-limbs. The output variable `out` is equivalent to `shifted`, but expressed using `Word`-limbs.
-
-In the following, we cover how these two phases were designed to complement one another. Here, we start with discussing the _logical_ left/right shift operations only; the modifications required to compute the _arithmetic_ right shift will be discussed at the end.
-
-## First phase
-
-We zoom in on the first step. Here, we make use of the lookup operation `HWSL` (short for "HalfWord Shift Left"): ` `HWSL[x: Half, y: B4]` := [(`x` `<<` `y`) mod 2^16, `x` `>>` (16 - `y`)]. ` One can use this to compute `out: Half[4] := in << y` as: $
-
-$ as long as ``y` < 16`. Observing that ``HWSL[x,` 16-`y]`_0 = (`x` `<<` (16-`y`)) mod 2^16`, and ``HWSL[x,` 16-`y]`_1 = `x` `>>` `y`` for ``y` in [1, 15]`, one can also use it to compute `out := in >> y` as $
-
-$ as long as `0 < `y` < 16`.
-
-Observe now that the values being looked up are (almost) independent from the direction of the shift: only the shift-amount varies slightly. When we now define $
-
-(16-`shift`) mod 16 & "when shifting right" ), $ it only takes some rearranging and combining of the values ``X[`i`] := HWSL[in[`i`], bit_shift]`_0` and ``Y[`i`] := HWSL[in[`i`], bit_shift]`_1` to form the limbs of ``in <</>> shift` mod 16`. In the remaining case that ``right` = 1` and ``shift` = 0 mod 16`, the limbs of ``in <</>> shift` mod 16` simply match those of `in`.
-
-## Second phase
-
-Since we're operating on 16-bit limbs, all the limbs in ``in <</>> shift`` must also occur somewhere in ``in <</>> shift` mod 16`. The number of full-limbs we still need to shift is determined by the fifth and sixth least significant bit of `shift`. With `limb_shift` containing a unary decoding of the integer represented by these two bits, we find that the intermediate value needs to be shifted over by `i` limbs (to the `left` or `right`) when ``limb_shift[`i`]` = 1`. These things combined yield `shifted`'s definition.
-
-Of course, when ``word_instr` = 1` and, thus, only ``shift` mod 32` should be considered, the bit-mask for the lookup constraining `limb_shift` is adjusted appropriately (see [shift:c:limb_shift_lookup]).
-
-## Arithmetic right shift
-
-Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `extension` is constrained to contain a repetition of `in`'s most significant bit. Copies of this variable are used for any full limbs shifted in when ``right` = `signed` = 1`. Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of ``in >>> shift` mod 16` as the appropriate intermediate.
-
-= Constraints First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that ``right` = 1` and ``shift` = 0 mod 16`.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C1` | `AND_BYTE[bit_shift; shift, 15]` | left |
-| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - 16 * zbs - shift, 15]` | right |
-| `SHIFT-C3` | `ZERO[zbs; bit_shift]` | μ |
-
-Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
-
-The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows. To prevent unnecessary lookups in padding rows, we override ``X[i]` := `in[i]`` and ``Y[i]` := 0` here.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[['arr', ['idx', 'X', 'i'], ['idx', 'Y', 'i']]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C5.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
-| | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
-| `SHIFT-C6.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
-| | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
-| `SHIFT-C7` |  | `HWSL[['arr', ['idx', 'X', 4], ['-', 'extension', ['idx', 'X', 4]]]; extension, bit_shift]` | 1 - zbs |
-| `SHIFT-C8` |  | `zbs` => `X[4]` = 0 |  |
-| | | _polynomial:_ `zbs * X[4] = 0` | |
-
-## Full-limb shifting
-
-Next, we constrain that `limb_shift` is a proper unary encoding of the fifth (and sixth if ``word_instr` = 0`) bit of `shift`. For this to be the case, three requirements must be satisfied: + *unary(0)*: ``limb_shift[`i`]` in {0, 1}` for `i in [0, 3]`, + *unary(1)*: ``limb_shift[`i`]` = 1` for exactly one `i`, and + *proper encoding*: ``limb_shift[`i`]` = 1 <=> 1/16 (`shift &` (48-32 dot `word_instr`)) = i` The first requirement is enforced by constraint [shift:c:limb_shift_is_bit]. To construct a constraint for the second and third requirement, observe that $ 1/16 dot (`shift &` (48-32 dot `word_instr`)) in cases( {0, 1, 2, 3} &"if" `word_instr` = 0, {0, 1} &"if" `word_instr` = 1 $ Observe moreover that, assuming *unary(0)*, the expression $ 1/16 dot (1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]`) $ can evaluate to `i` if and only if ``limb_shift[`i`]` = 1`, while the others are `0`. This means that the relation $ 1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]` = `shift &` (48-32 dot `word_instr`) $ enforces both *unary(1)* and *proper encoding*. This is the exact relation [shift:c:limb_shift_lookup] enforces.
-
-Hereafter, one must only check that `out` is the proper cast of `shifted` into a `DWordWL`.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `SHIFT-C9.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
-| `SHIFT-C10` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
-| `SHIFT-C11.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
-| | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
-
-## Miscellaneous
-
-| Tag | Description |
-|-----|-------------|
-| `SHIFT-C12` | `direction` => `μ` = 1 |
-| | _polynomial:_ `direction * (1 - μ) = 0` |
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C13` | `MSB16[is_negative; in[3]]` | signed |
-
-*Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
-
-## Lookups
-
-This chip adds the following interaction to the lookup.
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C14` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
-
-= Padding
-
-The table can be padded to the next power of two with the following value assignments:
-
-## Columns
-
 ### Input
 
 | Name | Type | Description |
@@ -193,4 +100,97 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 | `SHIFT-A2` |  | `IS_BYTE[shift]` |
 | `SHIFT-A3` |  | `IS_BIT<direction>` |
 | `SHIFT-A4` |  | `IS_BIT<signed>` |
-| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
\ No newline at end of file
+| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
+
+## Explanation
+
+This chip has a rather complex design as a result of designing it to fit in as few columns as possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
+
+The chip's design revolves around a two-phase shifting process: 1. shift `in` by `x := `shift` mod 16` bits, 2. shift that result by `(`shift`-x) mod 64` (or `mod 32` if ` `word_instr` = 1`). The intermediate value representing the state between the two phases is stored in the scratch variables `X` and `Y`. The definition of `shifted` describes how one can combine the `X`, `Y` and `extension` variables to construct the output value as described using `Half`-limbs. The output variable `out` is equivalent to `shifted`, but expressed using `Word`-limbs.
+
+In the following, we cover how these two phases were designed to complement one another. Here, we start with discussing the _logical_ left/right shift operations only; the modifications required to compute the _arithmetic_ right shift will be discussed at the end.
+
+### First phase
+
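+We zoom in on the first step. Here, we make use of the lookup operation `HWSL` (short for "HalfWord Shift Left"): ` `HWSL[x: Half, y: B4]` := [(`x` `<<` `y`) mod 2^16, `x` `>>` (16 - `y`)]. ` One can use this to compute `out: Half[4] := in << y` as: $
+`out[i]` = `HWSL[in[i], y]`_0 + `HWSL[in[i-1], y]`_1 quad "for" i in [0, 3],
+$ where the second term is omitted for `i = 0`, as long as ``y` < 16`. Observing that ``HWSL[x,` 16-`y]`_0 = (`x` `<<` (16-`y`)) mod 2^16`, and ``HWSL[x,` 16-`y]`_1 = `x` `>>` `y`` for ``y` in [1, 15]`, one can also use it to compute `out := in >> y` as $
+`out[i]` = `HWSL[in[i], 16-y]`_1 + `HWSL[in[i+1], 16-y]`_0 quad "for" i in [0, 3],
+$ where the second term is omitted for `i = 3`, as long as `0 < `y` < 16`.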
+
+Observe now that the values being looked up are (almost) independent from the direction of the shift: only the shift-amount varies slightly. When we now define $
+`bit_shift` := cases( `shift` mod 16 & "when shifting left",
+(16-`shift`) mod 16 & "when shifting right" ), $ it only takes some rearranging and combining of the values ``X[`i`] := HWSL[in[`i`], bit_shift]`_0` and ``Y[`i`] := HWSL[in[`i`], bit_shift]`_1` to form the limbs of ``in <</>> shift` mod 16`. In the remaining case that ``right` = 1` and ``shift` = 0 mod 16`, the limbs of ``in <</>> shift` mod 16` simply match those of `in`.
+
+### Second phase
+
+Since we're operating on 16-bit limbs, all the limbs in ``in <</>> shift`` must also occur somewhere in ``in <</>> shift` mod 16`. The number of full-limbs we still need to shift is determined by the fifth and sixth least significant bit of `shift`. With `limb_shift` containing a unary decoding of the integer represented by these two bits, we find that the intermediate value needs to be shifted over by `i` limbs (to the `left` or `right`) when ``limb_shift[`i`]` = 1`. These things combined yield `shifted`'s definition.
+
+Of course, when ``word_instr` = 1` and, thus, only ``shift` mod 32` should be considered, the bit-mask for the lookup constraining `limb_shift` is adjusted appropriately (see [shift:c:limb_shift_lookup]).
+
+### Arithmetic right shift
+
+Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `extension` is constrained to contain a repetition of `in`'s most significant bit. Copies of this variable are used for any full limbs shifted in when ``right` = `signed` = 1`. Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of ``in >>> shift` mod 16` as the appropriate intermediate.
+
+## Constraints
+
+First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that ``right` = 1` and ``shift` = 0 mod 16`.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C1` | `AND_BYTE[bit_shift; shift, 15]` | left |
+| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - 16 * zbs - shift, 15]` | right |
+| `SHIFT-C3` | `ZERO[zbs; bit_shift]` | μ |
+
+Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
+
+The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows. To prevent unnecessary lookups in padding rows, we override ``X[i]` := `in[i]`` and ``Y[i]` := 0` here.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[['arr', ['idx', 'X', 'i'], ['idx', 'Y', 'i']]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C5.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
+| | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
+| `SHIFT-C6.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
+| `SHIFT-C7` |  | `HWSL[['arr', ['idx', 'X', 4], ['-', 'extension', ['idx', 'X', 4]]]; extension, bit_shift]` | 1 - zbs |
+| `SHIFT-C8` |  | `zbs` => `X[4]` = 0 |  |
+| | | _polynomial:_ `zbs * X[4] = 0` | |
+
+### Full-limb shifting
+
+Next, we constrain that `limb_shift` is a proper unary encoding of the fifth (and sixth if ``word_instr` = 0`) bit of `shift`. For this to be the case, three requirements must be satisfied: + *unary(0)*: ``limb_shift[`i`]` in {0, 1}` for `i in [0, 3]`, + *unary(1)*: ``limb_shift[`i`]` = 1` for exactly one `i`, and + *proper encoding*: ``limb_shift[`i`]` = 1 <=> 1/16 (`shift &` (48-32 dot `word_instr`)) = i` The first requirement is enforced by constraint [shift:c:limb_shift_is_bit]. To construct a constraint for the second and third requirement, observe that $ 1/16 dot (`shift &` (48-32 dot `word_instr`)) in cases( {0, 1, 2, 3} &"if" `word_instr` = 0, {0, 1} &"if" `word_instr` = 1 ) $ Observe moreover that, assuming *unary(0)*, the expression $ 1/16 dot (1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]`) $ can evaluate to `i` if and only if ``limb_shift[`i`]` = 1`, while the others are `0`. This means that the relation $ 1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]` = `shift &` (48-32 dot `word_instr`) $ enforces both *unary(1)* and *proper encoding*. This is the exact relation [shift:c:limb_shift_lookup] enforces.
+
+Hereafter, one must only check that `out` is the proper cast of `shifted` into a `DWordWL`.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C9.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `SHIFT-C10` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
+| `SHIFT-C11.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+| | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
+
+### Miscellaneous
+
+| Tag | Description |
+|-----|-------------|
+| `SHIFT-C12` | `direction` => `μ` = 1 |
+| | _polynomial:_ `direction * (1 - μ) = 0` |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C13` | `MSB16[is_negative; in[3]]` | signed |
+
+*Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
+
+### Lookups
+
+This chip adds the following interaction to the lookup.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C14` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
diff --git a/docs/spec/sign.md b/docs/spec/sign.md
index a0aac8eb7..7cf9dd038 100644
--- a/docs/spec/sign.md
+++ b/docs/spec/sign.md
@@ -2,15 +2,9 @@
 
 It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are `1`, and `0` otherwise.
 
-= Variables The  template introduces  interaction(s):
+## Variables
 
-= Assumptions The  template operates on the following assumptions:
-
-If `sign` is set to `1`, `X` will be range-checked to be a halfword, and hence proving may fail if this is not ensured.
-
-= Constraints It takes only two constraints to compute the `sign` of `X`, given whether `X` represents a `signed` value or not. When ``signed` = 1`, the sign of `X` is equal to its most significant bit. This value is extracted in [sign:c:sign_if_signed]. If `X` is unsigned (i.e., ``signed` = 0`), its sign is always `0`. This is constrained by [sign:c:sign_if_unsigned].
-
-## Columns
+The  template introduces  interaction(s):
 
 ### Input
 
@@ -27,13 +21,17 @@ If `sign` is set to `1`, `X` will be range-checked to be a halfword, and hence p
 
 ## Assumptions
 
+The `SIGN` template operates on the following assumptions:
+
 | Tag | Range | Description |
 |-----|-------|-------------|
 | `SIGN-A1` |  | `IS_BIT<signed>` |
 
+If `sign` is set to `1`, `X` will be range-checked to be a halfword, and hence proving may fail if this is not ensured.
+
 ## Constraints
 
-### all
+It takes only two constraints to compute the `sign` of `X`, given whether `X` represents a `signed` value or not. When `signed = 1`, the sign of `X` is equal to its most significant bit. This value is extracted in [sign:c:sign_if_signed]. If `X` is unsigned (i.e., `signed = 0`), its sign is always `0`. This is constrained by [sign:c:sign_if_unsigned].
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index b651377e4..76c1189e1 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -1,5 +1,89 @@
 # Lambda VM Specification
 
+# LogUp Argument
+
+The _LogUp_ proof system conducts a permutation check based on summing logarithmic derivatives. This check ensures that whatever tuple is sent to be "looked-up" by a _source table_ is indeed received in the expected _destination table_.
+
+## Notation
+
+### VM Notation
+
+#### Preliminary notation
+
+- `NN`: the set of non-negative natural integers. - `BaseField`: the base finite field used by the arithmetisation. - `ExtensionField`: a finite extension of `BaseField` of cryptographic size. - `[n]` for `n in NN`: the set of integers `{0, dots, n - 1}`. - `X[i]` for tuple `X`: the `i`-th element of `X`, starting at `0`.
+
+#### Arithmetisation notation
+
+- `numTables in NN`: number of tables `Table_i` in the arithmetisation of the VM. - `TableSet`: set of all tables `Table_i` in the arithmetisation of the VM. - `numColumns_i in NN`: number of _columns_ in table `Table_i` (not the number of variables). - `numRows_i in NN`: number of _rows_ in table `Table_i`.
+
+### Interaction Notation
+
+The `j`-th _interaction_ `Interaction_j` of table `Table_i` is defined by the following tuple:
+
+- `id_(i,j) in FF`: the _type identifier_ of the interaction, usually the identifier of the chip that is constraining the relation expected to hold within the looked-up tuple. - `numElements_(i,j) in NN`: the _length_ of the tuple of elements being looked-up. - $weightFunction_(i,j) : FF^(numColumns_i) arrow FF^(numElements_(i,j) + 1), R mapsto arrow(t)_(i,j) || mu_(i,j)$: the _weight function_ that maps a row `R` of table `Table_i` to the looked-up tuple `arrow(t)_(i,j)` and its multiplicity `mu_(i,j) in BaseField`.
+
+## Vanilla LogUp
+
+### Protocol Description
+
++ Prover commits to all traces.
+
++ Verifier samples a random _(global) LogUp challenge_ `logupChallenge in ExtensionField` and a random _fingerprint coefficient_ `fingerprintCoeff in ExtensionField` and sends them to the Prover.
+
++ Prover commits to (i) interaction contribution, (ii) table running sum columns, and (iii) each table's contribution:
+
++ For each table `Table_i`, populate the interaction contribution columns and compute the _table (LogUp) contribution_:
+
++ For each interaction `Interaction_j` of table `Table_i`, initialize an empty _interaction contribution column_ of length `numRows_i`.
+
++ Initialise a _table running sum column_ `S_i in ExtensionField^(numRows_i)` with the first value `S_i [0]` populated according to the constraint choice.
+
++ *Constrain* the first row if required by selected constraint choice.
+
++ For each `j`-th row `R_j in BaseField^(numColumns_i)` of `Table_i`, for `j in [numRows_i - 1]`: + For each `k`-th interaction `Interaction_k` of table `Table_i`: + Compute the _interaction contribution numerator_ ` n_(j,k) = mu_(i,k) = weightFunction_(i,k)(R_j)[numElements_(i,k)] ` + If `n_(j,k) eq.not 0`, compute the _interaction contribution denominator_ ` d_(j,k) = logupChallenge + fingerprintCoeff dot id_(i,k) + sum_(l = 0)^(numElements_(i,k) - 1) fingerprintCoeff^(l + 2) dot weightFunction_(i,k) (R_j)[l]. ` + Save the _interaction contribution_ as `n_(j,k)/d_(j,k) in ExtensionField` in the corresponding interaction contribution column for this interaction. + *Constrain* the interaction contribution column according to the definitions of `n` and `d`.
+
++ Compute the _row contribution_ as the sum `s_(j) = sum_k n_(j,k) / d_(j,k)` and compute the next row's table running sum value `S_i [j+1] = S_i [j] + s_(j)`.
+
++ *Constrain* the transition of the running sum column as indicated by the constraint choice.
+
++ *Constrain* the last row if required by selected constraint choice.
+
++ Batch-commit to every table's interaction contribution columns and running sum columns with the column commitment scheme and commit to the table's overall contribution `S_i [numRows_i - 1]` by sending it in the clear to the verifier.
+
++ Verifier checks that the sum of every table's overall contribution is equal to zero: `sum_i S_i [numRows_i - 1] = 0_ExtensionField`, and delegates the checks of the constraints to the STARK.
+
+### Running Sum Constraint Choices <constraint_choices>
+
+#### Choice 1: transitions looking back
+
+TL;DR: implicit `0_ExtensionField` initial value, explicit final value.
+
++ (*Boundary, first row*) Constrain first row of running sum column to equal the sum of the first row of every interaction contribution column. (This is analogous to an implicit `-1`-th row initialised at `0_ExtensionField`.) + (*Transition, looking back, applied to rows `1, dots, numRows_i - 1`*) For each row _other than the first_, constrain the _current_ running sum value to equal the sum of every current interaction contribution column added to the _previous_ running sum value. + (*Boundary, last row*) Constrain last row of running sum column to equal the claimed table contribution.
+
+Total constraints: 2 boundary + 1 transition over `numRows_i - 1` rows.
+
+#### Choice 2: transitions looking forward
+
+TL;DR: explicit `0_ExtensionField` initial value, implicit final value.
+
++ (*Boundary, first row*) Constrain first row of running sum column to equal `0_ExtensionField`. + (*Transition, looking forward, applied to rows `0, dots, numRows_i - 2`*) For each row _other than the last_, constrain the _next_ running sum value to equal the sum of every current interaction contribution column added to the _current_ running sum value. + (*Boundary, last row*) Constrain last row of running sum column added to sum of last row of every interaction column to equal the claimed table contribution. (That is, the claimed table contribution is implicit in the last row of the table, but not written to last value of running sum column.)
+
+Total constraints: 2 boundary + 1 transition over `numRows_i - 1` rows.
+
+#### Choice 3: circular transitions looking back/forward
+
++ For each row, constrain the _current/next_ (wrapping to first on last if "next") running sum value to equal the sum of every current interaction contribution value added to the _previous/current_ (wrapping to last on first if "previous") running sum value added to claimed table contribution divided by `numRows_i`.
+
+Total constraints: 1 _circular_ transition over `numRows_i` rows.
+
+This single circular constraint checks that each row's contribution `s_(i,j)` is added to the running sum column, either in the current row's cell or in the next row's. In order to avoid boundary constraints, the look-back or peek-forward into the running sum column wraps around the beginning or end of the table.
+
+This alone implies that the difference between the first and last row's values will be the table's overall real contribution `sum_j s_(i,j)`, which will be incompatible with the circularity of the constraint. Since boundary constraints are avoided, the way to check that `sum_j s_(i,j)` equals the claimed contribution `L_i` is to remove a fraction of `L_i` at each row in such a way that `L_i` is removed completely after summing all `numRows_i` rows; i.e., the constraint subtracts the public term `L_i / numRows_i` from the running sum at every row.
+
+If the expected equality `sum_j s_(i,j) = L_i` holds, then the circularity of the constraint will also hold.
+
+---
+
 # Memory Argument
 
 As part of fully proving the correct execution of a RISC-V program, the VM must ensure that memory reads and writes are consistent. That is, every byte read from some address corresponds to the byte that was last written to that address --- or the initial value if nothing has been written yet. We consider "memory" in a broad sense here: both RAM and the general purpose registers can be seen as instantiations of memory and are therefore handled simultaneously.
@@ -8,11 +92,11 @@ While RAM is byte addressed, we do choose to store registers as a `DWordWL` over
 
 On a high level, we ensure memory consistency by an interacting system of reads and writes to a lookup argument, combined with an initialization and finalization scheme. The initialization and finalization schemes together ensure both that (1) the necessary preconditions for the lookup system are satisfied, and (2) the program is executed with the correct initial memory and register contents as specified by the ELF binary and the ISA.
 
-= Memory types
+## Memory types
 
 A commonly made distinction of memory types is that of _read-only_ and _read-write_ memory, with the more restrictive read-only variant often allowing for more efficient solutions (be that regarding prover time, verifier time or proof size) via table lookup proofs. Naturally, the VM’s main memory and registers should be handled by a read-write system as the guest program/environment can issue instructions that write to memory. While there are some subsystems that can be modelled as read-only memory ---e.g., the program memory and instruction decoding--- we opt to integrate these into the proof system via chip interactions (relying on techniques derived from table lookup arguments). As such, we only concern ourselves with read-write memory, moving forward.
 
-= Memory operations
+## Memory operations
 
 Every memory operation has some conceptual attributes that are relevant to mention or discuss:
 
@@ -22,7 +106,7 @@ Since we will have to ensure that memory accesses are temporally consistent with
 
 For reasons of completeness (since temporal integrity as discussed below is a security necessity), we cannot deal with multiple accesses to the same address at identical timestamps. However, if multiple accesses are guaranteed to be independent (that is, to different addresses), they can still share a timestamp --- consider, e.g., the case of reading a word as 4 bytes with the `LW` load instruction. This property is already taken into account where possible in the design of the system. For instance, in the CPU chip, we can ensure that there are at most 3 memory accesses not guaranteed to be independent, so a timestamp granularity of 4 timestamps per cycle is enough. ]
 
-= Permutation argument
+## Permutation argument
 
 We can conceptually organise the state of the memory as a collection of "tokens" that represent tuples `(serif("timestamp"), serif("address"), serif("value"))`, meaning the current value written to `serif("address")` is `serif("value")`, last written to memory at `serif("timestamp")`. Having exactly one value associated with any address will be ensured (see further down in this document) by the interaction of memory initialization, memory finalization, and the effects of memory operations.
 
@@ -34,13 +118,13 @@ Naturally, for a read operation, the _values_ embedded in the consumed and emitt
 
 So long as we can properly constrain temporal integrity (that is, no memory operation can consume future tokens), this "balancing" act of tokens can be integrated (with sufficient domain separation) into the existing LogUp argument ([logup]): consuming a token corresponds to a "receive" and emitting a new token is a "send".
 
-= Temporal integrity
+## Temporal integrity
 
 To ensure temporal integrity, every memory operation needs to be constrained for the newly emitted token to have a strictly greater timestamp than the consumed token. This raises the question of how to represent timestamps and cleanly perform this check, as over a finite field the “less than” relation is ill-defined (though it is common and natural to consider it as the less than relation over the natural lift of the field into the integers). We choose to represent timestamps as machine words, using the existing `LT` chip ([lt]) functionality for comparisons. The full implementation of the timestamp system can be seen in the `timestamp` column of the `CPU` ([cpu]) and `MEMW` chips ([memw]). The `CPU` merely passes in the current timestamp, while `MEMW` can recall the previously written timestamp and constrain the correct sequencing.
 
 - Clean definition of “less-than”, using the already existing `LT` functionality in the ALU - Harder to perform increments, needing extra constraints beyond field arithmetic - But this can be alleviated by providing a precomputed column that has a fixed increment per CPU row ][ - Comparison is more annoying, but can work by: - Decomposition into a machine word and chip interaction with the LT chip - Bit decomposition and comparison constraints - Range-checking the difference to be sufficiently small w.r.t. the field characteristic. - Increments and basic arithmetic operations are cheap ] ]
 
-= Initialization and Finalization
+## Initialization and Finalization
 
 Because the LogUp argument handling token consumption and emission needs to be fully balanced --- every token emitted should be consumed, and vice versa --- we need to have a system to emit the initial tokens and consume the final tokens. This needs to ensure that every address has at most a single initializing emission, and at most one finalizing consumption. Having at most one initialization will, through the correctness of the lookup argument, immediately lead to having at most one correct finalization, and vice versa.
 
@@ -50,7 +134,7 @@ For our chosen scheme (which we refer to as "paged initialization/finalization")
 
 Concretely, each page gets an associated `PAGE` table, consisting of N variables over N columns. For each such table, the `page` variable is instantiated as the constant base address of the page. The `offset` column is preprocessed, which helps the verifier ensure that each page has a single fixed size, but the verifier should still check that no pages overlap and all `page` values are page-aligned.
 
-## Page initialization
+### Page initialization
 
 > **Note:** check whether we need `fini` to be range-checked
 
@@ -60,6 +144,33 @@ We present here a set of constraints on the `PAGE` table that
 
 For zero-initialized pages, `init` can be a constant `0`, and hence doesn't need a column, nor a range check.
 
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `offset` | `RowIndex` | The offset from the page base address. |
+| `init` | `Byte` | The initial value of this address. Can be replaced by a constant zero for zero-initialization |
+| `fini` | `Byte` | The final value this address took |
+| `timestamp` | `DWordWL` | The timestamp at which this address was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `address` | `DWordWL` | Adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table. |
+
+**Definition of `address`:**
+```
+address := page + offset * 1::DWordWL
+```
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `PAGE-C1` | `IS_BYTE[init]` | 1 |
+| `PAGE-C2` | `IS_BYTE[fini]` | 1 |
+| `PAGE-C3` | `memory[0, address, 0::DWordWL, init]` | -1 |
+| `PAGE-C4` | `memory[0, address, timestamp, fini]` | 1 |
+
 We identify a few alternatives that would achieve the desired initialization/finalization functionalities, and consider their respective trade-offs.
 
 _"Free-zero" initialization_
@@ -70,15 +181,15 @@ _Sparse initialization/finalization_
 
 One or more STARK tables (depending on the amount of memory used) consisting of `(address, value)` columns are introduced, where for zero-initialization, `value` can be constant zero. Transition constraints ensure that `address` is strictly increasing, enforcing the "at most once" property; `value` is range-checked to consist of bytes. Similar to paged finalization, an additional `timestamp` column is added, containing the final timestamp each address was accessed. This table is then further used to contribute to the LogUp sum as with any other interactions. - The transition constraints can be chosen to only apply on finalization, as at-most-once finalization is enough to ensure consistency. - Sparse initialization is incompatible with paged finalization, see also the remark under free-zero initialization above. - This would require transition constraints, which currently are not needed elsewhere in the VM design - Additionally, for memory use exceeding the capacity of a single initialization/finalization table, some form of transition constraint between tables is needed - Alternatively, transition constraints could potentially be avoided by more integration into the LogUp system, but this could turn out more costly in practice - This is compatible with the above "free zero" initialization - Since a prover-committed address column is needed (rather than a precomputed one), the number of required columns increases. - As an optimization, the address column could potentially be used simultaneously for initialization and finalization - Sparse initialization/finalization reduces the cost for sparse memory access patterns, where only a few addresses would be accessed per page. Most programs and compilers should however favor a memory locality that makes paged initialization/finalization comparable. ]
 
-## Register initialization/finalization
+### Register initialization/finalization
 
 The initial and final state of registers can be entirely known by the verifier, since the relevant initialization values are either zero, or embedded in the ELF, and the final values can be set to a known value by the `HALT` ecall ([ecall]). As additionally, the number of registers is small, the verifier can directly add the required balancing terms to the LogUp sum.
 
-= Notes and considerations
+## Notes and considerations
 
 - Register reads and writes may interact within a single cycle, so a correct and fixed ordering needs to be ensured - Correctness of initialization and completeness of finalization need to be ensured
 
-= Future topics of interest
+## Future topics of interest
 
 - Optimize memory systems after determining factual bottlenecks (e.g. taking inspiration from Twist and Shout, or other recent research) - Double check whether IS_BYTE constraints are needed for fini
 
@@ -126,17 +237,9 @@ columns: 1fr, inset: 7pt, align: (top+left, center), stroke: none, table.header(
 
 Barring exceptional cases, this template is used to assert that a variable of type `Bit` assumes a valid value under some condition.
 
-= Variables The  template operates on  variables:
-
-= Constraints It takes only one constraint to enforce that `X` must be either `0` or `1` whenever ``cond` eq.not 0`:
-
-*Note*: - In case of _unconditional_ template application, `cond` can be dropped from the constraint, simplifying it to ``X` (1- `X`) = 0`. - As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression. This is to make sure that [isbit:c:isbit]'s expression has degree at most 3.
-
-## Correctness argument
-
-If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When ``cond` eq.not 0`, it follows that the statement can only be proven when ``X` (1-`X`) equiv 0 mod p`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either ``X` equiv 0 mod p` or `1-`X` equiv 0 mod p`. Hence, it is proven that when ``cond` eq.not 0`, [isbit:c:isbit] is only satisfied if ``X` in {0, 1}`. 
+## Variables
 
-## Columns
+The `IS_BIT` template operates on the following variables:
 
 ### Input
 
@@ -152,28 +255,28 @@ If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any val
 
 ## Constraints
 
-### all
+It takes only one constraint to enforce that `X` must be either `0` or `1` whenever `cond ≠ 0`:
 
 | Tag | Description |
 |-----|-------------|
 | `IS_BIT-C1` | `cond` => `X` (1-`X`) = 0 |
 | | _polynomial:_ `cond * X * (1 - X) = 0` |
 
----
+*Note*: - In case of _unconditional_ template application, `cond` can be dropped from the constraint, simplifying it to `X (1 - X) = 0`. - As described earlier, the `cond` variable must be describable by a degree-1 (i.e., linear) expression. This is to make sure that [isbit:c:isbit]'s expression has degree at most 3.
 
-# SIGN Template
+### Correctness argument
 
-It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are `1`, and `0` otherwise.
+If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any value and the polynomial constraint will evaluate to `0` regardless. When `cond ≠ 0`, it follows that the statement can only be proven when `X (1 - X) ≡ 0 mod p`, with `p` the modulus of the field. Because `BaseField` is a prime field, this equality is only satisfied if either `X ≡ 0 mod p` or `1 - X ≡ 0 mod p`. Hence, it is proven that when `cond ≠ 0`, [isbit:c:isbit] is only satisfied if `X ∈ {0, 1}`.
 
-= Variables The  template introduces  interaction(s):
+---
 
-= Assumptions The  template operates on the following assumptions:
+# SIGN Template
 
-If `sign` is set to `1`, `X` will be range-checked to be a halfword, and hence proving may fail if this is not ensured.
+It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are `1`, and `0` otherwise.
 
-= Constraints It takes only two constraints to compute the `sign` of `X`, given whether `X` represents a `signed` value or not. When ``signed` = 1`, the sign of `X` is equal to its most significant bit. This value is extracted in [sign:c:sign_if_signed]. If `X` is unsigned (i.e., ``signed` = 0`), its sign is always `0`. This is constrained by [sign:c:sign_if_unsigned].
+## Variables
 
-## Columns
+The  template introduces  interaction(s):
 
 ### Input
 
@@ -190,13 +293,17 @@ If `sign` is set to `1`, `X` will be range-checked to be a halfword, and hence p
 
 ## Assumptions
 
+The `SIGN` template operates on the following assumptions:
+
 | Tag | Range | Description |
 |-----|-------|-------------|
 | `SIGN-A1` |  | `IS_BIT<signed>` |
 
+If `sign` is set to `1`, `X` will be range-checked to be a halfword, and hence proving may fail if this is not ensured.
+
 ## Constraints
 
-### all
+It takes only two constraints to compute the `sign` of `X`, given whether `X` represents a `signed` value or not. When `signed = 1`, the sign of `X` is equal to its most significant bit. This value is extracted in [sign:c:sign_if_signed]. If `X` is unsigned (i.e., `signed = 0`), its sign is always `0`. This is constrained by [sign:c:sign_if_unsigned].
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
@@ -212,13 +319,9 @@ For ease of notation, we moreover introduce the  constraint template $
 
 $ in both conditional and unconditional versions. It constrains that ``diff` equiv `lhs` - `rhs` (mod 2^64)` when the expression `cond` is non-zero.
 
-= Variables This template introduces  interaction(s).
-
-= Assumptions
-
-= Constraints This template introduces the following constraints
+## Variables
 
-## Columns
+This template introduces  interaction(s).
 
 ### Input
 
@@ -261,7 +364,7 @@ carry (when iter=1) := 2^-32 * (lhs[1] + rhs[1] + carry[0] - sum[1])
 
 ## Constraints
 
-### all
+This template introduces the following constraints
 
 | Tag | Range | Description |
 |-----|-------|-------------|
@@ -273,27 +376,9 @@ carry (when iter=1) := 2^-32 * (lhs[1] + rhs[1] + carry[0] - sum[1])
 
 It requires `cond` to be a bit.
 
-= Variables This template introduces  interaction(s).
-
-= Assumptions
-
-= Constraints We constrain this equality using two constraints:
-
-## Correctness argument
-
-The constraints force the `carry` values to be fixed. Writing `carry`'s definition, we then find that $
-
-= cases( 2^32 - (`x as DWordWL`)_0 & "if" (`x as DWordWL`)_0 != 0, 0 & "if" (`x as DWordWL`)_0 = 0 ),\
-
-2^32 - (`x as DWordWL`)_1 - 1 & "if" `x` != 0, 0 & "if" `x` = 0 $ Clearly, ``neg` = 0` when ``x` = 0` (and `cond` is set). For non-zero `x`, we distinguish two cases. When `(`x as DWordWL`)_0 = 0`, $
-
-&= 2^32 dot `neg`_1 + `neg`_0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1) + 0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1) + (`x as DWordWL`)_0\ &= 2^64 - (2^32 dot (`x as DWordWL`)_1 + (`x as DWordWL`)_0)\ &= 2^64 - `x`\ &equiv -x mod 2^64, $ while when `(`x as DWordWL`)_0 != 0`, $
-
-&= 2^32 dot `neg`_1 + `neg`_0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1 - 1) + (2^32 - (`x as DWordWL`)_0)  \ &= 2^64 - 2^32 dot (`x as DWordWL`)_1 - 2^32 + 2^32 - (`x as DWordWL`)_0  \ &= 2^64 - ((`x as DWordWL`)_0 + 2^32 dot (`x as DWordWL`)_1) \ &= 2^64 - `x`\ &equiv -x mod 2^64 $ when `cond` is set. When `cond` is not set, the two lookups are not executed, allowing `neg` to take any value in either case.
-
-It is worth noting that this construction does _not_ require the limbs of `neg` to be range checked, thus allowing it be represented by the unrangecheckable `DWordWL` rather than a `DWordHL`. The input value `x` is still assumed to be range-checked, however. ]
+## Variables
 
-## Columns
+This template introduces  interaction(s).
 
 ### Input
 
@@ -334,125 +419,606 @@ carry (when iter=1) := 2^-32 * ((x::DWordWL)[1] + neg[1] + carry[0])
 
 ## Constraints
 
-### all
+We constrain this equality using two constraints:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
 | `NEG-C1` | `ZERO[1 - carry[0]; x[0] + x[1]]` | cond |
 | `NEG-C2` | `ZERO[1 - carry[1]; x[0] + x[1] + x[2] + x[3]]` | cond |
 
----
-
-# DECODE Table
-
-All `RV64IMC` instruction are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table, can be derived from the `packed_decode` variable provided below.
+### Correctness argument
 
-= Variables
+The constraints force the `carry` values to be fixed. Writing `carry`'s definition, we then find that $
 
-The  table is comprised of  variables that are expressed using  columns:
+`neg`_0 = cases(
 
-= Padding The  table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
+2^32 - (`x as DWordWL`)_0 & "if" (`x as DWordWL`)_0 != 0, 0 & "if" (`x as DWordWL`)_0 = 0 ),\
 
-Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see [cpu:c:ebreak_traps]), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
+2^32 - (`x as DWordWL`)_1 - 1 & "if" `x` != 0, 0 & "if" `x` = 0 $ Clearly, ``neg` = 0` when ``x` = 0` (and `cond` is set). For non-zero `x`, we distinguish two cases. When `(`x as DWordWL`)_0 = 0`, $
 
-= Decoding For the purposes of explaining decoding, we decompress 's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
+&= 2^32 dot `neg`_1 + `neg`_0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1) + 0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1) + (`x as DWordWL`)_0\ &= 2^64 - (2^32 dot (`x as DWordWL`)_1 + (`x as DWordWL`)_0)\ &= 2^64 - `x`\ &equiv -x mod 2^64, $ while when `(`x as DWordWL`)_0 != 0`, $
 
-We will illustrate how each instruction should be expressed in this (uncompressed) decoding table. The columns of the accompanying table represent the following: - *`operation`*: the assembly operation being encoded. - *`op-flag`*: which of the "`ALU` selector flags" operation flags to set. Each operation sets exactly one. - *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively. - *other*: the other flags that should be set or variables that should be given specific values.
+&= 2^32 dot `neg`_1 + `neg`_0\ &= 2^32 dot (2^32 - (`x as DWordWL`)_1 - 1) + (2^32 - (`x as DWordWL`)_0)  \ &= 2^64 - 2^32 dot (`x as DWordWL`)_1 - 2^32 + 2^32 - (`x as DWordWL`)_0  \ &= 2^64 - ((`x as DWordWL`)_0 + 2^32 dot (`x as DWordWL`)_1) \ &= 2^64 - `x`\ &equiv -x mod 2^64 $ when `cond` is set. When `cond` is not set, the two lookups are not executed, allowing `neg` to take any value in either case.
 
-For the purpose of brevity and readability, the table uses the following rules-of-thumb: + `rd`, `rs1`, `rs2`, and `imm` are mapped to the values provided by the instruction; when a value is not specified by an instruction it defaults to `0`. + `read_register1`, `read_register2` and `write_register` are set to `1` when respectively ``rs1` != 0`, ``rs2` != 0`, or  ``rd` != 0`. + Any flag that is not listed is set to `0`, with the exception of the `c_type` flag. *The `c_type` flag is set independently of the below table*, as explained next.
+It is worth noting that this construction does _not_ require the limbs of `neg` to be range checked, thus allowing it to be represented by the unrangecheckable `DWordWL` rather than a `DWordHL`. The input value `x` is still assumed to be range-checked, however.
 
-Further clarification is provided in the notes following the table.
+---
 
-## C-type instructions
+# MEMW Chip
 
-The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
+The `MEMW` chip is used to read and write memory locations (both RAM and registers) in chunks of 1, 2, 4 or 8 values. It introduces the old value and last-accessed timestamps of memory addresses internally, in order to satisfy the design of the memory argument ([memory]).
 
-/// Add a reference to one or more notes following this table.
+## Variables
 
-super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
+The `MEMW` chip is comprised of the variables listed below, together with the columns that express them and the interactions it leverages:
 
-show figure: set block(breakable: true)
+### Input
 
-figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), )) }
+| Name | Type | Description |
+|------|------|-------------|
+| `is_register` | `Bit` | Whether the address represents a register index |
+| `base_address` | `DWordWL` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
+| `value` | `BaseField[8]` | The values to store in memory. For RAM, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access occurs |
+| `write2` | `Bit` | Whether to write exactly 2 values |
+| `write4` | `Bit` | Whether to write exactly 4 values |
+| `write8` | `Bit` | Whether to write exactly 8 values |
 
-// OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
+### Output
 
-// Construct a note that can be referenced through `lbl`
+| Name | Type | Description |
+|------|------|-------------|
+| `old` | `BaseField[8]` | The old value written at `base_address`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
 
-show figure: (it) => align(left, []) [ ] }
+### Auxiliary
 
-## Notes
+| Name | Type | Description |
+|------|------|-------------|
+| `carry` | `Bit[7]` | Whether `base_address[0] + i + 1` $>= 2^32$ |
+| `old_timestamp` | `DWordWL[8]` | The timestamp at which address `base_address + i` was last accessed |
 
-We note the following about the above decoding table:
+### Virtual
 
-enum.item( referenceable_note( "note_word_instr", [`word_instr`: `[W]` indicates that ``word_instr` = 1` for the `W`-variant of the operation, and `0` for the non-`W`-variant.] ), enum.item( referenceable_note( "note_signed", [`signed`: .not`[U]` indicates that ``signed` = 1` for the *non-`U`*-variant of the operation, and `0` for the `U`-variant.] ), enum.item( referenceable_note( "note-lui", [`LUI`: this operation loads the 20-bit `imm` in the upper bits of `rd`. Observe that this can be represented using `ADDI rd, x0, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-auipc", [`AUIPC`: this operation adds the 20-bit immediate to the upper bits of `pc` and stores the result in `rd`. Given that the `pc` is stored in `x255`, this operation can be represented using `ADDI rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-jal", [`JAL`: this operation stores ``pc` + 4` in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`. Note that this can be represented using `JALR rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[1:21]` of `imm` and extending it to 64 bits; the least significant bit should always be 0.*] ), enum.item( referenceable_note( "note-ecall", [`ECALL`: "On RISC-V a system call has its own instruction: `ECALL`. [...] A7 [= register `x17`] contains the system call number." [[source]] ] ), enum.item( referenceable_note( "note-fence", [`FENCE`: currently, the VM interprets this operation as `ADDI x0 x0 0`; a no-op.]
+| Name | Type | Description |
+|------|------|-------------|
+| `w2` | `Bit` | writing at least 2 bytes |
+| `w4` | `Bit` | writing at least 4 bytes |
+| `address_add` | `DWordWL[7]` | `address_add[i] = base_address + i + 1` |
+| `μ_sum` | `Bit` |  |
 
-## One more instruction <cpu-padding-decode-row>
+**Definition of `w2`:**
+```
+w2 := write2 + write4 + write8
+```
 
-In addition to decoding all instructions provided in the ELF and adding a corresponding entry to the  table, one must include an entry that has ``pc` = 1` and every other variable set to `0`. Note that this will never conflict with any entry in the ELF, since it has an odd `pc` value.
+**Definition of `w4`:**
+```
+w4 := write4 + write8
+```
 
-This entry is used to pad the `CPU` table. More details on this matter are provided in the `CPU` chip.
+**Definition of `address_add`:**
+```
+address_add[i] := [base_address[0] + i + 1 - 2^32 * carry[i], base_address[1] + carry[i]]
+```
 
-## Columns
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_read + μ_write
+```
 
-### Output
+### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
-| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`; \ [27:35] `rs1`, \ [35:43] `rs2`, \ [43:51] `rd`, \ the remaining bits are set to zero.  |
-| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+| `μ_read` | `Bit` | Whether we are performing a read (and hence return `old`) |
+| `μ_write` | `Bit` | Whether we are performing a write (and hence do not return `old`) |
 
-### Multiplicity
+## Assumptions
 
-| Name | Type | Description |
-|------|------|-------------|
-| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `MEMW-A2` |  | `IS_BIT<write2>` |
+| `MEMW-A3` |  | `IS_BIT<write4>` |
+| `MEMW-A4` |  | `IS_BIT<write8>` |
+| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
+| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
----
+Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. Still, these properties are necessary for the consistency of the system as a whole, and therefore we document them here, keeping the type information as a reading aid.
 
-# CPU Chip
+## Constraints
 
-The  chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations. It bases its decisions on the entry of the `DECODE` table ([decode]) corresponding the the current program counter (PC).
+Depending on the values of `write2`, `write4` and `write8`, the addresses following `base_address` need to be constructed. Rather than computing these in full (which would require the later addresses to be instantiated), it suffices to know the `carry`: the bit indicating whether `base_address[0] + t >= 2^32`, i.e., whether adding `t ∈ [1, 7]` to `base_address` requires a carry from the lower to the upper limb. Note that it is safe for the prover to choose these bits: additions for which this bit is not correctly set will yield an address where either the lower or upper limb is out of bounds. As such, the constructed address will not match any existing memory tokens, which are only initialized for correctly formatted and range-checked doublewords (see [memory]).
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW-C1` |  | `IS_BIT<μ_read>` |  |
+| `MEMW-C2` |  | `IS_BIT<μ_write>` |  |
+| `MEMW-C3` |  | `IS_BIT<μ_sum>` |  |
+| `MEMW-C4` |  | `w2` => `μ_sum` |  |
+| | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
+| `MEMW-C5.i` | i ∈ [0, 6] | `IS_BIT<carry[i]>` |  |
+| `MEMW-C6` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
+| `MEMW-C7` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
+| `MEMW-C8.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
+| `MEMW-C9.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
 
-= Variables
+As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures it is appropriately range checked (this assumes no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
 
-The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+There is no need to check that the additions do not overflow, as our address calculations are not performed modulo `2^64` here, and any overflow will result in an address without matching initialization.
 
-= Assumptions
+The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW-CM10` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM11` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM12` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM13` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM14.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM15.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM16.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM17.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
 
-= Constraints First, we perform a decoding lookup for the current PC.
+This chip contributes the following to the lookup argument:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-C1` | `DECODE[pc, imm, packed_decode]` | 1 |
+| `MEMW-CO18` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_read |
+| `MEMW-CO19` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_write |
 
-## Range checks
+## Padding
 
-> **Note:** Make sure we argue for every column here
+The table can be padded to the next power of two with the following value assignments:
 
-> **Note:** is `rvd` still sufficiently constrained? (can also be done through the memory argument like `pc`?)
+## Read-size aligned fast path
 
-We constrain all columns to have the appropriate ranges. The flags and register indices looked up from the decoding need to be checked, as they are communicated through the interaction in a packed form. In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`. Similarly, since `next_pc` will propagate through the memory argument and be looked up in the instruction decoding on the next cycle, it is forced to be in the correct range. For the auxiliary columns, we need to check the limbs of `arg1`, `arg2`, and `res`. The ranges of the other auxiliary columns are enforced through later constraints.
+When a memory access happens at an address with proper alignment for its access size (i.e., adding the access size to `base_address`'s lowest limb does not overflow), and all accessed elements were last accessed at the same timestamp, we can instead use the `MEMW_A` chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all `carry` bits (`add_limb_overflow`) would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `CPU-CR2` |  | `IS_BIT<read_register1>` |  |
-| `CPU-CR3` |  | `IS_BIT<read_register2>` |  |
-| `CPU-CR4` |  | `IS_BIT<write_register>` |  |
-| `CPU-CR5` |  | `IS_BIT<memory_2bytes>` |  |
-| `CPU-CR6` |  | `IS_BIT<memory_4bytes>` |  |
-| `CPU-CR7` |  | `IS_BIT<memory_8bytes>` |  |
-| `CPU-CR8` |  | `IS_BIT<c_type_instruction>` |  |
-| `CPU-CR9` |  | `IS_BIT<signed>` |  |
-| `CPU-CR10` |  | `IS_BIT<mp_selector>` |  |
-| `CPU-CR11` |  | `IS_BIT<muldiv_selector>` |  |
-| `CPU-CR12` |  | `IS_BIT<word_instr>` |  |
-| `CPU-CR13` |  | `IS_BIT<ADD>` |  |
-| `CPU-CR14` |  | `IS_BIT<SUB>` |  |
-| `CPU-CR15` |  | `IS_BIT<SLT>` |  |
-| `CPU-CR16` |  | `IS_BIT<AND>` |  |
-| `CPU-CR17` |  | `IS_BIT<OR>` |  |
+Further logic remains essentially the same, so we briefly present the relevant tables for this chip.
+
+The `MEMW_A` chip only needs the variables listed below, together with the columns that express them and the interactions it leverages.
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `is_register` | `Bit` | Whether the address represents a register index |
+| `base_address` | `DWordWHH` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
+| `value` | `BaseField[8]` | The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `write2` | `Bit` | Whether to write exactly 2 values |
+| `write4` | `Bit` | Whether to write exactly 4 values |
+| `write8` | `Bit` | Whether to write exactly 8 values |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old` | `BaseField[8]` | The old value written at `base_address + i`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old_timestamp` | `DWordWL` | The timestamp at which the address was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `w2` | `Bit` | writing at least 2 bytes |
+| `w4` | `Bit` | writing at least 4 bytes |
+| `μ_sum` | `Bit` |  |
+
+**Definition of `w2`:**
+```
+w2 := write2 + write4 + write8
+```
+
+**Definition of `w4`:**
+```
+w4 := write4 + write8
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_read + μ_write
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_read` | `Bit` | Whether we are performing a read (and hence return `old`) |
+| `μ_write` | `Bit` | Whether we are performing a write (and hence do not return `old`) |
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MEMW_A-A1.i` | i ∈ [0, 1] | `IS_HALF[base_address[i]]` |
+| `MEMW_A-A2` |  | `IS_WORD[base_address[2]]` |
+| `MEMW_A-A3` |  | `IS_BIT<write2>` |
+| `MEMW_A-A4` |  | `IS_BIT<write4>` |
+| `MEMW_A-A5` |  | `IS_BIT<write8>` |
+| `MEMW_A-A6` |  | `IS_BIT<write2 + write4 + write8>` |
+| `MEMW_A-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW_A-C1` | `IS_HALF[base_address[0] + write2 + 3 * write4 + 7 * write8]` | μ_sum |
+| `MEMW_A-C2` | `IS_BIT<μ_read>` |  |
+| `MEMW_A-C3` | `IS_BIT<μ_write>` |  |
+| `MEMW_A-C4` | `IS_BIT<μ_sum>` |  |
+| `MEMW_A-C5` | `w2` => `μ_sum` |  |
+| | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
+| `MEMW_A-C6` | `LT[1; old_timestamp, timestamp, 0]` | μ_sum |
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW_A-CM7` |  | `memory[is_register, base_address::DWordWL, old_timestamp, old[0]]` | μ_sum |
+| `MEMW_A-CM8` |  | `memory[is_register, base_address::DWordWL, timestamp, value[0]]` | -μ_sum |
+| `MEMW_A-CM9` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, old_timestamp, old[1]]` | w2 |
+| `MEMW_A-CM10` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, timestamp, value[1]]` | -w2 |
+| `MEMW_A-CM11.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | w4 |
+| `MEMW_A-CM12.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -w4 |
+| `MEMW_A-CM13.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | write8 |
+| `MEMW_A-CM14.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -write8 |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW_A-CO15` | `MEMW[old; is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_read |
+| `MEMW_A-CO16` | `MEMW[is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_write |
+
+### Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+## Register fast-path
+
+The `MEMW_R` chip provides a fast-path for accessing registers. This fast-path leverages that registers (1) can be addressed using a `Byte`, rather than a full `DWord`, (2) are constantly accessed, i.e., `timestamp - old_timestamp` is small, and (3) have a fixed access pattern, to achieve a footprint that is significantly smaller than both `MEMW` and `MEMW_A`.
+
+Note: as a result of aggressive optimization, this chip can only be used for register accesses for which (1) `timestamp - old_timestamp ∈ [1, 2^16]`, and (2) `timestamp[0] > old_timestamp[0]`. If either of these rules does not apply to your access, you should fall back to using `MEMW_A`.
+
+Note moreover that this chip does not guard against misaligned register access faults: to access the register with a given `address`, one must provide `2 * address` in the lookup.
+
+### Variables
+
+The `MEMW_R` chip is comprised of the variables listed below, together with the columns that express them and the interactions it leverages:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `address` | `Byte` | address of the register being accessed |
+| `timestamp` | `DWordWL` | timestamp at which the access takes place |
+| `val` | `DWordWL` | value being written to this register |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old` | `DWordWL` | value of this register at `old_timestamp`. |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old_timestamp_lo` | `Word` | the lower limb of `old_timestamp` |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old_timestamp` | `DWordWL` | timestamp at which this register was last accessed |
+| `μ_sum` | `Bit` |  |
+
+**Definition of `old_timestamp`:**
+```
+old_timestamp := [old_timestamp_lo, timestamp[1]]::DWordWL
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_read + μ_write
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_read` | `Bit` | Whether we are performing a read (and hence return `old`) |
+| `μ_write` | `Bit` | Whether we are performing a write (and hence do not return `old`) |
+
+### Assumptions
+
+The following range checks are assumed to be performed/enforced outside of this chip:
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MEMW_R-A1.i` | i ∈ [0, 1] | `IS_WORD[val[i]]` |
+| `MEMW_R-A2.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+### Constraints
+
+Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the time. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that `old_timestamp[1] = timestamp[1]`; `MEMW_A` can be used for accesses where this is not the case.
+
+Verifying that `timestamp > old_timestamp` now simplifies to verifying that `timestamp[0] - old_timestamp[0] > 0`. For most accesses, this value will be small enough to fit in a `Half`. This chip thus enforces this by means of the following constraint:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW_R-C1` | `IS_HALF[timestamp[0] - old_timestamp[0] - 1]` | μ_sum |
+
+With `old_timestamp < timestamp` asserted, `old` is read from the register ([regw:c:read_old]) and `val` is written back ([regw:c:write_val]).
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW_R-C2.i` | i ∈ [0, 1] | `memory[1, [(2 * address + i)::Word, 0], old_timestamp, old[i]]` | μ_sum |
+| `MEMW_R-C3.i` | i ∈ [0, 1] | `memory[1, [(2 * address + i)::Word, 0], timestamp, val[i]]` | -μ_sum |
+
+This chip can either just write (`μ_write = 1`), or both read and write (`μ_read = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
+
+| Tag | Description |
+|-----|-------------|
+| `MEMW_R-C4` | `IS_BIT<μ_read>` |
+| `MEMW_R-C5` | `IS_BIT<μ_write>` |
+| `MEMW_R-C6` | `IS_BIT<μ_sum>` |
+
+Lastly, this chip contributes the following interactions to the logup:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW_R-C7` | `MEMW[[old[0], old[1], 0, 0, 0, 0, 0, 0]; 1, [(2 * address)::Word, 0], [val[0], val[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_read |
+| `MEMW_R-C8` | `MEMW[1, [(2 * address)::Word, 0], [val[0], val[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_write |
+
+### Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+## Notes/optimizations
+
+The following ideas may prove to be optimizations for the `MEMW`/`MEMW_A`/`MEMW_R` chips: (1) a `MEMB` chip that does a one-byte write, removing `old_timestamp` from here (uncertain tradeoffs); (2) adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups, which may make some GKR things faster if there are known zeroes; (3) for the register fast-path, upgrading the `IS_HALF` check to an `IS_B20` check for extended range, at the cost of looking through a larger table.
+
+---
+
+# DECODE Table
+
+All `RV64IMC` instructions are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table can be derived from the `packed_decode` variable provided below.
+
+## Variables
+
+The `DECODE` table is comprised of the variables listed below, together with the columns that express them:
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
+| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`; \ [27:35] `rs1`, \ [35:43] `rs2`, \ [43:51] `rd`, \ the remaining bits are set to zero.  |
+| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
+
+## Padding
+
+The `DECODE` table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
+
+Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see [cpu:c:ebreak_traps]), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
+
+## Decoding
+
+For the purposes of explaining decoding, we decompress `DECODE`'s `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
+| `rs1` | `Byte` | index of source register 1. |
+| `rs2` | `Byte` | index of source register 2. |
+| `rd` | `Byte` | index of destination register. |
+| `read_register1` | `Bit` | whether to load the contents of address `rs1` (1) or `0` (0) into `rv1`. |
+| `read_register2` | `Bit` | whether to load the contents of address `rs2` (1) or `0` (0) into `rv2`. |
+| `write_register` | `Bit` | whether the result should be written to `rd` ($=0$ for memory writes and when $`rd` = `x0`$). |
+| `mem_2B` | `Bit` | whether the memory access (read or write) touches exactly $2$ bytes. |
+| `mem_4B` | `Bit` | whether the memory access (read or write) touches exactly $4$ bytes. |
+| `mem_8B` | `Bit` | whether the memory access (read or write) touches exactly $8$ bytes. |
+| `c_type` | `Bit` | Whether the instruction is of type `C`, i.e., whether it is $2$ bytes long instead of $4$. |
+| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+| `signed` | `Bit` | selector used to indicate signed or unsigned input interpretation. |
+| `mp_selector` | `Bit` | Multi-purpose selector used by the CPU to configure several ALU operations in different ways.            See the `CPU` chip for more details. |
+| `muldiv_selector` | `Bit` | selects which output of `MUL` (lo/hi) or `DVRM` (quo/rem) is wanted. |
+| `word_instr` | `Bit` | Whether the instruction is a `*W` instruction, requiring the inputs and outputs to be (sign) extended. |
+| `ADD` | `Bit` | ALU selector flag |
+| `SUB` | `Bit` | ALU selector flag |
+| `SLT` | `Bit` | ALU selector flag |
+| `AND` | `Bit` | ALU selector flag |
+| `OR` | `Bit` | ALU selector flag |
+| `XOR` | `Bit` | ALU selector flag |
+| `SHIFT` | `Bit` | ALU selector flag |
+| `JALR` | `Bit` | ALU selector flag |
+| `BEQ` | `Bit` | ALU selector flag |
+| `BLT` | `Bit` | ALU selector flag |
+| `LOAD` | `Bit` | ALU selector flag |
+| `STORE` | `Bit` | ALU selector flag |
+| `MUL` | `Bit` | ALU selector flag |
+| `DIVREM` | `Bit` | ALU selector flag |
+| `ECALL` | `Bit` | ALU selector flag |
+| `EBREAK` | `Bit` | ALU selector flag |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
+
+We will illustrate how each instruction should be expressed in this (uncompressed) decoding table. The columns of the accompanying table represent the following: - *`operation`*: the assembly operation being encoded. - *`op-flag`*: which of the "`ALU` selector flags" operation flags to set. Each operation sets exactly one. - *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively. - *other*: the other flags that should be set or variables that should be given specific values.
+
+For the purpose of brevity and readability, the table uses the following rules-of-thumb: + `rd`, `rs1`, `rs2`, and `imm` are mapped to the values provided by the instruction; when a value is not specified by an instruction it defaults to `0`. + `read_register1`, `read_register2` and `write_register` are set to `1` when respectively ``rs1` != 0`, ``rs2` != 0`, or  ``rd` != 0`. + Any flag that is not listed is set to `0`, with the exception of the `c_type` flag. *The `c_type` flag is set independently of the below table*, as explained next.
+
+Further clarification is provided in the notes following the table.
+
+### C-type instructions
+
+The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. The execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
+
+/// Add a reference to one or more notes following this table.
+
+super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
+
+show figure: set block(breakable: true)
+
+figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), )) }
+
+// OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
+
+// Construct a note that can be referenced through `lbl`
+
+show figure: (it) => align(left, []) [ ] }
+
+### Notes
+
+We note the following about the above decoding table:
+
+enum.item( referenceable_note( "note_word_instr", [`word_instr`: `[W]` indicates that ``word_instr` = 1` for the `W`-variant of the operation, and `0` for the non-`W`-variant.] ), enum.item( referenceable_note( "note_signed", [`signed`: .not`[U]` indicates that ``signed` = 1` for the *non-`U`*-variant of the operation, and `0` for the `U`-variant.] ), enum.item( referenceable_note( "note-lui", [`LUI`: this operation loads the 20-bit `imm` in the upper bits of `rd`. Observe that this can be represented using `ADDI rd, x0, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-auipc", [`AUIPC`: this operation adds the 20-bit immediate to the upper bits of `pc` and stores the result in `rd`. Given that the `pc` is stored in `x255`, this operation can be represented using `ADDI rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-jal", [`JAL`: this operation stores ``pc` + 4` in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`. Note that this can be represented using `JALR rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[1:21]` of `imm` and extending it to 64 bits; the least significant bit should always be 0.*] ), enum.item( referenceable_note( "note-ecall", [`ECALL`: "On RISC-V a system call has its own instruction: `ECALL`. [...] A7 [= register `x17`] contains the system call number." [[source]] ] ), enum.item( referenceable_note( "note-fence", [`FENCE`: currently, the VM interprets this operation as `ADDI x0 x0 0`; a no-op.]
+
+### One more instruction <cpu-padding-decode-row>
+
+In addition to decoding all instructions provided in the ELF and adding a corresponding entry to the `DECODE` table, one must include an entry that has ``pc` = 1` and every other variable set to `0`. Note that this will never conflict with any entry in the ELF, since it has an odd `pc` value.
+
+This entry is used to pad the `CPU` table. More details on this matter are provided in the `CPU` chip.
+
+---
+
+# CPU Chip
+
+The `CPU` chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations. It bases its decisions on the entry of the `DECODE` table ([decode]) corresponding to the current program counter (PC).
+
+## Variables
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
+| `pc` | `DWordWL` | The program counter |
+| `rs1` | `Byte` | Source register 1 index |
+| `rs2` | `Byte` | Source register 2 index |
+| `rd` | `Byte` | Destination register index |
+| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
+| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
+| `write_register` | `Bit` | Whether to write back to the destination register |
+| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
+| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
+| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
+| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
+| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
+| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
+| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and     - as flag for inverting the condition of conditional branches (see `branch_cond`)     - as direction (left or right) for `SHIFT` |
+| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
+| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
+| `ADD` | `Bit` | One-hot ALU selector flag |
+| `SUB` | `Bit` | One-hot ALU selector flag |
+| `SLT` | `Bit` | One-hot ALU selector flag |
+| `AND` | `Bit` | One-hot ALU selector flag |
+| `OR` | `Bit` | One-hot ALU selector flag |
+| `XOR` | `Bit` | One-hot ALU selector flag |
+| `SHIFT` | `Bit` | One-hot ALU selector flag |
+| `JALR` | `Bit` | One-hot ALU selector flag |
+| `BEQ` | `Bit` | One-hot ALU selector flag |
+| `BLT` | `Bit` | One-hot ALU selector flag |
+| `LOAD` | `Bit` | One-hot ALU selector flag |
+| `STORE` | `Bit` | One-hot ALU selector flag |
+| `MUL` | `Bit` | One-hot ALU selector flag |
+| `DIVREM` | `Bit` | One-hot ALU selector flag |
+| `ECALL` | `Bit` | One-hot ALU selector flag |
+| `EBREAK` | `Bit` | One-hot ALU selector flag |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `next_pc` | `DWordWL` | The program counter for the next instruction |
+| `rvd` | `DWordWL` | The value to (maybe) be written back to `rd` |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `rv1` | `DWordWHH` | The value of register `rs1` |
+| `rv2` | `DWordWHH` | The value of register `rs2` |
+| `rv1_ext_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr` |
+| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
+| `rv2_ext_bit` | `Bit` | The sign bit of `rv2` if seen as a 32-bit word, used for sign extension with `word_instr` |
+| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
+| `res_ext_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word, used for sign extension with `word_instr` |
+| `res` | `DWordBL` | The ALU result |
+| `is_equal` | `Bit` | Whether `arg1` and `arg2` are equal |
+| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
+| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
+
+**Definition of `packed_decode`:**
+```
+packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
+```
+
+**Definition of `pad`:**
+```
+pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
+```
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
+| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+
+## Constraints
+
+First, we perform a decoding lookup for the current PC.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-C1` | `DECODE[pc, imm, packed_decode]` | 1 |
+
+### Range checks
+
+> **Note:** Make sure we argue for every column here
+
+> **Note:** is `rvd` still sufficiently constrained? (can also be done through the memory argument like `pc`?)
+
+We constrain all columns to have the appropriate ranges. The flags and register indices looked up from the decoding need to be checked, as they are communicated through the interaction in a packed form. In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`. Similarly, since `next_pc` will propagate through the memory argument and be looked up in the instruction decoding on the next cycle, it is forced to be in the correct range. For the auxiliary columns, we need to check the limbs of `arg1`, `arg2`, and `res`. The ranges of the other auxiliary columns are enforced through later constraints.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU-CR2` |  | `IS_BIT<read_register1>` |  |
+| `CPU-CR3` |  | `IS_BIT<read_register2>` |  |
+| `CPU-CR4` |  | `IS_BIT<write_register>` |  |
+| `CPU-CR5` |  | `IS_BIT<memory_2bytes>` |  |
+| `CPU-CR6` |  | `IS_BIT<memory_4bytes>` |  |
+| `CPU-CR7` |  | `IS_BIT<memory_8bytes>` |  |
+| `CPU-CR8` |  | `IS_BIT<c_type_instruction>` |  |
+| `CPU-CR9` |  | `IS_BIT<signed>` |  |
+| `CPU-CR10` |  | `IS_BIT<mp_selector>` |  |
+| `CPU-CR11` |  | `IS_BIT<muldiv_selector>` |  |
+| `CPU-CR12` |  | `IS_BIT<word_instr>` |  |
+| `CPU-CR13` |  | `IS_BIT<ADD>` |  |
+| `CPU-CR14` |  | `IS_BIT<SUB>` |  |
+| `CPU-CR15` |  | `IS_BIT<SLT>` |  |
+| `CPU-CR16` |  | `IS_BIT<AND>` |  |
+| `CPU-CR17` |  | `IS_BIT<OR>` |  |
 | `CPU-CR18` |  | `IS_BIT<XOR>` |  |
 | `CPU-CR19` |  | `IS_BIT<SHIFT>` |  |
 | `CPU-CR20` |  | `IS_BIT<JALR>` |  |
@@ -471,1215 +1037,1579 @@ We constrain all columns to have the appropriate ranges. The flags and register
 | `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` | 1 |
 | `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` | 1 |
 
-## ALU
+### ALU
+
+The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU-CA35` |  | ADD + LOAD ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `CPU-CA36` |  | STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, imm>` |  |
+| `CPU-CA37` |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
+| `CPU-CA38` |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
+| `CPU-CA39.i` | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
+| | | _polynomial:_ `(SLT + BLT) * res[i] = 0` | |
+| `CPU-CA40.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
+| `CPU-CA41.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
+| `CPU-CA42.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
+| `CPU-CA43` |  | `SHIFT[res::DWordWL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
+| `CPU-CA44` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
+| `CPU-CA45` |  | `MUL[res::DWordWL; arg1::DWordHL, signed, arg2::DWordHL, mp_selector, muldiv_selector]` | MUL |
+| `CPU-CA46` |  | `DVRM[res::DWordWL; arg1::DWordHL, arg2::DWordHL, signed, muldiv_selector]` | DIVREM |
+
+### Memory
+
+This section handles the interactions with memory, both for loading and storing registers and for `LOAD` and `STORE` instructions. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The timestamps are ensured to be disjoint for disjoint memory locations. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU-CM47` |  | `MEMW[['arr', ['idx', ['cast', 'rv1', 'DWordWL'], 0], ['idx', ['cast', 'rv1', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs1, ['arr', ['idx', ['cast', 'rv1', 'DWordWL'], 0], ['idx', ['cast', 'rv1', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
+| `CPU-CM48.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
+| | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
+| `CPU-CM49` |  | `MEMW[['arr', ['idx', ['cast', 'rv2', 'DWordWL'], 0], ['idx', ['cast', 'rv2', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs2, ['arr', ['idx', ['cast', 'rv2', 'DWordWL'], 0], ['idx', ['cast', 'rv2', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
+| `CPU-CM50.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
+| | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
+| `CPU-CM51` |  | `MEMW[1, 2::DWordWL * rd, ['arr', ['idx', 'rvd', 0], ['idx', 'rvd', 1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
+| `CPU-CM52` |  | `LOAD[rvd; res::DWordWL, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
+| `CPU-CM53` |  | `MEMW[0, res::DWordWL, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
+| `CPU-CM54` |  | `MEMW[['arr', ['idx', 'pc', 0], ['idx', 'pc', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 255)::DWordWL, ['arr', ['idx', 'next_pc', 0], ['idx', 'next_pc', 1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | 1 - pad |
+
+### System
+
+The interactions with the wider system.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-CS55` | `!EBREAK` |  |
+| | _polynomial:_ `1 - EBREAK = 0` | |
+| `CPU-CS56` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
+
+### Input and output to the ALU
+
+We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, including the appropriate sign/zero extension, depending on `word_instr`.
+
+| Tag | Description |
+|-----|-------------|
+| `CPU-CE57` | `SIGN<rv1_ext_bit; rv1[1], word_instr>` |
+| `CPU-CE58` | `arg1[:4]` = `rv1[:2]` |
+| | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` |
+| `CPU-CE59` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_ext_bit` dot `signed` |
+| | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_ext_bit * (2^32 - 1) = 0` |
+| `CPU-CE60` | `SIGN<rv2_ext_bit; rv2[1], word_instr>` |
+| `CPU-CE61` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |
+| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` |
+| `CPU-CE62` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `rv2_ext_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |
+| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * rv2_ext_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` |
+| `CPU-CE63` | `SIGN<res_ext_bit; (res::DWordHL)[1], word_instr>` |
+| `CPU-CE64` | `!LOAD` => `rvd[0]` = `res[:4]` |
+| | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` |
+| `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_ext_bit` dot (2^(32) - 1) |
+| | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_ext_bit * (2^32 - 1)) = 0` |
+
+### Other constraints
+
+For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference between `arg1` and `arg2` whenever `BEQ` is `1`. Given that this difference is `0` when both are equal, [cpu:c:is_equal] ensures `is_equal` is set to `1` if and only if ``arg1` = `arg2`` and `BEQ` is set.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-CO66` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
+| `CPU-CO67` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
+| | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
+| `CPU-CO68` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
+| `CPU-CO69` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
+
+> **Note:** Document the choice to not have a multiplicity column here for padding
+
+## Padding
+
+The CPU can be padded with the following values, which have a corresponding row in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
+
+This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
+
+---
+
+# SHIFT Chip
+
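+The `SHIFT` chip is designed to constrain that $
+`out` = cases( `in` `<<` s & "if" `direction` = 0, `in` `>>` s & "if" `direction` = 1 "and" `signed` = 0, `in` `>>>` s & "if" `direction` = 1 "and" `signed` = 1 ),
+$ where $
+s := `shift` mod (32 dot (2 - `word_instr`)).
+$ Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.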
+
+## Variables
+
+The `SHIFT` chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `in` | `DWordHL` | The value being shifted |
+| `shift` | `Byte` | Number of bits to shift `in` by. |
+| `direction` | `Bit` | Whether to shift left (0) or right (1). |
+| `signed` | `Bit` | Whether to interpret `in` as a signed integer. |
+| `word_instr` | `Bit` | Whether this is a Word-instruction (1) or not (0). |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `out` | `DWordWL` | $`in <</>>/>>>` (`shift` mod 32 dot (2 - `word_instr`))$ |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `is_negative` | `Bit` | Whether `in` is negative |
+| `bit_shift` | `Byte` | Value by which to shift `in` to obtain `X` and `Y` |
+| `zbs` | `Bit` | Whether `bit_shift` is zero (1) or not (0). |
+| `X` | `Half[5]` | scratch variable. |
+| `Y` | `Half[4]` | scratch variable. |
+| `limb_shift_raw` | `Bit[3]` | One-hot vector indicating whether $floor.l `shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $`word_instr` = 1$ and $4$ otherwise. These columns store the first 3 values, and the 4th is derived from the one-hot property. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `limb_shift` | `Bit[4]` |  |
+| `extension` | `Half` | sign extension of `in`. |
+| `left` | `Bit` | Whether to perform a left-shift. |
+| `right` | `Bit` | Whether to perform a right-shift. |
+| `intra_limb_left` | `DWordHL` | `in << (shift % 16)` if `left` |
+| `intra_limb_right` | `DWordHL` | `in >>> (shift % 16)` if `right` and `signed`;\ `in >> (shift % 16)` if `right` and `!signed` |
+| `shifted` | `DWordHL` | $`in <</>>/>>>` (`shift` mod 32 dot (2 - `word_instr`))$ |
+
+**Definition of `limb_shift`:**
+```
+limb_shift (when iter=[0, 2]) := limb_shift_raw[i]
+limb_shift (when iter=3) := 1 - Σ_j = 0^2 limb_shift_raw[j]
+```
+
+**Definition of `extension`:**
+```
+extension := 65535 * is_negative
+```
+
+**Definition of `left`:**
+```
+left := μ - direction
+```
+
+**Definition of `right`:**
+```
+right := direction
+```
+
+**Definition of `intra_limb_left`:**
+```
+intra_limb_left (when iter=0) := X[0]
+intra_limb_left (when iter=[1, 3]) := X[i] + Y[i - 1]
+```
+
+**Definition of `intra_limb_right`:**
+```
+intra_limb_right := Y[i] + X[i + 1]
+```
+
+**Definition of `shifted`:**
+```
+shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (Σ_j = 0^3 - i limb_shift[j] * intra_limb_right[i + j] + extension * Σ_j = 4 - i^3 limb_shift[j])
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALF[in[i]]` |
+| `SHIFT-A2` |  | `IS_BYTE[shift]` |
+| `SHIFT-A3` |  | `IS_BIT<direction>` |
+| `SHIFT-A4` |  | `IS_BIT<signed>` |
+| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
+
+## Explanation
+
+This chip has a rather complex design as a result of designing it to fit in as few columns as possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
+
+The chip's design revolves around a two-phase shifting process: 1. shift `in` by `x := `shift` mod 16` bits, 2. shift that result by `(`shift`-x) mod 64` (or `mod 32` if ` `word_instr` = 1`). The intermediate value representing the state between the two phases is stored in the scratch variables `X` and `Y`. The definition of `shifted` describes how one can combine the `X`, `Y` and `extension` variables to construct the output value as described using `Half`-limbs. The output variable `out` is equivalent to `shifted`, but expressed using `Word`-limbs.
+
+In the following, we cover how these two phases were designed to complement one another. Here, we start with discussing the _logical_ left/right shift operations only; the modifications required to compute the _arithmetic_ right shift will be discussed at the end.
+
+### First phase
+
+We zoom in on the first step. Here, we make use of the lookup operation `HWSL` (short for "HalfWord Shift Left"): ` `HWSL[x: Half, y: B4]` := [(`x` `<<` `y`) mod 2^16, `x` `>>` (16 - `y`)]. ` One can use this to compute `out: Half[4] := in << y` as: $
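+`out[i]` = `HWSL[in[i], y]`_0 + `HWSL[in[i-1], y]`_1 quad "for" i in [0, 3], "where" `HWSL[in[-1], y]`_1 := 0,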
+$ as long as ``y` < 16`. Observing that ``HWSL[x,` 16-`y]`_0 = (`x` `<<` (16-`y`)) mod 2^16`, and ``HWSL[x,` 16-`y]`_1 = `x` `>>` `y`` for ``y` in [1, 15]`, one can also use it to compute `out := in >> y` as $
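+`out[i]` = `HWSL[in[i], 16-y]`_1 + `HWSL[in[i+1], 16-y]`_0 quad "for" i in [0, 3], "where" `HWSL[in[4], 16-y]`_0 := 0,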
+$ as long as `0 < `y` < 16`.
+
+Observe now that the values being looked up are (almost) independent from the direction of the shift: only the shift-amount varies slightly. When we now define $
+`bit_shift` := cases( `shift` mod 16 & "when shifting left",
+(16-`shift`) mod 16 & "when shifting right" ), $ it only takes some rearranging and combining of the values ``X[`i`] := HWSL[in[`i`], bit_shift]`_0` and ``Y[`i`] := HWSL[in[`i`], bit_shift]`_1` to form the limbs of ``in <</>> shift` mod 16`. In the remaining case that ``right` = 1` and ``shift` = 0 mod 16`, the limbs of ``in <</>> shift` mod 16` simply match those of `in`.
 
-The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
+### Second phase
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `CPU-CA35` |  | ADD + LOAD ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `CPU-CA36` |  | STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, imm>` |  |
-| `CPU-CA37` |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `CPU-CA38` |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
-| `CPU-CA39.i` | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
-| | | _polynomial:_ `(SLT + BLT) * res[i] = 0` | |
-| `CPU-CA40.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
-| `CPU-CA41.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
-| `CPU-CA42.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
-| `CPU-CA43` |  | `SHIFT[res::DWordWL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
-| `CPU-CA44` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
-| `CPU-CA45` |  | `MUL[res::DWordWL; arg1::DWordHL, signed, arg2::DWordHL, mp_selector, muldiv_selector]` | MUL |
-| `CPU-CA46` |  | `DVRM[res::DWordWL; arg1::DWordHL, arg2::DWordHL, signed, muldiv_selector]` | DIVREM |
+Since we're operating on 16-bit limbs, all the limbs in ``in <</>> shift`` must also occur somewhere in ``in <</>> shift` mod 16`. The number of full-limbs we still need to shift is determined by the fifth and sixth least significant bits of `shift`. With `limb_shift` containing a unary encoding of the integer represented by these two bits, we find that the intermediate value needs to be shifted over by `i` limbs (to the `left` or `right`) when ``limb_shift[`i`]` = 1`. These things combined yield `shifted`'s definition.
 
-## Memory
+Of course, when ``word_instr` = 1` and, thus, only ``shift` mod 32` should be considered, the bit-mask for the lookup constraining `limb_shift` is adjusted appropriately (see [shift:c:limb_shift_lookup]).
 
-The interactions with the memory, both for register loading and storing, as for `LOAD` and `STORE` instructions are handled. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The timestamps are ensured to be disjoint for disjoint memory locations. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction.
+### Arithmetic right shift
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `CPU-CM47` |  | `MEMW[['arr', ['idx', ['cast', 'rv1', 'DWordWL'], 0], ['idx', ['cast', 'rv1', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs1, ['arr', ['idx', ['cast', 'rv1', 'DWordWL'], 0], ['idx', ['cast', 'rv1', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
-| `CPU-CM48.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
-| | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
-| `CPU-CM49` |  | `MEMW[['arr', ['idx', ['cast', 'rv2', 'DWordWL'], 0], ['idx', ['cast', 'rv2', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs2, ['arr', ['idx', ['cast', 'rv2', 'DWordWL'], 0], ['idx', ['cast', 'rv2', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
-| `CPU-CM50.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
-| | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
-| `CPU-CM51` |  | `MEMW[1, 2::DWordWL * rd, ['arr', ['idx', 'rvd', 0], ['idx', 'rvd', 1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
-| `CPU-CM52` |  | `LOAD[rvd; res::DWordWL, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
-| `CPU-CM53` |  | `MEMW[0, res::DWordWL, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
-| `CPU-CM54` |  | `MEMW[['arr', ['idx', 'pc', 0], ['idx', 'pc', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 255)::DWordWL, ['arr', ['idx', 'next_pc', 0], ['idx', 'next_pc', 1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | 1 - pad |
+Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `extension` is constrained to contain a repetition of `in`'s most significant bit. Copies of this variable are used for any full limbs shifted in when ``right` = `signed` = 1`. Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of ``in >>> shift` mod 16` as the appropriate intermediate.
 
-## System
+## Constraints
 
-The interactions with the wider system.
+First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that ``right` = 1` and ``shift` = 0 mod 16`.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CS55` | `!EBREAK` |  |
-| | _polynomial:_ `1 - EBREAK = 0` | |
-| `CPU-CS56` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
+| `SHIFT-C1` | `AND_BYTE[bit_shift; shift, 15]` | left |
+| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - 16 * zbs - shift, 15]` | right |
+| `SHIFT-C3` | `ZERO[zbs; bit_shift]` | μ |
 
-## Input and output to the ALU
+Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
 
-We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, including the appropriate sign/zero extension, depending on `word_instr`.
+The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows. To prevent unnecessary lookups in padding rows, we override ``X[i]` := `in[i]`` and ``Y[i]` := 0` here.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[['arr', ['idx', 'X', 'i'], ['idx', 'Y', 'i']]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C5.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
+| | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
+| `SHIFT-C6.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
+| `SHIFT-C7` |  | `HWSL[['arr', ['idx', 'X', 4], ['-', 'extension', ['idx', 'X', 4]]]; extension, bit_shift]` | 1 - zbs |
+| `SHIFT-C8` |  | `zbs` => `X[4]` = 0 |  |
+| | | _polynomial:_ `zbs * X[4] = 0` | |
+
+### Full-limb shifting
+
+Next, we constrain that `limb_shift` is a proper unary encoding of the fifth (and sixth if ``word_instr` = 0`) bit of `shift`. For this to be the case, three requirements must be satisfied: + *unary(0)*: ``limb_shift[`i`]` in {0, 1}` for `i in [0, 3]`, + *unary(1)*: ``limb_shift[`i`]` = 1` for exactly one `i`, and + *proper encoding*: ``limb_shift[`i`]` = 1 <=> 1/16 (`shift &` (48-32 dot `word_instr`)) = i`. The first requirement is enforced by constraint [shift:c:limb_shift_is_bit]. To construct a constraint for the second and third requirement, observe that $ 1/16 dot (`shift &` (48-32 dot `word_instr`)) in cases( {0, 1, 2, 3} &"if" `word_instr` = 0, {0, 1} &"if" `word_instr` = 1 ) $ Observe moreover that, assuming *unary(0)*, the expression $ 1/16 dot (1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]`) $ can evaluate to `i` if and only if ``limb_shift[`i`]` = 1`, while the others are `0`. This means that the relation $ 1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]` = `shift &` (48-32 dot `word_instr`) $ enforces both *unary(1)* and *proper encoding*. This is the exact relation [shift:c:limb_shift_lookup] enforces.
+
+Hereafter, one must only check that `out` is the proper cast of `shifted` into a `DWordWL`.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C9.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `SHIFT-C10` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
+| `SHIFT-C11.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+| | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
+
+### Miscellaneous
 
 | Tag | Description |
 |-----|-------------|
-| `CPU-CE57` | `SIGN<rv1_ext_bit; rv1[1], word_instr>` |
-| `CPU-CE58` | `arg1[:4]` = `rv1[:2]` |
-| | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` |
-| `CPU-CE59` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_ext_bit` dot `signed` |
-| | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_ext_bit * (2^32 - 1) = 0` |
-| `CPU-CE60` | `SIGN<rv2_ext_bit; rv2[1], word_instr>` |
-| `CPU-CE61` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |
-| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` |
-| `CPU-CE62` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `rv2_ext_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |
-| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * rv2_ext_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` |
-| `CPU-CE63` | `SIGN<res_ext_bit; (res::DWordHL)[1], word_instr>` |
-| `CPU-CE64` | `!LOAD` => `rvd[0]` = `res[:4]` |
-| | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` |
-| `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_ext_bit` dot (2^(32) - 1) |
-| | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_ext_bit * (2^32 - 1)) = 0` |
+| `SHIFT-C12` | `direction` => `μ` = 1 |
+| | _polynomial:_ `direction * (1 - μ) = 0` |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHIFT-C13` | `MSB16[is_negative; in[3]]` | signed |
 
-## Other constraints
+*Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
 
-For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference between `arg1` and `arg2` whenever `BEQ` is `1`. Given that this difference is `0` when both are equal, [cpu:c:is_equal] ensures `is_equal` is set to `1` if and only if ``arg1` = `arg2`` and `BEQ` is set.
+### Lookups
+
+This chip adds the following interaction to the lookup.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CO66` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
-| `CPU-CO67` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
-| | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| `CPU-CO68` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
-| `CPU-CO69` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
+| `SHIFT-C14` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
 
-> **Note:** Document the choice to not have a multiplicity column here for padding
+## Padding
 
-= Padding
+The table can be padded to the next power of two with the following value assignments:
 
-The CPU can be padded with the following values, which have a corresponding row in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
+---
 
-This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
+# BRANCH Chip
 
-## Columns
+The `BRANCH` chip computes the target address of a branching instruction.
+
+## Variables
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 ### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
-| `pc` | `DWordWL` | The program counter |
-| `rs1` | `Byte` | Source register 1 index |
-| `rs2` | `Byte` | Source register 2 index |
-| `rd` | `Byte` | Destination register index |
-| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
-| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
-| `write_register` | `Bit` | Whether to write back to the destination register |
-| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
-| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
-| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
-| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
-| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
-| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
-| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and     - as flag for inverting the condition of conditional branches (see `branch_cond`)     - as direction (left or right) for `SHIFT` |
-| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
-| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
-| `ADD` | `Bit` | One-hot ALU selector flag |
-| `SUB` | `Bit` | One-hot ALU selector flag |
-| `SLT` | `Bit` | One-hot ALU selector flag |
-| `AND` | `Bit` | One-hot ALU selector flag |
-| `OR` | `Bit` | One-hot ALU selector flag |
-| `XOR` | `Bit` | One-hot ALU selector flag |
-| `SHIFT` | `Bit` | One-hot ALU selector flag |
-| `JALR` | `Bit` | One-hot ALU selector flag |
-| `BEQ` | `Bit` | One-hot ALU selector flag |
-| `BLT` | `Bit` | One-hot ALU selector flag |
-| `LOAD` | `Bit` | One-hot ALU selector flag |
-| `STORE` | `Bit` | One-hot ALU selector flag |
-| `MUL` | `Bit` | One-hot ALU selector flag |
-| `DIVREM` | `Bit` | One-hot ALU selector flag |
-| `ECALL` | `Bit` | One-hot ALU selector flag |
-| `EBREAK` | `Bit` | One-hot ALU selector flag |
+| `pc` | `DWordWL` | The current pc, used as base address when `!JALR` |
+| `offset` | `DWordWL` | The offset from the base address to jump to |
+| `register` | `DWordWL` | The base address to use when `JALR` |
+| `JALR` | `Bit` | Selects between `pc` and `register` as base address, needed for the `JALR` instruction |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `next_pc` | `DWordWL` | The program counter for the next instruction |
-| `rvd` | `DWordWL` | The value to (maybe) be written back to rvd |
+| `next_pc_high` | `Half[3]` | The upper part of the next pc |
+| `next_pc_low` | `Byte[2]` | The lower part of the next pc |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `rv1` | `DWordWHH` | The value of register `rs1` |
-| `rv2` | `DWordWHH` | The value of register `rs2` |
-| `rv1_ext_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
-| `rv2_ext_bit` | `Bit` | The sign bit of `rv2` if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
-| `res_ext_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `res` | `DWordBL` | The ALU result |
-| `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
-| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
+| `unmasked_low_byte` | `Byte` | The low byte of the next pc, before masking the LSB. Used to constrain the raw addition. |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
-| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
-| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
+| `next_pc_unmasked` | `DWordWL` | The combination of `next_pc_high`, `next_pc_low[1]` and `unmasked_low_byte` to constrain the addition. This is the computed value for the next pc, before masking off the LSB as required by the ISA. |
+| `next_pc` | `DWordWL` | The computed next pc, after masking off the LSB as required by the ISA. |
 
-**Definition of `packed_decode`:**
+**Definition of `next_pc_unmasked`:**
 ```
-packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
+next_pc_unmasked (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + unmasked_low_byte
+next_pc_unmasked (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 ```
 
-**Definition of `pad`:**
+**Definition of `next_pc`:**
 ```
-pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
+next_pc (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + next_pc_low[0]
+next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 ```
 
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
 ## Assumptions
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
-| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+| `BRANCH-A1.i` | i ∈ [0, 1] | `pc` is range checked, `IS_WORD[pc[i]]` |
+| `BRANCH-A2` |  | `offset` is range checked, `IS_WORD[offset]` |
+| `BRANCH-A3.i` | i ∈ [0, 1] | `register` is range checked, `IS_WORD[register[i]]` |
+| `BRANCH-A4` |  | `IS_BIT<JALR>` |
+
+## Constraints
+
+We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
+
+The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
+| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
+| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
+| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte, 254]` | μ |
+| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALF[next_pc_high[i]]` | μ |
+
+This chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
 
 ---
 
-# SHIFT Chip
+# LT Chip
 
-The  chip is designed to constrain that $
+The `LT` chip constrains an indicator bit for the less-than relation, signed or unsigned.
 
-$ $
+## Variables
 
-$ Here, `<<` and `>>` denote the _logical_ left and right shift operations, while `>>>` denotes the _arithmetic_ right shift operation.
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+
+### Input
 
-= Variables
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs` | `DWordHHW` | The left operand |
+| `rhs` | `DWordHHW` | The right operand |
+| `signed` | `Bit` | whether to interpret `lhs` and `rhs` as signed integers (1) or not (0) |
 
-The `SHIFT` chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+### Output
 
-= Assumptions
+| Name | Type | Description |
+|------|------|-------------|
+| `lt` | `Bit` | Whether $`lhs` < `rhs`$, taking `signed` into account |
 
-= Explanation This chip has a rather complex design as a result of designing it to fit in as few columns possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
+### Auxiliary
 
-The chip's design revolves around a two-phase shifting process: 1. shift `in` by `x := `shift` mod 16` bits, 2. shift that result by `(`shift`-x) mod 64` (or `mod 32` if ` `word_instr` = 1`). The intermediate value representing the state between the two phases is stored in the scratch variables `X` and `Y`. The definition of `shifted` describes how one can combine the `X`, `Y` and `extension` variables to construct the output value as described using `Half`-limbs. The output variable `out` is equivalent to `shifted`, but expressed using `Word`-limbs.
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs_sub_rhs` | `DWordHL` | $`lhs` - `rhs`$ |
+| `lhs_msb` | `Bit` | The most significant bit of `lhs` |
+| `rhs_msb` | `Bit` | The most significant bit of `rhs` |
 
-In the following, we cover how these two phases were designed to complement one another. Here, we start with discussing the _logical_ left/right shift operations only; the modifications required to compute the _arithmetic_ right shift will be discussed at the end.
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `carry` | `Bit[2]` | The carry for adding `lhs_sub_rhs` back to `rhs` |
+| `unsigned_lt` | `Bit` | Whether $`lhs` < `rhs`$, as unsigned integers |
+
+**Definition of `carry`:**
+```
+carry (when iter=0) := 2^-32 * (rhs[0] + (lhs_sub_rhs::DWordWL)[0] - lhs[0])
+carry (when iter=1) := 2^-32 * ((rhs::DWordWL)[1] + (lhs_sub_rhs::DWordWL)[1] + carry[0] - (lhs::DWordWL)[1])
+```
+
+**Definition of `unsigned_lt`:**
+```
+unsigned_lt := carry[1]
+```
 
-## First phase
+### Multiplicity
 
-We zoom in on the first step. Here, we make use of the lookup operation `HWSL` (short for "HalfWord Shift Left"): ` `HWSL[x: Half, y: B4]` := [(`x` `<<` `y`) mod 2^16, `x` `>>` (16 - `y`)]. ` One can use this to compute `out: Half[4] := in << y` as: $
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
 
-$ as long as ``y` < 16`. Observing that ``HWSL[x,` 16-`y]`_0 = (`x` `<<` (16-`y`)) mod 2^16`, and ``HWSL[x,` 16-`y]`_1 = `x` `>>` `y`` for ``y` in [1, 15]`, one can also use it to compute `out := in >> y` as $
+## Assumptions
 
-$ as long as `0 < `y` < 16`.
+We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
 
-Observe now that the values being looked up are (almost) independent from the direction of the shift: only the shift-amount varies slightly. When we now define $
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `LT-A1` |  | `IS_WORD[lhs[0]]` |
+| `LT-A2` |  | `IS_WORD[rhs[0]]` |
+| `LT-A3` |  | `IS_BIT<signed>` |
 
-(16-`shift`) mod 16 & "when shifting right" ), $ it only takes some rearranging and combining of the values ``X[`i`] := HWSL[in[`i`], bit_shift]`_0` and ``Y[`i`] := HWSL[in[`i`], bit_shift]`_1` to form the limbs of ``in <</>> shift` mod 16`. In the remaining case that ``right` = 1` and ``shift` = 0 mod 16`, the limbs of ``in <</>> shift` mod 16` simply match those of `in`.
+## Constraints
 
-## Second phase
+We first constrain that all variables correspond to their definition. For the defining constraint of `lt`, [lt:c:lt], observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
 
-Since we're operating on 16-bit limbs, all the limbs in ``in <</>> shift`` must also occur somewhere in ``in <</>> shift` mod 16`. The number of full-limbs we still need to shift is determined by the fifth and sixth least significant bit of `shift`. With `limb_shift` containing a unary decoding of the integer represented by these two bits, we find that the intermediate value needs to be shifted over by `i` limbs (to the `left` or `right`) when ``limb_shift[`i`]` = 1`. These things combined yield `shifted`'s definition.
+We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a non-negative one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
 
-Of course, when ``word_instr` = 1` and, thus, only ``shift` mod 32` should be considered, the bit-mask for the lookup constraining `limb_shift` is adjusted appropriately (see [shift:c:limb_shift_lookup]).
++ `dash(A) and B and (a < b)` + `A and dash(B) and (a < b)` + `A and B and (a < b)` + `dash(A) and dash(B) and (a < b)`
 
-## Arithmetic right shift
+The first case is evidently false, while the second case simplifies to `A and dash(B)`. For the third and fourth cases, observe that when `A = B`, the `<` relation is preserved by the modular correspondence between `[-2^(63), 2^(63))` and `[0, 2^(64))`. Importantly, this modular correspondence is merely a reinterpretation of the bits or values of `a` and `b`, due to the representation in two's complement. Hence, we can introduce the value `C = `unsigned_lt``, that accurately represents the relation `a < b` when `A = B`.
 
-Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `extension` is constrained to contain a repetition of `in`'s most significant bit. Copies of this variable are used for any full limbs shifted in when ``right` = `signed` = 1`. Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of ``in >>> shift` mod 16` as the appropriate intermediate.
+Combining our three remaining cases, we obtain the boolean formula `A dash(B) or A B C or dash(A) dash(B) C`. Since the cases are disjoint, this can be computed with the binary-valued polynomial `P(A, B, C) = A (1 - B) + A B C + (1 - A) (1 - B) C`.
 
-= Constraints First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that ``right` = 1` and ``shift` = 0 mod 16`.
+The polynomial `P` can be simplified to a total degree of two. We claim that the polynomial `Q(A, B, C) = A (1 - B) + A C + (1 - B) C` is, for the purposes of this chip, equivalent to `P`. An exhaustive check shows that `P(A, B, C) != Q(A, B, C)` only for the triple `(A, B, C) = (1, 0, 1)`. This is, however, impossible due to the correctness of `ADD`. In more detail, if we let `s` be the (range-checked) difference `a - b` (so the equivalent of the `lhs_sub_rhs` column), and `x'` denote the most significant word of a variable `x`, we need `C dot 2^32 + a' = b' + s' + `carry[0]``, by the definition of `carry`. However, the left hand side of this is at least `3 dot 2^31`, as `(A, C) = (1, 1)`, and the right hand side is at most `(2^31 - 1) + (2^32 - 1) + 1 = 3 dot 2^31 - 1`, as `B = 0`. Therefore, we can use `Q` to constrain `lt` when `signed = 1`.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C1` | `AND_BYTE[bit_shift; shift, 15]` | left |
-| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - 16 * zbs - shift, 15]` | right |
-| `SHIFT-C3` | `ZERO[zbs; bit_shift]` | μ |
-
-Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
-
-The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows. To prevent unnecessary lookups in padding rows, we override ``X[i]` := `in[i]`` and ``Y[i]` := 0` here.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[['arr', ['idx', 'X', 'i'], ['idx', 'Y', 'i']]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C5.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
-| | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
-| `SHIFT-C6.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
-| | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
-| `SHIFT-C7` |  | `HWSL[['arr', ['idx', 'X', 4], ['-', 'extension', ['idx', 'X', 4]]]; extension, bit_shift]` | 1 - zbs |
-| `SHIFT-C8` |  | `zbs` => `X[4]` = 0 |  |
-| | | _polynomial:_ `zbs * X[4] = 0` | |
-
-## Full-limb shifting
-
-Next, we constrain that `limb_shift` is a proper unary encoding of the fifth (and sixth if ``word_instr` = 0`) bit of `shift`. For this to be the case, three requirements must be satisfied: + *unary(0)*: ``limb_shift[`i`]` in {0, 1}` for `i in [0, 3]`, + *unary(1)*: ``limb_shift[`i`]` = 1` for exactly one `i`, and + *proper encoding*: ``limb_shift[`i`]` = 1 <=> 1/16 (`shift &` (48-32 dot `word_instr`)) = i` The first requirement is enforced by constraint [shift:c:limb_shift_is_bit]. To construct a constraint for the second and third requirement, observe that $ 1/16 dot (`shift &` (48-32 dot `word_instr`)) in cases( {0, 1, 2, 3} &"if" `word_instr` = 0, {0, 1} &"if" `word_instr` = 1 $ Observe moreover that, assuming *unary(0)*, the expression $ 1/16 dot (1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]`) $ can evaluate to `i` if and only if ``limb_shift[`i`]` = 1`, while the others are `0`. This means that the relation $ 1 + sum_(i=0)^3 (16i-1) dot `limb_shift[`i`]` = `shift &` (48-32 dot `word_instr`) $ enforces both *unary(1)* and *proper encoding*. This is the exact relation [shift:c:limb_shift_lookup] enforces.
+| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
+| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
+| `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
+| | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
+| `LT-C4` | `IS_HALF[lhs[1]]` | μ |
+| `LT-C5` | `IS_HALF[rhs[1]]` | μ |
 
-Hereafter, one must only check that `out` is the proper cast of `shifted` into a `DWordWL`.
+And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C9.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
-| `SHIFT-C10` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
-| `SHIFT-C11.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
-| | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
-
-## Miscellaneous
+| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
+| `LT-C7.i` | i ∈ [0, 3] | `IS_HALF[lhs_sub_rhs[i]]` | μ |
 
-| Tag | Description |
-|-----|-------------|
-| `SHIFT-C12` | `direction` => `μ` = 1 |
-| | _polynomial:_ `direction * (1 - μ) = 0` |
+The chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C13` | `MSB16[is_negative; in[3]]` | signed |
+| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
 
-*Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
+## Padding
 
-## Lookups
+The table can be padded to the next power of two with the following value assignments:
 
-This chip adds the following interaction to the lookup.
+---
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `SHIFT-C14` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
+# MUL Chip
 
-= Padding
+The `MUL` chip constrains multiplication, both signed and unsigned, and provides access to the low and high halves of the multiplication result.
 
-The table can be padded to the next power of two with the following value assignments:
+## Variables
 
-## Columns
+The `MUL` chip is comprised of the variables listed below, which are expressed using columns, and leverages interaction(s):
 
 ### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `in` | `DWordHL` | The value being shifted |
-| `shift` | `Byte` | Number of bits to shift `in` by. |
-| `direction` | `Bit` | Whether to shift left (0) or right (1). |
-| `signed` | `Bit` | Whether to interpret `in` as a signed integer. |
-| `word_instr` | `Bit` | Whether this is a Word-instruction (1) or not (0). |
+| `lhs` | `DWordHL` | the left-hand operand. |
+| `lhs_signed` | `Bit` | whether to interpret `lhs` as a signed integer (1) or not (0). |
+| `rhs` | `DWordHL` | the right-hand operand. |
+| `rhs_signed` | `Bit` | whether to interpret `rhs` as a signed integer (1) or not (0). |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `out` | `DWordWL` | $`in <</>>/>>>` (`shift` mod 32 dot (2 - `word_instr`))$ |
+| `lo` | `DWordHL` | the lower limbs of the (extended) multiplication result |
+| `hi` | `DWordHL` | the upper limbs of the (extended) multiplication result |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `is_negative` | `Bit` | Whether `in` is negative |
-| `bit_shift` | `Byte` | Value by which to shift `in` to obtain `X` and `Y` |
-| `zbs` | `Bit` | Whether `bit_shift` is zero (1) or not (0). |
-| `X` | `Half[5]` | scratch variable. |
-| `Y` | `Half[4]` | scratch variable. |
-| `limb_shift_raw` | `Bit[3]` | One-hot vector indicating whether $floor.l `shift` / 16 floor.r equiv i mod s$, where $s = 2$ when $`word_instr` = 1$ and $4$ otherwise. These columns store the first 3 values, and the 4th is derived from the one-hot property. |
+| `lhs_is_negative` | `Bit` | whether `lhs` is negative (1) or not (0) |
+| `rhs_is_negative` | `Bit` | whether `rhs` is negative (1) or not (0) |
+| `raw_product` | `B51[4]` | raw multiplication output |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
-| `limb_shift` | `Bit[4]` |  |
-| `extension` | `Half` | sign extension of `in`. |
-| `left` | `Bit` | Whether to perform a left-shift. |
-| `right` | `Bit` | Whether to perform a right-shift. |
-| `intra_limb_left` | `DWordHL` | `in << (shift % 16)` if `left` |
-| `intra_limb_right` | `DWordHL` | `in >>> (shift % 16)` if `right` and `signed`;\ `in >> (shift % 16)` if `right` and `!signed` |
-| `shifted` | `DWordHL` | $`in <</>>/>>>` (`shift` mod 32 dot (2 - `word_instr`))$ |
-
-**Definition of `limb_shift`:**
-```
-limb_shift (when iter=[0, 2]) := limb_shift_raw[i]
-limb_shift (when iter=3) := 1 - Σ_j = 0^2 limb_shift_raw[j]
-```
-
-**Definition of `extension`:**
-```
-extension := 65535 * is_negative
-```
+| `lhs_ext` | `Half[8]` | sign-extended value of `lhs` |
+| `rhs_ext` | `Half[8]` | sign-extended value of `rhs` |
+| `res` | `QuadWL` | concatenation of `lo` and `hi`. |
+| `carry` | `B20[4]` | carry values |
+| `μ_sum` | `BaseField` | sum of multiplicities |
 
-**Definition of `left`:**
+**Definition of `lhs_ext`:**
 ```
-left := μ - direction
+lhs_ext (when iter=[0, 3]) := lhs[i]
+lhs_ext (when iter=[4, 7]) := 65535 * lhs_is_negative
 ```
 
-**Definition of `right`:**
+**Definition of `rhs_ext`:**
 ```
-right := direction
+rhs_ext (when iter=[0, 3]) := rhs[i]
+rhs_ext (when iter=[4, 7]) := 65535 * rhs_is_negative
 ```
 
-**Definition of `intra_limb_left`:**
+**Definition of `res`:**
 ```
-intra_limb_left (when iter=0) := X[0]
-intra_limb_left (when iter=[1, 3]) := X[i] + Y[i - 1]
+res (when iter=[0, 1]) := (lo::DWordWL)[i]
+res (when iter=[2, 3]) := (hi::DWordWL)[i - 2]
 ```
 
-**Definition of `intra_limb_right`:**
+**Definition of `carry`:**
 ```
-intra_limb_right := Y[i] + X[i + 1]
+carry (when iter=0) := 2^-32 * (raw_product[0] - res[0])
+carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
 ```
 
-**Definition of `shifted`:**
+**Definition of `μ_sum`:**
 ```
-shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (Σ_j = 0^3 - i limb_shift[j] * intra_limb_right[i + j] + extension * Σ_j = 4 - i^3 limb_shift[j])
+μ_sum := μ_lo + μ_hi
 ```
 
 ### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `μ` | `Bit` |  |
+| `μ_lo` | `BaseField` |  |
+| `μ_hi` | `BaseField` |  |
+
+`mat(delim: , top; bottom)` }
 
 ## Assumptions
 
+The following range checks are assumed to be performed/enforced outside of this chip:
+
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALF[in[i]]` |
-| `SHIFT-A2` |  | `IS_BYTE[shift]` |
-| `SHIFT-A3` |  | `IS_BIT<direction>` |
-| `SHIFT-A4` |  | `IS_BIT<signed>` |
-| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
+| `MUL-A1.i` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` |
+| `MUL-A2.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
 
----
+## Constraints
 
-# BRANCH Chip
+### Overview
 
-The  chip computes the target address of a branching instruction.
+When `lhs` and `rhs` are _unsigned_ integers, computing their product `mod 2^128` comes down to evaluating $ (sum_(j=0)^3 2^(16j) dot `lhs`_j) dot (sum_(i=0)^3 2^(16i) dot `rhs`_i) mod 2^128. $ If `lhs` and `rhs` are signed instead, the computation remains nearly identical: based on their signs, one must either zero or one-extend `lhs` and `rhs` --- forming `lhs_ext` and `rhs_ext` respectively --- and compute their product `mod 2^128`: $ (sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128. $ where `lhs_ext` and `rhs_ext` are treated as _unsigned_ integers. Note that by setting the extension limbs of `lhs` and/or `rhs` to `0` when the integer is (i) unsigned or (ii) signed and non-negative, this second formula still applies. For the purposes of constraining the multiplication operation, we rewrite this formula as
 
-= Variables
+$ &(sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128 \ &equiv sum_(j=0)^7 sum_(i=0)^7 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(triangle, equiv) sum_(j=0)^7 sum_(i=0)^(7-j) 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(square, equiv) sum_(j=0)^7 sum_(i=j)^(7) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &stackrel(penta, equiv) sum_(i=0)^7 sum_(j=0)^(i) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &equiv sum_(i=0)^3 sum_(k=0)^1 sum_(j=0)^(2i+k) 2^(16(2i+k)) dot `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 \ &equiv sum_(i=0)^3 2^(32i) dot sum_(k=0)^1 2^(16k) dot sum_(j=0)^(2i+k) `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 $ where at step - `triangle` we can ignore `i > 7-j`, since that makes `2^(16(i+j)) equiv 0 mod 2^128`, - `square` we rewrite the second summation such that `i` iterates from `j` to 7, rather than `0` to `7-j`, and - `penta` we swap the sums.
 
-The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+We let `raw_product` capture the second summation in this last formula (see [mul:c:raw_product]). By construction, ``raw_product`_i < 2^51` for all `i in [0, 3]`, far exceeding the 32-bits that fit in a single `Word`-limb. What remains then is to reduce each limb of `raw_product` `mod 2^32`, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
 
-= Assumptions
+This reduce-and-carry operation is constrained by [mul:c:range_lo]/[mul:c:range_hi] and [mul:c:carry], combined with `carry`'s definition. [mul:c:carry] and `carry`'s definition enforce that $ forall i in [0, 3]: `raw_product`_i + `carry`_(i-1) - `res`_i in { k dot 2^32 | k in [0, 2^20) } $ with ``carry`_(-1) = 0` for simplicity. In other words: ``res`_i equiv `raw_product`_i + `carry`_(i-1) (mod 2^32)`. With [mul:c:range_lo]/[mul:c:range_hi] forcing ``res`_i < 2^32`, ``res`_i` can only assume one value: ``raw_product`_i + `carry`_(i-1) mod 2^32`.
 
-= Constraints
+*Note*: one may have observed that [mul:c:carry] requires ``carry`_i in [0, 2^20)`, while no limb of a valid carry value would ever exceed `2^19`. This is indeed the case. However, there is some slack in how tight one has to constrain the `carry` values. In fact, in this situation it suffices to assert that ``carry`_i < frac(p, 2^32, style: "skewed") approx 2^31`, where `p` denotes the field's modulus. Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
 
-We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
+### Definitions
 
-The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed implicitly by the `AND_BYTE` lookup.
+We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
-| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
-| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
-| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte, 254]` | μ |
-| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALF[next_pc_high[i]]` | μ |
+| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
+| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
+| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
+| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
+| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
 
-This chip contributes the following to the lookup argument.
+### Product
+
+[mul:c:raw_product] defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
+| | | _polynomial:_ `Σ_(k=0)^1 2^(16 * k) * Σ_(j=0)^(2 * i + k) lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
+
+### Lookup
+
+The `MUL` chip contributes the following to the lookup:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
+| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
+| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
 
-= Padding
+## Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
-## Columns
+## Notes/optimizations
+
+- `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere. - As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
+
+---
+
+# DVRM Chip
+
+The `DVRM` chip provides division and remainder functionality, both signed and unsigned.
+
+## Variables
+
+The `DVRM` chip is comprised of the variables listed below, which are expressed using columns, and leverages interaction(s):
 
 ### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `pc` | `DWordWL` | The current pc, used as base address when `!JALR` |
-| `offset` | `DWordWL` | The offset from the base address to jump to |
-| `register` | `DWordWL` | The base address to use when `JALR` |
-| `JALR` | `Bit` | Selects between `pc` and `register` as base address, needed for the `JALR` instruction |
+| `n` | `DWordHL` | The numerator |
+| `d` | `DWordHL` | The denominator |
+| `signed` | `Bit` | Whether to interpret the input as signed (1) or unsigned (0) integers. |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `next_pc_high` | `Half[3]` | The upper part of the next pc |
-| `next_pc_low` | `Byte[2]` | The lower part of the next pc |
+| `q` | `DWordHL` | The quotient; $`n` / `d`$ rounded towards zero. |
+| `r` | `DWordHL` | The remainder; $`n` - `q` `d`$. |
 
 ### Auxiliary
 
-| Name | Type | Description |
-|------|------|-------------|
-| `unmasked_low_byte` | `Byte` | The low byte of the next pc, before masking the LSB. Used to constraint the raw addition. |
+| Name | Type | Description |
+|------|------|-------------|
+| `div_by_zero` | `Bit` | Whether $`d`=0$. |
+| `overflow` | `Bit` | Whether $`n` = -2^63$ and $`d`=-1$. |
+| `abs_r` | `DWordWL` | Absolute value of `r`. |
+| `abs_d` | `DWordWL` | Absolute value of `d`. |
+| `n_sub_r` | `DWordHL` | $`n`-`r`$. |
+| `sign_n_sub_r` | `Bit` | Sign of `n_sub_r`. |
+| `sign_n` | `Bit` | Sign of `n`. |
+| `sign_d` | `Bit` | Sign of `d`. |
+| `sign_q` | `Bit` | Sign of `q`. |
+| `sign_r` | `Bit` | Sign of `r`. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `extended_n` | `QuadHL` | sign-extended value of `n`. |
+| `extended_r` | `QuadHL` | sign-extended value of `r`. |
+| `extension_n_sub_r` | `DWordHL` | sign-extension limbs of `n_sub_r`. |
+| `extended_n_sub_r` | `QuadHL` | sign-extended value of `n_sub_r`. |
+| `carry` | `Bit[4]` | carries for adding `extended_n_sub_r` to `extended_r`, forming `extended_n`. |
+| `μ_sum` | `BaseField` | sum of multiplicities |
+
+**Definition of `extended_n`:**
+```
+extended_n (when iter=[0, 3]) := n[i]
+extended_n (when iter=[4, 7]) := 65535 * sign_n
+```
+
+**Definition of `extended_r`:**
+```
+extended_r (when iter=[0, 3]) := r[i]
+extended_r (when iter=[4, 7]) := 65535 * sign_r
+```
 
-### Virtual
+**Definition of `extension_n_sub_r`:**
+```
+extension_n_sub_r := 65535 * sign_n_sub_r
+```
 
-| Name | Type | Description |
-|------|------|-------------|
-| `next_pc_unmasked` | `DWordWL` | The combination of `next_pc_high`, `next_pc_low[1]` and `unmasked_low_byte` to constrain the addition. This is the computed value for the next pc, before masking off the LSB as required by the ISA. |
-| `next_pc` | `DWordWL` | The computed next pc, after masking off the LSB as required by the ISA. |
+**Definition of `extended_n_sub_r`:**
+```
+extended_n_sub_r (when iter=[0, 3]) := n_sub_r[i]
+extended_n_sub_r (when iter=[4, 7]) := extension_n_sub_r[i - 4]
+```
 
-**Definition of `next_pc_unmasked`:**
+**Definition of `carry`:**
 ```
-next_pc_unmasked (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + unmasked_low_byte
-next_pc_unmasked (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
+carry (when iter=0) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] - (extended_n::QuadWL)[i])
+carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] + carry[i - 1] - (extended_n::QuadWL)[i])
 ```
 
-**Definition of `next_pc`:**
+**Definition of `μ_sum`:**
 ```
-next_pc (when iter=0) := 2^16 * next_pc_high[0] + 2^8 * next_pc_low[1] + next_pc_low[0]
-next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
+μ_sum := μ_q + μ_r
 ```
 
 ### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `μ` | `Bit` |  |
+| `μ_q` | `BaseField` |  |
+| `μ_r` | `BaseField` |  |
 
 ## Assumptions
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `BRANCH-A1.i` | i ∈ [0, 1] | `pc` is range checked, `IS_WORD[pc[i]]` |
-| `BRANCH-A2` |  | `offset` is range checked, `IS_WORD[offset]` |
-| `BRANCH-A3.i` | i ∈ [0, 1] | `register` is range checked, `IS_WORD[register[i]]` |
-| `BRANCH-A4` |  | `IS_BIT<JALR>` |
+| `DVRM-A1.i` | i ∈ [0, 3] | `IS_HALF[n[i]]` |
+| `DVRM-A2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` |
+| `DVRM-A3` |  | `IS_BIT<signed>` |
 
----
+## Constraints
 
-# MEMW Chip
+From the ISA, we gather five requirements for the `DIV[U][W]` and `REM[U][W]` instructions:
 
-The  chip is used to read and write memory locations (both RAM and registers) in chunks of 1, 2, 4 or 8 values. It introduces the old value and last-accessed timestamps of memory addresses internally, in order to satisfy the design of the memory argument ([memory]).
+**R1.** _For both signed and unsigned division, except in the case of_ overflow, _it holds that ``n` = `q` `d` + `r``._ **R2.** _`DIV` and `DIVU` perform [...] signed and unsigned integer division [...] rounding towards zero._ **R3.** _For `REM`, the sign of a nonzero [remainder] equals the sign of the [numerator]._ **R4.** In case of _division-by-zero_, ``r` = `n`` and ``q` = 2^64-1` (unsigned) or ``q` = -1` (signed). **R5.** In case of _overflow_, ``q` = `n`` and ``r` = 0`. Here, _overflow_ occurs when ``n` = -2^(63)` and ``d` = -1` (and, hence, ``signed` = 1`), and _division-by-zero_ indicates that ``d` = 0`. In the following, we list the constraints associated with the `DVRM` chip, and explain how these together enforce all five of these requirements.
 
-= Variables
+### R3: Sign remainder equals sign numerator
 
-The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+We start with R3, which is straightforwardly asserted by constraint [dvrm:c:sign_r_equals_sign_n].
+
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
+| | _polynomial:_ `(Σ_(i=0)^3 r[i]) * (sign_r - sign_n) = 0` |
 
-= Assumptions
+### R2: rounding towards zero
 
-Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. Still, these properties are necessary for the consistency of the system as a whole, and therefore we document it here, keeping the type information as a reading help.
+R2 states that "_[in] signed and unsigned integer division [the quotient is] round[ed] towards zero._" In other words, (i) the sign of ``n`-`qd`` must match that of `n` (unless ``qd` = `n``), and (ii) `|`n`-`qd`| < |`d`|` (unless ``d` = 0`).
 
-= Constraints
+Leveraging R1, we can rewrite these as (i) the sign of `r` must match that of `n` (unless ``r` = 0`), and (ii) `|`r`| < |`d`|` (unless ``d` = 0`).
 
-Depending on the values of `write2`, `write4` and `write8`, the addresses following `base_address` need to be constructed. Rather than computing these in full (which would require the later addresses to be instantiated), it suffices to know the `carry`: the bit indicating whether ``base_address`_0 + t >= 2^32`, i.e., whether adding `t in [1, 7]` to `base_address` requires a carry from the lower to the upper limb. Note that it is safe for the prover to chose these bits: additions for which this bit is not correctly set will yield an address where either the lower or upper limb is out of bounds. As such, the constructed address will not match any existing memory tokens, which are only initialized for correctly formatted and range-checked doublewords (see [memory]).
+Focusing on the first statement, we observe that this trivially holds when ``signed` = 0`, while R3 deals with the case that ``signed` = 1`. The second statement is enforced by [dvrm:c:abs_r_lt_abs_d]. [dvrm:c:abs_r_if_negative] and [dvrm:c:abs_r_if_nonnegative] (resp. [dvrm:c:abs_d_if_negative] and [dvrm:c:abs_d_if_nonnegative]) are included to ensure that `abs_r` (resp. `abs_d`) is the absolute value of `r` (resp. `d`).
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-C1` |  | `IS_BIT<μ_read>` |  |
-| `MEMW-C2` |  | `IS_BIT<μ_write>` |  |
-| `MEMW-C3` |  | `IS_BIT<μ_sum>` |  |
-| `MEMW-C4` |  | `w2` => `μ_sum` |  |
-| | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C5.i` | i ∈ [0, 6] | `IS_BIT<carry[i]>` |  |
-| `MEMW-C6` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `MEMW-C7` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `MEMW-C8.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `MEMW-C9.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
-
-As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures it is appropriately range checked (this assumes no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
+| `DVRM-C2` |  | `LT[1 - div_by_zero; abs_r, abs_d, 0]` | μ_sum |
+| `DVRM-C3` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
+| `DVRM-C4.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
+| | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
+| `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
+| `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
+| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
-There is no need to check that the additions do not overflow, as our address calculations are not performed modulo `2^64` here, and any overflow will result in an address without matching initialization.
+### R5: overflow
 
-The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
+The ISA requires that ``q` = `n`` and ``r` = 0` in the event of overflow (i.e., when ``n` = -2^63` and ``d` = -1`). We note that the second half of this requirement is already satisfied by R2: since ``d` = -1 != 0`, R2 requires that `|`r`| < |`d`| = 1`, to which ``r` = 0` is the only satisfying value.
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `MEMW-CM10` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `MEMW-CM11` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM12` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM13` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `MEMW-CM14.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM15.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `MEMW-CM16.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM17.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+We moreover find that R1 can be leveraged to enforce the correct value of `q`. While ``n` = `qd` + `r`` (R1) does _not_ hold in the case of overflow, the relation ``n` = |`q`|`d` + `r`` _does_. We moreover note that the 64-bit _signed_ two's complement representation of `-2^63` is identical to the 64-bit _unsigned_ representation of `|-2^63| = 2^63`. As such, by interpreting `q` as an unsigned integer when ``overflow` = 1`, it follows that R1 will enforce ``q` = `0x80...00``.
 
-This chip contributes the following to the lookup argument:
+In summary, in case of overflow R2 enforces that ``r` = 0`. Moreover it suffices to interpret `q` as unsigned integer ([dvrm:c:sign_q]); R1 will ensure it contains the correct value.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CO18` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_read |
-| `MEMW-CO19` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_write |
-
-= Padding The table can be padded to the next power of two with the following value assignments:
-
-= Read-size aligned fast path
+| `DVRM-C7` | `sign_q` = `signed` dot (1- `overflow`) |  |
+| | _polynomial:_ `signed * (1 - overflow) - sign_q = 0` | |
+| `DVRM-C8` | `ZERO[overflow; n[0] + n[1] + n[2] + (n[3] - 2^15 * sign_n) + (1 - sign_n) + (65535 - d[0]) + (65535 - d[1]) + (65535 - d[2]) + (65535 - d[3])]` | μ_sum |
 
-When a memory access happens at an address with proper alignment for its access size (i.e., adding the access size to `base_address`'s lowest limb does not overflow), and all accessed elements were last accessed at the same timestamp, we can instead use the  chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all values of `add_limb_overflow` would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
+We highlight [dvrm:c:overflow]. Recall that the `overflow` flag should be set if and only if (i) ``signed` = 1`, (ii) ``n` = `0x80...00``, and (iii) ``d` = `0xFF...FF``. These requirements are equivalent to the state where: $ forall i in [0, 3]:&& 65535 - `d`_i &= 0,\ forall i in [0, 2]:&& `n`_i &= 0,\ && `n`_3 - 2^15 dot `sign_n` &= 0,\ && 1 - `sign_n` &= 0,\ $ where ``signed` = 1` follows from the last equality. The requirement is phrased in this way, because the left-hand sides of the above expressions are `>= 0` by construction. Given that the sum of these expressions does not exceed `2^19` (and thus never wraps in the field), we can now say that the `overflow` bit should be set to `1` if and only if their sum evaluates to `0`. The `ZERO` lookup guarantees this to be the case.
 
-Further logic remains essentially the same, so we briefly present the relevant tables for this chip.
+### R1: `n` = `qd` + `r`
 
-The  chip only needs  variables, expressed through  columns; it leverages  interactions.
+Rewriting R1, we find the constraint not `overflow` => `n` - `r` = `qd`.
 
-## Padding
+Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality `mod 2^128`, rather than `mod 2^64`. To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to ``qd` mod 2^128` using constraints [dvrm:c:mul_lower] and [dvrm:c:mul_upper]; [dvrm:c:q_range] is included to uphold assumption [mul:a:rhs].
 
-The table can be padded to the next power of two with the following value assignments:
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C9` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
+| `DVRM-C10` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
+| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
 
-= Register fast-path
+It now remains to enforce that `extended_n_sub_r` is the _signed_ 128-bit representation of ``n`-`r``. Here, we introduce `extended_n` and `extended_r`. By their definition, these variables contain the signed 128-bit representations of `n` and `r`. The `carry` variable has been defined such that it mimics the carries in the `ADD` chip, except that here we add two `QuadHL`s rather than two `DWordHL`s, thus needing four carry bits instead of two. With this in place, [dvrm:c:n_sub_r] (mimicking [add:c:carry]) ensures that `extended_n_sub_r` contains the correct value.
 
-The  chip provides a fast-path for accessing registers. This fast-path leverages that registers + can be addressed using a `Byte`, rather than a full `DWord`, + are constantly accessed, i.e., ``timestamp` - `old_timestamp`` is small, and + have a fixed access pattern to achieve a footprint that is significantly smaller than both  and .
+Lastly, observe that ``n` - `r` in (-2^64, 2^64)`, _regardless_ of the value of `signed`. Moreover, note that the upper halves of the 128-bit representations of all values in this range are either `0xFF...FF` (negative) or `0x00...00` (non-negative). This means that we do not need to store all 128 bits of `extended_n_sub_r`. Rather, we need only store the lower 64 bits, and a separate bit (`sign_n_sub_r`) indicating whether the top limbs are all-ones or all-zeroes. The prover is free to select the value for `sign_n_sub_r`; only one of the two will fit the proof.
 
-Note: as a result of hard optimization, this chip can only be used for register accesses for which + ``timestamp` - `old_timestamp` in [1, 2^16]`, and + ``timestamp[0]` > `old_timestamp[0]`` If either of these rules does not apply to your access, you should fall back to using `MEMW_A`.
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C12.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
+| `DVRM-C13.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
+| `DVRM-C14.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
+| `DVRM-C15` |  | `IS_BIT<sign_n_sub_r>` |  |
 
-Note moreover that this chip does not guard against misaligned register access faults: to access register with a given `address`, one must provide `2 dot `address`` in the lookup.
+### R4: division-by-zero
 
-## Variables
+R4 requires that ``q` = 2^64-1` (unsigned) or `-1` (signed) and ``r` = `n`` when ``d` = 0`. Recalling R1, we see that ``n` = `q` `d` + `r` = `r`` when ``d` = 0`, already enforces the latter. Next, we note that, in two's complement, the 64-bit _unsigned_ value `2^64-1` and _signed_ value `-1` are both represented by the bit string `0xFF...FF`. Hence, only [dvrm:c:q_if_div_by_zero] is required to completely constrain R4; [dvrm:c:div_by_zero] just ensures the `div_by_zero` flag is set when ``d` = 0`.
 
-The  chip is comprised of  variables that are expressed using  columns and leverages  interactions:
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C16.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
+| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
+| `DVRM-C17` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
 
-## Assumptions
+### Other
 
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
-| `MEMW-A2` |  | `IS_BIT<write2>` |
-| `MEMW-A3` |  | `IS_BIT<write4>` |
-| `MEMW-A4` |  | `IS_BIT<write8>` |
-| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
-| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+The following constraints are included to ensure that the values of `sign_n`, `sign_r` and `sign_d` are correct.
 
-The following range checks are assumed to be performed/enforced outside of this chip:
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C18` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C19` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C20` | `SIGN<sign_d; d[3], signed>` |
 
-## Constraints
+### Output
 
-Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the times. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that ``old_timestamp[1]` = `timestamp[1]``;  can be used for accesses where this is not the case.
+Lastly, this chip contributes the following to the lookup:
 
-Verifying that ``timestamp` > `old_timestamp`` now simplifies to verifying that ``timestamp[0]` - `old_timestamp[0]` > 0`. For most accesses, this value will be small enough to fit in a `Half`. This chip thus enforces this by means of the following constraint:
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
+| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
 
-With ``old_timestamp`<`timestamp`` asserted, `old` is read from the register ([regw:c:read_old]) and `val` is written back ([regw:c:write_val]).
+## Padding
 
-This chip can either just write (``μ_write` = 1`), or both read and write (``μ_read` = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
+To pad the `DVRM` table, we use the following data, representing the unsigned division `0 / 0`:
 
-Lastly, this chip contributes the following interactions to the logup:
+---
 
-## Padding
+# LOAD Chip
 
-The table can be padded to the next power of two with the following value assignments:
+The `LOAD` chip provides functionality to read values from memory and sign-extend them where appropriate. It delegates low-level memory handling to the `MEMW` chip ([memw]).
 
-= Notes/optimizations The following ideas may prove to be optimizations for the // chip: - `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes. - For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
+## Variables
 
-## Columns
+The `LOAD` chip is comprised of the following variables, each expressed using one or more columns; the interactions it leverages are listed with its constraints:
 
 ### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `is_register` | `Bit` | Whether the address represents a register index |
-| `base_address` | `DWordWL` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
-| `value` | `BaseField[8]` | The values to store in memory. For RAM, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
-| `timestamp` | `DWordWL` | The timestamp at which this memory access occurs |
-| `write2` | `Bit` | Whether to write exactly 2 values |
-| `write4` | `Bit` | Whether to write exactly 4 values |
-| `write8` | `Bit` | Whether to write exactly 8 values |
+| `base_address` | `DWordWL` | The base address to read from. Gets offset by a value in $[0, 7]$, depending on the size of the access |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `read2` | `Bit` | Whether to read exactly 2 bytes |
+| `read4` | `Bit` | Whether to read exactly 4 bytes |
+| `read8` | `Bit` | Whether to read exactly 8 bytes |
+| `signed` | `Bit` | Whether to sign-extend (1) or zero-extend (0) |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `old` | `BaseField[8]` | The old value written at `base_address`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
+| `res` | `DWordBL` | The result of reading (up to) 8 bytes from `base_address`, extended corresponding to `signed`. |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `carry` | `Bit[7]` | Whether `base_address[0] + i + 1` $>= 2^32$ |
-| `old_timestamp` | `DWordWL[8]` | The timestamp at which address `base_address + i` was last accessed |
+| `sign_bit` | `Bit` | The sign bit extracted from the bytes retrieved from memory |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
-| `w2` | `Bit` | writing at least 2 bytes |
-| `w4` | `Bit` | writing at least 4 bytes |
-| `address_add` | `DWordWL[7]` | `address_add[i] = base_address + i + 1` |
-| `μ_sum` | `Bit` |  |
-
-**Definition of `w2`:**
-```
-w2 := write2 + write4 + write8
-```
-
-**Definition of `w4`:**
-```
-w4 := write4 + write8
-```
-
-**Definition of `address_add`:**
-```
-address_add := ['arr', ['-', ['+', ['idx', 'base_address', 0], 'i', 1], ['*', ['^', 2, 32], ['idx', 'carry', 'i']]], ['+', ['idx', 'base_address', 1], ['idx', 'carry', 'i']]]
-```
+| `read1` | `Bit` | Whether to read exactly 1 byte |
 
-**Definition of `μ_sum`:**
+**Definition of `read1`:**
 ```
-μ_sum := μ_read + μ_write
+read1 := μ - read2 - read4 - read8
 ```
 
 ### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
-| `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
-
----
-
-# LT Chip
-
-The  chip constrains an indicator bit for the less-than relation, signed or unsigned.
-
-= Variables
-
-The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
-
-= Assumptions We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
-
-= Constraints We first constrain that all variables correspond to their definition. For the defining constraint of `lt`, [lt:c:lt], observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
-
-We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a positive one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
-
-+ `dash(A) and B and (a < b)` + `A and dash(B) and (a < b)` + `A and B and (a < b)` + `dash(A) and dash(B) and (a < b)`
-
-The first case is evidently false, while the second case simplifies to `A and dash(B)`. For the third and fourth case, observe that when `A = B`, the `<` relation is preserved by the modular correspondence between `[-2^(31), 2^(31))` and `[0, 2^(64))`. Importantly, this modular correspondence is merely a reinterpretation of the bits or values of `a` and `b`, due to the representation in two's complement. Hence, we can introduce the value `C = `unsigned_lt``, that accurately represents the relation `a < b` when `A = B`.
+| `μ` | `Bit` |  |
 
-Combining our three remaining cases, we obtain the boolean formula `A dash(B) or A B C or dash(A) dash(B) C`. Since the cases are disjoint, this can be computed with the binary-valued polynomial `P(A, B, C) = A (1 - B) + A B C + (1 - A) (1 - B) C`.
+## Assumptions
 
-The polynomial `P` can be simplified to a total degree of two. We claim that the polynomial `Q(A, B, C) = A (1 - B) + A C + (1 - B) C` is, for the purposes of this chip, equivalent to `P`. An exhaustive check shows that `P(A, B, C) != Q(A, B, C)` only for the triple `(A, B, C) = (1, 0, 1)`. This is, however, impossible due to the correctness of `ADD`. In more detail, if we let `s` be the (range-checked) difference `a - b` (so the equivalent of the `lhs_sub_rhs` column), and `x'` denote the most significant word of a variable `x`, we need `c dot 2^32 + a' = b' + s' + `carry[0]``, by the definition of `carry`. However, the left hand side of this is at least `3 dot 2^31`, as `(A, C) = (1, 1)`, and the right hand side is at most `(2^31 - 1) + (2^32 - 1) + 1 = 3 dot 2^31 - 1`. Therefore, we can use `Q` to constrain `lt` when `signed = 1`.
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `LOAD-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `LOAD-A2` |  | `IS_BIT<signed>` |
+| `LOAD-A3` |  | `IS_BIT<read2>` |
+| `LOAD-A4` |  | `IS_BIT<read4>` |
+| `LOAD-A5` |  | `IS_BIT<read8>` |
+| `LOAD-A6` |  | `IS_BIT<read2 + read4 + read8>` |
+| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
-| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
-| `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
-| | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
-| `LT-C4` | `IS_HALF[lhs[1]]` | μ |
-| `LT-C5` | `IS_HALF[rhs[1]]` | μ |
+## Constraints
 
-And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
+The chip delegates the actual memory interaction to the `MEMW` chip, and ensures correctness of the requested sign/zero extension. The output `res` is correctly range-checked as long as the memory contents are.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
-| `LT-C7.i` | i ∈ [0, 3] | `IS_HALF[lhs_sub_rhs[i]]` | μ |
+| `LOAD-C1` |  | `read2` + `read4` + `read8` => `μ` |  |
+| | | _polynomial:_ `(read2 + read4 + read8) * (1 - μ) = 0` | |
+| `LOAD-C2` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
+| `LOAD-C3` |  | `MSB8[sign_bit; res[0]]` | read1 |
+| `LOAD-C4` |  | `MSB8[sign_bit; res[1]]` | read2 |
+| `LOAD-C5` |  | `MSB8[sign_bit; res[3]]` | read4 |
+| `LOAD-C6.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
+| `LOAD-C7.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read4 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
+| `LOAD-C8` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
+| | | _polynomial:_ `(1 - read2 - read4 - read8) * (res[1] - signed * sign_bit * 255) = 0` | |
 
 The chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
+| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8, signed]` | -μ |
 
-= Padding
+## Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
-## Columns
+---
 
-### Input
+# BITWISE Chips
 
-| Name | Type | Description |
-|------|------|-------------|
-| `lhs` | `DWordHHW` | The left operand |
-| `rhs` | `DWordHHW` | The right operand |
-| `signed` | `Bit` | whether to interpret `lhs` and `rhs` as signed integers (1) or not (0) |
+The `BITWISE` chips deal with precomputed lookup tables for bitwise boolean operations and convenience functionalities over small domains.
 
-### Output
+## Variables
+
+The `BITWISE` chip is comprised of the following variables, each expressed using one or more columns. Of these, the _input_ and _output_ variables are precomputed.
+
+### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `lt` | `Bit` | Whether $`lhs` < `rhs`$, taking `signed` into account |
+| `X` | `Byte` |  |
+| `Y` | `Byte` |  |
+| `Z` | `B4` |  |
 
-### Auxiliary
+### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `lhs_sub_rhs` | `DWordHL` | $`lhs` - `rhs`$ |
-| `lhs_msb` | `Bit` | The most significant bit of `lhs` |
-| `rhs_msb` | `Bit` | The most significant bit of `rhs` |
+| `AND` | `Byte` | the binary AND of `X` and `Y` |
+| `OR` | `Byte` | the binary OR of `X` and `Y` |
+| `XOR` | `Byte` | the binary XOR of `X` and `Y` |
+| `MSB8` | `Bit` | the most significant bit of `X` |
+| `MSB16` | `Bit` | the most significant bit of `Y` |
+| `ZERO` | `Bit` | whether $`X` = 0$, $`Y` = 0$ and $`Z` = 0$. |
+| `SLL` | `Half` | `X\|\|Y` logically left-shifted by `Z`: $((`X` + 256`Y`) `<<` `Z`) mod 2^16$ |
+| `SLLC` | `Half` | the bits of `X\|\|Y` carried out by the left shift, i.e., `X\|\|Y` logically right-shifted by $16 - `Z`$: $(`X` + 256`Y`) `>>` (16 - `Z`)$ |
 
-### Virtual
+### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `carry` | `Bit[2]` | The carry for adding `lhs_sub_rhs` back to `rhs` |
-| `unsigned_lt` | `Bit` | Whether $`lhs` < `rhs`$, as unsigned integers |
+| `μ_AND` | `BaseField` |  |
+| `μ_OR` | `BaseField` |  |
+| `μ_XOR` | `BaseField` |  |
+| `μ_MSB8` | `BaseField` |  |
+| `μ_MSB16` | `BaseField` |  |
+| `μ_ZERO` | `BaseField` |  |
+| `μ_IS_BYTE` | `BaseField` |  |
+| `μ_IS_HALF` | `BaseField` |  |
+| `μ_IS_B20` | `BaseField` |  |
+| `μ_HWSL` | `BaseField` |  |
 
-**Definition of `carry`:**
-```
-carry (when iter=0) := 2^-32 * (rhs[0] + (lhs_sub_rhs::DWordWL)[0] - lhs[0])
-carry (when iter=1) := 2^-32 * ((rhs::DWordWL)[1] + (lhs_sub_rhs::DWordWL)[1] + carry[0] - (lhs::DWordWL)[1])
-```
+*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
 
-**Definition of `unsigned_lt`:**
-```
-unsigned_lt := carry[1]
-```
+## Lookup
 
-### Multiplicity
+This chip adds the following interactions to the lookup:
 
-| Name | Type | Description |
-|------|------|-------------|
-| `μ` | `Bit` |  |
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `BITWISE-C1` | `AND_BYTE[AND; X, Y]` | -μ_AND |
+| `BITWISE-C2` | `OR_BYTE[OR; X, Y]` | -μ_OR |
+| `BITWISE-C3` | `XOR_BYTE[XOR; X, Y]` | -μ_XOR |
+| `BITWISE-C4` | `MSB8[MSB8; X]` | -μ_MSB8 |
+| `BITWISE-C5` | `MSB16[MSB16; X + 256 * Y]` | -μ_MSB16 |
+| `BITWISE-C6` | `ZERO[ZERO; X + 256 * Y + 65536 * Z]` | -μ_ZERO |
+| `BITWISE-C7` | `IS_BYTE[X]` | -μ_IS_BYTE |
+| `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
+| `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
+| `BITWISE-C10` | `HWSL[[SLL, SLLC]; X + 256 * Y, Z]` | -μ_HWSL |
 
-## Assumptions
+## Notes/Optimizations
 
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `LT-A1` |  | `IS_WORD[lhs[0]]` |
-| `LT-A2` |  | `IS_WORD[rhs[0]]` |
-| `LT-A3` |  | `IS_BIT<signed>` |
+The following ideas may prove to be optimizations for the `BITWISE` chip: (1) Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. (2) Drop the `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicitly range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. (3) Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
 
 ---
 
-# MUL Chip
+# About ECALL
 
-The  chip constrains multiplication, both signed and unsigned, as well as providing access to the low and high halfs of the multiplication result.
+ECALLs provide system-level functionalities to the guest program.
 
-= Variables
+When `ECALL` is executed, it is assumed that: - register `A7` contains the system call number
 
-The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+- the arguments are located in registers `A0`-`A6`, and - the return value is written to `A0`, where `A0`-`A7` are symbolic names for the registers `x10`-`x17`
 
-`mat(delim: , top; bottom)` }
+---
 
-= Assumptions The following range checks are assumed to be performed/enforced outside of this chip:
+# HALT Chip
 
-= Constraints
+## Variables
 
-## Overview
+The `HALT` chip leverages a single variable, spanning two columns, and the interactions listed below:
 
-When `lhs` and `rhs` are _unsigned_ integers, computing their product `mod 2^128` comes down to evaluating $ (sum_(j=0)^3 2^(16j) dot `lhs`_j) dot (sum_(i=0)^3 2^(16i) dot `rhs`_i) mod 2^128. $ If `lhs` and `rhs` are signed instead, the computation remains nearly identical: based on their signs, one must either zero or one-extend `lhs` and `rhs` --- forming `lhs_ext` and `rhs_ext` respectively --- and compute their product `mod 2^128`: $ (sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128. $ where `lhs_ext` and `rhs_ext` are treated as _unsigned_ integers. Note that by setting the extension limbs of `lhs` and/or `rhs` to `0` when the integer is (i) unsigned or (ii) signed and non-negative, this second formula still applies. For the purposes of constraining the multiplication operation, we rewrite this formula as
+### Input
 
-$ &(sum_(j=0)^7 2^(16j) dot `lhs_ext`_j) dot (sum_(i=0)^7 2^(16i) dot `rhs_ext`_i) mod 2^128 \ &equiv sum_(j=0)^7 sum_(i=0)^7 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(triangle, equiv) sum_(j=0)^7 sum_(i=0)^(7-j) 2^(16(i+j)) dot `lhs_ext`_j dot `rhs_ext`_i mod 2^128 \ &stackrel(square, equiv) sum_(j=0)^7 sum_(i=j)^(7) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &stackrel(penta, equiv) sum_(i=0)^7 sum_(j=0)^(i) 2^(16i) dot `lhs_ext`_j dot `rhs_ext`_(i-j) mod 2^128 \ &equiv sum_(i=0)^3 sum_(k=0)^1 sum_(j=0)^(2i+k) 2^(16(2i+k)) dot `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 \ &equiv sum_(i=0)^3 2^(32i) dot sum_(k=0)^1 2^(16k) dot sum_(j=0)^(2i+k) `lhs_ext`_j dot `rhs_ext`_(2i+k-j) mod 2^128 $ where at step - `triangle` we can ignore `i > 7-j`, since that makes `2^(16(i+j)) equiv 0 mod 2^128`, - `square` we rewrite the second summation such that `i` iterates from `j` to 7, rather than `0` to `7-j`, and - `penta` we swap the sums.
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which to halt the program |
 
-We let `raw_product` capture the second summation in this last formula (see [mul:c:raw_product]). By construction, ``raw_product`_i < 2^51` for all `i in [0, 3]`, far exceeding the 32-bits that fit in a single `Word`-limb. What remains then is to reduce each limb of `raw_product` `mod 2^32`, carrying the overflow of each limb to the next, constructing the output `res` in doing so.
+## Assumptions
 
-This reduce-and-carry operation is constrained by [mul:c:range_lo]/[mul:c:range_hi] and [mul:c:carry], combined with `carry`'s definition. [mul:c:carry] and `carry`'s definition enforce that $ forall i in [0, 3]: `raw_product`_i + `carry`_(i-1) - `res`_i in { k dot 2^32 | k in [0, 2^20) } $ with ``carry`_(-1) = 0` for simplicity. In other words: ``res`_i equiv `raw_product`_i + `carry`_(i-1) (mod 2^32)`. With [mul:c:range_lo]/[mul:c:range_hi] forcing ``res`_i < 2^32`, ``res`_i` can only assume one value: ``raw_product`_i + `carry`_(i-1) mod 2^32`.
+It is assumed the input is range checked:
 
-*Note*: one may have observed that [mul:c:carry] requires ``carry`_i in [0, 2^20)`, while no limb of a valid carry value would ever exceed `2^19`. This is indeed the case. However, there is some slack in how tight one has to constrain the `carry` values. In fact, in this situation it suffices to assert that ``carry`_i < frac(p, 2^32, style: "skewed") approx 2^31`, where `p` denotes the field's modulus. Given that other chips also use 20-bit lookups, using `IS_B20` makes for a simpler design.
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `HALT-A1.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
-## Definitions
+## Constraints
 
-We constrain `lhs_is_negative` and `rhs_is_negative` according to their definition; `lo`, `hi` and `carry` are appropriately range checked.
+The `HALT` chip (1) makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), (2) writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and (3) sets `pc` equal to `1` ([halt:c:pc]). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`, the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
-| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
-| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
-| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
-| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
+| `HALT-C1.i` | i ∈ [1, 9] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C2` |  | `MEMW[0::BaseField[8]; 1, (2 * 10)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C3.i` | i ∈ [11, 31] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C4` |  | `MEMW[1, (2 * 255)::DWordWL, [1, 0, 0, 0, 0, 0, 0, 0], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
 
-## Product
-
-[mul:c:raw_product] defines `raw_product` in terms of the (sign extended) input values `lhs` and `rhs`.
+*Note*: observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there.
 
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
-| | | _polynomial:_ `Σ_k = 0^1 2^(16 * k) * Σ_j = 0^2 * i + k lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
+### Lookup
 
-## Lookup
+In this VM, halting is considered equivalent to executing a `sys_exit`. Hence, this chip responds to `ECALL`s with system call number 93.
 
-The  chip contributes the following to the lookup:
+The HALT chip therefore contributes the following interaction to the lookup-argument:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
-| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
+| `HALT-C5` | `ECALL[timestamp, 93::DWordWL]` | -1 |
 
-= Padding
+## Padding
 
-The table can be padded to the next power of two with the following value assignments:
+This chip should only contain a single row. Given that `2^0 = 1`, this chip does not need to be padded. As such, no padding is defined.
 
-= Notes/optimizations - `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere. - As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
+---
 
-## Columns
+# COMMIT Chip
 
-### Input
+## Variables
 
-| Name | Type | Description |
-|------|------|-------------|
-| `lhs` | `DWordHL` | the left hand operator. |
-| `lhs_signed` | `Bit` | whether to interpret `lhs` as a signed integer (1) or not (0). |
-| `rhs` | `DWordHL` | the right hand operator. |
-| `rhs_signed` | `Bit` | whether to interpret `rhs` as a signed integer (1) or not (0). |
+The `COMMIT` chip leverages the following variables, each spanning one or more columns, and the interactions listed with its constraints:
 
-### Output
+### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `lo` | `DWordHL` | the lower limbs of the (extended) multiplication result |
-| `hi` | `DWordHL` | the upper limbs of the (extended) multiplication result |
+| `timestamp` | `DWordWL` | timestamp at which to commit |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `lhs_is_negative` | `Bit` | whether `lhs` is negative (1) or not (0) |
-| `rhs_is_negative` | `Bit` | whether `rhs` is negative (1) or not (0) |
-| `raw_product` | `B51[4]` | raw multiplication output |
+| `index` | `BaseField` | Index of value being committed. |
+| `address` | `DWordWL` | Address of first byte to commit. |
+| `address_incr` | `DWordHL` | $`address` + 1$ |
+| `count` | `DWordWL` | number of bytes to commit |
+| `count_decr` | `DWordHL` | $`count` - 1$ |
+| `first` | `Bit` | Whether this is the first commitment in this sequence. |
+| `end` | `Bit` | Whether this is the end of the commitment sequence. |
+| `value` | `Byte` | Byte stored at `address`. |
 
-### Virtual
+### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `lhs_ext` | `Half[8]` | sign-extended value of `lhs` |
-| `rhs_ext` | `Half[8]` | sign-extended value of `rhs` |
-| `res` | `QuadWL` | concatenation of `lo` and `hi`. |
-| `carry` | `B20[4]` | carry values |
-| `μ_sum` | `BaseField` | sum of multiplicies |
+| `μ` | `Bit` |  |
 
-**Definition of `lhs_ext`:**
-```
-lhs_ext (when iter=[0, 3]) := lhs[i]
-lhs_ext (when iter=[4, 7]) := 65535 * lhs_is_negative
-```
+## Constraints
 
-**Definition of `rhs_ext`:**
-```
-rhs_ext (when iter=[0, 3]) := rhs[i]
-rhs_ext (when iter=[4, 7]) := 65535 * rhs_is_negative
-```
+In this VM, committing is considered equivalent to writing a value to `stdout`. Hence, this chip responds to `ECALL`s with system call number 64.
 
-**Definition of `res`:**
-```
-res (when iter=[0, 1]) := (lo::DWordWL)[i]
-res (when iter=[2, 3]) := (hi::DWordWL)[i - 2]
-```
+Since we do not know how many bytes are to be committed, this chip employs a recursive design: each iteration commits one byte, and recursively "calls" itself to commit the remaining bytes. As such, only the call from the CPU to this chip (i.e., the `first` in the recursion tree) should accept the `ECALL`; later recursive calls should not. This is why [commit:c:receive_ecall] has multiplicity `-first`.
 
-**Definition of `carry`:**
-```
-carry (when iter=0) := 2^-32 * (raw_product[0] - res[0])
-carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
-```
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C1` | `ECALL[timestamp, 64::DWordWL]` | -first |
 
-**Definition of `μ_sum`:**
-```
-μ_sum := μ_lo + μ_hi
-```
+The `write` operation --- writing to a file descriptor --- has the following signature:
 
-### Multiplicity
+`ssize_t write(size_t count; int fd, const void buf[count], size_t count);`
 
-| Name | Type | Description |
-|------|------|-------------|
-| `μ_lo` | `BaseField` |  |
-| `μ_hi` | `BaseField` |  |
+That is to say: `A0` contains the file descriptor, `A1` contains the address of `buf`'s first byte, `A2` contains `count`, and the number of bytes written should be returned in `A0`.
 
-## Assumptions
+[commit:c:read_address] reads `address` from `x11` (=`A1`) and [commit:c:read_count] reads `count` from `x12` (=`A2`). Since we only support writing to `stdout` (which corresponds to `fd = 1`),
 
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MUL-A1.i` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` |
-| `MUL-A2.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
+we assert that `x10` contains `1` in [commit:c:read_fd_write_count]. Note that this constraint _also_ writes `count` to `A0`; in this VM it is impossible for a commit to be interrupted or fail. Lastly, the `index` is read from `x254`; in the same operation, `index + count` is written back to this location by [commit:c:read_index]. This, too, leverages the fact that a commit will not be interrupted or fail, which allows the `index` for the next commit sequence to be updated up front. Again, each of these memory interactions only takes place when this is the `first` call in the recursion tree.
 
----
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C2` | `MEMW[[address[0], address[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, [address[0], address[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C3` | `MEMW[[count[0], count[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 12)::DWordWL, [count[0], count[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C4` | `MEMW[[1, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, [count[0], count[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C5` | `MEMW[[index, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 254)::DWordWL, [index + count::BaseField, 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | first |
 
-# DVRM Chip
+*Note*: the observant reader will notice that [commit:c:read_index] casts `count` to a `BaseField`, potentially losing information. This is indeed correct. However, since it is practically impossible to commit more than `2^64-2^32` bytes in a single VM execution, it was decided to permit this.
 
-The  chip provides division and remainder functionality, both signed and unsigned.
+Next, we read the `value` located at buffer address `address` and commit to it under the given `index`. This is only performed when we have not yet reached the `end` of the commit sequence.
 
-= Variables
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C6` | `MEMW[[value, 0, 0, 0, 0, 0, 0, 0]; 0, address, [value, 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | μ - end |
+| `COMMIT-C7` | `COMMIT[index, value]` | μ - end |
 
-The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+In parallel, we compute `address_incr = address + 1` ([commit:c:address_incr]) as the address of the next byte to commit, and `count_decr = count - 1` ([commit:c:count_decr]) as the number of bytes that still have to be committed after committing this byte. [commit:c:range_address_incr] and [commit:c:range_count_decr] are included to satisfy [add:a:sum] and [add:a:rhs], respectively.
 
-= Assumptions
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `COMMIT-C8` |  | `ADD<address_incr::DWordWL; address, 1::DWordWL>` |  |
+| `COMMIT-C9.i` | i ∈ [0, 3] | `IS_HALF[address_incr[i]]` | μ |
+| `COMMIT-C10` |  | `SUB<count_decr::DWordWL; count, 1::DWordWL>` |  |
+| `COMMIT-C11.i` | i ∈ [0, 3] | `IS_HALF[count_decr[i]]` | μ |
 
-= Constraints From the ISA, we gather five requirements for the `DIV[U][W]` and `REM[U][W]` instructions:
+When `count` hits `0`, we should stop performing further recursive calls. We use the `end` bit to indicate these circumstances.
 
-enum.item([ _For both signed and unsigned division, except in the case of_ overflow, _it holds that ``n` = `q` `d` + `r``._ ]), enum.item([ _`DIV` and `DIVU` perform [...] signed and unsigned integer division [...] rounding towards zero._ ]), enum.item([ _For `REM`, the sign of a nonzero [remainder] equals the sign of the [numerator]._ ]), enum.item([ In case of _division-by-zero_, ``r` = `n`` and ``q` = 2^64-1` (unsigned) or ``q` = -1` (signed). ]), enum.item([ In case of _overflow_, ``q` = `n`` and ``r` = 0` ]), where _overflow_ occurs when ``n` = -2^(63)` and ``d` = -1` (and, hence, ``signed` = 1`), and _division-by-zero_ indicates that ``d` = 0`. In the following, we list the constraints associated with the  chip, and explain how these together enforce all five of these requirements.
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C12` | `ZERO[end; (65535 - count_decr[0]) + (65535 - count_decr[1]) + (65535 - count_decr[2]) + (65535 - count_decr[3])]` | μ |
 
-## R3: Sign remainder equals sign numerator
+*Note*: (1) Rather than setting `end = 1` when `count = 0`, we do so when `count_decr = -1`. This technique allows `count` to be stored in a `DWordWL` rather than a `DWordHL`, saving two columns. (2) As a result of [commit:c:range_count_decr], $65535 - \texttt{count\_decr}_i \geq 0$ for all $i \in [0, 3]$. Hence, $\sum_{i=0}^{3} (65535 - \texttt{count\_decr}_i) = 0 \iff \forall i \in [0, 3]: \texttt{count\_decr}_i = 65535$.
 
-We start with R3, which is straightforwardly asserted by constraint [dvrm:c:sign_r_equals_sign_n].
+When this is not the last byte to commit in this recursion sequence (i.e., `end = 0`), we recursively _Commit the Next Byte_ (`CNB`), specifying the timestamp, the address at which to continue reading and the number of bytes that should still be committed ([commit:c:send_commit_next_byte]). Since that next call is certainly not the `first` call in the sequence, it reads `address_incr` and `count_decr` from the previous recursion level into `address` and `count` and continues executing the commit.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `COMMIT-C13` | `CNB[timestamp, index + 1, address_incr::DWordWL, count_decr::DWordWL]` | μ - end |
+| `COMMIT-C14` | `CNB[timestamp, index, address, count]` | -(μ - first) |
+
+Lastly, we must make sure `first`, `end` and `μ` are bits ([commit:c:range_first], [commit:c:range_end], [commit:c:range_mu]), and that either `first = 1` or `end = 1` implies `μ = 1` ([commit:c:first_or_end_implies_mu]). These are required to ensure the multiplicities `-(μ - first)` and `μ - end` are binary.
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
-| | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
+| `COMMIT-C15` | `IS_BIT<first>` |
+| `COMMIT-C16` | `IS_BIT<end>` |
+| `COMMIT-C17` | `IS_BIT<μ>` |
+| `COMMIT-C18` | `first` + `end` => `μ` = 1 |
+| | _polynomial:_ `(first + end) * (1 - μ) = 0` |
 
-## R2: rounding towards zero
+## Padding
 
-R2 states that "_[in] signed and unsigned integer division [the quotient is] round[ed] towards zero._" In other words, + the sign of ``n`-`qd`` must match that of `n` (unless ``qd` = `n``), and + `|`n`-`qd`|  < |`d`|` (unless ``d` = 0`).
+To pad this chip, use the below data.
 
-Leveraging R1 , we can rewrite these as + the sign of ``r`` must match that of `n` (unless ``r` = 0`), and + `|`r`|  < |`d`|` (unless ``d` = 0`).
+## Notes/optimizations
 
-Focusing on the first statement, we observe that this trivially holds when ``signed` = 0`, while R3 deals with the case that ``signed` = 1`. The second statement is enforced by [dvrm:c:abs_r_lt_abs_d]. [dvrm:c:abs_r_if_negative] and [dvrm:c:abs_r_if_nonnegative] (resp. [dvrm:c:abs_d_if_negative] and [dvrm:c:abs_d_if_nonnegative]) are included to ensure that `abs_r` (resp. `abs_d`) is the absolute values of `r` (resp. `d`).
+- The current version only supports writing to `stdout`. This chip could potentially be extended to support writing to arbitrary `fd`s - One might be able to replace [commit:c:end] by `end => count = 0`. While loosening the constraint (`count = 0 => end` is no longer enforced), this should not cause any problems: if the prover does not set `end` when `count=0`, they simply cannot complete the proof. First of all, one would have to recursively work through all `2^64` values of `count`, something that is practically infeasible. Moreover, if this is done with a sequence that originally has ``count` > 0`, one will inevitably have to read a memory address twice at the same timestamp, which is impossible to prove. In addition to dropping the `ZERO` lookup, this optimization might also permit moving `count_decr` from a `DWordHL` to a `DWordWL`, saving two columns. - Given that it is practically infeasible to commit more than ``p`-1 = 2^64-2^32` bytes in a program, it might suffice to store `count_decr` in a `BaseField`. Note that this would probably involve having an extra (virtual) column storing `count` in `BaseField` form as well. Moreover, one might need to add a lookup to `LT` to ensure ``count` <= `p`-1` when being read from memory at the beginning of each commitment sequence.
+
+---
+
+# SHA256 Accelerator
+
+The following chips constitute an accelerator for the SHA256 compression function; other aspects of SHA256 hashing (such as repeated compression invocation, input padding and state initialization) fall outside the scope of this accelerator.
+
+The base `SHA256` chip provides the `ECALL` interface, interacts with memory and then delegates to the `SHA256MSGSCHED` and `SHA256ROUND` chips to perform the message schedule and the compression rounds, respectively. The `SHA256_M` interaction signature is used to represent the output of the message schedule. The `SHA256_K` interaction signature is used to represent the `k` constants. It could either be instantiated with a (short) precomputed table, or through hardcoded LogUp contributions in this chip. For this exposition, we choose the former option, and present a table further below. Additionally, we introduce a `ROTXOR` chip to perform the common action of computing the XOR of three rotations (or shifts) of a word.
+
+Most of the structure and variable naming follows the pseudocode on the [Wikipedia page for SHA-2](https://en.wikipedia.org/wiki/SHA-2#Pseudocode).
+
+## `SHA256` chip
+
+### Columns
+
+The `SHA256` chip leverages the following variables and columns:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | Timestamp at which the ECALL is invoked. Used as unique identifier for this invocation. |
+| `h` | `Byte[32]` | The state of the hash function. |
+| `h_addr` | `DWordHL[4]` | The addresses of the doublewords of `h` |
+| `m` | `Byte[64]` | The input chunk. |
+| `m_addr` | `DWordHL[8]` | The addresses of the doublewords of `m` |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `out` | `Byte[32]` | The new state. |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `last_round_out` | `Word[8]` | The output from the last compression round |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+### Constraints
+
+The first responsibility of the chip is to read the current state and message chunk from memory, passed as arguments through pointers. Since the memory ranges could overlap, we read the chunk first (in [sha256:c:read_chunk], at timestamp `timestamp`), before reading and writing the state (in [sha256:c:read_state], at timestamp `timestamp + 1`). The addresses containing the state and the current chunk are passed in as arguments `A0 = x10` and `A1 = x11`, respectively. Note that following the SHA256 spec, this state and the chunks are read and written as big-endian.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256-C1` |  | `MEMW[[(m_addr[0]::DWordWL)[0], (m_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, [(m_addr[0]::DWordWL)[0], (m_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C2.i` | i ∈ [0, 7], j ∈ [0, 3] | `IS_HALF[m_addr[i][j]]` | μ |
+| `SHA256-C3.i` | i ∈ [1, 7] | `ADD<m_addr[i]::DWordWL; m_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
+| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[[m[8 * i + 3], m[8 * i + 2], m[8 * i + 1], m[8 * i + 0], m[8 * i + 7], m[8 * i + 6], m[8 * i + 5], m[8 * i + 4]]; 0, m_addr[i]::DWordWL, [m[8 * i + 3], m[8 * i + 2], m[8 * i + 1], m[8 * i + 0], m[8 * i + 7], m[8 * i + 6], m[8 * i + 5], m[8 * i + 4]], timestamp, 0, 0, 1]` | μ |
+| `SHA256-C5` |  | `MEMW[[(h_addr[0]::DWordWL)[0], (h_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, [(h_addr[0]::DWordWL)[0], (h_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C6.i` | i ∈ [0, 3], j ∈ [0, 3] | `IS_HALF[h_addr[i][j]]` | μ |
+| `SHA256-C7.i` | i ∈ [1, 3] | `ADD<h_addr[i]::DWordWL; h_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
+| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[[h[8 * i + 3], h[8 * i + 2], h[8 * i + 1], h[8 * i + 0], h[8 * i + 7], h[8 * i + 6], h[8 * i + 5], h[8 * i + 4]]; 0, h_addr[i]::DWordWL, [out[8 * i + 3], out[8 * i + 2], out[8 * i + 1], out[8 * i + 0], out[8 * i + 7], out[8 * i + 6], out[8 * i + 5], out[8 * i + 4]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
+
+Then we prepare the message schedule, by emitting the input chunk with multiplicities corresponding to the number of times it will be read during a compression evaluation. The `SHA256MSGSCHED` chip itself is implicitly invoked by itself and by `SHA256ROUND`, setting the `amount` column appropriately for the number of times the `w` value is required.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `DVRM-C2` |  | `LT[1 - div_by_zero; abs_r, abs_d, 0]` | μ_sum |
-| `DVRM-C3` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
-| `DVRM-C4.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
-| | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
-| `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
-| `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
-| | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
-
-## R5: overflow
+| `SHA256-C9.i` | i ∈ [0, 0] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -2 * μ |
+| `SHA256-C10.i` | i ∈ [1, 8] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -3 * μ |
+| `SHA256-C11.i` | i ∈ [9, 13] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -4 * μ |
+| `SHA256-C12.i` | i ∈ [14, 15] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -5 * μ |
 
-The ISA requires that ``q` = `n`` and ``r` = 0` in the event of overflow (i.e., when ``n` = -2^63` and ``d` = -1`). We note that the second half of this requirement is already satisfied by R2: since ``d` = -1 != 0`, R2 requires that `|`r`| < |`d`| = 1`, to which ``r` = 0` is the only satisfying value.
+And finally, we provide the boundaries for the `SHA256ROUND` chip and the final addition of the compression to the old state. Observe that we embed the addition into the upper 32 bits of a double word, in order to satisfy and use the `ADD` chip.
 
-We moreover find that R1 can be leveraged to enforce the correct value of `q`. While ``n` = `qd` + `r`` (R1) does _not_ hold in the case of overflow, the relation ``n` = |`q`|`d` + `r`` _does_. We moreover note that the 64-bit _signed_ two's complement representation of `-2^63` is identical to the 64-bit _unsigned_ representation of `|-2^63| = 2^63`. As such, by interpreting `q` as an unsigned integer when ``overflow` = 1`, it follows that R1 will enforce ``q` = `0x80...00``.
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256-C13` |  | `SHA256ROUND[timestamp, [2^0 * h[3] + 2^8 * h[2] + 2^16 * h[1] + 2^24 * h[0], 2^0 * h[7] + 2^8 * h[6] + 2^16 * h[5] + 2^24 * h[4], 2^0 * h[11] + 2^8 * h[10] + 2^16 * h[9] + 2^24 * h[8], 2^0 * h[15] + 2^8 * h[14] + 2^16 * h[13] + 2^24 * h[12], 2^0 * h[19] + 2^8 * h[18] + 2^16 * h[17] + 2^24 * h[16], 2^0 * h[23] + 2^8 * h[22] + 2^16 * h[21] + 2^24 * h[20], 2^0 * h[27] + 2^8 * h[26] + 2^16 * h[25] + 2^24 * h[24], 2^0 * h[31] + 2^8 * h[30] + 2^16 * h[29] + 2^24 * h[28]], 0]` | μ |
+| `SHA256-C14` |  | `SHA256ROUND[timestamp, last_round_out, 64]` | -μ |
+| `SHA256-C15.i` | i ∈ [0, 31] | `IS_BYTE[out[i]]` | μ |
+| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<[0, 2^0 * out[4 * i + 3] + 2^8 * out[4 * i + 2] + 2^16 * out[4 * i + 1] + 2^24 * out[4 * i + 0]]; [0, last_round_out[i]], [0, 2^0 * h[4 * i + 3] + 2^8 * h[4 * i + 2] + 2^16 * h[4 * i + 1] + 2^24 * h[4 * i + 0]]>` |  |
 
-In summary, in case of overflow R2 enforces that ``r` = 0`. Moreover it suffices to interpret `q` as unsigned integer ([dvrm:c:sign_q]); R1 will ensure it contains the correct value.
+In this VM, we assign syscall number -1 to the `SHA256` accelerator. The chip therefore contributes the following interaction to the lookup-argument:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `DVRM-C7` | `sign_q` = `signed` dot (1- `overflow`) |  |
-| | _polynomial:_ `signed * (1 - overflow) - sign_q = 0` | |
-| `DVRM-C8` | `ZERO[overflow; n[0] + n[1] + n[2] + (n[3] - 2^15 * sign_n) + (1 - sign_n) + (65535 - d[0]) + (65535 - d[1]) + (65535 - d[2]) + (65535 - d[3])]` | μ_sum |
+| `SHA256-C17` | `IS_BIT<μ>` |  |
+| `SHA256-C18` | `ECALL[timestamp, [2^32 - 1, 2^32 - 1]]` | -μ |
 
-We highlight [dvrm:c:overflow]. Recall that the `overflow` flag should be set if and only if (i) ``signed` = 1`, (ii) ``n` = `0x80...00``, and (iii) ``d` = `0xFF...FF``. These requirements are equivalent to the state where: $ forall i in [0, 3]:&& 65535 - `d`_i &= 0,\ forall i in [0, 2]:&& `n`_i &= 0,\ && `n`_3 - 2^15 dot `sign_n` &= 0,\ && 1 - `sign_n` &= 0,\ $ where ``signed` = 1` follows from the last equality. The requirement is phrased in this way, because the left-hand sides of the above expressions are `>= 0` by construction. Given that the sum of these expressions does not exceed `2^19` (and thus never wraps in the field), we can now say that the `overflow` bit should be set to `1` if and only if their sum evaluates to `0`. The `ZERO` lookup guarantees this to be the case.
+### Padding
 
-## R1: $#`n` = #`qd` + #`r`$
+## `SHA256MSGSCHED` chip
 
-Rewriting R1, we find the constraint `not`overflow` => `n` - `r` = `qd``.
+### Columns
 
-Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality `mod 2^128`, rather than `mod 2^64`. To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to ``qd` mod 2^128` using constraints [dvrm:c:mul_lower] and [dvrm:c:mul_upper]; [dvrm:c:q_range] is included to uphold assumption [mul:a:rhs].
+The `SHA256MSGSCHED` chip leverages the following variables and columns:
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C9` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
-| `DVRM-C10` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
-| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+### Input
 
-It now remains to enforce that `extended_n_sub_r` is the _signed_ 128-bit representation of ``n`-`r``. Here, we introduce `extended_n` and `extended_r`. By their definition, these variables contain the signed 128-bit representations of `n` and `r`. The `carry` variable has been defined such that it mimics those in the `ADD` chip, except that here we add two `QuadHL`s rather than two `DWordHL`, thus needing four carry bits instead of two. With this in place, [dvrm:c:n_sub_r] (mimicking [add:c:carry]) ensures `extended_n_sub_r` must contain the correct value.
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | The timestamp/identifier for this execution of the message schedule |
+| `index` | `BaseField` | The index of the output word |
+| `amount` | `BaseField` | The multiplicity with which to output the resulting word |
 
-Lastly, observe that ``n` - `r` in (-2^64, 2^64)`, _regardless_ of the value of `signed`. Moreover, note that the upper halves of the 128-bit representations of all values in this range are either `0xFFFFFFFF` (negative) or `0x00000000` (non-negative). This means that we do not need to store all 128 bits of `extended_n_sub_r`. Rather, we need only store the lower 64-bits, and a separate bit (`sign_n_sub_r`) indicating whether the top limbs are all-ones or all-zeroes. The prover is free to select the value for `sign_n_sub_r`; only one of the two will fit the proof.
+### Output
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C12.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
-| `DVRM-C13.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
-| `DVRM-C14.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
-| `DVRM-C15` |  | `IS_BIT<sign_n_sub_r>` |  |
+| Name | Type | Description |
+|------|------|-------------|
+| `out` | `WordHL` | The output, `w[index]` |
 
-## R4: division-by-zero
+### Auxiliary
 
-R4 requires that ``q` = 2^64-1` (unsigned) or `-1` (signed) and ``r` = n` when ``d` = 0`. Recalling R1, we see that ``n` = `q` `d` + `r` = `r`` when ``d` = 0`, already enforces the latter. Next, we note that, in two's complement, the _unsigned_ value `2^64-1` and _signed_ value `-1` are both represented by the bit string `0xFFFFFFFF`. Hence, only [dvrm:c:q_if_div_by_zero] is required to completely constrain R4; [dvrm:c:div_by_zero] just ensures the `div_by_zero` flag is set when ``d` = 0`.
+| Name | Type | Description |
+|------|------|-------------|
+| `back2` | `Word` | `w[index - 2]` |
+| `back7` | `Word` | `w[index - 7]` |
+| `back15` | `Word` | `w[index - 15]` |
+| `back16` | `Word` | `w[index - 16]` |
+| `s0` | `Word` | `(back15 >>> 7) xor (back15 >>> 18) xor (back15 >> 3)` |
+| `s1` | `Word` | `(back2 >>> 17) xor (back2 >>> 19) xor (back2 >> 10)` |
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C16.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
-| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
-| `DVRM-C17` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
+### Virtual
 
-## Other
+| Name | Type | Description |
+|------|------|-------------|
+| `carry` | `Byte` | The carry of computing `out` |
 
-The following constraints are included to enforce the values of `sign_n`, `sign_r` and `sign_d` are correct.
+**Definition of `carry`:**
+```
+carry := 2^-32 * (back16 + s0 + back7 + s1 - out::Word)
+```
 
-| Tag | Description |
-|-----|-------------|
-| `DVRM-C18` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C19` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C20` | `SIGN<sign_d; d[3], signed>` |
+### Multiplicity
 
-## Output
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
 
-Lastly, this chip contributes the following to the lookup:
+### Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `SHA256MSGSCHED-A1` |  | `IS_WORD[SHA256_M[timestamp, i]]` for 0 ≤ i < `index` |
+
+### Constraints
+
+First, we gather the dependencies from earlier in the message schedule.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
-| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+| `SHA256MSGSCHED-C1` | `IS_BYTE[index - 16]` | μ |
+| `SHA256MSGSCHED-C2` | `SHA256_M[back2; timestamp, index - 2]` | μ |
+| `SHA256MSGSCHED-C3` | `SHA256_M[back7; timestamp, index - 7]` | μ |
+| `SHA256MSGSCHED-C4` | `SHA256_M[back15; timestamp, index - 15]` | μ |
+| `SHA256MSGSCHED-C5` | `SHA256_M[back16; timestamp, index - 16]` | μ |
 
-= Padding To pad the  table, we use the following data, representing the unsigned division `frac(0, 0, style: "horizontal")`:
+Then, we calculate the result. It suffices to check that the carry of adding four range-checked words into a range-checked word is not too big, following the logic from [add]. In this case, using the `IS_BYTE` constraint allows us to add multiple words together at the same time, without needing to store and range-check intermediate results.
 
-## Columns
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256MSGSCHED-C6` |  | `ROTXOR[s0; back15, 2, 11, 3, 0]` | μ |
+| `SHA256MSGSCHED-C7` |  | `ROTXOR[s1; back2, 3, 2, 10, 0]` | μ |
+| `SHA256MSGSCHED-C8` |  | `IS_BYTE[carry]` | μ |
+| `SHA256MSGSCHED-C9.i` | i ∈ [0, 1] | `IS_HALF[out[i]]` | μ |
+
+Finally, we contribute to the LogUp.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHA256MSGSCHED-C10` | `IS_BIT<μ>` |  |
+| `SHA256MSGSCHED-C11` | `μ` = 0 => `amount` = 0 |  |
+| | _polynomial:_ `(1 - μ) * amount = 0` | |
+| `SHA256MSGSCHED-C12` | `SHA256_M[out::Word; timestamp, index]` | -amount |
+
+## `SHA256ROUND` chip
+
+### Columns
+
+The `SHA256ROUND` chip leverages the following variables and columns:
 
 ### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `n` | `DWordHL` | The numerator |
-| `d` | `DWordHL` | The denominator |
-| `signed` | `Bit` | Whether to interpret the input as signed (1) or unsigned (0) integers. |
+| `timestamp` | `DWordWL` | The timestamp/identifier for this execution of the round function |
+| `a` | `WordBL` | State element |
+| `b` | `WordBL` | State element |
+| `c` | `WordBL` | State element |
+| `d` | `Word` | State element |
+| `e` | `WordBL` | State element |
+| `f` | `WordBL` | State element |
+| `g` | `WordBL` | State element |
+| `h` | `Word` | State element |
+| `index` | `BaseField` | The round number/index |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `q` | `DWordHL` | The quotient; $`n` / `d`$ rounded towards zero. |
-| `r` | `DWordHL` | The remainder; $`n` - `q` `d`$. |
+| `out_a` | `WordHL` | `temp1 + temp2` |
+| `out_e` | `WordHL` | `d + temp1` |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `div_by_zero` | `Bit` | Whether $`d`=0$. |
-| `overflow` | `Bit` | Whether $`n` = -2^63$ and $`d`=-1$. |
-| `abs_r` | `DWordWL` | Absolute value of `r`. |
-| `abs_d` | `DWordWL` | Absolute value of `d`. |
-| `n_sub_r` | `DWordHL` | $`n`-`r`$. |
-| `sign_n_sub_r` | `Bit` | Sign of `n_sub_r`. |
-| `sign_n` | `Bit` | Sign of `n`. |
-| `sign_d` | `Bit` | Sign of `d`. |
-| `sign_q` | `Bit` | Sign of `q`. |
-| `sign_r` | `Bit` | Sign of `r`. |
+| `a_and_b` | `WordBL` | `a & b`. Part of `maj` |
+| `a_xor_b` | `WordBL` | `a xor b`. Part of `maj` |
+| `c_and_a_xor_b` | `WordBL` | `c & (a xor b)`. Part of `maj` |
+| `e_and_f` | `WordBL` | `e & f`. Part of `ch` |
+| `not_e_and_g` | `WordBL` | `(not e) & g`. Part of `ch` |
+| `kval` | `Word` | `k[index]` |
+| `S0` | `Word` | Transformation of `a` |
+| `S1` | `Word` | Transformation of `e` |
+| `wval` | `Word` | `w[index]` |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
-| `extended_n` | `QuadHL` | sign-extended value of `n`. |
-| `extended_r` | `QuadHL` | sign-extended value of `r`. |
-| `extension_n_sub_r` | `DWordHL` | sign-extension limbs of `n_sub_r`. |
-| `extended_n_sub_r` | `QuadHL` | sign-extended value of `n_sub_r`. |
-| `carry` | `Bit[4]` | carries for adding `extended_n_sub_r` to `extended_r`, forming `extended_n`. |
-| `μ_sum` | `BaseField` | sum of multiplicities |
-
-**Definition of `extended_n`:**
+| `carry_a` | `Byte` | The carry from `out_a` |
+| `carry_e` | `Byte` | The carry from `out_e` |
+| `ch` | `Word` | ch value |
+| `maj` | `Word` | maj value |
+| `temp1` | `BaseField` | `temp1` value |
+| `temp2` | `BaseField` | `temp2` value |
+
+**Definition of `carry_a`:**
 ```
-extended_n (when iter=[0, 3]) := n[i]
-extended_n (when iter=[4, 7]) := 65535 * sign_n
+carry_a := 2^-32 * (temp1 + temp2 - out_a::Word)
 ```
 
-**Definition of `extended_r`:**
+**Definition of `carry_e`:**
 ```
-extended_r (when iter=[0, 3]) := r[i]
-extended_r (when iter=[4, 7]) := 65535 * sign_r
+carry_e := 2^-32 * (d + temp1 - out_e::Word)
 ```
 
-**Definition of `extension_n_sub_r`:**
+**Definition of `ch`:**
 ```
-extension_n_sub_r := 65535 * sign_n_sub_r
+ch := e_and_f::Word + not_e_and_g::Word
 ```
 
-**Definition of `extended_n_sub_r`:**
+**Definition of `maj`:**
 ```
-extended_n_sub_r (when iter=[0, 3]) := n_sub_r[i]
-extended_n_sub_r (when iter=[4, 7]) := extension_n_sub_r[i - 4]
+maj := a_and_b::Word + c_and_a_xor_b::Word
 ```
 
-**Definition of `carry`:**
+**Definition of `temp1`:**
 ```
-carry (when iter=0) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] - (extended_n::QuadWL)[i])
-carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r::QuadWL)[i] + carry[i - 1] - (extended_n::QuadWL)[i])
+temp1 := h + S1 + ch + kval + wval
 ```
 
-**Definition of `μ_sum`:**
+**Definition of `temp2`:**
 ```
-μ_sum := μ_q + μ_r
+temp2 := S0 + maj
 ```
 
 ### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `μ_q` | `BaseField` |  |
-| `μ_r` | `BaseField` |  |
+| `μ` | `Bit` |  |
 
-## Assumptions
+### Assumptions
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `DVRM-A1.i` | i ∈ [0, 3] | `IS_HALF[n[i]]` |
-| `DVRM-A2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` |
-| `DVRM-A3` |  | `IS_BIT<signed>` |
-
----
+| `SHA256ROUND-A1` |  | All state values are valid words |
 
-# LOAD Chip
-
-The  chip provides functionality to read values from memory and sign-extend them where appropriate. It delegates low-level memory handling to the `MEMW` chip ([memw]).
-
-= Variables
+### Constraints
 
-The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+First, we compute the necessary intermediate values.
 
-= Assumptions
+To compute `maj`, observe that `(a & b) xor (a & c) xor (b & c) = (a & b) xor (c & (a xor b))` by distributivity. Moreover, since `(a & b)` and `(a xor b)` have no set bits in common, neither do `(a & b)` and `(c & (a xor b))`; hence we can replace the top-level XOR with a field addition and compute `(a & b) + (c & (a xor b))`, needing fewer intermediate columns. Similarly, `ch` can be written as `(e & f) + ((2^32 - 1 - e) & g)`.
 
-= Constraints The chip delegates the actual memory interaction to the `MEMW` chip, and ensures correctness of the requested sign/zero extension. The output `res` is correctly range-checked as long as the memory contents are.
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256ROUND-C1.i` | i ∈ [0, 3] | `AND_BYTE[a_and_b[i]; a[i], b[i]]` | μ |
+| `SHA256ROUND-C2.i` | i ∈ [0, 3] | `XOR_BYTE[a_xor_b[i]; a[i], b[i]]` | μ |
+| `SHA256ROUND-C3.i` | i ∈ [0, 3] | `AND_BYTE[c_and_a_xor_b[i]; c[i], a_xor_b[i]]` | μ |
+| `SHA256ROUND-C4.i` | i ∈ [0, 3] | `AND_BYTE[e_and_f[i]; e[i], f[i]]` | μ |
+| `SHA256ROUND-C5.i` | i ∈ [0, 3] | `AND_BYTE[not_e_and_g[i]; 255 - e[i], g[i]]` | μ |
+| `SHA256ROUND-C6` |  | `SHA256_K[kval; index]` | μ |
+| `SHA256ROUND-C7` |  | `SHA256_M[wval; timestamp, index]` | μ |
+| `SHA256ROUND-C8` |  | `ROTXOR[S0; a::Word, 6, 9, 2, 1]` | μ |
+| `SHA256ROUND-C9` |  | `ROTXOR[S1; e::Word, 9, 14, 6, 1]` | μ |
+
+Then we constrain the addition for the new state, constraining additions with the same `IS_BYTE` trick as before.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `LOAD-C1` |  | `read2` + `read4` + `read8` => `μ` |  |
-| | | _polynomial:_ `(read2 + read4 + read8) * (1 - μ) = 0` | |
-| `LOAD-C2` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
-| `LOAD-C3` |  | `MSB8[sign_bit; res[0]]` | read1 |
-| `LOAD-C4` |  | `MSB8[sign_bit; res[1]]` | read2 |
-| `LOAD-C5` |  | `MSB8[sign_bit; res[3]]` | read4 |
-| `LOAD-C6.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
-| | | _polynomial:_ `(1 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `LOAD-C7.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
-| | | _polynomial:_ `(1 - read4 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `LOAD-C8` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
-| | | _polynomial:_ `(1 - read2 - read4 - read8) * (res[1] - signed * sign_bit * 255) = 0` | |
+| `SHA256ROUND-C10.i` | i ∈ [0, 1] | `IS_HALF[out_a[i]]` | μ |
+| `SHA256ROUND-C11` |  | `IS_BYTE[carry_a]` | μ |
+| `SHA256ROUND-C12.i` | i ∈ [0, 1] | `IS_HALF[out_e[i]]` | μ |
+| `SHA256ROUND-C13` |  | `IS_BYTE[carry_e]` | μ |
 
-The chip contributes the following to the lookup argument.
+Finally, we chain the rounds together through the interactions.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8, signed]` | -μ |
+| `SHA256ROUND-C14` | `SHA256ROUND[timestamp, [a::Word, b::Word, c::Word, d, e::Word, f::Word, g::Word, h], index]` | -μ |
+| `SHA256ROUND-C15` | `SHA256ROUND[timestamp, [out_a::Word, a::Word, b::Word, c::Word, out_e::Word, e::Word, f::Word, g::Word], index + 1]` | μ |
 
-= Padding
+### Padding
 
-The table can be padded to the next power of two with the following value assignments:
+## `ROTXOR` chip
 
-## Columns
+This chip takes as input a word `a`, three 4-bit values `r0`, `r1`, `r2`, and a bit `last_rot`. It computes `(a >>> (16 + r0)) xor (a >>> (16 + r0 - r1)) xor (a >>> r2)` if `last_rot = 1`, and `(a >>> (16 + r0)) xor (a >>> (16 + r0 - r1)) xor (a >> r2)` if `last_rot = 0`, where `>>>` denotes right rotation and `>>` denotes logical shift right. We choose this representation so that all shift amounts required fit into 4 bits, making the usage of `HWSL` more straightforward and avoiding extra columns to represent more bits.
+
+### Columns
+
+The `ROTXOR` chip leverages the following variables and columns:
 
 ### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `base_address` | `DWordWL` | The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is |
-| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
-| `read2` | `Bit` | Whether to read exactly 2 bytes |
-| `read4` | `Bit` | Whether to read exactly 4 bytes |
-| `read8` | `Bit` | Whether to read exactly 8 bytes |
-| `signed` | `Bit` | Whether to sign-extend (1) or zero-extend (0) |
+| `a` | `WordHL` | The input value |
+| `r0` | `Byte` | The first amount of rotation, low nibble |
+| `r1` | `Byte` | The second amount of rotation, low nibble |
+| `r2` | `Byte` | The third amount of rotation, low nibble |
+| `last_rot` | `Bit` | Whether the rotation by `r2` is a rotation (1) or just a shift (0) |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `res` | `DWordBL` | The result of reading (up to) 8 bytes from `base_address`, extended corresponding to `signed`. |
+| `out` | `WordBL` | The output |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `sign_bit` | `Bit` | The sign bit extracted from the bytes retrieved from memory |
-
-### Virtual
-
-| Name | Type | Description |
-|------|------|-------------|
-| `read1` | `Bit` | Whether to read exactly 1 byte |
-
-**Definition of `read1`:**
-```
-read1 := μ - read2 - read4 - read8
-```
+| `a0_left` | `WordHL` | `a << (16 - r0)` |
+| `a0_right` | `WordHL` | `a >> r0` |
+| `a1_left` | `WordHL` | `a0 << r1` |
+| `a1_right` | `WordHL` | `a0 >> (16 - r1)` |
+| `a2_left` | `WordHL` | `a << (16 - r2)` |
+| `a2_right` | `WordHL` | `a >> r2` |
+| `a0` | `WordBL` | `a >>> (16 + r0)` |
+| `a1` | `WordBL` | `a >>> (16 + r0 - r1)` (which is `a0 <<< r1`) |
+| `a2` | `WordBL` | `a >>> r2` or `a >> r2` |
+| `a01` | `WordBL` | $a_0 xor a_1$ |
 
 ### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `μ` | `Bit` |  |
+| `μ` | `BaseField` |  |
 
-## Assumptions
+### Assumptions
+
+Range checking for all elements is inherited from the bitwise lookups. We can safely assume that no `r_i` will be zero, and avoid extra work due to right rotation needing `16 - shift` as arguments to the `HWSL` interactions.
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `LOAD-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
-| `LOAD-A2` |  | `IS_BIT<signed>` |
-| `LOAD-A3` |  | `IS_BIT<read2>` |
-| `LOAD-A4` |  | `IS_BIT<read4>` |
-| `LOAD-A5` |  | `IS_BIT<read8>` |
-| `LOAD-A6` |  | `IS_BIT<read2 + read4 + read8>` |
-| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+| `ROTXOR-A1` |  | $#`r0`, #`r1`, #`r2` in [1, 15]$ |
 
----
+### Constraints
 
-# ECALL Chips
+We first compute all rotations (or shifts) of `a`. `a1` is computed as a left rotation of `a0`, in order to not need additional columns to represent the full right-rotation amounts.
 
----
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `ROTXOR-C1.i` | i ∈ [0, 1] | `HWSL[['arr', ['idx', 'a0_left', 'i'], ['idx', 'a0_right', 'i']]; a[i], 16 - r0]` | μ |
+| `ROTXOR-C2.i` | i ∈ [0, 1] | `HWSL[['arr', ['idx', 'a1_left', 'i'], ['idx', 'a1_right', 'i']]; (a0::WordHL)[i], r1]` | μ |
+| `ROTXOR-C3.i` | i ∈ [0, 1] | `HWSL[['arr', ['idx', 'a2_left', 'i'], ['idx', 'a2_right', 'i']]; a[i], 16 - r2]` | μ |
+| `ROTXOR-C4.i` | i ∈ [0, 1] | `a0[i]` = `a0_left[i]` + `a0_right[1 - i]` |  |
+| | | _polynomial:_ `(a0::WordHL)[i] - a0_left[i] - a0_right[1 - i] = 0` | |
+| `ROTXOR-C5.i` | i ∈ [0, 1] | `a1[i]` = `a1_left[i]` + `a1_right[1 - i]` |  |
+| | | _polynomial:_ `(a1::WordHL)[i] - a1_left[i] - a1_right[1 - i] = 0` | |
+| `ROTXOR-C6` |  | `a2[0]` = `a2_left[1]` + `a2_right[0]` |  |
+| | | _polynomial:_ `(a2::WordHL)[0] - a2_left[1] - a2_right[0] = 0` | |
+| `ROTXOR-C7` |  | `a2[1]` = `last_rot` dot `a2_left[0]` + `a2_right[1]` |  |
+| | | _polynomial:_ `(a2::WordHL)[1] - last_rot * a2_left[0] - a2_right[1] = 0` | |
+
+Then the bitwise XOR of the results.
 
-# BITWISE Chips
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `ROTXOR-C8.i` | i ∈ [0, 3] | `XOR_BYTE[a01[i]; a0[i], a1[i]]` | μ |
+| `ROTXOR-C9.i` | i ∈ [0, 3] | `XOR_BYTE[out[i]; a01[i], a2[i]]` | μ |
 
-The  chips deal with precomputed lookup tables for bitwise boolean operations and convenience functionalities over small domains.
+And finally contribute to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `ROTXOR-C10` | `ROTXOR[out::Word; a::Word, r0, r1, r2, last_rot]` | -μ |
 
-= Variables
+### Padding
 
-The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
+## Constant lookup
 
-*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
+As mentioned, we provide the round constants through a short precomputed lookup table: `SHA256_K`.
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `index` | `BaseField` |  |
+| `K` | `Word` |  |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` |  |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `SHA256_K-C1` | `SHA256_K[K; index]` | -μ |
 
-= Lookup This chip adds the following interactions to the lookup:
+## Notes/optimizations
 
-= Notes/Optimizations The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
+- This could instead be designed following the [RISC-V Crypto Scalar extension `Zknh`], for wider compatibility, but this design is likely to be more efficient. It is still possible, if desired, to expose `ROTXOR` (or a selection of parameter instantiations thereof) as implementation for these primitives. - The message schedule could be exposed as its own ECALL instead, but the direct integration leads to better efficiency. - Some of these chips could be made narrower, at the cost of introducing some extra lookups and extra tables to compute and store intermediate results.
 
 ## Columns
 
@@ -1687,51 +2617,66 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 
 | Name | Type | Description |
 |------|------|-------------|
-| `X` | `Byte` |  |
-| `Y` | `Byte` |  |
-| `Z` | `B4` |  |
+| `timestamp` | `DWordWL` | Timestamp at which the ECALL is invoked. Used as unique identifier for this invocation. |
+| `h` | `Byte[32]` | The state of the hash function. |
+| `h_addr` | `DWordHL[4]` | The addresses of the doublewords of `h` |
+| `m` | `Byte[64]` | The input chunk. |
+| `m_addr` | `DWordHL[8]` | The addresses of the doublewords of `m` |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `AND` | `Byte` | the binary AND of `X` and `Y` |
-| `OR` | `Byte` | the binary OR of `X` and `Y` |
-| `XOR` | `Byte` | the binary XOR of `X` and `Y` |
-| `MSB8` | `Bit` | the most significant bit of `X` |
-| `MSB16` | `Bit` | the most significant bit of `Y` |
-| `ZERO` | `Bit` | whether $`X` = 0$, $`Y` = 0$ and $`Z` = 0$. |
-| `SLL` | `Half` | `X\|\|Y` logically left-shifted by `Z`: $((`X` + 256`Y`) `<<` `Z`) mod 2^16$ |
-| `SLLC` | `Half` | `X\|\|Y` logically right-shifted by `Z`: $(`X` + 256`Y`) `>>` (16 - `Z`)$ |
+| `out` | `Byte[32]` | The new state. |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `last_round_out` | `Word[8]` | The output from the last compression round |
 
 ### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `μ_AND` | `BaseField` |  |
-| `μ_OR` | `BaseField` |  |
-| `μ_XOR` | `BaseField` |  |
-| `μ_MSB8` | `BaseField` |  |
-| `μ_MSB16` | `BaseField` |  |
-| `μ_ZERO` | `BaseField` |  |
-| `μ_IS_BYTE` | `BaseField` |  |
-| `μ_IS_HALF` | `BaseField` |  |
-| `μ_IS_B20` | `BaseField` |  |
-| `μ_HWSL` | `BaseField` |  |
+| `μ` | `Bit` |  |
 
 ## Constraints
 
-### contributions
+### memory
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256-C1` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C2.i` | i ∈ [0, 7], j ∈ [0, 3] | `IS_HALF[m_addr[i][j]]` | μ |
+| `SHA256-C3.i` | i ∈ [1, 7] | `ADD<m_addr[i]::DWordWL; m_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
+| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]]; 0, m_addr[i]::DWordWL, ['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]], timestamp, 0, 0, 1]` | μ |
+| `SHA256-C5` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C6.i` | i ∈ [0, 3], j ∈ [0, 3] | `IS_HALF[h_addr[i][j]]` | μ |
+| `SHA256-C7.i` | i ∈ [1, 3] | `ADD<h_addr[i]::DWordWL; h_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
+| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[['arr', ['idx', 'h', ['+', ['*', 8, 'i'], 3]], ['idx', 'h', ['+', ['*', 8, 'i'], 2]], ['idx', 'h', ['+', ['*', 8, 'i'], 1]], ['idx', 'h', ['+', ['*', 8, 'i'], 0]], ['idx', 'h', ['+', ['*', 8, 'i'], 7]], ['idx', 'h', ['+', ['*', 8, 'i'], 6]], ['idx', 'h', ['+', ['*', 8, 'i'], 5]], ['idx', 'h', ['+', ['*', 8, 'i'], 4]]]; 0, h_addr[i]::DWordWL, ['arr', ['idx', 'out', ['+', ['*', 8, 'i'], 3]], ['idx', 'out', ['+', ['*', 8, 'i'], 2]], ['idx', 'out', ['+', ['*', 8, 'i'], 1]], ['idx', 'out', ['+', ['*', 8, 'i'], 0]], ['idx', 'out', ['+', ['*', 8, 'i'], 7]], ['idx', 'out', ['+', ['*', 8, 'i'], 6]], ['idx', 'out', ['+', ['*', 8, 'i'], 5]], ['idx', 'out', ['+', ['*', 8, 'i'], 4]]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
+
+### sched
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256-C9.i` | i ∈ [0, 0] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -2 * μ |
+| `SHA256-C10.i` | i ∈ [1, 8] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -3 * μ |
+| `SHA256-C11.i` | i ∈ [9, 13] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -4 * μ |
+| `SHA256-C12.i` | i ∈ [14, 15] | `SHA256_M[2^0 * m[4 * i + 3] + 2^8 * m[4 * i + 2] + 2^16 * m[4 * i + 1] + 2^24 * m[4 * i + 0]; timestamp, i]` | -5 * μ |
+
+### compress
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHA256-C13` |  | `SHA256ROUND[timestamp, ['arr', ['+', ['*', ['^', 2, 0], ['idx', 'h', 3]], ['*', ['^', 2, 8], ['idx', 'h', 2]], ['*', ['^', 2, 16], ['idx', 'h', 1]], ['*', ['^', 2, 24], ['idx', 'h', 0]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 7]], ['*', ['^', 2, 8], ['idx', 'h', 6]], ['*', ['^', 2, 16], ['idx', 'h', 5]], ['*', ['^', 2, 24], ['idx', 'h', 4]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 11]], ['*', ['^', 2, 8], ['idx', 'h', 10]], ['*', ['^', 2, 16], ['idx', 'h', 9]], ['*', ['^', 2, 24], ['idx', 'h', 8]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 15]], ['*', ['^', 2, 8], ['idx', 'h', 14]], ['*', ['^', 2, 16], ['idx', 'h', 13]], ['*', ['^', 2, 24], ['idx', 'h', 12]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 19]], ['*', ['^', 2, 8], ['idx', 'h', 18]], ['*', ['^', 2, 16], ['idx', 'h', 17]], ['*', ['^', 2, 24], ['idx', 'h', 16]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 23]], ['*', ['^', 2, 8], ['idx', 'h', 22]], ['*', ['^', 2, 16], ['idx', 'h', 21]], ['*', ['^', 2, 24], ['idx', 'h', 20]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 27]], ['*', ['^', 2, 8], ['idx', 'h', 26]], ['*', ['^', 2, 16], ['idx', 'h', 25]], ['*', ['^', 2, 24], ['idx', 'h', 24]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 31]], ['*', ['^', 2, 8], ['idx', 'h', 30]], ['*', ['^', 2, 16], ['idx', 'h', 29]], ['*', ['^', 2, 24], ['idx', 'h', 28]]]], 0]` | μ |
+| `SHA256-C14` |  | `SHA256ROUND[timestamp, last_round_out, 64]` | -μ |
+| `SHA256-C15.i` | i ∈ [0, 31] | `IS_BYTE[out[i]]` | μ |
+| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'out', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'out', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'out', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'out', ['+', ['*', 4, 'i'], 0]]]]]; ['arr', 0, ['idx', 'last_round_out', 'i']], ['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'h', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'h', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'h', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'h', ['+', ['*', 4, 'i'], 0]]]]]>` |  |
+
+### lookup
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `BITWISE-C1` | `AND_BYTE[AND; X, Y]` | -μ_AND |
-| `BITWISE-C2` | `OR_BYTE[OR; X, Y]` | -μ_OR |
-| `BITWISE-C3` | `XOR_BYTE[XOR; X, Y]` | -μ_XOR |
-| `BITWISE-C4` | `MSB8[MSB8; X]` | -μ_MSB8 |
-| `BITWISE-C5` | `MSB16[MSB16; X + 256 * Y]` | -μ_MSB16 |
-| `BITWISE-C6` | `ZERO[ZERO; X + 256 * Y + 65536 * Z]` | -μ_ZERO |
-| `BITWISE-C7` | `IS_BYTE[X]` | -μ_IS_BYTE |
-| `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
-| `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
-| `BITWISE-C10` | `HWSL[['arr', 'SLL', 'SLLC']; X + 256 * Y, Z]` | -μ_HWSL |
\ No newline at end of file
+| `SHA256-C17` | `IS_BIT<μ>` |  |
+| `SHA256-C18` | `ECALL[timestamp, ['arr', ['-', ['^', 2, 32], 1], ['-', ['^', 2, 32], 1]]]` | -μ |
\ No newline at end of file

From 9819eac26d36780de88f479216951e3998a6d7f6 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Tue, 14 Apr 2026 11:39:37 -0300
Subject: [PATCH 092/105] update_script

---
 scripts/typst_to_md.py | 261 ++++++++++++++++++++++++++++++-----------
 1 file changed, 194 insertions(+), 67 deletions(-)

diff --git a/scripts/typst_to_md.py b/scripts/typst_to_md.py
index be8b09dea..e0a949536 100644
--- a/scripts/typst_to_md.py
+++ b/scripts/typst_to_md.py
@@ -159,6 +159,7 @@ def iters_to_text(obj: dict) -> str:
 
 # Chapters in order (from book.typ)
 CHAPTERS = [
+    ("logup", "LogUp Argument"),
     ("memory", "Memory Argument"),
     ("variables", "Variables"),
     ("signatures", "Signatures"),
@@ -166,17 +167,20 @@ def iters_to_text(obj: dict) -> str:
     ("sign", "SIGN Template"),
     ("add", "ADD/SUB Template"),
     ("neg", "NEG Template"),
+    ("memw", "MEMW Chip"),
     ("decode", "DECODE Table"),
     ("cpu", "CPU Chip"),
     ("shift", "SHIFT Chip"),
     ("branch", "BRANCH Chip"),
-    ("memw", "MEMW Chip"),
     ("lt", "LT Chip"),
     ("mul", "MUL Chip"),
     ("dvrm", "DVRM Chip"),
     ("load", "LOAD Chip"),
-    ("ecall", "ECALL Chips"),
     ("bitwise", "BITWISE Chips"),
+    ("about_ecalls", "About ECALL"),
+    ("halt", "HALT Chip"),
+    ("commit", "COMMIT Chip"),
+    ("sha256", "SHA256 Accelerator"),
 ]
 
 
@@ -221,6 +225,9 @@ def parse_typst_prose(content: str) -> list:
             if current_para:
                 elements.append(('para', ' '.join(current_para)))
                 current_para = []
+            # Extract chip variable (first argument)
+            chip_var_match = re.match(r'#render_constraint_table\((\w+)', stripped)
+            chip_var = chip_var_match.group(1) if chip_var_match else None
             # Extract group names: handles both single `groups: "g"` and array `groups: ("g1", "g2")`
             groups = []
             array_match = re.search(r'groups:\s*\(([^)]*)\)', stripped)
@@ -231,7 +238,43 @@ def parse_typst_prose(content: str) -> list:
                 if single_match:
                     groups = [single_match.group(1)]
             if groups:
-                elements.append(('render_constraints', groups))
+                elements.append(('render_constraints', (chip_var, groups)))
+            else:
+                # No groups specified — render all
+                elements.append(('render_constraints', (chip_var, None)))
+            i += 1
+            continue
+
+        # Capture explicit variable/column table renders
+        if stripped.startswith('#render_chip_variable_table') or stripped.startswith('#render_chip_column_table'):
+            if current_para:
+                elements.append(('para', ' '.join(current_para)))
+                current_para = []
+            chip_var_match = re.match(r'#render_chip_(?:variable|column)_table\((\w+)', stripped)
+            chip_var = chip_var_match.group(1) if chip_var_match else None
+            elements.append(('render_variables', chip_var))
+            i += 1
+            continue
+
+        # Capture explicit assumptions renders
+        if stripped.startswith('#render_chip_assumptions'):
+            if current_para:
+                elements.append(('para', ' '.join(current_para)))
+                current_para = []
+            chip_var_match = re.match(r'#render_chip_assumptions\((\w+)', stripped)
+            chip_var = chip_var_match.group(1) if chip_var_match else None
+            elements.append(('render_assumptions', chip_var))
+            i += 1
+            continue
+
+        # Capture explicit padding table renders
+        if stripped.startswith('#render_chip_padding_table'):
+            if current_para:
+                elements.append(('para', ' '.join(current_para)))
+                current_para = []
+            chip_var_match = re.match(r'#render_chip_padding_table\((\w+)', stripped)
+            chip_var = chip_var_match.group(1) if chip_var_match else None
+            elements.append(('render_padding', chip_var))
             i += 1
             continue
 
@@ -248,13 +291,27 @@ def parse_typst_prose(content: str) -> list:
             i += 1
             continue
 
-        # Detect chip switches: #let chip = load_chip("src/foo.toml", config)
-        load_chip_match = re.match(r'#let\s+chip\s*=\s*load_chip\("([^"]+)"', stripped)
+        # Detect chip loads: #let <varname> = load_chip("src/foo.toml", config)
+        load_chip_match = re.match(r'#let\s+(\w+)\s*=\s*load_chip\("([^"]+)"', stripped)
         if load_chip_match:
             if current_para:
                 elements.append(('para', ' '.join(current_para)))
                 current_para = []
-            elements.append(('load_chip', load_chip_match.group(1)))
+            var_name = load_chip_match.group(1)
+            chip_path = load_chip_match.group(2)
+            elements.append(('load_chip', (var_name, chip_path)))
+            i += 1
+            continue
+
+        # Detect chip name aliases: #let <alias> = raw(<chipvar>.name)
+        name_alias_match = re.match(r'#let\s+(\w+)\s*=\s*raw\((\w+)\.name\)', stripped)
+        if name_alias_match:
+            if current_para:
+                elements.append(('para', ' '.join(current_para)))
+                current_para = []
+            alias = name_alias_match.group(1)
+            chip_var = name_alias_match.group(2)
+            elements.append(('name_alias', (alias, chip_var)))
             i += 1
             continue
 
@@ -266,8 +323,8 @@ def parse_typst_prose(content: str) -> list:
             i += 1
             continue
 
-        # Headings
-        if stripped.startswith('=='):
+        # Headings (= level 1, == level 2, etc.)
+        if stripped.startswith('=') and (len(stripped) == 1 or stripped[len(re.match(r'^=+', stripped).group())] == ' '):
             if current_para:
                 elements.append(('para', ' '.join(current_para)))
                 current_para = []
@@ -529,12 +586,37 @@ def render_assumptions_table(chip: dict, config: dict) -> str:
     return "\n".join(lines)
 
 
+def render_padding_table(chip: dict, config: dict) -> str:
+    """Render padding data as Markdown table."""
+    padding = chip.get("padding", {})
+    if not padding:
+        return ""
+
+    lines = []
+    lines.append("| Column | Value |")
+    lines.append("|--------|-------|")
+
+    for col_name, value in padding.items():
+        lines.append(f"| `{col_name}` | `{value}` |")
+
+    lines.append("")
+    return "\n".join(lines)
+
+
 def convert_chapter(typ_path: Path, toml_path: Path, title: str, config: dict, spec_dir: Path = None) -> str:
     """Convert a chapter from .typ and .toml to Markdown."""
     lines = [f"# {title}", ""]
 
-    # Load TOML data (may be empty for multi-chip files like ecall)
-    chip = load_toml(toml_path)
+    # Load default TOML data (may be empty for prose-only or multi-chip files)
+    default_chip = load_toml(toml_path)
+
+    # Chip registry: variable_name -> chip_data
+    chips = {}
+    if default_chip:
+        chips['chip'] = default_chip
+
+    # Name alias registry: alias -> chip_var_name (from #let alias = raw(chipvar.name))
+    name_aliases = {}
 
     def reset_chip_state():
         return {
@@ -545,7 +627,29 @@ def reset_chip_state():
             'constraint_counter': 1,
         }
 
-    state = reset_chip_state()
+    # State registry: variable_name -> render state
+    states = {}
+    if default_chip:
+        states['chip'] = reset_chip_state()
+
+    def resolve_chip(var_name):
+        """Resolve chip variable name to (chip_data, state)."""
+        if var_name and var_name in chips:
+            if var_name not in states:
+                states[var_name] = reset_chip_state()
+            return chips[var_name], states[var_name]
+        # Fallback to default 'chip' key
+        if 'chip' in chips:
+            if 'chip' not in states:
+                states['chip'] = reset_chip_state()
+            return chips['chip'], states['chip']
+        # Fallback to first loaded chip
+        if chips:
+            first_key = next(iter(chips))
+            if first_key not in states:
+                states[first_key] = reset_chip_state()
+            return chips[first_key], states[first_key]
+        return {}, reset_chip_state()
 
     # Parse Typst prose
     if typ_path.exists():
@@ -554,51 +658,73 @@ def reset_chip_state():
 
         for elem_type, content in elements:
             if elem_type == 'load_chip':
-                # Multi-chip file: switch active chip and reset per-chip state
-                chip_toml_path = spec_dir / content if spec_dir else Path(content)
-                chip = load_toml(chip_toml_path)
-                state = reset_chip_state()
+                var_name, chip_path = content
+                chip_toml_path = spec_dir / chip_path if spec_dir else Path(chip_path)
+                chips[var_name] = load_toml(chip_toml_path)
+                states[var_name] = reset_chip_state()
                 continue
 
-            rendered_columns = state['rendered_columns']
-            rendered_assumptions = state['rendered_assumptions']
-            rendered_constraints = state['rendered_constraints']
-            rendered_constraint_groups = state['rendered_constraint_groups']
-            constraint_counter = state['constraint_counter']
+            if elem_type == 'name_alias':
+                alias, chip_var = content
+                name_aliases[alias] = chip_var
+                continue
 
             if elem_type.startswith('h'):
                 level = int(elem_type[1])
                 lines.append("")
-                # Replace Typst variable references in headings with chip name if available
                 heading_text = content
-                if chip and chip.get('name'):
-                    heading_text = re.sub(r'`[^`]*`\s*chip\b', f"`{chip['name']}` chip", heading_text)
-                lines.append("#" * level + " " + heading_text)
+                # Replace Typst variable references (#varname) with chip names
+                for alias, chip_var in name_aliases.items():
+                    if f'#{alias}' in heading_text and chip_var in chips:
+                        chip_name = chips[chip_var].get('name', alias)
+                        heading_text = heading_text.replace(f'#{alias}', f'`{chip_name}`')
+                # Offset by +1 since the chapter title already uses #
+                lines.append("#" * (level + 1) + " " + heading_text)
                 lines.append("")
 
-                # Render TOML data after relevant headings
-                content_lower = content.lower()
-                if 'column' in content_lower and chip and not rendered_columns:
-                    lines.append(render_variables_table(chip, config))
-                    state['rendered_columns'] = True
-                elif 'assumption' in content_lower and chip and not rendered_assumptions:
-                    lines.append(render_assumptions_table(chip, config))
-                    state['rendered_assumptions'] = True
-                elif content_lower == "constraints" and chip:
-                    state['rendered_constraints'] = True
-
-            elif elem_type == 'render_constraints' and chip:
-                # content is a list of group names to render (in order)
-                group_names = content
-                for group_name in group_names:
-                    if group_name not in state['rendered_constraint_groups']:
-                        # Use the running render-order counter so numbering matches Typst
-                        group_table = render_constraints_table(chip, config, group_filter=group_name, skip_heading=True, start_counter=state['constraint_counter'])
-                        if group_table.strip():
-                            lines.append(group_table)
-                        state['rendered_constraint_groups'].add(group_name)
-                    # Always advance the counter for this group (rendered or already seen)
-                    state['constraint_counter'] += len(chip.get("constraints", {}).get(group_name, []))
+            elif elem_type == 'render_variables':
+                chip_var = content
+                chip_data, st = resolve_chip(chip_var)
+                if chip_data and not st['rendered_columns']:
+                    lines.append(render_variables_table(chip_data, config))
+                    st['rendered_columns'] = True
+
+            elif elem_type == 'render_assumptions':
+                chip_var = content
+                chip_data, st = resolve_chip(chip_var)
+                if chip_data and not st['rendered_assumptions']:
+                    lines.append(render_assumptions_table(chip_data, config))
+                    st['rendered_assumptions'] = True
+
+            elif elem_type == 'render_padding':
+                chip_var = content
+                chip_data, st = resolve_chip(chip_var)
+                if chip_data:
+                    padding = render_padding_table(chip_data, config)
+                    if padding.strip():
+                        lines.append(padding)
+
+            elif elem_type == 'render_constraints':
+                chip_var, group_names = content
+                chip_data, st = resolve_chip(chip_var)
+                if chip_data:
+                    if group_names is None:
+                        # Render all groups
+                        group_names = [cg["name"] for cg in chip_data.get("constraint_groups", [])]
+                    for group_name in group_names:
+                        if group_name not in st['rendered_constraint_groups']:
+                            group_table = render_constraints_table(
+                                chip_data, config,
+                                group_filter=group_name,
+                                skip_heading=True,
+                                start_counter=st['constraint_counter'],
+                            )
+                            if group_table.strip():
+                                lines.append(group_table)
+                            st['rendered_constraint_groups'].add(group_name)
+                        st['constraint_counter'] += len(
+                            chip_data.get("constraints", {}).get(group_name, [])
+                        )
 
             elif elem_type == 'para':
                 lines.append(content)
@@ -608,40 +734,41 @@ def reset_chip_state():
                 lines.append(f"> **Note:** {content}")
                 lines.append("")
 
-    # Render any TOML data that wasn't triggered by prose headings (for the last active chip)
-    rendered_columns = state['rendered_columns']
-    rendered_assumptions = state['rendered_assumptions']
-    rendered_constraints = state['rendered_constraints']
-    rendered_constraint_groups = state['rendered_constraint_groups']
-    constraint_counter = state['constraint_counter']
+    # Fallback: render any TOML data not yet triggered by explicit render calls
+    for var_name, chip_data in chips.items():
+        if var_name not in states:
+            states[var_name] = reset_chip_state()
+        st = states[var_name]
 
-    if chip:
-        if chip.get("variables") and not rendered_columns:
+        if chip_data.get("variables") and not st['rendered_columns']:
             lines.append("## Columns")
             lines.append("")
-            lines.append(render_variables_table(chip, config))
+            lines.append(render_variables_table(chip_data, config))
 
-        if chip.get("assumptions") and not rendered_assumptions:
+        if chip_data.get("assumptions") and not st['rendered_assumptions']:
             lines.append("## Assumptions")
             lines.append("")
-            lines.append(render_assumptions_table(chip, config))
+            lines.append(render_assumptions_table(chip_data, config))
 
-        if chip.get("constraints"):
-            # Get remaining groups in TOML order
-            all_groups_ordered = [cg["name"] for cg in chip.get("constraint_groups", [])]
-            remaining_groups = [g for g in all_groups_ordered if g not in rendered_constraint_groups]
+        if chip_data.get("constraints"):
+            all_groups_ordered = [cg["name"] for cg in chip_data.get("constraint_groups", [])]
+            remaining_groups = [g for g in all_groups_ordered if g not in st['rendered_constraint_groups']]
 
-            if remaining_groups and not rendered_constraints:
-                # No prose Constraints section existed, add one
+            if remaining_groups and not st['rendered_constraints']:
                 lines.append("## Constraints")
                 lines.append("")
 
-            # Render any constraint groups not already rendered inline, continuing the counter
             for group_name in remaining_groups:
-                group_table = render_constraints_table(chip, config, group_filter=group_name, start_counter=constraint_counter)
+                group_table = render_constraints_table(
+                    chip_data, config,
+                    group_filter=group_name,
+                    start_counter=st['constraint_counter'],
+                )
                 if group_table.strip():
                     lines.append(group_table)
-                constraint_counter += len(chip.get("constraints", {}).get(group_name, []))
+                st['constraint_counter'] += len(
+                    chip_data.get("constraints", {}).get(group_name, [])
+                )
 
     result = "\n".join(lines)
     result = re.sub(r'\n{3,}', '\n\n', result)

From d23f806fe61c0a5dec11e0269e413cf35d468165 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Wed, 22 Apr 2026 11:07:33 +0200
Subject: [PATCH 093/105] spec: KECCAK accelerator (#474)

* spec: math/code render mod expr

* spec/type_check: add ModExpr

* spec: add multi-dimensional array support

* spec/KECCAK: introduce v0

* spec/keccak: define padding

* spec: support multidimensional array in signatures

* spec/keccak: add signatures

* spec/keccak: update core chip

* spec/keccak: update keccak_rnd description

* spec/keccak: define round constant lookup

* Apply suggestions from code review

Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com>

* spec/keccak: clarify "optimizations" header

* spec/keccak: list `state_ptr` simplification optimization

* spec/keccak: fix C3

* spec/keccak: fix missing EOF

* spec/keccak: list interaction counts

* spec/keccak: list three-way XOR optimization idea

* spec/tooling: fix mod_expr default

* spec: add spaces round `%` rendering

* spec: reuse `type_to_code` in `signatures.typ`

* Apply suggestions from code review

Co-authored-by: Robin Jadoul <robin.jadoul@gmail.com>

* spec/keccak: update three-way XOR optimization benefits

* spec/ecall: reintroduce ecall-number overview

* spec/keccak: ref to sections in FIPS202 on state endianness

* spec/keccak: fix typo

---------

Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com>
Co-authored-by: Robin Jadoul <robin.jadoul@gmail.com>
---
 spec/about_ecalls.typ      |  10 ++
 spec/book.typ              |   1 +
 spec/chip.typ              |  15 +-
 spec/expr.typ              |  49 ++++--
 spec/keccak.typ            | 122 +++++++++++++++
 spec/signatures.typ        |  26 ++--
 spec/src.typ               |  18 +--
 spec/src/keccak.toml       | 102 +++++++++++++
 spec/src/keccak_rc.toml    |  28 ++++
 spec/src/keccak_round.toml | 301 +++++++++++++++++++++++++++++++++++++
 spec/src/signatures.toml   |  13 ++
 spec/tooling/chip.py       |  26 ++++
 12 files changed, 665 insertions(+), 46 deletions(-)
 create mode 100644 spec/keccak.typ
 create mode 100644 spec/src/keccak.toml
 create mode 100644 spec/src/keccak_rc.toml
 create mode 100644 spec/src/keccak_round.toml

diff --git a/spec/about_ecalls.typ b/spec/about_ecalls.typ
index ef4203610..9b37d5f21 100644
--- a/spec/about_ecalls.typ
+++ b/spec/about_ecalls.typ
@@ -22,3 +22,13 @@ When `ECALL` is executed, it is assumed that:
 - the return value is written to `A0`,
 where `A0`-`A7` are symbolic names for the registers `x10`-`x17`
 #footnote([RISC-V - Register sets; en.wikipedia.org, #link("https://web.archive.org/web/20260209053447/https://en.wikipedia.org/wiki/RISC-V#Register_sets")[[src]]]).
+
+= ECALL number overview
+
+We provide a list of supported ECALL numbers.
+Negative numbers (represented as two's complement 64-bit numbers) are used for our own custom accelerators/extensions.
+
+/ 64: `write` (@commit)
+/ 93: `exit` (@halt)
+/ -1: `SHA256` (@sha256)
+/ -2: `KECCAK` (@keccak)
\ No newline at end of file
diff --git a/spec/book.typ b/spec/book.typ
index 3de02e363..3b65642f9 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -43,6 +43,7 @@
       ("halt.typ", [`HALT` chip], <halt>),
       ("commit.typ", [`COMMIT` chip], <commit>),
       ("sha256.typ", [`SHA256` accelerator], <sha256>),
+      ("keccak.typ", [`KECCAK` accelerator], <keccak>),
     ))
   )
 )
diff --git a/spec/chip.typ b/spec/chip.typ
index f3e0892f7..c6cce5073 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -14,12 +14,15 @@
   .map(pair => pair.at(1))
   .flatten()
   .map(var => {
-    let (label, factor) = if type(var.type) == array {
-      (var.type.at(0), var.type.at(1))
-    } else {
-      (var.type, 1)
+    let (factor, var_type) = (1, var.type)
+    while type(var_type) == array {
+      assert(var_type.len() == 2, message: "invalid var (sub)type length: " + repr(var.type))
+      assert(type(var_type.at(1)) == int, message: "invalid var (sub)type length: " + repr(var.type))
+      factor *= var_type.at(1)
+      var_type = var_type.at(0)
     }
-    config.variables.types.filter(type => type.label == label).first().subtypes.len() * factor
+    
+    config.variables.types.filter(type => type.label == var_type).first().subtypes.len() * factor
   })
   .sum()
 }
@@ -290,7 +293,7 @@
   } else if type(groups) == str {
     groups = (groups,)
   }
-  assert(groups.all(group => group in all_groups), message: "unknown group")
+  assert(groups.all(group => group in all_groups), message: "unknown group: " + repr(groups))
   let selected_constraints = groups.map(g => ((g): chip.constraints.at(g))).join()
 
   // Find the group definition in the constraint_groups
diff --git a/spec/expr.typ b/spec/expr.typ
index 2de0d6ba3..20a55d753 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -1,20 +1,26 @@
 // Types and array types
 // <type> ::= str
-//          | [str, int]
+//          | [<type>, int]
 
 // Check that a type expression is structurally valid, without validating against a set of known base types
 #let check_array_type(typ) = {
-  assert(type(typ.at(0)) == str, message: "Array types need to have a regular type as base")
-  assert(type(typ.at(1)) == int, message: "Array types need to have a constant dimension")
+  while type(typ) == array {
+    assert(typ.len() == 2, message: "Array types must specify two parameters")
+    assert(type(typ.at(1)) == int, message: "Array types need to have a constant dimension")
+    typ = typ.at(0)
+  }
+  assert(type(typ) == str, message: "Array types need to have a regular type as base")
 }
 
 // Render a type to code
 #let type_to_code(typ) = {
-  if type(typ) == array {
-    check_array_type(typ)
-    return raw(typ.at(0) + "[" + str(typ.at(1)) + "]")
-  } else if type(typ) == str {
-    return raw(typ)
+  let label = ""
+  while type(typ) == array {
+    label += "[" + str(typ.at(1)) + "]"
+    typ = typ.at(0)
+  }
+  if type(typ) == str {
+    return raw(typ + label)
   } else {
     assert(false, message: "Unknown format for type: " + repr(typ))
   }
@@ -54,12 +60,13 @@
   "cast": 3, // cast
   "mul": 4,  // *
   "div": 5,  // /
-  "sum": 6,  // Σ
-  "not": 7,  // not
-  "sub": 8,  // -
-  "add": 9,  // +  
-  "eq": 10,   // = and :=
-  "MAX": 11, // <the void outside every expression>
+  "mod": 6,  // mod
+  "sum": 7,  // Σ
+  "not": 8,  // not
+  "sub": 9,  // -
+  "add": 10,  // +  
+  "eq": 11,   // = and :=
+  "MAX": 12, // <the void outside every expression>
 )
 
 // Mutual recursion through a trick from https://github.com/typst/typst/issues/744
@@ -97,6 +104,13 @@
     "not": (pp, rec, e) => cwrap(rec(PREC.not, 1) + ` - ` + rec(PREC.not, e.at(1)), pp < PREC.not),
     "+": (pp, rec, e) => cwrap(e.slice(1).map(rec.with(PREC.add)).join(` + `), pp < PREC.add),
     "sum": (pp, rec, e) => assert(false, message: "sum is unsupported in code."),
+    "mod": (pp, rec, e) => {
+      assert(e.len() == 3 and type(e.at(2)) == int, message: "Invalid mod expr: " + repr(e))
+      cwrap(
+        rec(PREC.mod, e.at(1)) + ` % ` + rec(PREC.mod, e.at(2)), 
+        pp <= PREC.mod
+      ) 
+    },
     "*": (pp, rec, e) => {
       if e.len() == 3 and type(e.at(1)) == int and type(e.at(2)) == str and e.at(2).len() == 1 {
         // multiplication of a constant with one-letter variable. 
@@ -165,6 +179,13 @@
         pp <= PREC.sub
       )
     },
+    "mod": (pp, rec, e) => {
+      assert(e.len() == 3 and type(e.at(2)) == int, message: "Invalid mod expr: " + repr(e))
+      mwrap(
+        $#rec(PREC.mod, e.at(1)) mod #rec(PREC.mod, e.at(2))$, 
+        pp <= PREC.mod
+      )
+    },
     "*": (pp, rec, e) => {
       if e.len() == 3 and type(e.at(1)) == int and type(e.at(2)) == str and e.at(2).len() == 1 {
         // multiplication of a constant with one-letter variable. 
diff --git a/spec/keccak.typ b/spec/keccak.typ
new file mode 100644
index 000000000..e6e16f43f
--- /dev/null
+++ b/spec/keccak.typ
@@ -0,0 +1,122 @@
+#import "/book.typ": book-page
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  compute_nr_interactions,
+  render_chip_assumptions,
+  render_chip_variable_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  render_constraint_table,
+  render_chip_padding_table,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/keccak.toml", config)
+
+#show: book-page(chip.name)
+#let keccak = raw(chip.name)
+
+The #keccak chip applies the keccak permutation $kappa$ to a given memory range;
+other aspects of keccak hashing (such as repeated permutation invocation, 
+input padding and state initialization) fall outside the scope of this accelerator.
+
+This permutation $kappa: FF_2^1600 -> FF_2^1600$ operates on 1600 bits and is composed of 24 applications of round-permutation $Lambda: FF_2^1600 times NN -> FF_2^1600$, where the additional parameter is the round constant.
+$Lambda$ is defined as the composition $iota compose chi compose pi compose rho compose theta$, where only $iota$ depends on the round constant.
+#footnote("More details on the KECCAK permutation: FIPS 202, NIST, " + link("https://csrc.nist.gov/pubs/fips/202/final"))
+
+The keccak accelerator comprises two chips: a core chip that interacts with the memory --- loading the input and writing the output, and a round chip that applies the round permutation.
+
+
+= Core chip
+== Columns
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+#let nr_interactions = compute_nr_interactions(chip)
+
+The #keccak chip is comprised of #nr_variables variables that are expressed using #nr_columns columns and leverages #nr_interactions interaction(s):
+#render_chip_variable_table(chip, config)
+
+== Constraints
+In this VM, we assign syscall number -2 to the #keccak accelerator.
+The chip therefore contributes the following interaction to the lookup-argument:
+#render_constraint_table(chip, config, groups: "output")
+
+The address containing the state to be permuted is passed in as argument `A0 = x10`.
+The following constraints describe that this address is read into `addr` (@keccak:c:read_addr), from which `state_ptr` --- the collection of pointers to all lanes of the state --- is derived (@keccak:c:state_ptr).
+The state is then read into `input_state`, while the `output_state` is written back to the indicated address (@keccak:c:load_store_state).
+#render_constraint_table(chip, config, groups: "mem")
+
+Lastly, the input state is pushed to the Keccak-round function, while the output after 24 rounds is taken off the bus:
+#render_constraint_table(chip, config, groups: "round")
+
+== Padding
+The #keccak table can be padded to the next power of two with the following value assignments:
+#render_chip_padding_table(chip, config)
+
+= Round chip
+#let round_chip = load_chip("src/keccak_round.toml", config)
+#let keccak_rnd = raw(round_chip.name)
+
+== Columns
+#let nr_variables = total_nr_variables(round_chip)
+#let nr_columns = total_nr_instantiated_columns(round_chip, config)
+#let nr_interactions = compute_nr_interactions(round_chip)
+
+The #keccak_rnd chip is comprised of #nr_variables variables that are expressed using #nr_columns columns and leverages #nr_interactions interaction(s):
+#render_chip_variable_table(round_chip, config)
+
+#strong("Note on " + raw("start") + ".")
+`start` contains the state to which the permutation should be applied.
+Its three-dimensional array mimics the specification's three-dimensional state
+#footnote("FIPS 202, NIST, Section 3.1 (" + link("https://csrc.nist.gov/pubs/fips/202/final") + ")")
+and orders the bits as prescribed.
+#footnote("FIPS 202, NIST, Section B.1, Algorithm 10 (" + link("https://csrc.nist.gov/pubs/fips/202/final") + ")")
+
+#strong("Note on " + raw("rnc") + " and " + raw("rbc") + ".")
+Rho rotates every lane by a rotation offset in $[0, 64)$.
+These offsets are identical for every round.
+#footnote("FIPS 202, NIST, page 13, Table 2 (" + link("https://csrc.nist.gov/pubs/fips/202/final") + ")")
+We decompose each offset into three components: the lower nibble (4 bits) is represented by `rnc`, while the upper two bits are represented as `Bit`s in `rbc`.
+That is, $#`rho_offset[x][y]` = #`rnc[x][y]` + 16 dot #`rbc[x][y][0]` + 32 dot #`rbc[x][y][1]`$.
+
+
+== Constraints
+
+The following constraints ensure that `theta` captures the state after applying the first subpermutation of the round-permutation: $theta$.
+Note here that `Cxz_left` and `Cxz_right` do have to be range-checked; it cannot be assumed that this implicitly follows from @keccak:c:Dxz combined with `rotated_Cxz`'s definition.
+#render_constraint_table(round_chip, config, groups: "theta")
+
+Next, we constrain that `rho` captures the state after applying subpermutation $rho$.
+Note here as well that `rot_left` and `rot_right` do have to be range-checked; it cannot be assumed that this implicitly follows from later constraints.
+#render_constraint_table(round_chip, config, groups: "rho")
+
+Observe that the lane-permutation performed by $pi$ is absorbed in `pi`'s definition.
+The next permutation that is constrained is $chi$:
+#render_constraint_table(round_chip, config, groups: "chi")
+
+Lastly, the round constants are added to one of the lanes in the state.
+`iota` contains the updated lane.
+In the definition of `out`, the outputs of `chi` and `iota` are combined to construct the output of the permutation.
+#render_constraint_table(round_chip, config, groups: "iota")
+
+Lastly, the round chip contributes the following interactions to the lookup:
+#render_constraint_table(round_chip, config, groups: "io")
+
+== Notes/potential optimizations
+- one does not have to repeat `addr` in `state_ptr`; this saves 4 columns and 4 `IS_HALF` checks.
+- step $rho$ does not need to be applied to `state[0][0]`; it has a zero shift. This saves 16 columns and 4 `HWSL` interactions.
+- $#`rc[2]` = #`rc[4]` = #`rc[5]` = #`rc[6]` = 0$. As such, those elements need not be stored in `rc`, and need not be XORed into the state in the $iota$-step. This saves 8 columns and 4 `XOR_BYTE` interactions.
+- when executed in large volumes, `KECCAK_RND` could benefit from having a three-way XOR lookup table. With this in place, the 80 interactions in @keccak:c:theta_cxz_start and @keccak:c:theta_cxz could be dropped.
+  Likewise, 80 columns could be removed from the chip (a \~5% savings).
+
+= Round constant lookup
+#let rc_chip = load_chip("src/keccak_rc.toml", config)
+#let keccak_rc = raw(rc_chip.name)
+
+== Columns
+#let nr_variables = total_nr_variables(rc_chip)
+#let nr_columns = total_nr_instantiated_columns(rc_chip, config)
+
+We provide the round constants through a short precomputed lookup table: #keccak_rc.
+#render_chip_variable_table(rc_chip, config)
+#render_constraint_table(rc_chip, config)
\ No newline at end of file
diff --git a/spec/signatures.typ b/spec/signatures.typ
index 12d84f757..2839a74c6 100644
--- a/spec/signatures.typ
+++ b/spec/signatures.typ
@@ -1,5 +1,6 @@
 #import "/book.typ": book-page
 #import "/src.typ": load_signatures, load_config
+#import "/expr.typ": type_to_code
 
 #show: book-page("signatures.typ")
 
@@ -19,21 +20,11 @@
     raw(cond) + ` => `
   } else {``}
 
-  let input_str = sig.input.map(elt => {
-    if type(elt) == array {
-      raw(elt.at(0)) + `[` + raw(str(elt.at(1))) + `]`
-    } else {
-      raw(elt)
-    }
-  }).join(`, `)
+  let input_str = sig.input.map(type_to_code).join(`, `)
 
   let output = sig.at("output", default: none)
   let output_str = if output != none {
-    if type(output) == array {
-      raw(output.at(0)) + `[` + raw(str(output.at(1))) + `]`
-    } else {
-      raw(output)
-    } + `; `
+    type_to_code(output) + `; `
   } else {``}
 
   return [#cond_str#raw(sig.tag)#lb#output_str#input_str#rb]
@@ -44,12 +35,13 @@
   let vars = sig.input + if "output" in sig { (sig.output, )} else {()}
 
   return vars.map(v => {
-    let (label, factor) = if type(v) == array {
-      (v.at(0), v.at(1))
-    } else {
-      (v, 1)
+    let factor = 1
+    while type(v) == array {
+      factor *= v.at(1)
+      v = v.at(0)
     }
-    config.variables.types.filter(type => type.label == label).first().subtypes.len() * factor
+    let lbl = v
+    config.variables.types.filter(type => type.label == lbl).first().subtypes.len() * factor
   })
   .sum()
 }
diff --git a/spec/src.typ b/spec/src.typ
index 6328c4665..d553629ff 100644
--- a/spec/src.typ
+++ b/spec/src.typ
@@ -40,10 +40,11 @@
 
   // Verify that `var` is a valid variable.
   let verify_variable(var) = {
-    if type(var) == array {
-        assert(var.at(0) in var_labels, message: "Invalid var type: " + repr(var))
+    while type(var) == array {
         assert(type(var.at(1)) == int, message: "Invalid var type: " + repr(var))
-    } else if type(var) == str {
+        var = var.at(0)
+    }
+    if type(var) == str {
       assert(var in var_labels, message: "Invalid var type: " + repr(var))
     } else {
       assert(false, message: "Invalid var type: " + repr(var))
@@ -104,14 +105,13 @@
   let all_vars = chip.variables.values().flatten()
   let all_labels = config.variables.types.map(type => type.label);
   for var in all_vars {
-    let type_label = if type(var.type) == array {
-      var.type.at(0)
-    } else {
-      var.type
+    let type_label = var.type
+    while type(type_label) == array {
+      assert(type_label.len() == 2 and type(type_label.at(1)) == int, message: "invalid type: " + repr(var.type))
+      type_label = type_label.at(0)
     }
-
     // Check that all variable types are valid
-    assert(type_label in all_labels, message: "found invalid var type:" + repr(var.type))
+    assert(type_label in all_labels, message: "found invalid var type: " + repr(var.type))
   }
 }
 
diff --git a/spec/src/keccak.toml b/spec/src/keccak.toml
new file mode 100644
index 000000000..b8f2d91c2
--- /dev/null
+++ b/spec/src/keccak.toml
@@ -0,0 +1,102 @@
+name = "KECCAK"
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "timestamp at which the permutation is performed"
+pad = 0
+
+[[variables.input]]
+name = "addr"
+type = "DWordBL"
+desc = "memory address storing the first bit of the state"
+pad = 0
+
+[[variables.input]]
+name = "input_state"
+type = [[["Byte", 8], 5], 5]
+desc = "state at the start of executing the permutation"
+pad = 0
+
+[[variables.output]]
+name = "output_state"
+type = [[["Byte", 8], 5], 5]
+desc = "state after executing the permutation"
+pad = 0
+
+[[variables.auxiliary]]
+name = "state_ptr"
+type = [["DWordHL", 5], 5]
+desc = "memory addresses storing the entire state"
+pad = ["*", 8, ["arr", 
+        ["arr", 0, 1, 2, 3, 4],
+        ["arr", 5, 6, 7, 8, 9],
+        ["arr", 10, 11, 12, 13, 14],
+        ["arr", 15, 16, 17, 18, 19],
+        ["arr", 20, 21, 22, 23, 24]
+]]
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+pad = 0
+
+[[constraint_groups]]
+name = "output"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "ECALL"
+input = ["timestamp", ["arr", ["-", ["^", 2, 32], 1], ["-", ["^", 2, 32], 2]]]
+multiplicity = ["-", "μ"]
+
+[[constraint_groups]]
+name = "mem"
+
+[[constraints.mem]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["cast", ["*", 2, 10], "DWordWL"], "addr", "timestamp", 1, 0, 0]
+output = "addr"
+multiplicity = "μ"
+ref = "keccak:c:read_addr"
+
+[[constraints.mem]]
+kind = "template"
+tag = "ADD"
+input = [["cast", "addr", "DWordWL"], ["cast", ["*", 8, ["+", ["*", 5, "y"], "x"]], "DWordWL"]]
+output = ["cast", ["idx", ["idx", "state_ptr", "x"], "y"], "DWordWL"]
+iters = [["x", 0, 4], ["y", 0, 4]]
+ref = "keccak:c:state_ptr"
+
+[[constraints.mem]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", ["idx", ["idx", "state_ptr", "x"], "y"], "z"]]
+iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 3]]
+multiplicity = "μ"
+
+[[constraints.mem]]
+kind = "interaction"
+tag = "MEMW"
+input = [0, ["cast", ["idx", ["idx", "state_ptr", "x"], "y"], "DWordWL"], ["idx", ["idx", "output_state", "x"], "y"], "timestamp", 0, 0, 1]
+output = ["idx", ["idx", "input_state", "x"], "y"]
+iters = [["x", 0, 4], ["y", 0, 4]]
+multiplicity = "μ"
+ref = "keccak:c:load_store_state"
+
+[[constraint_groups]]
+name = "round"
+
+[[constraints.round]]
+kind = "interaction"
+tag = "KECCAK"
+input = ["timestamp", 0, "input_state"]
+multiplicity = "μ"
+
+[[constraints.round]]
+kind = "interaction"
+tag = "KECCAK"
+input = ["timestamp", 24, "output_state"]
+multiplicity = ["-", "μ"]
diff --git a/spec/src/keccak_rc.toml b/spec/src/keccak_rc.toml
new file mode 100644
index 000000000..7844dfbee
--- /dev/null
+++ b/spec/src/keccak_rc.toml
@@ -0,0 +1,28 @@
+name = "KECCAK_RC"
+
+[[variables.input]]
+name = "round"
+type = "BaseField"
+desc = ""
+precomputed = true
+
+[[variables.input]]
+name = "RC"
+type = ["Byte", 8]
+desc = "round constants for the given `round`"
+precomputed = true
+
+[[variables.multiplicity]]
+name = "μ"
+type = "BaseField"
+desc = ""
+
+[[constraint_groups]]
+name = "contributions"
+
+[[constraints.contributions]]
+kind = "interaction"
+tag = "KECCAK_RC"
+input = ["round"]
+output = "RC"
+multiplicity = ["-", "μ"]
diff --git a/spec/src/keccak_round.toml b/spec/src/keccak_round.toml
new file mode 100644
index 000000000..548cb5151
--- /dev/null
+++ b/spec/src/keccak_round.toml
@@ -0,0 +1,301 @@
+name = "KECCAK_RND"
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "timestamp at which the permutation is performed"
+
+
+[[variables.input]]
+name = "round"
+type = "BaseField"
+desc = "index of the permutation round"
+
+[[variables.input]]
+name = "start"
+type = [[["Byte", 8], 5], 5]
+desc = "state at the start of executing the permutation"
+
+[[variables.auxiliary]]
+name = "Cxz"
+type = [[["Byte", 8], 4], 5]
+desc = "$xor_(i=0)^(y+2) #`start[x,i,z]`$"
+
+[[variables.auxiliary]]
+name = "Cxz_left"
+type = [["Byte", 8], 5]
+desc = "the left-rotated component of `rotated_Cxz`"
+
+[[variables.auxiliary]]
+name = "Cxz_right"
+type = [["Byte", 8], 5]
+desc = "the right-rotated component of `rotated_Cxz`"
+
+[[variables.auxiliary]]
+name = "Dxz"
+type = [["Byte", 8], 5]
+desc = "$#`Cxz[`\\(#`x` - 1) mod 5#`,y,z]` xor #`rotated_Cxz[`\\(#`x` + 1) mod 5#`,y,z]`$"
+
+[[variables.auxiliary]]
+name = "theta"
+type = [[["Byte", 8], 5], 5]
+desc = "$theta(#`start`)$, the state after applying $theta$."
+
+[[variables.auxiliary]]
+name = "rot_left"
+type = [[["Byte", 8], 5], 5]
+desc = "the left-rotated component of $#`theta[x,y]` <<< #`rnc`$"
+
+[[variables.auxiliary]]
+name = "rot_right"
+type = [[["Byte", 8], 5], 5]
+desc = "the right-rotated component of $#`theta[x,y]` <<< #`rnc`$"
+
+[[variables.auxiliary]]
+name = "chi_ANDs"
+type = [[["Byte", 8], 5], 5]
+desc = "$(#`pi[`\\(x+1) mod 5#`,y,z]` xor 255) times.o #`pi[`\\(x + 2) mod 5#`,y,z]`$"
+
+[[variables.auxiliary]]
+name = "chi"
+type = [[["Byte", 8], 5], 5]
+desc = "$(chi compose pi compose rho compose theta)(#`start`)$; the state after applying $chi$"
+
+[[variables.auxiliary]]
+name = "rc"
+type = ["Byte", 8]
+desc = "round constants"
+
+[[variables.auxiliary]]
+name = "iota"
+type = ["Byte", 8]
+desc = "state update following from step $iota$."
+
+[[variables.virtual]]
+name = "rotated_Cxz"
+type = [["Byte", 8], 5]
+desc = "$#`Cxz[x,`3#`,z]` <<< 1$"
+def = {iters=[["x", 0, 4], ["z", 0, 7]], poly=["+", ["idx", ["idx", "Cxz_left", "x"], "z"], ["idx", ["idx", "Cxz_right", "x"], ["mod", ["-", "z", 1], 8]]]}
+
+[[variables.virtual]]
+name = "out"
+type = [[["Byte", 8], 5], 5]
+desc = "state at the end of executing the permutation"
+def = {polys=[
+    {iters=[["x", 0], ["y", 0], ["z", 0, 7]], poly=["idx", "iota","z"]},
+    {iters=[["x", 1, 4], ["y", 0], ["z", 0, 7]], poly=["idx",["idx",["idx","chi","x"],"y"],"z"]},
+    {iters=[["x", 0], ["y", 1, 4], ["z", 0, 7]], poly=["idx",["idx",["idx","chi","x"],"y"],"z"]},
+    {iters=[["x", 1, 4], ["y", 1, 4], ["z", 0, 7]], poly=["idx",["idx",["idx","chi","x"],"y"],"z"]}
+]}
+
+[[variables.virtual]]
+name = "rho"
+type = [[["Byte", 8], 5], 5]
+desc = "$(rho compose theta)(#`start`)$; the state after applying $rho$"
+def = {iters=[["x", 0, 4], ["y", 0, 4], ["z", 0, 7]], poly=[
+    "+", 
+    ["*", 
+        ["not", ["idx", ["idx", ["idx", "rbc", "x"], "y"], 0]],
+        ["not", ["idx", ["idx", ["idx", "rbc", "x"], "y"], 1]],
+        ["+",
+            ["idx", ["idx", ["idx", "rot_left", "x"], "y"], "z"], 
+            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], ["mod", ["-", "z", 1], 8]],
+        ]
+    ],
+    ["*", 
+        ["idx", ["idx", ["idx", "rbc", "x"], "y"], 0],
+        ["not", ["idx", ["idx", ["idx", "rbc", "x"], "y"], 1]],
+        ["+",
+            ["idx", ["idx", ["idx", "rot_left", "x"], "y"], ["mod", ["-", "z", 2], 8]], 
+            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], ["mod", ["-", "z", 3], 8]],
+        ]
+    ],
+    ["*",
+        ["not", ["idx", ["idx", ["idx", "rbc", "x"], "y"], 0]],
+        ["idx", ["idx", ["idx", "rbc", "x"], "y"], 1],
+        ["+",
+            ["idx", ["idx", ["idx", "rot_left", "x"], "y"], ["mod", ["-", "z", 4], 8]], 
+            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], ["mod", ["-", "z", 5], 8]],
+        ]
+    ],
+    ["*", 
+        ["idx", ["idx", ["idx", "rbc", "x"], "y"], 0],
+        ["idx", ["idx", ["idx", "rbc", "x"], "y"], 1],
+        ["+",
+            ["idx", ["idx", ["idx", "rot_left", "x"], "y"], ["mod", ["-", "z", 6], 8]], 
+            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], ["mod", ["-", "z", 7], 8]],
+        ]
+    ],
+]}
+
+[[variables.virtual]]
+name = "pi"
+type = [[["Byte", 8], 5], 5]
+desc = "$(pi compose rho compose theta)(#`start`)$; the state after applying $pi$"
+def = {iters=[["x", 0, 4], ["y", 0, 4], ["z", 0, 7]], poly=["idx", ["idx", ["idx", "rho", ["mod", ["+", "x", ["*", 3, "y"]], 5]], "x"], "z"]}
+
+[[variables.constant]]
+name = "rnc"
+type = [["Byte", 5], 5]
+desc = "lower nibble of `ρ` constants"
+
+[[variables.constant]]
+name = "rbc"
+type = [[["Bit", 2], 5], 5]
+desc = "top two bits of `ρ` constants"
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+
+
+# Assumptions
+
+
+# Constraints
+
+[[constraint_groups]]
+name = "io"
+
+[[constraints.io]]
+kind = "interaction"
+tag = "KECCAK"
+input = ["timestamp", "round", "start"]
+multiplicity = ["-", "μ"]
+
+[[constraints.io]]
+kind = "interaction"
+tag = "KECCAK"
+input = ["timestamp", ["+", "round", 1], "out"]
+multiplicity = "μ"
+
+[[constraints.io]]
+kind = "interaction"
+tag = "KECCAK_RC"
+input = ["round"]
+output = "rc"
+multiplicity = ["-", "μ"]
+
+[[constraint_groups]]
+name = "theta"
+
+[[constraints.theta]]
+kind = "interaction"
+tag = "XOR_BYTE"
+input = [["idx", ["idx", ["idx", "start", "x"], 0], "z"], ["idx", ["idx", ["idx", "start", "x"], 1], "z"]]
+output = ["idx", ["idx", ["idx", "Cxz", "x"], 0], "z"]
+iters = [["x", 0, 4], ["z", 0, 7]]
+multiplicity = "μ"
+ref = "keccak:c:theta_cxz_start"
+
+[[constraints.theta]]
+kind = "interaction"
+tag = "XOR_BYTE"
+input = [["idx", ["idx", ["idx", "Cxz", "x"], ["-", "y", 2]], "z"], ["idx", ["idx", ["idx", "start", "x"], "y"], "z"]]
+output = ["idx", ["idx", ["idx", "Cxz", "x"], ["-", "y", 1]], "z"]
+iters = [["x", 0, 4], ["y", 2, 4], ["z", 0, 7]]
+multiplicity = "μ"
+ref = "keccak:c:theta_cxz"
+
+[[constraints.theta]]
+kind = "interaction"
+tag = "HWSL"
+input = [["idx", ["cast", ["idx", ["idx", "Cxz", "x"], 3], "DWordHL"], "z"], 1]
+output = ["arr", ["idx", ["cast", ["idx", "Cxz_left", "x"], "DWordHL"], "z"], ["idx", ["cast", ["idx", "Cxz_right", "x"], "DWordHL"], "z"]]
+iters = [["x", 0, 4], ["z", 0, 3]]
+multiplicity = "μ"
+
+# Note: these IS_BYTE checks are necessary.
+# Without them, it is possible to prove 0 <<< S evaluates to -1 by setting
+# Cxz_left  = [-1,  256, -1,  256, -1,  256, -1,  256] and
+# Cxz_right = [ 1, -256,  1, -256,  1, -256,  1, -256]
+[[constraints.theta]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = [["idx", ["idx", "Cxz_left", "x"], "z"]]
+iters = [["x", 0, 4], ["z", 0, 7]]
+multiplicity = "μ"
+
+[[constraints.theta]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = [["idx", ["idx", "Cxz_right", "x"], "z"]]
+iters = [["x", 0, 4], ["z", 0, 7]]
+multiplicity = "μ"
+
+[[constraints.theta]]
+kind = "interaction"
+tag = "XOR_BYTE"
+input = [["idx", ["idx", ["idx", "Cxz", ["mod", ["-", "x", 1], 5]], 3], "z"], ["idx", ["idx", "rotated_Cxz", ["mod", ["+", "x", 1], 5]], "z"]]
+output = ["idx", ["idx", "Dxz", "x"], "z"]
+iters = [["x", 0, 4], ["z", 0, 7]]
+multiplicity = "μ"
+ref = "keccak:c:Dxz"
+
+[[constraints.theta]]
+kind = "interaction"
+tag = "XOR_BYTE"
+input = [["idx", ["idx", ["idx", "start", "x"], "y"], "z"], ["idx", ["idx", "Dxz", "x"], "z"]]
+output = ["idx", ["idx", ["idx", "theta", "x"], "y"], "z"]
+iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
+multiplicity = "μ"
+
+[[constraint_groups]]
+name = "rho"
+
+[[constraints.rho]]
+kind = "interaction"
+tag = "HWSL"
+input = [["idx", ["cast", ["idx", ["idx", "theta", "x"], "y"], "DWordHL"], "z"], ["idx", ["idx", "rnc", "x"], "y"]]
+output = ["arr", ["idx", ["cast", ["idx", ["idx", "rot_left", "x"], "y"], "DWordHL"], "z"], ["idx", ["cast", ["idx", ["idx", "rot_right", "x"], "y"], "DWordHL"], "z"]]
+iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 3]]
+multiplicity = "μ"
+
+# Note: these IS_BYTE checks are necessary.
+# Without them, it is possible to prove 0 <<< S evaluates to -1 by setting
+# rot_left  = [-1,  256, -1,  256, -1,  256, -1,  256] and
+# rot_right = [ 1, -256,  1, -256,  1, -256,  1, -256]
+[[constraints.rho]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = [["idx", ["idx", ["idx", "rot_left", "x"], "y"], "z"]]
+iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
+multiplicity = "μ"
+
+[[constraints.rho]]
+kind = "interaction"
+tag = "IS_BYTE"
+input = [["idx", ["idx", ["idx", "rot_right", "x"], "y"], "z"]]
+iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
+multiplicity = "μ"
+
+[[constraint_groups]]
+name = "chi"
+
+[[constraints.chi]]
+kind = "interaction"
+tag = "AND_BYTE"
+input = [["-", 255, ["idx", ["idx", ["idx", "pi", ["mod", ["+", "x", 1], 5]], "y"], "z"]], ["idx",["idx",["idx", "pi", ["mod", ["+", "x", 2], 5]], "y"], "z"]]
+output = ["idx", ["idx", ["idx", "chi_ANDs", "x"], "y"], "z"]
+iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
+multiplicity = "μ"
+
+[[constraints.chi]]
+kind = "interaction"
+tag = "XOR_BYTE"
+input = [["idx", ["idx", ["idx", "pi", "x"], "y"], "z"], ["idx",["idx",["idx", "chi_ANDs", "x"], "y"], "z"]]
+output = ["idx", ["idx", ["idx", "chi", "x"], "y"], "z"]
+iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
+multiplicity = "μ"
+
+[[constraint_groups]]
+name = "iota"
+
+[[constraints.iota]]
+kind = "interaction"
+tag = "XOR_BYTE"
+input = [["idx", ["idx", ["idx", "chi", 0], 0], "z"], ["idx","rc","z"]]
+output = ["idx", "iota", "z"]
+iter = ["z", 0, 7]
+multiplicity = "μ"
diff --git a/spec/src/signatures.toml b/spec/src/signatures.toml
index 33f97cebf..b04b3d561 100644
--- a/spec/src/signatures.toml
+++ b/spec/src/signatures.toml
@@ -211,3 +211,16 @@ tag = "ROTXOR"
 kind = "interaction"
 input = ["Word", "Byte", "Byte", "Byte", "Bit"]
 output = "Word"
+
+# Keccak communication between rounds
+[[signatures]]
+tag = "KECCAK"
+kind = "interaction"
+input = ["DWordWL", "BaseField", [[["Byte", 8], 5], 5]]
+
+# Keccak round constants
+[[signatures]]
+tag = "KECCAK_RC"
+kind = "interaction"
+input = ["BaseField"]
+output = ["Byte", 8]
diff --git a/spec/tooling/chip.py b/spec/tooling/chip.py
index d597d2274..7f7ecca81 100644
--- a/spec/tooling/chip.py
+++ b/spec/tooling/chip.py
@@ -255,6 +255,30 @@ def typecheck(self, env: Environment) -> Type:
         return t
 
 
+@dataclass
+class ModExpr:  # Expression node for `elt mod modulus`.
+    elt: Expr  # operand being reduced
+    modulus: Expr  # must typecheck to a positive constant (enforced in typecheck)
+
+    def typecheck(self, env: Environment) -> Type:  # typechecks both operands, returns the Range of the result
+        elt = self.elt.typecheck(env)
+        modulus = self.modulus.typecheck(env)
+
+        if isinstance(modulus, list) or not modulus.is_const():  # reject list-typed or non-constant modulus
+            reporter.error(f"Invalid non-constant modulus: {self.modulus!r}")
+            return Range.const(0)  # error already reported; fall back to the constant range 0
+        modulus = modulus.get_const()
+        if modulus <= 0:
+            reporter.error(f"Invalid non-positive modulus: {self.modulus!r}")
+            return Range.const(0)  # same fallback as above
+
+        if elt.is_const():  # NOTE(review): elt is not guarded against being a list the way modulus is - confirm list-typed operands cannot reach here
+            elt = elt.get_const()
+            return Range.const(elt % modulus)  # constant operand folds to a constant result
+        else:
+            return Range(0, modulus-1)  # non-constant operand: presumably the inclusive range [0, modulus-1] - verify Range semantics
+
+
 @dataclass
 class PowExpr:
     base: Expr
@@ -343,6 +367,8 @@ def build_expr(config: Optional["Config"], data: object) -> Expr:
             return SubExpr(
                 build_expr(config, head), [build_expr(config, s) for s in subs]
             )
+        case ["mod", elt, modulus]:
+            return ModExpr(build_expr(config, elt), build_expr(config, modulus))
         case ["^", base, exp]:
             return PowExpr(build_expr(config, base), build_expr(config, exp))
         case ["sum", ["=", str(var), start], stop, terms]:

From 0f90155442dd27f64bf3a584b3c53becf894792d Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Fri, 24 Apr 2026 11:41:43 +0200
Subject: [PATCH 094/105] spec: Inline PC memory access into CPU (#501)

* spec: Inline PC memory access into CPU

* Apply suggestions from code review

Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com>
Co-authored-by: Robin Jadoul <robin.jadoul@gmail.com>

* Apply review suggestion

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* Remove `pc_double_read` constraints and clarify why in cpu.typ

* Potential optimization -> subsubsection

* Address review comments

* Clarifying remark on register initialization

---------

Co-authored-by: claude[bot] <209825114+claude[bot]@users.noreply.github.com>
Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/cpu.typ      | 16 +++++++++++++---
 spec/decode.typ   | 14 +++++++-------
 spec/memory.typ   |  2 ++
 spec/src/cpu.toml | 36 +++++++++++++++++++++++++++++++++---
 4 files changed, 55 insertions(+), 13 deletions(-)

diff --git a/spec/cpu.typ b/spec/cpu.typ
index 2fbd60d59..2ee85befa 100644
--- a/spec/cpu.typ
+++ b/spec/cpu.typ
@@ -56,16 +56,26 @@ The ALU functionality is then obtained through judicious dispatching to the corr
 
 #render_constraint_table(chip, config, groups: "alu")
 
-== Memory
+== Memory<cpu:memory>
 
 The interactions with the memory, both for register loading and storing, as for `LOAD` and `STORE` instructions are handled.
 Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs.
-The timestamps are ensured to be disjoint for disjoint memory locations.
+The `pc` register behaves very predictably with respect to its timestamps and when it is being read,
+so for performance reasons, we inline its memory interactions directly into the #cpu chip.
+
+Potentially overlapping memory accesses are ensured to have disjoint timestamps.
 One consequence of that is that `next_pc` is written at `timestamp + 1`
-to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction.
+to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction (see @cpu:c:read_rv1 and @decode:decoding-overview).
+Constraints regarding whether `pc_double_read` corresponds to an `AUIPC` instruction are not necessary,
+as regardless of its value, the old timestamp is guaranteed smaller than the new timestamp,
+and the integrity of the memory argument therefore ensures the correctness of this bit.
 
 #render_constraint_table(chip, config, groups: "mem")
 
+=== Potential optimizations
+
+- `pc_double_read` could be integrated into decoding, so that `AUIPC` could set `read_register1 = 0` and no extra MEMW access for `rv1` is needed at this point.
+
 == System
 
 The interactions with the wider system.
diff --git a/spec/decode.typ b/spec/decode.typ
index bb5d0d5a1..21cd26acb 100644
--- a/spec/decode.typ
+++ b/spec/decode.typ
@@ -39,7 +39,7 @@ Given that `CPU` asserts that `EBREAK = 0` (see @cpu:c:ebreak_traps), using this
 Note moreover that the `pc` is set to $7$.
 This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _$4$_ (i.e., the max `pc`-increment) greater than _$1$_ (i.e., the `pc`-value used in the #link(<cpu-padding-decode-row>)[additional instruction] referred to by `CPU`-padding lines).
 
-= Decoding
+= Decoding<decode:decoding-overview>
 For the purposes of explaining decoding, we decompress #decode's `packed_decode` variable into its constituent variables.
 Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
 
@@ -64,12 +64,6 @@ For the purpose of brevity and readability, the table uses the following rules-o
 
 Further clarification is provided in the notes following the table.
 
-== C-type instructions
-The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size.
-This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by $2$ rather than $4$.
-To indicate an instruction is provided in compressed form, the `c_type` flag is introduced.
-*This flag should be set to $1$ whenever the decoded instruction is provided in compressed form and $0$ otherwise.*
-
 /// Add a reference to one or more notes following this table.
 #let ref_note(..refs) = {
   super("[" + refs.pos().map(r => ref(r)).join(",") + "]")
@@ -152,6 +146,12 @@ To indicate an instruction is provided in compressed form, the `c_type` flag is
 
 #decoding_table(decoding)
 
+== C-type instructions
+The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size.
+The execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by $2$ rather than $4$.
+To indicate an instruction is provided in compressed form, the `c_type` flag is introduced.
+*This flag should be set to $1$ whenever the decoded instruction is provided in compressed form and $0$ otherwise.*
+
 // Construct a note that can be referenced through `lbl`
 #let referenceable_note(lbl, note) = {
   show figure: (it) => align(left, [#it])
diff --git a/spec/memory.typ b/spec/memory.typ
index 876884c85..6a204c1b7 100644
--- a/spec/memory.typ
+++ b/spec/memory.typ
@@ -136,6 +136,8 @@ The initialization will need to correspond to a fixed initial register state for
 as well as the memory loaded from the program binary, zero-initialization of memory elsewhere, and private input provided by the prover.
 The contribution of initialization with static data from the ELF executable and the initial register state to the sum
 can be handled directly by the verifier, ensuring correctness corresponding to the ELF binary being proven.
+To enable the loading of the PC in @cpu:memory, register initialization happens at timestamp 1.
+Register finalization is made possible for the verifier by having a known state from the HALT chip (@halt).
 This leaves only zero-initialization and prover input as prover-side concerns for initialization,
 alongside the finalization of the entire used memory.
 
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index a455b854f..b25138d4d 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -221,6 +221,18 @@ desc = "The value to (maybe) be written back to rvd"
 pad = 0
 
 # Auxiliary
+[[variables.auxiliary]]
+name = "prev_pc_timestamp_borrow"
+type = "Bit"
+desc = "The borrow bit for computing the previous timestamp the PC was accessed"
+pad = 0
+
+[[variables.auxiliary]]
+name = "pc_double_read"
+type = "Bit"
+desc = "Whether the PC is being read as a general purpose register (`rs1`) this cycle"
+pad = 0
+
 [[variables.auxiliary]]
 name = "rv1"
 type = "DWordWHH"
@@ -652,6 +664,7 @@ tag = "MEMW"
 input = [1, ["*", ["cast", 2, "DWordWL"], "rs1"], ["arr", ["idx", ["cast", "rv1", "DWordWL"], 0], ["idx", ["cast", "rv1", "DWordWL"], 1], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 0, "DWordWL"]], 1, 0, 0]
 output = ["arr", ["idx", ["cast", "rv1", "DWordWL"], 0], ["idx", ["cast", "rv1", "DWordWL"], 1], 0, 0, 0, 0, 0, 0]
 multiplicity = "read_register1"
+ref = "cpu:c:read_rv1"
 
 [[constraints.mem]]
 kind = "arith"
@@ -691,12 +704,29 @@ tag = "MEMW"
 input = [0, ["cast", "res", "DWordWL"], ["cast", "arg2", ["Byte", 8]], ["+", "timestamp", ["cast", 1, "DWordWL"]], "memory_2bytes", "memory_4bytes", "memory_8bytes"]
 multiplicity = "STORE"
 
+[[constraints.mem]]
+kind = "template"
+tag = "IS_BIT"
+input = ["pc_double_read"]
+
+[[constraints.mem]]
+kind = "template"
+tag = "IS_BIT"
+input = ["prev_pc_timestamp_borrow"]
+
 [[constraints.mem]]
 kind = "interaction"
-tag = "MEMW"
-input = [1, ["cast", ["*", 2, 255], "DWordWL"], ["arr", ["idx", "next_pc", 0], ["idx", "next_pc", 1], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 1, "DWordWL"]], 1, 0, 0]
-output = ["arr", ["idx", "pc", 0], ["idx", "pc", 1], 0, 0, 0, 0, 0, 0]
+tag = "memory"
+input = [1, ["arr", ["+", ["*", 2, 255], "i"], 0], ["arr", ["+", ["-", ["idx", "timestamp", 0], ["*", 3, ["not", "pc_double_read"]]], ["*", ["^", 2, 32], "prev_pc_timestamp_borrow"]], ["-", ["idx", "timestamp", 1], "prev_pc_timestamp_borrow"]], ["idx", "pc", "i"]]
 multiplicity = ["not", "pad"]
+iter = ["i", 0, 1]
+
+[[constraints.mem]]
+kind = "interaction"
+tag = "memory"
+input = [1, ["arr", ["+", ["*", 2, 255], "i"], 0], ["+", "timestamp", ["cast", 1, "DWordWL"]], ["idx", "next_pc", "i"]]
+multiplicity = ["-", ["not", "pad"]]
+iter = ["i", 0, 1]
 
 
 [[constraint_groups]]

From b58514da83333101a7a7b9003d2d5fc0c3fc10c2 Mon Sep 17 00:00:00 2001
From: Joaquin Carletti <joaquin.carletti@lambdaclass.com>
Date: Fri, 24 Apr 2026 12:32:59 -0300
Subject: [PATCH 095/105] update

---
 docs/spec/about_ecalls.md |  8 +++-
 docs/spec/cpu.md          | 47 ++++++++++++-------
 docs/spec/decode.md       | 10 ++--
 docs/spec/memory.md       | 29 +-----------
 docs/spec/signatures.md   |  6 +--
 docs/spec/spec_full.md    | 98 ++++++++++++++++++---------------------
 6 files changed, 89 insertions(+), 109 deletions(-)

diff --git a/docs/spec/about_ecalls.md b/docs/spec/about_ecalls.md
index a128c5e3a..39a36d91a 100644
--- a/docs/spec/about_ecalls.md
+++ b/docs/spec/about_ecalls.md
@@ -4,4 +4,10 @@ ECALLs provide system-level functionalities to the guest program.
 
 When `ECALL` is executed, it is assumed that: - register `A7` contains the system call number
 
-- the arguments are located in registers `A0`-`A6`, and - the return value is written to `A0`, where `A0`-`A7` are symbolic names for the registers `x10`-`x17`
\ No newline at end of file
+- the arguments are located in registers `A0`-`A6`, and - the return value is written to `A0`, where `A0`-`A7` are symbolic names for the registers `x10`-`x17`
+
+## ECALL number overview
+
+We provide a list of supported ECALL numbers. Negative numbers (represented as two's-complement 64-bit numbers) are used for our own custom accelerators/extensions.
+
+/ 64: `write` ([commit]) / 93: `exit` ([halt]) / -1: `SHA256` ([sha256]) / -2: `KECCAK` ([keccak])
\ No newline at end of file
diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index 0383d28ef..3c00a3091 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -55,6 +55,8 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 
 | Name | Type | Description |
 |------|------|-------------|
+| `prev_pc_timestamp_borrow` | `Bit` | The borrow bit for computing the previous timestamp the PC was accessed |
+| `pc_double_read` | `Bit` | Whether the PC is being read as a general purpose register (`rs1`) this cycle |
 | `rv1` | `DWordWHH` | The value of register `rs1` |
 | `rv2` | `DWordWHH` | The value of register `rs2` |
 | `rv1_ext_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr` |
@@ -162,9 +164,11 @@ The ALU functionality is then obtained through judicious dispatching to the corr
 | `CPU-CA45` |  | `MUL[res::DWordWL; arg1::DWordHL, signed, arg2::DWordHL, mp_selector, muldiv_selector]` | MUL |
 | `CPU-CA46` |  | `DVRM[res::DWordWL; arg1::DWordHL, arg2::DWordHL, signed, muldiv_selector]` | DIVREM |
 
-### Memory
+### Memory<cpu:memory>
 
-The interactions with the memory, both for register loading and storing, as for `LOAD` and `STORE` instructions are handled. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The timestamps are ensured to be disjoint for disjoint memory locations. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction.
+The interactions with the memory, both for register loading and storing, as for `LOAD` and `STORE` instructions are handled. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The `pc` register behaves very predictably with respect to its timestamps and when it is being read, so for performance reasons, we inline its memory interactions directly into the `CPU` chip.
+
+Potentially overlapping memory accesses are ensured to have disjoint timestamps. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction (see [cpu:c:read_rv1] and [decode:decoding-overview]). Constraints regarding whether `pc_double_read` corresponds to an `AUIPC` instruction are not necessary, as regardless of its value, the old timestamp is guaranteed smaller than the new timestamp, and the integrity of the memory argument therefore ensures the correctness of this bit.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
@@ -177,7 +181,14 @@ The interactions with the memory, both for register loading and storing, as for
 | `CPU-CM51` |  | `MEMW[1, 2::DWordWL * rd, ['arr', ['idx', 'rvd', 0], ['idx', 'rvd', 1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
 | `CPU-CM52` |  | `LOAD[rvd; res::DWordWL, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
 | `CPU-CM53` |  | `MEMW[0, res::DWordWL, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
-| `CPU-CM54` |  | `MEMW[['arr', ['idx', 'pc', 0], ['idx', 'pc', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 255)::DWordWL, ['arr', ['idx', 'next_pc', 0], ['idx', 'next_pc', 1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | 1 - pad |
+| `CPU-CM54` |  | `IS_BIT<pc_double_read>` |  |
+| `CPU-CM55` |  | `IS_BIT<prev_pc_timestamp_borrow>` |  |
+| `CPU-CM56.i` | i ∈ [0, 1] | `memory[1, ['arr', ['+', ['*', 2, 255], 'i'], 0], ['arr', ['+', ['-', ['idx', 'timestamp', 0], ['*', 3, ['not', 'pc_double_read']]], ['*', ['^', 2, 32], 'prev_pc_timestamp_borrow']], ['-', ['idx', 'timestamp', 1], 'prev_pc_timestamp_borrow']], pc[i]]` | 1 - pad |
+| `CPU-CM57.i` | i ∈ [0, 1] | `memory[1, ['arr', ['+', ['*', 2, 255], 'i'], 0], timestamp + 1::DWordWL, next_pc[i]]` | -(1 - pad) |
+
+#### Potential optimizations
+
+- `pc_double_read` could be integrated into decoding, so that `AUIPC` could set `read_register1 = 0` and no extra MEMW access for `rv1` is needed at this point.
 
 ### System
 
@@ -185,9 +196,9 @@ The interactions with the wider system.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CS55` | `!EBREAK` |  |
+| `CPU-CS58` | `!EBREAK` |  |
 | | _polynomial:_ `1 - EBREAK = 0` | |
-| `CPU-CS56` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
+| `CPU-CS59` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
 
 ### Input and output to the ALU
 
@@ -195,20 +206,20 @@ We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, includ
 
 | Tag | Description |
 |-----|-------------|
-| `CPU-CE57` | `SIGN<rv1_ext_bit; rv1[1], word_instr>` |
-| `CPU-CE58` | `arg1[:4]` = `rv1[:2]` |
+| `CPU-CE60` | `SIGN<rv1_ext_bit; rv1[1], word_instr>` |
+| `CPU-CE61` | `arg1[:4]` = `rv1[:2]` |
 | | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` |
-| `CPU-CE59` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_ext_bit` dot `signed` |
+| `CPU-CE62` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_ext_bit` dot `signed` |
 | | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_ext_bit * (2^32 - 1) = 0` |
-| `CPU-CE60` | `SIGN<rv2_ext_bit; rv2[1], word_instr>` |
-| `CPU-CE61` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |
+| `CPU-CE63` | `SIGN<rv2_ext_bit; rv2[1], word_instr>` |
+| `CPU-CE64` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |
 | | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` |
-| `CPU-CE62` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `rv2_ext_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |
+| `CPU-CE65` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `rv2_ext_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |
 | | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * rv2_ext_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` |
-| `CPU-CE63` | `SIGN<res_ext_bit; (res::DWordHL)[1], word_instr>` |
-| `CPU-CE64` | `!LOAD` => `rvd[0]` = `res[:4]` |
+| `CPU-CE66` | `SIGN<res_ext_bit; (res::DWordHL)[1], word_instr>` |
+| `CPU-CE67` | `!LOAD` => `rvd[0]` = `res[:4]` |
 | | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` |
-| `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_ext_bit` dot (2^(32) - 1) |
+| `CPU-CE68` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_ext_bit` dot (2^(32) - 1) |
 | | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_ext_bit * (2^32 - 1)) = 0` |
 
 ### Other constraints
@@ -217,11 +228,11 @@ For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference betw
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CO66` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
-| `CPU-CO67` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
+| `CPU-CO69` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
+| `CPU-CO70` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
 | | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| `CPU-CO68` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
-| `CPU-CO69` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
+| `CPU-CO71` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
+| `CPU-CO72` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
 
 > **Note:** Document the choice to not have a multiplicity column here for padding
 
diff --git a/docs/spec/decode.md b/docs/spec/decode.md
index 83b51cba1..afadc0465 100644
--- a/docs/spec/decode.md
+++ b/docs/spec/decode.md
@@ -26,7 +26,7 @@ The  table must be padded to a length that is a power of two. Empty rows with th
 
 Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see [cpu:c:ebreak_traps]), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
 
-## Decoding
+## Decoding<decode:decoding-overview>
 
 For the purposes of explaining decoding, we decompress 's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
 
@@ -79,10 +79,6 @@ For the purpose of brevity and readability, the table uses the following rules-o
 
 Further clarification is provided in the notes following the table.
 
-### C-type instructions
-
-The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
-
 /// Add a reference to one or more notes following this table.
 
 super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
@@ -93,6 +89,10 @@ figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset:
 
 // OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
 
+### C-type instructions
+
+The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
+
 // Construct a note that can be referenced through `lbl`
 
 show figure: (it) => align(left, []) [ ] }
diff --git a/docs/spec/memory.md b/docs/spec/memory.md
index b623c3414..ae02f69f0 100644
--- a/docs/spec/memory.md
+++ b/docs/spec/memory.md
@@ -42,7 +42,7 @@ To ensure temporal integrity, every memory operation needs to be constrained for
 
 Because the LogUp argument handling token consumption and emission needs to be fully balanced --- every token emitted should be consumed, and vice versa --- we need to have a system to emit the initial tokens and consume the final tokens. This needs to ensure that every address has at most a single initializing emission, and at most one finalizing consumption. Having at most one initialization will, through the correctness of the lookup argument, immediately lead to having at most one correct finalization, and vice versa.
 
-The initialization will need to correspond to a fixed initial register state for the VM, as well as the memory loaded from the program binary, zero-initialization of memory elsewhere, and private input provided by the prover. The contribution of initialization with static data from the ELF executable and the initial register state to the sum can be handled directly by the verifier, ensuring correctness corresponding to the ELF binary being proven. This leaves only zero-initialization and prover input as prover-side concerns for initialization, alongside the finalization of the entire used memory.
+The initialization will need to correspond to a fixed initial register state for the VM, as well as the memory loaded from the program binary, zero-initialization of memory elsewhere, and private input provided by the prover. The contribution of initialization with static data from the ELF executable and the initial register state to the sum can be handled directly by the verifier, ensuring correctness corresponding to the ELF binary being proven. To enable the loading of the PC in [cpu:memory], register initialization happens at timestamp 1. Register finalization is made possible for the verifier by having a known state from the HALT chip ([halt]). This leaves only zero-initialization and prover input as prover-side concerns for initialization, alongside the finalization of the entire used memory.
 
 For our chosen scheme (which we refer to as "paged initialization/finalization"), the available memory range is split into equally (power-of-two) sized "pages". Each address can then be represented as `address = page_base_address + page_offset`, with `page_base_address` being "page-aligned", and `page_offset` belonging to a limited range (the page size). As such, initialization or finalization of a page is represented by a table with columns `page`, `offset`, `value`, and ---for finalization--- `timestamp`. The `page` column is a preprocessed, constant value (which can be entirely virtualized/inlined into the constraints for this table), and the `offset` column is a preprocessed column containing its row index. Depending on the type of initialization, `value` can be a prover-committed column (input data), or a precomputed, constant column containing `0` (free memory space). This table then feeds into the LogUp system in the normal way, emitting the initial tokens for all addresses in a page, without consuming any tokens. Since the `offset` column is always the same, it can be reused across all paged initialization and finalization tables.
 
@@ -58,33 +58,6 @@ We present here a set of constraints on the `PAGE` table that
 
 For zero-initialized pages, `init` can be a constant `0`, and hence doesn't need a column, nor a range check.
 
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `offset` | `RowIndex` | The offset from the page base address. |
-| `init` | `Byte` | The initial value of this address. Can be replaced by a constant zero for zero-initialization |
-| `fini` | `Byte` | The final value this address took |
-| `timestamp` | `DWordWL` | The timestamp at which this address was last accessed |
-
-### Virtual
-
-| Name | Type | Description |
-|------|------|-------------|
-| `address` | `DWordWL` | Adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table. |
-
-**Definition of `address`:**
-```
-address := page + offset * 1::DWordWL
-```
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `PAGE-C1` | `IS_BYTE[init]` | 1 |
-| `PAGE-C2` | `IS_BYTE[fini]` | 1 |
-| `PAGE-C3` | `memory[0, address, 0::DWordWL, init]` | -1 |
-| `PAGE-C4` | `memory[0, address, timestamp, fini]` | 1 |
-
 We identify a few alternatives that would achieve the desired initialization/finalization functionalities, and consider their respective trade-offs.
 
 _"Free-zero" initialization_
diff --git a/docs/spec/signatures.md b/docs/spec/signatures.md
index fd6e45e63..e44e3f615 100644
--- a/docs/spec/signatures.md
+++ b/docs/spec/signatures.md
@@ -6,9 +6,9 @@ let (lb, rb) = if sig.kind == "interaction" { (`[`, `]`) } else if sig.kind == "
 
 let cond = sig.at("cond", default: none) let cond_str = if cond != none { raw(cond) + ` => ` } else {``}
 
-let input_str = sig.input.map(elt => { if type(elt) == array { raw(elt.at(0)) + `[` + raw(str(elt.at(1))) + `]` } else { raw(elt) } }).join(`, `)
+let input_str = sig.input.map(type_to_code).join(`, `)
 
-let output = sig.at("output", default: none) let output_str = if output != none { if type(output) == array { raw(output.at(0)) + `[` + raw(str(output.at(1))) + `]` } else { raw(output) } + `; ` } else {``}
+let output = sig.at("output", default: none) let output_str = if output != none { type_to_code(output) + `; ` } else {``}
 
 return [] }
 
@@ -16,7 +16,7 @@ return [] }
 
 let vars = sig.input + if "output" in sig { (sig.output, )} else {()}
 
-return vars.map(v => { let (label, factor) = if type(v) == array { (v.at(0), v.at(1)) } else { (v, 1) } config.variables.types.filter(type => type.label == label).first().subtypes.len() * factor }) .sum() }
+return vars.map(v => { let factor = 1 while type(v) == array { factor *= v.at(1) v = v.at(0) } let lbl = v config.variables.types.filter(type => type.label == lbl).first().subtypes.len() * factor }) .sum() }
 
 The following lists signatures of the .len() interactions in this VM.
 
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index 76c1189e1..f42aec0ca 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -128,7 +128,7 @@ To ensure temporal integrity, every memory operation needs to be constrained for
 
 Because the LogUp argument handling token consumption and emission needs to be fully balanced --- every token emitted should be consumed, and vice versa --- we need to have a system to emit the initial tokens and consume the final tokens. This needs to ensure that every address has at most a single initializing emission, and at most one finalizing consumption. Having at most one initialization will, through the correctness of the lookup argument, immediately lead to having at most one correct finalization, and vice versa.
 
-The initialization will need to correspond to a fixed initial register state for the VM, as well as the memory loaded from the program binary, zero-initialization of memory elsewhere, and private input provided by the prover. The contribution of initialization with static data from the ELF executable and the initial register state to the sum can be handled directly by the verifier, ensuring correctness corresponding to the ELF binary being proven. This leaves only zero-initialization and prover input as prover-side concerns for initialization, alongside the finalization of the entire used memory.
+The initialization will need to correspond to a fixed initial register state for the VM, as well as the memory loaded from the program binary, zero-initialization of memory elsewhere, and private input provided by the prover. The contribution of initialization with static data from the ELF executable and the initial register state to the sum can be handled directly by the verifier, ensuring correctness corresponding to the ELF binary being proven. To enable the loading of the PC in [cpu]:memory, register initialization happens at timestamp 1. Register finalization is made possible for the verifier by having a known state from the HALT chip ([halt]). This leaves only zero-initialization and prover input as prover-side concerns for initialization, alongside the finalization of the entire used memory.
 
 For our chosen scheme (which we refer to as "paged initialization/finalization"), the available memory range is split into equally (power-of-two) sized "pages". Each address can then be represented as `address = page_base_address + page_offset`, with `page_base_address` being "page-aligned", and `page_offset` belonging to a limited range (the page size). As such, initialization or finalization of a page is represented by a table with columns `page`, `offset`, `value`, and ---for finalization--- `timestamp`. The `page` column is a preprocessed, constant value (which can be entirely virtualized/inlined into the constraints for this table), and the `offset` column is a preprocessed column containing its row index. Depending on the type of initialization, `value` can be a prover-committed column (input data), or a precomputed, constant column containing `0` (free memory space). This table then feeds into the LogUp system in the normal way, emitting the initial tokens for all addresses in a page, without consuming any tokens. Since the `offset` column is always the same, it can be reused across all paged initialization and finalization tables.
 
@@ -144,33 +144,6 @@ We present here a set of constraints on the `PAGE` table that
 
 For zero-initialized pages, `init` can be a constant `0`, and hence doesn't need a column, nor a range check.
 
-### Input
-
-| Name | Type | Description |
-|------|------|-------------|
-| `offset` | `RowIndex` | The offset from the page base address. |
-| `init` | `Byte` | The initial value of this address. Can be replaced by a constant zero for zero-initialization |
-| `fini` | `Byte` | The final value this address took |
-| `timestamp` | `DWordWL` | The timestamp at which this address was last accessed |
-
-### Virtual
-
-| Name | Type | Description |
-|------|------|-------------|
-| `address` | `DWordWL` | Adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table. |
-
-**Definition of `address`:**
-```
-address := page + offset * 1::DWordWL
-```
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `PAGE-C1` | `IS_BYTE[init]` | 1 |
-| `PAGE-C2` | `IS_BYTE[fini]` | 1 |
-| `PAGE-C3` | `memory[0, address, 0::DWordWL, init]` | -1 |
-| `PAGE-C4` | `memory[0, address, timestamp, fini]` | 1 |
-
 We identify a few alternatives that would achieve the desired initialization/finalization functionalities, and consider their respective trade-offs.
 
 _"Free-zero" initialization_
@@ -211,9 +184,9 @@ let (lb, rb) = if sig.kind == "interaction" { (`[`, `]`) } else if sig.kind == "
 
 let cond = sig.at("cond", default: none) let cond_str = if cond != none { raw(cond) + ` => ` } else {``}
 
-let input_str = sig.input.map(elt => { if type(elt) == array { raw(elt.at(0)) + `[` + raw(str(elt.at(1))) + `]` } else { raw(elt) } }).join(`, `)
+let input_str = sig.input.map(type_to_code).join(`, `)
 
-let output = sig.at("output", default: none) let output_str = if output != none { if type(output) == array { raw(output.at(0)) + `[` + raw(str(output.at(1))) + `]` } else { raw(output) } + `; ` } else {``}
+let output = sig.at("output", default: none) let output_str = if output != none { type_to_code(output) + `; ` } else {``}
 
 return [] }
 
@@ -221,7 +194,7 @@ return [] }
 
 let vars = sig.input + if "output" in sig { (sig.output, )} else {()}
 
-return vars.map(v => { let (label, factor) = if type(v) == array { (v.at(0), v.at(1)) } else { (v, 1) } config.variables.types.filter(type => type.label == label).first().subtypes.len() * factor }) .sum() }
+return vars.map(v => { let factor = 1 while type(v) == array { factor *= v.at(1) v = v.at(0) } let lbl = v config.variables.types.filter(type => type.label == lbl).first().subtypes.len() * factor }) .sum() }
 
 The following lists signatures of the .len() interactions in this VM.
 
@@ -808,7 +781,7 @@ The  table must be padded to a length that is a power of two. Empty rows with th
 
 Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see [cpu:c:ebreak_traps]), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
 
-## Decoding
+## Decoding<decode:decoding-overview>
 
 For the purposes of explaining decoding, we decompress 's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
 
@@ -861,10 +834,6 @@ For the purpose of brevity and readability, the table uses the following rules-o
 
 Further clarification is provided in the notes following the table.
 
-### C-type instructions
-
-The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
-
 /// Add a reference to one or more notes following this table.
 
 super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
@@ -875,6 +844,10 @@ figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset:
 
 // OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
 
+### C-type instructions
+
+The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
+
 // Construct a note that can be referenced through `lbl`
 
 show figure: (it) => align(left, []) [ ] }
@@ -950,6 +923,8 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 
 | Name | Type | Description |
 |------|------|-------------|
+| `prev_pc_timestamp_borrow` | `Bit` | The borrow bit for computing the previous timestamp the PC was accessed |
+| `pc_double_read` | `Bit` | Whether the PC is being read as a general purpose register (`rs1`) this cycle |
 | `rv1` | `DWordWHH` | The value of register `rs1` |
 | `rv2` | `DWordWHH` | The value of register `rs2` |
 | `rv1_ext_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr` |
@@ -1057,9 +1032,11 @@ The ALU functionality is then obtained through judicious dispatching to the corr
 | `CPU-CA45` |  | `MUL[res::DWordWL; arg1::DWordHL, signed, arg2::DWordHL, mp_selector, muldiv_selector]` | MUL |
 | `CPU-CA46` |  | `DVRM[res::DWordWL; arg1::DWordHL, arg2::DWordHL, signed, muldiv_selector]` | DIVREM |
 
-### Memory
+### Memory<cpu:memory>
+
+This section handles the interactions with memory, both for register loading and storing and for `LOAD` and `STORE` instructions. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The `pc` register behaves very predictably with respect to its timestamps and when it is being read, so for performance reasons, we inline its memory interactions directly into the `CPU` chip.
 
-The interactions with the memory, both for register loading and storing, as for `LOAD` and `STORE` instructions are handled. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The timestamps are ensured to be disjoint for disjoint memory locations. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction.
+Potentially overlapping memory accesses are ensured to have disjoint timestamps. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction (see [cpu:c:read_rv1] and [decode]:decoding-overview). Constraints regarding whether `pc_double_read` corresponds to an `AUIPC` instruction are not necessary, as regardless of its value, the old timestamp is guaranteed smaller than the new timestamp, and the integrity of the memory argument therefore ensures the correctness of this bit.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
@@ -1072,7 +1049,14 @@ The interactions with the memory, both for register loading and storing, as for
 | `CPU-CM51` |  | `MEMW[1, 2::DWordWL * rd, ['arr', ['idx', 'rvd', 0], ['idx', 'rvd', 1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
 | `CPU-CM52` |  | `LOAD[rvd; res::DWordWL, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
 | `CPU-CM53` |  | `MEMW[0, res::DWordWL, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
-| `CPU-CM54` |  | `MEMW[['arr', ['idx', 'pc', 0], ['idx', 'pc', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 255)::DWordWL, ['arr', ['idx', 'next_pc', 0], ['idx', 'next_pc', 1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | 1 - pad |
+| `CPU-CM54` |  | `IS_BIT<pc_double_read>` |  |
+| `CPU-CM55` |  | `IS_BIT<prev_pc_timestamp_borrow>` |  |
+| `CPU-CM56.i` | i ∈ [0, 1] | `memory[1, ['arr', ['+', ['*', 2, 255], 'i'], 0], ['arr', ['+', ['-', ['idx', 'timestamp', 0], ['*', 3, ['not', 'pc_double_read']]], ['*', ['^', 2, 32], 'prev_pc_timestamp_borrow']], ['-', ['idx', 'timestamp', 1], 'prev_pc_timestamp_borrow']], pc[i]]` | 1 - pad |
+| `CPU-CM57.i` | i ∈ [0, 1] | `memory[1, ['arr', ['+', ['*', 2, 255], 'i'], 0], timestamp + 1::DWordWL, next_pc[i]]` | -(1 - pad) |
+
+#### Potential optimizations
+
+- `pc_double_read` could be integrated into decoding, so that `AUIPC` could set `read_register1 = 0` and no extra MEMW access for `rv1` is needed at this point.
 
 ### System
 
@@ -1080,9 +1064,9 @@ The interactions with the wider system.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CS55` | `!EBREAK` |  |
+| `CPU-CS58` | `!EBREAK` |  |
 | | _polynomial:_ `1 - EBREAK = 0` | |
-| `CPU-CS56` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
+| `CPU-CS59` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
 
 ### Input and output to the ALU
 
@@ -1090,20 +1074,20 @@ We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, includ
 
 | Tag | Description |
 |-----|-------------|
-| `CPU-CE57` | `SIGN<rv1_ext_bit; rv1[1], word_instr>` |
-| `CPU-CE58` | `arg1[:4]` = `rv1[:2]` |
+| `CPU-CE60` | `SIGN<rv1_ext_bit; rv1[1], word_instr>` |
+| `CPU-CE61` | `arg1[:4]` = `rv1[:2]` |
 | | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` |
-| `CPU-CE59` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_ext_bit` dot `signed` |
+| `CPU-CE62` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_ext_bit` dot `signed` |
 | | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_ext_bit * (2^32 - 1) = 0` |
-| `CPU-CE60` | `SIGN<rv2_ext_bit; rv2[1], word_instr>` |
-| `CPU-CE61` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |
+| `CPU-CE63` | `SIGN<rv2_ext_bit; rv2[1], word_instr>` |
+| `CPU-CE64` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |
 | | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` |
-| `CPU-CE62` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `rv2_ext_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |
+| `CPU-CE65` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `rv2_ext_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |
 | | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * rv2_ext_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` |
-| `CPU-CE63` | `SIGN<res_ext_bit; (res::DWordHL)[1], word_instr>` |
-| `CPU-CE64` | `!LOAD` => `rvd[0]` = `res[:4]` |
+| `CPU-CE66` | `SIGN<res_ext_bit; (res::DWordHL)[1], word_instr>` |
+| `CPU-CE67` | `!LOAD` => `rvd[0]` = `res[:4]` |
 | | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` |
-| `CPU-CE65` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_ext_bit` dot (2^(32) - 1) |
+| `CPU-CE68` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_ext_bit` dot (2^(32) - 1) |
 | | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_ext_bit * (2^32 - 1)) = 0` |
 
 ### Other constraints
@@ -1112,11 +1096,11 @@ For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference betw
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CO66` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
-| `CPU-CO67` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
+| `CPU-CO69` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
+| `CPU-CO70` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
 | | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| `CPU-CO68` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
-| `CPU-CO69` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
+| `CPU-CO71` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
+| `CPU-CO72` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
 
 > **Note:** Document the choice to not have a multiplicity column here for padding
 
@@ -2039,6 +2023,12 @@ When `ECALL` is executed, it is assumed that: - register `A7` contains the syste
 
 - the arguments are located in registers `A0`-`A6`, and - the return value is written to `A0`, where `A0`-`A7` are symbolic names for the registers `x10`-`x17`
 
+## ECALL number overview
+
+We provide a list of supported ECALL numbers. Negative numbers (represented as two's complement 64-bit numbers) are used for our own custom accelerators/extensions.
+
+/ 64: `write` ([commit]) / 93: `exit` ([halt]) / -1: `SHA256` ([sha256]) / -2: `KECCAK` ([keccak])
+
 ---
 
 # HALT Chip

From 6363f3e0c2594bc26611efdfd049f88df7d8bae2 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 28 Apr 2026 16:56:00 +0200
Subject: [PATCH 096/105] spec: shiroa fixes (#533)

* Fix shiroa build

- Strip raw blocks from chapter titles for `project` argument
- Add an explicit description (based on chapter title) to chapters
  to avoid compilation issues when context appears early in the chapter
- Export interaction counts from the pdf version to use in shiroa,
  since otherwise we run into convergence issues that are hard to debug

* cd into script directory and harden against stale interaction counts

* Update spec/book.typ

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* Fall back to single-threaded shiroa when insufficient memory

---------

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/.gitignore      |  1 +
 spec/book.typ        | 13 +++++++++----
 spec/build_shiroa.sh | 27 +++++++++++++++++++++++++++
 spec/chip.typ        | 11 ++++++++---
 4 files changed, 45 insertions(+), 7 deletions(-)
 create mode 100755 spec/build_shiroa.sh

diff --git a/spec/.gitignore b/spec/.gitignore
index 73218d5ba..b5ca9ae62 100644
--- a/spec/.gitignore
+++ b/spec/.gitignore
@@ -1,2 +1,3 @@
 dist/*
+interaction_count.json
 ebook.pdf
diff --git a/spec/book.typ b/spec/book.typ
index 3b65642f9..5bf498b49 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -107,6 +107,7 @@
 
 // Invisibly include another chapter, so that its labels can be resolved
 #let xref-include(f) = {
+  show ref: none
   context {
     place(hide(box(width: auto, height: 0%, strip-all(include "/" + f))))
   }
@@ -171,13 +172,17 @@
         }
       })
       let cond() = _toplevel.final() == file
-      project.with(..args, title: context meta_sections.find(x => x.at(0) == _toplevel.final()).at(1), cond: cond)([
-        #show ref: it => context if _toplevel.final() == file {
-          xref(it)
-        }
+      show ref: it => context if cond() { xref(it) }
+      let title = context {
+        // Strip raw, because shiroa already makes the title raw
+        show raw: it => it.text
+        meta_sections.find(x => x.at(0) == _toplevel.final()).at(1)
+      }
+      project.with(..args, title: title, description: plain-text(meta_sections.find(x => x.at(0) == file).at(1)), cond: cond)([
         #context _xref-included.final().pairs().map(((key, value)) => context if value and cond() {
           xref-include(key)
         }).join()
+        #metadata(json("interaction_count.json").sum(default: (:)))<interaction_count>
         #body
       ])
     }
diff --git a/spec/build_shiroa.sh b/spec/build_shiroa.sh
new file mode 100755
index 000000000..55a3c8d24
--- /dev/null
+++ b/spec/build_shiroa.sh
@@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Run from the script directory so the relative paths below resolve
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+# Remove a stale counts file left behind by an earlier run
+rm -f interaction_count.json
+
+# Always clean up after ourselves, on success as well as on failure
+trap 'rm -f interaction_count.json' EXIT
+
+# Query the ebook version for the interaction counts
+typst query ebook.typ '<interaction_count>' --field value > interaction_count.json
+
+# Build in parallel only when at least 20GiB is available. If MemAvailable cannot
+# be read (e.g. no /proc/meminfo), count it as 0 rather than aborting under `set -e`.
+available_kb=$(awk '/MemAvailable/ { print $2 }' /proc/meminfo 2>/dev/null || true)
+required_kb=$((20 * 1024 * 1024))
+if [ "${available_kb:-0}" -lt "$required_kb" ]; then
+  echo "Falling back to single-thread"
+  export RAYON_NUM_THREADS=1
+fi
+
+# And build
+shiroa build
diff --git a/spec/chip.typ b/spec/chip.typ
index c6cce5073..1c89dcc55 100644
--- a/spec/chip.typ
+++ b/spec/chip.typ
@@ -51,14 +51,19 @@
 // store it as metadata under the `<interaction_count>` label
 // with tag `chip.name`. This tag is overwritten by `name` when specified.
 #let set_nr_interactions(chip, name: none) = {
+  // Skip when building shiroa, since the web/chapter structure fails to converge properly
+  import "book.typ": is-shiroa
+  if is-shiroa {
+    return
+  }
   if name == none {
-      name = chip.name
-    }
+    name = chip.name
+  }
 
   let constraints = chip
     .constraints
     .values()
-    .flatten()
+    .sum(default: ())
 
   // nr. of direct interactions
   let nr-direct-interactions = constraints

From b6478d6d218a05b780e8142f7bb884f4d801a5a5 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 28 Apr 2026 17:01:58 +0200
Subject: [PATCH 097/105] spec: Enable heading numbering for section references
 in shiroa (#534)

* spec: Enable heading numbering for section references in shiroa

* Remove explicit heading numbering from logup chapter
---
 spec/book.typ  | 7 ++++++-
 spec/logup.typ | 1 -
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/spec/book.typ b/spec/book.typ
index 5bf498b49..57363ee23 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -145,7 +145,7 @@
         } else {
           rf.supplement
         }
-        [#supplement#numbering(fig.numbering, ..counter.at(lbl))]
+        [#supplement #numbering(fig.numbering, ..counter.at(lbl))]
       }
       cross-link("/" + ch, reference: shiroa-label, link-content)
     }
@@ -183,6 +183,11 @@
           xref-include(key)
         }).join()
         #metadata(json("interaction_count.json").sum(default: (:)))<interaction_count>
+
+        #let chapter-index = meta_sections.position(x => x.at(0) == file) + 1
+        #set heading(numbering: (..args) => [#chapter-index.#numbering("1.1", ..args)])
+        #counter(heading).update(0)
+
         #body
       ])
     }
diff --git a/spec/logup.typ b/spec/logup.typ
index 7bb9a085d..038d67a8c 100644
--- a/spec/logup.typ
+++ b/spec/logup.typ
@@ -1,7 +1,6 @@
 #import "/book.typ": book-page, aside, cdsg
 
 #show: book-page("logup")
-#set heading(numbering: "1.")
 #show link: underline
 
 #show "constraint choice": link(<constraint_choices>)[constraint choice]

From 687253725716d22b3262e56e727087f0a2a6eeb6 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Tue, 28 Apr 2026 17:22:28 +0200
Subject: [PATCH 098/105] spec: `ARE_BYTES` (#532)

* spec/ARE_BYTES: introduce ARE_BYTES signature

* spec/ARE_BYTES: introduce ARE_BYTES lookup

* spec/ARE_BYTES: introduce IS_BYTE template

* spec/ARE_BYTES: switch IS_BYTE lookup to IS_BYTE template

* spec/ARE_BYTES: drop IS_BYTE interaction

* spec/ARE_BYTES: drop IS_BYTE lookup signature

* spec/ARE_BYTES: turn multiplicity into cond

* spec/ARE_BYTES: remove as potential optimization

* spec/ARE_BYTES: update assumptions using IS_BYTE
---
 spec/bitwise.typ             |  2 --
 spec/book.typ                |  1 +
 spec/is_byte.typ             | 22 ++++++++++++++++++++++
 spec/src/bitwise.toml        | 11 ++++++++---
 spec/src/branch.toml         |  4 ++--
 spec/src/cpu.toml            | 18 ++++++------------
 spec/src/is_byte.toml        | 21 +++++++++++++++++++++
 spec/src/keccak_round.toml   | 18 +++++++++---------
 spec/src/page.toml           |  6 ++----
 spec/src/sha256.toml         |  4 ++--
 spec/src/sha256msgsched.toml |  8 ++++----
 spec/src/sha256round.toml    |  8 ++++----
 spec/src/shift.toml          |  2 +-
 spec/src/signatures.toml     | 13 ++++++++++---
 14 files changed, 92 insertions(+), 46 deletions(-)
 create mode 100644 spec/is_byte.typ
 create mode 100644 spec/src/is_byte.toml

diff --git a/spec/bitwise.typ b/spec/bitwise.typ
index 82f9e36f9..332bef3d9 100644
--- a/spec/bitwise.typ
+++ b/spec/bitwise.typ
@@ -37,8 +37,6 @@ This chip adds the following interactions to the lookup:
 
 = Notes/Optimizations
 The following ideas may prove to be optimizations for the #bitwise chip:
-+ Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. 
-  When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`.
 + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`.
   Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`).
   This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check.
diff --git a/spec/book.typ b/spec/book.typ
index 57363ee23..f4e95e493 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -18,6 +18,7 @@
     )),
     ("TEMPLATES", (
       ("is_bit.typ", [`IS_BIT` template], <isbit>),
+      ("is_byte.typ", [`IS_BYTE` template], <isbyte>),
       ("sign.typ", [`SIGN` template], <sign>),
       ("add.typ", [`ADD`/`SUB` template], <add>),
       ("neg.typ", [`NEG` template], <neg>),
diff --git a/spec/is_byte.typ b/spec/is_byte.typ
new file mode 100644
index 000000000..09bf78e98
--- /dev/null
+++ b/spec/is_byte.typ
@@ -0,0 +1,22 @@
+#import "/book.typ": book-page
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": render_chip_variable_table, render_constraint_table, compute_nr_interactions, total_nr_variables, total_nr_instantiated_columns
+
+#let config = load_config()
+#let chip = load_chip("src/is_byte.toml", config)
+
+#show: book-page(chip.name)
+#let is_byte = raw(chip.name)
+
+#is_byte is a constraint template that is used to assert that a variable lies in the range $[0, 255]$ under the condition that `cond` is non-zero. Note: when `cond` is omitted, it defaults to $1$.
+
+When a chip leverages this template twice or more, implementors are encouraged to merge pairs of #is_byte interactions with identical conditions into `ARE_BYTES` interactions; the #is_byte template is included for convenience of notation, and to complete the specification of chips that use an odd number of #is_byte range checks.
+
+= Variables
+#let nr_interactions = compute_nr_interactions(chip)
+
+The #is_byte template leverages #nr_interactions interaction(s):
+#render_chip_variable_table(chip, config)
+
+= Constraints
+#render_constraint_table(chip, config)
\ No newline at end of file
diff --git a/spec/src/bitwise.toml b/spec/src/bitwise.toml
index 67d73facd..30875a810 100644
--- a/spec/src/bitwise.toml
+++ b/spec/src/bitwise.toml
@@ -101,6 +101,11 @@ name = "μ_IS_BYTE"
 type = "BaseField"
 desc = ""
 
+[[variables.multiplicity]]
+name = "μ_ARE_BYTES"
+type = "BaseField"
+desc = ""
+
 [[variables.multiplicity]]
 name = "μ_IS_HALF"
 type = "BaseField"
@@ -164,9 +169,9 @@ multiplicity = ["-", "μ_ZERO"]
 
 [[constraints.contributions]]
 kind = "interaction"
-tag = "IS_BYTE"
-input = ["X"]
-multiplicity = ["-", "μ_IS_BYTE"]
+tag = "ARE_BYTES"
+input = ["X", "Y"]
+multiplicity = ["-", "μ_ARE_BYTES"]
 
 [[constraints.contributions]]
 kind = "interaction"
diff --git a/spec/src/branch.toml b/spec/src/branch.toml
index a98974678..49a7833a3 100644
--- a/spec/src/branch.toml
+++ b/spec/src/branch.toml
@@ -116,10 +116,10 @@ output = "next_pc_unmasked"
 cond = "JALR"
 
 [[constraints.all]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = [["idx", "next_pc_low", 1]]
-multiplicity = "μ"
+cond = "μ"
 
 [[constraints.all]]
 kind = "interaction"
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index b25138d4d..a0bd3925c 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -523,43 +523,37 @@ input = ["EBREAK"]
 ref = "cpu:c:range_EBREAK"
 
 [[constraints.range]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = ["rs1"]
-multiplicity = 1
 
 [[constraints.range]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = ["rs2"]
-multiplicity = 1
 
 [[constraints.range]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = ["rd"]
-multiplicity = 1
 
 [[constraints.range]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = [["idx", "arg1", "i"]]
 iter = ["i", 0, 7]
-multiplicity = 1
 
 [[constraints.range]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = [["idx", "arg2", "i"]]
 iter = ["i", 0, 7]
-multiplicity = 1
 
 [[constraints.range]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = [["idx", "res", "i"]]
 iter = ["i", 0, 7]
-multiplicity = 1
 
 
 [[constraint_groups]]
diff --git a/spec/src/is_byte.toml b/spec/src/is_byte.toml
new file mode 100644
index 000000000..c9f8ee363
--- /dev/null
+++ b/spec/src/is_byte.toml
@@ -0,0 +1,21 @@
+name = "IS_BYTE"
+
+[[variables.condition]]
+name = "cond"
+type = "BaseField"
+desc = ""
+
+[[variables.input]]
+name = "X"
+type = "BaseField"
+desc = "Value for which to assert that it lies in the range $[0, 255]$."
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "ARE_BYTES"
+input = [0, "X"]
+multiplicity = "cond"
+ref = "isbyte:c:isbyte"
diff --git a/spec/src/keccak_round.toml b/spec/src/keccak_round.toml
index 548cb5151..d41b7c472 100644
--- a/spec/src/keccak_round.toml
+++ b/spec/src/keccak_round.toml
@@ -211,18 +211,18 @@ multiplicity = "μ"
 # Cxz_left  = [-1,  256, -1,  256, -1,  256, -1,  256] and
 # Cxz_right = [ 1, -256,  1, -256,  1, -256,  1, -256]
 [[constraints.theta]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = [["idx", ["idx", "Cxz_left", "x"], "z"]]
 iters = [["x", 0, 4], ["z", 0, 7]]
-multiplicity = "μ"
+cond = "μ"
 
 [[constraints.theta]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = [["idx", ["idx", "Cxz_right", "x"], "z"]]
 iters = [["x", 0, 4], ["z", 0, 7]]
-multiplicity = "μ"
+cond = "μ"
 
 [[constraints.theta]]
 kind = "interaction"
@@ -257,18 +257,18 @@ multiplicity = "μ"
 # rot_left  = [-1,  256, -1,  256, -1,  256, -1,  256] and
 # rot_right = [ 1, -256,  1, -256,  1, -256,  1, -256]
 [[constraints.rho]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = [["idx", ["idx", ["idx", "rot_left", "x"], "y"], "z"]]
-iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
-multiplicity = "μ"
+iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]] 
+cond = "μ"
 
 [[constraints.rho]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = [["idx", ["idx", ["idx", "rot_right", "x"], "y"], "z"]]
 iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
-multiplicity = "μ"
+cond = "μ"
 
 [[constraint_groups]]
 name = "chi"
diff --git a/spec/src/page.toml b/spec/src/page.toml
index dff939558..b6247a7c7 100644
--- a/spec/src/page.toml
+++ b/spec/src/page.toml
@@ -40,16 +40,14 @@ def = ["+", "page", ["*", "offset", ["cast", 1, "DWordWL"]]]
 name = "all"
 
 [[constraints.all]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = ["init"]
-multiplicity = 1
 
 [[constraints.all]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = ["fini"]
-multiplicity = 1
 
 [[constraints.all]]
 kind = "interaction"
diff --git a/spec/src/sha256.toml b/spec/src/sha256.toml
index 4cd4de9ba..78d022515 100644
--- a/spec/src/sha256.toml
+++ b/spec/src/sha256.toml
@@ -232,11 +232,11 @@ input = ["timestamp", "last_round_out", 64]
 multiplicity = ["-", "μ"]
 
 [[constraints.compress]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = [["idx", "out", "i"]]
-multiplicity = "μ"
 iter = ["i", 0, 31]
+cond = "μ"
 
 [[constraints.compress]]
 kind = "template"
diff --git a/spec/src/sha256msgsched.toml b/spec/src/sha256msgsched.toml
index 79664a797..402f7459a 100644
--- a/spec/src/sha256msgsched.toml
+++ b/spec/src/sha256msgsched.toml
@@ -79,10 +79,10 @@ desc = "#`IS_WORD[SHA256_M[timestamp, i]]` for $0 <= i < #`index`$"
 name = "lookback"
 
 [[constraints.lookback]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = [["-", "index", 16]]
-multiplicity = "μ"
+cond = "μ"
 
 [[constraints.lookback]]
 kind = "interaction"
@@ -130,10 +130,10 @@ output = "s1"
 multiplicity = "μ"
 
 [[constraints.calc]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = ["carry"]
-multiplicity = "μ"
+cond = "μ"
 
 [[constraints.calc]]
 kind = "interaction"
diff --git a/spec/src/sha256round.toml b/spec/src/sha256round.toml
index 8ec93ea36..2469b560c 100644
--- a/spec/src/sha256round.toml
+++ b/spec/src/sha256round.toml
@@ -254,10 +254,10 @@ multiplicity = "μ"
 iter = ["i", 0, 1]
 
 [[constraints.addition]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = ["carry_a"]
-multiplicity = "μ"
+cond = "μ"
 
 [[constraints.addition]]
 kind = "interaction"
@@ -267,10 +267,10 @@ multiplicity = "μ"
 iter = ["i", 0, 1]
 
 [[constraints.addition]]
-kind = "interaction"
+kind = "template"
 tag = "IS_BYTE"
 input = ["carry_e"]
-multiplicity = "μ"
+cond = "μ"
 
 [[constraint_groups]]
 name = "output"
diff --git a/spec/src/shift.toml b/spec/src/shift.toml
index bbe22a5d9..18c03ecad 100644
--- a/spec/src/shift.toml
+++ b/spec/src/shift.toml
@@ -147,7 +147,7 @@ iter = ["i", 0, 3]
 ref = "shift:a:range_in"
 
 [[assumptions]]
-desc = "`IS_BYTE[shift]`"
+desc = "`IS_BYTE<shift>`"
 ref = "shift:a:range_shift"
 
 [[assumptions]]
diff --git a/spec/src/signatures.toml b/spec/src/signatures.toml
index b04b3d561..e93c87b05 100644
--- a/spec/src/signatures.toml
+++ b/spec/src/signatures.toml
@@ -5,6 +5,13 @@ kind = "template"
 input = ["BaseField"]
 cond = "BaseField"
 
+# IS_BYTE<X, μ>
+[[signatures]]
+tag = "IS_BYTE"
+kind = "template"
+input = ["BaseField"]
+cond = "BaseField"
+
 # cond => ADD<sum; lhs, rhs>
 [[signatures]]
 tag = "ADD"
@@ -157,11 +164,11 @@ kind = "interaction"
 input = ["B20"]
 output = "Bit"
 
-# IS_BYTE[X]
+# ARE_BYTES[X, Y]
 [[signatures]]
-tag = "IS_BYTE"
+tag = "ARE_BYTES"
 kind = "interaction"
-input = ["Byte"]
+input = ["Byte", "Byte"]
 
 # IS_HALF[X]
 [[signatures]]

From 2e05a257a25d3cb21463142a7441c611699f2c43 Mon Sep 17 00:00:00 2001
From: Erik <159244975+erik-3milabs@users.noreply.github.com>
Date: Tue, 5 May 2026 10:25:03 +0200
Subject: [PATCH 099/105] spec: Fix `KECCAK` (#554)

* spec/keccak: fix cyclic-shift indexing mistakes

* spec/ecall: fix negative ECALL numbering

* spec/keccak: optimize Cxz_right from Byte to Bit

* spec/KECCAK: add potential optimization

rot_left and rot_right contain 96 constant zero-columns, which can be dropped. Additionally, those zeroes do not have to be byte-checked.

* spec/KECCAK: fix typo

* spec/KECCAK: list another potential optimization

* spec/keccak: fix index division problems

* spec/keccak: remove condition from IS_BIT
---
 spec/keccak.typ            |  5 +++++
 spec/src/keccak.toml       |  2 +-
 spec/src/keccak_round.toml | 40 +++++++++++++++++++++++++-------------
 spec/src/sha256.toml       |  2 +-
 4 files changed, 34 insertions(+), 15 deletions(-)

diff --git a/spec/keccak.typ b/spec/keccak.typ
index e6e16f43f..11d900c73 100644
--- a/spec/keccak.typ
+++ b/spec/keccak.typ
@@ -105,6 +105,11 @@ Lastly, the round chip contributes the following interactions to the lookup:
 == Notes/potential optimizations
 - one does not have to repeat `addr` in `state_ptr`; this saves 4 columns and 4 `IS_HALF` checks.
 - step $rho$ does not need to be applied to `state[0][0]`; its has a zero-shift. This saves 16 columns and 4 `HWSL` interactions.
+- when the outputs of `HWSL` are `Byte`s mapped as `Half`s, we find that out of every four output bytes, at least one is zero.
+  Since `rnc` is constant, @keccak:c:rho_rotation makes those zero-bytes show up in `rot_left` and `rot_right` at constant locations.
+  This means 96 columns can be removed from the chip at no cost.
+  Likewise, 96 `IS_BYTE` interactions can be dropped from @keccak:c:range_rot_left and @keccak:c:range_rot_right.
+  - the shift-constants are equivalent to $1 mod 16$ for $(#`x`, #`y`) = (1, 0)$ and $-1 mod 16$ for $(2, 3)$. This means that for those lanes it suffices to constrain `rot_left`/`rot_right` as `Bit`s rather than `Byte`s, saving an additional 8 `IS_BYTE` interactions.
 - $#`rc[2]` = #`rc[4]` = #`rc[5]` = #`rc[6]` = 0$. As such, those elements need not be stored in `rc`, and need not be XORed into the state in the $iota$-step. This saves 8 columns and 4 `XOR_BYTE` interactions.
 - when executed in large volumnes, `KECCAK_RND` could benefit from having a three-way XOR lookup table. With this in place, the 80 interactions in @keccak:c:theta_cxz_start and @keccak:c:theta_cxz could be dropped.
   Likewise, 80 columns could be removed from the chip (a \~5% savings).
diff --git a/spec/src/keccak.toml b/spec/src/keccak.toml
index b8f2d91c2..1c3bade44 100644
--- a/spec/src/keccak.toml
+++ b/spec/src/keccak.toml
@@ -48,7 +48,7 @@ name = "output"
 [[constraints.output]]
 kind = "interaction"
 tag = "ECALL"
-input = ["timestamp", ["arr", ["-", ["^", 2, 32], 1], ["-", ["^", 2, 32], 2]]]
+input = ["timestamp", ["cast", ["-", ["^", 2, 64], 2], "DWordWL"]]
 multiplicity = ["-", "μ"]
 
 [[constraint_groups]]
diff --git a/spec/src/keccak_round.toml b/spec/src/keccak_round.toml
index d41b7c472..59daba923 100644
--- a/spec/src/keccak_round.toml
+++ b/spec/src/keccak_round.toml
@@ -28,8 +28,8 @@ desc = "the left-rotated component of `rotated_Cxz`"
 
 [[variables.auxiliary]]
 name = "Cxz_right"
-type = [["Byte", 8], 5]
-desc = "the right-rotated component of `rotated_Cxz`"
+type = [["Bit", 4], 5]
+desc = "the right-rotated component of `rotated_Cxz` (which is a single bit)"
 
 [[variables.auxiliary]]
 name = "Dxz"
@@ -75,7 +75,16 @@ desc = "state update following from step $iota$."
 name = "rotated_Cxz"
 type = [["Byte", 8], 5]
 desc = "$#`Cxz[x,`3#`,z]` <<< 1$"
-def = {iters=[["x", 0, 4], ["z", 0, 7]], poly=["+", ["idx", ["idx", "Cxz_left", "x"], "z"], ["idx", ["idx", "Cxz_right", "x"], ["mod", ["-", "z", 1], 8]]]}
+def = {polys=[
+    {iters=[["x", 0, 4], ["z", 0]], poly=["+", ["idx", ["idx", "Cxz_left", "x"], "z"], ["idx", ["idx", "Cxz_right", "x"], 3]]},
+    {iters=[["x", 0, 4], ["z", 1]], poly=["idx", ["idx", "Cxz_left", "x"], "z"]},
+    {iters=[["x", 0, 4], ["z", 2]], poly=["+", ["idx", ["idx", "Cxz_left", "x"], "z"], ["idx", ["idx", "Cxz_right", "x"], 0]]},
+    {iters=[["x", 0, 4], ["z", 3]], poly=["idx", ["idx", "Cxz_left", "x"], "z"]},
+    {iters=[["x", 0, 4], ["z", 4]], poly=["+", ["idx", ["idx", "Cxz_left", "x"], "z"], ["idx", ["idx", "Cxz_right", "x"], 1]]},
+    {iters=[["x", 0, 4], ["z", 5]], poly=["idx", ["idx", "Cxz_left", "x"], "z"]},
+    {iters=[["x", 0, 4], ["z", 6]], poly=["+", ["idx", ["idx", "Cxz_left", "x"], "z"], ["idx", ["idx", "Cxz_right", "x"], 2]]},
+    {iters=[["x", 0, 4], ["z", 7]], poly=["idx", ["idx", "Cxz_left", "x"], "z"]},
+]}
 
 [[variables.virtual]]
 name = "out"
@@ -91,7 +100,7 @@ def = {polys=[
 [[variables.virtual]]
 name = "rho"
 type = [[["Byte", 8], 5], 5]
-desc = "$(rho compose theta)(#`start`)$; the state state after applying $rho$"
+desc = "$(rho compose theta)(#`start`)$; the state after applying $rho$"
 def = {iters=[["x", 0, 4], ["y", 0, 4], ["z", 0, 7]], poly=[
     "+", 
     ["*", 
@@ -99,7 +108,7 @@ def = {iters=[["x", 0, 4], ["y", 0, 4], ["z", 0, 7]], poly=[
         ["not", ["idx", ["idx", ["idx", "rbc", "x"], "y"], 1]],
         ["+",
             ["idx", ["idx", ["idx", "rot_left", "x"], "y"], "z"], 
-            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], ["mod", ["-", "z", 1], 8]],
+            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], ["mod", ["-", "z", 2], 8]],
         ]
     ],
     ["*", 
@@ -107,7 +116,7 @@ def = {iters=[["x", 0, 4], ["y", 0, 4], ["z", 0, 7]], poly=[
         ["not", ["idx", ["idx", ["idx", "rbc", "x"], "y"], 1]],
         ["+",
             ["idx", ["idx", ["idx", "rot_left", "x"], "y"], ["mod", ["-", "z", 2], 8]], 
-            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], ["mod", ["-", "z", 3], 8]],
+            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], ["mod", ["-", "z", 4], 8]],
         ]
     ],
     ["*",
@@ -115,7 +124,7 @@ def = {iters=[["x", 0, 4], ["y", 0, 4], ["z", 0, 7]], poly=[
         ["idx", ["idx", ["idx", "rbc", "x"], "y"], 1],
         ["+",
             ["idx", ["idx", ["idx", "rot_left", "x"], "y"], ["mod", ["-", "z", 4], 8]], 
-            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], ["mod", ["-", "z", 5], 8]],
+            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], ["mod", ["-", "z", 6], 8]],
         ]
     ],
     ["*", 
@@ -123,7 +132,7 @@ def = {iters=[["x", 0, 4], ["y", 0, 4], ["z", 0, 7]], poly=[
         ["idx", ["idx", ["idx", "rbc", "x"], "y"], 1],
         ["+",
             ["idx", ["idx", ["idx", "rot_left", "x"], "y"], ["mod", ["-", "z", 6], 8]], 
-            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], ["mod", ["-", "z", 7], 8]],
+            ["idx", ["idx", ["idx", "rot_right", "x"], "y"], "z"],
         ]
     ],
 ]}
@@ -202,7 +211,10 @@ ref = "keccak:c:theta_cxz"
 kind = "interaction"
 tag = "HWSL"
 input = [["idx", ["cast", ["idx", ["idx", "Cxz", "x"], 3], "DWordHL"], "z"], 1]
-output = ["arr", ["idx", ["cast", ["idx", "Cxz_left", "x"], "DWordHL"], "z"], ["idx", ["cast", ["idx", "Cxz_right", "x"], "DWordHL"], "z"]]
+output = ["arr", 
+    ["idx", ["cast", ["idx", "Cxz_left", "x"], "DWordHL"], "z"], 
+    ["cast", ["idx", ["idx", "Cxz_right", "x"], "z"], "Half"]
+]
 iters = [["x", 0, 4], ["z", 0, 3]]
 multiplicity = "μ"
 
@@ -219,10 +231,9 @@ cond = "μ"
 
 [[constraints.theta]]
 kind = "template"
-tag = "IS_BYTE"
+tag = "IS_BIT"
 input = [["idx", ["idx", "Cxz_right", "x"], "z"]]
-iters = [["x", 0, 4], ["z", 0, 7]]
-cond = "μ"
+iters = [["x", 0, 4], ["z", 0, 3]]
 
 [[constraints.theta]]
 kind = "interaction"
@@ -251,6 +262,7 @@ input = [["idx", ["cast", ["idx", ["idx", "theta", "x"], "y"], "DWordHL"], "z"],
 output = ["arr", ["idx", ["cast", ["idx", ["idx", "rot_left", "x"], "y"], "DWordHL"], "z"], ["idx", ["cast", ["idx", ["idx", "rot_right", "x"], "y"], "DWordHL"], "z"]]
 iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 3]]
 multiplicity = "μ"
+ref = "keccak:c:rho_rotation"
 
 # Note: these IS_BYTE checks are necessary.
 # Without them, it is possible to prove 0 <<< S evaluates to -1 by setting
@@ -260,8 +272,9 @@ multiplicity = "μ"
 kind = "template"
 tag = "IS_BYTE"
 input = [["idx", ["idx", ["idx", "rot_left", "x"], "y"], "z"]]
-iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]] 
+iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
 cond = "μ"
+ref = "keccak:c:range_rot_left"
 
 [[constraints.rho]]
 kind = "template"
@@ -269,6 +282,7 @@ tag = "IS_BYTE"
 input = [["idx", ["idx", ["idx", "rot_right", "x"], "y"], "z"]]
 iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
 cond = "μ"
+ref = "keccak:c:range_rot_right"
 
 [[constraint_groups]]
 name = "chi"
diff --git a/spec/src/sha256.toml b/spec/src/sha256.toml
index 78d022515..59d710998 100644
--- a/spec/src/sha256.toml
+++ b/spec/src/sha256.toml
@@ -266,5 +266,5 @@ input = ["μ"]
 [[constraints.lookup]]
 kind = "interaction"
 tag = "ECALL"
-input = ["timestamp", ["arr", ["-", ["^", 2, 32], 1], ["-", ["^", 2, 32], 1]]]
+input = ["timestamp", ["cast", ["-", ["^", 2, 64], 1], "DWordWL"]]
 multiplicity = ["-", "μ"]

From ec9f75627a61b510db48df7b396917fe73e20927 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Mon, 1 Jun 2026 10:48:43 +0200
Subject: [PATCH 100/105] spec: Fix typographical errors in SHA256 and ROTXOR
 constraints (#635)

---
 spec/src/rotxor.toml | 2 +-
 spec/src/sha256.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/spec/src/rotxor.toml b/spec/src/rotxor.toml
index 730e9bda5..f1ff904b0 100644
--- a/spec/src/rotxor.toml
+++ b/spec/src/rotxor.toml
@@ -154,7 +154,7 @@ poly = ["-", ["idx", ["cast", "a2", "WordHL"], 0], ["idx", "a2_left", 1], ["idx"
 [[constraints.shift]]
 kind = "arith"
 constraint = "$#`a2[1]` = #`last_rot` dot #`a2_left[0]` + #`a2_right[1]`$"
-poly = ["-", ["idx", ["cast", "a2", "WordHL"], 0], ["*", "last_rot", ["idx", "a2_left", 0]], ["idx", "a2_right", 1]]
+poly = ["-", ["idx", ["cast", "a2", "WordHL"], 1], ["*", "last_rot", ["idx", "a2_left", 0]], ["idx", "a2_right", 1]]
 
 [[constraint_groups]]
 name = "xor"
diff --git a/spec/src/sha256.toml b/spec/src/sha256.toml
index 59d710998..8ef356578 100644
--- a/spec/src/sha256.toml
+++ b/spec/src/sha256.toml
@@ -56,7 +56,7 @@ name = "memory"
 kind = "interaction"
 tag = "MEMW"
 input = [1, ["cast", ["*", 2, 11], "DWordWL"], ["arr", ["idx", ["cast", ["idx", "m_addr", 0], "DWordWL"], 0], ["idx", ["cast", ["idx", "m_addr", 0], "DWordWL"], 1], 0, 0, 0, 0, 0, 0], "timestamp", 1, 0, 0]
-output = ["arr", ["idx", ["cast", ["idx", "m_addr", 0], "DWordWL"], 0], ["idx", ["cast", ["idx", "m_addr", 0], "DWordWL"], 0], 0, 0, 0, 0, 0, 0]
+output = ["arr", ["idx", ["cast", ["idx", "m_addr", 0], "DWordWL"], 0], ["idx", ["cast", ["idx", "m_addr", 0], "DWordWL"], 1], 0, 0, 0, 0, 0, 0]
 multiplicity = "μ"
 
 [[constraints.memory]]

From f006292e976920496e3b62feb975611e2b39243c Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 2 Jun 2026 16:05:34 +0200
Subject: [PATCH 101/105] spec: rework CPU for smaller footprint (#624)

* spec: Draft CPU rework, still missing updates for all ALU chips, DECODE and CPU32

* Decoding for the new CPU

* Bitwise using BYTE_ALU

* Update old chips for the new CPU

* Fix decode for MEMORY signed flag

* Add input range check to SHIFTs typ

* MEMORY signature HL -> WL

* Clarify SHIFTW decoding

* Further updates and new chips for the new CPU

* Fix arg2 muxes

* Link instruction length to CPU32

* Apply easy changes from code review

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* Address more review comments

* Fix CPU32 register interactions

* Fix CPUs PC write multiplicity

* Fix HALT's interaction with the CPU padding

* Add some optional extra arith constraints to enforce the "easy" assumptions

* Combine assumptions constraints

* Remove todo comments that got a tracking issue

* word_instr gate for read_registerX out of caution

* Fix JAL(R) decoding and ALU

* Placate the type checker

* Represent instruction length as half

* Update spec/src/cpu.toml

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>

* fixes

---------

Co-authored-by: Erik <159244975+erik-3milabs@users.noreply.github.com>
---
 spec/bitwise.typ                  |   4 +
 spec/book.typ                     |  12 +-
 spec/branch.typ                   |   5 +
 spec/bytewise.typ                 |  38 ++
 spec/cpu.typ                      |  76 ++--
 spec/cpu32.typ                    |  61 +++
 spec/decode.typ                   | 109 +++--
 spec/dvrm.typ                     |   8 +-
 spec/eq.typ                       |  41 ++
 spec/expr.typ                     |   8 +
 spec/halt.typ                     |   7 +-
 spec/lt.typ                       |   9 +-
 spec/memw.typ                     |  13 +-
 spec/mul.typ                      |   3 -
 spec/shift.typ                    |   8 +-
 spec/src/bitwise.toml             |  12 +-
 spec/src/branch.toml              |  11 +-
 spec/src/bytewise.toml            |  49 +++
 spec/src/config.toml              |   9 +
 spec/src/cpu.toml                 | 676 +++++++++---------------------
 spec/src/cpu32.toml               | 416 ++++++++++++++++++
 spec/src/decode.toml              |  44 +-
 spec/src/decode_uncompressed.toml | 153 ++++---
 spec/src/dvrm.toml                |  55 ++-
 spec/src/eq.toml                  |  93 ++++
 spec/src/halt.toml                |  19 +-
 spec/src/keccak_round.toml        |  28 +-
 spec/src/load.toml                |  44 +-
 spec/src/lt.toml                  |  72 +++-
 spec/src/memw.toml                |  46 +-
 spec/src/memw_aligned.toml        |  28 +-
 spec/src/mul.toml                 |  44 +-
 spec/src/rotxor.toml              |   8 +-
 spec/src/sha256round.toml         |  20 +-
 spec/src/shift.toml               |  71 ++--
 spec/src/signatures.toml          |  65 +--
 spec/src/store.toml               | 122 ++++++
 spec/store.typ                    |  47 +++
 spec/tooling/chip.py              |   7 +
 39 files changed, 1659 insertions(+), 882 deletions(-)
 create mode 100644 spec/bytewise.typ
 create mode 100644 spec/cpu32.typ
 create mode 100644 spec/eq.typ
 create mode 100644 spec/src/bytewise.toml
 create mode 100644 spec/src/cpu32.toml
 create mode 100644 spec/src/eq.toml
 create mode 100644 spec/src/store.toml
 create mode 100644 spec/store.typ

diff --git a/spec/bitwise.typ b/spec/bitwise.typ
index 332bef3d9..1babeefcc 100644
--- a/spec/bitwise.typ
+++ b/spec/bitwise.typ
@@ -26,11 +26,15 @@ and convenience functionalities over small domains.
 
 The #bitwise chip is comprised of #nr_variables variables that are expressed using #nr_columns columns.
 Of these, the _input_ and _output_ variables (#nr_precomputed in total) are precomputed.
+
 #render_chip_variable_table(chip, config)
 
 *Note*: This table contains one row for every possible value of `(X, Y, Z)`.
 As such, it has length $2^8 dot 2^8 dot 2^4 = 2^(20)$.
 
+We use the ALU operation descriptors from @decode to identify the operations in the `BYTE_ALU` interaction.
+Since each of the three columns is only $2^16$ rows long, they can be combined into a single column of $2^20$ rows (with room to spare).
+
 = Lookup
 This chip adds the following interactions to the lookup:
 #render_constraint_table(chip, config)
diff --git a/spec/book.typ b/spec/book.typ
index f4e95e493..8bf8612af 100644
--- a/spec/book.typ
+++ b/spec/book.typ
@@ -23,21 +23,25 @@
       ("add.typ", [`ADD`/`SUB` template], <add>),
       ("neg.typ", [`NEG` template], <neg>),
     )),
-    ("MEMORY", (
-      ("memw.typ", [`MEMW` chip], <memw>),
-    )),
     ("CPU", (
       ("decode.typ", [`DECODE` table], <decode>),
       ("cpu.typ", [`CPU` chip], <cpu>),
+      ("cpu32.typ", [`CPU32` chip], <cpu32>),
     )),
     ("ALU", (
       ("shift.typ", [`SHIFT` chip], <shift>),
       ("branch.typ", [`BRANCH` chip], <branch>),
       ("lt.typ", [`LT` chip], <lt>),
+      ("eq.typ", [`EQ` chip], <eq>),
       ("mul.typ", [`MUL` chip], <mul>),
       ("dvrm.typ", [`DVRM` chip], <dvrm>),
-      ("load.typ", [`LOAD` chip], <load>),
       ("bitwise.typ", [`BITWISE` chips], <bitwise>),
+      ("bytewise.typ", [`BYTEWISE` chip], <bytewise>)
+    )),
+    ("MEMORY", (
+      ("memw.typ", [`MEMW` chip], <memw>),
+      ("load.typ", [`LOAD` chip], <load>),
+      ("store.typ", [`STORE` chip], <store>),
     )),
     ("ECALLS", (
       ("about_ecalls.typ", [About `ECALL`], <ecall>),
diff --git a/spec/branch.typ b/spec/branch.typ
index 0743ae2f9..d6ab53a18 100644
--- a/spec/branch.typ
+++ b/spec/branch.typ
@@ -30,6 +30,11 @@ The #branch chip is comprised of #nr_variables variables that are expressed usin
 
 #render_chip_assumptions(chip, config)
 
+Some of the assumptions can be checked with only arithmetic constraints, so we
+provide these below.
+
+#render_constraint_table(chip, config, groups: "assumptions")
+
 = Constraints
 
 We constrain `next_pc` to be $#`base_address` + #`offset`$,
diff --git a/spec/bytewise.typ b/spec/bytewise.typ
new file mode 100644
index 000000000..452e4fdbc
--- /dev/null
+++ b/spec/bytewise.typ
@@ -0,0 +1,38 @@
+#import "/book.typ": book-page, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_variable_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  compute_nr_interactions,
+  render_constraint_table,
+  render_chip_padding_table,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/bytewise.toml", config)
+
+#show: book-page(chip.name)
+#let bytewise = raw(chip.name)
+
+The #bytewise chip is an ALU chip that decomposes the input `DWordWL` values into bytes and
+performs a `BITWISE` operation pairwise (AND, OR, XOR).
+The `BITWISE` lookup inherently performs a range check, so no further constraints are necessary.
+
+= Variables
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+#let nr_interactions = compute_nr_interactions(chip)
+
+The #bytewise chip is comprised of #nr_variables variables that are expressed using #nr_columns columns and leverages #nr_interactions interaction(s):
+#render_chip_variable_table(chip, config)
+
+= Constraints
+
+#render_constraint_table(chip, config)
+
+= Padding
+
+The chip can be padded with the following values:
+#render_chip_padding_table(chip, config)
diff --git a/spec/cpu.typ b/spec/cpu.typ
index 2ee85befa..cb11661f3 100644
--- a/spec/cpu.typ
+++ b/spec/cpu.typ
@@ -17,7 +17,7 @@
 #let cpu = raw(chip.name)
 
 The #cpu chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations.
-It bases its decisions on the entry of the `DECODE` table (@decode) corresponding the the current program counter (PC).
+It bases its decisions on the entry of the `DECODE` table (@decode) corresponding to the current program counter (PC).
 
 = Variables
 #let nr_variables = total_nr_variables(chip)
@@ -30,36 +30,52 @@ The #cpu chip is comprised of #nr_variables variables that are expressed using #
 = Assumptions
 #render_chip_assumptions(chip, config)
 
+Additionally, the following constraints can be used to provide defense-in-depth
+validation of the assumptions.
+
+#render_constraint_table(chip, config, groups: "assumptions")
+
 = Constraints
+
 First, we perform a decoding lookup for the current PC.
+Instructions having the `word_instr` flag set are not decoded here, as they are delegated to the `CPU32` chip.
+In that case, we ensure that the current row of the CPU cannot have any other observable effects.
 
 #render_constraint_table(chip, config, groups: "decode")
 
 == Range checks
 
 We constrain all columns to have the appropriate ranges.
-The flags and register indices looked up from the decoding need to be checked,
-as they are communicated through the interaction in a packed form.
+All values in `packed_decode` need to be checked to ensure
+the packing is correct for the interaction.
 In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`.
 Similarly, since `next_pc` will propagate through the memory argument and be looked up
-in the instruction decoding on the next cycle, it is forced to be in the correct range.#rj[is this true, do we need this elsewhere for chip assumptions?]
-For the auxiliary columns, we need to check the limbs of `arg1`, `arg2`, and `res`.
+in the instruction decoding on the next cycle, it is forced to be in the correct range;
+the final value for `next_pc` is similarly fixed by the memory finalization.
+For the auxiliary columns, we need to check the limbs of `res`, since
+`rv1` and `rv2` are enforced by the memory argument, and `rvd` is correct by the correctness of the dependent chips.
 The ranges of the other auxiliary columns are enforced through later constraints.
-#rj[Make sure we argue for every column here]
-#rj[is `rvd` still sufficiently constrained? (can also be done through the memory argument like `pc`?)]
 
 #render_constraint_table(chip, config, groups: "range")
 
 == ALU
 
-The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
+The ALU functionality is then obtained through delegation to the `ALU` signature, backed by the various ALU chips,
+or by using the appropriate template.
+For the pure ALU path, `arg2` is computed as `rv2 + imm`, which relies on @cpu:a:arg2-multiplex to
+be either `rv2` or `imm`, depending on the instruction.
+The other contributions for `arg2` are specific to the (mutually exclusive, @cpu:a:mem-branch-mutex)
+`MEMORY` and `BRANCH` flags:
+- For the `MEMORY` path, we want the output of the ALU to be $#`rv1` + #`imm`$, as that is the
+  address at which the memory access occurs.
+- For the `BRANCH` path, we want the ALU output to reflect the branch condition (or just be inactive for JALR).
 
 #render_constraint_table(chip, config, groups: "alu")
 
 == Memory<cpu:memory>
 
-The interactions with the memory, both for register loading and storing, as for `LOAD` and `STORE` instructions are handled.
-Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs.
+Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs,
+simultaneously ensuring that register reads are properly range checked as long as all writes are.
 The `pc` register behaves very predictably with respect to its timestamps and when it is being read,
 so for performance reasons, we inline its memory interactions directly into the #cpu chip.
 
@@ -70,32 +86,36 @@ Constraints regarding whether `pc_double_read` corresponds to an `AUIPC` instruc
 as regardless of its value, the old timestamp is guaranteed smaller than the new timestamp,
 and the integrity of the memory argument therefore ensures the correctness of this bit.
 
-#render_constraint_table(chip, config, groups: "mem")
-
-=== Potential optimizations
+The memory interaction itself is handled by the `MEMORY` signature,
+which will read the `mem_flags` argument to perform either a `LOAD` or a `STORE`.
+We refer to the previous section's description of `arg2` for how
+the address is computed.
 
-- `double_pc_read` could be integrated into decoding, so that `AUIPC` could set `read_register1 = 0` and no extra MEMW access for `rv1` is needed at this point.
+The value to (potentially) be written back to `rd` is stored in `rvd`,
+which can either come from the ALU --- in case of an ALU operation or a JALR branch ---
+or from the MEMORY interaction.
 
-== System
-
-The interactions with the wider system.
+#render_constraint_table(chip, config, groups: "mem")
 
-#render_constraint_table(chip, config, groups: "sys")
+== Branching
 
-== Input and output to the ALU
+A branch is expressed by having the `BRANCH` flag set to 1.
+Since `BRANCH` and `MEMORY` are mutually exclusive (@cpu:a:mem-branch-mutex),
+we can repurpose the `mem_flags` field to indicate a JALR instruction.
+When JALR is not set, we have a conditional branch that is decided upon by the result of the ALU instructions,
+as set in the `res` variable.
+As such, we can set `branch_cond` appropriately as the multiplicity flag for the `BRANCH` chip.
 
-We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values,
-including the appropriate sign/zero extension, depending on `word_instr`.
+#render_constraint_table(chip, config, groups: "branch")
 
-#render_constraint_table(chip, config, groups: "ext")
+== System
 
-== Other constraints
-For @cpu:c:is_equal, note that @cpu:c:sub sets `res` to be the difference between `arg1` and `arg2` whenever `BEQ` is $1$.
-Given that this difference is $0$ when both are equal, @cpu:c:is_equal ensures `is_equal` is set to $1$ if and only if $#`arg1` = #`arg2`$ and `BEQ` is set.
+The interactions with the wider system go through the `ECALL` interface.
+Since we treat `EBREAK` instructions as unprovable traps, we avoid emitting `DECODE` rows
+for these, and do not need any further handling in the CPU.
 
-#render_constraint_table(chip, config, groups: "misc")
+#render_constraint_table(chip, config, groups: "sys")
 
-#rj[Document the choice to not have a multiplicity column here for padding]
 
 = Padding
 
@@ -104,4 +124,4 @@ in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall
 
 #render_chip_padding_table(chip, config)
 
-This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
+This approach minimizes the number of dependent lookups, increasing only multiplicities in the `DECODE` table and the `IS_BYTE` and `IS_HALF` lookups.
diff --git a/spec/cpu32.typ b/spec/cpu32.typ
new file mode 100644
index 000000000..e5e8963bc
--- /dev/null
+++ b/spec/cpu32.typ
@@ -0,0 +1,61 @@
+#import "/book.typ": book-page, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_variable_table,
+  render_chip_padding_table,
+  render_constraint_table,
+  compute_nr_interactions,
+  total_nr_instantiated_columns,
+  total_nr_variables,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/cpu32.toml", config)
+
+#show: book-page(chip.name)
+#let cpu32 = raw(chip.name)
+
+The #cpu32 chip is used to delegate the 32-bit instructions of the RV64I instruction set
+from the main CPU table (@cpu).
+All 32-bit instructions are ALU-only instructions, so the BRANCH, MEMORY and ECALL paths need no elaboration.
+The timestamp and PC have already been read by the CPU table at this point, and need no further checking;
+the PC for the next instruction will also already be handled by CPU.
+
+The structure follows the regular ALU path, with some extra variables and constraints to contain the required sign extensions.
+
+= Variables
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+#let nr_interactions = compute_nr_interactions(chip)
+
+The #cpu32 chip is comprised of #nr_variables variables that are expressed using #nr_columns columns and leverages #nr_interactions interaction(s):
+#render_chip_variable_table(chip, config)
+
+= Assumptions
+
+#render_chip_assumptions(chip, config)
+
+Some of the assumptions can be checked with only arithmetic constraints, so we
+provide these below.
+
+#render_constraint_table(chip, config, groups: "assumptions")
+
+= Constraints
+
+Most constraints correspond to those already present in the CPU, and we present them here first,
+including some updates to the range checking corresponding to the differing types.
+
+#render_constraint_table(chip, config, groups: ("decode", "range", "alu", "mem", "logup"))
+
+Then, we have the constraints corresponding to the sign-extension and definition of `arg1`, `arg2` and `rd`.
+This includes a step where we extract the `signed` bit from the `alu_flags`, as this determines
+whether to sign extend the inputs or not.
+
+#render_constraint_table(chip, config, groups: "ext")
+
+= Padding
+
+The table can be padded with the following values:
+#render_chip_padding_table(chip, config)
+
diff --git a/spec/decode.typ b/spec/decode.typ
index 21cd26acb..6defcdc84 100644
--- a/spec/decode.typ
+++ b/spec/decode.typ
@@ -8,6 +8,7 @@
   render_constraint_table,
   render_chip_padding_table,
 )
+#import "/expr.typ": expr_to_math
 
 #let config = load_config()
 #let chip = load_chip("src/decode.toml", config)
@@ -34,24 +35,44 @@ Empty rows with the following content can be added to achieve this:
 
 #render_chip_padding_table(chip, config)
 
-Note that this row sets the `EBREAK` flag.
-Given that `CPU` asserts that `EBREAK = 0` (see @cpu:c:ebreak_traps), using this "padding-instruction" would immediately make the CPU table unprovable.
-Note moreover that the `pc` is set to $7$.
-This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _$4$_ (i.e., the max `pc`-increment) greater than _$1$_ (i.e., the `pc`-value used in the #link(<cpu-padding-decode-row>)[additional instruction] referred to by `CPU`-padding lines).
+This is simultaneously the row that is used for padding rows in the CPU,
+if the multiplicity is nonzero,
+so we need to ensure that this table has at least one row of padding.
 
 = Decoding<decode:decoding-overview>
 For the purposes of explaining decoding, we decompress #decode's `packed_decode` variable into its constituent variables.
 Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
+The construction of the `alu_flags` and `mem_flags` columns is given here through virtual columns.
 
 #let config = load_config()
 #let uncompressed_chip = load_chip("src/decode_uncompressed.toml", config)
 
 #render_chip_variable_table(uncompressed_chip, config)
 
+First, we provide a mapping from an ALU operation "descriptor" to the numerical value as used for the `alu_op` column.
+This is the table used to find the value for the #expr_to_math(("opsel", "OPERATION")) notation when performing `ALU` or `BYTE_ALU` interactions.
+
+#figure(
+  table(columns: (auto, auto),
+        stroke: 0pt,
+        inset: (right: .5em),
+        align: (left, left), table.header[*Descriptor*][*value*], table.hline(stroke: 1.5pt))[
+    *AND*][0][
+    *OR*][1][
+    *XOR*][2][
+    *EQ*][3][
+    *LT*][4][
+    *SHIFT*][5][
+    *SHIFTW*][6][
+    *MUL*][7][
+    *DIVREM*][8]
+)
+
 We will illustrate how each instruction should be expressed in this (uncompressed) decoding table.
 The columns of the accompanying table represent the following:
 - *`operation`*: the assembly operation being encoded.
-- *`op-flag`*: which of the "`ALU` selector flags" operation flags to set. Each operation sets exactly one.
+- *`alu`*: Set to the descriptor of the ALU operation to be used for `alu_op`.
+  If listed as `ADD` or `SUB`, the corresponding flag should be set; otherwise, set `ALU = 1` when this column is not empty.
 - *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively.
 - *other*: the other flags that should be set or variables that should be given specific values.
 
@@ -59,8 +80,6 @@ For the purpose of brevity and readability, the table uses the following rules-o
 + `rd`, `rs1`, `rs2`, and `imm` are mapped to the values provided by the instruction;
   when a value is not specified by an instruction it defaults to $0$.
 + `read_register1`, `read_register2` and `write_register` are set to $1$ when respectively $#`rs1` != 0$, $#`rs2` != 0$, or  $#`rd` != 0$.
-+ Any flag that is not listed is set to $0$, with the exception of the `c_type` flag. 
-  *The `c_type` flag is set independently of the below table*, as explained next.
 
 Further clarification is provided in the notes following the table.
 
@@ -81,38 +100,38 @@ Further clarification is provided in the notes following the table.
       // Overlay a low-opacity fill color to distinguish the different rows better
       if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) }
       else { color.rgb(255, 255, 255, 20) },
-    table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []),
+    table.header([*Operation*], [*alu*], [*`w_instr`*], [*`signed`*], [*other*], []),
     table.hline(stroke: 1.5pt),
     table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt),
     ..lines.flatten(),
     table.hline(stroke: 1.5pt),
-    table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]),
+    table.footer([*Operation*], [*alu*], [*`w_instr`*], [*`signed`*], [*other*]),
   ))
 }
 
 #let decoding = (
     // OP-IMM
   ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], [#ref_note(<note_word_instr>)]),
-  ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
+  ([`SLTI[U]   rd, rs1, imm`], [`LT`], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
   ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []),
   ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []),
   ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []),
-  ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []),
-  ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], [#ref_note(<note_word_instr>)]),
-  ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_word_instr>)]),
+  ([`SLLI[W]   rd, rs1, imm`], [`SHIFT[W]`], [`[W]`], [], [], [#ref_note(<note_word_instr>)]),
+  ([`SRLI[W]   rd, rs1, imm`], [`SHIFT[W]`], [`[W]`], [], [`invert`], [#ref_note(<note_word_instr>)]),
+  ([`SRAI[W]   rd, rs1, imm`], [`SHIFT[W]`], [`[W]`], [1], [`invert`], [#ref_note(<note_word_instr>)]),
   // OP
   ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], [#ref_note(<note_word_instr>)]),
   ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], [#ref_note(<note_word_instr>)]),
-  ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
+  ([`SLT[U]    rd, rs1, rs2`], [`LT`], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
   ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []),
   ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []),
   ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []),
-  ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], [#ref_note(<note_word_instr>)]),
-  ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], [#ref_note(<note_word_instr>)]),
-  ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_word_instr>)]),
+  ([`SLL[W]    rd, rs1, rs2`], [`SHIFT[W]`], [`[W]`], [], [], [#ref_note(<note_word_instr>)]),
+  ([`SRL[W]    rd, rs1, rs2`], [`SHIFT[W]`], [`[W]`], [], [`invert`], [#ref_note(<note_word_instr>)]),
+  ([`SRA[W]    rd, rs1, rs2`], [`SHIFT[W]`], [`[W]`], [1], [`invert`], [#ref_note(<note_word_instr>)]),
   // OP - M
-  ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], [#ref_note(<note_word_instr>)]),
-  ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []),
+  ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`signed2`], [#ref_note(<note_word_instr>)]),
+  ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`signed2`, `muldiv_selector`], []),
   ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []),
   ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []),
   ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [#sym.not`[U]`], [], [#ref_note(<note_word_instr>, <note_signed>)]),
@@ -120,37 +139,42 @@ Further clarification is provided in the notes following the table.
   // LUI/AUIPC
   ([`LUI       rd, imm`], [`ADD`], [], [], [], [#ref_note(<note-lui>)]),
   ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], [#ref_note(<note-auipc>)]),
-  ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], [#ref_note(<note-jal>)]),
+  ([`JAL       rd, imm`], [], [], [], [`BRANCH`, `JALR`, `rs1 := x255`], [#ref_note(<note-jal>)]),
   // Branching
-  ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []),
-  ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []),
-  ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []),
-  ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
-  ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [#sym.not`[U]`], [`mp_selector`], [#ref_note(<note_signed>)]),
+  ([`JALR      rd, rs1, imm`], [], [], [], [`BRANCH`, `JALR`], []),
+  ([`BEQ      rs1, rs2, imm`], [`EQ`], [], [], [`BRANCH`], []),
+  ([`BNE      rs1, rs2, imm`], [`EQ`], [], [], [`BRANCH`, `invert`], []),
+  ([`BLT[U]   rs1, rs2, imm`], [`LT`], [], [#sym.not`[U]`], [`BRANCH`], [#ref_note(<note_signed>)]),
+  ([`BGE[U]   rs1, rs2, imm`], [`LT`], [], [#sym.not`[U]`], [`BRANCH`, `invert`], [#ref_note(<note_signed>)]),
   // LOAD
-  ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []),
-  ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [#sym.not`[U]`], [`mem_4B`], [#ref_note(<note_signed>)]),
-  ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [#sym.not`[U]`], [`mem_2B`], [#ref_note(<note_signed>)]),
-  ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [#sym.not`[U]`], [], [#ref_note(<note_signed>)]),
+  ([`LD        rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_8B`], []),
+  ([`LW[U]     rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_signed := `#sym.not`[U]`, `mem_4B`], [#ref_note(<note_signed>)]),
+  ([`LH[U]     rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_signed := `#sym.not`[U]`, `mem_2B`], [#ref_note(<note_signed>)]),
+  ([`LB[U]     rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_signed := `#sym.not`[U]`], [#ref_note(<note_signed>)]),
   // STORE
-  ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []),
-  ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []),
-  ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []),
-  ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []),
+  ([`SD       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`, `mem_8B`], []),
+  ([`SW       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`, `mem_4B`], []),
+  ([`SH       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`, `mem_2B`], []),
+  ([`SB       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`], []),
   // ECALL/EBREAK
-  ([`ECALL`], [`ECALL`], [], [], [$#`rs1` := #`x17`$], [#ref_note(<note-ecall>)]),
-  ([`EBREAK`], [`EBREAK`], [], [], [], []),
+  ([`ECALL`], [], [], [], [`ECALL`, $#`rs1` := #`x17`$], [#ref_note(<note-ecall>)]),
   // FENCE
   ([`FENCE`], [`ADD`], [], [], [], [#ref_note(<note-fence>)]),
 )
 
 #decoding_table(decoding)
 
+Note that the above table has no entry for the `EBREAK` instruction.
+We treat `EBREAK` as an unprovable trap, and its absence from the table enables
+this by having no valid decoding available for when the instruction is encountered.
+
 == C-type instructions
 The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size.
 This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by $2$ rather than $4$.
-To indicate an instruction is provided in compressed form, the `c_type` flag is introduced.
-*This flag should be set to $1$ whenever the decoded instruction is provided in compressed form and $0$ otherwise.*
+As such, we provide the `half_instruction_length` column that *must take on the value $1$ for compressed instructions and $2$ for regular instructions*.
+It is represented as half the number of bytes in the instruction to make misaligned instruction lengths unrepresentable.
+Additionally, having the variable opens the door for future optimizations involving "fused" instructions, where common sequences
+of instructions are merged into a single decoded version and need only a single CPU row to prove.
 
 // Construct a note that can be referenced through `lbl`
 #let referenceable_note(lbl, note) = {
@@ -164,7 +188,7 @@ We note the following about the above decoding table:
   enum.item(
     referenceable_note(
       "note_word_instr",
-      [`word_instr`: `[W]` indicates that $#`word_instr` = 1$ for the `W`-variant of the operation, and $0$ for the non-`W`-variant.]
+      [`word_instr`: `[W]` indicates that $#`word_instr` = 1$ for the `W`-variant of the operation, and $0$ for the non-`W`-variant. Similarly, `SHIFT[W]` indicates the `SHIFTW` operation for the `W`-variant, and `SHIFT` otherwise.]
     )
   ),
   enum.item(
@@ -192,7 +216,7 @@ We note the following about the above decoding table:
   enum.item(
     referenceable_note(
       "note-jal",
-      [`JAL`: this operation stores $#`pc` + 4$ in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`.
+      [`JAL`: this operation stores $#`pc` + #`2 * half_instruction_length`$ in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`.
       Note that this can be represented using `JALR rd, x255, imm`.
       As such, *we expect the decoding to take care of writing the immediate in bit range $[1:21]$ of `imm` and extending it to 64 bits; the least significant bit should always be 0.*]
     )
@@ -212,10 +236,3 @@ We note the following about the above decoding table:
     )
   )
 )
-
-== One more instruction <cpu-padding-decode-row>
-In addition to decoding all instructions provided in the ELF and adding a corresponding entry to the #decode table, one must include an entry that has $#`pc` = 1$ and every other variable set to $0$.
-Note that this will never conflict with any entry in the ELF, since it has an odd `pc` value.
-
-This entry is used to pad the `CPU` table.
-More details on this matter are provided in the `CPU` chip.
diff --git a/spec/dvrm.typ b/spec/dvrm.typ
index 1118aa10a..c2d1c6450 100644
--- a/spec/dvrm.typ
+++ b/spec/dvrm.typ
@@ -27,10 +27,12 @@ The #dvrm chip provides division and remainder functionality, both signed and un
 The #dvrm chip is comprised of #nr_variables variables that are expressed using #nr_columns columns and leverages #nr_interactions interaction(s):
 #render_chip_variable_table(chip, config)
 
-= Assumptions
-#render_chip_assumptions(chip, config)
 
 = Constraints
+
+First, we range-check all inputs.
+#render_constraint_table(chip, config, groups: "range")
+
 From the ISA, we gather five requirements for the `DIV[U][W]` and `REM[U][W]` instructions:
 #enum(numbering: "R1.",
   enum.item([
@@ -106,7 +108,7 @@ Rewriting R1, we find the constraint $not#`overflow` => #`n` - #`r` = #`qd`$.
 #footnote([Recall that @dvrm:c:sign_q allows to assert this equality even when `overflow`.])
 Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality $mod 2^128$, rather than $mod 2^64$.
 To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to $#`qd` mod 2^128$ using constraints @dvrm:c:mul_lower and @dvrm:c:mul_upper;
-@dvrm:c:q_range is included to uphold assumption @mul:a:rhs.
+@dvrm:c:q_range is included to uphold assumption @mul:c:rhs.
 
 #render_constraint_table(chip, config, groups:("equality", ))
 
diff --git a/spec/eq.typ b/spec/eq.typ
new file mode 100644
index 000000000..379df796d
--- /dev/null
+++ b/spec/eq.typ
@@ -0,0 +1,41 @@
+#import "/book.typ": book-page, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_variable_table,
+  total_nr_variables,
+  total_nr_instantiated_columns,
+  compute_nr_interactions,
+  render_constraint_table,
+  render_chip_padding_table,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/eq.toml", config)
+
+#show: book-page(chip.name)
+#let eq = raw(chip.name)
+
+The #eq chip is an ALU chip that compares two values and outputs a bit indicating whether they are equal or not.
+It optionally inverts the result if the `invert` flag is set.
+
+= Variables
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+#let nr_interactions = compute_nr_interactions(chip)
+
+The #eq chip is comprised of #nr_variables variables that are expressed using #nr_columns columns and leverages #nr_interactions interaction(s):
+#render_chip_variable_table(chip, config)
+
+= Assumptions
+
+#render_chip_assumptions(chip, config)
+
+= Constraints
+
+#render_constraint_table(chip, config)
+
+= Padding
+
+The chip can be padded with the following values:
+#render_chip_padding_table(chip, config)
diff --git a/spec/expr.typ b/spec/expr.typ
index 20a55d753..5a275f47e 100644
--- a/spec/expr.typ
+++ b/spec/expr.typ
@@ -99,6 +99,10 @@
 // Typeset an expression as code
 #let expr_to_code = make_expr_formatter(
   (
+    "opsel": (pp, rec, e) => {
+      assert(type(e.at(1)) == type(""), message: "opsel expects a string")
+      `⧼` + raw(e.at(1)) + `⧽`
+    },
     "arr": (pp, rec, e) => `[` + e.slice(1).map(rec.with(PREC.MAX)).join(`, `) + `]`,
     "idx": (pp, rec, e) => rec(PREC.MIN, e.at(1)) + `[` + rec(PREC.MAX, e.at(2)) + `]`,
     "not": (pp, rec, e) => cwrap(rec(PREC.not, 1) + ` - ` + rec(PREC.not, e.at(1)), pp < PREC.not),
@@ -165,6 +169,10 @@
 // Typeset an expression as math
 #let expr_to_math = make_expr_formatter(
   (
+    "opsel": (pp, rec, e) => {
+      assert(type(e.at(1)) == type(""), message: "opsel expects a string")
+      $lr(chevron.l.curly#raw(e.at(1))chevron.r.curly)$
+    },
     "arr": (pp, rec, e) => $[#e.slice(1).map(rec.with(PREC.MAX)).join($, $)]$,
     "idx": (pp, rec, e) => {
       let (val, idxs) = flat_idxs(e)
diff --git a/spec/halt.typ b/spec/halt.typ
index 691154f61..50a39ac40 100644
--- a/spec/halt.typ
+++ b/spec/halt.typ
@@ -32,9 +32,12 @@ It is assumed the input is range checked:
 The #halt chip:
 + makes sure register `x10` (containing the exit code) equals $0$ (@halt:c:read_zero_exit_code),
 + writes $0$ to all other registers (@halt:c:zeroize_registers_lo/@halt:c:zeroize_registers_hi), and
-+ sets `pc` equal to $1$ (@halt:c:pc).
-Note that the writes performed by all these interactions are accompanied by the timestamp $2^64-1$; the maximum timestamp.
++ sets `pc` equal to $1$ (@halt:c:consume_pc, @halt:c:emit_pc).
+Note that the writes performed by all these interactions --- except for the `pc` --- are accompanied by the timestamp $2^64-1$; the maximum timestamp.
 This prevents any other operation involving memory from being executed hereafter.
+The `pc` is consumed and re-emitted at the same timestamp to enable padding rows for the CPU.
+This means that the verifier will have to know the final timestamp at which a CPU padding `pc` was written
+to be able to balance the final LogUp.
 #render_constraint_table(chip, config, groups: "all")
 
 #aside("Note on register clean up",
diff --git a/spec/lt.typ b/spec/lt.typ
index 2e0ac8be4..6dd60b236 100644
--- a/spec/lt.typ
+++ b/spec/lt.typ
@@ -17,6 +17,7 @@
 #let lt = raw(chip.name)
 
 The #lt chip constrains an indicator bit for the less-than relation, signed or unsigned.
+If the `invert` flag is set, it inverts the result.
 
 = Variables
 #let nr_variables = total_nr_variables(chip)
@@ -31,7 +32,7 @@ We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
 #render_chip_assumptions(chip, config)
 
 = Constraints
-We first constrain that all variables correspond to their definition.
+We first constrain that all inputs are range checked and all variables correspond to their definition.
 For the defining constraint of `lt`, @lt:c:lt, observe that it is a choice
 between two options, depending on the input flag `signed`.
 In the case of unsigned comparison, we simply need `unsigned_lt`, indicating
@@ -74,7 +75,7 @@ However, the left hand side of this is at least $3 dot 2^31$, as $(A, C) = (1, 1
 and the right hand side is at most $(2^31 - 1) + (2^32 - 1) + 1 = 3 dot 2^31 - 1$.
 Therefore, we can use $Q$ to constrain `lt` when `signed = 1`.
 
-#render_constraint_table(chip, config, groups: "defs")
+#render_constraint_table(chip, config, groups: ("range", "defs"))
 
 And then we constrain the subtraction,
 taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
@@ -90,3 +91,7 @@ The chip contributes the following to the lookup argument.
 The table can be padded to the next power of two with the following value assignments:
 
 #render_chip_padding_table(chip, config)
+
+= Potential optimizations
+
+- Split the chip into a signed and an unsigned chip, making the unsigned version cheaper.
diff --git a/spec/memw.typ b/spec/memw.typ
index 425508597..b963ea562 100644
--- a/spec/memw.typ
+++ b/spec/memw.typ
@@ -34,6 +34,11 @@ The #memw chip is comprised of #nr_variables variables that are expressed using
 
 #render_chip_assumptions(chip, config)
 
+Some of the assumptions can be checked with only arithmetic constraints, so we
+provide these below.
+
+#render_constraint_table(chip, config, groups: "assumptions")
+
 Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns,
 as these are not necessary for the correctness of this chip in isolation.
 Still, these properties are necessary for the consistency of the system as a whole, and therefore
@@ -93,6 +98,12 @@ Further logic remains essentially the same, so we briefly present the relevant t
 The #aligned chip only needs #nr_variables variables, expressed through #nr_columns columns; it leverages #nr_aligned_interactions interactions.
 #render_chip_variable_table(alignedchip, config)
 #render_chip_assumptions(alignedchip, config)
+
+Some of the assumptions can be checked with only arithmetic constraints, so we
+provide these below.
+
+#render_constraint_table(alignedchip, config, groups: "assumptions")
+
 #render_constraint_table(alignedchip, config)
 
 == Padding
@@ -159,4 +170,4 @@ The table can be padded to the next power of two with the following value assign
 The following ideas may prove to be optimizations for the #memw/#aligned/#reg chip:
 - `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs)
 - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes.
-- For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
\ No newline at end of file
+- For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
diff --git a/spec/mul.typ b/spec/mul.typ
index 6e6de12e0..e5dc0de7c 100644
--- a/spec/mul.typ
+++ b/spec/mul.typ
@@ -32,9 +32,6 @@ The #mul chip is comprised of #nr_variables variables that are expressed using #
  $mat(delim: #none, top; bottom)$
 }
 
-= Assumptions
-The following range checks are assumed to be performed/enforced outside of this chip:
-#render_chip_assumptions(chip, config)
 
 = Constraints
 == Overview
diff --git a/spec/shift.typ b/spec/shift.typ
index c464d5d55..a1583e3e7 100644
--- a/spec/shift.typ
+++ b/spec/shift.typ
@@ -42,9 +42,6 @@ Here, `<<` and `>>` denote the _logical_ left and right shift operations, while
 The `SHIFT` chip is comprised of #nr_variables variables that are expressed using #nr_columns columns and leverages #nr_interactions interaction(s):
 #render_chip_variable_table(chip, config)
 
-= Assumptions
-#render_chip_assumptions(chip, config)
-
 = Explanation
 This chip has a rather complex design as a result of designing it to fit in as few columns possible.
 We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
@@ -109,7 +106,10 @@ Copies of this variable are used for any full limbs shifted in when $#`right` =
 Moreover, `X[4]` contains a copy of `extension` shifted over by the right number of bits, to allow the construction of $#`in >>> shift` mod 16$ as the appropriate intermediate.
 
 = Constraints
-First, we constrain `bit_shift` based on whether we are left or right-shifting.
+First, we range check our inputs appropriately.
+#render_constraint_table(chip, config, groups: "input")
+
+Then, we constrain `bit_shift` based on whether we are left or right-shifting.
 @shift:c:zbs makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. 
 This flag is used to indicate the special case that $#`right` = 1$ and $#`shift` = 0 mod 16$.
 #render_constraint_table(chip, config, groups: "bit_shift")
diff --git a/spec/src/bitwise.toml b/spec/src/bitwise.toml
index 30875a810..24b54442a 100644
--- a/spec/src/bitwise.toml
+++ b/spec/src/bitwise.toml
@@ -127,22 +127,22 @@ name = "contributions"
 
 [[constraints.contributions]]
 kind = "interaction"
-tag = "AND_BYTE"
-input = ["X", "Y"]
+tag = "BYTE_ALU"
+input = [["opsel", "AND"], "X", "Y"]
 output = "AND"
 multiplicity = ["-", "μ_AND"]
 
 [[constraints.contributions]]
 kind = "interaction"
-tag = "OR_BYTE"
-input = ["X", "Y"]
+tag = "BYTE_ALU"
+input = [["opsel", "OR"], "X", "Y"]
 output = "OR"
 multiplicity = ["-", "μ_OR"]
 
 [[constraints.contributions]]
 kind = "interaction"
-tag = "XOR_BYTE"
-input = ["X", "Y"]
+tag = "BYTE_ALU"
+input = [["opsel", "XOR"], "X", "Y"]
 output = "XOR"
 multiplicity = ["-", "μ_XOR"]
 
diff --git a/spec/src/branch.toml b/spec/src/branch.toml
index 49a7833a3..92b93d015 100644
--- a/spec/src/branch.toml
+++ b/spec/src/branch.toml
@@ -97,6 +97,13 @@ iter = ["i", 0, 1]
 desc = "`IS_BIT<JALR>`"
 
 
+[[constraint_groups]]
+name = "assumptions"
+
+[[constraints.assumptions]]
+kind = "template"
+tag = "IS_BIT"
+input = ["JALR"]
 
 [[constraint_groups]]
 name = "all"
@@ -123,8 +130,8 @@ cond = "μ"
 
 [[constraints.all]]
 kind = "interaction"
-tag = "AND_BYTE"
-input = ["unmasked_low_byte", 254]
+tag = "BYTE_ALU"
+input = [["opsel", "AND"], "unmasked_low_byte", 254]
 output = ["idx", "next_pc_low", 0]
 multiplicity = "μ"
 
diff --git a/spec/src/bytewise.toml b/spec/src/bytewise.toml
new file mode 100644
index 000000000..b315a92b3
--- /dev/null
+++ b/spec/src/bytewise.toml
@@ -0,0 +1,49 @@
+name = "BYTEWISE"
+
+[[variables.input]]
+name = "a"
+type = "DWordBL"
+desc = "The first input"
+pad = 0
+
+[[variables.input]]
+name = "b"
+type = "DWordBL"
+desc = "The second input"
+pad = 0
+
+[[variables.input]]
+name = "op"
+type = "Byte"
+desc = "The operation to perform"
+pad = 0
+
+[[variables.output]]
+name = "res"
+type = "DWordBL"
+desc = "The result"
+pad = 0
+
+[[variables.multiplicity]]
+name = "μ"
+type = "BaseField"
+desc = ""
+pad = 0
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "BYTE_ALU"
+input = ["op", ["idx", "a", "i"], ["idx", "b", "i"]]
+output = ["idx", "res", "i"]
+multiplicity = "μ"
+iter = ["i", 0, 7]
+
+[[constraints.all]]
+kind = "interaction"
+tag = "ALU"
+input = [["cast", "a", "DWordWL"], ["cast", "b", "DWordWL"], "op"]
+output = ["cast", "res", "DWordWL"]
+multiplicity = ["-", "μ"]
diff --git a/spec/src/config.toml b/spec/src/config.toml
index 9ced2ce0d..9eac1d8c8 100644
--- a/spec/src/config.toml
+++ b/spec/src/config.toml
@@ -91,6 +91,15 @@ desc = """\
        The `Word` is the *most* significant digit.
        """
 
+[[variables.types]]
+label = "DWordWHBB"
+subtypes = ["Byte", "Byte", "Half", "Word"]
+desc = """\
+       Variable that can only assume values in the range $[0, 2^64)$. \\
+       Represented as a `Word`, a `Half` and two `Byte` variables. \
+       The `Word` is the *most* significant digit.
+       """
+
 [[variables.types]]
 label = "WordBL"
 subtypes = ["Byte", "Byte", "Byte", "Byte"]
diff --git a/spec/src/cpu.toml b/spec/src/cpu.toml
index a0bd3925c..85c3aaf70 100644
--- a/spec/src/cpu.toml
+++ b/spec/src/cpu.toml
@@ -7,7 +7,7 @@ name = "CPU"
 [[variables.input]]
 name = "timestamp"
 type = "Timestamp"
-desc = "A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough."
+desc = "A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `MEMORY`) a maximum of 4 slots is enough."
 
 [[variables.input]]
 name = "pc"
@@ -51,58 +51,18 @@ type = "Bit"
 desc = "Whether to write back to the destination register"
 pad = 0
 
-# TODO: can we compress this to a single value? (1: is it worth it, 2: does it work)
-[[variables.input]]
-name = "memory_2bytes"
-type = "Bit"
-desc = "Whether the memory access (read or write) touches exactly 2 bytes"
-pad = 0
-
-[[variables.input]]
-name = "memory_4bytes"
-type = "Bit"
-desc = "Whether the memory access (read or write) touches exactly 4 bytes"
-pad = 0
-
-[[variables.input]]
-name = "memory_8bytes"
-type = "Bit"
-desc = "Whether the memory access (read or write) touches exactly 8 bytes"
-pad = 0
-
-# TODO: Are there usecases where it's nicer to just have this as a length constant?
-[[variables.input]]
-name = "c_type_instruction"
-type = "Bit"
-desc = "Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4"
-pad = 0
-
 [[variables.input]]
 name = "imm"
 type = "DWordWL"
 desc = "The fully extended 64-bit version of the immediate"
 pad = 0
 
+# We encode C-type instructions with a length of 2, as this generality allows fusing common instruction combos
 [[variables.input]]
-name = "signed"
-type = "Bit"
-desc = "Indicates whether we're dealing with a signed or unsigned instruction"
-pad = 0
-
-[[variables.input]]
-name = "mp_selector"
-type = "Bit"
-desc = """Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used
-    - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and
-    - as flag for inverting the condition of conditional branches (see `branch_cond`)
-    - as direction (left or right) for `SHIFT`"""
-pad = 0
-
-[[variables.input]]
-name = "muldiv_selector"
-type = "Bit"
-desc = "Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted"
-pad = 0
+name = "half_instruction_length"
+type = "Byte"
+desc = "Half the number of bytes consumed by this instruction, commonly used to indicate whether the instruction is of C type, i.e., whether it is 2 bytes long (= 1) instead of 4 (= 2)"
+pad = 2
 
 [[variables.input]]
 name = "word_instr"
@@ -111,108 +71,59 @@ desc = "Whether the instruction is a \\*W instruction, requiring the inputs and
 pad = 0
 
 [[variables.input]]
-name = "ADD"
-type = "Bit"
-desc = "One-hot ALU selector flag"
-pad = 0
-
-[[variables.input]]
-name = "SUB"
-type = "Bit"
-desc = "One-hot ALU selector flag"
-pad = 0
-
-[[variables.input]]
-name = "SLT"
-type = "Bit"
-desc = "One-hot ALU selector flag"
-pad = 0
-
-[[variables.input]]
-name = "AND"
-type = "Bit"
-desc = "One-hot ALU selector flag"
-pad = 0
-
-[[variables.input]]
-name = "OR"
-type = "Bit"
-desc = "One-hot ALU selector flag"
-pad = 0
-
-[[variables.input]]
-name = "XOR"
-type = "Bit"
-desc = "One-hot ALU selector flag"
-pad = 0
-
-[[variables.input]]
-name = "SHIFT"
-type = "Bit"
-desc = "One-hot ALU selector flag"
-pad = 0
-
-[[variables.input]]
-name = "JALR"
+name = "ALU"
 type = "Bit"
-desc = "One-hot ALU selector flag"
+desc = "Whether to use the ALU for this instruction"
 pad = 0
 
 [[variables.input]]
-name = "BEQ"
-type = "Bit"
-desc = "One-hot ALU selector flag"
+name = "alu_flags"
+type = "Byte"
+desc = "The ALU operation + flags (interpreting things as signed/unsigned, choosing the MUL/DVRM output, ...) to pass to the ALU"
 pad = 0
 
 [[variables.input]]
-name = "BLT"
+name = "ADD"
 type = "Bit"
-desc = "One-hot ALU selector flag"
+desc = "Addition fast-path bypassing the ALU"
 pad = 0
 
 [[variables.input]]
-name = "LOAD"
+name = "SUB"
 type = "Bit"
-desc = "One-hot ALU selector flag"
+desc = "Subtraction fast-path bypassing the ALU"
 pad = 0
 
 [[variables.input]]
-name = "STORE"
+name = "MEMORY"
 type = "Bit"
-desc = "One-hot ALU selector flag"
+desc = "Whether this instruction touches memory (LOAD/STORE)"
 pad = 0
 
 [[variables.input]]
-name = "MUL"
-type = "Bit"
-desc = "One-hot ALU selector flag"
+name = "mem_flags"
+type = "Byte"
+desc = "The flags to pass for MEMORY operations (LOAD vs STORE, number of bytes touched, signed)"
 pad = 0
 
 [[variables.input]]
-name = "DIVREM"
+name = "BRANCH"
 type = "Bit"
-desc = "One-hot ALU selector flag"
+desc = "Whether this instruction is a conditional branch (BLT, BEQ)"
 pad = 0
 
 [[variables.input]]
 name = "ECALL"
 type = "Bit"
-desc = "One-hot ALU selector flag"
+desc = "Whether this instruction is an ECALL"
 pad = 0
 
-[[variables.input]]
-name = "EBREAK"
-type = "Bit"
-desc = "One-hot ALU selector flag"
-pad = 0
-
-
 # Output
 [[variables.output]]
 name = "next_pc"
 type = "DWordWL"
 desc = "The program counter for the next instruction"
-pad = 5
+pad = 1
 
 [[variables.output]]
 name = "rvd"
@@ -235,65 +146,41 @@ pad = 0
 
 [[variables.auxiliary]]
 name = "rv1"
-type = "DWordWHH"
+type = "DWordWL"
 desc = "The value of register `rs1`"
 pad = 0
 
 [[variables.auxiliary]]
 name = "rv2"
-type = "DWordWHH"
+type = "DWordWL"
 desc = "The value of register `rs2`"
 pad = 0
 
-[[variables.auxiliary]]
-name = "rv1_ext_bit"
-type = "Bit"
-desc = "The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr`"
-pad = 0
-
-[[variables.auxiliary]]
-name = "arg1"
-type = "DWordBL"
-desc = "The extended version of `rv1`, depending on `word_instr`"
-pad = 0
-
-[[variables.auxiliary]]
-name = "rv2_ext_bit"
-type = "Bit"
-desc = "The sign bit of `rv2` if seen as a 32-bit word, used for sign extension with `word_instr`"
-pad = 0
-
 [[variables.auxiliary]]
 name = "arg2"
-type = "DWordBL"
+type = "DWordWL"
 desc = "A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls"
 pad = 0
 
-[[variables.auxiliary]]
-name = "res_ext_bit"
-type = "Bit"
-desc = "The sign bit of `res`, if seen as a 32-bit word, used for sign extension with `word_instr`"
-pad = 0
-
 [[variables.auxiliary]]
 name = "res"
-type = "DWordBL"
+type = "DWordHL"
 desc = "The ALU result"
 pad = 0
 
-[[variables.auxiliary]]
-name = "is_equal"
-type = "Bit"
-desc = "Whether `rv1` and `arg2` are equal"
-pad = 0
-
 [[variables.auxiliary]]
 name = "branch_cond"
 type = "Bit"
-desc = "Whether a branch is taken, i.e., the branch condition"
+desc = "Whether a branch is taken: the branch condition evaluates to true, or we are doing an unconditional jump"
 pad = 0
 
 # Virtual
+[[variables.virtual]]
+name = "JALR"
+type = "Bit"
+desc = "Whether our BRANCH corresponds to a JAL(R) instruction. This is read from `mem_flags`, which can be reused because `MEMORY` and `BRANCH` are mutually exclusive"
+def = "mem_flags"
+
 [[variables.virtual]]
 name = "packed_decode"
 type = "BaseField"
@@ -302,50 +189,53 @@ def = ["+",
     ["*", ["^", 2, 0], "read_register1"],
     ["*", ["^", 2, 1], "read_register2"],
     ["*", ["^", 2, 2], "write_register"],
-    ["*", ["^", 2, 3], "memory_2bytes"],
-    ["*", ["^", 2, 4], "memory_4bytes"],
-    ["*", ["^", 2, 5], "memory_8bytes"],
-    ["*", ["^", 2, 6], "c_type_instruction"],
-    ["*", ["^", 2, 7], "signed"],
-    ["*", ["^", 2, 8], "mp_selector"],
-    ["*", ["^", 2, 9], "muldiv_selector"],
-    ["*", ["^", 2, 10], "word_instr"],
-    ["*", ["^", 2, 11], "ADD"],
-    ["*", ["^", 2, 12], "SUB"],
-    ["*", ["^", 2, 13], "SLT"],
-    ["*", ["^", 2, 14], "AND"],
-    ["*", ["^", 2, 15], "OR"],
-    ["*", ["^", 2, 16], "XOR"],
-    ["*", ["^", 2, 17], "SHIFT"],
-    ["*", ["^", 2, 18], "JALR"],
-    ["*", ["^", 2, 19], "BEQ"],
-    ["*", ["^", 2, 20], "BLT"],
-    ["*", ["^", 2, 21], "LOAD"],
-    ["*", ["^", 2, 22], "STORE"],
-    ["*", ["^", 2, 23], "MUL"],
-    ["*", ["^", 2, 24], "DIVREM"],
-    ["*", ["^", 2, 25], "ECALL"],
-    ["*", ["^", 2, 26], "EBREAK"],
-    ["*", ["^", 2, 27], "rs1"],
-    ["*", ["^", 2, 35], "rs2"],
-    ["*", ["^", 2, 43], "rd"],
+    ["*", ["^", 2, 3], "word_instr"],
+    ["*", ["^", 2, 4], "ALU"],
+    ["*", ["^", 2, 5], "ADD"],
+    ["*", ["^", 2, 6], "SUB"],
+    ["*", ["^", 2, 7], "MEMORY"],
+    ["*", ["^", 2, 8], "BRANCH"],
+    ["*", ["^", 2, 9], "ECALL"],
+    ["*", ["^", 2, 10], "rs1"],
+    ["*", ["^", 2, 18], "rs2"],
+    ["*", ["^", 2, 26], "rd"],
+    ["*", ["^", 2, 34], "half_instruction_length"],
+    ["*", ["^", 2, 42], "alu_flags"],
+    ["*", ["^", 2, 50], "mem_flags"],
 ]
 
-[[variables.virtual]]
-name = "pad"
-type = "Bit"
-desc = "When no flags are set, we must be in a padding row."
-def = ["-", 1, "ADD", "SUB", "SLT", "AND", "OR", "XOR", "SHIFT", "JALR", "BEQ", "BLT", "LOAD", "STORE", "MUL", "DIVREM", "ECALL", "EBREAK"]
-
 
 [[assumptions]]
-desc = "At most one ALU selector flag is 1 by the decoding, and every other flag is 0."
-ref = "cpu:a:one-hot"
+desc = "`MEMORY` and `BRANCH` are mutually exclusive"
+ref = "cpu:a:mem-branch-mutex"
 
 [[assumptions]]
-desc = "When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`."
+desc = "When `MEMORY + BRANCH = 0`, either `read_register2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`."
 ref = "cpu:a:arg2-multiplex"
 
+[[assumptions]]
+desc = "$#`!MEMORY` => #`IS_BIT<mem_flags>`$"
+
+[[constraint_groups]]
+name = "assumptions"
+
+[[constraints.assumptions]]
+kind = "arith"
+constraint = "$not (#`MEMORY` and #`BRANCH`)$"
+poly = ["*", "MEMORY", "BRANCH"]
+
+[[constraints.assumptions]]
+kind = "arith"
+constraint = "$(1 - #`MEMORY` - #`BRANCH`) => (#`read_register2` = 0 or #`imm[i]` = 0)$"
+poly = ["*", ["-", 1, "MEMORY", "BRANCH"], "read_register2", ["+", ["idx", "imm", 0], ["idx", "imm", 1]]]
+
+[[constraints.assumptions]]
+kind = "template"
+tag = "IS_BIT"
+input = ["mem_flags"]
+cond = ["not", "MEMORY"]
+
+
 [[constraint_groups]]
 name = "decode"
 
@@ -353,7 +243,44 @@ name = "decode"
 kind = "interaction"
 tag = "DECODE"
 input = ["pc", "imm", "packed_decode"]
-multiplicity = 1
+multiplicity = ["not", "word_instr"]
+
+[[constraints.decode]]
+kind = "arith"
+constraint = "$#`word_instr` => #`MEMORY = 0`$"
+poly = ["*", "word_instr", "MEMORY"]
+
+[[constraints.decode]]
+kind = "arith"
+constraint = "$#`word_instr` => #`BRANCH = 0`$"
+poly = ["*", "word_instr", "BRANCH"]
+
+[[constraints.decode]]
+kind = "arith"
+constraint = "$#`word_instr` => #`ECALL = 0`$"
+poly = ["*", "word_instr", "ECALL"]
+
+[[constraints.decode]]
+kind = "arith"
+constraint = "$#`word_instr` => #`read_register1 = 0`$"
+poly = ["*", "word_instr", "read_register1"]
+
+[[constraints.decode]]
+kind = "arith"
+constraint = "$#`word_instr` => #`read_register2 = 0`$"
+poly = ["*", "word_instr", "read_register2"]
+
+[[constraints.decode]]
+kind = "arith"
+constraint = "$#`word_instr` => #`write_register = 0`$"
+poly = ["*", "word_instr", "write_register"]
+
+[[constraints.decode]]
+kind = "interaction"
+tag = "CPU32"
+input = ["timestamp", "pc"]
+output = "half_instruction_length"
+multiplicity = "word_instr"
 
 
 [[constraint_groups]]
@@ -380,51 +307,27 @@ ref = "cpu:c:range_write_register"
 
 [[constraints.range]]
 kind = "template"
-tag = "IS_BIT"
-input = ["memory_2bytes"]
-ref = "cpu:c:range_memory_2bytes"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["memory_4bytes"]
-ref = "cpu:c:range_memory_4bytes"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["memory_8bytes"]
-ref = "cpu:c:range_memory_8bytes"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["c_type_instruction"]
+tag = "IS_BYTE"
+input = ["half_instruction_length"]
 ref = "cpu:c:range_c_type_instruction"
 
 [[constraints.range]]
 kind = "template"
 tag = "IS_BIT"
-input = ["signed"]
-ref = "cpu:c:range_signed"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["mp_selector"]
-ref = "cpu:c:range_mp_selector"
+input = ["word_instr"]
+ref = "cpu:c:range_word_instr"
 
 [[constraints.range]]
 kind = "template"
 tag = "IS_BIT"
-input = ["muldiv_selector"]
-ref = "cpu:c:range_muldiv_selector"
+input = ["ALU"]
+ref = "cpu:c:range_ALU"
 
 [[constraints.range]]
 kind = "template"
-tag = "IS_BIT"
-input = ["word_instr"]
-ref = "cpu:c:range_word_instr"
+tag = "IS_BYTE"
+input = ["alu_flags"]
+ref = "cpu:c:range_alu_flags"
 
 [[constraints.range]]
 kind = "template"
@@ -441,74 +344,20 @@ ref = "cpu:c:range_SUB"
 [[constraints.range]]
 kind = "template"
 tag = "IS_BIT"
-input = ["SLT"]
-ref = "cpu:c:range_SLT"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["AND"]
-ref = "cpu:c:range_AND"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["OR"]
-ref = "cpu:c:range_OR"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["XOR"]
-ref = "cpu:c:range_XOR"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["SHIFT"]
-ref = "cpu:c:range_SHIFT"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["JALR"]
-ref = "cpu:c:range_JALR"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["BEQ"]
-ref = "cpu:c:range_BEQ"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["BLT"]
-ref = "cpu:c:range_BLT"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["LOAD"]
-ref = "cpu:c:range_LOAD"
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["STORE"]
-ref = "cpu:c:range_STORE"
+input = ["MEMORY"]
+ref = "cpu:c:range_MEMORY"
 
 [[constraints.range]]
 kind = "template"
-tag = "IS_BIT"
-input = ["MUL"]
-ref = "cpu:c:range_MUL"
+tag = "IS_BYTE"
+input = ["mem_flags"]
+ref = "cpu:c:range_mem_flags"
 
 [[constraints.range]]
 kind = "template"
 tag = "IS_BIT"
-input = ["DIVREM"]
-ref = "cpu:c:range_DIVREM"
+input = ["BRANCH"]
+ref = "cpu:c:range_BRANCH"
 
 [[constraints.range]]
 kind = "template"
@@ -516,12 +365,6 @@ tag = "IS_BIT"
 input = ["ECALL"]
 ref = "cpu:c:range_ECALL"
 
-[[constraints.range]]
-kind = "template"
-tag = "IS_BIT"
-input = ["EBREAK"]
-ref = "cpu:c:range_EBREAK"
-
 [[constraints.range]]
 kind = "template"
 tag = "IS_BYTE"
@@ -538,22 +381,11 @@ tag = "IS_BYTE"
 input = ["rd"]
 
 [[constraints.range]]
-kind = "template"
-tag = "IS_BYTE"
-input = [["idx", "arg1", "i"]]
-iter = ["i", 0, 7]
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BYTE"
-input = [["idx", "arg2", "i"]]
-iter = ["i", 0, 7]
-
-[[constraints.range]]
-kind = "template"
-tag = "IS_BYTE"
+kind = "interaction"
+tag = "IS_HALF"
 input = [["idx", "res", "i"]]
-iter = ["i", 0, 7]
+multiplicity = 1
+iter = ["i", 0, 3]
 
 
 [[constraint_groups]]
@@ -561,91 +393,36 @@ name = "alu"
 prefix = "A"
 
 [[constraints.alu]]
-kind = "template"
-tag = "ADD"
-cond = ["+", "ADD", "LOAD"]
-input = [["cast", "arg1", "DWordWL"], ["cast", "arg2", "DWordWL"]]
-output = ["cast", "res", "DWordWL"]
+kind = "arith"
+constraint = "$#`arg2` = #`MEMORY` dot #`imm` + #`BRANCH` dot #`rv2` + (1 - #`MEMORY` - #`BRANCH`) dot (#`rv2` + #`imm`)$"
+poly = ["-", ["idx", "arg2", "i"],
+    ["*", "MEMORY", ["idx", "imm", "i"]],
+    ["*", "BRANCH", ["idx", "rv2", "i"]],
+    ["*", ["-", 1, "MEMORY", "BRANCH"], ["idx", ["+", "rv2", "imm"], "i"]]
+]
+iter = ["i", 0, 1]
 
 [[constraints.alu]]
 kind = "template"
 tag = "ADD"
-cond = "STORE"
-input = [["cast", "arg1", "DWordWL"], "imm"]
+cond = "ADD"
+input = ["rv1", "arg2"]
 output = ["cast", "res", "DWordWL"]
 
 [[constraints.alu]]
 kind = "template"
 tag = "SUB"
-cond = ["+", "SUB", "BEQ"]
-input = [["cast", "arg1", "DWordWL"], ["cast", "arg2", "DWordWL"]]
+cond = "SUB"
+input = ["rv1", "arg2"]
 output = ["cast", "res", "DWordWL"]
 ref = "cpu:c:sub"
 
 [[constraints.alu]]
 kind = "interaction"
-tag = "LT"
-input = [["cast", "arg1", "DWordWL"], ["cast", "arg2", "DWordWL"], "signed"]
-output = ["idx", "res", 0]
-multiplicity = ["+", "SLT", "BLT"]
-
-[[constraints.alu]]
-kind = "arith"
-constraint = "$#`SLT` + #`BLT` => #`res[i]` = 0$"
-poly = ["*", ["+", "SLT", "BLT"], ["idx", "res", "i"]]
-iter = ["i", 1, 7]
-
-[[constraints.alu]]
-kind = "interaction"
-tag = "AND_BYTE"
-input = [["idx", "arg1", "i"], ["idx", "arg2", "i"]]
-output = ["idx", "res", "i"]
-multiplicity = "AND"
-iter = ["i", 0, 7]
-
-[[constraints.alu]]
-kind = "interaction"
-tag = "OR_BYTE"
-input = [["idx", "arg1", "i"], ["idx", "arg2", "i"]]
-output = ["idx", "res", "i"]
-multiplicity = "OR"
-iter = ["i", 0, 7]
-
-[[constraints.alu]]
-kind = "interaction"
-tag = "XOR_BYTE"
-input = [["idx", "arg1", "i"], ["idx", "arg2", "i"]]
-output = ["idx", "res", "i"]
-multiplicity = "XOR"
-iter = ["i", 0, 7]
-
-[[constraints.alu]]
-kind = "interaction"
-tag = "SHIFT"
-input = [["cast", "arg1", "DWordHL"], ["idx", "arg2", 0], "mp_selector", "signed", "word_instr"]
+tag = "ALU"
+input = ["rv1", "arg2", "alu_flags"]
 output = ["cast", "res", "DWordWL"]
-multiplicity = "SHIFT"
-
-[[constraints.alu]]
-kind = "template"
-tag = "ADD"
-input = ["pc", ["*", ["+", ["*", 2, "c_type_instruction"], ["*", 4, ["not", "c_type_instruction"]]], ["cast", 1, "DWordWL"]]]
-output = ["cast", "res", "DWordWL"]
-cond = "JALR"
-
-[[constraints.alu]]
-kind = "interaction"
-tag = "MUL"
-input = [["cast", "arg1", "DWordHL"], "signed", ["cast", "arg2", "DWordHL"], "mp_selector", "muldiv_selector"]
-output = ["cast", "res", "DWordWL"]
-multiplicity = "MUL"
-
-[[constraints.alu]]
-kind = "interaction"
-tag = "DVRM"
-input = [["cast", "arg1", "DWordHL"], ["cast", "arg2", "DWordHL"], "signed", "muldiv_selector"]
-output = ["cast", "res", "DWordWL"]
-multiplicity = "DIVREM"
+multiplicity = "ALU"
 
 
 [[constraint_groups]]
@@ -655,8 +432,8 @@ prefix = "M"
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", ["cast", 2, "DWordWL"], "rs1"], ["arr", ["idx", ["cast", "rv1", "DWordWL"], 0], ["idx", ["cast", "rv1", "DWordWL"], 1], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 0, "DWordWL"]], 1, 0, 0]
-output = ["arr", ["idx", ["cast", "rv1", "DWordWL"], 0], ["idx", ["cast", "rv1", "DWordWL"], 1], 0, 0, 0, 0, 0, 0]
+input = [1, ["*", ["cast", 2, "DWordWL"], "rs1"], ["arr", ["idx", "rv1", 0], ["idx", "rv1", 1], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 0, "DWordWL"]], 1, 0, 0]
+output = ["arr", ["idx", "rv1", 0], ["idx", "rv1", 1], 0, 0, 0, 0, 0, 0]
 multiplicity = "read_register1"
 ref = "cpu:c:read_rv1"
 
@@ -664,20 +441,20 @@ ref = "cpu:c:read_rv1"
 kind = "arith"
 constraint = "$#`!read_register1` => #`rv1[i]` = 0$"
 poly = ["*", ["not", "read_register1"], ["idx", "rv1", "i"]]
-iter = ["i", 0, 2]
+iter = ["i", 0, 1]
 
 [[constraints.mem]]
 kind = "interaction"
 tag = "MEMW"
-input = [1, ["*", ["cast", 2, "DWordWL"], "rs2"], ["arr", ["idx", ["cast", "rv2", "DWordWL"], 0], ["idx", ["cast", "rv2", "DWordWL"], 1], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 1, "DWordWL"]], 1, 0, 0]
-output = ["arr", ["idx", ["cast", "rv2", "DWordWL"], 0], ["idx", ["cast", "rv2", "DWordWL"], 1], 0, 0, 0, 0, 0, 0]
+input = [1, ["*", ["cast", 2, "DWordWL"], "rs2"], ["arr", ["idx", "rv2", 0], ["idx", "rv2", 1], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 1, "DWordWL"]], 1, 0, 0]
+output = ["arr", ["idx", "rv2", 0], ["idx", "rv2", 1], 0, 0, 0, 0, 0, 0]
 multiplicity = "read_register2"
 
 [[constraints.mem]]
 kind = "arith"
 constraint = "$#`!read_register2` => #`rv2[i]` = 0$"
 poly = ["*", ["not", "read_register2"], ["idx", "rv2", "i"]]
-iter = ["i", 0, 2]
+iter = ["i", 0, 1]
 
 [[constraints.mem]]
 kind = "interaction"
@@ -687,16 +464,16 @@ multiplicity = "write_register"
 
 [[constraints.mem]]
 kind = "interaction"
-tag = "LOAD"
-input = [["cast", "res", "DWordWL"], ["+", "timestamp", ["cast", 0, "DWordWL"]], "memory_2bytes", "memory_4bytes", "memory_8bytes", "signed"]
+tag = "MEMOP"
+input = ["timestamp", ["cast", "res", "DWordWL"], "rv2", "mem_flags"]
 output = "rvd"
-multiplicity = "LOAD"
+multiplicity = "MEMORY"
 
 [[constraints.mem]]
-kind = "interaction"
-tag = "MEMW"
-input = [0, ["cast", "res", "DWordWL"], ["cast", "arg2", ["Byte", 8]], ["+", "timestamp", ["cast", 1, "DWordWL"]], "memory_2bytes", "memory_4bytes", "memory_8bytes"]
-multiplicity = "STORE"
+kind = "arith"
+constraint = "$#`!MEMORY` and #`!BRANCH` => #`rvd` = #`res`$"
+poly = ["*", ["-", 1, "MEMORY", "BRANCH"], ["-", ["idx", "rvd", "i"], ["idx", ["cast", "res", "DWordWL"], "i"]]]
+iter = ["i", 0, 1]
 
 [[constraints.mem]]
 kind = "template"
@@ -712,124 +489,59 @@ input = ["prev_pc_timestamp_borrow"]
 kind = "interaction"
 tag = "memory"
 input = [1, ["arr", ["+", ["*", 2, 255], "i"], 0], ["arr", ["+", ["-", ["idx", "timestamp", 0], ["*", 3, ["not", "pc_double_read"]]], ["*", ["^", 2, 32], "prev_pc_timestamp_borrow"]], ["-", ["idx", "timestamp", 1], "prev_pc_timestamp_borrow"]], ["idx", "pc", "i"]]
-multiplicity = ["not", "pad"]
 iter = ["i", 0, 1]
+multiplicity = 1
 
 [[constraints.mem]]
 kind = "interaction"
 tag = "memory"
 input = [1, ["arr", ["+", ["*", 2, 255], "i"], 0], ["+", "timestamp", ["cast", 1, "DWordWL"]], ["idx", "next_pc", "i"]]
-multiplicity = ["-", ["not", "pad"]]
 iter = ["i", 0, 1]
-
+multiplicity = -1
 
 [[constraint_groups]]
-name = "sys"
-prefix = "S"
+name = "branch"
+prefix = "B"
 
-[[constraints.sys]]
+[[constraints.branch]]
 kind = "arith"
-constraint = "`!EBREAK`"
-desc = "We treat `EBREAK` as an unprovable trap"
-poly = ["not", "EBREAK"]
-ref = "cpu:c:ebreak_traps"
-
-[[constraints.sys]]
-kind = "interaction"
-tag = "ECALL"
-input = ["timestamp", ["cast", "rv1", "DWordWL"]]
-multiplicity = "ECALL"
-
-
-[[constraint_groups]]
-name = "ext"
-prefix = "E"
-
-[[constraints.ext]]
-kind = "template"
-tag = "SIGN"
-input = [["idx", "rv1", 1], "word_instr"]
-output = "rv1_ext_bit"
-
-[[constraints.ext]]
-kind = "arith"
-constraint = "$#`arg1[:4]` = #`rv1[:2]`$"
-poly = ["-", ["idx", ["cast", "arg1", "DWordWL"], 0], ["idx", ["cast", "rv1", "DWordWL"], 0]]
-
-[[constraints.ext]]
-kind = "arith"
-constraint = "$#`arg1[4:]` = #`rv1[2]` dot (1 - #`word_instr`) + (2^(32) - 1) dot #`rv1_ext_bit` dot #`signed`$"
-poly = ["-", ["idx", ["cast", "arg1", "DWordWL"], 1], ["*", ["not", "word_instr"], ["idx", "rv1", 2]], ["*", "signed", "rv1_ext_bit", ["-", ["^", 2, 32], 1]]]
-
-[[constraints.ext]]
-kind = "template"
-tag = "SIGN"
-input = [["idx", "rv2", 1], "word_instr"]
-output = "rv2_ext_bit"
-
-[[constraints.ext]]
-kind = "arith"
-constraint = "$#`arg2[:4]` = (1 - #`LOAD`) dot #`rv2[:2]` + (1 - #`BEQ` - #`BLT` - #`STORE`) dot #`imm[0]`$"
-poly = ["-", ["idx", ["cast", "arg2", "DWordWL"], 0], ["*", ["not", "LOAD"], ["idx", ["cast", "rv2", "DWordWL"], 0]], ["*", ["-", 1, "BEQ", "BLT", "STORE"], ["idx", "imm", 0]]]
-
-[[constraints.ext]]
-kind = "arith"
-constraint = "$#`arg2[4:]` = (1 - #`LOAD`) dot ((1 - #`word_instr`) dot #`rv2[2]` + #`signed` dot #`rv2_ext_bit` dot (2^(32) - 1)) + (1 - #`BEQ` - #`BLT` - #`STORE`) dot #`imm[1]`$"
-poly = ["-", ["idx", ["cast", "arg2", "DWordWL"], 1], ["*", ["not", "LOAD"], ["not", "word_instr"], ["idx", "rv2", 2]], ["*", ["not", "LOAD"], "signed", "rv2_ext_bit", ["-", ["^", 2, 32], 1]], ["*", ["-", 1, "BEQ", "BLT", "STORE"], ["idx", "imm", 1]]]
-
-[[constraints.ext]]
-kind = "template"
-tag = "SIGN"
-input = [["idx", ["cast", "res", "DWordHL"], 1],  "word_instr"]
-output = "res_ext_bit"
-
-[[constraints.ext]]
-kind = "arith"
-constraint = "$#`!LOAD` => #`rvd[0]` = #`res[:4]`$"
-poly = ["*", ["not", "LOAD"], ["-", ["idx", "rvd", 0], ["idx", ["cast", "res", "DWordWL"], 0]]]
-
-[[constraints.ext]]
-kind = "arith"
-constraint = "$#`!LOAD` => #`rvd[1]` = (1 - #`word_instr`) dot #`res[4:]` + #`res_ext_bit` dot (2^(32) - 1)$"
-desc = "_Sign_ extend the output if it wasn't a `LOAD`. Only `LOAD` has both `write_register = 1` and `rvd ≠ res`. `LOAD` and `word_instr` are disjoint"
-poly = ["*", ["not", "LOAD"], ["-", ["idx", "rvd", 1], ["*", ["not", "word_instr"], ["idx", ["cast", "res", "DWordWL"], 1]], ["*", "res_ext_bit", ["-", ["^", 2, 32], 1]]]]
-
-
-[[constraint_groups]]
-name = "misc"
-prefix = "O"
-
-[[constraints.misc]]
-kind = "interaction"
-tag = "ZERO"
-input = [["+", ["idx", "res", 0], ["idx", "res", 1], ["idx", "res", 2], ["idx", "res", 3], ["idx", "res", 4], ["idx", "res", 5], ["idx", "res", 6], ["idx", "res", 7]]]
-output = "is_equal"
-multiplicity = "BEQ"
-ref = "cpu:c:is_equal"
-
-[[constraints.misc]]
-kind = "arith"
-constraint = "$#`branch_cond` = #`JALR` or (#`BLT` and (#`res` xor #`invert`)) or (#`BEQ` and (#`is_equal` xor #`invert`))$"
-desc = "where `invert` is represented by `mp_selector`"
-poly = ["+",
-        ["-", "branch_cond"],
-        "JALR",
-        ["*", ["idx", "res", 0], ["not", "mp_selector"], "BLT"],
-        ["*", ["-", 1, ["idx", "res", 0]], "mp_selector", "BLT"],
-        ["*", "is_equal", ["not", "mp_selector"], "BEQ"],
-        ["*", ["not", "is_equal"], "mp_selector", "BEQ"]
+constraint = "$#`branch_cond` = #`BRANCH` and (#`JALR` or #`res[0]`)$"
+poly = ["-",
+        "branch_cond",
+        ["*", "BRANCH", "JALR"],
+        ["*", "BRANCH", ["-", 1, "JALR"], ["idx", "res", 0]]
     ]
 
-[[constraints.misc]]
+[[constraints.branch]]
 kind = "interaction"
 tag = "BRANCH"
-input = ["pc", "imm", ["cast", "arg1", "DWordWL"], "JALR"]
+input = ["pc", "imm", "rv1", "JALR"]
 output = "next_pc"
 multiplicity = "branch_cond"
 
-[[constraints.misc]]
+[[constraints.branch]]
 kind = "template"
 tag = "ADD"
-input = ["pc", ["*", ["+", ["*", 2, "c_type_instruction"], ["*", 4, ["not", "c_type_instruction"]]], ["cast", 1, "DWordWL"]]]
+input = ["pc", ["arr", ["*", 2, "half_instruction_length"], 0]]
 output = "next_pc"
+cond = ["not", "branch_cond"]
 desc = "Increment `pc` to `next_pc` if we're not branching"
+
+[[constraints.branch]]
+kind = "template"
+tag = "ADD"
+input = ["pc", ["arr", ["*", 2, "half_instruction_length"], 0]]
+output = "rvd"
+cond = "BRANCH"
+desc = "Compute the next instruction address in `rvd` when BRANCH is active to enable JALR"
+
+
+[[constraint_groups]]
+name = "sys"
+prefix = "S"
+
+[[constraints.sys]]
+kind = "interaction"
+tag = "ECALL"
+input = ["timestamp", "rv1"]
+multiplicity = "ECALL"
diff --git a/spec/src/cpu32.toml b/spec/src/cpu32.toml
new file mode 100644
index 000000000..f26ce0e8c
--- /dev/null
+++ b/spec/src/cpu32.toml
@@ -0,0 +1,416 @@
+name = "CPU32"
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "The timestamp for the CPU row"
+pad = 0
+
+[[variables.input]]
+name = "pc"
+type = "DWordWL"
+desc = "The PC at which the instruction occurs"
+pad = 0
+
+[[variables.output]]
+name = "half_instruction_length"
+type = "Byte"
+desc = "Half the number of bytes consumed by this instruction (1 for a 2-byte C-type instruction, 2 for a 4-byte one)"
+pad = 2
+
+[[variables.auxiliary]]
+name = "rs1"
+type = "Byte"
+desc = "Source register 1"
+pad = 0
+
+[[variables.auxiliary]]
+name = "read_register1"
+type = "Bit"
+desc = "Whether to read from `rs1` or not"
+pad = 0
+
+# Note in case we inline register accesses here: We don't need the upper word
+[[variables.auxiliary]]
+name = "rv1"
+type = "DWordWHH"
+desc = "The value in register `rs1`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "rv1_sign"
+type = "Bit"
+desc = "The sign bit of the lower word of `rv1`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "arg1"
+type = "DWordWL"
+desc = "The sign-extended version of `rv1`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "rs2"
+type = "Byte"
+desc = "Source register 2"
+pad = 0
+
+[[variables.auxiliary]]
+name = "read_register2"
+type = "Bit"
+desc = "Whether to read from `rs2`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "rv2"
+type = "DWordWHH"
+desc = "The value in register `rs2`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "rv2_sign"
+type = "Bit"
+desc = "The sign bit of the lower word of `rv2`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "imm"
+type = "DWordWL"
+desc = "The fully sign-extended immediate to use"
+pad = 0
+
+[[variables.auxiliary]]
+name = "arg2"
+type = "DWordWL"
+desc = "Either the sign-extended version of `rv2` or all of `imm`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "res"
+type = "DWordHL"
+desc = "The ALU result"
+pad = 0
+
+[[variables.auxiliary]]
+name = "res_sign"
+type = "Bit"
+desc = "The sign bit of the lower word of `res`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "rd"
+type = "Byte"
+desc = "Destination register"
+pad = 0
+
+[[variables.auxiliary]]
+name = "write_register"
+type = "Bit"
+desc = "Whether to write back to `rd`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "rvd"
+type = "DWordWL"
+desc = "The value to write back to `rd`, the sign-extended version of `res`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "ALU"
+type = "Bit"
+desc = "Whether the full ALU is active"
+pad = 0
+
+[[variables.auxiliary]]
+name = "alu_flags"
+type = "Byte"
+desc = "The ALU operation + flags"
+pad = 0
+
+[[variables.auxiliary]]
+name = "ADD"
+type = "Bit"
+desc = "Addition fast-path bypassing the ALU"
+pad = 0
+
+[[variables.auxiliary]]
+name = "SUB"
+type = "Bit"
+desc = "Subtraction fast-path bypassing the ALU"
+pad = 0
+
+[[variables.auxiliary]]
+name = "signed"
+type = "Bit"
+desc = "Whether the instruction is signed or not. Extracted from `alu_flags`, used to determine the extension for the inputs"
+pad = 0
+
+[[variables.virtual]]
+name = "packed_decode"
+type = "BaseField"
+desc = "The packed representation of all flags and information from the decode table"
+def = ["+",
+    ["*", ["^", 2, 0], "read_register1"],
+    ["*", ["^", 2, 1], "read_register2"],
+    ["*", ["^", 2, 2], "write_register"],
+    ["*", ["^", 2, 3], 1], # word_instr
+    ["*", ["^", 2, 4], "ALU"],
+    ["*", ["^", 2, 5], "ADD"],
+    ["*", ["^", 2, 6], "SUB"],
+    ["*", ["^", 2, 10], "rs1"],
+    ["*", ["^", 2, 18], "rs2"],
+    ["*", ["^", 2, 26], "rd"],
+    ["*", ["^", 2, 34], "half_instruction_length"],
+    ["*", ["^", 2, 42], "alu_flags"],
+]
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+pad = 0
+
+[[assumptions]]
+desc = "`IS_WORD[timestamp[i]]`"
+iter = ["i", 0, 1]
+
+[[assumptions]]
+desc = "`IS_WORD[pc[i]]`"
+iter = ["i", 0, 1]
+
+[[assumptions]]
+desc = "`read_register2 = 0` or `imm = 0`, enforced by decoding."
+
+[[constraint_groups]]
+name = "assumptions"
+
+[[constraints.assumptions]]
+kind = "arith"
+constraint = "$#`read_register2` = 0 or #`imm[i] = 0`$"
+poly = ["*", "read_register2", ["+", ["idx", "imm", 0], ["idx", "imm", 1]]]
+
+[[constraint_groups]]
+name = "decode"
+
+[[constraints.decode]]
+kind = "interaction"
+tag = "DECODE"
+input = ["pc", "imm", "packed_decode"]
+multiplicity = "μ"
+
+[[constraint_groups]]
+name = "range"
+prefix = "R"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ"]
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["read_register1"]
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["read_register2"]
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["write_register"]
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BYTE"
+input = ["half_instruction_length"]
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["ALU"]
+
+# Technically constrained by the BYTE_ALU in `ext`, but this is safer/cleaner
+[[constraints.range]]
+kind = "template"
+tag = "IS_BYTE"
+input = ["alu_flags"]
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["ADD"]
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["SUB"]
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BYTE"
+input = ["rs1"]
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BYTE"
+input = ["rs2"]
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BYTE"
+input = ["rd"]
+
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "rv1", "i"]]
+multiplicity = "μ"
+iter = ["i", 0, 1]
+
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "rv2", "i"]]
+multiplicity = "μ"
+iter = ["i", 0, 1]
+
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "res", "i"]]
+multiplicity = "μ"
+iter = ["i", 0, 3]
+
+
+[[constraint_groups]]
+name = "alu"
+prefix = "A"
+
+[[constraints.alu]]
+kind = "template"
+tag = "ADD"
+cond = "ADD"
+input = ["arg1", "arg2"]
+output = ["cast", "res", "DWordWL"]
+
+[[constraints.alu]]
+kind = "template"
+tag = "SUB"
+cond = "SUB"
+input = ["arg1", "arg2"]
+output = ["cast", "res", "DWordWL"]
+
+[[constraints.alu]]
+kind = "interaction"
+tag = "ALU"
+input = ["arg1", "arg2", "alu_flags"]
+output = ["cast", "res", "DWordWL"]
+multiplicity = "ALU"
+
+[[constraint_groups]]
+name = "mem"
+prefix = "M"
+
+
+[[constraints.mem]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", ["cast", 2, "DWordWL"], "rs1"], ["arr", ["idx", ["cast", "rv1", "DWordWL"], 0], ["idx", "rv1", 2], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 0, "DWordWL"]], 1, 0, 0]
+output = ["arr", ["idx", ["cast", "rv1", "DWordWL"], 0], ["idx", "rv1", 2], 0, 0, 0, 0, 0, 0]
+multiplicity = "read_register1"
+
+[[constraints.mem]]
+kind = "arith"
+constraint = "$#`!read_register1` => #`rv1[i]` = 0$"
+poly = ["*", ["not", "read_register1"], ["idx", "rv1", "i"]]
+iter = ["i", 0, 2]
+
+[[constraints.mem]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", ["cast", 2, "DWordWL"], "rs2"], ["arr", ["idx", ["cast", "rv2", "DWordWL"], 0], ["idx", "rv2", 2], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 1, "DWordWL"]], 1, 0, 0]
+output = ["arr", ["idx", ["cast", "rv2", "DWordWL"], 0], ["idx", "rv2", 2], 0, 0, 0, 0, 0, 0]
+multiplicity = "read_register2"
+
+[[constraints.mem]]
+kind = "arith"
+constraint = "$#`!read_register2` => #`rv2[i]` = 0$"
+poly = ["*", ["not", "read_register2"], ["idx", "rv2", "i"]]
+iter = ["i", 0, 2]
+
+[[constraints.mem]]
+kind = "interaction"
+tag = "MEMW"
+input = [1, ["*", ["cast", 2, "DWordWL"], "rd"], ["arr", ["idx", "rvd", 0], ["idx", "rvd", 1], 0, 0, 0, 0, 0, 0], ["+", "timestamp", ["cast", 2, "DWordWL"]], 1, 0, 0]
+multiplicity = "write_register"
+
+[[constraint_groups]]
+name = "logup"
+
+[[constraints.logup]]
+kind = "interaction"
+tag = "CPU32"
+input = ["timestamp", "pc"]
+output = "half_instruction_length"
+multiplicity = ["-", "μ"]
+
+[[constraint_groups]]
+name = "ext"
+
+[[constraints.ext]]
+kind = "interaction"
+tag = "BYTE_ALU"
+input = [["opsel", "AND"], 32, "alu_flags"]
+output = ["*", 32, "signed"]
+multiplicity = "μ"
+
+[[constraints.ext]]
+kind = "template"
+tag = "SIGN"
+input = [["idx", "rv1", 1], "signed"]
+output = "rv1_sign"
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`arg1[0]` = #`rv1[:2]`$"
+poly = ["-", ["idx", "arg1", 0], ["idx", ["cast", "rv1", "DWordWL"], 0]]
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`arg1[1]` = (2^(32) - 1) dot #`rv1_sign`$"
+poly = ["-", ["idx", "arg1", 1], ["*", ["-", ["^", 2, 32], 1], "rv1_sign"]]
+
+[[constraints.ext]]
+kind = "template"
+tag = "SIGN"
+input = [["idx", "rv2", 1], "signed"]
+output = "rv2_sign"
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`arg2[0]` = #`rv2[:2]` + #`imm[0]`$"
+poly = ["-", ["idx", "arg2", 0], ["idx", ["cast", "rv2", "DWordWL"], 0], ["idx", "imm", 0]]
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`arg2[1]` = (2^(32) - 1) dot #`rv2_sign` + #`imm[1]`$"
+poly = ["-", ["idx", "arg2", 1], ["*", ["-", ["^", 2, 32], 1], "rv2_sign"], ["idx", "imm", 1]]
+
+[[constraints.ext]]
+kind = "template"
+tag = "SIGN"
+input = [["idx", "res", 1], 1]
+output = "res_sign"
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`rvd[0]` = #`res[:2]`$"
+poly = ["-", ["idx", "rvd", 0], ["idx", ["cast", "res", "DWordWL"], 0]]
+
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`rvd[1]` = (2^(32) - 1) dot #`res_sign`$"
+poly = ["-", ["idx", "rvd", 1], ["*", ["-", ["^", 2, 32], 1], "res_sign"]]
diff --git a/spec/src/decode.toml b/spec/src/decode.toml
index 367db1568..1e0c6dd8f 100644
--- a/spec/src/decode.toml
+++ b/spec/src/decode.toml
@@ -4,7 +4,7 @@ name = "DECODE"
 name = "pc"
 type = "DWordWL"
 desc = "value of the program counter this instruction is associated with."
-pad = 7
+pad = 1
 
 [[variables.output]]
 name = "packed_decode"
@@ -15,36 +15,22 @@ A list of each variable and the bit(-range) in which it is located:\\
 [0] `read_register1`, \\
 [1] `read_register2`, \\
 [2] `write_register`, \\
-[3] `memory_2bytes`, \\
-[4] `memory_4bytes`, \\
-[5] `memory_8bytes`, \\
-[6] `c_type`, \\
-[7] `signed`, \\
-[8] `mp_selector`, \\
-[9] `muldiv_selector`, \\
-[10] `word_instr`, \\
-[11] `ADD`, \\
-[12] `SUB`, \\
-[13] `SLT`, \\
-[14] `AND`, \\
-[15] `OR`, \\
-[16] `XOR`, \\
-[17] `SHIFT`, \\
-[18] `JALR`, \\
-[19] `BEQ`, \\
-[20] `BLT`, \\
-[21] `LOAD`, \\
-[22] `STORE`, \\
-[23] `MUL`, \\
-[24] `DIVREM`, \\
-[25] `ECALL`, \\
-[26] `EBREAK`; \\
-[27:35] `rs1`, \\
-[35:43] `rs2`, \\
-[43:51] `rd`, \\
+[3] `word_instr`, \\
+[4] `ALU`, \\
+[5] `ADD`, \\
+[6] `SUB`, \\
+[7] `MEMORY`, \\
+[8] `BRANCH`, \\
+[9] `ECALL`, \\
+[10:17] `rs1`, \\
+[18:25] `rs2`, \\
+[26:33] `rd`, \\
+[34:41] `half_instruction_length`, \\
+[42:49] `alu_flags`, \\
+[50:57] `mem_flags`, \\
 the remaining bits are set to zero.
 """
-pad = ["^", 2, 26]
+pad = 0
 
 [[variables.output]]
 name = "imm"
diff --git a/spec/src/decode_uncompressed.toml b/spec/src/decode_uncompressed.toml
index 0f6c931c2..692ac4acd 100644
--- a/spec/src/decode_uncompressed.toml
+++ b/spec/src/decode_uncompressed.toml
@@ -33,27 +33,7 @@ desc = "whether to load the contents of address `rs2` (1) or `0` (0) into `rv2`.
 [[variables.output]]
 name = "write_register"
 type = "Bit"
-desc = "whether the result should be written to `rd` ($=0$ for memory write and when $#`rd` = #`x0`$."
-
-[[variables.output]]
-name = "mem_2B"
-type = "Bit"
-desc = "whether the memory access (read or write) touches exactly $2$ bytes."
-
-[[variables.output]]
-name = "mem_4B"
-type = "Bit"
-desc = "whether the memory access (read or write) touches exactly $4$ bytes."
-
-[[variables.output]]
-name = "mem_8B"
-type = "Bit"
-desc = "whether the memory access (read or write) touches exactly $8$ bytes."
-
-[[variables.output]]
-name = "c_type"
-type = "Bit"
-desc = "Whether the instruction is of type `C`, i.e., whether it is $2$ bytes long instead of $4$."
+desc = "whether the result should be written to `rd` ($=0$ for memory write and when $#`rd` = #`x0`$)."
 
 [[variables.output]]
 name = "imm"
@@ -61,105 +41,124 @@ type = "DWordWL"
 desc = "the *fully extended (!)* 64-bit version of the immediate."
 
 [[variables.output]]
-name = "signed"
-type = "Bit"
-desc = "selector used to indicate signed or unsigned input interpretation."
-
-[[variables.output]]
-name = "mp_selector"
-type = "Bit"
-desc = """Multi-purpose selector used by the CPU to to configure several ALU operations in different ways. 
-          See the `CPU` chip for more details."""
-
-[[variables.output]]
-name = "muldiv_selector"
+name = "word_instr"
 type = "Bit"
-desc = "selects which output of `MUL` (lo/hi) or `DVRM` (quo/rem) is wanted."
+desc = "Whether the instruction is a `*W` instruction, requiring the inputs and outputs to be (sign) extended."
 
 [[variables.output]]
-name = "word_instr"
+name = "ALU"
 type = "Bit"
-desc = "Whether the instruction is a `*W` instruction, requiring the inputs and outputs to be (sign) extended."
+desc = "Enable the ALU"
 
 [[variables.output]]
 name = "ADD"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "ALU does an ADD"
 
 [[variables.output]]
 name = "SUB"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "ALU does a SUB"
 
 [[variables.output]]
-name = "SLT"
+name = "BRANCH"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "The instruction is a branch"
 
 [[variables.output]]
-name = "AND"
+name = "MEMORY"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "The instruction is a memory access"
 
 [[variables.output]]
-name = "OR"
+name = "ECALL"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "Perform an ECALL"
 
 [[variables.output]]
-name = "XOR"
-type = "Bit"
-desc = "ALU selector flag"
+name = "half_instruction_length"
+type = "Byte"
+desc = "Half of how many bytes this instruction takes up in the program"
 
-[[variables.output]]
-name = "SHIFT"
+[[variables.auxiliary]]
+name = "alu_op"
+type = "B4"
+desc = "Operation selector value for the ALU"
+
+[[variables.auxiliary]]
+name = "signed"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "selector used to indicate signed or unsigned input interpretation."
 
-[[variables.output]]
-name = "JALR"
+[[variables.auxiliary]]
+name = "signed2"
 type = "Bit"
-desc = "ALU selector flag"
+desc = """A second signed bit, useful for MUL instructions"""
 
-[[variables.output]]
-name = "BEQ"
+[[variables.auxiliary]]
+name = "muldiv_selector"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "selects which output of `MUL` (lo/hi) or `DVRM` (quo/rem) is wanted."
 
-[[variables.output]]
-name = "BLT"
+[[variables.auxiliary]]
+name = "invert"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "Instructs the EQ or LT chip to invert its result, or inverts the direction of the SHIFT chip (right instead of left)"
 
-[[variables.output]]
-name = "LOAD"
+[[variables.auxiliary]]
+name = "memory_op"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "Selects whether to LOAD (0) or STORE (1)"
 
-[[variables.output]]
-name = "STORE"
+[[variables.auxiliary]]
+name = "mem_2B"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "whether the memory access (read or write) touches exactly $2$ bytes."
 
-[[variables.output]]
-name = "MUL"
+[[variables.auxiliary]]
+name = "mem_4B"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "whether the memory access (read or write) touches exactly $4$ bytes."
 
-[[variables.output]]
-name = "DIVREM"
+[[variables.auxiliary]]
+name = "mem_8B"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "whether the memory access (read or write) touches exactly $8$ bytes."
 
-[[variables.output]]
-name = "ECALL"
+[[variables.auxiliary]]
+name = "mem_signed"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "Whether the memory operation is a signed one; this is distinct from `signed` to enable the `JALR` flag to alias `mem_flags`"
 
-[[variables.output]]
-name = "EBREAK"
+[[variables.auxiliary]]
+name = "JALR"
 type = "Bit"
-desc = "ALU selector flag"
+desc = "The branch is a JAL(R)"
+
+[[variables.virtual]]
+name = "alu_flags"
+type = "Byte"
+desc = "The combined ALU flags"
+def = ["+",
+          "alu_op",
+          ["*", 32, "signed"],
+          ["*", 64, ["+", "signed2", "invert"]],
+          ["*", 128, "muldiv_selector"]
+]
+
+[[variables.virtual]]
+name = "mem_flags"
+type = "Byte"
+desc = "The combined memory flags (or JALR when BRANCHing)"
+def = ["+",
+          "JALR",
+          "memory_op",
+          ["*", 2, "mem_signed"],
+          ["*", 4, "mem_2B"],
+          ["*", 8, "mem_4B"],
+          ["*", 16, "mem_8B"]
+]
+
 
 [[variables.multiplicity]]
 name = "μ"
diff --git a/spec/src/dvrm.toml b/spec/src/dvrm.toml
index 52583907c..6ff9b994c 100644
--- a/spec/src/dvrm.toml
+++ b/spec/src/dvrm.toml
@@ -182,23 +182,32 @@ desc = ""
 pad = 0
 
 
-# Assumptions
+# Constraints
 
-[[assumptions]]
-desc = "`IS_HALF[n[i]]`"
-iter = ["i", 0, 3]
-ref = "lt:a:range_n"
+[[constraint_groups]]
+name = "range"
 
-[[assumptions]]
-desc = "`IS_HALF[d[i]]`"
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "n", "i"]]
 iter = ["i", 0, 3]
-ref = "lt:a:range_d"
+multiplicity = "μ_sum"
+ref = "lt:c:range_n"
 
-[[assumptions]]
-desc = "`IS_BIT<signed>`"
-ref = "lt:a:range_signed"
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "d", "i"]]
+iter = ["i", 0, 3]
+multiplicity = "μ_sum"
+ref = "lt:c:range_d"
 
-# Constraints
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["signed"]
+ref = "lt:c:range_signed"
 
 [[constraint_groups]]
 name = "sign_equality"
@@ -214,9 +223,9 @@ name = "abs_diff"
 
 [[constraints.abs_diff]]
 kind = "interaction"
-tag = "LT"
-input = ["abs_r", "abs_d", 0]
-output = ["not", "div_by_zero"]
+tag = "ALU"
+input = ["abs_r", "abs_d", ["opsel", "LT"]]
+output = ["arr", ["not", "div_by_zero"], 0]
 multiplicity = "μ_sum"
 ref ="dvrm:c:abs_r_lt_abs_d"
 
@@ -304,16 +313,16 @@ name = "equality"
 
 [[constraints.equality]]
 kind = "interaction"
-tag = "MUL"
-input = ["d", "signed", "q", "sign_q", 0]
+tag = "ALU"
+input = [["cast", "d", "DWordWL"], ["cast", "q", "DWordWL"], ["+", ["opsel", "MUL"], ["*", 32, "signed"], ["*", 64, "sign_q"]]]
 output = ["cast", "n_sub_r", "DWordWL"]
 multiplicity = "μ_sum"
 ref = "dvrm:c:mul_lower"
 
 [[constraints.equality]]
 kind = "interaction"
-tag = "MUL"
-input = ["d", "signed", "q", "sign_q", 1]
+tag = "ALU"
+input = [["cast", "d", "DWordWL"], ["cast", "q", "DWordWL"], ["+", ["opsel", "MUL"], ["*", 32, "signed"], ["*", 64, "sign_q"], 128]]
 output = ["cast", "extension_n_sub_r", "DWordWL"]
 multiplicity = "μ_sum"
 ref = "dvrm:c:mul_upper"
@@ -375,14 +384,14 @@ desc = "Each row contributes the following to the LogUp sum"
 
 [[constraints.output]]
 kind = "interaction"
-tag = "DVRM"
-input = ["n", "d", "signed", 0]
+tag = "ALU"
+input = [["cast", "n", "DWordWL"], ["cast", "d", "DWordWL"], ["+", ["opsel", "DIVREM"], ["*", 32, "signed"]]]
 output = ["cast", "q", "DWordWL"]
 multiplicity = ["-", "μ_q"]
 
 [[constraints.output]]
 kind = "interaction"
-tag = "DVRM"
-input = ["n", "d", "signed", 1]
+tag = "ALU"
+input = [["cast", "n", "DWordWL"], ["cast", "d", "DWordWL"], ["+", ["opsel", "DIVREM"], ["*", 32, "signed"], 128]]
 output = ["cast", "r", "DWordWL"]
 multiplicity = ["-", "μ_r"]
diff --git a/spec/src/eq.toml b/spec/src/eq.toml
new file mode 100644
index 000000000..a06bb1b16
--- /dev/null
+++ b/spec/src/eq.toml
@@ -0,0 +1,93 @@
+name = "EQ"
+
+[[variables.input]]
+name = "a"
+type = "DWordWL"
+desc = "The first input"
+pad = 0
+
+[[variables.input]]
+name = "b"
+type = "DWordWL"
+desc = "The second input"
+pad = 0
+
+[[variables.input]]
+name = "invert"
+type = "Bit"
+desc = "Whether to invert the result"
+pad = 0
+
+[[variables.output]]
+name = "res"
+type = "Bit"
+desc = "The result"
+pad = 0
+
+[[variables.auxiliary]]
+name = "diff"
+type = "DWordHL"
+desc = "The difference `a - b`"
+pad = 0
+
+[[variables.auxiliary]]
+name = "eq"
+type = "Bit"
+desc = "The bit indicating `a == b`"
+pad = 0
+
+[[variables.multiplicity]]
+name = "μ"
+type = "BaseField"
+desc = ""
+pad = 0
+
+
+[[assumptions]]
+desc = "`IS_WORD[a[i]]`"
+iter = ["i", 0, 1]
+
+[[assumptions]]
+desc = "`IS_WORD[b[i]]`"
+iter = ["i", 0, 1]
+
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "diff", "i"]]
+multiplicity = "μ"
+iter = ["i", 0, 3]
+
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = ["invert"]
+
+[[constraints.all]]
+kind = "template"
+tag = "SUB"
+input = ["a", "b"]
+output = ["cast", "diff", "DWordWL"]
+
+[[constraints.all]]
+kind = "interaction"
+tag = "ZERO"
+input = [["+", ["idx", "diff", 0], ["idx", "diff", 1], ["idx", "diff", 2], ["idx", "diff", 3]]]
+output = "eq"
+multiplicity = "μ"
+
+[[constraints.all]]
+kind = "arith"
+constraint = "$#`res` = #`eq` xor #`invert`$"
+poly = ["-", ["+", "res", ["*", 2, "eq", "invert"]], "eq", "invert"]
+
+[[constraints.all]]
+kind = "interaction"
+tag = "ALU"
+input = ["a", "b", ["+", ["opsel", "EQ"], ["*", 64, "invert"]]]
+output = ["arr", "res", 0]
+multiplicity = ["-", "μ"]
diff --git a/spec/src/halt.toml b/spec/src/halt.toml
index 9fee04877..6adc1efdb 100644
--- a/spec/src/halt.toml
+++ b/spec/src/halt.toml
@@ -5,6 +5,10 @@ name = "timestamp"
 type = "DWordWL"
 desc = "timestamp at which to halt the program"
 
+[[variables.auxiliary]]
+name = "pc"
+type = "DWordWL"
+desc = "The `next_pc` value the CPU wrote during the instruction in which HALT was invoked"
 
 [[assumptions]]
 desc = "`IS_WORD[timestamp[i]]`"
@@ -40,10 +44,19 @@ ref = "halt:c:zeroize_registers_hi"
 
 [[constraints.all]]
 kind = "interaction"
-tag = "MEMW"
-input = [1, ["cast", ["*", 2, 255], "DWordWL"], ["arr", 1, 0, 0, 0, 0, 0, 0, 0], ["cast", ["-", ["^", 2, 64], 1], "DWordWL"], 1, 0, 0]
+tag = "memory"
+input = [1, ["arr", ["+", ["*", 2, 255], "i"], 0], ["arr", ["+", ["idx", "timestamp", 0], 1], ["idx", "timestamp", 1]], ["idx", "pc", "i"]]
 multiplicity = 1
-ref = "halt:c:pc"
+iter = ["i", 0, 1]
+ref = "halt:c:consume_pc"
+
+[[constraints.all]]
+kind = "interaction"
+tag = "memory"
+input = [1, ["arr", ["+", ["*", 2, 255], "i"], 0], ["arr", ["+", ["idx", "timestamp", 0], 1], ["idx", "timestamp", 1]], ["idx", ["arr", 1, 0], "i"]]
+multiplicity = -1
+iter = ["i", 0, 1]
+ref = "halt:c:emit_pc"
 
 [[constraint_groups]]
 name = "lookup"
diff --git a/spec/src/keccak_round.toml b/spec/src/keccak_round.toml
index 59daba923..6ee05e29a 100644
--- a/spec/src/keccak_round.toml
+++ b/spec/src/keccak_round.toml
@@ -191,8 +191,8 @@ name = "theta"
 
 [[constraints.theta]]
 kind = "interaction"
-tag = "XOR_BYTE"
-input = [["idx", ["idx", ["idx", "start", "x"], 0], "z"], ["idx", ["idx", ["idx", "start", "x"], 1], "z"]]
+tag = "BYTE_ALU"
+input = [["opsel", "XOR"], ["idx", ["idx", ["idx", "start", "x"], 0], "z"], ["idx", ["idx", ["idx", "start", "x"], 1], "z"]]
 output = ["idx", ["idx", ["idx", "Cxz", "x"], 0], "z"]
 iters = [["x", 0, 4], ["z", 0, 7]]
 multiplicity = "μ"
@@ -200,8 +200,8 @@ ref = "keccak:c:theta_cxz_start"
 
 [[constraints.theta]]
 kind = "interaction"
-tag = "XOR_BYTE"
-input = [["idx", ["idx", ["idx", "Cxz", "x"], ["-", "y", 2]], "z"], ["idx", ["idx", ["idx", "start", "x"], "y"], "z"]]
+tag = "BYTE_ALU"
+input = [["opsel", "XOR"], ["idx", ["idx", ["idx", "Cxz", "x"], ["-", "y", 2]], "z"], ["idx", ["idx", ["idx", "start", "x"], "y"], "z"]]
 output = ["idx", ["idx", ["idx", "Cxz", "x"], ["-", "y", 1]], "z"]
 iters = [["x", 0, 4], ["y", 2, 4], ["z", 0, 7]]
 multiplicity = "μ"
@@ -237,8 +237,8 @@ iters = [["x", 0, 4], ["z", 0, 3]]
 
 [[constraints.theta]]
 kind = "interaction"
-tag = "XOR_BYTE"
-input = [["idx", ["idx", ["idx", "Cxz", ["mod", ["-", "x", 1], 5]], 3], "z"], ["idx", ["idx", "rotated_Cxz", ["mod", ["+", "x", 1], 5]], "z"]]
+tag = "BYTE_ALU"
+input = [["opsel", "XOR"], ["idx", ["idx", ["idx", "Cxz", ["mod", ["-", "x", 1], 5]], 3], "z"], ["idx", ["idx", "rotated_Cxz", ["mod", ["+", "x", 1], 5]], "z"]]
 output = ["idx", ["idx", "Dxz", "x"], "z"]
 iters = [["x", 0, 4], ["z", 0, 7]]
 multiplicity = "μ"
@@ -246,8 +246,8 @@ ref = "keccak:c:Dxz"
 
 [[constraints.theta]]
 kind = "interaction"
-tag = "XOR_BYTE"
-input = [["idx", ["idx", ["idx", "start", "x"], "y"], "z"], ["idx", ["idx", "Dxz", "x"], "z"]]
+tag = "BYTE_ALU"
+input = [["opsel", "XOR"], ["idx", ["idx", ["idx", "start", "x"], "y"], "z"], ["idx", ["idx", "Dxz", "x"], "z"]]
 output = ["idx", ["idx", ["idx", "theta", "x"], "y"], "z"]
 iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
 multiplicity = "μ"
@@ -289,16 +289,16 @@ name = "chi"
 
 [[constraints.chi]]
 kind = "interaction"
-tag = "AND_BYTE"
-input = [["-", 255, ["idx", ["idx", ["idx", "pi", ["mod", ["+", "x", 1], 5]], "y"], "z"]], ["idx",["idx",["idx", "pi", ["mod", ["+", "x", 2], 5]], "y"], "z"]]
+tag = "BYTE_ALU"
+input = [["opsel", "AND"], ["-", 255, ["idx", ["idx", ["idx", "pi", ["mod", ["+", "x", 1], 5]], "y"], "z"]], ["idx",["idx",["idx", "pi", ["mod", ["+", "x", 2], 5]], "y"], "z"]]
 output = ["idx", ["idx", ["idx", "chi_ANDs", "x"], "y"], "z"]
 iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
 multiplicity = "μ"
 
 [[constraints.chi]]
 kind = "interaction"
-tag = "XOR_BYTE"
-input = [["idx", ["idx", ["idx", "pi", "x"], "y"], "z"], ["idx",["idx",["idx", "chi_ANDs", "x"], "y"], "z"]]
+tag = "BYTE_ALU"
+input = [["opsel", "XOR"], ["idx", ["idx", ["idx", "pi", "x"], "y"], "z"], ["idx",["idx",["idx", "chi_ANDs", "x"], "y"], "z"]]
 output = ["idx", ["idx", ["idx", "chi", "x"], "y"], "z"]
 iters = [["x", 0, 4], ["y", 0, 4], ["z", 0, 7]]
 multiplicity = "μ"
@@ -308,8 +308,8 @@ name = "iota"
 
 [[constraints.iota]]
 kind = "interaction"
-tag = "XOR_BYTE"
-input = [["idx", ["idx", ["idx", "chi", 0], 0], "z"], ["idx","rc","z"]]
+tag = "BYTE_ALU"
+input = [["opsel", "XOR"], ["idx", ["idx", ["idx", "chi", 0], 0], "z"], ["idx","rc","z"]]
 output = ["idx", "iota", "z"]
 iter = ["z", 0, 7]
 multiplicity = "μ"
diff --git a/spec/src/load.toml b/spec/src/load.toml
index f8a974c9a..e6cf56f00 100644
--- a/spec/src/load.toml
+++ b/spec/src/load.toml
@@ -5,7 +5,7 @@ name = "LOAD"
 [[variables.input]]
 name = "base_address"
 type = "DWordWL"
-desc = "The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is"
+desc = "The base address to read from, gets offset by $[0, 7]$, depending on how big the access is"
 pad = 0
 
 [[variables.input]]
@@ -76,27 +76,37 @@ desc = "`IS_WORD[base_address[i]]`"
 iter = ["i", 0, 1]
 
 [[assumptions]]
-desc = "`IS_BIT<signed>`"
+desc = "`IS_WORD[timestamp[i]]`"
+iter = ["i", 0, 1]
 
-[[assumptions]]
-desc = "`IS_BIT<read2>`"
 
-[[assumptions]]
-desc = "`IS_BIT<read4>`"
+[[constraint_groups]]
+name = "all"
 
-[[assumptions]]
-desc = "`IS_BIT<read8>`"
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = ["signed"]
 
-[[assumptions]]
-desc = "`IS_BIT<read2 + read4 + read8>`"
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = ["read2"]
 
-[[assumptions]]
-desc = "`IS_WORD[timestamp[i]]`"
-iter = ["i", 0, 1]
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = ["read4"]
 
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = ["read8"]
 
-[[constraint_groups]]
-name = "all"
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = [["+", "read2", "read4", "read8"]]
 
 [[constraints.all]]
 kind = "arith"
@@ -154,7 +164,7 @@ name = "output"
 
 [[constraints.output]]
 kind = "interaction"
-tag = "LOAD"
-input = ["base_address", "timestamp", "read2", "read4", "read8", "signed"]
+tag = "MEMOP"
+input = ["timestamp", "base_address", ["cast", 0, "DWordWL"], ["+", ["*", 2, "signed"], ["*", 4, "read2"], ["*", 8, "read4"], ["*", 16, "read8"]]]
 output = ["cast", "res", "DWordWL"]
 multiplicity = ["-", "μ"]
diff --git a/spec/src/lt.toml b/spec/src/lt.toml
index 70d25c919..57661333d 100644
--- a/spec/src/lt.toml
+++ b/spec/src/lt.toml
@@ -21,12 +21,18 @@ type = "Bit"
 desc = "whether to interpret `lhs` and `rhs` as signed integers (1) or not (0)"
 pad = 0
 
+[[variables.input]]
+name = "invert"
+type = "Bit"
+desc = "Whether to invert the result"
+pad = 0
+
 # Output
 
 [[variables.output]]
-name = "lt"
+name = "res"
 type = "Bit"
-desc = "Whether $#`lhs` < #`rhs`$, taking `signed` into account"
+desc = "The result"
 pad = 0
 
 
@@ -50,6 +56,12 @@ type = "Bit"
 desc = "The most significant bit of `rhs`"
 pad = 0
 
+[[variables.auxiliary]]
+name = "lt"
+type = "Bit"
+desc = "Whether $#`lhs` < #`rhs`$, taking `signed` into account"
+pad = 0
+
 # Virtual
 
 [[variables.virtual]]
@@ -85,10 +97,35 @@ ref = "lt:a:range_lhs"
 desc = "`IS_WORD[rhs[0]]`"
 ref = "lt:a:range_rhs"
 
-[[assumptions]]
-desc = "`IS_BIT<signed>`"
-ref = "lt:a:range_signed"
+[[constraint_groups]]
+name = "range"
+desc = "Range-check the inputs"
+
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "lhs", 1]]
+multiplicity = "μ"
+ref = "lt:c:range_lhs"
+
+[[constraints.range]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "rhs", 1]]
+multiplicity = "μ"
+ref = "lt:c:range_rhs"
+
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["signed"]
+ref = "lt:c:range_signed"
 
+[[constraints.range]]
+kind = "template"
+tag = "IS_BIT"
+input = ["invert"]
+ref = "lt:c:range_invert"
 
 [[constraint_groups]]
 name = "defs"
@@ -117,6 +154,11 @@ desc = "Where $A = #`lhs_msb`$, $B = #`rhs_msb`$ and $C = #`carry[1]`$"
 poly = ["-", "lt", ["*", "signed", ["+", ["*", "lhs_msb", ["not", "rhs_msb"]], ["*", "lhs_msb", ["idx", "carry", 1]], ["*", ["not", "rhs_msb"], ["idx", "carry", 1]]]], ["*", ["-", 1, "signed"], "unsigned_lt"]]
 ref = "lt:c:lt"
 
+[[constraints.defs]]
+kind = "arith"
+constraint = "$#`res` = #`lt` xor #`invert`$"
+poly = ["-", ["+", "res", ["*", 2, "lt", "invert"]], "lt", "invert"]
+
 
 [[constraint_groups]]
 name = "sub"
@@ -128,20 +170,6 @@ tag = "IS_BIT"
 input = [["idx", "carry", "i"]]
 iter = ["i", 0, 1]
 
-[[constraints.defs]]
-kind = "interaction"
-tag = "IS_HALF"
-input = [["idx", "lhs", 1]]
-multiplicity = "μ"
-ref = "lt:c:range_lhs"
-
-[[constraints.defs]]
-kind = "interaction"
-tag = "IS_HALF"
-input = [["idx", "rhs", 1]]
-multiplicity = "μ"
-ref = "lt:c:range_rhs"
-
 [[constraints.sub]]
 kind = "interaction"
 tag = "IS_HALF"
@@ -157,7 +185,7 @@ desc = "Each row contributes the following to the LogUp sum"
 
 [[constraints.output]]
 kind = "interaction"
-tag = "LT"
-input = [["cast", "lhs", "DWordWL"], ["cast", "rhs", "DWordWL"], "signed"]
-output = "lt"
+tag = "ALU"
+input = [["cast", "lhs", "DWordWL"], ["cast", "rhs", "DWordWL"], ["+", ["opsel", "LT"], ["*", 32, "signed"], ["*", 64, "invert"]]]
+output = ["arr", "res", 0]
 multiplicity = ["-", "μ"]
diff --git a/spec/src/memw.toml b/spec/src/memw.toml
index 1cc0dd3c2..c04fe5d34 100644
--- a/spec/src/memw.toml
+++ b/spec/src/memw.toml
@@ -130,6 +130,28 @@ desc = "`IS_BIT<write2 + write4 + write8>`"
 desc = "`IS_WORD[timestamp[i]]`"
 iter = ["i", 0, 1]
 
+[[constraint_groups]]
+name = "assumptions"
+
+[[constraints.assumptions]]
+kind = "template"
+tag = "IS_BIT"
+input = ["write2"]
+
+[[constraints.assumptions]]
+kind = "template"
+tag = "IS_BIT"
+input = ["write4"]
+
+[[constraints.assumptions]]
+kind = "template"
+tag = "IS_BIT"
+input = ["write8"]
+
+[[constraints.assumptions]]
+kind = "template"
+tag = "IS_BIT"
+input = [["+", "write2", "write4", "write8"]]
 
 [[constraint_groups]]
 name = "consistency"
@@ -162,31 +184,31 @@ iter = ["i", 0, 6]
 
 [[constraints.consistency]]
 kind = "interaction"
-tag = "LT"
-input = [["idx", "old_timestamp", 0], "timestamp", 0]
-output = 1
+tag = "ALU"
+input = [["idx", "old_timestamp", 0], "timestamp", ["opsel", "LT"]]
+output = ["arr", 1, 0]
 multiplicity = "μ_sum"
 
 [[constraints.consistency]]
 kind = "interaction"
-tag = "LT"
-input = [["idx", "old_timestamp", 1], "timestamp", 0]
-output = 1
+tag = "ALU"
+input = [["idx", "old_timestamp", 1], "timestamp", ["opsel", "LT"]]
+output = ["arr", 1, 0]
 multiplicity = "w2"
 
 [[constraints.consistency]]
 kind = "interaction"
-tag = "LT"
-input = [["idx", "old_timestamp", "i"], "timestamp", 0]
-output = 1
+tag = "ALU"
+input = [["idx", "old_timestamp", "i"], "timestamp", ["opsel", "LT"]]
+output = ["arr", 1, 0]
 iter = ["i", 2, 3]
 multiplicity = "w4"
 
 [[constraints.consistency]]
 kind = "interaction"
-tag = "LT"
-input = [["idx", "old_timestamp", "i"], "timestamp", 0]
-output = 1
+tag = "ALU"
+input = [["idx", "old_timestamp", "i"], "timestamp", ["opsel", "LT"]]
+output = ["arr", 1, 0]
 iter = ["i", 4, 7]
 multiplicity = "write8"
 
diff --git a/spec/src/memw_aligned.toml b/spec/src/memw_aligned.toml
index 93a636aba..0e0e20d5d 100644
--- a/spec/src/memw_aligned.toml
+++ b/spec/src/memw_aligned.toml
@@ -118,6 +118,28 @@ desc = "`IS_BIT<write2 + write4 + write8>`"
 desc = "`IS_WORD[timestamp[i]]`"
 iter = ["i", 0, 1]
 
+[[constraint_groups]]
+name = "assumptions"
+
+[[constraints.assumptions]]
+kind = "template"
+tag = "IS_BIT"
+input = ["write2"]
+
+[[constraints.assumptions]]
+kind = "template"
+tag = "IS_BIT"
+input = ["write4"]
+
+[[constraints.assumptions]]
+kind = "template"
+tag = "IS_BIT"
+input = ["write8"]
+
+[[constraints.assumptions]]
+kind = "template"
+tag = "IS_BIT"
+input = [["+", "write2", "write4", "write8"]]
 
 [[constraint_groups]]
 name = "consistency"
@@ -150,9 +172,9 @@ poly = ["*", "w2", ["not", "μ_sum"]]
 
 [[constraints.consistency]]
 kind = "interaction"
-tag = "LT"
-input = ["old_timestamp", "timestamp", 0]
-output = 1
+tag = "ALU"
+input = ["old_timestamp", "timestamp", ["opsel", "LT"]]
+output = ["arr", 1, 0]
 multiplicity = "μ_sum"
 
 [[constraint_groups]]
diff --git a/spec/src/mul.toml b/spec/src/mul.toml
index a798c682d..e1837020a 100644
--- a/spec/src/mul.toml
+++ b/spec/src/mul.toml
@@ -121,22 +121,36 @@ type = "BaseField"
 desc = ""
 pad = 0
 
-# Assumptions
-
-[[assumptions]]
-desc = "`IS_HALF[lhs[i]]`"
-iter = ["i", 0, 3]
-
-[[assumptions]]
-desc = "`IS_HALF[rhs[i]]`"
-iter = ["i", 0, 3]
-ref = "mul:a:rhs"
-
 # Constraints
 
 [[constraint_groups]]
 name = "def"
 
+[[constraints.def]]
+kind = "template"
+tag = "IS_BIT"
+input = ["lhs_signed"]
+
+[[constraints.def]]
+kind = "template"
+tag = "IS_BIT"
+input = ["rhs_signed"]
+
+[[constraints.def]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "lhs", "i"]]
+multiplicity = "μ_sum"
+iter = ["i", 0, 3]
+
+[[constraints.def]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "rhs", "i"]]
+multiplicity = "μ_sum"
+iter = ["i", 0, 3]
+ref = "mul:c:rhs"
+
 [[constraints.def]]
 kind = "template"
 tag = "SIGN"
@@ -191,16 +205,16 @@ name = "lookup"
 
 [[constraints.lookup]]
 kind = "interaction"
-tag = "MUL"
-input = ["lhs", "lhs_signed", "rhs", "rhs_signed", 0]
+tag = "ALU"
+input = [["cast", "lhs", "DWordWL"], ["cast", "rhs", "DWordWL"], ["+", ["opsel", "MUL"], ["*", 32, "lhs_signed"], ["*", 64, "rhs_signed"]]]
 output = ["cast", "lo", "DWordWL"]
 multiplicity = ["-", "μ_lo"]
 ref = "mul:c:lookup_lo"
 
 [[constraints.lookup]]
 kind = "interaction"
-tag = "MUL"
-input = ["lhs", "lhs_signed", "rhs", "rhs_signed", 1]
+tag = "ALU"
+input = [["cast", "lhs", "DWordWL"], ["cast", "rhs", "DWordWL"], ["+", ["opsel", "MUL"], ["*", 32, "lhs_signed"], ["*", 64, "rhs_signed"], 128]]
 output = ["cast", "hi", "DWordWL"]
 multiplicity = ["-", "μ_hi"]
 ref = "mul:c:lookup_hi"
diff --git a/spec/src/rotxor.toml b/spec/src/rotxor.toml
index f1ff904b0..c3c5ce343 100644
--- a/spec/src/rotxor.toml
+++ b/spec/src/rotxor.toml
@@ -161,16 +161,16 @@ name = "xor"
 
 [[constraints.xor]]
 kind = "interaction"
-tag = "XOR_BYTE"
-input = [["idx", "a0", "i"], ["idx", "a1", "i"]]
+tag = "BYTE_ALU"
+input = [["opsel", "XOR"], ["idx", "a0", "i"], ["idx", "a1", "i"]]
 output = ["idx", "a01", "i"]
 multiplicity = "μ"
 iter = ["i", 0, 3]
 
 [[constraints.xor]]
 kind = "interaction"
-tag = "XOR_BYTE"
-input = [["idx", "a01", "i"], ["idx", "a2", "i"]]
+tag = "BYTE_ALU"
+input = [["opsel", "XOR"], ["idx", "a01", "i"], ["idx", "a2", "i"]]
 output = ["idx", "out", "i"]
 multiplicity = "μ"
 iter = ["i", 0, 3]
diff --git a/spec/src/sha256round.toml b/spec/src/sha256round.toml
index 2469b560c..45da4d452 100644
--- a/spec/src/sha256round.toml
+++ b/spec/src/sha256round.toml
@@ -177,40 +177,40 @@ name = "value"
 
 [[constraints.value]]
 kind = "interaction"
-tag = "AND_BYTE"
-input = [["idx", "a", "i"], ["idx", "b", "i"]]
+tag = "BYTE_ALU"
+input = [["opsel", "AND"], ["idx", "a", "i"], ["idx", "b", "i"]]
 output = ["idx", "a_and_b", "i"]
 multiplicity = "μ"
 iter = ["i", 0, 3]
 
 [[constraints.value]]
 kind = "interaction"
-tag = "XOR_BYTE"
-input = [["idx", "a", "i"], ["idx", "b", "i"]]
+tag = "BYTE_ALU"
+input = [["opsel", "XOR"], ["idx", "a", "i"], ["idx", "b", "i"]]
 output = ["idx", "a_xor_b", "i"]
 multiplicity = "μ"
 iter = ["i", 0, 3]
 
 [[constraints.value]]
 kind = "interaction"
-tag = "AND_BYTE"
-input = [["idx", "c", "i"], ["idx", "a_xor_b", "i"]]
+tag = "BYTE_ALU"
+input = [["opsel", "AND"], ["idx", "c", "i"], ["idx", "a_xor_b", "i"]]
 output = ["idx", "c_and_a_xor_b", "i"]
 multiplicity = "μ"
 iter = ["i", 0, 3]
 
 [[constraints.value]]
 kind = "interaction"
-tag = "AND_BYTE"
-input = [["idx", "e", "i"], ["idx", "f", "i"]]
+tag = "BYTE_ALU"
+input = [["opsel", "AND"], ["idx", "e", "i"], ["idx", "f", "i"]]
 output = ["idx", "e_and_f", "i"]
 multiplicity = "μ"
 iter = ["i", 0, 3]
 
 [[constraints.value]]
 kind = "interaction"
-tag = "AND_BYTE"
-input = [["-", 255, ["idx", "e", "i"]], ["idx", "g", "i"]]
+tag = "BYTE_ALU"
+input = [["opsel", "AND"], ["-", 255, ["idx", "e", "i"]], ["idx", "g", "i"]]
 output = ["idx", "not_e_and_g", "i"]
 multiplicity = "μ"
 iter = ["i", 0, 3]
diff --git a/spec/src/shift.toml b/spec/src/shift.toml
index 18c03ecad..a7f3a5f43 100644
--- a/spec/src/shift.toml
+++ b/spec/src/shift.toml
@@ -10,7 +10,7 @@ pad = 0
 
 [[variables.input]]
 name = "shift"
-type = "Byte"
+type = "DWordWHBB"
 desc = "Number of bits to shift `in` by."
 pad = 0
 
@@ -137,32 +137,49 @@ type = "Bit"
 desc = ""
 pad = 0
 
+# Constraints
 
+[[constraint_groups]]
+name = "input"
 
-# Assumptions
-
-[[assumptions]]
-desc = "`IS_HALF[in[i]]`"
+[[constraints.input]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "in", "i"]]
 iter = ["i", 0, 3]
-ref = "shift:a:range_in"
+multiplicity = "μ"
+ref = "shift:c:range_in"
 
-[[assumptions]]
-desc = "`IS_BYTE<shift>`"
-ref = "shift:a:range_shift"
+[[constraints.input]]
+kind = "interaction"
+tag = "IS_HALF"
+input = [["idx", "shift", 2]]
+multiplicity = "μ"
 
-[[assumptions]]
-desc = "`IS_BIT<direction>`"
-ref = "shift:a:direction"
+[[constraints.input]]
+kind = "template"
+tag = "IS_BYTE"
+input = [["idx", "shift", "i"]]
+iter = ["i", 0, 1]
+ref = "shift:c:range_shift"
 
-[[assumptions]]
-desc = "`IS_BIT<signed>`"
-ref = "shift:a:signed"
+[[constraints.input]]
+kind = "template"
+tag = "IS_BIT"
+input = ["direction"]
+ref = "shift:c:direction"
 
-[[assumptions]]
-desc = "`IS_BIT<word_instr>`"
-ref = "shift:a:word_instr"
+[[constraints.input]]
+kind = "template"
+tag = "IS_BIT"
+input = ["signed"]
+ref = "shift:c:signed"
 
-# Constraints
+[[constraints.input]]
+kind = "template"
+tag = "IS_BIT"
+input = ["word_instr"]
+ref = "shift:c:word_instr"
 
 [[constraint_groups]]
 name = "left_flag"
@@ -192,16 +209,16 @@ name = "bit_shift"
 
 [[constraints.bit_shift]]
 kind = "interaction"
-tag = "AND_BYTE"
-input = ["shift", 0x0F]
+tag = "BYTE_ALU"
+input = [["opsel", "AND"], ["idx", "shift", 0], 0x0F]
 output = "bit_shift"
 ref = "shift:c:bit_shift_if_left"
 multiplicity = "left"
 
 [[constraints.bit_shift]]
 kind = "interaction"
-tag = "AND_BYTE"
-input = [["-", ["^", 2, 8], ["*", 16, "zbs"], "shift"], 0x0F]
+tag = "BYTE_ALU"
+input = [["opsel", "AND"], ["-", ["^", 2, 8], ["*", 16, "zbs"], ["idx", "shift", 0]], 0x0F]
 output = "bit_shift"
 ref = "shift:c:bit_shift_if_right"
 multiplicity = "right"
@@ -268,8 +285,8 @@ ref = "shift:c:limb_shift_is_bit"
 
 [[constraints.limb_shifting]]
 kind = "interaction"
-tag = "AND_BYTE"
-input = ["shift", ["-", 0x30, ["*", 0x20, "word_instr"]]]
+tag = "BYTE_ALU"
+input = [["opsel", "AND"], ["idx", "shift", 0], ["-", 0x30, ["*", 0x20, "word_instr"]]]
 output = ["+", ["-", 1, ["idx", "limb_shift", 0]], ["*", 15, ["idx", "limb_shift", 1]], ["*", 31, ["idx", "limb_shift", 2]], ["*", 47, ["idx", "limb_shift", 3]]]
 ref = "shift:c:limb_shift_lookup"
 multiplicity = "μ"
@@ -289,8 +306,8 @@ name = "lookups"
 
 [[constraints.lookups]]
 kind = "interaction"
-tag = "SHIFT"
-input = ["in", "shift", "direction", "signed", "word_instr"]
+tag = "ALU"
+input = [["cast", "in", "DWordWL"], ["cast", "shift", "DWordWL"], ["+", ["opsel", "SHIFT"], "word_instr", ["*", 32, "signed"], ["*", 64, "direction"]]]
 output = "out"
 multiplicity = ["-", "μ"]
 ref = "shift:c:lookup"
diff --git a/spec/src/signatures.toml b/spec/src/signatures.toml
index e93c87b05..6ea23bb9e 100644
--- a/spec/src/signatures.toml
+++ b/spec/src/signatures.toml
@@ -49,11 +49,25 @@ tag = "DECODE"
 kind = "interaction"
 input = ["DWordWL", "DWordWL", "BaseField"]
 
-# SHIFT[out; in, shift, direction, signed, word_instr]
+# CPU32[half_instruction_length; timestamp, pc]
 [[signatures]]
-tag = "SHIFT"
+tag = "CPU32"
 kind = "interaction"
-input = ["DWordHL", "Byte", "Bit", "Bit", "Bit"]
+input = ["DWordWL", "DWordWL"]
+output = "Byte"
+
+# ALU[out; in1, in2, flags]
+[[signatures]]
+tag = "ALU"
+kind = "interaction"
+input = ["DWordWL", "DWordWL", "Byte"]
+output = "DWordWL"
+
+# MEMOP[out; timestamp, address, value, flags]
+[[signatures]]
+tag = "MEMOP"
+kind = "interaction"
+input = ["DWordWL", "DWordWL", "DWordWL", "Byte"]
 output = "DWordWL"
 
 # BRANCH[next_pc; pc, offset, register, JALR]
@@ -76,32 +90,11 @@ tag = "MEMW"
 kind = "interaction"
 input = ["Bit", "DWordWL", ["BaseField", 8], "DWordWL", "Bit", "Bit", "Bit"]
 
-# LT[lt; lhs, rhs, signed]
-[[signatures]]
-tag = "LT"
-kind = "interaction"
-input = ["DWordWL", "DWordWL", "Bit"]
-output = "Bit"
-
-# MUL[lo/hi; lhs, lhs_signed, rhs, rhs_signed, 0/1]
-[[signatures]]
-tag = "MUL"
-kind = "interaction"
-input = ["DWordHL", "Bit", "DWordHL", "Bit", "Bit"]
-output = "DWordWL"
-
-# DVRM[q/r; n, d, signed, 0/1]
-[[signatures]]
-tag = "DVRM"
-kind = "interaction"
-input = ["DWordHL", "DWordHL", "Bit", "Bit"]
-output = "DWordWL"
-
-# LOAD[res; base_address, timestamp, read2, read4, read8, signed]
+# LOAD[res; base_address, timestamp, flags]
 [[signatures]]
 tag = "LOAD"
 kind = "interaction"
-input = ["DWordWL", "DWordWL", "Bit", "Bit", "Bit", "Bit"]
+input = ["DWordWL", "DWordWL", "Byte"]
 output = "DWordWL"
 
 # ECALL[timestamp, syscallnr]
@@ -122,25 +115,11 @@ tag = "COMMIT"
 kind = "interaction"
 input = ["BaseField", "Byte"]
 
-# AND_BYTE[res; X, Y]
-[[signatures]]
-tag = "AND_BYTE"
-kind = "interaction"
-input = ["Byte", "Byte"]
-output = "Byte"
-
-# OR_BYTE[res; X, Y]
-[[signatures]]
-tag = "OR_BYTE"
-kind = "interaction"
-input = ["Byte", "Byte"]
-output = "Byte"
-
-# XOR_BYTE[res; X, Y]
+# BYTE_ALU[res; selector, X, Y]
 [[signatures]]
-tag = "XOR_BYTE"
+tag = "BYTE_ALU"
 kind = "interaction"
-input = ["Byte", "Byte"]
+input = ["Byte", "Byte", "Byte"]
 output = "Byte"
 
 # MSB8[msb; X]
diff --git a/spec/src/store.toml b/spec/src/store.toml
new file mode 100644
index 000000000..2d97dd00b
--- /dev/null
+++ b/spec/src/store.toml
@@ -0,0 +1,122 @@
+name = "STORE"
+
+# Input
+
+[[variables.input]]
+name = "base_address"
+type = "DWordWL"
+desc = "The base address to write to, gets offset by $[0, 7]$, depending on how big the access is"
+pad = 0
+
+[[variables.input]]
+name = "timestamp"
+type = "DWordWL"
+desc = "The timestamp at which this memory access is said to occur"
+pad = 0
+
+[[variables.input]]
+name = "write2"
+type = "Bit"
+desc = "Whether to write exactly 2 bytes"
+pad = 0
+
+[[variables.input]]
+name = "write4"
+type = "Bit"
+desc = "Whether to write exactly 4 bytes"
+pad = 0
+
+[[variables.input]]
+name = "write8"
+type = "Bit"
+desc = "Whether to write exactly 8 bytes"
+pad = 0
+
+[[variables.input]]
+name = "value"
+type = "DWordBL"
+desc = "The value to store"
+pad = 0
+
+# Virtual
+
+[[variables.virtual]]
+name = "write1"
+type = "Bit"
+desc = "Whether to write exactly 1 byte"
+def = ["-", "μ", "write2", "write4", "write8"]
+
+# Multiplicity
+
+[[variables.multiplicity]]
+name = "μ"
+type = "Bit"
+desc = ""
+pad = 0
+
+
+[[assumptions]]
+desc = "`IS_WORD[base_address[i]]`"
+iter = ["i", 0, 1]
+
+[[assumptions]]
+desc = "`IS_WORD[timestamp[i]]`"
+iter = ["i", 0, 1]
+
+
+[[constraint_groups]]
+name = "all"
+
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = ["μ"]
+
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = ["write2"]
+
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = ["write4"]
+
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = ["write8"]
+
+[[constraints.all]]
+kind = "template"
+tag = "IS_BIT"
+input = [["+", "write2", "write4", "write8"]]
+
+[[constraints.all]]
+kind = "arith"
+constraint = "$#`write2` + #`write4` + #`write8` != 0 => #`μ` = 1$"
+poly = ["*", ["+", "write2", "write4", "write8"], ["not", "μ"]]
+
+[[constraints.all]]
+kind = "template"
+tag = "IS_BYTE"
+input = [["idx", "value", "i"]]
+cond = "μ"
+iter = ["i", 0, 7]
+
+[[constraints.all]]
+kind = "interaction"
+tag = "MEMW"
+input = [0, "base_address", "value", "timestamp", "write2", "write4", "write8"]
+multiplicity = "μ"
+
+
+[[constraint_groups]]
+name = "output"
+
+[[constraints.output]]
+kind = "interaction"
+tag = "MEMOP"
+input = ["timestamp", "base_address", ["cast", "value", "DWordWL"], ["+", 1, ["*", 4, "write2"], ["*", 8, "write4"], ["*", 16, "write8"]]]
+output = ["cast", 0, "DWordWL"]
+multiplicity = ["-", "μ"]
diff --git a/spec/store.typ b/spec/store.typ
new file mode 100644
index 000000000..5b9872b58
--- /dev/null
+++ b/spec/store.typ
@@ -0,0 +1,47 @@
+#import "/book.typ": book-page, rj
+#import "/src.typ": load_config, load_chip
+#import "/chip.typ": (
+  render_chip_assumptions,
+  render_chip_variable_table,
+  render_chip_padding_table,
+  render_constraint_table,
+  compute_nr_interactions,
+  total_nr_instantiated_columns,
+  total_nr_variables,
+)
+
+#let config = load_config()
+#let chip = load_chip("src/store.toml", config)
+
+#show: book-page(chip.name)
+#let store = raw(chip.name)
+
+The #store chip provides functionality to store a value to memory.
+It decomposes a `DWord` into bytes and delegates low-level memory handling to the `MEMW` chip (@memw).
+
+= Variables
+#let nr_variables = total_nr_variables(chip)
+#let nr_columns = total_nr_instantiated_columns(chip, config)
+#let nr_interactions = compute_nr_interactions(chip)
+
+The #store chip is comprised of #nr_variables variables that are expressed using #nr_columns columns and leverages #nr_interactions interaction(s):
+#render_chip_variable_table(chip, config)
+
+= Assumptions
+#render_chip_assumptions(chip, config)
+
+= Constraints
+The chip delegates the actual memory interaction to the `MEMW` chip,
+and ensures the values are proper bytes.
+
+#render_constraint_table(chip, config, groups: "all")
+
+The chip contributes the following to the lookup argument.
+
+#render_constraint_table(chip, config, groups: "output")
+
+= Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+#render_chip_padding_table(chip, config)
diff --git a/spec/tooling/chip.py b/spec/tooling/chip.py
index 7f7ecca81..44cbbed83 100644
--- a/spec/tooling/chip.py
+++ b/spec/tooling/chip.py
@@ -92,6 +92,8 @@ def constant_fits(cst: int, target: Type) -> bool:
     | DummyExpr
 )
 
+OPSEL = ["AND", "OR", "XOR", "EQ", "LT", "SHIFT", "SHIFTW", "MUL", "DIVREM"]
+
 
 @dataclass
 class Environment:
@@ -351,6 +353,11 @@ def build_expr(config: Optional["Config"], data: object) -> Expr:
                 x.isidentifier(), f"Invalid identifier name for variable {x!r}"
             )
             return VarExpr(x)
+        case ["opsel", str(x)]:
+            if x not in OPSEL:
+                reporter.error(f"Unknown operation selector: {x!r}")
+                return LitExpr(0)
+            return LitExpr(OPSEL.index(x))
         case ["arr", *elems]:
             return ArrExpr([build_expr(config, e) for e in elems])
         case ["idx", x, y]:

From 5a09d70f5b0d224bd1a5c180ff76559d764a6e5e Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 9 Jun 2026 14:32:37 +0200
Subject: [PATCH 102/105] Add extra constraints to prevent register side
 effects in CPU32 padding rows (#646)

* Add extra constraints to prevent register side effects in CPU32 padding rows

* fixes

* Patch 'signed' soundness hole and small cleanups

* More explicit constraint
---
 spec/cpu32.typ      |  1 +
 spec/src/cpu32.toml | 24 ++++++++++++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/spec/cpu32.typ b/spec/cpu32.typ
index e5e8963bc..a3c639cc7 100644
--- a/spec/cpu32.typ
+++ b/spec/cpu32.typ
@@ -45,6 +45,7 @@ provide these below.
 
 Most constraints correspond to those already present in the CPU, and we present them here first,
 including some updates to the range checking corresponding to the differing types.
+We also need to make sure that for padding rows ($mu = 0$), no side effects can occur.
 
 #render_constraint_table(chip, config, groups: ("decode", "range", "alu", "mem", "logup"))
 
diff --git a/spec/src/cpu32.toml b/spec/src/cpu32.toml
index f26ce0e8c..e226c847c 100644
--- a/spec/src/cpu32.toml
+++ b/spec/src/cpu32.toml
@@ -186,7 +186,7 @@ name = "assumptions"
 
 [[constraints.assumptions]]
 kind = "arith"
-constraint = "$#`read_register2` = 0 or #`imm[i] = 0`$"
+constraint = "$#`read_register2` = 0 or #`imm = 0`$"
 poly = ["*", "read_register2", ["+", ["idx", "imm", 0], ["idx", "imm", 1]]]
 
 [[constraint_groups]]
@@ -350,6 +350,21 @@ multiplicity = "write_register"
 [[constraint_groups]]
 name = "logup"
 
+[[constraints.logup]]
+kind = "arith"
+constraint = "$#`!μ` => #`read_register1 = 0`$"
+poly = ["*", ["not", "μ"], "read_register1"]
+
+[[constraints.logup]]
+kind = "arith"
+constraint = "$#`!μ` => #`read_register2 = 0`$"
+poly = ["*", ["not", "μ"], "read_register2"]
+
+[[constraints.logup]]
+kind = "arith"
+constraint = "$#`!μ` => #`write_register = 0`$"
+poly = ["*", ["not", "μ"], "write_register"]
+
 [[constraints.logup]]
 kind = "interaction"
 tag = "CPU32"
@@ -360,6 +375,11 @@ multiplicity = ["-", "μ"]
 [[constraint_groups]]
 name = "ext"
 
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`signed` != 0 => #`μ` = 1$"
+poly = ["*", "signed", ["not", "μ"]]
+
 [[constraints.ext]]
 kind = "interaction"
 tag = "BYTE_ALU"
@@ -402,7 +422,7 @@ poly = ["-", ["idx", "arg2", 1], ["*", ["-", ["^", 2, 32], 1], "rv2_sign"], ["id
 [[constraints.ext]]
 kind = "template"
 tag = "SIGN"
-input = [["idx", "res", 1], 1]
+input = [["idx", "res", 1], "μ"]
 output = "res_sign"
 
 [[constraints.ext]]

From af9b79591be3c7607a821ace6fc6d4c2875c664d Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Wed, 10 Jun 2026 18:42:57 -0300
Subject: [PATCH 103/105] Regenerate markdown spec from latest spec/main

- Sync CHAPTERS with book.typ: add is_byte, cpu32, eq, bytewise, store, keccak
- Support new expr ops: arr, opsel, mod (spec/expr.typ grammar)
- Padding tables now read per-variable 'pad' attrs (mirrors chip.typ)
- Drop stale ecall.md (ecall.typ is not a book chapter on spec/main)
---
 docs/spec/bitwise.md   |   15 +-
 docs/spec/branch.md    |   31 +-
 docs/spec/bytewise.md  |   46 +
 docs/spec/commit.md    |   23 +-
 docs/spec/cpu.md       |  280 +++--
 docs/spec/cpu32.md     |  187 +++
 docs/spec/decode.md    |  100 +-
 docs/spec/dvrm.md      |   82 +-
 docs/spec/ecall.md     |    1 -
 docs/spec/eq.md        |   67 ++
 docs/spec/halt.md      |   13 +-
 docs/spec/is_byte.md   |   25 +
 docs/spec/keccak.md    |  229 ++++
 docs/spec/load.md      |   46 +-
 docs/spec/lt.md        |   51 +-
 docs/spec/memory.md    |   27 +
 docs/spec/memw.md      |  139 ++-
 docs/spec/mul.md       |   43 +-
 docs/spec/sha256.md    |  122 +-
 docs/spec/shift.md     |   71 +-
 docs/spec/spec_full.md | 2443 +++++++++++++++++++++++++++-------------
 docs/spec/store.md     |   78 ++
 scripts/typst_to_md.py |   59 +-
 23 files changed, 3017 insertions(+), 1161 deletions(-)
 create mode 100644 docs/spec/bytewise.md
 create mode 100644 docs/spec/cpu32.md
 delete mode 100644 docs/spec/ecall.md
 create mode 100644 docs/spec/eq.md
 create mode 100644 docs/spec/is_byte.md
 create mode 100644 docs/spec/keccak.md
 create mode 100644 docs/spec/store.md

diff --git a/docs/spec/bitwise.md b/docs/spec/bitwise.md
index 8d78d0e35..a2048846e 100644
--- a/docs/spec/bitwise.md
+++ b/docs/spec/bitwise.md
@@ -38,29 +38,32 @@ The  chip is comprised of  variables that are expressed using  columns. Of these
 | `μ_MSB16` | `BaseField` |  |
 | `μ_ZERO` | `BaseField` |  |
 | `μ_IS_BYTE` | `BaseField` |  |
+| `μ_ARE_BYTES` | `BaseField` |  |
 | `μ_IS_HALF` | `BaseField` |  |
 | `μ_IS_B20` | `BaseField` |  |
 | `μ_HWSL` | `BaseField` |  |
 
 *Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
 
+We use the ALU operation descriptors from [decode] to identify the operations in the `BYTE_ALU` interaction. Since each of the three columns is only `2^16` rows long, they can be combined in a single `2^20` column (with room to spare).
+
 ## Lookup
 
 This chip adds the following interactions to the lookup:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `BITWISE-C1` | `AND_BYTE[AND; X, Y]` | -μ_AND |
-| `BITWISE-C2` | `OR_BYTE[OR; X, Y]` | -μ_OR |
-| `BITWISE-C3` | `XOR_BYTE[XOR; X, Y]` | -μ_XOR |
+| `BITWISE-C1` | `BYTE_ALU[AND; ⧼AND⧽, X, Y]` | -μ_AND |
+| `BITWISE-C2` | `BYTE_ALU[OR; ⧼OR⧽, X, Y]` | -μ_OR |
+| `BITWISE-C3` | `BYTE_ALU[XOR; ⧼XOR⧽, X, Y]` | -μ_XOR |
 | `BITWISE-C4` | `MSB8[MSB8; X]` | -μ_MSB8 |
 | `BITWISE-C5` | `MSB16[MSB16; X + 256 * Y]` | -μ_MSB16 |
 | `BITWISE-C6` | `ZERO[ZERO; X + 256 * Y + 65536 * Z]` | -μ_ZERO |
-| `BITWISE-C7` | `IS_BYTE[X]` | -μ_IS_BYTE |
+| `BITWISE-C7` | `ARE_BYTES[X, Y]` | -μ_ARE_BYTES |
 | `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
 | `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
-| `BITWISE-C10` | `HWSL[['arr', 'SLL', 'SLLC']; X + 256 * Y, Z]` | -μ_HWSL |
+| `BITWISE-C10` | `HWSL[[SLL, SLLC]; X + 256 * Y, Z]` | -μ_HWSL |
 
 ## Notes/Optimizations
 
-The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
\ No newline at end of file
+The following ideas may prove to be optimizations for the `BITWISE` chip: + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicitly range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
\ No newline at end of file
diff --git a/docs/spec/branch.md b/docs/spec/branch.md
index fd4aba45f..7e03cdea6 100644
--- a/docs/spec/branch.md
+++ b/docs/spec/branch.md
@@ -62,6 +62,12 @@ next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 | `BRANCH-A3.i` | i ∈ [0, 1] | `register` is range checked, `IS_WORD[register[i]]` |
 | `BRANCH-A4` |  | `IS_BIT<JALR>` |
 
+Some of the assumptions can be checked with only arithmetic constraints, so we provide these below.
+
+| Tag | Description |
+|-----|-------------|
+| `BRANCH-C1` | `IS_BIT<JALR>` |
+
 ## Constraints
 
 We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
@@ -70,18 +76,29 @@ The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed impli
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
-| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
-| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
-| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte, 254]` | μ |
-| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALF[next_pc_high[i]]` | μ |
+| `BRANCH-C2` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
+| `BRANCH-C3` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
+| `BRANCH-C4` |  | μ ⇒ `IS_BYTE<next_pc_low[1]>` |  |
+| `BRANCH-C5` |  | `BYTE_ALU[next_pc_low[0]; ⧼AND⧽, unmasked_low_byte, 254]` | μ |
+| `BRANCH-C6.i` | i ∈ [0, 2] | `IS_HALF[next_pc_high[i]]` | μ |
 
 This chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
+| `BRANCH-C7` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
 
 ## Padding
 
-The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
+The table can be padded to the next power of two with the following value assignments:
+
+| Column | Padding value |
+|--------|---------------|
+| `pc` | `0` |
+| `offset` | `0` |
+| `register` | `0` |
+| `JALR` | `0` |
+| `next_pc_high` | `[0, 0, 0]` |
+| `next_pc_low` | `0` |
+| `unmasked_low_byte` | `0` |
+| `μ` | `0` |
\ No newline at end of file
diff --git a/docs/spec/bytewise.md b/docs/spec/bytewise.md
new file mode 100644
index 000000000..4555051ba
--- /dev/null
+++ b/docs/spec/bytewise.md
@@ -0,0 +1,46 @@
+# BYTEWISE Chip
+
+The `BYTEWISE` chip is an ALU chip that decomposes the input `DWordWL` values into bytes and performs a `BITWISE` operation pairwise (AND, OR, XOR). The `BITWISE` lookup inherently performs a range check, so no further constraints are necessary.
+
+## Variables
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `a` | `DWordBL` | The first input |
+| `b` | `DWordBL` | The second input |
+| `op` | `Byte` | The operation to perform |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `res` | `DWordBL` | The result |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` |  |
+
+## Constraints
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `BYTEWISE-C1.i` | i ∈ [0, 7] | `BYTE_ALU[res[i]; op, a[i], b[i]]` | μ |
+| `BYTEWISE-C2` |  | `ALU[res::DWordWL; a::DWordWL, b::DWordWL, op]` | -μ |
+
+## Padding
+
+The chip can be padded with the following values:
+
+| Column | Padding value |
+|--------|---------------|
+| `a` | `0` |
+| `b` | `0` |
+| `op` | `0` |
+| `res` | `0` |
+| `μ` | `0` |
\ No newline at end of file
diff --git a/docs/spec/commit.md b/docs/spec/commit.md
index 83f921d3b..4b76fda3f 100644
--- a/docs/spec/commit.md
+++ b/docs/spec/commit.md
@@ -51,10 +51,10 @@ we assert that `x10` contains `1` in [commit:c:read_fd_write_count]. Note that t
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `COMMIT-C2` | `MEMW[['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C3` | `MEMW[['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 12)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C4` | `MEMW[['arr', 1, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C5` | `MEMW[['arr', 'index', 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 254)::DWordWL, ['arr', ['+', 'index', ['cast', 'count', 'BaseField']], 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | first |
+| `COMMIT-C2` | `MEMW[[address[0], address[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, [address[0], address[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C3` | `MEMW[[count[0], count[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 12)::DWordWL, [count[0], count[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C4` | `MEMW[[1, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, [count[0], count[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C5` | `MEMW[[index, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 254)::DWordWL, [index + count::BaseField, 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | first |
 
 *Note*: the observant reader will notice that [commit:c:read_index] casts `count` to a `BaseField`, potentiallly losing information. This is indeed correct. However, since it is practically impossible to commit more than `2^64-2^32` bytes in a single VM execution, it was decided to permit this.
 
@@ -62,7 +62,7 @@ Next, we read the `value` located at buffer address `address` and commit to it u
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `COMMIT-C6` | `MEMW[['arr', 'value', 0, 0, 0, 0, 0, 0, 0]; 0, address, ['arr', 'value', 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | μ - end |
+| `COMMIT-C6` | `MEMW[[value, 0, 0, 0, 0, 0, 0, 0]; 0, address, [value, 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | μ - end |
 | `COMMIT-C7` | `COMMIT[index, value]` | μ - end |
 
 In parallel, we compute ``address_incr` = `address` + 1` ([commit:c:address_incr]) as address of the next byte to commit, and ``count_decr` = `count` - 1` ([commit:c:count_decr]) as the number of bytes that still has to be committed after committing this byte. [commit:c:range_address_incr] and [commit:c:range_count_decr] are included to satisfy [add:a:sum] respectively [add:a:rhs].
@@ -103,6 +103,19 @@ Lastly, we must make sure `first`, `end` and `μ` are bits ([commit:c:range_firs
 
 To pad this chip, use the below data.
 
+| Column | Padding value |
+|--------|---------------|
+| `timestamp` | `0` |
+| `index` | `0` |
+| `address` | `[0, 0, 0, 0]` |
+| `address_incr` | `[1, 0, 0, 0]` |
+| `count` | `[1, 0, 0, 0]` |
+| `count_decr` | `[0, 0, 0, 0]` |
+| `first` | `0` |
+| `end` | `0` |
+| `value` | `0` |
+| `μ` | `0` |
+
 ## Notes/optimizations
 
 - The current version only supports writing to `stdout`. This chip could potentially be extended to support writing to arbitrary `fd`s - One might be able to replace [commit:c:end] by `end => count = 0`. While loosening the constraint (`count = 0 => end` is no longer enforced), this should not cause any problems: if the prover does not set `end` when `count=0`, they simply cannot complete the proof. First of all, one would have to recursively work through all `2^64` values of `count`, something that is practically infeasible. Moreover, if this is done with a sequence that originally has ``count` > 0`, one will inevitably have to read a memory address twice at the same timestamp, which is impossible to prove. In addition to dropping the `ZERO` lookup, this optimization might also permit moving `count_decr` from a `DWordHL` to a `DWordWL`, saving two columns. - Given that it is practically infeasible to commit more than ``p`-1 = 2^64-2^32` bytes in a program, it might suffice to store `count_decr` in a `BaseField`. Note that this would probably involve having an extra (virtual) column storing `count` in `BaseField` form as well. Moreover, one might need to add a lookup to `LT` to ensure ``count` <= `p`-1` when being read from memory at the beginning of each commitment sequence.
\ No newline at end of file
diff --git a/docs/spec/cpu.md b/docs/spec/cpu.md
index 3c00a3091..14f091412 100644
--- a/docs/spec/cpu.md
+++ b/docs/spec/cpu.md
@@ -1,6 +1,6 @@
 # CPU Chip
 
-The  chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations. It bases its decisions on the entry of the `DECODE` table ([decode]) corresponding the the current program counter (PC).
+The `CPU` chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations. It bases its decisions on the entry of the `DECODE` table ([decode]) corresponding to the current program counter (PC).
 
 ## Variables
 
@@ -10,7 +10,7 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 
 | Name | Type | Description |
 |------|------|-------------|
-| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
+| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `MEMORY`) a maximum of 4 slots is enough. |
 | `pc` | `DWordWL` | The program counter |
 | `rs1` | `Byte` | Source register 1 index |
 | `rs2` | `Byte` | Source register 2 index |
@@ -18,31 +18,17 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 | `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
 | `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
 | `write_register` | `Bit` | Whether to write back to the destination register |
-| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
-| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
-| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
-| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
 | `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
-| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
-| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and     - as flag for inverting the condition of conditional branches (see `branch_cond`)     - as direction (left or right) for `SHIFT` |
-| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
+| `half_instruction_length` | `Byte` | Half the number of bytes consumed by this instruction, commonly used to indicate whether the instruction is of C type, i.e., whether it is 2 bytes long (= 1) instead of 4 (= 2) |
 | `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
-| `ADD` | `Bit` | One-hot ALU selector flag |
-| `SUB` | `Bit` | One-hot ALU selector flag |
-| `SLT` | `Bit` | One-hot ALU selector flag |
-| `AND` | `Bit` | One-hot ALU selector flag |
-| `OR` | `Bit` | One-hot ALU selector flag |
-| `XOR` | `Bit` | One-hot ALU selector flag |
-| `SHIFT` | `Bit` | One-hot ALU selector flag |
-| `JALR` | `Bit` | One-hot ALU selector flag |
-| `BEQ` | `Bit` | One-hot ALU selector flag |
-| `BLT` | `Bit` | One-hot ALU selector flag |
-| `LOAD` | `Bit` | One-hot ALU selector flag |
-| `STORE` | `Bit` | One-hot ALU selector flag |
-| `MUL` | `Bit` | One-hot ALU selector flag |
-| `DIVREM` | `Bit` | One-hot ALU selector flag |
-| `ECALL` | `Bit` | One-hot ALU selector flag |
-| `EBREAK` | `Bit` | One-hot ALU selector flag |
+| `ALU` | `Bit` | Whether to use the ALU for this instruction |
+| `alu_flags` | `Byte` | The ALU operation + flags (interpreting things as signed/unsigned, choosing the MUL/DVRM output, ...) to pass to the ALU |
+| `ADD` | `Bit` | Addition fast-path bypassing the ALU |
+| `SUB` | `Bit` | Subtraction fast-path bypassing the ALU |
+| `MEMORY` | `Bit` | Whether this instruction touches memory (LOAD/STORE) |
+| `mem_flags` | `Byte` | The flags to pass for MEMORY operations (LOAD vs STORE, number of bytes touched, signed) |
+| `BRANCH` | `Bit` | Whether this instruction is a conditional branch (BLT, BEQ) |
+| `ECALL` | `Bit` | Whether this instruction is an ECALL |
 
 ### Output
 
@@ -57,187 +43,183 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 |------|------|-------------|
 | `prev_pc_timestamp_borrow` | `Bit` | The borrow bit for computing the previous timestamp the PC was accessed |
 | `pc_double_read` | `Bit` | Whether the PC is being read as a general purpose register (`rs1`) this cycle |
-| `rv1` | `DWordWHH` | The value of register `rs1` |
-| `rv2` | `DWordWHH` | The value of register `rs2` |
-| `rv1_ext_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
-| `rv2_ext_bit` | `Bit` | The sign bit of `rv2` if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
-| `res_ext_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `res` | `DWordBL` | The ALU result |
-| `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
-| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
+| `rv1` | `DWordWL` | The value of register `rs1` |
+| `rv2` | `DWordWL` | The value of register `rs2` |
+| `arg2` | `DWordWL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
+| `res` | `DWordHL` | The ALU result |
+| `branch_cond` | `Bit` | Whether a branch is taken: the branch condition evaluates to true, or we are doing an unconditional jump |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
+| `JALR` | `Bit` | Read whether our BRANCH corresponds to a JAL(R) instruction from `mem_flags`, as `MEMORY` and `BRANCH` are mutually exclusive |
 | `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
-| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
 
-**Definition of `packed_decode`:**
+**Definition of `JALR`:**
 ```
-packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
+JALR := mem_flags
 ```
 
-**Definition of `pad`:**
+**Definition of `packed_decode`:**
 ```
-pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
+packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * word_instr + 2^4 * ALU + 2^5 * ADD + 2^6 * SUB + 2^7 * MEMORY + 2^8 * BRANCH + 2^9 * ECALL + 2^10 * rs1 + 2^18 * rs2 + 2^26 * rd + 2^34 * half_instruction_length + 2^42 * alu_flags + 2^50 * mem_flags
 ```
 
 ## Assumptions
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
-| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+| `CPU-A1` |  | `MEMORY` and `BRANCH` are mutually exclusive |
+| `CPU-A2` |  | When `MEMORY + BRANCH = 0`, either `read_register2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+| `CPU-A3` |  | `!MEMORY` => `IS_BIT<mem_flags>` |
+
+Additionally, the following constraints can be used to provide defense-in-depth validation of the assumptions.
+
+| Tag | Description |
+|-----|-------------|
+| `CPU-C1` | not (`MEMORY` and `BRANCH`) |
+| | _polynomial:_ `MEMORY * BRANCH = 0` |
+| `CPU-C2` | (1 - `MEMORY` - `BRANCH`) => (`read_register2` = 0 or `imm[i]` = 0) |
+| | _polynomial:_ `(1 - MEMORY - BRANCH) * read_register2 * (imm[0] + imm[1]) = 0` |
+| `CPU-C3` | 1 - MEMORY ⇒ `IS_BIT<mem_flags>` |
 
 ## Constraints
 
-First, we perform a decoding lookup for the current PC.
+First, we perform a decoding lookup for the current PC. Instructions having the `word_instr` flag set are not decoded here, as they are delegated to the `CPU32` chip. In that case, we ensure that the current row of the CPU cannot have any other observable effects.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-C1` | `DECODE[pc, imm, packed_decode]` | 1 |
+| `CPU-C4` | `DECODE[pc, imm, packed_decode]` | 1 - word_instr |
+| `CPU-C5` | `word_instr` => `MEMORY = 0` |  |
+| | _polynomial:_ `word_instr * MEMORY = 0` | |
+| `CPU-C6` | `word_instr` => `BRANCH = 0` |  |
+| | _polynomial:_ `word_instr * BRANCH = 0` | |
+| `CPU-C7` | `word_instr` => `ECALL = 0` |  |
+| | _polynomial:_ `word_instr * ECALL = 0` | |
+| `CPU-C8` | `word_instr` => `read_register1 = 0` |  |
+| | _polynomial:_ `word_instr * read_register1 = 0` | |
+| `CPU-C9` | `word_instr` => `read_register2 = 0` |  |
+| | _polynomial:_ `word_instr * read_register2 = 0` | |
+| `CPU-C10` | `word_instr` => `write_register = 0` |  |
+| | _polynomial:_ `word_instr * write_register = 0` | |
+| `CPU-C11` | `CPU32[half_instruction_length; timestamp, pc]` | word_instr |
 
 ### Range checks
 
-> **Note:** Make sure we argue for every column here
-
-> **Note:** is `rvd` still sufficiently constrained? (can also be done through the memory argument like `pc`?)
-
-We constrain all columns to have the appropriate ranges. The flags and register indices looked up from the decoding need to be checked, as they are communicated through the interaction in a packed form. In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`. Similarly, since `next_pc` will propagate through the memory argument and be looked up in the instruction decoding on the next cycle, it is forced to be in the correct range. For the auxiliary columns, we need to check the limbs of `arg1`, `arg2`, and `res`. The ranges of the other auxiliary columns are enforced through later constraints.
+We constrain all columns to have the appropriate ranges. All values in `packed_decode` need to be checked to ensure the packing is correct for the interaction. In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`. Similarly, since `next_pc` will propagate through the memory argument and be looked up in the instruction decoding on the next cycle, it is forced to be in the correct range; the final value for `next_pc` is similarly fixed by the memory finalization. For the auxiliary columns, we need to check the limbs of `res`, since `rv1` and `rv2` are enforced by the memory argument, and `rvd` is correct by the correctness of the dependent chips. The ranges of the other auxiliary columns are enforced through later constraints.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `CPU-CR2` |  | `IS_BIT<read_register1>` |  |
-| `CPU-CR3` |  | `IS_BIT<read_register2>` |  |
-| `CPU-CR4` |  | `IS_BIT<write_register>` |  |
-| `CPU-CR5` |  | `IS_BIT<memory_2bytes>` |  |
-| `CPU-CR6` |  | `IS_BIT<memory_4bytes>` |  |
-| `CPU-CR7` |  | `IS_BIT<memory_8bytes>` |  |
-| `CPU-CR8` |  | `IS_BIT<c_type_instruction>` |  |
-| `CPU-CR9` |  | `IS_BIT<signed>` |  |
-| `CPU-CR10` |  | `IS_BIT<mp_selector>` |  |
-| `CPU-CR11` |  | `IS_BIT<muldiv_selector>` |  |
-| `CPU-CR12` |  | `IS_BIT<word_instr>` |  |
-| `CPU-CR13` |  | `IS_BIT<ADD>` |  |
-| `CPU-CR14` |  | `IS_BIT<SUB>` |  |
-| `CPU-CR15` |  | `IS_BIT<SLT>` |  |
-| `CPU-CR16` |  | `IS_BIT<AND>` |  |
-| `CPU-CR17` |  | `IS_BIT<OR>` |  |
-| `CPU-CR18` |  | `IS_BIT<XOR>` |  |
-| `CPU-CR19` |  | `IS_BIT<SHIFT>` |  |
-| `CPU-CR20` |  | `IS_BIT<JALR>` |  |
-| `CPU-CR21` |  | `IS_BIT<BEQ>` |  |
-| `CPU-CR22` |  | `IS_BIT<BLT>` |  |
-| `CPU-CR23` |  | `IS_BIT<LOAD>` |  |
-| `CPU-CR24` |  | `IS_BIT<STORE>` |  |
-| `CPU-CR25` |  | `IS_BIT<MUL>` |  |
-| `CPU-CR26` |  | `IS_BIT<DIVREM>` |  |
-| `CPU-CR27` |  | `IS_BIT<ECALL>` |  |
-| `CPU-CR28` |  | `IS_BIT<EBREAK>` |  |
-| `CPU-CR29` |  | `IS_BYTE[rs1]` | 1 |
-| `CPU-CR30` |  | `IS_BYTE[rs2]` | 1 |
-| `CPU-CR31` |  | `IS_BYTE[rd]` | 1 |
-| `CPU-CR32.i` | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` | 1 |
-| `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` | 1 |
-| `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` | 1 |
+| `CPU-CR12` |  | `IS_BIT<read_register1>` |  |
+| `CPU-CR13` |  | `IS_BIT<read_register2>` |  |
+| `CPU-CR14` |  | `IS_BIT<write_register>` |  |
+| `CPU-CR15` |  | `IS_BYTE<half_instruction_length>` |  |
+| `CPU-CR16` |  | `IS_BIT<word_instr>` |  |
+| `CPU-CR17` |  | `IS_BIT<ALU>` |  |
+| `CPU-CR18` |  | `IS_BYTE<alu_flags>` |  |
+| `CPU-CR19` |  | `IS_BIT<ADD>` |  |
+| `CPU-CR20` |  | `IS_BIT<SUB>` |  |
+| `CPU-CR21` |  | `IS_BIT<MEMORY>` |  |
+| `CPU-CR22` |  | `IS_BYTE<mem_flags>` |  |
+| `CPU-CR23` |  | `IS_BIT<BRANCH>` |  |
+| `CPU-CR24` |  | `IS_BIT<ECALL>` |  |
+| `CPU-CR25` |  | `IS_BYTE<rs1>` |  |
+| `CPU-CR26` |  | `IS_BYTE<rs2>` |  |
+| `CPU-CR27` |  | `IS_BYTE<rd>` |  |
+| `CPU-CR28.i` | i ∈ [0, 3] | `IS_HALF[res[i]]` | 1 |
 
 ### ALU
 
-The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
+The ALU functionality is then obtained through delegation to the `ALU` signature, backed by the various ALU chips, or by using the appropriate template. For the pure ALU path, `arg2` is computed as `rv2 + imm`, which, by [cpu:a:arg2-multiplex], evaluates to either `rv2` or `imm`, depending on the instruction. The other contributions for `arg2` are specific to the (mutually exclusive, see [cpu:a:mem-branch-mutex]) `MEMORY` and `BRANCH` flags: for the `MEMORY` path, we want the output of the ALU to be `rv1 + imm`, as that is the address at which the memory access occurs; for the `BRANCH` path, we want the ALU output to reflect the branch condition (or just be inactive for JALR).
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `CPU-CA35` |  | ADD + LOAD ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `CPU-CA36` |  | STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, imm>` |  |
-| `CPU-CA37` |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `CPU-CA38` |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
-| `CPU-CA39.i` | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
-| | | _polynomial:_ `(SLT + BLT) * res[i] = 0` | |
-| `CPU-CA40.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
-| `CPU-CA41.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
-| `CPU-CA42.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
-| `CPU-CA43` |  | `SHIFT[res::DWordWL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
-| `CPU-CA44` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
-| `CPU-CA45` |  | `MUL[res::DWordWL; arg1::DWordHL, signed, arg2::DWordHL, mp_selector, muldiv_selector]` | MUL |
-| `CPU-CA46` |  | `DVRM[res::DWordWL; arg1::DWordHL, arg2::DWordHL, signed, muldiv_selector]` | DIVREM |
+| `CPU-CA29.i` | i ∈ [0, 1] | `arg2` = `MEMORY` dot `imm` + `BRANCH` dot `rv2` + (1 - `MEMORY` - `BRANCH`) dot (`rv2` + `imm`) |  |
+| | | _polynomial:_ `arg2[i] - MEMORY * imm[i] - BRANCH * rv2[i] - (1 - MEMORY - BRANCH) * (rv2 + imm)[i] = 0` | |
+| `CPU-CA30` |  | ADD ⇒ `ADD<res::DWordWL; rv1, arg2>` |  |
+| `CPU-CA31` |  | SUB ⇒ `SUB<res::DWordWL; rv1, arg2>` |  |
+| `CPU-CA32` |  | `ALU[res::DWordWL; rv1, arg2, alu_flags]` | ALU |
 
 ### Memory<cpu:memory>
 
-The interactions with the memory, both for register loading and storing, as for `LOAD` and `STORE` instructions are handled. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The `pc` register behaves very predictably with respect to its timestamps and when it is being read, so for performance reasons, we inline its memory interactions directly into the  chip.
+Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs, simultaneously ensuring that register reads are properly range checked as long as all writes are. The `pc` register behaves very predictably with respect to its timestamps and when it is being read, so for performance reasons, we inline its memory interactions directly into the `CPU` chip.
 
 Potentially overlapping memory accesses are ensured to have disjoint timestamps. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction (see [cpu:c:read_rv1] and [decode]:decoding-overview). Constraints regarding whether `pc_double_read` corresponds to an `AUIPC` instruction are not necessary, as regardless of its value, the old timestamp is guaranteed smaller than the new timestamp, and the integrity of the memory argument therefore ensures the correctness of this bit.
 
+The memory interaction itself is handled by the `MEMORY` signature, which will read the `mem_flags` argument to perform either a `LOAD` or a `STORE`. We refer to the previous section's description of `arg2` for how the address is computed.
+
+The value to (potentially) be written back to `rd` is stored in `rvd`, which can either come from the ALU — in case of an ALU operation or a JALR branch — or from the `MEMORY` interaction.
+
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `CPU-CM47` |  | `MEMW[['arr', ['idx', ['cast', 'rv1', 'DWordWL'], 0], ['idx', ['cast', 'rv1', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs1, ['arr', ['idx', ['cast', 'rv1', 'DWordWL'], 0], ['idx', ['cast', 'rv1', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
-| `CPU-CM48.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
+| `CPU-CM33` |  | `MEMW[[rv1[0], rv1[1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs1, [rv1[0], rv1[1], 0, 0, 0, 0, 0, 0], timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
+| `CPU-CM34.i` | i ∈ [0, 1] | `!read_register1` => `rv1[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
-| `CPU-CM49` |  | `MEMW[['arr', ['idx', ['cast', 'rv2', 'DWordWL'], 0], ['idx', ['cast', 'rv2', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs2, ['arr', ['idx', ['cast', 'rv2', 'DWordWL'], 0], ['idx', ['cast', 'rv2', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
-| `CPU-CM50.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
+| `CPU-CM35` |  | `MEMW[[rv2[0], rv2[1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs2, [rv2[0], rv2[1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
+| `CPU-CM36.i` | i ∈ [0, 1] | `!read_register2` => `rv2[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
-| `CPU-CM51` |  | `MEMW[1, 2::DWordWL * rd, ['arr', ['idx', 'rvd', 0], ['idx', 'rvd', 1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
-| `CPU-CM52` |  | `LOAD[rvd; res::DWordWL, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
-| `CPU-CM53` |  | `MEMW[0, res::DWordWL, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
-| `CPU-CM54` |  | `IS_BIT<pc_double_read>` |  |
-| `CPU-CM55` |  | `IS_BIT<prev_pc_timestamp_borrow>` |  |
-| `CPU-CM56.i` | i ∈ [0, 1] | `memory[1, ['arr', ['+', ['*', 2, 255], 'i'], 0], ['arr', ['+', ['-', ['idx', 'timestamp', 0], ['*', 3, ['not', 'pc_double_read']]], ['*', ['^', 2, 32], 'prev_pc_timestamp_borrow']], ['-', ['idx', 'timestamp', 1], 'prev_pc_timestamp_borrow']], pc[i]]` | 1 - pad |
-| `CPU-CM57.i` | i ∈ [0, 1] | `memory[1, ['arr', ['+', ['*', 2, 255], 'i'], 0], timestamp + 1::DWordWL, next_pc[i]]` | -(1 - pad) |
-
-#### Potential optimizations
+| `CPU-CM37` |  | `MEMW[1, 2::DWordWL * rd, [rvd[0], rvd[1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
+| `CPU-CM38` |  | `MEMOP[rvd; timestamp, res::DWordWL, rv2, mem_flags]` | MEMORY |
+| `CPU-CM39.i` | i ∈ [0, 1] | `!MEMORY` and `!BRANCH` => `rvd` = `res` |  |
+| | | _polynomial:_ `(1 - MEMORY - BRANCH) * (rvd[i] - (res::DWordWL)[i]) = 0` | |
+| `CPU-CM40` |  | `IS_BIT<pc_double_read>` |  |
+| `CPU-CM41` |  | `IS_BIT<prev_pc_timestamp_borrow>` |  |
+| `CPU-CM42.i` | i ∈ [0, 1] | `memory[1, [2 * 255 + i, 0], [(timestamp[0] - 3 * (1 - pc_double_read)) + 2^32 * prev_pc_timestamp_borrow, timestamp[1] - prev_pc_timestamp_borrow], pc[i]]` | 1 |
+| `CPU-CM43.i` | i ∈ [0, 1] | `memory[1, [2 * 255 + i, 0], timestamp + 1::DWordWL, next_pc[i]]` | -1 |
 
-- `double_pc_read` could be integrated into decoding, so that `AUIPC` could set `read_register1 = 0` and no extra MEMW access for `rv1` is needed at this point.
+### Branching
 
-### System
-
-The interactions with the wider system.
+A branch is expressed by having the `BRANCH` flag set to 1. Since `BRANCH` and `MEMORY` are mutually exclusive ([cpu:a:mem-branch-mutex]), we can repurpose the `mem_flags` field to indicate a JALR instruction. When `JALR` is not set, we have a conditional branch that is decided upon by the result of the ALU instruction, as stored in the `res` variable. As such, we can set `branch_cond` appropriately and use it as the multiplicity flag for the `BRANCH` chip.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CS58` | `!EBREAK` |  |
-| | _polynomial:_ `1 - EBREAK = 0` | |
-| `CPU-CS59` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
-
-### Input and output to the ALU
+| `CPU-CB44` | `branch_cond` = `BRANCH` and (`JALR` or `res`) |  |
+| | _polynomial:_ `branch_cond - BRANCH * JALR - BRANCH * (1 - JALR) * res[0] = 0` | |
+| `CPU-CB45` | `BRANCH[next_pc; pc, imm, rv1, JALR]` | branch_cond |
+| `CPU-CB46` | 1 - branch_cond ⇒ `ADD<next_pc; pc, [2 * half_instruction_length, 0]>` |  |
+| `CPU-CB47` | BRANCH ⇒ `ADD<rvd; pc, [2 * half_instruction_length, 0]>` |  |
 
-We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, including the appropriate sign/zero extension, depending on `word_instr`.
+### System
 
-| Tag | Description |
-|-----|-------------|
-| `CPU-CE60` | `SIGN<rv1_ext_bit; rv1[1], word_instr>` |
-| `CPU-CE61` | `arg1[:4]` = `rv1[:2]` |
-| | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` |
-| `CPU-CE62` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_ext_bit` dot `signed` |
-| | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_ext_bit * (2^32 - 1) = 0` |
-| `CPU-CE63` | `SIGN<rv2_ext_bit; rv2[1], word_instr>` |
-| `CPU-CE64` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |
-| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` |
-| `CPU-CE65` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `rv2_ext_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |
-| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * rv2_ext_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` |
-| `CPU-CE66` | `SIGN<res_ext_bit; (res::DWordHL)[1], word_instr>` |
-| `CPU-CE67` | `!LOAD` => `rvd[0]` = `res[:4]` |
-| | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` |
-| `CPU-CE68` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_ext_bit` dot (2^(32) - 1) |
-| | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_ext_bit * (2^32 - 1)) = 0` |
-
-### Other constraints
-
-For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference between `arg1` and `arg2` whenever `BEQ` is `1`. Given that this difference is `0` when both are equal, [cpu:c:is_equal] ensures `is_equal` is set to `1` if and only if ``arg1` = `arg2`` and `BEQ` is set.
+The interactions with the wider system go through the `ECALL` interface. Since we treat `EBREAK` instructions as unprovable traps, we avoid emitting `DECODE` rows for these, and do not need any further handling in the CPU.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CO69` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
-| `CPU-CO70` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
-| | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| `CPU-CO71` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
-| `CPU-CO72` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
-
-> **Note:** Document the choice to not have a multiplicity column here for padding
+| `CPU-CS48` | `ECALL[timestamp, rv1]` | ECALL |
 
 ## Padding
 
 The CPU can be padded with the following values, which have a corresponding row in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
 
-This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
\ No newline at end of file
+| Column | Padding value |
+|--------|---------------|
+| `pc` | `1` |
+| `rs1` | `0` |
+| `rs2` | `0` |
+| `rd` | `0` |
+| `read_register1` | `0` |
+| `read_register2` | `0` |
+| `write_register` | `0` |
+| `imm` | `0` |
+| `half_instruction_length` | `2` |
+| `word_instr` | `0` |
+| `ALU` | `0` |
+| `alu_flags` | `0` |
+| `ADD` | `0` |
+| `SUB` | `0` |
+| `MEMORY` | `0` |
+| `mem_flags` | `0` |
+| `BRANCH` | `0` |
+| `ECALL` | `0` |
+| `next_pc` | `1` |
+| `rvd` | `0` |
+| `prev_pc_timestamp_borrow` | `0` |
+| `pc_double_read` | `0` |
+| `rv1` | `0` |
+| `rv2` | `0` |
+| `arg2` | `0` |
+| `res` | `0` |
+| `branch_cond` | `0` |
+
+This approach minimizes the number of dependent lookups, increasing only multiplicities in the `DECODE` table and the `IS_BYTE` and `IS_HALF` lookups.
\ No newline at end of file
diff --git a/docs/spec/cpu32.md b/docs/spec/cpu32.md
new file mode 100644
index 000000000..f5bb0df7e
--- /dev/null
+++ b/docs/spec/cpu32.md
@@ -0,0 +1,187 @@
+# CPU32 Chip
+
+The `CPU32` chip is used to delegate the 32-bit instructions of the RV64I instruction set from the main CPU table ([cpu]). All 32-bit instructions are ALU-only instructions, so the BRANCH, MEMORY and ECALL paths need no elaboration. The timestamp and PC have already been read by the CPU table at this point, and need no further checking; the PC for the next instruction will also already be handled by CPU.
+
+The structure follows the regular ALU path, with some extra variables and constraints to contain the required sign extensions.
+
+## Variables
+
+The `CPU32` chip is comprised of the variables listed below:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | The timestamp for the CPU row |
+| `pc` | `DWordWL` | The PC at which the instruction occurs |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `half_instruction_length` | `Byte` | Half the number of bytes consumed by this instruction |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `rs1` | `Byte` | Source register 1 |
+| `read_register1` | `Bit` | Whether to read from `rs1` or not |
+| `rv1` | `DWordWHH` | The value in register `rs1` |
+| `rv1_sign` | `Bit` | The sign bit of the lower word of `rv1` |
+| `arg1` | `DWordWL` | The sign-extended version of `rv1` |
+| `rs2` | `Byte` | Source register 2 |
+| `read_register2` | `Bit` | Whether to read from `rs2` |
+| `rv2` | `DWordWHH` | The value in register `rs2` |
+| `rv2_sign` | `Bit` | The sign bit of the lower word of `rv2` |
+| `imm` | `DWordWL` | The fully sign-extended immediate to use |
+| `arg2` | `DWordWL` | Either the sign-extended version of `rv2` or all of `imm` |
+| `res` | `DWordHL` | The ALU result |
+| `res_sign` | `Bit` | The sign bit of the lower word of `res` |
+| `rd` | `Byte` | Destination register |
+| `write_register` | `Bit` | Whether to write back to `rd` |
+| `rvd` | `DWordWL` | The value to write back to `rd`, the sign-extended version of `res` |
+| `ALU` | `Bit` | Whether the full ALU is active |
+| `alu_flags` | `Byte` | The ALU operation + flags |
+| `ADD` | `Bit` | Addition fast-path bypassing the ALU |
+| `SUB` | `Bit` | Subtraction fast-path bypassing the ALU |
+| `signed` | `Bit` | Whether the instruction is signed or not. Extracted from `alu_flags`, used to determine the extension for the inputs |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `packed_decode` | `BaseField` | The packed representation of all flags and information from the decode table |
+
+**Definition of `packed_decode`:**
+```
+packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * 1 + 2^4 * ALU + 2^5 * ADD + 2^6 * SUB + 2^10 * rs1 + 2^18 * rs2 + 2^26 * rd + 2^34 * half_instruction_length + 2^42 * alu_flags
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` | Whether this row is active (1) or a padding row (0) |
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `CPU32-A1.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+| `CPU32-A2.i` | i ∈ [0, 1] | `IS_WORD[pc[i]]` |
+| `CPU32-A3` |  | `read_register2 = 0` or `imm = 0`, enforced by decoding. |
+
+Some of the assumptions can be checked with only arithmetic constraints, so we provide these below.
+
+| Tag | Description |
+|-----|-------------|
+| `CPU32-C1` | `read_register2` = 0 or `imm = 0` |
+| | _polynomial:_ `read_register2 * (imm[0] + imm[1]) = 0` |
+
+## Constraints
+
+Most constraints correspond to those already present in the CPU, and we present them here first, including some updates to the range checking corresponding to the differing types. We also need to make sure that for padding rows (`μ = 0`), no side effects can occur.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU32-C2` | `DECODE[pc, imm, packed_decode]` | μ |
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU32-CR3` |  | `IS_BIT<μ>` |  |
+| `CPU32-CR4` |  | `IS_BIT<read_register1>` |  |
+| `CPU32-CR5` |  | `IS_BIT<read_register2>` |  |
+| `CPU32-CR6` |  | `IS_BIT<write_register>` |  |
+| `CPU32-CR7` |  | `IS_BYTE<half_instruction_length>` |  |
+| `CPU32-CR8` |  | `IS_BIT<ALU>` |  |
+| `CPU32-CR9` |  | `IS_BYTE<alu_flags>` |  |
+| `CPU32-CR10` |  | `IS_BIT<ADD>` |  |
+| `CPU32-CR11` |  | `IS_BIT<SUB>` |  |
+| `CPU32-CR12` |  | `IS_BYTE<rs1>` |  |
+| `CPU32-CR13` |  | `IS_BYTE<rs2>` |  |
+| `CPU32-CR14` |  | `IS_BYTE<rd>` |  |
+| `CPU32-CR15.i` | i ∈ [0, 1] | `IS_HALF[rv1[i]]` | μ |
+| `CPU32-CR16.i` | i ∈ [0, 1] | `IS_HALF[rv2[i]]` | μ |
+| `CPU32-CR17.i` | i ∈ [0, 3] | `IS_HALF[res[i]]` | μ |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU32-CA18` | ADD ⇒ `ADD<res::DWordWL; arg1, arg2>` |  |
+| `CPU32-CA19` | SUB ⇒ `SUB<res::DWordWL; arg1, arg2>` |  |
+| `CPU32-CA20` | `ALU[res::DWordWL; arg1, arg2, alu_flags]` | ALU |
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU32-CM21` |  | `MEMW[[(rv1::DWordWL)[0], rv1[2], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs1, [(rv1::DWordWL)[0], rv1[2], 0, 0, 0, 0, 0, 0], timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
+| `CPU32-CM22.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
+| | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
+| `CPU32-CM23` |  | `MEMW[[(rv2::DWordWL)[0], rv2[2], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs2, [(rv2::DWordWL)[0], rv2[2], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
+| `CPU32-CM24.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
+| | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
+| `CPU32-CM25` |  | `MEMW[1, 2::DWordWL * rd, [rvd[0], rvd[1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU32-C26` | `!μ` => `read_register1 = 0` |  |
+| | _polynomial:_ `(1 - μ) * read_register1 = 0` | |
+| `CPU32-C27` | `!μ` => `read_register2 = 0` |  |
+| | _polynomial:_ `(1 - μ) * read_register2 = 0` | |
+| `CPU32-C28` | `!μ` => `write_register = 0` |  |
+| | _polynomial:_ `(1 - μ) * write_register = 0` | |
+| `CPU32-C29` | `CPU32[half_instruction_length; timestamp, pc]` | -μ |
+
+Then, we have the constraints corresponding to the sign-extension and definition of `arg1`, `arg2` and `rvd`. This includes a step where we extract the `signed` bit from the `alu_flags`, as this determines whether to sign extend the inputs or not.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU32-C30` | `signed` != 0 => `μ` = 1 |  |
+| | _polynomial:_ `signed * (1 - μ) = 0` | |
+| `CPU32-C31` | `BYTE_ALU[32 * signed; ⧼AND⧽, 32, alu_flags]` | μ |
+| `CPU32-C32` | `SIGN<rv1_sign; rv1[1], signed>` |  |
+| `CPU32-C33` | `arg1[0]` = `rv1[:2]` |  |
+| | _polynomial:_ `arg1[0] - (rv1::DWordWL)[0] = 0` | |
+| `CPU32-C34` | `arg1[1]` = (2^(32) - 1) dot `rv1_sign` |  |
+| | _polynomial:_ `arg1[1] - (2^32 - 1) * rv1_sign = 0` | |
+| `CPU32-C35` | `SIGN<rv2_sign; rv2[1], signed>` |  |
+| `CPU32-C36` | `arg2[0]` = `rv2[:2]` + `imm[0]` |  |
+| | _polynomial:_ `arg2[0] - (rv2::DWordWL)[0] - imm[0] = 0` | |
+| `CPU32-C37` | `arg2[1]` = (2^(32) - 1) dot `rv2_sign` + `imm[1]` |  |
+| | _polynomial:_ `arg2[1] - (2^32 - 1) * rv2_sign - imm[1] = 0` | |
+| `CPU32-C38` | `SIGN<res_sign; res[1], μ>` |  |
+| `CPU32-C39` | `rvd[0]` = `res[:2]` |  |
+| | _polynomial:_ `rvd[0] - (res::DWordWL)[0] = 0` | |
+| `CPU32-C40` | `rvd[1]` = (2^(32) - 1) dot `res_sign` |  |
+| | _polynomial:_ `rvd[1] - (2^32 - 1) * res_sign = 0` | |
+
+## Padding
+
+The table can be padded with the following values:
+
+| Column | Padding value |
+|--------|---------------|
+| `timestamp` | `0` |
+| `pc` | `0` |
+| `half_instruction_length` | `2` |
+| `rs1` | `0` |
+| `read_register1` | `0` |
+| `rv1` | `0` |
+| `rv1_sign` | `0` |
+| `arg1` | `0` |
+| `rs2` | `0` |
+| `read_register2` | `0` |
+| `rv2` | `0` |
+| `rv2_sign` | `0` |
+| `imm` | `0` |
+| `arg2` | `0` |
+| `res` | `0` |
+| `res_sign` | `0` |
+| `rd` | `0` |
+| `write_register` | `0` |
+| `rvd` | `0` |
+| `ALU` | `0` |
+| `alu_flags` | `0` |
+| `ADD` | `0` |
+| `SUB` | `0` |
+| `signed` | `0` |
+| `μ` | `0` |
\ No newline at end of file
diff --git a/docs/spec/decode.md b/docs/spec/decode.md
index afadc0465..605f23a77 100644
--- a/docs/spec/decode.md
+++ b/docs/spec/decode.md
@@ -11,7 +11,7 @@ The  table is comprised of  variables that are expressed using  columns:
 | Name | Type | Description |
 |------|------|-------------|
 | `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
-| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`; \ [27:35] `rs1`, \ [35:43] `rs2`, \ [43:51] `rd`, \ the remaining bits are set to zero.  |
+| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `word_instr`, \ [4] `ALU`, \ [5] `ADD`, \ [6] `SUB`, \ [7] `MEMORY`, \ [8] `BRANCH`, \ [9] `ECALL`, \ [10:17] `rs1`, \ [18:25] `rs2`, \ [26:33] `rd`, \ [34:41] `half_instruction_length`, \ [42:49] `alu_flags`, \ [50:57] `mem_flags`, \ the remaining bits are set to zero.  |
 | `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
 
 ### Multiplicity
@@ -24,11 +24,18 @@ The  table is comprised of  variables that are expressed using  columns:
 
 The  table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
 
-Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see [cpu:c:ebreak_traps]), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
+| Column | Padding value |
+|--------|---------------|
+| `pc` | `1` |
+| `packed_decode` | `0` |
+| `imm` | `0` |
+| `μ` | `0` |
+
+This row doubles as the instruction that padding rows of the `CPU` table look up, in which case its multiplicity `μ` is nonzero rather than `0`. Consequently, this table must always contain at least one padding row.
 
 ## Decoding<decode:decoding-overview>
 
-For the purposes of explaining decoding, we decompress 's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
+For the purposes of explaining decoding, we decompress 's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation. The construction of the `alu_flags` and `mem_flags` columns is given here through virtual columns.
 
 ### Output
 
@@ -40,32 +47,49 @@ For the purposes of explaining decoding, we decompress 's `packed_decode` variab
 | `rd` | `Byte` | index of destination register. |
 | `read_register1` | `Bit` | whether to load the contents of address `rs1` (1) or `0` (0) into `rv1`. |
 | `read_register2` | `Bit` | whether to load the contents of address `rs2` (1) or `0` (0) into `rv2`. |
-| `write_register` | `Bit` | whether the result should be written to `rd` ($=0$ for memory write and when $`rd` = `x0`$. |
-| `mem_2B` | `Bit` | whether the memory access (read or write) touches exactly $2$ bytes. |
-| `mem_4B` | `Bit` | whether the memory access (read or write) touches exactly $4$ bytes. |
-| `mem_8B` | `Bit` | whether the memory access (read or write) touches exactly $8$ bytes. |
-| `c_type` | `Bit` | Whether the instruction is of type `C`, i.e., whether it is $2$ bytes long instead of $4$. |
+| `write_register` | `Bit` | whether the result should be written to `rd` ($=0$ for memory write and when $`rd` = `x0`$). |
 | `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+| `word_instr` | `Bit` | Whether the instruction is a `*W` instruction, requiring the inputs and outputs to be (sign) extended. |
+| `ALU` | `Bit` | Enable the ALU |
+| `ADD` | `Bit` | ALU does an ADD |
+| `SUB` | `Bit` | ALU does a SUB |
+| `BRANCH` | `Bit` | The instruction is a branch |
+| `MEMORY` | `Bit` | The instruction is a memory access |
+| `ECALL` | `Bit` | Perform an ECALL |
+| `half_instruction_length` | `Byte` | Half of how many bytes this instruction takes up in the program |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `alu_op` | `B4` | Operation selector value for the ALU |
 | `signed` | `Bit` | selector used to indicate signed or unsigned input interpretation. |
-| `mp_selector` | `Bit` | Multi-purpose selector used by the CPU to to configure several ALU operations in different ways.            See the `CPU` chip for more details. |
+| `signed2` | `Bit` | A second signed bit, useful for MUL instructions |
 | `muldiv_selector` | `Bit` | selects which output of `MUL` (lo/hi) or `DVRM` (quo/rem) is wanted. |
-| `word_instr` | `Bit` | Whether the instruction is a `*W` instruction, requiring the inputs and outputs to be (sign) extended. |
-| `ADD` | `Bit` | ALU selector flag |
-| `SUB` | `Bit` | ALU selector flag |
-| `SLT` | `Bit` | ALU selector flag |
-| `AND` | `Bit` | ALU selector flag |
-| `OR` | `Bit` | ALU selector flag |
-| `XOR` | `Bit` | ALU selector flag |
-| `SHIFT` | `Bit` | ALU selector flag |
-| `JALR` | `Bit` | ALU selector flag |
-| `BEQ` | `Bit` | ALU selector flag |
-| `BLT` | `Bit` | ALU selector flag |
-| `LOAD` | `Bit` | ALU selector flag |
-| `STORE` | `Bit` | ALU selector flag |
-| `MUL` | `Bit` | ALU selector flag |
-| `DIVREM` | `Bit` | ALU selector flag |
-| `ECALL` | `Bit` | ALU selector flag |
-| `EBREAK` | `Bit` | ALU selector flag |
+| `invert` | `Bit` | Instructs the EQ or LT chip to invert its result, or inverts the direction of the SHIFT chip (right instead of left) |
+| `memory_op` | `Bit` | Selects whether to LOAD (0) or STORE (1) |
+| `mem_2B` | `Bit` | whether the memory access (read or write) touches exactly $2$ bytes. |
+| `mem_4B` | `Bit` | whether the memory access (read or write) touches exactly $4$ bytes. |
+| `mem_8B` | `Bit` | whether the memory access (read or write) touches exactly $8$ bytes. |
+| `mem_signed` | `Bit` | Whether the memory operation is a signed one. This is kept distinct from `signed` to enable the `JALR` flag to alias `mem_flags`. |
+| `JALR` | `Bit` | The branch is a JAL(R) |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `alu_flags` | `Byte` | The combined ALU flags |
+| `mem_flags` | `Byte` | The combined memory flags (or JALR when BRANCHing) |
+
+**Definition of `alu_flags`:**
+```
+alu_flags := alu_op + 32 * signed + 64 * (signed2 + invert) + 128 * muldiv_selector
+```
+
+**Definition of `mem_flags`:**
+```
+mem_flags := JALR + memory_op + 2 * mem_signed + 4 * mem_2B + 8 * mem_4B + 16 * mem_8B
+```
 
 ### Multiplicity
 
@@ -73,9 +97,13 @@ For the purposes of explaining decoding, we decompress 's `packed_decode` variab
 |------|------|-------------|
 | `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
 
-We will illustrate how each instruction should be expressed in this (uncompressed) decoding table. The columns of the accompanying table represent the following: - *`operation`*: the assembly operation being encoded. - *`op-flag`*: which of the "`ALU` selector flags" operation flags to set. Each operation sets exactly one. - *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively. - *other*: the other flags that should be set or variables that should be given specific values.
+First, we provide a mapping from an ALU operation "descriptor" to the numerical value used for the `alu_op` column. This is the table used to find the value for the `⧼·⧽` notation when performing `ALU` or `BYTE_ALU` interactions.
 
-For the purpose of brevity and readability, the table uses the following rules-of-thumb: + `rd`, `rs1`, `rs2`, and `imm` are mapped to the values provided by the instruction; when a value is not specified by an instruction it defaults to `0`. + `read_register1`, `read_register2` and `write_register` are set to `1` when respectively ``rs1` != 0`, ``rs2` != 0`, or  ``rd` != 0`. + Any flag that is not listed is set to `0`, with the exception of the `c_type` flag. *The `c_type` flag is set independently of the below table*, as explained next.
+table(columns: (auto, auto), stroke: 0pt, inset: (right: .5em), align: (left, left), table.header[*Descriptor*][*value*], table.hline(stroke: 1.5pt))[ *AND*][0][ *OR*][1][ *XOR*][2][ *EQ*][3][ *LT*][4][ *SHIFT*][5][ *SHIFTW*][6][ *MUL*][7][ *DIVREM*][8]
+
+We will illustrate how each instruction should be expressed in this (uncompressed) decoding table. The columns of the accompanying table represent the following: - *`operation`*: the assembly operation being encoded. - *`alu`*: Set to the descriptor of the ALU operation to be used for `alu_op`. If listed as `ADD` or `SUB`, the corresponding flag should be set, otherwise set `ALU = 1` when this column is not empty. - *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively. - *other*: the other flags that should be set or variables that should be given specific values.
+
+For the purpose of brevity and readability, the table uses the following rules-of-thumb: + `rd`, `rs1`, `rs2`, and `imm` are mapped to the values provided by the instruction; when a value is not specified by an instruction it defaults to `0`. + `read_register1`, `read_register2` and `write_register` are set to `1` when respectively ``rs1` != 0`, ``rs2` != 0`, or  ``rd` != 0`.
 
 Further clarification is provided in the notes following the table.
 
@@ -85,13 +113,15 @@ super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
 
 show figure: set block(breakable: true)
 
-figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), )) }
+figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*alu*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*alu*], [*`w_instr`*], [*`signed`*], [*other*]), )) }
+
+// OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`LT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT[W]`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT[W]`], [`[W]`], [], [`invert`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT[W]`], [`[W]`], [1], [`invert`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`LT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT[W]`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT[W]`], [`[W]`], [], [`invert`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT[W]`], [`[W]`], [1], [`invert`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`signed2`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`signed2`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [], [], [], [`BRANCH`, `JALR`, `rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [], [], [], [`BRANCH`, `JALR`], []), ([`BEQ      rs1, rs2, imm`], [`EQ`], [], [], [`BRANCH`], []), ([`BNE      rs1, rs2, imm`], [`EQ`], [], [], [`BRANCH`, `invert`], []), ([`BLT[U]   rs1, rs2, imm`], [`LT`], 
[], [.not`[U]`], [`BRANCH`], []), ([`BGE[U]   rs1, rs2, imm`], [`LT`], [], [.not`[U]`], [`BRANCH`, `invert`], []), // LOAD ([`LD        rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_signed := `.not`[U]`, `mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_signed := `.not`[U]`, `mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_signed := `.not`[U]`], []), // STORE ([`SD       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`, `mem_8B`], []), ([`SW       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`, `mem_4B`], []), ([`SH       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`, `mem_2B`], []), ([`SB       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`], []), // ECALL/EBREAK ([`ECALL`], [], [], [], [`ECALL`, ``rs1` := `x17``], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
 
-// OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
+Note that the above table has no entry for the `EBREAK` instruction. We treat `EBREAK` as an unprovable trap, and its absence from the table enables this by having no valid decoding available for when the instruction is encountered.
 
 ### C-type instructions
 
-The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
+The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32 bits), saving \~25% in code size. The execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. As such, we provide the `half_instruction_length` column that *must take on the value `1` for compressed instructions and `2` for regular instructions*. It is represented as half the number of bytes in the instruction to make misaligned instruction lengths unrepresentable. Additionally, having this variable opens the door for future optimizations involving "fused" instructions, where common sequences of instructions are merged into a single decoded version and need only a single CPU row to prove.
 
 // Construct a note that can be referenced through `lbl`
 
@@ -101,10 +131,4 @@ show figure: (it) => align(left, []) [ ] }
 
 We note the following about the above decoding table:
 
-enum.item( referenceable_note( "note_word_instr", [`word_instr`: `[W]` indicates that ``word_instr` = 1` for the `W`-variant of the operation, and `0` for the non-`W`-variant.] ), enum.item( referenceable_note( "note_signed", [`signed`: .not`[U]` indicates that ``signed` = 1` for the *non-`U`*-variant of the operation, and `0` for the `U`-variant.] ), enum.item( referenceable_note( "note-lui", [`LUI`: this operation loads the 20-bit `imm` in the upper bits of `rd`. Observe that this can be represented using `ADDI rd, x0, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-auipc", [`AUIPC`: this operation adds the 20-bit immediate to the upper bits of `pc` and stores the result in `rd`. Given that the `pc` is stored in `x255`, this operation can be represented using `ADDI rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-jal", [`JAL`: this operation stores ``pc` + 4` in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`. Note that this can be represented using `JALR rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[1:21]` of `imm` and extending it to 64 bits; the least significant bit should always be 0.*] ), enum.item( referenceable_note( "note-ecall", [`ECALL`: "On RISC-V a system call has its own instruction: `ECALL`. [...] A7 [= register `x17`] contains the system call number." [[source]] ] ), enum.item( referenceable_note( "note-fence", [`FENCE`: currently, the VM interprets this operation as `ADDI x0 x0 0`; a no-op.]
-
-### One more instruction <cpu-padding-decode-row>
-
-In addition to decoding all instructions provided in the ELF and adding a corresponding entry to the  table, one must include an entry that has ``pc` = 1` and every other variable set to `0`. Note that this will never conflict with any entry in the ELF, since it has an odd `pc` value.
-
-This entry is used to pad the `CPU` table. More details on this matter are provided in the `CPU` chip.
\ No newline at end of file
+enum.item( referenceable_note( "note_word_instr", [`word_instr`: `[W]` indicates that ``word_instr` = 1` for the `W`-variant of the operation, and `0` for the non-`W`-variant. Similarly, `SHIFT[W]` indicates the `SHIFTW` operation for the `W`-variant, and `SHIFT` otherwise.] ), enum.item( referenceable_note( "note_signed", [`signed`: .not`[U]` indicates that ``signed` = 1` for the *non-`U`*-variant of the operation, and `0` for the `U`-variant.] ), enum.item( referenceable_note( "note-lui", [`LUI`: this operation loads the 20-bit `imm` in the upper bits of `rd`. Observe that this can be represented using `ADDI rd, x0, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-auipc", [`AUIPC`: this operation adds the 20-bit immediate to the upper bits of `pc` and stores the result in `rd`. Given that the `pc` is stored in `x255`, this operation can be represented using `ADDI rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-jal", [`JAL`: this operation stores ``pc` + `2 * half_instruction_length`` in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`. Note that this can be represented using `JALR rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[1:21]` of `imm` and extending it to 64 bits; the least significant bit should always be 0.*] ), enum.item( referenceable_note( "note-ecall", [`ECALL`: "On RISC-V a system call has its own instruction: `ECALL`. [...] A7 [= register `x17`] contains the system call number." [[source]] ] ), enum.item( referenceable_note( "note-fence", [`FENCE`: currently, the VM interprets this operation as `ADDI x0 x0 0`; a no-op.]
\ No newline at end of file
diff --git a/docs/spec/dvrm.md b/docs/spec/dvrm.md
index a57fa5ef1..9156f0db5 100644
--- a/docs/spec/dvrm.md
+++ b/docs/spec/dvrm.md
@@ -88,15 +88,15 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `μ_q` | `BaseField` |  |
 | `μ_r` | `BaseField` |  |
 
-## Assumptions
+## Constraints
 
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `DVRM-A1.i` | i ∈ [0, 3] | `IS_HALF[n[i]]` |
-| `DVRM-A2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` |
-| `DVRM-A3` |  | `IS_BIT<signed>` |
+First, we range-check all inputs.
 
-## Constraints
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C1.i` | i ∈ [0, 3] | `IS_HALF[n[i]]` | μ_sum |
+| `DVRM-C2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` | μ_sum |
+| `DVRM-C3` |  | `IS_BIT<signed>` |  |
 
 From the ISA, we gather five requirements for the `DIV[U][W]` and `REM[U][W]` instructions:
 
@@ -108,7 +108,7 @@ We start with R3, which is straightforwardly asserted by constraint [dvrm:c:sign
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
+| `DVRM-C4` | `r` eq.not 0 => `sign_r` = `sign_n` |
 | | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
 
 ### R2: rounding towards zero
@@ -121,12 +121,12 @@ Focusing on the first statement, we observe that this trivially holds when ``sig
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `DVRM-C2` |  | `LT[1 - div_by_zero; abs_r, abs_d, 0]` | μ_sum |
-| `DVRM-C3` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
-| `DVRM-C4.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
+| `DVRM-C5` |  | `ALU[[1 - div_by_zero, 0]; abs_r, abs_d, ⧼LT⧽]` | μ_sum |
+| `DVRM-C6` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
+| `DVRM-C7.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
 | | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
-| `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
-| `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
+| `DVRM-C8` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
+| `DVRM-C9.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
 | | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
 ### R5: overflow
@@ -139,9 +139,9 @@ In summary, in case of overflow R2 enforces that ``r` = 0`. Moreover it suffices
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `DVRM-C7` | `sign_q` = `signed` dot (1- `overflow`) |  |
+| `DVRM-C10` | `sign_q` = `signed` dot (1- `overflow`) |  |
 | | _polynomial:_ `signed * (1 - overflow) - sign_q = 0` | |
-| `DVRM-C8` | `ZERO[overflow; n[0] + n[1] + n[2] + (n[3] - 2^15 * sign_n) + (1 - sign_n) + (65535 - d[0]) + (65535 - d[1]) + (65535 - d[2]) + (65535 - d[3])]` | μ_sum |
+| `DVRM-C11` | `ZERO[overflow; n[0] + n[1] + n[2] + (n[3] - 2^15 * sign_n) + (1 - sign_n) + (65535 - d[0]) + (65535 - d[1]) + (65535 - d[2]) + (65535 - d[3])]` | μ_sum |
 
 We highlight [dvrm:c:overflow]. Recall that the `overflow` flag should be set if and only if (i) ``signed` = 1`, (ii) ``n` = `0x80...00``, and (iii) ``d` = `0xFF...FF``. These requirements are equivalent to the state where: $ forall i in [0, 3]:&& 65535 - `d`_i &= 0,\ forall i in [0, 2]:&& `n`_i &= 0,\ && `n`_3 - 2^15 dot `sign_n` &= 0,\ && 1 - `sign_n` &= 0,\ $ where ``signed` = 1` follows from the last equality. The requirement is phrased in this way, because the left-hand sides of the above expressions are `>= 0` by construction. Given that the sum of these expressions does not exceed `2^19` (and thus never wraps in the field), we can now say that the `overflow` bit should be set to `1` if and only if their sum evaluates to `0`. The `ZERO` lookup guarantees this to be the case.
 
@@ -149,13 +149,13 @@ We highlight [dvrm:c:overflow]. Recall that the `overflow` flag should be set if
 
 Rewriting R1, we find the constraint `not`overflow` => `n` - `r` = `qd``.
 
-Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality `mod 2^128`, rather than `mod 2^64`. To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to ``qd` mod 2^128` using constraints [dvrm:c:mul_lower] and [dvrm:c:mul_upper]; [dvrm:c:q_range] is included to uphold assumption [mul:a:rhs].
+Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality `mod 2^128`, rather than `mod 2^64`. To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to ``qd` mod 2^128` using constraints [dvrm:c:mul_lower] and [dvrm:c:mul_upper]; [dvrm:c:q_range] is included to uphold assumption [mul:c:rhs].
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `DVRM-C9` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
-| `DVRM-C10` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
-| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+| `DVRM-C12` |  | `ALU[n_sub_r::DWordWL; d::DWordWL, q::DWordWL, ⧼MUL⧽ + 32 * signed + 64 * sign_q]` | μ_sum |
+| `DVRM-C13` |  | `ALU[extension_n_sub_r::DWordWL; d::DWordWL, q::DWordWL, ⧼MUL⧽ + 32 * signed + 64 * sign_q + 128]` | μ_sum |
+| `DVRM-C14.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
 
 It now remains to enforce that `extended_n_sub_r` is the _signed_ 128-bit representation of ``n`-`r``. Here, we introduce `extended_n` and `extended_r`. By their definition, these variables contain the signed 128-bit representations of `n` and `r`. The `carry` variable has been defined such that it mimics those in the `ADD` chip, except that here we add two `QuadHL`s rather than two `DWordHL`, thus needing four carry bits instead of two. With this in place, [dvrm:c:n_sub_r] (mimicking [add:c:carry]) ensures `extended_n_sub_r` must contain the correct value.
 
@@ -163,10 +163,10 @@ Lastly, observe that ``n` - `r` in (-2^64, 2^64)`, _regardless_ of the value of
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `DVRM-C12.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
-| `DVRM-C13.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
-| `DVRM-C14.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
-| `DVRM-C15` |  | `IS_BIT<sign_n_sub_r>` |  |
+| `DVRM-C15.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
+| `DVRM-C16.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
+| `DVRM-C17.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
+| `DVRM-C18` |  | `IS_BIT<sign_n_sub_r>` |  |
 
 ### R4: division-by-zero
 
@@ -174,9 +174,9 @@ R4 requires that ``q` = 2^64-1` (unsigned) or `-1` (signed) and ``r` = n` when `
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `DVRM-C16.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
+| `DVRM-C19.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
 | | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
-| `DVRM-C17` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
+| `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
 
 ### Other
 
@@ -184,9 +184,9 @@ The following constraints are included to enforce the values of `sign_n`, `sign_
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C18` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C19` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C20` | `SIGN<sign_d; d[3], signed>` |
+| `DVRM-C21` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C22` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C23` | `SIGN<sign_d; d[3], signed>` |
 
 ### Output
 
@@ -194,9 +194,29 @@ Lastly, this chip contributes the following to the lookup:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
-| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+| `DVRM-C24` | `ALU[q::DWordWL; n::DWordWL, d::DWordWL, ⧼DIVREM⧽ + 32 * signed]` | -μ_q |
+| `DVRM-C25` | `ALU[r::DWordWL; n::DWordWL, d::DWordWL, ⧼DIVREM⧽ + 32 * signed + 128]` | -μ_r |
 
 ## Padding
 
-To pad the  table, we use the following data, representing the unsigned division `frac(0, 0, style: "horizontal")`:
\ No newline at end of file
+To pad the `DVRM` table, we use the following data, representing the unsigned division `0 / 0`:
+
+| Column | Padding value |
+|--------|---------------|
+| `n` | `0` |
+| `d` | `0` |
+| `signed` | `0` |
+| `q` | `0` |
+| `r` | `0` |
+| `div_by_zero` | `1` |
+| `overflow` | `0` |
+| `abs_r` | `0` |
+| `abs_d` | `0` |
+| `n_sub_r` | `0` |
+| `sign_n_sub_r` | `0` |
+| `sign_n` | `0` |
+| `sign_d` | `0` |
+| `sign_q` | `0` |
+| `sign_r` | `0` |
+| `μ_q` | `0` |
+| `μ_r` | `0` |
\ No newline at end of file
diff --git a/docs/spec/ecall.md b/docs/spec/ecall.md
deleted file mode 100644
index 7e90cb363..000000000
--- a/docs/spec/ecall.md
+++ /dev/null
@@ -1 +0,0 @@
-ca# ECALL Chips
diff --git a/docs/spec/eq.md b/docs/spec/eq.md
new file mode 100644
index 000000000..215bca910
--- /dev/null
+++ b/docs/spec/eq.md
@@ -0,0 +1,67 @@
+# EQ Chip
+
+The `EQ` chip is an ALU chip that compares two values and outputs a bit indicating whether they are equal or not. It optionally inverts the result if the `invert` flag is set.
+
+## Variables
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `a` | `DWordWL` | The first input |
+| `b` | `DWordWL` | The second input |
+| `invert` | `Bit` | Whether to invert the result |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `res` | `Bit` | The result |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `diff` | `DWordHL` | The difference `a - b` |
+| `eq` | `Bit` | The bit indicating `a == b` |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` |  |
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `EQ-A1.i` | i ∈ [0, 1] | `IS_WORD[a[i]]` |
+| `EQ-A2.i` | i ∈ [0, 1] | `IS_WORD[b[i]]` |
+
+## Constraints
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `EQ-C1.i` | i ∈ [0, 3] | `IS_HALF[diff[i]]` | μ |
+| `EQ-C2` |  | `IS_BIT<invert>` |  |
+| `EQ-C3` |  | `SUB<diff::DWordWL; a, b>` |  |
+| `EQ-C4` |  | `ZERO[eq; diff[0] + diff[1] + diff[2] + diff[3]]` | μ |
+| `EQ-C5` |  | `res` = `eq` xor `invert` |  |
+| | | _polynomial:_ `res + 2 * eq * invert - eq - invert = 0` | |
+| `EQ-C6` |  | `ALU[[res, 0]; a, b, ⧼EQ⧽ + 64 * invert]` | -μ |
+
+## Padding
+
+The chip can be padded with the following values:
+
+| Column | Padding value |
+|--------|---------------|
+| `a` | `0` |
+| `b` | `0` |
+| `invert` | `0` |
+| `res` | `0` |
+| `diff` | `0` |
+| `eq` | `0` |
+| `μ` | `0` |
\ No newline at end of file
diff --git a/docs/spec/halt.md b/docs/spec/halt.md
index 1c516c628..4498e37c5 100644
--- a/docs/spec/halt.md
+++ b/docs/spec/halt.md
@@ -10,6 +10,12 @@ The  chip leverages  variable, spanning  columns and leverages  interactions:
 |------|------|-------------|
 | `timestamp` | `DWordWL` | timestamp at which to halt the program |
 
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | The `next_pc` value the CPU wrote during the instruction in which HALT was invoked |
+
 ## Assumptions
 
 It is assumed the input is range checked:
@@ -20,14 +26,15 @@ It is assumed the input is range checked:
 
 ## Constraints
 
-The  chip: + makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), + writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and + sets `pc` equal to `1` ([halt:c:pc]). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`; the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
+The `HALT` chip (1) makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), (2) writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and (3) sets `pc` equal to `1` ([halt:c:consume_pc], [halt:c:emit_pc]). Note that the writes performed by all these interactions --- except for the one to `pc` --- are accompanied by the maximum timestamp, `2^64-1`. This prevents any other operation involving memory from being executed hereafter. The `pc` is consumed and re-emitted at the same timestamp to enable padding rows for the CPU. This means that the verifier has to know the final timestamp at which a CPU padding `pc` was written in order to balance the final LogUp.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
 | `HALT-C1.i` | i ∈ [1, 9] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
 | `HALT-C2` |  | `MEMW[0::BaseField[8]; 1, (2 * 10)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
 | `HALT-C3.i` | i ∈ [11, 31] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
-| `HALT-C4` |  | `MEMW[1, (2 * 255)::DWordWL, ['arr', 1, 0, 0, 0, 0, 0, 0, 0], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C4.i` | i ∈ [0, 1] | `memory[1, [2 * 255 + i, 0], [timestamp[0] + 1, timestamp[1]], pc[i]]` | 1 |
+| `HALT-C5.i` | i ∈ [0, 1] | `memory[1, [2 * 255 + i, 0], [timestamp[0] + 1, timestamp[1]], [1, 0][i]]` | -1 |
 
 [ Observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there. ])
 
@@ -39,7 +46,7 @@ The HALT chip therefore contributes the following interaction to the lookup-argu
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `HALT-C5` | `ECALL[timestamp, 93::DWordWL]` | -1 |
+| `HALT-C6` | `ECALL[timestamp, 93::DWordWL]` | -1 |
 
 ## Padding
 
diff --git a/docs/spec/is_byte.md b/docs/spec/is_byte.md
new file mode 100644
index 000000000..b19561251
--- /dev/null
+++ b/docs/spec/is_byte.md
@@ -0,0 +1,25 @@
+# IS_BYTE Template
+
+When a chip leverages this template twice or more, implementors are encouraged to merge pairs of `IS_BYTE` interactions with identical conditions into `ARE_BYTES` interactions; the `IS_BYTE` template is included for convenience of notation, and to complete the specification of chips that use an odd number of `IS_BYTE` range checks.
+
+## Variables
+
+The `IS_BYTE` template leverages 1 interaction:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `X` | `BaseField` | Value for which to assert that it lies in the range $[0, 255]$. |
+
+### Condition
+
+| Name | Type | Description |
+|------|------|-------------|
+| `cond` | `BaseField` |  |
+
+## Constraints
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `IS_BYTE-C1` | `ARE_BYTES[0, X]` | cond |
\ No newline at end of file
diff --git a/docs/spec/keccak.md b/docs/spec/keccak.md
new file mode 100644
index 000000000..5bc573738
--- /dev/null
+++ b/docs/spec/keccak.md
@@ -0,0 +1,229 @@
+# KECCAK Accelerator
+
+The `KECCAK` chip applies the Keccak permutation `kappa` to a given memory range; other aspects of Keccak hashing (such as repeated permutation invocation, input padding and state initialization) fall outside the scope of this accelerator.
+
+This permutation `kappa: FF_2^1600 -> FF_2^1600` operates on 1600 bits and is composed of 24 applications of round-permutation `Lambda: FF_2^1600 times NN -> FF_2^1600`, where the additional parameter is the round constant. `Lambda` is defined as the composition `iota compose chi compose pi compose rho compose theta`, where only `iota` depends on the round constant.
+
+The Keccak accelerator comprises two chips: a core chip that interacts with the memory --- loading the input and writing the output --- and a round chip that applies the round permutation.
+
+## Core chip
+
+### Columns
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which the permutation is performed |
+| `addr` | `DWordBL` | memory address storing the first bit of the state |
+| `input_state` | `[['Byte', 8], 5][5]` | state at the start of executing the permutation |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `output_state` | `[['Byte', 8], 5][5]` | state after executing the permutation |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `state_ptr` | `['DWordHL', 5][5]` | memory addresses storing the entire state |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+### Constraints
+
+In this VM, we assign syscall number -2 to the `KECCAK` accelerator. The chip therefore contributes the following interaction to the lookup-argument:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `KECCAK-C1` | `ECALL[timestamp, (2^64 - 2)::DWordWL]` | -μ |
+
+The address containing the state to be permuted is passed in as argument `A0 = x10`. The following constraints describe that this address is read into `addr` ([keccak:c:read_addr]), from which `state_ptr` --- the collection of pointers to all lanes of the state --- is derived ([keccak:c:state_ptr]). The state is then read into `input_state`, while the `output_state` is written back to the indicated address ([keccak:c:load_store_state]).
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `KECCAK-C2` |  | `MEMW[addr; 1, (2 * 10)::DWordWL, addr, timestamp, 1, 0, 0]` | μ |
+| `KECCAK-C3.i` | x ∈ [0, 4], y ∈ [0, 4] | `ADD<state_ptr[x][y]::DWordWL; addr::DWordWL, (8 * (5 * y + x))::DWordWL>` |  |
+| `KECCAK-C4.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 3] | `IS_HALF[state_ptr[x][y][z]]` | μ |
+| `KECCAK-C5.i` | x ∈ [0, 4], y ∈ [0, 4] | `MEMW[input_state[x][y]; 0, state_ptr[x][y]::DWordWL, output_state[x][y], timestamp, 0, 0, 1]` | μ |
+
+Lastly, the input state is pushed to the Keccak-round function, while the output after 24 rounds is taken off the bus:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `KECCAK-C6` | `KECCAK[timestamp, 0, input_state]` | μ |
+| `KECCAK-C7` | `KECCAK[timestamp, 24, output_state]` | -μ |
+
+### Padding
+
+The `KECCAK` table can be padded to the next power of two with the following value assignments:
+
+| Column | Padding value |
+|--------|---------------|
+| `timestamp` | `0` |
+| `addr` | `0` |
+| `input_state` | `0` |
+| `output_state` | `0` |
+| `state_ptr` | `8 * [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19], [20, 21, 22, 23, 24]]` |
+| `μ` | `0` |
+
+## Round chip
+
+### Columns
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which the permutation is performed |
+| `round` | `BaseField` | index of the permutation round |
+| `start` | `[['Byte', 8], 5][5]` | state at the start of executing the permutation |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `Cxz` | `[['Byte', 8], 4][5]` | `Cxz[x][y][z]` is the XOR of `start[x][i][z]` over i ∈ [0, y + 1] |
+| `Cxz_left` | `['Byte', 8][5]` | the left-rotated component of `rotated_Cxz` |
+| `Cxz_right` | `['Bit', 4][5]` | the right-rotated component of `rotated_Cxz` (which is a single bit) |
+| `Dxz` | `['Byte', 8][5]` | `Cxz[(x - 1) mod 5][3][z] xor rotated_Cxz[(x + 1) mod 5][z]` |
+| `theta` | `[['Byte', 8], 5][5]` | $theta(`start`)$, the state after applying $theta$. |
+| `rot_left` | `[['Byte', 8], 5][5]` | the left-rotated component of $`theta[x,y]` <<< `rnc`$ |
+| `rot_right` | `[['Byte', 8], 5][5]` | the right-rotated component of $`theta[x,y]` <<< `rnc`$ |
+| `chi_ANDs` | `[['Byte', 8], 5][5]` | `(pi[(x + 1) mod 5][y][z] xor 255) and pi[(x + 2) mod 5][y][z]` (bitwise) |
+| `chi` | `[['Byte', 8], 5][5]` | $(chi compose pi compose rho compose theta)(`start`)$; the state after applying $chi$ |
+| `rc` | `Byte[8]` | round constants |
+| `iota` | `Byte[8]` | state update following from step $iota$. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `rotated_Cxz` | `['Byte', 8][5]` | the lane `Cxz[x][3]` rotated left by one bit, i.e. `Cxz[x][3] <<< 1` |
+| `out` | `[['Byte', 8], 5][5]` | state at the end of executing the permutation |
+| `rho` | `[['Byte', 8], 5][5]` | $(rho compose theta)(`start`)$; the state after applying $rho$ |
+| `pi` | `[['Byte', 8], 5][5]` | $(pi compose rho compose theta)(`start`)$; the state after applying $pi$ |
+
+**Definition of `rotated_Cxz`:**
+```
+rotated_Cxz[x][z] := Cxz_left[x][z] + Cxz_right[x][3]   (z = 0)
+rotated_Cxz[x][z] := Cxz_left[x][z]                     (z = 1)
+rotated_Cxz[x][z] := Cxz_left[x][z] + Cxz_right[x][0]   (z = 2)
+rotated_Cxz[x][z] := Cxz_left[x][z]                     (z = 3)
+rotated_Cxz[x][z] := Cxz_left[x][z] + Cxz_right[x][1]   (z = 4)
+rotated_Cxz[x][z] := Cxz_left[x][z]                     (z = 5)
+rotated_Cxz[x][z] := Cxz_left[x][z] + Cxz_right[x][2]   (z = 6)
+rotated_Cxz[x][z] := Cxz_left[x][z]                     (z = 7)
+```
+
+**Definition of `out`:**
+```
+out[x][y][z] := iota[z]        (x = 0, y = 0)
+out[x][y][z] := chi[x][y][z]   (x = 0, y ≠ 0)
+out[x][y][z] := chi[x][y][z]   (x ≠ 0, y = 0)
+out[x][y][z] := chi[x][y][z]   (x ≠ 0, y ≠ 0)
+```
+
+**Definition of `rho`:**
+```
+rho := (1 - rbc[x][y][0]) * (1 - rbc[x][y][1]) * (rot_left[x][y][z] + rot_right[x][y][(z - 2) mod 8]) + rbc[x][y][0] * (1 - rbc[x][y][1]) * (rot_left[x][y][(z - 2) mod 8] + rot_right[x][y][(z - 4) mod 8]) + (1 - rbc[x][y][0]) * rbc[x][y][1] * (rot_left[x][y][(z - 4) mod 8] + rot_right[x][y][(z - 6) mod 8]) + rbc[x][y][0] * rbc[x][y][1] * (rot_left[x][y][(z - 6) mod 8] + rot_right[x][y][z])
+```
+
+**Definition of `pi`:**
+```
+pi := rho[(x + 3 * y) mod 5][x][z]
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+`start` contains the state to which the permutation should be applied. Its three-dimensional array mimics the specification's three-dimensional state
+
+and orders the bits as prescribed.
+
+Rho rotates every lane by a rotation offset in `[0, 64)`. These offsets are identical for every round.
+
+We decompose each offset into three components: the lower nibble (4 bits) is represented by `rnc`, while the upper two bits are represented as `Bit`s in `rbc`. That is, `rho_offset[x][y] = rnc[x][y] + 16 * rbc[x][y][0] + 32 * rbc[x][y][1]`.
+
+### Constraints
+
+The following constraints ensure that `theta` captures the state after applying the first subpermutation of the round-permutation: `theta`. Note here that `Cxz_left` and `Cxz_right` do have to be range-checked; it cannot be assumed that this implicitly follows from [keccak:c:Dxz] combined with `rotated_Cxz`'s definition.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `KECCAK_RND-C1.i` | x ∈ [0, 4], z ∈ [0, 7] | `BYTE_ALU[Cxz[x][0][z]; ⧼XOR⧽, start[x][0][z], start[x][1][z]]` | μ |
+| `KECCAK_RND-C2.i` | x ∈ [0, 4], y ∈ [2, 4], z ∈ [0, 7] | `BYTE_ALU[Cxz[x][y - 1][z]; ⧼XOR⧽, Cxz[x][y - 2][z], start[x][y][z]]` | μ |
+| `KECCAK_RND-C3.i` | x ∈ [0, 4], z ∈ [0, 3] | `HWSL[[(Cxz_left[x]::DWordHL)[z], Cxz_right[x][z]::Half]; (Cxz[x][3]::DWordHL)[z], 1]` | μ |
+| `KECCAK_RND-C4.i` | x ∈ [0, 4], z ∈ [0, 7] | μ ⇒ `IS_BYTE<Cxz_left[x][z]>` |  |
+| `KECCAK_RND-C5.i` | x ∈ [0, 4], z ∈ [0, 3] | `IS_BIT<Cxz_right[x][z]>` |  |
+| `KECCAK_RND-C6.i` | x ∈ [0, 4], z ∈ [0, 7] | `BYTE_ALU[Dxz[x][z]; ⧼XOR⧽, Cxz[(x - 1) mod 5][3][z], rotated_Cxz[(x + 1) mod 5][z]]` | μ |
+| `KECCAK_RND-C7.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 7] | `BYTE_ALU[theta[x][y][z]; ⧼XOR⧽, start[x][y][z], Dxz[x][z]]` | μ |
+
+Next, we constrain that `rho` captures the state after applying subpermutation `rho`. Note here as well that `rot_left` and `rot_right` do have to be range-checked; it cannot be assumed that this implicitly follows from later constraints.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `KECCAK_RND-C8.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 3] | `HWSL[[(rot_left[x][y]::DWordHL)[z], (rot_right[x][y]::DWordHL)[z]]; (theta[x][y]::DWordHL)[z], rnc[x][y]]` | μ |
+| `KECCAK_RND-C9.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 7] | μ ⇒ `IS_BYTE<rot_left[x][y][z]>` |  |
+| `KECCAK_RND-C10.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 7] | μ ⇒ `IS_BYTE<rot_right[x][y][z]>` |  |
+
+Observe that the lane-permutation performed by `pi` is absorbed in `pi`'s definition. The next permutation that is constrained is `chi`:
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `KECCAK_RND-C11.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 7] | `BYTE_ALU[chi_ANDs[x][y][z]; ⧼AND⧽, 255 - pi[(x + 1) mod 5][y][z], pi[(x + 2) mod 5][y][z]]` | μ |
+| `KECCAK_RND-C12.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 7] | `BYTE_ALU[chi[x][y][z]; ⧼XOR⧽, pi[x][y][z], chi_ANDs[x][y][z]]` | μ |
+
+Next, the round constants are added to one of the lanes in the state. `iota` contains the updated lane. In the definition of `out`, the outputs of `chi` and `iota` are combined to construct the output of the permutation.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `KECCAK_RND-C13.i` | z ∈ [0, 7] | `BYTE_ALU[iota[z]; ⧼XOR⧽, chi[0][0][z], rc[z]]` | μ |
+
+Lastly, the round chip contributes the following interactions to the lookup:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `KECCAK_RND-C14` | `KECCAK[timestamp, round, start]` | -μ |
+| `KECCAK_RND-C15` | `KECCAK[timestamp, round + 1, out]` | μ |
+| `KECCAK_RND-C16` | `KECCAK_RC[rc; round]` | -μ |
+
+### Notes/potential optimizations
+
+- one does not have to repeat `addr` in `state_ptr`; this saves 4 columns and 4 `IS_HALF` checks. - step `rho` does not need to be applied to `state[0][0]`; it has a zero shift. This saves 16 columns and 4 `HWSL` interactions. - when the outputs of `HWSL` are `Byte`s mapped as `Half`s, we find that out of every four output bytes, at least one is zero. Since `rnc` is constant, [keccak:c:rho_rotation] makes those zero-bytes show up in `rot_left` and `rot_right` at constant locations. This means 96 columns can be removed from the chip at no cost. Likewise, 96 `IS_BYTE` interactions can be dropped from [keccak:c:range_rot_left] and [keccak:c:range_rot_right]. - the shift-constants are equivalent to `1 mod 16` for `(x, y) = (1, 0)` and `-1 mod 16` for `(x, y) = (2, 3)`. This means that for those lanes it suffices to constrain `rot_left`/`rot_right` as `Bit`s rather than `Byte`s, saving an additional 8 `IS_BYTE` interactions. - `rc[2] = rc[4] = rc[5] = rc[6] = 0`. As such, those elements need not be stored in `rc`, and need not be XORed into the state in the `iota`-step. This saves 8 columns and 4 `XOR_BYTE` interactions. - when executed in large volumes, `KECCAK_RND` could benefit from having a three-way XOR lookup table. With this in place, the 80 interactions in [keccak:c:theta_cxz_start] and [keccak:c:theta_cxz] could be dropped. Likewise, 80 columns could be removed from the chip (a \~5% savings).
+
+## Round constant lookup
+
+### Columns
+
+We provide the round constants through a short precomputed lookup table: `KECCAK_RC`.
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `round` | `BaseField` |  |
+| `RC` | `Byte[8]` | round constants for the given `round` |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` |  |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `KECCAK_RC-C1` | `KECCAK_RC[RC; round]` | -μ |
\ No newline at end of file
diff --git a/docs/spec/load.md b/docs/spec/load.md
index 51f80997d..3f8b36837 100644
--- a/docs/spec/load.md
+++ b/docs/spec/load.md
@@ -10,7 +10,7 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 
 | Name | Type | Description |
 |------|------|-------------|
-| `base_address` | `DWordWL` | The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is |
+| `base_address` | `DWordWL` | The base address to read from, gets offset by $[0, 7]$, depending on how big the access is |
 | `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
 | `read2` | `Bit` | Whether to read exactly 2 bytes |
 | `read4` | `Bit` | Whether to read exactly 4 bytes |
@@ -51,12 +51,7 @@ read1 := μ - read2 - read4 - read8
 | Tag | Range | Description |
 |-----|-------|-------------|
 | `LOAD-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
-| `LOAD-A2` |  | `IS_BIT<signed>` |
-| `LOAD-A3` |  | `IS_BIT<read2>` |
-| `LOAD-A4` |  | `IS_BIT<read4>` |
-| `LOAD-A5` |  | `IS_BIT<read8>` |
-| `LOAD-A6` |  | `IS_BIT<read2 + read4 + read8>` |
-| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+| `LOAD-A2.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
 ## Constraints
 
@@ -64,25 +59,42 @@ The chip delegates the actual memory interaction to the `MEMW` chip, and ensures
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `LOAD-C1` |  | `read2` + `read4` + `read8` => `μ` |  |
+| `LOAD-C1` |  | `IS_BIT<signed>` |  |
+| `LOAD-C2` |  | `IS_BIT<read2>` |  |
+| `LOAD-C3` |  | `IS_BIT<read4>` |  |
+| `LOAD-C4` |  | `IS_BIT<read8>` |  |
+| `LOAD-C5` |  | `IS_BIT<read2 + read4 + read8>` |  |
+| `LOAD-C6` |  | `read2` + `read4` + `read8` => `μ` |  |
 | | | _polynomial:_ `(read2 + read4 + read8) * (1 - μ) = 0` | |
-| `LOAD-C2` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
-| `LOAD-C3` |  | `MSB8[sign_bit; res[0]]` | read1 |
-| `LOAD-C4` |  | `MSB8[sign_bit; res[1]]` | read2 |
-| `LOAD-C5` |  | `MSB8[sign_bit; res[3]]` | read4 |
-| `LOAD-C6.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| `LOAD-C7` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
+| `LOAD-C8` |  | `MSB8[sign_bit; res[0]]` | read1 |
+| `LOAD-C9` |  | `MSB8[sign_bit; res[1]]` | read2 |
+| `LOAD-C10` |  | `MSB8[sign_bit; res[3]]` | read4 |
+| `LOAD-C11.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
 | | | _polynomial:_ `(1 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `LOAD-C7.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| `LOAD-C12.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
 | | | _polynomial:_ `(1 - read4 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `LOAD-C8` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
+| `LOAD-C13` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
 | | | _polynomial:_ `(1 - read2 - read4 - read8) * (res[1] - signed * sign_bit * 255) = 0` | |
 
 The chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8, signed]` | -μ |
+| `LOAD-C14` | `MEMOP[res::DWordWL; timestamp, base_address, 0::DWordWL, 2 * signed + 4 * read2 + 8 * read4 + 16 * read8]` | -μ |
 
 ## Padding
 
-The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
+The table can be padded to the next power of two with the following value assignments:
+
+| Column | Padding value |
+|--------|---------------|
+| `base_address` | `0` |
+| `timestamp` | `0` |
+| `read2` | `0` |
+| `read4` | `0` |
+| `read8` | `0` |
+| `signed` | `0` |
+| `res` | `0` |
+| `sign_bit` | `0` |
+| `μ` | `0` |
\ No newline at end of file
diff --git a/docs/spec/lt.md b/docs/spec/lt.md
index a043d4db9..8f412e009 100644
--- a/docs/spec/lt.md
+++ b/docs/spec/lt.md
@@ -1,6 +1,6 @@
 # LT Chip
 
-The  chip constrains an indicator bit for the less-than relation, signed or unsigned.
+The `LT` chip constrains an indicator bit for the less-than relation, signed or unsigned. If the `invert` flag is set, it inverts the result.
 
 ## Variables
 
@@ -13,12 +13,13 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 | `lhs` | `DWordHHW` | The left operand |
 | `rhs` | `DWordHHW` | The right operand |
 | `signed` | `Bit` | whether to interpret `lhs` and `rhs` as signed integers (1) or not (0) |
+| `invert` | `Bit` | Whether to invert the result |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `lt` | `Bit` | Whether $`lhs` < `rhs`$, taking `signed` into account |
+| `res` | `Bit` | The result |
 
 ### Auxiliary
 
@@ -27,6 +28,7 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 | `lhs_sub_rhs` | `DWordHL` | $`lhs` - `rhs`$ |
 | `lhs_msb` | `Bit` | The most significant bit of `lhs` |
 | `rhs_msb` | `Bit` | The most significant bit of `rhs` |
+| `lt` | `Bit` | Whether $`lhs` < `rhs`$, taking `signed` into account |
 
 ### Virtual
 
@@ -60,11 +62,10 @@ We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
 |-----|-------|-------------|
 | `LT-A1` |  | `IS_WORD[lhs[0]]` |
 | `LT-A2` |  | `IS_WORD[rhs[0]]` |
-| `LT-A3` |  | `IS_BIT<signed>` |
 
 ## Constraints
 
-We first constrain that all variables correspond to their definition. For the defining constraint of `lt`, [lt:c:lt], observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
+We first constrain that all inputs are range checked and all variables correspond to their definition. For the defining constraint of `lt`, [lt:c:lt], observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
 
 We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a positive one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
 
@@ -78,26 +79,50 @@ The polynomial `P` can be simplified to a total degree of two. We claim that the
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
-| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
-| `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
+| `LT-C1` | `IS_HALF[lhs[1]]` | μ |
+| `LT-C2` | `IS_HALF[rhs[1]]` | μ |
+| `LT-C3` | `IS_BIT<signed>` |  |
+| `LT-C4` | `IS_BIT<invert>` |  |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LT-C5` | `MSB16[lhs_msb; lhs[2]]` | μ |
+| `LT-C6` | `MSB16[rhs_msb; rhs[2]]` | μ |
+| `LT-C7` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
 | | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
-| `LT-C4` | `IS_HALF[lhs[1]]` | μ |
-| `LT-C5` | `IS_HALF[rhs[1]]` | μ |
+| `LT-C8` | `res` = `lt` xor `invert` |  |
+| | _polynomial:_ `res + 2 * lt * invert - lt - invert = 0` | |
 
 And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
-| `LT-C7.i` | i ∈ [0, 3] | `IS_HALF[lhs_sub_rhs[i]]` | μ |
+| `LT-C9.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
+| `LT-C10.i` | i ∈ [0, 3] | `IS_HALF[lhs_sub_rhs[i]]` | μ |
 
 The chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
+| `LT-C11` | `ALU[[res, 0]; lhs::DWordWL, rhs::DWordWL, ⧼LT⧽ + 32 * signed + 64 * invert]` | -μ |
 
 ## Padding
 
-The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
+The table can be padded to the next power of two with the following value assignments:
+
+| Column | Padding value |
+|--------|---------------|
+| `lhs` | `0` |
+| `rhs` | `0` |
+| `signed` | `0` |
+| `invert` | `0` |
+| `res` | `0` |
+| `lhs_sub_rhs` | `0` |
+| `lhs_msb` | `0` |
+| `rhs_msb` | `0` |
+| `lt` | `0` |
+| `μ` | `0` |
+
+## Potential optimizations
+
+- Split the chip into a signed and an unsigned chip, making the unsigned version cheaper.
\ No newline at end of file
diff --git a/docs/spec/memory.md b/docs/spec/memory.md
index ae02f69f0..efbcb8944 100644
--- a/docs/spec/memory.md
+++ b/docs/spec/memory.md
@@ -58,6 +58,33 @@ We present here a set of constraints on the `PAGE` table that
 
 For zero-initialized pages, `init` can be a constant `0`, and hence doesn't need a column, nor a range check.
 
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `offset` | `RowIndex` | The offset from the page base address. |
+| `init` | `Byte` | The initial value of this address. Can be replaced by a constant zero for zero-initialization |
+| `fini` | `Byte` | The final value this address took |
+| `timestamp` | `DWordWL` | The timestamp at which this address was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `address` | `DWordWL` | The result of adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table. |
+
+**Definition of `address`:**
+```
+address := page + offset * 1::DWordWL
+```
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `PAGE-C1` | `IS_BYTE<init>` |  |
+| `PAGE-C2` | `IS_BYTE<fini>` |  |
+| `PAGE-C3` | `memory[0, address, 0::DWordWL, init]` | -1 |
+| `PAGE-C4` | `memory[0, address, timestamp, fini]` | 1 |
+
 We identify a few alternatives that would achieve the desired initialization/finalization functionalities, and consider their respective trade-offs.
 
 _"Free-zero" initialization_
diff --git a/docs/spec/memw.md b/docs/spec/memw.md
index 2fc1ce831..32022d081 100644
--- a/docs/spec/memw.md
+++ b/docs/spec/memw.md
@@ -52,7 +52,7 @@ w4 := write4 + write8
 
 **Definition of `address_add`:**
 ```
-address_add := ['arr', ['-', ['+', ['idx', 'base_address', 0], 'i', 1], ['*', ['^', 2, 32], ['idx', 'carry', 'i']]], ['+', ['idx', 'base_address', 1], ['idx', 'carry', 'i']]]
+address_add := [base_address[0] + i + 1 - 2^32 * carry[i], base_address[1] + carry[i]]
 ```
 
 **Definition of `μ_sum`:**
@@ -78,6 +78,15 @@ address_add := ['arr', ['-', ['+', ['idx', 'base_address', 0], 'i', 1], ['*', ['
 | `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
 | `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
+Some of the assumptions can be checked with only arithmetic constraints, so we provide these below.
+
+| Tag | Description |
+|-----|-------------|
+| `MEMW-C1` | `IS_BIT<write2>` |
+| `MEMW-C2` | `IS_BIT<write4>` |
+| `MEMW-C3` | `IS_BIT<write8>` |
+| `MEMW-C4` | `IS_BIT<write2 + write4 + write8>` |
+
 Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. Still, these properties are necessary for the consistency of the system as a whole, and therefore we document it here, keeping the type information as a reading help.
 
 ## Constraints
@@ -86,16 +95,16 @@ Depending on the values of `write2`, `write4` and `write8`, the addresses follow
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-C1` |  | `IS_BIT<μ_read>` |  |
-| `MEMW-C2` |  | `IS_BIT<μ_write>` |  |
-| `MEMW-C3` |  | `IS_BIT<μ_sum>` |  |
-| `MEMW-C4` |  | `w2` => `μ_sum` |  |
+| `MEMW-C5` |  | `IS_BIT<μ_read>` |  |
+| `MEMW-C6` |  | `IS_BIT<μ_write>` |  |
+| `MEMW-C7` |  | `IS_BIT<μ_sum>` |  |
+| `MEMW-C8` |  | `w2` => `μ_sum` |  |
 | | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C5.i` | i ∈ [0, 6] | `IS_BIT<carry[i]>` |  |
-| `MEMW-C6` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `MEMW-C7` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `MEMW-C8.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `MEMW-C9.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+| `MEMW-C9.i` | i ∈ [0, 6] | `IS_BIT<carry[i]>` |  |
+| `MEMW-C10` |  | `ALU[[1, 0]; old_timestamp[0], timestamp, ⧼LT⧽]` | μ_sum |
+| `MEMW-C11` |  | `ALU[[1, 0]; old_timestamp[1], timestamp, ⧼LT⧽]` | w2 |
+| `MEMW-C12.i` | i ∈ [2, 3] | `ALU[[1, 0]; old_timestamp[i], timestamp, ⧼LT⧽]` | w4 |
+| `MEMW-C13.i` | i ∈ [4, 7] | `ALU[[1, 0]; old_timestamp[i], timestamp, ⧼LT⧽]` | write8 |
 
 As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures it is appropriately range checked (this assumes no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
 
@@ -105,26 +114,41 @@ The chip adds the following tuples to the lookup argument, to effectuate that pa
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW-CM10` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `MEMW-CM11` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM12` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM13` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `MEMW-CM14.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM15.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `MEMW-CM16.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM17.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+| `MEMW-CM14` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM15` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM16` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM17` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM18.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM19.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM20.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM21.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
 
 This chip contributes the following to the lookup argument:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW-CO18` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_read |
-| `MEMW-CO19` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_write |
+| `MEMW-CO22` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_read |
+| `MEMW-CO23` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_write |
 
 ## Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
+| Column | Padding value |
+|--------|---------------|
+| `is_register` | `0` |
+| `base_address` | `0` |
+| `value` | `0` |
+| `timestamp` | `0` |
+| `write2` | `0` |
+| `write4` | `0` |
+| `write8` | `0` |
+| `old` | `0` |
+| `carry` | `0` |
+| `old_timestamp` | `0` |
+| `μ_read` | `0` |
+| `μ_write` | `0` |
+
 ## Read-size aligned fast path
 
 When a memory access happens at an address with proper alignment for its access size (i.e., adding the access size to `base_address`'s lowest limb does not overflow), and all accessed elements were last accessed at the same timestamp, we can instead use the `MEMW_A` chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all values of `add_limb_overflow` would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
@@ -197,36 +221,59 @@ w4 := write4 + write8
 | `MEMW_A-A6` |  | `IS_BIT<write2 + write4 + write8>` |
 | `MEMW_A-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
+Some of the assumptions can be checked with only arithmetic constraints, so we provide these below.
+
+| Tag | Description |
+|-----|-------------|
+| `MEMW_A-C1` | `IS_BIT<write2>` |
+| `MEMW_A-C2` | `IS_BIT<write4>` |
+| `MEMW_A-C3` | `IS_BIT<write8>` |
+| `MEMW_A-C4` | `IS_BIT<write2 + write4 + write8>` |
+
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW_A-C1` | `IS_HALF[base_address[0] + write2 + 3 * write4 + 7 * write8]` | μ_sum |
-| `MEMW_A-C2` | `IS_BIT<μ_read>` |  |
-| `MEMW_A-C3` | `IS_BIT<μ_write>` |  |
-| `MEMW_A-C4` | `IS_BIT<μ_sum>` |  |
-| `MEMW_A-C5` | `w2` => `μ_sum` |  |
+| `MEMW_A-C9` | `IS_HALF[base_address[0] + write2 + 3 * write4 + 7 * write8]` | μ_sum |
+| `MEMW_A-C10` | `IS_BIT<μ_read>` |  |
+| `MEMW_A-C11` | `IS_BIT<μ_write>` |  |
+| `MEMW_A-C12` | `IS_BIT<μ_sum>` |  |
+| `MEMW_A-C13` | `w2` => `μ_sum` |  |
 | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW_A-C6` | `LT[1; old_timestamp, timestamp, 0]` | μ_sum |
+| `MEMW_A-C14` | `ALU[[1, 0]; old_timestamp, timestamp, ⧼LT⧽]` | μ_sum |
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW_A-CM7` |  | `memory[is_register, base_address::DWordWL, old_timestamp, old[0]]` | μ_sum |
-| `MEMW_A-CM8` |  | `memory[is_register, base_address::DWordWL, timestamp, value[0]]` | -μ_sum |
-| `MEMW_A-CM9` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, old_timestamp, old[1]]` | w2 |
-| `MEMW_A-CM10` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, timestamp, value[1]]` | -w2 |
-| `MEMW_A-CM11.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | w4 |
-| `MEMW_A-CM12.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -w4 |
-| `MEMW_A-CM13.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | write8 |
-| `MEMW_A-CM14.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -write8 |
+| `MEMW_A-CM15` |  | `memory[is_register, base_address::DWordWL, old_timestamp, old[0]]` | μ_sum |
+| `MEMW_A-CM16` |  | `memory[is_register, base_address::DWordWL, timestamp, value[0]]` | -μ_sum |
+| `MEMW_A-CM17` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, old_timestamp, old[1]]` | w2 |
+| `MEMW_A-CM18` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, timestamp, value[1]]` | -w2 |
+| `MEMW_A-CM19.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | w4 |
+| `MEMW_A-CM20.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -w4 |
+| `MEMW_A-CM21.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | write8 |
+| `MEMW_A-CM22.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -write8 |
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW_A-CO15` | `MEMW[old; is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_read |
-| `MEMW_A-CO16` | `MEMW[is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_write |
+| `MEMW_A-CO23` | `MEMW[old; is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_read |
+| `MEMW_A-CO24` | `MEMW[is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_write |
 
 ### Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
+| Column | Padding value |
+|--------|---------------|
+| `is_register` | `0` |
+| `base_address` | `0` |
+| `value` | `0` |
+| `timestamp` | `0` |
+| `write2` | `0` |
+| `write4` | `0` |
+| `write8` | `0` |
+| `old` | `0` |
+| `old_timestamp` | `0` |
+| `μ_read` | `0` |
+| `μ_write` | `0` |
+
 ## Register fast-path
 
 The `MEMW_R` chip provides a fast-path for accessing registers. This fast-path leverages that registers (1) can be addressed using a `Byte`, rather than a full `DWord`, (2) are constantly accessed, i.e., `timestamp - old_timestamp` is small, and (3) have a fixed access pattern, to achieve a footprint that is significantly smaller than both `MEMW` and `MEMW_A`.
@@ -268,7 +315,7 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 
 **Definition of `old_timestamp`:**
 ```
-old_timestamp := ['arr', 'old_timestamp_lo', ['idx', 'timestamp', 1]]::DWordWL
+old_timestamp := [old_timestamp_lo, timestamp[1]]::DWordWL
 ```
 
 **Definition of `μ_sum`:**
@@ -306,8 +353,8 @@ With ``old_timestamp`<`timestamp`` asserted, `old` is read from the register ([r
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW_R-C2.i` | i ∈ [0, 1] | `memory[1, ['arr', ['cast', ['+', ['*', 2, 'address'], 'i'], 'Word'], 0], old_timestamp, old[i]]` | μ_sum |
-| `MEMW_R-C3.i` | i ∈ [0, 1] | `memory[1, ['arr', ['cast', ['+', ['*', 2, 'address'], 'i'], 'Word'], 0], timestamp, val[i]]` | -μ_sum |
+| `MEMW_R-C2.i` | i ∈ [0, 1] | `memory[1, [(2 * address + i)::Word, 0], old_timestamp, old[i]]` | μ_sum |
+| `MEMW_R-C3.i` | i ∈ [0, 1] | `memory[1, [(2 * address + i)::Word, 0], timestamp, val[i]]` | -μ_sum |
 
 This chip can either just write (`μ_write = 1`), or both read and write (`μ_read = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
 
@@ -321,13 +368,23 @@ Lastly, this chip contributes the following interactions to the logup:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW_R-C7` | `MEMW[['arr', ['idx', 'old', 0], ['idx', 'old', 1], 0, 0, 0, 0, 0, 0]; 1, ['arr', ['cast', ['*', 2, 'address'], 'Word'], 0], ['arr', ['idx', 'val', 0], ['idx', 'val', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_read |
-| `MEMW_R-C8` | `MEMW[1, ['arr', ['cast', ['*', 2, 'address'], 'Word'], 0], ['arr', ['idx', 'val', 0], ['idx', 'val', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_write |
+| `MEMW_R-C7` | `MEMW[[old[0], old[1], 0, 0, 0, 0, 0, 0]; 1, [(2 * address)::Word, 0], [val[0], val[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_read |
+| `MEMW_R-C8` | `MEMW[1, [(2 * address)::Word, 0], [val[0], val[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_write |
 
 ### Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
+| Column | Padding value |
+|--------|---------------|
+| `address` | `0` |
+| `timestamp` | `0` |
+| `val` | `0` |
+| `old` | `0` |
+| `old_timestamp_lo` | `0` |
+| `μ_read` | `0` |
+| `μ_write` | `0` |
+
 ## Notes/optimizations
 
 The following ideas may prove to be optimizations for the `MEMW`/`MEMW_A`/`MEMW_R` chips: (1) a `MEMB` chip that does a one-byte write, to remove `old_timestamp` from here (uncertain tradeoffs); (2) adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups, which may make some GKR things faster if there are known zeroes; (3) for the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
\ No newline at end of file
diff --git a/docs/spec/mul.md b/docs/spec/mul.md
index e8fcf2fbc..f6fe129bc 100644
--- a/docs/spec/mul.md
+++ b/docs/spec/mul.md
@@ -78,15 +78,6 @@ carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
 
 `mat(delim: , top; bottom)` }
 
-## Assumptions
-
-The following range checks are assumed to be performed/enforced outside of this chip:
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MUL-A1.i` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` |
-| `MUL-A2.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
-
 ## Constraints
 
 ### Overview
@@ -107,11 +98,15 @@ We constrain `lhs_is_negative` and `rhs_is_negative` according to their definiti
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
-| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
-| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
-| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
-| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
+| `MUL-C1` |  | `IS_BIT<lhs_signed>` |  |
+| `MUL-C2` |  | `IS_BIT<rhs_signed>` |  |
+| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` | μ_sum |
+| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` | μ_sum |
+| `MUL-C5` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
+| `MUL-C6` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
+| `MUL-C7.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
+| `MUL-C8.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
+| `MUL-C9.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
 
 ### Product
 
@@ -119,7 +114,7 @@ We constrain `lhs_is_negative` and `rhs_is_negative` according to their definiti
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
+| `MUL-C10.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
 | | | _polynomial:_ `Σ_k = 0^1 2^(16 * k) * Σ_j = 0^2 * i + k lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
 
 ### Lookup
@@ -128,13 +123,27 @@ The  chip contributes the following to the lookup:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
-| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
+| `MUL-C11` | `ALU[lo::DWordWL; lhs::DWordWL, rhs::DWordWL, ⧼MUL⧽ + 32 * lhs_signed + 64 * rhs_signed]` | -μ_lo |
+| `MUL-C12` | `ALU[hi::DWordWL; lhs::DWordWL, rhs::DWordWL, ⧼MUL⧽ + 32 * lhs_signed + 64 * rhs_signed + 128]` | -μ_hi |
 
 ## Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
+| Column | Padding value |
+|--------|---------------|
+| `lhs` | `0` |
+| `lhs_signed` | `0` |
+| `rhs` | `0` |
+| `rhs_signed` | `0` |
+| `lo` | `0` |
+| `hi` | `0` |
+| `lhs_is_negative` | `0` |
+| `rhs_is_negative` | `0` |
+| `raw_product` | `0` |
+| `μ_lo` | `0` |
+| `μ_hi` | `0` |
+
 ## Notes/optimizations
 
 - `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere. - As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
\ No newline at end of file
diff --git a/docs/spec/sha256.md b/docs/spec/sha256.md
index 61b756060..cc04de723 100644
--- a/docs/spec/sha256.md
+++ b/docs/spec/sha256.md
@@ -46,14 +46,14 @@ The first responsibility of the chip is to read the current state and message ch
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHA256-C1` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C1` |  | `MEMW[[(m_addr[0]::DWordWL)[0], (m_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, [(m_addr[0]::DWordWL)[0], (m_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
 | `SHA256-C2.i` | i ∈ [0, 7], j ∈ [0, 3] | `IS_HALF[m_addr[i][j]]` | μ |
 | `SHA256-C3.i` | i ∈ [1, 7] | `ADD<m_addr[i]::DWordWL; m_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
-| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]]; 0, m_addr[i]::DWordWL, ['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]], timestamp, 0, 0, 1]` | μ |
-| `SHA256-C5` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[[m[8 * i + 3], m[8 * i + 2], m[8 * i + 1], m[8 * i + 0], m[8 * i + 7], m[8 * i + 6], m[8 * i + 5], m[8 * i + 4]]; 0, m_addr[i]::DWordWL, [m[8 * i + 3], m[8 * i + 2], m[8 * i + 1], m[8 * i + 0], m[8 * i + 7], m[8 * i + 6], m[8 * i + 5], m[8 * i + 4]], timestamp, 0, 0, 1]` | μ |
+| `SHA256-C5` |  | `MEMW[[(h_addr[0]::DWordWL)[0], (h_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, [(h_addr[0]::DWordWL)[0], (h_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
 | `SHA256-C6.i` | i ∈ [0, 3], j ∈ [0, 3] | `IS_HALF[h_addr[i][j]]` | μ |
 | `SHA256-C7.i` | i ∈ [1, 3] | `ADD<h_addr[i]::DWordWL; h_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
-| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[['arr', ['idx', 'h', ['+', ['*', 8, 'i'], 3]], ['idx', 'h', ['+', ['*', 8, 'i'], 2]], ['idx', 'h', ['+', ['*', 8, 'i'], 1]], ['idx', 'h', ['+', ['*', 8, 'i'], 0]], ['idx', 'h', ['+', ['*', 8, 'i'], 7]], ['idx', 'h', ['+', ['*', 8, 'i'], 6]], ['idx', 'h', ['+', ['*', 8, 'i'], 5]], ['idx', 'h', ['+', ['*', 8, 'i'], 4]]]; 0, h_addr[i]::DWordWL, ['arr', ['idx', 'out', ['+', ['*', 8, 'i'], 3]], ['idx', 'out', ['+', ['*', 8, 'i'], 2]], ['idx', 'out', ['+', ['*', 8, 'i'], 1]], ['idx', 'out', ['+', ['*', 8, 'i'], 0]], ['idx', 'out', ['+', ['*', 8, 'i'], 7]], ['idx', 'out', ['+', ['*', 8, 'i'], 6]], ['idx', 'out', ['+', ['*', 8, 'i'], 5]], ['idx', 'out', ['+', ['*', 8, 'i'], 4]]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
+| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[[h[8 * i + 3], h[8 * i + 2], h[8 * i + 1], h[8 * i + 0], h[8 * i + 7], h[8 * i + 6], h[8 * i + 5], h[8 * i + 4]]; 0, h_addr[i]::DWordWL, [out[8 * i + 3], out[8 * i + 2], out[8 * i + 1], out[8 * i + 0], out[8 * i + 7], out[8 * i + 6], out[8 * i + 5], out[8 * i + 4]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
 
 Then we prepare the message schedule, by emitting the input chunk with multiplicities corresponding to the number of times it will be read during a compression evaluation. The `SHA256MSGSCHED` chip itself is implicitly invoked by itself and `SHA256ROUND`, setting the `amount` column appropriately for the number of times the `w` value is required.
 
@@ -68,20 +68,31 @@ And finally, we provide the boundaries for the  chip and the final addition of t
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHA256-C13` |  | `SHA256ROUND[timestamp, ['arr', ['+', ['*', ['^', 2, 0], ['idx', 'h', 3]], ['*', ['^', 2, 8], ['idx', 'h', 2]], ['*', ['^', 2, 16], ['idx', 'h', 1]], ['*', ['^', 2, 24], ['idx', 'h', 0]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 7]], ['*', ['^', 2, 8], ['idx', 'h', 6]], ['*', ['^', 2, 16], ['idx', 'h', 5]], ['*', ['^', 2, 24], ['idx', 'h', 4]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 11]], ['*', ['^', 2, 8], ['idx', 'h', 10]], ['*', ['^', 2, 16], ['idx', 'h', 9]], ['*', ['^', 2, 24], ['idx', 'h', 8]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 15]], ['*', ['^', 2, 8], ['idx', 'h', 14]], ['*', ['^', 2, 16], ['idx', 'h', 13]], ['*', ['^', 2, 24], ['idx', 'h', 12]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 19]], ['*', ['^', 2, 8], ['idx', 'h', 18]], ['*', ['^', 2, 16], ['idx', 'h', 17]], ['*', ['^', 2, 24], ['idx', 'h', 16]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 23]], ['*', ['^', 2, 8], ['idx', 'h', 22]], ['*', ['^', 2, 16], ['idx', 'h', 21]], ['*', ['^', 2, 24], ['idx', 'h', 20]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 27]], ['*', ['^', 2, 8], ['idx', 'h', 26]], ['*', ['^', 2, 16], ['idx', 'h', 25]], ['*', ['^', 2, 24], ['idx', 'h', 24]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 31]], ['*', ['^', 2, 8], ['idx', 'h', 30]], ['*', ['^', 2, 16], ['idx', 'h', 29]], ['*', ['^', 2, 24], ['idx', 'h', 28]]]], 0]` | μ |
+| `SHA256-C13` |  | `SHA256ROUND[timestamp, [2^0 * h[3] + 2^8 * h[2] + 2^16 * h[1] + 2^24 * h[0], 2^0 * h[7] + 2^8 * h[6] + 2^16 * h[5] + 2^24 * h[4], 2^0 * h[11] + 2^8 * h[10] + 2^16 * h[9] + 2^24 * h[8], 2^0 * h[15] + 2^8 * h[14] + 2^16 * h[13] + 2^24 * h[12], 2^0 * h[19] + 2^8 * h[18] + 2^16 * h[17] + 2^24 * h[16], 2^0 * h[23] + 2^8 * h[22] + 2^16 * h[21] + 2^24 * h[20], 2^0 * h[27] + 2^8 * h[26] + 2^16 * h[25] + 2^24 * h[24], 2^0 * h[31] + 2^8 * h[30] + 2^16 * h[29] + 2^24 * h[28]], 0]` | μ |
 | `SHA256-C14` |  | `SHA256ROUND[timestamp, last_round_out, 64]` | -μ |
-| `SHA256-C15.i` | i ∈ [0, 31] | `IS_BYTE[out[i]]` | μ |
-| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'out', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'out', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'out', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'out', ['+', ['*', 4, 'i'], 0]]]]]; ['arr', 0, ['idx', 'last_round_out', 'i']], ['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'h', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'h', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'h', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'h', ['+', ['*', 4, 'i'], 0]]]]]>` |  |
+| `SHA256-C15.i` | i ∈ [0, 31] | μ ⇒ `IS_BYTE<out[i]>` |  |
+| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<[0, 2^0 * out[4 * i + 3] + 2^8 * out[4 * i + 2] + 2^16 * out[4 * i + 1] + 2^24 * out[4 * i + 0]]; [0, last_round_out[i]], [0, 2^0 * h[4 * i + 3] + 2^8 * h[4 * i + 2] + 2^16 * h[4 * i + 1] + 2^24 * h[4 * i + 0]]>` |  |
 
 In this VM, we assign syscall number -1 to the `SHA256` accelerator. The chip therefore contributes the following interaction to the lookup-argument:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
 | `SHA256-C17` | `IS_BIT<μ>` |  |
-| `SHA256-C18` | `ECALL[timestamp, ['arr', ['-', ['^', 2, 32], 1], ['-', ['^', 2, 32], 1]]]` | -μ |
+| `SHA256-C18` | `ECALL[timestamp, (2^64 - 1)::DWordWL]` | -μ |
 
 ### Padding
 
+| Column | Padding value |
+|--------|---------------|
+| `timestamp` | `0` |
+| `h` | `0` |
+| `h_addr` | `[0, 8, 16, 24]` |
+| `m` | `0` |
+| `m_addr` | `[0, 8, 16, 24, 32, 40, 48, 56]` |
+| `out` | `0` |
+| `last_round_out` | `0` |
+| `μ` | `0` |
+
 ## `SHA256`msgsched chip
 
 ### Columns
@@ -142,7 +153,7 @@ First, we gather the dependencies from earlier in the message schedule.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHA256MSGSCHED-C1` | `IS_BYTE[index - 16]` | μ |
+| `SHA256MSGSCHED-C1` | μ ⇒ `IS_BYTE<index - 16>` |  |
 | `SHA256MSGSCHED-C2` | `SHA256_M[back2; timestamp, index - 2]` | μ |
 | `SHA256MSGSCHED-C3` | `SHA256_M[back7; timestamp, index - 7]` | μ |
 | `SHA256MSGSCHED-C4` | `SHA256_M[back15; timestamp, index - 15]` | μ |
@@ -154,7 +165,7 @@ Then, we calculate the result. It suffices to check that the carry of adding fou
 |-----|-------|-------------|--------------|
 | `SHA256MSGSCHED-C6` |  | `ROTXOR[s0; back15, 2, 11, 3, 0]` | μ |
 | `SHA256MSGSCHED-C7` |  | `ROTXOR[s1; back2, 3, 2, 10, 0]` | μ |
-| `SHA256MSGSCHED-C8` |  | `IS_BYTE[carry]` | μ |
+| `SHA256MSGSCHED-C8` |  | μ ⇒ `IS_BYTE<carry>` |  |
 | `SHA256MSGSCHED-C9.i` | i ∈ [0, 1] | `IS_HALF[out[i]]` | μ |
 
 Finally, we contribute to the LogUp.
@@ -269,11 +280,11 @@ To compute `maj`, observe that ` (a bitand b) xor (a bitand c) xor (b bitand c)
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHA256ROUND-C1.i` | i ∈ [0, 3] | `AND_BYTE[a_and_b[i]; a[i], b[i]]` | μ |
-| `SHA256ROUND-C2.i` | i ∈ [0, 3] | `XOR_BYTE[a_xor_b[i]; a[i], b[i]]` | μ |
-| `SHA256ROUND-C3.i` | i ∈ [0, 3] | `AND_BYTE[c_and_a_xor_b[i]; c[i], a_xor_b[i]]` | μ |
-| `SHA256ROUND-C4.i` | i ∈ [0, 3] | `AND_BYTE[e_and_f[i]; e[i], f[i]]` | μ |
-| `SHA256ROUND-C5.i` | i ∈ [0, 3] | `AND_BYTE[not_e_and_g[i]; 255 - e[i], g[i]]` | μ |
+| `SHA256ROUND-C1.i` | i ∈ [0, 3] | `BYTE_ALU[a_and_b[i]; ⧼AND⧽, a[i], b[i]]` | μ |
+| `SHA256ROUND-C2.i` | i ∈ [0, 3] | `BYTE_ALU[a_xor_b[i]; ⧼XOR⧽, a[i], b[i]]` | μ |
+| `SHA256ROUND-C3.i` | i ∈ [0, 3] | `BYTE_ALU[c_and_a_xor_b[i]; ⧼AND⧽, c[i], a_xor_b[i]]` | μ |
+| `SHA256ROUND-C4.i` | i ∈ [0, 3] | `BYTE_ALU[e_and_f[i]; ⧼AND⧽, e[i], f[i]]` | μ |
+| `SHA256ROUND-C5.i` | i ∈ [0, 3] | `BYTE_ALU[not_e_and_g[i]; ⧼AND⧽, 255 - e[i], g[i]]` | μ |
 | `SHA256ROUND-C6` |  | `SHA256_K[kval; index]` | μ |
 | `SHA256ROUND-C7` |  | `SHA256_M[wval; timestamp, index]` | μ |
 | `SHA256ROUND-C8` |  | `ROTXOR[S0; a::Word, 6, 9, 2, 1]` | μ |
@@ -284,19 +295,44 @@ Then we constrain the addition for the new state, constraining additions with th
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
 | `SHA256ROUND-C10.i` | i ∈ [0, 1] | `IS_HALF[out_a[i]]` | μ |
-| `SHA256ROUND-C11` |  | `IS_BYTE[carry_a]` | μ |
+| `SHA256ROUND-C11` |  | μ ⇒ `IS_BYTE<carry_a>` |  |
 | `SHA256ROUND-C12.i` | i ∈ [0, 1] | `IS_HALF[out_e[i]]` | μ |
-| `SHA256ROUND-C13` |  | `IS_BYTE[carry_e]` | μ |
+| `SHA256ROUND-C13` |  | μ ⇒ `IS_BYTE<carry_e>` |  |
 
 Finally, we chain the rounds together through the interactions.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHA256ROUND-C14` | `SHA256ROUND[timestamp, ['arr', ['cast', 'a', 'Word'], ['cast', 'b', 'Word'], ['cast', 'c', 'Word'], 'd', ['cast', 'e', 'Word'], ['cast', 'f', 'Word'], ['cast', 'g', 'Word'], 'h'], index]` | -μ |
-| `SHA256ROUND-C15` | `SHA256ROUND[timestamp, ['arr', ['cast', 'out_a', 'Word'], ['cast', 'a', 'Word'], ['cast', 'b', 'Word'], ['cast', 'c', 'Word'], ['cast', 'out_e', 'Word'], ['cast', 'e', 'Word'], ['cast', 'f', 'Word'], ['cast', 'g', 'Word']], index + 1]` | μ |
+| `SHA256ROUND-C14` | `SHA256ROUND[timestamp, [a::Word, b::Word, c::Word, d, e::Word, f::Word, g::Word, h], index]` | -μ |
+| `SHA256ROUND-C15` | `SHA256ROUND[timestamp, [out_a::Word, a::Word, b::Word, c::Word, out_e::Word, e::Word, f::Word, g::Word], index + 1]` | μ |
 
 ### Padding
 
+| Column | Padding value |
+|--------|---------------|
+| `timestamp` | `0` |
+| `a` | `0` |
+| `b` | `0` |
+| `c` | `0` |
+| `d` | `0` |
+| `e` | `0` |
+| `f` | `0` |
+| `g` | `0` |
+| `h` | `0` |
+| `index` | `0` |
+| `out_a` | `0` |
+| `out_e` | `0` |
+| `a_and_b` | `0` |
+| `a_xor_b` | `0` |
+| `c_and_a_xor_b` | `0` |
+| `e_and_f` | `0` |
+| `not_e_and_g` | `0` |
+| `kval` | `0` |
+| `S0` | `0` |
+| `S1` | `0` |
+| `wval` | `0` |
+| `μ` | `0` |
+
 ## `ROTXOR` chip
 
 This chip takes as input `a`, `r0`, `r1`, `r2` (4-bit values) and a bit `last_rot` to compute $\begin{cases} (a \ggg (16 + r_0)) \oplus (a \ggg (16 + r_0 - r_1)) \oplus (a \ggg r_2) & \text{if } \mathtt{last\_rot} \\ (a \ggg (16 + r_0)) \oplus (a \ggg (16 + r_0 - r_1)) \oplus (a \gg r_2) & \text{if } \lnot\mathtt{last\_rot} \end{cases}$, where $\ggg$ (`>>>`) denotes right rotation and $\gg$ (`>>`) denotes logical shift right. We choose this representation so that all shift amounts required fit into 4 bits, which makes the usage of `HWSL` more straightforward and avoids extra columns to represent more bits.
@@ -356,9 +392,9 @@ We first compute all rotations (or shifts) of `a`. `a1` is computed as a left ro
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `ROTXOR-C1.i` | i ∈ [0, 1] | `HWSL[['arr', ['idx', 'a0_left', 'i'], ['idx', 'a0_right', 'i']]; a[i], 16 - r0]` | μ |
-| `ROTXOR-C2.i` | i ∈ [0, 1] | `HWSL[['arr', ['idx', 'a1_left', 'i'], ['idx', 'a1_right', 'i']]; (a0::WordHL)[i], r1]` | μ |
-| `ROTXOR-C3.i` | i ∈ [0, 1] | `HWSL[['arr', ['idx', 'a2_left', 'i'], ['idx', 'a2_right', 'i']]; a[i], 16 - r2]` | μ |
+| `ROTXOR-C1.i` | i ∈ [0, 1] | `HWSL[[a0_left[i], a0_right[i]]; a[i], 16 - r0]` | μ |
+| `ROTXOR-C2.i` | i ∈ [0, 1] | `HWSL[[a1_left[i], a1_right[i]]; (a0::WordHL)[i], r1]` | μ |
+| `ROTXOR-C3.i` | i ∈ [0, 1] | `HWSL[[a2_left[i], a2_right[i]]; a[i], 16 - r2]` | μ |
 | `ROTXOR-C4.i` | i ∈ [0, 1] | `a0[i]` = `a0_left[i]` + `a0_right[1 - i]` |  |
 | | | _polynomial:_ `(a0::WordHL)[i] - a0_left[i] - a0_right[1 - i] = 0` | |
 | `ROTXOR-C5.i` | i ∈ [0, 1] | `a1[i]` = `a1_left[i]` + `a1_right[1 - i]` |  |
@@ -366,14 +402,14 @@ We first compute all rotations (or shifts) of `a`. `a1` is computed as a left ro
 | `ROTXOR-C6` |  | `a2[0]` = `a2_left[1]` + `a2_right[0]` |  |
 | | | _polynomial:_ `(a2::WordHL)[0] - a2_left[1] - a2_right[0] = 0` | |
 | `ROTXOR-C7` |  | `a2[1]` = `last_rot` dot `a2_left[0]` + `a2_right[1]` |  |
-| | | _polynomial:_ `(a2::WordHL)[0] - last_rot * a2_left[0] - a2_right[1] = 0` | |
+| | | _polynomial:_ `(a2::WordHL)[1] - last_rot * a2_left[0] - a2_right[1] = 0` | |
 
 Then the bitwise XOR of the results.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `ROTXOR-C8.i` | i ∈ [0, 3] | `XOR_BYTE[a01[i]; a0[i], a1[i]]` | μ |
-| `ROTXOR-C9.i` | i ∈ [0, 3] | `XOR_BYTE[out[i]; a01[i], a2[i]]` | μ |
+| `ROTXOR-C8.i` | i ∈ [0, 3] | `BYTE_ALU[a01[i]; ⧼XOR⧽, a0[i], a1[i]]` | μ |
+| `ROTXOR-C9.i` | i ∈ [0, 3] | `BYTE_ALU[out[i]; ⧼XOR⧽, a01[i], a2[i]]` | μ |
 
 And finally contribute to the lookup argument.
 
@@ -383,6 +419,26 @@ And finally contribute to the lookup argument.
 
 ### Padding
 
+| Column | Padding value |
+|--------|---------------|
+| `a` | `0` |
+| `r0` | `0` |
+| `r1` | `0` |
+| `r2` | `0` |
+| `last_rot` | `0` |
+| `out` | `0` |
+| `a0_left` | `0` |
+| `a0_right` | `0` |
+| `a1_left` | `0` |
+| `a1_right` | `0` |
+| `a2_left` | `0` |
+| `a2_right` | `0` |
+| `a0` | `0` |
+| `a1` | `0` |
+| `a2` | `0` |
+| `a01` | `0` |
+| `μ` | `0` |
+
 ## Constant lookup
 
 As mentioned, we provide the round constants through a short precomputed lookup table: `SHA256_K`.
@@ -444,14 +500,14 @@ As mentioned, we provide the round constants through a short precomputed lookup
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHA256-C1` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C1` |  | `MEMW[[(m_addr[0]::DWordWL)[0], (m_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, [(m_addr[0]::DWordWL)[0], (m_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
 | `SHA256-C2.i` | i ∈ [0, 7], j ∈ [0, 3] | `IS_HALF[m_addr[i][j]]` | μ |
 | `SHA256-C3.i` | i ∈ [1, 7] | `ADD<m_addr[i]::DWordWL; m_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
-| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]]; 0, m_addr[i]::DWordWL, ['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]], timestamp, 0, 0, 1]` | μ |
-| `SHA256-C5` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[[m[8 * i + 3], m[8 * i + 2], m[8 * i + 1], m[8 * i + 0], m[8 * i + 7], m[8 * i + 6], m[8 * i + 5], m[8 * i + 4]]; 0, m_addr[i]::DWordWL, [m[8 * i + 3], m[8 * i + 2], m[8 * i + 1], m[8 * i + 0], m[8 * i + 7], m[8 * i + 6], m[8 * i + 5], m[8 * i + 4]], timestamp, 0, 0, 1]` | μ |
+| `SHA256-C5` |  | `MEMW[[(h_addr[0]::DWordWL)[0], (h_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, [(h_addr[0]::DWordWL)[0], (h_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
 | `SHA256-C6.i` | i ∈ [0, 3], j ∈ [0, 3] | `IS_HALF[h_addr[i][j]]` | μ |
 | `SHA256-C7.i` | i ∈ [1, 3] | `ADD<h_addr[i]::DWordWL; h_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
-| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[['arr', ['idx', 'h', ['+', ['*', 8, 'i'], 3]], ['idx', 'h', ['+', ['*', 8, 'i'], 2]], ['idx', 'h', ['+', ['*', 8, 'i'], 1]], ['idx', 'h', ['+', ['*', 8, 'i'], 0]], ['idx', 'h', ['+', ['*', 8, 'i'], 7]], ['idx', 'h', ['+', ['*', 8, 'i'], 6]], ['idx', 'h', ['+', ['*', 8, 'i'], 5]], ['idx', 'h', ['+', ['*', 8, 'i'], 4]]]; 0, h_addr[i]::DWordWL, ['arr', ['idx', 'out', ['+', ['*', 8, 'i'], 3]], ['idx', 'out', ['+', ['*', 8, 'i'], 2]], ['idx', 'out', ['+', ['*', 8, 'i'], 1]], ['idx', 'out', ['+', ['*', 8, 'i'], 0]], ['idx', 'out', ['+', ['*', 8, 'i'], 7]], ['idx', 'out', ['+', ['*', 8, 'i'], 6]], ['idx', 'out', ['+', ['*', 8, 'i'], 5]], ['idx', 'out', ['+', ['*', 8, 'i'], 4]]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
+| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[[h[8 * i + 3], h[8 * i + 2], h[8 * i + 1], h[8 * i + 0], h[8 * i + 7], h[8 * i + 6], h[8 * i + 5], h[8 * i + 4]]; 0, h_addr[i]::DWordWL, [out[8 * i + 3], out[8 * i + 2], out[8 * i + 1], out[8 * i + 0], out[8 * i + 7], out[8 * i + 6], out[8 * i + 5], out[8 * i + 4]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
 
 ### sched
 
@@ -466,14 +522,14 @@ As mentioned, we provide the round constants through a short precomputed lookup
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHA256-C13` |  | `SHA256ROUND[timestamp, ['arr', ['+', ['*', ['^', 2, 0], ['idx', 'h', 3]], ['*', ['^', 2, 8], ['idx', 'h', 2]], ['*', ['^', 2, 16], ['idx', 'h', 1]], ['*', ['^', 2, 24], ['idx', 'h', 0]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 7]], ['*', ['^', 2, 8], ['idx', 'h', 6]], ['*', ['^', 2, 16], ['idx', 'h', 5]], ['*', ['^', 2, 24], ['idx', 'h', 4]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 11]], ['*', ['^', 2, 8], ['idx', 'h', 10]], ['*', ['^', 2, 16], ['idx', 'h', 9]], ['*', ['^', 2, 24], ['idx', 'h', 8]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 15]], ['*', ['^', 2, 8], ['idx', 'h', 14]], ['*', ['^', 2, 16], ['idx', 'h', 13]], ['*', ['^', 2, 24], ['idx', 'h', 12]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 19]], ['*', ['^', 2, 8], ['idx', 'h', 18]], ['*', ['^', 2, 16], ['idx', 'h', 17]], ['*', ['^', 2, 24], ['idx', 'h', 16]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 23]], ['*', ['^', 2, 8], ['idx', 'h', 22]], ['*', ['^', 2, 16], ['idx', 'h', 21]], ['*', ['^', 2, 24], ['idx', 'h', 20]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 27]], ['*', ['^', 2, 8], ['idx', 'h', 26]], ['*', ['^', 2, 16], ['idx', 'h', 25]], ['*', ['^', 2, 24], ['idx', 'h', 24]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 31]], ['*', ['^', 2, 8], ['idx', 'h', 30]], ['*', ['^', 2, 16], ['idx', 'h', 29]], ['*', ['^', 2, 24], ['idx', 'h', 28]]]], 0]` | μ |
+| `SHA256-C13` |  | `SHA256ROUND[timestamp, [2^0 * h[3] + 2^8 * h[2] + 2^16 * h[1] + 2^24 * h[0], 2^0 * h[7] + 2^8 * h[6] + 2^16 * h[5] + 2^24 * h[4], 2^0 * h[11] + 2^8 * h[10] + 2^16 * h[9] + 2^24 * h[8], 2^0 * h[15] + 2^8 * h[14] + 2^16 * h[13] + 2^24 * h[12], 2^0 * h[19] + 2^8 * h[18] + 2^16 * h[17] + 2^24 * h[16], 2^0 * h[23] + 2^8 * h[22] + 2^16 * h[21] + 2^24 * h[20], 2^0 * h[27] + 2^8 * h[26] + 2^16 * h[25] + 2^24 * h[24], 2^0 * h[31] + 2^8 * h[30] + 2^16 * h[29] + 2^24 * h[28]], 0]` | μ |
 | `SHA256-C14` |  | `SHA256ROUND[timestamp, last_round_out, 64]` | -μ |
-| `SHA256-C15.i` | i ∈ [0, 31] | `IS_BYTE[out[i]]` | μ |
-| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'out', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'out', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'out', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'out', ['+', ['*', 4, 'i'], 0]]]]]; ['arr', 0, ['idx', 'last_round_out', 'i']], ['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'h', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'h', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'h', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'h', ['+', ['*', 4, 'i'], 0]]]]]>` |  |
+| `SHA256-C15.i` | i ∈ [0, 31] | μ ⇒ `IS_BYTE<out[i]>` |  |
+| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<[0, 2^0 * out[4 * i + 3] + 2^8 * out[4 * i + 2] + 2^16 * out[4 * i + 1] + 2^24 * out[4 * i + 0]]; [0, last_round_out[i]], [0, 2^0 * h[4 * i + 3] + 2^8 * h[4 * i + 2] + 2^16 * h[4 * i + 1] + 2^24 * h[4 * i + 0]]>` |  |
 
 ### lookup
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
 | `SHA256-C17` | `IS_BIT<μ>` |  |
-| `SHA256-C18` | `ECALL[timestamp, ['arr', ['-', ['^', 2, 32], 1], ['-', ['^', 2, 32], 1]]]` | -μ |
\ No newline at end of file
+| `SHA256-C18` | `ECALL[timestamp, (2^64 - 1)::DWordWL]` | -μ |
\ No newline at end of file
diff --git a/docs/spec/shift.md b/docs/spec/shift.md
index 53c46ca2a..df80954ea 100644
--- a/docs/spec/shift.md
+++ b/docs/spec/shift.md
@@ -15,7 +15,7 @@ The `SHIFT` chip is comprised of  variables that are expressed using  columns an
 | Name | Type | Description |
 |------|------|-------------|
 | `in` | `DWordHL` | The value being shifted |
-| `shift` | `Byte` | Number of bits to shift `in` by. |
+| `shift` | `DWordWHBB` | Number of bits to shift `in` by. |
 | `direction` | `Bit` | Whether to shift left (0) or right (1). |
 | `signed` | `Bit` | Whether to interpret `in` as a signed integer. |
 | `word_instr` | `Bit` | Whether this is a Word-instruction (1) or not (0). |
@@ -92,16 +92,6 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
-## Assumptions
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALF[in[i]]` |
-| `SHIFT-A2` |  | `IS_BYTE[shift]` |
-| `SHIFT-A3` |  | `IS_BIT<direction>` |
-| `SHIFT-A4` |  | `IS_BIT<signed>` |
-| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
-
 ## Explanation
 
 This chip has a rather complex design as a result of designing it to fit in as few columns possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
@@ -134,13 +124,24 @@ Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `e
 
 ## Constraints
 
-First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that ``right` = 1` and ``shift` = 0 mod 16`.
+First, we range check our inputs appropriately.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C1.i` | i ∈ [0, 3] | `IS_HALF[in[i]]` | μ |
+| `SHIFT-C2` |  | `IS_HALF[shift[2]]` | μ |
+| `SHIFT-C3.i` | i ∈ [0, 1] | `IS_BYTE<shift[i]>` |  |
+| `SHIFT-C4` |  | `IS_BIT<direction>` |  |
+| `SHIFT-C5` |  | `IS_BIT<signed>` |  |
+| `SHIFT-C6` |  | `IS_BIT<word_instr>` |  |
+
+Then, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that `right = 1` and `shift = 0 mod 16`.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C1` | `AND_BYTE[bit_shift; shift, 15]` | left |
-| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - 16 * zbs - shift, 15]` | right |
-| `SHIFT-C3` | `ZERO[zbs; bit_shift]` | μ |
+| `SHIFT-C7` | `BYTE_ALU[bit_shift; ⧼AND⧽, shift[0], 15]` | left |
+| `SHIFT-C8` | `BYTE_ALU[bit_shift; ⧼AND⧽, 2^8 - 16 * zbs - shift[0], 15]` | right |
+| `SHIFT-C9` | `ZERO[zbs; bit_shift]` | μ |
 
 Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
 
@@ -148,13 +149,13 @@ The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[['arr', ['idx', 'X', 'i'], ['idx', 'Y', 'i']]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C5.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
+| `SHIFT-C10.i` | i ∈ [0, 3] | `HWSL[[X[i], Y[i]]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C11.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
 | | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
-| `SHIFT-C6.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| `SHIFT-C12.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
 | | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
-| `SHIFT-C7` |  | `HWSL[['arr', ['idx', 'X', 4], ['-', 'extension', ['idx', 'X', 4]]]; extension, bit_shift]` | 1 - zbs |
-| `SHIFT-C8` |  | `zbs` => `X[4]` = 0 |  |
+| `SHIFT-C13` |  | `HWSL[[X[4], extension - X[4]]; extension, bit_shift]` | 1 - zbs |
+| `SHIFT-C14` |  | `zbs` => `X[4]` = 0 |  |
 | | | _polynomial:_ `zbs * X[4] = 0` | |
 
 ### Full-limb shifting
@@ -165,21 +166,21 @@ Hereafter, one must only check that `out` is the proper cast of `shifted` into a
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C9.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
-| `SHIFT-C10` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
-| `SHIFT-C11.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+| `SHIFT-C15.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `SHIFT-C16` |  | `BYTE_ALU[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; ⧼AND⧽, shift[0], 48 - 32 * word_instr]` | μ |
+| `SHIFT-C17.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
 | | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
 
 ### Miscellaneous
 
 | Tag | Description |
 |-----|-------------|
-| `SHIFT-C12` | `direction` => `μ` = 1 |
+| `SHIFT-C18` | `direction` => `μ` = 1 |
 | | _polynomial:_ `direction * (1 - μ) = 0` |
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C13` | `MSB16[is_negative; in[3]]` | signed |
+| `SHIFT-C19` | `MSB16[is_negative; in[3]]` | signed |
 
 *Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
 
@@ -189,8 +190,24 @@ This chip adds the following interaction to the lookup.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C14` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
+| `SHIFT-C20` | `ALU[out; in::DWordWL, shift::DWordWL, ⧼SHIFT⧽ + word_instr + 32 * signed + 64 * direction]` | -μ |
 
 ## Padding
 
-The table can be padded to the next power of two with the following value assignments:
\ No newline at end of file
+The table can be padded to the next power of two with the following value assignments:
+
+| Column | Padding value |
+|--------|---------------|
+| `in` | `0` |
+| `shift` | `0` |
+| `direction` | `0` |
+| `signed` | `0` |
+| `word_instr` | `0` |
+| `out` | `0` |
+| `is_negative` | `0` |
+| `bit_shift` | `0` |
+| `zbs` | `1` |
+| `X` | `[0, 0, 0, 0, 0]` |
+| `Y` | `[0, 0, 0, 0]` |
+| `limb_shift_raw` | `[0, 0, 0]` |
+| `μ` | `0` |
\ No newline at end of file
diff --git a/docs/spec/spec_full.md b/docs/spec/spec_full.md
index f42aec0ca..72dc880cf 100644
--- a/docs/spec/spec_full.md
+++ b/docs/spec/spec_full.md
@@ -144,6 +144,33 @@ We present here a set of constraints on the `PAGE` table that
 
 For zero-initialized pages, `init` can be a constant `0`, and hence doesn't need a column, nor a range check.
 
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `offset` | `RowIndex` | The offset from the page base address. |
+| `init` | `Byte` | The initial value of this address. Can be replaced by a constant zero for zero-initialization |
+| `fini` | `Byte` | The final value this address took |
+| `timestamp` | `DWordWL` | The timestamp at which this address was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `address` | `DWordWL` | The result of adding `offset` to the page base address `page`. `page` is a constant with respect to a single instance of this table. |
+
+**Definition of `address`:**
+```
+address := page + offset * 1::DWordWL
+```
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `PAGE-C1` | `IS_BYTE<init>` |  |
+| `PAGE-C2` | `IS_BYTE<fini>` |  |
+| `PAGE-C3` | `memory[0, address, 0::DWordWL, init]` | -1 |
+| `PAGE-C4` | `memory[0, address, timestamp, fini]` | 1 |
+
 We identify a few alternatives that would achieve the desired initialization/finalization functionalities, and consider their respective trade-offs.
 
 _"Free-zero" initialization_
@@ -243,6 +270,34 @@ If `cond` is `0`, [isbit:c:isbit] is trivially satisfied: `X` can assume any val
 
 ---
 
+# IS_BYTE Template
+
+When a chip leverages this template twice or more, implementors are encouraged to merge pairs of `IS_BYTE` interactions with identical conditions into `ARE_BYTES` interactions; the `IS_BYTE` template is included for convenience of notation, and to complete the specification of chips that use an odd number of `IS_BYTE` range checks.
+
+## Variables
+
+The `IS_BYTE` template leverages 1 interaction(s):
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `X` | `BaseField` | Value for which to assert that it lies in the range $[0, 255]$. |
+
+### Condition
+
+| Name | Type | Description |
+|------|------|-------------|
+| `cond` | `BaseField` |  |
+
+## Constraints
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `IS_BYTE-C1` | `ARE_BYTES[0, X]` | cond |
+
+---
+
 # SIGN Template
 
 It constrains that `sign` is set to `1` when both `X`'s most significant bit and `signed` are `1`, and `0` otherwise.
@@ -417,458 +472,376 @@ It is worth noting that this construction does _not_ require the limbs of `neg`
 
 ---
 
-# MEMW Chip
+# DECODE Table
 
-The  chip is used to read and write memory locations (both RAM and registers) in chunks of 1, 2, 4 or 8 values. It introduces the old value and last-accessed timestamps of memory addresses internally, in order to satisfy the design of the memory argument ([memory]).
+All `RV64IMC` instructions are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table can be derived from the `packed_decode` variable provided below.
 
 ## Variables
 
-The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+The `DECODE` table is comprised of the variables listed below, which are expressed using columns:
 
-### Input
+### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `is_register` | `Bit` | Whether the address represents a register index |
-| `base_address` | `DWordWL` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
-| `value` | `BaseField[8]` | The values to store in memory. For RAM, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
-| `timestamp` | `DWordWL` | The timestamp at which this memory access occurs |
-| `write2` | `Bit` | Whether to write exactly 2 values |
-| `write4` | `Bit` | Whether to write exactly 4 values |
-| `write8` | `Bit` | Whether to write exactly 8 values |
+| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
+| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `word_instr`, \ [4] `ALU`, \ [5] `ADD`, \ [6] `SUB`, \ [7] `MEMORY`, \ [8] `BRANCH`, \ [9] `ECALL`, \ [10:17] `rs1`, \ [18:25] `rs2`, \ [26:33] `rd`, \ [34:41] `half_instruction_length`, \ [42:49] `alu_flags`, \ [50:57] `mem_flags`, \ the remaining bits are set to zero.  |
+| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
+
+## Padding
+
+The `DECODE` table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
+
+| Column | Padding value |
+|--------|---------------|
+| `pc` | `1` |
+| `packed_decode` | `0` |
+| `imm` | `0` |
+| `μ` | `0` |
+
+This row is also the one looked up by padding rows in the CPU, in which case its multiplicity is nonzero; we therefore need to ensure that this table has at least one row of padding.
+
+## Decoding<decode:decoding-overview>
+
+For the purposes of explaining decoding, we decompress `DECODE`'s `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation. The construction of the `alu_flags` and `mem_flags` columns is given here through virtual columns.
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `old` | `BaseField[8]` | The old value written at `base_address`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
+| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
+| `rs1` | `Byte` | index of source register 1. |
+| `rs2` | `Byte` | index of source register 2. |
+| `rd` | `Byte` | index of destination register. |
+| `read_register1` | `Bit` | whether to load the contents of address `rs1` (1) or `0` (0) into `rv1`. |
+| `read_register2` | `Bit` | whether to load the contents of address `rs2` (1) or `0` (0) into `rv2`. |
+| `write_register` | `Bit` | whether the result should be written to `rd` (set to `0` for memory writes and when `rd` = `x0`). |
+| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
+| `word_instr` | `Bit` | Whether the instruction is a `*W` instruction, requiring the inputs and outputs to be (sign) extended. |
+| `ALU` | `Bit` | Enable the ALU |
+| `ADD` | `Bit` | ALU does an ADD |
+| `SUB` | `Bit` | ALU does a SUB |
+| `BRANCH` | `Bit` | The instruction is a branch |
+| `MEMORY` | `Bit` | The instruction is a memory access |
+| `ECALL` | `Bit` | Perform an ECALL |
+| `half_instruction_length` | `Byte` | Half of how many bytes this instruction takes up in the program |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `carry` | `Bit[7]` | Whether `base_address[0] + i + 1` $>= 2^32$ |
-| `old_timestamp` | `DWordWL[8]` | The timestamp at which address `base_address + i` was last accessed |
+| `alu_op` | `B4` | Operation selector value for the ALU |
+| `signed` | `Bit` | selector used to indicate signed or unsigned input interpretation. |
+| `signed2` | `Bit` | A second signed bit, useful for MUL instructions |
+| `muldiv_selector` | `Bit` | selects which output of `MUL` (lo/hi) or `DVRM` (quo/rem) is wanted. |
+| `invert` | `Bit` | Instructs the EQ or LT chip to invert its result, or inverts the direction of the SHIFT chip (right instead of left) |
+| `memory_op` | `Bit` | Selects whether to LOAD (0) or STORE (1) |
+| `mem_2B` | `Bit` | whether the memory access (read or write) touches exactly $2$ bytes. |
+| `mem_4B` | `Bit` | whether the memory access (read or write) touches exactly $4$ bytes. |
+| `mem_8B` | `Bit` | whether the memory access (read or write) touches exactly $8$ bytes. |
+| `mem_signed` | `Bit` | Whether the memory operation is a signed one, this is distinct from `signed` to enable the `JALR` flag to alias `mem_flags` |
+| `JALR` | `Bit` | The branch is a JAL(R) |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
-| `w2` | `Bit` | writing at least 2 bytes |
-| `w4` | `Bit` | writing at least 4 bytes |
-| `address_add` | `DWordWL[7]` | `address_add[i] = base_address + i + 1` |
-| `μ_sum` | `Bit` |  |
-
-**Definition of `w2`:**
-```
-w2 := write2 + write4 + write8
-```
-
-**Definition of `w4`:**
-```
-w4 := write4 + write8
-```
+| `alu_flags` | `Byte` | The combined ALU flags |
+| `mem_flags` | `Byte` | The combined memory flags (or JALR when BRANCHing) |
 
-**Definition of `address_add`:**
+**Definition of `alu_flags`:**
 ```
-address_add := ['arr', ['-', ['+', ['idx', 'base_address', 0], 'i', 1], ['*', ['^', 2, 32], ['idx', 'carry', 'i']]], ['+', ['idx', 'base_address', 1], ['idx', 'carry', 'i']]]
+alu_flags := alu_op + 32 * signed + 64 * (signed2 + invert) + 128 * muldiv_selector
 ```
 
-**Definition of `μ_sum`:**
+**Definition of `mem_flags`:**
 ```
-μ_sum := μ_read + μ_write
+mem_flags := JALR + memory_op + 2 * mem_signed + 4 * mem_2B + 8 * mem_4B + 16 * mem_8B
 ```
 
 ### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
-| `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
+| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
 
-## Assumptions
+First, we provide a mapping from an ALU operation "descriptor" to the numerical value as used for the `alu_op` column. This is the table used to find the value for the `⧼·⧽` notation (e.g. `⧼AND⧽`) when performing `ALU` or `BYTE_ALU` interactions.
 
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
-| `MEMW-A2` |  | `IS_BIT<write2>` |
-| `MEMW-A3` |  | `IS_BIT<write4>` |
-| `MEMW-A4` |  | `IS_BIT<write8>` |
-| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
-| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+**Descriptor → value:** *AND* → 0, *OR* → 1, *XOR* → 2, *EQ* → 3, *LT* → 4, *SHIFT* → 5, *SHIFTW* → 6, *MUL* → 7, *DIVREM* → 8.
 
-Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. Still, these properties are necessary for the consistency of the system as a whole, and therefore we document it here, keeping the type information as a reading help.
+We will illustrate how each instruction should be expressed in this (uncompressed) decoding table. The columns of the accompanying table represent the following: - *`operation`*: the assembly operation being encoded. - *`alu`*: Set to the descriptor of the ALU operation to be used for `alu_op`. If listed as `ADD` or `SUB`, the corresponding flag should be set, otherwise set `ALU = 1` when this column is not empty. - *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively. - *other*: the other flags that should be set or variables that should be given specific values.
 
-## Constraints
+For the purpose of brevity and readability, the table uses the following rules-of-thumb: (1) `rd`, `rs1`, `rs2`, and `imm` are mapped to the values provided by the instruction; when a value is not specified by an instruction it defaults to `0`. (2) `read_register1`, `read_register2` and `write_register` are set to `1` when respectively `rs1 != 0`, `rs2 != 0`, or `rd != 0`.
 
-Depending on the values of `write2`, `write4` and `write8`, the addresses following `base_address` need to be constructed. Rather than computing these in full (which would require the later addresses to be instantiated), it suffices to know the `carry`: the bit indicating whether ``base_address`_0 + t >= 2^32`, i.e., whether adding `t in [1, 7]` to `base_address` requires a carry from the lower to the upper limb. Note that it is safe for the prover to chose these bits: additions for which this bit is not correctly set will yield an address where either the lower or upper limb is out of bounds. As such, the constructed address will not match any existing memory tokens, which are only initialized for correctly formatted and range-checked doublewords (see [memory]).
+Further clarification is provided in the notes following the table.
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `MEMW-C1` |  | `IS_BIT<μ_read>` |  |
-| `MEMW-C2` |  | `IS_BIT<μ_write>` |  |
-| `MEMW-C3` |  | `IS_BIT<μ_sum>` |  |
-| `MEMW-C4` |  | `w2` => `μ_sum` |  |
-| | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW-C5.i` | i ∈ [0, 6] | `IS_BIT<carry[i]>` |  |
-| `MEMW-C6` |  | `LT[1; old_timestamp[0], timestamp, 0]` | μ_sum |
-| `MEMW-C7` |  | `LT[1; old_timestamp[1], timestamp, 0]` | w2 |
-| `MEMW-C8.i` | i ∈ [2, 3] | `LT[1; old_timestamp[i], timestamp, 0]` | w4 |
-| `MEMW-C9.i` | i ∈ [4, 7] | `LT[1; old_timestamp[i], timestamp, 0]` | write8 |
+/// Add a reference to one or more notes following this table.
 
-As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures it is appropriately range checked (this assumes no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
+super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
 
-There is no need to check that the additions do not overflow, as our address calculations are not performed modulo `2^64` here, and any overflow will result in an address without matching initialization.
+show figure: set block(breakable: true)
 
-The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
+figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*alu*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*alu*], [*`w_instr`*], [*`signed`*], [*other*]), )) }
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `MEMW-CM10` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
-| `MEMW-CM11` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
-| `MEMW-CM12` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
-| `MEMW-CM13` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
-| `MEMW-CM14.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
-| `MEMW-CM15.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
-| `MEMW-CM16.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
-| `MEMW-CM17.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+// OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`LT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT[W]`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT[W]`], [`[W]`], [], [`invert`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT[W]`], [`[W]`], [1], [`invert`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`LT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT[W]`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT[W]`], [`[W]`], [], [`invert`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT[W]`], [`[W]`], [1], [`invert`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`signed2`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`signed2`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [], [], [], [`BRANCH`, `JALR`, `rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [], [], [], [`BRANCH`, `JALR`], []), ([`BEQ      rs1, rs2, imm`], [`EQ`], [], [], [`BRANCH`], []), ([`BNE      rs1, rs2, imm`], [`EQ`], [], [], [`BRANCH`, `invert`], []), ([`BLT[U]   rs1, rs2, imm`], [`LT`], 
[], [.not`[U]`], [`BRANCH`], []), ([`BGE[U]   rs1, rs2, imm`], [`LT`], [], [.not`[U]`], [`BRANCH`, `invert`], []), // LOAD ([`LD        rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_signed := `.not`[U]`, `mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_signed := `.not`[U]`, `mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`ADD`], [], [], [`MEMORY`, `mem_signed := `.not`[U]`], []), // STORE ([`SD       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`, `mem_8B`], []), ([`SW       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`, `mem_4B`], []), ([`SH       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`, `mem_2B`], []), ([`SB       rs1, rs2, imm`], [`ADD`], [], [], [`MEMORY`, `memory_op`], []), // ECALL/EBREAK ([`ECALL`], [], [], [], [`ECALL`, ``rs1` := `x17``], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
 
-This chip contributes the following to the lookup argument:
+Note that the above table has no entry for the `EBREAK` instruction. We treat `EBREAK` as an unprovable trap, and its absence from the table enables this by having no valid decoding available for when the instruction is encountered.
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `MEMW-CO18` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_read |
-| `MEMW-CO19` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_write |
+### C-type instructions
 
-## Padding
+The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. The execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. As such, we provide the `half_instruction_length` column that *must take on the value `1` for compressed instructions and `2` for regular instructions*. It is represented as half the number of bytes in the instruction to make misaligned instruction lengths unrepresentable. Additionally, having the variable opens the door for future optimizations involving "fused" instructions, where common sequences of instructions are merged into a single decoded version and need only a single CPU row to prove.
 
-The table can be padded to the next power of two with the following value assignments:
+// Construct a note that can be referenced through `lbl`
 
-## Read-size aligned fast path
+show figure: (it) => align(left, []) [ ] }
 
-When a memory access happens at an address with proper alignment for its access size (i.e., adding the access size to `base_address`'s lowest limb does not overflow), and all accessed elements were last accessed at the same timestamp, we can instead use the  chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all values of `add_limb_overflow` would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
+### Notes
 
-Further logic remains essentially the same, so we briefly present the relevant tables for this chip.
+We note the following about the above decoding table:
 
-The  chip only needs  variables, expressed through  columns; it leverages  interactions.
+(1) `word_instr`: `[W]` indicates that `word_instr = 1` for the `W`-variant of the operation, and `0` for the non-`W`-variant. Similarly, `SHIFT[W]` indicates the `SHIFTW` operation for the `W`-variant, and `SHIFT` otherwise. (2) `signed`: ¬`[U]` (i.e., "not `[U]`") indicates that `signed = 1` for the *non-`U`*-variant of the operation, and `0` for the `U`-variant. (3) `LUI`: this operation loads the 20-bit `imm` in the upper bits of `rd`. Observe that this can be represented using `ADDI rd, x0, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.* (4) `AUIPC`: this operation adds the 20-bit immediate to the upper bits of `pc` and stores the result in `rd`. Given that the `pc` is stored in `x255`, this operation can be represented using `ADDI rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.* (5) `JAL`: this operation stores `pc + 2 * half_instruction_length` in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`. Note that this can be represented using `JALR rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[1:21]` of `imm` and extending it to 64 bits; the least significant bit should always be 0.* (6) `ECALL`: "On RISC-V a system call has its own instruction: `ECALL`. [...] A7 [= register `x17`] contains the system call number." [source] (7) `FENCE`: currently, the VM interprets this operation as `ADDI x0 x0 0`; a no-op.
+
+---
+
+# CPU Chip
+
+The `CPU` chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations. It bases its decisions on the entry of the `DECODE` table ([decode]) corresponding to the current program counter (PC).
+
+## Variables
+
+The `CPU` chip is comprised of the following variables, which are expressed using columns, and leverages the interactions listed in the constraints below:
 
 ### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `is_register` | `Bit` | Whether the address represents a register index |
-| `base_address` | `DWordWHH` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
-| `value` | `BaseField[8]` | The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
-| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
-| `write2` | `Bit` | Whether to write exactly 2 values |
-| `write4` | `Bit` | Whether to write exactly 4 values |
-| `write8` | `Bit` | Whether to write exactly 8 values |
+| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `MEMORY`) a maximum of 4 slots is enough. |
+| `pc` | `DWordWL` | The program counter |
+| `rs1` | `Byte` | Source register 1 index |
+| `rs2` | `Byte` | Source register 2 index |
+| `rd` | `Byte` | Destination register index |
+| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
+| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
+| `write_register` | `Bit` | Whether to write back to the destination register |
+| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
+| `half_instruction_length` | `Byte` | Half the number of bytes consumed by this instruction, commonly used to indicate whether the instruction is of C type, i.e., whether it is 2 bytes long (= 1) instead of 4 (= 2) |
+| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
+| `ALU` | `Bit` | Whether to use the ALU for this instruction |
+| `alu_flags` | `Byte` | The ALU operation + flags (interpreting things as signed/unsigned, choosing the MUL/DVRM output, ...) to pass to the ALU |
+| `ADD` | `Bit` | Addition fast-path bypassing the ALU |
+| `SUB` | `Bit` | Subtraction fast-path bypassing the ALU |
+| `MEMORY` | `Bit` | Whether this instruction touches memory (LOAD/STORE) |
+| `mem_flags` | `Byte` | The flags to pass for MEMORY operations (LOAD vs STORE, number of bytes touched, signed) |
+| `BRANCH` | `Bit` | Whether this instruction is a conditional branch (BLT, BEQ) |
+| `ECALL` | `Bit` | Whether this instruction is an ECALL |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `old` | `BaseField[8]` | The old value written at `base_address + i`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
+| `next_pc` | `DWordWL` | The program counter for the next instruction |
+| `rvd` | `DWordWL` | The value to (maybe) be written back to `rd` |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `old_timestamp` | `DWordWL` | The timestamp at which the address was last accessed |
+| `prev_pc_timestamp_borrow` | `Bit` | The borrow bit for computing the previous timestamp the PC was accessed |
+| `pc_double_read` | `Bit` | Whether the PC is being read as a general purpose register (`rs1`) this cycle |
+| `rv1` | `DWordWL` | The value of register `rs1` |
+| `rv2` | `DWordWL` | The value of register `rs2` |
+| `arg2` | `DWordWL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
+| `res` | `DWordHL` | The ALU result |
+| `branch_cond` | `Bit` | Whether a branch is taken: the branch condition evaluates to true, or we are doing an unconditional jump |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
-| `w2` | `Bit` | writing at least 2 bytes |
-| `w4` | `Bit` | writing at least 4 bytes |
-| `μ_sum` | `Bit` |  |
-
-**Definition of `w2`:**
-```
-w2 := write2 + write4 + write8
-```
+| `JALR` | `Bit` | Read whether our BRANCH corresponds to a JAL(R) instruction from `mem_flags`, as `MEMORY` and `BRANCH` are mutually exclusive |
+| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
 
-**Definition of `w4`:**
+**Definition of `JALR`:**
 ```
-w4 := write4 + write8
+JALR := mem_flags
 ```
 
-**Definition of `μ_sum`:**
+**Definition of `packed_decode`:**
 ```
-μ_sum := μ_read + μ_write
+packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * word_instr + 2^4 * ALU + 2^5 * ADD + 2^6 * SUB + 2^7 * MEMORY + 2^8 * BRANCH + 2^9 * ECALL + 2^10 * rs1 + 2^18 * rs2 + 2^26 * rd + 2^34 * half_instruction_length + 2^42 * alu_flags + 2^50 * mem_flags
 ```
 
-### Multiplicity
-
-| Name | Type | Description |
-|------|------|-------------|
-| `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
-| `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
+## Assumptions
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `MEMW_A-A1.i` | i ∈ [0, 1] | `IS_HALF[base_address[i]]` |
-| `MEMW_A-A2` |  | `IS_WORD[base_address[2]]` |
-| `MEMW_A-A3` |  | `IS_BIT<write2>` |
-| `MEMW_A-A4` |  | `IS_BIT<write4>` |
-| `MEMW_A-A5` |  | `IS_BIT<write8>` |
-| `MEMW_A-A6` |  | `IS_BIT<write2 + write4 + write8>` |
-| `MEMW_A-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+| `CPU-A1` |  | `MEMORY` and `BRANCH` are mutually exclusive |
+| `CPU-A2` |  | When `MEMORY + BRANCH = 0`, either `read_register2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+| `CPU-A3` |  | `!MEMORY` ⇒ `IS_BIT<mem_flags>` |
+
+Additionally, the following constraints can be used to provide defense-in-depth validation of the assumptions.
+
+| Tag | Description |
+|-----|-------------|
+| `CPU-C1` | not (`MEMORY` and `BRANCH`) |
+| | _polynomial:_ `MEMORY * BRANCH = 0` |
+| `CPU-C2` | (1 - `MEMORY` - `BRANCH`) => (`read_register2` = 0 or `imm[i]` = 0) |
+| | _polynomial:_ `(1 - MEMORY - BRANCH) * read_register2 * (imm[0] + imm[1]) = 0` |
+| `CPU-C3` | 1 - MEMORY ⇒ `IS_BIT<mem_flags>` |
+
+## Constraints
+
+First, we perform a decoding lookup for the current PC. Instructions having the `word_instr` flag set are not decoded here, as they are delegated to the `CPU32` chip. In that case, we ensure that the current row of the CPU cannot have any other observable effects.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MEMW_A-C1` | `IS_HALF[base_address[0] + write2 + 3 * write4 + 7 * write8]` | μ_sum |
-| `MEMW_A-C2` | `IS_BIT<μ_read>` |  |
-| `MEMW_A-C3` | `IS_BIT<μ_write>` |  |
-| `MEMW_A-C4` | `IS_BIT<μ_sum>` |  |
-| `MEMW_A-C5` | `w2` => `μ_sum` |  |
-| | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
-| `MEMW_A-C6` | `LT[1; old_timestamp, timestamp, 0]` | μ_sum |
+| `CPU-C4` | `DECODE[pc, imm, packed_decode]` | 1 - word_instr |
+| `CPU-C5` | `word_instr` => `MEMORY = 0` |  |
+| | _polynomial:_ `word_instr * MEMORY = 0` | |
+| `CPU-C6` | `word_instr` => `BRANCH = 0` |  |
+| | _polynomial:_ `word_instr * BRANCH = 0` | |
+| `CPU-C7` | `word_instr` => `ECALL = 0` |  |
+| | _polynomial:_ `word_instr * ECALL = 0` | |
+| `CPU-C8` | `word_instr` => `read_register1 = 0` |  |
+| | _polynomial:_ `word_instr * read_register1 = 0` | |
+| `CPU-C9` | `word_instr` => `read_register2 = 0` |  |
+| | _polynomial:_ `word_instr * read_register2 = 0` | |
+| `CPU-C10` | `word_instr` => `write_register = 0` |  |
+| | _polynomial:_ `word_instr * write_register = 0` | |
+| `CPU-C11` | `CPU32[half_instruction_length; timestamp, pc]` | word_instr |
+
+### Range checks
+
+We constrain all columns to have the appropriate ranges. All values in `packed_decode` need to be checked to ensure the packing is correct for the interaction. In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`. Similarly, since `next_pc` will propagate through the memory argument and be looked up in the instruction decoding on the next cycle, it is forced to be in the correct range; the final value for `next_pc` is similarly fixed by the memory finalization. For the auxiliary columns, we need to check the limbs of `res`, since `rv1` and `rv2` are enforced by the memory argument, and `rvd` is correct by the correctness of the dependent chips. The ranges of the other auxiliary columns are enforced through later constraints.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MEMW_A-CM7` |  | `memory[is_register, base_address::DWordWL, old_timestamp, old[0]]` | μ_sum |
-| `MEMW_A-CM8` |  | `memory[is_register, base_address::DWordWL, timestamp, value[0]]` | -μ_sum |
-| `MEMW_A-CM9` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, old_timestamp, old[1]]` | w2 |
-| `MEMW_A-CM10` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, timestamp, value[1]]` | -w2 |
-| `MEMW_A-CM11.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | w4 |
-| `MEMW_A-CM12.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -w4 |
-| `MEMW_A-CM13.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | write8 |
-| `MEMW_A-CM14.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -write8 |
+| `CPU-CR12` |  | `IS_BIT<read_register1>` |  |
+| `CPU-CR13` |  | `IS_BIT<read_register2>` |  |
+| `CPU-CR14` |  | `IS_BIT<write_register>` |  |
+| `CPU-CR15` |  | `IS_BYTE<half_instruction_length>` |  |
+| `CPU-CR16` |  | `IS_BIT<word_instr>` |  |
+| `CPU-CR17` |  | `IS_BIT<ALU>` |  |
+| `CPU-CR18` |  | `IS_BYTE<alu_flags>` |  |
+| `CPU-CR19` |  | `IS_BIT<ADD>` |  |
+| `CPU-CR20` |  | `IS_BIT<SUB>` |  |
+| `CPU-CR21` |  | `IS_BIT<MEMORY>` |  |
+| `CPU-CR22` |  | `IS_BYTE<mem_flags>` |  |
+| `CPU-CR23` |  | `IS_BIT<BRANCH>` |  |
+| `CPU-CR24` |  | `IS_BIT<ECALL>` |  |
+| `CPU-CR25` |  | `IS_BYTE<rs1>` |  |
+| `CPU-CR26` |  | `IS_BYTE<rs2>` |  |
+| `CPU-CR27` |  | `IS_BYTE<rd>` |  |
+| `CPU-CR28.i` | i ∈ [0, 3] | `IS_HALF[res[i]]` | 1 |
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `MEMW_A-CO15` | `MEMW[old; is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_read |
-| `MEMW_A-CO16` | `MEMW[is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_write |
+### ALU
 
-### Padding
+The ALU functionality is then obtained through delegation to the `ALU` signature, backed by the various ALU chips, or by using the appropriate template. For the pure ALU path, `arg2` is computed as `rv2 + imm`, which relies on assumption `CPU-A2` to be either `rv2` or `imm`, depending on the instruction. The other contributions for `arg2` are specific to the (mutually exclusive, see `CPU-A1`) `MEMORY` and `BRANCH` flags: for the `MEMORY` path, we want the output of the ALU to be `rv1 + imm`, as that is the address at which the memory access occurs; for the `BRANCH` path, we want the ALU output to reflect the branch condition (or just be inactive for JALR).
 
-The table can be padded to the next power of two with the following value assignments:
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU-CA29.i` | i ∈ [0, 1] | `arg2` = `MEMORY` · `imm` + `BRANCH` · `rv2` + (1 - `MEMORY` - `BRANCH`) · (`rv2` + `imm`) |  |
+| | | _polynomial:_ `arg2[i] - MEMORY * imm[i] - BRANCH * rv2[i] - (1 - MEMORY - BRANCH) * (rv2 + imm)[i] = 0` | |
+| `CPU-CA30` |  | ADD ⇒ `ADD<res::DWordWL; rv1, arg2>` |  |
+| `CPU-CA31` |  | SUB ⇒ `SUB<res::DWordWL; rv1, arg2>` |  |
+| `CPU-CA32` |  | `ALU[res::DWordWL; rv1, arg2, alu_flags]` | ALU |
 
-## Register fast-path
+### Memory
 
-The  chip provides a fast-path for accessing registers. This fast-path leverages that registers + can be addressed using a `Byte`, rather than a full `DWord`, + are constantly accessed, i.e., ``timestamp` - `old_timestamp`` is small, and + have a fixed access pattern to achieve a footprint that is significantly smaller than both  and .
+Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs, simultaneously ensuring that register reads are properly range checked as long as all writes are. The `pc` register behaves very predictably with respect to its timestamps and when it is being read, so for performance reasons, we inline its memory interactions directly into the `CPU` chip.
 
-Note: as a result of hard optimization, this chip can only be used for register accesses for which + ``timestamp` - `old_timestamp` in [1, 2^16]`, and + ``timestamp[0]` > `old_timestamp[0]`` If either of these rules does not apply to your access, you should fall back to using `MEMW_A`.
+Potentially overlapping memory accesses are ensured to have disjoint timestamps. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction (see `CPU-CM33` and the decoding overview of the `DECODE` table). Constraints regarding whether `pc_double_read` corresponds to an `AUIPC` instruction are not necessary, as regardless of its value, the old timestamp is guaranteed to be smaller than the new timestamp, and the integrity of the memory argument therefore ensures the correctness of this bit.
 
-Note moreover that this chip does not guard against misaligned register access faults: to access register with a given `address`, one must provide `2 dot `address`` in the lookup.
+The memory interaction itself is handled by the `MEMORY` signature, which will read the `mem_flags` argument to perform either a `LOAD` or a `STORE`. We refer to the previous section's description of `arg2` for how the address is computed.
 
-### Variables
+The value to (potentially) be written back to `rd` is stored in `rvd`, which can either come from the ALU — in case of an ALU operation or a JALR branch — or from the MEMORY interaction.
 
-The  chip is comprised of  variables that are expressed using  columns and leverages  interactions:
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `CPU-CM33` |  | `MEMW[[rv1[0], rv1[1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs1, [rv1[0], rv1[1], 0, 0, 0, 0, 0, 0], timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
+| `CPU-CM34.i` | i ∈ [0, 1] | `!read_register1` => `rv1[i]` = 0 |  |
+| | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
+| `CPU-CM35` |  | `MEMW[[rv2[0], rv2[1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs2, [rv2[0], rv2[1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
+| `CPU-CM36.i` | i ∈ [0, 1] | `!read_register2` => `rv2[i]` = 0 |  |
+| | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
+| `CPU-CM37` |  | `MEMW[1, 2::DWordWL * rd, [rvd[0], rvd[1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
+| `CPU-CM38` |  | `MEMOP[rvd; timestamp, res::DWordWL, rv2, mem_flags]` | MEMORY |
+| `CPU-CM39.i` | i ∈ [0, 1] | `!MEMORY` and `!BRANCH` => `rvd` = `res` |  |
+| | | _polynomial:_ `(1 - MEMORY - BRANCH) * (rvd[i] - (res::DWordWL)[i]) = 0` | |
+| `CPU-CM40` |  | `IS_BIT<pc_double_read>` |  |
+| `CPU-CM41` |  | `IS_BIT<prev_pc_timestamp_borrow>` |  |
+| `CPU-CM42.i` | i ∈ [0, 1] | `memory[1, [2 * 255 + i, 0], [(timestamp[0] - 3 * (1 - pc_double_read)) + 2^32 * prev_pc_timestamp_borrow, timestamp[1] - prev_pc_timestamp_borrow], pc[i]]` | 1 |
+| `CPU-CM43.i` | i ∈ [0, 1] | `memory[1, [2 * 255 + i, 0], timestamp + 1::DWordWL, next_pc[i]]` | -1 |
 
-### Input
+### Branching
 
-| Name | Type | Description |
-|------|------|-------------|
-| `address` | `Byte` | address of the register being accessed |
-| `timestamp` | `DWordWL` | timestamp at which the access takes place |
-| `val` | `DWordWL` | value being written to this register |
+A branch is expressed by having the `BRANCH` flag set to 1. Since `BRANCH` and `MEMORY` are mutually exclusive (see `CPU-A1`), we can repurpose the `mem_flags` field to indicate a JALR instruction. When JALR is not set, we have a conditional branch that is decided upon by the result of the ALU instructions, as set in the `res` variable. As such, we can set `branch_cond` appropriately as the multiplicity flag for the `BRANCH` chip.
 
-### Output
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-CB44` | `branch_cond` = `BRANCH` and (`JALR` or `res`) |  |
+| | _polynomial:_ `branch_cond - BRANCH * JALR - BRANCH * (1 - JALR) * res[0] = 0` | |
+| `CPU-CB45` | `BRANCH[next_pc; pc, imm, rv1, JALR]` | branch_cond |
+| `CPU-CB46` | 1 - branch_cond ⇒ `ADD<next_pc; pc, [2 * half_instruction_length, 0]>` |  |
+| `CPU-CB47` | BRANCH ⇒ `ADD<rvd; pc, [2 * half_instruction_length, 0]>` |  |
 
-| Name | Type | Description |
-|------|------|-------------|
-| `old` | `DWordWL` | value of this register at `old_timestamp`. |
+### System
 
-### Auxiliary
+The interactions with the wider system go through the `ECALL` interface. Since we treat `EBREAK` instructions as unprovable traps, we avoid emitting `DECODE` rows for these, and do not need any further handling in the CPU.
 
-| Name | Type | Description |
-|------|------|-------------|
-| `old_timestamp_lo` | `Word` | the lower limb of `old_timestamp` |
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU-CS48` | `ECALL[timestamp, rv1]` | ECALL |
 
-### Virtual
+## Padding
 
-| Name | Type | Description |
-|------|------|-------------|
-| `old_timestamp` | `DWordWL` | timestamp at which this register was last accessed |
-| `μ_sum` | `Bit` |  |
-
-**Definition of `old_timestamp`:**
-```
-old_timestamp := ['arr', 'old_timestamp_lo', ['idx', 'timestamp', 1]]::DWordWL
-```
-
-**Definition of `μ_sum`:**
-```
-μ_sum := μ_read + μ_write
-```
-
-### Multiplicity
-
-| Name | Type | Description |
-|------|------|-------------|
-| `μ_read` | `Bit` | Whether we are performing a read (and hence return `out`) |
-| `μ_write` | `Bit` | Whether we are performing a write (and hence not return `out`) |
-
-### Assumptions
-
-The following range checks are assumed to be performed/enforced outside of this chip:
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MEMW_R-A1.i` | i ∈ [0, 1] | `IS_WORD[val[i]]` |
-| `MEMW_R-A2.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
-
-### Constraints
-
-Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the times. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that ``old_timestamp[1]` = `timestamp[1]``;  can be used for accesses where this is not the case.
-
-Verifying that ``timestamp` > `old_timestamp`` now simplifies to verifying that ``timestamp[0]` - `old_timestamp[0]` > 0`. For most accesses, this value will be small enough to fit in a `Half`. This chip thus enforces this by means of the following constraint:
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `MEMW_R-C1` | `IS_HALF[timestamp[0] - old_timestamp[0] - 1]` | μ_sum |
-
-With ``old_timestamp`<`timestamp`` asserted, `old` is read from the register ([regw:c:read_old]) and `val` is written back ([regw:c:write_val]).
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `MEMW_R-C2.i` | i ∈ [0, 1] | `memory[1, ['arr', ['cast', ['+', ['*', 2, 'address'], 'i'], 'Word'], 0], old_timestamp, old[i]]` | μ_sum |
-| `MEMW_R-C3.i` | i ∈ [0, 1] | `memory[1, ['arr', ['cast', ['+', ['*', 2, 'address'], 'i'], 'Word'], 0], timestamp, val[i]]` | -μ_sum |
-
-This chip can either just write (``μ_write` = 1`), or both read and write (``μ_read` = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
-
-| Tag | Description |
-|-----|-------------|
-| `MEMW_R-C4` | `IS_BIT<μ_read>` |
-| `MEMW_R-C5` | `IS_BIT<μ_write>` |
-| `MEMW_R-C6` | `IS_BIT<μ_sum>` |
-
-Lastly, this chip contributes the following interactions to the logup:
-
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `MEMW_R-C7` | `MEMW[['arr', ['idx', 'old', 0], ['idx', 'old', 1], 0, 0, 0, 0, 0, 0]; 1, ['arr', ['cast', ['*', 2, 'address'], 'Word'], 0], ['arr', ['idx', 'val', 0], ['idx', 'val', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_read |
-| `MEMW_R-C8` | `MEMW[1, ['arr', ['cast', ['*', 2, 'address'], 'Word'], 0], ['arr', ['idx', 'val', 0], ['idx', 'val', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_write |
-
-### Padding
-
-The table can be padded to the next power of two with the following value assignments:
-
-## Notes/optimizations
+The CPU can be padded with the following values, which have a corresponding row in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
 
-The following ideas may prove to be optimizations for the // chip: - `MEMB` chip that does a one-byte write to remove old_timestamp from here (uncertain tradeoffs) - Adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups may make some GKR things faster if there are known zeroes. - For the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range at the cost of looking through a larger table.
+| Column | Padding value |
+|--------|---------------|
+| `pc` | `1` |
+| `rs1` | `0` |
+| `rs2` | `0` |
+| `rd` | `0` |
+| `read_register1` | `0` |
+| `read_register2` | `0` |
+| `write_register` | `0` |
+| `imm` | `0` |
+| `half_instruction_length` | `2` |
+| `word_instr` | `0` |
+| `ALU` | `0` |
+| `alu_flags` | `0` |
+| `ADD` | `0` |
+| `SUB` | `0` |
+| `MEMORY` | `0` |
+| `mem_flags` | `0` |
+| `BRANCH` | `0` |
+| `ECALL` | `0` |
+| `next_pc` | `1` |
+| `rvd` | `0` |
+| `prev_pc_timestamp_borrow` | `0` |
+| `pc_double_read` | `0` |
+| `rv1` | `0` |
+| `rv2` | `0` |
+| `arg2` | `0` |
+| `res` | `0` |
+| `branch_cond` | `0` |
+
+This approach minimizes the number of dependent lookups, increasing only multiplicities in the `DECODE` table and the `IS_BYTE` and `IS_HALF` lookups.
 
 ---
 
-# DECODE Table
-
-All `RV64IMC` instruction are to be decoded to a format that can be interpreted by the VM. This section outlines the decoding table being used in the VM. For reasons of efficiency, data in this table is significantly compressed. Since reasoning about this compressed form is needlessly complex, the `decode (uncompressed)` section presents the same table in uncompressed form, and explains how to decode `RV64IM` assembly instructions to it. Instructions on how to compress the uncompressed table to form the compressed decode table, can be derived from the `packed_decode` variable provided below.
-
-## Variables
-
-The  table is comprised of  variables that are expressed using  columns:
-
-### Output
-
-| Name | Type | Description |
-|------|------|-------------|
-| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
-| `packed_decode` | `BaseField` | Ordered concatenation of several small variables. The `decode (uncompressed)` section explains the purpose of each variable.\ A list of each variable and the bit(-range) in which it is located:\ [0] `read_register1`, \ [1] `read_register2`, \ [2] `write_register`, \ [3] `memory_2bytes`, \ [4] `memory_4bytes`, \ [5] `memory_8bytes`, \ [6] `c_type`, \ [7] `signed`, \ [8] `mp_selector`, \ [9] `muldiv_selector`, \ [10] `word_instr`, \ [11] `ADD`, \ [12] `SUB`, \ [13] `SLT`, \ [14] `AND`, \ [15] `OR`, \ [16] `XOR`, \ [17] `SHIFT`, \ [18] `JALR`, \ [19] `BEQ`, \ [20] `BLT`, \ [21] `LOAD`, \ [22] `STORE`, \ [23] `MUL`, \ [24] `DIVREM`, \ [25] `ECALL`, \ [26] `EBREAK`; \ [27:35] `rs1`, \ [35:43] `rs2`, \ [43:51] `rd`, \ the remaining bits are set to zero.  |
-| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
-
-### Multiplicity
-
-| Name | Type | Description |
-|------|------|-------------|
-| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
-
-## Padding
-
-The  table must be padded to a length that is a power of two. Empty rows with the following content can be added to achieve this:
-
-Note that this row sets the `EBREAK` flag. Given that `CPU` asserts that `EBREAK = 0` (see [cpu:c:ebreak_traps]), using this "padding-instruction" would immediately make the CPU table unprovable. Note moreover that the `pc` is set to `7`. This value is the _smallest odd number_ (i.e., not reachable during regular execution) that is more than _`4`_ (i.e., the max `pc`-increment) greater than _`1`_ (i.e., the `pc`-value used in the [additional instruction] referred to by `CPU`-padding lines).
-
-## Decoding<decode:decoding-overview>
-
-For the purposes of explaining decoding, we decompress 's `packed_decode` variable into its constituent variables. Note that the below table is _not_ used in practice: it is solely used for the purposes of this explanation.
-
-### Output
-
-| Name | Type | Description |
-|------|------|-------------|
-| `pc` | `DWordWL` | value of the program counter this instruction is associated with. |
-| `rs1` | `Byte` | index of source register 1. |
-| `rs2` | `Byte` | index of source register 2. |
-| `rd` | `Byte` | index of destination register. |
-| `read_register1` | `Bit` | whether to load the contents of address `rs1` (1) or `0` (0) into `rv1`. |
-| `read_register2` | `Bit` | whether to load the contents of address `rs2` (1) or `0` (0) into `rv2`. |
-| `write_register` | `Bit` | whether the result should be written to `rd` ($=0$ for memory write and when $`rd` = `x0`$. |
-| `mem_2B` | `Bit` | whether the memory access (read or write) touches exactly $2$ bytes. |
-| `mem_4B` | `Bit` | whether the memory access (read or write) touches exactly $4$ bytes. |
-| `mem_8B` | `Bit` | whether the memory access (read or write) touches exactly $8$ bytes. |
-| `c_type` | `Bit` | Whether the instruction is of type `C`, i.e., whether it is $2$ bytes long instead of $4$. |
-| `imm` | `DWordWL` | the *fully extended (!)* 64-bit version of the immediate. |
-| `signed` | `Bit` | selector used to indicate signed or unsigned input interpretation. |
-| `mp_selector` | `Bit` | Multi-purpose selector used by the CPU to to configure several ALU operations in different ways.            See the `CPU` chip for more details. |
-| `muldiv_selector` | `Bit` | selects which output of `MUL` (lo/hi) or `DVRM` (quo/rem) is wanted. |
-| `word_instr` | `Bit` | Whether the instruction is a `*W` instruction, requiring the inputs and outputs to be (sign) extended. |
-| `ADD` | `Bit` | ALU selector flag |
-| `SUB` | `Bit` | ALU selector flag |
-| `SLT` | `Bit` | ALU selector flag |
-| `AND` | `Bit` | ALU selector flag |
-| `OR` | `Bit` | ALU selector flag |
-| `XOR` | `Bit` | ALU selector flag |
-| `SHIFT` | `Bit` | ALU selector flag |
-| `JALR` | `Bit` | ALU selector flag |
-| `BEQ` | `Bit` | ALU selector flag |
-| `BLT` | `Bit` | ALU selector flag |
-| `LOAD` | `Bit` | ALU selector flag |
-| `STORE` | `Bit` | ALU selector flag |
-| `MUL` | `Bit` | ALU selector flag |
-| `DIVREM` | `Bit` | ALU selector flag |
-| `ECALL` | `Bit` | ALU selector flag |
-| `EBREAK` | `Bit` | ALU selector flag |
-
-### Multiplicity
-
-| Name | Type | Description |
-|------|------|-------------|
-| `μ` | `BaseField` | The multiplicity with which this instruction is looked up in the `CPU` table. |
-
-We will illustrate how each instruction should be expressed in this (uncompressed) decoding table. The columns of the accompanying table represent the following: - *`operation`*: the assembly operation being encoded. - *`op-flag`*: which of the "`ALU` selector flags" operation flags to set. Each operation sets exactly one. - *`w_instr`*, *`signed`*: whether to set the `word_instr` and `signed` flags, respectively. - *other*: the other flags that should be set or variables that should be given specific values.
-
-For the purpose of brevity and readability, the table uses the following rules-of-thumb: + `rd`, `rs1`, `rs2`, and `imm` are mapped to the values provided by the instruction; when a value is not specified by an instruction it defaults to `0`. + `read_register1`, `read_register2` and `write_register` are set to `1` when respectively ``rs1` != 0`, ``rs2` != 0`, or  ``rd` != 0`. + Any flag that is not listed is set to `0`, with the exception of the `c_type` flag. *The `c_type` flag is set independently of the below table*, as explained next.
-
-Further clarification is provided in the notes following the table.
-
-/// Add a reference to one or more notes following this table.
-
-super("[" + refs.pos().map(r => ref(r)).join(",") + "]") }
-
-show figure: set block(breakable: true)
-
-figure(table( columns: (auto, auto, auto, auto, 1fr, auto), stroke: 0pt, inset: (right: .5em), align: (left, right, center, center, left, right), fill: (_, y) => // Overlay a low-opacity fill color to distinguish the different rows better if calc.odd(y) and y <= lines.len() { color.rgb(0, 0, 100, 20) } else { color.rgb(255, 255, 255, 20) }, table.header([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*], []), table.hline(stroke: 1.5pt), table.vline(x: 1, start: 1, end: lines.len() + 1, stroke: .5pt), ..lines.flatten(), table.hline(stroke: 1.5pt), table.footer([*Operation*], [*op-flag*], [*`w_instr`*], [*`signed`*], [*other*]), )) }
-
-// OP-IMM ([`ADDI[W]   rd, rs1, imm`], [`ADD`], [`[W]`], [], [], []), ([`SLTI[U]   rd, rs1, imm`], [`SLT`], [], [.not`[U]`], [], []), ([`ANDI      rd, rs1, imm`], [`AND`], [], [], [], []), ([`ORI       rd, rs1, imm`], [`OR`],   [], [], [], []), ([`XORI      rd, rs1, imm`], [`XOR`], [], [], [], []), ([`SLLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [], []), ([`SRLI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRAI[W]   rd, rs1, imm`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP ([`ADD[W]    rd, rs1, rs2`], [`ADD`], [`[W]`], [], [], []), ([`SUB[W]    rd, rs1, rs2`], [`SUB`], [`[W]`], [], [], []), ([`SLT[U]    rd, rs1, rs2`], [`SLT`], [], [.not`[U]`], [], []), ([`AND       rd, rs1, rs2`], [`AND`], [], [], [], []), ([`OR        rd, rs1, rs2`], [`OR`], [], [], [], []), ([`XOR       rd, rs1, rs2`], [`XOR`], [], [], [], []), ([`SLL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [], []), ([`SRL[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [], [`mp_selector`], []), ([`SRA[W]    rd, rs1, rs2`], [`SHIFT`], [`[W]`], [1], [`mp_selector`], []), // OP - M ([`MUL[W]    rd, rs1, rs2`], [`MUL`], [`[W]`], [1], [`mp_selector`], []), ([`MULH      rd, rs1, rs2`], [`MUL`], [], [1], [`mp_selector`, `muldiv_selector`], []), ([`MULHU     rd, rs1, rs2`], [`MUL`], [], [], [`muldiv_selector`], []), ([`MULHSU    rd, rs1, rs2`], [`MUL`], [], [1], [`muldiv_selector`], []), ([`DIV[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [], []), ([`REM[U][W] rd, rs1, rs2`], [`DIVREM`], [`[W]`], [.not`[U]`], [`muldiv_selector`], []), // LUI/AUIPC ([`LUI       rd, imm`], [`ADD`], [], [], [], []), ([`AUIPC     rd, imm`], [`ADD`], [], [], [`rs1 := x255`], []), ([`JAL       rd, imm`], [`JALR`], [], [], [`rs1 := x255`], []), // Branching ([`JALR      rd, rs1, imm`], [`JALR`], [], [], [], []), ([`BEQ      rs1, rs2, imm`], [`BEQ`], [], [], [], []), ([`BNE      rs1, rs2, imm`], [`BEQ`], [], [], [`mp_selector`], []), ([`BLT[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [], 
[]), ([`BGE[U]   rs1, rs2, imm`], [`BLT`], [], [.not`[U]`], [`mp_selector`], []), // LOAD ([`LD        rd, rs1, imm`], [`LOAD`], [], [], [`mem_8B`], []), ([`LW[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_4B`], []), ([`LH[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [`mem_2B`], []), ([`LB[U]     rd, rs1, imm`], [`LOAD`], [], [.not`[U]`], [], []), // STORE ([`SD       rs1, rs2, imm`], [`STORE`], [], [], [`mem_8B`], []), ([`SW       rs1, rs2, imm`], [`STORE`], [], [], [`mem_4B`], []), ([`SH       rs1, rs2, imm`], [`STORE`], [], [], [`mem_2B`], []), ([`SB       rs1, rs2, imm`], [`STORE`], [], [], [], []), // ECALL/EBREAK ([`ECALL`], [`ECALL`], [], [], [``rs1` := `x17``], []), ([`EBREAK`], [`EBREAK`], [], [], [], []), // FENCE ([`FENCE`], [`ADD`], [], [], [], []),
-
-### C-type instructions
-
-The `RV64C` extension for compressed instructions specifies that \~50% of all instructions can be represented using a 16-bit instruction (rather than 32-bits), saving \~25% in code size. This execution of assembly code is _not_ agnostic to an instruction's compression state; after executing a compressed instruction, the `pc` should be incremented by `2` rather than `4`. To indicate an instruction is provided in compressed form, the `c_type` flag is introduced. *This flag should be set to `1` whenever the decoded instruction is provided in compressed form and `0` otherwise.*
-
-// Construct a note that can be referenced through `lbl`
-
-show figure: (it) => align(left, []) [ ] }
-
-### Notes
-
-We note the following about the above decoding table:
-
-enum.item( referenceable_note( "note_word_instr", [`word_instr`: `[W]` indicates that ``word_instr` = 1` for the `W`-variant of the operation, and `0` for the non-`W`-variant.] ), enum.item( referenceable_note( "note_signed", [`signed`: .not`[U]` indicates that ``signed` = 1` for the *non-`U`*-variant of the operation, and `0` for the `U`-variant.] ), enum.item( referenceable_note( "note-lui", [`LUI`: this operation loads the 20-bit `imm` in the upper bits of `rd`. Observe that this can be represented using `ADDI rd, x0, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-auipc", [`AUIPC`: this operation adds the 20-bit immediate to the upper bits of `pc` and stores the result in `rd`. Given that the `pc` is stored in `x255`, this operation can be represented using `ADDI rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[12:32]` of `imm` and extending it to 64 bits.*] ), enum.item( referenceable_note( "note-jal", [`JAL`: this operation stores ``pc` + 4` in `rd` and adds two times the sign-extended 20-bit immediate to the `pc`. Note that this can be represented using `JALR rd, x255, imm`. As such, *we expect the decoding to take care of writing the immediate in bit range `[1:21]` of `imm` and extending it to 64 bits; the least significant bit should always be 0.*] ), enum.item( referenceable_note( "note-ecall", [`ECALL`: "On RISC-V a system call has its own instruction: `ECALL`. [...] A7 [= register `x17`] contains the system call number." [[source]] ] ), enum.item( referenceable_note( "note-fence", [`FENCE`: currently, the VM interprets this operation as `ADDI x0 x0 0`; a no-op.]
-
-### One more instruction <cpu-padding-decode-row>
-
-In addition to decoding all instructions provided in the ELF and adding a corresponding entry to the  table, one must include an entry that has ``pc` = 1` and every other variable set to `0`. Note that this will never conflict with any entry in the ELF, since it has an odd `pc` value.
-
-This entry is used to pad the `CPU` table. More details on this matter are provided in the `CPU` chip.
-
----
+# CPU32 Chip
 
-# CPU Chip
+The `CPU32` chip is used to delegate the 32-bit instructions of the RV64I instruction set from the main CPU table ([cpu]). All 32-bit instructions are ALU-only instructions, so the BRANCH, MEMORY and ECALL paths need no elaboration. The timestamp and PC have already been read by the CPU table at this point, and need no further checking; the PC for the next instruction is likewise handled by the CPU table.
 
-The  chip coordinates memory accesses and dispatches to other chips for arithmetic and logical operations. It bases its decisions on the entry of the `DECODE` table ([decode]) corresponding the the current program counter (PC).
+The structure follows the regular ALU path, with some extra variables and constraints to handle the required sign extensions.
 
 ## Variables
 
@@ -878,237 +851,179 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 
 | Name | Type | Description |
 |------|------|-------------|
-| `timestamp` | `Timestamp` | A preprocessed timestamp to coordinate the memory argument. Since we have at most 3 non-disjoint memory accesses (`(rs1, rs2, rd)`, `(rs1, pc, pc)`, `(LOAD)` or `(STORE)`) a maximum of 4 slots is enough. |
-| `pc` | `DWordWL` | The program counter |
-| `rs1` | `Byte` | Source register 1 index |
-| `rs2` | `Byte` | Source register 2 index |
-| `rd` | `Byte` | Destination register index |
-| `read_register1` | `Bit` | Whether to read from `rs1` (1) or to place a 0 in `rv1` (0) |
-| `read_register2` | `Bit` | Whether to read from `rs2` (1) or to place a 0 in `rv2` (0) |
-| `write_register` | `Bit` | Whether to write back to the destination register |
-| `memory_2bytes` | `Bit` | Whether the memory access (read or write) touches exactly 2 bytes |
-| `memory_4bytes` | `Bit` | Whether the memory access (read or write) touches exactly 4 bytes |
-| `memory_8bytes` | `Bit` | Whether the memory access (read or write) touches exactly 8 bytes |
-| `c_type_instruction` | `Bit` | Whether the instruction is of C type, i.e., whether it is 2 bytes long instead of 4 |
-| `imm` | `DWordWL` | The fully extended 64-bit version of the immediate |
-| `signed` | `Bit` | Indicates whether we're dealing with a signed or unsigned instruction |
-| `mp_selector` | `Bit` | Multi-purpose selector used by different ALU operations for different purposes. Currently, it is used     - by the `MUL` chip to select between `MUL`/`MULH` and `MULH[S]U`, and     - as flag for inverting the condition of conditional branches (see `branch_cond`)     - as direction (left or right) for `SHIFT` |
-| `muldiv_selector` | `Bit` | Selects which output of `MUL` (lo/hi) or `DIV` (quo/rem) is wanted |
-| `word_instr` | `Bit` | Whether the instruction is a \*W instruction, requiring the inputs and outputs to be (sign) extended |
-| `ADD` | `Bit` | One-hot ALU selector flag |
-| `SUB` | `Bit` | One-hot ALU selector flag |
-| `SLT` | `Bit` | One-hot ALU selector flag |
-| `AND` | `Bit` | One-hot ALU selector flag |
-| `OR` | `Bit` | One-hot ALU selector flag |
-| `XOR` | `Bit` | One-hot ALU selector flag |
-| `SHIFT` | `Bit` | One-hot ALU selector flag |
-| `JALR` | `Bit` | One-hot ALU selector flag |
-| `BEQ` | `Bit` | One-hot ALU selector flag |
-| `BLT` | `Bit` | One-hot ALU selector flag |
-| `LOAD` | `Bit` | One-hot ALU selector flag |
-| `STORE` | `Bit` | One-hot ALU selector flag |
-| `MUL` | `Bit` | One-hot ALU selector flag |
-| `DIVREM` | `Bit` | One-hot ALU selector flag |
-| `ECALL` | `Bit` | One-hot ALU selector flag |
-| `EBREAK` | `Bit` | One-hot ALU selector flag |
+| `timestamp` | `DWordWL` | The timestamp for the CPU row |
+| `pc` | `DWordWL` | The PC at which the instruction occurs |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `next_pc` | `DWordWL` | The program counter for the next instruction |
-| `rvd` | `DWordWL` | The value to (maybe) be written back to rvd |
+| `half_instruction_length` | `Byte` | The length of this instruction |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `prev_pc_timestamp_borrow` | `Bit` | The borrow bit for computing the previous timestamp the PC was accessed |
-| `pc_double_read` | `Bit` | Whether the PC is being read as a general purpose register (`rs1`) this cycle |
-| `rv1` | `DWordWHH` | The value of register `rs1` |
-| `rv2` | `DWordWHH` | The value of register `rs2` |
-| `rv1_ext_bit` | `Bit` | The sign bit of `rv1` if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `arg1` | `DWordBL` | The extended version of `rv1`, depending on `word_instr` |
-| `rv2_ext_bit` | `Bit` | The sign bit of `rv2` if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `arg2` | `DWordBL` | A multiplexed version of `rv2` and `imm`, to be used as second argument to ALU calls |
-| `res_ext_bit` | `Bit` | The sign bit of `res`, if seen as a 32-bit word, used for sign extension with `word_instr` |
-| `res` | `DWordBL` | The ALU result |
-| `is_equal` | `Bit` | Whether `rv1` and `arg2` are equal |
-| `branch_cond` | `Bit` | Whether a branch is taken, i.e., the branch condition |
+| `rs1` | `Byte` | Source register 1 |
+| `read_register1` | `Bit` | Whether to read from `rs1` or not |
+| `rv1` | `DWordWHH` | The value in register `rs1` |
+| `rv1_sign` | `Bit` | The sign bit of the lower word of `rv1` |
+| `arg1` | `DWordWL` | The sign-extended version of `rv1` |
+| `rs2` | `Byte` | Source register 2 |
+| `read_register2` | `Bit` | Whether to read from `rs2` |
+| `rv2` | `DWordWHH` | The value in register `rs2` |
+| `rv2_sign` | `Bit` | The sign bit of the lower word of `rv2` |
+| `imm` | `DWordWL` | The fully sign-extended immediate to use |
+| `arg2` | `DWordWL` | Either the sign-extended version of `rv2` or all of `imm` |
+| `res` | `DWordHL` | The ALU result |
+| `res_sign` | `Bit` | The sign bit of the lower word of `res` |
+| `rd` | `Byte` | Destination register |
+| `write_register` | `Bit` | Whether to write back to `rd` |
+| `rvd` | `DWordWL` | The value to write back to `rd`, the sign-extended version of `res` |
+| `ALU` | `Bit` | Whether the full ALU is active |
+| `alu_flags` | `Byte` | The ALU operation + flags |
+| `ADD` | `Bit` | Whether the `ADD` operation is active |
+| `SUB` | `Bit` | Whether the `SUB` operation is active |
+| `signed` | `Bit` | Whether the instruction is signed or not. Extracted from `alu_flags`, used to determine the extension for the inputs |
 
 ### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
-| `packed_decode` | `BaseField` | A packed representation of all bit flags and register indices obtained from the decoding |
-| `pad` | `Bit` | When no flags are set, we must be in a padding row. |
+| `packed_decode` | `BaseField` | The packed representation of all flags and information from the decode table |
 
 **Definition of `packed_decode`:**
 ```
-packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * memory_2bytes + 2^4 * memory_4bytes + 2^5 * memory_8bytes + 2^6 * c_type_instruction + 2^7 * signed + 2^8 * mp_selector + 2^9 * muldiv_selector + 2^10 * word_instr + 2^11 * ADD + 2^12 * SUB + 2^13 * SLT + 2^14 * AND + 2^15 * OR + 2^16 * XOR + 2^17 * SHIFT + 2^18 * JALR + 2^19 * BEQ + 2^20 * BLT + 2^21 * LOAD + 2^22 * STORE + 2^23 * MUL + 2^24 * DIVREM + 2^25 * ECALL + 2^26 * EBREAK + 2^27 * rs1 + 2^35 * rs2 + 2^43 * rd
+packed_decode := 2^0 * read_register1 + 2^1 * read_register2 + 2^2 * write_register + 2^3 * 1 + 2^4 * ALU + 2^5 * ADD + 2^6 * SUB + 2^10 * rs1 + 2^18 * rs2 + 2^26 * rd + 2^34 * half_instruction_length + 2^42 * alu_flags
 ```
 
-**Definition of `pad`:**
-```
-pad := 1 - ADD - SUB - SLT - AND - OR - XOR - SHIFT - JALR - BEQ - BLT - LOAD - STORE - MUL - DIVREM - ECALL - EBREAK
-```
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
 
 ## Assumptions
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `CPU-A1` |  | At most one ALU selector flag is 1 by the decoding, and every other flag is 0. |
-| `CPU-A2` |  | When `STORE + LOAD + BEQ + BLT = 0`, either `rs2 = 0` or `imm = 0` should be enforced by the decoding. This is needed for `arg2`. |
+| `CPU32-A1.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+| `CPU32-A2.i` | i ∈ [0, 1] | `IS_WORD[pc[i]]` |
+| `CPU32-A3` |  | `read_register2 = 0` or `imm = 0`, enforced by decoding. |
+
+Some of the assumptions can be checked with only arithmetic constraints, so we provide these below.
+
+| Tag | Description |
+|-----|-------------|
+| `CPU32-C1` | `read_register2 = 0` or `imm = 0` |
+| | _polynomial:_ `read_register2 * (imm[0] + imm[1]) = 0` |
 
 ## Constraints
 
-First, we perform a decoding lookup for the current PC.
+Most constraints correspond to those already present in the CPU, and we present them here first, including some updates to the range checking corresponding to the differing types. We also need to make sure that for padding rows (`μ = 0`), no side effects can occur.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-C1` | `DECODE[pc, imm, packed_decode]` | 1 |
-
-### Range checks
-
-> **Note:** Make sure we argue for every column here
-
-> **Note:** is `rvd` still sufficiently constrained? (can also be done through the memory argument like `pc`?)
-
-We constrain all columns to have the appropriate ranges. The flags and register indices looked up from the decoding need to be checked, as they are communicated through the interaction in a packed form. In contrast, we know ahead of time that decoding will ensure proper range checks for `pc` and `imm`. Similarly, since `next_pc` will propagate through the memory argument and be looked up in the instruction decoding on the next cycle, it is forced to be in the correct range. For the auxiliary columns, we need to check the limbs of `arg1`, `arg2`, and `res`. The ranges of the other auxiliary columns are enforced through later constraints.
-
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `CPU-CR2` |  | `IS_BIT<read_register1>` |  |
-| `CPU-CR3` |  | `IS_BIT<read_register2>` |  |
-| `CPU-CR4` |  | `IS_BIT<write_register>` |  |
-| `CPU-CR5` |  | `IS_BIT<memory_2bytes>` |  |
-| `CPU-CR6` |  | `IS_BIT<memory_4bytes>` |  |
-| `CPU-CR7` |  | `IS_BIT<memory_8bytes>` |  |
-| `CPU-CR8` |  | `IS_BIT<c_type_instruction>` |  |
-| `CPU-CR9` |  | `IS_BIT<signed>` |  |
-| `CPU-CR10` |  | `IS_BIT<mp_selector>` |  |
-| `CPU-CR11` |  | `IS_BIT<muldiv_selector>` |  |
-| `CPU-CR12` |  | `IS_BIT<word_instr>` |  |
-| `CPU-CR13` |  | `IS_BIT<ADD>` |  |
-| `CPU-CR14` |  | `IS_BIT<SUB>` |  |
-| `CPU-CR15` |  | `IS_BIT<SLT>` |  |
-| `CPU-CR16` |  | `IS_BIT<AND>` |  |
-| `CPU-CR17` |  | `IS_BIT<OR>` |  |
-| `CPU-CR18` |  | `IS_BIT<XOR>` |  |
-| `CPU-CR19` |  | `IS_BIT<SHIFT>` |  |
-| `CPU-CR20` |  | `IS_BIT<JALR>` |  |
-| `CPU-CR21` |  | `IS_BIT<BEQ>` |  |
-| `CPU-CR22` |  | `IS_BIT<BLT>` |  |
-| `CPU-CR23` |  | `IS_BIT<LOAD>` |  |
-| `CPU-CR24` |  | `IS_BIT<STORE>` |  |
-| `CPU-CR25` |  | `IS_BIT<MUL>` |  |
-| `CPU-CR26` |  | `IS_BIT<DIVREM>` |  |
-| `CPU-CR27` |  | `IS_BIT<ECALL>` |  |
-| `CPU-CR28` |  | `IS_BIT<EBREAK>` |  |
-| `CPU-CR29` |  | `IS_BYTE[rs1]` | 1 |
-| `CPU-CR30` |  | `IS_BYTE[rs2]` | 1 |
-| `CPU-CR31` |  | `IS_BYTE[rd]` | 1 |
-| `CPU-CR32.i` | i ∈ [0, 7] | `IS_BYTE[arg1[i]]` | 1 |
-| `CPU-CR33.i` | i ∈ [0, 7] | `IS_BYTE[arg2[i]]` | 1 |
-| `CPU-CR34.i` | i ∈ [0, 7] | `IS_BYTE[res[i]]` | 1 |
-
-### ALU
-
-The ALU functionality is then obtained through judicious dispatching to the corresponding chips.
+| `CPU32-C2` | `DECODE[pc, imm, packed_decode]` | μ |
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `CPU-CA35` |  | ADD + LOAD ⇒ `ADD<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `CPU-CA36` |  | STORE ⇒ `ADD<res::DWordWL; arg1::DWordWL, imm>` |  |
-| `CPU-CA37` |  | SUB + BEQ ⇒ `SUB<res::DWordWL; arg1::DWordWL, arg2::DWordWL>` |  |
-| `CPU-CA38` |  | `LT[res[0]; arg1::DWordWL, arg2::DWordWL, signed]` | SLT + BLT |
-| `CPU-CA39.i` | i ∈ [1, 7] | `SLT` + `BLT` => `res[i]` = 0 |  |
-| | | _polynomial:_ `(SLT + BLT) * res[i] = 0` | |
-| `CPU-CA40.i` | i ∈ [0, 7] | `AND_BYTE[res[i]; arg1[i], arg2[i]]` | AND |
-| `CPU-CA41.i` | i ∈ [0, 7] | `OR_BYTE[res[i]; arg1[i], arg2[i]]` | OR |
-| `CPU-CA42.i` | i ∈ [0, 7] | `XOR_BYTE[res[i]; arg1[i], arg2[i]]` | XOR |
-| `CPU-CA43` |  | `SHIFT[res::DWordWL; arg1::DWordHL, arg2[0], mp_selector, signed, word_instr]` | SHIFT |
-| `CPU-CA44` |  | JALR ⇒ `ADD<res::DWordWL; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
-| `CPU-CA45` |  | `MUL[res::DWordWL; arg1::DWordHL, signed, arg2::DWordHL, mp_selector, muldiv_selector]` | MUL |
-| `CPU-CA46` |  | `DVRM[res::DWordWL; arg1::DWordHL, arg2::DWordHL, signed, muldiv_selector]` | DIVREM |
-
-### Memory<cpu:memory>
-
-The interactions with the memory, both for register loading and storing, as for `LOAD` and `STORE` instructions are handled. Note that since registers need no byte-addressing, we store them in the memory argument with `Word` limbs. The `pc` register behaves very predictably with respect to its timestamps and when it is being read, so for performance reasons, we inline its memory interactions directly into the  chip.
+| `CPU32-CR3` |  | `IS_BIT<μ>` |  |
+| `CPU32-CR4` |  | `IS_BIT<read_register1>` |  |
+| `CPU32-CR5` |  | `IS_BIT<read_register2>` |  |
+| `CPU32-CR6` |  | `IS_BIT<write_register>` |  |
+| `CPU32-CR7` |  | `IS_BYTE<half_instruction_length>` |  |
+| `CPU32-CR8` |  | `IS_BIT<ALU>` |  |
+| `CPU32-CR9` |  | `IS_BYTE<alu_flags>` |  |
+| `CPU32-CR10` |  | `IS_BIT<ADD>` |  |
+| `CPU32-CR11` |  | `IS_BIT<SUB>` |  |
+| `CPU32-CR12` |  | `IS_BYTE<rs1>` |  |
+| `CPU32-CR13` |  | `IS_BYTE<rs2>` |  |
+| `CPU32-CR14` |  | `IS_BYTE<rd>` |  |
+| `CPU32-CR15.i` | i ∈ [0, 1] | `IS_HALF[rv1[i]]` | μ |
+| `CPU32-CR16.i` | i ∈ [0, 1] | `IS_HALF[rv2[i]]` | μ |
+| `CPU32-CR17.i` | i ∈ [0, 3] | `IS_HALF[res[i]]` | μ |
 
-Potentially overlapping memory accesses are ensured to have disjoint timestamps. One consequence of that is that `next_pc` is written at `timestamp + 1` to ensure the access is disjoint with the `pc` read into `rv1` as part of the `AUIPC` instruction (see [cpu:c:read_rv1] and [decode]:decoding-overview). Constraints regarding whether `pc_double_read` corresponds to an `AUIPC` instruction are not necessary, as regardless of its value, the old timestamp is guaranteed smaller than the new timestamp, and the integrity of the memory argument therefore ensures the correctness of this bit.
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `CPU32-CA18` | ADD ⇒ `ADD<res::DWordWL; arg1, arg2>` |  |
+| `CPU32-CA19` | SUB ⇒ `SUB<res::DWordWL; arg1, arg2>` |  |
+| `CPU32-CA20` | `ALU[res::DWordWL; arg1, arg2, alu_flags]` | ALU |
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `CPU-CM47` |  | `MEMW[['arr', ['idx', ['cast', 'rv1', 'DWordWL'], 0], ['idx', ['cast', 'rv1', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs1, ['arr', ['idx', ['cast', 'rv1', 'DWordWL'], 0], ['idx', ['cast', 'rv1', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
-| `CPU-CM48.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
+| `CPU32-CM21` |  | `MEMW[[(rv1::DWordWL)[0], rv1[2], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs1, [(rv1::DWordWL)[0], rv1[2], 0, 0, 0, 0, 0, 0], timestamp + 0::DWordWL, 1, 0, 0]` | read_register1 |
+| `CPU32-CM22.i` | i ∈ [0, 2] | `!read_register1` => `rv1[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register1) * rv1[i] = 0` | |
-| `CPU-CM49` |  | `MEMW[['arr', ['idx', ['cast', 'rv2', 'DWordWL'], 0], ['idx', ['cast', 'rv2', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs2, ['arr', ['idx', ['cast', 'rv2', 'DWordWL'], 0], ['idx', ['cast', 'rv2', 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
-| `CPU-CM50.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
+| `CPU32-CM23` |  | `MEMW[[(rv2::DWordWL)[0], rv2[2], 0, 0, 0, 0, 0, 0]; 1, 2::DWordWL * rs2, [(rv2::DWordWL)[0], rv2[2], 0, 0, 0, 0, 0, 0], timestamp + 1::DWordWL, 1, 0, 0]` | read_register2 |
+| `CPU32-CM24.i` | i ∈ [0, 2] | `!read_register2` => `rv2[i]` = 0 |  |
 | | | _polynomial:_ `(1 - read_register2) * rv2[i] = 0` | |
-| `CPU-CM51` |  | `MEMW[1, 2::DWordWL * rd, ['arr', ['idx', 'rvd', 0], ['idx', 'rvd', 1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
-| `CPU-CM52` |  | `LOAD[rvd; res::DWordWL, timestamp + 0::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes, signed]` | LOAD |
-| `CPU-CM53` |  | `MEMW[0, res::DWordWL, arg2::Byte[8], timestamp + 1::DWordWL, memory_2bytes, memory_4bytes, memory_8bytes]` | STORE |
-| `CPU-CM54` |  | `IS_BIT<pc_double_read>` |  |
-| `CPU-CM55` |  | `IS_BIT<prev_pc_timestamp_borrow>` |  |
-| `CPU-CM56.i` | i ∈ [0, 1] | `memory[1, ['arr', ['+', ['*', 2, 255], 'i'], 0], ['arr', ['+', ['-', ['idx', 'timestamp', 0], ['*', 3, ['not', 'pc_double_read']]], ['*', ['^', 2, 32], 'prev_pc_timestamp_borrow']], ['-', ['idx', 'timestamp', 1], 'prev_pc_timestamp_borrow']], pc[i]]` | 1 - pad |
-| `CPU-CM57.i` | i ∈ [0, 1] | `memory[1, ['arr', ['+', ['*', 2, 255], 'i'], 0], timestamp + 1::DWordWL, next_pc[i]]` | -(1 - pad) |
-
-#### Potential optimizations
-
-- `double_pc_read` could be integrated into decoding, so that `AUIPC` could set `read_register1 = 0` and no extra MEMW access for `rv1` is needed at this point.
-
-### System
-
-The interactions with the wider system.
+| `CPU32-CM25` |  | `MEMW[1, 2::DWordWL * rd, [rvd[0], rvd[1], 0, 0, 0, 0, 0, 0], timestamp + 2::DWordWL, 1, 0, 0]` | write_register |
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CS58` | `!EBREAK` |  |
-| | _polynomial:_ `1 - EBREAK = 0` | |
-| `CPU-CS59` | `ECALL[timestamp, rv1::DWordWL]` | ECALL |
+| `CPU32-C26` | `!μ` => `read_register1 = 0` |  |
+| | _polynomial:_ `(1 - μ) * read_register1 = 0` | |
+| `CPU32-C27` | `!μ` => `read_register2 = 0` |  |
+| | _polynomial:_ `(1 - μ) * read_register2 = 0` | |
+| `CPU32-C28` | `!μ` => `write_register = 0` |  |
+| | _polynomial:_ `(1 - μ) * write_register = 0` | |
+| `CPU32-C29` | `CPU32[half_instruction_length; timestamp, pc]` | -μ |
 
-### Input and output to the ALU
-
-We constrain `arg1`, `arg2` and `rvd` to correspond to the wanted values, including the appropriate sign/zero extension, depending on `word_instr`.
-
-| Tag | Description |
-|-----|-------------|
-| `CPU-CE60` | `SIGN<rv1_ext_bit; rv1[1], word_instr>` |
-| `CPU-CE61` | `arg1[:4]` = `rv1[:2]` |
-| | _polynomial:_ `(arg1::DWordWL)[0] - (rv1::DWordWL)[0] = 0` |
-| `CPU-CE62` | `arg1[4:]` = `rv1[2]` dot (1 - `word_instr`) + (2^(32) - 1) dot `rv1_ext_bit` dot `signed` |
-| | _polynomial:_ `(arg1::DWordWL)[1] - (1 - word_instr) * rv1[2] - signed * rv1_ext_bit * (2^32 - 1) = 0` |
-| `CPU-CE63` | `SIGN<rv2_ext_bit; rv2[1], word_instr>` |
-| `CPU-CE64` | `arg2[:4]` = (1 - `LOAD`) dot `rv2[:2]` + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[0]` |
-| | _polynomial:_ `(arg2::DWordWL)[0] - (1 - LOAD) * (rv2::DWordWL)[0] - (1 - BEQ - BLT - STORE) * imm[0] = 0` |
-| `CPU-CE65` | `arg2[4:]` = (1 - `LOAD`) dot ((1 - `word_instr`) dot `rv2[2]` + `signed` dot `rv2_ext_bit` dot (2^(32) - 1)) + (1 - `BEQ` - `BLT` - `STORE`) dot `imm[1]` |
-| | _polynomial:_ `(arg2::DWordWL)[1] - (1 - LOAD) * (1 - word_instr) * rv2[2] - (1 - LOAD) * signed * rv2_ext_bit * (2^32 - 1) - (1 - BEQ - BLT - STORE) * imm[1] = 0` |
-| `CPU-CE66` | `SIGN<res_ext_bit; (res::DWordHL)[1], word_instr>` |
-| `CPU-CE67` | `!LOAD` => `rvd[0]` = `res[:4]` |
-| | _polynomial:_ `(1 - LOAD) * (rvd[0] - (res::DWordWL)[0]) = 0` |
-| `CPU-CE68` | `!LOAD` => `rvd[1]` = (1 - `word_instr`) dot `res[4:]` + `res_ext_bit` dot (2^(32) - 1) |
-| | _polynomial:_ `(1 - LOAD) * (rvd[1] - (1 - word_instr) * (res::DWordWL)[1] - res_ext_bit * (2^32 - 1)) = 0` |
-
-### Other constraints
-
-For [cpu:c:is_equal], note that [cpu:c:sub] sets `res` to be the difference between `arg1` and `arg2` whenever `BEQ` is `1`. Given that this difference is `0` when both are equal, [cpu:c:is_equal] ensures `is_equal` is set to `1` if and only if ``arg1` = `arg2`` and `BEQ` is set.
+Then, we have the constraints corresponding to the sign-extension and definition of `arg1`, `arg2` and `rvd`. This includes a step where we extract the `signed` bit from the `alu_flags`, as this determines whether to sign extend the inputs or not.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `CPU-CO69` | `ZERO[is_equal; res[0] + res[1] + res[2] + res[3] + res[4] + res[5] + res[6] + res[7]]` | BEQ |
-| `CPU-CO70` | `branch_cond` = `JALR` or (`BLT` and (`res` xor `invert`)) or (`BEQ` and (`is_equal` xor `invert`)) |  |
-| | _polynomial:_ `-branch_cond + JALR + res[0] * (1 - mp_selector) * BLT + (1 - res[0]) * mp_selector * BLT + is_equal * (1 - mp_selector) * BEQ + (1 - is_equal) * mp_selector * BEQ = 0` | |
-| `CPU-CO71` | `BRANCH[next_pc; pc, imm, arg1::DWordWL, JALR]` | branch_cond |
-| `CPU-CO72` | `ADD<next_pc; pc, (2 * c_type_instruction + 4 * (1 - c_type_instruction)) * 1::DWordWL>` |  |
-
-> **Note:** Document the choice to not have a multiplicity column here for padding
+| `CPU32-C30` | `signed` != 0 => `μ` = 1 |  |
+| | _polynomial:_ `signed * (1 - μ) = 0` | |
+| `CPU32-C31` | `BYTE_ALU[32 * signed; ⧼AND⧽, 32, alu_flags]` | μ |
+| `CPU32-C32` | `SIGN<rv1_sign; rv1[1], signed>` |  |
+| `CPU32-C33` | `arg1[0]` = `rv1[:2]` |  |
+| | _polynomial:_ `arg1[0] - (rv1::DWordWL)[0] = 0` | |
+| `CPU32-C34` | `arg1[1]` = (2^(32) - 1) dot `rv1_sign` |  |
+| | _polynomial:_ `arg1[1] - (2^32 - 1) * rv1_sign = 0` | |
+| `CPU32-C35` | `SIGN<rv2_sign; rv2[1], signed>` |  |
+| `CPU32-C36` | `arg2[0]` = `rv2[:2]` + `imm[0]` |  |
+| | _polynomial:_ `arg2[0] - (rv2::DWordWL)[0] - imm[0] = 0` | |
+| `CPU32-C37` | `arg2[1]` = (2^(32) - 1) dot `rv2_sign` + `imm[1]` |  |
+| | _polynomial:_ `arg2[1] - (2^32 - 1) * rv2_sign - imm[1] = 0` | |
+| `CPU32-C38` | `SIGN<res_sign; res[1], μ>` |  |
+| `CPU32-C39` | `rvd[0]` = `res[:2]` |  |
+| | _polynomial:_ `rvd[0] - (res::DWordWL)[0] = 0` | |
+| `CPU32-C40` | `rvd[1]` = (2^(32) - 1) dot `res_sign` |  |
+| | _polynomial:_ `rvd[1] - (2^32 - 1) * res_sign = 0` | |
 
 ## Padding
 
-The CPU can be padded with the following values, which have a corresponding row in the DECODE table, at the _odd_ address 1, only reachable through a HALT ecall.
-
-This approach minimizes the number of dependent lookups, increasing only multiplicities in the DECODE table and the IS_BYTE lookup.
+The table can be padded with the following values:
+
+| Column | Padding value |
+|--------|---------------|
+| `timestamp` | `0` |
+| `pc` | `0` |
+| `half_instruction_length` | `2` |
+| `rs1` | `0` |
+| `read_register1` | `0` |
+| `rv1` | `0` |
+| `rv1_sign` | `0` |
+| `arg1` | `0` |
+| `rs2` | `0` |
+| `read_register2` | `0` |
+| `rv2` | `0` |
+| `rv2_sign` | `0` |
+| `imm` | `0` |
+| `arg2` | `0` |
+| `res` | `0` |
+| `res_sign` | `0` |
+| `rd` | `0` |
+| `write_register` | `0` |
+| `rvd` | `0` |
+| `ALU` | `0` |
+| `alu_flags` | `0` |
+| `ADD` | `0` |
+| `SUB` | `0` |
+| `signed` | `0` |
+| `μ` | `0` |
 
 ---
 
@@ -1129,7 +1044,7 @@ The `SHIFT` chip is comprised of  variables that are expressed using  columns an
 | Name | Type | Description |
 |------|------|-------------|
 | `in` | `DWordHL` | The value being shifted |
-| `shift` | `Byte` | Number of bits to shift `in` by. |
+| `shift` | `DWordWHBB` | Number of bits to shift `in` by. |
 | `direction` | `Bit` | Whether to shift left (0) or right (1). |
 | `signed` | `Bit` | Whether to interpret `in` as a signed integer. |
 | `word_instr` | `Bit` | Whether this is a Word-instruction (1) or not (0). |
@@ -1206,16 +1121,6 @@ shifted := left * Σ_j = 0^i limb_shift[j] * intra_limb_left[i - j] + right * (
 |------|------|-------------|
 | `μ` | `Bit` |  |
 
-## Assumptions
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `SHIFT-A1.i` | i ∈ [0, 3] | `IS_HALF[in[i]]` |
-| `SHIFT-A2` |  | `IS_BYTE[shift]` |
-| `SHIFT-A3` |  | `IS_BIT<direction>` |
-| `SHIFT-A4` |  | `IS_BIT<signed>` |
-| `SHIFT-A5` |  | `IS_BIT<word_instr>` |
-
 ## Explanation
 
 This chip has a rather complex design as a result of designing it to fit in as few columns possible. We briefly discuss the intricacies of the design, attempting to illustrate its correctness.
@@ -1248,13 +1153,24 @@ Lastly, we discuss the case of performing the _arithmetic_ right shift. Here, `e
 
 ## Constraints
 
-First, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that ``right` = 1` and ``shift` = 0 mod 16`.
+First, we range check our inputs appropriately.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `SHIFT-C1.i` | i ∈ [0, 3] | `IS_HALF[in[i]]` | μ |
+| `SHIFT-C2` |  | `IS_HALF[shift[2]]` | μ |
+| `SHIFT-C3.i` | i ∈ [0, 1] | `IS_BYTE<shift[i]>` |  |
+| `SHIFT-C4` |  | `IS_BIT<direction>` |  |
+| `SHIFT-C5` |  | `IS_BIT<signed>` |  |
+| `SHIFT-C6` |  | `IS_BIT<word_instr>` |  |
+
+Then, we constrain `bit_shift` based on whether we are left or right-shifting. [shift:c:zbs] makes sure `zbs` is set to `1` if and only if `bit_shift = 0`. This flag is used to indicate the special case that ``right` = 1` and ``shift` = 0 mod 16`.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C1` | `AND_BYTE[bit_shift; shift, 15]` | left |
-| `SHIFT-C2` | `AND_BYTE[bit_shift; 2^8 - 16 * zbs - shift, 15]` | right |
-| `SHIFT-C3` | `ZERO[zbs; bit_shift]` | μ |
+| `SHIFT-C7` | `BYTE_ALU[bit_shift; ⧼AND⧽, shift[0], 15]` | left |
+| `SHIFT-C8` | `BYTE_ALU[bit_shift; ⧼AND⧽, 2^8 - 16 * zbs - shift[0], 15]` | right |
+| `SHIFT-C9` | `ZERO[zbs; bit_shift]` | μ |
 
 Next, we shift the limbs of `in` left and right by the appropriate amount, storing the results in `X` and `Y` respectively. When `zbs = 1`, the output cannot be used to compose ``in >>/>>> shift` mod 16`. To resolve this, we override `Y[i] := in[i]` and `X[i] := 0` in this case.
 
@@ -1262,13 +1178,13 @@ The case of `left`-shifting and ``bit_shift` = 0` will be used for padding rows.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C4.i` | i ∈ [0, 3] | `HWSL[['arr', ['idx', 'X', 'i'], ['idx', 'Y', 'i']]; in[i], bit_shift]` | 1 - zbs |
-| `SHIFT-C5.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
+| `SHIFT-C10.i` | i ∈ [0, 3] | `HWSL[[X[i], Y[i]]; in[i], bit_shift]` | 1 - zbs |
+| `SHIFT-C11.i` | i ∈ [0, 3] | `zbs` => `X[i]` = `in[i]` dot `left` |  |
 | | | _polynomial:_ `zbs * (X[i] - in[i] * left) = 0` | |
-| `SHIFT-C6.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
+| `SHIFT-C12.i` | i ∈ [0, 3] | `zbs` => `Y[i]` = `in[i]` dot `right` |  |
 | | | _polynomial:_ `zbs * (Y[i] - in[i] * right) = 0` | |
-| `SHIFT-C7` |  | `HWSL[['arr', ['idx', 'X', 4], ['-', 'extension', ['idx', 'X', 4]]]; extension, bit_shift]` | 1 - zbs |
-| `SHIFT-C8` |  | `zbs` => `X[4]` = 0 |  |
+| `SHIFT-C13` |  | `HWSL[[X[4], extension - X[4]]; extension, bit_shift]` | 1 - zbs |
+| `SHIFT-C14` |  | `zbs` => `X[4]` = 0 |  |
 | | | _polynomial:_ `zbs * X[4] = 0` | |
 
 ### Full-limb shifting
@@ -1279,21 +1195,21 @@ Hereafter, one must only check that `out` is the proper cast of `shifted` into a
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHIFT-C9.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
-| `SHIFT-C10` |  | `AND_BYTE[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; shift, 48 - 32 * word_instr]` | μ |
-| `SHIFT-C11.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
+| `SHIFT-C15.i` | i ∈ [0, 3] | `IS_BIT<limb_shift[i]>` |  |
+| `SHIFT-C16` |  | `BYTE_ALU[(1 - limb_shift[0]) + 15 * limb_shift[1] + 31 * limb_shift[2] + 47 * limb_shift[3]; ⧼AND⧽, shift[0], 48 - 32 * word_instr]` | μ |
+| `SHIFT-C17.i` | i ∈ [0, 1] | `out[:2]` = `shifted[:4]` |  |
 | | | _polynomial:_ `out[i] - (shifted::DWordWL)[i] = 0` | |
 
 ### Miscellaneous
 
 | Tag | Description |
 |-----|-------------|
-| `SHIFT-C12` | `direction` => `μ` = 1 |
+| `SHIFT-C18` | `direction` => `μ` = 1 |
 | | _polynomial:_ `direction * (1 - μ) = 0` |
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C13` | `MSB16[is_negative; in[3]]` | signed |
+| `SHIFT-C19` | `MSB16[is_negative; in[3]]` | signed |
 
 *Note*: `is_negative` is not used when `signed = 0`. As such, there is no problem with it being unconstrained in this case.
 
@@ -1303,12 +1219,28 @@ This chip adds the following interaction to the lookup.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHIFT-C14` | `SHIFT[out; in, shift, direction, signed, word_instr]` | -μ |
+| `SHIFT-C20` | `ALU[out; in::DWordWL, shift::DWordWL, ⧼SHIFT⧽ + word_instr + 32 * signed + 64 * direction]` | -μ |
 
 ## Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
+| Column | Padding value |
+|--------|---------------|
+| `in` | `0` |
+| `shift` | `0` |
+| `direction` | `0` |
+| `signed` | `0` |
+| `word_instr` | `0` |
+| `out` | `0` |
+| `is_negative` | `0` |
+| `bit_shift` | `0` |
+| `zbs` | `1` |
+| `X` | `[0, 0, 0, 0, 0]` |
+| `Y` | `[0, 0, 0, 0]` |
+| `limb_shift_raw` | `[0, 0, 0]` |
+| `μ` | `0` |
+
 ---
 
 # BRANCH Chip
@@ -1375,6 +1307,12 @@ next_pc (when iter=1) := 2^16 * next_pc_high[2] + next_pc_high[1]
 | `BRANCH-A3.i` | i ∈ [0, 1] | `register` is range checked, `IS_WORD[register[i]]` |
 | `BRANCH-A4` |  | `IS_BIT<JALR>` |
 
+Some of the assumptions can be checked with only arithmetic constraints, so we provide these below.
+
+| Tag | Description |
+|-----|-------------|
+| `BRANCH-C1` | `IS_BIT<JALR>` |
+
 ## Constraints
 
 We constrain `next_pc` to be ``base_address` + `offset``, where `base_address` equals `pc` when ``JALR` = 0` and `register` otherwise.
@@ -1383,27 +1321,38 @@ The range checks on `unmasked_low_byte` and `next_pc_low[0]` are performed impli
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `BRANCH-C1` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
-| `BRANCH-C2` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
-| `BRANCH-C3` |  | `IS_BYTE[next_pc_low[1]]` | μ |
-| `BRANCH-C4` |  | `AND_BYTE[next_pc_low[0]; unmasked_low_byte, 254]` | μ |
-| `BRANCH-C5.i` | i ∈ [0, 2] | `IS_HALF[next_pc_high[i]]` | μ |
+| `BRANCH-C2` |  | 1 - JALR ⇒ `ADD<next_pc_unmasked; pc, offset::DWordWL>` |  |
+| `BRANCH-C3` |  | JALR ⇒ `ADD<next_pc_unmasked; register, offset::DWordWL>` |  |
+| `BRANCH-C4` |  | μ ⇒ `IS_BYTE<next_pc_low[1]>` |  |
+| `BRANCH-C5` |  | `BYTE_ALU[next_pc_low[0]; ⧼AND⧽, unmasked_low_byte, 254]` | μ |
+| `BRANCH-C6.i` | i ∈ [0, 2] | `IS_HALF[next_pc_high[i]]` | μ |
 
 This chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `BRANCH-C6` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
+| `BRANCH-C7` | `BRANCH[next_pc; pc, offset, register, JALR]` | -μ |
 
 ## Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
+| Column | Padding value |
+|--------|---------------|
+| `pc` | `0` |
+| `offset` | `0` |
+| `register` | `0` |
+| `JALR` | `0` |
+| `next_pc_high` | `[0, 0, 0]` |
+| `next_pc_low` | `0` |
+| `unmasked_low_byte` | `0` |
+| `μ` | `0` |
+
 ---
 
 # LT Chip
 
-The  chip constrains an indicator bit for the less-than relation, signed or unsigned.
+The LT chip constrains an indicator bit for the less-than relation, signed or unsigned. If the `invert` flag is set, it inverts the result.
 
 ## Variables
 
@@ -1416,12 +1365,13 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 | `lhs` | `DWordHHW` | The left operand |
 | `rhs` | `DWordHHW` | The right operand |
 | `signed` | `Bit` | whether to interpret `lhs` and `rhs` as signed integers (1) or not (0) |
+| `invert` | `Bit` | Whether to invert the result |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `lt` | `Bit` | Whether $`lhs` < `rhs`$, taking `signed` into account |
+| `res` | `Bit` | The result |
 
 ### Auxiliary
 
@@ -1430,6 +1380,7 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 | `lhs_sub_rhs` | `DWordHL` | $`lhs` - `rhs`$ |
 | `lhs_msb` | `Bit` | The most significant bit of `lhs` |
 | `rhs_msb` | `Bit` | The most significant bit of `rhs` |
+| `lt` | `Bit` | Whether $`lhs` < `rhs`$, taking `signed` into account |
 
 ### Virtual
 
@@ -1463,11 +1414,10 @@ We assume the inputs `lhs`, `rhs` and `signed` are partially range checked.
 |-----|-------|-------------|
 | `LT-A1` |  | `IS_WORD[lhs[0]]` |
 | `LT-A2` |  | `IS_WORD[rhs[0]]` |
-| `LT-A3` |  | `IS_BIT<signed>` |
 
 ## Constraints
 
-We first constrain that all variables correspond to their definition. For the defining constraint of `lt`, [lt:c:lt], observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
+We first constrain that all inputs are range checked and all variables correspond to their definition. For the defining constraint of `lt`, [lt:c:lt], observe that it is a choice between two options, depending on the input flag `signed`. In the case of unsigned comparison, we simply need `unsigned_lt`, indicating that a wraparound (carry bit) modulo `2^64` is needed to go from `rhs` to `lhs` via addition. For the case of signed comparison, we first need some case analysis.
 
 We split `a < b` into four disjoint cases, conditioned on the sign of `a` and `b`. Recall that the sign of a number in two's complement can be read off from the MSB, being `1` for a negative number and `0` for a positive one. For this analysis, we denote the MSB of `a` as `A` and the MSB of `b` as `B`. The four disjoint cases then become:
 
@@ -1481,35 +1431,59 @@ The polynomial `P` can be simplified to a total degree of two. We claim that the
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `LT-C1` | `MSB16[lhs_msb; lhs[2]]` | μ |
-| `LT-C2` | `MSB16[rhs_msb; rhs[2]]` | μ |
-| `LT-C3` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
+| `LT-C1` | `IS_HALF[lhs[1]]` | μ |
+| `LT-C2` | `IS_HALF[rhs[1]]` | μ |
+| `LT-C3` | `IS_BIT<signed>` |  |
+| `LT-C4` | `IS_BIT<invert>` |  |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `LT-C5` | `MSB16[lhs_msb; lhs[2]]` | μ |
+| `LT-C6` | `MSB16[rhs_msb; rhs[2]]` | μ |
+| `LT-C7` | `lt` = `signed` dot (A (1 - B) + A C + (1 - B) C) + (1 - `signed`) dot `unsigned_lt` |  |
 | | _polynomial:_ `lt - signed * (lhs_msb * (1 - rhs_msb) + lhs_msb * carry[1] + (1 - rhs_msb) * carry[1]) - (1 - signed) * unsigned_lt = 0` | |
-| `LT-C4` | `IS_HALF[lhs[1]]` | μ |
-| `LT-C5` | `IS_HALF[rhs[1]]` | μ |
+| `LT-C8` | `res` = `lt` xor `invert` |  |
+| | _polynomial:_ `res + 2 * lt * invert - lt - invert = 0` | |
 
 And then we constrain the subtraction, taking care of the remaining range checking not yet covered by the assumptions or the `MSB16` lookup.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `LT-C6.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
-| `LT-C7.i` | i ∈ [0, 3] | `IS_HALF[lhs_sub_rhs[i]]` | μ |
+| `LT-C9.i` | i ∈ [0, 1] | `IS_BIT<carry[i]>` |  |
+| `LT-C10.i` | i ∈ [0, 3] | `IS_HALF[lhs_sub_rhs[i]]` | μ |
 
 The chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `LT-C8` | `LT[lt; lhs::DWordWL, rhs::DWordWL, signed]` | -μ |
+| `LT-C11` | `ALU[[res, 0]; lhs::DWordWL, rhs::DWordWL, ⧼LT⧽ + 32 * signed + 64 * invert]` | -μ |
 
 ## Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
+| Column | Padding value |
+|--------|---------------|
+| `lhs` | `0` |
+| `rhs` | `0` |
+| `signed` | `0` |
+| `invert` | `0` |
+| `res` | `0` |
+| `lhs_sub_rhs` | `0` |
+| `lhs_msb` | `0` |
+| `rhs_msb` | `0` |
+| `lt` | `0` |
+| `μ` | `0` |
+
+## Potential optimizations
+
+- Split the chip into a signed and an unsigned chip, making the unsigned version cheaper.
+
 ---
 
-# MUL Chip
+# EQ Chip
 
-The  chip constrains multiplication, both signed and unsigned, as well as providing access to the low and high halfs of the multiplication result.
+The EQ chip is an ALU chip that compares two values and outputs a bit indicating whether they are equal or not. It optionally inverts the result if the `invert` flag is set.
 
 ## Variables
 
@@ -1519,50 +1493,120 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 
 | Name | Type | Description |
 |------|------|-------------|
-| `lhs` | `DWordHL` | the left hand operator. |
-| `lhs_signed` | `Bit` | whether to interpret `lhs` as a signed integer (1) or not (0). |
-| `rhs` | `DWordHL` | the right hand operator. |
-| `rhs_signed` | `Bit` | whether to interpret `rhs` as a signed integer (1) or not (0). |
+| `a` | `DWordWL` | The first input |
+| `b` | `DWordWL` | The second input |
+| `invert` | `Bit` | Whether to invert the result |
 
 ### Output
 
 | Name | Type | Description |
 |------|------|-------------|
-| `lo` | `DWordHL` | the lower limbs of the (extended) multiplication result |
-| `hi` | `DWordHL` | the upper limbs of the (extended) multiplication result |
+| `res` | `Bit` | The result |
 
 ### Auxiliary
 
 | Name | Type | Description |
 |------|------|-------------|
-| `lhs_is_negative` | `Bit` | whether `lhs` is negative (1) or not (0) |
-| `rhs_is_negative` | `Bit` | whether `rhs` is negative (1) or not (0) |
-| `raw_product` | `B51[4]` | raw multiplication output |
+| `diff` | `DWordHL` | The difference `a - b` |
+| `eq` | `Bit` | The bit indicating `a == b` |
 
-### Virtual
+### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `lhs_ext` | `Half[8]` | sign-extended value of `lhs` |
-| `rhs_ext` | `Half[8]` | sign-extended value of `rhs` |
-| `res` | `QuadWL` | concatenation of `lo` and `hi`. |
-| `carry` | `B20[4]` | carry values |
-| `μ_sum` | `BaseField` | sum of multiplicies |
+| `μ` | `BaseField` |  |
 
-**Definition of `lhs_ext`:**
-```
-lhs_ext (when iter=[0, 3]) := lhs[i]
-lhs_ext (when iter=[4, 7]) := 65535 * lhs_is_negative
-```
+## Assumptions
 
-**Definition of `rhs_ext`:**
-```
-rhs_ext (when iter=[0, 3]) := rhs[i]
-rhs_ext (when iter=[4, 7]) := 65535 * rhs_is_negative
-```
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `EQ-A1.i` | i ∈ [0, 1] | `IS_WORD[a[i]]` |
+| `EQ-A2.i` | i ∈ [0, 1] | `IS_WORD[b[i]]` |
 
-**Definition of `res`:**
-```
+## Constraints
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `EQ-C1.i` | i ∈ [0, 3] | `IS_HALF[diff[i]]` | μ |
+| `EQ-C2` |  | `IS_BIT<invert>` |  |
+| `EQ-C3` |  | `SUB<diff::DWordWL; a, b>` |  |
+| `EQ-C4` |  | `ZERO[eq; diff[0] + diff[1] + diff[2] + diff[3]]` | μ |
+| `EQ-C5` |  | `res` = `eq` xor `invert` |  |
+| | | _polynomial:_ `res + 2 * eq * invert - eq - invert = 0` | |
+| `EQ-C6` |  | `ALU[[res, 0]; a, b, ⧼EQ⧽ + 64 * invert]` | -μ |
+
+## Padding
+
+The chip can be padded with the following values:
+
+| Column | Padding value |
+|--------|---------------|
+| `a` | `0` |
+| `b` | `0` |
+| `invert` | `0` |
+| `res` | `0` |
+| `diff` | `0` |
+| `eq` | `0` |
+| `μ` | `0` |
+
+---
+
+# MUL Chip
+
+The MUL chip constrains multiplication, both signed and unsigned, as well as providing access to the low and high halves of the multiplication result.
+
+## Variables
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs` | `DWordHL` | the left hand operator. |
+| `lhs_signed` | `Bit` | whether to interpret `lhs` as a signed integer (1) or not (0). |
+| `rhs` | `DWordHL` | the right hand operator. |
+| `rhs_signed` | `Bit` | whether to interpret `rhs` as a signed integer (1) or not (0). |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lo` | `DWordHL` | the lower limbs of the (extended) multiplication result |
+| `hi` | `DWordHL` | the upper limbs of the (extended) multiplication result |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs_is_negative` | `Bit` | whether `lhs` is negative (1) or not (0) |
+| `rhs_is_negative` | `Bit` | whether `rhs` is negative (1) or not (0) |
+| `raw_product` | `B51[4]` | raw multiplication output |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `lhs_ext` | `Half[8]` | sign-extended value of `lhs` |
+| `rhs_ext` | `Half[8]` | sign-extended value of `rhs` |
+| `res` | `QuadWL` | concatenation of `lo` and `hi`. |
+| `carry` | `B20[4]` | carry values |
+| `μ_sum` | `BaseField` | sum of multiplicities |
+
+**Definition of `lhs_ext`:**
+```
+lhs_ext (when iter=[0, 3]) := lhs[i]
+lhs_ext (when iter=[4, 7]) := 65535 * lhs_is_negative
+```
+
+**Definition of `rhs_ext`:**
+```
+rhs_ext (when iter=[0, 3]) := rhs[i]
+rhs_ext (when iter=[4, 7]) := 65535 * rhs_is_negative
+```
+
+**Definition of `res`:**
+```
 res (when iter=[0, 1]) := (lo::DWordWL)[i]
 res (when iter=[2, 3]) := (hi::DWordWL)[i - 2]
 ```
@@ -1587,15 +1631,6 @@ carry (when iter=[1, 3]) := 2^-32 * (raw_product[i] + carry[i - 1] - res[i])
 
 `mat(delim: , top; bottom)` }
 
-## Assumptions
-
-The following range checks are assumed to be performed/enforced outside of this chip:
-
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `MUL-A1.i` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` |
-| `MUL-A2.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` |
-
 ## Constraints
 
 ### Overview
@@ -1616,11 +1651,15 @@ We constrain `lhs_is_negative` and `rhs_is_negative` according to their definiti
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `MUL-C1` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
-| `MUL-C2` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
-| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
-| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
-| `MUL-C5.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
+| `MUL-C1` |  | `IS_BIT<lhs_signed>` |  |
+| `MUL-C2` |  | `IS_BIT<rhs_signed>` |  |
+| `MUL-C3.i` | i ∈ [0, 3] | `IS_HALF[lhs[i]]` | μ_sum |
+| `MUL-C4.i` | i ∈ [0, 3] | `IS_HALF[rhs[i]]` | μ_sum |
+| `MUL-C5` |  | `SIGN<lhs_is_negative; lhs[3], lhs_signed>` |  |
+| `MUL-C6` |  | `SIGN<rhs_is_negative; rhs[3], rhs_signed>` |  |
+| `MUL-C7.i` | i ∈ [0, 3] | `IS_HALF[lo[i]]` | μ_sum |
+| `MUL-C8.i` | i ∈ [0, 3] | `IS_HALF[hi[i]]` | μ_sum |
+| `MUL-C9.i` | i ∈ [0, 3] | `IS_B20[carry[i]]` | μ_sum |
 
 ### Product
 
@@ -1628,7 +1667,7 @@ We constrain `lhs_is_negative` and `rhs_is_negative` according to their definiti
 
 | Tag | Range | Description |
 |-----|-------|-------------|
-| `MUL-C6.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
+| `MUL-C10.i` | i ∈ [0, 3] | `raw_product[i]` = sum_(`k`=0)^1 2^(16k) sum_(`j`=0)^(2i+k) `lhs_ext[j]` dot `rhs_ext[2i+k-j]` |
 | | | _polynomial:_ `Σ_k = 0^1 2^(16 * k) * Σ_j = 0^2 * i + k lhs_ext[j] * rhs_ext[2 * i + k - j] - raw_product[i] = 0` |
 
 ### Lookup
@@ -1637,13 +1676,27 @@ The  chip contributes the following to the lookup:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `MUL-C7` | `MUL[lo::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 0]` | -μ_lo |
-| `MUL-C8` | `MUL[hi::DWordWL; lhs, lhs_signed, rhs, rhs_signed, 1]` | -μ_hi |
+| `MUL-C11` | `ALU[lo::DWordWL; lhs::DWordWL, rhs::DWordWL, ⧼MUL⧽ + 32 * lhs_signed + 64 * rhs_signed]` | -μ_lo |
+| `MUL-C12` | `ALU[hi::DWordWL; lhs::DWordWL, rhs::DWordWL, ⧼MUL⧽ + 32 * lhs_signed + 64 * rhs_signed + 128]` | -μ_hi |
 
 ## Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
+| Column | Padding value |
+|--------|---------------|
+| `lhs` | `0` |
+| `lhs_signed` | `0` |
+| `rhs` | `0` |
+| `rhs_signed` | `0` |
+| `lo` | `0` |
+| `hi` | `0` |
+| `lhs_is_negative` | `0` |
+| `rhs_is_negative` | `0` |
+| `raw_product` | `0` |
+| `μ_lo` | `0` |
+| `μ_hi` | `0` |
+
 ## Notes/optimizations
 
 - `lo` and `hi` are stored in `DWordHL`s (rather than `DWordWL`s) because of their values being range checked. Since it is not required that both `μ_lo` and `μ_hi` are non-zero at the same time, one cannot safely assume their range to be checked elsewhere. - As an optimization, one might be able to use a `DWordWL` and `DWordHL` to store `lo` and `hi`, where one would decide which to store in which based on the multiplicities `μ_lo` and `μ_hi`; the value sent into the lookup could then be assumed range-checked by the other side of the relation. This optimization was not included at this moment because of its negative impact on the readability and verifiability of the chip.
@@ -1740,15 +1793,15 @@ carry (when iter=[1, 3]) := 2^-32 * ((extended_n_sub_r::QuadWL)[i] + (extended_r
 | `μ_q` | `BaseField` |  |
 | `μ_r` | `BaseField` |  |
 
-## Assumptions
+## Constraints
 
-| Tag | Range | Description |
-|-----|-------|-------------|
-| `DVRM-A1.i` | i ∈ [0, 3] | `IS_HALF[n[i]]` |
-| `DVRM-A2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` |
-| `DVRM-A3` |  | `IS_BIT<signed>` |
+First, we range-check all inputs.
 
-## Constraints
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C1.i` | i ∈ [0, 3] | `IS_HALF[n[i]]` | μ_sum |
+| `DVRM-C2.i` | i ∈ [0, 3] | `IS_HALF[d[i]]` | μ_sum |
+| `DVRM-C3` |  | `IS_BIT<signed>` |  |
 
 From the ISA, we gather five requirements for the `DIV[U][W]` and `REM[U][W]` instructions:
 
@@ -1760,7 +1813,7 @@ We start with R3, which is straightforwardly asserted by constraint [dvrm:c:sign
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C1` | `r` eq.not 0 => `sign_r` = `sign_n` |
+| `DVRM-C4` | `r` eq.not 0 => `sign_r` = `sign_n` |
 | | _polynomial:_ `Σ_i = 0^3 r[i] * (sign_r - sign_n) = 0` |
 
 ### R2: rounding towards zero
@@ -1773,12 +1826,12 @@ Focusing on the first statement, we observe that this trivially holds when ``sig
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `DVRM-C2` |  | `LT[1 - div_by_zero; abs_r, abs_d, 0]` | μ_sum |
-| `DVRM-C3` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
-| `DVRM-C4.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
+| `DVRM-C5` |  | `ALU[[1 - div_by_zero, 0]; abs_r, abs_d, ⧼LT⧽]` | μ_sum |
+| `DVRM-C6` |  | sign_r ⇒ `NEG<abs_r; r>` |  |
+| `DVRM-C7.i` | i ∈ [0, 1] | not`sign_r` => `abs_r` = `r` |  |
 | | | _polynomial:_ `(1 - sign_r) * (abs_r[i] - (r::DWordWL)[i]) = 0` | |
-| `DVRM-C5` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
-| `DVRM-C6.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
+| `DVRM-C8` |  | sign_d ⇒ `NEG<abs_d; d>` |  |
+| `DVRM-C9.i` | i ∈ [0, 1] | not`sign_d` => `abs_d` = `d` |  |
 | | | _polynomial:_ `(1 - sign_d) * (abs_d[i] - (d::DWordWL)[i]) = 0` | |
 
 ### R5: overflow
@@ -1789,69 +1842,603 @@ We moreover find that R1 can be leveraged to enforce the correct value of `q`. W
 
 In summary, in case of overflow R2 enforces that ``r` = 0`. Moreover it suffices to interpret `q` as unsigned integer ([dvrm:c:sign_q]); R1 will ensure it contains the correct value.
 
-| Tag | Description | Multiplicity |
-|-----|-------------|--------------|
-| `DVRM-C7` | `sign_q` = `signed` dot (1- `overflow`) |  |
-| | _polynomial:_ `signed * (1 - overflow) - sign_q = 0` | |
-| `DVRM-C8` | `ZERO[overflow; n[0] + n[1] + n[2] + (n[3] - 2^15 * sign_n) + (1 - sign_n) + (65535 - d[0]) + (65535 - d[1]) + (65535 - d[2]) + (65535 - d[3])]` | μ_sum |
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C10` | `sign_q` = `signed` dot (1- `overflow`) |  |
+| | _polynomial:_ `signed * (1 - overflow) - sign_q = 0` | |
+| `DVRM-C11` | `ZERO[overflow; n[0] + n[1] + n[2] + (n[3] - 2^15 * sign_n) + (1 - sign_n) + (65535 - d[0]) + (65535 - d[1]) + (65535 - d[2]) + (65535 - d[3])]` | μ_sum |
+
+We highlight [dvrm:c:overflow]. Recall that the `overflow` flag should be set if and only if (i) `signed = 1`, (ii) `n = 0x80...00`, and (iii) `d = 0xFF...FF`. These requirements are equivalent to the state where $\forall i \in [0, 3]: 65535 - d_i = 0$, $\forall i \in [0, 2]: n_i = 0$, $n_3 - 2^{15} \cdot \texttt{sign\_n} = 0$, and $1 - \texttt{sign\_n} = 0$, where `signed = 1` follows from the last equality. The requirement is phrased in this way, because the left-hand sides of the above expressions are `>= 0` by construction. Given that the sum of these expressions does not exceed `2^19` (and thus never wraps in the field), we can now say that the `overflow` bit should be set to `1` if and only if their sum evaluates to `0`. The `ZERO` lookup guarantees this to be the case.
+
+### R1: $n = q \cdot d + r$
+
+Rewriting R1, we find the constraint: not `overflow` ⇒ `n - r = q * d`.
+
+Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality `mod 2^128`, rather than `mod 2^64`. To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to `q * d mod 2^128` using constraints [dvrm:c:mul_lower] and [dvrm:c:mul_upper]; [dvrm:c:q_range] is included to uphold assumption [mul:c:rhs].
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C12` |  | `ALU[n_sub_r::DWordWL; d::DWordWL, q::DWordWL, ⧼MUL⧽ + 32 * signed + 64 * sign_q]` | μ_sum |
+| `DVRM-C13` |  | `ALU[extension_n_sub_r::DWordWL; d::DWordWL, q::DWordWL, ⧼MUL⧽ + 32 * signed + 64 * sign_q + 128]` | μ_sum |
+| `DVRM-C14.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+
+It now remains to enforce that `extended_n_sub_r` is the _signed_ 128-bit representation of ``n`-`r``. Here, we introduce `extended_n` and `extended_r`. By their definition, these variables contain the signed 128-bit representations of `n` and `r`. The `carry` variable has been defined such that it mimics those in the `ADD` chip, except that here we add two `QuadHL`s rather than two `DWordHL`, thus needing four carry bits instead of two. With this in place, [dvrm:c:n_sub_r] (mimicking [add:c:carry]) ensures `extended_n_sub_r` must contain the correct value.
+
+Lastly, observe that `n - r ∈ (-2^64, 2^64)`, _regardless_ of the value of `signed`. Moreover, note that the upper halves of the 128-bit representations of all values in this range are either `0xFF...FF` (negative) or `0x00...00` (non-negative). This means that we do not need to store all 128 bits of `extended_n_sub_r`. Rather, we need only store the lower 64-bits, and a separate bit (`sign_n_sub_r`) indicating whether the top limbs are all-ones or all-zeroes. The prover is free to select the value for `sign_n_sub_r`; only one of the two will fit the proof.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C15.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
+| `DVRM-C16.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
+| `DVRM-C17.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
+| `DVRM-C18` |  | `IS_BIT<sign_n_sub_r>` |  |
+
+### R4: division-by-zero
+
+R4 requires that `q = 2^64 - 1` (unsigned) or `q = -1` (signed) and `r = n` when `d = 0`. Recalling R1, we see that `n = q * d + r = r` when `d = 0`, which already enforces the latter. Next, we note that, in two's complement, the _unsigned_ value `2^64 - 1` and _signed_ value `-1` are both represented by the bit string `0xFF...FF`. Hence, only [dvrm:c:q_if_div_by_zero] is required to completely constrain R4; [dvrm:c:div_by_zero] just ensures the `div_by_zero` flag is set when `d = 0`.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `DVRM-C19.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
+| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
+| `DVRM-C20` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
+
+### Other
+
+The following constraints are included to enforce the values of `sign_n`, `sign_r` and `sign_d` are correct.
+
+| Tag | Description |
+|-----|-------------|
+| `DVRM-C21` | `SIGN<sign_n; n[3], signed>` |
+| `DVRM-C22` | `SIGN<sign_r; r[3], signed>` |
+| `DVRM-C23` | `SIGN<sign_d; d[3], signed>` |
+
+### Output
+
+Lastly, this chip contributes the following to the lookup:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `DVRM-C24` | `ALU[q::DWordWL; n::DWordWL, d::DWordWL, ⧼DIVREM⧽ + 32 * signed]` | -μ_q |
+| `DVRM-C25` | `ALU[r::DWordWL; n::DWordWL, d::DWordWL, ⧼DIVREM⧽ + 32 * signed + 128]` | -μ_r |
+
+## Padding
+
+To pad the DVRM table, we use the following data, representing the unsigned division `0 / 0`:
+
+| Column | Padding value |
+|--------|---------------|
+| `n` | `0` |
+| `d` | `0` |
+| `signed` | `0` |
+| `q` | `0` |
+| `r` | `0` |
+| `div_by_zero` | `1` |
+| `overflow` | `0` |
+| `abs_r` | `0` |
+| `abs_d` | `0` |
+| `n_sub_r` | `0` |
+| `sign_n_sub_r` | `0` |
+| `sign_n` | `0` |
+| `sign_d` | `0` |
+| `sign_q` | `0` |
+| `sign_r` | `0` |
+| `μ_q` | `0` |
+| `μ_r` | `0` |
+
+---
+
+# BITWISE Chips
+
+The BITWISE chips deal with precomputed lookup tables for bitwise boolean operations and convenience functionalities over small domains.
+
+## Variables
+
+The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `X` | `Byte` |  |
+| `Y` | `Byte` |  |
+| `Z` | `B4` |  |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `AND` | `Byte` | the binary AND of `X` and `Y` |
+| `OR` | `Byte` | the binary OR of `X` and `Y` |
+| `XOR` | `Byte` | the binary XOR of `X` and `Y` |
+| `MSB8` | `Bit` | the most significant bit of `X` |
+| `MSB16` | `Bit` | the most significant bit of `Y` |
+| `ZERO` | `Bit` | whether $`X` = 0$, $`Y` = 0$ and $`Z` = 0$. |
+| `SLL` | `Half` | `X\|\|Y` logically left-shifted by `Z`: $((`X` + 256`Y`) `<<` `Z`) mod 2^16$ |
+| `SLLC` | `Half` | `X\|\|Y` logically right-shifted by `16 - Z`: $(`X` + 256`Y`) `>>` (16 - `Z`)$ |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_AND` | `BaseField` |  |
+| `μ_OR` | `BaseField` |  |
+| `μ_XOR` | `BaseField` |  |
+| `μ_MSB8` | `BaseField` |  |
+| `μ_MSB16` | `BaseField` |  |
+| `μ_ZERO` | `BaseField` |  |
+| `μ_IS_BYTE` | `BaseField` |  |
+| `μ_ARE_BYTES` | `BaseField` |  |
+| `μ_IS_HALF` | `BaseField` |  |
+| `μ_IS_B20` | `BaseField` |  |
+| `μ_HWSL` | `BaseField` |  |
+
+*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 * 2^8 * 2^4 = 2^20`.
+
+We use the ALU operation descriptors from [decode] to identify the operations in the `BYTE_ALU` interaction. Since each of the three columns is only `2^16` rows long, they can be combined in a single `2^20` column (with room to spare).
+
+## Lookup
+
+This chip adds the following interactions to the lookup:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `BITWISE-C1` | `BYTE_ALU[AND; ⧼AND⧽, X, Y]` | -μ_AND |
+| `BITWISE-C2` | `BYTE_ALU[OR; ⧼OR⧽, X, Y]` | -μ_OR |
+| `BITWISE-C3` | `BYTE_ALU[XOR; ⧼XOR⧽, X, Y]` | -μ_XOR |
+| `BITWISE-C4` | `MSB8[MSB8; X]` | -μ_MSB8 |
+| `BITWISE-C5` | `MSB16[MSB16; X + 256 * Y]` | -μ_MSB16 |
+| `BITWISE-C6` | `ZERO[ZERO; X + 256 * Y + 65536 * Z]` | -μ_ZERO |
+| `BITWISE-C7` | `ARE_BYTES[X, Y]` | -μ_ARE_BYTES |
+| `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
+| `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
+| `BITWISE-C10` | `HWSL[[SLL, SLLC]; X + 256 * Y, Z]` | -μ_HWSL |
+
+## Notes/Optimizations
+
+The following ideas may prove to be optimizations for the BITWISE chip: (1) Drop the `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicitly range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. (2) Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
+
+---
+
+# BYTEWISE Chip
+
+The BYTEWISE chip is an ALU chip that decomposes the input `DWordWL` values into bytes and performs a `BITWISE` operation pairwise (AND, OR, XOR). The `BITWISE` lookup inherently performs a range check, so no further constraints are necessary.
+
+## Variables
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `a` | `DWordBL` | The first input |
+| `b` | `DWordBL` | The second input |
+| `op` | `Byte` | The operation to perform |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `res` | `DWordBL` | The result |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` |  |
+
+## Constraints
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `BYTEWISE-C1.i` | i ∈ [0, 7] | `BYTE_ALU[res[i]; op, a[i], b[i]]` | μ |
+| `BYTEWISE-C2` |  | `ALU[res::DWordWL; a::DWordWL, b::DWordWL, op]` | -μ |
+
+## Padding
+
+The chip can be padded with the following values:
+
+| Column | Padding value |
+|--------|---------------|
+| `a` | `0` |
+| `b` | `0` |
+| `op` | `0` |
+| `res` | `0` |
+| `μ` | `0` |
+
+---
+
+# MEMW Chip
+
+The MEMW chip is used to read and write memory locations (both RAM and registers) in chunks of 1, 2, 4 or 8 values. It introduces the old value and last-accessed timestamps of memory addresses internally, in order to satisfy the design of the memory argument ([memory]).
+
+## Variables
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `is_register` | `Bit` | Whether the address represents a register index |
+| `base_address` | `DWordWL` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
+| `value` | `BaseField[8]` | The values to store in memory. For RAM, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access occurs |
+| `write2` | `Bit` | Whether to write exactly 2 values |
+| `write4` | `Bit` | Whether to write exactly 4 values |
+| `write8` | `Bit` | Whether to write exactly 8 values |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old` | `BaseField[8]` | The old value written at `base_address`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `carry` | `Bit[7]` | Whether `base_address[0] + i + 1` $>= 2^32$ |
+| `old_timestamp` | `DWordWL[8]` | The timestamp at which address `base_address + i` was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `w2` | `Bit` | writing at least 2 bytes |
+| `w4` | `Bit` | writing at least 4 bytes |
+| `address_add` | `DWordWL[7]` | `address_add[i] = base_address + i + 1` |
+| `μ_sum` | `Bit` |  |
+
+**Definition of `w2`:**
+```
+w2 := write2 + write4 + write8
+```
+
+**Definition of `w4`:**
+```
+w4 := write4 + write8
+```
+
+**Definition of `address_add`:**
+```
+address_add := [base_address[0] + i + 1 - 2^32 * carry[i], base_address[1] + carry[i]]
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_read + μ_write
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_read` | `Bit` | Whether we are performing a read (and hence return `old`) |
+| `μ_write` | `Bit` | Whether we are performing a write (and hence do not return `old`) |
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MEMW-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `MEMW-A2` |  | `IS_BIT<write2>` |
+| `MEMW-A3` |  | `IS_BIT<write4>` |
+| `MEMW-A4` |  | `IS_BIT<write8>` |
+| `MEMW-A5` |  | `IS_BIT<write2 + write4 + write8>` |
+| `MEMW-A6.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+Some of the assumptions can be checked with only arithmetic constraints, so we provide these below.
+
+| Tag | Description |
+|-----|-------------|
+| `MEMW-C1` | `IS_BIT<write2>` |
+| `MEMW-C2` | `IS_BIT<write4>` |
+| `MEMW-C3` | `IS_BIT<write8>` |
+| `MEMW-C4` | `IS_BIT<write2 + write4 + write8>` |
+
+Our assumptions do not explicitly cover any range checks for the `is_register` and `value` columns, as these are not necessary for the correctness of this chip in isolation. Still, these properties are necessary for the consistency of the system as a whole, and therefore we document them here, keeping the type information as a reading aid.
+
+## Constraints
+
+Depending on the values of `write2`, `write4` and `write8`, the addresses following `base_address` need to be constructed. Rather than computing these in full (which would require the later addresses to be instantiated), it suffices to know the `carry`: the bit indicating whether `base_address[0] + t >= 2^32`, i.e., whether adding `t ∈ [1, 7]` to `base_address` requires a carry from the lower to the upper limb. Note that it is safe for the prover to choose these bits: additions for which this bit is not correctly set will yield an address where either the lower or upper limb is out of bounds. As such, the constructed address will not match any existing memory tokens, which are only initialized for correctly formatted and range-checked doublewords (see [memory]).
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW-C5` |  | `IS_BIT<μ_read>` |  |
+| `MEMW-C6` |  | `IS_BIT<μ_write>` |  |
+| `MEMW-C7` |  | `IS_BIT<μ_sum>` |  |
+| `MEMW-C8` |  | `w2` => `μ_sum` |  |
+| | | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
+| `MEMW-C9.i` | i ∈ [0, 6] | `IS_BIT<carry[i]>` |  |
+| `MEMW-C10` |  | `ALU[[1, 0]; old_timestamp[0], timestamp, ⧼LT⧽]` | μ_sum |
+| `MEMW-C11` |  | `ALU[[1, 0]; old_timestamp[1], timestamp, ⧼LT⧽]` | w2 |
+| `MEMW-C12.i` | i ∈ [2, 3] | `ALU[[1, 0]; old_timestamp[i], timestamp, ⧼LT⧽]` | w4 |
+| `MEMW-C13.i` | i ∈ [4, 7] | `ALU[[1, 0]; old_timestamp[i], timestamp, ⧼LT⧽]` | write8 |
+
+As long as `timestamp` is properly range-checked, the presence of `old_timestamp` in the memory argument automatically ensures it is appropriately range checked (this assumes no external entities provide negative multiplicities without range checking the timestamp). This ensures the assumptions for `LT` are satisfied.
+
+There is no need to check that the additions do not overflow, as our address calculations are not performed modulo `2^64` here, and any overflow will result in an address without matching initialization.
+
+The chip adds the following tuples to the lookup argument, to effectuate that part of the memory argument.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW-CM14` |  | `memory[is_register, base_address, old_timestamp[0], old[0]]` | μ_sum |
+| `MEMW-CM15` |  | `memory[is_register, base_address, timestamp, value[0]]` | -μ_sum |
+| `MEMW-CM16` |  | `memory[is_register, address_add[0], old_timestamp[1], old[1]]` | w2 |
+| `MEMW-CM17` |  | `memory[is_register, address_add[0], timestamp, value[1]]` | -w2 |
+| `MEMW-CM18.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | w4 |
+| `MEMW-CM19.i` | i ∈ [2, 3] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -w4 |
+| `MEMW-CM20.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], old_timestamp[i], old[i]]` | write8 |
+| `MEMW-CM21.i` | i ∈ [4, 7] | `memory[is_register, address_add[i - 1], timestamp, value[i]]` | -write8 |
+
+This chip contributes the following to the lookup argument:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW-CO22` | `MEMW[old; is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_read |
+| `MEMW-CO23` | `MEMW[is_register, base_address, value, timestamp, write2, write4, write8]` | -μ_write |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+| Column | Padding value |
+|--------|---------------|
+| `is_register` | `0` |
+| `base_address` | `0` |
+| `value` | `0` |
+| `timestamp` | `0` |
+| `write2` | `0` |
+| `write4` | `0` |
+| `write8` | `0` |
+| `old` | `0` |
+| `carry` | `0` |
+| `old_timestamp` | `0` |
+| `μ_read` | `0` |
+| `μ_write` | `0` |
+
+## Read-size aligned fast path
+
+When a memory access happens at an address with proper alignment for its access size (i.e., adding the access size to `base_address`'s lowest limb does not overflow), and all accessed elements were last accessed at the same timestamp, we can instead use the `MEMW_A` chip to save on total column count. The saving comes from only requiring a single old timestamp to be stored, as well as being able to guarantee that all `carry` bits (the limb-overflow flags used by the `MEMW` chip) would be zero. A minor extra cost is introduced in the form of a check that the alignment is indeed correct, and the corresponding decomposition of the `base_address`.
+
+Further logic remains essentially the same, so we briefly present the relevant tables for this chip.
+
+The  chip only needs  variables, expressed through  columns; it leverages  interactions.
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `is_register` | `Bit` | Whether the address represents a register index |
+| `base_address` | `DWordWHH` | The base address to read from/write to. Gets offset by $[0, 7]$ depending on the size of the access |
+| `value` | `BaseField[8]` | The values to store in memory. For regular memory, these should be (up to) 8 range-checked `Byte`s; registers are stored as two range-checked `Word`s |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `write2` | `Bit` | Whether to write exactly 2 values |
+| `write4` | `Bit` | Whether to write exactly 4 values |
+| `write8` | `Bit` | Whether to write exactly 8 values |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old` | `BaseField[8]` | The old value written at `base_address + i`. See `value` for information about representation. Only the elements corresponding to the `writeN` bits are guaranteed |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old_timestamp` | `DWordWL` | The timestamp at which the address was last accessed |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `w2` | `Bit` | writing at least 2 bytes |
+| `w4` | `Bit` | writing at least 4 bytes |
+| `μ_sum` | `Bit` |  |
+
+**Definition of `w2`:**
+```
+w2 := write2 + write4 + write8
+```
+
+**Definition of `w4`:**
+```
+w4 := write4 + write8
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_read + μ_write
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_read` | `Bit` | Whether we are performing a read (and hence return `old`) |
+| `μ_write` | `Bit` | Whether we are performing a write (and hence do not return `old`) |
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MEMW_A-A1.i` | i ∈ [0, 1] | `IS_HALF[base_address[i]]` |
+| `MEMW_A-A2` |  | `IS_WORD[base_address[2]]` |
+| `MEMW_A-A3` |  | `IS_BIT<write2>` |
+| `MEMW_A-A4` |  | `IS_BIT<write4>` |
+| `MEMW_A-A5` |  | `IS_BIT<write8>` |
+| `MEMW_A-A6` |  | `IS_BIT<write2 + write4 + write8>` |
+| `MEMW_A-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+Some of the assumptions can be checked with only arithmetic constraints, so we provide these below.
+
+| Tag | Description |
+|-----|-------------|
+| `MEMW_A-C1` | `IS_BIT<write2>` |
+| `MEMW_A-C2` | `IS_BIT<write4>` |
+| `MEMW_A-C3` | `IS_BIT<write8>` |
+| `MEMW_A-C4` | `IS_BIT<write2 + write4 + write8>` |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW_A-C9` | `IS_HALF[base_address[0] + write2 + 3 * write4 + 7 * write8]` | μ_sum |
+| `MEMW_A-C10` | `IS_BIT<μ_read>` |  |
+| `MEMW_A-C11` | `IS_BIT<μ_write>` |  |
+| `MEMW_A-C12` | `IS_BIT<μ_sum>` |  |
+| `MEMW_A-C13` | `w2` => `μ_sum` |  |
+| | _polynomial:_ `w2 * (1 - μ_sum) = 0` | |
+| `MEMW_A-C14` | `ALU[[1, 0]; old_timestamp, timestamp, ⧼LT⧽]` | μ_sum |
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `MEMW_A-CM15` |  | `memory[is_register, base_address::DWordWL, old_timestamp, old[0]]` | μ_sum |
+| `MEMW_A-CM16` |  | `memory[is_register, base_address::DWordWL, timestamp, value[0]]` | -μ_sum |
+| `MEMW_A-CM17` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, old_timestamp, old[1]]` | w2 |
+| `MEMW_A-CM18` |  | `memory[is_register, base_address::DWordWL + 1::DWordWL, timestamp, value[1]]` | -w2 |
+| `MEMW_A-CM19.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | w4 |
+| `MEMW_A-CM20.i` | i ∈ [2, 3] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -w4 |
+| `MEMW_A-CM21.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, old_timestamp, old[i]]` | write8 |
+| `MEMW_A-CM22.i` | i ∈ [4, 7] | `memory[is_register, base_address::DWordWL + i::DWordWL, timestamp, value[i]]` | -write8 |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW_A-CO23` | `MEMW[old; is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_read |
+| `MEMW_A-CO24` | `MEMW[is_register, base_address::DWordWL, value, timestamp, write2, write4, write8]` | -μ_write |
+
+### Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+| Column | Padding value |
+|--------|---------------|
+| `is_register` | `0` |
+| `base_address` | `0` |
+| `value` | `0` |
+| `timestamp` | `0` |
+| `write2` | `0` |
+| `write4` | `0` |
+| `write8` | `0` |
+| `old` | `0` |
+| `old_timestamp` | `0` |
+| `μ_read` | `0` |
+| `μ_write` | `0` |
+
+## Register fast-path
+
+The `MEMW_R` chip provides a fast-path for accessing registers. This fast-path leverages that registers (i) can be addressed using a `Byte`, rather than a full `DWord`, (ii) are constantly accessed, i.e., `timestamp - old_timestamp` is small, and (iii) have a fixed access pattern, to achieve a footprint that is significantly smaller than both `MEMW` and `MEMW_A`.
+
+Note: as a result of hard optimization, this chip can only be used for register accesses for which (i) `timestamp - old_timestamp ∈ [1, 2^16]`, and (ii) `timestamp[0] > old_timestamp[0]`. If either of these rules does not apply to your access, you should fall back to using `MEMW_A`.
+
+Note moreover that this chip does not guard against misaligned register access faults: to access the register with a given `address`, one must provide `2 * address` in the lookup.
+
+### Variables
+
+The  chip is comprised of  variables that are expressed using  columns and leverages  interactions:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `address` | `Byte` | address of the register being accessed |
+| `timestamp` | `DWordWL` | timestamp at which the access takes place |
+| `val` | `DWordWL` | value being written to this register |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old` | `DWordWL` | value of this register at `old_timestamp`. |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old_timestamp_lo` | `Word` | the lower limb of `old_timestamp` |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `old_timestamp` | `DWordWL` | timestamp at which this register was last accessed |
+| `μ_sum` | `Bit` |  |
+
+**Definition of `old_timestamp`:**
+```
+old_timestamp := [old_timestamp_lo, timestamp[1]]::DWordWL
+```
+
+**Definition of `μ_sum`:**
+```
+μ_sum := μ_read + μ_write
+```
 
-We highlight [dvrm:c:overflow]. Recall that the `overflow` flag should be set if and only if (i) ``signed` = 1`, (ii) ``n` = `0x80...00``, and (iii) ``d` = `0xFF...FF``. These requirements are equivalent to the state where: $ forall i in [0, 3]:&& 65535 - `d`_i &= 0,\ forall i in [0, 2]:&& `n`_i &= 0,\ && `n`_3 - 2^15 dot `sign_n` &= 0,\ && 1 - `sign_n` &= 0,\ $ where ``signed` = 1` follows from the last equality. The requirement is phrased in this way, because the left-hand sides of the above expressions are `>= 0` by construction. Given that the sum of these expressions does not exceed `2^19` (and thus never wraps in the field), we can now say that the `overflow` bit should be set to `1` if and only if their sum evaluates to `0`. The `ZERO` lookup guarantees this to be the case.
+### Multiplicity
 
-### R1: $#`n` = #`qd` + #`r`$
+| Name | Type | Description |
+|------|------|-------------|
+| `μ_read` | `Bit` | Whether we are performing a read (and hence return `old`) |
+| `μ_write` | `Bit` | Whether we are performing a write (and hence do not return `old`) |
 
-Rewriting R1, we find the constraint `not`overflow` => `n` - `r` = `qd``.
+### Assumptions
 
-Since `n`, `d`, `q` and `r` are all 64-bit integers, we must assert this equality `mod 2^128`, rather than `mod 2^64`. To this end, we introduce `extended_n_sub_r` and leverage the `MUL` chip to verify that it is equal to ``qd` mod 2^128` using constraints [dvrm:c:mul_lower] and [dvrm:c:mul_upper]; [dvrm:c:q_range] is included to uphold assumption [mul:a:rhs].
+The following range checks are assumed to be performed/enforced outside of this chip:
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C9` |  | `MUL[n_sub_r::DWordWL; d, signed, q, sign_q, 0]` | μ_sum |
-| `DVRM-C10` |  | `MUL[extension_n_sub_r::DWordWL; d, signed, q, sign_q, 1]` | μ_sum |
-| `DVRM-C11.i` | i ∈ [0, 3] | `IS_HALF[q[i]]` | μ_sum |
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `MEMW_R-A1.i` | i ∈ [0, 1] | `IS_WORD[val[i]]` |
+| `MEMW_R-A2.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
-It now remains to enforce that `extended_n_sub_r` is the _signed_ 128-bit representation of ``n`-`r``. Here, we introduce `extended_n` and `extended_r`. By their definition, these variables contain the signed 128-bit representations of `n` and `r`. The `carry` variable has been defined such that it mimics those in the `ADD` chip, except that here we add two `QuadHL`s rather than two `DWordHL`, thus needing four carry bits instead of two. With this in place, [dvrm:c:n_sub_r] (mimicking [add:c:carry]) ensures `extended_n_sub_r` must contain the correct value.
+### Constraints
 
-Lastly, observe that ``n` - `r` in (-2^64, 2^64)`, _regardless_ of the value of `signed`. Moreover, note that the upper halves of the 128-bit representations of all values in this range are either `0xFFFFFFFF` (negative) or `0x00000000` (non-negative). This means that we do not need to store all 128 bits of `extended_n_sub_r`. Rather, we need only store the lower 64-bits, and a separate bit (`sign_n_sub_r`) indicating whether the top limbs are all-ones or all-zeroes. The prover is free to select the value for `sign_n_sub_r`; only one of the two will fit the proof.
+Since most registers are frequently accessed, the difference between `timestamp` and `old_timestamp` is small most of the time. Rather than storing their (nearly) identical upper limbs twice, it is instead assumed that `old_timestamp[1] = timestamp[1]`; `MEMW_A` can be used for accesses where this is not the case.
 
-| Tag | Range | Description | Multiplicity |
-|-----|-------|-------------|--------------|
-| `DVRM-C12.i` | i ∈ [0, 3] | `IS_BIT<carry[i]>` |  |
-| `DVRM-C13.i` | i ∈ [0, 3] | `IS_HALF[r[i]]` | μ_sum |
-| `DVRM-C14.i` | i ∈ [0, 3] | `IS_HALF[n_sub_r[i]]` | μ_sum |
-| `DVRM-C15` |  | `IS_BIT<sign_n_sub_r>` |  |
+Verifying that `timestamp > old_timestamp` now simplifies to verifying that `timestamp[0] - old_timestamp[0] > 0`. For most accesses, this value will be small enough to fit in a `Half`. This chip thus enforces this by means of the following constraint:
 
-### R4: division-by-zero
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `MEMW_R-C1` | `IS_HALF[timestamp[0] - old_timestamp[0] - 1]` | μ_sum |
 
-R4 requires that ``q` = 2^64-1` (unsigned) or `-1` (signed) and ``r` = n` when ``d` = 0`. Recalling R1, we see that ``n` = `q` `d` + `r` = `r`` when ``d` = 0`, already enforces the latter. Next, we note that, in two's complement, the _unsigned_ value `2^64-1` and _signed_ value `-1` are both represented by the bit string `0xFFFFFFFF`. Hence, only [dvrm:c:q_if_div_by_zero] is required to completely constrain R4; [dvrm:c:div_by_zero] just ensures the `div_by_zero` flag is set when ``d` = 0`.
+With `old_timestamp < timestamp` asserted, `old` is read from the register ([regw:c:read_old]) and `val` is written back ([regw:c:write_val]).
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `DVRM-C16.i` | i ∈ [0, 3] | `div_by_zero` => `q[i]` = 65535 |  |
-| | | _polynomial:_ `div_by_zero * (q[i] - 65535) = 0` | |
-| `DVRM-C17` |  | `ZERO[div_by_zero; d[0] + d[1] + d[2] + d[3]]` | μ_sum |
-
-### Other
+| `MEMW_R-C2.i` | i ∈ [0, 1] | `memory[1, [(2 * address + i)::Word, 0], old_timestamp, old[i]]` | μ_sum |
+| `MEMW_R-C3.i` | i ∈ [0, 1] | `memory[1, [(2 * address + i)::Word, 0], timestamp, val[i]]` | -μ_sum |
 
-The following constraints are included to enforce the values of `sign_n`, `sign_r` and `sign_d` are correct.
+This chip can either just write (`μ_write = 1`), or both read and write (`μ_read = 1`) in the same cycle. It must be asserted that at most one of these two options is selected:
 
 | Tag | Description |
 |-----|-------------|
-| `DVRM-C18` | `SIGN<sign_n; n[3], signed>` |
-| `DVRM-C19` | `SIGN<sign_r; r[3], signed>` |
-| `DVRM-C20` | `SIGN<sign_d; d[3], signed>` |
-
-### Output
+| `MEMW_R-C4` | `IS_BIT<μ_read>` |
+| `MEMW_R-C5` | `IS_BIT<μ_write>` |
+| `MEMW_R-C6` | `IS_BIT<μ_sum>` |
 
-Lastly, this chip contributes the following to the lookup:
+Lastly, this chip contributes the following interactions to the logup:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `DVRM-C21` | `DVRM[q::DWordWL; n, d, signed, 0]` | -μ_q |
-| `DVRM-C22` | `DVRM[r::DWordWL; n, d, signed, 1]` | -μ_r |
+| `MEMW_R-C7` | `MEMW[[old[0], old[1], 0, 0, 0, 0, 0, 0]; 1, [(2 * address)::Word, 0], [val[0], val[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_read |
+| `MEMW_R-C8` | `MEMW[1, [(2 * address)::Word, 0], [val[0], val[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | -μ_write |
 
-## Padding
+### Padding
 
-To pad the  table, we use the following data, representing the unsigned division `frac(0, 0, style: "horizontal")`:
+The table can be padded to the next power of two with the following value assignments:
+
+| Column | Padding value |
+|--------|---------------|
+| `address` | `0` |
+| `timestamp` | `0` |
+| `val` | `0` |
+| `old` | `0` |
+| `old_timestamp_lo` | `0` |
+| `μ_read` | `0` |
+| `μ_write` | `0` |
+
+## Notes/optimizations
+
+The following ideas may prove to be optimizations for the `MEMW`/`MEMW_A`/`MEMW_R` chips: (i) a `MEMB` chip that does a one-byte write, to remove `old_timestamp` from here (uncertain tradeoffs); (ii) adding `μ_sum`/`w2`/`w4`/`write8` multiplicities to the `IS_HALF` lookups, which may make some GKR things faster if there are known zeroes; (iii) for the register fast-path, one may upgrade the `IS_HALF` check to an `IS_B20` check for extended range, at the cost of looking through a larger table.
 
 ---
 
@@ -1867,7 +2454,7 @@ The  chip is comprised of  variables that are expressed using  columns and lever
 
 | Name | Type | Description |
 |------|------|-------------|
-| `base_address` | `DWordWL` | The base address to read/write from/to, gets offset by $[0, 7]$, depending on how big the access is |
+| `base_address` | `DWordWL` | The base address to read from, gets offset by $[0, 7]$, depending on how big the access is |
 | `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
 | `read2` | `Bit` | Whether to read exactly 2 bytes |
 | `read4` | `Bit` | Whether to read exactly 4 bytes |
@@ -1908,12 +2495,7 @@ read1 := μ - read2 - read4 - read8
 | Tag | Range | Description |
 |-----|-------|-------------|
 | `LOAD-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
-| `LOAD-A2` |  | `IS_BIT<signed>` |
-| `LOAD-A3` |  | `IS_BIT<read2>` |
-| `LOAD-A4` |  | `IS_BIT<read4>` |
-| `LOAD-A5` |  | `IS_BIT<read8>` |
-| `LOAD-A6` |  | `IS_BIT<read2 + read4 + read8>` |
-| `LOAD-A7.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+| `LOAD-A2.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
 ## Constraints
 
@@ -1921,97 +2503,126 @@ The chip delegates the actual memory interaction to the `MEMW` chip, and ensures
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `LOAD-C1` |  | `read2` + `read4` + `read8` => `μ` |  |
+| `LOAD-C1` |  | `IS_BIT<signed>` |  |
+| `LOAD-C2` |  | `IS_BIT<read2>` |  |
+| `LOAD-C3` |  | `IS_BIT<read4>` |  |
+| `LOAD-C4` |  | `IS_BIT<read8>` |  |
+| `LOAD-C5` |  | `IS_BIT<read2 + read4 + read8>` |  |
+| `LOAD-C6` |  | `read2` + `read4` + `read8` => `μ` |  |
 | | | _polynomial:_ `(read2 + read4 + read8) * (1 - μ) = 0` | |
-| `LOAD-C2` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
-| `LOAD-C3` |  | `MSB8[sign_bit; res[0]]` | read1 |
-| `LOAD-C4` |  | `MSB8[sign_bit; res[1]]` | read2 |
-| `LOAD-C5` |  | `MSB8[sign_bit; res[3]]` | read4 |
-| `LOAD-C6.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| `LOAD-C7` |  | `MEMW[res; 0, base_address, res::BaseField[8], timestamp, read2, read4, read8]` | μ |
+| `LOAD-C8` |  | `MSB8[sign_bit; res[0]]` | read1 |
+| `LOAD-C9` |  | `MSB8[sign_bit; res[1]]` | read2 |
+| `LOAD-C10` |  | `MSB8[sign_bit; res[3]]` | read4 |
+| `LOAD-C11.i` | i ∈ [4, 7] | !`read8` => `res`_i = `signed` dot `sign_bit` dot 255 |  |
 | | | _polynomial:_ `(1 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `LOAD-C7.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
+| `LOAD-C12.i` | i ∈ [2, 3] | !(`read4` + `read8`) => `res`_i = `signed` dot `sign_bit` dot 255 |  |
 | | | _polynomial:_ `(1 - read4 - read8) * (res[i] - signed * sign_bit * 255) = 0` | |
-| `LOAD-C8` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
+| `LOAD-C13` |  | !(`read2` + `read4` + `read8`) => `res`_1 = `signed` dot `sign_bit` dot 255 |  |
 | | | _polynomial:_ `(1 - read2 - read4 - read8) * (res[1] - signed * sign_bit * 255) = 0` | |
 
 The chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `LOAD-C9` | `LOAD[res::DWordWL; base_address, timestamp, read2, read4, read8, signed]` | -μ |
+| `LOAD-C14` | `MEMOP[res::DWordWL; timestamp, base_address, 0::DWordWL, 2 * signed + 4 * read2 + 8 * read4 + 16 * read8]` | -μ |
 
 ## Padding
 
 The table can be padded to the next power of two with the following value assignments:
 
+| Column | Padding value |
+|--------|---------------|
+| `base_address` | `0` |
+| `timestamp` | `0` |
+| `read2` | `0` |
+| `read4` | `0` |
+| `read8` | `0` |
+| `signed` | `0` |
+| `res` | `0` |
+| `sign_bit` | `0` |
+| `μ` | `0` |
+
 ---
 
-# BITWISE Chips
+# STORE Chip
 
-The  chips deal with precomputed lookup tables for bitwise boolean operations and convenience functionalities over small domains.
+The `STORE` chip provides functionality to store a value to memory. It decomposes a `DWord` into bytes and delegates low-level memory handling to the `MEMW` chip ([memw]).
 
 ## Variables
 
-The  chip is comprised of  variables that are expressed using  columns. Of these, the _input_ and _output_ variables ( in total) are precomputed.
+The  chip is comprised of  variables that are expressed using  columns and leverages  interaction(s):
 
 ### Input
 
 | Name | Type | Description |
 |------|------|-------------|
-| `X` | `Byte` |  |
-| `Y` | `Byte` |  |
-| `Z` | `B4` |  |
+| `base_address` | `DWordWL` | The base address to write to, gets offset by $[0, 7]$, depending on how big the access is |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `write2` | `Bit` | Whether to write exactly 2 bytes |
+| `write4` | `Bit` | Whether to write exactly 4 bytes |
+| `write8` | `Bit` | Whether to write exactly 8 bytes |
+| `value` | `DWordBL` | The value to store |
 
-### Output
+### Virtual
 
 | Name | Type | Description |
 |------|------|-------------|
-| `AND` | `Byte` | the binary AND of `X` and `Y` |
-| `OR` | `Byte` | the binary OR of `X` and `Y` |
-| `XOR` | `Byte` | the binary XOR of `X` and `Y` |
-| `MSB8` | `Bit` | the most significant bit of `X` |
-| `MSB16` | `Bit` | the most significant bit of `Y` |
-| `ZERO` | `Bit` | whether $`X` = 0$, $`Y` = 0$ and $`Z` = 0$. |
-| `SLL` | `Half` | `X\|\|Y` logically left-shifted by `Z`: $((`X` + 256`Y`) `<<` `Z`) mod 2^16$ |
-| `SLLC` | `Half` | `X\|\|Y` logically right-shifted by `Z`: $(`X` + 256`Y`) `>>` (16 - `Z`)$ |
+| `write1` | `Bit` | Whether to write exactly 1 byte |
+
+**Definition of `write1`:**
+```
+write1 := μ - write2 - write4 - write8
+```
 
 ### Multiplicity
 
 | Name | Type | Description |
 |------|------|-------------|
-| `μ_AND` | `BaseField` |  |
-| `μ_OR` | `BaseField` |  |
-| `μ_XOR` | `BaseField` |  |
-| `μ_MSB8` | `BaseField` |  |
-| `μ_MSB16` | `BaseField` |  |
-| `μ_ZERO` | `BaseField` |  |
-| `μ_IS_BYTE` | `BaseField` |  |
-| `μ_IS_HALF` | `BaseField` |  |
-| `μ_IS_B20` | `BaseField` |  |
-| `μ_HWSL` | `BaseField` |  |
+| `μ` | `Bit` |  |
 
-*Note*: This table contains one row for every possible value of `(X, Y, Z)`. As such, it has length `2^8 dot 2^8 dot 2^4 = 2^(20)`.
+## Assumptions
 
-## Lookup
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `STORE-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `STORE-A2.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
 
-This chip adds the following interactions to the lookup:
+## Constraints
+
+The chip delegates the actual memory interaction to the `MEMW` chip, and ensures the values are proper bytes.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `STORE-C1` |  | `IS_BIT<μ>` |  |
+| `STORE-C2` |  | `IS_BIT<write2>` |  |
+| `STORE-C3` |  | `IS_BIT<write4>` |  |
+| `STORE-C4` |  | `IS_BIT<write8>` |  |
+| `STORE-C5` |  | `IS_BIT<write2 + write4 + write8>` |  |
+| `STORE-C6` |  | `write2` + `write4` + `write8` => `μ` = 1 |  |
+| | | _polynomial:_ `(write2 + write4 + write8) * (1 - μ) = 0` | |
+| `STORE-C7.i` | i ∈ [0, 7] | μ ⇒ `IS_BYTE<value[i]>` |  |
+| `STORE-C8` |  | `MEMW[0, base_address, value, timestamp, write2, write4, write8]` | μ |
+
+The chip contributes the following to the lookup argument.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `BITWISE-C1` | `AND_BYTE[AND; X, Y]` | -μ_AND |
-| `BITWISE-C2` | `OR_BYTE[OR; X, Y]` | -μ_OR |
-| `BITWISE-C3` | `XOR_BYTE[XOR; X, Y]` | -μ_XOR |
-| `BITWISE-C4` | `MSB8[MSB8; X]` | -μ_MSB8 |
-| `BITWISE-C5` | `MSB16[MSB16; X + 256 * Y]` | -μ_MSB16 |
-| `BITWISE-C6` | `ZERO[ZERO; X + 256 * Y + 65536 * Z]` | -μ_ZERO |
-| `BITWISE-C7` | `IS_BYTE[X]` | -μ_IS_BYTE |
-| `BITWISE-C8` | `IS_HALF[X + 256 * Y]` | -μ_IS_HALF |
-| `BITWISE-C9` | `IS_B20[X + 256 * Y + 65536 * Z]` | -μ_IS_B20 |
-| `BITWISE-C10` | `HWSL[['arr', 'SLL', 'SLLC']; X + 256 * Y, Z]` | -μ_HWSL |
+| `STORE-C9` | `MEMOP[0::DWordWL; timestamp, base_address, value::DWordWL, 1 + 4 * write2 + 8 * write4 + 16 * write8]` | -μ |
 
-## Notes/Optimizations
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
 
-The following ideas may prove to be optimizations for the  chip: + Extend `IS_BYTE[X]` to `ARE_BYTES[X, Y]`, such that two bytes are range checked at once. When only a single check is required, one can still execute `IS_BYTE[X] := ARE_BYTES[X, 0]`. + Drop `MSB8` column, and instead define the `MSB8` lookup as `MSB8<X> := MSB16[256X]`. Note: currently, `MSB8` also implicity range checks the input `X` (the lookup fails if `X` is not a `Byte`). This optimization should only be executed when all chips leveraging `MSB8` do _not_ need this implicit range check. + Place the 16-bit (`AND`, `OR`, `XOR`, `MSB16`, etc.) and 20-bit (`HWSL`, `IS_B20`, `ZERO`) lookups in separate tables.
+| Column | Padding value |
+|--------|---------------|
+| `base_address` | `0` |
+| `timestamp` | `0` |
+| `write2` | `0` |
+| `write4` | `0` |
+| `write8` | `0` |
+| `value` | `0` |
+| `μ` | `0` |
 
 ---
 
@@ -2043,6 +2654,12 @@ The  chip leverages  variable, spanning  columns and leverages  interactions:
 |------|------|-------------|
 | `timestamp` | `DWordWL` | timestamp at which to halt the program |
 
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `pc` | `DWordWL` | The `next_pc` value the CPU wrote during the instruction HALT was invoked |
+
 ## Assumptions
 
 It is assumed the input is range checked:
@@ -2053,14 +2670,15 @@ It is assumed the input is range checked:
 
 ## Constraints
 
-The  chip: + makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), + writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and + sets `pc` equal to `1` ([halt:c:pc]). Note that the writes performed by all these interactions are accompanied by the timestamp `2^64-1`; the maximum timestamp. This prevents any other operation involving memory from being executed hereafter.
+The `HALT` chip (i) makes sure register `x10` (containing the exit code) equals `0` ([halt:c:read_zero_exit_code]), (ii) writes `0` to all other registers ([halt:c:zeroize_registers_lo]/[halt:c:zeroize_registers_hi]), and (iii) sets `pc` equal to `1` ([halt:c:consume_pc], [halt:c:emit_pc]). Note that the writes performed by all these interactions — except for the `pc` — are accompanied by the timestamp `2^64-1`, the maximum timestamp. This prevents any other operation involving memory from being executed hereafter. The `pc` is consumed and re-emitted at the same timestamp to enable padding rows for the CPU. This means that the verifier will have to know the final timestamp at which a CPU padding `pc` was written to be able to balance the final LogUp.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
 | `HALT-C1.i` | i ∈ [1, 9] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
 | `HALT-C2` |  | `MEMW[0::BaseField[8]; 1, (2 * 10)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
 | `HALT-C3.i` | i ∈ [11, 31] | `MEMW[1, (2 * i)::DWordWL, 0::BaseField[8], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
-| `HALT-C4` |  | `MEMW[1, (2 * 255)::DWordWL, ['arr', 1, 0, 0, 0, 0, 0, 0, 0], (2^64 - 1)::DWordWL, 1, 0, 0]` | 1 |
+| `HALT-C4.i` | i ∈ [0, 1] | `memory[1, [2 * 255 + i, 0], [timestamp[0] + 1, timestamp[1]], pc[i]]` | 1 |
+| `HALT-C5.i` | i ∈ [0, 1] | `memory[1, [2 * 255 + i, 0], [timestamp[0] + 1, timestamp[1]], [1, 0][i]]` | -1 |
 
 [ Observe that --- in its current state --- this solution puts the burden of verifying the register cleanup on the verifier inside of the lookup argument. Alternatively, one could add 31 lookups to the "memory" table to remove the _known_ final tokens for the registers there. ])
 
@@ -2072,7 +2690,7 @@ The HALT chip therefore contributes the following interaction to the lookup-argu
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `HALT-C5` | `ECALL[timestamp, 93::DWordWL]` | -1 |
+| `HALT-C6` | `ECALL[timestamp, 93::DWordWL]` | -1 |
 
 ## Padding
 
@@ -2133,10 +2751,10 @@ we assert that `x10` contains `1` in [commit:c:read_fd_write_count]. Note that t
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `COMMIT-C2` | `MEMW[['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', 'address', 0], ['idx', 'address', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C3` | `MEMW[['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 12)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C4` | `MEMW[['arr', 1, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', 'count', 0], ['idx', 'count', 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
-| `COMMIT-C5` | `MEMW[['arr', 'index', 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 254)::DWordWL, ['arr', ['+', 'index', ['cast', 'count', 'BaseField']], 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | first |
+| `COMMIT-C2` | `MEMW[[address[0], address[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, [address[0], address[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C3` | `MEMW[[count[0], count[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 12)::DWordWL, [count[0], count[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C4` | `MEMW[[1, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, [count[0], count[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | first |
+| `COMMIT-C5` | `MEMW[[index, 0, 0, 0, 0, 0, 0, 0]; 1, (2 * 254)::DWordWL, [index + count::BaseField, 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | first |
 
 *Note*: the observant reader will notice that [commit:c:read_index] casts `count` to a `BaseField`, potentiallly losing information. This is indeed correct. However, since it is practically impossible to commit more than `2^64-2^32` bytes in a single VM execution, it was decided to permit this.
 
@@ -2144,7 +2762,7 @@ Next, we read the `value` located at buffer address `address` and commit to it u
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `COMMIT-C6` | `MEMW[['arr', 'value', 0, 0, 0, 0, 0, 0, 0]; 0, address, ['arr', 'value', 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | μ - end |
+| `COMMIT-C6` | `MEMW[[value, 0, 0, 0, 0, 0, 0, 0]; 0, address, [value, 0, 0, 0, 0, 0, 0, 0], timestamp, 0, 0, 0]` | μ - end |
 | `COMMIT-C7` | `COMMIT[index, value]` | μ - end |
 
 In parallel, we compute ``address_incr` = `address` + 1` ([commit:c:address_incr]) as address of the next byte to commit, and ``count_decr` = `count` - 1` ([commit:c:count_decr]) as the number of bytes that still has to be committed after committing this byte. [commit:c:range_address_incr] and [commit:c:range_count_decr] are included to satisfy [add:a:sum] respectively [add:a:rhs].
@@ -2185,6 +2803,19 @@ Lastly, we must make sure `first`, `end` and `μ` are bits ([commit:c:range_firs
 
 To pad this chip, use the below data.
 
+| Column | Padding value |
+|--------|---------------|
+| `timestamp` | `0` |
+| `index` | `0` |
+| `address` | `[0, 0, 0, 0]` |
+| `address_incr` | `[1, 0, 0, 0]` |
+| `count` | `[1, 0, 0, 0]` |
+| `count_decr` | `[0, 0, 0, 0]` |
+| `first` | `0` |
+| `end` | `0` |
+| `value` | `0` |
+| `μ` | `0` |
+
 ## Notes/optimizations
 
 - The current version only supports writing to `stdout`. This chip could potentially be extended to support writing to arbitrary `fd`s - One might be able to replace [commit:c:end] by `end => count = 0`. While loosening the constraint (`count = 0 => end` is no longer enforced), this should not cause any problems: if the prover does not set `end` when `count=0`, they simply cannot complete the proof. First of all, one would have to recursively work through all `2^64` values of `count`, something that is practically infeasible. Moreover, if this is done with a sequence that originally has ``count` > 0`, one will inevitably have to read a memory address twice at the same timestamp, which is impossible to prove. In addition to dropping the `ZERO` lookup, this optimization might also permit moving `count_decr` from a `DWordHL` to a `DWordWL`, saving two columns. - Given that it is practically infeasible to commit more than ``p`-1 = 2^64-2^32` bytes in a program, it might suffice to store `count_decr` in a `BaseField`. Note that this would probably involve having an extra (virtual) column storing `count` in `BaseField` form as well. Moreover, one might need to add a lookup to `LT` to ensure ``count` <= `p`-1` when being read from memory at the beginning of each commitment sequence.
@@ -2239,14 +2870,14 @@ The first responsibility of the chip is to read the current state and message ch
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHA256-C1` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C1` |  | `MEMW[[(m_addr[0]::DWordWL)[0], (m_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, [(m_addr[0]::DWordWL)[0], (m_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
 | `SHA256-C2.i` | i ∈ [0, 7], j ∈ [0, 3] | `IS_HALF[m_addr[i][j]]` | μ |
 | `SHA256-C3.i` | i ∈ [1, 7] | `ADD<m_addr[i]::DWordWL; m_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
-| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]]; 0, m_addr[i]::DWordWL, ['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]], timestamp, 0, 0, 1]` | μ |
-| `SHA256-C5` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[[m[8 * i + 3], m[8 * i + 2], m[8 * i + 1], m[8 * i + 0], m[8 * i + 7], m[8 * i + 6], m[8 * i + 5], m[8 * i + 4]]; 0, m_addr[i]::DWordWL, [m[8 * i + 3], m[8 * i + 2], m[8 * i + 1], m[8 * i + 0], m[8 * i + 7], m[8 * i + 6], m[8 * i + 5], m[8 * i + 4]], timestamp, 0, 0, 1]` | μ |
+| `SHA256-C5` |  | `MEMW[[(h_addr[0]::DWordWL)[0], (h_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, [(h_addr[0]::DWordWL)[0], (h_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
 | `SHA256-C6.i` | i ∈ [0, 3], j ∈ [0, 3] | `IS_HALF[h_addr[i][j]]` | μ |
 | `SHA256-C7.i` | i ∈ [1, 3] | `ADD<h_addr[i]::DWordWL; h_addr[0]::DWordWL, 8 * i::DWordWL>` |  |
-| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[['arr', ['idx', 'h', ['+', ['*', 8, 'i'], 3]], ['idx', 'h', ['+', ['*', 8, 'i'], 2]], ['idx', 'h', ['+', ['*', 8, 'i'], 1]], ['idx', 'h', ['+', ['*', 8, 'i'], 0]], ['idx', 'h', ['+', ['*', 8, 'i'], 7]], ['idx', 'h', ['+', ['*', 8, 'i'], 6]], ['idx', 'h', ['+', ['*', 8, 'i'], 5]], ['idx', 'h', ['+', ['*', 8, 'i'], 4]]]; 0, h_addr[i]::DWordWL, ['arr', ['idx', 'out', ['+', ['*', 8, 'i'], 3]], ['idx', 'out', ['+', ['*', 8, 'i'], 2]], ['idx', 'out', ['+', ['*', 8, 'i'], 1]], ['idx', 'out', ['+', ['*', 8, 'i'], 0]], ['idx', 'out', ['+', ['*', 8, 'i'], 7]], ['idx', 'out', ['+', ['*', 8, 'i'], 6]], ['idx', 'out', ['+', ['*', 8, 'i'], 5]], ['idx', 'out', ['+', ['*', 8, 'i'], 4]]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
+| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[[h[8 * i + 3], h[8 * i + 2], h[8 * i + 1], h[8 * i + 0], h[8 * i + 7], h[8 * i + 6], h[8 * i + 5], h[8 * i + 4]]; 0, h_addr[i]::DWordWL, [out[8 * i + 3], out[8 * i + 2], out[8 * i + 1], out[8 * i + 0], out[8 * i + 7], out[8 * i + 6], out[8 * i + 5], out[8 * i + 4]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
 
 Then we prepare the message schedule, by emitting the input chunk with multiplicities corresponding to the number of times it will be read during a compression evaluation. The  chip itself is implicitly invoked by itself and , setting the `amount` column appropriately for the number of times the `w` value is required.
 
@@ -2261,20 +2892,31 @@ And finally, we provide the boundaries for the  chip and the final addition of t
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHA256-C13` |  | `SHA256ROUND[timestamp, ['arr', ['+', ['*', ['^', 2, 0], ['idx', 'h', 3]], ['*', ['^', 2, 8], ['idx', 'h', 2]], ['*', ['^', 2, 16], ['idx', 'h', 1]], ['*', ['^', 2, 24], ['idx', 'h', 0]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 7]], ['*', ['^', 2, 8], ['idx', 'h', 6]], ['*', ['^', 2, 16], ['idx', 'h', 5]], ['*', ['^', 2, 24], ['idx', 'h', 4]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 11]], ['*', ['^', 2, 8], ['idx', 'h', 10]], ['*', ['^', 2, 16], ['idx', 'h', 9]], ['*', ['^', 2, 24], ['idx', 'h', 8]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 15]], ['*', ['^', 2, 8], ['idx', 'h', 14]], ['*', ['^', 2, 16], ['idx', 'h', 13]], ['*', ['^', 2, 24], ['idx', 'h', 12]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 19]], ['*', ['^', 2, 8], ['idx', 'h', 18]], ['*', ['^', 2, 16], ['idx', 'h', 17]], ['*', ['^', 2, 24], ['idx', 'h', 16]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 23]], ['*', ['^', 2, 8], ['idx', 'h', 22]], ['*', ['^', 2, 16], ['idx', 'h', 21]], ['*', ['^', 2, 24], ['idx', 'h', 20]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 27]], ['*', ['^', 2, 8], ['idx', 'h', 26]], ['*', ['^', 2, 16], ['idx', 'h', 25]], ['*', ['^', 2, 24], ['idx', 'h', 24]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 31]], ['*', ['^', 2, 8], ['idx', 'h', 30]], ['*', ['^', 2, 16], ['idx', 'h', 29]], ['*', ['^', 2, 24], ['idx', 'h', 28]]]], 0]` | μ |
+| `SHA256-C13` |  | `SHA256ROUND[timestamp, [2^0 * h[3] + 2^8 * h[2] + 2^16 * h[1] + 2^24 * h[0], 2^0 * h[7] + 2^8 * h[6] + 2^16 * h[5] + 2^24 * h[4], 2^0 * h[11] + 2^8 * h[10] + 2^16 * h[9] + 2^24 * h[8], 2^0 * h[15] + 2^8 * h[14] + 2^16 * h[13] + 2^24 * h[12], 2^0 * h[19] + 2^8 * h[18] + 2^16 * h[17] + 2^24 * h[16], 2^0 * h[23] + 2^8 * h[22] + 2^16 * h[21] + 2^24 * h[20], 2^0 * h[27] + 2^8 * h[26] + 2^16 * h[25] + 2^24 * h[24], 2^0 * h[31] + 2^8 * h[30] + 2^16 * h[29] + 2^24 * h[28]], 0]` | μ |
 | `SHA256-C14` |  | `SHA256ROUND[timestamp, last_round_out, 64]` | -μ |
-| `SHA256-C15.i` | i ∈ [0, 31] | `IS_BYTE[out[i]]` | μ |
-| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'out', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'out', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'out', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'out', ['+', ['*', 4, 'i'], 0]]]]]; ['arr', 0, ['idx', 'last_round_out', 'i']], ['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'h', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'h', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'h', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'h', ['+', ['*', 4, 'i'], 0]]]]]>` |  |
+| `SHA256-C15.i` | i ∈ [0, 31] | μ ⇒ `IS_BYTE<out[i]>` |  |
+| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<[0, 2^0 * out[4 * i + 3] + 2^8 * out[4 * i + 2] + 2^16 * out[4 * i + 1] + 2^24 * out[4 * i + 0]]; [0, last_round_out[i]], [0, 2^0 * h[4 * i + 3] + 2^8 * h[4 * i + 2] + 2^16 * h[4 * i + 1] + 2^24 * h[4 * i + 0]]>` |  |
 
 In this VM, we assign syscall number -1 to the  accelerator. The chip therefore contributes the following interaction to the lookup-argument:
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
 | `SHA256-C17` | `IS_BIT<μ>` |  |
-| `SHA256-C18` | `ECALL[timestamp, ['arr', ['-', ['^', 2, 32], 1], ['-', ['^', 2, 32], 1]]]` | -μ |
+| `SHA256-C18` | `ECALL[timestamp, (2^64 - 1)::DWordWL]` | -μ |
 
 ### Padding
 
+| Column | Padding value |
+|--------|---------------|
+| `timestamp` | `0` |
+| `h` | `0` |
+| `h_addr` | `[0, 8, 16, 24]` |
+| `m` | `0` |
+| `m_addr` | `[0, 8, 16, 24, 32, 40, 48, 56]` |
+| `out` | `0` |
+| `last_round_out` | `0` |
+| `μ` | `0` |
+
 ## `SHA256`msgsched chip
 
 ### Columns
@@ -2335,7 +2977,7 @@ First, we gather the dependencies from earlier in the message schedule.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHA256MSGSCHED-C1` | `IS_BYTE[index - 16]` | μ |
+| `SHA256MSGSCHED-C1` | μ ⇒ `IS_BYTE<index - 16>` |  |
 | `SHA256MSGSCHED-C2` | `SHA256_M[back2; timestamp, index - 2]` | μ |
 | `SHA256MSGSCHED-C3` | `SHA256_M[back7; timestamp, index - 7]` | μ |
 | `SHA256MSGSCHED-C4` | `SHA256_M[back15; timestamp, index - 15]` | μ |
@@ -2347,7 +2989,7 @@ Then, we calculate the result. It suffices to check that the carry of adding fou
 |-----|-------|-------------|--------------|
 | `SHA256MSGSCHED-C6` |  | `ROTXOR[s0; back15, 2, 11, 3, 0]` | μ |
 | `SHA256MSGSCHED-C7` |  | `ROTXOR[s1; back2, 3, 2, 10, 0]` | μ |
-| `SHA256MSGSCHED-C8` |  | `IS_BYTE[carry]` | μ |
+| `SHA256MSGSCHED-C8` |  | μ ⇒ `IS_BYTE<carry>` |  |
 | `SHA256MSGSCHED-C9.i` | i ∈ [0, 1] | `IS_HALF[out[i]]` | μ |
 
 Finally, we contribute to the LogUp.
@@ -2462,11 +3104,11 @@ To compute `maj`, observe that ` (a bitand b) xor (a bitand c) xor (b bitand c)
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHA256ROUND-C1.i` | i ∈ [0, 3] | `AND_BYTE[a_and_b[i]; a[i], b[i]]` | μ |
-| `SHA256ROUND-C2.i` | i ∈ [0, 3] | `XOR_BYTE[a_xor_b[i]; a[i], b[i]]` | μ |
-| `SHA256ROUND-C3.i` | i ∈ [0, 3] | `AND_BYTE[c_and_a_xor_b[i]; c[i], a_xor_b[i]]` | μ |
-| `SHA256ROUND-C4.i` | i ∈ [0, 3] | `AND_BYTE[e_and_f[i]; e[i], f[i]]` | μ |
-| `SHA256ROUND-C5.i` | i ∈ [0, 3] | `AND_BYTE[not_e_and_g[i]; 255 - e[i], g[i]]` | μ |
+| `SHA256ROUND-C1.i` | i ∈ [0, 3] | `BYTE_ALU[a_and_b[i]; ⧼AND⧽, a[i], b[i]]` | μ |
+| `SHA256ROUND-C2.i` | i ∈ [0, 3] | `BYTE_ALU[a_xor_b[i]; ⧼XOR⧽, a[i], b[i]]` | μ |
+| `SHA256ROUND-C3.i` | i ∈ [0, 3] | `BYTE_ALU[c_and_a_xor_b[i]; ⧼AND⧽, c[i], a_xor_b[i]]` | μ |
+| `SHA256ROUND-C4.i` | i ∈ [0, 3] | `BYTE_ALU[e_and_f[i]; ⧼AND⧽, e[i], f[i]]` | μ |
+| `SHA256ROUND-C5.i` | i ∈ [0, 3] | `BYTE_ALU[not_e_and_g[i]; ⧼AND⧽, 255 - e[i], g[i]]` | μ |
 | `SHA256ROUND-C6` |  | `SHA256_K[kval; index]` | μ |
 | `SHA256ROUND-C7` |  | `SHA256_M[wval; timestamp, index]` | μ |
 | `SHA256ROUND-C8` |  | `ROTXOR[S0; a::Word, 6, 9, 2, 1]` | μ |
@@ -2477,19 +3119,44 @@ Then we constrain the addition for the new state, constraining additions with th
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
 | `SHA256ROUND-C10.i` | i ∈ [0, 1] | `IS_HALF[out_a[i]]` | μ |
-| `SHA256ROUND-C11` |  | `IS_BYTE[carry_a]` | μ |
+| `SHA256ROUND-C11` |  | μ ⇒ `IS_BYTE<carry_a>` |  |
 | `SHA256ROUND-C12.i` | i ∈ [0, 1] | `IS_HALF[out_e[i]]` | μ |
-| `SHA256ROUND-C13` |  | `IS_BYTE[carry_e]` | μ |
+| `SHA256ROUND-C13` |  | μ ⇒ `IS_BYTE<carry_e>` |  |
 
 Finally, we chain the rounds together through the interactions.
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
-| `SHA256ROUND-C14` | `SHA256ROUND[timestamp, ['arr', ['cast', 'a', 'Word'], ['cast', 'b', 'Word'], ['cast', 'c', 'Word'], 'd', ['cast', 'e', 'Word'], ['cast', 'f', 'Word'], ['cast', 'g', 'Word'], 'h'], index]` | -μ |
-| `SHA256ROUND-C15` | `SHA256ROUND[timestamp, ['arr', ['cast', 'out_a', 'Word'], ['cast', 'a', 'Word'], ['cast', 'b', 'Word'], ['cast', 'c', 'Word'], ['cast', 'out_e', 'Word'], ['cast', 'e', 'Word'], ['cast', 'f', 'Word'], ['cast', 'g', 'Word']], index + 1]` | μ |
+| `SHA256ROUND-C14` | `SHA256ROUND[timestamp, [a::Word, b::Word, c::Word, d, e::Word, f::Word, g::Word, h], index]` | -μ |
+| `SHA256ROUND-C15` | `SHA256ROUND[timestamp, [out_a::Word, a::Word, b::Word, c::Word, out_e::Word, e::Word, f::Word, g::Word], index + 1]` | μ |
 
 ### Padding
 
+| Column | Padding value |
+|--------|---------------|
+| `timestamp` | `0` |
+| `a` | `0` |
+| `b` | `0` |
+| `c` | `0` |
+| `d` | `0` |
+| `e` | `0` |
+| `f` | `0` |
+| `g` | `0` |
+| `h` | `0` |
+| `index` | `0` |
+| `out_a` | `0` |
+| `out_e` | `0` |
+| `a_and_b` | `0` |
+| `a_xor_b` | `0` |
+| `c_and_a_xor_b` | `0` |
+| `e_and_f` | `0` |
+| `not_e_and_g` | `0` |
+| `kval` | `0` |
+| `S0` | `0` |
+| `S1` | `0` |
+| `wval` | `0` |
+| `μ` | `0` |
+
 ## `ROTXOR` chip
 
 This chip takes as input `a`, `r0`, `r1`, `r2` (4-bit values) and a bit `last_rot` to compute $ cases( (a >>> (16 + r_0)) xor (a >>> (16 + r_0 - r_1)) xor (a >>> r_2) quad "if" `last_rot`, (a >>> (16 + r_0)) xor (a >>> (16 + r_0 - r_1)) xor (a >> r_2) quad "if" `!last_rot` ), $ where we let `>>>` denote right rotation and `>>` logical shift right. We choose this representation so that all shift amounts required fit into 4 bits, making the usage of `HWSL` more straightforward and avoid extra columns to represent more bits.
@@ -2549,9 +3216,9 @@ We first compute all rotations (or shifts) of `a`. `a1` is computed as a left ro
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `ROTXOR-C1.i` | i ∈ [0, 1] | `HWSL[['arr', ['idx', 'a0_left', 'i'], ['idx', 'a0_right', 'i']]; a[i], 16 - r0]` | μ |
-| `ROTXOR-C2.i` | i ∈ [0, 1] | `HWSL[['arr', ['idx', 'a1_left', 'i'], ['idx', 'a1_right', 'i']]; (a0::WordHL)[i], r1]` | μ |
-| `ROTXOR-C3.i` | i ∈ [0, 1] | `HWSL[['arr', ['idx', 'a2_left', 'i'], ['idx', 'a2_right', 'i']]; a[i], 16 - r2]` | μ |
+| `ROTXOR-C1.i` | i ∈ [0, 1] | `HWSL[[a0_left[i], a0_right[i]]; a[i], 16 - r0]` | μ |
+| `ROTXOR-C2.i` | i ∈ [0, 1] | `HWSL[[a1_left[i], a1_right[i]]; (a0::WordHL)[i], r1]` | μ |
+| `ROTXOR-C3.i` | i ∈ [0, 1] | `HWSL[[a2_left[i], a2_right[i]]; a[i], 16 - r2]` | μ |
 | `ROTXOR-C4.i` | i ∈ [0, 1] | `a0[i]` = `a0_left[i]` + `a0_right[1 - i]` |  |
 | | | _polynomial:_ `(a0::WordHL)[i] - a0_left[i] - a0_right[1 - i] = 0` | |
 | `ROTXOR-C5.i` | i ∈ [0, 1] | `a1[i]` = `a1_left[i]` + `a1_right[1 - i]` |  |
@@ -2559,14 +3226,14 @@ We first compute all rotations (or shifts) of `a`. `a1` is computed as a left ro
 | `ROTXOR-C6` |  | `a2[0]` = `a2_left[1]` + `a2_right[0]` |  |
 | | | _polynomial:_ `(a2::WordHL)[0] - a2_left[1] - a2_right[0] = 0` | |
 | `ROTXOR-C7` |  | `a2[1]` = `last_rot` dot `a2_left[0]` + `a2_right[1]` |  |
-| | | _polynomial:_ `(a2::WordHL)[0] - last_rot * a2_left[0] - a2_right[1] = 0` | |
+| | | _polynomial:_ `(a2::WordHL)[1] - last_rot * a2_left[0] - a2_right[1] = 0` | |
 
 Then the bitwise XOR of the results.
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `ROTXOR-C8.i` | i ∈ [0, 3] | `XOR_BYTE[a01[i]; a0[i], a1[i]]` | μ |
-| `ROTXOR-C9.i` | i ∈ [0, 3] | `XOR_BYTE[out[i]; a01[i], a2[i]]` | μ |
+| `ROTXOR-C8.i` | i ∈ [0, 3] | `BYTE_ALU[a01[i]; ⧼XOR⧽, a0[i], a1[i]]` | μ |
+| `ROTXOR-C9.i` | i ∈ [0, 3] | `BYTE_ALU[out[i]; ⧼XOR⧽, a01[i], a2[i]]` | μ |
 
 And finally contribute to the lookup argument.
 
@@ -2576,6 +3243,26 @@ And finally contribute to the lookup argument.
 
 ### Padding
 
+| Column | Padding value |
+|--------|---------------|
+| `a` | `0` |
+| `r0` | `0` |
+| `r1` | `0` |
+| `r2` | `0` |
+| `last_rot` | `0` |
+| `out` | `0` |
+| `a0_left` | `0` |
+| `a0_right` | `0` |
+| `a1_left` | `0` |
+| `a1_right` | `0` |
+| `a2_left` | `0` |
+| `a2_right` | `0` |
+| `a0` | `0` |
+| `a1` | `0` |
+| `a2` | `0` |
+| `a01` | `0` |
+| `μ` | `0` |
+
 ## Constant lookup
 
 As mentioned, we provide the round constants through a short precomputed lookup table: .
@@ -2637,14 +3324,14 @@ As mentioned, we provide the round constants through a short precomputed lookup
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHA256-C1` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'm_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C1` |  | `MEMW[[(m_addr[0]::DWordWL)[0], (m_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 11)::DWordWL, [(m_addr[0]::DWordWL)[0], (m_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
 | `SHA256-C2.i` | i ∈ [0, 7], j ∈ [0, 3] | `IS_HALF[m_addr[i][j]]` | μ |
 | `SHA256-C3.i` | i ∈ [1, 7] | `ADD<m_addr[i]::DWordWL; m_addr[0]::DWordWL, (8 * i)::DWordWL>` |  |
-| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]]; 0, m_addr[i]::DWordWL, ['arr', ['idx', 'm', ['+', ['*', 8, 'i'], 3]], ['idx', 'm', ['+', ['*', 8, 'i'], 2]], ['idx', 'm', ['+', ['*', 8, 'i'], 1]], ['idx', 'm', ['+', ['*', 8, 'i'], 0]], ['idx', 'm', ['+', ['*', 8, 'i'], 7]], ['idx', 'm', ['+', ['*', 8, 'i'], 6]], ['idx', 'm', ['+', ['*', 8, 'i'], 5]], ['idx', 'm', ['+', ['*', 8, 'i'], 4]]], timestamp, 0, 0, 1]` | μ |
-| `SHA256-C5` |  | `MEMW[['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, ['arr', ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 0], ['idx', ['cast', ['idx', 'h_addr', 0], 'DWordWL'], 1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
+| `SHA256-C4.i` | i ∈ [0, 7] | `MEMW[[m[8 * i + 3], m[8 * i + 2], m[8 * i + 1], m[8 * i + 0], m[8 * i + 7], m[8 * i + 6], m[8 * i + 5], m[8 * i + 4]]; 0, m_addr[i]::DWordWL, [m[8 * i + 3], m[8 * i + 2], m[8 * i + 1], m[8 * i + 0], m[8 * i + 7], m[8 * i + 6], m[8 * i + 5], m[8 * i + 4]], timestamp, 0, 0, 1]` | μ |
+| `SHA256-C5` |  | `MEMW[[(h_addr[0]::DWordWL)[0], (h_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0]; 1, (2 * 10)::DWordWL, [(h_addr[0]::DWordWL)[0], (h_addr[0]::DWordWL)[1], 0, 0, 0, 0, 0, 0], timestamp, 1, 0, 0]` | μ |
 | `SHA256-C6.i` | i ∈ [0, 3], j ∈ [0, 3] | `IS_HALF[h_addr[i][j]]` | μ |
 | `SHA256-C7.i` | i ∈ [1, 3] | `ADD<h_addr[i]::DWordWL; h_addr[0]::DWordWL, 8 * i::DWordWL>` |  |
-| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[['arr', ['idx', 'h', ['+', ['*', 8, 'i'], 3]], ['idx', 'h', ['+', ['*', 8, 'i'], 2]], ['idx', 'h', ['+', ['*', 8, 'i'], 1]], ['idx', 'h', ['+', ['*', 8, 'i'], 0]], ['idx', 'h', ['+', ['*', 8, 'i'], 7]], ['idx', 'h', ['+', ['*', 8, 'i'], 6]], ['idx', 'h', ['+', ['*', 8, 'i'], 5]], ['idx', 'h', ['+', ['*', 8, 'i'], 4]]]; 0, h_addr[i]::DWordWL, ['arr', ['idx', 'out', ['+', ['*', 8, 'i'], 3]], ['idx', 'out', ['+', ['*', 8, 'i'], 2]], ['idx', 'out', ['+', ['*', 8, 'i'], 1]], ['idx', 'out', ['+', ['*', 8, 'i'], 0]], ['idx', 'out', ['+', ['*', 8, 'i'], 7]], ['idx', 'out', ['+', ['*', 8, 'i'], 6]], ['idx', 'out', ['+', ['*', 8, 'i'], 5]], ['idx', 'out', ['+', ['*', 8, 'i'], 4]]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
+| `SHA256-C8.i` | i ∈ [0, 3] | `MEMW[[h[8 * i + 3], h[8 * i + 2], h[8 * i + 1], h[8 * i + 0], h[8 * i + 7], h[8 * i + 6], h[8 * i + 5], h[8 * i + 4]]; 0, h_addr[i]::DWordWL, [out[8 * i + 3], out[8 * i + 2], out[8 * i + 1], out[8 * i + 0], out[8 * i + 7], out[8 * i + 6], out[8 * i + 5], out[8 * i + 4]], timestamp + 1::DWordWL, 0, 0, 1]` | μ |
 
 ### sched
 
@@ -2659,14 +3346,246 @@ As mentioned, we provide the round constants through a short precomputed lookup
 
 | Tag | Range | Description | Multiplicity |
 |-----|-------|-------------|--------------|
-| `SHA256-C13` |  | `SHA256ROUND[timestamp, ['arr', ['+', ['*', ['^', 2, 0], ['idx', 'h', 3]], ['*', ['^', 2, 8], ['idx', 'h', 2]], ['*', ['^', 2, 16], ['idx', 'h', 1]], ['*', ['^', 2, 24], ['idx', 'h', 0]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 7]], ['*', ['^', 2, 8], ['idx', 'h', 6]], ['*', ['^', 2, 16], ['idx', 'h', 5]], ['*', ['^', 2, 24], ['idx', 'h', 4]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 11]], ['*', ['^', 2, 8], ['idx', 'h', 10]], ['*', ['^', 2, 16], ['idx', 'h', 9]], ['*', ['^', 2, 24], ['idx', 'h', 8]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 15]], ['*', ['^', 2, 8], ['idx', 'h', 14]], ['*', ['^', 2, 16], ['idx', 'h', 13]], ['*', ['^', 2, 24], ['idx', 'h', 12]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 19]], ['*', ['^', 2, 8], ['idx', 'h', 18]], ['*', ['^', 2, 16], ['idx', 'h', 17]], ['*', ['^', 2, 24], ['idx', 'h', 16]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 23]], ['*', ['^', 2, 8], ['idx', 'h', 22]], ['*', ['^', 2, 16], ['idx', 'h', 21]], ['*', ['^', 2, 24], ['idx', 'h', 20]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 27]], ['*', ['^', 2, 8], ['idx', 'h', 26]], ['*', ['^', 2, 16], ['idx', 'h', 25]], ['*', ['^', 2, 24], ['idx', 'h', 24]]], ['+', ['*', ['^', 2, 0], ['idx', 'h', 31]], ['*', ['^', 2, 8], ['idx', 'h', 30]], ['*', ['^', 2, 16], ['idx', 'h', 29]], ['*', ['^', 2, 24], ['idx', 'h', 28]]]], 0]` | μ |
+| `SHA256-C13` |  | `SHA256ROUND[timestamp, [2^0 * h[3] + 2^8 * h[2] + 2^16 * h[1] + 2^24 * h[0], 2^0 * h[7] + 2^8 * h[6] + 2^16 * h[5] + 2^24 * h[4], 2^0 * h[11] + 2^8 * h[10] + 2^16 * h[9] + 2^24 * h[8], 2^0 * h[15] + 2^8 * h[14] + 2^16 * h[13] + 2^24 * h[12], 2^0 * h[19] + 2^8 * h[18] + 2^16 * h[17] + 2^24 * h[16], 2^0 * h[23] + 2^8 * h[22] + 2^16 * h[21] + 2^24 * h[20], 2^0 * h[27] + 2^8 * h[26] + 2^16 * h[25] + 2^24 * h[24], 2^0 * h[31] + 2^8 * h[30] + 2^16 * h[29] + 2^24 * h[28]], 0]` | μ |
 | `SHA256-C14` |  | `SHA256ROUND[timestamp, last_round_out, 64]` | -μ |
-| `SHA256-C15.i` | i ∈ [0, 31] | `IS_BYTE[out[i]]` | μ |
-| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'out', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'out', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'out', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'out', ['+', ['*', 4, 'i'], 0]]]]]; ['arr', 0, ['idx', 'last_round_out', 'i']], ['arr', 0, ['+', ['*', ['^', 2, 0], ['idx', 'h', ['+', ['*', 4, 'i'], 3]]], ['*', ['^', 2, 8], ['idx', 'h', ['+', ['*', 4, 'i'], 2]]], ['*', ['^', 2, 16], ['idx', 'h', ['+', ['*', 4, 'i'], 1]]], ['*', ['^', 2, 24], ['idx', 'h', ['+', ['*', 4, 'i'], 0]]]]]>` |  |
+| `SHA256-C15.i` | i ∈ [0, 31] | μ ⇒ `IS_BYTE<out[i]>` |  |
+| `SHA256-C16.i` | i ∈ [0, 7] | `ADD<[0, 2^0 * out[4 * i + 3] + 2^8 * out[4 * i + 2] + 2^16 * out[4 * i + 1] + 2^24 * out[4 * i + 0]]; [0, last_round_out[i]], [0, 2^0 * h[4 * i + 3] + 2^8 * h[4 * i + 2] + 2^16 * h[4 * i + 1] + 2^24 * h[4 * i + 0]]>` |  |
 
 ### lookup
 
 | Tag | Description | Multiplicity |
 |-----|-------------|--------------|
 | `SHA256-C17` | `IS_BIT<μ>` |  |
-| `SHA256-C18` | `ECALL[timestamp, ['arr', ['-', ['^', 2, 32], 1], ['-', ['^', 2, 32], 1]]]` | -μ |
\ No newline at end of file
+| `SHA256-C18` | `ECALL[timestamp, (2^64 - 1)::DWordWL]` | -μ |
+
+---
+
+# KECCAK Accelerator
+
+The `KECCAK` chip applies the keccak permutation `kappa` to a given memory range; other aspects of keccak hashing (such as repeated permutation invocation, input padding and state initialization) fall outside the scope of this accelerator.
+
+This permutation `kappa: FF_2^1600 -> FF_2^1600` operates on 1600 bits and is composed of 24 applications of round-permutation `Lambda: FF_2^1600 times NN -> FF_2^1600`, where the additional parameter is the round constant. `Lambda` is defined as the composition `iota compose chi compose pi compose rho compose theta`, where only `iota` depends on the round constant.
+
+The keccak accelerator comprises two chips: a core chip that interacts with the memory --- loading the input and writing the output, and a round chip that applies the round permutation.
+
+## Core chip
+
+### Columns
+
+The core chip is comprised of the following variables:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which the permutation is performed |
+| `addr` | `DWordBL` | memory address storing the first bit of the state |
+| `input_state` | `[['Byte', 8], 5][5]` | state at the start of executing the permutation |
+
+### Output
+
+| Name | Type | Description |
+|------|------|-------------|
+| `output_state` | `[['Byte', 8], 5][5]` | state after executing the permutation |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `state_ptr` | `['DWordHL', 5][5]` | memory addresses storing the entire state |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+### Constraints
+
+In this VM, we assign syscall number -2 to the `KECCAK` accelerator. The chip therefore contributes the following interaction to the lookup-argument:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `KECCAK-C1` | `ECALL[timestamp, (2^64 - 2)::DWordWL]` | -μ |
+
+The address containing the state to be permuted is passed in as argument `A0 = x10`. The following constraints describe that this address is read into `addr` ([keccak:c:read_addr]), from which `state_ptr` --- the collection of pointers to all lanes of the state --- is derived ([keccak:c:state_ptr]). The state is then read into `input_state`, while the `output_state` is written back to the indicated address ([keccak:c:load_store_state]).
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `KECCAK-C2` |  | `MEMW[addr; 1, (2 * 10)::DWordWL, addr, timestamp, 1, 0, 0]` | μ |
+| `KECCAK-C3.i` | x ∈ [0, 4], y ∈ [0, 4] | `ADD<state_ptr[x][y]::DWordWL; addr::DWordWL, (8 * (5 * y + x))::DWordWL>` |  |
+| `KECCAK-C4.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 3] | `IS_HALF[state_ptr[x][y][z]]` | μ |
+| `KECCAK-C5.i` | x ∈ [0, 4], y ∈ [0, 4] | `MEMW[input_state[x][y]; 0, state_ptr[x][y]::DWordWL, output_state[x][y], timestamp, 0, 0, 1]` | μ |
+
+Lastly, the input state is pushed to the Keccak-round function, while the output after 24 rounds is taken off the bus:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `KECCAK-C6` | `KECCAK[timestamp, 0, input_state]` | μ |
+| `KECCAK-C7` | `KECCAK[timestamp, 24, output_state]` | -μ |
+
+### Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+| Column | Padding value |
+|--------|---------------|
+| `timestamp` | `0` |
+| `addr` | `0` |
+| `input_state` | `0` |
+| `output_state` | `0` |
+| `state_ptr` | `8 * [[0, 1, 2, 3, 4], [5, 6, 7, 8, 9], [10, 11, 12, 13, 14], [15, 16, 17, 18, 19], [20, 21, 22, 23, 24]]` |
+| `μ` | `0` |
+
+## Round chip
+
+### Columns
+
+The round chip is comprised of the following variables:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `timestamp` | `DWordWL` | timestamp at which the permutation is performed |
+| `round` | `BaseField` | index of the permutation round |
+| `start` | `[['Byte', 8], 5][5]` | state at the start of executing the permutation |
+
+### Auxiliary
+
+| Name | Type | Description |
+|------|------|-------------|
+| `Cxz` | `[['Byte', 8], 4][5]` | $xor_(i=0)^(y+2) `start[x,i,z]`$ |
+| `Cxz_left` | `['Byte', 8][5]` | the left-rotated component of `rotated_Cxz` |
+| `Cxz_right` | `['Bit', 4][5]` | the right-rotated component of `rotated_Cxz` (which is a single bit) |
+| `Dxz` | `['Byte', 8][5]` | $`Cxz[`\(`x` - 1) mod 5`,y,z]` xor `rotated_Cxz[`\(`x` + 1) mod 5`,y,z]`$ |
+| `theta` | `[['Byte', 8], 5][5]` | $theta(`start`)$, the state after applying $theta$. |
+| `rot_left` | `[['Byte', 8], 5][5]` | the left-rotated component of $`theta[x,y]` <<< `rnc`$ |
+| `rot_right` | `[['Byte', 8], 5][5]` | the right-rotated component of $`theta[x,y]` <<< `rnc`$ |
+| `chi_ANDs` | `[['Byte', 8], 5][5]` | $(`pi[`\(x+1) mod 5`,y,z]` xor 255) times.o `pi[`\(x + 2) mod 5`,y,z]`$ |
+| `chi` | `[['Byte', 8], 5][5]` | $(chi compose pi compose rho compose theta)(`start`)$; the state after applying $chi$ |
+| `rc` | `Byte[8]` | round constants |
+| `iota` | `Byte[8]` | state update following from step $iota$. |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `rotated_Cxz` | `['Byte', 8][5]` | $`Cxz[x,`3`,z]` <<< 1$ |
+| `out` | `[['Byte', 8], 5][5]` | state at the end of executing the permutation |
+| `rho` | `[['Byte', 8], 5][5]` | $(rho compose theta)(`start`)$; the state after applying $rho$ |
+| `pi` | `[['Byte', 8], 5][5]` | $(pi compose rho compose theta)(`start`)$; the state after applying $pi$ |
+
+**Definition of `rotated_Cxz`:**
+```
+rotated_Cxz := Cxz_left[x][z] + Cxz_right[x][3]
+rotated_Cxz := Cxz_left[x][z]
+rotated_Cxz := Cxz_left[x][z] + Cxz_right[x][0]
+rotated_Cxz := Cxz_left[x][z]
+rotated_Cxz := Cxz_left[x][z] + Cxz_right[x][1]
+rotated_Cxz := Cxz_left[x][z]
+rotated_Cxz := Cxz_left[x][z] + Cxz_right[x][2]
+rotated_Cxz := Cxz_left[x][z]
+```
+
+**Definition of `out`:**
+```
+out := iota[z]
+out := chi[x][y][z]
+out := chi[x][y][z]
+out := chi[x][y][z]
+```
+
+**Definition of `rho`:**
+```
+rho := (1 - rbc[x][y][0]) * (1 - rbc[x][y][1]) * (rot_left[x][y][z] + rot_right[x][y][(z - 2) mod 8]) + rbc[x][y][0] * (1 - rbc[x][y][1]) * (rot_left[x][y][(z - 2) mod 8] + rot_right[x][y][(z - 4) mod 8]) + (1 - rbc[x][y][0]) * rbc[x][y][1] * (rot_left[x][y][(z - 4) mod 8] + rot_right[x][y][(z - 6) mod 8]) + rbc[x][y][0] * rbc[x][y][1] * (rot_left[x][y][(z - 6) mod 8] + rot_right[x][y][z])
+```
+
+**Definition of `pi`:**
+```
+pi := rho[(x + 3 * y) mod 5][x][z]
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+`start` contains the state to which the permutation should be applied. Its three-dimensional array mimics the specification's three-dimensional state
+
+and orders the bits as prescribed.
+
+Rho rotates every lane by a rotation offset in `[0, 64)`. These offsets are identical for every round.
+
+We decompose each offset into three components: the lower nibble (4 bits) is represented by `rnc`, while the upper two bits are represented as `Bit`s in `rbc`. That is, `rho_offset[x][y] = rnc[x][y] + 16 * rbc[x][y][0] + 32 * rbc[x][y][1]`.
+
+### Constraints
+
+The following constraints ensure that `theta` captures the state after applying the first subpermutation of the round-permutation: `theta`. Note here that `Cxz_left` and `Cxz_right` do have to be range-checked; it cannot be assumed that this implicitly follows from [keccak:c:Dxz] combined with `rotated_Cxz`'s definition.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `KECCAK_RND-C1.i` | x ∈ [0, 4], z ∈ [0, 7] | `BYTE_ALU[Cxz[x][0][z]; ⧼XOR⧽, start[x][0][z], start[x][1][z]]` | μ |
+| `KECCAK_RND-C2.i` | x ∈ [0, 4], y ∈ [2, 4], z ∈ [0, 7] | `BYTE_ALU[Cxz[x][y - 1][z]; ⧼XOR⧽, Cxz[x][y - 2][z], start[x][y][z]]` | μ |
+| `KECCAK_RND-C3.i` | x ∈ [0, 4], z ∈ [0, 3] | `HWSL[[(Cxz_left[x]::DWordHL)[z], Cxz_right[x][z]::Half]; (Cxz[x][3]::DWordHL)[z], 1]` | μ |
+| `KECCAK_RND-C4.i` | x ∈ [0, 4], z ∈ [0, 7] | μ ⇒ `IS_BYTE<Cxz_left[x][z]>` |  |
+| `KECCAK_RND-C5.i` | x ∈ [0, 4], z ∈ [0, 3] | `IS_BIT<Cxz_right[x][z]>` |  |
+| `KECCAK_RND-C6.i` | x ∈ [0, 4], z ∈ [0, 7] | `BYTE_ALU[Dxz[x][z]; ⧼XOR⧽, Cxz[(x - 1) mod 5][3][z], rotated_Cxz[(x + 1) mod 5][z]]` | μ |
+| `KECCAK_RND-C7.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 7] | `BYTE_ALU[theta[x][y][z]; ⧼XOR⧽, start[x][y][z], Dxz[x][z]]` | μ |
+
+Next, we constrain that `rho` captures the state after applying subpermutation `rho`. Note here as well that `rot_left` and `rot_right` do have to be range-checked; it cannot be assumed that this implicitly follows from later constraints.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `KECCAK_RND-C8.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 3] | `HWSL[[(rot_left[x][y]::DWordHL)[z], (rot_right[x][y]::DWordHL)[z]]; (theta[x][y]::DWordHL)[z], rnc[x][y]]` | μ |
+| `KECCAK_RND-C9.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 7] | μ ⇒ `IS_BYTE<rot_left[x][y][z]>` |  |
+| `KECCAK_RND-C10.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 7] | μ ⇒ `IS_BYTE<rot_right[x][y][z]>` |  |
+
+Observe that the lane-permutation performed by `pi` is absorbed in `pi`'s definition. The next permutation that is constrained is `chi`:
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `KECCAK_RND-C11.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 7] | `BYTE_ALU[chi_ANDs[x][y][z]; ⧼AND⧽, 255 - pi[(x + 1) mod 5][y][z], pi[(x + 2) mod 5][y][z]]` | μ |
+| `KECCAK_RND-C12.i` | x ∈ [0, 4], y ∈ [0, 4], z ∈ [0, 7] | `BYTE_ALU[chi[x][y][z]; ⧼XOR⧽, pi[x][y][z], chi_ANDs[x][y][z]]` | μ |
+
+Lastly, the round constants are added to one of the lanes in the state. `iota` contains the updated lane. In the definition of `out`, the output of `chi` and `iota` is combined to construct the output of the permutation.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `KECCAK_RND-C13.i` | z ∈ [0, 7] | `BYTE_ALU[iota[z]; ⧼XOR⧽, chi[0][0][z], rc[z]]` | μ |
+
+Lastly, the round chip contributes the following interactions to the lookup:
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `KECCAK_RND-C14` | `KECCAK[timestamp, round, start]` | -μ |
+| `KECCAK_RND-C15` | `KECCAK[timestamp, round + 1, out]` | μ |
+| `KECCAK_RND-C16` | `KECCAK_RC[rc; round]` | -μ |
+
+### Notes/potential optimizations
+
+- one does not have to repeat `addr` in `state_ptr`; this saves 4 columns and 4 `IS_HALF` checks. - step `rho` does not need to be applied to `state[0][0]`; it has a zero shift. This saves 16 columns and 4 `HWSL` interactions. - when the outputs of `HWSL` are `Byte`s mapped as `Half`s, we find that out of every four output bytes, at least one is zero. Since `rnc` is constant, [keccak:c:rho_rotation] makes those zero-bytes show up in `rot_left` and `rot_right` at constant locations. This means 96 columns can be removed from the chip at no cost. Likewise, 96 `IS_BYTE` interactions can be dropped from [keccak:c:range_rot_left] and [keccak:c:range_rot_right]. - the shift-constants are equivalent to `1 mod 16` for `(x, y) = (1, 0)` and `-1 mod 16` for `(x, y) = (2, 3)`. This means that for those lanes it suffices to constrain `rot_left`/`rot_right` as `Bit`s rather than `Byte`s, saving an additional 8 `IS_BYTE` interactions. - `rc[2] = rc[4] = rc[5] = rc[6] = 0`. As such, those elements need not be stored in `rc`, and need not be XORed into the state in the `iota`-step. This saves 8 columns and 4 `XOR_BYTE` interactions. - when executed in large volumes, `KECCAK_RND` could benefit from having a three-way XOR lookup table. With this in place, the 80 interactions in [keccak:c:theta_cxz_start] and [keccak:c:theta_cxz] could be dropped. Likewise, 80 columns could be removed from the chip (a \~5% savings).
+
+## Round constant lookup
+
+### Columns
+
+We provide the round constants through a short precomputed lookup table: `KECCAK_RC`.
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `round` | `BaseField` |  |
+| `RC` | `Byte[8]` | round constants for the given `round` |
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `BaseField` |  |
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `KECCAK_RC-C1` | `KECCAK_RC[RC; round]` | -μ |
\ No newline at end of file
diff --git a/docs/spec/store.md b/docs/spec/store.md
new file mode 100644
index 000000000..c694b319b
--- /dev/null
+++ b/docs/spec/store.md
@@ -0,0 +1,78 @@
+# STORE Chip
+
+The `STORE` chip provides functionality to store a value to memory. It decomposes a `DWord` into bytes and delegates low-level memory handling to the `MEMW` chip ([memw]).
+
+## Variables
+
+The `STORE` chip is comprised of the following variables:
+
+### Input
+
+| Name | Type | Description |
+|------|------|-------------|
+| `base_address` | `DWordWL` | The base address to write to, gets offset by $[0, 7]$, depending on how big the access is |
+| `timestamp` | `DWordWL` | The timestamp at which this memory access is said to occur |
+| `write2` | `Bit` | Whether to write exactly 2 bytes |
+| `write4` | `Bit` | Whether to write exactly 4 bytes |
+| `write8` | `Bit` | Whether to write exactly 8 bytes |
+| `value` | `DWordBL` | The value to store |
+
+### Virtual
+
+| Name | Type | Description |
+|------|------|-------------|
+| `write1` | `Bit` | Whether to write exactly 1 byte |
+
+**Definition of `write1`:**
+```
+write1 := μ - write2 - write4 - write8
+```
+
+### Multiplicity
+
+| Name | Type | Description |
+|------|------|-------------|
+| `μ` | `Bit` |  |
+
+## Assumptions
+
+| Tag | Range | Description |
+|-----|-------|-------------|
+| `STORE-A1.i` | i ∈ [0, 1] | `IS_WORD[base_address[i]]` |
+| `STORE-A2.i` | i ∈ [0, 1] | `IS_WORD[timestamp[i]]` |
+
+## Constraints
+
+The chip delegates the actual memory interaction to the `MEMW` chip, and ensures the values are proper bytes.
+
+| Tag | Range | Description | Multiplicity |
+|-----|-------|-------------|--------------|
+| `STORE-C1` |  | `IS_BIT<μ>` |  |
+| `STORE-C2` |  | `IS_BIT<write2>` |  |
+| `STORE-C3` |  | `IS_BIT<write4>` |  |
+| `STORE-C4` |  | `IS_BIT<write8>` |  |
+| `STORE-C5` |  | `IS_BIT<write2 + write4 + write8>` |  |
+| `STORE-C6` |  | `write2` + `write4` + `write8` => `μ` = 1 |  |
+| | | _polynomial:_ `(write2 + write4 + write8) * (1 - μ) = 0` | |
+| `STORE-C7.i` | i ∈ [0, 7] | μ ⇒ `IS_BYTE<value[i]>` |  |
+| `STORE-C8` |  | `MEMW[0, base_address, value, timestamp, write2, write4, write8]` | μ |
+
+The chip contributes the following to the lookup argument.
+
+| Tag | Description | Multiplicity |
+|-----|-------------|--------------|
+| `STORE-C9` | `MEMOP[0::DWordWL; timestamp, base_address, value::DWordWL, 1 + 4 * write2 + 8 * write4 + 16 * write8]` | -μ |
+
+## Padding
+
+The table can be padded to the next power of two with the following value assignments:
+
+| Column | Padding value |
+|--------|---------------|
+| `base_address` | `0` |
+| `timestamp` | `0` |
+| `write2` | `0` |
+| `write4` | `0` |
+| `write8` | `0` |
+| `value` | `0` |
+| `μ` | `0` |
\ No newline at end of file
diff --git a/scripts/typst_to_md.py b/scripts/typst_to_md.py
index e0a949536..3860acba1 100644
--- a/scripts/typst_to_md.py
+++ b/scripts/typst_to_md.py
@@ -69,7 +69,7 @@ def expr_to_text(expr, parent_prec: int = 100) -> str:
     """
     PREC = {
         "idx": 0, "pow": 1, "neg": 2, "cast": 3, "mul": 4,
-        "div": 5, "sum": 6, "not": 7, "add": 8, "sub": 9, "eq": 10,
+        "div": 5, "mod": 6, "sum": 7, "not": 8, "add": 9, "sub": 10, "eq": 11,
     }
 
     def wrap(s: str, prec: int) -> str:
@@ -89,6 +89,15 @@ def wrap(s: str, prec: int) -> str:
             base = expr_to_text(expr[1], PREC["idx"])
             idx = expr_to_text(expr[2], 100)
             return f"{base}[{idx}]"
+        elif op == "arr":
+            parts = [expr_to_text(e, 100) for e in expr[1:]]
+            return "[" + ", ".join(parts) + "]"
+        elif op == "opsel":
+            return f"⧼{expr[1]}⧽"
+        elif op == "mod":
+            lhs = expr_to_text(expr[1], PREC["mod"])
+            rhs = expr_to_text(expr[2], PREC["mod"])
+            return wrap(f"{lhs} mod {rhs}", PREC["mod"])
         elif op == "not":
             inner = expr_to_text(expr[1], PREC["not"])
             return wrap(f"1 - {inner}", PREC["not"])
@@ -164,23 +173,29 @@ def iters_to_text(obj: dict) -> str:
     ("variables", "Variables"),
     ("signatures", "Signatures"),
     ("is_bit", "IS_BIT Template"),
+    ("is_byte", "IS_BYTE Template"),
     ("sign", "SIGN Template"),
     ("add", "ADD/SUB Template"),
     ("neg", "NEG Template"),
-    ("memw", "MEMW Chip"),
     ("decode", "DECODE Table"),
     ("cpu", "CPU Chip"),
+    ("cpu32", "CPU32 Chip"),
     ("shift", "SHIFT Chip"),
     ("branch", "BRANCH Chip"),
     ("lt", "LT Chip"),
+    ("eq", "EQ Chip"),
     ("mul", "MUL Chip"),
     ("dvrm", "DVRM Chip"),
-    ("load", "LOAD Chip"),
     ("bitwise", "BITWISE Chips"),
+    ("bytewise", "BYTEWISE Chip"),
+    ("memw", "MEMW Chip"),
+    ("load", "LOAD Chip"),
+    ("store", "STORE Chip"),
     ("about_ecalls", "About ECALL"),
     ("halt", "HALT Chip"),
     ("commit", "COMMIT Chip"),
     ("sha256", "SHA256 Accelerator"),
+    ("keccak", "KECCAK Accelerator"),
 ]
 
 
@@ -587,17 +602,39 @@ def render_assumptions_table(chip: dict, config: dict) -> str:
 
 
 def render_padding_table(chip: dict, config: dict) -> str:
-    """Render padding data as Markdown table."""
-    padding = chip.get("padding", {})
-    if not padding:
+    """Render padding data as Markdown table.
+
+    Padding values live on each variable as a `pad` attribute (mirrors
+    `render_chip_padding_table` in spec/chip.typ): instantiated,
+    non-preprocessed variables only.
+    """
+    var_cfg = config.get("variables", {})
+    instantiated = var_cfg.get("categories", {}).get("instantiated", [])
+    preprocessed_labels = {
+        t["label"] for t in var_cfg.get("types", []) if t.get("preprocessed", False)
+    }
+
+    rows = []
+    for category in instantiated:
+        for var in chip.get("variables", {}).get(category, []):
+            var_type = var.get("type")
+            if isinstance(var_type, str) and var_type in preprocessed_labels:
+                continue
+            if "pad" in var:
+                rows.append((var["name"], expr_to_text(var["pad"])))
+
+    # Legacy schema fallback: top-level `padding` table.
+    for col_name, value in chip.get("padding", {}).items():
+        rows.append((col_name, str(value)))
+
+    if not rows:
         return ""
 
     lines = []
-    lines.append("| Column | Value |")
-    lines.append("|--------|-------|")
-
-    for col_name, value in padding.items():
-        lines.append(f"| `{col_name}` | `{value}` |")
+    lines.append("| Column | Padding value |")
+    lines.append("|--------|---------------|")
+    for name, value in rows:
+        lines.append(f"| `{name}` | `{value}` |")
 
     lines.append("")
     return "\n".join(lines)

From a6ef9d15731be176597389c033460bf1d5854c01 Mon Sep 17 00:00:00 2001
From: Robin Jadoul <robin.jadoul@3milabs.tech>
Date: Tue, 9 Jun 2026 14:32:37 +0200
Subject: [PATCH 104/105] Add extra constraints to prevent register side
 effects in CPU32 padding rows (#646)

* Add extra constraints to prevent register side effects in CPU32 padding rows

* fixes

* Patch 'signed' soundness hole and small cleanups

* More explicit constraint
---
 spec/cpu32.typ      |  1 +
 spec/src/cpu32.toml | 24 ++++++++++++++++++++++--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/spec/cpu32.typ b/spec/cpu32.typ
index e5e8963bc..a3c639cc7 100644
--- a/spec/cpu32.typ
+++ b/spec/cpu32.typ
@@ -45,6 +45,7 @@ provide these below.
 
 Most constraints correspond to those already present in the CPU, and we present them here first,
 including some updates to the range checking corresponding to the differing types.
+We also need to make sure that for padding rows ($mu = 0$), no side effects can occur.
 
 #render_constraint_table(chip, config, groups: ("decode", "range", "alu", "mem", "logup"))
 
diff --git a/spec/src/cpu32.toml b/spec/src/cpu32.toml
index f26ce0e8c..e226c847c 100644
--- a/spec/src/cpu32.toml
+++ b/spec/src/cpu32.toml
@@ -186,7 +186,7 @@ name = "assumptions"
 
 [[constraints.assumptions]]
 kind = "arith"
-constraint = "$#`read_register2` = 0 or #`imm[i] = 0`$"
+constraint = "$#`read_register2` = 0 or #`imm = 0`$"
 poly = ["*", "read_register2", ["+", ["idx", "imm", 0], ["idx", "imm", 1]]]
 
 [[constraint_groups]]
@@ -350,6 +350,21 @@ multiplicity = "write_register"
 [[constraint_groups]]
 name = "logup"
 
+[[constraints.logup]]
+kind = "arith"
+constraint = "$#`!μ` => #`read_register1 = 0`$"
+poly = ["*", ["not", "μ"], "read_register1"]
+
+[[constraints.logup]]
+kind = "arith"
+constraint = "$#`!μ` => #`read_register2 = 0`$"
+poly = ["*", ["not", "μ"], "read_register2"]
+
+[[constraints.logup]]
+kind = "arith"
+constraint = "$#`!μ` => #`write_register = 0`$"
+poly = ["*", ["not", "μ"], "write_register"]
+
 [[constraints.logup]]
 kind = "interaction"
 tag = "CPU32"
@@ -360,6 +375,11 @@ multiplicity = ["-", "μ"]
 [[constraint_groups]]
 name = "ext"
 
+[[constraints.ext]]
+kind = "arith"
+constraint = "$#`signed` != 0 => #`μ` = 1$"
+poly = ["*", "signed", ["not", "μ"]]
+
 [[constraints.ext]]
 kind = "interaction"
 tag = "BYTE_ALU"
@@ -402,7 +422,7 @@ poly = ["-", ["idx", "arg2", 1], ["*", ["-", ["^", 2, 32], 1], "rv2_sign"], ["id
 [[constraints.ext]]
 kind = "template"
 tag = "SIGN"
-input = [["idx", "res", 1], 1]
+input = [["idx", "res", 1], "μ"]
 output = "res_sign"
 
 [[constraints.ext]]

From 4b60eac5930aa571f8528a9adb85a04c6ff2ce33 Mon Sep 17 00:00:00 2001
From: MauroFab <maurotoscano2@gmail.com>
Date: Fri, 12 Jun 2026 12:19:05 -0300
Subject: [PATCH 105/105] Fix markdown spec extraction

---
 scripts/extract_and_convert_spec.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/extract_and_convert_spec.sh b/scripts/extract_and_convert_spec.sh
index ee3ac62de..d70daf809 100755
--- a/scripts/extract_and_convert_spec.sh
+++ b/scripts/extract_and_convert_spec.sh
@@ -29,7 +29,7 @@ git show "$BRANCH:spec/src/config.toml" > "$TEMP_DIR/src/config.toml" 2>/dev/nul
 }
 
 # Extract all chip TOML files
-for file in $(git ls-tree -r "$BRANCH" --name-only | grep '^spec/src/.*\.toml$' | grep -v config.toml | grep -v page.toml); do
+for file in $(git ls-tree -r "$BRANCH" --name-only | grep '^spec/src/.*\.toml$' | grep -v config.toml); do
     filename=$(basename "$file")
     git show "$BRANCH:$file" > "$TEMP_DIR/src/$filename" 2>/dev/null || true
 done