From 28b56ce7441c97b03d146ec3ab45f32a42348d40 Mon Sep 17 00:00:00 2001
From: Joseph Mattiello <git@joemattiello.com>
Date: Fri, 1 May 2026 23:12:07 -0400
Subject: [PATCH 1/6] perf: add reusable perf_counters.h instrumentation system

Generalizes the ad-hoc BLITTER_PROFILE pattern into a reusable,
zero-overhead-when-off counter system any subsystem can use.

* src/core/perf_counters.{h,c} - PERF_COUNTER / PERF_INC / PERF_ADD
  macros backed by a constructor-registered linked list.  When
  BENCH_PROFILE is undefined every macro expands to (0) so there is
  no runtime, code-size, or symbol cost in shipped builds.

* src/tom/blitter.c - migrate the existing BLITTER_PROFILE counters
  in BlitterMidsummer2 onto the new system.  Counters are embedded in
  existing initializers via the comma operator so the file stays
  C89-clean (no statements before declarations).

* Makefile - `make BENCH_PROFILE=1` defines the macro globally.
  `make benchmark BENCH_PROFILE=1` re-invokes with TEST_EXPORTS=1 so
  test_benchmark can dlsym `perf_counters_dump` and print all
  registered counters next to the FPS report.

* test/tools/test_benchmark.c - dlsym the optional dump symbol; if
  present (BENCH_PROFILE build), call it right after the BENCHMARK
  RESULTS block is printed.  No effect on default builds.

* exports-test.list / link-test.T - add perf_counters_{dump,reset,
  register} so harnesses can reach them under TEST_EXPORTS=1.

* scripts/profile-mac.sh - one-line wrapper around `xctrace record`.
  Defaults to Time Profiler; --template "CPU Counters" for PMU
  events on Apple Silicon.  Builds the core + harness, runs the
  benchmark under instrumentation, writes a .trace bundle, and can
  auto-open it with --open.

* docs/profiling.md - new sections covering BENCH_PROFILE counters
  (when to use vs sampling profilers) and the profile-mac.sh wrapper.

Validated end-to-end with `make benchmark BENCH_PROFILE=1
BENCH_BLITTER=accurate`: counters populate
(blitter_inner=283994 over 120 frames of yarc), and default builds
remain unchanged.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 Makefile                    | 26 +++++++++---
 Makefile.common             |  1 +
 docs/profiling.md           | 63 ++++++++++++++++++++++++++-
 exports-test.list           |  3 ++
 link-test.T                 |  3 ++
 scripts/profile-mac.sh      | 81 +++++++++++++++++++++++++++++++++++
 src/core/perf_counters.c    | 41 ++++++++++++++++++
 src/core/perf_counters.h    | 85 +++++++++++++++++++++++++++++++++++++
 src/tom/blitter.c           | 48 +++++++++++++++++++--
 test/tools/test_benchmark.c |  8 ++++
 10 files changed, 349 insertions(+), 10 deletions(-)
 create mode 100755 scripts/profile-mac.sh
 create mode 100644 src/core/perf_counters.c
 create mode 100644 src/core/perf_counters.h

diff --git a/Makefile b/Makefile
index 064b5e16..2db5a7d2 100644
--- a/Makefile
+++ b/Makefile
@@ -53,6 +53,13 @@ ifeq ($(DEBUG),1)
    CFLAGS += -DBUILD_TIMESTAMP="\"debug $(shell date -u +%Y-%m-%dT%H:%M:%SZ)\""
 endif
 
+# Opt-in instrumentation counters (src/core/perf_counters.h).
+# `make BENCH_PROFILE=1` defines the macro so PERF_COUNTER/PERF_INC
+# emit real code; otherwise every counter macro is a no-op.
+ifeq ($(BENCH_PROFILE),1)
+   CFLAGS += -DBENCH_PROFILE
+endif
+
 # Symbol export gating.
 #
 #   GNU ld (Linux, Windows MSYS2, ARM, ...) honours --version-script:
@@ -869,12 +876,19 @@ BENCH_ROM     ?= test/roms/yarc.j64
 BENCH_FRAMES  ?= 600
 BENCH_WARMUP  ?= 60
 BENCH_BLITTER ?= fast
-benchmark: $(TARGET)
-	@# Build the harness inline so this works whether or not TEST_EXPORTS=1
-	@# was used for $(TARGET); the harness only uses retro_* exports.
-	@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc
-	@# (and Apple's clang silently accepts -ldl as a no-op, but other
-	@# linkers may not).
+# BENCH_PROFILE=1 enables src/core/perf_counters.h instrumentation and
+# wide-export ABI so test_benchmark can dlsym `perf_counters_dump`.
+ifeq ($(BENCH_PROFILE),1)
+BENCH_TEST_EXPORTS := TEST_EXPORTS=1
+else
+BENCH_TEST_EXPORTS :=
+endif
+benchmark:
+	@# Re-invoke make so BENCH_PROFILE / TEST_EXPORTS take effect on the .so/.dylib.
+	$(MAKE) $(BENCH_TEST_EXPORTS) BENCH_PROFILE=$(BENCH_PROFILE) -j$(shell getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)
+	@# Build the harness inline; it dlopens the core, so it only needs the retro_* ABI
+	@# (plus the optional perf_counters_dump symbol when BENCH_PROFILE=1).
+	@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc.
 	$(CC) -O2 -Wall -std=c99 $(INCFLAGS) \
 		-o test/tools/test_benchmark test/tools/test_benchmark.c \
 		$(if $(filter Linux,$(shell uname -s)),-ldl)
diff --git a/Makefile.common b/Makefile.common
index 091677fa..21c8a521 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -33,6 +33,7 @@ SOURCES_C :=  \
 	$(CORE_DIR)/src/cd/cdrom.c \
 	$(CORE_DIR)/src/core/cheat.c \
 	$(CORE_DIR)/src/core/crc32.c \
+	$(CORE_DIR)/src/core/perf_counters.c \
 	$(CORE_DIR)/src/core/event.c \
 	$(CORE_DIR)/src/jerry/eeprom.c \
 	$(CORE_DIR)/src/core/filedb.c \
diff --git a/docs/profiling.md b/docs/profiling.md
index 5c2dbd20..924371fb 100644
--- a/docs/profiling.md
+++ b/docs/profiling.md
@@ -23,11 +23,21 @@ Reports `Frames/sec`, `Time/frame`, total wall time.  Boots the core via `dlopen
 
 **Instruments (Time Profiler)** is the easiest way to get a flame graph on macOS.
 
+The wrapper at `scripts/profile-mac.sh` builds the core, runs the benchmark
+under `xctrace`, and writes a `.trace` bundle you can open in Instruments:
+
+```bash
+scripts/profile-mac.sh                                    # default: Time Profiler, accurate blitter
+scripts/profile-mac.sh --template "CPU Counters"          # PMU: cycles, instructions, branch misses
+scripts/profile-mac.sh --rom test/roms/yarc.j64 --open    # auto-open the trace
+```
+
+Manual invocation if you'd rather attach to a running process:
+
 ```bash
 make benchmark BENCH_FRAMES=6000 BENCH_WARMUP=120 &
 BENCH_PID=$!
 
-# Sample for 30 seconds, output to .trace bundle
 xcrun xctrace record --template "Time Profiler" --attach $BENCH_PID --output bench.trace --time-limit 30s
 open bench.trace
 ```
@@ -41,6 +51,57 @@ sample $BENCH_PID 5 -file /tmp/sample.txt
 # 5-second sample.  Read /tmp/sample.txt for collapsed call stacks.
 ```
 
+## Bespoke counters — `BENCH_PROFILE=1`
+
+Sampling profilers tell you *where* time goes; counters tell you *how often*
+something happens.  When you want exact iteration counts (e.g., "did my
+fast-path actually skip the inner loop?"), use the `perf_counters` system in
+`src/core/perf_counters.h`.
+
+```bash
+make benchmark BENCH_PROFILE=1 BENCH_BLITTER=accurate BENCH_FRAMES=300
+# ...
+# [perf] counter dump:
+# [perf]   blitter_phrase_writes                    3034993
+# [perf]   blitter_phrase_reads                     931821
+# [perf]   blitter_inner_io                         3966814
+# [perf]   blitter_inner                            4131151
+# [perf]   blitter_outer                            337722
+# [perf]   blitter_calls                            131628
+```
+
+The macros are zero-overhead when `BENCH_PROFILE` is undefined (default
+build) — `PERF_INC` becomes `(0)`, `PERF_ADD` becomes `((void)(n), 0)`,
+and `PERF_COUNTER` becomes an unused typedef.  Use them freely in hot
+paths to instrument hypotheses.
+
+Adding a counter:
+
+```c
+#include "perf_counters.h"
+
+PERF_COUNTER(my_event);             /* file scope */
+
+void hot(void) {
+    PERF_INC(my_event);             /* in-loop */
+    PERF_ADD(my_event, n);          /* batch */
+}
+```
+
+The harness (`test/tools/test_benchmark.c`) calls
+`perf_counters_dump(stderr)` once the run finishes, right after it
+prints the `BENCHMARK RESULTS` block (results: stdout, counters: stderr).
+
+When to reach for this vs. Time Profiler:
+
+| Question | Tool |
+|---|---|
+| "Where are we spending cycles?" | `xctrace` Time Profiler |
+| "How many times does the inner loop run per frame?" | `BENCH_PROFILE=1` |
+| "What fraction of inner iterations are no-ops?" | `BENCH_PROFILE=1` |
+| "Are we hitting L1 / branch-mispredicting?" | `xctrace` CPU Counters |
+| "Did this optimization change behavior, not just timing?" | `BENCH_PROFILE=1` (deltas in counts) |
+
 ## Linux — `perf` + flamegraph
 
 ```bash
diff --git a/exports-test.list b/exports-test.list
index 0b3fbff2..3ebeab6f 100644
--- a/exports-test.list
+++ b/exports-test.list
@@ -33,3 +33,6 @@ _sclk
 _smode
 _lowerField
 _vjs
+_perf_counters_dump
+_perf_counters_reset
+_perf_counters_register
diff --git a/link-test.T b/link-test.T
index 9a52e19a..4b57c7e4 100644
--- a/link-test.T
+++ b/link-test.T
@@ -36,5 +36,8 @@
       smode;
       lowerField;
       vjs;
+      perf_counters_dump;
+      perf_counters_reset;
+      perf_counters_register;
    local: *;
 };
diff --git a/scripts/profile-mac.sh b/scripts/profile-mac.sh
new file mode 100755
index 00000000..26c9f400
--- /dev/null
+++ b/scripts/profile-mac.sh
@@ -0,0 +1,81 @@
+#!/usr/bin/env bash
+#
+# profile-mac.sh -- Run test_benchmark under Xcode Instruments on Apple Silicon
+# (or any Mac with Xcode CLT).
+#
+# Usage:
+#   scripts/profile-mac.sh [--template NAME] [--frames N] [--warmup N]
+#                          [--blitter fast|accurate] [--rom PATH] [--open]
+#
+# Defaults:
+#   template = "Time Profiler"
+#   frames   = 600  warmup = 60  blitter = accurate
+#   rom      = test/roms/yarc.j64
+#   --open   = open the .trace bundle in Instruments when finished
+#
+# Common templates:
+#   "Time Profiler"   -- where time is being spent (call tree / flame)
+#   "CPU Counters"    -- Apple Silicon PMU (cycles, instr, branches, misses)
+#   "System Trace"    -- syscalls, scheduler, VM events
+#
+set -euo pipefail
+
+TEMPLATE="Time Profiler"
+FRAMES=600
+WARMUP=60
+BLITTER=accurate
+ROM="test/roms/yarc.j64"
+OPEN_TRACE=0
+
+while [ $# -gt 0 ]; do
+   case "$1" in
+      --template) TEMPLATE="$2"; shift 2 ;;
+      --frames)   FRAMES="$2"; shift 2 ;;
+      --warmup)   WARMUP="$2"; shift 2 ;;
+      --blitter)  BLITTER="$2"; shift 2 ;;
+      --rom)      ROM="$2"; shift 2 ;;
+      --open)     OPEN_TRACE=1; shift ;;
+      -h|--help)
+         sed -n '2,20p' "$0"
+         exit 0 ;;
+      *)
+         echo "Unknown arg: $1" >&2
+         exit 2 ;;
+   esac
+done
+
+if ! command -v xctrace >/dev/null 2>&1; then
+   echo "xctrace not found. Install Xcode Command Line Tools: xcode-select --install" >&2
+   exit 1
+fi
+
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+mkdir -p build
+TRACE="build/profile-$(date +%Y%m%d-%H%M%S).trace"
+
+# Make sure the core + harness are built (no BENCH_PROFILE; profiling
+# instrumentation skews sampling results).
+make -j"$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)" >/dev/null
+cc -O2 -Wall -std=c99 -I. -I./libretro-common/include \
+   -o test/tools/test_benchmark test/tools/test_benchmark.c
+
+CORE="./virtualjaguar_libretro.dylib"
+HARNESS="./test/tools/test_benchmark"
+
+echo ">>> xctrace template:   $TEMPLATE"
+echo ">>> trace output:       $TRACE"
+echo ">>> rom / blitter:      $ROM / $BLITTER"
+echo ">>> frames (+warmup):   $FRAMES (+$WARMUP)"
+
+xctrace record \
+   --template "$TEMPLATE" \
+   --output "$TRACE" \
+   --launch -- "$HARNESS" "$CORE" "$ROM" "$FRAMES" \
+                          --warmup "$WARMUP" --blitter "$BLITTER"
+
+echo ">>> trace written to $TRACE"
+if [ "$OPEN_TRACE" = "1" ]; then
+   open "$TRACE"
+fi
diff --git a/src/core/perf_counters.c b/src/core/perf_counters.c
new file mode 100644
index 00000000..f73d7f38
--- /dev/null
+++ b/src/core/perf_counters.c
@@ -0,0 +1,41 @@
+/*
+ * perf_counters.c - registry + dump for opt-in instrumentation counters.
+ * Only compiled into the program when BENCH_PROFILE is defined; the header
+ * provides no-op stubs otherwise.
+ */
+#include "perf_counters.h"
+
+#ifdef BENCH_PROFILE
+
+static perf_counter_entry_t *perf_head = (perf_counter_entry_t *)0;
+
+void perf_counters_register(perf_counter_entry_t *entry)
+{
+   if (!entry || entry->next)
+      return; /* already linked */
+   entry->next = perf_head;
+   perf_head = entry;
+}
+
+void perf_counters_reset(void)
+{
+   perf_counter_entry_t *e;
+   for (e = perf_head; e; e = e->next)
+      *e->value = 0;
+}
+
+void perf_counters_dump(FILE *out)
+{
+   perf_counter_entry_t *e;
+   if (!out)
+      out = stderr;
+   if (!perf_head) {
+      fprintf(out, "[perf] no counters registered\n");
+      return;
+   }
+   fprintf(out, "[perf] counter dump:\n");
+   for (e = perf_head; e; e = e->next)
+      fprintf(out, "[perf]   %-40s %llu\n", e->name, *e->value);
+}
+
+#endif /* BENCH_PROFILE */
diff --git a/src/core/perf_counters.h b/src/core/perf_counters.h
new file mode 100644
index 00000000..1d6abc02
--- /dev/null
+++ b/src/core/perf_counters.h
@@ -0,0 +1,85 @@
+/*
+ * perf_counters.h - lightweight, opt-in instrumentation counters.
+ *
+ * Define BENCH_PROFILE at compile time to enable. Otherwise PERF_INC/PERF_ADD
+ * yield the int constant 0 with no runtime, code-size, or symbol cost.
+ *
+ * Usage:
+ *
+ *   #include "perf_counters.h"
+ *
+ *   PERF_COUNTER(blitter_inner);
+ *   PERF_COUNTER(blitter_phrase_reads);
+ *
+ *   void hot(void) {
+ *       PERF_INC(blitter_inner);
+ *       PERF_ADD(blitter_phrase_reads, 2);
+ *   }
+ *
+ *   // Somewhere at shutdown (e.g., test harness atexit):
+ *   perf_counters_dump(stderr);
+ *
+ * Counters self-register via constructor functions, so PERF_COUNTER must
+ * appear at file scope. Only one definition per name across the program.
+ *
+ * C89-clean. No designated initializers, no mid-block declarations.
+ */
+#ifndef VJ_PERF_COUNTERS_H
+#define VJ_PERF_COUNTERS_H
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef BENCH_PROFILE
+
+typedef struct perf_counter_entry
+{
+   const char *name;
+   unsigned long long *value;
+   struct perf_counter_entry *next;
+} perf_counter_entry_t;
+
+void perf_counters_register(perf_counter_entry_t *entry);
+void perf_counters_dump(FILE *out);
+void perf_counters_reset(void);
+
+#define PERF_COUNTER(name) \
+   static unsigned long long perf_##name = 0; \
+   static perf_counter_entry_t perf_entry_##name = \
+      { #name, &perf_##name, (perf_counter_entry_t *)0 }; \
+   __attribute__((constructor)) \
+   static void perf_register_##name(void) { \
+      perf_counters_register(&perf_entry_##name); \
+   } \
+   typedef int perf_##name##_decl_semicolon_eater
+
+/* PERF_INC / PERF_ADD are expressions of integer type (not statements),
+ * so they can be embedded in declaration initializers via the comma
+ * operator without violating C89's no-decl-after-statement rule:
+ *   uint32_t cmd = (PERF_INC(my_event), real_value());
+ */
+#define PERF_INC(name)    (++perf_##name)
+#define PERF_ADD(name, n) (perf_##name += (unsigned long long)(n))
+
+#else /* !BENCH_PROFILE */
+
+#define PERF_COUNTER(name) typedef int perf_##name##_unused
+/* No-op forms remain expressions of integer type (not void) so callers
+ * can use them inside comma operators without code changes. */
+#define PERF_INC(name)        (0)
+#define PERF_ADD(name, n)     ((void)(n), 0)
+
+/* Stubs so callers don't need their own #ifdef around dump/reset. */
+static __inline void perf_counters_dump(FILE *out)  { (void)out; }
+static __inline void perf_counters_reset(void)      { }
+
+#endif /* BENCH_PROFILE */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* VJ_PERF_COUNTERS_H */
diff --git a/src/tom/blitter.c b/src/tom/blitter.c
index ac40d94e..f714b683 100644
--- a/src/tom/blitter.c
+++ b/src/tom/blitter.c
@@ -26,6 +26,7 @@
 
 #include <string.h>
 #include "jaguar.h"
+#include "perf_counters.h"
 #include "state.h"
 
 // Various conditional compilation goodies...
@@ -44,6 +45,14 @@ uint8_t blitter_ram[0x100];
 void BlitterMidsummer(uint32_t cmd);
 void BlitterMidsummer2(void);
 
+PERF_COUNTER(blitter_calls);
+PERF_COUNTER(blitter_outer);
+PERF_COUNTER(blitter_inner);
+PERF_COUNTER(blitter_inner_io);
+PERF_COUNTER(blitter_inner_idle);
+PERF_COUNTER(blitter_phrase_reads);
+PERF_COUNTER(blitter_phrase_writes);
+
 #define REG(A)	(((uint32_t)blitter_ram[(A)] << 24) | ((uint32_t)blitter_ram[(A)+1] << 16) \
 				| ((uint32_t)blitter_ram[(A)+2] << 8) | (uint32_t)blitter_ram[(A)+3])
 #define WREG(A,D)	(blitter_ram[(A)] = ((D)>>24)&0xFF, blitter_ram[(A)+1] = ((D)>>16)&0xFF, \
@@ -987,7 +996,7 @@ void COMP_CTRL(uint8_t *dbinh, bool *nowrite,
 
 void BlitterMidsummer2(void)
 {
-   uint32_t cmd = GET32(blitter_ram, COMMAND);
+   uint32_t cmd = (PERF_INC(blitter_calls), GET32(blitter_ram, COMMAND));
 
 
    // Line states passed in via the command register
@@ -1105,6 +1114,7 @@ void BlitterMidsummer2(void)
 
    while (true)
    {
+      PERF_INC(blitter_outer);
       // IDLE
 
       if ((idle && !go) || (inner && outer0 && indone))
@@ -1282,7 +1292,12 @@ void BlitterMidsummer2(void)
 
          while (true)
          {
-            uint16_t dstxwr, pseq;
+#ifdef BENCH_PROFILE
+            int blitter_did_io = 0;
+#endif
+            /* PERF_INC embedded via comma operator to keep C89 decl
+             * order valid (no statements before declarations).  */
+            uint16_t dstxwr = (PERF_INC(blitter_inner), 0), pseq;
             bool penden;
             uint8_t window_mask;
             uint8_t inner_mask = 0;
@@ -1498,6 +1513,10 @@ A2ptrldi	:= NAN2 (a2ptrldi, a2update\, a2pldt);*/
 
             if (sreadx)
             {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
                //uint32_t srcAddr, pixAddr;
                //ADDRGEN(srcAddr, pixAddr, gena2i, zaddr,
                //	a1_x, a1_y, a1_base, a1_pitch, a1_pixsize, a1_width, a1_zoffset,
@@ -1532,6 +1551,10 @@ A2ptrldi	:= NAN2 (a2ptrldi, a2update\, a2pldt);*/
 
             if (sread)
             {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
                srcd2 = srcd1;
                srcd1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
                //Kludge to take pixel size into account...
@@ -1553,6 +1576,10 @@ A2ptrldi	:= NAN2 (a2ptrldi, a2update\, a2pldt);*/
 
             if (szread)
             {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
                srcz2 = srcz1;
                srcz1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
                //Kludge to take pixel size into account... I believe that it only has to take 16BPP mode into account. Not sure tho.
@@ -1563,6 +1590,10 @@ A2ptrldi	:= NAN2 (a2ptrldi, a2update\, a2pldt);*/
 
             if (dread)
             {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
                dstd = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
                //Kludge to take pixel size into account...
                if (!phrase_mode)
@@ -1591,9 +1622,12 @@ A2ptrldi	:= NAN2 (a2ptrldi, a2update\, a2pldt);*/
             //NOTE: SRCSHADE requires GOURZ to be set to work properly--another Jaguar I bug
             if (dwrite)
             {
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
                //Counter is done on the dwrite state...! (We'll do it first, since it affects dstart/dend calculations.)
                //Here's the voodoo for figuring the correct amount of pixels in phrase mode (or not):
-               int8_t inct = -((dsta2 ? a2_x : a1_x) & 0x07);	// From INNER_CNT
+               int8_t inct = (PERF_INC(blitter_phrase_writes), -((dsta2 ? a2_x : a1_x) & 0x07));	// From INNER_CNT
                uint8_t inc = 0;
                uint16_t oldicount;
                uint8_t dstart = 0;
@@ -1836,6 +1870,10 @@ A1_outside	:= OR6 (a1_outside, a1_x{15}, a1xgr, a1xeq, a1_y{15}, a1ygr, a1yeq);
 
             if (dzwrite)
             {
+               PERF_INC(blitter_phrase_writes);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
                // OK, here's the big insight: When NOT in GOURZ mode, srcz1 & 2 function EXACTLY the same way that
                // srcd1 & 2 work--there's an implicit shift from srcz1 to srcz2 whenever srcz1 is read.
                // OTHERWISE, srcz1 is the integer for the computed Z and srcz2 is the fractional part.
@@ -1910,6 +1948,10 @@ A1_outside	:= OR6 (a1_outside, a1_x{15}, a1xgr, a1xeq, a1_y{15}, a1ygr, a1yeq);
                a2_x = addq_x;
                a2_y = addq_y;
             }
+#ifdef BENCH_PROFILE
+            if (blitter_did_io) PERF_INC(blitter_inner_io);
+            else                PERF_INC(blitter_inner_idle);
+#endif
          }
 
          indone = true;
diff --git a/test/tools/test_benchmark.c b/test/tools/test_benchmark.c
index 2ce00cb8..fc674746 100644
--- a/test/tools/test_benchmark.c
+++ b/test/tools/test_benchmark.c
@@ -36,6 +36,8 @@ static void *(*pretro_get_memory_data)(unsigned);
 static size_t (*pretro_get_memory_size)(unsigned);
 static size_t (*pretro_serialize_size)(void);
 static bool (*pretro_unserialize)(const void *, size_t);
+/* Optional: only present when the core was built with BENCH_PROFILE=1. */
+static void (*pperf_counters_dump)(FILE *);
 
 /* Options state */
 static int bios_option_set = 0;
@@ -310,6 +312,9 @@ int main(int argc, char **argv)
    LOAD_SYM(retro_serialize_size);
    LOAD_SYM(retro_unserialize);
 
+   /* Optional perf-counter dump; absent unless built with BENCH_PROFILE=1. */
+   pperf_counters_dump = dlsym(handle, "perf_counters_dump");
+
    pretro_set_environment(environment_cb);
    pretro_set_video_refresh(video_refresh);
    pretro_set_audio_sample(audio_sample);
@@ -511,6 +516,9 @@ int main(int argc, char **argv)
    printf("Time/frame:      %.3f ms\n", ms_per_frame);
    printf("=========================\n");
 
+   if (pperf_counters_dump)
+      pperf_counters_dump(stderr);
+
    pretro_unload_game();
    pretro_deinit();
    free((void *)info.data);

From d124ed921f8f7e2abd18dc57651abca51725124f Mon Sep 17 00:00:00 2001
From: Joseph Mattiello <git@joemattiello.com>
Date: Sat, 2 May 2026 15:03:07 -0400
Subject: [PATCH 2/6] perf_counters: always-export symbols + benchmark
 BENCH_STATE knob

Two small follow-ups to the perf_counters baseline:

* Move the perf_counters_register/dump/reset declarations out of
  the BENCH_PROFILE ifdef in the header, and define the functions
  unconditionally in perf_counters.c.  This lets the test ABI export
  the symbols unconditionally (they are already listed in
  exports-test.list / link-test.T) so test_benchmark can dlsym them
  in any build flavor; in non-BENCH_PROFILE builds the bodies are
  no-ops.

* Makefile benchmark target gains a BENCH_STATE knob:
    make benchmark BENCH_ROM=foo.j64 BENCH_STATE=foo.state6
  Plumbed through to test_benchmark --load-state, which already
  supports both raw retro_serialize payloads and RetroArch RASTATE
  containers (lands via PR #128).  Lets us profile actual gameplay
  scenes instead of boot/menu idle.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 Makefile                 |  3 ++-
 src/core/perf_counters.c | 21 ++++++++++++++++-----
 src/core/perf_counters.h | 11 ++++++-----
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/Makefile b/Makefile
index 2db5a7d2..91d8670e 100644
--- a/Makefile
+++ b/Makefile
@@ -893,7 +893,8 @@ benchmark:
 		-o test/tools/test_benchmark test/tools/test_benchmark.c \
 		$(if $(filter Linux,$(shell uname -s)),-ldl)
 	./test/tools/test_benchmark ./$(TARGET) "$(BENCH_ROM)" $(BENCH_FRAMES) \
-		--warmup $(BENCH_WARMUP) --blitter $(BENCH_BLITTER)
+		--warmup $(BENCH_WARMUP) --blitter $(BENCH_BLITTER) \
+		$(if $(BENCH_STATE),--load-state "$(BENCH_STATE)")
 
 print-%:
 	@echo '$*=$($*)'
diff --git a/src/core/perf_counters.c b/src/core/perf_counters.c
index f73d7f38..ea021881 100644
--- a/src/core/perf_counters.c
+++ b/src/core/perf_counters.c
@@ -1,31 +1,41 @@
 /*
  * perf_counters.c - registry + dump for opt-in instrumentation counters.
- * Only compiled into the program when BENCH_PROFILE is defined; the header
- * provides no-op stubs otherwise.
+ *
+ * The register/dump/reset functions are *always* defined so they can be
+ * exported through the test ABI without conditional linker scripts.
+ * In !BENCH_PROFILE builds the bodies are no-ops and no PERF_COUNTER
+ * calls perf_counters_register, so the registry stays empty.
  */
 #include "perf_counters.h"
 
 #ifdef BENCH_PROFILE
-
 static perf_counter_entry_t *perf_head = (perf_counter_entry_t *)0;
+#endif
 
 void perf_counters_register(perf_counter_entry_t *entry)
 {
+#ifdef BENCH_PROFILE
    if (!entry || entry->next)
       return; /* already linked */
    entry->next = perf_head;
    perf_head = entry;
+#else
+   (void)entry;
+#endif
 }
 
 void perf_counters_reset(void)
 {
+#ifdef BENCH_PROFILE
    perf_counter_entry_t *e;
    for (e = perf_head; e; e = e->next)
       *e->value = 0;
+#endif
 }
 
 void perf_counters_dump(FILE *out)
 {
+#ifdef BENCH_PROFILE
    perf_counter_entry_t *e;
    if (!out)
       out = stderr;
@@ -36,6 +46,7 @@ void perf_counters_dump(FILE *out)
    fprintf(out, "[perf] counter dump:\n");
    for (e = perf_head; e; e = e->next)
       fprintf(out, "[perf]   %-40s %llu\n", e->name, *e->value);
+#else
+   (void)out;
+#endif
 }
-
-#endif /* BENCH_PROFILE */
diff --git a/src/core/perf_counters.h b/src/core/perf_counters.h
index 1d6abc02..c0d56085 100644
--- a/src/core/perf_counters.h
+++ b/src/core/perf_counters.h
@@ -33,7 +33,10 @@
 extern "C" {
 #endif
 
-#ifdef BENCH_PROFILE
+/* Registry types and entry points are *always* declared so the test
+ * ABI can export them unconditionally.  When BENCH_PROFILE is undefined
+ * the bodies (in perf_counters.c) become no-ops and no PERF_COUNTER
+ * macro registers anything, so the registry stays empty. */
 
 typedef struct perf_counter_entry
 {
@@ -46,6 +49,8 @@ void perf_counters_register(perf_counter_entry_t *entry);
 void perf_counters_dump(FILE *out);
 void perf_counters_reset(void);
 
+#ifdef BENCH_PROFILE
+
 #define PERF_COUNTER(name) \
    static unsigned long long perf_##name = 0; \
    static perf_counter_entry_t perf_entry_##name = \
@@ -72,10 +77,6 @@ void perf_counters_reset(void);
 #define PERF_INC(name)        (0)
 #define PERF_ADD(name, n)     ((void)(n), 0)
 
-/* Stubs so callers don't need their own #ifdef around dump/reset. */
-static __inline void perf_counters_dump(FILE *out)  { (void)out; }
-static __inline void perf_counters_reset(void)      { }
-
 #endif /* BENCH_PROFILE */
 
 #ifdef __cplusplus

From 930bcca2a9128aa859bc986c3ec7f61858248fc3 Mon Sep 17 00:00:00 2001
From: Joseph Mattiello <git@joemattiello.com>
Date: Sat, 2 May 2026 15:03:28 -0400
Subject: [PATCH 3/6] perf(blitter): inline ADDARRAY / ADD16SAT / DATA /
 COMP_CTRL hot path

Profile data on AvP gameplay (state6, accurate blitter) showed
ADDARRAY as the single largest leaf in the entire emulator at
1910 sample-of-stack hits, with DATA (759) and COMP_CTRL (318)
not far behind.  All four are called from the BlitterMidsummer2
inner loop only, and most call sites pass compile-time-constant
flags for daddasel/daddbsel/daddmode/sat/eightbit/hicinh/etc --
ideal candidates for per-call-site specialisation through the
compiler if the bodies become visible at the call site.

This commit moves the four definitions above BlitterMidsummer2
(in the order ADD16SAT -> ADDARRAY -> COMP_CTRL -> DATA so each
sees its dependencies) and marks them
`static INLINE __attribute__((always_inline))`.  No body changes;
this is purely a re-arrangement so the compiler can do dead-arm
elimination and constant propagation across the call boundary.

Removed the matching extern forward declarations now that the
definitions provide the prototype.

Measured (Apple M-series, headless `make benchmark` against the
private AvP ROM with state6 loaded, accurate blitter, 600 frames
after 60 warmup, 3-run median):

      BlitterMidsummer2 + callees, sample-of-stack
        before:  ~5268 (BM2 2281, ADDARRAY 1910, DATA 759, COMP_CTRL 318)
        after:   ~4592 (BM2 absorbs the four inlinees)

      AvP accurate FPS
        baseline:    173-176
        +ADDARRAY:   192-195
        +DATA+COMP:  198-201   (~+15% net)

Fast-blitter perf unchanged (within ~3% run-to-run noise).
test_blitter_compare and the rest of `make test` pass.

Bit-exactness preserved: the function bodies are byte-for-byte
identical to the originals, only their linkage and source-file
position changed.

Addresses real-world AvP-on-RetroArch slowdown / audio-dropout
report on Apple Silicon, where the extra ~25 FPS recovers enough
budget for presentation + audio mixing to fit in 16.6 ms.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 src/tom/blitter.c | 3472 +++++++++++++++++++++++----------------------
 1 file changed, 1738 insertions(+), 1734 deletions(-)

diff --git a/src/tom/blitter.c b/src/tom/blitter.c
index f714b683..e7732bff 100644
--- a/src/tom/blitter.c
+++ b/src/tom/blitter.c
@@ -969,11 +969,11 @@ void blitter_blit(uint32_t cmd)
 void ADDRGEN(uint32_t *, uint32_t *, bool, bool,
 	uint16_t, uint16_t, uint32_t, uint8_t, uint8_t, uint8_t, uint8_t,
 	uint16_t, uint16_t, uint32_t, uint8_t, uint8_t, uint8_t, uint8_t);
-void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode,
-	uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix,
-	uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2,
-	uint32_t zinc, uint32_t zstep);
-void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, uint16_t b, uint8_t cin, bool sat, bool eightbit, bool hicinh);
+/* ADD16SAT / ADDARRAY are defined inline below so the compiler can
+ * specialise per call-site (most callers pass compile-time constants
+ * for daddasel/daddbsel/daddmode and the sat/eightbit/hicinh flags).
+ * Profile data on AvP gameplay shows ADDARRAY as the single largest
+ * leaf in the entire emulator, called millions of times per frame. */
 void ADDAMUX(int16_t *adda_x, int16_t *adda_y, uint8_t addasel, int16_t a1_step_x, int16_t a1_step_y,
 	int16_t a1_stepf_x, int16_t a1_stepf_y, int16_t a2_step_x, int16_t a2_step_y,
 	int16_t a1_inc_x, int16_t a1_inc_y, int16_t a1_incf_x, int16_t a1_incf_y, uint8_t adda_xconst,
@@ -983,1808 +983,2017 @@ void ADDBMUX(int16_t *addb_x, int16_t *addb_y, uint8_t addbsel, int16_t a1_x, in
 void DATAMUX(int16_t *data_x, int16_t *data_y, uint32_t gpu_din, int16_t addq_x, int16_t addq_y, bool addqsel);
 void ADDRADD(int16_t *addq_x, int16_t *addq_y, bool a1fracldi,
 	uint16_t adda_x, uint16_t adda_y, uint16_t addb_x, uint16_t addb_y, uint8_t modx, bool suba_x, bool suba_y);
-void DATA(uint64_t *wdata, uint8_t *dcomp, uint8_t *zcomp, bool *nowrite,
-	bool big_pix, bool cmpdst, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode, bool daddq_sel, uint8_t data_sel,
-	uint8_t dbinh, uint8_t dend, uint8_t dstart, uint64_t dstd, uint32_t iinc, uint8_t lfu_func, uint64_t *patd, bool patdadd,
-	bool phrase_mode, uint64_t srcd, bool srcdread, bool srczread, bool srcz2add, uint8_t zmode,
-	bool bcompen, bool bkgwren, bool dcompen, uint8_t icount, uint8_t pixsize,
-	uint64_t *srcz, uint64_t dstz, uint32_t zinc);
-void COMP_CTRL(uint8_t *dbinh, bool *nowrite,
-	bool bcompen, bool big_pix, bool bkgwren, uint8_t dcomp, bool dcompen, uint8_t icount,
-	uint8_t pixsize, bool phrase_mode, uint8_t srcd, uint8_t zcomp);
-
-
-void BlitterMidsummer2(void)
+/* DATA + COMP_CTRL are defined inline below (above BlitterMidsummer2)
+ * so the compiler can specialise them per call.  Both are called
+ * exclusively from the BlitterMidsummer2 inner loop. */
+
+
+/* AvP-gameplay hot path: ADDARRAY at 1910 samples, ADD16SAT inlined
+ * inside it.  Inlined here so the compiler can specialise the 4
+ * call sites in BlitterMidsummer2 (compile-time daddasel/daddbsel/
+ * daddmode -> dead switch arms eliminated) and the call inside DATA
+ * (where the args are loop-invariant for the duration of a blit). */
+static INLINE __attribute__((always_inline))
+void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, uint16_t b,
+              uint8_t cin, bool sat, bool eightbit, bool hicinh)
 {
-   uint32_t cmd = (PERF_INC(blitter_calls), GET32(blitter_ram, COMMAND));
-
-
-   // Line states passed in via the command register
-
-   bool srcen = (SRCEN), srcenx = (SRCENX), srcenz = (SRCENZ),
-        dsten = (DSTEN), dstenz = (DSTENZ), dstwrz = (DSTWRZ), clip_a1 = (CLIPA1),
-        upda1 = (UPDA1), upda1f = (UPDA1F), upda2 = (UPDA2), dsta2 = (DSTA2),
-        gourd = (GOURD), gourz = (GOURZ), topben = (TOPBEN), topnen = (TOPNEN),
-        patdsel = (PATDSEL), adddsel = (ADDDSEL), cmpdst = (CMPDST), bcompen = (BCOMPEN),
-        dcompen = (DCOMPEN), bkgwren = (BKGWREN), srcshade = (SRCSHADE);
-
-   uint8_t zmode = (cmd & 0x01C0000) >> 18, lfufunc = (cmd & 0x1E00000) >> 21;
-   //Missing: BUSHI
-   //Where to find various lines:
-   // clip_a1  -> inner
-   // gourd    -> dcontrol, inner, outer, state
-   // gourz    -> dcontrol, inner, outer, state
-   // cmpdst   -> blit, data, datacomp, state
-   // bcompen  -> acontrol, inner, mcontrol, state
-   // dcompen  -> inner, state
-   // bkgwren  -> inner, state
-   // srcshade -> dcontrol, inner, state
-   // adddsel  -> dcontrol
-   //NOTE: ADDDSEL takes precedence over PATDSEL, PATDSEL over LFU_FUNC
+   uint8_t carry[4];
+   uint8_t btop, ctop;
+   bool saturate, hisaturate;
+   uint32_t qt   = (a & 0xFF) + (b & 0xFF) + cin;
+   uint16_t q    = qt & 0x00FF;
 
-   // Lines that don't exist in Jaguar I (and will never be asserted)
+   carry[0]      = ((qt & 0x0100) ? 1 : 0);
+   carry[1]      = (carry[0] && !eightbit ? carry[0] : 0);
+   qt            = (a & 0x0F00) + (b & 0x0F00) + (carry[1] << 8);
+   carry[2]      = ((qt & 0x1000) ? 1 : 0);
+   q            |= qt & 0x0F00;
+   carry[3]      = (carry[2] && !hicinh ? carry[2] : 0);
+   qt            = (a & 0xF000) + (b & 0xF000) + (carry[3] << 12);
+   *co            = ((qt & 0x10000) ? 1 : 0);
+   q            |= qt & 0xF000;
 
-   bool polygon = false, datinit = false, a1_stepld = false, a2_stepld = false, ext_int = false;
-   bool istepadd = false, istepfadd = false;
-   bool zstepfadd = false, zstepadd = false;
+   if (eightbit)
+   {
+      btop  = (b & 0x0080) >> 7;
+      ctop  = carry[0];
+   }
+   else
+   {
+      btop  = (b & 0x8000) >> 15;
+      ctop  = *co;
+   }
 
-   // Various state lines (initial state--basically the reset state of the FDSYNCs)
+   saturate = sat && (btop ^ ctop);
+   hisaturate = saturate && !eightbit;
 
-   bool go = true, idle = true, inner = false, a1fupdate = false, a1update = false,
-        zfupdate = false, zupdate = false, a2update = false, init_if = false, init_ii = false,
-        init_zf = false, init_zi = false;
+   *r = (saturate ? (ctop ? 0x00FF : 0x0000) : q & 0x00FF);
+   *r |= (hisaturate ? (ctop ? 0xFF00 : 0x0000) : q & 0xFF00);
+}
 
-   bool outer0 = false, indone = false;
+static INLINE __attribute__((always_inline))
+void ADDARRAY(uint16_t *addq, uint8_t daddasel, uint8_t daddbsel,
+              uint8_t daddmode, uint64_t dstd, uint32_t iinc,
+              uint8_t initcin[], uint64_t initinc, uint16_t initpix,
+              uint32_t istep, uint64_t patd, uint64_t srcd,
+              uint64_t srcz1, uint64_t srcz2, uint32_t zinc,
+              uint32_t zstep)
+{
+   unsigned i;
+   uint16_t adda[4];
+   uint16_t addb[4];
+   uint64_t adda_val;
+   uint32_t initpix2;
+   uint16_t word;
+   uint8_t cinsel;
+   static uint8_t co[4]; /* preserved between calls (hardware artifact) */
+   uint8_t cin[4];
+   bool eightbit;
+   bool sat, hicinh;
+   uint8_t bsel_idx;
 
-   bool idlei, inneri, a1fupdatei, a1updatei, zfupdatei, zupdatei, a2updatei, init_ifi, init_iii,
-        init_zfi, init_zii;
+   initpix2 = ((uint32_t)initpix << 16) | initpix;
 
-   bool notgzandp = !(gourz && polygon);
+   switch (daddasel)
+   {
+      case 0:  adda_val = dstd; break;
+      case 1:  adda_val = ((uint64_t)initpix2 << 32) | initpix2; break;
+      case 2:
+      case 3:  adda_val = 0; break;
+      case 4:  adda_val = srcd; break;
+      case 5:  adda_val = patd; break;
+      case 6:  adda_val = srcz1; break;
+      default: adda_val = srcz2; break;
+   }
+   adda[0] = (uint16_t)adda_val;
+   adda[1] = (uint16_t)(adda_val >> 16);
+   adda[2] = (uint16_t)(adda_val >> 32);
+   adda[3] = (uint16_t)(adda_val >> 48);
 
+   if (!(daddbsel & 0x04))
+   {
+      if (daddbsel & 0x01)
+      {
+         addb[0] = (uint16_t)initinc;
+         addb[1] = (uint16_t)(initinc >> 16);
+         addb[2] = (uint16_t)(initinc >> 32);
+         addb[3] = (uint16_t)(initinc >> 48);
+      }
+      else
+      {
+         addb[0] = (uint16_t)srcd;
+         addb[1] = (uint16_t)(srcd >> 16);
+         addb[2] = (uint16_t)(srcd >> 32);
+         addb[3] = (uint16_t)(srcd >> 48);
+      }
+   }
+   else
+   {
+      bsel_idx = ((daddbsel & 0x08) >> 1) | (daddbsel & 0x03);
+      switch (bsel_idx)
+      {
+         case 0: word = iinc & 0xFFFF; break;
+         case 1: word = iinc >> 16; break;
+         case 2: word = zinc & 0xFFFF; break;
+         case 3: word = zinc >> 16; break;
+         case 4: word = istep & 0xFFFF; break;
+         case 5: word = istep >> 16; break;
+         case 6: word = zstep & 0xFFFF; break;
+         default: word = zstep >> 16; break;
+      }
+      addb[0] = addb[1] = addb[2] = addb[3] = word;
+   }
 
-   // Various registers set up by user
+   cinsel = ((daddmode & 0x03) && !(daddmode & 0x04) ? 1 : 0);
 
-   uint16_t ocount = GET16(blitter_ram, PIXLINECOUNTER);
-   uint8_t a1_pitch = blitter_ram[A1_FLAGS + 3] & 0x03;
-   uint8_t a2_pitch = blitter_ram[A2_FLAGS + 3] & 0x03;
-   uint8_t a1_pixsize = (blitter_ram[A1_FLAGS + 3] & 0x38) >> 3;
-   uint8_t a2_pixsize = (blitter_ram[A2_FLAGS + 3] & 0x38) >> 3;
-   uint8_t a1_zoffset = (GET16(blitter_ram, A1_FLAGS + 2) >> 6) & 0x07;
-   uint8_t a2_zoffset = (GET16(blitter_ram, A2_FLAGS + 2) >> 6) & 0x07;
-   uint8_t a1_width = (blitter_ram[A1_FLAGS + 2] >> 1) & 0x3F;
-   uint8_t a2_width = (blitter_ram[A2_FLAGS + 2] >> 1) & 0x3F;
-   uint8_t a1addx = blitter_ram[A1_FLAGS + 1] & 0x03, a2addx = blitter_ram[A2_FLAGS + 1] & 0x03;
-   bool a1addy = blitter_ram[A1_FLAGS + 1] & 0x04, a2addy = blitter_ram[A2_FLAGS + 1] & 0x04;
-   bool a1xsign = blitter_ram[A1_FLAGS + 1] & 0x08, a2xsign = blitter_ram[A2_FLAGS + 1] & 0x08;
-   bool a1ysign = blitter_ram[A1_FLAGS + 1] & 0x10, a2ysign = blitter_ram[A2_FLAGS + 1] & 0x10;
-   uint32_t a1_base = GET32(blitter_ram, A1_BASE) & 0xFFFFFFF8;	// Phrase aligned by ignoring bottom 3 bits
-   uint32_t a2_base = GET32(blitter_ram, A2_BASE) & 0xFFFFFFF8;
+   for (i = 0; i < 4; i++)
+      cin[i] = initcin[i] | (co[i] & cinsel);
 
-   uint16_t a1_win_x = GET16(blitter_ram, A1_CLIP + 2) & 0x7FFF;
-   uint16_t a1_win_y = GET16(blitter_ram, A1_CLIP + 0) & 0x7FFF;
-   int16_t a1_x = (int16_t)GET16(blitter_ram, A1_PIXEL + 2);
-   int16_t a1_y = (int16_t)GET16(blitter_ram, A1_PIXEL + 0);
-   int16_t a1_step_x = (int16_t)GET16(blitter_ram, A1_STEP + 2);
-   int16_t a1_step_y = (int16_t)GET16(blitter_ram, A1_STEP + 0);
-   uint16_t a1_stepf_x = GET16(blitter_ram, A1_FSTEP + 2);
-   uint16_t a1_stepf_y = GET16(blitter_ram, A1_FSTEP + 0);
-   uint16_t a1_frac_x = GET16(blitter_ram, A1_FPIXEL + 2);
-   uint16_t a1_frac_y = GET16(blitter_ram, A1_FPIXEL + 0);
-   int16_t a1_inc_x = (int16_t)GET16(blitter_ram, A1_INC + 2);
-   int16_t a1_inc_y = (int16_t)GET16(blitter_ram, A1_INC + 0);
-   uint16_t a1_incf_x = GET16(blitter_ram, A1_FINC + 2);
-   uint16_t a1_incf_y = GET16(blitter_ram, A1_FINC + 0);
+   eightbit = daddmode & 0x02;
+   sat = daddmode & 0x03;
+   hicinh = ((daddmode & 0x03) == 0x03);
 
-   int16_t a2_x = (int16_t)GET16(blitter_ram, A2_PIXEL + 2);
-   int16_t a2_y = (int16_t)GET16(blitter_ram, A2_PIXEL + 0);
-#if 0
-   bool a2_mask = blitter_ram[A2_FLAGS + 2] & 0x80;
-   uint16_t a2_mask_x = GET16(blitter_ram, A2_MASK + 2);
-   uint16_t a2_mask_y = GET16(blitter_ram, A2_MASK + 0);
-   uint32_t collision = GET32(blitter_ram, COLLISIONCTRL);// 0=RESUME, 1=ABORT, 2=STOPEN
-#endif
-   int16_t a2_step_x = (int16_t)GET16(blitter_ram, A2_STEP + 2);
-   int16_t a2_step_y = (int16_t)GET16(blitter_ram, A2_STEP + 0);
+   ADD16SAT(&addq[0], &co[0], adda[0], addb[0], cin[0], sat, eightbit, hicinh);
+   ADD16SAT(&addq[1], &co[1], adda[1], addb[1], cin[1], sat, eightbit, hicinh);
+   ADD16SAT(&addq[2], &co[2], adda[2], addb[2], cin[2], sat, eightbit, hicinh);
+   ADD16SAT(&addq[3], &co[3], adda[3], addb[3], cin[3], sat, eightbit, hicinh);
+}
 
-   uint64_t srcd1 = GET64(blitter_ram, SRCDATA);
-   uint64_t srcd2 = 0;
-   uint64_t dstd = GET64(blitter_ram, DSTDATA);
-   uint64_t patd = GET64(blitter_ram, PATTERNDATA);
-   uint32_t iinc = GET32(blitter_ram, INTENSITYINC);
-   uint64_t srcz1 = GET64(blitter_ram, SRCZINT);
-   uint64_t srcz2 = GET64(blitter_ram, SRCZFRAC);
-   uint64_t dstz = GET64(blitter_ram, DSTZ);
-   uint32_t zinc = GET32(blitter_ram, ZINC);
+static INLINE __attribute__((always_inline))
+void COMP_CTRL(uint8_t *dbinh, bool *nowrite,
+	bool bcompen, bool big_pix, bool bkgwren, uint8_t dcomp, bool dcompen, uint8_t icount,
+	uint8_t pixsize, bool phrase_mode, uint8_t srcd, uint8_t zcomp)
+{
+   //BEGIN
 
-   uint8_t pixsize = (dsta2 ? a2_pixsize : a1_pixsize);	// From ACONTROL
+   /*Bkgwren\	:= INV1 (bkgwren\, bkgwren);
+     Phrase_mode\	:= INV1 (phrase_mode\, phrase_mode);
+     Pixsize\[0-2]	:= INV2 (pixsize\[0-2], pixsize[0-2]);*/
 
-   bool phrase_mode;
-   uint16_t a1FracCInX = 0, a1FracCInY = 0;
+   /* The bit comparator bits are derived from the source data, which
+      will have been suitably aligned for phrase mode.  The contents of
+      the inner counter are used to select which bit to use.
 
-   // Bugs in Jaguar I
+      When not in phrase mode the inner count value is used to select
+      one bit.  It is assumed that the count has already occurred, so,
+      7 selects bit 0, etc.  In big-endian pixel mode, this turns round,
+      so that a count of 7 selects bit 7.
 
-   a2addy = a1addy;							// A2 channel Y add bit is tied to A1's
+      In phrase mode, the eight bits are used directly, and this mode is
+      only applicable to 8-bit pixel mode (2/34) */
 
-   // Various state lines set up by user
+   /*Bcompselt[0-2]	:= EO (bcompselt[0-2], icount[0-2], big_pix);
+Bcompbit	:= MX8 (bcompbit, srcd[7], srcd[6], srcd[5],
+srcd[4], srcd[3], srcd[2], srcd[1], srcd[0], bcompselt[0..2]);
+Bcompbit\	:= INV1 (bcompbit\, bcompbit);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   uint8_t bcompselt = (big_pix ? ~icount : icount) & 0x07;
+   uint8_t bitmask[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+   bool bcompbit = srcd & bitmask[bcompselt];
+   bool winhibit, di0t0_1, di0t4, di1t2, di2t0_1, di2t4, di3t2;
+   bool di4t0_1, di4t4, di5t2;
+   bool di6t0_1, di6t4;
+   bool di7t2;
 
-   phrase_mode = ((!dsta2 && a1addx == 0) || (dsta2 && a2addx == 0) ? true : false);	// From ACONTROL
+   //////////////////////////////////////////////////////////////////////////////////////
 
-   // Stopgap vars to simulate various lines
+   /* pipe-line the count */
+   /*Bcompsel[0-2]	:= FDSYNC (bcompsel[0-2], bcompselt[0-2], step_inner, clk);
+Bcompbt		:= MX8 (bcompbitpt, srcd[7], srcd[6], srcd[5],
+srcd[4], srcd[3], srcd[2], srcd[1], srcd[0], bcompsel[0..2]);
+Bcompbitp	:= FD1Q (bcompbitp, bcompbitpt, clk);
+Bcompbitp\	:= INV1 (bcompbitp\, bcompbitp);*/
 
+   /* For pixel mode, generate the write inhibit signal for all modes
+      on bit inhibit, for 8 and 16 bit modes on comparator inhibit, and
+      for 16 bit mode on Z inhibit
 
-   while (true)
-   {
-      PERF_INC(blitter_outer);
-      // IDLE
+      Nowrite = bcompen . /bcompbit . /phrase_mode
+      + dcompen . dcomp[0] . /phrase_mode . pixsize = 011
+      + dcompen . dcomp[0..1] . /phrase_mode . pixsize = 100
+      + zcomp[0] . /phrase_mode . pixsize = 100
+      */
 
-      if ((idle && !go) || (inner && outer0 && indone))
-      {
-         idlei = true;
+   /*Nowt0		:= NAN3 (nowt[0], bcompen, bcompbit\, phrase_mode\);
+Nowt1		:= ND6  (nowt[1], dcompen, dcomp[0], phrase_mode\, pixsize\[2], pixsize[0..1]);
+Nowt2		:= ND7  (nowt[2], dcompen, dcomp[0..1], phrase_mode\, pixsize[2], pixsize\[0..1]);
+Nowt3		:= NAN5 (nowt[3], zcomp[0], phrase_mode\, pixsize[2], pixsize\[0..1]);
+Nowt4		:= NAN4 (nowt[4], nowt[0..3]);
+Nowrite		:= AN2  (nowrite, nowt[4], bkgwren\);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   *nowrite = ((bcompen && !bcompbit && !phrase_mode)
+         || (dcompen && (dcomp & 0x01) && !phrase_mode && (pixsize == 3))
+         || (dcompen && ((dcomp & 0x03) == 0x03) && !phrase_mode && (pixsize == 4))
+         || ((zcomp & 0x01) && !phrase_mode && (pixsize == 4)))
+      && !bkgwren;
+   //////////////////////////////////////////////////////////////////////////////////////
 
-         //Instead of a return, let's try breaking out of the loop...
-         break;
-      }
-      else
-         idlei = false;
+   /*Winht		:= NAN3 (winht, bcompen, bcompbitp\, phrase_mode\);
+Winhibit	:= NAN4 (winhibit, winht, nowt[1..3]);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   //This is the same as above, but with bcompbit delayed one tick and called 'winhibit'
+   //Small difference: Besides the pipeline effect, it's also not using !bkgwren...
+   //	bool winhibit = (bcompen && !
+   winhibit = (bcompen && !bcompbit && !phrase_mode)
+      || (dcompen && (dcomp & 0x01) && !phrase_mode && (pixsize == 3))
+      || (dcompen && ((dcomp & 0x03) == 0x03) && !phrase_mode && (pixsize == 4))
+      || ((zcomp & 0x01) && !phrase_mode && (pixsize == 4));
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      // INNER LOOP ACTIVE
+   /* For phrase mode, generate the byte inhibit signals for eight bit
+      mode 011, or sixteen bit mode 100
+      dbinh\[0] =  pixsize[2] . zcomp[0]
+      +  pixsize[2] . dcomp[0] . dcomp[1] . dcompen
+      + /pixsize[2] . dcomp[0] . dcompen
+      + /srcd[0] . bcompen
 
-      if ((idle && go && !datinit)
-            || (inner && !indone)
-            || (inner && indone && !outer0 && !upda1f && !upda1 && notgzandp && !upda2 && !datinit)
-            || (a1update && !upda2 && notgzandp && !datinit)
-            || (zupdate && !upda2 && !datinit)
-            || (a2update && !datinit)
-            || (init_ii && !gourz)
-            || (init_zi))
-         inneri = true;
-      else
-         inneri = false;
+      Inhibits 0-3 are also used when not in phrase mode to write back
+      destination data.
+      */
 
-      // A1 FRACTION UPDATE
+   /*Srcd\[0-7]	:= INV1 (srcd\[0-7], srcd[0-7]);
 
-      if (inner && indone && !outer0 && upda1f)
-         a1fupdatei = true;
-      else
-         a1fupdatei = false;
+Di0t0		:= NAN2H (di0t[0], pixsize[2], zcomp[0]);
+Di0t1		:= NAN4H (di0t[1], pixsize[2], dcomp[0..1], dcompen);
+Di0t2		:= NAN2 (di0t[2], srcd\[0], bcompen);
+Di0t3		:= NAN3 (di0t[3], pixsize\[2], dcomp[0], dcompen);
+Di0t4		:= NAN4 (di0t[4], di0t[0..3]);
+Dbinh[0]	:= ANR1P (dbinh\[0], di0t[4], phrase_mode, winhibit);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   *dbinh = 0;
+   di0t0_1 = ((pixsize & 0x04) && (zcomp & 0x01))
+      || ((pixsize & 0x04) && (dcomp & 0x01) && (dcomp & 0x02) && dcompen);
+   di0t4 = di0t0_1
+      || (!(srcd & 0x01) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x01) && dcompen);
+   *dbinh |= (!((di0t4 && phrase_mode) || winhibit) ? 0x01 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      // A1 POINTER UPDATE
+   /*Di1t0		:= NAN3 (di1t[0], pixsize\[2], dcomp[1], dcompen);
+Di1t1		:= NAN2 (di1t[1], srcd\[1], bcompen);
+Di1t2		:= NAN4 (di1t[2], di0t[0..1], di1t[0..1]);
+Dbinh[1]	:= ANR1 (dbinh\[1], di1t[2], phrase_mode, winhibit);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di1t2 = di0t0_1
+      || (!(srcd & 0x02) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x02) && dcompen);
+   *dbinh |= (!((di1t2 && phrase_mode) || winhibit) ? 0x02 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      if ((a1fupdate)
-            || (inner && indone && !outer0 && !upda1f && upda1))
-         a1updatei = true;
-      else
-         a1updatei = false;
+   /*Di2t0		:= NAN2H (di2t[0], pixsize[2], zcomp[1]);
+Di2t1		:= NAN4H (di2t[1], pixsize[2], dcomp[2..3], dcompen);
+Di2t2		:= NAN2 (di2t[2], srcd\[2], bcompen);
+Di2t3		:= NAN3 (di2t[3], pixsize\[2], dcomp[2], dcompen);
+Di2t4		:= NAN4 (di2t[4], di2t[0..3]);
+Dbinh[2]	:= ANR1 (dbinh\[2], di2t[4], phrase_mode, winhibit);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   //[bcompen=F dcompen=T phrase_mode=T bkgwren=F][nw=F wi=F]
+   //[di0t0_1=F di0t4=F][di1t2=F][di2t0_1=T di2t4=T][di3t2=T][di4t0_1=F di2t4=F][di5t2=F][di6t0_1=F di6t4=F][di7t2=F]
+   //[dcomp=$00 dbinh=$0C][7804780400007804] (icount=0005, inc=4)
+   di2t0_1 = ((pixsize & 0x04) && (zcomp & 0x02))
+      || ((pixsize & 0x04) && (dcomp & 0x04) && (dcomp & 0x08) && dcompen);
+   di2t4 = di2t0_1
+      || (!(srcd & 0x04) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x04) && dcompen);
+   *dbinh |= (!((di2t4 && phrase_mode) || winhibit) ? 0x04 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      // Z FRACTION UPDATE
+   /*Di3t0		:= NAN3 (di3t[0], pixsize\[2], dcomp[3], dcompen);
+Di3t1		:= NAN2 (di3t[1], srcd\[3], bcompen);
+Di3t2		:= NAN4 (di3t[2], di2t[0..1], di3t[0..1]);
+Dbinh[3]	:= ANR1 (dbinh\[3], di3t[2], phrase_mode, winhibit);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di3t2 = di2t0_1
+      || (!(srcd & 0x08) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x08) && dcompen);
+   *dbinh |= (!((di3t2 && phrase_mode) || winhibit) ? 0x08 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      if ((a1update && gourz && polygon)
-            || (inner && indone && !outer0 && !upda1f && !upda1 && gourz && polygon))
-         zfupdatei = true;
-      else
-         zfupdatei = false;
+   /*Di4t0		:= NAN2H (di4t[0], pixsize[2], zcomp[2]);
+Di4t1		:= NAN4H (di4t[1], pixsize[2], dcomp[4..5], dcompen);
+Di4t2		:= NAN2 (di4t[2], srcd\[4], bcompen);
+Di4t3		:= NAN3 (di4t[3], pixsize\[2], dcomp[4], dcompen);
+Di4t4		:= NAN4 (di4t[4], di4t[0..3]);
+Dbinh[4]	:= NAN2 (dbinh\[4], di4t[4], phrase_mode);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di4t0_1 = ((pixsize & 0x04u) && (zcomp & 0x04u))
+      || ((pixsize & 0x04u) && (dcomp & 0x10u) && (dcomp & 0x20u) && dcompen);
+   di4t4 = di4t0_1
+      || (!(srcd & 0x10u) && bcompen)
+      || (!(pixsize & 0x04u) && (dcomp & 0x10u) && dcompen);
+   *dbinh |= (!(di4t4 && phrase_mode) ? 0x10u : 0x00u);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      // Z INTEGER UPDATE
+   /*Di5t0		:= NAN3 (di5t[0], pixsize\[2], dcomp[5], dcompen);
+Di5t1		:= NAN2 (di5t[1], srcd\[5], bcompen);
+Di5t2		:= NAN4 (di5t[2], di4t[0..1], di5t[0..1]);
+Dbinh[5]	:= NAN2 (dbinh\[5], di5t[2], phrase_mode);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di5t2 = di4t0_1
+      || (!(srcd & 0x20) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x20) && dcompen);
+   *dbinh |= (!(di5t2 && phrase_mode) ? 0x20 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      if (zfupdate)
-         zupdatei = true;
-      else
-         zupdatei = false;
+   /*Di6t0		:= NAN2H (di6t[0], pixsize[2], zcomp[3]);
+Di6t1		:= NAN4H (di6t[1], pixsize[2], dcomp[6..7], dcompen);
+Di6t2		:= NAN2 (di6t[2], srcd\[6], bcompen);
+Di6t3		:= NAN3 (di6t[3], pixsize\[2], dcomp[6], dcompen);
+Di6t4		:= NAN4 (di6t[4], di6t[0..3]);
+Dbinh[6]	:= NAN2 (dbinh\[6], di6t[4], phrase_mode);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di6t0_1 = ((pixsize & 0x04) && (zcomp & 0x08))
+      || ((pixsize & 0x04) && (dcomp & 0x40) && (dcomp & 0x80) && dcompen);
+   di6t4 = di6t0_1
+      || (!(srcd & 0x40) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x40) && dcompen);
+   *dbinh |= (!(di6t4 && phrase_mode) ? 0x40 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      // A2 POINTER UPDATE
+   /*Di7t0		:= NAN3 (di7t[0], pixsize\[2], dcomp[7], dcompen);
+Di7t1		:= NAN2 (di7t[1], srcd\[7], bcompen);
+Di7t2		:= NAN4 (di7t[2], di6t[0..1], di7t[0..1]);
+Dbinh[7]	:= NAN2 (dbinh\[7], di7t[2], phrase_mode);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di7t2 = di6t0_1
+      || (!(srcd & 0x80) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x80) && dcompen);
+   *dbinh |= (!(di7t2 && phrase_mode) ? 0x80 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      if ((a1update && upda2 && notgzandp)
-            || (zupdate && upda2)
-            || (inner && indone && !outer0 && !upda1f && notgzandp && !upda1 && upda2))
-         a2updatei = true;
-      else
-         a2updatei = false;
+   //END;
+   //kludge
+   *dbinh = ~*dbinh;
+}
 
-      // INITIALIZE INTENSITY FRACTION
+static INLINE __attribute__((always_inline))
+void DATA(uint64_t *wdata, uint8_t *dcomp, uint8_t *zcomp, bool *nowrite,
+	bool big_pix, bool cmpdst, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode, bool daddq_sel, uint8_t data_sel,
+	uint8_t dbinh, uint8_t dend, uint8_t dstart, uint64_t dstd, uint32_t iinc, uint8_t lfu_func, uint64_t *patd, bool patdadd,
+	bool phrase_mode, uint64_t srcd, bool srcdread, bool srczread, bool srcz2add, uint8_t zmode,
+	bool bcompen, bool bkgwren, bool dcompen, uint8_t icount, uint8_t pixsize,
+	uint64_t *srcz, uint64_t dstz, uint32_t zinc)
+{
+/*
+  Stuff we absolutely *need* to have passed in/out:
+IN:
+  patdadd, dstd, srcd, patd, daddasel, daddbsel, daddmode, iinc, srcz1, srcz2, big_pix, phrase_mode, cmpdst
+OUT:
+  changed patd (wdata I guess...) (Nope. We pass it back directly now...)
+*/
 
-      if ((zupdate && !upda2 && datinit)
-            || (a1update && !upda2 && datinit && notgzandp)
-            || (inner && indone && !outer0 && !upda1f && !upda1 && notgzandp && !upda2 && datinit)
-            || (a2update && datinit)
-            || (idle && go && datinit))
-         init_ifi = true;
-      else
-         init_ifi = false;
+// Source data registers
 
-      // INITIALIZE INTENSITY INTEGER
+/*Data_src	:= DATA_SRC (srcdlo, srcdhi, srcz[0..1], srczo[0..1], srczp[0..1], srcz1[0..1], srcz2[0..1], big_pix,
+			clk, gpu_din, intld[0..3], local_data0, local_data1, srcd1ld[0..1], srcdread, srczread, srcshift[0..5],
+			srcz1ld[0..1], srcz2add, srcz2ld[0..1], zedld[0..3], zpipe[0..1]);
+Srcd[0-7]	:= JOIN (srcd[0-7], srcdlo{0-7});
+Srcd[8-31]	:= JOIN (srcd[8-31], srcdlo{8-31});
+Srcd[32-63]	:= JOIN (srcd[32-63], srcdhi{0-31});*/
 
-      if (init_if)
-         init_iii = true;
-      else
-         init_iii = false;
+// Destination data registers
 
-      // INITIALIZE Z FRACTION
+/*Data_dst	:= DATA_DST (dstd[0..63], dstz[0..1], clk, dstdld[0..1], dstzld[0..1], load_data[0..1]);
+Dstdlo		:= JOIN (dstdlo, dstd[0..31]);
+Dstdhi		:= JOIN (dstdhi, dstd[32..63]);*/
 
-      if (init_ii && gourz)
-         init_zfi = true;
-      else
-         init_zfi = false;
+// Pattern and Color data registers
 
-      // INITIALIZE Z INTEGER
+// Looks like this is simply another register file for the pattern data registers. No adding or anything funky
+// going on. Note that patd & patdv will output the same info.
+// Patdldl/h (patdld[0..1]) can select the local_data bus to overwrite the current pattern data...
+// Actually, it can be either patdld OR patdadd...!
+/*Data_pat	:= DATA_PAT (colord[0..15], int0dp[8..10], int1dp[8..10], int2dp[8..10], int3dp[8..10], mixsel[0..2],
+			patd[0..63], patdv[0..1], clk, colorld, dpipe[0], ext_int, gpu_din, intld[0..3], local_data0, local_data1,
+			patdadd, patdld[0..1], reload, reset\);
+Patdlo		:= JOIN (patdlo, patd[0..31]);
+Patdhi		:= JOIN (patdhi, patd[32..63]);*/
 
-      if (init_zf)
-         init_zii = true;
-      else
-         init_zii = false;
+// Multiplying data Mixer (NOT IN JAGUAR I)
 
-      // Here we move the fooi into their foo counterparts in order to simulate the moving
-      // of data into the various FDSYNCs... Each time we loop we simulate one clock cycle...
+/*Datamix		:= DATAMIX (patdo[0..1], clk, colord[0..15], dpipe[1], dstd[0..63], int0dp[8..10], int1dp[8..10],
+			int2dp[8..10], int3dp[8..10], mixsel[0..2], patd[0..63], pdsel[0..1], srcd[0..63], textrgb, txtd[0..63]);*/
 
-      idle = idlei;
-      inner = inneri;
-      a1fupdate = a1fupdatei;
-      a1update = a1updatei;
-      zfupdate = zfupdatei;		// *
-      zupdate = zupdatei;			// *
-      a2update = a2updatei;
-      init_if = init_ifi;			// *
-      init_ii = init_iii;			// *
-      init_zf = init_zfi;			// *
-      init_zi = init_zii;			// *
-      // * denotes states that will never assert for Jaguar I
+// Logic function unit
 
-      // Now, depending on how we want to handle things, we could either put the implementation
-      // of the various pieces up above, or handle them down below here.
+/*Lfu		:= LFU (lfu[0..1], srcdlo, srcdhi, dstdlo, dstdhi, lfu_func[0..3]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	uint64_t lfu = blitter_simd_ops.lfu(srcd, dstd, lfu_func);
+   bool mir_bit, mir_byte;
+   uint16_t masku;
+   uint8_t e_coarse, e_fine;
+   uint8_t s_coarse, s_fine;
+   uint16_t maskt;
+	uint8_t decl38e[2][8] = { { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
+		{ 0xFE, 0xFD, 0xFB, 0xF7, 0xEF, 0xDF, 0xBF, 0x7F } };
+	uint8_t dech38[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+	uint8_t dech38el[2][8] = { { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 },
+		{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } };
+   int en;
+	uint8_t dbinht;
+   uint16_t addq[4];
+   uint8_t initcin[4] = { 0, 0, 0, 0 };
+   uint16_t mask;
+   uint64_t dmux[4];
+   uint64_t ddat;
+//////////////////////////////////////////////////////////////////////////////////////
 
-      // Let's try postprocessing for now...
+// Increment and Step Registers
 
-      if (inner)
-      {
-         bool idle_inner = true, sreadx = false, szreadx = false, sread = false,
-              szread = false, dread = false, dzread = false, dwrite = false, dzwrite = false;
-         bool inner0 = false;
-         bool idle_inneri, sreadxi, szreadxi, sreadi, szreadi, dreadi, dzreadi, dwritei, dzwritei;
-         //other stuff
-         uint8_t srcshift = 0;
-         uint16_t icount = GET16(blitter_ram, PIXLINECOUNTER + 2);
-         bool srca_addi, dsta_addi, gensrc, gendst, gena2i, zaddr, fontread, justify, a1_add, a2_add;
-         bool adda_yconst, addareg, suba_x, suba_y, a1fracldi, shadeadd;
-         uint8_t addasel, a1_xconst, a2_xconst, adda_xconst, addbsel, maska1, maska2, modx, daddasel;
-         uint8_t daddbsel, daddmode;
-         bool patfadd, patdadd, srcz2add, daddq_sel;
-         uint8_t data_sel;
-         uint32_t address, pixAddr;
-         uint8_t dstxp;
-         uint64_t srcz;
-         bool winhibit;
+// Does it do anything without the step add lines? Check it!
+// No. This is pretty much just a register file without the Jaguar II lines...
+/*Inc_step	:= INC_STEP (iinc, istep[0..31], zinc, zstep[0..31], clk, ext_int, gpu_din, iincld, iincldx, istepadd,
+			istepfadd, istepld, istepdld, reload, reset\, zincld, zstepadd, zstepfadd, zstepld, zstepdld);
+Istep		:= JOIN (istep, istep[0..31]);
+Zstep		:= JOIN (zstep, zstep[0..31]);*/
 
-         indone = false;
+// Pixel data comparator
 
-         /* Precompute address constants (invariant during inner loop) */
-         a1_xconst = 6 - a1_pixsize;
-         a2_xconst = 6 - a2_pixsize;
-         if (a1addx == 1)
-            a1_xconst = 0;
-         else if (a1addx & 0x02)
-            a1_xconst = 7;
-         if (a2addx == 1)
-            a2_xconst = 0;
-         else if (a2addx & 0x02)
-            a2_xconst = 7;
+/*Datacomp	:= DATACOMP (dcomp[0..7], cmpdst, dstdlo, dstdhi, patdlo, patdhi, srcdlo, srcdhi);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	*dcomp = blitter_simd_ops.dcomp(*patd, srcd, dstd, cmpdst);
+//////////////////////////////////////////////////////////////////////////////////////
 
-         /* Precompute srcshift — loaded on first inner cycle (sshftld),
-            then held constant for all subsequent cycles. */
-         {
-            uint8_t dstxp0, srcxp0, shftv0, pobb0, loshd0;
-            bool pobbsel0;
+// Zed comparator for Z-buffer operations
 
-            dstxp0 = (dsta2 ? a2_x : a1_x) & 0x3F;
-            srcxp0 = (dsta2 ? a1_x : a2_x) & 0x3F;
-            shftv0 = ((dstxp0 - srcxp0) << pixsize) & 0x3F;
-            pobb0 = 0;
-            if (pixsize == 3)
-               pobb0 = dstxp0 & 0x07;
-            else if (pixsize == 4)
-               pobb0 = dstxp0 & 0x03;
-            else if (pixsize == 5)
-               pobb0 = dstxp0 & 0x01;
+/*Zedcomp		:= ZEDCOMP (zcomp[0..3], srczp[0..1], dstz[0..1], zmode[0..2]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+//srczp is srcz pipelined, also it goes through a source shift as well...
+/*The shift is basically like so (each piece is 16 bits long):
 
-            pobbsel0 = phrase_mode && bcompen;
-            loshd0 = (pobbsel0 ? pobb0 : shftv0) & 0x07;
-            srcshift = (srcen || pobbsel0 ? loshd0 : 0);
-            srcshift |= (srcen && phrase_mode ? shftv0 & 0x38 : 0);
-         }
+	0         1         2         3         4         5         6
+	srcz1lolo srcz1lohi srcz1hilo srcz1hihi srcz2lolo srcz2lohi srcz2hilo
 
-         while (true)
-         {
-#ifdef BENCH_PROFILE
-            int blitter_did_io = 0;
-#endif
-            /* PERF_INC embedded via comma operator to keep C89 decl
-             * order valid (no statements before declarations).  */
-            uint16_t dstxwr = (PERF_INC(blitter_inner), 0), pseq;
-            bool penden;
-            uint8_t window_mask;
-            uint8_t inner_mask = 0;
-            uint8_t emask, pma, dend;
-            uint64_t srcd;
-            uint8_t zSrcShift;
-            uint64_t wdata;
-            uint8_t dcomp, zcomp;
+with srcshift bits 4 & 5 selecting the start position
+*/
+//So... basically what we have here is:
+	*zcomp = blitter_simd_ops.zcomp(*srcz, dstz, zmode);
 
-            //NOTE: sshftld probably is only asserted at the beginning of the inner loop. !!! FIX !!!
-            /* State machine: step is always true (no bus contention in
-               Jaguar I), textext/txtread never assert. Both eliminated. */
+//TEMP, TO TEST IF ZCOMP IS THE CULPRIT...
+//Nope, this is NOT the problem...
+//zcomp=0;
+// We'll do the comparison/bit/byte inhibits here, since that's the way it happens
+// in the real thing (dcomp goes out to COMP_CTRL and back into DATA through dbinh)...
+	{
+	uint8_t bcomp_bits;
+	if (bcompen && phrase_mode)
+	{
+		bcomp_bits = (srcd >> 56) & 0xFF;
+	}
+	else
+		bcomp_bits = srcd & 0xFF;
 
-            if ((dzwrite && inner0)
-                  || (dwrite && !dstwrz && inner0))
-            {
-               idle_inneri = true;
-               break;
-            }
-            else
-               idle_inneri = false;
+	COMP_CTRL(&dbinht, nowrite,
+		bcompen, true/*big_pix*/, bkgwren, *dcomp, dcompen, icount, pixsize, phrase_mode, bcomp_bits, *zcomp);
+	}
+	dbinh = dbinht;
 
-            sreadxi = (idle_inner && srcenx);
-            szreadxi = (sreadx && srcenz);
+//////////////////////////////////////////////////////////////////////////////////////
 
-            sreadi = (szreadx
-                  || (sreadx && !srcenz && srcen)
-                  || (idle_inner && !srcenx && srcen)
-                  || (dzwrite && !inner0 && srcen)
-                  || (dwrite && !dstwrz && !inner0 && srcen));
+// 22 Mar 94
+// The data initializer - allows all four initial values to be computed from one (NOT IN JAGUAR I)
 
-            szreadi = (sread && srcenz);
+/*Datinit		:= DATINIT (initcin[0..3], initinc[0..63], initpix[0..15], a1_x[0..1], big_pix, clk, iinc, init_if, init_ii,
+			init_zf, istep[0..31], zinc, zstep[0..31]);*/
 
-            dreadi = ((szread && dsten)
-                  || (sread && !srcenz && dsten)
-                  || (sreadx && !srcenz && !srcen && dsten)
-                  || (idle_inner && !srcenx && !srcen && dsten)
-                  || (dzwrite && !inner0 && !srcen && dsten)
-                  || (dwrite && !dstwrz && !inner0 && !srcen && dsten));
+// Adder array for Z and intensity increments
 
-            dzreadi = ((dread && dstenz)
-                  || (szread && !dsten && dstenz)
-                  || (sread && !srcenz && !dsten && dstenz)
-                  || (sreadx && !srcenz && !srcen && !dsten && dstenz)
-                  || (idle_inner && !srcenx && !srcen && !dsten && dstenz)
-                  || (dzwrite && !inner0 && !srcen && !dsten && dstenz)
-                  || (dwrite && !dstwrz && !inner0 && !srcen && !dsten && dstenz));
+/*Addarray	:= ADDARRAY (addq[0..3], clk, daddasel[0..2], daddbsel[0..3], daddmode[0..2], dstdlo, dstdhi, iinc,
+			initcin[0..3], initinc[0..63], initpix[0..15], istep, patdv[0..1], srcdlo, srcdhi, srcz1[0..1],
+			srcz2[0..1], reset\, zinc, zstep);*/
+/*void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode,
+	uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix,
+	uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2,
+	uint32_t zinc, uint32_t zstep)*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	{
+	uint64_t patd_pre = *patd;
+	ADDARRAY(addq, daddasel, daddbsel, daddmode, dstd, iinc, initcin, 0, 0, 0, *patd, srcd, 0, 0, 0, 0);
 
-            dwritei = (dzread
-                  || (dread && !dstenz)
-                  || (szread && !dsten && !dstenz)
-                  || (sread && !srcenz && !dsten && !dstenz)
-                  || (sreadx && !srcenz && !srcen && !dsten && !dstenz)
-                  || (idle_inner && !srcenx && !srcen && !dsten && !dstenz)
-                  || (dzwrite && !inner0 && !srcen && !dsten && !dstenz)
-                  || (dwrite && !dstwrz && !inner0 && !srcen && !dsten && !dstenz));
+	if (patdadd)
+		*patd = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+//////////////////////////////////////////////////////////////////////////////////////
 
-            dzwritei = (dwrite && dstwrz);
+// Local data bus multiplexer
+// In hardware, the write data mux reads patd BEFORE the register update.
+// patd_pre captures the pre-increment value for the data output mux.
 
-            // Here we move the fooi into their foo counterparts in order to simulate the moving
-            // of data into the various FDSYNCs... Each time we loop we simulate one clock cycle...
+/*Local_mux	:= LOCAL_MUX (local_data[0..1], load_data[0..1],
+	addq[0..3], gpu_din, data[0..63], blitter_active, daddq_sel);
+Local_data0	:= JOIN (local_data0, local_data[0]);
+Local_data1	:= JOIN (local_data1, local_data[1]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////
 
-            idle_inner = idle_inneri;
-            sreadx = sreadxi;
-            szreadx = szreadxi;
-            sread = sreadi;
-            szread = szreadi;
-            dread = dreadi;
-            dzread = dzreadi;
-            dwrite = dwritei;
-            dzwrite = dzwritei;
+// Data output multiplexer and tri-state drive
 
-            // Here's a few more decodes--not sure if they're supposed to go here or not...
+/*Data_mux	:= DATA_MUX (wdata[0..63], addq[0..3], big_pix, dstdlo, dstdhi, dstz[0..1], data_sel[0..1], data_ena,
+			dstart[0..5], dend[0..5], dbinh\[0..7], lfu[0..1], patdo[0..1], phrase_mode, srczo[0..1]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+// NOTE: patdo comes from DATAMIX and can be considered the same as patd for Jaguar I
 
+//////////////////////////////////////////////////////////////////////////////////////
+//}
 
-            srca_addi = (sreadxi && !srcenz) || (sreadi && !srcenz) || szreadxi || szreadi;
+/*DEF DATA_MUX (
+		wdata[0..63]	// co-processor rwrite data bus
+		:BUS;
+INT16/	addq[0..3]
+		big_pix			// Pixel organisation is big-endian
+INT32/	dstdlo
+INT32/	dstdhi
+INT32/	dstzlo
+INT32/	dstzhi
+		data_sel[0..1]	// source of write data
+		data_ena		// enable write data onto read/write bus
+		dstart[0..5]	// start of changed write data
+		dend[0..5]		// end of changed write data
+		dbinh\[0..7]	// byte oriented changed data inhibits
+INT32/	lfu[0..1]
+INT32/	patd[0..1]
+		phrase_mode		// phrase write mode
+INT32/	srczlo
+INT32/	srczhi
+		:IN);*/
 
-            dsta_addi = (dwritei && !dstwrz) || dzwritei;
+/*INT32/	addql[0..1], ddatlo, ddathi zero32
+:LOCAL;
+BEGIN
 
-            gensrc = sreadxi || szreadxi || sreadi || szreadi;
-            gendst = dreadi || dzreadi || dwritei || dzwritei;
-            gena2i = (gensrc && !dsta2) || (gendst && dsta2);
+Phrase_mode\	:= INV1 (phrase_mode\, phrase_mode);
+Zero		:= TIE0 (zero);
+Zero32		:= JOIN (zero32, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero);*/
 
-            zaddr = szreadx || szread || dzread || dzwrite;
+/* Generate a changed data mask */
 
-            // Some stuff from MCONTROL.NET--not sure if this is the correct use of this decode or not...
-            /*Fontread\	:= OND1 (fontread\, sread[1], sreadx[1], bcompen);
-Fontread	:= INV1 (fontread, fontread\);
-Justt		:= NAN3 (justt, fontread\, phrase_mode, tactive\);
-Justify		:= TS (justify, justt, busen);*/
-            fontread = (sread || sreadx) && bcompen;
-            justify = !(!fontread && phrase_mode /*&& tactive*/);
+/*Edis		:= OR6 (edis\, dend[0..5]);
+Ecoarse		:= DECL38E (e_coarse\[0..7], dend[3..5], edis\);
+E_coarse[0]	:= INV1 (e_coarse[0], e_coarse\[0]);
+Efine		:= DECL38E (unused[0], e_fine\[1..7], dend[0..2], e_coarse[0]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
 
-            /* Generate inner loop update enables */
-            /*
-A1_addi		:= MX2 (a1_addi, dsta_addi, srca_addi, dsta2);
-A2_addi		:= MX2 (a2_addi, srca_addi, dsta_addi, dsta2);
-A1_add		:= FD1 (a1_add, a1_add\, a1_addi, clk);
-A2_add		:= FD1 (a2_add, a2_add\, a2_addi, clk);
-A2_addb		:= BUF1 (a2_addb, a2_add);
-*/
-            a1_add = (dsta2 ? srca_addi : dsta_addi);
-            a2_add = (dsta2 ? dsta_addi : srca_addi);
+	en = ((dend & 0x3F) ? 1 : 0);
+	e_coarse = decl38e[en][(dend & 0x38) >> 3];		// Actually, this is e_coarse inverted...
+	e_fine = decl38e[(e_coarse & 0x01) ^ 0x01][dend & 0x07];
+	e_fine &= 0xFE;
+//////////////////////////////////////////////////////////////////////////////////////
 
-            /* Address adder input A register selection
-               000	A1 step integer part
-               001	A1 step fraction part
-               010	A1 increment integer part
-               011	A1 increment fraction part
-               100	A2 step
+/*Scoarse		:= DECH38 (s_coarse[0..7], dstart[3..5]);
+Sfen\		:= INV1 (sfen\, s_coarse[0]);
+Sfine		:= DECH38EL (s_fine[0..7], dstart[0..2], sfen\);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	s_coarse = dech38[(dstart & 0x38) >> 3];
+	s_fine = dech38el[(s_coarse & 0x01) ^ 0x01][dstart & 0x07];
+//////////////////////////////////////////////////////////////////////////////////////
 
-               bit 2 = a2update
-               bit 1 = /a2update . (a1_add . a1addx[0..1])
-               bit 0 = /a2update . ( a1fupdate
-               + a1_add . atick[0] . a1addx[0..1])
-               The /a2update term on bits 0 and 1 is redundant.
-               Now look-ahead based
-               */
+/*Maskt[0]	:= BUF1 (maskt[0], s_fine[0]);
+Maskt[1-7]	:= OAN1P (maskt[1-7], maskt[0-6], s_fine[1-7], e_fine\[1-7]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	maskt = s_fine & 0x0001;
+	maskt |= (((maskt & 0x0001) || (s_fine & 0x02u)) && (e_fine & 0x02u) ? 0x0002 : 0x0000);
+	maskt |= (((maskt & 0x0002) || (s_fine & 0x04u)) && (e_fine & 0x04u) ? 0x0004 : 0x0000);
+	maskt |= (((maskt & 0x0004) || (s_fine & 0x08u)) && (e_fine & 0x08u) ? 0x0008 : 0x0000);
+	maskt |= (((maskt & 0x0008) || (s_fine & 0x10u)) && (e_fine & 0x10u) ? 0x0010 : 0x0000);
+	maskt |= (((maskt & 0x0010) || (s_fine & 0x20u)) && (e_fine & 0x20u) ? 0x0020 : 0x0000);
+	maskt |= (((maskt & 0x0020) || (s_fine & 0x40u)) && (e_fine & 0x40u) ? 0x0040 : 0x0000);
+	maskt |= (((maskt & 0x0040) || (s_fine & 0x80u)) && (e_fine & 0x80u) ? 0x0080 : 0x0000);
+//////////////////////////////////////////////////////////////////////////////////////
 
-            addasel = (a1fupdate || (a1_add && a1addx == 3) ? 0x01 : 0x00);
-            addasel |= (a1_add && a1addx == 3 ? 0x02 : 0x00);
-            addasel |= (a2update ? 0x04 : 0x00);
-            /* Address adder input A X constant selection
-               adda_xconst[0..2] generate a power of 2 in the range 1-64 or all
-               zeroes when they are all 1
-               Remember - these are pixels, so to add one phrase the pixel size
-               has to be taken into account to get the appropriate value.
-               for A1
-               if a1addx[0..1] are 00 set 6 - pixel size
-               if a1addx[0..1] are 01 set the value 000
-               if a1addx[0..1] are 10 set the value 111
-               similarly for A2
-JLH: Also, 11 will likewise set the value to 111
-*/
-            adda_xconst = (a2_add ? a2_xconst : a1_xconst);
-            /* Address adder input A Y constant selection
-               22 June 94 - This was erroneous, because only the a1addy bit was reflected here.
-               Therefore, the selection has to be controlled by a bug fix bit.
-JLH: Bug fix bit in Jaguar II--not in Jaguar I!
-*/
-            adda_yconst = a1addy;
-            /* Address adder input A register versus constant selection
-               given by	  a1_add . a1addx[0..1]
-               + a1update
-               + a1fupdate
-               + a2_add . a2addx[0..1]
-               + a2update
-               */
-            addareg = ((a1_add && a1addx == 3) || a1update || a1fupdate
-                  || (a2_add && a2addx == 3) || a2update ? true : false);
-            /* The adders can be put into subtract mode in add pixel size
-               mode when the corresponding flags are set */
-            suba_x = ((a1_add && a1xsign && a1addx == 1) || (a2_add && a2xsign && a2addx == 1) ? true : false);
-            suba_y = ((a1_add && a1addy && a1ysign) || (a2_add && a2addy && a2ysign) ? true : false);
-            /* Address adder input B selection
-               00	A1 pointer
-               01	A2 pointer
-               10	A1 fraction
-               11	Zero
+   /* Produce a look-ahead on the ripple carry */
+	maskt |= (((s_coarse & e_coarse & 0x01u) || (s_coarse & 0x02u)) && (e_coarse & 0x02u) ? 0x0100 : 0x0000);
+	maskt |= (((maskt & 0x0100) || (s_coarse & 0x04u)) && (e_coarse & 0x04u) ? 0x0200 : 0x0000);
+	maskt |= (((maskt & 0x0200) || (s_coarse & 0x08u)) && (e_coarse & 0x08u) ? 0x0400 : 0x0000);
+	maskt |= (((maskt & 0x0400) || (s_coarse & 0x10u)) && (e_coarse & 0x10u) ? 0x0800 : 0x0000);
+	maskt |= (((maskt & 0x0800) || (s_coarse & 0x20u)) && (e_coarse & 0x20u) ? 0x1000 : 0x0000);
+	maskt |= (((maskt & 0x1000) || (s_coarse & 0x40u)) && (e_coarse & 0x40u) ? 0x2000 : 0x0000);
+	maskt |= (((maskt & 0x2000) || (s_coarse & 0x80u)) && (e_coarse & 0x80u) ? 0x4000 : 0x0000);
 
-               Bit 1 =   a1fupdate
-               + (a1_add . atick[0] . a1addx[0..1])
-               + a1fupdate . a1_stepld
-               + a1update . a1_stepld
-               + a2update . a2_stepld
-               Bit 0 =   a2update + a2_add
-               + a1fupdate . a1_stepld
-               + a1update . a1_stepld
-               + a2update . a2_stepld
-               */
-            addbsel = (a2update || a2_add || (a1fupdate && a1_stepld)
-                  || (a1update && a1_stepld) || (a2update && a2_stepld) ? 0x01 : 0x00);
-            addbsel |= (a1fupdate || (a1_add && a1addx == 3) || (a1fupdate && a1_stepld)
-                  || (a1update && a1_stepld) || (a2update && a2_stepld) ? 0x02 : 0x00);
+/* The bit terms are mirrored for big-endian pixels outside phrase
+mode.  The byte terms are mirrored for big-endian pixels in phrase
+mode.  */
 
-            /* The modulo bits are used to align X onto a phrase boundary when
-               it is being updated by one phrase
-               000	no mask
-               001	mask bit 0
-               010	mask bits 1-0
-               ..
-               110  	mask bits 5-0
+/*Mirror_bit	:= AN2M (mir_bit, phrase_mode\, big_pix);
+Mirror_byte	:= AN2H (mir_byte, phrase_mode, big_pix);
 
-               Masking is enabled for a1 when a1addx[0..1] is 00, and the value
-               is 6 - the pixel size (again!)
-               */
-            maska1 = (a1_add && a1addx == 0 ? 6 - a1_pixsize : 0);
-            maska2 = (a2_add && a2addx == 0 ? 6 - a2_pixsize : 0);
-            modx = (a2_add ? maska2 : maska1);
-            /* Generate load strobes for the increment updates */
+Masktb[14]	:= BUF1 (masktb[14], maskt[14]);
+Masku[0]	:= MX4 (masku[0],  maskt[0],  maskt[7],  maskt[14],  zero, mir_bit, mir_byte);
+Masku[1]	:= MX4 (masku[1],  maskt[1],  maskt[6],  maskt[14],  zero, mir_bit, mir_byte);
+Masku[2]	:= MX4 (masku[2],  maskt[2],  maskt[5],  maskt[14],  zero, mir_bit, mir_byte);
+Masku[3]	:= MX4 (masku[3],  maskt[3],  maskt[4],  masktb[14], zero, mir_bit, mir_byte);
+Masku[4]	:= MX4 (masku[4],  maskt[4],  maskt[3],  masktb[14], zero, mir_bit, mir_byte);
+Masku[5]	:= MX4 (masku[5],  maskt[5],  maskt[2],  masktb[14], zero, mir_bit, mir_byte);
+Masku[6]	:= MX4 (masku[6],  maskt[6],  maskt[1],  masktb[14], zero, mir_bit, mir_byte);
+Masku[7]	:= MX4 (masku[7],  maskt[7],  maskt[0],  masktb[14], zero, mir_bit, mir_byte);
+Masku[8]	:= MX2 (masku[8],  maskt[8],  maskt[13], mir_byte);
+Masku[9]	:= MX2 (masku[9],  maskt[9],  maskt[12], mir_byte);
+Masku[10]	:= MX2 (masku[10], maskt[10], maskt[11], mir_byte);
+Masku[11]	:= MX2 (masku[11], maskt[11], maskt[10], mir_byte);
+Masku[12]	:= MX2 (masku[12], maskt[12], maskt[9],  mir_byte);
+Masku[13]	:= MX2 (masku[13], maskt[13], maskt[8],  mir_byte);
+Masku[14]	:= MX2 (masku[14], maskt[14], maskt[0],  mir_byte);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
 
-            /*A1pldt		:= NAN2 (a1pldt, atick[1], a1_add);
-A1ptrldi	:= NAN2 (a1ptrldi, a1update\, a1pldt);
+	mir_bit  = true/*big_pix*/ && !phrase_mode;
+	mir_byte = true/*big_pix*/ && phrase_mode;
+	masku    = maskt;
 
-A1fldt		:= NAN4 (a1fldt, atick[0], a1_add, a1addx[0..1]);
-A1fracldi	:= NAN2 (a1fracldi, a1fupdate\, a1fldt);
+	if (mir_bit)
+	{
+		masku &= 0xFF00;
+		masku |= (maskt >> 7) & 0x0001;
+		masku |= (maskt >> 5) & 0x0002;
+		masku |= (maskt >> 3) & 0x0004;
+		masku |= (maskt >> 1) & 0x0008;
+		masku |= (maskt << 1) & 0x0010;
+		masku |= (maskt << 3) & 0x0020;
+		masku |= (maskt << 5) & 0x0040;
+		masku |= (maskt << 7) & 0x0080;
+	}
 
-A2pldt		:= NAN2 (a2pldt, atick[1], a2_add);
-A2ptrldi	:= NAN2 (a2ptrldi, a2update\, a2pldt);*/
+	if (mir_byte)
+	{
+		/* MX4 input 2: masku[7:0] = {8{maskt[14]}} (broadcast bit 14) */
+		masku = (maskt & 0x4000) ? 0x00FF : 0x0000;
+		/* MX2: reverse bits 8-13, maskt[0] at position 14 */
+		masku |= (maskt >> 5) & 0x0100;
+		masku |= (maskt >> 3) & 0x0200;
+		masku |= (maskt >> 1) & 0x0400;
+		masku |= (maskt << 1) & 0x0800;
+		masku |= (maskt << 3) & 0x1000;
+		masku |= (maskt << 5) & 0x2000;
+		masku |= (maskt & 0x0001) << 14;
+	}
+//////////////////////////////////////////////////////////////////////////////////////
 
-            a1fracldi = a1fupdate || (a1_add && a1addx == 3);
+/* The maskt terms define the area for changed data, but the byte
+inhibit terms can override these */
 
-            ADDRGEN(&address, &pixAddr, gena2i, zaddr,
-                  a1_x, a1_y, a1_base, a1_pitch, a1_pixsize, a1_width, a1_zoffset,
-                  a2_x, a2_y, a2_base, a2_pitch, a2_pixsize, a2_width, a2_zoffset);
+/*Mask[0-7]	:= AN2 (mask[0-7], masku[0-7], dbinh\[0]);
+Mask[8-14]	:= AN2H (mask[8-14], masku[8-14], dbinh\[1-7]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	mask = masku & (!(dbinh & 0x01) ? 0xFFFF : 0xFF00);
+	mask &= ~(((uint16_t)dbinh & 0x00FE) << 7);
+//////////////////////////////////////////////////////////////////////////////////////
 
-            //Here's my guess as to how the addresses get truncated to phrase boundaries in phrase mode...
-            if (!justify)
-               address &= 0xFFFFF8;
+/*Addql[0]	:= JOIN (addql[0], addq[0..1]);
+Addql[1]	:= JOIN (addql[1], addq[2..3]);
 
-            /* dstxp needed for dstart computation in dwrite */
-            dstxp = (dsta2 ? a2_x : a1_x) & 0x3F;
+Dsel0b[0-1]	:= BUF8 (dsel0b[0-1], data_sel[0]);
+Dsel1b[0-1]	:= BUF8 (dsel1b[0-1], data_sel[1]);
+Ddatlo		:= MX4 (ddatlo, patd[0], lfu[0], addql[0], zero32, dsel0b[0], dsel1b[0]);
+Ddathi		:= MX4 (ddathi, patd[1], lfu[1], addql[1], zero32, dsel0b[1], dsel1b[1]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	dmux[0] = patd_pre;
+	dmux[1] = lfu;
+	dmux[2] = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+	dmux[3] = 0;
+	ddat = dmux[data_sel];
+	}
+//////////////////////////////////////////////////////////////////////////////////////
 
-            if (sreadx)
-            {
-               PERF_INC(blitter_phrase_reads);
-#ifdef BENCH_PROFILE
-               blitter_did_io = 1;
-#endif
-               //uint32_t srcAddr, pixAddr;
-               //ADDRGEN(srcAddr, pixAddr, gena2i, zaddr,
-               //	a1_x, a1_y, a1_base, a1_pitch, a1_pixsize, a1_width, a1_zoffset,
-               //	a2_x, a2_y, a2_base, a2_pitch, a2_pixsize, a2_width, a2_zoffset);
-               srcd2 = srcd1;
-               srcd1 = ((uint64_t)JaguarReadLong(address + 0, BLITTER) << 32)
-                  | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-               //Kludge to take pixel size into account...
-               //Hmm. If we're not in phrase mode, this is most likely NOT going to be used...
-               //Actually, it would be--because of BCOMPEN expansion, for example...
-               if (!phrase_mode)
-               {
-                  if (bcompen)
-                     srcd1 >>= 56;
-                  else
-                  {
-                     if (pixsize == 5)
-                        srcd1 >>= 32;
-                     else if (pixsize == 4)
-                        srcd1 >>= 48;
-                     else
-                        srcd1 >>= 56;
-                  }
-               }//*/
-            }
+/*Zed_sel		:= AN2 (zed_sel, data_sel[0..1]);
+Zed_selb[0-1]	:= BUF8 (zed_selb[0-1], zed_sel);
 
-            if (szreadx)
-            {
-               srcz2 = srcz1;
-               srcz1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-            }
+Dat[0-7]	:= MX4 (dat[0-7],   dstdlo{0-7},   ddatlo{0-7},   dstzlo{0-7},   srczlo{0-7},   mask[0-7], zed_selb[0]);
+Dat[8-15]	:= MX4 (dat[8-15],  dstdlo{8-15},  ddatlo{8-15},  dstzlo{8-15},  srczlo{8-15},  mask[8],   zed_selb[0]);
+Dat[16-23]	:= MX4 (dat[16-23], dstdlo{16-23}, ddatlo{16-23}, dstzlo{16-23}, srczlo{16-23}, mask[9],   zed_selb[0]);
+Dat[24-31]	:= MX4 (dat[24-31], dstdlo{24-31}, ddatlo{24-31}, dstzlo{24-31}, srczlo{24-31}, mask[10],  zed_selb[0]);
+Dat[32-39]	:= MX4 (dat[32-39], dstdhi{0-7},   ddathi{0-7},   dstzhi{0-7},   srczhi{0-7},   mask[11],  zed_selb[1]);
+Dat[40-47]	:= MX4 (dat[40-47], dstdhi{8-15},  ddathi{8-15},  dstzhi{8-15},  srczhi{8-15},  mask[12],  zed_selb[1]);
+Dat[48-55]	:= MX4 (dat[48-55], dstdhi{16-23}, ddathi{16-23}, dstzhi{16-23}, srczhi{16-23}, mask[13],  zed_selb[1]);
+Dat[56-63]	:= MX4 (dat[56-63], dstdhi{24-31}, ddathi{24-31}, dstzhi{24-31}, srczhi{24-31}, mask[14],  zed_selb[1]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	*wdata = blitter_simd_ops.byte_merge(ddat, dstd, mask);
+	*srcz = blitter_simd_ops.byte_merge(*srcz, dstz, mask);
+//////////////////////////////////////////////////////////////////////////////////////
+
+/*Data_enab[0-1]	:= BUF8 (data_enab[0-1], data_ena);
+Datadrv[0-31]	:= TS (wdata[0-31],  dat[0-31],  data_enab[0]);
+Datadrv[32-63]	:= TS (wdata[32-63], dat[32-63], data_enab[1]);
+
+Unused[0]	:= DUMMY (unused[0]);
+
+END;*/
+}
+
+void BlitterMidsummer2(void)
+{
+   uint32_t cmd = (PERF_INC(blitter_calls), GET32(blitter_ram, COMMAND));
+
+
+   // Line states passed in via the command register
+
+   bool srcen = (SRCEN), srcenx = (SRCENX), srcenz = (SRCENZ),
+        dsten = (DSTEN), dstenz = (DSTENZ), dstwrz = (DSTWRZ), clip_a1 = (CLIPA1),
+        upda1 = (UPDA1), upda1f = (UPDA1F), upda2 = (UPDA2), dsta2 = (DSTA2),
+        gourd = (GOURD), gourz = (GOURZ), topben = (TOPBEN), topnen = (TOPNEN),
+        patdsel = (PATDSEL), adddsel = (ADDDSEL), cmpdst = (CMPDST), bcompen = (BCOMPEN),
+        dcompen = (DCOMPEN), bkgwren = (BKGWREN), srcshade = (SRCSHADE);
+
+   uint8_t zmode = (cmd & 0x01C0000) >> 18, lfufunc = (cmd & 0x1E00000) >> 21;
+   //Missing: BUSHI
+   //Where to find various lines:
+   // clip_a1  -> inner
+   // gourd    -> dcontrol, inner, outer, state
+   // gourz    -> dcontrol, inner, outer, state
+   // cmpdst   -> blit, data, datacomp, state
+   // bcompen  -> acontrol, inner, mcontrol, state
+   // dcompen  -> inner, state
+   // bkgwren  -> inner, state
+   // srcshade -> dcontrol, inner, state
+   // adddsel  -> dcontrol
+   //NOTE: ADDDSEL takes precedence over PATDSEL, PATDSEL over LFU_FUNC
+
+   // Lines that don't exist in Jaguar I (and will never be asserted)
+
+   bool polygon = false, datinit = false, a1_stepld = false, a2_stepld = false, ext_int = false;
+   bool istepadd = false, istepfadd = false;
+   bool zstepfadd = false, zstepadd = false;
+
+   // Various state lines (initial state--basically the reset state of the FDSYNCs)
+
+   bool go = true, idle = true, inner = false, a1fupdate = false, a1update = false,
+        zfupdate = false, zupdate = false, a2update = false, init_if = false, init_ii = false,
+        init_zf = false, init_zi = false;
+
+   bool outer0 = false, indone = false;
+
+   bool idlei, inneri, a1fupdatei, a1updatei, zfupdatei, zupdatei, a2updatei, init_ifi, init_iii,
+        init_zfi, init_zii;
+
+   bool notgzandp = !(gourz && polygon);
+
+
+   // Various registers set up by user
+
+   uint16_t ocount = GET16(blitter_ram, PIXLINECOUNTER);
+   uint8_t a1_pitch = blitter_ram[A1_FLAGS + 3] & 0x03;
+   uint8_t a2_pitch = blitter_ram[A2_FLAGS + 3] & 0x03;
+   uint8_t a1_pixsize = (blitter_ram[A1_FLAGS + 3] & 0x38) >> 3;
+   uint8_t a2_pixsize = (blitter_ram[A2_FLAGS + 3] & 0x38) >> 3;
+   uint8_t a1_zoffset = (GET16(blitter_ram, A1_FLAGS + 2) >> 6) & 0x07;
+   uint8_t a2_zoffset = (GET16(blitter_ram, A2_FLAGS + 2) >> 6) & 0x07;
+   uint8_t a1_width = (blitter_ram[A1_FLAGS + 2] >> 1) & 0x3F;
+   uint8_t a2_width = (blitter_ram[A2_FLAGS + 2] >> 1) & 0x3F;
+   uint8_t a1addx = blitter_ram[A1_FLAGS + 1] & 0x03, a2addx = blitter_ram[A2_FLAGS + 1] & 0x03;
+   bool a1addy = blitter_ram[A1_FLAGS + 1] & 0x04, a2addy = blitter_ram[A2_FLAGS + 1] & 0x04;
+   bool a1xsign = blitter_ram[A1_FLAGS + 1] & 0x08, a2xsign = blitter_ram[A2_FLAGS + 1] & 0x08;
+   bool a1ysign = blitter_ram[A1_FLAGS + 1] & 0x10, a2ysign = blitter_ram[A2_FLAGS + 1] & 0x10;
+   uint32_t a1_base = GET32(blitter_ram, A1_BASE) & 0xFFFFFFF8;	// Phrase aligned by ignoring bottom 3 bits
+   uint32_t a2_base = GET32(blitter_ram, A2_BASE) & 0xFFFFFFF8;
+
+   uint16_t a1_win_x = GET16(blitter_ram, A1_CLIP + 2) & 0x7FFF;
+   uint16_t a1_win_y = GET16(blitter_ram, A1_CLIP + 0) & 0x7FFF;
+   int16_t a1_x = (int16_t)GET16(blitter_ram, A1_PIXEL + 2);
+   int16_t a1_y = (int16_t)GET16(blitter_ram, A1_PIXEL + 0);
+   int16_t a1_step_x = (int16_t)GET16(blitter_ram, A1_STEP + 2);
+   int16_t a1_step_y = (int16_t)GET16(blitter_ram, A1_STEP + 0);
+   uint16_t a1_stepf_x = GET16(blitter_ram, A1_FSTEP + 2);
+   uint16_t a1_stepf_y = GET16(blitter_ram, A1_FSTEP + 0);
+   uint16_t a1_frac_x = GET16(blitter_ram, A1_FPIXEL + 2);
+   uint16_t a1_frac_y = GET16(blitter_ram, A1_FPIXEL + 0);
+   int16_t a1_inc_x = (int16_t)GET16(blitter_ram, A1_INC + 2);
+   int16_t a1_inc_y = (int16_t)GET16(blitter_ram, A1_INC + 0);
+   uint16_t a1_incf_x = GET16(blitter_ram, A1_FINC + 2);
+   uint16_t a1_incf_y = GET16(blitter_ram, A1_FINC + 0);
 
-            if (sread)
-            {
-               PERF_INC(blitter_phrase_reads);
-#ifdef BENCH_PROFILE
-               blitter_did_io = 1;
+   int16_t a2_x = (int16_t)GET16(blitter_ram, A2_PIXEL + 2);
+   int16_t a2_y = (int16_t)GET16(blitter_ram, A2_PIXEL + 0);
+#if 0
+   bool a2_mask = blitter_ram[A2_FLAGS + 2] & 0x80;
+   uint16_t a2_mask_x = GET16(blitter_ram, A2_MASK + 2);
+   uint16_t a2_mask_y = GET16(blitter_ram, A2_MASK + 0);
+   uint32_t collision = GET32(blitter_ram, COLLISIONCTRL);// 0=RESUME, 1=ABORT, 2=STOPEN
 #endif
-               srcd2 = srcd1;
-               srcd1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-               //Kludge to take pixel size into account...
-               if (!phrase_mode)
-               {
-                  if (bcompen)
-                     srcd1 >>= 56;
-                  else
-                  {
-                     if (pixsize == 5)
-                        srcd1 >>= 32;
-                     else if (pixsize == 4)
-                        srcd1 >>= 48;
-                     else
-                        srcd1 >>= 56;
-                  }
-               }
-            }
+   int16_t a2_step_x = (int16_t)GET16(blitter_ram, A2_STEP + 2);
+   int16_t a2_step_y = (int16_t)GET16(blitter_ram, A2_STEP + 0);
 
-            if (szread)
-            {
-               PERF_INC(blitter_phrase_reads);
-#ifdef BENCH_PROFILE
-               blitter_did_io = 1;
-#endif
-               srcz2 = srcz1;
-               srcz1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-               //Kludge to take pixel size into account... I believe that it only has to take 16BPP mode into account. Not sure tho.
-               if (!phrase_mode && pixsize == 4)
-                  srcz1 >>= 48;
+   uint64_t srcd1 = GET64(blitter_ram, SRCDATA);
+   uint64_t srcd2 = 0;
+   uint64_t dstd = GET64(blitter_ram, DSTDATA);
+   uint64_t patd = GET64(blitter_ram, PATTERNDATA);
+   uint32_t iinc = GET32(blitter_ram, INTENSITYINC);
+   uint64_t srcz1 = GET64(blitter_ram, SRCZINT);
+   uint64_t srcz2 = GET64(blitter_ram, SRCZFRAC);
+   uint64_t dstz = GET64(blitter_ram, DSTZ);
+   uint32_t zinc = GET32(blitter_ram, ZINC);
 
-            }
+   uint8_t pixsize = (dsta2 ? a2_pixsize : a1_pixsize);	// From ACONTROL
 
-            if (dread)
-            {
-               PERF_INC(blitter_phrase_reads);
-#ifdef BENCH_PROFILE
-               blitter_did_io = 1;
-#endif
-               dstd = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-               //Kludge to take pixel size into account...
-               if (!phrase_mode)
-               {
-                  if (pixsize == 5)
-                     dstd >>= 32;
-                  else if (pixsize == 4)
-                     dstd >>= 48;
-                  else
-                     dstd >>= 56;
-               }
-            }
+   bool phrase_mode;
+   uint16_t a1FracCInX = 0, a1FracCInY = 0;
 
-            if (dzread)
-            {
-               // Is Z always 64 bit read? Or sometimes 16 bit (dependent on phrase_mode)?
-               dstz = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-               //Kludge to take pixel size into account... I believe that it only has to take 16BPP mode into account. Not sure tho.
-               if (!phrase_mode && pixsize == 4)
-                  dstz >>= 48;
+   // Bugs in Jaguar I
 
-            }
+   a2addy = a1addy;							// A2 channel Y add bit is tied to A1's
 
-            // These vars should probably go further up in the code... !!! FIX !!!
-            // We can't preassign these unless they're static...
-            //NOTE: SRCSHADE requires GOURZ to be set to work properly--another Jaguar I bug
-            if (dwrite)
-            {
-#ifdef BENCH_PROFILE
-               blitter_did_io = 1;
-#endif
-               //Counter is done on the dwrite state...! (We'll do it first, since it affects dstart/dend calculations.)
-               //Here's the voodoo for figuring the correct amount of pixels in phrase mode (or not):
-               int8_t inct = (PERF_INC(blitter_phrase_writes), -((dsta2 ? a2_x : a1_x) & 0x07));	// From INNER_CNT
-               uint8_t inc = 0;
-               uint16_t oldicount;
-               uint8_t dstart = 0;
+   // Various state lines set up by user
 
-               inc = (!phrase_mode || (phrase_mode && (inct & 0x01)) ? 0x01 : 0x00);
-               inc |= (phrase_mode && (((pixsize == 3 || pixsize == 4) && (inct & 0x02)) || (pixsize == 5 && !(inct & 0x01))) ? 0x02 : 0x00);
-               inc |= (phrase_mode && ((pixsize == 3 && (inct & 0x04)) || (pixsize == 4 && !(inct & 0x03))) ? 0x04 : 0x00);
-               inc |= (phrase_mode && pixsize == 3 && !(inct & 0x07) ? 0x08 : 0x00);
+   phrase_mode = ((!dsta2 && a1addx == 0) || (dsta2 && a2addx == 0) ? true : false);	// From ACONTROL
 
-               oldicount = icount;	// Save icount to detect underflow...
-               icount -= inc;
+   // Stopgap vars to simulate various lines
 
-               if (icount == 0 || ((icount & 0x8000) && !(oldicount & 0x8000)))
-                  inner0 = true;
-               // X/Y stepping is also done here, I think...No. It's done when a1_add or a2_add is asserted...
 
-               //*********************************************************************************
-               //Start & end write mask computations...
-               //*********************************************************************************
+   while (true)
+   {
+      PERF_INC(blitter_outer);
+      // IDLE
+
+      if ((idle && !go) || (inner && outer0 && indone))
+      {
+         idlei = true;
 
+         //Instead of a return, let's try breaking out of the loop...
+         break;
+      }
+      else
+         idlei = false;
 
-               if (phrase_mode)
-               {
-                  if (pixsize == 3)
-                     dstart = (dstxp & 0x07) << 3;
-                  else if (pixsize == 4)
-                     dstart = (dstxp & 0x03) << 4;
-                  else if (pixsize == 5)
-                     dstart = (dstxp & 0x01) << 5;
-               }
-               else
-                  dstart    = pixAddr & 0x07;
+      // INNER LOOP ACTIVE
 
-               //This is the other Jaguar I bug... Normally, should ALWAYS select a1_x here.
-               dstxwr = (dsta2 ? a2_x : a1_x) & 0x7FFE;
-               pseq = dstxwr ^ (a1_win_x & 0x7FFE);
-               pseq = (pixsize == 5 ? pseq : pseq & 0x7FFC);
-               pseq = ((pixsize & 0x06) == 4 ? pseq : pseq & 0x7FF8);
-               penden = clip_a1 && (pseq == 0);
-               window_mask = 0;
+      if ((idle && go && !datinit)
+            || (inner && !indone)
+            || (inner && indone && !outer0 && !upda1f && !upda1 && notgzandp && !upda2 && !datinit)
+            || (a1update && !upda2 && notgzandp && !datinit)
+            || (zupdate && !upda2 && !datinit)
+            || (a2update && !datinit)
+            || (init_ii && !gourz)
+            || (init_zi))
+         inneri = true;
+      else
+         inneri = false;
 
-               if (penden)
-               {
-                  if (pixsize == 3)
-                     window_mask = (a1_win_x & 0x07) << 3;
-                  else if (pixsize == 4)
-                     window_mask = (a1_win_x & 0x03) << 4;
-                  else if (pixsize == 5)
-                     window_mask = (a1_win_x & 0x01) << 5;
-               }
-               else
-                  window_mask    = 0;
+      // A1 FRACTION UPDATE
 
-               /* The mask to be used if within one phrase of the end of the inner
-                  loop, similarly */
+      if (inner && indone && !outer0 && upda1f)
+         a1fupdatei = true;
+      else
+         a1fupdatei = false;
 
-               if (inner0)
-               {
-                  if (pixsize == 3)
-                     inner_mask = (icount & 0x07) << 3;
-                  else if (pixsize == 4)
-                     inner_mask = (icount & 0x03) << 4;
-                  else if (pixsize == 5)
-                     inner_mask = (icount & 0x01) << 5;
-               }
-               else
-                  inner_mask    = 0;
+      // A1 POINTER UPDATE
 
-               /* The actual mask used should be the
-                  lesser of the window masks and
-                  the inner mask, where is all cases 000 means 1000. */
-               window_mask = (window_mask == 0 ? 0x40 : window_mask);
-               inner_mask  = (inner_mask == 0 ? 0x40 : inner_mask);
+      if ((a1fupdate)
+            || (inner && indone && !outer0 && !upda1f && upda1))
+         a1updatei = true;
+      else
+         a1updatei = false;
 
-               emask       = (window_mask > inner_mask ? inner_mask : window_mask);
-               /* The mask to be used for the pixel size, to which must be added
-                  the bit offset */
-               pma = pixAddr + (1 << pixsize);
-               /* Select the mask */
-               dend = (phrase_mode ? emask : pma);
+      // Z FRACTION UPDATE
 
-               /* The cycle width in phrase mode is normally one phrase.  However,
-                  at the start and end it may be narrower.  The start and end masks
-                  are used to generate this.  The width is given by:
+      if ((a1update && gourz && polygon)
+            || (inner && indone && !outer0 && !upda1f && !upda1 && gourz && polygon))
+         zfupdatei = true;
+      else
+         zfupdatei = false;
 
-                  8 - start mask - (8 - end mask)
-                  =	end mask - start mask
+      // Z INTEGER UPDATE
 
-                  This is only used for writes in phrase mode.
-                  Start and end from the address level of the pipeline are used.
-                  */
+      if (zfupdate)
+         zupdatei = true;
+      else
+         zupdatei = false;
 
-               //Phrase mode needs destination data for start/end mask byte merging,
-               //but NOT when bkgwren is set (hardware uses DSTDATA register value).
-               if (phrase_mode && !dsten && !bkgwren)
-                  dstd = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+      // A2 POINTER UPDATE
 
-               // Write data combines srcd and dstd through ADDDSEL, PATDSEL, or LFU.
-               // Precedence is ADDDSEL > PATDSEL > LFU.
+      if ((a1update && upda2 && notgzandp)
+            || (zupdate && upda2)
+            || (inner && indone && !outer0 && !upda1f && notgzandp && !upda1 && upda2))
+         a2updatei = true;
+      else
+         a2updatei = false;
 
-               // srcd2 = xxxx xxxx 0123 4567, srcd = 8901 2345 xxxx xxxx, srcshift = $20 (32)
-               srcd = (srcd2 << (64 - srcshift)) | (srcd1 >> srcshift);
-               //bleh, ugly ugly ugly
-               if (srcshift == 0)
-                  srcd = srcd1;
+      // INITIALIZE INTENSITY FRACTION
 
-               //NOTE: This only works with pixel sizes less than 8BPP...
-               //DOUBLE NOTE: Still need to do regression testing to ensure that this doesn't break other stuff... !!! CHECK !!!
-               if (!phrase_mode && srcshift != 0)
-                  srcd = ((srcd2 & 0xFF) << (8 - srcshift)) | ((srcd1 & 0xFF) >> srcshift);
+      if ((zupdate && !upda2 && datinit)
+            || (a1update && !upda2 && datinit && notgzandp)
+            || (inner && indone && !outer0 && !upda1f && !upda1 && notgzandp && !upda2 && datinit)
+            || (a2update && datinit)
+            || (idle && go && datinit))
+         init_ifi = true;
+      else
+         init_ifi = false;
 
-               //Z DATA() stuff done here... And it has to be done before any Z shifting...
-               //Note that we need to have phrase mode start/end support here... (Not since we moved it from dzwrite...!)
-               /*
-                  Here are a couple of Cybermorph blits with Z:
-                  $00113078	// DSTEN DSTENZ DSTWRZ CLIP_A1 GOURD GOURZ PATDSEL ZMODE=4
-                  $09900F39	// SRCEN DSTEN DSTENZ DSTWRZ UPDA1 UPDA1F UPDA2 DSTA2 ZMODE=4 LFUFUNC=C DCOMPEN
+      // INITIALIZE INTENSITY INTEGER
 
-                  We're having the same phrase mode overwrite problem we had with the pixels... !!! FIX !!!
-                  Odd. It's equating 0 with 0... Even though ZMODE is $04 (less than)!
-                  */
-               if (gourz)
-               {
-                  uint16_t addq[4];
-                  uint8_t initcin[4] = { 0, 0, 0, 0 };
-                  ADDARRAY(addq, 7/*daddasel*/, 6/*daddbsel*/, 0/*daddmode*/, 0, 0, initcin, 0, 0, 0, 0, 0, srcz1, srcz2, zinc, 0);
-                  srcz2 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
-                  ADDARRAY(addq, 6/*daddasel*/, 7/*daddbsel*/, 1/*daddmode*/, 0, 0, initcin, 0, 0, 0, 0, 0, srcz1, srcz2, zinc, 0);
-                  srcz1 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+      if (init_if)
+         init_iii = true;
+      else
+         init_iii = false;
 
-               }
+      // INITIALIZE Z FRACTION
 
-               zSrcShift = srcshift & 0x30;
-               srcz = (srcz2 << (64 - zSrcShift)) | (srcz1 >> zSrcShift);
-               //bleh, ugly ugly ugly
-               if (zSrcShift == 0)
-                  srcz = srcz1;
+      if (init_ii && gourz)
+         init_zfi = true;
+      else
+         init_zfi = false;
 
+      // INITIALIZE Z INTEGER
 
-               //When in SRCSHADE mode, it adds the IINC to the read source (from LFU???)
-               //According to following line, it gets LFU mode. But does it feed the source into the LFU
-               //after the add?
-               //Dest write address/pix address: 0014E83E/0 [dstart=0 dend=10 pwidth=8 srcshift=0][daas=4 dabs=5 dam=7 ds=1 daq=F] [0000000000006505] (icount=003F, inc=1)
-               //Let's try this:
-               if (srcshade)
-               {
-                  uint16_t addq[4];
-                  uint8_t initcin[4] = { 0, 0, 0, 0 };
-                  uint32_t iinc_masked = iinc & 0x00FFFFFF;
-                  ADDARRAY(addq, 4/*daddasel*/, 5/*daddbsel*/, 7/*daddmode*/, dstd, iinc_masked, initcin, 0, 0, 0, patd, srcd, 0, 0, 0, 0);
-                  srcd = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
-               }
+      if (init_zf)
+         init_zii = true;
+      else
+         init_zii = false;
 
-               /* DCONTROL: compute data adder signals.  Moved here from
-                  the per-iteration scope since they are only consumed
-                  during dwrite (dwrite=true, dzwrite=false here). */
-               shadeadd = srcshade;
-               daddasel = (gourd ? 0x01 : 0x00);
-               daddasel |= ((gourd || gourz || srcshade) ? 0x04 : 0x00);
-               daddbsel = (gourd || srcshade ? 0x01 : 0x00);
-               daddbsel |= (gourd || srcshade ? 0x04 : 0x00);
-               /* daddmode bit 0: NAND tree (dcontrol.v:130-146) makes
-                  bit 0 always 1 when dwrite&&gourd, !gourd&&!gourz,
-                  or shadeadd. */
-               daddmode = (gourd || (!gourd && !gourz) || shadeadd ? 0x01 : 0x00);
-               daddmode |= ((gourd && !topben && !ext_int)
-                     || (!gourd && !gourz && !topben) || (shadeadd && !topben) ? 0x02 : 0x00);
-               daddmode |= ((!gourd && !gourz) || shadeadd || (gourd && ext_int) ? 0x04 : 0x00);
-               patfadd = gourd;
-               patdadd = gourd;
-               srcz2add = false;
-               daddq_sel = gourd;
-               data_sel = ((!patdsel && !adddsel) ? 0x01 : 0x00)
-                  | (adddsel ? 0x02 : 0x00);
+      // Here we move the fooi into their foo counterparts in order to simulate the moving
+      // of data into the various FDSYNCs... Each time we loop we simulate one clock cycle...
 
-               if (patfadd)
-               {
-                  uint16_t addq[4];
-                  uint8_t initcin[4] = { 0, 0, 0, 0 };
-                  ADDARRAY(addq, 4/*daddasel*/, 4/*daddbsel*/, 0/*daddmode*/, dstd, iinc, initcin, 0, 0, 0, patd, srcd, 0, 0, 0, 0);
-                  srcd1 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
-               }
+      idle = idlei;
+      inner = inneri;
+      a1fupdate = a1fupdatei;
+      a1update = a1updatei;
+      zfupdate = zfupdatei;		// *
+      zupdate = zupdatei;			// *
+      a2update = a2updatei;
+      init_if = init_ifi;			// *
+      init_ii = init_iii;			// *
+      init_zf = init_zfi;			// *
+      init_zi = init_zii;			// *
+      // * denotes states that will never assert for Jaguar I
 
-               /* atick[0]/[1] two-phase pipeline: fractional intensity/Z update
-                  runs in the patfadd/srcz2add block above (Phase 0), integer
-                  update runs via DATA→patdadd below (Phase 1).  The dbinh
-                  param below is overwritten inside DATA by COMP_CTRL. */
+      // Now, depending on how we want to handle things, we could either put the implementation
+      // of the various pieces up above, or handle them down below here.
 
-               DATA(&wdata, &dcomp, &zcomp, &winhibit,
-                     true, cmpdst, daddasel, daddbsel, daddmode, daddq_sel, data_sel, 0/*dbinh*/,
-                     dend, dstart, dstd, iinc, lfufunc, &patd, patdadd,
-                     phrase_mode, srcd, false/*srcdread*/, false/*srczread*/, srcz2add, zmode,
-                     bcompen, bkgwren, dcompen, icount & 0x07, pixsize,
-                     &srcz, dstz, zinc);
+      // Let's try postprocessing for now...
 
-               /*
-                  DEF ADDRCOMP (
-                  a1_outside	// A1 pointer is outside window bounds
-                  :OUT;
-                  INT16/	a1_x
-                  INT16/	a1_y
-                  INT15/	a1_win_x
-                  INT15/	a1_win_y
-                  :IN);
-                  BEGIN
+      if (inner)
+      {
+         bool idle_inner = true, sreadx = false, szreadx = false, sread = false,
+              szread = false, dread = false, dzread = false, dwrite = false, dzwrite = false;
+         bool inner0 = false;
+         bool idle_inneri, sreadxi, szreadxi, sreadi, szreadi, dreadi, dzreadi, dwritei, dzwritei;
+         //other stuff
+         uint8_t srcshift = 0;
+         uint16_t icount = GET16(blitter_ram, PIXLINECOUNTER + 2);
+         bool srca_addi, dsta_addi, gensrc, gendst, gena2i, zaddr, fontread, justify, a1_add, a2_add;
+         bool adda_yconst, addareg, suba_x, suba_y, a1fracldi, shadeadd;
+         uint8_t addasel, a1_xconst, a2_xconst, adda_xconst, addbsel, maska1, maska2, modx, daddasel;
+         uint8_t daddbsel, daddmode;
+         bool patfadd, patdadd, srcz2add, daddq_sel;
+         uint8_t data_sel;
+         uint32_t address, pixAddr;
+         uint8_t dstxp;
+         uint64_t srcz;
+         bool winhibit;
 
-               // The address is outside if negative, or if greater than or equal
-               // to the window size
+         indone = false;
 
-A1_xcomp	:= MAG_15 (a1xgr, a1xeq, a1xlt, a1_x{0..14}, a1_win_x{0..14});
-A1_ycomp	:= MAG_15 (a1ygr, a1yeq, a1ylt, a1_y{0..14}, a1_win_y{0..14});
-A1_outside	:= OR6 (a1_outside, a1_x{15}, a1xgr, a1xeq, a1_y{15}, a1ygr, a1yeq);
-*/
-               //NOTE: There seems to be an off-by-one bug here in the clip_a1 section... !!! FIX !!!
-               //      Actually, seems to be related to phrase mode writes...
-               //      Or is it? Could be related to non-15-bit compares as above?
-               if (clip_a1 && ((a1_x & 0x8000) || (a1_y & 0x8000) || (a1_x >= a1_win_x) || (a1_y >= a1_win_y)))
-                  winhibit = true;
+         /* Precompute address constants (invariant during inner loop) */
+         a1_xconst = 6 - a1_pixsize;
+         a2_xconst = 6 - a2_pixsize;
+         if (a1addx == 1)
+            a1_xconst = 0;
+         else if (a1addx & 0x02)
+            a1_xconst = 7;
+         if (a2addx == 1)
+            a2_xconst = 0;
+         else if (a2addx & 0x02)
+            a2_xconst = 7;
 
+         /* Precompute srcshift — loaded on first inner cycle (sshftld),
+            then held constant for all subsequent cycles. */
+         {
+            uint8_t dstxp0, srcxp0, shftv0, pobb0, loshd0;
+            bool pobbsel0;
 
-               if (!winhibit || bkgwren)
-               {
-                  if (phrase_mode)
-                  {
-                     JaguarWriteLong(address + 0, wdata >> 32, BLITTER);
-                     JaguarWriteLong(address + 4, wdata & 0xFFFFFFFF, BLITTER);
-                  }
-                  else
-                  {
-                     if (pixsize == 5)
-                        JaguarWriteLong(address, wdata & 0xFFFFFFFF, BLITTER);
-                     else if (pixsize == 4)
-                        JaguarWriteWord(address, wdata & 0x0000FFFF, BLITTER);
-                     else
-                        JaguarWriteByte(address, wdata & 0x000000FF, BLITTER);
-                  }
-               }
+            dstxp0 = (dsta2 ? a2_x : a1_x) & 0x3F;
+            srcxp0 = (dsta2 ? a1_x : a2_x) & 0x3F;
+            shftv0 = ((dstxp0 - srcxp0) << pixsize) & 0x3F;
+            pobb0 = 0;
+            if (pixsize == 3)
+               pobb0 = dstxp0 & 0x07;
+            else if (pixsize == 4)
+               pobb0 = dstxp0 & 0x03;
+            else if (pixsize == 5)
+               pobb0 = dstxp0 & 0x01;
 
-            }
+            pobbsel0 = phrase_mode && bcompen;
+            loshd0 = (pobbsel0 ? pobb0 : shftv0) & 0x07;
+            srcshift = (srcen || pobbsel0 ? loshd0 : 0);
+            srcshift |= (srcen && phrase_mode ? shftv0 & 0x38 : 0);
+         }
 
-            if (dzwrite)
-            {
-               PERF_INC(blitter_phrase_writes);
+         while (true)
+         {
 #ifdef BENCH_PROFILE
-               blitter_did_io = 1;
+            int blitter_did_io = 0;
 #endif
-               // OK, here's the big insight: When NOT in GOURZ mode, srcz1 & 2 function EXACTLY the same way that
-               // srcd1 & 2 work--there's an implicit shift from srcz1 to srcz2 whenever srcz1 is read.
-               // OTHERWISE, srcz1 is the integer for the computed Z and srcz2 is the fractional part.
-               // Writes to srcz1 & 2 follow the same pattern as the other 64-bit registers--low 32 at the low address,
-               // high 32 at the high address (little endian!).
-               // NOTE: GOURZ is still not properly supported. Check patd/patf handling...
-               //       Phrase mode start/end masks are not properly supported either...
-               //This is not correct... !!! FIX !!!
-               //Should be OK now... We'll see...
-               //Nope. Having the same starstep write problems in phrase mode as we had with pixels... !!! FIX !!!
-               //This is not causing the problem in Hover Strike... :-/
-               //The problem was with the SREADX not shifting. Still problems with Z comparisons & other text in pregame screen...
-               if (!winhibit)
-               {
-                  if (phrase_mode)
-                  {
-                     JaguarWriteLong(address + 0, srcz >> 32, BLITTER);
-                     JaguarWriteLong(address + 4, srcz & 0xFFFFFFFF, BLITTER);
-                  }
-                  else
-                  {
-                     if (pixsize == 4)
-                        JaguarWriteWord(address, srcz & 0x0000FFFF, BLITTER);
-                  }
-               }//*/
+            /* PERF_INC embedded via comma operator to keep C89 decl
+             * order valid (no statements before declarations).  */
+            uint16_t dstxwr = (PERF_INC(blitter_inner), 0), pseq;
+            bool penden;
+            uint8_t window_mask;
+            uint8_t inner_mask = 0;
+            uint8_t emask, pma, dend;
+            uint64_t srcd;
+            uint8_t zSrcShift;
+            uint64_t wdata;
+            uint8_t dcomp, zcomp;
+
+            //NOTE: sshftld probably is only asserted at the beginning of the inner loop. !!! FIX !!!
+            /* State machine: step is always true (no bus contention in
+               Jaguar I), textext/txtread never assert. Both eliminated. */
+
+            if ((dzwrite && inner0)
+                  || (dwrite && !dstwrz && inner0))
+            {
+               idle_inneri = true;
+               break;
             }
+            else
+               idle_inneri = false;
 
+            sreadxi = (idle_inner && srcenx);
+            szreadxi = (sreadx && srcenz);
 
-            if (a1_add)
-            {
-               int16_t adda_x, adda_y, addb_x, addb_y, addq_x, addq_y;
-               ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
-                     a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
-               ADDBMUX(&addb_x, &addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
-               ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
+            sreadi = (szreadx
+                  || (sreadx && !srcenz && srcen)
+                  || (idle_inner && !srcenx && srcen)
+                  || (dzwrite && !inner0 && srcen)
+                  || (dwrite && !dstwrz && !inner0 && srcen));
 
-               //Now, write to what???
-               //a2ptrld comes from a2ptrldi...
-               //I believe it's addbsel that determines the writeback...
-               // This is where atick[0] & [1] come in, in determining which part (fractional, integer)
-               // gets written to...
-               //a1_x = addq_x;
-               //a1_y = addq_y;
-               //Kludge, to get A1 channel increment working...
-               if (a1addx == 3)
-               {
-                  a1_frac_x = addq_x, a1_frac_y = addq_y;
+            szreadi = (sread && srcenz);
 
-                  addasel = 2, addbsel = 0, a1fracldi = false;
-                  ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
-                        a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
-                  ADDBMUX(&addb_x,&addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
-                  ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
+            dreadi = ((szread && dsten)
+                  || (sread && !srcenz && dsten)
+                  || (sreadx && !srcenz && !srcen && dsten)
+                  || (idle_inner && !srcenx && !srcen && dsten)
+                  || (dzwrite && !inner0 && !srcen && dsten)
+                  || (dwrite && !dstwrz && !inner0 && !srcen && dsten));
 
-                  a1_x = addq_x, a1_y = addq_y;
-               }
-               else
-                  a1_x = addq_x, a1_y = addq_y;
-            }
+            dzreadi = ((dread && dstenz)
+                  || (szread && !dsten && dstenz)
+                  || (sread && !srcenz && !dsten && dstenz)
+                  || (sreadx && !srcenz && !srcen && !dsten && dstenz)
+                  || (idle_inner && !srcenx && !srcen && !dsten && dstenz)
+                  || (dzwrite && !inner0 && !srcen && !dsten && dstenz)
+                  || (dwrite && !dstwrz && !inner0 && !srcen && !dsten && dstenz));
 
-            if (a2_add)
-            {
-               int16_t adda_x, adda_y, addb_x, addb_y, addq_x, addq_y;
-               ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
-                     a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
-               ADDBMUX(&addb_x, &addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
-               ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
+            dwritei = (dzread
+                  || (dread && !dstenz)
+                  || (szread && !dsten && !dstenz)
+                  || (sread && !srcenz && !dsten && !dstenz)
+                  || (sreadx && !srcenz && !srcen && !dsten && !dstenz)
+                  || (idle_inner && !srcenx && !srcen && !dsten && !dstenz)
+                  || (dzwrite && !inner0 && !srcen && !dsten && !dstenz)
+                  || (dwrite && !dstwrz && !inner0 && !srcen && !dsten && !dstenz));
 
-               //Now, write to what???
-               //a2ptrld comes from a2ptrldi...
-               //I believe it's addbsel that determines the writeback...
-               a2_x = addq_x;
-               a2_y = addq_y;
-            }
-#ifdef BENCH_PROFILE
-            if (blitter_did_io) PERF_INC(blitter_inner_io);
-            else                PERF_INC(blitter_inner_idle);
-#endif
-         }
+            dzwritei = (dwrite && dstwrz);
 
-         indone = true;
-         // The outer counter is updated here as well on the clock cycle...
+            // Here we move the fooi into their foo counterparts in order to simulate the moving
+            // of data into the various FDSYNCs... Each time we loop we simulate one clock cycle...
 
-         /* the inner loop is started whenever another state is about to
-            cause the inner state to go active */
-         //Instart		:= ND7 (instart, innert[0], innert[2..7]);
+            idle_inner = idle_inneri;
+            sreadx = sreadxi;
+            szreadx = szreadxi;
+            sread = sreadi;
+            szread = szreadi;
+            dread = dreadi;
+            dzread = dzreadi;
+            dwrite = dwritei;
+            dzwrite = dzwritei;
 
-         //Actually, it's done only when inner gets asserted without the 2nd line of conditions
-         //(inner AND !indone)
-         //fixed now...
-         //Since we don't get here until the inner loop is finished (indone = true) we can get
-         //away with doing it here...!
-         ocount--;
+            // Here's a few more decodes--not sure if they're supposed to go here or not...
 
-         if (ocount == 0)
-            outer0 = true;
-      }
 
-      if (a1fupdate)
-      {
-         uint32_t a1_frac_xt = (uint32_t)a1_frac_x + (uint32_t)a1_stepf_x;
-         uint32_t a1_frac_yt = (uint32_t)a1_frac_y + (uint32_t)a1_stepf_y;
-         a1FracCInX = a1_frac_xt >> 16;
-         a1FracCInY = a1_frac_yt >> 16;
-         a1_frac_x = (uint16_t)(a1_frac_xt & 0xFFFF);
-         a1_frac_y = (uint16_t)(a1_frac_yt & 0xFFFF);
-      }
+            srca_addi = (sreadxi && !srcenz) || (sreadi && !srcenz) || szreadxi || szreadi;
 
-      if (a1update)
-      {
-         a1_x += a1_step_x + a1FracCInX;
-         a1_y += a1_step_y + a1FracCInY;
-      }
+            dsta_addi = (dwritei && !dstwrz) || dzwritei;
 
-      if (a2update)
-      {
-         a2_x += a2_step_x;
-         a2_y += a2_step_y;
-      }
-   }
+            gensrc = sreadxi || szreadxi || sreadi || szreadi;
+            gendst = dreadi || dzreadi || dwritei || dzwritei;
+            gena2i = (gensrc && !dsta2) || (gendst && dsta2);
 
-   // Write values back to registers (in real blitter, these are continuously updated)
-   SET16(blitter_ram, A1_PIXEL + 2, a1_x);
-   SET16(blitter_ram, A1_PIXEL + 0, a1_y);
-   SET16(blitter_ram, A1_FPIXEL + 2, a1_frac_x);
-   SET16(blitter_ram, A1_FPIXEL + 0, a1_frac_y);
-   SET16(blitter_ram, A2_PIXEL + 2, a2_x);
-   SET16(blitter_ram, A2_PIXEL + 0, a2_y);
+            zaddr = szreadx || szread || dzread || dzwrite;
 
-}
+            // Some stuff from MCONTROL.NET--not sure if this is the correct use of this decode or not...
+            /*Fontread\	:= OND1 (fontread\, sread[1], sreadx[1], bcompen);
+Fontread	:= INV1 (fontread, fontread\);
+Justt		:= NAN3 (justt, fontread\, phrase_mode, tactive\);
+Justify		:= TS (justify, justt, busen);*/
+            fontread = (sread || sreadx) && bcompen;
+            justify = !(!fontread && phrase_mode /*&& tactive*/);
 
-// Various pieces of the blitter puzzle are teased out here...
+            /* Generate inner loop update enables */
+            /*
+A1_addi		:= MX2 (a1_addi, dsta_addi, srca_addi, dsta2);
+A2_addi		:= MX2 (a2_addi, srca_addi, dsta_addi, dsta2);
+A1_add		:= FD1 (a1_add, a1_add\, a1_addi, clk);
+A2_add		:= FD1 (a2_add, a2_add\, a2_addi, clk);
+A2_addb		:= BUF1 (a2_addb, a2_add);
+*/
+            a1_add = (dsta2 ? srca_addi : dsta_addi);
+            a2_add = (dsta2 ? dsta_addi : srca_addi);
 
-void ADDRGEN(uint32_t *address, uint32_t *pixa, bool gena2, bool zaddr,
-	uint16_t a1_x, uint16_t a1_y, uint32_t a1_base, uint8_t a1_pitch, uint8_t a1_pixsize, uint8_t a1_width, uint8_t a1_zoffset,
-	uint16_t a2_x, uint16_t a2_y, uint32_t a2_base, uint8_t a2_pitch, uint8_t a2_pixsize, uint8_t a2_width, uint8_t a2_zoffset)
-{
-	uint16_t x = (gena2 ? a2_x : a1_x) & 0xFFFF;	// Actually uses all 16 bits to generate address...!
-	uint16_t y = (gena2 ? a2_y : a1_y) & 0x0FFF;
-	uint8_t width = (gena2 ? a2_width : a1_width);
-	uint8_t pixsize = (gena2 ? a2_pixsize : a1_pixsize);
-	uint8_t pitch = (gena2 ? a2_pitch : a1_pitch);
-	uint32_t base = (gena2 ? a2_base : a1_base) >> 3;//Only upper 21 bits are passed around the bus? Seems like it...
-	uint8_t zoffset = (gena2 ? a2_zoffset : a1_zoffset);
+            /* Address adder input A register selection
+               000	A1 step integer part
+               001	A1 step fraction part
+               010	A1 increment integer part
+               011	A1 increment fraction part
+               100	A2 step
 
-	uint32_t ytm = ((uint32_t)y << 2) + ((width & 0x02) ? (uint32_t)y << 1 : 0) + ((width & 0x01) ? (uint32_t)y : 0);
+               bit 2 = a2update
+               bit 1 = /a2update . (a1_add . a1addx[0..1])
+               bit 0 = /a2update . ( a1fupdate
+               + a1_add . atick[0] . a1addx[0..1])
+               The /a2update term on bits 0 and 1 is redundant.
+               Now look-ahead based
+               */
 
-	uint32_t ya = (ytm << (width >> 2)) >> 2;
+            addasel = (a1fupdate || (a1_add && a1addx == 3) ? 0x01 : 0x00);
+            addasel |= (a1_add && a1addx == 3 ? 0x02 : 0x00);
+            addasel |= (a2update ? 0x04 : 0x00);
+            /* Address adder input A X constant selection
+               adda_xconst[0..2] generate a power of 2 in the range 1-64 or all
+               zeroes when they are all 1
+               Remember - these are pixels, so to add one phrase the pixel size
+               has to be taken into account to get the appropriate value.
+               for A1
+               if a1addx[0..1] are 00 set 6 - pixel size
+               if a1addx[0..1] are 01 set the value 000
+               if a1addx[0..1] are 10 set the value 111
+               similarly for A2
+JLH: Also, 11 will likewise set the value to 111
+*/
+            adda_xconst = (a2_add ? a2_xconst : a1_xconst);
+            /* Address adder input A Y constant selection
+               22 June 94 - This was erroneous, because only the a1addy bit was reflected here.
+               Therefore, the selection has to be controlled by a bug fix bit.
+JLH: Bug fix bit in Jaguar II--not in Jaguar I!
+*/
+            adda_yconst = a1addy;
+            /* Address adder input A register versus constant selection
+               given by	  a1_add . a1addx[0..1]
+               + a1update
+               + a1fupdate
+               + a2_add . a2addx[0..1]
+               + a2update
+               */
+            addareg = ((a1_add && a1addx == 3) || a1update || a1fupdate
+                  || (a2_add && a2addx == 3) || a2update ? true : false);
+            /* The adders can be put into subtract mode in add pixel size
+               mode when the corresponding flags are set */
+            suba_x = ((a1_add && a1xsign && a1addx == 1) || (a2_add && a2xsign && a2addx == 1) ? true : false);
+            suba_y = ((a1_add && a1addy && a1ysign) || (a2_add && a2addy && a2ysign) ? true : false);
+            /* Address adder input B selection
+               00	A1 pointer
+               01	A2 pointer
+               10	A1 fraction
+               11	Zero
 
-	uint32_t pa = ya + x;
-   uint8_t pt, za;
-   uint32_t phradr, shup, addr;
+               Bit 1 =   a1fupdate
+               + (a1_add . atick[0] . a1addx[0..1])
+               + a1fupdate . a1_stepld
+               + a1update . a1_stepld
+               + a2update . a2_stepld
+               Bit 0 =   a2update + a2_add
+               + a1fupdate . a1_stepld
+               + a1update . a1_stepld
+               + a2update . a2_stepld
+               */
+            addbsel = (a2update || a2_add || (a1fupdate && a1_stepld)
+                  || (a1update && a1_stepld) || (a2update && a2_stepld) ? 0x01 : 0x00);
+            addbsel |= (a1fupdate || (a1_add && a1addx == 3) || (a1fupdate && a1_stepld)
+                  || (a1update && a1_stepld) || (a2update && a2_stepld) ? 0x02 : 0x00);
+
+            /* The modulo bits are used to align X onto a phrase boundary when
+               it is being updated by one phrase
+               000	no mask
+               001	mask bit 0
+               010	mask bits 1-0
+               ..
+               110  	mask bits 5-0
+
+               Masking is enabled for a1 when a1addx[0..1] is 00, and the value
+               is 6 - the pixel size (again!)
+               */
+            maska1 = (a1_add && a1addx == 0 ? 6 - a1_pixsize : 0);
+            maska2 = (a2_add && a2addx == 0 ? 6 - a2_pixsize : 0);
+            modx = (a2_add ? maska2 : maska1);
+            /* Generate load strobes for the increment updates */
+
+            /*A1pldt		:= NAN2 (a1pldt, atick[1], a1_add);
+A1ptrldi	:= NAN2 (a1ptrldi, a1update\, a1pldt);
 
-	*pixa = pa << pixsize;
+A1fldt		:= NAN4 (a1fldt, atick[0], a1_add, a1addx[0..1]);
+A1fracldi	:= NAN2 (a1fracldi, a1fupdate\, a1fldt);
 
-	pt = ((pitch & 0x01) && !(pitch & 0x02) ? 0x01 : 0x00)
-		| (!(pitch & 0x01) && (pitch & 0x02) ? 0x02 : 0x00);
-	phradr = (*pixa >> 6) << pt;
-	shup = (pitch == 0x03 ? (*pixa >> 6) : 0);
+A2pldt		:= NAN2 (a2pldt, atick[1], a2_add);
+A2ptrldi	:= NAN2 (a2ptrldi, a2update\, a2pldt);*/
 
-	za = (zaddr ? zoffset : 0) & 0x03;
-	addr = za + phradr + (shup << 1) + base;
-	*address = ((*pixa & 0x38) >> 3) | ((addr & 0x1FFFFF) << 3);
-	*pixa &= 0x07;
-}
+            a1fracldi = a1fupdate || (a1_add && a1addx == 3);
 
-////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////
-// Here's an important bit: The source data adder logic. Need to track down the inputs!!! //
-////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////
+            ADDRGEN(&address, &pixAddr, gena2i, zaddr,
+                  a1_x, a1_y, a1_base, a1_pitch, a1_pixsize, a1_width, a1_zoffset,
+                  a2_x, a2_y, a2_base, a2_pitch, a2_pixsize, a2_width, a2_zoffset);
 
-void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode,
-	uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix,
-	uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2,
-	uint32_t zinc, uint32_t zstep)
-{
-   unsigned i;
-   uint16_t adda[4];
-   uint16_t addb[4];
-   uint64_t adda_val;
-   uint32_t initpix2;
-   uint16_t word;
-   uint8_t cinsel;
-   static uint8_t co[4]; /* preserved between calls */
-   uint8_t cin[4];
-   bool eightbit;
-   bool sat, hicinh;
-   uint8_t bsel_idx;
+            //Here's my guess as to how the addresses get truncated to phrase boundaries in phrase mode...
+            if (!justify)
+               address &= 0xFFFFF8;
 
-   initpix2 = ((uint32_t)initpix << 16) | initpix;
+            /* dstxp needed for dstart computation in dwrite */
+            dstxp = (dsta2 ? a2_x : a1_x) & 0x3F;
 
-   /* Select adda source directly (replaces 8-element addalo/addahi arrays) */
-   switch (daddasel)
-   {
-      case 0:  adda_val = dstd; break;
-      case 1:  adda_val = ((uint64_t)initpix2 << 32) | initpix2; break;
-      case 2:  /* fall through */
-      case 3:  adda_val = 0; break;
-      case 4:  adda_val = srcd; break;
-      case 5:  adda_val = patd; break;
-      case 6:  adda_val = srcz1; break;
-      default: adda_val = srcz2; break;
-   }
-   adda[0] = (uint16_t)adda_val;
-   adda[1] = (uint16_t)(adda_val >> 16);
-   adda[2] = (uint16_t)(adda_val >> 32);
-   adda[3] = (uint16_t)(adda_val >> 48);
+            if (sreadx)
+            {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               //uint32_t srcAddr, pixAddr;
+               //ADDRGEN(srcAddr, pixAddr, gena2i, zaddr,
+               //	a1_x, a1_y, a1_base, a1_pitch, a1_pixsize, a1_width, a1_zoffset,
+               //	a2_x, a2_y, a2_base, a2_pitch, a2_pixsize, a2_width, a2_zoffset);
+               srcd2 = srcd1;
+               srcd1 = ((uint64_t)JaguarReadLong(address + 0, BLITTER) << 32)
+                  | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+               //Kludge to take pixel size into account...
+               //Hmm. If we're not in phrase mode, this is most likely NOT going to be used...
+               //Actually, it would be--because of BCOMPEN expansion, for example...
+               if (!phrase_mode)
+               {
+                  if (bcompen)
+                     srcd1 >>= 56;
+                  else
+                  {
+                     if (pixsize == 5)
+                        srcd1 >>= 32;
+                     else if (pixsize == 4)
+                        srcd1 >>= 48;
+                     else
+                        srcd1 >>= 56;
+                  }
+               }//*/
+            }
 
-   /* Select addb source (replaces wordmux array + dbsel2/iincsel logic) */
-   if (!(daddbsel & 0x04))
-   {
-      if (daddbsel & 0x01)
-      {
-         addb[0] = (uint16_t)initinc;
-         addb[1] = (uint16_t)(initinc >> 16);
-         addb[2] = (uint16_t)(initinc >> 32);
-         addb[3] = (uint16_t)(initinc >> 48);
-      }
-      else
-      {
-         addb[0] = (uint16_t)srcd;
-         addb[1] = (uint16_t)(srcd >> 16);
-         addb[2] = (uint16_t)(srcd >> 32);
-         addb[3] = (uint16_t)(srcd >> 48);
-      }
-   }
-   else
-   {
-      bsel_idx = ((daddbsel & 0x08) >> 1) | (daddbsel & 0x03);
-      switch (bsel_idx)
-      {
-         case 0: word = iinc & 0xFFFF; break;
-         case 1: word = iinc >> 16; break;
-         case 2: word = zinc & 0xFFFF; break;
-         case 3: word = zinc >> 16; break;
-         case 4: word = istep & 0xFFFF; break;
-         case 5: word = istep >> 16; break;
-         case 6: word = zstep & 0xFFFF; break;
-         default: word = zstep >> 16; break;
-      }
-      addb[0] = addb[1] = addb[2] = addb[3] = word;
-   }
+            if (szreadx)
+            {
+               srcz2 = srcz1;
+               srcz1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+            }
 
-   /* Hardware: cinsel = (daddmode[0] | daddmode[1]) & ~daddmode[2]
-      Only modes 1-3 use carry input; mode 4+ do not. */
-   cinsel = ((daddmode & 0x03) && !(daddmode & 0x04) ? 1 : 0);
+            if (sread)
+            {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               srcd2 = srcd1;
+               srcd1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+               //Kludge to take pixel size into account...
+               if (!phrase_mode)
+               {
+                  if (bcompen)
+                     srcd1 >>= 56;
+                  else
+                  {
+                     if (pixsize == 5)
+                        srcd1 >>= 32;
+                     else if (pixsize == 4)
+                        srcd1 >>= 48;
+                     else
+                        srcd1 >>= 56;
+                  }
+               }
+            }
 
-   for(i = 0; i < 4; i++)
-      cin[i] = initcin[i] | (co[i] & cinsel);
+            if (szread)
+            {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               srcz2 = srcz1;
+               srcz1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+               //Kludge to take pixel size into account... I believe that it only has to take 16BPP mode into account. Not sure tho.
+               if (!phrase_mode && pixsize == 4)
+                  srcz1 >>= 48;
 
-   eightbit = daddmode & 0x02;
-   sat = daddmode & 0x03;
-   hicinh = ((daddmode & 0x03) == 0x03);
+            }
 
-   ADD16SAT(&addq[0], &co[0], adda[0], addb[0], cin[0], sat, eightbit, hicinh);
-   ADD16SAT(&addq[1], &co[1], adda[1], addb[1], cin[1], sat, eightbit, hicinh);
-   ADD16SAT(&addq[2], &co[2], adda[2], addb[2], cin[2], sat, eightbit, hicinh);
-   ADD16SAT(&addq[3], &co[3], adda[3], addb[3], cin[3], sat, eightbit, hicinh);
-}
+            if (dread)
+            {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               dstd = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+               //Kludge to take pixel size into account...
+               if (!phrase_mode)
+               {
+                  if (pixsize == 5)
+                     dstd >>= 32;
+                  else if (pixsize == 4)
+                     dstd >>= 48;
+                  else
+                     dstd >>= 56;
+               }
+            }
+
+            if (dzread)
+            {
+               // Is Z always 64 bit read? Or sometimes 16 bit (dependent on phrase_mode)?
+               dstz = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+               //Kludge to take pixel size into account... I believe that it only has to take 16BPP mode into account. Not sure tho.
+               if (!phrase_mode && pixsize == 4)
+                  dstz >>= 48;
 
+            }
 
-void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, const uint16_t b, const uint8_t cin, const bool sat, const bool eightbit, const bool hicinh)
-{
-   uint8_t carry[4];
-   uint8_t btop, ctop;
-   bool saturate, hisaturate;
-   uint32_t qt   = (a & 0xFF) + (b & 0xFF) + cin;
-   uint16_t q    = qt & 0x00FF;
+            // These vars should probably go further up in the code... !!! FIX !!!
+            // We can't preassign these unless they're static...
+            //NOTE: SRCSHADE requires GOURZ to be set to work properly--another Jaguar I bug
+            if (dwrite)
+            {
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               //Counter is done on the dwrite state...! (We'll do it first, since it affects dstart/dend calculations.)
+               //Here's the voodoo for figuring the correct amount of pixels in phrase mode (or not):
+               int8_t inct = (PERF_INC(blitter_phrase_writes), -((dsta2 ? a2_x : a1_x) & 0x07));	// From INNER_CNT
+               uint8_t inc = 0;
+               uint16_t oldicount;
+               uint8_t dstart = 0;
 
-   carry[0]      = ((qt & 0x0100) ? 1 : 0);
-   carry[1]      = (carry[0] && !eightbit ? carry[0] : 0);
-   qt            = (a & 0x0F00) + (b & 0x0F00) + (carry[1] << 8);
-   carry[2]      = ((qt & 0x1000) ? 1 : 0);
-   q            |= qt & 0x0F00;
-   carry[3]      = (carry[2] && !hicinh ? carry[2] : 0);
-   qt            = (a & 0xF000) + (b & 0xF000) + (carry[3] << 12);
-   *co            = ((qt & 0x10000) ? 1 : 0);
-   q            |= qt & 0xF000;
+               inc = (!phrase_mode || (phrase_mode && (inct & 0x01)) ? 0x01 : 0x00);
+               inc |= (phrase_mode && (((pixsize == 3 || pixsize == 4) && (inct & 0x02)) || (pixsize == 5 && !(inct & 0x01))) ? 0x02 : 0x00);
+               inc |= (phrase_mode && ((pixsize == 3 && (inct & 0x04)) || (pixsize == 4 && !(inct & 0x03))) ? 0x04 : 0x00);
+               inc |= (phrase_mode && pixsize == 3 && !(inct & 0x07) ? 0x08 : 0x00);
 
-   if (eightbit)
-   {
-      btop  = (b & 0x0080) >> 7;
-      ctop  = carry[0];
-   }
-   else
-   {
-      btop  = (b & 0x8000) >> 15;
-      ctop  = *co;
-   }
+               oldicount = icount;	// Save icount to detect underflow...
+               icount -= inc;
 
-   saturate = sat && (btop ^ ctop);
-   hisaturate = saturate && !eightbit;
+               if (icount == 0 || ((icount & 0x8000) && !(oldicount & 0x8000)))
+                  inner0 = true;
+               // X/Y stepping is also done here, I think...No. It's done when a1_add or a2_add is asserted...
 
-   *r = (saturate ? (ctop ? 0x00FF : 0x0000) : q & 0x00FF);
-   *r |= (hisaturate ? (ctop ? 0xFF00 : 0x0000) : q & 0xFF00);
-}
+               //*********************************************************************************
+               //Start & end write mask computations...
+               //*********************************************************************************
 
-void ADDAMUX(int16_t *adda_x, int16_t *adda_y, uint8_t addasel, int16_t a1_step_x, int16_t a1_step_y,
-	int16_t a1_stepf_x, int16_t a1_stepf_y, int16_t a2_step_x, int16_t a2_step_y,
-	int16_t a1_inc_x, int16_t a1_inc_y, int16_t a1_incf_x, int16_t a1_incf_y, uint8_t adda_xconst,
-	bool adda_yconst, bool addareg, bool suba_x, bool suba_y)
-{
 
-   int16_t addar_x, addar_y, addac_x, addac_y, addas_x, addas_y;
-	int16_t xterm[4], yterm[4];
-	xterm[0] = a1_step_x, xterm[1] = a1_stepf_x, xterm[2] = a1_inc_x, xterm[3] = a1_incf_x;
-	yterm[0] = a1_step_y, yterm[1] = a1_stepf_y, yterm[2] = a1_inc_y, yterm[3] = a1_incf_y;
-   if (addasel & 0x04)
-   {
-      addar_x = a2_step_x;
-      addar_y = a2_step_y;
-   }
-   else
-   {
-      addar_x = xterm[addasel & 0x03];
-      addar_y = yterm[addasel & 0x03];
-   }
+               if (phrase_mode)
+               {
+                  if (pixsize == 3)
+                     dstart = (dstxp & 0x07) << 3;
+                  else if (pixsize == 4)
+                     dstart = (dstxp & 0x03) << 4;
+                  else if (pixsize == 5)
+                     dstart = (dstxp & 0x01) << 5;
+               }
+               else
+                  dstart    = pixAddr & 0x07;
 
-   /* Generate a constant value - this is a power of 2 in the range
-      0-64, or zero.  The control bits are adda_xconst[0..2], when they
-      are all 1  the result is 0.
-      Constants for Y can only be 0 or 1 */
+               //This is the other Jaguar I bug... Normally, should ALWAYS select a1_x here.
+               dstxwr = (dsta2 ? a2_x : a1_x) & 0x7FFE;
+               pseq = dstxwr ^ (a1_win_x & 0x7FFE);
+               pseq = (pixsize == 5 ? pseq : pseq & 0x7FFC);
+               pseq = ((pixsize & 0x06) == 4 ? pseq : pseq & 0x7FF8);
+               penden = clip_a1 && (pseq == 0);
+               window_mask = 0;
 
-	addac_x = (adda_xconst == 0x07 ? 0 : 1 << adda_xconst);
-	addac_y = (adda_yconst ? 0x01 : 0);
+               if (penden)
+               {
+                  if (pixsize == 3)
+                     window_mask = (a1_win_x & 0x07) << 3;
+                  else if (pixsize == 4)
+                     window_mask = (a1_win_x & 0x03) << 4;
+                  else if (pixsize == 5)
+                     window_mask = (a1_win_x & 0x01) << 5;
+               }
+               else
+                  window_mask    = 0;
 
-   /* Select between constant value and register value */
+               /* The mask to be used if within one phrase of the end of the inner
+                  loop, similarly */
 
-   if (addareg)
-   {
-      addas_x = (addareg ? addar_x : addac_x);
-      addas_y = (addareg ? addar_y : addac_y);
-   }
-   else
-   {
-      addas_x = (addareg ? addar_x : addac_x);
-      addas_y = (addareg ? addar_y : addac_y);
-   }
+               if (inner0)
+               {
+                  if (pixsize == 3)
+                     inner_mask = (icount & 0x07) << 3;
+                  else if (pixsize == 4)
+                     inner_mask = (icount & 0x03) << 4;
+                  else if (pixsize == 5)
+                     inner_mask = (icount & 0x01) << 5;
+               }
+               else
+                  inner_mask    = 0;
 
-   /* Complement these values (complement flag gives adder carry in)*/
+               /* The actual mask used should be the
+                  lesser of the window masks and
+                  the inner mask, where in all cases 000 means 1000. */
+               window_mask = (window_mask == 0 ? 0x40 : window_mask);
+               inner_mask  = (inner_mask == 0 ? 0x40 : inner_mask);
 
-	*adda_x = addas_x ^ (suba_x ? 0xFFFF : 0x0000);
-	*adda_y = addas_y ^ (suba_y ? 0xFFFF : 0x0000);
-}
+               emask       = (window_mask > inner_mask ? inner_mask : window_mask);
+               /* The mask to be used for the pixel size, to which must be added
+                  the bit offset */
+               pma = pixAddr + (1 << pixsize);
+               /* Select the mask */
+               dend = (phrase_mode ? emask : pma);
 
+               /* The cycle width in phrase mode is normally one phrase.  However,
+                  at the start and end it may be narrower.  The start and end masks
+                  are used to generate this.  The width is given by:
 
-/**  ADDBMUX - Address adder input B selection  *******************
+                  8 - start mask - (8 - end mask)
+                  =	end mask - start mask
 
-This module selects the register to be updated by the address
-adder.  This can be one of three registers, the A1 and A2
-pointers, or the A1 fractional part. It can also be zero, so that the step
-registers load directly into the pointers.
-*/
+                  This is only used for writes in phrase mode.
+                  Start and end from the address level of the pipeline are used.
+                  */
 
-/*DEF ADDBMUX (
-INT16/	addb_x
-INT16/	addb_y
-	:OUT;
-	addbsel[0..1]
-INT16/	a1_x
-INT16/	a1_y
-INT16/	a2_x
-INT16/	a2_y
-INT16/	a1_frac_x
-INT16/	a1_frac_y
-	:IN);
-INT16/	zero16 :LOCAL;
-BEGIN*/
-void ADDBMUX(int16_t *addb_x, int16_t *addb_y, uint8_t addbsel, int16_t a1_x, int16_t a1_y,
-	int16_t a2_x, int16_t a2_y, int16_t a1_frac_x, int16_t a1_frac_y)
-{
+               //Phrase mode needs destination data for start/end mask byte merging,
+               //but NOT when bkgwren is set (hardware uses DSTDATA register value).
+               if (phrase_mode && !dsten && !bkgwren)
+                  dstd = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
 
-/*Zero		:= TIE0 (zero);
-Zero16		:= JOIN (zero16, zero, zero, zero, zero, zero, zero, zero,
-			zero, zero, zero, zero, zero, zero, zero, zero, zero);
-Addbselb[0-1]	:= BUF8 (addbselb[0-1], addbsel[0-1]);
-Addb_x		:= MX4 (addb_x, a1_x, a2_x, a1_frac_x, zero16, addbselb[0..1]);
-Addb_y		:= MX4 (addb_y, a1_y, a2_y, a1_frac_y, zero16, addbselb[0..1]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	int16_t xterm[4], yterm[4];
-	xterm[0] = a1_x, xterm[1] = a2_x, xterm[2] = a1_frac_x, xterm[3] = 0;
-	yterm[0] = a1_y, yterm[1] = a2_y, yterm[2] = a1_frac_y, yterm[3] = 0;
-	*addb_x = xterm[addbsel & 0x03];
-	*addb_y = yterm[addbsel & 0x03];
-//////////////////////////////////////////////////////////////////////////////////////
+               // Write data combines srcd and dstd through ADDDSEL, PATDSEL, or LFU.
+               // Precedence is ADDDSEL > PATDSEL > LFU.
 
-//END;
-}
+               // srcd2 = xxxx xxxx 0123 4567, srcd = 8901 2345 xxxx xxxx, srcshift = $20 (32)
+               srcd = (srcd2 << (64 - srcshift)) | (srcd1 >> srcshift);
+               //bleh, ugly ugly ugly
+               if (srcshift == 0)
+                  srcd = srcd1;
 
+               //NOTE: This only works with pixel sizes less than 8BPP...
+               //DOUBLE NOTE: Still need to do regression testing to ensure that this doesn't break other stuff... !!! CHECK !!!
+               if (!phrase_mode && srcshift != 0)
+                  srcd = ((srcd2 & 0xFF) << (8 - srcshift)) | ((srcd1 & 0xFF) >> srcshift);
 
-/**  DATAMUX - Address local data bus selection  ******************
+               //Z DATA() stuff done here... And it has to be done before any Z shifting...
+               //Note that we need to have phrase mode start/end support here... (Not since we moved it from dzwrite...!)
+               /*
+                  Here are a couple of Cybermorph blits with Z:
+                  $00113078	// DSTEN DSTENZ DSTWRZ CLIP_A1 GOURD GOURZ PATDSEL ZMODE=4
+                  $09900F39	// SRCEN DSTEN DSTENZ DSTWRZ UPDA1 UPDA1F UPDA2 DSTA2 ZMODE=4 LFUFUNC=C DCOMPEN
 
-Select between the adder output and the input data bus
-*/
+                  We're having the same phrase mode overwrite problem we had with the pixels... !!! FIX !!!
+                  Odd. It's equating 0 with 0... Even though ZMODE is $04 (less than)!
+                  */
+               if (gourz)
+               {
+                  uint16_t addq[4];
+                  uint8_t initcin[4] = { 0, 0, 0, 0 };
+                  ADDARRAY(addq, 7/*daddasel*/, 6/*daddbsel*/, 0/*daddmode*/, 0, 0, initcin, 0, 0, 0, 0, 0, srcz1, srcz2, zinc, 0);
+                  srcz2 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+                  ADDARRAY(addq, 6/*daddasel*/, 7/*daddbsel*/, 1/*daddmode*/, 0, 0, initcin, 0, 0, 0, 0, 0, srcz1, srcz2, zinc, 0);
+                  srcz1 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
 
-/*DEF DATAMUX (
-INT16/	data_x
-INT16/	data_y
-	:OUT;
-INT32/	gpu_din
-INT16/	addq_x
-INT16/	addq_y
-	addqsel
-	:IN);
+               }
 
-INT16/	gpu_lo, gpu_hi
-:LOCAL;
-BEGIN*/
-void DATAMUX(int16_t *data_x, int16_t *data_y, uint32_t gpu_din, int16_t addq_x, int16_t addq_y, bool addqsel)
-{
-   if (addqsel)
-   {
-      *data_x = addq_x;
-      *data_y = addq_y;
-   }
-   else
-   {
-      *data_x = (int16_t)(gpu_din & 0xFFFF);
-      *data_y = (int16_t)(gpu_din >> 16);
-   }
-}
+               zSrcShift = srcshift & 0x30;
+               srcz = (srcz2 << (64 - zSrcShift)) | (srcz1 >> zSrcShift);
+               //bleh, ugly ugly ugly
+               if (zSrcShift == 0)
+                  srcz = srcz1;
 
 
-/******************************************************************
-addradd
-29/11/90
+               //When in SRCSHADE mode, it adds the IINC to the read source (from LFU???)
+               //According to following line, it gets LFU mode. But does it feed the source into the LFU
+               //after the add?
+               //Dest write address/pix address: 0014E83E/0 [dstart=0 dend=10 pwidth=8 srcshift=0][daas=4 dabs=5 dam=7 ds=1 daq=F] [0000000000006505] (icount=003F, inc=1)
+               //Let's try this:
+               if (srcshade)
+               {
+                  uint16_t addq[4];
+                  uint8_t initcin[4] = { 0, 0, 0, 0 };
+                  uint32_t iinc_masked = iinc & 0x00FFFFFF;
+                  ADDARRAY(addq, 4/*daddasel*/, 5/*daddbsel*/, 7/*daddmode*/, dstd, iinc_masked, initcin, 0, 0, 0, patd, srcd, 0, 0, 0, 0);
+                  srcd = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+               }
 
-Blitter Address Adder
----------------------
-The blitter address adder is a pair of sixteen bit adders, one
-each for X and Y.  The multiplexing of the input terms is
-performed elsewhere, but this adder can also perform modulo
-arithmetic to align X-addresses onto phrase boundaries.
+               /* DCONTROL: compute data adder signals.  Moved here from
+                  the per-iteration scope since they are only consumed
+                  during dwrite (dwrite=true, dzwrite=false here). */
+               shadeadd = srcshade;
+               daddasel = (gourd ? 0x01 : 0x00);
+               daddasel |= ((gourd || gourz || srcshade) ? 0x04 : 0x00);
+               daddbsel = (gourd || srcshade ? 0x01 : 0x00);
+               daddbsel |= (gourd || srcshade ? 0x04 : 0x00);
+               /* daddmode bit 0: NAND tree (dcontrol.v:130-146) makes
+                  bit 0 always 1 when dwrite&&gourd, !gourd&&!gourz,
+                  or shadeadd. */
+               daddmode = (gourd || (!gourd && !gourz) || shadeadd ? 0x01 : 0x00);
+               daddmode |= ((gourd && !topben && !ext_int)
+                     || (!gourd && !gourz && !topben) || (shadeadd && !topben) ? 0x02 : 0x00);
+               daddmode |= ((!gourd && !gourz) || shadeadd || (gourd && ext_int) ? 0x04 : 0x00);
+               patfadd = gourd;
+               patdadd = gourd;
+               srcz2add = false;
+               daddq_sel = gourd;
+               data_sel = ((!patdsel && !adddsel) ? 0x01 : 0x00)
+                  | (adddsel ? 0x02 : 0x00);
 
-modx[0..2] take values
-000	no mask
-001	mask bit 0
-010	mask bits 1-0
-..
-110  	mask bits 5-0
+               if (patfadd)
+               {
+                  uint16_t addq[4];
+                  uint8_t initcin[4] = { 0, 0, 0, 0 };
+                  ADDARRAY(addq, 4/*daddasel*/, 4/*daddbsel*/, 0/*daddmode*/, dstd, iinc, initcin, 0, 0, 0, patd, srcd, 0, 0, 0, 0);
+                  srcd1 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+               }
 
-******************************************************************/
+               /* atick[0]/[1] two-phase pipeline: fractional intensity/Z update
+                  runs in the patfadd/srcz2add block above (Phase 0), integer
+                  update runs via DATA→patdadd below (Phase 1).  The dbinh
+                  param below is overwritten inside DATA by COMP_CTRL. */
 
-void ADDRADD(int16_t *addq_x, int16_t *addq_y, bool a1fracldi,
-	uint16_t adda_x, uint16_t adda_y, uint16_t addb_x, uint16_t addb_y, uint8_t modx, bool suba_x, bool suba_y)
-{
+               DATA(&wdata, &dcomp, &zcomp, &winhibit,
+                     true, cmpdst, daddasel, daddbsel, daddmode, daddq_sel, data_sel, 0/*dbinh*/,
+                     dend, dstart, dstd, iinc, lfufunc, &patd, patdadd,
+                     phrase_mode, srcd, false/*srcdread*/, false/*srczread*/, srcz2add, zmode,
+                     bcompen, bkgwren, dcompen, icount & 0x07, pixsize,
+                     &srcz, dstz, zinc);
 
-/* Perform the addition */
+               /*
+                  DEF ADDRCOMP (
+                  a1_outside	// A1 pointer is outside window bounds
+                  :OUT;
+                  INT16/	a1_x
+                  INT16/	a1_y
+                  INT15/	a1_win_x
+                  INT15/	a1_win_y
+                  :IN);
+                  BEGIN
 
-/*Adder_x		:= ADD16 (addqt_x[0..15], co_x, adda_x{0..15}, addb_x{0..15}, ci_x);
-Adder_y		:= ADD16 (addq_y[0..15], co_y, adda_y{0..15}, addb_y{0..15}, ci_y);*/
+               // The address is outside if negative, or if greater than or equal
+               // to the window size
 
-/* latch carry and propagate if required */
+A1_xcomp	:= MAG_15 (a1xgr, a1xeq, a1xlt, a1_x{0..14}, a1_win_x{0..14});
+A1_ycomp	:= MAG_15 (a1ygr, a1yeq, a1ylt, a1_y{0..14}, a1_win_y{0..14});
+A1_outside	:= OR6 (a1_outside, a1_x{15}, a1xgr, a1xeq, a1_y{15}, a1ygr, a1yeq);
+*/
+               //NOTE: There seems to be an off-by-one bug here in the clip_a1 section... !!! FIX !!!
+               //      Actually, seems to be related to phrase mode writes...
+               //      Or is it? Could be related to non-15-bit compares as above?
+               if (clip_a1 && ((a1_x & 0x8000) || (a1_y & 0x8000) || (a1_x >= a1_win_x) || (a1_y >= a1_win_y)))
+                  winhibit = true;
 
-/*Cxt0		:= AN2 (cxt[0], co_x, a1fracldi);
-Cxt1		:= FD1Q (cxt[1], cxt[0], clk[0]);
-Ci_x		:= EO (ci_x, cxt[1], suba_x);
 
-yt0			:= AN2 (cyt[0], co_y, a1fracldi);
-Cyt1		:= FD1Q (cyt[1], cyt[0], clk[0]);
-Ci_y		:= EO (ci_y, cyt[1], suba_y);*/
+               if (!winhibit || bkgwren)
+               {
+                  if (phrase_mode)
+                  {
+                     JaguarWriteLong(address + 0, wdata >> 32, BLITTER);
+                     JaguarWriteLong(address + 4, wdata & 0xFFFFFFFF, BLITTER);
+                  }
+                  else
+                  {
+                     if (pixsize == 5)
+                        JaguarWriteLong(address, wdata & 0xFFFFFFFF, BLITTER);
+                     else if (pixsize == 4)
+                        JaguarWriteWord(address, wdata & 0x0000FFFF, BLITTER);
+                     else
+                        JaguarWriteByte(address, wdata & 0x000000FF, BLITTER);
+                  }
+               }
 
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-//I'm sure the following will generate a bunch of warnings, but will have to do for now.
-	static uint16_t co_x = 0, co_y = 0;	// Carry out has to propogate between function calls...
-	uint16_t ci_x = co_x ^ (suba_x ? 1 : 0);
-	uint16_t ci_y = co_y ^ (suba_y ? 1 : 0);
-	uint32_t addqt_x = adda_x + addb_x + ci_x;
-	uint32_t addqt_y = adda_y + addb_y + ci_y;
-	uint16_t mask[8] = { 0xFFFF, 0xFFFE, 0xFFFC, 0xFFF8, 0xFFF0, 0xFFE0, 0xFFC0, 0x0000 };
-	co_x = ((addqt_x & 0x10000) && a1fracldi ? 1 : 0);
-	co_y = ((addqt_y & 0x10000) && a1fracldi ? 1 : 0);
-//////////////////////////////////////////////////////////////////////////////////////
+            }
 
-/* Mask low bits of X to 0 if required */
+            if (dzwrite)
+            {
+               PERF_INC(blitter_phrase_writes);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               // OK, here's the big insight: When NOT in GOURZ mode, srcz1 & 2 function EXACTLY the same way that
+               // srcd1 & 2 work--there's an implicit shift from srcz1 to srcz2 whenever srcz1 is read.
+               // OTHERWISE, srcz1 is the integer for the computed Z and srcz2 is the fractional part.
+               // Writes to srcz1 & 2 follow the same pattern as the other 64-bit registers--low 32 at the low address,
+               // high 32 at the high address (little endian!).
+               // NOTE: GOURZ is still not properly supported. Check patd/patf handling...
+               //       Phrase mode start/end masks are not properly supported either...
+               //This is not correct... !!! FIX !!!
+               //Should be OK now... We'll see...
+               //Nope. Having the same starstep write problems in phrase mode as we had with pixels... !!! FIX !!!
+               //This is not causing the problem in Hover Strike... :-/
+               //The problem was with the SREADX not shifting. Still problems with Z comparisons & other text in pregame screen...
+               if (!winhibit)
+               {
+                  if (phrase_mode)
+                  {
+                     JaguarWriteLong(address + 0, srcz >> 32, BLITTER);
+                     JaguarWriteLong(address + 4, srcz & 0xFFFFFFFF, BLITTER);
+                  }
+                  else
+                  {
+                     if (pixsize == 4)
+                        JaguarWriteWord(address, srcz & 0x0000FFFF, BLITTER);
+                  }
+               }//*/
+            }
 
-/*Masksel		:= D38H (unused[0], masksel[0..4], maskbit[5], unused[1], modx[0..2]);
 
-Maskbit[0-4]	:= OR2 (maskbit[0-4], masksel[0-4], maskbit[1-5]);
+            if (a1_add)
+            {
+               int16_t adda_x, adda_y, addb_x, addb_y, addq_x, addq_y;
+               ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
+                     a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
+               ADDBMUX(&addb_x, &addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
+               ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
 
-Mask[0-5]	:= MX2 (addq_x[0-5], addqt_x[0-5], zero, maskbit[0-5]);
+               //Now, write to what???
+               //a2ptrld comes from a2ptrldi...
+               //I believe it's addbsel that determines the writeback...
+               // This is where atick[0] & [1] come in, in determining which part (fractional, integer)
+               // gets written to...
+               //a1_x = addq_x;
+               //a1_y = addq_y;
+               //Kludge, to get A1 channel increment working...
+               if (a1addx == 3)
+               {
+                  a1_frac_x = addq_x, a1_frac_y = addq_y;
 
-Addq_x		:= JOIN (addq_x, addq_x[0..5], addqt_x[6..15]);
-Addq_y		:= JOIN (addq_y, addq_y[0..15]);*/
+                  addasel = 2, addbsel = 0, a1fracldi = false;
+                  ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
+                        a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
+                  ADDBMUX(&addb_x,&addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
+                  ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
 
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	*addq_x = addqt_x & mask[modx];
-	*addq_y = addqt_y & 0xFFFF;
-//////////////////////////////////////////////////////////////////////////////////////
+                  a1_x = addq_x, a1_y = addq_y;
+               }
+               else
+                  a1_x = addq_x, a1_y = addq_y;
+            }
 
-//Unused[0-1]	:= DUMMY (unused[0-1]);
+            if (a2_add)
+            {
+               int16_t adda_x, adda_y, addb_x, addb_y, addq_x, addq_y;
+               ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
+                     a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
+               ADDBMUX(&addb_x, &addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
+               ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
 
-//END;
-}
+               //Now, write to what???
+               //a2ptrld comes from a2ptrldi...
+               //I believe it's addbsel that determines the writeback...
+               a2_x = addq_x;
+               a2_y = addq_y;
+            }
+#ifdef BENCH_PROFILE
+            if (blitter_did_io) PERF_INC(blitter_inner_io);
+            else                PERF_INC(blitter_inner_idle);
+#endif
+         }
 
+         indone = true;
+         // The outer counter is updated here as well on the clock cycle...
 
-/*
-DEF DATA (
-		wdata[0..63]	// co-processor write data bus
-		:BUS;
-		dcomp[0..7]		// data byte equal flags
-		srcd[0..7]		// bits to use for bit to byte expansion
-		zcomp[0..3]		// output from Z comparators
-		:OUT;
-		a1_x[0..1]		// low two bits of A1 X pointer
-		big_pix			// pixel organisation is big-endian
-		blitter_active	// blitter is active
-		clk				// co-processor clock
-		cmpdst			// compare dest rather than source
-		colorld			// load the pattern color fields
-		daddasel[0..2]	// data adder input A selection
-		daddbsel[0..3]	// data adder input B selection
-		daddmode[0..2]	// data adder mode
-		daddq_sel		// select adder output vs. GPU data
-		data[0..63]		// co-processor read data bus
-		data_ena		// enable write data
-		data_sel[0..1]	// select data to write
-		dbinh\[0..7]	// byte oriented changed data inhibits
-		dend[0..5]		// end of changed write data zone
-		dpipe[0..1]		// load computed data pipe-line latch
-		dstart[0..5]	// start of changed write data zone
-		dstdld[0..1]	// dest data load (two halves)
-		dstzld[0..1]	// dest zed load (two halves)
-		ext_int			// enable extended precision intensity calculations
-INT32/	gpu_din			// GPU data bus
-		iincld			// I increment load
-		iincldx			// alternate I increment load
-		init_if			// initialise I fraction phase
-		init_ii			// initialise I integer phase
-		init_zf			// initialise Z fraction phase
-		intld[0..3]		// computed intensities load
-		istepadd		// intensity step integer add
-		istepfadd		// intensity step fraction add
-		istepld			// I step load
-		istepdld		// I step delta load
-		lfu_func[0..3]	// LFU function code
-		patdadd			// pattern data gouraud add
-		patdld[0..1]	// pattern data load (two halves)
-		pdsel[0..1]		// select pattern data type
-		phrase_mode		// phrase write mode
-		reload			// transfer contents of double buffers
-		reset\			// system reset
-		srcd1ld[0..1]	// source register 1 load (two halves)
-		srcdread		// source data read load enable
-		srczread		// source zed read load enable
-		srcshift[0..5]	// source alignment shift
-		srcz1ld[0..1]	// source zed 1 load (two halves)
-		srcz2add		// zed fraction gouraud add
-		srcz2ld[0..1]	// source zed 2 load (two halves)
-		textrgb			// texture mapping in RGB mode
-		txtd[0..63]		// data from the texture unit
-		zedld[0..3]		// computed zeds load
-		zincld			// Z increment load
-		zmode[0..2]		// Z comparator mode
-		zpipe[0..1]		// load computed zed pipe-line latch
-		zstepadd		// zed step integer add
-		zstepfadd		// zed step fraction add
-		zstepld			// Z step load
-		zstepdld		// Z step delta load
-		:IN);
-*/
+         /* the inner loop is started whenever another state is about to
+            cause the inner state to go active */
+         //Instart		:= ND7 (instart, innert[0], innert[2..7]);
 
-void DATA(uint64_t *wdata, uint8_t *dcomp, uint8_t *zcomp, bool *nowrite,
-	bool big_pix, bool cmpdst, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode, bool daddq_sel, uint8_t data_sel,
-	uint8_t dbinh, uint8_t dend, uint8_t dstart, uint64_t dstd, uint32_t iinc, uint8_t lfu_func, uint64_t *patd, bool patdadd,
-	bool phrase_mode, uint64_t srcd, bool srcdread, bool srczread, bool srcz2add, uint8_t zmode,
-	bool bcompen, bool bkgwren, bool dcompen, uint8_t icount, uint8_t pixsize,
-	uint64_t *srcz, uint64_t dstz, uint32_t zinc)
-{
-/*
-  Stuff we absolutely *need* to have passed in/out:
-IN:
-  patdadd, dstd, srcd, patd, daddasel, daddbsel, daddmode, iinc, srcz1, srcz2, big_pix, phrase_mode, cmpdst
-OUT:
-  changed patd (wdata I guess...) (Nope. We pass it back directly now...)
-*/
+         //Actually, it's done only when inner gets asserted without the 2nd line of conditions
+         //(inner AND !indone)
+         //fixed now...
+         //Since we don't get here until the inner loop is finished (indone = true) we can get
+         //away with doing it here...!
+         ocount--;
 
-// Source data registers
+         if (ocount == 0)
+            outer0 = true;
+      }
 
-/*Data_src	:= DATA_SRC (srcdlo, srcdhi, srcz[0..1], srczo[0..1], srczp[0..1], srcz1[0..1], srcz2[0..1], big_pix,
-			clk, gpu_din, intld[0..3], local_data0, local_data1, srcd1ld[0..1], srcdread, srczread, srcshift[0..5],
-			srcz1ld[0..1], srcz2add, srcz2ld[0..1], zedld[0..3], zpipe[0..1]);
-Srcd[0-7]	:= JOIN (srcd[0-7], srcdlo{0-7});
-Srcd[8-31]	:= JOIN (srcd[8-31], srcdlo{8-31});
-Srcd[32-63]	:= JOIN (srcd[32-63], srcdhi{0-31});*/
+      if (a1fupdate)
+      {
+         uint32_t a1_frac_xt = (uint32_t)a1_frac_x + (uint32_t)a1_stepf_x;
+         uint32_t a1_frac_yt = (uint32_t)a1_frac_y + (uint32_t)a1_stepf_y;
+         a1FracCInX = a1_frac_xt >> 16;
+         a1FracCInY = a1_frac_yt >> 16;
+         a1_frac_x = (uint16_t)(a1_frac_xt & 0xFFFF);
+         a1_frac_y = (uint16_t)(a1_frac_yt & 0xFFFF);
+      }
 
-// Destination data registers
+      if (a1update)
+      {
+         a1_x += a1_step_x + a1FracCInX;
+         a1_y += a1_step_y + a1FracCInY;
+      }
 
-/*Data_dst	:= DATA_DST (dstd[0..63], dstz[0..1], clk, dstdld[0..1], dstzld[0..1], load_data[0..1]);
-Dstdlo		:= JOIN (dstdlo, dstd[0..31]);
-Dstdhi		:= JOIN (dstdhi, dstd[32..63]);*/
+      if (a2update)
+      {
+         a2_x += a2_step_x;
+         a2_y += a2_step_y;
+      }
+   }
 
-// Pattern and Color data registers
+   // Write values back to registers (in real blitter, these are continuously updated)
+   SET16(blitter_ram, A1_PIXEL + 2, a1_x);
+   SET16(blitter_ram, A1_PIXEL + 0, a1_y);
+   SET16(blitter_ram, A1_FPIXEL + 2, a1_frac_x);
+   SET16(blitter_ram, A1_FPIXEL + 0, a1_frac_y);
+   SET16(blitter_ram, A2_PIXEL + 2, a2_x);
+   SET16(blitter_ram, A2_PIXEL + 0, a2_y);
 
-// Looks like this is simply another register file for the pattern data registers. No adding or anything funky
-// going on. Note that patd & patdv will output the same info.
-// Patdldl/h (patdld[0..1]) can select the local_data bus to overwrite the current pattern data...
-// Actually, it can be either patdld OR patdadd...!
-/*Data_pat	:= DATA_PAT (colord[0..15], int0dp[8..10], int1dp[8..10], int2dp[8..10], int3dp[8..10], mixsel[0..2],
-			patd[0..63], patdv[0..1], clk, colorld, dpipe[0], ext_int, gpu_din, intld[0..3], local_data0, local_data1,
-			patdadd, patdld[0..1], reload, reset\);
-Patdlo		:= JOIN (patdlo, patd[0..31]);
-Patdhi		:= JOIN (patdhi, patd[32..63]);*/
+}
 
-// Multiplying data Mixer (NOT IN JAGUAR I)
+// Various pieces of the blitter puzzle are teased out here...
 
-/*Datamix		:= DATAMIX (patdo[0..1], clk, colord[0..15], dpipe[1], dstd[0..63], int0dp[8..10], int1dp[8..10],
-			int2dp[8..10], int3dp[8..10], mixsel[0..2], patd[0..63], pdsel[0..1], srcd[0..63], textrgb, txtd[0..63]);*/
+void ADDRGEN(uint32_t *address, uint32_t *pixa, bool gena2, bool zaddr,
+	uint16_t a1_x, uint16_t a1_y, uint32_t a1_base, uint8_t a1_pitch, uint8_t a1_pixsize, uint8_t a1_width, uint8_t a1_zoffset,
+	uint16_t a2_x, uint16_t a2_y, uint32_t a2_base, uint8_t a2_pitch, uint8_t a2_pixsize, uint8_t a2_width, uint8_t a2_zoffset)
+{
+	uint16_t x = (gena2 ? a2_x : a1_x) & 0xFFFF;	// Actually uses all 16 bits to generate address...!
+	uint16_t y = (gena2 ? a2_y : a1_y) & 0x0FFF;
+	uint8_t width = (gena2 ? a2_width : a1_width);
+	uint8_t pixsize = (gena2 ? a2_pixsize : a1_pixsize);
+	uint8_t pitch = (gena2 ? a2_pitch : a1_pitch);
+	uint32_t base = (gena2 ? a2_base : a1_base) >> 3;//Only upper 21 bits are passed around the bus? Seems like it...
+	uint8_t zoffset = (gena2 ? a2_zoffset : a1_zoffset);
 
-// Logic function unit
+	uint32_t ytm = ((uint32_t)y << 2) + ((width & 0x02) ? (uint32_t)y << 1 : 0) + ((width & 0x01) ? (uint32_t)y : 0);
 
-/*Lfu		:= LFU (lfu[0..1], srcdlo, srcdhi, dstdlo, dstdhi, lfu_func[0..3]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	uint64_t lfu = blitter_simd_ops.lfu(srcd, dstd, lfu_func);
-   bool mir_bit, mir_byte;
-   uint16_t masku;
-   uint8_t e_coarse, e_fine;
-   uint8_t s_coarse, s_fine;
-   uint16_t maskt;
-	uint8_t decl38e[2][8] = { { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
-		{ 0xFE, 0xFD, 0xFB, 0xF7, 0xEF, 0xDF, 0xBF, 0x7F } };
-	uint8_t dech38[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
-	uint8_t dech38el[2][8] = { { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 },
-		{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } };
-   int en;
-	uint8_t dbinht;
-   uint16_t addq[4];
-   uint8_t initcin[4] = { 0, 0, 0, 0 };
-   uint16_t mask;
-   uint64_t dmux[4];
-   uint64_t ddat;
-//////////////////////////////////////////////////////////////////////////////////////
+	uint32_t ya = (ytm << (width >> 2)) >> 2;
 
-// Increment and Step Registers
+	uint32_t pa = ya + x;
+   uint8_t pt, za;
+   uint32_t phradr, shup, addr;
 
-// Does it do anything without the step add lines? Check it!
-// No. This is pretty much just a register file without the Jaguar II lines...
-/*Inc_step	:= INC_STEP (iinc, istep[0..31], zinc, zstep[0..31], clk, ext_int, gpu_din, iincld, iincldx, istepadd,
-			istepfadd, istepld, istepdld, reload, reset\, zincld, zstepadd, zstepfadd, zstepld, zstepdld);
-Istep		:= JOIN (istep, istep[0..31]);
-Zstep		:= JOIN (zstep, zstep[0..31]);*/
+	*pixa = pa << pixsize;
 
-// Pixel data comparator
+	pt = ((pitch & 0x01) && !(pitch & 0x02) ? 0x01 : 0x00)
+		| (!(pitch & 0x01) && (pitch & 0x02) ? 0x02 : 0x00);
+	phradr = (*pixa >> 6) << pt;
+	shup = (pitch == 0x03 ? (*pixa >> 6) : 0);
 
-/*Datacomp	:= DATACOMP (dcomp[0..7], cmpdst, dstdlo, dstdhi, patdlo, patdhi, srcdlo, srcdhi);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	*dcomp = blitter_simd_ops.dcomp(*patd, srcd, dstd, cmpdst);
-//////////////////////////////////////////////////////////////////////////////////////
+	za = (zaddr ? zoffset : 0) & 0x03;
+	addr = za + phradr + (shup << 1) + base;
+	*address = ((*pixa & 0x38) >> 3) | ((addr & 0x1FFFFF) << 3);
+	*pixa &= 0x07;
+}
 
-// Zed comparator for Z-buffer operations
+////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////
+// Here's an important bit: The source data adder logic. Need to track down the inputs!!! //
+////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////
 
-/*Zedcomp		:= ZEDCOMP (zcomp[0..3], srczp[0..1], dstz[0..1], zmode[0..2]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-//srczp is srcz pipelined, also it goes through a source shift as well...
-/*The shift is basically like so (each piece is 16 bits long):
 
-	0         1         2         3         4          5         6
-	srcz1lolo srcz1lohi srcz1hilo srcz1hihi srcrz2lolo srcz2lohi srcz2hilo
+void ADDAMUX(int16_t *adda_x, int16_t *adda_y, uint8_t addasel, int16_t a1_step_x, int16_t a1_step_y,
+	int16_t a1_stepf_x, int16_t a1_stepf_y, int16_t a2_step_x, int16_t a2_step_y,
+	int16_t a1_inc_x, int16_t a1_inc_y, int16_t a1_incf_x, int16_t a1_incf_y, uint8_t adda_xconst,
+	bool adda_yconst, bool addareg, bool suba_x, bool suba_y)
+{
 
-with srcshift bits 4 & 5 selecting the start position
-*/
-//So... basically what we have here is:
-	*zcomp = blitter_simd_ops.zcomp(*srcz, dstz, zmode);
+   int16_t addar_x, addar_y, addac_x, addac_y, addas_x, addas_y;
+	int16_t xterm[4], yterm[4];
+	xterm[0] = a1_step_x, xterm[1] = a1_stepf_x, xterm[2] = a1_inc_x, xterm[3] = a1_incf_x;
+	yterm[0] = a1_step_y, yterm[1] = a1_stepf_y, yterm[2] = a1_inc_y, yterm[3] = a1_incf_y;
+   if (addasel & 0x04)
+   {
+      addar_x = a2_step_x;
+      addar_y = a2_step_y;
+   }
+   else
+   {
+      addar_x = xterm[addasel & 0x03];
+      addar_y = yterm[addasel & 0x03];
+   }
 
-//TEMP, TO TEST IF ZCOMP IS THE CULPRIT...
-//Nope, this is NOT the problem...
-//zcomp=0;
-// We'll do the comparison/bit/byte inhibits here, since that's they way it happens
-// in the real thing (dcomp goes out to COMP_CTRL and back into DATA through dbinh)...
-	{
-	uint8_t bcomp_bits;
-	if (bcompen && phrase_mode)
-	{
-		bcomp_bits = (srcd >> 56) & 0xFF;
-	}
-	else
-		bcomp_bits = srcd & 0xFF;
+   /* Generate a constant value - this is a power of 2 in the range
+      0-64, or zero.  The control bits are adda_xconst[0..2], when they
+      are all 1  the result is 0.
+      Constants for Y can only be 0 or 1 */
 
-	COMP_CTRL(&dbinht, nowrite,
-		bcompen, true/*big_pix*/, bkgwren, *dcomp, dcompen, icount, pixsize, phrase_mode, bcomp_bits, *zcomp);
-	}
-	dbinh = dbinht;
+	addac_x = (adda_xconst == 0x07 ? 0 : 1 << adda_xconst);
+	addac_y = (adda_yconst ? 0x01 : 0);
 
-//////////////////////////////////////////////////////////////////////////////////////
+   /* Select between constant value and register value */
 
-// 22 Mar 94
-// The data initializer - allows all four initial values to be computed from one (NOT IN JAGUAR I)
+   if (addareg)
+   {
+      addas_x = addar_x;   /* register term (value picked by addasel above) */
+      addas_y = addar_y;
+   }
+   else
+   {
+      addas_x = addac_x;   /* constant term: power of two, or zero, for X */
+      addas_y = addac_y;   /* constant term: 0 or 1 for Y */
+   }
 
-/*Datinit		:= DATINIT (initcin[0..3], initinc[0..63], initpix[0..15], a1_x[0..1], big_pix, clk, iinc, init_if, init_ii,
-			init_zf, istep[0..31], zinc, zstep[0..31]);*/
+   /* Complement these values (complement flag gives adder carry in)*/
 
-// Adder array for Z and intensity increments
+	*adda_x = addas_x ^ (suba_x ? 0xFFFF : 0x0000);
+	*adda_y = addas_y ^ (suba_y ? 0xFFFF : 0x0000);
+}
 
-/*Addarray	:= ADDARRAY (addq[0..3], clk, daddasel[0..2], daddbsel[0..3], daddmode[0..2], dstdlo, dstdhi, iinc,
-			initcin[0..3], initinc[0..63], initpix[0..15], istep, patdv[0..1], srcdlo, srcdhi, srcz1[0..1],
-			srcz2[0..1], reset\, zinc, zstep);*/
-/*void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode,
-	uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix,
-	uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2,
-	uint32_t zinc, uint32_t zstep)*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	{
-	uint64_t patd_pre = *patd;
-	ADDARRAY(addq, daddasel, daddbsel, daddmode, dstd, iinc, initcin, 0, 0, 0, *patd, srcd, 0, 0, 0, 0);
 
-	if (patdadd)
-		*patd = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
-//////////////////////////////////////////////////////////////////////////////////////
+/**  ADDBMUX - Address adder input B selection  *******************
 
-// Local data bus multiplexer
-// In hardware, the write data mux reads patd BEFORE the register update.
-// patd_pre captures the pre-increment value for the data output mux.
+This module selects the register to be updated by the address
+adder.  This can be one of three registers, the A1 and A2
+pointers, or the A1 fractional part. It can also be zero, so that the step
+registers load directly into the pointers.
+*/
 
-/*Local_mux	:= LOCAL_MUX (local_data[0..1], load_data[0..1],
-	addq[0..3], gpu_din, data[0..63], blitter_active, daddq_sel);
-Local_data0	:= JOIN (local_data0, local_data[0]);
-Local_data1	:= JOIN (local_data1, local_data[1]);*/
+/*DEF ADDBMUX (
+INT16/	addb_x
+INT16/	addb_y
+	:OUT;
+	addbsel[0..1]
+INT16/	a1_x
+INT16/	a1_y
+INT16/	a2_x
+INT16/	a2_y
+INT16/	a1_frac_x
+INT16/	a1_frac_y
+	:IN);
+INT16/	zero16 :LOCAL;
+BEGIN*/
+void ADDBMUX(int16_t *addb_x, int16_t *addb_y, uint8_t addbsel, int16_t a1_x, int16_t a1_y,
+	int16_t a2_x, int16_t a2_y, int16_t a1_frac_x, int16_t a1_frac_y)
+{
+
+/*Zero		:= TIE0 (zero);
+Zero16		:= JOIN (zero16, zero, zero, zero, zero, zero, zero, zero,
+			zero, zero, zero, zero, zero, zero, zero, zero, zero);
+Addbselb[0-1]	:= BUF8 (addbselb[0-1], addbsel[0-1]);
+Addb_x		:= MX4 (addb_x, a1_x, a2_x, a1_frac_x, zero16, addbselb[0..1]);
+Addb_y		:= MX4 (addb_y, a1_y, a2_y, a1_frac_y, zero16, addbselb[0..1]);*/
 ////////////////////////////////////// C++ CODE //////////////////////////////////////
+	int16_t xterm[4], yterm[4];	// the four mux inputs per axis, indexed by addbsel
+	xterm[0] = a1_x, xterm[1] = a2_x, xterm[2] = a1_frac_x, xterm[3] = 0;	// 0: A1, 1: A2, 2: A1 fraction, 3: zero
+	yterm[0] = a1_y, yterm[1] = a2_y, yterm[2] = a1_frac_y, yterm[3] = 0;	// same ordering as xterm
+	*addb_x = xterm[addbsel & 0x03];	// only the low two bits of addbsel take part in the selection
+	*addb_y = yterm[addbsel & 0x03];
 //////////////////////////////////////////////////////////////////////////////////////
 
-// Data output multiplexer and tri-state drive
+//END;
+}
+
 
-/*Data_mux	:= DATA_MUX (wdata[0..63], addq[0..3], big_pix, dstdlo, dstdhi, dstz[0..1], data_sel[0..1], data_ena,
-			dstart[0..5], dend[0..5], dbinh\[0..7], lfu[0..1], patdo[0..1], phrase_mode, srczo[0..1]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-// NOTE: patdo comes from DATAMIX and can be considered the same as patd for Jaguar I
+/**  DATAMUX - Address local data bus selection  ******************
 
-//////////////////////////////////////////////////////////////////////////////////////
-//}
+Select between the adder output and the input data bus
+*/
 
-/*DEF DATA_MUX (
-		wdata[0..63]	// co-processor rwrite data bus
-		:BUS;
-INT16/	addq[0..3]
-		big_pix			// Pixel organisation is big-endian
-INT32/	dstdlo
-INT32/	dstdhi
-INT32/	dstzlo
-INT32/	dstzhi
-		data_sel[0..1]	// source of write data
-		data_ena		// enable write data onto read/write bus
-		dstart[0..5]	// start of changed write data
-		dend[0..5]		// end of changed write data
-		dbinh\[0..7]	// byte oriented changed data inhibits
-INT32/	lfu[0..1]
-INT32/	patd[0..1]
-		phrase_mode		// phrase write mode
-INT32/	srczlo
-INT32/	srczhi
-		:IN);*/
+/*DEF DATAMUX (
+INT16/	data_x
+INT16/	data_y
+	:OUT;
+INT32/	gpu_din
+INT16/	addq_x
+INT16/	addq_y
+	addqsel
+	:IN);
 
-/*INT32/	addql[0..1], ddatlo, ddathi zero32
+INT16/	gpu_lo, gpu_hi
 :LOCAL;
-BEGIN
-
-Phrase_mode\	:= INV1 (phrase_mode\, phrase_mode);
-Zero		:= TIE0 (zero);
-Zero32		:= JOIN (zero32, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero);*/
+BEGIN*/
+void DATAMUX(int16_t *data_x, int16_t *data_y, uint32_t gpu_din, int16_t addq_x, int16_t addq_y, bool addqsel)
+{
+   /* Two-way mux for the X/Y data pair: adder result or input data bus. */
+
+   /* Bus candidates: low 16 bits of gpu_din feed X, high 16 bits feed Y. */
+   const int16_t bus_x = (int16_t)(gpu_din & 0xFFFF);
+   const int16_t bus_y = (int16_t)(gpu_din >> 16);
+
+   /* addqsel set:   forward the adder outputs (addq_x / addq_y).
+      addqsel clear: forward the two bus halves computed above. */
+   *data_x = (addqsel ? addq_x : bus_x);
+   *data_y = (addqsel ? addq_y : bus_y);
+}
 
-/* Generate a changed data mask */
 
-/*Edis		:= OR6 (edis\, dend[0..5]);
-Ecoarse		:= DECL38E (e_coarse\[0..7], dend[3..5], edis\);
-E_coarse[0]	:= INV1 (e_coarse[0], e_coarse\[0]);
-Efine		:= DECL38E (unused[0], e_fine\[1..7], dend[0..2], e_coarse[0]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
+/******************************************************************
+addradd
+29/11/90
 
-	en = ((dend & 0x3F) ? 1 : 0);
-	e_coarse = decl38e[en][(dend & 0x38) >> 3];		// Actually, this is e_coarse inverted...
-	e_fine = decl38e[(e_coarse & 0x01) ^ 0x01][dend & 0x07];
-	e_fine &= 0xFE;
-//////////////////////////////////////////////////////////////////////////////////////
+Blitter Address Adder
+---------------------
+The blitter address adder is a pair of sixteen bit adders, one
+each for X and Y.  The multiplexing of the input terms is
+performed elsewhere, but this adder can also perform modulo
+arithmetic to align X-addresses onto phrase boundaries.
 
-/*Scoarse		:= DECH38 (s_coarse[0..7], dstart[3..5]);
-Sfen\		:= INV1 (sfen\, s_coarse[0]);
-Sfine		:= DECH38EL (s_fine[0..7], dstart[0..2], sfen\);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	s_coarse = dech38[(dstart & 0x38) >> 3];
-	s_fine = dech38el[(s_coarse & 0x01) ^ 0x01][dstart & 0x07];
-//////////////////////////////////////////////////////////////////////////////////////
+modx[0..2] take values
+000	no mask
+001	mask bit 0
+010	mask bits 1-0
+..
+110  	mask bits 5-0
 
-/*Maskt[0]	:= BUF1 (maskt[0], s_fine[0]);
-Maskt[1-7]	:= OAN1P (maskt[1-7], maskt[0-6], s_fine[1-7], e_fine\[1-7]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	maskt = s_fine & 0x0001;
-	maskt |= (((maskt & 0x0001) || (s_fine & 0x02u)) && (e_fine & 0x02u) ? 0x0002 : 0x0000);
-	maskt |= (((maskt & 0x0002) || (s_fine & 0x04u)) && (e_fine & 0x04u) ? 0x0004 : 0x0000);
-	maskt |= (((maskt & 0x0004) || (s_fine & 0x08u)) && (e_fine & 0x08u) ? 0x0008 : 0x0000);
-	maskt |= (((maskt & 0x0008) || (s_fine & 0x10u)) && (e_fine & 0x10u) ? 0x0010 : 0x0000);
-	maskt |= (((maskt & 0x0010) || (s_fine & 0x20u)) && (e_fine & 0x20u) ? 0x0020 : 0x0000);
-	maskt |= (((maskt & 0x0020) || (s_fine & 0x40u)) && (e_fine & 0x40u) ? 0x0040 : 0x0000);
-	maskt |= (((maskt & 0x0040) || (s_fine & 0x80u)) && (e_fine & 0x80u) ? 0x0080 : 0x0000);
-//////////////////////////////////////////////////////////////////////////////////////
+******************************************************************/
 
-   /* Produce a look-ahead on the ripple carry */
-	maskt |= (((s_coarse & e_coarse & 0x01u) || (s_coarse & 0x02u)) && (e_coarse & 0x02u) ? 0x0100 : 0x0000);
-	maskt |= (((maskt & 0x0100) || (s_coarse & 0x04u)) && (e_coarse & 0x04u) ? 0x0200 : 0x0000);
-	maskt |= (((maskt & 0x0200) || (s_coarse & 0x08u)) && (e_coarse & 0x08u) ? 0x0400 : 0x0000);
-	maskt |= (((maskt & 0x0400) || (s_coarse & 0x10u)) && (e_coarse & 0x10u) ? 0x0800 : 0x0000);
-	maskt |= (((maskt & 0x0800) || (s_coarse & 0x20u)) && (e_coarse & 0x20u) ? 0x1000 : 0x0000);
-	maskt |= (((maskt & 0x1000) || (s_coarse & 0x40u)) && (e_coarse & 0x40u) ? 0x2000 : 0x0000);
-	maskt |= (((maskt & 0x2000) || (s_coarse & 0x80u)) && (e_coarse & 0x80u) ? 0x4000 : 0x0000);
+void ADDRADD(int16_t *addq_x, int16_t *addq_y, bool a1fracldi,
+	uint16_t adda_x, uint16_t adda_y, uint16_t addb_x, uint16_t addb_y, uint8_t modx, bool suba_x, bool suba_y)
+{
 
-/* The bit terms are mirrored for big-endian pixels outside phrase
-mode.  The byte terms are mirrored for big-endian pixels in phrase
-mode.  */
+/* Perform the addition */
 
-/*Mirror_bit	:= AN2M (mir_bit, phrase_mode\, big_pix);
-Mirror_byte	:= AN2H (mir_byte, phrase_mode, big_pix);
+/*Adder_x		:= ADD16 (addqt_x[0..15], co_x, adda_x{0..15}, addb_x{0..15}, ci_x);
+Adder_y		:= ADD16 (addq_y[0..15], co_y, adda_y{0..15}, addb_y{0..15}, ci_y);*/
 
-Masktb[14]	:= BUF1 (masktb[14], maskt[14]);
-Masku[0]	:= MX4 (masku[0],  maskt[0],  maskt[7],  maskt[14],  zero, mir_bit, mir_byte);
-Masku[1]	:= MX4 (masku[1],  maskt[1],  maskt[6],  maskt[14],  zero, mir_bit, mir_byte);
-Masku[2]	:= MX4 (masku[2],  maskt[2],  maskt[5],  maskt[14],  zero, mir_bit, mir_byte);
-Masku[3]	:= MX4 (masku[3],  maskt[3],  maskt[4],  masktb[14], zero, mir_bit, mir_byte);
-Masku[4]	:= MX4 (masku[4],  maskt[4],  maskt[3],  masktb[14], zero, mir_bit, mir_byte);
-Masku[5]	:= MX4 (masku[5],  maskt[5],  maskt[2],  masktb[14], zero, mir_bit, mir_byte);
-Masku[6]	:= MX4 (masku[6],  maskt[6],  maskt[1],  masktb[14], zero, mir_bit, mir_byte);
-Masku[7]	:= MX4 (masku[7],  maskt[7],  maskt[0],  masktb[14], zero, mir_bit, mir_byte);
-Masku[8]	:= MX2 (masku[8],  maskt[8],  maskt[13], mir_byte);
-Masku[9]	:= MX2 (masku[9],  maskt[9],  maskt[12], mir_byte);
-Masku[10]	:= MX2 (masku[10], maskt[10], maskt[11], mir_byte);
-Masku[11]	:= MX2 (masku[11], maskt[11], maskt[10], mir_byte);
-Masku[12]	:= MX2 (masku[12], maskt[12], maskt[9],  mir_byte);
-Masku[13]	:= MX2 (masku[13], maskt[13], maskt[8],  mir_byte);
-Masku[14]	:= MX2 (masku[14], maskt[14], maskt[0],  mir_byte);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
+/* latch carry and propagate if required */
 
-	mir_bit  = true/*big_pix*/ && !phrase_mode;
-	mir_byte = true/*big_pix*/ && phrase_mode;
-	masku    = maskt;
+/*Cxt0		:= AN2 (cxt[0], co_x, a1fracldi);
+Cxt1		:= FD1Q (cxt[1], cxt[0], clk[0]);
+Ci_x		:= EO (ci_x, cxt[1], suba_x);
 
-	if (mir_bit)
-	{
-		masku &= 0xFF00;
-		masku |= (maskt >> 7) & 0x0001;
-		masku |= (maskt >> 5) & 0x0002;
-		masku |= (maskt >> 3) & 0x0004;
-		masku |= (maskt >> 1) & 0x0008;
-		masku |= (maskt << 1) & 0x0010;
-		masku |= (maskt << 3) & 0x0020;
-		masku |= (maskt << 5) & 0x0040;
-		masku |= (maskt << 7) & 0x0080;
-	}
+Cyt0		:= AN2 (cyt[0], co_y, a1fracldi);
+Cyt1		:= FD1Q (cyt[1], cyt[0], clk[0]);
+Ci_y		:= EO (ci_y, cyt[1], suba_y);*/
 
-	if (mir_byte)
-	{
-		/* MX4 input 2: masku[7:0] = {8{maskt[14]}} (broadcast bit 14) */
-		masku = (maskt & 0x4000) ? 0x00FF : 0x0000;
-		/* MX2: reverse bits 8-13, maskt[0] at position 14 */
-		masku |= (maskt >> 5) & 0x0100;
-		masku |= (maskt >> 3) & 0x0200;
-		masku |= (maskt >> 1) & 0x0400;
-		masku |= (maskt << 1) & 0x0800;
-		masku |= (maskt << 3) & 0x1000;
-		masku |= (maskt << 5) & 0x2000;
-		masku |= (maskt & 0x0001) << 14;
-	}
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+//I'm sure the following will generate a bunch of warnings, but will have to do for now.
+	static uint16_t co_x = 0, co_y = 0;	// Carry out has to propagate between function calls (static: value latched by the previous call)
+	uint16_t ci_x = co_x ^ (suba_x ? 1 : 0);	// carry in = latched carry XOR subtract flag (Ci_x := EO above)
+	uint16_t ci_y = co_y ^ (suba_y ? 1 : 0);	// same for Y
+	uint32_t addqt_x = adda_x + addb_x + ci_x;	// summed in 32 bits so that bit 16 holds the carry out
+	uint32_t addqt_y = adda_y + addb_y + ci_y;
+	uint16_t mask[8] = { 0xFFFF, 0xFFFE, 0xFFFC, 0xFFF8, 0xFFF0, 0xFFE0, 0xFFC0, 0x0000 };	// indexed by modx: entries 0-6 clear that many low bits, entry 7 clears all 16
+	co_x = ((addqt_x & 0x10000) && a1fracldi ? 1 : 0);	// the carry is only latched when a1fracldi is set
+	co_y = ((addqt_y & 0x10000) && a1fracldi ? 1 : 0);
 //////////////////////////////////////////////////////////////////////////////////////
 
-/* The maskt terms define the area for changed data, but the byte
-inhibit terms can override these */
+/* Mask low bits of X to 0 if required */
 
-/*Mask[0-7]	:= AN2 (mask[0-7], masku[0-7], dbinh\[0]);
-Mask[8-14]	:= AN2H (mask[8-14], masku[8-14], dbinh\[1-7]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	mask = masku & (!(dbinh & 0x01) ? 0xFFFF : 0xFF00);
-	mask &= ~(((uint16_t)dbinh & 0x00FE) << 7);
-//////////////////////////////////////////////////////////////////////////////////////
+/*Masksel		:= D38H (unused[0], masksel[0..4], maskbit[5], unused[1], modx[0..2]);
 
-/*Addql[0]	:= JOIN (addql[0], addq[0..1]);
-Addql[1]	:= JOIN (addql[1], addq[2..3]);
+Maskbit[0-4]	:= OR2 (maskbit[0-4], masksel[0-4], maskbit[1-5]);
 
-Dsel0b[0-1]	:= BUF8 (dsel0b[0-1], data_sel[0]);
-Dsel1b[0-1]	:= BUF8 (dsel1b[0-1], data_sel[1]);
-Ddatlo		:= MX4 (ddatlo, patd[0], lfu[0], addql[0], zero32, dsel0b[0], dsel1b[0]);
-Ddathi		:= MX4 (ddathi, patd[1], lfu[1], addql[1], zero32, dsel0b[1], dsel1b[1]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	dmux[0] = patd_pre;
-	dmux[1] = lfu;
-	dmux[2] = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
-	dmux[3] = 0;
-	ddat = dmux[data_sel];
-	}
-//////////////////////////////////////////////////////////////////////////////////////
+Mask[0-5]	:= MX2 (addq_x[0-5], addqt_x[0-5], zero, maskbit[0-5]);
 
-/*Zed_sel		:= AN2 (zed_sel, data_sel[0..1]);
-Zed_selb[0-1]	:= BUF8 (zed_selb[0-1], zed_sel);
+Addq_x		:= JOIN (addq_x, addq_x[0..5], addqt_x[6..15]);
+Addq_y		:= JOIN (addq_y, addq_y[0..15]);*/
 
-Dat[0-7]	:= MX4 (dat[0-7],   dstdlo{0-7},   ddatlo{0-7},   dstzlo{0-7},   srczlo{0-7},   mask[0-7], zed_selb[0]);
-Dat[8-15]	:= MX4 (dat[8-15],  dstdlo{8-15},  ddatlo{8-15},  dstzlo{8-15},  srczlo{8-15},  mask[8],   zed_selb[0]);
-Dat[16-23]	:= MX4 (dat[16-23], dstdlo{16-23}, ddatlo{16-23}, dstzlo{16-23}, srczlo{16-23}, mask[9],   zed_selb[0]);
-Dat[24-31]	:= MX4 (dat[24-31], dstdlo{24-31}, ddatlo{24-31}, dstzlo{24-31}, srczlo{24-31}, mask[10],  zed_selb[0]);
-Dat[32-39]	:= MX4 (dat[32-39], dstdhi{0-7},   ddathi{0-7},   dstzhi{0-7},   srczhi{0-7},   mask[11],  zed_selb[1]);
-Dat[40-47]	:= MX4 (dat[40-47], dstdhi{8-15},  ddathi{8-15},  dstzhi{8-15},  srczhi{8-15},  mask[12],  zed_selb[1]);
-Dat[48-55]	:= MX4 (dat[48-55], dstdhi{16-23}, ddathi{16-23}, dstzhi{16-23}, srczhi{16-23}, mask[13],  zed_selb[1]);
-Dat[56-63]	:= MX4 (dat[56-63], dstdhi{24-31}, ddathi{24-31}, dstzhi{24-31}, srczhi{24-31}, mask[14],  zed_selb[1]);*/
 ////////////////////////////////////// C++ CODE //////////////////////////////////////
-	*wdata = blitter_simd_ops.byte_merge(ddat, dstd, mask);
-	*srcz = blitter_simd_ops.byte_merge(*srcz, dstz, mask);
+	*addq_x = addqt_x & mask[modx];
+	*addq_y = addqt_y & 0xFFFF;
 //////////////////////////////////////////////////////////////////////////////////////
 
-/*Data_enab[0-1]	:= BUF8 (data_enab[0-1], data_ena);
-Datadrv[0-31]	:= TS (wdata[0-31],  dat[0-31],  data_enab[0]);
-Datadrv[32-63]	:= TS (wdata[32-63], dat[32-63], data_enab[1]);
+//Unused[0-1]	:= DUMMY (unused[0-1]);
 
-Unused[0]	:= DUMMY (unused[0]);
+//END;
+}
+
+
+/*
+DEF DATA (
+		wdata[0..63]	// co-processor write data bus
+		:BUS;
+		dcomp[0..7]		// data byte equal flags
+		srcd[0..7]		// bits to use for bit to byte expansion
+		zcomp[0..3]		// output from Z comparators
+		:OUT;
+		a1_x[0..1]		// low two bits of A1 X pointer
+		big_pix			// pixel organisation is big-endian
+		blitter_active	// blitter is active
+		clk				// co-processor clock
+		cmpdst			// compare dest rather than source
+		colorld			// load the pattern color fields
+		daddasel[0..2]	// data adder input A selection
+		daddbsel[0..3]	// data adder input B selection
+		daddmode[0..2]	// data adder mode
+		daddq_sel		// select adder output vs. GPU data
+		data[0..63]		// co-processor read data bus
+		data_ena		// enable write data
+		data_sel[0..1]	// select data to write
+		dbinh\[0..7]	// byte oriented changed data inhibits
+		dend[0..5]		// end of changed write data zone
+		dpipe[0..1]		// load computed data pipe-line latch
+		dstart[0..5]	// start of changed write data zone
+		dstdld[0..1]	// dest data load (two halves)
+		dstzld[0..1]	// dest zed load (two halves)
+		ext_int			// enable extended precision intensity calculations
+INT32/	gpu_din			// GPU data bus
+		iincld			// I increment load
+		iincldx			// alternate I increment load
+		init_if			// initialise I fraction phase
+		init_ii			// initialise I integer phase
+		init_zf			// initialise Z fraction phase
+		intld[0..3]		// computed intensities load
+		istepadd		// intensity step integer add
+		istepfadd		// intensity step fraction add
+		istepld			// I step load
+		istepdld		// I step delta load
+		lfu_func[0..3]	// LFU function code
+		patdadd			// pattern data gouraud add
+		patdld[0..1]	// pattern data load (two halves)
+		pdsel[0..1]		// select pattern data type
+		phrase_mode		// phrase write mode
+		reload			// transfer contents of double buffers
+		reset\			// system reset
+		srcd1ld[0..1]	// source register 1 load (two halves)
+		srcdread		// source data read load enable
+		srczread		// source zed read load enable
+		srcshift[0..5]	// source alignment shift
+		srcz1ld[0..1]	// source zed 1 load (two halves)
+		srcz2add		// zed fraction gouraud add
+		srcz2ld[0..1]	// source zed 2 load (two halves)
+		textrgb			// texture mapping in RGB mode
+		txtd[0..63]		// data from the texture unit
+		zedld[0..3]		// computed zeds load
+		zincld			// Z increment load
+		zmode[0..2]		// Z comparator mode
+		zpipe[0..1]		// load computed zed pipe-line latch
+		zstepadd		// zed step integer add
+		zstepfadd		// zed step fraction add
+		zstepld			// Z step load
+		zstepdld		// Z step delta load
+		:IN);
+*/
 
-END;*/
-}
 
 
 /**  COMP_CTRL - Comparator output control logic  *****************
@@ -2817,211 +3026,6 @@ performed.  The is taken care of within the zed comparator by
 pipe-lining the comparator inputs where appropriate.
 */
 
-void COMP_CTRL(uint8_t *dbinh, bool *nowrite,
-	bool bcompen, bool big_pix, bool bkgwren, uint8_t dcomp, bool dcompen, uint8_t icount,
-	uint8_t pixsize, bool phrase_mode, uint8_t srcd, uint8_t zcomp)
-{
-   //BEGIN
-
-   /*Bkgwren\	:= INV1 (bkgwren\, bkgwren);
-     Phrase_mode\	:= INV1 (phrase_mode\, phrase_mode);
-     Pixsize\[0-2]	:= INV2 (pixsize\[0-2], pixsize[0-2]);*/
-
-   /* The bit comparator bits are derived from the source data, which
-      will have been suitably aligned for phrase mode.  The contents of
-      the inner counter are used to select which bit to use.
-
-      When not in phrase mode the inner count value is used to select
-      one bit.  It is assumed that the count has already occurred, so,
-      7 selects bit 0, etc.  In big-endian pixel mode, this turns round,
-      so that a count of 7 selects bit 7.
-
-      In phrase mode, the eight bits are used directly, and this mode is
-      only applicable to 8-bit pixel mode (2/34) */
-
-   /*Bcompselt[0-2]	:= EO (bcompselt[0-2], icount[0-2], big_pix);
-Bcompbit	:= MX8 (bcompbit, srcd[7], srcd[6], srcd[5],
-srcd[4], srcd[3], srcd[2], srcd[1], srcd[0], bcompselt[0..2]);
-Bcompbit\	:= INV1 (bcompbit\, bcompbit);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   uint8_t bcompselt = (big_pix ? ~icount : icount) & 0x07;
-   uint8_t bitmask[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
-   bool bcompbit = srcd & bitmask[bcompselt];
-   bool winhibit, di0t0_1, di0t4, di1t2, di2t0_1, di2t4, di3t2;
-   bool di4t0_1, di4t4, di5t2;
-   bool di6t0_1, di6t4;
-   bool di7t2;
-
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /* pipe-line the count */
-   /*Bcompsel[0-2]	:= FDSYNC (bcompsel[0-2], bcompselt[0-2], step_inner, clk);
-Bcompbt		:= MX8 (bcompbitpt, srcd[7], srcd[6], srcd[5],
-srcd[4], srcd[3], srcd[2], srcd[1], srcd[0], bcompsel[0..2]);
-Bcompbitp	:= FD1Q (bcompbitp, bcompbitpt, clk);
-Bcompbitp\	:= INV1 (bcompbitp\, bcompbitp);*/
-
-   /* For pixel mode, generate the write inhibit signal for all modes
-      on bit inhibit, for 8 and 16 bit modes on comparator inhibit, and
-      for 16 bit mode on Z inhibit
-
-      Nowrite = bcompen . /bcompbit . /phrase_mode
-      + dcompen . dcomp[0] . /phrase_mode . pixsize = 011
-      + dcompen . dcomp[0..1] . /phrase_mode . pixsize = 100
-      + zcomp[0] . /phrase_mode . pixsize = 100
-      */
-
-   /*Nowt0		:= NAN3 (nowt[0], bcompen, bcompbit\, phrase_mode\);
-Nowt1		:= ND6  (nowt[1], dcompen, dcomp[0], phrase_mode\, pixsize\[2], pixsize[0..1]);
-Nowt2		:= ND7  (nowt[2], dcompen, dcomp[0..1], phrase_mode\, pixsize[2], pixsize\[0..1]);
-Nowt3		:= NAN5 (nowt[3], zcomp[0], phrase_mode\, pixsize[2], pixsize\[0..1]);
-Nowt4		:= NAN4 (nowt[4], nowt[0..3]);
-Nowrite		:= AN2  (nowrite, nowt[4], bkgwren\);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   *nowrite = ((bcompen && !bcompbit && !phrase_mode)
-         || (dcompen && (dcomp & 0x01) && !phrase_mode && (pixsize == 3))
-         || (dcompen && ((dcomp & 0x03) == 0x03) && !phrase_mode && (pixsize == 4))
-         || ((zcomp & 0x01) && !phrase_mode && (pixsize == 4)))
-      && !bkgwren;
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Winht		:= NAN3 (winht, bcompen, bcompbitp\, phrase_mode\);
-Winhibit	:= NAN4 (winhibit, winht, nowt[1..3]);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   //This is the same as above, but with bcompbit delayed one tick and called 'winhibit'
-   //Small difference: Besides the pipeline effect, it's also not using !bkgwren...
-   //	bool winhibit = (bcompen && !
-   winhibit = (bcompen && !bcompbit && !phrase_mode)
-      || (dcompen && (dcomp & 0x01) && !phrase_mode && (pixsize == 3))
-      || (dcompen && ((dcomp & 0x03) == 0x03) && !phrase_mode && (pixsize == 4))
-      || ((zcomp & 0x01) && !phrase_mode && (pixsize == 4));
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /* For phrase mode, generate the byte inhibit signals for eight bit
-      mode 011, or sixteen bit mode 100
-      dbinh\[0] =  pixsize[2] . zcomp[0]
-      +  pixsize[2] . dcomp[0] . dcomp[1] . dcompen
-      + /pixsize[2] . dcomp[0] . dcompen
-      + /srcd[0] . bcompen
-
-      Inhibits 0-3 are also used when not in phrase mode to write back
-      destination data.
-      */
-
-   /*Srcd\[0-7]	:= INV1 (srcd\[0-7], srcd[0-7]);
-
-Di0t0		:= NAN2H (di0t[0], pixsize[2], zcomp[0]);
-Di0t1		:= NAN4H (di0t[1], pixsize[2], dcomp[0..1], dcompen);
-Di0t2		:= NAN2 (di0t[2], srcd\[0], bcompen);
-Di0t3		:= NAN3 (di0t[3], pixsize\[2], dcomp[0], dcompen);
-Di0t4		:= NAN4 (di0t[4], di0t[0..3]);
-Dbinh[0]	:= ANR1P (dbinh\[0], di0t[4], phrase_mode, winhibit);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   *dbinh = 0;
-   di0t0_1 = ((pixsize & 0x04) && (zcomp & 0x01))
-      || ((pixsize & 0x04) && (dcomp & 0x01) && (dcomp & 0x02) && dcompen);
-   di0t4 = di0t0_1
-      || (!(srcd & 0x01) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x01) && dcompen);
-   *dbinh |= (!((di0t4 && phrase_mode) || winhibit) ? 0x01 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di1t0		:= NAN3 (di1t[0], pixsize\[2], dcomp[1], dcompen);
-Di1t1		:= NAN2 (di1t[1], srcd\[1], bcompen);
-Di1t2		:= NAN4 (di1t[2], di0t[0..1], di1t[0..1]);
-Dbinh[1]	:= ANR1 (dbinh\[1], di1t[2], phrase_mode, winhibit);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di1t2 = di0t0_1
-      || (!(srcd & 0x02) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x02) && dcompen);
-   *dbinh |= (!((di1t2 && phrase_mode) || winhibit) ? 0x02 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di2t0		:= NAN2H (di2t[0], pixsize[2], zcomp[1]);
-Di2t1		:= NAN4H (di2t[1], pixsize[2], dcomp[2..3], dcompen);
-Di2t2		:= NAN2 (di2t[2], srcd\[2], bcompen);
-Di2t3		:= NAN3 (di2t[3], pixsize\[2], dcomp[2], dcompen);
-Di2t4		:= NAN4 (di2t[4], di2t[0..3]);
-Dbinh[2]	:= ANR1 (dbinh\[2], di2t[4], phrase_mode, winhibit);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   //[bcompen=F dcompen=T phrase_mode=T bkgwren=F][nw=F wi=F]
-   //[di0t0_1=F di0t4=F][di1t2=F][di2t0_1=T di2t4=T][di3t2=T][di4t0_1=F di2t4=F][di5t2=F][di6t0_1=F di6t4=F][di7t2=F]
-   //[dcomp=$00 dbinh=$0C][7804780400007804] (icount=0005, inc=4)
-   di2t0_1 = ((pixsize & 0x04) && (zcomp & 0x02))
-      || ((pixsize & 0x04) && (dcomp & 0x04) && (dcomp & 0x08) && dcompen);
-   di2t4 = di2t0_1
-      || (!(srcd & 0x04) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x04) && dcompen);
-   *dbinh |= (!((di2t4 && phrase_mode) || winhibit) ? 0x04 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di3t0		:= NAN3 (di3t[0], pixsize\[2], dcomp[3], dcompen);
-Di3t1		:= NAN2 (di3t[1], srcd\[3], bcompen);
-Di3t2		:= NAN4 (di3t[2], di2t[0..1], di3t[0..1]);
-Dbinh[3]	:= ANR1 (dbinh\[3], di3t[2], phrase_mode, winhibit);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di3t2 = di2t0_1
-      || (!(srcd & 0x08) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x08) && dcompen);
-   *dbinh |= (!((di3t2 && phrase_mode) || winhibit) ? 0x08 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di4t0		:= NAN2H (di4t[0], pixsize[2], zcomp[2]);
-Di4t1		:= NAN4H (di4t[1], pixsize[2], dcomp[4..5], dcompen);
-Di4t2		:= NAN2 (di4t[2], srcd\[4], bcompen);
-Di4t3		:= NAN3 (di4t[3], pixsize\[2], dcomp[4], dcompen);
-Di4t4		:= NAN4 (di4t[4], di4t[0..3]);
-Dbinh[4]	:= NAN2 (dbinh\[4], di4t[4], phrase_mode);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di4t0_1 = ((pixsize & 0x04u) && (zcomp & 0x04u))
-      || ((pixsize & 0x04u) && (dcomp & 0x10u) && (dcomp & 0x20u) && dcompen);
-   di4t4 = di4t0_1
-      || (!(srcd & 0x10u) && bcompen)
-      || (!(pixsize & 0x04u) && (dcomp & 0x10u) && dcompen);
-   *dbinh |= (!(di4t4 && phrase_mode) ? 0x10u : 0x00u);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di5t0		:= NAN3 (di5t[0], pixsize\[2], dcomp[5], dcompen);
-Di5t1		:= NAN2 (di5t[1], srcd\[5], bcompen);
-Di5t2		:= NAN4 (di5t[2], di4t[0..1], di5t[0..1]);
-Dbinh[5]	:= NAN2 (dbinh\[5], di5t[2], phrase_mode);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di5t2 = di4t0_1
-      || (!(srcd & 0x20) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x20) && dcompen);
-   *dbinh |= (!(di5t2 && phrase_mode) ? 0x20 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di6t0		:= NAN2H (di6t[0], pixsize[2], zcomp[3]);
-Di6t1		:= NAN4H (di6t[1], pixsize[2], dcomp[6..7], dcompen);
-Di6t2		:= NAN2 (di6t[2], srcd\[6], bcompen);
-Di6t3		:= NAN3 (di6t[3], pixsize\[2], dcomp[6], dcompen);
-Di6t4		:= NAN4 (di6t[4], di6t[0..3]);
-Dbinh[6]	:= NAN2 (dbinh\[6], di6t[4], phrase_mode);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di6t0_1 = ((pixsize & 0x04) && (zcomp & 0x08))
-      || ((pixsize & 0x04) && (dcomp & 0x40) && (dcomp & 0x80) && dcompen);
-   di6t4 = di6t0_1
-      || (!(srcd & 0x40) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x40) && dcompen);
-   *dbinh |= (!(di6t4 && phrase_mode) ? 0x40 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di7t0		:= NAN3 (di7t[0], pixsize\[2], dcomp[7], dcompen);
-Di7t1		:= NAN2 (di7t[1], srcd\[7], bcompen);
-Di7t2		:= NAN4 (di7t[2], di6t[0..1], di7t[0..1]);
-Dbinh[7]	:= NAN2 (dbinh\[7], di7t[2], phrase_mode);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di7t2 = di6t0_1
-      || (!(srcd & 0x80) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x80) && dcompen);
-   *dbinh |= (!(di7t2 && phrase_mode) ? 0x80 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   //END;
-   //kludge
-   *dbinh = ~*dbinh;
-}
 
 #endif
 

From 25278cbd2ccb68097f8fa45e8cf67f6766914779 Mon Sep 17 00:00:00 2001
From: Joseph Mattiello <git@joemattiello.com>
Date: Sat, 2 May 2026 15:32:25 -0400
Subject: [PATCH 4/6] bench: per-frame variance + blitter trace + perf counter
 lookup

The +15% inlining win shipped in the previous commit didn't fix the
real-world AvP audio stutter, because audio dropouts are governed
by *worst-case* frame time, not average.  This commit lands the
diagnostic infrastructure that found the actual cost driver.

* test_benchmark now records per-frame timing and reports
  p50 / p99 / p999 / max ms-per-frame plus the count of frames
  that blew the 16.67 ms (60 Hz) budget.

* perf_counters.h gains `perf_counters_find(name)` so the harness
  can snapshot a counter before/after each retro_run() call and
  report per-frame deltas.

* test_benchmark uses the new lookup to print the slowest frames
  along with their blitter call count and inner-iteration count,
  and the slow-vs-avg ratio.

* Makefile gains a BLITTER_TRACE=1 toggle that adds -DBLITTER_TRACE
  to CFLAGS; src/tom/blitter.c grows a per-blit elapsed-ms dump that
  fires for any single BlitterMidsummer2 call exceeding a threshold,
  so we can tell whether worst-case frame spikes are one giant blit
  or many small ones.

What this revealed for AvP gameplay (state7, accurate blitter,
1200 frames after 120 warmup, M-series host):

   avg 5.1 ms, p99 17.0 ms, p999 18.2 ms, max 18.4 ms
   25 / 1200 frames over 16.67 ms (2%)

   Avg per frame:    403 blits,   49,774 inner iters
   Slow per frame:  2009 blits,  247,508 inner iters  (5x)

The slow frames are perfectly periodic and consistent (5x the work
on every spike, with the exact same count each time).  No single blit
is slow -- the BLITTER_TRACE dump prints zero lines even at a 0.3 ms
threshold.  AvP is just doing 5x more blitter work on roughly
every 24th frame.  That's the audio dropout root cause: the inlining
helped average throughput but the periodic 5x spikes still blow
the budget.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 Makefile                    |   9 ++
 exports-test.list           |   1 +
 link-test.T                 |   1 +
 src/core/perf_counters.c    |  15 ++++
 src/core/perf_counters.h    |   4 +
 src/tom/blitter.c           |  33 ++++++++
 test/tools/test_benchmark.c | 158 ++++++++++++++++++++++++++++++------
 7 files changed, 197 insertions(+), 24 deletions(-)

diff --git a/Makefile b/Makefile
index 91d8670e..bd01395c 100644
--- a/Makefile
+++ b/Makefile
@@ -60,6 +60,15 @@ ifeq ($(BENCH_PROFILE),1)
    CFLAGS += -DBENCH_PROFILE
 endif
 
+# Per-blit slow-path tracing in BlitterMidsummer2.
+# `make BLITTER_TRACE=1` enables a stderr dump of any single blit whose
+# wall time reaches 0.3 ms (bm2_trace_threshold_ms in src/tom/blitter.c;
+# edit it there to change the cutoff).  Useful for finding pathological
+# blit commands that dominate frame-time variance.  macOS-only (mach_*).
+ifeq ($(BLITTER_TRACE),1)
+   CFLAGS += -DBLITTER_TRACE
+endif
+
 # Symbol export gating.
 #
 #   GNU ld (Linux, Windows MSYS2, ARM, ...) honours --version-script:
diff --git a/exports-test.list b/exports-test.list
index 3ebeab6f..9cf1a5b8 100644
--- a/exports-test.list
+++ b/exports-test.list
@@ -36,3 +36,4 @@ _vjs
 _perf_counters_dump
 _perf_counters_reset
 _perf_counters_register
+_perf_counters_find
diff --git a/link-test.T b/link-test.T
index 4b57c7e4..9642acf7 100644
--- a/link-test.T
+++ b/link-test.T
@@ -39,5 +39,6 @@
       perf_counters_dump;
       perf_counters_reset;
       perf_counters_register;
+      perf_counters_find;
    local: *;
 };
diff --git a/src/core/perf_counters.c b/src/core/perf_counters.c
index ea021881..fb31153b 100644
--- a/src/core/perf_counters.c
+++ b/src/core/perf_counters.c
@@ -6,6 +6,7 @@
  * In !BENCH_PROFILE builds the bodies are no-ops and no PERF_COUNTER
  * calls perf_counters_register, so the registry stays empty.
  */
+#include <string.h>
 #include "perf_counters.h"
 
 #ifdef BENCH_PROFILE
@@ -50,3 +51,17 @@ void perf_counters_dump(FILE *out)
    (void)out;
 #endif
 }
+
+unsigned long long *perf_counters_find(const char *name)
+{ /* Look up a registered counter by name; yields its value pointer or a null pointer. */
+#ifdef BENCH_PROFILE
+   perf_counter_entry_t *e;
+   if (!name) return (unsigned long long *)0; /* null name: nothing to look up */
+   for (e = perf_head; e; e = e->next) /* linear walk of the list rooted at perf_head */
+      if (e->name && strcmp(e->name, name) == 0) /* skip unnamed entries; exact match only */
+         return e->value; /* first matching entry wins */
+#else
+   (void)name; /* parameter unused when the lookup is compiled out */
+#endif
+   return (unsigned long long *)0; /* no match, or built without BENCH_PROFILE */
+}
diff --git a/src/core/perf_counters.h b/src/core/perf_counters.h
index c0d56085..12c2cfd1 100644
--- a/src/core/perf_counters.h
+++ b/src/core/perf_counters.h
@@ -48,6 +48,10 @@ typedef struct perf_counter_entry
 void perf_counters_register(perf_counter_entry_t *entry);
 void perf_counters_dump(FILE *out);
 void perf_counters_reset(void);
+/* Return a pointer to the named counter's value, or NULL if unknown.
+ * Lets harnesses snapshot a counter before/after retro_run for
+ * per-frame deltas without exporting individual symbols. */
+unsigned long long *perf_counters_find(const char *name);
 
 #ifdef BENCH_PROFILE
 
diff --git a/src/tom/blitter.c b/src/tom/blitter.c
index e7732bff..1dffbfe5 100644
--- a/src/tom/blitter.c
+++ b/src/tom/blitter.c
@@ -1676,9 +1676,19 @@ Unused[0]	:= DUMMY (unused[0]);
 END;*/
 }
 
+#ifdef BLITTER_TRACE
+#include <mach/mach_time.h>
+#include <stdio.h>
+static double bm2_trace_threshold_ms = 0.3; /* dump any blit slower than this */
+static uint64_t bm2_trace_t0;
+#endif
+
 void BlitterMidsummer2(void)
 {
    uint32_t cmd = (PERF_INC(blitter_calls), GET32(blitter_ram, COMMAND));
+#ifdef BLITTER_TRACE
+   bm2_trace_t0 = mach_absolute_time();
+#endif
 
 
    // Line states passed in via the command register
@@ -2685,6 +2695,29 @@ A1_outside	:= OR6 (a1_outside, a1_x{15}, a1xgr, a1xeq, a1_y{15}, a1ygr, a1yeq);
    SET16(blitter_ram, A2_PIXEL + 2, a2_x);
    SET16(blitter_ram, A2_PIXEL + 0, a2_y);
 
+#ifdef BLITTER_TRACE
+   {
+      static mach_timebase_info_data_t tb;
+      uint64_t t1 = mach_absolute_time();
+      double ms;
+      if (tb.denom == 0) mach_timebase_info(&tb);
+      ms = (double)(t1 - bm2_trace_t0) * (double)tb.numer / (double)tb.denom / 1e6;
+      if (ms >= bm2_trace_threshold_ms) {
+         uint16_t pcount = GET16(blitter_ram, PIXLINECOUNTER + 2);
+         uint16_t lcount = GET16(blitter_ram, PIXLINECOUNTER);
+         uint8_t pixsize = (blitter_ram[A1_FLAGS + 3] & 0x38) >> 3;
+         fprintf(stderr,
+            "[BLITTER_TRACE] %.2f ms cmd=%08x pixsize=%u inner=%u outer=%u "
+            "src(en=%d enx=%d enz=%d) dst(en=%d enz=%d wrz=%d) "
+            "gourd=%d gourz=%d srcshade=%d bcompen=%d dcompen=%d\n",
+            ms, cmd, pixsize, pcount, lcount,
+            (int)srcen, (int)srcenx, (int)srcenz,
+            (int)dsten, (int)dstenz, (int)dstwrz,
+            (int)gourd, (int)gourz, (int)srcshade,
+            (int)bcompen, (int)dcompen);
+      }
+   }
+#endif
 }
 
 // Various pieces of the blitter puzzle are teased out here...
diff --git a/test/tools/test_benchmark.c b/test/tools/test_benchmark.c
index fc674746..61ba743b 100644
--- a/test/tools/test_benchmark.c
+++ b/test/tools/test_benchmark.c
@@ -38,6 +38,7 @@ static size_t (*pretro_serialize_size)(void);
 static bool (*pretro_unserialize)(const void *, size_t);
 /* Optional: only present when the core was built with BENCH_PROFILE=1. */
 static void (*pperf_counters_dump)(FILE *);
+static unsigned long long *(*pperf_counters_find)(const char *);
 
 /* Options state */
 static int bios_option_set = 0;
@@ -312,8 +313,9 @@ int main(int argc, char **argv)
    LOAD_SYM(retro_serialize_size);
    LOAD_SYM(retro_unserialize);
 
-   /* Optional perf-counter dump; absent unless built with BENCH_PROFILE=1. */
+   /* Optional perf-counter access; absent unless built with BENCH_PROFILE=1. */
    pperf_counters_dump = dlsym(handle, "perf_counters_dump");
+   pperf_counters_find = dlsym(handle, "perf_counters_find");
 
    pretro_set_environment(environment_cb);
    pretro_set_video_refresh(video_refresh);
@@ -492,29 +494,137 @@ int main(int argc, char **argv)
       fprintf(stderr, "--- Warmup complete ---\n");
    }
 
-   /* Timed run */
-   fprintf(stderr, "--- Benchmarking %d frames ---\n", num_frames);
-   t_start = timer_now();
-
-   for (i = 0; i < num_frames; i++)
-      pretro_run();
-
-   t_end = timer_now();
-
-   elapsed = timer_elapsed_sec(t_start, t_end);
-   fps = (double)num_frames / elapsed;
-   ms_per_frame = (elapsed * 1000.0) / (double)num_frames;
-
-   /* Print results */
-   printf("\n=== BENCHMARK RESULTS ===\n");
-   printf("Blitter mode:    %s\n",
-          strcmp(blitter_value, "enabled") == 0 ? "fast" : "accurate");
-   printf("Frames measured: %d\n", num_frames);
-   printf("Warmup frames:   %d\n", warmup_frames);
-   printf("Total time:      %.3f s\n", elapsed);
-   printf("Frames/sec:      %.2f\n", fps);
-   printf("Time/frame:      %.3f ms\n", ms_per_frame);
-   printf("=========================\n");
+   /* Timed run with per-frame samples to expose variance.  Audio
+    * dropouts in real frontends are caused by *worst-case* frames
+    * exceeding the 16.67 ms (60 Hz) budget, not by the average. */
+   {
+      double *frame_ms = (double *)malloc((size_t)num_frames * sizeof(double));
+      unsigned long long *blit_calls_at_frame = (unsigned long long *)malloc((size_t)num_frames * sizeof(unsigned long long));
+      unsigned long long *blit_inner_at_frame = (unsigned long long *)malloc((size_t)num_frames * sizeof(unsigned long long));
+      double frame_budget_ms = 1000.0 / 60.0;
+      int over_budget = 0;
+      double max_ms = 0.0;
+      double p50_ms = 0.0, p99_ms = 0.0, p999_ms = 0.0;
+      unsigned long long *blit_calls_ctr = pperf_counters_find ? pperf_counters_find("blitter_calls") : NULL;
+      unsigned long long *blit_inner_ctr = pperf_counters_find ? pperf_counters_find("blitter_inner") : NULL;
+      unsigned long long blit_calls_prev = blit_calls_ctr ? *blit_calls_ctr : 0;
+      unsigned long long blit_inner_prev = blit_inner_ctr ? *blit_inner_ctr : 0;
+
+      if (!frame_ms || !blit_calls_at_frame || !blit_inner_at_frame)
+      {
+         fprintf(stderr, "ERROR: malloc failed for per-frame timing\n");
+         pretro_unload_game(); pretro_deinit();
+         free((void *)info.data); dlclose(handle);
+         return 1;
+      }
+
+      fprintf(stderr, "--- Benchmarking %d frames ---\n", num_frames);
+      t_start = timer_now();
+
+      for (i = 0; i < num_frames; i++)
+      {
+         uint64_t f0 = timer_now();
+         uint64_t f1;
+         pretro_run();
+         f1 = timer_now();
+         frame_ms[i] = timer_elapsed_sec(f0, f1) * 1000.0;
+         if (blit_calls_ctr) {
+            blit_calls_at_frame[i] = *blit_calls_ctr - blit_calls_prev;
+            blit_calls_prev = *blit_calls_ctr;
+         } else blit_calls_at_frame[i] = 0;
+         if (blit_inner_ctr) {
+            blit_inner_at_frame[i] = *blit_inner_ctr - blit_inner_prev;
+            blit_inner_prev = *blit_inner_ctr;
+         } else blit_inner_at_frame[i] = 0;
+      }
+
+      t_end = timer_now();
+
+      elapsed = timer_elapsed_sec(t_start, t_end);
+      fps = (double)num_frames / elapsed;
+      ms_per_frame = (elapsed * 1000.0) / (double)num_frames;
+
+      /* Sort a copy so frame_ms keeps its original per-frame order; the
+       * worst-frames report below pairs times with counters by index. */
+      {
+         double *sorted = (double *)malloc((size_t)num_frames * sizeof(double));
+         int j;
+         if (sorted)
+         {
+            memcpy(sorted, frame_ms, (size_t)num_frames * sizeof(double));
+            /* Insertion sort (small N typical). */
+            for (i = 1; i < num_frames; i++)
+            {
+               double key = sorted[i];
+               j = i - 1;
+               while (j >= 0 && sorted[j] > key) { sorted[j + 1] = sorted[j]; j--; }
+               sorted[j + 1] = key;
+            }
+            p50_ms  = sorted[(int)((double)num_frames * 0.50)];
+            p99_ms  = sorted[(int)((double)num_frames * 0.99)];
+            p999_ms = sorted[(int)((double)num_frames * 0.999)];
+            max_ms  = sorted[num_frames - 1];
+            free(sorted);
+         }
+      }
+      for (i = 0; i < num_frames; i++)
+         if (frame_ms[i] > frame_budget_ms) over_budget++;
+
+      /* Print results */
+      printf("\n=== BENCHMARK RESULTS ===\n");
+      printf("Blitter mode:    %s\n",
+             strcmp(blitter_value, "enabled") == 0 ? "fast" : "accurate");
+      printf("Frames measured: %d\n", num_frames);
+      printf("Warmup frames:   %d\n", warmup_frames);
+      printf("Total time:      %.3f s\n", elapsed);
+      printf("Frames/sec:      %.2f\n", fps);
+      printf("Time/frame avg:  %.3f ms\n", ms_per_frame);
+      printf("Time/frame p50:  %.3f ms\n", p50_ms);
+      printf("Time/frame p99:  %.3f ms\n", p99_ms);
+      printf("Time/frame p999: %.3f ms\n", p999_ms);
+      printf("Time/frame max:  %.3f ms\n", max_ms);
+      printf("Over 16.67 ms:   %d / %d frames (%.2f%%)\n",
+             over_budget, num_frames, 100.0 * over_budget / num_frames);
+      printf("=========================\n");
+
+      /* If we have per-frame blitter counters, dump the slowest frames
+       * so we can correlate blit volume with frame-time spikes. */
+      if (over_budget > 0 && blit_calls_ctr) {
+         int j;
+         double avg_calls = 0.0, avg_inner = 0.0;
+         double slow_calls = 0.0, slow_inner = 0.0;
+         int slow_n = 0;
+         printf("\n--- Worst frames (>16.67ms) -----------------------------\n");
+         printf("  idx  frame_ms  blit_calls  blit_inner_iter\n");
+         for (j = 0; j < num_frames; j++) {
+            avg_calls += blit_calls_at_frame[j];
+            avg_inner += blit_inner_at_frame[j];
+            if (frame_ms[j] > frame_budget_ms) {
+               slow_calls += blit_calls_at_frame[j];
+               slow_inner += blit_inner_at_frame[j];
+               slow_n++;
+               if (slow_n <= 12)
+                  printf("  %4d  %7.2f   %10llu   %15llu\n",
+                         j, frame_ms[j],
+                         blit_calls_at_frame[j],
+                         blit_inner_at_frame[j]);
+            }
+         }
+         printf("---\n");
+         printf("Avg per frame (all):    blits=%.0f  inner_iter=%.0f\n",
+                avg_calls / num_frames, avg_inner / num_frames);
+         if (slow_n > 0)
+            printf("Avg per frame (slow):   blits=%.0f  inner_iter=%.0f  (%dx, %dx vs avg)\n",
+                   slow_calls / slow_n, slow_inner / slow_n,
+                   (int)((slow_calls / slow_n) / (avg_calls / num_frames + 1e-9)),
+                   (int)((slow_inner / slow_n) / (avg_inner / num_frames + 1e-9)));
+         printf("=========================================================\n");
+      }
+
+      free(frame_ms);
+      free(blit_calls_at_frame);
+      free(blit_inner_at_frame);
+   }
 
    if (pperf_counters_dump)
       pperf_counters_dump(stderr);

From b8cc8b42e88fd1ced65b0f0fbcf6e67f33ea2032 Mon Sep 17 00:00:00 2001
From: Joseph Mattiello <git@joemattiello.com>
Date: Sat, 2 May 2026 15:37:51 -0400
Subject: [PATCH 5/6] fix(blitter): portable always-inline (MSVC has no GCC
 attribute)

MSVC x86 / x64 compilation checks failed on PR #129 because
`__attribute__((always_inline))` is GCC/Clang-specific.  Wrap it
in a BLITTER_ALWAYS_INLINE macro that maps to:

  GCC / Clang: inline __attribute__((always_inline))
  MSVC:        __forceinline                    (replaces inline)
  Other:       inline                           (best-effort)

The macro spells the inline keyword itself so call sites are just
`static BLITTER_ALWAYS_INLINE void foo(...)` -- no extra INLINE
qualifier (MSVC's __forceinline conflicts with another inline
keyword, which is why the original `static INLINE __attribute__(...)`
form would have failed there even if MSVC understood the attribute).

Verified: clang still inlines (AvP accurate ~196 FPS, same as the
attribute-only form); test suite passes; libretro buildbot's MSVC
target should now build clean.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 src/tom/blitter.c | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/src/tom/blitter.c b/src/tom/blitter.c
index 1dffbfe5..c76461a1 100644
--- a/src/tom/blitter.c
+++ b/src/tom/blitter.c
@@ -34,6 +34,20 @@
 #define USE_ORIGINAL_BLITTER
 #define USE_MIDSUMMER_BLITTER_MKII
 
+/* Portable always-inline.  Spelled to include the inline keyword
+ * itself (MSVC's __forceinline IS the inline keyword for that
+ * compiler), so call sites use `static BLITTER_ALWAYS_INLINE void
+ * foo(...)` without an extra INLINE/inline.  Used to force inlining
+ * of the blitter helpers (ADD16SAT, ADDARRAY, COMP_CTRL, DATA) so
+ * the compiler can specialise them per call site. */
+#if defined(_MSC_VER)
+#  define BLITTER_ALWAYS_INLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__)
+#  define BLITTER_ALWAYS_INLINE inline __attribute__((always_inline))
+#else
+#  define BLITTER_ALWAYS_INLINE inline
+#endif
+
 // Local global variables
 
 // Blitter register RAM (most of it is hidden from the user)
@@ -993,7 +1007,7 @@ void ADDRADD(int16_t *addq_x, int16_t *addq_y, bool a1fracldi,
  * call sites in BlitterMidsummer2 (compile-time daddasel/daddbsel/
  * daddmode -> dead switch arms eliminated) and the call inside DATA
  * (where the args are loop-invariant for the duration of a blit). */
-static INLINE __attribute__((always_inline))
+static BLITTER_ALWAYS_INLINE
 void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, uint16_t b,
               uint8_t cin, bool sat, bool eightbit, bool hicinh)
 {
@@ -1031,7 +1045,7 @@ void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, uint16_t b,
    *r |= (hisaturate ? (ctop ? 0xFF00 : 0x0000) : q & 0xFF00);
 }
 
-static INLINE __attribute__((always_inline))
+static BLITTER_ALWAYS_INLINE
 void ADDARRAY(uint16_t *addq, uint8_t daddasel, uint8_t daddbsel,
               uint8_t daddmode, uint64_t dstd, uint32_t iinc,
               uint8_t initcin[], uint64_t initinc, uint16_t initpix,
@@ -1119,7 +1133,7 @@ void ADDARRAY(uint16_t *addq, uint8_t daddasel, uint8_t daddbsel,
    ADD16SAT(&addq[3], &co[3], adda[3], addb[3], cin[3], sat, eightbit, hicinh);
 }
 
-static INLINE __attribute__((always_inline))
+static BLITTER_ALWAYS_INLINE
 void COMP_CTRL(uint8_t *dbinh, bool *nowrite,
 	bool bcompen, bool big_pix, bool bkgwren, uint8_t dcomp, bool dcompen, uint8_t icount,
 	uint8_t pixsize, bool phrase_mode, uint8_t srcd, uint8_t zcomp)
@@ -1326,7 +1340,7 @@ Dbinh[7]	:= NAN2 (dbinh\[7], di7t[2], phrase_mode);*/
    *dbinh = ~*dbinh;
 }
 
-static INLINE __attribute__((always_inline))
+static BLITTER_ALWAYS_INLINE
 void DATA(uint64_t *wdata, uint8_t *dcomp, uint8_t *zcomp, bool *nowrite,
 	bool big_pix, bool cmpdst, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode, bool daddq_sel, uint8_t data_sel,
 	uint8_t dbinh, uint8_t dend, uint8_t dstart, uint64_t dstd, uint32_t iinc, uint8_t lfu_func, uint64_t *patd, bool patdadd,

From 57741b4908606b6f249bd421af1f66e419bd7d5f Mon Sep 17 00:00:00 2001
From: Joseph Mattiello <git@joemattiello.com>
Date: Sat, 2 May 2026 15:50:03 -0400
Subject: [PATCH 6/6] fix(bench): validate num_frames > 0 and warmup_frames >=
 0

Copilot review on PR #129 caught that the per-frame stats path
unconditionally allocates / sorts / divides by num_frames, which
would OOB-access sorted[num_frames - 1] and divide by zero if a
caller passed 0 or a negative number.  Both come from atoi() which
returns 0 on garbage input, so this was reachable.

Validate both at parse time and exit with a clear error before
allocating anything.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 test/tools/test_benchmark.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/test/tools/test_benchmark.c b/test/tools/test_benchmark.c
index 61ba743b..05b3e2a4 100644
--- a/test/tools/test_benchmark.c
+++ b/test/tools/test_benchmark.c
@@ -256,6 +256,17 @@ int main(int argc, char **argv)
          num_frames = atoi(argv[i]);
    }
 
+   if (num_frames <= 0)
+   {
+      fprintf(stderr, "ERROR: num_frames must be a positive integer (got %d)\n", num_frames);
+      return 1;
+   }
+   if (warmup_frames < 0)
+   {
+      fprintf(stderr, "ERROR: --warmup must be >= 0 (got %d)\n", warmup_frames);
+      return 1;
+   }
+
 #ifdef __APPLE__
    /* Initialize timebase for mach_absolute_time conversion */
    mach_timebase_info(&timebase_info);