diff --git a/Makefile b/Makefile
index 064b5e16..bd01395c 100644
--- a/Makefile
+++ b/Makefile
@@ -53,6 +53,22 @@ ifeq ($(DEBUG),1)
    CFLAGS += -DBUILD_TIMESTAMP="\"debug $(shell date -u +%Y-%m-%dT%H:%M:%SZ)\""
 endif
 
+# Opt-in instrumentation counters (src/core/perf_counters.h).
+# `make BENCH_PROFILE=1` adds -DBENCH_PROFILE so PERF_COUNTER/PERF_INC/PERF_ADD
+# emit real code; any other value leaves the counter macros as no-op stubs.
+ifeq ($(BENCH_PROFILE),1)
+   CFLAGS += -DBENCH_PROFILE
+endif
+
+# Per-blit slow-path tracing in BlitterMidsummer2.
+# `make BLITTER_TRACE=1` enables a stderr dump of any single blit
+# whose wall time exceeds ~1.5 ms (configurable via the threshold in
+# src/tom/blitter.c).  Useful for finding pathological blit commands
+# that dominate frame-time variance.  macOS-only (uses mach_*).
+ifeq ($(BLITTER_TRACE),1)
+   CFLAGS += -DBLITTER_TRACE
+endif
+
 # Symbol export gating.
 #
 #   GNU ld (Linux, Windows MSYS2, ARM, ...) honours --version-script:
@@ -869,17 +885,25 @@ BENCH_ROM     ?= test/roms/yarc.j64
 BENCH_FRAMES  ?= 600
 BENCH_WARMUP  ?= 60
 BENCH_BLITTER ?= fast
-benchmark: $(TARGET)
-	@# Build the harness inline so this works whether or not TEST_EXPORTS=1
-	@# was used for $(TARGET); the harness only uses retro_* exports.
-	@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc
-	@# (and Apple's clang silently accepts -ldl as a no-op, but other
-	@# linkers may not).
+# BENCH_PROFILE=1 enables src/core/perf_counters.h instrumentation and
+# the wide test-export ABI so test_benchmark can dlsym `perf_counters_dump`.
+ifeq ($(BENCH_PROFILE),1)
+BENCH_TEST_EXPORTS := TEST_EXPORTS=1
+else
+BENCH_TEST_EXPORTS :=
+endif
+benchmark:
+	@# Re-invoke make so BENCH_PROFILE / TEST_EXPORTS take effect on the .so/.dylib.  A default -jN is added only when the caller passed no -j, so an inherited jobserver is never overridden.
+	$(MAKE) $(BENCH_TEST_EXPORTS) BENCH_PROFILE=$(BENCH_PROFILE) $(if $(filter -j% --jobserver%,$(MAKEFLAGS)),,-j$(shell getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4))
+	@# Build the harness inline; it dlopens the core, so it only needs the retro_* ABI
+	@# (plus the optional perf_counters_dump symbol when BENCH_PROFILE=1).
+	@# -ldl is Linux-specific; macOS/BSD provide dl* in libSystem/libc.
 	$(CC) -O2 -Wall -std=c99 $(INCFLAGS) \
 		-o test/tools/test_benchmark test/tools/test_benchmark.c \
 		$(if $(filter Linux,$(shell uname -s)),-ldl)
 	./test/tools/test_benchmark ./$(TARGET) "$(BENCH_ROM)" $(BENCH_FRAMES) \
-		--warmup $(BENCH_WARMUP) --blitter $(BENCH_BLITTER)
+		--warmup $(BENCH_WARMUP) --blitter $(BENCH_BLITTER) \
+		$(if $(BENCH_STATE),--load-state "$(BENCH_STATE)")
 
 print-%:
 	@echo '$*=$($*)'
diff --git a/Makefile.common b/Makefile.common
index 091677fa..21c8a521 100644
--- a/Makefile.common
+++ b/Makefile.common
@@ -33,6 +33,7 @@ SOURCES_C :=  \
 	$(CORE_DIR)/src/cd/cdrom.c \
 	$(CORE_DIR)/src/core/cheat.c \
 	$(CORE_DIR)/src/core/crc32.c \
+	$(CORE_DIR)/src/core/perf_counters.c \
 	$(CORE_DIR)/src/core/event.c \
 	$(CORE_DIR)/src/jerry/eeprom.c \
 	$(CORE_DIR)/src/core/filedb.c \
diff --git a/docs/profiling.md b/docs/profiling.md
index 5c2dbd20..924371fb 100644
--- a/docs/profiling.md
+++ b/docs/profiling.md
@@ -23,11 +23,21 @@ Reports `Frames/sec`, `Time/frame`, total wall time.  Boots the core via `dlopen
 
 **Instruments (Time Profiler)** is the easiest way to get a flame graph on macOS.
 
+The wrapper at `scripts/profile-mac.sh` builds the core, runs the benchmark
+under `xctrace`, and writes a `.trace` bundle you can open in Instruments:
+
+```bash
+scripts/profile-mac.sh                                    # default: Time Profiler, accurate blitter
+scripts/profile-mac.sh --template "CPU Counters"          # PMU: cycles, instructions, branch misses
+scripts/profile-mac.sh --rom test/roms/yarc.j64 --open    # auto-open the trace
+```
+
+Manual invocation if you'd rather attach to a running process:
+
 ```bash
 make benchmark BENCH_FRAMES=6000 BENCH_WARMUP=120 &
 BENCH_PID=$!
 
-# Sample for 30 seconds, output to .trace bundle
 xcrun xctrace record --template "Time Profiler" --attach $BENCH_PID --output bench.trace --time-limit 30s
 open bench.trace
 ```
@@ -41,6 +51,57 @@ sample $BENCH_PID 5 -file /tmp/sample.txt
 # 5-second sample.  Read /tmp/sample.txt for collapsed call stacks.
 ```
 
+## Bespoke counters — `BENCH_PROFILE=1`
+
+Sampling profilers tell you *where* time goes; counters tell you *how often*
+something happens.  When you want exact iteration counts (e.g., "did my
+fast-path actually skip the inner loop?"), use the `perf_counters` system in
+`src/core/perf_counters.h`.
+
+```bash
+make benchmark BENCH_PROFILE=1 BENCH_BLITTER=accurate BENCH_FRAMES=300
+# ...
+# [perf] counter dump:
+# [perf]   blitter_phrase_writes                    3034993
+# [perf]   blitter_phrase_reads                     931821
+# [perf]   blitter_inner_io                         3966814
+# [perf]   blitter_inner                            4131151
+# [perf]   blitter_outer                            337722
+# [perf]   blitter_calls                            131628
+```
+
+The macros are zero-overhead when `BENCH_PROFILE` is undefined (default
+build) — every `PERF_INC` becomes the constant `(0)`, `PERF_ADD` only
+evaluates its argument, and every `PERF_COUNTER` becomes a typedef.  Use
+them freely in hot paths to instrument hypotheses.
+
+Adding a counter:
+
+```c
+#include "perf_counters.h"
+
+PERF_COUNTER(my_event);             /* file scope */
+
+void hot(void) {
+    PERF_INC(my_event);             /* in-loop */
+    PERF_ADD(my_event, n);          /* batch */
+}
+```
+
+The harness (`test/tools/test_benchmark.c`) calls
+`perf_counters_dump(stderr)` at exit; counter values appear right
+before the `BENCHMARK RESULTS` block.
+
+When to reach for this vs. Time Profiler:
+
+| Question | Tool |
+|---|---|
+| "Where are we spending cycles?" | `xctrace` Time Profiler |
+| "How many times does the inner loop run per frame?" | `BENCH_PROFILE=1` |
+| "What fraction of inner iterations are no-ops?" | `BENCH_PROFILE=1` |
+| "Are we hitting L1 / branch-mispredicting?" | `xctrace` CPU Counters |
+| "Did this optimization change behavior, not just timing?" | `BENCH_PROFILE=1` (deltas in counts) |
+
 ## Linux — `perf` + flamegraph
 
 ```bash
diff --git a/exports-test.list b/exports-test.list
index 0b3fbff2..9cf1a5b8 100644
--- a/exports-test.list
+++ b/exports-test.list
@@ -33,3 +33,7 @@ _sclk
 _smode
 _lowerField
 _vjs
+_perf_counters_dump
+_perf_counters_reset
+_perf_counters_register
+_perf_counters_find
diff --git a/link-test.T b/link-test.T
index 9a52e19a..9642acf7 100644
--- a/link-test.T
+++ b/link-test.T
@@ -36,5 +36,9 @@
       smode;
       lowerField;
       vjs;
+      perf_counters_dump;
+      perf_counters_reset;
+      perf_counters_register;
+      perf_counters_find;
    local: *;
 };
diff --git a/scripts/profile-mac.sh b/scripts/profile-mac.sh
new file mode 100755
index 00000000..26c9f400
--- /dev/null
+++ b/scripts/profile-mac.sh
@@ -0,0 +1,99 @@
+#!/usr/bin/env bash
+#
+# profile-mac.sh -- Run test_benchmark under Xcode Instruments on Apple Silicon
+# (or any Mac with Xcode CLT).
+#
+# Usage:
+#   scripts/profile-mac.sh [--template NAME] [--frames N] [--warmup N]
+#                          [--blitter fast|accurate] [--rom PATH] [--open]
+#
+# Defaults:
+#   template = "Time Profiler"
+#   frames   = 600  warmup = 60  blitter = accurate
+#   rom      = test/roms/yarc.j64
+#   --open   = off; pass it to open the .trace bundle in Instruments when finished
+#
+# Common templates:
+#   "Time Profiler"   -- where time is being spent (call tree / flame)
+#   "CPU Counters"    -- Apple Silicon PMU (cycles, instr, branches, misses)
+#   "System Trace"    -- syscalls, scheduler, VM events
+#
+set -euo pipefail
+
+TEMPLATE="Time Profiler"
+FRAMES=600
+WARMUP=60
+BLITTER=accurate
+ROM="test/roms/yarc.j64"
+OPEN_TRACE=0
+
+# need_value OPT ARGC -- exit 2 with a readable message when option OPT was
+# given without a value.  Under `set -u` the bare "$2" would otherwise abort
+# the script with an opaque "unbound variable" error.
+need_value() {
+   if [ "$2" -lt 2 ]; then
+      echo "Missing value for $1" >&2
+      exit 2
+   fi
+}
+
+while [ $# -gt 0 ]; do
+   case "$1" in
+      --template) need_value "$1" "$#"; TEMPLATE="$2"; shift 2 ;;
+      --frames)   need_value "$1" "$#"; FRAMES="$2"; shift 2 ;;
+      --warmup)   need_value "$1" "$#"; WARMUP="$2"; shift 2 ;;
+      --blitter)  need_value "$1" "$#"; BLITTER="$2"; shift 2 ;;
+      --rom)      need_value "$1" "$#"; ROM="$2"; shift 2 ;;
+      --open)     OPEN_TRACE=1; shift ;;
+      -h|--help)
+         # Print the header comment (script lines 2-20) as the help text.
+         sed -n '2,20p' "$0"
+         exit 0 ;;
+      *)
+         echo "Unknown arg: $1" >&2
+         exit 2 ;;
+   esac
+done
+
+if ! command -v xctrace >/dev/null 2>&1; then
+   echo "xctrace not found. Install Xcode Command Line Tools: xcode-select --install" >&2
+   exit 1
+fi
+
+# Work from the repository root so the relative paths below resolve.
+ROOT="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$ROOT"
+
+# Fail before the (slow) build when the ROM is missing.
+if [ ! -f "$ROM" ]; then
+   echo "ROM not found: $ROM" >&2
+   exit 1
+fi
+
+mkdir -p build
+TRACE="build/profile-$(date +%Y%m%d-%H%M%S).trace"
+
+# Make sure the core + harness are built (no BENCH_PROFILE; profiling
+# instrumentation skews sampling results).
+make -j"$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)" >/dev/null
+cc -O2 -Wall -std=c99 -I. -I./libretro-common/include \
+   -o test/tools/test_benchmark test/tools/test_benchmark.c
+
+CORE="./virtualjaguar_libretro.dylib"
+HARNESS="./test/tools/test_benchmark"
+
+echo ">>> xctrace template:   $TEMPLATE"
+echo ">>> trace output:       $TRACE"
+echo ">>> rom / blitter:      $ROM / $BLITTER"
+echo ">>> frames (+warmup):   $FRAMES (+$WARMUP)"
+
+xctrace record \
+   --template "$TEMPLATE" \
+   --output "$TRACE" \
+   --launch -- "$HARNESS" "$CORE" "$ROM" "$FRAMES" \
+                          --warmup "$WARMUP" --blitter "$BLITTER"
+
+echo ">>> trace written to $TRACE"
+if [ "$OPEN_TRACE" = "1" ]; then
+   open "$TRACE"
+fi
diff --git a/src/core/perf_counters.c b/src/core/perf_counters.c
new file mode 100644
index 00000000..fb31153b
--- /dev/null
+++ b/src/core/perf_counters.c
@@ -0,0 +1,94 @@
+/*
+ * perf_counters.c - registry + dump for opt-in instrumentation counters.
+ *
+ * The register/dump/reset/find functions are *always* defined so they can
+ * be exported through the test ABI without conditional linker scripts.
+ * In !BENCH_PROFILE builds the bodies are no-ops and no PERF_COUNTER
+ * calls perf_counters_register, so the registry stays empty.
+ */
+#include <string.h>
+#include "perf_counters.h"
+
+#ifdef BENCH_PROFILE
+/* Head of the singly linked registry.  Entries are prepended, so the
+ * most recently registered counter is visited first. */
+static perf_counter_entry_t *perf_head = (perf_counter_entry_t *)0;
+#endif
+
+/*
+ * Link `entry` at the head of the registry.
+ *
+ * NULL entries and entries already present in the list are ignored.
+ * Presence is checked by walking the list rather than by testing
+ * entry->next: the list tail has a NULL next just like an unlinked
+ * entry, so the pointer test alone would re-link the tail and close
+ * the list into a cycle.  No-op when BENCH_PROFILE is undefined.
+ */
+void perf_counters_register(perf_counter_entry_t *entry)
+{
+#ifdef BENCH_PROFILE
+   perf_counter_entry_t *e;
+   if (!entry)
+      return;
+   for (e = perf_head; e; e = e->next)
+      if (e == entry)
+         return; /* already linked */
+   entry->next = perf_head;
+   perf_head = entry;
+#else
+   (void)entry;
+#endif
+}
+
+/* Zero every registered counter.  No-op when BENCH_PROFILE is undefined. */
+void perf_counters_reset(void)
+{
+#ifdef BENCH_PROFILE
+   perf_counter_entry_t *e;
+   for (e = perf_head; e; e = e->next)
+      *e->value = 0;
+#endif
+}
+
+/*
+ * Write one "[perf]   <name> <value>" line per registered counter to
+ * `out` (stderr when `out` is NULL), or a single "no counters
+ * registered" line when the registry is empty.  No-op when
+ * BENCH_PROFILE is undefined.
+ */
+void perf_counters_dump(FILE *out)
+{
+#ifdef BENCH_PROFILE
+   perf_counter_entry_t *e;
+   if (!out)
+      out = stderr;
+   if (!perf_head) {
+      fprintf(out, "[perf] no counters registered\n");
+      return;
+   }
+   fprintf(out, "[perf] counter dump:\n");
+   for (e = perf_head; e; e = e->next)
+      fprintf(out, "[perf]   %-40s %llu\n", e->name, *e->value);
+#else
+   (void)out;
+#endif
+}
+
+/*
+ * Return a pointer to the value of the counter registered as `name`,
+ * or NULL when `name` is NULL or unknown.  Always NULL when
+ * BENCH_PROFILE is undefined.
+ */
+unsigned long long *perf_counters_find(const char *name)
+{
+#ifdef BENCH_PROFILE
+   perf_counter_entry_t *e;
+   if (!name) return (unsigned long long *)0;
+   for (e = perf_head; e; e = e->next)
+      if (e->name && strcmp(e->name, name) == 0)
+         return e->value;
+#else
+   (void)name;
+#endif
+   return (unsigned long long *)0;
+}
diff --git a/src/core/perf_counters.h b/src/core/perf_counters.h
new file mode 100644
index 00000000..12c2cfd1
--- /dev/null
+++ b/src/core/perf_counters.h
@@ -0,0 +1,90 @@
+/*
+ * perf_counters.h - lightweight, opt-in instrumentation counters.
+ *
+ * Define BENCH_PROFILE at compile time to enable. Otherwise PERF_COUNTER is a
+ * typedef and PERF_INC/PERF_ADD yield 0; only the no-op registry stubs remain.
+ *
+ * Usage:
+ *
+ *   #include "perf_counters.h"
+ *
+ *   PERF_COUNTER(blitter_inner);
+ *   PERF_COUNTER(blitter_phrase_reads);
+ *
+ *   void hot(void) {
+ *       PERF_INC(blitter_inner);
+ *       PERF_ADD(blitter_phrase_reads, 2);
+ *   }
+ *
+ *   // Somewhere at shutdown (e.g., test harness atexit):
+ *   perf_counters_dump(stderr);
+ *
+ * Counters self-register via constructor functions, so PERF_COUNTER must
+ * appear at file scope. Only one definition per name across the program.
+ *
+ * C89-style: no designated initializers or mid-block declarations (it does rely on unsigned long long and, when enabled, GNU __attribute__((constructor))).
+ */
+#ifndef VJ_PERF_COUNTERS_H
+#define VJ_PERF_COUNTERS_H
+
+#include <stdio.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Registry types and entry points are *always* declared so the test
+ * ABI can export them unconditionally.  When BENCH_PROFILE is undefined
+ * the bodies (in perf_counters.c) become no-ops and no PERF_COUNTER
+ * macro registers anything, so the registry stays empty. */
+
+typedef struct perf_counter_entry
+{
+   const char *name;                 /* label printed by perf_counters_dump */
+   unsigned long long *value;        /* points at the counter's storage */
+   struct perf_counter_entry *next;  /* next registry node; NULL when unlinked or last */
+} perf_counter_entry_t;
+
+void perf_counters_register(perf_counter_entry_t *entry); /* link entry into the registry */
+void perf_counters_dump(FILE *out);   /* print all counters; NULL out means stderr */
+void perf_counters_reset(void);       /* zero every registered counter */
+/* Return a pointer to the named counter's value, or NULL if unknown.
+ * Lets harnesses snapshot a counter before/after retro_run for
+ * per-frame deltas without exporting individual symbols. */
+unsigned long long *perf_counters_find(const char *name);
+
+#ifdef BENCH_PROFILE /* instrumented build; PERF_COUNTER relies on GNU __attribute__((constructor)) */
+
+#define PERF_COUNTER(name) \
+   static unsigned long long perf_##name = 0; \
+   static perf_counter_entry_t perf_entry_##name = \
+      { #name, &perf_##name, (perf_counter_entry_t *)0 }; \
+   __attribute__((constructor)) \
+   static void perf_register_##name(void) { \
+      perf_counters_register(&perf_entry_##name); \
+   } \
+   typedef int perf_##name##_decl_semicolon_eater /* consumes the caller's ';' */
+
+/* PERF_INC / PERF_ADD are expressions of integer type (not statements),
+ * so they can be embedded in declaration initializers via the comma
+ * operator without violating C89's no-decl-after-statement rule:
+ *   uint32_t cmd = (PERF_INC(my_event), real_value());
+ */
+#define PERF_INC(name)    (++perf_##name)
+#define PERF_ADD(name, n) (perf_##name += (unsigned long long)(n))
+
+#else /* !BENCH_PROFILE */
+
+#define PERF_COUNTER(name) typedef int perf_##name##_unused /* consumes the caller's ';' */
+/* No-op forms stay integer expressions (not void) so comma-operator call
+ * sites compile unchanged; PERF_ADD still evaluates n for its side effects. */
+#define PERF_INC(name)        (0)
+#define PERF_ADD(name, n)     ((void)(n), 0)
+
+#endif /* BENCH_PROFILE */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* VJ_PERF_COUNTERS_H */
diff --git a/src/tom/blitter.c b/src/tom/blitter.c
index ac40d94e..c76461a1 100644
--- a/src/tom/blitter.c
+++ b/src/tom/blitter.c
@@ -26,6 +26,7 @@
 
 #include <string.h>
 #include "jaguar.h"
+#include "perf_counters.h"
 #include "state.h"
 
 // Various conditional compilation goodies...
@@ -33,6 +34,20 @@
 #define USE_ORIGINAL_BLITTER
 #define USE_MIDSUMMER_BLITTER_MKII
 
+/* Portable always-inline.  Spelled to include the inline keyword
+ * itself (MSVC's __forceinline IS the inline keyword for that
+ * compiler), so call sites use `static BLITTER_ALWAYS_INLINE void
+ * foo(...)` without an extra INLINE/inline.  Used to force inlining
+ * of the blitter helpers (ADD16SAT, ADDARRAY, COMP_CTRL, DATA) so
+ * the compiler can specialise them per call site. */
+#if defined(_MSC_VER)                         /* MSVC */
+#  define BLITTER_ALWAYS_INLINE __forceinline
+#elif defined(__GNUC__) || defined(__clang__) /* GCC / Clang */
+#  define BLITTER_ALWAYS_INLINE inline __attribute__((always_inline))
+#else                                         /* other compilers: plain inline hint only */
+#  define BLITTER_ALWAYS_INLINE inline
+#endif
+
 // Local global variables
 
 // Blitter register RAM (most of it is hidden from the user)
@@ -44,6 +59,14 @@ uint8_t blitter_ram[0x100];
 void BlitterMidsummer(uint32_t cmd);
 void BlitterMidsummer2(void);
 
+PERF_COUNTER(blitter_calls);
+PERF_COUNTER(blitter_outer);
+PERF_COUNTER(blitter_inner);
+PERF_COUNTER(blitter_inner_io);
+PERF_COUNTER(blitter_inner_idle);
+PERF_COUNTER(blitter_phrase_reads);
+PERF_COUNTER(blitter_phrase_writes);
+
 #define REG(A)	(((uint32_t)blitter_ram[(A)] << 24) | ((uint32_t)blitter_ram[(A)+1] << 16) \
 				| ((uint32_t)blitter_ram[(A)+2] << 8) | (uint32_t)blitter_ram[(A)+3])
 #define WREG(A,D)	(blitter_ram[(A)] = ((D)>>24)&0xFF, blitter_ram[(A)+1] = ((D)>>16)&0xFF, \
@@ -960,11 +983,11 @@ void blitter_blit(uint32_t cmd)
 void ADDRGEN(uint32_t *, uint32_t *, bool, bool,
 	uint16_t, uint16_t, uint32_t, uint8_t, uint8_t, uint8_t, uint8_t,
 	uint16_t, uint16_t, uint32_t, uint8_t, uint8_t, uint8_t, uint8_t);
-void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode,
-	uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix,
-	uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2,
-	uint32_t zinc, uint32_t zstep);
-void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, uint16_t b, uint8_t cin, bool sat, bool eightbit, bool hicinh);
+/* ADD16SAT / ADDARRAY are defined inline below so the compiler can
+ * specialise per call-site (most callers pass compile-time constants
+ * for daddasel/daddbsel/daddmode and the sat/eightbit/hicinh flags).
+ * Profile data on AvP gameplay shows ADDARRAY as the single largest
+ * leaf in the entire emulator, called millions of times per frame. */
 void ADDAMUX(int16_t *adda_x, int16_t *adda_y, uint8_t addasel, int16_t a1_step_x, int16_t a1_step_y,
 	int16_t a1_stepf_x, int16_t a1_stepf_y, int16_t a2_step_x, int16_t a2_step_y,
 	int16_t a1_inc_x, int16_t a1_inc_y, int16_t a1_incf_x, int16_t a1_incf_y, uint8_t adda_xconst,
@@ -974,1775 +997,2050 @@ void ADDBMUX(int16_t *addb_x, int16_t *addb_y, uint8_t addbsel, int16_t a1_x, in
 void DATAMUX(int16_t *data_x, int16_t *data_y, uint32_t gpu_din, int16_t addq_x, int16_t addq_y, bool addqsel);
 void ADDRADD(int16_t *addq_x, int16_t *addq_y, bool a1fracldi,
 	uint16_t adda_x, uint16_t adda_y, uint16_t addb_x, uint16_t addb_y, uint8_t modx, bool suba_x, bool suba_y);
-void DATA(uint64_t *wdata, uint8_t *dcomp, uint8_t *zcomp, bool *nowrite,
-	bool big_pix, bool cmpdst, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode, bool daddq_sel, uint8_t data_sel,
-	uint8_t dbinh, uint8_t dend, uint8_t dstart, uint64_t dstd, uint32_t iinc, uint8_t lfu_func, uint64_t *patd, bool patdadd,
-	bool phrase_mode, uint64_t srcd, bool srcdread, bool srczread, bool srcz2add, uint8_t zmode,
-	bool bcompen, bool bkgwren, bool dcompen, uint8_t icount, uint8_t pixsize,
-	uint64_t *srcz, uint64_t dstz, uint32_t zinc);
-void COMP_CTRL(uint8_t *dbinh, bool *nowrite,
-	bool bcompen, bool big_pix, bool bkgwren, uint8_t dcomp, bool dcompen, uint8_t icount,
-	uint8_t pixsize, bool phrase_mode, uint8_t srcd, uint8_t zcomp);
-
-
-void BlitterMidsummer2(void)
+/* DATA + COMP_CTRL are defined inline below (above BlitterMidsummer2)
+ * so the compiler can specialise them per call.  Both are called
+ * exclusively from the BlitterMidsummer2 inner loop. */
+
+
+/* AvP-gameplay hot path: ADDARRAY at 1910 samples, ADD16SAT inlined
+ * inside it.  Inlined here so the compiler can specialise the 4
+ * call sites in BlitterMidsummer2 (compile-time daddasel/daddbsel/
+ * daddmode -> dead switch arms eliminated) and the call inside DATA
+ * (where the args are loop-invariant for the duration of a blit). */
+static BLITTER_ALWAYS_INLINE /* sliced 16-bit add: *r = result, *co = carry out of bit 15 */
+void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, uint16_t b,
+              uint8_t cin, bool sat, bool eightbit, bool hicinh)
 {
-   uint32_t cmd = GET32(blitter_ram, COMMAND);
-
-
-   // Line states passed in via the command register
-
-   bool srcen = (SRCEN), srcenx = (SRCENX), srcenz = (SRCENZ),
-        dsten = (DSTEN), dstenz = (DSTENZ), dstwrz = (DSTWRZ), clip_a1 = (CLIPA1),
-        upda1 = (UPDA1), upda1f = (UPDA1F), upda2 = (UPDA2), dsta2 = (DSTA2),
-        gourd = (GOURD), gourz = (GOURZ), topben = (TOPBEN), topnen = (TOPNEN),
-        patdsel = (PATDSEL), adddsel = (ADDDSEL), cmpdst = (CMPDST), bcompen = (BCOMPEN),
-        dcompen = (DCOMPEN), bkgwren = (BKGWREN), srcshade = (SRCSHADE);
-
-   uint8_t zmode = (cmd & 0x01C0000) >> 18, lfufunc = (cmd & 0x1E00000) >> 21;
-   //Missing: BUSHI
-   //Where to find various lines:
-   // clip_a1  -> inner
-   // gourd    -> dcontrol, inner, outer, state
-   // gourz    -> dcontrol, inner, outer, state
-   // cmpdst   -> blit, data, datacomp, state
-   // bcompen  -> acontrol, inner, mcontrol, state
-   // dcompen  -> inner, state
-   // bkgwren  -> inner, state
-   // srcshade -> dcontrol, inner, state
-   // adddsel  -> dcontrol
-   //NOTE: ADDDSEL takes precedence over PATDSEL, PATDSEL over LFU_FUNC
+   uint8_t carry[4];            /* carries between the 8/4/4-bit slices */
+   uint8_t btop, ctop;          /* top bit of b / carry out of the top slice in use */
+   bool saturate, hisaturate;
+   uint32_t qt   = (a & 0xFF) + (b & 0xFF) + cin;   /* low-byte sum including carry-in */
+   uint16_t q    = qt & 0x00FF;
 
-   // Lines that don't exist in Jaguar I (and will never be asserted)
+   carry[0]      = ((qt & 0x0100) ? 1 : 0);                    /* carry out of bit 7 */
+   carry[1]      = (carry[0] && !eightbit ? carry[0] : 0);     /* blocked in eightbit mode */
+   qt            = (a & 0x0F00) + (b & 0x0F00) + (carry[1] << 8);
+   carry[2]      = ((qt & 0x1000) ? 1 : 0);                    /* carry out of bit 11 */
+   q            |= qt & 0x0F00;
+   carry[3]      = (carry[2] && !hicinh ? carry[2] : 0);       /* blocked when hicinh is set */
+   qt            = (a & 0xF000) + (b & 0xF000) + (carry[3] << 12);
+   *co            = ((qt & 0x10000) ? 1 : 0);                  /* carry out of bit 15 */
+   q            |= qt & 0xF000;
 
-   bool polygon = false, datinit = false, a1_stepld = false, a2_stepld = false, ext_int = false;
-   bool istepadd = false, istepfadd = false;
-   bool zstepfadd = false, zstepadd = false;
+   if (eightbit)
+   {
+      btop  = (b & 0x0080) >> 7;   /* byte mode: judge overflow on bit 7 */
+      ctop  = carry[0];
+   }
+   else
+   {
+      btop  = (b & 0x8000) >> 15;  /* word mode: judge overflow on bit 15 */
+      ctop  = *co;
+   }
 
-   // Various state lines (initial state--basically the reset state of the FDSYNCs)
+   saturate = sat && (btop ^ ctop);      /* clamp when b's top bit and the carry disagree */
+   hisaturate = saturate && !eightbit;   /* high byte is clamped only in word mode */
 
-   bool go = true, idle = true, inner = false, a1fupdate = false, a1update = false,
-        zfupdate = false, zupdate = false, a2update = false, init_if = false, init_ii = false,
-        init_zf = false, init_zi = false;
+   *r = (saturate ? (ctop ? 0x00FF : 0x0000) : q & 0x00FF);      /* low byte: clamp or sum */
+   *r |= (hisaturate ? (ctop ? 0xFF00 : 0x0000) : q & 0xFF00);   /* high byte: clamp or sum */
+}
 
-   bool outer0 = false, indone = false;
+static BLITTER_ALWAYS_INLINE /* four 16-bit lanes: addq[i] = A[i] + B[i] via ADD16SAT */
+void ADDARRAY(uint16_t *addq, uint8_t daddasel, uint8_t daddbsel,
+              uint8_t daddmode, uint64_t dstd, uint32_t iinc,
+              uint8_t initcin[], uint64_t initinc, uint16_t initpix,
+              uint32_t istep, uint64_t patd, uint64_t srcd,
+              uint64_t srcz1, uint64_t srcz2, uint32_t zinc,
+              uint32_t zstep)
+{
+   unsigned i;
+   uint16_t adda[4];       /* A operand, one 16-bit word per lane */
+   uint16_t addb[4];       /* B operand, one 16-bit word per lane */
+   uint64_t adda_val;
+   uint32_t initpix2;
+   uint16_t word;
+   uint8_t cinsel;
+   static uint8_t co[4]; /* preserved between calls (hardware artifact) */
+   uint8_t cin[4];
+   bool eightbit;
+   bool sat, hicinh;
+   uint8_t bsel_idx;
 
-   bool idlei, inneri, a1fupdatei, a1updatei, zfupdatei, zupdatei, a2updatei, init_ifi, init_iii,
-        init_zfi, init_zii;
+   initpix2 = ((uint32_t)initpix << 16) | initpix;   /* initpix replicated into both halves */
 
-   bool notgzandp = !(gourz && polygon);
+   switch (daddasel)   /* select the 64-bit A operand */
+   {
+      case 0:  adda_val = dstd; break;
+      case 1:  adda_val = ((uint64_t)initpix2 << 32) | initpix2; break;
+      case 2:
+      case 3:  adda_val = 0; break;
+      case 4:  adda_val = srcd; break;
+      case 5:  adda_val = patd; break;
+      case 6:  adda_val = srcz1; break;
+      default: adda_val = srcz2; break;
+   }
+   adda[0] = (uint16_t)adda_val;
+   adda[1] = (uint16_t)(adda_val >> 16);
+   adda[2] = (uint16_t)(adda_val >> 32);
+   adda[3] = (uint16_t)(adda_val >> 48);
 
+   if (!(daddbsel & 0x04))   /* bit 2 clear: B is a full 64-bit value */
+   {
+      if (daddbsel & 0x01)
+      {
+         addb[0] = (uint16_t)initinc;
+         addb[1] = (uint16_t)(initinc >> 16);
+         addb[2] = (uint16_t)(initinc >> 32);
+         addb[3] = (uint16_t)(initinc >> 48);
+      }
+      else
+      {
+         addb[0] = (uint16_t)srcd;
+         addb[1] = (uint16_t)(srcd >> 16);
+         addb[2] = (uint16_t)(srcd >> 32);
+         addb[3] = (uint16_t)(srcd >> 48);
+      }
+   }
+   else
+   {
+      bsel_idx = ((daddbsel & 0x08) >> 1) | (daddbsel & 0x03);   /* bits 3,1,0 pick one 16-bit word */
+      switch (bsel_idx)
+      {
+         case 0: word = iinc & 0xFFFF; break;
+         case 1: word = iinc >> 16; break;
+         case 2: word = zinc & 0xFFFF; break;
+         case 3: word = zinc >> 16; break;
+         case 4: word = istep & 0xFFFF; break;
+         case 5: word = istep >> 16; break;
+         case 6: word = zstep & 0xFFFF; break;
+         default: word = zstep >> 16; break;
+      }
+      addb[0] = addb[1] = addb[2] = addb[3] = word;   /* same word in all four lanes */
+   }
 
-   // Various registers set up by user
+   cinsel = ((daddmode & 0x03) && !(daddmode & 0x04) ? 1 : 0);   /* 1 when low two mode bits are non-zero and bit 2 is clear */
 
-   uint16_t ocount = GET16(blitter_ram, PIXLINECOUNTER);
-   uint8_t a1_pitch = blitter_ram[A1_FLAGS + 3] & 0x03;
-   uint8_t a2_pitch = blitter_ram[A2_FLAGS + 3] & 0x03;
-   uint8_t a1_pixsize = (blitter_ram[A1_FLAGS + 3] & 0x38) >> 3;
-   uint8_t a2_pixsize = (blitter_ram[A2_FLAGS + 3] & 0x38) >> 3;
-   uint8_t a1_zoffset = (GET16(blitter_ram, A1_FLAGS + 2) >> 6) & 0x07;
-   uint8_t a2_zoffset = (GET16(blitter_ram, A2_FLAGS + 2) >> 6) & 0x07;
-   uint8_t a1_width = (blitter_ram[A1_FLAGS + 2] >> 1) & 0x3F;
-   uint8_t a2_width = (blitter_ram[A2_FLAGS + 2] >> 1) & 0x3F;
-   uint8_t a1addx = blitter_ram[A1_FLAGS + 1] & 0x03, a2addx = blitter_ram[A2_FLAGS + 1] & 0x03;
-   bool a1addy = blitter_ram[A1_FLAGS + 1] & 0x04, a2addy = blitter_ram[A2_FLAGS + 1] & 0x04;
-   bool a1xsign = blitter_ram[A1_FLAGS + 1] & 0x08, a2xsign = blitter_ram[A2_FLAGS + 1] & 0x08;
-   bool a1ysign = blitter_ram[A1_FLAGS + 1] & 0x10, a2ysign = blitter_ram[A2_FLAGS + 1] & 0x10;
-   uint32_t a1_base = GET32(blitter_ram, A1_BASE) & 0xFFFFFFF8;	// Phrase aligned by ignoring bottom 3 bits
-   uint32_t a2_base = GET32(blitter_ram, A2_BASE) & 0xFFFFFFF8;
+   for (i = 0; i < 4; i++)
+      cin[i] = initcin[i] | (co[i] & cinsel);   /* caller's carry-in OR the previous call's carry-out */
 
-   uint16_t a1_win_x = GET16(blitter_ram, A1_CLIP + 2) & 0x7FFF;
-   uint16_t a1_win_y = GET16(blitter_ram, A1_CLIP + 0) & 0x7FFF;
-   int16_t a1_x = (int16_t)GET16(blitter_ram, A1_PIXEL + 2);
-   int16_t a1_y = (int16_t)GET16(blitter_ram, A1_PIXEL + 0);
-   int16_t a1_step_x = (int16_t)GET16(blitter_ram, A1_STEP + 2);
-   int16_t a1_step_y = (int16_t)GET16(blitter_ram, A1_STEP + 0);
-   uint16_t a1_stepf_x = GET16(blitter_ram, A1_FSTEP + 2);
-   uint16_t a1_stepf_y = GET16(blitter_ram, A1_FSTEP + 0);
-   uint16_t a1_frac_x = GET16(blitter_ram, A1_FPIXEL + 2);
-   uint16_t a1_frac_y = GET16(blitter_ram, A1_FPIXEL + 0);
-   int16_t a1_inc_x = (int16_t)GET16(blitter_ram, A1_INC + 2);
-   int16_t a1_inc_y = (int16_t)GET16(blitter_ram, A1_INC + 0);
-   uint16_t a1_incf_x = GET16(blitter_ram, A1_FINC + 2);
-   uint16_t a1_incf_y = GET16(blitter_ram, A1_FINC + 0);
+   eightbit = daddmode & 0x02;                 /* mode bit 1 */
+   sat = daddmode & 0x03;                      /* either of mode bits 0/1 */
+   hicinh = ((daddmode & 0x03) == 0x03);       /* both of mode bits 0/1 */
 
-   int16_t a2_x = (int16_t)GET16(blitter_ram, A2_PIXEL + 2);
-   int16_t a2_y = (int16_t)GET16(blitter_ram, A2_PIXEL + 0);
-#if 0
-   bool a2_mask = blitter_ram[A2_FLAGS + 2] & 0x80;
-   uint16_t a2_mask_x = GET16(blitter_ram, A2_MASK + 2);
-   uint16_t a2_mask_y = GET16(blitter_ram, A2_MASK + 0);
-   uint32_t collision = GET32(blitter_ram, COLLISIONCTRL);// 0=RESUME, 1=ABORT, 2=STOPEN
-#endif
-   int16_t a2_step_x = (int16_t)GET16(blitter_ram, A2_STEP + 2);
-   int16_t a2_step_y = (int16_t)GET16(blitter_ram, A2_STEP + 0);
+   ADD16SAT(&addq[0], &co[0], adda[0], addb[0], cin[0], sat, eightbit, hicinh);   /* lanes are independent; */
+   ADD16SAT(&addq[1], &co[1], adda[1], addb[1], cin[1], sat, eightbit, hicinh);   /* co[] keeps each lane's  */
+   ADD16SAT(&addq[2], &co[2], adda[2], addb[2], cin[2], sat, eightbit, hicinh);   /* carry-out for the next  */
+   ADD16SAT(&addq[3], &co[3], adda[3], addb[3], cin[3], sat, eightbit, hicinh);   /* call                    */
+}
 
-   uint64_t srcd1 = GET64(blitter_ram, SRCDATA);
-   uint64_t srcd2 = 0;
-   uint64_t dstd = GET64(blitter_ram, DSTDATA);
-   uint64_t patd = GET64(blitter_ram, PATTERNDATA);
-   uint32_t iinc = GET32(blitter_ram, INTENSITYINC);
-   uint64_t srcz1 = GET64(blitter_ram, SRCZINT);
-   uint64_t srcz2 = GET64(blitter_ram, SRCZFRAC);
-   uint64_t dstz = GET64(blitter_ram, DSTZ);
-   uint32_t zinc = GET32(blitter_ram, ZINC);
+static BLITTER_ALWAYS_INLINE
+void COMP_CTRL(uint8_t *dbinh, bool *nowrite,
+	bool bcompen, bool big_pix, bool bkgwren, uint8_t dcomp, bool dcompen, uint8_t icount,
+	uint8_t pixsize, bool phrase_mode, uint8_t srcd, uint8_t zcomp)
+{
+   //BEGIN
 
-   uint8_t pixsize = (dsta2 ? a2_pixsize : a1_pixsize);	// From ACONTROL
+   /*Bkgwren\	:= INV1 (bkgwren\, bkgwren);
+     Phrase_mode\	:= INV1 (phrase_mode\, phrase_mode);
+     Pixsize\[0-2]	:= INV2 (pixsize\[0-2], pixsize[0-2]);*/
 
-   bool phrase_mode;
-   uint16_t a1FracCInX = 0, a1FracCInY = 0;
+   /* The bit comparator bits are derived from the source data, which
+      will have been suitably aligned for phrase mode.  The contents of
+      the inner counter are used to select which bit to use.
 
-   // Bugs in Jaguar I
+      When not in phrase mode the inner count value is used to select
+      one bit.  It is assumed that the count has already occurred, so,
+      7 selects bit 0, etc.  In big-endian pixel mode, this turns round,
+      so that a count of 7 selects bit 7.
 
-   a2addy = a1addy;							// A2 channel Y add bit is tied to A1's
+      In phrase mode, the eight bits are used directly, and this mode is
+      only applicable to 8-bit pixel mode (2/34) */
 
-   // Various state lines set up by user
+   /*Bcompselt[0-2]	:= EO (bcompselt[0-2], icount[0-2], big_pix);
+Bcompbit	:= MX8 (bcompbit, srcd[7], srcd[6], srcd[5],
+srcd[4], srcd[3], srcd[2], srcd[1], srcd[0], bcompselt[0..2]);
+Bcompbit\	:= INV1 (bcompbit\, bcompbit);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   uint8_t bcompselt = (big_pix ? ~icount : icount) & 0x07;
+   uint8_t bitmask[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
+   bool bcompbit = srcd & bitmask[bcompselt];
+   bool winhibit, di0t0_1, di0t4, di1t2, di2t0_1, di2t4, di3t2;
+   bool di4t0_1, di4t4, di5t2;
+   bool di6t0_1, di6t4;
+   bool di7t2;
 
-   phrase_mode = ((!dsta2 && a1addx == 0) || (dsta2 && a2addx == 0) ? true : false);	// From ACONTROL
+   //////////////////////////////////////////////////////////////////////////////////////
 
-   // Stopgap vars to simulate various lines
+   /* pipe-line the count */
+   /*Bcompsel[0-2]	:= FDSYNC (bcompsel[0-2], bcompselt[0-2], step_inner, clk);
+Bcompbt		:= MX8 (bcompbitpt, srcd[7], srcd[6], srcd[5],
+srcd[4], srcd[3], srcd[2], srcd[1], srcd[0], bcompsel[0..2]);
+Bcompbitp	:= FD1Q (bcompbitp, bcompbitpt, clk);
+Bcompbitp\	:= INV1 (bcompbitp\, bcompbitp);*/
 
+   /* For pixel mode, generate the write inhibit signal for all modes
+      on bit inhibit, for 8 and 16 bit modes on comparator inhibit, and
+      for 16 bit mode on Z inhibit
 
-   while (true)
-   {
-      // IDLE
+      Nowrite = bcompen . /bcompbit . /phrase_mode
+      + dcompen . dcomp[0] . /phrase_mode . pixsize = 011
+      + dcompen . dcomp[0..1] . /phrase_mode . pixsize = 100
+      + zcomp[0] . /phrase_mode . pixsize = 100
+      */
 
-      if ((idle && !go) || (inner && outer0 && indone))
-      {
-         idlei = true;
+   /*Nowt0		:= NAN3 (nowt[0], bcompen, bcompbit\, phrase_mode\);
+Nowt1		:= ND6  (nowt[1], dcompen, dcomp[0], phrase_mode\, pixsize\[2], pixsize[0..1]);
+Nowt2		:= ND7  (nowt[2], dcompen, dcomp[0..1], phrase_mode\, pixsize[2], pixsize\[0..1]);
+Nowt3		:= NAN5 (nowt[3], zcomp[0], phrase_mode\, pixsize[2], pixsize\[0..1]);
+Nowt4		:= NAN4 (nowt[4], nowt[0..3]);
+Nowrite		:= AN2  (nowrite, nowt[4], bkgwren\);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   *nowrite = ((bcompen && !bcompbit && !phrase_mode)
+         || (dcompen && (dcomp & 0x01) && !phrase_mode && (pixsize == 3))
+         || (dcompen && ((dcomp & 0x03) == 0x03) && !phrase_mode && (pixsize == 4))
+         || ((zcomp & 0x01) && !phrase_mode && (pixsize == 4)))
+      && !bkgwren;
+   //////////////////////////////////////////////////////////////////////////////////////
 
-         //Instead of a return, let's try breaking out of the loop...
-         break;
-      }
-      else
-         idlei = false;
+   /*Winht		:= NAN3 (winht, bcompen, bcompbitp\, phrase_mode\);
+Winhibit	:= NAN4 (winhibit, winht, nowt[1..3]);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   //This is the same as above, but with bcompbit delayed one tick and called 'winhibit'
+   //Small difference: Besides the pipeline effect, it's also not using !bkgwren...
+   //	bool winhibit = (bcompen && !
+   winhibit = (bcompen && !bcompbit && !phrase_mode)
+      || (dcompen && (dcomp & 0x01) && !phrase_mode && (pixsize == 3))
+      || (dcompen && ((dcomp & 0x03) == 0x03) && !phrase_mode && (pixsize == 4))
+      || ((zcomp & 0x01) && !phrase_mode && (pixsize == 4));
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      // INNER LOOP ACTIVE
+   /* For phrase mode, generate the byte inhibit signals for eight bit
+      mode 011, or sixteen bit mode 100
+      dbinh\[0] =  pixsize[2] . zcomp[0]
+      +  pixsize[2] . dcomp[0] . dcomp[1] . dcompen
+      + /pixsize[2] . dcomp[0] . dcompen
+      + /srcd[0] . bcompen
 
-      if ((idle && go && !datinit)
-            || (inner && !indone)
-            || (inner && indone && !outer0 && !upda1f && !upda1 && notgzandp && !upda2 && !datinit)
-            || (a1update && !upda2 && notgzandp && !datinit)
-            || (zupdate && !upda2 && !datinit)
-            || (a2update && !datinit)
-            || (init_ii && !gourz)
-            || (init_zi))
-         inneri = true;
-      else
-         inneri = false;
+      Inhibits 0-3 are also used when not in phrase mode to write back
+      destination data.
+      */
 
-      // A1 FRACTION UPDATE
+   /*Srcd\[0-7]	:= INV1 (srcd\[0-7], srcd[0-7]);
 
-      if (inner && indone && !outer0 && upda1f)
-         a1fupdatei = true;
-      else
-         a1fupdatei = false;
+Di0t0		:= NAN2H (di0t[0], pixsize[2], zcomp[0]);
+Di0t1		:= NAN4H (di0t[1], pixsize[2], dcomp[0..1], dcompen);
+Di0t2		:= NAN2 (di0t[2], srcd\[0], bcompen);
+Di0t3		:= NAN3 (di0t[3], pixsize\[2], dcomp[0], dcompen);
+Di0t4		:= NAN4 (di0t[4], di0t[0..3]);
+Dbinh[0]	:= ANR1P (dbinh\[0], di0t[4], phrase_mode, winhibit);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   *dbinh = 0;
+   di0t0_1 = ((pixsize & 0x04) && (zcomp & 0x01))
+      || ((pixsize & 0x04) && (dcomp & 0x01) && (dcomp & 0x02) && dcompen);
+   di0t4 = di0t0_1
+      || (!(srcd & 0x01) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x01) && dcompen);
+   *dbinh |= (!((di0t4 && phrase_mode) || winhibit) ? 0x01 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      // A1 POINTER UPDATE
+   /*Di1t0		:= NAN3 (di1t[0], pixsize\[2], dcomp[1], dcompen);
+Di1t1		:= NAN2 (di1t[1], srcd\[1], bcompen);
+Di1t2		:= NAN4 (di1t[2], di0t[0..1], di1t[0..1]);
+Dbinh[1]	:= ANR1 (dbinh\[1], di1t[2], phrase_mode, winhibit);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di1t2 = di0t0_1
+      || (!(srcd & 0x02) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x02) && dcompen);
+   *dbinh |= (!((di1t2 && phrase_mode) || winhibit) ? 0x02 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      if ((a1fupdate)
-            || (inner && indone && !outer0 && !upda1f && upda1))
-         a1updatei = true;
-      else
-         a1updatei = false;
+   /*Di2t0		:= NAN2H (di2t[0], pixsize[2], zcomp[1]);
+Di2t1		:= NAN4H (di2t[1], pixsize[2], dcomp[2..3], dcompen);
+Di2t2		:= NAN2 (di2t[2], srcd\[2], bcompen);
+Di2t3		:= NAN3 (di2t[3], pixsize\[2], dcomp[2], dcompen);
+Di2t4		:= NAN4 (di2t[4], di2t[0..3]);
+Dbinh[2]	:= ANR1 (dbinh\[2], di2t[4], phrase_mode, winhibit);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   //[bcompen=F dcompen=T phrase_mode=T bkgwren=F][nw=F wi=F]
+   //[di0t0_1=F di0t4=F][di1t2=F][di2t0_1=T di2t4=T][di3t2=T][di4t0_1=F di2t4=F][di5t2=F][di6t0_1=F di6t4=F][di7t2=F]
+   //[dcomp=$00 dbinh=$0C][7804780400007804] (icount=0005, inc=4)
+   di2t0_1 = ((pixsize & 0x04) && (zcomp & 0x02))
+      || ((pixsize & 0x04) && (dcomp & 0x04) && (dcomp & 0x08) && dcompen);
+   di2t4 = di2t0_1
+      || (!(srcd & 0x04) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x04) && dcompen);
+   *dbinh |= (!((di2t4 && phrase_mode) || winhibit) ? 0x04 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      // Z FRACTION UPDATE
+   /*Di3t0		:= NAN3 (di3t[0], pixsize\[2], dcomp[3], dcompen);
+Di3t1		:= NAN2 (di3t[1], srcd\[3], bcompen);
+Di3t2		:= NAN4 (di3t[2], di2t[0..1], di3t[0..1]);
+Dbinh[3]	:= ANR1 (dbinh\[3], di3t[2], phrase_mode, winhibit);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di3t2 = di2t0_1
+      || (!(srcd & 0x08) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x08) && dcompen);
+   *dbinh |= (!((di3t2 && phrase_mode) || winhibit) ? 0x08 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      if ((a1update && gourz && polygon)
-            || (inner && indone && !outer0 && !upda1f && !upda1 && gourz && polygon))
-         zfupdatei = true;
-      else
-         zfupdatei = false;
+   /*Di4t0		:= NAN2H (di4t[0], pixsize[2], zcomp[2]);
+Di4t1		:= NAN4H (di4t[1], pixsize[2], dcomp[4..5], dcompen);
+Di4t2		:= NAN2 (di4t[2], srcd\[4], bcompen);
+Di4t3		:= NAN3 (di4t[3], pixsize\[2], dcomp[4], dcompen);
+Di4t4		:= NAN4 (di4t[4], di4t[0..3]);
+Dbinh[4]	:= NAN2 (dbinh\[4], di4t[4], phrase_mode);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di4t0_1 = ((pixsize & 0x04u) && (zcomp & 0x04u))
+      || ((pixsize & 0x04u) && (dcomp & 0x10u) && (dcomp & 0x20u) && dcompen);
+   di4t4 = di4t0_1
+      || (!(srcd & 0x10u) && bcompen)
+      || (!(pixsize & 0x04u) && (dcomp & 0x10u) && dcompen);
+   *dbinh |= (!(di4t4 && phrase_mode) ? 0x10u : 0x00u);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      // Z INTEGER UPDATE
+   /*Di5t0		:= NAN3 (di5t[0], pixsize\[2], dcomp[5], dcompen);
+Di5t1		:= NAN2 (di5t[1], srcd\[5], bcompen);
+Di5t2		:= NAN4 (di5t[2], di4t[0..1], di5t[0..1]);
+Dbinh[5]	:= NAN2 (dbinh\[5], di5t[2], phrase_mode);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di5t2 = di4t0_1
+      || (!(srcd & 0x20) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x20) && dcompen);
+   *dbinh |= (!(di5t2 && phrase_mode) ? 0x20 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      if (zfupdate)
-         zupdatei = true;
-      else
-         zupdatei = false;
+   /*Di6t0		:= NAN2H (di6t[0], pixsize[2], zcomp[3]);
+Di6t1		:= NAN4H (di6t[1], pixsize[2], dcomp[6..7], dcompen);
+Di6t2		:= NAN2 (di6t[2], srcd\[6], bcompen);
+Di6t3		:= NAN3 (di6t[3], pixsize\[2], dcomp[6], dcompen);
+Di6t4		:= NAN4 (di6t[4], di6t[0..3]);
+Dbinh[6]	:= NAN2 (dbinh\[6], di6t[4], phrase_mode);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di6t0_1 = ((pixsize & 0x04) && (zcomp & 0x08))
+      || ((pixsize & 0x04) && (dcomp & 0x40) && (dcomp & 0x80) && dcompen);
+   di6t4 = di6t0_1
+      || (!(srcd & 0x40) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x40) && dcompen);
+   *dbinh |= (!(di6t4 && phrase_mode) ? 0x40 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      // A2 POINTER UPDATE
+   /*Di7t0		:= NAN3 (di7t[0], pixsize\[2], dcomp[7], dcompen);
+Di7t1		:= NAN2 (di7t[1], srcd\[7], bcompen);
+Di7t2		:= NAN4 (di7t[2], di6t[0..1], di7t[0..1]);
+Dbinh[7]	:= NAN2 (dbinh\[7], di7t[2], phrase_mode);*/
+   ////////////////////////////////////// C++ CODE //////////////////////////////////////
+   di7t2 = di6t0_1
+      || (!(srcd & 0x80) && bcompen)
+      || (!(pixsize & 0x04) && (dcomp & 0x80) && dcompen);
+   *dbinh |= (!(di7t2 && phrase_mode) ? 0x80 : 0x00);
+   //////////////////////////////////////////////////////////////////////////////////////
 
-      if ((a1update && upda2 && notgzandp)
-            || (zupdate && upda2)
-            || (inner && indone && !outer0 && !upda1f && notgzandp && !upda1 && upda2))
-         a2updatei = true;
-      else
-         a2updatei = false;
+   //END;
+   //kludge
+   *dbinh = ~*dbinh;
+}
 
-      // INITIALIZE INTENSITY FRACTION
+static BLITTER_ALWAYS_INLINE
+void DATA(uint64_t *wdata, uint8_t *dcomp, uint8_t *zcomp, bool *nowrite,
+	bool big_pix, bool cmpdst, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode, bool daddq_sel, uint8_t data_sel,
+	uint8_t dbinh, uint8_t dend, uint8_t dstart, uint64_t dstd, uint32_t iinc, uint8_t lfu_func, uint64_t *patd, bool patdadd,
+	bool phrase_mode, uint64_t srcd, bool srcdread, bool srczread, bool srcz2add, uint8_t zmode,
+	bool bcompen, bool bkgwren, bool dcompen, uint8_t icount, uint8_t pixsize,
+	uint64_t *srcz, uint64_t dstz, uint32_t zinc)
+{
+/*
+  Stuff we absolutely *need* to have passed in/out:
+IN:
+  patdadd, dstd, srcd, patd, daddasel, daddbsel, daddmode, iinc, srcz1, srcz2, big_pix, phrase_mode, cmpdst
+OUT:
+  changed patd (wdata I guess...) (Nope. We pass it back directly now...)
+*/
 
-      if ((zupdate && !upda2 && datinit)
-            || (a1update && !upda2 && datinit && notgzandp)
-            || (inner && indone && !outer0 && !upda1f && !upda1 && notgzandp && !upda2 && datinit)
-            || (a2update && datinit)
-            || (idle && go && datinit))
-         init_ifi = true;
-      else
-         init_ifi = false;
+// Source data registers
 
-      // INITIALIZE INTENSITY INTEGER
+/*Data_src	:= DATA_SRC (srcdlo, srcdhi, srcz[0..1], srczo[0..1], srczp[0..1], srcz1[0..1], srcz2[0..1], big_pix,
+			clk, gpu_din, intld[0..3], local_data0, local_data1, srcd1ld[0..1], srcdread, srczread, srcshift[0..5],
+			srcz1ld[0..1], srcz2add, srcz2ld[0..1], zedld[0..3], zpipe[0..1]);
+Srcd[0-7]	:= JOIN (srcd[0-7], srcdlo{0-7});
+Srcd[8-31]	:= JOIN (srcd[8-31], srcdlo{8-31});
+Srcd[32-63]	:= JOIN (srcd[32-63], srcdhi{0-31});*/
 
-      if (init_if)
-         init_iii = true;
-      else
-         init_iii = false;
+// Destination data registers
 
-      // INITIALIZE Z FRACTION
+/*Data_dst	:= DATA_DST (dstd[0..63], dstz[0..1], clk, dstdld[0..1], dstzld[0..1], load_data[0..1]);
+Dstdlo		:= JOIN (dstdlo, dstd[0..31]);
+Dstdhi		:= JOIN (dstdhi, dstd[32..63]);*/
 
-      if (init_ii && gourz)
-         init_zfi = true;
-      else
-         init_zfi = false;
+// Pattern and Color data registers
 
-      // INITIALIZE Z INTEGER
+// Looks like this is simply another register file for the pattern data registers. No adding or anything funky
+// going on. Note that patd & patdv will output the same info.
+// Patdldl/h (patdld[0..1]) can select the local_data bus to overwrite the current pattern data...
+// Actually, it can be either patdld OR patdadd...!
+/*Data_pat	:= DATA_PAT (colord[0..15], int0dp[8..10], int1dp[8..10], int2dp[8..10], int3dp[8..10], mixsel[0..2],
+			patd[0..63], patdv[0..1], clk, colorld, dpipe[0], ext_int, gpu_din, intld[0..3], local_data0, local_data1,
+			patdadd, patdld[0..1], reload, reset\);
+Patdlo		:= JOIN (patdlo, patd[0..31]);
+Patdhi		:= JOIN (patdhi, patd[32..63]);*/
 
-      if (init_zf)
-         init_zii = true;
-      else
-         init_zii = false;
+// Multiplying data Mixer (NOT IN JAGUAR I)
 
-      // Here we move the fooi into their foo counterparts in order to simulate the moving
-      // of data into the various FDSYNCs... Each time we loop we simulate one clock cycle...
+/*Datamix		:= DATAMIX (patdo[0..1], clk, colord[0..15], dpipe[1], dstd[0..63], int0dp[8..10], int1dp[8..10],
+			int2dp[8..10], int3dp[8..10], mixsel[0..2], patd[0..63], pdsel[0..1], srcd[0..63], textrgb, txtd[0..63]);*/
 
-      idle = idlei;
-      inner = inneri;
-      a1fupdate = a1fupdatei;
-      a1update = a1updatei;
-      zfupdate = zfupdatei;		// *
-      zupdate = zupdatei;			// *
-      a2update = a2updatei;
-      init_if = init_ifi;			// *
-      init_ii = init_iii;			// *
-      init_zf = init_zfi;			// *
-      init_zi = init_zii;			// *
-      // * denotes states that will never assert for Jaguar I
+// Logic function unit
 
-      // Now, depending on how we want to handle things, we could either put the implementation
-      // of the various pieces up above, or handle them down below here.
+/*Lfu		:= LFU (lfu[0..1], srcdlo, srcdhi, dstdlo, dstdhi, lfu_func[0..3]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	uint64_t lfu = blitter_simd_ops.lfu(srcd, dstd, lfu_func);
+   bool mir_bit, mir_byte;
+   uint16_t masku;
+   uint8_t e_coarse, e_fine;
+   uint8_t s_coarse, s_fine;
+   uint16_t maskt;
+	uint8_t decl38e[2][8] = { { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
+		{ 0xFE, 0xFD, 0xFB, 0xF7, 0xEF, 0xDF, 0xBF, 0x7F } };
+	uint8_t dech38[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
+	uint8_t dech38el[2][8] = { { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 },
+		{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } };
+   int en;
+	uint8_t dbinht;
+   uint16_t addq[4];
+   uint8_t initcin[4] = { 0, 0, 0, 0 };
+   uint16_t mask;
+   uint64_t dmux[4];
+   uint64_t ddat;
+//////////////////////////////////////////////////////////////////////////////////////
 
-      // Let's try postprocessing for now...
+// Increment and Step Registers
 
-      if (inner)
-      {
-         bool idle_inner = true, sreadx = false, szreadx = false, sread = false,
-              szread = false, dread = false, dzread = false, dwrite = false, dzwrite = false;
-         bool inner0 = false;
-         bool idle_inneri, sreadxi, szreadxi, sreadi, szreadi, dreadi, dzreadi, dwritei, dzwritei;
-         //other stuff
-         uint8_t srcshift = 0;
-         uint16_t icount = GET16(blitter_ram, PIXLINECOUNTER + 2);
-         bool srca_addi, dsta_addi, gensrc, gendst, gena2i, zaddr, fontread, justify, a1_add, a2_add;
-         bool adda_yconst, addareg, suba_x, suba_y, a1fracldi, shadeadd;
-         uint8_t addasel, a1_xconst, a2_xconst, adda_xconst, addbsel, maska1, maska2, modx, daddasel;
-         uint8_t daddbsel, daddmode;
-         bool patfadd, patdadd, srcz2add, daddq_sel;
-         uint8_t data_sel;
-         uint32_t address, pixAddr;
-         uint8_t dstxp;
-         uint64_t srcz;
-         bool winhibit;
+// Does it do anything without the step add lines? Check it!
+// No. This is pretty much just a register file without the Jaguar II lines...
+/*Inc_step	:= INC_STEP (iinc, istep[0..31], zinc, zstep[0..31], clk, ext_int, gpu_din, iincld, iincldx, istepadd,
+			istepfadd, istepld, istepdld, reload, reset\, zincld, zstepadd, zstepfadd, zstepld, zstepdld);
+Istep		:= JOIN (istep, istep[0..31]);
+Zstep		:= JOIN (zstep, zstep[0..31]);*/
 
-         indone = false;
+// Pixel data comparator
 
-         /* Precompute address constants (invariant during inner loop) */
-         a1_xconst = 6 - a1_pixsize;
-         a2_xconst = 6 - a2_pixsize;
-         if (a1addx == 1)
-            a1_xconst = 0;
-         else if (a1addx & 0x02)
-            a1_xconst = 7;
-         if (a2addx == 1)
-            a2_xconst = 0;
-         else if (a2addx & 0x02)
-            a2_xconst = 7;
+/*Datacomp	:= DATACOMP (dcomp[0..7], cmpdst, dstdlo, dstdhi, patdlo, patdhi, srcdlo, srcdhi);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	*dcomp = blitter_simd_ops.dcomp(*patd, srcd, dstd, cmpdst);
+//////////////////////////////////////////////////////////////////////////////////////
 
-         /* Precompute srcshift — loaded on first inner cycle (sshftld),
-            then held constant for all subsequent cycles. */
-         {
-            uint8_t dstxp0, srcxp0, shftv0, pobb0, loshd0;
-            bool pobbsel0;
+// Zed comparator for Z-buffer operations
 
-            dstxp0 = (dsta2 ? a2_x : a1_x) & 0x3F;
-            srcxp0 = (dsta2 ? a1_x : a2_x) & 0x3F;
-            shftv0 = ((dstxp0 - srcxp0) << pixsize) & 0x3F;
-            pobb0 = 0;
-            if (pixsize == 3)
-               pobb0 = dstxp0 & 0x07;
-            else if (pixsize == 4)
-               pobb0 = dstxp0 & 0x03;
-            else if (pixsize == 5)
-               pobb0 = dstxp0 & 0x01;
+/*Zedcomp		:= ZEDCOMP (zcomp[0..3], srczp[0..1], dstz[0..1], zmode[0..2]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+//srczp is srcz pipelined, also it goes through a source shift as well...
+/*The shift is basically like so (each piece is 16 bits long):
 
-            pobbsel0 = phrase_mode && bcompen;
-            loshd0 = (pobbsel0 ? pobb0 : shftv0) & 0x07;
-            srcshift = (srcen || pobbsel0 ? loshd0 : 0);
-            srcshift |= (srcen && phrase_mode ? shftv0 & 0x38 : 0);
-         }
+	0         1         2         3         4         5         6
+	srcz1lolo srcz1lohi srcz1hilo srcz1hihi srcz2lolo srcz2lohi srcz2hilo
 
-         while (true)
-         {
-            uint16_t dstxwr, pseq;
-            bool penden;
-            uint8_t window_mask;
-            uint8_t inner_mask = 0;
-            uint8_t emask, pma, dend;
-            uint64_t srcd;
-            uint8_t zSrcShift;
-            uint64_t wdata;
-            uint8_t dcomp, zcomp;
+with srcshift bits 4 & 5 selecting the start position
+*/
+//So... basically what we have here is:
+	*zcomp = blitter_simd_ops.zcomp(*srcz, dstz, zmode);
 
-            //NOTE: sshftld probably is only asserted at the beginning of the inner loop. !!! FIX !!!
-            /* State machine: step is always true (no bus contention in
-               Jaguar I), textext/txtread never assert. Both eliminated. */
+//TEMP, TO TEST IF ZCOMP IS THE CULPRIT...
+//Nope, this is NOT the problem...
+//zcomp=0;
+// We'll do the comparison/bit/byte inhibits here, since that's the way it happens
+// in the real thing (dcomp goes out to COMP_CTRL and back into DATA through dbinh)...
+	{
+	uint8_t bcomp_bits;
+	if (bcompen && phrase_mode)
+	{
+		bcomp_bits = (srcd >> 56) & 0xFF;
+	}
+	else
+		bcomp_bits = srcd & 0xFF;
 
-            if ((dzwrite && inner0)
-                  || (dwrite && !dstwrz && inner0))
-            {
-               idle_inneri = true;
-               break;
-            }
-            else
-               idle_inneri = false;
+	COMP_CTRL(&dbinht, nowrite,
+		bcompen, true/*big_pix*/, bkgwren, *dcomp, dcompen, icount, pixsize, phrase_mode, bcomp_bits, *zcomp);
+	}
+	dbinh = dbinht;
 
-            sreadxi = (idle_inner && srcenx);
-            szreadxi = (sreadx && srcenz);
+//////////////////////////////////////////////////////////////////////////////////////
 
-            sreadi = (szreadx
-                  || (sreadx && !srcenz && srcen)
-                  || (idle_inner && !srcenx && srcen)
-                  || (dzwrite && !inner0 && srcen)
-                  || (dwrite && !dstwrz && !inner0 && srcen));
+// 22 Mar 94
+// The data initializer - allows all four initial values to be computed from one (NOT IN JAGUAR I)
 
-            szreadi = (sread && srcenz);
+/*Datinit		:= DATINIT (initcin[0..3], initinc[0..63], initpix[0..15], a1_x[0..1], big_pix, clk, iinc, init_if, init_ii,
+			init_zf, istep[0..31], zinc, zstep[0..31]);*/
 
-            dreadi = ((szread && dsten)
-                  || (sread && !srcenz && dsten)
-                  || (sreadx && !srcenz && !srcen && dsten)
-                  || (idle_inner && !srcenx && !srcen && dsten)
-                  || (dzwrite && !inner0 && !srcen && dsten)
-                  || (dwrite && !dstwrz && !inner0 && !srcen && dsten));
+// Adder array for Z and intensity increments
 
-            dzreadi = ((dread && dstenz)
-                  || (szread && !dsten && dstenz)
-                  || (sread && !srcenz && !dsten && dstenz)
-                  || (sreadx && !srcenz && !srcen && !dsten && dstenz)
-                  || (idle_inner && !srcenx && !srcen && !dsten && dstenz)
-                  || (dzwrite && !inner0 && !srcen && !dsten && dstenz)
-                  || (dwrite && !dstwrz && !inner0 && !srcen && !dsten && dstenz));
+/*Addarray	:= ADDARRAY (addq[0..3], clk, daddasel[0..2], daddbsel[0..3], daddmode[0..2], dstdlo, dstdhi, iinc,
+			initcin[0..3], initinc[0..63], initpix[0..15], istep, patdv[0..1], srcdlo, srcdhi, srcz1[0..1],
+			srcz2[0..1], reset\, zinc, zstep);*/
+/*void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode,
+	uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix,
+	uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2,
+	uint32_t zinc, uint32_t zstep)*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	{
+	uint64_t patd_pre = *patd;
+	ADDARRAY(addq, daddasel, daddbsel, daddmode, dstd, iinc, initcin, 0, 0, 0, *patd, srcd, 0, 0, 0, 0);
 
-            dwritei = (dzread
-                  || (dread && !dstenz)
-                  || (szread && !dsten && !dstenz)
-                  || (sread && !srcenz && !dsten && !dstenz)
-                  || (sreadx && !srcenz && !srcen && !dsten && !dstenz)
-                  || (idle_inner && !srcenx && !srcen && !dsten && !dstenz)
-                  || (dzwrite && !inner0 && !srcen && !dsten && !dstenz)
-                  || (dwrite && !dstwrz && !inner0 && !srcen && !dsten && !dstenz));
+	if (patdadd)
+		*patd = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+//////////////////////////////////////////////////////////////////////////////////////
 
-            dzwritei = (dwrite && dstwrz);
+// Local data bus multiplexer
+// In hardware, the write data mux reads patd BEFORE the register update.
+// patd_pre captures the pre-increment value for the data output mux.
 
-            // Here we move the fooi into their foo counterparts in order to simulate the moving
-            // of data into the various FDSYNCs... Each time we loop we simulate one clock cycle...
+/*Local_mux	:= LOCAL_MUX (local_data[0..1], load_data[0..1],
+	addq[0..3], gpu_din, data[0..63], blitter_active, daddq_sel);
+Local_data0	:= JOIN (local_data0, local_data[0]);
+Local_data1	:= JOIN (local_data1, local_data[1]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+//////////////////////////////////////////////////////////////////////////////////////
 
-            idle_inner = idle_inneri;
-            sreadx = sreadxi;
-            szreadx = szreadxi;
-            sread = sreadi;
-            szread = szreadi;
-            dread = dreadi;
-            dzread = dzreadi;
-            dwrite = dwritei;
-            dzwrite = dzwritei;
+// Data output multiplexer and tri-state drive
 
-            // Here's a few more decodes--not sure if they're supposed to go here or not...
+/*Data_mux	:= DATA_MUX (wdata[0..63], addq[0..3], big_pix, dstdlo, dstdhi, dstz[0..1], data_sel[0..1], data_ena,
+			dstart[0..5], dend[0..5], dbinh\[0..7], lfu[0..1], patdo[0..1], phrase_mode, srczo[0..1]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+// NOTE: patdo comes from DATAMIX and can be considered the same as patd for Jaguar I
 
+//////////////////////////////////////////////////////////////////////////////////////
+//}
 
-            srca_addi = (sreadxi && !srcenz) || (sreadi && !srcenz) || szreadxi || szreadi;
+/*DEF DATA_MUX (
+		wdata[0..63]	// co-processor rwrite data bus
+		:BUS;
+INT16/	addq[0..3]
+		big_pix			// Pixel organisation is big-endian
+INT32/	dstdlo
+INT32/	dstdhi
+INT32/	dstzlo
+INT32/	dstzhi
+		data_sel[0..1]	// source of write data
+		data_ena		// enable write data onto read/write bus
+		dstart[0..5]	// start of changed write data
+		dend[0..5]		// end of changed write data
+		dbinh\[0..7]	// byte oriented changed data inhibits
+INT32/	lfu[0..1]
+INT32/	patd[0..1]
+		phrase_mode		// phrase write mode
+INT32/	srczlo
+INT32/	srczhi
+		:IN);*/
 
-            dsta_addi = (dwritei && !dstwrz) || dzwritei;
+/*INT32/	addql[0..1], ddatlo, ddathi zero32
+:LOCAL;
+BEGIN
 
-            gensrc = sreadxi || szreadxi || sreadi || szreadi;
-            gendst = dreadi || dzreadi || dwritei || dzwritei;
-            gena2i = (gensrc && !dsta2) || (gendst && dsta2);
+Phrase_mode\	:= INV1 (phrase_mode\, phrase_mode);
+Zero		:= TIE0 (zero);
+Zero32		:= JOIN (zero32, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero);*/
 
-            zaddr = szreadx || szread || dzread || dzwrite;
+/* Generate a changed data mask */
 
-            // Some stuff from MCONTROL.NET--not sure if this is the correct use of this decode or not...
-            /*Fontread\	:= OND1 (fontread\, sread[1], sreadx[1], bcompen);
-Fontread	:= INV1 (fontread, fontread\);
-Justt		:= NAN3 (justt, fontread\, phrase_mode, tactive\);
-Justify		:= TS (justify, justt, busen);*/
-            fontread = (sread || sreadx) && bcompen;
-            justify = !(!fontread && phrase_mode /*&& tactive*/);
+/*Edis		:= OR6 (edis\, dend[0..5]);
+Ecoarse		:= DECL38E (e_coarse\[0..7], dend[3..5], edis\);
+E_coarse[0]	:= INV1 (e_coarse[0], e_coarse\[0]);
+Efine		:= DECL38E (unused[0], e_fine\[1..7], dend[0..2], e_coarse[0]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
 
-            /* Generate inner loop update enables */
-            /*
-A1_addi		:= MX2 (a1_addi, dsta_addi, srca_addi, dsta2);
-A2_addi		:= MX2 (a2_addi, srca_addi, dsta_addi, dsta2);
-A1_add		:= FD1 (a1_add, a1_add\, a1_addi, clk);
-A2_add		:= FD1 (a2_add, a2_add\, a2_addi, clk);
-A2_addb		:= BUF1 (a2_addb, a2_add);
-*/
-            a1_add = (dsta2 ? srca_addi : dsta_addi);
-            a2_add = (dsta2 ? dsta_addi : srca_addi);
+	en = ((dend & 0x3F) ? 1 : 0);
+	e_coarse = decl38e[en][(dend & 0x38) >> 3];		// Actually, this is e_coarse inverted...
+	e_fine = decl38e[(e_coarse & 0x01) ^ 0x01][dend & 0x07];
+	e_fine &= 0xFE;
+//////////////////////////////////////////////////////////////////////////////////////
 
-            /* Address adder input A register selection
-               000	A1 step integer part
-               001	A1 step fraction part
-               010	A1 increment integer part
-               011	A1 increment fraction part
-               100	A2 step
+/*Scoarse		:= DECH38 (s_coarse[0..7], dstart[3..5]);
+Sfen\		:= INV1 (sfen\, s_coarse[0]);
+Sfine		:= DECH38EL (s_fine[0..7], dstart[0..2], sfen\);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	s_coarse = dech38[(dstart & 0x38) >> 3];
+	s_fine = dech38el[(s_coarse & 0x01) ^ 0x01][dstart & 0x07];
+//////////////////////////////////////////////////////////////////////////////////////
 
-               bit 2 = a2update
-               bit 1 = /a2update . (a1_add . a1addx[0..1])
-               bit 0 = /a2update . ( a1fupdate
-               + a1_add . atick[0] . a1addx[0..1])
-               The /a2update term on bits 0 and 1 is redundant.
-               Now look-ahead based
-               */
+/*Maskt[0]	:= BUF1 (maskt[0], s_fine[0]);
+Maskt[1-7]	:= OAN1P (maskt[1-7], maskt[0-6], s_fine[1-7], e_fine\[1-7]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	maskt = s_fine & 0x0001;
+	maskt |= (((maskt & 0x0001) || (s_fine & 0x02u)) && (e_fine & 0x02u) ? 0x0002 : 0x0000);
+	maskt |= (((maskt & 0x0002) || (s_fine & 0x04u)) && (e_fine & 0x04u) ? 0x0004 : 0x0000);
+	maskt |= (((maskt & 0x0004) || (s_fine & 0x08u)) && (e_fine & 0x08u) ? 0x0008 : 0x0000);
+	maskt |= (((maskt & 0x0008) || (s_fine & 0x10u)) && (e_fine & 0x10u) ? 0x0010 : 0x0000);
+	maskt |= (((maskt & 0x0010) || (s_fine & 0x20u)) && (e_fine & 0x20u) ? 0x0020 : 0x0000);
+	maskt |= (((maskt & 0x0020) || (s_fine & 0x40u)) && (e_fine & 0x40u) ? 0x0040 : 0x0000);
+	maskt |= (((maskt & 0x0040) || (s_fine & 0x80u)) && (e_fine & 0x80u) ? 0x0080 : 0x0000);
+//////////////////////////////////////////////////////////////////////////////////////
 
-            addasel = (a1fupdate || (a1_add && a1addx == 3) ? 0x01 : 0x00);
-            addasel |= (a1_add && a1addx == 3 ? 0x02 : 0x00);
-            addasel |= (a2update ? 0x04 : 0x00);
-            /* Address adder input A X constant selection
-               adda_xconst[0..2] generate a power of 2 in the range 1-64 or all
-               zeroes when they are all 1
-               Remember - these are pixels, so to add one phrase the pixel size
-               has to be taken into account to get the appropriate value.
-               for A1
-               if a1addx[0..1] are 00 set 6 - pixel size
-               if a1addx[0..1] are 01 set the value 000
-               if a1addx[0..1] are 10 set the value 111
-               similarly for A2
-JLH: Also, 11 will likewise set the value to 111
-*/
-            adda_xconst = (a2_add ? a2_xconst : a1_xconst);
-            /* Address adder input A Y constant selection
-               22 June 94 - This was erroneous, because only the a1addy bit was reflected here.
-               Therefore, the selection has to be controlled by a bug fix bit.
-JLH: Bug fix bit in Jaguar II--not in Jaguar I!
-*/
-            adda_yconst = a1addy;
-            /* Address adder input A register versus constant selection
-               given by	  a1_add . a1addx[0..1]
-               + a1update
-               + a1fupdate
-               + a2_add . a2addx[0..1]
-               + a2update
-               */
-            addareg = ((a1_add && a1addx == 3) || a1update || a1fupdate
-                  || (a2_add && a2addx == 3) || a2update ? true : false);
-            /* The adders can be put into subtract mode in add pixel size
-               mode when the corresponding flags are set */
-            suba_x = ((a1_add && a1xsign && a1addx == 1) || (a2_add && a2xsign && a2addx == 1) ? true : false);
-            suba_y = ((a1_add && a1addy && a1ysign) || (a2_add && a2addy && a2ysign) ? true : false);
-            /* Address adder input B selection
-               00	A1 pointer
-               01	A2 pointer
-               10	A1 fraction
-               11	Zero
+   /* Produce a look-ahead on the ripple carry */
+	maskt |= (((s_coarse & e_coarse & 0x01u) || (s_coarse & 0x02u)) && (e_coarse & 0x02u) ? 0x0100 : 0x0000);
+	maskt |= (((maskt & 0x0100) || (s_coarse & 0x04u)) && (e_coarse & 0x04u) ? 0x0200 : 0x0000);
+	maskt |= (((maskt & 0x0200) || (s_coarse & 0x08u)) && (e_coarse & 0x08u) ? 0x0400 : 0x0000);
+	maskt |= (((maskt & 0x0400) || (s_coarse & 0x10u)) && (e_coarse & 0x10u) ? 0x0800 : 0x0000);
+	maskt |= (((maskt & 0x0800) || (s_coarse & 0x20u)) && (e_coarse & 0x20u) ? 0x1000 : 0x0000);
+	maskt |= (((maskt & 0x1000) || (s_coarse & 0x40u)) && (e_coarse & 0x40u) ? 0x2000 : 0x0000);
+	maskt |= (((maskt & 0x2000) || (s_coarse & 0x80u)) && (e_coarse & 0x80u) ? 0x4000 : 0x0000);
 
-               Bit 1 =   a1fupdate
-               + (a1_add . atick[0] . a1addx[0..1])
-               + a1fupdate . a1_stepld
-               + a1update . a1_stepld
-               + a2update . a2_stepld
-               Bit 0 =   a2update + a2_add
-               + a1fupdate . a1_stepld
-               + a1update . a1_stepld
-               + a2update . a2_stepld
-               */
-            addbsel = (a2update || a2_add || (a1fupdate && a1_stepld)
-                  || (a1update && a1_stepld) || (a2update && a2_stepld) ? 0x01 : 0x00);
-            addbsel |= (a1fupdate || (a1_add && a1addx == 3) || (a1fupdate && a1_stepld)
-                  || (a1update && a1_stepld) || (a2update && a2_stepld) ? 0x02 : 0x00);
+/* The bit terms are mirrored for big-endian pixels outside phrase
+mode.  The byte terms are mirrored for big-endian pixels in phrase
+mode.  */
 
-            /* The modulo bits are used to align X onto a phrase boundary when
-               it is being updated by one phrase
-               000	no mask
-               001	mask bit 0
-               010	mask bits 1-0
-               ..
-               110  	mask bits 5-0
+/*Mirror_bit	:= AN2M (mir_bit, phrase_mode\, big_pix);
+Mirror_byte	:= AN2H (mir_byte, phrase_mode, big_pix);
 
-               Masking is enabled for a1 when a1addx[0..1] is 00, and the value
-               is 6 - the pixel size (again!)
-               */
-            maska1 = (a1_add && a1addx == 0 ? 6 - a1_pixsize : 0);
-            maska2 = (a2_add && a2addx == 0 ? 6 - a2_pixsize : 0);
-            modx = (a2_add ? maska2 : maska1);
-            /* Generate load strobes for the increment updates */
+Masktb[14]	:= BUF1 (masktb[14], maskt[14]);
+Masku[0]	:= MX4 (masku[0],  maskt[0],  maskt[7],  maskt[14],  zero, mir_bit, mir_byte);
+Masku[1]	:= MX4 (masku[1],  maskt[1],  maskt[6],  maskt[14],  zero, mir_bit, mir_byte);
+Masku[2]	:= MX4 (masku[2],  maskt[2],  maskt[5],  maskt[14],  zero, mir_bit, mir_byte);
+Masku[3]	:= MX4 (masku[3],  maskt[3],  maskt[4],  masktb[14], zero, mir_bit, mir_byte);
+Masku[4]	:= MX4 (masku[4],  maskt[4],  maskt[3],  masktb[14], zero, mir_bit, mir_byte);
+Masku[5]	:= MX4 (masku[5],  maskt[5],  maskt[2],  masktb[14], zero, mir_bit, mir_byte);
+Masku[6]	:= MX4 (masku[6],  maskt[6],  maskt[1],  masktb[14], zero, mir_bit, mir_byte);
+Masku[7]	:= MX4 (masku[7],  maskt[7],  maskt[0],  masktb[14], zero, mir_bit, mir_byte);
+Masku[8]	:= MX2 (masku[8],  maskt[8],  maskt[13], mir_byte);
+Masku[9]	:= MX2 (masku[9],  maskt[9],  maskt[12], mir_byte);
+Masku[10]	:= MX2 (masku[10], maskt[10], maskt[11], mir_byte);
+Masku[11]	:= MX2 (masku[11], maskt[11], maskt[10], mir_byte);
+Masku[12]	:= MX2 (masku[12], maskt[12], maskt[9],  mir_byte);
+Masku[13]	:= MX2 (masku[13], maskt[13], maskt[8],  mir_byte);
+Masku[14]	:= MX2 (masku[14], maskt[14], maskt[0],  mir_byte);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
 
-            /*A1pldt		:= NAN2 (a1pldt, atick[1], a1_add);
-A1ptrldi	:= NAN2 (a1ptrldi, a1update\, a1pldt);
+	mir_bit  = true/*big_pix*/ && !phrase_mode;
+	mir_byte = true/*big_pix*/ && phrase_mode;
+	masku    = maskt;
 
-A1fldt		:= NAN4 (a1fldt, atick[0], a1_add, a1addx[0..1]);
-A1fracldi	:= NAN2 (a1fracldi, a1fupdate\, a1fldt);
+	if (mir_bit)
+	{
+		masku &= 0xFF00;
+		masku |= (maskt >> 7) & 0x0001;
+		masku |= (maskt >> 5) & 0x0002;
+		masku |= (maskt >> 3) & 0x0004;
+		masku |= (maskt >> 1) & 0x0008;
+		masku |= (maskt << 1) & 0x0010;
+		masku |= (maskt << 3) & 0x0020;
+		masku |= (maskt << 5) & 0x0040;
+		masku |= (maskt << 7) & 0x0080;
+	}
 
-A2pldt		:= NAN2 (a2pldt, atick[1], a2_add);
-A2ptrldi	:= NAN2 (a2ptrldi, a2update\, a2pldt);*/
+	if (mir_byte)
+	{
+		/* MX4 input 2: masku[7:0] = {8{maskt[14]}} (broadcast bit 14) */
+		masku = (maskt & 0x4000) ? 0x00FF : 0x0000;
+		/* MX2: reverse bits 8-13, maskt[0] at position 14 */
+		masku |= (maskt >> 5) & 0x0100;
+		masku |= (maskt >> 3) & 0x0200;
+		masku |= (maskt >> 1) & 0x0400;
+		masku |= (maskt << 1) & 0x0800;
+		masku |= (maskt << 3) & 0x1000;
+		masku |= (maskt << 5) & 0x2000;
+		masku |= (maskt & 0x0001) << 14;
+	}
+//////////////////////////////////////////////////////////////////////////////////////
+
+/* The maskt terms define the area for changed data, but the byte
+inhibit terms can override these */
+
+/*Mask[0-7]	:= AN2 (mask[0-7], masku[0-7], dbinh\[0]);
+Mask[8-14]	:= AN2H (mask[8-14], masku[8-14], dbinh\[1-7]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	mask = masku & (!(dbinh & 0x01) ? 0xFFFF : 0xFF00);
+	mask &= ~(((uint16_t)dbinh & 0x00FE) << 7);
+//////////////////////////////////////////////////////////////////////////////////////
+
+/*Addql[0]	:= JOIN (addql[0], addq[0..1]);
+Addql[1]	:= JOIN (addql[1], addq[2..3]);
+
+Dsel0b[0-1]	:= BUF8 (dsel0b[0-1], data_sel[0]);
+Dsel1b[0-1]	:= BUF8 (dsel1b[0-1], data_sel[1]);
+Ddatlo		:= MX4 (ddatlo, patd[0], lfu[0], addql[0], zero32, dsel0b[0], dsel1b[0]);
+Ddathi		:= MX4 (ddathi, patd[1], lfu[1], addql[1], zero32, dsel0b[1], dsel1b[1]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	dmux[0] = patd_pre;
+	dmux[1] = lfu;
+	dmux[2] = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+	dmux[3] = 0;
+	ddat = dmux[data_sel];
+	}
+//////////////////////////////////////////////////////////////////////////////////////
+
+/*Zed_sel		:= AN2 (zed_sel, data_sel[0..1]);
+Zed_selb[0-1]	:= BUF8 (zed_selb[0-1], zed_sel);
+
+Dat[0-7]	:= MX4 (dat[0-7],   dstdlo{0-7},   ddatlo{0-7},   dstzlo{0-7},   srczlo{0-7},   mask[0-7], zed_selb[0]);
+Dat[8-15]	:= MX4 (dat[8-15],  dstdlo{8-15},  ddatlo{8-15},  dstzlo{8-15},  srczlo{8-15},  mask[8],   zed_selb[0]);
+Dat[16-23]	:= MX4 (dat[16-23], dstdlo{16-23}, ddatlo{16-23}, dstzlo{16-23}, srczlo{16-23}, mask[9],   zed_selb[0]);
+Dat[24-31]	:= MX4 (dat[24-31], dstdlo{24-31}, ddatlo{24-31}, dstzlo{24-31}, srczlo{24-31}, mask[10],  zed_selb[0]);
+Dat[32-39]	:= MX4 (dat[32-39], dstdhi{0-7},   ddathi{0-7},   dstzhi{0-7},   srczhi{0-7},   mask[11],  zed_selb[1]);
+Dat[40-47]	:= MX4 (dat[40-47], dstdhi{8-15},  ddathi{8-15},  dstzhi{8-15},  srczhi{8-15},  mask[12],  zed_selb[1]);
+Dat[48-55]	:= MX4 (dat[48-55], dstdhi{16-23}, ddathi{16-23}, dstzhi{16-23}, srczhi{16-23}, mask[13],  zed_selb[1]);
+Dat[56-63]	:= MX4 (dat[56-63], dstdhi{24-31}, ddathi{24-31}, dstzhi{24-31}, srczhi{24-31}, mask[14],  zed_selb[1]);*/
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+	*wdata = blitter_simd_ops.byte_merge(ddat, dstd, mask);
+	*srcz = blitter_simd_ops.byte_merge(*srcz, dstz, mask);
+//////////////////////////////////////////////////////////////////////////////////////
+
+/*Data_enab[0-1]	:= BUF8 (data_enab[0-1], data_ena);
+Datadrv[0-31]	:= TS (wdata[0-31],  dat[0-31],  data_enab[0]);
+Datadrv[32-63]	:= TS (wdata[32-63], dat[32-63], data_enab[1]);
+
+Unused[0]	:= DUMMY (unused[0]);
+
+END;*/
+}
+
+#ifdef BLITTER_TRACE /* opt-in per-blit tracing state; macro is set by `make BLITTER_TRACE=1` */
+#include <mach/mach_time.h> /* mach_absolute_time(); Mach header, so this path is macOS-only */
+#include <stdio.h>
+static double bm2_trace_threshold_ms = 0.3; /* dump any blit slower than this (milliseconds); NOTE(review): the Makefile comment advertises ~1.5 ms -- confirm which value is intended */
+static uint64_t bm2_trace_t0; /* mach_absolute_time() sample taken on entry to BlitterMidsummer2 */
+#endif /* BLITTER_TRACE */
+
+void BlitterMidsummer2(void)
+{
+   uint32_t cmd = (PERF_INC(blitter_calls), GET32(blitter_ram, COMMAND));
+#ifdef BLITTER_TRACE
+   bm2_trace_t0 = mach_absolute_time();
+#endif
+
+
+   // Line states passed in via the command register
+
+   bool srcen = (SRCEN), srcenx = (SRCENX), srcenz = (SRCENZ),
+        dsten = (DSTEN), dstenz = (DSTENZ), dstwrz = (DSTWRZ), clip_a1 = (CLIPA1),
+        upda1 = (UPDA1), upda1f = (UPDA1F), upda2 = (UPDA2), dsta2 = (DSTA2),
+        gourd = (GOURD), gourz = (GOURZ), topben = (TOPBEN), topnen = (TOPNEN),
+        patdsel = (PATDSEL), adddsel = (ADDDSEL), cmpdst = (CMPDST), bcompen = (BCOMPEN),
+        dcompen = (DCOMPEN), bkgwren = (BKGWREN), srcshade = (SRCSHADE);
+
+   uint8_t zmode = (cmd & 0x01C0000) >> 18, lfufunc = (cmd & 0x1E00000) >> 21;
+   //Missing: BUSHI
+   //Where to find various lines:
+   // clip_a1  -> inner
+   // gourd    -> dcontrol, inner, outer, state
+   // gourz    -> dcontrol, inner, outer, state
+   // cmpdst   -> blit, data, datacomp, state
+   // bcompen  -> acontrol, inner, mcontrol, state
+   // dcompen  -> inner, state
+   // bkgwren  -> inner, state
+   // srcshade -> dcontrol, inner, state
+   // adddsel  -> dcontrol
+   //NOTE: ADDDSEL takes precedence over PATDSEL, PATDSEL over LFU_FUNC
+
+   // Lines that don't exist in Jaguar I (and will never be asserted)
+
+   bool polygon = false, datinit = false, a1_stepld = false, a2_stepld = false, ext_int = false;
+   bool istepadd = false, istepfadd = false;
+   bool zstepfadd = false, zstepadd = false;
+
+   // Various state lines (initial state--basically the reset state of the FDSYNCs)
+
+   bool go = true, idle = true, inner = false, a1fupdate = false, a1update = false,
+        zfupdate = false, zupdate = false, a2update = false, init_if = false, init_ii = false,
+        init_zf = false, init_zi = false;
+
+   bool outer0 = false, indone = false;
+
+   bool idlei, inneri, a1fupdatei, a1updatei, zfupdatei, zupdatei, a2updatei, init_ifi, init_iii,
+        init_zfi, init_zii;
+
+   bool notgzandp = !(gourz && polygon);
+
+
+   // Various registers set up by user
+
+   uint16_t ocount = GET16(blitter_ram, PIXLINECOUNTER);
+   uint8_t a1_pitch = blitter_ram[A1_FLAGS + 3] & 0x03;
+   uint8_t a2_pitch = blitter_ram[A2_FLAGS + 3] & 0x03;
+   uint8_t a1_pixsize = (blitter_ram[A1_FLAGS + 3] & 0x38) >> 3;
+   uint8_t a2_pixsize = (blitter_ram[A2_FLAGS + 3] & 0x38) >> 3;
+   uint8_t a1_zoffset = (GET16(blitter_ram, A1_FLAGS + 2) >> 6) & 0x07;
+   uint8_t a2_zoffset = (GET16(blitter_ram, A2_FLAGS + 2) >> 6) & 0x07;
+   uint8_t a1_width = (blitter_ram[A1_FLAGS + 2] >> 1) & 0x3F;
+   uint8_t a2_width = (blitter_ram[A2_FLAGS + 2] >> 1) & 0x3F;
+   uint8_t a1addx = blitter_ram[A1_FLAGS + 1] & 0x03, a2addx = blitter_ram[A2_FLAGS + 1] & 0x03;
+   bool a1addy = blitter_ram[A1_FLAGS + 1] & 0x04, a2addy = blitter_ram[A2_FLAGS + 1] & 0x04;
+   bool a1xsign = blitter_ram[A1_FLAGS + 1] & 0x08, a2xsign = blitter_ram[A2_FLAGS + 1] & 0x08;
+   bool a1ysign = blitter_ram[A1_FLAGS + 1] & 0x10, a2ysign = blitter_ram[A2_FLAGS + 1] & 0x10;
+   uint32_t a1_base = GET32(blitter_ram, A1_BASE) & 0xFFFFFFF8;	// Phrase aligned by ignoring bottom 3 bits
+   uint32_t a2_base = GET32(blitter_ram, A2_BASE) & 0xFFFFFFF8;
+
+   uint16_t a1_win_x = GET16(blitter_ram, A1_CLIP + 2) & 0x7FFF;
+   uint16_t a1_win_y = GET16(blitter_ram, A1_CLIP + 0) & 0x7FFF;
+   int16_t a1_x = (int16_t)GET16(blitter_ram, A1_PIXEL + 2);
+   int16_t a1_y = (int16_t)GET16(blitter_ram, A1_PIXEL + 0);
+   int16_t a1_step_x = (int16_t)GET16(blitter_ram, A1_STEP + 2);
+   int16_t a1_step_y = (int16_t)GET16(blitter_ram, A1_STEP + 0);
+   uint16_t a1_stepf_x = GET16(blitter_ram, A1_FSTEP + 2);
+   uint16_t a1_stepf_y = GET16(blitter_ram, A1_FSTEP + 0);
+   uint16_t a1_frac_x = GET16(blitter_ram, A1_FPIXEL + 2);
+   uint16_t a1_frac_y = GET16(blitter_ram, A1_FPIXEL + 0);
+   int16_t a1_inc_x = (int16_t)GET16(blitter_ram, A1_INC + 2);
+   int16_t a1_inc_y = (int16_t)GET16(blitter_ram, A1_INC + 0);
+   uint16_t a1_incf_x = GET16(blitter_ram, A1_FINC + 2);
+   uint16_t a1_incf_y = GET16(blitter_ram, A1_FINC + 0);
 
-            a1fracldi = a1fupdate || (a1_add && a1addx == 3);
+   int16_t a2_x = (int16_t)GET16(blitter_ram, A2_PIXEL + 2);
+   int16_t a2_y = (int16_t)GET16(blitter_ram, A2_PIXEL + 0);
+#if 0
+   bool a2_mask = blitter_ram[A2_FLAGS + 2] & 0x80;
+   uint16_t a2_mask_x = GET16(blitter_ram, A2_MASK + 2);
+   uint16_t a2_mask_y = GET16(blitter_ram, A2_MASK + 0);
+   uint32_t collision = GET32(blitter_ram, COLLISIONCTRL);// 0=RESUME, 1=ABORT, 2=STOPEN
+#endif
+   int16_t a2_step_x = (int16_t)GET16(blitter_ram, A2_STEP + 2);
+   int16_t a2_step_y = (int16_t)GET16(blitter_ram, A2_STEP + 0);
 
-            ADDRGEN(&address, &pixAddr, gena2i, zaddr,
-                  a1_x, a1_y, a1_base, a1_pitch, a1_pixsize, a1_width, a1_zoffset,
-                  a2_x, a2_y, a2_base, a2_pitch, a2_pixsize, a2_width, a2_zoffset);
+   uint64_t srcd1 = GET64(blitter_ram, SRCDATA);
+   uint64_t srcd2 = 0;
+   uint64_t dstd = GET64(blitter_ram, DSTDATA);
+   uint64_t patd = GET64(blitter_ram, PATTERNDATA);
+   uint32_t iinc = GET32(blitter_ram, INTENSITYINC);
+   uint64_t srcz1 = GET64(blitter_ram, SRCZINT);
+   uint64_t srcz2 = GET64(blitter_ram, SRCZFRAC);
+   uint64_t dstz = GET64(blitter_ram, DSTZ);
+   uint32_t zinc = GET32(blitter_ram, ZINC);
 
-            //Here's my guess as to how the addresses get truncated to phrase boundaries in phrase mode...
-            if (!justify)
-               address &= 0xFFFFF8;
+   uint8_t pixsize = (dsta2 ? a2_pixsize : a1_pixsize);	// From ACONTROL
 
-            /* dstxp needed for dstart computation in dwrite */
-            dstxp = (dsta2 ? a2_x : a1_x) & 0x3F;
+   bool phrase_mode;
+   uint16_t a1FracCInX = 0, a1FracCInY = 0;
 
-            if (sreadx)
-            {
-               //uint32_t srcAddr, pixAddr;
-               //ADDRGEN(srcAddr, pixAddr, gena2i, zaddr,
-               //	a1_x, a1_y, a1_base, a1_pitch, a1_pixsize, a1_width, a1_zoffset,
-               //	a2_x, a2_y, a2_base, a2_pitch, a2_pixsize, a2_width, a2_zoffset);
-               srcd2 = srcd1;
-               srcd1 = ((uint64_t)JaguarReadLong(address + 0, BLITTER) << 32)
-                  | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-               //Kludge to take pixel size into account...
-               //Hmm. If we're not in phrase mode, this is most likely NOT going to be used...
-               //Actually, it would be--because of BCOMPEN expansion, for example...
-               if (!phrase_mode)
-               {
-                  if (bcompen)
-                     srcd1 >>= 56;
-                  else
-                  {
-                     if (pixsize == 5)
-                        srcd1 >>= 32;
-                     else if (pixsize == 4)
-                        srcd1 >>= 48;
-                     else
-                        srcd1 >>= 56;
-                  }
-               }//*/
-            }
+   // Bugs in Jaguar I
 
-            if (szreadx)
-            {
-               srcz2 = srcz1;
-               srcz1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-            }
+   a2addy = a1addy;							// A2 channel Y add bit is tied to A1's
 
-            if (sread)
-            {
-               srcd2 = srcd1;
-               srcd1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-               //Kludge to take pixel size into account...
-               if (!phrase_mode)
-               {
-                  if (bcompen)
-                     srcd1 >>= 56;
-                  else
-                  {
-                     if (pixsize == 5)
-                        srcd1 >>= 32;
-                     else if (pixsize == 4)
-                        srcd1 >>= 48;
-                     else
-                        srcd1 >>= 56;
-                  }
-               }
-            }
+   // Various state lines set up by user
 
-            if (szread)
-            {
-               srcz2 = srcz1;
-               srcz1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-               //Kludge to take pixel size into account... I believe that it only has to take 16BPP mode into account. Not sure tho.
-               if (!phrase_mode && pixsize == 4)
-                  srcz1 >>= 48;
+   phrase_mode = ((!dsta2 && a1addx == 0) || (dsta2 && a2addx == 0) ? true : false);	// From ACONTROL
 
-            }
+   // Stopgap vars to simulate various lines
 
-            if (dread)
-            {
-               dstd = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-               //Kludge to take pixel size into account...
-               if (!phrase_mode)
-               {
-                  if (pixsize == 5)
-                     dstd >>= 32;
-                  else if (pixsize == 4)
-                     dstd >>= 48;
-                  else
-                     dstd >>= 56;
-               }
-            }
 
-            if (dzread)
-            {
-               // Is Z always 64 bit read? Or sometimes 16 bit (dependent on phrase_mode)?
-               dstz = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
-               //Kludge to take pixel size into account... I believe that it only has to take 16BPP mode into account. Not sure tho.
-               if (!phrase_mode && pixsize == 4)
-                  dstz >>= 48;
+   while (true)
+   {
+      PERF_INC(blitter_outer);
+      // IDLE
 
-            }
+      if ((idle && !go) || (inner && outer0 && indone))
+      {
+         idlei = true;
 
-            // These vars should probably go further up in the code... !!! FIX !!!
-            // We can't preassign these unless they're static...
-            //NOTE: SRCSHADE requires GOURZ to be set to work properly--another Jaguar I bug
-            if (dwrite)
-            {
-               //Counter is done on the dwrite state...! (We'll do it first, since it affects dstart/dend calculations.)
-               //Here's the voodoo for figuring the correct amount of pixels in phrase mode (or not):
-               int8_t inct = -((dsta2 ? a2_x : a1_x) & 0x07);	// From INNER_CNT
-               uint8_t inc = 0;
-               uint16_t oldicount;
-               uint8_t dstart = 0;
+         //Instead of a return, let's try breaking out of the loop...
+         break;
+      }
+      else
+         idlei = false;
 
-               inc = (!phrase_mode || (phrase_mode && (inct & 0x01)) ? 0x01 : 0x00);
-               inc |= (phrase_mode && (((pixsize == 3 || pixsize == 4) && (inct & 0x02)) || (pixsize == 5 && !(inct & 0x01))) ? 0x02 : 0x00);
-               inc |= (phrase_mode && ((pixsize == 3 && (inct & 0x04)) || (pixsize == 4 && !(inct & 0x03))) ? 0x04 : 0x00);
-               inc |= (phrase_mode && pixsize == 3 && !(inct & 0x07) ? 0x08 : 0x00);
+      // INNER LOOP ACTIVE
 
-               oldicount = icount;	// Save icount to detect underflow...
-               icount -= inc;
+      if ((idle && go && !datinit)
+            || (inner && !indone)
+            || (inner && indone && !outer0 && !upda1f && !upda1 && notgzandp && !upda2 && !datinit)
+            || (a1update && !upda2 && notgzandp && !datinit)
+            || (zupdate && !upda2 && !datinit)
+            || (a2update && !datinit)
+            || (init_ii && !gourz)
+            || (init_zi))
+         inneri = true;
+      else
+         inneri = false;
 
-               if (icount == 0 || ((icount & 0x8000) && !(oldicount & 0x8000)))
-                  inner0 = true;
-               // X/Y stepping is also done here, I think...No. It's done when a1_add or a2_add is asserted...
+      // A1 FRACTION UPDATE
 
-               //*********************************************************************************
-               //Start & end write mask computations...
-               //*********************************************************************************
+      if (inner && indone && !outer0 && upda1f)
+         a1fupdatei = true;
+      else
+         a1fupdatei = false;
 
+      // A1 POINTER UPDATE
 
-               if (phrase_mode)
-               {
-                  if (pixsize == 3)
-                     dstart = (dstxp & 0x07) << 3;
-                  else if (pixsize == 4)
-                     dstart = (dstxp & 0x03) << 4;
-                  else if (pixsize == 5)
-                     dstart = (dstxp & 0x01) << 5;
-               }
-               else
-                  dstart    = pixAddr & 0x07;
+      if ((a1fupdate)
+            || (inner && indone && !outer0 && !upda1f && upda1))
+         a1updatei = true;
+      else
+         a1updatei = false;
 
-               //This is the other Jaguar I bug... Normally, should ALWAYS select a1_x here.
-               dstxwr = (dsta2 ? a2_x : a1_x) & 0x7FFE;
-               pseq = dstxwr ^ (a1_win_x & 0x7FFE);
-               pseq = (pixsize == 5 ? pseq : pseq & 0x7FFC);
-               pseq = ((pixsize & 0x06) == 4 ? pseq : pseq & 0x7FF8);
-               penden = clip_a1 && (pseq == 0);
-               window_mask = 0;
+      // Z FRACTION UPDATE
 
-               if (penden)
-               {
-                  if (pixsize == 3)
-                     window_mask = (a1_win_x & 0x07) << 3;
-                  else if (pixsize == 4)
-                     window_mask = (a1_win_x & 0x03) << 4;
-                  else if (pixsize == 5)
-                     window_mask = (a1_win_x & 0x01) << 5;
-               }
-               else
-                  window_mask    = 0;
+      if ((a1update && gourz && polygon)
+            || (inner && indone && !outer0 && !upda1f && !upda1 && gourz && polygon))
+         zfupdatei = true;
+      else
+         zfupdatei = false;
 
-               /* The mask to be used if within one phrase of the end of the inner
-                  loop, similarly */
+      // Z INTEGER UPDATE
 
-               if (inner0)
-               {
-                  if (pixsize == 3)
-                     inner_mask = (icount & 0x07) << 3;
-                  else if (pixsize == 4)
-                     inner_mask = (icount & 0x03) << 4;
-                  else if (pixsize == 5)
-                     inner_mask = (icount & 0x01) << 5;
-               }
-               else
-                  inner_mask    = 0;
+      if (zfupdate)
+         zupdatei = true;
+      else
+         zupdatei = false;
 
-               /* The actual mask used should be the
-                  lesser of the window masks and
-                  the inner mask, where is all cases 000 means 1000. */
-               window_mask = (window_mask == 0 ? 0x40 : window_mask);
-               inner_mask  = (inner_mask == 0 ? 0x40 : inner_mask);
+      // A2 POINTER UPDATE
 
-               emask       = (window_mask > inner_mask ? inner_mask : window_mask);
-               /* The mask to be used for the pixel size, to which must be added
-                  the bit offset */
-               pma = pixAddr + (1 << pixsize);
-               /* Select the mask */
-               dend = (phrase_mode ? emask : pma);
+      if ((a1update && upda2 && notgzandp)
+            || (zupdate && upda2)
+            || (inner && indone && !outer0 && !upda1f && notgzandp && !upda1 && upda2))
+         a2updatei = true;
+      else
+         a2updatei = false;
 
-               /* The cycle width in phrase mode is normally one phrase.  However,
-                  at the start and end it may be narrower.  The start and end masks
-                  are used to generate this.  The width is given by:
+      // INITIALIZE INTENSITY FRACTION
 
-                  8 - start mask - (8 - end mask)
-                  =	end mask - start mask
+      if ((zupdate && !upda2 && datinit)
+            || (a1update && !upda2 && datinit && notgzandp)
+            || (inner && indone && !outer0 && !upda1f && !upda1 && notgzandp && !upda2 && datinit)
+            || (a2update && datinit)
+            || (idle && go && datinit))
+         init_ifi = true;
+      else
+         init_ifi = false;
 
-                  This is only used for writes in phrase mode.
-                  Start and end from the address level of the pipeline are used.
-                  */
+      // INITIALIZE INTENSITY INTEGER
 
-               //Phrase mode needs destination data for start/end mask byte merging,
-               //but NOT when bkgwren is set (hardware uses DSTDATA register value).
-               if (phrase_mode && !dsten && !bkgwren)
-                  dstd = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+      if (init_if)
+         init_iii = true;
+      else
+         init_iii = false;
 
-               // Write data combines srcd and dstd through ADDDSEL, PATDSEL, or LFU.
-               // Precedence is ADDDSEL > PATDSEL > LFU.
+      // INITIALIZE Z FRACTION
 
-               // srcd2 = xxxx xxxx 0123 4567, srcd = 8901 2345 xxxx xxxx, srcshift = $20 (32)
-               srcd = (srcd2 << (64 - srcshift)) | (srcd1 >> srcshift);
-               //bleh, ugly ugly ugly
-               if (srcshift == 0)
-                  srcd = srcd1;
+      if (init_ii && gourz)
+         init_zfi = true;
+      else
+         init_zfi = false;
 
-               //NOTE: This only works with pixel sizes less than 8BPP...
-               //DOUBLE NOTE: Still need to do regression testing to ensure that this doesn't break other stuff... !!! CHECK !!!
-               if (!phrase_mode && srcshift != 0)
-                  srcd = ((srcd2 & 0xFF) << (8 - srcshift)) | ((srcd1 & 0xFF) >> srcshift);
+      // INITIALIZE Z INTEGER
 
-               //Z DATA() stuff done here... And it has to be done before any Z shifting...
-               //Note that we need to have phrase mode start/end support here... (Not since we moved it from dzwrite...!)
-               /*
-                  Here are a couple of Cybermorph blits with Z:
-                  $00113078	// DSTEN DSTENZ DSTWRZ CLIP_A1 GOURD GOURZ PATDSEL ZMODE=4
-                  $09900F39	// SRCEN DSTEN DSTENZ DSTWRZ UPDA1 UPDA1F UPDA2 DSTA2 ZMODE=4 LFUFUNC=C DCOMPEN
+      if (init_zf)
+         init_zii = true;
+      else
+         init_zii = false;
 
-                  We're having the same phrase mode overwrite problem we had with the pixels... !!! FIX !!!
-                  Odd. It's equating 0 with 0... Even though ZMODE is $04 (less than)!
-                  */
-               if (gourz)
-               {
-                  uint16_t addq[4];
-                  uint8_t initcin[4] = { 0, 0, 0, 0 };
-                  ADDARRAY(addq, 7/*daddasel*/, 6/*daddbsel*/, 0/*daddmode*/, 0, 0, initcin, 0, 0, 0, 0, 0, srcz1, srcz2, zinc, 0);
-                  srcz2 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
-                  ADDARRAY(addq, 6/*daddasel*/, 7/*daddbsel*/, 1/*daddmode*/, 0, 0, initcin, 0, 0, 0, 0, 0, srcz1, srcz2, zinc, 0);
-                  srcz1 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+      // Here we move the fooi into their foo counterparts in order to simulate the moving
+      // of data into the various FDSYNCs... Each time we loop we simulate one clock cycle...
 
-               }
+      idle = idlei;
+      inner = inneri;
+      a1fupdate = a1fupdatei;
+      a1update = a1updatei;
+      zfupdate = zfupdatei;		// *
+      zupdate = zupdatei;			// *
+      a2update = a2updatei;
+      init_if = init_ifi;			// *
+      init_ii = init_iii;			// *
+      init_zf = init_zfi;			// *
+      init_zi = init_zii;			// *
+      // * denotes states that will never assert for Jaguar I
 
-               zSrcShift = srcshift & 0x30;
-               srcz = (srcz2 << (64 - zSrcShift)) | (srcz1 >> zSrcShift);
-               //bleh, ugly ugly ugly
-               if (zSrcShift == 0)
-                  srcz = srcz1;
+      // Now, depending on how we want to handle things, we could either put the implementation
+      // of the various pieces up above, or handle them down below here.
 
+      // Let's try postprocessing for now...
 
-               //When in SRCSHADE mode, it adds the IINC to the read source (from LFU???)
-               //According to following line, it gets LFU mode. But does it feed the source into the LFU
-               //after the add?
-               //Dest write address/pix address: 0014E83E/0 [dstart=0 dend=10 pwidth=8 srcshift=0][daas=4 dabs=5 dam=7 ds=1 daq=F] [0000000000006505] (icount=003F, inc=1)
-               //Let's try this:
-               if (srcshade)
-               {
-                  uint16_t addq[4];
-                  uint8_t initcin[4] = { 0, 0, 0, 0 };
-                  uint32_t iinc_masked = iinc & 0x00FFFFFF;
-                  ADDARRAY(addq, 4/*daddasel*/, 5/*daddbsel*/, 7/*daddmode*/, dstd, iinc_masked, initcin, 0, 0, 0, patd, srcd, 0, 0, 0, 0);
-                  srcd = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
-               }
+      if (inner)
+      {
+         bool idle_inner = true, sreadx = false, szreadx = false, sread = false,
+              szread = false, dread = false, dzread = false, dwrite = false, dzwrite = false;
+         bool inner0 = false;
+         bool idle_inneri, sreadxi, szreadxi, sreadi, szreadi, dreadi, dzreadi, dwritei, dzwritei;
+         //other stuff
+         uint8_t srcshift = 0;
+         uint16_t icount = GET16(blitter_ram, PIXLINECOUNTER + 2);
+         bool srca_addi, dsta_addi, gensrc, gendst, gena2i, zaddr, fontread, justify, a1_add, a2_add;
+         bool adda_yconst, addareg, suba_x, suba_y, a1fracldi, shadeadd;
+         uint8_t addasel, a1_xconst, a2_xconst, adda_xconst, addbsel, maska1, maska2, modx, daddasel;
+         uint8_t daddbsel, daddmode;
+         bool patfadd, patdadd, srcz2add, daddq_sel;
+         uint8_t data_sel;
+         uint32_t address, pixAddr;
+         uint8_t dstxp;
+         uint64_t srcz;
+         bool winhibit;
 
-               /* DCONTROL: compute data adder signals.  Moved here from
-                  the per-iteration scope since they are only consumed
-                  during dwrite (dwrite=true, dzwrite=false here). */
-               shadeadd = srcshade;
-               daddasel = (gourd ? 0x01 : 0x00);
-               daddasel |= ((gourd || gourz || srcshade) ? 0x04 : 0x00);
-               daddbsel = (gourd || srcshade ? 0x01 : 0x00);
-               daddbsel |= (gourd || srcshade ? 0x04 : 0x00);
-               /* daddmode bit 0: NAND tree (dcontrol.v:130-146) makes
-                  bit 0 always 1 when dwrite&&gourd, !gourd&&!gourz,
-                  or shadeadd. */
-               daddmode = (gourd || (!gourd && !gourz) || shadeadd ? 0x01 : 0x00);
-               daddmode |= ((gourd && !topben && !ext_int)
-                     || (!gourd && !gourz && !topben) || (shadeadd && !topben) ? 0x02 : 0x00);
-               daddmode |= ((!gourd && !gourz) || shadeadd || (gourd && ext_int) ? 0x04 : 0x00);
-               patfadd = gourd;
-               patdadd = gourd;
-               srcz2add = false;
-               daddq_sel = gourd;
-               data_sel = ((!patdsel && !adddsel) ? 0x01 : 0x00)
-                  | (adddsel ? 0x02 : 0x00);
+         indone = false;
 
-               if (patfadd)
-               {
-                  uint16_t addq[4];
-                  uint8_t initcin[4] = { 0, 0, 0, 0 };
-                  ADDARRAY(addq, 4/*daddasel*/, 4/*daddbsel*/, 0/*daddmode*/, dstd, iinc, initcin, 0, 0, 0, patd, srcd, 0, 0, 0, 0);
-                  srcd1 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
-               }
+         /* Precompute address constants (invariant during inner loop) */
+         a1_xconst = 6 - a1_pixsize;
+         a2_xconst = 6 - a2_pixsize;
+         if (a1addx == 1)
+            a1_xconst = 0;
+         else if (a1addx & 0x02)
+            a1_xconst = 7;
+         if (a2addx == 1)
+            a2_xconst = 0;
+         else if (a2addx & 0x02)
+            a2_xconst = 7;
 
-               /* atick[0]/[1] two-phase pipeline: fractional intensity/Z update
-                  runs in the patfadd/srcz2add block above (Phase 0), integer
-                  update runs via DATA→patdadd below (Phase 1).  The dbinh
-                  param below is overwritten inside DATA by COMP_CTRL. */
+         /* Precompute srcshift — loaded on first inner cycle (sshftld),
+            then held constant for all subsequent cycles. */
+         {
+            uint8_t dstxp0, srcxp0, shftv0, pobb0, loshd0;
+            bool pobbsel0;
 
-               DATA(&wdata, &dcomp, &zcomp, &winhibit,
-                     true, cmpdst, daddasel, daddbsel, daddmode, daddq_sel, data_sel, 0/*dbinh*/,
-                     dend, dstart, dstd, iinc, lfufunc, &patd, patdadd,
-                     phrase_mode, srcd, false/*srcdread*/, false/*srczread*/, srcz2add, zmode,
-                     bcompen, bkgwren, dcompen, icount & 0x07, pixsize,
-                     &srcz, dstz, zinc);
+            dstxp0 = (dsta2 ? a2_x : a1_x) & 0x3F;
+            srcxp0 = (dsta2 ? a1_x : a2_x) & 0x3F;
+            shftv0 = ((dstxp0 - srcxp0) << pixsize) & 0x3F;
+            pobb0 = 0;
+            if (pixsize == 3)
+               pobb0 = dstxp0 & 0x07;
+            else if (pixsize == 4)
+               pobb0 = dstxp0 & 0x03;
+            else if (pixsize == 5)
+               pobb0 = dstxp0 & 0x01;
 
-               /*
-                  DEF ADDRCOMP (
-                  a1_outside	// A1 pointer is outside window bounds
-                  :OUT;
-                  INT16/	a1_x
-                  INT16/	a1_y
-                  INT15/	a1_win_x
-                  INT15/	a1_win_y
-                  :IN);
-                  BEGIN
+            pobbsel0 = phrase_mode && bcompen;
+            loshd0 = (pobbsel0 ? pobb0 : shftv0) & 0x07;
+            srcshift = (srcen || pobbsel0 ? loshd0 : 0);
+            srcshift |= (srcen && phrase_mode ? shftv0 & 0x38 : 0);
+         }
 
-               // The address is outside if negative, or if greater than or equal
-               // to the window size
+         while (true)
+         {
+#ifdef BENCH_PROFILE
+            int blitter_did_io = 0;
+#endif
+            /* PERF_INC embedded via comma operator to keep C89 decl
+             * order valid (no statements before declarations).  */
+            uint16_t dstxwr = (PERF_INC(blitter_inner), 0), pseq;
+            bool penden;
+            uint8_t window_mask;
+            uint8_t inner_mask = 0;
+            uint8_t emask, pma, dend;
+            uint64_t srcd;
+            uint8_t zSrcShift;
+            uint64_t wdata;
+            uint8_t dcomp, zcomp;
 
-A1_xcomp	:= MAG_15 (a1xgr, a1xeq, a1xlt, a1_x{0..14}, a1_win_x{0..14});
-A1_ycomp	:= MAG_15 (a1ygr, a1yeq, a1ylt, a1_y{0..14}, a1_win_y{0..14});
-A1_outside	:= OR6 (a1_outside, a1_x{15}, a1xgr, a1xeq, a1_y{15}, a1ygr, a1yeq);
-*/
-               //NOTE: There seems to be an off-by-one bug here in the clip_a1 section... !!! FIX !!!
-               //      Actually, seems to be related to phrase mode writes...
-               //      Or is it? Could be related to non-15-bit compares as above?
-               if (clip_a1 && ((a1_x & 0x8000) || (a1_y & 0x8000) || (a1_x >= a1_win_x) || (a1_y >= a1_win_y)))
-                  winhibit = true;
+            //NOTE: sshftld probably is only asserted at the beginning of the inner loop. !!! FIX !!!
+            /* State machine: step is always true (no bus contention in
+               Jaguar I), textext/txtread never assert. Both eliminated. */
 
+            if ((dzwrite && inner0)
+                  || (dwrite && !dstwrz && inner0))
+            {
+               idle_inneri = true;
+               break;
+            }
+            else
+               idle_inneri = false;
 
-               if (!winhibit || bkgwren)
-               {
-                  if (phrase_mode)
-                  {
-                     JaguarWriteLong(address + 0, wdata >> 32, BLITTER);
-                     JaguarWriteLong(address + 4, wdata & 0xFFFFFFFF, BLITTER);
-                  }
-                  else
-                  {
-                     if (pixsize == 5)
-                        JaguarWriteLong(address, wdata & 0xFFFFFFFF, BLITTER);
-                     else if (pixsize == 4)
-                        JaguarWriteWord(address, wdata & 0x0000FFFF, BLITTER);
-                     else
-                        JaguarWriteByte(address, wdata & 0x000000FF, BLITTER);
-                  }
-               }
+            sreadxi = (idle_inner && srcenx);
+            szreadxi = (sreadx && srcenz);
+
+            sreadi = (szreadx
+                  || (sreadx && !srcenz && srcen)
+                  || (idle_inner && !srcenx && srcen)
+                  || (dzwrite && !inner0 && srcen)
+                  || (dwrite && !dstwrz && !inner0 && srcen));
 
-            }
+            szreadi = (sread && srcenz);
 
-            if (dzwrite)
-            {
-               // OK, here's the big insight: When NOT in GOURZ mode, srcz1 & 2 function EXACTLY the same way that
-               // srcd1 & 2 work--there's an implicit shift from srcz1 to srcz2 whenever srcz1 is read.
-               // OTHERWISE, srcz1 is the integer for the computed Z and srcz2 is the fractional part.
-               // Writes to srcz1 & 2 follow the same pattern as the other 64-bit registers--low 32 at the low address,
-               // high 32 at the high address (little endian!).
-               // NOTE: GOURZ is still not properly supported. Check patd/patf handling...
-               //       Phrase mode start/end masks are not properly supported either...
-               //This is not correct... !!! FIX !!!
-               //Should be OK now... We'll see...
-               //Nope. Having the same starstep write problems in phrase mode as we had with pixels... !!! FIX !!!
-               //This is not causing the problem in Hover Strike... :-/
-               //The problem was with the SREADX not shifting. Still problems with Z comparisons & other text in pregame screen...
-               if (!winhibit)
-               {
-                  if (phrase_mode)
-                  {
-                     JaguarWriteLong(address + 0, srcz >> 32, BLITTER);
-                     JaguarWriteLong(address + 4, srcz & 0xFFFFFFFF, BLITTER);
-                  }
-                  else
-                  {
-                     if (pixsize == 4)
-                        JaguarWriteWord(address, srcz & 0x0000FFFF, BLITTER);
-                  }
-               }//*/
-            }
+            dreadi = ((szread && dsten)
+                  || (sread && !srcenz && dsten)
+                  || (sreadx && !srcenz && !srcen && dsten)
+                  || (idle_inner && !srcenx && !srcen && dsten)
+                  || (dzwrite && !inner0 && !srcen && dsten)
+                  || (dwrite && !dstwrz && !inner0 && !srcen && dsten));
 
+            dzreadi = ((dread && dstenz)
+                  || (szread && !dsten && dstenz)
+                  || (sread && !srcenz && !dsten && dstenz)
+                  || (sreadx && !srcenz && !srcen && !dsten && dstenz)
+                  || (idle_inner && !srcenx && !srcen && !dsten && dstenz)
+                  || (dzwrite && !inner0 && !srcen && !dsten && dstenz)
+                  || (dwrite && !dstwrz && !inner0 && !srcen && !dsten && dstenz));
 
-            if (a1_add)
-            {
-               int16_t adda_x, adda_y, addb_x, addb_y, addq_x, addq_y;
-               ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
-                     a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
-               ADDBMUX(&addb_x, &addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
-               ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
+            dwritei = (dzread
+                  || (dread && !dstenz)
+                  || (szread && !dsten && !dstenz)
+                  || (sread && !srcenz && !dsten && !dstenz)
+                  || (sreadx && !srcenz && !srcen && !dsten && !dstenz)
+                  || (idle_inner && !srcenx && !srcen && !dsten && !dstenz)
+                  || (dzwrite && !inner0 && !srcen && !dsten && !dstenz)
+                  || (dwrite && !dstwrz && !inner0 && !srcen && !dsten && !dstenz));
 
-               //Now, write to what???
-               //a2ptrld comes from a2ptrldi...
-               //I believe it's addbsel that determines the writeback...
-               // This is where atick[0] & [1] come in, in determining which part (fractional, integer)
-               // gets written to...
-               //a1_x = addq_x;
-               //a1_y = addq_y;
-               //Kludge, to get A1 channel increment working...
-               if (a1addx == 3)
-               {
-                  a1_frac_x = addq_x, a1_frac_y = addq_y;
+            dzwritei = (dwrite && dstwrz);
 
-                  addasel = 2, addbsel = 0, a1fracldi = false;
-                  ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
-                        a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
-                  ADDBMUX(&addb_x,&addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
-                  ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
+            // Here we move the fooi into their foo counterparts in order to simulate the moving
+            // of data into the various FDSYNCs... Each time we loop we simulate one clock cycle...
 
-                  a1_x = addq_x, a1_y = addq_y;
-               }
-               else
-                  a1_x = addq_x, a1_y = addq_y;
-            }
+            idle_inner = idle_inneri;
+            sreadx = sreadxi;
+            szreadx = szreadxi;
+            sread = sreadi;
+            szread = szreadi;
+            dread = dreadi;
+            dzread = dzreadi;
+            dwrite = dwritei;
+            dzwrite = dzwritei;
 
-            if (a2_add)
-            {
-               int16_t adda_x, adda_y, addb_x, addb_y, addq_x, addq_y;
-               ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
-                     a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
-               ADDBMUX(&addb_x, &addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
-               ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
+            // Here are a few more decodes--not sure if they're supposed to go here or not...
 
-               //Now, write to what???
-               //a2ptrld comes from a2ptrldi...
-               //I believe it's addbsel that determines the writeback...
-               a2_x = addq_x;
-               a2_y = addq_y;
-            }
-         }
 
-         indone = true;
-         // The outer counter is updated here as well on the clock cycle...
+            srca_addi = (sreadxi && !srcenz) || (sreadi && !srcenz) || szreadxi || szreadi;
 
-         /* the inner loop is started whenever another state is about to
-            cause the inner state to go active */
-         //Instart		:= ND7 (instart, innert[0], innert[2..7]);
+            dsta_addi = (dwritei && !dstwrz) || dzwritei;
 
-         //Actually, it's done only when inner gets asserted without the 2nd line of conditions
-         //(inner AND !indone)
-         //fixed now...
-         //Since we don't get here until the inner loop is finished (indone = true) we can get
-         //away with doing it here...!
-         ocount--;
+            gensrc = sreadxi || szreadxi || sreadi || szreadi;
+            gendst = dreadi || dzreadi || dwritei || dzwritei;
+            gena2i = (gensrc && !dsta2) || (gendst && dsta2);
 
-         if (ocount == 0)
-            outer0 = true;
-      }
+            zaddr = szreadx || szread || dzread || dzwrite;
 
-      if (a1fupdate)
-      {
-         uint32_t a1_frac_xt = (uint32_t)a1_frac_x + (uint32_t)a1_stepf_x;
-         uint32_t a1_frac_yt = (uint32_t)a1_frac_y + (uint32_t)a1_stepf_y;
-         a1FracCInX = a1_frac_xt >> 16;
-         a1FracCInY = a1_frac_yt >> 16;
-         a1_frac_x = (uint16_t)(a1_frac_xt & 0xFFFF);
-         a1_frac_y = (uint16_t)(a1_frac_yt & 0xFFFF);
-      }
+            // Some stuff from MCONTROL.NET--not sure if this is the correct use of this decode or not...
+            /*Fontread\	:= OND1 (fontread\, sread[1], sreadx[1], bcompen);
+Fontread	:= INV1 (fontread, fontread\);
+Justt		:= NAN3 (justt, fontread\, phrase_mode, tactive\);
+Justify		:= TS (justify, justt, busen);*/
+            fontread = (sread || sreadx) && bcompen;
+            justify = !(!fontread && phrase_mode /*&& tactive*/);
 
-      if (a1update)
-      {
-         a1_x += a1_step_x + a1FracCInX;
-         a1_y += a1_step_y + a1FracCInY;
-      }
+            /* Generate inner loop update enables */
+            /*
+A1_addi		:= MX2 (a1_addi, dsta_addi, srca_addi, dsta2);
+A2_addi		:= MX2 (a2_addi, srca_addi, dsta_addi, dsta2);
+A1_add		:= FD1 (a1_add, a1_add\, a1_addi, clk);
+A2_add		:= FD1 (a2_add, a2_add\, a2_addi, clk);
+A2_addb		:= BUF1 (a2_addb, a2_add);
+*/
+            a1_add = (dsta2 ? srca_addi : dsta_addi);
+            a2_add = (dsta2 ? dsta_addi : srca_addi);
 
-      if (a2update)
-      {
-         a2_x += a2_step_x;
-         a2_y += a2_step_y;
-      }
-   }
+            /* Address adder input A register selection
+               000	A1 step integer part
+               001	A1 step fraction part
+               010	A1 increment integer part
+               011	A1 increment fraction part
+               100	A2 step
 
-   // Write values back to registers (in real blitter, these are continuously updated)
-   SET16(blitter_ram, A1_PIXEL + 2, a1_x);
-   SET16(blitter_ram, A1_PIXEL + 0, a1_y);
-   SET16(blitter_ram, A1_FPIXEL + 2, a1_frac_x);
-   SET16(blitter_ram, A1_FPIXEL + 0, a1_frac_y);
-   SET16(blitter_ram, A2_PIXEL + 2, a2_x);
-   SET16(blitter_ram, A2_PIXEL + 0, a2_y);
+               bit 2 = a2update
+               bit 1 = /a2update . (a1_add . a1addx[0..1])
+               bit 0 = /a2update . ( a1fupdate
+               + a1_add . atick[0] . a1addx[0..1])
+               The /a2update term on bits 0 and 1 is redundant.
+               Now look-ahead based
+               */
 
-}
+            addasel = (a1fupdate || (a1_add && a1addx == 3) ? 0x01 : 0x00);
+            addasel |= (a1_add && a1addx == 3 ? 0x02 : 0x00);
+            addasel |= (a2update ? 0x04 : 0x00);
+            /* Address adder input A X constant selection
+               adda_xconst[0..2] generate a power of 2 in the range 1-64 or all
+               zeroes when they are all 1
+               Remember - these are pixels, so to add one phrase the pixel size
+               has to be taken into account to get the appropriate value.
+               for A1
+               if a1addx[0..1] are 00 set 6 - pixel size
+               if a1addx[0..1] are 01 set the value 000
+               if a1addx[0..1] are 10 set the value 111
+               similarly for A2
+JLH: Also, 11 will likewise set the value to 111
+*/
+            adda_xconst = (a2_add ? a2_xconst : a1_xconst);
+            /* Address adder input A Y constant selection
+               22 June 94 - This was erroneous, because only the a1addy bit was reflected here.
+               Therefore, the selection has to be controlled by a bug fix bit.
+JLH: Bug fix bit in Jaguar II--not in Jaguar I!
+*/
+            adda_yconst = a1addy;
+            /* Address adder input A register versus constant selection
+               given by	  a1_add . a1addx[0..1]
+               + a1update
+               + a1fupdate
+               + a2_add . a2addx[0..1]
+               + a2update
+               */
+            addareg = ((a1_add && a1addx == 3) || a1update || a1fupdate
+                  || (a2_add && a2addx == 3) || a2update ? true : false);
+            /* The adders can be put into subtract mode in add pixel size
+               mode when the corresponding flags are set */
+            suba_x = ((a1_add && a1xsign && a1addx == 1) || (a2_add && a2xsign && a2addx == 1) ? true : false);
+            suba_y = ((a1_add && a1addy && a1ysign) || (a2_add && a2addy && a2ysign) ? true : false);
+            /* Address adder input B selection
+               00	A1 pointer
+               01	A2 pointer
+               10	A1 fraction
+               11	Zero
 
-// Various pieces of the blitter puzzle are teased out here...
+               Bit 1 =   a1fupdate
+               + (a1_add . atick[0] . a1addx[0..1])
+               + a1fupdate . a1_stepld
+               + a1update . a1_stepld
+               + a2update . a2_stepld
+               Bit 0 =   a2update + a2_add
+               + a1fupdate . a1_stepld
+               + a1update . a1_stepld
+               + a2update . a2_stepld
+               */
+            addbsel = (a2update || a2_add || (a1fupdate && a1_stepld)
+                  || (a1update && a1_stepld) || (a2update && a2_stepld) ? 0x01 : 0x00);
+            addbsel |= (a1fupdate || (a1_add && a1addx == 3) || (a1fupdate && a1_stepld)
+                  || (a1update && a1_stepld) || (a2update && a2_stepld) ? 0x02 : 0x00);
 
-void ADDRGEN(uint32_t *address, uint32_t *pixa, bool gena2, bool zaddr,
-	uint16_t a1_x, uint16_t a1_y, uint32_t a1_base, uint8_t a1_pitch, uint8_t a1_pixsize, uint8_t a1_width, uint8_t a1_zoffset,
-	uint16_t a2_x, uint16_t a2_y, uint32_t a2_base, uint8_t a2_pitch, uint8_t a2_pixsize, uint8_t a2_width, uint8_t a2_zoffset)
-{
-	uint16_t x = (gena2 ? a2_x : a1_x) & 0xFFFF;	// Actually uses all 16 bits to generate address...!
-	uint16_t y = (gena2 ? a2_y : a1_y) & 0x0FFF;
-	uint8_t width = (gena2 ? a2_width : a1_width);
-	uint8_t pixsize = (gena2 ? a2_pixsize : a1_pixsize);
-	uint8_t pitch = (gena2 ? a2_pitch : a1_pitch);
-	uint32_t base = (gena2 ? a2_base : a1_base) >> 3;//Only upper 21 bits are passed around the bus? Seems like it...
-	uint8_t zoffset = (gena2 ? a2_zoffset : a1_zoffset);
+            /* The modulo bits are used to align X onto a phrase boundary when
+               it is being updated by one phrase
+               000	no mask
+               001	mask bit 0
+               010	mask bits 1-0
+               ..
+               110  	mask bits 5-0
 
-	uint32_t ytm = ((uint32_t)y << 2) + ((width & 0x02) ? (uint32_t)y << 1 : 0) + ((width & 0x01) ? (uint32_t)y : 0);
+               Masking is enabled for a1 when a1addx[0..1] is 00, and the value
+               is 6 - the pixel size (again!)
+               */
+            maska1 = (a1_add && a1addx == 0 ? 6 - a1_pixsize : 0);
+            maska2 = (a2_add && a2addx == 0 ? 6 - a2_pixsize : 0);
+            modx = (a2_add ? maska2 : maska1);
+            /* Generate load strobes for the increment updates */
 
-	uint32_t ya = (ytm << (width >> 2)) >> 2;
+            /*A1pldt		:= NAN2 (a1pldt, atick[1], a1_add);
+A1ptrldi	:= NAN2 (a1ptrldi, a1update\, a1pldt);
 
-	uint32_t pa = ya + x;
-   uint8_t pt, za;
-   uint32_t phradr, shup, addr;
+A1fldt		:= NAN4 (a1fldt, atick[0], a1_add, a1addx[0..1]);
+A1fracldi	:= NAN2 (a1fracldi, a1fupdate\, a1fldt);
 
-	*pixa = pa << pixsize;
+A2pldt		:= NAN2 (a2pldt, atick[1], a2_add);
+A2ptrldi	:= NAN2 (a2ptrldi, a2update\, a2pldt);*/
 
-	pt = ((pitch & 0x01) && !(pitch & 0x02) ? 0x01 : 0x00)
-		| (!(pitch & 0x01) && (pitch & 0x02) ? 0x02 : 0x00);
-	phradr = (*pixa >> 6) << pt;
-	shup = (pitch == 0x03 ? (*pixa >> 6) : 0);
+            a1fracldi = a1fupdate || (a1_add && a1addx == 3);
 
-	za = (zaddr ? zoffset : 0) & 0x03;
-	addr = za + phradr + (shup << 1) + base;
-	*address = ((*pixa & 0x38) >> 3) | ((addr & 0x1FFFFF) << 3);
-	*pixa &= 0x07;
-}
+            ADDRGEN(&address, &pixAddr, gena2i, zaddr,
+                  a1_x, a1_y, a1_base, a1_pitch, a1_pixsize, a1_width, a1_zoffset,
+                  a2_x, a2_y, a2_base, a2_pitch, a2_pixsize, a2_width, a2_zoffset);
 
-////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////
-// Here's an important bit: The source data adder logic. Need to track down the inputs!!! //
-////////////////////////////////////////////////////////////////////////////////////////////
-////////////////////////////////////////////////////////////////////////////////////////////
+            //Here's my guess as to how the addresses get truncated to phrase boundaries in phrase mode...
+            if (!justify)
+               address &= 0xFFFFF8;
 
-void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode,
-	uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix,
-	uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2,
-	uint32_t zinc, uint32_t zstep)
-{
-   unsigned i;
-   uint16_t adda[4];
-   uint16_t addb[4];
-   uint64_t adda_val;
-   uint32_t initpix2;
-   uint16_t word;
-   uint8_t cinsel;
-   static uint8_t co[4]; /* preserved between calls */
-   uint8_t cin[4];
-   bool eightbit;
-   bool sat, hicinh;
-   uint8_t bsel_idx;
+            /* dstxp needed for dstart computation in dwrite */
+            dstxp = (dsta2 ? a2_x : a1_x) & 0x3F;
 
-   initpix2 = ((uint32_t)initpix << 16) | initpix;
+            if (sreadx)
+            {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               //uint32_t srcAddr, pixAddr;
+               //ADDRGEN(srcAddr, pixAddr, gena2i, zaddr,
+               //	a1_x, a1_y, a1_base, a1_pitch, a1_pixsize, a1_width, a1_zoffset,
+               //	a2_x, a2_y, a2_base, a2_pitch, a2_pixsize, a2_width, a2_zoffset);
+               srcd2 = srcd1;
+               srcd1 = ((uint64_t)JaguarReadLong(address + 0, BLITTER) << 32)
+                  | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+               //Kludge to take pixel size into account...
+               //Hmm. If we're not in phrase mode, this is most likely NOT going to be used...
+               //Actually, it would be--because of BCOMPEN expansion, for example...
+               if (!phrase_mode)
+               {
+                  if (bcompen)
+                     srcd1 >>= 56;
+                  else
+                  {
+                     if (pixsize == 5)
+                        srcd1 >>= 32;
+                     else if (pixsize == 4)
+                        srcd1 >>= 48;
+                     else
+                        srcd1 >>= 56;
+                  }
+               }//*/
+            }
 
-   /* Select adda source directly (replaces 8-element addalo/addahi arrays) */
-   switch (daddasel)
-   {
-      case 0:  adda_val = dstd; break;
-      case 1:  adda_val = ((uint64_t)initpix2 << 32) | initpix2; break;
-      case 2:  /* fall through */
-      case 3:  adda_val = 0; break;
-      case 4:  adda_val = srcd; break;
-      case 5:  adda_val = patd; break;
-      case 6:  adda_val = srcz1; break;
-      default: adda_val = srcz2; break;
-   }
-   adda[0] = (uint16_t)adda_val;
-   adda[1] = (uint16_t)(adda_val >> 16);
-   adda[2] = (uint16_t)(adda_val >> 32);
-   adda[3] = (uint16_t)(adda_val >> 48);
+            if (szreadx)
+            {
+               srcz2 = srcz1;
+               srcz1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+            }
 
-   /* Select addb source (replaces wordmux array + dbsel2/iincsel logic) */
-   if (!(daddbsel & 0x04))
-   {
-      if (daddbsel & 0x01)
-      {
-         addb[0] = (uint16_t)initinc;
-         addb[1] = (uint16_t)(initinc >> 16);
-         addb[2] = (uint16_t)(initinc >> 32);
-         addb[3] = (uint16_t)(initinc >> 48);
-      }
-      else
-      {
-         addb[0] = (uint16_t)srcd;
-         addb[1] = (uint16_t)(srcd >> 16);
-         addb[2] = (uint16_t)(srcd >> 32);
-         addb[3] = (uint16_t)(srcd >> 48);
-      }
-   }
-   else
-   {
-      bsel_idx = ((daddbsel & 0x08) >> 1) | (daddbsel & 0x03);
-      switch (bsel_idx)
-      {
-         case 0: word = iinc & 0xFFFF; break;
-         case 1: word = iinc >> 16; break;
-         case 2: word = zinc & 0xFFFF; break;
-         case 3: word = zinc >> 16; break;
-         case 4: word = istep & 0xFFFF; break;
-         case 5: word = istep >> 16; break;
-         case 6: word = zstep & 0xFFFF; break;
-         default: word = zstep >> 16; break;
-      }
-      addb[0] = addb[1] = addb[2] = addb[3] = word;
-   }
+            if (sread)
+            {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               srcd2 = srcd1;
+               srcd1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+               //Kludge to take pixel size into account...
+               if (!phrase_mode)
+               {
+                  if (bcompen)
+                     srcd1 >>= 56;
+                  else
+                  {
+                     if (pixsize == 5)
+                        srcd1 >>= 32;
+                     else if (pixsize == 4)
+                        srcd1 >>= 48;
+                     else
+                        srcd1 >>= 56;
+                  }
+               }
+            }
 
-   /* Hardware: cinsel = (daddmode[0] | daddmode[1]) & ~daddmode[2]
-      Only modes 1-3 use carry input; mode 4+ do not. */
-   cinsel = ((daddmode & 0x03) && !(daddmode & 0x04) ? 1 : 0);
+            if (szread)
+            {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               srcz2 = srcz1;
+               srcz1 = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+               //Kludge to take pixel size into account... I believe that it only has to take 16BPP mode into account. Not sure tho.
+               if (!phrase_mode && pixsize == 4)
+                  srcz1 >>= 48;
 
-   for(i = 0; i < 4; i++)
-      cin[i] = initcin[i] | (co[i] & cinsel);
+            }
 
-   eightbit = daddmode & 0x02;
-   sat = daddmode & 0x03;
-   hicinh = ((daddmode & 0x03) == 0x03);
+            if (dread)
+            {
+               PERF_INC(blitter_phrase_reads);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               dstd = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+               //Kludge to take pixel size into account...
+               if (!phrase_mode)
+               {
+                  if (pixsize == 5)
+                     dstd >>= 32;
+                  else if (pixsize == 4)
+                     dstd >>= 48;
+                  else
+                     dstd >>= 56;
+               }
+            }
 
-   ADD16SAT(&addq[0], &co[0], adda[0], addb[0], cin[0], sat, eightbit, hicinh);
-   ADD16SAT(&addq[1], &co[1], adda[1], addb[1], cin[1], sat, eightbit, hicinh);
-   ADD16SAT(&addq[2], &co[2], adda[2], addb[2], cin[2], sat, eightbit, hicinh);
-   ADD16SAT(&addq[3], &co[3], adda[3], addb[3], cin[3], sat, eightbit, hicinh);
-}
+            if (dzread)
+            {
+               // Is Z always 64 bit read? Or sometimes 16 bit (dependent on phrase_mode)?
+               dstz = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
+               //Kludge to take pixel size into account... I believe that it only has to take 16BPP mode into account. Not sure tho.
+               if (!phrase_mode && pixsize == 4)
+                  dstz >>= 48;
 
+            }
 
-void ADD16SAT(uint16_t *r, uint8_t *co, uint16_t a, const uint16_t b, const uint8_t cin, const bool sat, const bool eightbit, const bool hicinh)
-{
-   uint8_t carry[4];
-   uint8_t btop, ctop;
-   bool saturate, hisaturate;
-   uint32_t qt   = (a & 0xFF) + (b & 0xFF) + cin;
-   uint16_t q    = qt & 0x00FF;
+            // These vars should probably go further up in the code... !!! FIX !!!
+            // We can't preassign these unless they're static...
+            //NOTE: SRCSHADE requires GOURZ to be set to work properly--another Jaguar I bug
+            if (dwrite)
+            {
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               //Counter is done on the dwrite state...! (We'll do it first, since it affects dstart/dend calculations.)
+               //Here's the voodoo for figuring the correct amount of pixels in phrase mode (or not):
+               int8_t inct = (PERF_INC(blitter_phrase_writes), -((dsta2 ? a2_x : a1_x) & 0x07));	// From INNER_CNT
+               uint8_t inc = 0;
+               uint16_t oldicount;
+               uint8_t dstart = 0;
 
-   carry[0]      = ((qt & 0x0100) ? 1 : 0);
-   carry[1]      = (carry[0] && !eightbit ? carry[0] : 0);
-   qt            = (a & 0x0F00) + (b & 0x0F00) + (carry[1] << 8);
-   carry[2]      = ((qt & 0x1000) ? 1 : 0);
-   q            |= qt & 0x0F00;
-   carry[3]      = (carry[2] && !hicinh ? carry[2] : 0);
-   qt            = (a & 0xF000) + (b & 0xF000) + (carry[3] << 12);
-   *co            = ((qt & 0x10000) ? 1 : 0);
-   q            |= qt & 0xF000;
+               inc = (!phrase_mode || (phrase_mode && (inct & 0x01)) ? 0x01 : 0x00);
+               inc |= (phrase_mode && (((pixsize == 3 || pixsize == 4) && (inct & 0x02)) || (pixsize == 5 && !(inct & 0x01))) ? 0x02 : 0x00);
+               inc |= (phrase_mode && ((pixsize == 3 && (inct & 0x04)) || (pixsize == 4 && !(inct & 0x03))) ? 0x04 : 0x00);
+               inc |= (phrase_mode && pixsize == 3 && !(inct & 0x07) ? 0x08 : 0x00);
 
-   if (eightbit)
-   {
-      btop  = (b & 0x0080) >> 7;
-      ctop  = carry[0];
-   }
-   else
-   {
-      btop  = (b & 0x8000) >> 15;
-      ctop  = *co;
-   }
+               oldicount = icount;	// Save icount to detect underflow...
+               icount -= inc;
 
-   saturate = sat && (btop ^ ctop);
-   hisaturate = saturate && !eightbit;
+               if (icount == 0 || ((icount & 0x8000) && !(oldicount & 0x8000)))
+                  inner0 = true;
+               // X/Y stepping is also done here, I think...No. It's done when a1_add or a2_add is asserted...
 
-   *r = (saturate ? (ctop ? 0x00FF : 0x0000) : q & 0x00FF);
-   *r |= (hisaturate ? (ctop ? 0xFF00 : 0x0000) : q & 0xFF00);
-}
+               //*********************************************************************************
+               //Start & end write mask computations...
+               //*********************************************************************************
 
-void ADDAMUX(int16_t *adda_x, int16_t *adda_y, uint8_t addasel, int16_t a1_step_x, int16_t a1_step_y,
-	int16_t a1_stepf_x, int16_t a1_stepf_y, int16_t a2_step_x, int16_t a2_step_y,
-	int16_t a1_inc_x, int16_t a1_inc_y, int16_t a1_incf_x, int16_t a1_incf_y, uint8_t adda_xconst,
-	bool adda_yconst, bool addareg, bool suba_x, bool suba_y)
-{
 
-   int16_t addar_x, addar_y, addac_x, addac_y, addas_x, addas_y;
-	int16_t xterm[4], yterm[4];
-	xterm[0] = a1_step_x, xterm[1] = a1_stepf_x, xterm[2] = a1_inc_x, xterm[3] = a1_incf_x;
-	yterm[0] = a1_step_y, yterm[1] = a1_stepf_y, yterm[2] = a1_inc_y, yterm[3] = a1_incf_y;
-   if (addasel & 0x04)
-   {
-      addar_x = a2_step_x;
-      addar_y = a2_step_y;
-   }
-   else
-   {
-      addar_x = xterm[addasel & 0x03];
-      addar_y = yterm[addasel & 0x03];
-   }
+               if (phrase_mode)
+               {
+                  if (pixsize == 3)
+                     dstart = (dstxp & 0x07) << 3;
+                  else if (pixsize == 4)
+                     dstart = (dstxp & 0x03) << 4;
+                  else if (pixsize == 5)
+                     dstart = (dstxp & 0x01) << 5;
+               }
+               else
+                  dstart    = pixAddr & 0x07;
 
-   /* Generate a constant value - this is a power of 2 in the range
-      0-64, or zero.  The control bits are adda_xconst[0..2], when they
-      are all 1  the result is 0.
-      Constants for Y can only be 0 or 1 */
+               //This is the other Jaguar I bug... Normally, should ALWAYS select a1_x here.
+               dstxwr = (dsta2 ? a2_x : a1_x) & 0x7FFE;
+               pseq = dstxwr ^ (a1_win_x & 0x7FFE);
+               pseq = (pixsize == 5 ? pseq : pseq & 0x7FFC);
+               pseq = ((pixsize & 0x06) == 4 ? pseq : pseq & 0x7FF8);
+               penden = clip_a1 && (pseq == 0);
+               window_mask = 0;
 
-	addac_x = (adda_xconst == 0x07 ? 0 : 1 << adda_xconst);
-	addac_y = (adda_yconst ? 0x01 : 0);
+               if (penden)
+               {
+                  if (pixsize == 3)
+                     window_mask = (a1_win_x & 0x07) << 3;
+                  else if (pixsize == 4)
+                     window_mask = (a1_win_x & 0x03) << 4;
+                  else if (pixsize == 5)
+                     window_mask = (a1_win_x & 0x01) << 5;
+               }
+               else
+                  window_mask    = 0;
 
-   /* Select between constant value and register value */
+               /* The mask to be used if within one phrase of the end of the inner
+                  loop, similarly */
 
-   if (addareg)
-   {
-      addas_x = (addareg ? addar_x : addac_x);
-      addas_y = (addareg ? addar_y : addac_y);
-   }
-   else
-   {
-      addas_x = (addareg ? addar_x : addac_x);
-      addas_y = (addareg ? addar_y : addac_y);
-   }
+               if (inner0)
+               {
+                  if (pixsize == 3)
+                     inner_mask = (icount & 0x07) << 3;
+                  else if (pixsize == 4)
+                     inner_mask = (icount & 0x03) << 4;
+                  else if (pixsize == 5)
+                     inner_mask = (icount & 0x01) << 5;
+               }
+               else
+                  inner_mask    = 0;
 
-   /* Complement these values (complement flag gives adder carry in)*/
+               /* The actual mask used should be the
+                  lesser of the window masks and
+                  the inner mask, where in all cases 000 means 1000. */
+               window_mask = (window_mask == 0 ? 0x40 : window_mask);
+               inner_mask  = (inner_mask == 0 ? 0x40 : inner_mask);
 
-	*adda_x = addas_x ^ (suba_x ? 0xFFFF : 0x0000);
-	*adda_y = addas_y ^ (suba_y ? 0xFFFF : 0x0000);
-}
+               emask       = (window_mask > inner_mask ? inner_mask : window_mask);
+               /* The mask to be used for the pixel size, to which must be added
+                  the bit offset */
+               pma = pixAddr + (1 << pixsize);
+               /* Select the mask */
+               dend = (phrase_mode ? emask : pma);
 
+               /* The cycle width in phrase mode is normally one phrase.  However,
+                  at the start and end it may be narrower.  The start and end masks
+                  are used to generate this.  The width is given by:
 
-/**  ADDBMUX - Address adder input B selection  *******************
+                  8 - start mask - (8 - end mask)
+                  =	end mask - start mask
 
-This module selects the register to be updated by the address
-adder.  This can be one of three registers, the A1 and A2
-pointers, or the A1 fractional part. It can also be zero, so that the step
-registers load directly into the pointers.
-*/
+                  This is only used for writes in phrase mode.
+                  Start and end from the address level of the pipeline are used.
+                  */
 
-/*DEF ADDBMUX (
-INT16/	addb_x
-INT16/	addb_y
-	:OUT;
-	addbsel[0..1]
-INT16/	a1_x
-INT16/	a1_y
-INT16/	a2_x
-INT16/	a2_y
-INT16/	a1_frac_x
-INT16/	a1_frac_y
-	:IN);
-INT16/	zero16 :LOCAL;
-BEGIN*/
-void ADDBMUX(int16_t *addb_x, int16_t *addb_y, uint8_t addbsel, int16_t a1_x, int16_t a1_y,
-	int16_t a2_x, int16_t a2_y, int16_t a1_frac_x, int16_t a1_frac_y)
-{
+               //Phrase mode needs destination data for start/end mask byte merging,
+               //but NOT when bkgwren is set (hardware uses DSTDATA register value).
+               if (phrase_mode && !dsten && !bkgwren)
+                  dstd = ((uint64_t)JaguarReadLong(address, BLITTER) << 32) | (uint64_t)JaguarReadLong(address + 4, BLITTER);
 
-/*Zero		:= TIE0 (zero);
-Zero16		:= JOIN (zero16, zero, zero, zero, zero, zero, zero, zero,
-			zero, zero, zero, zero, zero, zero, zero, zero, zero);
-Addbselb[0-1]	:= BUF8 (addbselb[0-1], addbsel[0-1]);
-Addb_x		:= MX4 (addb_x, a1_x, a2_x, a1_frac_x, zero16, addbselb[0..1]);
-Addb_y		:= MX4 (addb_y, a1_y, a2_y, a1_frac_y, zero16, addbselb[0..1]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	int16_t xterm[4], yterm[4];
-	xterm[0] = a1_x, xterm[1] = a2_x, xterm[2] = a1_frac_x, xterm[3] = 0;
-	yterm[0] = a1_y, yterm[1] = a2_y, yterm[2] = a1_frac_y, yterm[3] = 0;
-	*addb_x = xterm[addbsel & 0x03];
-	*addb_y = yterm[addbsel & 0x03];
-//////////////////////////////////////////////////////////////////////////////////////
+               // Write data combines srcd and dstd through ADDDSEL, PATDSEL, or LFU.
+               // Precedence is ADDDSEL > PATDSEL > LFU.
 
-//END;
-}
+               // srcd2 = xxxx xxxx 0123 4567, srcd = 8901 2345 xxxx xxxx, srcshift = $20 (32)
+               srcd = (srcd2 << (64 - srcshift)) | (srcd1 >> srcshift);
+               //bleh, ugly ugly ugly
+               if (srcshift == 0)
+                  srcd = srcd1;
 
+               //NOTE: This only works with pixel sizes less than 8BPP...
+               //DOUBLE NOTE: Still need to do regression testing to ensure that this doesn't break other stuff... !!! CHECK !!!
+               if (!phrase_mode && srcshift != 0)
+                  srcd = ((srcd2 & 0xFF) << (8 - srcshift)) | ((srcd1 & 0xFF) >> srcshift);
 
-/**  DATAMUX - Address local data bus selection  ******************
+               //Z DATA() stuff done here... And it has to be done before any Z shifting...
+               //Note that we need to have phrase mode start/end support here... (Not since we moved it from dzwrite...!)
+               /*
+                  Here are a couple of Cybermorph blits with Z:
+                  $00113078	// DSTEN DSTENZ DSTWRZ CLIP_A1 GOURD GOURZ PATDSEL ZMODE=4
+                  $09900F39	// SRCEN DSTEN DSTENZ DSTWRZ UPDA1 UPDA1F UPDA2 DSTA2 ZMODE=4 LFUFUNC=C DCOMPEN
 
-Select between the adder output and the input data bus
-*/
+                  We're having the same phrase mode overwrite problem we had with the pixels... !!! FIX !!!
+                  Odd. It's equating 0 with 0... Even though ZMODE is $04 (less than)!
+                  */
+               if (gourz)
+               {
+                  uint16_t addq[4];
+                  uint8_t initcin[4] = { 0, 0, 0, 0 };
+                  ADDARRAY(addq, 7/*daddasel*/, 6/*daddbsel*/, 0/*daddmode*/, 0, 0, initcin, 0, 0, 0, 0, 0, srcz1, srcz2, zinc, 0);
+                  srcz2 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+                  ADDARRAY(addq, 6/*daddasel*/, 7/*daddbsel*/, 1/*daddmode*/, 0, 0, initcin, 0, 0, 0, 0, 0, srcz1, srcz2, zinc, 0);
+                  srcz1 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
 
-/*DEF DATAMUX (
-INT16/	data_x
-INT16/	data_y
-	:OUT;
-INT32/	gpu_din
-INT16/	addq_x
-INT16/	addq_y
-	addqsel
-	:IN);
+               }
 
-INT16/	gpu_lo, gpu_hi
-:LOCAL;
-BEGIN*/
-void DATAMUX(int16_t *data_x, int16_t *data_y, uint32_t gpu_din, int16_t addq_x, int16_t addq_y, bool addqsel)
-{
-   if (addqsel)
-   {
-      *data_x = addq_x;
-      *data_y = addq_y;
-   }
-   else
-   {
-      *data_x = (int16_t)(gpu_din & 0xFFFF);
-      *data_y = (int16_t)(gpu_din >> 16);
-   }
-}
+               zSrcShift = srcshift & 0x30;
+               srcz = (srcz2 << (64 - zSrcShift)) | (srcz1 >> zSrcShift);
+               //bleh, ugly ugly ugly
+               if (zSrcShift == 0)
+                  srcz = srcz1;
 
 
-/******************************************************************
-addradd
-29/11/90
+               //When in SRCSHADE mode, it adds the IINC to the read source (from LFU???)
+               //According to following line, it gets LFU mode. But does it feed the source into the LFU
+               //after the add?
+               //Dest write address/pix address: 0014E83E/0 [dstart=0 dend=10 pwidth=8 srcshift=0][daas=4 dabs=5 dam=7 ds=1 daq=F] [0000000000006505] (icount=003F, inc=1)
+               //Let's try this:
+               if (srcshade)
+               {
+                  uint16_t addq[4];
+                  uint8_t initcin[4] = { 0, 0, 0, 0 };
+                  uint32_t iinc_masked = iinc & 0x00FFFFFF;
+                  ADDARRAY(addq, 4/*daddasel*/, 5/*daddbsel*/, 7/*daddmode*/, dstd, iinc_masked, initcin, 0, 0, 0, patd, srcd, 0, 0, 0, 0);
+                  srcd = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+               }
 
-Blitter Address Adder
----------------------
-The blitter address adder is a pair of sixteen bit adders, one
-each for X and Y.  The multiplexing of the input terms is
-performed elsewhere, but this adder can also perform modulo
-arithmetic to align X-addresses onto phrase boundaries.
+               /* DCONTROL: compute data adder signals.  Moved here from
+                  the per-iteration scope since they are only consumed
+                  during dwrite (dwrite=true, dzwrite=false here). */
+               shadeadd = srcshade;
+               daddasel = (gourd ? 0x01 : 0x00);
+               daddasel |= ((gourd || gourz || srcshade) ? 0x04 : 0x00);
+               daddbsel = (gourd || srcshade ? 0x01 : 0x00);
+               daddbsel |= (gourd || srcshade ? 0x04 : 0x00);
+               /* daddmode bit 0: NAND tree (dcontrol.v:130-146) makes
+                  bit 0 always 1 when dwrite&&gourd, !gourd&&!gourz,
+                  or shadeadd. */
+               daddmode = (gourd || (!gourd && !gourz) || shadeadd ? 0x01 : 0x00);
+               daddmode |= ((gourd && !topben && !ext_int)
+                     || (!gourd && !gourz && !topben) || (shadeadd && !topben) ? 0x02 : 0x00);
+               daddmode |= ((!gourd && !gourz) || shadeadd || (gourd && ext_int) ? 0x04 : 0x00);
+               patfadd = gourd;
+               patdadd = gourd;
+               srcz2add = false;
+               daddq_sel = gourd;
+               data_sel = ((!patdsel && !adddsel) ? 0x01 : 0x00)
+                  | (adddsel ? 0x02 : 0x00);
 
-modx[0..2] take values
-000	no mask
-001	mask bit 0
-010	mask bits 1-0
-..
-110  	mask bits 5-0
+               if (patfadd)
+               {
+                  uint16_t addq[4];
+                  uint8_t initcin[4] = { 0, 0, 0, 0 };
+                  ADDARRAY(addq, 4/*daddasel*/, 4/*daddbsel*/, 0/*daddmode*/, dstd, iinc, initcin, 0, 0, 0, patd, srcd, 0, 0, 0, 0);
+                  srcd1 = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
+               }
 
-******************************************************************/
+               /* atick[0]/[1] two-phase pipeline: fractional intensity/Z update
+                  runs in the patfadd/srcz2add block above (Phase 0), integer
+                  update runs via DATA→patdadd below (Phase 1).  The dbinh
+                  param below is overwritten inside DATA by COMP_CTRL. */
 
-void ADDRADD(int16_t *addq_x, int16_t *addq_y, bool a1fracldi,
-	uint16_t adda_x, uint16_t adda_y, uint16_t addb_x, uint16_t addb_y, uint8_t modx, bool suba_x, bool suba_y)
-{
+               DATA(&wdata, &dcomp, &zcomp, &winhibit,
+                     true, cmpdst, daddasel, daddbsel, daddmode, daddq_sel, data_sel, 0/*dbinh*/,
+                     dend, dstart, dstd, iinc, lfufunc, &patd, patdadd,
+                     phrase_mode, srcd, false/*srcdread*/, false/*srczread*/, srcz2add, zmode,
+                     bcompen, bkgwren, dcompen, icount & 0x07, pixsize,
+                     &srcz, dstz, zinc);
 
-/* Perform the addition */
+               /*
+                  DEF ADDRCOMP (
+                  a1_outside	// A1 pointer is outside window bounds
+                  :OUT;
+                  INT16/	a1_x
+                  INT16/	a1_y
+                  INT15/	a1_win_x
+                  INT15/	a1_win_y
+                  :IN);
+                  BEGIN
 
-/*Adder_x		:= ADD16 (addqt_x[0..15], co_x, adda_x{0..15}, addb_x{0..15}, ci_x);
-Adder_y		:= ADD16 (addq_y[0..15], co_y, adda_y{0..15}, addb_y{0..15}, ci_y);*/
+               // The address is outside if negative, or if greater than or equal
+               // to the window size
 
-/* latch carry and propagate if required */
+A1_xcomp	:= MAG_15 (a1xgr, a1xeq, a1xlt, a1_x{0..14}, a1_win_x{0..14});
+A1_ycomp	:= MAG_15 (a1ygr, a1yeq, a1ylt, a1_y{0..14}, a1_win_y{0..14});
+A1_outside	:= OR6 (a1_outside, a1_x{15}, a1xgr, a1xeq, a1_y{15}, a1ygr, a1yeq);
+*/
+               //NOTE: There seems to be an off-by-one bug here in the clip_a1 section... !!! FIX !!!
+               //      Actually, seems to be related to phrase mode writes...
+               //      Or is it? Could be related to non-15-bit compares as above?
+               if (clip_a1 && ((a1_x & 0x8000) || (a1_y & 0x8000) || (a1_x >= a1_win_x) || (a1_y >= a1_win_y)))
+                  winhibit = true;
 
-/*Cxt0		:= AN2 (cxt[0], co_x, a1fracldi);
-Cxt1		:= FD1Q (cxt[1], cxt[0], clk[0]);
-Ci_x		:= EO (ci_x, cxt[1], suba_x);
 
-yt0			:= AN2 (cyt[0], co_y, a1fracldi);
-Cyt1		:= FD1Q (cyt[1], cyt[0], clk[0]);
-Ci_y		:= EO (ci_y, cyt[1], suba_y);*/
+               if (!winhibit || bkgwren)
+               {
+                  if (phrase_mode)
+                  {
+                     JaguarWriteLong(address + 0, wdata >> 32, BLITTER);
+                     JaguarWriteLong(address + 4, wdata & 0xFFFFFFFF, BLITTER);
+                  }
+                  else
+                  {
+                     if (pixsize == 5)
+                        JaguarWriteLong(address, wdata & 0xFFFFFFFF, BLITTER);
+                     else if (pixsize == 4)
+                        JaguarWriteWord(address, wdata & 0x0000FFFF, BLITTER);
+                     else
+                        JaguarWriteByte(address, wdata & 0x000000FF, BLITTER);
+                  }
+               }
 
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-//I'm sure the following will generate a bunch of warnings, but will have to do for now.
-	static uint16_t co_x = 0, co_y = 0;	// Carry out has to propogate between function calls...
-	uint16_t ci_x = co_x ^ (suba_x ? 1 : 0);
-	uint16_t ci_y = co_y ^ (suba_y ? 1 : 0);
-	uint32_t addqt_x = adda_x + addb_x + ci_x;
-	uint32_t addqt_y = adda_y + addb_y + ci_y;
-	uint16_t mask[8] = { 0xFFFF, 0xFFFE, 0xFFFC, 0xFFF8, 0xFFF0, 0xFFE0, 0xFFC0, 0x0000 };
-	co_x = ((addqt_x & 0x10000) && a1fracldi ? 1 : 0);
-	co_y = ((addqt_y & 0x10000) && a1fracldi ? 1 : 0);
-//////////////////////////////////////////////////////////////////////////////////////
+            }
 
-/* Mask low bits of X to 0 if required */
+            if (dzwrite)
+            {
+               PERF_INC(blitter_phrase_writes);
+#ifdef BENCH_PROFILE
+               blitter_did_io = 1;
+#endif
+               // OK, here's the big insight: When NOT in GOURZ mode, srcz1 & 2 function EXACTLY the same way that
+               // srcd1 & 2 work--there's an implicit shift from srcz1 to srcz2 whenever srcz1 is read.
+               // OTHERWISE, srcz1 is the integer for the computed Z and srcz2 is the fractional part.
+               // Writes to srcz1 & 2 follow the same pattern as the other 64-bit registers--low 32 at the low address,
+               // high 32 at the high address (little endian!).
+               // NOTE: GOURZ is still not properly supported. Check patd/patf handling...
+               //       Phrase mode start/end masks are not properly supported either...
+               //This is not correct... !!! FIX !!!
+               //Should be OK now... We'll see...
+               //Nope. Having the same starstep write problems in phrase mode as we had with pixels... !!! FIX !!!
+               //This is not causing the problem in Hover Strike... :-/
+               //The problem was with the SREADX not shifting. Still problems with Z comparisons & other text in pregame screen...
+               if (!winhibit)
+               {
+                  if (phrase_mode)
+                  {
+                     JaguarWriteLong(address + 0, srcz >> 32, BLITTER);
+                     JaguarWriteLong(address + 4, srcz & 0xFFFFFFFF, BLITTER);
+                  }
+                  else
+                  {
+                     if (pixsize == 4)
+                        JaguarWriteWord(address, srcz & 0x0000FFFF, BLITTER);
+                  }
+               }//*/
+            }
 
-/*Masksel		:= D38H (unused[0], masksel[0..4], maskbit[5], unused[1], modx[0..2]);
 
-Maskbit[0-4]	:= OR2 (maskbit[0-4], masksel[0-4], maskbit[1-5]);
+            if (a1_add)
+            {
+               int16_t adda_x, adda_y, addb_x, addb_y, addq_x, addq_y;
+               ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
+                     a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
+               ADDBMUX(&addb_x, &addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
+               ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
 
-Mask[0-5]	:= MX2 (addq_x[0-5], addqt_x[0-5], zero, maskbit[0-5]);
+               //Now, write to what???
+               //a2ptrld comes from a2ptrldi...
+               //I believe it's addbsel that determines the writeback...
+               // This is where atick[0] & [1] come in, in determining which part (fractional, integer)
+               // gets written to...
+               //a1_x = addq_x;
+               //a1_y = addq_y;
+               //Kludge, to get A1 channel increment working...
+               if (a1addx == 3)
+               {
+                  a1_frac_x = addq_x, a1_frac_y = addq_y;
 
-Addq_x		:= JOIN (addq_x, addq_x[0..5], addqt_x[6..15]);
-Addq_y		:= JOIN (addq_y, addq_y[0..15]);*/
+                  addasel = 2, addbsel = 0, a1fracldi = false;
+                  ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
+                        a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
+                  ADDBMUX(&addb_x,&addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
+                  ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
 
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	*addq_x = addqt_x & mask[modx];
-	*addq_y = addqt_y & 0xFFFF;
-//////////////////////////////////////////////////////////////////////////////////////
+                  a1_x = addq_x, a1_y = addq_y;
+               }
+               else
+                  a1_x = addq_x, a1_y = addq_y;
+            }
 
-//Unused[0-1]	:= DUMMY (unused[0-1]);
+            if (a2_add)
+            {
+               int16_t adda_x, adda_y, addb_x, addb_y, addq_x, addq_y;
+               ADDAMUX(&adda_x, &adda_y, addasel, a1_step_x, a1_step_y, a1_stepf_x, a1_stepf_y, a2_step_x, a2_step_y,
+                     a1_inc_x, a1_inc_y, a1_incf_x, a1_incf_y, adda_xconst, adda_yconst, addareg, suba_x, suba_y);
+               ADDBMUX(&addb_x, &addb_y, addbsel, a1_x, a1_y, a2_x, a2_y, a1_frac_x, a1_frac_y);
+               ADDRADD(&addq_x, &addq_y, a1fracldi, adda_x, adda_y, addb_x, addb_y, modx, suba_x, suba_y);
 
-//END;
-}
+               //Now, write to what???
+               //a2ptrld comes from a2ptrldi...
+               //I believe it's addbsel that determines the writeback...
+               a2_x = addq_x;
+               a2_y = addq_y;
+            }
+#ifdef BENCH_PROFILE
+            if (blitter_did_io) PERF_INC(blitter_inner_io);
+            else                PERF_INC(blitter_inner_idle);
+#endif
+         }
 
+         indone = true;
+         // The outer counter is updated here as well on the clock cycle...
 
-/*
-DEF DATA (
-		wdata[0..63]	// co-processor write data bus
-		:BUS;
-		dcomp[0..7]		// data byte equal flags
-		srcd[0..7]		// bits to use for bit to byte expansion
-		zcomp[0..3]		// output from Z comparators
-		:OUT;
-		a1_x[0..1]		// low two bits of A1 X pointer
-		big_pix			// pixel organisation is big-endian
-		blitter_active	// blitter is active
-		clk				// co-processor clock
-		cmpdst			// compare dest rather than source
-		colorld			// load the pattern color fields
-		daddasel[0..2]	// data adder input A selection
-		daddbsel[0..3]	// data adder input B selection
-		daddmode[0..2]	// data adder mode
-		daddq_sel		// select adder output vs. GPU data
-		data[0..63]		// co-processor read data bus
-		data_ena		// enable write data
-		data_sel[0..1]	// select data to write
-		dbinh\[0..7]	// byte oriented changed data inhibits
-		dend[0..5]		// end of changed write data zone
-		dpipe[0..1]		// load computed data pipe-line latch
-		dstart[0..5]	// start of changed write data zone
-		dstdld[0..1]	// dest data load (two halves)
-		dstzld[0..1]	// dest zed load (two halves)
-		ext_int			// enable extended precision intensity calculations
-INT32/	gpu_din			// GPU data bus
-		iincld			// I increment load
-		iincldx			// alternate I increment load
-		init_if			// initialise I fraction phase
-		init_ii			// initialise I integer phase
-		init_zf			// initialise Z fraction phase
-		intld[0..3]		// computed intensities load
-		istepadd		// intensity step integer add
-		istepfadd		// intensity step fraction add
-		istepld			// I step load
-		istepdld		// I step delta load
-		lfu_func[0..3]	// LFU function code
-		patdadd			// pattern data gouraud add
-		patdld[0..1]	// pattern data load (two halves)
-		pdsel[0..1]		// select pattern data type
-		phrase_mode		// phrase write mode
-		reload			// transfer contents of double buffers
-		reset\			// system reset
-		srcd1ld[0..1]	// source register 1 load (two halves)
-		srcdread		// source data read load enable
-		srczread		// source zed read load enable
-		srcshift[0..5]	// source alignment shift
-		srcz1ld[0..1]	// source zed 1 load (two halves)
-		srcz2add		// zed fraction gouraud add
-		srcz2ld[0..1]	// source zed 2 load (two halves)
-		textrgb			// texture mapping in RGB mode
-		txtd[0..63]		// data from the texture unit
-		zedld[0..3]		// computed zeds load
-		zincld			// Z increment load
-		zmode[0..2]		// Z comparator mode
-		zpipe[0..1]		// load computed zed pipe-line latch
-		zstepadd		// zed step integer add
-		zstepfadd		// zed step fraction add
-		zstepld			// Z step load
-		zstepdld		// Z step delta load
-		:IN);
-*/
+         /* the inner loop is started whenever another state is about to
+            cause the inner state to go active */
+         //Instart		:= ND7 (instart, innert[0], innert[2..7]);
 
-void DATA(uint64_t *wdata, uint8_t *dcomp, uint8_t *zcomp, bool *nowrite,
-	bool big_pix, bool cmpdst, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode, bool daddq_sel, uint8_t data_sel,
-	uint8_t dbinh, uint8_t dend, uint8_t dstart, uint64_t dstd, uint32_t iinc, uint8_t lfu_func, uint64_t *patd, bool patdadd,
-	bool phrase_mode, uint64_t srcd, bool srcdread, bool srczread, bool srcz2add, uint8_t zmode,
-	bool bcompen, bool bkgwren, bool dcompen, uint8_t icount, uint8_t pixsize,
-	uint64_t *srcz, uint64_t dstz, uint32_t zinc)
-{
-/*
-  Stuff we absolutely *need* to have passed in/out:
-IN:
-  patdadd, dstd, srcd, patd, daddasel, daddbsel, daddmode, iinc, srcz1, srcz2, big_pix, phrase_mode, cmpdst
-OUT:
-  changed patd (wdata I guess...) (Nope. We pass it back directly now...)
-*/
+         //Actually, it's done only when inner gets asserted without the 2nd line of conditions
+         //(inner AND !indone)
+         //fixed now...
+         //Since we don't get here until the inner loop is finished (indone = true) we can get
+         //away with doing it here...!
+         ocount--;
 
-// Source data registers
+         if (ocount == 0)
+            outer0 = true;
+      }
 
-/*Data_src	:= DATA_SRC (srcdlo, srcdhi, srcz[0..1], srczo[0..1], srczp[0..1], srcz1[0..1], srcz2[0..1], big_pix,
-			clk, gpu_din, intld[0..3], local_data0, local_data1, srcd1ld[0..1], srcdread, srczread, srcshift[0..5],
-			srcz1ld[0..1], srcz2add, srcz2ld[0..1], zedld[0..3], zpipe[0..1]);
-Srcd[0-7]	:= JOIN (srcd[0-7], srcdlo{0-7});
-Srcd[8-31]	:= JOIN (srcd[8-31], srcdlo{8-31});
-Srcd[32-63]	:= JOIN (srcd[32-63], srcdhi{0-31});*/
+      if (a1fupdate)
+      {
+         uint32_t a1_frac_xt = (uint32_t)a1_frac_x + (uint32_t)a1_stepf_x;
+         uint32_t a1_frac_yt = (uint32_t)a1_frac_y + (uint32_t)a1_stepf_y;
+         a1FracCInX = a1_frac_xt >> 16;
+         a1FracCInY = a1_frac_yt >> 16;
+         a1_frac_x = (uint16_t)(a1_frac_xt & 0xFFFF);
+         a1_frac_y = (uint16_t)(a1_frac_yt & 0xFFFF);
+      }
 
-// Destination data registers
+      if (a1update)
+      {
+         a1_x += a1_step_x + a1FracCInX;
+         a1_y += a1_step_y + a1FracCInY;
+      }
 
-/*Data_dst	:= DATA_DST (dstd[0..63], dstz[0..1], clk, dstdld[0..1], dstzld[0..1], load_data[0..1]);
-Dstdlo		:= JOIN (dstdlo, dstd[0..31]);
-Dstdhi		:= JOIN (dstdhi, dstd[32..63]);*/
+      if (a2update)
+      {
+         a2_x += a2_step_x;
+         a2_y += a2_step_y;
+      }
+   }
 
-// Pattern and Color data registers
+   // Write values back to registers (in real blitter, these are continuously updated)
+   SET16(blitter_ram, A1_PIXEL + 2, a1_x);
+   SET16(blitter_ram, A1_PIXEL + 0, a1_y);
+   SET16(blitter_ram, A1_FPIXEL + 2, a1_frac_x);
+   SET16(blitter_ram, A1_FPIXEL + 0, a1_frac_y);
+   SET16(blitter_ram, A2_PIXEL + 2, a2_x);
+   SET16(blitter_ram, A2_PIXEL + 0, a2_y);
 
-// Looks like this is simply another register file for the pattern data registers. No adding or anything funky
-// going on. Note that patd & patdv will output the same info.
-// Patdldl/h (patdld[0..1]) can select the local_data bus to overwrite the current pattern data...
-// Actually, it can be either patdld OR patdadd...!
-/*Data_pat	:= DATA_PAT (colord[0..15], int0dp[8..10], int1dp[8..10], int2dp[8..10], int3dp[8..10], mixsel[0..2],
-			patd[0..63], patdv[0..1], clk, colorld, dpipe[0], ext_int, gpu_din, intld[0..3], local_data0, local_data1,
-			patdadd, patdld[0..1], reload, reset\);
-Patdlo		:= JOIN (patdlo, patd[0..31]);
-Patdhi		:= JOIN (patdhi, patd[32..63]);*/
+#ifdef BLITTER_TRACE
+   {
+      static mach_timebase_info_data_t tb;
+      uint64_t t1 = mach_absolute_time();
+      double ms;
+      if (tb.denom == 0) mach_timebase_info(&tb);
+      ms = (double)(t1 - bm2_trace_t0) * (double)tb.numer / (double)tb.denom / 1e6;
+      if (ms >= bm2_trace_threshold_ms) {
+         uint16_t pcount = GET16(blitter_ram, PIXLINECOUNTER + 2);
+         uint16_t lcount = GET16(blitter_ram, PIXLINECOUNTER);
+         uint8_t pixsize = (blitter_ram[A1_FLAGS + 3] & 0x38) >> 3;
+         fprintf(stderr,
+            "[BLITTER_TRACE] %.2f ms cmd=%08x pixsize=%u inner=%u outer=%u "
+            "src(en=%d enx=%d enz=%d) dst(en=%d enz=%d wrz=%d) "
+            "gourd=%d gourz=%d srcshade=%d bcompen=%d dcompen=%d\n",
+            ms, cmd, pixsize, pcount, lcount,
+            (int)srcen, (int)srcenx, (int)srcenz,
+            (int)dsten, (int)dstenz, (int)dstwrz,
+            (int)gourd, (int)gourz, (int)srcshade,
+            (int)bcompen, (int)dcompen);
+      }
+   }
+#endif
+}
 
-// Multiplying data Mixer (NOT IN JAGUAR I)
+// Various pieces of the blitter puzzle are teased out here...
 
-/*Datamix		:= DATAMIX (patdo[0..1], clk, colord[0..15], dpipe[1], dstd[0..63], int0dp[8..10], int1dp[8..10],
-			int2dp[8..10], int3dp[8..10], mixsel[0..2], patd[0..63], pdsel[0..1], srcd[0..63], textrgb, txtd[0..63]);*/
+void ADDRGEN(uint32_t *address, uint32_t *pixa, bool gena2, bool zaddr,
+	uint16_t a1_x, uint16_t a1_y, uint32_t a1_base, uint8_t a1_pitch, uint8_t a1_pixsize, uint8_t a1_width, uint8_t a1_zoffset,
+	uint16_t a2_x, uint16_t a2_y, uint32_t a2_base, uint8_t a2_pitch, uint8_t a2_pixsize, uint8_t a2_width, uint8_t a2_zoffset)
+{	// Builds *address and *pixa from one parameter set: the a2_* values when gena2 is set, otherwise the a1_* values.
+	uint16_t x = (gena2 ? a2_x : a1_x) & 0xFFFF;	// Actually uses all 16 bits to generate address...!
+	uint16_t y = (gena2 ? a2_y : a1_y) & 0x0FFF;	// Only the low 12 bits of Y take part
+	uint8_t width = (gena2 ? a2_width : a1_width);
+	uint8_t pixsize = (gena2 ? a2_pixsize : a1_pixsize);
+	uint8_t pitch = (gena2 ? a2_pitch : a1_pitch);
+	uint32_t base = (gena2 ? a2_base : a1_base) >> 3;//Only upper 21 bits are passed around the bus? Seems like it...
+	uint8_t zoffset = (gena2 ? a2_zoffset : a1_zoffset);
 
-// Logic function unit
+	uint32_t ytm = ((uint32_t)y << 2) + ((width & 0x02) ? (uint32_t)y << 1 : 0) + ((width & 0x01) ? (uint32_t)y : 0);	// y * (4 + (width & 3))
 
-/*Lfu		:= LFU (lfu[0..1], srcdlo, srcdhi, dstdlo, dstdhi, lfu_func[0..3]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	uint64_t lfu = blitter_simd_ops.lfu(srcd, dstd, lfu_func);
-   bool mir_bit, mir_byte;
-   uint16_t masku;
-   uint8_t e_coarse, e_fine;
-   uint8_t s_coarse, s_fine;
-   uint16_t maskt;
-	uint8_t decl38e[2][8] = { { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
-		{ 0xFE, 0xFD, 0xFB, 0xF7, 0xEF, 0xDF, 0xBF, 0x7F } };
-	uint8_t dech38[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 };
-	uint8_t dech38el[2][8] = { { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 },
-		{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } };
-   int en;
-	uint8_t dbinht;
-   uint16_t addq[4];
-   uint8_t initcin[4] = { 0, 0, 0, 0 };
-   uint16_t mask;
-   uint64_t dmux[4];
-   uint64_t ddat;
-//////////////////////////////////////////////////////////////////////////////////////
+	uint32_t ya = (ytm << (width >> 2)) >> 2;	// Scale by 2^(width >> 2), then shift out the factor of 4 carried in ytm
 
-// Increment and Step Registers
+	uint32_t pa = ya + x;	// Y-derived offset plus X
+   uint8_t pt, za;
+   uint32_t phradr, shup, addr;
 
-// Does it do anything without the step add lines? Check it!
-// No. This is pretty much just a register file without the Jaguar II lines...
-/*Inc_step	:= INC_STEP (iinc, istep[0..31], zinc, zstep[0..31], clk, ext_int, gpu_din, iincld, iincldx, istepadd,
-			istepfadd, istepld, istepdld, reload, reset\, zincld, zstepadd, zstepfadd, zstepld, zstepdld);
-Istep		:= JOIN (istep, istep[0..31]);
-Zstep		:= JOIN (zstep, zstep[0..31]);*/
+	*pixa = pa << pixsize;	// Scale by 2^pixsize (a bit offset if pixsize is log2 of bits per pixel -- TODO confirm)
 
-// Pixel data comparator
+	pt = ((pitch & 0x01) && !(pitch & 0x02) ? 0x01 : 0x00)
+		| (!(pitch & 0x01) && (pitch & 0x02) ? 0x02 : 0x00);	// pt = 1 for pitch bits 01, 2 for bits 10, otherwise 0
+	phradr = (*pixa >> 6) << pt;	// Index in units of 64 (*pixa >> 6), shifted up by pt
+	shup = (pitch == 0x03 ? (*pixa >> 6) : 0);	// pitch 3 only: added below as shup << 1, giving 3x the index since pt is 0
 
-/*Datacomp	:= DATACOMP (dcomp[0..7], cmpdst, dstdlo, dstdhi, patdlo, patdhi, srcdlo, srcdhi);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	*dcomp = blitter_simd_ops.dcomp(*patd, srcd, dstd, cmpdst);
-//////////////////////////////////////////////////////////////////////////////////////
+	za = (zaddr ? zoffset : 0) & 0x03;	// Low 2 bits of zoffset, applied only when zaddr is set
+	addr = za + phradr + (shup << 1) + base;	// Summed in the same units as base (already >> 3 above)
+	*address = ((*pixa & 0x38) >> 3) | ((addr & 0x1FFFFF) << 3);	// Low 21 bits of addr shifted back up by 3; bits 3-5 of *pixa fill the low 3 bits
+	*pixa &= 0x07;	// Only the low 3 bits of the offset are returned to the caller
+}
 
-// Zed comparator for Z-buffer operations
+////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////
+// Here's an important bit: The source data adder logic. Need to track down the inputs!!! //
+////////////////////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////////////////
 
-/*Zedcomp		:= ZEDCOMP (zcomp[0..3], srczp[0..1], dstz[0..1], zmode[0..2]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-//srczp is srcz pipelined, also it goes through a source shift as well...
-/*The shift is basically like so (each piece is 16 bits long):
 
-	0         1         2         3         4          5         6
-	srcz1lolo srcz1lohi srcz1hilo srcz1hihi srcrz2lolo srcz2lohi srcz2hilo
+void ADDAMUX(int16_t *adda_x, int16_t *adda_y, uint8_t addasel, int16_t a1_step_x, int16_t a1_step_y,
+	int16_t a1_stepf_x, int16_t a1_stepf_y, int16_t a2_step_x, int16_t a2_step_y,
+	int16_t a1_inc_x, int16_t a1_inc_y, int16_t a1_incf_x, int16_t a1_incf_y, uint8_t adda_xconst,
+	bool adda_yconst, bool addareg, bool suba_x, bool suba_y)
+{	/* Address adder input A: picks a register or constant operand, optionally complements it, and stores it in *adda_x / *adda_y. */
 
-with srcshift bits 4 & 5 selecting the start position
-*/
-//So... basically what we have here is:
-	*zcomp = blitter_simd_ops.zcomp(*srcz, dstz, zmode);
+   int16_t addar_x, addar_y, addac_x, addac_y, addas_x, addas_y;	/* r = register, c = constant, s = selected */
+	int16_t xterm[4], yterm[4];	/* A1 candidates, indexed by addasel bits 0-1 */
+	xterm[0] = a1_step_x, xterm[1] = a1_stepf_x, xterm[2] = a1_inc_x, xterm[3] = a1_incf_x;
+	yterm[0] = a1_step_y, yterm[1] = a1_stepf_y, yterm[2] = a1_inc_y, yterm[3] = a1_incf_y;
+   if (addasel & 0x04)	/* bit 2 set: A2 step, bits 0-1 are ignored */
+   {
+      addar_x = a2_step_x;
+      addar_y = a2_step_y;
+   }
+   else
+   {
+      addar_x = xterm[addasel & 0x03];
+      addar_y = yterm[addasel & 0x03];
+   }
 
-//TEMP, TO TEST IF ZCOMP IS THE CULPRIT...
-//Nope, this is NOT the problem...
-//zcomp=0;
-// We'll do the comparison/bit/byte inhibits here, since that's they way it happens
-// in the real thing (dcomp goes out to COMP_CTRL and back into DATA through dbinh)...
-	{
-	uint8_t bcomp_bits;
-	if (bcompen && phrase_mode)
-	{
-		bcomp_bits = (srcd >> 56) & 0xFF;
-	}
-	else
-		bcomp_bits = srcd & 0xFF;
+   /* Generate a constant value - this is a power of 2 in the range
+      1-64, or zero.  The control bits are adda_xconst[0..2]; when they
+      are all 1 the result is 0.
+      Constants for Y can only be 0 or 1 */
 
-	COMP_CTRL(&dbinht, nowrite,
-		bcompen, true/*big_pix*/, bkgwren, *dcomp, dcompen, icount, pixsize, phrase_mode, bcomp_bits, *zcomp);
-	}
-	dbinh = dbinht;
+	addac_x = (adda_xconst == 0x07 ? 0 : 1 << adda_xconst);
+	addac_y = (adda_yconst ? 0x01 : 0);
 
-//////////////////////////////////////////////////////////////////////////////////////
+   /* Select between constant value and register value (addareg set = register) */
 
-// 22 Mar 94
-// The data initializer - allows all four initial values to be computed from one (NOT IN JAGUAR I)
+   if (addareg)
+   {
+      addas_x = addar_x;	/* register operand chosen by addasel above */
+      addas_y = addar_y;
+   }
+   else
+   {
+      addas_x = addac_x;	/* generated constant */
+      addas_y = addac_y;
+   }
 
-/*Datinit		:= DATINIT (initcin[0..3], initinc[0..63], initpix[0..15], a1_x[0..1], big_pix, clk, iinc, init_if, init_ii,
-			init_zf, istep[0..31], zinc, zstep[0..31]);*/
+   /* Complement these values (complement flag gives adder carry in)*/
 
-// Adder array for Z and intensity increments
+	*adda_x = addas_x ^ (suba_x ? 0xFFFF : 0x0000);
+	*adda_y = addas_y ^ (suba_y ? 0xFFFF : 0x0000);
+}
 
-/*Addarray	:= ADDARRAY (addq[0..3], clk, daddasel[0..2], daddbsel[0..3], daddmode[0..2], dstdlo, dstdhi, iinc,
-			initcin[0..3], initinc[0..63], initpix[0..15], istep, patdv[0..1], srcdlo, srcdhi, srcz1[0..1],
-			srcz2[0..1], reset\, zinc, zstep);*/
-/*void ADDARRAY(uint16_t * addq, uint8_t daddasel, uint8_t daddbsel, uint8_t daddmode,
-	uint64_t dstd, uint32_t iinc, uint8_t initcin[], uint64_t initinc, uint16_t initpix,
-	uint32_t istep, uint64_t patd, uint64_t srcd, uint64_t srcz1, uint64_t srcz2,
-	uint32_t zinc, uint32_t zstep)*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	{
-	uint64_t patd_pre = *patd;
-	ADDARRAY(addq, daddasel, daddbsel, daddmode, dstd, iinc, initcin, 0, 0, 0, *patd, srcd, 0, 0, 0, 0);
 
-	if (patdadd)
-		*patd = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
-//////////////////////////////////////////////////////////////////////////////////////
+/**  ADDBMUX - Address adder input B selection  *******************
 
-// Local data bus multiplexer
-// In hardware, the write data mux reads patd BEFORE the register update.
-// patd_pre captures the pre-increment value for the data output mux.
+This module selects the register to be updated by the address
+adder.  This can be one of three registers, the A1 and A2
+pointers, or the A1 fractional part. It can also be zero, so that the step
+registers load directly into the pointers.
+*/
 
-/*Local_mux	:= LOCAL_MUX (local_data[0..1], load_data[0..1],
-	addq[0..3], gpu_din, data[0..63], blitter_active, daddq_sel);
-Local_data0	:= JOIN (local_data0, local_data[0]);
-Local_data1	:= JOIN (local_data1, local_data[1]);*/
+/*DEF ADDBMUX (
+INT16/	addb_x
+INT16/	addb_y
+	:OUT;
+	addbsel[0..1]
+INT16/	a1_x
+INT16/	a1_y
+INT16/	a2_x
+INT16/	a2_y
+INT16/	a1_frac_x
+INT16/	a1_frac_y
+	:IN);
+INT16/	zero16 :LOCAL;
+BEGIN*/
+void ADDBMUX(int16_t *addb_x, int16_t *addb_y, uint8_t addbsel, int16_t a1_x, int16_t a1_y,
+	int16_t a2_x, int16_t a2_y, int16_t a1_frac_x, int16_t a1_frac_y)
+{
+/* Adder input B: writes to *addb_x / *addb_y the A1 pointer, A2 pointer, A1 fraction or zero, chosen by addbsel bits 0-1. */
+/*Zero		:= TIE0 (zero);
+Zero16		:= JOIN (zero16, zero, zero, zero, zero, zero, zero, zero,
+			zero, zero, zero, zero, zero, zero, zero, zero, zero);
+Addbselb[0-1]	:= BUF8 (addbselb[0-1], addbsel[0-1]);
+Addb_x		:= MX4 (addb_x, a1_x, a2_x, a1_frac_x, zero16, addbselb[0..1]);
+Addb_y		:= MX4 (addb_y, a1_y, a2_y, a1_frac_y, zero16, addbselb[0..1]);*/
 ////////////////////////////////////// C++ CODE //////////////////////////////////////
+	int16_t xterm[4], yterm[4];	/* the four MX4 inputs, in select order */
+	xterm[0] = a1_x, xterm[1] = a2_x, xterm[2] = a1_frac_x, xterm[3] = 0;
+	yterm[0] = a1_y, yterm[1] = a2_y, yterm[2] = a1_frac_y, yterm[3] = 0;
+	*addb_x = xterm[addbsel & 0x03];	/* only the low two select bits are used */
+	*addb_y = yterm[addbsel & 0x03];
 //////////////////////////////////////////////////////////////////////////////////////
 
-// Data output multiplexer and tri-state drive
+//END;
+}
+
 
-/*Data_mux	:= DATA_MUX (wdata[0..63], addq[0..3], big_pix, dstdlo, dstdhi, dstz[0..1], data_sel[0..1], data_ena,
-			dstart[0..5], dend[0..5], dbinh\[0..7], lfu[0..1], patdo[0..1], phrase_mode, srczo[0..1]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-// NOTE: patdo comes from DATAMIX and can be considered the same as patd for Jaguar I
+/**  DATAMUX - Address local data bus selection  ******************
 
-//////////////////////////////////////////////////////////////////////////////////////
-//}
+Select between the adder output and the input data bus
+*/
 
-/*DEF DATA_MUX (
-		wdata[0..63]	// co-processor rwrite data bus
-		:BUS;
-INT16/	addq[0..3]
-		big_pix			// Pixel organisation is big-endian
-INT32/	dstdlo
-INT32/	dstdhi
-INT32/	dstzlo
-INT32/	dstzhi
-		data_sel[0..1]	// source of write data
-		data_ena		// enable write data onto read/write bus
-		dstart[0..5]	// start of changed write data
-		dend[0..5]		// end of changed write data
-		dbinh\[0..7]	// byte oriented changed data inhibits
-INT32/	lfu[0..1]
-INT32/	patd[0..1]
-		phrase_mode		// phrase write mode
-INT32/	srczlo
-INT32/	srczhi
-		:IN);*/
+/*DEF DATAMUX (
+INT16/	data_x
+INT16/	data_y
+	:OUT;
+INT32/	gpu_din
+INT16/	addq_x
+INT16/	addq_y
+	addqsel
+	:IN);
 
-/*INT32/	addql[0..1], ddatlo, ddathi zero32
+INT16/	gpu_lo, gpu_hi
 :LOCAL;
-BEGIN
-
-Phrase_mode\	:= INV1 (phrase_mode\, phrase_mode);
-Zero		:= TIE0 (zero);
-Zero32		:= JOIN (zero32, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero, zero);*/
+BEGIN*/
+void DATAMUX(int16_t *data_x, int16_t *data_y, uint32_t gpu_din, int16_t addq_x, int16_t addq_y, bool addqsel)
+{
+   /* Start from the GPU data bus split in halves (X = low 16 bits, Y = high 16 bits); the adder output replaces it when addqsel is set. */
+   int16_t picked_x = (int16_t)(gpu_din & 0xFFFF);
+   int16_t picked_y = (int16_t)(gpu_din >> 16);
+   if (addqsel)
+   {
+      picked_x = addq_x;
+      picked_y = addq_y;
+   }
+   *data_x = picked_x;
+   *data_y = picked_y;
+}
 
-/* Generate a changed data mask */
 
-/*Edis		:= OR6 (edis\, dend[0..5]);
-Ecoarse		:= DECL38E (e_coarse\[0..7], dend[3..5], edis\);
-E_coarse[0]	:= INV1 (e_coarse[0], e_coarse\[0]);
-Efine		:= DECL38E (unused[0], e_fine\[1..7], dend[0..2], e_coarse[0]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
+/******************************************************************
+addradd
+29/11/90
 
-	en = ((dend & 0x3F) ? 1 : 0);
-	e_coarse = decl38e[en][(dend & 0x38) >> 3];		// Actually, this is e_coarse inverted...
-	e_fine = decl38e[(e_coarse & 0x01) ^ 0x01][dend & 0x07];
-	e_fine &= 0xFE;
-//////////////////////////////////////////////////////////////////////////////////////
+Blitter Address Adder
+---------------------
+The blitter address adder is a pair of sixteen bit adders, one
+each for X and Y.  The multiplexing of the input terms is
+performed elsewhere, but this adder can also perform modulo
+arithmetic to align X-addresses onto phrase boundaries.
 
-/*Scoarse		:= DECH38 (s_coarse[0..7], dstart[3..5]);
-Sfen\		:= INV1 (sfen\, s_coarse[0]);
-Sfine		:= DECH38EL (s_fine[0..7], dstart[0..2], sfen\);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	s_coarse = dech38[(dstart & 0x38) >> 3];
-	s_fine = dech38el[(s_coarse & 0x01) ^ 0x01][dstart & 0x07];
-//////////////////////////////////////////////////////////////////////////////////////
+modx[0..2] take values
+000	no mask
+001	mask bit 0
+010	mask bits 1-0
+..
+110  	mask bits 5-0
 
-/*Maskt[0]	:= BUF1 (maskt[0], s_fine[0]);
-Maskt[1-7]	:= OAN1P (maskt[1-7], maskt[0-6], s_fine[1-7], e_fine\[1-7]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	maskt = s_fine & 0x0001;
-	maskt |= (((maskt & 0x0001) || (s_fine & 0x02u)) && (e_fine & 0x02u) ? 0x0002 : 0x0000);
-	maskt |= (((maskt & 0x0002) || (s_fine & 0x04u)) && (e_fine & 0x04u) ? 0x0004 : 0x0000);
-	maskt |= (((maskt & 0x0004) || (s_fine & 0x08u)) && (e_fine & 0x08u) ? 0x0008 : 0x0000);
-	maskt |= (((maskt & 0x0008) || (s_fine & 0x10u)) && (e_fine & 0x10u) ? 0x0010 : 0x0000);
-	maskt |= (((maskt & 0x0010) || (s_fine & 0x20u)) && (e_fine & 0x20u) ? 0x0020 : 0x0000);
-	maskt |= (((maskt & 0x0020) || (s_fine & 0x40u)) && (e_fine & 0x40u) ? 0x0040 : 0x0000);
-	maskt |= (((maskt & 0x0040) || (s_fine & 0x80u)) && (e_fine & 0x80u) ? 0x0080 : 0x0000);
-//////////////////////////////////////////////////////////////////////////////////////
+******************************************************************/
 
-   /* Produce a look-ahead on the ripple carry */
-	maskt |= (((s_coarse & e_coarse & 0x01u) || (s_coarse & 0x02u)) && (e_coarse & 0x02u) ? 0x0100 : 0x0000);
-	maskt |= (((maskt & 0x0100) || (s_coarse & 0x04u)) && (e_coarse & 0x04u) ? 0x0200 : 0x0000);
-	maskt |= (((maskt & 0x0200) || (s_coarse & 0x08u)) && (e_coarse & 0x08u) ? 0x0400 : 0x0000);
-	maskt |= (((maskt & 0x0400) || (s_coarse & 0x10u)) && (e_coarse & 0x10u) ? 0x0800 : 0x0000);
-	maskt |= (((maskt & 0x0800) || (s_coarse & 0x20u)) && (e_coarse & 0x20u) ? 0x1000 : 0x0000);
-	maskt |= (((maskt & 0x1000) || (s_coarse & 0x40u)) && (e_coarse & 0x40u) ? 0x2000 : 0x0000);
-	maskt |= (((maskt & 0x2000) || (s_coarse & 0x80u)) && (e_coarse & 0x80u) ? 0x4000 : 0x0000);
+void ADDRADD(int16_t *addq_x, int16_t *addq_y, bool a1fracldi,
+	uint16_t adda_x, uint16_t adda_y, uint16_t addb_x, uint16_t addb_y, uint8_t modx, bool suba_x, bool suba_y)
+{
 
-/* The bit terms are mirrored for big-endian pixels outside phrase
-mode.  The byte terms are mirrored for big-endian pixels in phrase
-mode.  */
+/* Perform the addition: 16-bit X and Y sums of adda + addb + carry-in, results in *addq_x / *addq_y */
 
-/*Mirror_bit	:= AN2M (mir_bit, phrase_mode\, big_pix);
-Mirror_byte	:= AN2H (mir_byte, phrase_mode, big_pix);
+/*Adder_x		:= ADD16 (addqt_x[0..15], co_x, adda_x{0..15}, addb_x{0..15}, ci_x);
+Adder_y		:= ADD16 (addq_y[0..15], co_y, adda_y{0..15}, addb_y{0..15}, ci_y);*/
 
-Masktb[14]	:= BUF1 (masktb[14], maskt[14]);
-Masku[0]	:= MX4 (masku[0],  maskt[0],  maskt[7],  maskt[14],  zero, mir_bit, mir_byte);
-Masku[1]	:= MX4 (masku[1],  maskt[1],  maskt[6],  maskt[14],  zero, mir_bit, mir_byte);
-Masku[2]	:= MX4 (masku[2],  maskt[2],  maskt[5],  maskt[14],  zero, mir_bit, mir_byte);
-Masku[3]	:= MX4 (masku[3],  maskt[3],  maskt[4],  masktb[14], zero, mir_bit, mir_byte);
-Masku[4]	:= MX4 (masku[4],  maskt[4],  maskt[3],  masktb[14], zero, mir_bit, mir_byte);
-Masku[5]	:= MX4 (masku[5],  maskt[5],  maskt[2],  masktb[14], zero, mir_bit, mir_byte);
-Masku[6]	:= MX4 (masku[6],  maskt[6],  maskt[1],  masktb[14], zero, mir_bit, mir_byte);
-Masku[7]	:= MX4 (masku[7],  maskt[7],  maskt[0],  masktb[14], zero, mir_bit, mir_byte);
-Masku[8]	:= MX2 (masku[8],  maskt[8],  maskt[13], mir_byte);
-Masku[9]	:= MX2 (masku[9],  maskt[9],  maskt[12], mir_byte);
-Masku[10]	:= MX2 (masku[10], maskt[10], maskt[11], mir_byte);
-Masku[11]	:= MX2 (masku[11], maskt[11], maskt[10], mir_byte);
-Masku[12]	:= MX2 (masku[12], maskt[12], maskt[9],  mir_byte);
-Masku[13]	:= MX2 (masku[13], maskt[13], maskt[8],  mir_byte);
-Masku[14]	:= MX2 (masku[14], maskt[14], maskt[0],  mir_byte);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
+/* latch carry and propagate if required */
 
-	mir_bit  = true/*big_pix*/ && !phrase_mode;
-	mir_byte = true/*big_pix*/ && phrase_mode;
-	masku    = maskt;
+/*Cxt0		:= AN2 (cxt[0], co_x, a1fracldi);
+Cxt1		:= FD1Q (cxt[1], cxt[0], clk[0]);
+Ci_x		:= EO (ci_x, cxt[1], suba_x);
 
-	if (mir_bit)
-	{
-		masku &= 0xFF00;
-		masku |= (maskt >> 7) & 0x0001;
-		masku |= (maskt >> 5) & 0x0002;
-		masku |= (maskt >> 3) & 0x0004;
-		masku |= (maskt >> 1) & 0x0008;
-		masku |= (maskt << 1) & 0x0010;
-		masku |= (maskt << 3) & 0x0020;
-		masku |= (maskt << 5) & 0x0040;
-		masku |= (maskt << 7) & 0x0080;
-	}
+Cyt0		:= AN2 (cyt[0], co_y, a1fracldi);
+Cyt1		:= FD1Q (cyt[1], cyt[0], clk[0]);
+Ci_y		:= EO (ci_y, cyt[1], suba_y);*/
 
-	if (mir_byte)
-	{
-		/* MX4 input 2: masku[7:0] = {8{maskt[14]}} (broadcast bit 14) */
-		masku = (maskt & 0x4000) ? 0x00FF : 0x0000;
-		/* MX2: reverse bits 8-13, maskt[0] at position 14 */
-		masku |= (maskt >> 5) & 0x0100;
-		masku |= (maskt >> 3) & 0x0200;
-		masku |= (maskt >> 1) & 0x0400;
-		masku |= (maskt << 1) & 0x0800;
-		masku |= (maskt << 3) & 0x1000;
-		masku |= (maskt << 5) & 0x2000;
-		masku |= (maskt & 0x0001) << 14;
-	}
+////////////////////////////////////// C++ CODE //////////////////////////////////////
+//I'm sure the following will generate a bunch of warnings, but will have to do for now.
+	static uint16_t co_x = 0, co_y = 0;	// Carry out has to propagate between function calls...
+	uint16_t ci_x = co_x ^ (suba_x ? 1 : 0);	// carry in = latched carry XOR subtract flag
+	uint16_t ci_y = co_y ^ (suba_y ? 1 : 0);
+	uint32_t addqt_x = adda_x + addb_x + ci_x;	// 17-bit sum; bit 16 is the carry out
+	uint32_t addqt_y = adda_y + addb_y + ci_y;
+	static const uint16_t mask[8] = { 0xFFFF, 0xFFFE, 0xFFFC, 0xFFF8, 0xFFF0, 0xFFE0, 0xFFC0, 0x0000 };	// entry n clears the low n bits; entry 7 clears all
+	co_x = ((addqt_x & 0x10000) && a1fracldi ? 1 : 0);	// carry is only latched while a1fracldi is set
+	co_y = ((addqt_y & 0x10000) && a1fracldi ? 1 : 0);
 //////////////////////////////////////////////////////////////////////////////////////
 
-/* The maskt terms define the area for changed data, but the byte
-inhibit terms can override these */
+/* Mask low bits of X to 0 if required */
 
-/*Mask[0-7]	:= AN2 (mask[0-7], masku[0-7], dbinh\[0]);
-Mask[8-14]	:= AN2H (mask[8-14], masku[8-14], dbinh\[1-7]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	mask = masku & (!(dbinh & 0x01) ? 0xFFFF : 0xFF00);
-	mask &= ~(((uint16_t)dbinh & 0x00FE) << 7);
-//////////////////////////////////////////////////////////////////////////////////////
+/*Masksel		:= D38H (unused[0], masksel[0..4], maskbit[5], unused[1], modx[0..2]);
 
-/*Addql[0]	:= JOIN (addql[0], addq[0..1]);
-Addql[1]	:= JOIN (addql[1], addq[2..3]);
+Maskbit[0-4]	:= OR2 (maskbit[0-4], masksel[0-4], maskbit[1-5]);
 
-Dsel0b[0-1]	:= BUF8 (dsel0b[0-1], data_sel[0]);
-Dsel1b[0-1]	:= BUF8 (dsel1b[0-1], data_sel[1]);
-Ddatlo		:= MX4 (ddatlo, patd[0], lfu[0], addql[0], zero32, dsel0b[0], dsel1b[0]);
-Ddathi		:= MX4 (ddathi, patd[1], lfu[1], addql[1], zero32, dsel0b[1], dsel1b[1]);*/
-////////////////////////////////////// C++ CODE //////////////////////////////////////
-	dmux[0] = patd_pre;
-	dmux[1] = lfu;
-	dmux[2] = ((uint64_t)addq[3] << 48) | ((uint64_t)addq[2] << 32) | ((uint64_t)addq[1] << 16) | (uint64_t)addq[0];
-	dmux[3] = 0;
-	ddat = dmux[data_sel];
-	}
-//////////////////////////////////////////////////////////////////////////////////////
+Mask[0-5]	:= MX2 (addq_x[0-5], addqt_x[0-5], zero, maskbit[0-5]);
 
-/*Zed_sel		:= AN2 (zed_sel, data_sel[0..1]);
-Zed_selb[0-1]	:= BUF8 (zed_selb[0-1], zed_sel);
+Addq_x		:= JOIN (addq_x, addq_x[0..5], addqt_x[6..15]);
+Addq_y		:= JOIN (addq_y, addq_y[0..15]);*/
 
-Dat[0-7]	:= MX4 (dat[0-7],   dstdlo{0-7},   ddatlo{0-7},   dstzlo{0-7},   srczlo{0-7},   mask[0-7], zed_selb[0]);
-Dat[8-15]	:= MX4 (dat[8-15],  dstdlo{8-15},  ddatlo{8-15},  dstzlo{8-15},  srczlo{8-15},  mask[8],   zed_selb[0]);
-Dat[16-23]	:= MX4 (dat[16-23], dstdlo{16-23}, ddatlo{16-23}, dstzlo{16-23}, srczlo{16-23}, mask[9],   zed_selb[0]);
-Dat[24-31]	:= MX4 (dat[24-31], dstdlo{24-31}, ddatlo{24-31}, dstzlo{24-31}, srczlo{24-31}, mask[10],  zed_selb[0]);
-Dat[32-39]	:= MX4 (dat[32-39], dstdhi{0-7},   ddathi{0-7},   dstzhi{0-7},   srczhi{0-7},   mask[11],  zed_selb[1]);
-Dat[40-47]	:= MX4 (dat[40-47], dstdhi{8-15},  ddathi{8-15},  dstzhi{8-15},  srczhi{8-15},  mask[12],  zed_selb[1]);
-Dat[48-55]	:= MX4 (dat[48-55], dstdhi{16-23}, ddathi{16-23}, dstzhi{16-23}, srczhi{16-23}, mask[13],  zed_selb[1]);
-Dat[56-63]	:= MX4 (dat[56-63], dstdhi{24-31}, ddathi{24-31}, dstzhi{24-31}, srczhi{24-31}, mask[14],  zed_selb[1]);*/
 ////////////////////////////////////// C++ CODE //////////////////////////////////////
-	*wdata = blitter_simd_ops.byte_merge(ddat, dstd, mask);
-	*srcz = blitter_simd_ops.byte_merge(*srcz, dstz, mask);
+	*addq_x = addqt_x & mask[modx & 0x07];	// modx is a 3-bit selector; masking keeps the table index in range
+	*addq_y = addqt_y & 0xFFFF;
 //////////////////////////////////////////////////////////////////////////////////////
 
-/*Data_enab[0-1]	:= BUF8 (data_enab[0-1], data_ena);
-Datadrv[0-31]	:= TS (wdata[0-31],  dat[0-31],  data_enab[0]);
-Datadrv[32-63]	:= TS (wdata[32-63], dat[32-63], data_enab[1]);
+//Unused[0-1]	:= DUMMY (unused[0-1]);
 
-Unused[0]	:= DUMMY (unused[0]);
+//END;
+}
+
+
+/*
+DEF DATA (
+		wdata[0..63]	// co-processor write data bus
+		:BUS;
+		dcomp[0..7]		// data byte equal flags
+		srcd[0..7]		// bits to use for bit to byte expansion
+		zcomp[0..3]		// output from Z comparators
+		:OUT;
+		a1_x[0..1]		// low two bits of A1 X pointer
+		big_pix			// pixel organisation is big-endian
+		blitter_active	// blitter is active
+		clk				// co-processor clock
+		cmpdst			// compare dest rather than source
+		colorld			// load the pattern color fields
+		daddasel[0..2]	// data adder input A selection
+		daddbsel[0..3]	// data adder input B selection
+		daddmode[0..2]	// data adder mode
+		daddq_sel		// select adder output vs. GPU data
+		data[0..63]		// co-processor read data bus
+		data_ena		// enable write data
+		data_sel[0..1]	// select data to write
+		dbinh\[0..7]	// byte oriented changed data inhibits
+		dend[0..5]		// end of changed write data zone
+		dpipe[0..1]		// load computed data pipe-line latch
+		dstart[0..5]	// start of changed write data zone
+		dstdld[0..1]	// dest data load (two halves)
+		dstzld[0..1]	// dest zed load (two halves)
+		ext_int			// enable extended precision intensity calculations
+INT32/	gpu_din			// GPU data bus
+		iincld			// I increment load
+		iincldx			// alternate I increment load
+		init_if			// initialise I fraction phase
+		init_ii			// initialise I integer phase
+		init_zf			// initialise Z fraction phase
+		intld[0..3]		// computed intensities load
+		istepadd		// intensity step integer add
+		istepfadd		// intensity step fraction add
+		istepld			// I step load
+		istepdld		// I step delta load
+		lfu_func[0..3]	// LFU function code
+		patdadd			// pattern data gouraud add
+		patdld[0..1]	// pattern data load (two halves)
+		pdsel[0..1]		// select pattern data type
+		phrase_mode		// phrase write mode
+		reload			// transfer contents of double buffers
+		reset\			// system reset
+		srcd1ld[0..1]	// source register 1 load (two halves)
+		srcdread		// source data read load enable
+		srczread		// source zed read load enable
+		srcshift[0..5]	// source alignment shift
+		srcz1ld[0..1]	// source zed 1 load (two halves)
+		srcz2add		// zed fraction gouraud add
+		srcz2ld[0..1]	// source zed 2 load (two halves)
+		textrgb			// texture mapping in RGB mode
+		txtd[0..63]		// data from the texture unit
+		zedld[0..3]		// computed zeds load
+		zincld			// Z increment load
+		zmode[0..2]		// Z comparator mode
+		zpipe[0..1]		// load computed zed pipe-line latch
+		zstepadd		// zed step integer add
+		zstepfadd		// zed step fraction add
+		zstepld			// Z step load
+		zstepdld		// Z step delta load
+		:IN);
+*/
 
-END;*/
-}
 
 
 /**  COMP_CTRL - Comparator output control logic  *****************
@@ -2775,211 +3073,6 @@ performed.  The is taken care of within the zed comparator by
 pipe-lining the comparator inputs where appropriate.
 */
 
-void COMP_CTRL(uint8_t *dbinh, bool *nowrite,
-	bool bcompen, bool big_pix, bool bkgwren, uint8_t dcomp, bool dcompen, uint8_t icount,
-	uint8_t pixsize, bool phrase_mode, uint8_t srcd, uint8_t zcomp)
-{
-   //BEGIN
-
-   /*Bkgwren\	:= INV1 (bkgwren\, bkgwren);
-     Phrase_mode\	:= INV1 (phrase_mode\, phrase_mode);
-     Pixsize\[0-2]	:= INV2 (pixsize\[0-2], pixsize[0-2]);*/
-
-   /* The bit comparator bits are derived from the source data, which
-      will have been suitably aligned for phrase mode.  The contents of
-      the inner counter are used to select which bit to use.
-
-      When not in phrase mode the inner count value is used to select
-      one bit.  It is assumed that the count has already occurred, so,
-      7 selects bit 0, etc.  In big-endian pixel mode, this turns round,
-      so that a count of 7 selects bit 7.
-
-      In phrase mode, the eight bits are used directly, and this mode is
-      only applicable to 8-bit pixel mode (2/34) */
-
-   /*Bcompselt[0-2]	:= EO (bcompselt[0-2], icount[0-2], big_pix);
-Bcompbit	:= MX8 (bcompbit, srcd[7], srcd[6], srcd[5],
-srcd[4], srcd[3], srcd[2], srcd[1], srcd[0], bcompselt[0..2]);
-Bcompbit\	:= INV1 (bcompbit\, bcompbit);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   uint8_t bcompselt = (big_pix ? ~icount : icount) & 0x07;
-   uint8_t bitmask[8] = { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 };
-   bool bcompbit = srcd & bitmask[bcompselt];
-   bool winhibit, di0t0_1, di0t4, di1t2, di2t0_1, di2t4, di3t2;
-   bool di4t0_1, di4t4, di5t2;
-   bool di6t0_1, di6t4;
-   bool di7t2;
-
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /* pipe-line the count */
-   /*Bcompsel[0-2]	:= FDSYNC (bcompsel[0-2], bcompselt[0-2], step_inner, clk);
-Bcompbt		:= MX8 (bcompbitpt, srcd[7], srcd[6], srcd[5],
-srcd[4], srcd[3], srcd[2], srcd[1], srcd[0], bcompsel[0..2]);
-Bcompbitp	:= FD1Q (bcompbitp, bcompbitpt, clk);
-Bcompbitp\	:= INV1 (bcompbitp\, bcompbitp);*/
-
-   /* For pixel mode, generate the write inhibit signal for all modes
-      on bit inhibit, for 8 and 16 bit modes on comparator inhibit, and
-      for 16 bit mode on Z inhibit
-
-      Nowrite = bcompen . /bcompbit . /phrase_mode
-      + dcompen . dcomp[0] . /phrase_mode . pixsize = 011
-      + dcompen . dcomp[0..1] . /phrase_mode . pixsize = 100
-      + zcomp[0] . /phrase_mode . pixsize = 100
-      */
-
-   /*Nowt0		:= NAN3 (nowt[0], bcompen, bcompbit\, phrase_mode\);
-Nowt1		:= ND6  (nowt[1], dcompen, dcomp[0], phrase_mode\, pixsize\[2], pixsize[0..1]);
-Nowt2		:= ND7  (nowt[2], dcompen, dcomp[0..1], phrase_mode\, pixsize[2], pixsize\[0..1]);
-Nowt3		:= NAN5 (nowt[3], zcomp[0], phrase_mode\, pixsize[2], pixsize\[0..1]);
-Nowt4		:= NAN4 (nowt[4], nowt[0..3]);
-Nowrite		:= AN2  (nowrite, nowt[4], bkgwren\);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   *nowrite = ((bcompen && !bcompbit && !phrase_mode)
-         || (dcompen && (dcomp & 0x01) && !phrase_mode && (pixsize == 3))
-         || (dcompen && ((dcomp & 0x03) == 0x03) && !phrase_mode && (pixsize == 4))
-         || ((zcomp & 0x01) && !phrase_mode && (pixsize == 4)))
-      && !bkgwren;
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Winht		:= NAN3 (winht, bcompen, bcompbitp\, phrase_mode\);
-Winhibit	:= NAN4 (winhibit, winht, nowt[1..3]);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   //This is the same as above, but with bcompbit delayed one tick and called 'winhibit'
-   //Small difference: Besides the pipeline effect, it's also not using !bkgwren...
-   //	bool winhibit = (bcompen && !
-   winhibit = (bcompen && !bcompbit && !phrase_mode)
-      || (dcompen && (dcomp & 0x01) && !phrase_mode && (pixsize == 3))
-      || (dcompen && ((dcomp & 0x03) == 0x03) && !phrase_mode && (pixsize == 4))
-      || ((zcomp & 0x01) && !phrase_mode && (pixsize == 4));
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /* For phrase mode, generate the byte inhibit signals for eight bit
-      mode 011, or sixteen bit mode 100
-      dbinh\[0] =  pixsize[2] . zcomp[0]
-      +  pixsize[2] . dcomp[0] . dcomp[1] . dcompen
-      + /pixsize[2] . dcomp[0] . dcompen
-      + /srcd[0] . bcompen
-
-      Inhibits 0-3 are also used when not in phrase mode to write back
-      destination data.
-      */
-
-   /*Srcd\[0-7]	:= INV1 (srcd\[0-7], srcd[0-7]);
-
-Di0t0		:= NAN2H (di0t[0], pixsize[2], zcomp[0]);
-Di0t1		:= NAN4H (di0t[1], pixsize[2], dcomp[0..1], dcompen);
-Di0t2		:= NAN2 (di0t[2], srcd\[0], bcompen);
-Di0t3		:= NAN3 (di0t[3], pixsize\[2], dcomp[0], dcompen);
-Di0t4		:= NAN4 (di0t[4], di0t[0..3]);
-Dbinh[0]	:= ANR1P (dbinh\[0], di0t[4], phrase_mode, winhibit);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   *dbinh = 0;
-   di0t0_1 = ((pixsize & 0x04) && (zcomp & 0x01))
-      || ((pixsize & 0x04) && (dcomp & 0x01) && (dcomp & 0x02) && dcompen);
-   di0t4 = di0t0_1
-      || (!(srcd & 0x01) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x01) && dcompen);
-   *dbinh |= (!((di0t4 && phrase_mode) || winhibit) ? 0x01 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di1t0		:= NAN3 (di1t[0], pixsize\[2], dcomp[1], dcompen);
-Di1t1		:= NAN2 (di1t[1], srcd\[1], bcompen);
-Di1t2		:= NAN4 (di1t[2], di0t[0..1], di1t[0..1]);
-Dbinh[1]	:= ANR1 (dbinh\[1], di1t[2], phrase_mode, winhibit);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di1t2 = di0t0_1
-      || (!(srcd & 0x02) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x02) && dcompen);
-   *dbinh |= (!((di1t2 && phrase_mode) || winhibit) ? 0x02 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di2t0		:= NAN2H (di2t[0], pixsize[2], zcomp[1]);
-Di2t1		:= NAN4H (di2t[1], pixsize[2], dcomp[2..3], dcompen);
-Di2t2		:= NAN2 (di2t[2], srcd\[2], bcompen);
-Di2t3		:= NAN3 (di2t[3], pixsize\[2], dcomp[2], dcompen);
-Di2t4		:= NAN4 (di2t[4], di2t[0..3]);
-Dbinh[2]	:= ANR1 (dbinh\[2], di2t[4], phrase_mode, winhibit);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   //[bcompen=F dcompen=T phrase_mode=T bkgwren=F][nw=F wi=F]
-   //[di0t0_1=F di0t4=F][di1t2=F][di2t0_1=T di2t4=T][di3t2=T][di4t0_1=F di2t4=F][di5t2=F][di6t0_1=F di6t4=F][di7t2=F]
-   //[dcomp=$00 dbinh=$0C][7804780400007804] (icount=0005, inc=4)
-   di2t0_1 = ((pixsize & 0x04) && (zcomp & 0x02))
-      || ((pixsize & 0x04) && (dcomp & 0x04) && (dcomp & 0x08) && dcompen);
-   di2t4 = di2t0_1
-      || (!(srcd & 0x04) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x04) && dcompen);
-   *dbinh |= (!((di2t4 && phrase_mode) || winhibit) ? 0x04 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di3t0		:= NAN3 (di3t[0], pixsize\[2], dcomp[3], dcompen);
-Di3t1		:= NAN2 (di3t[1], srcd\[3], bcompen);
-Di3t2		:= NAN4 (di3t[2], di2t[0..1], di3t[0..1]);
-Dbinh[3]	:= ANR1 (dbinh\[3], di3t[2], phrase_mode, winhibit);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di3t2 = di2t0_1
-      || (!(srcd & 0x08) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x08) && dcompen);
-   *dbinh |= (!((di3t2 && phrase_mode) || winhibit) ? 0x08 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di4t0		:= NAN2H (di4t[0], pixsize[2], zcomp[2]);
-Di4t1		:= NAN4H (di4t[1], pixsize[2], dcomp[4..5], dcompen);
-Di4t2		:= NAN2 (di4t[2], srcd\[4], bcompen);
-Di4t3		:= NAN3 (di4t[3], pixsize\[2], dcomp[4], dcompen);
-Di4t4		:= NAN4 (di4t[4], di4t[0..3]);
-Dbinh[4]	:= NAN2 (dbinh\[4], di4t[4], phrase_mode);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di4t0_1 = ((pixsize & 0x04u) && (zcomp & 0x04u))
-      || ((pixsize & 0x04u) && (dcomp & 0x10u) && (dcomp & 0x20u) && dcompen);
-   di4t4 = di4t0_1
-      || (!(srcd & 0x10u) && bcompen)
-      || (!(pixsize & 0x04u) && (dcomp & 0x10u) && dcompen);
-   *dbinh |= (!(di4t4 && phrase_mode) ? 0x10u : 0x00u);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di5t0		:= NAN3 (di5t[0], pixsize\[2], dcomp[5], dcompen);
-Di5t1		:= NAN2 (di5t[1], srcd\[5], bcompen);
-Di5t2		:= NAN4 (di5t[2], di4t[0..1], di5t[0..1]);
-Dbinh[5]	:= NAN2 (dbinh\[5], di5t[2], phrase_mode);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di5t2 = di4t0_1
-      || (!(srcd & 0x20) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x20) && dcompen);
-   *dbinh |= (!(di5t2 && phrase_mode) ? 0x20 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di6t0		:= NAN2H (di6t[0], pixsize[2], zcomp[3]);
-Di6t1		:= NAN4H (di6t[1], pixsize[2], dcomp[6..7], dcompen);
-Di6t2		:= NAN2 (di6t[2], srcd\[6], bcompen);
-Di6t3		:= NAN3 (di6t[3], pixsize\[2], dcomp[6], dcompen);
-Di6t4		:= NAN4 (di6t[4], di6t[0..3]);
-Dbinh[6]	:= NAN2 (dbinh\[6], di6t[4], phrase_mode);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di6t0_1 = ((pixsize & 0x04) && (zcomp & 0x08))
-      || ((pixsize & 0x04) && (dcomp & 0x40) && (dcomp & 0x80) && dcompen);
-   di6t4 = di6t0_1
-      || (!(srcd & 0x40) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x40) && dcompen);
-   *dbinh |= (!(di6t4 && phrase_mode) ? 0x40 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   /*Di7t0		:= NAN3 (di7t[0], pixsize\[2], dcomp[7], dcompen);
-Di7t1		:= NAN2 (di7t[1], srcd\[7], bcompen);
-Di7t2		:= NAN4 (di7t[2], di6t[0..1], di7t[0..1]);
-Dbinh[7]	:= NAN2 (dbinh\[7], di7t[2], phrase_mode);*/
-   ////////////////////////////////////// C++ CODE //////////////////////////////////////
-   di7t2 = di6t0_1
-      || (!(srcd & 0x80) && bcompen)
-      || (!(pixsize & 0x04) && (dcomp & 0x80) && dcompen);
-   *dbinh |= (!(di7t2 && phrase_mode) ? 0x80 : 0x00);
-   //////////////////////////////////////////////////////////////////////////////////////
-
-   //END;
-   //kludge
-   *dbinh = ~*dbinh;
-}
 
 #endif
 
diff --git a/test/tools/test_benchmark.c b/test/tools/test_benchmark.c
index 2ce00cb8..05b3e2a4 100644
--- a/test/tools/test_benchmark.c
+++ b/test/tools/test_benchmark.c
@@ -36,6 +36,9 @@ static void *(*pretro_get_memory_data)(unsigned);
 static size_t (*pretro_get_memory_size)(unsigned);
 static size_t (*pretro_serialize_size)(void);
 static bool (*pretro_unserialize)(const void *, size_t);
+/* Optional: only present when the core was built with BENCH_PROFILE=1. */
+static void (*pperf_counters_dump)(FILE *);
+static unsigned long long *(*pperf_counters_find)(const char *);
 
 /* Options state */
 static int bios_option_set = 0;
@@ -253,6 +256,17 @@ int main(int argc, char **argv)
          num_frames = atoi(argv[i]);
    }
 
+   if (num_frames <= 0)
+   {
+      fprintf(stderr, "ERROR: num_frames must be a positive integer (got %d)\n", num_frames);
+      return 1;
+   }
+   if (warmup_frames < 0)
+   {
+      fprintf(stderr, "ERROR: --warmup must be >= 0 (got %d)\n", warmup_frames);
+      return 1;
+   }
+
 #ifdef __APPLE__
    /* Initialize timebase for mach_absolute_time conversion */
    mach_timebase_info(&timebase_info);
@@ -310,6 +324,10 @@ int main(int argc, char **argv)
    LOAD_SYM(retro_serialize_size);
    LOAD_SYM(retro_unserialize);
 
+   /* Optional perf-counter access; absent unless built with BENCH_PROFILE=1. */
+   pperf_counters_dump = dlsym(handle, "perf_counters_dump");
+   pperf_counters_find = dlsym(handle, "perf_counters_find");
+
    pretro_set_environment(environment_cb);
    pretro_set_video_refresh(video_refresh);
    pretro_set_audio_sample(audio_sample);
@@ -487,29 +505,140 @@ int main(int argc, char **argv)
       fprintf(stderr, "--- Warmup complete ---\n");
    }
 
-   /* Timed run */
-   fprintf(stderr, "--- Benchmarking %d frames ---\n", num_frames);
-   t_start = timer_now();
-
-   for (i = 0; i < num_frames; i++)
-      pretro_run();
-
-   t_end = timer_now();
-
-   elapsed = timer_elapsed_sec(t_start, t_end);
-   fps = (double)num_frames / elapsed;
-   ms_per_frame = (elapsed * 1000.0) / (double)num_frames;
-
-   /* Print results */
-   printf("\n=== BENCHMARK RESULTS ===\n");
-   printf("Blitter mode:    %s\n",
-          strcmp(blitter_value, "enabled") == 0 ? "fast" : "accurate");
-   printf("Frames measured: %d\n", num_frames);
-   printf("Warmup frames:   %d\n", warmup_frames);
-   printf("Total time:      %.3f s\n", elapsed);
-   printf("Frames/sec:      %.2f\n", fps);
-   printf("Time/frame:      %.3f ms\n", ms_per_frame);
-   printf("=========================\n");
+   /* Timed run with per-frame samples to expose variance.  Audio
+    * dropouts in real frontends are caused by *worst-case* frames
+    * exceeding the 16.67 ms (60 Hz) budget, not by the average. */
+   {
+      double *frame_ms = (double *)malloc((size_t)num_frames * sizeof(double));
+      double *sorted = (double *)malloc((size_t)num_frames * sizeof(double));
+      unsigned long long *blit_calls_at_frame = (unsigned long long *)malloc((size_t)num_frames * sizeof(unsigned long long));
+      unsigned long long *blit_inner_at_frame = (unsigned long long *)malloc((size_t)num_frames * sizeof(unsigned long long));
+      double frame_budget_ms = 1000.0 / 60.0;
+      int over_budget = 0;
+      int slot;
+      double max_ms = 0.0;
+      double p50_ms = 0.0, p99_ms = 0.0, p999_ms = 0.0;
+      unsigned long long *blit_calls_ctr = pperf_counters_find ? pperf_counters_find("blitter_calls") : NULL;
+      unsigned long long *blit_inner_ctr = pperf_counters_find ? pperf_counters_find("blitter_inner") : NULL;
+      unsigned long long blit_calls_prev = blit_calls_ctr ? *blit_calls_ctr : 0;
+      unsigned long long blit_inner_prev = blit_inner_ctr ? *blit_inner_ctr : 0;
+
+      /* Every buffer is allocated before the timed loop so that running
+       * out of memory is reported before any measurement is taken.
+       * free(NULL) is a no-op, so a partial failure is released as-is. */
+      if (!frame_ms || !sorted || !blit_calls_at_frame || !blit_inner_at_frame)
+      {
+         fprintf(stderr, "ERROR: malloc failed for per-frame timing\n");
+         free(frame_ms); free(sorted); free(blit_calls_at_frame); free(blit_inner_at_frame);
+         pretro_unload_game(); pretro_deinit();
+         free((void *)info.data); dlclose(handle);
+         return 1;
+      }
+
+      fprintf(stderr, "--- Benchmarking %d frames ---\n", num_frames);
+      t_start = timer_now();
+
+      for (i = 0; i < num_frames; i++)
+      {
+         uint64_t f0 = timer_now();
+         uint64_t f1;
+         pretro_run();
+         f1 = timer_now();
+         frame_ms[i] = timer_elapsed_sec(f0, f1) * 1000.0;
+         /* Record how far each counter advanced during this frame. */
+         if (blit_calls_ctr) {
+            blit_calls_at_frame[i] = *blit_calls_ctr - blit_calls_prev;
+            blit_calls_prev = *blit_calls_ctr;
+         } else blit_calls_at_frame[i] = 0;
+         if (blit_inner_ctr) {
+            blit_inner_at_frame[i] = *blit_inner_ctr - blit_inner_prev;
+            blit_inner_prev = *blit_inner_ctr;
+         } else blit_inner_at_frame[i] = 0;
+      }
+
+      t_end = timer_now();
+
+      elapsed = timer_elapsed_sec(t_start, t_end);
+      fps = (double)num_frames / elapsed;
+      ms_per_frame = (elapsed * 1000.0) / (double)num_frames;
+
+      /* Sort a copy so frame_ms keeps run order for the worst-frame table.
+       * Insertion sort is O(n^2): fine for a few hundred frames per run. */
+      memcpy(sorted, frame_ms, (size_t)num_frames * sizeof(double));
+      for (i = 1; i < num_frames; i++)
+      {
+         double key = sorted[i];
+         slot = i - 1;
+         while (slot >= 0 && sorted[slot] > key) { sorted[slot + 1] = sorted[slot]; slot--; }
+         sorted[slot + 1] = key;
+      }
+      /* (int)(n * q) with q < 1 is always a valid index below n. */
+      p50_ms  = sorted[(int)((double)num_frames * 0.50)];
+      p99_ms  = sorted[(int)((double)num_frames * 0.99)];
+      p999_ms = sorted[(int)((double)num_frames * 0.999)];
+      max_ms  = sorted[num_frames - 1];
+      free(sorted);
+      for (i = 0; i < num_frames; i++)
+         if (frame_ms[i] > frame_budget_ms) over_budget++;
+
+      /* Print results */
+      printf("\n=== BENCHMARK RESULTS ===\n");
+      printf("Blitter mode:    %s\n",
+             strcmp(blitter_value, "enabled") == 0 ? "fast" : "accurate");
+      printf("Frames measured: %d\n", num_frames);
+      printf("Warmup frames:   %d\n", warmup_frames);
+      printf("Total time:      %.3f s\n", elapsed);
+      printf("Frames/sec:      %.2f\n", fps);
+      printf("Time/frame avg:  %.3f ms\n", ms_per_frame);
+      printf("Time/frame p50:  %.3f ms\n", p50_ms);
+      printf("Time/frame p99:  %.3f ms\n", p99_ms);
+      printf("Time/frame p999: %.3f ms\n", p999_ms);
+      printf("Time/frame max:  %.3f ms\n", max_ms);
+      printf("Over 16.67 ms:   %d / %d frames (%.2f%%)\n",
+             over_budget, num_frames, 100.0 * over_budget / num_frames);
+      printf("=========================\n");
+
+      /* If we have per-frame blitter counters, dump the slowest frames
+       * so we can correlate blit volume with frame-time spikes. */
+      if (over_budget > 0 && blit_calls_ctr) {
+         int j;
+         double avg_calls = 0.0, avg_inner = 0.0;
+         double slow_calls = 0.0, slow_inner = 0.0;
+         int slow_n = 0;
+         printf("\n--- Worst frames (>16.67ms) -----------------------------\n");
+         printf("  idx  frame_ms  blit_calls  blit_inner_iter\n");
+         for (j = 0; j < num_frames; j++) {
+            avg_calls += blit_calls_at_frame[j];
+            avg_inner += blit_inner_at_frame[j];
+            if (frame_ms[j] > frame_budget_ms) {
+               slow_calls += blit_calls_at_frame[j];
+               slow_inner += blit_inner_at_frame[j];
+               slow_n++;
+               if (slow_n <= 12)
+                  printf("  %4d  %7.2f   %10llu   %15llu\n",
+                         j, frame_ms[j],
+                         blit_calls_at_frame[j],
+                         blit_inner_at_frame[j]);
+            }
+         }
+         printf("---\n");
+         printf("Avg per frame (all):    blits=%.0f  inner_iter=%.0f\n",
+                avg_calls / num_frames, avg_inner / num_frames);
+         if (slow_n > 0)
+            printf("Avg per frame (slow):   blits=%.0f  inner_iter=%.0f  (%dx, %dx vs avg)\n",
+                   slow_calls / slow_n, slow_inner / slow_n,
+                   (int)((slow_calls / slow_n) / (avg_calls / num_frames + 1e-9)),
+                   (int)((slow_inner / slow_n) / (avg_inner / num_frames + 1e-9)));
+         printf("=========================================================\n");
+      }
+
+      free(frame_ms);
+      free(blit_calls_at_frame);
+      free(blit_inner_at_frame);
+   }
+
+   if (pperf_counters_dump)
+      pperf_counters_dump(stderr);
 
    pretro_unload_game();
    pretro_deinit();