diff --git a/edge-smoothing/vectorscale/shaders/cell-rasterizer.slang b/edge-smoothing/vectorscale/shaders/cell-rasterizer.slang
index 0adefdf82f..7cdb0bb252 100644
--- a/edge-smoothing/vectorscale/shaders/cell-rasterizer.slang
+++ b/edge-smoothing/vectorscale/shaders/cell-rasterizer.slang
@@ -11,8 +11,11 @@
 // Multi-hit resolution: top-3 nearest CPs with endpoint-defer rejection.
 //
 // Input textures:
-//   FinalPositions: optimized absolute positions
+//   PackedPositions: per-CP denormalized geometry (pp, cp, np with ghosts
+//                    pre-applied + t_branch + neighbor indices + is_line)
+//                    written by pack-positions.slang.
 //   CellGraph: original positions + neighbors + flags + packed directions
+//              (resolve_hit still uses neighbors for color resolution).
 //   Original: input pixel art image
 //
 // Output: viewport-sized final image
@@ -43,8 +46,8 @@ void main() {
 layout(location = 0) in vec2 vTexCoord;
 layout(location = 0) out vec4 FragColor;
 
-layout(set = 0, binding = 2) uniform sampler2D FinalPositions;  // optimized CP absolute positions
-layout(set = 0, binding = 3) uniform sampler2D CellGraph;       // positions + neighbors + flags + directions
+layout(set = 0, binding = 2) uniform sampler2D PackedPositions; // pack-positions output: (pp, cp, np, t_branch, prev_ci, next_ci, validity, is_line) per CP across 3 horizontal texels
+layout(set = 0, binding = 3) uniform sampler2D CellGraph;       // flags + directions (and neighbor indices, now redundant for test_one_cp)
 layout(set = 0, binding = 4) uniform sampler2D Original;        // input pixel art image
 
 int IMG_W()    { return int(params.OriginalSize.x); }
@@ -79,13 +82,48 @@ vec2 read_orig(int cp_idx) {
     return vec2(val.r, val.g);
 }
 
-// Read optimized absolute position from FinalPositions.
-vec2 read_pos(int cp_idx) {
-    if (cp_idx < 0 || cp_idx >= NUM_CPS()) return vec2(-1e10);
+// Per-CP packed render geometry from pack-positions. 3 horizontal texels:
+//   col 0 = (pp.x, pp.y, prev_ci_or_-1, _)
+//   col 1 = (cp.x, cp.y, t_branch, validity 0=skip 1=normal 2=2cp-chain)
+//   col 2 = (np.x, np.y, next_ci_or_-1, _)
+struct PackedCp {
+    vec2 pp;          // col 0 .rg
+    vec2 cp;          // col 1 .rg
+    vec2 np;          // col 2 .rg
+    float t_branch;   // col 1 .b; stays 0.5 when the record is not valid
+    int prev_ci;      // col 0 .b rounded to int; -1 when the stored value is < -0.5
+    int next_ci;      // col 2 .b rounded to int; -1 when the stored value is < -0.5
+    bool is_line;     // col 1 .a > 1.5 (validity code 2 = 2-CP chain)
+    bool valid;       // false for out-of-range indices or col 1 .a < 0.5 (skip)
+};
+
+PackedCp read_packed_cp(int cp_idx) {
+    PackedCp r;  // the defaults set below describe a "skip" record (valid = false)
+    r.valid = false;
+    r.is_line = false;
+    r.t_branch = 0.5;
+    r.prev_ci = -1; r.next_ci = -1;
+    r.pp = vec2(0.0); r.cp = vec2(0.0); r.np = vec2(0.0);
+    // Out-of-range indices return the skip record without touching the texture.
+    if (cp_idx < 0 || cp_idx >= NUM_CPS()) return r;
     int cx, cy_slot;
     decode_cp(cp_idx, cx, cy_slot);
-    vec4 val = texelFetch(FinalPositions, ivec2(cx, cy_slot), 0);
-    return vec2(val.r, val.g);
+    // Middle texel first: its alpha is the validity code, so a skipped CP costs one fetch.
+    vec4 t1 = texelFetch(PackedPositions, ivec2(cx*3 + 1, cy_slot), 0);
+    if (t1.a < 0.5) return r;  // pack-positions wrote skip/inactive
+    // Outer texels carry pp / np in .rg and the prev / next neighbor index in .b.
+    vec4 t0 = texelFetch(PackedPositions, ivec2(cx*3,     cy_slot), 0);
+    vec4 t2 = texelFetch(PackedPositions, ivec2(cx*3 + 2, cy_slot), 0);
+
+    r.pp = t0.rg;
+    r.cp = t1.rg;
+    r.np = t2.rg;
+    r.t_branch = t1.b;
+    r.is_line = t1.a > 1.5;  // validity 2.0 marks a 2-CP chain
+    r.prev_ci = (t0.b < -0.5) ? -1 : int(t0.b + 0.5);  // index stored as float; +0.5 rounds before truncation
+    r.next_ci = (t2.b < -0.5) ? -1 : int(t2.b + 0.5);
+    r.valid = true;
+    return r;
 }
 
 // Decode a neighbor CP index from component encoding.
@@ -417,42 +455,16 @@ uint test_one_cp(int ci, vec2 query, inout Hit hits[3], inout int num_hits) {
     uint flag = read_flags(ci);
     if (flag == 0u) return 0u;
 
-    ivec2 nbrs = read_neighbors(ci);
-    int prev_ci = nbrs.x;
-    int next_ci = nbrs.y;
-    if (prev_ci < 0 && next_ci < 0) return flag;
-
-    bool i_am_endpoint = (prev_ci < 0 || next_ci < 0);
-    bool two_cp_chain = false;
-    if (i_am_endpoint) {
-        int other = (prev_ci < 0) ? next_ci : prev_ci;
-        bool other_is_end = (read_flags(other) & IS_ENDPOINT) != 0u;
-        if (!other_is_end) return flag;
-        if (ci > other) return flag;
-        two_cp_chain = true;
-    }
+    // Per-CP geometry — denormalized by pack-positions. Replaces the
+    // read_neighbors + 3 read_pos chain + ghost-extension dance with 3
+    // texel reads. valid=false means pack-positions wrote skip
+    // (inactive, isolated, or non-owner of a 2-CP chain).
+    PackedCp pcp = read_packed_cp(ci);
+    if (!pcp.valid) return flag;
 
-    vec2 cp_real = read_pos(ci);
-    vec2 prev_pos = (prev_ci >= 0) ? read_pos(prev_ci) : cp_real;
-    vec2 next_pos = (next_ci >= 0) ? read_pos(next_ci) : cp_real;
-
-    bool prev_is_end = (prev_ci >= 0) && ((read_flags(prev_ci) & IS_ENDPOINT) != 0u);
-    bool next_is_end = (next_ci >= 0) && ((read_flags(next_ci) & IS_ENDPOINT) != 0u);
-
-    vec2 cp, pp, np;
-    if (two_cp_chain) {
-        int other = (prev_ci < 0) ? next_ci : prev_ci;
-        vec2 other_pos = read_pos(other);
-        vec2 a0 = (prev_ci < 0) ? cp_real : other_pos;
-        vec2 a1 = (next_ci < 0) ? cp_real : other_pos;
-        pp = 1.5 * a0 - 0.5 * a1;
-        cp = 0.5 * (a0 + a1);
-        np = 1.5 * a1 - 0.5 * a0;
-    } else {
-        cp = cp_real;
-        pp = prev_is_end ? (2.0 * prev_pos - cp) : prev_pos;
-        np = next_is_end ? (2.0 * next_pos - cp) : next_pos;
-    }
+    vec2 pp = pcp.pp;
+    vec2 cp = pcp.cp;
+    vec2 np = pcp.np;
 
     // Quick screen: 3-sample distance² reject. Evaluating beval at
     // t=0, 1/2, 1 with the ghost-extended pp/cp/np gives the real
@@ -468,7 +480,7 @@ uint test_one_cp(int ci, vec2 query, inout Hit hits[3], inout int num_hits) {
     if (quick_d2 > 2.0) return flag;
 
     vec2 result;
-    if (two_cp_chain) {
+    if (pcp.is_line) {
         vec2 a0 = 0.5 * (pp + cp);
         vec2 a1 = 0.5 * (cp + np);
         result = closest_on_segment(a0, a1, query);
@@ -479,23 +491,13 @@ uint test_one_cp(int ci, vec2 query, inout Hit hits[3], inout int num_hits) {
     float span_d2 = result.y;
     if (span_d2 >= 1.0) return flag;
 
-    float t_branch;
-    if (two_cp_chain) {
-        t_branch = 0.5;
-    } else if (prev_is_end || next_is_end) {
-        vec2 interior_mid = 0.125 * pp + 0.75 * cp + 0.125 * np;
-        t_branch = closest_on_span(pp, cp, np, interior_mid).x;
-    } else {
-        t_branch = 0.5;
-    }
-
     // Build candidate Hit and insert via explicit if/else over constant
     // indices — keeps hits[] register-allocated.
     Hit cand;
     cand.d2 = span_d2; cand.t = span_t; cand.cp_idx = ci;
-    cand.prev_ci = prev_ci; cand.next_ci = next_ci;
+    cand.prev_ci = pcp.prev_ci; cand.next_ci = pcp.next_ci;
     cand.cp_pos = cp; cand.prev_pos = pp; cand.next_pos = np;
-    cand.t_branch = t_branch; cand.is_line = two_cp_chain;
+    cand.t_branch = pcp.t_branch; cand.is_line = pcp.is_line;
 
     if (span_d2 < hits[0].d2) {
         hits[2] = hits[1];
diff --git a/edge-smoothing/vectorscale/shaders/pack-positions.slang b/edge-smoothing/vectorscale/shaders/pack-positions.slang
new file mode 100644
index 0000000000..4e62cd29c7
--- /dev/null
+++ b/edge-smoothing/vectorscale/shaders/pack-positions.slang
@@ -0,0 +1,360 @@
+#version 450
+#pragma format R32G32B32A32_SFLOAT
+
+// Pass: Denormalize per-CP render geometry into PackedPositions.
+//
+// Reads FinalPositions (post-tjunction stem snap) + CellGraph
+// (neighbors, flags) and packs each CP's complete render geometry
+// into 3 horizontally-adjacent texels:
+//
+//   (cx*3 + 0, cy_slot) = (pp.x, pp.y, prev_ci_as_float, 0)
+//   (cx*3 + 1, cy_slot) = (cp.x, cp.y, t_branch, validity)
+//                         validity: 0.0 = skip / inactive,
+//                                   1.0 = valid normal,
+//                                   2.0 = valid 2-CP chain (is_line)
+//   (cx*3 + 2, cy_slot) = (np.x, np.y, next_ci_as_float, 0)
+//
+// pp / np are already ghost-extended (pp = 2·prev - cp, np = 2·next - cp)
+// for endpoint neighbors. t_branch is chosen per CP type, first match
+// wins (see the t_branch dispatch in main()):
+//   - IS_CROSSING: 2D Newton iteration on F(t,s) = B_a(t) - B_b(s) = 0,
+//     starting from (0.5, 0.5).
+//   - 2-CP chain (degenerate stem): t_branch = 0.5; pp/cp/np built as
+//     a straight line so the rasterizer's is_line path takes over.
+//   - One-sided clamped Bezier: closed-form cubic project of the
+//     interior B-spline midpoint onto the clamped span.
+//   - Else: t_branch = 0.5.
+//
+// Cost: ~once-per-frame O(num_cps) work — 3 fragments per CP slot, so
+// 6·corners_w·corners_h fragments total. Drops per-pixel work in the
+// rasterizer's test_one_cp from ~6 fetches + ghost construction +
+// cubic/Newton solve down to 4 fetches (1 flag + 3 packed reads).
+//
+// resolve_hit still needs neighbor flags/dirs for color resolution, so
+// the neighbor indices stay encoded in the B channels of texels 0/2.
+
+layout(push_constant) uniform Push {
+    vec4 SourceSize;
+    vec4 OriginalSize;
+    vec4 OutputSize;
+    uint FrameCount;
+} params;
+
+layout(std140, set = 0, binding = 0) uniform UBO {
+    mat4 MVP;
+    vec4 CellGraphSize;
+} global;
+
+#pragma stage vertex
+layout(location = 0) in vec4 Position;
+layout(location = 1) in vec2 TexCoord;
+layout(location = 0) out vec2 vTexCoord;
+
+void main() {
+    vTexCoord = TexCoord;                 // forwarded unchanged to the fragment stage
+    gl_Position = global.MVP * Position;  // incoming vertex transformed by the MVP matrix
+}
+
+#pragma stage fragment
+layout(location = 0) in vec2 vTexCoord;
+layout(location = 0) out vec4 FragColor;
+
+layout(set = 0, binding = 2) uniform sampler2D FinalPositions; // post-tjunction stem-snapped positions
+layout(set = 0, binding = 3) uniform sampler2D CellGraph;      // neighbors + flags
+
+int IMG_W()     { return int(params.OriginalSize.x); }        // OriginalSize.x truncated to int
+int IMG_H()     { return int(params.OriginalSize.y); }        // OriginalSize.y truncated to int
+int CORNERS_W() { return 1 + IMG_W(); }                       // one more corner column than pixels
+int CORNERS_H() { return 1 + IMG_H(); }                       // one more corner row than pixels
+int NUM_CPS()   { return 2 * (CORNERS_W() * CORNERS_H()); }   // two CP slots per corner
+
+const uint IS_CROSSING = 64u;
+const uint IS_ENDPOINT = 128u;
+
+void decode_cp(int cp_idx, out int cx, out int cy_slot) {
+    // Inverse of cp_idx = (cy * CORNERS_W() + cx) * 2 + slot; cy_slot = cy * 2 + slot.
+    int stride = CORNERS_W();
+    int corner = cp_idx / 2;                          // drop the slot bit
+    cx = corner % stride;
+    cy_slot = (corner / stride) * 2 + (cp_idx & 1);   // two texture rows per corner row
+}
+
+vec2 read_pos(int cp_idx) {
+    // Position of one CP from FinalPositions (.rg); out-of-range indices read as the origin.
+    if (!(cp_idx >= 0 && cp_idx < NUM_CPS())) return vec2(0.0);
+    int col, row; decode_cp(cp_idx, col, row);
+    return texelFetch(FinalPositions, ivec2(col, row), 0).rg;
+}
+
+// ---- B-spline curve-curve intersection (2D Newton) ----
+//
+// Solves F(t, s) = B_a(t) - B_b(s) = 0 from (t, s) = (0.5, 0.5). The
+// optimizer keeps crossings near the grid corner so the initial guess
+// is within ~0.1 of the answer; quadratic Newton convergence drives
+// the residual below f32 epsilon in 3 iterations (4 for safety).
+// Returns vec2(t_a, t_b), each clamped to [0, 1].
+//
+// Each step: J·Δ = -F where J is the 2×2 partial-derivative matrix
+//   J11 = ∂Fx/∂t = 2·aa.x·t + ba.x       J12 = ∂Fx/∂s = -(2·ab.x·s + bb.x)
+//   J21 = ∂Fy/∂t = 2·aa.y·t + ba.y       J22 = ∂Fy/∂s = -(2·ab.y·s + bb.y)
+// Inverted analytically. The early-break guard on |det(J)| < 1e-12
+// is the tangent / parallel-curves case; in practice the pipeline
+// guarantees a real crossing so this never fires, but it keeps the
+// shader well-defined if a degenerate input ever shows up.
+vec2 bspline_intersect(vec2 a_p0, vec2 a_p1, vec2 a_p2,
+                       vec2 b_p0, vec2 b_p1, vec2 b_p2) {
+    // Power-basis form of each span: B(u) = quad*u*u + lin*u + base.
+    vec2 a_quad = 0.5 * a_p0 - a_p1 + 0.5 * a_p2;
+    vec2 a_lin  = -a_p0 + a_p1;
+    vec2 a_base = 0.5 * a_p0 + 0.5 * a_p1;
+    vec2 b_quad = 0.5 * b_p0 - b_p1 + 0.5 * b_p2;
+    vec2 b_lin  = -b_p0 + b_p1;
+    vec2 b_base = 0.5 * b_p0 + 0.5 * b_p1;
+
+    vec2 ts = vec2(0.5);   // x = t on span a, y = s on span b
+    for (int it = 0; it < 4; ++it) {
+        // Residual F = B_a(t) - B_b(s).
+        vec2 resid = (a_quad * ts.x * ts.x + a_lin * ts.x + a_base)
+                   - (b_quad * ts.y * ts.y + b_lin * ts.y + b_base);
+        // Jacobian columns dF/dt and dF/ds.
+        vec2 col_t = 2.0 * a_quad * ts.x + a_lin;
+        vec2 col_s = -(2.0 * b_quad * ts.y + b_lin);
+        float det = col_t.x * col_s.y - col_t.y * col_s.x;
+        if (abs(det) < 1e-12) break;   // singular Jacobian: keep the current estimate
+        // Newton update ts -= inverse(J) * resid, with the 2x2 inverse written out.
+        ts.x -= ( col_s.y * resid.x - col_s.x * resid.y) / det;
+        ts.y -= (-col_t.y * resid.x + col_t.x * resid.y) / det;
+    }
+    return clamp(ts, 0.0, 1.0);
+}
+
+int decode_neighbor(vec4 val) {
+    // R < -0.5 means "no neighbor"; otherwise RGB hold (cx, cy, slot) as floats.
+    if (val.r < -0.5) return -1;
+    ivec3 cell = ivec3(val.rgb + 0.5);   // +0.5 then truncate: rounds non-negative values
+    int corner = cell.y * CORNERS_W() + cell.x;
+    return corner * 2 + cell.z;
+}
+
+uint read_flags(int cp_idx) {
+    bool in_range = (cp_idx >= 0) && (cp_idx < NUM_CPS());
+    if (!in_range) return 0u;                  // out of range reads as "no flags"
+    int col, slot_row; decode_cp(cp_idx, col, slot_row);
+    // Flags sit in the B channel of the first of the CP's three CellGraph rows.
+    return uint(texelFetch(CellGraph, ivec2(col, slot_row * 3), 0).b + 0.5);
+}
+
+// Returns the parameter t in [0, 1] of the point on the quadratic B-spline span
+// (p0, p1, p2) closest to pt. Must match cell-rasterizer.slang's closest_on_span
+// exactly so that t_branch values agree with what the rasterizer would compute;
+// duplicated here because this pass runs once per frame, the rasterizer per pixel.
+float span_closest_t(vec2 p0, vec2 p1, vec2 p2, vec2 pt) {
+    vec2 a = 0.5 * (p0 - 2.0 * p1 + p2);
+    vec2 b = p1 - p0;
+    vec2 e = 0.5 * (p0 + p1) - pt;
+    // D(t) = a*t*t + b*t + e is the offset from pt to the span point at parameter t.
+    float c3 = 2.0 * dot(a, a);
+    float c2 = 3.0 * dot(a, b);
+    float c1 = 2.0 * dot(a, e) + dot(b, b);
+    float c0 = dot(b, e);
+    // c3*t^3 + c2*t^2 + c1*t + c0 = dot(D(t), D'(t)); its roots in (0,1) are the interior candidates.
+    #define EVAL_D2(t) dot((a*(t)+b)*(t)+e, (a*(t)+b)*(t)+e)
+    float d0 = EVAL_D2(0.0);
+    float d1 = EVAL_D2(1.0);
+    float best_d2 = d0; float best_t = 0.0;
+    if (d1 < best_d2) { best_d2 = d1; best_t = 1.0; }
+    // Endpoints seed the best candidate. If the cubic term vanishes, solve the quadratic, else the linear, equation.
+    if (abs(c3) < 1e-12) {
+        if (abs(c2) > 1e-12) {
+            float disc = c1*c1 - 4.0*c2*c0;
+            if (disc >= 0.0) {
+                float sq = sqrt(disc);
+                float qq = -0.5 * (c1 + sign(c1) * sq);  // NOTE(review): sign(0.0) is 0.0, so c1 == 0 gives qq == 0 and t2 = c0/0 — confirm closest_on_span does the same
+                float t1 = qq / c2;
+                float t2 = c0 / qq;
+                if (t1 > 0.0 && t1 < 1.0) { float dd = EVAL_D2(t1); if (dd < best_d2) { best_d2 = dd; best_t = t1; } }
+                if (t2 > 0.0 && t2 < 1.0) { float dd = EVAL_D2(t2); if (dd < best_d2) { best_d2 = dd; best_t = t2; } }
+            }
+        } else if (abs(c1) > 1e-12) {
+            float t1 = -c0 / c1;
+            if (t1 > 0.0 && t1 < 1.0) { float dd = EVAL_D2(t1); if (dd < best_d2) { best_d2 = dd; best_t = t1; } }
+        }
+    } else {  // full cubic: substitute t = u + shift to get the depressed form u^3 + p*u + q = 0
+        float inv3a = 1.0 / (3.0 * c3);
+        float shift = -c2 * inv3a;
+        float p = (3.0*c3*c1 - c2*c2) / (3.0*c3*c3);
+        float q = (2.0*c2*c2*c2 - 9.0*c3*c2*c1 + 27.0*c3*c3*c0) / (27.0*c3*c3*c3);
+        float disc = q*q/4.0 + p*p*p/27.0;
+        // disc > 0: single real root via cube roots; otherwise up to three real roots via the cosine form.
+        if (disc > 1e-12) {
+            float sq = sqrt(disc);
+            float hq = -q * 0.5;
+            float u1 = sign(hq+sq) * pow(abs(hq+sq), 1.0/3.0);
+            float u2 = sign(hq-sq) * pow(abs(hq-sq), 1.0/3.0);
+            float t1 = u1 + u2 + shift;
+            if (t1 > 0.0 && t1 < 1.0) { float dd = EVAL_D2(t1); if (dd < best_d2) { best_d2 = dd; best_t = t1; } }
+        } else {
+            float r = sqrt(max(-p*p*p/27.0, 0.0));
+            float phi = (r < 1e-15) ? 0.0 : acos(clamp(-q/(2.0*r), -1.0, 1.0));
+            float cube_r = 2.0 * sign(r) * pow(abs(r), 1.0/3.0);
+            const float TAU = 6.283185307;
+            for (int k = 0; k < 3; k++) {
+                float angle = (phi + TAU * float(k)) / 3.0;
+                float t1 = cube_r * cos(angle) + shift;
+                if (t1 > 0.0 && t1 < 1.0) { float dd = EVAL_D2(t1); if (dd < best_d2) { best_d2 = dd; best_t = t1; } }
+            }
+        }
+    }
+    #undef EVAL_D2
+    return best_t;
+}
+
+void main() {
+    ivec2 opos = ivec2(vTexCoord * params.OutputSize.xy);
+    // Fragment entry: emits one of the three packed texels of a single CP.
+    // Decode (cx, which_pos, cy_slot) from the output coordinate.
+    // The texture is 3*corners_w wide, so column = cx*3 + which_pos.
+    int which_pos = opos.x % 3;             // 0=pp, 1=cp, 2=np
+    int cx = opos.x / 3;
+    int cy_slot = opos.y;
+    // Texels outside the CP grid are zero-filled (reads back as validity 0 = skip).
+    if (cx >= CORNERS_W() || cy_slot >= CORNERS_H() * 2) {
+        FragColor = vec4(0.0);
+        return;
+    }
+
+    int cy = cy_slot / 2;
+    int slot = cy_slot & 1;
+    int ci = (cy * CORNERS_W() + cx) * 2 + slot;
+
+    // Read flags + neighbors. Inactive CPs write zeros — the rasterizer
+    // already skips them via its own flag check.
+    uint flag = read_flags(ci);
+    if (flag == 0u) {
+        FragColor = vec4(0.0);
+        return;
+    }
+
+    int base_row = cy_slot * 3;  // each CP owns 3 CellGraph rows: flags, prev neighbor, next neighbor
+    int prev_ci = decode_neighbor(texelFetch(CellGraph, ivec2(cx, base_row + 1), 0));
+    int next_ci = decode_neighbor(texelFetch(CellGraph, ivec2(cx, base_row + 2), 0));
+
+    // CPs with no neighbors (e.g. fully-isolated chain endpoints owned by
+    // someone else's clamped extension) — write zeros.
+    if (prev_ci < 0 && next_ci < 0) {
+        FragColor = vec4(0.0);
+        return;
+    }
+
+    // 2-CP-chain detection: both ends are endpoint markers and we own the
+    // span. Only the lower-indexed endpoint renders. Mirror the
+    // rasterizer's existing logic so the packed geometry matches what
+    // test_one_cp would have constructed.
+    bool i_am_endpoint = (prev_ci < 0 || next_ci < 0);
+    bool two_cp_chain = false;
+    if (i_am_endpoint) {
+        int other = (prev_ci < 0) ? next_ci : prev_ci;
+        bool other_is_end = (read_flags(other) & IS_ENDPOINT) != 0u;
+        if (!other_is_end || ci > other) {  // endpoint of a longer chain, or the non-owning end of a 2-CP chain
+            FragColor = vec4(0.0);
+            return;
+        }
+        two_cp_chain = true;
+    }
+
+    vec2 cp_real = read_pos(ci);
+    vec2 prev_pos = (prev_ci >= 0) ? read_pos(prev_ci) : cp_real;
+    vec2 next_pos = (next_ci >= 0) ? read_pos(next_ci) : cp_real;
+
+    bool prev_is_end = (prev_ci >= 0) && ((read_flags(prev_ci) & IS_ENDPOINT) != 0u);
+    bool next_is_end = (next_ci >= 0) && ((read_flags(next_ci) & IS_ENDPOINT) != 0u);
+
+    vec2 cp, pp, np;
+    if (two_cp_chain) {
+        // Render as a straight line between the two markers. Pick
+        // p0=(3·a0-a1)/2, p1=midpoint, p2=(3·a1-a0)/2 so beval gives
+        // B(t)=lerp(a0,a1,t). Closest-point queries dispatch to a line
+        // solver via is_line in the rasterizer.
+        int other = (prev_ci < 0) ? next_ci : prev_ci;
+        vec2 other_pos = read_pos(other);
+        vec2 a0 = (prev_ci < 0) ? cp_real : other_pos;
+        vec2 a1 = (next_ci < 0) ? cp_real : other_pos;
+        pp = 1.5 * a0 - 0.5 * a1;
+        cp = 0.5 * (a0 + a1);
+        np = 1.5 * a1 - 0.5 * a0;
+    } else {
+        cp = cp_real;
+        pp = prev_is_end ? (2.0 * prev_pos - cp) : prev_pos;  // ghost: reflect cp through the endpoint neighbor
+        np = next_is_end ? (2.0 * next_pos - cp) : next_pos;
+    }
+
+    // t_branch: rasterizer's branch threshold between prev_dir and
+    // next_dir for color resolution. Same dispatch test_one_cp used to run per pixel, plus a crossing case.
+    //
+    // Crossings: solve the curve-curve intersection inline (Newton
+    // iteration on F(t,s) = B_a(t) - B_b(s) = 0). Need both this slot's
+    // own neighbors and the partner slot's neighbors — slot 0 holds
+    // the N-S chain, slot 1 holds E-W. cp_a and cp_b are the same
+    // physical position (both slots of a crossing are co-located).
+    float t_branch;
+    if ((flag & IS_CROSSING) != 0u) {
+        // Read partner-slot neighbor indices (partner is at cy_slot ^ 1
+        // within the same cx column).
+        int p_base_row = (cy_slot ^ 1) * 3;
+        int p_prev = decode_neighbor(texelFetch(CellGraph, ivec2(cx, p_base_row + 1), 0));
+        int p_next = decode_neighbor(texelFetch(CellGraph, ivec2(cx, p_base_row + 2), 0));
+
+        int n_idx, s_idx, e_idx, w_idx;
+        if (slot == 0) {
+            n_idx = prev_ci; s_idx = next_ci;   // own (slot 0) = N-S
+            e_idx = p_prev;  w_idx = p_next;    // partner (slot 1) = E-W
+        } else {
+            n_idx = p_prev;  s_idx = p_next;    // partner (slot 0) = N-S
+            e_idx = prev_ci; w_idx = next_ci;   // own (slot 1) = E-W
+        }
+
+        if (n_idx >= 0 && s_idx >= 0 && e_idx >= 0 && w_idx >= 0) {
+            vec2 n_pos = read_pos(n_idx);
+            vec2 s_pos = read_pos(s_idx);
+            vec2 e_pos = read_pos(e_idx);
+            vec2 w_pos = read_pos(w_idx);
+
+            bool n_is_end = (read_flags(n_idx) & IS_ENDPOINT) != 0u;
+            bool s_is_end = (read_flags(s_idx) & IS_ENDPOINT) != 0u;
+            bool e_is_end = (read_flags(e_idx) & IS_ENDPOINT) != 0u;
+            bool w_is_end = (read_flags(w_idx) & IS_ENDPOINT) != 0u;
+            // Same ghost extension as pp/np above, applied to all four arms around this slot's cp_real.
+            vec2 n_in = n_is_end ? (2.0 * n_pos - cp_real) : n_pos;
+            vec2 s_in = s_is_end ? (2.0 * s_pos - cp_real) : s_pos;
+            vec2 e_in = e_is_end ? (2.0 * e_pos - cp_real) : e_pos;
+            vec2 w_in = w_is_end ? (2.0 * w_pos - cp_real) : w_pos;
+
+            vec2 t_pair = bspline_intersect(n_in, cp_real, s_in,
+                                             e_in, cp_real, w_in);
+            t_branch = (slot == 0) ? t_pair.x : t_pair.y;  // .x is the N-S parameter, .y the E-W parameter
+        } else {
+            t_branch = 0.5;  // NOTE(review): this also bypasses the clamped-endpoint projection below — confirm intended
+        }
+    } else if (two_cp_chain) {
+        t_branch = 0.5;
+    } else if (prev_is_end || next_is_end) {
+        // One-sided clamped Bezier: parameterization is shifted, so the
+        // t at which the rendered curve reaches the symmetric
+        // "before/after sc" location of an interior span isn't 0.5.
+        // Project the interior-span midpoint onto the clamped span.
+        vec2 interior_mid = 0.125 * pp + 0.75 * cp + 0.125 * np;
+        t_branch = span_closest_t(pp, cp, np, interior_mid);
+    } else {
+        t_branch = 0.5;
+    }
+
+    float validity = two_cp_chain ? 2.0 : 1.0;  // read_packed_cp: a < 0.5 = skip, a > 1.5 = is_line
+    float prev_ci_f = (prev_ci < 0) ? -1.0 : float(prev_ci);
+    float next_ci_f = (next_ci < 0) ? -1.0 : float(next_ci);
+    // All three texels of a CP run the full computation above; each emits only its own column.
+    vec4 out_texel;
+    if (which_pos == 0)      out_texel = vec4(pp.x, pp.y, prev_ci_f, 0.0);
+    else if (which_pos == 1) out_texel = vec4(cp.x, cp.y, t_branch,  validity);
+    else                     out_texel = vec4(np.x, np.y, next_ci_f, 0.0);
+    FragColor = out_texel;
+}
diff --git a/edge-smoothing/vectorscale/shaders/update-tjunction.slang b/edge-smoothing/vectorscale/shaders/update-tjunction.slang
index 9b7811d5dc..493ab584e4 100644
--- a/edge-smoothing/vectorscale/shaders/update-tjunction.slang
+++ b/edge-smoothing/vectorscale/shaders/update-tjunction.slang
@@ -1,24 +1,23 @@
 #version 450
 #pragma format R32G32B32A32_SFLOAT
 
-// Pass: Post-optimization junction position correction.
+// Pass: T-junction stem CP snap.
 //
-// Crossings: ghost-aware inverse B-spline correction (each slot
-//   independently). Coefficients depend on whether prev/next is a
-//   chain endpoint, which causes the rasterizer to ghost-adjust
-//   that side of the clamped Bezier.
-// T-junctions: through-CP is left at the optimizer's value (not
-//   overwritten). Stem CPs snap onto the rendered through-curve
-//   via the ghost-aware algebraic B(0.5) formula.
+// Through-CPs (IS_TJUNCTION) and crossings (IS_CROSSING) pass through
+// at their optimizer-final positions. Stem CPs (the slot-1 partner of
+// an IS_TJUNCTION through-CP) snap onto the rendered through-curve via
+// the ghost-aware algebraic B(0.5) formula. Coefficients depend on
+// whether the through-CP's prev/next is a chain endpoint.
 //
-// Uses Opt2 (original optimizer output) for the crossing-correction
-// grid target, Source (previous pass) for neighbor positions. Multi-
-// pass iteration: dispatch a few times so junction CPs whose neighbors
-// are themselves junctions converge.
+// Crossing curve-curve intersection is computed in pack-positions; this
+// pass leaves the crossing CP positions alone.
+//
+// Multi-pass iteration: dispatched 3× so stem CPs whose chain
+// neighbors are themselves stems converge (Jacobi iteration on the
+// stem-position system, contraction ≈ 0.17/pass).
 //
 // Input textures:
 //   Source (previous pass): current positions (neighbors)
-//   Opt2: original optimizer output (center weight)
 //   CellGraph: neighbors + flags
 //
 // Output: (R=pos.x, G=pos.y, B=0, A=0)
@@ -33,7 +32,6 @@ layout(push_constant) uniform Push {
 layout(std140, set = 0, binding = 0) uniform UBO {
     mat4 MVP;
     vec4 CellGraphSize;
-    vec4 Opt2Size;
 } global;
 
 #pragma stage vertex
@@ -52,7 +50,6 @@ layout(location = 0) out vec4 FragColor;
 
 layout(set = 0, binding = 2) uniform sampler2D Source;    // current positions (neighbors)
 layout(set = 0, binding = 3) uniform sampler2D CellGraph; // pass 3 (neighbors + flags)
-layout(set = 0, binding = 4) uniform sampler2D Opt2;      // original optimizer output
 
 int IMG_W()    { return int(params.OriginalSize.x); }
 int IMG_H()    { return int(params.OriginalSize.y); }
@@ -61,7 +58,6 @@ int CORNERS_H() { return IMG_H() + 1; }
 int NUM_CPS()  { return CORNERS_W() * CORNERS_H() * 2; }
 
 const uint IS_TJUNCTION = 32u;
-const uint IS_CROSSING  = 64u;
 const uint IS_ENDPOINT  = 128u;
 
 void decode_cp(int cp_idx, out int cx, out int cy_slot) {
@@ -80,14 +76,6 @@ vec2 read_pos(int cp_idx) {
     return vec2(val.r, val.g);
 }
 
-vec2 read_orig_pos(int cp_idx) {
-    if (cp_idx < 0 || cp_idx >= NUM_CPS()) return vec2(0.0);
-    int cx, cy_slot;
-    decode_cp(cp_idx, cx, cy_slot);
-    vec4 val = texelFetch(Opt2, ivec2(cx, cy_slot), 0);
-    return vec2(val.r, val.g);
-}
-
 int decode_neighbor(vec4 val) {
     if (val.r < -0.5) return -1;
     int ncx = int(val.r + 0.5);
@@ -131,43 +119,14 @@ void main() {
 
     uint flags = read_flags(i);
     vec2 curr_pos = read_pos(i);
-    vec2 orig_pos = read_orig_pos(i);
 
     int partner = i ^ 1;
     uint partner_flags = read_flags(partner);
 
-    // Valence-4 crossing: ghost-aware inverse B-spline correction.
-    // Each slot processed independently (different pp/np). The
-    // coefficients (a, bp, bn) solve B_rendered(0.5) = grid for cp,
-    // accounting for the rasterizer's ghost extension of any endpoint
-    // neighbor on this slot's chain.
-    if ((flags & IS_CROSSING) != 0u) {
-        ivec2 nbrs = read_neighbors(i);
-        int prev_idx = nbrs.x;
-        int next_idx = nbrs.y;
-
-        if (prev_idx >= 0 && next_idx >= 0) {
-            vec2 prev_pos = read_pos(prev_idx);
-            vec2 next_pos = read_pos(next_idx);
-            bool prev_is_end = (read_flags(prev_idx) & IS_ENDPOINT) != 0u;
-            bool next_is_end = (read_flags(next_idx) & IS_ENDPOINT) != 0u;
-            float a, bp, bn;
-            if (!prev_is_end && !next_is_end) { a = 0.75; bp = 0.125; bn = 0.125; }
-            else if (prev_is_end && !next_is_end) { a = 0.625; bp = 0.25; bn = 0.125; }
-            else if (!prev_is_end && next_is_end) { a = 0.625; bp = 0.125; bn = 0.25; }
-            else { a = 0.5; bp = 0.25; bn = 0.25; }
-            vec2 corrected = (orig_pos - bp * prev_pos - bn * next_pos) / a;
-            FragColor = vec4(corrected.x, corrected.y, 0.0, 0.0);
-            return;
-        }
-    } else if ((flags & IS_TJUNCTION) != 0u) {
-        // T-junction through-CP: leave at the optimizer's position.
-        FragColor = vec4(curr_pos.x, curr_pos.y, 0.0, 0.0);
-        return;
-    } else if ((partner_flags & IS_TJUNCTION) != 0u && (flags & ~IS_ENDPOINT) == 1u) {
+    if ((partner_flags & IS_TJUNCTION) != 0u && (flags & ~IS_ENDPOINT) == 1u) {
         // Stem CPs carry flags = IS_USED | IS_ENDPOINT (= 129); mask
         // IS_ENDPOINT before testing that this slot is a plain stem.
-        // Stem: snap onto the rendered through-curve via ghost-aware B(0.5).
+        // Snap onto the rendered through-curve via ghost-aware B(0.5).
         ivec2 partner_nbrs = read_neighbors(partner);
         int prev_idx = partner_nbrs.x;
         int next_idx = partner_nbrs.y;
@@ -189,6 +148,9 @@ void main() {
         }
     }
 
-    // Default: pass through position unchanged
+    // Default: pass through position unchanged. Crossings and T-junction
+    // through-CPs both end up here — they keep the optimizer's position;
+    // the crossing's curve-curve intersection parameter is computed
+    // downstream in pack-positions.
     FragColor = vec4(curr_pos.x, curr_pos.y, 0.0, 0.0);
 }
diff --git a/edge-smoothing/vectorscale/vectorscale.slangp b/edge-smoothing/vectorscale/vectorscale.slangp
index cac088e971..d0bf772ac7 100644
--- a/edge-smoothing/vectorscale/vectorscale.slangp
+++ b/edge-smoothing/vectorscale/vectorscale.slangp
@@ -1,15 +1,16 @@
 # Vibeboy vectorize shader preset
 # Works with any input resolution (not just Game Boy 160x144).
 # Intermediate textures are over-allocated using source-relative scales:
-#   Cell graph: 2W x 7H (needs (W+1) x 6(H+1), works for H >= 6)
-#   Positions:  2W x 3.5H (needs (W+1) x 2(H+1), works for H >= 2)
+#   Cell graph:   2W x 7H   (needs (W+1) x 6(H+1), works for H >= 6)
+#   Positions:    2W x 3.5H (needs (W+1) x 2(H+1), works for H >= 2)
+#   PackedPositions: 6W x 3.5H (needs 3(W+1) x 2(H+1): 3 horizontally-adjacent texels per CP slot)
 #
 # Framebuffer formats are set per-shader via `#pragma format` (FP32 for
 # position-storing passes). DO NOT add `float_framebuffer<N> = true` here
 # — it forces RGBA16F on some backends and silently overrides the pragma,
 # causing sub-pixel position rounding visible as hairline misalignment
 # at the rasterizer.
-shaders = 10
+shaders = 11
 
 shader0 = shaders/similarity-graph.slang
 alias0 = SimilarityGraph
@@ -57,9 +58,10 @@ scale_type5 = source
 scale5 = 1.0
 wrap_mode5 = clamp_to_border
 
-# T-junction/crossing correction: 3 iterations for convergence.
-# Each pass reads neighbor positions from Source (previous pass) and
-# original center positions from Opt2.
+# T-junction stem CP snap (3 iterations for convergence). Only stem CPs
+# are repositioned here — through-CPs and crossings pass through with
+# their optimizer-final positions intact. The crossing curve-curve
+# intersection is handled in pack-positions below.
 shader6 = shaders/update-tjunction.slang
 alias6 = TJunc1
 filter_linear6 = false
@@ -81,9 +83,25 @@ scale_type8 = source
 scale8 = 1.0
 wrap_mode8 = clamp_to_border
 
-shader9 = shaders/cell-rasterizer.slang
-alias9 = FinalOutput
-filter_linear9 = true
-scale_type9 = viewport
-scale9 = 1.0
+# Denormalize per-CP geometry: pack each CP's (pp, cp, np) ghost-extended
+# triple, t_branch, neighbor indices, validity, and is_line flag into 3
+# horizontally-adjacent texels of PackedPositions. For IS_CROSSING CPs,
+# t_branch is the curve-curve intersection parameter (Newton iteration on
+# the two B-spline spans, inlined). Lets the rasterizer skip
+# neighbor-index decode + neighbor position fetches + ghost construction
+# + cubic/Newton solves in its hot loop.
+shader9 = shaders/pack-positions.slang
+alias9 = PackedPositions
+filter_linear9 = false
+scale_type_x9 = source
+scale_x9 = 3.0
+scale_type_y9 = source
+scale_y9 = 1.0
 wrap_mode9 = clamp_to_border
+
+shader10 = shaders/cell-rasterizer.slang
+alias10 = FinalOutput
+filter_linear10 = true
+scale_type10 = viewport
+scale10 = 1.0
+wrap_mode10 = clamp_to_border