Skip to content

Commit b984124

Browse files
committed
gdi: replace per-pixel /255 divides with shift+add equivalent
The hot pixel paths in gfx_display_gdi_draw and gdi_font_render_line do many `(uint32_t)x / 255u` operations per pixel — that's a 20-30 cycle integer divide on x86 vs a few cycles for shift+add. For a typical Ozone-with-widgets frame:

- General 4-corner gradient: 14 divides per pixel.
- 1D gradients (vertical/horizontal): 4 divides per row/column, plus 3 per non-opaque pixel. Less hot since the previous commit collapsed those to 1D loops, but still worth a free win.
- Tinted-glyph font composite: 4 divides per glyph pixel.

Add a GDI_DIV255 macro:

    #define GDI_DIV255(x) ((((x) + 1) + ((x) >> 8)) >> 8)

Verified bit-exact equivalent of `(uint32_t)x / 255u` for every input in [0, 255*255 = 65025] — a brute-force comparison against integer division across all 65026 values produces zero diffs. That's exactly the input range that products of two 8-bit values land in, which is what every divide-by-255 site here computes (the gradient sites pass a blend `a * t + b * (255 - t)` of two such products, which is bounded by the same 65025).

Applied at every hot per-pixel /255 site:

- Gradient bilinear (general 4-corner path): 14 sites per pixel.
- 1D gradient paths (vertical-only, horizontal-only): 4 sites per row/column plus 3 sites per non-opaque pixel.
- Tinted-glyph font scratch composite: 4 sites per pixel.
- 1x1 translucent-solid premultiply: 3 sites per draw.
- Texture-modulated tint (out_a only): 1 site per pixel.
- Font line outer premultiply: 3 sites per line.
- gdi_load_texture / gdi_overlay_load: 3 sites per non-opaque pixel. Load-time only, but free to apply for consistency.

Deliberately NOT changed:

- The `/ (255u * 255u)` divides for out_r/g/b in gdi_blit_texture_modulated. Collapsing those to two sequential GDI_DIV255 calls would introduce up to 1 LSB of rounding error compared to the single divide, since (a/255)*(b/255) has a different rounding boundary than (a*b)/(255*255). The cost saving isn't worth a visible drift in tinted-icon pixels.
- The `(x + 127) / 255` rounded form in gdi_blit_rgui_alpha. That's deliberately round-to-nearest rather than truncate, which GDI_DIV255 doesn't reproduce. RGUI's per-frame cost is dominated by syscall / blit overhead, not the divides.
- The `(iy * 255u) / (dst_h - 1)` interp-factor divides. Divisor varies per draw; not a constant-255 case.

No visual change intended. Output is byte-identical to the divide-based code at every converted site.
1 parent b68f735 commit b984124

1 file changed

Lines changed: 74 additions & 46 deletions

File tree

gfx/drivers/gdi_gfx.c

Lines changed: 74 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,28 @@ static INLINE uint8_t gdi_float_to_byte(float f)
142142
return (uint8_t)v;
143143
}
144144

145+
/* Fast divide-by-255 for 8-bit-times-8-bit products.
146+
*
147+
* The hot pixel paths (gradient bilinear, 1x1 solid premultiply,
148+
* texture-modulated tint, font tinted-glyph composite, load-time
149+
* texture / overlay premultiply) all need to divide a value of
150+
* the form (a * b) where both a and b are 0..255 by 255 to get
151+
* back into 0..255 range. An integer divide is 20-30 cycles on
152+
* x86; the shift+add form below is bit-exact equivalent to
153+
* (uint32_t)x / 255u for x in [0, 255*255 = 65025], i.e. for
154+
* any product of two 8-bit values (and for the gradient blends
* a * t + b * (255 - t), which are bounded by the same 65025).
155+
*
156+
* Brute-force verified against integer division for every input
157+
* in that range; produces the same truncating (round-toward-
158+
* zero) result the original `/ 255u` produced, so the
159+
* substitution is byte-identical at every call site rather than
160+
* being a "close enough" rounding approximation.
161+
*
162+
* x is referenced multiple times, so callers should pass an
163+
* already-evaluated lvalue or temporary; passing a side-effecting
164+
* expression would evaluate it twice. */
165+
/* x / 255 via shift+add: exact for 0 <= x <= 65025; evaluates x twice. */
#define GDI_DIV255(x) (((x) + ((x) >> 8) + 1) >> 8)
166+
145167
/* Pull the four corners of the per-vertex colour array off a
146168
* gfx_display_ctx_draw_t. Caller passes pointers to four uint32_t
147169
* BGRA values plus a single averaged tint colour (used when we
@@ -917,7 +939,13 @@ static void gdi_blit_texture_modulated(
917939
uint8_t sr = (uint8_t)((s >> 16) & 0xFF);
918940
uint8_t sg = (uint8_t)((s >> 8) & 0xFF);
919941
uint8_t sb = (uint8_t)( s & 0xFF);
920-
uint32_t out_a = ((uint32_t)sa * mod_a) / 255u;
942+
uint32_t out_a = GDI_DIV255((uint32_t)sa * mod_a);
943+
/* The /255²/ divides for out_r/g/b are deliberately left
944+
* unchanged — collapsing to two GDI_DIV255 calls would
945+
* introduce rounding error of up to 1 LSB compared to
946+
* the single 16-bit divide, and the cost of one divide
947+
* per channel here isn't worth a visible drift in
948+
* tinted-icon pixels. */
921949
uint32_t out_r = ((uint32_t)sr * mod_r * mod_a) / (255u * 255u);
922950
uint32_t out_g = ((uint32_t)sg * mod_g * mod_a) / (255u * 255u);
923951
uint32_t out_b = ((uint32_t)sb * mod_b * mod_a) / (255u * 255u);
@@ -1326,18 +1354,18 @@ static void gfx_display_gdi_draw(gfx_display_ctx_draw_t *draw,
13261354
unsigned ty = (dst_h <= 1) ? 0 : (iy * 255u) / (dst_h - 1);
13271355
unsigned t_top = 255u - ty;
13281356
unsigned t_bot = ty;
1329-
uint32_t r_ = (tl_r * t_top + bl_r * t_bot) / 255u;
1330-
uint32_t g_ = (tl_g * t_top + bl_g * t_bot) / 255u;
1331-
uint32_t b_ = (tl_b * t_top + bl_b * t_bot) / 255u;
1332-
uint32_t a_ = (tl_a * t_top + bl_a * t_bot) / 255u;
1357+
uint32_t r_ = GDI_DIV255(tl_r * t_top + bl_r * t_bot);
1358+
uint32_t g_ = GDI_DIV255(tl_g * t_top + bl_g * t_bot);
1359+
uint32_t b_ = GDI_DIV255(tl_b * t_top + bl_b * t_bot);
1360+
uint32_t a_ = GDI_DIV255(tl_a * t_top + bl_a * t_bot);
13331361
uint32_t pix;
13341362
if (all_opaque)
13351363
pix = (0xFFu << 24) | (r_ << 16) | (g_ << 8) | b_;
13361364
else
13371365
{
1338-
uint32_t pr = (r_ * a_) / 255u;
1339-
uint32_t pg = (g_ * a_) / 255u;
1340-
uint32_t pb = (b_ * a_) / 255u;
1366+
uint32_t pr = GDI_DIV255(r_ * a_);
1367+
uint32_t pg = GDI_DIV255(g_ * a_);
1368+
uint32_t pb = GDI_DIV255(b_ * a_);
13411369
pix = (a_ << 24) | (pr << 16) | (pg << 8) | pb;
13421370
}
13431371
/* Fill the row. Inline 32-bit stores are what
@@ -1359,17 +1387,17 @@ static void gfx_display_gdi_draw(gfx_display_ctx_draw_t *draw,
13591387
unsigned tx = (dst_w <= 1) ? 0 : (ix * 255u) / (dst_w - 1);
13601388
unsigned t_left = 255u - tx;
13611389
unsigned t_right = tx;
1362-
uint32_t r_ = (tl_r * t_left + tr_r * t_right) / 255u;
1363-
uint32_t g_ = (tl_g * t_left + tr_g * t_right) / 255u;
1364-
uint32_t b_ = (tl_b * t_left + tr_b * t_right) / 255u;
1365-
uint32_t a_ = (tl_a * t_left + tr_a * t_right) / 255u;
1390+
uint32_t r_ = GDI_DIV255(tl_r * t_left + tr_r * t_right);
1391+
uint32_t g_ = GDI_DIV255(tl_g * t_left + tr_g * t_right);
1392+
uint32_t b_ = GDI_DIV255(tl_b * t_left + tr_b * t_right);
1393+
uint32_t a_ = GDI_DIV255(tl_a * t_left + tr_a * t_right);
13661394
if (all_opaque)
13671395
first_row[ix] = (0xFFu << 24) | (r_ << 16) | (g_ << 8) | b_;
13681396
else
13691397
{
1370-
uint32_t pr = (r_ * a_) / 255u;
1371-
uint32_t pg = (g_ * a_) / 255u;
1372-
uint32_t pb = (b_ * a_) / 255u;
1398+
uint32_t pr = GDI_DIV255(r_ * a_);
1399+
uint32_t pg = GDI_DIV255(g_ * a_);
1400+
uint32_t pb = GDI_DIV255(b_ * a_);
13731401
first_row[ix] = (a_ << 24) | (pr << 16) | (pg << 8) | pb;
13741402
}
13751403
}
@@ -1408,29 +1436,29 @@ static void gfx_display_gdi_draw(gfx_display_ctx_draw_t *draw,
14081436

14091437
/* Vertical interp: left edge (TL→BL) and right
14101438
* edge (TR→BR). */
1411-
left_r = (tl_r * t_top + bl_r * t_bot) / 255u;
1412-
left_g = (tl_g * t_top + bl_g * t_bot) / 255u;
1413-
left_b = (tl_b * t_top + bl_b * t_bot) / 255u;
1414-
left_a = (tl_a * t_top + bl_a * t_bot) / 255u;
1415-
right_r = (tr_r * t_top + br_r * t_bot) / 255u;
1416-
right_g = (tr_g * t_top + br_g * t_bot) / 255u;
1417-
right_b = (tr_b * t_top + br_b * t_bot) / 255u;
1418-
right_a = (tr_a * t_top + br_a * t_bot) / 255u;
1439+
left_r = GDI_DIV255(tl_r * t_top + bl_r * t_bot);
1440+
left_g = GDI_DIV255(tl_g * t_top + bl_g * t_bot);
1441+
left_b = GDI_DIV255(tl_b * t_top + bl_b * t_bot);
1442+
left_a = GDI_DIV255(tl_a * t_top + bl_a * t_bot);
1443+
right_r = GDI_DIV255(tr_r * t_top + br_r * t_bot);
1444+
right_g = GDI_DIV255(tr_g * t_top + br_g * t_bot);
1445+
right_b = GDI_DIV255(tr_b * t_top + br_b * t_bot);
1446+
right_a = GDI_DIV255(tr_a * t_top + br_a * t_bot);
14191447

14201448
/* Horizontal interp between the two vertical
14211449
* edges. */
1422-
r_ = (left_r * t_left + right_r * t_right) / 255u;
1423-
g_ = (left_g * t_left + right_g * t_right) / 255u;
1424-
b_ = (left_b * t_left + right_b * t_right) / 255u;
1425-
a_ = (left_a * t_left + right_a * t_right) / 255u;
1450+
r_ = GDI_DIV255(left_r * t_left + right_r * t_right);
1451+
g_ = GDI_DIV255(left_g * t_left + right_g * t_right);
1452+
b_ = GDI_DIV255(left_b * t_left + right_b * t_right);
1453+
a_ = GDI_DIV255(left_a * t_left + right_a * t_right);
14261454

14271455
if (all_opaque)
14281456
row[ix] = (0xFFu << 24) | (r_ << 16) | (g_ << 8) | b_;
14291457
else
14301458
{
1431-
uint32_t pr = (r_ * a_) / 255u;
1432-
uint32_t pg = (g_ * a_) / 255u;
1433-
uint32_t pb = (b_ * a_) / 255u;
1459+
uint32_t pr = GDI_DIV255(r_ * a_);
1460+
uint32_t pg = GDI_DIV255(g_ * a_);
1461+
uint32_t pb = GDI_DIV255(b_ * a_);
14341462
row[ix] = (a_ << 24) | (pr << 16) | (pg << 8) | pb;
14351463
}
14361464
}
@@ -1482,9 +1510,9 @@ static void gfx_display_gdi_draw(gfx_display_ctx_draw_t *draw,
14821510

14831511
/* Premultiply the source colour by its alpha. */
14841512
pre = ((uint32_t)avg_a << 24)
1485-
| (((uint32_t)avg_r * avg_a / 255u) << 16)
1486-
| (((uint32_t)avg_g * avg_a / 255u) << 8)
1487-
| ((uint32_t)avg_b * avg_a / 255u);
1513+
| (GDI_DIV255((uint32_t)avg_r * avg_a) << 16)
1514+
| (GDI_DIV255((uint32_t)avg_g * avg_a) << 8)
1515+
| GDI_DIV255((uint32_t)avg_b * avg_a);
14881516
*gdi->scratch_1x1_pixels = pre;
14891517

14901518
if (!gdi->texDC)
@@ -1977,9 +2005,9 @@ static void gdi_font_render_line(
19772005
/* Premultiply the requested colour. We'll multiply by the
19782006
* atlas alpha per pixel below. */
19792007
pre_a = a;
1980-
pre_r = ((uint32_t)r * a) / 255u;
1981-
pre_g = ((uint32_t)g * a) / 255u;
1982-
pre_b = ((uint32_t)b * a) / 255u;
2008+
pre_r = GDI_DIV255((uint32_t)r * a);
2009+
pre_g = GDI_DIV255((uint32_t)g * a);
2010+
pre_b = GDI_DIV255((uint32_t)b * a);
19832011

19842012
/* Composite glyphs into the scratch DIB. Scale-1.0 fast path
19852013
* does direct A8 → premultiplied BGRA copy; scaled glyphs go
@@ -2046,10 +2074,10 @@ static void gdi_font_render_line(
20462074
/* Premultiplied glyph pixel at the requested tint.
20472075
* Output alpha = atlas_a * tint_a; output RGB =
20482076
* tint_RGB premultiplied by output alpha. */
2049-
out_a = ((uint32_t)alpha * pre_a) / 255u;
2050-
out_r = ((uint32_t)alpha * pre_r) / 255u;
2051-
out_g = ((uint32_t)alpha * pre_g) / 255u;
2052-
out_b = ((uint32_t)alpha * pre_b) / 255u;
2077+
out_a = GDI_DIV255((uint32_t)alpha * pre_a);
2078+
out_r = GDI_DIV255((uint32_t)alpha * pre_r);
2079+
out_g = GDI_DIV255((uint32_t)alpha * pre_g);
2080+
out_b = GDI_DIV255((uint32_t)alpha * pre_b);
20532081
/* Last-write wins where glyphs overlap: kerned
20542082
* fonts can produce overlapping bounding boxes,
20552083
* but the actual coverage rarely overlaps. */
@@ -3329,9 +3357,9 @@ static uintptr_t gdi_load_texture(void *video_data, void *data,
33293357
}
33303358
else
33313359
{
3332-
pr = (uint8_t)(((unsigned)sr * sa) / 255u);
3333-
pg = (uint8_t)(((unsigned)sg * sa) / 255u);
3334-
pb = (uint8_t)(((unsigned)sb * sa) / 255u);
3360+
pr = (uint8_t)GDI_DIV255((unsigned)sr * sa);
3361+
pg = (uint8_t)GDI_DIV255((unsigned)sg * sa);
3362+
pb = (uint8_t)GDI_DIV255((unsigned)sb * sa);
33353363
}
33363364
dst[i] = ((uint32_t)sa << 24)
33373365
| ((uint32_t)pr << 16)
@@ -3598,9 +3626,9 @@ static bool gdi_overlay_load(void *data,
35983626
else if (sa == 0) { pr = pg = pb = 0; }
35993627
else
36003628
{
3601-
pr = (uint8_t)(((unsigned)sr * sa) / 255u);
3602-
pg = (uint8_t)(((unsigned)sg * sa) / 255u);
3603-
pb = (uint8_t)(((unsigned)sb * sa) / 255u);
3629+
pr = (uint8_t)GDI_DIV255((unsigned)sr * sa);
3630+
pg = (uint8_t)GDI_DIV255((unsigned)sg * sa);
3631+
pb = (uint8_t)GDI_DIV255((unsigned)sb * sa);
36043632
}
36053633
dst[j] = ((uint32_t)sa << 24)
36063634
| ((uint32_t)pr << 16)

0 commit comments

Comments
 (0)