Skip to content

Commit cf9c2b6

Browse files
committed
gfx/gl1: Upload RGUI menu framebuffer directly as RGBA4444
Previously the GL1 driver expanded RGUI's 16bpp framebuffer to 32bpp on the CPU every frame via conv_rgba4444_argb8888 before uploading it as BGRA8888. The expansion was a per-pixel loop (with an MMX fast path on x86) and doubled the GPU upload bandwidth. RGUI already assembles its framebuffer in RGBA4444; GL has been able to consume that layout directly via GL_UNSIGNED_SHORT_4_4_4_4 since GL 1.2 (1998).

Add a GL1_FLAG_SUPPORTS_PACKED_PIXELS capability flag, probed once at init from the GL version (>= 1.2) or the GL_EXT_packed_pixels extension. When it is set, the menu draw path keeps RGUI's native 16bpp layout end-to-end: rows are memcpy'd into a power-of-two (POT) padded staging buffer at half the previous size and uploaded via (GL_RGBA, GL_UNSIGNED_SHORT_4_4_4_4) with no swizzle or expansion. The original 32bpp expansion path is preserved as a fallback for strict GL 1.1 implementations and for the Vita build (vitaGL packed-pixel paths are unverified).

The new path is endian-safe by construction. RGUI's argb32_to_rgba4444 produces a host-endian uint16_t with R in bits 15..12 and A in bits 3..0; glTexImage2D reads each GL_UNSIGNED_SHORT_4_4_4_4 unit through the host's native uint16_t interpretation (as long as GL_UNPACK_SWAP_BYTES is left at its default of GL_FALSE), so the same source bytes work on LE and BE hosts without a byte swap.

gl1_draw_tex gains a fb_4444 parameter that selects the new upload format and skips the BGRA-fallback CPU swizzle (the 16bpp path's bytes already match GL_RGBA channel order). All three existing call sites are updated accordingly; the content path passes false and is byte-identical to before.

Removes the misleading "I could not get 444 or 555 to work" FIXME on the original gl1_draw_tex; this commit is what it asked for.
1 parent 4e3be96 commit cf9c2b6

1 file changed

Lines changed: 88 additions & 16 deletions

File tree

gfx/drivers/gl1.c

Lines changed: 88 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,12 @@ enum gl1_flags
108108
GL1_FLAG_MENU_SMOOTH = (1 << 9),
109109
GL1_FLAG_OVERLAY_ENABLE = (1 << 10),
110110
GL1_FLAG_OVERLAY_FULLSCREEN = (1 << 11),
111-
GL1_FLAG_FRAME_DUPE_LOCK = (1 << 12)
111+
GL1_FLAG_FRAME_DUPE_LOCK = (1 << 12),
112+
/* GL_UNSIGNED_SHORT_4_4_4_4 is core in GL 1.2; on strict 1.1
113+
* implementations it is provided by GL_EXT_packed_pixels. When
114+
* neither is available, the menu path falls back to expanding
115+
* RGUI's RGBA4444 framebuffer to BGRA8888 on the CPU. */
116+
GL1_FLAG_SUPPORTS_PACKED_PIXELS = (1 << 13)
112117
};
113118

114119
typedef struct gl1
@@ -1328,6 +1333,18 @@ static void *gl1_init(const video_info_t *video,
13281333
if (string_list_find_elem(gl1->extensions, "GL_EXT_bgra"))
13291334
gl1->flags |= GL1_FLAG_SUPPORTS_BGRA;
13301335

1336+
/* GL_UNSIGNED_SHORT_4_4_4_4 became core in GL 1.2 (1998); strict
1337+
* 1.1 implementations may still expose it via GL_EXT_packed_pixels.
1338+
* If neither is present we fall back to CPU expansion in the menu
1339+
* path. Skip on Vita: vitaGL is a fixed-function wrapper and we
1340+
* have not verified packed-pixel upload paths there. */
1341+
#ifndef VITA
1342+
if ( gl1->version_major > 1
1343+
|| (gl1->version_major == 1 && gl1->version_minor >= 2)
1344+
|| string_list_find_elem(gl1->extensions, "GL_EXT_packed_pixels"))
1345+
gl1->flags |= GL1_FLAG_SUPPORTS_PACKED_PIXELS;
1346+
#endif
1347+
13311348
glDisable(GL_BLEND);
13321349
glDisable(GL_DEPTH_TEST);
13331350
glDisable(GL_CULL_FACE);
@@ -1417,18 +1434,31 @@ static void gl1_set_viewport(gl1_t *gl1,
14171434
}
14181435
}
14191436

1420-
static void gl1_draw_tex(gl1_t *gl1, int pot_width, int pot_height, int width, int height, GLuint tex, const void *frame_to_copy)
1437+
static void gl1_draw_tex(gl1_t *gl1, int pot_width, int pot_height, int width, int height, GLuint tex, const void *frame_to_copy, bool fb_4444)
14211438
{
14221439
uint8_t *frame = NULL;
14231440
uint8_t *frame_rgba = NULL;
1424-
/* FIXME: For now, everything is uploaded as BGRA8888, I could not get 444 or 555 to work, and there is no 565 support in GL 1.1 either. */
1425-
GLint internalFormat = GL_RGB8;
1426-
bool supports_native = gl1->flags & GL1_FLAG_SUPPORTS_BGRA;
1427-
GLenum format = supports_native ? GL_BGRA_EXT : GL_RGBA;
1441+
/* When fb_4444 is true the source is RGUI's 16bpp framebuffer in
1442+
* RGBA4444 layout (uint16_t with R in bits 15..12, A in 3..0) and
1443+
* is uploaded directly via GL_UNSIGNED_SHORT_4_4_4_4 — the channel
1444+
* order matches GL_RGBA exactly, so no swizzle/expansion is needed.
1445+
* Otherwise the source is BGRA8888 (or its byte-swapped equivalent
1446+
* on big-endian builds) and we use the original 32bpp upload path,
1447+
* which falls back to a CPU swizzle to RGBA8888 when the GL
1448+
* implementation lacks GL_EXT_bgra. */
1449+
GLint internalFormat = fb_4444 ? GL_RGBA : GL_RGB8;
1450+
bool supports_native = (gl1->flags & GL1_FLAG_SUPPORTS_BGRA) ? true : false;
1451+
GLenum format = fb_4444
1452+
? GL_RGBA
1453+
: (supports_native ? GL_BGRA_EXT : GL_RGBA);
14281454
#ifdef MSB_FIRST
1429-
GLenum type = supports_native ? GL_UNSIGNED_INT_8_8_8_8_REV : GL_UNSIGNED_BYTE;
1455+
GLenum type = fb_4444
1456+
? GL_UNSIGNED_SHORT_4_4_4_4
1457+
: (supports_native ? GL_UNSIGNED_INT_8_8_8_8_REV : GL_UNSIGNED_BYTE);
14301458
#else
1431-
GLenum type = GL_UNSIGNED_BYTE;
1459+
GLenum type = fb_4444
1460+
? GL_UNSIGNED_SHORT_4_4_4_4
1461+
: GL_UNSIGNED_BYTE;
14321462
#endif
14331463
float vertices[] = {
14341464
-1.0f, -1.0f, 0.0f,
@@ -1474,7 +1504,10 @@ static void gl1_draw_tex(gl1_t *gl1, int pot_width, int pot_height, int width, i
14741504

14751505
frame = (uint8_t*)frame_to_copy;
14761506

1477-
if (!supports_native)
1507+
/* The BGRA-fallback swizzle below only applies to the 32bpp upload
1508+
* path; the 16bpp 4444 path's bytes already match GL_RGBA channel
1509+
* order. */
1510+
if (!fb_4444 && !supports_native)
14781511
{
14791512
frame_rgba = (uint8_t*)malloc(pot_width * pot_height * 4);
14801513
if (frame_rgba)
@@ -1722,18 +1755,30 @@ static bool gl1_frame(void *data, const void *frame,
17221755

17231756
if (frame_to_copy)
17241757
gl1_draw_tex(gl1, pot_width, pot_height,
1725-
width, height, gl1->tex, frame_to_copy);
1758+
width, height, gl1->tex, frame_to_copy, false);
17261759
}
17271760

17281761
#ifdef HAVE_MENU
17291762
if (gl1->menu_frame && menu_is_alive)
17301763
{
1764+
bool fb_4444;
1765+
unsigned bpp;
1766+
17311767
frame_to_copy = NULL;
17321768
width = gl1->menu_width;
17331769
height = gl1->menu_height;
17341770
pitch = gl1->menu_pitch;
17351771
bits = gl1->menu_bits;
17361772

1773+
/* Decide upload path now that menu_bits has been latched.
1774+
* Fast path: keep RGUI's native 16bpp RGBA4444 layout end-to-end
1775+
* and let GL consume it via GL_UNSIGNED_SHORT_4_4_4_4. Fallback
1776+
* expands to 32bpp on the CPU and uploads as BGRA8888 (or RGBA8888
1777+
* on implementations without GL_EXT_bgra). */
1778+
fb_4444 = (bits == 16)
1779+
&& (gl1->flags & GL1_FLAG_SUPPORTS_PACKED_PIXELS);
1780+
bpp = fb_4444 ? 2 : 4;
1781+
17371782
pot_width = GET_POT(width);
17381783
pot_height = GET_POT(height);
17391784

@@ -1750,26 +1795,53 @@ static bool gl1_frame(void *data, const void *frame,
17501795

17511796
if (!gl1->menu_video_buf)
17521797
gl1->menu_video_buf = (unsigned char*)
1753-
malloc(pot_width * pot_height * 4);
1798+
malloc((size_t)pot_width * (size_t)pot_height * bpp);
17541799

17551800
if (bits == 16 && gl1->menu_video_buf)
17561801
{
1757-
conv_rgba4444_argb8888(gl1->menu_video_buf,
1758-
gl1->menu_frame, width, height,
1759-
pot_width * sizeof(unsigned), pitch);
1802+
if (fb_4444)
1803+
{
1804+
/* Direct upload path: RGUI emits its framebuffer in
1805+
* RGBA4444 (host-endian uint16_t with R in bits 15..12,
1806+
* G 11..8, B 7..4, A 3..0). Endianness of the upload is
1807+
* implicit: glTexImage2D reads each GL_UNSIGNED_SHORT_4_4_4_4
1808+
* unit using the host's native uint16_t interpretation, so
1809+
* the same source bytes work on LE and BE hosts without a
1810+
* byte swap. Copy width-rows into the top-left of the
1811+
* pot-padded staging buffer; rows beyond `height` and
1812+
* pixels beyond `width` lie outside the
1813+
* (norm_width, norm_height) tex-coord rectangle in
1814+
* gl1_draw_tex and never reach the screen. */
1815+
unsigned y;
1816+
const uint8_t *src = (const uint8_t*)gl1->menu_frame;
1817+
uint8_t *dst = (uint8_t*)gl1->menu_video_buf;
1818+
unsigned dst_pitch = pot_width * 2;
1819+
unsigned row_bytes = width * 2;
1820+
for (y = 0; y < height; y++)
1821+
memcpy(dst + dst_pitch * y, src + pitch * y, row_bytes);
1822+
}
1823+
else
1824+
{
1825+
/* Fallback expansion to 32bpp for GL <1.2 without
1826+
* GL_EXT_packed_pixels (and for the Vita build). This
1827+
* preserves the original behaviour. */
1828+
conv_rgba4444_argb8888(gl1->menu_video_buf,
1829+
gl1->menu_frame, width, height,
1830+
pot_width * sizeof(unsigned), pitch);
1831+
}
17601832

17611833
frame_to_copy = gl1->menu_video_buf;
17621834

17631835
if (gl1->flags & GL1_FLAG_MENU_TEXTURE_FULLSCREEN)
17641836
{
17651837
glViewport(0, 0, video_width, video_height);
17661838
gl1_draw_tex(gl1, pot_width, pot_height,
1767-
width, height, gl1->menu_tex, frame_to_copy);
1839+
width, height, gl1->menu_tex, frame_to_copy, fb_4444);
17681840
glViewport(gl1->vp.x, gl1->vp.y, gl1->vp.width, gl1->vp.height);
17691841
}
17701842
else
17711843
gl1_draw_tex(gl1, pot_width, pot_height,
1772-
width, height, gl1->menu_tex, frame_to_copy);
1844+
width, height, gl1->menu_tex, frame_to_copy, fb_4444);
17731845
}
17741846
}
17751847

0 commit comments

Comments
 (0)