11// SPDX-License-Identifier: GPL-2.0-only
22/*
3- * Copyright (C) 2024, SUSE LLC
 * Copyright (C) 2024-2026, SUSE LLC
44 *
 * Authors: Enzo Matsumiya <[email protected]>
 *
1616/*
1717 * Compression parameters.
1818 */
#define LZ77_MATCH_MAX_DIST	SZ_8K
#define LZ77_HASH_LOG		15
#define LZ77_HASH_SIZE		(1 << LZ77_HASH_LOG)
#define LZ77_RSTEP_SIZE		sizeof(u32)	/* read step for hash probes */
#define LZ77_MSTEP_SIZE		sizeof(u64)	/* compare step for match extension */
#define LZ77_SKIP_TRIGGER	4		/* accelerate stepping after 2^4 probe misses */

/*
 * Prefetch @ptr for read, with maximum temporal locality.
 * Note: no space between the macro name and its parameter list — with a
 * space this becomes an object-like macro and every use fails to expand.
 */
#define LZ77_PREFETCH(ptr)	__builtin_prefetch((ptr), 0, 3)
#define LZ77_FLAG_MAX		32		/* literal/match flag bits per 32-bit flag word */
2428
2529static __always_inline u8 lz77_read8 (const u8 * ptr )
2630{
2731 return get_unaligned (ptr );
2832}
2933
34+ static __always_inline u32 lz77_read32 (const u32 * ptr )
35+ {
36+ return get_unaligned (ptr );
37+ }
38+
3039static __always_inline u64 lz77_read64 (const u64 * ptr )
3140{
3241 return get_unaligned (ptr );
@@ -50,14 +59,14 @@ static __always_inline void lz77_write32(u32 *ptr, u32 v)
5059static __always_inline u32 lz77_match_len (const void * match , const void * cur , const void * end )
5160{
5261 const void * start = cur ;
53- u64 diff ;
5462
5563 /* Safe for a do/while because otherwise we wouldn't reach here from the main loop. */
5664 do {
57- diff = lz77_read64 (cur ) ^ lz77_read64 (match );
65+ const u64 diff = lz77_read64 (cur ) ^ lz77_read64 (match );
66+
5867 if (!diff ) {
59- cur += LZ77_STEP_SIZE ;
60- match += LZ77_STEP_SIZE ;
68+ cur += LZ77_MSTEP_SIZE ;
69+ match += LZ77_MSTEP_SIZE ;
6170
6271 continue ;
6372 }
@@ -66,7 +75,7 @@ static __always_inline u32 lz77_match_len(const void *match, const void *cur, co
6675 cur += count_trailing_zeros (diff ) >> 3 ;
6776
6877 return (cur - start );
69- } while (likely (cur + LZ77_STEP_SIZE <= end ));
78+ } while (likely (cur + LZ77_MSTEP_SIZE <= end ));
7079
7180 /* Fallback to byte-by-byte comparison for last <8 bytes. */
7281 while (cur < end && lz77_read8 (cur ) == lz77_read8 (match )) {
@@ -77,7 +86,7 @@ static __always_inline u32 lz77_match_len(const void *match, const void *cur, co
7786 return (cur - start );
7887}
7988
80- static __always_inline void * lz77_write_match (void * dst , void * * nib , u32 dist , u32 len )
89+ static __always_inline void * lz77_encode_match (void * dst , void * * nib , u16 dist , u32 len )
8190{
8291 len -= 3 ;
8392 dist -- ;
@@ -131,94 +140,124 @@ static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u
131140 return dst + 4 ;
132141}
133142
134- noinline int lz77_compress (const void * src , u32 slen , void * dst , u32 * dlen )
143+ static __always_inline void * lz77_encode_literals (const void * start , const void * end , void * dst ,
144+ long * f , u32 * fc , void * * fp )
145+ {
146+ if (start >= end )
147+ return dst ;
148+
149+ do {
150+ const u32 len = umin (end - start , LZ77_FLAG_MAX - * fc );
151+
152+ memcpy (dst , start , len );
153+
154+ dst += len ;
155+ start += len ;
156+
157+ * f <<= len ;
158+ * fc += len ;
159+ if (* fc == LZ77_FLAG_MAX ) {
160+ lz77_write32 (* fp , * f );
161+ * fc = 0 ;
162+ * fp = dst ;
163+ dst += 4 ;
164+ }
165+ } while (start < end );
166+
167+ return dst ;
168+ }
169+
170+ static __always_inline u32 lz77_hash (const u32 v )
171+ {
172+ return ((v ^ 0x9E3779B9 ) * 0x85EBCA6B ) >> (32 - LZ77_HASH_LOG );
173+ }
174+
175+ noinline int lz77_compress (const void * src , const u32 slen , void * dst , u32 * dlen )
135176{
136- const void * srcp , * end ;
177+ const void * srcp , * rlim , * end , * anchor ;
178+ u32 * htable , hash , flag_count = 0 ;
137179 void * dstp , * nib , * flag_pos ;
138- u32 flag_count = 0 ;
139180 long flag = 0 ;
140- u64 * htable ;
141181
142182 /* This is probably a bug, so throw a warning. */
143183 if (WARN_ON_ONCE (* dlen < lz77_compressed_alloc_size (slen )))
144184 return - EINVAL ;
145185
146- srcp = src ;
147- end = src + slen ;
186+ srcp = anchor = src ;
187+ end = srcp + slen ; /* absolute end */
188+ rlim = end - LZ77_MSTEP_SIZE ; /* read limit (for lz77_match_len()) */
148189 dstp = dst ;
149- nib = NULL ;
150190 flag_pos = dstp ;
151191 dstp += 4 ;
192+ nib = NULL ;
152193
153194 htable = kvcalloc (LZ77_HASH_SIZE , sizeof (* htable ), GFP_KERNEL );
154195 if (!htable )
155196 return - ENOMEM ;
156197
157- /* Main loop. */
158- do {
159- u32 dist , len = 0 ;
160- const void * wnd ;
161- u64 hash ;
162-
163- hash = ((lz77_read64 (srcp ) << 24 ) * 889523592379ULL ) >> (64 - LZ77_HASH_LOG );
164- wnd = src + htable [hash ];
165- htable [hash ] = srcp - src ;
166- dist = srcp - wnd ;
167-
168- if (dist && dist < LZ77_MATCH_MAX_DIST )
169- len = lz77_match_len (wnd , srcp , end );
198+ LZ77_PREFETCH (srcp + LZ77_RSTEP_SIZE );
170199
171- if (len < LZ77_MATCH_MIN_LEN ) {
172- lz77_write8 (dstp , lz77_read8 (srcp ));
173-
174- dstp ++ ;
175- srcp ++ ;
176-
177- flag <<= 1 ;
178- flag_count ++ ;
179- if (flag_count == 32 ) {
180- lz77_write32 (flag_pos , flag );
181- flag_count = 0 ;
182- flag_pos = dstp ;
183- dstp += 4 ;
184- }
185-
186- continue ;
187- }
200+ hash = lz77_hash (lz77_read32 (srcp ++ ));
201+ htable [hash ] = 0 ;
202+ hash = lz77_hash (lz77_read32 (srcp ));
188203
189- dstp = lz77_write_match (dstp , & nib , dist , len );
204+ /*
205+ * Main loop.
206+ *
207+ * @dlen is >= lz77_compressed_alloc_size(), so run without bound-checking @dstp.
208+ *
209+ * This code was crafted in a way to best utilise fetch-decode-execute CPU flow.
210+ * Any attempt to optimize it, or even organize it, can lead to huge performance loss.
211+ */
212+ do {
213+ const void * match , * next = srcp ;
214+ u32 len , step = 1 , skip = 1U << LZ77_SKIP_TRIGGER ;
215+
216+ /* Match finding (hot path -- don't change the read/check/write order). */
217+ do {
218+ const u32 cur_hash = hash ;
219+
220+ srcp = next ;
221+ next += step ;
222+ step = (skip ++ >> LZ77_SKIP_TRIGGER );
223+ if (unlikely (next > rlim ))
224+ goto out ;
225+
226+ hash = lz77_hash (lz77_read32 (next ));
227+ match = src + htable [cur_hash ];
228+ htable [cur_hash ] = srcp - src ;
229+ } while (likely (match + LZ77_MATCH_MAX_DIST < srcp ) ||
230+ lz77_read32 (match ) != lz77_read32 (srcp ));
231+
232+ dstp = lz77_encode_literals (anchor , srcp , dstp , & flag , & flag_count , & flag_pos );
233+ len = lz77_match_len (match , srcp , end );
234+ dstp = lz77_encode_match (dstp , & nib , srcp - match , len );
190235 srcp += len ;
236+ anchor = srcp ;
237+
238+ LZ77_PREFETCH (srcp );
191239
192240 flag = (flag << 1 ) | 1 ;
193241 flag_count ++ ;
194- if (flag_count == 32 ) {
242+ if (flag_count == LZ77_FLAG_MAX ) {
195243 lz77_write32 (flag_pos , flag );
196244 flag_count = 0 ;
197245 flag_pos = dstp ;
198246 dstp += 4 ;
199247 }
200- } while (likely (srcp + LZ77_STEP_SIZE <= end ));
201-
202- while (srcp < end ) {
203- u32 c = umin (end - srcp , 32 - flag_count );
204248
205- memcpy (dstp , srcp , c );
249+ if (unlikely (srcp > rlim ))
250+ break ;
206251
207- dstp += c ;
208- srcp += c ;
209-
210- flag <<= c ;
211- flag_count += c ;
212- if (flag_count == 32 ) {
213- lz77_write32 (flag_pos , flag );
214- flag_count = 0 ;
215- flag_pos = dstp ;
216- dstp += 4 ;
217- }
218- }
252+ /* Prepare for next loop. */
253+ hash = lz77_hash (lz77_read32 (srcp ));
254+ } while (srcp < end );
255+ out :
256+ dstp = lz77_encode_literals (anchor , end , dstp , & flag , & flag_count , & flag_pos );
219257
220- flag <<= (32 - flag_count );
221- flag |= (1UL << (32 - flag_count )) - 1 ;
258+ flag_count = LZ77_FLAG_MAX - flag_count ;
259+ flag <<= flag_count ;
260+ flag |= (1UL << flag_count ) - 1 ;
222261 lz77_write32 (flag_pos , flag );
223262
224263 * dlen = dstp - dst ;
0 commit comments