Skip to content

Commit 4460e9c

Browse files
ematsumiya authored and smfrench committed
smb: client: compress: LZ77 optimizations
This patch implements several micro-optimizations on lz77_compress() with the goal of reducing the number of instructions per [input] byte (a.k.a. IPB). Changes: - change hashtable to be u32 (instead of u64) -- change the hash function to reflect that (adds lz77_hash() and lz77_read32() helpers) - batch-write literals instead of 1 by 1 -- now that we have a well defined hot path (match finding) and a cold path (encode literals + match), batch writing makes a significant difference - implement adaptive skipping of input bytes -- skip input bytes more aggressively if too few matches are being found - name some constants for more meaningful context Signed-off-by: Enzo Matsumiya <[email protected]> Signed-off-by: Steve French <[email protected]>
1 parent fca46b0 commit 4460e9c

2 files changed

Lines changed: 108 additions & 69 deletions

File tree

fs/smb/client/compress/lz77.c

Lines changed: 106 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
// SPDX-License-Identifier: GPL-2.0-only
22
/*
3-
* Copyright (C) 2024, SUSE LLC
3+
* Copyright (C) 2024-2026, SUSE LLC
44
*
55
* Authors: Enzo Matsumiya <[email protected]>
66
*
@@ -16,17 +16,26 @@
1616
/*
1717
* Compression parameters.
1818
*/
19-
#define LZ77_MATCH_MIN_LEN 4
2019
#define LZ77_MATCH_MAX_DIST SZ_8K
2120
#define LZ77_HASH_LOG 15
2221
#define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG)
23-
#define LZ77_STEP_SIZE sizeof(u64)
22+
#define LZ77_RSTEP_SIZE sizeof(u32)
23+
#define LZ77_MSTEP_SIZE sizeof(u64)
24+
#define LZ77_SKIP_TRIGGER 4
25+
26+
#define LZ77_PREFETCH(ptr) __builtin_prefetch((ptr), 0, 3)
27+
#define LZ77_FLAG_MAX 32
2428

2529
static __always_inline u8 lz77_read8(const u8 *ptr)
2630
{
2731
return get_unaligned(ptr);
2832
}
2933

34+
/* Read a 32-bit value from @ptr, which may be unaligned. */
static __always_inline u32 lz77_read32(const u32 *ptr)
{
	return get_unaligned(ptr);
}
38+
3039
static __always_inline u64 lz77_read64(const u64 *ptr)
3140
{
3241
return get_unaligned(ptr);
@@ -50,14 +59,14 @@ static __always_inline void lz77_write32(u32 *ptr, u32 v)
5059
static __always_inline u32 lz77_match_len(const void *match, const void *cur, const void *end)
5160
{
5261
const void *start = cur;
53-
u64 diff;
5462

5563
/* Safe for a do/while because otherwise we wouldn't reach here from the main loop. */
5664
do {
57-
diff = lz77_read64(cur) ^ lz77_read64(match);
65+
const u64 diff = lz77_read64(cur) ^ lz77_read64(match);
66+
5867
if (!diff) {
59-
cur += LZ77_STEP_SIZE;
60-
match += LZ77_STEP_SIZE;
68+
cur += LZ77_MSTEP_SIZE;
69+
match += LZ77_MSTEP_SIZE;
6170

6271
continue;
6372
}
@@ -66,7 +75,7 @@ static __always_inline u32 lz77_match_len(const void *match, const void *cur, co
6675
cur += count_trailing_zeros(diff) >> 3;
6776

6877
return (cur - start);
69-
} while (likely(cur + LZ77_STEP_SIZE <= end));
78+
} while (likely(cur + LZ77_MSTEP_SIZE <= end));
7079

7180
/* Fallback to byte-by-byte comparison for last <8 bytes. */
7281
while (cur < end && lz77_read8(cur) == lz77_read8(match)) {
@@ -77,7 +86,7 @@ static __always_inline u32 lz77_match_len(const void *match, const void *cur, co
7786
return (cur - start);
7887
}
7988

80-
static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u32 len)
89+
static __always_inline void *lz77_encode_match(void *dst, void **nib, u16 dist, u32 len)
8190
{
8291
len -= 3;
8392
dist--;
@@ -131,94 +140,124 @@ static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u
131140
return dst + 4;
132141
}
133142

134-
noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen)
143+
/*
 * Copy the pending literal run [@start, @end) into the output at @dst,
 * updating the flag state as it goes.
 *
 * @start: first unencoded literal byte (the "anchor")
 * @end:   one past the last literal byte to emit
 * @dst:   current output position
 * @f:     flag accumulator; one 0 bit is shifted in per literal byte
 *         (match bits are set to 1 by the caller)
 * @fc:    number of bits currently held in @f
 * @fp:    position in the output where the current 32-bit flag word
 *         will be written once @fc reaches LZ77_FLAG_MAX
 *
 * Literals are written with memcpy() in chunks bounded by the number of
 * flag bits still free (LZ77_FLAG_MAX - *fc), instead of byte-by-byte.
 * When the flag word fills up it is flushed to *fp and 4 bytes are
 * reserved at @dst for the next one.
 *
 * Returns the updated output position.
 */
static __always_inline void *lz77_encode_literals(const void *start, const void *end, void *dst,
						  long *f, u32 *fc, void **fp)
{
	/* Nothing pending between the anchor and the current position. */
	if (start >= end)
		return dst;

	do {
		/* Emit at most as many literals as there are free flag bits. */
		const u32 len = umin(end - start, LZ77_FLAG_MAX - *fc);

		memcpy(dst, start, len);

		dst += len;
		start += len;

		/* Shift in @len zero bits -- literals are flagged as 0. */
		*f <<= len;
		*fc += len;
		if (*fc == LZ77_FLAG_MAX) {
			/* Flag word full: flush it and reserve the next one. */
			lz77_write32(*fp, *f);
			*fc = 0;
			*fp = dst;
			dst += 4;
		}
	} while (start < end);

	return dst;
}
169+
170+
/*
 * Hash a 32-bit input word down to LZ77_HASH_LOG bits for the match table.
 *
 * 0x9E3779B9 is the 32-bit golden-ratio constant; 0x85EBCA6B is a mixing
 * constant (NOTE: also seen in MurmurHash3's finalizer). The right shift
 * keeps the high LZ77_HASH_LOG bits, where multiplicative mixing is best.
 */
static __always_inline u32 lz77_hash(const u32 v)
{
	return ((v ^ 0x9E3779B9) * 0x85EBCA6B) >> (32 - LZ77_HASH_LOG);
}
174+
175+
noinline int lz77_compress(const void *src, const u32 slen, void *dst, u32 *dlen)
135176
{
136-
const void *srcp, *end;
177+
const void *srcp, *rlim, *end, *anchor;
178+
u32 *htable, hash, flag_count = 0;
137179
void *dstp, *nib, *flag_pos;
138-
u32 flag_count = 0;
139180
long flag = 0;
140-
u64 *htable;
141181

142182
/* This is probably a bug, so throw a warning. */
143183
if (WARN_ON_ONCE(*dlen < lz77_compressed_alloc_size(slen)))
144184
return -EINVAL;
145185

146-
srcp = src;
147-
end = src + slen;
186+
srcp = anchor = src;
187+
end = srcp + slen; /* absolute end */
188+
rlim = end - LZ77_MSTEP_SIZE; /* read limit (for lz77_match_len()) */
148189
dstp = dst;
149-
nib = NULL;
150190
flag_pos = dstp;
151191
dstp += 4;
192+
nib = NULL;
152193

153194
htable = kvcalloc(LZ77_HASH_SIZE, sizeof(*htable), GFP_KERNEL);
154195
if (!htable)
155196
return -ENOMEM;
156197

157-
/* Main loop. */
158-
do {
159-
u32 dist, len = 0;
160-
const void *wnd;
161-
u64 hash;
162-
163-
hash = ((lz77_read64(srcp) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG);
164-
wnd = src + htable[hash];
165-
htable[hash] = srcp - src;
166-
dist = srcp - wnd;
167-
168-
if (dist && dist < LZ77_MATCH_MAX_DIST)
169-
len = lz77_match_len(wnd, srcp, end);
198+
LZ77_PREFETCH(srcp + LZ77_RSTEP_SIZE);
170199

171-
if (len < LZ77_MATCH_MIN_LEN) {
172-
lz77_write8(dstp, lz77_read8(srcp));
173-
174-
dstp++;
175-
srcp++;
176-
177-
flag <<= 1;
178-
flag_count++;
179-
if (flag_count == 32) {
180-
lz77_write32(flag_pos, flag);
181-
flag_count = 0;
182-
flag_pos = dstp;
183-
dstp += 4;
184-
}
185-
186-
continue;
187-
}
200+
hash = lz77_hash(lz77_read32(srcp++));
201+
htable[hash] = 0;
202+
hash = lz77_hash(lz77_read32(srcp));
188203

189-
dstp = lz77_write_match(dstp, &nib, dist, len);
204+
/*
205+
* Main loop.
206+
*
207+
* @dlen is >= lz77_compressed_alloc_size(), so run without bound-checking @dstp.
208+
*
209+
* This code was crafted in a way to best utilise fetch-decode-execute CPU flow.
210+
* Any attempt to optimize it, or even organize it, can lead to huge performance loss.
211+
*/
212+
do {
213+
const void *match, *next = srcp;
214+
u32 len, step = 1, skip = 1U << LZ77_SKIP_TRIGGER;
215+
216+
/* Match finding (hot path -- don't change the read/check/write order). */
217+
do {
218+
const u32 cur_hash = hash;
219+
220+
srcp = next;
221+
next += step;
222+
step = (skip++ >> LZ77_SKIP_TRIGGER);
223+
if (unlikely(next > rlim))
224+
goto out;
225+
226+
hash = lz77_hash(lz77_read32(next));
227+
match = src + htable[cur_hash];
228+
htable[cur_hash] = srcp - src;
229+
} while (likely(match + LZ77_MATCH_MAX_DIST < srcp) ||
230+
lz77_read32(match) != lz77_read32(srcp));
231+
232+
dstp = lz77_encode_literals(anchor, srcp, dstp, &flag, &flag_count, &flag_pos);
233+
len = lz77_match_len(match, srcp, end);
234+
dstp = lz77_encode_match(dstp, &nib, srcp - match, len);
190235
srcp += len;
236+
anchor = srcp;
237+
238+
LZ77_PREFETCH(srcp);
191239

192240
flag = (flag << 1) | 1;
193241
flag_count++;
194-
if (flag_count == 32) {
242+
if (flag_count == LZ77_FLAG_MAX) {
195243
lz77_write32(flag_pos, flag);
196244
flag_count = 0;
197245
flag_pos = dstp;
198246
dstp += 4;
199247
}
200-
} while (likely(srcp + LZ77_STEP_SIZE <= end));
201-
202-
while (srcp < end) {
203-
u32 c = umin(end - srcp, 32 - flag_count);
204248

205-
memcpy(dstp, srcp, c);
249+
if (unlikely(srcp > rlim))
250+
break;
206251

207-
dstp += c;
208-
srcp += c;
209-
210-
flag <<= c;
211-
flag_count += c;
212-
if (flag_count == 32) {
213-
lz77_write32(flag_pos, flag);
214-
flag_count = 0;
215-
flag_pos = dstp;
216-
dstp += 4;
217-
}
218-
}
252+
/* Prepare for next loop. */
253+
hash = lz77_hash(lz77_read32(srcp));
254+
} while (srcp < end);
255+
out:
256+
dstp = lz77_encode_literals(anchor, end, dstp, &flag, &flag_count, &flag_pos);
219257

220-
flag <<= (32 - flag_count);
221-
flag |= (1UL << (32 - flag_count)) - 1;
258+
flag_count = LZ77_FLAG_MAX - flag_count;
259+
flag <<= flag_count;
260+
flag |= (1UL << flag_count) - 1;
222261
lz77_write32(flag_pos, flag);
223262

224263
*dlen = dstp - dst;

fs/smb/client/compress/lz77.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* SPDX-License-Identifier: GPL-2.0-only */
22
/*
3-
* Copyright (C) 2024, SUSE LLC
3+
* Copyright (C) 2024-2026, SUSE LLC
44
*
55
* Authors: Enzo Matsumiya <[email protected]>
66
*
@@ -39,5 +39,5 @@ static __always_inline u32 lz77_compressed_alloc_size(const u32 size)
3939
return size + (size >> 3) + 8;
4040
}
4141

42-
int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen);
42+
int lz77_compress(const void *src, const u32 slen, void *dst, u32 *dlen);
4343
#endif /* _SMB_COMPRESS_LZ77_H */

0 commit comments

Comments (0)