From e101840fbc1a9fe2e28a375cabe1a18152c569fe Mon Sep 17 00:00:00 2001
From: Xueyuan Chen
Date: Tue, 21 Apr 2026 20:16:13 +0800
Subject: [PATCH 1/4] mm/zsmalloc: drop class lock before freeing zspage

Currently in zs_free(), the class->lock is held until the zspage is
completely freed and the counters are updated. However, freeing pages
back to the buddy allocator requires acquiring the zone lock. Under
heavy memory pressure, zone lock contention can be severe. When this
happens, the CPU holding the class->lock stalls waiting for the zone
lock, blocking all other CPUs that are trying to acquire the same
class->lock.

Shrink the class->lock critical section to reduce this contention:
move the actual page freeing out from under class->lock, improving
the concurrency of zs_free().

Testing on the RADXA O6 platform shows that with 12 CPUs concurrently
performing zs_free() operations, the execution time is reduced by 20%.

Signed-off-by: Xueyuan Chen
Signed-off-by: Wenchao Hao
---
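The resulting division of work in zs_free() is roughly the following
(a simplified sketch of the flow after this patch; the existing
kick_deferred_free() fallback taken when trylock_zspage() fails is
left out):

        spin_lock(&class->lock);
        obj_free(class->size, obj);
        fullness = fix_fullness_group(class, zspage);
        if (fullness == ZS_INUSE_RATIO_0 && trylock_zspage(zspage)) {
                /* under class->lock: only detach the zspage and update per-class stats */
                remove_zspage(class, zspage);
                class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
                zspage_to_free = zspage;
        }
        spin_unlock(&class->lock);

        /* the expensive part - handing pages back to the buddy allocator -
         * now runs outside class->lock, so other CPUs can enter zs_free() */
        if (zspage_to_free) {
                __free_zspage_lockless(pool, zspage_to_free);
                atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
        }
        cache_free_handle(handle);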
 mm/zsmalloc.c | 28 ++++++++++++++++++++++------
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 63128ddb7959..40687c8a7469 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -801,13 +801,10 @@ static int trylock_zspage(struct zspage *zspage)
 	return 0;
 }
 
-static void __free_zspage(struct zs_pool *pool, struct size_class *class,
-			  struct zspage *zspage)
+static inline void __free_zspage_lockless(struct zs_pool *pool, struct zspage *zspage)
 {
 	struct zpdesc *zpdesc, *next;
 
-	assert_spin_locked(&class->lock);
-
 	VM_BUG_ON(get_zspage_inuse(zspage));
 	VM_BUG_ON(zspage->fullness != ZS_INUSE_RATIO_0);
 
@@ -823,7 +820,13 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
 	} while (zpdesc != NULL);
 
 	cache_free_zspage(zspage);
+}
 
+static void __free_zspage(struct zs_pool *pool, struct size_class *class,
+			  struct zspage *zspage)
+{
+	assert_spin_locked(&class->lock);
+	__free_zspage_lockless(pool, zspage);
 	class_stat_sub(class, ZS_OBJS_ALLOCATED, class->objs_per_zspage);
 	atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
 }
@@ -1388,6 +1391,7 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 	unsigned long obj;
 	struct size_class *class;
 	int fullness;
+	struct zspage *zspage_to_free = NULL;
 
 	if (IS_ERR_OR_NULL((void *)handle))
 		return;
@@ -1408,10 +1412,22 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 	obj_free(class->size, obj);
 	fullness = fix_fullness_group(class, zspage);
-	if (fullness == ZS_INUSE_RATIO_0)
-		free_zspage(pool, class, zspage);
+	if (fullness == ZS_INUSE_RATIO_0) {
+		if (trylock_zspage(zspage)) {
+			remove_zspage(class, zspage);
+			class_stat_sub(class, ZS_OBJS_ALLOCATED,
+				       class->objs_per_zspage);
+			zspage_to_free = zspage;
+		} else
+			kick_deferred_free(pool);
+	}
 
 	spin_unlock(&class->lock);
+
+	if (likely(zspage_to_free)) {
+		__free_zspage_lockless(pool, zspage_to_free);
+		atomic_long_sub(class->pages_per_zspage, &pool->pages_allocated);
+	}
 	cache_free_handle(handle);
 }
 EXPORT_SYMBOL_GPL(zs_free);

From 029543c1ff75c02f78b4e5b7a0575b3cd8d71f16 Mon Sep 17 00:00:00 2001
From: Wenchao Hao
Date: Tue, 21 Apr 2026 20:16:14 +0800
Subject: [PATCH 2/4] mm/zsmalloc: introduce zs_free_deferred() for async
 handle freeing

zs_free() is expensive due to internal locking (pool->lock,
class->lock) and potential zspage freeing. On the process exit path,
the slow zs_free() blocks memory reclamation, delaying overall memory
release. This has been reported to significantly impact Android
low-memory killing, where slot_free() accounts for over 80% of the
total swap entry freeing cost.

Introduce zs_free_deferred(), which queues handles into a fixed-size
per-pool array for later processing by a workqueue. This allows
callers to defer the expensive zs_free() and return quickly, so the
process exit path can release memory faster.

The array capacity is derived from a 128MB uncompressed data budget
(128MB >> PAGE_SHIFT entries), which scales naturally with PAGE_SIZE.
When the array reaches half capacity, the workqueue is scheduled to
drain pending handles.

zs_free_deferred() uses spin_trylock() to access the deferred queue.
If the lock is contended (e.g. a drain is in progress) or the queue
is full, it falls back to synchronous zs_free() to guarantee
correctness.

Also introduce zs_free_deferred_flush() for use during pool teardown
to ensure all pending handles are freed.

Signed-off-by: Wenchao Hao
---
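For reference, the sizing works out as follows: with the common 4 KiB
PAGE_SIZE, ZS_DEFERRED_FREE_CAPACITY = (128 << 20) >> 12 = 32768
pending handles and ZS_DEFERRED_FREE_THRESHOLD = 16384, so on a 64-bit
kernel the per-pool handle array costs 32768 * sizeof(unsigned long) =
256 KiB (allocated with kvmalloc_array() at pool creation). With 64 KiB
pages the capacity drops to 2048 entries, a 16 KiB array.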
 include/linux/zsmalloc.h |   2 +
 mm/zsmalloc.c            | 111 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 113 insertions(+)

diff --git a/include/linux/zsmalloc.h b/include/linux/zsmalloc.h
index 478410c880b1..1e5ac1a39d41 100644
--- a/include/linux/zsmalloc.h
+++ b/include/linux/zsmalloc.h
@@ -30,6 +30,8 @@ void zs_destroy_pool(struct zs_pool *pool);
 unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t flags,
 			const int nid);
 void zs_free(struct zs_pool *pool, unsigned long obj);
+void zs_free_deferred(struct zs_pool *pool, unsigned long handle);
+void zs_free_deferred_flush(struct zs_pool *pool);
 
 size_t zs_huge_class_size(struct zs_pool *pool);
 
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 40687c8a7469..defc892555e4 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -53,6 +53,10 @@
 
 #define ZS_HANDLE_SIZE (sizeof(unsigned long))
 
+#define ZS_DEFERRED_FREE_MAX_BYTES	(128 << 20)
+#define ZS_DEFERRED_FREE_CAPACITY	(ZS_DEFERRED_FREE_MAX_BYTES >> PAGE_SHIFT)
+#define ZS_DEFERRED_FREE_THRESHOLD	(ZS_DEFERRED_FREE_CAPACITY / 2)
+
 /*
  * Object location (<PFN>, <obj_idx>) is encoded as
  * a single (unsigned long) handle value.
@@ -217,6 +221,13 @@ struct zs_pool {
 	/* protect zspage migration/compaction */
 	rwlock_t lock;
 	atomic_t compaction_in_progress;
+
+	/* deferred free support */
+	spinlock_t deferred_lock;
+	unsigned long *deferred_handles;
+	unsigned int deferred_count;
+	unsigned int deferred_capacity;
+	struct work_struct deferred_free_work;
 };
 
 static inline void zpdesc_set_first(struct zpdesc *zpdesc)
@@ -579,6 +590,19 @@ static int zs_stats_size_show(struct seq_file *s, void *v)
 }
 DEFINE_SHOW_ATTRIBUTE(zs_stats_size);
 
+static int zs_stats_deferred_show(struct seq_file *s, void *v)
+{
+	struct zs_pool *pool = s->private;
+
+	spin_lock(&pool->deferred_lock);
+	seq_printf(s, "pending: %u\n", pool->deferred_count);
+	seq_printf(s, "capacity: %u\n", pool->deferred_capacity);
+	spin_unlock(&pool->deferred_lock);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(zs_stats_deferred);
+
 static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
 {
 	if (!zs_stat_root) {
@@ -590,6 +614,9 @@ static void zs_pool_stat_create(struct zs_pool *pool, const char *name)
 
 	debugfs_create_file("classes", S_IFREG | 0444, pool->stat_dentry, pool,
 			    &zs_stats_size_fops);
+	debugfs_create_file("deferred_free", S_IFREG | 0444,
+			    pool->stat_dentry, pool,
+			    &zs_stats_deferred_fops);
 }
 
 static void zs_pool_stat_destroy(struct zs_pool *pool)
@@ -1432,6 +1459,76 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
 }
 EXPORT_SYMBOL_GPL(zs_free);
 
+static void zs_deferred_free_work(struct work_struct *work)
+{
+	struct zs_pool *pool = container_of(work, struct zs_pool,
+					    deferred_free_work);
+	unsigned long handle;
+
+	while (1) {
+		spin_lock(&pool->deferred_lock);
+		if (pool->deferred_count == 0) {
+			spin_unlock(&pool->deferred_lock);
+			break;
+		}
+		handle = pool->deferred_handles[--pool->deferred_count];
+		spin_unlock(&pool->deferred_lock);
+
+		zs_free(pool, handle);
+		cond_resched();
+	}
+}
+
+/**
+ * zs_free_deferred - queue a handle for asynchronous freeing
+ * @pool: pool to free from
+ * @handle: handle to free
+ *
+ * Place @handle into a deferred free queue for later processing by a
+ * workqueue. This is intended for callers that are in atomic context
+ * (e.g. under a spinlock) and cannot afford the cost of zs_free()
+ * directly. When the queue reaches a threshold the work is scheduled.
+ * Falls back to synchronous zs_free() if the lock is contended (drain
+ * in progress) or if the queue is full.
+ */
+void zs_free_deferred(struct zs_pool *pool, unsigned long handle)
+{
+	if (IS_ERR_OR_NULL((void *)handle))
+		return;
+
+	if (!spin_trylock(&pool->deferred_lock))
+		goto sync_free;
+
+	if (pool->deferred_count >= pool->deferred_capacity) {
+		spin_unlock(&pool->deferred_lock);
+		goto sync_free;
+	}
+
+	pool->deferred_handles[pool->deferred_count++] = handle;
+	if (pool->deferred_count >= ZS_DEFERRED_FREE_THRESHOLD)
+		queue_work(system_wq, &pool->deferred_free_work);
+	spin_unlock(&pool->deferred_lock);
+	return;
+
+sync_free:
+	zs_free(pool, handle);
+}
+EXPORT_SYMBOL_GPL(zs_free_deferred);
+
+/**
+ * zs_free_deferred_flush - flush all pending deferred frees
+ * @pool: pool to flush
+ *
+ * Wait for any scheduled work to complete, then drain any remaining
+ * handles. Must be called from process context.
+ */
+void zs_free_deferred_flush(struct zs_pool *pool)
+{
+	flush_work(&pool->deferred_free_work);
+	zs_deferred_free_work(&pool->deferred_free_work);
+}
+EXPORT_SYMBOL_GPL(zs_free_deferred_flush);
+
 static void zs_object_copy(struct size_class *class, unsigned long dst,
 			   unsigned long src)
 {
@@ -2099,6 +2196,18 @@ struct zs_pool *zs_create_pool(const char *name)
 	rwlock_init(&pool->lock);
 	atomic_set(&pool->compaction_in_progress, 0);
 
+	spin_lock_init(&pool->deferred_lock);
+	pool->deferred_capacity = ZS_DEFERRED_FREE_CAPACITY;
+	pool->deferred_handles = kvmalloc_array(pool->deferred_capacity,
+						sizeof(unsigned long),
+						GFP_KERNEL);
+	if (!pool->deferred_handles) {
+		kfree(pool);
+		return NULL;
+	}
+	pool->deferred_count = 0;
+	INIT_WORK(&pool->deferred_free_work, zs_deferred_free_work);
+
 	pool->name = kstrdup(name, GFP_KERNEL);
 	if (!pool->name)
 		goto err;
@@ -2201,6 +2310,7 @@ void zs_destroy_pool(struct zs_pool *pool)
 	int i;
 
 	zs_unregister_shrinker(pool);
+	zs_free_deferred_flush(pool);
 	zs_flush_migration(pool);
 	zs_pool_stat_destroy(pool);
 
@@ -2224,6 +2334,7 @@ void zs_destroy_pool(struct zs_pool *pool)
 		kfree(class);
 	}
 
+	kvfree(pool->deferred_handles);
 	kfree(pool->name);
 	kfree(pool);
 }
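The intended calling pattern for the two new exports is roughly the
following minimal sketch (illustrative only; patch 3 adds the real
in-tree user in zram, and zs_destroy_pool() already performs the flush
itself):

        /* hot path, possibly atomic (e.g. under a spinlock): queue the
         * handle instead of paying for zs_free() here; zs_free_deferred()
         * silently falls back to a synchronous zs_free() when its queue
         * is contended or full */
        zs_free_deferred(pool, handle);

        /* process context only: wait for the worker and drain whatever
         * is still pending; zs_destroy_pool() calls this internally */
        zs_free_deferred_flush(pool);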
From 4fec80f0a3641c19e09887d0ead1e913a3fe92b1 Mon Sep 17 00:00:00 2001
From: "Barry Song (Xiaomi)"
Date: Tue, 21 Apr 2026 20:16:15 +0800
Subject: [PATCH 3/4] zram: defer zs_free() in swap slot free notification
 path

zram_slot_free_notify() is called on the process exit path when
unmapping swap entries. The slot_free() it calls internally invokes
zs_free(), which accounts for ~87% of the slot_free() cost due to
zsmalloc internal locking (pool->lock, class->lock) and potential
zspage freeing. This blocks the process exit path, delaying overall
memory release during Android low-memory killing.

Split slot_free() into slot_free_extract() and the actual zs_free()
call. slot_free_extract() handles all slot metadata cleanup (clearing
flags, updating stats, zeroing handle/size) and returns the zsmalloc
handle that needs freeing.

This separation has two benefits:

1. It makes the two responsibilities of slot_free() explicit: slot
   metadata management (which must be done under the slot lock) vs
   zsmalloc memory release (which can be deferred).

2. It allows zram_slot_free_notify() to use zs_free_deferred() for the
   handle, deferring the expensive zs_free() to a workqueue so the
   exit path can release memory faster.

While at it, merge the three separate clear_slot_flag() calls for
ZRAM_IDLE, ZRAM_INCOMPRESSIBLE, and ZRAM_PP_SLOT into a single bitmask
operation via clear_slot_flags_on_free(), reducing redundant
read-modify-write cycles on the same flags word.

All other slot_free() callers (write, discard, meta_free) continue to
use synchronous zs_free() through the unchanged slot_free() wrapper.

Signed-off-by: Barry Song (Xiaomi)
Signed-off-by: Wenchao Hao
---
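After this change the tail of zram_slot_free_notify() reads roughly as
follows (a simplified sketch; in the diff the first two calls are
combined into a single statement):

        /* slot lock already held by zram_slot_free_notify() */
        handle = slot_free_extract(zram, index);   /* metadata cleanup under the slot lock */
        zs_free_deferred(zram->mem_pool, handle);  /* queue handle; may fall back to zs_free() */
        slot_unlock(zram, index);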
 drivers/block/zram/zram_drv.c | 37 ++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index aebc710f0d6a..b7732c6665aa 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -165,6 +165,15 @@ static inline bool slot_allocated(struct zram *zram, u32 index)
 		test_slot_flag(zram, index, ZRAM_WB);
 }
 
+#define ZRAM_FLAGS_TO_CLEAR_ON_FREE	(BIT(ZRAM_IDLE) | \
+					 BIT(ZRAM_INCOMPRESSIBLE) | \
+					 BIT(ZRAM_PP_SLOT))
+
+static inline void clear_slot_flags_on_free(struct zram *zram, u32 index)
+{
+	zram->table[index].attr.flags &= ~ZRAM_FLAGS_TO_CLEAR_ON_FREE;
+}
+
 static inline void set_slot_comp_priority(struct zram *zram, u32 index,
 					  u32 prio)
 {
@@ -2000,17 +2009,20 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize)
 	return true;
 }
 
-static void slot_free(struct zram *zram, u32 index)
+/*
+ * Clear slot metadata and extract the zsmalloc handle for freeing.
+ * Returns the handle that needs to be freed via zs_free(), or 0 if
+ * no zsmalloc freeing is needed (e.g. same-filled or writeback slots).
+ */
+static unsigned long slot_free_extract(struct zram *zram, u32 index)
 {
-	unsigned long handle;
+	unsigned long handle = 0;
 
 #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME
 	zram->table[index].attr.ac_time = 0;
 #endif
 
-	clear_slot_flag(zram, index, ZRAM_IDLE);
-	clear_slot_flag(zram, index, ZRAM_INCOMPRESSIBLE);
-	clear_slot_flag(zram, index, ZRAM_PP_SLOT);
+	clear_slot_flags_on_free(zram, index);
 	set_slot_comp_priority(zram, index, 0);
 
 	if (test_slot_flag(zram, index, ZRAM_HUGE)) {
@@ -2041,9 +2053,7 @@ static void slot_free(struct zram *zram, u32 index)
 
 	handle = get_slot_handle(zram, index);
 	if (!handle)
-		return;
-
-	zs_free(zram->mem_pool, handle);
+		return 0;
 
 	atomic64_sub(get_slot_size(zram, index),
 		     &zram->stats.compr_data_size);
@@ -2051,6 +2061,15 @@ static void slot_free(struct zram *zram, u32 index)
 	atomic64_dec(&zram->stats.pages_stored);
 	set_slot_handle(zram, index, 0);
 	set_slot_size(zram, index, 0);
+
+	return handle;
+}
+
+static void slot_free(struct zram *zram, u32 index)
+{
+	unsigned long handle = slot_free_extract(zram, index);
+
+	zs_free(zram->mem_pool, handle);
 }
 
 static int read_same_filled_page(struct zram *zram, struct page *page,
@@ -2797,7 +2816,7 @@ static void zram_slot_free_notify(struct block_device *bdev,
 		return;
 	}
 
-	slot_free(zram, index);
+	zs_free_deferred(zram->mem_pool, slot_free_extract(zram, index));
 
 	slot_unlock(zram, index);
 }

From 695765178a40cdfadd5303ce724c6ab293a65b0c Mon Sep 17 00:00:00 2001
From: Wenchao Hao
Date: Tue, 21 Apr 2026 20:16:16 +0800
Subject: [PATCH 4/4] mm/zswap: defer zs_free() in zswap_invalidate() path

zswap_invalidate() is called on the same process exit path as
zram_slot_free_notify(). The zswap_entry_free() it calls internally
performs zs_free(), which is expensive due to zsmalloc internal
locking. Unlike zram, which has a trylock fallback, zswap_invalidate()
executes unconditionally, making the latency impact potentially worse.
Like zram, the expensive zs_free() here blocks the process exit path,
delaying overall memory release. Additionally, zswap_entry_free()
performs extra work beyond zs_free(): list_lru_del() (which takes its
own spinlock), obj_cgroup accounting, and the kmem_cache_free() of the
entry itself.

Use zs_free_deferred() in the zswap_invalidate() path to defer the
expensive zsmalloc handle freeing to a workqueue, allowing the exit
path to release memory faster. All other callers (zswap_load,
zswap_writeback_entry, zswap_store error paths) run in process context
and continue to use synchronous zs_free().

Signed-off-by: Wenchao Hao
---
 mm/zswap.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/mm/zswap.c b/mm/zswap.c
index 4b5149173b0e..d858662a93af 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -761,11 +761,16 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
 /*
  * Carries out the common pattern of freeing an entry's zsmalloc allocation,
  * freeing the entry itself, and decrementing the number of stored pages.
+ * When @deferred is true, the zsmalloc handle is queued for async freeing
+ * instead of being freed immediately.
  */
-static void zswap_entry_free(struct zswap_entry *entry)
+static void __zswap_entry_free(struct zswap_entry *entry, bool deferred)
 {
 	zswap_lru_del(&zswap_list_lru, entry);
-	zs_free(entry->pool->zs_pool, entry->handle);
+	if (deferred)
+		zs_free_deferred(entry->pool->zs_pool, entry->handle);
+	else
+		zs_free(entry->pool->zs_pool, entry->handle);
 	zswap_pool_put(entry->pool);
 	if (entry->objcg) {
 		obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
@@ -777,6 +782,11 @@ static void zswap_entry_free(struct zswap_entry *entry)
 	atomic_long_dec(&zswap_stored_pages);
 }
 
+static void zswap_entry_free(struct zswap_entry *entry)
+{
+	__zswap_entry_free(entry, false);
+}
+
 /*********************************
 * compressed storage functions
 **********************************/
@@ -1648,7 +1658,7 @@ void zswap_invalidate(swp_entry_t swp)
 
 	entry = xa_erase(tree, offset);
 	if (entry)
-		zswap_entry_free(entry);
+		__zswap_entry_free(entry, true);
 }
 
 int zswap_swapon(int type, unsigned long nr_pages)
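Taken together, patches 2-4 add the following deferred-free path (an
informal outline of the call flow; exact locking rules and fallbacks
are described in the individual patches):

        process exit path:
            zram_slot_free_notify() -> slot_free_extract() -> zs_free_deferred()   (slot lock held)
            zswap_invalidate()      -> __zswap_entry_free(entry, true) -> zs_free_deferred()

        zs_free_deferred():
            spin_trylock(&pool->deferred_lock), append the handle to
            pool->deferred_handles[], queue_work() once
            ZS_DEFERRED_FREE_THRESHOLD entries are pending; on lock
            contention or a full array, fall back to synchronous zs_free()

        workqueue (process context):
            zs_deferred_free_work() -> zs_free() per pending handle, cond_resched()

        pool teardown:
            zs_destroy_pool() -> zs_free_deferred_flush() (flush_work() + final drain)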