Skip to content

Commit 3738097

Browse files
committed
Florian Westphal says: ==================== netfilter: updates for net 1) Inseo An reported a bug with the set element handling in nf_tables: When set cannot accept more elements, we unlink and immediately free an element that was inserted into a public data structure, freeing it without waiting for RCU grace period. Fix this by doing the increment earlier and by deferring possible unlink-and-free to the existing abort path, which performs the needed synchronize_rcu before free. From Pablo Neira Ayuso. This is an ancient bug, dating back to kernel 4.10. 2) syzbot reported WARN_ON() splat in nf_tables that occurs on memory allocation failure. Fix this by a new iterator annotation: The affected walker does not need to clone the data structure and can just use the live version if no clone exists yet. Also from Pablo. This bug existed since 6.10 days. 3) Ancient forever bug in nft_pipapo data structure: The garbage collection logic to remove expired elements is broken. We must unlink from data structure and can only hand the freeing to call_rcu after the clone/live pointers of the data structures have been swapped. Else, readers can observe the free'd element. Reported by Yiming Qian. * tag 'nf-26-03-05' of https://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf: netfilter: nft_set_pipapo: split gc into unlink and reclaim phase netfilter: nf_tables: clone set on flush only netfilter: nf_tables: unconditionally bump set->nelems before insertion ==================== Link: https://patch.msgid.link/[email protected] Signed-off-by: Jakub Kicinski <[email protected]>
2 parents 0abc73c + 9df9578 commit 3738097

6 files changed

Lines changed: 92 additions & 33 deletions

File tree

include/net/netfilter/nf_tables.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,11 +320,13 @@ static inline void *nft_elem_priv_cast(const struct nft_elem_priv *priv)
320320
* @NFT_ITER_UNSPEC: unspecified, to catch errors
321321
* @NFT_ITER_READ: read-only iteration over set elements
322322
* @NFT_ITER_UPDATE: iteration under mutex to update set element state
323+
* @NFT_ITER_UPDATE_CLONE: clone set before iteration under mutex to update element
323324
*/
324325
enum nft_iter_type {
325326
NFT_ITER_UNSPEC,
326327
NFT_ITER_READ,
327328
NFT_ITER_UPDATE,
329+
NFT_ITER_UPDATE_CLONE,
328330
};
329331

330332
struct nft_set;
@@ -1861,6 +1863,11 @@ struct nft_trans_gc {
18611863
struct rcu_head rcu;
18621864
};
18631865

1866+
static inline int nft_trans_gc_space(const struct nft_trans_gc *trans)
1867+
{
1868+
return NFT_TRANS_GC_BATCHCOUNT - trans->count;
1869+
}
1870+
18641871
static inline void nft_ctx_update(struct nft_ctx *ctx,
18651872
const struct nft_trans *trans)
18661873
{

net/netfilter/nf_tables_api.c

Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -833,6 +833,11 @@ static void nft_map_catchall_deactivate(const struct nft_ctx *ctx,
833833
}
834834
}
835835

836+
/* Use NFT_ITER_UPDATE iterator even if this may be called from the preparation
837+
* phase, the set clone might already exist from a previous command, or it might
838+
* be a set that is going away and does not require a clone. The netns and
839+
* netlink release paths also need to work on the live set.
840+
*/
836841
static void nft_map_deactivate(const struct nft_ctx *ctx, struct nft_set *set)
837842
{
838843
struct nft_set_iter iter = {
@@ -7170,6 +7175,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
71707175
struct nft_data_desc desc;
71717176
enum nft_registers dreg;
71727177
struct nft_trans *trans;
7178+
bool set_full = false;
71737179
u64 expiration;
71747180
u64 timeout;
71757181
int err, i;
@@ -7461,10 +7467,18 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
74617467
if (err < 0)
74627468
goto err_elem_free;
74637469

7470+
if (!(flags & NFT_SET_ELEM_CATCHALL)) {
7471+
unsigned int max = nft_set_maxsize(set), nelems;
7472+
7473+
nelems = atomic_inc_return(&set->nelems);
7474+
if (nelems > max)
7475+
set_full = true;
7476+
}
7477+
74647478
trans = nft_trans_elem_alloc(ctx, NFT_MSG_NEWSETELEM, set);
74657479
if (trans == NULL) {
74667480
err = -ENOMEM;
7467-
goto err_elem_free;
7481+
goto err_set_size;
74687482
}
74697483

74707484
ext->genmask = nft_genmask_cur(ctx->net);
@@ -7516,7 +7530,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
75167530

75177531
ue->priv = elem_priv;
75187532
nft_trans_commit_list_add_elem(ctx->net, trans);
7519-
goto err_elem_free;
7533+
goto err_set_size;
75207534
}
75217535
}
75227536
}
@@ -7534,23 +7548,16 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
75347548
goto err_element_clash;
75357549
}
75367550

7537-
if (!(flags & NFT_SET_ELEM_CATCHALL)) {
7538-
unsigned int max = nft_set_maxsize(set);
7539-
7540-
if (!atomic_add_unless(&set->nelems, 1, max)) {
7541-
err = -ENFILE;
7542-
goto err_set_full;
7543-
}
7544-
}
7545-
75467551
nft_trans_container_elem(trans)->elems[0].priv = elem.priv;
75477552
nft_trans_commit_list_add_elem(ctx->net, trans);
7548-
return 0;
75497553

7550-
err_set_full:
7551-
nft_setelem_remove(ctx->net, set, elem.priv);
7554+
return set_full ? -ENFILE : 0;
7555+
75527556
err_element_clash:
75537557
kfree(trans);
7558+
err_set_size:
7559+
if (!(flags & NFT_SET_ELEM_CATCHALL))
7560+
atomic_dec(&set->nelems);
75547561
err_elem_free:
75557562
nf_tables_set_elem_destroy(ctx, set, elem.priv);
75567563
err_parse_data:
@@ -7901,9 +7908,12 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx,
79017908

79027909
static int nft_set_flush(struct nft_ctx *ctx, struct nft_set *set, u8 genmask)
79037910
{
7911+
/* The set backend might need to clone the set, do it now from the
7912+
* preparation phase, use NFT_ITER_UPDATE_CLONE iterator type.
7913+
*/
79047914
struct nft_set_iter iter = {
79057915
.genmask = genmask,
7906-
.type = NFT_ITER_UPDATE,
7916+
.type = NFT_ITER_UPDATE_CLONE,
79077917
.fn = nft_setelem_flush,
79087918
};
79097919

@@ -10483,11 +10493,6 @@ static void nft_trans_gc_queue_work(struct nft_trans_gc *trans)
1048310493
schedule_work(&trans_gc_work);
1048410494
}
1048510495

10486-
static int nft_trans_gc_space(struct nft_trans_gc *trans)
10487-
{
10488-
return NFT_TRANS_GC_BATCHCOUNT - trans->count;
10489-
}
10490-
1049110496
struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
1049210497
unsigned int gc_seq, gfp_t gfp)
1049310498
{

net/netfilter/nft_set_hash.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -374,6 +374,7 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
374374
{
375375
switch (iter->type) {
376376
case NFT_ITER_UPDATE:
377+
case NFT_ITER_UPDATE_CLONE:
377378
/* only relevant for netlink dumps which use READ type */
378379
WARN_ON_ONCE(iter->skip != 0);
379380

net/netfilter/nft_set_pipapo.c

Lines changed: 52 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1680,11 +1680,11 @@ static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
16801680
}
16811681

16821682
/**
1683-
* pipapo_gc() - Drop expired entries from set, destroy start and end elements
1683+
* pipapo_gc_scan() - Drop expired entries from set and link them to gc list
16841684
* @set: nftables API set representation
16851685
* @m: Matching data
16861686
*/
1687-
static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
1687+
static void pipapo_gc_scan(struct nft_set *set, struct nft_pipapo_match *m)
16881688
{
16891689
struct nft_pipapo *priv = nft_set_priv(set);
16901690
struct net *net = read_pnet(&set->net);
@@ -1697,6 +1697,8 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
16971697
if (!gc)
16981698
return;
16991699

1700+
list_add(&gc->list, &priv->gc_head);
1701+
17001702
while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
17011703
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
17021704
const struct nft_pipapo_field *f;
@@ -1724,9 +1726,13 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
17241726
* NFT_SET_ELEM_DEAD_BIT.
17251727
*/
17261728
if (__nft_set_elem_expired(&e->ext, tstamp)) {
1727-
gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
1728-
if (!gc)
1729-
return;
1729+
if (!nft_trans_gc_space(gc)) {
1730+
gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
1731+
if (!gc)
1732+
return;
1733+
1734+
list_add(&gc->list, &priv->gc_head);
1735+
}
17301736

17311737
nft_pipapo_gc_deactivate(net, set, e);
17321738
pipapo_drop(m, rulemap);
@@ -1740,10 +1746,30 @@ static void pipapo_gc(struct nft_set *set, struct nft_pipapo_match *m)
17401746
}
17411747
}
17421748

1743-
gc = nft_trans_gc_catchall_sync(gc);
1749+
priv->last_gc = jiffies;
1750+
}
1751+
1752+
/**
1753+
* pipapo_gc_queue() - Free expired elements
1754+
* @set: nftables API set representation
1755+
*/
1756+
static void pipapo_gc_queue(struct nft_set *set)
1757+
{
1758+
struct nft_pipapo *priv = nft_set_priv(set);
1759+
struct nft_trans_gc *gc, *next;
1760+
1761+
/* always do a catchall cycle: */
1762+
gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
17441763
if (gc) {
1764+
gc = nft_trans_gc_catchall_sync(gc);
1765+
if (gc)
1766+
nft_trans_gc_queue_sync_done(gc);
1767+
}
1768+
1769+
/* always purge queued gc elements. */
1770+
list_for_each_entry_safe(gc, next, &priv->gc_head, list) {
1771+
list_del(&gc->list);
17451772
nft_trans_gc_queue_sync_done(gc);
1746-
priv->last_gc = jiffies;
17471773
}
17481774
}
17491775

@@ -1797,6 +1823,10 @@ static void pipapo_reclaim_match(struct rcu_head *rcu)
17971823
*
17981824
* We also need to create a new working copy for subsequent insertions and
17991825
* deletions.
1826+
*
1827+
* After the live copy has been replaced by the clone, we can safely queue
1828+
* expired elements that have been collected by pipapo_gc_scan() for
1829+
* memory reclaim.
18001830
*/
18011831
static void nft_pipapo_commit(struct nft_set *set)
18021832
{
@@ -1807,14 +1837,16 @@ static void nft_pipapo_commit(struct nft_set *set)
18071837
return;
18081838

18091839
if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
1810-
pipapo_gc(set, priv->clone);
1840+
pipapo_gc_scan(set, priv->clone);
18111841

18121842
old = rcu_replace_pointer(priv->match, priv->clone,
18131843
nft_pipapo_transaction_mutex_held(set));
18141844
priv->clone = NULL;
18151845

18161846
if (old)
18171847
call_rcu(&old->rcu, pipapo_reclaim_match);
1848+
1849+
pipapo_gc_queue(set);
18181850
}
18191851

18201852
static void nft_pipapo_abort(const struct nft_set *set)
@@ -2144,13 +2176,20 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
21442176
const struct nft_pipapo_match *m;
21452177

21462178
switch (iter->type) {
2147-
case NFT_ITER_UPDATE:
2179+
case NFT_ITER_UPDATE_CLONE:
21482180
m = pipapo_maybe_clone(set);
21492181
if (!m) {
21502182
iter->err = -ENOMEM;
21512183
return;
21522184
}
2153-
2185+
nft_pipapo_do_walk(ctx, set, m, iter);
2186+
break;
2187+
case NFT_ITER_UPDATE:
2188+
if (priv->clone)
2189+
m = priv->clone;
2190+
else
2191+
m = rcu_dereference_protected(priv->match,
2192+
nft_pipapo_transaction_mutex_held(set));
21542193
nft_pipapo_do_walk(ctx, set, m, iter);
21552194
break;
21562195
case NFT_ITER_READ:
@@ -2272,6 +2311,7 @@ static int nft_pipapo_init(const struct nft_set *set,
22722311
f->mt = NULL;
22732312
}
22742313

2314+
INIT_LIST_HEAD(&priv->gc_head);
22752315
rcu_assign_pointer(priv->match, m);
22762316

22772317
return 0;
@@ -2321,6 +2361,8 @@ static void nft_pipapo_destroy(const struct nft_ctx *ctx,
23212361
struct nft_pipapo *priv = nft_set_priv(set);
23222362
struct nft_pipapo_match *m;
23232363

2364+
WARN_ON_ONCE(!list_empty(&priv->gc_head));
2365+
23242366
m = rcu_dereference_protected(priv->match, true);
23252367

23262368
if (priv->clone) {

net/netfilter/nft_set_pipapo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,12 +156,14 @@ struct nft_pipapo_match {
156156
* @clone: Copy where pending insertions and deletions are kept
157157
* @width: Total bytes to be matched for one packet, including padding
158158
* @last_gc: Timestamp of last garbage collection run, jiffies
159+
* @gc_head: list of nft_trans_gc to queue up for mem reclaim
159160
*/
160161
struct nft_pipapo {
161162
struct nft_pipapo_match __rcu *match;
162163
struct nft_pipapo_match *clone;
163164
int width;
164165
unsigned long last_gc;
166+
struct list_head gc_head;
165167
};
166168

167169
struct nft_pipapo_elem;

net/netfilter/nft_set_rbtree.c

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -861,13 +861,15 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
861861
struct nft_rbtree *priv = nft_set_priv(set);
862862

863863
switch (iter->type) {
864-
case NFT_ITER_UPDATE:
865-
lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
866-
864+
case NFT_ITER_UPDATE_CLONE:
867865
if (nft_array_may_resize(set) < 0) {
868866
iter->err = -ENOMEM;
869867
break;
870868
}
869+
fallthrough;
870+
case NFT_ITER_UPDATE:
871+
lockdep_assert_held(&nft_pernet(ctx->net)->commit_mutex);
872+
871873
nft_rbtree_do_walk(ctx, set, iter);
872874
break;
873875
case NFT_ITER_READ:

0 commit comments

Comments
 (0)