Skip to content

Commit fafdd92

Browse files
committed
netfilter: nft_set_rbtree: revisit array resize logic
Chris Arges reports high memory consumption with thousands of containers; this patch revisits the array allocation logic. For anonymous sets, start with 16 slots (which takes 256 bytes on x86_64). Expand the array by x2 until the threshold of 512 slots is reached; over that threshold, expand it by x1.5. For non-anonymous sets, start with 1024 slots in the array (which takes 16 Kbytes initially on x86_64). Expand it by x1.5. Use set->ndeact to subtract deactivated elements when calculating the number of slots in the array, otherwise the array size gets increased artificially. Add special-case shrink logic to deal with set flush too. The shrink logic is skipped for anonymous sets. Use check_add_overflow() to calculate the new array size. Add a WARN_ON_ONCE check to make sure elements fit into the new array size. Reported-by: Chris Arges <[email protected]> Fixes: 7e43e0a ("netfilter: nft_set_rbtree: translate rbtree to array for binary search") Signed-off-by: Florian Westphal <[email protected]> Signed-off-by: Pablo Neira Ayuso <[email protected]>
1 parent 9d3f027 commit fafdd92

1 file changed

Lines changed: 75 additions & 17 deletions

File tree

net/netfilter/nft_set_rbtree.c

Lines changed: 75 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -572,14 +572,12 @@ static struct nft_array *nft_array_alloc(u32 max_intervals)
572572
return array;
573573
}
574574

575-
#define NFT_ARRAY_EXTRA_SIZE 10240
576-
577575
/* Similar to nft_rbtree_{u,k}size to hide details to userspace, but consider
578576
* packed representation coming from userspace for anonymous sets too.
579577
*/
580578
static u32 nft_array_elems(const struct nft_set *set)
581579
{
582-
u32 nelems = atomic_read(&set->nelems);
580+
u32 nelems = atomic_read(&set->nelems) - set->ndeact;
583581

584582
/* Adjacent intervals are represented with a single start element in
585583
* anonymous sets, use the current element counter as is.
@@ -595,27 +593,87 @@ static u32 nft_array_elems(const struct nft_set *set)
595593
return (nelems / 2) + 2;
596594
}
597595

598-
static int nft_array_may_resize(const struct nft_set *set)
596+
#define NFT_ARRAY_INITIAL_SIZE 1024
597+
#define NFT_ARRAY_INITIAL_ANON_SIZE 16
598+
#define NFT_ARRAY_INITIAL_ANON_THRESH (8192U / sizeof(struct nft_array_interval))
599+
600+
static int nft_array_may_resize(const struct nft_set *set, bool flush)
599601
{
600-
u32 nelems = nft_array_elems(set), new_max_intervals;
602+
u32 initial_intervals, max_intervals, new_max_intervals, delta;
603+
u32 shrinked_max_intervals, nelems = nft_array_elems(set);
601604
struct nft_rbtree *priv = nft_set_priv(set);
602605
struct nft_array *array;
603606

604-
if (!priv->array_next) {
605-
array = nft_array_alloc(nelems + NFT_ARRAY_EXTRA_SIZE);
606-
if (!array)
607-
return -ENOMEM;
607+
if (nft_set_is_anonymous(set))
608+
initial_intervals = NFT_ARRAY_INITIAL_ANON_SIZE;
609+
else
610+
initial_intervals = NFT_ARRAY_INITIAL_SIZE;
611+
612+
if (priv->array_next) {
613+
max_intervals = priv->array_next->max_intervals;
614+
new_max_intervals = priv->array_next->max_intervals;
615+
} else {
616+
if (priv->array) {
617+
max_intervals = priv->array->max_intervals;
618+
new_max_intervals = priv->array->max_intervals;
619+
} else {
620+
max_intervals = 0;
621+
new_max_intervals = initial_intervals;
622+
}
623+
}
608624

609-
priv->array_next = array;
625+
if (nft_set_is_anonymous(set))
626+
goto maybe_grow;
627+
628+
if (flush) {
629+
/* Set flush just started, nelems still report elements.*/
630+
nelems = 0;
631+
new_max_intervals = NFT_ARRAY_INITIAL_SIZE;
632+
goto realloc_array;
610633
}
611634

612-
if (nelems < priv->array_next->max_intervals)
613-
return 0;
635+
if (check_add_overflow(new_max_intervals, new_max_intervals,
636+
&shrinked_max_intervals))
637+
return -EOVERFLOW;
638+
639+
shrinked_max_intervals = DIV_ROUND_UP(shrinked_max_intervals, 3);
614640

615-
new_max_intervals = priv->array_next->max_intervals + NFT_ARRAY_EXTRA_SIZE;
616-
if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0)
641+
if (shrinked_max_intervals > NFT_ARRAY_INITIAL_SIZE &&
642+
nelems < shrinked_max_intervals) {
643+
new_max_intervals = shrinked_max_intervals;
644+
goto realloc_array;
645+
}
646+
maybe_grow:
647+
if (nelems > new_max_intervals) {
648+
if (nft_set_is_anonymous(set) &&
649+
new_max_intervals < NFT_ARRAY_INITIAL_ANON_THRESH) {
650+
new_max_intervals <<= 1;
651+
} else {
652+
delta = new_max_intervals >> 1;
653+
if (check_add_overflow(new_max_intervals, delta,
654+
&new_max_intervals))
655+
return -EOVERFLOW;
656+
}
657+
}
658+
659+
realloc_array:
660+
if (WARN_ON_ONCE(nelems > new_max_intervals))
617661
return -ENOMEM;
618662

663+
if (priv->array_next) {
664+
if (max_intervals == new_max_intervals)
665+
return 0;
666+
667+
if (nft_array_intervals_alloc(priv->array_next, new_max_intervals) < 0)
668+
return -ENOMEM;
669+
} else {
670+
array = nft_array_alloc(new_max_intervals);
671+
if (!array)
672+
return -ENOMEM;
673+
674+
priv->array_next = array;
675+
}
676+
619677
return 0;
620678
}
621679

@@ -630,7 +688,7 @@ static int nft_rbtree_insert(const struct net *net, const struct nft_set *set,
630688

631689
nft_rbtree_maybe_reset_start_cookie(priv, tstamp);
632690

633-
if (nft_array_may_resize(set) < 0)
691+
if (nft_array_may_resize(set, false) < 0)
634692
return -ENOMEM;
635693

636694
do {
@@ -741,7 +799,7 @@ nft_rbtree_deactivate(const struct net *net, const struct nft_set *set,
741799
nft_rbtree_interval_null(set, this))
742800
priv->start_rbe_cookie = 0;
743801

744-
if (nft_array_may_resize(set) < 0)
802+
if (nft_array_may_resize(set, false) < 0)
745803
return NULL;
746804

747805
while (parent != NULL) {
@@ -811,7 +869,7 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
811869

812870
switch (iter->type) {
813871
case NFT_ITER_UPDATE_CLONE:
814-
if (nft_array_may_resize(set) < 0) {
872+
if (nft_array_may_resize(set, true) < 0) {
815873
iter->err = -ENOMEM;
816874
break;
817875
}

0 commit comments

Comments
 (0)