diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 58d890fe2100e..a20a58b8c29b7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -151,6 +151,7 @@ config X86
 	select ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP	if X86_64
 	select ARCH_WANT_HUGETLB_VMEMMAP_PREINIT	if X86_64
 	select ARCH_WANTS_THP_SWAP		if X86_64
+	select ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO	if X86_64
 	select ARCH_HAS_PARANOID_L1D_FLUSH
 	select ARCH_WANT_IRQS_OFF_ACTIVATE_MM
 	select BUILDTIME_TABLE_SORT
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 4c9f20a689f7b..3030a772d3aa0 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -196,6 +196,8 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
 		sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
 		struct bio **biop, unsigned int flags)
 {
+	struct folio *zero_folio = largest_zero_folio();
+
 	while (nr_sects) {
 		unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects);
 		struct bio *bio;
@@ -208,15 +210,14 @@ static void __blkdev_issue_zero_pages(struct block_device *bdev,
 			break;
 
 		do {
-			unsigned int len, added;
+			unsigned int len;
 
-			len = min_t(sector_t,
-				PAGE_SIZE, nr_sects << SECTOR_SHIFT);
-			added = bio_add_page(bio, ZERO_PAGE(0), len, 0);
-			if (added < len)
+			len = min_t(sector_t, folio_size(zero_folio),
+				    nr_sects << SECTOR_SHIFT);
+			if (!bio_add_folio(bio, zero_folio, len, 0))
 				break;
-			nr_sects -= added >> SECTOR_SHIFT;
-			sector += added >> SECTOR_SHIFT;
+			nr_sects -= len >> SECTOR_SHIFT;
+			sector += len >> SECTOR_SHIFT;
 		} while (nr_sects);
 
 		*biop = bio_chain_and_submit(*biop, bio);
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 7748489fde1b7..c44a6736704bb 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -476,6 +476,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
 
 extern struct folio *huge_zero_folio;
 extern unsigned long huge_zero_pfn;
+extern atomic_t huge_zero_folio_is_static;
 
 static inline bool is_huge_zero_folio(const struct folio *folio)
 {
@@ -494,6 +495,18 @@ static inline bool is_huge_zero_pmd(pmd_t pmd)
 
 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm);
 void mm_put_huge_zero_folio(struct mm_struct *mm);
+struct folio *__get_static_huge_zero_folio(void);
+
+static inline struct folio *get_static_huge_zero_folio(void)
+{
+	if (!IS_ENABLED(CONFIG_STATIC_HUGE_ZERO_FOLIO))
+		return NULL;
+
+	if (likely(atomic_read(&huge_zero_folio_is_static)))
+		return huge_zero_folio;
+
+	return __get_static_huge_zero_folio();
+}
 
 static inline bool thp_migration_supported(void)
 {
@@ -685,6 +698,11 @@ static inline int change_huge_pud(struct mmu_gather *tlb,
 {
 	return 0;
 }
+
+static inline struct folio *get_static_huge_zero_folio(void)
+{
+	return NULL;
+}
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
 static inline int split_folio_to_list_to_order(struct folio *folio,
@@ -698,4 +716,21 @@ static inline int split_folio_to_order(struct folio *folio, int new_order)
 	return split_folio_to_list_to_order(folio, NULL, new_order);
 }
 
+/*
+ * largest_zero_folio - Get the largest zero folio available
+ *
+ * Returns the huge zero folio if CONFIG_STATIC_HUGE_ZERO_FOLIO is
+ * enabled. Otherwise, the folio of ZERO_PAGE(0) is returned.
+ *
+ * Callers must use folio_size() to determine the size of the returned
+ * folio instead of assuming its size.
+ */
+static inline struct folio *largest_zero_folio(void)
+{
+	struct folio *folio = get_static_huge_zero_folio();
+
+	if (folio)
+		return folio;
+	return page_folio(ZERO_PAGE(0));
+}
 #endif /* _LINUX_HUGE_MM_H */
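The new largest_zero_folio() helper is deliberately reference-free, so the calling convention is exactly what __blkdev_issue_zero_pages() follows above: fetch the folio once, then let folio_size() drive the fill loop. A minimal sketch of a hypothetical further caller, for illustration only (zero_fill_bio_bytes() is not part of this patch):

/*
 * Hypothetical caller, illustrating the intended calling convention.
 * The returned folio may be PMD-sized (CONFIG_STATIC_HUGE_ZERO_FOLIO)
 * or just the shared ZERO_PAGE folio, so its size must be queried
 * with folio_size() rather than assumed.
 */
static void zero_fill_bio_bytes(struct bio *bio, size_t len)
{
	struct folio *zero_folio = largest_zero_folio();

	while (len) {
		size_t this_len = min(len, folio_size(zero_folio));

		/* No folio_get()/folio_put(): the folio is never freed. */
		if (!bio_add_folio(bio, zero_folio, this_len, 0))
			break;
		len -= this_len;
	}
}

Either way the loop degrades gracefully: without the static folio it simply adds 4K pages, as the old bio_add_page(ZERO_PAGE(0), ...) code did.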
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 08bc2442db934..a87b40610ab9e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1758,7 +1758,7 @@ enum {
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
 #define MMF_OOM_SKIP		21	/* mm is of no interest for the OOM killer */
 #define MMF_UNSTABLE		22	/* mm is unstable for copy_from_user */
-#define MMF_HUGE_ZERO_PAGE	23	/* mm has ever used the global huge zero page */
+#define MMF_HUGE_ZERO_FOLIO	23	/* mm has ever used the global huge zero folio */
 #define MMF_DISABLE_THP		24	/* disable THP for all VMAs */
 #define MMF_DISABLE_THP_MASK	(1 << MMF_DISABLE_THP)
 #define MMF_OOM_REAP_QUEUED	25	/* mm was queued for oom_reaper */
diff --git a/mm/Kconfig b/mm/Kconfig
index e443fe8cd6cf2..366a6d2d771e3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -823,6 +823,27 @@ config ARCH_WANT_GENERAL_HUGETLB
 config ARCH_WANTS_THP_SWAP
 	def_bool n
 
+config ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO
+	def_bool n
+
+config STATIC_HUGE_ZERO_FOLIO
+	bool "Allocate a PMD-sized folio for zeroing"
+	depends on ARCH_WANTS_STATIC_HUGE_ZERO_FOLIO && TRANSPARENT_HUGEPAGE
+	help
+	  Without this config enabled, the huge zero folio is allocated on
+	  demand and freed under memory pressure once no longer in use.
+	  To detect remaining users reliably, references to the huge zero
+	  folio must be tracked precisely, so it is commonly only available
+	  for mapping into user page tables.
+
+	  With this config enabled, the huge zero folio can also be used
+	  for other purposes that do not implement precise reference
+	  counting: it is still allocated on demand, but never freed,
+	  allowing for more widespread use, for example when performing
+	  I/O, similar to the traditional shared zeropage.
+
+	  Not suitable for memory-constrained systems.
+
 config MM_ID
 	def_bool n
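The two paragraphs of the help text correspond to two distinct call patterns. A schematic contrast, with hypothetical callers (only mm_get_huge_zero_folio(), mm_put_huge_zero_folio() and largest_zero_folio() are real names from this series):

/* Precise reference counting: mapping into user page tables. */
static vm_fault_t fault_in_huge_zero(struct vm_fault *vmf)
{
	/*
	 * Takes a tracked reference that is dropped again via
	 * mm_put_huge_zero_folio() at mm teardown, so the shrinker
	 * can free the folio once the last user is gone.
	 */
	struct folio *folio = mm_get_huge_zero_folio(vmf->vma->vm_mm);

	if (!folio)
		return VM_FAULT_FALLBACK;
	/* ... map the folio ... */
	return 0;
}

/* No reference counting: transient use, e.g. when building I/O. */
static void use_zero_data(void)
{
	/*
	 * With CONFIG_STATIC_HUGE_ZERO_FOLIO this may be the huge zero
	 * folio, pinned for the lifetime of the system; otherwise it
	 * falls back to the shared ZERO_PAGE folio.
	 */
	struct folio *folio = largest_zero_folio();

	/* ... read from folio without folio_get()/folio_put() ... */
}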
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 9c38a95e9f091..db4fb108ffd90 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -75,6 +75,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 static bool split_underused_thp = true;
 
 static atomic_t huge_zero_refcount;
+atomic_t huge_zero_folio_is_static __read_mostly;
 struct folio *huge_zero_folio __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 unsigned long huge_anon_orders_always __read_mostly;
@@ -207,7 +208,7 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
 	return orders;
 }
 
-static bool get_huge_zero_page(void)
+static bool get_huge_zero_folio(void)
 {
 	struct folio *zero_folio;
 retry:
@@ -237,7 +238,7 @@ static bool get_huge_zero_page(void)
 	return true;
 }
 
-static void put_huge_zero_page(void)
+static void put_huge_zero_folio(void)
 {
 	/*
 	 * Counter should never go to zero here. Only shrinker can put
@@ -248,36 +249,79 @@ static void put_huge_zero_page(void)
 
 struct folio *mm_get_huge_zero_folio(struct mm_struct *mm)
 {
-	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
+	if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags))
 		return READ_ONCE(huge_zero_folio);
 
-	if (!get_huge_zero_page())
+	if (!get_huge_zero_folio())
 		return NULL;
 
-	if (test_and_set_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
-		put_huge_zero_page();
+	if (test_and_set_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags))
+		put_huge_zero_folio();
 
 	return READ_ONCE(huge_zero_folio);
 }
 
 void mm_put_huge_zero_folio(struct mm_struct *mm)
 {
-	if (test_bit(MMF_HUGE_ZERO_PAGE, &mm->flags))
-		put_huge_zero_page();
+	if (test_bit(MMF_HUGE_ZERO_FOLIO, &mm->flags))
+		put_huge_zero_folio();
 }
 
-static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
-		struct shrink_control *sc)
+#ifdef CONFIG_STATIC_HUGE_ZERO_FOLIO
+
+struct folio *__get_static_huge_zero_folio(void)
+{
+	static unsigned long fail_count_clear_timer;
+	static atomic_t huge_zero_static_fail_count __read_mostly;
+
+	if (unlikely(!slab_is_available()))
+		return NULL;
+
+	/*
+	 * If we failed to allocate a huge zero folio, just refrain from
+	 * trying for one minute before retrying to get a reference again.
+	 */
+	if (atomic_read(&huge_zero_static_fail_count) > 1) {
+		if (time_before(jiffies, fail_count_clear_timer))
+			return NULL;
+		atomic_set(&huge_zero_static_fail_count, 0);
+	}
+	/*
+	 * Our raised reference will prevent the shrinker from ever having
+	 * success.
+	 */
+	if (!get_huge_zero_folio()) {
+		int count = atomic_inc_return(&huge_zero_static_fail_count);
+
+		if (count > 1)
+			fail_count_clear_timer = jiffies + 60 * HZ;
+
+		return NULL;
+	}
+
+	if (atomic_cmpxchg(&huge_zero_folio_is_static, 0, 1) != 0)
+		put_huge_zero_folio();
+
+	return huge_zero_folio;
+}
+#endif /* CONFIG_STATIC_HUGE_ZERO_FOLIO */
+
+static unsigned long shrink_huge_zero_folio_count(struct shrinker *shrink,
+						  struct shrink_control *sc)
 {
 	/* we can free zero page only if last reference remains */
 	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
 }
 
-static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
-		struct shrink_control *sc)
+static unsigned long shrink_huge_zero_folio_scan(struct shrinker *shrink,
+						 struct shrink_control *sc)
 {
 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
-		struct folio *zero_folio = xchg(&huge_zero_folio, NULL);
+		struct folio *zero_folio;
+
+		if (WARN_ON_ONCE(atomic_read(&huge_zero_folio_is_static)))
+			return 0;
+		zero_folio = xchg(&huge_zero_folio, NULL);
 		BUG_ON(zero_folio == NULL);
 		WRITE_ONCE(huge_zero_pfn, ~0UL);
 		folio_put(zero_folio);
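The tail of __get_static_huge_zero_folio() is a publish-once idiom worth spelling out. A distilled, hypothetical sketch of the same pattern (the `published` flag and publish_static_folio() are invented for illustration):

/*
 * Distilled form of the publish-once idiom above. Racing tasks each
 * take their own reference first; exactly one wins the 0 -> 1
 * transition and keeps its reference, pinning the folio forever,
 * while every loser returns the reference it took.
 */
static atomic_t published;

static void publish_static_folio(void)
{
	if (!get_huge_zero_folio())		/* our own reference */
		return;
	if (atomic_cmpxchg(&published, 0, 1) != 0)
		put_huge_zero_folio();		/* lost the race: undo */
	/* The winner's reference is intentionally never dropped. */
}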
@@ -287,7 +331,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
 	return 0;
 }
 
-static struct shrinker *huge_zero_page_shrinker;
+static struct shrinker *huge_zero_folio_shrinker;
 
 #ifdef CONFIG_SYSFS
 static ssize_t enabled_show(struct kobject *kobj,
@@ -849,8 +893,8 @@ static inline void hugepage_exit_sysfs(struct kobject *hugepage_kobj)
 
 static int __init thp_shrinker_init(void)
 {
-	huge_zero_page_shrinker = shrinker_alloc(0, "thp-zero");
-	if (!huge_zero_page_shrinker)
+	huge_zero_folio_shrinker = shrinker_alloc(0, "thp-zero");
+	if (!huge_zero_folio_shrinker)
 		return -ENOMEM;
 
 	deferred_split_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE |
@@ -858,13 +902,13 @@ static int __init thp_shrinker_init(void)
 					 SHRINKER_NONSLAB,
 					 "thp-deferred_split");
 	if (!deferred_split_shrinker) {
-		shrinker_free(huge_zero_page_shrinker);
+		shrinker_free(huge_zero_folio_shrinker);
 		return -ENOMEM;
 	}
 
-	huge_zero_page_shrinker->count_objects = shrink_huge_zero_page_count;
-	huge_zero_page_shrinker->scan_objects = shrink_huge_zero_page_scan;
-	shrinker_register(huge_zero_page_shrinker);
+	huge_zero_folio_shrinker->count_objects = shrink_huge_zero_folio_count;
+	huge_zero_folio_shrinker->scan_objects = shrink_huge_zero_folio_scan;
+	shrinker_register(huge_zero_folio_shrinker);
 
 	deferred_split_shrinker->count_objects = deferred_split_count;
 	deferred_split_shrinker->scan_objects = deferred_split_scan;
@@ -875,7 +919,7 @@ static int __init thp_shrinker_init(void)
 
 static void __init thp_shrinker_exit(void)
 {
-	shrinker_free(huge_zero_page_shrinker);
+	shrinker_free(huge_zero_folio_shrinker);
 	shrinker_free(deferred_split_shrinker);
 }
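To see why the shrinker and the static folio cannot collide, it helps to play through the refcount protocol: in the existing kernel code, get_huge_zero_page() sets huge_zero_refcount to 2 on allocation (one reference for the caller, one base reference the shrinker may reap), and the count callback only reports work at exactly 1. A toy model, runnable in userspace (illustrative only, not kernel code):

/* Toy model of the huge_zero_refcount protocol; not kernel code. */
#include <assert.h>

static int refcount;	/* stands in for huge_zero_refcount */

/* Mirrors shrink_huge_zero_folio_count(): work only at exactly 1. */
static int reclaimable(void)
{
	return refcount == 1;
}

int main(void)
{
	refcount = 2;		/* allocation: caller ref + base ref */
	assert(!reclaimable());

	refcount--;		/* last transient user drops its ref */
	assert(reclaimable());	/* shrinker may now free the folio */

	refcount = 2;		/* allocated again, then made static: */
	/* __get_static_huge_zero_folio() never drops the caller ref, */
	assert(!reclaimable());	/* so the scan callback is unreachable */
	return 0;
}

Once huge_zero_folio_is_static is set, the permanently held reference keeps the count above 1 forever, which is exactly why shrink_huge_zero_folio_scan() can treat reaching its WARN_ON_ONCE() as a bug.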