|
| 1 | +// SPDX-License-Identifier: GPL-2.0 |
| 2 | +/* Copyright(c) 2026 Micron Technology, Inc. */ |
| 3 | +#include <linux/memremap.h> |
| 4 | +#include <linux/pagemap.h> |
| 5 | +#include <linux/module.h> |
| 6 | +#include <linux/device.h> |
| 7 | +#include <linux/cdev.h> |
| 8 | +#include <linux/slab.h> |
| 9 | +#include <linux/dax.h> |
| 10 | +#include <linux/uio.h> |
| 11 | +#include <linux/fs.h> |
| 12 | +#include <linux/mm.h> |
| 13 | +#include "dax-private.h" |
| 14 | +#include "bus.h" |
| 15 | + |
| 16 | +/* |
| 17 | + * FS-DAX compatible devdax driver |
| 18 | + * |
| 19 | + * Unlike drivers/dax/device.c which pre-initializes compound folios based |
| 20 | + * on device alignment (via vmemmap_shift), this driver leaves folios |
| 21 | + * uninitialized similar to pmem. This allows fs-dax filesystems like famfs |
| 22 | + * to work without needing special handling for pre-initialized folios. |
| 23 | + * |
| 24 | + * Key differences from device.c: |
| 25 | + * - pgmap type is MEMORY_DEVICE_FS_DAX (not MEMORY_DEVICE_GENERIC) |
| 26 | + * - vmemmap_shift is NOT set (folios remain order-0) |
| 27 | + * - fs-dax can dynamically create compound folios as needed |
| 28 | + * - No mmap support - all access is through fs-dax/iomap |
| 29 | + */ |
| 30 | + |
/*
 * devm action: remove the character device that probe added with cdev_add().
 * @cdev is the struct cdev pointer that was registered with the action.
 */
static void fsdev_cdev_del(void *cdev)
{
	struct cdev *chardev = cdev;

	cdev_del(chardev);
}
| 35 | + |
/*
 * devm action: hand the dev_dax instance registered by probe to
 * kill_dev_dax(). @dev_dax is the struct dev_dax pointer passed at
 * registration time.
 */
static void fsdev_kill(void *dev_dax)
{
	struct dev_dax *victim = dev_dax;

	kill_dev_dax(victim);
}
| 40 | + |
| 41 | +/* |
| 42 | + * Page map operations for FS-DAX mode |
| 43 | + * Similar to fsdax_pagemap_ops in drivers/nvdimm/pmem.c |
| 44 | + * |
| 45 | + * Note: folio_free callback is not needed for MEMORY_DEVICE_FS_DAX. |
| 46 | + * The core mm code in free_zone_device_folio() handles the wake_up_var() |
| 47 | + * directly for this memory type. |
| 48 | + */ |
| 49 | +static int fsdev_pagemap_memory_failure(struct dev_pagemap *pgmap, |
| 50 | + unsigned long pfn, unsigned long nr_pages, int mf_flags) |
| 51 | +{ |
| 52 | + struct dev_dax *dev_dax = pgmap->owner; |
| 53 | + u64 offset = PFN_PHYS(pfn) - dev_dax->ranges[0].range.start; |
| 54 | + u64 len = nr_pages << PAGE_SHIFT; |
| 55 | + |
| 56 | + return dax_holder_notify_failure(dev_dax->dax_dev, offset, |
| 57 | + len, mf_flags); |
| 58 | +} |
| 59 | + |
| 60 | +static const struct dev_pagemap_ops fsdev_pagemap_ops = { |
| 61 | + .memory_failure = fsdev_pagemap_memory_failure, |
| 62 | +}; |
| 63 | + |
| 64 | +/* |
| 65 | + * Clear any stale folio state from pages in the given range. |
| 66 | + * This is necessary because device_dax pre-initializes compound folios |
| 67 | + * based on vmemmap_shift, and that state may persist after driver unbind. |
| 68 | + * Since fsdev_dax uses MEMORY_DEVICE_FS_DAX without vmemmap_shift, fs-dax |
| 69 | + * expects to find clean order-0 folios that it can build into compound |
| 70 | + * folios on demand. |
| 71 | + * |
| 72 | + * At probe time, no filesystem should be mounted yet, so all mappings |
| 73 | + * are stale and must be cleared along with compound state. |
| 74 | + */ |
| 75 | +static void fsdev_clear_folio_state(struct dev_dax *dev_dax) |
| 76 | +{ |
| 77 | + for (int i = 0; i < dev_dax->nr_range; i++) { |
| 78 | + struct range *range = &dev_dax->ranges[i].range; |
| 79 | + unsigned long pfn = PHYS_PFN(range->start); |
| 80 | + unsigned long end_pfn = PHYS_PFN(range->end) + 1; |
| 81 | + |
| 82 | + while (pfn < end_pfn) { |
| 83 | + struct folio *folio = pfn_folio(pfn); |
| 84 | + int order = dax_folio_reset_order(folio); |
| 85 | + |
| 86 | + pfn += 1UL << order; |
| 87 | + } |
| 88 | + } |
| 89 | +} |
| 90 | + |
/*
 * devm action wrapper around fsdev_clear_folio_state(); @data is the
 * struct dev_dax pointer supplied when the action was registered.
 */
static void fsdev_clear_folio_state_action(void *data)
{
	struct dev_dax *dev_dax = data;

	fsdev_clear_folio_state(dev_dax);
}
| 95 | + |
| 96 | +static int fsdev_open(struct inode *inode, struct file *filp) |
| 97 | +{ |
| 98 | + struct dax_device *dax_dev = inode_dax(inode); |
| 99 | + struct dev_dax *dev_dax = dax_get_private(dax_dev); |
| 100 | + |
| 101 | + filp->private_data = dev_dax; |
| 102 | + |
| 103 | + return 0; |
| 104 | +} |
| 105 | + |
/*
 * release() handler for the fsdev character device.
 *
 * fsdev_open() takes no references and allocates nothing, so there is
 * nothing to undo here. Always returns 0.
 */
static int fsdev_release(struct inode *inode, struct file *filp)
{
	return 0;
}
| 110 | + |
| 111 | +static const struct file_operations fsdev_fops = { |
| 112 | + .llseek = noop_llseek, |
| 113 | + .owner = THIS_MODULE, |
| 114 | + .open = fsdev_open, |
| 115 | + .release = fsdev_release, |
| 116 | +}; |
| 117 | + |
| 118 | +static int fsdev_dax_probe(struct dev_dax *dev_dax) |
| 119 | +{ |
| 120 | + struct dax_device *dax_dev = dev_dax->dax_dev; |
| 121 | + struct device *dev = &dev_dax->dev; |
| 122 | + struct dev_pagemap *pgmap; |
| 123 | + struct inode *inode; |
| 124 | + struct cdev *cdev; |
| 125 | + void *addr; |
| 126 | + int rc, i; |
| 127 | + |
| 128 | + if (static_dev_dax(dev_dax)) { |
| 129 | + if (dev_dax->nr_range > 1) { |
| 130 | + dev_warn(dev, "static pgmap / multi-range device conflict\n"); |
| 131 | + return -EINVAL; |
| 132 | + } |
| 133 | + |
| 134 | + pgmap = dev_dax->pgmap; |
| 135 | + } else { |
| 136 | + size_t pgmap_size; |
| 137 | + |
| 138 | + if (dev_dax->pgmap) { |
| 139 | + dev_warn(dev, "dynamic-dax with pre-populated page map\n"); |
| 140 | + return -EINVAL; |
| 141 | + } |
| 142 | + |
| 143 | + pgmap_size = struct_size(pgmap, ranges, dev_dax->nr_range - 1); |
| 144 | + pgmap = devm_kzalloc(dev, pgmap_size, GFP_KERNEL); |
| 145 | + if (!pgmap) |
| 146 | + return -ENOMEM; |
| 147 | + |
| 148 | + pgmap->nr_range = dev_dax->nr_range; |
| 149 | + dev_dax->pgmap = pgmap; |
| 150 | + |
| 151 | + for (i = 0; i < dev_dax->nr_range; i++) { |
| 152 | + struct range *range = &dev_dax->ranges[i].range; |
| 153 | + |
| 154 | + pgmap->ranges[i] = *range; |
| 155 | + } |
| 156 | + } |
| 157 | + |
| 158 | + for (i = 0; i < dev_dax->nr_range; i++) { |
| 159 | + struct range *range = &dev_dax->ranges[i].range; |
| 160 | + |
| 161 | + if (!devm_request_mem_region(dev, range->start, |
| 162 | + range_len(range), dev_name(dev))) { |
| 163 | + dev_warn(dev, "mapping%d: %#llx-%#llx could not reserve range\n", |
| 164 | + i, range->start, range->end); |
| 165 | + return -EBUSY; |
| 166 | + } |
| 167 | + } |
| 168 | + |
| 169 | + /* |
| 170 | + * Use MEMORY_DEVICE_FS_DAX without setting vmemmap_shift, leaving |
| 171 | + * folios at order-0. Unlike device.c (MEMORY_DEVICE_GENERIC), this |
| 172 | + * lets fs-dax dynamically build compound folios as needed, similar |
| 173 | + * to pmem behavior. |
| 174 | + */ |
| 175 | + pgmap->type = MEMORY_DEVICE_FS_DAX; |
| 176 | + pgmap->ops = &fsdev_pagemap_ops; |
| 177 | + pgmap->owner = dev_dax; |
| 178 | + |
| 179 | + addr = devm_memremap_pages(dev, pgmap); |
| 180 | + if (IS_ERR(addr)) |
| 181 | + return PTR_ERR(addr); |
| 182 | + |
| 183 | + /* |
| 184 | + * Clear any stale compound folio state left over from a previous |
| 185 | + * driver (e.g., device_dax with vmemmap_shift). Also register this |
| 186 | + * as a devm action so folio state is cleared on unbind, ensuring |
| 187 | + * clean pages for subsequent drivers (e.g., kmem for system-ram). |
| 188 | + */ |
| 189 | + fsdev_clear_folio_state(dev_dax); |
| 190 | + rc = devm_add_action_or_reset(dev, fsdev_clear_folio_state_action, |
| 191 | + dev_dax); |
| 192 | + if (rc) |
| 193 | + return rc; |
| 194 | + |
| 195 | + /* Detect whether the data is at a non-zero offset into the memory */ |
| 196 | + if (pgmap->range.start != dev_dax->ranges[0].range.start) { |
| 197 | + u64 phys = dev_dax->ranges[0].range.start; |
| 198 | + u64 pgmap_phys = dev_dax->pgmap[0].range.start; |
| 199 | + u64 data_offset = 0; |
| 200 | + |
| 201 | + if (!WARN_ON(pgmap_phys > phys)) |
| 202 | + data_offset = phys - pgmap_phys; |
| 203 | + |
| 204 | + pr_debug("%s: offset detected phys=%llx pgmap_phys=%llx offset=%llx\n", |
| 205 | + __func__, phys, pgmap_phys, data_offset); |
| 206 | + } |
| 207 | + |
| 208 | + inode = dax_inode(dax_dev); |
| 209 | + cdev = inode->i_cdev; |
| 210 | + cdev_init(cdev, &fsdev_fops); |
| 211 | + cdev->owner = dev->driver->owner; |
| 212 | + cdev_set_parent(cdev, &dev->kobj); |
| 213 | + rc = cdev_add(cdev, dev->devt, 1); |
| 214 | + if (rc) |
| 215 | + return rc; |
| 216 | + |
| 217 | + rc = devm_add_action_or_reset(dev, fsdev_cdev_del, cdev); |
| 218 | + if (rc) |
| 219 | + return rc; |
| 220 | + |
| 221 | + run_dax(dax_dev); |
| 222 | + return devm_add_action_or_reset(dev, fsdev_kill, dev_dax); |
| 223 | +} |
| 224 | + |
| 225 | +static struct dax_device_driver fsdev_dax_driver = { |
| 226 | + .probe = fsdev_dax_probe, |
| 227 | + .type = DAXDRV_FSDEV_TYPE, |
| 228 | +}; |
| 229 | + |
| 230 | +static int __init dax_init(void) |
| 231 | +{ |
| 232 | + return dax_driver_register(&fsdev_dax_driver); |
| 233 | +} |
| 234 | + |
| 235 | +static void __exit dax_exit(void) |
| 236 | +{ |
| 237 | + dax_driver_unregister(&fsdev_dax_driver); |
| 238 | +} |
| 239 | + |
| 240 | +MODULE_AUTHOR("John Groves"); |
| 241 | +MODULE_DESCRIPTION("FS-DAX Device: fs-dax compatible devdax driver"); |
| 242 | +MODULE_LICENSE("GPL"); |
| 243 | +module_init(dax_init); |
| 244 | +module_exit(dax_exit); |
| 245 | +MODULE_ALIAS_DAX_DEVICE(0); |
0 commit comments