Skip to content

Commit 320d3bf

Browse files
yangdongsheng and kawasaki
authored and committed
dm-pcache: add cache device
Add cache_dev.{c,h} to manage the persistent-memory device that stores all pcache metadata and data segments. Splitting this logic out keeps the main dm-pcache code focused on policy while cache_dev handles the low-level interaction with the DAX block device. * DAX mapping - Opens the underlying device via dm_get_device(). - Uses dax_direct_access() to obtain a direct linear mapping; falls back to vmap() when the range is fragmented. * On-disk layout ┌─ 4 KB ─┐ super-block (SB) ├─ 4 KB ─┤ cache_info[0] ├─ 4 KB ─┤ cache_info[1] ├─ 4 KB ─┤ cache_ctrl └─ ... ─┘ segments Constants and macros in the header expose offsets and sizes. * Super-block handling - sb_read(), sb_validate(), sb_init() verify magic, CRC32 and host endianness (flag *PCACHE_SB_F_BIGENDIAN*). - Formatting zeroes the metadata replicas and initialises the segment bitmap when the SB is blank. * Segment allocator - Bitmap protected by seg_lock; find_next_zero_bit() yields the next free 16 MB segment. * Lifecycle helpers - cache_dev_start()/stop() encapsulate init/exit and are invoked by dm-pcache core. - Gracefully handles errors: CRC mismatch, wrong endianness, device too small (< 512 MB), or failed DAX mapping. Signed-off-by: Dongsheng Yang <[email protected]>
1 parent b39f9f2 commit 320d3bf

2 files changed

Lines changed: 369 additions & 0 deletions

File tree

drivers/md/dm-pcache/cache_dev.c

Lines changed: 299 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,299 @@
1+
// SPDX-License-Identifier: GPL-2.0-or-later
2+
3+
#include <linux/blkdev.h>
4+
#include <linux/dax.h>
5+
#include <linux/vmalloc.h>
6+
#include <linux/pfn_t.h>
7+
#include <linux/parser.h>
8+
9+
#include "cache_dev.h"
10+
#include "backing_dev.h"
11+
#include "cache.h"
12+
#include "dm_pcache.h"
13+
14+
static void cache_dev_dax_exit(struct pcache_cache_dev *cache_dev)
15+
{
16+
if (cache_dev->use_vmap)
17+
vunmap(cache_dev->mapping);
18+
}
19+
20+
static int build_vmap(struct dax_device *dax_dev, long total_pages, void **vaddr)
21+
{
22+
struct page **pages;
23+
long i = 0, chunk;
24+
pfn_t pfn;
25+
int ret;
26+
27+
pages = vmalloc_array(total_pages, sizeof(struct page *));
28+
if (!pages)
29+
return -ENOMEM;
30+
31+
do {
32+
chunk = dax_direct_access(dax_dev, i, total_pages - i,
33+
DAX_ACCESS, NULL, &pfn);
34+
if (chunk <= 0) {
35+
ret = chunk ? chunk : -EINVAL;
36+
goto out_free;
37+
}
38+
39+
if (!pfn_t_has_page(pfn)) {
40+
ret = -EOPNOTSUPP;
41+
goto out_free;
42+
}
43+
44+
while (chunk-- && i < total_pages) {
45+
pages[i++] = pfn_t_to_page(pfn);
46+
pfn.val++;
47+
if (!(i & 15))
48+
cond_resched();
49+
}
50+
} while (i < total_pages);
51+
52+
*vaddr = vmap(pages, total_pages, VM_MAP, PAGE_KERNEL);
53+
if (!*vaddr)
54+
ret = -ENOMEM;
55+
out_free:
56+
vfree(pages);
57+
return ret;
58+
}
59+
60+
static int cache_dev_dax_init(struct pcache_cache_dev *cache_dev)
61+
{
62+
struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev);
63+
struct dax_device *dax_dev;
64+
long total_pages, mapped_pages;
65+
u64 bdev_size;
66+
void *vaddr;
67+
int ret;
68+
int id;
69+
pfn_t pfn;
70+
71+
dax_dev = cache_dev->dm_dev->dax_dev;
72+
/* total size check */
73+
bdev_size = bdev_nr_bytes(cache_dev->dm_dev->bdev);
74+
if (bdev_size < PCACHE_CACHE_DEV_SIZE_MIN) {
75+
pcache_dev_err(pcache, "dax device is too small, required at least %llu",
76+
PCACHE_CACHE_DEV_SIZE_MIN);
77+
ret = -ENOSPC;
78+
goto out;
79+
}
80+
81+
total_pages = bdev_size >> PAGE_SHIFT;
82+
/* attempt: direct-map the whole range */
83+
id = dax_read_lock();
84+
mapped_pages = dax_direct_access(dax_dev, 0, total_pages,
85+
DAX_ACCESS, &vaddr, &pfn);
86+
if (mapped_pages < 0) {
87+
pcache_dev_err(pcache, "dax_direct_access failed: %ld\n", mapped_pages);
88+
ret = mapped_pages;
89+
goto unlock;
90+
}
91+
92+
if (!pfn_t_has_page(pfn)) {
93+
ret = -EOPNOTSUPP;
94+
goto unlock;
95+
}
96+
97+
if (mapped_pages == total_pages) {
98+
/* success: contiguous direct mapping */
99+
cache_dev->mapping = vaddr;
100+
} else {
101+
/* need vmap fallback */
102+
ret = build_vmap(dax_dev, total_pages, &vaddr);
103+
if (ret) {
104+
pcache_dev_err(pcache, "vmap fallback failed: %d\n", ret);
105+
goto unlock;
106+
}
107+
108+
cache_dev->mapping = vaddr;
109+
cache_dev->use_vmap = true;
110+
}
111+
dax_read_unlock(id);
112+
113+
return 0;
114+
unlock:
115+
dax_read_unlock(id);
116+
out:
117+
return ret;
118+
}
119+
120+
/* Zero @size bytes at @pos in the DAX mapping and flush them to media. */
void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size)
{
	struct dax_device *dax_dev = cache_dev->dm_dev->dax_dev;

	memset(pos, 0, size);
	dax_flush(dax_dev, pos, size);
}
125+
126+
static int sb_read(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
127+
{
128+
struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev);
129+
130+
if (copy_mc_to_kernel(sb, sb_addr, sizeof(struct pcache_sb)))
131+
return -EIO;
132+
133+
return 0;
134+
}
135+
136+
static void sb_write(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
137+
{
138+
struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev);
139+
140+
memcpy_flushcache(sb_addr, sb, sizeof(struct pcache_sb));
141+
pmem_wmb();
142+
}
143+
144+
static int sb_init(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
145+
{
146+
struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev);
147+
u64 nr_segs;
148+
u64 cache_dev_size;
149+
u64 magic;
150+
u32 flags = 0;
151+
152+
magic = le64_to_cpu(sb->magic);
153+
if (magic)
154+
return -EEXIST;
155+
156+
cache_dev_size = bdev_nr_bytes(file_bdev(cache_dev->dm_dev->bdev_file));
157+
if (cache_dev_size < PCACHE_CACHE_DEV_SIZE_MIN) {
158+
pcache_dev_err(pcache, "dax device is too small, required at least %llu",
159+
PCACHE_CACHE_DEV_SIZE_MIN);
160+
return -ENOSPC;
161+
}
162+
163+
nr_segs = (cache_dev_size - PCACHE_SEGMENTS_OFF) / ((PCACHE_SEG_SIZE));
164+
165+
#if defined(__BYTE_ORDER) ? (__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN)
166+
flags |= PCACHE_SB_F_BIGENDIAN;
167+
#endif
168+
sb->flags = cpu_to_le32(flags);
169+
sb->magic = cpu_to_le64(PCACHE_MAGIC);
170+
sb->seg_num = cpu_to_le32(nr_segs);
171+
sb->crc = cpu_to_le32(crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4));
172+
173+
cache_dev_zero_range(cache_dev, CACHE_DEV_CACHE_INFO(cache_dev),
174+
PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX +
175+
PCACHE_CACHE_CTRL_SIZE);
176+
177+
return 0;
178+
}
179+
180+
/*
 * sb_validate - check a super-block read from media
 *
 * Verifies the magic number, the crc32c over everything after the crc
 * field, and that the on-media endianness flag matches the host: a
 * cache_dev formatted with PCACHE_SB_F_BIGENDIAN may only be registered
 * on a big-endian machine, and vice versa.
 *
 * Returns 0 if the SB is acceptable, -EINVAL otherwise.
 */
static int sb_validate(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
{
	struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev);
	u32 flags;
	u32 crc;

	if (le64_to_cpu(sb->magic) != PCACHE_MAGIC) {
		pcache_dev_err(pcache, "unexpected magic: %llx\n",
				le64_to_cpu(sb->magic));
		return -EINVAL;
	}

	/* CRC covers the SB body, skipping the 4-byte crc field itself */
	crc = crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4);
	if (crc != le32_to_cpu(sb->crc)) {
		pcache_dev_err(pcache, "corrupted sb: %u, expected: %u\n", crc, le32_to_cpu(sb->crc));
		return -EINVAL;
	}

	flags = le32_to_cpu(sb->flags);
#if defined(__BYTE_ORDER) ? (__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN)
	if (!(flags & PCACHE_SB_F_BIGENDIAN)) {
		pcache_dev_err(pcache, "cache_dev is not big endian\n");
		return -EINVAL;
	}
#else
	if (flags & PCACHE_SB_F_BIGENDIAN) {
		pcache_dev_err(pcache, "cache_dev is big endian\n");
		return -EINVAL;
	}
#endif
	return 0;
}
212+
213+
/* Record the segment count and allocate the zeroed free-segment bitmap. */
static int cache_dev_init(struct pcache_cache_dev *cache_dev, u32 seg_num)
{
	cache_dev->seg_num = seg_num;
	cache_dev->seg_bitmap = bitmap_zalloc(seg_num, GFP_KERNEL);

	return cache_dev->seg_bitmap ? 0 : -ENOMEM;
}
222+
223+
/* Release the segment bitmap allocated by cache_dev_init(). */
static void cache_dev_exit(struct pcache_cache_dev *cache_dev)
{
	bitmap_free(cache_dev->seg_bitmap);
}
227+
228+
void cache_dev_stop(struct dm_pcache *pcache)
229+
{
230+
struct pcache_cache_dev *cache_dev = &pcache->cache_dev;
231+
232+
cache_dev_exit(cache_dev);
233+
cache_dev_dax_exit(cache_dev);
234+
}
235+
236+
int cache_dev_start(struct dm_pcache *pcache)
237+
{
238+
struct pcache_cache_dev *cache_dev = &pcache->cache_dev;
239+
struct pcache_sb sb;
240+
bool format = false;
241+
int ret;
242+
243+
mutex_init(&cache_dev->seg_lock);
244+
245+
ret = cache_dev_dax_init(cache_dev);
246+
if (ret) {
247+
pcache_dev_err(pcache, "failed to init cache_dev %s via dax way: %d.",
248+
cache_dev->dm_dev->name, ret);
249+
goto err;
250+
}
251+
252+
ret = sb_read(cache_dev, &sb);
253+
if (ret)
254+
goto dax_release;
255+
256+
if (le64_to_cpu(sb.magic) == 0) {
257+
format = true;
258+
ret = sb_init(cache_dev, &sb);
259+
if (ret < 0)
260+
goto dax_release;
261+
}
262+
263+
ret = sb_validate(cache_dev, &sb);
264+
if (ret)
265+
goto dax_release;
266+
267+
cache_dev->sb_flags = le32_to_cpu(sb.flags);
268+
ret = cache_dev_init(cache_dev, sb.seg_num);
269+
if (ret)
270+
goto dax_release;
271+
272+
if (format)
273+
sb_write(cache_dev, &sb);
274+
275+
return 0;
276+
277+
dax_release:
278+
cache_dev_dax_exit(cache_dev);
279+
err:
280+
return ret;
281+
}
282+
283+
/*
 * Allocate a free segment: find the first clear bit in the segment
 * bitmap under seg_lock, mark it used, and return its id via @seg_id.
 * Returns 0 on success or -ENOSPC when no segment is free.
 */
int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id)
{
	int ret = 0;

	mutex_lock(&cache_dev->seg_lock);
	*seg_id = find_next_zero_bit(cache_dev->seg_bitmap, cache_dev->seg_num, 0);
	if (*seg_id == cache_dev->seg_num)
		ret = -ENOSPC;
	else
		set_bit(*seg_id, cache_dev->seg_bitmap);
	mutex_unlock(&cache_dev->seg_lock);

	return ret;
}

drivers/md/dm-pcache/cache_dev.h

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _PCACHE_CACHE_DEV_H
#define _PCACHE_CACHE_DEV_H

#include <linux/device.h>
#include <linux/device-mapper.h>

#include "pcache_internal.h"

/* Magic identifying a formatted pcache super-block. */
#define PCACHE_MAGIC 0x65B05EFA96C596EFULL

/*
 * On-media layout (offsets from the start of the DAX mapping):
 *   PCACHE_SB_OFF          4 KB  super-block
 *   PCACHE_CACHE_INFO_OFF  4 KB x PCACHE_META_INDEX_MAX  cache_info replicas
 *   PCACHE_CACHE_CTRL_OFF  4 KB  cache_ctrl
 *   PCACHE_SEGMENTS_OFF    data segments, PCACHE_SEG_SIZE each
 */
#define PCACHE_SB_OFF (4 * PCACHE_KB)
#define PCACHE_SB_SIZE (4 * PCACHE_KB)

#define PCACHE_CACHE_INFO_OFF (PCACHE_SB_OFF + PCACHE_SB_SIZE)
#define PCACHE_CACHE_INFO_SIZE (4 * PCACHE_KB)

#define PCACHE_CACHE_CTRL_OFF (PCACHE_CACHE_INFO_OFF + (PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX))
#define PCACHE_CACHE_CTRL_SIZE (4 * PCACHE_KB)

#define PCACHE_SEGMENTS_OFF (PCACHE_CACHE_CTRL_OFF + PCACHE_CACHE_CTRL_SIZE)
#define PCACHE_SEG_INFO_SIZE (4 * PCACHE_KB)

#define PCACHE_CACHE_DEV_SIZE_MIN (512 * PCACHE_MB) /* 512 MB */
#define PCACHE_SEG_SIZE (16 * PCACHE_MB) /* Size of each PCACHE segment (16 MB) */

/* Accessors for the fixed metadata regions within the DAX mapping. */
#define CACHE_DEV_SB(cache_dev) ((struct pcache_sb *)(cache_dev->mapping + PCACHE_SB_OFF))
#define CACHE_DEV_CACHE_INFO(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_INFO_OFF)
#define CACHE_DEV_CACHE_CTRL(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_CTRL_OFF)
#define CACHE_DEV_SEGMENTS(cache_dev) ((void *)cache_dev->mapping + PCACHE_SEGMENTS_OFF)
#define CACHE_DEV_SEGMENT(cache_dev, id) ((void *)CACHE_DEV_SEGMENTS(cache_dev) + (u64)id * PCACHE_SEG_SIZE)

/*
 * PCACHE SB flags configured during formatting
 *
 * The PCACHE_SB_F_xxx flags define registration requirements based on cache_dev
 * formatting. For a machine to register a cache_dev:
 * - PCACHE_SB_F_BIGENDIAN: Requires a big-endian machine.
 */
#define PCACHE_SB_F_BIGENDIAN BIT(0)

/* On-media super-block; all multi-byte fields are little-endian. */
struct pcache_sb {
	__le32 crc;	/* crc32c over the SB body after this field */
	__le32 flags;	/* PCACHE_SB_F_* */
	__le64 magic;	/* PCACHE_MAGIC when formatted, 0 when blank */

	__le32 seg_num;	/* number of PCACHE_SEG_SIZE segments on media */
};

/* Runtime state for the persistent-memory cache device. */
struct pcache_cache_dev {
	u32 sb_flags;		/* CPU-endian copy of sb->flags */
	u32 seg_num;		/* total number of segments */
	void *mapping;		/* kernel VA of the whole device */
	bool use_vmap;		/* mapping was built via vmap() fallback */

	struct dm_dev *dm_dev;

	struct mutex seg_lock;		/* protects seg_bitmap */
	unsigned long *seg_bitmap;	/* set bit == segment allocated */
};

struct dm_pcache;
int cache_dev_start(struct dm_pcache *pcache);
void cache_dev_stop(struct dm_pcache *pcache);

void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size);

int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id);

#endif /* _PCACHE_CACHE_DEV_H */

0 commit comments

Comments
 (0)