Mirror of https://github.com/torvalds/linux.git
Merge tag 'for-6.18/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
Pull device mapper updates from Mikulas Patocka:

 - a new dm-pcache target for read/write caching on persistent memory
 - fix typos in docs
 - misc small refactoring
 - mark dm-error with DM_TARGET_PASSES_INTEGRITY
 - dm-request-based: fix NULL pointer dereference and quiesce_depth out of sync
 - dm-linear: optimize REQ_PREFLUSH
 - dm-vdo: return error on corrupted metadata
 - dm-integrity: support asynchronous hash interface

* tag 'for-6.18/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm: (27 commits)
  dm raid: use proper md_ro_state enumerators
  dm-integrity: prefer synchronous hash interface
  dm-integrity: enable asynchronous hash interface
  dm-integrity: rename internal_hash
  dm-integrity: add the "offset" argument
  dm-integrity: allocate the recalculate buffer with kmalloc
  dm-integrity: introduce integrity_kmap and integrity_kunmap
  dm-integrity: replace bvec_kmap_local with kmap_local_page
  dm-integrity: use internal variable for digestsize
  dm vdo: return error on corrupted metadata in start_restoring_volume functions
  dm vdo: Update code to use mem_is_zero
  dm: optimize REQ_PREFLUSH with data when using the linear target
  dm-pcache: use int type to store negative error codes
  dm: fix "writen"->"written"
  dm-pcache: cleanup: fix coding style report by checkpatch.pl
  dm-pcache: remove ctrl_lock for pcache_cache_segment
  dm: fix NULL pointer dereference in __dm_suspend()
  dm: fix queue start/stop imbalance under suspend/load/resume races
  dm-pcache: add persistent cache target in device-mapper
  dm error: mark as DM_TARGET_PASSES_INTEGRITY
  ...
@@ -3,7 +3,7 @@ dm-delay
 ========
 
 Device-Mapper's "delay" target delays reads and/or writes
-and/or flushs and optionally maps them to different devices.
+and/or flushes and optionally maps them to different devices.
 
 Arguments::
@@ -18,7 +18,7 @@ Table line has to either have 3, 6 or 9 arguments:
    to write and flush operations on optionally different write_device with
    optionally different sector offset
 
-9: same as 6 arguments plus define flush_offset and flush_delay explicitely
+9: same as 6 arguments plus define flush_offset and flush_delay explicitly
    on/with optionally different flush_device/flush_offset.
 
 Offsets are specified in sectors.
@@ -40,7 +40,7 @@ Example scripts
    #!/bin/sh
    #
    # Create mapped device delaying write and flush operations for 400ms and
-   # splitting reads to device $1 but writes and flushs to different device $2
+   # splitting reads to device $1 but writes and flushes to different device $2
    # to different offsets of 2048 and 4096 sectors respectively.
    #
    dmsetup create delayed --table "0 `blockdev --getsz $1` delay $1 2048 0 $2 4096 400"
@@ -48,7 +48,7 @@ Example scripts
 ::
    #!/bin/sh
    #
-   # Create mapped device delaying reads for 50ms, writes for 100ms and flushs for 333ms
+   # Create mapped device delaying reads for 50ms, writes for 100ms and flushes for 333ms
    # onto the same backing device at offset 0 sectors.
    #
    dmsetup create delayed --table "0 `blockdev --getsz $1` delay $1 0 50 $2 0 100 $1 0 333"
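For reference alongside the patched examples, the simplest 3-argument form applies a single
delay to all reads, writes and flushes on one device; a minimal sketch, assuming the
``<device> <offset> <delay_ms>`` layout described in the arguments section above::

    #!/bin/sh
    # Hypothetical: delay every read, write and flush to device $1 by 200 ms,
    # mapping sectors 1:1 (offset 0).
    dmsetup create delayed_all --table \
        "0 $(blockdev --getsz "$1") delay $1 0 200"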
Documentation/admin-guide/device-mapper/dm-pcache.rst (new file, 202 lines)
@@ -0,0 +1,202 @@
.. SPDX-License-Identifier: GPL-2.0

=================================
dm-pcache — Persistent Cache
=================================

*Author: Dongsheng Yang <dongsheng.yang@linux.dev>*

This document describes *dm-pcache*, a Device-Mapper target that lets a
byte-addressable *DAX* (persistent-memory, “pmem”) region act as a
high-performance, crash-persistent cache in front of a slower block
device.  The code lives in ``drivers/md/dm-pcache/``.

Quick feature summary
=====================

* *Write-back* caching (only mode currently supported).
* *16 MiB segments* allocated on the pmem device.
* *Data CRC32* verification (optional, per cache).
* Crash-safe: every metadata structure is duplicated
  (``PCACHE_META_INDEX_MAX == 2``) and protected with CRC+sequence numbers.
* *Multi-tree indexing* (indexing trees sharded by logical address) for high
  PMem parallelism.
* Pure *DAX path* I/O – no extra BIO round-trips.
* *Log-structured write-back* that preserves backend crash-consistency.


Constructor
===========

::

   pcache <cache_dev> <backing_dev> [<number_of_optional_arguments> <cache_mode writeback> <data_crc true|false>]

========================= ====================================================
``cache_dev``             Any DAX-capable block device (``/dev/pmem0``…).
                          All metadata *and* cached blocks are stored here.

``backing_dev``           The slow block device to be cached.

``cache_mode``            Optional.  Only ``writeback`` is accepted at the
                          moment.

``data_crc``              Optional, defaults to ``false``:

                          * ``true`` – store a CRC32 for every cached entry
                            and verify it on reads
                          * ``false`` – skip CRC (faster)
========================= ====================================================

Example
-------

.. code-block:: shell

   dmsetup create pcache_sdb --table \
     "0 $(blockdev --getsz /dev/sdb) pcache /dev/pmem0 /dev/sdb 4 cache_mode writeback data_crc true"

The first time a pmem device is used, dm-pcache formats it automatically
(super-block, cache_info, etc.).
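Before building the table it can be worth confirming that the cache device
really is DAX-capable.  A minimal sketch; the ``queue/dax`` sysfs attribute
and the zero-optional-argument table form are assumptions based on the
constructor description above, not interfaces promised by this document:

.. code-block:: shell

   #!/bin/sh
   # Check DAX capability of the pmem device, then create a pcache target
   # with default options (no optional arguments).
   pmem=/dev/pmem0
   origin=/dev/sdb

   if [ "$(cat /sys/block/$(basename "$pmem")/queue/dax 2>/dev/null)" != "1" ]; then
           echo "$pmem does not advertise DAX support" >&2
           exit 1
   fi

   dmsetup create pcache_sdb --table \
       "0 $(blockdev --getsz "$origin") pcache $pmem $origin"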
Status line
===========

``dmsetup status <device>`` (``STATUSTYPE_INFO``) prints:

::

   <sb_flags> <seg_total> <cache_segs> <segs_used> \
   <gc_percent> <cache_flags> \
   <key_head_seg>:<key_head_off> \
   <dirty_tail_seg>:<dirty_tail_off> \
   <key_tail_seg>:<key_tail_off>

Field meanings
--------------

=============================== =============================================
``sb_flags``                    Super-block flags (e.g. endian marker).

``seg_total``                   Number of physical *pmem* segments.

``cache_segs``                  Number of segments used for cache.

``segs_used``                   Segments currently allocated (bitmap weight).

``gc_percent``                  Current GC high-water mark (0-90).

``cache_flags``                 Bit 0 – DATA_CRC enabled
                                Bit 1 – INIT_DONE (cache initialised)
                                Bits 2-5 – cache mode (0 == WB).

``key_head``                    Where new key-sets are being written.

``dirty_tail``                  First dirty key-set that still needs
                                write-back to the backing device.

``key_tail``                    First key-set that may be reclaimed by GC.
=============================== =============================================
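The fields are positional, so a script can split them with plain shell word
splitting.  A small sketch, using the hypothetical device name ``pcache_sdb``
and the field order documented above (the first three words are the generic
``start length target`` prefix that ``dmsetup status`` always prints):

.. code-block:: shell

   #!/bin/sh
   # Split the pcache status line into named fields.
   set -- $(dmsetup status pcache_sdb)
   shift 3
   sb_flags=$1 seg_total=$2 cache_segs=$3 segs_used=$4
   gc_percent=$5 cache_flags=$6
   key_head=$7 dirty_tail=$8 key_tail=$9
   echo "segments used: $segs_used of $cache_segs (GC threshold ${gc_percent}%)"
   echo "dirty_tail=$dirty_tail key_tail=$key_tail"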
Messages
========

*Change GC trigger*

::

   dmsetup message <dev> 0 gc_percent <0-90>


Theory of operation
===================

Sub-devices
-----------

==================== =========================================================
backing_dev          Any block device (SSD/HDD/loop/LVM, etc.).
cache_dev            DAX device; must expose direct-access memory.
==================== =========================================================

Segments and key-sets
---------------------

* The pmem space is divided into *16 MiB segments* (a rough sizing sketch
  follows this list).
* Each write allocates space from a per-CPU *data_head* inside a segment.
* A *cache-key* records a logical range on the origin and where it lives
  inside pmem (segment + offset + generation).
* 128 keys form a *key-set* (kset); ksets are written sequentially in pmem
  and are themselves crash-safe (CRC).
* The pair *(key_tail, dirty_tail)* delimits clean/dirty and live/dead ksets.
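As a rough sizing check, the number of 16 MiB segments can be estimated from
the pmem device size.  This is only a sketch: it ignores the super-block and
other metadata, so the real ``seg_total`` reported in the status line will be
somewhat smaller:

.. code-block:: shell

   #!/bin/sh
   # Estimate how many 16 MiB segments /dev/pmem0 could hold.
   SEG_SIZE=$((16 * 1024 * 1024))
   pmem_bytes=$(blockdev --getsize64 /dev/pmem0)
   echo "approx segments: $((pmem_bytes / SEG_SIZE))"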
Write-back
----------

Dirty keys are queued into a tree; a background worker copies data
back to the backing_dev and advances *dirty_tail*.  A FLUSH/FUA bio from the
upper layers forces an immediate metadata commit.
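One way to observe this from user space is to compare ``dirty_tail`` before
and after forcing dirty data out.  A sketch, assuming a device named
``pcache_sdb`` with a filesystem mounted on ``/mnt``; the background writer
advances *dirty_tail* asynchronously, so the second reading may lag:

.. code-block:: shell

   #!/bin/sh
   # dirty_tail is the 8th pcache field, i.e. the 11th word of the status line.
   dirty_tail() { dmsetup status pcache_sdb | awk '{print $11}'; }

   echo "dirty_tail before: $(dirty_tail)"
   dd if=/dev/zero of=/mnt/testfile bs=1M count=64 conv=fsync
   sync
   echo "dirty_tail after:  $(dirty_tail)"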
Garbage collection
------------------

GC starts when ``segs_used >= seg_total * gc_percent / 100``.  It walks
from *key_tail*, frees segments whose every key has been invalidated, and
advances *key_tail*.
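The same trigger condition can be re-evaluated from the status fields.  A
sketch, again using the hypothetical ``pcache_sdb`` device and the positional
fields listed earlier:

.. code-block:: shell

   #!/bin/sh
   # Evaluate the GC trigger condition from the status line.
   set -- $(dmsetup status pcache_sdb)
   seg_total=$5 segs_used=$7 gc_percent=$8
   if [ "$segs_used" -ge $((seg_total * gc_percent / 100)) ]; then
           echo "GC should be active ($segs_used of $seg_total segments used)"
   else
           echo "below GC threshold ($segs_used/$seg_total, trigger at ${gc_percent}%)"
   fi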
CRC verification
----------------

If ``data_crc`` is enabled, dm-pcache computes a CRC32 over every cached data
range when it is inserted and stores it in the on-media key.  Reads
validate the CRC before copying to the caller.


Failure handling
================

* *pmem media errors* – all metadata copies are read with
  ``copy_mc_to_kernel``; an uncorrectable error logs and aborts initialisation.
* *Cache full* – if no free segment can be found, writes return ``-EBUSY``;
  dm-pcache retries internally (request deferral).
* *System crash* – on attach, the driver replays ksets from *key_tail* to
  rebuild the in-core trees; every segment's generation guards against
  use-after-free keys.


Limitations & TODO
==================

* Only *write-back* mode; other modes planned.
* Only FIFO cache invalidation; others (LRU, ARC, ...) planned.
* Table reload is not supported currently.
* Discard planned.


Example workflow
================

.. code-block:: shell

   # 1. Create devices
   dmsetup create pcache_sdb --table \
     "0 $(blockdev --getsz /dev/sdb) pcache /dev/pmem0 /dev/sdb 4 cache_mode writeback data_crc true"

   # 2. Put a filesystem on top
   mkfs.ext4 /dev/mapper/pcache_sdb
   mount /dev/mapper/pcache_sdb /mnt

   # 3. Tune GC threshold to 80 %
   dmsetup message pcache_sdb 0 gc_percent 80

   # 4. Observe status
   watch -n1 'dmsetup status pcache_sdb'

   # 5. Shutdown
   umount /mnt
   dmsetup remove pcache_sdb


``dm-pcache`` is under active development; feedback, bug reports and patches
are very welcome!
@@ -18,6 +18,7 @@ Device Mapper
|
||||
dm-integrity
|
||||
dm-io
|
||||
dm-log
|
||||
dm-pcache
|
||||
dm-queue-length
|
||||
dm-raid
|
||||
dm-service-time
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0-only
|
||||
|
||||
======
|
||||
dm-vdo
|
||||
======
|
||||
|
||||
|
||||
@@ -7133,6 +7133,14 @@ S: Maintained
|
||||
F: Documentation/admin-guide/device-mapper/vdo*.rst
|
||||
F: drivers/md/dm-vdo/
|
||||
|
||||
DEVICE-MAPPER PCACHE TARGET
|
||||
M: Dongsheng Yang <dongsheng.yang@linux.dev>
|
||||
M: Zheng Gu <cengku@gmail.com>
|
||||
L: dm-devel@lists.linux.dev
|
||||
S: Maintained
|
||||
F: Documentation/admin-guide/device-mapper/dm-pcache.rst
|
||||
F: drivers/md/dm-pcache/
|
||||
|
||||
DEVLINK
|
||||
M: Jiri Pirko <jiri@resnulli.us>
|
||||
L: netdev@vger.kernel.org
|
||||
|
||||
@@ -688,4 +688,6 @@ config DM_AUDIT
|
||||
|
||||
source "drivers/md/dm-vdo/Kconfig"
|
||||
|
||||
source "drivers/md/dm-pcache/Kconfig"
|
||||
|
||||
endif # MD
|
||||
|
||||
@@ -73,6 +73,7 @@ obj-$(CONFIG_DM_RAID) += dm-raid.o
|
||||
obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
|
||||
obj-$(CONFIG_DM_VERITY) += dm-verity.o
|
||||
obj-$(CONFIG_DM_VDO) += dm-vdo/
|
||||
obj-$(CONFIG_DM_PCACHE) += dm-pcache/
|
||||
obj-$(CONFIG_DM_CACHE) += dm-cache.o
|
||||
obj-$(CONFIG_DM_CACHE_SMQ) += dm-cache-smq.o
|
||||
obj-$(CONFIG_DM_EBS) += dm-ebs.o
|
||||
|
||||
@@ -1337,7 +1337,7 @@ static void use_bio(struct dm_buffer *b, enum req_op op, sector_t sector,
|
||||
char *ptr;
|
||||
unsigned int len;
|
||||
|
||||
bio = bio_kmalloc(1, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOWARN);
|
||||
bio = bio_kmalloc(1, GFP_NOWAIT);
|
||||
if (!bio) {
|
||||
use_dmio(b, op, sector, n_sectors, offset, ioprio);
|
||||
return;
|
||||
@@ -1601,18 +1601,18 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
|
||||
* dm-bufio is resistant to allocation failures (it just keeps
|
||||
* one buffer reserved in cases all the allocations fail).
|
||||
* So set flags to not try too hard:
|
||||
* GFP_NOWAIT: don't wait; if we need to sleep we'll release our
|
||||
* mutex and wait ourselves.
|
||||
* GFP_NOWAIT: don't wait and don't print a warning in case of
|
||||
* failure; if we need to sleep we'll release our mutex
|
||||
* and wait ourselves.
|
||||
* __GFP_NORETRY: don't retry and rather return failure
|
||||
* __GFP_NOMEMALLOC: don't use emergency reserves
|
||||
* __GFP_NOWARN: don't print a warning in case of failure
|
||||
*
|
||||
* For debugging, if we set the cache size to 1, no new buffers will
|
||||
* be allocated.
|
||||
*/
|
||||
while (1) {
|
||||
if (dm_bufio_cache_size_latch != 1) {
|
||||
b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
|
||||
b = alloc_buffer(c, GFP_NOWAIT | __GFP_NORETRY | __GFP_NOMEMALLOC);
|
||||
if (b)
|
||||
return b;
|
||||
}
|
||||
|
||||
@@ -590,7 +590,7 @@ static int h_init(struct smq_hash_table *ht, struct entry_space *es, unsigned in
|
||||
nr_buckets = roundup_pow_of_two(max(nr_entries / 4u, 16u));
|
||||
ht->hash_bits = __ffs(nr_buckets);
|
||||
|
||||
ht->buckets = vmalloc(array_size(nr_buckets, sizeof(*ht->buckets)));
|
||||
ht->buckets = vmalloc_array(nr_buckets, sizeof(*ht->buckets));
|
||||
if (!ht->buckets)
|
||||
return -ENOMEM;
|
||||
|
||||
|
||||
@@ -162,6 +162,7 @@ struct mapped_device {
|
||||
#define DMF_SUSPENDED_INTERNALLY 7
|
||||
#define DMF_POST_SUSPENDING 8
|
||||
#define DMF_EMULATE_ZONE_APPEND 9
|
||||
#define DMF_QUEUE_STOPPED 10
|
||||
|
||||
static inline sector_t dm_get_size(struct mapped_device *md)
|
||||
{
|
||||
@@ -291,6 +292,7 @@ struct dm_io {
|
||||
struct dm_io *next;
|
||||
struct dm_stats_aux stats_aux;
|
||||
blk_status_t status;
|
||||
bool requeue_flush_with_data;
|
||||
atomic_t io_count;
|
||||
struct mapped_device *md;
|
||||
|
||||
|
||||
@@ -45,7 +45,7 @@ static void fix_separator_chars(char **buf)
|
||||
/*
|
||||
* Internal function to allocate memory for IMA measurements.
|
||||
*/
|
||||
static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio)
|
||||
static void *dm_ima_alloc(size_t len, bool noio)
|
||||
{
|
||||
unsigned int noio_flag;
|
||||
void *ptr;
|
||||
@@ -53,7 +53,7 @@ static void *dm_ima_alloc(size_t len, gfp_t flags, bool noio)
|
||||
if (noio)
|
||||
noio_flag = memalloc_noio_save();
|
||||
|
||||
ptr = kzalloc(len, flags);
|
||||
ptr = kzalloc(len, GFP_KERNEL);
|
||||
|
||||
if (noio)
|
||||
memalloc_noio_restore(noio_flag);
|
||||
@@ -68,13 +68,13 @@ static int dm_ima_alloc_and_copy_name_uuid(struct mapped_device *md, char **dev_
|
||||
char **dev_uuid, bool noio)
|
||||
{
|
||||
int r;
|
||||
*dev_name = dm_ima_alloc(DM_NAME_LEN*2, GFP_KERNEL, noio);
|
||||
*dev_name = dm_ima_alloc(DM_NAME_LEN*2, noio);
|
||||
if (!(*dev_name)) {
|
||||
r = -ENOMEM;
|
||||
goto error;
|
||||
}
|
||||
|
||||
*dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, GFP_KERNEL, noio);
|
||||
*dev_uuid = dm_ima_alloc(DM_UUID_LEN*2, noio);
|
||||
if (!(*dev_uuid)) {
|
||||
r = -ENOMEM;
|
||||
goto error;
|
||||
@@ -109,7 +109,7 @@ static int dm_ima_alloc_and_copy_device_data(struct mapped_device *md, char **de
|
||||
if (r)
|
||||
return r;
|
||||
|
||||
*device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio);
|
||||
*device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio);
|
||||
if (!(*device_data)) {
|
||||
r = -ENOMEM;
|
||||
goto error;
|
||||
@@ -153,14 +153,12 @@ static int dm_ima_alloc_and_copy_capacity_str(struct mapped_device *md, char **c
|
||||
|
||||
capacity = get_capacity(md->disk);
|
||||
|
||||
*capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, GFP_KERNEL, noio);
|
||||
*capacity_str = dm_ima_alloc(DM_IMA_DEVICE_CAPACITY_BUF_LEN, noio);
|
||||
if (!(*capacity_str))
|
||||
return -ENOMEM;
|
||||
|
||||
scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;",
|
||||
capacity);
|
||||
|
||||
return 0;
|
||||
return scnprintf(*capacity_str, DM_IMA_DEVICE_BUF_LEN, "current_device_capacity=%llu;",
|
||||
capacity);
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -195,15 +193,15 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl
|
||||
const size_t hash_alg_prefix_len = strlen(DM_IMA_TABLE_HASH_ALG) + 1;
|
||||
char table_load_event_name[] = "dm_table_load";
|
||||
|
||||
ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, GFP_KERNEL, noio);
|
||||
ima_buf = dm_ima_alloc(DM_IMA_MEASUREMENT_BUF_LEN, noio);
|
||||
if (!ima_buf)
|
||||
return;
|
||||
|
||||
target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, GFP_KERNEL, noio);
|
||||
target_metadata_buf = dm_ima_alloc(DM_IMA_TARGET_METADATA_BUF_LEN, noio);
|
||||
if (!target_metadata_buf)
|
||||
goto error;
|
||||
|
||||
target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, GFP_KERNEL, noio);
|
||||
target_data_buf = dm_ima_alloc(DM_IMA_TARGET_DATA_BUF_LEN, noio);
|
||||
if (!target_data_buf)
|
||||
goto error;
|
||||
|
||||
@@ -218,7 +216,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl
|
||||
|
||||
shash->tfm = tfm;
|
||||
digest_size = crypto_shash_digestsize(tfm);
|
||||
digest = dm_ima_alloc(digest_size, GFP_KERNEL, noio);
|
||||
digest = dm_ima_alloc(digest_size, noio);
|
||||
if (!digest)
|
||||
goto error;
|
||||
|
||||
@@ -327,7 +325,7 @@ void dm_ima_measure_on_table_load(struct dm_table *table, unsigned int status_fl
|
||||
if (r < 0)
|
||||
goto error;
|
||||
|
||||
digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, GFP_KERNEL, noio);
|
||||
digest_buf = dm_ima_alloc((digest_size*2) + hash_alg_prefix_len + 1, noio);
|
||||
|
||||
if (!digest_buf)
|
||||
goto error;
|
||||
@@ -371,18 +369,18 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap)
|
||||
{
|
||||
char *device_table_data, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL;
|
||||
char active[] = "active_table_hash=";
|
||||
unsigned int active_len = strlen(active), capacity_len = 0;
|
||||
unsigned int active_len = strlen(active);
|
||||
unsigned int l = 0;
|
||||
bool noio = true;
|
||||
bool nodata = true;
|
||||
int r;
|
||||
int capacity_len;
|
||||
|
||||
device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio);
|
||||
device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio);
|
||||
if (!device_table_data)
|
||||
return;
|
||||
|
||||
r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
|
||||
if (r)
|
||||
capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
|
||||
if (capacity_len < 0)
|
||||
goto error;
|
||||
|
||||
memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len);
|
||||
@@ -445,8 +443,7 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap)
|
||||
}
|
||||
|
||||
if (nodata) {
|
||||
r = dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio);
|
||||
if (r)
|
||||
if (dm_ima_alloc_and_copy_name_uuid(md, &dev_name, &dev_uuid, noio))
|
||||
goto error;
|
||||
|
||||
l = scnprintf(device_table_data, DM_IMA_DEVICE_BUF_LEN,
|
||||
@@ -454,7 +451,6 @@ void dm_ima_measure_on_device_resume(struct mapped_device *md, bool swap)
|
||||
DM_IMA_VERSION_STR, dev_name, dev_uuid);
|
||||
}
|
||||
|
||||
capacity_len = strlen(capacity_str);
|
||||
memcpy(device_table_data + l, capacity_str, capacity_len);
|
||||
l += capacity_len;
|
||||
|
||||
@@ -483,18 +479,17 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all)
|
||||
unsigned int device_active_len = strlen(device_active_str);
|
||||
unsigned int device_inactive_len = strlen(device_inactive_str);
|
||||
unsigned int remove_all_len = strlen(remove_all_str);
|
||||
unsigned int capacity_len = 0;
|
||||
unsigned int l = 0;
|
||||
bool noio = true;
|
||||
bool nodata = true;
|
||||
int r;
|
||||
int capacity_len;
|
||||
|
||||
device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, GFP_KERNEL, noio);
|
||||
device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN*2, noio);
|
||||
if (!device_table_data)
|
||||
goto exit;
|
||||
|
||||
r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
|
||||
if (r) {
|
||||
capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
|
||||
if (capacity_len < 0) {
|
||||
kfree(device_table_data);
|
||||
goto exit;
|
||||
}
|
||||
@@ -570,7 +565,6 @@ void dm_ima_measure_on_device_remove(struct mapped_device *md, bool remove_all)
|
||||
memcpy(device_table_data + l, remove_all ? "y;" : "n;", 2);
|
||||
l += 2;
|
||||
|
||||
capacity_len = strlen(capacity_str);
|
||||
memcpy(device_table_data + l, capacity_str, capacity_len);
|
||||
l += capacity_len;
|
||||
|
||||
@@ -602,20 +596,20 @@ exit:
|
||||
*/
|
||||
void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map)
|
||||
{
|
||||
unsigned int l = 0, capacity_len = 0;
|
||||
unsigned int l = 0;
|
||||
char *device_table_data = NULL, *dev_name = NULL, *dev_uuid = NULL, *capacity_str = NULL;
|
||||
char inactive_str[] = "inactive_table_hash=";
|
||||
unsigned int inactive_len = strlen(inactive_str);
|
||||
bool noio = true;
|
||||
bool nodata = true;
|
||||
int r;
|
||||
int capacity_len;
|
||||
|
||||
device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, GFP_KERNEL, noio);
|
||||
device_table_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN, noio);
|
||||
if (!device_table_data)
|
||||
return;
|
||||
|
||||
r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
|
||||
if (r)
|
||||
capacity_len = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
|
||||
if (capacity_len < 0)
|
||||
goto error1;
|
||||
|
||||
memcpy(device_table_data + l, DM_IMA_VERSION_STR, md->ima.dm_version_str_len);
|
||||
@@ -650,7 +644,6 @@ void dm_ima_measure_on_table_clear(struct mapped_device *md, bool new_map)
|
||||
DM_IMA_VERSION_STR, dev_name, dev_uuid);
|
||||
}
|
||||
|
||||
capacity_len = strlen(capacity_str);
|
||||
memcpy(device_table_data + l, capacity_str, capacity_len);
|
||||
l += capacity_len;
|
||||
|
||||
@@ -703,7 +696,7 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md)
|
||||
char *old_device_data = NULL, *new_device_data = NULL, *combined_device_data = NULL;
|
||||
char *new_dev_name = NULL, *new_dev_uuid = NULL, *capacity_str = NULL;
|
||||
bool noio = true;
|
||||
int r, len;
|
||||
int len;
|
||||
|
||||
if (dm_ima_alloc_and_copy_device_data(md, &new_device_data,
|
||||
md->ima.active_table.num_targets, noio))
|
||||
@@ -712,12 +705,11 @@ void dm_ima_measure_on_device_rename(struct mapped_device *md)
|
||||
if (dm_ima_alloc_and_copy_name_uuid(md, &new_dev_name, &new_dev_uuid, noio))
|
||||
goto error;
|
||||
|
||||
combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, GFP_KERNEL, noio);
|
||||
combined_device_data = dm_ima_alloc(DM_IMA_DEVICE_BUF_LEN * 2, noio);
|
||||
if (!combined_device_data)
|
||||
goto error;
|
||||
|
||||
r = dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio);
|
||||
if (r)
|
||||
if (dm_ima_alloc_and_copy_capacity_str(md, &capacity_str, noio) < 0)
|
||||
goto error;
|
||||
|
||||
old_device_data = md->ima.active_table.device_metadata;
|
||||
|
||||
@@ -219,10 +219,13 @@ struct dm_integrity_c {
|
||||
__u8 log2_blocks_per_bitmap_bit;
|
||||
|
||||
unsigned char mode;
|
||||
bool internal_hash;
|
||||
|
||||
int failed;
|
||||
|
||||
struct crypto_shash *internal_hash;
|
||||
struct crypto_shash *internal_shash;
|
||||
struct crypto_ahash *internal_ahash;
|
||||
unsigned int internal_hash_digestsize;
|
||||
|
||||
struct dm_target *ti;
|
||||
|
||||
@@ -277,6 +280,9 @@ struct dm_integrity_c {
|
||||
bool fix_hmac;
|
||||
bool legacy_recalculate;
|
||||
|
||||
mempool_t ahash_req_pool;
|
||||
struct ahash_request *journal_ahash_req;
|
||||
|
||||
struct alg_spec internal_hash_alg;
|
||||
struct alg_spec journal_crypt_alg;
|
||||
struct alg_spec journal_mac_alg;
|
||||
@@ -326,6 +332,8 @@ struct dm_integrity_io {
|
||||
unsigned payload_len;
|
||||
bool integrity_payload_from_mempool;
|
||||
bool integrity_range_locked;
|
||||
|
||||
struct ahash_request *ahash_req;
|
||||
};
|
||||
|
||||
struct journal_completion {
|
||||
@@ -352,6 +360,7 @@ struct bitmap_block_status {
|
||||
static struct kmem_cache *journal_io_cache;
|
||||
|
||||
#define JOURNAL_IO_MEMPOOL 32
|
||||
#define AHASH_MEMPOOL 32
|
||||
|
||||
#ifdef DEBUG_PRINT
|
||||
#define DEBUG_print(x, ...) printk(KERN_DEBUG x, ##__VA_ARGS__)
|
||||
@@ -1634,15 +1643,15 @@ static void integrity_end_io(struct bio *bio)
|
||||
dec_in_flight(dio);
|
||||
}
|
||||
|
||||
static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector,
|
||||
const char *data, char *result)
|
||||
static void integrity_sector_checksum_shash(struct dm_integrity_c *ic, sector_t sector,
|
||||
const char *data, unsigned offset, char *result)
|
||||
{
|
||||
__le64 sector_le = cpu_to_le64(sector);
|
||||
SHASH_DESC_ON_STACK(req, ic->internal_hash);
|
||||
SHASH_DESC_ON_STACK(req, ic->internal_shash);
|
||||
int r;
|
||||
unsigned int digest_size;
|
||||
|
||||
req->tfm = ic->internal_hash;
|
||||
req->tfm = ic->internal_shash;
|
||||
|
||||
r = crypto_shash_init(req);
|
||||
if (unlikely(r < 0)) {
|
||||
@@ -1664,7 +1673,7 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector
|
||||
goto failed;
|
||||
}
|
||||
|
||||
r = crypto_shash_update(req, data, ic->sectors_per_block << SECTOR_SHIFT);
|
||||
r = crypto_shash_update(req, data + offset, ic->sectors_per_block << SECTOR_SHIFT);
|
||||
if (unlikely(r < 0)) {
|
||||
dm_integrity_io_error(ic, "crypto_shash_update", r);
|
||||
goto failed;
|
||||
@@ -1676,7 +1685,7 @@ static void integrity_sector_checksum(struct dm_integrity_c *ic, sector_t sector
|
||||
goto failed;
|
||||
}
|
||||
|
||||
digest_size = crypto_shash_digestsize(ic->internal_hash);
|
||||
digest_size = ic->internal_hash_digestsize;
|
||||
if (unlikely(digest_size < ic->tag_size))
|
||||
memset(result + digest_size, 0, ic->tag_size - digest_size);
|
||||
|
||||
@@ -1687,6 +1696,104 @@ failed:
|
||||
get_random_bytes(result, ic->tag_size);
|
||||
}
|
||||
|
||||
static void integrity_sector_checksum_ahash(struct dm_integrity_c *ic, struct ahash_request **ahash_req,
|
||||
sector_t sector, struct page *page, unsigned offset, char *result)
|
||||
{
|
||||
__le64 sector_le = cpu_to_le64(sector);
|
||||
struct ahash_request *req;
|
||||
DECLARE_CRYPTO_WAIT(wait);
|
||||
struct scatterlist sg[3], *s = sg;
|
||||
int r;
|
||||
unsigned int digest_size;
|
||||
unsigned int nbytes = 0;
|
||||
|
||||
might_sleep();
|
||||
|
||||
req = *ahash_req;
|
||||
if (unlikely(!req)) {
|
||||
req = mempool_alloc(&ic->ahash_req_pool, GFP_NOIO);
|
||||
*ahash_req = req;
|
||||
}
|
||||
|
||||
ahash_request_set_tfm(req, ic->internal_ahash);
|
||||
ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, crypto_req_done, &wait);
|
||||
|
||||
if (ic->sb->flags & cpu_to_le32(SB_FLAG_FIXED_HMAC)) {
|
||||
sg_init_table(sg, 3);
|
||||
sg_set_buf(s, (const __u8 *)&ic->sb->salt, SALT_SIZE);
|
||||
nbytes += SALT_SIZE;
|
||||
s++;
|
||||
} else {
|
||||
sg_init_table(sg, 2);
|
||||
}
|
||||
|
||||
if (likely(!is_vmalloc_addr(&sector_le))) {
|
||||
sg_set_buf(s, &sector_le, sizeof(sector_le));
|
||||
} else {
|
||||
struct page *sec_page = vmalloc_to_page(&sector_le);
|
||||
unsigned int sec_off = offset_in_page(&sector_le);
|
||||
sg_set_page(s, sec_page, sizeof(sector_le), sec_off);
|
||||
}
|
||||
nbytes += sizeof(sector_le);
|
||||
s++;
|
||||
|
||||
sg_set_page(s, page, ic->sectors_per_block << SECTOR_SHIFT, offset);
|
||||
nbytes += ic->sectors_per_block << SECTOR_SHIFT;
|
||||
|
||||
ahash_request_set_crypt(req, sg, result, nbytes);
|
||||
|
||||
r = crypto_wait_req(crypto_ahash_digest(req), &wait);
|
||||
if (unlikely(r)) {
|
||||
dm_integrity_io_error(ic, "crypto_ahash_digest", r);
|
||||
goto failed;
|
||||
}
|
||||
|
||||
digest_size = ic->internal_hash_digestsize;
|
||||
if (unlikely(digest_size < ic->tag_size))
|
||||
memset(result + digest_size, 0, ic->tag_size - digest_size);
|
||||
|
||||
return;
|
||||
|
||||
failed:
|
||||
/* this shouldn't happen anyway, the hash functions have no reason to fail */
|
||||
get_random_bytes(result, ic->tag_size);
|
||||
}
|
||||
|
||||
static void integrity_sector_checksum(struct dm_integrity_c *ic, struct ahash_request **ahash_req,
|
||||
sector_t sector, const char *data, unsigned offset, char *result)
|
||||
{
|
||||
if (likely(ic->internal_shash != NULL))
|
||||
integrity_sector_checksum_shash(ic, sector, data, offset, result);
|
||||
else
|
||||
integrity_sector_checksum_ahash(ic, ahash_req, sector, (struct page *)data, offset, result);
|
||||
}
|
||||
|
||||
static void *integrity_kmap(struct dm_integrity_c *ic, struct page *p)
|
||||
{
|
||||
if (likely(ic->internal_shash != NULL))
|
||||
return kmap_local_page(p);
|
||||
else
|
||||
return p;
|
||||
}
|
||||
|
||||
static void integrity_kunmap(struct dm_integrity_c *ic, const void *ptr)
|
||||
{
|
||||
if (likely(ic->internal_shash != NULL))
|
||||
kunmap_local(ptr);
|
||||
}
|
||||
|
||||
static void *integrity_identity(struct dm_integrity_c *ic, void *data)
|
||||
{
|
||||
#ifdef CONFIG_DEBUG_SG
|
||||
BUG_ON(offset_in_page(data));
|
||||
BUG_ON(!virt_addr_valid(data));
|
||||
#endif
|
||||
if (likely(ic->internal_shash != NULL))
|
||||
return data;
|
||||
else
|
||||
return virt_to_page(data);
|
||||
}
|
||||
|
||||
static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checksum)
|
||||
{
|
||||
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
|
||||
@@ -1711,6 +1818,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks
|
||||
sector_t alignment;
|
||||
char *mem;
|
||||
char *buffer = page_to_virt(page);
|
||||
unsigned int buffer_offset;
|
||||
int r;
|
||||
struct dm_io_request io_req;
|
||||
struct dm_io_region io_loc;
|
||||
@@ -1728,7 +1836,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks
|
||||
alignment &= -alignment;
|
||||
io_loc.sector = round_down(io_loc.sector, alignment);
|
||||
io_loc.count += sector - io_loc.sector;
|
||||
buffer += (sector - io_loc.sector) << SECTOR_SHIFT;
|
||||
buffer_offset = (sector - io_loc.sector) << SECTOR_SHIFT;
|
||||
io_loc.count = round_up(io_loc.count, alignment);
|
||||
|
||||
r = dm_io(&io_req, 1, &io_loc, NULL, IOPRIO_DEFAULT);
|
||||
@@ -1737,7 +1845,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks
|
||||
goto free_ret;
|
||||
}
|
||||
|
||||
integrity_sector_checksum(ic, logical_sector, buffer, checksum);
|
||||
integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, integrity_identity(ic, buffer), buffer_offset, checksum);
|
||||
r = dm_integrity_rw_tag(ic, checksum, &dio->metadata_block,
|
||||
&dio->metadata_offset, ic->tag_size, TAG_CMP);
|
||||
if (r) {
|
||||
@@ -1754,7 +1862,7 @@ static noinline void integrity_recheck(struct dm_integrity_io *dio, char *checks
|
||||
}
|
||||
|
||||
mem = bvec_kmap_local(&bv);
|
||||
memcpy(mem + pos, buffer, ic->sectors_per_block << SECTOR_SHIFT);
|
||||
memcpy(mem + pos, buffer + buffer_offset, ic->sectors_per_block << SECTOR_SHIFT);
|
||||
kunmap_local(mem);
|
||||
|
||||
pos += ic->sectors_per_block << SECTOR_SHIFT;
|
||||
@@ -1776,7 +1884,7 @@ static void integrity_metadata(struct work_struct *w)
|
||||
if (ic->internal_hash) {
|
||||
struct bvec_iter iter;
|
||||
struct bio_vec bv;
|
||||
unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash);
|
||||
unsigned int digest_size = ic->internal_hash_digestsize;
|
||||
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
|
||||
char *checksums;
|
||||
unsigned int extra_space = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
|
||||
@@ -1837,17 +1945,17 @@ static void integrity_metadata(struct work_struct *w)
|
||||
char *mem, *checksums_ptr;
|
||||
|
||||
again:
|
||||
mem = bvec_kmap_local(&bv_copy);
|
||||
mem = integrity_kmap(ic, bv_copy.bv_page);
|
||||
pos = 0;
|
||||
checksums_ptr = checksums;
|
||||
do {
|
||||
integrity_sector_checksum(ic, sector, mem + pos, checksums_ptr);
|
||||
integrity_sector_checksum(ic, &dio->ahash_req, sector, mem, bv_copy.bv_offset + pos, checksums_ptr);
|
||||
checksums_ptr += ic->tag_size;
|
||||
sectors_to_process -= ic->sectors_per_block;
|
||||
pos += ic->sectors_per_block << SECTOR_SHIFT;
|
||||
sector += ic->sectors_per_block;
|
||||
} while (pos < bv_copy.bv_len && sectors_to_process && checksums != checksums_onstack);
|
||||
kunmap_local(mem);
|
||||
integrity_kunmap(ic, mem);
|
||||
|
||||
r = dm_integrity_rw_tag(ic, checksums, &dio->metadata_block, &dio->metadata_offset,
|
||||
checksums_ptr - checksums, dio->op == REQ_OP_READ ? TAG_CMP : TAG_WRITE);
|
||||
@@ -1949,6 +2057,7 @@ static int dm_integrity_map(struct dm_target *ti, struct bio *bio)
|
||||
dio->ic = ic;
|
||||
dio->bi_status = 0;
|
||||
dio->op = bio_op(bio);
|
||||
dio->ahash_req = NULL;
|
||||
|
||||
if (ic->mode == 'I') {
|
||||
bio->bi_iter.bi_sector = dm_target_offset(ic->ti, bio->bi_iter.bi_sector);
|
||||
@@ -2071,19 +2180,6 @@ retry_kmap:
|
||||
js++;
|
||||
mem_ptr += 1 << SECTOR_SHIFT;
|
||||
} while (++s < ic->sectors_per_block);
|
||||
#ifdef INTERNAL_VERIFY
|
||||
if (ic->internal_hash) {
|
||||
char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
|
||||
|
||||
integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack);
|
||||
if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) {
|
||||
DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx",
|
||||
logical_sector);
|
||||
dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum",
|
||||
bio, logical_sector, 0);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
if (!ic->internal_hash) {
|
||||
@@ -2124,15 +2220,17 @@ retry_kmap:
|
||||
} while (++s < ic->sectors_per_block);
|
||||
|
||||
if (ic->internal_hash) {
|
||||
unsigned int digest_size = crypto_shash_digestsize(ic->internal_hash);
|
||||
unsigned int digest_size = ic->internal_hash_digestsize;
|
||||
void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js));
|
||||
unsigned js_offset = offset_in_page(js);
|
||||
|
||||
if (unlikely(digest_size > ic->tag_size)) {
|
||||
char checksums_onstack[HASH_MAX_DIGESTSIZE];
|
||||
|
||||
integrity_sector_checksum(ic, logical_sector, (char *)js, checksums_onstack);
|
||||
integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, checksums_onstack);
|
||||
memcpy(journal_entry_tag(ic, je), checksums_onstack, ic->tag_size);
|
||||
} else
|
||||
integrity_sector_checksum(ic, logical_sector, (char *)js, journal_entry_tag(ic, je));
|
||||
integrity_sector_checksum(ic, &dio->ahash_req, logical_sector, js_page, js_offset, journal_entry_tag(ic, je));
|
||||
}
|
||||
|
||||
journal_entry_set_sector(je, logical_sector);
|
||||
@@ -2428,7 +2526,7 @@ retry:
|
||||
if (!dio->integrity_payload) {
|
||||
unsigned digest_size, extra_size;
|
||||
dio->payload_len = ic->tuple_size * (bio_sectors(bio) >> ic->sb->log2_sectors_per_block);
|
||||
digest_size = crypto_shash_digestsize(ic->internal_hash);
|
||||
digest_size = ic->internal_hash_digestsize;
|
||||
extra_size = unlikely(digest_size > ic->tag_size) ? digest_size - ic->tag_size : 0;
|
||||
dio->payload_len += extra_size;
|
||||
dio->integrity_payload = kmalloc(dio->payload_len, GFP_NOIO | __GFP_NORETRY | __GFP_NOMEMALLOC | __GFP_NOWARN);
|
||||
@@ -2505,11 +2603,11 @@ skip_spinlock:
|
||||
unsigned pos = 0;
|
||||
while (dio->bio_details.bi_iter.bi_size) {
|
||||
struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
|
||||
const char *mem = bvec_kmap_local(&bv);
|
||||
const char *mem = integrity_kmap(ic, bv.bv_page);
|
||||
if (ic->tag_size < ic->tuple_size)
|
||||
memset(dio->integrity_payload + pos + ic->tag_size, 0, ic->tuple_size - ic->tuple_size);
|
||||
integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, dio->integrity_payload + pos);
|
||||
kunmap_local(mem);
|
||||
integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, dio->integrity_payload + pos);
|
||||
integrity_kunmap(ic, mem);
|
||||
pos += ic->tuple_size;
|
||||
bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
|
||||
}
|
||||
@@ -2588,8 +2686,8 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
|
||||
}
|
||||
bio_put(outgoing_bio);
|
||||
|
||||
integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest);
|
||||
if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
|
||||
integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, integrity_identity(ic, outgoing_data), 0, digest);
|
||||
if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(ic->internal_hash_digestsize, ic->tag_size)))) {
|
||||
DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx",
|
||||
ic->dev->bdev, dio->bio_details.bi_iter.bi_sector);
|
||||
atomic64_inc(&ic->number_of_mismatches);
|
||||
@@ -2612,33 +2710,58 @@ static void dm_integrity_inline_recheck(struct work_struct *w)
|
||||
bio_endio(bio);
|
||||
}
|
||||
|
||||
static inline bool dm_integrity_check(struct dm_integrity_c *ic, struct dm_integrity_io *dio)
|
||||
{
|
||||
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
|
||||
unsigned pos = 0;
|
||||
|
||||
while (dio->bio_details.bi_iter.bi_size) {
|
||||
char digest[HASH_MAX_DIGESTSIZE];
|
||||
struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
|
||||
char *mem = integrity_kmap(ic, bv.bv_page);
|
||||
integrity_sector_checksum(ic, &dio->ahash_req, dio->bio_details.bi_iter.bi_sector, mem, bv.bv_offset, digest);
|
||||
if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos,
|
||||
min(ic->internal_hash_digestsize, ic->tag_size)))) {
|
||||
integrity_kunmap(ic, mem);
|
||||
dm_integrity_free_payload(dio);
|
||||
INIT_WORK(&dio->work, dm_integrity_inline_recheck);
|
||||
queue_work(ic->offload_wq, &dio->work);
|
||||
return false;
|
||||
}
|
||||
integrity_kunmap(ic, mem);
|
||||
pos += ic->tuple_size;
|
||||
bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static void dm_integrity_inline_async_check(struct work_struct *w)
|
||||
{
|
||||
struct dm_integrity_io *dio = container_of(w, struct dm_integrity_io, work);
|
||||
struct dm_integrity_c *ic = dio->ic;
|
||||
struct bio *bio = dm_bio_from_per_bio_data(dio, sizeof(struct dm_integrity_io));
|
||||
|
||||
if (likely(dm_integrity_check(ic, dio)))
|
||||
bio_endio(bio);
|
||||
}
|
||||
|
||||
static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status_t *status)
|
||||
{
|
||||
struct dm_integrity_c *ic = ti->private;
|
||||
struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
|
||||
if (ic->mode == 'I') {
|
||||
struct dm_integrity_io *dio = dm_per_bio_data(bio, sizeof(struct dm_integrity_io));
|
||||
if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK)) {
|
||||
unsigned pos = 0;
|
||||
if (dio->op == REQ_OP_READ && likely(*status == BLK_STS_OK) && likely(dio->bio_details.bi_iter.bi_size != 0)) {
|
||||
if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING) &&
|
||||
unlikely(dio->integrity_range_locked))
|
||||
goto skip_check;
|
||||
while (dio->bio_details.bi_iter.bi_size) {
|
||||
char digest[HASH_MAX_DIGESTSIZE];
|
||||
struct bio_vec bv = bio_iter_iovec(bio, dio->bio_details.bi_iter);
|
||||
char *mem = bvec_kmap_local(&bv);
|
||||
//memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT);
|
||||
integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest);
|
||||
if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos,
|
||||
min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) {
|
||||
kunmap_local(mem);
|
||||
dm_integrity_free_payload(dio);
|
||||
INIT_WORK(&dio->work, dm_integrity_inline_recheck);
|
||||
queue_work(ic->offload_wq, &dio->work);
|
||||
goto skip_check;
|
||||
if (likely(ic->internal_shash != NULL)) {
|
||||
if (unlikely(!dm_integrity_check(ic, dio)))
|
||||
return DM_ENDIO_INCOMPLETE;
|
||||
}
|
||||
kunmap_local(mem);
|
||||
pos += ic->tuple_size;
|
||||
bio_advance_iter_single(bio, &dio->bio_details.bi_iter, ic->sectors_per_block << SECTOR_SHIFT);
|
||||
} else {
|
||||
INIT_WORK(&dio->work, dm_integrity_inline_async_check);
|
||||
queue_work(ic->offload_wq, &dio->work);
|
||||
return DM_ENDIO_INCOMPLETE;
|
||||
}
|
||||
}
|
||||
skip_check:
|
||||
@@ -2646,6 +2769,8 @@ skip_check:
|
||||
if (unlikely(dio->integrity_range_locked))
|
||||
remove_range(ic, &dio->range);
|
||||
}
|
||||
if (unlikely(dio->ahash_req))
|
||||
mempool_free(dio->ahash_req, &ic->ahash_req_pool);
|
||||
return DM_ENDIO_DONE;
|
||||
}
|
||||
|
||||
@@ -2902,9 +3027,12 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start
|
||||
#endif
|
||||
ic->internal_hash) {
|
||||
char test_tag[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)];
|
||||
struct journal_sector *js = access_journal_data(ic, i, l);
|
||||
void *js_page = integrity_identity(ic, (char *)js - offset_in_page(js));
|
||||
unsigned js_offset = offset_in_page(js);
|
||||
|
||||
integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block),
|
||||
(char *)access_journal_data(ic, i, l), test_tag);
|
||||
integrity_sector_checksum(ic, &ic->journal_ahash_req, sec + ((l - j) << ic->sb->log2_sectors_per_block),
|
||||
js_page, js_offset, test_tag);
|
||||
if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) {
|
||||
dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ);
|
||||
dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0);
|
||||
@@ -2987,6 +3115,7 @@ static void integrity_recalc(struct work_struct *w)
|
||||
size_t recalc_tags_size;
|
||||
u8 *recalc_buffer = NULL;
|
||||
u8 *recalc_tags = NULL;
|
||||
struct ahash_request *ahash_req = NULL;
|
||||
struct dm_integrity_range range;
|
||||
struct dm_io_request io_req;
|
||||
struct dm_io_region io_loc;
|
||||
@@ -3001,7 +3130,7 @@ static void integrity_recalc(struct work_struct *w)
|
||||
unsigned recalc_sectors = RECALC_SECTORS;
|
||||
|
||||
retry:
|
||||
recalc_buffer = __vmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO);
|
||||
recalc_buffer = kmalloc(recalc_sectors << SECTOR_SHIFT, GFP_NOIO | __GFP_NOWARN);
|
||||
if (!recalc_buffer) {
|
||||
oom:
|
||||
recalc_sectors >>= 1;
|
||||
@@ -3011,11 +3140,11 @@ oom:
|
||||
goto free_ret;
|
||||
}
|
||||
recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tag_size;
|
||||
if (crypto_shash_digestsize(ic->internal_hash) > ic->tag_size)
|
||||
recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tag_size;
|
||||
if (ic->internal_hash_digestsize > ic->tag_size)
|
||||
recalc_tags_size += ic->internal_hash_digestsize - ic->tag_size;
|
||||
recalc_tags = kvmalloc(recalc_tags_size, GFP_NOIO);
|
||||
if (!recalc_tags) {
|
||||
vfree(recalc_buffer);
|
||||
kfree(recalc_buffer);
|
||||
recalc_buffer = NULL;
|
||||
goto oom;
|
||||
}
|
||||
@@ -3081,7 +3210,7 @@ next_chunk:
|
||||
goto err;
|
||||
|
||||
io_req.bi_opf = REQ_OP_READ;
|
||||
io_req.mem.type = DM_IO_VMA;
|
||||
io_req.mem.type = DM_IO_KMEM;
|
||||
io_req.mem.ptr.addr = recalc_buffer;
|
||||
io_req.notify.fn = NULL;
|
||||
io_req.client = ic->io;
|
||||
@@ -3097,7 +3226,10 @@ next_chunk:
|
||||
|
||||
t = recalc_tags;
|
||||
for (i = 0; i < n_sectors; i += ic->sectors_per_block) {
|
||||
integrity_sector_checksum(ic, logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t);
|
||||
void *ptr = recalc_buffer + (i << SECTOR_SHIFT);
|
||||
void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr));
|
||||
unsigned ptr_offset = offset_in_page(ptr);
|
||||
integrity_sector_checksum(ic, &ahash_req, logical_sector + i, ptr_page, ptr_offset, t);
|
||||
t += ic->tag_size;
|
||||
}
|
||||
|
||||
@@ -3139,8 +3271,9 @@ unlock_ret:
|
||||
recalc_write_super(ic);
|
||||
|
||||
free_ret:
|
||||
vfree(recalc_buffer);
|
||||
kfree(recalc_buffer);
|
||||
kvfree(recalc_tags);
|
||||
mempool_free(ahash_req, &ic->ahash_req_pool);
|
||||
}
|
||||
|
||||
static void integrity_recalc_inline(struct work_struct *w)
|
||||
@@ -3149,6 +3282,7 @@ static void integrity_recalc_inline(struct work_struct *w)
|
||||
size_t recalc_tags_size;
|
||||
u8 *recalc_buffer = NULL;
|
||||
u8 *recalc_tags = NULL;
|
||||
struct ahash_request *ahash_req = NULL;
|
||||
struct dm_integrity_range range;
|
||||
struct bio *bio;
|
||||
struct bio_integrity_payload *bip;
|
||||
@@ -3171,8 +3305,8 @@ oom:
|
||||
}
|
||||
|
||||
recalc_tags_size = (recalc_sectors >> ic->sb->log2_sectors_per_block) * ic->tuple_size;
|
||||
if (crypto_shash_digestsize(ic->internal_hash) > ic->tuple_size)
|
||||
recalc_tags_size += crypto_shash_digestsize(ic->internal_hash) - ic->tuple_size;
|
||||
if (ic->internal_hash_digestsize > ic->tuple_size)
|
||||
recalc_tags_size += ic->internal_hash_digestsize - ic->tuple_size;
|
||||
recalc_tags = kmalloc(recalc_tags_size, GFP_NOIO | __GFP_NOWARN);
|
||||
if (!recalc_tags) {
|
||||
kfree(recalc_buffer);
|
||||
@@ -3217,8 +3351,11 @@ next_chunk:
|
||||
|
||||
t = recalc_tags;
|
||||
for (i = 0; i < range.n_sectors; i += ic->sectors_per_block) {
|
||||
void *ptr = recalc_buffer + (i << SECTOR_SHIFT);
|
||||
void *ptr_page = integrity_identity(ic, (char *)ptr - offset_in_page(ptr));
|
||||
unsigned ptr_offset = offset_in_page(ptr);
|
||||
memset(t, 0, ic->tuple_size);
|
||||
integrity_sector_checksum(ic, range.logical_sector + i, recalc_buffer + (i << SECTOR_SHIFT), t);
|
||||
integrity_sector_checksum(ic, &ahash_req, range.logical_sector + i, ptr_page, ptr_offset, t);
|
||||
t += ic->tuple_size;
|
||||
}
|
||||
|
||||
@@ -3270,6 +3407,7 @@ unlock_ret:
|
||||
free_ret:
|
||||
kfree(recalc_buffer);
|
||||
kfree(recalc_tags);
|
||||
mempool_free(ahash_req, &ic->ahash_req_pool);
|
||||
}
|
||||
|
||||
static void bitmap_block_work(struct work_struct *w)
|
||||
@@ -4210,30 +4348,53 @@ nomem:
|
||||
return -ENOMEM;
|
||||
}
|
||||
|
||||
static int get_mac(struct crypto_shash **hash, struct alg_spec *a, char **error,
|
||||
char *error_alg, char *error_key)
|
||||
static int get_mac(struct crypto_shash **shash, struct crypto_ahash **ahash,
|
||||
struct alg_spec *a, char **error, char *error_alg, char *error_key)
|
||||
{
|
||||
int r;
|
||||
|
||||
if (a->alg_string) {
|
||||
*hash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
|
||||
if (IS_ERR(*hash)) {
|
||||
*error = error_alg;
|
||||
r = PTR_ERR(*hash);
|
||||
*hash = NULL;
|
||||
return r;
|
||||
}
|
||||
|
||||
if (a->key) {
|
||||
r = crypto_shash_setkey(*hash, a->key, a->key_size);
|
||||
if (r) {
|
||||
if (shash) {
|
||||
*shash = crypto_alloc_shash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
|
||||
if (IS_ERR(*shash)) {
|
||||
*shash = NULL;
|
||||
goto try_ahash;
|
||||
}
|
||||
if (a->key) {
|
||||
r = crypto_shash_setkey(*shash, a->key, a->key_size);
|
||||
if (r) {
|
||||
*error = error_key;
|
||||
return r;
|
||||
}
|
||||
} else if (crypto_shash_get_flags(*shash) & CRYPTO_TFM_NEED_KEY) {
|
||||
*error = error_key;
|
||||
return -ENOKEY;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
try_ahash:
|
||||
if (ahash) {
|
||||
*ahash = crypto_alloc_ahash(a->alg_string, 0, CRYPTO_ALG_ALLOCATES_MEMORY);
|
||||
if (IS_ERR(*ahash)) {
|
||||
*error = error_alg;
|
||||
r = PTR_ERR(*ahash);
|
||||
*ahash = NULL;
|
||||
return r;
|
||||
}
|
||||
} else if (crypto_shash_get_flags(*hash) & CRYPTO_TFM_NEED_KEY) {
|
||||
*error = error_key;
|
||||
return -ENOKEY;
|
||||
if (a->key) {
|
||||
r = crypto_ahash_setkey(*ahash, a->key, a->key_size);
|
||||
if (r) {
|
||||
*error = error_key;
|
||||
return r;
|
||||
}
|
||||
} else if (crypto_ahash_get_flags(*ahash) & CRYPTO_TFM_NEED_KEY) {
|
||||
*error = error_key;
|
||||
return -ENOKEY;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
*error = error_alg;
|
||||
return -ENOENT;
|
||||
}
|
||||
|
||||
return 0;
|
||||
@@ -4690,12 +4851,26 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
|
||||
buffer_sectors = 1;
|
||||
ic->log2_buffer_sectors = min((int)__fls(buffer_sectors), 31 - SECTOR_SHIFT);
|
||||
|
||||
r = get_mac(&ic->internal_hash, &ic->internal_hash_alg, &ti->error,
|
||||
r = get_mac(&ic->internal_shash, &ic->internal_ahash, &ic->internal_hash_alg, &ti->error,
|
||||
"Invalid internal hash", "Error setting internal hash key");
|
||||
if (r)
|
||||
goto bad;
|
||||
if (ic->internal_shash) {
|
||||
ic->internal_hash = true;
|
||||
ic->internal_hash_digestsize = crypto_shash_digestsize(ic->internal_shash);
|
||||
}
|
||||
if (ic->internal_ahash) {
|
||||
ic->internal_hash = true;
|
||||
ic->internal_hash_digestsize = crypto_ahash_digestsize(ic->internal_ahash);
|
||||
r = mempool_init_kmalloc_pool(&ic->ahash_req_pool, AHASH_MEMPOOL,
|
||||
sizeof(struct ahash_request) + crypto_ahash_reqsize(ic->internal_ahash));
|
||||
if (r) {
|
||||
ti->error = "Cannot allocate mempool";
|
||||
goto bad;
|
||||
}
|
||||
}
|
||||
|
||||
r = get_mac(&ic->journal_mac, &ic->journal_mac_alg, &ti->error,
|
||||
r = get_mac(&ic->journal_mac, NULL, &ic->journal_mac_alg, &ti->error,
|
||||
"Invalid journal mac", "Error setting journal mac key");
|
||||
if (r)
|
||||
goto bad;
|
||||
@@ -4706,7 +4881,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv
|
||||
r = -EINVAL;
|
||||
goto bad;
|
||||
}
|
||||
ic->tag_size = crypto_shash_digestsize(ic->internal_hash);
|
||||
ic->tag_size = ic->internal_hash_digestsize;
|
||||
}
|
||||
if (ic->tag_size > MAX_TAG_SIZE) {
|
||||
ti->error = "Too big tag size";
|
||||
@@ -5178,6 +5353,8 @@ static void dm_integrity_dtr(struct dm_target *ti)
|
||||
kvfree(ic->bbs);
|
||||
if (ic->bufio)
|
||||
dm_bufio_client_destroy(ic->bufio);
|
||||
mempool_free(ic->journal_ahash_req, &ic->ahash_req_pool);
|
||||
mempool_exit(&ic->ahash_req_pool);
|
||||
bioset_exit(&ic->recalc_bios);
|
||||
bioset_exit(&ic->recheck_bios);
|
||||
mempool_exit(&ic->recheck_pool);
|
||||
@@ -5215,8 +5392,10 @@ static void dm_integrity_dtr(struct dm_target *ti)
|
||||
if (ic->sb)
|
||||
free_pages_exact(ic->sb, SB_SECTORS << SECTOR_SHIFT);
|
||||
|
||||
if (ic->internal_hash)
|
||||
crypto_free_shash(ic->internal_hash);
|
||||
if (ic->internal_shash)
|
||||
crypto_free_shash(ic->internal_shash);
|
||||
if (ic->internal_ahash)
|
||||
crypto_free_ahash(ic->internal_ahash);
|
||||
free_alg(&ic->internal_hash_alg);
|
||||
|
||||
if (ic->journal_crypt)
|
||||
@@ -5233,7 +5412,7 @@ static void dm_integrity_dtr(struct dm_target *ti)
|
||||
|
||||
static struct target_type integrity_target = {
|
||||
.name = "integrity",
|
||||
.version = {1, 13, 0},
|
||||
.version = {1, 14, 0},
|
||||
.module = THIS_MODULE,
|
||||
.features = DM_TARGET_SINGLETON | DM_TARGET_INTEGRITY,
|
||||
.ctr = dm_integrity_ctr,
|
||||
|
||||
@@ -414,7 +414,7 @@ static int log_super(struct log_writes_c *lc)
|
||||
}
|
||||
|
||||
/*
|
||||
* Super sector should be writen in-order, otherwise the
|
||||
* Super sector should be written in-order, otherwise the
|
||||
* nr_entries could be rewritten incorrectly by an old bio.
|
||||
*/
|
||||
wait_for_completion_io(&lc->super_done);
|
||||
|
||||
drivers/md/dm-pcache/Kconfig (new file, 17 lines)
@@ -0,0 +1,17 @@
|
||||
config DM_PCACHE
|
||||
tristate "Persistent cache for Block Device (Experimental)"
|
||||
depends on BLK_DEV_DM
|
||||
depends on DEV_DAX
|
||||
help
|
||||
PCACHE provides a mechanism to use persistent memory (e.g., CXL persistent memory,
|
||||
DAX-enabled devices) as a high-performance cache layer in front of
|
||||
traditional block devices such as SSDs or HDDs.
|
||||
|
||||
PCACHE is implemented as a kernel module that integrates with the block
|
||||
layer and supports direct access (DAX) to persistent memory for low-latency,
|
||||
byte-addressable caching.
|
||||
|
||||
Note: This feature is experimental and should be tested thoroughly
|
||||
before use in production environments.
|
||||
|
||||
If unsure, say 'N'.
|
||||
drivers/md/dm-pcache/Makefile (new file, 3 lines)
@@ -0,0 +1,3 @@
|
||||
dm-pcache-y := dm_pcache.o cache_dev.o segment.o backing_dev.o cache.o cache_gc.o cache_writeback.o cache_segment.o cache_key.o cache_req.o
|
||||
|
||||
obj-m += dm-pcache.o
|
||||
drivers/md/dm-pcache/backing_dev.c (new file, 374 lines)
@@ -0,0 +1,374 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#include <linux/blkdev.h>
|
||||
|
||||
#include "../dm-core.h"
|
||||
#include "pcache_internal.h"
|
||||
#include "cache_dev.h"
|
||||
#include "backing_dev.h"
|
||||
#include "cache.h"
|
||||
#include "dm_pcache.h"
|
||||
|
||||
static struct kmem_cache *backing_req_cache;
|
||||
static struct kmem_cache *backing_bvec_cache;
|
||||
|
||||
static void backing_dev_exit(struct pcache_backing_dev *backing_dev)
|
||||
{
|
||||
mempool_exit(&backing_dev->req_pool);
|
||||
mempool_exit(&backing_dev->bvec_pool);
|
||||
}
|
||||
|
||||
static void req_submit_fn(struct work_struct *work);
|
||||
static void req_complete_fn(struct work_struct *work);
|
||||
static int backing_dev_init(struct dm_pcache *pcache)
|
||||
{
|
||||
struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
|
||||
int ret;
|
||||
|
||||
ret = mempool_init_slab_pool(&backing_dev->req_pool, 128, backing_req_cache);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = mempool_init_slab_pool(&backing_dev->bvec_pool, 128, backing_bvec_cache);
|
||||
if (ret)
|
||||
goto req_pool_exit;
|
||||
|
||||
INIT_LIST_HEAD(&backing_dev->submit_list);
|
||||
INIT_LIST_HEAD(&backing_dev->complete_list);
|
||||
spin_lock_init(&backing_dev->submit_lock);
|
||||
spin_lock_init(&backing_dev->complete_lock);
|
||||
INIT_WORK(&backing_dev->req_submit_work, req_submit_fn);
|
||||
INIT_WORK(&backing_dev->req_complete_work, req_complete_fn);
|
||||
atomic_set(&backing_dev->inflight_reqs, 0);
|
||||
init_waitqueue_head(&backing_dev->inflight_wq);
|
||||
|
||||
return 0;
|
||||
|
||||
req_pool_exit:
|
||||
mempool_exit(&backing_dev->req_pool);
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int backing_dev_start(struct dm_pcache *pcache)
|
||||
{
|
||||
struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
|
||||
int ret;
|
||||
|
||||
ret = backing_dev_init(pcache);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
backing_dev->dev_size = bdev_nr_sectors(backing_dev->dm_dev->bdev);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void backing_dev_stop(struct dm_pcache *pcache)
|
||||
{
|
||||
struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
|
||||
|
||||
/*
|
||||
* There should not be any new request comming, just wait
|
||||
* inflight requests done.
|
||||
*/
|
||||
wait_event(backing_dev->inflight_wq,
|
||||
atomic_read(&backing_dev->inflight_reqs) == 0);
|
||||
|
||||
flush_work(&backing_dev->req_submit_work);
|
||||
flush_work(&backing_dev->req_complete_work);
|
||||
|
||||
backing_dev_exit(backing_dev);
|
||||
}
|
||||
|
||||
/* pcache_backing_dev_req functions */
|
||||
void backing_dev_req_end(struct pcache_backing_dev_req *backing_req)
|
||||
{
|
||||
struct pcache_backing_dev *backing_dev = backing_req->backing_dev;
|
||||
|
||||
if (backing_req->end_req)
|
||||
backing_req->end_req(backing_req, backing_req->ret);
|
||||
|
||||
switch (backing_req->type) {
|
||||
case BACKING_DEV_REQ_TYPE_REQ:
|
||||
if (backing_req->req.upper_req)
|
||||
pcache_req_put(backing_req->req.upper_req, backing_req->ret);
|
||||
break;
|
||||
case BACKING_DEV_REQ_TYPE_KMEM:
|
||||
if (backing_req->kmem.bvecs != backing_req->kmem.inline_bvecs)
|
||||
mempool_free(backing_req->kmem.bvecs, &backing_dev->bvec_pool);
|
||||
break;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
|
||||
mempool_free(backing_req, &backing_dev->req_pool);
|
||||
|
||||
if (atomic_dec_and_test(&backing_dev->inflight_reqs))
|
||||
wake_up(&backing_dev->inflight_wq);
|
||||
}
|
||||
|
||||
static void req_complete_fn(struct work_struct *work)
|
||||
{
|
||||
struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_complete_work);
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
LIST_HEAD(tmp_list);
|
||||
|
||||
spin_lock_irq(&backing_dev->complete_lock);
|
||||
list_splice_init(&backing_dev->complete_list, &tmp_list);
|
||||
spin_unlock_irq(&backing_dev->complete_lock);
|
||||
|
||||
while (!list_empty(&tmp_list)) {
|
||||
backing_req = list_first_entry(&tmp_list,
|
||||
struct pcache_backing_dev_req, node);
|
||||
list_del_init(&backing_req->node);
|
||||
backing_dev_req_end(backing_req);
|
||||
}
|
||||
}
|
||||
|
||||
static void backing_dev_bio_end(struct bio *bio)
|
||||
{
|
||||
struct pcache_backing_dev_req *backing_req = bio->bi_private;
|
||||
struct pcache_backing_dev *backing_dev = backing_req->backing_dev;
|
||||
unsigned long flags;
|
||||
|
||||
backing_req->ret = blk_status_to_errno(bio->bi_status);
|
||||
|
||||
spin_lock_irqsave(&backing_dev->complete_lock, flags);
|
||||
list_move_tail(&backing_req->node, &backing_dev->complete_list);
|
||||
queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_complete_work);
|
||||
spin_unlock_irqrestore(&backing_dev->complete_lock, flags);
|
||||
}
|
||||
|
||||
static void req_submit_fn(struct work_struct *work)
|
||||
{
|
||||
struct pcache_backing_dev *backing_dev = container_of(work, struct pcache_backing_dev, req_submit_work);
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
LIST_HEAD(tmp_list);
|
||||
|
||||
spin_lock(&backing_dev->submit_lock);
|
||||
list_splice_init(&backing_dev->submit_list, &tmp_list);
|
||||
spin_unlock(&backing_dev->submit_lock);
|
||||
|
||||
while (!list_empty(&tmp_list)) {
|
||||
backing_req = list_first_entry(&tmp_list,
|
||||
struct pcache_backing_dev_req, node);
|
||||
list_del_init(&backing_req->node);
|
||||
submit_bio_noacct(&backing_req->bio);
|
||||
}
|
||||
}
|
||||
|
||||
void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct)
|
||||
{
|
||||
struct pcache_backing_dev *backing_dev = backing_req->backing_dev;
|
||||
|
||||
if (direct) {
|
||||
submit_bio_noacct(&backing_req->bio);
|
||||
return;
|
||||
}
|
||||
|
||||
spin_lock(&backing_dev->submit_lock);
|
||||
list_add_tail(&backing_req->node, &backing_dev->submit_list);
|
||||
queue_work(BACKING_DEV_TO_PCACHE(backing_dev)->task_wq, &backing_dev->req_submit_work);
|
||||
spin_unlock(&backing_dev->submit_lock);
|
||||
}
|
||||
|
||||
static void bio_map(struct bio *bio, void *base, size_t size)
|
||||
{
|
||||
struct page *page;
|
||||
unsigned int offset;
|
||||
unsigned int len;
|
||||
|
||||
if (!is_vmalloc_addr(base)) {
|
||||
page = virt_to_page(base);
|
||||
offset = offset_in_page(base);
|
||||
|
||||
BUG_ON(!bio_add_page(bio, page, size, offset));
|
||||
return;
|
||||
}
|
||||
|
||||
flush_kernel_vmap_range(base, size);
|
||||
while (size) {
|
||||
page = vmalloc_to_page(base);
|
||||
offset = offset_in_page(base);
|
||||
len = min_t(size_t, PAGE_SIZE - offset, size);
|
||||
|
||||
BUG_ON(!bio_add_page(bio, page, len, offset));
|
||||
size -= len;
|
||||
base += len;
|
||||
}
|
||||
}
|
||||
|
||||
static struct pcache_backing_dev_req *req_type_req_alloc(struct pcache_backing_dev *backing_dev,
|
||||
struct pcache_backing_dev_req_opts *opts)
|
||||
{
|
||||
struct pcache_request *pcache_req = opts->req.upper_req;
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
struct bio *orig = pcache_req->bio;
|
||||
|
||||
backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask);
|
||||
if (!backing_req)
|
||||
return NULL;
|
||||
|
||||
memset(backing_req, 0, sizeof(struct pcache_backing_dev_req));
|
||||
|
||||
bio_init_clone(backing_dev->dm_dev->bdev, &backing_req->bio, orig, opts->gfp_mask);
|
||||
|
||||
backing_req->type = BACKING_DEV_REQ_TYPE_REQ;
|
||||
backing_req->backing_dev = backing_dev;
|
||||
atomic_inc(&backing_dev->inflight_reqs);
|
||||
|
||||
return backing_req;
|
||||
}
|
||||
|
||||
static struct pcache_backing_dev_req *kmem_type_req_alloc(struct pcache_backing_dev *backing_dev,
|
||||
struct pcache_backing_dev_req_opts *opts)
|
||||
{
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
u32 n_vecs = bio_add_max_vecs(opts->kmem.data, opts->kmem.len);
|
||||
|
||||
backing_req = mempool_alloc(&backing_dev->req_pool, opts->gfp_mask);
|
||||
if (!backing_req)
|
||||
return NULL;
|
||||
|
||||
memset(backing_req, 0, sizeof(struct pcache_backing_dev_req));
|
||||
|
||||
if (n_vecs > BACKING_DEV_REQ_INLINE_BVECS) {
|
||||
backing_req->kmem.bvecs = mempool_alloc(&backing_dev->bvec_pool, opts->gfp_mask);
|
||||
if (!backing_req->kmem.bvecs)
|
||||
goto free_backing_req;
|
||||
} else {
|
||||
backing_req->kmem.bvecs = backing_req->kmem.inline_bvecs;
|
||||
}
|
||||
|
||||
backing_req->kmem.n_vecs = n_vecs;
|
||||
backing_req->type = BACKING_DEV_REQ_TYPE_KMEM;
|
||||
backing_req->backing_dev = backing_dev;
|
||||
atomic_inc(&backing_dev->inflight_reqs);
|
||||
|
||||
return backing_req;
|
||||
|
||||
free_backing_req:
|
||||
mempool_free(backing_req, &backing_dev->req_pool);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev,
|
||||
struct pcache_backing_dev_req_opts *opts)
|
||||
{
|
||||
if (opts->type == BACKING_DEV_REQ_TYPE_REQ)
|
||||
return req_type_req_alloc(backing_dev, opts);
|
||||
|
||||
if (opts->type == BACKING_DEV_REQ_TYPE_KMEM)
|
||||
return kmem_type_req_alloc(backing_dev, opts);
|
||||
|
||||
BUG();
|
||||
}
|
||||
|
||||
static void req_type_req_init(struct pcache_backing_dev_req *backing_req,
|
||||
struct pcache_backing_dev_req_opts *opts)
|
||||
{
|
||||
struct pcache_request *pcache_req = opts->req.upper_req;
|
||||
struct bio *clone;
|
||||
u32 off = opts->req.req_off;
|
||||
u32 len = opts->req.len;
|
||||
|
||||
clone = &backing_req->bio;
|
||||
BUG_ON(off & SECTOR_MASK);
|
||||
BUG_ON(len & SECTOR_MASK);
|
||||
bio_trim(clone, off >> SECTOR_SHIFT, len >> SECTOR_SHIFT);
|
||||
|
||||
clone->bi_iter.bi_sector = (pcache_req->off + off) >> SECTOR_SHIFT;
|
||||
clone->bi_private = backing_req;
|
||||
clone->bi_end_io = backing_dev_bio_end;
|
||||
|
||||
INIT_LIST_HEAD(&backing_req->node);
|
||||
backing_req->end_req = opts->end_fn;
|
||||
|
||||
pcache_req_get(pcache_req);
|
||||
backing_req->req.upper_req = pcache_req;
|
||||
backing_req->req.bio_off = off;
|
||||
}
|
||||
|
||||
static void kmem_type_req_init(struct pcache_backing_dev_req *backing_req,
|
||||
struct pcache_backing_dev_req_opts *opts)
|
||||
{
|
||||
struct pcache_backing_dev *backing_dev = backing_req->backing_dev;
|
||||
struct bio *backing_bio;
|
||||
|
||||
bio_init(&backing_req->bio, backing_dev->dm_dev->bdev, backing_req->kmem.bvecs,
|
||||
backing_req->kmem.n_vecs, opts->kmem.opf);
|
||||
|
||||
backing_bio = &backing_req->bio;
|
||||
bio_map(backing_bio, opts->kmem.data, opts->kmem.len);
|
||||
|
||||
backing_bio->bi_iter.bi_sector = (opts->kmem.backing_off) >> SECTOR_SHIFT;
|
||||
backing_bio->bi_private = backing_req;
|
||||
backing_bio->bi_end_io = backing_dev_bio_end;
|
||||
|
||||
INIT_LIST_HEAD(&backing_req->node);
|
||||
backing_req->end_req = opts->end_fn;
|
||||
backing_req->priv_data = opts->priv_data;
|
||||
}
|
||||
|
||||
void backing_dev_req_init(struct pcache_backing_dev_req *backing_req,
|
||||
struct pcache_backing_dev_req_opts *opts)
|
||||
{
|
||||
if (opts->type == BACKING_DEV_REQ_TYPE_REQ)
|
||||
return req_type_req_init(backing_req, opts);
|
||||
|
||||
if (opts->type == BACKING_DEV_REQ_TYPE_KMEM)
|
||||
return kmem_type_req_init(backing_req, opts);
|
||||
|
||||
BUG();
|
||||
}
|
||||
|
||||
struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev,
|
||||
struct pcache_backing_dev_req_opts *opts)
|
||||
{
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
|
||||
backing_req = backing_dev_req_alloc(backing_dev, opts);
|
||||
if (!backing_req)
|
||||
return NULL;
|
||||
|
||||
backing_dev_req_init(backing_req, opts);
|
||||
|
||||
return backing_req;
|
||||
}
|
||||
|
||||
void backing_dev_flush(struct pcache_backing_dev *backing_dev)
|
||||
{
|
||||
blkdev_issue_flush(backing_dev->dm_dev->bdev);
|
||||
}
|
||||
|
||||
int pcache_backing_init(void)
|
||||
{
|
||||
u32 max_bvecs = (PCACHE_CACHE_SUBTREE_SIZE >> PAGE_SHIFT) + 1;
|
||||
int ret;
|
||||
|
||||
backing_req_cache = KMEM_CACHE(pcache_backing_dev_req, 0);
|
||||
if (!backing_req_cache) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
|
||||
backing_bvec_cache = kmem_cache_create("pcache-bvec-slab",
|
||||
max_bvecs * sizeof(struct bio_vec),
|
||||
0, 0, NULL);
|
||||
if (!backing_bvec_cache) {
|
||||
ret = -ENOMEM;
|
||||
goto destroy_req_cache;
|
||||
}
|
||||
|
||||
return 0;
|
||||
destroy_req_cache:
|
||||
kmem_cache_destroy(backing_req_cache);
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
void pcache_backing_exit(void)
|
||||
{
|
||||
kmem_cache_destroy(backing_bvec_cache);
|
||||
kmem_cache_destroy(backing_req_cache);
|
||||
}

drivers/md/dm-pcache/backing_dev.h (new file, 127 lines)
@@ -0,0 +1,127 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#ifndef _BACKING_DEV_H
|
||||
#define _BACKING_DEV_H
|
||||
|
||||
#include <linux/device-mapper.h>
|
||||
|
||||
#include "pcache_internal.h"
|
||||
|
||||
struct pcache_backing_dev_req;
|
||||
typedef void (*backing_req_end_fn_t)(struct pcache_backing_dev_req *backing_req, int ret);
|
||||
|
||||
#define BACKING_DEV_REQ_TYPE_REQ 1
|
||||
#define BACKING_DEV_REQ_TYPE_KMEM 2
|
||||
|
||||
#define BACKING_DEV_REQ_INLINE_BVECS 4
|
||||
|
||||
struct pcache_request;
|
||||
struct pcache_backing_dev_req {
|
||||
u8 type;
|
||||
struct bio bio;
|
||||
struct pcache_backing_dev *backing_dev;
|
||||
|
||||
void *priv_data;
|
||||
backing_req_end_fn_t end_req;
|
||||
|
||||
struct list_head node;
|
||||
int ret;
|
||||
|
||||
union {
|
||||
struct {
|
||||
struct pcache_request *upper_req;
|
||||
u32 bio_off;
|
||||
} req;
|
||||
struct {
|
||||
struct bio_vec inline_bvecs[BACKING_DEV_REQ_INLINE_BVECS];
|
||||
struct bio_vec *bvecs;
|
||||
u32 n_vecs;
|
||||
} kmem;
|
||||
};
|
||||
};
|
||||
|
||||
struct pcache_backing_dev {
|
||||
struct pcache_cache *cache;
|
||||
|
||||
struct dm_dev *dm_dev;
|
||||
mempool_t req_pool;
|
||||
mempool_t bvec_pool;
|
||||
|
||||
struct list_head submit_list;
|
||||
spinlock_t submit_lock;
|
||||
struct work_struct req_submit_work;
|
||||
|
||||
struct list_head complete_list;
|
||||
spinlock_t complete_lock;
|
||||
struct work_struct req_complete_work;
|
||||
|
||||
atomic_t inflight_reqs;
|
||||
wait_queue_head_t inflight_wq;
|
||||
|
||||
u64 dev_size;
|
||||
};
|
||||
|
||||
struct dm_pcache;
|
||||
int backing_dev_start(struct dm_pcache *pcache);
|
||||
void backing_dev_stop(struct dm_pcache *pcache);
|
||||
|
||||
struct pcache_backing_dev_req_opts {
|
||||
u32 type;
|
||||
union {
|
||||
struct {
|
||||
struct pcache_request *upper_req;
|
||||
u32 req_off;
|
||||
u32 len;
|
||||
} req;
|
||||
struct {
|
||||
void *data;
|
||||
blk_opf_t opf;
|
||||
u32 len;
|
||||
u64 backing_off;
|
||||
} kmem;
|
||||
};
|
||||
|
||||
gfp_t gfp_mask;
|
||||
backing_req_end_fn_t end_fn;
|
||||
void *priv_data;
|
||||
};
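/*
 * Illustrative sketch (editorial, not part of this patch): a KMEM-type
 * backing request could be built and queued roughly as follows, using
 * only the interfaces declared in this header.  "backing_dev", "buf",
 * "len", "off" and "my_end_fn" are hypothetical caller-side names.
 *
 *	struct pcache_backing_dev_req_opts opts = {
 *		.type		= BACKING_DEV_REQ_TYPE_KMEM,
 *		.kmem		= {
 *			.data		= buf,
 *			.opf		= REQ_OP_WRITE,
 *			.len		= len,
 *			.backing_off	= off,
 *		},
 *		.gfp_mask	= GFP_NOIO,
 *		.end_fn		= my_end_fn,
 *	};
 *	struct pcache_backing_dev_req *req;
 *
 *	req = backing_dev_req_create(backing_dev, &opts);
 *	if (req)
 *		backing_dev_req_submit(req, false);
 *
 * Passing direct=false defers the actual submit_bio_noacct() to the
 * backing_dev's req_submit_work.
 */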
|
||||
|
||||
static inline u32 backing_dev_req_coalesced_max_len(const void *data, u32 len)
|
||||
{
|
||||
const void *p = data;
|
||||
u32 done = 0, in_page, to_advance;
|
||||
struct page *first_page, *next_page;
|
||||
|
||||
if (!is_vmalloc_addr(data))
|
||||
return len;
|
||||
|
||||
first_page = vmalloc_to_page(p);
|
||||
advance:
|
||||
in_page = PAGE_SIZE - offset_in_page(p);
|
||||
to_advance = min_t(u32, in_page, len - done);
|
||||
|
||||
done += to_advance;
|
||||
p += to_advance;
|
||||
|
||||
if (done == len)
|
||||
return done;
|
||||
|
||||
next_page = vmalloc_to_page(p);
|
||||
if (zone_device_pages_have_same_pgmap(first_page, next_page))
|
||||
goto advance;
|
||||
|
||||
return done;
|
||||
}
|
||||
|
||||
void backing_dev_req_submit(struct pcache_backing_dev_req *backing_req, bool direct);
|
||||
void backing_dev_req_end(struct pcache_backing_dev_req *backing_req);
|
||||
struct pcache_backing_dev_req *backing_dev_req_create(struct pcache_backing_dev *backing_dev,
|
||||
struct pcache_backing_dev_req_opts *opts);
|
||||
struct pcache_backing_dev_req *backing_dev_req_alloc(struct pcache_backing_dev *backing_dev,
|
||||
struct pcache_backing_dev_req_opts *opts);
|
||||
void backing_dev_req_init(struct pcache_backing_dev_req *backing_req,
|
||||
struct pcache_backing_dev_req_opts *opts);
|
||||
void backing_dev_flush(struct pcache_backing_dev *backing_dev);
|
||||
|
||||
int pcache_backing_init(void);
|
||||
void pcache_backing_exit(void);
|
||||
#endif /* _BACKING_DEV_H */

drivers/md/dm-pcache/cache.c (new file, 445 lines)
@@ -0,0 +1,445 @@
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#include <linux/blk_types.h>
|
||||
|
||||
#include "cache.h"
|
||||
#include "cache_dev.h"
|
||||
#include "backing_dev.h"
|
||||
#include "dm_pcache.h"
|
||||
|
||||
struct kmem_cache *key_cache;
|
||||
|
||||
static inline struct pcache_cache_info *get_cache_info_addr(struct pcache_cache *cache)
|
||||
{
|
||||
return cache->cache_info_addr + cache->info_index;
|
||||
}
|
||||
|
||||
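/*
 * cache_info is kept as PCACHE_META_INDEX_MAX rotating on-media copies.
 * Each update bumps header.seq, recomputes the header CRC, persists the
 * copy with memcpy_flushcache() and then advances info_index, so a torn
 * update can be detected and the latest intact copy recovered via
 * pcache_meta_find_latest().
 */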
static void cache_info_write(struct pcache_cache *cache)
|
||||
{
|
||||
struct pcache_cache_info *cache_info = &cache->cache_info;
|
||||
|
||||
cache_info->header.seq++;
|
||||
cache_info->header.crc = pcache_meta_crc(&cache_info->header,
|
||||
sizeof(struct pcache_cache_info));
|
||||
|
||||
memcpy_flushcache(get_cache_info_addr(cache), cache_info,
|
||||
sizeof(struct pcache_cache_info));
|
||||
|
||||
cache->info_index = (cache->info_index + 1) % PCACHE_META_INDEX_MAX;
|
||||
}
|
||||
|
||||
static void cache_info_init_default(struct pcache_cache *cache);
|
||||
static int cache_info_init(struct pcache_cache *cache, struct pcache_cache_options *opts)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
struct pcache_cache_info *cache_info_addr;
|
||||
|
||||
cache_info_addr = pcache_meta_find_latest(&cache->cache_info_addr->header,
|
||||
sizeof(struct pcache_cache_info),
|
||||
PCACHE_CACHE_INFO_SIZE,
|
||||
&cache->cache_info);
|
||||
if (IS_ERR(cache_info_addr))
|
||||
return PTR_ERR(cache_info_addr);
|
||||
|
||||
if (cache_info_addr) {
|
||||
if (opts->data_crc !=
|
||||
(cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC)) {
|
||||
pcache_dev_err(pcache, "invalid option for data_crc: %s, expected: %s",
|
||||
opts->data_crc ? "true" : "false",
|
||||
cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC ? "true" : "false");
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* init cache_info for new cache */
|
||||
cache_info_init_default(cache);
|
||||
cache_mode_set(cache, opts->cache_mode);
|
||||
if (opts->data_crc)
|
||||
cache->cache_info.flags |= PCACHE_CACHE_FLAGS_DATA_CRC;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cache_info_set_gc_percent(struct pcache_cache_info *cache_info, u8 percent)
|
||||
{
|
||||
cache_info->flags &= ~PCACHE_CACHE_FLAGS_GC_PERCENT_MASK;
|
||||
cache_info->flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, percent);
|
||||
}
|
||||
|
||||
int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent)
|
||||
{
|
||||
if (percent > PCACHE_CACHE_GC_PERCENT_MAX || percent < PCACHE_CACHE_GC_PERCENT_MIN)
|
||||
return -EINVAL;
|
||||
|
||||
mutex_lock(&cache->cache_info_lock);
|
||||
cache_info_set_gc_percent(&cache->cache_info, percent);
|
||||
|
||||
cache_info_write(cache);
|
||||
mutex_unlock(&cache->cache_info_lock);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void cache_pos_encode(struct pcache_cache *cache,
|
||||
struct pcache_cache_pos_onmedia *pos_onmedia_base,
|
||||
struct pcache_cache_pos *pos, u64 seq, u32 *index)
|
||||
{
|
||||
struct pcache_cache_pos_onmedia pos_onmedia;
|
||||
struct pcache_cache_pos_onmedia *pos_onmedia_addr = pos_onmedia_base + *index;
|
||||
|
||||
pos_onmedia.cache_seg_id = pos->cache_seg->cache_seg_id;
|
||||
pos_onmedia.seg_off = pos->seg_off;
|
||||
pos_onmedia.header.seq = seq;
|
||||
pos_onmedia.header.crc = cache_pos_onmedia_crc(&pos_onmedia);
|
||||
|
||||
memcpy_flushcache(pos_onmedia_addr, &pos_onmedia, sizeof(struct pcache_cache_pos_onmedia));
|
||||
pmem_wmb();
|
||||
|
||||
*index = (*index + 1) % PCACHE_META_INDEX_MAX;
|
||||
}
|
||||
|
||||
int cache_pos_decode(struct pcache_cache *cache,
|
||||
struct pcache_cache_pos_onmedia *pos_onmedia,
|
||||
struct pcache_cache_pos *pos, u64 *seq, u32 *index)
|
||||
{
|
||||
struct pcache_cache_pos_onmedia latest, *latest_addr;
|
||||
|
||||
latest_addr = pcache_meta_find_latest(&pos_onmedia->header,
|
||||
sizeof(struct pcache_cache_pos_onmedia),
|
||||
sizeof(struct pcache_cache_pos_onmedia),
|
||||
&latest);
|
||||
if (IS_ERR(latest_addr))
|
||||
return PTR_ERR(latest_addr);
|
||||
|
||||
if (!latest_addr)
|
||||
return -EIO;
|
||||
|
||||
pos->cache_seg = &cache->segments[latest.cache_seg_id];
|
||||
pos->seg_off = latest.seg_off;
|
||||
*seq = latest.header.seq;
|
||||
*index = (latest_addr - pos_onmedia);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void cache_info_set_seg_id(struct pcache_cache *cache, u32 seg_id)
|
||||
{
|
||||
cache->cache_info.seg_id = seg_id;
|
||||
}
|
||||
|
||||
static int cache_init(struct dm_pcache *pcache)
|
||||
{
|
||||
struct pcache_cache *cache = &pcache->cache;
|
||||
struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
|
||||
struct pcache_cache_dev *cache_dev = &pcache->cache_dev;
|
||||
int ret;
|
||||
|
||||
cache->segments = kvcalloc(cache_dev->seg_num, sizeof(struct pcache_cache_segment), GFP_KERNEL);
|
||||
if (!cache->segments) {
|
||||
ret = -ENOMEM;
|
||||
goto err;
|
||||
}
|
||||
|
||||
cache->seg_map = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL);
|
||||
if (!cache->seg_map) {
|
||||
ret = -ENOMEM;
|
||||
goto free_segments;
|
||||
}
|
||||
|
||||
cache->backing_dev = backing_dev;
|
||||
cache->cache_dev = &pcache->cache_dev;
|
||||
cache->n_segs = cache_dev->seg_num;
|
||||
atomic_set(&cache->gc_errors, 0);
|
||||
spin_lock_init(&cache->seg_map_lock);
|
||||
spin_lock_init(&cache->key_head_lock);
|
||||
|
||||
mutex_init(&cache->cache_info_lock);
|
||||
mutex_init(&cache->key_tail_lock);
|
||||
mutex_init(&cache->dirty_tail_lock);
|
||||
mutex_init(&cache->writeback_lock);
|
||||
|
||||
INIT_DELAYED_WORK(&cache->writeback_work, cache_writeback_fn);
|
||||
INIT_DELAYED_WORK(&cache->gc_work, pcache_cache_gc_fn);
|
||||
INIT_WORK(&cache->clean_work, clean_fn);
|
||||
|
||||
return 0;
|
||||
|
||||
free_segments:
|
||||
kvfree(cache->segments);
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void cache_exit(struct pcache_cache *cache)
|
||||
{
|
||||
kvfree(cache->seg_map);
|
||||
kvfree(cache->segments);
|
||||
}
|
||||
|
||||
static void cache_info_init_default(struct pcache_cache *cache)
|
||||
{
|
||||
struct pcache_cache_info *cache_info = &cache->cache_info;
|
||||
|
||||
cache_info->header.seq = 0;
|
||||
cache_info->n_segs = cache->cache_dev->seg_num;
|
||||
cache_info_set_gc_percent(cache_info, PCACHE_CACHE_GC_PERCENT_DEFAULT);
|
||||
}
|
||||
|
||||
static int cache_tail_init(struct pcache_cache *cache)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE);
|
||||
|
||||
if (new_cache) {
|
||||
__set_bit(0, cache->seg_map);
|
||||
|
||||
cache->key_head.cache_seg = &cache->segments[0];
|
||||
cache->key_head.seg_off = 0;
|
||||
cache_pos_copy(&cache->key_tail, &cache->key_head);
|
||||
cache_pos_copy(&cache->dirty_tail, &cache->key_head);
|
||||
|
||||
cache_encode_dirty_tail(cache);
|
||||
cache_encode_key_tail(cache);
|
||||
} else {
|
||||
if (cache_decode_key_tail(cache) || cache_decode_dirty_tail(cache)) {
|
||||
pcache_dev_err(pcache, "Corrupted key tail or dirty tail.\n");
|
||||
return -EIO;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int get_seg_id(struct pcache_cache *cache,
|
||||
struct pcache_cache_segment *prev_cache_seg,
|
||||
bool new_cache, u32 *seg_id)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
struct pcache_cache_dev *cache_dev = cache->cache_dev;
|
||||
int ret;
|
||||
|
||||
if (new_cache) {
|
||||
ret = cache_dev_get_empty_segment_id(cache_dev, seg_id);
|
||||
if (ret) {
|
||||
pcache_dev_err(pcache, "no available segment\n");
|
||||
goto err;
|
||||
}
|
||||
|
||||
if (prev_cache_seg)
|
||||
cache_seg_set_next_seg(prev_cache_seg, *seg_id);
|
||||
else
|
||||
cache_info_set_seg_id(cache, *seg_id);
|
||||
} else {
|
||||
if (prev_cache_seg) {
|
||||
struct pcache_segment_info *prev_seg_info;
|
||||
|
||||
prev_seg_info = &prev_cache_seg->cache_seg_info;
|
||||
if (!segment_info_has_next(prev_seg_info)) {
|
||||
ret = -EFAULT;
|
||||
goto err;
|
||||
}
|
||||
*seg_id = prev_cache_seg->cache_seg_info.next_seg;
|
||||
} else {
|
||||
*seg_id = cache->cache_info.seg_id;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int cache_segs_init(struct pcache_cache *cache)
|
||||
{
|
||||
struct pcache_cache_segment *prev_cache_seg = NULL;
|
||||
struct pcache_cache_info *cache_info = &cache->cache_info;
|
||||
bool new_cache = !(cache->cache_info.flags & PCACHE_CACHE_FLAGS_INIT_DONE);
|
||||
u32 seg_id;
|
||||
int ret;
|
||||
u32 i;
|
||||
|
||||
for (i = 0; i < cache_info->n_segs; i++) {
|
||||
ret = get_seg_id(cache, prev_cache_seg, new_cache, &seg_id);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = cache_seg_init(cache, seg_id, i, new_cache);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
prev_cache_seg = &cache->segments[i];
|
||||
}
|
||||
return 0;
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int cache_init_req_keys(struct pcache_cache *cache, u32 n_paral)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
u32 n_subtrees;
|
||||
int ret;
|
||||
u32 i, cpu;
|
||||
|
||||
/* Calculate number of cache subtrees based on the device size */
|
||||
n_subtrees = DIV_ROUND_UP(cache->dev_size << SECTOR_SHIFT, PCACHE_CACHE_SUBTREE_SIZE);
|
||||
ret = cache_tree_init(cache, &cache->req_key_tree, n_subtrees);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
cache->n_ksets = n_paral;
|
||||
cache->ksets = kvcalloc(cache->n_ksets, PCACHE_KSET_SIZE, GFP_KERNEL);
|
||||
if (!cache->ksets) {
|
||||
ret = -ENOMEM;
|
||||
goto req_tree_exit;
|
||||
}
|
||||
|
||||
/*
|
||||
* Initialize each kset with a spinlock and delayed work for flushing.
|
||||
* Each kset is associated with one queue to ensure independent handling
|
||||
* of cache keys across multiple queues, maximizing multiqueue concurrency.
|
||||
*/
|
||||
for (i = 0; i < cache->n_ksets; i++) {
|
||||
struct pcache_cache_kset *kset = get_kset(cache, i);
|
||||
|
||||
kset->cache = cache;
|
||||
spin_lock_init(&kset->kset_lock);
|
||||
INIT_DELAYED_WORK(&kset->flush_work, kset_flush_fn);
|
||||
}
|
||||
|
||||
cache->data_heads = alloc_percpu(struct pcache_cache_data_head);
|
||||
if (!cache->data_heads) {
|
||||
ret = -ENOMEM;
|
||||
goto free_kset;
|
||||
}
|
||||
|
||||
for_each_possible_cpu(cpu) {
|
||||
struct pcache_cache_data_head *h =
|
||||
per_cpu_ptr(cache->data_heads, cpu);
|
||||
h->head_pos.cache_seg = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Replay persisted cache keys using cache_replay.
|
||||
* This function loads and replays cache keys from previously stored
|
||||
* ksets, allowing the cache to restore its state after a restart.
|
||||
*/
|
||||
ret = cache_replay(cache);
|
||||
if (ret) {
|
||||
pcache_dev_err(pcache, "failed to replay keys\n");
|
||||
goto free_heads;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
free_heads:
|
||||
free_percpu(cache->data_heads);
|
||||
free_kset:
|
||||
kvfree(cache->ksets);
|
||||
req_tree_exit:
|
||||
cache_tree_exit(&cache->req_key_tree);
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void cache_destroy_req_keys(struct pcache_cache *cache)
|
||||
{
|
||||
u32 i;
|
||||
|
||||
for (i = 0; i < cache->n_ksets; i++) {
|
||||
struct pcache_cache_kset *kset = get_kset(cache, i);
|
||||
|
||||
cancel_delayed_work_sync(&kset->flush_work);
|
||||
}
|
||||
|
||||
free_percpu(cache->data_heads);
|
||||
kvfree(cache->ksets);
|
||||
cache_tree_exit(&cache->req_key_tree);
|
||||
}
|
||||
|
||||
int pcache_cache_start(struct dm_pcache *pcache)
|
||||
{
|
||||
struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
|
||||
struct pcache_cache *cache = &pcache->cache;
|
||||
struct pcache_cache_options *opts = &pcache->opts;
|
||||
int ret;
|
||||
|
||||
ret = cache_init(pcache);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
cache->cache_info_addr = CACHE_DEV_CACHE_INFO(cache->cache_dev);
|
||||
cache->cache_ctrl = CACHE_DEV_CACHE_CTRL(cache->cache_dev);
|
||||
backing_dev->cache = cache;
|
||||
cache->dev_size = backing_dev->dev_size;
|
||||
|
||||
ret = cache_info_init(cache, opts);
|
||||
if (ret)
|
||||
goto cache_exit;
|
||||
|
||||
ret = cache_segs_init(cache);
|
||||
if (ret)
|
||||
goto cache_exit;
|
||||
|
||||
ret = cache_tail_init(cache);
|
||||
if (ret)
|
||||
goto cache_exit;
|
||||
|
||||
ret = cache_init_req_keys(cache, num_online_cpus());
|
||||
if (ret)
|
||||
goto cache_exit;
|
||||
|
||||
ret = cache_writeback_init(cache);
|
||||
if (ret)
|
||||
goto destroy_keys;
|
||||
|
||||
cache->cache_info.flags |= PCACHE_CACHE_FLAGS_INIT_DONE;
|
||||
cache_info_write(cache);
|
||||
queue_delayed_work(cache_get_wq(cache), &cache->gc_work, 0);
|
||||
|
||||
return 0;
|
||||
|
||||
destroy_keys:
|
||||
cache_destroy_req_keys(cache);
|
||||
cache_exit:
|
||||
cache_exit(cache);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
void pcache_cache_stop(struct dm_pcache *pcache)
|
||||
{
|
||||
struct pcache_cache *cache = &pcache->cache;
|
||||
|
||||
cache_flush(cache);
|
||||
|
||||
cancel_delayed_work_sync(&cache->gc_work);
|
||||
flush_work(&cache->clean_work);
|
||||
cache_writeback_exit(cache);
|
||||
|
||||
if (cache->req_key_tree.n_subtrees)
|
||||
cache_destroy_req_keys(cache);
|
||||
|
||||
cache_exit(cache);
|
||||
}
|
||||
|
||||
struct workqueue_struct *cache_get_wq(struct pcache_cache *cache)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
|
||||
return pcache->task_wq;
|
||||
}
|
||||
|
||||
int pcache_cache_init(void)
|
||||
{
|
||||
key_cache = KMEM_CACHE(pcache_cache_key, 0);
|
||||
if (!key_cache)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void pcache_cache_exit(void)
|
||||
{
|
||||
kmem_cache_destroy(key_cache);
|
||||
}

drivers/md/dm-pcache/cache.h (new file, 635 lines)
@@ -0,0 +1,635 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#ifndef _PCACHE_CACHE_H
|
||||
#define _PCACHE_CACHE_H
|
||||
|
||||
#include "segment.h"
|
||||
|
||||
/* Garbage collection thresholds */
|
||||
#define PCACHE_CACHE_GC_PERCENT_MIN 0 /* Minimum GC percentage */
|
||||
#define PCACHE_CACHE_GC_PERCENT_MAX 90 /* Maximum GC percentage */
|
||||
#define PCACHE_CACHE_GC_PERCENT_DEFAULT 70 /* Default GC percentage */
|
||||
|
||||
#define PCACHE_CACHE_SUBTREE_SIZE (4 * PCACHE_MB) /* 4MB total tree size */
|
||||
#define PCACHE_CACHE_SUBTREE_SIZE_MASK 0x3FFFFF /* Mask for tree size */
|
||||
#define PCACHE_CACHE_SUBTREE_SIZE_SHIFT 22 /* Bit shift for tree size */
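/*
 * Note: 4 MB == 1 << 22, so the mask (0x3FFFFF) and shift (22) above are
 * simply the mask/shift forms of PCACHE_CACHE_SUBTREE_SIZE.
 */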
|
||||
|
||||
/* Maximum number of keys per key set */
|
||||
#define PCACHE_KSET_KEYS_MAX 128
|
||||
#define PCACHE_CACHE_SEGS_MAX (1024 * 1024) /* maximum cache size for each device is 16T */
|
||||
#define PCACHE_KSET_ONMEDIA_SIZE_MAX struct_size_t(struct pcache_cache_kset_onmedia, data, PCACHE_KSET_KEYS_MAX)
|
||||
#define PCACHE_KSET_SIZE (sizeof(struct pcache_cache_kset) + sizeof(struct pcache_cache_key_onmedia) * PCACHE_KSET_KEYS_MAX)
|
||||
|
||||
/* Maximum number of keys to clean in one round of clean_work */
|
||||
#define PCACHE_CLEAN_KEYS_MAX 10
|
||||
|
||||
/* Writeback and garbage collection intervals in jiffies */
|
||||
#define PCACHE_CACHE_WRITEBACK_INTERVAL (5 * HZ)
|
||||
#define PCACHE_CACHE_GC_INTERVAL (5 * HZ)
|
||||
|
||||
/* Macro to get the cache key structure from an rb_node pointer */
|
||||
#define CACHE_KEY(node) (container_of(node, struct pcache_cache_key, rb_node))
|
||||
|
||||
struct pcache_cache_pos_onmedia {
|
||||
struct pcache_meta_header header;
|
||||
__u32 cache_seg_id;
|
||||
__u32 seg_off;
|
||||
};
|
||||
|
||||
/* Offset and size definitions for cache segment control */
|
||||
#define PCACHE_CACHE_SEG_CTRL_OFF (PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX)
|
||||
#define PCACHE_CACHE_SEG_CTRL_SIZE (4 * PCACHE_KB)
|
||||
|
||||
struct pcache_cache_seg_gen {
|
||||
struct pcache_meta_header header;
|
||||
__u64 gen;
|
||||
};
|
||||
|
||||
/* Control structure for cache segments */
|
||||
struct pcache_cache_seg_ctrl {
|
||||
struct pcache_cache_seg_gen gen[PCACHE_META_INDEX_MAX];
|
||||
__u64 res[64];
|
||||
};
|
||||
|
||||
#define PCACHE_CACHE_FLAGS_DATA_CRC BIT(0)
|
||||
#define PCACHE_CACHE_FLAGS_INIT_DONE BIT(1)
|
||||
|
||||
#define PCACHE_CACHE_FLAGS_CACHE_MODE_MASK GENMASK(5, 2)
|
||||
#define PCACHE_CACHE_MODE_WRITEBACK 0
|
||||
#define PCACHE_CACHE_MODE_WRITETHROUGH 1
|
||||
#define PCACHE_CACHE_MODE_WRITEAROUND 2
|
||||
#define PCACHE_CACHE_MODE_WRITEONLY 3
|
||||
|
||||
#define PCACHE_CACHE_FLAGS_GC_PERCENT_MASK GENMASK(12, 6)
|
||||
|
||||
struct pcache_cache_info {
|
||||
struct pcache_meta_header header;
|
||||
__u32 seg_id;
|
||||
__u32 n_segs;
|
||||
__u32 flags;
|
||||
__u32 reserved;
|
||||
};
|
||||
|
||||
struct pcache_cache_pos {
|
||||
struct pcache_cache_segment *cache_seg;
|
||||
u32 seg_off;
|
||||
};
|
||||
|
||||
struct pcache_cache_segment {
|
||||
struct pcache_cache *cache;
|
||||
u32 cache_seg_id; /* Index in cache->segments */
|
||||
struct pcache_segment segment;
|
||||
atomic_t refs;
|
||||
|
||||
struct pcache_segment_info cache_seg_info;
|
||||
struct mutex info_lock;
|
||||
u32 info_index;
|
||||
|
||||
spinlock_t gen_lock;
|
||||
u64 gen;
|
||||
u64 gen_seq;
|
||||
u32 gen_index;
|
||||
|
||||
struct pcache_cache_seg_ctrl *cache_seg_ctrl;
|
||||
};
|
||||
|
||||
/* rbtree for cache entries */
|
||||
struct pcache_cache_subtree {
|
||||
struct rb_root root;
|
||||
spinlock_t tree_lock;
|
||||
};
|
||||
|
||||
struct pcache_cache_tree {
|
||||
struct pcache_cache *cache;
|
||||
u32 n_subtrees;
|
||||
mempool_t key_pool;
|
||||
struct pcache_cache_subtree *subtrees;
|
||||
};
|
||||
|
||||
extern struct kmem_cache *key_cache;
|
||||
|
||||
struct pcache_cache_key {
|
||||
struct pcache_cache_tree *cache_tree;
|
||||
struct pcache_cache_subtree *cache_subtree;
|
||||
struct kref ref;
|
||||
struct rb_node rb_node;
|
||||
struct list_head list_node;
|
||||
u64 off;
|
||||
u32 len;
|
||||
u32 flags;
|
||||
struct pcache_cache_pos cache_pos;
|
||||
u64 seg_gen;
|
||||
};
|
||||
|
||||
#define PCACHE_CACHE_KEY_FLAGS_EMPTY BIT(0)
|
||||
#define PCACHE_CACHE_KEY_FLAGS_CLEAN BIT(1)
|
||||
|
||||
struct pcache_cache_key_onmedia {
|
||||
__u64 off;
|
||||
__u32 len;
|
||||
__u32 flags;
|
||||
__u32 cache_seg_id;
|
||||
__u32 cache_seg_off;
|
||||
__u64 seg_gen;
|
||||
__u32 data_crc;
|
||||
__u32 reserved;
|
||||
};
|
||||
|
||||
struct pcache_cache_kset_onmedia {
|
||||
__u32 crc;
|
||||
union {
|
||||
__u32 key_num;
|
||||
__u32 next_cache_seg_id;
|
||||
};
|
||||
__u64 magic;
|
||||
__u64 flags;
|
||||
struct pcache_cache_key_onmedia data[];
|
||||
};
|
||||
|
||||
struct pcache_cache {
|
||||
struct pcache_backing_dev *backing_dev;
|
||||
struct pcache_cache_dev *cache_dev;
|
||||
struct pcache_cache_ctrl *cache_ctrl;
|
||||
u64 dev_size;
|
||||
|
||||
struct pcache_cache_data_head __percpu *data_heads;
|
||||
|
||||
spinlock_t key_head_lock;
|
||||
struct pcache_cache_pos key_head;
|
||||
u32 n_ksets;
|
||||
struct pcache_cache_kset *ksets;
|
||||
|
||||
struct mutex key_tail_lock;
|
||||
struct pcache_cache_pos key_tail;
|
||||
u64 key_tail_seq;
|
||||
u32 key_tail_index;
|
||||
|
||||
struct mutex dirty_tail_lock;
|
||||
struct pcache_cache_pos dirty_tail;
|
||||
u64 dirty_tail_seq;
|
||||
u32 dirty_tail_index;
|
||||
|
||||
struct pcache_cache_tree req_key_tree;
|
||||
struct work_struct clean_work;
|
||||
|
||||
struct mutex writeback_lock;
|
||||
char wb_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX];
|
||||
struct pcache_cache_tree writeback_key_tree;
|
||||
struct delayed_work writeback_work;
|
||||
struct {
|
||||
atomic_t pending;
|
||||
u32 advance;
|
||||
int ret;
|
||||
} writeback_ctx;
|
||||
|
||||
char gc_kset_onmedia_buf[PCACHE_KSET_ONMEDIA_SIZE_MAX];
|
||||
struct delayed_work gc_work;
|
||||
atomic_t gc_errors;
|
||||
|
||||
struct mutex cache_info_lock;
|
||||
struct pcache_cache_info cache_info;
|
||||
struct pcache_cache_info *cache_info_addr;
|
||||
u32 info_index;
|
||||
|
||||
u32 n_segs;
|
||||
unsigned long *seg_map;
|
||||
u32 last_cache_seg;
|
||||
bool cache_full;
|
||||
spinlock_t seg_map_lock;
|
||||
struct pcache_cache_segment *segments;
|
||||
};
|
||||
|
||||
struct workqueue_struct *cache_get_wq(struct pcache_cache *cache);
|
||||
|
||||
struct dm_pcache;
|
||||
struct pcache_cache_options {
|
||||
u32 cache_mode:4;
|
||||
u32 data_crc:1;
|
||||
};
|
||||
int pcache_cache_start(struct dm_pcache *pcache);
|
||||
void pcache_cache_stop(struct dm_pcache *pcache);
|
||||
|
||||
struct pcache_cache_ctrl {
|
||||
/* Updated by gc_thread */
|
||||
struct pcache_cache_pos_onmedia key_tail_pos[PCACHE_META_INDEX_MAX];
|
||||
|
||||
/* Updated by writeback_thread */
|
||||
struct pcache_cache_pos_onmedia dirty_tail_pos[PCACHE_META_INDEX_MAX];
|
||||
};
|
||||
|
||||
struct pcache_cache_data_head {
|
||||
struct pcache_cache_pos head_pos;
|
||||
};
|
||||
|
||||
static inline u16 pcache_cache_get_gc_percent(struct pcache_cache *cache)
|
||||
{
|
||||
return FIELD_GET(PCACHE_CACHE_FLAGS_GC_PERCENT_MASK, cache->cache_info.flags);
|
||||
}
|
||||
|
||||
int pcache_cache_set_gc_percent(struct pcache_cache *cache, u8 percent);
|
||||
|
||||
/* cache key */
|
||||
struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t gfp_mask);
|
||||
void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key);
|
||||
void cache_key_get(struct pcache_cache_key *key);
|
||||
void cache_key_put(struct pcache_cache_key *key);
|
||||
int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close);
|
||||
void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup);
|
||||
int cache_key_decode(struct pcache_cache *cache,
|
||||
struct pcache_cache_key_onmedia *key_onmedia,
|
||||
struct pcache_cache_key *key);
|
||||
void cache_pos_advance(struct pcache_cache_pos *pos, u32 len);
|
||||
|
||||
#define PCACHE_KSET_FLAGS_LAST BIT(0)
|
||||
#define PCACHE_KSET_MAGIC 0x676894a64e164f1aULL
|
||||
|
||||
struct pcache_cache_kset {
|
||||
struct pcache_cache *cache;
|
||||
spinlock_t kset_lock;
|
||||
struct delayed_work flush_work;
|
||||
struct pcache_cache_kset_onmedia kset_onmedia;
|
||||
};
|
||||
|
||||
extern struct pcache_cache_kset_onmedia pcache_empty_kset;
|
||||
|
||||
#define SUBTREE_WALK_RET_OK 0
|
||||
#define SUBTREE_WALK_RET_ERR 1
|
||||
#define SUBTREE_WALK_RET_NEED_KEY 2
|
||||
#define SUBTREE_WALK_RET_NEED_REQ 3
|
||||
#define SUBTREE_WALK_RET_RESEARCH 4
|
||||
|
||||
struct pcache_cache_subtree_walk_ctx {
|
||||
struct pcache_cache_tree *cache_tree;
|
||||
struct rb_node *start_node;
|
||||
struct pcache_request *pcache_req;
|
||||
struct pcache_cache_key *key;
|
||||
u32 req_done;
|
||||
int ret;
|
||||
|
||||
/* pre-allocated key and backing_dev_req */
|
||||
struct pcache_cache_key *pre_alloc_key;
|
||||
struct pcache_backing_dev_req *pre_alloc_req;
|
||||
|
||||
struct list_head *delete_key_list;
|
||||
struct list_head *submit_req_list;
|
||||
|
||||
/*
|
||||
* |--------| key_tmp
|
||||
* |====| key
|
||||
*/
|
||||
int (*before)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx);
|
||||
|
||||
/*
|
||||
* |----------| key_tmp
|
||||
* |=====| key
|
||||
*/
|
||||
int (*after)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx);
|
||||
|
||||
/*
|
||||
* |----------------| key_tmp
|
||||
* |===========| key
|
||||
*/
|
||||
int (*overlap_tail)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx);
|
||||
|
||||
/*
|
||||
* |--------| key_tmp
|
||||
* |==========| key
|
||||
*/
|
||||
int (*overlap_head)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx);
|
||||
|
||||
/*
|
||||
* |----| key_tmp
|
||||
* |==========| key
|
||||
*/
|
||||
int (*overlap_contain)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx);
|
||||
|
||||
/*
|
||||
* |-----------| key_tmp
|
||||
* |====| key
|
||||
*/
|
||||
int (*overlap_contained)(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx);
|
||||
|
||||
int (*walk_finally)(struct pcache_cache_subtree_walk_ctx *ctx, int ret);
|
||||
bool (*walk_done)(struct pcache_cache_subtree_walk_ctx *ctx);
|
||||
};
|
||||
|
||||
int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx);
|
||||
struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key,
|
||||
struct rb_node **parentp, struct rb_node ***newp,
|
||||
struct list_head *delete_key_list);
|
||||
int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset);
|
||||
void clean_fn(struct work_struct *work);
|
||||
void kset_flush_fn(struct work_struct *work);
|
||||
int cache_replay(struct pcache_cache *cache);
|
||||
int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees);
|
||||
void cache_tree_clear(struct pcache_cache_tree *cache_tree);
|
||||
void cache_tree_exit(struct pcache_cache_tree *cache_tree);
|
||||
|
||||
/* cache segments */
|
||||
struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache);
|
||||
int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id,
|
||||
bool new_cache);
|
||||
void cache_seg_get(struct pcache_cache_segment *cache_seg);
|
||||
void cache_seg_put(struct pcache_cache_segment *cache_seg);
|
||||
void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id);
|
||||
|
||||
/* cache request */
|
||||
int cache_flush(struct pcache_cache *cache);
|
||||
void miss_read_end_work_fn(struct work_struct *work);
|
||||
int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req);
|
||||
|
||||
/* gc */
|
||||
void pcache_cache_gc_fn(struct work_struct *work);
|
||||
|
||||
/* writeback */
|
||||
void cache_writeback_exit(struct pcache_cache *cache);
|
||||
int cache_writeback_init(struct pcache_cache *cache);
|
||||
void cache_writeback_fn(struct work_struct *work);
|
||||
|
||||
/* inline functions */
|
||||
static inline struct pcache_cache_subtree *get_subtree(struct pcache_cache_tree *cache_tree, u64 off)
|
||||
{
|
||||
if (cache_tree->n_subtrees == 1)
|
||||
return &cache_tree->subtrees[0];
|
||||
|
||||
return &cache_tree->subtrees[off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT];
|
||||
}
|
||||
|
||||
static inline void *cache_pos_addr(struct pcache_cache_pos *pos)
|
||||
{
|
||||
return (pos->cache_seg->segment.data + pos->seg_off);
|
||||
}
|
||||
|
||||
static inline void *get_key_head_addr(struct pcache_cache *cache)
|
||||
{
|
||||
return cache_pos_addr(&cache->key_head);
|
||||
}
|
||||
|
||||
static inline u32 get_kset_id(struct pcache_cache *cache, u64 off)
|
||||
{
|
||||
u32 kset_id;
|
||||
|
||||
div_u64_rem(off >> PCACHE_CACHE_SUBTREE_SIZE_SHIFT, cache->n_ksets, &kset_id);
|
||||
|
||||
return kset_id;
|
||||
}
|
||||
|
||||
static inline struct pcache_cache_kset *get_kset(struct pcache_cache *cache, u32 kset_id)
|
||||
{
|
||||
return (void *)cache->ksets + PCACHE_KSET_SIZE * kset_id;
|
||||
}
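/*
 * Note: cache->ksets is a flat buffer of n_ksets slots of PCACHE_KSET_SIZE
 * bytes each (see cache_init_req_keys()), which is why get_kset() uses byte
 * arithmetic rather than array indexing.  get_kset_id() maps the 4MB subtree
 * index modulo n_ksets, so all keys of one subtree are appended through the
 * same kset.
 */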
|
||||
|
||||
static inline struct pcache_cache_data_head *get_data_head(struct pcache_cache *cache)
|
||||
{
|
||||
return this_cpu_ptr(cache->data_heads);
|
||||
}
|
||||
|
||||
static inline bool cache_key_empty(struct pcache_cache_key *key)
|
||||
{
|
||||
return key->flags & PCACHE_CACHE_KEY_FLAGS_EMPTY;
|
||||
}
|
||||
|
||||
static inline bool cache_key_clean(struct pcache_cache_key *key)
|
||||
{
|
||||
return key->flags & PCACHE_CACHE_KEY_FLAGS_CLEAN;
|
||||
}
|
||||
|
||||
static inline void cache_pos_copy(struct pcache_cache_pos *dst, struct pcache_cache_pos *src)
|
||||
{
|
||||
memcpy(dst, src, sizeof(struct pcache_cache_pos));
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_seg_is_ctrl_seg - Checks if a cache segment is a cache ctrl segment.
|
||||
* @cache_seg_id: ID of the cache segment.
|
||||
*
|
||||
* Returns true if the cache segment ID corresponds to a cache ctrl segment.
|
||||
*
|
||||
* Note: We extend the segment control of the first cache segment
|
||||
* (cache segment ID 0) to serve as the cache control (pcache_cache_ctrl)
|
||||
* for the entire PCACHE cache. This function determines whether the given
|
||||
* cache segment is the one storing the pcache_cache_ctrl information.
|
||||
*/
|
||||
static inline bool cache_seg_is_ctrl_seg(u32 cache_seg_id)
|
||||
{
|
||||
return (cache_seg_id == 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_key_cutfront - Cuts a specified length from the front of a cache key.
|
||||
* @key: Pointer to pcache_cache_key structure.
|
||||
* @cut_len: Length to cut from the front.
|
||||
*
|
||||
* Advances the cache key position by cut_len and adjusts offset and length accordingly.
|
||||
*/
|
||||
static inline void cache_key_cutfront(struct pcache_cache_key *key, u32 cut_len)
|
||||
{
|
||||
if (key->cache_pos.cache_seg)
|
||||
cache_pos_advance(&key->cache_pos, cut_len);
|
||||
|
||||
key->off += cut_len;
|
||||
key->len -= cut_len;
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_key_cutback - Cuts a specified length from the back of a cache key.
|
||||
* @key: Pointer to pcache_cache_key structure.
|
||||
* @cut_len: Length to cut from the back.
|
||||
*
|
||||
* Reduces the length of the cache key by cut_len.
|
||||
*/
|
||||
static inline void cache_key_cutback(struct pcache_cache_key *key, u32 cut_len)
|
||||
{
|
||||
key->len -= cut_len;
|
||||
}
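/*
 * For example, given a key with off = 8192 and len = 8192:
 * cache_key_cutfront(key, 4096) leaves off = 12288, len = 4096 and advances
 * cache_pos by 4096, while cache_key_cutback(key, 4096) instead leaves
 * off = 8192, len = 4096 with cache_pos untouched.
 */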
|
||||
|
||||
static inline void cache_key_delete(struct pcache_cache_key *key)
|
||||
{
|
||||
struct pcache_cache_subtree *cache_subtree;
|
||||
|
||||
cache_subtree = key->cache_subtree;
|
||||
BUG_ON(!cache_subtree);
|
||||
|
||||
rb_erase(&key->rb_node, &cache_subtree->root);
|
||||
key->flags = 0;
|
||||
cache_key_put(key);
|
||||
}
|
||||
|
||||
static inline bool cache_data_crc_on(struct pcache_cache *cache)
|
||||
{
|
||||
return (cache->cache_info.flags & PCACHE_CACHE_FLAGS_DATA_CRC);
|
||||
}
|
||||
|
||||
static inline u32 cache_mode_get(struct pcache_cache *cache)
|
||||
{
|
||||
return FIELD_GET(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache->cache_info.flags);
|
||||
}
|
||||
|
||||
static inline void cache_mode_set(struct pcache_cache *cache, u32 cache_mode)
|
||||
{
|
||||
cache->cache_info.flags &= ~PCACHE_CACHE_FLAGS_CACHE_MODE_MASK;
|
||||
cache->cache_info.flags |= FIELD_PREP(PCACHE_CACHE_FLAGS_CACHE_MODE_MASK, cache_mode);
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_key_data_crc - Calculates CRC for data in a cache key.
|
||||
* @key: Pointer to the pcache_cache_key structure.
|
||||
*
|
||||
* Returns the CRC-32 checksum of the data within the cache key's position.
|
||||
*/
|
||||
static inline u32 cache_key_data_crc(struct pcache_cache_key *key)
|
||||
{
|
||||
void *data;
|
||||
|
||||
data = cache_pos_addr(&key->cache_pos);
|
||||
|
||||
return crc32c(PCACHE_CRC_SEED, data, key->len);
|
||||
}
|
||||
|
||||
static inline u32 cache_kset_crc(struct pcache_cache_kset_onmedia *kset_onmedia)
|
||||
{
|
||||
u32 crc_size;
|
||||
|
||||
if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST)
|
||||
crc_size = sizeof(struct pcache_cache_kset_onmedia) - 4;
|
||||
else
|
||||
crc_size = struct_size(kset_onmedia, data, kset_onmedia->key_num) - 4;
|
||||
|
||||
return crc32c(PCACHE_CRC_SEED, (void *)kset_onmedia + 4, crc_size);
|
||||
}
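/*
 * Note: the "+ 4" / "- 4" above skip the leading __u32 crc field, so the
 * checksum covers the on-media kset minus the stored CRC itself;
 * sb_init()/sb_validate() in cache_dev.c apply the same skip-first-4-bytes
 * convention to struct pcache_sb.
 */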
|
||||
|
||||
static inline u32 get_kset_onmedia_size(struct pcache_cache_kset_onmedia *kset_onmedia)
|
||||
{
|
||||
return struct_size_t(struct pcache_cache_kset_onmedia, data, kset_onmedia->key_num);
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_seg_remain - Computes remaining space in a cache segment.
|
||||
* @pos: Pointer to pcache_cache_pos structure.
|
||||
*
|
||||
* Returns the amount of remaining space in the segment data starting from
|
||||
* the current position offset.
|
||||
*/
|
||||
static inline u32 cache_seg_remain(struct pcache_cache_pos *pos)
|
||||
{
|
||||
struct pcache_cache_segment *cache_seg;
|
||||
struct pcache_segment *segment;
|
||||
u32 seg_remain;
|
||||
|
||||
cache_seg = pos->cache_seg;
|
||||
segment = &cache_seg->segment;
|
||||
seg_remain = segment->data_size - pos->seg_off;
|
||||
|
||||
return seg_remain;
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_key_invalid - Checks if a cache key is invalid.
|
||||
* @key: Pointer to pcache_cache_key structure.
|
||||
*
|
||||
* Returns true if the cache key is invalid due to its generation being
|
||||
* less than the generation of its segment; otherwise returns false.
|
||||
*
|
||||
* When the GC (garbage collection) thread identifies a segment
|
||||
* as reclaimable, it increments the segment's generation (gen). However,
|
||||
* it does not immediately remove all related cache keys. When accessing
|
||||
* such a cache key, this function can be used to determine if the cache
|
||||
* key has already become invalid.
|
||||
*/
|
||||
static inline bool cache_key_invalid(struct pcache_cache_key *key)
|
||||
{
|
||||
if (cache_key_empty(key))
|
||||
return false;
|
||||
|
||||
return (key->seg_gen < key->cache_pos.cache_seg->gen);
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_key_lstart - Retrieves the logical start offset of a cache key.
|
||||
* @key: Pointer to pcache_cache_key structure.
|
||||
*
|
||||
* Returns the logical start offset for the cache key.
|
||||
*/
|
||||
static inline u64 cache_key_lstart(struct pcache_cache_key *key)
|
||||
{
|
||||
return key->off;
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_key_lend - Retrieves the logical end offset of a cache key.
|
||||
* @key: Pointer to pcache_cache_key structure.
|
||||
*
|
||||
* Returns the logical end offset for the cache key.
|
||||
*/
|
||||
static inline u64 cache_key_lend(struct pcache_cache_key *key)
|
||||
{
|
||||
return key->off + key->len;
|
||||
}
|
||||
|
||||
static inline void cache_key_copy(struct pcache_cache_key *key_dst, struct pcache_cache_key *key_src)
|
||||
{
|
||||
key_dst->off = key_src->off;
|
||||
key_dst->len = key_src->len;
|
||||
key_dst->seg_gen = key_src->seg_gen;
|
||||
key_dst->cache_tree = key_src->cache_tree;
|
||||
key_dst->cache_subtree = key_src->cache_subtree;
|
||||
key_dst->flags = key_src->flags;
|
||||
|
||||
cache_pos_copy(&key_dst->cache_pos, &key_src->cache_pos);
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_pos_onmedia_crc - Calculates the CRC for an on-media cache position.
|
||||
* @pos_om: Pointer to pcache_cache_pos_onmedia structure.
|
||||
*
|
||||
* Calculates the CRC-32 checksum of the position, excluding the first 4 bytes.
|
||||
* Returns the computed CRC value.
|
||||
*/
|
||||
static inline u32 cache_pos_onmedia_crc(struct pcache_cache_pos_onmedia *pos_om)
|
||||
{
|
||||
return pcache_meta_crc(&pos_om->header, sizeof(struct pcache_cache_pos_onmedia));
|
||||
}
|
||||
|
||||
void cache_pos_encode(struct pcache_cache *cache,
|
||||
struct pcache_cache_pos_onmedia *pos_onmedia,
|
||||
struct pcache_cache_pos *pos, u64 seq, u32 *index);
|
||||
int cache_pos_decode(struct pcache_cache *cache,
|
||||
struct pcache_cache_pos_onmedia *pos_onmedia,
|
||||
struct pcache_cache_pos *pos, u64 *seq, u32 *index);
|
||||
|
||||
static inline void cache_encode_key_tail(struct pcache_cache *cache)
|
||||
{
|
||||
cache_pos_encode(cache, cache->cache_ctrl->key_tail_pos,
|
||||
&cache->key_tail, ++cache->key_tail_seq,
|
||||
&cache->key_tail_index);
|
||||
}
|
||||
|
||||
static inline int cache_decode_key_tail(struct pcache_cache *cache)
|
||||
{
|
||||
return cache_pos_decode(cache, cache->cache_ctrl->key_tail_pos,
|
||||
&cache->key_tail, &cache->key_tail_seq,
|
||||
&cache->key_tail_index);
|
||||
}
|
||||
|
||||
static inline void cache_encode_dirty_tail(struct pcache_cache *cache)
|
||||
{
|
||||
cache_pos_encode(cache, cache->cache_ctrl->dirty_tail_pos,
|
||||
&cache->dirty_tail, ++cache->dirty_tail_seq,
|
||||
&cache->dirty_tail_index);
|
||||
}
|
||||
|
||||
static inline int cache_decode_dirty_tail(struct pcache_cache *cache)
|
||||
{
|
||||
return cache_pos_decode(cache, cache->cache_ctrl->dirty_tail_pos,
|
||||
&cache->dirty_tail, &cache->dirty_tail_seq,
|
||||
&cache->dirty_tail_index);
|
||||
}
|
||||
|
||||
int pcache_cache_init(void);
|
||||
void pcache_cache_exit(void);
|
||||
#endif /* _PCACHE_CACHE_H */

drivers/md/dm-pcache/cache_dev.c (new file, 303 lines)
@@ -0,0 +1,303 @@
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/dax.h>
|
||||
#include <linux/vmalloc.h>
|
||||
#include <linux/parser.h>
|
||||
|
||||
#include "cache_dev.h"
|
||||
#include "backing_dev.h"
|
||||
#include "cache.h"
|
||||
#include "dm_pcache.h"
|
||||
|
||||
static void cache_dev_dax_exit(struct pcache_cache_dev *cache_dev)
|
||||
{
|
||||
if (cache_dev->use_vmap)
|
||||
vunmap(cache_dev->mapping);
|
||||
}
|
||||
|
||||
static int build_vmap(struct dax_device *dax_dev, long total_pages, void **vaddr)
|
||||
{
|
||||
struct page **pages;
|
||||
long i = 0, chunk;
|
||||
unsigned long pfn;
|
||||
int ret;
|
||||
|
||||
pages = vmalloc_array(total_pages, sizeof(struct page *));
|
||||
if (!pages)
|
||||
return -ENOMEM;
|
||||
|
||||
do {
|
||||
chunk = dax_direct_access(dax_dev, i, total_pages - i,
|
||||
DAX_ACCESS, NULL, &pfn);
|
||||
if (chunk <= 0) {
|
||||
ret = chunk ? chunk : -EINVAL;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
if (!pfn_valid(pfn)) {
|
||||
ret = -EOPNOTSUPP;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
while (chunk-- && i < total_pages) {
|
||||
pages[i++] = pfn_to_page(pfn);
|
||||
pfn++;
|
||||
if (!(i & 15))
|
||||
cond_resched();
|
||||
}
|
||||
} while (i < total_pages);
|
||||
|
||||
*vaddr = vmap(pages, total_pages, VM_MAP, PAGE_KERNEL);
|
||||
if (!*vaddr) {
|
||||
ret = -ENOMEM;
|
||||
goto out_free;
|
||||
}
|
||||
|
||||
ret = 0;
|
||||
|
||||
out_free:
|
||||
vfree(pages);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int cache_dev_dax_init(struct pcache_cache_dev *cache_dev)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev);
|
||||
struct dax_device *dax_dev;
|
||||
long total_pages, mapped_pages;
|
||||
u64 bdev_size;
|
||||
void *vaddr;
|
||||
int ret;
|
||||
int id;
|
||||
unsigned long pfn;
|
||||
|
||||
dax_dev = cache_dev->dm_dev->dax_dev;
|
||||
/* total size check */
|
||||
bdev_size = bdev_nr_bytes(cache_dev->dm_dev->bdev);
|
||||
if (bdev_size < PCACHE_CACHE_DEV_SIZE_MIN) {
|
||||
pcache_dev_err(pcache, "dax device is too small, required at least %llu",
|
||||
PCACHE_CACHE_DEV_SIZE_MIN);
|
||||
ret = -ENOSPC;
|
||||
goto out;
|
||||
}
|
||||
|
||||
total_pages = bdev_size >> PAGE_SHIFT;
|
||||
/* attempt: direct-map the whole range */
|
||||
id = dax_read_lock();
|
||||
mapped_pages = dax_direct_access(dax_dev, 0, total_pages,
|
||||
DAX_ACCESS, &vaddr, &pfn);
|
||||
if (mapped_pages < 0) {
|
||||
pcache_dev_err(pcache, "dax_direct_access failed: %ld\n", mapped_pages);
|
||||
ret = mapped_pages;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
if (!pfn_valid(pfn)) {
|
||||
ret = -EOPNOTSUPP;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
if (mapped_pages == total_pages) {
|
||||
/* success: contiguous direct mapping */
|
||||
cache_dev->mapping = vaddr;
|
||||
} else {
|
||||
/* need vmap fallback */
|
||||
ret = build_vmap(dax_dev, total_pages, &vaddr);
|
||||
if (ret) {
|
||||
pcache_dev_err(pcache, "vmap fallback failed: %d\n", ret);
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
cache_dev->mapping = vaddr;
|
||||
cache_dev->use_vmap = true;
|
||||
}
|
||||
dax_read_unlock(id);
|
||||
|
||||
return 0;
|
||||
unlock:
|
||||
dax_read_unlock(id);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size)
|
||||
{
|
||||
memset(pos, 0, size);
|
||||
dax_flush(cache_dev->dm_dev->dax_dev, pos, size);
|
||||
}
|
||||
|
||||
static int sb_read(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
|
||||
{
|
||||
struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev);
|
||||
|
||||
if (copy_mc_to_kernel(sb, sb_addr, sizeof(struct pcache_sb)))
|
||||
return -EIO;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void sb_write(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
|
||||
{
|
||||
struct pcache_sb *sb_addr = CACHE_DEV_SB(cache_dev);
|
||||
|
||||
memcpy_flushcache(sb_addr, sb, sizeof(struct pcache_sb));
|
||||
pmem_wmb();
|
||||
}
|
||||
|
||||
static int sb_init(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev);
|
||||
u64 nr_segs;
|
||||
u64 cache_dev_size;
|
||||
u64 magic;
|
||||
u32 flags = 0;
|
||||
|
||||
magic = le64_to_cpu(sb->magic);
|
||||
if (magic)
|
||||
return -EEXIST;
|
||||
|
||||
cache_dev_size = bdev_nr_bytes(file_bdev(cache_dev->dm_dev->bdev_file));
|
||||
if (cache_dev_size < PCACHE_CACHE_DEV_SIZE_MIN) {
|
||||
pcache_dev_err(pcache, "dax device is too small, required at least %llu",
|
||||
PCACHE_CACHE_DEV_SIZE_MIN);
|
||||
return -ENOSPC;
|
||||
}
|
||||
|
||||
nr_segs = (cache_dev_size - PCACHE_SEGMENTS_OFF) / PCACHE_SEG_SIZE;
|
||||
|
||||
#if defined(__BYTE_ORDER) ? (__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN)
|
||||
flags |= PCACHE_SB_F_BIGENDIAN;
|
||||
#endif
|
||||
sb->flags = cpu_to_le32(flags);
|
||||
sb->magic = cpu_to_le64(PCACHE_MAGIC);
|
||||
sb->seg_num = cpu_to_le32(nr_segs);
|
||||
sb->crc = cpu_to_le32(crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4));
|
||||
|
||||
cache_dev_zero_range(cache_dev, CACHE_DEV_CACHE_INFO(cache_dev),
|
||||
PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX +
|
||||
PCACHE_CACHE_CTRL_SIZE);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int sb_validate(struct pcache_cache_dev *cache_dev, struct pcache_sb *sb)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev);
|
||||
u32 flags;
|
||||
u32 crc;
|
||||
|
||||
if (le64_to_cpu(sb->magic) != PCACHE_MAGIC) {
|
||||
pcache_dev_err(pcache, "unexpected magic: %llx\n",
|
||||
le64_to_cpu(sb->magic));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
crc = crc32c(PCACHE_CRC_SEED, (void *)(sb) + 4, sizeof(struct pcache_sb) - 4);
|
||||
if (crc != le32_to_cpu(sb->crc)) {
|
||||
pcache_dev_err(pcache, "corrupted sb: %u, expected: %u\n", crc, le32_to_cpu(sb->crc));
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
flags = le32_to_cpu(sb->flags);
|
||||
#if defined(__BYTE_ORDER) ? (__BIG_ENDIAN == __BYTE_ORDER) : defined(__BIG_ENDIAN)
|
||||
if (!(flags & PCACHE_SB_F_BIGENDIAN)) {
|
||||
pcache_dev_err(pcache, "cache_dev is not big endian\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
#else
|
||||
if (flags & PCACHE_SB_F_BIGENDIAN) {
|
||||
pcache_dev_err(pcache, "cache_dev is big endian\n");
|
||||
return -EINVAL;
|
||||
}
|
||||
#endif
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int cache_dev_init(struct pcache_cache_dev *cache_dev, u32 seg_num)
|
||||
{
|
||||
cache_dev->seg_num = seg_num;
|
||||
cache_dev->seg_bitmap = kvcalloc(BITS_TO_LONGS(cache_dev->seg_num), sizeof(unsigned long), GFP_KERNEL);
|
||||
if (!cache_dev->seg_bitmap)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void cache_dev_exit(struct pcache_cache_dev *cache_dev)
|
||||
{
|
||||
kvfree(cache_dev->seg_bitmap);
|
||||
}
|
||||
|
||||
void cache_dev_stop(struct dm_pcache *pcache)
|
||||
{
|
||||
struct pcache_cache_dev *cache_dev = &pcache->cache_dev;
|
||||
|
||||
cache_dev_exit(cache_dev);
|
||||
cache_dev_dax_exit(cache_dev);
|
||||
}
|
||||
|
||||
int cache_dev_start(struct dm_pcache *pcache)
|
||||
{
|
||||
struct pcache_cache_dev *cache_dev = &pcache->cache_dev;
|
||||
struct pcache_sb sb;
|
||||
bool format = false;
|
||||
int ret;
|
||||
|
||||
mutex_init(&cache_dev->seg_lock);
|
||||
|
||||
ret = cache_dev_dax_init(cache_dev);
|
||||
if (ret) {
|
||||
pcache_dev_err(pcache, "failed to init cache_dev %s via dax way: %d.",
|
||||
cache_dev->dm_dev->name, ret);
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = sb_read(cache_dev, &sb);
|
||||
if (ret)
|
||||
goto dax_release;
|
||||
|
||||
if (le64_to_cpu(sb.magic) == 0) {
|
||||
format = true;
|
||||
ret = sb_init(cache_dev, &sb);
|
||||
if (ret < 0)
|
||||
goto dax_release;
|
||||
}
|
||||
|
||||
ret = sb_validate(cache_dev, &sb);
|
||||
if (ret)
|
||||
goto dax_release;
|
||||
|
||||
cache_dev->sb_flags = le32_to_cpu(sb.flags);
|
||||
ret = cache_dev_init(cache_dev, le32_to_cpu(sb.seg_num));
|
||||
if (ret)
|
||||
goto dax_release;
|
||||
|
||||
if (format)
|
||||
sb_write(cache_dev, &sb);
|
||||
|
||||
return 0;
|
||||
|
||||
dax_release:
|
||||
cache_dev_dax_exit(cache_dev);
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id)
|
||||
{
|
||||
int ret;
|
||||
|
||||
mutex_lock(&cache_dev->seg_lock);
|
||||
*seg_id = find_next_zero_bit(cache_dev->seg_bitmap, cache_dev->seg_num, 0);
|
||||
if (*seg_id == cache_dev->seg_num) {
|
||||
ret = -ENOSPC;
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
__set_bit(*seg_id, cache_dev->seg_bitmap);
|
||||
ret = 0;
|
||||
unlock:
|
||||
mutex_unlock(&cache_dev->seg_lock);
|
||||
return ret;
|
||||
}
|
||||
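For reference, a minimal sketch (not part of the patch; the helper name is invented) of the superblock checksum convention used by sb_init() and sb_validate() above: the crc32c is seeded with PCACHE_CRC_SEED and covers every byte of struct pcache_sb after the leading 4-byte crc field.

static u32 pcache_sb_calc_crc(const struct pcache_sb *sb)
{
	/* checksum everything after the crc field itself */
	return crc32c(PCACHE_CRC_SEED, (const void *)sb + 4,
		      sizeof(struct pcache_sb) - 4);
}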
70
drivers/md/dm-pcache/cache_dev.h
Normal file
@@ -0,0 +1,70 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#ifndef _PCACHE_CACHE_DEV_H
|
||||
#define _PCACHE_CACHE_DEV_H
|
||||
|
||||
#include <linux/device.h>
|
||||
#include <linux/device-mapper.h>
|
||||
|
||||
#include "pcache_internal.h"
|
||||
|
||||
#define PCACHE_MAGIC 0x65B05EFA96C596EFULL
|
||||
|
||||
#define PCACHE_SB_OFF (4 * PCACHE_KB)
|
||||
#define PCACHE_SB_SIZE (4 * PCACHE_KB)
|
||||
|
||||
#define PCACHE_CACHE_INFO_OFF (PCACHE_SB_OFF + PCACHE_SB_SIZE)
|
||||
#define PCACHE_CACHE_INFO_SIZE (4 * PCACHE_KB)
|
||||
|
||||
#define PCACHE_CACHE_CTRL_OFF (PCACHE_CACHE_INFO_OFF + (PCACHE_CACHE_INFO_SIZE * PCACHE_META_INDEX_MAX))
|
||||
#define PCACHE_CACHE_CTRL_SIZE (4 * PCACHE_KB)
|
||||
|
||||
#define PCACHE_SEGMENTS_OFF (PCACHE_CACHE_CTRL_OFF + PCACHE_CACHE_CTRL_SIZE)
|
||||
#define PCACHE_SEG_INFO_SIZE (4 * PCACHE_KB)
|
||||
|
||||
#define PCACHE_CACHE_DEV_SIZE_MIN (512 * PCACHE_MB) /* 512 MB */
|
||||
#define PCACHE_SEG_SIZE (16 * PCACHE_MB) /* Size of each PCACHE segment (16 MB) */
|
||||
|
||||
#define CACHE_DEV_SB(cache_dev) ((struct pcache_sb *)(cache_dev->mapping + PCACHE_SB_OFF))
|
||||
#define CACHE_DEV_CACHE_INFO(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_INFO_OFF)
|
||||
#define CACHE_DEV_CACHE_CTRL(cache_dev) ((void *)cache_dev->mapping + PCACHE_CACHE_CTRL_OFF)
|
||||
#define CACHE_DEV_SEGMENTS(cache_dev) ((void *)cache_dev->mapping + PCACHE_SEGMENTS_OFF)
|
||||
#define CACHE_DEV_SEGMENT(cache_dev, id) ((void *)CACHE_DEV_SEGMENTS(cache_dev) + (u64)id * PCACHE_SEG_SIZE)
|
||||
|
||||
/*
|
||||
* PCACHE SB flags configured during formatting
|
||||
*
|
||||
* The PCACHE_SB_F_xxx flags define registration requirements based on cache_dev
|
||||
* formatting. For a machine to register a cache_dev:
|
||||
* - PCACHE_SB_F_BIGENDIAN: Requires a big-endian machine.
|
||||
*/
|
||||
#define PCACHE_SB_F_BIGENDIAN BIT(0)
|
||||
|
||||
struct pcache_sb {
|
||||
__le32 crc;
|
||||
__le32 flags;
|
||||
__le64 magic;
|
||||
|
||||
__le32 seg_num;
|
||||
};
|
||||
|
||||
struct pcache_cache_dev {
|
||||
u32 sb_flags;
|
||||
u32 seg_num;
|
||||
void *mapping;
|
||||
bool use_vmap;
|
||||
|
||||
struct dm_dev *dm_dev;
|
||||
|
||||
struct mutex seg_lock;
|
||||
unsigned long *seg_bitmap;
|
||||
};
|
||||
|
||||
struct dm_pcache;
|
||||
int cache_dev_start(struct dm_pcache *pcache);
|
||||
void cache_dev_stop(struct dm_pcache *pcache);
|
||||
|
||||
void cache_dev_zero_range(struct pcache_cache_dev *cache_dev, void *pos, u32 size);
|
||||
|
||||
int cache_dev_get_empty_segment_id(struct pcache_cache_dev *cache_dev, u32 *seg_id);
|
||||
|
||||
#endif /* _PCACHE_CACHE_DEV_H */
|
||||
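A hedged illustration of how the layout macros above compose; the helper name is invented. The dax address of segment N sits at a fixed offset past the metadata area, mirroring CACHE_DEV_SEGMENT() and the nr_segs computation in sb_init().

static void *pcache_seg_addr_sketch(struct pcache_cache_dev *cache_dev, u32 seg_id)
{
	/* sb, the cache_info copies and cache_ctrl precede the segment area */
	return cache_dev->mapping + PCACHE_SEGMENTS_OFF +
	       (u64)seg_id * PCACHE_SEG_SIZE;
}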
170
drivers/md/dm-pcache/cache_gc.c
Normal file
@@ -0,0 +1,170 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#include "cache.h"
|
||||
#include "backing_dev.h"
|
||||
#include "cache_dev.h"
|
||||
#include "dm_pcache.h"
|
||||
|
||||
/**
|
||||
* cache_key_gc - Releases the reference of a cache key segment.
|
||||
* @cache: Pointer to the pcache_cache structure.
|
||||
* @key: Pointer to the cache key to be garbage collected.
|
||||
*
|
||||
* This function decrements the reference count of the cache segment
|
||||
* associated with the given key. If the reference count drops to zero,
|
||||
* the segment may be invalidated and reused.
|
||||
*/
|
||||
static void cache_key_gc(struct pcache_cache *cache, struct pcache_cache_key *key)
|
||||
{
|
||||
cache_seg_put(key->cache_pos.cache_seg);
|
||||
}
|
||||
|
||||
static bool need_gc(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail, struct pcache_cache_pos *key_tail)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
struct pcache_cache_kset_onmedia *kset_onmedia;
|
||||
void *dirty_addr, *key_addr;
|
||||
u32 segs_used, segs_gc_threshold, to_copy;
|
||||
int ret;
|
||||
|
||||
dirty_addr = cache_pos_addr(dirty_tail);
|
||||
key_addr = cache_pos_addr(key_tail);
|
||||
if (dirty_addr == key_addr) {
|
||||
pcache_dev_debug(pcache, "key tail is equal to dirty tail: %u:%u\n",
|
||||
dirty_tail->cache_seg->cache_seg_id,
|
||||
dirty_tail->seg_off);
|
||||
return false;
|
||||
}
|
||||
|
||||
kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf;
|
||||
|
||||
to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - key_tail->seg_off);
|
||||
ret = copy_mc_to_kernel(kset_onmedia, key_addr, to_copy);
|
||||
if (ret) {
|
||||
pcache_dev_err(pcache, "error to read kset: %d", ret);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Check if kset_onmedia is corrupted */
|
||||
if (kset_onmedia->magic != PCACHE_KSET_MAGIC) {
|
||||
pcache_dev_debug(pcache, "gc error: magic is not as expected. key_tail: %u:%u magic: %llx, expected: %llx\n",
|
||||
key_tail->cache_seg->cache_seg_id, key_tail->seg_off,
|
||||
kset_onmedia->magic, PCACHE_KSET_MAGIC);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Verify the CRC of the kset_onmedia */
|
||||
if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) {
|
||||
pcache_dev_debug(pcache, "gc error: crc is not as expected. crc: %x, expected: %x\n",
|
||||
cache_kset_crc(kset_onmedia), kset_onmedia->crc);
|
||||
return false;
|
||||
}
|
||||
|
||||
segs_used = bitmap_weight(cache->seg_map, cache->n_segs);
|
||||
segs_gc_threshold = cache->n_segs * pcache_cache_get_gc_percent(cache) / 100;
|
||||
if (segs_used < segs_gc_threshold) {
|
||||
pcache_dev_debug(pcache, "segs_used: %u, segs_gc_threshold: %u\n", segs_used, segs_gc_threshold);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* last_kset_gc - Advances the garbage collection for the last kset.
|
||||
* @cache: Pointer to the pcache_cache structure.
|
||||
* @kset_onmedia: Pointer to the kset_onmedia structure for the last kset.
|
||||
*/
|
||||
static void last_kset_gc(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
struct pcache_cache_segment *cur_seg, *next_seg;
|
||||
|
||||
cur_seg = cache->key_tail.cache_seg;
|
||||
|
||||
next_seg = &cache->segments[kset_onmedia->next_cache_seg_id];
|
||||
|
||||
mutex_lock(&cache->key_tail_lock);
|
||||
cache->key_tail.cache_seg = next_seg;
|
||||
cache->key_tail.seg_off = 0;
|
||||
cache_encode_key_tail(cache);
|
||||
mutex_unlock(&cache->key_tail_lock);
|
||||
|
||||
pcache_dev_debug(pcache, "gc advance kset seg: %u\n", cur_seg->cache_seg_id);
|
||||
|
||||
spin_lock(&cache->seg_map_lock);
|
||||
__clear_bit(cur_seg->cache_seg_id, cache->seg_map);
|
||||
spin_unlock(&cache->seg_map_lock);
|
||||
}
|
||||
|
||||
void pcache_cache_gc_fn(struct work_struct *work)
|
||||
{
|
||||
struct pcache_cache *cache = container_of(work, struct pcache_cache, gc_work.work);
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
struct pcache_cache_pos dirty_tail, key_tail;
|
||||
struct pcache_cache_kset_onmedia *kset_onmedia;
|
||||
struct pcache_cache_key_onmedia *key_onmedia;
|
||||
struct pcache_cache_key *key;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->gc_kset_onmedia_buf;
|
||||
|
||||
while (true) {
|
||||
if (pcache_is_stopping(pcache) || atomic_read(&cache->gc_errors))
|
||||
return;
|
||||
|
||||
/* Get new tail positions */
|
||||
mutex_lock(&cache->dirty_tail_lock);
|
||||
cache_pos_copy(&dirty_tail, &cache->dirty_tail);
|
||||
mutex_unlock(&cache->dirty_tail_lock);
|
||||
|
||||
mutex_lock(&cache->key_tail_lock);
|
||||
cache_pos_copy(&key_tail, &cache->key_tail);
|
||||
mutex_unlock(&cache->key_tail_lock);
|
||||
|
||||
if (!need_gc(cache, &dirty_tail, &key_tail))
|
||||
break;
|
||||
|
||||
if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) {
|
||||
/* Don't move to the next segment if dirty_tail has not moved */
|
||||
if (dirty_tail.cache_seg == key_tail.cache_seg)
|
||||
break;
|
||||
|
||||
last_kset_gc(cache, kset_onmedia);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (i = 0; i < kset_onmedia->key_num; i++) {
|
||||
struct pcache_cache_key key_tmp = { 0 };
|
||||
|
||||
key_onmedia = &kset_onmedia->data[i];
|
||||
|
||||
key = &key_tmp;
|
||||
cache_key_init(&cache->req_key_tree, key);
|
||||
|
||||
ret = cache_key_decode(cache, key_onmedia, key);
|
||||
if (ret) {
|
||||
/* Return without re-arming the gc work, and prevent future
* gc, because we cannot retry a partially-gc-ed kset.
*/
|
||||
atomic_inc(&cache->gc_errors);
|
||||
pcache_dev_err(pcache, "failed to decode cache key in gc\n");
|
||||
return;
|
||||
}
|
||||
|
||||
cache_key_gc(cache, key);
|
||||
}
|
||||
|
||||
pcache_dev_debug(pcache, "gc advance: %u:%u %u\n",
|
||||
key_tail.cache_seg->cache_seg_id,
|
||||
key_tail.seg_off,
|
||||
get_kset_onmedia_size(kset_onmedia));
|
||||
|
||||
mutex_lock(&cache->key_tail_lock);
|
||||
cache_pos_advance(&cache->key_tail, get_kset_onmedia_size(kset_onmedia));
|
||||
cache_encode_key_tail(cache);
|
||||
mutex_unlock(&cache->key_tail_lock);
|
||||
}
|
||||
|
||||
queue_delayed_work(cache_get_wq(cache), &cache->gc_work, PCACHE_CACHE_GC_INTERVAL);
|
||||
}
|
||||
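A minimal sketch of the trigger condition evaluated by need_gc() above, assuming the percentage semantics of pcache_cache_get_gc_percent(); the helper name is illustrative only.

static bool pcache_gc_wanted(u32 segs_used, u32 n_segs, u32 gc_percent)
{
	/* gc starts once used segments reach the configured share of the cache */
	return segs_used >= n_segs * gc_percent / 100;
}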
888
drivers/md/dm-pcache/cache_key.c
Normal file
@@ -0,0 +1,888 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#include "cache.h"
|
||||
#include "backing_dev.h"
|
||||
#include "cache_dev.h"
|
||||
#include "dm_pcache.h"
|
||||
|
||||
struct pcache_cache_kset_onmedia pcache_empty_kset = { 0 };
|
||||
|
||||
void cache_key_init(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key)
|
||||
{
|
||||
kref_init(&key->ref);
|
||||
key->cache_tree = cache_tree;
|
||||
INIT_LIST_HEAD(&key->list_node);
|
||||
RB_CLEAR_NODE(&key->rb_node);
|
||||
}
|
||||
|
||||
struct pcache_cache_key *cache_key_alloc(struct pcache_cache_tree *cache_tree, gfp_t gfp_mask)
|
||||
{
|
||||
struct pcache_cache_key *key;
|
||||
|
||||
key = mempool_alloc(&cache_tree->key_pool, gfp_mask);
|
||||
if (!key)
|
||||
return NULL;
|
||||
|
||||
memset(key, 0, sizeof(struct pcache_cache_key));
|
||||
cache_key_init(cache_tree, key);
|
||||
|
||||
return key;
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_key_get - Increment the reference count of a cache key.
|
||||
* @key: Pointer to the pcache_cache_key structure.
|
||||
*
|
||||
* This function increments the reference count of the specified cache key,
|
||||
* ensuring that it is not freed while still in use.
|
||||
*/
|
||||
void cache_key_get(struct pcache_cache_key *key)
|
||||
{
|
||||
kref_get(&key->ref);
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_key_destroy - Free a cache key structure when its reference count drops to zero.
|
||||
* @ref: Pointer to the kref structure.
|
||||
*
|
||||
* This function is called when the reference count of the cache key reaches zero.
|
||||
* It frees the allocated cache key back to the slab cache.
|
||||
*/
|
||||
static void cache_key_destroy(struct kref *ref)
|
||||
{
|
||||
struct pcache_cache_key *key = container_of(ref, struct pcache_cache_key, ref);
|
||||
struct pcache_cache_tree *cache_tree = key->cache_tree;
|
||||
|
||||
mempool_free(key, &cache_tree->key_pool);
|
||||
}
|
||||
|
||||
void cache_key_put(struct pcache_cache_key *key)
|
||||
{
|
||||
kref_put(&key->ref, cache_key_destroy);
|
||||
}
|
||||
|
||||
void cache_pos_advance(struct pcache_cache_pos *pos, u32 len)
|
||||
{
|
||||
/* Ensure enough space remains in the current segment */
|
||||
BUG_ON(cache_seg_remain(pos) < len);
|
||||
|
||||
pos->seg_off += len;
|
||||
}
|
||||
|
||||
static void cache_key_encode(struct pcache_cache *cache,
|
||||
struct pcache_cache_key_onmedia *key_onmedia,
|
||||
struct pcache_cache_key *key)
|
||||
{
|
||||
key_onmedia->off = key->off;
|
||||
key_onmedia->len = key->len;
|
||||
|
||||
key_onmedia->cache_seg_id = key->cache_pos.cache_seg->cache_seg_id;
|
||||
key_onmedia->cache_seg_off = key->cache_pos.seg_off;
|
||||
|
||||
key_onmedia->seg_gen = key->seg_gen;
|
||||
key_onmedia->flags = key->flags;
|
||||
|
||||
if (cache_data_crc_on(cache))
|
||||
key_onmedia->data_crc = cache_key_data_crc(key);
|
||||
}
|
||||
|
||||
int cache_key_decode(struct pcache_cache *cache,
|
||||
struct pcache_cache_key_onmedia *key_onmedia,
|
||||
struct pcache_cache_key *key)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
|
||||
key->off = key_onmedia->off;
|
||||
key->len = key_onmedia->len;
|
||||
|
||||
key->cache_pos.cache_seg = &cache->segments[key_onmedia->cache_seg_id];
|
||||
key->cache_pos.seg_off = key_onmedia->cache_seg_off;
|
||||
|
||||
key->seg_gen = key_onmedia->seg_gen;
|
||||
key->flags = key_onmedia->flags;
|
||||
|
||||
if (cache_data_crc_on(cache) &&
|
||||
key_onmedia->data_crc != cache_key_data_crc(key)) {
|
||||
pcache_dev_err(pcache, "key: %llu:%u seg %u:%u data_crc error: %x, expected: %x\n",
|
||||
key->off, key->len, key->cache_pos.cache_seg->cache_seg_id,
|
||||
key->cache_pos.seg_off, cache_key_data_crc(key), key_onmedia->data_crc);
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void append_last_kset(struct pcache_cache *cache, u32 next_seg)
|
||||
{
|
||||
struct pcache_cache_kset_onmedia kset_onmedia = { 0 };
|
||||
|
||||
kset_onmedia.flags |= PCACHE_KSET_FLAGS_LAST;
|
||||
kset_onmedia.next_cache_seg_id = next_seg;
|
||||
kset_onmedia.magic = PCACHE_KSET_MAGIC;
|
||||
kset_onmedia.crc = cache_kset_crc(&kset_onmedia);
|
||||
|
||||
memcpy_flushcache(get_key_head_addr(cache), &kset_onmedia, sizeof(struct pcache_cache_kset_onmedia));
|
||||
pmem_wmb();
|
||||
cache_pos_advance(&cache->key_head, sizeof(struct pcache_cache_kset_onmedia));
|
||||
}
|
||||
|
||||
int cache_kset_close(struct pcache_cache *cache, struct pcache_cache_kset *kset)
|
||||
{
|
||||
struct pcache_cache_kset_onmedia *kset_onmedia;
|
||||
u32 kset_onmedia_size;
|
||||
int ret;
|
||||
|
||||
kset_onmedia = &kset->kset_onmedia;
|
||||
|
||||
if (!kset_onmedia->key_num)
|
||||
return 0;
|
||||
|
||||
kset_onmedia_size = struct_size(kset_onmedia, data, kset_onmedia->key_num);
|
||||
|
||||
spin_lock(&cache->key_head_lock);
|
||||
again:
|
||||
/* Reserve space for the last kset */
|
||||
if (cache_seg_remain(&cache->key_head) < kset_onmedia_size + sizeof(struct pcache_cache_kset_onmedia)) {
|
||||
struct pcache_cache_segment *next_seg;
|
||||
|
||||
next_seg = get_cache_segment(cache);
|
||||
if (!next_seg) {
|
||||
ret = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
/* clear outdated kset in next seg */
|
||||
memcpy_flushcache(next_seg->segment.data, &pcache_empty_kset,
|
||||
sizeof(struct pcache_cache_kset_onmedia));
|
||||
append_last_kset(cache, next_seg->cache_seg_id);
|
||||
cache->key_head.cache_seg = next_seg;
|
||||
cache->key_head.seg_off = 0;
|
||||
goto again;
|
||||
}
|
||||
|
||||
kset_onmedia->magic = PCACHE_KSET_MAGIC;
|
||||
kset_onmedia->crc = cache_kset_crc(kset_onmedia);
|
||||
|
||||
/* clear outdated kset after current kset */
|
||||
memcpy_flushcache(get_key_head_addr(cache) + kset_onmedia_size, &pcache_empty_kset,
|
||||
sizeof(struct pcache_cache_kset_onmedia));
|
||||
/* write current kset into segment */
|
||||
memcpy_flushcache(get_key_head_addr(cache), kset_onmedia, kset_onmedia_size);
|
||||
pmem_wmb();
|
||||
|
||||
/* reset kset_onmedia */
|
||||
memset(kset_onmedia, 0, sizeof(struct pcache_cache_kset_onmedia));
|
||||
cache_pos_advance(&cache->key_head, kset_onmedia_size);
|
||||
|
||||
ret = 0;
|
||||
out:
|
||||
spin_unlock(&cache->key_head_lock);
|
||||
|
||||
return ret;
|
||||
}
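A hedged restatement of the space check in cache_kset_close() above (helper name invented): a kset fits in the current key segment only if room also remains for a trailing last-kset marker pointing at the next segment.

static bool kset_fits(u32 seg_remain, u32 kset_onmedia_size)
{
	/* reserve space for the "last kset" record appended by append_last_kset() */
	return seg_remain >= kset_onmedia_size +
			     sizeof(struct pcache_cache_kset_onmedia);
}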
|
||||
|
||||
/**
|
||||
* cache_key_append - Append a cache key to the related kset.
|
||||
* @cache: Pointer to the pcache_cache structure.
|
||||
* @key: Pointer to the cache key structure to append.
|
||||
* @force_close: Need to close current kset if true.
|
||||
*
|
||||
* This function appends a cache key to the appropriate kset. If the kset
|
||||
* is full, it closes the kset. If not, it queues a flush work to write
|
||||
* the kset to media.
|
||||
*
|
||||
* Returns 0 on success, or a negative error code on failure.
|
||||
*/
|
||||
int cache_key_append(struct pcache_cache *cache, struct pcache_cache_key *key, bool force_close)
|
||||
{
|
||||
struct pcache_cache_kset *kset;
|
||||
struct pcache_cache_kset_onmedia *kset_onmedia;
|
||||
struct pcache_cache_key_onmedia *key_onmedia;
|
||||
u32 kset_id = get_kset_id(cache, key->off);
|
||||
int ret = 0;
|
||||
|
||||
kset = get_kset(cache, kset_id);
|
||||
kset_onmedia = &kset->kset_onmedia;
|
||||
|
||||
spin_lock(&kset->kset_lock);
|
||||
key_onmedia = &kset_onmedia->data[kset_onmedia->key_num];
|
||||
cache_key_encode(cache, key_onmedia, key);
|
||||
|
||||
/* Check if the current kset has reached the maximum number of keys */
|
||||
if (++kset_onmedia->key_num == PCACHE_KSET_KEYS_MAX || force_close) {
|
||||
/* If full, close the kset */
|
||||
ret = cache_kset_close(cache, kset);
|
||||
if (ret) {
|
||||
kset_onmedia->key_num--;
|
||||
goto out;
|
||||
}
|
||||
} else {
|
||||
/* If not full, queue a delayed work to flush the kset */
|
||||
queue_delayed_work(cache_get_wq(cache), &kset->flush_work, 1 * HZ);
|
||||
}
|
||||
out:
|
||||
spin_unlock(&kset->kset_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_subtree_walk - Traverse the cache tree.
|
||||
* @ctx: Pointer to the context structure for traversal.
|
||||
*
|
||||
* This function traverses the cache tree starting from the specified node.
|
||||
* It calls the appropriate callback functions based on the relationships
|
||||
* between the keys in the cache tree.
|
||||
*
|
||||
* Returns 0 on success, or a negative error code on failure.
|
||||
*/
|
||||
int cache_subtree_walk(struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
struct pcache_cache_key *key_tmp, *key;
|
||||
struct rb_node *node_tmp;
|
||||
int ret = SUBTREE_WALK_RET_OK;
|
||||
|
||||
key = ctx->key;
|
||||
node_tmp = ctx->start_node;
|
||||
|
||||
while (node_tmp) {
|
||||
if (ctx->walk_done && ctx->walk_done(ctx))
|
||||
break;
|
||||
|
||||
key_tmp = CACHE_KEY(node_tmp);
|
||||
/*
|
||||
* If key_tmp ends before the start of key, continue to the next node.
|
||||
* |----------|
|
||||
* |=====|
|
||||
*/
|
||||
if (cache_key_lend(key_tmp) <= cache_key_lstart(key)) {
|
||||
if (ctx->after) {
|
||||
ret = ctx->after(key, key_tmp, ctx);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
goto next;
|
||||
}
|
||||
|
||||
/*
|
||||
* If key_tmp starts after the end of key, stop traversing.
|
||||
* |--------|
|
||||
* |====|
|
||||
*/
|
||||
if (cache_key_lstart(key_tmp) >= cache_key_lend(key)) {
|
||||
if (ctx->before) {
|
||||
ret = ctx->before(key, key_tmp, ctx);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/* Handle overlapping keys */
|
||||
if (cache_key_lstart(key_tmp) >= cache_key_lstart(key)) {
|
||||
/*
|
||||
* If key_tmp encompasses key.
|
||||
* |----------------| key_tmp
|
||||
* |===========| key
|
||||
*/
|
||||
if (cache_key_lend(key_tmp) >= cache_key_lend(key)) {
|
||||
if (ctx->overlap_tail) {
|
||||
ret = ctx->overlap_tail(key, key_tmp, ctx);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If key_tmp is contained within key.
|
||||
* |----| key_tmp
|
||||
* |==========| key
|
||||
*/
|
||||
if (ctx->overlap_contain) {
|
||||
ret = ctx->overlap_contain(key, key_tmp, ctx);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
|
||||
goto next;
|
||||
}
|
||||
|
||||
/*
|
||||
* If key_tmp starts before key ends but ends after key.
|
||||
* |-----------| key_tmp
|
||||
* |====| key
|
||||
*/
|
||||
if (cache_key_lend(key_tmp) > cache_key_lend(key)) {
|
||||
if (ctx->overlap_contained) {
|
||||
ret = ctx->overlap_contained(key, key_tmp, ctx);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* If key_tmp starts before key and ends within key.
|
||||
* |--------| key_tmp
|
||||
* |==========| key
|
||||
*/
|
||||
if (ctx->overlap_head) {
|
||||
ret = ctx->overlap_head(key, key_tmp, ctx);
|
||||
if (ret)
|
||||
goto out;
|
||||
}
|
||||
next:
|
||||
node_tmp = rb_next(node_tmp);
|
||||
}
|
||||
|
||||
out:
|
||||
if (ctx->walk_finally)
|
||||
ret = ctx->walk_finally(ctx, ret);
|
||||
|
||||
return ret;
|
||||
}
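To make the diagrammed overlap cases above concrete, here is a hedged, standalone sketch of the classification cache_subtree_walk() applies to each (key, key_tmp) pair; the enum and helper names are illustrative and not part of the patch.

enum pcache_overlap_sketch {
	OVL_AFTER,       /* key_tmp ends at or before key starts */
	OVL_BEFORE,      /* key_tmp starts at or after key ends  */
	OVL_TAIL,        /* key_tmp covers the tail of key (and beyond) */
	OVL_CONTAIN,     /* key_tmp lies fully inside key        */
	OVL_CONTAINED,   /* key lies fully inside key_tmp        */
	OVL_HEAD,        /* key_tmp covers only the head of key  */
};

static enum pcache_overlap_sketch classify_overlap(u64 k_start, u64 k_end,
						   u64 t_start, u64 t_end)
{
	if (t_end <= k_start)
		return OVL_AFTER;
	if (t_start >= k_end)
		return OVL_BEFORE;
	if (t_start >= k_start)
		return (t_end >= k_end) ? OVL_TAIL : OVL_CONTAIN;
	return (t_end > k_end) ? OVL_CONTAINED : OVL_HEAD;
}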
|
||||
|
||||
/**
|
||||
* cache_subtree_search - Search for a key in the cache tree.
|
||||
* @cache_subtree: Pointer to the cache tree structure.
|
||||
* @key: Pointer to the cache key to search for.
|
||||
* @parentp: Pointer to store the parent node of the found node.
|
||||
* @newp: Pointer to store the location where the new node should be inserted.
|
||||
* @delete_key_list: List to collect invalid keys for deletion.
|
||||
*
|
||||
* This function searches the cache tree for a specific key and returns
|
||||
* the node that is the predecessor of the key, or the first node if the key is
|
||||
* less than all keys in the tree. If any invalid keys are found during
|
||||
* the search, they are added to the delete_key_list for later cleanup.
|
||||
*
|
||||
* Returns a pointer to the previous node.
|
||||
*/
|
||||
struct rb_node *cache_subtree_search(struct pcache_cache_subtree *cache_subtree, struct pcache_cache_key *key,
|
||||
struct rb_node **parentp, struct rb_node ***newp,
|
||||
struct list_head *delete_key_list)
|
||||
{
|
||||
struct rb_node **new, *parent = NULL;
|
||||
struct pcache_cache_key *key_tmp;
|
||||
struct rb_node *prev_node = NULL;
|
||||
|
||||
new = &(cache_subtree->root.rb_node);
|
||||
while (*new) {
|
||||
key_tmp = container_of(*new, struct pcache_cache_key, rb_node);
|
||||
if (cache_key_invalid(key_tmp))
|
||||
list_add(&key_tmp->list_node, delete_key_list);
|
||||
|
||||
parent = *new;
|
||||
if (key_tmp->off >= key->off) {
|
||||
new = &((*new)->rb_left);
|
||||
} else {
|
||||
prev_node = *new;
|
||||
new = &((*new)->rb_right);
|
||||
}
|
||||
}
|
||||
|
||||
if (!prev_node)
|
||||
prev_node = rb_first(&cache_subtree->root);
|
||||
|
||||
if (parentp)
|
||||
*parentp = parent;
|
||||
|
||||
if (newp)
|
||||
*newp = new;
|
||||
|
||||
return prev_node;
|
||||
}
|
||||
|
||||
static struct pcache_cache_key *get_pre_alloc_key(struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
struct pcache_cache_key *key;
|
||||
|
||||
if (ctx->pre_alloc_key) {
|
||||
key = ctx->pre_alloc_key;
|
||||
ctx->pre_alloc_key = NULL;
|
||||
|
||||
return key;
|
||||
}
|
||||
|
||||
return cache_key_alloc(ctx->cache_tree, GFP_NOWAIT);
|
||||
}
|
||||
|
||||
/**
|
||||
* fixup_overlap_tail - Adjust the key when it overlaps at the tail.
|
||||
* @key: Pointer to the new cache key being inserted.
|
||||
* @key_tmp: Pointer to the existing key that overlaps.
|
||||
* @ctx: Pointer to the context for walking the cache tree.
|
||||
*
|
||||
* This function modifies the existing key (key_tmp) when there is an
|
||||
* overlap at the tail with the new key. If the modified key becomes
|
||||
* empty, it is deleted.
|
||||
*/
|
||||
static int fixup_overlap_tail(struct pcache_cache_key *key,
|
||||
struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
/*
|
||||
* |----------------| key_tmp
|
||||
* |===========| key
|
||||
*/
|
||||
BUG_ON(cache_key_empty(key));
|
||||
if (cache_key_empty(key_tmp)) {
|
||||
cache_key_delete(key_tmp);
|
||||
return SUBTREE_WALK_RET_RESEARCH;
|
||||
}
|
||||
|
||||
cache_key_cutfront(key_tmp, cache_key_lend(key) - cache_key_lstart(key_tmp));
|
||||
if (key_tmp->len == 0) {
|
||||
cache_key_delete(key_tmp);
|
||||
return SUBTREE_WALK_RET_RESEARCH;
|
||||
}
|
||||
|
||||
return SUBTREE_WALK_RET_OK;
|
||||
}
|
||||
|
||||
/**
|
||||
* fixup_overlap_contain - Handle case where new key completely contains an existing key.
|
||||
* @key: Pointer to the new cache key being inserted.
|
||||
* @key_tmp: Pointer to the existing key that is being contained.
|
||||
* @ctx: Pointer to the context for walking the cache tree.
|
||||
*
|
||||
* This function deletes the existing key (key_tmp) when the new key
|
||||
* completely contains it. It returns SUBTREE_WALK_RET_RESEARCH to indicate that the
|
||||
* tree structure may have changed, necessitating a re-insertion of
|
||||
* the new key.
|
||||
*/
|
||||
static int fixup_overlap_contain(struct pcache_cache_key *key,
|
||||
struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
/*
|
||||
* |----| key_tmp
|
||||
* |==========| key
|
||||
*/
|
||||
BUG_ON(cache_key_empty(key));
|
||||
cache_key_delete(key_tmp);
|
||||
|
||||
return SUBTREE_WALK_RET_RESEARCH;
|
||||
}
|
||||
|
||||
/**
|
||||
* fixup_overlap_contained - Handle overlap when a new key is contained in an existing key.
|
||||
* @key: The new cache key being inserted.
|
||||
* @key_tmp: The existing cache key that overlaps with the new key.
|
||||
* @ctx: Context for the cache tree walk.
|
||||
*
|
||||
* This function adjusts the existing key if the new key is contained
|
||||
* within it. If the existing key is empty, it indicates a placeholder key
|
||||
* that was inserted during a miss read. This placeholder will later be
|
||||
* updated with real data from the backing_dev, making it no longer an empty key.
|
||||
*
|
||||
* If we delete a key or insert a key, the structure of the entire cache tree may change,
* requiring a full re-search of the tree to find a new insertion point.
|
||||
*/
|
||||
static int fixup_overlap_contained(struct pcache_cache_key *key,
|
||||
struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
struct pcache_cache_tree *cache_tree = ctx->cache_tree;
|
||||
|
||||
/*
|
||||
* |-----------| key_tmp
|
||||
* |====| key
|
||||
*/
|
||||
BUG_ON(cache_key_empty(key));
|
||||
if (cache_key_empty(key_tmp)) {
|
||||
/* If key_tmp is empty, don't split it;
|
||||
* it's a placeholder key for miss reads that will be updated later.
|
||||
*/
|
||||
cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key));
|
||||
if (key_tmp->len == 0) {
|
||||
cache_key_delete(key_tmp);
|
||||
return SUBTREE_WALK_RET_RESEARCH;
|
||||
}
|
||||
} else {
|
||||
struct pcache_cache_key *key_fixup;
|
||||
bool need_research = false;
|
||||
|
||||
key_fixup = get_pre_alloc_key(ctx);
|
||||
if (!key_fixup)
|
||||
return SUBTREE_WALK_RET_NEED_KEY;
|
||||
|
||||
cache_key_copy(key_fixup, key_tmp);
|
||||
|
||||
/* Split key_tmp based on the new key's range */
|
||||
cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key));
|
||||
if (key_tmp->len == 0) {
|
||||
cache_key_delete(key_tmp);
|
||||
need_research = true;
|
||||
}
|
||||
|
||||
/* Create a new portion for key_fixup */
|
||||
cache_key_cutfront(key_fixup, cache_key_lend(key) - cache_key_lstart(key_tmp));
|
||||
if (key_fixup->len == 0) {
|
||||
cache_key_put(key_fixup);
|
||||
} else {
|
||||
/* Insert the new key into the cache */
|
||||
cache_key_insert(cache_tree, key_fixup, false);
|
||||
need_research = true;
|
||||
}
|
||||
|
||||
if (need_research)
|
||||
return SUBTREE_WALK_RET_RESEARCH;
|
||||
}
|
||||
|
||||
return SUBTREE_WALK_RET_OK;
|
||||
}
|
||||
|
||||
/**
|
||||
* fixup_overlap_head - Handle overlap when a new key overlaps with the head of an existing key.
|
||||
* @key: The new cache key being inserted.
|
||||
* @key_tmp: The existing cache key that overlaps with the new key.
|
||||
* @ctx: Context for the cache tree walk.
|
||||
*
|
||||
* This function adjusts the existing key if the new key overlaps
|
||||
* with the beginning of it. If the resulting key length is zero
|
||||
* after the adjustment, the key is deleted. This indicates that
|
||||
* the key no longer holds valid data and requires the tree to be
|
||||
* re-researched for a new insertion point.
|
||||
*/
|
||||
static int fixup_overlap_head(struct pcache_cache_key *key,
|
||||
struct pcache_cache_key *key_tmp, struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
/*
|
||||
* |--------| key_tmp
|
||||
* |==========| key
|
||||
*/
|
||||
BUG_ON(cache_key_empty(key));
|
||||
/* Adjust key_tmp by cutting back based on the new key's start */
|
||||
cache_key_cutback(key_tmp, cache_key_lend(key_tmp) - cache_key_lstart(key));
|
||||
if (key_tmp->len == 0) {
|
||||
/* If the adjusted key_tmp length is zero, delete it */
|
||||
cache_key_delete(key_tmp);
|
||||
return SUBTREE_WALK_RET_RESEARCH;
|
||||
}
|
||||
|
||||
return SUBTREE_WALK_RET_OK;
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_key_insert - Insert a new cache key into the cache tree.
|
||||
* @cache_tree: Pointer to the cache_tree structure.
|
||||
* @key: The cache key to insert.
|
||||
* @fixup: Indicates if this is a new key being inserted.
|
||||
*
|
||||
* This function searches for the appropriate location to insert
|
||||
* a new cache key into the cache tree. It handles key overlaps
|
||||
* and ensures any invalid keys are removed before insertion.
|
||||
*/
|
||||
void cache_key_insert(struct pcache_cache_tree *cache_tree, struct pcache_cache_key *key, bool fixup)
|
||||
{
|
||||
struct pcache_cache *cache = cache_tree->cache;
|
||||
struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 };
|
||||
struct rb_node **new, *parent = NULL;
|
||||
struct pcache_cache_subtree *cache_subtree;
|
||||
struct pcache_cache_key *key_tmp = NULL, *key_next;
|
||||
struct rb_node *prev_node = NULL;
|
||||
LIST_HEAD(delete_key_list);
|
||||
int ret;
|
||||
|
||||
cache_subtree = get_subtree(cache_tree, key->off);
|
||||
key->cache_subtree = cache_subtree;
|
||||
search:
|
||||
prev_node = cache_subtree_search(cache_subtree, key, &parent, &new, &delete_key_list);
|
||||
if (!list_empty(&delete_key_list)) {
|
||||
/* Remove invalid keys from the delete list */
|
||||
list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) {
|
||||
list_del_init(&key_tmp->list_node);
|
||||
cache_key_delete(key_tmp);
|
||||
}
|
||||
goto search;
|
||||
}
|
||||
|
||||
if (fixup) {
|
||||
/* Set up the context with the cache, start node, and new key */
|
||||
walk_ctx.cache_tree = cache_tree;
|
||||
walk_ctx.start_node = prev_node;
|
||||
walk_ctx.key = key;
|
||||
|
||||
/* Assign overlap handling functions for different scenarios */
|
||||
walk_ctx.overlap_tail = fixup_overlap_tail;
|
||||
walk_ctx.overlap_head = fixup_overlap_head;
|
||||
walk_ctx.overlap_contain = fixup_overlap_contain;
|
||||
walk_ctx.overlap_contained = fixup_overlap_contained;
|
||||
|
||||
ret = cache_subtree_walk(&walk_ctx);
|
||||
switch (ret) {
|
||||
case SUBTREE_WALK_RET_OK:
|
||||
break;
|
||||
case SUBTREE_WALK_RET_RESEARCH:
|
||||
goto search;
|
||||
case SUBTREE_WALK_RET_NEED_KEY:
|
||||
spin_unlock(&cache_subtree->tree_lock);
|
||||
pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_key with GFP_NOIO");
|
||||
walk_ctx.pre_alloc_key = cache_key_alloc(cache_tree, GFP_NOIO);
|
||||
spin_lock(&cache_subtree->tree_lock);
|
||||
goto search;
|
||||
default:
|
||||
BUG();
|
||||
}
|
||||
}
|
||||
|
||||
if (walk_ctx.pre_alloc_key)
|
||||
cache_key_put(walk_ctx.pre_alloc_key);
|
||||
|
||||
/* Link and insert the new key into the red-black tree */
|
||||
rb_link_node(&key->rb_node, parent, new);
|
||||
rb_insert_color(&key->rb_node, &cache_subtree->root);
|
||||
}
|
||||
|
||||
/**
|
||||
* clean_fn - Cleanup function to remove invalid keys from the cache tree.
|
||||
* @work: Pointer to the work_struct associated with the cleanup.
|
||||
*
|
||||
* This function cleans up invalid keys from the cache tree in the background
|
||||
* after a cache segment has been invalidated during cache garbage collection.
|
||||
* It processes a maximum of PCACHE_CLEAN_KEYS_MAX keys per iteration and holds
|
||||
* the tree lock to ensure thread safety.
|
||||
*/
|
||||
void clean_fn(struct work_struct *work)
|
||||
{
|
||||
struct pcache_cache *cache = container_of(work, struct pcache_cache, clean_work);
|
||||
struct pcache_cache_subtree *cache_subtree;
|
||||
struct rb_node *node;
|
||||
struct pcache_cache_key *key;
|
||||
int i, count;
|
||||
|
||||
for (i = 0; i < cache->req_key_tree.n_subtrees; i++) {
|
||||
cache_subtree = &cache->req_key_tree.subtrees[i];
|
||||
|
||||
again:
|
||||
if (pcache_is_stopping(CACHE_TO_PCACHE(cache)))
|
||||
return;
|
||||
|
||||
/* Delete up to PCACHE_CLEAN_KEYS_MAX keys in one iteration */
|
||||
count = 0;
|
||||
spin_lock(&cache_subtree->tree_lock);
|
||||
node = rb_first(&cache_subtree->root);
|
||||
while (node) {
|
||||
key = CACHE_KEY(node);
|
||||
node = rb_next(node);
|
||||
if (cache_key_invalid(key)) {
|
||||
count++;
|
||||
cache_key_delete(key);
|
||||
}
|
||||
|
||||
if (count >= PCACHE_CLEAN_KEYS_MAX) {
|
||||
/* Unlock and pause before continuing cleanup */
|
||||
spin_unlock(&cache_subtree->tree_lock);
|
||||
usleep_range(1000, 2000);
|
||||
goto again;
|
||||
}
|
||||
}
|
||||
spin_unlock(&cache_subtree->tree_lock);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* kset_flush_fn - Flush work for a cache kset.
|
||||
*
|
||||
* This function is called when a kset flush work is queued from
|
||||
* cache_key_append(). If the kset is full, it will be closed
|
||||
* immediately. If not, the flush work will be queued for later closure.
|
||||
*
|
||||
* If cache_kset_close detects that a new segment is required to store
|
||||
* the kset and there are no available segments, it will return an error.
|
||||
* In this scenario, a retry will be attempted.
|
||||
*/
|
||||
void kset_flush_fn(struct work_struct *work)
|
||||
{
|
||||
struct pcache_cache_kset *kset = container_of(work, struct pcache_cache_kset, flush_work.work);
|
||||
struct pcache_cache *cache = kset->cache;
|
||||
int ret;
|
||||
|
||||
if (pcache_is_stopping(CACHE_TO_PCACHE(cache)))
|
||||
return;
|
||||
|
||||
spin_lock(&kset->kset_lock);
|
||||
ret = cache_kset_close(cache, kset);
|
||||
spin_unlock(&kset->kset_lock);
|
||||
|
||||
if (ret) {
|
||||
/* Failed to flush kset, schedule a retry. */
|
||||
queue_delayed_work(cache_get_wq(cache), &kset->flush_work, msecs_to_jiffies(100));
|
||||
}
|
||||
}
|
||||
|
||||
static int kset_replay(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia)
|
||||
{
|
||||
struct pcache_cache_key_onmedia *key_onmedia;
|
||||
struct pcache_cache_subtree *cache_subtree;
|
||||
struct pcache_cache_key *key;
|
||||
int ret;
|
||||
int i;
|
||||
|
||||
for (i = 0; i < kset_onmedia->key_num; i++) {
|
||||
key_onmedia = &kset_onmedia->data[i];
|
||||
|
||||
key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO);
|
||||
ret = cache_key_decode(cache, key_onmedia, key);
|
||||
if (ret) {
|
||||
cache_key_put(key);
|
||||
goto err;
|
||||
}
|
||||
|
||||
__set_bit(key->cache_pos.cache_seg->cache_seg_id, cache->seg_map);
|
||||
|
||||
/* Check if the segment generation is valid for insertion. */
|
||||
if (key->seg_gen < key->cache_pos.cache_seg->gen) {
|
||||
cache_key_put(key);
|
||||
} else {
|
||||
cache_subtree = get_subtree(&cache->req_key_tree, key->off);
|
||||
spin_lock(&cache_subtree->tree_lock);
|
||||
cache_key_insert(&cache->req_key_tree, key, true);
|
||||
spin_unlock(&cache_subtree->tree_lock);
|
||||
}
|
||||
|
||||
cache_seg_get(key->cache_pos.cache_seg);
|
||||
}
|
||||
|
||||
return 0;
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
int cache_replay(struct pcache_cache *cache)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
struct pcache_cache_pos pos_tail;
|
||||
struct pcache_cache_pos *pos;
|
||||
struct pcache_cache_kset_onmedia *kset_onmedia;
|
||||
u32 to_copy, count = 0;
|
||||
int ret = 0;
|
||||
|
||||
kset_onmedia = kzalloc(PCACHE_KSET_ONMEDIA_SIZE_MAX, GFP_KERNEL);
|
||||
if (!kset_onmedia)
|
||||
return -ENOMEM;
|
||||
|
||||
cache_pos_copy(&pos_tail, &cache->key_tail);
|
||||
pos = &pos_tail;
|
||||
|
||||
/*
|
||||
* During cache replay nothing else accesses cache->seg_map,
* so bits can be set here without taking cache->seg_map_lock.
|
||||
*/
|
||||
__set_bit(pos->cache_seg->cache_seg_id, cache->seg_map);
|
||||
|
||||
while (true) {
|
||||
to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - pos->seg_off);
|
||||
ret = copy_mc_to_kernel(kset_onmedia, cache_pos_addr(pos), to_copy);
|
||||
if (ret) {
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (kset_onmedia->magic != PCACHE_KSET_MAGIC ||
|
||||
kset_onmedia->crc != cache_kset_crc(kset_onmedia)) {
|
||||
break;
|
||||
}
|
||||
|
||||
/* Process the last kset and prepare for the next segment. */
|
||||
if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) {
|
||||
struct pcache_cache_segment *next_seg;
|
||||
|
||||
pcache_dev_debug(pcache, "last kset replay, next: %u\n", kset_onmedia->next_cache_seg_id);
|
||||
|
||||
next_seg = &cache->segments[kset_onmedia->next_cache_seg_id];
|
||||
|
||||
pos->cache_seg = next_seg;
|
||||
pos->seg_off = 0;
|
||||
|
||||
__set_bit(pos->cache_seg->cache_seg_id, cache->seg_map);
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Replay the kset and check for errors. */
|
||||
ret = kset_replay(cache, kset_onmedia);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
/* Advance the position after processing the kset. */
|
||||
cache_pos_advance(pos, get_kset_onmedia_size(kset_onmedia));
|
||||
if (++count > 512) {
|
||||
cond_resched();
|
||||
count = 0;
|
||||
}
|
||||
}
|
||||
|
||||
/* Update the key_head position after replaying. */
|
||||
spin_lock(&cache->key_head_lock);
|
||||
cache_pos_copy(&cache->key_head, pos);
|
||||
spin_unlock(&cache->key_head_lock);
|
||||
out:
|
||||
kfree(kset_onmedia);
|
||||
return ret;
|
||||
}
|
||||
|
||||
int cache_tree_init(struct pcache_cache *cache, struct pcache_cache_tree *cache_tree, u32 n_subtrees)
|
||||
{
|
||||
int ret;
|
||||
u32 i;
|
||||
|
||||
cache_tree->cache = cache;
|
||||
cache_tree->n_subtrees = n_subtrees;
|
||||
|
||||
ret = mempool_init_slab_pool(&cache_tree->key_pool, 1024, key_cache);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
/*
|
||||
* Allocate and initialize the subtrees array.
|
||||
* Each element is a cache tree structure that contains
|
||||
* an RB tree root and a spinlock for protecting its contents.
|
||||
*/
|
||||
cache_tree->subtrees = kvcalloc(cache_tree->n_subtrees, sizeof(struct pcache_cache_subtree), GFP_KERNEL);
|
||||
if (!cache_tree->subtrees) {
|
||||
ret = -ENOMEM;
|
||||
goto key_pool_exit;
|
||||
}
|
||||
|
||||
for (i = 0; i < cache_tree->n_subtrees; i++) {
|
||||
struct pcache_cache_subtree *cache_subtree = &cache_tree->subtrees[i];
|
||||
|
||||
cache_subtree->root = RB_ROOT;
|
||||
spin_lock_init(&cache_subtree->tree_lock);
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
key_pool_exit:
|
||||
mempool_exit(&cache_tree->key_pool);
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
void cache_tree_clear(struct pcache_cache_tree *cache_tree)
|
||||
{
|
||||
struct pcache_cache_subtree *cache_subtree;
|
||||
struct rb_node *node;
|
||||
struct pcache_cache_key *key;
|
||||
u32 i;
|
||||
|
||||
for (i = 0; i < cache_tree->n_subtrees; i++) {
|
||||
cache_subtree = &cache_tree->subtrees[i];
|
||||
|
||||
spin_lock(&cache_subtree->tree_lock);
|
||||
node = rb_first(&cache_subtree->root);
|
||||
while (node) {
|
||||
key = CACHE_KEY(node);
|
||||
node = rb_next(node);
|
||||
|
||||
cache_key_delete(key);
|
||||
}
|
||||
spin_unlock(&cache_subtree->tree_lock);
|
||||
}
|
||||
}
|
||||
|
||||
void cache_tree_exit(struct pcache_cache_tree *cache_tree)
|
||||
{
|
||||
cache_tree_clear(cache_tree);
|
||||
kvfree(cache_tree->subtrees);
|
||||
mempool_exit(&cache_tree->key_pool);
|
||||
}
|
||||
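A hedged restatement of the kset validity test that terminates the replay loop in cache_replay() above (helper name invented): replay stops at the first on-media kset whose magic or CRC does not match, and that position becomes the new key_head.

static bool kset_onmedia_valid(struct pcache_cache_kset_onmedia *kset_onmedia)
{
	return kset_onmedia->magic == PCACHE_KSET_MAGIC &&
	       kset_onmedia->crc == cache_kset_crc(kset_onmedia);
}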
836
drivers/md/dm-pcache/cache_req.c
Normal file
@@ -0,0 +1,836 @@
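Before the read helpers below, a hedged sketch of the generation check they rely on (helper name and parameter types are assumptions): cache_copy_to_req_bio() only copies cached data out while the key's recorded segment generation is still current; otherwise the segment may have been reclaimed by gc.

static bool cache_key_is_stale(u64 key_gen, u64 seg_gen)
{
	/* gc bumps the segment generation when it reclaims a segment */
	return key_gen < seg_gen;
}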
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include "cache.h"
|
||||
#include "backing_dev.h"
|
||||
#include "cache_dev.h"
|
||||
#include "dm_pcache.h"
|
||||
|
||||
static int cache_data_head_init(struct pcache_cache *cache)
|
||||
{
|
||||
struct pcache_cache_segment *next_seg;
|
||||
struct pcache_cache_data_head *data_head;
|
||||
|
||||
data_head = get_data_head(cache);
|
||||
next_seg = get_cache_segment(cache);
|
||||
if (!next_seg)
|
||||
return -EBUSY;
|
||||
|
||||
cache_seg_get(next_seg);
|
||||
data_head->head_pos.cache_seg = next_seg;
|
||||
data_head->head_pos.seg_off = 0;
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_data_alloc - Allocate data for a cache key.
|
||||
* @cache: Pointer to the cache structure.
|
||||
* @key: Pointer to the cache key to allocate data for.
|
||||
*
|
||||
* This function tries to allocate space from the cache segment specified by the
|
||||
* data head. If the remaining space in the segment is insufficient to allocate
|
||||
* the requested length for the cache key, it will allocate whatever is available
|
||||
* and adjust the key's length accordingly. This function does not allocate
|
||||
* space that crosses segment boundaries.
|
||||
*/
|
||||
static int cache_data_alloc(struct pcache_cache *cache, struct pcache_cache_key *key)
|
||||
{
|
||||
struct pcache_cache_data_head *data_head;
|
||||
struct pcache_cache_pos *head_pos;
|
||||
struct pcache_cache_segment *cache_seg;
|
||||
u32 seg_remain;
|
||||
u32 allocated = 0, to_alloc;
|
||||
int ret = 0;
|
||||
|
||||
preempt_disable();
|
||||
data_head = get_data_head(cache);
|
||||
again:
|
||||
to_alloc = key->len - allocated;
|
||||
if (!data_head->head_pos.cache_seg) {
|
||||
seg_remain = 0;
|
||||
} else {
|
||||
cache_pos_copy(&key->cache_pos, &data_head->head_pos);
|
||||
key->seg_gen = key->cache_pos.cache_seg->gen;
|
||||
|
||||
head_pos = &data_head->head_pos;
|
||||
cache_seg = head_pos->cache_seg;
|
||||
seg_remain = cache_seg_remain(head_pos);
|
||||
}
|
||||
|
||||
if (seg_remain > to_alloc) {
|
||||
/* If remaining space in segment is sufficient for the cache key, allocate it. */
|
||||
cache_pos_advance(head_pos, to_alloc);
|
||||
allocated += to_alloc;
|
||||
cache_seg_get(cache_seg);
|
||||
} else if (seg_remain) {
|
||||
/* If remaining space is not enough, allocate the remaining space and adjust the cache key length. */
|
||||
cache_pos_advance(head_pos, seg_remain);
|
||||
key->len = seg_remain;
|
||||
|
||||
/* Get for key: obtain a reference to the cache segment for the key. */
|
||||
cache_seg_get(cache_seg);
|
||||
/* Put for head_pos->cache_seg: release the reference for the current head's segment. */
|
||||
cache_seg_put(head_pos->cache_seg);
|
||||
head_pos->cache_seg = NULL;
|
||||
} else {
|
||||
/* Initialize a new data head if no segment is available. */
|
||||
ret = cache_data_head_init(cache);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
goto again;
|
||||
}
|
||||
|
||||
out:
|
||||
preempt_enable();
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int cache_copy_from_req_bio(struct pcache_cache *cache, struct pcache_cache_key *key,
|
||||
struct pcache_request *pcache_req, u32 bio_off)
|
||||
{
|
||||
struct pcache_cache_pos *pos = &key->cache_pos;
|
||||
struct pcache_segment *segment;
|
||||
|
||||
segment = &pos->cache_seg->segment;
|
||||
|
||||
return segment_copy_from_bio(segment, pos->seg_off, key->len, pcache_req->bio, bio_off);
|
||||
}
|
||||
|
||||
static int cache_copy_to_req_bio(struct pcache_cache *cache, struct pcache_request *pcache_req,
|
||||
u32 bio_off, u32 len, struct pcache_cache_pos *pos, u64 key_gen)
|
||||
{
|
||||
struct pcache_cache_segment *cache_seg = pos->cache_seg;
|
||||
struct pcache_segment *segment = &cache_seg->segment;
|
||||
int ret;
|
||||
|
||||
spin_lock(&cache_seg->gen_lock);
|
||||
if (key_gen < cache_seg->gen) {
|
||||
spin_unlock(&cache_seg->gen_lock);
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
ret = segment_copy_to_bio(segment, pos->seg_off, len, pcache_req->bio, bio_off);
|
||||
spin_unlock(&cache_seg->gen_lock);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* miss_read_end_req - Handle the end of a miss read request.
|
||||
* @backing_req: Pointer to the request structure.
|
||||
* @read_ret: Return value of read.
|
||||
*
|
||||
* This function is called when a backing request to read data from
|
||||
* the backing_dev is completed. If the key associated with the request
|
||||
* is empty (a placeholder), it allocates cache space for the key,
|
||||
* copies the data read from the bio into the cache, and updates
|
||||
* the key's status. If the key has been overwritten by a write
|
||||
* request during this process, it will be deleted from the cache
|
||||
* tree and no further action will be taken.
|
||||
*/
|
||||
static void miss_read_end_req(struct pcache_backing_dev_req *backing_req, int read_ret)
|
||||
{
|
||||
void *priv_data = backing_req->priv_data;
|
||||
struct pcache_request *pcache_req = backing_req->req.upper_req;
|
||||
struct pcache_cache *cache = backing_req->backing_dev->cache;
|
||||
int ret;
|
||||
|
||||
if (priv_data) {
|
||||
struct pcache_cache_key *key;
|
||||
struct pcache_cache_subtree *cache_subtree;
|
||||
|
||||
key = (struct pcache_cache_key *)priv_data;
|
||||
cache_subtree = key->cache_subtree;
|
||||
|
||||
/* If this key was deleted from the cache_subtree by a write, key->flags would have been cleared,
* so if cache_key_empty() returns true, this key is still in the cache_subtree.
|
||||
*/
|
||||
spin_lock(&cache_subtree->tree_lock);
|
||||
if (cache_key_empty(key)) {
|
||||
/* Check if the backing request was successful. */
|
||||
if (read_ret) {
|
||||
cache_key_delete(key);
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
/* Allocate cache space for the key and copy data from the backing_dev. */
|
||||
ret = cache_data_alloc(cache, key);
|
||||
if (ret) {
|
||||
cache_key_delete(key);
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
ret = cache_copy_from_req_bio(cache, key, pcache_req, backing_req->req.bio_off);
|
||||
if (ret) {
|
||||
cache_seg_put(key->cache_pos.cache_seg);
|
||||
cache_key_delete(key);
|
||||
goto unlock;
|
||||
}
|
||||
key->flags &= ~PCACHE_CACHE_KEY_FLAGS_EMPTY;
|
||||
key->flags |= PCACHE_CACHE_KEY_FLAGS_CLEAN;
|
||||
|
||||
/* Append the key to the cache. */
|
||||
ret = cache_key_append(cache, key, false);
|
||||
if (ret) {
|
||||
cache_seg_put(key->cache_pos.cache_seg);
|
||||
cache_key_delete(key);
|
||||
goto unlock;
|
||||
}
|
||||
}
|
||||
unlock:
|
||||
spin_unlock(&cache_subtree->tree_lock);
|
||||
cache_key_put(key);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* submit_cache_miss_req - Submit a backing request when cache data is missing
|
||||
* @cache: The cache context that manages cache operations
|
||||
* @backing_req: The cache request containing information about the read request
|
||||
*
|
||||
* This function is used to handle cases where a cache read request cannot locate
|
||||
* the required data in the cache. When such a miss occurs during `cache_subtree_walk`,
|
||||
* it triggers a backing read request to fetch data from the backing storage.
|
||||
*
|
||||
* If `pcache_req->priv_data` is set, it points to a `pcache_cache_key`, representing
|
||||
* a new cache key to be inserted into the cache. The function calls `cache_key_insert`
|
||||
* to attempt adding the key. On insertion failure, it releases the key reference and
|
||||
* clears `priv_data` to avoid further processing.
|
||||
*/
|
||||
static void submit_cache_miss_req(struct pcache_cache *cache, struct pcache_backing_dev_req *backing_req)
|
||||
{
|
||||
if (backing_req->priv_data) {
|
||||
struct pcache_cache_key *key;
|
||||
|
||||
/* Attempt to insert the key into the cache if priv_data is set */
|
||||
key = (struct pcache_cache_key *)backing_req->priv_data;
|
||||
cache_key_insert(&cache->req_key_tree, key, true);
|
||||
}
|
||||
backing_dev_req_submit(backing_req, false);
|
||||
}
|
||||
|
||||
static void cache_miss_req_free(struct pcache_backing_dev_req *backing_req)
|
||||
{
|
||||
struct pcache_cache_key *key;
|
||||
|
||||
if (backing_req->priv_data) {
|
||||
key = backing_req->priv_data;
|
||||
backing_req->priv_data = NULL;
|
||||
cache_key_put(key); /* for ->priv_data */
|
||||
cache_key_put(key); /* for init ref in alloc */
|
||||
}
|
||||
|
||||
backing_dev_req_end(backing_req);
|
||||
}
|
||||
|
||||
static struct pcache_backing_dev_req *cache_miss_req_alloc(struct pcache_cache *cache,
|
||||
struct pcache_request *parent,
|
||||
gfp_t gfp_mask)
|
||||
{
|
||||
struct pcache_backing_dev *backing_dev = cache->backing_dev;
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
struct pcache_cache_key *key = NULL;
|
||||
struct pcache_backing_dev_req_opts req_opts = { 0 };
|
||||
|
||||
req_opts.type = BACKING_DEV_REQ_TYPE_REQ;
|
||||
req_opts.gfp_mask = gfp_mask;
|
||||
req_opts.req.upper_req = parent;
|
||||
|
||||
backing_req = backing_dev_req_alloc(backing_dev, &req_opts);
|
||||
if (!backing_req)
|
||||
return NULL;
|
||||
|
||||
key = cache_key_alloc(&cache->req_key_tree, gfp_mask);
|
||||
if (!key)
|
||||
goto free_backing_req;
|
||||
|
||||
cache_key_get(key);
|
||||
backing_req->priv_data = key;
|
||||
|
||||
return backing_req;
|
||||
|
||||
free_backing_req:
|
||||
cache_miss_req_free(backing_req);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static void cache_miss_req_init(struct pcache_cache *cache,
|
||||
struct pcache_backing_dev_req *backing_req,
|
||||
struct pcache_request *parent,
|
||||
u32 off, u32 len, bool insert_key)
|
||||
{
|
||||
struct pcache_cache_key *key;
|
||||
struct pcache_backing_dev_req_opts req_opts = { 0 };
|
||||
|
||||
req_opts.type = BACKING_DEV_REQ_TYPE_REQ;
|
||||
req_opts.req.upper_req = parent;
|
||||
req_opts.req.req_off = off;
|
||||
req_opts.req.len = len;
|
||||
req_opts.end_fn = miss_read_end_req;
|
||||
|
||||
backing_dev_req_init(backing_req, &req_opts);
|
||||
|
||||
if (insert_key) {
|
||||
key = backing_req->priv_data;
|
||||
key->off = parent->off + off;
|
||||
key->len = len;
|
||||
key->flags |= PCACHE_CACHE_KEY_FLAGS_EMPTY;
|
||||
} else {
|
||||
key = backing_req->priv_data;
|
||||
backing_req->priv_data = NULL;
|
||||
cache_key_put(key);
|
||||
cache_key_put(key);
|
||||
}
|
||||
}
|
||||
|
||||
static struct pcache_backing_dev_req *get_pre_alloc_req(struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
struct pcache_cache *cache = ctx->cache_tree->cache;
|
||||
struct pcache_request *pcache_req = ctx->pcache_req;
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
|
||||
if (ctx->pre_alloc_req) {
|
||||
backing_req = ctx->pre_alloc_req;
|
||||
ctx->pre_alloc_req = NULL;
|
||||
|
||||
return backing_req;
|
||||
}
|
||||
|
||||
return cache_miss_req_alloc(cache, pcache_req, GFP_NOWAIT);
|
||||
}
|
||||
|
||||
/*
|
||||
* In the process of walking the cache tree to locate cached data, this
|
||||
* function handles the situation where the requested data range lies
|
||||
* entirely before an existing cache node (`key_tmp`). This outcome
|
||||
* signifies that the target data is absent from the cache (cache miss).
|
||||
*
|
||||
* To fulfill this portion of the read request, the function creates a
|
||||
* backing request (`backing_req`) for the missing data range represented
|
||||
* by `key`. It then appends this request to the submission list in the
|
||||
* `ctx`, which will later be processed to retrieve the data from backing
|
||||
* storage. After setting up the backing request, `req_done` in `ctx` is
|
||||
* updated to reflect the length of the handled range, and the range
|
||||
* in `key` is adjusted by trimming off the portion that is now handled.
|
||||
*
|
||||
* The scenario handled here:
|
||||
*
|
||||
* |--------| key_tmp (existing cached range)
|
||||
* |====| key (requested range, preceding key_tmp)
|
||||
*
|
||||
* Since `key` is before `key_tmp`, it signifies that the requested data
|
||||
* range is missing in the cache (cache miss) and needs retrieval from
|
||||
* backing storage.
|
||||
*/
|
||||
static int read_before(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
struct pcache_cache *cache = ctx->cache_tree->cache;
|
||||
|
||||
/*
|
||||
* In this scenario, `key` represents a range that precedes `key_tmp`,
|
||||
* meaning the requested data range is missing from the cache tree
|
||||
* and must be retrieved from the backing_dev.
|
||||
*/
|
||||
backing_req = get_pre_alloc_req(ctx);
|
||||
if (!backing_req)
|
||||
return SUBTREE_WALK_RET_NEED_REQ;
|
||||
|
||||
cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true);
|
||||
|
||||
list_add(&backing_req->node, ctx->submit_req_list);
|
||||
ctx->req_done += key->len;
|
||||
cache_key_cutfront(key, key->len);
|
||||
|
||||
return SUBTREE_WALK_RET_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
* During cache_subtree_walk, this function manages a scenario where part of the
|
||||
* requested data range overlaps with an existing cache node (`key_tmp`).
|
||||
*
|
||||
* |----------------| key_tmp (existing cached range)
|
||||
* |===========| key (requested range, overlapping the tail of key_tmp)
|
||||
*/
|
||||
static int read_overlap_tail(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
struct pcache_cache *cache = ctx->cache_tree->cache;
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
u32 io_len;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Calculate the length of the non-overlapping portion of `key`
|
||||
* before `key_tmp`, representing the data missing in the cache.
|
||||
*/
|
||||
io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key);
|
||||
if (io_len) {
|
||||
backing_req = get_pre_alloc_req(ctx);
|
||||
if (!backing_req)
|
||||
return SUBTREE_WALK_RET_NEED_REQ;
|
||||
|
||||
cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true);
|
||||
|
||||
list_add(&backing_req->node, ctx->submit_req_list);
|
||||
ctx->req_done += io_len;
|
||||
cache_key_cutfront(key, io_len);
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle the overlapping portion by calculating the length of
|
||||
* the remaining data in `key` that coincides with `key_tmp`.
|
||||
*/
|
||||
io_len = cache_key_lend(key) - cache_key_lstart(key_tmp);
|
||||
if (cache_key_empty(key_tmp)) {
|
||||
backing_req = get_pre_alloc_req(ctx);
|
||||
if (!backing_req)
|
||||
return SUBTREE_WALK_RET_NEED_REQ;
|
||||
|
||||
cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false);
|
||||
submit_cache_miss_req(cache, backing_req);
|
||||
} else {
|
||||
ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done,
|
||||
io_len, &key_tmp->cache_pos, key_tmp->seg_gen);
|
||||
if (ret) {
|
||||
if (ret == -EINVAL) {
|
||||
cache_key_delete(key_tmp);
|
||||
return SUBTREE_WALK_RET_RESEARCH;
|
||||
}
|
||||
|
||||
ctx->ret = ret;
|
||||
return SUBTREE_WALK_RET_ERR;
|
||||
}
|
||||
}
|
||||
|
||||
ctx->req_done += io_len;
|
||||
cache_key_cutfront(key, io_len);
|
||||
|
||||
return SUBTREE_WALK_RET_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
 *    |----|              key_tmp (existing cached range)
 * |==========|           key (requested range)
|
||||
*/
|
||||
static int read_overlap_contain(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
struct pcache_cache *cache = ctx->cache_tree->cache;
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
u32 io_len;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Calculate the non-overlapping part of `key` before `key_tmp`
|
||||
* to identify the missing data length.
|
||||
*/
|
||||
io_len = cache_key_lstart(key_tmp) - cache_key_lstart(key);
|
||||
if (io_len) {
|
||||
backing_req = get_pre_alloc_req(ctx);
|
||||
if (!backing_req)
|
||||
return SUBTREE_WALK_RET_NEED_REQ;
|
||||
|
||||
cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, true);
|
||||
|
||||
list_add(&backing_req->node, ctx->submit_req_list);
|
||||
|
||||
ctx->req_done += io_len;
|
||||
cache_key_cutfront(key, io_len);
|
||||
}
|
||||
|
||||
/*
|
||||
* Handle the overlapping portion between `key` and `key_tmp`.
|
||||
*/
|
||||
io_len = key_tmp->len;
|
||||
if (cache_key_empty(key_tmp)) {
|
||||
backing_req = get_pre_alloc_req(ctx);
|
||||
if (!backing_req)
|
||||
return SUBTREE_WALK_RET_NEED_REQ;
|
||||
|
||||
cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false);
|
||||
submit_cache_miss_req(cache, backing_req);
|
||||
} else {
|
||||
ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done,
|
||||
io_len, &key_tmp->cache_pos, key_tmp->seg_gen);
|
||||
if (ret) {
|
||||
if (ret == -EINVAL) {
|
||||
cache_key_delete(key_tmp);
|
||||
return SUBTREE_WALK_RET_RESEARCH;
|
||||
}
|
||||
|
||||
ctx->ret = ret;
|
||||
return SUBTREE_WALK_RET_ERR;
|
||||
}
|
||||
}
|
||||
|
||||
ctx->req_done += io_len;
|
||||
cache_key_cutfront(key, io_len);
|
||||
|
||||
return SUBTREE_WALK_RET_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
 * |-----------|          key_tmp (existing cached range)
 *    |====|               key (requested range, fully within key_tmp)
|
||||
*
|
||||
* If `key_tmp` contains valid cached data, this function copies the relevant
|
||||
* portion to the request's bio. Otherwise, it sends a backing request to
|
||||
* fetch the required data range.
|
||||
*/
|
||||
static int read_overlap_contained(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
struct pcache_cache *cache = ctx->cache_tree->cache;
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
struct pcache_cache_pos pos;
|
||||
int ret;
|
||||
|
||||
/*
|
||||
* Check if `key_tmp` is empty, indicating a miss. If so, initiate
|
||||
* a backing request to fetch the required data for `key`.
|
||||
*/
|
||||
if (cache_key_empty(key_tmp)) {
|
||||
backing_req = get_pre_alloc_req(ctx);
|
||||
if (!backing_req)
|
||||
return SUBTREE_WALK_RET_NEED_REQ;
|
||||
|
||||
cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, false);
|
||||
submit_cache_miss_req(cache, backing_req);
|
||||
} else {
|
||||
cache_pos_copy(&pos, &key_tmp->cache_pos);
|
||||
cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp));
|
||||
|
||||
ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done,
|
||||
key->len, &pos, key_tmp->seg_gen);
|
||||
if (ret) {
|
||||
if (ret == -EINVAL) {
|
||||
cache_key_delete(key_tmp);
|
||||
return SUBTREE_WALK_RET_RESEARCH;
|
||||
}
|
||||
|
||||
ctx->ret = ret;
|
||||
return SUBTREE_WALK_RET_ERR;
|
||||
}
|
||||
}
|
||||
|
||||
ctx->req_done += key->len;
|
||||
cache_key_cutfront(key, key->len);
|
||||
|
||||
return SUBTREE_WALK_RET_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
 * |--------|                  key_tmp (existing cached range)
 *      |==========|           key (requested range, overlapping the head of key_tmp)
|
||||
*/
|
||||
static int read_overlap_head(struct pcache_cache_key *key, struct pcache_cache_key *key_tmp,
|
||||
struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
struct pcache_cache *cache = ctx->cache_tree->cache;
|
||||
struct pcache_backing_dev_req *backing_req;
|
||||
struct pcache_cache_pos pos;
|
||||
u32 io_len;
|
||||
int ret;
|
||||
|
||||
io_len = cache_key_lend(key_tmp) - cache_key_lstart(key);
|
||||
|
||||
if (cache_key_empty(key_tmp)) {
|
||||
backing_req = get_pre_alloc_req(ctx);
|
||||
if (!backing_req)
|
||||
return SUBTREE_WALK_RET_NEED_REQ;
|
||||
|
||||
cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, io_len, false);
|
||||
submit_cache_miss_req(cache, backing_req);
|
||||
} else {
|
||||
cache_pos_copy(&pos, &key_tmp->cache_pos);
|
||||
cache_pos_advance(&pos, cache_key_lstart(key) - cache_key_lstart(key_tmp));
|
||||
|
||||
ret = cache_copy_to_req_bio(ctx->cache_tree->cache, ctx->pcache_req, ctx->req_done,
|
||||
io_len, &pos, key_tmp->seg_gen);
|
||||
if (ret) {
|
||||
if (ret == -EINVAL) {
|
||||
cache_key_delete(key_tmp);
|
||||
return SUBTREE_WALK_RET_RESEARCH;
|
||||
}
|
||||
|
||||
ctx->ret = ret;
|
||||
return SUBTREE_WALK_RET_ERR;
|
||||
}
|
||||
}
|
||||
|
||||
ctx->req_done += io_len;
|
||||
cache_key_cutfront(key, io_len);
|
||||
|
||||
return SUBTREE_WALK_RET_OK;
|
||||
}
|
||||
|
||||
/**
|
||||
* read_walk_finally - Finalizes the cache read tree walk by submitting any
|
||||
* remaining backing requests
|
||||
* @ctx: Context structure holding information about the cache,
|
||||
* read request, and submission list
|
||||
 * @ret: the SUBTREE_WALK_RET_* status produced by the walk.
 *
 * This function is called at the end of the `cache_subtree_walk` during a
 * cache read operation. It first iterates through the submission list of
 * backing requests created during the walk, removing each request from the
 * list and submitting it. Then, if the walk completed with
 * SUBTREE_WALK_RET_OK and part of the data requested by `key` was still not
 * found in the cache tree, it sends one more backing request to retrieve
 * that remaining data.
 *
 * The scenario managed here includes:
 * - Iterating through `ctx->submit_req_list` to submit each backing request
 *   enqueued during the walk.
 * - Sending a backing request for the remaining length of `key` if it was
 *   not fulfilled by existing cache entries.
|
||||
*
|
||||
* This ensures all necessary backing requests for cache misses are submitted
|
||||
* to the backing storage to retrieve any data that could not be found in
|
||||
* the cache.
|
||||
*/
|
||||
static int read_walk_finally(struct pcache_cache_subtree_walk_ctx *ctx, int ret)
|
||||
{
|
||||
struct pcache_cache *cache = ctx->cache_tree->cache;
|
||||
struct pcache_backing_dev_req *backing_req, *next_req;
|
||||
struct pcache_cache_key *key = ctx->key;
|
||||
|
||||
list_for_each_entry_safe(backing_req, next_req, ctx->submit_req_list, node) {
|
||||
list_del_init(&backing_req->node);
|
||||
submit_cache_miss_req(ctx->cache_tree->cache, backing_req);
|
||||
}
|
||||
|
||||
if (ret != SUBTREE_WALK_RET_OK)
|
||||
return ret;
|
||||
|
||||
if (key->len) {
|
||||
backing_req = get_pre_alloc_req(ctx);
|
||||
if (!backing_req)
|
||||
return SUBTREE_WALK_RET_NEED_REQ;
|
||||
|
||||
cache_miss_req_init(cache, backing_req, ctx->pcache_req, ctx->req_done, key->len, true);
|
||||
submit_cache_miss_req(cache, backing_req);
|
||||
ctx->req_done += key->len;
|
||||
}
|
||||
|
||||
return SUBTREE_WALK_RET_OK;
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is used within `cache_subtree_walk` to determine whether the
|
||||
* read operation has covered the requested data length. It compares the
|
||||
* amount of data processed (`ctx->req_done`) with the total data length
|
||||
* specified in the original request (`ctx->pcache_req->data_len`).
|
||||
*
|
||||
* If `req_done` meets or exceeds the required data length, the function
|
||||
* returns `true`, indicating the walk is complete. Otherwise, it returns `false`,
|
||||
* signaling that additional data processing is needed to fulfill the request.
|
||||
*/
|
||||
static bool read_walk_done(struct pcache_cache_subtree_walk_ctx *ctx)
|
||||
{
|
||||
return (ctx->req_done >= ctx->pcache_req->data_len);
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_read - Process a read request by traversing the cache tree
|
||||
* @cache: Cache structure holding cache trees and related configurations
|
||||
* @pcache_req: Request structure with information about the data to read
|
||||
*
|
||||
* This function attempts to fulfill a read request by traversing the cache tree(s)
|
||||
* to locate cached data for the requested range. If parts of the data are missing
|
||||
* in the cache, backing requests are generated to retrieve the required segments.
|
||||
*
|
||||
* The function operates by initializing a key for the requested data range and
|
||||
* preparing a context (`walk_ctx`) to manage the cache tree traversal. The context
|
||||
* includes pointers to functions (e.g., `read_before`, `read_overlap_tail`) that handle
|
||||
* specific conditions encountered during the traversal. The `walk_finally` and `walk_done`
|
||||
* functions manage the end stages of the traversal, while the `delete_key_list` and
|
||||
* `submit_req_list` lists track any keys to be deleted or requests to be submitted.
|
||||
*
|
||||
 * The function first calculates the requested range and clamps it to the
 * current cache subtree (based on the subtree's size limit). It then locks
 * the subtree and performs a search to locate any matching keys. If there
 * are outdated keys, these are deleted, and the search is restarted to
 * ensure accurate data retrieval.
 *
 * If the requested range spans multiple cache subtrees, the function moves
 * on to the next subtree once the current range has been processed. This
 * continues until the entire requested data length has been handled.
|
||||
*/
|
||||
static int cache_read(struct pcache_cache *cache, struct pcache_request *pcache_req)
|
||||
{
|
||||
struct pcache_cache_key key_data = { .off = pcache_req->off, .len = pcache_req->data_len };
|
||||
struct pcache_cache_subtree *cache_subtree;
|
||||
struct pcache_cache_key *key_tmp = NULL, *key_next;
|
||||
struct rb_node *prev_node = NULL;
|
||||
struct pcache_cache_key *key = &key_data;
|
||||
struct pcache_cache_subtree_walk_ctx walk_ctx = { 0 };
|
||||
struct pcache_backing_dev_req *backing_req, *next_req;
|
||||
LIST_HEAD(delete_key_list);
|
||||
LIST_HEAD(submit_req_list);
|
||||
int ret;
|
||||
|
||||
walk_ctx.cache_tree = &cache->req_key_tree;
|
||||
walk_ctx.req_done = 0;
|
||||
walk_ctx.pcache_req = pcache_req;
|
||||
walk_ctx.before = read_before;
|
||||
walk_ctx.overlap_tail = read_overlap_tail;
|
||||
walk_ctx.overlap_head = read_overlap_head;
|
||||
walk_ctx.overlap_contain = read_overlap_contain;
|
||||
walk_ctx.overlap_contained = read_overlap_contained;
|
||||
walk_ctx.walk_finally = read_walk_finally;
|
||||
walk_ctx.walk_done = read_walk_done;
|
||||
walk_ctx.delete_key_list = &delete_key_list;
|
||||
walk_ctx.submit_req_list = &submit_req_list;
|
||||
|
||||
next:
|
||||
key->off = pcache_req->off + walk_ctx.req_done;
|
||||
key->len = pcache_req->data_len - walk_ctx.req_done;
|
||||
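	/*
	 * Clamp the key so it never crosses a cache subtree boundary; any
	 * remainder is handled by the next iteration of the "next:" loop.
	 */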
if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK))
|
||||
key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK);
|
||||
|
||||
cache_subtree = get_subtree(&cache->req_key_tree, key->off);
|
||||
spin_lock(&cache_subtree->tree_lock);
|
||||
search:
|
||||
prev_node = cache_subtree_search(cache_subtree, key, NULL, NULL, &delete_key_list);
|
||||
if (!list_empty(&delete_key_list)) {
|
||||
list_for_each_entry_safe(key_tmp, key_next, &delete_key_list, list_node) {
|
||||
list_del_init(&key_tmp->list_node);
|
||||
cache_key_delete(key_tmp);
|
||||
}
|
||||
goto search;
|
||||
}
|
||||
|
||||
walk_ctx.start_node = prev_node;
|
||||
walk_ctx.key = key;
|
||||
|
||||
ret = cache_subtree_walk(&walk_ctx);
|
||||
if (ret == SUBTREE_WALK_RET_RESEARCH)
|
||||
goto search;
|
||||
spin_unlock(&cache_subtree->tree_lock);
|
||||
|
||||
if (ret == SUBTREE_WALK_RET_ERR) {
|
||||
ret = walk_ctx.ret;
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (ret == SUBTREE_WALK_RET_NEED_REQ) {
|
||||
walk_ctx.pre_alloc_req = cache_miss_req_alloc(cache, pcache_req, GFP_NOIO);
|
||||
pcache_dev_debug(CACHE_TO_PCACHE(cache), "allocate pre_alloc_req with GFP_NOIO");
|
||||
}
|
||||
|
||||
if (walk_ctx.req_done < pcache_req->data_len)
|
||||
goto next;
|
||||
ret = 0;
|
||||
out:
|
||||
if (walk_ctx.pre_alloc_req)
|
||||
cache_miss_req_free(walk_ctx.pre_alloc_req);
|
||||
|
||||
list_for_each_entry_safe(backing_req, next_req, &submit_req_list, node) {
|
||||
list_del_init(&backing_req->node);
|
||||
backing_dev_req_end(backing_req);
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int cache_write(struct pcache_cache *cache, struct pcache_request *pcache_req)
|
||||
{
|
||||
struct pcache_cache_subtree *cache_subtree;
|
||||
struct pcache_cache_key *key;
|
||||
u64 offset = pcache_req->off;
|
||||
u32 length = pcache_req->data_len;
|
||||
u32 io_done = 0;
|
||||
int ret;
|
||||
|
||||
while (true) {
|
||||
if (io_done >= length)
|
||||
break;
|
||||
|
||||
key = cache_key_alloc(&cache->req_key_tree, GFP_NOIO);
|
||||
key->off = offset + io_done;
|
||||
key->len = length - io_done;
|
||||
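	/* As in cache_read(): limit each key to the current cache subtree. */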
if (key->len > PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK))
|
||||
key->len = PCACHE_CACHE_SUBTREE_SIZE - (key->off & PCACHE_CACHE_SUBTREE_SIZE_MASK);
|
||||
|
||||
ret = cache_data_alloc(cache, key);
|
||||
if (ret) {
|
||||
cache_key_put(key);
|
||||
goto err;
|
||||
}
|
||||
|
||||
ret = cache_copy_from_req_bio(cache, key, pcache_req, io_done);
|
||||
if (ret) {
|
||||
cache_seg_put(key->cache_pos.cache_seg);
|
||||
cache_key_put(key);
|
||||
goto err;
|
||||
}
|
||||
|
||||
cache_subtree = get_subtree(&cache->req_key_tree, key->off);
|
||||
spin_lock(&cache_subtree->tree_lock);
|
||||
cache_key_insert(&cache->req_key_tree, key, true);
|
||||
ret = cache_key_append(cache, key, pcache_req->bio->bi_opf & REQ_FUA);
|
||||
if (ret) {
|
||||
cache_seg_put(key->cache_pos.cache_seg);
|
||||
cache_key_delete(key);
|
||||
goto unlock;
|
||||
}
|
||||
|
||||
io_done += key->len;
|
||||
spin_unlock(&cache_subtree->tree_lock);
|
||||
}
|
||||
|
||||
return 0;
|
||||
unlock:
|
||||
spin_unlock(&cache_subtree->tree_lock);
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_flush - Flush all ksets to persist any pending cache data
|
||||
* @cache: Pointer to the cache structure
|
||||
*
|
||||
* This function iterates through all ksets associated with the provided `cache`
|
||||
* and ensures that any data marked for persistence is written to media. For each
|
||||
* kset, it acquires the kset lock, then invokes `cache_kset_close`, which handles
|
||||
* the persistence logic for that kset.
|
||||
*
|
||||
* If `cache_kset_close` encounters an error, the function exits immediately with
|
||||
* the respective error code, preventing the flush operation from proceeding to
|
||||
* subsequent ksets.
|
||||
*/
|
||||
int cache_flush(struct pcache_cache *cache)
|
||||
{
|
||||
struct pcache_cache_kset *kset;
|
||||
int ret;
|
||||
u32 i;
|
||||
|
||||
for (i = 0; i < cache->n_ksets; i++) {
|
||||
kset = get_kset(cache, i);
|
||||
|
||||
spin_lock(&kset->kset_lock);
|
||||
ret = cache_kset_close(cache, kset);
|
||||
spin_unlock(&kset->kset_lock);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
int pcache_cache_handle_req(struct pcache_cache *cache, struct pcache_request *pcache_req)
|
||||
{
|
||||
struct bio *bio = pcache_req->bio;
|
||||
|
||||
if (unlikely(bio->bi_opf & REQ_PREFLUSH))
|
||||
return cache_flush(cache);
|
||||
|
||||
if (bio_data_dir(bio) == READ)
|
||||
return cache_read(cache, pcache_req);
|
||||
|
||||
return cache_write(cache, pcache_req);
|
||||
}
|
||||
305
drivers/md/dm-pcache/cache_segment.c
Normal file
@@ -0,0 +1,305 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include "cache_dev.h"
|
||||
#include "cache.h"
|
||||
#include "backing_dev.h"
|
||||
#include "dm_pcache.h"
|
||||
|
||||
static inline struct pcache_segment_info *get_seg_info_addr(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
struct pcache_segment_info *seg_info_addr;
|
||||
u32 seg_id = cache_seg->segment.seg_id;
|
||||
void *seg_addr;
|
||||
|
||||
seg_addr = CACHE_DEV_SEGMENT(cache_seg->cache->cache_dev, seg_id);
|
||||
seg_info_addr = seg_addr + PCACHE_SEG_INFO_SIZE * cache_seg->info_index;
|
||||
|
||||
return seg_info_addr;
|
||||
}
|
||||
|
||||
static void cache_seg_info_write(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
struct pcache_segment_info *seg_info_addr;
|
||||
struct pcache_segment_info *seg_info = &cache_seg->cache_seg_info;
|
||||
|
||||
mutex_lock(&cache_seg->info_lock);
|
||||
seg_info->header.seq++;
|
||||
seg_info->header.crc = pcache_meta_crc(&seg_info->header, sizeof(struct pcache_segment_info));
|
||||
|
||||
seg_info_addr = get_seg_info_addr(cache_seg);
|
||||
memcpy_flushcache(seg_info_addr, seg_info, sizeof(struct pcache_segment_info));
|
||||
pmem_wmb();
|
||||
|
||||
cache_seg->info_index = (cache_seg->info_index + 1) % PCACHE_META_INDEX_MAX;
|
||||
mutex_unlock(&cache_seg->info_lock);
|
||||
}
|
||||
|
||||
static int cache_seg_info_load(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
struct pcache_segment_info *cache_seg_info_addr_base, *cache_seg_info_addr;
|
||||
struct pcache_cache_dev *cache_dev = cache_seg->cache->cache_dev;
|
||||
struct dm_pcache *pcache = CACHE_DEV_TO_PCACHE(cache_dev);
|
||||
u32 seg_id = cache_seg->segment.seg_id;
|
||||
int ret = 0;
|
||||
|
||||
cache_seg_info_addr_base = CACHE_DEV_SEGMENT(cache_dev, seg_id);
|
||||
|
||||
mutex_lock(&cache_seg->info_lock);
|
||||
cache_seg_info_addr = pcache_meta_find_latest(&cache_seg_info_addr_base->header,
|
||||
sizeof(struct pcache_segment_info),
|
||||
PCACHE_SEG_INFO_SIZE,
|
||||
&cache_seg->cache_seg_info);
|
||||
if (IS_ERR(cache_seg_info_addr)) {
|
||||
ret = PTR_ERR(cache_seg_info_addr);
|
||||
goto out;
|
||||
} else if (!cache_seg_info_addr) {
|
||||
ret = -EIO;
|
||||
goto out;
|
||||
}
|
||||
cache_seg->info_index = cache_seg_info_addr - cache_seg_info_addr_base;
|
||||
out:
|
||||
mutex_unlock(&cache_seg->info_lock);
|
||||
|
||||
if (ret)
|
||||
pcache_dev_err(pcache, "can't read segment info of segment: %u, ret: %d\n",
|
||||
cache_seg->segment.seg_id, ret);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int cache_seg_ctrl_load(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl;
|
||||
struct pcache_cache_seg_gen cache_seg_gen, *cache_seg_gen_addr;
|
||||
int ret = 0;
|
||||
|
||||
cache_seg_gen_addr = pcache_meta_find_latest(&cache_seg_ctrl->gen->header,
|
||||
sizeof(struct pcache_cache_seg_gen),
|
||||
sizeof(struct pcache_cache_seg_gen),
|
||||
&cache_seg_gen);
|
||||
if (IS_ERR(cache_seg_gen_addr)) {
|
||||
ret = PTR_ERR(cache_seg_gen_addr);
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (!cache_seg_gen_addr) {
|
||||
cache_seg->gen = 0;
|
||||
cache_seg->gen_seq = 0;
|
||||
cache_seg->gen_index = 0;
|
||||
goto out;
|
||||
}
|
||||
|
||||
cache_seg->gen = cache_seg_gen.gen;
|
||||
cache_seg->gen_seq = cache_seg_gen.header.seq;
|
||||
cache_seg->gen_index = (cache_seg_gen_addr - cache_seg_ctrl->gen);
|
||||
out:
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static inline struct pcache_cache_seg_gen *get_cache_seg_gen_addr(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
struct pcache_cache_seg_ctrl *cache_seg_ctrl = cache_seg->cache_seg_ctrl;
|
||||
|
||||
return (cache_seg_ctrl->gen + cache_seg->gen_index);
|
||||
}
|
||||
|
||||
/*
|
||||
* cache_seg_ctrl_write - write cache segment control information
|
||||
 * @cache_seg: the cache segment to update
|
||||
*
|
||||
* This function writes the control information of a cache segment to media.
|
||||
*
|
||||
* Although this updates shared control data, we intentionally do not use
|
||||
* any locking here. All accesses to control information are single-threaded:
|
||||
*
|
||||
* - All reads occur during the init phase, where no concurrent writes
|
||||
* can happen.
|
||||
* - Writes happen once during init and once when the last reference
|
||||
* to the segment is dropped in cache_seg_put().
|
||||
*
|
||||
* Both cases are guaranteed to be single-threaded, so there is no risk
|
||||
* of concurrent read/write races.
|
||||
*/
|
||||
static void cache_seg_ctrl_write(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
struct pcache_cache_seg_gen cache_seg_gen;
|
||||
|
||||
cache_seg_gen.gen = cache_seg->gen;
|
||||
cache_seg_gen.header.seq = ++cache_seg->gen_seq;
|
||||
cache_seg_gen.header.crc = pcache_meta_crc(&cache_seg_gen.header,
|
||||
sizeof(struct pcache_cache_seg_gen));
|
||||
|
||||
memcpy_flushcache(get_cache_seg_gen_addr(cache_seg), &cache_seg_gen, sizeof(struct pcache_cache_seg_gen));
|
||||
pmem_wmb();
|
||||
|
||||
cache_seg->gen_index = (cache_seg->gen_index + 1) % PCACHE_META_INDEX_MAX;
|
||||
}
|
||||
|
||||
static void cache_seg_ctrl_init(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
cache_seg->gen = 0;
|
||||
cache_seg->gen_seq = 0;
|
||||
cache_seg->gen_index = 0;
|
||||
cache_seg_ctrl_write(cache_seg);
|
||||
}
|
||||
|
||||
static int cache_seg_meta_load(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = cache_seg_info_load(cache_seg);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = cache_seg_ctrl_load(cache_seg);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
return 0;
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* cache_seg_set_next_seg - Sets the ID of the next segment
|
||||
* @cache_seg: Pointer to the cache segment structure.
|
||||
* @seg_id: The segment ID to set as the next segment.
|
||||
*
|
||||
* A pcache_cache allocates multiple cache segments, which are linked together
|
||||
* through next_seg. When loading a pcache_cache, the first cache segment can
|
||||
* be found using cache->seg_id, which allows access to all the cache segments.
|
||||
*/
|
||||
void cache_seg_set_next_seg(struct pcache_cache_segment *cache_seg, u32 seg_id)
|
||||
{
|
||||
cache_seg->cache_seg_info.flags |= PCACHE_SEG_INFO_FLAGS_HAS_NEXT;
|
||||
cache_seg->cache_seg_info.next_seg = seg_id;
|
||||
cache_seg_info_write(cache_seg);
|
||||
}
|
||||
|
||||
int cache_seg_init(struct pcache_cache *cache, u32 seg_id, u32 cache_seg_id,
|
||||
bool new_cache)
|
||||
{
|
||||
struct pcache_cache_dev *cache_dev = cache->cache_dev;
|
||||
struct pcache_cache_segment *cache_seg = &cache->segments[cache_seg_id];
|
||||
struct pcache_segment_init_options seg_options = { 0 };
|
||||
struct pcache_segment *segment = &cache_seg->segment;
|
||||
int ret;
|
||||
|
||||
cache_seg->cache = cache;
|
||||
cache_seg->cache_seg_id = cache_seg_id;
|
||||
spin_lock_init(&cache_seg->gen_lock);
|
||||
atomic_set(&cache_seg->refs, 0);
|
||||
mutex_init(&cache_seg->info_lock);
|
||||
|
||||
/* init pcache_segment */
|
||||
seg_options.type = PCACHE_SEGMENT_TYPE_CACHE_DATA;
|
||||
seg_options.data_off = PCACHE_CACHE_SEG_CTRL_OFF + PCACHE_CACHE_SEG_CTRL_SIZE;
|
||||
seg_options.seg_id = seg_id;
|
||||
seg_options.seg_info = &cache_seg->cache_seg_info;
|
||||
pcache_segment_init(cache_dev, segment, &seg_options);
|
||||
|
||||
cache_seg->cache_seg_ctrl = CACHE_DEV_SEGMENT(cache_dev, seg_id) + PCACHE_CACHE_SEG_CTRL_OFF;
|
||||
|
||||
if (new_cache) {
|
||||
cache_dev_zero_range(cache_dev, CACHE_DEV_SEGMENT(cache_dev, seg_id),
|
||||
PCACHE_SEG_INFO_SIZE * PCACHE_META_INDEX_MAX +
|
||||
PCACHE_CACHE_SEG_CTRL_SIZE);
|
||||
|
||||
cache_seg_ctrl_init(cache_seg);
|
||||
|
||||
cache_seg->info_index = 0;
|
||||
cache_seg_info_write(cache_seg);
|
||||
|
||||
/* clear outdated kset in segment */
|
||||
memcpy_flushcache(segment->data, &pcache_empty_kset, sizeof(struct pcache_cache_kset_onmedia));
|
||||
pmem_wmb();
|
||||
} else {
|
||||
ret = cache_seg_meta_load(cache_seg);
|
||||
if (ret)
|
||||
goto err;
|
||||
}
|
||||
|
||||
return 0;
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_cache_segment - Retrieves a free cache segment from the cache.
|
||||
* @cache: Pointer to the cache structure.
|
||||
*
|
||||
* This function attempts to find a free cache segment that can be used.
|
||||
* It locks the segment map and checks for the next available segment ID.
|
||||
* If a free segment is found, it initializes it and returns a pointer to the
|
||||
* cache segment structure. Returns NULL if no segments are available.
|
||||
*/
|
||||
struct pcache_cache_segment *get_cache_segment(struct pcache_cache *cache)
|
||||
{
|
||||
struct pcache_cache_segment *cache_seg;
|
||||
u32 seg_id;
|
||||
|
||||
spin_lock(&cache->seg_map_lock);
|
||||
again:
|
||||
seg_id = find_next_zero_bit(cache->seg_map, cache->n_segs, cache->last_cache_seg);
|
||||
if (seg_id == cache->n_segs) {
|
||||
/* reset the hint of ->last_cache_seg and retry */
|
||||
if (cache->last_cache_seg) {
|
||||
cache->last_cache_seg = 0;
|
||||
goto again;
|
||||
}
|
||||
cache->cache_full = true;
|
||||
spin_unlock(&cache->seg_map_lock);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* found an available cache_seg, mark it used in seg_map
|
||||
* and update the search hint ->last_cache_seg
|
||||
*/
|
||||
__set_bit(seg_id, cache->seg_map);
|
||||
cache->last_cache_seg = seg_id;
|
||||
spin_unlock(&cache->seg_map_lock);
|
||||
|
||||
cache_seg = &cache->segments[seg_id];
|
||||
cache_seg->cache_seg_id = seg_id;
|
||||
|
||||
return cache_seg;
|
||||
}
|
||||
|
||||
static void cache_seg_gen_increase(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
spin_lock(&cache_seg->gen_lock);
|
||||
cache_seg->gen++;
|
||||
spin_unlock(&cache_seg->gen_lock);
|
||||
|
||||
cache_seg_ctrl_write(cache_seg);
|
||||
}
|
||||
|
||||
void cache_seg_get(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
atomic_inc(&cache_seg->refs);
|
||||
}
|
||||
|
||||
static void cache_seg_invalidate(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
struct pcache_cache *cache;
|
||||
|
||||
cache = cache_seg->cache;
|
||||
cache_seg_gen_increase(cache_seg);
|
||||
|
||||
spin_lock(&cache->seg_map_lock);
|
||||
if (cache->cache_full)
|
||||
cache->cache_full = false;
|
||||
__clear_bit(cache_seg->cache_seg_id, cache->seg_map);
|
||||
spin_unlock(&cache->seg_map_lock);
|
||||
|
||||
pcache_defer_reqs_kick(CACHE_TO_PCACHE(cache));
|
||||
	/* clean_work will remove the stale keys for this segment from the key_tree */
|
||||
queue_work(cache_get_wq(cache), &cache->clean_work);
|
||||
}
|
||||
|
||||
void cache_seg_put(struct pcache_cache_segment *cache_seg)
|
||||
{
|
||||
if (atomic_dec_and_test(&cache_seg->refs))
|
||||
cache_seg_invalidate(cache_seg);
|
||||
}
|
||||
261
drivers/md/dm-pcache/cache_writeback.c
Normal file
@@ -0,0 +1,261 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#include <linux/bio.h>
|
||||
|
||||
#include "cache.h"
|
||||
#include "backing_dev.h"
|
||||
#include "cache_dev.h"
|
||||
#include "dm_pcache.h"
|
||||
|
||||
static void writeback_ctx_end(struct pcache_cache *cache, int ret)
|
||||
{
|
||||
if (ret && !cache->writeback_ctx.ret) {
|
||||
pcache_dev_err(CACHE_TO_PCACHE(cache), "writeback error: %d", ret);
|
||||
cache->writeback_ctx.ret = ret;
|
||||
}
|
||||
|
||||
if (!atomic_dec_and_test(&cache->writeback_ctx.pending))
|
||||
return;
|
||||
|
||||
if (!cache->writeback_ctx.ret) {
|
||||
backing_dev_flush(cache->backing_dev);
|
||||
|
||||
mutex_lock(&cache->dirty_tail_lock);
|
||||
cache_pos_advance(&cache->dirty_tail, cache->writeback_ctx.advance);
|
||||
cache_encode_dirty_tail(cache);
|
||||
mutex_unlock(&cache->dirty_tail_lock);
|
||||
}
|
||||
queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0);
|
||||
}
|
||||
|
||||
static void writeback_end_req(struct pcache_backing_dev_req *backing_req, int ret)
|
||||
{
|
||||
struct pcache_cache *cache = backing_req->priv_data;
|
||||
|
||||
mutex_lock(&cache->writeback_lock);
|
||||
writeback_ctx_end(cache, ret);
|
||||
mutex_unlock(&cache->writeback_lock);
|
||||
}
|
||||
|
||||
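/*
 * Returns true when the kset at @dirty_tail does not decode as a valid kset
 * (read failure, bad magic or bad crc), i.e. there is nothing left to write
 * back from that position.
 */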
static inline bool is_cache_clean(struct pcache_cache *cache, struct pcache_cache_pos *dirty_tail)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
struct pcache_cache_kset_onmedia *kset_onmedia;
|
||||
u32 to_copy;
|
||||
void *addr;
|
||||
int ret;
|
||||
|
||||
addr = cache_pos_addr(dirty_tail);
|
||||
kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf;
|
||||
|
||||
to_copy = min(PCACHE_KSET_ONMEDIA_SIZE_MAX, PCACHE_SEG_SIZE - dirty_tail->seg_off);
|
||||
ret = copy_mc_to_kernel(kset_onmedia, addr, to_copy);
|
||||
if (ret) {
|
||||
pcache_dev_err(pcache, "error to read kset: %d", ret);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Check if the magic number matches the expected value */
|
||||
if (kset_onmedia->magic != PCACHE_KSET_MAGIC) {
|
||||
pcache_dev_debug(pcache, "dirty_tail: %u:%u magic: %llx, not expected: %llx\n",
|
||||
dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off,
|
||||
kset_onmedia->magic, PCACHE_KSET_MAGIC);
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Verify the CRC checksum for data integrity */
|
||||
if (kset_onmedia->crc != cache_kset_crc(kset_onmedia)) {
|
||||
pcache_dev_debug(pcache, "dirty_tail: %u:%u crc: %x, not expected: %x\n",
|
||||
dirty_tail->cache_seg->cache_seg_id, dirty_tail->seg_off,
|
||||
cache_kset_crc(kset_onmedia), kset_onmedia->crc);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void cache_writeback_exit(struct pcache_cache *cache)
|
||||
{
|
||||
cancel_delayed_work_sync(&cache->writeback_work);
|
||||
backing_dev_flush(cache->backing_dev);
|
||||
cache_tree_exit(&cache->writeback_key_tree);
|
||||
}
|
||||
|
||||
int cache_writeback_init(struct pcache_cache *cache)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = cache_tree_init(cache, &cache->writeback_key_tree, 1);
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
atomic_set(&cache->writeback_ctx.pending, 0);
|
||||
|
||||
/* Queue delayed work to start writeback handling */
|
||||
queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, 0);
|
||||
|
||||
return 0;
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void cache_key_writeback(struct pcache_cache *cache, struct pcache_cache_key *key)
|
||||
{
|
||||
struct pcache_backing_dev_req *writeback_req;
|
||||
struct pcache_backing_dev_req_opts writeback_req_opts = { 0 };
|
||||
struct pcache_cache_pos *pos;
|
||||
void *addr;
|
||||
u32 seg_remain, req_len, done = 0;
|
||||
|
||||
if (cache_key_clean(key))
|
||||
return;
|
||||
|
||||
pos = &key->cache_pos;
|
||||
|
||||
seg_remain = cache_seg_remain(pos);
|
||||
BUG_ON(seg_remain < key->len);
|
||||
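	/*
	 * Issue one or more KMEM write requests for the key; each request
	 * covers at most the contiguous length reported by
	 * backing_dev_req_coalesced_max_len() for the current address.
	 */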
next_req:
|
||||
addr = cache_pos_addr(pos) + done;
|
||||
req_len = backing_dev_req_coalesced_max_len(addr, key->len - done);
|
||||
|
||||
writeback_req_opts.type = BACKING_DEV_REQ_TYPE_KMEM;
|
||||
writeback_req_opts.gfp_mask = GFP_NOIO;
|
||||
writeback_req_opts.end_fn = writeback_end_req;
|
||||
writeback_req_opts.priv_data = cache;
|
||||
|
||||
writeback_req_opts.kmem.data = addr;
|
||||
writeback_req_opts.kmem.opf = REQ_OP_WRITE;
|
||||
writeback_req_opts.kmem.len = req_len;
|
||||
writeback_req_opts.kmem.backing_off = key->off + done;
|
||||
|
||||
writeback_req = backing_dev_req_create(cache->backing_dev, &writeback_req_opts);
|
||||
|
||||
atomic_inc(&cache->writeback_ctx.pending);
|
||||
backing_dev_req_submit(writeback_req, true);
|
||||
|
||||
done += req_len;
|
||||
if (done < key->len)
|
||||
goto next_req;
|
||||
}
|
||||
|
||||
static void cache_wb_tree_writeback(struct pcache_cache *cache, u32 advance)
|
||||
{
|
||||
struct pcache_cache_tree *cache_tree = &cache->writeback_key_tree;
|
||||
struct pcache_cache_subtree *cache_subtree;
|
||||
struct rb_node *node;
|
||||
struct pcache_cache_key *key;
|
||||
u32 i;
|
||||
|
||||
cache->writeback_ctx.ret = 0;
|
||||
cache->writeback_ctx.advance = advance;
|
||||
atomic_set(&cache->writeback_ctx.pending, 1);
|
||||
|
||||
for (i = 0; i < cache_tree->n_subtrees; i++) {
|
||||
cache_subtree = &cache_tree->subtrees[i];
|
||||
|
||||
node = rb_first(&cache_subtree->root);
|
||||
while (node) {
|
||||
key = CACHE_KEY(node);
|
||||
node = rb_next(node);
|
||||
|
||||
cache_key_writeback(cache, key);
|
||||
cache_key_delete(key);
|
||||
}
|
||||
}
|
||||
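	/*
	 * Drop the initial reference taken by atomic_set(&pending, 1) above;
	 * the final completion advances dirty_tail and re-queues the
	 * writeback work.
	 */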
writeback_ctx_end(cache, 0);
|
||||
}
|
||||
|
||||
static int cache_kset_insert_tree(struct pcache_cache *cache, struct pcache_cache_kset_onmedia *kset_onmedia)
|
||||
{
|
||||
struct pcache_cache_key_onmedia *key_onmedia;
|
||||
struct pcache_cache_subtree *cache_subtree;
|
||||
struct pcache_cache_key *key;
|
||||
int ret;
|
||||
u32 i;
|
||||
|
||||
	/* Decode each key in the kset and insert it into the writeback key tree */
|
||||
for (i = 0; i < kset_onmedia->key_num; i++) {
|
||||
key_onmedia = &kset_onmedia->data[i];
|
||||
|
||||
key = cache_key_alloc(&cache->writeback_key_tree, GFP_NOIO);
|
||||
ret = cache_key_decode(cache, key_onmedia, key);
|
||||
if (ret) {
|
||||
cache_key_put(key);
|
||||
goto clear_tree;
|
||||
}
|
||||
|
||||
cache_subtree = get_subtree(&cache->writeback_key_tree, key->off);
|
||||
spin_lock(&cache_subtree->tree_lock);
|
||||
cache_key_insert(&cache->writeback_key_tree, key, true);
|
||||
spin_unlock(&cache_subtree->tree_lock);
|
||||
}
|
||||
|
||||
return 0;
|
||||
clear_tree:
|
||||
cache_tree_clear(&cache->writeback_key_tree);
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void last_kset_writeback(struct pcache_cache *cache,
|
||||
struct pcache_cache_kset_onmedia *last_kset_onmedia)
|
||||
{
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
struct pcache_cache_segment *next_seg;
|
||||
|
||||
pcache_dev_debug(pcache, "last kset, next: %u\n", last_kset_onmedia->next_cache_seg_id);
|
||||
|
||||
next_seg = &cache->segments[last_kset_onmedia->next_cache_seg_id];
|
||||
|
||||
mutex_lock(&cache->dirty_tail_lock);
|
||||
cache->dirty_tail.cache_seg = next_seg;
|
||||
cache->dirty_tail.seg_off = 0;
|
||||
cache_encode_dirty_tail(cache);
|
||||
mutex_unlock(&cache->dirty_tail_lock);
|
||||
}
|
||||
|
||||
void cache_writeback_fn(struct work_struct *work)
|
||||
{
|
||||
struct pcache_cache *cache = container_of(work, struct pcache_cache, writeback_work.work);
|
||||
struct dm_pcache *pcache = CACHE_TO_PCACHE(cache);
|
||||
struct pcache_cache_pos dirty_tail;
|
||||
struct pcache_cache_kset_onmedia *kset_onmedia;
|
||||
u32 delay;
|
||||
int ret;
|
||||
|
||||
mutex_lock(&cache->writeback_lock);
|
||||
if (atomic_read(&cache->writeback_ctx.pending))
|
||||
goto unlock;
|
||||
|
||||
if (pcache_is_stopping(pcache))
|
||||
goto unlock;
|
||||
|
||||
kset_onmedia = (struct pcache_cache_kset_onmedia *)cache->wb_kset_onmedia_buf;
|
||||
|
||||
mutex_lock(&cache->dirty_tail_lock);
|
||||
cache_pos_copy(&dirty_tail, &cache->dirty_tail);
|
||||
mutex_unlock(&cache->dirty_tail_lock);
|
||||
|
||||
if (is_cache_clean(cache, &dirty_tail)) {
|
||||
delay = PCACHE_CACHE_WRITEBACK_INTERVAL;
|
||||
goto queue_work;
|
||||
}
|
||||
|
||||
if (kset_onmedia->flags & PCACHE_KSET_FLAGS_LAST) {
|
||||
last_kset_writeback(cache, kset_onmedia);
|
||||
delay = 0;
|
||||
goto queue_work;
|
||||
}
|
||||
|
||||
ret = cache_kset_insert_tree(cache, kset_onmedia);
|
||||
if (ret) {
|
||||
delay = PCACHE_CACHE_WRITEBACK_INTERVAL;
|
||||
goto queue_work;
|
||||
}
|
||||
|
||||
cache_wb_tree_writeback(cache, get_kset_onmedia_size(kset_onmedia));
|
||||
delay = 0;
|
||||
queue_work:
|
||||
queue_delayed_work(cache_get_wq(cache), &cache->writeback_work, delay);
|
||||
unlock:
|
||||
mutex_unlock(&cache->writeback_lock);
|
||||
}
|
||||
497
drivers/md/dm-pcache/dm_pcache.c
Normal file
@@ -0,0 +1,497 @@
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
#include <linux/module.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/bio.h>
|
||||
|
||||
#include "../dm-core.h"
|
||||
#include "cache_dev.h"
|
||||
#include "backing_dev.h"
|
||||
#include "cache.h"
|
||||
#include "dm_pcache.h"
|
||||
|
||||
void pcache_defer_reqs_kick(struct dm_pcache *pcache)
|
||||
{
|
||||
struct pcache_cache *cache = &pcache->cache;
|
||||
|
||||
spin_lock(&cache->seg_map_lock);
|
||||
if (!cache->cache_full)
|
||||
queue_work(pcache->task_wq, &pcache->defered_req_work);
|
||||
spin_unlock(&cache->seg_map_lock);
|
||||
}
|
||||
|
||||
static void defer_req(struct pcache_request *pcache_req)
|
||||
{
|
||||
struct dm_pcache *pcache = pcache_req->pcache;
|
||||
|
||||
BUG_ON(!list_empty(&pcache_req->list_node));
|
||||
|
||||
spin_lock(&pcache->defered_req_list_lock);
|
||||
list_add(&pcache_req->list_node, &pcache->defered_req_list);
|
||||
pcache_defer_reqs_kick(pcache);
|
||||
spin_unlock(&pcache->defered_req_list_lock);
|
||||
}
|
||||
|
||||
static void defered_req_fn(struct work_struct *work)
|
||||
{
|
||||
struct dm_pcache *pcache = container_of(work, struct dm_pcache, defered_req_work);
|
||||
struct pcache_request *pcache_req;
|
||||
LIST_HEAD(tmp_list);
|
||||
int ret;
|
||||
|
||||
if (pcache_is_stopping(pcache))
|
||||
return;
|
||||
|
||||
spin_lock(&pcache->defered_req_list_lock);
|
||||
list_splice_init(&pcache->defered_req_list, &tmp_list);
|
||||
spin_unlock(&pcache->defered_req_list_lock);
|
||||
|
||||
while (!list_empty(&tmp_list)) {
|
||||
pcache_req = list_first_entry(&tmp_list,
|
||||
struct pcache_request, list_node);
|
||||
list_del_init(&pcache_req->list_node);
|
||||
pcache_req->ret = 0;
|
||||
ret = pcache_cache_handle_req(&pcache->cache, pcache_req);
|
||||
if (ret == -EBUSY)
|
||||
defer_req(pcache_req);
|
||||
else
|
||||
pcache_req_put(pcache_req, ret);
|
||||
}
|
||||
}
|
||||
|
||||
void pcache_req_get(struct pcache_request *pcache_req)
|
||||
{
|
||||
kref_get(&pcache_req->ref);
|
||||
}
|
||||
|
||||
static void end_req(struct kref *ref)
|
||||
{
|
||||
struct pcache_request *pcache_req = container_of(ref, struct pcache_request, ref);
|
||||
struct dm_pcache *pcache = pcache_req->pcache;
|
||||
struct bio *bio = pcache_req->bio;
|
||||
int ret = pcache_req->ret;
|
||||
|
||||
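	/*
	 * -EBUSY means the cache could not take the request right now
	 * (presumably because it is temporarily full); take a new reference
	 * and put the request back on the deferred list instead of
	 * completing the bio.
	 */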
if (ret == -EBUSY) {
|
||||
pcache_req_get(pcache_req);
|
||||
defer_req(pcache_req);
|
||||
} else {
|
||||
bio->bi_status = errno_to_blk_status(ret);
|
||||
bio_endio(bio);
|
||||
|
||||
if (atomic_dec_and_test(&pcache->inflight_reqs))
|
||||
wake_up(&pcache->inflight_wq);
|
||||
}
|
||||
}
|
||||
|
||||
void pcache_req_put(struct pcache_request *pcache_req, int ret)
|
||||
{
|
||||
/* Set the return status if it is not already set */
|
||||
if (ret && !pcache_req->ret)
|
||||
pcache_req->ret = ret;
|
||||
|
||||
kref_put(&pcache_req->ref, end_req);
|
||||
}
|
||||
|
||||
static bool at_least_one_arg(struct dm_arg_set *as, char **error)
|
||||
{
|
||||
if (!as->argc) {
|
||||
*error = "Insufficient args";
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static int parse_cache_dev(struct dm_pcache *pcache, struct dm_arg_set *as,
|
||||
char **error)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (!at_least_one_arg(as, error))
|
||||
return -EINVAL;
|
||||
ret = dm_get_device(pcache->ti, dm_shift_arg(as),
|
||||
BLK_OPEN_READ | BLK_OPEN_WRITE,
|
||||
&pcache->cache_dev.dm_dev);
|
||||
if (ret) {
|
||||
*error = "Error opening cache device";
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int parse_backing_dev(struct dm_pcache *pcache, struct dm_arg_set *as,
|
||||
char **error)
|
||||
{
|
||||
int ret;
|
||||
|
||||
if (!at_least_one_arg(as, error))
|
||||
return -EINVAL;
|
||||
|
||||
ret = dm_get_device(pcache->ti, dm_shift_arg(as),
|
||||
BLK_OPEN_READ | BLK_OPEN_WRITE,
|
||||
&pcache->backing_dev.dm_dev);
|
||||
if (ret) {
|
||||
*error = "Error opening backing device";
|
||||
return ret;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void pcache_init_opts(struct pcache_cache_options *opts)
|
||||
{
|
||||
opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK;
|
||||
opts->data_crc = false;
|
||||
}
|
||||
|
||||
static int parse_cache_opts(struct dm_pcache *pcache, struct dm_arg_set *as,
|
||||
char **error)
|
||||
{
|
||||
struct pcache_cache_options *opts = &pcache->opts;
|
||||
static const struct dm_arg _args[] = {
|
||||
{0, 4, "Invalid number of cache option arguments"},
|
||||
};
|
||||
unsigned int argc;
|
||||
const char *arg;
|
||||
int ret;
|
||||
|
||||
pcache_init_opts(opts);
|
||||
if (!as->argc)
|
||||
return 0;
|
||||
|
||||
ret = dm_read_arg_group(_args, as, &argc, error);
|
||||
if (ret)
|
||||
return -EINVAL;
|
||||
|
||||
while (argc) {
|
||||
arg = dm_shift_arg(as);
|
||||
argc--;
|
||||
|
||||
if (!strcmp(arg, "cache_mode")) {
|
||||
arg = dm_shift_arg(as);
|
||||
if (!strcmp(arg, "writeback")) {
|
||||
opts->cache_mode = PCACHE_CACHE_MODE_WRITEBACK;
|
||||
} else {
|
||||
*error = "Invalid cache mode parameter";
|
||||
return -EINVAL;
|
||||
}
|
||||
argc--;
|
||||
} else if (!strcmp(arg, "data_crc")) {
|
||||
arg = dm_shift_arg(as);
|
||||
if (!strcmp(arg, "true")) {
|
||||
opts->data_crc = true;
|
||||
} else if (!strcmp(arg, "false")) {
|
||||
opts->data_crc = false;
|
||||
} else {
|
||||
*error = "Invalid data crc parameter";
|
||||
return -EINVAL;
|
||||
}
|
||||
argc--;
|
||||
} else {
|
||||
*error = "Unrecognised cache option requested";
|
||||
return -EINVAL;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int pcache_start(struct dm_pcache *pcache, char **error)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = cache_dev_start(pcache);
|
||||
if (ret) {
|
||||
*error = "Failed to start cache dev";
|
||||
return ret;
|
||||
}
|
||||
|
||||
ret = backing_dev_start(pcache);
|
||||
if (ret) {
|
||||
*error = "Failed to start backing dev";
|
||||
goto stop_cache;
|
||||
}
|
||||
|
||||
ret = pcache_cache_start(pcache);
|
||||
if (ret) {
|
||||
*error = "Failed to start pcache";
|
||||
goto stop_backing;
|
||||
}
|
||||
|
||||
return 0;
|
||||
stop_backing:
|
||||
backing_dev_stop(pcache);
|
||||
stop_cache:
|
||||
cache_dev_stop(pcache);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void pcache_destroy_args(struct dm_pcache *pcache)
|
||||
{
|
||||
if (pcache->cache_dev.dm_dev)
|
||||
dm_put_device(pcache->ti, pcache->cache_dev.dm_dev);
|
||||
if (pcache->backing_dev.dm_dev)
|
||||
dm_put_device(pcache->ti, pcache->backing_dev.dm_dev);
|
||||
}
|
||||
|
||||
static int pcache_parse_args(struct dm_pcache *pcache, unsigned int argc, char **argv,
|
||||
char **error)
|
||||
{
|
||||
struct dm_arg_set as;
|
||||
int ret;
|
||||
|
||||
as.argc = argc;
|
||||
as.argv = argv;
|
||||
|
||||
/*
|
||||
* Parse cache device
|
||||
*/
|
||||
ret = parse_cache_dev(pcache, &as, error);
|
||||
if (ret)
|
||||
return ret;
|
||||
/*
|
||||
* Parse backing device
|
||||
*/
|
||||
ret = parse_backing_dev(pcache, &as, error);
|
||||
if (ret)
|
||||
goto out;
|
||||
/*
|
||||
* Parse optional arguments
|
||||
*/
|
||||
ret = parse_cache_opts(pcache, &as, error);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
return 0;
|
||||
out:
|
||||
pcache_destroy_args(pcache);
|
||||
return ret;
|
||||
}
|
||||
|
||||
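/*
 * Constructor arguments:
 *   <cache_dev> <backing_dev> [<#opt_args> <opt_arg>...]
 * where the optional arguments are "cache_mode writeback" and
 * "data_crc true|false".
 *
 * Illustrative table line (device names are examples only):
 *   0 <dev_size> pcache /dev/pmem0 /dev/sdb 4 cache_mode writeback data_crc true
 */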
static int dm_pcache_ctr(struct dm_target *ti, unsigned int argc, char **argv)
|
||||
{
|
||||
struct mapped_device *md = ti->table->md;
|
||||
struct dm_pcache *pcache;
|
||||
int ret;
|
||||
|
||||
if (md->map) {
|
||||
ti->error = "Don't support table loading for live md";
|
||||
return -EOPNOTSUPP;
|
||||
}
|
||||
|
||||
/* Allocate memory for the cache structure */
|
||||
pcache = kzalloc(sizeof(struct dm_pcache), GFP_KERNEL);
|
||||
if (!pcache)
|
||||
return -ENOMEM;
|
||||
|
||||
pcache->task_wq = alloc_workqueue("pcache-%s-wq", WQ_UNBOUND | WQ_MEM_RECLAIM,
|
||||
0, md->name);
|
||||
if (!pcache->task_wq) {
|
||||
ret = -ENOMEM;
|
||||
goto free_pcache;
|
||||
}
|
||||
|
||||
spin_lock_init(&pcache->defered_req_list_lock);
|
||||
INIT_LIST_HEAD(&pcache->defered_req_list);
|
||||
INIT_WORK(&pcache->defered_req_work, defered_req_fn);
|
||||
pcache->ti = ti;
|
||||
|
||||
ret = pcache_parse_args(pcache, argc, argv, &ti->error);
|
||||
if (ret)
|
||||
goto destroy_wq;
|
||||
|
||||
ret = pcache_start(pcache, &ti->error);
|
||||
if (ret)
|
||||
goto destroy_args;
|
||||
|
||||
ti->num_flush_bios = 1;
|
||||
ti->flush_supported = true;
|
||||
ti->per_io_data_size = sizeof(struct pcache_request);
|
||||
ti->private = pcache;
|
||||
atomic_set(&pcache->inflight_reqs, 0);
|
||||
atomic_set(&pcache->state, PCACHE_STATE_RUNNING);
|
||||
init_waitqueue_head(&pcache->inflight_wq);
|
||||
|
||||
return 0;
|
||||
destroy_args:
|
||||
pcache_destroy_args(pcache);
|
||||
destroy_wq:
|
||||
destroy_workqueue(pcache->task_wq);
|
||||
free_pcache:
|
||||
kfree(pcache);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void defer_req_stop(struct dm_pcache *pcache)
|
||||
{
|
||||
struct pcache_request *pcache_req;
|
||||
LIST_HEAD(tmp_list);
|
||||
|
||||
flush_work(&pcache->defered_req_work);
|
||||
|
||||
spin_lock(&pcache->defered_req_list_lock);
|
||||
list_splice_init(&pcache->defered_req_list, &tmp_list);
|
||||
spin_unlock(&pcache->defered_req_list_lock);
|
||||
|
||||
while (!list_empty(&tmp_list)) {
|
||||
pcache_req = list_first_entry(&tmp_list,
|
||||
struct pcache_request, list_node);
|
||||
list_del_init(&pcache_req->list_node);
|
||||
pcache_req_put(pcache_req, -EIO);
|
||||
}
|
||||
}
|
||||
|
||||
static void dm_pcache_dtr(struct dm_target *ti)
|
||||
{
|
||||
struct dm_pcache *pcache;
|
||||
|
||||
pcache = ti->private;
|
||||
atomic_set(&pcache->state, PCACHE_STATE_STOPPING);
|
||||
defer_req_stop(pcache);
|
||||
|
||||
wait_event(pcache->inflight_wq,
|
||||
atomic_read(&pcache->inflight_reqs) == 0);
|
||||
|
||||
pcache_cache_stop(pcache);
|
||||
backing_dev_stop(pcache);
|
||||
cache_dev_stop(pcache);
|
||||
|
||||
pcache_destroy_args(pcache);
|
||||
drain_workqueue(pcache->task_wq);
|
||||
destroy_workqueue(pcache->task_wq);
|
||||
|
||||
kfree(pcache);
|
||||
}
|
||||
|
||||
static int dm_pcache_map_bio(struct dm_target *ti, struct bio *bio)
|
||||
{
|
||||
struct pcache_request *pcache_req = dm_per_bio_data(bio, sizeof(struct pcache_request));
|
||||
struct dm_pcache *pcache = ti->private;
|
||||
int ret;
|
||||
|
||||
pcache_req->pcache = pcache;
|
||||
kref_init(&pcache_req->ref);
|
||||
pcache_req->ret = 0;
|
||||
pcache_req->bio = bio;
|
||||
pcache_req->off = (u64)bio->bi_iter.bi_sector << SECTOR_SHIFT;
|
||||
pcache_req->data_len = bio->bi_iter.bi_size;
|
||||
INIT_LIST_HEAD(&pcache_req->list_node);
|
||||
atomic_inc(&pcache->inflight_reqs);
|
||||
|
||||
ret = pcache_cache_handle_req(&pcache->cache, pcache_req);
|
||||
if (ret == -EBUSY)
|
||||
defer_req(pcache_req);
|
||||
else
|
||||
pcache_req_put(pcache_req, ret);
|
||||
|
||||
return DM_MAPIO_SUBMITTED;
|
||||
}
|
||||
|
||||
static void dm_pcache_status(struct dm_target *ti, status_type_t type,
|
||||
unsigned int status_flags, char *result,
|
||||
unsigned int maxlen)
|
||||
{
|
||||
struct dm_pcache *pcache = ti->private;
|
||||
struct pcache_cache_dev *cache_dev = &pcache->cache_dev;
|
||||
struct pcache_backing_dev *backing_dev = &pcache->backing_dev;
|
||||
struct pcache_cache *cache = &pcache->cache;
|
||||
unsigned int sz = 0;
|
||||
|
||||
switch (type) {
|
||||
case STATUSTYPE_INFO:
|
||||
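		/*
		 * Fields: sb_flags, cache_dev segs, cache segs, segs in use,
		 * gc percent, cache_info flags, then key_head, dirty_tail and
		 * key_tail as <seg>:<offset>.
		 */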
DMEMIT("%x %u %u %u %u %x %u:%u %u:%u %u:%u",
|
||||
cache_dev->sb_flags,
|
||||
cache_dev->seg_num,
|
||||
cache->n_segs,
|
||||
bitmap_weight(cache->seg_map, cache->n_segs),
|
||||
pcache_cache_get_gc_percent(cache),
|
||||
cache->cache_info.flags,
|
||||
cache->key_head.cache_seg->cache_seg_id,
|
||||
cache->key_head.seg_off,
|
||||
cache->dirty_tail.cache_seg->cache_seg_id,
|
||||
cache->dirty_tail.seg_off,
|
||||
cache->key_tail.cache_seg->cache_seg_id,
|
||||
cache->key_tail.seg_off);
|
||||
break;
|
||||
case STATUSTYPE_TABLE:
|
||||
DMEMIT("%s %s 4 cache_mode writeback crc %s",
|
||||
cache_dev->dm_dev->name,
|
||||
backing_dev->dm_dev->name,
|
||||
cache_data_crc_on(cache) ? "true" : "false");
|
||||
break;
|
||||
case STATUSTYPE_IMA:
|
||||
*result = '\0';
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
static int dm_pcache_message(struct dm_target *ti, unsigned int argc,
|
||||
char **argv, char *result, unsigned int maxlen)
|
||||
{
|
||||
struct dm_pcache *pcache = ti->private;
|
||||
unsigned long val;
|
||||
|
||||
if (argc != 2)
|
||||
goto err;
|
||||
|
||||
if (!strcasecmp(argv[0], "gc_percent")) {
|
||||
if (kstrtoul(argv[1], 10, &val))
|
||||
goto err;
|
||||
|
||||
return pcache_cache_set_gc_percent(&pcache->cache, val);
|
||||
}
|
||||
err:
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static struct target_type dm_pcache_target = {
|
||||
.name = "pcache",
|
||||
.version = {0, 1, 0},
|
||||
.module = THIS_MODULE,
|
||||
.features = DM_TARGET_SINGLETON,
|
||||
.ctr = dm_pcache_ctr,
|
||||
.dtr = dm_pcache_dtr,
|
||||
.map = dm_pcache_map_bio,
|
||||
.status = dm_pcache_status,
|
||||
.message = dm_pcache_message,
|
||||
};
|
||||
|
||||
static int __init dm_pcache_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = pcache_backing_init();
|
||||
if (ret)
|
||||
goto err;
|
||||
|
||||
ret = pcache_cache_init();
|
||||
if (ret)
|
||||
goto backing_exit;
|
||||
|
||||
ret = dm_register_target(&dm_pcache_target);
|
||||
if (ret)
|
||||
goto cache_exit;
|
||||
return 0;
|
||||
|
||||
cache_exit:
|
||||
pcache_cache_exit();
|
||||
backing_exit:
|
||||
pcache_backing_exit();
|
||||
err:
|
||||
return ret;
|
||||
}
|
||||
module_init(dm_pcache_init);
|
||||
|
||||
static void __exit dm_pcache_exit(void)
|
||||
{
|
||||
dm_unregister_target(&dm_pcache_target);
|
||||
pcache_cache_exit();
|
||||
pcache_backing_exit();
|
||||
}
|
||||
module_exit(dm_pcache_exit);
|
||||
|
||||
MODULE_DESCRIPTION("dm-pcache Persistent Cache for block device");
|
||||
MODULE_AUTHOR("Dongsheng Yang <dongsheng.yang@linux.dev>");
|
||||
MODULE_LICENSE("GPL");
|
||||
67
drivers/md/dm-pcache/dm_pcache.h
Normal file
@@ -0,0 +1,67 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#ifndef _DM_PCACHE_H
|
||||
#define _DM_PCACHE_H
|
||||
#include <linux/device-mapper.h>
|
||||
|
||||
#include "../dm-core.h"
|
||||
|
||||
#define CACHE_DEV_TO_PCACHE(cache_dev) (container_of(cache_dev, struct dm_pcache, cache_dev))
|
||||
#define BACKING_DEV_TO_PCACHE(backing_dev) (container_of(backing_dev, struct dm_pcache, backing_dev))
|
||||
#define CACHE_TO_PCACHE(cache) (container_of(cache, struct dm_pcache, cache))
|
||||
|
||||
#define PCACHE_STATE_RUNNING 1
|
||||
#define PCACHE_STATE_STOPPING 2
|
||||
|
||||
struct pcache_cache_dev;
|
||||
struct pcache_backing_dev;
|
||||
struct pcache_cache;
|
||||
struct pcache_cache_options;
|
||||
struct dm_pcache {
|
||||
struct dm_target *ti;
|
||||
struct pcache_cache_dev cache_dev;
|
||||
struct pcache_backing_dev backing_dev;
|
||||
struct pcache_cache cache;
|
||||
struct pcache_cache_options opts;
|
||||
|
||||
spinlock_t defered_req_list_lock;
|
||||
struct list_head defered_req_list;
|
||||
struct workqueue_struct *task_wq;
|
||||
|
||||
struct work_struct defered_req_work;
|
||||
|
||||
atomic_t state;
|
||||
atomic_t inflight_reqs;
|
||||
wait_queue_head_t inflight_wq;
|
||||
};
|
||||
|
||||
static inline bool pcache_is_stopping(struct dm_pcache *pcache)
|
||||
{
|
||||
return (atomic_read(&pcache->state) == PCACHE_STATE_STOPPING);
|
||||
}
|
||||
|
||||
#define pcache_dev_err(pcache, fmt, ...) \
|
||||
pcache_err("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__)
|
||||
#define pcache_dev_info(pcache, fmt, ...) \
|
||||
pcache_info("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__)
|
||||
#define pcache_dev_debug(pcache, fmt, ...) \
|
||||
pcache_debug("%s " fmt, pcache->ti->table->md->name, ##__VA_ARGS__)
|
||||
|
||||
struct pcache_request {
|
||||
struct dm_pcache *pcache;
|
||||
struct bio *bio;
|
||||
|
||||
u64 off;
|
||||
u32 data_len;
|
||||
|
||||
struct kref ref;
|
||||
int ret;
|
||||
|
||||
struct list_head list_node;
|
||||
};
|
||||
|
||||
void pcache_req_get(struct pcache_request *pcache_req);
|
||||
void pcache_req_put(struct pcache_request *pcache_req, int ret);
|
||||
|
||||
void pcache_defer_reqs_kick(struct dm_pcache *pcache);
|
||||
|
||||
#endif /* _DM_PCACHE_H */
|
||||
117
drivers/md/dm-pcache/pcache_internal.h
Normal file
@@ -0,0 +1,117 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
||||
#ifndef _PCACHE_INTERNAL_H
|
||||
#define _PCACHE_INTERNAL_H
|
||||
|
||||
#include <linux/delay.h>
|
||||
#include <linux/crc32c.h>
|
||||
|
||||
#define pcache_err(fmt, ...) \
|
||||
pr_err("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__)
|
||||
#define pcache_info(fmt, ...) \
|
||||
pr_info("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__)
|
||||
#define pcache_debug(fmt, ...) \
|
||||
pr_debug("dm-pcache: %s:%u " fmt, __func__, __LINE__, ##__VA_ARGS__)
|
||||
|
||||
#define PCACHE_KB (1024ULL)
|
||||
#define PCACHE_MB (1024 * PCACHE_KB)
|
||||
|
||||
/* Maximum number of metadata indices */
|
||||
#define PCACHE_META_INDEX_MAX 2
|
||||
|
||||
#define PCACHE_CRC_SEED 0x3B15A
|
||||
/*
|
||||
* struct pcache_meta_header - PCACHE metadata header structure
|
||||
* @crc: CRC checksum for validating metadata integrity.
|
||||
* @seq: Sequence number to track metadata updates.
|
||||
* @version: Metadata version.
|
||||
* @res: Reserved space for future use.
|
||||
*/
|
||||
struct pcache_meta_header {
|
||||
__u32 crc;
|
||||
__u8 seq;
|
||||
__u8 version;
|
||||
__u16 res;
|
||||
};
|
||||
|
||||
/*
|
||||
* pcache_meta_crc - Calculate CRC for the given metadata header.
|
||||
* @header: Pointer to the metadata header.
|
||||
* @meta_size: Size of the metadata structure.
|
||||
*
|
||||
* Returns the CRC checksum calculated by excluding the CRC field itself.
|
||||
*/
|
||||
static inline u32 pcache_meta_crc(struct pcache_meta_header *header, u32 meta_size)
|
||||
{
|
||||
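	/* The 4 bytes skipped here are the __u32 crc field at the start of the header. */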
return crc32c(PCACHE_CRC_SEED, (void *)header + 4, meta_size - 4);
|
||||
}
|
||||
|
||||
/*
|
||||
* pcache_meta_seq_after - Check if a sequence number is more recent, accounting for overflow.
|
||||
* @seq1: First sequence number.
|
||||
* @seq2: Second sequence number.
|
||||
*
|
||||
* Determines if @seq1 is more recent than @seq2 by calculating the signed
|
||||
* difference between them. This approach allows handling sequence number
|
||||
* overflow correctly because the difference wraps naturally, and any value
|
||||
* greater than zero indicates that @seq1 is "after" @seq2. This method
|
||||
* assumes 8-bit unsigned sequence numbers, where the difference wraps
|
||||
* around if seq1 overflows past seq2.
|
||||
*
|
||||
* Returns:
|
||||
* - true if @seq1 is more recent than @seq2, indicating it comes "after"
|
||||
* - false otherwise.
|
||||
*/
|
||||
static inline bool pcache_meta_seq_after(u8 seq1, u8 seq2)
|
||||
{
|
||||
return (s8)(seq1 - seq2) > 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* pcache_meta_find_latest - Find the latest valid metadata.
|
||||
* @header: Pointer to the metadata header.
|
||||
 * @meta_size: Size of each metadata block.
 * @meta_max_size: On-media stride between successive metadata copies.
 * @meta_ret: Buffer that receives a copy of the latest valid metadata.
|
||||
*
|
||||
* Finds the latest valid metadata by checking sequence numbers. If a
|
||||
* valid entry with the highest sequence number is found, its pointer
|
||||
 * is returned and a copy is placed in @meta_ret. Returns NULL if no valid
 * metadata is found, or an ERR_PTR on a hardware memory error.
|
||||
*/
|
||||
static inline void __must_check *pcache_meta_find_latest(struct pcache_meta_header *header,
|
||||
u32 meta_size, u32 meta_max_size,
|
||||
void *meta_ret)
|
||||
{
|
||||
struct pcache_meta_header *meta, *latest = NULL;
|
||||
u32 i, seq_latest = 0;
|
||||
void *meta_addr;
|
||||
|
||||
meta = meta_ret;
|
||||
|
||||
for (i = 0; i < PCACHE_META_INDEX_MAX; i++) {
|
||||
meta_addr = (void *)header + (i * meta_max_size);
|
||||
if (copy_mc_to_kernel(meta, meta_addr, meta_size)) {
|
||||
pcache_err("hardware memory error when copy meta");
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
/* Skip if CRC check fails, which means corrupted */
|
||||
if (meta->crc != pcache_meta_crc(meta, meta_size))
|
||||
continue;
|
||||
|
||||
/* Update latest if a more recent sequence is found */
|
||||
if (!latest || pcache_meta_seq_after(meta->seq, seq_latest)) {
|
||||
seq_latest = meta->seq;
|
||||
latest = (void *)header + (i * meta_max_size);
|
||||
}
|
||||
}
|
||||
|
||||
if (!latest)
|
||||
return NULL;
|
||||
|
||||
if (copy_mc_to_kernel(meta_ret, latest, meta_size)) {
|
||||
pcache_err("hardware memory error");
|
||||
return ERR_PTR(-EIO);
|
||||
}
|
||||
|
||||
return latest;
|
||||
}
|
||||
|
||||
#endif /* _PCACHE_INTERNAL_H */
|
||||
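A rough caller sketch for the lookup above; pcache_load_seg_info() is hypothetical and only illustrates the expected error handling (ERR_PTR on a machine-check copy failure, NULL when neither copy passes the CRC check):

	/* Hypothetical sketch, not driver code. */
	static int pcache_load_seg_info(struct pcache_meta_header *on_pmem,
					u32 meta_max_size,
					struct pcache_segment_info *info_out)
	{
		void *latest;

		latest = pcache_meta_find_latest(on_pmem, sizeof(*info_out),
						 meta_max_size, info_out);
		if (IS_ERR(latest))
			return PTR_ERR(latest);	/* -EIO: hardware memory error */
		if (!latest)
			return -ENODATA;	/* no copy with a valid CRC */

		/* *info_out now holds a CRC-verified snapshot of the newest copy */
		return 0;
	}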
61
drivers/md/dm-pcache/segment.c
Normal file
@@ -0,0 +1,61 @@
// SPDX-License-Identifier: GPL-2.0-or-later
#include <linux/dax.h>

#include "pcache_internal.h"
#include "cache_dev.h"
#include "segment.h"

int segment_copy_to_bio(struct pcache_segment *segment,
			u32 data_off, u32 data_len, struct bio *bio, u32 bio_off)
{
	struct iov_iter iter;
	size_t copied;
	void *src;

	iov_iter_bvec(&iter, ITER_DEST, &bio->bi_io_vec[bio->bi_iter.bi_idx],
		      bio_segments(bio), bio->bi_iter.bi_size);
	iter.iov_offset = bio->bi_iter.bi_bvec_done;
	if (bio_off)
		iov_iter_advance(&iter, bio_off);

	src = segment->data + data_off;
	copied = _copy_mc_to_iter(src, data_len, &iter);
	if (copied != data_len)
		return -EIO;

	return 0;
}

int segment_copy_from_bio(struct pcache_segment *segment,
			  u32 data_off, u32 data_len, struct bio *bio, u32 bio_off)
{
	struct iov_iter iter;
	size_t copied;
	void *dst;

	iov_iter_bvec(&iter, ITER_SOURCE, &bio->bi_io_vec[bio->bi_iter.bi_idx],
		      bio_segments(bio), bio->bi_iter.bi_size);
	iter.iov_offset = bio->bi_iter.bi_bvec_done;
	if (bio_off)
		iov_iter_advance(&iter, bio_off);

	dst = segment->data + data_off;
	copied = _copy_from_iter_flushcache(dst, data_len, &iter);
	if (copied != data_len)
		return -EIO;
	pmem_wmb();

	return 0;
}

void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment,
			 struct pcache_segment_init_options *options)
{
	segment->seg_info = options->seg_info;
	segment_info_set_type(segment->seg_info, options->type);

	segment->cache_dev = cache_dev;
	segment->seg_id = options->seg_id;
	segment->data_size = PCACHE_SEG_SIZE - options->data_off;
	segment->data = CACHE_DEV_SEGMENT(cache_dev, options->seg_id) + options->data_off;
}
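Both helpers above wrap the bio's bvec array in an iov_iter and then copy either with the machine-check-safe reader (_copy_mc_to_iter) or the flush-cache writer (_copy_from_iter_flushcache, ordered by pmem_wmb()). A hedged sketch of how a read hit might use the read-side helper; pcache_read_hit() is hypothetical, not driver code:

	/* Hypothetical sketch: copy @len bytes at @seg_off in the segment into
	 * the bio payload starting @bio_off bytes in.
	 */
	static blk_status_t pcache_read_hit(struct pcache_segment *segment, struct bio *bio,
					    u32 seg_off, u32 bio_off, u32 len)
	{
		if (segment_copy_to_bio(segment, seg_off, len, bio, bio_off))
			return BLK_STS_IOERR;	/* machine-check-safe copy failed */
		return BLK_STS_OK;
	}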
74
drivers/md/dm-pcache/segment.h
Normal file
@@ -0,0 +1,74 @@
/* SPDX-License-Identifier: GPL-2.0-or-later */
#ifndef _PCACHE_SEGMENT_H
#define _PCACHE_SEGMENT_H

#include <linux/bio.h>
#include <linux/bitfield.h>

#include "pcache_internal.h"

struct pcache_segment_info {
	struct pcache_meta_header header;
	__u32 flags;
	__u32 next_seg;
};

#define PCACHE_SEG_INFO_FLAGS_HAS_NEXT		BIT(0)

#define PCACHE_SEG_INFO_FLAGS_TYPE_MASK		GENMASK(4, 1)
#define PCACHE_SEGMENT_TYPE_CACHE_DATA		1

static inline bool segment_info_has_next(struct pcache_segment_info *seg_info)
{
	return (seg_info->flags & PCACHE_SEG_INFO_FLAGS_HAS_NEXT);
}

static inline void segment_info_set_type(struct pcache_segment_info *seg_info, u8 type)
{
	seg_info->flags &= ~PCACHE_SEG_INFO_FLAGS_TYPE_MASK;
	seg_info->flags |= FIELD_PREP(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, type);
}

static inline u8 segment_info_get_type(struct pcache_segment_info *seg_info)
{
	return FIELD_GET(PCACHE_SEG_INFO_FLAGS_TYPE_MASK, seg_info->flags);
}

struct pcache_segment_pos {
	struct pcache_segment *segment;	/* Segment associated with the position */
	u32 off;			/* Offset within the segment */
};

struct pcache_segment_init_options {
	u8 type;
	u32 seg_id;
	u32 data_off;

	struct pcache_segment_info *seg_info;
};

struct pcache_segment {
	struct pcache_cache_dev *cache_dev;

	void *data;
	u32 data_size;
	u32 seg_id;

	struct pcache_segment_info *seg_info;
};

int segment_copy_to_bio(struct pcache_segment *segment,
			u32 data_off, u32 data_len, struct bio *bio, u32 bio_off);
int segment_copy_from_bio(struct pcache_segment *segment,
			  u32 data_off, u32 data_len, struct bio *bio, u32 bio_off);

static inline void segment_pos_advance(struct pcache_segment_pos *seg_pos, u32 len)
{
	BUG_ON(seg_pos->off + len > seg_pos->segment->data_size);

	seg_pos->off += len;
}

void pcache_segment_init(struct pcache_cache_dev *cache_dev, struct pcache_segment *segment,
			 struct pcache_segment_init_options *options);
#endif /* _PCACHE_SEGMENT_H */
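For orientation, the flags word above carries HAS_NEXT in bit 0 and the segment type in bits 4:1; a tiny self-contained illustration (not driver code) of the FIELD_PREP()/FIELD_GET() round trip:

	#include <linux/bitfield.h>
	#include <linux/bits.h>

	/* Illustration only: with GENMASK(4, 1) as the type mask, type 1 lands
	 * in bit 1, so flags == 0x3 once HAS_NEXT (bit 0) is also set, and
	 * FIELD_GET(GENMASK(4, 1), flags) recovers the value 1.
	 */
	static __u32 example_seg_flags(void)
	{
		__u32 flags = BIT(0);			/* HAS_NEXT */

		flags |= FIELD_PREP(GENMASK(4, 1), 1);	/* type = CACHE_DATA */
		return flags;
	}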
@@ -3247,7 +3247,7 @@ size_check:
 	rs_reset_inconclusive_reshape(rs);
 
 	/* Start raid set read-only and assumed clean to change in raid_resume() */
-	rs->md.ro = 1;
+	rs->md.ro = MD_RDONLY;
 	rs->md.in_sync = 1;
 
 	/* Has to be held on running the array */
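The hunks in this file replace raw 0/1/2 values of mddev->ro with the md_ro_state enumerators used by the md core; the substitutions above and below imply the following mapping, shown here for orientation only:

	enum md_ro_state {
		MD_RDWR,	/* 0: read-write */
		MD_RDONLY,	/* 1: read-only */
		MD_AUTO_READ,	/* 2: read-auto, promoted to read-write on first write */
		MD_MAX_STATE
	};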
@@ -3385,7 +3385,7 @@ static enum sync_state decipher_sync_action(struct mddev *mddev, unsigned long r
 	/* The MD sync thread can be done with io or be interrupted but still be running */
 	if (!test_bit(MD_RECOVERY_DONE, &recovery) &&
 	    (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
-	     (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery)))) {
+	     (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery)))) {
 		if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
 			return st_reshape;
 
@@ -3775,11 +3775,11 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
 	} else
 		return -EINVAL;
 	}
-	if (mddev->ro == 2) {
+	if (mddev->ro == MD_AUTO_READ) {
 		/* A write to sync_action is enough to justify
 		 * canceling read-auto mode
 		 */
-		mddev->ro = 0;
+		mddev->ro = MD_RDWR;
 		if (!mddev->suspended)
 			md_wakeup_thread(mddev->sync_thread);
 	}
@@ -3860,6 +3860,7 @@ static void raid_postsuspend(struct dm_target *ti)
 		 */
 		md_stop_writes(&rs->md);
 		mddev_suspend(&rs->md, false);
+		rs->md.ro = MD_RDONLY;
 	}
 }
 
@@ -3972,7 +3973,7 @@ static void rs_update_sbs(struct raid_set *rs)
 	int ro = mddev->ro;
 
 	set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
-	mddev->ro = 0;
+	mddev->ro = MD_RDWR;
 	md_update_sb(mddev, 1);
 	mddev->ro = ro;
 }
@@ -4131,7 +4132,7 @@ static void raid_resume(struct dm_target *ti)
 		WARN_ON_ONCE(rcu_dereference_protected(mddev->sync_thread,
 						       lockdep_is_held(&mddev->reconfig_mutex)));
 		clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
-		mddev->ro = 0;
+		mddev->ro = MD_RDWR;
 		mddev->in_sync = 0;
 		md_unfrozen_sync_thread(mddev);
 		mddev_unlock_and_resume(mddev);

@@ -206,7 +206,7 @@ struct dm_region_hash *dm_region_hash_create(
 	rh->shift = RH_HASH_SHIFT;
 	rh->prime = RH_HASH_MULT;
 
-	rh->buckets = vmalloc(array_size(nr_buckets, sizeof(*rh->buckets)));
+	rh->buckets = vmalloc_array(nr_buckets, sizeof(*rh->buckets));
 	if (!rh->buckets) {
 		DMERR("unable to allocate region hash bucket memory");
 		kfree(rh);

@@ -114,8 +114,8 @@ static int alloc_region_table(struct dm_target *ti, unsigned int nr_paths)
 		return -EINVAL;
 	}
 
-	sctx->region_table = vmalloc(array_size(nr_slots,
-						sizeof(region_table_slot_t)));
+	sctx->region_table = vmalloc_array(nr_slots,
+					   sizeof(region_table_slot_t));
 	if (!sctx->region_table) {
 		ti->error = "Cannot allocate region table";
 		return -ENOMEM;

@@ -263,7 +263,8 @@ static long io_err_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
 static struct target_type error_target = {
 	.name = "error",
 	.version = {1, 7, 0},
-	.features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM,
+	.features = DM_TARGET_WILDCARD | DM_TARGET_ZONED_HM |
+		    DM_TARGET_PASSES_INTEGRITY,
 	.ctr = io_err_ctr,
 	.dtr = io_err_dtr,
 	.map = io_err_map,

@@ -3031,8 +3031,8 @@ static struct pool *pool_create(struct mapped_device *pool_md,
 	}
 
 	pool->cell_sort_array =
-		vmalloc(array_size(CELL_SORT_ARRAY_SIZE,
-				   sizeof(*pool->cell_sort_array)));
+		vmalloc_array(CELL_SORT_ARRAY_SIZE,
			      sizeof(*pool->cell_sort_array));
 	if (!pool->cell_sort_array) {
 		*error = "Error allocating cell sort array";
 		err_p = ERR_PTR(-ENOMEM);

@@ -17,6 +17,7 @@
 #include <linux/minmax.h>
 #include <linux/sched.h>
 #include <linux/spinlock.h>
+#include <linux/string.h>
 #include <linux/wait.h>
 
 #include "logger.h"

@@ -509,18 +510,6 @@ static void launch_data_vio(struct data_vio *data_vio, logical_block_number_t lb
 	vdo_enqueue_completion(completion, VDO_DEFAULT_Q_MAP_BIO_PRIORITY);
 }
 
-static bool is_zero_block(char *block)
-{
-	int i;
-
-	for (i = 0; i < VDO_BLOCK_SIZE; i += sizeof(u64)) {
-		if (*((u64 *) &block[i]))
-			return false;
-	}
-
-	return true;
-}
-
 static void copy_from_bio(struct bio *bio, char *data_ptr)
 {
 	struct bio_vec biovec;

@@ -572,7 +561,7 @@ static void launch_bio(struct vdo *vdo, struct data_vio *data_vio, struct bio *b
 	 * we acknowledge the bio.
 	 */
 	copy_from_bio(bio, data_vio->vio.data);
-	data_vio->is_zero = is_zero_block(data_vio->vio.data);
+	data_vio->is_zero = mem_is_zero(data_vio->vio.data, VDO_BLOCK_SIZE);
 	data_vio->write = true;
 }
 
@@ -1459,7 +1448,7 @@ static void modify_for_partial_write(struct vdo_completion *completion)
 		copy_from_bio(bio, data + data_vio->offset);
 	}
 
-	data_vio->is_zero = is_zero_block(data);
+	data_vio->is_zero = mem_is_zero(data, VDO_BLOCK_SIZE);
 	data_vio->read = false;
 	launch_data_vio_logical_callback(data_vio,
 					 continue_data_vio_with_block_map_slot);

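The removed is_zero_block() scanned the block eight bytes at a time; mem_is_zero() from linux/string.h gives the same answer. Roughly, and as a simplified sketch rather than the actual kernel implementation:

	#include <linux/string.h>

	/* Roughly what the helper provides: true iff all @n bytes at @s are
	 * zero.  The real helper is built on memchr_inv().
	 */
	static bool example_mem_is_zero(const void *s, size_t n)
	{
		return memchr_inv(s, 0, n) == NULL;
	}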
@@ -836,7 +836,7 @@ static int start_restoring_volume_sub_index(struct volume_sub_index *sub_index,
 					    "%zu bytes decoded of %zu expected", offset,
 					    sizeof(buffer));
 	if (result != VDO_SUCCESS)
-		result = UDS_CORRUPT_DATA;
+		return UDS_CORRUPT_DATA;
 
 	if (memcmp(header.magic, MAGIC_START_5, MAGIC_SIZE) != 0) {
 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,

@@ -928,7 +928,7 @@ static int start_restoring_volume_index(struct volume_index *volume_index,
 					    "%zu bytes decoded of %zu expected", offset,
 					    sizeof(buffer));
 	if (result != VDO_SUCCESS)
-		result = UDS_CORRUPT_DATA;
+		return UDS_CORRUPT_DATA;
 
 	if (memcmp(header.magic, MAGIC_START_6, MAGIC_SIZE) != 0)
 		return vdo_log_warning_strerror(UDS_CORRUPT_DATA,

@@ -490,18 +490,13 @@ u64 dm_start_time_ns_from_clone(struct bio *bio)
 }
 EXPORT_SYMBOL_GPL(dm_start_time_ns_from_clone);
 
-static inline bool bio_is_flush_with_data(struct bio *bio)
-{
-	return ((bio->bi_opf & REQ_PREFLUSH) && bio->bi_iter.bi_size);
-}
-
 static inline unsigned int dm_io_sectors(struct dm_io *io, struct bio *bio)
 {
 	/*
 	 * If REQ_PREFLUSH set, don't account payload, it will be
 	 * submitted (and accounted) after this flush completes.
 	 */
-	if (bio_is_flush_with_data(bio))
+	if (io->requeue_flush_with_data)
 		return 0;
 	if (unlikely(dm_io_flagged(io, DM_IO_WAS_SPLIT)))
 		return io->sectors;

@@ -590,6 +585,7 @@ static struct dm_io *alloc_io(struct mapped_device *md, struct bio *bio, gfp_t g
 	io = container_of(tio, struct dm_io, tio);
 	io->magic = DM_IO_MAGIC;
 	io->status = BLK_STS_OK;
+	io->requeue_flush_with_data = false;
 
 	/* one ref is for submission, the other is for completion */
 	atomic_set(&io->io_count, 2);

@@ -948,6 +944,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage)
 	struct mapped_device *md = io->md;
 	blk_status_t io_error;
 	bool requeued;
+	bool requeue_flush_with_data;
 
 	requeued = dm_handle_requeue(io, first_stage);
 	if (requeued && first_stage)

@@ -964,6 +961,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage)
 		__dm_start_io_acct(io);
 		dm_end_io_acct(io);
 	}
+	requeue_flush_with_data = io->requeue_flush_with_data;
 	free_io(io);
 	smp_wmb();
 	this_cpu_dec(*md->pending_io);

@@ -976,7 +974,7 @@ static void __dm_io_complete(struct dm_io *io, bool first_stage)
 	if (requeued)
 		return;
 
-	if (bio_is_flush_with_data(bio)) {
+	if (unlikely(requeue_flush_with_data)) {
 		/*
 		 * Preflush done for flush with data, reissue
 		 * without REQ_PREFLUSH.
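As the comment says, a flush-with-data bio is handled in two phases: an empty preflush first, then the payload reissued once that completes. A hedged sketch of the second phase; the helper name is hypothetical and this is not the dm.c code path itself:

	#include <linux/blkdev.h>

	/* Hypothetical illustration: after the empty preflush completes,
	 * resubmit the original bio with REQ_PREFLUSH cleared so only the
	 * data payload remains to be written (and accounted).
	 */
	static void reissue_data_after_preflush(struct bio *bio)
	{
		bio->bi_opf &= ~REQ_PREFLUSH;
		submit_bio_noacct(bio);
	}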
@@ -1996,12 +1994,30 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 	}
 	init_clone_info(&ci, io, map, bio, is_abnormal);
 
-	if (bio->bi_opf & REQ_PREFLUSH) {
+	if (unlikely((bio->bi_opf & REQ_PREFLUSH) != 0)) {
+		/*
+		 * The "flush_bypasses_map" is set on targets where it is safe
+		 * to skip the map function and submit bios directly to the
+		 * underlying block devices - currently, it is set for dm-linear
+		 * and dm-stripe.
+		 *
+		 * If we have just one underlying device (i.e. there is one
+		 * linear target or multiple linear targets pointing to the same
+		 * device), we can send the flush with data directly to it.
+		 */
+		if (map->flush_bypasses_map) {
+			struct list_head *devices = dm_table_get_devices(map);
+			if (devices->next == devices->prev)
+				goto send_preflush_with_data;
+		}
+		if (bio->bi_iter.bi_size)
+			io->requeue_flush_with_data = true;
 		__send_empty_flush(&ci);
 		/* dm_io_complete submits any data associated with flush */
 		goto out;
 	}
 
+send_preflush_with_data:
 	if (static_branch_unlikely(&zoned_enabled) &&
 	    (bio_op(bio) == REQ_OP_ZONE_RESET_ALL)) {
 		error = __send_zone_reset_all(&ci);
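The devices->next == devices->prev test in the new branch is the usual list_head idiom for "at most one entry"; a stand-alone illustration (not dm code):

	#include <linux/list.h>

	/* Illustration only: an empty list has next == prev == &head, and a
	 * single-entry list has next == prev == the one entry, so the
	 * comparison is true exactly when the table references zero or one
	 * underlying device.
	 */
	static bool has_at_most_one_entry(const struct list_head *head)
	{
		return head->next == head->prev;
	}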
@@ -2908,7 +2924,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 {
 	bool do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG;
 	bool noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG;
-	int r;
+	int r = 0;
 
 	lockdep_assert_held(&md->suspend_lock);
 
@@ -2960,8 +2976,10 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 	 * Stop md->queue before flushing md->wq in case request-based
 	 * dm defers requests to md->wq from md->queue.
 	 */
-	if (dm_request_based(md))
+	if (map && dm_request_based(md)) {
 		dm_stop_queue(md->queue);
+		set_bit(DMF_QUEUE_STOPPED, &md->flags);
+	}
 
 	flush_workqueue(md->wq);
 
@@ -2970,7 +2988,8 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 	 * We call dm_wait_for_completion to wait for all existing requests
 	 * to finish.
 	 */
-	r = dm_wait_for_completion(md, task_state);
+	if (map)
+		r = dm_wait_for_completion(md, task_state);
 	if (!r)
 		set_bit(dmf_suspended_flag, &md->flags);
 
@@ -2983,7 +3002,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
 	if (r < 0) {
 		dm_queue_flush(md);
 
-		if (dm_request_based(md))
+		if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags))
 			dm_start_queue(md->queue);
 
 		unlock_fs(md);
@@ -3067,7 +3086,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map)
 	 * so that mapping of targets can work correctly.
 	 * Request-based dm is queueing the deferred I/Os in its request_queue.
 	 */
-	if (dm_request_based(md))
+	if (test_and_clear_bit(DMF_QUEUE_STOPPED, &md->flags))
 		dm_start_queue(md->queue);
 
 	unlock_fs(md);