mirror of https://github.com/torvalds/linux.git, synced 2025-12-07 20:06:24 +00:00
Commit 20d72b00ca ("netfs: Fix the request's work item to not require a ref") modified netfs_alloc_request() to initialize the reference counter to 2 instead of 1. The rationale was that the request's "work" would release the second reference after completion (via netfs_{read,write}_collection_worker()). That works most of the time if all goes well.

However, this additional reference is leaked if the request is released before the I/O operation has been submitted: the error code path decrements the reference counter only once, and the work item will never be queued because there will never be a completion.

This caused outages of our whole server cluster today because tasks were blocked in netfs_wait_for_outstanding_io(), leading to deadlocks in Ceph (another bug that I will address soon in another patch). The trigger was a netfs_pgpriv2_begin_copy_to_cache() call that failed in fscache_begin_write_operation(). The leaked netfs_io_request was never completed, leaving netfs_inode.io_count with a positive value forever.

All of this is super-fragile code. It is hard to see which code paths lead to an eventual completion and which do not:

- Some functions, such as netfs_create_write_req(), allocate a request but never submit any I/O.

- netfs_unbuffered_read_iter_locked() calls netfs_unbuffered_read() and then netfs_put_request(); however, netfs_unbuffered_read() can also fail early before submitting the I/O request, so another netfs_put_request() call must be added there.

A rule of thumb is that functions which return a netfs_io_request do not submit I/O, and all of their callers must be checked.

For my taste, the whole netfs code needs an overhaul to make reference counting easier to understand and less fragile and obscure. But to fix this bug here and now, and to produce a patch that is suitable for a stable backport, I took a minimal approach that quickly frees the request object upon early failure.

I decided against adding a second netfs_put_request() call at each failure site because that would duplicate code and obscure it further. Instead, I added the function netfs_put_failed_request(), which frees such a failed request synchronously under the assumption that the reference count is exactly 2 (as initially set by netfs_alloc_request() and never touched since), verified by a WARN_ON_ONCE(). It then deinitializes the request object (without going through the "cleanup_work" indirection) and frees the allocation (with RCU protection to guard against concurrent access by netfs_requests_seq_start()).

All code paths that fail early have been changed to call netfs_put_failed_request() instead of netfs_put_request(). Additionally, a netfs_put_request() call has been added to netfs_unbuffered_read() as explained above, because the netfs_put_failed_request() approach does not work there.

Fixes: 20d72b00ca ("netfs: Fix the request's work item to not require a ref")
Signed-off-by: Max Kellermann <max.kellermann@ionos.com>
Signed-off-by: David Howells <dhowells@redhat.com>
cc: Paulo Alcantara <pc@manguebit.org>
cc: netfs@lists.linux.dev
cc: linux-fsdevel@vger.kernel.org
cc: stable@vger.kernel.org
Signed-off-by: Christian Brauner <brauner@kernel.org>
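For illustration, the following is a minimal sketch of the shape netfs_put_failed_request() takes as described above; it is not the upstream implementation. The refcount member name, the netfs_deinit_request() helper, and the rcu member are assumptions made for the sketch; only the "refcount must equal 2" check, the bypassing of the cleanup_work indirection, and the RCU-deferred free follow from the description above.

/* Sketch only: netfs_deinit_request(), rreq->ref and rreq->rcu below are
 * illustrative assumptions, not the actual netfs internals.
 */
void netfs_put_failed_request(struct netfs_io_request *rreq)
{
	/* A request that failed before any I/O was submitted still holds both
	 * references set up by netfs_alloc_request(): the caller's reference
	 * and the one reserved for the collector work item, which will now
	 * never run.
	 */
	WARN_ON_ONCE(refcount_read(&rreq->ref) != 2);

	/* Deinitialise the request directly instead of going through the
	 * cleanup_work indirection ...
	 */
	netfs_deinit_request(rreq);

	/* ... and free it under RCU so that a concurrent
	 * netfs_requests_seq_start() walker never dereferences freed memory.
	 */
	kfree_rcu(rreq, rcu);
}

In the actual patch, every early-failure path that previously dropped its reference with netfs_put_request() is switched to this helper; the cancel_put label in netfs_pgpriv2_begin_copy_to_cache() in the file below shows one such call site.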
// SPDX-License-Identifier: GPL-2.0-only
/* Read with PG_private_2 [DEPRECATED].
 *
 * Copyright (C) 2024 Red Hat, Inc. All Rights Reserved.
 * Written by David Howells (dhowells@redhat.com)
 */

#include <linux/export.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/task_io_accounting_ops.h>
#include "internal.h"

/*
 * [DEPRECATED] Copy a folio to the cache with PG_private_2 set.
 */
static void netfs_pgpriv2_copy_folio(struct netfs_io_request *creq, struct folio *folio)
{
	struct netfs_io_stream *cache = &creq->io_streams[1];
	size_t fsize = folio_size(folio), flen = fsize;
	loff_t fpos = folio_pos(folio), i_size;
	bool to_eof = false;

	_enter("");

	/* netfs_perform_write() may shift i_size around the page or from out
	 * of the page to beyond it, but cannot move i_size into or through the
	 * page since we have it locked.
	 */
	i_size = i_size_read(creq->inode);

	if (fpos >= i_size) {
		/* mmap beyond eof. */
		_debug("beyond eof");
		folio_end_private_2(folio);
		return;
	}

	if (fpos + fsize > creq->i_size)
		creq->i_size = i_size;

	if (flen > i_size - fpos) {
		flen = i_size - fpos;
		to_eof = true;
	} else if (flen == i_size - fpos) {
		to_eof = true;
	}

	_debug("folio %zx %zx", flen, fsize);

	trace_netfs_folio(folio, netfs_folio_trace_store_copy);

	/* Attach the folio to the rolling buffer. */
	if (rolling_buffer_append(&creq->buffer, folio, 0) < 0) {
		clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &creq->flags);
		return;
	}

	cache->submit_extendable_to = fsize;
	cache->submit_off = 0;
	cache->submit_len = flen;

	/* Attach the folio to one or more subrequests.  For a big folio, we
	 * could end up with thousands of subrequests if the wsize is small -
	 * but we might need to wait during the creation of subrequests for
	 * network resources (eg. SMB credits).
	 */
	do {
		ssize_t part;

		creq->buffer.iter.iov_offset = cache->submit_off;

		atomic64_set(&creq->issued_to, fpos + cache->submit_off);
		cache->submit_extendable_to = fsize - cache->submit_off;
		part = netfs_advance_write(creq, cache, fpos + cache->submit_off,
					   cache->submit_len, to_eof);
		cache->submit_off += part;
		if (part > cache->submit_len)
			cache->submit_len = 0;
		else
			cache->submit_len -= part;
	} while (cache->submit_len > 0);

	creq->buffer.iter.iov_offset = 0;
	rolling_buffer_advance(&creq->buffer, fsize);
	atomic64_set(&creq->issued_to, fpos + fsize);

	if (flen < fsize)
		netfs_issue_write(creq, cache);
}

/*
 * [DEPRECATED] Set up copying to the cache.
 */
static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache(
	struct netfs_io_request *rreq, struct folio *folio)
{
	struct netfs_io_request *creq;

	if (!fscache_resources_valid(&rreq->cache_resources))
		goto cancel;

	creq = netfs_create_write_req(rreq->mapping, NULL, folio_pos(folio),
				      NETFS_PGPRIV2_COPY_TO_CACHE);
	if (IS_ERR(creq))
		goto cancel;

	if (!creq->io_streams[1].avail)
		goto cancel_put;

	__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &creq->flags);
	trace_netfs_copy2cache(rreq, creq);
	trace_netfs_write(creq, netfs_write_trace_copy_to_cache);
	netfs_stat(&netfs_n_wh_copy_to_cache);
	rreq->copy_to_cache = creq;
	return creq;

cancel_put:
	netfs_put_failed_request(creq);
cancel:
	rreq->copy_to_cache = ERR_PTR(-ENOBUFS);
	clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags);
	return ERR_PTR(-ENOBUFS);
}

/*
 * [DEPRECATED] Mark page as requiring copy-to-cache using PG_private_2 and add
 * it to the copy write request.
 */
void netfs_pgpriv2_copy_to_cache(struct netfs_io_request *rreq, struct folio *folio)
{
	struct netfs_io_request *creq = rreq->copy_to_cache;

	if (!creq)
		creq = netfs_pgpriv2_begin_copy_to_cache(rreq, folio);
	if (IS_ERR(creq))
		return;

	trace_netfs_folio(folio, netfs_folio_trace_copy_to_cache);
	folio_start_private_2(folio);
	netfs_pgpriv2_copy_folio(creq, folio);
}

/*
 * [DEPRECATED] End writing to the cache, flushing out any outstanding writes.
 */
void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq)
{
	struct netfs_io_request *creq = rreq->copy_to_cache;

	if (IS_ERR_OR_NULL(creq))
		return;

	netfs_issue_write(creq, &creq->io_streams[1]);
	smp_wmb(); /* Write lists before ALL_QUEUED. */
	set_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags);
	trace_netfs_rreq(rreq, netfs_rreq_trace_end_copy_to_cache);
	if (list_empty_careful(&creq->io_streams[1].subrequests))
		netfs_wake_collector(creq);

	netfs_put_request(creq, netfs_rreq_trace_put_return);
	creq->copy_to_cache = NULL;
}

/*
 * [DEPRECATED] Remove the PG_private_2 mark from any folios we've finished
 * copying.
 */
bool netfs_pgpriv2_unlock_copied_folios(struct netfs_io_request *creq)
{
	struct folio_queue *folioq = creq->buffer.tail;
	unsigned long long collected_to = creq->collected_to;
	unsigned int slot = creq->buffer.first_tail_slot;
	bool made_progress = false;

	if (slot >= folioq_nr_slots(folioq)) {
		folioq = rolling_buffer_delete_spent(&creq->buffer);
		slot = 0;
	}

	for (;;) {
		struct folio *folio;
		unsigned long long fpos, fend;
		size_t fsize, flen;

		folio = folioq_folio(folioq, slot);
		if (WARN_ONCE(!folio_test_private_2(folio),
			      "R=%08x: folio %lx is not marked private_2\n",
			      creq->debug_id, folio->index))
			trace_netfs_folio(folio, netfs_folio_trace_not_under_wback);

		fpos = folio_pos(folio);
		fsize = folio_size(folio);
		flen = fsize;

		fend = min_t(unsigned long long, fpos + flen, creq->i_size);

		trace_netfs_collect_folio(creq, folio, fend, collected_to);

		/* Unlock any folio we've transferred all of. */
		if (collected_to < fend)
			break;

		trace_netfs_folio(folio, netfs_folio_trace_end_copy);
		folio_end_private_2(folio);
		creq->cleaned_to = fpos + fsize;
		made_progress = true;

		/* Clean up the head folioq.  If we clear an entire folioq, then
		 * we can get rid of it provided it's not also the tail folioq
		 * being filled by the issuer.
		 */
		folioq_clear(folioq, slot);
		slot++;
		if (slot >= folioq_nr_slots(folioq)) {
			folioq = rolling_buffer_delete_spent(&creq->buffer);
			if (!folioq)
				goto done;
			slot = 0;
		}

		if (fpos + fsize >= collected_to)
			break;
	}

	creq->buffer.tail = folioq;
done:
	creq->buffer.first_tail_slot = slot;
	return made_progress;
}