
Commit 0b3bb20

Merge tag 'vfs-7.0-rc3.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs
Pull vfs fixes from Christian Brauner:

 - kthread: consolidate kthread exit paths to prevent use-after-free

 - iomap:
     - don't mark folio uptodate if read IO has bytes pending
     - don't report direct-io retries to fserror
     - reject delalloc mappings during writeback

 - ns: tighten visibility checks

 - netfs: Fix unbuffered/DIO writes to dispatch subrequests in strict
   sequence

* tag 'vfs-7.0-rc3.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  iomap: reject delalloc mappings during writeback
  iomap: don't mark folio uptodate if read IO has bytes pending
  selftests: fix mntns iteration selftests
  nstree: tighten permission checks for listing
  nsfs: tighten permission checks for handle opening
  nsfs: tighten permission checks for ns iteration ioctls
  netfs: Fix unbuffered/DIO writes to dispatch subrequests in strict sequence
  kthread: consolidate kthread exit paths to prevent use-after-free
  iomap: don't report direct-io retries to fserror
2 parents ecc64d2 + d320f16 commit 0b3bb20

16 files changed: 326 additions & 160 deletions (6 files shown below)


fs/iomap/buffered-io.c

Lines changed: 12 additions & 3 deletions
@@ -80,18 +80,27 @@ static void iomap_set_range_uptodate(struct folio *folio, size_t off,
 {
 	struct iomap_folio_state *ifs = folio->private;
 	unsigned long flags;
-	bool uptodate = true;
+	bool mark_uptodate = true;
 
 	if (folio_test_uptodate(folio))
 		return;
 
 	if (ifs) {
 		spin_lock_irqsave(&ifs->state_lock, flags);
-		uptodate = ifs_set_range_uptodate(folio, ifs, off, len);
+		/*
+		 * If a read with bytes pending is in progress, we must not call
+		 * folio_mark_uptodate(). The read completion path
+		 * (iomap_read_end()) will call folio_end_read(), which uses XOR
+		 * semantics to set the uptodate bit. If we set it here, the XOR
+		 * in folio_end_read() will clear it, leaving the folio not
+		 * uptodate.
+		 */
+		mark_uptodate = ifs_set_range_uptodate(folio, ifs, off, len) &&
+				!ifs->read_bytes_pending;
 		spin_unlock_irqrestore(&ifs->state_lock, flags);
 	}
 
-	if (uptodate)
+	if (mark_uptodate)
 		folio_mark_uptodate(folio);
 }
 
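The comment added above turns on folio_end_read()'s XOR semantics: the read-completion path clears the lock bit and sets the uptodate bit in one atomic XOR, which is only correct if the uptodate bit is still clear at that point. Here is a minimal userspace model of why pre-setting the bit backfires (flag values and names invented for illustration; this is not kernel code):

#include <stdbool.h>
#include <stdio.h>

#define PG_LOCKED   (1u << 0)
#define PG_UPTODATE (1u << 1)

/* Model of folio_end_read(): one XOR both unlocks and, on success,
 * sets the uptodate bit -- assuming that bit is currently clear. */
static unsigned int end_read(unsigned int flags, bool success)
{
	unsigned int mask = PG_LOCKED;

	if (success)
		mask |= PG_UPTODATE;
	return flags ^ mask;
}

int main(void)
{
	/* Normal case: locked, not yet uptodate -> bit set after XOR. */
	unsigned int f = end_read(PG_LOCKED, true);
	printf("normal:  uptodate=%d\n", !!(f & PG_UPTODATE));

	/* Bug case: bit set too early -> the XOR toggles it back off. */
	f = end_read(PG_LOCKED | PG_UPTODATE, true);
	printf("pre-set: uptodate=%d\n", !!(f & PG_UPTODATE));
	return 0;
}

Hence the fix: while read_bytes_pending is non-zero, leave the bit for folio_end_read() to set.
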
fs/iomap/direct-io.c

Lines changed: 14 additions & 1 deletion
@@ -87,6 +87,19 @@ static inline enum fserror_type iomap_dio_err_type(const struct iomap_dio *dio)
 	return FSERR_DIRECTIO_READ;
 }
 
+static inline bool should_report_dio_fserror(const struct iomap_dio *dio)
+{
+	switch (dio->error) {
+	case 0:
+	case -EAGAIN:
+	case -ENOTBLK:
+		/* don't send fsnotify for success or magic retry codes */
+		return false;
+	default:
+		return true;
+	}
+}
+
 ssize_t iomap_dio_complete(struct iomap_dio *dio)
 {
 	const struct iomap_dio_ops *dops = dio->dops;
@@ -96,7 +109,7 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio)
 
 	if (dops && dops->end_io)
 		ret = dops->end_io(iocb, dio->size, ret, dio->flags);
-	if (dio->error)
+	if (should_report_dio_fserror(dio))
 		fserror_report_io(file_inode(iocb->ki_filp),
 				  iomap_dio_err_type(dio), offset, dio->size,
 				  dio->error, GFP_NOFS);
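
For context on the filtered codes: -EAGAIN is the usual "retry" result for nonblocking (IOCB_NOWAIT) direct I/O, and -ENOTBLK is iomap's internal cue to fall back to buffered I/O; neither means data failed to reach the media, so surfacing them through fserror/fsnotify would only produce noise for listeners. A standalone sketch mirroring the helper's filtering (errno constants from errno.h; a model for illustration, not the kernel function itself):

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Mirror of the filtering logic: report only real I/O errors. */
static bool should_report(int error)
{
	switch (error) {
	case 0:			/* success: nothing to report */
	case -EAGAIN:		/* nonblocking retry, not a failure */
	case -ENOTBLK:		/* fall back to buffered I/O */
		return false;
	default:
		return true;
	}
}

int main(void)
{
	const int samples[] = { 0, -EAGAIN, -ENOTBLK, -EIO, -ENOSPC };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%7d -> %s\n", samples[i],
		       should_report(samples[i]) ? "report" : "skip");
	return 0;
}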

fs/iomap/ioend.c

Lines changed: 7 additions & 6 deletions
@@ -215,17 +215,18 @@ ssize_t iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, struct folio *folio,
 	WARN_ON_ONCE(!folio->private && map_len < dirty_len);
 
 	switch (wpc->iomap.type) {
-	case IOMAP_INLINE:
-		WARN_ON_ONCE(1);
-		return -EIO;
+	case IOMAP_UNWRITTEN:
+		ioend_flags |= IOMAP_IOEND_UNWRITTEN;
+		break;
+	case IOMAP_MAPPED:
+		break;
 	case IOMAP_HOLE:
 		return map_len;
 	default:
-		break;
+		WARN_ON_ONCE(1);
+		return -EIO;
 	}
 
-	if (wpc->iomap.type == IOMAP_UNWRITTEN)
-		ioend_flags |= IOMAP_IOEND_UNWRITTEN;
 	if (wpc->iomap.flags & IOMAP_F_SHARED)
 		ioend_flags |= IOMAP_IOEND_SHARED;
 	if (folio_test_dropbehind(folio))
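
The shape of this change matters as much as the delalloc case it targets: the old switch rejected only IOMAP_INLINE and let every other type, including a stray IOMAP_DELALLOC mapping, fall through the default and be written as if mapped; the new switch handles UNWRITTEN, MAPPED, and HOLE explicitly and rejects everything else. A self-contained toy showing the hazard of the permissive default (enum and function names invented for the example):

#include <stdio.h>

enum map_type { MAPPED, UNWRITTEN, HOLE, INLINE, DELALLOC };

static int old_check(enum map_type t)	/* allow-list default */
{
	switch (t) {
	case INLINE:
		return -1;		/* only this was rejected */
	case HOLE:
		return 1;
	default:
		break;			/* DELALLOC sails through */
	}
	return 0;			/* treated as a mapped extent */
}

static int new_check(enum map_type t)	/* deny-by-default */
{
	switch (t) {
	case UNWRITTEN:
	case MAPPED:
		return 0;
	case HOLE:
		return 1;
	default:
		return -1;		/* INLINE, DELALLOC, future types */
	}
}

int main(void)
{
	printf("old: delalloc -> %d (accepted)\n", old_check(DELALLOC));
	printf("new: delalloc -> %d (rejected)\n", new_check(DELALLOC));
	return 0;
}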

fs/netfs/direct_write.c

Lines changed: 212 additions & 16 deletions
@@ -9,6 +9,202 @@
 #include <linux/uio.h>
 #include "internal.h"
 
+/*
+ * Perform the cleanup rituals after an unbuffered write is complete.
+ */
+static void netfs_unbuffered_write_done(struct netfs_io_request *wreq)
+{
+	struct netfs_inode *ictx = netfs_inode(wreq->inode);
+
+	_enter("R=%x", wreq->debug_id);
+
+	/* Okay, declare that all I/O is complete. */
+	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
+
+	if (!wreq->error)
+		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);
+
+	if (wreq->origin == NETFS_DIO_WRITE &&
+	    wreq->mapping->nrpages) {
+		/* mmap may have got underfoot and we may now have folios
+		 * locally covering the region we just wrote.  Attempt to
+		 * discard the folios, but leave in place any modified locally.
+		 * ->write_iter() is prevented from interfering by the DIO
+		 * counter.
+		 */
+		pgoff_t first = wreq->start >> PAGE_SHIFT;
+		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
+
+		invalidate_inode_pages2_range(wreq->mapping, first, last);
+	}
+
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_end(wreq->inode);
+
+	_debug("finished");
+	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
+	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */
+
+	if (wreq->iocb) {
+		size_t written = umin(wreq->transferred, wreq->len);
+
+		wreq->iocb->ki_pos += written;
+		if (wreq->iocb->ki_complete) {
+			trace_netfs_rreq(wreq, netfs_rreq_trace_ki_complete);
+			wreq->iocb->ki_complete(wreq->iocb, wreq->error ?: written);
+		}
+		wreq->iocb = VFS_PTR_POISON;
+	}
+
+	netfs_clear_subrequests(wreq);
+}
+
+/*
+ * Collect the subrequest results of unbuffered write subrequests.
+ */
+static void netfs_unbuffered_write_collect(struct netfs_io_request *wreq,
+					   struct netfs_io_stream *stream,
+					   struct netfs_io_subrequest *subreq)
+{
+	trace_netfs_collect_sreq(wreq, subreq);
+
+	spin_lock(&wreq->lock);
+	list_del_init(&subreq->rreq_link);
+	spin_unlock(&wreq->lock);
+
+	wreq->transferred += subreq->transferred;
+	iov_iter_advance(&wreq->buffer.iter, subreq->transferred);
+
+	stream->collected_to = subreq->start + subreq->transferred;
+	wreq->collected_to = stream->collected_to;
+	netfs_put_subrequest(subreq, netfs_sreq_trace_put_done);
+
+	trace_netfs_collect_stream(wreq, stream);
+	trace_netfs_collect_state(wreq, wreq->collected_to, 0);
+}
+
+/*
+ * Write data to the server without going through the pagecache and without
+ * writing it to the local cache.  We dispatch the subrequests serially and
+ * wait for each to complete before dispatching the next, lest we leave a gap
+ * in the data written due to a failure such as ENOSPC.  We could, however
+ * attempt to do preparation such as content encryption for the next subreq
+ * whilst the current is in progress.
+ */
+static int netfs_unbuffered_write(struct netfs_io_request *wreq)
+{
+	struct netfs_io_subrequest *subreq = NULL;
+	struct netfs_io_stream *stream = &wreq->io_streams[0];
+	int ret;
+
+	_enter("%llx", wreq->len);
+
+	if (wreq->origin == NETFS_DIO_WRITE)
+		inode_dio_begin(wreq->inode);
+
+	stream->collected_to = wreq->start;
+
+	for (;;) {
+		bool retry = false;
+
+		if (!subreq) {
+			netfs_prepare_write(wreq, stream, wreq->start + wreq->transferred);
+			subreq = stream->construct;
+			stream->construct = NULL;
+			stream->front = NULL;
+		}
+
+		/* Check if (re-)preparation failed. */
+		if (unlikely(test_bit(NETFS_SREQ_FAILED, &subreq->flags))) {
+			netfs_write_subrequest_terminated(subreq, subreq->error);
+			wreq->error = subreq->error;
+			break;
+		}
+
+		iov_iter_truncate(&subreq->io_iter, wreq->len - wreq->transferred);
+		if (!iov_iter_count(&subreq->io_iter))
+			break;
+
+		subreq->len = netfs_limit_iter(&subreq->io_iter, 0,
+					       stream->sreq_max_len,
+					       stream->sreq_max_segs);
+		iov_iter_truncate(&subreq->io_iter, subreq->len);
+		stream->submit_extendable_to = subreq->len;
+
+		trace_netfs_sreq(subreq, netfs_sreq_trace_submit);
+		stream->issue_write(subreq);
+
+		/* Async, need to wait. */
+		netfs_wait_for_in_progress_stream(wreq, stream);
+
+		if (test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) {
+			retry = true;
+		} else if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) {
+			ret = subreq->error;
+			wreq->error = ret;
+			netfs_see_subrequest(subreq, netfs_sreq_trace_see_failed);
+			subreq = NULL;
+			break;
+		}
+		ret = 0;
+
+		if (!retry) {
+			netfs_unbuffered_write_collect(wreq, stream, subreq);
+			subreq = NULL;
+			if (wreq->transferred >= wreq->len)
+				break;
+			if (!wreq->iocb && signal_pending(current)) {
+				ret = wreq->transferred ? -EINTR : -ERESTARTSYS;
+				trace_netfs_rreq(wreq, netfs_rreq_trace_intr);
+				break;
+			}
+			continue;
+		}
+
+		/* We need to retry the last subrequest, so first reset the
+		 * iterator, taking into account what, if anything, we managed
+		 * to transfer.
+		 */
+		subreq->error = -EAGAIN;
+		trace_netfs_sreq(subreq, netfs_sreq_trace_retry);
+		if (subreq->transferred > 0)
+			iov_iter_advance(&wreq->buffer.iter, subreq->transferred);
+
+		if (stream->source == NETFS_UPLOAD_TO_SERVER &&
+		    wreq->netfs_ops->retry_request)
+			wreq->netfs_ops->retry_request(wreq, stream);
+
+		__clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags);
+		__clear_bit(NETFS_SREQ_BOUNDARY, &subreq->flags);
+		__clear_bit(NETFS_SREQ_FAILED, &subreq->flags);
+		subreq->io_iter = wreq->buffer.iter;
+		subreq->start = wreq->start + wreq->transferred;
+		subreq->len = wreq->len - wreq->transferred;
+		subreq->transferred = 0;
+		subreq->retry_count += 1;
+		stream->sreq_max_len = UINT_MAX;
+		stream->sreq_max_segs = INT_MAX;
+
+		netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
+		stream->prepare_write(subreq);
+
+		__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
+		netfs_stat(&netfs_n_wh_retry_write_subreq);
+	}
+
+	netfs_unbuffered_write_done(wreq);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+static void netfs_unbuffered_write_async(struct work_struct *work)
+{
+	struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work);
+
+	netfs_unbuffered_write(wreq);
+	netfs_put_request(wreq, netfs_rreq_trace_put_complete);
+}
+
 /*
  * Perform an unbuffered write where we may have to do an RMW operation on an
  * encrypted file. This can also be used for direct I/O writes.
@@ -70,35 +266,35 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter *
 			 */
 			wreq->buffer.iter = *iter;
 		}
+
+		wreq->len = iov_iter_count(&wreq->buffer.iter);
 	}
 
 	__set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags);
-	if (async)
-		__set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags);
 
 	/* Copy the data into the bounce buffer and encrypt it. */
 	// TODO
 
 	/* Dispatch the write. */
 	__set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags);
-	if (async)
-		wreq->iocb = iocb;
-	wreq->len = iov_iter_count(&wreq->buffer.iter);
-	ret = netfs_unbuffered_write(wreq, is_sync_kiocb(iocb), wreq->len);
-	if (ret < 0) {
-		_debug("begin = %zd", ret);
-		goto out;
-	}
 
-	if (!async) {
-		ret = netfs_wait_for_write(wreq);
-		if (ret > 0)
-			iocb->ki_pos += ret;
-	} else {
+	if (async) {
+		INIT_WORK(&wreq->work, netfs_unbuffered_write_async);
+		wreq->iocb = iocb;
+		queue_work(system_dfl_wq, &wreq->work);
 		ret = -EIOCBQUEUED;
+	} else {
+		ret = netfs_unbuffered_write(wreq);
+		if (ret < 0) {
+			_debug("begin = %zd", ret);
+		} else {
+			iocb->ki_pos += wreq->transferred;
+			ret = wreq->transferred ?: wreq->error;
+		}
+
+		netfs_put_request(wreq, netfs_rreq_trace_put_complete);
 	}
 
-out:
 	netfs_put_request(wreq, netfs_rreq_trace_put_return);
 	return ret;
 }
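
The for (;;) loop in netfs_unbuffered_write() above is the heart of the "strict sequence" fix named in the merge message: each subrequest is prepared, issued, and awaited before the next one is constructed, so on a hard failure wreq->transferred is always an exact prefix of the caller's buffer with no gap behind it. A runnable toy model of that guarantee (all names invented; chunk 2 stands in for an ENOSPC-style failure):

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define CHUNK 4

/* Pretend the third chunk fails on the "server". */
static bool issue_chunk(size_t index)
{
	return index != 2;
}

static size_t serial_write(size_t len)
{
	size_t transferred = 0;

	while (transferred < len) {
		size_t n = len - transferred < CHUNK ? len - transferred : CHUNK;

		/* Issue and wait; a failure stops us before any later
		 * chunk has been dispatched, so no holes can form. */
		if (!issue_chunk(transferred / CHUNK))
			break;
		transferred += n;	/* collect, then dispatch the next */
	}
	return transferred;
}

int main(void)
{
	printf("wrote %zu of 20 bytes (an exact prefix, no gaps)\n",
	       serial_write(20));
	return 0;
}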

fs/netfs/internal.h

Lines changed: 3 additions & 1 deletion
@@ -198,6 +198,9 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping,
 						struct file *file,
 						loff_t start,
 						enum netfs_io_origin origin);
+void netfs_prepare_write(struct netfs_io_request *wreq,
+			 struct netfs_io_stream *stream,
+			 loff_t start);
 void netfs_reissue_write(struct netfs_io_stream *stream,
 			 struct netfs_io_subrequest *subreq,
 			 struct iov_iter *source);
@@ -212,7 +215,6 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c
 			       struct folio **writethrough_cache);
 ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc,
 			       struct folio *writethrough_cache);
-int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len);
 
 /*
  * write_retry.c

fs/netfs/write_collect.c

Lines changed: 0 additions & 21 deletions
@@ -399,27 +399,6 @@ bool netfs_write_collection(struct netfs_io_request *wreq)
 			ictx->ops->invalidate_cache(wreq);
 	}
 
-	if ((wreq->origin == NETFS_UNBUFFERED_WRITE ||
-	     wreq->origin == NETFS_DIO_WRITE) &&
-	    !wreq->error)
-		netfs_update_i_size(ictx, &ictx->inode, wreq->start, wreq->transferred);
-
-	if (wreq->origin == NETFS_DIO_WRITE &&
-	    wreq->mapping->nrpages) {
-		/* mmap may have got underfoot and we may now have folios
-		 * locally covering the region we just wrote.  Attempt to
-		 * discard the folios, but leave in place any modified locally.
-		 * ->write_iter() is prevented from interfering by the DIO
-		 * counter.
-		 */
-		pgoff_t first = wreq->start >> PAGE_SHIFT;
-		pgoff_t last = (wreq->start + wreq->transferred - 1) >> PAGE_SHIFT;
-		invalidate_inode_pages2_range(wreq->mapping, first, last);
-	}
-
-	if (wreq->origin == NETFS_DIO_WRITE)
-		inode_dio_end(wreq->inode);
-
 	_debug("finished");
 	netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip);
 	/* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */
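
Nothing is lost in this removal: the i_size update, pagecache invalidation, and inode_dio_end() call deleted here moved essentially verbatim into netfs_unbuffered_write_done() in fs/netfs/direct_write.c above, now that unbuffered/DIO completion no longer funnels through the buffered write-collection path.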
