@@ -148,13 +148,6 @@ struct epitem {
148148 /* The file descriptor information this item refers to */
149149 struct epoll_filefd ffd ;
150150
151- /*
152- * Protected by file->f_lock, true for to-be-released epitem already
153- * removed from the "struct file" items list; together with
154- * eventpoll->refcount orchestrates "struct eventpoll" disposal
155- */
156- bool dying ;
157-
158151 /* List containing poll wait queues */
159152 struct eppoll_entry * pwqlist ;
160153
@@ -220,10 +213,7 @@ struct eventpoll {
220213 struct hlist_head refs ;
221214 u8 loop_check_depth ;
222215
223- /*
224- * usage count, used together with epitem->dying to
225- * orchestrate the disposal of this struct
226- */
216+ /* usage count, orchestrates "struct eventpoll" disposal */
227217 refcount_t refcount ;
228218
229219 /* used to defer freeing past ep_get_upwards_depth_proc() RCU walk */
@@ -827,36 +817,47 @@ static void ep_free(struct eventpoll *ep)
827817}
828818
829819/*
830- * Removes a "struct epitem" from the eventpoll RB tree and deallocates
831- * all the associated resources. Must be called with "mtx" held.
832- * If the dying flag is set, do the removal only if force is true.
833- * This prevents ep_clear_and_put() from dropping all the ep references
834- * while running concurrently with eventpoll_release_file().
835- * Returns true if the eventpoll can be disposed.
820+ * The ffd.file pointer may be in the process of being torn down due to
821+ * being closed, but we may not have finished eventpoll_release() yet.
822+ *
823+ * Normally, even with the atomic_long_inc_not_zero, the file may have
824+ * been free'd and then gotten re-allocated to something else (since
825+ * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
826+ *
827+ * But for epoll, users hold the ep->mtx mutex, and as such any file in
828+ * the process of being free'd will block in eventpoll_release_file()
829+ * and thus the underlying file allocation will not be free'd, and the
830+ * file re-use cannot happen.
831+ *
832+ * For the same reason we can avoid a rcu_read_lock() around the
833+ * operation - 'ffd.file' cannot go away even if the refcount has
834+ * reached zero (but we must still not call out to ->poll() functions
835+ * etc).
836836 */
837- static bool __ep_remove ( struct eventpoll * ep , struct epitem * epi , bool force )
837+ static struct file * epi_fget ( const struct epitem * epi )
838838{
839- struct file * file = epi -> ffd .file ;
840- struct epitems_head * to_free ;
841- struct hlist_head * head ;
839+ struct file * file ;
842840
843- lockdep_assert_irqs_enabled ();
841+ file = epi -> ffd .file ;
842+ if (!file_ref_get (& file -> f_ref ))
843+ file = NULL ;
844+ return file ;
845+ }
844846
845- /*
846- * Removes poll wait queue hooks.
847- */
848- ep_unregister_pollwait (ep , epi );
847+ /*
848+ * Takes &file->f_lock; returns with it released.
849+ */
850+ static void ep_remove_file (struct eventpoll * ep , struct epitem * epi ,
851+ struct file * file )
852+ {
853+ struct epitems_head * to_free = NULL ;
854+ struct hlist_head * head ;
849855
850- /* Remove the current item from the list of epoll hooks */
851- spin_lock (& file -> f_lock );
852- if (epi -> dying && !force ) {
853- spin_unlock (& file -> f_lock );
854- return false;
855- }
856+ lockdep_assert_held (& ep -> mtx );
856857
857- to_free = NULL ;
858+ spin_lock ( & file -> f_lock ) ;
858859 head = file -> f_ep ;
859- if (head -> first == & epi -> fllink && ! epi -> fllink . next ) {
860+ if (hlist_is_singular_node ( & epi -> fllink , head ) ) {
860861 /* See eventpoll_release() for details. */
861862 WRITE_ONCE (file -> f_ep , NULL );
862863 if (!is_file_epoll (file )) {
@@ -869,6 +870,11 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
869870 hlist_del_rcu (& epi -> fllink );
870871 spin_unlock (& file -> f_lock );
871872 free_ephead (to_free );
873+ }
874+
875+ static void ep_remove_epi (struct eventpoll * ep , struct epitem * epi )
876+ {
877+ lockdep_assert_held (& ep -> mtx );
872878
873879 rb_erase_cached (& epi -> rbn , & ep -> rbr );
874880
@@ -888,16 +894,32 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
888894 kfree_rcu (epi , rcu );
889895
890896 percpu_counter_dec (& ep -> user -> epoll_watches );
891- return true;
892897}
893898
894899/*
895900 * ep_remove variant for callers owing an additional reference to the ep
896901 */
897- static void ep_remove_safe (struct eventpoll * ep , struct epitem * epi )
902+ static void ep_remove (struct eventpoll * ep , struct epitem * epi )
898903{
899- if (__ep_remove (ep , epi , false))
900- WARN_ON_ONCE (ep_refcount_dec_and_test (ep ));
904+ struct file * file __free (fput ) = NULL ;
905+
906+ lockdep_assert_irqs_enabled ();
907+ lockdep_assert_held (& ep -> mtx );
908+
909+ ep_unregister_pollwait (ep , epi );
910+
911+ /*
912+ * If we manage to grab a reference it means we're not in
913+ * eventpoll_release_file() and aren't going to be: once @file's
914+ * refcount has reached zero, file_ref_get() cannot bring it back.
915+ */
916+ file = epi_fget (epi );
917+ if (!file )
918+ return ;
919+
920+ ep_remove_file (ep , epi , file );
921+ ep_remove_epi (ep , epi );
922+ WARN_ON_ONCE (ep_refcount_dec_and_test (ep ));
901923}
902924
903925static void ep_clear_and_put (struct eventpoll * ep )
@@ -923,7 +945,7 @@ static void ep_clear_and_put(struct eventpoll *ep)
923945
924946 /*
925947 * Walks through the whole tree and try to free each "struct epitem".
926- * Note that ep_remove_safe () will not remove the epitem in case of a
948+ * Note that ep_remove () will not remove the epitem in case of a
927949 * racing eventpoll_release_file(); the latter will do the removal.
928950 * At this point we are sure no poll callbacks will be lingering around.
929951 * Since we still own a reference to the eventpoll struct, the loop can't
@@ -932,7 +954,7 @@ static void ep_clear_and_put(struct eventpoll *ep)
932954 for (rbp = rb_first_cached (& ep -> rbr ); rbp ; rbp = next ) {
933955 next = rb_next (rbp );
934956 epi = rb_entry (rbp , struct epitem , rbn );
935- ep_remove_safe (ep , epi );
957+ ep_remove (ep , epi );
936958 cond_resched ();
937959 }
938960
@@ -1012,34 +1034,6 @@ static __poll_t __ep_eventpoll_poll(struct file *file, poll_table *wait, int dep
10121034 return res ;
10131035}
10141036
1015- /*
1016- * The ffd.file pointer may be in the process of being torn down due to
1017- * being closed, but we may not have finished eventpoll_release() yet.
1018- *
1019- * Normally, even with the atomic_long_inc_not_zero, the file may have
1020- * been free'd and then gotten re-allocated to something else (since
1021- * files are not RCU-delayed, they are SLAB_TYPESAFE_BY_RCU).
1022- *
1023- * But for epoll, users hold the ep->mtx mutex, and as such any file in
1024- * the process of being free'd will block in eventpoll_release_file()
1025- * and thus the underlying file allocation will not be free'd, and the
1026- * file re-use cannot happen.
1027- *
1028- * For the same reason we can avoid a rcu_read_lock() around the
1029- * operation - 'ffd.file' cannot go away even if the refcount has
1030- * reached zero (but we must still not call out to ->poll() functions
1031- * etc).
1032- */
1033- static struct file * epi_fget (const struct epitem * epi )
1034- {
1035- struct file * file ;
1036-
1037- file = epi -> ffd .file ;
1038- if (!file_ref_get (& file -> f_ref ))
1039- file = NULL ;
1040- return file ;
1041- }
1042-
10431037/*
10441038 * Differs from ep_eventpoll_poll() in that internal callers already have
10451039 * the ep->mtx so we need to start from depth=1, such that mutex_lock_nested()
@@ -1117,18 +1111,17 @@ void eventpoll_release_file(struct file *file)
11171111{
11181112 struct eventpoll * ep ;
11191113 struct epitem * epi ;
1120- bool dispose ;
11211114
11221115 /*
1123- * Use the 'dying' flag to prevent a concurrent ep_clear_and_put() from
1124- * touching the epitems list before eventpoll_release_file() can access
1125- * the ep->mtx.
1116+ * A concurrent ep_remove() cannot outrace us: it pins @file via
1117+ * epi_fget(), which fails once __fput() has dropped the refcount
1118+ * to zero -- the path we're on. So any racing ep_remove() bails
1119+ * and leaves the epi for us to clean up here.
11261120 */
11271121again :
11281122 spin_lock (& file -> f_lock );
11291123 if (file -> f_ep && file -> f_ep -> first ) {
11301124 epi = hlist_entry (file -> f_ep -> first , struct epitem , fllink );
1131- epi -> dying = true;
11321125 spin_unlock (& file -> f_lock );
11331126
11341127 /*
@@ -1137,10 +1130,15 @@ void eventpoll_release_file(struct file *file)
11371130 */
11381131 ep = epi -> ep ;
11391132 mutex_lock (& ep -> mtx );
1140- dispose = __ep_remove (ep , epi , true);
1133+
1134+ ep_unregister_pollwait (ep , epi );
1135+
1136+ ep_remove_file (ep , epi , file );
1137+ ep_remove_epi (ep , epi );
1138+
11411139 mutex_unlock (& ep -> mtx );
11421140
1143- if (dispose && ep_refcount_dec_and_test (ep ))
1141+ if (ep_refcount_dec_and_test (ep ))
11441142 ep_free (ep );
11451143 goto again ;
11461144 }
@@ -1619,21 +1617,21 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
16191617 mutex_unlock (& tep -> mtx );
16201618
16211619 /*
1622- * ep_remove_safe () calls in the later error paths can't lead to
1620+ * ep_remove () calls in the later error paths can't lead to
16231621 * ep_free() as the ep file itself still holds an ep reference.
16241622 */
16251623 ep_get (ep );
16261624
16271625 /* now check if we've created too many backpaths */
16281626 if (unlikely (full_check && reverse_path_check ())) {
1629- ep_remove_safe (ep , epi );
1627+ ep_remove (ep , epi );
16301628 return - EINVAL ;
16311629 }
16321630
16331631 if (epi -> event .events & EPOLLWAKEUP ) {
16341632 error = ep_create_wakeup_source (epi );
16351633 if (error ) {
1636- ep_remove_safe (ep , epi );
1634+ ep_remove (ep , epi );
16371635 return error ;
16381636 }
16391637 }
@@ -1657,7 +1655,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
16571655 * high memory pressure.
16581656 */
16591657 if (unlikely (!epq .epi )) {
1660- ep_remove_safe (ep , epi );
1658+ ep_remove (ep , epi );
16611659 return - ENOMEM ;
16621660 }
16631661
@@ -2352,7 +2350,7 @@ int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds,
23522350 * The eventpoll itself is still alive: the refcount
23532351 * can't go to zero here.
23542352 */
2355- ep_remove_safe (ep , epi );
2353+ ep_remove (ep , epi );
23562354 error = 0 ;
23572355 } else {
23582356 error = - ENOENT ;