Skip to content

Commit bcc843b

Browse files
committed
Merge branch 'tcp-fix-receive-autotune-again'
Matthieu Baerts says:

====================
tcp: fix receive autotune again

Neal Cardwell found that recent kernels were hitting RWIN-limited issues, even when net.ipv4.tcp_rmem[2] was set to a very big value like 512MB.

He suspected that the tcp_stream default buffer size (64KB) was triggering the heuristic added in ea33537 ("tcp: add receive queue awareness in tcp_rcv_space_adjust()").

After more testing, it turns out the bug was added earlier, with commit 65c5287 ("tcp: fix sk_rcvbuf overshoot").

I forgot once again that DRS has one RTT of latency.

MPTCP has the same issue.

This series:
- Prevents calling tcp_rcvbuf_grow() on some MPTCP subflows.
- Adds rcv_ssthresh, window_clamp and rcv_wnd to trace_tcp_rcvbuf_grow().
- Refactors code in a patch with no functional changes.
- Fixes the issue in the final patch.
====================

Link: https://patch.msgid.link/[email protected]
Signed-off-by: Jakub Kicinski <[email protected]>
2 parents f99c579 + aa251c8 commit bcc843b

4 files changed

Lines changed: 41 additions & 17 deletions

File tree

include/net/tcp.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -370,7 +370,7 @@ void tcp_delack_timer_handler(struct sock *sk);
370370
int tcp_ioctl(struct sock *sk, int cmd, int *karg);
371371
enum skb_drop_reason tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb);
372372
void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
373-
void tcp_rcvbuf_grow(struct sock *sk);
373+
void tcp_rcvbuf_grow(struct sock *sk, u32 newval);
374374
void tcp_rcv_space_adjust(struct sock *sk);
375375
int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
376376
void tcp_twsk_destructor(struct sock *sk);

include/trace/events/tcp.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,9 @@ TRACE_EVENT(tcp_rcvbuf_grow,
218218
__field(__u32, space)
219219
__field(__u32, ooo_space)
220220
__field(__u32, rcvbuf)
221+
__field(__u32, rcv_ssthresh)
222+
__field(__u32, window_clamp)
223+
__field(__u32, rcv_wnd)
221224
__field(__u8, scaling_ratio)
222225
__field(__u16, sport)
223226
__field(__u16, dport)
@@ -245,6 +248,9 @@ TRACE_EVENT(tcp_rcvbuf_grow,
245248
tp->rcv_nxt;
246249

247250
__entry->rcvbuf = sk->sk_rcvbuf;
251+
__entry->rcv_ssthresh = tp->rcv_ssthresh;
252+
__entry->window_clamp = tp->window_clamp;
253+
__entry->rcv_wnd = tp->rcv_wnd;
248254
__entry->scaling_ratio = tp->scaling_ratio;
249255
__entry->sport = ntohs(inet->inet_sport);
250256
__entry->dport = ntohs(inet->inet_dport);
@@ -264,11 +270,14 @@ TRACE_EVENT(tcp_rcvbuf_grow,
264270
),
265271

266272
TP_printk("time=%u rtt_us=%u copied=%u inq=%u space=%u ooo=%u scaling_ratio=%u rcvbuf=%u "
273+
"rcv_ssthresh=%u window_clamp=%u rcv_wnd=%u "
267274
"family=%s sport=%hu dport=%hu saddr=%pI4 daddr=%pI4 "
268275
"saddrv6=%pI6c daddrv6=%pI6c skaddr=%p sock_cookie=%llx",
269276
__entry->time, __entry->rtt_us, __entry->copied,
270277
__entry->inq, __entry->space, __entry->ooo_space,
271278
__entry->scaling_ratio, __entry->rcvbuf,
279+
__entry->rcv_ssthresh, __entry->window_clamp,
280+
__entry->rcv_wnd,
272281
show_family_name(__entry->family),
273282
__entry->sport, __entry->dport,
274283
__entry->saddr, __entry->daddr,

net/ipv4/tcp_input.c

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -891,18 +891,27 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
891891
}
892892
}
893893

894-
void tcp_rcvbuf_grow(struct sock *sk)
894+
void tcp_rcvbuf_grow(struct sock *sk, u32 newval)
895895
{
896896
const struct net *net = sock_net(sk);
897897
struct tcp_sock *tp = tcp_sk(sk);
898-
int rcvwin, rcvbuf, cap;
898+
u32 rcvwin, rcvbuf, cap, oldval;
899+
u64 grow;
900+
901+
oldval = tp->rcvq_space.space;
902+
tp->rcvq_space.space = newval;
899903

900904
if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
901905
(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
902906
return;
903907

908+
/* DRS is always one RTT late. */
909+
rcvwin = newval << 1;
910+
904911
/* slow start: allow the sender to double its rate. */
905-
rcvwin = tp->rcvq_space.space << 1;
912+
grow = (u64)rcvwin * (newval - oldval);
913+
do_div(grow, oldval);
914+
rcvwin += grow << 1;
906915

907916
if (!RB_EMPTY_ROOT(&tp->out_of_order_queue))
908917
rcvwin += TCP_SKB_CB(tp->ooo_last_skb)->end_seq - tp->rcv_nxt;
@@ -943,9 +952,7 @@ void tcp_rcv_space_adjust(struct sock *sk)
943952

944953
trace_tcp_rcvbuf_grow(sk, time);
945954

946-
tp->rcvq_space.space = copied;
947-
948-
tcp_rcvbuf_grow(sk);
955+
tcp_rcvbuf_grow(sk, copied);
949956

950957
new_measure:
951958
tp->rcvq_space.seq = tp->copied_seq;
@@ -5270,7 +5277,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb)
52705277
}
52715278
/* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
52725279
if (sk->sk_socket)
5273-
tcp_rcvbuf_grow(sk);
5280+
tcp_rcvbuf_grow(sk, tp->rcvq_space.space);
52745281
}
52755282

52765283
static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb,

net/mptcp/protocol.c

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -194,17 +194,26 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
194194
* - mptcp does not maintain a msk-level window clamp
195195
* - returns true when the receive buffer is actually updated
196196
*/
197-
static bool mptcp_rcvbuf_grow(struct sock *sk)
197+
static bool mptcp_rcvbuf_grow(struct sock *sk, u32 newval)
198198
{
199199
struct mptcp_sock *msk = mptcp_sk(sk);
200200
const struct net *net = sock_net(sk);
201-
int rcvwin, rcvbuf, cap;
201+
u32 rcvwin, rcvbuf, cap, oldval;
202+
u64 grow;
202203

204+
oldval = msk->rcvq_space.space;
205+
msk->rcvq_space.space = newval;
203206
if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
204207
(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
205208
return false;
206209

207-
rcvwin = msk->rcvq_space.space << 1;
210+
/* DRS is always one RTT late. */
211+
rcvwin = newval << 1;
212+
213+
/* slow start: allow the sender to double its rate. */
214+
grow = (u64)rcvwin * (newval - oldval);
215+
do_div(grow, oldval);
216+
rcvwin += grow << 1;
208217

209218
if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
210219
rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
@@ -334,7 +343,7 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
334343
skb_set_owner_r(skb, sk);
335344
/* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
336345
if (sk->sk_socket)
337-
mptcp_rcvbuf_grow(sk);
346+
mptcp_rcvbuf_grow(sk, msk->rcvq_space.space);
338347
}
339348

340349
static void mptcp_init_skb(struct sock *ssk, struct sk_buff *skb, int offset,
@@ -2049,9 +2058,7 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
20492058
if (msk->rcvq_space.copied <= msk->rcvq_space.space)
20502059
goto new_measure;
20512060

2052-
msk->rcvq_space.space = msk->rcvq_space.copied;
2053-
if (mptcp_rcvbuf_grow(sk)) {
2054-
2061+
if (mptcp_rcvbuf_grow(sk, msk->rcvq_space.copied)) {
20552062
/* Make subflows follow along. If we do not do this, we
20562063
* get drops at subflow level if skbs can't be moved to
20572064
* the mptcp rx queue fast enough (announced rcv_win can
@@ -2063,8 +2070,9 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
20632070

20642071
ssk = mptcp_subflow_tcp_sock(subflow);
20652072
slow = lock_sock_fast(ssk);
2066-
tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
2067-
tcp_rcvbuf_grow(ssk);
2073+
/* subflows can be added before tcp_init_transfer() */
2074+
if (tcp_sk(ssk)->rcvq_space.space)
2075+
tcp_rcvbuf_grow(ssk, msk->rcvq_space.copied);
20682076
unlock_sock_fast(ssk, slow);
20692077
}
20702078
}

0 commit comments

Comments
 (0)