diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst index 4a010a7cde7f8085db5ba6f1b9af53e9e5223cd5..82f2117cf2b36a834e5e391feda0210d916bff8b 100644 --- a/Documentation/networking/ip-sysctl.rst +++ b/Documentation/networking/ip-sysctl.rst @@ -321,6 +321,7 @@ tcp_abort_on_overflow - BOOLEAN option can harm clients of your server. tcp_adv_win_scale - INTEGER + Obsolete since linux-6.6 Count buffering overhead as bytes/2^tcp_adv_win_scale (if tcp_adv_win_scale > 0) or bytes-bytes/2^(-tcp_adv_win_scale), if it is <= 0. diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b4c08ac86983568a9511258708724da15d0b999e..fbcb0ce13171d46aa3697abcd48482b08e78e5e0 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -172,6 +172,8 @@ static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) return (struct tcp_request_sock *)req; } +#define TCP_RMEM_TO_WIN_SCALE 8 + struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; @@ -238,7 +240,7 @@ struct tcp_sock { u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ - + u8 scaling_ratio; /* see tcp_win_from_space() */ /* Information of the most recently (s)acked skb */ struct tcp_rack { u64 mstamp; /* (Re)sent time of the skb */ diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index f003747181593559a4efe1838be719d445417041..7a41c4791536732005cedbb80c223b86aa43249e 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -152,7 +152,7 @@ struct netns_ipv4 { u8 sysctl_tcp_abort_on_overflow; u8 sysctl_tcp_fack; /* obsolete */ int sysctl_tcp_max_reordering; - int sysctl_tcp_adv_win_scale; + int sysctl_tcp_adv_win_scale; /* obsolete */ u8 sysctl_tcp_dsack; u8 sysctl_tcp_app_win; u8 sysctl_tcp_frto; diff --git a/include/net/tcp.h b/include/net/tcp.h index 226bce6d1e8c30185260baadec449b67323db91c..2104a71c75ba7eee40612395be4103ae370b3c03 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1434,11 +1434,27 @@ void tcp_select_initial_window(const struct sock *sk, int __space, static inline int tcp_win_from_space(const struct sock *sk, int space) { - int tcp_adv_win_scale = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale); + s64 scaled_space = (s64)space * tcp_sk(sk)->scaling_ratio; - return tcp_adv_win_scale <= 0 ? - (space>>(-tcp_adv_win_scale)) : - space - (space>>tcp_adv_win_scale); + return scaled_space >> TCP_RMEM_TO_WIN_SCALE; +} + +/* inverse of tcp_win_from_space() */ +static inline int tcp_space_from_win(const struct sock *sk, int win) +{ + u64 val = (u64)win << TCP_RMEM_TO_WIN_SCALE; + + do_div(val, tcp_sk(sk)->scaling_ratio); + return val; +} + +static inline void tcp_scaling_ratio_init(struct sock *sk) +{ + /* Assume a conservative default of 1200 bytes of payload per 4K page. + * This may be adjusted later in tcp_measure_rcv_mss(). + */ + tcp_sk(sk)->scaling_ratio = (1200 << TCP_RMEM_TO_WIN_SCALE) / + SKB_TRUESIZE(4096); } /* Note: caller must be prepared to deal with negative returns */ diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e03e08745308189c9d64509c2cff94da56c86a0c..88f4ebab12acc11d5f3feb6b13974a0b8e565671 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + tcp_scaling_ratio_init(sk); set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); @@ -1700,7 +1701,7 @@ EXPORT_SYMBOL(tcp_peek_len); /* Make sure sk_rcvbuf is big enough to satisfy SO_RCVLOWAT hint */ int tcp_set_rcvlowat(struct sock *sk, int val) { - int cap; + int space, cap; if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) cap = sk->sk_rcvbuf >> 1; @@ -1715,10 +1716,10 @@ int tcp_set_rcvlowat(struct sock *sk, int val) if (sk->sk_userlocks & SOCK_RCVBUF_LOCK) return 0; - val <<= 1; - if (val > sk->sk_rcvbuf) { - WRITE_ONCE(sk->sk_rcvbuf, val); - tcp_sk(sk)->window_clamp = tcp_win_from_space(sk, val); + space = tcp_space_from_win(sk, val); + if (space > sk->sk_rcvbuf) { + WRITE_ONCE(sk->sk_rcvbuf, space); + tcp_sk(sk)->window_clamp = val; } return 0; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 57c8af1859c16eba5e952a23ea959b628006f9c1..3cd92035e0902298baa8afd89ae5edcbfce300e5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -237,6 +237,16 @@ static void tcp_measure_rcv_mss(struct sock *sk, const struct sk_buff *skb) */ len = skb_shinfo(skb)->gso_size ? : skb->len; if (len >= icsk->icsk_ack.rcv_mss) { + /* Note: divides are still a bit expensive. + * For the moment, only adjust scaling_ratio + * when we update icsk_ack.rcv_mss. + */ + if (unlikely(len != icsk->icsk_ack.rcv_mss)) { + u64 val = (u64)skb->len << TCP_RMEM_TO_WIN_SCALE; + + do_div(val, skb->truesize); + tcp_sk(sk)->scaling_ratio = val ? val : 1; + } icsk->icsk_ack.rcv_mss = min_t(unsigned int, len, tcp_sk(sk)->advmss); /* Account for possibly-removed options */ @@ -727,8 +737,8 @@ void tcp_rcv_space_adjust(struct sock *sk) if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) && !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) { - int rcvmem, rcvbuf; u64 rcvwin, grow; + int rcvbuf; /* minimal window to cope with packet losses, assuming * steady state. Add some cushion because of small variations. @@ -740,12 +750,7 @@ void tcp_rcv_space_adjust(struct sock *sk) do_div(grow, tp->rcvq_space.space); rcvwin += (grow << 1); - rcvmem = SKB_TRUESIZE(tp->advmss + MAX_TCP_HEADER); - while (tcp_win_from_space(sk, rcvmem) < tp->advmss) - rcvmem += 128; - - do_div(rcvwin, tp->advmss); - rcvbuf = min_t(u64, rcvwin * rcvmem, + rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin), READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2])); if (rcvbuf > sk->sk_rcvbuf) { WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);