From 00483690552c5fb6aa30bf3acb75b0ee89b4c0fd Mon Sep 17 00:00:00 2001 From: Jon Maxwell Date: Thu, 10 May 2018 16:53:51 +1000 Subject: tcp: Add mark for TIMEWAIT sockets This version has some suggestions by Eric Dumazet: - Use a local variable for the mark in IPv6 instead of ctl_sk to avoid SMP races. - Use the more elegant "IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark" statement. - Factorize code as sk_fullsock() check is not necessary. Aidan McGurn from Openwave Mobility systems reported the following bug: "Marked routing is broken on customer deployment. Its effects are large increase in Uplink retransmissions caused by the client never receiving the final ACK to their FINACK - this ACK misses the mark and routes out of the incorrect route." Currently marks are added to sk_buffs for replies when the "fwmark_reflect" sysctl is enabled. But not for TW sockets that had sk->sk_mark set via setsockopt(SO_MARK..). Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the the original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark location. Then progate this so that the skb gets sent with the correct mark. Do the same for resets. Give the "fwmark_reflect" sysctl precedence over sk->sk_mark so that netfilter rules are still honored. Signed-off-by: Jon Maxwell Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/tcp_ipv6.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 6d664d83cd16..7d47c2b550a9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -803,6 +803,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 unsigned int tot_len = sizeof(struct tcphdr); struct dst_entry *dst; __be32 *topt; + __u32 mark = 0; if (tsecr) tot_len += TCPOLEN_TSTAMP_ALIGNED; @@ -871,7 +872,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_oif = oif; } - fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); + if (sk) + mark = (sk->sk_state == TCP_TIME_WAIT) ? + inet_twsk(sk)->tw_mark : sk->sk_mark; + fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.fl6_dport = t1->dest; fl6.fl6_sport = t1->source; fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); -- cgit v1.2.3 From 3d97d88e8091f3501e016f6b4ce45a32c4b8f2f6 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Tue, 29 May 2018 23:27:31 +0800 Subject: tcp: minor optimization around tcp_hdr() usage in receive path This is additional to the commit ea1627c20c34 ("tcp: minor optimizations around tcp_hdr() usage"). At this point, skb->data is same with tcp_hdr() as tcp header has not been pulled yet. So use the less expensive one to get the tcp header. Remove the third parameter of tcp_rcv_established() and put it into the function body. Furthermore, the local variables are listed as a reverse christmas tree :) Cc: Eric Dumazet Signed-off-by: Yafang Shao Signed-off-by: David S. Miller --- include/net/tcp.h | 3 +-- include/trace/events/tcp.h | 5 +++-- net/ipv4/tcp_input.c | 6 +++--- net/ipv4/tcp_ipv4.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'net/ipv6/tcp_ipv6.c') diff --git a/include/net/tcp.h b/include/net/tcp.h index 952d842a604a..029a51b91742 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -334,8 +334,7 @@ void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); -void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th); +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); diff --git a/include/trace/events/tcp.h b/include/trace/events/tcp.h index 703abb6e11fa..ac55b328d61b 100644 --- a/include/trace/events/tcp.h +++ b/include/trace/events/tcp.h @@ -248,8 +248,9 @@ TRACE_EVENT(tcp_probe, ), TP_fast_assign( - const struct tcp_sock *tp = tcp_sk(sk); + const struct tcphdr *th = (const struct tcphdr *)skb->data; const struct inet_sock *inet = inet_sk(sk); + const struct tcp_sock *tp = tcp_sk(sk); memset(__entry->saddr, 0, sizeof(struct sockaddr_in6)); memset(__entry->daddr, 0, sizeof(struct sockaddr_in6)); @@ -261,7 +262,7 @@ TRACE_EVENT(tcp_probe, __entry->dport = ntohs(inet->inet_dport); __entry->mark = skb->mark; - __entry->data_len = skb->len - tcp_hdrlen(skb); + __entry->data_len = skb->len - __tcp_hdrlen(th); __entry->snd_nxt = tp->snd_nxt; __entry->snd_una = tp->snd_una; __entry->snd_cwnd = tp->snd_cwnd; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1191cac72109..d5ffb573ca4d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -5390,11 +5390,11 @@ discard: * the rest is checked inline. Fast processing is turned on in * tcp_data_queue when everything is OK. */ -void tcp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct tcphdr *th) +void tcp_rcv_established(struct sock *sk, struct sk_buff *skb) { - unsigned int len = skb->len; + const struct tcphdr *th = (const struct tcphdr *)skb->data; struct tcp_sock *tp = tcp_sk(sk); + unsigned int len = skb->len; /* TCP congestion window tracking */ trace_tcp_probe(sk, skb); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index adbdb503db0c..749b0ef9f405 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1486,7 +1486,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - tcp_rcv_established(sk, skb, tcp_hdr(skb)); + tcp_rcv_established(sk, skb); return 0; } diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 7d47c2b550a9..8764a63abd91 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1322,7 +1322,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) } } - tcp_rcv_established(sk, skb, tcp_hdr(skb)); + tcp_rcv_established(sk, skb); if (opt_skb) goto ipv6_pktoptions; return 0; -- cgit v1.2.3