From b4f55925ace5646d72df5af6fdab74fdbb288229 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Fri, 2 Aug 2013 11:32:43 +0200 Subject: net: rtm_to_ifaddr: free ifa if ifa_cacheinfo processing fails [ Upstream commit 446266b0c742a2c9ee8f0dce759a0117bce58a86 ] Commit 5c766d642 ("ipv4: introduce address lifetime") leaves the ifa resource that was allocated via inet_alloc_ifa() unfreed when returning the function with -EINVAL. Thus, free it first via inet_free_ifa(). Signed-off-by: Daniel Borkmann Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/devinet.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index dfc39d4d48b..9e38217c393 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -771,7 +771,7 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, ci = nla_data(tb[IFA_CACHEINFO]); if (!ci->ifa_valid || ci->ifa_prefered > ci->ifa_valid) { err = -EINVAL; - goto errout; + goto errout_free; } *pvalid_lft = ci->ifa_valid; *pprefered_lft = ci->ifa_prefered; @@ -779,6 +779,8 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh, return ifa; +errout_free: + inet_free_ifa(ifa); errout: return ERR_PTR(err); } -- cgit v1.2.3 From 8b2b5e27cae0bd2572687869111e516ab445be34 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Aug 2013 11:18:49 -0700 Subject: fib_trie: remove potential out of bound access [ Upstream commit aab515d7c32a34300312416c50314e755ea6f765 ] AddressSanitizer [1] dynamic checker pointed a potential out of bound access in leaf_walk_rcu() We could allocate one more slot in tnode_new() to leave the prefetch() in-place but it looks not worth the pain. Bug added in commit 82cfbb008572b ("[IPV4] fib_trie: iterator recode") [1] : https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel Reported-by: Andrey Konovalov Signed-off-by: Eric Dumazet Cc: Dmitry Vyukov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/fib_trie.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 49616fed934..6e8a13da6cb 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -71,7 +71,6 @@ #include #include #include -#include #include #include #include @@ -1761,10 +1760,8 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct rt_trie_node *c) if (!c) continue; - if (IS_LEAF(c)) { - prefetch(rcu_dereference_rtnl(p->child[idx])); + if (IS_LEAF(c)) return (struct leaf *) c; - } /* Rescan start scanning in new node */ p = (struct tnode *) c; -- cgit v1.2.3 From 16f033319c6805588375afbda88ea5a513393dd3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Aug 2013 17:10:15 -0700 Subject: tcp: cubic: fix overflow error in bictcp_update() [ Upstream commit 2ed0edf9090bf4afa2c6fc4f38575a85a80d4b20 ] commit 17a6e9f1aa9 ("tcp_cubic: fix clock dependency") added an overflow error in bictcp_update() in following code : /* change the unit from HZ to bictcp_HZ */ t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) - ca->epoch_start) << BICTCP_HZ) / HZ; Because msecs_to_jiffies() being unsigned long, compiler does implicit type promotion. We really want to constrain (tcp_time_stamp - ca->epoch_start) to a signed 32bit value, or else 't' has unexpected high values. This bugs triggers an increase of retransmit rates ~24 days after boot [1], as the high order bit of tcp_time_stamp flips. [1] for hosts with HZ=1000 Big thanks to Van Jacobson for spotting this problem. Diagnosed-by: Van Jacobson Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Cc: Stephen Hemminger Acked-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_cubic.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index a9077f441cb..b6b591f0a78 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -206,8 +206,8 @@ static u32 cubic_root(u64 a) */ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { - u64 offs; - u32 delta, t, bic_target, max_cnt; + u32 delta, bic_target, max_cnt; + u64 offs, t; ca->ack_cnt++; /* count the number of ACKs */ @@ -250,9 +250,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) * if the cwnd < 1 million packets !!! */ + t = (s32)(tcp_time_stamp - ca->epoch_start); + t += msecs_to_jiffies(ca->delay_min >> 3); /* change the unit from HZ to bictcp_HZ */ - t = ((tcp_time_stamp + msecs_to_jiffies(ca->delay_min>>3) - - ca->epoch_start) << BICTCP_HZ) / HZ; + t <<= BICTCP_HZ; + do_div(t, HZ); if (t < ca->bic_K) /* t - K */ offs = ca->bic_K - t; -- cgit v1.2.3 From ca02d414915693909f9d7f455c4598d3f8b514b3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 5 Aug 2013 20:05:12 -0700 Subject: tcp: cubic: fix bug in bictcp_acked() [ Upstream commit cd6b423afd3c08b27e1fed52db828ade0addbc6b ] While investigating about strange increase of retransmit rates on hosts ~24 days after boot, Van found hystart was disabled if ca->epoch_start was 0, as following condition is true when tcp_time_stamp high order bit is set. (s32)(tcp_time_stamp - ca->epoch_start) < HZ Quoting Van : At initialization & after every loss ca->epoch_start is set to zero so I believe that the above line will turn off hystart as soon as the 2^31 bit is set in tcp_time_stamp & hystart will stay off for 24 days. I think we've observed that cubic's restart is too aggressive without hystart so this might account for the higher drop rate we observe. Diagnosed-by: Van Jacobson Signed-off-by: Eric Dumazet Cc: Neal Cardwell Cc: Yuchung Cheng Acked-by: Neal Cardwell Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_cubic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index b6b591f0a78..b6ae92a51f5 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -416,7 +416,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) return; /* Discard delay samples right after fast recovery */ - if ((s32)(tcp_time_stamp - ca->epoch_start) < HZ) + if (ca->epoch_start && (s32)(tcp_time_stamp - ca->epoch_start) < HZ) return; delay = (rtt_us << 3) / USEC_PER_MSEC; -- cgit v1.2.3 From 27c1c98bd3b44b7c5f5c0ecfe1a1ec1240b73829 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Tue, 6 Aug 2013 13:45:43 +0300 Subject: ip_gre: fix ipgre_header to return correct offset MIME-Version: 1.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 77a482bdb2e68d13fae87541b341905ba70d572b ] Fix ipgre_header() (header_ops->create) to return the correct amount of bytes pushed. Most callers of dev_hard_header() seem to care only if it was success, but af_packet.c uses it as offset to the skb to copy from userspace only once. In practice this fixes packet socket sendto()/sendmsg() to gre tunnels. Regression introduced in c54419321455631079c7d6e60bc732dd0c5914c5 ("GRE: Refactor GRE tunneling code.") Cc: Pravin B Shelar Signed-off-by: Timo Teräs Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 855004f0832..c52fee0976d 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -572,7 +572,7 @@ static int ipgre_header(struct sk_buff *skb, struct net_device *dev, if (daddr) memcpy(&iph->daddr, daddr, 4); if (iph->daddr) - return t->hlen; + return t->hlen + sizeof(*iph); return -(t->hlen + sizeof(*iph)); } -- cgit v1.2.3 From e8d11678770b9bbdcc5c2a9b3a041d136575f322 Mon Sep 17 00:00:00 2001 From: Pravin B Shelar Date: Tue, 13 Aug 2013 01:41:06 -0700 Subject: ip_tunnel: Do not use inner ip-header-id for tunnel ip-header-id. [ Upstream commit 4221f40513233fa8edeef7fc82e44163fde03b9b ] Using inner-id for tunnel id is not safe in some rare cases. E.g. packets coming from multiple sources entering same tunnel can have same id. Therefore on tunnel packet receive we could have packets from two different stream but with same source and dst IP with same ip-id which could confuse ip packet reassembly. Following patch reverts optimization from commit 490ab08127 (IP_GRE: Fix IP-Identification.) Signed-off-by: Pravin B Shelar CC: Jarno Rajahalme CC: Ansis Atteka Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index cbfc37f5f05..b7a4c21c06e 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -686,7 +686,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev, iph->daddr = fl4.daddr; iph->saddr = fl4.saddr; iph->ttl = ttl; - tunnel_ip_select_ident(skb, inner_iph, &rt->dst); + __ip_select_ident(iph, &rt->dst, (skb_shinfo(skb)->gso_segs ?: 1) - 1); iptunnel_xmit(skb, dev); return; -- cgit v1.2.3 From f3f905389f1aaae9e091a28d018c66e08c85eddd Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Fri, 16 Aug 2013 19:04:36 +0400 Subject: tcp: set timestamps for restored skb-s [ Upstream commit 7ed5c5ae96d23da22de95e1c7a239537acd378b1 ] When the repair mode is turned off, the write queue seqs are updated so that the whole queue is considered to be 'already sent. The "when" field must be set for such skb. It's used in tcp_rearm_rto for example. If the "when" field isn't set, the retransmit timeout can be calculated incorrectly and a tcp connected can stop for two minutes (TCP_RTO_MAX). Acked-by: Pavel Emelyanov Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ab450c099aa..2005561861a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1117,6 +1117,13 @@ new_segment: if (!skb) goto wait_for_memory; + /* + * All packets are restored as if they have + * already been sent. + */ + if (tp->repair) + TCP_SKB_CB(skb)->when = tcp_time_stamp; + /* * Check whether we can use HW checksum. */ -- cgit v1.2.3 From 6f198dcab737db9335f1f4c7d97f801e0d5de186 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Tue, 27 Aug 2013 12:20:40 +0400 Subject: tcp: initialize rcv_tstamp for restored sockets [ Upstream commit c7781a6e3c4a9a17e144ec2db00ebfea327bd627 ] u32 rcv_tstamp; /* timestamp of last received ACK */ Its value used in tcp_retransmit_timer, which closes socket if the last ack was received more then TCP_RTO_MAX ago. Currently rcv_tstamp is initialized to zero and if tcp_retransmit_timer is called before receiving a first ack, the connection is closed. This patch initializes rcv_tstamp to a timestamp, when a socket was restored. Reported-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Eric Dumazet Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ec335fabd5c..2c48d51f47a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2808,6 +2808,8 @@ void tcp_connect_init(struct sock *sk) if (likely(!tp->repair)) tp->rcv_nxt = 0; + else + tp->rcv_tstamp = tcp_time_stamp; tp->rcv_wup = tp->rcv_nxt; tp->copied_seq = tp->rcv_nxt; -- cgit v1.2.3 From 6fe6efd941d49f7169e78b2e60ca2dc6a56ca8b4 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Tue, 27 Aug 2013 12:21:55 +0400 Subject: tcp: don't apply tsoffset if rcv_tsecr is zero [ Upstream commit e3e12028315749b7fa2edbc37328e5847be9ede9 ] The zero value means that tsecr is not valid, so it's a special case. tsoffset is used to customize tcp_time_stamp for one socket. tsoffset is usually zero, it's used when a socket was moved from one host to another host. Currently this issue affects logic of tcp_rcv_rtt_measure_ts. Due to incorrect value of rcv_tsecr, tcp_rcv_rtt_measure_ts sets rto to TCP_RTO_MAX. Reported-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Eric Dumazet Cc: "David S. Miller" Cc: Alexey Kuznetsov Cc: James Morris Cc: Hideaki YOSHIFUJI Cc: Patrick McHardy Signed-off-by: Andrey Vagin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9c6225780bd..4b75aad14b0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3598,7 +3598,10 @@ static bool tcp_parse_aligned_timestamp(struct tcp_sock *tp, const struct tcphdr ++ptr; tp->rx_opt.rcv_tsval = ntohl(*ptr); ++ptr; - tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; + if (*ptr) + tp->rx_opt.rcv_tsecr = ntohl(*ptr) - tp->tsoffset; + else + tp->rx_opt.rcv_tsecr = 0; return true; } return false; @@ -3623,7 +3626,7 @@ static bool tcp_fast_parse_options(const struct sk_buff *skb, } tcp_parse_options(skb, &tp->rx_opt, 1, NULL); - if (tp->rx_opt.saw_tstamp) + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; return true; @@ -5376,7 +5379,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, int saved_clamp = tp->rx_opt.mss_clamp; tcp_parse_options(skb, &tp->rx_opt, 0, &foc); - if (tp->rx_opt.saw_tstamp) + if (tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) tp->rx_opt.rcv_tsecr -= tp->tsoffset; if (th->ack) { -- cgit v1.2.3 From 5612d36ca1438c23280b962a80943d14b2e9f778 Mon Sep 17 00:00:00 2001 From: Chris Clark Date: Tue, 27 Aug 2013 12:02:15 -0600 Subject: ipv4: sendto/hdrincl: don't use destination address found in header [ Upstream commit c27c9322d015dc1d9dfdf31724fca71c0476c4d1 ] ipv4: raw_sendmsg: don't use header's destination address A sendto() regression was bisected and found to start with commit f8126f1d5136be1 (ipv4: Adjust semantics of rt->rt_gateway.) The problem is that it tries to ARP-lookup the constructed packet's destination address rather than the explicitly provided address. Fix this using FLOWI_FLAG_KNOWN_NH so that given nexthop is used. cf. commit 2ad5b9e4bd314fc685086b99e90e5de3bc59e26b Reported-by: Chris Clark Bisected-by: Chris Clark Tested-by: Chris Clark Suggested-by: Julian Anastasov Signed-off-by: Chris Clark Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/raw.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index dd44e0ab600..61e60d67adc 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -571,7 +571,8 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, - inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP, + inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP | + (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0); if (!inet->hdrincl) { -- cgit v1.2.3 From b70a23ab4ab5a95ab9be1bf77b73c1ad9f4e15a4 Mon Sep 17 00:00:00 2001 From: Phil Oester Date: Tue, 27 Aug 2013 16:41:40 -0700 Subject: tcp: tcp_make_synack() should use sock_wmalloc [ Upstream commit eb8895debe1baba41fcb62c78a16f0c63c21662a ] In commit 90ba9b19 (tcp: tcp_make_synack() can use alloc_skb()), Eric changed the call to sock_wmalloc in tcp_make_synack to alloc_skb. In doing so, the netfilter owner match lost its ability to block the SYNACK packet on outbound listening sockets. Revert the change, restoring the owner match functionality. This closes netfilter bugzilla #847. Signed-off-by: Phil Oester Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/ipv4') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2c48d51f47a..0145ce7e609 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2664,7 +2664,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, int tcp_header_size; int mss; - skb = alloc_skb(MAX_TCP_HEADER + 15, sk_gfp_atomic(sk, GFP_ATOMIC)); + skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (unlikely(!skb)) { dst_release(dst); return NULL; -- cgit v1.2.3