From 5d299f3d3c8a2fbc732b1bf03af36333ccec3130 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Aug 2012 05:09:33 +0000 Subject: net: ipv6: fix TCP early demux IPv6 needs a cookie in dst_check() call. We need to add rx_dst_cookie and provide a family independent sk_rx_dst_set(sk, skb) method to properly support IPv6 TCP early demux. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 42b2a6a7309..272241f16fc 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1627,9 +1627,6 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_rx_dst = NULL; } } - if (unlikely(sk->sk_rx_dst == NULL)) - inet_sk_rx_dst_set(sk, skb); - if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) { rsk = sk; goto reset; @@ -1872,10 +1869,20 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; +static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +{ + struct dst_entry *dst = skb_dst(skb); + + dst_hold(dst); + sk->sk_rx_dst = dst; + inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; +} + const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, + .sk_rx_dst_set = inet_sk_rx_dst_set, .conn_request = tcp_v4_conn_request, .syn_recv_sock = tcp_v4_syn_recv_sock, .net_header_len = sizeof(struct iphdr), -- cgit v1.2.3 From 63d02d157ec4124990258d66517b6c11fd6df0cf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Aug 2012 14:11:00 +0000 Subject: net: tcp: ipv6_mapped needs sk_rx_dst_set method commit 5d299f3d3c8a2fb (net: ipv6: fix TCP early demux) added a regression for ipv6_mapped case. [ 67.422369] SELinux: initialized (dev autofs, type autofs), uses genfs_contexts [ 67.449678] SELinux: initialized (dev autofs, type autofs), uses genfs_contexts [ 92.631060] BUG: unable to handle kernel NULL pointer dereference at (null) [ 92.631435] IP: [< (null)>] (null) [ 92.631645] PGD 0 [ 92.631846] Oops: 0010 [#1] SMP [ 92.632095] Modules linked in: autofs4 sunrpc ipv6 dm_mirror dm_region_hash dm_log dm_multipath dm_mod video sbs sbshc battery ac lp parport sg snd_hda_intel snd_hda_codec snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device pcspkr snd_pcm_oss snd_mixer_oss snd_pcm snd_timer serio_raw button floppy snd i2c_i801 i2c_core soundcore snd_page_alloc shpchp ide_cd_mod cdrom microcode ehci_hcd ohci_hcd uhci_hcd [ 92.634294] CPU 0 [ 92.634294] Pid: 4469, comm: sendmail Not tainted 3.6.0-rc1 #3 [ 92.634294] RIP: 0010:[<0000000000000000>] [< (null)>] (null) [ 92.634294] RSP: 0018:ffff880245fc7cb0 EFLAGS: 00010282 [ 92.634294] RAX: ffffffffa01985f0 RBX: ffff88024827ad00 RCX: 0000000000000000 [ 92.634294] RDX: 0000000000000218 RSI: ffff880254735380 RDI: ffff88024827ad00 [ 92.634294] RBP: ffff880245fc7cc8 R08: 0000000000000001 R09: 0000000000000000 [ 92.634294] R10: 0000000000000000 R11: ffff880245fc7bf8 R12: ffff880254735380 [ 92.634294] R13: ffff880254735380 R14: 0000000000000000 R15: 7fffffffffff0218 [ 92.634294] FS: 00007f4516ccd6f0(0000) GS:ffff880256600000(0000) knlGS:0000000000000000 [ 92.634294] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 92.634294] CR2: 0000000000000000 CR3: 0000000245ed1000 CR4: 00000000000007f0 [ 92.634294] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 92.634294] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 92.634294] Process sendmail (pid: 4469, threadinfo ffff880245fc6000, task ffff880254b8cac0) [ 92.634294] Stack: [ 92.634294] ffffffff813837a7 ffff88024827ad00 ffff880254b6b0e8 ffff880245fc7d68 [ 92.634294] ffffffff81385083 00000000001d2680 ffff8802547353a8 ffff880245fc7d18 [ 92.634294] ffffffff8105903a ffff88024827ad60 0000000000000002 00000000000000ff [ 92.634294] Call Trace: [ 92.634294] [] ? tcp_finish_connect+0x2c/0xfa [ 92.634294] [] tcp_rcv_state_process+0x2b6/0x9c6 [ 92.634294] [] ? sched_clock_cpu+0xc3/0xd1 [ 92.634294] [] ? local_clock+0x2b/0x3c [ 92.634294] [] tcp_v4_do_rcv+0x63a/0x670 [ 92.634294] [] release_sock+0x128/0x1bd [ 92.634294] [] __inet_stream_connect+0x1b1/0x352 [ 92.634294] [] ? lock_sock_nested+0x74/0x7f [ 92.634294] [] ? wake_up_bit+0x25/0x25 [ 92.634294] [] ? lock_sock_nested+0x74/0x7f [ 92.634294] [] ? inet_stream_connect+0x22/0x4b [ 92.634294] [] inet_stream_connect+0x33/0x4b [ 92.634294] [] sys_connect+0x78/0x9e [ 92.634294] [] ? sysret_check+0x1b/0x56 [ 92.634294] [] ? __audit_syscall_entry+0x195/0x1c8 [ 92.634294] [] ? trace_hardirqs_on_thunk+0x3a/0x3f [ 92.634294] [] system_call_fastpath+0x16/0x1b [ 92.634294] Code: Bad RIP value. [ 92.634294] RIP [< (null)>] (null) [ 92.634294] RSP [ 92.634294] CR2: 0000000000000000 [ 92.648982] ---[ end trace 24e2bed94314c8d9 ]--- [ 92.649146] Kernel panic - not syncing: Fatal exception in interrupt Fix this using inet_sk_rx_dst_set(), and export this function in case IPv6 is modular. Reported-by: Andrew Morton Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 272241f16fc..76782376401 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1869,7 +1869,7 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = { .twsk_destructor= tcp_twsk_destructor, }; -static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) +void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) { struct dst_entry *dst = skb_dst(skb); @@ -1877,6 +1877,7 @@ static void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb) sk->sk_rx_dst = dst; inet_sk(sk)->rx_dst_ifindex = skb->skb_iif; } +EXPORT_SYMBOL(inet_sk_rx_dst_set); const struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, -- cgit v1.2.3 From fae6ef87faeb8853896920c68ee703d715799d28 Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Sun, 19 Aug 2012 03:30:38 +0000 Subject: net: tcp: move sk_rx_dst_set call after tcp_create_openreq_child() This commit removes the sk_rx_dst_set calls from tcp_create_openreq_child(), because at that point the icsk_af_ops field of ipv6_mapped TCP sockets has not been set to its proper final value. Instead, to make sure we get the right sk_rx_dst_set variant appropriate for the address family of the new connection, we have tcp_v{4,6}_syn_recv_sock() directly call the appropriate function shortly after the call to tcp_create_openreq_child() returns. This also moves inet6_sk_rx_dst_set() to avoid a forward declaration with the new approach. Signed-off-by: Neal Cardwell Reported-by: Artem Savkov Cc: Eric Dumazet Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 76782376401..5bf2040b25b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1462,6 +1462,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; + inet_sk_rx_dst_set(newsk, skb); newtp = tcp_sk(newsk); newinet = inet_sk(newsk); -- cgit v1.2.3 From 144d56e91044181ec0ef67aeca91e9a8b5718348 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 20 Aug 2012 00:22:46 +0000 Subject: tcp: fix possible socket refcount problem Commit 6f458dfb40 (tcp: improve latencies of timer triggered events) added bug leading to following trace : [ 2866.131281] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000 [ 2866.131726] [ 2866.132188] ========================= [ 2866.132281] [ BUG: held lock freed! ] [ 2866.132281] 3.6.0-rc1+ #622 Not tainted [ 2866.132281] ------------------------- [ 2866.132281] kworker/0:1/652 is freeing memory ffff880019ec0000-ffff880019ec0a1f, with a lock still held there! [ 2866.132281] (sk_lock-AF_INET-RPC){+.+...}, at: [] tcp_sendmsg+0x29/0xcc6 [ 2866.132281] 4 locks held by kworker/0:1/652: [ 2866.132281] #0: (rpciod){.+.+.+}, at: [] process_one_work+0x1de/0x47f [ 2866.132281] #1: ((&task->u.tk_work)){+.+.+.}, at: [] process_one_work+0x1de/0x47f [ 2866.132281] #2: (sk_lock-AF_INET-RPC){+.+...}, at: [] tcp_sendmsg+0x29/0xcc6 [ 2866.132281] #3: (&icsk->icsk_retransmit_timer){+.-...}, at: [] run_timer_softirq+0x1ad/0x35f [ 2866.132281] [ 2866.132281] stack backtrace: [ 2866.132281] Pid: 652, comm: kworker/0:1 Not tainted 3.6.0-rc1+ #622 [ 2866.132281] Call Trace: [ 2866.132281] [] debug_check_no_locks_freed+0x112/0x159 [ 2866.132281] [] ? __sk_free+0xfd/0x114 [ 2866.132281] [] kmem_cache_free+0x6b/0x13a [ 2866.132281] [] __sk_free+0xfd/0x114 [ 2866.132281] [] sk_free+0x1c/0x1e [ 2866.132281] [] tcp_write_timer+0x51/0x56 [ 2866.132281] [] run_timer_softirq+0x218/0x35f [ 2866.132281] [] ? run_timer_softirq+0x1ad/0x35f [ 2866.132281] [] ? rb_commit+0x58/0x85 [ 2866.132281] [] ? tcp_write_timer_handler+0x148/0x148 [ 2866.132281] [] __do_softirq+0xcb/0x1f9 [ 2866.132281] [] ? _raw_spin_unlock+0x29/0x2e [ 2866.132281] [] call_softirq+0x1c/0x30 [ 2866.132281] [] do_softirq+0x4a/0xa6 [ 2866.132281] [] irq_exit+0x51/0xad [ 2866.132281] [] do_IRQ+0x9d/0xb4 [ 2866.132281] [] common_interrupt+0x6f/0x6f [ 2866.132281] [] ? sched_clock_cpu+0x58/0xd1 [ 2866.132281] [] ? _raw_spin_unlock_irqrestore+0x4c/0x56 [ 2866.132281] [] mod_timer+0x178/0x1a9 [ 2866.132281] [] sk_reset_timer+0x19/0x26 [ 2866.132281] [] tcp_rearm_rto+0x99/0xa4 [ 2866.132281] [] tcp_event_new_data_sent+0x6e/0x70 [ 2866.132281] [] tcp_write_xmit+0x7de/0x8e4 [ 2866.132281] [] ? __alloc_skb+0xa0/0x1a1 [ 2866.132281] [] __tcp_push_pending_frames+0x2e/0x8a [ 2866.132281] [] tcp_sendmsg+0xb32/0xcc6 [ 2866.132281] [] inet_sendmsg+0xaa/0xd5 [ 2866.132281] [] ? inet_autobind+0x5f/0x5f [ 2866.132281] [] ? trace_clock_local+0x9/0xb [ 2866.132281] [] sock_sendmsg+0xa3/0xc4 [ 2866.132281] [] ? rb_reserve_next_event+0x26f/0x2d5 [ 2866.132281] [] ? native_sched_clock+0x29/0x6f [ 2866.132281] [] ? sched_clock+0x9/0xd [ 2866.132281] [] ? trace_clock_local+0x9/0xb [ 2866.132281] [] kernel_sendmsg+0x37/0x43 [ 2866.132281] [] xs_send_kvec+0x77/0x80 [ 2866.132281] [] xs_sendpages+0x6f/0x1a0 [ 2866.132281] [] ? try_to_del_timer_sync+0x55/0x61 [ 2866.132281] [] xs_tcp_send_request+0x55/0xf1 [ 2866.132281] [] xprt_transmit+0x89/0x1db [ 2866.132281] [] ? call_connect+0x3c/0x3c [ 2866.132281] [] call_transmit+0x1c5/0x20e [ 2866.132281] [] __rpc_execute+0x6f/0x225 [ 2866.132281] [] ? call_connect+0x3c/0x3c [ 2866.132281] [] rpc_async_schedule+0x28/0x34 [ 2866.132281] [] process_one_work+0x24d/0x47f [ 2866.132281] [] ? process_one_work+0x1de/0x47f [ 2866.132281] [] ? __rpc_execute+0x225/0x225 [ 2866.132281] [] worker_thread+0x236/0x317 [ 2866.132281] [] ? process_scheduled_works+0x2f/0x2f [ 2866.132281] [] kthread+0x9a/0xa2 [ 2866.132281] [] kernel_thread_helper+0x4/0x10 [ 2866.132281] [] ? retint_restore_args+0x13/0x13 [ 2866.132281] [] ? __init_kthread_worker+0x5a/0x5a [ 2866.132281] [] ? gs_change+0x13/0x13 [ 2866.308506] IPv4: Attempt to release TCP socket in state 1 ffff880019ec0000 [ 2866.309689] ============================================================================= [ 2866.310254] BUG TCP (Not tainted): Object already free [ 2866.310254] ----------------------------------------------------------------------------- [ 2866.310254] The bug comes from the fact that timer set in sk_reset_timer() can run before we actually do the sock_hold(). socket refcount reaches zero and we free the socket too soon. timer handler is not allowed to reduce socket refcnt if socket is owned by the user, or we need to change sk_reset_timer() implementation. We should take a reference on the socket in case TCP_DELACK_TIMER_DEFERRED or TCP_DELACK_TIMER_DEFERRED bit are set in tsq_flags Also fix a typo in tcp_delack_timer(), where TCP_WRITE_TIMER_DEFERRED was used instead of TCP_DELACK_TIMER_DEFERRED. For consistency, use same socket refcount change for TCP_MTU_REDUCED_DEFERRED, even if not fired from a timer. Reported-by: Fengguang Wu Tested-by: Fengguang Wu Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net/ipv4/tcp_ipv4.c') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5bf2040b25b..00a748d1406 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -417,10 +417,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info) if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */ tp->mtu_info = info; - if (!sock_owned_by_user(sk)) + if (!sock_owned_by_user(sk)) { tcp_v4_mtu_reduced(sk); - else - set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags); + } else { + if (!test_and_set_bit(TCP_MTU_REDUCED_DEFERRED, &tp->tsq_flags)) + sock_hold(sk); + } goto out; } -- cgit v1.2.3